{ "best_global_step": 499760, "best_metric": 1.0584163665771484, "best_model_checkpoint": "/media/user/Expansion1/opus-mt-en-zhtw-google-translate3/checkpoint-499760", "epoch": 5.0, "eval_steps": 500, "global_step": 499760, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005002401152553225, "grad_norm": 9.338144302368164, "learning_rate": 4.997503801824876e-05, "loss": 2.3751, "num_input_tokens_seen": 281752, "step": 500 }, { "epoch": 0.01000480230510645, "grad_norm": 10.810979843139648, "learning_rate": 4.9950026012485996e-05, "loss": 2.2026, "num_input_tokens_seen": 552352, "step": 1000 }, { "epoch": 0.015007203457659676, "grad_norm": 9.087040901184082, "learning_rate": 4.992501400672323e-05, "loss": 2.1448, "num_input_tokens_seen": 823176, "step": 1500 }, { "epoch": 0.0200096046102129, "grad_norm": 8.617157936096191, "learning_rate": 4.990000200096046e-05, "loss": 2.0951, "num_input_tokens_seen": 1102760, "step": 2000 }, { "epoch": 0.02501200576276613, "grad_norm": 7.297477722167969, "learning_rate": 4.9874989995197695e-05, "loss": 2.0308, "num_input_tokens_seen": 1378560, "step": 2500 }, { "epoch": 0.030014406915319352, "grad_norm": 8.311019897460938, "learning_rate": 4.984997798943493e-05, "loss": 2.0307, "num_input_tokens_seen": 1653488, "step": 3000 }, { "epoch": 0.03501680806787258, "grad_norm": 8.287640571594238, "learning_rate": 4.9824965983672164e-05, "loss": 1.9855, "num_input_tokens_seen": 1923368, "step": 3500 }, { "epoch": 0.0400192092204258, "grad_norm": 9.764960289001465, "learning_rate": 4.97999539779094e-05, "loss": 1.9682, "num_input_tokens_seen": 2199816, "step": 4000 }, { "epoch": 0.04502161037297903, "grad_norm": 7.692084312438965, "learning_rate": 4.9774941972146634e-05, "loss": 1.9222, "num_input_tokens_seen": 2467792, "step": 4500 }, { "epoch": 0.05002401152553226, "grad_norm": 7.139247417449951, "learning_rate": 4.974992996638387e-05, "loss": 1.9266, "num_input_tokens_seen": 2731792, "step": 5000 }, { "epoch": 0.05502641267808548, "grad_norm": 7.6170244216918945, "learning_rate": 4.97249179606211e-05, "loss": 1.9024, "num_input_tokens_seen": 3004120, "step": 5500 }, { "epoch": 0.060028813830638704, "grad_norm": 12.332016944885254, "learning_rate": 4.969990595485833e-05, "loss": 1.8823, "num_input_tokens_seen": 3284568, "step": 6000 }, { "epoch": 0.06503121498319193, "grad_norm": 7.665128231048584, "learning_rate": 4.967489394909557e-05, "loss": 1.8642, "num_input_tokens_seen": 3554600, "step": 6500 }, { "epoch": 0.07003361613574516, "grad_norm": 10.934691429138184, "learning_rate": 4.96498819433328e-05, "loss": 1.8556, "num_input_tokens_seen": 3824936, "step": 7000 }, { "epoch": 0.07503601728829838, "grad_norm": 7.880730152130127, "learning_rate": 4.962486993757004e-05, "loss": 1.8606, "num_input_tokens_seen": 4099080, "step": 7500 }, { "epoch": 0.0800384184408516, "grad_norm": 7.548530578613281, "learning_rate": 4.959985793180727e-05, "loss": 1.83, "num_input_tokens_seen": 4366808, "step": 8000 }, { "epoch": 0.08504081959340483, "grad_norm": 7.900990009307861, "learning_rate": 4.957484592604451e-05, "loss": 1.8031, "num_input_tokens_seen": 4638816, "step": 8500 }, { "epoch": 0.09004322074595807, "grad_norm": 8.125676155090332, "learning_rate": 4.9549833920281736e-05, "loss": 1.8455, "num_input_tokens_seen": 4915000, "step": 9000 }, { "epoch": 0.09504562189851129, "grad_norm": 7.727709770202637, "learning_rate": 4.952482191451897e-05, "loss": 1.8024, "num_input_tokens_seen": 5188672, "step": 9500 }, { "epoch": 0.10004802305106451, "grad_norm": 5.897092342376709, "learning_rate": 4.9499809908756206e-05, "loss": 1.7928, "num_input_tokens_seen": 5468592, "step": 10000 }, { "epoch": 0.10505042420361774, "grad_norm": 11.170528411865234, "learning_rate": 4.947479790299344e-05, "loss": 1.7868, "num_input_tokens_seen": 5733256, "step": 10500 }, { "epoch": 0.11005282535617096, "grad_norm": 8.682831764221191, "learning_rate": 4.944978589723067e-05, "loss": 1.7878, "num_input_tokens_seen": 6008088, "step": 11000 }, { "epoch": 0.11505522650872418, "grad_norm": 7.914422988891602, "learning_rate": 4.942477389146791e-05, "loss": 1.7355, "num_input_tokens_seen": 6274960, "step": 11500 }, { "epoch": 0.12005762766127741, "grad_norm": 8.685178756713867, "learning_rate": 4.9399761885705145e-05, "loss": 1.7744, "num_input_tokens_seen": 6554288, "step": 12000 }, { "epoch": 0.12506002881383063, "grad_norm": 7.942957401275635, "learning_rate": 4.9374749879942374e-05, "loss": 1.7293, "num_input_tokens_seen": 6832880, "step": 12500 }, { "epoch": 0.13006242996638387, "grad_norm": 6.650600910186768, "learning_rate": 4.934973787417961e-05, "loss": 1.7523, "num_input_tokens_seen": 7107168, "step": 13000 }, { "epoch": 0.13506483111893708, "grad_norm": 7.683079242706299, "learning_rate": 4.9324725868416844e-05, "loss": 1.7432, "num_input_tokens_seen": 7377472, "step": 13500 }, { "epoch": 0.14006723227149032, "grad_norm": 8.168213844299316, "learning_rate": 4.929971386265408e-05, "loss": 1.745, "num_input_tokens_seen": 7653272, "step": 14000 }, { "epoch": 0.14506963342404355, "grad_norm": 8.087789535522461, "learning_rate": 4.927470185689131e-05, "loss": 1.7179, "num_input_tokens_seen": 7926040, "step": 14500 }, { "epoch": 0.15007203457659676, "grad_norm": 8.388677597045898, "learning_rate": 4.924968985112854e-05, "loss": 1.7219, "num_input_tokens_seen": 8195288, "step": 15000 }, { "epoch": 0.15507443572915, "grad_norm": 8.354930877685547, "learning_rate": 4.922467784536578e-05, "loss": 1.7376, "num_input_tokens_seen": 8469792, "step": 15500 }, { "epoch": 0.1600768368817032, "grad_norm": 8.638579368591309, "learning_rate": 4.919966583960301e-05, "loss": 1.701, "num_input_tokens_seen": 8735800, "step": 16000 }, { "epoch": 0.16507923803425645, "grad_norm": 6.771655559539795, "learning_rate": 4.9174653833840247e-05, "loss": 1.719, "num_input_tokens_seen": 9005256, "step": 16500 }, { "epoch": 0.17008163918680966, "grad_norm": 12.017413139343262, "learning_rate": 4.914964182807748e-05, "loss": 1.7029, "num_input_tokens_seen": 9279816, "step": 17000 }, { "epoch": 0.1750840403393629, "grad_norm": 7.177635669708252, "learning_rate": 4.9124629822314716e-05, "loss": 1.6925, "num_input_tokens_seen": 9548800, "step": 17500 }, { "epoch": 0.18008644149191613, "grad_norm": 6.606298446655273, "learning_rate": 4.9099617816551945e-05, "loss": 1.6957, "num_input_tokens_seen": 9826416, "step": 18000 }, { "epoch": 0.18508884264446934, "grad_norm": 6.026829242706299, "learning_rate": 4.907460581078918e-05, "loss": 1.7022, "num_input_tokens_seen": 10095920, "step": 18500 }, { "epoch": 0.19009124379702258, "grad_norm": 8.743913650512695, "learning_rate": 4.9049593805026415e-05, "loss": 1.6904, "num_input_tokens_seen": 10371760, "step": 19000 }, { "epoch": 0.1950936449495758, "grad_norm": 9.37678050994873, "learning_rate": 4.902458179926365e-05, "loss": 1.6617, "num_input_tokens_seen": 10639680, "step": 19500 }, { "epoch": 0.20009604610212903, "grad_norm": 7.834632396697998, "learning_rate": 4.8999569793500885e-05, "loss": 1.6631, "num_input_tokens_seen": 10910752, "step": 20000 }, { "epoch": 0.20509844725468224, "grad_norm": 7.523416519165039, "learning_rate": 4.897455778773812e-05, "loss": 1.6526, "num_input_tokens_seen": 11174272, "step": 20500 }, { "epoch": 0.21010084840723547, "grad_norm": 7.593217372894287, "learning_rate": 4.894954578197535e-05, "loss": 1.6618, "num_input_tokens_seen": 11447056, "step": 21000 }, { "epoch": 0.2151032495597887, "grad_norm": 7.984575271606445, "learning_rate": 4.892453377621258e-05, "loss": 1.6668, "num_input_tokens_seen": 11715856, "step": 21500 }, { "epoch": 0.22010565071234192, "grad_norm": 7.122634410858154, "learning_rate": 4.889952177044982e-05, "loss": 1.6697, "num_input_tokens_seen": 11994600, "step": 22000 }, { "epoch": 0.22510805186489516, "grad_norm": 6.745737552642822, "learning_rate": 4.887450976468705e-05, "loss": 1.6471, "num_input_tokens_seen": 12264272, "step": 22500 }, { "epoch": 0.23011045301744837, "grad_norm": 6.742521286010742, "learning_rate": 4.884949775892428e-05, "loss": 1.6588, "num_input_tokens_seen": 12538760, "step": 23000 }, { "epoch": 0.2351128541700016, "grad_norm": 8.37658977508545, "learning_rate": 4.882448575316152e-05, "loss": 1.667, "num_input_tokens_seen": 12814464, "step": 23500 }, { "epoch": 0.24011525532255482, "grad_norm": 7.458651065826416, "learning_rate": 4.879947374739876e-05, "loss": 1.6459, "num_input_tokens_seen": 13083456, "step": 24000 }, { "epoch": 0.24511765647510805, "grad_norm": 10.364368438720703, "learning_rate": 4.8774461741635986e-05, "loss": 1.651, "num_input_tokens_seen": 13361568, "step": 24500 }, { "epoch": 0.25012005762766126, "grad_norm": 6.404083251953125, "learning_rate": 4.874944973587322e-05, "loss": 1.6271, "num_input_tokens_seen": 13638952, "step": 25000 }, { "epoch": 0.2551224587802145, "grad_norm": 7.239497184753418, "learning_rate": 4.8724437730110456e-05, "loss": 1.6325, "num_input_tokens_seen": 13909896, "step": 25500 }, { "epoch": 0.26012485993276774, "grad_norm": 9.5720796585083, "learning_rate": 4.869942572434769e-05, "loss": 1.6227, "num_input_tokens_seen": 14182688, "step": 26000 }, { "epoch": 0.265127261085321, "grad_norm": 7.255125045776367, "learning_rate": 4.867441371858492e-05, "loss": 1.6328, "num_input_tokens_seen": 14455888, "step": 26500 }, { "epoch": 0.27012966223787416, "grad_norm": 7.990500450134277, "learning_rate": 4.8649401712822154e-05, "loss": 1.6315, "num_input_tokens_seen": 14726048, "step": 27000 }, { "epoch": 0.2751320633904274, "grad_norm": 7.787556171417236, "learning_rate": 4.8624389707059396e-05, "loss": 1.6357, "num_input_tokens_seen": 15005208, "step": 27500 }, { "epoch": 0.28013446454298063, "grad_norm": 6.046635627746582, "learning_rate": 4.8599377701296624e-05, "loss": 1.6173, "num_input_tokens_seen": 15272768, "step": 28000 }, { "epoch": 0.28513686569553387, "grad_norm": 6.547093391418457, "learning_rate": 4.857436569553386e-05, "loss": 1.6067, "num_input_tokens_seen": 15539248, "step": 28500 }, { "epoch": 0.2901392668480871, "grad_norm": 8.07209587097168, "learning_rate": 4.8549353689771094e-05, "loss": 1.6018, "num_input_tokens_seen": 15810072, "step": 29000 }, { "epoch": 0.2951416680006403, "grad_norm": 7.229666709899902, "learning_rate": 4.852434168400833e-05, "loss": 1.6238, "num_input_tokens_seen": 16082968, "step": 29500 }, { "epoch": 0.3001440691531935, "grad_norm": 6.863572597503662, "learning_rate": 4.849932967824556e-05, "loss": 1.6104, "num_input_tokens_seen": 16354784, "step": 30000 }, { "epoch": 0.30514647030574676, "grad_norm": 8.546486854553223, "learning_rate": 4.847431767248279e-05, "loss": 1.5961, "num_input_tokens_seen": 16623816, "step": 30500 }, { "epoch": 0.3101488714583, "grad_norm": 6.493512153625488, "learning_rate": 4.844930566672003e-05, "loss": 1.5977, "num_input_tokens_seen": 16896624, "step": 31000 }, { "epoch": 0.3151512726108532, "grad_norm": 7.902426242828369, "learning_rate": 4.842429366095726e-05, "loss": 1.6049, "num_input_tokens_seen": 17168672, "step": 31500 }, { "epoch": 0.3201536737634064, "grad_norm": 8.033360481262207, "learning_rate": 4.83992816551945e-05, "loss": 1.581, "num_input_tokens_seen": 17433848, "step": 32000 }, { "epoch": 0.32515607491595966, "grad_norm": 7.9239325523376465, "learning_rate": 4.837426964943173e-05, "loss": 1.6029, "num_input_tokens_seen": 17704920, "step": 32500 }, { "epoch": 0.3301584760685129, "grad_norm": 6.995474815368652, "learning_rate": 4.834925764366897e-05, "loss": 1.576, "num_input_tokens_seen": 17969776, "step": 33000 }, { "epoch": 0.33516087722106613, "grad_norm": 8.305245399475098, "learning_rate": 4.8324245637906195e-05, "loss": 1.5909, "num_input_tokens_seen": 18239784, "step": 33500 }, { "epoch": 0.3401632783736193, "grad_norm": 6.429056167602539, "learning_rate": 4.829923363214343e-05, "loss": 1.5742, "num_input_tokens_seen": 18507688, "step": 34000 }, { "epoch": 0.34516567952617255, "grad_norm": 7.208921432495117, "learning_rate": 4.8274221626380665e-05, "loss": 1.573, "num_input_tokens_seen": 18776368, "step": 34500 }, { "epoch": 0.3501680806787258, "grad_norm": 7.433680057525635, "learning_rate": 4.82492096206179e-05, "loss": 1.588, "num_input_tokens_seen": 19050584, "step": 35000 }, { "epoch": 0.35517048183127903, "grad_norm": 6.901820182800293, "learning_rate": 4.8224197614855135e-05, "loss": 1.58, "num_input_tokens_seen": 19333968, "step": 35500 }, { "epoch": 0.36017288298383227, "grad_norm": 8.789533615112305, "learning_rate": 4.819918560909237e-05, "loss": 1.5758, "num_input_tokens_seen": 19612632, "step": 36000 }, { "epoch": 0.36517528413638545, "grad_norm": 7.546513557434082, "learning_rate": 4.8174173603329605e-05, "loss": 1.5763, "num_input_tokens_seen": 19883800, "step": 36500 }, { "epoch": 0.3701776852889387, "grad_norm": 6.489349842071533, "learning_rate": 4.814916159756683e-05, "loss": 1.5866, "num_input_tokens_seen": 20158632, "step": 37000 }, { "epoch": 0.3751800864414919, "grad_norm": 7.696920871734619, "learning_rate": 4.812414959180407e-05, "loss": 1.5858, "num_input_tokens_seen": 20430440, "step": 37500 }, { "epoch": 0.38018248759404516, "grad_norm": 6.559112071990967, "learning_rate": 4.80991375860413e-05, "loss": 1.5596, "num_input_tokens_seen": 20703976, "step": 38000 }, { "epoch": 0.38518488874659834, "grad_norm": 8.480047225952148, "learning_rate": 4.807412558027853e-05, "loss": 1.5533, "num_input_tokens_seen": 20972048, "step": 38500 }, { "epoch": 0.3901872898991516, "grad_norm": 5.950156211853027, "learning_rate": 4.804911357451577e-05, "loss": 1.5642, "num_input_tokens_seen": 21240912, "step": 39000 }, { "epoch": 0.3951896910517048, "grad_norm": 5.799604892730713, "learning_rate": 4.802410156875301e-05, "loss": 1.5598, "num_input_tokens_seen": 21507728, "step": 39500 }, { "epoch": 0.40019209220425805, "grad_norm": 5.980112075805664, "learning_rate": 4.7999089562990236e-05, "loss": 1.5609, "num_input_tokens_seen": 21783160, "step": 40000 }, { "epoch": 0.4051944933568113, "grad_norm": 5.936067581176758, "learning_rate": 4.797407755722747e-05, "loss": 1.5523, "num_input_tokens_seen": 22052792, "step": 40500 }, { "epoch": 0.4101968945093645, "grad_norm": 9.166204452514648, "learning_rate": 4.7949065551464706e-05, "loss": 1.5452, "num_input_tokens_seen": 22324072, "step": 41000 }, { "epoch": 0.4151992956619177, "grad_norm": 8.854216575622559, "learning_rate": 4.792405354570194e-05, "loss": 1.5516, "num_input_tokens_seen": 22590624, "step": 41500 }, { "epoch": 0.42020169681447095, "grad_norm": 9.261016845703125, "learning_rate": 4.789904153993917e-05, "loss": 1.5374, "num_input_tokens_seen": 22862424, "step": 42000 }, { "epoch": 0.4252040979670242, "grad_norm": 7.714609622955322, "learning_rate": 4.7874029534176404e-05, "loss": 1.544, "num_input_tokens_seen": 23128888, "step": 42500 }, { "epoch": 0.4302064991195774, "grad_norm": 5.665945529937744, "learning_rate": 4.784901752841364e-05, "loss": 1.5532, "num_input_tokens_seen": 23407576, "step": 43000 }, { "epoch": 0.4352089002721306, "grad_norm": 6.948183536529541, "learning_rate": 4.7824005522650874e-05, "loss": 1.5467, "num_input_tokens_seen": 23683544, "step": 43500 }, { "epoch": 0.44021130142468384, "grad_norm": 5.725684642791748, "learning_rate": 4.779899351688811e-05, "loss": 1.5295, "num_input_tokens_seen": 23948640, "step": 44000 }, { "epoch": 0.4452137025772371, "grad_norm": 6.168211936950684, "learning_rate": 4.7773981511125344e-05, "loss": 1.5477, "num_input_tokens_seen": 24212584, "step": 44500 }, { "epoch": 0.4502161037297903, "grad_norm": 6.778971195220947, "learning_rate": 4.774896950536258e-05, "loss": 1.5134, "num_input_tokens_seen": 24481104, "step": 45000 }, { "epoch": 0.4552185048823435, "grad_norm": 7.2399210929870605, "learning_rate": 4.772395749959981e-05, "loss": 1.5447, "num_input_tokens_seen": 24756992, "step": 45500 }, { "epoch": 0.46022090603489674, "grad_norm": 6.476212024688721, "learning_rate": 4.769894549383704e-05, "loss": 1.5361, "num_input_tokens_seen": 25020392, "step": 46000 }, { "epoch": 0.46522330718745, "grad_norm": 10.64287281036377, "learning_rate": 4.767393348807428e-05, "loss": 1.5409, "num_input_tokens_seen": 25296728, "step": 46500 }, { "epoch": 0.4702257083400032, "grad_norm": 7.746724605560303, "learning_rate": 4.764892148231151e-05, "loss": 1.4953, "num_input_tokens_seen": 25562256, "step": 47000 }, { "epoch": 0.47522810949255645, "grad_norm": 6.38646125793457, "learning_rate": 4.762390947654875e-05, "loss": 1.5518, "num_input_tokens_seen": 25833464, "step": 47500 }, { "epoch": 0.48023051064510963, "grad_norm": 6.2214555740356445, "learning_rate": 4.759889747078598e-05, "loss": 1.5375, "num_input_tokens_seen": 26108816, "step": 48000 }, { "epoch": 0.48523291179766287, "grad_norm": 7.317322731018066, "learning_rate": 4.757388546502322e-05, "loss": 1.5449, "num_input_tokens_seen": 26385360, "step": 48500 }, { "epoch": 0.4902353129502161, "grad_norm": 6.4762701988220215, "learning_rate": 4.7548873459260445e-05, "loss": 1.5179, "num_input_tokens_seen": 26656488, "step": 49000 }, { "epoch": 0.49523771410276934, "grad_norm": 7.051132678985596, "learning_rate": 4.752386145349768e-05, "loss": 1.5213, "num_input_tokens_seen": 26925744, "step": 49500 }, { "epoch": 0.5002401152553225, "grad_norm": 8.628856658935547, "learning_rate": 4.7498849447734915e-05, "loss": 1.5517, "num_input_tokens_seen": 27202296, "step": 50000 }, { "epoch": 0.5052425164078758, "grad_norm": 6.756930351257324, "learning_rate": 4.747383744197215e-05, "loss": 1.5061, "num_input_tokens_seen": 27469216, "step": 50500 }, { "epoch": 0.510244917560429, "grad_norm": 8.543140411376953, "learning_rate": 4.7448825436209385e-05, "loss": 1.5323, "num_input_tokens_seen": 27746208, "step": 51000 }, { "epoch": 0.5152473187129822, "grad_norm": 7.62526273727417, "learning_rate": 4.742381343044662e-05, "loss": 1.5284, "num_input_tokens_seen": 28024152, "step": 51500 }, { "epoch": 0.5202497198655355, "grad_norm": 6.117633819580078, "learning_rate": 4.7398801424683855e-05, "loss": 1.5304, "num_input_tokens_seen": 28300312, "step": 52000 }, { "epoch": 0.5252521210180887, "grad_norm": 6.280879497528076, "learning_rate": 4.737378941892108e-05, "loss": 1.4873, "num_input_tokens_seen": 28573472, "step": 52500 }, { "epoch": 0.530254522170642, "grad_norm": 6.297519683837891, "learning_rate": 4.734877741315832e-05, "loss": 1.512, "num_input_tokens_seen": 28846960, "step": 53000 }, { "epoch": 0.5352569233231952, "grad_norm": 7.927740097045898, "learning_rate": 4.732376540739555e-05, "loss": 1.5303, "num_input_tokens_seen": 29120648, "step": 53500 }, { "epoch": 0.5402593244757483, "grad_norm": 6.746880054473877, "learning_rate": 4.729875340163278e-05, "loss": 1.5187, "num_input_tokens_seen": 29389712, "step": 54000 }, { "epoch": 0.5452617256283016, "grad_norm": 6.765636920928955, "learning_rate": 4.7273741395870016e-05, "loss": 1.502, "num_input_tokens_seen": 29661920, "step": 54500 }, { "epoch": 0.5502641267808548, "grad_norm": 4.868513107299805, "learning_rate": 4.724872939010726e-05, "loss": 1.499, "num_input_tokens_seen": 29933584, "step": 55000 }, { "epoch": 0.555266527933408, "grad_norm": 6.332437515258789, "learning_rate": 4.722371738434449e-05, "loss": 1.4815, "num_input_tokens_seen": 30200096, "step": 55500 }, { "epoch": 0.5602689290859613, "grad_norm": 6.429714679718018, "learning_rate": 4.719870537858172e-05, "loss": 1.5138, "num_input_tokens_seen": 30475600, "step": 56000 }, { "epoch": 0.5652713302385145, "grad_norm": 6.9108991622924805, "learning_rate": 4.7173693372818956e-05, "loss": 1.5218, "num_input_tokens_seen": 30750536, "step": 56500 }, { "epoch": 0.5702737313910677, "grad_norm": 7.187946796417236, "learning_rate": 4.714868136705619e-05, "loss": 1.4937, "num_input_tokens_seen": 31023928, "step": 57000 }, { "epoch": 0.575276132543621, "grad_norm": 7.880980014801025, "learning_rate": 4.712366936129342e-05, "loss": 1.5149, "num_input_tokens_seen": 31295016, "step": 57500 }, { "epoch": 0.5802785336961742, "grad_norm": 6.050008773803711, "learning_rate": 4.7098657355530654e-05, "loss": 1.5172, "num_input_tokens_seen": 31571344, "step": 58000 }, { "epoch": 0.5852809348487273, "grad_norm": 5.153658390045166, "learning_rate": 4.707364534976789e-05, "loss": 1.499, "num_input_tokens_seen": 31847512, "step": 58500 }, { "epoch": 0.5902833360012806, "grad_norm": 8.292108535766602, "learning_rate": 4.704863334400513e-05, "loss": 1.4897, "num_input_tokens_seen": 32118720, "step": 59000 }, { "epoch": 0.5952857371538338, "grad_norm": 5.900440216064453, "learning_rate": 4.702362133824236e-05, "loss": 1.5195, "num_input_tokens_seen": 32397744, "step": 59500 }, { "epoch": 0.600288138306387, "grad_norm": 7.023585796356201, "learning_rate": 4.6998609332479594e-05, "loss": 1.4755, "num_input_tokens_seen": 32671912, "step": 60000 }, { "epoch": 0.6052905394589403, "grad_norm": 7.419212818145752, "learning_rate": 4.697359732671683e-05, "loss": 1.4844, "num_input_tokens_seen": 32941344, "step": 60500 }, { "epoch": 0.6102929406114935, "grad_norm": 7.654923915863037, "learning_rate": 4.694858532095406e-05, "loss": 1.497, "num_input_tokens_seen": 33221672, "step": 61000 }, { "epoch": 0.6152953417640468, "grad_norm": 6.979129791259766, "learning_rate": 4.692357331519129e-05, "loss": 1.4855, "num_input_tokens_seen": 33489080, "step": 61500 }, { "epoch": 0.6202977429166, "grad_norm": 6.450369834899902, "learning_rate": 4.689856130942853e-05, "loss": 1.4945, "num_input_tokens_seen": 33763488, "step": 62000 }, { "epoch": 0.6253001440691532, "grad_norm": 6.070815563201904, "learning_rate": 4.687354930366576e-05, "loss": 1.4991, "num_input_tokens_seen": 34031328, "step": 62500 }, { "epoch": 0.6303025452217064, "grad_norm": 5.402656078338623, "learning_rate": 4.6848537297903e-05, "loss": 1.4957, "num_input_tokens_seen": 34304608, "step": 63000 }, { "epoch": 0.6353049463742596, "grad_norm": 10.961112022399902, "learning_rate": 4.682352529214023e-05, "loss": 1.4767, "num_input_tokens_seen": 34569600, "step": 63500 }, { "epoch": 0.6403073475268128, "grad_norm": 7.523622035980225, "learning_rate": 4.679851328637747e-05, "loss": 1.4953, "num_input_tokens_seen": 34838720, "step": 64000 }, { "epoch": 0.6453097486793661, "grad_norm": 7.367971420288086, "learning_rate": 4.6773501280614695e-05, "loss": 1.4959, "num_input_tokens_seen": 35110952, "step": 64500 }, { "epoch": 0.6503121498319193, "grad_norm": 7.122432231903076, "learning_rate": 4.674848927485193e-05, "loss": 1.4617, "num_input_tokens_seen": 35379336, "step": 65000 }, { "epoch": 0.6553145509844726, "grad_norm": 6.941073894500732, "learning_rate": 4.6723477269089165e-05, "loss": 1.4828, "num_input_tokens_seen": 35654144, "step": 65500 }, { "epoch": 0.6603169521370258, "grad_norm": 7.309379577636719, "learning_rate": 4.669846526332639e-05, "loss": 1.4695, "num_input_tokens_seen": 35922592, "step": 66000 }, { "epoch": 0.665319353289579, "grad_norm": 8.28540325164795, "learning_rate": 4.6673453257563635e-05, "loss": 1.4853, "num_input_tokens_seen": 36195592, "step": 66500 }, { "epoch": 0.6703217544421323, "grad_norm": 6.311332702636719, "learning_rate": 4.664844125180087e-05, "loss": 1.478, "num_input_tokens_seen": 36471728, "step": 67000 }, { "epoch": 0.6753241555946854, "grad_norm": 6.863243579864502, "learning_rate": 4.6623429246038105e-05, "loss": 1.4935, "num_input_tokens_seen": 36738232, "step": 67500 }, { "epoch": 0.6803265567472386, "grad_norm": 5.535435199737549, "learning_rate": 4.659841724027533e-05, "loss": 1.4689, "num_input_tokens_seen": 37003552, "step": 68000 }, { "epoch": 0.6853289578997919, "grad_norm": 7.348452568054199, "learning_rate": 4.657340523451257e-05, "loss": 1.4802, "num_input_tokens_seen": 37273624, "step": 68500 }, { "epoch": 0.6903313590523451, "grad_norm": 5.919636249542236, "learning_rate": 4.65483932287498e-05, "loss": 1.468, "num_input_tokens_seen": 37542216, "step": 69000 }, { "epoch": 0.6953337602048983, "grad_norm": 6.997893333435059, "learning_rate": 4.652338122298703e-05, "loss": 1.4711, "num_input_tokens_seen": 37817712, "step": 69500 }, { "epoch": 0.7003361613574516, "grad_norm": 7.683621883392334, "learning_rate": 4.6498369217224266e-05, "loss": 1.4467, "num_input_tokens_seen": 38086368, "step": 70000 }, { "epoch": 0.7053385625100048, "grad_norm": 5.56058931350708, "learning_rate": 4.647335721146151e-05, "loss": 1.4631, "num_input_tokens_seen": 38364016, "step": 70500 }, { "epoch": 0.7103409636625581, "grad_norm": 5.151466369628906, "learning_rate": 4.644834520569874e-05, "loss": 1.4776, "num_input_tokens_seen": 38639544, "step": 71000 }, { "epoch": 0.7153433648151113, "grad_norm": 7.764716625213623, "learning_rate": 4.642333319993597e-05, "loss": 1.4629, "num_input_tokens_seen": 38900248, "step": 71500 }, { "epoch": 0.7203457659676645, "grad_norm": 7.205192565917969, "learning_rate": 4.6398321194173206e-05, "loss": 1.4699, "num_input_tokens_seen": 39177440, "step": 72000 }, { "epoch": 0.7253481671202177, "grad_norm": 6.734379768371582, "learning_rate": 4.637330918841044e-05, "loss": 1.4649, "num_input_tokens_seen": 39442976, "step": 72500 }, { "epoch": 0.7303505682727709, "grad_norm": 6.191771507263184, "learning_rate": 4.634829718264767e-05, "loss": 1.4764, "num_input_tokens_seen": 39715104, "step": 73000 }, { "epoch": 0.7353529694253241, "grad_norm": 7.378221035003662, "learning_rate": 4.6323285176884904e-05, "loss": 1.4515, "num_input_tokens_seen": 39990848, "step": 73500 }, { "epoch": 0.7403553705778774, "grad_norm": 6.436953067779541, "learning_rate": 4.629827317112214e-05, "loss": 1.4495, "num_input_tokens_seen": 40258280, "step": 74000 }, { "epoch": 0.7453577717304306, "grad_norm": 5.954966068267822, "learning_rate": 4.6273261165359374e-05, "loss": 1.4497, "num_input_tokens_seen": 40535544, "step": 74500 }, { "epoch": 0.7503601728829838, "grad_norm": 6.085744857788086, "learning_rate": 4.624824915959661e-05, "loss": 1.4428, "num_input_tokens_seen": 40804552, "step": 75000 }, { "epoch": 0.7553625740355371, "grad_norm": 6.737603664398193, "learning_rate": 4.6223237153833844e-05, "loss": 1.4394, "num_input_tokens_seen": 41072144, "step": 75500 }, { "epoch": 0.7603649751880903, "grad_norm": 5.9119439125061035, "learning_rate": 4.619822514807108e-05, "loss": 1.4408, "num_input_tokens_seen": 41341080, "step": 76000 }, { "epoch": 0.7653673763406436, "grad_norm": 7.842981815338135, "learning_rate": 4.617321314230831e-05, "loss": 1.4538, "num_input_tokens_seen": 41613760, "step": 76500 }, { "epoch": 0.7703697774931967, "grad_norm": 7.999574184417725, "learning_rate": 4.614820113654554e-05, "loss": 1.451, "num_input_tokens_seen": 41886512, "step": 77000 }, { "epoch": 0.7753721786457499, "grad_norm": 5.851772308349609, "learning_rate": 4.612318913078278e-05, "loss": 1.4317, "num_input_tokens_seen": 42154824, "step": 77500 }, { "epoch": 0.7803745797983032, "grad_norm": 7.43974494934082, "learning_rate": 4.609817712502001e-05, "loss": 1.4707, "num_input_tokens_seen": 42425080, "step": 78000 }, { "epoch": 0.7853769809508564, "grad_norm": 6.566989898681641, "learning_rate": 4.607316511925725e-05, "loss": 1.4725, "num_input_tokens_seen": 42695896, "step": 78500 }, { "epoch": 0.7903793821034096, "grad_norm": 6.765398979187012, "learning_rate": 4.604815311349448e-05, "loss": 1.4454, "num_input_tokens_seen": 42961864, "step": 79000 }, { "epoch": 0.7953817832559629, "grad_norm": 5.989940643310547, "learning_rate": 4.602314110773172e-05, "loss": 1.4551, "num_input_tokens_seen": 43237224, "step": 79500 }, { "epoch": 0.8003841844085161, "grad_norm": 8.16629409790039, "learning_rate": 4.5998129101968945e-05, "loss": 1.4354, "num_input_tokens_seen": 43512872, "step": 80000 }, { "epoch": 0.8053865855610693, "grad_norm": 6.704333305358887, "learning_rate": 4.597311709620618e-05, "loss": 1.4481, "num_input_tokens_seen": 43779448, "step": 80500 }, { "epoch": 0.8103889867136226, "grad_norm": 6.2965593338012695, "learning_rate": 4.5948105090443415e-05, "loss": 1.4443, "num_input_tokens_seen": 44053296, "step": 81000 }, { "epoch": 0.8153913878661757, "grad_norm": 6.224064350128174, "learning_rate": 4.592309308468064e-05, "loss": 1.47, "num_input_tokens_seen": 44328488, "step": 81500 }, { "epoch": 0.820393789018729, "grad_norm": 6.873196601867676, "learning_rate": 4.589808107891788e-05, "loss": 1.4429, "num_input_tokens_seen": 44604152, "step": 82000 }, { "epoch": 0.8253961901712822, "grad_norm": 6.774177551269531, "learning_rate": 4.587306907315512e-05, "loss": 1.4427, "num_input_tokens_seen": 44880240, "step": 82500 }, { "epoch": 0.8303985913238354, "grad_norm": 7.543479919433594, "learning_rate": 4.5848057067392355e-05, "loss": 1.4559, "num_input_tokens_seen": 45150264, "step": 83000 }, { "epoch": 0.8354009924763887, "grad_norm": 6.445783615112305, "learning_rate": 4.582304506162958e-05, "loss": 1.4215, "num_input_tokens_seen": 45419664, "step": 83500 }, { "epoch": 0.8404033936289419, "grad_norm": 8.083765029907227, "learning_rate": 4.579803305586682e-05, "loss": 1.4636, "num_input_tokens_seen": 45691768, "step": 84000 }, { "epoch": 0.8454057947814951, "grad_norm": 6.205325126647949, "learning_rate": 4.577302105010405e-05, "loss": 1.4386, "num_input_tokens_seen": 45965288, "step": 84500 }, { "epoch": 0.8504081959340484, "grad_norm": 5.954364776611328, "learning_rate": 4.574800904434128e-05, "loss": 1.454, "num_input_tokens_seen": 46239520, "step": 85000 }, { "epoch": 0.8554105970866016, "grad_norm": 7.476288318634033, "learning_rate": 4.5722997038578516e-05, "loss": 1.4375, "num_input_tokens_seen": 46506456, "step": 85500 }, { "epoch": 0.8604129982391548, "grad_norm": 9.656715393066406, "learning_rate": 4.569798503281575e-05, "loss": 1.4293, "num_input_tokens_seen": 46775832, "step": 86000 }, { "epoch": 0.865415399391708, "grad_norm": 5.440873622894287, "learning_rate": 4.567297302705299e-05, "loss": 1.428, "num_input_tokens_seen": 47053184, "step": 86500 }, { "epoch": 0.8704178005442612, "grad_norm": 6.26190710067749, "learning_rate": 4.564796102129022e-05, "loss": 1.4175, "num_input_tokens_seen": 47326160, "step": 87000 }, { "epoch": 0.8754202016968144, "grad_norm": 5.701922416687012, "learning_rate": 4.5622949015527456e-05, "loss": 1.4323, "num_input_tokens_seen": 47596624, "step": 87500 }, { "epoch": 0.8804226028493677, "grad_norm": 7.687532901763916, "learning_rate": 4.559793700976469e-05, "loss": 1.4403, "num_input_tokens_seen": 47866072, "step": 88000 }, { "epoch": 0.8854250040019209, "grad_norm": 4.988935470581055, "learning_rate": 4.557292500400192e-05, "loss": 1.445, "num_input_tokens_seen": 48153664, "step": 88500 }, { "epoch": 0.8904274051544742, "grad_norm": 5.36391544342041, "learning_rate": 4.5547912998239154e-05, "loss": 1.43, "num_input_tokens_seen": 48432248, "step": 89000 }, { "epoch": 0.8954298063070274, "grad_norm": 7.618863105773926, "learning_rate": 4.552290099247639e-05, "loss": 1.4122, "num_input_tokens_seen": 48705584, "step": 89500 }, { "epoch": 0.9004322074595806, "grad_norm": 9.530303001403809, "learning_rate": 4.5497888986713624e-05, "loss": 1.4392, "num_input_tokens_seen": 48985152, "step": 90000 }, { "epoch": 0.9054346086121339, "grad_norm": 7.428534030914307, "learning_rate": 4.547287698095086e-05, "loss": 1.4289, "num_input_tokens_seen": 49262664, "step": 90500 }, { "epoch": 0.910437009764687, "grad_norm": 7.3600287437438965, "learning_rate": 4.5447864975188094e-05, "loss": 1.4242, "num_input_tokens_seen": 49536480, "step": 91000 }, { "epoch": 0.9154394109172402, "grad_norm": 5.594141960144043, "learning_rate": 4.542285296942533e-05, "loss": 1.4266, "num_input_tokens_seen": 49809336, "step": 91500 }, { "epoch": 0.9204418120697935, "grad_norm": 6.597540378570557, "learning_rate": 4.539784096366256e-05, "loss": 1.4216, "num_input_tokens_seen": 50082984, "step": 92000 }, { "epoch": 0.9254442132223467, "grad_norm": 8.180904388427734, "learning_rate": 4.537282895789979e-05, "loss": 1.4374, "num_input_tokens_seen": 50358512, "step": 92500 }, { "epoch": 0.9304466143749, "grad_norm": 7.512216567993164, "learning_rate": 4.534781695213703e-05, "loss": 1.4178, "num_input_tokens_seen": 50634216, "step": 93000 }, { "epoch": 0.9354490155274532, "grad_norm": 6.1448283195495605, "learning_rate": 4.532280494637426e-05, "loss": 1.399, "num_input_tokens_seen": 50902088, "step": 93500 }, { "epoch": 0.9404514166800064, "grad_norm": 6.424488544464111, "learning_rate": 4.52977929406115e-05, "loss": 1.4115, "num_input_tokens_seen": 51174136, "step": 94000 }, { "epoch": 0.9454538178325597, "grad_norm": 5.398598670959473, "learning_rate": 4.527278093484873e-05, "loss": 1.4146, "num_input_tokens_seen": 51453752, "step": 94500 }, { "epoch": 0.9504562189851129, "grad_norm": 6.272931098937988, "learning_rate": 4.524776892908597e-05, "loss": 1.4166, "num_input_tokens_seen": 51728016, "step": 95000 }, { "epoch": 0.955458620137666, "grad_norm": 6.412170886993408, "learning_rate": 4.5222756923323195e-05, "loss": 1.4241, "num_input_tokens_seen": 51995272, "step": 95500 }, { "epoch": 0.9604610212902193, "grad_norm": 7.181222438812256, "learning_rate": 4.519774491756043e-05, "loss": 1.4166, "num_input_tokens_seen": 52276888, "step": 96000 }, { "epoch": 0.9654634224427725, "grad_norm": 6.848874092102051, "learning_rate": 4.5172732911797665e-05, "loss": 1.4028, "num_input_tokens_seen": 52555928, "step": 96500 }, { "epoch": 0.9704658235953257, "grad_norm": 6.6588568687438965, "learning_rate": 4.51477209060349e-05, "loss": 1.3993, "num_input_tokens_seen": 52824472, "step": 97000 }, { "epoch": 0.975468224747879, "grad_norm": 7.8776373863220215, "learning_rate": 4.512270890027213e-05, "loss": 1.4204, "num_input_tokens_seen": 53098176, "step": 97500 }, { "epoch": 0.9804706259004322, "grad_norm": 5.281984806060791, "learning_rate": 4.509769689450937e-05, "loss": 1.4191, "num_input_tokens_seen": 53379376, "step": 98000 }, { "epoch": 0.9854730270529855, "grad_norm": 8.383103370666504, "learning_rate": 4.5072684888746605e-05, "loss": 1.4232, "num_input_tokens_seen": 53654608, "step": 98500 }, { "epoch": 0.9904754282055387, "grad_norm": 5.8474626541137695, "learning_rate": 4.504767288298383e-05, "loss": 1.4099, "num_input_tokens_seen": 53931080, "step": 99000 }, { "epoch": 0.9954778293580919, "grad_norm": 6.058784008026123, "learning_rate": 4.502266087722107e-05, "loss": 1.3993, "num_input_tokens_seen": 54204800, "step": 99500 }, { "epoch": 1.0, "eval_loss": 1.2487133741378784, "eval_runtime": 187.129, "eval_samples_per_second": 1068.274, "eval_steps_per_second": 133.539, "num_input_tokens_seen": 54454616, "step": 99952 }, { "epoch": 1.000480230510645, "grad_norm": 5.304110050201416, "learning_rate": 4.49976488714583e-05, "loss": 1.3882, "num_input_tokens_seen": 54481288, "step": 100000 }, { "epoch": 1.0054826316631984, "grad_norm": 7.098052501678467, "learning_rate": 4.497263686569553e-05, "loss": 1.2973, "num_input_tokens_seen": 54749928, "step": 100500 }, { "epoch": 1.0104850328157515, "grad_norm": 7.15824031829834, "learning_rate": 4.4947624859932766e-05, "loss": 1.3323, "num_input_tokens_seen": 55027920, "step": 101000 }, { "epoch": 1.0154874339683049, "grad_norm": 6.138706684112549, "learning_rate": 4.492261285417e-05, "loss": 1.3195, "num_input_tokens_seen": 55303960, "step": 101500 }, { "epoch": 1.020489835120858, "grad_norm": 8.01395320892334, "learning_rate": 4.4897600848407236e-05, "loss": 1.2913, "num_input_tokens_seen": 55577184, "step": 102000 }, { "epoch": 1.0254922362734111, "grad_norm": 7.413015842437744, "learning_rate": 4.487258884264447e-05, "loss": 1.3284, "num_input_tokens_seen": 55851192, "step": 102500 }, { "epoch": 1.0304946374259645, "grad_norm": 6.665005207061768, "learning_rate": 4.4847576836881706e-05, "loss": 1.3239, "num_input_tokens_seen": 56125184, "step": 103000 }, { "epoch": 1.0354970385785176, "grad_norm": 6.208978652954102, "learning_rate": 4.482256483111894e-05, "loss": 1.3198, "num_input_tokens_seen": 56399640, "step": 103500 }, { "epoch": 1.040499439731071, "grad_norm": 6.494995594024658, "learning_rate": 4.479755282535617e-05, "loss": 1.3036, "num_input_tokens_seen": 56672752, "step": 104000 }, { "epoch": 1.045501840883624, "grad_norm": 7.3449625968933105, "learning_rate": 4.4772540819593404e-05, "loss": 1.3304, "num_input_tokens_seen": 56942744, "step": 104500 }, { "epoch": 1.0505042420361774, "grad_norm": 5.880083084106445, "learning_rate": 4.474752881383064e-05, "loss": 1.3273, "num_input_tokens_seen": 57223568, "step": 105000 }, { "epoch": 1.0555066431887306, "grad_norm": 7.793262004852295, "learning_rate": 4.4722516808067874e-05, "loss": 1.3364, "num_input_tokens_seen": 57501104, "step": 105500 }, { "epoch": 1.060509044341284, "grad_norm": 5.995269298553467, "learning_rate": 4.469750480230511e-05, "loss": 1.3157, "num_input_tokens_seen": 57774032, "step": 106000 }, { "epoch": 1.065511445493837, "grad_norm": 6.386702060699463, "learning_rate": 4.4672492796542344e-05, "loss": 1.2906, "num_input_tokens_seen": 58052328, "step": 106500 }, { "epoch": 1.0705138466463904, "grad_norm": 6.049729347229004, "learning_rate": 4.464748079077958e-05, "loss": 1.3073, "num_input_tokens_seen": 58325864, "step": 107000 }, { "epoch": 1.0755162477989435, "grad_norm": 6.0326433181762695, "learning_rate": 4.462246878501681e-05, "loss": 1.3223, "num_input_tokens_seen": 58605688, "step": 107500 }, { "epoch": 1.0805186489514966, "grad_norm": 7.254247188568115, "learning_rate": 4.459745677925404e-05, "loss": 1.3131, "num_input_tokens_seen": 58875792, "step": 108000 }, { "epoch": 1.08552105010405, "grad_norm": 5.334825038909912, "learning_rate": 4.457244477349128e-05, "loss": 1.3313, "num_input_tokens_seen": 59148200, "step": 108500 }, { "epoch": 1.090523451256603, "grad_norm": 5.982466697692871, "learning_rate": 4.454743276772851e-05, "loss": 1.3031, "num_input_tokens_seen": 59416680, "step": 109000 }, { "epoch": 1.0955258524091565, "grad_norm": 5.858680725097656, "learning_rate": 4.452242076196575e-05, "loss": 1.2964, "num_input_tokens_seen": 59680504, "step": 109500 }, { "epoch": 1.1005282535617096, "grad_norm": 7.001748085021973, "learning_rate": 4.449740875620298e-05, "loss": 1.3203, "num_input_tokens_seen": 59951112, "step": 110000 }, { "epoch": 1.105530654714263, "grad_norm": 7.0456013679504395, "learning_rate": 4.447239675044022e-05, "loss": 1.3229, "num_input_tokens_seen": 60223952, "step": 110500 }, { "epoch": 1.110533055866816, "grad_norm": 8.38005256652832, "learning_rate": 4.4447384744677446e-05, "loss": 1.3045, "num_input_tokens_seen": 60497304, "step": 111000 }, { "epoch": 1.1155354570193694, "grad_norm": 6.44760799407959, "learning_rate": 4.442237273891468e-05, "loss": 1.3298, "num_input_tokens_seen": 60770032, "step": 111500 }, { "epoch": 1.1205378581719225, "grad_norm": 7.661795616149902, "learning_rate": 4.4397360733151915e-05, "loss": 1.299, "num_input_tokens_seen": 61041904, "step": 112000 }, { "epoch": 1.1255402593244757, "grad_norm": 7.2505340576171875, "learning_rate": 4.437234872738915e-05, "loss": 1.3444, "num_input_tokens_seen": 61315792, "step": 112500 }, { "epoch": 1.130542660477029, "grad_norm": 8.16947078704834, "learning_rate": 4.434733672162638e-05, "loss": 1.313, "num_input_tokens_seen": 61591968, "step": 113000 }, { "epoch": 1.1355450616295821, "grad_norm": 6.221188068389893, "learning_rate": 4.4322324715863614e-05, "loss": 1.3266, "num_input_tokens_seen": 61862648, "step": 113500 }, { "epoch": 1.1405474627821355, "grad_norm": 5.967212677001953, "learning_rate": 4.4297312710100855e-05, "loss": 1.3521, "num_input_tokens_seen": 62138024, "step": 114000 }, { "epoch": 1.1455498639346886, "grad_norm": 6.872376441955566, "learning_rate": 4.4272300704338084e-05, "loss": 1.3218, "num_input_tokens_seen": 62414352, "step": 114500 }, { "epoch": 1.150552265087242, "grad_norm": 6.218190670013428, "learning_rate": 4.424728869857532e-05, "loss": 1.3306, "num_input_tokens_seen": 62689104, "step": 115000 }, { "epoch": 1.155554666239795, "grad_norm": 8.191985130310059, "learning_rate": 4.4222276692812553e-05, "loss": 1.3236, "num_input_tokens_seen": 62963216, "step": 115500 }, { "epoch": 1.1605570673923484, "grad_norm": 6.161906719207764, "learning_rate": 4.419726468704979e-05, "loss": 1.3258, "num_input_tokens_seen": 63235456, "step": 116000 }, { "epoch": 1.1655594685449016, "grad_norm": 7.158758640289307, "learning_rate": 4.417225268128702e-05, "loss": 1.3037, "num_input_tokens_seen": 63505248, "step": 116500 }, { "epoch": 1.1705618696974547, "grad_norm": 5.683105945587158, "learning_rate": 4.414724067552425e-05, "loss": 1.3154, "num_input_tokens_seen": 63772504, "step": 117000 }, { "epoch": 1.175564270850008, "grad_norm": 7.0123467445373535, "learning_rate": 4.4122228669761487e-05, "loss": 1.3043, "num_input_tokens_seen": 64045928, "step": 117500 }, { "epoch": 1.1805666720025612, "grad_norm": 5.434397220611572, "learning_rate": 4.409721666399872e-05, "loss": 1.3247, "num_input_tokens_seen": 64313624, "step": 118000 }, { "epoch": 1.1855690731551145, "grad_norm": 6.298323631286621, "learning_rate": 4.4072204658235956e-05, "loss": 1.3333, "num_input_tokens_seen": 64591384, "step": 118500 }, { "epoch": 1.1905714743076676, "grad_norm": 6.530762672424316, "learning_rate": 4.404719265247319e-05, "loss": 1.3324, "num_input_tokens_seen": 64864128, "step": 119000 }, { "epoch": 1.195573875460221, "grad_norm": 7.463630199432373, "learning_rate": 4.4022180646710426e-05, "loss": 1.314, "num_input_tokens_seen": 65134680, "step": 119500 }, { "epoch": 1.200576276612774, "grad_norm": 8.017274856567383, "learning_rate": 4.3997168640947655e-05, "loss": 1.3, "num_input_tokens_seen": 65400128, "step": 120000 }, { "epoch": 1.2055786777653275, "grad_norm": 6.083741188049316, "learning_rate": 4.397215663518489e-05, "loss": 1.3122, "num_input_tokens_seen": 65670200, "step": 120500 }, { "epoch": 1.2105810789178806, "grad_norm": 7.809543609619141, "learning_rate": 4.3947144629422125e-05, "loss": 1.316, "num_input_tokens_seen": 65935248, "step": 121000 }, { "epoch": 1.2155834800704337, "grad_norm": 6.627076148986816, "learning_rate": 4.392213262365936e-05, "loss": 1.3024, "num_input_tokens_seen": 66206584, "step": 121500 }, { "epoch": 1.220585881222987, "grad_norm": 5.432526111602783, "learning_rate": 4.3897120617896594e-05, "loss": 1.3181, "num_input_tokens_seen": 66476424, "step": 122000 }, { "epoch": 1.2255882823755402, "grad_norm": 5.557873249053955, "learning_rate": 4.387210861213383e-05, "loss": 1.3066, "num_input_tokens_seen": 66746568, "step": 122500 }, { "epoch": 1.2305906835280935, "grad_norm": 5.4136738777160645, "learning_rate": 4.384709660637106e-05, "loss": 1.3065, "num_input_tokens_seen": 67013472, "step": 123000 }, { "epoch": 1.2355930846806467, "grad_norm": 4.602624416351318, "learning_rate": 4.382208460060829e-05, "loss": 1.2921, "num_input_tokens_seen": 67284136, "step": 123500 }, { "epoch": 1.2405954858332, "grad_norm": 7.711009502410889, "learning_rate": 4.379707259484553e-05, "loss": 1.3104, "num_input_tokens_seen": 67555712, "step": 124000 }, { "epoch": 1.2455978869857531, "grad_norm": 5.971095561981201, "learning_rate": 4.377206058908276e-05, "loss": 1.3288, "num_input_tokens_seen": 67830816, "step": 124500 }, { "epoch": 1.2506002881383065, "grad_norm": 5.992773056030273, "learning_rate": 4.374704858331999e-05, "loss": 1.3372, "num_input_tokens_seen": 68113208, "step": 125000 }, { "epoch": 1.2556026892908596, "grad_norm": 7.2574238777160645, "learning_rate": 4.372203657755723e-05, "loss": 1.2964, "num_input_tokens_seen": 68376088, "step": 125500 }, { "epoch": 1.2606050904434127, "grad_norm": 4.974996566772461, "learning_rate": 4.369702457179447e-05, "loss": 1.3021, "num_input_tokens_seen": 68641168, "step": 126000 }, { "epoch": 1.265607491595966, "grad_norm": 5.745625019073486, "learning_rate": 4.3672012566031696e-05, "loss": 1.3217, "num_input_tokens_seen": 68909752, "step": 126500 }, { "epoch": 1.2706098927485192, "grad_norm": 6.78819465637207, "learning_rate": 4.364700056026893e-05, "loss": 1.3211, "num_input_tokens_seen": 69181824, "step": 127000 }, { "epoch": 1.2756122939010726, "grad_norm": 7.1991047859191895, "learning_rate": 4.3621988554506166e-05, "loss": 1.3175, "num_input_tokens_seen": 69448304, "step": 127500 }, { "epoch": 1.2806146950536257, "grad_norm": 5.636517524719238, "learning_rate": 4.35969765487434e-05, "loss": 1.308, "num_input_tokens_seen": 69724960, "step": 128000 }, { "epoch": 1.285617096206179, "grad_norm": 6.406187057495117, "learning_rate": 4.357196454298063e-05, "loss": 1.3225, "num_input_tokens_seen": 70004440, "step": 128500 }, { "epoch": 1.2906194973587322, "grad_norm": 5.746100902557373, "learning_rate": 4.3546952537217864e-05, "loss": 1.3084, "num_input_tokens_seen": 70276824, "step": 129000 }, { "epoch": 1.2956218985112855, "grad_norm": 5.6266584396362305, "learning_rate": 4.3521940531455105e-05, "loss": 1.3251, "num_input_tokens_seen": 70549080, "step": 129500 }, { "epoch": 1.3006242996638386, "grad_norm": 6.3568315505981445, "learning_rate": 4.3496928525692334e-05, "loss": 1.2909, "num_input_tokens_seen": 70822216, "step": 130000 }, { "epoch": 1.3056267008163918, "grad_norm": 6.566619873046875, "learning_rate": 4.347191651992957e-05, "loss": 1.3083, "num_input_tokens_seen": 71088152, "step": 130500 }, { "epoch": 1.310629101968945, "grad_norm": 8.060522079467773, "learning_rate": 4.3446904514166804e-05, "loss": 1.3124, "num_input_tokens_seen": 71354416, "step": 131000 }, { "epoch": 1.3156315031214985, "grad_norm": 7.366143226623535, "learning_rate": 4.342189250840404e-05, "loss": 1.317, "num_input_tokens_seen": 71630192, "step": 131500 }, { "epoch": 1.3206339042740516, "grad_norm": 6.985642910003662, "learning_rate": 4.339688050264127e-05, "loss": 1.3115, "num_input_tokens_seen": 71898288, "step": 132000 }, { "epoch": 1.3256363054266047, "grad_norm": 6.185933589935303, "learning_rate": 4.33718684968785e-05, "loss": 1.3227, "num_input_tokens_seen": 72177880, "step": 132500 }, { "epoch": 1.330638706579158, "grad_norm": 5.259435176849365, "learning_rate": 4.334685649111574e-05, "loss": 1.3202, "num_input_tokens_seen": 72456024, "step": 133000 }, { "epoch": 1.3356411077317112, "grad_norm": 6.163081169128418, "learning_rate": 4.332184448535297e-05, "loss": 1.3021, "num_input_tokens_seen": 72724464, "step": 133500 }, { "epoch": 1.3406435088842645, "grad_norm": 5.284718036651611, "learning_rate": 4.329683247959021e-05, "loss": 1.3063, "num_input_tokens_seen": 72991696, "step": 134000 }, { "epoch": 1.3456459100368177, "grad_norm": 6.016850471496582, "learning_rate": 4.327182047382744e-05, "loss": 1.3012, "num_input_tokens_seen": 73261048, "step": 134500 }, { "epoch": 1.3506483111893708, "grad_norm": 6.393965244293213, "learning_rate": 4.3246808468064677e-05, "loss": 1.2991, "num_input_tokens_seen": 73529952, "step": 135000 }, { "epoch": 1.3556507123419241, "grad_norm": 7.240478992462158, "learning_rate": 4.3221796462301905e-05, "loss": 1.3297, "num_input_tokens_seen": 73806208, "step": 135500 }, { "epoch": 1.3606531134944775, "grad_norm": 6.343556880950928, "learning_rate": 4.319678445653914e-05, "loss": 1.3228, "num_input_tokens_seen": 74076360, "step": 136000 }, { "epoch": 1.3656555146470306, "grad_norm": 5.717186450958252, "learning_rate": 4.3171772450776375e-05, "loss": 1.3018, "num_input_tokens_seen": 74350688, "step": 136500 }, { "epoch": 1.3706579157995837, "grad_norm": 5.872751235961914, "learning_rate": 4.314676044501361e-05, "loss": 1.3053, "num_input_tokens_seen": 74623168, "step": 137000 }, { "epoch": 1.375660316952137, "grad_norm": 6.422801971435547, "learning_rate": 4.3121748439250845e-05, "loss": 1.3107, "num_input_tokens_seen": 74892400, "step": 137500 }, { "epoch": 1.3806627181046902, "grad_norm": 5.038456439971924, "learning_rate": 4.309673643348808e-05, "loss": 1.3261, "num_input_tokens_seen": 75161376, "step": 138000 }, { "epoch": 1.3856651192572436, "grad_norm": 6.162600040435791, "learning_rate": 4.3071724427725315e-05, "loss": 1.2904, "num_input_tokens_seen": 75437000, "step": 138500 }, { "epoch": 1.3906675204097967, "grad_norm": 5.364713191986084, "learning_rate": 4.304671242196254e-05, "loss": 1.3162, "num_input_tokens_seen": 75711312, "step": 139000 }, { "epoch": 1.3956699215623498, "grad_norm": 6.959611415863037, "learning_rate": 4.302170041619978e-05, "loss": 1.3231, "num_input_tokens_seen": 75982336, "step": 139500 }, { "epoch": 1.4006723227149032, "grad_norm": 7.737590789794922, "learning_rate": 4.299668841043701e-05, "loss": 1.3175, "num_input_tokens_seen": 76261536, "step": 140000 }, { "epoch": 1.4056747238674565, "grad_norm": 5.541545391082764, "learning_rate": 4.297167640467424e-05, "loss": 1.3075, "num_input_tokens_seen": 76530928, "step": 140500 }, { "epoch": 1.4106771250200096, "grad_norm": 6.196156024932861, "learning_rate": 4.2946664398911476e-05, "loss": 1.3045, "num_input_tokens_seen": 76805928, "step": 141000 }, { "epoch": 1.4156795261725628, "grad_norm": 5.349905490875244, "learning_rate": 4.292165239314872e-05, "loss": 1.3223, "num_input_tokens_seen": 77083224, "step": 141500 }, { "epoch": 1.4206819273251161, "grad_norm": 5.8378586769104, "learning_rate": 4.2896640387385946e-05, "loss": 1.3025, "num_input_tokens_seen": 77352800, "step": 142000 }, { "epoch": 1.4256843284776692, "grad_norm": 6.061739921569824, "learning_rate": 4.287162838162318e-05, "loss": 1.3074, "num_input_tokens_seen": 77625328, "step": 142500 }, { "epoch": 1.4306867296302226, "grad_norm": 5.522953510284424, "learning_rate": 4.2846616375860416e-05, "loss": 1.3052, "num_input_tokens_seen": 77902368, "step": 143000 }, { "epoch": 1.4356891307827757, "grad_norm": 6.295720100402832, "learning_rate": 4.282160437009765e-05, "loss": 1.3118, "num_input_tokens_seen": 78177488, "step": 143500 }, { "epoch": 1.4406915319353288, "grad_norm": 6.575023651123047, "learning_rate": 4.279659236433488e-05, "loss": 1.3093, "num_input_tokens_seen": 78446712, "step": 144000 }, { "epoch": 1.4456939330878822, "grad_norm": 6.984113693237305, "learning_rate": 4.2771580358572114e-05, "loss": 1.3076, "num_input_tokens_seen": 78720880, "step": 144500 }, { "epoch": 1.4506963342404355, "grad_norm": 5.520240306854248, "learning_rate": 4.274656835280935e-05, "loss": 1.3001, "num_input_tokens_seen": 78987824, "step": 145000 }, { "epoch": 1.4556987353929887, "grad_norm": 8.607036590576172, "learning_rate": 4.2721556347046584e-05, "loss": 1.3129, "num_input_tokens_seen": 79265448, "step": 145500 }, { "epoch": 1.4607011365455418, "grad_norm": 5.851890563964844, "learning_rate": 4.269654434128382e-05, "loss": 1.283, "num_input_tokens_seen": 79533224, "step": 146000 }, { "epoch": 1.4657035376980951, "grad_norm": 6.837863922119141, "learning_rate": 4.2671532335521054e-05, "loss": 1.3191, "num_input_tokens_seen": 79806784, "step": 146500 }, { "epoch": 1.4707059388506483, "grad_norm": 8.558204650878906, "learning_rate": 4.264652032975829e-05, "loss": 1.3004, "num_input_tokens_seen": 80082392, "step": 147000 }, { "epoch": 1.4757083400032016, "grad_norm": 5.562234401702881, "learning_rate": 4.262150832399552e-05, "loss": 1.3127, "num_input_tokens_seen": 80357544, "step": 147500 }, { "epoch": 1.4807107411557547, "grad_norm": 6.331244945526123, "learning_rate": 4.259649631823275e-05, "loss": 1.2875, "num_input_tokens_seen": 80619480, "step": 148000 }, { "epoch": 1.4857131423083079, "grad_norm": 7.26661491394043, "learning_rate": 4.257148431246999e-05, "loss": 1.286, "num_input_tokens_seen": 80889016, "step": 148500 }, { "epoch": 1.4907155434608612, "grad_norm": 6.140303134918213, "learning_rate": 4.254647230670722e-05, "loss": 1.3209, "num_input_tokens_seen": 81158600, "step": 149000 }, { "epoch": 1.4957179446134146, "grad_norm": 6.452395439147949, "learning_rate": 4.252146030094446e-05, "loss": 1.3115, "num_input_tokens_seen": 81438680, "step": 149500 }, { "epoch": 1.5007203457659677, "grad_norm": 7.9884257316589355, "learning_rate": 4.249644829518169e-05, "loss": 1.2958, "num_input_tokens_seen": 81705824, "step": 150000 }, { "epoch": 1.5057227469185208, "grad_norm": 5.807667255401611, "learning_rate": 4.247143628941893e-05, "loss": 1.3309, "num_input_tokens_seen": 81978560, "step": 150500 }, { "epoch": 1.5107251480710742, "grad_norm": 6.487443447113037, "learning_rate": 4.2446424283656155e-05, "loss": 1.303, "num_input_tokens_seen": 82250552, "step": 151000 }, { "epoch": 1.5157275492236273, "grad_norm": 7.297651767730713, "learning_rate": 4.242141227789339e-05, "loss": 1.2961, "num_input_tokens_seen": 82528296, "step": 151500 }, { "epoch": 1.5207299503761806, "grad_norm": 6.434643268585205, "learning_rate": 4.2396400272130625e-05, "loss": 1.2926, "num_input_tokens_seen": 82791496, "step": 152000 }, { "epoch": 1.5257323515287338, "grad_norm": 6.918686389923096, "learning_rate": 4.237138826636785e-05, "loss": 1.2998, "num_input_tokens_seen": 83063776, "step": 152500 }, { "epoch": 1.530734752681287, "grad_norm": 5.594851493835449, "learning_rate": 4.2346376260605095e-05, "loss": 1.321, "num_input_tokens_seen": 83339208, "step": 153000 }, { "epoch": 1.5357371538338402, "grad_norm": 6.245510578155518, "learning_rate": 4.232136425484233e-05, "loss": 1.2743, "num_input_tokens_seen": 83610920, "step": 153500 }, { "epoch": 1.5407395549863936, "grad_norm": 6.392094612121582, "learning_rate": 4.2296352249079565e-05, "loss": 1.3062, "num_input_tokens_seen": 83883600, "step": 154000 }, { "epoch": 1.5457419561389467, "grad_norm": 6.538769245147705, "learning_rate": 4.227134024331679e-05, "loss": 1.3111, "num_input_tokens_seen": 84152704, "step": 154500 }, { "epoch": 1.5507443572914998, "grad_norm": 6.384563446044922, "learning_rate": 4.224632823755403e-05, "loss": 1.2767, "num_input_tokens_seen": 84425920, "step": 155000 }, { "epoch": 1.5557467584440532, "grad_norm": 6.407052040100098, "learning_rate": 4.222131623179126e-05, "loss": 1.2865, "num_input_tokens_seen": 84697904, "step": 155500 }, { "epoch": 1.5607491595966063, "grad_norm": 6.534234046936035, "learning_rate": 4.219630422602849e-05, "loss": 1.2817, "num_input_tokens_seen": 84968680, "step": 156000 }, { "epoch": 1.5657515607491597, "grad_norm": 5.641045093536377, "learning_rate": 4.2171292220265726e-05, "loss": 1.2963, "num_input_tokens_seen": 85238032, "step": 156500 }, { "epoch": 1.5707539619017128, "grad_norm": 6.242879867553711, "learning_rate": 4.214628021450297e-05, "loss": 1.2924, "num_input_tokens_seen": 85511200, "step": 157000 }, { "epoch": 1.575756363054266, "grad_norm": 6.90887451171875, "learning_rate": 4.21212682087402e-05, "loss": 1.2968, "num_input_tokens_seen": 85785448, "step": 157500 }, { "epoch": 1.5807587642068193, "grad_norm": 7.269606590270996, "learning_rate": 4.209625620297743e-05, "loss": 1.2845, "num_input_tokens_seen": 86056312, "step": 158000 }, { "epoch": 1.5857611653593726, "grad_norm": 5.152353763580322, "learning_rate": 4.2071244197214666e-05, "loss": 1.2909, "num_input_tokens_seen": 86333312, "step": 158500 }, { "epoch": 1.5907635665119257, "grad_norm": 6.0240631103515625, "learning_rate": 4.20462321914519e-05, "loss": 1.2923, "num_input_tokens_seen": 86610480, "step": 159000 }, { "epoch": 1.5957659676644789, "grad_norm": 7.361881256103516, "learning_rate": 4.202122018568913e-05, "loss": 1.2759, "num_input_tokens_seen": 86882480, "step": 159500 }, { "epoch": 1.6007683688170322, "grad_norm": 5.192800521850586, "learning_rate": 4.1996208179926364e-05, "loss": 1.298, "num_input_tokens_seen": 87144592, "step": 160000 }, { "epoch": 1.6057707699695856, "grad_norm": 7.1856369972229, "learning_rate": 4.19711961741636e-05, "loss": 1.2955, "num_input_tokens_seen": 87420328, "step": 160500 }, { "epoch": 1.6107731711221387, "grad_norm": 5.096145153045654, "learning_rate": 4.1946184168400834e-05, "loss": 1.2923, "num_input_tokens_seen": 87696968, "step": 161000 }, { "epoch": 1.6157755722746918, "grad_norm": 6.808541297912598, "learning_rate": 4.192117216263807e-05, "loss": 1.3044, "num_input_tokens_seen": 87977352, "step": 161500 }, { "epoch": 1.620777973427245, "grad_norm": 5.258007526397705, "learning_rate": 4.1896160156875304e-05, "loss": 1.3023, "num_input_tokens_seen": 88251864, "step": 162000 }, { "epoch": 1.6257803745797983, "grad_norm": 5.184575080871582, "learning_rate": 4.187114815111254e-05, "loss": 1.2594, "num_input_tokens_seen": 88521800, "step": 162500 }, { "epoch": 1.6307827757323516, "grad_norm": 5.858316421508789, "learning_rate": 4.184613614534977e-05, "loss": 1.2854, "num_input_tokens_seen": 88788776, "step": 163000 }, { "epoch": 1.6357851768849048, "grad_norm": 7.03213357925415, "learning_rate": 4.1821124139587e-05, "loss": 1.2745, "num_input_tokens_seen": 89054472, "step": 163500 }, { "epoch": 1.640787578037458, "grad_norm": 7.509394645690918, "learning_rate": 4.179611213382424e-05, "loss": 1.2932, "num_input_tokens_seen": 89332456, "step": 164000 }, { "epoch": 1.6457899791900112, "grad_norm": 7.114541530609131, "learning_rate": 4.177110012806147e-05, "loss": 1.2751, "num_input_tokens_seen": 89609920, "step": 164500 }, { "epoch": 1.6507923803425646, "grad_norm": 14.539456367492676, "learning_rate": 4.174608812229871e-05, "loss": 1.3115, "num_input_tokens_seen": 89875856, "step": 165000 }, { "epoch": 1.6557947814951177, "grad_norm": 5.730625629425049, "learning_rate": 4.172107611653594e-05, "loss": 1.2938, "num_input_tokens_seen": 90148472, "step": 165500 }, { "epoch": 1.6607971826476708, "grad_norm": 5.901363849639893, "learning_rate": 4.169606411077318e-05, "loss": 1.2895, "num_input_tokens_seen": 90424400, "step": 166000 }, { "epoch": 1.665799583800224, "grad_norm": 5.94663667678833, "learning_rate": 4.1671052105010405e-05, "loss": 1.2988, "num_input_tokens_seen": 90702152, "step": 166500 }, { "epoch": 1.6708019849527773, "grad_norm": 5.720317363739014, "learning_rate": 4.164604009924764e-05, "loss": 1.2921, "num_input_tokens_seen": 90980904, "step": 167000 }, { "epoch": 1.6758043861053307, "grad_norm": 8.514877319335938, "learning_rate": 4.1621028093484875e-05, "loss": 1.2762, "num_input_tokens_seen": 91249200, "step": 167500 }, { "epoch": 1.6808067872578838, "grad_norm": 8.756369590759277, "learning_rate": 4.15960160877221e-05, "loss": 1.2898, "num_input_tokens_seen": 91523408, "step": 168000 }, { "epoch": 1.685809188410437, "grad_norm": 4.922306537628174, "learning_rate": 4.1571004081959345e-05, "loss": 1.2873, "num_input_tokens_seen": 91795848, "step": 168500 }, { "epoch": 1.6908115895629903, "grad_norm": 5.668425559997559, "learning_rate": 4.154599207619658e-05, "loss": 1.2662, "num_input_tokens_seen": 92067336, "step": 169000 }, { "epoch": 1.6958139907155436, "grad_norm": 6.631772518157959, "learning_rate": 4.1520980070433815e-05, "loss": 1.3048, "num_input_tokens_seen": 92339392, "step": 169500 }, { "epoch": 1.7008163918680967, "grad_norm": 6.489889144897461, "learning_rate": 4.149596806467104e-05, "loss": 1.2835, "num_input_tokens_seen": 92613216, "step": 170000 }, { "epoch": 1.7058187930206499, "grad_norm": 6.344711780548096, "learning_rate": 4.147095605890828e-05, "loss": 1.2918, "num_input_tokens_seen": 92890872, "step": 170500 }, { "epoch": 1.710821194173203, "grad_norm": 7.276896953582764, "learning_rate": 4.144594405314551e-05, "loss": 1.303, "num_input_tokens_seen": 93161528, "step": 171000 }, { "epoch": 1.7158235953257563, "grad_norm": 6.139397144317627, "learning_rate": 4.142093204738274e-05, "loss": 1.2884, "num_input_tokens_seen": 93434024, "step": 171500 }, { "epoch": 1.7208259964783097, "grad_norm": 5.353676795959473, "learning_rate": 4.1395920041619976e-05, "loss": 1.2865, "num_input_tokens_seen": 93712728, "step": 172000 }, { "epoch": 1.7258283976308628, "grad_norm": 7.979468822479248, "learning_rate": 4.137090803585721e-05, "loss": 1.2884, "num_input_tokens_seen": 93992640, "step": 172500 }, { "epoch": 1.730830798783416, "grad_norm": 5.386059761047363, "learning_rate": 4.134589603009445e-05, "loss": 1.2875, "num_input_tokens_seen": 94262344, "step": 173000 }, { "epoch": 1.7358331999359693, "grad_norm": 4.8488311767578125, "learning_rate": 4.132088402433168e-05, "loss": 1.2636, "num_input_tokens_seen": 94536416, "step": 173500 }, { "epoch": 1.7408356010885226, "grad_norm": 7.375112056732178, "learning_rate": 4.1295872018568916e-05, "loss": 1.276, "num_input_tokens_seen": 94806528, "step": 174000 }, { "epoch": 1.7458380022410758, "grad_norm": 4.830787181854248, "learning_rate": 4.127086001280615e-05, "loss": 1.2681, "num_input_tokens_seen": 95076832, "step": 174500 }, { "epoch": 1.750840403393629, "grad_norm": 5.590123653411865, "learning_rate": 4.124584800704338e-05, "loss": 1.273, "num_input_tokens_seen": 95342672, "step": 175000 }, { "epoch": 1.755842804546182, "grad_norm": 5.334784984588623, "learning_rate": 4.1220836001280614e-05, "loss": 1.29, "num_input_tokens_seen": 95613968, "step": 175500 }, { "epoch": 1.7608452056987354, "grad_norm": 5.795757293701172, "learning_rate": 4.119582399551785e-05, "loss": 1.2683, "num_input_tokens_seen": 95880488, "step": 176000 }, { "epoch": 1.7658476068512887, "grad_norm": 5.436016082763672, "learning_rate": 4.1170811989755084e-05, "loss": 1.2665, "num_input_tokens_seen": 96148192, "step": 176500 }, { "epoch": 1.7708500080038418, "grad_norm": 7.753640174865723, "learning_rate": 4.114579998399232e-05, "loss": 1.3018, "num_input_tokens_seen": 96422808, "step": 177000 }, { "epoch": 1.775852409156395, "grad_norm": 6.833972454071045, "learning_rate": 4.1120787978229554e-05, "loss": 1.2731, "num_input_tokens_seen": 96691008, "step": 177500 }, { "epoch": 1.7808548103089483, "grad_norm": 5.354393482208252, "learning_rate": 4.109577597246679e-05, "loss": 1.2683, "num_input_tokens_seen": 96964040, "step": 178000 }, { "epoch": 1.7858572114615017, "grad_norm": 5.666247367858887, "learning_rate": 4.107076396670402e-05, "loss": 1.3028, "num_input_tokens_seen": 97244192, "step": 178500 }, { "epoch": 1.7908596126140548, "grad_norm": 5.841219902038574, "learning_rate": 4.104575196094125e-05, "loss": 1.2637, "num_input_tokens_seen": 97515856, "step": 179000 }, { "epoch": 1.795862013766608, "grad_norm": 6.097582817077637, "learning_rate": 4.102073995517849e-05, "loss": 1.2602, "num_input_tokens_seen": 97784032, "step": 179500 }, { "epoch": 1.800864414919161, "grad_norm": 6.291224002838135, "learning_rate": 4.099572794941572e-05, "loss": 1.2792, "num_input_tokens_seen": 98061744, "step": 180000 }, { "epoch": 1.8058668160717144, "grad_norm": 6.529845237731934, "learning_rate": 4.097071594365296e-05, "loss": 1.2751, "num_input_tokens_seen": 98333416, "step": 180500 }, { "epoch": 1.8108692172242677, "grad_norm": 5.767446041107178, "learning_rate": 4.094570393789019e-05, "loss": 1.2924, "num_input_tokens_seen": 98611352, "step": 181000 }, { "epoch": 1.8158716183768209, "grad_norm": 11.748208999633789, "learning_rate": 4.092069193212743e-05, "loss": 1.2656, "num_input_tokens_seen": 98875048, "step": 181500 }, { "epoch": 1.820874019529374, "grad_norm": 6.215290069580078, "learning_rate": 4.0895679926364655e-05, "loss": 1.2854, "num_input_tokens_seen": 99142440, "step": 182000 }, { "epoch": 1.8258764206819273, "grad_norm": 4.965378284454346, "learning_rate": 4.087066792060189e-05, "loss": 1.3011, "num_input_tokens_seen": 99420240, "step": 182500 }, { "epoch": 1.8308788218344807, "grad_norm": 4.903427600860596, "learning_rate": 4.0845655914839125e-05, "loss": 1.29, "num_input_tokens_seen": 99700544, "step": 183000 }, { "epoch": 1.8358812229870338, "grad_norm": 7.428767204284668, "learning_rate": 4.082064390907635e-05, "loss": 1.2784, "num_input_tokens_seen": 99976104, "step": 183500 }, { "epoch": 1.840883624139587, "grad_norm": 5.334924697875977, "learning_rate": 4.079563190331359e-05, "loss": 1.2617, "num_input_tokens_seen": 100248360, "step": 184000 }, { "epoch": 1.84588602529214, "grad_norm": 5.380727291107178, "learning_rate": 4.077061989755083e-05, "loss": 1.2824, "num_input_tokens_seen": 100520712, "step": 184500 }, { "epoch": 1.8508884264446934, "grad_norm": 6.993951320648193, "learning_rate": 4.0745607891788065e-05, "loss": 1.3006, "num_input_tokens_seen": 100795856, "step": 185000 }, { "epoch": 1.8558908275972468, "grad_norm": 6.079780578613281, "learning_rate": 4.072059588602529e-05, "loss": 1.2844, "num_input_tokens_seen": 101069232, "step": 185500 }, { "epoch": 1.8608932287498, "grad_norm": 5.772866725921631, "learning_rate": 4.069558388026253e-05, "loss": 1.269, "num_input_tokens_seen": 101347816, "step": 186000 }, { "epoch": 1.865895629902353, "grad_norm": 6.067032337188721, "learning_rate": 4.067057187449976e-05, "loss": 1.2753, "num_input_tokens_seen": 101618784, "step": 186500 }, { "epoch": 1.8708980310549064, "grad_norm": 8.178043365478516, "learning_rate": 4.064555986873699e-05, "loss": 1.2694, "num_input_tokens_seen": 101889416, "step": 187000 }, { "epoch": 1.8759004322074597, "grad_norm": 5.999898433685303, "learning_rate": 4.0620547862974226e-05, "loss": 1.2774, "num_input_tokens_seen": 102163040, "step": 187500 }, { "epoch": 1.8809028333600128, "grad_norm": 7.069881439208984, "learning_rate": 4.059553585721146e-05, "loss": 1.2848, "num_input_tokens_seen": 102431528, "step": 188000 }, { "epoch": 1.885905234512566, "grad_norm": 5.21435546875, "learning_rate": 4.05705238514487e-05, "loss": 1.2599, "num_input_tokens_seen": 102705520, "step": 188500 }, { "epoch": 1.890907635665119, "grad_norm": 6.542243003845215, "learning_rate": 4.054551184568593e-05, "loss": 1.283, "num_input_tokens_seen": 102981304, "step": 189000 }, { "epoch": 1.8959100368176725, "grad_norm": 6.719133377075195, "learning_rate": 4.0520499839923166e-05, "loss": 1.2879, "num_input_tokens_seen": 103259112, "step": 189500 }, { "epoch": 1.9009124379702258, "grad_norm": 6.38728666305542, "learning_rate": 4.04954878341604e-05, "loss": 1.2548, "num_input_tokens_seen": 103534888, "step": 190000 }, { "epoch": 1.905914839122779, "grad_norm": 5.428126811981201, "learning_rate": 4.047047582839763e-05, "loss": 1.28, "num_input_tokens_seen": 103803600, "step": 190500 }, { "epoch": 1.910917240275332, "grad_norm": 5.377976894378662, "learning_rate": 4.0445463822634864e-05, "loss": 1.2658, "num_input_tokens_seen": 104075160, "step": 191000 }, { "epoch": 1.9159196414278854, "grad_norm": 5.453880786895752, "learning_rate": 4.04204518168721e-05, "loss": 1.2643, "num_input_tokens_seen": 104349992, "step": 191500 }, { "epoch": 1.9209220425804387, "grad_norm": 5.114168167114258, "learning_rate": 4.0395439811109334e-05, "loss": 1.2769, "num_input_tokens_seen": 104621104, "step": 192000 }, { "epoch": 1.9259244437329919, "grad_norm": 5.22728157043457, "learning_rate": 4.037042780534657e-05, "loss": 1.2842, "num_input_tokens_seen": 104890976, "step": 192500 }, { "epoch": 1.930926844885545, "grad_norm": 5.4410881996154785, "learning_rate": 4.0345415799583804e-05, "loss": 1.2627, "num_input_tokens_seen": 105165152, "step": 193000 }, { "epoch": 1.9359292460380981, "grad_norm": 5.700538158416748, "learning_rate": 4.032040379382104e-05, "loss": 1.275, "num_input_tokens_seen": 105431920, "step": 193500 }, { "epoch": 1.9409316471906515, "grad_norm": 5.171668529510498, "learning_rate": 4.029539178805827e-05, "loss": 1.2852, "num_input_tokens_seen": 105709976, "step": 194000 }, { "epoch": 1.9459340483432048, "grad_norm": 7.026444911956787, "learning_rate": 4.02703797822955e-05, "loss": 1.2718, "num_input_tokens_seen": 105983472, "step": 194500 }, { "epoch": 1.950936449495758, "grad_norm": 6.670947074890137, "learning_rate": 4.024536777653274e-05, "loss": 1.2574, "num_input_tokens_seen": 106257632, "step": 195000 }, { "epoch": 1.955938850648311, "grad_norm": 5.797586441040039, "learning_rate": 4.022035577076997e-05, "loss": 1.2821, "num_input_tokens_seen": 106533272, "step": 195500 }, { "epoch": 1.9609412518008644, "grad_norm": 7.070456504821777, "learning_rate": 4.019534376500721e-05, "loss": 1.2749, "num_input_tokens_seen": 106804176, "step": 196000 }, { "epoch": 1.9659436529534178, "grad_norm": 5.074236869812012, "learning_rate": 4.017033175924444e-05, "loss": 1.2837, "num_input_tokens_seen": 107077264, "step": 196500 }, { "epoch": 1.970946054105971, "grad_norm": 5.952401161193848, "learning_rate": 4.014531975348168e-05, "loss": 1.2481, "num_input_tokens_seen": 107342400, "step": 197000 }, { "epoch": 1.975948455258524, "grad_norm": 6.63128662109375, "learning_rate": 4.0120307747718905e-05, "loss": 1.2769, "num_input_tokens_seen": 107619760, "step": 197500 }, { "epoch": 1.9809508564110772, "grad_norm": 6.601523399353027, "learning_rate": 4.009529574195614e-05, "loss": 1.275, "num_input_tokens_seen": 107898512, "step": 198000 }, { "epoch": 1.9859532575636305, "grad_norm": 6.857260227203369, "learning_rate": 4.0070283736193375e-05, "loss": 1.2508, "num_input_tokens_seen": 108163904, "step": 198500 }, { "epoch": 1.9909556587161839, "grad_norm": 5.871264934539795, "learning_rate": 4.004527173043061e-05, "loss": 1.2712, "num_input_tokens_seen": 108440480, "step": 199000 }, { "epoch": 1.995958059868737, "grad_norm": 7.567385673522949, "learning_rate": 4.002025972466784e-05, "loss": 1.2801, "num_input_tokens_seen": 108714992, "step": 199500 }, { "epoch": 2.0, "eval_loss": 1.1701077222824097, "eval_runtime": 186.6785, "eval_samples_per_second": 1070.851, "eval_steps_per_second": 133.861, "num_input_tokens_seen": 108935048, "step": 199904 }, { "epoch": 2.00096046102129, "grad_norm": 5.430812835693359, "learning_rate": 3.999524771890507e-05, "loss": 1.2538, "num_input_tokens_seen": 108986608, "step": 200000 }, { "epoch": 2.0059628621738432, "grad_norm": 7.064018249511719, "learning_rate": 3.9970235713142315e-05, "loss": 1.1538, "num_input_tokens_seen": 109256864, "step": 200500 }, { "epoch": 2.010965263326397, "grad_norm": 6.479573726654053, "learning_rate": 3.994522370737954e-05, "loss": 1.1664, "num_input_tokens_seen": 109523672, "step": 201000 }, { "epoch": 2.01596766447895, "grad_norm": 6.595979690551758, "learning_rate": 3.992021170161678e-05, "loss": 1.1338, "num_input_tokens_seen": 109791408, "step": 201500 }, { "epoch": 2.020970065631503, "grad_norm": 7.46008825302124, "learning_rate": 3.989519969585401e-05, "loss": 1.1799, "num_input_tokens_seen": 110064104, "step": 202000 }, { "epoch": 2.025972466784056, "grad_norm": 5.414816379547119, "learning_rate": 3.987018769009124e-05, "loss": 1.1688, "num_input_tokens_seen": 110335328, "step": 202500 }, { "epoch": 2.0309748679366098, "grad_norm": 7.442201137542725, "learning_rate": 3.9845175684328476e-05, "loss": 1.1804, "num_input_tokens_seen": 110611648, "step": 203000 }, { "epoch": 2.035977269089163, "grad_norm": 5.2355475425720215, "learning_rate": 3.982016367856571e-05, "loss": 1.1776, "num_input_tokens_seen": 110883064, "step": 203500 }, { "epoch": 2.040979670241716, "grad_norm": 7.008761882781982, "learning_rate": 3.9795151672802946e-05, "loss": 1.1622, "num_input_tokens_seen": 111162464, "step": 204000 }, { "epoch": 2.045982071394269, "grad_norm": 5.213141918182373, "learning_rate": 3.977013966704018e-05, "loss": 1.1863, "num_input_tokens_seen": 111434960, "step": 204500 }, { "epoch": 2.0509844725468223, "grad_norm": 6.3171000480651855, "learning_rate": 3.9745127661277416e-05, "loss": 1.1735, "num_input_tokens_seen": 111707896, "step": 205000 }, { "epoch": 2.055986873699376, "grad_norm": 5.790093898773193, "learning_rate": 3.972011565551465e-05, "loss": 1.1916, "num_input_tokens_seen": 111976120, "step": 205500 }, { "epoch": 2.060989274851929, "grad_norm": 5.817662239074707, "learning_rate": 3.969510364975188e-05, "loss": 1.1996, "num_input_tokens_seen": 112248384, "step": 206000 }, { "epoch": 2.065991676004482, "grad_norm": 7.098010063171387, "learning_rate": 3.9670091643989114e-05, "loss": 1.1698, "num_input_tokens_seen": 112525640, "step": 206500 }, { "epoch": 2.070994077157035, "grad_norm": 5.172534942626953, "learning_rate": 3.964507963822635e-05, "loss": 1.17, "num_input_tokens_seen": 112794848, "step": 207000 }, { "epoch": 2.0759964783095888, "grad_norm": 5.681086540222168, "learning_rate": 3.9620067632463584e-05, "loss": 1.1812, "num_input_tokens_seen": 113070744, "step": 207500 }, { "epoch": 2.080998879462142, "grad_norm": 6.1572489738464355, "learning_rate": 3.959505562670082e-05, "loss": 1.1733, "num_input_tokens_seen": 113352016, "step": 208000 }, { "epoch": 2.086001280614695, "grad_norm": 6.149631023406982, "learning_rate": 3.9570043620938054e-05, "loss": 1.1733, "num_input_tokens_seen": 113629168, "step": 208500 }, { "epoch": 2.091003681767248, "grad_norm": 4.973777770996094, "learning_rate": 3.954503161517529e-05, "loss": 1.1732, "num_input_tokens_seen": 113899440, "step": 209000 }, { "epoch": 2.0960060829198017, "grad_norm": 12.051576614379883, "learning_rate": 3.952001960941252e-05, "loss": 1.1974, "num_input_tokens_seen": 114170160, "step": 209500 }, { "epoch": 2.101008484072355, "grad_norm": 5.458679676055908, "learning_rate": 3.949500760364975e-05, "loss": 1.1664, "num_input_tokens_seen": 114441696, "step": 210000 }, { "epoch": 2.106010885224908, "grad_norm": 5.033444404602051, "learning_rate": 3.946999559788699e-05, "loss": 1.1781, "num_input_tokens_seen": 114715752, "step": 210500 }, { "epoch": 2.111013286377461, "grad_norm": 5.643963813781738, "learning_rate": 3.944498359212422e-05, "loss": 1.1849, "num_input_tokens_seen": 114996136, "step": 211000 }, { "epoch": 2.1160156875300142, "grad_norm": 6.656442165374756, "learning_rate": 3.941997158636145e-05, "loss": 1.1792, "num_input_tokens_seen": 115259352, "step": 211500 }, { "epoch": 2.121018088682568, "grad_norm": 5.712615013122559, "learning_rate": 3.939495958059869e-05, "loss": 1.1799, "num_input_tokens_seen": 115534944, "step": 212000 }, { "epoch": 2.126020489835121, "grad_norm": 8.317249298095703, "learning_rate": 3.936994757483593e-05, "loss": 1.1853, "num_input_tokens_seen": 115808536, "step": 212500 }, { "epoch": 2.131022890987674, "grad_norm": 6.112279415130615, "learning_rate": 3.9344935569073155e-05, "loss": 1.1822, "num_input_tokens_seen": 116087064, "step": 213000 }, { "epoch": 2.136025292140227, "grad_norm": 7.357901096343994, "learning_rate": 3.931992356331039e-05, "loss": 1.1866, "num_input_tokens_seen": 116365200, "step": 213500 }, { "epoch": 2.1410276932927808, "grad_norm": 5.3409929275512695, "learning_rate": 3.9294911557547625e-05, "loss": 1.1897, "num_input_tokens_seen": 116636120, "step": 214000 }, { "epoch": 2.146030094445334, "grad_norm": 7.562960624694824, "learning_rate": 3.926989955178486e-05, "loss": 1.1809, "num_input_tokens_seen": 116916360, "step": 214500 }, { "epoch": 2.151032495597887, "grad_norm": 5.4258503913879395, "learning_rate": 3.924488754602209e-05, "loss": 1.1871, "num_input_tokens_seen": 117184808, "step": 215000 }, { "epoch": 2.15603489675044, "grad_norm": 6.741093158721924, "learning_rate": 3.9219875540259324e-05, "loss": 1.176, "num_input_tokens_seen": 117454928, "step": 215500 }, { "epoch": 2.1610372979029933, "grad_norm": 6.085869789123535, "learning_rate": 3.9194863534496565e-05, "loss": 1.1789, "num_input_tokens_seen": 117722352, "step": 216000 }, { "epoch": 2.166039699055547, "grad_norm": 6.9086151123046875, "learning_rate": 3.9169851528733793e-05, "loss": 1.189, "num_input_tokens_seen": 117990704, "step": 216500 }, { "epoch": 2.1710421002081, "grad_norm": 5.497861385345459, "learning_rate": 3.914483952297103e-05, "loss": 1.1768, "num_input_tokens_seen": 118255368, "step": 217000 }, { "epoch": 2.176044501360653, "grad_norm": 8.487640380859375, "learning_rate": 3.9119827517208263e-05, "loss": 1.1743, "num_input_tokens_seen": 118525432, "step": 217500 }, { "epoch": 2.181046902513206, "grad_norm": 5.8003435134887695, "learning_rate": 3.90948155114455e-05, "loss": 1.1697, "num_input_tokens_seen": 118799496, "step": 218000 }, { "epoch": 2.18604930366576, "grad_norm": 7.726077079772949, "learning_rate": 3.9069803505682727e-05, "loss": 1.1784, "num_input_tokens_seen": 119074944, "step": 218500 }, { "epoch": 2.191051704818313, "grad_norm": 5.625581741333008, "learning_rate": 3.904479149991996e-05, "loss": 1.1856, "num_input_tokens_seen": 119353528, "step": 219000 }, { "epoch": 2.196054105970866, "grad_norm": 5.582902908325195, "learning_rate": 3.9019779494157196e-05, "loss": 1.1832, "num_input_tokens_seen": 119627520, "step": 219500 }, { "epoch": 2.201056507123419, "grad_norm": 5.2057671546936035, "learning_rate": 3.899476748839443e-05, "loss": 1.1888, "num_input_tokens_seen": 119894432, "step": 220000 }, { "epoch": 2.2060589082759723, "grad_norm": 6.18375825881958, "learning_rate": 3.8969755482631666e-05, "loss": 1.1636, "num_input_tokens_seen": 120165872, "step": 220500 }, { "epoch": 2.211061309428526, "grad_norm": 7.083649158477783, "learning_rate": 3.89447434768689e-05, "loss": 1.1716, "num_input_tokens_seen": 120437360, "step": 221000 }, { "epoch": 2.216063710581079, "grad_norm": 6.966033458709717, "learning_rate": 3.891973147110613e-05, "loss": 1.1899, "num_input_tokens_seen": 120707824, "step": 221500 }, { "epoch": 2.221066111733632, "grad_norm": 4.439563751220703, "learning_rate": 3.8894719465343365e-05, "loss": 1.1714, "num_input_tokens_seen": 120977840, "step": 222000 }, { "epoch": 2.2260685128861852, "grad_norm": 6.870123386383057, "learning_rate": 3.88697074595806e-05, "loss": 1.1793, "num_input_tokens_seen": 121254560, "step": 222500 }, { "epoch": 2.231070914038739, "grad_norm": 8.789484024047852, "learning_rate": 3.8844695453817834e-05, "loss": 1.1851, "num_input_tokens_seen": 121523936, "step": 223000 }, { "epoch": 2.236073315191292, "grad_norm": 6.196369647979736, "learning_rate": 3.881968344805507e-05, "loss": 1.1857, "num_input_tokens_seen": 121795288, "step": 223500 }, { "epoch": 2.241075716343845, "grad_norm": 5.902594566345215, "learning_rate": 3.8794671442292304e-05, "loss": 1.2016, "num_input_tokens_seen": 122065592, "step": 224000 }, { "epoch": 2.246078117496398, "grad_norm": 6.811281681060791, "learning_rate": 3.876965943652954e-05, "loss": 1.1837, "num_input_tokens_seen": 122340584, "step": 224500 }, { "epoch": 2.2510805186489513, "grad_norm": 6.388464450836182, "learning_rate": 3.874464743076677e-05, "loss": 1.1816, "num_input_tokens_seen": 122612352, "step": 225000 }, { "epoch": 2.256082919801505, "grad_norm": 6.045330047607422, "learning_rate": 3.8719635425004e-05, "loss": 1.1835, "num_input_tokens_seen": 122878624, "step": 225500 }, { "epoch": 2.261085320954058, "grad_norm": 7.601827621459961, "learning_rate": 3.869462341924124e-05, "loss": 1.1766, "num_input_tokens_seen": 123143944, "step": 226000 }, { "epoch": 2.266087722106611, "grad_norm": 5.323575496673584, "learning_rate": 3.866961141347847e-05, "loss": 1.1913, "num_input_tokens_seen": 123415600, "step": 226500 }, { "epoch": 2.2710901232591643, "grad_norm": 6.938271522521973, "learning_rate": 3.86445994077157e-05, "loss": 1.1812, "num_input_tokens_seen": 123682608, "step": 227000 }, { "epoch": 2.276092524411718, "grad_norm": 5.9254021644592285, "learning_rate": 3.861958740195294e-05, "loss": 1.1856, "num_input_tokens_seen": 123954888, "step": 227500 }, { "epoch": 2.281094925564271, "grad_norm": 7.544998645782471, "learning_rate": 3.859457539619018e-05, "loss": 1.1957, "num_input_tokens_seen": 124230632, "step": 228000 }, { "epoch": 2.286097326716824, "grad_norm": 6.14992618560791, "learning_rate": 3.8569563390427406e-05, "loss": 1.1894, "num_input_tokens_seen": 124507352, "step": 228500 }, { "epoch": 2.291099727869377, "grad_norm": 5.440382957458496, "learning_rate": 3.854455138466464e-05, "loss": 1.1752, "num_input_tokens_seen": 124781440, "step": 229000 }, { "epoch": 2.2961021290219303, "grad_norm": 7.271317481994629, "learning_rate": 3.8519539378901876e-05, "loss": 1.1749, "num_input_tokens_seen": 125055624, "step": 229500 }, { "epoch": 2.301104530174484, "grad_norm": 5.141626834869385, "learning_rate": 3.849452737313911e-05, "loss": 1.18, "num_input_tokens_seen": 125329000, "step": 230000 }, { "epoch": 2.306106931327037, "grad_norm": 6.321171760559082, "learning_rate": 3.846951536737634e-05, "loss": 1.1741, "num_input_tokens_seen": 125605816, "step": 230500 }, { "epoch": 2.31110933247959, "grad_norm": 5.19276237487793, "learning_rate": 3.8444503361613574e-05, "loss": 1.1966, "num_input_tokens_seen": 125887328, "step": 231000 }, { "epoch": 2.3161117336321433, "grad_norm": 5.9422125816345215, "learning_rate": 3.841949135585081e-05, "loss": 1.1638, "num_input_tokens_seen": 126158768, "step": 231500 }, { "epoch": 2.321114134784697, "grad_norm": 5.361838340759277, "learning_rate": 3.8394479350088044e-05, "loss": 1.1737, "num_input_tokens_seen": 126429432, "step": 232000 }, { "epoch": 2.32611653593725, "grad_norm": 6.030839920043945, "learning_rate": 3.836946734432528e-05, "loss": 1.1975, "num_input_tokens_seen": 126703336, "step": 232500 }, { "epoch": 2.331118937089803, "grad_norm": 6.013172149658203, "learning_rate": 3.8344455338562514e-05, "loss": 1.1785, "num_input_tokens_seen": 126981120, "step": 233000 }, { "epoch": 2.3361213382423562, "grad_norm": 5.227244853973389, "learning_rate": 3.831944333279975e-05, "loss": 1.1934, "num_input_tokens_seen": 127248672, "step": 233500 }, { "epoch": 2.3411237393949094, "grad_norm": 5.995646953582764, "learning_rate": 3.829443132703698e-05, "loss": 1.198, "num_input_tokens_seen": 127525032, "step": 234000 }, { "epoch": 2.346126140547463, "grad_norm": 8.163732528686523, "learning_rate": 3.826941932127421e-05, "loss": 1.1743, "num_input_tokens_seen": 127793384, "step": 234500 }, { "epoch": 2.351128541700016, "grad_norm": 5.394166946411133, "learning_rate": 3.824440731551145e-05, "loss": 1.1726, "num_input_tokens_seen": 128065120, "step": 235000 }, { "epoch": 2.356130942852569, "grad_norm": 5.673594951629639, "learning_rate": 3.821939530974868e-05, "loss": 1.1959, "num_input_tokens_seen": 128334976, "step": 235500 }, { "epoch": 2.3611333440051223, "grad_norm": 5.715531826019287, "learning_rate": 3.8194383303985917e-05, "loss": 1.1936, "num_input_tokens_seen": 128610504, "step": 236000 }, { "epoch": 2.366135745157676, "grad_norm": 5.725061416625977, "learning_rate": 3.816937129822315e-05, "loss": 1.1771, "num_input_tokens_seen": 128881800, "step": 236500 }, { "epoch": 2.371138146310229, "grad_norm": 4.505105972290039, "learning_rate": 3.8144359292460386e-05, "loss": 1.1826, "num_input_tokens_seen": 129157576, "step": 237000 }, { "epoch": 2.376140547462782, "grad_norm": 5.860077857971191, "learning_rate": 3.8119347286697615e-05, "loss": 1.1925, "num_input_tokens_seen": 129432392, "step": 237500 }, { "epoch": 2.3811429486153353, "grad_norm": 6.7791337966918945, "learning_rate": 3.809433528093485e-05, "loss": 1.1746, "num_input_tokens_seen": 129700968, "step": 238000 }, { "epoch": 2.3861453497678884, "grad_norm": 5.708649635314941, "learning_rate": 3.8069323275172085e-05, "loss": 1.1793, "num_input_tokens_seen": 129977384, "step": 238500 }, { "epoch": 2.391147750920442, "grad_norm": 5.659774303436279, "learning_rate": 3.804431126940932e-05, "loss": 1.1797, "num_input_tokens_seen": 130248672, "step": 239000 }, { "epoch": 2.396150152072995, "grad_norm": 6.859200477600098, "learning_rate": 3.8019299263646555e-05, "loss": 1.1853, "num_input_tokens_seen": 130522208, "step": 239500 }, { "epoch": 2.401152553225548, "grad_norm": 6.860942840576172, "learning_rate": 3.799428725788379e-05, "loss": 1.1922, "num_input_tokens_seen": 130799088, "step": 240000 }, { "epoch": 2.4061549543781013, "grad_norm": 6.199068069458008, "learning_rate": 3.7969275252121025e-05, "loss": 1.1825, "num_input_tokens_seen": 131067984, "step": 240500 }, { "epoch": 2.411157355530655, "grad_norm": 5.724475383758545, "learning_rate": 3.794426324635825e-05, "loss": 1.168, "num_input_tokens_seen": 131340552, "step": 241000 }, { "epoch": 2.416159756683208, "grad_norm": 5.187953472137451, "learning_rate": 3.791925124059549e-05, "loss": 1.1875, "num_input_tokens_seen": 131613968, "step": 241500 }, { "epoch": 2.421162157835761, "grad_norm": 6.069790363311768, "learning_rate": 3.789423923483272e-05, "loss": 1.1866, "num_input_tokens_seen": 131880736, "step": 242000 }, { "epoch": 2.4261645589883143, "grad_norm": 6.761556148529053, "learning_rate": 3.786922722906995e-05, "loss": 1.205, "num_input_tokens_seen": 132150456, "step": 242500 }, { "epoch": 2.4311669601408674, "grad_norm": 5.816013336181641, "learning_rate": 3.7844215223307186e-05, "loss": 1.1938, "num_input_tokens_seen": 132424832, "step": 243000 }, { "epoch": 2.436169361293421, "grad_norm": 6.447406768798828, "learning_rate": 3.781920321754443e-05, "loss": 1.1792, "num_input_tokens_seen": 132691704, "step": 243500 }, { "epoch": 2.441171762445974, "grad_norm": 6.802369117736816, "learning_rate": 3.7794191211781656e-05, "loss": 1.1891, "num_input_tokens_seen": 132962376, "step": 244000 }, { "epoch": 2.4461741635985272, "grad_norm": 5.149132251739502, "learning_rate": 3.776917920601889e-05, "loss": 1.1691, "num_input_tokens_seen": 133236272, "step": 244500 }, { "epoch": 2.4511765647510804, "grad_norm": 6.554666996002197, "learning_rate": 3.7744167200256126e-05, "loss": 1.1958, "num_input_tokens_seen": 133505504, "step": 245000 }, { "epoch": 2.456178965903634, "grad_norm": 5.13792610168457, "learning_rate": 3.771915519449336e-05, "loss": 1.1771, "num_input_tokens_seen": 133785904, "step": 245500 }, { "epoch": 2.461181367056187, "grad_norm": 4.5011491775512695, "learning_rate": 3.769414318873059e-05, "loss": 1.192, "num_input_tokens_seen": 134055360, "step": 246000 }, { "epoch": 2.46618376820874, "grad_norm": 7.41070556640625, "learning_rate": 3.7669131182967824e-05, "loss": 1.1682, "num_input_tokens_seen": 134321528, "step": 246500 }, { "epoch": 2.4711861693612933, "grad_norm": 7.749119281768799, "learning_rate": 3.764411917720506e-05, "loss": 1.188, "num_input_tokens_seen": 134595208, "step": 247000 }, { "epoch": 2.4761885705138464, "grad_norm": 5.476714134216309, "learning_rate": 3.7619107171442294e-05, "loss": 1.1668, "num_input_tokens_seen": 134869136, "step": 247500 }, { "epoch": 2.4811909716664, "grad_norm": 5.9990010261535645, "learning_rate": 3.759409516567953e-05, "loss": 1.182, "num_input_tokens_seen": 135144112, "step": 248000 }, { "epoch": 2.486193372818953, "grad_norm": 5.635094165802002, "learning_rate": 3.7569083159916764e-05, "loss": 1.1861, "num_input_tokens_seen": 135413984, "step": 248500 }, { "epoch": 2.4911957739715063, "grad_norm": 5.974431991577148, "learning_rate": 3.7544071154154e-05, "loss": 1.1927, "num_input_tokens_seen": 135697672, "step": 249000 }, { "epoch": 2.4961981751240594, "grad_norm": 6.688498497009277, "learning_rate": 3.751905914839123e-05, "loss": 1.2016, "num_input_tokens_seen": 135975272, "step": 249500 }, { "epoch": 2.501200576276613, "grad_norm": 8.589900970458984, "learning_rate": 3.749404714262846e-05, "loss": 1.1711, "num_input_tokens_seen": 136241376, "step": 250000 }, { "epoch": 2.506202977429166, "grad_norm": 6.064274311065674, "learning_rate": 3.74690351368657e-05, "loss": 1.2027, "num_input_tokens_seen": 136510824, "step": 250500 }, { "epoch": 2.511205378581719, "grad_norm": 5.36790657043457, "learning_rate": 3.744402313110293e-05, "loss": 1.189, "num_input_tokens_seen": 136777568, "step": 251000 }, { "epoch": 2.5162077797342723, "grad_norm": 5.9187703132629395, "learning_rate": 3.741901112534017e-05, "loss": 1.1894, "num_input_tokens_seen": 137049696, "step": 251500 }, { "epoch": 2.5212101808868255, "grad_norm": 5.2425007820129395, "learning_rate": 3.73939991195774e-05, "loss": 1.2087, "num_input_tokens_seen": 137319688, "step": 252000 }, { "epoch": 2.526212582039379, "grad_norm": 6.622330188751221, "learning_rate": 3.736898711381464e-05, "loss": 1.172, "num_input_tokens_seen": 137592360, "step": 252500 }, { "epoch": 2.531214983191932, "grad_norm": 5.9546122550964355, "learning_rate": 3.7343975108051865e-05, "loss": 1.1784, "num_input_tokens_seen": 137869696, "step": 253000 }, { "epoch": 2.5362173843444853, "grad_norm": 6.10466194152832, "learning_rate": 3.73189631022891e-05, "loss": 1.1806, "num_input_tokens_seen": 138146440, "step": 253500 }, { "epoch": 2.5412197854970384, "grad_norm": 7.046773433685303, "learning_rate": 3.7293951096526335e-05, "loss": 1.187, "num_input_tokens_seen": 138432672, "step": 254000 }, { "epoch": 2.546222186649592, "grad_norm": 5.8726115226745605, "learning_rate": 3.726893909076356e-05, "loss": 1.1769, "num_input_tokens_seen": 138704056, "step": 254500 }, { "epoch": 2.551224587802145, "grad_norm": 6.145564079284668, "learning_rate": 3.7243927085000805e-05, "loss": 1.1783, "num_input_tokens_seen": 138972048, "step": 255000 }, { "epoch": 2.5562269889546982, "grad_norm": 8.949604988098145, "learning_rate": 3.721891507923804e-05, "loss": 1.1928, "num_input_tokens_seen": 139249808, "step": 255500 }, { "epoch": 2.5612293901072514, "grad_norm": 6.0869975090026855, "learning_rate": 3.7193903073475275e-05, "loss": 1.2032, "num_input_tokens_seen": 139528824, "step": 256000 }, { "epoch": 2.5662317912598045, "grad_norm": 6.634551048278809, "learning_rate": 3.71688910677125e-05, "loss": 1.1977, "num_input_tokens_seen": 139798352, "step": 256500 }, { "epoch": 2.571234192412358, "grad_norm": 5.805966377258301, "learning_rate": 3.714387906194974e-05, "loss": 1.1725, "num_input_tokens_seen": 140071304, "step": 257000 }, { "epoch": 2.576236593564911, "grad_norm": 5.509829998016357, "learning_rate": 3.711886705618697e-05, "loss": 1.1794, "num_input_tokens_seen": 140351576, "step": 257500 }, { "epoch": 2.5812389947174643, "grad_norm": 7.246334552764893, "learning_rate": 3.70938550504242e-05, "loss": 1.1638, "num_input_tokens_seen": 140629520, "step": 258000 }, { "epoch": 2.5862413958700174, "grad_norm": 5.683703899383545, "learning_rate": 3.7068843044661436e-05, "loss": 1.1772, "num_input_tokens_seen": 140907480, "step": 258500 }, { "epoch": 2.591243797022571, "grad_norm": 5.520617485046387, "learning_rate": 3.704383103889867e-05, "loss": 1.1874, "num_input_tokens_seen": 141174448, "step": 259000 }, { "epoch": 2.596246198175124, "grad_norm": 6.609923839569092, "learning_rate": 3.701881903313591e-05, "loss": 1.1954, "num_input_tokens_seen": 141451848, "step": 259500 }, { "epoch": 2.6012485993276773, "grad_norm": 5.208652973175049, "learning_rate": 3.699380702737314e-05, "loss": 1.1777, "num_input_tokens_seen": 141719928, "step": 260000 }, { "epoch": 2.6062510004802304, "grad_norm": 6.525882720947266, "learning_rate": 3.6968795021610376e-05, "loss": 1.1725, "num_input_tokens_seen": 141993992, "step": 260500 }, { "epoch": 2.6112534016327835, "grad_norm": 6.694952011108398, "learning_rate": 3.694378301584761e-05, "loss": 1.1764, "num_input_tokens_seen": 142262512, "step": 261000 }, { "epoch": 2.616255802785337, "grad_norm": 6.036692142486572, "learning_rate": 3.691877101008484e-05, "loss": 1.1863, "num_input_tokens_seen": 142535016, "step": 261500 }, { "epoch": 2.62125820393789, "grad_norm": 4.5128021240234375, "learning_rate": 3.6893759004322074e-05, "loss": 1.1914, "num_input_tokens_seen": 142817040, "step": 262000 }, { "epoch": 2.6262606050904433, "grad_norm": 6.445744037628174, "learning_rate": 3.686874699855931e-05, "loss": 1.1938, "num_input_tokens_seen": 143093520, "step": 262500 }, { "epoch": 2.631263006242997, "grad_norm": 5.805507183074951, "learning_rate": 3.6843734992796544e-05, "loss": 1.176, "num_input_tokens_seen": 143361184, "step": 263000 }, { "epoch": 2.63626540739555, "grad_norm": 7.369002819061279, "learning_rate": 3.681872298703378e-05, "loss": 1.1737, "num_input_tokens_seen": 143633104, "step": 263500 }, { "epoch": 2.641267808548103, "grad_norm": 5.3200459480285645, "learning_rate": 3.6793710981271014e-05, "loss": 1.1853, "num_input_tokens_seen": 143903304, "step": 264000 }, { "epoch": 2.6462702097006563, "grad_norm": 4.868594169616699, "learning_rate": 3.676869897550825e-05, "loss": 1.1924, "num_input_tokens_seen": 144176568, "step": 264500 }, { "epoch": 2.6512726108532094, "grad_norm": 6.198353290557861, "learning_rate": 3.674368696974548e-05, "loss": 1.1854, "num_input_tokens_seen": 144457912, "step": 265000 }, { "epoch": 2.6562750120057625, "grad_norm": 5.720507621765137, "learning_rate": 3.671867496398271e-05, "loss": 1.1922, "num_input_tokens_seen": 144733560, "step": 265500 }, { "epoch": 2.661277413158316, "grad_norm": 6.092404365539551, "learning_rate": 3.669366295821995e-05, "loss": 1.1784, "num_input_tokens_seen": 145006896, "step": 266000 }, { "epoch": 2.6662798143108692, "grad_norm": 5.7721266746521, "learning_rate": 3.666865095245718e-05, "loss": 1.1682, "num_input_tokens_seen": 145276408, "step": 266500 }, { "epoch": 2.6712822154634224, "grad_norm": 5.34429407119751, "learning_rate": 3.664363894669442e-05, "loss": 1.2014, "num_input_tokens_seen": 145549104, "step": 267000 }, { "epoch": 2.676284616615976, "grad_norm": 5.627655982971191, "learning_rate": 3.661862694093165e-05, "loss": 1.1873, "num_input_tokens_seen": 145813696, "step": 267500 }, { "epoch": 2.681287017768529, "grad_norm": 5.520989894866943, "learning_rate": 3.659361493516889e-05, "loss": 1.1801, "num_input_tokens_seen": 146081824, "step": 268000 }, { "epoch": 2.686289418921082, "grad_norm": 5.194046497344971, "learning_rate": 3.6568602929406115e-05, "loss": 1.2043, "num_input_tokens_seen": 146359992, "step": 268500 }, { "epoch": 2.6912918200736353, "grad_norm": 7.6289753913879395, "learning_rate": 3.654359092364335e-05, "loss": 1.1667, "num_input_tokens_seen": 146635688, "step": 269000 }, { "epoch": 2.6962942212261884, "grad_norm": 6.4248504638671875, "learning_rate": 3.6518578917880585e-05, "loss": 1.182, "num_input_tokens_seen": 146907280, "step": 269500 }, { "epoch": 2.7012966223787416, "grad_norm": 7.369548797607422, "learning_rate": 3.649356691211781e-05, "loss": 1.1863, "num_input_tokens_seen": 147180456, "step": 270000 }, { "epoch": 2.706299023531295, "grad_norm": 7.325328826904297, "learning_rate": 3.646855490635505e-05, "loss": 1.1731, "num_input_tokens_seen": 147447936, "step": 270500 }, { "epoch": 2.7113014246838483, "grad_norm": 6.618239879608154, "learning_rate": 3.644354290059229e-05, "loss": 1.1898, "num_input_tokens_seen": 147714576, "step": 271000 }, { "epoch": 2.7163038258364014, "grad_norm": 6.6161932945251465, "learning_rate": 3.6418530894829525e-05, "loss": 1.1757, "num_input_tokens_seen": 147982616, "step": 271500 }, { "epoch": 2.721306226988955, "grad_norm": 4.964172840118408, "learning_rate": 3.639351888906675e-05, "loss": 1.1822, "num_input_tokens_seen": 148261928, "step": 272000 }, { "epoch": 2.726308628141508, "grad_norm": 5.542762756347656, "learning_rate": 3.636850688330399e-05, "loss": 1.1979, "num_input_tokens_seen": 148537656, "step": 272500 }, { "epoch": 2.731311029294061, "grad_norm": 6.122353553771973, "learning_rate": 3.634349487754122e-05, "loss": 1.1837, "num_input_tokens_seen": 148805656, "step": 273000 }, { "epoch": 2.7363134304466143, "grad_norm": 5.522734642028809, "learning_rate": 3.631848287177845e-05, "loss": 1.1755, "num_input_tokens_seen": 149071656, "step": 273500 }, { "epoch": 2.7413158315991675, "grad_norm": 7.560063362121582, "learning_rate": 3.6293470866015686e-05, "loss": 1.1801, "num_input_tokens_seen": 149339936, "step": 274000 }, { "epoch": 2.7463182327517206, "grad_norm": 5.46027135848999, "learning_rate": 3.626845886025292e-05, "loss": 1.1734, "num_input_tokens_seen": 149616152, "step": 274500 }, { "epoch": 2.751320633904274, "grad_norm": 5.810853004455566, "learning_rate": 3.624344685449016e-05, "loss": 1.1806, "num_input_tokens_seen": 149886976, "step": 275000 }, { "epoch": 2.7563230350568273, "grad_norm": 5.957060813903809, "learning_rate": 3.621843484872739e-05, "loss": 1.1777, "num_input_tokens_seen": 150161384, "step": 275500 }, { "epoch": 2.7613254362093804, "grad_norm": 8.7448148727417, "learning_rate": 3.6193422842964626e-05, "loss": 1.1535, "num_input_tokens_seen": 150426192, "step": 276000 }, { "epoch": 2.766327837361934, "grad_norm": 6.24728536605835, "learning_rate": 3.616841083720186e-05, "loss": 1.1741, "num_input_tokens_seen": 150694480, "step": 276500 }, { "epoch": 2.771330238514487, "grad_norm": 8.271539688110352, "learning_rate": 3.614339883143909e-05, "loss": 1.1853, "num_input_tokens_seen": 150965896, "step": 277000 }, { "epoch": 2.7763326396670402, "grad_norm": 6.075042247772217, "learning_rate": 3.6118386825676324e-05, "loss": 1.1729, "num_input_tokens_seen": 151239800, "step": 277500 }, { "epoch": 2.7813350408195934, "grad_norm": 7.93595552444458, "learning_rate": 3.609337481991356e-05, "loss": 1.185, "num_input_tokens_seen": 151512560, "step": 278000 }, { "epoch": 2.7863374419721465, "grad_norm": 7.406468868255615, "learning_rate": 3.6068362814150794e-05, "loss": 1.1908, "num_input_tokens_seen": 151789264, "step": 278500 }, { "epoch": 2.7913398431246996, "grad_norm": 6.6226325035095215, "learning_rate": 3.604335080838803e-05, "loss": 1.1696, "num_input_tokens_seen": 152057600, "step": 279000 }, { "epoch": 2.796342244277253, "grad_norm": 5.142138481140137, "learning_rate": 3.6018338802625264e-05, "loss": 1.1694, "num_input_tokens_seen": 152328800, "step": 279500 }, { "epoch": 2.8013446454298063, "grad_norm": 6.834789752960205, "learning_rate": 3.59933267968625e-05, "loss": 1.1798, "num_input_tokens_seen": 152610624, "step": 280000 }, { "epoch": 2.8063470465823595, "grad_norm": 5.720213890075684, "learning_rate": 3.596831479109973e-05, "loss": 1.1781, "num_input_tokens_seen": 152876576, "step": 280500 }, { "epoch": 2.811349447734913, "grad_norm": 6.060703277587891, "learning_rate": 3.594330278533696e-05, "loss": 1.1781, "num_input_tokens_seen": 153146728, "step": 281000 }, { "epoch": 2.816351848887466, "grad_norm": 7.374409198760986, "learning_rate": 3.59182907795742e-05, "loss": 1.1925, "num_input_tokens_seen": 153416872, "step": 281500 }, { "epoch": 2.8213542500400193, "grad_norm": 6.183439254760742, "learning_rate": 3.5893278773811425e-05, "loss": 1.1588, "num_input_tokens_seen": 153688560, "step": 282000 }, { "epoch": 2.8263566511925724, "grad_norm": 7.167964935302734, "learning_rate": 3.586826676804867e-05, "loss": 1.1799, "num_input_tokens_seen": 153968432, "step": 282500 }, { "epoch": 2.8313590523451255, "grad_norm": 5.514324188232422, "learning_rate": 3.58432547622859e-05, "loss": 1.1695, "num_input_tokens_seen": 154241312, "step": 283000 }, { "epoch": 2.8363614534976787, "grad_norm": 4.6626667976379395, "learning_rate": 3.581824275652314e-05, "loss": 1.1876, "num_input_tokens_seen": 154513432, "step": 283500 }, { "epoch": 2.8413638546502322, "grad_norm": 5.130783557891846, "learning_rate": 3.5793230750760365e-05, "loss": 1.1806, "num_input_tokens_seen": 154791160, "step": 284000 }, { "epoch": 2.8463662558027853, "grad_norm": 6.905600547790527, "learning_rate": 3.57682187449976e-05, "loss": 1.1774, "num_input_tokens_seen": 155060824, "step": 284500 }, { "epoch": 2.8513686569553385, "grad_norm": 5.459284782409668, "learning_rate": 3.5743206739234835e-05, "loss": 1.17, "num_input_tokens_seen": 155335544, "step": 285000 }, { "epoch": 2.856371058107892, "grad_norm": 6.299667835235596, "learning_rate": 3.571819473347206e-05, "loss": 1.1751, "num_input_tokens_seen": 155613112, "step": 285500 }, { "epoch": 2.861373459260445, "grad_norm": 5.297176361083984, "learning_rate": 3.56931827277093e-05, "loss": 1.173, "num_input_tokens_seen": 155888552, "step": 286000 }, { "epoch": 2.8663758604129983, "grad_norm": 7.074682712554932, "learning_rate": 3.566817072194654e-05, "loss": 1.1753, "num_input_tokens_seen": 156163384, "step": 286500 }, { "epoch": 2.8713782615655514, "grad_norm": 7.402191638946533, "learning_rate": 3.5643158716183775e-05, "loss": 1.1763, "num_input_tokens_seen": 156433192, "step": 287000 }, { "epoch": 2.8763806627181046, "grad_norm": 6.5372419357299805, "learning_rate": 3.5618146710421e-05, "loss": 1.1931, "num_input_tokens_seen": 156704752, "step": 287500 }, { "epoch": 2.8813830638706577, "grad_norm": 6.030176162719727, "learning_rate": 3.559313470465824e-05, "loss": 1.1808, "num_input_tokens_seen": 156979192, "step": 288000 }, { "epoch": 2.8863854650232112, "grad_norm": 5.74777364730835, "learning_rate": 3.556812269889547e-05, "loss": 1.1693, "num_input_tokens_seen": 157251448, "step": 288500 }, { "epoch": 2.8913878661757644, "grad_norm": 5.995535373687744, "learning_rate": 3.55431106931327e-05, "loss": 1.1637, "num_input_tokens_seen": 157518624, "step": 289000 }, { "epoch": 2.8963902673283175, "grad_norm": 7.268390655517578, "learning_rate": 3.5518098687369936e-05, "loss": 1.171, "num_input_tokens_seen": 157789104, "step": 289500 }, { "epoch": 2.901392668480871, "grad_norm": 6.150352478027344, "learning_rate": 3.549308668160717e-05, "loss": 1.1607, "num_input_tokens_seen": 158064360, "step": 290000 }, { "epoch": 2.906395069633424, "grad_norm": 9.338305473327637, "learning_rate": 3.5468074675844406e-05, "loss": 1.1777, "num_input_tokens_seen": 158337456, "step": 290500 }, { "epoch": 2.9113974707859773, "grad_norm": 7.0623674392700195, "learning_rate": 3.544306267008164e-05, "loss": 1.1673, "num_input_tokens_seen": 158610232, "step": 291000 }, { "epoch": 2.9163998719385305, "grad_norm": 6.665122985839844, "learning_rate": 3.5418050664318876e-05, "loss": 1.1764, "num_input_tokens_seen": 158883800, "step": 291500 }, { "epoch": 2.9214022730910836, "grad_norm": 5.489156723022461, "learning_rate": 3.539303865855611e-05, "loss": 1.1674, "num_input_tokens_seen": 159156920, "step": 292000 }, { "epoch": 2.9264046742436367, "grad_norm": 4.9325456619262695, "learning_rate": 3.536802665279334e-05, "loss": 1.1724, "num_input_tokens_seen": 159422696, "step": 292500 }, { "epoch": 2.9314070753961903, "grad_norm": 4.590809345245361, "learning_rate": 3.5343014647030574e-05, "loss": 1.1691, "num_input_tokens_seen": 159694472, "step": 293000 }, { "epoch": 2.9364094765487434, "grad_norm": 5.634531497955322, "learning_rate": 3.531800264126781e-05, "loss": 1.1875, "num_input_tokens_seen": 159966504, "step": 293500 }, { "epoch": 2.9414118777012965, "grad_norm": 7.636883735656738, "learning_rate": 3.5292990635505044e-05, "loss": 1.1771, "num_input_tokens_seen": 160236568, "step": 294000 }, { "epoch": 2.94641427885385, "grad_norm": 4.785983562469482, "learning_rate": 3.526797862974228e-05, "loss": 1.1746, "num_input_tokens_seen": 160505800, "step": 294500 }, { "epoch": 2.9514166800064032, "grad_norm": 5.1736931800842285, "learning_rate": 3.5242966623979514e-05, "loss": 1.1753, "num_input_tokens_seen": 160785240, "step": 295000 }, { "epoch": 2.9564190811589564, "grad_norm": 6.308248519897461, "learning_rate": 3.521795461821675e-05, "loss": 1.1851, "num_input_tokens_seen": 161049888, "step": 295500 }, { "epoch": 2.9614214823115095, "grad_norm": 6.6797404289245605, "learning_rate": 3.519294261245398e-05, "loss": 1.1486, "num_input_tokens_seen": 161316952, "step": 296000 }, { "epoch": 2.9664238834640626, "grad_norm": 5.875812530517578, "learning_rate": 3.516793060669121e-05, "loss": 1.1689, "num_input_tokens_seen": 161584440, "step": 296500 }, { "epoch": 2.9714262846166157, "grad_norm": 6.539891719818115, "learning_rate": 3.514291860092845e-05, "loss": 1.1567, "num_input_tokens_seen": 161861384, "step": 297000 }, { "epoch": 2.9764286857691693, "grad_norm": 4.355959415435791, "learning_rate": 3.511790659516568e-05, "loss": 1.1654, "num_input_tokens_seen": 162134976, "step": 297500 }, { "epoch": 2.9814310869217224, "grad_norm": 8.101115226745605, "learning_rate": 3.509289458940292e-05, "loss": 1.1749, "num_input_tokens_seen": 162408584, "step": 298000 }, { "epoch": 2.9864334880742756, "grad_norm": 6.168905735015869, "learning_rate": 3.506788258364015e-05, "loss": 1.1823, "num_input_tokens_seen": 162688056, "step": 298500 }, { "epoch": 2.991435889226829, "grad_norm": 6.624521255493164, "learning_rate": 3.504287057787739e-05, "loss": 1.1709, "num_input_tokens_seen": 162956872, "step": 299000 }, { "epoch": 2.9964382903793823, "grad_norm": 6.812922954559326, "learning_rate": 3.5017858572114615e-05, "loss": 1.1728, "num_input_tokens_seen": 163231632, "step": 299500 }, { "epoch": 3.0, "eval_loss": 1.1232455968856812, "eval_runtime": 186.7896, "eval_samples_per_second": 1070.215, "eval_steps_per_second": 133.782, "num_input_tokens_seen": 163424808, "step": 299856 }, { "epoch": 3.0014406915319354, "grad_norm": 4.925895690917969, "learning_rate": 3.499284656635185e-05, "loss": 1.1557, "num_input_tokens_seen": 163507488, "step": 300000 }, { "epoch": 3.0064430926844885, "grad_norm": 4.663059234619141, "learning_rate": 3.4967834560589085e-05, "loss": 1.0655, "num_input_tokens_seen": 163782720, "step": 300500 }, { "epoch": 3.0114454938370416, "grad_norm": 7.381974220275879, "learning_rate": 3.494282255482632e-05, "loss": 1.0891, "num_input_tokens_seen": 164062072, "step": 301000 }, { "epoch": 3.016447894989595, "grad_norm": 6.4466094970703125, "learning_rate": 3.491781054906355e-05, "loss": 1.074, "num_input_tokens_seen": 164336176, "step": 301500 }, { "epoch": 3.0214502961421483, "grad_norm": 5.126181602478027, "learning_rate": 3.489279854330078e-05, "loss": 1.0666, "num_input_tokens_seen": 164610040, "step": 302000 }, { "epoch": 3.0264526972947015, "grad_norm": 5.322078227996826, "learning_rate": 3.4867786537538025e-05, "loss": 1.0837, "num_input_tokens_seen": 164887400, "step": 302500 }, { "epoch": 3.0314550984472546, "grad_norm": 5.671963691711426, "learning_rate": 3.484277453177525e-05, "loss": 1.0829, "num_input_tokens_seen": 165162256, "step": 303000 }, { "epoch": 3.0364574995998077, "grad_norm": 4.6445441246032715, "learning_rate": 3.481776252601249e-05, "loss": 1.0771, "num_input_tokens_seen": 165430680, "step": 303500 }, { "epoch": 3.0414599007523613, "grad_norm": 5.632525444030762, "learning_rate": 3.479275052024972e-05, "loss": 1.0893, "num_input_tokens_seen": 165706048, "step": 304000 }, { "epoch": 3.0464623019049144, "grad_norm": 4.770864963531494, "learning_rate": 3.476773851448695e-05, "loss": 1.0772, "num_input_tokens_seen": 165979496, "step": 304500 }, { "epoch": 3.0514647030574675, "grad_norm": 7.340290546417236, "learning_rate": 3.4742726508724186e-05, "loss": 1.0851, "num_input_tokens_seen": 166256376, "step": 305000 }, { "epoch": 3.0564671042100207, "grad_norm": 5.2338080406188965, "learning_rate": 3.471771450296142e-05, "loss": 1.0949, "num_input_tokens_seen": 166529712, "step": 305500 }, { "epoch": 3.0614695053625742, "grad_norm": 5.540538311004639, "learning_rate": 3.4692702497198656e-05, "loss": 1.0929, "num_input_tokens_seen": 166795520, "step": 306000 }, { "epoch": 3.0664719065151274, "grad_norm": 6.512203693389893, "learning_rate": 3.466769049143589e-05, "loss": 1.075, "num_input_tokens_seen": 167066728, "step": 306500 }, { "epoch": 3.0714743076676805, "grad_norm": 5.783512592315674, "learning_rate": 3.4642678485673126e-05, "loss": 1.0749, "num_input_tokens_seen": 167335800, "step": 307000 }, { "epoch": 3.0764767088202336, "grad_norm": 5.550832271575928, "learning_rate": 3.461766647991036e-05, "loss": 1.0886, "num_input_tokens_seen": 167610656, "step": 307500 }, { "epoch": 3.0814791099727867, "grad_norm": 5.394260883331299, "learning_rate": 3.459265447414759e-05, "loss": 1.0906, "num_input_tokens_seen": 167882480, "step": 308000 }, { "epoch": 3.0864815111253403, "grad_norm": 5.690032005310059, "learning_rate": 3.4567642468384824e-05, "loss": 1.0851, "num_input_tokens_seen": 168156832, "step": 308500 }, { "epoch": 3.0914839122778934, "grad_norm": 6.620737552642822, "learning_rate": 3.454263046262206e-05, "loss": 1.0931, "num_input_tokens_seen": 168435520, "step": 309000 }, { "epoch": 3.0964863134304466, "grad_norm": 6.105669021606445, "learning_rate": 3.4517618456859294e-05, "loss": 1.0755, "num_input_tokens_seen": 168708304, "step": 309500 }, { "epoch": 3.1014887145829997, "grad_norm": 6.636053562164307, "learning_rate": 3.449260645109653e-05, "loss": 1.08, "num_input_tokens_seen": 168969008, "step": 310000 }, { "epoch": 3.1064911157355533, "grad_norm": 7.361510753631592, "learning_rate": 3.4467594445333764e-05, "loss": 1.0981, "num_input_tokens_seen": 169241920, "step": 310500 }, { "epoch": 3.1114935168881064, "grad_norm": 4.566134929656982, "learning_rate": 3.4442582439571e-05, "loss": 1.0713, "num_input_tokens_seen": 169520520, "step": 311000 }, { "epoch": 3.1164959180406595, "grad_norm": 5.323643207550049, "learning_rate": 3.441757043380823e-05, "loss": 1.0884, "num_input_tokens_seen": 169794728, "step": 311500 }, { "epoch": 3.1214983191932126, "grad_norm": 5.005212306976318, "learning_rate": 3.439255842804546e-05, "loss": 1.0817, "num_input_tokens_seen": 170074952, "step": 312000 }, { "epoch": 3.1265007203457658, "grad_norm": 6.78676176071167, "learning_rate": 3.43675464222827e-05, "loss": 1.1067, "num_input_tokens_seen": 170350784, "step": 312500 }, { "epoch": 3.1315031214983193, "grad_norm": 5.532153129577637, "learning_rate": 3.434253441651993e-05, "loss": 1.0729, "num_input_tokens_seen": 170621008, "step": 313000 }, { "epoch": 3.1365055226508725, "grad_norm": 6.041494369506836, "learning_rate": 3.431752241075716e-05, "loss": 1.0667, "num_input_tokens_seen": 170896544, "step": 313500 }, { "epoch": 3.1415079238034256, "grad_norm": 5.707986831665039, "learning_rate": 3.42925104049944e-05, "loss": 1.0803, "num_input_tokens_seen": 171167464, "step": 314000 }, { "epoch": 3.1465103249559787, "grad_norm": 6.608933925628662, "learning_rate": 3.426749839923164e-05, "loss": 1.0944, "num_input_tokens_seen": 171437824, "step": 314500 }, { "epoch": 3.1515127261085323, "grad_norm": 4.988198280334473, "learning_rate": 3.4242486393468865e-05, "loss": 1.0928, "num_input_tokens_seen": 171713920, "step": 315000 }, { "epoch": 3.1565151272610854, "grad_norm": 5.763394832611084, "learning_rate": 3.42174743877061e-05, "loss": 1.0655, "num_input_tokens_seen": 171991720, "step": 315500 }, { "epoch": 3.1615175284136385, "grad_norm": 6.287621974945068, "learning_rate": 3.4192462381943335e-05, "loss": 1.1027, "num_input_tokens_seen": 172269040, "step": 316000 }, { "epoch": 3.1665199295661917, "grad_norm": 7.083132266998291, "learning_rate": 3.416745037618057e-05, "loss": 1.097, "num_input_tokens_seen": 172556304, "step": 316500 }, { "epoch": 3.1715223307187452, "grad_norm": 5.856710433959961, "learning_rate": 3.41424383704178e-05, "loss": 1.0848, "num_input_tokens_seen": 172825392, "step": 317000 }, { "epoch": 3.1765247318712984, "grad_norm": 5.9765849113464355, "learning_rate": 3.4117426364655033e-05, "loss": 1.0959, "num_input_tokens_seen": 173095280, "step": 317500 }, { "epoch": 3.1815271330238515, "grad_norm": 7.099453926086426, "learning_rate": 3.409241435889227e-05, "loss": 1.0853, "num_input_tokens_seen": 173365904, "step": 318000 }, { "epoch": 3.1865295341764046, "grad_norm": 6.180022239685059, "learning_rate": 3.40674023531295e-05, "loss": 1.0783, "num_input_tokens_seen": 173632760, "step": 318500 }, { "epoch": 3.1915319353289577, "grad_norm": 5.463505744934082, "learning_rate": 3.404239034736674e-05, "loss": 1.0838, "num_input_tokens_seen": 173903984, "step": 319000 }, { "epoch": 3.1965343364815113, "grad_norm": 5.173684120178223, "learning_rate": 3.401737834160397e-05, "loss": 1.1002, "num_input_tokens_seen": 174179216, "step": 319500 }, { "epoch": 3.2015367376340644, "grad_norm": 7.594663619995117, "learning_rate": 3.399236633584121e-05, "loss": 1.096, "num_input_tokens_seen": 174445040, "step": 320000 }, { "epoch": 3.2065391387866176, "grad_norm": 6.0014262199401855, "learning_rate": 3.3967354330078436e-05, "loss": 1.0874, "num_input_tokens_seen": 174714360, "step": 320500 }, { "epoch": 3.2115415399391707, "grad_norm": 6.118896961212158, "learning_rate": 3.394234232431567e-05, "loss": 1.0911, "num_input_tokens_seen": 174982632, "step": 321000 }, { "epoch": 3.2165439410917243, "grad_norm": 6.8333587646484375, "learning_rate": 3.3917330318552906e-05, "loss": 1.1197, "num_input_tokens_seen": 175255784, "step": 321500 }, { "epoch": 3.2215463422442774, "grad_norm": 4.892594337463379, "learning_rate": 3.389231831279014e-05, "loss": 1.1041, "num_input_tokens_seen": 175526640, "step": 322000 }, { "epoch": 3.2265487433968305, "grad_norm": 5.051529884338379, "learning_rate": 3.3867306307027376e-05, "loss": 1.102, "num_input_tokens_seen": 175794768, "step": 322500 }, { "epoch": 3.2315511445493836, "grad_norm": 5.638453960418701, "learning_rate": 3.384229430126461e-05, "loss": 1.1116, "num_input_tokens_seen": 176065112, "step": 323000 }, { "epoch": 3.2365535457019368, "grad_norm": 7.20506477355957, "learning_rate": 3.381728229550184e-05, "loss": 1.0918, "num_input_tokens_seen": 176336008, "step": 323500 }, { "epoch": 3.2415559468544903, "grad_norm": 7.046761512756348, "learning_rate": 3.3792270289739074e-05, "loss": 1.0734, "num_input_tokens_seen": 176605936, "step": 324000 }, { "epoch": 3.2465583480070435, "grad_norm": 6.106048107147217, "learning_rate": 3.376725828397631e-05, "loss": 1.0953, "num_input_tokens_seen": 176880432, "step": 324500 }, { "epoch": 3.2515607491595966, "grad_norm": 6.578117847442627, "learning_rate": 3.3742246278213544e-05, "loss": 1.0973, "num_input_tokens_seen": 177149064, "step": 325000 }, { "epoch": 3.2565631503121497, "grad_norm": 5.515709400177002, "learning_rate": 3.371723427245078e-05, "loss": 1.1044, "num_input_tokens_seen": 177427928, "step": 325500 }, { "epoch": 3.2615655514647033, "grad_norm": 6.71830940246582, "learning_rate": 3.3692222266688014e-05, "loss": 1.0983, "num_input_tokens_seen": 177700128, "step": 326000 }, { "epoch": 3.2665679526172564, "grad_norm": 6.004988670349121, "learning_rate": 3.366721026092525e-05, "loss": 1.0844, "num_input_tokens_seen": 177979224, "step": 326500 }, { "epoch": 3.2715703537698095, "grad_norm": 6.418676376342773, "learning_rate": 3.364219825516248e-05, "loss": 1.0996, "num_input_tokens_seen": 178254504, "step": 327000 }, { "epoch": 3.2765727549223627, "grad_norm": 6.826735973358154, "learning_rate": 3.361718624939971e-05, "loss": 1.086, "num_input_tokens_seen": 178527880, "step": 327500 }, { "epoch": 3.281575156074916, "grad_norm": 7.035877704620361, "learning_rate": 3.359217424363695e-05, "loss": 1.0919, "num_input_tokens_seen": 178801256, "step": 328000 }, { "epoch": 3.2865775572274694, "grad_norm": 7.336743354797363, "learning_rate": 3.356716223787418e-05, "loss": 1.1121, "num_input_tokens_seen": 179072160, "step": 328500 }, { "epoch": 3.2915799583800225, "grad_norm": 5.2435383796691895, "learning_rate": 3.354215023211141e-05, "loss": 1.0915, "num_input_tokens_seen": 179344440, "step": 329000 }, { "epoch": 3.2965823595325756, "grad_norm": 7.368856906890869, "learning_rate": 3.3517138226348646e-05, "loss": 1.0991, "num_input_tokens_seen": 179619064, "step": 329500 }, { "epoch": 3.3015847606851287, "grad_norm": 6.245655059814453, "learning_rate": 3.349212622058589e-05, "loss": 1.0883, "num_input_tokens_seen": 179888680, "step": 330000 }, { "epoch": 3.3065871618376823, "grad_norm": 6.055501461029053, "learning_rate": 3.3467114214823116e-05, "loss": 1.0854, "num_input_tokens_seen": 180162488, "step": 330500 }, { "epoch": 3.3115895629902354, "grad_norm": 5.36578893661499, "learning_rate": 3.344210220906035e-05, "loss": 1.0829, "num_input_tokens_seen": 180435488, "step": 331000 }, { "epoch": 3.3165919641427886, "grad_norm": 4.865072727203369, "learning_rate": 3.3417090203297585e-05, "loss": 1.0914, "num_input_tokens_seen": 180708088, "step": 331500 }, { "epoch": 3.3215943652953417, "grad_norm": 5.984726428985596, "learning_rate": 3.339207819753482e-05, "loss": 1.0988, "num_input_tokens_seen": 180972032, "step": 332000 }, { "epoch": 3.326596766447895, "grad_norm": 6.17361307144165, "learning_rate": 3.336706619177205e-05, "loss": 1.1045, "num_input_tokens_seen": 181243824, "step": 332500 }, { "epoch": 3.3315991676004484, "grad_norm": 5.614140510559082, "learning_rate": 3.3342054186009284e-05, "loss": 1.1017, "num_input_tokens_seen": 181516200, "step": 333000 }, { "epoch": 3.3366015687530015, "grad_norm": 6.182852268218994, "learning_rate": 3.331704218024652e-05, "loss": 1.104, "num_input_tokens_seen": 181789432, "step": 333500 }, { "epoch": 3.3416039699055546, "grad_norm": 6.281063079833984, "learning_rate": 3.3292030174483754e-05, "loss": 1.1114, "num_input_tokens_seen": 182061728, "step": 334000 }, { "epoch": 3.3466063710581078, "grad_norm": 5.531891822814941, "learning_rate": 3.326701816872099e-05, "loss": 1.0908, "num_input_tokens_seen": 182340072, "step": 334500 }, { "epoch": 3.3516087722106613, "grad_norm": 5.755847930908203, "learning_rate": 3.3242006162958223e-05, "loss": 1.0978, "num_input_tokens_seen": 182603520, "step": 335000 }, { "epoch": 3.3566111733632145, "grad_norm": 5.261629104614258, "learning_rate": 3.321699415719546e-05, "loss": 1.089, "num_input_tokens_seen": 182871456, "step": 335500 }, { "epoch": 3.3616135745157676, "grad_norm": 9.492514610290527, "learning_rate": 3.319198215143269e-05, "loss": 1.0943, "num_input_tokens_seen": 183145224, "step": 336000 }, { "epoch": 3.3666159756683207, "grad_norm": 5.316561222076416, "learning_rate": 3.316697014566992e-05, "loss": 1.1153, "num_input_tokens_seen": 183418328, "step": 336500 }, { "epoch": 3.371618376820874, "grad_norm": 4.869199275970459, "learning_rate": 3.3141958139907157e-05, "loss": 1.0922, "num_input_tokens_seen": 183701552, "step": 337000 }, { "epoch": 3.3766207779734274, "grad_norm": 5.928160667419434, "learning_rate": 3.311694613414439e-05, "loss": 1.1058, "num_input_tokens_seen": 183971600, "step": 337500 }, { "epoch": 3.3816231791259805, "grad_norm": 5.425112724304199, "learning_rate": 3.3091934128381626e-05, "loss": 1.0863, "num_input_tokens_seen": 184239416, "step": 338000 }, { "epoch": 3.3866255802785337, "grad_norm": 5.094555854797363, "learning_rate": 3.306692212261886e-05, "loss": 1.0826, "num_input_tokens_seen": 184503600, "step": 338500 }, { "epoch": 3.391627981431087, "grad_norm": 6.472997665405273, "learning_rate": 3.3041910116856096e-05, "loss": 1.0987, "num_input_tokens_seen": 184777584, "step": 339000 }, { "epoch": 3.3966303825836404, "grad_norm": 5.41008996963501, "learning_rate": 3.3016898111093325e-05, "loss": 1.1025, "num_input_tokens_seen": 185048208, "step": 339500 }, { "epoch": 3.4016327837361935, "grad_norm": 6.235612869262695, "learning_rate": 3.299188610533056e-05, "loss": 1.1097, "num_input_tokens_seen": 185317376, "step": 340000 }, { "epoch": 3.4066351848887466, "grad_norm": 5.876267910003662, "learning_rate": 3.2966874099567795e-05, "loss": 1.111, "num_input_tokens_seen": 185588416, "step": 340500 }, { "epoch": 3.4116375860412997, "grad_norm": 4.517580032348633, "learning_rate": 3.294186209380502e-05, "loss": 1.0877, "num_input_tokens_seen": 185855440, "step": 341000 }, { "epoch": 3.416639987193853, "grad_norm": 7.28811502456665, "learning_rate": 3.2916850088042264e-05, "loss": 1.1021, "num_input_tokens_seen": 186125800, "step": 341500 }, { "epoch": 3.4216423883464064, "grad_norm": 7.394123077392578, "learning_rate": 3.28918380822795e-05, "loss": 1.1103, "num_input_tokens_seen": 186390984, "step": 342000 }, { "epoch": 3.4266447894989596, "grad_norm": 6.393476963043213, "learning_rate": 3.286682607651673e-05, "loss": 1.0946, "num_input_tokens_seen": 186659016, "step": 342500 }, { "epoch": 3.4316471906515127, "grad_norm": 5.5101470947265625, "learning_rate": 3.284181407075396e-05, "loss": 1.1031, "num_input_tokens_seen": 186933800, "step": 343000 }, { "epoch": 3.436649591804066, "grad_norm": 5.820064067840576, "learning_rate": 3.28168020649912e-05, "loss": 1.0992, "num_input_tokens_seen": 187203688, "step": 343500 }, { "epoch": 3.4416519929566194, "grad_norm": 4.500607013702393, "learning_rate": 3.279179005922843e-05, "loss": 1.1083, "num_input_tokens_seen": 187477344, "step": 344000 }, { "epoch": 3.4466543941091725, "grad_norm": 6.536877632141113, "learning_rate": 3.276677805346566e-05, "loss": 1.0988, "num_input_tokens_seen": 187753544, "step": 344500 }, { "epoch": 3.4516567952617256, "grad_norm": 6.723674774169922, "learning_rate": 3.2741766047702896e-05, "loss": 1.0814, "num_input_tokens_seen": 188027648, "step": 345000 }, { "epoch": 3.4566591964142788, "grad_norm": 5.175849437713623, "learning_rate": 3.271675404194014e-05, "loss": 1.1108, "num_input_tokens_seen": 188298976, "step": 345500 }, { "epoch": 3.461661597566832, "grad_norm": 4.006369590759277, "learning_rate": 3.2691742036177366e-05, "loss": 1.102, "num_input_tokens_seen": 188573952, "step": 346000 }, { "epoch": 3.4666639987193855, "grad_norm": 5.444148063659668, "learning_rate": 3.26667300304146e-05, "loss": 1.0963, "num_input_tokens_seen": 188843992, "step": 346500 }, { "epoch": 3.4716663998719386, "grad_norm": 6.093343257904053, "learning_rate": 3.2641718024651836e-05, "loss": 1.1117, "num_input_tokens_seen": 189117128, "step": 347000 }, { "epoch": 3.4766688010244917, "grad_norm": 5.752835750579834, "learning_rate": 3.261670601888907e-05, "loss": 1.0973, "num_input_tokens_seen": 189389104, "step": 347500 }, { "epoch": 3.481671202177045, "grad_norm": 4.975690841674805, "learning_rate": 3.25916940131263e-05, "loss": 1.1074, "num_input_tokens_seen": 189665040, "step": 348000 }, { "epoch": 3.4866736033295984, "grad_norm": 5.228826999664307, "learning_rate": 3.2566682007363534e-05, "loss": 1.0942, "num_input_tokens_seen": 189939112, "step": 348500 }, { "epoch": 3.4916760044821515, "grad_norm": 5.240488052368164, "learning_rate": 3.254167000160077e-05, "loss": 1.1023, "num_input_tokens_seen": 190214888, "step": 349000 }, { "epoch": 3.4966784056347047, "grad_norm": 6.247119903564453, "learning_rate": 3.2516657995838004e-05, "loss": 1.1001, "num_input_tokens_seen": 190486416, "step": 349500 }, { "epoch": 3.501680806787258, "grad_norm": 7.789793968200684, "learning_rate": 3.249164599007524e-05, "loss": 1.1066, "num_input_tokens_seen": 190761576, "step": 350000 }, { "epoch": 3.506683207939811, "grad_norm": 4.448274612426758, "learning_rate": 3.2466633984312474e-05, "loss": 1.1009, "num_input_tokens_seen": 191031280, "step": 350500 }, { "epoch": 3.5116856090923645, "grad_norm": 7.334349632263184, "learning_rate": 3.244162197854971e-05, "loss": 1.1059, "num_input_tokens_seen": 191300640, "step": 351000 }, { "epoch": 3.5166880102449176, "grad_norm": 6.003718852996826, "learning_rate": 3.241660997278694e-05, "loss": 1.097, "num_input_tokens_seen": 191573504, "step": 351500 }, { "epoch": 3.5216904113974707, "grad_norm": 5.930721759796143, "learning_rate": 3.239159796702417e-05, "loss": 1.0897, "num_input_tokens_seen": 191844912, "step": 352000 }, { "epoch": 3.526692812550024, "grad_norm": 4.852160453796387, "learning_rate": 3.236658596126141e-05, "loss": 1.0989, "num_input_tokens_seen": 192115928, "step": 352500 }, { "epoch": 3.5316952137025774, "grad_norm": 5.043008327484131, "learning_rate": 3.234157395549864e-05, "loss": 1.0835, "num_input_tokens_seen": 192381304, "step": 353000 }, { "epoch": 3.5366976148551306, "grad_norm": 5.529479503631592, "learning_rate": 3.231656194973588e-05, "loss": 1.1085, "num_input_tokens_seen": 192651760, "step": 353500 }, { "epoch": 3.5417000160076837, "grad_norm": 6.701032638549805, "learning_rate": 3.229154994397311e-05, "loss": 1.1058, "num_input_tokens_seen": 192924280, "step": 354000 }, { "epoch": 3.546702417160237, "grad_norm": 6.587806224822998, "learning_rate": 3.2266537938210347e-05, "loss": 1.0952, "num_input_tokens_seen": 193194504, "step": 354500 }, { "epoch": 3.55170481831279, "grad_norm": 5.651816368103027, "learning_rate": 3.2241525932447575e-05, "loss": 1.0987, "num_input_tokens_seen": 193457896, "step": 355000 }, { "epoch": 3.5567072194653435, "grad_norm": 4.911685943603516, "learning_rate": 3.221651392668481e-05, "loss": 1.0894, "num_input_tokens_seen": 193726256, "step": 355500 }, { "epoch": 3.5617096206178966, "grad_norm": 5.760750770568848, "learning_rate": 3.2191501920922045e-05, "loss": 1.1061, "num_input_tokens_seen": 194000904, "step": 356000 }, { "epoch": 3.5667120217704498, "grad_norm": 5.3068647384643555, "learning_rate": 3.216648991515927e-05, "loss": 1.0917, "num_input_tokens_seen": 194271728, "step": 356500 }, { "epoch": 3.571714422923003, "grad_norm": 5.526483535766602, "learning_rate": 3.2141477909396515e-05, "loss": 1.0853, "num_input_tokens_seen": 194541832, "step": 357000 }, { "epoch": 3.5767168240755565, "grad_norm": 6.068410396575928, "learning_rate": 3.211646590363375e-05, "loss": 1.1037, "num_input_tokens_seen": 194815496, "step": 357500 }, { "epoch": 3.5817192252281096, "grad_norm": 5.573991775512695, "learning_rate": 3.2091453897870985e-05, "loss": 1.104, "num_input_tokens_seen": 195090856, "step": 358000 }, { "epoch": 3.5867216263806627, "grad_norm": 7.24959135055542, "learning_rate": 3.206644189210821e-05, "loss": 1.1011, "num_input_tokens_seen": 195370496, "step": 358500 }, { "epoch": 3.591724027533216, "grad_norm": 5.9966535568237305, "learning_rate": 3.204142988634545e-05, "loss": 1.1042, "num_input_tokens_seen": 195642920, "step": 359000 }, { "epoch": 3.596726428685769, "grad_norm": 10.24399185180664, "learning_rate": 3.201641788058268e-05, "loss": 1.1068, "num_input_tokens_seen": 195918104, "step": 359500 }, { "epoch": 3.6017288298383225, "grad_norm": 6.0826215744018555, "learning_rate": 3.199140587481991e-05, "loss": 1.0953, "num_input_tokens_seen": 196193816, "step": 360000 }, { "epoch": 3.6067312309908757, "grad_norm": 5.735098838806152, "learning_rate": 3.1966393869057146e-05, "loss": 1.0956, "num_input_tokens_seen": 196461344, "step": 360500 }, { "epoch": 3.611733632143429, "grad_norm": 4.604750156402588, "learning_rate": 3.194138186329438e-05, "loss": 1.0863, "num_input_tokens_seen": 196732248, "step": 361000 }, { "epoch": 3.616736033295982, "grad_norm": 5.826147079467773, "learning_rate": 3.191636985753162e-05, "loss": 1.1043, "num_input_tokens_seen": 197008704, "step": 361500 }, { "epoch": 3.6217384344485355, "grad_norm": 6.071508884429932, "learning_rate": 3.189135785176885e-05, "loss": 1.1086, "num_input_tokens_seen": 197287472, "step": 362000 }, { "epoch": 3.6267408356010886, "grad_norm": 7.109647750854492, "learning_rate": 3.1866345846006086e-05, "loss": 1.1049, "num_input_tokens_seen": 197561376, "step": 362500 }, { "epoch": 3.6317432367536417, "grad_norm": 5.95808219909668, "learning_rate": 3.184133384024332e-05, "loss": 1.1102, "num_input_tokens_seen": 197833112, "step": 363000 }, { "epoch": 3.636745637906195, "grad_norm": 5.6464080810546875, "learning_rate": 3.181632183448055e-05, "loss": 1.1011, "num_input_tokens_seen": 198108448, "step": 363500 }, { "epoch": 3.641748039058748, "grad_norm": 6.354126453399658, "learning_rate": 3.1791309828717784e-05, "loss": 1.1115, "num_input_tokens_seen": 198384984, "step": 364000 }, { "epoch": 3.6467504402113016, "grad_norm": 4.0459394454956055, "learning_rate": 3.176629782295502e-05, "loss": 1.0864, "num_input_tokens_seen": 198658952, "step": 364500 }, { "epoch": 3.6517528413638547, "grad_norm": 5.361639022827148, "learning_rate": 3.1741285817192254e-05, "loss": 1.0827, "num_input_tokens_seen": 198924888, "step": 365000 }, { "epoch": 3.656755242516408, "grad_norm": 5.508306503295898, "learning_rate": 3.171627381142949e-05, "loss": 1.1204, "num_input_tokens_seen": 199201528, "step": 365500 }, { "epoch": 3.661757643668961, "grad_norm": 5.771850109100342, "learning_rate": 3.1691261805666724e-05, "loss": 1.0936, "num_input_tokens_seen": 199477568, "step": 366000 }, { "epoch": 3.6667600448215145, "grad_norm": 5.311666011810303, "learning_rate": 3.166624979990396e-05, "loss": 1.0837, "num_input_tokens_seen": 199742528, "step": 366500 }, { "epoch": 3.6717624459740676, "grad_norm": 6.869203090667725, "learning_rate": 3.164123779414119e-05, "loss": 1.0877, "num_input_tokens_seen": 200016352, "step": 367000 }, { "epoch": 3.6767648471266208, "grad_norm": 5.720645427703857, "learning_rate": 3.161622578837842e-05, "loss": 1.1157, "num_input_tokens_seen": 200288848, "step": 367500 }, { "epoch": 3.681767248279174, "grad_norm": 4.348053455352783, "learning_rate": 3.159121378261566e-05, "loss": 1.1081, "num_input_tokens_seen": 200560176, "step": 368000 }, { "epoch": 3.686769649431727, "grad_norm": 10.115488052368164, "learning_rate": 3.1566201776852885e-05, "loss": 1.0972, "num_input_tokens_seen": 200829688, "step": 368500 }, { "epoch": 3.6917720505842806, "grad_norm": 5.798775672912598, "learning_rate": 3.154118977109013e-05, "loss": 1.0972, "num_input_tokens_seen": 201096760, "step": 369000 }, { "epoch": 3.6967744517368337, "grad_norm": 6.090835094451904, "learning_rate": 3.151617776532736e-05, "loss": 1.0971, "num_input_tokens_seen": 201367440, "step": 369500 }, { "epoch": 3.701776852889387, "grad_norm": 5.695186138153076, "learning_rate": 3.14911657595646e-05, "loss": 1.0839, "num_input_tokens_seen": 201639960, "step": 370000 }, { "epoch": 3.70677925404194, "grad_norm": 7.136424541473389, "learning_rate": 3.1466153753801825e-05, "loss": 1.1157, "num_input_tokens_seen": 201913680, "step": 370500 }, { "epoch": 3.7117816551944935, "grad_norm": 5.564599514007568, "learning_rate": 3.144114174803906e-05, "loss": 1.0987, "num_input_tokens_seen": 202193184, "step": 371000 }, { "epoch": 3.7167840563470467, "grad_norm": 5.429393291473389, "learning_rate": 3.1416129742276295e-05, "loss": 1.0872, "num_input_tokens_seen": 202465104, "step": 371500 }, { "epoch": 3.7217864574996, "grad_norm": 6.241130828857422, "learning_rate": 3.139111773651352e-05, "loss": 1.1101, "num_input_tokens_seen": 202739128, "step": 372000 }, { "epoch": 3.726788858652153, "grad_norm": 5.023561954498291, "learning_rate": 3.136610573075076e-05, "loss": 1.1091, "num_input_tokens_seen": 203013680, "step": 372500 }, { "epoch": 3.731791259804706, "grad_norm": 4.592106342315674, "learning_rate": 3.1341093724988e-05, "loss": 1.105, "num_input_tokens_seen": 203285192, "step": 373000 }, { "epoch": 3.7367936609572596, "grad_norm": 4.939518928527832, "learning_rate": 3.1316081719225235e-05, "loss": 1.1075, "num_input_tokens_seen": 203559176, "step": 373500 }, { "epoch": 3.7417960621098127, "grad_norm": 5.232937812805176, "learning_rate": 3.129106971346246e-05, "loss": 1.1105, "num_input_tokens_seen": 203835152, "step": 374000 }, { "epoch": 3.746798463262366, "grad_norm": 4.963284492492676, "learning_rate": 3.12660577076997e-05, "loss": 1.0907, "num_input_tokens_seen": 204105752, "step": 374500 }, { "epoch": 3.751800864414919, "grad_norm": 5.728975296020508, "learning_rate": 3.124104570193693e-05, "loss": 1.1002, "num_input_tokens_seen": 204373136, "step": 375000 }, { "epoch": 3.7568032655674726, "grad_norm": 6.109611511230469, "learning_rate": 3.121603369617416e-05, "loss": 1.108, "num_input_tokens_seen": 204638480, "step": 375500 }, { "epoch": 3.7618056667200257, "grad_norm": 5.837881088256836, "learning_rate": 3.1191021690411396e-05, "loss": 1.1266, "num_input_tokens_seen": 204909880, "step": 376000 }, { "epoch": 3.766808067872579, "grad_norm": 6.2475666999816895, "learning_rate": 3.116600968464863e-05, "loss": 1.088, "num_input_tokens_seen": 205188256, "step": 376500 }, { "epoch": 3.771810469025132, "grad_norm": 5.80530309677124, "learning_rate": 3.1140997678885866e-05, "loss": 1.0914, "num_input_tokens_seen": 205462952, "step": 377000 }, { "epoch": 3.776812870177685, "grad_norm": 8.078316688537598, "learning_rate": 3.11159856731231e-05, "loss": 1.0968, "num_input_tokens_seen": 205733776, "step": 377500 }, { "epoch": 3.7818152713302386, "grad_norm": 6.782426834106445, "learning_rate": 3.1090973667360336e-05, "loss": 1.0869, "num_input_tokens_seen": 206004512, "step": 378000 }, { "epoch": 3.7868176724827918, "grad_norm": 5.787932395935059, "learning_rate": 3.106596166159757e-05, "loss": 1.1081, "num_input_tokens_seen": 206278760, "step": 378500 }, { "epoch": 3.791820073635345, "grad_norm": 6.141157150268555, "learning_rate": 3.10409496558348e-05, "loss": 1.1042, "num_input_tokens_seen": 206552664, "step": 379000 }, { "epoch": 3.796822474787898, "grad_norm": 5.748921871185303, "learning_rate": 3.1015937650072034e-05, "loss": 1.1058, "num_input_tokens_seen": 206824976, "step": 379500 }, { "epoch": 3.8018248759404516, "grad_norm": 5.540569305419922, "learning_rate": 3.099092564430927e-05, "loss": 1.102, "num_input_tokens_seen": 207100016, "step": 380000 }, { "epoch": 3.8068272770930047, "grad_norm": 6.440171718597412, "learning_rate": 3.0965913638546504e-05, "loss": 1.1179, "num_input_tokens_seen": 207370128, "step": 380500 }, { "epoch": 3.811829678245558, "grad_norm": 4.424386024475098, "learning_rate": 3.094090163278374e-05, "loss": 1.0881, "num_input_tokens_seen": 207637240, "step": 381000 }, { "epoch": 3.816832079398111, "grad_norm": 5.059506416320801, "learning_rate": 3.0915889627020974e-05, "loss": 1.0892, "num_input_tokens_seen": 207914904, "step": 381500 }, { "epoch": 3.821834480550664, "grad_norm": 5.5119805335998535, "learning_rate": 3.089087762125821e-05, "loss": 1.089, "num_input_tokens_seen": 208195200, "step": 382000 }, { "epoch": 3.8268368817032177, "grad_norm": 5.340829372406006, "learning_rate": 3.086586561549544e-05, "loss": 1.0989, "num_input_tokens_seen": 208469512, "step": 382500 }, { "epoch": 3.831839282855771, "grad_norm": 5.793147087097168, "learning_rate": 3.084085360973267e-05, "loss": 1.1051, "num_input_tokens_seen": 208740512, "step": 383000 }, { "epoch": 3.836841684008324, "grad_norm": 4.490692138671875, "learning_rate": 3.081584160396991e-05, "loss": 1.1077, "num_input_tokens_seen": 209018232, "step": 383500 }, { "epoch": 3.8418440851608775, "grad_norm": 6.107596397399902, "learning_rate": 3.0790829598207135e-05, "loss": 1.0995, "num_input_tokens_seen": 209290016, "step": 384000 }, { "epoch": 3.8468464863134306, "grad_norm": 7.825516223907471, "learning_rate": 3.076581759244438e-05, "loss": 1.0869, "num_input_tokens_seen": 209556592, "step": 384500 }, { "epoch": 3.8518488874659838, "grad_norm": 4.849490165710449, "learning_rate": 3.074080558668161e-05, "loss": 1.1221, "num_input_tokens_seen": 209832880, "step": 385000 }, { "epoch": 3.856851288618537, "grad_norm": 6.529792308807373, "learning_rate": 3.071579358091885e-05, "loss": 1.1062, "num_input_tokens_seen": 210114184, "step": 385500 }, { "epoch": 3.86185368977109, "grad_norm": 6.837585926055908, "learning_rate": 3.0690781575156075e-05, "loss": 1.0878, "num_input_tokens_seen": 210380480, "step": 386000 }, { "epoch": 3.866856090923643, "grad_norm": 6.309233665466309, "learning_rate": 3.066576956939331e-05, "loss": 1.1116, "num_input_tokens_seen": 210653688, "step": 386500 }, { "epoch": 3.8718584920761967, "grad_norm": 6.287944316864014, "learning_rate": 3.0640757563630545e-05, "loss": 1.1021, "num_input_tokens_seen": 210927232, "step": 387000 }, { "epoch": 3.87686089322875, "grad_norm": 5.488702774047852, "learning_rate": 3.061574555786777e-05, "loss": 1.1043, "num_input_tokens_seen": 211197296, "step": 387500 }, { "epoch": 3.881863294381303, "grad_norm": 8.246638298034668, "learning_rate": 3.059073355210501e-05, "loss": 1.0917, "num_input_tokens_seen": 211469200, "step": 388000 }, { "epoch": 3.8868656955338565, "grad_norm": 6.3921332359313965, "learning_rate": 3.056572154634224e-05, "loss": 1.1079, "num_input_tokens_seen": 211736248, "step": 388500 }, { "epoch": 3.8918680966864097, "grad_norm": 5.241750717163086, "learning_rate": 3.0540709540579485e-05, "loss": 1.0928, "num_input_tokens_seen": 212005560, "step": 389000 }, { "epoch": 3.8968704978389628, "grad_norm": 5.063024997711182, "learning_rate": 3.0515697534816713e-05, "loss": 1.1103, "num_input_tokens_seen": 212271040, "step": 389500 }, { "epoch": 3.901872898991516, "grad_norm": 5.5935139656066895, "learning_rate": 3.0490685529053948e-05, "loss": 1.1169, "num_input_tokens_seen": 212544560, "step": 390000 }, { "epoch": 3.906875300144069, "grad_norm": 5.44050931930542, "learning_rate": 3.0465673523291183e-05, "loss": 1.0862, "num_input_tokens_seen": 212819160, "step": 390500 }, { "epoch": 3.911877701296622, "grad_norm": 5.747745990753174, "learning_rate": 3.044066151752841e-05, "loss": 1.0898, "num_input_tokens_seen": 213087032, "step": 391000 }, { "epoch": 3.9168801024491757, "grad_norm": 5.6474995613098145, "learning_rate": 3.041564951176565e-05, "loss": 1.1183, "num_input_tokens_seen": 213366232, "step": 391500 }, { "epoch": 3.921882503601729, "grad_norm": 5.1681928634643555, "learning_rate": 3.0390637506002884e-05, "loss": 1.1001, "num_input_tokens_seen": 213633560, "step": 392000 }, { "epoch": 3.926884904754282, "grad_norm": 7.847573280334473, "learning_rate": 3.036562550024012e-05, "loss": 1.0939, "num_input_tokens_seen": 213908816, "step": 392500 }, { "epoch": 3.9318873059068356, "grad_norm": 7.0550713539123535, "learning_rate": 3.0340613494477348e-05, "loss": 1.1101, "num_input_tokens_seen": 214186464, "step": 393000 }, { "epoch": 3.9368897070593887, "grad_norm": 5.558708667755127, "learning_rate": 3.0315601488714586e-05, "loss": 1.1038, "num_input_tokens_seen": 214455448, "step": 393500 }, { "epoch": 3.941892108211942, "grad_norm": 7.920301914215088, "learning_rate": 3.029058948295182e-05, "loss": 1.1085, "num_input_tokens_seen": 214732032, "step": 394000 }, { "epoch": 3.946894509364495, "grad_norm": 6.4054789543151855, "learning_rate": 3.026557747718905e-05, "loss": 1.1035, "num_input_tokens_seen": 215009992, "step": 394500 }, { "epoch": 3.951896910517048, "grad_norm": 5.385251045227051, "learning_rate": 3.0240565471426284e-05, "loss": 1.1004, "num_input_tokens_seen": 215281032, "step": 395000 }, { "epoch": 3.956899311669601, "grad_norm": 6.670193672180176, "learning_rate": 3.0215553465663523e-05, "loss": 1.0895, "num_input_tokens_seen": 215547536, "step": 395500 }, { "epoch": 3.9619017128221548, "grad_norm": 9.283798217773438, "learning_rate": 3.0190541459900757e-05, "loss": 1.0853, "num_input_tokens_seen": 215818168, "step": 396000 }, { "epoch": 3.966904113974708, "grad_norm": 5.494171142578125, "learning_rate": 3.0165529454137986e-05, "loss": 1.1118, "num_input_tokens_seen": 216097808, "step": 396500 }, { "epoch": 3.971906515127261, "grad_norm": 9.865717887878418, "learning_rate": 3.014051744837522e-05, "loss": 1.1092, "num_input_tokens_seen": 216372792, "step": 397000 }, { "epoch": 3.9769089162798146, "grad_norm": 7.068398952484131, "learning_rate": 3.0115505442612456e-05, "loss": 1.0978, "num_input_tokens_seen": 216645048, "step": 397500 }, { "epoch": 3.9819113174323677, "grad_norm": 7.0897626876831055, "learning_rate": 3.0090493436849687e-05, "loss": 1.0978, "num_input_tokens_seen": 216922104, "step": 398000 }, { "epoch": 3.986913718584921, "grad_norm": 6.884424686431885, "learning_rate": 3.0065481431086922e-05, "loss": 1.1057, "num_input_tokens_seen": 217197472, "step": 398500 }, { "epoch": 3.991916119737474, "grad_norm": 8.55648136138916, "learning_rate": 3.0040469425324157e-05, "loss": 1.0986, "num_input_tokens_seen": 217464560, "step": 399000 }, { "epoch": 3.996918520890027, "grad_norm": 6.080700874328613, "learning_rate": 3.0015457419561392e-05, "loss": 1.1001, "num_input_tokens_seen": 217738936, "step": 399500 }, { "epoch": 4.0, "eval_loss": 1.0870640277862549, "eval_runtime": 187.0155, "eval_samples_per_second": 1068.922, "eval_steps_per_second": 133.62, "num_input_tokens_seen": 217911400, "step": 399808 }, { "epoch": 4.00192092204258, "grad_norm": 5.729778289794922, "learning_rate": 2.9990445413798624e-05, "loss": 1.041, "num_input_tokens_seen": 218020144, "step": 400000 }, { "epoch": 4.006923323195133, "grad_norm": 5.8337225914001465, "learning_rate": 2.996543340803586e-05, "loss": 1.0154, "num_input_tokens_seen": 218293352, "step": 400500 }, { "epoch": 4.0119257243476865, "grad_norm": 6.142926216125488, "learning_rate": 2.9940421402273094e-05, "loss": 1.0043, "num_input_tokens_seen": 218559112, "step": 401000 }, { "epoch": 4.0169281255002405, "grad_norm": 4.911243915557861, "learning_rate": 2.9915409396510325e-05, "loss": 1.0061, "num_input_tokens_seen": 218839224, "step": 401500 }, { "epoch": 4.021930526652794, "grad_norm": 5.466070175170898, "learning_rate": 2.989039739074756e-05, "loss": 0.9953, "num_input_tokens_seen": 219111776, "step": 402000 }, { "epoch": 4.026932927805347, "grad_norm": 6.471262454986572, "learning_rate": 2.9865385384984795e-05, "loss": 0.9955, "num_input_tokens_seen": 219383912, "step": 402500 }, { "epoch": 4.0319353289579, "grad_norm": 7.179049491882324, "learning_rate": 2.9840373379222027e-05, "loss": 1.0101, "num_input_tokens_seen": 219647992, "step": 403000 }, { "epoch": 4.036937730110453, "grad_norm": 5.031703948974609, "learning_rate": 2.981536137345926e-05, "loss": 1.0021, "num_input_tokens_seen": 219913768, "step": 403500 }, { "epoch": 4.041940131263006, "grad_norm": 4.3193840980529785, "learning_rate": 2.9790349367696497e-05, "loss": 1.0078, "num_input_tokens_seen": 220190376, "step": 404000 }, { "epoch": 4.046942532415559, "grad_norm": 5.400819778442383, "learning_rate": 2.976533736193373e-05, "loss": 0.9949, "num_input_tokens_seen": 220460224, "step": 404500 }, { "epoch": 4.051944933568112, "grad_norm": 6.279000759124756, "learning_rate": 2.9740325356170963e-05, "loss": 1.0322, "num_input_tokens_seen": 220736472, "step": 405000 }, { "epoch": 4.0569473347206655, "grad_norm": 7.3011627197265625, "learning_rate": 2.9715313350408198e-05, "loss": 1.0112, "num_input_tokens_seen": 221004608, "step": 405500 }, { "epoch": 4.0619497358732195, "grad_norm": 6.007471561431885, "learning_rate": 2.9690301344645433e-05, "loss": 1.0231, "num_input_tokens_seen": 221278264, "step": 406000 }, { "epoch": 4.066952137025773, "grad_norm": 5.404012203216553, "learning_rate": 2.966528933888266e-05, "loss": 1.0205, "num_input_tokens_seen": 221555688, "step": 406500 }, { "epoch": 4.071954538178326, "grad_norm": 4.693950653076172, "learning_rate": 2.9640277333119896e-05, "loss": 1.021, "num_input_tokens_seen": 221832040, "step": 407000 }, { "epoch": 4.076956939330879, "grad_norm": 5.678884029388428, "learning_rate": 2.9615265327357135e-05, "loss": 1.0268, "num_input_tokens_seen": 222112352, "step": 407500 }, { "epoch": 4.081959340483432, "grad_norm": 5.514533042907715, "learning_rate": 2.959025332159437e-05, "loss": 1.0236, "num_input_tokens_seen": 222383544, "step": 408000 }, { "epoch": 4.086961741635985, "grad_norm": 6.353760719299316, "learning_rate": 2.9565241315831598e-05, "loss": 1.0076, "num_input_tokens_seen": 222653528, "step": 408500 }, { "epoch": 4.091964142788538, "grad_norm": 5.7514519691467285, "learning_rate": 2.9540229310068833e-05, "loss": 1.0175, "num_input_tokens_seen": 222929176, "step": 409000 }, { "epoch": 4.096966543941091, "grad_norm": 5.185674667358398, "learning_rate": 2.951521730430607e-05, "loss": 1.023, "num_input_tokens_seen": 223205600, "step": 409500 }, { "epoch": 4.1019689450936445, "grad_norm": 6.269286632537842, "learning_rate": 2.94902052985433e-05, "loss": 1.0177, "num_input_tokens_seen": 223471424, "step": 410000 }, { "epoch": 4.1069713462461985, "grad_norm": 5.551058292388916, "learning_rate": 2.9465193292780534e-05, "loss": 1.014, "num_input_tokens_seen": 223744904, "step": 410500 }, { "epoch": 4.111973747398752, "grad_norm": 7.259944438934326, "learning_rate": 2.944018128701777e-05, "loss": 1.0176, "num_input_tokens_seen": 224017960, "step": 411000 }, { "epoch": 4.116976148551305, "grad_norm": 6.2288498878479, "learning_rate": 2.9415169281255008e-05, "loss": 1.0208, "num_input_tokens_seen": 224288328, "step": 411500 }, { "epoch": 4.121978549703858, "grad_norm": 4.875370502471924, "learning_rate": 2.9390157275492236e-05, "loss": 1.0218, "num_input_tokens_seen": 224564744, "step": 412000 }, { "epoch": 4.126980950856411, "grad_norm": 5.8250603675842285, "learning_rate": 2.936514526972947e-05, "loss": 1.0176, "num_input_tokens_seen": 224833216, "step": 412500 }, { "epoch": 4.131983352008964, "grad_norm": 4.689972877502441, "learning_rate": 2.9340133263966706e-05, "loss": 1.0031, "num_input_tokens_seen": 225109008, "step": 413000 }, { "epoch": 4.136985753161517, "grad_norm": 6.370342254638672, "learning_rate": 2.9315121258203937e-05, "loss": 1.0235, "num_input_tokens_seen": 225386144, "step": 413500 }, { "epoch": 4.14198815431407, "grad_norm": 5.214616298675537, "learning_rate": 2.9290109252441172e-05, "loss": 1.0147, "num_input_tokens_seen": 225665576, "step": 414000 }, { "epoch": 4.146990555466624, "grad_norm": 5.056887626647949, "learning_rate": 2.9265097246678407e-05, "loss": 1.0134, "num_input_tokens_seen": 225936744, "step": 414500 }, { "epoch": 4.1519929566191776, "grad_norm": 7.385371685028076, "learning_rate": 2.9240085240915642e-05, "loss": 1.0133, "num_input_tokens_seen": 226206672, "step": 415000 }, { "epoch": 4.156995357771731, "grad_norm": 6.09354829788208, "learning_rate": 2.9215073235152874e-05, "loss": 1.0257, "num_input_tokens_seen": 226483208, "step": 415500 }, { "epoch": 4.161997758924284, "grad_norm": 6.554540634155273, "learning_rate": 2.919006122939011e-05, "loss": 1.004, "num_input_tokens_seen": 226756440, "step": 416000 }, { "epoch": 4.167000160076837, "grad_norm": 6.016900539398193, "learning_rate": 2.9165049223627344e-05, "loss": 1.0373, "num_input_tokens_seen": 227035824, "step": 416500 }, { "epoch": 4.17200256122939, "grad_norm": 5.212109565734863, "learning_rate": 2.9140037217864575e-05, "loss": 1.0168, "num_input_tokens_seen": 227309792, "step": 417000 }, { "epoch": 4.177004962381943, "grad_norm": 5.641068935394287, "learning_rate": 2.911502521210181e-05, "loss": 1.0184, "num_input_tokens_seen": 227578216, "step": 417500 }, { "epoch": 4.182007363534496, "grad_norm": 4.603857040405273, "learning_rate": 2.9090013206339045e-05, "loss": 1.0092, "num_input_tokens_seen": 227844640, "step": 418000 }, { "epoch": 4.187009764687049, "grad_norm": 7.76889705657959, "learning_rate": 2.906500120057628e-05, "loss": 1.0324, "num_input_tokens_seen": 228114632, "step": 418500 }, { "epoch": 4.1920121658396035, "grad_norm": 5.698912143707275, "learning_rate": 2.9039989194813512e-05, "loss": 1.0159, "num_input_tokens_seen": 228392896, "step": 419000 }, { "epoch": 4.197014566992157, "grad_norm": 4.45599365234375, "learning_rate": 2.9014977189050747e-05, "loss": 1.0243, "num_input_tokens_seen": 228657144, "step": 419500 }, { "epoch": 4.20201696814471, "grad_norm": 4.775566577911377, "learning_rate": 2.8989965183287982e-05, "loss": 1.0249, "num_input_tokens_seen": 228930912, "step": 420000 }, { "epoch": 4.207019369297263, "grad_norm": 4.6044511795043945, "learning_rate": 2.896495317752521e-05, "loss": 1.0171, "num_input_tokens_seen": 229190776, "step": 420500 }, { "epoch": 4.212021770449816, "grad_norm": 5.821028709411621, "learning_rate": 2.8939941171762448e-05, "loss": 1.0197, "num_input_tokens_seen": 229464464, "step": 421000 }, { "epoch": 4.217024171602369, "grad_norm": 6.407191753387451, "learning_rate": 2.8914929165999683e-05, "loss": 1.0193, "num_input_tokens_seen": 229735080, "step": 421500 }, { "epoch": 4.222026572754922, "grad_norm": 6.624352931976318, "learning_rate": 2.8889917160236918e-05, "loss": 1.0388, "num_input_tokens_seen": 230010112, "step": 422000 }, { "epoch": 4.227028973907475, "grad_norm": 5.672749042510986, "learning_rate": 2.8864905154474146e-05, "loss": 1.0094, "num_input_tokens_seen": 230273040, "step": 422500 }, { "epoch": 4.2320313750600285, "grad_norm": 4.765455722808838, "learning_rate": 2.8839893148711385e-05, "loss": 1.038, "num_input_tokens_seen": 230549568, "step": 423000 }, { "epoch": 4.2370337762125825, "grad_norm": 5.471391677856445, "learning_rate": 2.881488114294862e-05, "loss": 1.0137, "num_input_tokens_seen": 230815880, "step": 423500 }, { "epoch": 4.242036177365136, "grad_norm": 5.090280532836914, "learning_rate": 2.8789869137185848e-05, "loss": 1.0254, "num_input_tokens_seen": 231087416, "step": 424000 }, { "epoch": 4.247038578517689, "grad_norm": 5.823254585266113, "learning_rate": 2.8764857131423083e-05, "loss": 1.0369, "num_input_tokens_seen": 231351728, "step": 424500 }, { "epoch": 4.252040979670242, "grad_norm": 5.406543731689453, "learning_rate": 2.873984512566032e-05, "loss": 1.0233, "num_input_tokens_seen": 231626944, "step": 425000 }, { "epoch": 4.257043380822795, "grad_norm": 6.112472057342529, "learning_rate": 2.871483311989755e-05, "loss": 1.046, "num_input_tokens_seen": 231901904, "step": 425500 }, { "epoch": 4.262045781975348, "grad_norm": 5.495764255523682, "learning_rate": 2.8689821114134784e-05, "loss": 1.0127, "num_input_tokens_seen": 232177576, "step": 426000 }, { "epoch": 4.267048183127901, "grad_norm": 5.970737934112549, "learning_rate": 2.866480910837202e-05, "loss": 1.0266, "num_input_tokens_seen": 232448808, "step": 426500 }, { "epoch": 4.272050584280454, "grad_norm": 6.965437889099121, "learning_rate": 2.8639797102609254e-05, "loss": 1.0311, "num_input_tokens_seen": 232724960, "step": 427000 }, { "epoch": 4.2770529854330075, "grad_norm": 6.662547588348389, "learning_rate": 2.8614785096846486e-05, "loss": 1.0216, "num_input_tokens_seen": 232999800, "step": 427500 }, { "epoch": 4.2820553865855615, "grad_norm": 4.90582275390625, "learning_rate": 2.858977309108372e-05, "loss": 1.036, "num_input_tokens_seen": 233278352, "step": 428000 }, { "epoch": 4.287057787738115, "grad_norm": 5.090430736541748, "learning_rate": 2.8564761085320956e-05, "loss": 1.0253, "num_input_tokens_seen": 233542648, "step": 428500 }, { "epoch": 4.292060188890668, "grad_norm": 6.307216167449951, "learning_rate": 2.8539749079558187e-05, "loss": 1.0367, "num_input_tokens_seen": 233821008, "step": 429000 }, { "epoch": 4.297062590043221, "grad_norm": 5.634079933166504, "learning_rate": 2.8514737073795422e-05, "loss": 1.0248, "num_input_tokens_seen": 234099000, "step": 429500 }, { "epoch": 4.302064991195774, "grad_norm": 6.014862060546875, "learning_rate": 2.8489725068032657e-05, "loss": 1.0231, "num_input_tokens_seen": 234375176, "step": 430000 }, { "epoch": 4.307067392348327, "grad_norm": 5.199640274047852, "learning_rate": 2.8464713062269892e-05, "loss": 1.0366, "num_input_tokens_seen": 234650128, "step": 430500 }, { "epoch": 4.31206979350088, "grad_norm": 4.8902692794799805, "learning_rate": 2.8439701056507124e-05, "loss": 1.0394, "num_input_tokens_seen": 234918712, "step": 431000 }, { "epoch": 4.317072194653433, "grad_norm": 4.592429161071777, "learning_rate": 2.841468905074436e-05, "loss": 1.029, "num_input_tokens_seen": 235196600, "step": 431500 }, { "epoch": 4.3220745958059865, "grad_norm": 5.6518144607543945, "learning_rate": 2.8389677044981594e-05, "loss": 1.0217, "num_input_tokens_seen": 235465976, "step": 432000 }, { "epoch": 4.3270769969585405, "grad_norm": 5.183743000030518, "learning_rate": 2.8364665039218825e-05, "loss": 1.0262, "num_input_tokens_seen": 235745992, "step": 432500 }, { "epoch": 4.332079398111094, "grad_norm": 4.891019821166992, "learning_rate": 2.833965303345606e-05, "loss": 1.0363, "num_input_tokens_seen": 236018376, "step": 433000 }, { "epoch": 4.337081799263647, "grad_norm": 4.2536725997924805, "learning_rate": 2.8314641027693295e-05, "loss": 1.022, "num_input_tokens_seen": 236289296, "step": 433500 }, { "epoch": 4.3420842004162, "grad_norm": 6.686141014099121, "learning_rate": 2.828962902193053e-05, "loss": 1.0393, "num_input_tokens_seen": 236575672, "step": 434000 }, { "epoch": 4.347086601568753, "grad_norm": 4.4611945152282715, "learning_rate": 2.8264617016167762e-05, "loss": 1.0419, "num_input_tokens_seen": 236849624, "step": 434500 }, { "epoch": 4.352089002721306, "grad_norm": 4.447482585906982, "learning_rate": 2.8239605010404997e-05, "loss": 1.0337, "num_input_tokens_seen": 237115712, "step": 435000 }, { "epoch": 4.357091403873859, "grad_norm": 5.549137115478516, "learning_rate": 2.8214593004642232e-05, "loss": 1.0161, "num_input_tokens_seen": 237386704, "step": 435500 }, { "epoch": 4.362093805026412, "grad_norm": 6.824407577514648, "learning_rate": 2.818958099887946e-05, "loss": 1.025, "num_input_tokens_seen": 237662672, "step": 436000 }, { "epoch": 4.3670962061789655, "grad_norm": 5.618262767791748, "learning_rate": 2.8164568993116695e-05, "loss": 1.0222, "num_input_tokens_seen": 237934856, "step": 436500 }, { "epoch": 4.37209860733152, "grad_norm": 5.112995624542236, "learning_rate": 2.8139556987353933e-05, "loss": 1.0119, "num_input_tokens_seen": 238206992, "step": 437000 }, { "epoch": 4.377101008484073, "grad_norm": 5.395593166351318, "learning_rate": 2.811454498159117e-05, "loss": 1.0337, "num_input_tokens_seen": 238477792, "step": 437500 }, { "epoch": 4.382103409636626, "grad_norm": 8.581912994384766, "learning_rate": 2.8089532975828397e-05, "loss": 1.0447, "num_input_tokens_seen": 238754960, "step": 438000 }, { "epoch": 4.387105810789179, "grad_norm": 5.694709777832031, "learning_rate": 2.806452097006563e-05, "loss": 1.0404, "num_input_tokens_seen": 239027008, "step": 438500 }, { "epoch": 4.392108211941732, "grad_norm": 6.605731010437012, "learning_rate": 2.803950896430287e-05, "loss": 1.0412, "num_input_tokens_seen": 239292592, "step": 439000 }, { "epoch": 4.397110613094285, "grad_norm": 5.162715911865234, "learning_rate": 2.8014496958540098e-05, "loss": 1.0266, "num_input_tokens_seen": 239564344, "step": 439500 }, { "epoch": 4.402113014246838, "grad_norm": 8.414751052856445, "learning_rate": 2.7989484952777333e-05, "loss": 1.0461, "num_input_tokens_seen": 239839984, "step": 440000 }, { "epoch": 4.4071154153993914, "grad_norm": 5.043530464172363, "learning_rate": 2.7964472947014568e-05, "loss": 1.0312, "num_input_tokens_seen": 240116944, "step": 440500 }, { "epoch": 4.412117816551945, "grad_norm": 6.508191108703613, "learning_rate": 2.7939460941251806e-05, "loss": 1.0355, "num_input_tokens_seen": 240386144, "step": 441000 }, { "epoch": 4.417120217704499, "grad_norm": 4.704832077026367, "learning_rate": 2.7914448935489035e-05, "loss": 1.0252, "num_input_tokens_seen": 240655192, "step": 441500 }, { "epoch": 4.422122618857052, "grad_norm": 6.601123332977295, "learning_rate": 2.788943692972627e-05, "loss": 1.0564, "num_input_tokens_seen": 240931640, "step": 442000 }, { "epoch": 4.427125020009605, "grad_norm": 5.828186988830566, "learning_rate": 2.7864424923963504e-05, "loss": 1.0359, "num_input_tokens_seen": 241199768, "step": 442500 }, { "epoch": 4.432127421162158, "grad_norm": 4.463243007659912, "learning_rate": 2.7839412918200736e-05, "loss": 1.03, "num_input_tokens_seen": 241474320, "step": 443000 }, { "epoch": 4.437129822314711, "grad_norm": 5.028249263763428, "learning_rate": 2.781440091243797e-05, "loss": 1.0403, "num_input_tokens_seen": 241750640, "step": 443500 }, { "epoch": 4.442132223467264, "grad_norm": 4.5420684814453125, "learning_rate": 2.7789388906675206e-05, "loss": 1.0171, "num_input_tokens_seen": 242018848, "step": 444000 }, { "epoch": 4.447134624619817, "grad_norm": 7.803140640258789, "learning_rate": 2.7764376900912438e-05, "loss": 1.0297, "num_input_tokens_seen": 242284168, "step": 444500 }, { "epoch": 4.4521370257723705, "grad_norm": 5.844732761383057, "learning_rate": 2.7739364895149673e-05, "loss": 1.0348, "num_input_tokens_seen": 242553128, "step": 445000 }, { "epoch": 4.457139426924924, "grad_norm": 5.830750942230225, "learning_rate": 2.7714352889386908e-05, "loss": 1.0429, "num_input_tokens_seen": 242827088, "step": 445500 }, { "epoch": 4.462141828077478, "grad_norm": 4.908278942108154, "learning_rate": 2.7689340883624143e-05, "loss": 1.028, "num_input_tokens_seen": 243093120, "step": 446000 }, { "epoch": 4.467144229230031, "grad_norm": 5.725689888000488, "learning_rate": 2.7664328877861374e-05, "loss": 1.023, "num_input_tokens_seen": 243364816, "step": 446500 }, { "epoch": 4.472146630382584, "grad_norm": 5.354498386383057, "learning_rate": 2.763931687209861e-05, "loss": 1.0433, "num_input_tokens_seen": 243641016, "step": 447000 }, { "epoch": 4.477149031535137, "grad_norm": 6.727901458740234, "learning_rate": 2.7614304866335844e-05, "loss": 1.0142, "num_input_tokens_seen": 243912976, "step": 447500 }, { "epoch": 4.48215143268769, "grad_norm": 5.042398452758789, "learning_rate": 2.7589292860573072e-05, "loss": 1.0301, "num_input_tokens_seen": 244181688, "step": 448000 }, { "epoch": 4.487153833840243, "grad_norm": 6.022967338562012, "learning_rate": 2.756428085481031e-05, "loss": 1.0242, "num_input_tokens_seen": 244448704, "step": 448500 }, { "epoch": 4.492156234992796, "grad_norm": 5.077592849731445, "learning_rate": 2.7539268849047546e-05, "loss": 1.0373, "num_input_tokens_seen": 244722392, "step": 449000 }, { "epoch": 4.4971586361453495, "grad_norm": 5.527291774749756, "learning_rate": 2.751425684328478e-05, "loss": 1.0183, "num_input_tokens_seen": 244995952, "step": 449500 }, { "epoch": 4.502161037297903, "grad_norm": 5.025604248046875, "learning_rate": 2.748924483752201e-05, "loss": 1.0164, "num_input_tokens_seen": 245272304, "step": 450000 }, { "epoch": 4.507163438450457, "grad_norm": 5.344061374664307, "learning_rate": 2.7464232831759247e-05, "loss": 1.0373, "num_input_tokens_seen": 245546016, "step": 450500 }, { "epoch": 4.51216583960301, "grad_norm": 4.6710524559021, "learning_rate": 2.7439220825996482e-05, "loss": 1.0194, "num_input_tokens_seen": 245823488, "step": 451000 }, { "epoch": 4.517168240755563, "grad_norm": 5.240355014801025, "learning_rate": 2.741420882023371e-05, "loss": 1.0277, "num_input_tokens_seen": 246097728, "step": 451500 }, { "epoch": 4.522170641908116, "grad_norm": 5.007404327392578, "learning_rate": 2.7389196814470945e-05, "loss": 1.0217, "num_input_tokens_seen": 246368736, "step": 452000 }, { "epoch": 4.527173043060669, "grad_norm": 5.346477508544922, "learning_rate": 2.7364184808708184e-05, "loss": 1.0283, "num_input_tokens_seen": 246637968, "step": 452500 }, { "epoch": 4.532175444213222, "grad_norm": 5.416214466094971, "learning_rate": 2.733917280294542e-05, "loss": 1.0448, "num_input_tokens_seen": 246918648, "step": 453000 }, { "epoch": 4.537177845365775, "grad_norm": 7.101502418518066, "learning_rate": 2.7314160797182647e-05, "loss": 1.0469, "num_input_tokens_seen": 247181624, "step": 453500 }, { "epoch": 4.5421802465183285, "grad_norm": 5.758215427398682, "learning_rate": 2.728914879141988e-05, "loss": 1.0337, "num_input_tokens_seen": 247456808, "step": 454000 }, { "epoch": 4.547182647670882, "grad_norm": 7.215245246887207, "learning_rate": 2.726413678565712e-05, "loss": 1.0265, "num_input_tokens_seen": 247730456, "step": 454500 }, { "epoch": 4.552185048823436, "grad_norm": 5.474388122558594, "learning_rate": 2.7239124779894348e-05, "loss": 1.0319, "num_input_tokens_seen": 248006488, "step": 455000 }, { "epoch": 4.557187449975989, "grad_norm": 4.975455284118652, "learning_rate": 2.7214112774131583e-05, "loss": 1.029, "num_input_tokens_seen": 248283776, "step": 455500 }, { "epoch": 4.562189851128542, "grad_norm": 5.586923599243164, "learning_rate": 2.7189100768368818e-05, "loss": 1.0314, "num_input_tokens_seen": 248564784, "step": 456000 }, { "epoch": 4.567192252281095, "grad_norm": 7.202296257019043, "learning_rate": 2.7164088762606053e-05, "loss": 1.0436, "num_input_tokens_seen": 248837928, "step": 456500 }, { "epoch": 4.572194653433648, "grad_norm": 6.214195728302002, "learning_rate": 2.7139076756843285e-05, "loss": 1.0253, "num_input_tokens_seen": 249109248, "step": 457000 }, { "epoch": 4.577197054586201, "grad_norm": 6.16148567199707, "learning_rate": 2.711406475108052e-05, "loss": 1.0385, "num_input_tokens_seen": 249384792, "step": 457500 }, { "epoch": 4.582199455738754, "grad_norm": 5.71275520324707, "learning_rate": 2.7089052745317755e-05, "loss": 1.0432, "num_input_tokens_seen": 249651928, "step": 458000 }, { "epoch": 4.5872018568913075, "grad_norm": 4.817130088806152, "learning_rate": 2.7064040739554986e-05, "loss": 1.02, "num_input_tokens_seen": 249917104, "step": 458500 }, { "epoch": 4.592204258043861, "grad_norm": 5.333267688751221, "learning_rate": 2.703902873379222e-05, "loss": 1.0214, "num_input_tokens_seen": 250188040, "step": 459000 }, { "epoch": 4.597206659196415, "grad_norm": 5.287978172302246, "learning_rate": 2.7014016728029456e-05, "loss": 1.0401, "num_input_tokens_seen": 250468056, "step": 459500 }, { "epoch": 4.602209060348968, "grad_norm": 4.713915824890137, "learning_rate": 2.698900472226669e-05, "loss": 1.032, "num_input_tokens_seen": 250737232, "step": 460000 }, { "epoch": 4.607211461501521, "grad_norm": 6.2646965980529785, "learning_rate": 2.6963992716503923e-05, "loss": 1.0289, "num_input_tokens_seen": 251009944, "step": 460500 }, { "epoch": 4.612213862654074, "grad_norm": 6.391628742218018, "learning_rate": 2.6938980710741158e-05, "loss": 1.037, "num_input_tokens_seen": 251276984, "step": 461000 }, { "epoch": 4.617216263806627, "grad_norm": 6.245530128479004, "learning_rate": 2.6913968704978393e-05, "loss": 1.0321, "num_input_tokens_seen": 251544248, "step": 461500 }, { "epoch": 4.62221866495918, "grad_norm": 5.505767345428467, "learning_rate": 2.6888956699215624e-05, "loss": 1.023, "num_input_tokens_seen": 251820376, "step": 462000 }, { "epoch": 4.6272210661117334, "grad_norm": 5.286034107208252, "learning_rate": 2.686394469345286e-05, "loss": 1.029, "num_input_tokens_seen": 252089664, "step": 462500 }, { "epoch": 4.632223467264287, "grad_norm": 5.050361156463623, "learning_rate": 2.6838932687690094e-05, "loss": 1.0215, "num_input_tokens_seen": 252354304, "step": 463000 }, { "epoch": 4.63722586841684, "grad_norm": 4.83864164352417, "learning_rate": 2.6813920681927322e-05, "loss": 1.0292, "num_input_tokens_seen": 252625992, "step": 463500 }, { "epoch": 4.642228269569394, "grad_norm": 4.267606735229492, "learning_rate": 2.678890867616456e-05, "loss": 1.0356, "num_input_tokens_seen": 252900240, "step": 464000 }, { "epoch": 4.647230670721947, "grad_norm": 5.304383754730225, "learning_rate": 2.6763896670401796e-05, "loss": 1.0182, "num_input_tokens_seen": 253175392, "step": 464500 }, { "epoch": 4.6522330718745, "grad_norm": 7.107183933258057, "learning_rate": 2.673888466463903e-05, "loss": 1.0311, "num_input_tokens_seen": 253449480, "step": 465000 }, { "epoch": 4.657235473027053, "grad_norm": 7.010105133056641, "learning_rate": 2.671387265887626e-05, "loss": 1.0213, "num_input_tokens_seen": 253717600, "step": 465500 }, { "epoch": 4.662237874179606, "grad_norm": 5.442753791809082, "learning_rate": 2.6688860653113494e-05, "loss": 1.0624, "num_input_tokens_seen": 253996064, "step": 466000 }, { "epoch": 4.667240275332159, "grad_norm": 5.711010932922363, "learning_rate": 2.6663848647350732e-05, "loss": 1.0382, "num_input_tokens_seen": 254277992, "step": 466500 }, { "epoch": 4.6722426764847125, "grad_norm": 5.396849632263184, "learning_rate": 2.663883664158796e-05, "loss": 1.0358, "num_input_tokens_seen": 254544856, "step": 467000 }, { "epoch": 4.677245077637266, "grad_norm": 7.533030033111572, "learning_rate": 2.6613824635825195e-05, "loss": 1.0344, "num_input_tokens_seen": 254815184, "step": 467500 }, { "epoch": 4.682247478789819, "grad_norm": 6.03594446182251, "learning_rate": 2.658881263006243e-05, "loss": 1.0234, "num_input_tokens_seen": 255087048, "step": 468000 }, { "epoch": 4.687249879942373, "grad_norm": 6.070241928100586, "learning_rate": 2.656380062429967e-05, "loss": 1.0226, "num_input_tokens_seen": 255357680, "step": 468500 }, { "epoch": 4.692252281094926, "grad_norm": 6.334639549255371, "learning_rate": 2.6538788618536897e-05, "loss": 1.0262, "num_input_tokens_seen": 255624976, "step": 469000 }, { "epoch": 4.697254682247479, "grad_norm": 6.096264839172363, "learning_rate": 2.6513776612774132e-05, "loss": 1.0208, "num_input_tokens_seen": 255898192, "step": 469500 }, { "epoch": 4.702257083400032, "grad_norm": 6.717766761779785, "learning_rate": 2.6488764607011367e-05, "loss": 1.0395, "num_input_tokens_seen": 256174976, "step": 470000 }, { "epoch": 4.707259484552585, "grad_norm": 5.3120527267456055, "learning_rate": 2.64637526012486e-05, "loss": 1.0346, "num_input_tokens_seen": 256445504, "step": 470500 }, { "epoch": 4.712261885705138, "grad_norm": 5.765807151794434, "learning_rate": 2.6438740595485833e-05, "loss": 1.0221, "num_input_tokens_seen": 256728192, "step": 471000 }, { "epoch": 4.7172642868576915, "grad_norm": 5.730865478515625, "learning_rate": 2.6413728589723068e-05, "loss": 1.0305, "num_input_tokens_seen": 256998424, "step": 471500 }, { "epoch": 4.722266688010245, "grad_norm": 7.514993190765381, "learning_rate": 2.6388716583960303e-05, "loss": 1.0316, "num_input_tokens_seen": 257266816, "step": 472000 }, { "epoch": 4.727269089162798, "grad_norm": 4.826568603515625, "learning_rate": 2.6363704578197535e-05, "loss": 1.0336, "num_input_tokens_seen": 257528416, "step": 472500 }, { "epoch": 4.732271490315352, "grad_norm": 5.88137674331665, "learning_rate": 2.633869257243477e-05, "loss": 1.0379, "num_input_tokens_seen": 257803064, "step": 473000 }, { "epoch": 4.737273891467905, "grad_norm": 5.539977073669434, "learning_rate": 2.6313680566672005e-05, "loss": 1.0487, "num_input_tokens_seen": 258074752, "step": 473500 }, { "epoch": 4.742276292620458, "grad_norm": 4.8047871589660645, "learning_rate": 2.6288668560909236e-05, "loss": 1.0437, "num_input_tokens_seen": 258356488, "step": 474000 }, { "epoch": 4.747278693773011, "grad_norm": 5.240783214569092, "learning_rate": 2.626365655514647e-05, "loss": 1.0349, "num_input_tokens_seen": 258631128, "step": 474500 }, { "epoch": 4.752281094925564, "grad_norm": 5.152280807495117, "learning_rate": 2.6238644549383706e-05, "loss": 1.0395, "num_input_tokens_seen": 258902464, "step": 475000 }, { "epoch": 4.757283496078117, "grad_norm": 4.534987449645996, "learning_rate": 2.621363254362094e-05, "loss": 1.0417, "num_input_tokens_seen": 259167448, "step": 475500 }, { "epoch": 4.7622858972306705, "grad_norm": 5.842191219329834, "learning_rate": 2.6188620537858173e-05, "loss": 1.0251, "num_input_tokens_seen": 259437192, "step": 476000 }, { "epoch": 4.767288298383224, "grad_norm": 6.532055377960205, "learning_rate": 2.6163608532095408e-05, "loss": 1.0235, "num_input_tokens_seen": 259710112, "step": 476500 }, { "epoch": 4.772290699535777, "grad_norm": 4.665198802947998, "learning_rate": 2.6138596526332643e-05, "loss": 1.0329, "num_input_tokens_seen": 259983048, "step": 477000 }, { "epoch": 4.777293100688331, "grad_norm": 5.536545276641846, "learning_rate": 2.611358452056987e-05, "loss": 1.0144, "num_input_tokens_seen": 260257608, "step": 477500 }, { "epoch": 4.782295501840884, "grad_norm": 7.246273994445801, "learning_rate": 2.608857251480711e-05, "loss": 1.0353, "num_input_tokens_seen": 260533312, "step": 478000 }, { "epoch": 4.787297902993437, "grad_norm": 5.359396934509277, "learning_rate": 2.6063560509044344e-05, "loss": 1.0359, "num_input_tokens_seen": 260808264, "step": 478500 }, { "epoch": 4.79230030414599, "grad_norm": 5.461490154266357, "learning_rate": 2.603854850328158e-05, "loss": 1.0342, "num_input_tokens_seen": 261080680, "step": 479000 }, { "epoch": 4.797302705298543, "grad_norm": 6.074306488037109, "learning_rate": 2.6013536497518807e-05, "loss": 1.0386, "num_input_tokens_seen": 261352616, "step": 479500 }, { "epoch": 4.802305106451096, "grad_norm": 4.465676307678223, "learning_rate": 2.5988524491756046e-05, "loss": 1.0287, "num_input_tokens_seen": 261635744, "step": 480000 }, { "epoch": 4.8073075076036496, "grad_norm": 5.1833953857421875, "learning_rate": 2.596351248599328e-05, "loss": 1.0237, "num_input_tokens_seen": 261911376, "step": 480500 }, { "epoch": 4.812309908756203, "grad_norm": 7.636727809906006, "learning_rate": 2.593850048023051e-05, "loss": 1.0376, "num_input_tokens_seen": 262186256, "step": 481000 }, { "epoch": 4.817312309908756, "grad_norm": 5.770178318023682, "learning_rate": 2.5913488474467744e-05, "loss": 1.0221, "num_input_tokens_seen": 262461448, "step": 481500 }, { "epoch": 4.82231471106131, "grad_norm": 7.173573970794678, "learning_rate": 2.5888476468704982e-05, "loss": 1.0206, "num_input_tokens_seen": 262734672, "step": 482000 }, { "epoch": 4.827317112213863, "grad_norm": 5.029593467712402, "learning_rate": 2.5863464462942217e-05, "loss": 1.0507, "num_input_tokens_seen": 263009408, "step": 482500 }, { "epoch": 4.832319513366416, "grad_norm": 6.359258651733398, "learning_rate": 2.5838452457179445e-05, "loss": 1.0275, "num_input_tokens_seen": 263280584, "step": 483000 }, { "epoch": 4.837321914518969, "grad_norm": 5.677992820739746, "learning_rate": 2.581344045141668e-05, "loss": 1.0195, "num_input_tokens_seen": 263545768, "step": 483500 }, { "epoch": 4.842324315671522, "grad_norm": 4.935763835906982, "learning_rate": 2.578842844565392e-05, "loss": 1.0311, "num_input_tokens_seen": 263822840, "step": 484000 }, { "epoch": 4.8473267168240755, "grad_norm": 5.072977542877197, "learning_rate": 2.5763416439891147e-05, "loss": 1.0246, "num_input_tokens_seen": 264100088, "step": 484500 }, { "epoch": 4.852329117976629, "grad_norm": 6.382875442504883, "learning_rate": 2.5738404434128382e-05, "loss": 1.0211, "num_input_tokens_seen": 264373424, "step": 485000 }, { "epoch": 4.857331519129182, "grad_norm": 5.98667049407959, "learning_rate": 2.5713392428365617e-05, "loss": 1.0457, "num_input_tokens_seen": 264652616, "step": 485500 }, { "epoch": 4.862333920281735, "grad_norm": 5.859986782073975, "learning_rate": 2.568838042260285e-05, "loss": 1.0291, "num_input_tokens_seen": 264919240, "step": 486000 }, { "epoch": 4.867336321434289, "grad_norm": 5.3083271980285645, "learning_rate": 2.5663368416840083e-05, "loss": 1.024, "num_input_tokens_seen": 265186880, "step": 486500 }, { "epoch": 4.872338722586842, "grad_norm": 5.720509052276611, "learning_rate": 2.563835641107732e-05, "loss": 1.0329, "num_input_tokens_seen": 265456584, "step": 487000 }, { "epoch": 4.877341123739395, "grad_norm": 5.909444332122803, "learning_rate": 2.5613344405314553e-05, "loss": 1.0205, "num_input_tokens_seen": 265720768, "step": 487500 }, { "epoch": 4.882343524891948, "grad_norm": 4.779830455780029, "learning_rate": 2.5588332399551785e-05, "loss": 1.0226, "num_input_tokens_seen": 265991224, "step": 488000 }, { "epoch": 4.887345926044501, "grad_norm": 5.503864765167236, "learning_rate": 2.556332039378902e-05, "loss": 1.0299, "num_input_tokens_seen": 266260752, "step": 488500 }, { "epoch": 4.8923483271970545, "grad_norm": 6.2289581298828125, "learning_rate": 2.5538308388026255e-05, "loss": 1.0165, "num_input_tokens_seen": 266538032, "step": 489000 }, { "epoch": 4.897350728349608, "grad_norm": 6.163370132446289, "learning_rate": 2.5513296382263486e-05, "loss": 1.0273, "num_input_tokens_seen": 266812688, "step": 489500 }, { "epoch": 4.902353129502161, "grad_norm": 5.308876991271973, "learning_rate": 2.548828437650072e-05, "loss": 1.0302, "num_input_tokens_seen": 267083632, "step": 490000 }, { "epoch": 4.907355530654714, "grad_norm": 6.824766635894775, "learning_rate": 2.5463272370737956e-05, "loss": 1.0287, "num_input_tokens_seen": 267354032, "step": 490500 }, { "epoch": 4.912357931807268, "grad_norm": 5.9447102546691895, "learning_rate": 2.543826036497519e-05, "loss": 1.0552, "num_input_tokens_seen": 267626432, "step": 491000 }, { "epoch": 4.917360332959821, "grad_norm": 5.845020771026611, "learning_rate": 2.5413248359212423e-05, "loss": 1.0136, "num_input_tokens_seen": 267900520, "step": 491500 }, { "epoch": 4.922362734112374, "grad_norm": 5.4116082191467285, "learning_rate": 2.5388236353449658e-05, "loss": 1.0163, "num_input_tokens_seen": 268165832, "step": 492000 }, { "epoch": 4.927365135264927, "grad_norm": 7.0753326416015625, "learning_rate": 2.5363224347686893e-05, "loss": 1.0393, "num_input_tokens_seen": 268441848, "step": 492500 }, { "epoch": 4.93236753641748, "grad_norm": 7.350298881530762, "learning_rate": 2.533821234192412e-05, "loss": 1.0463, "num_input_tokens_seen": 268716696, "step": 493000 }, { "epoch": 4.9373699375700335, "grad_norm": 5.284552574157715, "learning_rate": 2.531320033616136e-05, "loss": 1.0311, "num_input_tokens_seen": 268988968, "step": 493500 }, { "epoch": 4.942372338722587, "grad_norm": 6.068382740020752, "learning_rate": 2.5288188330398594e-05, "loss": 1.0379, "num_input_tokens_seen": 269264776, "step": 494000 }, { "epoch": 4.94737473987514, "grad_norm": 6.806668281555176, "learning_rate": 2.526317632463583e-05, "loss": 1.0403, "num_input_tokens_seen": 269535320, "step": 494500 }, { "epoch": 4.952377141027693, "grad_norm": 5.127531051635742, "learning_rate": 2.5238164318873058e-05, "loss": 1.0269, "num_input_tokens_seen": 269805928, "step": 495000 }, { "epoch": 4.957379542180247, "grad_norm": 4.772179126739502, "learning_rate": 2.5213152313110293e-05, "loss": 1.0251, "num_input_tokens_seen": 270085064, "step": 495500 }, { "epoch": 4.9623819433328, "grad_norm": 7.208611011505127, "learning_rate": 2.518814030734753e-05, "loss": 1.0134, "num_input_tokens_seen": 270359424, "step": 496000 }, { "epoch": 4.967384344485353, "grad_norm": 5.730184555053711, "learning_rate": 2.516312830158476e-05, "loss": 1.0226, "num_input_tokens_seen": 270634264, "step": 496500 }, { "epoch": 4.972386745637906, "grad_norm": 5.047354698181152, "learning_rate": 2.5138116295821994e-05, "loss": 1.034, "num_input_tokens_seen": 270902440, "step": 497000 }, { "epoch": 4.977389146790459, "grad_norm": 4.870574951171875, "learning_rate": 2.511310429005923e-05, "loss": 1.0434, "num_input_tokens_seen": 271179104, "step": 497500 }, { "epoch": 4.9823915479430125, "grad_norm": 5.616664409637451, "learning_rate": 2.5088092284296467e-05, "loss": 1.0236, "num_input_tokens_seen": 271451512, "step": 498000 }, { "epoch": 4.987393949095566, "grad_norm": 4.676699638366699, "learning_rate": 2.5063080278533696e-05, "loss": 1.0464, "num_input_tokens_seen": 271724472, "step": 498500 }, { "epoch": 4.992396350248119, "grad_norm": 5.634840965270996, "learning_rate": 2.503806827277093e-05, "loss": 1.0291, "num_input_tokens_seen": 271995512, "step": 499000 }, { "epoch": 4.997398751400672, "grad_norm": 6.081726551055908, "learning_rate": 2.5013056267008166e-05, "loss": 1.0243, "num_input_tokens_seen": 272263560, "step": 499500 }, { "epoch": 5.0, "eval_loss": 1.0584163665771484, "eval_runtime": 192.7527, "eval_samples_per_second": 1037.106, "eval_steps_per_second": 129.643, "num_input_tokens_seen": 272407288, "step": 499760 }, { "epoch": 5.0, "num_input_tokens_seen": 272407288, "step": 499760, "total_flos": 7.214188795055309e+16, "train_loss": 0.0, "train_runtime": 0.0544, "train_samples_per_second": 73478382.327, "train_steps_per_second": 9184797.791, "train_tokens_per_second": 5003948262.574 } ], "logging_steps": 500, "max_steps": 499760, "num_input_tokens_seen": 272407288, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.214188795055309e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }