{
"best_global_step": 499760,
"best_metric": 1.0584163665771484,
"best_model_checkpoint": "/media/user/Expansion1/opus-mt-en-zhtw-google-translate3/checkpoint-499760",
"epoch": 5.0,
"eval_steps": 500,
"global_step": 499760,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005002401152553225,
"grad_norm": 9.338144302368164,
"learning_rate": 4.997503801824876e-05,
"loss": 2.3751,
"num_input_tokens_seen": 281752,
"step": 500
},
{
"epoch": 0.01000480230510645,
"grad_norm": 10.810979843139648,
"learning_rate": 4.9950026012485996e-05,
"loss": 2.2026,
"num_input_tokens_seen": 552352,
"step": 1000
},
{
"epoch": 0.015007203457659676,
"grad_norm": 9.087040901184082,
"learning_rate": 4.992501400672323e-05,
"loss": 2.1448,
"num_input_tokens_seen": 823176,
"step": 1500
},
{
"epoch": 0.0200096046102129,
"grad_norm": 8.617157936096191,
"learning_rate": 4.990000200096046e-05,
"loss": 2.0951,
"num_input_tokens_seen": 1102760,
"step": 2000
},
{
"epoch": 0.02501200576276613,
"grad_norm": 7.297477722167969,
"learning_rate": 4.9874989995197695e-05,
"loss": 2.0308,
"num_input_tokens_seen": 1378560,
"step": 2500
},
{
"epoch": 0.030014406915319352,
"grad_norm": 8.311019897460938,
"learning_rate": 4.984997798943493e-05,
"loss": 2.0307,
"num_input_tokens_seen": 1653488,
"step": 3000
},
{
"epoch": 0.03501680806787258,
"grad_norm": 8.287640571594238,
"learning_rate": 4.9824965983672164e-05,
"loss": 1.9855,
"num_input_tokens_seen": 1923368,
"step": 3500
},
{
"epoch": 0.0400192092204258,
"grad_norm": 9.764960289001465,
"learning_rate": 4.97999539779094e-05,
"loss": 1.9682,
"num_input_tokens_seen": 2199816,
"step": 4000
},
{
"epoch": 0.04502161037297903,
"grad_norm": 7.692084312438965,
"learning_rate": 4.9774941972146634e-05,
"loss": 1.9222,
"num_input_tokens_seen": 2467792,
"step": 4500
},
{
"epoch": 0.05002401152553226,
"grad_norm": 7.139247417449951,
"learning_rate": 4.974992996638387e-05,
"loss": 1.9266,
"num_input_tokens_seen": 2731792,
"step": 5000
},
{
"epoch": 0.05502641267808548,
"grad_norm": 7.6170244216918945,
"learning_rate": 4.97249179606211e-05,
"loss": 1.9024,
"num_input_tokens_seen": 3004120,
"step": 5500
},
{
"epoch": 0.060028813830638704,
"grad_norm": 12.332016944885254,
"learning_rate": 4.969990595485833e-05,
"loss": 1.8823,
"num_input_tokens_seen": 3284568,
"step": 6000
},
{
"epoch": 0.06503121498319193,
"grad_norm": 7.665128231048584,
"learning_rate": 4.967489394909557e-05,
"loss": 1.8642,
"num_input_tokens_seen": 3554600,
"step": 6500
},
{
"epoch": 0.07003361613574516,
"grad_norm": 10.934691429138184,
"learning_rate": 4.96498819433328e-05,
"loss": 1.8556,
"num_input_tokens_seen": 3824936,
"step": 7000
},
{
"epoch": 0.07503601728829838,
"grad_norm": 7.880730152130127,
"learning_rate": 4.962486993757004e-05,
"loss": 1.8606,
"num_input_tokens_seen": 4099080,
"step": 7500
},
{
"epoch": 0.0800384184408516,
"grad_norm": 7.548530578613281,
"learning_rate": 4.959985793180727e-05,
"loss": 1.83,
"num_input_tokens_seen": 4366808,
"step": 8000
},
{
"epoch": 0.08504081959340483,
"grad_norm": 7.900990009307861,
"learning_rate": 4.957484592604451e-05,
"loss": 1.8031,
"num_input_tokens_seen": 4638816,
"step": 8500
},
{
"epoch": 0.09004322074595807,
"grad_norm": 8.125676155090332,
"learning_rate": 4.9549833920281736e-05,
"loss": 1.8455,
"num_input_tokens_seen": 4915000,
"step": 9000
},
{
"epoch": 0.09504562189851129,
"grad_norm": 7.727709770202637,
"learning_rate": 4.952482191451897e-05,
"loss": 1.8024,
"num_input_tokens_seen": 5188672,
"step": 9500
},
{
"epoch": 0.10004802305106451,
"grad_norm": 5.897092342376709,
"learning_rate": 4.9499809908756206e-05,
"loss": 1.7928,
"num_input_tokens_seen": 5468592,
"step": 10000
},
{
"epoch": 0.10505042420361774,
"grad_norm": 11.170528411865234,
"learning_rate": 4.947479790299344e-05,
"loss": 1.7868,
"num_input_tokens_seen": 5733256,
"step": 10500
},
{
"epoch": 0.11005282535617096,
"grad_norm": 8.682831764221191,
"learning_rate": 4.944978589723067e-05,
"loss": 1.7878,
"num_input_tokens_seen": 6008088,
"step": 11000
},
{
"epoch": 0.11505522650872418,
"grad_norm": 7.914422988891602,
"learning_rate": 4.942477389146791e-05,
"loss": 1.7355,
"num_input_tokens_seen": 6274960,
"step": 11500
},
{
"epoch": 0.12005762766127741,
"grad_norm": 8.685178756713867,
"learning_rate": 4.9399761885705145e-05,
"loss": 1.7744,
"num_input_tokens_seen": 6554288,
"step": 12000
},
{
"epoch": 0.12506002881383063,
"grad_norm": 7.942957401275635,
"learning_rate": 4.9374749879942374e-05,
"loss": 1.7293,
"num_input_tokens_seen": 6832880,
"step": 12500
},
{
"epoch": 0.13006242996638387,
"grad_norm": 6.650600910186768,
"learning_rate": 4.934973787417961e-05,
"loss": 1.7523,
"num_input_tokens_seen": 7107168,
"step": 13000
},
{
"epoch": 0.13506483111893708,
"grad_norm": 7.683079242706299,
"learning_rate": 4.9324725868416844e-05,
"loss": 1.7432,
"num_input_tokens_seen": 7377472,
"step": 13500
},
{
"epoch": 0.14006723227149032,
"grad_norm": 8.168213844299316,
"learning_rate": 4.929971386265408e-05,
"loss": 1.745,
"num_input_tokens_seen": 7653272,
"step": 14000
},
{
"epoch": 0.14506963342404355,
"grad_norm": 8.087789535522461,
"learning_rate": 4.927470185689131e-05,
"loss": 1.7179,
"num_input_tokens_seen": 7926040,
"step": 14500
},
{
"epoch": 0.15007203457659676,
"grad_norm": 8.388677597045898,
"learning_rate": 4.924968985112854e-05,
"loss": 1.7219,
"num_input_tokens_seen": 8195288,
"step": 15000
},
{
"epoch": 0.15507443572915,
"grad_norm": 8.354930877685547,
"learning_rate": 4.922467784536578e-05,
"loss": 1.7376,
"num_input_tokens_seen": 8469792,
"step": 15500
},
{
"epoch": 0.1600768368817032,
"grad_norm": 8.638579368591309,
"learning_rate": 4.919966583960301e-05,
"loss": 1.701,
"num_input_tokens_seen": 8735800,
"step": 16000
},
{
"epoch": 0.16507923803425645,
"grad_norm": 6.771655559539795,
"learning_rate": 4.9174653833840247e-05,
"loss": 1.719,
"num_input_tokens_seen": 9005256,
"step": 16500
},
{
"epoch": 0.17008163918680966,
"grad_norm": 12.017413139343262,
"learning_rate": 4.914964182807748e-05,
"loss": 1.7029,
"num_input_tokens_seen": 9279816,
"step": 17000
},
{
"epoch": 0.1750840403393629,
"grad_norm": 7.177635669708252,
"learning_rate": 4.9124629822314716e-05,
"loss": 1.6925,
"num_input_tokens_seen": 9548800,
"step": 17500
},
{
"epoch": 0.18008644149191613,
"grad_norm": 6.606298446655273,
"learning_rate": 4.9099617816551945e-05,
"loss": 1.6957,
"num_input_tokens_seen": 9826416,
"step": 18000
},
{
"epoch": 0.18508884264446934,
"grad_norm": 6.026829242706299,
"learning_rate": 4.907460581078918e-05,
"loss": 1.7022,
"num_input_tokens_seen": 10095920,
"step": 18500
},
{
"epoch": 0.19009124379702258,
"grad_norm": 8.743913650512695,
"learning_rate": 4.9049593805026415e-05,
"loss": 1.6904,
"num_input_tokens_seen": 10371760,
"step": 19000
},
{
"epoch": 0.1950936449495758,
"grad_norm": 9.37678050994873,
"learning_rate": 4.902458179926365e-05,
"loss": 1.6617,
"num_input_tokens_seen": 10639680,
"step": 19500
},
{
"epoch": 0.20009604610212903,
"grad_norm": 7.834632396697998,
"learning_rate": 4.8999569793500885e-05,
"loss": 1.6631,
"num_input_tokens_seen": 10910752,
"step": 20000
},
{
"epoch": 0.20509844725468224,
"grad_norm": 7.523416519165039,
"learning_rate": 4.897455778773812e-05,
"loss": 1.6526,
"num_input_tokens_seen": 11174272,
"step": 20500
},
{
"epoch": 0.21010084840723547,
"grad_norm": 7.593217372894287,
"learning_rate": 4.894954578197535e-05,
"loss": 1.6618,
"num_input_tokens_seen": 11447056,
"step": 21000
},
{
"epoch": 0.2151032495597887,
"grad_norm": 7.984575271606445,
"learning_rate": 4.892453377621258e-05,
"loss": 1.6668,
"num_input_tokens_seen": 11715856,
"step": 21500
},
{
"epoch": 0.22010565071234192,
"grad_norm": 7.122634410858154,
"learning_rate": 4.889952177044982e-05,
"loss": 1.6697,
"num_input_tokens_seen": 11994600,
"step": 22000
},
{
"epoch": 0.22510805186489516,
"grad_norm": 6.745737552642822,
"learning_rate": 4.887450976468705e-05,
"loss": 1.6471,
"num_input_tokens_seen": 12264272,
"step": 22500
},
{
"epoch": 0.23011045301744837,
"grad_norm": 6.742521286010742,
"learning_rate": 4.884949775892428e-05,
"loss": 1.6588,
"num_input_tokens_seen": 12538760,
"step": 23000
},
{
"epoch": 0.2351128541700016,
"grad_norm": 8.37658977508545,
"learning_rate": 4.882448575316152e-05,
"loss": 1.667,
"num_input_tokens_seen": 12814464,
"step": 23500
},
{
"epoch": 0.24011525532255482,
"grad_norm": 7.458651065826416,
"learning_rate": 4.879947374739876e-05,
"loss": 1.6459,
"num_input_tokens_seen": 13083456,
"step": 24000
},
{
"epoch": 0.24511765647510805,
"grad_norm": 10.364368438720703,
"learning_rate": 4.8774461741635986e-05,
"loss": 1.651,
"num_input_tokens_seen": 13361568,
"step": 24500
},
{
"epoch": 0.25012005762766126,
"grad_norm": 6.404083251953125,
"learning_rate": 4.874944973587322e-05,
"loss": 1.6271,
"num_input_tokens_seen": 13638952,
"step": 25000
},
{
"epoch": 0.2551224587802145,
"grad_norm": 7.239497184753418,
"learning_rate": 4.8724437730110456e-05,
"loss": 1.6325,
"num_input_tokens_seen": 13909896,
"step": 25500
},
{
"epoch": 0.26012485993276774,
"grad_norm": 9.5720796585083,
"learning_rate": 4.869942572434769e-05,
"loss": 1.6227,
"num_input_tokens_seen": 14182688,
"step": 26000
},
{
"epoch": 0.265127261085321,
"grad_norm": 7.255125045776367,
"learning_rate": 4.867441371858492e-05,
"loss": 1.6328,
"num_input_tokens_seen": 14455888,
"step": 26500
},
{
"epoch": 0.27012966223787416,
"grad_norm": 7.990500450134277,
"learning_rate": 4.8649401712822154e-05,
"loss": 1.6315,
"num_input_tokens_seen": 14726048,
"step": 27000
},
{
"epoch": 0.2751320633904274,
"grad_norm": 7.787556171417236,
"learning_rate": 4.8624389707059396e-05,
"loss": 1.6357,
"num_input_tokens_seen": 15005208,
"step": 27500
},
{
"epoch": 0.28013446454298063,
"grad_norm": 6.046635627746582,
"learning_rate": 4.8599377701296624e-05,
"loss": 1.6173,
"num_input_tokens_seen": 15272768,
"step": 28000
},
{
"epoch": 0.28513686569553387,
"grad_norm": 6.547093391418457,
"learning_rate": 4.857436569553386e-05,
"loss": 1.6067,
"num_input_tokens_seen": 15539248,
"step": 28500
},
{
"epoch": 0.2901392668480871,
"grad_norm": 8.07209587097168,
"learning_rate": 4.8549353689771094e-05,
"loss": 1.6018,
"num_input_tokens_seen": 15810072,
"step": 29000
},
{
"epoch": 0.2951416680006403,
"grad_norm": 7.229666709899902,
"learning_rate": 4.852434168400833e-05,
"loss": 1.6238,
"num_input_tokens_seen": 16082968,
"step": 29500
},
{
"epoch": 0.3001440691531935,
"grad_norm": 6.863572597503662,
"learning_rate": 4.849932967824556e-05,
"loss": 1.6104,
"num_input_tokens_seen": 16354784,
"step": 30000
},
{
"epoch": 0.30514647030574676,
"grad_norm": 8.546486854553223,
"learning_rate": 4.847431767248279e-05,
"loss": 1.5961,
"num_input_tokens_seen": 16623816,
"step": 30500
},
{
"epoch": 0.3101488714583,
"grad_norm": 6.493512153625488,
"learning_rate": 4.844930566672003e-05,
"loss": 1.5977,
"num_input_tokens_seen": 16896624,
"step": 31000
},
{
"epoch": 0.3151512726108532,
"grad_norm": 7.902426242828369,
"learning_rate": 4.842429366095726e-05,
"loss": 1.6049,
"num_input_tokens_seen": 17168672,
"step": 31500
},
{
"epoch": 0.3201536737634064,
"grad_norm": 8.033360481262207,
"learning_rate": 4.83992816551945e-05,
"loss": 1.581,
"num_input_tokens_seen": 17433848,
"step": 32000
},
{
"epoch": 0.32515607491595966,
"grad_norm": 7.9239325523376465,
"learning_rate": 4.837426964943173e-05,
"loss": 1.6029,
"num_input_tokens_seen": 17704920,
"step": 32500
},
{
"epoch": 0.3301584760685129,
"grad_norm": 6.995474815368652,
"learning_rate": 4.834925764366897e-05,
"loss": 1.576,
"num_input_tokens_seen": 17969776,
"step": 33000
},
{
"epoch": 0.33516087722106613,
"grad_norm": 8.305245399475098,
"learning_rate": 4.8324245637906195e-05,
"loss": 1.5909,
"num_input_tokens_seen": 18239784,
"step": 33500
},
{
"epoch": 0.3401632783736193,
"grad_norm": 6.429056167602539,
"learning_rate": 4.829923363214343e-05,
"loss": 1.5742,
"num_input_tokens_seen": 18507688,
"step": 34000
},
{
"epoch": 0.34516567952617255,
"grad_norm": 7.208921432495117,
"learning_rate": 4.8274221626380665e-05,
"loss": 1.573,
"num_input_tokens_seen": 18776368,
"step": 34500
},
{
"epoch": 0.3501680806787258,
"grad_norm": 7.433680057525635,
"learning_rate": 4.82492096206179e-05,
"loss": 1.588,
"num_input_tokens_seen": 19050584,
"step": 35000
},
{
"epoch": 0.35517048183127903,
"grad_norm": 6.901820182800293,
"learning_rate": 4.8224197614855135e-05,
"loss": 1.58,
"num_input_tokens_seen": 19333968,
"step": 35500
},
{
"epoch": 0.36017288298383227,
"grad_norm": 8.789533615112305,
"learning_rate": 4.819918560909237e-05,
"loss": 1.5758,
"num_input_tokens_seen": 19612632,
"step": 36000
},
{
"epoch": 0.36517528413638545,
"grad_norm": 7.546513557434082,
"learning_rate": 4.8174173603329605e-05,
"loss": 1.5763,
"num_input_tokens_seen": 19883800,
"step": 36500
},
{
"epoch": 0.3701776852889387,
"grad_norm": 6.489349842071533,
"learning_rate": 4.814916159756683e-05,
"loss": 1.5866,
"num_input_tokens_seen": 20158632,
"step": 37000
},
{
"epoch": 0.3751800864414919,
"grad_norm": 7.696920871734619,
"learning_rate": 4.812414959180407e-05,
"loss": 1.5858,
"num_input_tokens_seen": 20430440,
"step": 37500
},
{
"epoch": 0.38018248759404516,
"grad_norm": 6.559112071990967,
"learning_rate": 4.80991375860413e-05,
"loss": 1.5596,
"num_input_tokens_seen": 20703976,
"step": 38000
},
{
"epoch": 0.38518488874659834,
"grad_norm": 8.480047225952148,
"learning_rate": 4.807412558027853e-05,
"loss": 1.5533,
"num_input_tokens_seen": 20972048,
"step": 38500
},
{
"epoch": 0.3901872898991516,
"grad_norm": 5.950156211853027,
"learning_rate": 4.804911357451577e-05,
"loss": 1.5642,
"num_input_tokens_seen": 21240912,
"step": 39000
},
{
"epoch": 0.3951896910517048,
"grad_norm": 5.799604892730713,
"learning_rate": 4.802410156875301e-05,
"loss": 1.5598,
"num_input_tokens_seen": 21507728,
"step": 39500
},
{
"epoch": 0.40019209220425805,
"grad_norm": 5.980112075805664,
"learning_rate": 4.7999089562990236e-05,
"loss": 1.5609,
"num_input_tokens_seen": 21783160,
"step": 40000
},
{
"epoch": 0.4051944933568113,
"grad_norm": 5.936067581176758,
"learning_rate": 4.797407755722747e-05,
"loss": 1.5523,
"num_input_tokens_seen": 22052792,
"step": 40500
},
{
"epoch": 0.4101968945093645,
"grad_norm": 9.166204452514648,
"learning_rate": 4.7949065551464706e-05,
"loss": 1.5452,
"num_input_tokens_seen": 22324072,
"step": 41000
},
{
"epoch": 0.4151992956619177,
"grad_norm": 8.854216575622559,
"learning_rate": 4.792405354570194e-05,
"loss": 1.5516,
"num_input_tokens_seen": 22590624,
"step": 41500
},
{
"epoch": 0.42020169681447095,
"grad_norm": 9.261016845703125,
"learning_rate": 4.789904153993917e-05,
"loss": 1.5374,
"num_input_tokens_seen": 22862424,
"step": 42000
},
{
"epoch": 0.4252040979670242,
"grad_norm": 7.714609622955322,
"learning_rate": 4.7874029534176404e-05,
"loss": 1.544,
"num_input_tokens_seen": 23128888,
"step": 42500
},
{
"epoch": 0.4302064991195774,
"grad_norm": 5.665945529937744,
"learning_rate": 4.784901752841364e-05,
"loss": 1.5532,
"num_input_tokens_seen": 23407576,
"step": 43000
},
{
"epoch": 0.4352089002721306,
"grad_norm": 6.948183536529541,
"learning_rate": 4.7824005522650874e-05,
"loss": 1.5467,
"num_input_tokens_seen": 23683544,
"step": 43500
},
{
"epoch": 0.44021130142468384,
"grad_norm": 5.725684642791748,
"learning_rate": 4.779899351688811e-05,
"loss": 1.5295,
"num_input_tokens_seen": 23948640,
"step": 44000
},
{
"epoch": 0.4452137025772371,
"grad_norm": 6.168211936950684,
"learning_rate": 4.7773981511125344e-05,
"loss": 1.5477,
"num_input_tokens_seen": 24212584,
"step": 44500
},
{
"epoch": 0.4502161037297903,
"grad_norm": 6.778971195220947,
"learning_rate": 4.774896950536258e-05,
"loss": 1.5134,
"num_input_tokens_seen": 24481104,
"step": 45000
},
{
"epoch": 0.4552185048823435,
"grad_norm": 7.2399210929870605,
"learning_rate": 4.772395749959981e-05,
"loss": 1.5447,
"num_input_tokens_seen": 24756992,
"step": 45500
},
{
"epoch": 0.46022090603489674,
"grad_norm": 6.476212024688721,
"learning_rate": 4.769894549383704e-05,
"loss": 1.5361,
"num_input_tokens_seen": 25020392,
"step": 46000
},
{
"epoch": 0.46522330718745,
"grad_norm": 10.64287281036377,
"learning_rate": 4.767393348807428e-05,
"loss": 1.5409,
"num_input_tokens_seen": 25296728,
"step": 46500
},
{
"epoch": 0.4702257083400032,
"grad_norm": 7.746724605560303,
"learning_rate": 4.764892148231151e-05,
"loss": 1.4953,
"num_input_tokens_seen": 25562256,
"step": 47000
},
{
"epoch": 0.47522810949255645,
"grad_norm": 6.38646125793457,
"learning_rate": 4.762390947654875e-05,
"loss": 1.5518,
"num_input_tokens_seen": 25833464,
"step": 47500
},
{
"epoch": 0.48023051064510963,
"grad_norm": 6.2214555740356445,
"learning_rate": 4.759889747078598e-05,
"loss": 1.5375,
"num_input_tokens_seen": 26108816,
"step": 48000
},
{
"epoch": 0.48523291179766287,
"grad_norm": 7.317322731018066,
"learning_rate": 4.757388546502322e-05,
"loss": 1.5449,
"num_input_tokens_seen": 26385360,
"step": 48500
},
{
"epoch": 0.4902353129502161,
"grad_norm": 6.4762701988220215,
"learning_rate": 4.7548873459260445e-05,
"loss": 1.5179,
"num_input_tokens_seen": 26656488,
"step": 49000
},
{
"epoch": 0.49523771410276934,
"grad_norm": 7.051132678985596,
"learning_rate": 4.752386145349768e-05,
"loss": 1.5213,
"num_input_tokens_seen": 26925744,
"step": 49500
},
{
"epoch": 0.5002401152553225,
"grad_norm": 8.628856658935547,
"learning_rate": 4.7498849447734915e-05,
"loss": 1.5517,
"num_input_tokens_seen": 27202296,
"step": 50000
},
{
"epoch": 0.5052425164078758,
"grad_norm": 6.756930351257324,
"learning_rate": 4.747383744197215e-05,
"loss": 1.5061,
"num_input_tokens_seen": 27469216,
"step": 50500
},
{
"epoch": 0.510244917560429,
"grad_norm": 8.543140411376953,
"learning_rate": 4.7448825436209385e-05,
"loss": 1.5323,
"num_input_tokens_seen": 27746208,
"step": 51000
},
{
"epoch": 0.5152473187129822,
"grad_norm": 7.62526273727417,
"learning_rate": 4.742381343044662e-05,
"loss": 1.5284,
"num_input_tokens_seen": 28024152,
"step": 51500
},
{
"epoch": 0.5202497198655355,
"grad_norm": 6.117633819580078,
"learning_rate": 4.7398801424683855e-05,
"loss": 1.5304,
"num_input_tokens_seen": 28300312,
"step": 52000
},
{
"epoch": 0.5252521210180887,
"grad_norm": 6.280879497528076,
"learning_rate": 4.737378941892108e-05,
"loss": 1.4873,
"num_input_tokens_seen": 28573472,
"step": 52500
},
{
"epoch": 0.530254522170642,
"grad_norm": 6.297519683837891,
"learning_rate": 4.734877741315832e-05,
"loss": 1.512,
"num_input_tokens_seen": 28846960,
"step": 53000
},
{
"epoch": 0.5352569233231952,
"grad_norm": 7.927740097045898,
"learning_rate": 4.732376540739555e-05,
"loss": 1.5303,
"num_input_tokens_seen": 29120648,
"step": 53500
},
{
"epoch": 0.5402593244757483,
"grad_norm": 6.746880054473877,
"learning_rate": 4.729875340163278e-05,
"loss": 1.5187,
"num_input_tokens_seen": 29389712,
"step": 54000
},
{
"epoch": 0.5452617256283016,
"grad_norm": 6.765636920928955,
"learning_rate": 4.7273741395870016e-05,
"loss": 1.502,
"num_input_tokens_seen": 29661920,
"step": 54500
},
{
"epoch": 0.5502641267808548,
"grad_norm": 4.868513107299805,
"learning_rate": 4.724872939010726e-05,
"loss": 1.499,
"num_input_tokens_seen": 29933584,
"step": 55000
},
{
"epoch": 0.555266527933408,
"grad_norm": 6.332437515258789,
"learning_rate": 4.722371738434449e-05,
"loss": 1.4815,
"num_input_tokens_seen": 30200096,
"step": 55500
},
{
"epoch": 0.5602689290859613,
"grad_norm": 6.429714679718018,
"learning_rate": 4.719870537858172e-05,
"loss": 1.5138,
"num_input_tokens_seen": 30475600,
"step": 56000
},
{
"epoch": 0.5652713302385145,
"grad_norm": 6.9108991622924805,
"learning_rate": 4.7173693372818956e-05,
"loss": 1.5218,
"num_input_tokens_seen": 30750536,
"step": 56500
},
{
"epoch": 0.5702737313910677,
"grad_norm": 7.187946796417236,
"learning_rate": 4.714868136705619e-05,
"loss": 1.4937,
"num_input_tokens_seen": 31023928,
"step": 57000
},
{
"epoch": 0.575276132543621,
"grad_norm": 7.880980014801025,
"learning_rate": 4.712366936129342e-05,
"loss": 1.5149,
"num_input_tokens_seen": 31295016,
"step": 57500
},
{
"epoch": 0.5802785336961742,
"grad_norm": 6.050008773803711,
"learning_rate": 4.7098657355530654e-05,
"loss": 1.5172,
"num_input_tokens_seen": 31571344,
"step": 58000
},
{
"epoch": 0.5852809348487273,
"grad_norm": 5.153658390045166,
"learning_rate": 4.707364534976789e-05,
"loss": 1.499,
"num_input_tokens_seen": 31847512,
"step": 58500
},
{
"epoch": 0.5902833360012806,
"grad_norm": 8.292108535766602,
"learning_rate": 4.704863334400513e-05,
"loss": 1.4897,
"num_input_tokens_seen": 32118720,
"step": 59000
},
{
"epoch": 0.5952857371538338,
"grad_norm": 5.900440216064453,
"learning_rate": 4.702362133824236e-05,
"loss": 1.5195,
"num_input_tokens_seen": 32397744,
"step": 59500
},
{
"epoch": 0.600288138306387,
"grad_norm": 7.023585796356201,
"learning_rate": 4.6998609332479594e-05,
"loss": 1.4755,
"num_input_tokens_seen": 32671912,
"step": 60000
},
{
"epoch": 0.6052905394589403,
"grad_norm": 7.419212818145752,
"learning_rate": 4.697359732671683e-05,
"loss": 1.4844,
"num_input_tokens_seen": 32941344,
"step": 60500
},
{
"epoch": 0.6102929406114935,
"grad_norm": 7.654923915863037,
"learning_rate": 4.694858532095406e-05,
"loss": 1.497,
"num_input_tokens_seen": 33221672,
"step": 61000
},
{
"epoch": 0.6152953417640468,
"grad_norm": 6.979129791259766,
"learning_rate": 4.692357331519129e-05,
"loss": 1.4855,
"num_input_tokens_seen": 33489080,
"step": 61500
},
{
"epoch": 0.6202977429166,
"grad_norm": 6.450369834899902,
"learning_rate": 4.689856130942853e-05,
"loss": 1.4945,
"num_input_tokens_seen": 33763488,
"step": 62000
},
{
"epoch": 0.6253001440691532,
"grad_norm": 6.070815563201904,
"learning_rate": 4.687354930366576e-05,
"loss": 1.4991,
"num_input_tokens_seen": 34031328,
"step": 62500
},
{
"epoch": 0.6303025452217064,
"grad_norm": 5.402656078338623,
"learning_rate": 4.6848537297903e-05,
"loss": 1.4957,
"num_input_tokens_seen": 34304608,
"step": 63000
},
{
"epoch": 0.6353049463742596,
"grad_norm": 10.961112022399902,
"learning_rate": 4.682352529214023e-05,
"loss": 1.4767,
"num_input_tokens_seen": 34569600,
"step": 63500
},
{
"epoch": 0.6403073475268128,
"grad_norm": 7.523622035980225,
"learning_rate": 4.679851328637747e-05,
"loss": 1.4953,
"num_input_tokens_seen": 34838720,
"step": 64000
},
{
"epoch": 0.6453097486793661,
"grad_norm": 7.367971420288086,
"learning_rate": 4.6773501280614695e-05,
"loss": 1.4959,
"num_input_tokens_seen": 35110952,
"step": 64500
},
{
"epoch": 0.6503121498319193,
"grad_norm": 7.122432231903076,
"learning_rate": 4.674848927485193e-05,
"loss": 1.4617,
"num_input_tokens_seen": 35379336,
"step": 65000
},
{
"epoch": 0.6553145509844726,
"grad_norm": 6.941073894500732,
"learning_rate": 4.6723477269089165e-05,
"loss": 1.4828,
"num_input_tokens_seen": 35654144,
"step": 65500
},
{
"epoch": 0.6603169521370258,
"grad_norm": 7.309379577636719,
"learning_rate": 4.669846526332639e-05,
"loss": 1.4695,
"num_input_tokens_seen": 35922592,
"step": 66000
},
{
"epoch": 0.665319353289579,
"grad_norm": 8.28540325164795,
"learning_rate": 4.6673453257563635e-05,
"loss": 1.4853,
"num_input_tokens_seen": 36195592,
"step": 66500
},
{
"epoch": 0.6703217544421323,
"grad_norm": 6.311332702636719,
"learning_rate": 4.664844125180087e-05,
"loss": 1.478,
"num_input_tokens_seen": 36471728,
"step": 67000
},
{
"epoch": 0.6753241555946854,
"grad_norm": 6.863243579864502,
"learning_rate": 4.6623429246038105e-05,
"loss": 1.4935,
"num_input_tokens_seen": 36738232,
"step": 67500
},
{
"epoch": 0.6803265567472386,
"grad_norm": 5.535435199737549,
"learning_rate": 4.659841724027533e-05,
"loss": 1.4689,
"num_input_tokens_seen": 37003552,
"step": 68000
},
{
"epoch": 0.6853289578997919,
"grad_norm": 7.348452568054199,
"learning_rate": 4.657340523451257e-05,
"loss": 1.4802,
"num_input_tokens_seen": 37273624,
"step": 68500
},
{
"epoch": 0.6903313590523451,
"grad_norm": 5.919636249542236,
"learning_rate": 4.65483932287498e-05,
"loss": 1.468,
"num_input_tokens_seen": 37542216,
"step": 69000
},
{
"epoch": 0.6953337602048983,
"grad_norm": 6.997893333435059,
"learning_rate": 4.652338122298703e-05,
"loss": 1.4711,
"num_input_tokens_seen": 37817712,
"step": 69500
},
{
"epoch": 0.7003361613574516,
"grad_norm": 7.683621883392334,
"learning_rate": 4.6498369217224266e-05,
"loss": 1.4467,
"num_input_tokens_seen": 38086368,
"step": 70000
},
{
"epoch": 0.7053385625100048,
"grad_norm": 5.56058931350708,
"learning_rate": 4.647335721146151e-05,
"loss": 1.4631,
"num_input_tokens_seen": 38364016,
"step": 70500
},
{
"epoch": 0.7103409636625581,
"grad_norm": 5.151466369628906,
"learning_rate": 4.644834520569874e-05,
"loss": 1.4776,
"num_input_tokens_seen": 38639544,
"step": 71000
},
{
"epoch": 0.7153433648151113,
"grad_norm": 7.764716625213623,
"learning_rate": 4.642333319993597e-05,
"loss": 1.4629,
"num_input_tokens_seen": 38900248,
"step": 71500
},
{
"epoch": 0.7203457659676645,
"grad_norm": 7.205192565917969,
"learning_rate": 4.6398321194173206e-05,
"loss": 1.4699,
"num_input_tokens_seen": 39177440,
"step": 72000
},
{
"epoch": 0.7253481671202177,
"grad_norm": 6.734379768371582,
"learning_rate": 4.637330918841044e-05,
"loss": 1.4649,
"num_input_tokens_seen": 39442976,
"step": 72500
},
{
"epoch": 0.7303505682727709,
"grad_norm": 6.191771507263184,
"learning_rate": 4.634829718264767e-05,
"loss": 1.4764,
"num_input_tokens_seen": 39715104,
"step": 73000
},
{
"epoch": 0.7353529694253241,
"grad_norm": 7.378221035003662,
"learning_rate": 4.6323285176884904e-05,
"loss": 1.4515,
"num_input_tokens_seen": 39990848,
"step": 73500
},
{
"epoch": 0.7403553705778774,
"grad_norm": 6.436953067779541,
"learning_rate": 4.629827317112214e-05,
"loss": 1.4495,
"num_input_tokens_seen": 40258280,
"step": 74000
},
{
"epoch": 0.7453577717304306,
"grad_norm": 5.954966068267822,
"learning_rate": 4.6273261165359374e-05,
"loss": 1.4497,
"num_input_tokens_seen": 40535544,
"step": 74500
},
{
"epoch": 0.7503601728829838,
"grad_norm": 6.085744857788086,
"learning_rate": 4.624824915959661e-05,
"loss": 1.4428,
"num_input_tokens_seen": 40804552,
"step": 75000
},
{
"epoch": 0.7553625740355371,
"grad_norm": 6.737603664398193,
"learning_rate": 4.6223237153833844e-05,
"loss": 1.4394,
"num_input_tokens_seen": 41072144,
"step": 75500
},
{
"epoch": 0.7603649751880903,
"grad_norm": 5.9119439125061035,
"learning_rate": 4.619822514807108e-05,
"loss": 1.4408,
"num_input_tokens_seen": 41341080,
"step": 76000
},
{
"epoch": 0.7653673763406436,
"grad_norm": 7.842981815338135,
"learning_rate": 4.617321314230831e-05,
"loss": 1.4538,
"num_input_tokens_seen": 41613760,
"step": 76500
},
{
"epoch": 0.7703697774931967,
"grad_norm": 7.999574184417725,
"learning_rate": 4.614820113654554e-05,
"loss": 1.451,
"num_input_tokens_seen": 41886512,
"step": 77000
},
{
"epoch": 0.7753721786457499,
"grad_norm": 5.851772308349609,
"learning_rate": 4.612318913078278e-05,
"loss": 1.4317,
"num_input_tokens_seen": 42154824,
"step": 77500
},
{
"epoch": 0.7803745797983032,
"grad_norm": 7.43974494934082,
"learning_rate": 4.609817712502001e-05,
"loss": 1.4707,
"num_input_tokens_seen": 42425080,
"step": 78000
},
{
"epoch": 0.7853769809508564,
"grad_norm": 6.566989898681641,
"learning_rate": 4.607316511925725e-05,
"loss": 1.4725,
"num_input_tokens_seen": 42695896,
"step": 78500
},
{
"epoch": 0.7903793821034096,
"grad_norm": 6.765398979187012,
"learning_rate": 4.604815311349448e-05,
"loss": 1.4454,
"num_input_tokens_seen": 42961864,
"step": 79000
},
{
"epoch": 0.7953817832559629,
"grad_norm": 5.989940643310547,
"learning_rate": 4.602314110773172e-05,
"loss": 1.4551,
"num_input_tokens_seen": 43237224,
"step": 79500
},
{
"epoch": 0.8003841844085161,
"grad_norm": 8.16629409790039,
"learning_rate": 4.5998129101968945e-05,
"loss": 1.4354,
"num_input_tokens_seen": 43512872,
"step": 80000
},
{
"epoch": 0.8053865855610693,
"grad_norm": 6.704333305358887,
"learning_rate": 4.597311709620618e-05,
"loss": 1.4481,
"num_input_tokens_seen": 43779448,
"step": 80500
},
{
"epoch": 0.8103889867136226,
"grad_norm": 6.2965593338012695,
"learning_rate": 4.5948105090443415e-05,
"loss": 1.4443,
"num_input_tokens_seen": 44053296,
"step": 81000
},
{
"epoch": 0.8153913878661757,
"grad_norm": 6.224064350128174,
"learning_rate": 4.592309308468064e-05,
"loss": 1.47,
"num_input_tokens_seen": 44328488,
"step": 81500
},
{
"epoch": 0.820393789018729,
"grad_norm": 6.873196601867676,
"learning_rate": 4.589808107891788e-05,
"loss": 1.4429,
"num_input_tokens_seen": 44604152,
"step": 82000
},
{
"epoch": 0.8253961901712822,
"grad_norm": 6.774177551269531,
"learning_rate": 4.587306907315512e-05,
"loss": 1.4427,
"num_input_tokens_seen": 44880240,
"step": 82500
},
{
"epoch": 0.8303985913238354,
"grad_norm": 7.543479919433594,
"learning_rate": 4.5848057067392355e-05,
"loss": 1.4559,
"num_input_tokens_seen": 45150264,
"step": 83000
},
{
"epoch": 0.8354009924763887,
"grad_norm": 6.445783615112305,
"learning_rate": 4.582304506162958e-05,
"loss": 1.4215,
"num_input_tokens_seen": 45419664,
"step": 83500
},
{
"epoch": 0.8404033936289419,
"grad_norm": 8.083765029907227,
"learning_rate": 4.579803305586682e-05,
"loss": 1.4636,
"num_input_tokens_seen": 45691768,
"step": 84000
},
{
"epoch": 0.8454057947814951,
"grad_norm": 6.205325126647949,
"learning_rate": 4.577302105010405e-05,
"loss": 1.4386,
"num_input_tokens_seen": 45965288,
"step": 84500
},
{
"epoch": 0.8504081959340484,
"grad_norm": 5.954364776611328,
"learning_rate": 4.574800904434128e-05,
"loss": 1.454,
"num_input_tokens_seen": 46239520,
"step": 85000
},
{
"epoch": 0.8554105970866016,
"grad_norm": 7.476288318634033,
"learning_rate": 4.5722997038578516e-05,
"loss": 1.4375,
"num_input_tokens_seen": 46506456,
"step": 85500
},
{
"epoch": 0.8604129982391548,
"grad_norm": 9.656715393066406,
"learning_rate": 4.569798503281575e-05,
"loss": 1.4293,
"num_input_tokens_seen": 46775832,
"step": 86000
},
{
"epoch": 0.865415399391708,
"grad_norm": 5.440873622894287,
"learning_rate": 4.567297302705299e-05,
"loss": 1.428,
"num_input_tokens_seen": 47053184,
"step": 86500
},
{
"epoch": 0.8704178005442612,
"grad_norm": 6.26190710067749,
"learning_rate": 4.564796102129022e-05,
"loss": 1.4175,
"num_input_tokens_seen": 47326160,
"step": 87000
},
{
"epoch": 0.8754202016968144,
"grad_norm": 5.701922416687012,
"learning_rate": 4.5622949015527456e-05,
"loss": 1.4323,
"num_input_tokens_seen": 47596624,
"step": 87500
},
{
"epoch": 0.8804226028493677,
"grad_norm": 7.687532901763916,
"learning_rate": 4.559793700976469e-05,
"loss": 1.4403,
"num_input_tokens_seen": 47866072,
"step": 88000
},
{
"epoch": 0.8854250040019209,
"grad_norm": 4.988935470581055,
"learning_rate": 4.557292500400192e-05,
"loss": 1.445,
"num_input_tokens_seen": 48153664,
"step": 88500
},
{
"epoch": 0.8904274051544742,
"grad_norm": 5.36391544342041,
"learning_rate": 4.5547912998239154e-05,
"loss": 1.43,
"num_input_tokens_seen": 48432248,
"step": 89000
},
{
"epoch": 0.8954298063070274,
"grad_norm": 7.618863105773926,
"learning_rate": 4.552290099247639e-05,
"loss": 1.4122,
"num_input_tokens_seen": 48705584,
"step": 89500
},
{
"epoch": 0.9004322074595806,
"grad_norm": 9.530303001403809,
"learning_rate": 4.5497888986713624e-05,
"loss": 1.4392,
"num_input_tokens_seen": 48985152,
"step": 90000
},
{
"epoch": 0.9054346086121339,
"grad_norm": 7.428534030914307,
"learning_rate": 4.547287698095086e-05,
"loss": 1.4289,
"num_input_tokens_seen": 49262664,
"step": 90500
},
{
"epoch": 0.910437009764687,
"grad_norm": 7.3600287437438965,
"learning_rate": 4.5447864975188094e-05,
"loss": 1.4242,
"num_input_tokens_seen": 49536480,
"step": 91000
},
{
"epoch": 0.9154394109172402,
"grad_norm": 5.594141960144043,
"learning_rate": 4.542285296942533e-05,
"loss": 1.4266,
"num_input_tokens_seen": 49809336,
"step": 91500
},
{
"epoch": 0.9204418120697935,
"grad_norm": 6.597540378570557,
"learning_rate": 4.539784096366256e-05,
"loss": 1.4216,
"num_input_tokens_seen": 50082984,
"step": 92000
},
{
"epoch": 0.9254442132223467,
"grad_norm": 8.180904388427734,
"learning_rate": 4.537282895789979e-05,
"loss": 1.4374,
"num_input_tokens_seen": 50358512,
"step": 92500
},
{
"epoch": 0.9304466143749,
"grad_norm": 7.512216567993164,
"learning_rate": 4.534781695213703e-05,
"loss": 1.4178,
"num_input_tokens_seen": 50634216,
"step": 93000
},
{
"epoch": 0.9354490155274532,
"grad_norm": 6.1448283195495605,
"learning_rate": 4.532280494637426e-05,
"loss": 1.399,
"num_input_tokens_seen": 50902088,
"step": 93500
},
{
"epoch": 0.9404514166800064,
"grad_norm": 6.424488544464111,
"learning_rate": 4.52977929406115e-05,
"loss": 1.4115,
"num_input_tokens_seen": 51174136,
"step": 94000
},
{
"epoch": 0.9454538178325597,
"grad_norm": 5.398598670959473,
"learning_rate": 4.527278093484873e-05,
"loss": 1.4146,
"num_input_tokens_seen": 51453752,
"step": 94500
},
{
"epoch": 0.9504562189851129,
"grad_norm": 6.272931098937988,
"learning_rate": 4.524776892908597e-05,
"loss": 1.4166,
"num_input_tokens_seen": 51728016,
"step": 95000
},
{
"epoch": 0.955458620137666,
"grad_norm": 6.412170886993408,
"learning_rate": 4.5222756923323195e-05,
"loss": 1.4241,
"num_input_tokens_seen": 51995272,
"step": 95500
},
{
"epoch": 0.9604610212902193,
"grad_norm": 7.181222438812256,
"learning_rate": 4.519774491756043e-05,
"loss": 1.4166,
"num_input_tokens_seen": 52276888,
"step": 96000
},
{
"epoch": 0.9654634224427725,
"grad_norm": 6.848874092102051,
"learning_rate": 4.5172732911797665e-05,
"loss": 1.4028,
"num_input_tokens_seen": 52555928,
"step": 96500
},
{
"epoch": 0.9704658235953257,
"grad_norm": 6.6588568687438965,
"learning_rate": 4.51477209060349e-05,
"loss": 1.3993,
"num_input_tokens_seen": 52824472,
"step": 97000
},
{
"epoch": 0.975468224747879,
"grad_norm": 7.8776373863220215,
"learning_rate": 4.512270890027213e-05,
"loss": 1.4204,
"num_input_tokens_seen": 53098176,
"step": 97500
},
{
"epoch": 0.9804706259004322,
"grad_norm": 5.281984806060791,
"learning_rate": 4.509769689450937e-05,
"loss": 1.4191,
"num_input_tokens_seen": 53379376,
"step": 98000
},
{
"epoch": 0.9854730270529855,
"grad_norm": 8.383103370666504,
"learning_rate": 4.5072684888746605e-05,
"loss": 1.4232,
"num_input_tokens_seen": 53654608,
"step": 98500
},
{
"epoch": 0.9904754282055387,
"grad_norm": 5.8474626541137695,
"learning_rate": 4.504767288298383e-05,
"loss": 1.4099,
"num_input_tokens_seen": 53931080,
"step": 99000
},
{
"epoch": 0.9954778293580919,
"grad_norm": 6.058784008026123,
"learning_rate": 4.502266087722107e-05,
"loss": 1.3993,
"num_input_tokens_seen": 54204800,
"step": 99500
},
{
"epoch": 1.0,
"eval_loss": 1.2487133741378784,
"eval_runtime": 187.129,
"eval_samples_per_second": 1068.274,
"eval_steps_per_second": 133.539,
"num_input_tokens_seen": 54454616,
"step": 99952
},
{
"epoch": 1.000480230510645,
"grad_norm": 5.304110050201416,
"learning_rate": 4.49976488714583e-05,
"loss": 1.3882,
"num_input_tokens_seen": 54481288,
"step": 100000
},
{
"epoch": 1.0054826316631984,
"grad_norm": 7.098052501678467,
"learning_rate": 4.497263686569553e-05,
"loss": 1.2973,
"num_input_tokens_seen": 54749928,
"step": 100500
},
{
"epoch": 1.0104850328157515,
"grad_norm": 7.15824031829834,
"learning_rate": 4.4947624859932766e-05,
"loss": 1.3323,
"num_input_tokens_seen": 55027920,
"step": 101000
},
{
"epoch": 1.0154874339683049,
"grad_norm": 6.138706684112549,
"learning_rate": 4.492261285417e-05,
"loss": 1.3195,
"num_input_tokens_seen": 55303960,
"step": 101500
},
{
"epoch": 1.020489835120858,
"grad_norm": 8.01395320892334,
"learning_rate": 4.4897600848407236e-05,
"loss": 1.2913,
"num_input_tokens_seen": 55577184,
"step": 102000
},
{
"epoch": 1.0254922362734111,
"grad_norm": 7.413015842437744,
"learning_rate": 4.487258884264447e-05,
"loss": 1.3284,
"num_input_tokens_seen": 55851192,
"step": 102500
},
{
"epoch": 1.0304946374259645,
"grad_norm": 6.665005207061768,
"learning_rate": 4.4847576836881706e-05,
"loss": 1.3239,
"num_input_tokens_seen": 56125184,
"step": 103000
},
{
"epoch": 1.0354970385785176,
"grad_norm": 6.208978652954102,
"learning_rate": 4.482256483111894e-05,
"loss": 1.3198,
"num_input_tokens_seen": 56399640,
"step": 103500
},
{
"epoch": 1.040499439731071,
"grad_norm": 6.494995594024658,
"learning_rate": 4.479755282535617e-05,
"loss": 1.3036,
"num_input_tokens_seen": 56672752,
"step": 104000
},
{
"epoch": 1.045501840883624,
"grad_norm": 7.3449625968933105,
"learning_rate": 4.4772540819593404e-05,
"loss": 1.3304,
"num_input_tokens_seen": 56942744,
"step": 104500
},
{
"epoch": 1.0505042420361774,
"grad_norm": 5.880083084106445,
"learning_rate": 4.474752881383064e-05,
"loss": 1.3273,
"num_input_tokens_seen": 57223568,
"step": 105000
},
{
"epoch": 1.0555066431887306,
"grad_norm": 7.793262004852295,
"learning_rate": 4.4722516808067874e-05,
"loss": 1.3364,
"num_input_tokens_seen": 57501104,
"step": 105500
},
{
"epoch": 1.060509044341284,
"grad_norm": 5.995269298553467,
"learning_rate": 4.469750480230511e-05,
"loss": 1.3157,
"num_input_tokens_seen": 57774032,
"step": 106000
},
{
"epoch": 1.065511445493837,
"grad_norm": 6.386702060699463,
"learning_rate": 4.4672492796542344e-05,
"loss": 1.2906,
"num_input_tokens_seen": 58052328,
"step": 106500
},
{
"epoch": 1.0705138466463904,
"grad_norm": 6.049729347229004,
"learning_rate": 4.464748079077958e-05,
"loss": 1.3073,
"num_input_tokens_seen": 58325864,
"step": 107000
},
{
"epoch": 1.0755162477989435,
"grad_norm": 6.0326433181762695,
"learning_rate": 4.462246878501681e-05,
"loss": 1.3223,
"num_input_tokens_seen": 58605688,
"step": 107500
},
{
"epoch": 1.0805186489514966,
"grad_norm": 7.254247188568115,
"learning_rate": 4.459745677925404e-05,
"loss": 1.3131,
"num_input_tokens_seen": 58875792,
"step": 108000
},
{
"epoch": 1.08552105010405,
"grad_norm": 5.334825038909912,
"learning_rate": 4.457244477349128e-05,
"loss": 1.3313,
"num_input_tokens_seen": 59148200,
"step": 108500
},
{
"epoch": 1.090523451256603,
"grad_norm": 5.982466697692871,
"learning_rate": 4.454743276772851e-05,
"loss": 1.3031,
"num_input_tokens_seen": 59416680,
"step": 109000
},
{
"epoch": 1.0955258524091565,
"grad_norm": 5.858680725097656,
"learning_rate": 4.452242076196575e-05,
"loss": 1.2964,
"num_input_tokens_seen": 59680504,
"step": 109500
},
{
"epoch": 1.1005282535617096,
"grad_norm": 7.001748085021973,
"learning_rate": 4.449740875620298e-05,
"loss": 1.3203,
"num_input_tokens_seen": 59951112,
"step": 110000
},
{
"epoch": 1.105530654714263,
"grad_norm": 7.0456013679504395,
"learning_rate": 4.447239675044022e-05,
"loss": 1.3229,
"num_input_tokens_seen": 60223952,
"step": 110500
},
{
"epoch": 1.110533055866816,
"grad_norm": 8.38005256652832,
"learning_rate": 4.4447384744677446e-05,
"loss": 1.3045,
"num_input_tokens_seen": 60497304,
"step": 111000
},
{
"epoch": 1.1155354570193694,
"grad_norm": 6.44760799407959,
"learning_rate": 4.442237273891468e-05,
"loss": 1.3298,
"num_input_tokens_seen": 60770032,
"step": 111500
},
{
"epoch": 1.1205378581719225,
"grad_norm": 7.661795616149902,
"learning_rate": 4.4397360733151915e-05,
"loss": 1.299,
"num_input_tokens_seen": 61041904,
"step": 112000
},
{
"epoch": 1.1255402593244757,
"grad_norm": 7.2505340576171875,
"learning_rate": 4.437234872738915e-05,
"loss": 1.3444,
"num_input_tokens_seen": 61315792,
"step": 112500
},
{
"epoch": 1.130542660477029,
"grad_norm": 8.16947078704834,
"learning_rate": 4.434733672162638e-05,
"loss": 1.313,
"num_input_tokens_seen": 61591968,
"step": 113000
},
{
"epoch": 1.1355450616295821,
"grad_norm": 6.221188068389893,
"learning_rate": 4.4322324715863614e-05,
"loss": 1.3266,
"num_input_tokens_seen": 61862648,
"step": 113500
},
{
"epoch": 1.1405474627821355,
"grad_norm": 5.967212677001953,
"learning_rate": 4.4297312710100855e-05,
"loss": 1.3521,
"num_input_tokens_seen": 62138024,
"step": 114000
},
{
"epoch": 1.1455498639346886,
"grad_norm": 6.872376441955566,
"learning_rate": 4.4272300704338084e-05,
"loss": 1.3218,
"num_input_tokens_seen": 62414352,
"step": 114500
},
{
"epoch": 1.150552265087242,
"grad_norm": 6.218190670013428,
"learning_rate": 4.424728869857532e-05,
"loss": 1.3306,
"num_input_tokens_seen": 62689104,
"step": 115000
},
{
"epoch": 1.155554666239795,
"grad_norm": 8.191985130310059,
"learning_rate": 4.4222276692812553e-05,
"loss": 1.3236,
"num_input_tokens_seen": 62963216,
"step": 115500
},
{
"epoch": 1.1605570673923484,
"grad_norm": 6.161906719207764,
"learning_rate": 4.419726468704979e-05,
"loss": 1.3258,
"num_input_tokens_seen": 63235456,
"step": 116000
},
{
"epoch": 1.1655594685449016,
"grad_norm": 7.158758640289307,
"learning_rate": 4.417225268128702e-05,
"loss": 1.3037,
"num_input_tokens_seen": 63505248,
"step": 116500
},
{
"epoch": 1.1705618696974547,
"grad_norm": 5.683105945587158,
"learning_rate": 4.414724067552425e-05,
"loss": 1.3154,
"num_input_tokens_seen": 63772504,
"step": 117000
},
{
"epoch": 1.175564270850008,
"grad_norm": 7.0123467445373535,
"learning_rate": 4.4122228669761487e-05,
"loss": 1.3043,
"num_input_tokens_seen": 64045928,
"step": 117500
},
{
"epoch": 1.1805666720025612,
"grad_norm": 5.434397220611572,
"learning_rate": 4.409721666399872e-05,
"loss": 1.3247,
"num_input_tokens_seen": 64313624,
"step": 118000
},
{
"epoch": 1.1855690731551145,
"grad_norm": 6.298323631286621,
"learning_rate": 4.4072204658235956e-05,
"loss": 1.3333,
"num_input_tokens_seen": 64591384,
"step": 118500
},
{
"epoch": 1.1905714743076676,
"grad_norm": 6.530762672424316,
"learning_rate": 4.404719265247319e-05,
"loss": 1.3324,
"num_input_tokens_seen": 64864128,
"step": 119000
},
{
"epoch": 1.195573875460221,
"grad_norm": 7.463630199432373,
"learning_rate": 4.4022180646710426e-05,
"loss": 1.314,
"num_input_tokens_seen": 65134680,
"step": 119500
},
{
"epoch": 1.200576276612774,
"grad_norm": 8.017274856567383,
"learning_rate": 4.3997168640947655e-05,
"loss": 1.3,
"num_input_tokens_seen": 65400128,
"step": 120000
},
{
"epoch": 1.2055786777653275,
"grad_norm": 6.083741188049316,
"learning_rate": 4.397215663518489e-05,
"loss": 1.3122,
"num_input_tokens_seen": 65670200,
"step": 120500
},
{
"epoch": 1.2105810789178806,
"grad_norm": 7.809543609619141,
"learning_rate": 4.3947144629422125e-05,
"loss": 1.316,
"num_input_tokens_seen": 65935248,
"step": 121000
},
{
"epoch": 1.2155834800704337,
"grad_norm": 6.627076148986816,
"learning_rate": 4.392213262365936e-05,
"loss": 1.3024,
"num_input_tokens_seen": 66206584,
"step": 121500
},
{
"epoch": 1.220585881222987,
"grad_norm": 5.432526111602783,
"learning_rate": 4.3897120617896594e-05,
"loss": 1.3181,
"num_input_tokens_seen": 66476424,
"step": 122000
},
{
"epoch": 1.2255882823755402,
"grad_norm": 5.557873249053955,
"learning_rate": 4.387210861213383e-05,
"loss": 1.3066,
"num_input_tokens_seen": 66746568,
"step": 122500
},
{
"epoch": 1.2305906835280935,
"grad_norm": 5.4136738777160645,
"learning_rate": 4.384709660637106e-05,
"loss": 1.3065,
"num_input_tokens_seen": 67013472,
"step": 123000
},
{
"epoch": 1.2355930846806467,
"grad_norm": 4.602624416351318,
"learning_rate": 4.382208460060829e-05,
"loss": 1.2921,
"num_input_tokens_seen": 67284136,
"step": 123500
},
{
"epoch": 1.2405954858332,
"grad_norm": 7.711009502410889,
"learning_rate": 4.379707259484553e-05,
"loss": 1.3104,
"num_input_tokens_seen": 67555712,
"step": 124000
},
{
"epoch": 1.2455978869857531,
"grad_norm": 5.971095561981201,
"learning_rate": 4.377206058908276e-05,
"loss": 1.3288,
"num_input_tokens_seen": 67830816,
"step": 124500
},
{
"epoch": 1.2506002881383065,
"grad_norm": 5.992773056030273,
"learning_rate": 4.374704858331999e-05,
"loss": 1.3372,
"num_input_tokens_seen": 68113208,
"step": 125000
},
{
"epoch": 1.2556026892908596,
"grad_norm": 7.2574238777160645,
"learning_rate": 4.372203657755723e-05,
"loss": 1.2964,
"num_input_tokens_seen": 68376088,
"step": 125500
},
{
"epoch": 1.2606050904434127,
"grad_norm": 4.974996566772461,
"learning_rate": 4.369702457179447e-05,
"loss": 1.3021,
"num_input_tokens_seen": 68641168,
"step": 126000
},
{
"epoch": 1.265607491595966,
"grad_norm": 5.745625019073486,
"learning_rate": 4.3672012566031696e-05,
"loss": 1.3217,
"num_input_tokens_seen": 68909752,
"step": 126500
},
{
"epoch": 1.2706098927485192,
"grad_norm": 6.78819465637207,
"learning_rate": 4.364700056026893e-05,
"loss": 1.3211,
"num_input_tokens_seen": 69181824,
"step": 127000
},
{
"epoch": 1.2756122939010726,
"grad_norm": 7.1991047859191895,
"learning_rate": 4.3621988554506166e-05,
"loss": 1.3175,
"num_input_tokens_seen": 69448304,
"step": 127500
},
{
"epoch": 1.2806146950536257,
"grad_norm": 5.636517524719238,
"learning_rate": 4.35969765487434e-05,
"loss": 1.308,
"num_input_tokens_seen": 69724960,
"step": 128000
},
{
"epoch": 1.285617096206179,
"grad_norm": 6.406187057495117,
"learning_rate": 4.357196454298063e-05,
"loss": 1.3225,
"num_input_tokens_seen": 70004440,
"step": 128500
},
{
"epoch": 1.2906194973587322,
"grad_norm": 5.746100902557373,
"learning_rate": 4.3546952537217864e-05,
"loss": 1.3084,
"num_input_tokens_seen": 70276824,
"step": 129000
},
{
"epoch": 1.2956218985112855,
"grad_norm": 5.6266584396362305,
"learning_rate": 4.3521940531455105e-05,
"loss": 1.3251,
"num_input_tokens_seen": 70549080,
"step": 129500
},
{
"epoch": 1.3006242996638386,
"grad_norm": 6.3568315505981445,
"learning_rate": 4.3496928525692334e-05,
"loss": 1.2909,
"num_input_tokens_seen": 70822216,
"step": 130000
},
{
"epoch": 1.3056267008163918,
"grad_norm": 6.566619873046875,
"learning_rate": 4.347191651992957e-05,
"loss": 1.3083,
"num_input_tokens_seen": 71088152,
"step": 130500
},
{
"epoch": 1.310629101968945,
"grad_norm": 8.060522079467773,
"learning_rate": 4.3446904514166804e-05,
"loss": 1.3124,
"num_input_tokens_seen": 71354416,
"step": 131000
},
{
"epoch": 1.3156315031214985,
"grad_norm": 7.366143226623535,
"learning_rate": 4.342189250840404e-05,
"loss": 1.317,
"num_input_tokens_seen": 71630192,
"step": 131500
},
{
"epoch": 1.3206339042740516,
"grad_norm": 6.985642910003662,
"learning_rate": 4.339688050264127e-05,
"loss": 1.3115,
"num_input_tokens_seen": 71898288,
"step": 132000
},
{
"epoch": 1.3256363054266047,
"grad_norm": 6.185933589935303,
"learning_rate": 4.33718684968785e-05,
"loss": 1.3227,
"num_input_tokens_seen": 72177880,
"step": 132500
},
{
"epoch": 1.330638706579158,
"grad_norm": 5.259435176849365,
"learning_rate": 4.334685649111574e-05,
"loss": 1.3202,
"num_input_tokens_seen": 72456024,
"step": 133000
},
{
"epoch": 1.3356411077317112,
"grad_norm": 6.163081169128418,
"learning_rate": 4.332184448535297e-05,
"loss": 1.3021,
"num_input_tokens_seen": 72724464,
"step": 133500
},
{
"epoch": 1.3406435088842645,
"grad_norm": 5.284718036651611,
"learning_rate": 4.329683247959021e-05,
"loss": 1.3063,
"num_input_tokens_seen": 72991696,
"step": 134000
},
{
"epoch": 1.3456459100368177,
"grad_norm": 6.016850471496582,
"learning_rate": 4.327182047382744e-05,
"loss": 1.3012,
"num_input_tokens_seen": 73261048,
"step": 134500
},
{
"epoch": 1.3506483111893708,
"grad_norm": 6.393965244293213,
"learning_rate": 4.3246808468064677e-05,
"loss": 1.2991,
"num_input_tokens_seen": 73529952,
"step": 135000
},
{
"epoch": 1.3556507123419241,
"grad_norm": 7.240478992462158,
"learning_rate": 4.3221796462301905e-05,
"loss": 1.3297,
"num_input_tokens_seen": 73806208,
"step": 135500
},
{
"epoch": 1.3606531134944775,
"grad_norm": 6.343556880950928,
"learning_rate": 4.319678445653914e-05,
"loss": 1.3228,
"num_input_tokens_seen": 74076360,
"step": 136000
},
{
"epoch": 1.3656555146470306,
"grad_norm": 5.717186450958252,
"learning_rate": 4.3171772450776375e-05,
"loss": 1.3018,
"num_input_tokens_seen": 74350688,
"step": 136500
},
{
"epoch": 1.3706579157995837,
"grad_norm": 5.872751235961914,
"learning_rate": 4.314676044501361e-05,
"loss": 1.3053,
"num_input_tokens_seen": 74623168,
"step": 137000
},
{
"epoch": 1.375660316952137,
"grad_norm": 6.422801971435547,
"learning_rate": 4.3121748439250845e-05,
"loss": 1.3107,
"num_input_tokens_seen": 74892400,
"step": 137500
},
{
"epoch": 1.3806627181046902,
"grad_norm": 5.038456439971924,
"learning_rate": 4.309673643348808e-05,
"loss": 1.3261,
"num_input_tokens_seen": 75161376,
"step": 138000
},
{
"epoch": 1.3856651192572436,
"grad_norm": 6.162600040435791,
"learning_rate": 4.3071724427725315e-05,
"loss": 1.2904,
"num_input_tokens_seen": 75437000,
"step": 138500
},
{
"epoch": 1.3906675204097967,
"grad_norm": 5.364713191986084,
"learning_rate": 4.304671242196254e-05,
"loss": 1.3162,
"num_input_tokens_seen": 75711312,
"step": 139000
},
{
"epoch": 1.3956699215623498,
"grad_norm": 6.959611415863037,
"learning_rate": 4.302170041619978e-05,
"loss": 1.3231,
"num_input_tokens_seen": 75982336,
"step": 139500
},
{
"epoch": 1.4006723227149032,
"grad_norm": 7.737590789794922,
"learning_rate": 4.299668841043701e-05,
"loss": 1.3175,
"num_input_tokens_seen": 76261536,
"step": 140000
},
{
"epoch": 1.4056747238674565,
"grad_norm": 5.541545391082764,
"learning_rate": 4.297167640467424e-05,
"loss": 1.3075,
"num_input_tokens_seen": 76530928,
"step": 140500
},
{
"epoch": 1.4106771250200096,
"grad_norm": 6.196156024932861,
"learning_rate": 4.2946664398911476e-05,
"loss": 1.3045,
"num_input_tokens_seen": 76805928,
"step": 141000
},
{
"epoch": 1.4156795261725628,
"grad_norm": 5.349905490875244,
"learning_rate": 4.292165239314872e-05,
"loss": 1.3223,
"num_input_tokens_seen": 77083224,
"step": 141500
},
{
"epoch": 1.4206819273251161,
"grad_norm": 5.8378586769104,
"learning_rate": 4.2896640387385946e-05,
"loss": 1.3025,
"num_input_tokens_seen": 77352800,
"step": 142000
},
{
"epoch": 1.4256843284776692,
"grad_norm": 6.061739921569824,
"learning_rate": 4.287162838162318e-05,
"loss": 1.3074,
"num_input_tokens_seen": 77625328,
"step": 142500
},
{
"epoch": 1.4306867296302226,
"grad_norm": 5.522953510284424,
"learning_rate": 4.2846616375860416e-05,
"loss": 1.3052,
"num_input_tokens_seen": 77902368,
"step": 143000
},
{
"epoch": 1.4356891307827757,
"grad_norm": 6.295720100402832,
"learning_rate": 4.282160437009765e-05,
"loss": 1.3118,
"num_input_tokens_seen": 78177488,
"step": 143500
},
{
"epoch": 1.4406915319353288,
"grad_norm": 6.575023651123047,
"learning_rate": 4.279659236433488e-05,
"loss": 1.3093,
"num_input_tokens_seen": 78446712,
"step": 144000
},
{
"epoch": 1.4456939330878822,
"grad_norm": 6.984113693237305,
"learning_rate": 4.2771580358572114e-05,
"loss": 1.3076,
"num_input_tokens_seen": 78720880,
"step": 144500
},
{
"epoch": 1.4506963342404355,
"grad_norm": 5.520240306854248,
"learning_rate": 4.274656835280935e-05,
"loss": 1.3001,
"num_input_tokens_seen": 78987824,
"step": 145000
},
{
"epoch": 1.4556987353929887,
"grad_norm": 8.607036590576172,
"learning_rate": 4.2721556347046584e-05,
"loss": 1.3129,
"num_input_tokens_seen": 79265448,
"step": 145500
},
{
"epoch": 1.4607011365455418,
"grad_norm": 5.851890563964844,
"learning_rate": 4.269654434128382e-05,
"loss": 1.283,
"num_input_tokens_seen": 79533224,
"step": 146000
},
{
"epoch": 1.4657035376980951,
"grad_norm": 6.837863922119141,
"learning_rate": 4.2671532335521054e-05,
"loss": 1.3191,
"num_input_tokens_seen": 79806784,
"step": 146500
},
{
"epoch": 1.4707059388506483,
"grad_norm": 8.558204650878906,
"learning_rate": 4.264652032975829e-05,
"loss": 1.3004,
"num_input_tokens_seen": 80082392,
"step": 147000
},
{
"epoch": 1.4757083400032016,
"grad_norm": 5.562234401702881,
"learning_rate": 4.262150832399552e-05,
"loss": 1.3127,
"num_input_tokens_seen": 80357544,
"step": 147500
},
{
"epoch": 1.4807107411557547,
"grad_norm": 6.331244945526123,
"learning_rate": 4.259649631823275e-05,
"loss": 1.2875,
"num_input_tokens_seen": 80619480,
"step": 148000
},
{
"epoch": 1.4857131423083079,
"grad_norm": 7.26661491394043,
"learning_rate": 4.257148431246999e-05,
"loss": 1.286,
"num_input_tokens_seen": 80889016,
"step": 148500
},
{
"epoch": 1.4907155434608612,
"grad_norm": 6.140303134918213,
"learning_rate": 4.254647230670722e-05,
"loss": 1.3209,
"num_input_tokens_seen": 81158600,
"step": 149000
},
{
"epoch": 1.4957179446134146,
"grad_norm": 6.452395439147949,
"learning_rate": 4.252146030094446e-05,
"loss": 1.3115,
"num_input_tokens_seen": 81438680,
"step": 149500
},
{
"epoch": 1.5007203457659677,
"grad_norm": 7.9884257316589355,
"learning_rate": 4.249644829518169e-05,
"loss": 1.2958,
"num_input_tokens_seen": 81705824,
"step": 150000
},
{
"epoch": 1.5057227469185208,
"grad_norm": 5.807667255401611,
"learning_rate": 4.247143628941893e-05,
"loss": 1.3309,
"num_input_tokens_seen": 81978560,
"step": 150500
},
{
"epoch": 1.5107251480710742,
"grad_norm": 6.487443447113037,
"learning_rate": 4.2446424283656155e-05,
"loss": 1.303,
"num_input_tokens_seen": 82250552,
"step": 151000
},
{
"epoch": 1.5157275492236273,
"grad_norm": 7.297651767730713,
"learning_rate": 4.242141227789339e-05,
"loss": 1.2961,
"num_input_tokens_seen": 82528296,
"step": 151500
},
{
"epoch": 1.5207299503761806,
"grad_norm": 6.434643268585205,
"learning_rate": 4.2396400272130625e-05,
"loss": 1.2926,
"num_input_tokens_seen": 82791496,
"step": 152000
},
{
"epoch": 1.5257323515287338,
"grad_norm": 6.918686389923096,
"learning_rate": 4.237138826636785e-05,
"loss": 1.2998,
"num_input_tokens_seen": 83063776,
"step": 152500
},
{
"epoch": 1.530734752681287,
"grad_norm": 5.594851493835449,
"learning_rate": 4.2346376260605095e-05,
"loss": 1.321,
"num_input_tokens_seen": 83339208,
"step": 153000
},
{
"epoch": 1.5357371538338402,
"grad_norm": 6.245510578155518,
"learning_rate": 4.232136425484233e-05,
"loss": 1.2743,
"num_input_tokens_seen": 83610920,
"step": 153500
},
{
"epoch": 1.5407395549863936,
"grad_norm": 6.392094612121582,
"learning_rate": 4.2296352249079565e-05,
"loss": 1.3062,
"num_input_tokens_seen": 83883600,
"step": 154000
},
{
"epoch": 1.5457419561389467,
"grad_norm": 6.538769245147705,
"learning_rate": 4.227134024331679e-05,
"loss": 1.3111,
"num_input_tokens_seen": 84152704,
"step": 154500
},
{
"epoch": 1.5507443572914998,
"grad_norm": 6.384563446044922,
"learning_rate": 4.224632823755403e-05,
"loss": 1.2767,
"num_input_tokens_seen": 84425920,
"step": 155000
},
{
"epoch": 1.5557467584440532,
"grad_norm": 6.407052040100098,
"learning_rate": 4.222131623179126e-05,
"loss": 1.2865,
"num_input_tokens_seen": 84697904,
"step": 155500
},
{
"epoch": 1.5607491595966063,
"grad_norm": 6.534234046936035,
"learning_rate": 4.219630422602849e-05,
"loss": 1.2817,
"num_input_tokens_seen": 84968680,
"step": 156000
},
{
"epoch": 1.5657515607491597,
"grad_norm": 5.641045093536377,
"learning_rate": 4.2171292220265726e-05,
"loss": 1.2963,
"num_input_tokens_seen": 85238032,
"step": 156500
},
{
"epoch": 1.5707539619017128,
"grad_norm": 6.242879867553711,
"learning_rate": 4.214628021450297e-05,
"loss": 1.2924,
"num_input_tokens_seen": 85511200,
"step": 157000
},
{
"epoch": 1.575756363054266,
"grad_norm": 6.90887451171875,
"learning_rate": 4.21212682087402e-05,
"loss": 1.2968,
"num_input_tokens_seen": 85785448,
"step": 157500
},
{
"epoch": 1.5807587642068193,
"grad_norm": 7.269606590270996,
"learning_rate": 4.209625620297743e-05,
"loss": 1.2845,
"num_input_tokens_seen": 86056312,
"step": 158000
},
{
"epoch": 1.5857611653593726,
"grad_norm": 5.152353763580322,
"learning_rate": 4.2071244197214666e-05,
"loss": 1.2909,
"num_input_tokens_seen": 86333312,
"step": 158500
},
{
"epoch": 1.5907635665119257,
"grad_norm": 6.0240631103515625,
"learning_rate": 4.20462321914519e-05,
"loss": 1.2923,
"num_input_tokens_seen": 86610480,
"step": 159000
},
{
"epoch": 1.5957659676644789,
"grad_norm": 7.361881256103516,
"learning_rate": 4.202122018568913e-05,
"loss": 1.2759,
"num_input_tokens_seen": 86882480,
"step": 159500
},
{
"epoch": 1.6007683688170322,
"grad_norm": 5.192800521850586,
"learning_rate": 4.1996208179926364e-05,
"loss": 1.298,
"num_input_tokens_seen": 87144592,
"step": 160000
},
{
"epoch": 1.6057707699695856,
"grad_norm": 7.1856369972229,
"learning_rate": 4.19711961741636e-05,
"loss": 1.2955,
"num_input_tokens_seen": 87420328,
"step": 160500
},
{
"epoch": 1.6107731711221387,
"grad_norm": 5.096145153045654,
"learning_rate": 4.1946184168400834e-05,
"loss": 1.2923,
"num_input_tokens_seen": 87696968,
"step": 161000
},
{
"epoch": 1.6157755722746918,
"grad_norm": 6.808541297912598,
"learning_rate": 4.192117216263807e-05,
"loss": 1.3044,
"num_input_tokens_seen": 87977352,
"step": 161500
},
{
"epoch": 1.620777973427245,
"grad_norm": 5.258007526397705,
"learning_rate": 4.1896160156875304e-05,
"loss": 1.3023,
"num_input_tokens_seen": 88251864,
"step": 162000
},
{
"epoch": 1.6257803745797983,
"grad_norm": 5.184575080871582,
"learning_rate": 4.187114815111254e-05,
"loss": 1.2594,
"num_input_tokens_seen": 88521800,
"step": 162500
},
{
"epoch": 1.6307827757323516,
"grad_norm": 5.858316421508789,
"learning_rate": 4.184613614534977e-05,
"loss": 1.2854,
"num_input_tokens_seen": 88788776,
"step": 163000
},
{
"epoch": 1.6357851768849048,
"grad_norm": 7.03213357925415,
"learning_rate": 4.1821124139587e-05,
"loss": 1.2745,
"num_input_tokens_seen": 89054472,
"step": 163500
},
{
"epoch": 1.640787578037458,
"grad_norm": 7.509394645690918,
"learning_rate": 4.179611213382424e-05,
"loss": 1.2932,
"num_input_tokens_seen": 89332456,
"step": 164000
},
{
"epoch": 1.6457899791900112,
"grad_norm": 7.114541530609131,
"learning_rate": 4.177110012806147e-05,
"loss": 1.2751,
"num_input_tokens_seen": 89609920,
"step": 164500
},
{
"epoch": 1.6507923803425646,
"grad_norm": 14.539456367492676,
"learning_rate": 4.174608812229871e-05,
"loss": 1.3115,
"num_input_tokens_seen": 89875856,
"step": 165000
},
{
"epoch": 1.6557947814951177,
"grad_norm": 5.730625629425049,
"learning_rate": 4.172107611653594e-05,
"loss": 1.2938,
"num_input_tokens_seen": 90148472,
"step": 165500
},
{
"epoch": 1.6607971826476708,
"grad_norm": 5.901363849639893,
"learning_rate": 4.169606411077318e-05,
"loss": 1.2895,
"num_input_tokens_seen": 90424400,
"step": 166000
},
{
"epoch": 1.665799583800224,
"grad_norm": 5.94663667678833,
"learning_rate": 4.1671052105010405e-05,
"loss": 1.2988,
"num_input_tokens_seen": 90702152,
"step": 166500
},
{
"epoch": 1.6708019849527773,
"grad_norm": 5.720317363739014,
"learning_rate": 4.164604009924764e-05,
"loss": 1.2921,
"num_input_tokens_seen": 90980904,
"step": 167000
},
{
"epoch": 1.6758043861053307,
"grad_norm": 8.514877319335938,
"learning_rate": 4.1621028093484875e-05,
"loss": 1.2762,
"num_input_tokens_seen": 91249200,
"step": 167500
},
{
"epoch": 1.6808067872578838,
"grad_norm": 8.756369590759277,
"learning_rate": 4.15960160877221e-05,
"loss": 1.2898,
"num_input_tokens_seen": 91523408,
"step": 168000
},
{
"epoch": 1.685809188410437,
"grad_norm": 4.922306537628174,
"learning_rate": 4.1571004081959345e-05,
"loss": 1.2873,
"num_input_tokens_seen": 91795848,
"step": 168500
},
{
"epoch": 1.6908115895629903,
"grad_norm": 5.668425559997559,
"learning_rate": 4.154599207619658e-05,
"loss": 1.2662,
"num_input_tokens_seen": 92067336,
"step": 169000
},
{
"epoch": 1.6958139907155436,
"grad_norm": 6.631772518157959,
"learning_rate": 4.1520980070433815e-05,
"loss": 1.3048,
"num_input_tokens_seen": 92339392,
"step": 169500
},
{
"epoch": 1.7008163918680967,
"grad_norm": 6.489889144897461,
"learning_rate": 4.149596806467104e-05,
"loss": 1.2835,
"num_input_tokens_seen": 92613216,
"step": 170000
},
{
"epoch": 1.7058187930206499,
"grad_norm": 6.344711780548096,
"learning_rate": 4.147095605890828e-05,
"loss": 1.2918,
"num_input_tokens_seen": 92890872,
"step": 170500
},
{
"epoch": 1.710821194173203,
"grad_norm": 7.276896953582764,
"learning_rate": 4.144594405314551e-05,
"loss": 1.303,
"num_input_tokens_seen": 93161528,
"step": 171000
},
{
"epoch": 1.7158235953257563,
"grad_norm": 6.139397144317627,
"learning_rate": 4.142093204738274e-05,
"loss": 1.2884,
"num_input_tokens_seen": 93434024,
"step": 171500
},
{
"epoch": 1.7208259964783097,
"grad_norm": 5.353676795959473,
"learning_rate": 4.1395920041619976e-05,
"loss": 1.2865,
"num_input_tokens_seen": 93712728,
"step": 172000
},
{
"epoch": 1.7258283976308628,
"grad_norm": 7.979468822479248,
"learning_rate": 4.137090803585721e-05,
"loss": 1.2884,
"num_input_tokens_seen": 93992640,
"step": 172500
},
{
"epoch": 1.730830798783416,
"grad_norm": 5.386059761047363,
"learning_rate": 4.134589603009445e-05,
"loss": 1.2875,
"num_input_tokens_seen": 94262344,
"step": 173000
},
{
"epoch": 1.7358331999359693,
"grad_norm": 4.8488311767578125,
"learning_rate": 4.132088402433168e-05,
"loss": 1.2636,
"num_input_tokens_seen": 94536416,
"step": 173500
},
{
"epoch": 1.7408356010885226,
"grad_norm": 7.375112056732178,
"learning_rate": 4.1295872018568916e-05,
"loss": 1.276,
"num_input_tokens_seen": 94806528,
"step": 174000
},
{
"epoch": 1.7458380022410758,
"grad_norm": 4.830787181854248,
"learning_rate": 4.127086001280615e-05,
"loss": 1.2681,
"num_input_tokens_seen": 95076832,
"step": 174500
},
{
"epoch": 1.750840403393629,
"grad_norm": 5.590123653411865,
"learning_rate": 4.124584800704338e-05,
"loss": 1.273,
"num_input_tokens_seen": 95342672,
"step": 175000
},
{
"epoch": 1.755842804546182,
"grad_norm": 5.334784984588623,
"learning_rate": 4.1220836001280614e-05,
"loss": 1.29,
"num_input_tokens_seen": 95613968,
"step": 175500
},
{
"epoch": 1.7608452056987354,
"grad_norm": 5.795757293701172,
"learning_rate": 4.119582399551785e-05,
"loss": 1.2683,
"num_input_tokens_seen": 95880488,
"step": 176000
},
{
"epoch": 1.7658476068512887,
"grad_norm": 5.436016082763672,
"learning_rate": 4.1170811989755084e-05,
"loss": 1.2665,
"num_input_tokens_seen": 96148192,
"step": 176500
},
{
"epoch": 1.7708500080038418,
"grad_norm": 7.753640174865723,
"learning_rate": 4.114579998399232e-05,
"loss": 1.3018,
"num_input_tokens_seen": 96422808,
"step": 177000
},
{
"epoch": 1.775852409156395,
"grad_norm": 6.833972454071045,
"learning_rate": 4.1120787978229554e-05,
"loss": 1.2731,
"num_input_tokens_seen": 96691008,
"step": 177500
},
{
"epoch": 1.7808548103089483,
"grad_norm": 5.354393482208252,
"learning_rate": 4.109577597246679e-05,
"loss": 1.2683,
"num_input_tokens_seen": 96964040,
"step": 178000
},
{
"epoch": 1.7858572114615017,
"grad_norm": 5.666247367858887,
"learning_rate": 4.107076396670402e-05,
"loss": 1.3028,
"num_input_tokens_seen": 97244192,
"step": 178500
},
{
"epoch": 1.7908596126140548,
"grad_norm": 5.841219902038574,
"learning_rate": 4.104575196094125e-05,
"loss": 1.2637,
"num_input_tokens_seen": 97515856,
"step": 179000
},
{
"epoch": 1.795862013766608,
"grad_norm": 6.097582817077637,
"learning_rate": 4.102073995517849e-05,
"loss": 1.2602,
"num_input_tokens_seen": 97784032,
"step": 179500
},
{
"epoch": 1.800864414919161,
"grad_norm": 6.291224002838135,
"learning_rate": 4.099572794941572e-05,
"loss": 1.2792,
"num_input_tokens_seen": 98061744,
"step": 180000
},
{
"epoch": 1.8058668160717144,
"grad_norm": 6.529845237731934,
"learning_rate": 4.097071594365296e-05,
"loss": 1.2751,
"num_input_tokens_seen": 98333416,
"step": 180500
},
{
"epoch": 1.8108692172242677,
"grad_norm": 5.767446041107178,
"learning_rate": 4.094570393789019e-05,
"loss": 1.2924,
"num_input_tokens_seen": 98611352,
"step": 181000
},
{
"epoch": 1.8158716183768209,
"grad_norm": 11.748208999633789,
"learning_rate": 4.092069193212743e-05,
"loss": 1.2656,
"num_input_tokens_seen": 98875048,
"step": 181500
},
{
"epoch": 1.820874019529374,
"grad_norm": 6.215290069580078,
"learning_rate": 4.0895679926364655e-05,
"loss": 1.2854,
"num_input_tokens_seen": 99142440,
"step": 182000
},
{
"epoch": 1.8258764206819273,
"grad_norm": 4.965378284454346,
"learning_rate": 4.087066792060189e-05,
"loss": 1.3011,
"num_input_tokens_seen": 99420240,
"step": 182500
},
{
"epoch": 1.8308788218344807,
"grad_norm": 4.903427600860596,
"learning_rate": 4.0845655914839125e-05,
"loss": 1.29,
"num_input_tokens_seen": 99700544,
"step": 183000
},
{
"epoch": 1.8358812229870338,
"grad_norm": 7.428767204284668,
"learning_rate": 4.082064390907635e-05,
"loss": 1.2784,
"num_input_tokens_seen": 99976104,
"step": 183500
},
{
"epoch": 1.840883624139587,
"grad_norm": 5.334924697875977,
"learning_rate": 4.079563190331359e-05,
"loss": 1.2617,
"num_input_tokens_seen": 100248360,
"step": 184000
},
{
"epoch": 1.84588602529214,
"grad_norm": 5.380727291107178,
"learning_rate": 4.077061989755083e-05,
"loss": 1.2824,
"num_input_tokens_seen": 100520712,
"step": 184500
},
{
"epoch": 1.8508884264446934,
"grad_norm": 6.993951320648193,
"learning_rate": 4.0745607891788065e-05,
"loss": 1.3006,
"num_input_tokens_seen": 100795856,
"step": 185000
},
{
"epoch": 1.8558908275972468,
"grad_norm": 6.079780578613281,
"learning_rate": 4.072059588602529e-05,
"loss": 1.2844,
"num_input_tokens_seen": 101069232,
"step": 185500
},
{
"epoch": 1.8608932287498,
"grad_norm": 5.772866725921631,
"learning_rate": 4.069558388026253e-05,
"loss": 1.269,
"num_input_tokens_seen": 101347816,
"step": 186000
},
{
"epoch": 1.865895629902353,
"grad_norm": 6.067032337188721,
"learning_rate": 4.067057187449976e-05,
"loss": 1.2753,
"num_input_tokens_seen": 101618784,
"step": 186500
},
{
"epoch": 1.8708980310549064,
"grad_norm": 8.178043365478516,
"learning_rate": 4.064555986873699e-05,
"loss": 1.2694,
"num_input_tokens_seen": 101889416,
"step": 187000
},
{
"epoch": 1.8759004322074597,
"grad_norm": 5.999898433685303,
"learning_rate": 4.0620547862974226e-05,
"loss": 1.2774,
"num_input_tokens_seen": 102163040,
"step": 187500
},
{
"epoch": 1.8809028333600128,
"grad_norm": 7.069881439208984,
"learning_rate": 4.059553585721146e-05,
"loss": 1.2848,
"num_input_tokens_seen": 102431528,
"step": 188000
},
{
"epoch": 1.885905234512566,
"grad_norm": 5.21435546875,
"learning_rate": 4.05705238514487e-05,
"loss": 1.2599,
"num_input_tokens_seen": 102705520,
"step": 188500
},
{
"epoch": 1.890907635665119,
"grad_norm": 6.542243003845215,
"learning_rate": 4.054551184568593e-05,
"loss": 1.283,
"num_input_tokens_seen": 102981304,
"step": 189000
},
{
"epoch": 1.8959100368176725,
"grad_norm": 6.719133377075195,
"learning_rate": 4.0520499839923166e-05,
"loss": 1.2879,
"num_input_tokens_seen": 103259112,
"step": 189500
},
{
"epoch": 1.9009124379702258,
"grad_norm": 6.38728666305542,
"learning_rate": 4.04954878341604e-05,
"loss": 1.2548,
"num_input_tokens_seen": 103534888,
"step": 190000
},
{
"epoch": 1.905914839122779,
"grad_norm": 5.428126811981201,
"learning_rate": 4.047047582839763e-05,
"loss": 1.28,
"num_input_tokens_seen": 103803600,
"step": 190500
},
{
"epoch": 1.910917240275332,
"grad_norm": 5.377976894378662,
"learning_rate": 4.0445463822634864e-05,
"loss": 1.2658,
"num_input_tokens_seen": 104075160,
"step": 191000
},
{
"epoch": 1.9159196414278854,
"grad_norm": 5.453880786895752,
"learning_rate": 4.04204518168721e-05,
"loss": 1.2643,
"num_input_tokens_seen": 104349992,
"step": 191500
},
{
"epoch": 1.9209220425804387,
"grad_norm": 5.114168167114258,
"learning_rate": 4.0395439811109334e-05,
"loss": 1.2769,
"num_input_tokens_seen": 104621104,
"step": 192000
},
{
"epoch": 1.9259244437329919,
"grad_norm": 5.22728157043457,
"learning_rate": 4.037042780534657e-05,
"loss": 1.2842,
"num_input_tokens_seen": 104890976,
"step": 192500
},
{
"epoch": 1.930926844885545,
"grad_norm": 5.4410881996154785,
"learning_rate": 4.0345415799583804e-05,
"loss": 1.2627,
"num_input_tokens_seen": 105165152,
"step": 193000
},
{
"epoch": 1.9359292460380981,
"grad_norm": 5.700538158416748,
"learning_rate": 4.032040379382104e-05,
"loss": 1.275,
"num_input_tokens_seen": 105431920,
"step": 193500
},
{
"epoch": 1.9409316471906515,
"grad_norm": 5.171668529510498,
"learning_rate": 4.029539178805827e-05,
"loss": 1.2852,
"num_input_tokens_seen": 105709976,
"step": 194000
},
{
"epoch": 1.9459340483432048,
"grad_norm": 7.026444911956787,
"learning_rate": 4.02703797822955e-05,
"loss": 1.2718,
"num_input_tokens_seen": 105983472,
"step": 194500
},
{
"epoch": 1.950936449495758,
"grad_norm": 6.670947074890137,
"learning_rate": 4.024536777653274e-05,
"loss": 1.2574,
"num_input_tokens_seen": 106257632,
"step": 195000
},
{
"epoch": 1.955938850648311,
"grad_norm": 5.797586441040039,
"learning_rate": 4.022035577076997e-05,
"loss": 1.2821,
"num_input_tokens_seen": 106533272,
"step": 195500
},
{
"epoch": 1.9609412518008644,
"grad_norm": 7.070456504821777,
"learning_rate": 4.019534376500721e-05,
"loss": 1.2749,
"num_input_tokens_seen": 106804176,
"step": 196000
},
{
"epoch": 1.9659436529534178,
"grad_norm": 5.074236869812012,
"learning_rate": 4.017033175924444e-05,
"loss": 1.2837,
"num_input_tokens_seen": 107077264,
"step": 196500
},
{
"epoch": 1.970946054105971,
"grad_norm": 5.952401161193848,
"learning_rate": 4.014531975348168e-05,
"loss": 1.2481,
"num_input_tokens_seen": 107342400,
"step": 197000
},
{
"epoch": 1.975948455258524,
"grad_norm": 6.63128662109375,
"learning_rate": 4.0120307747718905e-05,
"loss": 1.2769,
"num_input_tokens_seen": 107619760,
"step": 197500
},
{
"epoch": 1.9809508564110772,
"grad_norm": 6.601523399353027,
"learning_rate": 4.009529574195614e-05,
"loss": 1.275,
"num_input_tokens_seen": 107898512,
"step": 198000
},
{
"epoch": 1.9859532575636305,
"grad_norm": 6.857260227203369,
"learning_rate": 4.0070283736193375e-05,
"loss": 1.2508,
"num_input_tokens_seen": 108163904,
"step": 198500
},
{
"epoch": 1.9909556587161839,
"grad_norm": 5.871264934539795,
"learning_rate": 4.004527173043061e-05,
"loss": 1.2712,
"num_input_tokens_seen": 108440480,
"step": 199000
},
{
"epoch": 1.995958059868737,
"grad_norm": 7.567385673522949,
"learning_rate": 4.002025972466784e-05,
"loss": 1.2801,
"num_input_tokens_seen": 108714992,
"step": 199500
},
{
"epoch": 2.0,
"eval_loss": 1.1701077222824097,
"eval_runtime": 186.6785,
"eval_samples_per_second": 1070.851,
"eval_steps_per_second": 133.861,
"num_input_tokens_seen": 108935048,
"step": 199904
},
{
"epoch": 2.00096046102129,
"grad_norm": 5.430812835693359,
"learning_rate": 3.999524771890507e-05,
"loss": 1.2538,
"num_input_tokens_seen": 108986608,
"step": 200000
},
{
"epoch": 2.0059628621738432,
"grad_norm": 7.064018249511719,
"learning_rate": 3.9970235713142315e-05,
"loss": 1.1538,
"num_input_tokens_seen": 109256864,
"step": 200500
},
{
"epoch": 2.010965263326397,
"grad_norm": 6.479573726654053,
"learning_rate": 3.994522370737954e-05,
"loss": 1.1664,
"num_input_tokens_seen": 109523672,
"step": 201000
},
{
"epoch": 2.01596766447895,
"grad_norm": 6.595979690551758,
"learning_rate": 3.992021170161678e-05,
"loss": 1.1338,
"num_input_tokens_seen": 109791408,
"step": 201500
},
{
"epoch": 2.020970065631503,
"grad_norm": 7.46008825302124,
"learning_rate": 3.989519969585401e-05,
"loss": 1.1799,
"num_input_tokens_seen": 110064104,
"step": 202000
},
{
"epoch": 2.025972466784056,
"grad_norm": 5.414816379547119,
"learning_rate": 3.987018769009124e-05,
"loss": 1.1688,
"num_input_tokens_seen": 110335328,
"step": 202500
},
{
"epoch": 2.0309748679366098,
"grad_norm": 7.442201137542725,
"learning_rate": 3.9845175684328476e-05,
"loss": 1.1804,
"num_input_tokens_seen": 110611648,
"step": 203000
},
{
"epoch": 2.035977269089163,
"grad_norm": 5.2355475425720215,
"learning_rate": 3.982016367856571e-05,
"loss": 1.1776,
"num_input_tokens_seen": 110883064,
"step": 203500
},
{
"epoch": 2.040979670241716,
"grad_norm": 7.008761882781982,
"learning_rate": 3.9795151672802946e-05,
"loss": 1.1622,
"num_input_tokens_seen": 111162464,
"step": 204000
},
{
"epoch": 2.045982071394269,
"grad_norm": 5.213141918182373,
"learning_rate": 3.977013966704018e-05,
"loss": 1.1863,
"num_input_tokens_seen": 111434960,
"step": 204500
},
{
"epoch": 2.0509844725468223,
"grad_norm": 6.3171000480651855,
"learning_rate": 3.9745127661277416e-05,
"loss": 1.1735,
"num_input_tokens_seen": 111707896,
"step": 205000
},
{
"epoch": 2.055986873699376,
"grad_norm": 5.790093898773193,
"learning_rate": 3.972011565551465e-05,
"loss": 1.1916,
"num_input_tokens_seen": 111976120,
"step": 205500
},
{
"epoch": 2.060989274851929,
"grad_norm": 5.817662239074707,
"learning_rate": 3.969510364975188e-05,
"loss": 1.1996,
"num_input_tokens_seen": 112248384,
"step": 206000
},
{
"epoch": 2.065991676004482,
"grad_norm": 7.098010063171387,
"learning_rate": 3.9670091643989114e-05,
"loss": 1.1698,
"num_input_tokens_seen": 112525640,
"step": 206500
},
{
"epoch": 2.070994077157035,
"grad_norm": 5.172534942626953,
"learning_rate": 3.964507963822635e-05,
"loss": 1.17,
"num_input_tokens_seen": 112794848,
"step": 207000
},
{
"epoch": 2.0759964783095888,
"grad_norm": 5.681086540222168,
"learning_rate": 3.9620067632463584e-05,
"loss": 1.1812,
"num_input_tokens_seen": 113070744,
"step": 207500
},
{
"epoch": 2.080998879462142,
"grad_norm": 6.1572489738464355,
"learning_rate": 3.959505562670082e-05,
"loss": 1.1733,
"num_input_tokens_seen": 113352016,
"step": 208000
},
{
"epoch": 2.086001280614695,
"grad_norm": 6.149631023406982,
"learning_rate": 3.9570043620938054e-05,
"loss": 1.1733,
"num_input_tokens_seen": 113629168,
"step": 208500
},
{
"epoch": 2.091003681767248,
"grad_norm": 4.973777770996094,
"learning_rate": 3.954503161517529e-05,
"loss": 1.1732,
"num_input_tokens_seen": 113899440,
"step": 209000
},
{
"epoch": 2.0960060829198017,
"grad_norm": 12.051576614379883,
"learning_rate": 3.952001960941252e-05,
"loss": 1.1974,
"num_input_tokens_seen": 114170160,
"step": 209500
},
{
"epoch": 2.101008484072355,
"grad_norm": 5.458679676055908,
"learning_rate": 3.949500760364975e-05,
"loss": 1.1664,
"num_input_tokens_seen": 114441696,
"step": 210000
},
{
"epoch": 2.106010885224908,
"grad_norm": 5.033444404602051,
"learning_rate": 3.946999559788699e-05,
"loss": 1.1781,
"num_input_tokens_seen": 114715752,
"step": 210500
},
{
"epoch": 2.111013286377461,
"grad_norm": 5.643963813781738,
"learning_rate": 3.944498359212422e-05,
"loss": 1.1849,
"num_input_tokens_seen": 114996136,
"step": 211000
},
{
"epoch": 2.1160156875300142,
"grad_norm": 6.656442165374756,
"learning_rate": 3.941997158636145e-05,
"loss": 1.1792,
"num_input_tokens_seen": 115259352,
"step": 211500
},
{
"epoch": 2.121018088682568,
"grad_norm": 5.712615013122559,
"learning_rate": 3.939495958059869e-05,
"loss": 1.1799,
"num_input_tokens_seen": 115534944,
"step": 212000
},
{
"epoch": 2.126020489835121,
"grad_norm": 8.317249298095703,
"learning_rate": 3.936994757483593e-05,
"loss": 1.1853,
"num_input_tokens_seen": 115808536,
"step": 212500
},
{
"epoch": 2.131022890987674,
"grad_norm": 6.112279415130615,
"learning_rate": 3.9344935569073155e-05,
"loss": 1.1822,
"num_input_tokens_seen": 116087064,
"step": 213000
},
{
"epoch": 2.136025292140227,
"grad_norm": 7.357901096343994,
"learning_rate": 3.931992356331039e-05,
"loss": 1.1866,
"num_input_tokens_seen": 116365200,
"step": 213500
},
{
"epoch": 2.1410276932927808,
"grad_norm": 5.3409929275512695,
"learning_rate": 3.9294911557547625e-05,
"loss": 1.1897,
"num_input_tokens_seen": 116636120,
"step": 214000
},
{
"epoch": 2.146030094445334,
"grad_norm": 7.562960624694824,
"learning_rate": 3.926989955178486e-05,
"loss": 1.1809,
"num_input_tokens_seen": 116916360,
"step": 214500
},
{
"epoch": 2.151032495597887,
"grad_norm": 5.4258503913879395,
"learning_rate": 3.924488754602209e-05,
"loss": 1.1871,
"num_input_tokens_seen": 117184808,
"step": 215000
},
{
"epoch": 2.15603489675044,
"grad_norm": 6.741093158721924,
"learning_rate": 3.9219875540259324e-05,
"loss": 1.176,
"num_input_tokens_seen": 117454928,
"step": 215500
},
{
"epoch": 2.1610372979029933,
"grad_norm": 6.085869789123535,
"learning_rate": 3.9194863534496565e-05,
"loss": 1.1789,
"num_input_tokens_seen": 117722352,
"step": 216000
},
{
"epoch": 2.166039699055547,
"grad_norm": 6.9086151123046875,
"learning_rate": 3.9169851528733793e-05,
"loss": 1.189,
"num_input_tokens_seen": 117990704,
"step": 216500
},
{
"epoch": 2.1710421002081,
"grad_norm": 5.497861385345459,
"learning_rate": 3.914483952297103e-05,
"loss": 1.1768,
"num_input_tokens_seen": 118255368,
"step": 217000
},
{
"epoch": 2.176044501360653,
"grad_norm": 8.487640380859375,
"learning_rate": 3.9119827517208263e-05,
"loss": 1.1743,
"num_input_tokens_seen": 118525432,
"step": 217500
},
{
"epoch": 2.181046902513206,
"grad_norm": 5.8003435134887695,
"learning_rate": 3.90948155114455e-05,
"loss": 1.1697,
"num_input_tokens_seen": 118799496,
"step": 218000
},
{
"epoch": 2.18604930366576,
"grad_norm": 7.726077079772949,
"learning_rate": 3.9069803505682727e-05,
"loss": 1.1784,
"num_input_tokens_seen": 119074944,
"step": 218500
},
{
"epoch": 2.191051704818313,
"grad_norm": 5.625581741333008,
"learning_rate": 3.904479149991996e-05,
"loss": 1.1856,
"num_input_tokens_seen": 119353528,
"step": 219000
},
{
"epoch": 2.196054105970866,
"grad_norm": 5.582902908325195,
"learning_rate": 3.9019779494157196e-05,
"loss": 1.1832,
"num_input_tokens_seen": 119627520,
"step": 219500
},
{
"epoch": 2.201056507123419,
"grad_norm": 5.2057671546936035,
"learning_rate": 3.899476748839443e-05,
"loss": 1.1888,
"num_input_tokens_seen": 119894432,
"step": 220000
},
{
"epoch": 2.2060589082759723,
"grad_norm": 6.18375825881958,
"learning_rate": 3.8969755482631666e-05,
"loss": 1.1636,
"num_input_tokens_seen": 120165872,
"step": 220500
},
{
"epoch": 2.211061309428526,
"grad_norm": 7.083649158477783,
"learning_rate": 3.89447434768689e-05,
"loss": 1.1716,
"num_input_tokens_seen": 120437360,
"step": 221000
},
{
"epoch": 2.216063710581079,
"grad_norm": 6.966033458709717,
"learning_rate": 3.891973147110613e-05,
"loss": 1.1899,
"num_input_tokens_seen": 120707824,
"step": 221500
},
{
"epoch": 2.221066111733632,
"grad_norm": 4.439563751220703,
"learning_rate": 3.8894719465343365e-05,
"loss": 1.1714,
"num_input_tokens_seen": 120977840,
"step": 222000
},
{
"epoch": 2.2260685128861852,
"grad_norm": 6.870123386383057,
"learning_rate": 3.88697074595806e-05,
"loss": 1.1793,
"num_input_tokens_seen": 121254560,
"step": 222500
},
{
"epoch": 2.231070914038739,
"grad_norm": 8.789484024047852,
"learning_rate": 3.8844695453817834e-05,
"loss": 1.1851,
"num_input_tokens_seen": 121523936,
"step": 223000
},
{
"epoch": 2.236073315191292,
"grad_norm": 6.196369647979736,
"learning_rate": 3.881968344805507e-05,
"loss": 1.1857,
"num_input_tokens_seen": 121795288,
"step": 223500
},
{
"epoch": 2.241075716343845,
"grad_norm": 5.902594566345215,
"learning_rate": 3.8794671442292304e-05,
"loss": 1.2016,
"num_input_tokens_seen": 122065592,
"step": 224000
},
{
"epoch": 2.246078117496398,
"grad_norm": 6.811281681060791,
"learning_rate": 3.876965943652954e-05,
"loss": 1.1837,
"num_input_tokens_seen": 122340584,
"step": 224500
},
{
"epoch": 2.2510805186489513,
"grad_norm": 6.388464450836182,
"learning_rate": 3.874464743076677e-05,
"loss": 1.1816,
"num_input_tokens_seen": 122612352,
"step": 225000
},
{
"epoch": 2.256082919801505,
"grad_norm": 6.045330047607422,
"learning_rate": 3.8719635425004e-05,
"loss": 1.1835,
"num_input_tokens_seen": 122878624,
"step": 225500
},
{
"epoch": 2.261085320954058,
"grad_norm": 7.601827621459961,
"learning_rate": 3.869462341924124e-05,
"loss": 1.1766,
"num_input_tokens_seen": 123143944,
"step": 226000
},
{
"epoch": 2.266087722106611,
"grad_norm": 5.323575496673584,
"learning_rate": 3.866961141347847e-05,
"loss": 1.1913,
"num_input_tokens_seen": 123415600,
"step": 226500
},
{
"epoch": 2.2710901232591643,
"grad_norm": 6.938271522521973,
"learning_rate": 3.86445994077157e-05,
"loss": 1.1812,
"num_input_tokens_seen": 123682608,
"step": 227000
},
{
"epoch": 2.276092524411718,
"grad_norm": 5.9254021644592285,
"learning_rate": 3.861958740195294e-05,
"loss": 1.1856,
"num_input_tokens_seen": 123954888,
"step": 227500
},
{
"epoch": 2.281094925564271,
"grad_norm": 7.544998645782471,
"learning_rate": 3.859457539619018e-05,
"loss": 1.1957,
"num_input_tokens_seen": 124230632,
"step": 228000
},
{
"epoch": 2.286097326716824,
"grad_norm": 6.14992618560791,
"learning_rate": 3.8569563390427406e-05,
"loss": 1.1894,
"num_input_tokens_seen": 124507352,
"step": 228500
},
{
"epoch": 2.291099727869377,
"grad_norm": 5.440382957458496,
"learning_rate": 3.854455138466464e-05,
"loss": 1.1752,
"num_input_tokens_seen": 124781440,
"step": 229000
},
{
"epoch": 2.2961021290219303,
"grad_norm": 7.271317481994629,
"learning_rate": 3.8519539378901876e-05,
"loss": 1.1749,
"num_input_tokens_seen": 125055624,
"step": 229500
},
{
"epoch": 2.301104530174484,
"grad_norm": 5.141626834869385,
"learning_rate": 3.849452737313911e-05,
"loss": 1.18,
"num_input_tokens_seen": 125329000,
"step": 230000
},
{
"epoch": 2.306106931327037,
"grad_norm": 6.321171760559082,
"learning_rate": 3.846951536737634e-05,
"loss": 1.1741,
"num_input_tokens_seen": 125605816,
"step": 230500
},
{
"epoch": 2.31110933247959,
"grad_norm": 5.19276237487793,
"learning_rate": 3.8444503361613574e-05,
"loss": 1.1966,
"num_input_tokens_seen": 125887328,
"step": 231000
},
{
"epoch": 2.3161117336321433,
"grad_norm": 5.9422125816345215,
"learning_rate": 3.841949135585081e-05,
"loss": 1.1638,
"num_input_tokens_seen": 126158768,
"step": 231500
},
{
"epoch": 2.321114134784697,
"grad_norm": 5.361838340759277,
"learning_rate": 3.8394479350088044e-05,
"loss": 1.1737,
"num_input_tokens_seen": 126429432,
"step": 232000
},
{
"epoch": 2.32611653593725,
"grad_norm": 6.030839920043945,
"learning_rate": 3.836946734432528e-05,
"loss": 1.1975,
"num_input_tokens_seen": 126703336,
"step": 232500
},
{
"epoch": 2.331118937089803,
"grad_norm": 6.013172149658203,
"learning_rate": 3.8344455338562514e-05,
"loss": 1.1785,
"num_input_tokens_seen": 126981120,
"step": 233000
},
{
"epoch": 2.3361213382423562,
"grad_norm": 5.227244853973389,
"learning_rate": 3.831944333279975e-05,
"loss": 1.1934,
"num_input_tokens_seen": 127248672,
"step": 233500
},
{
"epoch": 2.3411237393949094,
"grad_norm": 5.995646953582764,
"learning_rate": 3.829443132703698e-05,
"loss": 1.198,
"num_input_tokens_seen": 127525032,
"step": 234000
},
{
"epoch": 2.346126140547463,
"grad_norm": 8.163732528686523,
"learning_rate": 3.826941932127421e-05,
"loss": 1.1743,
"num_input_tokens_seen": 127793384,
"step": 234500
},
{
"epoch": 2.351128541700016,
"grad_norm": 5.394166946411133,
"learning_rate": 3.824440731551145e-05,
"loss": 1.1726,
"num_input_tokens_seen": 128065120,
"step": 235000
},
{
"epoch": 2.356130942852569,
"grad_norm": 5.673594951629639,
"learning_rate": 3.821939530974868e-05,
"loss": 1.1959,
"num_input_tokens_seen": 128334976,
"step": 235500
},
{
"epoch": 2.3611333440051223,
"grad_norm": 5.715531826019287,
"learning_rate": 3.8194383303985917e-05,
"loss": 1.1936,
"num_input_tokens_seen": 128610504,
"step": 236000
},
{
"epoch": 2.366135745157676,
"grad_norm": 5.725061416625977,
"learning_rate": 3.816937129822315e-05,
"loss": 1.1771,
"num_input_tokens_seen": 128881800,
"step": 236500
},
{
"epoch": 2.371138146310229,
"grad_norm": 4.505105972290039,
"learning_rate": 3.8144359292460386e-05,
"loss": 1.1826,
"num_input_tokens_seen": 129157576,
"step": 237000
},
{
"epoch": 2.376140547462782,
"grad_norm": 5.860077857971191,
"learning_rate": 3.8119347286697615e-05,
"loss": 1.1925,
"num_input_tokens_seen": 129432392,
"step": 237500
},
{
"epoch": 2.3811429486153353,
"grad_norm": 6.7791337966918945,
"learning_rate": 3.809433528093485e-05,
"loss": 1.1746,
"num_input_tokens_seen": 129700968,
"step": 238000
},
{
"epoch": 2.3861453497678884,
"grad_norm": 5.708649635314941,
"learning_rate": 3.8069323275172085e-05,
"loss": 1.1793,
"num_input_tokens_seen": 129977384,
"step": 238500
},
{
"epoch": 2.391147750920442,
"grad_norm": 5.659774303436279,
"learning_rate": 3.804431126940932e-05,
"loss": 1.1797,
"num_input_tokens_seen": 130248672,
"step": 239000
},
{
"epoch": 2.396150152072995,
"grad_norm": 6.859200477600098,
"learning_rate": 3.8019299263646555e-05,
"loss": 1.1853,
"num_input_tokens_seen": 130522208,
"step": 239500
},
{
"epoch": 2.401152553225548,
"grad_norm": 6.860942840576172,
"learning_rate": 3.799428725788379e-05,
"loss": 1.1922,
"num_input_tokens_seen": 130799088,
"step": 240000
},
{
"epoch": 2.4061549543781013,
"grad_norm": 6.199068069458008,
"learning_rate": 3.7969275252121025e-05,
"loss": 1.1825,
"num_input_tokens_seen": 131067984,
"step": 240500
},
{
"epoch": 2.411157355530655,
"grad_norm": 5.724475383758545,
"learning_rate": 3.794426324635825e-05,
"loss": 1.168,
"num_input_tokens_seen": 131340552,
"step": 241000
},
{
"epoch": 2.416159756683208,
"grad_norm": 5.187953472137451,
"learning_rate": 3.791925124059549e-05,
"loss": 1.1875,
"num_input_tokens_seen": 131613968,
"step": 241500
},
{
"epoch": 2.421162157835761,
"grad_norm": 6.069790363311768,
"learning_rate": 3.789423923483272e-05,
"loss": 1.1866,
"num_input_tokens_seen": 131880736,
"step": 242000
},
{
"epoch": 2.4261645589883143,
"grad_norm": 6.761556148529053,
"learning_rate": 3.786922722906995e-05,
"loss": 1.205,
"num_input_tokens_seen": 132150456,
"step": 242500
},
{
"epoch": 2.4311669601408674,
"grad_norm": 5.816013336181641,
"learning_rate": 3.7844215223307186e-05,
"loss": 1.1938,
"num_input_tokens_seen": 132424832,
"step": 243000
},
{
"epoch": 2.436169361293421,
"grad_norm": 6.447406768798828,
"learning_rate": 3.781920321754443e-05,
"loss": 1.1792,
"num_input_tokens_seen": 132691704,
"step": 243500
},
{
"epoch": 2.441171762445974,
"grad_norm": 6.802369117736816,
"learning_rate": 3.7794191211781656e-05,
"loss": 1.1891,
"num_input_tokens_seen": 132962376,
"step": 244000
},
{
"epoch": 2.4461741635985272,
"grad_norm": 5.149132251739502,
"learning_rate": 3.776917920601889e-05,
"loss": 1.1691,
"num_input_tokens_seen": 133236272,
"step": 244500
},
{
"epoch": 2.4511765647510804,
"grad_norm": 6.554666996002197,
"learning_rate": 3.7744167200256126e-05,
"loss": 1.1958,
"num_input_tokens_seen": 133505504,
"step": 245000
},
{
"epoch": 2.456178965903634,
"grad_norm": 5.13792610168457,
"learning_rate": 3.771915519449336e-05,
"loss": 1.1771,
"num_input_tokens_seen": 133785904,
"step": 245500
},
{
"epoch": 2.461181367056187,
"grad_norm": 4.5011491775512695,
"learning_rate": 3.769414318873059e-05,
"loss": 1.192,
"num_input_tokens_seen": 134055360,
"step": 246000
},
{
"epoch": 2.46618376820874,
"grad_norm": 7.41070556640625,
"learning_rate": 3.7669131182967824e-05,
"loss": 1.1682,
"num_input_tokens_seen": 134321528,
"step": 246500
},
{
"epoch": 2.4711861693612933,
"grad_norm": 7.749119281768799,
"learning_rate": 3.764411917720506e-05,
"loss": 1.188,
"num_input_tokens_seen": 134595208,
"step": 247000
},
{
"epoch": 2.4761885705138464,
"grad_norm": 5.476714134216309,
"learning_rate": 3.7619107171442294e-05,
"loss": 1.1668,
"num_input_tokens_seen": 134869136,
"step": 247500
},
{
"epoch": 2.4811909716664,
"grad_norm": 5.9990010261535645,
"learning_rate": 3.759409516567953e-05,
"loss": 1.182,
"num_input_tokens_seen": 135144112,
"step": 248000
},
{
"epoch": 2.486193372818953,
"grad_norm": 5.635094165802002,
"learning_rate": 3.7569083159916764e-05,
"loss": 1.1861,
"num_input_tokens_seen": 135413984,
"step": 248500
},
{
"epoch": 2.4911957739715063,
"grad_norm": 5.974431991577148,
"learning_rate": 3.7544071154154e-05,
"loss": 1.1927,
"num_input_tokens_seen": 135697672,
"step": 249000
},
{
"epoch": 2.4961981751240594,
"grad_norm": 6.688498497009277,
"learning_rate": 3.751905914839123e-05,
"loss": 1.2016,
"num_input_tokens_seen": 135975272,
"step": 249500
},
{
"epoch": 2.501200576276613,
"grad_norm": 8.589900970458984,
"learning_rate": 3.749404714262846e-05,
"loss": 1.1711,
"num_input_tokens_seen": 136241376,
"step": 250000
},
{
"epoch": 2.506202977429166,
"grad_norm": 6.064274311065674,
"learning_rate": 3.74690351368657e-05,
"loss": 1.2027,
"num_input_tokens_seen": 136510824,
"step": 250500
},
{
"epoch": 2.511205378581719,
"grad_norm": 5.36790657043457,
"learning_rate": 3.744402313110293e-05,
"loss": 1.189,
"num_input_tokens_seen": 136777568,
"step": 251000
},
{
"epoch": 2.5162077797342723,
"grad_norm": 5.9187703132629395,
"learning_rate": 3.741901112534017e-05,
"loss": 1.1894,
"num_input_tokens_seen": 137049696,
"step": 251500
},
{
"epoch": 2.5212101808868255,
"grad_norm": 5.2425007820129395,
"learning_rate": 3.73939991195774e-05,
"loss": 1.2087,
"num_input_tokens_seen": 137319688,
"step": 252000
},
{
"epoch": 2.526212582039379,
"grad_norm": 6.622330188751221,
"learning_rate": 3.736898711381464e-05,
"loss": 1.172,
"num_input_tokens_seen": 137592360,
"step": 252500
},
{
"epoch": 2.531214983191932,
"grad_norm": 5.9546122550964355,
"learning_rate": 3.7343975108051865e-05,
"loss": 1.1784,
"num_input_tokens_seen": 137869696,
"step": 253000
},
{
"epoch": 2.5362173843444853,
"grad_norm": 6.10466194152832,
"learning_rate": 3.73189631022891e-05,
"loss": 1.1806,
"num_input_tokens_seen": 138146440,
"step": 253500
},
{
"epoch": 2.5412197854970384,
"grad_norm": 7.046773433685303,
"learning_rate": 3.7293951096526335e-05,
"loss": 1.187,
"num_input_tokens_seen": 138432672,
"step": 254000
},
{
"epoch": 2.546222186649592,
"grad_norm": 5.8726115226745605,
"learning_rate": 3.726893909076356e-05,
"loss": 1.1769,
"num_input_tokens_seen": 138704056,
"step": 254500
},
{
"epoch": 2.551224587802145,
"grad_norm": 6.145564079284668,
"learning_rate": 3.7243927085000805e-05,
"loss": 1.1783,
"num_input_tokens_seen": 138972048,
"step": 255000
},
{
"epoch": 2.5562269889546982,
"grad_norm": 8.949604988098145,
"learning_rate": 3.721891507923804e-05,
"loss": 1.1928,
"num_input_tokens_seen": 139249808,
"step": 255500
},
{
"epoch": 2.5612293901072514,
"grad_norm": 6.0869975090026855,
"learning_rate": 3.7193903073475275e-05,
"loss": 1.2032,
"num_input_tokens_seen": 139528824,
"step": 256000
},
{
"epoch": 2.5662317912598045,
"grad_norm": 6.634551048278809,
"learning_rate": 3.71688910677125e-05,
"loss": 1.1977,
"num_input_tokens_seen": 139798352,
"step": 256500
},
{
"epoch": 2.571234192412358,
"grad_norm": 5.805966377258301,
"learning_rate": 3.714387906194974e-05,
"loss": 1.1725,
"num_input_tokens_seen": 140071304,
"step": 257000
},
{
"epoch": 2.576236593564911,
"grad_norm": 5.509829998016357,
"learning_rate": 3.711886705618697e-05,
"loss": 1.1794,
"num_input_tokens_seen": 140351576,
"step": 257500
},
{
"epoch": 2.5812389947174643,
"grad_norm": 7.246334552764893,
"learning_rate": 3.70938550504242e-05,
"loss": 1.1638,
"num_input_tokens_seen": 140629520,
"step": 258000
},
{
"epoch": 2.5862413958700174,
"grad_norm": 5.683703899383545,
"learning_rate": 3.7068843044661436e-05,
"loss": 1.1772,
"num_input_tokens_seen": 140907480,
"step": 258500
},
{
"epoch": 2.591243797022571,
"grad_norm": 5.520617485046387,
"learning_rate": 3.704383103889867e-05,
"loss": 1.1874,
"num_input_tokens_seen": 141174448,
"step": 259000
},
{
"epoch": 2.596246198175124,
"grad_norm": 6.609923839569092,
"learning_rate": 3.701881903313591e-05,
"loss": 1.1954,
"num_input_tokens_seen": 141451848,
"step": 259500
},
{
"epoch": 2.6012485993276773,
"grad_norm": 5.208652973175049,
"learning_rate": 3.699380702737314e-05,
"loss": 1.1777,
"num_input_tokens_seen": 141719928,
"step": 260000
},
{
"epoch": 2.6062510004802304,
"grad_norm": 6.525882720947266,
"learning_rate": 3.6968795021610376e-05,
"loss": 1.1725,
"num_input_tokens_seen": 141993992,
"step": 260500
},
{
"epoch": 2.6112534016327835,
"grad_norm": 6.694952011108398,
"learning_rate": 3.694378301584761e-05,
"loss": 1.1764,
"num_input_tokens_seen": 142262512,
"step": 261000
},
{
"epoch": 2.616255802785337,
"grad_norm": 6.036692142486572,
"learning_rate": 3.691877101008484e-05,
"loss": 1.1863,
"num_input_tokens_seen": 142535016,
"step": 261500
},
{
"epoch": 2.62125820393789,
"grad_norm": 4.5128021240234375,
"learning_rate": 3.6893759004322074e-05,
"loss": 1.1914,
"num_input_tokens_seen": 142817040,
"step": 262000
},
{
"epoch": 2.6262606050904433,
"grad_norm": 6.445744037628174,
"learning_rate": 3.686874699855931e-05,
"loss": 1.1938,
"num_input_tokens_seen": 143093520,
"step": 262500
},
{
"epoch": 2.631263006242997,
"grad_norm": 5.805507183074951,
"learning_rate": 3.6843734992796544e-05,
"loss": 1.176,
"num_input_tokens_seen": 143361184,
"step": 263000
},
{
"epoch": 2.63626540739555,
"grad_norm": 7.369002819061279,
"learning_rate": 3.681872298703378e-05,
"loss": 1.1737,
"num_input_tokens_seen": 143633104,
"step": 263500
},
{
"epoch": 2.641267808548103,
"grad_norm": 5.3200459480285645,
"learning_rate": 3.6793710981271014e-05,
"loss": 1.1853,
"num_input_tokens_seen": 143903304,
"step": 264000
},
{
"epoch": 2.6462702097006563,
"grad_norm": 4.868594169616699,
"learning_rate": 3.676869897550825e-05,
"loss": 1.1924,
"num_input_tokens_seen": 144176568,
"step": 264500
},
{
"epoch": 2.6512726108532094,
"grad_norm": 6.198353290557861,
"learning_rate": 3.674368696974548e-05,
"loss": 1.1854,
"num_input_tokens_seen": 144457912,
"step": 265000
},
{
"epoch": 2.6562750120057625,
"grad_norm": 5.720507621765137,
"learning_rate": 3.671867496398271e-05,
"loss": 1.1922,
"num_input_tokens_seen": 144733560,
"step": 265500
},
{
"epoch": 2.661277413158316,
"grad_norm": 6.092404365539551,
"learning_rate": 3.669366295821995e-05,
"loss": 1.1784,
"num_input_tokens_seen": 145006896,
"step": 266000
},
{
"epoch": 2.6662798143108692,
"grad_norm": 5.7721266746521,
"learning_rate": 3.666865095245718e-05,
"loss": 1.1682,
"num_input_tokens_seen": 145276408,
"step": 266500
},
{
"epoch": 2.6712822154634224,
"grad_norm": 5.34429407119751,
"learning_rate": 3.664363894669442e-05,
"loss": 1.2014,
"num_input_tokens_seen": 145549104,
"step": 267000
},
{
"epoch": 2.676284616615976,
"grad_norm": 5.627655982971191,
"learning_rate": 3.661862694093165e-05,
"loss": 1.1873,
"num_input_tokens_seen": 145813696,
"step": 267500
},
{
"epoch": 2.681287017768529,
"grad_norm": 5.520989894866943,
"learning_rate": 3.659361493516889e-05,
"loss": 1.1801,
"num_input_tokens_seen": 146081824,
"step": 268000
},
{
"epoch": 2.686289418921082,
"grad_norm": 5.194046497344971,
"learning_rate": 3.6568602929406115e-05,
"loss": 1.2043,
"num_input_tokens_seen": 146359992,
"step": 268500
},
{
"epoch": 2.6912918200736353,
"grad_norm": 7.6289753913879395,
"learning_rate": 3.654359092364335e-05,
"loss": 1.1667,
"num_input_tokens_seen": 146635688,
"step": 269000
},
{
"epoch": 2.6962942212261884,
"grad_norm": 6.4248504638671875,
"learning_rate": 3.6518578917880585e-05,
"loss": 1.182,
"num_input_tokens_seen": 146907280,
"step": 269500
},
{
"epoch": 2.7012966223787416,
"grad_norm": 7.369548797607422,
"learning_rate": 3.649356691211781e-05,
"loss": 1.1863,
"num_input_tokens_seen": 147180456,
"step": 270000
},
{
"epoch": 2.706299023531295,
"grad_norm": 7.325328826904297,
"learning_rate": 3.646855490635505e-05,
"loss": 1.1731,
"num_input_tokens_seen": 147447936,
"step": 270500
},
{
"epoch": 2.7113014246838483,
"grad_norm": 6.618239879608154,
"learning_rate": 3.644354290059229e-05,
"loss": 1.1898,
"num_input_tokens_seen": 147714576,
"step": 271000
},
{
"epoch": 2.7163038258364014,
"grad_norm": 6.6161932945251465,
"learning_rate": 3.6418530894829525e-05,
"loss": 1.1757,
"num_input_tokens_seen": 147982616,
"step": 271500
},
{
"epoch": 2.721306226988955,
"grad_norm": 4.964172840118408,
"learning_rate": 3.639351888906675e-05,
"loss": 1.1822,
"num_input_tokens_seen": 148261928,
"step": 272000
},
{
"epoch": 2.726308628141508,
"grad_norm": 5.542762756347656,
"learning_rate": 3.636850688330399e-05,
"loss": 1.1979,
"num_input_tokens_seen": 148537656,
"step": 272500
},
{
"epoch": 2.731311029294061,
"grad_norm": 6.122353553771973,
"learning_rate": 3.634349487754122e-05,
"loss": 1.1837,
"num_input_tokens_seen": 148805656,
"step": 273000
},
{
"epoch": 2.7363134304466143,
"grad_norm": 5.522734642028809,
"learning_rate": 3.631848287177845e-05,
"loss": 1.1755,
"num_input_tokens_seen": 149071656,
"step": 273500
},
{
"epoch": 2.7413158315991675,
"grad_norm": 7.560063362121582,
"learning_rate": 3.6293470866015686e-05,
"loss": 1.1801,
"num_input_tokens_seen": 149339936,
"step": 274000
},
{
"epoch": 2.7463182327517206,
"grad_norm": 5.46027135848999,
"learning_rate": 3.626845886025292e-05,
"loss": 1.1734,
"num_input_tokens_seen": 149616152,
"step": 274500
},
{
"epoch": 2.751320633904274,
"grad_norm": 5.810853004455566,
"learning_rate": 3.624344685449016e-05,
"loss": 1.1806,
"num_input_tokens_seen": 149886976,
"step": 275000
},
{
"epoch": 2.7563230350568273,
"grad_norm": 5.957060813903809,
"learning_rate": 3.621843484872739e-05,
"loss": 1.1777,
"num_input_tokens_seen": 150161384,
"step": 275500
},
{
"epoch": 2.7613254362093804,
"grad_norm": 8.7448148727417,
"learning_rate": 3.6193422842964626e-05,
"loss": 1.1535,
"num_input_tokens_seen": 150426192,
"step": 276000
},
{
"epoch": 2.766327837361934,
"grad_norm": 6.24728536605835,
"learning_rate": 3.616841083720186e-05,
"loss": 1.1741,
"num_input_tokens_seen": 150694480,
"step": 276500
},
{
"epoch": 2.771330238514487,
"grad_norm": 8.271539688110352,
"learning_rate": 3.614339883143909e-05,
"loss": 1.1853,
"num_input_tokens_seen": 150965896,
"step": 277000
},
{
"epoch": 2.7763326396670402,
"grad_norm": 6.075042247772217,
"learning_rate": 3.6118386825676324e-05,
"loss": 1.1729,
"num_input_tokens_seen": 151239800,
"step": 277500
},
{
"epoch": 2.7813350408195934,
"grad_norm": 7.93595552444458,
"learning_rate": 3.609337481991356e-05,
"loss": 1.185,
"num_input_tokens_seen": 151512560,
"step": 278000
},
{
"epoch": 2.7863374419721465,
"grad_norm": 7.406468868255615,
"learning_rate": 3.6068362814150794e-05,
"loss": 1.1908,
"num_input_tokens_seen": 151789264,
"step": 278500
},
{
"epoch": 2.7913398431246996,
"grad_norm": 6.6226325035095215,
"learning_rate": 3.604335080838803e-05,
"loss": 1.1696,
"num_input_tokens_seen": 152057600,
"step": 279000
},
{
"epoch": 2.796342244277253,
"grad_norm": 5.142138481140137,
"learning_rate": 3.6018338802625264e-05,
"loss": 1.1694,
"num_input_tokens_seen": 152328800,
"step": 279500
},
{
"epoch": 2.8013446454298063,
"grad_norm": 6.834789752960205,
"learning_rate": 3.59933267968625e-05,
"loss": 1.1798,
"num_input_tokens_seen": 152610624,
"step": 280000
},
{
"epoch": 2.8063470465823595,
"grad_norm": 5.720213890075684,
"learning_rate": 3.596831479109973e-05,
"loss": 1.1781,
"num_input_tokens_seen": 152876576,
"step": 280500
},
{
"epoch": 2.811349447734913,
"grad_norm": 6.060703277587891,
"learning_rate": 3.594330278533696e-05,
"loss": 1.1781,
"num_input_tokens_seen": 153146728,
"step": 281000
},
{
"epoch": 2.816351848887466,
"grad_norm": 7.374409198760986,
"learning_rate": 3.59182907795742e-05,
"loss": 1.1925,
"num_input_tokens_seen": 153416872,
"step": 281500
},
{
"epoch": 2.8213542500400193,
"grad_norm": 6.183439254760742,
"learning_rate": 3.5893278773811425e-05,
"loss": 1.1588,
"num_input_tokens_seen": 153688560,
"step": 282000
},
{
"epoch": 2.8263566511925724,
"grad_norm": 7.167964935302734,
"learning_rate": 3.586826676804867e-05,
"loss": 1.1799,
"num_input_tokens_seen": 153968432,
"step": 282500
},
{
"epoch": 2.8313590523451255,
"grad_norm": 5.514324188232422,
"learning_rate": 3.58432547622859e-05,
"loss": 1.1695,
"num_input_tokens_seen": 154241312,
"step": 283000
},
{
"epoch": 2.8363614534976787,
"grad_norm": 4.6626667976379395,
"learning_rate": 3.581824275652314e-05,
"loss": 1.1876,
"num_input_tokens_seen": 154513432,
"step": 283500
},
{
"epoch": 2.8413638546502322,
"grad_norm": 5.130783557891846,
"learning_rate": 3.5793230750760365e-05,
"loss": 1.1806,
"num_input_tokens_seen": 154791160,
"step": 284000
},
{
"epoch": 2.8463662558027853,
"grad_norm": 6.905600547790527,
"learning_rate": 3.57682187449976e-05,
"loss": 1.1774,
"num_input_tokens_seen": 155060824,
"step": 284500
},
{
"epoch": 2.8513686569553385,
"grad_norm": 5.459284782409668,
"learning_rate": 3.5743206739234835e-05,
"loss": 1.17,
"num_input_tokens_seen": 155335544,
"step": 285000
},
{
"epoch": 2.856371058107892,
"grad_norm": 6.299667835235596,
"learning_rate": 3.571819473347206e-05,
"loss": 1.1751,
"num_input_tokens_seen": 155613112,
"step": 285500
},
{
"epoch": 2.861373459260445,
"grad_norm": 5.297176361083984,
"learning_rate": 3.56931827277093e-05,
"loss": 1.173,
"num_input_tokens_seen": 155888552,
"step": 286000
},
{
"epoch": 2.8663758604129983,
"grad_norm": 7.074682712554932,
"learning_rate": 3.566817072194654e-05,
"loss": 1.1753,
"num_input_tokens_seen": 156163384,
"step": 286500
},
{
"epoch": 2.8713782615655514,
"grad_norm": 7.402191638946533,
"learning_rate": 3.5643158716183775e-05,
"loss": 1.1763,
"num_input_tokens_seen": 156433192,
"step": 287000
},
{
"epoch": 2.8763806627181046,
"grad_norm": 6.5372419357299805,
"learning_rate": 3.5618146710421e-05,
"loss": 1.1931,
"num_input_tokens_seen": 156704752,
"step": 287500
},
{
"epoch": 2.8813830638706577,
"grad_norm": 6.030176162719727,
"learning_rate": 3.559313470465824e-05,
"loss": 1.1808,
"num_input_tokens_seen": 156979192,
"step": 288000
},
{
"epoch": 2.8863854650232112,
"grad_norm": 5.74777364730835,
"learning_rate": 3.556812269889547e-05,
"loss": 1.1693,
"num_input_tokens_seen": 157251448,
"step": 288500
},
{
"epoch": 2.8913878661757644,
"grad_norm": 5.995535373687744,
"learning_rate": 3.55431106931327e-05,
"loss": 1.1637,
"num_input_tokens_seen": 157518624,
"step": 289000
},
{
"epoch": 2.8963902673283175,
"grad_norm": 7.268390655517578,
"learning_rate": 3.5518098687369936e-05,
"loss": 1.171,
"num_input_tokens_seen": 157789104,
"step": 289500
},
{
"epoch": 2.901392668480871,
"grad_norm": 6.150352478027344,
"learning_rate": 3.549308668160717e-05,
"loss": 1.1607,
"num_input_tokens_seen": 158064360,
"step": 290000
},
{
"epoch": 2.906395069633424,
"grad_norm": 9.338305473327637,
"learning_rate": 3.5468074675844406e-05,
"loss": 1.1777,
"num_input_tokens_seen": 158337456,
"step": 290500
},
{
"epoch": 2.9113974707859773,
"grad_norm": 7.0623674392700195,
"learning_rate": 3.544306267008164e-05,
"loss": 1.1673,
"num_input_tokens_seen": 158610232,
"step": 291000
},
{
"epoch": 2.9163998719385305,
"grad_norm": 6.665122985839844,
"learning_rate": 3.5418050664318876e-05,
"loss": 1.1764,
"num_input_tokens_seen": 158883800,
"step": 291500
},
{
"epoch": 2.9214022730910836,
"grad_norm": 5.489156723022461,
"learning_rate": 3.539303865855611e-05,
"loss": 1.1674,
"num_input_tokens_seen": 159156920,
"step": 292000
},
{
"epoch": 2.9264046742436367,
"grad_norm": 4.9325456619262695,
"learning_rate": 3.536802665279334e-05,
"loss": 1.1724,
"num_input_tokens_seen": 159422696,
"step": 292500
},
{
"epoch": 2.9314070753961903,
"grad_norm": 4.590809345245361,
"learning_rate": 3.5343014647030574e-05,
"loss": 1.1691,
"num_input_tokens_seen": 159694472,
"step": 293000
},
{
"epoch": 2.9364094765487434,
"grad_norm": 5.634531497955322,
"learning_rate": 3.531800264126781e-05,
"loss": 1.1875,
"num_input_tokens_seen": 159966504,
"step": 293500
},
{
"epoch": 2.9414118777012965,
"grad_norm": 7.636883735656738,
"learning_rate": 3.5292990635505044e-05,
"loss": 1.1771,
"num_input_tokens_seen": 160236568,
"step": 294000
},
{
"epoch": 2.94641427885385,
"grad_norm": 4.785983562469482,
"learning_rate": 3.526797862974228e-05,
"loss": 1.1746,
"num_input_tokens_seen": 160505800,
"step": 294500
},
{
"epoch": 2.9514166800064032,
"grad_norm": 5.1736931800842285,
"learning_rate": 3.5242966623979514e-05,
"loss": 1.1753,
"num_input_tokens_seen": 160785240,
"step": 295000
},
{
"epoch": 2.9564190811589564,
"grad_norm": 6.308248519897461,
"learning_rate": 3.521795461821675e-05,
"loss": 1.1851,
"num_input_tokens_seen": 161049888,
"step": 295500
},
{
"epoch": 2.9614214823115095,
"grad_norm": 6.6797404289245605,
"learning_rate": 3.519294261245398e-05,
"loss": 1.1486,
"num_input_tokens_seen": 161316952,
"step": 296000
},
{
"epoch": 2.9664238834640626,
"grad_norm": 5.875812530517578,
"learning_rate": 3.516793060669121e-05,
"loss": 1.1689,
"num_input_tokens_seen": 161584440,
"step": 296500
},
{
"epoch": 2.9714262846166157,
"grad_norm": 6.539891719818115,
"learning_rate": 3.514291860092845e-05,
"loss": 1.1567,
"num_input_tokens_seen": 161861384,
"step": 297000
},
{
"epoch": 2.9764286857691693,
"grad_norm": 4.355959415435791,
"learning_rate": 3.511790659516568e-05,
"loss": 1.1654,
"num_input_tokens_seen": 162134976,
"step": 297500
},
{
"epoch": 2.9814310869217224,
"grad_norm": 8.101115226745605,
"learning_rate": 3.509289458940292e-05,
"loss": 1.1749,
"num_input_tokens_seen": 162408584,
"step": 298000
},
{
"epoch": 2.9864334880742756,
"grad_norm": 6.168905735015869,
"learning_rate": 3.506788258364015e-05,
"loss": 1.1823,
"num_input_tokens_seen": 162688056,
"step": 298500
},
{
"epoch": 2.991435889226829,
"grad_norm": 6.624521255493164,
"learning_rate": 3.504287057787739e-05,
"loss": 1.1709,
"num_input_tokens_seen": 162956872,
"step": 299000
},
{
"epoch": 2.9964382903793823,
"grad_norm": 6.812922954559326,
"learning_rate": 3.5017858572114615e-05,
"loss": 1.1728,
"num_input_tokens_seen": 163231632,
"step": 299500
},
{
"epoch": 3.0,
"eval_loss": 1.1232455968856812,
"eval_runtime": 186.7896,
"eval_samples_per_second": 1070.215,
"eval_steps_per_second": 133.782,
"num_input_tokens_seen": 163424808,
"step": 299856
},
{
"epoch": 3.0014406915319354,
"grad_norm": 4.925895690917969,
"learning_rate": 3.499284656635185e-05,
"loss": 1.1557,
"num_input_tokens_seen": 163507488,
"step": 300000
},
{
"epoch": 3.0064430926844885,
"grad_norm": 4.663059234619141,
"learning_rate": 3.4967834560589085e-05,
"loss": 1.0655,
"num_input_tokens_seen": 163782720,
"step": 300500
},
{
"epoch": 3.0114454938370416,
"grad_norm": 7.381974220275879,
"learning_rate": 3.494282255482632e-05,
"loss": 1.0891,
"num_input_tokens_seen": 164062072,
"step": 301000
},
{
"epoch": 3.016447894989595,
"grad_norm": 6.4466094970703125,
"learning_rate": 3.491781054906355e-05,
"loss": 1.074,
"num_input_tokens_seen": 164336176,
"step": 301500
},
{
"epoch": 3.0214502961421483,
"grad_norm": 5.126181602478027,
"learning_rate": 3.489279854330078e-05,
"loss": 1.0666,
"num_input_tokens_seen": 164610040,
"step": 302000
},
{
"epoch": 3.0264526972947015,
"grad_norm": 5.322078227996826,
"learning_rate": 3.4867786537538025e-05,
"loss": 1.0837,
"num_input_tokens_seen": 164887400,
"step": 302500
},
{
"epoch": 3.0314550984472546,
"grad_norm": 5.671963691711426,
"learning_rate": 3.484277453177525e-05,
"loss": 1.0829,
"num_input_tokens_seen": 165162256,
"step": 303000
},
{
"epoch": 3.0364574995998077,
"grad_norm": 4.6445441246032715,
"learning_rate": 3.481776252601249e-05,
"loss": 1.0771,
"num_input_tokens_seen": 165430680,
"step": 303500
},
{
"epoch": 3.0414599007523613,
"grad_norm": 5.632525444030762,
"learning_rate": 3.479275052024972e-05,
"loss": 1.0893,
"num_input_tokens_seen": 165706048,
"step": 304000
},
{
"epoch": 3.0464623019049144,
"grad_norm": 4.770864963531494,
"learning_rate": 3.476773851448695e-05,
"loss": 1.0772,
"num_input_tokens_seen": 165979496,
"step": 304500
},
{
"epoch": 3.0514647030574675,
"grad_norm": 7.340290546417236,
"learning_rate": 3.4742726508724186e-05,
"loss": 1.0851,
"num_input_tokens_seen": 166256376,
"step": 305000
},
{
"epoch": 3.0564671042100207,
"grad_norm": 5.2338080406188965,
"learning_rate": 3.471771450296142e-05,
"loss": 1.0949,
"num_input_tokens_seen": 166529712,
"step": 305500
},
{
"epoch": 3.0614695053625742,
"grad_norm": 5.540538311004639,
"learning_rate": 3.4692702497198656e-05,
"loss": 1.0929,
"num_input_tokens_seen": 166795520,
"step": 306000
},
{
"epoch": 3.0664719065151274,
"grad_norm": 6.512203693389893,
"learning_rate": 3.466769049143589e-05,
"loss": 1.075,
"num_input_tokens_seen": 167066728,
"step": 306500
},
{
"epoch": 3.0714743076676805,
"grad_norm": 5.783512592315674,
"learning_rate": 3.4642678485673126e-05,
"loss": 1.0749,
"num_input_tokens_seen": 167335800,
"step": 307000
},
{
"epoch": 3.0764767088202336,
"grad_norm": 5.550832271575928,
"learning_rate": 3.461766647991036e-05,
"loss": 1.0886,
"num_input_tokens_seen": 167610656,
"step": 307500
},
{
"epoch": 3.0814791099727867,
"grad_norm": 5.394260883331299,
"learning_rate": 3.459265447414759e-05,
"loss": 1.0906,
"num_input_tokens_seen": 167882480,
"step": 308000
},
{
"epoch": 3.0864815111253403,
"grad_norm": 5.690032005310059,
"learning_rate": 3.4567642468384824e-05,
"loss": 1.0851,
"num_input_tokens_seen": 168156832,
"step": 308500
},
{
"epoch": 3.0914839122778934,
"grad_norm": 6.620737552642822,
"learning_rate": 3.454263046262206e-05,
"loss": 1.0931,
"num_input_tokens_seen": 168435520,
"step": 309000
},
{
"epoch": 3.0964863134304466,
"grad_norm": 6.105669021606445,
"learning_rate": 3.4517618456859294e-05,
"loss": 1.0755,
"num_input_tokens_seen": 168708304,
"step": 309500
},
{
"epoch": 3.1014887145829997,
"grad_norm": 6.636053562164307,
"learning_rate": 3.449260645109653e-05,
"loss": 1.08,
"num_input_tokens_seen": 168969008,
"step": 310000
},
{
"epoch": 3.1064911157355533,
"grad_norm": 7.361510753631592,
"learning_rate": 3.4467594445333764e-05,
"loss": 1.0981,
"num_input_tokens_seen": 169241920,
"step": 310500
},
{
"epoch": 3.1114935168881064,
"grad_norm": 4.566134929656982,
"learning_rate": 3.4442582439571e-05,
"loss": 1.0713,
"num_input_tokens_seen": 169520520,
"step": 311000
},
{
"epoch": 3.1164959180406595,
"grad_norm": 5.323643207550049,
"learning_rate": 3.441757043380823e-05,
"loss": 1.0884,
"num_input_tokens_seen": 169794728,
"step": 311500
},
{
"epoch": 3.1214983191932126,
"grad_norm": 5.005212306976318,
"learning_rate": 3.439255842804546e-05,
"loss": 1.0817,
"num_input_tokens_seen": 170074952,
"step": 312000
},
{
"epoch": 3.1265007203457658,
"grad_norm": 6.78676176071167,
"learning_rate": 3.43675464222827e-05,
"loss": 1.1067,
"num_input_tokens_seen": 170350784,
"step": 312500
},
{
"epoch": 3.1315031214983193,
"grad_norm": 5.532153129577637,
"learning_rate": 3.434253441651993e-05,
"loss": 1.0729,
"num_input_tokens_seen": 170621008,
"step": 313000
},
{
"epoch": 3.1365055226508725,
"grad_norm": 6.041494369506836,
"learning_rate": 3.431752241075716e-05,
"loss": 1.0667,
"num_input_tokens_seen": 170896544,
"step": 313500
},
{
"epoch": 3.1415079238034256,
"grad_norm": 5.707986831665039,
"learning_rate": 3.42925104049944e-05,
"loss": 1.0803,
"num_input_tokens_seen": 171167464,
"step": 314000
},
{
"epoch": 3.1465103249559787,
"grad_norm": 6.608933925628662,
"learning_rate": 3.426749839923164e-05,
"loss": 1.0944,
"num_input_tokens_seen": 171437824,
"step": 314500
},
{
"epoch": 3.1515127261085323,
"grad_norm": 4.988198280334473,
"learning_rate": 3.4242486393468865e-05,
"loss": 1.0928,
"num_input_tokens_seen": 171713920,
"step": 315000
},
{
"epoch": 3.1565151272610854,
"grad_norm": 5.763394832611084,
"learning_rate": 3.42174743877061e-05,
"loss": 1.0655,
"num_input_tokens_seen": 171991720,
"step": 315500
},
{
"epoch": 3.1615175284136385,
"grad_norm": 6.287621974945068,
"learning_rate": 3.4192462381943335e-05,
"loss": 1.1027,
"num_input_tokens_seen": 172269040,
"step": 316000
},
{
"epoch": 3.1665199295661917,
"grad_norm": 7.083132266998291,
"learning_rate": 3.416745037618057e-05,
"loss": 1.097,
"num_input_tokens_seen": 172556304,
"step": 316500
},
{
"epoch": 3.1715223307187452,
"grad_norm": 5.856710433959961,
"learning_rate": 3.41424383704178e-05,
"loss": 1.0848,
"num_input_tokens_seen": 172825392,
"step": 317000
},
{
"epoch": 3.1765247318712984,
"grad_norm": 5.9765849113464355,
"learning_rate": 3.4117426364655033e-05,
"loss": 1.0959,
"num_input_tokens_seen": 173095280,
"step": 317500
},
{
"epoch": 3.1815271330238515,
"grad_norm": 7.099453926086426,
"learning_rate": 3.409241435889227e-05,
"loss": 1.0853,
"num_input_tokens_seen": 173365904,
"step": 318000
},
{
"epoch": 3.1865295341764046,
"grad_norm": 6.180022239685059,
"learning_rate": 3.40674023531295e-05,
"loss": 1.0783,
"num_input_tokens_seen": 173632760,
"step": 318500
},
{
"epoch": 3.1915319353289577,
"grad_norm": 5.463505744934082,
"learning_rate": 3.404239034736674e-05,
"loss": 1.0838,
"num_input_tokens_seen": 173903984,
"step": 319000
},
{
"epoch": 3.1965343364815113,
"grad_norm": 5.173684120178223,
"learning_rate": 3.401737834160397e-05,
"loss": 1.1002,
"num_input_tokens_seen": 174179216,
"step": 319500
},
{
"epoch": 3.2015367376340644,
"grad_norm": 7.594663619995117,
"learning_rate": 3.399236633584121e-05,
"loss": 1.096,
"num_input_tokens_seen": 174445040,
"step": 320000
},
{
"epoch": 3.2065391387866176,
"grad_norm": 6.0014262199401855,
"learning_rate": 3.3967354330078436e-05,
"loss": 1.0874,
"num_input_tokens_seen": 174714360,
"step": 320500
},
{
"epoch": 3.2115415399391707,
"grad_norm": 6.118896961212158,
"learning_rate": 3.394234232431567e-05,
"loss": 1.0911,
"num_input_tokens_seen": 174982632,
"step": 321000
},
{
"epoch": 3.2165439410917243,
"grad_norm": 6.8333587646484375,
"learning_rate": 3.3917330318552906e-05,
"loss": 1.1197,
"num_input_tokens_seen": 175255784,
"step": 321500
},
{
"epoch": 3.2215463422442774,
"grad_norm": 4.892594337463379,
"learning_rate": 3.389231831279014e-05,
"loss": 1.1041,
"num_input_tokens_seen": 175526640,
"step": 322000
},
{
"epoch": 3.2265487433968305,
"grad_norm": 5.051529884338379,
"learning_rate": 3.3867306307027376e-05,
"loss": 1.102,
"num_input_tokens_seen": 175794768,
"step": 322500
},
{
"epoch": 3.2315511445493836,
"grad_norm": 5.638453960418701,
"learning_rate": 3.384229430126461e-05,
"loss": 1.1116,
"num_input_tokens_seen": 176065112,
"step": 323000
},
{
"epoch": 3.2365535457019368,
"grad_norm": 7.20506477355957,
"learning_rate": 3.381728229550184e-05,
"loss": 1.0918,
"num_input_tokens_seen": 176336008,
"step": 323500
},
{
"epoch": 3.2415559468544903,
"grad_norm": 7.046761512756348,
"learning_rate": 3.3792270289739074e-05,
"loss": 1.0734,
"num_input_tokens_seen": 176605936,
"step": 324000
},
{
"epoch": 3.2465583480070435,
"grad_norm": 6.106048107147217,
"learning_rate": 3.376725828397631e-05,
"loss": 1.0953,
"num_input_tokens_seen": 176880432,
"step": 324500
},
{
"epoch": 3.2515607491595966,
"grad_norm": 6.578117847442627,
"learning_rate": 3.3742246278213544e-05,
"loss": 1.0973,
"num_input_tokens_seen": 177149064,
"step": 325000
},
{
"epoch": 3.2565631503121497,
"grad_norm": 5.515709400177002,
"learning_rate": 3.371723427245078e-05,
"loss": 1.1044,
"num_input_tokens_seen": 177427928,
"step": 325500
},
{
"epoch": 3.2615655514647033,
"grad_norm": 6.71830940246582,
"learning_rate": 3.3692222266688014e-05,
"loss": 1.0983,
"num_input_tokens_seen": 177700128,
"step": 326000
},
{
"epoch": 3.2665679526172564,
"grad_norm": 6.004988670349121,
"learning_rate": 3.366721026092525e-05,
"loss": 1.0844,
"num_input_tokens_seen": 177979224,
"step": 326500
},
{
"epoch": 3.2715703537698095,
"grad_norm": 6.418676376342773,
"learning_rate": 3.364219825516248e-05,
"loss": 1.0996,
"num_input_tokens_seen": 178254504,
"step": 327000
},
{
"epoch": 3.2765727549223627,
"grad_norm": 6.826735973358154,
"learning_rate": 3.361718624939971e-05,
"loss": 1.086,
"num_input_tokens_seen": 178527880,
"step": 327500
},
{
"epoch": 3.281575156074916,
"grad_norm": 7.035877704620361,
"learning_rate": 3.359217424363695e-05,
"loss": 1.0919,
"num_input_tokens_seen": 178801256,
"step": 328000
},
{
"epoch": 3.2865775572274694,
"grad_norm": 7.336743354797363,
"learning_rate": 3.356716223787418e-05,
"loss": 1.1121,
"num_input_tokens_seen": 179072160,
"step": 328500
},
{
"epoch": 3.2915799583800225,
"grad_norm": 5.2435383796691895,
"learning_rate": 3.354215023211141e-05,
"loss": 1.0915,
"num_input_tokens_seen": 179344440,
"step": 329000
},
{
"epoch": 3.2965823595325756,
"grad_norm": 7.368856906890869,
"learning_rate": 3.3517138226348646e-05,
"loss": 1.0991,
"num_input_tokens_seen": 179619064,
"step": 329500
},
{
"epoch": 3.3015847606851287,
"grad_norm": 6.245655059814453,
"learning_rate": 3.349212622058589e-05,
"loss": 1.0883,
"num_input_tokens_seen": 179888680,
"step": 330000
},
{
"epoch": 3.3065871618376823,
"grad_norm": 6.055501461029053,
"learning_rate": 3.3467114214823116e-05,
"loss": 1.0854,
"num_input_tokens_seen": 180162488,
"step": 330500
},
{
"epoch": 3.3115895629902354,
"grad_norm": 5.36578893661499,
"learning_rate": 3.344210220906035e-05,
"loss": 1.0829,
"num_input_tokens_seen": 180435488,
"step": 331000
},
{
"epoch": 3.3165919641427886,
"grad_norm": 4.865072727203369,
"learning_rate": 3.3417090203297585e-05,
"loss": 1.0914,
"num_input_tokens_seen": 180708088,
"step": 331500
},
{
"epoch": 3.3215943652953417,
"grad_norm": 5.984726428985596,
"learning_rate": 3.339207819753482e-05,
"loss": 1.0988,
"num_input_tokens_seen": 180972032,
"step": 332000
},
{
"epoch": 3.326596766447895,
"grad_norm": 6.17361307144165,
"learning_rate": 3.336706619177205e-05,
"loss": 1.1045,
"num_input_tokens_seen": 181243824,
"step": 332500
},
{
"epoch": 3.3315991676004484,
"grad_norm": 5.614140510559082,
"learning_rate": 3.3342054186009284e-05,
"loss": 1.1017,
"num_input_tokens_seen": 181516200,
"step": 333000
},
{
"epoch": 3.3366015687530015,
"grad_norm": 6.182852268218994,
"learning_rate": 3.331704218024652e-05,
"loss": 1.104,
"num_input_tokens_seen": 181789432,
"step": 333500
},
{
"epoch": 3.3416039699055546,
"grad_norm": 6.281063079833984,
"learning_rate": 3.3292030174483754e-05,
"loss": 1.1114,
"num_input_tokens_seen": 182061728,
"step": 334000
},
{
"epoch": 3.3466063710581078,
"grad_norm": 5.531891822814941,
"learning_rate": 3.326701816872099e-05,
"loss": 1.0908,
"num_input_tokens_seen": 182340072,
"step": 334500
},
{
"epoch": 3.3516087722106613,
"grad_norm": 5.755847930908203,
"learning_rate": 3.3242006162958223e-05,
"loss": 1.0978,
"num_input_tokens_seen": 182603520,
"step": 335000
},
{
"epoch": 3.3566111733632145,
"grad_norm": 5.261629104614258,
"learning_rate": 3.321699415719546e-05,
"loss": 1.089,
"num_input_tokens_seen": 182871456,
"step": 335500
},
{
"epoch": 3.3616135745157676,
"grad_norm": 9.492514610290527,
"learning_rate": 3.319198215143269e-05,
"loss": 1.0943,
"num_input_tokens_seen": 183145224,
"step": 336000
},
{
"epoch": 3.3666159756683207,
"grad_norm": 5.316561222076416,
"learning_rate": 3.316697014566992e-05,
"loss": 1.1153,
"num_input_tokens_seen": 183418328,
"step": 336500
},
{
"epoch": 3.371618376820874,
"grad_norm": 4.869199275970459,
"learning_rate": 3.3141958139907157e-05,
"loss": 1.0922,
"num_input_tokens_seen": 183701552,
"step": 337000
},
{
"epoch": 3.3766207779734274,
"grad_norm": 5.928160667419434,
"learning_rate": 3.311694613414439e-05,
"loss": 1.1058,
"num_input_tokens_seen": 183971600,
"step": 337500
},
{
"epoch": 3.3816231791259805,
"grad_norm": 5.425112724304199,
"learning_rate": 3.3091934128381626e-05,
"loss": 1.0863,
"num_input_tokens_seen": 184239416,
"step": 338000
},
{
"epoch": 3.3866255802785337,
"grad_norm": 5.094555854797363,
"learning_rate": 3.306692212261886e-05,
"loss": 1.0826,
"num_input_tokens_seen": 184503600,
"step": 338500
},
{
"epoch": 3.391627981431087,
"grad_norm": 6.472997665405273,
"learning_rate": 3.3041910116856096e-05,
"loss": 1.0987,
"num_input_tokens_seen": 184777584,
"step": 339000
},
{
"epoch": 3.3966303825836404,
"grad_norm": 5.41008996963501,
"learning_rate": 3.3016898111093325e-05,
"loss": 1.1025,
"num_input_tokens_seen": 185048208,
"step": 339500
},
{
"epoch": 3.4016327837361935,
"grad_norm": 6.235612869262695,
"learning_rate": 3.299188610533056e-05,
"loss": 1.1097,
"num_input_tokens_seen": 185317376,
"step": 340000
},
{
"epoch": 3.4066351848887466,
"grad_norm": 5.876267910003662,
"learning_rate": 3.2966874099567795e-05,
"loss": 1.111,
"num_input_tokens_seen": 185588416,
"step": 340500
},
{
"epoch": 3.4116375860412997,
"grad_norm": 4.517580032348633,
"learning_rate": 3.294186209380502e-05,
"loss": 1.0877,
"num_input_tokens_seen": 185855440,
"step": 341000
},
{
"epoch": 3.416639987193853,
"grad_norm": 7.28811502456665,
"learning_rate": 3.2916850088042264e-05,
"loss": 1.1021,
"num_input_tokens_seen": 186125800,
"step": 341500
},
{
"epoch": 3.4216423883464064,
"grad_norm": 7.394123077392578,
"learning_rate": 3.28918380822795e-05,
"loss": 1.1103,
"num_input_tokens_seen": 186390984,
"step": 342000
},
{
"epoch": 3.4266447894989596,
"grad_norm": 6.393476963043213,
"learning_rate": 3.286682607651673e-05,
"loss": 1.0946,
"num_input_tokens_seen": 186659016,
"step": 342500
},
{
"epoch": 3.4316471906515127,
"grad_norm": 5.5101470947265625,
"learning_rate": 3.284181407075396e-05,
"loss": 1.1031,
"num_input_tokens_seen": 186933800,
"step": 343000
},
{
"epoch": 3.436649591804066,
"grad_norm": 5.820064067840576,
"learning_rate": 3.28168020649912e-05,
"loss": 1.0992,
"num_input_tokens_seen": 187203688,
"step": 343500
},
{
"epoch": 3.4416519929566194,
"grad_norm": 4.500607013702393,
"learning_rate": 3.279179005922843e-05,
"loss": 1.1083,
"num_input_tokens_seen": 187477344,
"step": 344000
},
{
"epoch": 3.4466543941091725,
"grad_norm": 6.536877632141113,
"learning_rate": 3.276677805346566e-05,
"loss": 1.0988,
"num_input_tokens_seen": 187753544,
"step": 344500
},
{
"epoch": 3.4516567952617256,
"grad_norm": 6.723674774169922,
"learning_rate": 3.2741766047702896e-05,
"loss": 1.0814,
"num_input_tokens_seen": 188027648,
"step": 345000
},
{
"epoch": 3.4566591964142788,
"grad_norm": 5.175849437713623,
"learning_rate": 3.271675404194014e-05,
"loss": 1.1108,
"num_input_tokens_seen": 188298976,
"step": 345500
},
{
"epoch": 3.461661597566832,
"grad_norm": 4.006369590759277,
"learning_rate": 3.2691742036177366e-05,
"loss": 1.102,
"num_input_tokens_seen": 188573952,
"step": 346000
},
{
"epoch": 3.4666639987193855,
"grad_norm": 5.444148063659668,
"learning_rate": 3.26667300304146e-05,
"loss": 1.0963,
"num_input_tokens_seen": 188843992,
"step": 346500
},
{
"epoch": 3.4716663998719386,
"grad_norm": 6.093343257904053,
"learning_rate": 3.2641718024651836e-05,
"loss": 1.1117,
"num_input_tokens_seen": 189117128,
"step": 347000
},
{
"epoch": 3.4766688010244917,
"grad_norm": 5.752835750579834,
"learning_rate": 3.261670601888907e-05,
"loss": 1.0973,
"num_input_tokens_seen": 189389104,
"step": 347500
},
{
"epoch": 3.481671202177045,
"grad_norm": 4.975690841674805,
"learning_rate": 3.25916940131263e-05,
"loss": 1.1074,
"num_input_tokens_seen": 189665040,
"step": 348000
},
{
"epoch": 3.4866736033295984,
"grad_norm": 5.228826999664307,
"learning_rate": 3.2566682007363534e-05,
"loss": 1.0942,
"num_input_tokens_seen": 189939112,
"step": 348500
},
{
"epoch": 3.4916760044821515,
"grad_norm": 5.240488052368164,
"learning_rate": 3.254167000160077e-05,
"loss": 1.1023,
"num_input_tokens_seen": 190214888,
"step": 349000
},
{
"epoch": 3.4966784056347047,
"grad_norm": 6.247119903564453,
"learning_rate": 3.2516657995838004e-05,
"loss": 1.1001,
"num_input_tokens_seen": 190486416,
"step": 349500
},
{
"epoch": 3.501680806787258,
"grad_norm": 7.789793968200684,
"learning_rate": 3.249164599007524e-05,
"loss": 1.1066,
"num_input_tokens_seen": 190761576,
"step": 350000
},
{
"epoch": 3.506683207939811,
"grad_norm": 4.448274612426758,
"learning_rate": 3.2466633984312474e-05,
"loss": 1.1009,
"num_input_tokens_seen": 191031280,
"step": 350500
},
{
"epoch": 3.5116856090923645,
"grad_norm": 7.334349632263184,
"learning_rate": 3.244162197854971e-05,
"loss": 1.1059,
"num_input_tokens_seen": 191300640,
"step": 351000
},
{
"epoch": 3.5166880102449176,
"grad_norm": 6.003718852996826,
"learning_rate": 3.241660997278694e-05,
"loss": 1.097,
"num_input_tokens_seen": 191573504,
"step": 351500
},
{
"epoch": 3.5216904113974707,
"grad_norm": 5.930721759796143,
"learning_rate": 3.239159796702417e-05,
"loss": 1.0897,
"num_input_tokens_seen": 191844912,
"step": 352000
},
{
"epoch": 3.526692812550024,
"grad_norm": 4.852160453796387,
"learning_rate": 3.236658596126141e-05,
"loss": 1.0989,
"num_input_tokens_seen": 192115928,
"step": 352500
},
{
"epoch": 3.5316952137025774,
"grad_norm": 5.043008327484131,
"learning_rate": 3.234157395549864e-05,
"loss": 1.0835,
"num_input_tokens_seen": 192381304,
"step": 353000
},
{
"epoch": 3.5366976148551306,
"grad_norm": 5.529479503631592,
"learning_rate": 3.231656194973588e-05,
"loss": 1.1085,
"num_input_tokens_seen": 192651760,
"step": 353500
},
{
"epoch": 3.5417000160076837,
"grad_norm": 6.701032638549805,
"learning_rate": 3.229154994397311e-05,
"loss": 1.1058,
"num_input_tokens_seen": 192924280,
"step": 354000
},
{
"epoch": 3.546702417160237,
"grad_norm": 6.587806224822998,
"learning_rate": 3.2266537938210347e-05,
"loss": 1.0952,
"num_input_tokens_seen": 193194504,
"step": 354500
},
{
"epoch": 3.55170481831279,
"grad_norm": 5.651816368103027,
"learning_rate": 3.2241525932447575e-05,
"loss": 1.0987,
"num_input_tokens_seen": 193457896,
"step": 355000
},
{
"epoch": 3.5567072194653435,
"grad_norm": 4.911685943603516,
"learning_rate": 3.221651392668481e-05,
"loss": 1.0894,
"num_input_tokens_seen": 193726256,
"step": 355500
},
{
"epoch": 3.5617096206178966,
"grad_norm": 5.760750770568848,
"learning_rate": 3.2191501920922045e-05,
"loss": 1.1061,
"num_input_tokens_seen": 194000904,
"step": 356000
},
{
"epoch": 3.5667120217704498,
"grad_norm": 5.3068647384643555,
"learning_rate": 3.216648991515927e-05,
"loss": 1.0917,
"num_input_tokens_seen": 194271728,
"step": 356500
},
{
"epoch": 3.571714422923003,
"grad_norm": 5.526483535766602,
"learning_rate": 3.2141477909396515e-05,
"loss": 1.0853,
"num_input_tokens_seen": 194541832,
"step": 357000
},
{
"epoch": 3.5767168240755565,
"grad_norm": 6.068410396575928,
"learning_rate": 3.211646590363375e-05,
"loss": 1.1037,
"num_input_tokens_seen": 194815496,
"step": 357500
},
{
"epoch": 3.5817192252281096,
"grad_norm": 5.573991775512695,
"learning_rate": 3.2091453897870985e-05,
"loss": 1.104,
"num_input_tokens_seen": 195090856,
"step": 358000
},
{
"epoch": 3.5867216263806627,
"grad_norm": 7.24959135055542,
"learning_rate": 3.206644189210821e-05,
"loss": 1.1011,
"num_input_tokens_seen": 195370496,
"step": 358500
},
{
"epoch": 3.591724027533216,
"grad_norm": 5.9966535568237305,
"learning_rate": 3.204142988634545e-05,
"loss": 1.1042,
"num_input_tokens_seen": 195642920,
"step": 359000
},
{
"epoch": 3.596726428685769,
"grad_norm": 10.24399185180664,
"learning_rate": 3.201641788058268e-05,
"loss": 1.1068,
"num_input_tokens_seen": 195918104,
"step": 359500
},
{
"epoch": 3.6017288298383225,
"grad_norm": 6.0826215744018555,
"learning_rate": 3.199140587481991e-05,
"loss": 1.0953,
"num_input_tokens_seen": 196193816,
"step": 360000
},
{
"epoch": 3.6067312309908757,
"grad_norm": 5.735098838806152,
"learning_rate": 3.1966393869057146e-05,
"loss": 1.0956,
"num_input_tokens_seen": 196461344,
"step": 360500
},
{
"epoch": 3.611733632143429,
"grad_norm": 4.604750156402588,
"learning_rate": 3.194138186329438e-05,
"loss": 1.0863,
"num_input_tokens_seen": 196732248,
"step": 361000
},
{
"epoch": 3.616736033295982,
"grad_norm": 5.826147079467773,
"learning_rate": 3.191636985753162e-05,
"loss": 1.1043,
"num_input_tokens_seen": 197008704,
"step": 361500
},
{
"epoch": 3.6217384344485355,
"grad_norm": 6.071508884429932,
"learning_rate": 3.189135785176885e-05,
"loss": 1.1086,
"num_input_tokens_seen": 197287472,
"step": 362000
},
{
"epoch": 3.6267408356010886,
"grad_norm": 7.109647750854492,
"learning_rate": 3.1866345846006086e-05,
"loss": 1.1049,
"num_input_tokens_seen": 197561376,
"step": 362500
},
{
"epoch": 3.6317432367536417,
"grad_norm": 5.95808219909668,
"learning_rate": 3.184133384024332e-05,
"loss": 1.1102,
"num_input_tokens_seen": 197833112,
"step": 363000
},
{
"epoch": 3.636745637906195,
"grad_norm": 5.6464080810546875,
"learning_rate": 3.181632183448055e-05,
"loss": 1.1011,
"num_input_tokens_seen": 198108448,
"step": 363500
},
{
"epoch": 3.641748039058748,
"grad_norm": 6.354126453399658,
"learning_rate": 3.1791309828717784e-05,
"loss": 1.1115,
"num_input_tokens_seen": 198384984,
"step": 364000
},
{
"epoch": 3.6467504402113016,
"grad_norm": 4.0459394454956055,
"learning_rate": 3.176629782295502e-05,
"loss": 1.0864,
"num_input_tokens_seen": 198658952,
"step": 364500
},
{
"epoch": 3.6517528413638547,
"grad_norm": 5.361639022827148,
"learning_rate": 3.1741285817192254e-05,
"loss": 1.0827,
"num_input_tokens_seen": 198924888,
"step": 365000
},
{
"epoch": 3.656755242516408,
"grad_norm": 5.508306503295898,
"learning_rate": 3.171627381142949e-05,
"loss": 1.1204,
"num_input_tokens_seen": 199201528,
"step": 365500
},
{
"epoch": 3.661757643668961,
"grad_norm": 5.771850109100342,
"learning_rate": 3.1691261805666724e-05,
"loss": 1.0936,
"num_input_tokens_seen": 199477568,
"step": 366000
},
{
"epoch": 3.6667600448215145,
"grad_norm": 5.311666011810303,
"learning_rate": 3.166624979990396e-05,
"loss": 1.0837,
"num_input_tokens_seen": 199742528,
"step": 366500
},
{
"epoch": 3.6717624459740676,
"grad_norm": 6.869203090667725,
"learning_rate": 3.164123779414119e-05,
"loss": 1.0877,
"num_input_tokens_seen": 200016352,
"step": 367000
},
{
"epoch": 3.6767648471266208,
"grad_norm": 5.720645427703857,
"learning_rate": 3.161622578837842e-05,
"loss": 1.1157,
"num_input_tokens_seen": 200288848,
"step": 367500
},
{
"epoch": 3.681767248279174,
"grad_norm": 4.348053455352783,
"learning_rate": 3.159121378261566e-05,
"loss": 1.1081,
"num_input_tokens_seen": 200560176,
"step": 368000
},
{
"epoch": 3.686769649431727,
"grad_norm": 10.115488052368164,
"learning_rate": 3.1566201776852885e-05,
"loss": 1.0972,
"num_input_tokens_seen": 200829688,
"step": 368500
},
{
"epoch": 3.6917720505842806,
"grad_norm": 5.798775672912598,
"learning_rate": 3.154118977109013e-05,
"loss": 1.0972,
"num_input_tokens_seen": 201096760,
"step": 369000
},
{
"epoch": 3.6967744517368337,
"grad_norm": 6.090835094451904,
"learning_rate": 3.151617776532736e-05,
"loss": 1.0971,
"num_input_tokens_seen": 201367440,
"step": 369500
},
{
"epoch": 3.701776852889387,
"grad_norm": 5.695186138153076,
"learning_rate": 3.14911657595646e-05,
"loss": 1.0839,
"num_input_tokens_seen": 201639960,
"step": 370000
},
{
"epoch": 3.70677925404194,
"grad_norm": 7.136424541473389,
"learning_rate": 3.1466153753801825e-05,
"loss": 1.1157,
"num_input_tokens_seen": 201913680,
"step": 370500
},
{
"epoch": 3.7117816551944935,
"grad_norm": 5.564599514007568,
"learning_rate": 3.144114174803906e-05,
"loss": 1.0987,
"num_input_tokens_seen": 202193184,
"step": 371000
},
{
"epoch": 3.7167840563470467,
"grad_norm": 5.429393291473389,
"learning_rate": 3.1416129742276295e-05,
"loss": 1.0872,
"num_input_tokens_seen": 202465104,
"step": 371500
},
{
"epoch": 3.7217864574996,
"grad_norm": 6.241130828857422,
"learning_rate": 3.139111773651352e-05,
"loss": 1.1101,
"num_input_tokens_seen": 202739128,
"step": 372000
},
{
"epoch": 3.726788858652153,
"grad_norm": 5.023561954498291,
"learning_rate": 3.136610573075076e-05,
"loss": 1.1091,
"num_input_tokens_seen": 203013680,
"step": 372500
},
{
"epoch": 3.731791259804706,
"grad_norm": 4.592106342315674,
"learning_rate": 3.1341093724988e-05,
"loss": 1.105,
"num_input_tokens_seen": 203285192,
"step": 373000
},
{
"epoch": 3.7367936609572596,
"grad_norm": 4.939518928527832,
"learning_rate": 3.1316081719225235e-05,
"loss": 1.1075,
"num_input_tokens_seen": 203559176,
"step": 373500
},
{
"epoch": 3.7417960621098127,
"grad_norm": 5.232937812805176,
"learning_rate": 3.129106971346246e-05,
"loss": 1.1105,
"num_input_tokens_seen": 203835152,
"step": 374000
},
{
"epoch": 3.746798463262366,
"grad_norm": 4.963284492492676,
"learning_rate": 3.12660577076997e-05,
"loss": 1.0907,
"num_input_tokens_seen": 204105752,
"step": 374500
},
{
"epoch": 3.751800864414919,
"grad_norm": 5.728975296020508,
"learning_rate": 3.124104570193693e-05,
"loss": 1.1002,
"num_input_tokens_seen": 204373136,
"step": 375000
},
{
"epoch": 3.7568032655674726,
"grad_norm": 6.109611511230469,
"learning_rate": 3.121603369617416e-05,
"loss": 1.108,
"num_input_tokens_seen": 204638480,
"step": 375500
},
{
"epoch": 3.7618056667200257,
"grad_norm": 5.837881088256836,
"learning_rate": 3.1191021690411396e-05,
"loss": 1.1266,
"num_input_tokens_seen": 204909880,
"step": 376000
},
{
"epoch": 3.766808067872579,
"grad_norm": 6.2475666999816895,
"learning_rate": 3.116600968464863e-05,
"loss": 1.088,
"num_input_tokens_seen": 205188256,
"step": 376500
},
{
"epoch": 3.771810469025132,
"grad_norm": 5.80530309677124,
"learning_rate": 3.1140997678885866e-05,
"loss": 1.0914,
"num_input_tokens_seen": 205462952,
"step": 377000
},
{
"epoch": 3.776812870177685,
"grad_norm": 8.078316688537598,
"learning_rate": 3.11159856731231e-05,
"loss": 1.0968,
"num_input_tokens_seen": 205733776,
"step": 377500
},
{
"epoch": 3.7818152713302386,
"grad_norm": 6.782426834106445,
"learning_rate": 3.1090973667360336e-05,
"loss": 1.0869,
"num_input_tokens_seen": 206004512,
"step": 378000
},
{
"epoch": 3.7868176724827918,
"grad_norm": 5.787932395935059,
"learning_rate": 3.106596166159757e-05,
"loss": 1.1081,
"num_input_tokens_seen": 206278760,
"step": 378500
},
{
"epoch": 3.791820073635345,
"grad_norm": 6.141157150268555,
"learning_rate": 3.10409496558348e-05,
"loss": 1.1042,
"num_input_tokens_seen": 206552664,
"step": 379000
},
{
"epoch": 3.796822474787898,
"grad_norm": 5.748921871185303,
"learning_rate": 3.1015937650072034e-05,
"loss": 1.1058,
"num_input_tokens_seen": 206824976,
"step": 379500
},
{
"epoch": 3.8018248759404516,
"grad_norm": 5.540569305419922,
"learning_rate": 3.099092564430927e-05,
"loss": 1.102,
"num_input_tokens_seen": 207100016,
"step": 380000
},
{
"epoch": 3.8068272770930047,
"grad_norm": 6.440171718597412,
"learning_rate": 3.0965913638546504e-05,
"loss": 1.1179,
"num_input_tokens_seen": 207370128,
"step": 380500
},
{
"epoch": 3.811829678245558,
"grad_norm": 4.424386024475098,
"learning_rate": 3.094090163278374e-05,
"loss": 1.0881,
"num_input_tokens_seen": 207637240,
"step": 381000
},
{
"epoch": 3.816832079398111,
"grad_norm": 5.059506416320801,
"learning_rate": 3.0915889627020974e-05,
"loss": 1.0892,
"num_input_tokens_seen": 207914904,
"step": 381500
},
{
"epoch": 3.821834480550664,
"grad_norm": 5.5119805335998535,
"learning_rate": 3.089087762125821e-05,
"loss": 1.089,
"num_input_tokens_seen": 208195200,
"step": 382000
},
{
"epoch": 3.8268368817032177,
"grad_norm": 5.340829372406006,
"learning_rate": 3.086586561549544e-05,
"loss": 1.0989,
"num_input_tokens_seen": 208469512,
"step": 382500
},
{
"epoch": 3.831839282855771,
"grad_norm": 5.793147087097168,
"learning_rate": 3.084085360973267e-05,
"loss": 1.1051,
"num_input_tokens_seen": 208740512,
"step": 383000
},
{
"epoch": 3.836841684008324,
"grad_norm": 4.490692138671875,
"learning_rate": 3.081584160396991e-05,
"loss": 1.1077,
"num_input_tokens_seen": 209018232,
"step": 383500
},
{
"epoch": 3.8418440851608775,
"grad_norm": 6.107596397399902,
"learning_rate": 3.0790829598207135e-05,
"loss": 1.0995,
"num_input_tokens_seen": 209290016,
"step": 384000
},
{
"epoch": 3.8468464863134306,
"grad_norm": 7.825516223907471,
"learning_rate": 3.076581759244438e-05,
"loss": 1.0869,
"num_input_tokens_seen": 209556592,
"step": 384500
},
{
"epoch": 3.8518488874659838,
"grad_norm": 4.849490165710449,
"learning_rate": 3.074080558668161e-05,
"loss": 1.1221,
"num_input_tokens_seen": 209832880,
"step": 385000
},
{
"epoch": 3.856851288618537,
"grad_norm": 6.529792308807373,
"learning_rate": 3.071579358091885e-05,
"loss": 1.1062,
"num_input_tokens_seen": 210114184,
"step": 385500
},
{
"epoch": 3.86185368977109,
"grad_norm": 6.837585926055908,
"learning_rate": 3.0690781575156075e-05,
"loss": 1.0878,
"num_input_tokens_seen": 210380480,
"step": 386000
},
{
"epoch": 3.866856090923643,
"grad_norm": 6.309233665466309,
"learning_rate": 3.066576956939331e-05,
"loss": 1.1116,
"num_input_tokens_seen": 210653688,
"step": 386500
},
{
"epoch": 3.8718584920761967,
"grad_norm": 6.287944316864014,
"learning_rate": 3.0640757563630545e-05,
"loss": 1.1021,
"num_input_tokens_seen": 210927232,
"step": 387000
},
{
"epoch": 3.87686089322875,
"grad_norm": 5.488702774047852,
"learning_rate": 3.061574555786777e-05,
"loss": 1.1043,
"num_input_tokens_seen": 211197296,
"step": 387500
},
{
"epoch": 3.881863294381303,
"grad_norm": 8.246638298034668,
"learning_rate": 3.059073355210501e-05,
"loss": 1.0917,
"num_input_tokens_seen": 211469200,
"step": 388000
},
{
"epoch": 3.8868656955338565,
"grad_norm": 6.3921332359313965,
"learning_rate": 3.056572154634224e-05,
"loss": 1.1079,
"num_input_tokens_seen": 211736248,
"step": 388500
},
{
"epoch": 3.8918680966864097,
"grad_norm": 5.241750717163086,
"learning_rate": 3.0540709540579485e-05,
"loss": 1.0928,
"num_input_tokens_seen": 212005560,
"step": 389000
},
{
"epoch": 3.8968704978389628,
"grad_norm": 5.063024997711182,
"learning_rate": 3.0515697534816713e-05,
"loss": 1.1103,
"num_input_tokens_seen": 212271040,
"step": 389500
},
{
"epoch": 3.901872898991516,
"grad_norm": 5.5935139656066895,
"learning_rate": 3.0490685529053948e-05,
"loss": 1.1169,
"num_input_tokens_seen": 212544560,
"step": 390000
},
{
"epoch": 3.906875300144069,
"grad_norm": 5.44050931930542,
"learning_rate": 3.0465673523291183e-05,
"loss": 1.0862,
"num_input_tokens_seen": 212819160,
"step": 390500
},
{
"epoch": 3.911877701296622,
"grad_norm": 5.747745990753174,
"learning_rate": 3.044066151752841e-05,
"loss": 1.0898,
"num_input_tokens_seen": 213087032,
"step": 391000
},
{
"epoch": 3.9168801024491757,
"grad_norm": 5.6474995613098145,
"learning_rate": 3.041564951176565e-05,
"loss": 1.1183,
"num_input_tokens_seen": 213366232,
"step": 391500
},
{
"epoch": 3.921882503601729,
"grad_norm": 5.1681928634643555,
"learning_rate": 3.0390637506002884e-05,
"loss": 1.1001,
"num_input_tokens_seen": 213633560,
"step": 392000
},
{
"epoch": 3.926884904754282,
"grad_norm": 7.847573280334473,
"learning_rate": 3.036562550024012e-05,
"loss": 1.0939,
"num_input_tokens_seen": 213908816,
"step": 392500
},
{
"epoch": 3.9318873059068356,
"grad_norm": 7.0550713539123535,
"learning_rate": 3.0340613494477348e-05,
"loss": 1.1101,
"num_input_tokens_seen": 214186464,
"step": 393000
},
{
"epoch": 3.9368897070593887,
"grad_norm": 5.558708667755127,
"learning_rate": 3.0315601488714586e-05,
"loss": 1.1038,
"num_input_tokens_seen": 214455448,
"step": 393500
},
{
"epoch": 3.941892108211942,
"grad_norm": 7.920301914215088,
"learning_rate": 3.029058948295182e-05,
"loss": 1.1085,
"num_input_tokens_seen": 214732032,
"step": 394000
},
{
"epoch": 3.946894509364495,
"grad_norm": 6.4054789543151855,
"learning_rate": 3.026557747718905e-05,
"loss": 1.1035,
"num_input_tokens_seen": 215009992,
"step": 394500
},
{
"epoch": 3.951896910517048,
"grad_norm": 5.385251045227051,
"learning_rate": 3.0240565471426284e-05,
"loss": 1.1004,
"num_input_tokens_seen": 215281032,
"step": 395000
},
{
"epoch": 3.956899311669601,
"grad_norm": 6.670193672180176,
"learning_rate": 3.0215553465663523e-05,
"loss": 1.0895,
"num_input_tokens_seen": 215547536,
"step": 395500
},
{
"epoch": 3.9619017128221548,
"grad_norm": 9.283798217773438,
"learning_rate": 3.0190541459900757e-05,
"loss": 1.0853,
"num_input_tokens_seen": 215818168,
"step": 396000
},
{
"epoch": 3.966904113974708,
"grad_norm": 5.494171142578125,
"learning_rate": 3.0165529454137986e-05,
"loss": 1.1118,
"num_input_tokens_seen": 216097808,
"step": 396500
},
{
"epoch": 3.971906515127261,
"grad_norm": 9.865717887878418,
"learning_rate": 3.014051744837522e-05,
"loss": 1.1092,
"num_input_tokens_seen": 216372792,
"step": 397000
},
{
"epoch": 3.9769089162798146,
"grad_norm": 7.068398952484131,
"learning_rate": 3.0115505442612456e-05,
"loss": 1.0978,
"num_input_tokens_seen": 216645048,
"step": 397500
},
{
"epoch": 3.9819113174323677,
"grad_norm": 7.0897626876831055,
"learning_rate": 3.0090493436849687e-05,
"loss": 1.0978,
"num_input_tokens_seen": 216922104,
"step": 398000
},
{
"epoch": 3.986913718584921,
"grad_norm": 6.884424686431885,
"learning_rate": 3.0065481431086922e-05,
"loss": 1.1057,
"num_input_tokens_seen": 217197472,
"step": 398500
},
{
"epoch": 3.991916119737474,
"grad_norm": 8.55648136138916,
"learning_rate": 3.0040469425324157e-05,
"loss": 1.0986,
"num_input_tokens_seen": 217464560,
"step": 399000
},
{
"epoch": 3.996918520890027,
"grad_norm": 6.080700874328613,
"learning_rate": 3.0015457419561392e-05,
"loss": 1.1001,
"num_input_tokens_seen": 217738936,
"step": 399500
},
{
"epoch": 4.0,
"eval_loss": 1.0870640277862549,
"eval_runtime": 187.0155,
"eval_samples_per_second": 1068.922,
"eval_steps_per_second": 133.62,
"num_input_tokens_seen": 217911400,
"step": 399808
},
{
"epoch": 4.00192092204258,
"grad_norm": 5.729778289794922,
"learning_rate": 2.9990445413798624e-05,
"loss": 1.041,
"num_input_tokens_seen": 218020144,
"step": 400000
},
{
"epoch": 4.006923323195133,
"grad_norm": 5.8337225914001465,
"learning_rate": 2.996543340803586e-05,
"loss": 1.0154,
"num_input_tokens_seen": 218293352,
"step": 400500
},
{
"epoch": 4.0119257243476865,
"grad_norm": 6.142926216125488,
"learning_rate": 2.9940421402273094e-05,
"loss": 1.0043,
"num_input_tokens_seen": 218559112,
"step": 401000
},
{
"epoch": 4.0169281255002405,
"grad_norm": 4.911243915557861,
"learning_rate": 2.9915409396510325e-05,
"loss": 1.0061,
"num_input_tokens_seen": 218839224,
"step": 401500
},
{
"epoch": 4.021930526652794,
"grad_norm": 5.466070175170898,
"learning_rate": 2.989039739074756e-05,
"loss": 0.9953,
"num_input_tokens_seen": 219111776,
"step": 402000
},
{
"epoch": 4.026932927805347,
"grad_norm": 6.471262454986572,
"learning_rate": 2.9865385384984795e-05,
"loss": 0.9955,
"num_input_tokens_seen": 219383912,
"step": 402500
},
{
"epoch": 4.0319353289579,
"grad_norm": 7.179049491882324,
"learning_rate": 2.9840373379222027e-05,
"loss": 1.0101,
"num_input_tokens_seen": 219647992,
"step": 403000
},
{
"epoch": 4.036937730110453,
"grad_norm": 5.031703948974609,
"learning_rate": 2.981536137345926e-05,
"loss": 1.0021,
"num_input_tokens_seen": 219913768,
"step": 403500
},
{
"epoch": 4.041940131263006,
"grad_norm": 4.3193840980529785,
"learning_rate": 2.9790349367696497e-05,
"loss": 1.0078,
"num_input_tokens_seen": 220190376,
"step": 404000
},
{
"epoch": 4.046942532415559,
"grad_norm": 5.400819778442383,
"learning_rate": 2.976533736193373e-05,
"loss": 0.9949,
"num_input_tokens_seen": 220460224,
"step": 404500
},
{
"epoch": 4.051944933568112,
"grad_norm": 6.279000759124756,
"learning_rate": 2.9740325356170963e-05,
"loss": 1.0322,
"num_input_tokens_seen": 220736472,
"step": 405000
},
{
"epoch": 4.0569473347206655,
"grad_norm": 7.3011627197265625,
"learning_rate": 2.9715313350408198e-05,
"loss": 1.0112,
"num_input_tokens_seen": 221004608,
"step": 405500
},
{
"epoch": 4.0619497358732195,
"grad_norm": 6.007471561431885,
"learning_rate": 2.9690301344645433e-05,
"loss": 1.0231,
"num_input_tokens_seen": 221278264,
"step": 406000
},
{
"epoch": 4.066952137025773,
"grad_norm": 5.404012203216553,
"learning_rate": 2.966528933888266e-05,
"loss": 1.0205,
"num_input_tokens_seen": 221555688,
"step": 406500
},
{
"epoch": 4.071954538178326,
"grad_norm": 4.693950653076172,
"learning_rate": 2.9640277333119896e-05,
"loss": 1.021,
"num_input_tokens_seen": 221832040,
"step": 407000
},
{
"epoch": 4.076956939330879,
"grad_norm": 5.678884029388428,
"learning_rate": 2.9615265327357135e-05,
"loss": 1.0268,
"num_input_tokens_seen": 222112352,
"step": 407500
},
{
"epoch": 4.081959340483432,
"grad_norm": 5.514533042907715,
"learning_rate": 2.959025332159437e-05,
"loss": 1.0236,
"num_input_tokens_seen": 222383544,
"step": 408000
},
{
"epoch": 4.086961741635985,
"grad_norm": 6.353760719299316,
"learning_rate": 2.9565241315831598e-05,
"loss": 1.0076,
"num_input_tokens_seen": 222653528,
"step": 408500
},
{
"epoch": 4.091964142788538,
"grad_norm": 5.7514519691467285,
"learning_rate": 2.9540229310068833e-05,
"loss": 1.0175,
"num_input_tokens_seen": 222929176,
"step": 409000
},
{
"epoch": 4.096966543941091,
"grad_norm": 5.185674667358398,
"learning_rate": 2.951521730430607e-05,
"loss": 1.023,
"num_input_tokens_seen": 223205600,
"step": 409500
},
{
"epoch": 4.1019689450936445,
"grad_norm": 6.269286632537842,
"learning_rate": 2.94902052985433e-05,
"loss": 1.0177,
"num_input_tokens_seen": 223471424,
"step": 410000
},
{
"epoch": 4.1069713462461985,
"grad_norm": 5.551058292388916,
"learning_rate": 2.9465193292780534e-05,
"loss": 1.014,
"num_input_tokens_seen": 223744904,
"step": 410500
},
{
"epoch": 4.111973747398752,
"grad_norm": 7.259944438934326,
"learning_rate": 2.944018128701777e-05,
"loss": 1.0176,
"num_input_tokens_seen": 224017960,
"step": 411000
},
{
"epoch": 4.116976148551305,
"grad_norm": 6.2288498878479,
"learning_rate": 2.9415169281255008e-05,
"loss": 1.0208,
"num_input_tokens_seen": 224288328,
"step": 411500
},
{
"epoch": 4.121978549703858,
"grad_norm": 4.875370502471924,
"learning_rate": 2.9390157275492236e-05,
"loss": 1.0218,
"num_input_tokens_seen": 224564744,
"step": 412000
},
{
"epoch": 4.126980950856411,
"grad_norm": 5.8250603675842285,
"learning_rate": 2.936514526972947e-05,
"loss": 1.0176,
"num_input_tokens_seen": 224833216,
"step": 412500
},
{
"epoch": 4.131983352008964,
"grad_norm": 4.689972877502441,
"learning_rate": 2.9340133263966706e-05,
"loss": 1.0031,
"num_input_tokens_seen": 225109008,
"step": 413000
},
{
"epoch": 4.136985753161517,
"grad_norm": 6.370342254638672,
"learning_rate": 2.9315121258203937e-05,
"loss": 1.0235,
"num_input_tokens_seen": 225386144,
"step": 413500
},
{
"epoch": 4.14198815431407,
"grad_norm": 5.214616298675537,
"learning_rate": 2.9290109252441172e-05,
"loss": 1.0147,
"num_input_tokens_seen": 225665576,
"step": 414000
},
{
"epoch": 4.146990555466624,
"grad_norm": 5.056887626647949,
"learning_rate": 2.9265097246678407e-05,
"loss": 1.0134,
"num_input_tokens_seen": 225936744,
"step": 414500
},
{
"epoch": 4.1519929566191776,
"grad_norm": 7.385371685028076,
"learning_rate": 2.9240085240915642e-05,
"loss": 1.0133,
"num_input_tokens_seen": 226206672,
"step": 415000
},
{
"epoch": 4.156995357771731,
"grad_norm": 6.09354829788208,
"learning_rate": 2.9215073235152874e-05,
"loss": 1.0257,
"num_input_tokens_seen": 226483208,
"step": 415500
},
{
"epoch": 4.161997758924284,
"grad_norm": 6.554540634155273,
"learning_rate": 2.919006122939011e-05,
"loss": 1.004,
"num_input_tokens_seen": 226756440,
"step": 416000
},
{
"epoch": 4.167000160076837,
"grad_norm": 6.016900539398193,
"learning_rate": 2.9165049223627344e-05,
"loss": 1.0373,
"num_input_tokens_seen": 227035824,
"step": 416500
},
{
"epoch": 4.17200256122939,
"grad_norm": 5.212109565734863,
"learning_rate": 2.9140037217864575e-05,
"loss": 1.0168,
"num_input_tokens_seen": 227309792,
"step": 417000
},
{
"epoch": 4.177004962381943,
"grad_norm": 5.641068935394287,
"learning_rate": 2.911502521210181e-05,
"loss": 1.0184,
"num_input_tokens_seen": 227578216,
"step": 417500
},
{
"epoch": 4.182007363534496,
"grad_norm": 4.603857040405273,
"learning_rate": 2.9090013206339045e-05,
"loss": 1.0092,
"num_input_tokens_seen": 227844640,
"step": 418000
},
{
"epoch": 4.187009764687049,
"grad_norm": 7.76889705657959,
"learning_rate": 2.906500120057628e-05,
"loss": 1.0324,
"num_input_tokens_seen": 228114632,
"step": 418500
},
{
"epoch": 4.1920121658396035,
"grad_norm": 5.698912143707275,
"learning_rate": 2.9039989194813512e-05,
"loss": 1.0159,
"num_input_tokens_seen": 228392896,
"step": 419000
},
{
"epoch": 4.197014566992157,
"grad_norm": 4.45599365234375,
"learning_rate": 2.9014977189050747e-05,
"loss": 1.0243,
"num_input_tokens_seen": 228657144,
"step": 419500
},
{
"epoch": 4.20201696814471,
"grad_norm": 4.775566577911377,
"learning_rate": 2.8989965183287982e-05,
"loss": 1.0249,
"num_input_tokens_seen": 228930912,
"step": 420000
},
{
"epoch": 4.207019369297263,
"grad_norm": 4.6044511795043945,
"learning_rate": 2.896495317752521e-05,
"loss": 1.0171,
"num_input_tokens_seen": 229190776,
"step": 420500
},
{
"epoch": 4.212021770449816,
"grad_norm": 5.821028709411621,
"learning_rate": 2.8939941171762448e-05,
"loss": 1.0197,
"num_input_tokens_seen": 229464464,
"step": 421000
},
{
"epoch": 4.217024171602369,
"grad_norm": 6.407191753387451,
"learning_rate": 2.8914929165999683e-05,
"loss": 1.0193,
"num_input_tokens_seen": 229735080,
"step": 421500
},
{
"epoch": 4.222026572754922,
"grad_norm": 6.624352931976318,
"learning_rate": 2.8889917160236918e-05,
"loss": 1.0388,
"num_input_tokens_seen": 230010112,
"step": 422000
},
{
"epoch": 4.227028973907475,
"grad_norm": 5.672749042510986,
"learning_rate": 2.8864905154474146e-05,
"loss": 1.0094,
"num_input_tokens_seen": 230273040,
"step": 422500
},
{
"epoch": 4.2320313750600285,
"grad_norm": 4.765455722808838,
"learning_rate": 2.8839893148711385e-05,
"loss": 1.038,
"num_input_tokens_seen": 230549568,
"step": 423000
},
{
"epoch": 4.2370337762125825,
"grad_norm": 5.471391677856445,
"learning_rate": 2.881488114294862e-05,
"loss": 1.0137,
"num_input_tokens_seen": 230815880,
"step": 423500
},
{
"epoch": 4.242036177365136,
"grad_norm": 5.090280532836914,
"learning_rate": 2.8789869137185848e-05,
"loss": 1.0254,
"num_input_tokens_seen": 231087416,
"step": 424000
},
{
"epoch": 4.247038578517689,
"grad_norm": 5.823254585266113,
"learning_rate": 2.8764857131423083e-05,
"loss": 1.0369,
"num_input_tokens_seen": 231351728,
"step": 424500
},
{
"epoch": 4.252040979670242,
"grad_norm": 5.406543731689453,
"learning_rate": 2.873984512566032e-05,
"loss": 1.0233,
"num_input_tokens_seen": 231626944,
"step": 425000
},
{
"epoch": 4.257043380822795,
"grad_norm": 6.112472057342529,
"learning_rate": 2.871483311989755e-05,
"loss": 1.046,
"num_input_tokens_seen": 231901904,
"step": 425500
},
{
"epoch": 4.262045781975348,
"grad_norm": 5.495764255523682,
"learning_rate": 2.8689821114134784e-05,
"loss": 1.0127,
"num_input_tokens_seen": 232177576,
"step": 426000
},
{
"epoch": 4.267048183127901,
"grad_norm": 5.970737934112549,
"learning_rate": 2.866480910837202e-05,
"loss": 1.0266,
"num_input_tokens_seen": 232448808,
"step": 426500
},
{
"epoch": 4.272050584280454,
"grad_norm": 6.965437889099121,
"learning_rate": 2.8639797102609254e-05,
"loss": 1.0311,
"num_input_tokens_seen": 232724960,
"step": 427000
},
{
"epoch": 4.2770529854330075,
"grad_norm": 6.662547588348389,
"learning_rate": 2.8614785096846486e-05,
"loss": 1.0216,
"num_input_tokens_seen": 232999800,
"step": 427500
},
{
"epoch": 4.2820553865855615,
"grad_norm": 4.90582275390625,
"learning_rate": 2.858977309108372e-05,
"loss": 1.036,
"num_input_tokens_seen": 233278352,
"step": 428000
},
{
"epoch": 4.287057787738115,
"grad_norm": 5.090430736541748,
"learning_rate": 2.8564761085320956e-05,
"loss": 1.0253,
"num_input_tokens_seen": 233542648,
"step": 428500
},
{
"epoch": 4.292060188890668,
"grad_norm": 6.307216167449951,
"learning_rate": 2.8539749079558187e-05,
"loss": 1.0367,
"num_input_tokens_seen": 233821008,
"step": 429000
},
{
"epoch": 4.297062590043221,
"grad_norm": 5.634079933166504,
"learning_rate": 2.8514737073795422e-05,
"loss": 1.0248,
"num_input_tokens_seen": 234099000,
"step": 429500
},
{
"epoch": 4.302064991195774,
"grad_norm": 6.014862060546875,
"learning_rate": 2.8489725068032657e-05,
"loss": 1.0231,
"num_input_tokens_seen": 234375176,
"step": 430000
},
{
"epoch": 4.307067392348327,
"grad_norm": 5.199640274047852,
"learning_rate": 2.8464713062269892e-05,
"loss": 1.0366,
"num_input_tokens_seen": 234650128,
"step": 430500
},
{
"epoch": 4.31206979350088,
"grad_norm": 4.8902692794799805,
"learning_rate": 2.8439701056507124e-05,
"loss": 1.0394,
"num_input_tokens_seen": 234918712,
"step": 431000
},
{
"epoch": 4.317072194653433,
"grad_norm": 4.592429161071777,
"learning_rate": 2.841468905074436e-05,
"loss": 1.029,
"num_input_tokens_seen": 235196600,
"step": 431500
},
{
"epoch": 4.3220745958059865,
"grad_norm": 5.6518144607543945,
"learning_rate": 2.8389677044981594e-05,
"loss": 1.0217,
"num_input_tokens_seen": 235465976,
"step": 432000
},
{
"epoch": 4.3270769969585405,
"grad_norm": 5.183743000030518,
"learning_rate": 2.8364665039218825e-05,
"loss": 1.0262,
"num_input_tokens_seen": 235745992,
"step": 432500
},
{
"epoch": 4.332079398111094,
"grad_norm": 4.891019821166992,
"learning_rate": 2.833965303345606e-05,
"loss": 1.0363,
"num_input_tokens_seen": 236018376,
"step": 433000
},
{
"epoch": 4.337081799263647,
"grad_norm": 4.2536725997924805,
"learning_rate": 2.8314641027693295e-05,
"loss": 1.022,
"num_input_tokens_seen": 236289296,
"step": 433500
},
{
"epoch": 4.3420842004162,
"grad_norm": 6.686141014099121,
"learning_rate": 2.828962902193053e-05,
"loss": 1.0393,
"num_input_tokens_seen": 236575672,
"step": 434000
},
{
"epoch": 4.347086601568753,
"grad_norm": 4.4611945152282715,
"learning_rate": 2.8264617016167762e-05,
"loss": 1.0419,
"num_input_tokens_seen": 236849624,
"step": 434500
},
{
"epoch": 4.352089002721306,
"grad_norm": 4.447482585906982,
"learning_rate": 2.8239605010404997e-05,
"loss": 1.0337,
"num_input_tokens_seen": 237115712,
"step": 435000
},
{
"epoch": 4.357091403873859,
"grad_norm": 5.549137115478516,
"learning_rate": 2.8214593004642232e-05,
"loss": 1.0161,
"num_input_tokens_seen": 237386704,
"step": 435500
},
{
"epoch": 4.362093805026412,
"grad_norm": 6.824407577514648,
"learning_rate": 2.818958099887946e-05,
"loss": 1.025,
"num_input_tokens_seen": 237662672,
"step": 436000
},
{
"epoch": 4.3670962061789655,
"grad_norm": 5.618262767791748,
"learning_rate": 2.8164568993116695e-05,
"loss": 1.0222,
"num_input_tokens_seen": 237934856,
"step": 436500
},
{
"epoch": 4.37209860733152,
"grad_norm": 5.112995624542236,
"learning_rate": 2.8139556987353933e-05,
"loss": 1.0119,
"num_input_tokens_seen": 238206992,
"step": 437000
},
{
"epoch": 4.377101008484073,
"grad_norm": 5.395593166351318,
"learning_rate": 2.811454498159117e-05,
"loss": 1.0337,
"num_input_tokens_seen": 238477792,
"step": 437500
},
{
"epoch": 4.382103409636626,
"grad_norm": 8.581912994384766,
"learning_rate": 2.8089532975828397e-05,
"loss": 1.0447,
"num_input_tokens_seen": 238754960,
"step": 438000
},
{
"epoch": 4.387105810789179,
"grad_norm": 5.694709777832031,
"learning_rate": 2.806452097006563e-05,
"loss": 1.0404,
"num_input_tokens_seen": 239027008,
"step": 438500
},
{
"epoch": 4.392108211941732,
"grad_norm": 6.605731010437012,
"learning_rate": 2.803950896430287e-05,
"loss": 1.0412,
"num_input_tokens_seen": 239292592,
"step": 439000
},
{
"epoch": 4.397110613094285,
"grad_norm": 5.162715911865234,
"learning_rate": 2.8014496958540098e-05,
"loss": 1.0266,
"num_input_tokens_seen": 239564344,
"step": 439500
},
{
"epoch": 4.402113014246838,
"grad_norm": 8.414751052856445,
"learning_rate": 2.7989484952777333e-05,
"loss": 1.0461,
"num_input_tokens_seen": 239839984,
"step": 440000
},
{
"epoch": 4.4071154153993914,
"grad_norm": 5.043530464172363,
"learning_rate": 2.7964472947014568e-05,
"loss": 1.0312,
"num_input_tokens_seen": 240116944,
"step": 440500
},
{
"epoch": 4.412117816551945,
"grad_norm": 6.508191108703613,
"learning_rate": 2.7939460941251806e-05,
"loss": 1.0355,
"num_input_tokens_seen": 240386144,
"step": 441000
},
{
"epoch": 4.417120217704499,
"grad_norm": 4.704832077026367,
"learning_rate": 2.7914448935489035e-05,
"loss": 1.0252,
"num_input_tokens_seen": 240655192,
"step": 441500
},
{
"epoch": 4.422122618857052,
"grad_norm": 6.601123332977295,
"learning_rate": 2.788943692972627e-05,
"loss": 1.0564,
"num_input_tokens_seen": 240931640,
"step": 442000
},
{
"epoch": 4.427125020009605,
"grad_norm": 5.828186988830566,
"learning_rate": 2.7864424923963504e-05,
"loss": 1.0359,
"num_input_tokens_seen": 241199768,
"step": 442500
},
{
"epoch": 4.432127421162158,
"grad_norm": 4.463243007659912,
"learning_rate": 2.7839412918200736e-05,
"loss": 1.03,
"num_input_tokens_seen": 241474320,
"step": 443000
},
{
"epoch": 4.437129822314711,
"grad_norm": 5.028249263763428,
"learning_rate": 2.781440091243797e-05,
"loss": 1.0403,
"num_input_tokens_seen": 241750640,
"step": 443500
},
{
"epoch": 4.442132223467264,
"grad_norm": 4.5420684814453125,
"learning_rate": 2.7789388906675206e-05,
"loss": 1.0171,
"num_input_tokens_seen": 242018848,
"step": 444000
},
{
"epoch": 4.447134624619817,
"grad_norm": 7.803140640258789,
"learning_rate": 2.7764376900912438e-05,
"loss": 1.0297,
"num_input_tokens_seen": 242284168,
"step": 444500
},
{
"epoch": 4.4521370257723705,
"grad_norm": 5.844732761383057,
"learning_rate": 2.7739364895149673e-05,
"loss": 1.0348,
"num_input_tokens_seen": 242553128,
"step": 445000
},
{
"epoch": 4.457139426924924,
"grad_norm": 5.830750942230225,
"learning_rate": 2.7714352889386908e-05,
"loss": 1.0429,
"num_input_tokens_seen": 242827088,
"step": 445500
},
{
"epoch": 4.462141828077478,
"grad_norm": 4.908278942108154,
"learning_rate": 2.7689340883624143e-05,
"loss": 1.028,
"num_input_tokens_seen": 243093120,
"step": 446000
},
{
"epoch": 4.467144229230031,
"grad_norm": 5.725689888000488,
"learning_rate": 2.7664328877861374e-05,
"loss": 1.023,
"num_input_tokens_seen": 243364816,
"step": 446500
},
{
"epoch": 4.472146630382584,
"grad_norm": 5.354498386383057,
"learning_rate": 2.763931687209861e-05,
"loss": 1.0433,
"num_input_tokens_seen": 243641016,
"step": 447000
},
{
"epoch": 4.477149031535137,
"grad_norm": 6.727901458740234,
"learning_rate": 2.7614304866335844e-05,
"loss": 1.0142,
"num_input_tokens_seen": 243912976,
"step": 447500
},
{
"epoch": 4.48215143268769,
"grad_norm": 5.042398452758789,
"learning_rate": 2.7589292860573072e-05,
"loss": 1.0301,
"num_input_tokens_seen": 244181688,
"step": 448000
},
{
"epoch": 4.487153833840243,
"grad_norm": 6.022967338562012,
"learning_rate": 2.756428085481031e-05,
"loss": 1.0242,
"num_input_tokens_seen": 244448704,
"step": 448500
},
{
"epoch": 4.492156234992796,
"grad_norm": 5.077592849731445,
"learning_rate": 2.7539268849047546e-05,
"loss": 1.0373,
"num_input_tokens_seen": 244722392,
"step": 449000
},
{
"epoch": 4.4971586361453495,
"grad_norm": 5.527291774749756,
"learning_rate": 2.751425684328478e-05,
"loss": 1.0183,
"num_input_tokens_seen": 244995952,
"step": 449500
},
{
"epoch": 4.502161037297903,
"grad_norm": 5.025604248046875,
"learning_rate": 2.748924483752201e-05,
"loss": 1.0164,
"num_input_tokens_seen": 245272304,
"step": 450000
},
{
"epoch": 4.507163438450457,
"grad_norm": 5.344061374664307,
"learning_rate": 2.7464232831759247e-05,
"loss": 1.0373,
"num_input_tokens_seen": 245546016,
"step": 450500
},
{
"epoch": 4.51216583960301,
"grad_norm": 4.6710524559021,
"learning_rate": 2.7439220825996482e-05,
"loss": 1.0194,
"num_input_tokens_seen": 245823488,
"step": 451000
},
{
"epoch": 4.517168240755563,
"grad_norm": 5.240355014801025,
"learning_rate": 2.741420882023371e-05,
"loss": 1.0277,
"num_input_tokens_seen": 246097728,
"step": 451500
},
{
"epoch": 4.522170641908116,
"grad_norm": 5.007404327392578,
"learning_rate": 2.7389196814470945e-05,
"loss": 1.0217,
"num_input_tokens_seen": 246368736,
"step": 452000
},
{
"epoch": 4.527173043060669,
"grad_norm": 5.346477508544922,
"learning_rate": 2.7364184808708184e-05,
"loss": 1.0283,
"num_input_tokens_seen": 246637968,
"step": 452500
},
{
"epoch": 4.532175444213222,
"grad_norm": 5.416214466094971,
"learning_rate": 2.733917280294542e-05,
"loss": 1.0448,
"num_input_tokens_seen": 246918648,
"step": 453000
},
{
"epoch": 4.537177845365775,
"grad_norm": 7.101502418518066,
"learning_rate": 2.7314160797182647e-05,
"loss": 1.0469,
"num_input_tokens_seen": 247181624,
"step": 453500
},
{
"epoch": 4.5421802465183285,
"grad_norm": 5.758215427398682,
"learning_rate": 2.728914879141988e-05,
"loss": 1.0337,
"num_input_tokens_seen": 247456808,
"step": 454000
},
{
"epoch": 4.547182647670882,
"grad_norm": 7.215245246887207,
"learning_rate": 2.726413678565712e-05,
"loss": 1.0265,
"num_input_tokens_seen": 247730456,
"step": 454500
},
{
"epoch": 4.552185048823436,
"grad_norm": 5.474388122558594,
"learning_rate": 2.7239124779894348e-05,
"loss": 1.0319,
"num_input_tokens_seen": 248006488,
"step": 455000
},
{
"epoch": 4.557187449975989,
"grad_norm": 4.975455284118652,
"learning_rate": 2.7214112774131583e-05,
"loss": 1.029,
"num_input_tokens_seen": 248283776,
"step": 455500
},
{
"epoch": 4.562189851128542,
"grad_norm": 5.586923599243164,
"learning_rate": 2.7189100768368818e-05,
"loss": 1.0314,
"num_input_tokens_seen": 248564784,
"step": 456000
},
{
"epoch": 4.567192252281095,
"grad_norm": 7.202296257019043,
"learning_rate": 2.7164088762606053e-05,
"loss": 1.0436,
"num_input_tokens_seen": 248837928,
"step": 456500
},
{
"epoch": 4.572194653433648,
"grad_norm": 6.214195728302002,
"learning_rate": 2.7139076756843285e-05,
"loss": 1.0253,
"num_input_tokens_seen": 249109248,
"step": 457000
},
{
"epoch": 4.577197054586201,
"grad_norm": 6.16148567199707,
"learning_rate": 2.711406475108052e-05,
"loss": 1.0385,
"num_input_tokens_seen": 249384792,
"step": 457500
},
{
"epoch": 4.582199455738754,
"grad_norm": 5.71275520324707,
"learning_rate": 2.7089052745317755e-05,
"loss": 1.0432,
"num_input_tokens_seen": 249651928,
"step": 458000
},
{
"epoch": 4.5872018568913075,
"grad_norm": 4.817130088806152,
"learning_rate": 2.7064040739554986e-05,
"loss": 1.02,
"num_input_tokens_seen": 249917104,
"step": 458500
},
{
"epoch": 4.592204258043861,
"grad_norm": 5.333267688751221,
"learning_rate": 2.703902873379222e-05,
"loss": 1.0214,
"num_input_tokens_seen": 250188040,
"step": 459000
},
{
"epoch": 4.597206659196415,
"grad_norm": 5.287978172302246,
"learning_rate": 2.7014016728029456e-05,
"loss": 1.0401,
"num_input_tokens_seen": 250468056,
"step": 459500
},
{
"epoch": 4.602209060348968,
"grad_norm": 4.713915824890137,
"learning_rate": 2.698900472226669e-05,
"loss": 1.032,
"num_input_tokens_seen": 250737232,
"step": 460000
},
{
"epoch": 4.607211461501521,
"grad_norm": 6.2646965980529785,
"learning_rate": 2.6963992716503923e-05,
"loss": 1.0289,
"num_input_tokens_seen": 251009944,
"step": 460500
},
{
"epoch": 4.612213862654074,
"grad_norm": 6.391628742218018,
"learning_rate": 2.6938980710741158e-05,
"loss": 1.037,
"num_input_tokens_seen": 251276984,
"step": 461000
},
{
"epoch": 4.617216263806627,
"grad_norm": 6.245530128479004,
"learning_rate": 2.6913968704978393e-05,
"loss": 1.0321,
"num_input_tokens_seen": 251544248,
"step": 461500
},
{
"epoch": 4.62221866495918,
"grad_norm": 5.505767345428467,
"learning_rate": 2.6888956699215624e-05,
"loss": 1.023,
"num_input_tokens_seen": 251820376,
"step": 462000
},
{
"epoch": 4.6272210661117334,
"grad_norm": 5.286034107208252,
"learning_rate": 2.686394469345286e-05,
"loss": 1.029,
"num_input_tokens_seen": 252089664,
"step": 462500
},
{
"epoch": 4.632223467264287,
"grad_norm": 5.050361156463623,
"learning_rate": 2.6838932687690094e-05,
"loss": 1.0215,
"num_input_tokens_seen": 252354304,
"step": 463000
},
{
"epoch": 4.63722586841684,
"grad_norm": 4.83864164352417,
"learning_rate": 2.6813920681927322e-05,
"loss": 1.0292,
"num_input_tokens_seen": 252625992,
"step": 463500
},
{
"epoch": 4.642228269569394,
"grad_norm": 4.267606735229492,
"learning_rate": 2.678890867616456e-05,
"loss": 1.0356,
"num_input_tokens_seen": 252900240,
"step": 464000
},
{
"epoch": 4.647230670721947,
"grad_norm": 5.304383754730225,
"learning_rate": 2.6763896670401796e-05,
"loss": 1.0182,
"num_input_tokens_seen": 253175392,
"step": 464500
},
{
"epoch": 4.6522330718745,
"grad_norm": 7.107183933258057,
"learning_rate": 2.673888466463903e-05,
"loss": 1.0311,
"num_input_tokens_seen": 253449480,
"step": 465000
},
{
"epoch": 4.657235473027053,
"grad_norm": 7.010105133056641,
"learning_rate": 2.671387265887626e-05,
"loss": 1.0213,
"num_input_tokens_seen": 253717600,
"step": 465500
},
{
"epoch": 4.662237874179606,
"grad_norm": 5.442753791809082,
"learning_rate": 2.6688860653113494e-05,
"loss": 1.0624,
"num_input_tokens_seen": 253996064,
"step": 466000
},
{
"epoch": 4.667240275332159,
"grad_norm": 5.711010932922363,
"learning_rate": 2.6663848647350732e-05,
"loss": 1.0382,
"num_input_tokens_seen": 254277992,
"step": 466500
},
{
"epoch": 4.6722426764847125,
"grad_norm": 5.396849632263184,
"learning_rate": 2.663883664158796e-05,
"loss": 1.0358,
"num_input_tokens_seen": 254544856,
"step": 467000
},
{
"epoch": 4.677245077637266,
"grad_norm": 7.533030033111572,
"learning_rate": 2.6613824635825195e-05,
"loss": 1.0344,
"num_input_tokens_seen": 254815184,
"step": 467500
},
{
"epoch": 4.682247478789819,
"grad_norm": 6.03594446182251,
"learning_rate": 2.658881263006243e-05,
"loss": 1.0234,
"num_input_tokens_seen": 255087048,
"step": 468000
},
{
"epoch": 4.687249879942373,
"grad_norm": 6.070241928100586,
"learning_rate": 2.656380062429967e-05,
"loss": 1.0226,
"num_input_tokens_seen": 255357680,
"step": 468500
},
{
"epoch": 4.692252281094926,
"grad_norm": 6.334639549255371,
"learning_rate": 2.6538788618536897e-05,
"loss": 1.0262,
"num_input_tokens_seen": 255624976,
"step": 469000
},
{
"epoch": 4.697254682247479,
"grad_norm": 6.096264839172363,
"learning_rate": 2.6513776612774132e-05,
"loss": 1.0208,
"num_input_tokens_seen": 255898192,
"step": 469500
},
{
"epoch": 4.702257083400032,
"grad_norm": 6.717766761779785,
"learning_rate": 2.6488764607011367e-05,
"loss": 1.0395,
"num_input_tokens_seen": 256174976,
"step": 470000
},
{
"epoch": 4.707259484552585,
"grad_norm": 5.3120527267456055,
"learning_rate": 2.64637526012486e-05,
"loss": 1.0346,
"num_input_tokens_seen": 256445504,
"step": 470500
},
{
"epoch": 4.712261885705138,
"grad_norm": 5.765807151794434,
"learning_rate": 2.6438740595485833e-05,
"loss": 1.0221,
"num_input_tokens_seen": 256728192,
"step": 471000
},
{
"epoch": 4.7172642868576915,
"grad_norm": 5.730865478515625,
"learning_rate": 2.6413728589723068e-05,
"loss": 1.0305,
"num_input_tokens_seen": 256998424,
"step": 471500
},
{
"epoch": 4.722266688010245,
"grad_norm": 7.514993190765381,
"learning_rate": 2.6388716583960303e-05,
"loss": 1.0316,
"num_input_tokens_seen": 257266816,
"step": 472000
},
{
"epoch": 4.727269089162798,
"grad_norm": 4.826568603515625,
"learning_rate": 2.6363704578197535e-05,
"loss": 1.0336,
"num_input_tokens_seen": 257528416,
"step": 472500
},
{
"epoch": 4.732271490315352,
"grad_norm": 5.88137674331665,
"learning_rate": 2.633869257243477e-05,
"loss": 1.0379,
"num_input_tokens_seen": 257803064,
"step": 473000
},
{
"epoch": 4.737273891467905,
"grad_norm": 5.539977073669434,
"learning_rate": 2.6313680566672005e-05,
"loss": 1.0487,
"num_input_tokens_seen": 258074752,
"step": 473500
},
{
"epoch": 4.742276292620458,
"grad_norm": 4.8047871589660645,
"learning_rate": 2.6288668560909236e-05,
"loss": 1.0437,
"num_input_tokens_seen": 258356488,
"step": 474000
},
{
"epoch": 4.747278693773011,
"grad_norm": 5.240783214569092,
"learning_rate": 2.626365655514647e-05,
"loss": 1.0349,
"num_input_tokens_seen": 258631128,
"step": 474500
},
{
"epoch": 4.752281094925564,
"grad_norm": 5.152280807495117,
"learning_rate": 2.6238644549383706e-05,
"loss": 1.0395,
"num_input_tokens_seen": 258902464,
"step": 475000
},
{
"epoch": 4.757283496078117,
"grad_norm": 4.534987449645996,
"learning_rate": 2.621363254362094e-05,
"loss": 1.0417,
"num_input_tokens_seen": 259167448,
"step": 475500
},
{
"epoch": 4.7622858972306705,
"grad_norm": 5.842191219329834,
"learning_rate": 2.6188620537858173e-05,
"loss": 1.0251,
"num_input_tokens_seen": 259437192,
"step": 476000
},
{
"epoch": 4.767288298383224,
"grad_norm": 6.532055377960205,
"learning_rate": 2.6163608532095408e-05,
"loss": 1.0235,
"num_input_tokens_seen": 259710112,
"step": 476500
},
{
"epoch": 4.772290699535777,
"grad_norm": 4.665198802947998,
"learning_rate": 2.6138596526332643e-05,
"loss": 1.0329,
"num_input_tokens_seen": 259983048,
"step": 477000
},
{
"epoch": 4.777293100688331,
"grad_norm": 5.536545276641846,
"learning_rate": 2.611358452056987e-05,
"loss": 1.0144,
"num_input_tokens_seen": 260257608,
"step": 477500
},
{
"epoch": 4.782295501840884,
"grad_norm": 7.246273994445801,
"learning_rate": 2.608857251480711e-05,
"loss": 1.0353,
"num_input_tokens_seen": 260533312,
"step": 478000
},
{
"epoch": 4.787297902993437,
"grad_norm": 5.359396934509277,
"learning_rate": 2.6063560509044344e-05,
"loss": 1.0359,
"num_input_tokens_seen": 260808264,
"step": 478500
},
{
"epoch": 4.79230030414599,
"grad_norm": 5.461490154266357,
"learning_rate": 2.603854850328158e-05,
"loss": 1.0342,
"num_input_tokens_seen": 261080680,
"step": 479000
},
{
"epoch": 4.797302705298543,
"grad_norm": 6.074306488037109,
"learning_rate": 2.6013536497518807e-05,
"loss": 1.0386,
"num_input_tokens_seen": 261352616,
"step": 479500
},
{
"epoch": 4.802305106451096,
"grad_norm": 4.465676307678223,
"learning_rate": 2.5988524491756046e-05,
"loss": 1.0287,
"num_input_tokens_seen": 261635744,
"step": 480000
},
{
"epoch": 4.8073075076036496,
"grad_norm": 5.1833953857421875,
"learning_rate": 2.596351248599328e-05,
"loss": 1.0237,
"num_input_tokens_seen": 261911376,
"step": 480500
},
{
"epoch": 4.812309908756203,
"grad_norm": 7.636727809906006,
"learning_rate": 2.593850048023051e-05,
"loss": 1.0376,
"num_input_tokens_seen": 262186256,
"step": 481000
},
{
"epoch": 4.817312309908756,
"grad_norm": 5.770178318023682,
"learning_rate": 2.5913488474467744e-05,
"loss": 1.0221,
"num_input_tokens_seen": 262461448,
"step": 481500
},
{
"epoch": 4.82231471106131,
"grad_norm": 7.173573970794678,
"learning_rate": 2.5888476468704982e-05,
"loss": 1.0206,
"num_input_tokens_seen": 262734672,
"step": 482000
},
{
"epoch": 4.827317112213863,
"grad_norm": 5.029593467712402,
"learning_rate": 2.5863464462942217e-05,
"loss": 1.0507,
"num_input_tokens_seen": 263009408,
"step": 482500
},
{
"epoch": 4.832319513366416,
"grad_norm": 6.359258651733398,
"learning_rate": 2.5838452457179445e-05,
"loss": 1.0275,
"num_input_tokens_seen": 263280584,
"step": 483000
},
{
"epoch": 4.837321914518969,
"grad_norm": 5.677992820739746,
"learning_rate": 2.581344045141668e-05,
"loss": 1.0195,
"num_input_tokens_seen": 263545768,
"step": 483500
},
{
"epoch": 4.842324315671522,
"grad_norm": 4.935763835906982,
"learning_rate": 2.578842844565392e-05,
"loss": 1.0311,
"num_input_tokens_seen": 263822840,
"step": 484000
},
{
"epoch": 4.8473267168240755,
"grad_norm": 5.072977542877197,
"learning_rate": 2.5763416439891147e-05,
"loss": 1.0246,
"num_input_tokens_seen": 264100088,
"step": 484500
},
{
"epoch": 4.852329117976629,
"grad_norm": 6.382875442504883,
"learning_rate": 2.5738404434128382e-05,
"loss": 1.0211,
"num_input_tokens_seen": 264373424,
"step": 485000
},
{
"epoch": 4.857331519129182,
"grad_norm": 5.98667049407959,
"learning_rate": 2.5713392428365617e-05,
"loss": 1.0457,
"num_input_tokens_seen": 264652616,
"step": 485500
},
{
"epoch": 4.862333920281735,
"grad_norm": 5.859986782073975,
"learning_rate": 2.568838042260285e-05,
"loss": 1.0291,
"num_input_tokens_seen": 264919240,
"step": 486000
},
{
"epoch": 4.867336321434289,
"grad_norm": 5.3083271980285645,
"learning_rate": 2.5663368416840083e-05,
"loss": 1.024,
"num_input_tokens_seen": 265186880,
"step": 486500
},
{
"epoch": 4.872338722586842,
"grad_norm": 5.720509052276611,
"learning_rate": 2.563835641107732e-05,
"loss": 1.0329,
"num_input_tokens_seen": 265456584,
"step": 487000
},
{
"epoch": 4.877341123739395,
"grad_norm": 5.909444332122803,
"learning_rate": 2.5613344405314553e-05,
"loss": 1.0205,
"num_input_tokens_seen": 265720768,
"step": 487500
},
{
"epoch": 4.882343524891948,
"grad_norm": 4.779830455780029,
"learning_rate": 2.5588332399551785e-05,
"loss": 1.0226,
"num_input_tokens_seen": 265991224,
"step": 488000
},
{
"epoch": 4.887345926044501,
"grad_norm": 5.503864765167236,
"learning_rate": 2.556332039378902e-05,
"loss": 1.0299,
"num_input_tokens_seen": 266260752,
"step": 488500
},
{
"epoch": 4.8923483271970545,
"grad_norm": 6.2289581298828125,
"learning_rate": 2.5538308388026255e-05,
"loss": 1.0165,
"num_input_tokens_seen": 266538032,
"step": 489000
},
{
"epoch": 4.897350728349608,
"grad_norm": 6.163370132446289,
"learning_rate": 2.5513296382263486e-05,
"loss": 1.0273,
"num_input_tokens_seen": 266812688,
"step": 489500
},
{
"epoch": 4.902353129502161,
"grad_norm": 5.308876991271973,
"learning_rate": 2.548828437650072e-05,
"loss": 1.0302,
"num_input_tokens_seen": 267083632,
"step": 490000
},
{
"epoch": 4.907355530654714,
"grad_norm": 6.824766635894775,
"learning_rate": 2.5463272370737956e-05,
"loss": 1.0287,
"num_input_tokens_seen": 267354032,
"step": 490500
},
{
"epoch": 4.912357931807268,
"grad_norm": 5.9447102546691895,
"learning_rate": 2.543826036497519e-05,
"loss": 1.0552,
"num_input_tokens_seen": 267626432,
"step": 491000
},
{
"epoch": 4.917360332959821,
"grad_norm": 5.845020771026611,
"learning_rate": 2.5413248359212423e-05,
"loss": 1.0136,
"num_input_tokens_seen": 267900520,
"step": 491500
},
{
"epoch": 4.922362734112374,
"grad_norm": 5.4116082191467285,
"learning_rate": 2.5388236353449658e-05,
"loss": 1.0163,
"num_input_tokens_seen": 268165832,
"step": 492000
},
{
"epoch": 4.927365135264927,
"grad_norm": 7.0753326416015625,
"learning_rate": 2.5363224347686893e-05,
"loss": 1.0393,
"num_input_tokens_seen": 268441848,
"step": 492500
},
{
"epoch": 4.93236753641748,
"grad_norm": 7.350298881530762,
"learning_rate": 2.533821234192412e-05,
"loss": 1.0463,
"num_input_tokens_seen": 268716696,
"step": 493000
},
{
"epoch": 4.9373699375700335,
"grad_norm": 5.284552574157715,
"learning_rate": 2.531320033616136e-05,
"loss": 1.0311,
"num_input_tokens_seen": 268988968,
"step": 493500
},
{
"epoch": 4.942372338722587,
"grad_norm": 6.068382740020752,
"learning_rate": 2.5288188330398594e-05,
"loss": 1.0379,
"num_input_tokens_seen": 269264776,
"step": 494000
},
{
"epoch": 4.94737473987514,
"grad_norm": 6.806668281555176,
"learning_rate": 2.526317632463583e-05,
"loss": 1.0403,
"num_input_tokens_seen": 269535320,
"step": 494500
},
{
"epoch": 4.952377141027693,
"grad_norm": 5.127531051635742,
"learning_rate": 2.5238164318873058e-05,
"loss": 1.0269,
"num_input_tokens_seen": 269805928,
"step": 495000
},
{
"epoch": 4.957379542180247,
"grad_norm": 4.772179126739502,
"learning_rate": 2.5213152313110293e-05,
"loss": 1.0251,
"num_input_tokens_seen": 270085064,
"step": 495500
},
{
"epoch": 4.9623819433328,
"grad_norm": 7.208611011505127,
"learning_rate": 2.518814030734753e-05,
"loss": 1.0134,
"num_input_tokens_seen": 270359424,
"step": 496000
},
{
"epoch": 4.967384344485353,
"grad_norm": 5.730184555053711,
"learning_rate": 2.516312830158476e-05,
"loss": 1.0226,
"num_input_tokens_seen": 270634264,
"step": 496500
},
{
"epoch": 4.972386745637906,
"grad_norm": 5.047354698181152,
"learning_rate": 2.5138116295821994e-05,
"loss": 1.034,
"num_input_tokens_seen": 270902440,
"step": 497000
},
{
"epoch": 4.977389146790459,
"grad_norm": 4.870574951171875,
"learning_rate": 2.511310429005923e-05,
"loss": 1.0434,
"num_input_tokens_seen": 271179104,
"step": 497500
},
{
"epoch": 4.9823915479430125,
"grad_norm": 5.616664409637451,
"learning_rate": 2.5088092284296467e-05,
"loss": 1.0236,
"num_input_tokens_seen": 271451512,
"step": 498000
},
{
"epoch": 4.987393949095566,
"grad_norm": 4.676699638366699,
"learning_rate": 2.5063080278533696e-05,
"loss": 1.0464,
"num_input_tokens_seen": 271724472,
"step": 498500
},
{
"epoch": 4.992396350248119,
"grad_norm": 5.634840965270996,
"learning_rate": 2.503806827277093e-05,
"loss": 1.0291,
"num_input_tokens_seen": 271995512,
"step": 499000
},
{
"epoch": 4.997398751400672,
"grad_norm": 6.081726551055908,
"learning_rate": 2.5013056267008166e-05,
"loss": 1.0243,
"num_input_tokens_seen": 272263560,
"step": 499500
},
{
"epoch": 5.0,
"eval_loss": 1.0584163665771484,
"eval_runtime": 192.7527,
"eval_samples_per_second": 1037.106,
"eval_steps_per_second": 129.643,
"num_input_tokens_seen": 272407288,
"step": 499760
},
{
"epoch": 5.0,
"num_input_tokens_seen": 272407288,
"step": 499760,
"total_flos": 7.214188795055309e+16,
"train_loss": 0.0,
"train_runtime": 0.0544,
"train_samples_per_second": 73478382.327,
"train_steps_per_second": 9184797.791,
"train_tokens_per_second": 5003948262.574
}
],
"logging_steps": 500,
"max_steps": 499760,
"num_input_tokens_seen": 272407288,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 7.214188795055309e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}