diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,23948 +1,16908 @@ { - "best_metric": 0.207763671875, - "best_model_checkpoint": "./results_morgangen_auto/checkpoint-340000", - "epoch": 0.0034, + "best_metric": 0.239013671875, + "best_model_checkpoint": "./results_morgangen_auto/checkpoint-240000", + "epoch": 0.0024, "eval_steps": 20000, - "global_step": 340000, + "global_step": 240000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1e-06, - "grad_norm": 9.253414154052734, + "grad_norm": 7.192080497741699, "learning_rate": 4.944309013222119e-06, - "loss": 3.5017, + "loss": 3.578, "step": 100 }, { "epoch": 2e-06, - "grad_norm": 4.6249518394470215, + "grad_norm": 5.5353922843933105, "learning_rate": 5.725086528406295e-06, - "loss": 1.8507, + "loss": 1.9606, "step": 200 }, { "epoch": 3e-06, - "grad_norm": 5.4548749923706055, + "grad_norm": 6.314189910888672, "learning_rate": 6.1745550399454076e-06, - "loss": 1.4679, + "loss": 1.5525, "step": 300 }, { "epoch": 4e-06, - "grad_norm": 4.865733623504639, + "grad_norm": 4.842369556427002, "learning_rate": 6.4914927390661495e-06, - "loss": 1.3112, + "loss": 1.3692, "step": 400 }, { "epoch": 5e-06, - "grad_norm": 6.375494956970215, + "grad_norm": 6.878078460693359, "learning_rate": 6.736512997333922e-06, - "loss": 1.2063, + "loss": 1.2631, "step": 500 }, { "epoch": 6e-06, - "grad_norm": 4.774768829345703, + "grad_norm": 5.2255377769470215, "learning_rate": 6.936292414321374e-06, - "loss": 1.1465, + "loss": 1.2033, "step": 600 }, { "epoch": 7e-06, - "grad_norm": 5.399687767028809, + "grad_norm": 6.255804538726807, "learning_rate": 7.104962011475284e-06, - "loss": 1.114, + "loss": 1.176, "step": 700 }, { "epoch": 8e-06, - "grad_norm": 4.800980091094971, + "grad_norm": 5.792397499084473, "learning_rate": 7.250917821641176e-06, - "loss": 1.0785, + "loss": 1.1391, "step": 800 }, { "epoch": 9e-06, - "grad_norm": 4.668561935424805, + "grad_norm": 6.366394996643066, "learning_rate": 7.37955758828978e-06, - "loss": 1.0406, + "loss": 1.0937, "step": 900 }, { "epoch": 1e-05, - "grad_norm": 5.257966995239258, + "grad_norm": 5.171053886413574, "learning_rate": 7.494557701864313e-06, - "loss": 1.0062, + "loss": 1.0665, "step": 1000 }, { "epoch": 1.1e-05, - "grad_norm": 5.205653190612793, + "grad_norm": 5.637969017028809, "learning_rate": 7.598535297940343e-06, - "loss": 0.9969, + "loss": 1.0483, "step": 1100 }, { "epoch": 1.2e-05, - "grad_norm": 4.900371551513672, + "grad_norm": 5.52170467376709, "learning_rate": 7.69341976321039e-06, - "loss": 0.9913, + "loss": 1.0372, "step": 1200 }, { "epoch": 1.3e-05, - "grad_norm": 5.894355297088623, + "grad_norm": 4.622586727142334, "learning_rate": 7.780674421043177e-06, - "loss": 0.9586, + "loss": 1.0008, "step": 1300 }, { "epoch": 1.4e-05, - "grad_norm": 4.883889675140381, + "grad_norm": 4.899344444274902, "learning_rate": 7.86143551902404e-06, - "loss": 0.9411, + "loss": 0.9829, "step": 1400 }, { "epoch": 1.5e-05, - "grad_norm": 5.232024192810059, + "grad_norm": 4.919180870056152, "learning_rate": 7.936602981651121e-06, - "loss": 0.9483, + "loss": 0.9821, "step": 1500 }, { "epoch": 1.6e-05, - "grad_norm": 4.761655330657959, + "grad_norm": 5.980068206787109, "learning_rate": 8.006901718483e-06, - "loss": 0.9174, + "loss": 0.9506, "step": 1600 }, { "epoch": 1.7e-05, - "grad_norm": 5.541046142578125, + "grad_norm": 5.955260276794434, "learning_rate": 8.072924256347751e-06, - "loss": 0.9088, + "loss": 0.9382, "step": 1700 }, { "epoch": 1.8e-05, - "grad_norm": 4.936319351196289, + "grad_norm": 4.828701019287109, "learning_rate": 8.135161132285844e-06, - "loss": 0.8962, + "loss": 0.923, "step": 1800 }, { "epoch": 1.9e-05, - "grad_norm": 7.001105308532715, + "grad_norm": 5.517646789550781, "learning_rate": 8.194023035760226e-06, - "loss": 0.8771, + "loss": 0.9019, "step": 1900 }, { "epoch": 2e-05, - "grad_norm": 4.812566757202148, + "grad_norm": 4.49784517288208, "learning_rate": 8.249857250056917e-06, - "loss": 0.8826, + "loss": 0.9013, "step": 2000 }, { "epoch": 2.1e-05, - "grad_norm": 5.578819274902344, + "grad_norm": 4.274550914764404, "learning_rate": 8.302960068255784e-06, - "loss": 0.8795, + "loss": 0.9038, "step": 2100 }, { "epoch": 2.2e-05, - "grad_norm": 4.541772365570068, - "learning_rate": 8.35309155809673e-06, - "loss": 0.8837, + "grad_norm": 5.395852565765381, + "learning_rate": 8.35358631144535e-06, + "loss": 0.9049, "step": 2200 }, { "epoch": 2.3e-05, - "grad_norm": 4.923720359802246, - "learning_rate": 8.401483533913121e-06, - "loss": 0.8464, + "grad_norm": 4.8221611976623535, + "learning_rate": 8.4019567246832e-06, + "loss": 0.8675, "step": 2300 }, { "epoch": 2.4e-05, - "grad_norm": 4.408301830291748, - "learning_rate": 8.44781036517598e-06, - "loss": 0.8532, + "grad_norm": 4.574971675872803, + "learning_rate": 8.448263794376455e-06, + "loss": 0.8743, "step": 2400 }, { "epoch": 2.5e-05, - "grad_norm": 4.526201248168945, - "learning_rate": 8.49224112285631e-06, - "loss": 0.8293, + "grad_norm": 4.795435905456543, + "learning_rate": 8.492676374898522e-06, + "loss": 0.8473, "step": 2500 }, { "epoch": 2.6e-05, - "grad_norm": 4.34307861328125, - "learning_rate": 8.534924929370153e-06, - "loss": 0.8343, + "grad_norm": 4.126605987548828, + "learning_rate": 8.535343405461191e-06, + "loss": 0.8553, "step": 2600 }, { "epoch": 2.7e-05, - "grad_norm": 4.171513080596924, - "learning_rate": 8.575993978467417e-06, - "loss": 0.8327, + "grad_norm": 4.7121992111206055, + "learning_rate": 8.576396923806893e-06, + "loss": 0.8496, "step": 2700 }, { "epoch": 2.8e-05, - "grad_norm": 4.572521209716797, - "learning_rate": 8.615566004445406e-06, - "loss": 0.8176, + "grad_norm": 4.441779136657715, + "learning_rate": 8.615954530556104e-06, + "loss": 0.8336, "step": 2800 }, { "epoch": 2.9e-05, - "grad_norm": 5.0731282234191895, - "learning_rate": 8.653746316957546e-06, - "loss": 0.8127, + "grad_norm": 4.51931095123291, + "learning_rate": 8.654121420158637e-06, + "loss": 0.8287, "step": 2900 }, { "epoch": 3e-05, - "grad_norm": 3.7510156631469727, - "learning_rate": 8.690629490017583e-06, - "loss": 0.8015, + "grad_norm": 4.081902980804443, + "learning_rate": 8.690992066813325e-06, + "loss": 0.8168, "step": 3000 }, { "epoch": 3.1e-05, - "grad_norm": 4.673043727874756, - "learning_rate": 8.726300773408372e-06, - "loss": 0.7993, + "grad_norm": 4.671773910522461, + "learning_rate": 8.726651633390342e-06, + "loss": 0.8166, "step": 3100 }, { "epoch": 3.2e-05, - "grad_norm": 4.637227535247803, + "grad_norm": 4.348089694976807, "learning_rate": 8.76083727950616e-06, - "loss": 0.7924, + "loss": 0.81, "step": 3200 }, { "epoch": 3.3e-05, - "grad_norm": 4.200488090515137, - "learning_rate": 8.793979326779768e-06, - "loss": 0.7842, + "grad_norm": 4.5769829750061035, + "learning_rate": 8.794308987084338e-06, + "loss": 0.7987, "step": 3300 }, { "epoch": 3.4e-05, - "grad_norm": 4.710316181182861, - "learning_rate": 8.82613960896169e-06, - "loss": 0.7887, + "grad_norm": 4.201402187347412, + "learning_rate": 8.826459649112795e-06, + "loss": 0.8051, "step": 3400 }, { "epoch": 3.5e-05, - "grad_norm": 5.1877241134643555, - "learning_rate": 8.85768558758383e-06, - "loss": 0.7941, + "grad_norm": 4.153467655181885, + "learning_rate": 8.857996464094115e-06, + "loss": 0.8102, "step": 3500 }, { "epoch": 3.6e-05, - "grad_norm": 4.330559730529785, - "learning_rate": 8.888340819988166e-06, - "loss": 0.7708, + "grad_norm": 4.17486047744751, + "learning_rate": 8.888643043011622e-06, + "loss": 0.7847, "step": 3600 }, { "epoch": 3.7e-05, - "grad_norm": 3.8950893878936768, - "learning_rate": 8.918154230884686e-06, - "loss": 0.7704, + "grad_norm": 4.006905555725098, + "learning_rate": 8.918448269127447e-06, + "loss": 0.7765, "step": 3700 }, { "epoch": 3.8e-05, - "grad_norm": 3.9400832653045654, + "grad_norm": 3.959653377532959, "learning_rate": 8.947170821665072e-06, - "loss": 0.7525, + "loss": 0.7704, "step": 3800 }, { "epoch": 3.9e-05, - "grad_norm": 5.068721771240234, + "grad_norm": 5.123909950256348, "learning_rate": 8.975432078990786e-06, - "loss": 0.7562, + "loss": 0.7671, "step": 3900 }, { "epoch": 4e-05, - "grad_norm": 3.934849739074707, + "grad_norm": 3.873758316040039, "learning_rate": 9.002976331538332e-06, - "loss": 0.7452, + "loss": 0.7567, "step": 4000 }, { "epoch": 4.1e-05, - "grad_norm": 4.313928127288818, + "grad_norm": 4.340254783630371, "learning_rate": 9.029839062600307e-06, - "loss": 0.7298, + "loss": 0.7416, "step": 4100 }, { "epoch": 4.2e-05, - "grad_norm": 4.175806522369385, + "grad_norm": 4.056208610534668, "learning_rate": 9.056053184939176e-06, - "loss": 0.7293, + "loss": 0.7445, "step": 4200 }, { "epoch": 4.3e-05, - "grad_norm": 3.6168248653411865, + "grad_norm": 3.6428372859954834, "learning_rate": 9.081649283234784e-06, - "loss": 0.7327, + "loss": 0.7456, "step": 4300 }, { "epoch": 4.4e-05, - "grad_norm": 3.3163771629333496, + "grad_norm": 3.506011486053467, "learning_rate": 9.106655828605087e-06, - "loss": 0.7247, + "loss": 0.7342, "step": 4400 }, { "epoch": 4.5e-05, - "grad_norm": 3.9507524967193604, + "grad_norm": 3.7343637943267822, "learning_rate": 9.13109936897355e-06, - "loss": 0.7339, + "loss": 0.7435, "step": 4500 }, { "epoch": 4.6e-05, - "grad_norm": 3.4026217460632324, - "learning_rate": 9.154531713843154e-06, - "loss": 0.7225, + "grad_norm": 3.65120267868042, + "learning_rate": 9.155004698474792e-06, + "loss": 0.7346, "step": 4600 }, { "epoch": 4.7e-05, - "grad_norm": 3.5905020236968994, - "learning_rate": 9.177932106787708e-06, - "loss": 0.7292, + "grad_norm": 3.66025710105896, + "learning_rate": 9.17839500860873e-06, + "loss": 0.7394, "step": 4700 }, { "epoch": 4.8e-05, - "grad_norm": 4.0042338371276855, - "learning_rate": 9.200838783536409e-06, - "loss": 0.7045, + "grad_norm": 3.7857818603515625, + "learning_rate": 9.201292023453135e-06, + "loss": 0.7137, "step": 4800 }, { "epoch": 4.9e-05, - "grad_norm": 3.7306625843048096, - "learning_rate": 9.223272147809049e-06, - "loss": 0.7094, + "grad_norm": 4.2525858879089355, + "learning_rate": 9.22371612091062e-06, + "loss": 0.7179, "step": 4900 }, { "epoch": 5e-05, - "grad_norm": 3.755902051925659, - "learning_rate": 9.245251364058474e-06, - "loss": 0.7081, + "grad_norm": 3.6449532508850098, + "learning_rate": 9.245686441685918e-06, + "loss": 0.7149, "step": 5000 }, { "epoch": 5.1e-05, - "grad_norm": 5.053667068481445, - "learning_rate": 9.266794455841898e-06, - "loss": 0.716, + "grad_norm": 4.4344401359558105, + "learning_rate": 9.267220987454044e-06, + "loss": 0.7209, "step": 5100 }, { "epoch": 5.2e-05, - "grad_norm": 3.5010838508605957, - "learning_rate": 9.287918394621145e-06, - "loss": 0.7001, + "grad_norm": 3.7667882442474365, + "learning_rate": 9.28833670948078e-06, + "loss": 0.7104, "step": 5200 }, { "epoch": 5.3e-05, - "grad_norm": 3.770101308822632, - "learning_rate": 9.308639180087962e-06, - "loss": 0.6849, + "grad_norm": 3.8509140014648438, + "learning_rate": 9.309049588788657e-06, + "loss": 0.6939, "step": 5300 }, { "epoch": 5.4e-05, - "grad_norm": 4.221272945404053, - "learning_rate": 9.328971912966845e-06, - "loss": 0.6872, + "grad_norm": 4.200638294219971, + "learning_rate": 9.329374708818158e-06, + "loss": 0.6934, "step": 5400 }, { "epoch": 5.5e-05, - "grad_norm": 3.445298194885254, - "learning_rate": 9.348930861125227e-06, - "loss": 0.6746, + "grad_norm": 3.6815011501312256, + "learning_rate": 9.349326321411793e-06, + "loss": 0.6841, "step": 5500 }, { "epoch": 5.6e-05, - "grad_norm": 3.6341142654418945, - "learning_rate": 9.368529519716058e-06, - "loss": 0.7058, + "grad_norm": 4.6741719245910645, + "learning_rate": 9.368917906844062e-06, + "loss": 0.7124, "step": 5600 }, { "epoch": 5.7e-05, - "grad_norm": 3.2344930171966553, - "learning_rate": 9.387780665987677e-06, - "loss": 0.6711, + "grad_norm": 3.726712942123413, + "learning_rate": 9.388162228530614e-06, + "loss": 0.6749, "step": 5700 }, { "epoch": 5.8e-05, - "grad_norm": 3.412182092666626, - "learning_rate": 9.406508873917008e-06, - "loss": 0.7011, + "grad_norm": 3.2452657222747803, + "learning_rate": 9.407071382972726e-06, + "loss": 0.7073, "step": 5800 }, { "epoch": 5.9e-05, - "grad_norm": 3.751844882965088, - "learning_rate": 9.42510388579917e-06, - "loss": 0.6751, + "grad_norm": 3.7711005210876465, + "learning_rate": 9.425656845426483e-06, + "loss": 0.6816, "step": 5900 }, { "epoch": 6e-05, - "grad_norm": 3.3651347160339355, - "learning_rate": 9.443385782710523e-06, - "loss": 0.6671, + "grad_norm": 3.617072105407715, + "learning_rate": 9.443929511728523e-06, + "loss": 0.6788, "step": 6000 }, { "epoch": 6.1e-05, - "grad_norm": 3.4214398860931396, - "learning_rate": 9.46118660940629e-06, - "loss": 0.6639, + "grad_norm": 3.677022933959961, + "learning_rate": 9.461721498753552e-06, + "loss": 0.6707, "step": 6100 }, { "epoch": 6.2e-05, - "grad_norm": 3.7042906284332275, - "learning_rate": 9.478875762568324e-06, - "loss": 0.6621, + "grad_norm": 3.563831329345703, + "learning_rate": 9.479402010032261e-06, + "loss": 0.6688, "step": 6200 }, { "epoch": 6.3e-05, - "grad_norm": 3.310945749282837, - "learning_rate": 9.496281332578378e-06, - "loss": 0.6589, + "grad_norm": 5.436884880065918, + "learning_rate": 9.496799212962515e-06, + "loss": 0.6667, "step": 6300 }, { "epoch": 6.4e-05, - "grad_norm": 3.7326412200927734, - "learning_rate": 9.513412268666114e-06, - "loss": 0.6589, + "grad_norm": 3.69850754737854, + "learning_rate": 9.51392204387139e-06, + "loss": 0.665, "step": 6400 }, { "epoch": 6.5e-05, - "grad_norm": 3.398975372314453, - "learning_rate": 9.53027710300775e-06, - "loss": 0.6591, + "grad_norm": 4.021072864532471, + "learning_rate": 9.530779022827808e-06, + "loss": 0.6652, "step": 6500 }, { "epoch": 6.6e-05, - "grad_norm": 3.4606621265411377, - "learning_rate": 9.546883976244291e-06, - "loss": 0.6531, + "grad_norm": 3.9425106048583984, + "learning_rate": 9.547378279100432e-06, + "loss": 0.6593, "step": 6600 }, { "epoch": 6.7e-05, - "grad_norm": 3.2762691974639893, - "learning_rate": 9.5632406610774e-06, - "loss": 0.6448, + "grad_norm": 4.435314655303955, + "learning_rate": 9.563727574698575e-06, + "loss": 0.6521, "step": 6700 }, { "epoch": 6.8e-05, - "grad_norm": 3.4132208824157715, - "learning_rate": 9.579354584114094e-06, - "loss": 0.6505, + "grad_norm": 3.426171064376831, + "learning_rate": 9.579674435701252e-06, + "loss": 0.6592, "step": 6800 }, { "epoch": 6.9e-05, - "grad_norm": 3.1645874977111816, - "learning_rate": 9.595232846113996e-06, - "loss": 0.6533, + "grad_norm": 3.2786061763763428, + "learning_rate": 9.595548054769063e-06, + "loss": 0.659, "step": 6900 }, { "epoch": 7e-05, - "grad_norm": 4.370247840881348, - "learning_rate": 9.610882240777232e-06, - "loss": 0.6371, + "grad_norm": 3.6172714233398438, + "learning_rate": 9.611192939364202e-06, + "loss": 0.6425, "step": 7000 }, { "epoch": 7.1e-05, - "grad_norm": 3.2068285942077637, - "learning_rate": 9.626309272197486e-06, - "loss": 0.6394, + "grad_norm": 3.1984446048736572, + "learning_rate": 9.626615587957666e-06, + "loss": 0.6476, "step": 7100 }, { "epoch": 7.2e-05, - "grad_norm": 3.303248405456543, - "learning_rate": 9.64152017109234e-06, - "loss": 0.6337, + "grad_norm": 4.468976974487305, + "learning_rate": 9.641822225957206e-06, + "loss": 0.6394, "step": 7200 }, { "epoch": 7.3e-05, - "grad_norm": 3.2665939331054688, - "learning_rate": 9.656520909912353e-06, - "loss": 0.6237, + "grad_norm": 3.582566261291504, + "learning_rate": 9.656818820794936e-06, + "loss": 0.6312, "step": 7300 }, { "epoch": 7.4e-05, - "grad_norm": 4.2720465660095215, - "learning_rate": 9.671170247552466e-06, - "loss": 0.6256, + "grad_norm": 3.2956085205078125, + "learning_rate": 9.671611095987065e-06, + "loss": 0.6349, "step": 7400 }, { "epoch": 7.5e-05, - "grad_norm": 3.540658950805664, - "learning_rate": 9.685769582820098e-06, - "loss": 0.6303, + "grad_norm": 3.747368574142456, + "learning_rate": 9.686204544248665e-06, + "loss": 0.641, "step": 7500 }, { "epoch": 7.6e-05, - "grad_norm": 3.2969183921813965, - "learning_rate": 9.700175210160166e-06, - "loss": 0.6277, + "grad_norm": 3.5763564109802246, + "learning_rate": 9.7006044397387e-06, + "loss": 0.635, "step": 7600 }, { "epoch": 7.7e-05, - "grad_norm": 3.537332534790039, - "learning_rate": 9.71439220267317e-06, - "loss": 0.6139, + "grad_norm": 3.662113666534424, + "learning_rate": 9.714815849503578e-06, + "loss": 0.6216, "step": 7700 }, { "epoch": 7.8e-05, - "grad_norm": 3.5279369354248047, - "learning_rate": 9.728425436740369e-06, - "loss": 0.6304, + "grad_norm": 4.00654935836792, + "learning_rate": 9.728843644181411e-06, + "loss": 0.6385, "step": 7800 }, { "epoch": 7.9e-05, - "grad_norm": 2.9212756156921387, - "learning_rate": 9.74214193184301e-06, - "loss": 0.6162, + "grad_norm": 3.1744225025177, + "learning_rate": 9.74269250802355e-06, + "loss": 0.6225, "step": 7900 }, { "epoch": 8e-05, - "grad_norm": 2.9494481086730957, - "learning_rate": 9.755823264645797e-06, - "loss": 0.6174, + "grad_norm": 2.9433505535125732, + "learning_rate": 9.756366948284976e-06, + "loss": 0.6218, "step": 8000 }, { "epoch": 8.1e-05, - "grad_norm": 3.4161007404327393, - "learning_rate": 9.769334342494138e-06, - "loss": 0.6108, + "grad_norm": 3.351646661758423, + "learning_rate": 9.76987130403068e-06, + "loss": 0.6156, "step": 8100 }, { "epoch": 8.2e-05, - "grad_norm": 3.189723491668701, - "learning_rate": 9.782679350773126e-06, - "loss": 0.6062, + "grad_norm": 3.297624111175537, + "learning_rate": 9.783209754401046e-06, + "loss": 0.6105, "step": 8200 }, { "epoch": 8.3e-05, - "grad_norm": 3.303201913833618, - "learning_rate": 9.795862322405686e-06, - "loss": 0.6026, + "grad_norm": 3.242459535598755, + "learning_rate": 9.796386326375682e-06, + "loss": 0.6121, "step": 8300 }, { "epoch": 8.4e-05, - "grad_norm": 3.1189773082733154, - "learning_rate": 9.808887145168872e-06, - "loss": 0.6038, + "grad_norm": 3.301387071609497, + "learning_rate": 9.80940490207175e-06, + "loss": 0.6103, "step": 8400 }, { "epoch": 8.5e-05, - "grad_norm": 3.5268099308013916, - "learning_rate": 9.821757568576492e-06, - "loss": 0.597, + "grad_norm": 3.537336587905884, + "learning_rate": 9.822269225609881e-06, + "loss": 0.5999, "step": 8500 }, { "epoch": 8.6e-05, - "grad_norm": 3.629392147064209, - "learning_rate": 9.83447721035855e-06, - "loss": 0.6093, + "grad_norm": 3.297769784927368, + "learning_rate": 9.834856506853153e-06, + "loss": 0.6137, "step": 8600 }, { "epoch": 8.7e-05, - "grad_norm": 3.499454975128174, - "learning_rate": 9.847049562565526e-06, - "loss": 0.5941, + "grad_norm": 3.5436480045318604, + "learning_rate": 9.847424493057225e-06, + "loss": 0.5982, "step": 8700 }, { "epoch": 8.8e-05, - "grad_norm": 3.21406888961792, - "learning_rate": 9.859477997323253e-06, - "loss": 0.5903, + "grad_norm": 3.3553879261016846, + "learning_rate": 9.85984866118054e-06, + "loss": 0.5987, "step": 8800 }, { "epoch": 8.9e-05, - "grad_norm": 2.9815945625305176, - "learning_rate": 9.87176577226213e-06, - "loss": 0.5899, + "grad_norm": 2.8393940925598145, + "learning_rate": 9.872010114832027e-06, + "loss": 0.5954, "step": 8900 }, { "epoch": 9e-05, - "grad_norm": 3.250000476837158, - "learning_rate": 9.883916035642515e-06, - "loss": 0.6035, + "grad_norm": 3.286961317062378, + "learning_rate": 9.884157659367727e-06, + "loss": 0.6128, "step": 9000 }, { "epoch": 9.1e-05, - "grad_norm": 2.9647767543792725, - "learning_rate": 9.895931831196517e-06, - "loss": 0.5921, + "grad_norm": 3.427604913711548, + "learning_rate": 9.896170795917358e-06, + "loss": 0.5997, "step": 9100 }, { "epoch": 9.2e-05, - "grad_norm": 3.093561887741089, - "learning_rate": 9.907816102704761e-06, - "loss": 0.5849, + "grad_norm": 4.3479323387146, + "learning_rate": 9.908052466307471e-06, + "loss": 0.592, "step": 9200 }, { "epoch": 9.3e-05, - "grad_norm": 3.159992218017578, - "learning_rate": 9.919454770189498e-06, - "loss": 0.5943, + "grad_norm": 3.0680465698242188, + "learning_rate": 9.919805516826294e-06, + "loss": 0.5973, "step": 9300 }, { "epoch": 9.4e-05, - "grad_norm": 3.5264203548431396, - "learning_rate": 9.931085692393413e-06, - "loss": 0.5915, + "grad_norm": 3.6079111099243164, + "learning_rate": 9.931432702316388e-06, + "loss": 0.5995, "step": 9400 }, { "epoch": 9.5e-05, - "grad_norm": 3.3103344440460205, - "learning_rate": 9.942593338061703e-06, - "loss": 0.5852, + "grad_norm": 3.1534416675567627, + "learning_rate": 9.942936690050469e-06, + "loss": 0.5899, "step": 9500 }, { "epoch": 9.6e-05, - "grad_norm": 2.922833204269409, - "learning_rate": 9.953980293035202e-06, - "loss": 0.5894, + "grad_norm": 2.7021210193634033, + "learning_rate": 9.95432006340404e-06, + "loss": 0.5963, "step": 9600 }, { "epoch": 9.7e-05, - "grad_norm": 2.8479816913604736, - "learning_rate": 9.965249062637824e-06, - "loss": 0.5865, + "grad_norm": 2.8992557525634766, + "learning_rate": 9.965585325337488e-06, + "loss": 0.5919, "step": 9700 }, { "epoch": 9.8e-05, - "grad_norm": 3.1860899925231934, - "learning_rate": 9.976402074985049e-06, - "loss": 0.5791, + "grad_norm": 3.391969680786133, + "learning_rate": 9.976734901699378e-06, + "loss": 0.5814, "step": 9800 }, { "epoch": 9.9e-05, - "grad_norm": 2.8601717948913574, - "learning_rate": 9.987441684124228e-06, - "loss": 0.5645, + "grad_norm": 2.826235771179199, + "learning_rate": 9.987771144361851e-06, + "loss": 0.5675, "step": 9900 }, { "epoch": 0.0001, - "grad_norm": 3.013867139816284, - "learning_rate": 9.998370173016803e-06, - "loss": 0.5825, + "grad_norm": 3.625103235244751, + "learning_rate": 9.998696334198274e-06, + "loss": 0.589, "step": 10000 }, { "epoch": 0.000101, - "grad_norm": 3.2318782806396484, + "grad_norm": 3.538278341293335, "learning_rate": 1e-05, - "loss": 0.5766, + "loss": 0.5863, "step": 10100 }, { "epoch": 0.000102, - "grad_norm": 3.5614089965820312, + "grad_norm": 3.2610013484954834, "learning_rate": 1e-05, - "loss": 0.5793, + "loss": 0.5853, "step": 10200 }, { "epoch": 0.000103, - "grad_norm": 3.036363124847412, + "grad_norm": 3.018453359603882, "learning_rate": 1e-05, - "loss": 0.573, + "loss": 0.581, "step": 10300 }, { "epoch": 0.000104, - "grad_norm": 2.984074831008911, + "grad_norm": 2.997459650039673, "learning_rate": 1e-05, - "loss": 0.572, + "loss": 0.5749, "step": 10400 }, { "epoch": 0.000105, - "grad_norm": 3.480567455291748, + "grad_norm": 3.1793456077575684, "learning_rate": 1e-05, - "loss": 0.558, + "loss": 0.5624, "step": 10500 }, { "epoch": 0.000106, - "grad_norm": 3.1974992752075195, + "grad_norm": 3.571202039718628, "learning_rate": 1e-05, - "loss": 0.5689, + "loss": 0.574, "step": 10600 }, { "epoch": 0.000107, - "grad_norm": 3.1212074756622314, + "grad_norm": 3.742325782775879, "learning_rate": 1e-05, - "loss": 0.5697, + "loss": 0.5736, "step": 10700 }, { "epoch": 0.000108, - "grad_norm": 3.080963134765625, + "grad_norm": 3.2514116764068604, "learning_rate": 1e-05, - "loss": 0.5567, + "loss": 0.5611, "step": 10800 }, { "epoch": 0.000109, - "grad_norm": 2.945516347885132, + "grad_norm": 3.0863165855407715, "learning_rate": 1e-05, - "loss": 0.5535, + "loss": 0.5564, "step": 10900 }, { "epoch": 0.00011, - "grad_norm": 3.322488307952881, + "grad_norm": 3.638606071472168, "learning_rate": 1e-05, - "loss": 0.565, + "loss": 0.5711, "step": 11000 }, { "epoch": 0.000111, - "grad_norm": 2.794234275817871, + "grad_norm": 2.756107807159424, "learning_rate": 1e-05, - "loss": 0.5696, + "loss": 0.5711, "step": 11100 }, { "epoch": 0.000112, - "grad_norm": 3.4487061500549316, + "grad_norm": 3.2166287899017334, "learning_rate": 1e-05, - "loss": 0.5714, + "loss": 0.5781, "step": 11200 }, { "epoch": 0.000113, - "grad_norm": 2.8122172355651855, + "grad_norm": 2.7138330936431885, "learning_rate": 1e-05, - "loss": 0.5527, + "loss": 0.5522, "step": 11300 }, { "epoch": 0.000114, - "grad_norm": 3.1679444313049316, + "grad_norm": 3.1922643184661865, "learning_rate": 1e-05, - "loss": 0.5644, + "loss": 0.5686, "step": 11400 }, { "epoch": 0.000115, - "grad_norm": 3.4525845050811768, + "grad_norm": 3.2307920455932617, "learning_rate": 1e-05, - "loss": 0.5575, + "loss": 0.5621, "step": 11500 }, { "epoch": 0.000116, - "grad_norm": 2.8256614208221436, + "grad_norm": 2.9271585941314697, "learning_rate": 1e-05, - "loss": 0.5517, + "loss": 0.554, "step": 11600 }, { "epoch": 0.000117, - "grad_norm": 3.273900270462036, + "grad_norm": 2.994710922241211, "learning_rate": 1e-05, - "loss": 0.5516, + "loss": 0.5564, "step": 11700 }, { "epoch": 0.000118, - "grad_norm": 3.2579128742218018, + "grad_norm": 3.0383167266845703, "learning_rate": 1e-05, - "loss": 0.5566, + "loss": 0.5584, "step": 11800 }, { "epoch": 0.000119, - "grad_norm": 2.8956398963928223, + "grad_norm": 2.635859489440918, "learning_rate": 1e-05, - "loss": 0.5596, + "loss": 0.5657, "step": 11900 }, { "epoch": 0.00012, - "grad_norm": 2.7935895919799805, + "grad_norm": 2.850497245788574, "learning_rate": 1e-05, - "loss": 0.5493, + "loss": 0.5538, "step": 12000 }, { "epoch": 0.000121, - "grad_norm": 2.8680708408355713, + "grad_norm": 3.060102701187134, "learning_rate": 1e-05, - "loss": 0.5605, + "loss": 0.5671, "step": 12100 }, { "epoch": 0.000122, - "grad_norm": 3.6382384300231934, + "grad_norm": 2.882080316543579, "learning_rate": 1e-05, - "loss": 0.5549, + "loss": 0.5563, "step": 12200 }, { "epoch": 0.000123, - "grad_norm": 3.093803882598877, + "grad_norm": 2.516627311706543, "learning_rate": 1e-05, - "loss": 0.5475, + "loss": 0.5499, "step": 12300 }, { "epoch": 0.000124, - "grad_norm": 2.935946226119995, + "grad_norm": 3.2647488117218018, "learning_rate": 1e-05, - "loss": 0.5501, + "loss": 0.5523, "step": 12400 }, { "epoch": 0.000125, - "grad_norm": 3.2868237495422363, + "grad_norm": 3.0820746421813965, "learning_rate": 1e-05, - "loss": 0.5537, + "loss": 0.5551, "step": 12500 }, { "epoch": 0.000126, - "grad_norm": 3.0379724502563477, + "grad_norm": 3.288663864135742, "learning_rate": 1e-05, - "loss": 0.5402, + "loss": 0.5421, "step": 12600 }, { "epoch": 0.000127, - "grad_norm": 2.967264175415039, + "grad_norm": 2.8991341590881348, "learning_rate": 1e-05, - "loss": 0.5386, + "loss": 0.5379, "step": 12700 }, { "epoch": 0.000128, - "grad_norm": 3.1353132724761963, + "grad_norm": 3.0275886058807373, "learning_rate": 1e-05, - "loss": 0.5535, + "loss": 0.5555, "step": 12800 }, { "epoch": 0.000129, - "grad_norm": 2.8035740852355957, + "grad_norm": 2.8435568809509277, "learning_rate": 1e-05, - "loss": 0.5442, + "loss": 0.5475, "step": 12900 }, { "epoch": 0.00013, - "grad_norm": 2.888056516647339, + "grad_norm": 3.5080063343048096, "learning_rate": 1e-05, - "loss": 0.5451, + "loss": 0.5464, "step": 13000 }, { "epoch": 0.000131, - "grad_norm": 3.296889543533325, + "grad_norm": 3.4270200729370117, "learning_rate": 1e-05, - "loss": 0.5448, + "loss": 0.5489, "step": 13100 }, { "epoch": 0.000132, - "grad_norm": 3.5425004959106445, + "grad_norm": 3.046891689300537, "learning_rate": 1e-05, - "loss": 0.5385, + "loss": 0.5427, "step": 13200 }, { "epoch": 0.000133, - "grad_norm": 2.9780685901641846, + "grad_norm": 2.9758501052856445, "learning_rate": 1e-05, - "loss": 0.5414, + "loss": 0.547, "step": 13300 }, { "epoch": 0.000134, - "grad_norm": 3.036210298538208, + "grad_norm": 2.984278917312622, "learning_rate": 1e-05, - "loss": 0.5392, + "loss": 0.5405, "step": 13400 }, { "epoch": 0.000135, - "grad_norm": 2.895752429962158, + "grad_norm": 2.7465741634368896, "learning_rate": 1e-05, - "loss": 0.541, + "loss": 0.5449, "step": 13500 }, { "epoch": 0.000136, - "grad_norm": 2.6211678981781006, + "grad_norm": 2.6770498752593994, "learning_rate": 1e-05, - "loss": 0.5255, + "loss": 0.5313, "step": 13600 }, { "epoch": 0.000137, - "grad_norm": 3.1706643104553223, + "grad_norm": 3.1820075511932373, "learning_rate": 1e-05, - "loss": 0.5404, + "loss": 0.5444, "step": 13700 }, { "epoch": 0.000138, - "grad_norm": 2.7353274822235107, + "grad_norm": 2.7672953605651855, "learning_rate": 1e-05, - "loss": 0.5343, + "loss": 0.5366, "step": 13800 }, { "epoch": 0.000139, - "grad_norm": 3.278226852416992, + "grad_norm": 3.25970458984375, "learning_rate": 1e-05, - "loss": 0.5276, + "loss": 0.5331, "step": 13900 }, { "epoch": 0.00014, - "grad_norm": 3.0016024112701416, + "grad_norm": 2.8809070587158203, "learning_rate": 1e-05, - "loss": 0.5307, + "loss": 0.5332, "step": 14000 }, { "epoch": 0.000141, - "grad_norm": 2.68979549407959, + "grad_norm": 2.7005808353424072, "learning_rate": 1e-05, - "loss": 0.5293, + "loss": 0.5335, "step": 14100 }, { "epoch": 0.000142, - "grad_norm": 3.298138380050659, + "grad_norm": 3.1988399028778076, "learning_rate": 1e-05, - "loss": 0.5351, + "loss": 0.533, "step": 14200 }, { "epoch": 0.000143, - "grad_norm": 3.21337890625, + "grad_norm": 2.7792532444000244, "learning_rate": 1e-05, - "loss": 0.5239, + "loss": 0.5277, "step": 14300 }, { "epoch": 0.000144, - "grad_norm": 2.945388078689575, + "grad_norm": 3.010068655014038, "learning_rate": 1e-05, - "loss": 0.5281, + "loss": 0.5314, "step": 14400 }, { "epoch": 0.000145, - "grad_norm": 3.1753902435302734, + "grad_norm": 3.3190596103668213, "learning_rate": 1e-05, - "loss": 0.5278, + "loss": 0.5308, "step": 14500 }, { "epoch": 0.000146, - "grad_norm": 4.220496654510498, + "grad_norm": 3.3294529914855957, "learning_rate": 1e-05, - "loss": 0.536, + "loss": 0.5369, "step": 14600 }, { "epoch": 0.000147, - "grad_norm": 3.2694389820098877, + "grad_norm": 3.0750784873962402, "learning_rate": 1e-05, - "loss": 0.537, + "loss": 0.5399, "step": 14700 }, { "epoch": 0.000148, - "grad_norm": 2.7159314155578613, + "grad_norm": 2.720137357711792, "learning_rate": 1e-05, - "loss": 0.5205, + "loss": 0.5255, "step": 14800 }, { "epoch": 0.000149, - "grad_norm": 3.7545464038848877, + "grad_norm": 3.3225038051605225, "learning_rate": 1e-05, - "loss": 0.5287, + "loss": 0.5324, "step": 14900 }, { "epoch": 0.00015, - "grad_norm": 2.9595139026641846, + "grad_norm": 2.890933036804199, "learning_rate": 1e-05, - "loss": 0.5331, + "loss": 0.5356, "step": 15000 }, { "epoch": 0.000151, - "grad_norm": 3.22045636177063, + "grad_norm": 3.118818998336792, "learning_rate": 1e-05, - "loss": 0.5182, + "loss": 0.5209, "step": 15100 }, { "epoch": 0.000152, - "grad_norm": 2.5562326908111572, + "grad_norm": 2.549999237060547, "learning_rate": 1e-05, - "loss": 0.5275, + "loss": 0.5327, "step": 15200 }, { "epoch": 0.000153, - "grad_norm": 3.0378758907318115, + "grad_norm": 2.8534693717956543, "learning_rate": 1e-05, - "loss": 0.5136, + "loss": 0.5161, "step": 15300 }, { "epoch": 0.000154, - "grad_norm": 2.9890029430389404, + "grad_norm": 3.0132813453674316, "learning_rate": 1e-05, - "loss": 0.5197, + "loss": 0.5225, "step": 15400 }, { "epoch": 0.000155, - "grad_norm": 3.1903433799743652, + "grad_norm": 2.934532880783081, "learning_rate": 1e-05, - "loss": 0.5169, + "loss": 0.5185, "step": 15500 }, { "epoch": 0.000156, - "grad_norm": 3.4840199947357178, + "grad_norm": 3.079315423965454, "learning_rate": 1e-05, - "loss": 0.5221, + "loss": 0.5228, "step": 15600 }, { "epoch": 0.000157, - "grad_norm": 2.8992013931274414, + "grad_norm": 2.9515817165374756, "learning_rate": 1e-05, - "loss": 0.5182, + "loss": 0.5183, "step": 15700 }, { "epoch": 0.000158, - "grad_norm": 2.9535939693450928, + "grad_norm": 2.9063994884490967, "learning_rate": 1e-05, - "loss": 0.518, + "loss": 0.5221, "step": 15800 }, { "epoch": 0.000159, - "grad_norm": 3.0388505458831787, + "grad_norm": 2.873257875442505, "learning_rate": 1e-05, - "loss": 0.5157, + "loss": 0.5184, "step": 15900 }, { "epoch": 0.00016, - "grad_norm": 2.8987889289855957, + "grad_norm": 3.2980196475982666, "learning_rate": 1e-05, - "loss": 0.52, + "loss": 0.5208, "step": 16000 }, { "epoch": 0.000161, - "grad_norm": 5.6647629737854, + "grad_norm": 3.3925833702087402, "learning_rate": 1e-05, - "loss": 0.5132, + "loss": 0.5165, "step": 16100 }, { "epoch": 0.000162, - "grad_norm": 2.775214195251465, + "grad_norm": 3.8521616458892822, "learning_rate": 1e-05, - "loss": 0.52, + "loss": 0.525, "step": 16200 }, { "epoch": 0.000163, - "grad_norm": 2.7975962162017822, + "grad_norm": 2.9561917781829834, "learning_rate": 1e-05, - "loss": 0.5217, + "loss": 0.5266, "step": 16300 }, { "epoch": 0.000164, - "grad_norm": 2.918041706085205, + "grad_norm": 3.1145403385162354, "learning_rate": 1e-05, - "loss": 0.5047, + "loss": 0.51, "step": 16400 }, { "epoch": 0.000165, - "grad_norm": 2.617910146713257, + "grad_norm": 2.615156888961792, "learning_rate": 1e-05, - "loss": 0.5225, + "loss": 0.5253, "step": 16500 }, { "epoch": 0.000166, - "grad_norm": 2.482619047164917, + "grad_norm": 2.6065399646759033, "learning_rate": 1e-05, - "loss": 0.5107, + "loss": 0.5121, "step": 16600 }, { "epoch": 0.000167, - "grad_norm": 2.5474319458007812, + "grad_norm": 3.8787238597869873, "learning_rate": 1e-05, - "loss": 0.5141, + "loss": 0.518, "step": 16700 }, { "epoch": 0.000168, - "grad_norm": 2.8877367973327637, + "grad_norm": 2.8339273929595947, "learning_rate": 1e-05, - "loss": 0.5049, + "loss": 0.5098, "step": 16800 }, { "epoch": 0.000169, - "grad_norm": 3.013601303100586, + "grad_norm": 3.400110960006714, "learning_rate": 1e-05, - "loss": 0.5102, + "loss": 0.5139, "step": 16900 }, { "epoch": 0.00017, - "grad_norm": 2.8229708671569824, + "grad_norm": 2.8647141456604004, "learning_rate": 1e-05, - "loss": 0.507, + "loss": 0.51, "step": 17000 }, { "epoch": 0.000171, - "grad_norm": 3.0160152912139893, + "grad_norm": 3.0346758365631104, "learning_rate": 1e-05, - "loss": 0.5158, + "loss": 0.52, "step": 17100 }, { "epoch": 0.000172, - "grad_norm": 3.160031318664551, + "grad_norm": 2.840468168258667, "learning_rate": 1e-05, - "loss": 0.511, + "loss": 0.5126, "step": 17200 }, { "epoch": 0.000173, - "grad_norm": 2.260617733001709, + "grad_norm": 2.32464861869812, "learning_rate": 1e-05, - "loss": 0.5106, + "loss": 0.5141, "step": 17300 }, { "epoch": 0.000174, - "grad_norm": 2.911404848098755, + "grad_norm": 2.6493277549743652, "learning_rate": 1e-05, - "loss": 0.5038, + "loss": 0.5061, "step": 17400 }, { "epoch": 0.000175, - "grad_norm": 2.751133441925049, + "grad_norm": 2.9182634353637695, "learning_rate": 1e-05, - "loss": 0.5052, + "loss": 0.5079, "step": 17500 }, { "epoch": 0.000176, - "grad_norm": 2.7960567474365234, + "grad_norm": 2.817209243774414, "learning_rate": 1e-05, - "loss": 0.5192, + "loss": 0.5209, "step": 17600 }, { "epoch": 0.000177, - "grad_norm": 3.0205790996551514, + "grad_norm": 3.2138490676879883, "learning_rate": 1e-05, - "loss": 0.4982, + "loss": 0.5008, "step": 17700 }, { "epoch": 0.000178, - "grad_norm": 3.0717408657073975, + "grad_norm": 2.925663471221924, "learning_rate": 1e-05, - "loss": 0.5017, + "loss": 0.5045, "step": 17800 }, { "epoch": 0.000179, - "grad_norm": 3.5761964321136475, + "grad_norm": 3.3058528900146484, "learning_rate": 1e-05, - "loss": 0.5014, + "loss": 0.5036, "step": 17900 }, { "epoch": 0.00018, - "grad_norm": 2.684081792831421, + "grad_norm": 2.73296856880188, "learning_rate": 1e-05, - "loss": 0.4978, + "loss": 0.5023, "step": 18000 }, { "epoch": 0.000181, - "grad_norm": 3.2146716117858887, + "grad_norm": 3.112847089767456, "learning_rate": 1e-05, - "loss": 0.5127, + "loss": 0.5178, "step": 18100 }, { "epoch": 0.000182, - "grad_norm": 3.1293933391571045, + "grad_norm": 2.471736431121826, "learning_rate": 1e-05, - "loss": 0.5045, + "loss": 0.5082, "step": 18200 }, { "epoch": 0.000183, - "grad_norm": 2.9724395275115967, + "grad_norm": 2.6374621391296387, "learning_rate": 1e-05, - "loss": 0.5036, + "loss": 0.5069, "step": 18300 }, { "epoch": 0.000184, - "grad_norm": 2.9043684005737305, + "grad_norm": 2.838254690170288, "learning_rate": 1e-05, - "loss": 0.5088, + "loss": 0.5101, "step": 18400 }, { "epoch": 0.000185, - "grad_norm": 3.0223240852355957, + "grad_norm": 2.9953255653381348, "learning_rate": 1e-05, - "loss": 0.5032, + "loss": 0.5076, "step": 18500 }, { "epoch": 0.000186, - "grad_norm": 2.702449321746826, + "grad_norm": 2.7611818313598633, "learning_rate": 1e-05, - "loss": 0.4914, + "loss": 0.4942, "step": 18600 }, { "epoch": 0.000187, - "grad_norm": 3.2200653553009033, + "grad_norm": 2.739598274230957, "learning_rate": 1e-05, - "loss": 0.5049, + "loss": 0.508, "step": 18700 }, { "epoch": 0.000188, - "grad_norm": 2.734180450439453, + "grad_norm": 2.6710898876190186, "learning_rate": 1e-05, - "loss": 0.5084, + "loss": 0.5125, "step": 18800 }, { "epoch": 0.000189, - "grad_norm": 2.968550682067871, + "grad_norm": 2.7952322959899902, "learning_rate": 1e-05, - "loss": 0.482, + "loss": 0.4844, "step": 18900 }, { "epoch": 0.00019, - "grad_norm": 2.3700122833251953, + "grad_norm": 2.4689576625823975, "learning_rate": 1e-05, - "loss": 0.5051, + "loss": 0.5072, "step": 19000 }, { "epoch": 0.000191, - "grad_norm": 2.8745064735412598, + "grad_norm": 2.6042397022247314, "learning_rate": 1e-05, - "loss": 0.4881, + "loss": 0.491, "step": 19100 }, { "epoch": 0.000192, - "grad_norm": 3.040842056274414, + "grad_norm": 2.915821075439453, "learning_rate": 1e-05, - "loss": 0.5027, + "loss": 0.5029, "step": 19200 }, { "epoch": 0.000193, - "grad_norm": 2.9478836059570312, + "grad_norm": 3.0724246501922607, "learning_rate": 1e-05, - "loss": 0.506, + "loss": 0.5085, "step": 19300 }, { "epoch": 0.000194, - "grad_norm": 3.041682720184326, + "grad_norm": 2.755842924118042, "learning_rate": 1e-05, - "loss": 0.5041, + "loss": 0.5073, "step": 19400 }, { "epoch": 0.000195, - "grad_norm": 2.6585474014282227, + "grad_norm": 2.6537370681762695, "learning_rate": 1e-05, - "loss": 0.4911, + "loss": 0.4953, "step": 19500 }, { "epoch": 0.000196, - "grad_norm": 2.6947147846221924, + "grad_norm": 2.8526852130889893, "learning_rate": 1e-05, - "loss": 0.4937, + "loss": 0.4943, "step": 19600 }, { "epoch": 0.000197, - "grad_norm": 2.5482890605926514, + "grad_norm": 2.4322919845581055, "learning_rate": 1e-05, - "loss": 0.494, + "loss": 0.4985, "step": 19700 }, { "epoch": 0.000198, - "grad_norm": 2.9900755882263184, + "grad_norm": 3.18966007232666, "learning_rate": 1e-05, - "loss": 0.4941, + "loss": 0.4946, "step": 19800 }, { "epoch": 0.000199, - "grad_norm": 2.7972075939178467, + "grad_norm": 2.5495989322662354, "learning_rate": 1e-05, - "loss": 0.4942, + "loss": 0.4962, "step": 19900 }, { "epoch": 0.0002, - "grad_norm": 2.8532488346099854, + "grad_norm": 3.0627942085266113, "learning_rate": 1e-05, - "loss": 0.4883, + "loss": 0.4876, "step": 20000 }, { "epoch": 0.0002, - "eval_loss": 0.466796875, - "eval_runtime": 101.9767, - "eval_samples_per_second": 490.308, - "eval_steps_per_second": 30.644, + "eval_loss": 0.46728515625, + "eval_runtime": 111.4705, + "eval_samples_per_second": 448.549, + "eval_steps_per_second": 28.034, "step": 20000 }, { "epoch": 0.000201, - "grad_norm": 2.5173614025115967, + "grad_norm": 2.799205780029297, "learning_rate": 1e-05, - "loss": 0.5074, + "loss": 0.5117, "step": 20100 }, { "epoch": 0.000202, - "grad_norm": 2.629607915878296, + "grad_norm": 2.509645700454712, "learning_rate": 1e-05, - "loss": 0.489, + "loss": 0.4903, "step": 20200 }, { "epoch": 0.000203, - "grad_norm": 2.6820991039276123, + "grad_norm": 3.120729684829712, "learning_rate": 1e-05, - "loss": 0.4994, + "loss": 0.5027, "step": 20300 }, { "epoch": 0.000204, - "grad_norm": 2.247182607650757, + "grad_norm": 2.390143394470215, "learning_rate": 1e-05, - "loss": 0.4953, + "loss": 0.4989, "step": 20400 }, { "epoch": 0.000205, - "grad_norm": 2.465776205062866, + "grad_norm": 2.5487399101257324, "learning_rate": 1e-05, - "loss": 0.4946, + "loss": 0.4945, "step": 20500 }, { "epoch": 0.000206, - "grad_norm": 2.7661895751953125, + "grad_norm": 2.9931600093841553, "learning_rate": 1e-05, - "loss": 0.5018, + "loss": 0.5048, "step": 20600 }, { "epoch": 0.000207, - "grad_norm": 2.6909499168395996, + "grad_norm": 2.5760886669158936, "learning_rate": 1e-05, - "loss": 0.4898, + "loss": 0.4923, "step": 20700 }, { "epoch": 0.000208, - "grad_norm": 2.786198377609253, + "grad_norm": 3.149047613143921, "learning_rate": 1e-05, - "loss": 0.4928, + "loss": 0.4937, "step": 20800 }, { "epoch": 0.000209, - "grad_norm": 3.0669562816619873, + "grad_norm": 2.982687473297119, "learning_rate": 1e-05, - "loss": 0.4872, + "loss": 0.4901, "step": 20900 }, { "epoch": 0.00021, - "grad_norm": 2.468050241470337, + "grad_norm": 2.6681571006774902, "learning_rate": 1e-05, - "loss": 0.4895, + "loss": 0.4938, "step": 21000 }, { "epoch": 0.000211, - "grad_norm": 3.1763715744018555, + "grad_norm": 2.9358367919921875, "learning_rate": 1e-05, - "loss": 0.4953, + "loss": 0.4962, "step": 21100 }, { "epoch": 0.000212, - "grad_norm": 2.5261197090148926, + "grad_norm": 2.8525304794311523, "learning_rate": 1e-05, - "loss": 0.4953, + "loss": 0.4998, "step": 21200 }, { "epoch": 0.000213, - "grad_norm": 2.7113611698150635, + "grad_norm": 2.7039895057678223, "learning_rate": 1e-05, - "loss": 0.4858, + "loss": 0.4876, "step": 21300 }, { "epoch": 0.000214, - "grad_norm": 2.8574161529541016, + "grad_norm": 2.591728448867798, "learning_rate": 1e-05, - "loss": 0.4836, + "loss": 0.4895, "step": 21400 }, { "epoch": 0.000215, - "grad_norm": 2.377772331237793, + "grad_norm": 4.136421203613281, "learning_rate": 1e-05, - "loss": 0.4852, + "loss": 0.4892, "step": 21500 }, { "epoch": 0.000216, - "grad_norm": 2.767341375350952, + "grad_norm": 2.715740919113159, "learning_rate": 1e-05, - "loss": 0.4947, + "loss": 0.4966, "step": 21600 }, { "epoch": 0.000217, - "grad_norm": 2.676880359649658, + "grad_norm": 2.804382562637329, "learning_rate": 1e-05, - "loss": 0.4988, + "loss": 0.5002, "step": 21700 }, { "epoch": 0.000218, - "grad_norm": 2.5679101943969727, + "grad_norm": 2.568103551864624, "learning_rate": 1e-05, - "loss": 0.4889, + "loss": 0.4925, "step": 21800 }, { "epoch": 0.000219, - "grad_norm": 2.739131212234497, + "grad_norm": 2.671935796737671, "learning_rate": 1e-05, - "loss": 0.4844, + "loss": 0.4855, "step": 21900 }, { "epoch": 0.00022, - "grad_norm": 2.3010966777801514, + "grad_norm": 3.6780941486358643, "learning_rate": 1e-05, - "loss": 0.4733, + "loss": 0.4778, "step": 22000 }, { "epoch": 0.000221, - "grad_norm": 2.5198402404785156, + "grad_norm": 2.6798088550567627, "learning_rate": 1e-05, - "loss": 0.4862, + "loss": 0.4893, "step": 22100 }, { "epoch": 0.000222, - "grad_norm": 2.8612120151519775, + "grad_norm": 2.930389642715454, "learning_rate": 1e-05, - "loss": 0.4876, + "loss": 0.4904, "step": 22200 }, { "epoch": 0.000223, - "grad_norm": 2.417520046234131, + "grad_norm": 2.2273404598236084, "learning_rate": 1e-05, - "loss": 0.4811, + "loss": 0.4864, "step": 22300 }, { "epoch": 0.000224, - "grad_norm": 2.6198267936706543, + "grad_norm": 2.7305030822753906, "learning_rate": 1e-05, - "loss": 0.4926, + "loss": 0.4962, "step": 22400 }, { "epoch": 0.000225, - "grad_norm": 2.5785865783691406, + "grad_norm": 2.6182594299316406, "learning_rate": 1e-05, - "loss": 0.4845, + "loss": 0.4843, "step": 22500 }, { "epoch": 0.000226, - "grad_norm": 2.873509645462036, + "grad_norm": 2.7195889949798584, "learning_rate": 1e-05, - "loss": 0.4816, + "loss": 0.4848, "step": 22600 }, { "epoch": 0.000227, - "grad_norm": 2.7305026054382324, + "grad_norm": 2.842867136001587, "learning_rate": 1e-05, - "loss": 0.4819, + "loss": 0.4868, "step": 22700 }, { "epoch": 0.000228, - "grad_norm": 2.664306402206421, + "grad_norm": 2.6133205890655518, "learning_rate": 1e-05, - "loss": 0.4833, + "loss": 0.4854, "step": 22800 }, { "epoch": 0.000229, - "grad_norm": 2.3904993534088135, + "grad_norm": 2.9739954471588135, "learning_rate": 1e-05, - "loss": 0.4774, + "loss": 0.4799, "step": 22900 }, { "epoch": 0.00023, - "grad_norm": 2.9109280109405518, + "grad_norm": 2.704303503036499, "learning_rate": 1e-05, - "loss": 0.4744, + "loss": 0.4787, "step": 23000 }, { "epoch": 0.000231, - "grad_norm": 2.464026927947998, + "grad_norm": 2.6430766582489014, "learning_rate": 1e-05, - "loss": 0.4791, + "loss": 0.4838, "step": 23100 }, { "epoch": 0.000232, - "grad_norm": 2.5042343139648438, + "grad_norm": 2.57578444480896, "learning_rate": 1e-05, - "loss": 0.4791, + "loss": 0.4818, "step": 23200 }, { "epoch": 0.000233, - "grad_norm": 2.692004680633545, + "grad_norm": 2.553027868270874, "learning_rate": 1e-05, - "loss": 0.4849, + "loss": 0.4878, "step": 23300 }, { "epoch": 0.000234, - "grad_norm": 2.7386388778686523, + "grad_norm": 2.853264331817627, "learning_rate": 1e-05, - "loss": 0.4808, + "loss": 0.4841, "step": 23400 }, { "epoch": 0.000235, - "grad_norm": 2.7698347568511963, + "grad_norm": 2.591419219970703, "learning_rate": 1e-05, - "loss": 0.4791, + "loss": 0.4818, "step": 23500 }, { "epoch": 0.000236, - "grad_norm": 2.614785671234131, + "grad_norm": 2.8946421146392822, "learning_rate": 1e-05, - "loss": 0.4797, + "loss": 0.4808, "step": 23600 }, { "epoch": 0.000237, - "grad_norm": 2.6008248329162598, + "grad_norm": 2.9158196449279785, "learning_rate": 1e-05, - "loss": 0.476, + "loss": 0.4755, "step": 23700 }, { "epoch": 0.000238, - "grad_norm": 2.3245067596435547, + "grad_norm": 2.578831195831299, "learning_rate": 1e-05, - "loss": 0.4733, + "loss": 0.4759, "step": 23800 }, { "epoch": 0.000239, - "grad_norm": 2.8326148986816406, + "grad_norm": 2.6290273666381836, "learning_rate": 1e-05, - "loss": 0.4835, + "loss": 0.4858, "step": 23900 }, { "epoch": 0.00024, - "grad_norm": 2.5619540214538574, + "grad_norm": 2.525026321411133, "learning_rate": 1e-05, - "loss": 0.488, + "loss": 0.4913, "step": 24000 }, { "epoch": 0.000241, - "grad_norm": 2.5899136066436768, + "grad_norm": 4.174901008605957, "learning_rate": 1e-05, - "loss": 0.4711, + "loss": 0.474, "step": 24100 }, { "epoch": 0.000242, - "grad_norm": 2.7551770210266113, + "grad_norm": 2.6417720317840576, "learning_rate": 1e-05, - "loss": 0.482, + "loss": 0.4831, "step": 24200 }, { "epoch": 0.000243, - "grad_norm": 2.5321366786956787, + "grad_norm": 2.4943110942840576, "learning_rate": 1e-05, - "loss": 0.4749, + "loss": 0.4763, "step": 24300 }, { "epoch": 0.000244, - "grad_norm": 2.7021589279174805, + "grad_norm": 2.609255075454712, "learning_rate": 1e-05, - "loss": 0.4842, + "loss": 0.4868, "step": 24400 }, { "epoch": 0.000245, - "grad_norm": 2.713613271713257, + "grad_norm": 3.1849722862243652, "learning_rate": 1e-05, - "loss": 0.4798, + "loss": 0.48, "step": 24500 }, { "epoch": 0.000246, - "grad_norm": 2.547306537628174, + "grad_norm": 2.6401076316833496, "learning_rate": 1e-05, - "loss": 0.4836, + "loss": 0.4873, "step": 24600 }, { "epoch": 0.000247, - "grad_norm": 2.7221243381500244, + "grad_norm": 2.936086416244507, "learning_rate": 1e-05, - "loss": 0.4761, + "loss": 0.4769, "step": 24700 }, { "epoch": 0.000248, - "grad_norm": 2.7073612213134766, + "grad_norm": 2.6310338973999023, "learning_rate": 1e-05, - "loss": 0.4759, + "loss": 0.4794, "step": 24800 }, { "epoch": 0.000249, - "grad_norm": 2.4039628505706787, + "grad_norm": 2.3982949256896973, "learning_rate": 1e-05, - "loss": 0.4782, + "loss": 0.4818, "step": 24900 }, { "epoch": 0.00025, - "grad_norm": 2.4823732376098633, + "grad_norm": 2.8052144050598145, "learning_rate": 1e-05, - "loss": 0.4653, + "loss": 0.4674, "step": 25000 }, { "epoch": 0.000251, - "grad_norm": 2.722165107727051, + "grad_norm": 3.136012315750122, "learning_rate": 1e-05, - "loss": 0.4702, + "loss": 0.4739, "step": 25100 }, { "epoch": 0.000252, - "grad_norm": 2.538250207901001, + "grad_norm": 2.721803665161133, "learning_rate": 1e-05, - "loss": 0.4632, + "loss": 0.4653, "step": 25200 }, { "epoch": 0.000253, - "grad_norm": 2.7353577613830566, + "grad_norm": 2.9835872650146484, "learning_rate": 1e-05, - "loss": 0.4723, + "loss": 0.4752, "step": 25300 }, { "epoch": 0.000254, - "grad_norm": 2.572460174560547, + "grad_norm": 2.4551830291748047, "learning_rate": 1e-05, - "loss": 0.4719, + "loss": 0.4732, "step": 25400 }, { "epoch": 0.000255, - "grad_norm": 2.84359073638916, + "grad_norm": 3.3453078269958496, "learning_rate": 1e-05, - "loss": 0.4761, + "loss": 0.4766, "step": 25500 }, { "epoch": 0.000256, - "grad_norm": 2.6090247631073, + "grad_norm": 2.7177910804748535, "learning_rate": 1e-05, - "loss": 0.473, + "loss": 0.4761, "step": 25600 }, { "epoch": 0.000257, - "grad_norm": 2.816680669784546, + "grad_norm": 2.9529471397399902, "learning_rate": 1e-05, - "loss": 0.4778, + "loss": 0.4816, "step": 25700 }, { "epoch": 0.000258, - "grad_norm": 2.657853841781616, + "grad_norm": 2.5339162349700928, "learning_rate": 1e-05, - "loss": 0.4698, + "loss": 0.4719, "step": 25800 }, { "epoch": 0.000259, - "grad_norm": 2.85893177986145, + "grad_norm": 2.5781121253967285, "learning_rate": 1e-05, - "loss": 0.4724, + "loss": 0.4742, "step": 25900 }, { "epoch": 0.00026, - "grad_norm": 2.461073398590088, + "grad_norm": 2.7583415508270264, "learning_rate": 1e-05, - "loss": 0.4761, + "loss": 0.478, "step": 26000 }, { "epoch": 0.000261, - "grad_norm": 2.3560779094696045, + "grad_norm": 2.307037591934204, "learning_rate": 1e-05, - "loss": 0.4764, + "loss": 0.4767, "step": 26100 }, { "epoch": 0.000262, - "grad_norm": 2.778369426727295, + "grad_norm": 2.9219844341278076, "learning_rate": 1e-05, - "loss": 0.4766, + "loss": 0.4783, "step": 26200 }, { "epoch": 0.000263, - "grad_norm": 2.4488160610198975, + "grad_norm": 2.4626011848449707, "learning_rate": 1e-05, - "loss": 0.478, + "loss": 0.4806, "step": 26300 }, { "epoch": 0.000264, - "grad_norm": 2.781723737716675, + "grad_norm": 2.7708845138549805, "learning_rate": 1e-05, - "loss": 0.4699, + "loss": 0.4717, "step": 26400 }, { "epoch": 0.000265, - "grad_norm": 2.6240341663360596, + "grad_norm": 2.806086301803589, "learning_rate": 1e-05, - "loss": 0.4636, + "loss": 0.4641, "step": 26500 }, { "epoch": 0.000266, - "grad_norm": 2.64532470703125, + "grad_norm": 2.395228385925293, "learning_rate": 1e-05, - "loss": 0.4743, + "loss": 0.4769, "step": 26600 }, { "epoch": 0.000267, - "grad_norm": 2.5863447189331055, + "grad_norm": 2.482196807861328, "learning_rate": 1e-05, - "loss": 0.4628, + "loss": 0.4637, "step": 26700 }, { "epoch": 0.000268, - "grad_norm": 2.6023645401000977, + "grad_norm": 2.3239686489105225, "learning_rate": 1e-05, - "loss": 0.472, + "loss": 0.4728, "step": 26800 }, { "epoch": 0.000269, - "grad_norm": 2.685303211212158, + "grad_norm": 2.8760108947753906, "learning_rate": 1e-05, - "loss": 0.4676, + "loss": 0.4683, "step": 26900 }, { "epoch": 0.00027, - "grad_norm": 2.733454704284668, + "grad_norm": 3.0095269680023193, "learning_rate": 1e-05, - "loss": 0.4614, + "loss": 0.4641, "step": 27000 }, { "epoch": 0.000271, - "grad_norm": 2.5530104637145996, + "grad_norm": 2.7603840827941895, "learning_rate": 1e-05, - "loss": 0.4812, + "loss": 0.4824, "step": 27100 }, { "epoch": 0.000272, - "grad_norm": 2.609612464904785, + "grad_norm": 2.6380269527435303, "learning_rate": 1e-05, - "loss": 0.4607, + "loss": 0.4626, "step": 27200 }, { "epoch": 0.000273, - "grad_norm": 2.6109836101531982, + "grad_norm": 3.095323324203491, "learning_rate": 1e-05, - "loss": 0.4662, + "loss": 0.4671, "step": 27300 }, { "epoch": 0.000274, - "grad_norm": 2.4511022567749023, + "grad_norm": 2.6990623474121094, "learning_rate": 1e-05, - "loss": 0.4733, + "loss": 0.4766, "step": 27400 }, { "epoch": 0.000275, - "grad_norm": 2.623624324798584, + "grad_norm": 2.502337694168091, "learning_rate": 1e-05, - "loss": 0.4617, + "loss": 0.4608, "step": 27500 }, { "epoch": 0.000276, - "grad_norm": 2.5565483570098877, + "grad_norm": 2.3775081634521484, "learning_rate": 1e-05, - "loss": 0.4688, + "loss": 0.4749, "step": 27600 }, { "epoch": 0.000277, - "grad_norm": 2.657158851623535, + "grad_norm": 2.7666544914245605, "learning_rate": 1e-05, - "loss": 0.4645, + "loss": 0.465, "step": 27700 }, { "epoch": 0.000278, - "grad_norm": 2.631279945373535, + "grad_norm": 2.4668657779693604, "learning_rate": 1e-05, - "loss": 0.4567, + "loss": 0.4605, "step": 27800 }, { "epoch": 0.000279, - "grad_norm": 3.2603516578674316, + "grad_norm": 3.91645884513855, "learning_rate": 1e-05, - "loss": 0.4641, + "loss": 0.4684, "step": 27900 }, { "epoch": 0.00028, - "grad_norm": 2.5580084323883057, + "grad_norm": 2.781068801879883, "learning_rate": 1e-05, - "loss": 0.4567, + "loss": 0.4612, "step": 28000 }, { "epoch": 0.000281, - "grad_norm": 2.355168581008911, + "grad_norm": 2.4313833713531494, "learning_rate": 1e-05, - "loss": 0.4644, + "loss": 0.4661, "step": 28100 }, { "epoch": 0.000282, - "grad_norm": 2.3641602993011475, + "grad_norm": 2.236158847808838, "learning_rate": 1e-05, - "loss": 0.4738, + "loss": 0.4728, "step": 28200 }, { "epoch": 0.000283, - "grad_norm": 2.60513973236084, + "grad_norm": 2.7676749229431152, "learning_rate": 1e-05, - "loss": 0.4652, + "loss": 0.4667, "step": 28300 }, { "epoch": 0.000284, - "grad_norm": 2.3912665843963623, + "grad_norm": 2.1664578914642334, "learning_rate": 1e-05, - "loss": 0.4612, + "loss": 0.4632, "step": 28400 }, { "epoch": 0.000285, - "grad_norm": 2.462395668029785, + "grad_norm": 2.4924814701080322, "learning_rate": 1e-05, - "loss": 0.4663, + "loss": 0.4699, "step": 28500 }, { "epoch": 0.000286, - "grad_norm": 2.571925163269043, + "grad_norm": 3.364351749420166, "learning_rate": 1e-05, - "loss": 0.4551, + "loss": 0.4563, "step": 28600 }, { "epoch": 0.000287, - "grad_norm": 2.615628719329834, + "grad_norm": 2.29882550239563, "learning_rate": 1e-05, - "loss": 0.464, + "loss": 0.4689, "step": 28700 }, { "epoch": 0.000288, - "grad_norm": 2.360787868499756, + "grad_norm": 2.626985549926758, "learning_rate": 1e-05, - "loss": 0.4607, + "loss": 0.463, "step": 28800 }, { "epoch": 0.000289, - "grad_norm": 2.21407413482666, + "grad_norm": 2.7008321285247803, "learning_rate": 1e-05, - "loss": 0.4575, + "loss": 0.4601, "step": 28900 }, { "epoch": 0.00029, - "grad_norm": 2.7879395484924316, + "grad_norm": 2.5816690921783447, "learning_rate": 1e-05, - "loss": 0.4565, + "loss": 0.4597, "step": 29000 }, { "epoch": 0.000291, - "grad_norm": 2.6157572269439697, + "grad_norm": 2.4688684940338135, "learning_rate": 1e-05, - "loss": 0.4592, + "loss": 0.4599, "step": 29100 }, { "epoch": 0.000292, - "grad_norm": 2.8705883026123047, + "grad_norm": 2.839632749557495, "learning_rate": 1e-05, - "loss": 0.4666, + "loss": 0.4677, "step": 29200 }, { "epoch": 0.000293, - "grad_norm": 2.693171977996826, + "grad_norm": 3.800483465194702, "learning_rate": 1e-05, - "loss": 0.453, + "loss": 0.4565, "step": 29300 }, { "epoch": 0.000294, - "grad_norm": 2.638017177581787, + "grad_norm": 4.7663726806640625, "learning_rate": 1e-05, - "loss": 0.4514, + "loss": 0.4522, "step": 29400 }, { "epoch": 0.000295, - "grad_norm": 2.3667051792144775, + "grad_norm": 2.4576992988586426, "learning_rate": 1e-05, - "loss": 0.4612, + "loss": 0.4638, "step": 29500 }, { "epoch": 0.000296, - "grad_norm": 2.5680370330810547, + "grad_norm": 2.6215529441833496, "learning_rate": 1e-05, - "loss": 0.4503, + "loss": 0.4522, "step": 29600 }, { "epoch": 0.000297, - "grad_norm": 2.237605571746826, + "grad_norm": 2.35202693939209, "learning_rate": 1e-05, - "loss": 0.4517, + "loss": 0.452, "step": 29700 }, { "epoch": 0.000298, - "grad_norm": 2.098609685897827, + "grad_norm": 2.1658172607421875, "learning_rate": 1e-05, - "loss": 0.4649, + "loss": 0.4638, "step": 29800 }, { "epoch": 0.000299, - "grad_norm": 2.843665361404419, + "grad_norm": 2.6954879760742188, "learning_rate": 1e-05, - "loss": 0.452, + "loss": 0.4533, "step": 29900 }, { "epoch": 0.0003, - "grad_norm": 2.2658700942993164, + "grad_norm": 2.465700626373291, "learning_rate": 1e-05, - "loss": 0.4553, + "loss": 0.4595, "step": 30000 }, { "epoch": 0.000301, - "grad_norm": 2.543165683746338, + "grad_norm": 2.5099382400512695, "learning_rate": 1e-05, "loss": 0.464, "step": 30100 }, { "epoch": 0.000302, - "grad_norm": 2.755690813064575, + "grad_norm": 2.632709264755249, "learning_rate": 1e-05, - "loss": 0.4586, + "loss": 0.4583, "step": 30200 }, { "epoch": 0.000303, - "grad_norm": 2.6091063022613525, + "grad_norm": 3.451253890991211, "learning_rate": 1e-05, - "loss": 0.4633, + "loss": 0.4641, "step": 30300 }, { "epoch": 0.000304, - "grad_norm": 2.5299808979034424, + "grad_norm": 2.5576229095458984, "learning_rate": 1e-05, - "loss": 0.4621, + "loss": 0.4643, "step": 30400 }, { "epoch": 0.000305, - "grad_norm": 2.401829719543457, + "grad_norm": 2.63854718208313, "learning_rate": 1e-05, - "loss": 0.4471, + "loss": 0.4509, "step": 30500 }, { "epoch": 0.000306, - "grad_norm": 2.3319270610809326, + "grad_norm": 2.7660105228424072, "learning_rate": 1e-05, - "loss": 0.4534, + "loss": 0.4533, "step": 30600 }, { "epoch": 0.000307, - "grad_norm": 2.4305851459503174, + "grad_norm": 2.862382411956787, "learning_rate": 1e-05, - "loss": 0.453, + "loss": 0.4536, "step": 30700 }, { "epoch": 0.000308, - "grad_norm": 2.6582820415496826, + "grad_norm": 2.6443052291870117, "learning_rate": 1e-05, - "loss": 0.4479, + "loss": 0.4485, "step": 30800 }, { "epoch": 0.000309, - "grad_norm": 2.4796783924102783, + "grad_norm": 2.525301456451416, "learning_rate": 1e-05, - "loss": 0.4575, + "loss": 0.4593, "step": 30900 }, { "epoch": 0.00031, - "grad_norm": 2.445875883102417, + "grad_norm": 2.4334826469421387, "learning_rate": 1e-05, - "loss": 0.4487, + "loss": 0.4495, "step": 31000 }, { "epoch": 0.000311, - "grad_norm": 2.7277352809906006, + "grad_norm": 2.443531036376953, "learning_rate": 1e-05, - "loss": 0.4539, + "loss": 0.4563, "step": 31100 }, { "epoch": 0.000312, - "grad_norm": 2.3069751262664795, + "grad_norm": 2.3432857990264893, "learning_rate": 1e-05, - "loss": 0.4596, + "loss": 0.4587, "step": 31200 }, { "epoch": 0.000313, - "grad_norm": 2.650803804397583, + "grad_norm": 2.470900058746338, "learning_rate": 1e-05, - "loss": 0.445, + "loss": 0.4466, "step": 31300 }, { "epoch": 0.000314, - "grad_norm": 2.586442470550537, + "grad_norm": 2.719302177429199, "learning_rate": 1e-05, - "loss": 0.4603, + "loss": 0.4633, "step": 31400 }, { "epoch": 0.000315, - "grad_norm": 2.7829813957214355, + "grad_norm": 2.7963156700134277, "learning_rate": 1e-05, - "loss": 0.4529, + "loss": 0.4536, "step": 31500 }, { "epoch": 0.000316, - "grad_norm": 2.4229977130889893, + "grad_norm": 2.750457763671875, "learning_rate": 1e-05, - "loss": 0.4528, + "loss": 0.4515, "step": 31600 }, { "epoch": 0.000317, - "grad_norm": 2.4584128856658936, + "grad_norm": 2.4325835704803467, "learning_rate": 1e-05, - "loss": 0.453, + "loss": 0.4555, "step": 31700 }, { "epoch": 0.000318, - "grad_norm": 2.638582229614258, + "grad_norm": 2.4605915546417236, "learning_rate": 1e-05, - "loss": 0.4548, + "loss": 0.4571, "step": 31800 }, { "epoch": 0.000319, - "grad_norm": 2.2935678958892822, + "grad_norm": 2.3610308170318604, "learning_rate": 1e-05, - "loss": 0.4535, + "loss": 0.455, "step": 31900 }, { "epoch": 0.00032, - "grad_norm": 2.4465720653533936, + "grad_norm": 2.4374446868896484, "learning_rate": 1e-05, - "loss": 0.4552, + "loss": 0.4577, "step": 32000 }, { "epoch": 0.000321, - "grad_norm": 2.568911075592041, + "grad_norm": 2.618852138519287, "learning_rate": 1e-05, - "loss": 0.4436, + "loss": 0.4434, "step": 32100 }, { "epoch": 0.000322, - "grad_norm": 2.5050342082977295, + "grad_norm": 3.4312095642089844, "learning_rate": 1e-05, - "loss": 0.4497, + "loss": 0.448, "step": 32200 }, { "epoch": 0.000323, - "grad_norm": 2.882383108139038, + "grad_norm": 3.933258056640625, "learning_rate": 1e-05, - "loss": 0.4505, + "loss": 0.4542, "step": 32300 }, { "epoch": 0.000324, - "grad_norm": 2.3093669414520264, + "grad_norm": 2.4201653003692627, "learning_rate": 1e-05, - "loss": 0.4509, + "loss": 0.4501, "step": 32400 }, { "epoch": 0.000325, - "grad_norm": 2.4759955406188965, + "grad_norm": 2.638230085372925, "learning_rate": 1e-05, - "loss": 0.4576, + "loss": 0.4587, "step": 32500 }, { "epoch": 0.000326, - "grad_norm": 2.2359471321105957, + "grad_norm": 2.310612201690674, "learning_rate": 1e-05, - "loss": 0.4632, + "loss": 0.4647, "step": 32600 }, { "epoch": 0.000327, - "grad_norm": 2.2961983680725098, + "grad_norm": 2.466186285018921, "learning_rate": 1e-05, - "loss": 0.4424, + "loss": 0.4462, "step": 32700 }, { "epoch": 0.000328, - "grad_norm": 2.364755630493164, + "grad_norm": 2.8925211429595947, "learning_rate": 1e-05, - "loss": 0.4531, + "loss": 0.4556, "step": 32800 }, { "epoch": 0.000329, - "grad_norm": 2.6120266914367676, + "grad_norm": 2.5643179416656494, "learning_rate": 1e-05, - "loss": 0.4467, + "loss": 0.4479, "step": 32900 }, { "epoch": 0.00033, - "grad_norm": 2.368785858154297, + "grad_norm": 2.372391939163208, "learning_rate": 1e-05, - "loss": 0.4551, + "loss": 0.4583, "step": 33000 }, { "epoch": 0.000331, - "grad_norm": 2.5879604816436768, + "grad_norm": 2.6954376697540283, "learning_rate": 1e-05, - "loss": 0.4512, + "loss": 0.4514, "step": 33100 }, { "epoch": 0.000332, - "grad_norm": 2.788543462753296, + "grad_norm": 2.380615234375, "learning_rate": 1e-05, - "loss": 0.4443, + "loss": 0.4468, "step": 33200 }, { "epoch": 0.000333, - "grad_norm": 2.4448671340942383, + "grad_norm": 2.5895583629608154, "learning_rate": 1e-05, - "loss": 0.4485, + "loss": 0.4503, "step": 33300 }, { "epoch": 0.000334, - "grad_norm": 2.3465311527252197, + "grad_norm": 3.745288133621216, "learning_rate": 1e-05, - "loss": 0.4438, + "loss": 0.4453, "step": 33400 }, { "epoch": 0.000335, - "grad_norm": 2.266247272491455, + "grad_norm": 2.346338987350464, "learning_rate": 1e-05, - "loss": 0.4581, + "loss": 0.4605, "step": 33500 }, { "epoch": 0.000336, - "grad_norm": 2.685112237930298, + "grad_norm": 3.660322666168213, "learning_rate": 1e-05, - "loss": 0.449, + "loss": 0.4518, "step": 33600 }, { "epoch": 0.000337, - "grad_norm": 2.2699062824249268, + "grad_norm": 2.2425618171691895, "learning_rate": 1e-05, "loss": 0.4433, "step": 33700 }, { "epoch": 0.000338, - "grad_norm": 2.424692153930664, + "grad_norm": 2.385923385620117, "learning_rate": 1e-05, - "loss": 0.447, + "loss": 0.4452, "step": 33800 }, { "epoch": 0.000339, - "grad_norm": 2.2114510536193848, + "grad_norm": 2.3236701488494873, "learning_rate": 1e-05, - "loss": 0.4417, + "loss": 0.4428, "step": 33900 }, { "epoch": 0.00034, - "grad_norm": 2.280491352081299, + "grad_norm": 2.2188169956207275, "learning_rate": 1e-05, - "loss": 0.4509, + "loss": 0.4536, "step": 34000 }, { "epoch": 0.000341, - "grad_norm": 2.457481622695923, + "grad_norm": 2.760098934173584, "learning_rate": 1e-05, - "loss": 0.449, + "loss": 0.4487, "step": 34100 }, { "epoch": 0.000342, - "grad_norm": 2.0714383125305176, + "grad_norm": 2.2766711711883545, "learning_rate": 1e-05, - "loss": 0.4537, + "loss": 0.4545, "step": 34200 }, { "epoch": 0.000343, - "grad_norm": 2.520087480545044, + "grad_norm": 2.6107327938079834, "learning_rate": 1e-05, - "loss": 0.4476, + "loss": 0.4503, "step": 34300 }, { "epoch": 0.000344, - "grad_norm": 2.7613301277160645, + "grad_norm": 2.5845329761505127, "learning_rate": 1e-05, - "loss": 0.4499, + "loss": 0.4503, "step": 34400 }, { "epoch": 0.000345, - "grad_norm": 2.206310510635376, + "grad_norm": 2.1708152294158936, "learning_rate": 1e-05, - "loss": 0.4457, + "loss": 0.4474, "step": 34500 }, { "epoch": 0.000346, - "grad_norm": 2.4119200706481934, + "grad_norm": 2.408508062362671, "learning_rate": 1e-05, - "loss": 0.4515, + "loss": 0.4504, "step": 34600 }, { "epoch": 0.000347, - "grad_norm": 2.7088863849639893, + "grad_norm": 2.5341970920562744, "learning_rate": 1e-05, - "loss": 0.4548, + "loss": 0.4531, "step": 34700 }, { "epoch": 0.000348, - "grad_norm": 2.6948633193969727, + "grad_norm": 2.5573482513427734, "learning_rate": 1e-05, - "loss": 0.448, + "loss": 0.4472, "step": 34800 }, { "epoch": 0.000349, - "grad_norm": 2.457263946533203, + "grad_norm": 2.318730115890503, "learning_rate": 1e-05, - "loss": 0.4438, + "loss": 0.4462, "step": 34900 }, { "epoch": 0.00035, - "grad_norm": 2.3207664489746094, + "grad_norm": 2.341620922088623, "learning_rate": 1e-05, - "loss": 0.4407, + "loss": 0.4423, "step": 35000 }, { "epoch": 0.000351, - "grad_norm": 2.231448173522949, + "grad_norm": 2.1792995929718018, "learning_rate": 1e-05, - "loss": 0.449, + "loss": 0.4519, "step": 35100 }, { "epoch": 0.000352, - "grad_norm": 2.332615613937378, + "grad_norm": 2.4928019046783447, "learning_rate": 1e-05, - "loss": 0.4514, + "loss": 0.4519, "step": 35200 }, { "epoch": 0.000353, - "grad_norm": 2.5433437824249268, + "grad_norm": 2.512012481689453, "learning_rate": 1e-05, - "loss": 0.4549, + "loss": 0.4578, "step": 35300 }, { "epoch": 0.000354, - "grad_norm": 2.326695442199707, + "grad_norm": 2.510221242904663, "learning_rate": 1e-05, - "loss": 0.4484, + "loss": 0.4506, "step": 35400 }, { "epoch": 0.000355, - "grad_norm": 2.481430768966675, + "grad_norm": 2.637925624847412, "learning_rate": 1e-05, - "loss": 0.4431, + "loss": 0.4454, "step": 35500 }, { "epoch": 0.000356, - "grad_norm": 2.3822989463806152, + "grad_norm": 2.6336724758148193, "learning_rate": 1e-05, - "loss": 0.4472, + "loss": 0.4477, "step": 35600 }, { "epoch": 0.000357, - "grad_norm": 3.1235344409942627, + "grad_norm": 2.6396801471710205, "learning_rate": 1e-05, - "loss": 0.4462, + "loss": 0.4424, "step": 35700 }, { "epoch": 0.000358, - "grad_norm": 2.2521212100982666, + "grad_norm": 2.2982890605926514, "learning_rate": 1e-05, - "loss": 0.4403, + "loss": 0.4434, "step": 35800 }, { "epoch": 0.000359, - "grad_norm": 2.542667865753174, + "grad_norm": 2.617039680480957, "learning_rate": 1e-05, - "loss": 0.4427, + "loss": 0.443, "step": 35900 }, { "epoch": 0.00036, - "grad_norm": 2.4560317993164062, + "grad_norm": 2.3763229846954346, "learning_rate": 1e-05, - "loss": 0.4453, + "loss": 0.4462, "step": 36000 }, { "epoch": 0.000361, - "grad_norm": 2.872126579284668, + "grad_norm": 2.4981770515441895, "learning_rate": 1e-05, - "loss": 0.4446, + "loss": 0.445, "step": 36100 }, { "epoch": 0.000362, - "grad_norm": 2.2262322902679443, + "grad_norm": 2.2644827365875244, "learning_rate": 1e-05, - "loss": 0.4436, + "loss": 0.4423, "step": 36200 }, { "epoch": 0.000363, - "grad_norm": 2.1134073734283447, + "grad_norm": 2.1267762184143066, "learning_rate": 1e-05, - "loss": 0.4371, + "loss": 0.4391, "step": 36300 }, { "epoch": 0.000364, - "grad_norm": 2.118499994277954, + "grad_norm": 2.2303924560546875, "learning_rate": 1e-05, - "loss": 0.4447, + "loss": 0.445, "step": 36400 }, { "epoch": 0.000365, - "grad_norm": 2.3149337768554688, + "grad_norm": 2.383427619934082, "learning_rate": 1e-05, - "loss": 0.4364, + "loss": 0.4391, "step": 36500 }, { "epoch": 0.000366, - "grad_norm": 2.334366798400879, + "grad_norm": 2.600222587585449, "learning_rate": 1e-05, - "loss": 0.4456, + "loss": 0.446, "step": 36600 }, { "epoch": 0.000367, - "grad_norm": 2.2575650215148926, + "grad_norm": 2.557803153991699, "learning_rate": 1e-05, - "loss": 0.4498, + "loss": 0.4519, "step": 36700 }, { "epoch": 0.000368, - "grad_norm": 2.675327777862549, + "grad_norm": 2.5691545009613037, "learning_rate": 1e-05, - "loss": 0.4403, + "loss": 0.4385, "step": 36800 }, { "epoch": 0.000369, - "grad_norm": 2.5896053314208984, + "grad_norm": 2.3497157096862793, "learning_rate": 1e-05, - "loss": 0.4382, + "loss": 0.4383, "step": 36900 }, { "epoch": 0.00037, - "grad_norm": 2.684830904006958, + "grad_norm": 2.87781023979187, "learning_rate": 1e-05, - "loss": 0.4396, + "loss": 0.4409, "step": 37000 }, { "epoch": 0.000371, - "grad_norm": 2.4175543785095215, + "grad_norm": 2.447251558303833, "learning_rate": 1e-05, - "loss": 0.4467, + "loss": 0.4485, "step": 37100 }, { "epoch": 0.000372, - "grad_norm": 2.7629592418670654, + "grad_norm": 2.536022424697876, "learning_rate": 1e-05, - "loss": 0.44, + "loss": 0.4406, "step": 37200 }, { "epoch": 0.000373, - "grad_norm": 2.49387526512146, + "grad_norm": 2.4963696002960205, "learning_rate": 1e-05, - "loss": 0.4302, + "loss": 0.4311, "step": 37300 }, { "epoch": 0.000374, - "grad_norm": 2.3728890419006348, + "grad_norm": 2.701169967651367, "learning_rate": 1e-05, - "loss": 0.4361, + "loss": 0.4385, "step": 37400 }, { "epoch": 0.000375, - "grad_norm": 2.337378740310669, + "grad_norm": 2.514227867126465, "learning_rate": 1e-05, - "loss": 0.4332, + "loss": 0.4357, "step": 37500 }, { "epoch": 0.000376, - "grad_norm": 2.330686330795288, + "grad_norm": 2.5205540657043457, "learning_rate": 1e-05, - "loss": 0.4429, + "loss": 0.4447, "step": 37600 }, { "epoch": 0.000377, - "grad_norm": 2.580684185028076, + "grad_norm": 2.7338292598724365, "learning_rate": 1e-05, - "loss": 0.4417, + "loss": 0.4452, "step": 37700 }, { "epoch": 0.000378, - "grad_norm": 2.4268381595611572, + "grad_norm": 2.3861348628997803, "learning_rate": 1e-05, - "loss": 0.435, + "loss": 0.4377, "step": 37800 }, { "epoch": 0.000379, - "grad_norm": 2.2591755390167236, + "grad_norm": 2.41011381149292, "learning_rate": 1e-05, - "loss": 0.439, + "loss": 0.4405, "step": 37900 }, { "epoch": 0.00038, - "grad_norm": 2.4695215225219727, + "grad_norm": 2.449092388153076, "learning_rate": 1e-05, - "loss": 0.4445, + "loss": 0.4451, "step": 38000 }, { "epoch": 0.000381, - "grad_norm": 2.223601818084717, + "grad_norm": 2.571415662765503, "learning_rate": 1e-05, - "loss": 0.4392, + "loss": 0.439, "step": 38100 }, { "epoch": 0.000382, - "grad_norm": 2.2162795066833496, + "grad_norm": 2.169980764389038, "learning_rate": 1e-05, - "loss": 0.4385, + "loss": 0.4384, "step": 38200 }, { "epoch": 0.000383, - "grad_norm": 2.3102917671203613, + "grad_norm": 3.838111162185669, "learning_rate": 1e-05, - "loss": 0.4309, + "loss": 0.4322, "step": 38300 }, { "epoch": 0.000384, - "grad_norm": 2.7429144382476807, + "grad_norm": 3.1614363193511963, "learning_rate": 1e-05, - "loss": 0.4416, + "loss": 0.4426, "step": 38400 }, { "epoch": 0.000385, - "grad_norm": 2.003499746322632, + "grad_norm": 1.975185513496399, "learning_rate": 1e-05, - "loss": 0.4379, + "loss": 0.4375, "step": 38500 }, { "epoch": 0.000386, - "grad_norm": 2.087963104248047, + "grad_norm": 2.3980181217193604, "learning_rate": 1e-05, - "loss": 0.4367, + "loss": 0.4411, "step": 38600 }, { "epoch": 0.000387, - "grad_norm": 2.265549421310425, + "grad_norm": 2.372525453567505, "learning_rate": 1e-05, - "loss": 0.4364, + "loss": 0.4362, "step": 38700 }, { "epoch": 0.000388, - "grad_norm": 2.2483983039855957, + "grad_norm": 2.3161978721618652, "learning_rate": 1e-05, - "loss": 0.4367, + "loss": 0.4405, "step": 38800 }, { "epoch": 0.000389, - "grad_norm": 2.350341320037842, + "grad_norm": 2.4494197368621826, "learning_rate": 1e-05, - "loss": 0.4347, + "loss": 0.4374, "step": 38900 }, { "epoch": 0.00039, - "grad_norm": 2.3627052307128906, + "grad_norm": 2.3522799015045166, "learning_rate": 1e-05, - "loss": 0.4395, + "loss": 0.4432, "step": 39000 }, { "epoch": 0.000391, - "grad_norm": 2.285327434539795, + "grad_norm": 2.3715996742248535, "learning_rate": 1e-05, - "loss": 0.4228, + "loss": 0.4237, "step": 39100 }, { "epoch": 0.000392, - "grad_norm": 2.743180751800537, + "grad_norm": 2.429914951324463, "learning_rate": 1e-05, - "loss": 0.4326, + "loss": 0.4332, "step": 39200 }, { "epoch": 0.000393, - "grad_norm": 2.4519877433776855, + "grad_norm": 2.4736123085021973, "learning_rate": 1e-05, - "loss": 0.432, + "loss": 0.4345, "step": 39300 }, { "epoch": 0.000394, - "grad_norm": 2.107567548751831, + "grad_norm": 2.033489942550659, "learning_rate": 1e-05, - "loss": 0.4379, + "loss": 0.4371, "step": 39400 }, { "epoch": 0.000395, - "grad_norm": 2.036893367767334, + "grad_norm": 2.22459077835083, "learning_rate": 1e-05, - "loss": 0.4335, + "loss": 0.4336, "step": 39500 }, { "epoch": 0.000396, - "grad_norm": 2.268911123275757, + "grad_norm": 2.475951910018921, "learning_rate": 1e-05, - "loss": 0.4339, + "loss": 0.4349, "step": 39600 }, { "epoch": 0.000397, - "grad_norm": 2.1069495677948, + "grad_norm": 2.2297749519348145, "learning_rate": 1e-05, - "loss": 0.4256, + "loss": 0.4274, "step": 39700 }, { "epoch": 0.000398, - "grad_norm": 3.2932920455932617, + "grad_norm": 2.445439338684082, "learning_rate": 1e-05, - "loss": 0.4379, + "loss": 0.4366, "step": 39800 }, { "epoch": 0.000399, - "grad_norm": 2.307811737060547, + "grad_norm": 2.4138917922973633, "learning_rate": 1e-05, "loss": 0.4351, "step": 39900 }, { "epoch": 0.0004, - "grad_norm": 2.5737617015838623, + "grad_norm": 2.551255226135254, "learning_rate": 1e-05, - "loss": 0.4319, + "loss": 0.4333, "step": 40000 }, { "epoch": 0.0004, - "eval_loss": 0.41015625, - "eval_runtime": 105.5647, - "eval_samples_per_second": 473.643, - "eval_steps_per_second": 29.603, + "eval_loss": 0.4091796875, + "eval_runtime": 110.8814, + "eval_samples_per_second": 450.932, + "eval_steps_per_second": 28.183, "step": 40000 }, { "epoch": 0.000401, - "grad_norm": 2.403686046600342, + "grad_norm": 2.6333506107330322, "learning_rate": 1e-05, - "loss": 0.4328, + "loss": 0.4332, "step": 40100 }, { "epoch": 0.000402, - "grad_norm": 2.319885492324829, + "grad_norm": 2.0419421195983887, "learning_rate": 1e-05, - "loss": 0.4332, + "loss": 0.4324, "step": 40200 }, { "epoch": 0.000403, - "grad_norm": 2.2234652042388916, + "grad_norm": 2.1599907875061035, "learning_rate": 1e-05, "loss": 0.4261, "step": 40300 }, { "epoch": 0.000404, - "grad_norm": 2.363771915435791, + "grad_norm": 2.1608216762542725, "learning_rate": 1e-05, - "loss": 0.4251, + "loss": 0.4265, "step": 40400 }, { "epoch": 0.000405, - "grad_norm": 2.3270111083984375, + "grad_norm": 2.0742979049682617, "learning_rate": 1e-05, - "loss": 0.4305, + "loss": 0.4298, "step": 40500 }, { "epoch": 0.000406, - "grad_norm": 2.3788092136383057, + "grad_norm": 2.487959146499634, "learning_rate": 1e-05, - "loss": 0.4324, + "loss": 0.43, "step": 40600 }, { "epoch": 0.000407, - "grad_norm": 2.2893447875976562, + "grad_norm": 2.436591148376465, "learning_rate": 1e-05, - "loss": 0.434, + "loss": 0.4356, "step": 40700 }, { "epoch": 0.000408, - "grad_norm": 2.2032690048217773, + "grad_norm": 2.2447760105133057, "learning_rate": 1e-05, - "loss": 0.4283, + "loss": 0.4269, "step": 40800 }, { "epoch": 0.000409, - "grad_norm": 2.1438772678375244, + "grad_norm": 2.1390585899353027, "learning_rate": 1e-05, - "loss": 0.4257, + "loss": 0.4268, "step": 40900 }, { "epoch": 0.00041, - "grad_norm": 2.1775474548339844, + "grad_norm": 2.390690326690674, "learning_rate": 1e-05, - "loss": 0.4238, + "loss": 0.4233, "step": 41000 }, { "epoch": 0.000411, - "grad_norm": 1.9088200330734253, + "grad_norm": 1.8873522281646729, "learning_rate": 1e-05, - "loss": 0.4239, + "loss": 0.4259, "step": 41100 }, { "epoch": 0.000412, - "grad_norm": 2.337045192718506, + "grad_norm": 2.3923439979553223, "learning_rate": 1e-05, - "loss": 0.4307, + "loss": 0.4296, "step": 41200 }, { "epoch": 0.000413, - "grad_norm": 2.3791816234588623, + "grad_norm": 2.7505736351013184, "learning_rate": 1e-05, - "loss": 0.4253, + "loss": 0.4265, "step": 41300 }, { "epoch": 0.000414, - "grad_norm": 2.5770909786224365, + "grad_norm": 2.2666115760803223, "learning_rate": 1e-05, - "loss": 0.4329, + "loss": 0.4339, "step": 41400 }, { "epoch": 0.000415, - "grad_norm": 2.10221529006958, + "grad_norm": 2.128662586212158, "learning_rate": 1e-05, - "loss": 0.4284, + "loss": 0.4279, "step": 41500 }, { "epoch": 0.000416, - "grad_norm": 2.1052303314208984, + "grad_norm": 2.4806056022644043, "learning_rate": 1e-05, - "loss": 0.4317, + "loss": 0.4335, "step": 41600 }, { "epoch": 0.000417, - "grad_norm": 2.35298228263855, + "grad_norm": 2.5022566318511963, "learning_rate": 1e-05, - "loss": 0.4336, + "loss": 0.4321, "step": 41700 }, { "epoch": 0.000418, - "grad_norm": 2.4934685230255127, + "grad_norm": 2.682896137237549, "learning_rate": 1e-05, - "loss": 0.4365, + "loss": 0.4373, "step": 41800 }, { "epoch": 0.000419, - "grad_norm": 2.944164752960205, + "grad_norm": 2.7449374198913574, "learning_rate": 1e-05, - "loss": 0.4359, + "loss": 0.4362, "step": 41900 }, { "epoch": 0.00042, - "grad_norm": 1.9120829105377197, + "grad_norm": 2.0306496620178223, "learning_rate": 1e-05, - "loss": 0.4268, + "loss": 0.4264, "step": 42000 }, { "epoch": 0.000421, - "grad_norm": 2.0716164112091064, + "grad_norm": 2.0226821899414062, "learning_rate": 1e-05, - "loss": 0.4282, + "loss": 0.4301, "step": 42100 }, { "epoch": 0.000422, - "grad_norm": 2.3713035583496094, + "grad_norm": 2.372490406036377, "learning_rate": 1e-05, - "loss": 0.4276, + "loss": 0.429, "step": 42200 }, { "epoch": 0.000423, - "grad_norm": 2.1499955654144287, + "grad_norm": 2.4113259315490723, "learning_rate": 1e-05, - "loss": 0.4327, + "loss": 0.4347, "step": 42300 }, { "epoch": 0.000424, - "grad_norm": 2.3510193824768066, + "grad_norm": 2.3437299728393555, "learning_rate": 1e-05, - "loss": 0.4222, + "loss": 0.4204, "step": 42400 }, { "epoch": 0.000425, - "grad_norm": 2.3718671798706055, + "grad_norm": 2.460440158843994, "learning_rate": 1e-05, - "loss": 0.4211, + "loss": 0.4195, "step": 42500 }, { "epoch": 0.000426, - "grad_norm": 2.2237861156463623, + "grad_norm": 2.3652024269104004, "learning_rate": 1e-05, - "loss": 0.4192, + "loss": 0.418, "step": 42600 }, { "epoch": 0.000427, - "grad_norm": 2.2722647190093994, + "grad_norm": 2.48496150970459, "learning_rate": 1e-05, - "loss": 0.4247, + "loss": 0.4263, "step": 42700 }, { "epoch": 0.000428, - "grad_norm": 2.3696279525756836, + "grad_norm": 2.4841647148132324, "learning_rate": 1e-05, - "loss": 0.4267, + "loss": 0.4259, "step": 42800 }, { "epoch": 0.000429, - "grad_norm": 2.4816243648529053, + "grad_norm": 2.573284149169922, "learning_rate": 1e-05, - "loss": 0.4296, + "loss": 0.4302, "step": 42900 }, { "epoch": 0.00043, - "grad_norm": 1.9000905752182007, + "grad_norm": 1.9319133758544922, "learning_rate": 1e-05, - "loss": 0.4229, + "loss": 0.4227, "step": 43000 }, { "epoch": 0.000431, - "grad_norm": 2.2765355110168457, + "grad_norm": 2.4806172847747803, "learning_rate": 1e-05, - "loss": 0.4323, + "loss": 0.4325, "step": 43100 }, { "epoch": 0.000432, - "grad_norm": 2.29018235206604, + "grad_norm": 2.4880504608154297, "learning_rate": 1e-05, - "loss": 0.4227, + "loss": 0.4229, "step": 43200 }, { "epoch": 0.000433, - "grad_norm": 2.633532762527466, + "grad_norm": 2.395817995071411, "learning_rate": 1e-05, - "loss": 0.4297, + "loss": 0.43, "step": 43300 }, { "epoch": 0.000434, - "grad_norm": 2.5616657733917236, + "grad_norm": 2.6356828212738037, "learning_rate": 1e-05, - "loss": 0.4165, + "loss": 0.4207, "step": 43400 }, { "epoch": 0.000435, - "grad_norm": 2.310901165008545, + "grad_norm": 2.1054494380950928, "learning_rate": 1e-05, - "loss": 0.4275, + "loss": 0.4284, "step": 43500 }, { "epoch": 0.000436, - "grad_norm": 2.2478537559509277, + "grad_norm": 2.3987913131713867, "learning_rate": 1e-05, - "loss": 0.4253, + "loss": 0.4267, "step": 43600 }, { "epoch": 0.000437, - "grad_norm": 2.416546583175659, + "grad_norm": 2.606956720352173, "learning_rate": 1e-05, - "loss": 0.4219, + "loss": 0.4203, "step": 43700 }, { "epoch": 0.000438, - "grad_norm": 2.506760835647583, + "grad_norm": 2.5072622299194336, "learning_rate": 1e-05, - "loss": 0.4235, + "loss": 0.4276, "step": 43800 }, { "epoch": 0.000439, - "grad_norm": 2.302083969116211, + "grad_norm": 2.300851345062256, "learning_rate": 1e-05, - "loss": 0.4255, + "loss": 0.427, "step": 43900 }, { "epoch": 0.00044, - "grad_norm": 2.447115659713745, + "grad_norm": 2.514756441116333, "learning_rate": 1e-05, - "loss": 0.4238, + "loss": 0.4233, "step": 44000 }, { "epoch": 0.000441, - "grad_norm": 2.3530352115631104, + "grad_norm": 2.5738296508789062, "learning_rate": 1e-05, - "loss": 0.423, + "loss": 0.4255, "step": 44100 }, { "epoch": 0.000442, - "grad_norm": 2.038248300552368, + "grad_norm": 2.324410915374756, "learning_rate": 1e-05, - "loss": 0.418, + "loss": 0.4225, "step": 44200 }, { "epoch": 0.000443, - "grad_norm": 2.4170238971710205, + "grad_norm": 2.1797661781311035, "learning_rate": 1e-05, - "loss": 0.4211, + "loss": 0.4204, "step": 44300 }, { "epoch": 0.000444, - "grad_norm": 2.5111351013183594, + "grad_norm": 2.667961359024048, "learning_rate": 1e-05, - "loss": 0.4289, + "loss": 0.4298, "step": 44400 }, { "epoch": 0.000445, - "grad_norm": 2.4096994400024414, + "grad_norm": 2.6222057342529297, "learning_rate": 1e-05, - "loss": 0.4141, + "loss": 0.4159, "step": 44500 }, { "epoch": 0.000446, - "grad_norm": 2.4049484729766846, + "grad_norm": 2.4339888095855713, "learning_rate": 1e-05, - "loss": 0.4276, + "loss": 0.4298, "step": 44600 }, { "epoch": 0.000447, - "grad_norm": 2.311317205429077, + "grad_norm": 2.394127368927002, "learning_rate": 1e-05, - "loss": 0.4259, + "loss": 0.4299, "step": 44700 }, { "epoch": 0.000448, - "grad_norm": 2.1640536785125732, + "grad_norm": 2.3612658977508545, "learning_rate": 1e-05, - "loss": 0.4282, + "loss": 0.4285, "step": 44800 }, { "epoch": 0.000449, - "grad_norm": 2.4751601219177246, + "grad_norm": 2.4719297885894775, "learning_rate": 1e-05, - "loss": 0.4217, + "loss": 0.4218, "step": 44900 }, { "epoch": 0.00045, - "grad_norm": 1.922454833984375, + "grad_norm": 2.1661250591278076, "learning_rate": 1e-05, - "loss": 0.4201, + "loss": 0.4213, "step": 45000 }, { "epoch": 0.000451, - "grad_norm": 2.265232801437378, + "grad_norm": 2.559985637664795, "learning_rate": 1e-05, - "loss": 0.4276, + "loss": 0.4289, "step": 45100 }, { "epoch": 0.000452, - "grad_norm": 2.061838150024414, + "grad_norm": 2.452289342880249, "learning_rate": 1e-05, - "loss": 0.4273, + "loss": 0.4257, "step": 45200 }, { "epoch": 0.000453, - "grad_norm": 1.9958899021148682, + "grad_norm": 2.0307326316833496, "learning_rate": 1e-05, - "loss": 0.4194, + "loss": 0.4229, "step": 45300 }, { "epoch": 0.000454, - "grad_norm": 1.997963547706604, + "grad_norm": 2.489323377609253, "learning_rate": 1e-05, - "loss": 0.4276, + "loss": 0.4269, "step": 45400 }, { "epoch": 0.000455, - "grad_norm": 2.6008551120758057, + "grad_norm": 2.5684876441955566, "learning_rate": 1e-05, - "loss": 0.4163, + "loss": 0.4173, "step": 45500 }, { "epoch": 0.000456, - "grad_norm": 2.026224136352539, + "grad_norm": 2.4012162685394287, "learning_rate": 1e-05, - "loss": 0.4152, + "loss": 0.4139, "step": 45600 }, { "epoch": 0.000457, - "grad_norm": 2.4276435375213623, + "grad_norm": 2.4833133220672607, "learning_rate": 1e-05, - "loss": 0.4248, + "loss": 0.4233, "step": 45700 }, { "epoch": 0.000458, - "grad_norm": 2.297520160675049, + "grad_norm": 2.4413490295410156, "learning_rate": 1e-05, - "loss": 0.4065, + "loss": 0.4089, "step": 45800 }, { "epoch": 0.000459, - "grad_norm": 2.1715235710144043, + "grad_norm": 2.192959785461426, "learning_rate": 1e-05, - "loss": 0.4163, + "loss": 0.4156, "step": 45900 }, { "epoch": 0.00046, - "grad_norm": 2.2651405334472656, + "grad_norm": 2.3064184188842773, "learning_rate": 1e-05, - "loss": 0.4205, + "loss": 0.4187, "step": 46000 }, { "epoch": 0.000461, - "grad_norm": 2.348522901535034, + "grad_norm": 2.4842922687530518, "learning_rate": 1e-05, "loss": 0.4131, "step": 46100 }, { "epoch": 0.000462, - "grad_norm": 2.101867914199829, + "grad_norm": 2.074312925338745, "learning_rate": 1e-05, - "loss": 0.4262, + "loss": 0.4265, "step": 46200 }, { "epoch": 0.000463, - "grad_norm": 2.4691214561462402, + "grad_norm": 2.4513862133026123, "learning_rate": 1e-05, - "loss": 0.4158, + "loss": 0.4153, "step": 46300 }, { "epoch": 0.000464, - "grad_norm": 2.0133414268493652, + "grad_norm": 2.21403431892395, "learning_rate": 1e-05, - "loss": 0.4161, + "loss": 0.417, "step": 46400 }, { "epoch": 0.000465, - "grad_norm": 4.3807053565979, + "grad_norm": 2.469252109527588, "learning_rate": 1e-05, - "loss": 0.4206, + "loss": 0.4181, "step": 46500 }, { "epoch": 0.000466, - "grad_norm": 2.314570665359497, + "grad_norm": 2.3535096645355225, "learning_rate": 1e-05, - "loss": 0.4145, + "loss": 0.4159, "step": 46600 }, { "epoch": 0.000467, - "grad_norm": 2.0983941555023193, + "grad_norm": 2.8495819568634033, "learning_rate": 1e-05, - "loss": 0.4089, + "loss": 0.4118, "step": 46700 }, { "epoch": 0.000468, - "grad_norm": 2.184023141860962, + "grad_norm": 2.2924575805664062, "learning_rate": 1e-05, - "loss": 0.4215, + "loss": 0.4221, "step": 46800 }, { "epoch": 0.000469, - "grad_norm": 2.104890823364258, + "grad_norm": 2.1148035526275635, "learning_rate": 1e-05, - "loss": 0.4228, + "loss": 0.4221, "step": 46900 }, { "epoch": 0.00047, - "grad_norm": 2.066441059112549, + "grad_norm": 2.5257456302642822, "learning_rate": 1e-05, - "loss": 0.4128, + "loss": 0.4129, "step": 47000 }, { "epoch": 0.000471, - "grad_norm": 2.2610368728637695, + "grad_norm": 2.734550714492798, "learning_rate": 1e-05, - "loss": 0.4237, + "loss": 0.4214, "step": 47100 }, { "epoch": 0.000472, - "grad_norm": 2.2063634395599365, + "grad_norm": 2.844151496887207, "learning_rate": 1e-05, - "loss": 0.4132, + "loss": 0.4133, "step": 47200 }, { "epoch": 0.000473, - "grad_norm": 2.125624179840088, + "grad_norm": 2.2594943046569824, "learning_rate": 1e-05, - "loss": 0.4129, + "loss": 0.4154, "step": 47300 }, { "epoch": 0.000474, - "grad_norm": 2.176231622695923, + "grad_norm": 2.037102699279785, "learning_rate": 1e-05, - "loss": 0.4118, + "loss": 0.4095, "step": 47400 }, { "epoch": 0.000475, - "grad_norm": 3.6416099071502686, + "grad_norm": 2.472301483154297, "learning_rate": 1e-05, - "loss": 0.4152, + "loss": 0.4156, "step": 47500 }, { "epoch": 0.000476, - "grad_norm": 2.1828925609588623, + "grad_norm": 2.0751333236694336, "learning_rate": 1e-05, - "loss": 0.42, + "loss": 0.4196, "step": 47600 }, { "epoch": 0.000477, - "grad_norm": 2.131887912750244, + "grad_norm": 2.197103977203369, "learning_rate": 1e-05, - "loss": 0.4173, + "loss": 0.4129, "step": 47700 }, { "epoch": 0.000478, - "grad_norm": 3.0276741981506348, + "grad_norm": 2.5141637325286865, "learning_rate": 1e-05, - "loss": 0.4075, + "loss": 0.4085, "step": 47800 }, { "epoch": 0.000479, - "grad_norm": 2.068551540374756, + "grad_norm": 2.441049337387085, "learning_rate": 1e-05, - "loss": 0.4126, + "loss": 0.4137, "step": 47900 }, { "epoch": 0.00048, - "grad_norm": 2.490746259689331, + "grad_norm": 2.2101807594299316, "learning_rate": 1e-05, - "loss": 0.4128, + "loss": 0.415, "step": 48000 }, { "epoch": 0.000481, - "grad_norm": 1.9956814050674438, + "grad_norm": 2.235775947570801, "learning_rate": 1e-05, - "loss": 0.4167, + "loss": 0.4169, "step": 48100 }, { "epoch": 0.000482, - "grad_norm": 2.415469169616699, + "grad_norm": 2.0968542098999023, "learning_rate": 1e-05, - "loss": 0.4129, + "loss": 0.4131, "step": 48200 }, { "epoch": 0.000483, - "grad_norm": 2.2937676906585693, + "grad_norm": 2.1529128551483154, "learning_rate": 1e-05, - "loss": 0.4183, + "loss": 0.42, "step": 48300 }, { "epoch": 0.000484, - "grad_norm": 2.287675619125366, + "grad_norm": 2.2251384258270264, "learning_rate": 1e-05, - "loss": 0.417, + "loss": 0.4153, "step": 48400 }, { "epoch": 0.000485, - "grad_norm": 2.3487749099731445, + "grad_norm": 2.5039467811584473, "learning_rate": 1e-05, - "loss": 0.4105, + "loss": 0.4133, "step": 48500 }, { "epoch": 0.000486, - "grad_norm": 2.385653257369995, + "grad_norm": 2.3165180683135986, "learning_rate": 1e-05, - "loss": 0.4109, + "loss": 0.4104, "step": 48600 }, { "epoch": 0.000487, - "grad_norm": 2.3248510360717773, + "grad_norm": 2.2625648975372314, "learning_rate": 1e-05, - "loss": 0.4041, + "loss": 0.4047, "step": 48700 }, { "epoch": 0.000488, - "grad_norm": 2.2816834449768066, + "grad_norm": 2.4777987003326416, "learning_rate": 1e-05, - "loss": 0.4084, + "loss": 0.4112, "step": 48800 }, { "epoch": 0.000489, - "grad_norm": 2.0432040691375732, + "grad_norm": 2.794090986251831, "learning_rate": 1e-05, - "loss": 0.4138, + "loss": 0.4143, "step": 48900 }, { "epoch": 0.00049, - "grad_norm": 2.278571605682373, + "grad_norm": 2.3887550830841064, "learning_rate": 1e-05, - "loss": 0.4154, + "loss": 0.4172, "step": 49000 }, { "epoch": 0.000491, - "grad_norm": 2.3925750255584717, + "grad_norm": 2.0181326866149902, "learning_rate": 1e-05, - "loss": 0.4155, + "loss": 0.4147, "step": 49100 }, { "epoch": 0.000492, - "grad_norm": 1.9109994173049927, + "grad_norm": 2.037066698074341, "learning_rate": 1e-05, - "loss": 0.4071, + "loss": 0.4079, "step": 49200 }, { "epoch": 0.000493, - "grad_norm": 2.457143545150757, + "grad_norm": 2.349827289581299, "learning_rate": 1e-05, - "loss": 0.4172, + "loss": 0.4203, "step": 49300 }, { "epoch": 0.000494, - "grad_norm": 2.347390651702881, + "grad_norm": 2.35591459274292, "learning_rate": 1e-05, - "loss": 0.4089, + "loss": 0.4096, "step": 49400 }, { "epoch": 0.000495, - "grad_norm": 2.082728147506714, + "grad_norm": 2.994199752807617, "learning_rate": 1e-05, - "loss": 0.4164, + "loss": 0.4171, "step": 49500 }, { "epoch": 0.000496, - "grad_norm": 2.3399882316589355, + "grad_norm": 2.415408134460449, "learning_rate": 1e-05, - "loss": 0.4024, + "loss": 0.4027, "step": 49600 }, { "epoch": 0.000497, - "grad_norm": 2.22118878364563, + "grad_norm": 2.205004930496216, "learning_rate": 1e-05, - "loss": 0.4219, + "loss": 0.4208, "step": 49700 }, { "epoch": 0.000498, - "grad_norm": 2.1258790493011475, + "grad_norm": 2.1636242866516113, "learning_rate": 1e-05, - "loss": 0.4123, + "loss": 0.4131, "step": 49800 }, { "epoch": 0.000499, - "grad_norm": 2.5490922927856445, + "grad_norm": 2.4507057666778564, "learning_rate": 1e-05, - "loss": 0.4132, + "loss": 0.4135, "step": 49900 }, { "epoch": 0.0005, - "grad_norm": 2.1127679347991943, + "grad_norm": 2.1614506244659424, "learning_rate": 1e-05, - "loss": 0.408, + "loss": 0.41, "step": 50000 }, { "epoch": 0.000501, - "grad_norm": 2.2153799533843994, + "grad_norm": 2.070063591003418, "learning_rate": 1e-05, - "loss": 0.4093, + "loss": 0.4089, "step": 50100 }, { "epoch": 0.000502, - "grad_norm": 2.2161195278167725, + "grad_norm": 2.339935779571533, "learning_rate": 1e-05, - "loss": 0.4134, + "loss": 0.4115, "step": 50200 }, { "epoch": 0.000503, - "grad_norm": 2.2163915634155273, + "grad_norm": 2.25191330909729, "learning_rate": 1e-05, - "loss": 0.4047, + "loss": 0.4064, "step": 50300 }, { "epoch": 0.000504, - "grad_norm": 2.15307879447937, + "grad_norm": 2.198077440261841, "learning_rate": 1e-05, - "loss": 0.4056, + "loss": 0.4049, "step": 50400 }, { "epoch": 0.000505, - "grad_norm": 2.1074087619781494, + "grad_norm": 2.4001047611236572, "learning_rate": 1e-05, - "loss": 0.4081, + "loss": 0.4146, "step": 50500 }, { "epoch": 0.000506, - "grad_norm": 2.4118545055389404, + "grad_norm": 2.311879873275757, "learning_rate": 1e-05, - "loss": 0.4148, + "loss": 0.4149, "step": 50600 }, { "epoch": 0.000507, - "grad_norm": 2.153163433074951, + "grad_norm": 2.249931573867798, "learning_rate": 1e-05, - "loss": 0.4064, + "loss": 0.4091, "step": 50700 }, { "epoch": 0.000508, - "grad_norm": 2.0141372680664062, + "grad_norm": 1.957440733909607, "learning_rate": 1e-05, - "loss": 0.4066, + "loss": 0.407, "step": 50800 }, { "epoch": 0.000509, - "grad_norm": 2.160839557647705, + "grad_norm": 2.3248322010040283, "learning_rate": 1e-05, - "loss": 0.4115, + "loss": 0.4138, "step": 50900 }, { "epoch": 0.00051, - "grad_norm": 2.2688865661621094, + "grad_norm": 2.2417356967926025, "learning_rate": 1e-05, - "loss": 0.4076, + "loss": 0.4075, "step": 51000 }, { "epoch": 0.000511, - "grad_norm": 2.292311429977417, + "grad_norm": 2.2850210666656494, "learning_rate": 1e-05, - "loss": 0.4105, + "loss": 0.4148, "step": 51100 }, { "epoch": 0.000512, - "grad_norm": 2.381415367126465, + "grad_norm": 2.3242995738983154, "learning_rate": 1e-05, - "loss": 0.4108, + "loss": 0.4134, "step": 51200 }, { "epoch": 0.000513, - "grad_norm": 2.106940507888794, + "grad_norm": 2.0709969997406006, "learning_rate": 1e-05, - "loss": 0.419, + "loss": 0.4196, "step": 51300 }, { "epoch": 0.000514, - "grad_norm": 2.2637765407562256, + "grad_norm": 2.149703025817871, "learning_rate": 1e-05, - "loss": 0.4099, + "loss": 0.4105, "step": 51400 }, { "epoch": 0.000515, - "grad_norm": 2.2846248149871826, + "grad_norm": 2.2497308254241943, "learning_rate": 1e-05, - "loss": 0.4075, + "loss": 0.4098, "step": 51500 }, { "epoch": 0.000516, - "grad_norm": 1.9198349714279175, + "grad_norm": 2.0328240394592285, "learning_rate": 1e-05, - "loss": 0.4053, + "loss": 0.406, "step": 51600 }, { "epoch": 0.000517, - "grad_norm": 2.204289674758911, + "grad_norm": 2.052591562271118, "learning_rate": 1e-05, - "loss": 0.4093, + "loss": 0.4086, "step": 51700 }, { "epoch": 0.000518, - "grad_norm": 1.9825174808502197, + "grad_norm": 2.353180170059204, "learning_rate": 1e-05, - "loss": 0.4023, + "loss": 0.4058, "step": 51800 }, { "epoch": 0.000519, - "grad_norm": 2.1780731678009033, + "grad_norm": 2.352935791015625, "learning_rate": 1e-05, - "loss": 0.4039, + "loss": 0.4057, "step": 51900 }, { "epoch": 0.00052, - "grad_norm": 2.4121592044830322, + "grad_norm": 2.1475372314453125, "learning_rate": 1e-05, - "loss": 0.4033, + "loss": 0.4043, "step": 52000 }, { "epoch": 0.000521, - "grad_norm": 2.2982704639434814, + "grad_norm": 2.2819299697875977, "learning_rate": 1e-05, - "loss": 0.4066, + "loss": 0.4079, "step": 52100 }, { "epoch": 0.000522, - "grad_norm": 1.9331947565078735, + "grad_norm": 2.23323392868042, "learning_rate": 1e-05, - "loss": 0.4102, + "loss": 0.4142, "step": 52200 }, { "epoch": 0.000523, - "grad_norm": 2.186981678009033, + "grad_norm": 2.1115095615386963, "learning_rate": 1e-05, - "loss": 0.4079, + "loss": 0.4091, "step": 52300 }, { "epoch": 0.000524, - "grad_norm": 2.0239098072052, + "grad_norm": 2.34243106842041, "learning_rate": 1e-05, - "loss": 0.4018, + "loss": 0.4028, "step": 52400 }, { "epoch": 0.000525, - "grad_norm": 2.099689483642578, + "grad_norm": 2.1626434326171875, "learning_rate": 1e-05, - "loss": 0.4039, + "loss": 0.4042, "step": 52500 }, { "epoch": 0.000526, - "grad_norm": 2.020476818084717, + "grad_norm": 2.283756732940674, "learning_rate": 1e-05, - "loss": 0.4026, + "loss": 0.4068, "step": 52600 }, { "epoch": 0.000527, - "grad_norm": 2.083711862564087, + "grad_norm": 2.1026229858398438, "learning_rate": 1e-05, - "loss": 0.409, + "loss": 0.4082, "step": 52700 }, { "epoch": 0.000528, - "grad_norm": 2.100088357925415, + "grad_norm": 2.1464221477508545, "learning_rate": 1e-05, - "loss": 0.4068, + "loss": 0.4093, "step": 52800 }, { "epoch": 0.000529, - "grad_norm": 2.005267858505249, + "grad_norm": 2.1100659370422363, "learning_rate": 1e-05, - "loss": 0.4043, + "loss": 0.4049, "step": 52900 }, { "epoch": 0.00053, - "grad_norm": 1.9873567819595337, + "grad_norm": 2.06082820892334, "learning_rate": 1e-05, - "loss": 0.414, + "loss": 0.4174, "step": 53000 }, { "epoch": 0.000531, - "grad_norm": 2.1579301357269287, + "grad_norm": 2.226346492767334, "learning_rate": 1e-05, - "loss": 0.4063, + "loss": 0.4084, "step": 53100 }, { "epoch": 0.000532, - "grad_norm": 2.211074113845825, + "grad_norm": 2.432999849319458, "learning_rate": 1e-05, - "loss": 0.4041, + "loss": 0.406, "step": 53200 }, { "epoch": 0.000533, - "grad_norm": 2.3211870193481445, + "grad_norm": 2.4239957332611084, "learning_rate": 1e-05, - "loss": 0.4065, + "loss": 0.4073, "step": 53300 }, { "epoch": 0.000534, - "grad_norm": 1.9049533605575562, + "grad_norm": 2.08341646194458, "learning_rate": 1e-05, - "loss": 0.4091, + "loss": 0.4092, "step": 53400 }, { "epoch": 0.000535, - "grad_norm": 2.27285099029541, + "grad_norm": 2.4562456607818604, "learning_rate": 1e-05, - "loss": 0.3977, + "loss": 0.3995, "step": 53500 }, { "epoch": 0.000536, - "grad_norm": 2.191856622695923, + "grad_norm": 2.1700892448425293, "learning_rate": 1e-05, - "loss": 0.4109, + "loss": 0.4107, "step": 53600 }, { "epoch": 0.000537, - "grad_norm": 2.0125203132629395, + "grad_norm": 2.071171998977661, "learning_rate": 1e-05, - "loss": 0.4063, + "loss": 0.4085, "step": 53700 }, { "epoch": 0.000538, - "grad_norm": 1.902809739112854, + "grad_norm": 1.977064847946167, "learning_rate": 1e-05, - "loss": 0.4056, + "loss": 0.4046, "step": 53800 }, { "epoch": 0.000539, - "grad_norm": 2.420093059539795, + "grad_norm": 4.314730644226074, "learning_rate": 1e-05, - "loss": 0.4008, + "loss": 0.4023, "step": 53900 }, { "epoch": 0.00054, - "grad_norm": 2.247391939163208, + "grad_norm": 2.5248172283172607, "learning_rate": 1e-05, - "loss": 0.3973, + "loss": 0.3983, "step": 54000 }, { "epoch": 0.000541, - "grad_norm": 1.9090715646743774, + "grad_norm": 2.1219537258148193, "learning_rate": 1e-05, - "loss": 0.4073, + "loss": 0.4068, "step": 54100 }, { "epoch": 0.000542, - "grad_norm": 2.2643890380859375, + "grad_norm": 2.4074840545654297, "learning_rate": 1e-05, - "loss": 0.4024, + "loss": 0.4029, "step": 54200 }, { "epoch": 0.000543, - "grad_norm": 2.251248598098755, + "grad_norm": 2.462904930114746, "learning_rate": 1e-05, - "loss": 0.4104, + "loss": 0.4082, "step": 54300 }, { "epoch": 0.000544, - "grad_norm": 1.9576762914657593, + "grad_norm": 2.5849449634552, "learning_rate": 1e-05, - "loss": 0.3971, + "loss": 0.3998, "step": 54400 }, { "epoch": 0.000545, - "grad_norm": 2.20403790473938, + "grad_norm": 2.1051547527313232, "learning_rate": 1e-05, - "loss": 0.4019, + "loss": 0.4018, "step": 54500 }, { "epoch": 0.000546, - "grad_norm": 2.268015146255493, + "grad_norm": 2.4176714420318604, "learning_rate": 1e-05, - "loss": 0.3954, + "loss": 0.3965, "step": 54600 }, { "epoch": 0.000547, - "grad_norm": 2.130234718322754, + "grad_norm": 2.1228177547454834, "learning_rate": 1e-05, - "loss": 0.3983, + "loss": 0.4007, "step": 54700 }, { "epoch": 0.000548, - "grad_norm": 2.00286865234375, + "grad_norm": 2.0286078453063965, "learning_rate": 1e-05, - "loss": 0.4026, + "loss": 0.402, "step": 54800 }, { "epoch": 0.000549, - "grad_norm": 2.3837525844573975, + "grad_norm": 2.300497531890869, "learning_rate": 1e-05, - "loss": 0.4089, + "loss": 0.4084, "step": 54900 }, { "epoch": 0.00055, - "grad_norm": 1.8765068054199219, + "grad_norm": 2.1815927028656006, "learning_rate": 1e-05, - "loss": 0.4014, + "loss": 0.4031, "step": 55000 }, { "epoch": 0.000551, - "grad_norm": 2.2745885848999023, + "grad_norm": 2.347383975982666, "learning_rate": 1e-05, - "loss": 0.3994, + "loss": 0.402, "step": 55100 }, { "epoch": 0.000552, - "grad_norm": 2.190394639968872, + "grad_norm": 2.059412717819214, "learning_rate": 1e-05, - "loss": 0.4141, + "loss": 0.4169, "step": 55200 }, { "epoch": 0.000553, - "grad_norm": 2.024251937866211, + "grad_norm": 2.089460849761963, "learning_rate": 1e-05, - "loss": 0.4032, + "loss": 0.4059, "step": 55300 }, { "epoch": 0.000554, - "grad_norm": 2.6340456008911133, + "grad_norm": 2.608187675476074, "learning_rate": 1e-05, - "loss": 0.4052, + "loss": 0.4059, "step": 55400 }, { "epoch": 0.000555, - "grad_norm": 2.271549701690674, + "grad_norm": 2.468566656112671, "learning_rate": 1e-05, "loss": 0.4007, "step": 55500 }, { "epoch": 0.000556, - "grad_norm": 1.9378595352172852, + "grad_norm": 2.740276336669922, "learning_rate": 1e-05, - "loss": 0.4092, + "loss": 0.4099, "step": 55600 }, { "epoch": 0.000557, - "grad_norm": 2.367792844772339, + "grad_norm": 2.447087526321411, "learning_rate": 1e-05, - "loss": 0.4128, + "loss": 0.4157, "step": 55700 }, { "epoch": 0.000558, - "grad_norm": 2.333998680114746, + "grad_norm": 2.1900322437286377, "learning_rate": 1e-05, - "loss": 0.3986, + "loss": 0.4018, "step": 55800 }, { "epoch": 0.000559, - "grad_norm": 2.1202468872070312, + "grad_norm": 2.332939386367798, "learning_rate": 1e-05, - "loss": 0.3926, + "loss": 0.3949, "step": 55900 }, { "epoch": 0.00056, - "grad_norm": 2.0503971576690674, + "grad_norm": 2.050628900527954, "learning_rate": 1e-05, - "loss": 0.4031, + "loss": 0.4062, "step": 56000 }, { "epoch": 0.000561, - "grad_norm": 2.037123680114746, + "grad_norm": 2.101712226867676, "learning_rate": 1e-05, - "loss": 0.401, + "loss": 0.4014, "step": 56100 }, { "epoch": 0.000562, - "grad_norm": 2.185070276260376, + "grad_norm": 2.093705177307129, "learning_rate": 1e-05, - "loss": 0.3943, + "loss": 0.3951, "step": 56200 }, { "epoch": 0.000563, - "grad_norm": 2.079923629760742, + "grad_norm": 2.02903413772583, "learning_rate": 1e-05, - "loss": 0.4074, + "loss": 0.4059, "step": 56300 }, { "epoch": 0.000564, - "grad_norm": 2.0761780738830566, + "grad_norm": 2.0588796138763428, "learning_rate": 1e-05, - "loss": 0.4069, + "loss": 0.4104, "step": 56400 }, { "epoch": 0.000565, - "grad_norm": 2.000859498977661, + "grad_norm": 1.968138575553894, "learning_rate": 1e-05, - "loss": 0.3928, + "loss": 0.3955, "step": 56500 }, { "epoch": 0.000566, - "grad_norm": 1.9323351383209229, + "grad_norm": 2.0863802433013916, "learning_rate": 1e-05, - "loss": 0.4054, + "loss": 0.4056, "step": 56600 }, { "epoch": 0.000567, - "grad_norm": 2.0383734703063965, + "grad_norm": 2.0999319553375244, "learning_rate": 1e-05, - "loss": 0.3981, + "loss": 0.3999, "step": 56700 }, { "epoch": 0.000568, - "grad_norm": 1.893697738647461, + "grad_norm": 2.2497940063476562, "learning_rate": 1e-05, - "loss": 0.3941, + "loss": 0.3944, "step": 56800 }, { "epoch": 0.000569, - "grad_norm": 2.3345186710357666, + "grad_norm": 2.327509880065918, "learning_rate": 1e-05, - "loss": 0.3995, + "loss": 0.4045, "step": 56900 }, { "epoch": 0.00057, - "grad_norm": 2.351663589477539, + "grad_norm": 1.9509259462356567, "learning_rate": 1e-05, - "loss": 0.3921, + "loss": 0.3937, "step": 57000 }, { "epoch": 0.000571, - "grad_norm": 2.0262696743011475, + "grad_norm": 1.9733527898788452, "learning_rate": 1e-05, - "loss": 0.3995, + "loss": 0.3994, "step": 57100 }, { "epoch": 0.000572, - "grad_norm": 2.0367591381073, + "grad_norm": 2.3149795532226562, "learning_rate": 1e-05, - "loss": 0.3958, + "loss": 0.3964, "step": 57200 }, { "epoch": 0.000573, - "grad_norm": 2.16660213470459, + "grad_norm": 2.2869510650634766, "learning_rate": 1e-05, - "loss": 0.3957, + "loss": 0.3972, "step": 57300 }, { "epoch": 0.000574, - "grad_norm": 2.2552976608276367, + "grad_norm": 2.807288885116577, "learning_rate": 1e-05, - "loss": 0.397, + "loss": 0.3979, "step": 57400 }, { "epoch": 0.000575, - "grad_norm": 1.951948642730713, + "grad_norm": 1.9130806922912598, "learning_rate": 1e-05, - "loss": 0.3892, + "loss": 0.3909, "step": 57500 }, { "epoch": 0.000576, - "grad_norm": 2.2208938598632812, + "grad_norm": 2.392228841781616, "learning_rate": 1e-05, - "loss": 0.4077, + "loss": 0.4081, "step": 57600 }, { "epoch": 0.000577, - "grad_norm": 2.118006467819214, + "grad_norm": 2.2016382217407227, "learning_rate": 1e-05, - "loss": 0.3951, + "loss": 0.3942, "step": 57700 }, { "epoch": 0.000578, - "grad_norm": 2.1072921752929688, + "grad_norm": 1.9153637886047363, "learning_rate": 1e-05, - "loss": 0.3878, + "loss": 0.3856, "step": 57800 }, { "epoch": 0.000579, - "grad_norm": 2.334207773208618, + "grad_norm": 2.334127902984619, "learning_rate": 1e-05, - "loss": 0.3987, + "loss": 0.4011, "step": 57900 }, { "epoch": 0.00058, - "grad_norm": 1.9321773052215576, + "grad_norm": 2.0389389991760254, "learning_rate": 1e-05, - "loss": 0.3975, + "loss": 0.3964, "step": 58000 }, { "epoch": 0.000581, - "grad_norm": 1.7466888427734375, + "grad_norm": 1.817014217376709, "learning_rate": 1e-05, - "loss": 0.4016, + "loss": 0.401, "step": 58100 }, { "epoch": 0.000582, - "grad_norm": 2.1885852813720703, + "grad_norm": 2.2769718170166016, "learning_rate": 1e-05, - "loss": 0.4005, + "loss": 0.4025, "step": 58200 }, { "epoch": 0.000583, - "grad_norm": 1.9729061126708984, + "grad_norm": 2.2681713104248047, "learning_rate": 1e-05, - "loss": 0.3972, + "loss": 0.3998, "step": 58300 }, { "epoch": 0.000584, - "grad_norm": 2.543989896774292, + "grad_norm": 2.0518765449523926, "learning_rate": 1e-05, - "loss": 0.3925, + "loss": 0.3958, "step": 58400 }, { "epoch": 0.000585, - "grad_norm": 2.0926578044891357, + "grad_norm": 2.0787107944488525, "learning_rate": 1e-05, - "loss": 0.3967, + "loss": 0.3979, "step": 58500 }, { "epoch": 0.000586, - "grad_norm": 2.019137382507324, + "grad_norm": 2.2114005088806152, "learning_rate": 1e-05, - "loss": 0.3951, + "loss": 0.3953, "step": 58600 }, { "epoch": 0.000587, - "grad_norm": 2.1298978328704834, + "grad_norm": 2.2382404804229736, "learning_rate": 1e-05, - "loss": 0.4019, + "loss": 0.4011, "step": 58700 }, { "epoch": 0.000588, - "grad_norm": 2.1271700859069824, + "grad_norm": 2.2104434967041016, "learning_rate": 1e-05, - "loss": 0.3896, + "loss": 0.3897, "step": 58800 }, { "epoch": 0.000589, - "grad_norm": 2.1552135944366455, + "grad_norm": 2.5010359287261963, "learning_rate": 1e-05, - "loss": 0.3938, + "loss": 0.3929, "step": 58900 }, { "epoch": 0.00059, - "grad_norm": 2.2406394481658936, + "grad_norm": 2.6456377506256104, "learning_rate": 1e-05, - "loss": 0.404, + "loss": 0.4048, "step": 59000 }, { "epoch": 0.000591, - "grad_norm": 2.035569667816162, + "grad_norm": 2.2201075553894043, "learning_rate": 1e-05, - "loss": 0.3995, + "loss": 0.4011, "step": 59100 }, { "epoch": 0.000592, - "grad_norm": 2.0483641624450684, + "grad_norm": 2.050746440887451, "learning_rate": 1e-05, - "loss": 0.4001, + "loss": 0.4003, "step": 59200 }, { "epoch": 0.000593, - "grad_norm": 2.0240378379821777, + "grad_norm": 4.129772663116455, "learning_rate": 1e-05, - "loss": 0.389, + "loss": 0.3889, "step": 59300 }, { "epoch": 0.000594, - "grad_norm": 2.2227602005004883, + "grad_norm": 2.160189628601074, "learning_rate": 1e-05, - "loss": 0.403, + "loss": 0.4046, "step": 59400 }, { "epoch": 0.000595, - "grad_norm": 2.2071664333343506, + "grad_norm": 2.5370826721191406, "learning_rate": 1e-05, - "loss": 0.3816, + "loss": 0.3829, "step": 59500 }, { "epoch": 0.000596, - "grad_norm": 1.8237906694412231, + "grad_norm": 2.0123531818389893, "learning_rate": 1e-05, - "loss": 0.3927, + "loss": 0.3976, "step": 59600 }, { "epoch": 0.000597, - "grad_norm": 2.11952805519104, + "grad_norm": 2.175504207611084, "learning_rate": 1e-05, - "loss": 0.3919, + "loss": 0.3908, "step": 59700 }, { "epoch": 0.000598, - "grad_norm": 1.9513181447982788, + "grad_norm": 1.993752360343933, "learning_rate": 1e-05, - "loss": 0.3996, + "loss": 0.401, "step": 59800 }, { "epoch": 0.000599, - "grad_norm": 1.8934307098388672, + "grad_norm": 2.1103925704956055, "learning_rate": 1e-05, "loss": 0.3886, "step": 59900 }, { "epoch": 0.0006, - "grad_norm": 2.0269672870635986, + "grad_norm": 1.866847038269043, "learning_rate": 1e-05, - "loss": 0.3891, + "loss": 0.3929, "step": 60000 }, { "epoch": 0.0006, - "eval_loss": 0.369140625, - "eval_runtime": 102.1638, - "eval_samples_per_second": 489.41, - "eval_steps_per_second": 30.588, + "eval_loss": 0.3701171875, + "eval_runtime": 109.3176, + "eval_samples_per_second": 457.383, + "eval_steps_per_second": 28.586, "step": 60000 }, { "epoch": 0.000601, - "grad_norm": 2.7376186847686768, + "grad_norm": 2.07293701171875, "learning_rate": 1e-05, - "loss": 0.3921, + "loss": 0.3941, "step": 60100 }, { "epoch": 0.000602, - "grad_norm": 2.343090057373047, + "grad_norm": 2.153665542602539, "learning_rate": 1e-05, - "loss": 0.3929, + "loss": 0.3933, "step": 60200 }, { "epoch": 0.000603, - "grad_norm": 2.0406734943389893, + "grad_norm": 1.8806813955307007, "learning_rate": 1e-05, - "loss": 0.3896, + "loss": 0.3908, "step": 60300 }, { "epoch": 0.000604, - "grad_norm": 1.7521495819091797, + "grad_norm": 1.9428766965866089, "learning_rate": 1e-05, - "loss": 0.3912, + "loss": 0.3941, "step": 60400 }, { "epoch": 0.000605, - "grad_norm": 2.1831586360931396, + "grad_norm": 2.4207301139831543, "learning_rate": 1e-05, - "loss": 0.396, + "loss": 0.395, "step": 60500 }, { "epoch": 0.000606, - "grad_norm": 2.375284194946289, + "grad_norm": 2.292665958404541, "learning_rate": 1e-05, - "loss": 0.3899, + "loss": 0.3893, "step": 60600 }, { "epoch": 0.000607, - "grad_norm": 2.453594446182251, + "grad_norm": 2.2332205772399902, "learning_rate": 1e-05, - "loss": 0.3908, + "loss": 0.3939, "step": 60700 }, { "epoch": 0.000608, - "grad_norm": 2.1200437545776367, + "grad_norm": 2.379991054534912, "learning_rate": 1e-05, - "loss": 0.3922, + "loss": 0.3911, "step": 60800 }, { "epoch": 0.000609, - "grad_norm": 2.098179817199707, + "grad_norm": 2.1357316970825195, "learning_rate": 1e-05, - "loss": 0.3927, + "loss": 0.394, "step": 60900 }, { "epoch": 0.00061, - "grad_norm": 1.9624439477920532, + "grad_norm": 2.218677520751953, "learning_rate": 1e-05, - "loss": 0.3895, + "loss": 0.3908, "step": 61000 }, { "epoch": 0.000611, - "grad_norm": 2.049699306488037, + "grad_norm": 2.144749641418457, "learning_rate": 1e-05, - "loss": 0.3955, + "loss": 0.3948, "step": 61100 }, { "epoch": 0.000612, - "grad_norm": 1.7350174188613892, + "grad_norm": 1.9096667766571045, "learning_rate": 1e-05, - "loss": 0.4045, + "loss": 0.4055, "step": 61200 }, { "epoch": 0.000613, - "grad_norm": 1.7695156335830688, + "grad_norm": 1.813551664352417, "learning_rate": 1e-05, - "loss": 0.3881, + "loss": 0.3909, "step": 61300 }, { "epoch": 0.000614, - "grad_norm": 2.1951496601104736, + "grad_norm": 2.0957746505737305, "learning_rate": 1e-05, - "loss": 0.3919, + "loss": 0.3934, "step": 61400 }, { "epoch": 0.000615, - "grad_norm": 2.1556954383850098, + "grad_norm": 2.288628578186035, "learning_rate": 1e-05, - "loss": 0.3907, + "loss": 0.3948, "step": 61500 }, { "epoch": 0.000616, - "grad_norm": 2.0208351612091064, + "grad_norm": 1.8869370222091675, "learning_rate": 1e-05, - "loss": 0.3888, + "loss": 0.3896, "step": 61600 }, { "epoch": 0.000617, - "grad_norm": 1.845552921295166, + "grad_norm": 1.8751919269561768, "learning_rate": 1e-05, - "loss": 0.3945, + "loss": 0.3946, "step": 61700 }, { "epoch": 0.000618, - "grad_norm": 1.836377501487732, + "grad_norm": 2.0423409938812256, "learning_rate": 1e-05, - "loss": 0.3878, + "loss": 0.3914, "step": 61800 }, { "epoch": 0.000619, - "grad_norm": 2.120041847229004, + "grad_norm": 2.154679536819458, "learning_rate": 1e-05, - "loss": 0.3954, + "loss": 0.3942, "step": 61900 }, { "epoch": 0.00062, - "grad_norm": 1.9719525575637817, + "grad_norm": 2.293510913848877, "learning_rate": 1e-05, - "loss": 0.3858, + "loss": 0.3867, "step": 62000 }, { "epoch": 0.000621, - "grad_norm": 1.8394004106521606, + "grad_norm": 2.034313678741455, "learning_rate": 1e-05, - "loss": 0.3828, + "loss": 0.3844, "step": 62100 }, { "epoch": 0.000622, - "grad_norm": 2.1397218704223633, + "grad_norm": 2.105489730834961, "learning_rate": 1e-05, - "loss": 0.3857, + "loss": 0.3886, "step": 62200 }, { "epoch": 0.000623, - "grad_norm": 1.8952577114105225, + "grad_norm": 1.9530473947525024, "learning_rate": 1e-05, - "loss": 0.3934, + "loss": 0.3962, "step": 62300 }, { "epoch": 0.000624, - "grad_norm": 2.150614023208618, + "grad_norm": 2.291125774383545, "learning_rate": 1e-05, - "loss": 0.3908, + "loss": 0.392, "step": 62400 }, { "epoch": 0.000625, - "grad_norm": 2.0335357189178467, + "grad_norm": 2.169159412384033, "learning_rate": 1e-05, - "loss": 0.3917, + "loss": 0.3916, "step": 62500 }, { "epoch": 0.000626, - "grad_norm": 2.4956729412078857, + "grad_norm": 2.2920339107513428, "learning_rate": 1e-05, - "loss": 0.3846, + "loss": 0.3886, "step": 62600 }, { "epoch": 0.000627, - "grad_norm": 2.2129902839660645, + "grad_norm": 2.248567819595337, "learning_rate": 1e-05, - "loss": 0.3979, + "loss": 0.3997, "step": 62700 }, { "epoch": 0.000628, - "grad_norm": 1.9032566547393799, + "grad_norm": 1.9369299411773682, "learning_rate": 1e-05, - "loss": 0.3951, + "loss": 0.3945, "step": 62800 }, { "epoch": 0.000629, - "grad_norm": 1.9463143348693848, + "grad_norm": 1.912782073020935, "learning_rate": 1e-05, - "loss": 0.3886, + "loss": 0.3911, "step": 62900 }, { "epoch": 0.00063, - "grad_norm": 1.7468578815460205, + "grad_norm": 1.7592915296554565, "learning_rate": 1e-05, - "loss": 0.3977, + "loss": 0.3989, "step": 63000 }, { "epoch": 0.000631, - "grad_norm": 2.222890853881836, + "grad_norm": 2.0076982975006104, "learning_rate": 1e-05, - "loss": 0.3888, + "loss": 0.3918, "step": 63100 }, { "epoch": 0.000632, - "grad_norm": 1.917638897895813, + "grad_norm": 2.0114753246307373, "learning_rate": 1e-05, - "loss": 0.3922, + "loss": 0.3932, "step": 63200 }, { "epoch": 0.000633, - "grad_norm": 2.2300891876220703, + "grad_norm": 2.50410795211792, "learning_rate": 1e-05, - "loss": 0.3975, + "loss": 0.3958, "step": 63300 }, { "epoch": 0.000634, - "grad_norm": 2.1845905780792236, + "grad_norm": 2.156872510910034, "learning_rate": 1e-05, - "loss": 0.3859, + "loss": 0.3875, "step": 63400 }, { "epoch": 0.000635, - "grad_norm": 1.9561052322387695, + "grad_norm": 2.2408478260040283, "learning_rate": 1e-05, - "loss": 0.3944, + "loss": 0.3939, "step": 63500 }, { "epoch": 0.000636, - "grad_norm": 2.203681230545044, + "grad_norm": 2.0988857746124268, "learning_rate": 1e-05, - "loss": 0.3874, + "loss": 0.3867, "step": 63600 }, { "epoch": 0.000637, - "grad_norm": 2.248065948486328, + "grad_norm": 2.140925407409668, "learning_rate": 1e-05, - "loss": 0.3874, + "loss": 0.3878, "step": 63700 }, { "epoch": 0.000638, - "grad_norm": 1.9205600023269653, + "grad_norm": 2.2293543815612793, "learning_rate": 1e-05, - "loss": 0.3846, + "loss": 0.3866, "step": 63800 }, { "epoch": 0.000639, - "grad_norm": 1.974914789199829, + "grad_norm": 2.0480923652648926, "learning_rate": 1e-05, - "loss": 0.3801, + "loss": 0.3819, "step": 63900 }, { "epoch": 0.00064, - "grad_norm": 2.3093318939208984, + "grad_norm": 2.129159927368164, "learning_rate": 1e-05, - "loss": 0.384, + "loss": 0.3874, "step": 64000 }, { "epoch": 0.000641, - "grad_norm": 1.782605767250061, + "grad_norm": 1.907259225845337, "learning_rate": 1e-05, - "loss": 0.3908, + "loss": 0.3918, "step": 64100 }, { "epoch": 0.000642, - "grad_norm": 1.8764839172363281, + "grad_norm": 1.9210904836654663, "learning_rate": 1e-05, - "loss": 0.3871, + "loss": 0.3863, "step": 64200 }, { "epoch": 0.000643, - "grad_norm": 2.0580124855041504, + "grad_norm": 2.2560818195343018, "learning_rate": 1e-05, - "loss": 0.3961, + "loss": 0.3986, "step": 64300 }, { "epoch": 0.000644, - "grad_norm": 2.2463715076446533, + "grad_norm": 1.8635262250900269, "learning_rate": 1e-05, - "loss": 0.3761, + "loss": 0.3781, "step": 64400 }, { "epoch": 0.000645, - "grad_norm": 1.9475406408309937, + "grad_norm": 2.076395273208618, "learning_rate": 1e-05, - "loss": 0.382, + "loss": 0.3822, "step": 64500 }, { "epoch": 0.000646, - "grad_norm": 1.841707706451416, + "grad_norm": 1.7710347175598145, "learning_rate": 1e-05, - "loss": 0.3766, + "loss": 0.3794, "step": 64600 }, { "epoch": 0.000647, - "grad_norm": 2.042868137359619, + "grad_norm": 2.0143582820892334, "learning_rate": 1e-05, - "loss": 0.3831, + "loss": 0.3825, "step": 64700 }, { "epoch": 0.000648, - "grad_norm": 2.061643362045288, + "grad_norm": 2.2155025005340576, "learning_rate": 1e-05, - "loss": 0.3912, + "loss": 0.3938, "step": 64800 }, { "epoch": 0.000649, - "grad_norm": 1.9063103199005127, + "grad_norm": 1.8567825555801392, "learning_rate": 1e-05, - "loss": 0.3804, + "loss": 0.3799, "step": 64900 }, { "epoch": 0.00065, - "grad_norm": 2.00238037109375, + "grad_norm": 2.183415412902832, "learning_rate": 1e-05, - "loss": 0.3807, + "loss": 0.3824, "step": 65000 }, { "epoch": 0.000651, - "grad_norm": 1.9954556226730347, + "grad_norm": 2.005911350250244, "learning_rate": 1e-05, - "loss": 0.39, + "loss": 0.3931, "step": 65100 }, { "epoch": 0.000652, - "grad_norm": 1.84696626663208, + "grad_norm": 1.8332974910736084, "learning_rate": 1e-05, - "loss": 0.3792, + "loss": 0.3811, "step": 65200 }, { "epoch": 0.000653, - "grad_norm": 1.8979240655899048, + "grad_norm": 2.1285884380340576, "learning_rate": 1e-05, - "loss": 0.3901, + "loss": 0.3918, "step": 65300 }, { "epoch": 0.000654, - "grad_norm": 2.2162926197052, + "grad_norm": 2.158264636993408, "learning_rate": 1e-05, - "loss": 0.3839, + "loss": 0.3863, "step": 65400 }, { "epoch": 0.000655, - "grad_norm": 1.79831862449646, + "grad_norm": 1.8876869678497314, "learning_rate": 1e-05, - "loss": 0.3914, + "loss": 0.3934, "step": 65500 }, { "epoch": 0.000656, - "grad_norm": 1.9221510887145996, + "grad_norm": 1.8769333362579346, "learning_rate": 1e-05, - "loss": 0.3912, + "loss": 0.3923, "step": 65600 }, { "epoch": 0.000657, - "grad_norm": 1.880818486213684, + "grad_norm": 2.019409656524658, "learning_rate": 1e-05, - "loss": 0.382, + "loss": 0.3826, "step": 65700 }, { "epoch": 0.000658, - "grad_norm": 2.1146156787872314, + "grad_norm": 2.0213446617126465, "learning_rate": 1e-05, - "loss": 0.3907, + "loss": 0.3922, "step": 65800 }, { "epoch": 0.000659, - "grad_norm": 1.9745644330978394, + "grad_norm": 2.2089147567749023, "learning_rate": 1e-05, - "loss": 0.3907, + "loss": 0.3933, "step": 65900 }, { "epoch": 0.00066, - "grad_norm": 2.040328025817871, + "grad_norm": 1.837319254875183, "learning_rate": 1e-05, - "loss": 0.3827, + "loss": 0.3868, "step": 66000 }, { "epoch": 0.000661, - "grad_norm": 1.9534509181976318, + "grad_norm": 1.8547362089157104, "learning_rate": 1e-05, - "loss": 0.3861, + "loss": 0.3887, "step": 66100 }, { "epoch": 0.000662, - "grad_norm": 1.7424492835998535, + "grad_norm": 1.8269295692443848, "learning_rate": 1e-05, - "loss": 0.3812, + "loss": 0.384, "step": 66200 }, { "epoch": 0.000663, - "grad_norm": 1.8719522953033447, + "grad_norm": 2.052025318145752, "learning_rate": 1e-05, - "loss": 0.3829, + "loss": 0.3838, "step": 66300 }, { "epoch": 0.000664, - "grad_norm": 2.3120663166046143, + "grad_norm": 2.030297040939331, "learning_rate": 1e-05, - "loss": 0.3858, + "loss": 0.3873, "step": 66400 }, { "epoch": 0.000665, - "grad_norm": 2.9282732009887695, + "grad_norm": 2.019329309463501, "learning_rate": 1e-05, - "loss": 0.3808, + "loss": 0.3828, "step": 66500 }, { "epoch": 0.000666, - "grad_norm": 2.228846788406372, + "grad_norm": 1.8459995985031128, "learning_rate": 1e-05, - "loss": 0.3859, + "loss": 0.3879, "step": 66600 }, { "epoch": 0.000667, - "grad_norm": 2.008467197418213, + "grad_norm": 1.9611304998397827, "learning_rate": 1e-05, - "loss": 0.3874, + "loss": 0.3893, "step": 66700 }, { "epoch": 0.000668, - "grad_norm": 2.0303399562835693, + "grad_norm": 1.8976935148239136, "learning_rate": 1e-05, - "loss": 0.382, + "loss": 0.3858, "step": 66800 }, { "epoch": 0.000669, - "grad_norm": 1.7990944385528564, + "grad_norm": 1.9818809032440186, "learning_rate": 1e-05, - "loss": 0.3836, + "loss": 0.3838, "step": 66900 }, { "epoch": 0.00067, - "grad_norm": 1.7805147171020508, + "grad_norm": 1.7839868068695068, "learning_rate": 1e-05, - "loss": 0.38, + "loss": 0.3809, "step": 67000 }, { "epoch": 0.000671, - "grad_norm": 1.9477365016937256, + "grad_norm": 2.1452698707580566, "learning_rate": 1e-05, - "loss": 0.3797, + "loss": 0.3843, "step": 67100 }, { "epoch": 0.000672, - "grad_norm": 2.074498176574707, + "grad_norm": 2.077277660369873, "learning_rate": 1e-05, - "loss": 0.3798, + "loss": 0.3837, "step": 67200 }, { "epoch": 0.000673, - "grad_norm": 2.0827853679656982, + "grad_norm": 2.0192837715148926, "learning_rate": 1e-05, - "loss": 0.397, + "loss": 0.3974, "step": 67300 }, { "epoch": 0.000674, - "grad_norm": 1.9423997402191162, + "grad_norm": 2.134225606918335, "learning_rate": 1e-05, - "loss": 0.3781, + "loss": 0.3777, "step": 67400 }, { "epoch": 0.000675, - "grad_norm": 1.8867988586425781, + "grad_norm": 2.0650904178619385, "learning_rate": 1e-05, - "loss": 0.3808, + "loss": 0.3837, "step": 67500 }, { "epoch": 0.000676, - "grad_norm": 1.9745376110076904, + "grad_norm": 1.857259750366211, "learning_rate": 1e-05, - "loss": 0.3773, + "loss": 0.3777, "step": 67600 }, { "epoch": 0.000677, - "grad_norm": 1.7891191244125366, + "grad_norm": 1.8561601638793945, "learning_rate": 1e-05, - "loss": 0.3872, + "loss": 0.3854, "step": 67700 }, { "epoch": 0.000678, - "grad_norm": 2.001190185546875, + "grad_norm": 2.27827525138855, "learning_rate": 1e-05, - "loss": 0.3762, + "loss": 0.3772, "step": 67800 }, { "epoch": 0.000679, - "grad_norm": 1.7278451919555664, + "grad_norm": 1.9206945896148682, "learning_rate": 1e-05, - "loss": 0.3851, + "loss": 0.3882, "step": 67900 }, { "epoch": 0.00068, - "grad_norm": 1.8518315553665161, + "grad_norm": 1.849585771560669, "learning_rate": 1e-05, - "loss": 0.3873, + "loss": 0.3903, "step": 68000 }, { "epoch": 0.000681, - "grad_norm": 1.9316362142562866, + "grad_norm": 2.1679646968841553, "learning_rate": 1e-05, - "loss": 0.3839, + "loss": 0.3844, "step": 68100 }, { "epoch": 0.000682, - "grad_norm": 2.00201678276062, + "grad_norm": 2.105186700820923, "learning_rate": 1e-05, - "loss": 0.3835, + "loss": 0.3843, "step": 68200 }, { "epoch": 0.000683, - "grad_norm": 1.9264591932296753, + "grad_norm": 1.8043280839920044, "learning_rate": 1e-05, - "loss": 0.3763, + "loss": 0.3782, "step": 68300 }, { "epoch": 0.000684, - "grad_norm": 2.0043368339538574, + "grad_norm": 1.9731149673461914, "learning_rate": 1e-05, - "loss": 0.3838, + "loss": 0.3876, "step": 68400 }, { "epoch": 0.000685, - "grad_norm": 1.9504468441009521, + "grad_norm": 1.9924131631851196, "learning_rate": 1e-05, - "loss": 0.3847, + "loss": 0.386, "step": 68500 }, { "epoch": 0.000686, - "grad_norm": 1.7852892875671387, + "grad_norm": 1.8679152727127075, "learning_rate": 1e-05, - "loss": 0.3768, + "loss": 0.3797, "step": 68600 }, { "epoch": 0.000687, - "grad_norm": 1.9766055345535278, + "grad_norm": 2.01244854927063, "learning_rate": 1e-05, - "loss": 0.3827, + "loss": 0.3803, "step": 68700 }, { "epoch": 0.000688, - "grad_norm": 1.9706650972366333, + "grad_norm": 1.9184852838516235, "learning_rate": 1e-05, - "loss": 0.3838, + "loss": 0.3872, "step": 68800 }, { "epoch": 0.000689, - "grad_norm": 1.9108973741531372, + "grad_norm": 2.039447546005249, "learning_rate": 1e-05, - "loss": 0.3795, + "loss": 0.3808, "step": 68900 }, { "epoch": 0.00069, - "grad_norm": 2.094895601272583, + "grad_norm": 2.367798089981079, "learning_rate": 1e-05, - "loss": 0.3817, + "loss": 0.3858, "step": 69000 }, { "epoch": 0.000691, - "grad_norm": 1.9377031326293945, + "grad_norm": 2.0003209114074707, "learning_rate": 1e-05, - "loss": 0.378, + "loss": 0.3808, "step": 69100 }, { "epoch": 0.000692, - "grad_norm": 1.9717905521392822, + "grad_norm": 1.9453091621398926, "learning_rate": 1e-05, - "loss": 0.3712, + "loss": 0.3707, "step": 69200 }, { "epoch": 0.000693, - "grad_norm": 1.7041637897491455, + "grad_norm": 1.6954456567764282, "learning_rate": 1e-05, - "loss": 0.3818, + "loss": 0.3829, "step": 69300 }, { "epoch": 0.000694, - "grad_norm": 2.1144893169403076, + "grad_norm": 2.1012470722198486, "learning_rate": 1e-05, - "loss": 0.3817, + "loss": 0.3833, "step": 69400 }, { "epoch": 0.000695, - "grad_norm": 2.0054380893707275, + "grad_norm": 1.8490900993347168, "learning_rate": 1e-05, - "loss": 0.3869, + "loss": 0.3873, "step": 69500 }, { "epoch": 0.000696, - "grad_norm": 1.9239062070846558, + "grad_norm": 1.8682618141174316, "learning_rate": 1e-05, - "loss": 0.3819, + "loss": 0.3862, "step": 69600 }, { "epoch": 0.000697, - "grad_norm": 2.2233283519744873, + "grad_norm": 2.068352460861206, "learning_rate": 1e-05, - "loss": 0.3773, + "loss": 0.3802, "step": 69700 }, { "epoch": 0.000698, - "grad_norm": 2.1612606048583984, + "grad_norm": 2.118117094039917, "learning_rate": 1e-05, - "loss": 0.3734, + "loss": 0.3764, "step": 69800 }, { "epoch": 0.000699, - "grad_norm": 2.3189737796783447, + "grad_norm": 1.8571758270263672, "learning_rate": 1e-05, - "loss": 0.3777, + "loss": 0.378, "step": 69900 }, { "epoch": 0.0007, - "grad_norm": 1.78034245967865, + "grad_norm": 2.103874921798706, "learning_rate": 1e-05, - "loss": 0.3779, + "loss": 0.3798, "step": 70000 }, { "epoch": 0.000701, - "grad_norm": 3.2710278034210205, + "grad_norm": 2.4420368671417236, "learning_rate": 1e-05, - "loss": 0.3798, + "loss": 0.3796, "step": 70100 }, { "epoch": 0.000702, - "grad_norm": 2.3106212615966797, + "grad_norm": 2.143949270248413, "learning_rate": 1e-05, - "loss": 0.3715, + "loss": 0.3735, "step": 70200 }, { "epoch": 0.000703, - "grad_norm": 1.6837509870529175, + "grad_norm": 2.070586681365967, "learning_rate": 1e-05, - "loss": 0.3771, + "loss": 0.3813, "step": 70300 }, { "epoch": 0.000704, - "grad_norm": 2.1873393058776855, + "grad_norm": 2.0714941024780273, "learning_rate": 1e-05, - "loss": 0.3797, + "loss": 0.383, "step": 70400 }, { "epoch": 0.000705, - "grad_norm": 2.6864142417907715, + "grad_norm": 2.0592539310455322, "learning_rate": 1e-05, - "loss": 0.3756, + "loss": 0.3769, "step": 70500 }, { "epoch": 0.000706, - "grad_norm": 1.8007659912109375, + "grad_norm": 2.0504090785980225, "learning_rate": 1e-05, - "loss": 0.3779, + "loss": 0.3791, "step": 70600 }, { "epoch": 0.000707, - "grad_norm": 2.0238771438598633, + "grad_norm": 1.6406168937683105, "learning_rate": 1e-05, - "loss": 0.3731, + "loss": 0.3758, "step": 70700 }, { "epoch": 0.000708, - "grad_norm": 1.9466392993927002, + "grad_norm": 2.1220123767852783, "learning_rate": 1e-05, - "loss": 0.3863, + "loss": 0.3868, "step": 70800 }, { "epoch": 0.000709, - "grad_norm": 1.9275888204574585, + "grad_norm": 2.0536298751831055, "learning_rate": 1e-05, - "loss": 0.3784, + "loss": 0.3805, "step": 70900 }, { "epoch": 0.00071, - "grad_norm": 2.0599563121795654, + "grad_norm": 2.076979875564575, "learning_rate": 1e-05, - "loss": 0.377, + "loss": 0.3807, "step": 71000 }, { "epoch": 0.000711, - "grad_norm": 2.358717679977417, + "grad_norm": 2.6225621700286865, "learning_rate": 1e-05, - "loss": 0.3705, + "loss": 0.373, "step": 71100 }, { "epoch": 0.000712, - "grad_norm": 2.0105512142181396, + "grad_norm": 2.2727653980255127, "learning_rate": 1e-05, - "loss": 0.3749, + "loss": 0.3762, "step": 71200 }, { "epoch": 0.000713, - "grad_norm": 1.9268088340759277, + "grad_norm": 2.0625195503234863, "learning_rate": 1e-05, - "loss": 0.3798, + "loss": 0.3841, "step": 71300 }, { "epoch": 0.000714, - "grad_norm": 1.8948795795440674, + "grad_norm": 1.9859055280685425, "learning_rate": 1e-05, - "loss": 0.3784, + "loss": 0.3801, "step": 71400 }, { "epoch": 0.000715, - "grad_norm": 2.117152214050293, + "grad_norm": 1.9635552167892456, "learning_rate": 1e-05, - "loss": 0.3825, + "loss": 0.3848, "step": 71500 }, { "epoch": 0.000716, - "grad_norm": 2.1358580589294434, + "grad_norm": 2.121825933456421, "learning_rate": 1e-05, - "loss": 0.3737, + "loss": 0.3745, "step": 71600 }, { "epoch": 0.000717, - "grad_norm": 1.8925243616104126, + "grad_norm": 1.9133636951446533, "learning_rate": 1e-05, - "loss": 0.3822, + "loss": 0.3814, "step": 71700 }, { "epoch": 0.000718, - "grad_norm": 1.8392730951309204, + "grad_norm": 2.1131491661071777, "learning_rate": 1e-05, - "loss": 0.3719, + "loss": 0.37, "step": 71800 }, { "epoch": 0.000719, - "grad_norm": 2.220132827758789, + "grad_norm": 2.0350754261016846, "learning_rate": 1e-05, - "loss": 0.3792, + "loss": 0.3814, "step": 71900 }, { "epoch": 0.00072, - "grad_norm": 2.003448724746704, + "grad_norm": 2.757786750793457, "learning_rate": 1e-05, - "loss": 0.3739, + "loss": 0.3754, "step": 72000 }, { "epoch": 0.000721, - "grad_norm": 1.6669561862945557, + "grad_norm": 1.797782063484192, "learning_rate": 1e-05, - "loss": 0.3695, + "loss": 0.3712, "step": 72100 }, { "epoch": 0.000722, - "grad_norm": 2.0987629890441895, + "grad_norm": 2.0632424354553223, "learning_rate": 1e-05, - "loss": 0.3831, + "loss": 0.3881, "step": 72200 }, { "epoch": 0.000723, - "grad_norm": 1.8519052267074585, + "grad_norm": 1.7604708671569824, "learning_rate": 1e-05, - "loss": 0.38, + "loss": 0.3787, "step": 72300 }, { "epoch": 0.000724, - "grad_norm": 1.8021750450134277, + "grad_norm": 3.3791792392730713, "learning_rate": 1e-05, - "loss": 0.3759, + "loss": 0.3776, "step": 72400 }, { "epoch": 0.000725, - "grad_norm": 1.9603530168533325, + "grad_norm": 2.1998651027679443, "learning_rate": 1e-05, - "loss": 0.3675, + "loss": 0.3701, "step": 72500 }, { "epoch": 0.000726, - "grad_norm": 2.3326728343963623, + "grad_norm": 2.309633731842041, "learning_rate": 1e-05, - "loss": 0.3719, + "loss": 0.3742, "step": 72600 }, { "epoch": 0.000727, - "grad_norm": 2.3862409591674805, + "grad_norm": 2.0794286727905273, "learning_rate": 1e-05, - "loss": 0.375, + "loss": 0.3752, "step": 72700 }, { "epoch": 0.000728, - "grad_norm": 1.8321102857589722, + "grad_norm": 1.98604154586792, "learning_rate": 1e-05, - "loss": 0.3813, + "loss": 0.3841, "step": 72800 }, { "epoch": 0.000729, - "grad_norm": 2.634211778640747, + "grad_norm": 2.0682222843170166, "learning_rate": 1e-05, - "loss": 0.3751, + "loss": 0.376, "step": 72900 }, { "epoch": 0.00073, - "grad_norm": 1.8579164743423462, + "grad_norm": 1.9491254091262817, "learning_rate": 1e-05, - "loss": 0.371, + "loss": 0.3704, "step": 73000 }, { "epoch": 0.000731, - "grad_norm": 1.7001895904541016, + "grad_norm": 1.809173822402954, "learning_rate": 1e-05, - "loss": 0.3745, + "loss": 0.376, "step": 73100 }, { "epoch": 0.000732, - "grad_norm": 1.6375387907028198, + "grad_norm": 1.7224321365356445, "learning_rate": 1e-05, - "loss": 0.3739, + "loss": 0.3737, "step": 73200 }, { "epoch": 0.000733, - "grad_norm": 1.8545457124710083, + "grad_norm": 1.7145380973815918, "learning_rate": 1e-05, - "loss": 0.3768, + "loss": 0.3806, "step": 73300 }, { "epoch": 0.000734, - "grad_norm": 1.9891623258590698, + "grad_norm": 2.0233635902404785, "learning_rate": 1e-05, - "loss": 0.3743, + "loss": 0.3741, "step": 73400 }, { "epoch": 0.000735, - "grad_norm": 1.9894626140594482, + "grad_norm": 1.9742248058319092, "learning_rate": 1e-05, - "loss": 0.3635, + "loss": 0.3645, "step": 73500 }, { "epoch": 0.000736, - "grad_norm": 1.885630488395691, + "grad_norm": 1.889393925666809, "learning_rate": 1e-05, - "loss": 0.3671, + "loss": 0.3696, "step": 73600 }, { "epoch": 0.000737, - "grad_norm": 2.058328866958618, + "grad_norm": 2.075669050216675, "learning_rate": 1e-05, - "loss": 0.3721, + "loss": 0.3735, "step": 73700 }, { "epoch": 0.000738, - "grad_norm": 1.8615614175796509, + "grad_norm": 2.1420507431030273, "learning_rate": 1e-05, - "loss": 0.3669, + "loss": 0.3701, "step": 73800 }, { "epoch": 0.000739, - "grad_norm": 1.9803423881530762, + "grad_norm": 2.1469383239746094, "learning_rate": 1e-05, - "loss": 0.3788, + "loss": 0.379, "step": 73900 }, { "epoch": 0.00074, - "grad_norm": 1.8891525268554688, + "grad_norm": 2.0224719047546387, "learning_rate": 1e-05, - "loss": 0.3736, + "loss": 0.372, "step": 74000 }, { "epoch": 0.000741, - "grad_norm": 1.555033564567566, + "grad_norm": 1.8598190546035767, "learning_rate": 1e-05, - "loss": 0.3772, + "loss": 0.3792, "step": 74100 }, { "epoch": 0.000742, - "grad_norm": 1.9519155025482178, + "grad_norm": 2.1243066787719727, "learning_rate": 1e-05, - "loss": 0.3694, + "loss": 0.3689, "step": 74200 }, { "epoch": 0.000743, - "grad_norm": 2.101541042327881, + "grad_norm": 1.8850631713867188, "learning_rate": 1e-05, - "loss": 0.375, + "loss": 0.3762, "step": 74300 }, { "epoch": 0.000744, - "grad_norm": 2.0038254261016846, + "grad_norm": 2.0598785877227783, "learning_rate": 1e-05, - "loss": 0.3701, + "loss": 0.3715, "step": 74400 }, { "epoch": 0.000745, - "grad_norm": 1.9179317951202393, + "grad_norm": 2.120824098587036, "learning_rate": 1e-05, - "loss": 0.3686, + "loss": 0.3696, "step": 74500 }, { "epoch": 0.000746, - "grad_norm": 1.8006997108459473, + "grad_norm": 1.7642192840576172, "learning_rate": 1e-05, - "loss": 0.3758, + "loss": 0.3771, "step": 74600 }, { "epoch": 0.000747, - "grad_norm": 1.9919887781143188, + "grad_norm": 2.1491034030914307, "learning_rate": 1e-05, - "loss": 0.3854, + "loss": 0.3883, "step": 74700 }, { "epoch": 0.000748, - "grad_norm": 1.8536826372146606, + "grad_norm": 1.8905261754989624, "learning_rate": 1e-05, - "loss": 0.3731, + "loss": 0.3756, "step": 74800 }, { "epoch": 0.000749, - "grad_norm": 1.729183554649353, + "grad_norm": 2.4035165309906006, "learning_rate": 1e-05, - "loss": 0.377, + "loss": 0.3776, "step": 74900 }, { "epoch": 0.00075, - "grad_norm": 2.184899091720581, + "grad_norm": 2.104729652404785, "learning_rate": 1e-05, - "loss": 0.3745, + "loss": 0.3762, "step": 75000 }, { "epoch": 0.000751, - "grad_norm": 1.8467082977294922, + "grad_norm": 2.0208077430725098, "learning_rate": 1e-05, - "loss": 0.3671, + "loss": 0.3696, "step": 75100 }, { "epoch": 0.000752, - "grad_norm": 2.0495119094848633, + "grad_norm": 2.1069564819335938, "learning_rate": 1e-05, - "loss": 0.3723, + "loss": 0.3766, "step": 75200 }, { "epoch": 0.000753, - "grad_norm": 2.115567207336426, + "grad_norm": 1.9399853944778442, "learning_rate": 1e-05, - "loss": 0.3752, + "loss": 0.3767, "step": 75300 }, { "epoch": 0.000754, - "grad_norm": 1.8394079208374023, + "grad_norm": 2.072504758834839, "learning_rate": 1e-05, - "loss": 0.3781, + "loss": 0.3787, "step": 75400 }, { "epoch": 0.000755, - "grad_norm": 1.63499915599823, + "grad_norm": 1.6709392070770264, "learning_rate": 1e-05, - "loss": 0.3724, + "loss": 0.3739, "step": 75500 }, { "epoch": 0.000756, - "grad_norm": 2.3781888484954834, + "grad_norm": 1.9638621807098389, "learning_rate": 1e-05, - "loss": 0.3698, + "loss": 0.3736, "step": 75600 }, { "epoch": 0.000757, - "grad_norm": 1.8783695697784424, + "grad_norm": 1.9661500453948975, "learning_rate": 1e-05, - "loss": 0.3724, + "loss": 0.3742, "step": 75700 }, { "epoch": 0.000758, - "grad_norm": 1.5949982404708862, + "grad_norm": 1.7618420124053955, "learning_rate": 1e-05, - "loss": 0.3697, + "loss": 0.3725, "step": 75800 }, { "epoch": 0.000759, - "grad_norm": 2.0614309310913086, + "grad_norm": 2.090238332748413, "learning_rate": 1e-05, - "loss": 0.3741, + "loss": 0.3771, "step": 75900 }, { "epoch": 0.00076, - "grad_norm": 1.9954396486282349, + "grad_norm": 1.9758362770080566, "learning_rate": 1e-05, - "loss": 0.3657, + "loss": 0.3666, "step": 76000 }, { "epoch": 0.000761, - "grad_norm": 1.876464605331421, + "grad_norm": 2.023850917816162, "learning_rate": 1e-05, - "loss": 0.3737, + "loss": 0.379, "step": 76100 }, { "epoch": 0.000762, - "grad_norm": 1.9084681272506714, + "grad_norm": 1.9351980686187744, "learning_rate": 1e-05, - "loss": 0.3696, + "loss": 0.3705, "step": 76200 }, { "epoch": 0.000763, - "grad_norm": 2.462977170944214, + "grad_norm": 1.8853991031646729, "learning_rate": 1e-05, - "loss": 0.3739, + "loss": 0.3743, "step": 76300 }, { "epoch": 0.000764, - "grad_norm": 1.8210529088974, + "grad_norm": 2.0006158351898193, "learning_rate": 1e-05, - "loss": 0.3689, + "loss": 0.3704, "step": 76400 }, { "epoch": 0.000765, - "grad_norm": 1.578637719154358, + "grad_norm": 2.1643075942993164, "learning_rate": 1e-05, - "loss": 0.368, + "loss": 0.3705, "step": 76500 }, { "epoch": 0.000766, - "grad_norm": 1.870898723602295, + "grad_norm": 1.9577935934066772, "learning_rate": 1e-05, - "loss": 0.3728, + "loss": 0.3742, "step": 76600 }, { "epoch": 0.000767, - "grad_norm": 1.7614187002182007, + "grad_norm": 2.128188371658325, "learning_rate": 1e-05, - "loss": 0.3709, + "loss": 0.3698, "step": 76700 }, { "epoch": 0.000768, - "grad_norm": 2.0262155532836914, + "grad_norm": 1.9895089864730835, "learning_rate": 1e-05, - "loss": 0.374, + "loss": 0.3753, "step": 76800 }, { "epoch": 0.000769, - "grad_norm": 2.127225637435913, + "grad_norm": 2.1536855697631836, "learning_rate": 1e-05, - "loss": 0.3633, + "loss": 0.3644, "step": 76900 }, { "epoch": 0.00077, - "grad_norm": 3.3583128452301025, + "grad_norm": 1.9444348812103271, "learning_rate": 1e-05, - "loss": 0.3673, + "loss": 0.3671, "step": 77000 }, { "epoch": 0.000771, - "grad_norm": 1.8614639043807983, + "grad_norm": 1.8287049531936646, "learning_rate": 1e-05, - "loss": 0.3731, + "loss": 0.3735, "step": 77100 }, { "epoch": 0.000772, - "grad_norm": 1.9169940948486328, + "grad_norm": 1.8443069458007812, "learning_rate": 1e-05, - "loss": 0.3647, + "loss": 0.3686, "step": 77200 }, { "epoch": 0.000773, - "grad_norm": 1.8678412437438965, + "grad_norm": 1.8012452125549316, "learning_rate": 1e-05, - "loss": 0.3736, + "loss": 0.3751, "step": 77300 }, { "epoch": 0.000774, - "grad_norm": 2.1774847507476807, + "grad_norm": 1.9977177381515503, "learning_rate": 1e-05, - "loss": 0.3687, + "loss": 0.3684, "step": 77400 }, { "epoch": 0.000775, - "grad_norm": 1.8314334154129028, + "grad_norm": 1.9906736612319946, "learning_rate": 1e-05, - "loss": 0.3683, + "loss": 0.3712, "step": 77500 }, { "epoch": 0.000776, - "grad_norm": 1.933455228805542, + "grad_norm": 1.9918975830078125, "learning_rate": 1e-05, - "loss": 0.366, + "loss": 0.3687, "step": 77600 }, { "epoch": 0.000777, - "grad_norm": 1.9543218612670898, + "grad_norm": 1.9965052604675293, "learning_rate": 1e-05, - "loss": 0.3679, + "loss": 0.3668, "step": 77700 }, { "epoch": 0.000778, - "grad_norm": 1.8525019884109497, + "grad_norm": 1.9064897298812866, "learning_rate": 1e-05, - "loss": 0.3699, + "loss": 0.3719, "step": 77800 }, { "epoch": 0.000779, - "grad_norm": 1.8097087144851685, + "grad_norm": 1.7971402406692505, "learning_rate": 1e-05, - "loss": 0.3603, + "loss": 0.3645, "step": 77900 }, { "epoch": 0.00078, - "grad_norm": 1.8097748756408691, + "grad_norm": 1.8232814073562622, "learning_rate": 1e-05, - "loss": 0.3731, + "loss": 0.3705, "step": 78000 }, { "epoch": 0.000781, - "grad_norm": 2.0118296146392822, + "grad_norm": 2.1162238121032715, "learning_rate": 1e-05, - "loss": 0.3657, + "loss": 0.3675, "step": 78100 }, { "epoch": 0.000782, - "grad_norm": 1.7090351581573486, + "grad_norm": 1.8850531578063965, "learning_rate": 1e-05, - "loss": 0.3637, + "loss": 0.3668, "step": 78200 }, { "epoch": 0.000783, - "grad_norm": 1.7420722246170044, + "grad_norm": 1.864730715751648, "learning_rate": 1e-05, - "loss": 0.3627, + "loss": 0.3652, "step": 78300 }, { "epoch": 0.000784, - "grad_norm": 1.699184775352478, + "grad_norm": 1.9205899238586426, "learning_rate": 1e-05, - "loss": 0.3698, + "loss": 0.3716, "step": 78400 }, { "epoch": 0.000785, - "grad_norm": 2.1604866981506348, + "grad_norm": 2.325000524520874, "learning_rate": 1e-05, - "loss": 0.372, + "loss": 0.3738, "step": 78500 }, { "epoch": 0.000786, - "grad_norm": 1.9928841590881348, + "grad_norm": 2.2757534980773926, "learning_rate": 1e-05, - "loss": 0.3717, + "loss": 0.3722, "step": 78600 }, { "epoch": 0.000787, - "grad_norm": 1.7823375463485718, + "grad_norm": 1.7619765996932983, "learning_rate": 1e-05, - "loss": 0.3788, + "loss": 0.3799, "step": 78700 }, { "epoch": 0.000788, - "grad_norm": 1.8327288627624512, + "grad_norm": 1.802307367324829, "learning_rate": 1e-05, - "loss": 0.3724, + "loss": 0.3744, "step": 78800 }, { "epoch": 0.000789, - "grad_norm": 1.8031224012374878, + "grad_norm": 1.8677384853363037, "learning_rate": 1e-05, - "loss": 0.373, + "loss": 0.3757, "step": 78900 }, { "epoch": 0.00079, - "grad_norm": 2.2257306575775146, + "grad_norm": 2.1615066528320312, "learning_rate": 1e-05, - "loss": 0.3758, + "loss": 0.3769, "step": 79000 }, { "epoch": 0.000791, - "grad_norm": 1.9993300437927246, + "grad_norm": 1.6998400688171387, "learning_rate": 1e-05, - "loss": 0.3698, + "loss": 0.3705, "step": 79100 }, { "epoch": 0.000792, - "grad_norm": 1.8228765726089478, + "grad_norm": 1.7555445432662964, "learning_rate": 1e-05, - "loss": 0.3716, + "loss": 0.373, "step": 79200 }, { "epoch": 0.000793, - "grad_norm": 1.8790833950042725, + "grad_norm": 2.0142476558685303, "learning_rate": 1e-05, - "loss": 0.3709, + "loss": 0.3723, "step": 79300 }, { "epoch": 0.000794, - "grad_norm": 1.8837450742721558, + "grad_norm": 2.3179373741149902, "learning_rate": 1e-05, - "loss": 0.3688, + "loss": 0.3683, "step": 79400 }, { "epoch": 0.000795, - "grad_norm": 1.979100227355957, + "grad_norm": 1.9455734491348267, "learning_rate": 1e-05, - "loss": 0.3675, + "loss": 0.3677, "step": 79500 }, { "epoch": 0.000796, - "grad_norm": 2.1498560905456543, + "grad_norm": 2.0112357139587402, "learning_rate": 1e-05, - "loss": 0.3672, + "loss": 0.3667, "step": 79600 }, { "epoch": 0.000797, - "grad_norm": 3.0212066173553467, + "grad_norm": 2.257429361343384, "learning_rate": 1e-05, - "loss": 0.3647, + "loss": 0.366, "step": 79700 }, { "epoch": 0.000798, - "grad_norm": 1.8750560283660889, + "grad_norm": 1.7353073358535767, "learning_rate": 1e-05, - "loss": 0.3676, + "loss": 0.3693, "step": 79800 }, { "epoch": 0.000799, - "grad_norm": 1.7192078828811646, + "grad_norm": 1.989250898361206, "learning_rate": 1e-05, - "loss": 0.3629, + "loss": 0.3658, "step": 79900 }, { "epoch": 0.0008, - "grad_norm": 2.3127896785736084, + "grad_norm": 1.8010023832321167, "learning_rate": 1e-05, - "loss": 0.3678, + "loss": 0.3687, "step": 80000 }, { "epoch": 0.0008, - "eval_loss": 0.341064453125, - "eval_runtime": 107.2105, - "eval_samples_per_second": 466.372, - "eval_steps_per_second": 29.148, + "eval_loss": 0.343017578125, + "eval_runtime": 111.2417, + "eval_samples_per_second": 449.472, + "eval_steps_per_second": 28.092, "step": 80000 }, { "epoch": 0.000801, - "grad_norm": 1.7513158321380615, + "grad_norm": 1.8883432149887085, "learning_rate": 1e-05, - "loss": 0.3736, + "loss": 0.3739, "step": 80100 }, { "epoch": 0.000802, - "grad_norm": 1.6448382139205933, + "grad_norm": 1.9652680158615112, "learning_rate": 1e-05, - "loss": 0.3549, + "loss": 0.359, "step": 80200 }, { "epoch": 0.000803, - "grad_norm": 1.8534730672836304, + "grad_norm": 2.1151764392852783, "learning_rate": 1e-05, - "loss": 0.3594, + "loss": 0.3626, "step": 80300 }, { "epoch": 0.000804, - "grad_norm": 1.8926939964294434, + "grad_norm": 1.736228108406067, "learning_rate": 1e-05, - "loss": 0.3707, + "loss": 0.3737, "step": 80400 }, { "epoch": 0.000805, - "grad_norm": 1.7940895557403564, + "grad_norm": 1.800878643989563, "learning_rate": 1e-05, - "loss": 0.3705, + "loss": 0.3731, "step": 80500 }, { "epoch": 0.000806, - "grad_norm": 1.6614346504211426, + "grad_norm": 1.8532053232192993, "learning_rate": 1e-05, - "loss": 0.3707, + "loss": 0.3723, "step": 80600 }, { "epoch": 0.000807, - "grad_norm": 2.4460978507995605, + "grad_norm": 1.838671088218689, "learning_rate": 1e-05, - "loss": 0.3611, + "loss": 0.3666, "step": 80700 }, { "epoch": 0.000808, - "grad_norm": 1.9615494012832642, + "grad_norm": 1.7083035707473755, "learning_rate": 1e-05, - "loss": 0.3757, + "loss": 0.3761, "step": 80800 }, { "epoch": 0.000809, - "grad_norm": 1.915145754814148, + "grad_norm": 1.8676607608795166, "learning_rate": 1e-05, - "loss": 0.3663, + "loss": 0.3684, "step": 80900 }, { "epoch": 0.00081, - "grad_norm": 2.0027689933776855, + "grad_norm": 2.0752341747283936, "learning_rate": 1e-05, - "loss": 0.3655, + "loss": 0.3657, "step": 81000 }, { "epoch": 0.000811, - "grad_norm": 2.2781310081481934, + "grad_norm": 1.793967604637146, "learning_rate": 1e-05, - "loss": 0.3738, + "loss": 0.3748, "step": 81100 }, { "epoch": 0.000812, - "grad_norm": 1.7206125259399414, + "grad_norm": 1.6681337356567383, "learning_rate": 1e-05, - "loss": 0.3647, + "loss": 0.3661, "step": 81200 }, { "epoch": 0.000813, - "grad_norm": 1.732438325881958, + "grad_norm": 2.46724534034729, "learning_rate": 1e-05, - "loss": 0.3599, + "loss": 0.3609, "step": 81300 }, { "epoch": 0.000814, - "grad_norm": 1.7096847295761108, + "grad_norm": 1.8258310556411743, "learning_rate": 1e-05, - "loss": 0.3608, + "loss": 0.3632, "step": 81400 }, { "epoch": 0.000815, - "grad_norm": 1.9984345436096191, + "grad_norm": 1.9003719091415405, "learning_rate": 1e-05, - "loss": 0.3661, + "loss": 0.365, "step": 81500 }, { "epoch": 0.000816, - "grad_norm": 1.6305888891220093, + "grad_norm": 1.8004292249679565, "learning_rate": 1e-05, - "loss": 0.3647, + "loss": 0.3675, "step": 81600 }, { "epoch": 0.000817, - "grad_norm": 1.7355291843414307, + "grad_norm": 1.8981424570083618, "learning_rate": 1e-05, - "loss": 0.3686, + "loss": 0.369, "step": 81700 }, { "epoch": 0.000818, - "grad_norm": 2.154341697692871, + "grad_norm": 2.1236822605133057, "learning_rate": 1e-05, - "loss": 0.3633, + "loss": 0.368, "step": 81800 }, { "epoch": 0.000819, - "grad_norm": 2.0211081504821777, + "grad_norm": 1.9143141508102417, "learning_rate": 1e-05, - "loss": 0.3613, + "loss": 0.3628, "step": 81900 }, { "epoch": 0.00082, - "grad_norm": 1.9967700242996216, + "grad_norm": 1.9660463333129883, "learning_rate": 1e-05, - "loss": 0.3605, + "loss": 0.3627, "step": 82000 }, { "epoch": 0.000821, - "grad_norm": 1.738448143005371, + "grad_norm": 2.1098835468292236, "learning_rate": 1e-05, - "loss": 0.3636, + "loss": 0.3657, "step": 82100 }, { "epoch": 0.000822, - "grad_norm": 1.8240571022033691, + "grad_norm": 1.8893609046936035, "learning_rate": 1e-05, - "loss": 0.3585, + "loss": 0.3621, "step": 82200 }, { "epoch": 0.000823, - "grad_norm": 1.9471300840377808, + "grad_norm": 1.848253607749939, "learning_rate": 1e-05, - "loss": 0.3551, + "loss": 0.3547, "step": 82300 }, { "epoch": 0.000824, - "grad_norm": 1.7154971361160278, + "grad_norm": 1.8891757726669312, "learning_rate": 1e-05, - "loss": 0.3614, + "loss": 0.3654, "step": 82400 }, { "epoch": 0.000825, - "grad_norm": 1.7459511756896973, + "grad_norm": 1.9906656742095947, "learning_rate": 1e-05, - "loss": 0.3661, + "loss": 0.37, "step": 82500 }, { "epoch": 0.000826, - "grad_norm": 1.9061509370803833, + "grad_norm": 2.026745080947876, "learning_rate": 1e-05, - "loss": 0.3599, + "loss": 0.3598, "step": 82600 }, { "epoch": 0.000827, - "grad_norm": 2.1102311611175537, + "grad_norm": 1.8796215057373047, "learning_rate": 1e-05, - "loss": 0.3623, + "loss": 0.3608, "step": 82700 }, { "epoch": 0.000828, - "grad_norm": 1.9421707391738892, + "grad_norm": 1.8934880495071411, "learning_rate": 1e-05, - "loss": 0.3607, + "loss": 0.3633, "step": 82800 }, { "epoch": 0.000829, - "grad_norm": 1.9199868440628052, + "grad_norm": 2.241187572479248, "learning_rate": 1e-05, - "loss": 0.3596, + "loss": 0.3623, "step": 82900 }, { "epoch": 0.00083, - "grad_norm": 1.8587994575500488, + "grad_norm": 1.8311808109283447, "learning_rate": 1e-05, - "loss": 0.3604, + "loss": 0.3643, "step": 83000 }, { "epoch": 0.000831, - "grad_norm": 1.8015069961547852, + "grad_norm": 1.7869751453399658, "learning_rate": 1e-05, - "loss": 0.3695, + "loss": 0.3718, "step": 83100 }, { "epoch": 0.000832, - "grad_norm": 1.8318135738372803, + "grad_norm": 1.894146203994751, "learning_rate": 1e-05, - "loss": 0.3606, + "loss": 0.3629, "step": 83200 }, { "epoch": 0.000833, - "grad_norm": 1.8201472759246826, + "grad_norm": 1.7418984174728394, "learning_rate": 1e-05, - "loss": 0.3573, + "loss": 0.358, "step": 83300 }, { "epoch": 0.000834, - "grad_norm": 2.2370197772979736, + "grad_norm": 2.2200584411621094, "learning_rate": 1e-05, "loss": 0.3602, "step": 83400 }, { "epoch": 0.000835, - "grad_norm": 1.6775529384613037, + "grad_norm": 1.7402255535125732, "learning_rate": 1e-05, - "loss": 0.3639, + "loss": 0.3648, "step": 83500 }, { "epoch": 0.000836, - "grad_norm": 1.7370569705963135, + "grad_norm": 1.7476297616958618, "learning_rate": 1e-05, - "loss": 0.3589, + "loss": 0.3603, "step": 83600 }, { "epoch": 0.000837, - "grad_norm": 1.9636480808258057, + "grad_norm": 2.0509250164031982, "learning_rate": 1e-05, - "loss": 0.366, + "loss": 0.3698, "step": 83700 }, { "epoch": 0.000838, - "grad_norm": 1.7211209535598755, + "grad_norm": 1.819290041923523, "learning_rate": 1e-05, - "loss": 0.3493, + "loss": 0.349, "step": 83800 }, { "epoch": 0.000839, - "grad_norm": 1.8661843538284302, + "grad_norm": 1.9946727752685547, "learning_rate": 1e-05, - "loss": 0.3653, + "loss": 0.3666, "step": 83900 }, { "epoch": 0.00084, - "grad_norm": 1.6116656064987183, + "grad_norm": 1.6956796646118164, "learning_rate": 1e-05, - "loss": 0.3687, + "loss": 0.3686, "step": 84000 }, { "epoch": 0.000841, - "grad_norm": 2.025702953338623, + "grad_norm": 1.8575202226638794, "learning_rate": 1e-05, - "loss": 0.3639, + "loss": 0.3666, "step": 84100 }, { "epoch": 0.000842, - "grad_norm": 1.592247724533081, + "grad_norm": 1.7518588304519653, "learning_rate": 1e-05, - "loss": 0.3606, + "loss": 0.3624, "step": 84200 }, { "epoch": 0.000843, - "grad_norm": 1.588572382926941, + "grad_norm": 1.6281752586364746, "learning_rate": 1e-05, - "loss": 0.3562, + "loss": 0.3567, "step": 84300 }, { "epoch": 0.000844, - "grad_norm": 1.6895296573638916, + "grad_norm": 1.8025518655776978, "learning_rate": 1e-05, - "loss": 0.353, + "loss": 0.355, "step": 84400 }, { "epoch": 0.000845, - "grad_norm": 1.6973440647125244, + "grad_norm": 1.787426471710205, "learning_rate": 1e-05, - "loss": 0.3648, + "loss": 0.3666, "step": 84500 }, { "epoch": 0.000846, - "grad_norm": 1.7850263118743896, + "grad_norm": 1.8636668920516968, "learning_rate": 1e-05, - "loss": 0.3512, + "loss": 0.3539, "step": 84600 }, { "epoch": 0.000847, - "grad_norm": 1.797160267829895, + "grad_norm": 1.999342441558838, "learning_rate": 1e-05, - "loss": 0.3562, + "loss": 0.3573, "step": 84700 }, { "epoch": 0.000848, - "grad_norm": 1.8910343647003174, + "grad_norm": 1.7526439428329468, "learning_rate": 1e-05, - "loss": 0.3598, + "loss": 0.3638, "step": 84800 }, { "epoch": 0.000849, - "grad_norm": 1.749621868133545, + "grad_norm": 1.9818848371505737, "learning_rate": 1e-05, - "loss": 0.3613, + "loss": 0.3622, "step": 84900 }, { "epoch": 0.00085, - "grad_norm": 1.9663804769515991, + "grad_norm": 2.2633955478668213, "learning_rate": 1e-05, - "loss": 0.3666, + "loss": 0.3685, "step": 85000 }, { "epoch": 0.000851, - "grad_norm": 2.096741199493408, + "grad_norm": 2.037205696105957, "learning_rate": 1e-05, - "loss": 0.3615, + "loss": 0.3607, "step": 85100 }, { "epoch": 0.000852, - "grad_norm": 2.0016849040985107, + "grad_norm": 2.008530616760254, "learning_rate": 1e-05, - "loss": 0.3614, + "loss": 0.3666, "step": 85200 }, { "epoch": 0.000853, - "grad_norm": 1.6579258441925049, + "grad_norm": 1.7828373908996582, "learning_rate": 1e-05, - "loss": 0.3597, + "loss": 0.3642, "step": 85300 }, { "epoch": 0.000854, - "grad_norm": 2.004843235015869, + "grad_norm": 2.0201706886291504, "learning_rate": 1e-05, - "loss": 0.3568, + "loss": 0.3597, "step": 85400 }, { "epoch": 0.000855, - "grad_norm": 1.9299733638763428, + "grad_norm": 2.0106265544891357, "learning_rate": 1e-05, - "loss": 0.3602, + "loss": 0.3604, "step": 85500 }, { "epoch": 0.000856, - "grad_norm": 2.02068829536438, + "grad_norm": 1.896898627281189, "learning_rate": 1e-05, - "loss": 0.3635, + "loss": 0.366, "step": 85600 }, { "epoch": 0.000857, - "grad_norm": 1.8524069786071777, + "grad_norm": 1.9812458753585815, "learning_rate": 1e-05, - "loss": 0.3565, + "loss": 0.3596, "step": 85700 }, { "epoch": 0.000858, - "grad_norm": 1.9815930128097534, + "grad_norm": 2.0447208881378174, "learning_rate": 1e-05, - "loss": 0.3608, + "loss": 0.3615, "step": 85800 }, { "epoch": 0.000859, - "grad_norm": 1.6845982074737549, + "grad_norm": 1.9249247312545776, "learning_rate": 1e-05, "loss": 0.3532, "step": 85900 }, { "epoch": 0.00086, - "grad_norm": 1.6186513900756836, + "grad_norm": 1.700594186782837, "learning_rate": 1e-05, - "loss": 0.3557, + "loss": 0.3567, "step": 86000 }, { "epoch": 0.000861, - "grad_norm": 1.697465419769287, + "grad_norm": 1.9149887561798096, "learning_rate": 1e-05, - "loss": 0.358, + "loss": 0.3585, "step": 86100 }, { "epoch": 0.000862, - "grad_norm": 2.0410921573638916, + "grad_norm": 2.202561855316162, "learning_rate": 1e-05, - "loss": 0.3662, + "loss": 0.3667, "step": 86200 }, { "epoch": 0.000863, - "grad_norm": 1.8697097301483154, + "grad_norm": 1.8370987176895142, "learning_rate": 1e-05, - "loss": 0.3593, + "loss": 0.3606, "step": 86300 }, { "epoch": 0.000864, - "grad_norm": 1.7522614002227783, + "grad_norm": 2.0995025634765625, "learning_rate": 1e-05, - "loss": 0.364, + "loss": 0.366, "step": 86400 }, { "epoch": 0.000865, - "grad_norm": 1.6217539310455322, + "grad_norm": 1.7918909788131714, "learning_rate": 1e-05, - "loss": 0.3509, + "loss": 0.3524, "step": 86500 }, { "epoch": 0.000866, - "grad_norm": 1.7393057346343994, + "grad_norm": 1.870877742767334, "learning_rate": 1e-05, - "loss": 0.3608, + "loss": 0.3631, "step": 86600 }, { "epoch": 0.000867, - "grad_norm": 1.9387987852096558, + "grad_norm": 2.0287795066833496, "learning_rate": 1e-05, - "loss": 0.3531, + "loss": 0.3555, "step": 86700 }, { "epoch": 0.000868, - "grad_norm": 1.7620469331741333, + "grad_norm": 1.9686987400054932, "learning_rate": 1e-05, - "loss": 0.3631, + "loss": 0.3622, "step": 86800 }, { "epoch": 0.000869, - "grad_norm": 1.6490765810012817, + "grad_norm": 1.714966893196106, "learning_rate": 1e-05, - "loss": 0.356, + "loss": 0.3585, "step": 86900 }, { "epoch": 0.00087, - "grad_norm": 1.972285270690918, + "grad_norm": 2.0388360023498535, "learning_rate": 1e-05, - "loss": 0.3568, + "loss": 0.3605, "step": 87000 }, { "epoch": 0.000871, - "grad_norm": 1.9594944715499878, + "grad_norm": 1.8588838577270508, "learning_rate": 1e-05, - "loss": 0.3607, + "loss": 0.3588, "step": 87100 }, { "epoch": 0.000872, - "grad_norm": 1.6873729228973389, + "grad_norm": 2.2491447925567627, "learning_rate": 1e-05, - "loss": 0.3566, + "loss": 0.3598, "step": 87200 }, { "epoch": 0.000873, - "grad_norm": 1.8429042100906372, + "grad_norm": 2.023857831954956, "learning_rate": 1e-05, - "loss": 0.3589, + "loss": 0.3618, "step": 87300 }, { "epoch": 0.000874, - "grad_norm": 1.4812936782836914, + "grad_norm": 1.5411655902862549, "learning_rate": 1e-05, - "loss": 0.3652, + "loss": 0.3681, "step": 87400 }, { "epoch": 0.000875, - "grad_norm": 1.7025933265686035, + "grad_norm": 1.5299054384231567, "learning_rate": 1e-05, - "loss": 0.3568, + "loss": 0.3594, "step": 87500 }, { "epoch": 0.000876, - "grad_norm": 2.0091121196746826, + "grad_norm": 1.9162421226501465, "learning_rate": 1e-05, - "loss": 0.3563, + "loss": 0.3569, "step": 87600 }, { "epoch": 0.000877, - "grad_norm": 2.0316507816314697, + "grad_norm": 2.160090684890747, "learning_rate": 1e-05, - "loss": 0.3543, + "loss": 0.358, "step": 87700 }, { "epoch": 0.000878, - "grad_norm": 1.8067389726638794, + "grad_norm": 2.044666051864624, "learning_rate": 1e-05, - "loss": 0.3572, + "loss": 0.3609, "step": 87800 }, { "epoch": 0.000879, - "grad_norm": 1.8313833475112915, + "grad_norm": 1.7112947702407837, "learning_rate": 1e-05, - "loss": 0.3543, + "loss": 0.3544, "step": 87900 }, { "epoch": 0.00088, - "grad_norm": 1.852293848991394, + "grad_norm": 1.8648561239242554, "learning_rate": 1e-05, - "loss": 0.3653, + "loss": 0.365, "step": 88000 }, { "epoch": 0.000881, - "grad_norm": 1.6571025848388672, + "grad_norm": 1.8748390674591064, "learning_rate": 1e-05, - "loss": 0.3689, + "loss": 0.3668, "step": 88100 }, { "epoch": 0.000882, - "grad_norm": 2.1299712657928467, + "grad_norm": 2.2753427028656006, "learning_rate": 1e-05, - "loss": 0.363, + "loss": 0.3631, "step": 88200 }, { "epoch": 0.000883, - "grad_norm": 1.8483059406280518, + "grad_norm": 1.8260302543640137, "learning_rate": 1e-05, - "loss": 0.3603, + "loss": 0.3614, "step": 88300 }, { "epoch": 0.000884, - "grad_norm": 1.7549654245376587, + "grad_norm": 1.8950936794281006, "learning_rate": 1e-05, - "loss": 0.3539, + "loss": 0.3555, "step": 88400 }, { "epoch": 0.000885, - "grad_norm": 1.7578660249710083, + "grad_norm": 1.7748656272888184, "learning_rate": 1e-05, - "loss": 0.3579, + "loss": 0.3607, "step": 88500 }, { "epoch": 0.000886, - "grad_norm": 1.802687406539917, + "grad_norm": 1.7580374479293823, "learning_rate": 1e-05, - "loss": 0.3525, + "loss": 0.352, "step": 88600 }, { "epoch": 0.000887, - "grad_norm": 1.756865382194519, + "grad_norm": 1.9820917844772339, "learning_rate": 1e-05, - "loss": 0.3568, + "loss": 0.3549, "step": 88700 }, { "epoch": 0.000888, - "grad_norm": 1.9765276908874512, + "grad_norm": 1.842002272605896, "learning_rate": 1e-05, - "loss": 0.3565, + "loss": 0.3606, "step": 88800 }, { "epoch": 0.000889, - "grad_norm": 1.6778595447540283, + "grad_norm": 1.8876936435699463, "learning_rate": 1e-05, - "loss": 0.3602, + "loss": 0.3607, "step": 88900 }, { "epoch": 0.00089, - "grad_norm": 1.9490047693252563, + "grad_norm": 1.7980290651321411, "learning_rate": 1e-05, - "loss": 0.3528, + "loss": 0.3555, "step": 89000 }, { "epoch": 0.000891, - "grad_norm": 1.806974172592163, + "grad_norm": 2.300755739212036, "learning_rate": 1e-05, - "loss": 0.3566, + "loss": 0.3582, "step": 89100 }, { "epoch": 0.000892, - "grad_norm": 2.137768268585205, + "grad_norm": 2.7439591884613037, "learning_rate": 1e-05, - "loss": 0.354, + "loss": 0.3545, "step": 89200 }, { "epoch": 0.000893, - "grad_norm": 1.8199964761734009, + "grad_norm": 1.7525635957717896, "learning_rate": 1e-05, - "loss": 0.361, + "loss": 0.3636, "step": 89300 }, { "epoch": 0.000894, - "grad_norm": 1.6717396974563599, + "grad_norm": 1.6738208532333374, "learning_rate": 1e-05, - "loss": 0.3577, + "loss": 0.3569, "step": 89400 }, { "epoch": 0.000895, - "grad_norm": 1.5373586416244507, + "grad_norm": 1.5723131895065308, "learning_rate": 1e-05, - "loss": 0.3601, + "loss": 0.3619, "step": 89500 }, { "epoch": 0.000896, - "grad_norm": 2.035975694656372, + "grad_norm": 1.865443468093872, "learning_rate": 1e-05, - "loss": 0.3539, + "loss": 0.3546, "step": 89600 }, { "epoch": 0.000897, - "grad_norm": 1.791956901550293, + "grad_norm": 1.8413885831832886, "learning_rate": 1e-05, - "loss": 0.3496, + "loss": 0.3519, "step": 89700 }, { "epoch": 0.000898, - "grad_norm": 1.7100282907485962, + "grad_norm": 1.760122537612915, "learning_rate": 1e-05, - "loss": 0.3513, + "loss": 0.3525, "step": 89800 }, { "epoch": 0.000899, - "grad_norm": 2.170893907546997, + "grad_norm": 2.0341832637786865, "learning_rate": 1e-05, - "loss": 0.3493, + "loss": 0.3478, "step": 89900 }, { "epoch": 0.0009, - "grad_norm": 2.64080548286438, + "grad_norm": 1.8548213243484497, "learning_rate": 1e-05, "loss": 0.3486, "step": 90000 }, { "epoch": 0.000901, - "grad_norm": 2.1960160732269287, + "grad_norm": 2.114245891571045, "learning_rate": 1e-05, - "loss": 0.3478, + "loss": 0.3494, "step": 90100 }, { "epoch": 0.000902, - "grad_norm": 1.8696004152297974, + "grad_norm": 2.117030143737793, "learning_rate": 1e-05, - "loss": 0.3507, + "loss": 0.3514, "step": 90200 }, { "epoch": 0.000903, - "grad_norm": 1.8514047861099243, + "grad_norm": 1.8521121740341187, "learning_rate": 1e-05, - "loss": 0.3556, + "loss": 0.3571, "step": 90300 }, { "epoch": 0.000904, - "grad_norm": 1.808189034461975, + "grad_norm": 1.9864593744277954, "learning_rate": 1e-05, - "loss": 0.3615, + "loss": 0.3657, "step": 90400 }, { "epoch": 0.000905, - "grad_norm": 1.9479504823684692, + "grad_norm": 1.9219348430633545, "learning_rate": 1e-05, - "loss": 0.3542, + "loss": 0.3541, "step": 90500 }, { "epoch": 0.000906, - "grad_norm": 2.1592888832092285, + "grad_norm": 2.13183856010437, "learning_rate": 1e-05, - "loss": 0.3551, + "loss": 0.3566, "step": 90600 }, { "epoch": 0.000907, - "grad_norm": 1.8082275390625, + "grad_norm": 1.7505743503570557, "learning_rate": 1e-05, - "loss": 0.3507, + "loss": 0.351, "step": 90700 }, { "epoch": 0.000908, - "grad_norm": 1.8496153354644775, + "grad_norm": 1.7294330596923828, "learning_rate": 1e-05, - "loss": 0.3526, + "loss": 0.3536, "step": 90800 }, { "epoch": 0.000909, - "grad_norm": 1.727019190788269, + "grad_norm": 1.8986823558807373, "learning_rate": 1e-05, - "loss": 0.3575, + "loss": 0.3598, "step": 90900 }, { "epoch": 0.00091, - "grad_norm": 1.6531563997268677, + "grad_norm": 1.6649383306503296, "learning_rate": 1e-05, - "loss": 0.3545, + "loss": 0.3568, "step": 91000 }, { "epoch": 0.000911, - "grad_norm": 1.730660319328308, + "grad_norm": 2.0748260021209717, "learning_rate": 1e-05, - "loss": 0.3519, + "loss": 0.3527, "step": 91100 }, { "epoch": 0.000912, - "grad_norm": 2.2856595516204834, + "grad_norm": 1.905617117881775, "learning_rate": 1e-05, - "loss": 0.3512, + "loss": 0.3535, "step": 91200 }, { "epoch": 0.000913, - "grad_norm": 1.6288682222366333, + "grad_norm": 1.764633059501648, "learning_rate": 1e-05, - "loss": 0.3529, + "loss": 0.3517, "step": 91300 }, { "epoch": 0.000914, - "grad_norm": 1.7392635345458984, + "grad_norm": 1.822187900543213, "learning_rate": 1e-05, - "loss": 0.3571, + "loss": 0.3584, "step": 91400 }, { "epoch": 0.000915, - "grad_norm": 1.709946870803833, + "grad_norm": 1.845644235610962, "learning_rate": 1e-05, - "loss": 0.356, + "loss": 0.3561, "step": 91500 }, { "epoch": 0.000916, - "grad_norm": 2.09854793548584, + "grad_norm": 2.082502603530884, "learning_rate": 1e-05, - "loss": 0.3549, + "loss": 0.3563, "step": 91600 }, { "epoch": 0.000917, - "grad_norm": 2.0266406536102295, + "grad_norm": 2.198960065841675, "learning_rate": 1e-05, - "loss": 0.3576, + "loss": 0.3557, "step": 91700 }, { "epoch": 0.000918, - "grad_norm": 1.79050612449646, + "grad_norm": 1.6692492961883545, "learning_rate": 1e-05, - "loss": 0.3534, + "loss": 0.3558, "step": 91800 }, { "epoch": 0.000919, - "grad_norm": 1.9138318300247192, + "grad_norm": 2.025036334991455, "learning_rate": 1e-05, - "loss": 0.3396, + "loss": 0.3427, "step": 91900 }, { "epoch": 0.00092, - "grad_norm": 1.8089325428009033, + "grad_norm": 1.8072044849395752, "learning_rate": 1e-05, - "loss": 0.3526, + "loss": 0.3534, "step": 92000 }, { "epoch": 0.000921, - "grad_norm": 1.9290741682052612, + "grad_norm": 1.989229679107666, "learning_rate": 1e-05, - "loss": 0.354, + "loss": 0.3579, "step": 92100 }, { "epoch": 0.000922, - "grad_norm": 1.7530072927474976, + "grad_norm": 1.943912386894226, "learning_rate": 1e-05, - "loss": 0.3598, + "loss": 0.3625, "step": 92200 }, { "epoch": 0.000923, - "grad_norm": 1.816968560218811, + "grad_norm": 5.172427654266357, "learning_rate": 1e-05, - "loss": 0.3474, + "loss": 0.3467, "step": 92300 }, { "epoch": 0.000924, - "grad_norm": 1.85427725315094, + "grad_norm": 1.854652762413025, "learning_rate": 1e-05, - "loss": 0.3525, + "loss": 0.3526, "step": 92400 }, { "epoch": 0.000925, - "grad_norm": 1.8021162748336792, + "grad_norm": 1.7196903228759766, "learning_rate": 1e-05, - "loss": 0.348, + "loss": 0.3505, "step": 92500 }, { "epoch": 0.000926, - "grad_norm": 1.6519674062728882, + "grad_norm": 1.6658947467803955, "learning_rate": 1e-05, - "loss": 0.3548, + "loss": 0.3564, "step": 92600 }, { "epoch": 0.000927, - "grad_norm": 1.9673748016357422, + "grad_norm": 2.8138256072998047, "learning_rate": 1e-05, - "loss": 0.3575, + "loss": 0.3571, "step": 92700 }, { "epoch": 0.000928, - "grad_norm": 1.635617971420288, + "grad_norm": 1.700640320777893, "learning_rate": 1e-05, - "loss": 0.3556, + "loss": 0.3576, "step": 92800 }, { "epoch": 0.000929, - "grad_norm": 1.8025121688842773, + "grad_norm": 1.738922119140625, "learning_rate": 1e-05, - "loss": 0.3489, + "loss": 0.3482, "step": 92900 }, { "epoch": 0.00093, - "grad_norm": 1.898805022239685, + "grad_norm": 1.7264224290847778, "learning_rate": 1e-05, - "loss": 0.3529, + "loss": 0.3515, "step": 93000 }, { "epoch": 0.000931, - "grad_norm": 1.7505768537521362, + "grad_norm": 1.7760035991668701, "learning_rate": 1e-05, - "loss": 0.3525, + "loss": 0.3546, "step": 93100 }, { "epoch": 0.000932, - "grad_norm": 1.5433109998703003, + "grad_norm": 1.684767484664917, "learning_rate": 1e-05, - "loss": 0.3493, + "loss": 0.3509, "step": 93200 }, { "epoch": 0.000933, - "grad_norm": 1.8749200105667114, + "grad_norm": 1.9357808828353882, "learning_rate": 1e-05, - "loss": 0.36, + "loss": 0.3613, "step": 93300 }, { "epoch": 0.000934, - "grad_norm": 1.7116992473602295, + "grad_norm": 1.853598952293396, "learning_rate": 1e-05, - "loss": 0.3493, + "loss": 0.3523, "step": 93400 }, { "epoch": 0.000935, - "grad_norm": 1.757457971572876, + "grad_norm": 3.272063732147217, "learning_rate": 1e-05, - "loss": 0.3496, + "loss": 0.3515, "step": 93500 }, { "epoch": 0.000936, - "grad_norm": 1.6416513919830322, + "grad_norm": 1.8037041425704956, "learning_rate": 1e-05, - "loss": 0.3511, + "loss": 0.3526, "step": 93600 }, { "epoch": 0.000937, - "grad_norm": 1.8914117813110352, + "grad_norm": 1.989990472793579, "learning_rate": 1e-05, - "loss": 0.3503, + "loss": 0.3512, "step": 93700 }, { "epoch": 0.000938, - "grad_norm": 1.9646599292755127, + "grad_norm": 1.7665644884109497, "learning_rate": 1e-05, - "loss": 0.3464, + "loss": 0.3503, "step": 93800 }, { "epoch": 0.000939, - "grad_norm": 1.8940532207489014, + "grad_norm": 2.230848550796509, "learning_rate": 1e-05, - "loss": 0.3506, + "loss": 0.3532, "step": 93900 }, { "epoch": 0.00094, - "grad_norm": 1.7155450582504272, + "grad_norm": 1.8637299537658691, "learning_rate": 1e-05, - "loss": 0.353, + "loss": 0.3558, "step": 94000 }, { "epoch": 0.000941, - "grad_norm": 1.8361566066741943, + "grad_norm": 1.9153410196304321, "learning_rate": 1e-05, - "loss": 0.3511, + "loss": 0.3534, "step": 94100 }, { "epoch": 0.000942, - "grad_norm": 1.6558592319488525, + "grad_norm": 1.9178539514541626, "learning_rate": 1e-05, - "loss": 0.3559, + "loss": 0.3578, "step": 94200 }, { "epoch": 0.000943, - "grad_norm": 1.974839448928833, + "grad_norm": 1.9506075382232666, "learning_rate": 1e-05, - "loss": 0.3525, + "loss": 0.3557, "step": 94300 }, { "epoch": 0.000944, - "grad_norm": 1.9294072389602661, + "grad_norm": 1.97675621509552, "learning_rate": 1e-05, - "loss": 0.3495, + "loss": 0.3496, "step": 94400 }, { "epoch": 0.000945, - "grad_norm": 1.6195063591003418, + "grad_norm": 1.571119785308838, "learning_rate": 1e-05, - "loss": 0.3521, + "loss": 0.3549, "step": 94500 }, { "epoch": 0.000946, - "grad_norm": 1.8439295291900635, + "grad_norm": 1.84198796749115, "learning_rate": 1e-05, - "loss": 0.3552, + "loss": 0.3564, "step": 94600 }, { "epoch": 0.000947, - "grad_norm": 1.7115392684936523, + "grad_norm": 1.6789623498916626, "learning_rate": 1e-05, - "loss": 0.3439, + "loss": 0.3459, "step": 94700 }, { "epoch": 0.000948, - "grad_norm": 1.865861415863037, + "grad_norm": 1.7345160245895386, "learning_rate": 1e-05, - "loss": 0.3506, + "loss": 0.348, "step": 94800 }, { "epoch": 0.000949, - "grad_norm": 1.554669737815857, + "grad_norm": 1.626235008239746, "learning_rate": 1e-05, - "loss": 0.3539, + "loss": 0.3579, "step": 94900 }, { "epoch": 0.00095, - "grad_norm": 1.9082772731781006, + "grad_norm": 1.8632274866104126, "learning_rate": 1e-05, - "loss": 0.3523, + "loss": 0.3555, "step": 95000 }, { "epoch": 0.000951, - "grad_norm": 1.5038203001022339, + "grad_norm": 1.5302915573120117, "learning_rate": 1e-05, - "loss": 0.3462, + "loss": 0.3515, "step": 95100 }, { "epoch": 0.000952, - "grad_norm": 1.8105841875076294, + "grad_norm": 1.759491205215454, "learning_rate": 1e-05, - "loss": 0.3425, + "loss": 0.3451, "step": 95200 }, { "epoch": 0.000953, - "grad_norm": 2.078392505645752, + "grad_norm": 2.1866915225982666, "learning_rate": 1e-05, - "loss": 0.34, + "loss": 0.3392, "step": 95300 }, { "epoch": 0.000954, - "grad_norm": 1.9626461267471313, + "grad_norm": 1.6935898065567017, "learning_rate": 1e-05, - "loss": 0.3469, + "loss": 0.3497, "step": 95400 }, { "epoch": 0.000955, - "grad_norm": 1.792861819267273, + "grad_norm": 1.9268600940704346, "learning_rate": 1e-05, - "loss": 0.3468, + "loss": 0.3463, "step": 95500 }, { "epoch": 0.000956, - "grad_norm": 1.877244234085083, + "grad_norm": 1.9195621013641357, "learning_rate": 1e-05, - "loss": 0.3493, + "loss": 0.351, "step": 95600 }, { "epoch": 0.000957, - "grad_norm": 2.146423578262329, + "grad_norm": 1.845158338546753, "learning_rate": 1e-05, - "loss": 0.3453, + "loss": 0.3465, "step": 95700 }, { "epoch": 0.000958, - "grad_norm": 1.8045830726623535, + "grad_norm": 2.0196573734283447, "learning_rate": 1e-05, - "loss": 0.3498, + "loss": 0.3525, "step": 95800 }, { "epoch": 0.000959, - "grad_norm": 1.7552355527877808, + "grad_norm": 1.8416608572006226, "learning_rate": 1e-05, - "loss": 0.3492, + "loss": 0.3502, "step": 95900 }, { "epoch": 0.00096, - "grad_norm": 1.742870807647705, + "grad_norm": 1.83146071434021, "learning_rate": 1e-05, - "loss": 0.3502, + "loss": 0.3535, "step": 96000 }, { "epoch": 0.000961, - "grad_norm": 1.8937788009643555, + "grad_norm": 1.8110991716384888, "learning_rate": 1e-05, - "loss": 0.3517, + "loss": 0.3536, "step": 96100 }, { "epoch": 0.000962, - "grad_norm": 1.9000352621078491, + "grad_norm": 1.798935055732727, "learning_rate": 1e-05, - "loss": 0.3495, + "loss": 0.3513, "step": 96200 }, { "epoch": 0.000963, - "grad_norm": 1.5748040676116943, + "grad_norm": 1.7838218212127686, "learning_rate": 1e-05, - "loss": 0.3516, + "loss": 0.353, "step": 96300 }, { "epoch": 0.000964, - "grad_norm": 1.863660454750061, + "grad_norm": 1.8357594013214111, "learning_rate": 1e-05, - "loss": 0.3478, + "loss": 0.35, "step": 96400 }, { "epoch": 0.000965, - "grad_norm": 2.033308982849121, + "grad_norm": 1.9533332586288452, "learning_rate": 1e-05, - "loss": 0.3454, + "loss": 0.3497, "step": 96500 }, { "epoch": 0.000966, - "grad_norm": 1.6928563117980957, + "grad_norm": 1.8430505990982056, "learning_rate": 1e-05, - "loss": 0.3515, + "loss": 0.3535, "step": 96600 }, { "epoch": 0.000967, - "grad_norm": 1.8354567289352417, + "grad_norm": 1.842871069908142, "learning_rate": 1e-05, - "loss": 0.346, + "loss": 0.3455, "step": 96700 }, { "epoch": 0.000968, - "grad_norm": 1.739592432975769, + "grad_norm": 1.8501172065734863, "learning_rate": 1e-05, - "loss": 0.3434, + "loss": 0.3469, "step": 96800 }, { "epoch": 0.000969, - "grad_norm": 1.8741235733032227, + "grad_norm": 1.8171736001968384, "learning_rate": 1e-05, - "loss": 0.3486, + "loss": 0.3503, "step": 96900 }, { "epoch": 0.00097, - "grad_norm": 1.694320559501648, + "grad_norm": 1.8180707693099976, "learning_rate": 1e-05, - "loss": 0.3463, + "loss": 0.3509, "step": 97000 }, { "epoch": 0.000971, - "grad_norm": 1.54352867603302, + "grad_norm": 1.6564078330993652, "learning_rate": 1e-05, - "loss": 0.3461, + "loss": 0.3449, "step": 97100 }, { "epoch": 0.000972, - "grad_norm": 1.796715259552002, + "grad_norm": 1.9035217761993408, "learning_rate": 1e-05, - "loss": 0.344, + "loss": 0.3464, "step": 97200 }, { "epoch": 0.000973, - "grad_norm": 1.8386529684066772, + "grad_norm": 1.7870876789093018, "learning_rate": 1e-05, - "loss": 0.3555, + "loss": 0.358, "step": 97300 }, { "epoch": 0.000974, - "grad_norm": 1.7978730201721191, + "grad_norm": 2.026207447052002, "learning_rate": 1e-05, - "loss": 0.3445, + "loss": 0.3469, "step": 97400 }, { "epoch": 0.000975, - "grad_norm": 1.9349309206008911, + "grad_norm": 1.839242935180664, "learning_rate": 1e-05, - "loss": 0.3474, + "loss": 0.3527, "step": 97500 }, { "epoch": 0.000976, - "grad_norm": 1.8861294984817505, + "grad_norm": 2.1023123264312744, "learning_rate": 1e-05, - "loss": 0.3565, + "loss": 0.3575, "step": 97600 }, { "epoch": 0.000977, - "grad_norm": 1.7746880054473877, + "grad_norm": 1.7062361240386963, "learning_rate": 1e-05, - "loss": 0.34, + "loss": 0.3427, "step": 97700 }, { "epoch": 0.000978, - "grad_norm": 1.9722641706466675, + "grad_norm": 1.8973636627197266, "learning_rate": 1e-05, - "loss": 0.349, + "loss": 0.3496, "step": 97800 }, { "epoch": 0.000979, - "grad_norm": 1.6193134784698486, + "grad_norm": 4.865823745727539, "learning_rate": 1e-05, - "loss": 0.3542, + "loss": 0.3586, "step": 97900 }, { "epoch": 0.00098, - "grad_norm": 1.6607670783996582, + "grad_norm": 1.6862282752990723, "learning_rate": 1e-05, - "loss": 0.3479, + "loss": 0.3483, "step": 98000 }, { "epoch": 0.000981, - "grad_norm": 1.7609422206878662, + "grad_norm": 1.7278543710708618, "learning_rate": 1e-05, - "loss": 0.3427, + "loss": 0.345, "step": 98100 }, { "epoch": 0.000982, - "grad_norm": 1.7859488725662231, + "grad_norm": 1.9642508029937744, "learning_rate": 1e-05, - "loss": 0.3522, + "loss": 0.3552, "step": 98200 }, { "epoch": 0.000983, - "grad_norm": 1.7998120784759521, + "grad_norm": 1.6919240951538086, "learning_rate": 1e-05, - "loss": 0.3468, + "loss": 0.3481, "step": 98300 }, { "epoch": 0.000984, - "grad_norm": 1.695994257926941, + "grad_norm": 1.7211792469024658, "learning_rate": 1e-05, - "loss": 0.341, + "loss": 0.3456, "step": 98400 }, { "epoch": 0.000985, - "grad_norm": 1.7125250101089478, + "grad_norm": 1.8794984817504883, "learning_rate": 1e-05, - "loss": 0.3526, + "loss": 0.3547, "step": 98500 }, { "epoch": 0.000986, - "grad_norm": 1.738882064819336, + "grad_norm": 1.7422791719436646, "learning_rate": 1e-05, - "loss": 0.3441, + "loss": 0.3459, "step": 98600 }, { "epoch": 0.000987, - "grad_norm": 1.620566725730896, + "grad_norm": 1.7812235355377197, "learning_rate": 1e-05, - "loss": 0.3498, + "loss": 0.3534, "step": 98700 }, { "epoch": 0.000988, - "grad_norm": 1.5262073278427124, + "grad_norm": 1.7994880676269531, "learning_rate": 1e-05, - "loss": 0.3395, + "loss": 0.3406, "step": 98800 }, { "epoch": 0.000989, - "grad_norm": 1.791178822517395, + "grad_norm": 1.766994595527649, "learning_rate": 1e-05, - "loss": 0.3433, + "loss": 0.3454, "step": 98900 }, { "epoch": 0.00099, - "grad_norm": 1.7370917797088623, + "grad_norm": 1.9302865266799927, "learning_rate": 1e-05, - "loss": 0.3423, + "loss": 0.3434, "step": 99000 }, { "epoch": 0.000991, - "grad_norm": 1.5319187641143799, + "grad_norm": 1.6279524564743042, "learning_rate": 1e-05, - "loss": 0.3455, + "loss": 0.3443, "step": 99100 }, { "epoch": 0.000992, - "grad_norm": 1.8497802019119263, + "grad_norm": 1.878088116645813, "learning_rate": 1e-05, - "loss": 0.3422, + "loss": 0.3433, "step": 99200 }, { "epoch": 0.000993, - "grad_norm": 1.6758663654327393, + "grad_norm": 1.9811022281646729, "learning_rate": 1e-05, - "loss": 0.3428, + "loss": 0.3444, "step": 99300 }, { "epoch": 0.000994, - "grad_norm": 1.7789148092269897, + "grad_norm": 1.9504814147949219, "learning_rate": 1e-05, - "loss": 0.3449, + "loss": 0.3448, "step": 99400 }, { "epoch": 0.000995, - "grad_norm": 1.6672728061676025, + "grad_norm": 1.7477716207504272, "learning_rate": 1e-05, - "loss": 0.3339, + "loss": 0.3372, "step": 99500 }, { "epoch": 0.000996, - "grad_norm": 1.8337355852127075, + "grad_norm": 1.9687480926513672, "learning_rate": 1e-05, - "loss": 0.3461, + "loss": 0.346, "step": 99600 }, { "epoch": 0.000997, - "grad_norm": 1.9244012832641602, + "grad_norm": 2.0356996059417725, "learning_rate": 1e-05, - "loss": 0.3489, + "loss": 0.3508, "step": 99700 }, { "epoch": 0.000998, - "grad_norm": 1.7637393474578857, + "grad_norm": 1.816023349761963, "learning_rate": 1e-05, - "loss": 0.3531, + "loss": 0.3524, "step": 99800 }, { "epoch": 0.000999, - "grad_norm": 1.8803791999816895, + "grad_norm": 2.0732617378234863, "learning_rate": 1e-05, - "loss": 0.3429, + "loss": 0.3468, "step": 99900 }, { "epoch": 0.001, - "grad_norm": 1.7920255661010742, + "grad_norm": 1.8265982866287231, "learning_rate": 1e-05, - "loss": 0.3463, + "loss": 0.3485, "step": 100000 }, { "epoch": 0.001, - "eval_loss": 0.3212890625, - "eval_runtime": 106.2469, - "eval_samples_per_second": 470.602, - "eval_steps_per_second": 29.413, + "eval_loss": 0.322509765625, + "eval_runtime": 109.8279, + "eval_samples_per_second": 455.258, + "eval_steps_per_second": 28.454, "step": 100000 }, { "epoch": 0.001001, - "grad_norm": 1.6265007257461548, + "grad_norm": 1.6607346534729004, "learning_rate": 1e-05, - "loss": 0.349, + "loss": 0.35, "step": 100100 }, { "epoch": 0.001002, - "grad_norm": 1.710910439491272, + "grad_norm": 1.8534297943115234, "learning_rate": 1e-05, - "loss": 0.3498, + "loss": 0.3515, "step": 100200 }, { "epoch": 0.001003, - "grad_norm": 1.9668190479278564, + "grad_norm": 1.7251813411712646, "learning_rate": 1e-05, - "loss": 0.344, + "loss": 0.3449, "step": 100300 }, { "epoch": 0.001004, - "grad_norm": 1.730014681816101, + "grad_norm": 2.1961374282836914, "learning_rate": 1e-05, - "loss": 0.3425, + "loss": 0.3442, "step": 100400 }, { "epoch": 0.001005, - "grad_norm": 1.4988311529159546, + "grad_norm": 1.8674982786178589, "learning_rate": 1e-05, - "loss": 0.3532, + "loss": 0.357, "step": 100500 }, { "epoch": 0.001006, - "grad_norm": 1.882324457168579, + "grad_norm": 1.9515495300292969, "learning_rate": 1e-05, - "loss": 0.3475, + "loss": 0.3516, "step": 100600 }, { "epoch": 0.001007, - "grad_norm": 1.6972439289093018, + "grad_norm": 1.8294038772583008, "learning_rate": 1e-05, - "loss": 0.3461, + "loss": 0.3475, "step": 100700 }, { "epoch": 0.001008, - "grad_norm": 1.5815422534942627, + "grad_norm": 1.7040425539016724, "learning_rate": 1e-05, - "loss": 0.3397, + "loss": 0.3403, "step": 100800 }, { "epoch": 0.001009, - "grad_norm": 2.057607650756836, + "grad_norm": 2.464323043823242, "learning_rate": 1e-05, - "loss": 0.3455, + "loss": 0.3437, "step": 100900 }, { "epoch": 0.00101, - "grad_norm": 1.5138230323791504, + "grad_norm": 1.5711098909378052, "learning_rate": 1e-05, - "loss": 0.3431, + "loss": 0.3465, "step": 101000 }, { "epoch": 0.001011, - "grad_norm": 1.500015139579773, + "grad_norm": 1.6807917356491089, "learning_rate": 1e-05, - "loss": 0.3419, + "loss": 0.3462, "step": 101100 }, { "epoch": 0.001012, - "grad_norm": 2.0951130390167236, + "grad_norm": 2.2576940059661865, "learning_rate": 1e-05, - "loss": 0.3393, + "loss": 0.3389, "step": 101200 }, { "epoch": 0.001013, - "grad_norm": 1.982593297958374, + "grad_norm": 1.7972438335418701, "learning_rate": 1e-05, - "loss": 0.3419, + "loss": 0.3442, "step": 101300 }, { "epoch": 0.001014, - "grad_norm": 1.7170299291610718, + "grad_norm": 1.8780492544174194, "learning_rate": 1e-05, - "loss": 0.3422, + "loss": 0.3424, "step": 101400 }, { "epoch": 0.001015, - "grad_norm": 1.6410114765167236, + "grad_norm": 1.7459834814071655, "learning_rate": 1e-05, - "loss": 0.3378, + "loss": 0.3403, "step": 101500 }, { "epoch": 0.001016, - "grad_norm": 1.5306416749954224, + "grad_norm": 1.549613118171692, "learning_rate": 1e-05, - "loss": 0.3341, + "loss": 0.3354, "step": 101600 }, { "epoch": 0.001017, - "grad_norm": 1.8244845867156982, + "grad_norm": 1.737701416015625, "learning_rate": 1e-05, - "loss": 0.3421, + "loss": 0.3447, "step": 101700 }, { "epoch": 0.001018, - "grad_norm": 1.8460711240768433, + "grad_norm": 1.936926245689392, "learning_rate": 1e-05, - "loss": 0.3445, + "loss": 0.3457, "step": 101800 }, { "epoch": 0.001019, - "grad_norm": 1.66615891456604, + "grad_norm": 1.579476237297058, "learning_rate": 1e-05, - "loss": 0.3383, + "loss": 0.3388, "step": 101900 }, { "epoch": 0.00102, - "grad_norm": 1.6163735389709473, + "grad_norm": 1.620912790298462, "learning_rate": 1e-05, - "loss": 0.3395, + "loss": 0.342, "step": 102000 }, { "epoch": 0.001021, - "grad_norm": 1.6465198993682861, + "grad_norm": 1.801440715789795, "learning_rate": 1e-05, - "loss": 0.3442, + "loss": 0.3479, "step": 102100 }, { "epoch": 0.001022, - "grad_norm": 1.7939703464508057, + "grad_norm": 1.9294061660766602, "learning_rate": 1e-05, - "loss": 0.3528, + "loss": 0.3535, "step": 102200 }, { "epoch": 0.001023, - "grad_norm": 1.5161067247390747, + "grad_norm": 1.7232532501220703, "learning_rate": 1e-05, - "loss": 0.346, + "loss": 0.3465, "step": 102300 }, { "epoch": 0.001024, - "grad_norm": 1.5762897729873657, + "grad_norm": 2.327086925506592, "learning_rate": 1e-05, - "loss": 0.3372, + "loss": 0.3379, "step": 102400 }, { "epoch": 0.001025, - "grad_norm": 1.6103670597076416, + "grad_norm": 1.652092695236206, "learning_rate": 1e-05, - "loss": 0.3413, + "loss": 0.3399, "step": 102500 }, { "epoch": 0.001026, - "grad_norm": 1.7266919612884521, + "grad_norm": 1.6813795566558838, "learning_rate": 1e-05, - "loss": 0.3468, + "loss": 0.3489, "step": 102600 }, { "epoch": 0.001027, - "grad_norm": 1.832675576210022, + "grad_norm": 1.9206030368804932, "learning_rate": 1e-05, - "loss": 0.3468, + "loss": 0.3476, "step": 102700 }, { "epoch": 0.001028, - "grad_norm": 1.5487641096115112, + "grad_norm": 1.5671733617782593, "learning_rate": 1e-05, - "loss": 0.3362, + "loss": 0.337, "step": 102800 }, { "epoch": 0.001029, - "grad_norm": 1.7946821451187134, + "grad_norm": 1.8689380884170532, "learning_rate": 1e-05, - "loss": 0.3434, + "loss": 0.3449, "step": 102900 }, { "epoch": 0.00103, - "grad_norm": 1.5789737701416016, + "grad_norm": 1.6365469694137573, "learning_rate": 1e-05, - "loss": 0.3406, + "loss": 0.3421, "step": 103000 }, { "epoch": 0.001031, - "grad_norm": 1.8219558000564575, + "grad_norm": 2.0219128131866455, "learning_rate": 1e-05, - "loss": 0.3396, + "loss": 0.3402, "step": 103100 }, { "epoch": 0.001032, - "grad_norm": 1.5498402118682861, + "grad_norm": 1.6188757419586182, "learning_rate": 1e-05, - "loss": 0.3469, + "loss": 0.3458, "step": 103200 }, { "epoch": 0.001033, - "grad_norm": 1.8296899795532227, + "grad_norm": 1.809544324874878, "learning_rate": 1e-05, - "loss": 0.3448, + "loss": 0.3436, "step": 103300 }, { "epoch": 0.001034, - "grad_norm": 1.6243722438812256, + "grad_norm": 1.6749653816223145, "learning_rate": 1e-05, - "loss": 0.3454, + "loss": 0.3436, "step": 103400 }, { "epoch": 0.001035, - "grad_norm": 1.926013469696045, + "grad_norm": 1.9452251195907593, "learning_rate": 1e-05, - "loss": 0.3348, + "loss": 0.3355, "step": 103500 }, { "epoch": 0.001036, - "grad_norm": 1.9348326921463013, + "grad_norm": 1.790397047996521, "learning_rate": 1e-05, - "loss": 0.3475, + "loss": 0.3489, "step": 103600 }, { "epoch": 0.001037, - "grad_norm": 1.5674424171447754, + "grad_norm": 1.5875970125198364, "learning_rate": 1e-05, - "loss": 0.3366, + "loss": 0.3358, "step": 103700 }, { "epoch": 0.001038, - "grad_norm": 1.6316710710525513, + "grad_norm": 1.6320905685424805, "learning_rate": 1e-05, - "loss": 0.3399, + "loss": 0.3436, "step": 103800 }, { "epoch": 0.001039, - "grad_norm": 1.6558021306991577, + "grad_norm": 1.6067711114883423, "learning_rate": 1e-05, - "loss": 0.3412, + "loss": 0.3438, "step": 103900 }, { "epoch": 0.00104, - "grad_norm": 2.274764060974121, + "grad_norm": 1.8375946283340454, "learning_rate": 1e-05, - "loss": 0.3415, + "loss": 0.344, "step": 104000 }, { "epoch": 0.001041, - "grad_norm": 1.8469206094741821, + "grad_norm": 1.708240270614624, "learning_rate": 1e-05, - "loss": 0.3469, + "loss": 0.3491, "step": 104100 }, { "epoch": 0.001042, - "grad_norm": 1.9673590660095215, + "grad_norm": 2.3994433879852295, "learning_rate": 1e-05, - "loss": 0.3436, + "loss": 0.3462, "step": 104200 }, { "epoch": 0.001043, - "grad_norm": 1.799859881401062, + "grad_norm": 1.7040139436721802, "learning_rate": 1e-05, - "loss": 0.3438, + "loss": 0.3459, "step": 104300 }, { "epoch": 0.001044, - "grad_norm": 1.7712419033050537, + "grad_norm": 2.01163911819458, "learning_rate": 1e-05, - "loss": 0.3442, + "loss": 0.3463, "step": 104400 }, { "epoch": 0.001045, - "grad_norm": 1.701420545578003, + "grad_norm": 1.604658842086792, "learning_rate": 1e-05, - "loss": 0.3378, + "loss": 0.341, "step": 104500 }, { "epoch": 0.001046, - "grad_norm": 2.030165195465088, + "grad_norm": 2.69278883934021, "learning_rate": 1e-05, - "loss": 0.3384, + "loss": 0.3418, "step": 104600 }, { "epoch": 0.001047, - "grad_norm": 1.625741958618164, + "grad_norm": 1.6742432117462158, "learning_rate": 1e-05, - "loss": 0.3358, + "loss": 0.3376, "step": 104700 }, { "epoch": 0.001048, - "grad_norm": 1.5410789251327515, + "grad_norm": 1.7139792442321777, "learning_rate": 1e-05, - "loss": 0.3417, + "loss": 0.3448, "step": 104800 }, { "epoch": 0.001049, - "grad_norm": 1.746232509613037, + "grad_norm": 1.9812430143356323, "learning_rate": 1e-05, - "loss": 0.3453, + "loss": 0.3443, "step": 104900 }, { "epoch": 0.00105, - "grad_norm": 1.8489423990249634, + "grad_norm": 1.9630818367004395, "learning_rate": 1e-05, - "loss": 0.3366, + "loss": 0.3367, "step": 105000 }, { "epoch": 0.001051, - "grad_norm": 1.4868258237838745, + "grad_norm": 1.913673758506775, "learning_rate": 1e-05, - "loss": 0.3419, + "loss": 0.3433, "step": 105100 }, { "epoch": 0.001052, - "grad_norm": 1.6324551105499268, + "grad_norm": 1.6645994186401367, "learning_rate": 1e-05, - "loss": 0.3451, + "loss": 0.3483, "step": 105200 }, { "epoch": 0.001053, - "grad_norm": 1.6764284372329712, + "grad_norm": 1.6963456869125366, "learning_rate": 1e-05, - "loss": 0.3504, + "loss": 0.3518, "step": 105300 }, { "epoch": 0.001054, - "grad_norm": 1.6849185228347778, + "grad_norm": 1.774322748184204, "learning_rate": 1e-05, - "loss": 0.3397, + "loss": 0.339, "step": 105400 }, { "epoch": 0.001055, - "grad_norm": 1.5984441041946411, + "grad_norm": 1.6794517040252686, "learning_rate": 1e-05, - "loss": 0.3374, + "loss": 0.3412, "step": 105500 }, { "epoch": 0.001056, - "grad_norm": 1.8809632062911987, + "grad_norm": 2.0357189178466797, "learning_rate": 1e-05, - "loss": 0.3376, + "loss": 0.3406, "step": 105600 }, { "epoch": 0.001057, - "grad_norm": 1.7457271814346313, + "grad_norm": 1.7818143367767334, "learning_rate": 1e-05, - "loss": 0.3331, + "loss": 0.3379, "step": 105700 }, { "epoch": 0.001058, - "grad_norm": 1.699915885925293, + "grad_norm": 1.819798231124878, "learning_rate": 1e-05, - "loss": 0.3303, + "loss": 0.3328, "step": 105800 }, { "epoch": 0.001059, - "grad_norm": 1.5862298011779785, + "grad_norm": 1.751774787902832, "learning_rate": 1e-05, - "loss": 0.3405, + "loss": 0.3403, "step": 105900 }, { "epoch": 0.00106, - "grad_norm": 1.8356544971466064, + "grad_norm": 1.720474362373352, "learning_rate": 1e-05, - "loss": 0.3409, + "loss": 0.3427, "step": 106000 }, { "epoch": 0.001061, - "grad_norm": 1.8761811256408691, + "grad_norm": 1.7977921962738037, "learning_rate": 1e-05, - "loss": 0.3392, + "loss": 0.3435, "step": 106100 }, { "epoch": 0.001062, - "grad_norm": 1.8505558967590332, + "grad_norm": 2.1512701511383057, "learning_rate": 1e-05, - "loss": 0.3418, + "loss": 0.3443, "step": 106200 }, { "epoch": 0.001063, - "grad_norm": 1.6820476055145264, + "grad_norm": 1.7027465105056763, "learning_rate": 1e-05, - "loss": 0.3343, + "loss": 0.3362, "step": 106300 }, { "epoch": 0.001064, - "grad_norm": 1.6789436340332031, + "grad_norm": 1.663902997970581, "learning_rate": 1e-05, - "loss": 0.3433, + "loss": 0.3459, "step": 106400 }, { "epoch": 0.001065, - "grad_norm": 1.4228408336639404, + "grad_norm": 1.7038410902023315, "learning_rate": 1e-05, - "loss": 0.3331, + "loss": 0.3356, "step": 106500 }, { "epoch": 0.001066, - "grad_norm": 1.5530163049697876, + "grad_norm": 1.5602421760559082, "learning_rate": 1e-05, - "loss": 0.3358, + "loss": 0.3354, "step": 106600 }, { "epoch": 0.001067, - "grad_norm": 2.206634044647217, + "grad_norm": 1.8753788471221924, "learning_rate": 1e-05, - "loss": 0.3307, + "loss": 0.3342, "step": 106700 }, { "epoch": 0.001068, - "grad_norm": 1.8783141374588013, + "grad_norm": 1.731338620185852, "learning_rate": 1e-05, - "loss": 0.3375, + "loss": 0.34, "step": 106800 }, { "epoch": 0.001069, - "grad_norm": 1.6686680316925049, + "grad_norm": 1.7717700004577637, "learning_rate": 1e-05, - "loss": 0.3362, + "loss": 0.3404, "step": 106900 }, { "epoch": 0.00107, - "grad_norm": 1.478062629699707, + "grad_norm": 1.5369184017181396, "learning_rate": 1e-05, - "loss": 0.35, + "loss": 0.3534, "step": 107000 }, { "epoch": 0.001071, - "grad_norm": 2.0970678329467773, + "grad_norm": 2.0892210006713867, "learning_rate": 1e-05, - "loss": 0.3384, + "loss": 0.3396, "step": 107100 }, { "epoch": 0.001072, - "grad_norm": 2.636615514755249, + "grad_norm": 1.9275939464569092, "learning_rate": 1e-05, - "loss": 0.3379, + "loss": 0.3403, "step": 107200 }, { "epoch": 0.001073, - "grad_norm": 1.7082911729812622, + "grad_norm": 1.9656401872634888, "learning_rate": 1e-05, - "loss": 0.3394, + "loss": 0.3392, "step": 107300 }, { "epoch": 0.001074, - "grad_norm": 1.6989812850952148, + "grad_norm": 1.7235068082809448, "learning_rate": 1e-05, - "loss": 0.3402, + "loss": 0.3416, "step": 107400 }, { "epoch": 0.001075, - "grad_norm": 1.8082529306411743, + "grad_norm": 1.8416111469268799, "learning_rate": 1e-05, - "loss": 0.3371, + "loss": 0.3367, "step": 107500 }, { "epoch": 0.001076, - "grad_norm": 1.7570844888687134, + "grad_norm": 1.7464598417282104, "learning_rate": 1e-05, - "loss": 0.3395, + "loss": 0.3436, "step": 107600 }, { "epoch": 0.001077, - "grad_norm": 1.8686490058898926, + "grad_norm": 1.8630284070968628, "learning_rate": 1e-05, - "loss": 0.3377, + "loss": 0.3432, "step": 107700 }, { "epoch": 0.001078, - "grad_norm": 1.869361400604248, + "grad_norm": 1.740233302116394, "learning_rate": 1e-05, - "loss": 0.3364, + "loss": 0.3384, "step": 107800 }, { "epoch": 0.001079, - "grad_norm": 1.6268250942230225, + "grad_norm": 1.5509331226348877, "learning_rate": 1e-05, - "loss": 0.3296, + "loss": 0.3332, "step": 107900 }, { "epoch": 0.00108, - "grad_norm": 1.7727303504943848, + "grad_norm": 1.7383582592010498, "learning_rate": 1e-05, - "loss": 0.3417, + "loss": 0.3456, "step": 108000 }, { "epoch": 0.001081, - "grad_norm": 1.874608039855957, + "grad_norm": 1.9408977031707764, "learning_rate": 1e-05, - "loss": 0.3373, + "loss": 0.3396, "step": 108100 }, { "epoch": 0.001082, - "grad_norm": 1.6596509218215942, + "grad_norm": 1.6888933181762695, "learning_rate": 1e-05, - "loss": 0.339, + "loss": 0.3404, "step": 108200 }, { "epoch": 0.001083, - "grad_norm": 1.9414929151535034, + "grad_norm": 1.9360098838806152, "learning_rate": 1e-05, - "loss": 0.3387, + "loss": 0.3431, "step": 108300 }, { "epoch": 0.001084, - "grad_norm": 1.7047326564788818, + "grad_norm": 1.7306195497512817, "learning_rate": 1e-05, - "loss": 0.333, + "loss": 0.3348, "step": 108400 }, { "epoch": 0.001085, - "grad_norm": 1.4662975072860718, + "grad_norm": 1.6970361471176147, "learning_rate": 1e-05, - "loss": 0.3369, + "loss": 0.3381, "step": 108500 }, { "epoch": 0.001086, - "grad_norm": 1.733799695968628, + "grad_norm": 1.664059042930603, "learning_rate": 1e-05, - "loss": 0.3365, + "loss": 0.3403, "step": 108600 }, { "epoch": 0.001087, - "grad_norm": 1.5928646326065063, + "grad_norm": 1.7473076581954956, "learning_rate": 1e-05, - "loss": 0.3393, + "loss": 0.3403, "step": 108700 }, { "epoch": 0.001088, - "grad_norm": 1.700872778892517, + "grad_norm": 1.705640196800232, "learning_rate": 1e-05, - "loss": 0.3313, + "loss": 0.3353, "step": 108800 }, { "epoch": 0.001089, - "grad_norm": 1.597302794456482, + "grad_norm": 1.8058274984359741, "learning_rate": 1e-05, - "loss": 0.3276, + "loss": 0.3309, "step": 108900 }, { "epoch": 0.00109, - "grad_norm": 1.8077725172042847, + "grad_norm": 1.7639211416244507, "learning_rate": 1e-05, - "loss": 0.3329, + "loss": 0.3322, "step": 109000 }, { "epoch": 0.001091, - "grad_norm": 1.863740086555481, + "grad_norm": 2.020731210708618, "learning_rate": 1e-05, - "loss": 0.3388, + "loss": 0.3421, "step": 109100 }, { "epoch": 0.001092, - "grad_norm": 1.613777756690979, + "grad_norm": 1.7614929676055908, "learning_rate": 1e-05, - "loss": 0.3359, + "loss": 0.3395, "step": 109200 }, { "epoch": 0.001093, - "grad_norm": 1.744760513305664, + "grad_norm": 1.8153364658355713, "learning_rate": 1e-05, - "loss": 0.3402, + "loss": 0.3413, "step": 109300 }, { "epoch": 0.001094, - "grad_norm": 1.7309943437576294, + "grad_norm": 1.803002119064331, "learning_rate": 1e-05, - "loss": 0.3324, + "loss": 0.337, "step": 109400 }, { "epoch": 0.001095, - "grad_norm": 1.7989113330841064, + "grad_norm": 1.6940698623657227, "learning_rate": 1e-05, - "loss": 0.3344, + "loss": 0.3372, "step": 109500 }, { "epoch": 0.001096, - "grad_norm": 1.7089550495147705, + "grad_norm": 1.8647571802139282, "learning_rate": 1e-05, - "loss": 0.3383, + "loss": 0.338, "step": 109600 }, { "epoch": 0.001097, - "grad_norm": 1.8682044744491577, + "grad_norm": 1.9329015016555786, "learning_rate": 1e-05, - "loss": 0.3392, + "loss": 0.3397, "step": 109700 }, { "epoch": 0.001098, - "grad_norm": 1.7993671894073486, + "grad_norm": 1.8334521055221558, "learning_rate": 1e-05, - "loss": 0.3348, + "loss": 0.3368, "step": 109800 }, { "epoch": 0.001099, - "grad_norm": 1.9016611576080322, + "grad_norm": 1.6593636274337769, "learning_rate": 1e-05, - "loss": 0.3363, + "loss": 0.335, "step": 109900 }, { "epoch": 0.0011, - "grad_norm": 1.8169256448745728, + "grad_norm": 1.8620237112045288, "learning_rate": 1e-05, - "loss": 0.3358, + "loss": 0.3383, "step": 110000 }, { "epoch": 0.001101, - "grad_norm": 1.7962956428527832, + "grad_norm": 1.8764339685440063, "learning_rate": 1e-05, - "loss": 0.3347, + "loss": 0.3338, "step": 110100 }, { "epoch": 0.001102, - "grad_norm": 1.964445948600769, + "grad_norm": 1.9678648710250854, "learning_rate": 1e-05, - "loss": 0.3432, + "loss": 0.3441, "step": 110200 }, { "epoch": 0.001103, - "grad_norm": 1.6810815334320068, + "grad_norm": 1.5820297002792358, "learning_rate": 1e-05, - "loss": 0.3367, + "loss": 0.3388, "step": 110300 }, { "epoch": 0.001104, - "grad_norm": 1.7712125778198242, + "grad_norm": 1.78269362449646, "learning_rate": 1e-05, - "loss": 0.3418, + "loss": 0.34, "step": 110400 }, { "epoch": 0.001105, - "grad_norm": 1.7732539176940918, + "grad_norm": 2.253110647201538, "learning_rate": 1e-05, - "loss": 0.3324, + "loss": 0.3352, "step": 110500 }, { "epoch": 0.001106, - "grad_norm": 1.6650182008743286, + "grad_norm": 1.553359866142273, "learning_rate": 1e-05, - "loss": 0.3287, + "loss": 0.3316, "step": 110600 }, { "epoch": 0.001107, - "grad_norm": 1.7873729467391968, + "grad_norm": 1.8975584506988525, "learning_rate": 1e-05, - "loss": 0.3326, + "loss": 0.3363, "step": 110700 }, { "epoch": 0.001108, - "grad_norm": 1.9017982482910156, + "grad_norm": 1.9635969400405884, "learning_rate": 1e-05, - "loss": 0.331, + "loss": 0.3339, "step": 110800 }, { "epoch": 0.001109, - "grad_norm": 1.5583044290542603, + "grad_norm": 1.6212959289550781, "learning_rate": 1e-05, - "loss": 0.3351, + "loss": 0.3365, "step": 110900 }, { "epoch": 0.00111, - "grad_norm": 1.5554203987121582, + "grad_norm": 1.667982578277588, "learning_rate": 1e-05, - "loss": 0.3333, + "loss": 0.3385, "step": 111000 }, { "epoch": 0.001111, - "grad_norm": 3.261989116668701, + "grad_norm": 1.799843192100525, "learning_rate": 1e-05, - "loss": 0.3324, + "loss": 0.3342, "step": 111100 }, { "epoch": 0.001112, - "grad_norm": 1.7398645877838135, + "grad_norm": 1.9425872564315796, "learning_rate": 1e-05, - "loss": 0.3432, + "loss": 0.3445, "step": 111200 }, { "epoch": 0.001113, - "grad_norm": 1.5493426322937012, + "grad_norm": 1.7052315473556519, "learning_rate": 1e-05, - "loss": 0.3338, + "loss": 0.3321, "step": 111300 }, { "epoch": 0.001114, - "grad_norm": 1.820183277130127, + "grad_norm": 1.8439725637435913, "learning_rate": 1e-05, - "loss": 0.334, + "loss": 0.3362, "step": 111400 }, { "epoch": 0.001115, - "grad_norm": 1.4531216621398926, + "grad_norm": 1.5285687446594238, "learning_rate": 1e-05, - "loss": 0.336, + "loss": 0.3362, "step": 111500 }, { "epoch": 0.001116, - "grad_norm": 1.7043826580047607, + "grad_norm": 1.8127108812332153, "learning_rate": 1e-05, - "loss": 0.331, + "loss": 0.3351, "step": 111600 }, { "epoch": 0.001117, - "grad_norm": 1.4433501958847046, + "grad_norm": 2.099846363067627, "learning_rate": 1e-05, - "loss": 0.3328, + "loss": 0.3348, "step": 111700 }, { "epoch": 0.001118, - "grad_norm": 1.7214539051055908, + "grad_norm": 1.8282333612442017, "learning_rate": 1e-05, - "loss": 0.3323, + "loss": 0.3353, "step": 111800 }, { "epoch": 0.001119, - "grad_norm": 1.9365381002426147, + "grad_norm": 1.9214797019958496, "learning_rate": 1e-05, - "loss": 0.3374, + "loss": 0.3384, "step": 111900 }, { "epoch": 0.00112, - "grad_norm": 1.510133981704712, + "grad_norm": 1.5597374439239502, "learning_rate": 1e-05, - "loss": 0.3346, + "loss": 0.339, "step": 112000 }, { "epoch": 0.001121, - "grad_norm": 1.7911183834075928, + "grad_norm": 1.7652438879013062, "learning_rate": 1e-05, - "loss": 0.3364, + "loss": 0.336, "step": 112100 }, { "epoch": 0.001122, - "grad_norm": 1.4433517456054688, + "grad_norm": 1.6596375703811646, "learning_rate": 1e-05, - "loss": 0.3353, + "loss": 0.3375, "step": 112200 }, { "epoch": 0.001123, - "grad_norm": 1.7540043592453003, + "grad_norm": 1.7220653295516968, "learning_rate": 1e-05, - "loss": 0.3376, + "loss": 0.3424, "step": 112300 }, { "epoch": 0.001124, - "grad_norm": 1.6091129779815674, + "grad_norm": 1.7662781476974487, "learning_rate": 1e-05, - "loss": 0.3368, + "loss": 0.3389, "step": 112400 }, { "epoch": 0.001125, - "grad_norm": 1.854957938194275, + "grad_norm": 2.3776822090148926, "learning_rate": 1e-05, - "loss": 0.3344, + "loss": 0.3366, "step": 112500 }, { "epoch": 0.001126, - "grad_norm": 1.3771413564682007, + "grad_norm": 1.8062163591384888, "learning_rate": 1e-05, - "loss": 0.3326, + "loss": 0.3366, "step": 112600 }, { "epoch": 0.001127, - "grad_norm": 1.4237767457962036, + "grad_norm": 1.6732230186462402, "learning_rate": 1e-05, - "loss": 0.3348, + "loss": 0.3338, "step": 112700 }, { "epoch": 0.001128, - "grad_norm": 1.9680719375610352, + "grad_norm": 1.7580362558364868, "learning_rate": 1e-05, - "loss": 0.3276, + "loss": 0.33, "step": 112800 }, { "epoch": 0.001129, - "grad_norm": 1.586517572402954, + "grad_norm": 1.4763306379318237, "learning_rate": 1e-05, - "loss": 0.3307, + "loss": 0.3323, "step": 112900 }, { "epoch": 0.00113, - "grad_norm": 1.860843539237976, + "grad_norm": 1.7444915771484375, "learning_rate": 1e-05, - "loss": 0.336, + "loss": 0.3368, "step": 113000 }, { "epoch": 0.001131, - "grad_norm": 1.5043953657150269, + "grad_norm": 1.487596869468689, "learning_rate": 1e-05, - "loss": 0.3329, + "loss": 0.3372, "step": 113100 }, { "epoch": 0.001132, - "grad_norm": 1.5637104511260986, + "grad_norm": 1.7567662000656128, "learning_rate": 1e-05, - "loss": 0.3344, + "loss": 0.3363, "step": 113200 }, { "epoch": 0.001133, - "grad_norm": 1.5974730253219604, + "grad_norm": 1.742226243019104, "learning_rate": 1e-05, - "loss": 0.3319, + "loss": 0.3353, "step": 113300 }, { "epoch": 0.001134, - "grad_norm": 1.614250659942627, + "grad_norm": 1.7816177606582642, "learning_rate": 1e-05, - "loss": 0.3353, + "loss": 0.3363, "step": 113400 }, { "epoch": 0.001135, - "grad_norm": 2.156856060028076, + "grad_norm": 1.9755498170852661, "learning_rate": 1e-05, - "loss": 0.3394, + "loss": 0.3418, "step": 113500 }, { "epoch": 0.001136, - "grad_norm": 1.6673293113708496, + "grad_norm": 1.867906093597412, "learning_rate": 1e-05, - "loss": 0.3311, + "loss": 0.3334, "step": 113600 }, { "epoch": 0.001137, - "grad_norm": 1.6201618909835815, + "grad_norm": 1.6979436874389648, "learning_rate": 1e-05, - "loss": 0.3228, + "loss": 0.3229, "step": 113700 }, { "epoch": 0.001138, - "grad_norm": 1.708338975906372, + "grad_norm": 1.7003270387649536, "learning_rate": 1e-05, - "loss": 0.3391, + "loss": 0.339, "step": 113800 }, { "epoch": 0.001139, - "grad_norm": 1.5306422710418701, + "grad_norm": 1.7175750732421875, "learning_rate": 1e-05, - "loss": 0.3256, + "loss": 0.328, "step": 113900 }, { "epoch": 0.00114, - "grad_norm": 1.8795247077941895, + "grad_norm": 1.8313535451889038, "learning_rate": 1e-05, - "loss": 0.3318, + "loss": 0.334, "step": 114000 }, { "epoch": 0.001141, - "grad_norm": 1.6934237480163574, + "grad_norm": 1.863562822341919, "learning_rate": 1e-05, - "loss": 0.3315, + "loss": 0.3331, "step": 114100 }, { "epoch": 0.001142, - "grad_norm": 1.4109803438186646, + "grad_norm": 1.5223082304000854, "learning_rate": 1e-05, - "loss": 0.3318, + "loss": 0.3327, "step": 114200 }, { "epoch": 0.001143, - "grad_norm": 1.7249387502670288, + "grad_norm": 1.6789870262145996, "learning_rate": 1e-05, - "loss": 0.3281, + "loss": 0.3313, "step": 114300 }, { "epoch": 0.001144, - "grad_norm": 1.558007001876831, + "grad_norm": 1.6574594974517822, "learning_rate": 1e-05, - "loss": 0.3343, + "loss": 0.3376, "step": 114400 }, { "epoch": 0.001145, - "grad_norm": 1.6334872245788574, + "grad_norm": 1.8169411420822144, "learning_rate": 1e-05, - "loss": 0.3382, + "loss": 0.3392, "step": 114500 }, { "epoch": 0.001146, - "grad_norm": 1.7595210075378418, + "grad_norm": 2.384134292602539, "learning_rate": 1e-05, - "loss": 0.3386, + "loss": 0.3421, "step": 114600 }, { "epoch": 0.001147, - "grad_norm": 1.6955580711364746, + "grad_norm": 1.8304411172866821, "learning_rate": 1e-05, - "loss": 0.3326, + "loss": 0.3356, "step": 114700 }, { "epoch": 0.001148, - "grad_norm": 2.2905585765838623, + "grad_norm": 1.5321639776229858, "learning_rate": 1e-05, - "loss": 0.3312, + "loss": 0.3317, "step": 114800 }, { "epoch": 0.001149, - "grad_norm": 1.529779076576233, + "grad_norm": 1.7843445539474487, "learning_rate": 1e-05, - "loss": 0.3218, + "loss": 0.3267, "step": 114900 }, { "epoch": 0.00115, - "grad_norm": 1.6238619089126587, + "grad_norm": 1.8861100673675537, "learning_rate": 1e-05, - "loss": 0.3307, + "loss": 0.3318, "step": 115000 }, { "epoch": 0.001151, - "grad_norm": 1.8231860399246216, + "grad_norm": 1.8112998008728027, "learning_rate": 1e-05, - "loss": 0.3272, + "loss": 0.3293, "step": 115100 }, { "epoch": 0.001152, - "grad_norm": 1.7048276662826538, + "grad_norm": 1.7408936023712158, "learning_rate": 1e-05, - "loss": 0.3354, + "loss": 0.339, "step": 115200 }, { "epoch": 0.001153, - "grad_norm": 1.5172830820083618, + "grad_norm": 1.5955983400344849, "learning_rate": 1e-05, - "loss": 0.3293, + "loss": 0.3326, "step": 115300 }, { "epoch": 0.001154, - "grad_norm": 1.9103567600250244, + "grad_norm": 1.6836644411087036, "learning_rate": 1e-05, - "loss": 0.3342, + "loss": 0.3377, "step": 115400 }, { "epoch": 0.001155, - "grad_norm": 1.350592851638794, + "grad_norm": 1.7743850946426392, "learning_rate": 1e-05, - "loss": 0.3277, + "loss": 0.3278, "step": 115500 }, { "epoch": 0.001156, - "grad_norm": 1.8583245277404785, + "grad_norm": 1.5382933616638184, "learning_rate": 1e-05, - "loss": 0.3367, + "loss": 0.3397, "step": 115600 }, { "epoch": 0.001157, - "grad_norm": 1.5466270446777344, + "grad_norm": 1.7360892295837402, "learning_rate": 1e-05, - "loss": 0.3185, + "loss": 0.3207, "step": 115700 }, { "epoch": 0.001158, - "grad_norm": 1.8161695003509521, + "grad_norm": 1.7574350833892822, "learning_rate": 1e-05, - "loss": 0.3415, + "loss": 0.3439, "step": 115800 }, { "epoch": 0.001159, - "grad_norm": 1.6409080028533936, + "grad_norm": 1.8633227348327637, "learning_rate": 1e-05, - "loss": 0.332, + "loss": 0.335, "step": 115900 }, { "epoch": 0.00116, - "grad_norm": 1.6916916370391846, + "grad_norm": 1.4621351957321167, "learning_rate": 1e-05, - "loss": 0.3339, + "loss": 0.3358, "step": 116000 }, { "epoch": 0.001161, - "grad_norm": 1.6149839162826538, + "grad_norm": 1.9157224893569946, "learning_rate": 1e-05, - "loss": 0.3389, + "loss": 0.3406, "step": 116100 }, { "epoch": 0.001162, - "grad_norm": 1.573641300201416, + "grad_norm": 1.6284751892089844, "learning_rate": 1e-05, - "loss": 0.3234, + "loss": 0.3277, "step": 116200 }, { "epoch": 0.001163, - "grad_norm": 1.8769241571426392, + "grad_norm": 2.2173221111297607, "learning_rate": 1e-05, - "loss": 0.3226, + "loss": 0.3259, "step": 116300 }, { "epoch": 0.001164, - "grad_norm": 1.6541870832443237, + "grad_norm": 1.8805922269821167, "learning_rate": 1e-05, - "loss": 0.3274, + "loss": 0.3304, "step": 116400 }, { "epoch": 0.001165, - "grad_norm": 1.4887248277664185, + "grad_norm": 1.5072230100631714, "learning_rate": 1e-05, - "loss": 0.3311, + "loss": 0.3329, "step": 116500 }, { "epoch": 0.001166, - "grad_norm": 1.6876665353775024, + "grad_norm": 1.7337315082550049, "learning_rate": 1e-05, - "loss": 0.3324, + "loss": 0.3358, "step": 116600 }, { "epoch": 0.001167, - "grad_norm": 1.7973747253417969, + "grad_norm": 1.8346338272094727, "learning_rate": 1e-05, - "loss": 0.3354, + "loss": 0.335, "step": 116700 }, { "epoch": 0.001168, - "grad_norm": 1.8499912023544312, + "grad_norm": 2.003572940826416, "learning_rate": 1e-05, - "loss": 0.3312, + "loss": 0.333, "step": 116800 }, { "epoch": 0.001169, - "grad_norm": 1.5359294414520264, + "grad_norm": 1.6946192979812622, "learning_rate": 1e-05, - "loss": 0.3262, + "loss": 0.3274, "step": 116900 }, { "epoch": 0.00117, - "grad_norm": 1.666365623474121, + "grad_norm": 1.7123721837997437, "learning_rate": 1e-05, - "loss": 0.3304, + "loss": 0.3346, "step": 117000 }, { "epoch": 0.001171, - "grad_norm": 1.9105374813079834, + "grad_norm": 1.8998627662658691, "learning_rate": 1e-05, - "loss": 0.3324, + "loss": 0.3355, "step": 117100 }, { "epoch": 0.001172, - "grad_norm": 1.6057831048965454, + "grad_norm": 1.5401489734649658, "learning_rate": 1e-05, - "loss": 0.3332, + "loss": 0.3365, "step": 117200 }, { "epoch": 0.001173, - "grad_norm": 1.7070896625518799, + "grad_norm": 1.7201097011566162, "learning_rate": 1e-05, - "loss": 0.3361, + "loss": 0.3382, "step": 117300 }, { "epoch": 0.001174, - "grad_norm": 1.746102213859558, + "grad_norm": 1.8772022724151611, "learning_rate": 1e-05, - "loss": 0.3309, + "loss": 0.3323, "step": 117400 }, { "epoch": 0.001175, - "grad_norm": 1.7227739095687866, + "grad_norm": 1.5749614238739014, "learning_rate": 1e-05, - "loss": 0.3201, + "loss": 0.3247, "step": 117500 }, { "epoch": 0.001176, - "grad_norm": 1.5749499797821045, + "grad_norm": 1.753891944885254, "learning_rate": 1e-05, - "loss": 0.324, + "loss": 0.325, "step": 117600 }, { "epoch": 0.001177, - "grad_norm": 1.6398141384124756, + "grad_norm": 1.6012095212936401, "learning_rate": 1e-05, - "loss": 0.3305, + "loss": 0.3353, "step": 117700 }, { "epoch": 0.001178, - "grad_norm": 1.6476861238479614, + "grad_norm": 1.8432629108428955, "learning_rate": 1e-05, - "loss": 0.3338, + "loss": 0.3342, "step": 117800 }, { "epoch": 0.001179, - "grad_norm": 1.5112991333007812, + "grad_norm": 1.583196997642517, "learning_rate": 1e-05, - "loss": 0.3282, + "loss": 0.33, "step": 117900 }, { "epoch": 0.00118, - "grad_norm": 1.8406567573547363, + "grad_norm": 1.9006246328353882, "learning_rate": 1e-05, - "loss": 0.3252, + "loss": 0.3282, "step": 118000 }, { "epoch": 0.001181, - "grad_norm": 1.658325433731079, + "grad_norm": 1.6398696899414062, "learning_rate": 1e-05, - "loss": 0.3295, + "loss": 0.3326, "step": 118100 }, { "epoch": 0.001182, - "grad_norm": 1.7115981578826904, + "grad_norm": 1.7854382991790771, "learning_rate": 1e-05, - "loss": 0.3241, + "loss": 0.3273, "step": 118200 }, { "epoch": 0.001183, - "grad_norm": 1.7053550481796265, + "grad_norm": 2.0176942348480225, "learning_rate": 1e-05, - "loss": 0.3298, + "loss": 0.3296, "step": 118300 }, { "epoch": 0.001184, - "grad_norm": 1.5642719268798828, + "grad_norm": 2.0091938972473145, "learning_rate": 1e-05, - "loss": 0.3263, + "loss": 0.3291, "step": 118400 }, { "epoch": 0.001185, - "grad_norm": 1.6774392127990723, + "grad_norm": 1.906575322151184, "learning_rate": 1e-05, - "loss": 0.3312, + "loss": 0.3354, "step": 118500 }, { "epoch": 0.001186, - "grad_norm": 1.7588557004928589, + "grad_norm": 1.7136719226837158, "learning_rate": 1e-05, - "loss": 0.3296, + "loss": 0.3314, "step": 118600 }, { "epoch": 0.001187, - "grad_norm": 1.8824719190597534, + "grad_norm": 1.7901870012283325, "learning_rate": 1e-05, - "loss": 0.3219, + "loss": 0.3236, "step": 118700 }, { "epoch": 0.001188, - "grad_norm": 1.7733078002929688, + "grad_norm": 1.7387175559997559, "learning_rate": 1e-05, - "loss": 0.3218, + "loss": 0.3227, "step": 118800 }, { "epoch": 0.001189, - "grad_norm": 1.8235989809036255, + "grad_norm": 1.7231628894805908, "learning_rate": 1e-05, - "loss": 0.3229, + "loss": 0.324, "step": 118900 }, { "epoch": 0.00119, - "grad_norm": 1.5493932962417603, + "grad_norm": 1.516570806503296, "learning_rate": 1e-05, - "loss": 0.3274, + "loss": 0.3332, "step": 119000 }, { "epoch": 0.001191, - "grad_norm": 1.5555813312530518, + "grad_norm": 1.7026876211166382, "learning_rate": 1e-05, - "loss": 0.3216, + "loss": 0.3272, "step": 119100 }, { "epoch": 0.001192, - "grad_norm": 1.3635920286178589, + "grad_norm": 1.3457015752792358, "learning_rate": 1e-05, - "loss": 0.3316, + "loss": 0.3306, "step": 119200 }, { "epoch": 0.001193, - "grad_norm": 1.768019199371338, + "grad_norm": 1.9337682723999023, "learning_rate": 1e-05, - "loss": 0.3185, + "loss": 0.3237, "step": 119300 }, { "epoch": 0.001194, - "grad_norm": 1.8128565549850464, + "grad_norm": 1.6353681087493896, "learning_rate": 1e-05, - "loss": 0.3277, + "loss": 0.3287, "step": 119400 }, { "epoch": 0.001195, - "grad_norm": 1.7543752193450928, + "grad_norm": 1.875755786895752, "learning_rate": 1e-05, - "loss": 0.3227, + "loss": 0.3245, "step": 119500 }, { "epoch": 0.001196, - "grad_norm": 1.6978943347930908, + "grad_norm": 2.4236490726470947, "learning_rate": 1e-05, - "loss": 0.3257, + "loss": 0.3274, "step": 119600 }, { "epoch": 0.001197, - "grad_norm": 1.5249478816986084, + "grad_norm": 1.7631841897964478, "learning_rate": 1e-05, - "loss": 0.3217, + "loss": 0.3237, "step": 119700 }, { "epoch": 0.001198, - "grad_norm": 1.6230567693710327, + "grad_norm": 1.536399245262146, "learning_rate": 1e-05, - "loss": 0.3259, + "loss": 0.3285, "step": 119800 }, { "epoch": 0.001199, - "grad_norm": 1.8395847082138062, + "grad_norm": 1.7705007791519165, "learning_rate": 1e-05, "loss": 0.3291, "step": 119900 }, { "epoch": 0.0012, - "grad_norm": 1.7216225862503052, + "grad_norm": 1.8081552982330322, "learning_rate": 1e-05, - "loss": 0.325, + "loss": 0.3251, "step": 120000 }, { "epoch": 0.0012, - "eval_loss": 0.300537109375, - "eval_runtime": 102.3348, - "eval_samples_per_second": 488.592, - "eval_steps_per_second": 30.537, + "eval_loss": 0.301513671875, + "eval_runtime": 114.8107, + "eval_samples_per_second": 435.499, + "eval_steps_per_second": 27.219, "step": 120000 }, { "epoch": 0.001201, - "grad_norm": 1.6509754657745361, + "grad_norm": 1.8171324729919434, "learning_rate": 1e-05, - "loss": 0.3289, + "loss": 0.3342, "step": 120100 }, { "epoch": 0.001202, - "grad_norm": 1.7181107997894287, + "grad_norm": 1.6765378713607788, "learning_rate": 1e-05, - "loss": 0.3274, + "loss": 0.3276, "step": 120200 }, { "epoch": 0.001203, - "grad_norm": 1.7411495447158813, + "grad_norm": 1.8498486280441284, "learning_rate": 1e-05, - "loss": 0.3267, + "loss": 0.3314, "step": 120300 }, { "epoch": 0.001204, - "grad_norm": 1.537964105606079, + "grad_norm": 1.536418080329895, "learning_rate": 1e-05, - "loss": 0.3218, + "loss": 0.3237, "step": 120400 }, { "epoch": 0.001205, - "grad_norm": 1.5406032800674438, + "grad_norm": 1.5730416774749756, "learning_rate": 1e-05, - "loss": 0.3234, + "loss": 0.3263, "step": 120500 }, { "epoch": 0.001206, - "grad_norm": 1.8979731798171997, + "grad_norm": 1.7513419389724731, "learning_rate": 1e-05, - "loss": 0.3324, + "loss": 0.3355, "step": 120600 }, { "epoch": 0.001207, - "grad_norm": 1.5506062507629395, + "grad_norm": 1.7094500064849854, "learning_rate": 1e-05, - "loss": 0.3176, + "loss": 0.3243, "step": 120700 }, { "epoch": 0.001208, - "grad_norm": 1.5046718120574951, + "grad_norm": 1.8792259693145752, "learning_rate": 1e-05, - "loss": 0.3278, + "loss": 0.3283, "step": 120800 }, { "epoch": 0.001209, - "grad_norm": 1.5129989385604858, + "grad_norm": 1.7763640880584717, "learning_rate": 1e-05, - "loss": 0.3305, + "loss": 0.3325, "step": 120900 }, { "epoch": 0.00121, - "grad_norm": 1.5582902431488037, + "grad_norm": 1.8400176763534546, "learning_rate": 1e-05, - "loss": 0.3354, + "loss": 0.3342, "step": 121000 }, { "epoch": 0.001211, - "grad_norm": 1.9122488498687744, + "grad_norm": 1.9107251167297363, "learning_rate": 1e-05, - "loss": 0.3278, + "loss": 0.3314, "step": 121100 }, { "epoch": 0.001212, - "grad_norm": 1.3912389278411865, + "grad_norm": 1.4877129793167114, "learning_rate": 1e-05, - "loss": 0.326, + "loss": 0.3317, "step": 121200 }, { "epoch": 0.001213, - "grad_norm": 1.48383367061615, + "grad_norm": 1.6370038986206055, "learning_rate": 1e-05, - "loss": 0.325, + "loss": 0.3253, "step": 121300 }, { "epoch": 0.001214, - "grad_norm": 1.6192889213562012, + "grad_norm": 1.9504472017288208, "learning_rate": 1e-05, - "loss": 0.3223, + "loss": 0.3242, "step": 121400 }, { "epoch": 0.001215, - "grad_norm": 1.563012957572937, + "grad_norm": 2.077836036682129, "learning_rate": 1e-05, - "loss": 0.3263, + "loss": 0.3302, "step": 121500 }, { "epoch": 0.001216, - "grad_norm": 1.4866485595703125, + "grad_norm": 1.4920196533203125, "learning_rate": 1e-05, - "loss": 0.3259, + "loss": 0.332, "step": 121600 }, { "epoch": 0.001217, - "grad_norm": 1.6754919290542603, + "grad_norm": 1.7362091541290283, "learning_rate": 1e-05, - "loss": 0.3294, + "loss": 0.3313, "step": 121700 }, { "epoch": 0.001218, - "grad_norm": 1.5653374195098877, + "grad_norm": 1.662297248840332, "learning_rate": 1e-05, - "loss": 0.3266, + "loss": 0.3287, "step": 121800 }, { "epoch": 0.001219, - "grad_norm": 1.4365466833114624, + "grad_norm": 1.6544877290725708, "learning_rate": 1e-05, - "loss": 0.3227, + "loss": 0.3266, "step": 121900 }, { "epoch": 0.00122, - "grad_norm": 1.4021244049072266, + "grad_norm": 1.5775099992752075, "learning_rate": 1e-05, - "loss": 0.3248, + "loss": 0.3263, "step": 122000 }, { "epoch": 0.001221, - "grad_norm": 1.9654580354690552, + "grad_norm": 1.704094648361206, "learning_rate": 1e-05, - "loss": 0.3228, + "loss": 0.325, "step": 122100 }, { "epoch": 0.001222, - "grad_norm": 1.5937784910202026, + "grad_norm": 1.9005615711212158, "learning_rate": 1e-05, - "loss": 0.3235, + "loss": 0.3268, "step": 122200 }, { "epoch": 0.001223, - "grad_norm": 1.4813224077224731, + "grad_norm": 2.027251958847046, "learning_rate": 1e-05, - "loss": 0.3244, + "loss": 0.3266, "step": 122300 }, { "epoch": 0.001224, - "grad_norm": 1.7086066007614136, + "grad_norm": 1.8168870210647583, "learning_rate": 1e-05, - "loss": 0.328, + "loss": 0.3298, "step": 122400 }, { "epoch": 0.001225, - "grad_norm": 1.5664122104644775, + "grad_norm": 1.709088921546936, "learning_rate": 1e-05, - "loss": 0.3202, + "loss": 0.322, "step": 122500 }, { "epoch": 0.001226, - "grad_norm": 1.779064416885376, + "grad_norm": 1.7680180072784424, "learning_rate": 1e-05, - "loss": 0.334, + "loss": 0.3362, "step": 122600 }, { "epoch": 0.001227, - "grad_norm": 2.2756104469299316, + "grad_norm": 1.681280493736267, "learning_rate": 1e-05, - "loss": 0.3208, + "loss": 0.3236, "step": 122700 }, { "epoch": 0.001228, - "grad_norm": 1.7406623363494873, + "grad_norm": 1.7112354040145874, "learning_rate": 1e-05, - "loss": 0.3278, + "loss": 0.3308, "step": 122800 }, { "epoch": 0.001229, - "grad_norm": 1.4303823709487915, + "grad_norm": 1.623887062072754, "learning_rate": 1e-05, - "loss": 0.3232, + "loss": 0.3255, "step": 122900 }, { "epoch": 0.00123, - "grad_norm": 1.7534031867980957, + "grad_norm": 1.880348801612854, "learning_rate": 1e-05, - "loss": 0.3332, + "loss": 0.3348, "step": 123000 }, { "epoch": 0.001231, - "grad_norm": 1.6112945079803467, + "grad_norm": 1.8015272617340088, "learning_rate": 1e-05, - "loss": 0.325, + "loss": 0.3263, "step": 123100 }, { "epoch": 0.001232, - "grad_norm": 1.6410695314407349, + "grad_norm": 1.794119954109192, "learning_rate": 1e-05, - "loss": 0.3166, + "loss": 0.3215, "step": 123200 }, { "epoch": 0.001233, - "grad_norm": 1.5222517251968384, + "grad_norm": 1.3672672510147095, "learning_rate": 1e-05, - "loss": 0.3211, + "loss": 0.3246, "step": 123300 }, { "epoch": 0.001234, - "grad_norm": 1.4771156311035156, + "grad_norm": 1.702120304107666, "learning_rate": 1e-05, - "loss": 0.3261, + "loss": 0.3277, "step": 123400 }, { "epoch": 0.001235, - "grad_norm": 1.702616572380066, + "grad_norm": 1.6856110095977783, "learning_rate": 1e-05, - "loss": 0.3264, + "loss": 0.3308, "step": 123500 }, { "epoch": 0.001236, - "grad_norm": 1.3545397520065308, + "grad_norm": 1.3940743207931519, "learning_rate": 1e-05, - "loss": 0.3212, + "loss": 0.3224, "step": 123600 }, { "epoch": 0.001237, - "grad_norm": 1.345359444618225, + "grad_norm": 1.4997862577438354, "learning_rate": 1e-05, - "loss": 0.3187, + "loss": 0.3209, "step": 123700 }, { "epoch": 0.001238, - "grad_norm": 1.73200261592865, + "grad_norm": 1.6719286441802979, "learning_rate": 1e-05, - "loss": 0.323, + "loss": 0.3287, "step": 123800 }, { "epoch": 0.001239, - "grad_norm": 1.5590332746505737, + "grad_norm": 1.4933640956878662, "learning_rate": 1e-05, - "loss": 0.3249, + "loss": 0.3275, "step": 123900 }, { "epoch": 0.00124, - "grad_norm": 1.4319485425949097, + "grad_norm": 1.6647841930389404, "learning_rate": 1e-05, - "loss": 0.3167, + "loss": 0.3217, "step": 124000 }, { "epoch": 0.001241, - "grad_norm": 1.8128015995025635, + "grad_norm": 1.656747817993164, "learning_rate": 1e-05, - "loss": 0.316, + "loss": 0.3185, "step": 124100 }, { "epoch": 0.001242, - "grad_norm": 1.7296788692474365, + "grad_norm": 1.7526649236679077, "learning_rate": 1e-05, - "loss": 0.3217, + "loss": 0.3238, "step": 124200 }, { "epoch": 0.001243, - "grad_norm": 1.2617566585540771, + "grad_norm": 1.5294679403305054, "learning_rate": 1e-05, - "loss": 0.3246, + "loss": 0.3282, "step": 124300 }, { "epoch": 0.001244, - "grad_norm": 1.6063637733459473, + "grad_norm": 1.5839802026748657, "learning_rate": 1e-05, - "loss": 0.3162, + "loss": 0.3191, "step": 124400 }, { "epoch": 0.001245, - "grad_norm": 1.6809676885604858, + "grad_norm": 2.2223639488220215, "learning_rate": 1e-05, - "loss": 0.3242, + "loss": 0.3296, "step": 124500 }, { "epoch": 0.001246, - "grad_norm": 1.6716912984848022, + "grad_norm": 1.7450740337371826, "learning_rate": 1e-05, - "loss": 0.3272, + "loss": 0.3275, "step": 124600 }, { "epoch": 0.001247, - "grad_norm": 1.5568071603775024, + "grad_norm": 1.591633677482605, "learning_rate": 1e-05, - "loss": 0.3179, + "loss": 0.3152, "step": 124700 }, { "epoch": 0.001248, - "grad_norm": 1.7201974391937256, + "grad_norm": 1.8334294557571411, "learning_rate": 1e-05, - "loss": 0.3225, + "loss": 0.325, "step": 124800 }, { "epoch": 0.001249, - "grad_norm": 1.6703535318374634, + "grad_norm": 1.8498133420944214, "learning_rate": 1e-05, - "loss": 0.3225, + "loss": 0.3267, "step": 124900 }, { "epoch": 0.00125, - "grad_norm": 1.849350094795227, + "grad_norm": 1.8751991987228394, "learning_rate": 1e-05, - "loss": 0.3293, + "loss": 0.329, "step": 125000 }, { "epoch": 0.001251, - "grad_norm": 1.7884984016418457, + "grad_norm": 1.7075512409210205, "learning_rate": 1e-05, - "loss": 0.3216, + "loss": 0.3233, "step": 125100 }, { "epoch": 0.001252, - "grad_norm": 1.578443169593811, + "grad_norm": 1.7127974033355713, "learning_rate": 1e-05, - "loss": 0.3246, + "loss": 0.3256, "step": 125200 }, { "epoch": 0.001253, - "grad_norm": 1.5520350933074951, + "grad_norm": 1.6013959646224976, "learning_rate": 1e-05, - "loss": 0.3267, + "loss": 0.3294, "step": 125300 }, { "epoch": 0.001254, - "grad_norm": 1.6242430210113525, + "grad_norm": 1.616365671157837, "learning_rate": 1e-05, - "loss": 0.323, + "loss": 0.3256, "step": 125400 }, { "epoch": 0.001255, - "grad_norm": 1.485471487045288, + "grad_norm": 1.9536734819412231, "learning_rate": 1e-05, - "loss": 0.325, + "loss": 0.3268, "step": 125500 }, { "epoch": 0.001256, - "grad_norm": 1.6397300958633423, + "grad_norm": 1.725591778755188, "learning_rate": 1e-05, - "loss": 0.3284, + "loss": 0.329, "step": 125600 }, { "epoch": 0.001257, - "grad_norm": 1.4572651386260986, + "grad_norm": 1.4925453662872314, "learning_rate": 1e-05, - "loss": 0.3272, + "loss": 0.3284, "step": 125700 }, { "epoch": 0.001258, - "grad_norm": 1.7062714099884033, + "grad_norm": 1.718002438545227, "learning_rate": 1e-05, - "loss": 0.324, + "loss": 0.3269, "step": 125800 }, { "epoch": 0.001259, - "grad_norm": 1.5527808666229248, + "grad_norm": 1.6296608448028564, "learning_rate": 1e-05, - "loss": 0.3146, + "loss": 0.3187, "step": 125900 }, { "epoch": 0.00126, - "grad_norm": 1.5469679832458496, + "grad_norm": 1.584074854850769, "learning_rate": 1e-05, - "loss": 0.3208, + "loss": 0.3239, "step": 126000 }, { "epoch": 0.001261, - "grad_norm": 1.6673294305801392, + "grad_norm": 1.7755796909332275, "learning_rate": 1e-05, - "loss": 0.3239, + "loss": 0.3231, "step": 126100 }, { "epoch": 0.001262, - "grad_norm": 1.4047091007232666, + "grad_norm": 1.5260752439498901, "learning_rate": 1e-05, - "loss": 0.3225, + "loss": 0.3227, "step": 126200 }, { "epoch": 0.001263, - "grad_norm": 1.6836607456207275, + "grad_norm": 1.7237966060638428, "learning_rate": 1e-05, - "loss": 0.3271, + "loss": 0.3281, "step": 126300 }, { "epoch": 0.001264, - "grad_norm": 1.6571979522705078, + "grad_norm": 1.6392747163772583, "learning_rate": 1e-05, - "loss": 0.3189, + "loss": 0.3211, "step": 126400 }, { "epoch": 0.001265, - "grad_norm": 1.491667628288269, + "grad_norm": 1.8122193813323975, "learning_rate": 1e-05, - "loss": 0.3242, + "loss": 0.3264, "step": 126500 }, { "epoch": 0.001266, - "grad_norm": 1.8864578008651733, + "grad_norm": 1.8250197172164917, "learning_rate": 1e-05, - "loss": 0.3158, + "loss": 0.32, "step": 126600 }, { "epoch": 0.001267, - "grad_norm": 1.692036747932434, + "grad_norm": 1.7046698331832886, "learning_rate": 1e-05, - "loss": 0.3231, + "loss": 0.3254, "step": 126700 }, { "epoch": 0.001268, - "grad_norm": 1.924976110458374, + "grad_norm": 1.74703049659729, "learning_rate": 1e-05, - "loss": 0.3216, + "loss": 0.3239, "step": 126800 }, { "epoch": 0.001269, - "grad_norm": 1.5480746030807495, + "grad_norm": 1.683023452758789, "learning_rate": 1e-05, - "loss": 0.3294, + "loss": 0.3318, "step": 126900 }, { "epoch": 0.00127, - "grad_norm": 1.9113504886627197, + "grad_norm": 1.9313528537750244, "learning_rate": 1e-05, - "loss": 0.3159, + "loss": 0.3203, "step": 127000 }, { "epoch": 0.001271, - "grad_norm": 1.4901503324508667, + "grad_norm": 1.3307342529296875, "learning_rate": 1e-05, - "loss": 0.3292, + "loss": 0.327, "step": 127100 }, { "epoch": 0.001272, - "grad_norm": 1.6837888956069946, + "grad_norm": 1.817285418510437, "learning_rate": 1e-05, - "loss": 0.3315, + "loss": 0.3352, "step": 127200 }, { "epoch": 0.001273, - "grad_norm": 1.7318413257598877, + "grad_norm": 1.7280235290527344, "learning_rate": 1e-05, - "loss": 0.3194, + "loss": 0.3207, "step": 127300 }, { "epoch": 0.001274, - "grad_norm": 1.5804219245910645, + "grad_norm": 1.7646243572235107, "learning_rate": 1e-05, - "loss": 0.3269, + "loss": 0.3303, "step": 127400 }, { "epoch": 0.001275, - "grad_norm": 1.669191598892212, + "grad_norm": 1.6660072803497314, "learning_rate": 1e-05, - "loss": 0.3228, + "loss": 0.3262, "step": 127500 }, { "epoch": 0.001276, - "grad_norm": 1.8091787099838257, + "grad_norm": 1.8076339960098267, "learning_rate": 1e-05, - "loss": 0.3186, + "loss": 0.3195, "step": 127600 }, { "epoch": 0.001277, - "grad_norm": 1.578346848487854, + "grad_norm": 1.6329010725021362, "learning_rate": 1e-05, - "loss": 0.3132, + "loss": 0.3166, "step": 127700 }, { "epoch": 0.001278, - "grad_norm": 2.2615110874176025, + "grad_norm": 1.8912221193313599, "learning_rate": 1e-05, - "loss": 0.324, + "loss": 0.3253, "step": 127800 }, { "epoch": 0.001279, - "grad_norm": 1.5542341470718384, + "grad_norm": 1.6600183248519897, "learning_rate": 1e-05, - "loss": 0.3291, + "loss": 0.3328, "step": 127900 }, { "epoch": 0.00128, - "grad_norm": 1.830991506576538, + "grad_norm": 2.2682251930236816, "learning_rate": 1e-05, - "loss": 0.3189, + "loss": 0.322, "step": 128000 }, { "epoch": 0.001281, - "grad_norm": 1.693804144859314, + "grad_norm": 1.9191845655441284, "learning_rate": 1e-05, - "loss": 0.318, + "loss": 0.3236, "step": 128100 }, { "epoch": 0.001282, - "grad_norm": 1.8038767576217651, + "grad_norm": 1.8286381959915161, "learning_rate": 1e-05, - "loss": 0.3136, + "loss": 0.3161, "step": 128200 }, { "epoch": 0.001283, - "grad_norm": 1.5050418376922607, + "grad_norm": 1.5892844200134277, "learning_rate": 1e-05, - "loss": 0.3158, + "loss": 0.3176, "step": 128300 }, { "epoch": 0.001284, - "grad_norm": 1.599582314491272, + "grad_norm": 1.5269801616668701, "learning_rate": 1e-05, - "loss": 0.3208, + "loss": 0.3213, "step": 128400 }, { "epoch": 0.001285, - "grad_norm": 1.8382525444030762, + "grad_norm": 1.9540048837661743, "learning_rate": 1e-05, - "loss": 0.3255, + "loss": 0.3254, "step": 128500 }, { "epoch": 0.001286, - "grad_norm": 1.6649285554885864, + "grad_norm": 1.669771671295166, "learning_rate": 1e-05, - "loss": 0.3222, + "loss": 0.3218, "step": 128600 }, { "epoch": 0.001287, - "grad_norm": 1.6677497625350952, + "grad_norm": 1.8254411220550537, "learning_rate": 1e-05, - "loss": 0.3221, + "loss": 0.3241, "step": 128700 }, { "epoch": 0.001288, - "grad_norm": 1.6319321393966675, + "grad_norm": 1.9790143966674805, "learning_rate": 1e-05, - "loss": 0.3223, + "loss": 0.323, "step": 128800 }, { "epoch": 0.001289, - "grad_norm": 1.7767342329025269, + "grad_norm": 1.7570897340774536, "learning_rate": 1e-05, - "loss": 0.3224, + "loss": 0.3233, "step": 128900 }, { "epoch": 0.00129, - "grad_norm": 1.820730447769165, + "grad_norm": 1.662227988243103, "learning_rate": 1e-05, - "loss": 0.3226, + "loss": 0.3232, "step": 129000 }, { "epoch": 0.001291, - "grad_norm": 3.3329765796661377, + "grad_norm": 1.6427834033966064, "learning_rate": 1e-05, - "loss": 0.3167, + "loss": 0.3163, "step": 129100 }, { "epoch": 0.001292, - "grad_norm": 1.8179858922958374, + "grad_norm": 1.74818754196167, "learning_rate": 1e-05, - "loss": 0.3192, + "loss": 0.3221, "step": 129200 }, { "epoch": 0.001293, - "grad_norm": 1.422192931175232, + "grad_norm": 1.4715895652770996, "learning_rate": 1e-05, - "loss": 0.3183, + "loss": 0.3199, "step": 129300 }, { "epoch": 0.001294, - "grad_norm": 1.7148090600967407, + "grad_norm": 1.5854099988937378, "learning_rate": 1e-05, - "loss": 0.3257, + "loss": 0.3276, "step": 129400 }, { "epoch": 0.001295, - "grad_norm": 1.6796051263809204, + "grad_norm": 1.6624189615249634, "learning_rate": 1e-05, - "loss": 0.3211, + "loss": 0.3206, "step": 129500 }, { "epoch": 0.001296, - "grad_norm": 1.4217095375061035, + "grad_norm": 1.5426676273345947, "learning_rate": 1e-05, - "loss": 0.3179, + "loss": 0.3215, "step": 129600 }, { "epoch": 0.001297, - "grad_norm": 1.5488269329071045, + "grad_norm": 1.7038050889968872, "learning_rate": 1e-05, - "loss": 0.3217, + "loss": 0.3225, "step": 129700 }, { "epoch": 0.001298, - "grad_norm": 1.4994601011276245, + "grad_norm": 1.4283004999160767, "learning_rate": 1e-05, - "loss": 0.3191, + "loss": 0.3216, "step": 129800 }, { "epoch": 0.001299, - "grad_norm": 1.3932366371154785, + "grad_norm": 1.5062975883483887, "learning_rate": 1e-05, - "loss": 0.3188, + "loss": 0.3221, "step": 129900 }, { "epoch": 0.0013, - "grad_norm": 1.8380815982818604, + "grad_norm": 1.7706925868988037, "learning_rate": 1e-05, - "loss": 0.3189, + "loss": 0.3205, "step": 130000 }, { "epoch": 0.001301, - "grad_norm": 1.6254947185516357, + "grad_norm": 1.6793439388275146, "learning_rate": 1e-05, - "loss": 0.3187, + "loss": 0.3186, "step": 130100 }, { "epoch": 0.001302, - "grad_norm": 1.6264735460281372, + "grad_norm": 1.5680670738220215, "learning_rate": 1e-05, - "loss": 0.318, + "loss": 0.3203, "step": 130200 }, { "epoch": 0.001303, - "grad_norm": 1.7539392709732056, + "grad_norm": 1.8509889841079712, "learning_rate": 1e-05, - "loss": 0.3265, + "loss": 0.3293, "step": 130300 }, { "epoch": 0.001304, - "grad_norm": 1.5360561609268188, + "grad_norm": 1.6747349500656128, "learning_rate": 1e-05, - "loss": 0.3187, + "loss": 0.3208, "step": 130400 }, { "epoch": 0.001305, - "grad_norm": 1.602621078491211, + "grad_norm": 1.7840492725372314, "learning_rate": 1e-05, - "loss": 0.3129, + "loss": 0.3176, "step": 130500 }, { "epoch": 0.001306, - "grad_norm": 1.461145281791687, + "grad_norm": 1.4201829433441162, "learning_rate": 1e-05, - "loss": 0.318, + "loss": 0.3198, "step": 130600 }, { "epoch": 0.001307, - "grad_norm": 1.6204843521118164, + "grad_norm": 1.5987930297851562, "learning_rate": 1e-05, - "loss": 0.328, + "loss": 0.3276, "step": 130700 }, { "epoch": 0.001308, - "grad_norm": 1.6122827529907227, + "grad_norm": 1.5990506410598755, "learning_rate": 1e-05, - "loss": 0.3169, + "loss": 0.3216, "step": 130800 }, { "epoch": 0.001309, - "grad_norm": 1.7438867092132568, + "grad_norm": 1.563931941986084, "learning_rate": 1e-05, - "loss": 0.319, + "loss": 0.3202, "step": 130900 }, { "epoch": 0.00131, - "grad_norm": 1.9274895191192627, + "grad_norm": 1.9966181516647339, "learning_rate": 1e-05, - "loss": 0.3151, + "loss": 0.3173, "step": 131000 }, { "epoch": 0.001311, - "grad_norm": 1.6077162027359009, + "grad_norm": 1.6617968082427979, "learning_rate": 1e-05, - "loss": 0.3224, + "loss": 0.3241, "step": 131100 }, { "epoch": 0.001312, - "grad_norm": 1.7657335996627808, + "grad_norm": 1.903935432434082, "learning_rate": 1e-05, - "loss": 0.3175, + "loss": 0.3215, "step": 131200 }, { "epoch": 0.001313, - "grad_norm": 1.7522004842758179, + "grad_norm": 1.9104382991790771, "learning_rate": 1e-05, - "loss": 0.3158, + "loss": 0.3183, "step": 131300 }, { "epoch": 0.001314, - "grad_norm": 1.6238682270050049, + "grad_norm": 1.5082734823226929, "learning_rate": 1e-05, - "loss": 0.3246, + "loss": 0.3251, "step": 131400 }, { "epoch": 0.001315, - "grad_norm": 1.5966116189956665, + "grad_norm": 1.869626522064209, "learning_rate": 1e-05, - "loss": 0.3184, + "loss": 0.3206, "step": 131500 }, { "epoch": 0.001316, - "grad_norm": 1.610478401184082, + "grad_norm": 1.8665653467178345, "learning_rate": 1e-05, - "loss": 0.3187, + "loss": 0.3178, "step": 131600 }, { "epoch": 0.001317, - "grad_norm": 1.5870428085327148, + "grad_norm": 1.52765691280365, "learning_rate": 1e-05, - "loss": 0.3192, + "loss": 0.321, "step": 131700 }, { "epoch": 0.001318, - "grad_norm": 1.4560139179229736, + "grad_norm": 1.4063327312469482, "learning_rate": 1e-05, - "loss": 0.3126, + "loss": 0.3158, "step": 131800 }, { "epoch": 0.001319, - "grad_norm": 1.5527640581130981, + "grad_norm": 1.8856468200683594, "learning_rate": 1e-05, - "loss": 0.3125, + "loss": 0.3144, "step": 131900 }, { "epoch": 0.00132, - "grad_norm": 1.7717013359069824, + "grad_norm": 1.7623271942138672, "learning_rate": 1e-05, - "loss": 0.3218, + "loss": 0.3224, "step": 132000 }, { "epoch": 0.001321, - "grad_norm": 1.4609984159469604, + "grad_norm": 1.6237845420837402, "learning_rate": 1e-05, - "loss": 0.3171, + "loss": 0.3207, "step": 132100 }, { "epoch": 0.001322, - "grad_norm": 1.4167782068252563, + "grad_norm": 1.55039644241333, "learning_rate": 1e-05, - "loss": 0.3172, + "loss": 0.3188, "step": 132200 }, { "epoch": 0.001323, - "grad_norm": 1.542642593383789, + "grad_norm": 1.6823863983154297, "learning_rate": 1e-05, - "loss": 0.3182, + "loss": 0.3204, "step": 132300 }, { "epoch": 0.001324, - "grad_norm": 1.7482138872146606, + "grad_norm": 1.6407947540283203, "learning_rate": 1e-05, - "loss": 0.3117, + "loss": 0.3129, "step": 132400 }, { "epoch": 0.001325, - "grad_norm": 1.4038406610488892, + "grad_norm": 1.6221039295196533, "learning_rate": 1e-05, - "loss": 0.3157, + "loss": 0.3184, "step": 132500 }, { "epoch": 0.001326, - "grad_norm": 1.6383379697799683, + "grad_norm": 1.6350358724594116, "learning_rate": 1e-05, - "loss": 0.3117, + "loss": 0.312, "step": 132600 }, { "epoch": 0.001327, - "grad_norm": 1.8213622570037842, + "grad_norm": 1.5436922311782837, "learning_rate": 1e-05, - "loss": 0.3183, + "loss": 0.3194, "step": 132700 }, { "epoch": 0.001328, - "grad_norm": 1.617414951324463, + "grad_norm": 1.4995625019073486, "learning_rate": 1e-05, - "loss": 0.3159, + "loss": 0.3202, "step": 132800 }, { "epoch": 0.001329, - "grad_norm": 1.5281394720077515, + "grad_norm": 1.4857275485992432, "learning_rate": 1e-05, - "loss": 0.3261, + "loss": 0.3263, "step": 132900 }, { "epoch": 0.00133, - "grad_norm": 1.4511768817901611, + "grad_norm": 1.6003996133804321, "learning_rate": 1e-05, - "loss": 0.3163, + "loss": 0.3185, "step": 133000 }, { "epoch": 0.001331, - "grad_norm": 1.4975697994232178, + "grad_norm": 1.6696702241897583, "learning_rate": 1e-05, - "loss": 0.316, + "loss": 0.3217, "step": 133100 }, { "epoch": 0.001332, - "grad_norm": 1.3760722875595093, + "grad_norm": 1.5747042894363403, "learning_rate": 1e-05, - "loss": 0.3185, + "loss": 0.3193, "step": 133200 }, { "epoch": 0.001333, - "grad_norm": 1.5633317232131958, + "grad_norm": 1.5583536624908447, "learning_rate": 1e-05, - "loss": 0.3192, + "loss": 0.3216, "step": 133300 }, { "epoch": 0.001334, - "grad_norm": 1.732689619064331, + "grad_norm": 1.7547342777252197, "learning_rate": 1e-05, - "loss": 0.3133, + "loss": 0.3156, "step": 133400 }, { "epoch": 0.001335, - "grad_norm": 2.506681203842163, + "grad_norm": 1.4350770711898804, "learning_rate": 1e-05, - "loss": 0.3066, + "loss": 0.3104, "step": 133500 }, { "epoch": 0.001336, - "grad_norm": 1.5448195934295654, + "grad_norm": 1.7420467138290405, "learning_rate": 1e-05, - "loss": 0.3249, + "loss": 0.3268, "step": 133600 }, { "epoch": 0.001337, - "grad_norm": 1.821373701095581, + "grad_norm": 1.7873330116271973, "learning_rate": 1e-05, - "loss": 0.3184, + "loss": 0.3225, "step": 133700 }, { "epoch": 0.001338, - "grad_norm": 1.8194351196289062, + "grad_norm": 1.7741400003433228, "learning_rate": 1e-05, - "loss": 0.3179, + "loss": 0.322, "step": 133800 }, { "epoch": 0.001339, - "grad_norm": 1.7590010166168213, + "grad_norm": 1.6877092123031616, "learning_rate": 1e-05, - "loss": 0.3166, + "loss": 0.3209, "step": 133900 }, { "epoch": 0.00134, - "grad_norm": 1.7983680963516235, + "grad_norm": 1.7802987098693848, "learning_rate": 1e-05, - "loss": 0.3192, + "loss": 0.3206, "step": 134000 }, { "epoch": 0.001341, - "grad_norm": 1.6013023853302002, + "grad_norm": 1.8283404111862183, "learning_rate": 1e-05, - "loss": 0.3262, + "loss": 0.3256, "step": 134100 }, { "epoch": 0.001342, - "grad_norm": 1.5687752962112427, + "grad_norm": 1.5722459554672241, "learning_rate": 1e-05, - "loss": 0.313, + "loss": 0.3163, "step": 134200 }, { "epoch": 0.001343, - "grad_norm": 1.532816767692566, + "grad_norm": 1.7147879600524902, "learning_rate": 1e-05, - "loss": 0.315, + "loss": 0.3161, "step": 134300 }, { "epoch": 0.001344, - "grad_norm": 1.6382378339767456, + "grad_norm": 1.8033101558685303, "learning_rate": 1e-05, - "loss": 0.313, + "loss": 0.3146, "step": 134400 }, { "epoch": 0.001345, - "grad_norm": 1.6533745527267456, + "grad_norm": 1.5288691520690918, "learning_rate": 1e-05, - "loss": 0.3221, + "loss": 0.3225, "step": 134500 }, { "epoch": 0.001346, - "grad_norm": 1.4666370153427124, + "grad_norm": 1.6482738256454468, "learning_rate": 1e-05, - "loss": 0.3127, + "loss": 0.3136, "step": 134600 }, { "epoch": 0.001347, - "grad_norm": 2.7927446365356445, + "grad_norm": 1.5121197700500488, "learning_rate": 1e-05, - "loss": 0.3178, + "loss": 0.3229, "step": 134700 }, { "epoch": 0.001348, - "grad_norm": 1.6600843667984009, + "grad_norm": 1.602310061454773, "learning_rate": 1e-05, - "loss": 0.3149, + "loss": 0.3183, "step": 134800 }, { "epoch": 0.001349, - "grad_norm": 1.6370255947113037, + "grad_norm": 1.596355676651001, "learning_rate": 1e-05, - "loss": 0.3079, + "loss": 0.3099, "step": 134900 }, { "epoch": 0.00135, - "grad_norm": 1.9031014442443848, + "grad_norm": 2.1672589778900146, "learning_rate": 1e-05, - "loss": 0.3154, + "loss": 0.3183, "step": 135000 }, { "epoch": 0.001351, - "grad_norm": 1.435365915298462, + "grad_norm": 1.529840350151062, "learning_rate": 1e-05, - "loss": 0.3188, + "loss": 0.322, "step": 135100 }, { "epoch": 0.001352, - "grad_norm": 1.4787670373916626, + "grad_norm": 1.48147714138031, "learning_rate": 1e-05, - "loss": 0.3157, + "loss": 0.3178, "step": 135200 }, { "epoch": 0.001353, - "grad_norm": 1.5059831142425537, + "grad_norm": 1.488888144493103, "learning_rate": 1e-05, - "loss": 0.3072, + "loss": 0.3121, "step": 135300 }, { "epoch": 0.001354, - "grad_norm": 1.4885427951812744, + "grad_norm": 1.3818961381912231, "learning_rate": 1e-05, - "loss": 0.3207, + "loss": 0.325, "step": 135400 }, { "epoch": 0.001355, - "grad_norm": 1.5241745710372925, + "grad_norm": 1.6328446865081787, "learning_rate": 1e-05, - "loss": 0.3163, + "loss": 0.3189, "step": 135500 }, { "epoch": 0.001356, - "grad_norm": 1.739099383354187, + "grad_norm": 1.7009713649749756, "learning_rate": 1e-05, - "loss": 0.3158, + "loss": 0.3188, "step": 135600 }, { "epoch": 0.001357, - "grad_norm": 1.6188503503799438, + "grad_norm": 1.5784467458724976, "learning_rate": 1e-05, - "loss": 0.3196, + "loss": 0.3221, "step": 135700 }, { "epoch": 0.001358, - "grad_norm": 2.0876073837280273, + "grad_norm": 1.5164488554000854, "learning_rate": 1e-05, - "loss": 0.3085, + "loss": 0.3105, "step": 135800 }, { "epoch": 0.001359, - "grad_norm": 1.6344412565231323, + "grad_norm": 1.668848991394043, "learning_rate": 1e-05, - "loss": 0.3058, + "loss": 0.3104, "step": 135900 }, { "epoch": 0.00136, - "grad_norm": 2.1860053539276123, + "grad_norm": 1.978113055229187, "learning_rate": 1e-05, - "loss": 0.3145, + "loss": 0.3151, "step": 136000 }, { "epoch": 0.001361, - "grad_norm": 1.4819742441177368, + "grad_norm": 1.6246618032455444, "learning_rate": 1e-05, - "loss": 0.3144, + "loss": 0.3165, "step": 136100 }, { "epoch": 0.001362, - "grad_norm": 1.5477856397628784, + "grad_norm": 1.5396642684936523, "learning_rate": 1e-05, - "loss": 0.3147, + "loss": 0.3154, "step": 136200 }, { "epoch": 0.001363, - "grad_norm": 1.9053215980529785, + "grad_norm": 1.660476565361023, "learning_rate": 1e-05, - "loss": 0.3133, + "loss": 0.313, "step": 136300 }, { "epoch": 0.001364, - "grad_norm": 1.5794596672058105, + "grad_norm": 1.4768214225769043, "learning_rate": 1e-05, - "loss": 0.3128, + "loss": 0.3171, "step": 136400 }, { "epoch": 0.001365, - "grad_norm": 1.5301233530044556, + "grad_norm": 1.7115814685821533, "learning_rate": 1e-05, - "loss": 0.3126, + "loss": 0.3133, "step": 136500 }, { "epoch": 0.001366, - "grad_norm": 1.657422661781311, + "grad_norm": 1.6259845495224, "learning_rate": 1e-05, - "loss": 0.3136, + "loss": 0.3153, "step": 136600 }, { "epoch": 0.001367, - "grad_norm": 1.5858018398284912, + "grad_norm": 1.5243622064590454, "learning_rate": 1e-05, - "loss": 0.3111, + "loss": 0.3151, "step": 136700 }, { "epoch": 0.001368, - "grad_norm": 1.702895164489746, + "grad_norm": 1.8191895484924316, "learning_rate": 1e-05, - "loss": 0.3193, + "loss": 0.3229, "step": 136800 }, { "epoch": 0.001369, - "grad_norm": 1.5635457038879395, + "grad_norm": 1.5113294124603271, "learning_rate": 1e-05, - "loss": 0.3175, + "loss": 0.317, "step": 136900 }, { "epoch": 0.00137, - "grad_norm": 1.4933804273605347, + "grad_norm": 1.46476149559021, "learning_rate": 1e-05, - "loss": 0.3125, + "loss": 0.3156, "step": 137000 }, { "epoch": 0.001371, - "grad_norm": 1.6892173290252686, + "grad_norm": 1.4856374263763428, "learning_rate": 1e-05, - "loss": 0.308, + "loss": 0.3116, "step": 137100 }, { "epoch": 0.001372, - "grad_norm": 1.3880424499511719, + "grad_norm": 1.6964186429977417, "learning_rate": 1e-05, - "loss": 0.3137, + "loss": 0.3155, "step": 137200 }, { "epoch": 0.001373, - "grad_norm": 1.5656882524490356, + "grad_norm": 1.8333369493484497, "learning_rate": 1e-05, - "loss": 0.31, + "loss": 0.3142, "step": 137300 }, { "epoch": 0.001374, - "grad_norm": 1.5059945583343506, + "grad_norm": 1.6640217304229736, "learning_rate": 1e-05, - "loss": 0.3151, + "loss": 0.3177, "step": 137400 }, { "epoch": 0.001375, - "grad_norm": 1.5968074798583984, + "grad_norm": 1.566697597503662, "learning_rate": 1e-05, - "loss": 0.3109, + "loss": 0.3114, "step": 137500 }, { "epoch": 0.001376, - "grad_norm": 1.9155410528182983, + "grad_norm": 1.9087796211242676, "learning_rate": 1e-05, - "loss": 0.3096, + "loss": 0.3106, "step": 137600 }, { "epoch": 0.001377, - "grad_norm": 1.5058423280715942, + "grad_norm": 1.7321326732635498, "learning_rate": 1e-05, - "loss": 0.3118, + "loss": 0.3171, "step": 137700 }, { "epoch": 0.001378, - "grad_norm": 1.7207130193710327, + "grad_norm": 1.8865094184875488, "learning_rate": 1e-05, - "loss": 0.3137, + "loss": 0.3154, "step": 137800 }, { "epoch": 0.001379, - "grad_norm": 1.6587121486663818, + "grad_norm": 1.7816592454910278, "learning_rate": 1e-05, - "loss": 0.3049, + "loss": 0.3079, "step": 137900 }, { "epoch": 0.00138, - "grad_norm": 1.3595772981643677, + "grad_norm": 1.442036509513855, "learning_rate": 1e-05, - "loss": 0.3082, + "loss": 0.3102, "step": 138000 }, { "epoch": 0.001381, - "grad_norm": 1.447014570236206, + "grad_norm": 1.6105217933654785, "learning_rate": 1e-05, - "loss": 0.3117, + "loss": 0.3145, "step": 138100 }, { "epoch": 0.001382, - "grad_norm": 1.575817346572876, + "grad_norm": 1.597864031791687, "learning_rate": 1e-05, - "loss": 0.3118, + "loss": 0.3116, "step": 138200 }, { "epoch": 0.001383, - "grad_norm": 2.083845376968384, + "grad_norm": 1.888089656829834, "learning_rate": 1e-05, "loss": 0.3215, "step": 138300 }, { "epoch": 0.001384, - "grad_norm": 1.2700806856155396, + "grad_norm": 1.372367262840271, "learning_rate": 1e-05, - "loss": 0.3096, + "loss": 0.3121, "step": 138400 }, { "epoch": 0.001385, - "grad_norm": 1.7027220726013184, + "grad_norm": 1.7237030267715454, "learning_rate": 1e-05, - "loss": 0.3046, + "loss": 0.3066, "step": 138500 }, { "epoch": 0.001386, - "grad_norm": 1.600897192955017, + "grad_norm": 1.8450541496276855, "learning_rate": 1e-05, - "loss": 0.3184, + "loss": 0.3197, "step": 138600 }, { "epoch": 0.001387, - "grad_norm": 1.6428141593933105, + "grad_norm": 1.6042810678482056, "learning_rate": 1e-05, - "loss": 0.3126, + "loss": 0.3143, "step": 138700 }, { "epoch": 0.001388, - "grad_norm": 1.743947148323059, + "grad_norm": 1.914825677871704, "learning_rate": 1e-05, - "loss": 0.3102, + "loss": 0.3109, "step": 138800 }, { "epoch": 0.001389, - "grad_norm": 1.577340841293335, + "grad_norm": 1.810681939125061, "learning_rate": 1e-05, - "loss": 0.3199, + "loss": 0.3207, "step": 138900 }, { "epoch": 0.00139, - "grad_norm": 1.8040262460708618, + "grad_norm": 1.6505619287490845, "learning_rate": 1e-05, - "loss": 0.3113, + "loss": 0.3112, "step": 139000 }, { "epoch": 0.001391, - "grad_norm": 1.514061450958252, + "grad_norm": 1.5920408964157104, "learning_rate": 1e-05, - "loss": 0.312, + "loss": 0.3143, "step": 139100 }, { "epoch": 0.001392, - "grad_norm": 1.6503139734268188, + "grad_norm": 1.603387713432312, "learning_rate": 1e-05, - "loss": 0.3083, + "loss": 0.3121, "step": 139200 }, { "epoch": 0.001393, - "grad_norm": 1.5279432535171509, + "grad_norm": 1.7222926616668701, "learning_rate": 1e-05, - "loss": 0.3139, + "loss": 0.3161, "step": 139300 }, { "epoch": 0.001394, - "grad_norm": 1.5044186115264893, + "grad_norm": 1.6358146667480469, "learning_rate": 1e-05, - "loss": 0.308, + "loss": 0.3083, "step": 139400 }, { "epoch": 0.001395, - "grad_norm": 1.5812429189682007, + "grad_norm": 2.0886547565460205, "learning_rate": 1e-05, - "loss": 0.3189, + "loss": 0.3215, "step": 139500 }, { "epoch": 0.001396, - "grad_norm": 1.5580233335494995, + "grad_norm": 1.68631911277771, "learning_rate": 1e-05, - "loss": 0.3047, + "loss": 0.3082, "step": 139600 }, { "epoch": 0.001397, - "grad_norm": 1.6258715391159058, + "grad_norm": 1.5744168758392334, "learning_rate": 1e-05, - "loss": 0.3142, + "loss": 0.315, "step": 139700 }, { "epoch": 0.001398, - "grad_norm": 1.4624382257461548, + "grad_norm": 1.7238872051239014, "learning_rate": 1e-05, - "loss": 0.3143, + "loss": 0.3151, "step": 139800 }, { "epoch": 0.001399, - "grad_norm": 1.5806007385253906, + "grad_norm": 1.6450138092041016, "learning_rate": 1e-05, - "loss": 0.3098, + "loss": 0.3129, "step": 139900 }, { "epoch": 0.0014, - "grad_norm": 1.6465153694152832, + "grad_norm": 1.4830751419067383, "learning_rate": 1e-05, - "loss": 0.3126, + "loss": 0.3164, "step": 140000 }, { "epoch": 0.0014, - "eval_loss": 0.2880859375, - "eval_runtime": 107.0407, - "eval_samples_per_second": 467.112, - "eval_steps_per_second": 29.195, + "eval_loss": 0.287109375, + "eval_runtime": 109.9314, + "eval_samples_per_second": 454.829, + "eval_steps_per_second": 28.427, "step": 140000 }, { "epoch": 0.001401, - "grad_norm": 1.7238470315933228, + "grad_norm": 1.7286059856414795, "learning_rate": 1e-05, - "loss": 0.3086, + "loss": 0.311, "step": 140100 }, { "epoch": 0.001402, - "grad_norm": 1.563961386680603, + "grad_norm": 1.7702032327651978, "learning_rate": 1e-05, - "loss": 0.3222, + "loss": 0.326, "step": 140200 }, { "epoch": 0.001403, - "grad_norm": 1.5772298574447632, + "grad_norm": 1.6898577213287354, "learning_rate": 1e-05, - "loss": 0.3178, + "loss": 0.3216, "step": 140300 }, { "epoch": 0.001404, - "grad_norm": 1.5530885457992554, + "grad_norm": 1.6577975749969482, "learning_rate": 1e-05, - "loss": 0.3123, + "loss": 0.3145, "step": 140400 }, { "epoch": 0.001405, - "grad_norm": 1.4736943244934082, + "grad_norm": 1.444854736328125, "learning_rate": 1e-05, - "loss": 0.3192, + "loss": 0.3216, "step": 140500 }, { "epoch": 0.001406, - "grad_norm": 1.8380192518234253, + "grad_norm": 1.6251206398010254, "learning_rate": 1e-05, - "loss": 0.3079, + "loss": 0.3108, "step": 140600 }, { "epoch": 0.001407, - "grad_norm": 1.3954215049743652, + "grad_norm": 1.4880505800247192, "learning_rate": 1e-05, - "loss": 0.311, + "loss": 0.3131, "step": 140700 }, { "epoch": 0.001408, - "grad_norm": 1.540252685546875, + "grad_norm": 1.8925511837005615, "learning_rate": 1e-05, - "loss": 0.3088, + "loss": 0.3117, "step": 140800 }, { "epoch": 0.001409, - "grad_norm": 1.701472282409668, + "grad_norm": 1.6990015506744385, "learning_rate": 1e-05, - "loss": 0.3077, + "loss": 0.3101, "step": 140900 }, { "epoch": 0.00141, - "grad_norm": 1.6292940378189087, + "grad_norm": 1.498661756515503, "learning_rate": 1e-05, - "loss": 0.3065, + "loss": 0.3082, "step": 141000 }, { "epoch": 0.001411, - "grad_norm": 1.3741705417633057, + "grad_norm": 1.7527713775634766, "learning_rate": 1e-05, - "loss": 0.3143, + "loss": 0.3178, "step": 141100 }, { "epoch": 0.001412, - "grad_norm": 1.6212708950042725, + "grad_norm": 1.6200438737869263, "learning_rate": 1e-05, - "loss": 0.3217, + "loss": 0.3228, "step": 141200 }, { "epoch": 0.001413, - "grad_norm": 1.409938097000122, + "grad_norm": 1.3735147714614868, "learning_rate": 1e-05, - "loss": 0.3111, + "loss": 0.3124, "step": 141300 }, { "epoch": 0.001414, - "grad_norm": 1.5531816482543945, + "grad_norm": 2.0076656341552734, "learning_rate": 1e-05, - "loss": 0.3103, + "loss": 0.3119, "step": 141400 }, { "epoch": 0.001415, - "grad_norm": 1.4415562152862549, + "grad_norm": 1.5308282375335693, "learning_rate": 1e-05, - "loss": 0.3188, + "loss": 0.321, "step": 141500 }, { "epoch": 0.001416, - "grad_norm": 1.5227936506271362, + "grad_norm": 1.5367777347564697, "learning_rate": 1e-05, - "loss": 0.307, + "loss": 0.3064, "step": 141600 }, { "epoch": 0.001417, - "grad_norm": 1.7079826593399048, + "grad_norm": 1.64597749710083, "learning_rate": 1e-05, - "loss": 0.3109, + "loss": 0.3116, "step": 141700 }, { "epoch": 0.001418, - "grad_norm": 1.5872690677642822, + "grad_norm": 1.976902723312378, "learning_rate": 1e-05, - "loss": 0.3183, + "loss": 0.3187, "step": 141800 }, { "epoch": 0.001419, - "grad_norm": 1.491664171218872, + "grad_norm": 1.8003846406936646, "learning_rate": 1e-05, - "loss": 0.3082, + "loss": 0.3106, "step": 141900 }, { "epoch": 0.00142, - "grad_norm": 1.6363643407821655, + "grad_norm": 2.8026585578918457, "learning_rate": 1e-05, - "loss": 0.3148, + "loss": 0.3186, "step": 142000 }, { "epoch": 0.001421, - "grad_norm": 1.6952561140060425, + "grad_norm": 3.536267042160034, "learning_rate": 1e-05, - "loss": 0.3074, + "loss": 0.3104, "step": 142100 }, { "epoch": 0.001422, - "grad_norm": 1.5672736167907715, + "grad_norm": 1.5805248022079468, "learning_rate": 1e-05, - "loss": 0.311, + "loss": 0.3136, "step": 142200 }, { "epoch": 0.001423, - "grad_norm": 1.5457510948181152, + "grad_norm": 1.5467828512191772, "learning_rate": 1e-05, - "loss": 0.3091, + "loss": 0.3102, "step": 142300 }, { "epoch": 0.001424, - "grad_norm": 1.5032873153686523, + "grad_norm": 1.5788090229034424, "learning_rate": 1e-05, - "loss": 0.3141, + "loss": 0.3153, "step": 142400 }, { "epoch": 0.001425, - "grad_norm": 1.7208070755004883, + "grad_norm": 1.8822991847991943, "learning_rate": 1e-05, - "loss": 0.3092, + "loss": 0.3124, "step": 142500 }, { "epoch": 0.001426, - "grad_norm": 1.697360873222351, + "grad_norm": 1.547636866569519, "learning_rate": 1e-05, - "loss": 0.314, + "loss": 0.3141, "step": 142600 }, { "epoch": 0.001427, - "grad_norm": 1.752466082572937, + "grad_norm": 1.628211498260498, "learning_rate": 1e-05, - "loss": 0.3156, + "loss": 0.3182, "step": 142700 }, { "epoch": 0.001428, - "grad_norm": 1.6433829069137573, + "grad_norm": 1.645572304725647, "learning_rate": 1e-05, - "loss": 0.3104, + "loss": 0.3106, "step": 142800 }, { "epoch": 0.001429, - "grad_norm": 1.6453399658203125, + "grad_norm": 1.5614272356033325, "learning_rate": 1e-05, - "loss": 0.3089, + "loss": 0.3113, "step": 142900 }, { "epoch": 0.00143, - "grad_norm": 1.762367844581604, + "grad_norm": 1.6089304685592651, "learning_rate": 1e-05, - "loss": 0.3109, + "loss": 0.3115, "step": 143000 }, { "epoch": 0.001431, - "grad_norm": 1.509325385093689, + "grad_norm": 1.7061288356781006, "learning_rate": 1e-05, - "loss": 0.307, + "loss": 0.3097, "step": 143100 }, { "epoch": 0.001432, - "grad_norm": 2.4036171436309814, + "grad_norm": 1.629626989364624, "learning_rate": 1e-05, - "loss": 0.3113, + "loss": 0.3127, "step": 143200 }, { "epoch": 0.001433, - "grad_norm": 1.6559257507324219, + "grad_norm": 1.5930266380310059, "learning_rate": 1e-05, - "loss": 0.312, + "loss": 0.3136, "step": 143300 }, { "epoch": 0.001434, - "grad_norm": 1.9893611669540405, + "grad_norm": 1.6816060543060303, "learning_rate": 1e-05, - "loss": 0.3143, + "loss": 0.3171, "step": 143400 }, { "epoch": 0.001435, - "grad_norm": 1.8000917434692383, + "grad_norm": 1.6803059577941895, "learning_rate": 1e-05, - "loss": 0.317, + "loss": 0.3162, "step": 143500 }, { "epoch": 0.001436, - "grad_norm": 1.4374059438705444, + "grad_norm": 1.4301313161849976, "learning_rate": 1e-05, - "loss": 0.298, + "loss": 0.3009, "step": 143600 }, { "epoch": 0.001437, - "grad_norm": 1.405443787574768, + "grad_norm": 1.407421588897705, "learning_rate": 1e-05, - "loss": 0.3063, + "loss": 0.3078, "step": 143700 }, { "epoch": 0.001438, - "grad_norm": 1.523026943206787, + "grad_norm": 1.6475402116775513, "learning_rate": 1e-05, - "loss": 0.3057, + "loss": 0.3088, "step": 143800 }, { "epoch": 0.001439, - "grad_norm": 1.7228341102600098, + "grad_norm": 1.5251747369766235, "learning_rate": 1e-05, "loss": 0.3086, "step": 143900 }, { "epoch": 0.00144, - "grad_norm": 1.7864826917648315, + "grad_norm": 1.5712449550628662, "learning_rate": 1e-05, - "loss": 0.3083, + "loss": 0.3117, "step": 144000 }, { "epoch": 0.001441, - "grad_norm": 1.5745184421539307, + "grad_norm": 1.6450409889221191, "learning_rate": 1e-05, - "loss": 0.3127, + "loss": 0.3131, "step": 144100 }, { "epoch": 0.001442, - "grad_norm": 1.3796170949935913, + "grad_norm": 1.451005458831787, "learning_rate": 1e-05, - "loss": 0.3103, + "loss": 0.3117, "step": 144200 }, { "epoch": 0.001443, - "grad_norm": 1.4458056688308716, + "grad_norm": 1.5392875671386719, "learning_rate": 1e-05, - "loss": 0.3172, + "loss": 0.3186, "step": 144300 }, { "epoch": 0.001444, - "grad_norm": 1.706773042678833, + "grad_norm": 1.8175650835037231, "learning_rate": 1e-05, - "loss": 0.3152, + "loss": 0.3183, "step": 144400 }, { "epoch": 0.001445, - "grad_norm": 1.4717893600463867, + "grad_norm": 1.506216287612915, "learning_rate": 1e-05, - "loss": 0.3149, + "loss": 0.318, "step": 144500 }, { "epoch": 0.001446, - "grad_norm": 1.4462265968322754, + "grad_norm": 1.4329332113265991, "learning_rate": 1e-05, - "loss": 0.31, + "loss": 0.3107, "step": 144600 }, { "epoch": 0.001447, - "grad_norm": 1.4916396141052246, + "grad_norm": 1.6957765817642212, "learning_rate": 1e-05, - "loss": 0.3084, + "loss": 0.309, "step": 144700 }, { "epoch": 0.001448, - "grad_norm": 1.4818063974380493, + "grad_norm": 1.3159312009811401, "learning_rate": 1e-05, - "loss": 0.3095, + "loss": 0.313, "step": 144800 }, { "epoch": 0.001449, - "grad_norm": 1.5169060230255127, + "grad_norm": 1.6114338636398315, "learning_rate": 1e-05, - "loss": 0.3076, + "loss": 0.3096, "step": 144900 }, { "epoch": 0.00145, - "grad_norm": 1.712086796760559, + "grad_norm": 1.8013079166412354, "learning_rate": 1e-05, - "loss": 0.3159, + "loss": 0.3154, "step": 145000 }, { "epoch": 0.001451, - "grad_norm": 1.5643309354782104, + "grad_norm": 1.7098653316497803, "learning_rate": 1e-05, - "loss": 0.3095, + "loss": 0.3091, "step": 145100 }, { "epoch": 0.001452, - "grad_norm": 1.566231608390808, + "grad_norm": 1.5512733459472656, "learning_rate": 1e-05, - "loss": 0.308, + "loss": 0.3109, "step": 145200 }, { "epoch": 0.001453, - "grad_norm": 1.8817647695541382, + "grad_norm": 1.725237488746643, "learning_rate": 1e-05, - "loss": 0.3017, + "loss": 0.3048, "step": 145300 }, { "epoch": 0.001454, - "grad_norm": 1.6510733366012573, + "grad_norm": 4.254234313964844, "learning_rate": 1e-05, - "loss": 0.3112, + "loss": 0.3154, "step": 145400 }, { "epoch": 0.001455, - "grad_norm": 1.8020539283752441, + "grad_norm": 1.7910503149032593, "learning_rate": 1e-05, - "loss": 0.3094, + "loss": 0.3085, "step": 145500 }, { "epoch": 0.001456, - "grad_norm": 1.4022586345672607, + "grad_norm": 1.4521243572235107, "learning_rate": 1e-05, - "loss": 0.3144, + "loss": 0.316, "step": 145600 }, { "epoch": 0.001457, - "grad_norm": 1.6672667264938354, + "grad_norm": 1.7298940420150757, "learning_rate": 1e-05, - "loss": 0.3012, + "loss": 0.3052, "step": 145700 }, { "epoch": 0.001458, - "grad_norm": 1.5812125205993652, + "grad_norm": 1.7451497316360474, "learning_rate": 1e-05, - "loss": 0.3079, + "loss": 0.3111, "step": 145800 }, { "epoch": 0.001459, - "grad_norm": 1.5840048789978027, + "grad_norm": 1.7721543312072754, "learning_rate": 1e-05, - "loss": 0.3112, + "loss": 0.3124, "step": 145900 }, { "epoch": 0.00146, - "grad_norm": 1.7802815437316895, + "grad_norm": 2.228154182434082, "learning_rate": 1e-05, - "loss": 0.3107, + "loss": 0.3114, "step": 146000 }, { "epoch": 0.001461, - "grad_norm": 1.5505414009094238, + "grad_norm": 1.6171804666519165, "learning_rate": 1e-05, - "loss": 0.3006, + "loss": 0.3028, "step": 146100 }, { "epoch": 0.001462, - "grad_norm": 1.6433098316192627, + "grad_norm": 1.8199244737625122, "learning_rate": 1e-05, - "loss": 0.3155, + "loss": 0.3165, "step": 146200 }, { "epoch": 0.001463, - "grad_norm": 1.5074726343154907, + "grad_norm": 1.5894031524658203, "learning_rate": 1e-05, - "loss": 0.3049, + "loss": 0.3069, "step": 146300 }, { "epoch": 0.001464, - "grad_norm": 1.9928641319274902, + "grad_norm": 1.5978094339370728, "learning_rate": 1e-05, - "loss": 0.3083, + "loss": 0.3092, "step": 146400 }, { "epoch": 0.001465, - "grad_norm": 1.479493260383606, + "grad_norm": 1.4421255588531494, "learning_rate": 1e-05, - "loss": 0.3112, + "loss": 0.312, "step": 146500 }, { "epoch": 0.001466, - "grad_norm": 1.4818761348724365, + "grad_norm": 1.6096898317337036, "learning_rate": 1e-05, - "loss": 0.3026, + "loss": 0.3049, "step": 146600 }, { "epoch": 0.001467, - "grad_norm": 1.5526435375213623, + "grad_norm": 1.5457234382629395, "learning_rate": 1e-05, - "loss": 0.3081, + "loss": 0.3096, "step": 146700 }, { "epoch": 0.001468, - "grad_norm": 1.6320880651474, + "grad_norm": 1.6169909238815308, "learning_rate": 1e-05, - "loss": 0.3064, + "loss": 0.3086, "step": 146800 }, { "epoch": 0.001469, - "grad_norm": 1.6767406463623047, + "grad_norm": 1.7878210544586182, "learning_rate": 1e-05, - "loss": 0.3118, + "loss": 0.315, "step": 146900 }, { "epoch": 0.00147, - "grad_norm": 1.614627480506897, + "grad_norm": 6.46168327331543, "learning_rate": 1e-05, - "loss": 0.3032, + "loss": 0.305, "step": 147000 }, { "epoch": 0.001471, - "grad_norm": 1.511580467224121, + "grad_norm": 1.6731548309326172, "learning_rate": 1e-05, - "loss": 0.307, + "loss": 0.3102, "step": 147100 }, { "epoch": 0.001472, - "grad_norm": 1.7579758167266846, + "grad_norm": 1.9476267099380493, "learning_rate": 1e-05, - "loss": 0.3098, + "loss": 0.3122, "step": 147200 }, { "epoch": 0.001473, - "grad_norm": 1.7419276237487793, + "grad_norm": 1.6077353954315186, "learning_rate": 1e-05, - "loss": 0.3087, + "loss": 0.3084, "step": 147300 }, { "epoch": 0.001474, - "grad_norm": 1.5828056335449219, + "grad_norm": 1.7394909858703613, "learning_rate": 1e-05, - "loss": 0.3037, + "loss": 0.3063, "step": 147400 }, { "epoch": 0.001475, - "grad_norm": 1.4185255765914917, + "grad_norm": 1.4782484769821167, "learning_rate": 1e-05, - "loss": 0.31, + "loss": 0.3105, "step": 147500 }, { "epoch": 0.001476, - "grad_norm": 1.7061275243759155, + "grad_norm": 1.6981760263442993, "learning_rate": 1e-05, - "loss": 0.3083, + "loss": 0.3119, "step": 147600 }, { "epoch": 0.001477, - "grad_norm": 1.466453194618225, + "grad_norm": 1.697596549987793, "learning_rate": 1e-05, - "loss": 0.3126, + "loss": 0.3132, "step": 147700 }, { "epoch": 0.001478, - "grad_norm": 1.7051310539245605, + "grad_norm": 1.630706787109375, "learning_rate": 1e-05, - "loss": 0.3113, + "loss": 0.3147, "step": 147800 }, { "epoch": 0.001479, - "grad_norm": 1.5825525522232056, + "grad_norm": 1.685257077217102, "learning_rate": 1e-05, - "loss": 0.3114, + "loss": 0.3096, "step": 147900 }, { "epoch": 0.00148, - "grad_norm": 2.481156349182129, + "grad_norm": 1.8165167570114136, "learning_rate": 1e-05, - "loss": 0.3034, + "loss": 0.307, "step": 148000 }, { "epoch": 0.001481, - "grad_norm": 1.8164572715759277, + "grad_norm": 1.6408799886703491, "learning_rate": 1e-05, - "loss": 0.3069, + "loss": 0.3111, "step": 148100 }, { "epoch": 0.001482, - "grad_norm": 1.5599901676177979, + "grad_norm": 1.6128547191619873, "learning_rate": 1e-05, - "loss": 0.3076, + "loss": 0.3083, "step": 148200 }, { "epoch": 0.001483, - "grad_norm": 1.8044281005859375, + "grad_norm": 1.8511683940887451, "learning_rate": 1e-05, - "loss": 0.2987, + "loss": 0.3014, "step": 148300 }, { "epoch": 0.001484, - "grad_norm": 1.5855646133422852, + "grad_norm": 1.576206922531128, "learning_rate": 1e-05, - "loss": 0.3069, + "loss": 0.309, "step": 148400 }, { "epoch": 0.001485, - "grad_norm": 1.4387843608856201, + "grad_norm": 1.543514370918274, "learning_rate": 1e-05, - "loss": 0.308, + "loss": 0.3093, "step": 148500 }, { "epoch": 0.001486, - "grad_norm": 1.6682666540145874, + "grad_norm": 1.5939360857009888, "learning_rate": 1e-05, - "loss": 0.3077, + "loss": 0.3105, "step": 148600 }, { "epoch": 0.001487, - "grad_norm": 1.525200366973877, + "grad_norm": 1.4022550582885742, "learning_rate": 1e-05, - "loss": 0.3079, + "loss": 0.311, "step": 148700 }, { "epoch": 0.001488, - "grad_norm": 1.8183655738830566, + "grad_norm": 1.8196625709533691, "learning_rate": 1e-05, - "loss": 0.3105, + "loss": 0.3119, "step": 148800 }, { "epoch": 0.001489, - "grad_norm": 1.4334073066711426, + "grad_norm": 1.6308430433273315, "learning_rate": 1e-05, - "loss": 0.3038, + "loss": 0.3042, "step": 148900 }, { "epoch": 0.00149, - "grad_norm": 1.4512112140655518, + "grad_norm": 1.6367475986480713, "learning_rate": 1e-05, - "loss": 0.3088, + "loss": 0.3142, "step": 149000 }, { "epoch": 0.001491, - "grad_norm": 1.455888032913208, + "grad_norm": 1.5516581535339355, "learning_rate": 1e-05, - "loss": 0.2994, + "loss": 0.3069, "step": 149100 }, { "epoch": 0.001492, - "grad_norm": 1.47465181350708, + "grad_norm": 1.524357557296753, "learning_rate": 1e-05, - "loss": 0.3004, + "loss": 0.3031, "step": 149200 }, { "epoch": 0.001493, - "grad_norm": 1.750570297241211, + "grad_norm": 1.6461905241012573, "learning_rate": 1e-05, - "loss": 0.3091, + "loss": 0.3083, "step": 149300 }, { "epoch": 0.001494, - "grad_norm": 1.743314266204834, + "grad_norm": 1.8886070251464844, "learning_rate": 1e-05, - "loss": 0.3117, + "loss": 0.3137, "step": 149400 }, { "epoch": 0.001495, - "grad_norm": 1.512024998664856, + "grad_norm": 1.7399191856384277, "learning_rate": 1e-05, - "loss": 0.3064, + "loss": 0.3098, "step": 149500 }, { "epoch": 0.001496, - "grad_norm": 1.7793420553207397, + "grad_norm": 1.7540628910064697, "learning_rate": 1e-05, - "loss": 0.3143, + "loss": 0.3155, "step": 149600 }, { "epoch": 0.001497, - "grad_norm": 1.6427032947540283, + "grad_norm": 1.6544119119644165, "learning_rate": 1e-05, - "loss": 0.3005, + "loss": 0.3034, "step": 149700 }, { "epoch": 0.001498, - "grad_norm": 1.5276447534561157, + "grad_norm": 1.6985324621200562, "learning_rate": 1e-05, - "loss": 0.2996, + "loss": 0.3041, "step": 149800 }, { "epoch": 0.001499, - "grad_norm": 1.6565724611282349, + "grad_norm": 1.6984387636184692, "learning_rate": 1e-05, - "loss": 0.3035, + "loss": 0.3041, "step": 149900 }, { "epoch": 0.0015, - "grad_norm": 1.651855230331421, + "grad_norm": 1.5459750890731812, "learning_rate": 1e-05, - "loss": 0.3042, + "loss": 0.3055, "step": 150000 }, { "epoch": 0.001501, - "grad_norm": 1.5383728742599487, + "grad_norm": 1.6217613220214844, "learning_rate": 1e-05, - "loss": 0.3068, + "loss": 0.3083, "step": 150100 }, { "epoch": 0.001502, - "grad_norm": 1.4919730424880981, + "grad_norm": 1.6481753587722778, "learning_rate": 1e-05, - "loss": 0.3015, + "loss": 0.3045, "step": 150200 }, { "epoch": 0.001503, - "grad_norm": 1.6426124572753906, + "grad_norm": 1.5565217733383179, "learning_rate": 1e-05, - "loss": 0.3063, + "loss": 0.3099, "step": 150300 }, { "epoch": 0.001504, - "grad_norm": 1.5691157579421997, + "grad_norm": 1.678059458732605, "learning_rate": 1e-05, - "loss": 0.3078, + "loss": 0.3116, "step": 150400 }, { "epoch": 0.001505, - "grad_norm": 1.3877143859863281, + "grad_norm": 1.6894927024841309, "learning_rate": 1e-05, - "loss": 0.3098, + "loss": 0.3101, "step": 150500 }, { "epoch": 0.001506, - "grad_norm": 1.544697880744934, + "grad_norm": 1.5071243047714233, "learning_rate": 1e-05, - "loss": 0.3056, + "loss": 0.3049, "step": 150600 }, { "epoch": 0.001507, - "grad_norm": 1.56227445602417, + "grad_norm": 1.6531084775924683, "learning_rate": 1e-05, - "loss": 0.3093, + "loss": 0.31, "step": 150700 }, { "epoch": 0.001508, - "grad_norm": 1.6413577795028687, + "grad_norm": 1.5029364824295044, "learning_rate": 1e-05, - "loss": 0.2983, + "loss": 0.2991, "step": 150800 }, { "epoch": 0.001509, - "grad_norm": 1.488944411277771, + "grad_norm": 1.687752366065979, "learning_rate": 1e-05, - "loss": 0.3041, + "loss": 0.3079, "step": 150900 }, { "epoch": 0.00151, - "grad_norm": 1.360754132270813, + "grad_norm": 1.485236406326294, "learning_rate": 1e-05, - "loss": 0.2973, + "loss": 0.2977, "step": 151000 }, { "epoch": 0.001511, - "grad_norm": 1.447159767150879, + "grad_norm": 1.5481332540512085, "learning_rate": 1e-05, - "loss": 0.3066, + "loss": 0.3076, "step": 151100 }, { "epoch": 0.001512, - "grad_norm": 1.4485472440719604, + "grad_norm": 1.5191718339920044, "learning_rate": 1e-05, - "loss": 0.3002, + "loss": 0.302, "step": 151200 }, { "epoch": 0.001513, - "grad_norm": 1.4946742057800293, + "grad_norm": 1.6339939832687378, "learning_rate": 1e-05, - "loss": 0.3006, + "loss": 0.3012, "step": 151300 }, { "epoch": 0.001514, - "grad_norm": 1.3341621160507202, + "grad_norm": 1.5735626220703125, "learning_rate": 1e-05, - "loss": 0.3087, + "loss": 0.3124, "step": 151400 }, { "epoch": 0.001515, - "grad_norm": 1.6931672096252441, + "grad_norm": 1.6196351051330566, "learning_rate": 1e-05, - "loss": 0.3109, + "loss": 0.3135, "step": 151500 }, { "epoch": 0.001516, - "grad_norm": 1.3993533849716187, + "grad_norm": 1.6257820129394531, "learning_rate": 1e-05, - "loss": 0.302, + "loss": 0.3034, "step": 151600 }, { "epoch": 0.001517, - "grad_norm": 2.082754373550415, + "grad_norm": 1.5410822629928589, "learning_rate": 1e-05, - "loss": 0.2975, + "loss": 0.3014, "step": 151700 }, { "epoch": 0.001518, - "grad_norm": 1.8456296920776367, + "grad_norm": 1.7772650718688965, "learning_rate": 1e-05, - "loss": 0.301, + "loss": 0.3023, "step": 151800 }, { "epoch": 0.001519, - "grad_norm": 1.6281243562698364, + "grad_norm": 1.683762788772583, "learning_rate": 1e-05, - "loss": 0.2998, + "loss": 0.3003, "step": 151900 }, { "epoch": 0.00152, - "grad_norm": 1.5657627582550049, + "grad_norm": 2.5986571311950684, "learning_rate": 1e-05, - "loss": 0.3058, + "loss": 0.3066, "step": 152000 }, { "epoch": 0.001521, - "grad_norm": 1.808322548866272, + "grad_norm": 1.802271842956543, "learning_rate": 1e-05, - "loss": 0.3062, + "loss": 0.3066, "step": 152100 }, { "epoch": 0.001522, - "grad_norm": 1.6712733507156372, + "grad_norm": 1.8014322519302368, "learning_rate": 1e-05, - "loss": 0.3006, + "loss": 0.3039, "step": 152200 }, { "epoch": 0.001523, - "grad_norm": 1.5771421194076538, + "grad_norm": 3.4917097091674805, "learning_rate": 1e-05, - "loss": 0.3049, + "loss": 0.3077, "step": 152300 }, { "epoch": 0.001524, - "grad_norm": 1.5209197998046875, + "grad_norm": 1.3952269554138184, "learning_rate": 1e-05, - "loss": 0.3051, + "loss": 0.3063, "step": 152400 }, { "epoch": 0.001525, - "grad_norm": 1.3719326257705688, + "grad_norm": 1.4687715768814087, "learning_rate": 1e-05, - "loss": 0.3137, + "loss": 0.3133, "step": 152500 }, { "epoch": 0.001526, - "grad_norm": 1.65715754032135, + "grad_norm": 1.571907877922058, "learning_rate": 1e-05, - "loss": 0.3027, + "loss": 0.3053, "step": 152600 }, { "epoch": 0.001527, - "grad_norm": 1.6018787622451782, + "grad_norm": 1.6538187265396118, "learning_rate": 1e-05, - "loss": 0.2995, + "loss": 0.3025, "step": 152700 }, { "epoch": 0.001528, - "grad_norm": 1.4952369928359985, + "grad_norm": 1.332327961921692, "learning_rate": 1e-05, - "loss": 0.2996, + "loss": 0.3036, "step": 152800 }, { "epoch": 0.001529, - "grad_norm": 1.6246006488800049, + "grad_norm": 1.9874423742294312, "learning_rate": 1e-05, - "loss": 0.3028, + "loss": 0.304, "step": 152900 }, { "epoch": 0.00153, - "grad_norm": 1.531864881515503, + "grad_norm": 1.5692553520202637, "learning_rate": 1e-05, - "loss": 0.3037, + "loss": 0.3054, "step": 153000 }, { "epoch": 0.001531, - "grad_norm": 1.6094797849655151, + "grad_norm": 1.6490308046340942, "learning_rate": 1e-05, - "loss": 0.3072, + "loss": 0.3094, "step": 153100 }, { "epoch": 0.001532, - "grad_norm": 1.3827775716781616, + "grad_norm": 1.4249401092529297, "learning_rate": 1e-05, - "loss": 0.3041, + "loss": 0.3078, "step": 153200 }, { "epoch": 0.001533, - "grad_norm": 1.483332872390747, + "grad_norm": 1.432947039604187, "learning_rate": 1e-05, - "loss": 0.2964, + "loss": 0.3029, "step": 153300 }, { "epoch": 0.001534, - "grad_norm": 1.5488700866699219, + "grad_norm": 1.6225773096084595, "learning_rate": 1e-05, - "loss": 0.2978, + "loss": 0.2992, "step": 153400 }, { "epoch": 0.001535, - "grad_norm": 1.6395751237869263, + "grad_norm": 1.6100537776947021, "learning_rate": 1e-05, - "loss": 0.3051, + "loss": 0.3072, "step": 153500 }, { "epoch": 0.001536, - "grad_norm": 1.7893060445785522, + "grad_norm": 1.6616079807281494, "learning_rate": 1e-05, - "loss": 0.3035, + "loss": 0.3066, "step": 153600 }, { "epoch": 0.001537, - "grad_norm": 1.6110421419143677, + "grad_norm": 1.8805843591690063, "learning_rate": 1e-05, - "loss": 0.2974, + "loss": 0.3005, "step": 153700 }, { "epoch": 0.001538, - "grad_norm": 1.3510857820510864, + "grad_norm": 1.3726553916931152, "learning_rate": 1e-05, - "loss": 0.2992, + "loss": 0.3055, "step": 153800 }, { "epoch": 0.001539, - "grad_norm": 1.7193515300750732, + "grad_norm": 1.6736137866973877, "learning_rate": 1e-05, - "loss": 0.3045, + "loss": 0.3053, "step": 153900 }, { "epoch": 0.00154, - "grad_norm": 1.5728325843811035, + "grad_norm": 1.704338550567627, "learning_rate": 1e-05, - "loss": 0.3078, + "loss": 0.3085, "step": 154000 }, { "epoch": 0.001541, - "grad_norm": 1.3395737409591675, + "grad_norm": 1.5360641479492188, "learning_rate": 1e-05, - "loss": 0.3034, + "loss": 0.3051, "step": 154100 }, { "epoch": 0.001542, - "grad_norm": 1.6116013526916504, + "grad_norm": 1.5851362943649292, "learning_rate": 1e-05, - "loss": 0.3066, + "loss": 0.3098, "step": 154200 }, { "epoch": 0.001543, - "grad_norm": 1.5507358312606812, + "grad_norm": 1.5866998434066772, "learning_rate": 1e-05, - "loss": 0.3048, + "loss": 0.3059, "step": 154300 }, { "epoch": 0.001544, - "grad_norm": 1.3512567281723022, + "grad_norm": 1.4236769676208496, "learning_rate": 1e-05, - "loss": 0.3022, + "loss": 0.3052, "step": 154400 }, { "epoch": 0.001545, - "grad_norm": 1.489431619644165, + "grad_norm": 1.5114357471466064, "learning_rate": 1e-05, - "loss": 0.2981, + "loss": 0.3013, "step": 154500 }, { "epoch": 0.001546, - "grad_norm": 1.6759475469589233, + "grad_norm": 1.5853383541107178, "learning_rate": 1e-05, - "loss": 0.3002, + "loss": 0.3032, "step": 154600 }, { "epoch": 0.001547, - "grad_norm": 1.5494149923324585, + "grad_norm": 1.7222450971603394, "learning_rate": 1e-05, - "loss": 0.2971, + "loss": 0.3004, "step": 154700 }, { "epoch": 0.001548, - "grad_norm": 1.6345161199569702, + "grad_norm": 1.832231879234314, "learning_rate": 1e-05, - "loss": 0.302, + "loss": 0.3022, "step": 154800 }, { "epoch": 0.001549, - "grad_norm": 1.4096956253051758, + "grad_norm": 1.5499014854431152, "learning_rate": 1e-05, - "loss": 0.3039, + "loss": 0.307, "step": 154900 }, { "epoch": 0.00155, - "grad_norm": 1.5778037309646606, + "grad_norm": 1.6969635486602783, "learning_rate": 1e-05, - "loss": 0.3012, + "loss": 0.306, "step": 155000 }, { "epoch": 0.001551, - "grad_norm": 1.4208520650863647, + "grad_norm": 1.4039770364761353, "learning_rate": 1e-05, - "loss": 0.2944, + "loss": 0.2953, "step": 155100 }, { "epoch": 0.001552, - "grad_norm": 1.546958327293396, + "grad_norm": 1.4944506883621216, "learning_rate": 1e-05, - "loss": 0.305, + "loss": 0.3068, "step": 155200 }, { "epoch": 0.001553, - "grad_norm": 1.6199829578399658, + "grad_norm": 1.6612184047698975, "learning_rate": 1e-05, - "loss": 0.2999, + "loss": 0.3011, "step": 155300 }, { "epoch": 0.001554, - "grad_norm": 1.4075100421905518, + "grad_norm": 1.3576972484588623, "learning_rate": 1e-05, - "loss": 0.2968, + "loss": 0.2988, "step": 155400 }, { "epoch": 0.001555, - "grad_norm": 1.5693169832229614, + "grad_norm": 1.514603853225708, "learning_rate": 1e-05, - "loss": 0.2981, + "loss": 0.3004, "step": 155500 }, { "epoch": 0.001556, - "grad_norm": 1.648750901222229, + "grad_norm": 1.7504587173461914, "learning_rate": 1e-05, - "loss": 0.3027, + "loss": 0.3031, "step": 155600 }, { "epoch": 0.001557, - "grad_norm": 1.6390204429626465, + "grad_norm": 1.723211407661438, "learning_rate": 1e-05, - "loss": 0.3012, + "loss": 0.3057, "step": 155700 }, { "epoch": 0.001558, - "grad_norm": 1.4099714756011963, + "grad_norm": 1.395255446434021, "learning_rate": 1e-05, - "loss": 0.3054, + "loss": 0.3081, "step": 155800 }, { "epoch": 0.001559, - "grad_norm": 1.7042121887207031, + "grad_norm": 1.8021862506866455, "learning_rate": 1e-05, - "loss": 0.2985, + "loss": 0.3039, "step": 155900 }, { "epoch": 0.00156, - "grad_norm": 1.7378901243209839, + "grad_norm": 1.6634401082992554, "learning_rate": 1e-05, - "loss": 0.3029, + "loss": 0.3064, "step": 156000 }, { "epoch": 0.001561, - "grad_norm": 1.5757272243499756, + "grad_norm": 1.6071007251739502, "learning_rate": 1e-05, - "loss": 0.3017, + "loss": 0.3046, "step": 156100 }, { "epoch": 0.001562, - "grad_norm": 1.4470566511154175, + "grad_norm": 1.7638030052185059, "learning_rate": 1e-05, - "loss": 0.3045, + "loss": 0.3073, "step": 156200 }, { "epoch": 0.001563, - "grad_norm": 1.6149996519088745, + "grad_norm": 1.6403735876083374, "learning_rate": 1e-05, - "loss": 0.3001, + "loss": 0.304, "step": 156300 }, { "epoch": 0.001564, - "grad_norm": 1.612799882888794, + "grad_norm": 1.8615624904632568, "learning_rate": 1e-05, - "loss": 0.2925, + "loss": 0.2961, "step": 156400 }, { "epoch": 0.001565, - "grad_norm": 1.1957837343215942, + "grad_norm": 1.501093864440918, "learning_rate": 1e-05, - "loss": 0.2978, + "loss": 0.2993, "step": 156500 }, { "epoch": 0.001566, - "grad_norm": 1.4723354578018188, + "grad_norm": 1.7402315139770508, "learning_rate": 1e-05, - "loss": 0.3017, + "loss": 0.3033, "step": 156600 }, { "epoch": 0.001567, - "grad_norm": 1.4041662216186523, + "grad_norm": 1.4010441303253174, "learning_rate": 1e-05, - "loss": 0.304, + "loss": 0.3079, "step": 156700 }, { "epoch": 0.001568, - "grad_norm": 1.907631516456604, + "grad_norm": 1.894376516342163, "learning_rate": 1e-05, - "loss": 0.2974, + "loss": 0.3006, "step": 156800 }, { "epoch": 0.001569, - "grad_norm": 1.296934962272644, + "grad_norm": 1.381251335144043, "learning_rate": 1e-05, - "loss": 0.3031, + "loss": 0.3053, "step": 156900 }, { "epoch": 0.00157, - "grad_norm": 1.785561203956604, + "grad_norm": 1.8080320358276367, "learning_rate": 1e-05, - "loss": 0.2996, + "loss": 0.2998, "step": 157000 }, { "epoch": 0.001571, - "grad_norm": 1.4075443744659424, + "grad_norm": 1.4762428998947144, "learning_rate": 1e-05, - "loss": 0.3006, + "loss": 0.304, "step": 157100 }, { "epoch": 0.001572, - "grad_norm": 1.395680546760559, + "grad_norm": 1.4068925380706787, "learning_rate": 1e-05, - "loss": 0.3053, + "loss": 0.3069, "step": 157200 }, { "epoch": 0.001573, - "grad_norm": 1.5104478597640991, + "grad_norm": 1.5252565145492554, "learning_rate": 1e-05, - "loss": 0.292, + "loss": 0.2932, "step": 157300 }, { "epoch": 0.001574, - "grad_norm": 1.6522486209869385, + "grad_norm": 1.5361262559890747, "learning_rate": 1e-05, - "loss": 0.303, + "loss": 0.3026, "step": 157400 }, { "epoch": 0.001575, - "grad_norm": 1.631731629371643, + "grad_norm": 1.4667400121688843, "learning_rate": 1e-05, - "loss": 0.2983, + "loss": 0.2973, "step": 157500 }, { "epoch": 0.001576, - "grad_norm": 1.327804446220398, + "grad_norm": 1.3710594177246094, "learning_rate": 1e-05, - "loss": 0.2931, + "loss": 0.2958, "step": 157600 }, { "epoch": 0.001577, - "grad_norm": 1.589500069618225, + "grad_norm": 1.4320799112319946, "learning_rate": 1e-05, - "loss": 0.3043, + "loss": 0.3053, "step": 157700 }, { "epoch": 0.001578, - "grad_norm": 1.6291720867156982, + "grad_norm": 1.7390029430389404, "learning_rate": 1e-05, - "loss": 0.3029, + "loss": 0.3042, "step": 157800 }, { "epoch": 0.001579, - "grad_norm": 1.6552425622940063, + "grad_norm": 1.7083619832992554, "learning_rate": 1e-05, - "loss": 0.3039, + "loss": 0.3051, "step": 157900 }, { "epoch": 0.00158, - "grad_norm": 2.408303737640381, + "grad_norm": 1.5926868915557861, "learning_rate": 1e-05, - "loss": 0.3067, + "loss": 0.3092, "step": 158000 }, { "epoch": 0.001581, - "grad_norm": 1.8357452154159546, + "grad_norm": 1.8559361696243286, "learning_rate": 1e-05, - "loss": 0.3028, + "loss": 0.3046, "step": 158100 }, { "epoch": 0.001582, - "grad_norm": 1.6411058902740479, + "grad_norm": 1.7098585367202759, "learning_rate": 1e-05, - "loss": 0.297, + "loss": 0.3016, "step": 158200 }, { "epoch": 0.001583, - "grad_norm": 1.6585663557052612, + "grad_norm": 1.5086554288864136, "learning_rate": 1e-05, - "loss": 0.3088, + "loss": 0.3099, "step": 158300 }, { "epoch": 0.001584, - "grad_norm": 1.5179107189178467, + "grad_norm": 1.5402973890304565, "learning_rate": 1e-05, - "loss": 0.3044, + "loss": 0.3047, "step": 158400 }, { "epoch": 0.001585, - "grad_norm": 1.8210595846176147, + "grad_norm": 1.8630613088607788, "learning_rate": 1e-05, - "loss": 0.2982, + "loss": 0.2992, "step": 158500 }, { "epoch": 0.001586, - "grad_norm": 1.4478996992111206, + "grad_norm": 1.4310539960861206, "learning_rate": 1e-05, - "loss": 0.2997, + "loss": 0.3038, "step": 158600 }, { "epoch": 0.001587, - "grad_norm": 1.6085536479949951, + "grad_norm": 1.7185118198394775, "learning_rate": 1e-05, - "loss": 0.2894, + "loss": 0.2946, "step": 158700 }, { "epoch": 0.001588, - "grad_norm": 1.6204121112823486, + "grad_norm": 1.6692184209823608, "learning_rate": 1e-05, - "loss": 0.3006, + "loss": 0.3042, "step": 158800 }, { "epoch": 0.001589, - "grad_norm": 1.4800257682800293, + "grad_norm": 1.501453161239624, "learning_rate": 1e-05, - "loss": 0.2935, + "loss": 0.2964, "step": 158900 }, { "epoch": 0.00159, - "grad_norm": 1.6100873947143555, + "grad_norm": 1.6817126274108887, "learning_rate": 1e-05, - "loss": 0.2996, + "loss": 0.3005, "step": 159000 }, { "epoch": 0.001591, - "grad_norm": 1.4301503896713257, + "grad_norm": 1.4650346040725708, "learning_rate": 1e-05, - "loss": 0.3001, + "loss": 0.3043, "step": 159100 }, { "epoch": 0.001592, - "grad_norm": 1.743107557296753, + "grad_norm": 1.7585411071777344, "learning_rate": 1e-05, - "loss": 0.2973, + "loss": 0.2997, "step": 159200 }, { "epoch": 0.001593, - "grad_norm": 1.3405005931854248, + "grad_norm": 1.440165638923645, "learning_rate": 1e-05, - "loss": 0.3058, + "loss": 0.3077, "step": 159300 }, { "epoch": 0.001594, - "grad_norm": 1.5504013299942017, + "grad_norm": 1.6657960414886475, "learning_rate": 1e-05, - "loss": 0.2961, + "loss": 0.301, "step": 159400 }, { "epoch": 0.001595, - "grad_norm": 1.4421133995056152, + "grad_norm": 1.4905738830566406, "learning_rate": 1e-05, - "loss": 0.2987, + "loss": 0.3025, "step": 159500 }, { "epoch": 0.001596, - "grad_norm": 1.598412275314331, + "grad_norm": 1.744041085243225, "learning_rate": 1e-05, - "loss": 0.2994, + "loss": 0.3015, "step": 159600 }, { "epoch": 0.001597, - "grad_norm": 1.4896574020385742, + "grad_norm": 1.3322395086288452, "learning_rate": 1e-05, - "loss": 0.2948, + "loss": 0.2977, "step": 159700 }, { "epoch": 0.001598, - "grad_norm": 1.4277527332305908, + "grad_norm": 1.577319622039795, "learning_rate": 1e-05, - "loss": 0.294, + "loss": 0.2962, "step": 159800 }, { "epoch": 0.001599, - "grad_norm": 1.3650788068771362, + "grad_norm": 1.4697024822235107, "learning_rate": 1e-05, - "loss": 0.2985, + "loss": 0.2978, "step": 159900 }, { "epoch": 0.0016, - "grad_norm": 1.4294025897979736, + "grad_norm": 1.4797179698944092, "learning_rate": 1e-05, - "loss": 0.2958, + "loss": 0.2991, "step": 160000 }, { "epoch": 0.0016, - "eval_loss": 0.275390625, - "eval_runtime": 102.2926, - "eval_samples_per_second": 488.794, - "eval_steps_per_second": 30.55, + "eval_loss": 0.2783203125, + "eval_runtime": 115.6285, + "eval_samples_per_second": 432.419, + "eval_steps_per_second": 27.026, "step": 160000 }, { "epoch": 0.001601, - "grad_norm": 1.4310728311538696, + "grad_norm": 1.489996075630188, "learning_rate": 1e-05, - "loss": 0.3031, + "loss": 0.3065, "step": 160100 }, { "epoch": 0.001602, - "grad_norm": 1.299068570137024, + "grad_norm": 1.6529942750930786, "learning_rate": 1e-05, - "loss": 0.2946, + "loss": 0.2961, "step": 160200 }, { "epoch": 0.001603, - "grad_norm": 1.4256137609481812, + "grad_norm": 1.6032297611236572, "learning_rate": 1e-05, - "loss": 0.2998, + "loss": 0.3036, "step": 160300 }, { "epoch": 0.001604, - "grad_norm": 1.2395631074905396, + "grad_norm": 1.3672584295272827, "learning_rate": 1e-05, - "loss": 0.3004, + "loss": 0.3034, "step": 160400 }, { "epoch": 0.001605, - "grad_norm": 1.614815354347229, + "grad_norm": 1.5010960102081299, "learning_rate": 1e-05, - "loss": 0.3013, + "loss": 0.3026, "step": 160500 }, { "epoch": 0.001606, - "grad_norm": 1.480604887008667, + "grad_norm": 1.631774663925171, "learning_rate": 1e-05, - "loss": 0.3039, + "loss": 0.3042, "step": 160600 }, { "epoch": 0.001607, - "grad_norm": 1.201635718345642, + "grad_norm": 1.3571579456329346, "learning_rate": 1e-05, - "loss": 0.3004, + "loss": 0.3008, "step": 160700 }, { "epoch": 0.001608, - "grad_norm": 1.6528321504592896, + "grad_norm": 1.9333149194717407, "learning_rate": 1e-05, - "loss": 0.297, + "loss": 0.2979, "step": 160800 }, { "epoch": 0.001609, - "grad_norm": 1.6721405982971191, + "grad_norm": 1.5662444829940796, "learning_rate": 1e-05, - "loss": 0.2963, + "loss": 0.2991, "step": 160900 }, { "epoch": 0.00161, - "grad_norm": 2.010145425796509, + "grad_norm": 1.4831576347351074, "learning_rate": 1e-05, - "loss": 0.2954, + "loss": 0.2976, "step": 161000 }, { "epoch": 0.001611, - "grad_norm": 1.2307894229888916, + "grad_norm": 1.3023030757904053, "learning_rate": 1e-05, - "loss": 0.3018, + "loss": 0.3031, "step": 161100 }, { "epoch": 0.001612, - "grad_norm": 1.8804794549942017, + "grad_norm": 1.3196107149124146, "learning_rate": 1e-05, - "loss": 0.2943, + "loss": 0.2966, "step": 161200 }, { "epoch": 0.001613, - "grad_norm": 1.5667016506195068, + "grad_norm": 1.6283353567123413, "learning_rate": 1e-05, - "loss": 0.2999, + "loss": 0.3041, "step": 161300 }, { "epoch": 0.001614, - "grad_norm": 1.5471899509429932, + "grad_norm": 1.5330151319503784, "learning_rate": 1e-05, - "loss": 0.2936, + "loss": 0.2953, "step": 161400 }, { "epoch": 0.001615, - "grad_norm": 2.1454241275787354, + "grad_norm": 1.583950400352478, "learning_rate": 1e-05, - "loss": 0.2963, + "loss": 0.2976, "step": 161500 }, { "epoch": 0.001616, - "grad_norm": 1.440755844116211, + "grad_norm": 1.557778000831604, "learning_rate": 1e-05, - "loss": 0.2993, + "loss": 0.3016, "step": 161600 }, { "epoch": 0.001617, - "grad_norm": 1.525349497795105, + "grad_norm": 2.2620208263397217, "learning_rate": 1e-05, - "loss": 0.3037, + "loss": 0.3063, "step": 161700 }, { "epoch": 0.001618, - "grad_norm": 1.6071661710739136, + "grad_norm": 1.6115883588790894, "learning_rate": 1e-05, - "loss": 0.2939, + "loss": 0.2965, "step": 161800 }, { "epoch": 0.001619, - "grad_norm": 2.3565502166748047, + "grad_norm": 1.6807005405426025, "learning_rate": 1e-05, - "loss": 0.2999, + "loss": 0.302, "step": 161900 }, { "epoch": 0.00162, - "grad_norm": 1.2953723669052124, + "grad_norm": 1.4891862869262695, "learning_rate": 1e-05, - "loss": 0.2901, + "loss": 0.293, "step": 162000 }, { "epoch": 0.001621, - "grad_norm": 1.5353537797927856, + "grad_norm": 1.6026562452316284, "learning_rate": 1e-05, - "loss": 0.3022, + "loss": 0.3031, "step": 162100 }, { "epoch": 0.001622, - "grad_norm": 1.378483772277832, + "grad_norm": 1.4442458152770996, "learning_rate": 1e-05, - "loss": 0.3076, + "loss": 0.3084, "step": 162200 }, { "epoch": 0.001623, - "grad_norm": 1.7302042245864868, + "grad_norm": 1.5560252666473389, "learning_rate": 1e-05, - "loss": 0.3005, + "loss": 0.3017, "step": 162300 }, { "epoch": 0.001624, - "grad_norm": 1.6428160667419434, + "grad_norm": 1.6526131629943848, "learning_rate": 1e-05, - "loss": 0.2939, + "loss": 0.2969, "step": 162400 }, { "epoch": 0.001625, - "grad_norm": 1.4420937299728394, + "grad_norm": 1.4917162656784058, "learning_rate": 1e-05, - "loss": 0.2948, + "loss": 0.3017, "step": 162500 }, { "epoch": 0.001626, - "grad_norm": 1.3886778354644775, + "grad_norm": 1.526892066001892, "learning_rate": 1e-05, - "loss": 0.3002, + "loss": 0.3001, "step": 162600 }, { "epoch": 0.001627, - "grad_norm": 1.3009254932403564, + "grad_norm": 1.3089638948440552, "learning_rate": 1e-05, - "loss": 0.2935, + "loss": 0.2965, "step": 162700 }, { "epoch": 0.001628, - "grad_norm": 1.4960280656814575, + "grad_norm": 1.6630245447158813, "learning_rate": 1e-05, - "loss": 0.293, + "loss": 0.2936, "step": 162800 }, { "epoch": 0.001629, - "grad_norm": 1.5243725776672363, + "grad_norm": 1.420673131942749, "learning_rate": 1e-05, - "loss": 0.302, + "loss": 0.3051, "step": 162900 }, { "epoch": 0.00163, - "grad_norm": 1.396635890007019, + "grad_norm": 1.4711486101150513, "learning_rate": 1e-05, - "loss": 0.2921, + "loss": 0.2923, "step": 163000 }, { "epoch": 0.001631, - "grad_norm": 1.774428129196167, + "grad_norm": 1.6381266117095947, "learning_rate": 1e-05, - "loss": 0.2926, + "loss": 0.294, "step": 163100 }, { "epoch": 0.001632, - "grad_norm": 1.544506311416626, + "grad_norm": 1.5917518138885498, "learning_rate": 1e-05, - "loss": 0.2983, + "loss": 0.3024, "step": 163200 }, { "epoch": 0.001633, - "grad_norm": 1.5357224941253662, + "grad_norm": 1.6768611669540405, "learning_rate": 1e-05, - "loss": 0.2959, + "loss": 0.2992, "step": 163300 }, { "epoch": 0.001634, - "grad_norm": 2.2939233779907227, + "grad_norm": 1.5716297626495361, "learning_rate": 1e-05, - "loss": 0.3027, + "loss": 0.3028, "step": 163400 }, { "epoch": 0.001635, - "grad_norm": 1.5708130598068237, + "grad_norm": 1.5690321922302246, "learning_rate": 1e-05, - "loss": 0.2952, + "loss": 0.2989, "step": 163500 }, { "epoch": 0.001636, - "grad_norm": 1.8040704727172852, + "grad_norm": 1.698068618774414, "learning_rate": 1e-05, - "loss": 0.2989, + "loss": 0.3009, "step": 163600 }, { "epoch": 0.001637, - "grad_norm": 1.727067470550537, + "grad_norm": 1.7230242490768433, "learning_rate": 1e-05, - "loss": 0.3005, + "loss": 0.304, "step": 163700 }, { "epoch": 0.001638, - "grad_norm": 1.596760630607605, + "grad_norm": 1.6072338819503784, "learning_rate": 1e-05, - "loss": 0.2925, + "loss": 0.2956, "step": 163800 }, { "epoch": 0.001639, - "grad_norm": 1.6670427322387695, + "grad_norm": 1.611342430114746, "learning_rate": 1e-05, - "loss": 0.2989, + "loss": 0.304, "step": 163900 }, { "epoch": 0.00164, - "grad_norm": 1.4313431978225708, + "grad_norm": 1.4601253271102905, "learning_rate": 1e-05, - "loss": 0.3057, + "loss": 0.3055, "step": 164000 }, { "epoch": 0.001641, - "grad_norm": 1.3369249105453491, + "grad_norm": 1.567654013633728, "learning_rate": 1e-05, - "loss": 0.296, + "loss": 0.2974, "step": 164100 }, { "epoch": 0.001642, - "grad_norm": 1.7383766174316406, + "grad_norm": 2.222820281982422, "learning_rate": 1e-05, - "loss": 0.2949, + "loss": 0.2955, "step": 164200 }, { "epoch": 0.001643, - "grad_norm": 1.556339144706726, + "grad_norm": 4.182979106903076, "learning_rate": 1e-05, - "loss": 0.3001, + "loss": 0.3032, "step": 164300 }, { "epoch": 0.001644, - "grad_norm": 1.5934113264083862, + "grad_norm": 1.8960726261138916, "learning_rate": 1e-05, - "loss": 0.2977, + "loss": 0.3026, "step": 164400 }, { "epoch": 0.001645, - "grad_norm": 1.4391930103302002, + "grad_norm": 1.5564576387405396, "learning_rate": 1e-05, - "loss": 0.2918, + "loss": 0.2951, "step": 164500 }, { "epoch": 0.001646, - "grad_norm": 1.5636848211288452, + "grad_norm": 1.519041657447815, "learning_rate": 1e-05, - "loss": 0.2902, + "loss": 0.2961, "step": 164600 }, { "epoch": 0.001647, - "grad_norm": 1.6338316202163696, + "grad_norm": 1.6985987424850464, "learning_rate": 1e-05, - "loss": 0.2937, + "loss": 0.2964, "step": 164700 }, { "epoch": 0.001648, - "grad_norm": 1.3964899778366089, + "grad_norm": 1.3167078495025635, "learning_rate": 1e-05, - "loss": 0.2969, + "loss": 0.2982, "step": 164800 }, { "epoch": 0.001649, - "grad_norm": 1.4167790412902832, + "grad_norm": 1.5005210638046265, "learning_rate": 1e-05, - "loss": 0.2921, + "loss": 0.2936, "step": 164900 }, { "epoch": 0.00165, - "grad_norm": 1.4142581224441528, + "grad_norm": 1.4177864789962769, "learning_rate": 1e-05, - "loss": 0.2979, + "loss": 0.2978, "step": 165000 }, { "epoch": 0.001651, - "grad_norm": 1.3829108476638794, + "grad_norm": 1.4829902648925781, "learning_rate": 1e-05, - "loss": 0.2943, + "loss": 0.2986, "step": 165100 }, { "epoch": 0.001652, - "grad_norm": 1.5044398307800293, + "grad_norm": 1.3919358253479004, "learning_rate": 1e-05, - "loss": 0.2914, + "loss": 0.2917, "step": 165200 }, { "epoch": 0.001653, - "grad_norm": 1.3914971351623535, + "grad_norm": 1.3996176719665527, "learning_rate": 1e-05, - "loss": 0.2961, + "loss": 0.298, "step": 165300 }, { "epoch": 0.001654, - "grad_norm": 1.2378852367401123, + "grad_norm": 2.0305674076080322, "learning_rate": 1e-05, - "loss": 0.2963, + "loss": 0.2974, "step": 165400 }, { "epoch": 0.001655, - "grad_norm": 1.7061465978622437, + "grad_norm": 1.710474967956543, "learning_rate": 1e-05, - "loss": 0.2979, + "loss": 0.2989, "step": 165500 }, { "epoch": 0.001656, - "grad_norm": 1.3293792009353638, + "grad_norm": 1.4588967561721802, "learning_rate": 1e-05, - "loss": 0.2943, + "loss": 0.2953, "step": 165600 }, { "epoch": 0.001657, - "grad_norm": 1.4436366558074951, + "grad_norm": 1.4981319904327393, "learning_rate": 1e-05, - "loss": 0.2981, + "loss": 0.2997, "step": 165700 }, { "epoch": 0.001658, - "grad_norm": 1.3741648197174072, + "grad_norm": 1.4303194284439087, "learning_rate": 1e-05, - "loss": 0.2961, + "loss": 0.2996, "step": 165800 }, { "epoch": 0.001659, - "grad_norm": 1.3221266269683838, + "grad_norm": 1.3741976022720337, "learning_rate": 1e-05, - "loss": 0.291, + "loss": 0.2921, "step": 165900 }, { "epoch": 0.00166, - "grad_norm": 1.7817293405532837, + "grad_norm": 1.6370424032211304, "learning_rate": 1e-05, - "loss": 0.2997, + "loss": 0.302, "step": 166000 }, { "epoch": 0.001661, - "grad_norm": 1.4453301429748535, + "grad_norm": 1.6333328485488892, "learning_rate": 1e-05, - "loss": 0.2925, + "loss": 0.2957, "step": 166100 }, { "epoch": 0.001662, - "grad_norm": 1.4666496515274048, + "grad_norm": 1.5434244871139526, "learning_rate": 1e-05, - "loss": 0.2872, + "loss": 0.2877, "step": 166200 }, { "epoch": 0.001663, - "grad_norm": 1.5226787328720093, + "grad_norm": 1.4523191452026367, "learning_rate": 1e-05, - "loss": 0.297, + "loss": 0.2984, "step": 166300 }, { "epoch": 0.001664, - "grad_norm": 1.3819777965545654, + "grad_norm": 1.4161934852600098, "learning_rate": 1e-05, - "loss": 0.3076, + "loss": 0.309, "step": 166400 }, { "epoch": 0.001665, - "grad_norm": 1.5542329549789429, + "grad_norm": 1.5231043100357056, "learning_rate": 1e-05, - "loss": 0.2914, + "loss": 0.2942, "step": 166500 }, { "epoch": 0.001666, - "grad_norm": 1.4394443035125732, + "grad_norm": 1.4963332414627075, "learning_rate": 1e-05, - "loss": 0.2905, + "loss": 0.2931, "step": 166600 }, { "epoch": 0.001667, - "grad_norm": 1.636993169784546, + "grad_norm": 1.5862305164337158, "learning_rate": 1e-05, - "loss": 0.2871, + "loss": 0.2876, "step": 166700 }, { "epoch": 0.001668, - "grad_norm": 1.4282114505767822, + "grad_norm": 1.4414396286010742, "learning_rate": 1e-05, - "loss": 0.2939, + "loss": 0.2971, "step": 166800 }, { "epoch": 0.001669, - "grad_norm": 1.4972128868103027, + "grad_norm": 1.5475411415100098, "learning_rate": 1e-05, - "loss": 0.3017, + "loss": 0.3065, "step": 166900 }, { "epoch": 0.00167, - "grad_norm": 1.2904422283172607, + "grad_norm": 1.6189428567886353, "learning_rate": 1e-05, - "loss": 0.291, + "loss": 0.2929, "step": 167000 }, { "epoch": 0.001671, - "grad_norm": 1.5197112560272217, + "grad_norm": 3.0786776542663574, "learning_rate": 1e-05, - "loss": 0.2927, + "loss": 0.2959, "step": 167100 }, { "epoch": 0.001672, - "grad_norm": 1.474998116493225, + "grad_norm": 1.6727555990219116, "learning_rate": 1e-05, - "loss": 0.3, + "loss": 0.302, "step": 167200 }, { "epoch": 0.001673, - "grad_norm": 1.5436956882476807, + "grad_norm": 1.6792882680892944, "learning_rate": 1e-05, - "loss": 0.2931, + "loss": 0.2956, "step": 167300 }, { "epoch": 0.001674, - "grad_norm": 1.3137630224227905, + "grad_norm": 1.4409688711166382, "learning_rate": 1e-05, - "loss": 0.2886, + "loss": 0.2931, "step": 167400 }, { "epoch": 0.001675, - "grad_norm": 1.3990117311477661, + "grad_norm": 1.84697687625885, "learning_rate": 1e-05, - "loss": 0.2904, + "loss": 0.2942, "step": 167500 }, { "epoch": 0.001676, - "grad_norm": 1.5064667463302612, + "grad_norm": 1.294731855392456, "learning_rate": 1e-05, - "loss": 0.2916, + "loss": 0.2908, "step": 167600 }, { "epoch": 0.001677, - "grad_norm": 1.602165937423706, + "grad_norm": 1.557396411895752, "learning_rate": 1e-05, - "loss": 0.2853, + "loss": 0.2906, "step": 167700 }, { "epoch": 0.001678, - "grad_norm": 1.6960735321044922, + "grad_norm": 1.6044083833694458, "learning_rate": 1e-05, - "loss": 0.2931, + "loss": 0.2953, "step": 167800 }, { "epoch": 0.001679, - "grad_norm": 1.660844326019287, + "grad_norm": 1.5445910692214966, "learning_rate": 1e-05, - "loss": 0.296, + "loss": 0.2985, "step": 167900 }, { "epoch": 0.00168, - "grad_norm": 1.309916615486145, + "grad_norm": 1.5258064270019531, "learning_rate": 1e-05, - "loss": 0.2916, + "loss": 0.292, "step": 168000 }, { "epoch": 0.001681, - "grad_norm": 1.5618187189102173, + "grad_norm": 1.6000378131866455, "learning_rate": 1e-05, - "loss": 0.2928, + "loss": 0.2961, "step": 168100 }, { "epoch": 0.001682, - "grad_norm": 1.3995943069458008, + "grad_norm": 1.4779728651046753, "learning_rate": 1e-05, - "loss": 0.3029, + "loss": 0.3035, "step": 168200 }, { "epoch": 0.001683, - "grad_norm": 1.4468162059783936, + "grad_norm": 1.8927539587020874, "learning_rate": 1e-05, - "loss": 0.2932, + "loss": 0.2961, "step": 168300 }, { "epoch": 0.001684, - "grad_norm": 1.3448352813720703, + "grad_norm": 1.5276615619659424, "learning_rate": 1e-05, - "loss": 0.2874, + "loss": 0.2941, "step": 168400 }, { "epoch": 0.001685, - "grad_norm": 1.3857319355010986, + "grad_norm": 1.4742182493209839, "learning_rate": 1e-05, - "loss": 0.2936, + "loss": 0.2977, "step": 168500 }, { "epoch": 0.001686, - "grad_norm": 1.2962915897369385, + "grad_norm": 1.3637776374816895, "learning_rate": 1e-05, - "loss": 0.2898, + "loss": 0.2919, "step": 168600 }, { "epoch": 0.001687, - "grad_norm": 1.3578494787216187, + "grad_norm": 2.3156306743621826, "learning_rate": 1e-05, - "loss": 0.3011, + "loss": 0.3035, "step": 168700 }, { "epoch": 0.001688, - "grad_norm": 1.3680819272994995, + "grad_norm": 1.5192285776138306, "learning_rate": 1e-05, - "loss": 0.2907, + "loss": 0.2936, "step": 168800 }, { "epoch": 0.001689, - "grad_norm": 1.6389213800430298, + "grad_norm": 1.6717158555984497, "learning_rate": 1e-05, - "loss": 0.2862, + "loss": 0.2873, "step": 168900 }, { "epoch": 0.00169, - "grad_norm": 2.12386155128479, + "grad_norm": 1.9118905067443848, "learning_rate": 1e-05, - "loss": 0.2871, + "loss": 0.287, "step": 169000 }, { "epoch": 0.001691, - "grad_norm": 1.7779594659805298, + "grad_norm": 1.750054955482483, "learning_rate": 1e-05, - "loss": 0.2932, + "loss": 0.2943, "step": 169100 }, { "epoch": 0.001692, - "grad_norm": 1.466528058052063, + "grad_norm": 1.545607566833496, "learning_rate": 1e-05, - "loss": 0.2951, + "loss": 0.2942, "step": 169200 }, { "epoch": 0.001693, - "grad_norm": 1.5539381504058838, + "grad_norm": 1.576042652130127, "learning_rate": 1e-05, - "loss": 0.2892, + "loss": 0.2913, "step": 169300 }, { "epoch": 0.001694, - "grad_norm": 1.460390567779541, + "grad_norm": 1.5161222219467163, "learning_rate": 1e-05, - "loss": 0.2897, + "loss": 0.2922, "step": 169400 }, { "epoch": 0.001695, - "grad_norm": 1.5887478590011597, + "grad_norm": 1.5191494226455688, "learning_rate": 1e-05, - "loss": 0.3009, + "loss": 0.3026, "step": 169500 }, { "epoch": 0.001696, - "grad_norm": 1.5351765155792236, + "grad_norm": 1.807310700416565, "learning_rate": 1e-05, - "loss": 0.294, + "loss": 0.2954, "step": 169600 }, { "epoch": 0.001697, - "grad_norm": 1.5349736213684082, + "grad_norm": 1.5956357717514038, "learning_rate": 1e-05, - "loss": 0.2987, + "loss": 0.3014, "step": 169700 }, { "epoch": 0.001698, - "grad_norm": 1.4935643672943115, + "grad_norm": 2.556617259979248, "learning_rate": 1e-05, - "loss": 0.2964, + "loss": 0.2993, "step": 169800 }, { "epoch": 0.001699, - "grad_norm": 1.473772406578064, + "grad_norm": 1.5786460638046265, "learning_rate": 1e-05, - "loss": 0.2927, + "loss": 0.2942, "step": 169900 }, { "epoch": 0.0017, - "grad_norm": 1.596717119216919, + "grad_norm": 1.6583482027053833, "learning_rate": 1e-05, - "loss": 0.2982, + "loss": 0.3006, "step": 170000 }, { "epoch": 0.001701, - "grad_norm": 1.5636470317840576, + "grad_norm": 1.8018178939819336, "learning_rate": 1e-05, - "loss": 0.2955, + "loss": 0.296, "step": 170100 }, { "epoch": 0.001702, - "grad_norm": 1.4247992038726807, + "grad_norm": 1.3693221807479858, "learning_rate": 1e-05, - "loss": 0.2942, + "loss": 0.2949, "step": 170200 }, { "epoch": 0.001703, - "grad_norm": 1.6491881608963013, + "grad_norm": 1.5675960779190063, "learning_rate": 1e-05, - "loss": 0.2916, + "loss": 0.2949, "step": 170300 }, { "epoch": 0.001704, - "grad_norm": 1.6906230449676514, + "grad_norm": 1.8481919765472412, "learning_rate": 1e-05, - "loss": 0.3003, + "loss": 0.3014, "step": 170400 }, { "epoch": 0.001705, - "grad_norm": 1.8152414560317993, + "grad_norm": 1.6740190982818604, "learning_rate": 1e-05, - "loss": 0.2975, + "loss": 0.2991, "step": 170500 }, { "epoch": 0.001706, - "grad_norm": 1.49286687374115, + "grad_norm": 1.5895060300827026, "learning_rate": 1e-05, - "loss": 0.2912, + "loss": 0.2957, "step": 170600 }, { "epoch": 0.001707, - "grad_norm": 1.3003183603286743, + "grad_norm": 1.3283610343933105, "learning_rate": 1e-05, - "loss": 0.286, + "loss": 0.2902, "step": 170700 }, { "epoch": 0.001708, - "grad_norm": 1.5960720777511597, + "grad_norm": 1.542960524559021, "learning_rate": 1e-05, - "loss": 0.2917, + "loss": 0.2922, "step": 170800 }, { "epoch": 0.001709, - "grad_norm": 1.5299168825149536, + "grad_norm": 1.5971072912216187, "learning_rate": 1e-05, - "loss": 0.2945, + "loss": 0.2965, "step": 170900 }, { "epoch": 0.00171, - "grad_norm": 1.5307646989822388, + "grad_norm": 1.559484601020813, "learning_rate": 1e-05, - "loss": 0.288, + "loss": 0.2917, "step": 171000 }, { "epoch": 0.001711, - "grad_norm": 1.8089083433151245, + "grad_norm": 1.4500508308410645, "learning_rate": 1e-05, - "loss": 0.2928, + "loss": 0.2948, "step": 171100 }, { "epoch": 0.001712, - "grad_norm": 1.45745050907135, + "grad_norm": 1.7252469062805176, "learning_rate": 1e-05, - "loss": 0.2959, + "loss": 0.2978, "step": 171200 }, { "epoch": 0.001713, - "grad_norm": 1.3431226015090942, + "grad_norm": 1.3989806175231934, "learning_rate": 1e-05, - "loss": 0.2882, + "loss": 0.2918, "step": 171300 }, { "epoch": 0.001714, - "grad_norm": 1.398280143737793, + "grad_norm": 1.3513588905334473, "learning_rate": 1e-05, - "loss": 0.2914, + "loss": 0.2945, "step": 171400 }, { "epoch": 0.001715, - "grad_norm": 1.6024528741836548, + "grad_norm": 1.7322951555252075, "learning_rate": 1e-05, - "loss": 0.2899, + "loss": 0.2937, "step": 171500 }, { "epoch": 0.001716, - "grad_norm": 1.3974082469940186, + "grad_norm": 1.5518382787704468, "learning_rate": 1e-05, - "loss": 0.2948, + "loss": 0.2963, "step": 171600 }, { "epoch": 0.001717, - "grad_norm": 1.5628585815429688, + "grad_norm": 1.6225837469100952, "learning_rate": 1e-05, - "loss": 0.2949, + "loss": 0.2996, "step": 171700 }, { "epoch": 0.001718, - "grad_norm": 1.5860384702682495, + "grad_norm": 1.6591675281524658, "learning_rate": 1e-05, - "loss": 0.2984, + "loss": 0.3009, "step": 171800 }, { "epoch": 0.001719, - "grad_norm": 1.5331531763076782, + "grad_norm": 1.6477521657943726, "learning_rate": 1e-05, - "loss": 0.296, + "loss": 0.2986, "step": 171900 }, { "epoch": 0.00172, - "grad_norm": 1.3094617128372192, + "grad_norm": 1.392760992050171, "learning_rate": 1e-05, - "loss": 0.2978, + "loss": 0.2988, "step": 172000 }, { "epoch": 0.001721, - "grad_norm": 1.4261012077331543, + "grad_norm": 2.2496235370635986, "learning_rate": 1e-05, - "loss": 0.2897, + "loss": 0.2929, "step": 172100 }, { "epoch": 0.001722, - "grad_norm": 1.3938125371932983, + "grad_norm": 1.5061190128326416, "learning_rate": 1e-05, - "loss": 0.2864, + "loss": 0.2886, "step": 172200 }, { "epoch": 0.001723, - "grad_norm": 1.6276272535324097, + "grad_norm": 1.567452311515808, "learning_rate": 1e-05, - "loss": 0.2947, + "loss": 0.2995, "step": 172300 }, { "epoch": 0.001724, - "grad_norm": 1.5261038541793823, + "grad_norm": 1.5281249284744263, "learning_rate": 1e-05, - "loss": 0.2953, + "loss": 0.3001, "step": 172400 }, { "epoch": 0.001725, - "grad_norm": 1.6271032094955444, + "grad_norm": 1.5612703561782837, "learning_rate": 1e-05, - "loss": 0.2998, + "loss": 0.3001, "step": 172500 }, { "epoch": 0.001726, - "grad_norm": 1.5781676769256592, + "grad_norm": 1.5059471130371094, "learning_rate": 1e-05, - "loss": 0.2874, + "loss": 0.2931, "step": 172600 }, { "epoch": 0.001727, - "grad_norm": 1.5198475122451782, + "grad_norm": 1.598825216293335, "learning_rate": 1e-05, - "loss": 0.2958, + "loss": 0.2935, "step": 172700 }, { "epoch": 0.001728, - "grad_norm": 1.493341088294983, + "grad_norm": 1.605755090713501, "learning_rate": 1e-05, - "loss": 0.2931, + "loss": 0.2971, "step": 172800 }, { "epoch": 0.001729, - "grad_norm": 1.5318119525909424, + "grad_norm": 1.6247023344039917, "learning_rate": 1e-05, - "loss": 0.2923, + "loss": 0.2902, "step": 172900 }, { "epoch": 0.00173, - "grad_norm": 1.418683409690857, + "grad_norm": 1.3344452381134033, "learning_rate": 1e-05, - "loss": 0.2988, + "loss": 0.3012, "step": 173000 }, { "epoch": 0.001731, - "grad_norm": 1.5809842348098755, + "grad_norm": 1.5707899332046509, "learning_rate": 1e-05, - "loss": 0.2899, + "loss": 0.2924, "step": 173100 }, { "epoch": 0.001732, - "grad_norm": 1.497093677520752, + "grad_norm": 1.3291939496994019, "learning_rate": 1e-05, - "loss": 0.286, + "loss": 0.2883, "step": 173200 }, { "epoch": 0.001733, - "grad_norm": 1.5036687850952148, + "grad_norm": 1.429669737815857, "learning_rate": 1e-05, - "loss": 0.295, + "loss": 0.2973, "step": 173300 }, { "epoch": 0.001734, - "grad_norm": 1.9745302200317383, + "grad_norm": 1.5371525287628174, "learning_rate": 1e-05, - "loss": 0.2939, + "loss": 0.2962, "step": 173400 }, { "epoch": 0.001735, - "grad_norm": 1.303313136100769, + "grad_norm": 1.5190463066101074, "learning_rate": 1e-05, - "loss": 0.2971, + "loss": 0.3017, "step": 173500 }, { "epoch": 0.001736, - "grad_norm": 1.3194680213928223, + "grad_norm": 1.4913296699523926, "learning_rate": 1e-05, - "loss": 0.2978, + "loss": 0.3002, "step": 173600 }, { "epoch": 0.001737, - "grad_norm": 1.490945816040039, + "grad_norm": 1.5345001220703125, "learning_rate": 1e-05, - "loss": 0.2912, + "loss": 0.2946, "step": 173700 }, { "epoch": 0.001738, - "grad_norm": 1.5308412313461304, + "grad_norm": 1.5316048860549927, "learning_rate": 1e-05, - "loss": 0.2932, + "loss": 0.2978, "step": 173800 }, { "epoch": 0.001739, - "grad_norm": 1.6066863536834717, + "grad_norm": 1.597626805305481, "learning_rate": 1e-05, - "loss": 0.2857, + "loss": 0.289, "step": 173900 }, { "epoch": 0.00174, - "grad_norm": 1.3577286005020142, + "grad_norm": 1.2749770879745483, "learning_rate": 1e-05, - "loss": 0.2923, + "loss": 0.2933, "step": 174000 }, { "epoch": 0.001741, - "grad_norm": 1.5625604391098022, + "grad_norm": 1.5808467864990234, "learning_rate": 1e-05, - "loss": 0.2979, + "loss": 0.2997, "step": 174100 }, { "epoch": 0.001742, - "grad_norm": 1.6609506607055664, + "grad_norm": 1.2959426641464233, "learning_rate": 1e-05, "loss": 0.2873, "step": 174200 }, { "epoch": 0.001743, - "grad_norm": 1.4918367862701416, + "grad_norm": 1.6337339878082275, "learning_rate": 1e-05, - "loss": 0.2837, + "loss": 0.2862, "step": 174300 }, { "epoch": 0.001744, - "grad_norm": 1.509962797164917, + "grad_norm": 1.528238296508789, "learning_rate": 1e-05, - "loss": 0.2823, + "loss": 0.2847, "step": 174400 }, { "epoch": 0.001745, - "grad_norm": 1.4650171995162964, + "grad_norm": 1.4361398220062256, "learning_rate": 1e-05, - "loss": 0.2922, + "loss": 0.2953, "step": 174500 }, { "epoch": 0.001746, - "grad_norm": 1.634386420249939, + "grad_norm": 1.6236249208450317, "learning_rate": 1e-05, - "loss": 0.2929, + "loss": 0.2961, "step": 174600 }, { "epoch": 0.001747, - "grad_norm": 1.3554291725158691, + "grad_norm": 1.4904263019561768, "learning_rate": 1e-05, - "loss": 0.2853, + "loss": 0.287, "step": 174700 }, { "epoch": 0.001748, - "grad_norm": 1.7688937187194824, + "grad_norm": 1.9077177047729492, "learning_rate": 1e-05, - "loss": 0.2899, + "loss": 0.292, "step": 174800 }, { "epoch": 0.001749, - "grad_norm": 1.3382573127746582, + "grad_norm": 1.7309199571609497, "learning_rate": 1e-05, - "loss": 0.2936, + "loss": 0.2952, "step": 174900 }, { "epoch": 0.00175, - "grad_norm": 1.6003446578979492, + "grad_norm": 1.6516369581222534, "learning_rate": 1e-05, - "loss": 0.2915, + "loss": 0.2935, "step": 175000 }, { "epoch": 0.001751, - "grad_norm": 1.4911869764328003, + "grad_norm": 1.6567866802215576, "learning_rate": 1e-05, - "loss": 0.2874, + "loss": 0.2913, "step": 175100 }, { "epoch": 0.001752, - "grad_norm": 1.6006922721862793, + "grad_norm": 1.4897688627243042, "learning_rate": 1e-05, - "loss": 0.2895, + "loss": 0.2883, "step": 175200 }, { "epoch": 0.001753, - "grad_norm": 1.3755617141723633, + "grad_norm": 1.3633755445480347, "learning_rate": 1e-05, - "loss": 0.2958, + "loss": 0.2985, "step": 175300 }, { "epoch": 0.001754, - "grad_norm": 1.7918363809585571, + "grad_norm": 1.6736537218093872, "learning_rate": 1e-05, - "loss": 0.293, + "loss": 0.2924, "step": 175400 }, { "epoch": 0.001755, - "grad_norm": 1.5009455680847168, + "grad_norm": 1.5742863416671753, "learning_rate": 1e-05, - "loss": 0.283, + "loss": 0.285, "step": 175500 }, { "epoch": 0.001756, - "grad_norm": 1.3881324529647827, + "grad_norm": 1.5823429822921753, "learning_rate": 1e-05, "loss": 0.2865, "step": 175600 }, { "epoch": 0.001757, - "grad_norm": 1.3606234788894653, + "grad_norm": 1.4199198484420776, "learning_rate": 1e-05, - "loss": 0.2981, + "loss": 0.2969, "step": 175700 }, { "epoch": 0.001758, - "grad_norm": 1.428105354309082, + "grad_norm": 1.4227279424667358, "learning_rate": 1e-05, - "loss": 0.2863, + "loss": 0.2889, "step": 175800 }, { "epoch": 0.001759, - "grad_norm": 1.5567902326583862, + "grad_norm": 1.3790444135665894, "learning_rate": 1e-05, - "loss": 0.2927, + "loss": 0.2947, "step": 175900 }, { "epoch": 0.00176, - "grad_norm": 1.582445502281189, + "grad_norm": 1.717409372329712, "learning_rate": 1e-05, - "loss": 0.2882, + "loss": 0.2934, "step": 176000 }, { "epoch": 0.001761, - "grad_norm": 1.476045846939087, + "grad_norm": 1.5897624492645264, "learning_rate": 1e-05, - "loss": 0.2964, + "loss": 0.3003, "step": 176100 }, { "epoch": 0.001762, - "grad_norm": 1.7226948738098145, + "grad_norm": 1.5490648746490479, "learning_rate": 1e-05, - "loss": 0.2894, + "loss": 0.294, "step": 176200 }, { "epoch": 0.001763, - "grad_norm": 1.4645779132843018, + "grad_norm": 1.5469624996185303, "learning_rate": 1e-05, - "loss": 0.2907, + "loss": 0.2915, "step": 176300 }, { "epoch": 0.001764, - "grad_norm": 1.6208833456039429, + "grad_norm": 1.6679738759994507, "learning_rate": 1e-05, - "loss": 0.2932, + "loss": 0.2966, "step": 176400 }, { "epoch": 0.001765, - "grad_norm": 1.3378137350082397, + "grad_norm": 1.3103872537612915, "learning_rate": 1e-05, - "loss": 0.2935, + "loss": 0.2943, "step": 176500 }, { "epoch": 0.001766, - "grad_norm": 1.4714703559875488, + "grad_norm": 1.6076604127883911, "learning_rate": 1e-05, - "loss": 0.2922, + "loss": 0.293, "step": 176600 }, { "epoch": 0.001767, - "grad_norm": 1.4893999099731445, + "grad_norm": 1.7157478332519531, "learning_rate": 1e-05, - "loss": 0.29, + "loss": 0.2908, "step": 176700 }, { "epoch": 0.001768, - "grad_norm": 1.5170120000839233, + "grad_norm": 1.6932576894760132, "learning_rate": 1e-05, - "loss": 0.2911, + "loss": 0.2931, "step": 176800 }, { "epoch": 0.001769, - "grad_norm": 1.3923100233078003, + "grad_norm": 1.3828284740447998, "learning_rate": 1e-05, - "loss": 0.2859, + "loss": 0.2886, "step": 176900 }, { "epoch": 0.00177, - "grad_norm": 2.3041770458221436, + "grad_norm": 1.5653789043426514, "learning_rate": 1e-05, - "loss": 0.2933, + "loss": 0.2952, "step": 177000 }, { "epoch": 0.001771, - "grad_norm": 1.785549521446228, + "grad_norm": 1.7369993925094604, "learning_rate": 1e-05, - "loss": 0.2858, + "loss": 0.287, "step": 177100 }, { "epoch": 0.001772, - "grad_norm": 1.4296283721923828, + "grad_norm": 1.541108250617981, "learning_rate": 1e-05, - "loss": 0.287, + "loss": 0.2885, "step": 177200 }, { "epoch": 0.001773, - "grad_norm": 1.612414002418518, + "grad_norm": 1.5713878870010376, "learning_rate": 1e-05, - "loss": 0.2922, + "loss": 0.2951, "step": 177300 }, { "epoch": 0.001774, - "grad_norm": 1.428350567817688, + "grad_norm": 1.6048654317855835, "learning_rate": 1e-05, - "loss": 0.289, + "loss": 0.2922, "step": 177400 }, { "epoch": 0.001775, - "grad_norm": 1.41500985622406, + "grad_norm": 1.6117722988128662, "learning_rate": 1e-05, - "loss": 0.2896, + "loss": 0.2909, "step": 177500 }, { "epoch": 0.001776, - "grad_norm": 1.4545865058898926, + "grad_norm": 1.759687900543213, "learning_rate": 1e-05, - "loss": 0.2951, + "loss": 0.2967, "step": 177600 }, { "epoch": 0.001777, - "grad_norm": 1.3556345701217651, + "grad_norm": 1.4271762371063232, "learning_rate": 1e-05, - "loss": 0.2844, + "loss": 0.2868, "step": 177700 }, { "epoch": 0.001778, - "grad_norm": 1.5635255575180054, + "grad_norm": 1.4909316301345825, "learning_rate": 1e-05, - "loss": 0.292, + "loss": 0.2918, "step": 177800 }, { "epoch": 0.001779, - "grad_norm": 1.488157868385315, + "grad_norm": 1.498526692390442, "learning_rate": 1e-05, - "loss": 0.2946, + "loss": 0.2973, "step": 177900 }, { "epoch": 0.00178, - "grad_norm": 1.5837771892547607, + "grad_norm": 1.373579502105713, "learning_rate": 1e-05, - "loss": 0.2856, + "loss": 0.2857, "step": 178000 }, { "epoch": 0.001781, - "grad_norm": 1.4179489612579346, + "grad_norm": 1.5439717769622803, "learning_rate": 1e-05, - "loss": 0.2863, + "loss": 0.2907, "step": 178100 }, { "epoch": 0.001782, - "grad_norm": 1.3762588500976562, + "grad_norm": 1.6108523607254028, "learning_rate": 1e-05, - "loss": 0.2914, + "loss": 0.294, "step": 178200 }, { "epoch": 0.001783, - "grad_norm": 1.508596420288086, + "grad_norm": 1.3739374876022339, "learning_rate": 1e-05, - "loss": 0.2947, + "loss": 0.2959, "step": 178300 }, { "epoch": 0.001784, - "grad_norm": 1.5911142826080322, + "grad_norm": 2.883863925933838, "learning_rate": 1e-05, - "loss": 0.2846, + "loss": 0.2891, "step": 178400 }, { "epoch": 0.001785, - "grad_norm": 1.5076138973236084, + "grad_norm": 1.3920929431915283, "learning_rate": 1e-05, - "loss": 0.2884, + "loss": 0.291, "step": 178500 }, { "epoch": 0.001786, - "grad_norm": 1.4542021751403809, + "grad_norm": 1.4327913522720337, "learning_rate": 1e-05, - "loss": 0.2861, + "loss": 0.2892, "step": 178600 }, { "epoch": 0.001787, - "grad_norm": 1.6039193868637085, + "grad_norm": 1.6962852478027344, "learning_rate": 1e-05, - "loss": 0.2835, + "loss": 0.2852, "step": 178700 }, { "epoch": 0.001788, - "grad_norm": 1.6753792762756348, + "grad_norm": 1.5259137153625488, "learning_rate": 1e-05, - "loss": 0.2918, + "loss": 0.2974, "step": 178800 }, { "epoch": 0.001789, - "grad_norm": 1.7615842819213867, + "grad_norm": 1.4931055307388306, "learning_rate": 1e-05, - "loss": 0.2838, + "loss": 0.2888, "step": 178900 }, { "epoch": 0.00179, - "grad_norm": 1.5452884435653687, + "grad_norm": 1.5622413158416748, "learning_rate": 1e-05, - "loss": 0.2972, + "loss": 0.2997, "step": 179000 }, { "epoch": 0.001791, - "grad_norm": 1.355545163154602, + "grad_norm": 1.5523242950439453, "learning_rate": 1e-05, - "loss": 0.2881, + "loss": 0.2929, "step": 179100 }, { "epoch": 0.001792, - "grad_norm": 1.5445750951766968, + "grad_norm": 1.4353841543197632, "learning_rate": 1e-05, - "loss": 0.282, + "loss": 0.285, "step": 179200 }, { "epoch": 0.001793, - "grad_norm": 1.4062409400939941, + "grad_norm": 1.4831793308258057, "learning_rate": 1e-05, - "loss": 0.2917, + "loss": 0.2938, "step": 179300 }, { "epoch": 0.001794, - "grad_norm": 1.487202525138855, + "grad_norm": 1.483508825302124, "learning_rate": 1e-05, - "loss": 0.2923, + "loss": 0.2919, "step": 179400 }, { "epoch": 0.001795, - "grad_norm": 1.52508544921875, + "grad_norm": 1.4768630266189575, "learning_rate": 1e-05, - "loss": 0.2889, + "loss": 0.2892, "step": 179500 }, { "epoch": 0.001796, - "grad_norm": 1.2828269004821777, + "grad_norm": 1.329671859741211, "learning_rate": 1e-05, - "loss": 0.2887, + "loss": 0.2902, "step": 179600 }, { "epoch": 0.001797, - "grad_norm": 1.4416427612304688, + "grad_norm": 1.4865089654922485, "learning_rate": 1e-05, - "loss": 0.2873, + "loss": 0.2906, "step": 179700 }, { "epoch": 0.001798, - "grad_norm": 1.436898112297058, + "grad_norm": 1.6199544668197632, "learning_rate": 1e-05, - "loss": 0.297, + "loss": 0.3013, "step": 179800 }, { "epoch": 0.001799, - "grad_norm": 1.3973064422607422, + "grad_norm": 1.633739948272705, "learning_rate": 1e-05, - "loss": 0.2897, + "loss": 0.29, "step": 179900 }, { "epoch": 0.0018, - "grad_norm": 1.392570972442627, + "grad_norm": 1.3728777170181274, "learning_rate": 1e-05, - "loss": 0.2864, + "loss": 0.2875, "step": 180000 }, { "epoch": 0.0018, - "eval_loss": 0.26318359375, - "eval_runtime": 101.5435, - "eval_samples_per_second": 492.4, - "eval_steps_per_second": 30.775, + "eval_loss": 0.264892578125, + "eval_runtime": 113.9893, + "eval_samples_per_second": 438.638, + "eval_steps_per_second": 27.415, "step": 180000 }, { "epoch": 0.001801, - "grad_norm": 1.500074028968811, + "grad_norm": 2.310349702835083, "learning_rate": 1e-05, - "loss": 0.2922, + "loss": 0.2925, "step": 180100 }, { "epoch": 0.001802, - "grad_norm": 1.3383935689926147, + "grad_norm": 1.3453627824783325, "learning_rate": 1e-05, - "loss": 0.2926, + "loss": 0.292, "step": 180200 }, { "epoch": 0.001803, - "grad_norm": 1.3997195959091187, + "grad_norm": 1.8540631532669067, "learning_rate": 1e-05, - "loss": 0.2944, + "loss": 0.2946, "step": 180300 }, { "epoch": 0.001804, - "grad_norm": 1.3471624851226807, + "grad_norm": 1.594420075416565, "learning_rate": 1e-05, - "loss": 0.2899, + "loss": 0.2923, "step": 180400 }, { "epoch": 0.001805, - "grad_norm": 1.6335119009017944, + "grad_norm": 1.5511283874511719, "learning_rate": 1e-05, - "loss": 0.2902, + "loss": 0.2924, "step": 180500 }, { "epoch": 0.001806, - "grad_norm": 1.5379420518875122, + "grad_norm": 1.8114066123962402, "learning_rate": 1e-05, - "loss": 0.2861, + "loss": 0.2868, "step": 180600 }, { "epoch": 0.001807, - "grad_norm": 1.503728985786438, + "grad_norm": 1.5278881788253784, "learning_rate": 1e-05, - "loss": 0.2866, + "loss": 0.2896, "step": 180700 }, { "epoch": 0.001808, - "grad_norm": 1.481974720954895, + "grad_norm": 1.4767954349517822, "learning_rate": 1e-05, - "loss": 0.2905, + "loss": 0.2928, "step": 180800 }, { "epoch": 0.001809, - "grad_norm": 1.363460898399353, + "grad_norm": 1.3067213296890259, "learning_rate": 1e-05, - "loss": 0.2892, + "loss": 0.2918, "step": 180900 }, { "epoch": 0.00181, - "grad_norm": 1.3591028451919556, + "grad_norm": 1.7097564935684204, "learning_rate": 1e-05, - "loss": 0.2864, + "loss": 0.2913, "step": 181000 }, { "epoch": 0.001811, - "grad_norm": 1.5501904487609863, + "grad_norm": 1.6690146923065186, "learning_rate": 1e-05, - "loss": 0.2818, + "loss": 0.2808, "step": 181100 }, { "epoch": 0.001812, - "grad_norm": 1.7127107381820679, + "grad_norm": 1.6829502582550049, "learning_rate": 1e-05, - "loss": 0.2849, + "loss": 0.2874, "step": 181200 }, { "epoch": 0.001813, - "grad_norm": 3.471078395843506, + "grad_norm": 1.2836750745773315, "learning_rate": 1e-05, - "loss": 0.2894, + "loss": 0.2914, "step": 181300 }, { "epoch": 0.001814, - "grad_norm": 1.5325214862823486, + "grad_norm": 1.5141675472259521, "learning_rate": 1e-05, - "loss": 0.2856, + "loss": 0.2869, "step": 181400 }, { "epoch": 0.001815, - "grad_norm": 1.3696657419204712, + "grad_norm": 1.571880578994751, "learning_rate": 1e-05, - "loss": 0.2883, + "loss": 0.2908, "step": 181500 }, { "epoch": 0.001816, - "grad_norm": 1.676340103149414, + "grad_norm": 1.5643311738967896, "learning_rate": 1e-05, - "loss": 0.2875, + "loss": 0.2905, "step": 181600 }, { "epoch": 0.001817, - "grad_norm": 1.3041986227035522, + "grad_norm": 1.9679372310638428, "learning_rate": 1e-05, - "loss": 0.2853, + "loss": 0.2866, "step": 181700 }, { "epoch": 0.001818, - "grad_norm": 1.4370715618133545, + "grad_norm": 1.5207774639129639, "learning_rate": 1e-05, - "loss": 0.2863, + "loss": 0.2876, "step": 181800 }, { "epoch": 0.001819, - "grad_norm": 1.2346585988998413, + "grad_norm": 1.4971661567687988, "learning_rate": 1e-05, - "loss": 0.28, + "loss": 0.2837, "step": 181900 }, { "epoch": 0.00182, - "grad_norm": 1.4242684841156006, + "grad_norm": 1.3630481958389282, "learning_rate": 1e-05, - "loss": 0.2884, + "loss": 0.2901, "step": 182000 }, { "epoch": 0.001821, - "grad_norm": 1.7271345853805542, + "grad_norm": 1.7479013204574585, "learning_rate": 1e-05, - "loss": 0.279, + "loss": 0.2809, "step": 182100 }, { "epoch": 0.001822, - "grad_norm": 1.4798322916030884, + "grad_norm": 1.6308436393737793, "learning_rate": 1e-05, - "loss": 0.2849, + "loss": 0.2869, "step": 182200 }, { "epoch": 0.001823, - "grad_norm": 1.5771898031234741, + "grad_norm": 1.6583669185638428, "learning_rate": 1e-05, - "loss": 0.2792, + "loss": 0.2828, "step": 182300 }, { "epoch": 0.001824, - "grad_norm": 1.6728217601776123, + "grad_norm": 1.7341161966323853, "learning_rate": 1e-05, - "loss": 0.2933, + "loss": 0.296, "step": 182400 }, { "epoch": 0.001825, - "grad_norm": 1.3526890277862549, + "grad_norm": 1.2434451580047607, "learning_rate": 1e-05, - "loss": 0.2868, + "loss": 0.2903, "step": 182500 }, { "epoch": 0.001826, - "grad_norm": 1.3895459175109863, + "grad_norm": 1.4031060934066772, "learning_rate": 1e-05, - "loss": 0.2852, + "loss": 0.2871, "step": 182600 }, { "epoch": 0.001827, - "grad_norm": 1.3883079290390015, + "grad_norm": 1.417802095413208, "learning_rate": 1e-05, - "loss": 0.2801, + "loss": 0.2849, "step": 182700 }, { "epoch": 0.001828, - "grad_norm": 1.5432260036468506, + "grad_norm": 1.6376116275787354, "learning_rate": 1e-05, - "loss": 0.285, + "loss": 0.2888, "step": 182800 }, { "epoch": 0.001829, - "grad_norm": 1.460884928703308, + "grad_norm": 1.5004040002822876, "learning_rate": 1e-05, - "loss": 0.2875, + "loss": 0.2889, "step": 182900 }, { "epoch": 0.00183, - "grad_norm": 1.4623795747756958, + "grad_norm": 1.3705480098724365, "learning_rate": 1e-05, - "loss": 0.2868, + "loss": 0.2916, "step": 183000 }, { "epoch": 0.001831, - "grad_norm": 1.390573263168335, + "grad_norm": 1.4046076536178589, "learning_rate": 1e-05, - "loss": 0.2846, + "loss": 0.2871, "step": 183100 }, { "epoch": 0.001832, - "grad_norm": 1.5001994371414185, + "grad_norm": 1.460054874420166, "learning_rate": 1e-05, - "loss": 0.2878, + "loss": 0.2915, "step": 183200 }, { "epoch": 0.001833, - "grad_norm": 1.5562909841537476, + "grad_norm": 2.7054927349090576, "learning_rate": 1e-05, - "loss": 0.2884, + "loss": 0.2913, "step": 183300 }, { "epoch": 0.001834, - "grad_norm": 1.5299289226531982, + "grad_norm": 1.5564157962799072, "learning_rate": 1e-05, - "loss": 0.281, + "loss": 0.2803, "step": 183400 }, { "epoch": 0.001835, - "grad_norm": 1.4576228857040405, + "grad_norm": 1.4496382474899292, "learning_rate": 1e-05, - "loss": 0.2914, + "loss": 0.2936, "step": 183500 }, { "epoch": 0.001836, - "grad_norm": 1.2668393850326538, + "grad_norm": 1.3869458436965942, "learning_rate": 1e-05, - "loss": 0.2858, + "loss": 0.2927, "step": 183600 }, { "epoch": 0.001837, - "grad_norm": 1.374589204788208, + "grad_norm": 1.5351581573486328, "learning_rate": 1e-05, - "loss": 0.2897, + "loss": 0.294, "step": 183700 }, { "epoch": 0.001838, - "grad_norm": 1.353493094444275, + "grad_norm": 1.3545173406600952, "learning_rate": 1e-05, - "loss": 0.2889, + "loss": 0.289, "step": 183800 }, { "epoch": 0.001839, - "grad_norm": 1.3850983381271362, + "grad_norm": 2.300602674484253, "learning_rate": 1e-05, - "loss": 0.2761, + "loss": 0.2814, "step": 183900 }, { "epoch": 0.00184, - "grad_norm": 1.5236430168151855, + "grad_norm": 1.4842824935913086, "learning_rate": 1e-05, - "loss": 0.2859, + "loss": 0.2895, "step": 184000 }, { "epoch": 0.001841, - "grad_norm": 2.55208158493042, + "grad_norm": 1.6287872791290283, "learning_rate": 1e-05, - "loss": 0.2898, + "loss": 0.2941, "step": 184100 }, { "epoch": 0.001842, - "grad_norm": 1.6300517320632935, + "grad_norm": 1.3800750970840454, "learning_rate": 1e-05, - "loss": 0.2845, + "loss": 0.2834, "step": 184200 }, { "epoch": 0.001843, - "grad_norm": 1.828763723373413, + "grad_norm": 1.4979010820388794, "learning_rate": 1e-05, - "loss": 0.2884, + "loss": 0.2903, "step": 184300 }, { "epoch": 0.001844, - "grad_norm": 1.4066919088363647, + "grad_norm": 1.5063962936401367, "learning_rate": 1e-05, - "loss": 0.2852, + "loss": 0.2857, "step": 184400 }, { "epoch": 0.001845, - "grad_norm": 1.4668482542037964, + "grad_norm": 1.2965285778045654, "learning_rate": 1e-05, - "loss": 0.2839, + "loss": 0.2856, "step": 184500 }, { "epoch": 0.001846, - "grad_norm": 1.375360131263733, + "grad_norm": 1.467738389968872, "learning_rate": 1e-05, - "loss": 0.284, + "loss": 0.2826, "step": 184600 }, { "epoch": 0.001847, - "grad_norm": 1.352906346321106, + "grad_norm": 1.4696054458618164, "learning_rate": 1e-05, - "loss": 0.2907, + "loss": 0.2938, "step": 184700 }, { "epoch": 0.001848, - "grad_norm": 1.2676482200622559, + "grad_norm": 1.6094032526016235, "learning_rate": 1e-05, - "loss": 0.2838, + "loss": 0.2852, "step": 184800 }, { "epoch": 0.001849, - "grad_norm": 1.5844227075576782, + "grad_norm": 1.5092664957046509, "learning_rate": 1e-05, - "loss": 0.2902, + "loss": 0.2951, "step": 184900 }, { "epoch": 0.00185, - "grad_norm": 1.5632858276367188, + "grad_norm": 1.482487678527832, "learning_rate": 1e-05, - "loss": 0.2868, + "loss": 0.2905, "step": 185000 }, { "epoch": 0.001851, - "grad_norm": 1.4897798299789429, + "grad_norm": 1.4276769161224365, "learning_rate": 1e-05, - "loss": 0.2815, + "loss": 0.2852, "step": 185100 }, { "epoch": 0.001852, - "grad_norm": 1.4093352556228638, + "grad_norm": 1.756006121635437, "learning_rate": 1e-05, - "loss": 0.2893, + "loss": 0.2904, "step": 185200 }, { "epoch": 0.001853, - "grad_norm": 1.969905138015747, + "grad_norm": 1.7595645189285278, "learning_rate": 1e-05, - "loss": 0.2804, + "loss": 0.2808, "step": 185300 }, { "epoch": 0.001854, - "grad_norm": 1.606662631034851, + "grad_norm": 1.4511970281600952, "learning_rate": 1e-05, - "loss": 0.2853, + "loss": 0.2854, "step": 185400 }, { "epoch": 0.001855, - "grad_norm": 1.5151783227920532, + "grad_norm": 1.4957120418548584, "learning_rate": 1e-05, - "loss": 0.283, + "loss": 0.287, "step": 185500 }, { "epoch": 0.001856, - "grad_norm": 1.450666069984436, + "grad_norm": 1.5508650541305542, "learning_rate": 1e-05, - "loss": 0.2816, + "loss": 0.2868, "step": 185600 }, { "epoch": 0.001857, - "grad_norm": 1.531380534172058, + "grad_norm": 1.4669588804244995, "learning_rate": 1e-05, - "loss": 0.2921, + "loss": 0.2955, "step": 185700 }, { "epoch": 0.001858, - "grad_norm": 1.3878519535064697, + "grad_norm": 1.456214189529419, "learning_rate": 1e-05, - "loss": 0.2847, + "loss": 0.2887, "step": 185800 }, { "epoch": 0.001859, - "grad_norm": 1.5633848905563354, + "grad_norm": 1.5572723150253296, "learning_rate": 1e-05, - "loss": 0.2839, + "loss": 0.2878, "step": 185900 }, { "epoch": 0.00186, - "grad_norm": 1.233081579208374, + "grad_norm": 1.1924431324005127, "learning_rate": 1e-05, - "loss": 0.2843, + "loss": 0.2872, "step": 186000 }, { "epoch": 0.001861, - "grad_norm": 1.5852998495101929, + "grad_norm": 1.5211331844329834, "learning_rate": 1e-05, - "loss": 0.2807, + "loss": 0.2831, "step": 186100 }, { "epoch": 0.001862, - "grad_norm": 1.3963110446929932, + "grad_norm": 3.843024969100952, "learning_rate": 1e-05, - "loss": 0.281, + "loss": 0.2831, "step": 186200 }, { "epoch": 0.001863, - "grad_norm": 1.4942704439163208, + "grad_norm": 1.5179247856140137, "learning_rate": 1e-05, - "loss": 0.2853, + "loss": 0.285, "step": 186300 }, { "epoch": 0.001864, - "grad_norm": 1.4397003650665283, + "grad_norm": 1.3682702779769897, "learning_rate": 1e-05, - "loss": 0.2855, + "loss": 0.2882, "step": 186400 }, { "epoch": 0.001865, - "grad_norm": 1.609068512916565, + "grad_norm": 1.5251023769378662, "learning_rate": 1e-05, - "loss": 0.2922, + "loss": 0.2915, "step": 186500 }, { "epoch": 0.001866, - "grad_norm": 1.6162950992584229, + "grad_norm": 1.627709984779358, "learning_rate": 1e-05, - "loss": 0.2817, + "loss": 0.2847, "step": 186600 }, { "epoch": 0.001867, - "grad_norm": 1.4839801788330078, + "grad_norm": 1.5060405731201172, "learning_rate": 1e-05, - "loss": 0.281, + "loss": 0.2845, "step": 186700 }, { "epoch": 0.001868, - "grad_norm": 1.4925748109817505, + "grad_norm": 1.6034202575683594, "learning_rate": 1e-05, - "loss": 0.2827, + "loss": 0.287, "step": 186800 }, { "epoch": 0.001869, - "grad_norm": 1.4187835454940796, + "grad_norm": 1.4927774667739868, "learning_rate": 1e-05, - "loss": 0.2807, + "loss": 0.2857, "step": 186900 }, { "epoch": 0.00187, - "grad_norm": 1.2236676216125488, + "grad_norm": 1.4778563976287842, "learning_rate": 1e-05, - "loss": 0.2794, + "loss": 0.2844, "step": 187000 }, { "epoch": 0.001871, - "grad_norm": 1.3004987239837646, + "grad_norm": 1.2116749286651611, "learning_rate": 1e-05, - "loss": 0.2798, + "loss": 0.2805, "step": 187100 }, { "epoch": 0.001872, - "grad_norm": 1.5337393283843994, + "grad_norm": 1.4425694942474365, "learning_rate": 1e-05, - "loss": 0.2813, + "loss": 0.2824, "step": 187200 }, { "epoch": 0.001873, - "grad_norm": 1.191408395767212, + "grad_norm": 1.2100266218185425, "learning_rate": 1e-05, - "loss": 0.2827, + "loss": 0.2836, "step": 187300 }, { "epoch": 0.001874, - "grad_norm": 1.492838740348816, + "grad_norm": 1.548009991645813, "learning_rate": 1e-05, - "loss": 0.2833, + "loss": 0.2854, "step": 187400 }, { "epoch": 0.001875, - "grad_norm": 1.4533647298812866, + "grad_norm": 1.5316060781478882, "learning_rate": 1e-05, - "loss": 0.2854, + "loss": 0.288, "step": 187500 }, { "epoch": 0.001876, - "grad_norm": 1.4473025798797607, + "grad_norm": 1.5404126644134521, "learning_rate": 1e-05, - "loss": 0.2839, + "loss": 0.2892, "step": 187600 }, { "epoch": 0.001877, - "grad_norm": 1.4145002365112305, + "grad_norm": 1.592418909072876, "learning_rate": 1e-05, - "loss": 0.2776, + "loss": 0.2818, "step": 187700 }, { "epoch": 0.001878, - "grad_norm": 1.8289521932601929, + "grad_norm": 1.5386697053909302, "learning_rate": 1e-05, - "loss": 0.2796, + "loss": 0.2812, "step": 187800 }, { "epoch": 0.001879, - "grad_norm": 1.4247504472732544, + "grad_norm": 1.7977592945098877, "learning_rate": 1e-05, "loss": 0.2883, "step": 187900 }, { "epoch": 0.00188, - "grad_norm": 1.377250075340271, + "grad_norm": 1.4943695068359375, "learning_rate": 1e-05, - "loss": 0.2875, + "loss": 0.2924, "step": 188000 }, { "epoch": 0.001881, - "grad_norm": 1.7697957754135132, + "grad_norm": 1.7356852293014526, "learning_rate": 1e-05, - "loss": 0.2842, + "loss": 0.2861, "step": 188100 }, { "epoch": 0.001882, - "grad_norm": 1.312009572982788, + "grad_norm": 1.300524353981018, "learning_rate": 1e-05, - "loss": 0.2827, + "loss": 0.2871, "step": 188200 }, { "epoch": 0.001883, - "grad_norm": 1.6143733263015747, + "grad_norm": 1.5252922773361206, "learning_rate": 1e-05, - "loss": 0.2921, + "loss": 0.293, "step": 188300 }, { "epoch": 0.001884, - "grad_norm": 1.5691176652908325, + "grad_norm": 1.6379549503326416, "learning_rate": 1e-05, - "loss": 0.2883, + "loss": 0.2928, "step": 188400 }, { "epoch": 0.001885, - "grad_norm": 1.4667760133743286, + "grad_norm": 1.485962152481079, "learning_rate": 1e-05, - "loss": 0.2789, + "loss": 0.2797, "step": 188500 }, { "epoch": 0.001886, - "grad_norm": 1.4449518918991089, + "grad_norm": 1.5201560258865356, "learning_rate": 1e-05, - "loss": 0.2886, + "loss": 0.2906, "step": 188600 }, { "epoch": 0.001887, - "grad_norm": 1.618916630744934, + "grad_norm": 1.5450036525726318, "learning_rate": 1e-05, "loss": 0.2903, "step": 188700 }, { "epoch": 0.001888, - "grad_norm": 1.8084404468536377, + "grad_norm": 1.6390937566757202, "learning_rate": 1e-05, - "loss": 0.2812, + "loss": 0.2824, "step": 188800 }, { "epoch": 0.001889, - "grad_norm": 1.462833046913147, + "grad_norm": 1.4087883234024048, "learning_rate": 1e-05, - "loss": 0.2798, + "loss": 0.2825, "step": 188900 }, { "epoch": 0.00189, - "grad_norm": 1.7136729955673218, + "grad_norm": 1.5856794118881226, "learning_rate": 1e-05, - "loss": 0.2787, + "loss": 0.2836, "step": 189000 }, { "epoch": 0.001891, - "grad_norm": 1.4144402742385864, + "grad_norm": 1.4078369140625, "learning_rate": 1e-05, - "loss": 0.2749, + "loss": 0.2774, "step": 189100 }, { "epoch": 0.001892, - "grad_norm": 1.2183703184127808, + "grad_norm": 1.7400150299072266, "learning_rate": 1e-05, - "loss": 0.2828, + "loss": 0.2856, "step": 189200 }, { "epoch": 0.001893, - "grad_norm": 1.6140680313110352, + "grad_norm": 1.580092191696167, "learning_rate": 1e-05, - "loss": 0.2791, + "loss": 0.2798, "step": 189300 }, { "epoch": 0.001894, - "grad_norm": 1.5306097269058228, + "grad_norm": 1.6314994096755981, "learning_rate": 1e-05, - "loss": 0.2856, + "loss": 0.2876, "step": 189400 }, { "epoch": 0.001895, - "grad_norm": 1.2866476774215698, + "grad_norm": 1.33416748046875, "learning_rate": 1e-05, "loss": 0.2809, "step": 189500 }, { "epoch": 0.001896, - "grad_norm": 4.816654682159424, + "grad_norm": 1.512454628944397, "learning_rate": 1e-05, - "loss": 0.2755, + "loss": 0.2784, "step": 189600 }, { "epoch": 0.001897, - "grad_norm": 1.310422420501709, + "grad_norm": 1.4232842922210693, "learning_rate": 1e-05, - "loss": 0.284, + "loss": 0.2867, "step": 189700 }, { "epoch": 0.001898, - "grad_norm": 1.61820650100708, + "grad_norm": 1.3176745176315308, "learning_rate": 1e-05, - "loss": 0.2811, + "loss": 0.284, "step": 189800 }, { "epoch": 0.001899, - "grad_norm": 1.4744561910629272, + "grad_norm": 1.394158959388733, "learning_rate": 1e-05, - "loss": 0.2791, + "loss": 0.2833, "step": 189900 }, { "epoch": 0.0019, - "grad_norm": 1.3192812204360962, + "grad_norm": 1.4825711250305176, "learning_rate": 1e-05, - "loss": 0.2855, + "loss": 0.287, "step": 190000 }, { "epoch": 0.001901, - "grad_norm": 1.447884202003479, + "grad_norm": 1.6661640405654907, "learning_rate": 1e-05, - "loss": 0.278, + "loss": 0.2821, "step": 190100 }, { "epoch": 0.001902, - "grad_norm": 1.6728529930114746, + "grad_norm": 1.7882932424545288, "learning_rate": 1e-05, - "loss": 0.2783, + "loss": 0.2793, "step": 190200 }, { "epoch": 0.001903, - "grad_norm": 1.5528289079666138, + "grad_norm": 1.774411678314209, "learning_rate": 1e-05, - "loss": 0.29, + "loss": 0.2914, "step": 190300 }, { "epoch": 0.001904, - "grad_norm": 1.7119053602218628, + "grad_norm": 1.4190601110458374, "learning_rate": 1e-05, - "loss": 0.2829, + "loss": 0.285, "step": 190400 }, { "epoch": 0.001905, - "grad_norm": 1.560429334640503, + "grad_norm": 1.6410194635391235, "learning_rate": 1e-05, - "loss": 0.2816, + "loss": 0.2836, "step": 190500 }, { "epoch": 0.001906, - "grad_norm": 1.3440685272216797, + "grad_norm": 1.3235634565353394, "learning_rate": 1e-05, - "loss": 0.2818, + "loss": 0.2843, "step": 190600 }, { "epoch": 0.001907, - "grad_norm": 1.5559247732162476, + "grad_norm": 1.5994374752044678, "learning_rate": 1e-05, - "loss": 0.287, + "loss": 0.2911, "step": 190700 }, { "epoch": 0.001908, - "grad_norm": 1.880592703819275, + "grad_norm": 1.479262113571167, "learning_rate": 1e-05, - "loss": 0.2808, + "loss": 0.2851, "step": 190800 }, { "epoch": 0.001909, - "grad_norm": 1.5825804471969604, + "grad_norm": 1.8383204936981201, "learning_rate": 1e-05, - "loss": 0.2801, + "loss": 0.2861, "step": 190900 }, { "epoch": 0.00191, - "grad_norm": 1.7596060037612915, + "grad_norm": 1.7182331085205078, "learning_rate": 1e-05, - "loss": 0.285, + "loss": 0.2892, "step": 191000 }, { "epoch": 0.001911, - "grad_norm": 1.3567836284637451, + "grad_norm": 1.2352073192596436, "learning_rate": 1e-05, - "loss": 0.286, + "loss": 0.2882, "step": 191100 }, { "epoch": 0.001912, - "grad_norm": 1.8503133058547974, + "grad_norm": 1.6550449132919312, "learning_rate": 1e-05, - "loss": 0.2823, + "loss": 0.2857, "step": 191200 }, { "epoch": 0.001913, - "grad_norm": 1.7642414569854736, + "grad_norm": 1.4845826625823975, "learning_rate": 1e-05, - "loss": 0.2809, + "loss": 0.2815, "step": 191300 }, { "epoch": 0.001914, - "grad_norm": 1.4115045070648193, + "grad_norm": 1.3901734352111816, "learning_rate": 1e-05, - "loss": 0.2726, + "loss": 0.2775, "step": 191400 }, { "epoch": 0.001915, - "grad_norm": 1.3123126029968262, + "grad_norm": 1.410360336303711, "learning_rate": 1e-05, - "loss": 0.2825, + "loss": 0.2842, "step": 191500 }, { "epoch": 0.001916, - "grad_norm": 1.6739060878753662, + "grad_norm": 1.6031688451766968, "learning_rate": 1e-05, - "loss": 0.2781, + "loss": 0.2809, "step": 191600 }, { "epoch": 0.001917, - "grad_norm": 1.4738795757293701, + "grad_norm": 1.6306028366088867, "learning_rate": 1e-05, - "loss": 0.2812, + "loss": 0.2844, "step": 191700 }, { "epoch": 0.001918, - "grad_norm": 1.3301559686660767, + "grad_norm": 1.450675129890442, "learning_rate": 1e-05, - "loss": 0.2764, + "loss": 0.2808, "step": 191800 }, { "epoch": 0.001919, - "grad_norm": 1.5323306322097778, + "grad_norm": 1.4375156164169312, "learning_rate": 1e-05, - "loss": 0.2853, + "loss": 0.288, "step": 191900 }, { "epoch": 0.00192, - "grad_norm": 4.301568984985352, + "grad_norm": 1.5712558031082153, "learning_rate": 1e-05, - "loss": 0.2797, + "loss": 0.2827, "step": 192000 }, { "epoch": 0.001921, - "grad_norm": 1.520705223083496, + "grad_norm": 1.6321333646774292, "learning_rate": 1e-05, - "loss": 0.2822, + "loss": 0.2853, "step": 192100 }, { "epoch": 0.001922, - "grad_norm": 1.3320016860961914, + "grad_norm": 1.3940213918685913, "learning_rate": 1e-05, - "loss": 0.2848, + "loss": 0.2868, "step": 192200 }, { "epoch": 0.001923, - "grad_norm": 1.5260173082351685, + "grad_norm": 1.506698489189148, "learning_rate": 1e-05, - "loss": 0.2843, + "loss": 0.2892, "step": 192300 }, { "epoch": 0.001924, - "grad_norm": 1.3168988227844238, + "grad_norm": 1.4711730480194092, "learning_rate": 1e-05, - "loss": 0.2862, + "loss": 0.2865, "step": 192400 }, { "epoch": 0.001925, - "grad_norm": 1.4016045331954956, + "grad_norm": 1.4492762088775635, "learning_rate": 1e-05, - "loss": 0.2842, + "loss": 0.2858, "step": 192500 }, { "epoch": 0.001926, - "grad_norm": 1.539643406867981, + "grad_norm": 1.5630435943603516, "learning_rate": 1e-05, - "loss": 0.2832, + "loss": 0.285, "step": 192600 }, { "epoch": 0.001927, - "grad_norm": 1.4450684785842896, + "grad_norm": 1.4810446500778198, "learning_rate": 1e-05, - "loss": 0.2848, + "loss": 0.283, "step": 192700 }, { "epoch": 0.001928, - "grad_norm": 1.4840152263641357, + "grad_norm": 1.5246132612228394, "learning_rate": 1e-05, - "loss": 0.2795, + "loss": 0.2844, "step": 192800 }, { "epoch": 0.001929, - "grad_norm": 1.4051638841629028, + "grad_norm": 1.7029883861541748, "learning_rate": 1e-05, - "loss": 0.2768, + "loss": 0.28, "step": 192900 }, { "epoch": 0.00193, - "grad_norm": 1.6651637554168701, + "grad_norm": 1.930101990699768, "learning_rate": 1e-05, - "loss": 0.2806, + "loss": 0.28, "step": 193000 }, { "epoch": 0.001931, - "grad_norm": 1.4165291786193848, + "grad_norm": 2.439939260482788, "learning_rate": 1e-05, - "loss": 0.2804, + "loss": 0.284, "step": 193100 }, { "epoch": 0.001932, - "grad_norm": 1.3136743307113647, + "grad_norm": 1.4808944463729858, "learning_rate": 1e-05, - "loss": 0.2783, + "loss": 0.2805, "step": 193200 }, { "epoch": 0.001933, - "grad_norm": 1.4304327964782715, + "grad_norm": 1.3932912349700928, "learning_rate": 1e-05, - "loss": 0.2754, + "loss": 0.2787, "step": 193300 }, { "epoch": 0.001934, - "grad_norm": 1.3229684829711914, + "grad_norm": 1.4781297445297241, "learning_rate": 1e-05, - "loss": 0.2748, + "loss": 0.2774, "step": 193400 }, { "epoch": 0.001935, - "grad_norm": 2.029120683670044, + "grad_norm": 1.4157606363296509, "learning_rate": 1e-05, - "loss": 0.2806, + "loss": 0.2855, "step": 193500 }, { "epoch": 0.001936, - "grad_norm": 1.4201704263687134, + "grad_norm": 1.5318036079406738, "learning_rate": 1e-05, - "loss": 0.2784, + "loss": 0.281, "step": 193600 }, { "epoch": 0.001937, - "grad_norm": 1.5672061443328857, + "grad_norm": 1.4803863763809204, "learning_rate": 1e-05, - "loss": 0.2777, + "loss": 0.2792, "step": 193700 }, { "epoch": 0.001938, - "grad_norm": 1.4912543296813965, + "grad_norm": 1.4421052932739258, "learning_rate": 1e-05, - "loss": 0.2869, + "loss": 0.2888, "step": 193800 }, { "epoch": 0.001939, - "grad_norm": 1.218873143196106, + "grad_norm": 1.336422085762024, "learning_rate": 1e-05, - "loss": 0.2761, + "loss": 0.2798, "step": 193900 }, { "epoch": 0.00194, - "grad_norm": 1.6425585746765137, + "grad_norm": 1.4723069667816162, "learning_rate": 1e-05, - "loss": 0.2823, + "loss": 0.2826, "step": 194000 }, { "epoch": 0.001941, - "grad_norm": 1.4207195043563843, + "grad_norm": 1.549203634262085, "learning_rate": 1e-05, - "loss": 0.2778, + "loss": 0.2804, "step": 194100 }, { "epoch": 0.001942, - "grad_norm": 2.1050803661346436, + "grad_norm": 1.5718244314193726, "learning_rate": 1e-05, - "loss": 0.2729, + "loss": 0.2773, "step": 194200 }, { "epoch": 0.001943, - "grad_norm": 1.3131808042526245, + "grad_norm": 1.4759576320648193, "learning_rate": 1e-05, - "loss": 0.2745, + "loss": 0.2773, "step": 194300 }, { "epoch": 0.001944, - "grad_norm": 1.3653678894042969, + "grad_norm": 1.3965938091278076, "learning_rate": 1e-05, - "loss": 0.2772, + "loss": 0.2771, "step": 194400 }, { "epoch": 0.001945, - "grad_norm": 1.2917543649673462, + "grad_norm": 1.3787996768951416, "learning_rate": 1e-05, - "loss": 0.2796, + "loss": 0.2806, "step": 194500 }, { "epoch": 0.001946, - "grad_norm": 1.304482340812683, + "grad_norm": 1.3691178560256958, "learning_rate": 1e-05, - "loss": 0.2901, + "loss": 0.2892, "step": 194600 }, { "epoch": 0.001947, - "grad_norm": 1.5340479612350464, + "grad_norm": 1.4109934568405151, "learning_rate": 1e-05, - "loss": 0.2857, + "loss": 0.2902, "step": 194700 }, { "epoch": 0.001948, - "grad_norm": 1.379583477973938, + "grad_norm": 1.4349749088287354, "learning_rate": 1e-05, - "loss": 0.2807, + "loss": 0.2852, "step": 194800 }, { "epoch": 0.001949, - "grad_norm": 1.5749056339263916, + "grad_norm": 1.3615974187850952, "learning_rate": 1e-05, - "loss": 0.2793, + "loss": 0.2808, "step": 194900 }, { "epoch": 0.00195, - "grad_norm": 1.464102029800415, + "grad_norm": 1.5208380222320557, "learning_rate": 1e-05, - "loss": 0.2778, + "loss": 0.2804, "step": 195000 }, { "epoch": 0.001951, - "grad_norm": 1.5343683958053589, + "grad_norm": 1.5616921186447144, "learning_rate": 1e-05, - "loss": 0.2758, + "loss": 0.2793, "step": 195100 }, { "epoch": 0.001952, - "grad_norm": 1.4226011037826538, + "grad_norm": 1.490240454673767, "learning_rate": 1e-05, - "loss": 0.2814, + "loss": 0.2835, "step": 195200 }, { "epoch": 0.001953, - "grad_norm": 1.4280790090560913, + "grad_norm": 1.4141552448272705, "learning_rate": 1e-05, - "loss": 0.2783, + "loss": 0.2841, "step": 195300 }, { "epoch": 0.001954, - "grad_norm": 1.2915691137313843, + "grad_norm": 1.4841254949569702, "learning_rate": 1e-05, - "loss": 0.2849, + "loss": 0.2899, "step": 195400 }, { "epoch": 0.001955, - "grad_norm": 1.366072416305542, + "grad_norm": 1.3822132349014282, "learning_rate": 1e-05, - "loss": 0.284, + "loss": 0.2873, "step": 195500 }, { "epoch": 0.001956, - "grad_norm": 1.4957631826400757, + "grad_norm": 1.5400711297988892, "learning_rate": 1e-05, - "loss": 0.2765, + "loss": 0.2816, "step": 195600 }, { "epoch": 0.001957, - "grad_norm": 1.3651175498962402, + "grad_norm": 3.026294708251953, "learning_rate": 1e-05, - "loss": 0.28, + "loss": 0.2818, "step": 195700 }, { "epoch": 0.001958, - "grad_norm": 1.3223210573196411, + "grad_norm": 1.5581517219543457, "learning_rate": 1e-05, - "loss": 0.2848, + "loss": 0.2866, "step": 195800 }, { "epoch": 0.001959, - "grad_norm": 1.370012640953064, + "grad_norm": 1.292336106300354, "learning_rate": 1e-05, - "loss": 0.2791, + "loss": 0.2807, "step": 195900 }, { "epoch": 0.00196, - "grad_norm": 1.228473424911499, + "grad_norm": 1.3840731382369995, "learning_rate": 1e-05, - "loss": 0.2779, + "loss": 0.2799, "step": 196000 }, { "epoch": 0.001961, - "grad_norm": 1.4302570819854736, + "grad_norm": 1.82817542552948, "learning_rate": 1e-05, - "loss": 0.2838, + "loss": 0.2871, "step": 196100 }, { "epoch": 0.001962, - "grad_norm": 1.7938988208770752, + "grad_norm": 2.580714702606201, "learning_rate": 1e-05, - "loss": 0.2716, + "loss": 0.2752, "step": 196200 }, { "epoch": 0.001963, - "grad_norm": 1.404895544052124, + "grad_norm": 1.8855743408203125, "learning_rate": 1e-05, - "loss": 0.2777, + "loss": 0.2822, "step": 196300 }, { "epoch": 0.001964, - "grad_norm": 1.5656688213348389, + "grad_norm": 1.3740893602371216, "learning_rate": 1e-05, - "loss": 0.2796, + "loss": 0.2811, "step": 196400 }, { "epoch": 0.001965, - "grad_norm": 1.3794924020767212, + "grad_norm": 1.2532157897949219, "learning_rate": 1e-05, - "loss": 0.2865, + "loss": 0.2889, "step": 196500 }, { "epoch": 0.001966, - "grad_norm": 1.7276804447174072, + "grad_norm": 1.313609004020691, "learning_rate": 1e-05, - "loss": 0.2779, + "loss": 0.2818, "step": 196600 }, { "epoch": 0.001967, - "grad_norm": 1.2390432357788086, + "grad_norm": 1.326478123664856, "learning_rate": 1e-05, - "loss": 0.285, + "loss": 0.287, "step": 196700 }, { "epoch": 0.001968, - "grad_norm": 1.5375555753707886, + "grad_norm": 1.5435999631881714, "learning_rate": 1e-05, - "loss": 0.2821, + "loss": 0.2848, "step": 196800 }, { "epoch": 0.001969, - "grad_norm": 1.4694851636886597, + "grad_norm": 1.5859767198562622, "learning_rate": 1e-05, - "loss": 0.2792, + "loss": 0.282, "step": 196900 }, { "epoch": 0.00197, - "grad_norm": 1.3612979650497437, + "grad_norm": 1.3006385564804077, "learning_rate": 1e-05, - "loss": 0.2794, + "loss": 0.2793, "step": 197000 }, { "epoch": 0.001971, - "grad_norm": 1.1596492528915405, + "grad_norm": 1.2609304189682007, "learning_rate": 1e-05, - "loss": 0.276, + "loss": 0.2791, "step": 197100 }, { "epoch": 0.001972, - "grad_norm": 1.2629233598709106, + "grad_norm": 1.4160981178283691, "learning_rate": 1e-05, - "loss": 0.2793, + "loss": 0.2832, "step": 197200 }, { "epoch": 0.001973, - "grad_norm": 1.4659279584884644, + "grad_norm": 2.234437942504883, "learning_rate": 1e-05, - "loss": 0.2839, + "loss": 0.2852, "step": 197300 }, { "epoch": 0.001974, - "grad_norm": 1.3676152229309082, + "grad_norm": 1.295417070388794, "learning_rate": 1e-05, - "loss": 0.2823, + "loss": 0.2825, "step": 197400 }, { "epoch": 0.001975, - "grad_norm": 1.3299367427825928, + "grad_norm": 1.227103352546692, "learning_rate": 1e-05, - "loss": 0.2772, + "loss": 0.2787, "step": 197500 }, { "epoch": 0.001976, - "grad_norm": 1.3246991634368896, + "grad_norm": 1.5311518907546997, "learning_rate": 1e-05, - "loss": 0.2792, + "loss": 0.2807, "step": 197600 }, { "epoch": 0.001977, - "grad_norm": 1.6460968255996704, + "grad_norm": 1.3764680624008179, "learning_rate": 1e-05, - "loss": 0.2774, + "loss": 0.2772, "step": 197700 }, { "epoch": 0.001978, - "grad_norm": 1.5189824104309082, + "grad_norm": 1.43855619430542, "learning_rate": 1e-05, - "loss": 0.2807, + "loss": 0.2806, "step": 197800 }, { "epoch": 0.001979, - "grad_norm": 1.4497886896133423, + "grad_norm": 1.5360887050628662, "learning_rate": 1e-05, - "loss": 0.2736, + "loss": 0.2774, "step": 197900 }, { "epoch": 0.00198, - "grad_norm": 1.2332127094268799, + "grad_norm": 1.2137932777404785, "learning_rate": 1e-05, - "loss": 0.2811, + "loss": 0.2833, "step": 198000 }, { "epoch": 0.001981, - "grad_norm": 2.433859348297119, + "grad_norm": 1.5310431718826294, "learning_rate": 1e-05, - "loss": 0.2775, + "loss": 0.2795, "step": 198100 }, { "epoch": 0.001982, - "grad_norm": 1.4312340021133423, + "grad_norm": 1.6440815925598145, "learning_rate": 1e-05, - "loss": 0.2787, + "loss": 0.2814, "step": 198200 }, { "epoch": 0.001983, - "grad_norm": 2.0906150341033936, + "grad_norm": 1.8036118745803833, "learning_rate": 1e-05, - "loss": 0.2816, + "loss": 0.2835, "step": 198300 }, { "epoch": 0.001984, - "grad_norm": 1.3633174896240234, + "grad_norm": 1.3439860343933105, "learning_rate": 1e-05, - "loss": 0.2787, + "loss": 0.2829, "step": 198400 }, { "epoch": 0.001985, - "grad_norm": 1.3862175941467285, + "grad_norm": 1.4137046337127686, "learning_rate": 1e-05, - "loss": 0.2766, + "loss": 0.2814, "step": 198500 }, { "epoch": 0.001986, - "grad_norm": 1.452214241027832, + "grad_norm": 1.4916012287139893, "learning_rate": 1e-05, - "loss": 0.2745, + "loss": 0.277, "step": 198600 }, { "epoch": 0.001987, - "grad_norm": 1.5617133378982544, + "grad_norm": 1.4562140703201294, "learning_rate": 1e-05, - "loss": 0.2848, + "loss": 0.2866, "step": 198700 }, { "epoch": 0.001988, - "grad_norm": 1.4280415773391724, + "grad_norm": 1.6629130840301514, "learning_rate": 1e-05, - "loss": 0.2765, + "loss": 0.2775, "step": 198800 }, { "epoch": 0.001989, - "grad_norm": 1.5873243808746338, + "grad_norm": 1.524603247642517, "learning_rate": 1e-05, - "loss": 0.2767, + "loss": 0.2808, "step": 198900 }, { "epoch": 0.00199, - "grad_norm": 1.6872470378875732, + "grad_norm": 2.1545159816741943, "learning_rate": 1e-05, - "loss": 0.2738, + "loss": 0.2791, "step": 199000 }, { "epoch": 0.001991, - "grad_norm": 1.3586468696594238, + "grad_norm": 1.3595882654190063, "learning_rate": 1e-05, - "loss": 0.2787, + "loss": 0.2845, "step": 199100 }, { "epoch": 0.001992, - "grad_norm": 1.3425086736679077, + "grad_norm": 1.4160555601119995, "learning_rate": 1e-05, - "loss": 0.2769, + "loss": 0.2779, "step": 199200 }, { "epoch": 0.001993, - "grad_norm": 1.635860562324524, + "grad_norm": 1.612523078918457, "learning_rate": 1e-05, - "loss": 0.2863, + "loss": 0.2878, "step": 199300 }, { "epoch": 0.001994, - "grad_norm": 1.5790388584136963, + "grad_norm": 1.4364423751831055, "learning_rate": 1e-05, - "loss": 0.2823, + "loss": 0.2813, "step": 199400 }, { "epoch": 0.001995, - "grad_norm": 1.4874083995819092, + "grad_norm": 1.47967529296875, "learning_rate": 1e-05, - "loss": 0.2828, + "loss": 0.2841, "step": 199500 }, { "epoch": 0.001996, - "grad_norm": 1.6232818365097046, + "grad_norm": 1.621627688407898, "learning_rate": 1e-05, - "loss": 0.2726, + "loss": 0.2765, "step": 199600 }, { "epoch": 0.001997, - "grad_norm": 1.4796119928359985, + "grad_norm": 1.3066225051879883, "learning_rate": 1e-05, - "loss": 0.2788, + "loss": 0.2828, "step": 199700 }, { "epoch": 0.001998, - "grad_norm": 1.5076864957809448, + "grad_norm": 1.614108681678772, "learning_rate": 1e-05, - "loss": 0.279, + "loss": 0.2808, "step": 199800 }, { "epoch": 0.001999, - "grad_norm": 1.5838216543197632, + "grad_norm": 1.457350730895996, "learning_rate": 1e-05, - "loss": 0.2798, + "loss": 0.2815, "step": 199900 }, { "epoch": 0.002, - "grad_norm": 1.4970403909683228, + "grad_norm": 1.6253563165664673, "learning_rate": 1e-05, - "loss": 0.2778, + "loss": 0.2792, "step": 200000 }, { "epoch": 0.002, - "eval_loss": 0.25390625, - "eval_runtime": 101.8586, - "eval_samples_per_second": 490.877, - "eval_steps_per_second": 30.68, + "eval_loss": 0.256103515625, + "eval_runtime": 115.1679, + "eval_samples_per_second": 434.149, + "eval_steps_per_second": 27.134, "step": 200000 }, { "epoch": 0.002001, - "grad_norm": 1.4980673789978027, + "grad_norm": 1.5363661050796509, "learning_rate": 1e-05, - "loss": 0.2744, + "loss": 0.2754, "step": 200100 }, { "epoch": 0.002002, - "grad_norm": 1.8671051263809204, + "grad_norm": 1.5348174571990967, "learning_rate": 1e-05, - "loss": 0.2826, + "loss": 0.2851, "step": 200200 }, { "epoch": 0.002003, - "grad_norm": 1.4163048267364502, + "grad_norm": 1.4772671461105347, "learning_rate": 1e-05, - "loss": 0.2765, + "loss": 0.279, "step": 200300 }, { "epoch": 0.002004, - "grad_norm": 1.3866466283798218, + "grad_norm": 1.5636744499206543, "learning_rate": 1e-05, - "loss": 0.2836, + "loss": 0.2862, "step": 200400 }, { "epoch": 0.002005, - "grad_norm": 1.451857328414917, + "grad_norm": 1.8320351839065552, "learning_rate": 1e-05, - "loss": 0.2778, + "loss": 0.279, "step": 200500 }, { "epoch": 0.002006, - "grad_norm": 1.4547383785247803, + "grad_norm": 1.7066452503204346, "learning_rate": 1e-05, - "loss": 0.2756, + "loss": 0.2757, "step": 200600 }, { "epoch": 0.002007, - "grad_norm": 1.370554804801941, + "grad_norm": 1.3273617029190063, "learning_rate": 1e-05, - "loss": 0.2806, + "loss": 0.2815, "step": 200700 }, { "epoch": 0.002008, - "grad_norm": 1.8153938055038452, + "grad_norm": 1.492756724357605, "learning_rate": 1e-05, - "loss": 0.2765, + "loss": 0.2779, "step": 200800 }, { "epoch": 0.002009, - "grad_norm": 1.943874478340149, + "grad_norm": 1.5056836605072021, "learning_rate": 1e-05, - "loss": 0.2774, + "loss": 0.2805, "step": 200900 }, { "epoch": 0.00201, - "grad_norm": 1.5531177520751953, + "grad_norm": 1.504055142402649, "learning_rate": 1e-05, - "loss": 0.2708, + "loss": 0.2725, "step": 201000 }, { "epoch": 0.002011, - "grad_norm": 1.3408933877944946, + "grad_norm": 1.424229621887207, "learning_rate": 1e-05, - "loss": 0.274, + "loss": 0.2768, "step": 201100 }, { "epoch": 0.002012, - "grad_norm": 1.5348567962646484, + "grad_norm": 1.6440435647964478, "learning_rate": 1e-05, - "loss": 0.2778, + "loss": 0.2776, "step": 201200 }, { "epoch": 0.002013, - "grad_norm": 1.4430639743804932, + "grad_norm": 1.4276853799819946, "learning_rate": 1e-05, - "loss": 0.2698, + "loss": 0.2737, "step": 201300 }, { "epoch": 0.002014, - "grad_norm": 1.4272840023040771, + "grad_norm": 1.479280948638916, "learning_rate": 1e-05, - "loss": 0.2732, + "loss": 0.2772, "step": 201400 }, { "epoch": 0.002015, - "grad_norm": 1.6337707042694092, + "grad_norm": 1.7645676136016846, "learning_rate": 1e-05, - "loss": 0.2757, + "loss": 0.2779, "step": 201500 }, { "epoch": 0.002016, - "grad_norm": 1.4537687301635742, + "grad_norm": 1.7327306270599365, "learning_rate": 1e-05, - "loss": 0.2785, + "loss": 0.2809, "step": 201600 }, { "epoch": 0.002017, - "grad_norm": 1.42153000831604, + "grad_norm": 1.4628037214279175, "learning_rate": 1e-05, - "loss": 0.278, + "loss": 0.2815, "step": 201700 }, { "epoch": 0.002018, - "grad_norm": 1.1965439319610596, + "grad_norm": 1.2225489616394043, "learning_rate": 1e-05, - "loss": 0.2839, + "loss": 0.286, "step": 201800 }, { "epoch": 0.002019, - "grad_norm": 1.5051965713500977, + "grad_norm": 1.4493396282196045, "learning_rate": 1e-05, - "loss": 0.2755, + "loss": 0.2781, "step": 201900 }, { "epoch": 0.00202, - "grad_norm": 1.2453120946884155, + "grad_norm": 1.1203590631484985, "learning_rate": 1e-05, - "loss": 0.2794, + "loss": 0.2782, "step": 202000 }, { "epoch": 0.002021, - "grad_norm": 1.2896701097488403, + "grad_norm": 1.4502707719802856, "learning_rate": 1e-05, - "loss": 0.2751, + "loss": 0.277, "step": 202100 }, { "epoch": 0.002022, - "grad_norm": 1.5587193965911865, + "grad_norm": 1.7395519018173218, "learning_rate": 1e-05, - "loss": 0.2821, + "loss": 0.286, "step": 202200 }, { "epoch": 0.002023, - "grad_norm": 1.5537805557250977, + "grad_norm": 1.9898625612258911, "learning_rate": 1e-05, - "loss": 0.2708, + "loss": 0.2721, "step": 202300 }, { "epoch": 0.002024, - "grad_norm": 1.933854579925537, + "grad_norm": 1.6557375192642212, "learning_rate": 1e-05, - "loss": 0.2721, + "loss": 0.2752, "step": 202400 }, { "epoch": 0.002025, - "grad_norm": 1.3421093225479126, + "grad_norm": 1.4543732404708862, "learning_rate": 1e-05, - "loss": 0.2858, + "loss": 0.2872, "step": 202500 }, { "epoch": 0.002026, - "grad_norm": 1.5518301725387573, + "grad_norm": 1.44635009765625, "learning_rate": 1e-05, - "loss": 0.2741, + "loss": 0.2745, "step": 202600 }, { "epoch": 0.002027, - "grad_norm": 1.3805172443389893, + "grad_norm": 1.3892033100128174, "learning_rate": 1e-05, - "loss": 0.2801, + "loss": 0.2804, "step": 202700 }, { "epoch": 0.002028, - "grad_norm": 1.201749324798584, + "grad_norm": 1.2940878868103027, "learning_rate": 1e-05, - "loss": 0.2729, + "loss": 0.2778, "step": 202800 }, { "epoch": 0.002029, - "grad_norm": 1.399541974067688, + "grad_norm": 2.028639554977417, "learning_rate": 1e-05, - "loss": 0.2685, + "loss": 0.2741, "step": 202900 }, { "epoch": 0.00203, - "grad_norm": 1.3271256685256958, + "grad_norm": 1.462415099143982, "learning_rate": 1e-05, - "loss": 0.2729, + "loss": 0.2775, "step": 203000 }, { "epoch": 0.002031, - "grad_norm": 1.5752711296081543, + "grad_norm": 1.60899019241333, "learning_rate": 1e-05, - "loss": 0.2744, + "loss": 0.2764, "step": 203100 }, { "epoch": 0.002032, - "grad_norm": 1.4272950887680054, + "grad_norm": 1.5385807752609253, "learning_rate": 1e-05, - "loss": 0.2791, + "loss": 0.2794, "step": 203200 }, { "epoch": 0.002033, - "grad_norm": 1.4877036809921265, + "grad_norm": 1.3882631063461304, "learning_rate": 1e-05, - "loss": 0.2707, + "loss": 0.2736, "step": 203300 }, { "epoch": 0.002034, - "grad_norm": 1.400262475013733, + "grad_norm": 1.2531319856643677, "learning_rate": 1e-05, - "loss": 0.2824, + "loss": 0.2848, "step": 203400 }, { "epoch": 0.002035, - "grad_norm": 1.2589569091796875, + "grad_norm": 1.621891736984253, "learning_rate": 1e-05, - "loss": 0.2762, + "loss": 0.2773, "step": 203500 }, { "epoch": 0.002036, - "grad_norm": 1.7342783212661743, + "grad_norm": 1.3949509859085083, "learning_rate": 1e-05, - "loss": 0.276, + "loss": 0.2804, "step": 203600 }, { "epoch": 0.002037, - "grad_norm": 1.4136003255844116, + "grad_norm": 1.3484961986541748, "learning_rate": 1e-05, - "loss": 0.2757, + "loss": 0.2785, "step": 203700 }, { "epoch": 0.002038, - "grad_norm": 1.4067256450653076, + "grad_norm": 1.524052381515503, "learning_rate": 1e-05, - "loss": 0.2821, + "loss": 0.2817, "step": 203800 }, { "epoch": 0.002039, - "grad_norm": 1.5178284645080566, + "grad_norm": 1.4757261276245117, "learning_rate": 1e-05, - "loss": 0.273, + "loss": 0.2786, "step": 203900 }, { "epoch": 0.00204, - "grad_norm": 1.6708519458770752, + "grad_norm": 1.7308425903320312, "learning_rate": 1e-05, - "loss": 0.2727, + "loss": 0.2765, "step": 204000 }, { "epoch": 0.002041, - "grad_norm": 1.4715983867645264, + "grad_norm": 1.433393120765686, "learning_rate": 1e-05, - "loss": 0.271, + "loss": 0.2745, "step": 204100 }, { "epoch": 0.002042, - "grad_norm": 1.3123623132705688, + "grad_norm": 1.398602843284607, "learning_rate": 1e-05, - "loss": 0.2767, + "loss": 0.278, "step": 204200 }, { "epoch": 0.002043, - "grad_norm": 1.7161884307861328, + "grad_norm": 1.6153137683868408, "learning_rate": 1e-05, - "loss": 0.2797, + "loss": 0.2814, "step": 204300 }, { "epoch": 0.002044, - "grad_norm": 1.6859389543533325, + "grad_norm": 1.516076683998108, "learning_rate": 1e-05, - "loss": 0.2698, + "loss": 0.2709, "step": 204400 }, { "epoch": 0.002045, - "grad_norm": 1.144152045249939, + "grad_norm": 1.6530369520187378, "learning_rate": 1e-05, - "loss": 0.2781, + "loss": 0.2824, "step": 204500 }, { "epoch": 0.002046, - "grad_norm": 1.3815115690231323, + "grad_norm": 1.4678118228912354, "learning_rate": 1e-05, - "loss": 0.2742, + "loss": 0.2754, "step": 204600 }, { "epoch": 0.002047, - "grad_norm": 1.492659568786621, + "grad_norm": 1.64580237865448, "learning_rate": 1e-05, - "loss": 0.2777, + "loss": 0.2796, "step": 204700 }, { "epoch": 0.002048, - "grad_norm": 1.6630390882492065, + "grad_norm": 1.6422346830368042, "learning_rate": 1e-05, - "loss": 0.2738, + "loss": 0.2742, "step": 204800 }, { "epoch": 0.002049, - "grad_norm": 1.5455238819122314, + "grad_norm": 1.5554474592208862, "learning_rate": 1e-05, - "loss": 0.2821, + "loss": 0.2838, "step": 204900 }, { "epoch": 0.00205, - "grad_norm": 1.4743176698684692, + "grad_norm": 1.4247395992279053, "learning_rate": 1e-05, - "loss": 0.2752, + "loss": 0.2788, "step": 205000 }, { "epoch": 0.002051, - "grad_norm": 1.5893384218215942, + "grad_norm": 3.377981662750244, "learning_rate": 1e-05, - "loss": 0.2734, + "loss": 0.2739, "step": 205100 }, { "epoch": 0.002052, - "grad_norm": 1.578606128692627, + "grad_norm": 1.668649435043335, "learning_rate": 1e-05, - "loss": 0.2784, + "loss": 0.2817, "step": 205200 }, { "epoch": 0.002053, - "grad_norm": 1.3604061603546143, + "grad_norm": 1.3109833002090454, "learning_rate": 1e-05, - "loss": 0.2784, + "loss": 0.281, "step": 205300 }, { "epoch": 0.002054, - "grad_norm": 1.2492297887802124, + "grad_norm": 1.3675384521484375, "learning_rate": 1e-05, - "loss": 0.2673, + "loss": 0.2686, "step": 205400 }, { "epoch": 0.002055, - "grad_norm": 1.440869688987732, + "grad_norm": 1.542527437210083, "learning_rate": 1e-05, - "loss": 0.2667, + "loss": 0.2709, "step": 205500 }, { "epoch": 0.002056, - "grad_norm": 1.3074662685394287, + "grad_norm": 1.3900882005691528, "learning_rate": 1e-05, - "loss": 0.2835, + "loss": 0.2841, "step": 205600 }, { "epoch": 0.002057, - "grad_norm": 1.417528510093689, + "grad_norm": 1.5033669471740723, "learning_rate": 1e-05, - "loss": 0.2746, + "loss": 0.2798, "step": 205700 }, { "epoch": 0.002058, - "grad_norm": 1.5445703268051147, + "grad_norm": 1.638098120689392, "learning_rate": 1e-05, - "loss": 0.2725, + "loss": 0.275, "step": 205800 }, { "epoch": 0.002059, - "grad_norm": 1.2117547988891602, + "grad_norm": 1.4195244312286377, "learning_rate": 1e-05, - "loss": 0.2745, + "loss": 0.2781, "step": 205900 }, { "epoch": 0.00206, - "grad_norm": 2.358872652053833, + "grad_norm": 1.5741572380065918, "learning_rate": 1e-05, - "loss": 0.2775, + "loss": 0.2811, "step": 206000 }, { "epoch": 0.002061, - "grad_norm": 1.3709622621536255, + "grad_norm": 1.4734532833099365, "learning_rate": 1e-05, - "loss": 0.2734, + "loss": 0.2757, "step": 206100 }, { "epoch": 0.002062, - "grad_norm": 1.9914414882659912, + "grad_norm": 1.4549283981323242, "learning_rate": 1e-05, - "loss": 0.277, + "loss": 0.2797, "step": 206200 }, { "epoch": 0.002063, - "grad_norm": 1.350724220275879, + "grad_norm": 1.3052780628204346, "learning_rate": 1e-05, - "loss": 0.2777, + "loss": 0.2819, "step": 206300 }, { "epoch": 0.002064, - "grad_norm": 1.228498101234436, + "grad_norm": 1.349663257598877, "learning_rate": 1e-05, - "loss": 0.2712, + "loss": 0.2726, "step": 206400 }, { "epoch": 0.002065, - "grad_norm": 1.4231927394866943, + "grad_norm": 1.477702021598816, "learning_rate": 1e-05, - "loss": 0.2766, + "loss": 0.278, "step": 206500 }, { "epoch": 0.002066, - "grad_norm": 1.3211643695831299, + "grad_norm": 1.4143027067184448, "learning_rate": 1e-05, - "loss": 0.2761, + "loss": 0.2789, "step": 206600 }, { "epoch": 0.002067, - "grad_norm": 1.5264800786972046, + "grad_norm": 1.456390380859375, "learning_rate": 1e-05, - "loss": 0.2717, + "loss": 0.2764, "step": 206700 }, { "epoch": 0.002068, - "grad_norm": 1.4224491119384766, + "grad_norm": 1.343540906906128, "learning_rate": 1e-05, - "loss": 0.2757, + "loss": 0.2773, "step": 206800 }, { "epoch": 0.002069, - "grad_norm": 1.4214329719543457, + "grad_norm": 1.3482036590576172, "learning_rate": 1e-05, - "loss": 0.2702, + "loss": 0.2727, "step": 206900 }, { "epoch": 0.00207, - "grad_norm": 1.6917356252670288, + "grad_norm": 1.5749857425689697, "learning_rate": 1e-05, - "loss": 0.275, + "loss": 0.2752, "step": 207000 }, { "epoch": 0.002071, - "grad_norm": 1.4668591022491455, + "grad_norm": 1.458197832107544, "learning_rate": 1e-05, - "loss": 0.2773, + "loss": 0.2803, "step": 207100 }, { "epoch": 0.002072, - "grad_norm": 1.5522023439407349, + "grad_norm": 1.5353132486343384, "learning_rate": 1e-05, - "loss": 0.2676, + "loss": 0.2698, "step": 207200 }, { "epoch": 0.002073, - "grad_norm": 1.5233978033065796, + "grad_norm": 1.5403767824172974, "learning_rate": 1e-05, - "loss": 0.2762, + "loss": 0.2768, "step": 207300 }, { "epoch": 0.002074, - "grad_norm": 1.3931946754455566, + "grad_norm": 1.4076712131500244, "learning_rate": 1e-05, - "loss": 0.2748, + "loss": 0.2783, "step": 207400 }, { "epoch": 0.002075, - "grad_norm": 1.64429771900177, + "grad_norm": 1.5805490016937256, "learning_rate": 1e-05, - "loss": 0.2757, + "loss": 0.2782, "step": 207500 }, { "epoch": 0.002076, - "grad_norm": 1.365023136138916, + "grad_norm": 1.342297911643982, "learning_rate": 1e-05, - "loss": 0.2798, + "loss": 0.2791, "step": 207600 }, { "epoch": 0.002077, - "grad_norm": 1.397874116897583, + "grad_norm": 1.5067929029464722, "learning_rate": 1e-05, - "loss": 0.2742, + "loss": 0.2768, "step": 207700 }, { "epoch": 0.002078, - "grad_norm": 1.0273443460464478, + "grad_norm": 1.2661635875701904, "learning_rate": 1e-05, - "loss": 0.2715, + "loss": 0.2719, "step": 207800 }, { "epoch": 0.002079, - "grad_norm": 1.2967342138290405, + "grad_norm": 1.2550525665283203, "learning_rate": 1e-05, - "loss": 0.272, + "loss": 0.2734, "step": 207900 }, { "epoch": 0.00208, - "grad_norm": 1.5307880640029907, + "grad_norm": 1.478445053100586, "learning_rate": 1e-05, - "loss": 0.2712, + "loss": 0.2722, "step": 208000 }, { "epoch": 0.002081, - "grad_norm": 1.3682029247283936, + "grad_norm": 1.4284790754318237, "learning_rate": 1e-05, - "loss": 0.2675, + "loss": 0.2698, "step": 208100 }, { "epoch": 0.002082, - "grad_norm": 1.3451956510543823, + "grad_norm": 1.6922401189804077, "learning_rate": 1e-05, - "loss": 0.2783, + "loss": 0.2813, "step": 208200 }, { "epoch": 0.002083, - "grad_norm": 1.4379639625549316, + "grad_norm": 1.50126314163208, "learning_rate": 1e-05, - "loss": 0.2746, + "loss": 0.2797, "step": 208300 }, { "epoch": 0.002084, - "grad_norm": 1.3421787023544312, + "grad_norm": 1.4059314727783203, "learning_rate": 1e-05, - "loss": 0.2733, + "loss": 0.2759, "step": 208400 }, { "epoch": 0.002085, - "grad_norm": 1.6314587593078613, + "grad_norm": 1.5741177797317505, "learning_rate": 1e-05, - "loss": 0.2729, + "loss": 0.2758, "step": 208500 }, { "epoch": 0.002086, - "grad_norm": 1.5723602771759033, + "grad_norm": 1.491023063659668, "learning_rate": 1e-05, - "loss": 0.2778, + "loss": 0.2769, "step": 208600 }, { "epoch": 0.002087, - "grad_norm": 1.4296612739562988, + "grad_norm": 1.4003244638442993, "learning_rate": 1e-05, - "loss": 0.2694, + "loss": 0.2726, "step": 208700 }, { "epoch": 0.002088, - "grad_norm": 1.5000821352005005, + "grad_norm": 1.695166826248169, "learning_rate": 1e-05, - "loss": 0.2771, + "loss": 0.2795, "step": 208800 }, { "epoch": 0.002089, - "grad_norm": 1.3068898916244507, + "grad_norm": 1.3839768171310425, "learning_rate": 1e-05, - "loss": 0.2718, + "loss": 0.2716, "step": 208900 }, { "epoch": 0.00209, - "grad_norm": 1.2877533435821533, + "grad_norm": 1.2503012418746948, "learning_rate": 1e-05, - "loss": 0.2728, + "loss": 0.2749, "step": 209000 }, { "epoch": 0.002091, - "grad_norm": 1.288883090019226, + "grad_norm": 1.3623175621032715, "learning_rate": 1e-05, - "loss": 0.2764, + "loss": 0.2772, "step": 209100 }, { "epoch": 0.002092, - "grad_norm": 1.3227872848510742, + "grad_norm": 1.3409157991409302, "learning_rate": 1e-05, - "loss": 0.2717, + "loss": 0.2735, "step": 209200 }, { "epoch": 0.002093, - "grad_norm": 1.644485354423523, + "grad_norm": 1.5793631076812744, "learning_rate": 1e-05, - "loss": 0.2815, + "loss": 0.2813, "step": 209300 }, { "epoch": 0.002094, - "grad_norm": 1.5578631162643433, + "grad_norm": 1.5260995626449585, "learning_rate": 1e-05, - "loss": 0.2604, + "loss": 0.2626, "step": 209400 }, { "epoch": 0.002095, - "grad_norm": 1.407353162765503, + "grad_norm": 1.5600001811981201, "learning_rate": 1e-05, - "loss": 0.2771, + "loss": 0.2792, "step": 209500 }, { "epoch": 0.002096, - "grad_norm": 1.4527989625930786, + "grad_norm": 1.5654038190841675, "learning_rate": 1e-05, - "loss": 0.2774, + "loss": 0.2796, "step": 209600 }, { "epoch": 0.002097, - "grad_norm": 1.3239778280258179, + "grad_norm": 1.3999748229980469, "learning_rate": 1e-05, - "loss": 0.2732, + "loss": 0.2745, "step": 209700 }, { "epoch": 0.002098, - "grad_norm": 1.4142417907714844, + "grad_norm": 1.4651896953582764, "learning_rate": 1e-05, - "loss": 0.278, + "loss": 0.2809, "step": 209800 }, { "epoch": 0.002099, - "grad_norm": 1.4922010898590088, + "grad_norm": 2.7390918731689453, "learning_rate": 1e-05, - "loss": 0.269, + "loss": 0.2699, "step": 209900 }, { "epoch": 0.0021, - "grad_norm": 1.5319709777832031, + "grad_norm": 1.6523025035858154, "learning_rate": 1e-05, - "loss": 0.2703, + "loss": 0.2718, "step": 210000 }, { "epoch": 0.002101, - "grad_norm": 1.4030638933181763, + "grad_norm": 1.4938172101974487, "learning_rate": 1e-05, - "loss": 0.2754, + "loss": 0.278, "step": 210100 }, { "epoch": 0.002102, - "grad_norm": 1.3011295795440674, + "grad_norm": 1.3947250843048096, "learning_rate": 1e-05, - "loss": 0.2634, + "loss": 0.2638, "step": 210200 }, { "epoch": 0.002103, - "grad_norm": 1.250349760055542, + "grad_norm": 1.4332685470581055, "learning_rate": 1e-05, - "loss": 0.2741, + "loss": 0.2776, "step": 210300 }, { "epoch": 0.002104, - "grad_norm": 1.4862924814224243, + "grad_norm": 1.5522576570510864, "learning_rate": 1e-05, - "loss": 0.2689, + "loss": 0.2715, "step": 210400 }, { "epoch": 0.002105, - "grad_norm": 1.3478277921676636, + "grad_norm": 1.2607855796813965, "learning_rate": 1e-05, - "loss": 0.2775, + "loss": 0.2801, "step": 210500 }, { "epoch": 0.002106, - "grad_norm": 1.2528977394104004, + "grad_norm": 1.489503026008606, "learning_rate": 1e-05, - "loss": 0.2724, + "loss": 0.2772, "step": 210600 }, { "epoch": 0.002107, - "grad_norm": 1.5019500255584717, + "grad_norm": 1.315061092376709, "learning_rate": 1e-05, - "loss": 0.2767, + "loss": 0.2785, "step": 210700 }, { "epoch": 0.002108, - "grad_norm": 1.4680724143981934, + "grad_norm": 1.56254243850708, "learning_rate": 1e-05, - "loss": 0.2688, + "loss": 0.2724, "step": 210800 }, { "epoch": 0.002109, - "grad_norm": 1.3208777904510498, + "grad_norm": 1.4691557884216309, "learning_rate": 1e-05, - "loss": 0.2729, + "loss": 0.2741, "step": 210900 }, { "epoch": 0.00211, - "grad_norm": 1.644145131111145, + "grad_norm": 1.551344633102417, "learning_rate": 1e-05, - "loss": 0.2769, + "loss": 0.2802, "step": 211000 }, { "epoch": 0.002111, - "grad_norm": 1.494957685470581, + "grad_norm": 1.3702019453048706, "learning_rate": 1e-05, - "loss": 0.2723, + "loss": 0.275, "step": 211100 }, { "epoch": 0.002112, - "grad_norm": 1.6504093408584595, + "grad_norm": 1.563392996788025, "learning_rate": 1e-05, - "loss": 0.2737, + "loss": 0.2778, "step": 211200 }, { "epoch": 0.002113, - "grad_norm": 1.4272003173828125, + "grad_norm": 1.403418779373169, "learning_rate": 1e-05, - "loss": 0.2743, + "loss": 0.2751, "step": 211300 }, { "epoch": 0.002114, - "grad_norm": 1.3573968410491943, + "grad_norm": 1.391108512878418, "learning_rate": 1e-05, - "loss": 0.2728, + "loss": 0.2762, "step": 211400 }, { "epoch": 0.002115, - "grad_norm": 1.8569047451019287, + "grad_norm": 1.3116768598556519, "learning_rate": 1e-05, - "loss": 0.268, + "loss": 0.2719, "step": 211500 }, { "epoch": 0.002116, - "grad_norm": 1.3718923330307007, + "grad_norm": 1.449575424194336, "learning_rate": 1e-05, - "loss": 0.2765, + "loss": 0.2787, "step": 211600 }, { "epoch": 0.002117, - "grad_norm": 1.878782868385315, + "grad_norm": 1.7022771835327148, "learning_rate": 1e-05, - "loss": 0.2691, + "loss": 0.2728, "step": 211700 }, { "epoch": 0.002118, - "grad_norm": 1.5091789960861206, + "grad_norm": 1.4799153804779053, "learning_rate": 1e-05, - "loss": 0.2687, + "loss": 0.2734, "step": 211800 }, { "epoch": 0.002119, - "grad_norm": 1.2896639108657837, + "grad_norm": 1.2987920045852661, "learning_rate": 1e-05, - "loss": 0.2718, + "loss": 0.2739, "step": 211900 }, { "epoch": 0.00212, - "grad_norm": 1.455105185508728, + "grad_norm": 1.3577724695205688, "learning_rate": 1e-05, - "loss": 0.2757, + "loss": 0.2772, "step": 212000 }, { "epoch": 0.002121, - "grad_norm": 1.464115023612976, + "grad_norm": 1.3344751596450806, "learning_rate": 1e-05, - "loss": 0.2713, + "loss": 0.2714, "step": 212100 }, { "epoch": 0.002122, - "grad_norm": 1.6060124635696411, + "grad_norm": 1.6037973165512085, "learning_rate": 1e-05, - "loss": 0.2729, + "loss": 0.2761, "step": 212200 }, { "epoch": 0.002123, - "grad_norm": 1.194265365600586, + "grad_norm": 1.3923399448394775, "learning_rate": 1e-05, - "loss": 0.2743, + "loss": 0.2758, "step": 212300 }, { "epoch": 0.002124, - "grad_norm": 1.3586254119873047, + "grad_norm": 8.393562316894531, "learning_rate": 1e-05, - "loss": 0.2716, + "loss": 0.2752, "step": 212400 }, { "epoch": 0.002125, - "grad_norm": 1.3535470962524414, + "grad_norm": 1.412778615951538, "learning_rate": 1e-05, - "loss": 0.2766, + "loss": 0.2771, "step": 212500 }, { "epoch": 0.002126, - "grad_norm": 1.3460197448730469, + "grad_norm": 1.3804876804351807, "learning_rate": 1e-05, - "loss": 0.2717, + "loss": 0.2745, "step": 212600 }, { "epoch": 0.002127, - "grad_norm": 1.4486021995544434, + "grad_norm": 1.4782977104187012, "learning_rate": 1e-05, - "loss": 0.2751, + "loss": 0.2773, "step": 212700 }, { "epoch": 0.002128, - "grad_norm": 1.2856372594833374, + "grad_norm": 1.4774922132492065, "learning_rate": 1e-05, - "loss": 0.2764, + "loss": 0.2782, "step": 212800 }, { "epoch": 0.002129, - "grad_norm": 1.6471552848815918, + "grad_norm": 1.4704676866531372, "learning_rate": 1e-05, - "loss": 0.2689, + "loss": 0.2703, "step": 212900 }, { "epoch": 0.00213, - "grad_norm": 1.3634934425354004, + "grad_norm": 1.2644281387329102, "learning_rate": 1e-05, - "loss": 0.2685, + "loss": 0.2716, "step": 213000 }, { "epoch": 0.002131, - "grad_norm": 1.4267513751983643, + "grad_norm": 1.6625502109527588, "learning_rate": 1e-05, - "loss": 0.2761, + "loss": 0.277, "step": 213100 }, { "epoch": 0.002132, - "grad_norm": 1.249268651008606, + "grad_norm": 1.3630250692367554, "learning_rate": 1e-05, - "loss": 0.2729, + "loss": 0.2773, "step": 213200 }, { "epoch": 0.002133, - "grad_norm": 1.1896673440933228, + "grad_norm": 1.6373766660690308, "learning_rate": 1e-05, - "loss": 0.2705, + "loss": 0.2701, "step": 213300 }, { "epoch": 0.002134, - "grad_norm": 1.7257354259490967, + "grad_norm": 1.4623923301696777, "learning_rate": 1e-05, - "loss": 0.2632, + "loss": 0.2655, "step": 213400 }, { "epoch": 0.002135, - "grad_norm": 1.3130804300308228, + "grad_norm": 1.6090341806411743, "learning_rate": 1e-05, - "loss": 0.2734, + "loss": 0.2739, "step": 213500 }, { "epoch": 0.002136, - "grad_norm": 1.4132580757141113, + "grad_norm": 1.5013991594314575, "learning_rate": 1e-05, - "loss": 0.2696, + "loss": 0.2708, "step": 213600 }, { "epoch": 0.002137, - "grad_norm": 1.227081298828125, + "grad_norm": 1.26161527633667, "learning_rate": 1e-05, - "loss": 0.2715, + "loss": 0.2757, "step": 213700 }, { "epoch": 0.002138, - "grad_norm": 1.298254132270813, + "grad_norm": 1.418388843536377, "learning_rate": 1e-05, - "loss": 0.2727, + "loss": 0.2761, "step": 213800 }, { "epoch": 0.002139, - "grad_norm": 1.3379558324813843, + "grad_norm": 1.293133020401001, "learning_rate": 1e-05, - "loss": 0.2815, + "loss": 0.2855, "step": 213900 }, { "epoch": 0.00214, - "grad_norm": 1.528085470199585, + "grad_norm": 1.9040838479995728, "learning_rate": 1e-05, - "loss": 0.2753, + "loss": 0.2761, "step": 214000 }, { "epoch": 0.002141, - "grad_norm": 1.62379789352417, + "grad_norm": 1.5187069177627563, "learning_rate": 1e-05, - "loss": 0.2759, + "loss": 0.2758, "step": 214100 }, { "epoch": 0.002142, - "grad_norm": 1.346968173980713, + "grad_norm": 1.3588346242904663, "learning_rate": 1e-05, - "loss": 0.271, + "loss": 0.2709, "step": 214200 }, { "epoch": 0.002143, - "grad_norm": 1.4713540077209473, + "grad_norm": 1.5500917434692383, "learning_rate": 1e-05, - "loss": 0.2775, + "loss": 0.2794, "step": 214300 }, { "epoch": 0.002144, - "grad_norm": 1.535642147064209, + "grad_norm": 1.6189924478530884, "learning_rate": 1e-05, - "loss": 0.2708, + "loss": 0.273, "step": 214400 }, { "epoch": 0.002145, - "grad_norm": 1.5173916816711426, + "grad_norm": 1.7019140720367432, "learning_rate": 1e-05, - "loss": 0.274, + "loss": 0.2763, "step": 214500 }, { "epoch": 0.002146, - "grad_norm": 1.2202578783035278, + "grad_norm": 1.259387493133545, "learning_rate": 1e-05, - "loss": 0.2738, + "loss": 0.2776, "step": 214600 }, { "epoch": 0.002147, - "grad_norm": 1.3884676694869995, + "grad_norm": 2.1989023685455322, "learning_rate": 1e-05, - "loss": 0.2765, + "loss": 0.2803, "step": 214700 }, { "epoch": 0.002148, - "grad_norm": 1.3344085216522217, + "grad_norm": 2.2808122634887695, "learning_rate": 1e-05, - "loss": 0.2739, + "loss": 0.2789, "step": 214800 }, { "epoch": 0.002149, - "grad_norm": 1.3964022397994995, + "grad_norm": 1.5754348039627075, "learning_rate": 1e-05, - "loss": 0.2717, + "loss": 0.2754, "step": 214900 }, { "epoch": 0.00215, - "grad_norm": 1.319870948791504, + "grad_norm": 1.4869142770767212, "learning_rate": 1e-05, - "loss": 0.2705, + "loss": 0.2714, "step": 215000 }, { "epoch": 0.002151, - "grad_norm": 1.5503780841827393, + "grad_norm": 1.4373691082000732, "learning_rate": 1e-05, - "loss": 0.2718, + "loss": 0.2723, "step": 215100 }, { "epoch": 0.002152, - "grad_norm": 1.6045480966567993, + "grad_norm": 1.5293453931808472, "learning_rate": 1e-05, - "loss": 0.2743, + "loss": 0.2758, "step": 215200 }, { "epoch": 0.002153, - "grad_norm": 1.4609630107879639, + "grad_norm": 1.556516408920288, "learning_rate": 1e-05, - "loss": 0.2677, + "loss": 0.2691, "step": 215300 }, { "epoch": 0.002154, - "grad_norm": 1.3960351943969727, + "grad_norm": 1.6215890645980835, "learning_rate": 1e-05, - "loss": 0.2773, + "loss": 0.2792, "step": 215400 }, { "epoch": 0.002155, - "grad_norm": 1.3534421920776367, + "grad_norm": 1.3890748023986816, "learning_rate": 1e-05, - "loss": 0.2708, + "loss": 0.2742, "step": 215500 }, { "epoch": 0.002156, - "grad_norm": 1.5042314529418945, + "grad_norm": 1.41310453414917, "learning_rate": 1e-05, - "loss": 0.2716, + "loss": 0.2732, "step": 215600 }, { "epoch": 0.002157, - "grad_norm": 1.5703091621398926, + "grad_norm": 1.4331954717636108, "learning_rate": 1e-05, - "loss": 0.2727, + "loss": 0.2775, "step": 215700 }, { "epoch": 0.002158, - "grad_norm": 1.365871787071228, + "grad_norm": 1.3733137845993042, "learning_rate": 1e-05, - "loss": 0.2703, + "loss": 0.2709, "step": 215800 }, { "epoch": 0.002159, - "grad_norm": 1.2744897603988647, + "grad_norm": 1.2462104558944702, "learning_rate": 1e-05, - "loss": 0.2732, + "loss": 0.2761, "step": 215900 }, { "epoch": 0.00216, - "grad_norm": 1.5184082984924316, + "grad_norm": 1.506518006324768, "learning_rate": 1e-05, - "loss": 0.2721, + "loss": 0.2728, "step": 216000 }, { "epoch": 0.002161, - "grad_norm": 1.2325046062469482, + "grad_norm": 1.5095326900482178, "learning_rate": 1e-05, - "loss": 0.2634, + "loss": 0.2656, "step": 216100 }, { "epoch": 0.002162, - "grad_norm": 2.7802062034606934, + "grad_norm": 1.406726360321045, "learning_rate": 1e-05, - "loss": 0.2744, + "loss": 0.2774, "step": 216200 }, { "epoch": 0.002163, - "grad_norm": 1.4167160987854004, + "grad_norm": 1.5820876359939575, "learning_rate": 1e-05, - "loss": 0.2715, + "loss": 0.2735, "step": 216300 }, { "epoch": 0.002164, - "grad_norm": 1.255246877670288, + "grad_norm": 1.4047499895095825, "learning_rate": 1e-05, - "loss": 0.2775, + "loss": 0.2771, "step": 216400 }, { "epoch": 0.002165, - "grad_norm": 1.4111384153366089, + "grad_norm": 1.3778250217437744, "learning_rate": 1e-05, - "loss": 0.2692, + "loss": 0.2708, "step": 216500 }, { "epoch": 0.002166, - "grad_norm": 1.4857150316238403, + "grad_norm": 1.4947162866592407, "learning_rate": 1e-05, - "loss": 0.2732, + "loss": 0.2746, "step": 216600 }, { "epoch": 0.002167, - "grad_norm": 1.4398163557052612, + "grad_norm": 1.6999680995941162, "learning_rate": 1e-05, - "loss": 0.2711, + "loss": 0.2738, "step": 216700 }, { "epoch": 0.002168, - "grad_norm": 1.0538699626922607, + "grad_norm": 1.2014107704162598, "learning_rate": 1e-05, - "loss": 0.2758, + "loss": 0.2782, "step": 216800 }, { "epoch": 0.002169, - "grad_norm": 1.4064178466796875, + "grad_norm": 1.5182017087936401, "learning_rate": 1e-05, - "loss": 0.267, + "loss": 0.2693, "step": 216900 }, { "epoch": 0.00217, - "grad_norm": 1.3014734983444214, + "grad_norm": 1.5457252264022827, "learning_rate": 1e-05, - "loss": 0.2697, + "loss": 0.2716, "step": 217000 }, { "epoch": 0.002171, - "grad_norm": 1.322107195854187, + "grad_norm": 1.2823336124420166, "learning_rate": 1e-05, - "loss": 0.2673, + "loss": 0.2699, "step": 217100 }, { "epoch": 0.002172, - "grad_norm": 1.4945542812347412, + "grad_norm": 1.5415891408920288, "learning_rate": 1e-05, - "loss": 0.273, + "loss": 0.2761, "step": 217200 }, { "epoch": 0.002173, - "grad_norm": 1.4387496709823608, + "grad_norm": 1.4495307207107544, "learning_rate": 1e-05, - "loss": 0.272, + "loss": 0.2759, "step": 217300 }, { "epoch": 0.002174, - "grad_norm": 1.3536561727523804, + "grad_norm": 1.3674854040145874, "learning_rate": 1e-05, - "loss": 0.2707, + "loss": 0.273, "step": 217400 }, { "epoch": 0.002175, - "grad_norm": 1.2636030912399292, + "grad_norm": 1.3650346994400024, "learning_rate": 1e-05, - "loss": 0.2742, + "loss": 0.2745, "step": 217500 }, { "epoch": 0.002176, - "grad_norm": 1.7091553211212158, + "grad_norm": 1.835408329963684, "learning_rate": 1e-05, - "loss": 0.2688, + "loss": 0.2683, "step": 217600 }, { "epoch": 0.002177, - "grad_norm": 1.4158265590667725, + "grad_norm": 1.332398533821106, "learning_rate": 1e-05, - "loss": 0.2733, + "loss": 0.273, "step": 217700 }, { "epoch": 0.002178, - "grad_norm": 1.6173919439315796, + "grad_norm": 1.5285613536834717, "learning_rate": 1e-05, - "loss": 0.2766, + "loss": 0.2775, "step": 217800 }, { "epoch": 0.002179, - "grad_norm": 1.6343995332717896, + "grad_norm": 1.9105749130249023, "learning_rate": 1e-05, - "loss": 0.2751, + "loss": 0.2749, "step": 217900 }, { "epoch": 0.00218, - "grad_norm": 1.4717875719070435, + "grad_norm": 1.4697386026382446, "learning_rate": 1e-05, - "loss": 0.2771, + "loss": 0.2796, "step": 218000 }, { "epoch": 0.002181, - "grad_norm": 2.167228937149048, + "grad_norm": 1.2347923517227173, "learning_rate": 1e-05, - "loss": 0.2681, + "loss": 0.2715, "step": 218100 }, { "epoch": 0.002182, - "grad_norm": 1.4285328388214111, + "grad_norm": 1.539542317390442, "learning_rate": 1e-05, - "loss": 0.2703, + "loss": 0.2731, "step": 218200 }, { "epoch": 0.002183, - "grad_norm": 1.6143985986709595, + "grad_norm": 1.3916699886322021, "learning_rate": 1e-05, - "loss": 0.2608, + "loss": 0.2647, "step": 218300 }, { "epoch": 0.002184, - "grad_norm": 1.3126425743103027, + "grad_norm": 1.3453309535980225, "learning_rate": 1e-05, - "loss": 0.2702, + "loss": 0.2732, "step": 218400 }, { "epoch": 0.002185, - "grad_norm": 1.3291963338851929, + "grad_norm": 1.341639518737793, "learning_rate": 1e-05, - "loss": 0.2726, + "loss": 0.2738, "step": 218500 }, { "epoch": 0.002186, - "grad_norm": 1.3692171573638916, + "grad_norm": 1.5608952045440674, "learning_rate": 1e-05, - "loss": 0.2707, + "loss": 0.2731, "step": 218600 }, { "epoch": 0.002187, - "grad_norm": 1.246952772140503, + "grad_norm": 1.3826178312301636, "learning_rate": 1e-05, - "loss": 0.2686, + "loss": 0.272, "step": 218700 }, { "epoch": 0.002188, - "grad_norm": 1.384102463722229, + "grad_norm": 1.3272230625152588, "learning_rate": 1e-05, - "loss": 0.2621, + "loss": 0.2663, "step": 218800 }, { "epoch": 0.002189, - "grad_norm": 1.2710458040237427, + "grad_norm": 2.7572853565216064, "learning_rate": 1e-05, - "loss": 0.2712, + "loss": 0.2723, "step": 218900 }, { "epoch": 0.00219, - "grad_norm": 1.4765645265579224, + "grad_norm": 1.3882721662521362, "learning_rate": 1e-05, - "loss": 0.2666, + "loss": 0.2683, "step": 219000 }, { "epoch": 0.002191, - "grad_norm": 1.492040991783142, + "grad_norm": 1.5614526271820068, "learning_rate": 1e-05, - "loss": 0.2632, + "loss": 0.2685, "step": 219100 }, { "epoch": 0.002192, - "grad_norm": 1.3907136917114258, + "grad_norm": 1.5847523212432861, "learning_rate": 1e-05, - "loss": 0.2628, + "loss": 0.2656, "step": 219200 }, { "epoch": 0.002193, - "grad_norm": 1.266809105873108, + "grad_norm": 1.4185631275177002, "learning_rate": 1e-05, - "loss": 0.2722, + "loss": 0.2745, "step": 219300 }, { "epoch": 0.002194, - "grad_norm": 1.3720169067382812, + "grad_norm": 1.476169228553772, "learning_rate": 1e-05, - "loss": 0.2655, + "loss": 0.2712, "step": 219400 }, { "epoch": 0.002195, - "grad_norm": 1.5363353490829468, + "grad_norm": 1.512122631072998, "learning_rate": 1e-05, - "loss": 0.2717, + "loss": 0.2721, "step": 219500 }, { "epoch": 0.002196, - "grad_norm": 1.4530705213546753, + "grad_norm": 1.4664489030838013, "learning_rate": 1e-05, - "loss": 0.2694, + "loss": 0.2706, "step": 219600 }, { "epoch": 0.002197, - "grad_norm": 1.3838139772415161, + "grad_norm": 1.3304619789123535, "learning_rate": 1e-05, - "loss": 0.2676, + "loss": 0.272, "step": 219700 }, { "epoch": 0.002198, - "grad_norm": 1.5510233640670776, + "grad_norm": 1.65354585647583, "learning_rate": 1e-05, - "loss": 0.2676, + "loss": 0.2716, "step": 219800 }, { "epoch": 0.002199, - "grad_norm": 1.4454500675201416, + "grad_norm": 1.534138798713684, "learning_rate": 1e-05, - "loss": 0.2695, + "loss": 0.2689, "step": 219900 }, { "epoch": 0.0022, - "grad_norm": 1.7749993801116943, + "grad_norm": 1.5334886312484741, "learning_rate": 1e-05, - "loss": 0.2659, + "loss": 0.2702, "step": 220000 }, { "epoch": 0.0022, - "eval_loss": 0.2449951171875, - "eval_runtime": 101.7089, - "eval_samples_per_second": 491.599, - "eval_steps_per_second": 30.725, + "eval_loss": 0.2470703125, + "eval_runtime": 109.4816, + "eval_samples_per_second": 456.698, + "eval_steps_per_second": 28.544, "step": 220000 }, { "epoch": 0.002201, - "grad_norm": 1.4798575639724731, + "grad_norm": 1.6148320436477661, "learning_rate": 1e-05, - "loss": 0.2657, + "loss": 0.2714, "step": 220100 }, { "epoch": 0.002202, - "grad_norm": 1.3197557926177979, + "grad_norm": 1.3717103004455566, "learning_rate": 1e-05, - "loss": 0.2682, + "loss": 0.2714, "step": 220200 }, { "epoch": 0.002203, - "grad_norm": 1.5322121381759644, + "grad_norm": 1.325392484664917, "learning_rate": 1e-05, - "loss": 0.2625, + "loss": 0.2685, "step": 220300 }, { "epoch": 0.002204, - "grad_norm": 1.2285499572753906, + "grad_norm": 1.1573683023452759, "learning_rate": 1e-05, - "loss": 0.276, + "loss": 0.2777, "step": 220400 }, { "epoch": 0.002205, - "grad_norm": 1.5836375951766968, + "grad_norm": 1.4676803350448608, "learning_rate": 1e-05, - "loss": 0.2711, + "loss": 0.2766, "step": 220500 }, { "epoch": 0.002206, - "grad_norm": 1.4884577989578247, + "grad_norm": 1.3724902868270874, "learning_rate": 1e-05, - "loss": 0.2645, + "loss": 0.267, "step": 220600 }, { "epoch": 0.002207, - "grad_norm": 1.452649474143982, + "grad_norm": 1.3178821802139282, "learning_rate": 1e-05, - "loss": 0.2665, + "loss": 0.2677, "step": 220700 }, { "epoch": 0.002208, - "grad_norm": 1.5605190992355347, + "grad_norm": 1.4839378595352173, "learning_rate": 1e-05, - "loss": 0.2711, + "loss": 0.2751, "step": 220800 }, { "epoch": 0.002209, - "grad_norm": 2.8433327674865723, + "grad_norm": 1.4000365734100342, "learning_rate": 1e-05, - "loss": 0.2679, + "loss": 0.2715, "step": 220900 }, { "epoch": 0.00221, - "grad_norm": 1.545668363571167, + "grad_norm": 1.4803142547607422, "learning_rate": 1e-05, - "loss": 0.2634, + "loss": 0.2633, "step": 221000 }, { "epoch": 0.002211, - "grad_norm": 1.3548511266708374, + "grad_norm": 1.8429442644119263, "learning_rate": 1e-05, - "loss": 0.2689, + "loss": 0.2721, "step": 221100 }, { "epoch": 0.002212, - "grad_norm": 1.3411448001861572, + "grad_norm": 1.3296468257904053, "learning_rate": 1e-05, - "loss": 0.2646, + "loss": 0.2657, "step": 221200 }, { "epoch": 0.002213, - "grad_norm": 1.2828326225280762, + "grad_norm": 1.283510446548462, "learning_rate": 1e-05, - "loss": 0.2624, + "loss": 0.2664, "step": 221300 }, { "epoch": 0.002214, - "grad_norm": 2.1007606983184814, + "grad_norm": 1.7253391742706299, "learning_rate": 1e-05, - "loss": 0.2699, + "loss": 0.2719, "step": 221400 }, { "epoch": 0.002215, - "grad_norm": 1.604114294052124, + "grad_norm": 1.482222557067871, "learning_rate": 1e-05, - "loss": 0.267, + "loss": 0.27, "step": 221500 }, { "epoch": 0.002216, - "grad_norm": 1.8212924003601074, + "grad_norm": 1.497300148010254, "learning_rate": 1e-05, - "loss": 0.2717, + "loss": 0.2721, "step": 221600 }, { "epoch": 0.002217, - "grad_norm": 1.3568710088729858, + "grad_norm": 1.3005281686782837, "learning_rate": 1e-05, - "loss": 0.2689, + "loss": 0.271, "step": 221700 }, { "epoch": 0.002218, - "grad_norm": 1.6409475803375244, + "grad_norm": 1.3060221672058105, "learning_rate": 1e-05, - "loss": 0.2708, + "loss": 0.2728, "step": 221800 }, { "epoch": 0.002219, - "grad_norm": 1.321970820426941, + "grad_norm": 1.7013475894927979, "learning_rate": 1e-05, - "loss": 0.2619, + "loss": 0.2626, "step": 221900 }, { "epoch": 0.00222, - "grad_norm": 1.4816898107528687, + "grad_norm": 1.3594257831573486, "learning_rate": 1e-05, - "loss": 0.2665, + "loss": 0.2687, "step": 222000 }, { "epoch": 0.002221, - "grad_norm": 1.331111192703247, + "grad_norm": 1.3790507316589355, "learning_rate": 1e-05, - "loss": 0.273, + "loss": 0.2747, "step": 222100 }, { "epoch": 0.002222, - "grad_norm": 1.7249109745025635, + "grad_norm": 1.2899534702301025, "learning_rate": 1e-05, - "loss": 0.2645, + "loss": 0.2636, "step": 222200 }, { "epoch": 0.002223, - "grad_norm": 1.2523905038833618, + "grad_norm": 1.3310524225234985, "learning_rate": 1e-05, - "loss": 0.2784, + "loss": 0.2781, "step": 222300 }, { "epoch": 0.002224, - "grad_norm": 1.4860199689865112, + "grad_norm": 1.5631316900253296, "learning_rate": 1e-05, - "loss": 0.2665, + "loss": 0.2668, "step": 222400 }, { "epoch": 0.002225, - "grad_norm": 1.351641297340393, + "grad_norm": 1.5219013690948486, "learning_rate": 1e-05, - "loss": 0.2691, + "loss": 0.2705, "step": 222500 }, { "epoch": 0.002226, - "grad_norm": 1.241297960281372, + "grad_norm": 1.2552835941314697, "learning_rate": 1e-05, - "loss": 0.2687, + "loss": 0.2717, "step": 222600 }, { "epoch": 0.002227, - "grad_norm": 1.4678940773010254, + "grad_norm": 1.4404419660568237, "learning_rate": 1e-05, - "loss": 0.269, + "loss": 0.2696, "step": 222700 }, { "epoch": 0.002228, - "grad_norm": 1.5993295907974243, + "grad_norm": 1.3716353178024292, "learning_rate": 1e-05, - "loss": 0.2576, + "loss": 0.2623, "step": 222800 }, { "epoch": 0.002229, - "grad_norm": 1.4906038045883179, + "grad_norm": 1.5366202592849731, "learning_rate": 1e-05, - "loss": 0.2621, + "loss": 0.2639, "step": 222900 }, { "epoch": 0.00223, - "grad_norm": 1.4458608627319336, + "grad_norm": 1.366859793663025, "learning_rate": 1e-05, - "loss": 0.2625, + "loss": 0.2639, "step": 223000 }, { "epoch": 0.002231, - "grad_norm": 1.257728099822998, + "grad_norm": 1.3707561492919922, "learning_rate": 1e-05, - "loss": 0.2711, + "loss": 0.2705, "step": 223100 }, { "epoch": 0.002232, - "grad_norm": 1.4083467721939087, + "grad_norm": 1.5594546794891357, "learning_rate": 1e-05, - "loss": 0.2711, + "loss": 0.2729, "step": 223200 }, { "epoch": 0.002233, - "grad_norm": 1.3221286535263062, + "grad_norm": 1.404626488685608, "learning_rate": 1e-05, - "loss": 0.2616, + "loss": 0.2649, "step": 223300 }, { "epoch": 0.002234, - "grad_norm": 1.6637743711471558, + "grad_norm": 1.6944884061813354, "learning_rate": 1e-05, - "loss": 0.2689, + "loss": 0.2686, "step": 223400 }, { "epoch": 0.002235, - "grad_norm": 1.293407917022705, + "grad_norm": 1.9831916093826294, "learning_rate": 1e-05, - "loss": 0.2706, + "loss": 0.274, "step": 223500 }, { "epoch": 0.002236, - "grad_norm": 1.4848270416259766, + "grad_norm": 1.7819033861160278, "learning_rate": 1e-05, - "loss": 0.2711, + "loss": 0.2734, "step": 223600 }, { "epoch": 0.002237, - "grad_norm": 1.4327635765075684, + "grad_norm": 1.5690064430236816, "learning_rate": 1e-05, - "loss": 0.2733, + "loss": 0.2737, "step": 223700 }, { "epoch": 0.002238, - "grad_norm": 1.2239385843276978, + "grad_norm": 1.2317233085632324, "learning_rate": 1e-05, - "loss": 0.2647, + "loss": 0.2679, "step": 223800 }, { "epoch": 0.002239, - "grad_norm": 1.5136600732803345, + "grad_norm": 2.0383543968200684, "learning_rate": 1e-05, - "loss": 0.2663, + "loss": 0.2679, "step": 223900 }, { "epoch": 0.00224, - "grad_norm": 1.4159045219421387, + "grad_norm": 1.4997539520263672, "learning_rate": 1e-05, - "loss": 0.2682, + "loss": 0.2727, "step": 224000 }, { "epoch": 0.002241, - "grad_norm": 1.4657328128814697, + "grad_norm": 1.3992339372634888, "learning_rate": 1e-05, - "loss": 0.2697, + "loss": 0.273, "step": 224100 }, { "epoch": 0.002242, - "grad_norm": 1.5326930284500122, + "grad_norm": 1.5344079732894897, "learning_rate": 1e-05, - "loss": 0.2694, + "loss": 0.2712, "step": 224200 }, { "epoch": 0.002243, - "grad_norm": 1.5971113443374634, + "grad_norm": 1.6358779668807983, "learning_rate": 1e-05, - "loss": 0.2679, + "loss": 0.2728, "step": 224300 }, { "epoch": 0.002244, - "grad_norm": 1.3622255325317383, + "grad_norm": 1.6222172975540161, "learning_rate": 1e-05, - "loss": 0.258, + "loss": 0.2621, "step": 224400 }, { "epoch": 0.002245, - "grad_norm": 1.2654393911361694, + "grad_norm": 1.5196174383163452, "learning_rate": 1e-05, - "loss": 0.2651, + "loss": 0.2696, "step": 224500 }, { "epoch": 0.002246, - "grad_norm": 1.4576301574707031, + "grad_norm": 1.477639079093933, "learning_rate": 1e-05, - "loss": 0.2789, + "loss": 0.2822, "step": 224600 }, { "epoch": 0.002247, - "grad_norm": 1.5406455993652344, + "grad_norm": 1.6515791416168213, "learning_rate": 1e-05, - "loss": 0.2703, + "loss": 0.2716, "step": 224700 }, { "epoch": 0.002248, - "grad_norm": 1.2421468496322632, + "grad_norm": 1.387439489364624, "learning_rate": 1e-05, - "loss": 0.2642, + "loss": 0.2674, "step": 224800 }, { "epoch": 0.002249, - "grad_norm": 1.5023775100708008, + "grad_norm": 1.264535665512085, "learning_rate": 1e-05, - "loss": 0.2689, + "loss": 0.2708, "step": 224900 }, { "epoch": 0.00225, - "grad_norm": 1.5009279251098633, + "grad_norm": 1.4407367706298828, "learning_rate": 1e-05, - "loss": 0.2625, + "loss": 0.2631, "step": 225000 }, { "epoch": 0.002251, - "grad_norm": 1.5099420547485352, + "grad_norm": 1.5102108716964722, "learning_rate": 1e-05, - "loss": 0.2708, + "loss": 0.272, "step": 225100 }, { "epoch": 0.002252, - "grad_norm": 1.4483380317687988, + "grad_norm": 1.5774545669555664, "learning_rate": 1e-05, - "loss": 0.2628, + "loss": 0.2675, "step": 225200 }, { "epoch": 0.002253, - "grad_norm": 1.2387980222702026, + "grad_norm": 1.331253170967102, "learning_rate": 1e-05, - "loss": 0.2623, + "loss": 0.2638, "step": 225300 }, { "epoch": 0.002254, - "grad_norm": 1.3785974979400635, + "grad_norm": 1.443367600440979, "learning_rate": 1e-05, - "loss": 0.2729, + "loss": 0.2756, "step": 225400 }, { "epoch": 0.002255, - "grad_norm": 1.186869740486145, + "grad_norm": 1.3950369358062744, "learning_rate": 1e-05, - "loss": 0.2645, + "loss": 0.2666, "step": 225500 }, { "epoch": 0.002256, - "grad_norm": 1.29552161693573, + "grad_norm": 2.172563314437866, "learning_rate": 1e-05, - "loss": 0.2758, + "loss": 0.2777, "step": 225600 }, { "epoch": 0.002257, - "grad_norm": 1.2129687070846558, + "grad_norm": 1.1899489164352417, "learning_rate": 1e-05, - "loss": 0.263, + "loss": 0.2651, "step": 225700 }, { "epoch": 0.002258, - "grad_norm": 1.4880510568618774, + "grad_norm": 1.459839105606079, "learning_rate": 1e-05, - "loss": 0.2656, + "loss": 0.2691, "step": 225800 }, { "epoch": 0.002259, - "grad_norm": 1.6327648162841797, + "grad_norm": 1.577430248260498, "learning_rate": 1e-05, - "loss": 0.2684, + "loss": 0.274, "step": 225900 }, { "epoch": 0.00226, - "grad_norm": 1.4231723546981812, + "grad_norm": 2.454540729522705, "learning_rate": 1e-05, - "loss": 0.2702, + "loss": 0.2731, "step": 226000 }, { "epoch": 0.002261, - "grad_norm": 1.4302500486373901, + "grad_norm": 1.4699666500091553, "learning_rate": 1e-05, - "loss": 0.2658, + "loss": 0.2684, "step": 226100 }, { "epoch": 0.002262, - "grad_norm": 1.5919798612594604, + "grad_norm": 1.745713472366333, "learning_rate": 1e-05, - "loss": 0.2712, + "loss": 0.2742, "step": 226200 }, { "epoch": 0.002263, - "grad_norm": 2.9743430614471436, + "grad_norm": 1.7182310819625854, "learning_rate": 1e-05, - "loss": 0.2662, + "loss": 0.2695, "step": 226300 }, { "epoch": 0.002264, - "grad_norm": 1.2029353380203247, + "grad_norm": 1.3382000923156738, "learning_rate": 1e-05, - "loss": 0.2596, + "loss": 0.2638, "step": 226400 }, { "epoch": 0.002265, - "grad_norm": 1.660622477531433, + "grad_norm": 1.5027062892913818, "learning_rate": 1e-05, - "loss": 0.2732, + "loss": 0.2738, "step": 226500 }, { "epoch": 0.002266, - "grad_norm": 1.301240086555481, + "grad_norm": 1.4339383840560913, "learning_rate": 1e-05, - "loss": 0.2635, + "loss": 0.2651, "step": 226600 }, { "epoch": 0.002267, - "grad_norm": 1.5606175661087036, + "grad_norm": 1.5018837451934814, "learning_rate": 1e-05, - "loss": 0.2637, + "loss": 0.2681, "step": 226700 }, { "epoch": 0.002268, - "grad_norm": 1.3633745908737183, + "grad_norm": 1.4683713912963867, "learning_rate": 1e-05, - "loss": 0.2645, + "loss": 0.267, "step": 226800 }, { "epoch": 0.002269, - "grad_norm": 1.6376230716705322, + "grad_norm": 1.5575743913650513, "learning_rate": 1e-05, - "loss": 0.2662, + "loss": 0.2668, "step": 226900 }, { "epoch": 0.00227, - "grad_norm": 1.6246817111968994, + "grad_norm": 1.3830864429473877, "learning_rate": 1e-05, - "loss": 0.2692, + "loss": 0.2701, "step": 227000 }, { "epoch": 0.002271, - "grad_norm": 1.391745686531067, + "grad_norm": 1.3803704977035522, "learning_rate": 1e-05, - "loss": 0.2617, + "loss": 0.266, "step": 227100 }, { "epoch": 0.002272, - "grad_norm": 1.7494397163391113, + "grad_norm": 1.6529074907302856, "learning_rate": 1e-05, - "loss": 0.269, + "loss": 0.2724, "step": 227200 }, { "epoch": 0.002273, - "grad_norm": 1.5674599409103394, + "grad_norm": 1.5334415435791016, "learning_rate": 1e-05, - "loss": 0.2666, + "loss": 0.272, "step": 227300 }, { "epoch": 0.002274, - "grad_norm": 1.3235244750976562, + "grad_norm": 1.4912134408950806, "learning_rate": 1e-05, - "loss": 0.2702, + "loss": 0.2723, "step": 227400 }, { "epoch": 0.002275, - "grad_norm": 1.903848648071289, + "grad_norm": 1.375284194946289, "learning_rate": 1e-05, - "loss": 0.267, + "loss": 0.2702, "step": 227500 }, { "epoch": 0.002276, - "grad_norm": 1.184098720550537, + "grad_norm": 1.2836390733718872, "learning_rate": 1e-05, - "loss": 0.2725, + "loss": 0.2738, "step": 227600 }, { "epoch": 0.002277, - "grad_norm": 1.2618509531021118, + "grad_norm": 1.2948681116104126, "learning_rate": 1e-05, - "loss": 0.2754, + "loss": 0.275, "step": 227700 }, { "epoch": 0.002278, - "grad_norm": 1.40862238407135, + "grad_norm": 1.3349876403808594, "learning_rate": 1e-05, - "loss": 0.2643, + "loss": 0.2673, "step": 227800 }, { "epoch": 0.002279, - "grad_norm": 1.617973804473877, + "grad_norm": 1.422441840171814, "learning_rate": 1e-05, - "loss": 0.2638, + "loss": 0.267, "step": 227900 }, { "epoch": 0.00228, - "grad_norm": 1.454044222831726, + "grad_norm": 1.4071296453475952, "learning_rate": 1e-05, - "loss": 0.2651, + "loss": 0.2688, "step": 228000 }, { "epoch": 0.002281, - "grad_norm": 1.306423306465149, + "grad_norm": 1.3052148818969727, "learning_rate": 1e-05, - "loss": 0.266, + "loss": 0.2674, "step": 228100 }, { "epoch": 0.002282, - "grad_norm": 1.5154908895492554, + "grad_norm": 1.4938229322433472, "learning_rate": 1e-05, - "loss": 0.267, + "loss": 0.2693, "step": 228200 }, { "epoch": 0.002283, - "grad_norm": 1.431080937385559, + "grad_norm": 1.48699951171875, "learning_rate": 1e-05, - "loss": 0.2644, + "loss": 0.2681, "step": 228300 }, { "epoch": 0.002284, - "grad_norm": 1.347579836845398, + "grad_norm": 1.5458961725234985, "learning_rate": 1e-05, - "loss": 0.2691, + "loss": 0.2697, "step": 228400 }, { "epoch": 0.002285, - "grad_norm": 1.3767662048339844, + "grad_norm": 1.394895315170288, "learning_rate": 1e-05, - "loss": 0.265, + "loss": 0.2678, "step": 228500 }, { "epoch": 0.002286, - "grad_norm": 1.3711931705474854, + "grad_norm": 1.3542561531066895, "learning_rate": 1e-05, - "loss": 0.2649, + "loss": 0.2669, "step": 228600 }, { "epoch": 0.002287, - "grad_norm": 1.49092698097229, + "grad_norm": 1.5436086654663086, "learning_rate": 1e-05, - "loss": 0.2656, + "loss": 0.2681, "step": 228700 }, { "epoch": 0.002288, - "grad_norm": 1.4322484731674194, + "grad_norm": 1.3657710552215576, "learning_rate": 1e-05, - "loss": 0.2704, + "loss": 0.2731, "step": 228800 }, { "epoch": 0.002289, - "grad_norm": 1.3011993169784546, + "grad_norm": 1.2138234376907349, "learning_rate": 1e-05, - "loss": 0.2677, + "loss": 0.2706, "step": 228900 }, { "epoch": 0.00229, - "grad_norm": 1.3548883199691772, + "grad_norm": 1.4697974920272827, "learning_rate": 1e-05, - "loss": 0.268, + "loss": 0.2695, "step": 229000 }, { "epoch": 0.002291, - "grad_norm": 1.2932956218719482, + "grad_norm": 1.532043695449829, "learning_rate": 1e-05, - "loss": 0.2666, + "loss": 0.2691, "step": 229100 }, { "epoch": 0.002292, - "grad_norm": 1.358920931816101, + "grad_norm": 1.5469400882720947, "learning_rate": 1e-05, - "loss": 0.2641, + "loss": 0.2635, "step": 229200 }, { "epoch": 0.002293, - "grad_norm": 1.2997891902923584, + "grad_norm": 1.3268221616744995, "learning_rate": 1e-05, - "loss": 0.2687, + "loss": 0.2703, "step": 229300 }, { "epoch": 0.002294, - "grad_norm": 1.49066162109375, + "grad_norm": 1.3366742134094238, "learning_rate": 1e-05, - "loss": 0.2646, + "loss": 0.2658, "step": 229400 }, { "epoch": 0.002295, - "grad_norm": 1.2723405361175537, + "grad_norm": 1.3616935014724731, "learning_rate": 1e-05, - "loss": 0.2698, + "loss": 0.2728, "step": 229500 }, { "epoch": 0.002296, - "grad_norm": 1.260061264038086, + "grad_norm": 1.3293488025665283, "learning_rate": 1e-05, - "loss": 0.271, + "loss": 0.2703, "step": 229600 }, { "epoch": 0.002297, - "grad_norm": 1.3549233675003052, + "grad_norm": 1.58241605758667, "learning_rate": 1e-05, - "loss": 0.2581, + "loss": 0.2629, "step": 229700 }, { "epoch": 0.002298, - "grad_norm": 1.2694374322891235, + "grad_norm": 1.240764856338501, "learning_rate": 1e-05, - "loss": 0.2613, + "loss": 0.2664, "step": 229800 }, { "epoch": 0.002299, - "grad_norm": 1.4169176816940308, + "grad_norm": 1.480238676071167, "learning_rate": 1e-05, - "loss": 0.2704, + "loss": 0.2737, "step": 229900 }, { "epoch": 0.0023, - "grad_norm": 1.4261996746063232, + "grad_norm": 1.6399883031845093, "learning_rate": 1e-05, - "loss": 0.2646, + "loss": 0.265, "step": 230000 }, { "epoch": 0.002301, - "grad_norm": 1.4658910036087036, + "grad_norm": 1.5711274147033691, "learning_rate": 1e-05, - "loss": 0.2655, + "loss": 0.265, "step": 230100 }, { "epoch": 0.002302, - "grad_norm": 1.3811699151992798, + "grad_norm": 1.3830393552780151, "learning_rate": 1e-05, - "loss": 0.2673, + "loss": 0.2708, "step": 230200 }, { "epoch": 0.002303, - "grad_norm": 1.4524728059768677, + "grad_norm": 1.3761115074157715, "learning_rate": 1e-05, - "loss": 0.2656, + "loss": 0.2665, "step": 230300 }, { "epoch": 0.002304, - "grad_norm": 1.6824735403060913, + "grad_norm": 1.4468395709991455, "learning_rate": 1e-05, - "loss": 0.2635, + "loss": 0.2665, "step": 230400 }, { "epoch": 0.002305, - "grad_norm": 1.353838324546814, + "grad_norm": 1.2549031972885132, "learning_rate": 1e-05, - "loss": 0.267, + "loss": 0.2704, "step": 230500 }, { "epoch": 0.002306, - "grad_norm": 1.4063193798065186, + "grad_norm": 1.5276696681976318, "learning_rate": 1e-05, - "loss": 0.2583, + "loss": 0.2617, "step": 230600 }, { "epoch": 0.002307, - "grad_norm": 1.391334056854248, + "grad_norm": 1.222853660583496, "learning_rate": 1e-05, - "loss": 0.2689, + "loss": 0.2692, "step": 230700 }, { "epoch": 0.002308, - "grad_norm": 1.1624835729599, + "grad_norm": 1.504705548286438, "learning_rate": 1e-05, - "loss": 0.2702, + "loss": 0.2743, "step": 230800 }, { "epoch": 0.002309, - "grad_norm": 1.2580177783966064, + "grad_norm": 1.380125880241394, "learning_rate": 1e-05, - "loss": 0.2684, + "loss": 0.273, "step": 230900 }, { "epoch": 0.00231, - "grad_norm": 1.429121732711792, + "grad_norm": 1.5362893342971802, "learning_rate": 1e-05, - "loss": 0.2596, + "loss": 0.2617, "step": 231000 }, { "epoch": 0.002311, - "grad_norm": 1.3477896451950073, + "grad_norm": 1.308255672454834, "learning_rate": 1e-05, - "loss": 0.2582, + "loss": 0.2607, "step": 231100 }, { "epoch": 0.002312, - "grad_norm": 1.3960341215133667, + "grad_norm": 1.46437406539917, "learning_rate": 1e-05, - "loss": 0.2603, + "loss": 0.2623, "step": 231200 }, { "epoch": 0.002313, - "grad_norm": 1.3546319007873535, + "grad_norm": 1.7176834344863892, "learning_rate": 1e-05, - "loss": 0.2706, + "loss": 0.2733, "step": 231300 }, { "epoch": 0.002314, - "grad_norm": 1.348419189453125, + "grad_norm": 1.2125487327575684, "learning_rate": 1e-05, - "loss": 0.2622, + "loss": 0.2661, "step": 231400 }, { "epoch": 0.002315, - "grad_norm": 1.7092764377593994, + "grad_norm": 1.656404972076416, "learning_rate": 1e-05, - "loss": 0.271, + "loss": 0.2719, "step": 231500 }, { "epoch": 0.002316, - "grad_norm": 1.2994548082351685, + "grad_norm": 1.2319536209106445, "learning_rate": 1e-05, - "loss": 0.2628, + "loss": 0.2662, "step": 231600 }, { "epoch": 0.002317, - "grad_norm": 1.4784059524536133, + "grad_norm": 1.4529757499694824, "learning_rate": 1e-05, - "loss": 0.2649, + "loss": 0.2668, "step": 231700 }, { "epoch": 0.002318, - "grad_norm": 1.473240852355957, + "grad_norm": 1.3441050052642822, "learning_rate": 1e-05, - "loss": 0.2621, + "loss": 0.2664, "step": 231800 }, { "epoch": 0.002319, - "grad_norm": 1.2553753852844238, + "grad_norm": 1.3161287307739258, "learning_rate": 1e-05, - "loss": 0.2622, + "loss": 0.2664, "step": 231900 }, { "epoch": 0.00232, - "grad_norm": 1.5442348718643188, + "grad_norm": 1.5237764120101929, "learning_rate": 1e-05, - "loss": 0.264, + "loss": 0.2661, "step": 232000 }, { "epoch": 0.002321, - "grad_norm": 1.6117587089538574, + "grad_norm": 1.4569116830825806, "learning_rate": 1e-05, - "loss": 0.2641, + "loss": 0.2683, "step": 232100 }, { "epoch": 0.002322, - "grad_norm": 1.269234538078308, + "grad_norm": 1.4047276973724365, "learning_rate": 1e-05, - "loss": 0.2619, + "loss": 0.2658, "step": 232200 }, { "epoch": 0.002323, - "grad_norm": 1.7244213819503784, + "grad_norm": 1.6638625860214233, "learning_rate": 1e-05, - "loss": 0.2627, + "loss": 0.2632, "step": 232300 }, { "epoch": 0.002324, - "grad_norm": 1.218603491783142, + "grad_norm": 1.2283929586410522, "learning_rate": 1e-05, - "loss": 0.2641, + "loss": 0.2639, "step": 232400 }, { "epoch": 0.002325, - "grad_norm": 1.2990721464157104, + "grad_norm": 1.2033153772354126, "learning_rate": 1e-05, - "loss": 0.2658, + "loss": 0.2679, "step": 232500 }, { "epoch": 0.002326, - "grad_norm": 1.3279768228530884, + "grad_norm": 1.37595534324646, "learning_rate": 1e-05, - "loss": 0.2616, + "loss": 0.2671, "step": 232600 }, { "epoch": 0.002327, - "grad_norm": 1.482283115386963, + "grad_norm": 1.4282982349395752, "learning_rate": 1e-05, - "loss": 0.2629, + "loss": 0.2645, "step": 232700 }, { "epoch": 0.002328, - "grad_norm": 1.283960223197937, + "grad_norm": 1.3374396562576294, "learning_rate": 1e-05, - "loss": 0.2691, + "loss": 0.2702, "step": 232800 }, { "epoch": 0.002329, - "grad_norm": 1.2554558515548706, + "grad_norm": 1.2464615106582642, "learning_rate": 1e-05, - "loss": 0.2674, + "loss": 0.2704, "step": 232900 }, { "epoch": 0.00233, - "grad_norm": 1.3035228252410889, + "grad_norm": 1.4305086135864258, "learning_rate": 1e-05, - "loss": 0.2645, + "loss": 0.2675, "step": 233000 }, { "epoch": 0.002331, - "grad_norm": 1.2804003953933716, + "grad_norm": 1.404268503189087, "learning_rate": 1e-05, - "loss": 0.2646, + "loss": 0.2673, "step": 233100 }, { "epoch": 0.002332, - "grad_norm": 1.4150599241256714, + "grad_norm": 1.2417124509811401, "learning_rate": 1e-05, - "loss": 0.2645, + "loss": 0.2652, "step": 233200 }, { "epoch": 0.002333, - "grad_norm": 1.4224042892456055, + "grad_norm": 1.4305444955825806, "learning_rate": 1e-05, - "loss": 0.2636, + "loss": 0.264, "step": 233300 }, { "epoch": 0.002334, - "grad_norm": 1.4150424003601074, + "grad_norm": 1.4114798307418823, "learning_rate": 1e-05, - "loss": 0.2687, + "loss": 0.2721, "step": 233400 }, { "epoch": 0.002335, - "grad_norm": 1.3149292469024658, + "grad_norm": 1.5132521390914917, "learning_rate": 1e-05, - "loss": 0.2641, + "loss": 0.2662, "step": 233500 }, { "epoch": 0.002336, - "grad_norm": 1.3036744594573975, + "grad_norm": 1.2459546327590942, "learning_rate": 1e-05, - "loss": 0.2559, + "loss": 0.2608, "step": 233600 }, { "epoch": 0.002337, - "grad_norm": 1.3279225826263428, + "grad_norm": 1.3642733097076416, "learning_rate": 1e-05, - "loss": 0.2648, + "loss": 0.2673, "step": 233700 }, { "epoch": 0.002338, - "grad_norm": 1.228463888168335, + "grad_norm": 1.2351726293563843, "learning_rate": 1e-05, - "loss": 0.2621, + "loss": 0.2655, "step": 233800 }, { "epoch": 0.002339, - "grad_norm": 1.3678582906723022, + "grad_norm": 1.2862097024917603, "learning_rate": 1e-05, - "loss": 0.2697, + "loss": 0.2726, "step": 233900 }, { "epoch": 0.00234, - "grad_norm": 1.3787598609924316, + "grad_norm": 1.286604642868042, "learning_rate": 1e-05, - "loss": 0.2617, + "loss": 0.2626, "step": 234000 }, { "epoch": 0.002341, - "grad_norm": 1.555228590965271, + "grad_norm": 1.5336260795593262, "learning_rate": 1e-05, - "loss": 0.2627, + "loss": 0.2654, "step": 234100 }, { "epoch": 0.002342, - "grad_norm": 1.5529192686080933, + "grad_norm": 1.5840083360671997, "learning_rate": 1e-05, - "loss": 0.2665, + "loss": 0.2664, "step": 234200 }, { "epoch": 0.002343, - "grad_norm": 1.4861159324645996, + "grad_norm": 1.7138983011245728, "learning_rate": 1e-05, - "loss": 0.2594, + "loss": 0.2649, "step": 234300 }, { "epoch": 0.002344, - "grad_norm": 1.5626715421676636, + "grad_norm": 1.624403953552246, "learning_rate": 1e-05, - "loss": 0.2601, + "loss": 0.2604, "step": 234400 }, { "epoch": 0.002345, - "grad_norm": 1.320625901222229, + "grad_norm": 1.43915593624115, "learning_rate": 1e-05, - "loss": 0.2635, + "loss": 0.2656, "step": 234500 }, { "epoch": 0.002346, - "grad_norm": 1.521616816520691, + "grad_norm": 1.4849474430084229, "learning_rate": 1e-05, - "loss": 0.2642, + "loss": 0.2651, "step": 234600 }, { "epoch": 0.002347, - "grad_norm": 1.5700000524520874, + "grad_norm": 1.5203824043273926, "learning_rate": 1e-05, - "loss": 0.2607, + "loss": 0.2628, "step": 234700 }, { "epoch": 0.002348, - "grad_norm": 1.406499981880188, + "grad_norm": 1.2643849849700928, "learning_rate": 1e-05, - "loss": 0.2628, + "loss": 0.2661, "step": 234800 }, { "epoch": 0.002349, - "grad_norm": 1.3192278146743774, + "grad_norm": 1.2797199487686157, "learning_rate": 1e-05, - "loss": 0.2629, + "loss": 0.2624, "step": 234900 }, { "epoch": 0.00235, - "grad_norm": 1.389024019241333, + "grad_norm": 1.4327815771102905, "learning_rate": 1e-05, - "loss": 0.2636, + "loss": 0.2635, "step": 235000 }, { "epoch": 0.002351, - "grad_norm": 1.502858281135559, + "grad_norm": 1.5249953269958496, "learning_rate": 1e-05, - "loss": 0.2602, + "loss": 0.265, "step": 235100 }, { "epoch": 0.002352, - "grad_norm": 1.3187223672866821, + "grad_norm": 1.46501624584198, "learning_rate": 1e-05, - "loss": 0.2604, + "loss": 0.2676, "step": 235200 }, { "epoch": 0.002353, - "grad_norm": 1.6255407333374023, + "grad_norm": 1.5722047090530396, "learning_rate": 1e-05, - "loss": 0.2565, + "loss": 0.2574, "step": 235300 }, { "epoch": 0.002354, - "grad_norm": 1.2539702653884888, + "grad_norm": 1.2025928497314453, "learning_rate": 1e-05, - "loss": 0.2641, + "loss": 0.2671, "step": 235400 }, { "epoch": 0.002355, - "grad_norm": 1.321232557296753, + "grad_norm": 1.554432988166809, "learning_rate": 1e-05, - "loss": 0.2675, + "loss": 0.2703, "step": 235500 }, { "epoch": 0.002356, - "grad_norm": 1.3780826330184937, + "grad_norm": 1.4228944778442383, "learning_rate": 1e-05, - "loss": 0.2715, + "loss": 0.2728, "step": 235600 }, { "epoch": 0.002357, - "grad_norm": 1.315024495124817, + "grad_norm": 1.5761821269989014, "learning_rate": 1e-05, - "loss": 0.2626, + "loss": 0.2637, "step": 235700 }, { "epoch": 0.002358, - "grad_norm": 1.3643429279327393, + "grad_norm": 1.1505320072174072, "learning_rate": 1e-05, - "loss": 0.2668, + "loss": 0.2675, "step": 235800 }, { "epoch": 0.002359, - "grad_norm": 1.4477214813232422, + "grad_norm": 1.4779144525527954, "learning_rate": 1e-05, - "loss": 0.2693, + "loss": 0.2716, "step": 235900 }, { "epoch": 0.00236, - "grad_norm": 1.356468677520752, + "grad_norm": 1.3939759731292725, "learning_rate": 1e-05, - "loss": 0.2612, + "loss": 0.2616, "step": 236000 }, { "epoch": 0.002361, - "grad_norm": 1.3910270929336548, + "grad_norm": 1.5327190160751343, "learning_rate": 1e-05, - "loss": 0.2638, + "loss": 0.263, "step": 236100 }, { "epoch": 0.002362, - "grad_norm": 1.2352834939956665, + "grad_norm": 1.338335394859314, "learning_rate": 1e-05, - "loss": 0.2668, + "loss": 0.2696, "step": 236200 }, { "epoch": 0.002363, - "grad_norm": 1.416629672050476, + "grad_norm": 1.4766395092010498, "learning_rate": 1e-05, - "loss": 0.2648, + "loss": 0.2657, "step": 236300 }, { "epoch": 0.002364, - "grad_norm": 1.2549015283584595, + "grad_norm": 1.292716145515442, "learning_rate": 1e-05, - "loss": 0.2599, + "loss": 0.2611, "step": 236400 }, { "epoch": 0.002365, - "grad_norm": 1.3216965198516846, + "grad_norm": 1.3678349256515503, "learning_rate": 1e-05, - "loss": 0.2629, + "loss": 0.2663, "step": 236500 }, { "epoch": 0.002366, - "grad_norm": 1.429036259651184, + "grad_norm": 1.5033866167068481, "learning_rate": 1e-05, - "loss": 0.266, + "loss": 0.2693, "step": 236600 }, { "epoch": 0.002367, - "grad_norm": 1.1952705383300781, + "grad_norm": 1.1508349180221558, "learning_rate": 1e-05, - "loss": 0.2601, + "loss": 0.2631, "step": 236700 }, { "epoch": 0.002368, - "grad_norm": 1.5837066173553467, + "grad_norm": 1.627375602722168, "learning_rate": 1e-05, - "loss": 0.265, + "loss": 0.2661, "step": 236800 }, { "epoch": 0.002369, - "grad_norm": 1.5196020603179932, + "grad_norm": 1.5816211700439453, "learning_rate": 1e-05, - "loss": 0.2637, + "loss": 0.2655, "step": 236900 }, { "epoch": 0.00237, - "grad_norm": 1.6510149240493774, + "grad_norm": 1.6481306552886963, "learning_rate": 1e-05, - "loss": 0.2609, + "loss": 0.2649, "step": 237000 }, { "epoch": 0.002371, - "grad_norm": 1.6215910911560059, + "grad_norm": 1.6917251348495483, "learning_rate": 1e-05, "loss": 0.2653, "step": 237100 }, { "epoch": 0.002372, - "grad_norm": 1.1515642404556274, + "grad_norm": 2.067993640899658, "learning_rate": 1e-05, - "loss": 0.2594, + "loss": 0.2593, "step": 237200 }, { "epoch": 0.002373, - "grad_norm": 1.3704147338867188, + "grad_norm": 1.3310753107070923, "learning_rate": 1e-05, - "loss": 0.2624, + "loss": 0.2667, "step": 237300 }, { "epoch": 0.002374, - "grad_norm": 1.3209127187728882, + "grad_norm": 1.6005228757858276, "learning_rate": 1e-05, - "loss": 0.2666, + "loss": 0.2658, "step": 237400 }, { "epoch": 0.002375, - "grad_norm": 1.4141191244125366, + "grad_norm": 1.285658836364746, "learning_rate": 1e-05, - "loss": 0.2559, + "loss": 0.2605, "step": 237500 }, { "epoch": 0.002376, - "grad_norm": 1.3395062685012817, + "grad_norm": 1.37336003780365, "learning_rate": 1e-05, - "loss": 0.2686, + "loss": 0.2691, "step": 237600 }, { "epoch": 0.002377, - "grad_norm": 1.380592703819275, + "grad_norm": 1.3081128597259521, "learning_rate": 1e-05, - "loss": 0.2565, + "loss": 0.2597, "step": 237700 }, { "epoch": 0.002378, - "grad_norm": 1.498141884803772, + "grad_norm": 1.4721745252609253, "learning_rate": 1e-05, - "loss": 0.267, + "loss": 0.2679, "step": 237800 }, { "epoch": 0.002379, - "grad_norm": 1.5127662420272827, + "grad_norm": 1.4306268692016602, "learning_rate": 1e-05, - "loss": 0.2648, + "loss": 0.267, "step": 237900 }, { "epoch": 0.00238, - "grad_norm": 1.3514988422393799, + "grad_norm": 1.323388934135437, "learning_rate": 1e-05, - "loss": 0.2592, + "loss": 0.2656, "step": 238000 }, { "epoch": 0.002381, - "grad_norm": 1.5719401836395264, + "grad_norm": 1.5063282251358032, "learning_rate": 1e-05, - "loss": 0.2645, + "loss": 0.2647, "step": 238100 }, { "epoch": 0.002382, - "grad_norm": 1.3926323652267456, + "grad_norm": 1.50105619430542, "learning_rate": 1e-05, - "loss": 0.2602, + "loss": 0.2627, "step": 238200 }, { "epoch": 0.002383, - "grad_norm": 1.1819911003112793, + "grad_norm": 1.325971245765686, "learning_rate": 1e-05, - "loss": 0.265, + "loss": 0.2691, "step": 238300 }, { "epoch": 0.002384, - "grad_norm": 1.4454882144927979, + "grad_norm": 1.3668450117111206, "learning_rate": 1e-05, - "loss": 0.2595, + "loss": 0.2607, "step": 238400 }, { "epoch": 0.002385, - "grad_norm": 1.5571578741073608, + "grad_norm": 1.299739122390747, "learning_rate": 1e-05, - "loss": 0.263, + "loss": 0.2658, "step": 238500 }, { "epoch": 0.002386, - "grad_norm": 1.6916178464889526, + "grad_norm": 1.3577795028686523, "learning_rate": 1e-05, - "loss": 0.2643, + "loss": 0.267, "step": 238600 }, { "epoch": 0.002387, - "grad_norm": 1.394270658493042, + "grad_norm": 2.8680946826934814, "learning_rate": 1e-05, - "loss": 0.2642, + "loss": 0.2659, "step": 238700 }, { "epoch": 0.002388, - "grad_norm": 1.093780279159546, + "grad_norm": 1.344599962234497, "learning_rate": 1e-05, - "loss": 0.2679, + "loss": 0.2677, "step": 238800 }, { "epoch": 0.002389, - "grad_norm": 1.2436660528182983, + "grad_norm": 1.3766945600509644, "learning_rate": 1e-05, - "loss": 0.2644, + "loss": 0.269, "step": 238900 }, { "epoch": 0.00239, - "grad_norm": 1.3904812335968018, + "grad_norm": 1.4144660234451294, "learning_rate": 1e-05, - "loss": 0.2561, + "loss": 0.2602, "step": 239000 }, { "epoch": 0.002391, - "grad_norm": 1.4332994222640991, + "grad_norm": 1.5478261709213257, "learning_rate": 1e-05, - "loss": 0.2629, + "loss": 0.2665, "step": 239100 }, { "epoch": 0.002392, - "grad_norm": 1.5054080486297607, + "grad_norm": 2.6023824214935303, "learning_rate": 1e-05, - "loss": 0.2646, + "loss": 0.267, "step": 239200 }, { "epoch": 0.002393, - "grad_norm": 1.5768684148788452, + "grad_norm": 1.681246042251587, "learning_rate": 1e-05, - "loss": 0.2596, + "loss": 0.2617, "step": 239300 }, { "epoch": 0.002394, - "grad_norm": 1.3104645013809204, + "grad_norm": 1.3988147974014282, "learning_rate": 1e-05, - "loss": 0.2636, + "loss": 0.2642, "step": 239400 }, { "epoch": 0.002395, - "grad_norm": 1.2154474258422852, + "grad_norm": 1.481540560722351, "learning_rate": 1e-05, - "loss": 0.2627, + "loss": 0.2671, "step": 239500 }, { "epoch": 0.002396, - "grad_norm": 1.2982553243637085, + "grad_norm": 1.4419687986373901, "learning_rate": 1e-05, - "loss": 0.2569, + "loss": 0.2587, "step": 239600 }, { "epoch": 0.002397, - "grad_norm": 1.4037916660308838, + "grad_norm": 1.3948259353637695, "learning_rate": 1e-05, - "loss": 0.2654, + "loss": 0.2675, "step": 239700 }, { "epoch": 0.002398, - "grad_norm": 1.4082489013671875, + "grad_norm": 1.5281838178634644, "learning_rate": 1e-05, - "loss": 0.2615, + "loss": 0.2645, "step": 239800 }, { "epoch": 0.002399, - "grad_norm": 1.6160800457000732, + "grad_norm": 1.4255881309509277, "learning_rate": 1e-05, - "loss": 0.2633, + "loss": 0.2655, "step": 239900 }, { "epoch": 0.0024, - "grad_norm": 1.3580223321914673, + "grad_norm": 1.332002878189087, "learning_rate": 1e-05, - "loss": 0.2634, + "loss": 0.2671, "step": 240000 }, { "epoch": 0.0024, - "eval_loss": 0.2374267578125, - "eval_runtime": 99.0066, - "eval_samples_per_second": 505.017, - "eval_steps_per_second": 31.564, + "eval_loss": 0.239013671875, + "eval_runtime": 114.9087, + "eval_samples_per_second": 435.128, + "eval_steps_per_second": 27.195, "step": 240000 - }, - { - "epoch": 0.002401, - "grad_norm": 1.6873232126235962, - "learning_rate": 1e-05, - "loss": 0.2524, - "step": 240100 - }, - { - "epoch": 0.002402, - "grad_norm": 1.5831952095031738, - "learning_rate": 1e-05, - "loss": 0.2584, - "step": 240200 - }, - { - "epoch": 0.002403, - "grad_norm": 1.4643056392669678, - "learning_rate": 1e-05, - "loss": 0.266, - "step": 240300 - }, - { - "epoch": 0.002404, - "grad_norm": 1.3002352714538574, - "learning_rate": 1e-05, - "loss": 0.2605, - "step": 240400 - }, - { - "epoch": 0.002405, - "grad_norm": 1.3950726985931396, - "learning_rate": 1e-05, - "loss": 0.2625, - "step": 240500 - }, - { - "epoch": 0.002406, - "grad_norm": 1.2444671392440796, - "learning_rate": 1e-05, - "loss": 0.2633, - "step": 240600 - }, - { - "epoch": 0.002407, - "grad_norm": 1.4205822944641113, - "learning_rate": 1e-05, - "loss": 0.2676, - "step": 240700 - }, - { - "epoch": 0.002408, - "grad_norm": 1.366444706916809, - "learning_rate": 1e-05, - "loss": 0.2633, - "step": 240800 - }, - { - "epoch": 0.002409, - "grad_norm": 1.349879503250122, - "learning_rate": 1e-05, - "loss": 0.2613, - "step": 240900 - }, - { - "epoch": 0.00241, - "grad_norm": 1.456283688545227, - "learning_rate": 1e-05, - "loss": 0.2611, - "step": 241000 - }, - { - "epoch": 0.002411, - "grad_norm": 1.406483769416809, - "learning_rate": 1e-05, - "loss": 0.2615, - "step": 241100 - }, - { - "epoch": 0.002412, - "grad_norm": 1.261183738708496, - "learning_rate": 1e-05, - "loss": 0.259, - "step": 241200 - }, - { - "epoch": 0.002413, - "grad_norm": 1.306257724761963, - "learning_rate": 1e-05, - "loss": 0.2637, - "step": 241300 - }, - { - "epoch": 0.002414, - "grad_norm": 1.4715845584869385, - "learning_rate": 1e-05, - "loss": 0.2602, - "step": 241400 - }, - { - "epoch": 0.002415, - "grad_norm": 1.716931939125061, - "learning_rate": 1e-05, - "loss": 0.2543, - "step": 241500 - }, - { - "epoch": 0.002416, - "grad_norm": 1.4704197645187378, - "learning_rate": 1e-05, - "loss": 0.2612, - "step": 241600 - }, - { - "epoch": 0.002417, - "grad_norm": 1.2537198066711426, - "learning_rate": 1e-05, - "loss": 0.2625, - "step": 241700 - }, - { - "epoch": 0.002418, - "grad_norm": 1.543087124824524, - "learning_rate": 1e-05, - "loss": 0.2612, - "step": 241800 - }, - { - "epoch": 0.002419, - "grad_norm": 1.1939878463745117, - "learning_rate": 1e-05, - "loss": 0.2525, - "step": 241900 - }, - { - "epoch": 0.00242, - "grad_norm": 1.4119691848754883, - "learning_rate": 1e-05, - "loss": 0.2711, - "step": 242000 - }, - { - "epoch": 0.002421, - "grad_norm": 1.4125303030014038, - "learning_rate": 1e-05, - "loss": 0.2589, - "step": 242100 - }, - { - "epoch": 0.002422, - "grad_norm": 1.341392993927002, - "learning_rate": 1e-05, - "loss": 0.2598, - "step": 242200 - }, - { - "epoch": 0.002423, - "grad_norm": 1.237683653831482, - "learning_rate": 1e-05, - "loss": 0.2566, - "step": 242300 - }, - { - "epoch": 0.002424, - "grad_norm": 1.3581604957580566, - "learning_rate": 1e-05, - "loss": 0.2584, - "step": 242400 - }, - { - "epoch": 0.002425, - "grad_norm": 1.281236171722412, - "learning_rate": 1e-05, - "loss": 0.2584, - "step": 242500 - }, - { - "epoch": 0.002426, - "grad_norm": 1.415809988975525, - "learning_rate": 1e-05, - "loss": 0.2593, - "step": 242600 - }, - { - "epoch": 0.002427, - "grad_norm": 1.3727725744247437, - "learning_rate": 1e-05, - "loss": 0.2571, - "step": 242700 - }, - { - "epoch": 0.002428, - "grad_norm": 1.4346425533294678, - "learning_rate": 1e-05, - "loss": 0.256, - "step": 242800 - }, - { - "epoch": 0.002429, - "grad_norm": 1.489984154701233, - "learning_rate": 1e-05, - "loss": 0.2679, - "step": 242900 - }, - { - "epoch": 0.00243, - "grad_norm": 1.287827491760254, - "learning_rate": 1e-05, - "loss": 0.262, - "step": 243000 - }, - { - "epoch": 0.002431, - "grad_norm": 1.237215518951416, - "learning_rate": 1e-05, - "loss": 0.2566, - "step": 243100 - }, - { - "epoch": 0.002432, - "grad_norm": 1.4607627391815186, - "learning_rate": 1e-05, - "loss": 0.2665, - "step": 243200 - }, - { - "epoch": 0.002433, - "grad_norm": 1.623380184173584, - "learning_rate": 1e-05, - "loss": 0.261, - "step": 243300 - }, - { - "epoch": 0.002434, - "grad_norm": 1.6318728923797607, - "learning_rate": 1e-05, - "loss": 0.2597, - "step": 243400 - }, - { - "epoch": 0.002435, - "grad_norm": 1.3361358642578125, - "learning_rate": 1e-05, - "loss": 0.2631, - "step": 243500 - }, - { - "epoch": 0.002436, - "grad_norm": 1.4091212749481201, - "learning_rate": 1e-05, - "loss": 0.2537, - "step": 243600 - }, - { - "epoch": 0.002437, - "grad_norm": 1.5947695970535278, - "learning_rate": 1e-05, - "loss": 0.2617, - "step": 243700 - }, - { - "epoch": 0.002438, - "grad_norm": 1.224189281463623, - "learning_rate": 1e-05, - "loss": 0.2546, - "step": 243800 - }, - { - "epoch": 0.002439, - "grad_norm": 1.45286226272583, - "learning_rate": 1e-05, - "loss": 0.2604, - "step": 243900 - }, - { - "epoch": 0.00244, - "grad_norm": 1.122475504875183, - "learning_rate": 1e-05, - "loss": 0.2608, - "step": 244000 - }, - { - "epoch": 0.002441, - "grad_norm": 1.402537226676941, - "learning_rate": 1e-05, - "loss": 0.2648, - "step": 244100 - }, - { - "epoch": 0.002442, - "grad_norm": 1.8614498376846313, - "learning_rate": 1e-05, - "loss": 0.2592, - "step": 244200 - }, - { - "epoch": 0.002443, - "grad_norm": 1.365920066833496, - "learning_rate": 1e-05, - "loss": 0.2557, - "step": 244300 - }, - { - "epoch": 0.002444, - "grad_norm": 1.2067946195602417, - "learning_rate": 1e-05, - "loss": 0.2561, - "step": 244400 - }, - { - "epoch": 0.002445, - "grad_norm": 1.3701143264770508, - "learning_rate": 1e-05, - "loss": 0.2661, - "step": 244500 - }, - { - "epoch": 0.002446, - "grad_norm": 1.3481754064559937, - "learning_rate": 1e-05, - "loss": 0.2574, - "step": 244600 - }, - { - "epoch": 0.002447, - "grad_norm": 1.4324839115142822, - "learning_rate": 1e-05, - "loss": 0.2585, - "step": 244700 - }, - { - "epoch": 0.002448, - "grad_norm": 1.3916932344436646, - "learning_rate": 1e-05, - "loss": 0.259, - "step": 244800 - }, - { - "epoch": 0.002449, - "grad_norm": 1.0971498489379883, - "learning_rate": 1e-05, - "loss": 0.2558, - "step": 244900 - }, - { - "epoch": 0.00245, - "grad_norm": 1.1850725412368774, - "learning_rate": 1e-05, - "loss": 0.258, - "step": 245000 - }, - { - "epoch": 0.002451, - "grad_norm": 1.2461484670639038, - "learning_rate": 1e-05, - "loss": 0.2559, - "step": 245100 - }, - { - "epoch": 0.002452, - "grad_norm": 1.3085113763809204, - "learning_rate": 1e-05, - "loss": 0.2611, - "step": 245200 - }, - { - "epoch": 0.002453, - "grad_norm": 1.3492810726165771, - "learning_rate": 1e-05, - "loss": 0.2534, - "step": 245300 - }, - { - "epoch": 0.002454, - "grad_norm": 1.6019471883773804, - "learning_rate": 1e-05, - "loss": 0.2597, - "step": 245400 - }, - { - "epoch": 0.002455, - "grad_norm": 3.4496567249298096, - "learning_rate": 1e-05, - "loss": 0.2644, - "step": 245500 - }, - { - "epoch": 0.002456, - "grad_norm": 1.407660961151123, - "learning_rate": 1e-05, - "loss": 0.2552, - "step": 245600 - }, - { - "epoch": 0.002457, - "grad_norm": 1.327666997909546, - "learning_rate": 1e-05, - "loss": 0.2594, - "step": 245700 - }, - { - "epoch": 0.002458, - "grad_norm": 1.679745078086853, - "learning_rate": 1e-05, - "loss": 0.2584, - "step": 245800 - }, - { - "epoch": 0.002459, - "grad_norm": 1.0852830410003662, - "learning_rate": 1e-05, - "loss": 0.2564, - "step": 245900 - }, - { - "epoch": 0.00246, - "grad_norm": 1.4752178192138672, - "learning_rate": 1e-05, - "loss": 0.2641, - "step": 246000 - }, - { - "epoch": 0.002461, - "grad_norm": 1.1382917165756226, - "learning_rate": 1e-05, - "loss": 0.2574, - "step": 246100 - }, - { - "epoch": 0.002462, - "grad_norm": 1.302988886833191, - "learning_rate": 1e-05, - "loss": 0.2633, - "step": 246200 - }, - { - "epoch": 0.002463, - "grad_norm": 1.2887983322143555, - "learning_rate": 1e-05, - "loss": 0.2598, - "step": 246300 - }, - { - "epoch": 0.002464, - "grad_norm": 1.2620441913604736, - "learning_rate": 1e-05, - "loss": 0.2593, - "step": 246400 - }, - { - "epoch": 0.002465, - "grad_norm": 1.200119137763977, - "learning_rate": 1e-05, - "loss": 0.2576, - "step": 246500 - }, - { - "epoch": 0.002466, - "grad_norm": 1.4056912660598755, - "learning_rate": 1e-05, - "loss": 0.2591, - "step": 246600 - }, - { - "epoch": 0.002467, - "grad_norm": 1.4518241882324219, - "learning_rate": 1e-05, - "loss": 0.2555, - "step": 246700 - }, - { - "epoch": 0.002468, - "grad_norm": 1.0981643199920654, - "learning_rate": 1e-05, - "loss": 0.2655, - "step": 246800 - }, - { - "epoch": 0.002469, - "grad_norm": 1.3863120079040527, - "learning_rate": 1e-05, - "loss": 0.2578, - "step": 246900 - }, - { - "epoch": 0.00247, - "grad_norm": 1.4632368087768555, - "learning_rate": 1e-05, - "loss": 0.2627, - "step": 247000 - }, - { - "epoch": 0.002471, - "grad_norm": 1.3600306510925293, - "learning_rate": 1e-05, - "loss": 0.2628, - "step": 247100 - }, - { - "epoch": 0.002472, - "grad_norm": 1.1578032970428467, - "learning_rate": 1e-05, - "loss": 0.2554, - "step": 247200 - }, - { - "epoch": 0.002473, - "grad_norm": 1.5791317224502563, - "learning_rate": 1e-05, - "loss": 0.2635, - "step": 247300 - }, - { - "epoch": 0.002474, - "grad_norm": 1.4354877471923828, - "learning_rate": 1e-05, - "loss": 0.2622, - "step": 247400 - }, - { - "epoch": 0.002475, - "grad_norm": 1.2446706295013428, - "learning_rate": 1e-05, - "loss": 0.2579, - "step": 247500 - }, - { - "epoch": 0.002476, - "grad_norm": 1.2223926782608032, - "learning_rate": 1e-05, - "loss": 0.2563, - "step": 247600 - }, - { - "epoch": 0.002477, - "grad_norm": 1.417852520942688, - "learning_rate": 1e-05, - "loss": 0.2565, - "step": 247700 - }, - { - "epoch": 0.002478, - "grad_norm": 1.731817364692688, - "learning_rate": 1e-05, - "loss": 0.259, - "step": 247800 - }, - { - "epoch": 0.002479, - "grad_norm": 1.4741661548614502, - "learning_rate": 1e-05, - "loss": 0.2563, - "step": 247900 - }, - { - "epoch": 0.00248, - "grad_norm": 1.38791823387146, - "learning_rate": 1e-05, - "loss": 0.251, - "step": 248000 - }, - { - "epoch": 0.002481, - "grad_norm": 1.2169920206069946, - "learning_rate": 1e-05, - "loss": 0.26, - "step": 248100 - }, - { - "epoch": 0.002482, - "grad_norm": 1.4752185344696045, - "learning_rate": 1e-05, - "loss": 0.2541, - "step": 248200 - }, - { - "epoch": 0.002483, - "grad_norm": 1.5729482173919678, - "learning_rate": 1e-05, - "loss": 0.2653, - "step": 248300 - }, - { - "epoch": 0.002484, - "grad_norm": 1.310727834701538, - "learning_rate": 1e-05, - "loss": 0.2602, - "step": 248400 - }, - { - "epoch": 0.002485, - "grad_norm": 1.2637379169464111, - "learning_rate": 1e-05, - "loss": 0.259, - "step": 248500 - }, - { - "epoch": 0.002486, - "grad_norm": 1.365087866783142, - "learning_rate": 1e-05, - "loss": 0.2573, - "step": 248600 - }, - { - "epoch": 0.002487, - "grad_norm": 1.503307580947876, - "learning_rate": 1e-05, - "loss": 0.2611, - "step": 248700 - }, - { - "epoch": 0.002488, - "grad_norm": 1.1937447786331177, - "learning_rate": 1e-05, - "loss": 0.2591, - "step": 248800 - }, - { - "epoch": 0.002489, - "grad_norm": 1.3836543560028076, - "learning_rate": 1e-05, - "loss": 0.2532, - "step": 248900 - }, - { - "epoch": 0.00249, - "grad_norm": 2.1695077419281006, - "learning_rate": 1e-05, - "loss": 0.2554, - "step": 249000 - }, - { - "epoch": 0.002491, - "grad_norm": 1.4748849868774414, - "learning_rate": 1e-05, - "loss": 0.26, - "step": 249100 - }, - { - "epoch": 0.002492, - "grad_norm": 1.3633815050125122, - "learning_rate": 1e-05, - "loss": 0.2621, - "step": 249200 - }, - { - "epoch": 0.002493, - "grad_norm": 1.4418655633926392, - "learning_rate": 1e-05, - "loss": 0.2526, - "step": 249300 - }, - { - "epoch": 0.002494, - "grad_norm": 1.3128098249435425, - "learning_rate": 1e-05, - "loss": 0.2576, - "step": 249400 - }, - { - "epoch": 0.002495, - "grad_norm": 1.4768147468566895, - "learning_rate": 1e-05, - "loss": 0.2634, - "step": 249500 - }, - { - "epoch": 0.002496, - "grad_norm": 1.3567637205123901, - "learning_rate": 1e-05, - "loss": 0.2577, - "step": 249600 - }, - { - "epoch": 0.002497, - "grad_norm": 1.3782933950424194, - "learning_rate": 1e-05, - "loss": 0.2512, - "step": 249700 - }, - { - "epoch": 0.002498, - "grad_norm": 1.5667747259140015, - "learning_rate": 1e-05, - "loss": 0.2602, - "step": 249800 - }, - { - "epoch": 0.002499, - "grad_norm": 1.3336353302001953, - "learning_rate": 1e-05, - "loss": 0.2588, - "step": 249900 - }, - { - "epoch": 0.0025, - "grad_norm": 1.6043107509613037, - "learning_rate": 1e-05, - "loss": 0.2606, - "step": 250000 - }, - { - "epoch": 0.002501, - "grad_norm": 1.2945209741592407, - "learning_rate": 1e-05, - "loss": 0.2624, - "step": 250100 - }, - { - "epoch": 0.002502, - "grad_norm": 1.3692872524261475, - "learning_rate": 1e-05, - "loss": 0.2597, - "step": 250200 - }, - { - "epoch": 0.002503, - "grad_norm": 1.4170732498168945, - "learning_rate": 1e-05, - "loss": 0.2528, - "step": 250300 - }, - { - "epoch": 0.002504, - "grad_norm": 1.4010334014892578, - "learning_rate": 1e-05, - "loss": 0.2622, - "step": 250400 - }, - { - "epoch": 0.002505, - "grad_norm": 1.4450767040252686, - "learning_rate": 1e-05, - "loss": 0.2552, - "step": 250500 - }, - { - "epoch": 0.002506, - "grad_norm": 1.1123557090759277, - "learning_rate": 1e-05, - "loss": 0.2592, - "step": 250600 - }, - { - "epoch": 0.002507, - "grad_norm": 1.0991851091384888, - "learning_rate": 1e-05, - "loss": 0.25, - "step": 250700 - }, - { - "epoch": 0.002508, - "grad_norm": 1.523680329322815, - "learning_rate": 1e-05, - "loss": 0.2587, - "step": 250800 - }, - { - "epoch": 0.002509, - "grad_norm": 1.2263822555541992, - "learning_rate": 1e-05, - "loss": 0.2535, - "step": 250900 - }, - { - "epoch": 0.00251, - "grad_norm": 1.4539568424224854, - "learning_rate": 1e-05, - "loss": 0.2609, - "step": 251000 - }, - { - "epoch": 0.002511, - "grad_norm": 1.3021104335784912, - "learning_rate": 1e-05, - "loss": 0.2505, - "step": 251100 - }, - { - "epoch": 0.002512, - "grad_norm": 1.6156435012817383, - "learning_rate": 1e-05, - "loss": 0.2568, - "step": 251200 - }, - { - "epoch": 0.002513, - "grad_norm": 1.1429095268249512, - "learning_rate": 1e-05, - "loss": 0.2548, - "step": 251300 - }, - { - "epoch": 0.002514, - "grad_norm": 1.317590355873108, - "learning_rate": 1e-05, - "loss": 0.2544, - "step": 251400 - }, - { - "epoch": 0.002515, - "grad_norm": 1.390028953552246, - "learning_rate": 1e-05, - "loss": 0.2558, - "step": 251500 - }, - { - "epoch": 0.002516, - "grad_norm": 1.2217508554458618, - "learning_rate": 1e-05, - "loss": 0.2594, - "step": 251600 - }, - { - "epoch": 0.002517, - "grad_norm": 1.3467427492141724, - "learning_rate": 1e-05, - "loss": 0.2604, - "step": 251700 - }, - { - "epoch": 0.002518, - "grad_norm": 1.3831214904785156, - "learning_rate": 1e-05, - "loss": 0.26, - "step": 251800 - }, - { - "epoch": 0.002519, - "grad_norm": 1.4048727750778198, - "learning_rate": 1e-05, - "loss": 0.2595, - "step": 251900 - }, - { - "epoch": 0.00252, - "grad_norm": 1.3685673475265503, - "learning_rate": 1e-05, - "loss": 0.259, - "step": 252000 - }, - { - "epoch": 0.002521, - "grad_norm": 1.159077525138855, - "learning_rate": 1e-05, - "loss": 0.2563, - "step": 252100 - }, - { - "epoch": 0.002522, - "grad_norm": 1.2651586532592773, - "learning_rate": 1e-05, - "loss": 0.2574, - "step": 252200 - }, - { - "epoch": 0.002523, - "grad_norm": 1.3214409351348877, - "learning_rate": 1e-05, - "loss": 0.2556, - "step": 252300 - }, - { - "epoch": 0.002524, - "grad_norm": 1.228649377822876, - "learning_rate": 1e-05, - "loss": 0.2509, - "step": 252400 - }, - { - "epoch": 0.002525, - "grad_norm": 1.1997714042663574, - "learning_rate": 1e-05, - "loss": 0.2606, - "step": 252500 - }, - { - "epoch": 0.002526, - "grad_norm": 1.4123938083648682, - "learning_rate": 1e-05, - "loss": 0.2559, - "step": 252600 - }, - { - "epoch": 0.002527, - "grad_norm": 1.4667912721633911, - "learning_rate": 1e-05, - "loss": 0.2554, - "step": 252700 - }, - { - "epoch": 0.002528, - "grad_norm": 1.8115984201431274, - "learning_rate": 1e-05, - "loss": 0.259, - "step": 252800 - }, - { - "epoch": 0.002529, - "grad_norm": 1.2667086124420166, - "learning_rate": 1e-05, - "loss": 0.2543, - "step": 252900 - }, - { - "epoch": 0.00253, - "grad_norm": 1.5103466510772705, - "learning_rate": 1e-05, - "loss": 0.2547, - "step": 253000 - }, - { - "epoch": 0.002531, - "grad_norm": 1.1528403759002686, - "learning_rate": 1e-05, - "loss": 0.2527, - "step": 253100 - }, - { - "epoch": 0.002532, - "grad_norm": 1.388048768043518, - "learning_rate": 1e-05, - "loss": 0.263, - "step": 253200 - }, - { - "epoch": 0.002533, - "grad_norm": 1.3220521211624146, - "learning_rate": 1e-05, - "loss": 0.2639, - "step": 253300 - }, - { - "epoch": 0.002534, - "grad_norm": 1.4927985668182373, - "learning_rate": 1e-05, - "loss": 0.2531, - "step": 253400 - }, - { - "epoch": 0.002535, - "grad_norm": 1.7103698253631592, - "learning_rate": 1e-05, - "loss": 0.2623, - "step": 253500 - }, - { - "epoch": 0.002536, - "grad_norm": 1.3543158769607544, - "learning_rate": 1e-05, - "loss": 0.2584, - "step": 253600 - }, - { - "epoch": 0.002537, - "grad_norm": 1.5863299369812012, - "learning_rate": 1e-05, - "loss": 0.2593, - "step": 253700 - }, - { - "epoch": 0.002538, - "grad_norm": 1.0556745529174805, - "learning_rate": 1e-05, - "loss": 0.2567, - "step": 253800 - }, - { - "epoch": 0.002539, - "grad_norm": 1.2754689455032349, - "learning_rate": 1e-05, - "loss": 0.2584, - "step": 253900 - }, - { - "epoch": 0.00254, - "grad_norm": 1.337578535079956, - "learning_rate": 1e-05, - "loss": 0.2599, - "step": 254000 - }, - { - "epoch": 0.002541, - "grad_norm": 1.2153486013412476, - "learning_rate": 1e-05, - "loss": 0.2554, - "step": 254100 - }, - { - "epoch": 0.002542, - "grad_norm": 1.420327067375183, - "learning_rate": 1e-05, - "loss": 0.2563, - "step": 254200 - }, - { - "epoch": 0.002543, - "grad_norm": 1.3639488220214844, - "learning_rate": 1e-05, - "loss": 0.2501, - "step": 254300 - }, - { - "epoch": 0.002544, - "grad_norm": 1.2883007526397705, - "learning_rate": 1e-05, - "loss": 0.2595, - "step": 254400 - }, - { - "epoch": 0.002545, - "grad_norm": 1.211978554725647, - "learning_rate": 1e-05, - "loss": 0.2515, - "step": 254500 - }, - { - "epoch": 0.002546, - "grad_norm": 1.6221923828125, - "learning_rate": 1e-05, - "loss": 0.2568, - "step": 254600 - }, - { - "epoch": 0.002547, - "grad_norm": 1.3807469606399536, - "learning_rate": 1e-05, - "loss": 0.2571, - "step": 254700 - }, - { - "epoch": 0.002548, - "grad_norm": 1.3948322534561157, - "learning_rate": 1e-05, - "loss": 0.2529, - "step": 254800 - }, - { - "epoch": 0.002549, - "grad_norm": 1.7052496671676636, - "learning_rate": 1e-05, - "loss": 0.2563, - "step": 254900 - }, - { - "epoch": 0.00255, - "grad_norm": 1.2968933582305908, - "learning_rate": 1e-05, - "loss": 0.2561, - "step": 255000 - }, - { - "epoch": 0.002551, - "grad_norm": 1.354202151298523, - "learning_rate": 1e-05, - "loss": 0.2638, - "step": 255100 - }, - { - "epoch": 0.002552, - "grad_norm": 1.2567942142486572, - "learning_rate": 1e-05, - "loss": 0.2558, - "step": 255200 - }, - { - "epoch": 0.002553, - "grad_norm": 1.826661229133606, - "learning_rate": 1e-05, - "loss": 0.2589, - "step": 255300 - }, - { - "epoch": 0.002554, - "grad_norm": 1.3948241472244263, - "learning_rate": 1e-05, - "loss": 0.2618, - "step": 255400 - }, - { - "epoch": 0.002555, - "grad_norm": 1.2569594383239746, - "learning_rate": 1e-05, - "loss": 0.2535, - "step": 255500 - }, - { - "epoch": 0.002556, - "grad_norm": 1.1714115142822266, - "learning_rate": 1e-05, - "loss": 0.259, - "step": 255600 - }, - { - "epoch": 0.002557, - "grad_norm": 1.5077919960021973, - "learning_rate": 1e-05, - "loss": 0.2626, - "step": 255700 - }, - { - "epoch": 0.002558, - "grad_norm": 1.3146556615829468, - "learning_rate": 1e-05, - "loss": 0.252, - "step": 255800 - }, - { - "epoch": 0.002559, - "grad_norm": 1.155070424079895, - "learning_rate": 1e-05, - "loss": 0.2478, - "step": 255900 - }, - { - "epoch": 0.00256, - "grad_norm": 1.493503451347351, - "learning_rate": 1e-05, - "loss": 0.2509, - "step": 256000 - }, - { - "epoch": 0.002561, - "grad_norm": 1.2542399168014526, - "learning_rate": 1e-05, - "loss": 0.2555, - "step": 256100 - }, - { - "epoch": 0.002562, - "grad_norm": 1.374351978302002, - "learning_rate": 1e-05, - "loss": 0.2554, - "step": 256200 - }, - { - "epoch": 0.002563, - "grad_norm": 1.6055238246917725, - "learning_rate": 1e-05, - "loss": 0.2504, - "step": 256300 - }, - { - "epoch": 0.002564, - "grad_norm": 1.3827863931655884, - "learning_rate": 1e-05, - "loss": 0.2557, - "step": 256400 - }, - { - "epoch": 0.002565, - "grad_norm": 1.2286943197250366, - "learning_rate": 1e-05, - "loss": 0.2572, - "step": 256500 - }, - { - "epoch": 0.002566, - "grad_norm": 1.5171817541122437, - "learning_rate": 1e-05, - "loss": 0.2577, - "step": 256600 - }, - { - "epoch": 0.002567, - "grad_norm": 1.657822608947754, - "learning_rate": 1e-05, - "loss": 0.2544, - "step": 256700 - }, - { - "epoch": 0.002568, - "grad_norm": 1.1966177225112915, - "learning_rate": 1e-05, - "loss": 0.2517, - "step": 256800 - }, - { - "epoch": 0.002569, - "grad_norm": 1.2600973844528198, - "learning_rate": 1e-05, - "loss": 0.2578, - "step": 256900 - }, - { - "epoch": 0.00257, - "grad_norm": 1.3290305137634277, - "learning_rate": 1e-05, - "loss": 0.2584, - "step": 257000 - }, - { - "epoch": 0.002571, - "grad_norm": 1.3210926055908203, - "learning_rate": 1e-05, - "loss": 0.2515, - "step": 257100 - }, - { - "epoch": 0.002572, - "grad_norm": 2.118511199951172, - "learning_rate": 1e-05, - "loss": 0.2505, - "step": 257200 - }, - { - "epoch": 0.002573, - "grad_norm": 1.4024235010147095, - "learning_rate": 1e-05, - "loss": 0.2534, - "step": 257300 - }, - { - "epoch": 0.002574, - "grad_norm": 1.3937634229660034, - "learning_rate": 1e-05, - "loss": 0.2589, - "step": 257400 - }, - { - "epoch": 0.002575, - "grad_norm": 2.1172142028808594, - "learning_rate": 1e-05, - "loss": 0.2464, - "step": 257500 - }, - { - "epoch": 0.002576, - "grad_norm": 1.2907713651657104, - "learning_rate": 1e-05, - "loss": 0.2535, - "step": 257600 - }, - { - "epoch": 0.002577, - "grad_norm": 1.272306203842163, - "learning_rate": 1e-05, - "loss": 0.2502, - "step": 257700 - }, - { - "epoch": 0.002578, - "grad_norm": 1.2183401584625244, - "learning_rate": 1e-05, - "loss": 0.2419, - "step": 257800 - }, - { - "epoch": 0.002579, - "grad_norm": 1.2403819561004639, - "learning_rate": 1e-05, - "loss": 0.2532, - "step": 257900 - }, - { - "epoch": 0.00258, - "grad_norm": 1.1558672189712524, - "learning_rate": 1e-05, - "loss": 0.2563, - "step": 258000 - }, - { - "epoch": 0.002581, - "grad_norm": 1.3124866485595703, - "learning_rate": 1e-05, - "loss": 0.2574, - "step": 258100 - }, - { - "epoch": 0.002582, - "grad_norm": 1.4900401830673218, - "learning_rate": 1e-05, - "loss": 0.2608, - "step": 258200 - }, - { - "epoch": 0.002583, - "grad_norm": 1.5026384592056274, - "learning_rate": 1e-05, - "loss": 0.2531, - "step": 258300 - }, - { - "epoch": 0.002584, - "grad_norm": 1.2160420417785645, - "learning_rate": 1e-05, - "loss": 0.2517, - "step": 258400 - }, - { - "epoch": 0.002585, - "grad_norm": 1.3727961778640747, - "learning_rate": 1e-05, - "loss": 0.256, - "step": 258500 - }, - { - "epoch": 0.002586, - "grad_norm": 1.328669548034668, - "learning_rate": 1e-05, - "loss": 0.2522, - "step": 258600 - }, - { - "epoch": 0.002587, - "grad_norm": 1.3253076076507568, - "learning_rate": 1e-05, - "loss": 0.2527, - "step": 258700 - }, - { - "epoch": 0.002588, - "grad_norm": 1.32355797290802, - "learning_rate": 1e-05, - "loss": 0.2498, - "step": 258800 - }, - { - "epoch": 0.002589, - "grad_norm": 1.3386601209640503, - "learning_rate": 1e-05, - "loss": 0.2551, - "step": 258900 - }, - { - "epoch": 0.00259, - "grad_norm": 1.3468818664550781, - "learning_rate": 1e-05, - "loss": 0.2561, - "step": 259000 - }, - { - "epoch": 0.002591, - "grad_norm": 1.3587111234664917, - "learning_rate": 1e-05, - "loss": 0.2494, - "step": 259100 - }, - { - "epoch": 0.002592, - "grad_norm": 1.509482502937317, - "learning_rate": 1e-05, - "loss": 0.2492, - "step": 259200 - }, - { - "epoch": 0.002593, - "grad_norm": 1.302701711654663, - "learning_rate": 1e-05, - "loss": 0.2577, - "step": 259300 - }, - { - "epoch": 0.002594, - "grad_norm": 1.492660403251648, - "learning_rate": 1e-05, - "loss": 0.2558, - "step": 259400 - }, - { - "epoch": 0.002595, - "grad_norm": 1.187463402748108, - "learning_rate": 1e-05, - "loss": 0.2557, - "step": 259500 - }, - { - "epoch": 0.002596, - "grad_norm": 1.3792933225631714, - "learning_rate": 1e-05, - "loss": 0.257, - "step": 259600 - }, - { - "epoch": 0.002597, - "grad_norm": 1.2955125570297241, - "learning_rate": 1e-05, - "loss": 0.2548, - "step": 259700 - }, - { - "epoch": 0.002598, - "grad_norm": 1.2613247632980347, - "learning_rate": 1e-05, - "loss": 0.2484, - "step": 259800 - }, - { - "epoch": 0.002599, - "grad_norm": 1.4856301546096802, - "learning_rate": 1e-05, - "loss": 0.2486, - "step": 259900 - }, - { - "epoch": 0.0026, - "grad_norm": 1.4062153100967407, - "learning_rate": 1e-05, - "loss": 0.259, - "step": 260000 - }, - { - "epoch": 0.0026, - "eval_loss": 0.2310791015625, - "eval_runtime": 101.6679, - "eval_samples_per_second": 491.798, - "eval_steps_per_second": 30.737, - "step": 260000 - }, - { - "epoch": 0.002601, - "grad_norm": 1.3712681531906128, - "learning_rate": 1e-05, - "loss": 0.243, - "step": 260100 - }, - { - "epoch": 0.002602, - "grad_norm": 1.1989260911941528, - "learning_rate": 1e-05, - "loss": 0.2612, - "step": 260200 - }, - { - "epoch": 0.002603, - "grad_norm": 1.6630966663360596, - "learning_rate": 1e-05, - "loss": 0.2548, - "step": 260300 - }, - { - "epoch": 0.002604, - "grad_norm": 1.5054157972335815, - "learning_rate": 1e-05, - "loss": 0.2544, - "step": 260400 - }, - { - "epoch": 0.002605, - "grad_norm": 1.198943853378296, - "learning_rate": 1e-05, - "loss": 0.2535, - "step": 260500 - }, - { - "epoch": 0.002606, - "grad_norm": 1.1426377296447754, - "learning_rate": 1e-05, - "loss": 0.2576, - "step": 260600 - }, - { - "epoch": 0.002607, - "grad_norm": 1.3320115804672241, - "learning_rate": 1e-05, - "loss": 0.2608, - "step": 260700 - }, - { - "epoch": 0.002608, - "grad_norm": 1.1172069311141968, - "learning_rate": 1e-05, - "loss": 0.2503, - "step": 260800 - }, - { - "epoch": 0.002609, - "grad_norm": 1.3026396036148071, - "learning_rate": 1e-05, - "loss": 0.2474, - "step": 260900 - }, - { - "epoch": 0.00261, - "grad_norm": 2.2753891944885254, - "learning_rate": 1e-05, - "loss": 0.2518, - "step": 261000 - }, - { - "epoch": 0.002611, - "grad_norm": 1.420216679573059, - "learning_rate": 1e-05, - "loss": 0.2573, - "step": 261100 - }, - { - "epoch": 0.002612, - "grad_norm": 1.1930761337280273, - "learning_rate": 1e-05, - "loss": 0.2573, - "step": 261200 - }, - { - "epoch": 0.002613, - "grad_norm": 1.2439950704574585, - "learning_rate": 1e-05, - "loss": 0.2556, - "step": 261300 - }, - { - "epoch": 0.002614, - "grad_norm": 1.4804890155792236, - "learning_rate": 1e-05, - "loss": 0.2613, - "step": 261400 - }, - { - "epoch": 0.002615, - "grad_norm": 1.312845230102539, - "learning_rate": 1e-05, - "loss": 0.2601, - "step": 261500 - }, - { - "epoch": 0.002616, - "grad_norm": 1.5155603885650635, - "learning_rate": 1e-05, - "loss": 0.2495, - "step": 261600 - }, - { - "epoch": 0.002617, - "grad_norm": 1.7103291749954224, - "learning_rate": 1e-05, - "loss": 0.2478, - "step": 261700 - }, - { - "epoch": 0.002618, - "grad_norm": 1.2495895624160767, - "learning_rate": 1e-05, - "loss": 0.2518, - "step": 261800 - }, - { - "epoch": 0.002619, - "grad_norm": 1.3669322729110718, - "learning_rate": 1e-05, - "loss": 0.2552, - "step": 261900 - }, - { - "epoch": 0.00262, - "grad_norm": 1.177885890007019, - "learning_rate": 1e-05, - "loss": 0.2598, - "step": 262000 - }, - { - "epoch": 0.002621, - "grad_norm": 1.2433159351348877, - "learning_rate": 1e-05, - "loss": 0.2553, - "step": 262100 - }, - { - "epoch": 0.002622, - "grad_norm": 1.9729152917861938, - "learning_rate": 1e-05, - "loss": 0.2485, - "step": 262200 - }, - { - "epoch": 0.002623, - "grad_norm": 1.4853062629699707, - "learning_rate": 1e-05, - "loss": 0.2512, - "step": 262300 - }, - { - "epoch": 0.002624, - "grad_norm": 1.3633800745010376, - "learning_rate": 1e-05, - "loss": 0.2605, - "step": 262400 - }, - { - "epoch": 0.002625, - "grad_norm": 1.501418113708496, - "learning_rate": 1e-05, - "loss": 0.2496, - "step": 262500 - }, - { - "epoch": 0.002626, - "grad_norm": 2.107804775238037, - "learning_rate": 1e-05, - "loss": 0.2523, - "step": 262600 - }, - { - "epoch": 0.002627, - "grad_norm": 1.1937731504440308, - "learning_rate": 1e-05, - "loss": 0.2559, - "step": 262700 - }, - { - "epoch": 0.002628, - "grad_norm": 1.5096913576126099, - "learning_rate": 1e-05, - "loss": 0.2566, - "step": 262800 - }, - { - "epoch": 0.002629, - "grad_norm": 1.4642647504806519, - "learning_rate": 1e-05, - "loss": 0.2416, - "step": 262900 - }, - { - "epoch": 0.00263, - "grad_norm": 1.1869677305221558, - "learning_rate": 1e-05, - "loss": 0.2502, - "step": 263000 - }, - { - "epoch": 0.002631, - "grad_norm": 1.1233562231063843, - "learning_rate": 1e-05, - "loss": 0.2554, - "step": 263100 - }, - { - "epoch": 0.002632, - "grad_norm": 1.298271656036377, - "learning_rate": 1e-05, - "loss": 0.2521, - "step": 263200 - }, - { - "epoch": 0.002633, - "grad_norm": 1.257735252380371, - "learning_rate": 1e-05, - "loss": 0.2556, - "step": 263300 - }, - { - "epoch": 0.002634, - "grad_norm": 1.2978613376617432, - "learning_rate": 1e-05, - "loss": 0.2525, - "step": 263400 - }, - { - "epoch": 0.002635, - "grad_norm": 1.567204475402832, - "learning_rate": 1e-05, - "loss": 0.2581, - "step": 263500 - }, - { - "epoch": 0.002636, - "grad_norm": 1.4010810852050781, - "learning_rate": 1e-05, - "loss": 0.2554, - "step": 263600 - }, - { - "epoch": 0.002637, - "grad_norm": 1.3030946254730225, - "learning_rate": 1e-05, - "loss": 0.2504, - "step": 263700 - }, - { - "epoch": 0.002638, - "grad_norm": 1.4009398221969604, - "learning_rate": 1e-05, - "loss": 0.2507, - "step": 263800 - }, - { - "epoch": 0.002639, - "grad_norm": 1.2480578422546387, - "learning_rate": 1e-05, - "loss": 0.2642, - "step": 263900 - }, - { - "epoch": 0.00264, - "grad_norm": 1.3833935260772705, - "learning_rate": 1e-05, - "loss": 0.2534, - "step": 264000 - }, - { - "epoch": 0.002641, - "grad_norm": 1.2210851907730103, - "learning_rate": 1e-05, - "loss": 0.2544, - "step": 264100 - }, - { - "epoch": 0.002642, - "grad_norm": 1.1819926500320435, - "learning_rate": 1e-05, - "loss": 0.2519, - "step": 264200 - }, - { - "epoch": 0.002643, - "grad_norm": 1.081667423248291, - "learning_rate": 1e-05, - "loss": 0.2581, - "step": 264300 - }, - { - "epoch": 0.002644, - "grad_norm": 1.1439013481140137, - "learning_rate": 1e-05, - "loss": 0.2527, - "step": 264400 - }, - { - "epoch": 0.002645, - "grad_norm": 1.4290095567703247, - "learning_rate": 1e-05, - "loss": 0.2537, - "step": 264500 - }, - { - "epoch": 0.002646, - "grad_norm": 1.396436095237732, - "learning_rate": 1e-05, - "loss": 0.2535, - "step": 264600 - }, - { - "epoch": 0.002647, - "grad_norm": 1.180820345878601, - "learning_rate": 1e-05, - "loss": 0.2552, - "step": 264700 - }, - { - "epoch": 0.002648, - "grad_norm": 1.303807258605957, - "learning_rate": 1e-05, - "loss": 0.2537, - "step": 264800 - }, - { - "epoch": 0.002649, - "grad_norm": 1.156326413154602, - "learning_rate": 1e-05, - "loss": 0.2517, - "step": 264900 - }, - { - "epoch": 0.00265, - "grad_norm": 1.2519687414169312, - "learning_rate": 1e-05, - "loss": 0.2515, - "step": 265000 - }, - { - "epoch": 0.002651, - "grad_norm": 1.3290817737579346, - "learning_rate": 1e-05, - "loss": 0.2476, - "step": 265100 - }, - { - "epoch": 0.002652, - "grad_norm": 1.101126790046692, - "learning_rate": 1e-05, - "loss": 0.2531, - "step": 265200 - }, - { - "epoch": 0.002653, - "grad_norm": 1.932999610900879, - "learning_rate": 1e-05, - "loss": 0.2529, - "step": 265300 - }, - { - "epoch": 0.002654, - "grad_norm": 1.4042562246322632, - "learning_rate": 1e-05, - "loss": 0.2492, - "step": 265400 - }, - { - "epoch": 0.002655, - "grad_norm": 1.5448696613311768, - "learning_rate": 1e-05, - "loss": 0.2512, - "step": 265500 - }, - { - "epoch": 0.002656, - "grad_norm": 1.3398382663726807, - "learning_rate": 1e-05, - "loss": 0.2506, - "step": 265600 - }, - { - "epoch": 0.002657, - "grad_norm": 1.2211768627166748, - "learning_rate": 1e-05, - "loss": 0.2567, - "step": 265700 - }, - { - "epoch": 0.002658, - "grad_norm": 1.2229231595993042, - "learning_rate": 1e-05, - "loss": 0.2476, - "step": 265800 - }, - { - "epoch": 0.002659, - "grad_norm": 1.2593238353729248, - "learning_rate": 1e-05, - "loss": 0.2534, - "step": 265900 - }, - { - "epoch": 0.00266, - "grad_norm": 1.4399446249008179, - "learning_rate": 1e-05, - "loss": 0.2528, - "step": 266000 - }, - { - "epoch": 0.002661, - "grad_norm": 1.3300777673721313, - "learning_rate": 1e-05, - "loss": 0.2586, - "step": 266100 - }, - { - "epoch": 0.002662, - "grad_norm": 1.4608235359191895, - "learning_rate": 1e-05, - "loss": 0.2488, - "step": 266200 - }, - { - "epoch": 0.002663, - "grad_norm": 1.4699537754058838, - "learning_rate": 1e-05, - "loss": 0.249, - "step": 266300 - }, - { - "epoch": 0.002664, - "grad_norm": 1.2573784589767456, - "learning_rate": 1e-05, - "loss": 0.2397, - "step": 266400 - }, - { - "epoch": 0.002665, - "grad_norm": 1.3080177307128906, - "learning_rate": 1e-05, - "loss": 0.253, - "step": 266500 - }, - { - "epoch": 0.002666, - "grad_norm": 1.1709935665130615, - "learning_rate": 1e-05, - "loss": 0.2573, - "step": 266600 - }, - { - "epoch": 0.002667, - "grad_norm": 1.5160483121871948, - "learning_rate": 1e-05, - "loss": 0.2576, - "step": 266700 - }, - { - "epoch": 0.002668, - "grad_norm": 1.2000160217285156, - "learning_rate": 1e-05, - "loss": 0.2508, - "step": 266800 - }, - { - "epoch": 0.002669, - "grad_norm": 1.5436961650848389, - "learning_rate": 1e-05, - "loss": 0.2531, - "step": 266900 - }, - { - "epoch": 0.00267, - "grad_norm": 1.043220043182373, - "learning_rate": 1e-05, - "loss": 0.2579, - "step": 267000 - }, - { - "epoch": 0.002671, - "grad_norm": 1.2471952438354492, - "learning_rate": 1e-05, - "loss": 0.2529, - "step": 267100 - }, - { - "epoch": 0.002672, - "grad_norm": 1.1114460229873657, - "learning_rate": 1e-05, - "loss": 0.2505, - "step": 267200 - }, - { - "epoch": 0.002673, - "grad_norm": 1.303299069404602, - "learning_rate": 1e-05, - "loss": 0.2529, - "step": 267300 - }, - { - "epoch": 0.002674, - "grad_norm": 1.233544945716858, - "learning_rate": 1e-05, - "loss": 0.2479, - "step": 267400 - }, - { - "epoch": 0.002675, - "grad_norm": 1.0908334255218506, - "learning_rate": 1e-05, - "loss": 0.2573, - "step": 267500 - }, - { - "epoch": 0.002676, - "grad_norm": 4.520893573760986, - "learning_rate": 1e-05, - "loss": 0.2552, - "step": 267600 - }, - { - "epoch": 0.002677, - "grad_norm": 1.499179482460022, - "learning_rate": 1e-05, - "loss": 0.2575, - "step": 267700 - }, - { - "epoch": 0.002678, - "grad_norm": 1.286794662475586, - "learning_rate": 1e-05, - "loss": 0.2534, - "step": 267800 - }, - { - "epoch": 0.002679, - "grad_norm": 1.4592018127441406, - "learning_rate": 1e-05, - "loss": 0.2514, - "step": 267900 - }, - { - "epoch": 0.00268, - "grad_norm": 1.1641000509262085, - "learning_rate": 1e-05, - "loss": 0.2522, - "step": 268000 - }, - { - "epoch": 0.002681, - "grad_norm": 1.3304522037506104, - "learning_rate": 1e-05, - "loss": 0.2537, - "step": 268100 - }, - { - "epoch": 0.002682, - "grad_norm": 1.2909622192382812, - "learning_rate": 1e-05, - "loss": 0.2493, - "step": 268200 - }, - { - "epoch": 0.002683, - "grad_norm": 1.1704047918319702, - "learning_rate": 1e-05, - "loss": 0.2552, - "step": 268300 - }, - { - "epoch": 0.002684, - "grad_norm": 1.282389760017395, - "learning_rate": 1e-05, - "loss": 0.2451, - "step": 268400 - }, - { - "epoch": 0.002685, - "grad_norm": 1.391392469406128, - "learning_rate": 1e-05, - "loss": 0.2558, - "step": 268500 - }, - { - "epoch": 0.002686, - "grad_norm": 1.2439310550689697, - "learning_rate": 1e-05, - "loss": 0.2429, - "step": 268600 - }, - { - "epoch": 0.002687, - "grad_norm": 1.2771368026733398, - "learning_rate": 1e-05, - "loss": 0.2504, - "step": 268700 - }, - { - "epoch": 0.002688, - "grad_norm": 1.2955467700958252, - "learning_rate": 1e-05, - "loss": 0.2494, - "step": 268800 - }, - { - "epoch": 0.002689, - "grad_norm": 1.223596215248108, - "learning_rate": 1e-05, - "loss": 0.2523, - "step": 268900 - }, - { - "epoch": 0.00269, - "grad_norm": 1.2867069244384766, - "learning_rate": 1e-05, - "loss": 0.2502, - "step": 269000 - }, - { - "epoch": 0.002691, - "grad_norm": 1.4488933086395264, - "learning_rate": 1e-05, - "loss": 0.2437, - "step": 269100 - }, - { - "epoch": 0.002692, - "grad_norm": 1.3014060258865356, - "learning_rate": 1e-05, - "loss": 0.2498, - "step": 269200 - }, - { - "epoch": 0.002693, - "grad_norm": 1.3868091106414795, - "learning_rate": 1e-05, - "loss": 0.2538, - "step": 269300 - }, - { - "epoch": 0.002694, - "grad_norm": 1.3943779468536377, - "learning_rate": 1e-05, - "loss": 0.2525, - "step": 269400 - }, - { - "epoch": 0.002695, - "grad_norm": 1.7637842893600464, - "learning_rate": 1e-05, - "loss": 0.2517, - "step": 269500 - }, - { - "epoch": 0.002696, - "grad_norm": 1.26023530960083, - "learning_rate": 1e-05, - "loss": 0.2486, - "step": 269600 - }, - { - "epoch": 0.002697, - "grad_norm": 1.155168890953064, - "learning_rate": 1e-05, - "loss": 0.2508, - "step": 269700 - }, - { - "epoch": 0.002698, - "grad_norm": 1.2394658327102661, - "learning_rate": 1e-05, - "loss": 0.2487, - "step": 269800 - }, - { - "epoch": 0.002699, - "grad_norm": 1.9545338153839111, - "learning_rate": 1e-05, - "loss": 0.2565, - "step": 269900 - }, - { - "epoch": 0.0027, - "grad_norm": 1.4001786708831787, - "learning_rate": 1e-05, - "loss": 0.2523, - "step": 270000 - }, - { - "epoch": 0.002701, - "grad_norm": 1.299181580543518, - "learning_rate": 1e-05, - "loss": 0.2569, - "step": 270100 - }, - { - "epoch": 0.002702, - "grad_norm": 1.331376552581787, - "learning_rate": 1e-05, - "loss": 0.252, - "step": 270200 - }, - { - "epoch": 0.002703, - "grad_norm": 1.322163701057434, - "learning_rate": 1e-05, - "loss": 0.2512, - "step": 270300 - }, - { - "epoch": 0.002704, - "grad_norm": 1.2857742309570312, - "learning_rate": 1e-05, - "loss": 0.2524, - "step": 270400 - }, - { - "epoch": 0.002705, - "grad_norm": 1.3181077241897583, - "learning_rate": 1e-05, - "loss": 0.2445, - "step": 270500 - }, - { - "epoch": 0.002706, - "grad_norm": 1.4367825984954834, - "learning_rate": 1e-05, - "loss": 0.246, - "step": 270600 - }, - { - "epoch": 0.002707, - "grad_norm": 2.484334707260132, - "learning_rate": 1e-05, - "loss": 0.2458, - "step": 270700 - }, - { - "epoch": 0.002708, - "grad_norm": 1.2078081369400024, - "learning_rate": 1e-05, - "loss": 0.2419, - "step": 270800 - }, - { - "epoch": 0.002709, - "grad_norm": 1.2299176454544067, - "learning_rate": 1e-05, - "loss": 0.2522, - "step": 270900 - }, - { - "epoch": 0.00271, - "grad_norm": 1.1722851991653442, - "learning_rate": 1e-05, - "loss": 0.2558, - "step": 271000 - }, - { - "epoch": 0.002711, - "grad_norm": 1.1677212715148926, - "learning_rate": 1e-05, - "loss": 0.2512, - "step": 271100 - }, - { - "epoch": 0.002712, - "grad_norm": 1.3179787397384644, - "learning_rate": 1e-05, - "loss": 0.2488, - "step": 271200 - }, - { - "epoch": 0.002713, - "grad_norm": 1.3971023559570312, - "learning_rate": 1e-05, - "loss": 0.2465, - "step": 271300 - }, - { - "epoch": 0.002714, - "grad_norm": 1.4288114309310913, - "learning_rate": 1e-05, - "loss": 0.2493, - "step": 271400 - }, - { - "epoch": 0.002715, - "grad_norm": 1.4019008874893188, - "learning_rate": 1e-05, - "loss": 0.2517, - "step": 271500 - }, - { - "epoch": 0.002716, - "grad_norm": 1.1395725011825562, - "learning_rate": 1e-05, - "loss": 0.2572, - "step": 271600 - }, - { - "epoch": 0.002717, - "grad_norm": 1.3662317991256714, - "learning_rate": 1e-05, - "loss": 0.2567, - "step": 271700 - }, - { - "epoch": 0.002718, - "grad_norm": 1.1824346780776978, - "learning_rate": 1e-05, - "loss": 0.2464, - "step": 271800 - }, - { - "epoch": 0.002719, - "grad_norm": 1.497937798500061, - "learning_rate": 1e-05, - "loss": 0.2542, - "step": 271900 - }, - { - "epoch": 0.00272, - "grad_norm": 1.4962044954299927, - "learning_rate": 1e-05, - "loss": 0.2494, - "step": 272000 - }, - { - "epoch": 0.002721, - "grad_norm": 1.4152017831802368, - "learning_rate": 1e-05, - "loss": 0.2432, - "step": 272100 - }, - { - "epoch": 0.002722, - "grad_norm": 1.1556645631790161, - "learning_rate": 1e-05, - "loss": 0.2512, - "step": 272200 - }, - { - "epoch": 0.002723, - "grad_norm": 1.3742530345916748, - "learning_rate": 1e-05, - "loss": 0.2457, - "step": 272300 - }, - { - "epoch": 0.002724, - "grad_norm": 1.4176385402679443, - "learning_rate": 1e-05, - "loss": 0.2496, - "step": 272400 - }, - { - "epoch": 0.002725, - "grad_norm": 1.2945231199264526, - "learning_rate": 1e-05, - "loss": 0.249, - "step": 272500 - }, - { - "epoch": 0.002726, - "grad_norm": 1.3481082916259766, - "learning_rate": 1e-05, - "loss": 0.2506, - "step": 272600 - }, - { - "epoch": 0.002727, - "grad_norm": 1.5182290077209473, - "learning_rate": 1e-05, - "loss": 0.2466, - "step": 272700 - }, - { - "epoch": 0.002728, - "grad_norm": 1.333655834197998, - "learning_rate": 1e-05, - "loss": 0.2465, - "step": 272800 - }, - { - "epoch": 0.002729, - "grad_norm": 1.1650173664093018, - "learning_rate": 1e-05, - "loss": 0.2488, - "step": 272900 - }, - { - "epoch": 0.00273, - "grad_norm": 1.316748023033142, - "learning_rate": 1e-05, - "loss": 0.2434, - "step": 273000 - }, - { - "epoch": 0.002731, - "grad_norm": 1.770902156829834, - "learning_rate": 1e-05, - "loss": 0.2495, - "step": 273100 - }, - { - "epoch": 0.002732, - "grad_norm": 1.7999628782272339, - "learning_rate": 1e-05, - "loss": 0.2465, - "step": 273200 - }, - { - "epoch": 0.002733, - "grad_norm": 1.366982340812683, - "learning_rate": 1e-05, - "loss": 0.2512, - "step": 273300 - }, - { - "epoch": 0.002734, - "grad_norm": 1.3451694250106812, - "learning_rate": 1e-05, - "loss": 0.249, - "step": 273400 - }, - { - "epoch": 0.002735, - "grad_norm": 1.5666559934616089, - "learning_rate": 1e-05, - "loss": 0.2488, - "step": 273500 - }, - { - "epoch": 0.002736, - "grad_norm": 1.4752038717269897, - "learning_rate": 1e-05, - "loss": 0.2469, - "step": 273600 - }, - { - "epoch": 0.002737, - "grad_norm": 1.3000835180282593, - "learning_rate": 1e-05, - "loss": 0.2502, - "step": 273700 - }, - { - "epoch": 0.002738, - "grad_norm": 1.6196129322052002, - "learning_rate": 1e-05, - "loss": 0.2537, - "step": 273800 - }, - { - "epoch": 0.002739, - "grad_norm": 1.1972854137420654, - "learning_rate": 1e-05, - "loss": 0.2498, - "step": 273900 - }, - { - "epoch": 0.00274, - "grad_norm": 1.271113395690918, - "learning_rate": 1e-05, - "loss": 0.244, - "step": 274000 - }, - { - "epoch": 0.002741, - "grad_norm": 3.1418569087982178, - "learning_rate": 1e-05, - "loss": 0.2474, - "step": 274100 - }, - { - "epoch": 0.002742, - "grad_norm": 1.3042404651641846, - "learning_rate": 1e-05, - "loss": 0.2541, - "step": 274200 - }, - { - "epoch": 0.002743, - "grad_norm": 1.4384160041809082, - "learning_rate": 1e-05, - "loss": 0.2481, - "step": 274300 - }, - { - "epoch": 0.002744, - "grad_norm": 1.4222426414489746, - "learning_rate": 1e-05, - "loss": 0.252, - "step": 274400 - }, - { - "epoch": 0.002745, - "grad_norm": 1.1778122186660767, - "learning_rate": 1e-05, - "loss": 0.2477, - "step": 274500 - }, - { - "epoch": 0.002746, - "grad_norm": 1.3980047702789307, - "learning_rate": 1e-05, - "loss": 0.2485, - "step": 274600 - }, - { - "epoch": 0.002747, - "grad_norm": 1.2494852542877197, - "learning_rate": 1e-05, - "loss": 0.2453, - "step": 274700 - }, - { - "epoch": 0.002748, - "grad_norm": 1.4286943674087524, - "learning_rate": 1e-05, - "loss": 0.2519, - "step": 274800 - }, - { - "epoch": 0.002749, - "grad_norm": 1.3011080026626587, - "learning_rate": 1e-05, - "loss": 0.2469, - "step": 274900 - }, - { - "epoch": 0.00275, - "grad_norm": 1.3677890300750732, - "learning_rate": 1e-05, - "loss": 0.2472, - "step": 275000 - }, - { - "epoch": 0.002751, - "grad_norm": 2.367795944213867, - "learning_rate": 1e-05, - "loss": 0.2508, - "step": 275100 - }, - { - "epoch": 0.002752, - "grad_norm": 1.5321639776229858, - "learning_rate": 1e-05, - "loss": 0.2511, - "step": 275200 - }, - { - "epoch": 0.002753, - "grad_norm": 1.301969051361084, - "learning_rate": 1e-05, - "loss": 0.2441, - "step": 275300 - }, - { - "epoch": 0.002754, - "grad_norm": 1.2542939186096191, - "learning_rate": 1e-05, - "loss": 0.2426, - "step": 275400 - }, - { - "epoch": 0.002755, - "grad_norm": 1.3610330820083618, - "learning_rate": 1e-05, - "loss": 0.2504, - "step": 275500 - }, - { - "epoch": 0.002756, - "grad_norm": 1.451761245727539, - "learning_rate": 1e-05, - "loss": 0.2446, - "step": 275600 - }, - { - "epoch": 0.002757, - "grad_norm": 1.3225687742233276, - "learning_rate": 1e-05, - "loss": 0.2489, - "step": 275700 - }, - { - "epoch": 0.002758, - "grad_norm": 1.3390498161315918, - "learning_rate": 1e-05, - "loss": 0.2517, - "step": 275800 - }, - { - "epoch": 0.002759, - "grad_norm": 1.3310461044311523, - "learning_rate": 1e-05, - "loss": 0.2544, - "step": 275900 - }, - { - "epoch": 0.00276, - "grad_norm": 1.2745022773742676, - "learning_rate": 1e-05, - "loss": 0.2529, - "step": 276000 - }, - { - "epoch": 0.002761, - "grad_norm": 1.4929732084274292, - "learning_rate": 1e-05, - "loss": 0.2493, - "step": 276100 - }, - { - "epoch": 0.002762, - "grad_norm": 1.4375929832458496, - "learning_rate": 1e-05, - "loss": 0.2476, - "step": 276200 - }, - { - "epoch": 0.002763, - "grad_norm": 1.3432118892669678, - "learning_rate": 1e-05, - "loss": 0.2541, - "step": 276300 - }, - { - "epoch": 0.002764, - "grad_norm": 1.9944952726364136, - "learning_rate": 1e-05, - "loss": 0.2506, - "step": 276400 - }, - { - "epoch": 0.002765, - "grad_norm": 1.2195799350738525, - "learning_rate": 1e-05, - "loss": 0.243, - "step": 276500 - }, - { - "epoch": 0.002766, - "grad_norm": 1.354514241218567, - "learning_rate": 1e-05, - "loss": 0.2437, - "step": 276600 - }, - { - "epoch": 0.002767, - "grad_norm": 1.4972262382507324, - "learning_rate": 1e-05, - "loss": 0.2527, - "step": 276700 - }, - { - "epoch": 0.002768, - "grad_norm": 0.9542062878608704, - "learning_rate": 1e-05, - "loss": 0.2463, - "step": 276800 - }, - { - "epoch": 0.002769, - "grad_norm": 1.5111498832702637, - "learning_rate": 1e-05, - "loss": 0.2508, - "step": 276900 - }, - { - "epoch": 0.00277, - "grad_norm": 1.3207476139068604, - "learning_rate": 1e-05, - "loss": 0.2447, - "step": 277000 - }, - { - "epoch": 0.002771, - "grad_norm": 1.3004294633865356, - "learning_rate": 1e-05, - "loss": 0.2431, - "step": 277100 - }, - { - "epoch": 0.002772, - "grad_norm": 1.4649089574813843, - "learning_rate": 1e-05, - "loss": 0.2487, - "step": 277200 - }, - { - "epoch": 0.002773, - "grad_norm": 1.4927515983581543, - "learning_rate": 1e-05, - "loss": 0.246, - "step": 277300 - }, - { - "epoch": 0.002774, - "grad_norm": 1.3661631345748901, - "learning_rate": 1e-05, - "loss": 0.2398, - "step": 277400 - }, - { - "epoch": 0.002775, - "grad_norm": 1.47564697265625, - "learning_rate": 1e-05, - "loss": 0.2479, - "step": 277500 - }, - { - "epoch": 0.002776, - "grad_norm": 1.2620567083358765, - "learning_rate": 1e-05, - "loss": 0.2482, - "step": 277600 - }, - { - "epoch": 0.002777, - "grad_norm": 1.3561931848526, - "learning_rate": 1e-05, - "loss": 0.251, - "step": 277700 - }, - { - "epoch": 0.002778, - "grad_norm": 1.3768113851547241, - "learning_rate": 1e-05, - "loss": 0.2513, - "step": 277800 - }, - { - "epoch": 0.002779, - "grad_norm": 1.3965851068496704, - "learning_rate": 1e-05, - "loss": 0.2438, - "step": 277900 - }, - { - "epoch": 0.00278, - "grad_norm": 1.3073445558547974, - "learning_rate": 1e-05, - "loss": 0.2434, - "step": 278000 - }, - { - "epoch": 0.002781, - "grad_norm": 1.431525468826294, - "learning_rate": 1e-05, - "loss": 0.2479, - "step": 278100 - }, - { - "epoch": 0.002782, - "grad_norm": 1.3677194118499756, - "learning_rate": 1e-05, - "loss": 0.2524, - "step": 278200 - }, - { - "epoch": 0.002783, - "grad_norm": 1.4899870157241821, - "learning_rate": 1e-05, - "loss": 0.2481, - "step": 278300 - }, - { - "epoch": 0.002784, - "grad_norm": 1.2802644968032837, - "learning_rate": 1e-05, - "loss": 0.25, - "step": 278400 - }, - { - "epoch": 0.002785, - "grad_norm": 1.2976142168045044, - "learning_rate": 1e-05, - "loss": 0.249, - "step": 278500 - }, - { - "epoch": 0.002786, - "grad_norm": 1.2336742877960205, - "learning_rate": 1e-05, - "loss": 0.2441, - "step": 278600 - }, - { - "epoch": 0.002787, - "grad_norm": 1.2916064262390137, - "learning_rate": 1e-05, - "loss": 0.2521, - "step": 278700 - }, - { - "epoch": 0.002788, - "grad_norm": 1.4833120107650757, - "learning_rate": 1e-05, - "loss": 0.2499, - "step": 278800 - }, - { - "epoch": 0.002789, - "grad_norm": 1.4841572046279907, - "learning_rate": 1e-05, - "loss": 0.2501, - "step": 278900 - }, - { - "epoch": 0.00279, - "grad_norm": 1.2843296527862549, - "learning_rate": 1e-05, - "loss": 0.2526, - "step": 279000 - }, - { - "epoch": 0.002791, - "grad_norm": 1.1688668727874756, - "learning_rate": 1e-05, - "loss": 0.2409, - "step": 279100 - }, - { - "epoch": 0.002792, - "grad_norm": 1.5335739850997925, - "learning_rate": 1e-05, - "loss": 0.2481, - "step": 279200 - }, - { - "epoch": 0.002793, - "grad_norm": 1.1553764343261719, - "learning_rate": 1e-05, - "loss": 0.2456, - "step": 279300 - }, - { - "epoch": 0.002794, - "grad_norm": 1.350967526435852, - "learning_rate": 1e-05, - "loss": 0.2446, - "step": 279400 - }, - { - "epoch": 0.002795, - "grad_norm": 1.4153188467025757, - "learning_rate": 1e-05, - "loss": 0.2447, - "step": 279500 - }, - { - "epoch": 0.002796, - "grad_norm": 1.3319013118743896, - "learning_rate": 1e-05, - "loss": 0.2525, - "step": 279600 - }, - { - "epoch": 0.002797, - "grad_norm": 1.1495552062988281, - "learning_rate": 1e-05, - "loss": 0.2457, - "step": 279700 - }, - { - "epoch": 0.002798, - "grad_norm": 1.5219277143478394, - "learning_rate": 1e-05, - "loss": 0.2478, - "step": 279800 - }, - { - "epoch": 0.002799, - "grad_norm": 1.5209633111953735, - "learning_rate": 1e-05, - "loss": 0.2466, - "step": 279900 - }, - { - "epoch": 0.0028, - "grad_norm": 1.3003034591674805, - "learning_rate": 1e-05, - "loss": 0.25, - "step": 280000 - }, - { - "epoch": 0.0028, - "eval_loss": 0.2237548828125, - "eval_runtime": 105.6227, - "eval_samples_per_second": 473.383, - "eval_steps_per_second": 29.586, - "step": 280000 - }, - { - "epoch": 0.002801, - "grad_norm": 1.4235516786575317, - "learning_rate": 1e-05, - "loss": 0.2477, - "step": 280100 - }, - { - "epoch": 0.002802, - "grad_norm": 1.2791297435760498, - "learning_rate": 1e-05, - "loss": 0.2498, - "step": 280200 - }, - { - "epoch": 0.002803, - "grad_norm": 1.1194697618484497, - "learning_rate": 1e-05, - "loss": 0.2472, - "step": 280300 - }, - { - "epoch": 0.002804, - "grad_norm": 1.262161374092102, - "learning_rate": 1e-05, - "loss": 0.2495, - "step": 280400 - }, - { - "epoch": 0.002805, - "grad_norm": 1.2329294681549072, - "learning_rate": 1e-05, - "loss": 0.2451, - "step": 280500 - }, - { - "epoch": 0.002806, - "grad_norm": 1.394741177558899, - "learning_rate": 1e-05, - "loss": 0.2524, - "step": 280600 - }, - { - "epoch": 0.002807, - "grad_norm": 1.3729023933410645, - "learning_rate": 1e-05, - "loss": 0.2554, - "step": 280700 - }, - { - "epoch": 0.002808, - "grad_norm": 1.3671327829360962, - "learning_rate": 1e-05, - "loss": 0.2453, - "step": 280800 - }, - { - "epoch": 0.002809, - "grad_norm": 1.2624722719192505, - "learning_rate": 1e-05, - "loss": 0.2466, - "step": 280900 - }, - { - "epoch": 0.00281, - "grad_norm": 1.2615208625793457, - "learning_rate": 1e-05, - "loss": 0.2405, - "step": 281000 - }, - { - "epoch": 0.002811, - "grad_norm": 1.6201788187026978, - "learning_rate": 1e-05, - "loss": 0.2454, - "step": 281100 - }, - { - "epoch": 0.002812, - "grad_norm": 1.3017542362213135, - "learning_rate": 1e-05, - "loss": 0.2473, - "step": 281200 - }, - { - "epoch": 0.002813, - "grad_norm": 1.2781213521957397, - "learning_rate": 1e-05, - "loss": 0.2462, - "step": 281300 - }, - { - "epoch": 0.002814, - "grad_norm": 1.408088207244873, - "learning_rate": 1e-05, - "loss": 0.248, - "step": 281400 - }, - { - "epoch": 0.002815, - "grad_norm": 1.3907461166381836, - "learning_rate": 1e-05, - "loss": 0.2545, - "step": 281500 - }, - { - "epoch": 0.002816, - "grad_norm": 1.3763288259506226, - "learning_rate": 1e-05, - "loss": 0.2551, - "step": 281600 - }, - { - "epoch": 0.002817, - "grad_norm": 1.1664878129959106, - "learning_rate": 1e-05, - "loss": 0.2445, - "step": 281700 - }, - { - "epoch": 0.002818, - "grad_norm": 1.3430664539337158, - "learning_rate": 1e-05, - "loss": 0.25, - "step": 281800 - }, - { - "epoch": 0.002819, - "grad_norm": 1.2395867109298706, - "learning_rate": 1e-05, - "loss": 0.2503, - "step": 281900 - }, - { - "epoch": 0.00282, - "grad_norm": 1.1876102685928345, - "learning_rate": 1e-05, - "loss": 0.2484, - "step": 282000 - }, - { - "epoch": 0.002821, - "grad_norm": 1.3158706426620483, - "learning_rate": 1e-05, - "loss": 0.2457, - "step": 282100 - }, - { - "epoch": 0.002822, - "grad_norm": 1.1923071146011353, - "learning_rate": 1e-05, - "loss": 0.2454, - "step": 282200 - }, - { - "epoch": 0.002823, - "grad_norm": 1.1963248252868652, - "learning_rate": 1e-05, - "loss": 0.2501, - "step": 282300 - }, - { - "epoch": 0.002824, - "grad_norm": 1.3409608602523804, - "learning_rate": 1e-05, - "loss": 0.2509, - "step": 282400 - }, - { - "epoch": 0.002825, - "grad_norm": 1.6170743703842163, - "learning_rate": 1e-05, - "loss": 0.2543, - "step": 282500 - }, - { - "epoch": 0.002826, - "grad_norm": 1.0626087188720703, - "learning_rate": 1e-05, - "loss": 0.2473, - "step": 282600 - }, - { - "epoch": 0.002827, - "grad_norm": 1.2063398361206055, - "learning_rate": 1e-05, - "loss": 0.2456, - "step": 282700 - }, - { - "epoch": 0.002828, - "grad_norm": 1.3952311277389526, - "learning_rate": 1e-05, - "loss": 0.2507, - "step": 282800 - }, - { - "epoch": 0.002829, - "grad_norm": 1.535420298576355, - "learning_rate": 1e-05, - "loss": 0.2432, - "step": 282900 - }, - { - "epoch": 0.00283, - "grad_norm": 1.3006575107574463, - "learning_rate": 1e-05, - "loss": 0.2461, - "step": 283000 - }, - { - "epoch": 0.002831, - "grad_norm": 1.3280394077301025, - "learning_rate": 1e-05, - "loss": 0.2453, - "step": 283100 - }, - { - "epoch": 0.002832, - "grad_norm": 1.2034204006195068, - "learning_rate": 1e-05, - "loss": 0.2442, - "step": 283200 - }, - { - "epoch": 0.002833, - "grad_norm": 1.3199750185012817, - "learning_rate": 1e-05, - "loss": 0.2547, - "step": 283300 - }, - { - "epoch": 0.002834, - "grad_norm": 1.5480409860610962, - "learning_rate": 1e-05, - "loss": 0.2529, - "step": 283400 - }, - { - "epoch": 0.002835, - "grad_norm": 1.1071147918701172, - "learning_rate": 1e-05, - "loss": 0.2442, - "step": 283500 - }, - { - "epoch": 0.002836, - "grad_norm": 1.2989801168441772, - "learning_rate": 1e-05, - "loss": 0.2449, - "step": 283600 - }, - { - "epoch": 0.002837, - "grad_norm": 1.500909447669983, - "learning_rate": 1e-05, - "loss": 0.2495, - "step": 283700 - }, - { - "epoch": 0.002838, - "grad_norm": 1.2470767498016357, - "learning_rate": 1e-05, - "loss": 0.2514, - "step": 283800 - }, - { - "epoch": 0.002839, - "grad_norm": 1.315054178237915, - "learning_rate": 1e-05, - "loss": 0.2437, - "step": 283900 - }, - { - "epoch": 0.00284, - "grad_norm": 1.3769065141677856, - "learning_rate": 1e-05, - "loss": 0.2517, - "step": 284000 - }, - { - "epoch": 0.002841, - "grad_norm": 1.3219605684280396, - "learning_rate": 1e-05, - "loss": 0.2429, - "step": 284100 - }, - { - "epoch": 0.002842, - "grad_norm": 1.290816068649292, - "learning_rate": 1e-05, - "loss": 0.2442, - "step": 284200 - }, - { - "epoch": 0.002843, - "grad_norm": 1.3180721998214722, - "learning_rate": 1e-05, - "loss": 0.2411, - "step": 284300 - }, - { - "epoch": 0.002844, - "grad_norm": 1.392189621925354, - "learning_rate": 1e-05, - "loss": 0.2439, - "step": 284400 - }, - { - "epoch": 0.002845, - "grad_norm": 1.3460537195205688, - "learning_rate": 1e-05, - "loss": 0.2457, - "step": 284500 - }, - { - "epoch": 0.002846, - "grad_norm": 1.2485060691833496, - "learning_rate": 1e-05, - "loss": 0.2429, - "step": 284600 - }, - { - "epoch": 0.002847, - "grad_norm": 1.1561580896377563, - "learning_rate": 1e-05, - "loss": 0.2363, - "step": 284700 - }, - { - "epoch": 0.002848, - "grad_norm": 1.145930290222168, - "learning_rate": 1e-05, - "loss": 0.2422, - "step": 284800 - }, - { - "epoch": 0.002849, - "grad_norm": 1.345758318901062, - "learning_rate": 1e-05, - "loss": 0.2531, - "step": 284900 - }, - { - "epoch": 0.00285, - "grad_norm": 1.4995092153549194, - "learning_rate": 1e-05, - "loss": 0.2374, - "step": 285000 - }, - { - "epoch": 0.002851, - "grad_norm": 1.1388330459594727, - "learning_rate": 1e-05, - "loss": 0.2458, - "step": 285100 - }, - { - "epoch": 0.002852, - "grad_norm": 1.1494935750961304, - "learning_rate": 1e-05, - "loss": 0.2435, - "step": 285200 - }, - { - "epoch": 0.002853, - "grad_norm": 1.3155019283294678, - "learning_rate": 1e-05, - "loss": 0.2438, - "step": 285300 - }, - { - "epoch": 0.002854, - "grad_norm": 1.4451137781143188, - "learning_rate": 1e-05, - "loss": 0.2467, - "step": 285400 - }, - { - "epoch": 0.002855, - "grad_norm": 1.1886345148086548, - "learning_rate": 1e-05, - "loss": 0.2504, - "step": 285500 - }, - { - "epoch": 0.002856, - "grad_norm": 1.292470097541809, - "learning_rate": 1e-05, - "loss": 0.2504, - "step": 285600 - }, - { - "epoch": 0.002857, - "grad_norm": 1.4449888467788696, - "learning_rate": 1e-05, - "loss": 0.2492, - "step": 285700 - }, - { - "epoch": 0.002858, - "grad_norm": 1.300668716430664, - "learning_rate": 1e-05, - "loss": 0.2531, - "step": 285800 - }, - { - "epoch": 0.002859, - "grad_norm": 4.405996799468994, - "learning_rate": 1e-05, - "loss": 0.2425, - "step": 285900 - }, - { - "epoch": 0.00286, - "grad_norm": 1.332196831703186, - "learning_rate": 1e-05, - "loss": 0.2428, - "step": 286000 - }, - { - "epoch": 0.002861, - "grad_norm": 1.3829389810562134, - "learning_rate": 1e-05, - "loss": 0.2454, - "step": 286100 - }, - { - "epoch": 0.002862, - "grad_norm": 1.2246516942977905, - "learning_rate": 1e-05, - "loss": 0.2459, - "step": 286200 - }, - { - "epoch": 0.002863, - "grad_norm": 1.1918455362319946, - "learning_rate": 1e-05, - "loss": 0.243, - "step": 286300 - }, - { - "epoch": 0.002864, - "grad_norm": 1.4211211204528809, - "learning_rate": 1e-05, - "loss": 0.2467, - "step": 286400 - }, - { - "epoch": 0.002865, - "grad_norm": 1.3562350273132324, - "learning_rate": 1e-05, - "loss": 0.2459, - "step": 286500 - }, - { - "epoch": 0.002866, - "grad_norm": 1.6160293817520142, - "learning_rate": 1e-05, - "loss": 0.2451, - "step": 286600 - }, - { - "epoch": 0.002867, - "grad_norm": 1.3269939422607422, - "learning_rate": 1e-05, - "loss": 0.247, - "step": 286700 - }, - { - "epoch": 0.002868, - "grad_norm": 1.417557716369629, - "learning_rate": 1e-05, - "loss": 0.2488, - "step": 286800 - }, - { - "epoch": 0.002869, - "grad_norm": 1.4066275358200073, - "learning_rate": 1e-05, - "loss": 0.2476, - "step": 286900 - }, - { - "epoch": 0.00287, - "grad_norm": 1.3806079626083374, - "learning_rate": 1e-05, - "loss": 0.2452, - "step": 287000 - }, - { - "epoch": 0.002871, - "grad_norm": 1.4028304815292358, - "learning_rate": 1e-05, - "loss": 0.2416, - "step": 287100 - }, - { - "epoch": 0.002872, - "grad_norm": 1.1866804361343384, - "learning_rate": 1e-05, - "loss": 0.2437, - "step": 287200 - }, - { - "epoch": 0.002873, - "grad_norm": 1.3341212272644043, - "learning_rate": 1e-05, - "loss": 0.2439, - "step": 287300 - }, - { - "epoch": 0.002874, - "grad_norm": 1.3511171340942383, - "learning_rate": 1e-05, - "loss": 0.2464, - "step": 287400 - }, - { - "epoch": 0.002875, - "grad_norm": 1.5069416761398315, - "learning_rate": 1e-05, - "loss": 0.2443, - "step": 287500 - }, - { - "epoch": 0.002876, - "grad_norm": 1.2887665033340454, - "learning_rate": 1e-05, - "loss": 0.2429, - "step": 287600 - }, - { - "epoch": 0.002877, - "grad_norm": 1.373012661933899, - "learning_rate": 1e-05, - "loss": 0.2477, - "step": 287700 - }, - { - "epoch": 0.002878, - "grad_norm": 1.2032406330108643, - "learning_rate": 1e-05, - "loss": 0.2499, - "step": 287800 - }, - { - "epoch": 0.002879, - "grad_norm": 1.3944289684295654, - "learning_rate": 1e-05, - "loss": 0.2483, - "step": 287900 - }, - { - "epoch": 0.00288, - "grad_norm": 1.2849273681640625, - "learning_rate": 1e-05, - "loss": 0.2417, - "step": 288000 - }, - { - "epoch": 0.002881, - "grad_norm": 1.2784684896469116, - "learning_rate": 1e-05, - "loss": 0.2483, - "step": 288100 - }, - { - "epoch": 0.002882, - "grad_norm": 1.3414957523345947, - "learning_rate": 1e-05, - "loss": 0.2454, - "step": 288200 - }, - { - "epoch": 0.002883, - "grad_norm": 2.811215400695801, - "learning_rate": 1e-05, - "loss": 0.2477, - "step": 288300 - }, - { - "epoch": 0.002884, - "grad_norm": 1.6408745050430298, - "learning_rate": 1e-05, - "loss": 0.2412, - "step": 288400 - }, - { - "epoch": 0.002885, - "grad_norm": 1.3462251424789429, - "learning_rate": 1e-05, - "loss": 0.2491, - "step": 288500 - }, - { - "epoch": 0.002886, - "grad_norm": 1.3377293348312378, - "learning_rate": 1e-05, - "loss": 0.2482, - "step": 288600 - }, - { - "epoch": 0.002887, - "grad_norm": 1.2955459356307983, - "learning_rate": 1e-05, - "loss": 0.2434, - "step": 288700 - }, - { - "epoch": 0.002888, - "grad_norm": 1.2297544479370117, - "learning_rate": 1e-05, - "loss": 0.2425, - "step": 288800 - }, - { - "epoch": 0.002889, - "grad_norm": 1.1746385097503662, - "learning_rate": 1e-05, - "loss": 0.2449, - "step": 288900 - }, - { - "epoch": 0.00289, - "grad_norm": 1.2574794292449951, - "learning_rate": 1e-05, - "loss": 0.2411, - "step": 289000 - }, - { - "epoch": 0.002891, - "grad_norm": 1.2351876497268677, - "learning_rate": 1e-05, - "loss": 0.2408, - "step": 289100 - }, - { - "epoch": 0.002892, - "grad_norm": 1.114088773727417, - "learning_rate": 1e-05, - "loss": 0.2437, - "step": 289200 - }, - { - "epoch": 0.002893, - "grad_norm": 1.2709494829177856, - "learning_rate": 1e-05, - "loss": 0.248, - "step": 289300 - }, - { - "epoch": 0.002894, - "grad_norm": 1.510506510734558, - "learning_rate": 1e-05, - "loss": 0.2428, - "step": 289400 - }, - { - "epoch": 0.002895, - "grad_norm": 1.4081695079803467, - "learning_rate": 1e-05, - "loss": 0.2388, - "step": 289500 - }, - { - "epoch": 0.002896, - "grad_norm": 1.2253382205963135, - "learning_rate": 1e-05, - "loss": 0.2406, - "step": 289600 - }, - { - "epoch": 0.002897, - "grad_norm": 1.3243523836135864, - "learning_rate": 1e-05, - "loss": 0.242, - "step": 289700 - }, - { - "epoch": 0.002898, - "grad_norm": 1.2745314836502075, - "learning_rate": 1e-05, - "loss": 0.2429, - "step": 289800 - }, - { - "epoch": 0.002899, - "grad_norm": 1.3616939783096313, - "learning_rate": 1e-05, - "loss": 0.2409, - "step": 289900 - }, - { - "epoch": 0.0029, - "grad_norm": 1.4914360046386719, - "learning_rate": 1e-05, - "loss": 0.2501, - "step": 290000 - }, - { - "epoch": 0.002901, - "grad_norm": 1.3326282501220703, - "learning_rate": 1e-05, - "loss": 0.2433, - "step": 290100 - }, - { - "epoch": 0.002902, - "grad_norm": 1.2369986772537231, - "learning_rate": 1e-05, - "loss": 0.2441, - "step": 290200 - }, - { - "epoch": 0.002903, - "grad_norm": 1.4830586910247803, - "learning_rate": 1e-05, - "loss": 0.2478, - "step": 290300 - }, - { - "epoch": 0.002904, - "grad_norm": 1.2769259214401245, - "learning_rate": 1e-05, - "loss": 0.2394, - "step": 290400 - }, - { - "epoch": 0.002905, - "grad_norm": 1.4869470596313477, - "learning_rate": 1e-05, - "loss": 0.2493, - "step": 290500 - }, - { - "epoch": 0.002906, - "grad_norm": 1.221601963043213, - "learning_rate": 1e-05, - "loss": 0.2493, - "step": 290600 - }, - { - "epoch": 0.002907, - "grad_norm": 1.3075189590454102, - "learning_rate": 1e-05, - "loss": 0.2423, - "step": 290700 - }, - { - "epoch": 0.002908, - "grad_norm": 1.2492666244506836, - "learning_rate": 1e-05, - "loss": 0.2392, - "step": 290800 - }, - { - "epoch": 0.002909, - "grad_norm": 1.5218042135238647, - "learning_rate": 1e-05, - "loss": 0.2391, - "step": 290900 - }, - { - "epoch": 0.00291, - "grad_norm": 1.1374872922897339, - "learning_rate": 1e-05, - "loss": 0.2388, - "step": 291000 - }, - { - "epoch": 0.002911, - "grad_norm": 1.3382660150527954, - "learning_rate": 1e-05, - "loss": 0.2409, - "step": 291100 - }, - { - "epoch": 0.002912, - "grad_norm": 1.3432732820510864, - "learning_rate": 1e-05, - "loss": 0.2427, - "step": 291200 - }, - { - "epoch": 0.002913, - "grad_norm": 1.3313060998916626, - "learning_rate": 1e-05, - "loss": 0.2483, - "step": 291300 - }, - { - "epoch": 0.002914, - "grad_norm": 1.354307770729065, - "learning_rate": 1e-05, - "loss": 0.2464, - "step": 291400 - }, - { - "epoch": 0.002915, - "grad_norm": 1.2047741413116455, - "learning_rate": 1e-05, - "loss": 0.2425, - "step": 291500 - }, - { - "epoch": 0.002916, - "grad_norm": 1.4269415140151978, - "learning_rate": 1e-05, - "loss": 0.2413, - "step": 291600 - }, - { - "epoch": 0.002917, - "grad_norm": 1.3683712482452393, - "learning_rate": 1e-05, - "loss": 0.2427, - "step": 291700 - }, - { - "epoch": 0.002918, - "grad_norm": 1.431900143623352, - "learning_rate": 1e-05, - "loss": 0.2387, - "step": 291800 - }, - { - "epoch": 0.002919, - "grad_norm": 1.3799906969070435, - "learning_rate": 1e-05, - "loss": 0.2412, - "step": 291900 - }, - { - "epoch": 0.00292, - "grad_norm": 1.252881407737732, - "learning_rate": 1e-05, - "loss": 0.2406, - "step": 292000 - }, - { - "epoch": 0.002921, - "grad_norm": 1.3206135034561157, - "learning_rate": 1e-05, - "loss": 0.2411, - "step": 292100 - }, - { - "epoch": 0.002922, - "grad_norm": 1.2838083505630493, - "learning_rate": 1e-05, - "loss": 0.2435, - "step": 292200 - }, - { - "epoch": 0.002923, - "grad_norm": 1.3026096820831299, - "learning_rate": 1e-05, - "loss": 0.2476, - "step": 292300 - }, - { - "epoch": 0.002924, - "grad_norm": 1.4028531312942505, - "learning_rate": 1e-05, - "loss": 0.244, - "step": 292400 - }, - { - "epoch": 0.002925, - "grad_norm": 1.3861136436462402, - "learning_rate": 1e-05, - "loss": 0.2446, - "step": 292500 - }, - { - "epoch": 0.002926, - "grad_norm": 1.3757134675979614, - "learning_rate": 1e-05, - "loss": 0.2418, - "step": 292600 - }, - { - "epoch": 0.002927, - "grad_norm": 1.2491317987442017, - "learning_rate": 1e-05, - "loss": 0.2409, - "step": 292700 - }, - { - "epoch": 0.002928, - "grad_norm": 1.3485047817230225, - "learning_rate": 1e-05, - "loss": 0.2437, - "step": 292800 - }, - { - "epoch": 0.002929, - "grad_norm": 1.3878129720687866, - "learning_rate": 1e-05, - "loss": 0.2444, - "step": 292900 - }, - { - "epoch": 0.00293, - "grad_norm": 1.5195536613464355, - "learning_rate": 1e-05, - "loss": 0.249, - "step": 293000 - }, - { - "epoch": 0.002931, - "grad_norm": 1.2559834718704224, - "learning_rate": 1e-05, - "loss": 0.2464, - "step": 293100 - }, - { - "epoch": 0.002932, - "grad_norm": 1.2232775688171387, - "learning_rate": 1e-05, - "loss": 0.247, - "step": 293200 - }, - { - "epoch": 0.002933, - "grad_norm": 1.1313245296478271, - "learning_rate": 1e-05, - "loss": 0.2472, - "step": 293300 - }, - { - "epoch": 0.002934, - "grad_norm": 1.4928311109542847, - "learning_rate": 1e-05, - "loss": 0.2417, - "step": 293400 - }, - { - "epoch": 0.002935, - "grad_norm": 1.389172077178955, - "learning_rate": 1e-05, - "loss": 0.2378, - "step": 293500 - }, - { - "epoch": 0.002936, - "grad_norm": 1.2595118284225464, - "learning_rate": 1e-05, - "loss": 0.2423, - "step": 293600 - }, - { - "epoch": 0.002937, - "grad_norm": 2.2567596435546875, - "learning_rate": 1e-05, - "loss": 0.2435, - "step": 293700 - }, - { - "epoch": 0.002938, - "grad_norm": 1.262451410293579, - "learning_rate": 1e-05, - "loss": 0.2452, - "step": 293800 - }, - { - "epoch": 0.002939, - "grad_norm": 1.1872419118881226, - "learning_rate": 1e-05, - "loss": 0.2416, - "step": 293900 - }, - { - "epoch": 0.00294, - "grad_norm": 1.4861950874328613, - "learning_rate": 1e-05, - "loss": 0.2377, - "step": 294000 - }, - { - "epoch": 0.002941, - "grad_norm": 1.4680427312850952, - "learning_rate": 1e-05, - "loss": 0.2408, - "step": 294100 - }, - { - "epoch": 0.002942, - "grad_norm": 1.2542996406555176, - "learning_rate": 1e-05, - "loss": 0.2398, - "step": 294200 - }, - { - "epoch": 0.002943, - "grad_norm": 1.4607197046279907, - "learning_rate": 1e-05, - "loss": 0.2416, - "step": 294300 - }, - { - "epoch": 0.002944, - "grad_norm": 2.1277048587799072, - "learning_rate": 1e-05, - "loss": 0.2472, - "step": 294400 - }, - { - "epoch": 0.002945, - "grad_norm": 1.1014450788497925, - "learning_rate": 1e-05, - "loss": 0.246, - "step": 294500 - }, - { - "epoch": 0.002946, - "grad_norm": 1.288939356803894, - "learning_rate": 1e-05, - "loss": 0.2482, - "step": 294600 - }, - { - "epoch": 0.002947, - "grad_norm": 1.3552707433700562, - "learning_rate": 1e-05, - "loss": 0.2337, - "step": 294700 - }, - { - "epoch": 0.002948, - "grad_norm": 1.200110912322998, - "learning_rate": 1e-05, - "loss": 0.2477, - "step": 294800 - }, - { - "epoch": 0.002949, - "grad_norm": 1.4520933628082275, - "learning_rate": 1e-05, - "loss": 0.2462, - "step": 294900 - }, - { - "epoch": 0.00295, - "grad_norm": 1.514265537261963, - "learning_rate": 1e-05, - "loss": 0.2374, - "step": 295000 - }, - { - "epoch": 0.002951, - "grad_norm": 1.420652985572815, - "learning_rate": 1e-05, - "loss": 0.2392, - "step": 295100 - }, - { - "epoch": 0.002952, - "grad_norm": 1.18659508228302, - "learning_rate": 1e-05, - "loss": 0.2427, - "step": 295200 - }, - { - "epoch": 0.002953, - "grad_norm": 1.2362042665481567, - "learning_rate": 1e-05, - "loss": 0.2486, - "step": 295300 - }, - { - "epoch": 0.002954, - "grad_norm": 1.4769152402877808, - "learning_rate": 1e-05, - "loss": 0.2383, - "step": 295400 - }, - { - "epoch": 0.002955, - "grad_norm": 1.1464706659317017, - "learning_rate": 1e-05, - "loss": 0.2459, - "step": 295500 - }, - { - "epoch": 0.002956, - "grad_norm": 1.2575162649154663, - "learning_rate": 1e-05, - "loss": 0.2363, - "step": 295600 - }, - { - "epoch": 0.002957, - "grad_norm": 1.4010258913040161, - "learning_rate": 1e-05, - "loss": 0.2388, - "step": 295700 - }, - { - "epoch": 0.002958, - "grad_norm": 1.3643419742584229, - "learning_rate": 1e-05, - "loss": 0.2422, - "step": 295800 - }, - { - "epoch": 0.002959, - "grad_norm": 1.2145565748214722, - "learning_rate": 1e-05, - "loss": 0.241, - "step": 295900 - }, - { - "epoch": 0.00296, - "grad_norm": 1.3718632459640503, - "learning_rate": 1e-05, - "loss": 0.2529, - "step": 296000 - }, - { - "epoch": 0.002961, - "grad_norm": 1.191248893737793, - "learning_rate": 1e-05, - "loss": 0.2412, - "step": 296100 - }, - { - "epoch": 0.002962, - "grad_norm": 1.3857762813568115, - "learning_rate": 1e-05, - "loss": 0.2404, - "step": 296200 - }, - { - "epoch": 0.002963, - "grad_norm": 1.5052165985107422, - "learning_rate": 1e-05, - "loss": 0.2455, - "step": 296300 - }, - { - "epoch": 0.002964, - "grad_norm": 1.2893377542495728, - "learning_rate": 1e-05, - "loss": 0.244, - "step": 296400 - }, - { - "epoch": 0.002965, - "grad_norm": 2.0221776962280273, - "learning_rate": 1e-05, - "loss": 0.2385, - "step": 296500 - }, - { - "epoch": 0.002966, - "grad_norm": 1.3078244924545288, - "learning_rate": 1e-05, - "loss": 0.2497, - "step": 296600 - }, - { - "epoch": 0.002967, - "grad_norm": 1.4541032314300537, - "learning_rate": 1e-05, - "loss": 0.2398, - "step": 296700 - }, - { - "epoch": 0.002968, - "grad_norm": 1.8015938997268677, - "learning_rate": 1e-05, - "loss": 0.243, - "step": 296800 - }, - { - "epoch": 0.002969, - "grad_norm": 2.0037682056427, - "learning_rate": 1e-05, - "loss": 0.2427, - "step": 296900 - }, - { - "epoch": 0.00297, - "grad_norm": 1.5175869464874268, - "learning_rate": 1e-05, - "loss": 0.2399, - "step": 297000 - }, - { - "epoch": 0.002971, - "grad_norm": 1.7888078689575195, - "learning_rate": 1e-05, - "loss": 0.2461, - "step": 297100 - }, - { - "epoch": 0.002972, - "grad_norm": 1.379382610321045, - "learning_rate": 1e-05, - "loss": 0.244, - "step": 297200 - }, - { - "epoch": 0.002973, - "grad_norm": 2.4753177165985107, - "learning_rate": 1e-05, - "loss": 0.2357, - "step": 297300 - }, - { - "epoch": 0.002974, - "grad_norm": 1.2226715087890625, - "learning_rate": 1e-05, - "loss": 0.2427, - "step": 297400 - }, - { - "epoch": 0.002975, - "grad_norm": 1.280057430267334, - "learning_rate": 1e-05, - "loss": 0.2427, - "step": 297500 - }, - { - "epoch": 0.002976, - "grad_norm": 1.2715860605239868, - "learning_rate": 1e-05, - "loss": 0.237, - "step": 297600 - }, - { - "epoch": 0.002977, - "grad_norm": 1.0956978797912598, - "learning_rate": 1e-05, - "loss": 0.2452, - "step": 297700 - }, - { - "epoch": 0.002978, - "grad_norm": 1.5181704759597778, - "learning_rate": 1e-05, - "loss": 0.2422, - "step": 297800 - }, - { - "epoch": 0.002979, - "grad_norm": 1.179020643234253, - "learning_rate": 1e-05, - "loss": 0.2397, - "step": 297900 - }, - { - "epoch": 0.00298, - "grad_norm": 1.3676578998565674, - "learning_rate": 1e-05, - "loss": 0.2432, - "step": 298000 - }, - { - "epoch": 0.002981, - "grad_norm": 1.3237693309783936, - "learning_rate": 1e-05, - "loss": 0.2407, - "step": 298100 - }, - { - "epoch": 0.002982, - "grad_norm": 1.2314772605895996, - "learning_rate": 1e-05, - "loss": 0.2443, - "step": 298200 - }, - { - "epoch": 0.002983, - "grad_norm": 1.324709415435791, - "learning_rate": 1e-05, - "loss": 0.2348, - "step": 298300 - }, - { - "epoch": 0.002984, - "grad_norm": 1.4330154657363892, - "learning_rate": 1e-05, - "loss": 0.2437, - "step": 298400 - }, - { - "epoch": 0.002985, - "grad_norm": 1.6337260007858276, - "learning_rate": 1e-05, - "loss": 0.2445, - "step": 298500 - }, - { - "epoch": 0.002986, - "grad_norm": 1.2202649116516113, - "learning_rate": 1e-05, - "loss": 0.2353, - "step": 298600 - }, - { - "epoch": 0.002987, - "grad_norm": 1.4072762727737427, - "learning_rate": 1e-05, - "loss": 0.245, - "step": 298700 - }, - { - "epoch": 0.002988, - "grad_norm": 1.3483080863952637, - "learning_rate": 1e-05, - "loss": 0.2494, - "step": 298800 - }, - { - "epoch": 0.002989, - "grad_norm": 1.530049443244934, - "learning_rate": 1e-05, - "loss": 0.2395, - "step": 298900 - }, - { - "epoch": 0.00299, - "grad_norm": 1.0752151012420654, - "learning_rate": 1e-05, - "loss": 0.2403, - "step": 299000 - }, - { - "epoch": 0.002991, - "grad_norm": 1.2792757749557495, - "learning_rate": 1e-05, - "loss": 0.2398, - "step": 299100 - }, - { - "epoch": 0.002992, - "grad_norm": 1.111703872680664, - "learning_rate": 1e-05, - "loss": 0.2474, - "step": 299200 - }, - { - "epoch": 0.002993, - "grad_norm": 1.6271562576293945, - "learning_rate": 1e-05, - "loss": 0.2494, - "step": 299300 - }, - { - "epoch": 0.002994, - "grad_norm": 1.414760708808899, - "learning_rate": 1e-05, - "loss": 0.2377, - "step": 299400 - }, - { - "epoch": 0.002995, - "grad_norm": 1.3182244300842285, - "learning_rate": 1e-05, - "loss": 0.235, - "step": 299500 - }, - { - "epoch": 0.002996, - "grad_norm": 1.2058541774749756, - "learning_rate": 1e-05, - "loss": 0.2443, - "step": 299600 - }, - { - "epoch": 0.002997, - "grad_norm": 1.3847687244415283, - "learning_rate": 1e-05, - "loss": 0.2373, - "step": 299700 - }, - { - "epoch": 0.002998, - "grad_norm": 1.2531335353851318, - "learning_rate": 1e-05, - "loss": 0.2454, - "step": 299800 - }, - { - "epoch": 0.002999, - "grad_norm": 1.1601791381835938, - "learning_rate": 1e-05, - "loss": 0.2404, - "step": 299900 - }, - { - "epoch": 0.003, - "grad_norm": 1.3942663669586182, - "learning_rate": 1e-05, - "loss": 0.2393, - "step": 300000 - }, - { - "epoch": 0.003, - "eval_loss": 0.21923828125, - "eval_runtime": 116.7941, - "eval_samples_per_second": 428.104, - "eval_steps_per_second": 26.756, - "step": 300000 - }, - { - "epoch": 0.003001, - "grad_norm": 1.266499400138855, - "learning_rate": 1e-05, - "loss": 0.2393, - "step": 300100 - }, - { - "epoch": 0.003002, - "grad_norm": 1.4448562860488892, - "learning_rate": 1e-05, - "loss": 0.2364, - "step": 300200 - }, - { - "epoch": 0.003003, - "grad_norm": 1.4585342407226562, - "learning_rate": 1e-05, - "loss": 0.2409, - "step": 300300 - }, - { - "epoch": 0.003004, - "grad_norm": 1.2878295183181763, - "learning_rate": 1e-05, - "loss": 0.2374, - "step": 300400 - }, - { - "epoch": 0.003005, - "grad_norm": 2.4662258625030518, - "learning_rate": 1e-05, - "loss": 0.2444, - "step": 300500 - }, - { - "epoch": 0.003006, - "grad_norm": 1.5313811302185059, - "learning_rate": 1e-05, - "loss": 0.2355, - "step": 300600 - }, - { - "epoch": 0.003007, - "grad_norm": 1.6990282535552979, - "learning_rate": 1e-05, - "loss": 0.2459, - "step": 300700 - }, - { - "epoch": 0.003008, - "grad_norm": 1.2280924320220947, - "learning_rate": 1e-05, - "loss": 0.241, - "step": 300800 - }, - { - "epoch": 0.003009, - "grad_norm": 1.3592984676361084, - "learning_rate": 1e-05, - "loss": 0.2423, - "step": 300900 - }, - { - "epoch": 0.00301, - "grad_norm": 1.4027960300445557, - "learning_rate": 1e-05, - "loss": 0.2375, - "step": 301000 - }, - { - "epoch": 0.003011, - "grad_norm": 1.3223481178283691, - "learning_rate": 1e-05, - "loss": 0.2397, - "step": 301100 - }, - { - "epoch": 0.003012, - "grad_norm": 1.1765297651290894, - "learning_rate": 1e-05, - "loss": 0.243, - "step": 301200 - }, - { - "epoch": 0.003013, - "grad_norm": 1.372947335243225, - "learning_rate": 1e-05, - "loss": 0.2395, - "step": 301300 - }, - { - "epoch": 0.003014, - "grad_norm": 1.4614768028259277, - "learning_rate": 1e-05, - "loss": 0.2473, - "step": 301400 - }, - { - "epoch": 0.003015, - "grad_norm": 1.3650870323181152, - "learning_rate": 1e-05, - "loss": 0.2417, - "step": 301500 - }, - { - "epoch": 0.003016, - "grad_norm": 1.286664366722107, - "learning_rate": 1e-05, - "loss": 0.2346, - "step": 301600 - }, - { - "epoch": 0.003017, - "grad_norm": 1.7471123933792114, - "learning_rate": 1e-05, - "loss": 0.2369, - "step": 301700 - }, - { - "epoch": 0.003018, - "grad_norm": 1.3718763589859009, - "learning_rate": 1e-05, - "loss": 0.236, - "step": 301800 - }, - { - "epoch": 0.003019, - "grad_norm": 1.4841729402542114, - "learning_rate": 1e-05, - "loss": 0.2446, - "step": 301900 - }, - { - "epoch": 0.00302, - "grad_norm": 0.9965932965278625, - "learning_rate": 1e-05, - "loss": 0.2438, - "step": 302000 - }, - { - "epoch": 0.003021, - "grad_norm": 1.137410283088684, - "learning_rate": 1e-05, - "loss": 0.2406, - "step": 302100 - }, - { - "epoch": 0.003022, - "grad_norm": 2.058351993560791, - "learning_rate": 1e-05, - "loss": 0.2394, - "step": 302200 - }, - { - "epoch": 0.003023, - "grad_norm": 1.2746084928512573, - "learning_rate": 1e-05, - "loss": 0.2409, - "step": 302300 - }, - { - "epoch": 0.003024, - "grad_norm": 1.286949634552002, - "learning_rate": 1e-05, - "loss": 0.2398, - "step": 302400 - }, - { - "epoch": 0.003025, - "grad_norm": 1.3544740676879883, - "learning_rate": 1e-05, - "loss": 0.235, - "step": 302500 - }, - { - "epoch": 0.003026, - "grad_norm": 1.3173516988754272, - "learning_rate": 1e-05, - "loss": 0.2424, - "step": 302600 - }, - { - "epoch": 0.003027, - "grad_norm": 1.4908658266067505, - "learning_rate": 1e-05, - "loss": 0.2357, - "step": 302700 - }, - { - "epoch": 0.003028, - "grad_norm": 1.7289472818374634, - "learning_rate": 1e-05, - "loss": 0.2392, - "step": 302800 - }, - { - "epoch": 0.003029, - "grad_norm": 1.1493161916732788, - "learning_rate": 1e-05, - "loss": 0.2396, - "step": 302900 - }, - { - "epoch": 0.00303, - "grad_norm": 1.1347864866256714, - "learning_rate": 1e-05, - "loss": 0.2435, - "step": 303000 - }, - { - "epoch": 0.003031, - "grad_norm": 1.1742538213729858, - "learning_rate": 1e-05, - "loss": 0.2464, - "step": 303100 - }, - { - "epoch": 0.003032, - "grad_norm": 1.2878683805465698, - "learning_rate": 1e-05, - "loss": 0.2478, - "step": 303200 - }, - { - "epoch": 0.003033, - "grad_norm": 1.354936122894287, - "learning_rate": 1e-05, - "loss": 0.2451, - "step": 303300 - }, - { - "epoch": 0.003034, - "grad_norm": 1.3237261772155762, - "learning_rate": 1e-05, - "loss": 0.2299, - "step": 303400 - }, - { - "epoch": 0.003035, - "grad_norm": 1.230892300605774, - "learning_rate": 1e-05, - "loss": 0.2403, - "step": 303500 - }, - { - "epoch": 0.003036, - "grad_norm": 1.4075496196746826, - "learning_rate": 1e-05, - "loss": 0.2488, - "step": 303600 - }, - { - "epoch": 0.003037, - "grad_norm": 1.3775579929351807, - "learning_rate": 1e-05, - "loss": 0.2399, - "step": 303700 - }, - { - "epoch": 0.003038, - "grad_norm": 1.672793984413147, - "learning_rate": 1e-05, - "loss": 0.2428, - "step": 303800 - }, - { - "epoch": 0.003039, - "grad_norm": 1.0980926752090454, - "learning_rate": 1e-05, - "loss": 0.2376, - "step": 303900 - }, - { - "epoch": 0.00304, - "grad_norm": 1.1451281309127808, - "learning_rate": 1e-05, - "loss": 0.2383, - "step": 304000 - }, - { - "epoch": 0.003041, - "grad_norm": 1.3741756677627563, - "learning_rate": 1e-05, - "loss": 0.2418, - "step": 304100 - }, - { - "epoch": 0.003042, - "grad_norm": 1.3082318305969238, - "learning_rate": 1e-05, - "loss": 0.24, - "step": 304200 - }, - { - "epoch": 0.003043, - "grad_norm": 1.2661668062210083, - "learning_rate": 1e-05, - "loss": 0.2407, - "step": 304300 - }, - { - "epoch": 0.003044, - "grad_norm": 1.3737661838531494, - "learning_rate": 1e-05, - "loss": 0.244, - "step": 304400 - }, - { - "epoch": 0.003045, - "grad_norm": 1.2678500413894653, - "learning_rate": 1e-05, - "loss": 0.2386, - "step": 304500 - }, - { - "epoch": 0.003046, - "grad_norm": 1.1583538055419922, - "learning_rate": 1e-05, - "loss": 0.2369, - "step": 304600 - }, - { - "epoch": 0.003047, - "grad_norm": 1.4184924364089966, - "learning_rate": 1e-05, - "loss": 0.2487, - "step": 304700 - }, - { - "epoch": 0.003048, - "grad_norm": 1.4402222633361816, - "learning_rate": 1e-05, - "loss": 0.2374, - "step": 304800 - }, - { - "epoch": 0.003049, - "grad_norm": 1.3228724002838135, - "learning_rate": 1e-05, - "loss": 0.236, - "step": 304900 - }, - { - "epoch": 0.00305, - "grad_norm": 1.4274728298187256, - "learning_rate": 1e-05, - "loss": 0.2446, - "step": 305000 - }, - { - "epoch": 0.003051, - "grad_norm": 1.3890833854675293, - "learning_rate": 1e-05, - "loss": 0.2398, - "step": 305100 - }, - { - "epoch": 0.003052, - "grad_norm": 1.1778687238693237, - "learning_rate": 1e-05, - "loss": 0.243, - "step": 305200 - }, - { - "epoch": 0.003053, - "grad_norm": 1.256292462348938, - "learning_rate": 1e-05, - "loss": 0.2372, - "step": 305300 - }, - { - "epoch": 0.003054, - "grad_norm": 1.3377788066864014, - "learning_rate": 1e-05, - "loss": 0.2408, - "step": 305400 - }, - { - "epoch": 0.003055, - "grad_norm": 1.2276209592819214, - "learning_rate": 1e-05, - "loss": 0.2329, - "step": 305500 - }, - { - "epoch": 0.003056, - "grad_norm": 1.4787712097167969, - "learning_rate": 1e-05, - "loss": 0.2393, - "step": 305600 - }, - { - "epoch": 0.003057, - "grad_norm": 1.123220682144165, - "learning_rate": 1e-05, - "loss": 0.2337, - "step": 305700 - }, - { - "epoch": 0.003058, - "grad_norm": 1.480110764503479, - "learning_rate": 1e-05, - "loss": 0.2415, - "step": 305800 - }, - { - "epoch": 0.003059, - "grad_norm": 1.1088813543319702, - "learning_rate": 1e-05, - "loss": 0.2397, - "step": 305900 - }, - { - "epoch": 0.00306, - "grad_norm": 1.226161241531372, - "learning_rate": 1e-05, - "loss": 0.2413, - "step": 306000 - }, - { - "epoch": 0.003061, - "grad_norm": 1.9262425899505615, - "learning_rate": 1e-05, - "loss": 0.2422, - "step": 306100 - }, - { - "epoch": 0.003062, - "grad_norm": 1.0367608070373535, - "learning_rate": 1e-05, - "loss": 0.2394, - "step": 306200 - }, - { - "epoch": 0.003063, - "grad_norm": 1.4568185806274414, - "learning_rate": 1e-05, - "loss": 0.2388, - "step": 306300 - }, - { - "epoch": 0.003064, - "grad_norm": 1.270838975906372, - "learning_rate": 1e-05, - "loss": 0.245, - "step": 306400 - }, - { - "epoch": 0.003065, - "grad_norm": 1.2517333030700684, - "learning_rate": 1e-05, - "loss": 0.2428, - "step": 306500 - }, - { - "epoch": 0.003066, - "grad_norm": 3.923546552658081, - "learning_rate": 1e-05, - "loss": 0.2481, - "step": 306600 - }, - { - "epoch": 0.003067, - "grad_norm": 1.3486560583114624, - "learning_rate": 1e-05, - "loss": 0.237, - "step": 306700 - }, - { - "epoch": 0.003068, - "grad_norm": 1.3074215650558472, - "learning_rate": 1e-05, - "loss": 0.2421, - "step": 306800 - }, - { - "epoch": 0.003069, - "grad_norm": 1.3834184408187866, - "learning_rate": 1e-05, - "loss": 0.2371, - "step": 306900 - }, - { - "epoch": 0.00307, - "grad_norm": 1.1799381971359253, - "learning_rate": 1e-05, - "loss": 0.2433, - "step": 307000 - }, - { - "epoch": 0.003071, - "grad_norm": 1.4102553129196167, - "learning_rate": 1e-05, - "loss": 0.2393, - "step": 307100 - }, - { - "epoch": 0.003072, - "grad_norm": 1.6468982696533203, - "learning_rate": 1e-05, - "loss": 0.2393, - "step": 307200 - }, - { - "epoch": 0.003073, - "grad_norm": 1.1382652521133423, - "learning_rate": 1e-05, - "loss": 0.2403, - "step": 307300 - }, - { - "epoch": 0.003074, - "grad_norm": 1.1657092571258545, - "learning_rate": 1e-05, - "loss": 0.2315, - "step": 307400 - }, - { - "epoch": 0.003075, - "grad_norm": 1.4603681564331055, - "learning_rate": 1e-05, - "loss": 0.2407, - "step": 307500 - }, - { - "epoch": 0.003076, - "grad_norm": 1.1055331230163574, - "learning_rate": 1e-05, - "loss": 0.2433, - "step": 307600 - }, - { - "epoch": 0.003077, - "grad_norm": 1.2054595947265625, - "learning_rate": 1e-05, - "loss": 0.2387, - "step": 307700 - }, - { - "epoch": 0.003078, - "grad_norm": 1.2264481782913208, - "learning_rate": 1e-05, - "loss": 0.2376, - "step": 307800 - }, - { - "epoch": 0.003079, - "grad_norm": 1.3053686618804932, - "learning_rate": 1e-05, - "loss": 0.2438, - "step": 307900 - }, - { - "epoch": 0.00308, - "grad_norm": 1.3429399728775024, - "learning_rate": 1e-05, - "loss": 0.2435, - "step": 308000 - }, - { - "epoch": 0.003081, - "grad_norm": 1.4589062929153442, - "learning_rate": 1e-05, - "loss": 0.2439, - "step": 308100 - }, - { - "epoch": 0.003082, - "grad_norm": 1.3560503721237183, - "learning_rate": 1e-05, - "loss": 0.2372, - "step": 308200 - }, - { - "epoch": 0.003083, - "grad_norm": 1.4815075397491455, - "learning_rate": 1e-05, - "loss": 0.2465, - "step": 308300 - }, - { - "epoch": 0.003084, - "grad_norm": 1.369139313697815, - "learning_rate": 1e-05, - "loss": 0.2376, - "step": 308400 - }, - { - "epoch": 0.003085, - "grad_norm": 1.327416181564331, - "learning_rate": 1e-05, - "loss": 0.241, - "step": 308500 - }, - { - "epoch": 0.003086, - "grad_norm": 1.4156224727630615, - "learning_rate": 1e-05, - "loss": 0.2346, - "step": 308600 - }, - { - "epoch": 0.003087, - "grad_norm": 1.4687318801879883, - "learning_rate": 1e-05, - "loss": 0.2368, - "step": 308700 - }, - { - "epoch": 0.003088, - "grad_norm": 1.3687090873718262, - "learning_rate": 1e-05, - "loss": 0.2409, - "step": 308800 - }, - { - "epoch": 0.003089, - "grad_norm": 1.058356523513794, - "learning_rate": 1e-05, - "loss": 0.2409, - "step": 308900 - }, - { - "epoch": 0.00309, - "grad_norm": 1.3809763193130493, - "learning_rate": 1e-05, - "loss": 0.2414, - "step": 309000 - }, - { - "epoch": 0.003091, - "grad_norm": 1.3075354099273682, - "learning_rate": 1e-05, - "loss": 0.2399, - "step": 309100 - }, - { - "epoch": 0.003092, - "grad_norm": 1.6219615936279297, - "learning_rate": 1e-05, - "loss": 0.2368, - "step": 309200 - }, - { - "epoch": 0.003093, - "grad_norm": 1.2773137092590332, - "learning_rate": 1e-05, - "loss": 0.2369, - "step": 309300 - }, - { - "epoch": 0.003094, - "grad_norm": 1.314570426940918, - "learning_rate": 1e-05, - "loss": 0.236, - "step": 309400 - }, - { - "epoch": 0.003095, - "grad_norm": 1.2769191265106201, - "learning_rate": 1e-05, - "loss": 0.24, - "step": 309500 - }, - { - "epoch": 0.003096, - "grad_norm": 1.4884626865386963, - "learning_rate": 1e-05, - "loss": 0.2396, - "step": 309600 - }, - { - "epoch": 0.003097, - "grad_norm": 1.187654733657837, - "learning_rate": 1e-05, - "loss": 0.2371, - "step": 309700 - }, - { - "epoch": 0.003098, - "grad_norm": 1.2243677377700806, - "learning_rate": 1e-05, - "loss": 0.2421, - "step": 309800 - }, - { - "epoch": 0.003099, - "grad_norm": 1.440105676651001, - "learning_rate": 1e-05, - "loss": 0.2362, - "step": 309900 - }, - { - "epoch": 0.0031, - "grad_norm": 1.2419408559799194, - "learning_rate": 1e-05, - "loss": 0.2341, - "step": 310000 - }, - { - "epoch": 0.003101, - "grad_norm": 1.4774112701416016, - "learning_rate": 1e-05, - "loss": 0.2421, - "step": 310100 - }, - { - "epoch": 0.003102, - "grad_norm": 1.2358349561691284, - "learning_rate": 1e-05, - "loss": 0.2315, - "step": 310200 - }, - { - "epoch": 0.003103, - "grad_norm": 1.423378825187683, - "learning_rate": 1e-05, - "loss": 0.2348, - "step": 310300 - }, - { - "epoch": 0.003104, - "grad_norm": 1.2779252529144287, - "learning_rate": 1e-05, - "loss": 0.2357, - "step": 310400 - }, - { - "epoch": 0.003105, - "grad_norm": 1.3882098197937012, - "learning_rate": 1e-05, - "loss": 0.2329, - "step": 310500 - }, - { - "epoch": 0.003106, - "grad_norm": 1.3466850519180298, - "learning_rate": 1e-05, - "loss": 0.2365, - "step": 310600 - }, - { - "epoch": 0.003107, - "grad_norm": 1.2759345769882202, - "learning_rate": 1e-05, - "loss": 0.2365, - "step": 310700 - }, - { - "epoch": 0.003108, - "grad_norm": 1.3532472848892212, - "learning_rate": 1e-05, - "loss": 0.239, - "step": 310800 - }, - { - "epoch": 0.003109, - "grad_norm": 1.4525240659713745, - "learning_rate": 1e-05, - "loss": 0.234, - "step": 310900 - }, - { - "epoch": 0.00311, - "grad_norm": 1.3393089771270752, - "learning_rate": 1e-05, - "loss": 0.2399, - "step": 311000 - }, - { - "epoch": 0.003111, - "grad_norm": 1.394953966140747, - "learning_rate": 1e-05, - "loss": 0.2388, - "step": 311100 - }, - { - "epoch": 0.003112, - "grad_norm": 1.2774931192398071, - "learning_rate": 1e-05, - "loss": 0.2412, - "step": 311200 - }, - { - "epoch": 0.003113, - "grad_norm": 1.4417829513549805, - "learning_rate": 1e-05, - "loss": 0.2399, - "step": 311300 - }, - { - "epoch": 0.003114, - "grad_norm": 1.4170730113983154, - "learning_rate": 1e-05, - "loss": 0.2372, - "step": 311400 - }, - { - "epoch": 0.003115, - "grad_norm": 1.43710458278656, - "learning_rate": 1e-05, - "loss": 0.2424, - "step": 311500 - }, - { - "epoch": 0.003116, - "grad_norm": 1.1141654253005981, - "learning_rate": 1e-05, - "loss": 0.2366, - "step": 311600 - }, - { - "epoch": 0.003117, - "grad_norm": 1.2110333442687988, - "learning_rate": 1e-05, - "loss": 0.2409, - "step": 311700 - }, - { - "epoch": 0.003118, - "grad_norm": 1.2857674360275269, - "learning_rate": 1e-05, - "loss": 0.229, - "step": 311800 - }, - { - "epoch": 0.003119, - "grad_norm": 1.2459951639175415, - "learning_rate": 1e-05, - "loss": 0.2431, - "step": 311900 - }, - { - "epoch": 0.00312, - "grad_norm": 1.1275666952133179, - "learning_rate": 1e-05, - "loss": 0.2419, - "step": 312000 - }, - { - "epoch": 0.003121, - "grad_norm": 1.2489356994628906, - "learning_rate": 1e-05, - "loss": 0.2379, - "step": 312100 - }, - { - "epoch": 0.003122, - "grad_norm": 1.9926685094833374, - "learning_rate": 1e-05, - "loss": 0.2372, - "step": 312200 - }, - { - "epoch": 0.003123, - "grad_norm": 1.3316718339920044, - "learning_rate": 1e-05, - "loss": 0.2369, - "step": 312300 - }, - { - "epoch": 0.003124, - "grad_norm": 1.169675350189209, - "learning_rate": 1e-05, - "loss": 0.2399, - "step": 312400 - }, - { - "epoch": 0.003125, - "grad_norm": 1.3205904960632324, - "learning_rate": 1e-05, - "loss": 0.2314, - "step": 312500 - }, - { - "epoch": 0.003126, - "grad_norm": 1.8434507846832275, - "learning_rate": 1e-05, - "loss": 0.2339, - "step": 312600 - }, - { - "epoch": 0.003127, - "grad_norm": 1.343630313873291, - "learning_rate": 1e-05, - "loss": 0.2348, - "step": 312700 - }, - { - "epoch": 0.003128, - "grad_norm": 1.119792103767395, - "learning_rate": 1e-05, - "loss": 0.2313, - "step": 312800 - }, - { - "epoch": 0.003129, - "grad_norm": 1.3983734846115112, - "learning_rate": 1e-05, - "loss": 0.2454, - "step": 312900 - }, - { - "epoch": 0.00313, - "grad_norm": 1.2698651552200317, - "learning_rate": 1e-05, - "loss": 0.2333, - "step": 313000 - }, - { - "epoch": 0.003131, - "grad_norm": 1.2215535640716553, - "learning_rate": 1e-05, - "loss": 0.2412, - "step": 313100 - }, - { - "epoch": 0.003132, - "grad_norm": 1.2362414598464966, - "learning_rate": 1e-05, - "loss": 0.2404, - "step": 313200 - }, - { - "epoch": 0.003133, - "grad_norm": 1.5521022081375122, - "learning_rate": 1e-05, - "loss": 0.2374, - "step": 313300 - }, - { - "epoch": 0.003134, - "grad_norm": 1.188597559928894, - "learning_rate": 1e-05, - "loss": 0.2397, - "step": 313400 - }, - { - "epoch": 0.003135, - "grad_norm": 1.2732994556427002, - "learning_rate": 1e-05, - "loss": 0.2339, - "step": 313500 - }, - { - "epoch": 0.003136, - "grad_norm": 1.5800340175628662, - "learning_rate": 1e-05, - "loss": 0.2477, - "step": 313600 - }, - { - "epoch": 0.003137, - "grad_norm": 1.4321262836456299, - "learning_rate": 1e-05, - "loss": 0.2328, - "step": 313700 - }, - { - "epoch": 0.003138, - "grad_norm": 1.429616093635559, - "learning_rate": 1e-05, - "loss": 0.2376, - "step": 313800 - }, - { - "epoch": 0.003139, - "grad_norm": 1.156804084777832, - "learning_rate": 1e-05, - "loss": 0.2367, - "step": 313900 - }, - { - "epoch": 0.00314, - "grad_norm": 1.2023727893829346, - "learning_rate": 1e-05, - "loss": 0.2337, - "step": 314000 - }, - { - "epoch": 0.003141, - "grad_norm": 1.2538083791732788, - "learning_rate": 1e-05, - "loss": 0.2411, - "step": 314100 - }, - { - "epoch": 0.003142, - "grad_norm": 1.1308847665786743, - "learning_rate": 1e-05, - "loss": 0.2364, - "step": 314200 - }, - { - "epoch": 0.003143, - "grad_norm": 1.0516680479049683, - "learning_rate": 1e-05, - "loss": 0.2333, - "step": 314300 - }, - { - "epoch": 0.003144, - "grad_norm": 1.2595715522766113, - "learning_rate": 1e-05, - "loss": 0.2249, - "step": 314400 - }, - { - "epoch": 0.003145, - "grad_norm": 1.3008543252944946, - "learning_rate": 1e-05, - "loss": 0.2437, - "step": 314500 - }, - { - "epoch": 0.003146, - "grad_norm": 1.337666630744934, - "learning_rate": 1e-05, - "loss": 0.2377, - "step": 314600 - }, - { - "epoch": 0.003147, - "grad_norm": 1.1450291872024536, - "learning_rate": 1e-05, - "loss": 0.2369, - "step": 314700 - }, - { - "epoch": 0.003148, - "grad_norm": 1.2482352256774902, - "learning_rate": 1e-05, - "loss": 0.2401, - "step": 314800 - }, - { - "epoch": 0.003149, - "grad_norm": 1.2616183757781982, - "learning_rate": 1e-05, - "loss": 0.2349, - "step": 314900 - }, - { - "epoch": 0.00315, - "grad_norm": 1.3074548244476318, - "learning_rate": 1e-05, - "loss": 0.2429, - "step": 315000 - }, - { - "epoch": 0.003151, - "grad_norm": 1.4957194328308105, - "learning_rate": 1e-05, - "loss": 0.2395, - "step": 315100 - }, - { - "epoch": 0.003152, - "grad_norm": 1.3458212614059448, - "learning_rate": 1e-05, - "loss": 0.2412, - "step": 315200 - }, - { - "epoch": 0.003153, - "grad_norm": 1.1305044889450073, - "learning_rate": 1e-05, - "loss": 0.2387, - "step": 315300 - }, - { - "epoch": 0.003154, - "grad_norm": 1.8351118564605713, - "learning_rate": 1e-05, - "loss": 0.2349, - "step": 315400 - }, - { - "epoch": 0.003155, - "grad_norm": 1.1634564399719238, - "learning_rate": 1e-05, - "loss": 0.2404, - "step": 315500 - }, - { - "epoch": 0.003156, - "grad_norm": 1.31840980052948, - "learning_rate": 1e-05, - "loss": 0.2404, - "step": 315600 - }, - { - "epoch": 0.003157, - "grad_norm": 1.2562260627746582, - "learning_rate": 1e-05, - "loss": 0.2382, - "step": 315700 - }, - { - "epoch": 0.003158, - "grad_norm": 1.5455302000045776, - "learning_rate": 1e-05, - "loss": 0.2443, - "step": 315800 - }, - { - "epoch": 0.003159, - "grad_norm": 1.194831371307373, - "learning_rate": 1e-05, - "loss": 0.2368, - "step": 315900 - }, - { - "epoch": 0.00316, - "grad_norm": 1.4830539226531982, - "learning_rate": 1e-05, - "loss": 0.2433, - "step": 316000 - }, - { - "epoch": 0.003161, - "grad_norm": 1.433650016784668, - "learning_rate": 1e-05, - "loss": 0.2301, - "step": 316100 - }, - { - "epoch": 0.003162, - "grad_norm": 1.1843311786651611, - "learning_rate": 1e-05, - "loss": 0.2401, - "step": 316200 - }, - { - "epoch": 0.003163, - "grad_norm": 1.2167657613754272, - "learning_rate": 1e-05, - "loss": 0.2427, - "step": 316300 - }, - { - "epoch": 0.003164, - "grad_norm": 1.228497862815857, - "learning_rate": 1e-05, - "loss": 0.2455, - "step": 316400 - }, - { - "epoch": 0.003165, - "grad_norm": 1.3738011121749878, - "learning_rate": 1e-05, - "loss": 0.2375, - "step": 316500 - }, - { - "epoch": 0.003166, - "grad_norm": 1.303950548171997, - "learning_rate": 1e-05, - "loss": 0.2381, - "step": 316600 - }, - { - "epoch": 0.003167, - "grad_norm": 1.5050631761550903, - "learning_rate": 1e-05, - "loss": 0.241, - "step": 316700 - }, - { - "epoch": 0.003168, - "grad_norm": 1.0819690227508545, - "learning_rate": 1e-05, - "loss": 0.235, - "step": 316800 - }, - { - "epoch": 0.003169, - "grad_norm": 1.4763338565826416, - "learning_rate": 1e-05, - "loss": 0.2377, - "step": 316900 - }, - { - "epoch": 0.00317, - "grad_norm": 1.3414305448532104, - "learning_rate": 1e-05, - "loss": 0.2355, - "step": 317000 - }, - { - "epoch": 0.003171, - "grad_norm": 1.292258858680725, - "learning_rate": 1e-05, - "loss": 0.231, - "step": 317100 - }, - { - "epoch": 0.003172, - "grad_norm": 1.2649155855178833, - "learning_rate": 1e-05, - "loss": 0.2458, - "step": 317200 - }, - { - "epoch": 0.003173, - "grad_norm": 1.5519813299179077, - "learning_rate": 1e-05, - "loss": 0.2402, - "step": 317300 - }, - { - "epoch": 0.003174, - "grad_norm": 1.343334436416626, - "learning_rate": 1e-05, - "loss": 0.2332, - "step": 317400 - }, - { - "epoch": 0.003175, - "grad_norm": 1.5822913646697998, - "learning_rate": 1e-05, - "loss": 0.2302, - "step": 317500 - }, - { - "epoch": 0.003176, - "grad_norm": 1.4342949390411377, - "learning_rate": 1e-05, - "loss": 0.23, - "step": 317600 - }, - { - "epoch": 0.003177, - "grad_norm": 0.9082462191581726, - "learning_rate": 1e-05, - "loss": 0.2358, - "step": 317700 - }, - { - "epoch": 0.003178, - "grad_norm": 1.373437523841858, - "learning_rate": 1e-05, - "loss": 0.2353, - "step": 317800 - }, - { - "epoch": 0.003179, - "grad_norm": 1.3905372619628906, - "learning_rate": 1e-05, - "loss": 0.2375, - "step": 317900 - }, - { - "epoch": 0.00318, - "grad_norm": 1.3060153722763062, - "learning_rate": 1e-05, - "loss": 0.2395, - "step": 318000 - }, - { - "epoch": 0.003181, - "grad_norm": 1.2086070775985718, - "learning_rate": 1e-05, - "loss": 0.2426, - "step": 318100 - }, - { - "epoch": 0.003182, - "grad_norm": 1.2720617055892944, - "learning_rate": 1e-05, - "loss": 0.2397, - "step": 318200 - }, - { - "epoch": 0.003183, - "grad_norm": 1.4451135396957397, - "learning_rate": 1e-05, - "loss": 0.2378, - "step": 318300 - }, - { - "epoch": 0.003184, - "grad_norm": 1.4509061574935913, - "learning_rate": 1e-05, - "loss": 0.2423, - "step": 318400 - }, - { - "epoch": 0.003185, - "grad_norm": 1.4142173528671265, - "learning_rate": 1e-05, - "loss": 0.2335, - "step": 318500 - }, - { - "epoch": 0.003186, - "grad_norm": 1.1529561281204224, - "learning_rate": 1e-05, - "loss": 0.2287, - "step": 318600 - }, - { - "epoch": 0.003187, - "grad_norm": 1.2890421152114868, - "learning_rate": 1e-05, - "loss": 0.2393, - "step": 318700 - }, - { - "epoch": 0.003188, - "grad_norm": 1.3999617099761963, - "learning_rate": 1e-05, - "loss": 0.237, - "step": 318800 - }, - { - "epoch": 0.003189, - "grad_norm": 1.3913931846618652, - "learning_rate": 1e-05, - "loss": 0.2377, - "step": 318900 - }, - { - "epoch": 0.00319, - "grad_norm": 1.2426543235778809, - "learning_rate": 1e-05, - "loss": 0.239, - "step": 319000 - }, - { - "epoch": 0.003191, - "grad_norm": 1.302836298942566, - "learning_rate": 1e-05, - "loss": 0.2364, - "step": 319100 - }, - { - "epoch": 0.003192, - "grad_norm": 1.3547791242599487, - "learning_rate": 1e-05, - "loss": 0.2347, - "step": 319200 - }, - { - "epoch": 0.003193, - "grad_norm": 1.3384548425674438, - "learning_rate": 1e-05, - "loss": 0.2411, - "step": 319300 - }, - { - "epoch": 0.003194, - "grad_norm": 1.3114272356033325, - "learning_rate": 1e-05, - "loss": 0.238, - "step": 319400 - }, - { - "epoch": 0.003195, - "grad_norm": 1.3199174404144287, - "learning_rate": 1e-05, - "loss": 0.2352, - "step": 319500 - }, - { - "epoch": 0.003196, - "grad_norm": 1.4394826889038086, - "learning_rate": 1e-05, - "loss": 0.2334, - "step": 319600 - }, - { - "epoch": 0.003197, - "grad_norm": 1.43064284324646, - "learning_rate": 1e-05, - "loss": 0.234, - "step": 319700 - }, - { - "epoch": 0.003198, - "grad_norm": 1.2477679252624512, - "learning_rate": 1e-05, - "loss": 0.2435, - "step": 319800 - }, - { - "epoch": 0.003199, - "grad_norm": 1.533571720123291, - "learning_rate": 1e-05, - "loss": 0.2342, - "step": 319900 - }, - { - "epoch": 0.0032, - "grad_norm": 1.1788870096206665, - "learning_rate": 1e-05, - "loss": 0.2365, - "step": 320000 - }, - { - "epoch": 0.0032, - "eval_loss": 0.2120361328125, - "eval_runtime": 114.5409, - "eval_samples_per_second": 436.525, - "eval_steps_per_second": 27.283, - "step": 320000 - }, - { - "epoch": 0.003201, - "grad_norm": 1.3262966871261597, - "learning_rate": 1e-05, - "loss": 0.24, - "step": 320100 - }, - { - "epoch": 0.003202, - "grad_norm": 1.628220558166504, - "learning_rate": 1e-05, - "loss": 0.2335, - "step": 320200 - }, - { - "epoch": 0.003203, - "grad_norm": 1.3940200805664062, - "learning_rate": 1e-05, - "loss": 0.2412, - "step": 320300 - }, - { - "epoch": 0.003204, - "grad_norm": 1.4075195789337158, - "learning_rate": 1e-05, - "loss": 0.2354, - "step": 320400 - }, - { - "epoch": 0.003205, - "grad_norm": 1.2457551956176758, - "learning_rate": 1e-05, - "loss": 0.2365, - "step": 320500 - }, - { - "epoch": 0.003206, - "grad_norm": 1.2386821508407593, - "learning_rate": 1e-05, - "loss": 0.2405, - "step": 320600 - }, - { - "epoch": 0.003207, - "grad_norm": 1.433843731880188, - "learning_rate": 1e-05, - "loss": 0.2357, - "step": 320700 - }, - { - "epoch": 0.003208, - "grad_norm": 1.223730444908142, - "learning_rate": 1e-05, - "loss": 0.2397, - "step": 320800 - }, - { - "epoch": 0.003209, - "grad_norm": 1.3134397268295288, - "learning_rate": 1e-05, - "loss": 0.2432, - "step": 320900 - }, - { - "epoch": 0.00321, - "grad_norm": 1.0871469974517822, - "learning_rate": 1e-05, - "loss": 0.2336, - "step": 321000 - }, - { - "epoch": 0.003211, - "grad_norm": 1.1775238513946533, - "learning_rate": 1e-05, - "loss": 0.2359, - "step": 321100 - }, - { - "epoch": 0.003212, - "grad_norm": 1.2703193426132202, - "learning_rate": 1e-05, - "loss": 0.2321, - "step": 321200 - }, - { - "epoch": 0.003213, - "grad_norm": 1.2876147031784058, - "learning_rate": 1e-05, - "loss": 0.2386, - "step": 321300 - }, - { - "epoch": 0.003214, - "grad_norm": 1.0872234106063843, - "learning_rate": 1e-05, - "loss": 0.2301, - "step": 321400 - }, - { - "epoch": 0.003215, - "grad_norm": 2.1837048530578613, - "learning_rate": 1e-05, - "loss": 0.2373, - "step": 321500 - }, - { - "epoch": 0.003216, - "grad_norm": 1.2259389162063599, - "learning_rate": 1e-05, - "loss": 0.2355, - "step": 321600 - }, - { - "epoch": 0.003217, - "grad_norm": 1.3419442176818848, - "learning_rate": 1e-05, - "loss": 0.2309, - "step": 321700 - }, - { - "epoch": 0.003218, - "grad_norm": 2.667717456817627, - "learning_rate": 1e-05, - "loss": 0.2282, - "step": 321800 - }, - { - "epoch": 0.003219, - "grad_norm": 1.3365943431854248, - "learning_rate": 1e-05, - "loss": 0.2366, - "step": 321900 - }, - { - "epoch": 0.00322, - "grad_norm": 1.4526586532592773, - "learning_rate": 1e-05, - "loss": 0.2356, - "step": 322000 - }, - { - "epoch": 0.003221, - "grad_norm": 1.3023200035095215, - "learning_rate": 1e-05, - "loss": 0.2299, - "step": 322100 - }, - { - "epoch": 0.003222, - "grad_norm": 1.360321283340454, - "learning_rate": 1e-05, - "loss": 0.2402, - "step": 322200 - }, - { - "epoch": 0.003223, - "grad_norm": 1.3585379123687744, - "learning_rate": 1e-05, - "loss": 0.234, - "step": 322300 - }, - { - "epoch": 0.003224, - "grad_norm": 1.2170279026031494, - "learning_rate": 1e-05, - "loss": 0.2393, - "step": 322400 - }, - { - "epoch": 0.003225, - "grad_norm": 1.2210286855697632, - "learning_rate": 1e-05, - "loss": 0.2294, - "step": 322500 - }, - { - "epoch": 0.003226, - "grad_norm": 1.1365993022918701, - "learning_rate": 1e-05, - "loss": 0.2368, - "step": 322600 - }, - { - "epoch": 0.003227, - "grad_norm": 1.3429679870605469, - "learning_rate": 1e-05, - "loss": 0.2343, - "step": 322700 - }, - { - "epoch": 0.003228, - "grad_norm": 1.158896803855896, - "learning_rate": 1e-05, - "loss": 0.2326, - "step": 322800 - }, - { - "epoch": 0.003229, - "grad_norm": 1.3973841667175293, - "learning_rate": 1e-05, - "loss": 0.2422, - "step": 322900 - }, - { - "epoch": 0.00323, - "grad_norm": 1.2723912000656128, - "learning_rate": 1e-05, - "loss": 0.2369, - "step": 323000 - }, - { - "epoch": 0.003231, - "grad_norm": 1.0782179832458496, - "learning_rate": 1e-05, - "loss": 0.2381, - "step": 323100 - }, - { - "epoch": 0.003232, - "grad_norm": 1.2514499425888062, - "learning_rate": 1e-05, - "loss": 0.2323, - "step": 323200 - }, - { - "epoch": 0.003233, - "grad_norm": 1.2795170545578003, - "learning_rate": 1e-05, - "loss": 0.2325, - "step": 323300 - }, - { - "epoch": 0.003234, - "grad_norm": 1.2373872995376587, - "learning_rate": 1e-05, - "loss": 0.2407, - "step": 323400 - }, - { - "epoch": 0.003235, - "grad_norm": 1.7018437385559082, - "learning_rate": 1e-05, - "loss": 0.2395, - "step": 323500 - }, - { - "epoch": 0.003236, - "grad_norm": 1.2047086954116821, - "learning_rate": 1e-05, - "loss": 0.2329, - "step": 323600 - }, - { - "epoch": 0.003237, - "grad_norm": 1.3701646327972412, - "learning_rate": 1e-05, - "loss": 0.241, - "step": 323700 - }, - { - "epoch": 0.003238, - "grad_norm": 1.1767195463180542, - "learning_rate": 1e-05, - "loss": 0.2345, - "step": 323800 - }, - { - "epoch": 0.003239, - "grad_norm": 1.3310903310775757, - "learning_rate": 1e-05, - "loss": 0.2358, - "step": 323900 - }, - { - "epoch": 0.00324, - "grad_norm": 1.2926236391067505, - "learning_rate": 1e-05, - "loss": 0.2367, - "step": 324000 - }, - { - "epoch": 0.003241, - "grad_norm": 1.1691709756851196, - "learning_rate": 1e-05, - "loss": 0.2343, - "step": 324100 - }, - { - "epoch": 0.003242, - "grad_norm": 1.244135856628418, - "learning_rate": 1e-05, - "loss": 0.2365, - "step": 324200 - }, - { - "epoch": 0.003243, - "grad_norm": 1.316163182258606, - "learning_rate": 1e-05, - "loss": 0.2351, - "step": 324300 - }, - { - "epoch": 0.003244, - "grad_norm": 1.616714358329773, - "learning_rate": 1e-05, - "loss": 0.2343, - "step": 324400 - }, - { - "epoch": 0.003245, - "grad_norm": 1.5194528102874756, - "learning_rate": 1e-05, - "loss": 0.242, - "step": 324500 - }, - { - "epoch": 0.003246, - "grad_norm": 1.3706395626068115, - "learning_rate": 1e-05, - "loss": 0.2355, - "step": 324600 - }, - { - "epoch": 0.003247, - "grad_norm": 1.5692068338394165, - "learning_rate": 1e-05, - "loss": 0.2319, - "step": 324700 - }, - { - "epoch": 0.003248, - "grad_norm": 1.4041072130203247, - "learning_rate": 1e-05, - "loss": 0.2394, - "step": 324800 - }, - { - "epoch": 0.003249, - "grad_norm": 1.4589322805404663, - "learning_rate": 1e-05, - "loss": 0.2366, - "step": 324900 - }, - { - "epoch": 0.00325, - "grad_norm": 1.2800227403640747, - "learning_rate": 1e-05, - "loss": 0.2348, - "step": 325000 - }, - { - "epoch": 0.003251, - "grad_norm": 1.4000357389450073, - "learning_rate": 1e-05, - "loss": 0.2342, - "step": 325100 - }, - { - "epoch": 0.003252, - "grad_norm": 1.4728070497512817, - "learning_rate": 1e-05, - "loss": 0.2379, - "step": 325200 - }, - { - "epoch": 0.003253, - "grad_norm": 1.0459872484207153, - "learning_rate": 1e-05, - "loss": 0.2356, - "step": 325300 - }, - { - "epoch": 0.003254, - "grad_norm": 1.5237504243850708, - "learning_rate": 1e-05, - "loss": 0.2323, - "step": 325400 - }, - { - "epoch": 0.003255, - "grad_norm": 1.1053203344345093, - "learning_rate": 1e-05, - "loss": 0.2284, - "step": 325500 - }, - { - "epoch": 0.003256, - "grad_norm": 1.220874547958374, - "learning_rate": 1e-05, - "loss": 0.2368, - "step": 325600 - }, - { - "epoch": 0.003257, - "grad_norm": 1.2732406854629517, - "learning_rate": 1e-05, - "loss": 0.2366, - "step": 325700 - }, - { - "epoch": 0.003258, - "grad_norm": 1.1701757907867432, - "learning_rate": 1e-05, - "loss": 0.2391, - "step": 325800 - }, - { - "epoch": 0.003259, - "grad_norm": 1.1687523126602173, - "learning_rate": 1e-05, - "loss": 0.2325, - "step": 325900 - }, - { - "epoch": 0.00326, - "grad_norm": 1.33219313621521, - "learning_rate": 1e-05, - "loss": 0.2315, - "step": 326000 - }, - { - "epoch": 0.003261, - "grad_norm": 1.2226959466934204, - "learning_rate": 1e-05, - "loss": 0.2313, - "step": 326100 - }, - { - "epoch": 0.003262, - "grad_norm": 1.3035367727279663, - "learning_rate": 1e-05, - "loss": 0.2346, - "step": 326200 - }, - { - "epoch": 0.003263, - "grad_norm": 1.298467993736267, - "learning_rate": 1e-05, - "loss": 0.232, - "step": 326300 - }, - { - "epoch": 0.003264, - "grad_norm": 1.3820903301239014, - "learning_rate": 1e-05, - "loss": 0.2372, - "step": 326400 - }, - { - "epoch": 0.003265, - "grad_norm": 1.479079008102417, - "learning_rate": 1e-05, - "loss": 0.2287, - "step": 326500 - }, - { - "epoch": 0.003266, - "grad_norm": 1.1762094497680664, - "learning_rate": 1e-05, - "loss": 0.2307, - "step": 326600 - }, - { - "epoch": 0.003267, - "grad_norm": 1.5140327215194702, - "learning_rate": 1e-05, - "loss": 0.2333, - "step": 326700 - }, - { - "epoch": 0.003268, - "grad_norm": 1.4078545570373535, - "learning_rate": 1e-05, - "loss": 0.2368, - "step": 326800 - }, - { - "epoch": 0.003269, - "grad_norm": 1.3485603332519531, - "learning_rate": 1e-05, - "loss": 0.2409, - "step": 326900 - }, - { - "epoch": 0.00327, - "grad_norm": 1.250405192375183, - "learning_rate": 1e-05, - "loss": 0.2373, - "step": 327000 - }, - { - "epoch": 0.003271, - "grad_norm": 1.2252273559570312, - "learning_rate": 1e-05, - "loss": 0.2391, - "step": 327100 - }, - { - "epoch": 0.003272, - "grad_norm": 1.5675464868545532, - "learning_rate": 1e-05, - "loss": 0.2344, - "step": 327200 - }, - { - "epoch": 0.003273, - "grad_norm": 1.2831748723983765, - "learning_rate": 1e-05, - "loss": 0.2329, - "step": 327300 - }, - { - "epoch": 0.003274, - "grad_norm": 1.3781280517578125, - "learning_rate": 1e-05, - "loss": 0.2322, - "step": 327400 - }, - { - "epoch": 0.003275, - "grad_norm": 1.280205488204956, - "learning_rate": 1e-05, - "loss": 0.2369, - "step": 327500 - }, - { - "epoch": 0.003276, - "grad_norm": 1.1911392211914062, - "learning_rate": 1e-05, - "loss": 0.235, - "step": 327600 - }, - { - "epoch": 0.003277, - "grad_norm": 1.4558595418930054, - "learning_rate": 1e-05, - "loss": 0.2336, - "step": 327700 - }, - { - "epoch": 0.003278, - "grad_norm": 1.813578724861145, - "learning_rate": 1e-05, - "loss": 0.2391, - "step": 327800 - }, - { - "epoch": 0.003279, - "grad_norm": 1.0403188467025757, - "learning_rate": 1e-05, - "loss": 0.226, - "step": 327900 - }, - { - "epoch": 0.00328, - "grad_norm": 1.4072662591934204, - "learning_rate": 1e-05, - "loss": 0.2387, - "step": 328000 - }, - { - "epoch": 0.003281, - "grad_norm": 1.10177481174469, - "learning_rate": 1e-05, - "loss": 0.2357, - "step": 328100 - }, - { - "epoch": 0.003282, - "grad_norm": 1.090941309928894, - "learning_rate": 1e-05, - "loss": 0.2438, - "step": 328200 - }, - { - "epoch": 0.003283, - "grad_norm": 1.2674936056137085, - "learning_rate": 1e-05, - "loss": 0.2325, - "step": 328300 - }, - { - "epoch": 0.003284, - "grad_norm": 1.3046340942382812, - "learning_rate": 1e-05, - "loss": 0.2322, - "step": 328400 - }, - { - "epoch": 0.003285, - "grad_norm": 1.2432596683502197, - "learning_rate": 1e-05, - "loss": 0.2363, - "step": 328500 - }, - { - "epoch": 0.003286, - "grad_norm": 1.4052422046661377, - "learning_rate": 1e-05, - "loss": 0.2324, - "step": 328600 - }, - { - "epoch": 0.003287, - "grad_norm": 1.529941201210022, - "learning_rate": 1e-05, - "loss": 0.2399, - "step": 328700 - }, - { - "epoch": 0.003288, - "grad_norm": 1.1230138540267944, - "learning_rate": 1e-05, - "loss": 0.2297, - "step": 328800 - }, - { - "epoch": 0.003289, - "grad_norm": 1.1800566911697388, - "learning_rate": 1e-05, - "loss": 0.2379, - "step": 328900 - }, - { - "epoch": 0.00329, - "grad_norm": 1.436422348022461, - "learning_rate": 1e-05, - "loss": 0.2304, - "step": 329000 - }, - { - "epoch": 0.003291, - "grad_norm": 1.3106859922409058, - "learning_rate": 1e-05, - "loss": 0.2399, - "step": 329100 - }, - { - "epoch": 0.003292, - "grad_norm": 1.3878048658370972, - "learning_rate": 1e-05, - "loss": 0.2347, - "step": 329200 - }, - { - "epoch": 0.003293, - "grad_norm": 1.0402004718780518, - "learning_rate": 1e-05, - "loss": 0.2356, - "step": 329300 - }, - { - "epoch": 0.003294, - "grad_norm": 1.0895251035690308, - "learning_rate": 1e-05, - "loss": 0.2312, - "step": 329400 - }, - { - "epoch": 0.003295, - "grad_norm": 1.3054835796356201, - "learning_rate": 1e-05, - "loss": 0.2329, - "step": 329500 - }, - { - "epoch": 0.003296, - "grad_norm": 1.2511048316955566, - "learning_rate": 1e-05, - "loss": 0.2374, - "step": 329600 - }, - { - "epoch": 0.003297, - "grad_norm": 1.48338782787323, - "learning_rate": 1e-05, - "loss": 0.2416, - "step": 329700 - }, - { - "epoch": 0.003298, - "grad_norm": 1.257490634918213, - "learning_rate": 1e-05, - "loss": 0.2384, - "step": 329800 - }, - { - "epoch": 0.003299, - "grad_norm": 1.2889269590377808, - "learning_rate": 1e-05, - "loss": 0.2297, - "step": 329900 - }, - { - "epoch": 0.0033, - "grad_norm": 1.3305553197860718, - "learning_rate": 1e-05, - "loss": 0.2323, - "step": 330000 - }, - { - "epoch": 0.003301, - "grad_norm": 1.4710475206375122, - "learning_rate": 1e-05, - "loss": 0.2262, - "step": 330100 - }, - { - "epoch": 0.003302, - "grad_norm": 1.3060176372528076, - "learning_rate": 1e-05, - "loss": 0.2328, - "step": 330200 - }, - { - "epoch": 0.003303, - "grad_norm": 1.0658338069915771, - "learning_rate": 1e-05, - "loss": 0.234, - "step": 330300 - }, - { - "epoch": 0.003304, - "grad_norm": 1.840126395225525, - "learning_rate": 1e-05, - "loss": 0.2324, - "step": 330400 - }, - { - "epoch": 0.003305, - "grad_norm": 1.263517141342163, - "learning_rate": 1e-05, - "loss": 0.2346, - "step": 330500 - }, - { - "epoch": 0.003306, - "grad_norm": 1.1184327602386475, - "learning_rate": 1e-05, - "loss": 0.2329, - "step": 330600 - }, - { - "epoch": 0.003307, - "grad_norm": 1.3393223285675049, - "learning_rate": 1e-05, - "loss": 0.2314, - "step": 330700 - }, - { - "epoch": 0.003308, - "grad_norm": 1.3231823444366455, - "learning_rate": 1e-05, - "loss": 0.2389, - "step": 330800 - }, - { - "epoch": 0.003309, - "grad_norm": 1.4390610456466675, - "learning_rate": 1e-05, - "loss": 0.2269, - "step": 330900 - }, - { - "epoch": 0.00331, - "grad_norm": 1.1492725610733032, - "learning_rate": 1e-05, - "loss": 0.2368, - "step": 331000 - }, - { - "epoch": 0.003311, - "grad_norm": 1.1424356698989868, - "learning_rate": 1e-05, - "loss": 0.2361, - "step": 331100 - }, - { - "epoch": 0.003312, - "grad_norm": 1.6432085037231445, - "learning_rate": 1e-05, - "loss": 0.2287, - "step": 331200 - }, - { - "epoch": 0.003313, - "grad_norm": 1.120138168334961, - "learning_rate": 1e-05, - "loss": 0.2402, - "step": 331300 - }, - { - "epoch": 0.003314, - "grad_norm": 1.4646905660629272, - "learning_rate": 1e-05, - "loss": 0.2263, - "step": 331400 - }, - { - "epoch": 0.003315, - "grad_norm": 1.3460516929626465, - "learning_rate": 1e-05, - "loss": 0.2337, - "step": 331500 - }, - { - "epoch": 0.003316, - "grad_norm": 1.357112169265747, - "learning_rate": 1e-05, - "loss": 0.2338, - "step": 331600 - }, - { - "epoch": 0.003317, - "grad_norm": 1.540134072303772, - "learning_rate": 1e-05, - "loss": 0.2372, - "step": 331700 - }, - { - "epoch": 0.003318, - "grad_norm": 1.350807547569275, - "learning_rate": 1e-05, - "loss": 0.233, - "step": 331800 - }, - { - "epoch": 0.003319, - "grad_norm": 1.3031643629074097, - "learning_rate": 1e-05, - "loss": 0.2316, - "step": 331900 - }, - { - "epoch": 0.00332, - "grad_norm": 1.136181354522705, - "learning_rate": 1e-05, - "loss": 0.2329, - "step": 332000 - }, - { - "epoch": 0.003321, - "grad_norm": 1.1599808931350708, - "learning_rate": 1e-05, - "loss": 0.2373, - "step": 332100 - }, - { - "epoch": 0.003322, - "grad_norm": 1.2984144687652588, - "learning_rate": 1e-05, - "loss": 0.2359, - "step": 332200 - }, - { - "epoch": 0.003323, - "grad_norm": 1.4425690174102783, - "learning_rate": 1e-05, - "loss": 0.2339, - "step": 332300 - }, - { - "epoch": 0.003324, - "grad_norm": 1.290778636932373, - "learning_rate": 1e-05, - "loss": 0.2273, - "step": 332400 - }, - { - "epoch": 0.003325, - "grad_norm": 1.3013964891433716, - "learning_rate": 1e-05, - "loss": 0.2299, - "step": 332500 - }, - { - "epoch": 0.003326, - "grad_norm": 1.2175419330596924, - "learning_rate": 1e-05, - "loss": 0.2381, - "step": 332600 - }, - { - "epoch": 0.003327, - "grad_norm": 1.3039450645446777, - "learning_rate": 1e-05, - "loss": 0.2314, - "step": 332700 - }, - { - "epoch": 0.003328, - "grad_norm": 1.4682481288909912, - "learning_rate": 1e-05, - "loss": 0.2377, - "step": 332800 - }, - { - "epoch": 0.003329, - "grad_norm": 1.1764458417892456, - "learning_rate": 1e-05, - "loss": 0.2318, - "step": 332900 - }, - { - "epoch": 0.00333, - "grad_norm": 1.3276108503341675, - "learning_rate": 1e-05, - "loss": 0.235, - "step": 333000 - }, - { - "epoch": 0.003331, - "grad_norm": 1.1627488136291504, - "learning_rate": 1e-05, - "loss": 0.235, - "step": 333100 - }, - { - "epoch": 0.003332, - "grad_norm": 1.1364787817001343, - "learning_rate": 1e-05, - "loss": 0.231, - "step": 333200 - }, - { - "epoch": 0.003333, - "grad_norm": 1.4209784269332886, - "learning_rate": 1e-05, - "loss": 0.2349, - "step": 333300 - }, - { - "epoch": 0.003334, - "grad_norm": 0.9984776973724365, - "learning_rate": 1e-05, - "loss": 0.2323, - "step": 333400 - }, - { - "epoch": 0.003335, - "grad_norm": 1.6686424016952515, - "learning_rate": 1e-05, - "loss": 0.2305, - "step": 333500 - }, - { - "epoch": 0.003336, - "grad_norm": 1.3752646446228027, - "learning_rate": 1e-05, - "loss": 0.2321, - "step": 333600 - }, - { - "epoch": 0.003337, - "grad_norm": 1.346661925315857, - "learning_rate": 1e-05, - "loss": 0.2348, - "step": 333700 - }, - { - "epoch": 0.003338, - "grad_norm": 1.3721468448638916, - "learning_rate": 1e-05, - "loss": 0.2197, - "step": 333800 - }, - { - "epoch": 0.003339, - "grad_norm": 1.1212071180343628, - "learning_rate": 1e-05, - "loss": 0.2332, - "step": 333900 - }, - { - "epoch": 0.00334, - "grad_norm": 1.09237802028656, - "learning_rate": 1e-05, - "loss": 0.2251, - "step": 334000 - }, - { - "epoch": 0.003341, - "grad_norm": 5.280272960662842, - "learning_rate": 1e-05, - "loss": 0.2297, - "step": 334100 - }, - { - "epoch": 0.003342, - "grad_norm": 1.2541955709457397, - "learning_rate": 1e-05, - "loss": 0.2252, - "step": 334200 - }, - { - "epoch": 0.003343, - "grad_norm": 1.3003326654434204, - "learning_rate": 1e-05, - "loss": 0.2266, - "step": 334300 - }, - { - "epoch": 0.003344, - "grad_norm": 1.1321443319320679, - "learning_rate": 1e-05, - "loss": 0.2317, - "step": 334400 - }, - { - "epoch": 0.003345, - "grad_norm": 1.2620316743850708, - "learning_rate": 1e-05, - "loss": 0.2337, - "step": 334500 - }, - { - "epoch": 0.003346, - "grad_norm": 1.2102006673812866, - "learning_rate": 1e-05, - "loss": 0.2348, - "step": 334600 - }, - { - "epoch": 0.003347, - "grad_norm": 1.4066046476364136, - "learning_rate": 1e-05, - "loss": 0.2306, - "step": 334700 - }, - { - "epoch": 0.003348, - "grad_norm": 1.1030715703964233, - "learning_rate": 1e-05, - "loss": 0.234, - "step": 334800 - }, - { - "epoch": 0.003349, - "grad_norm": 1.2863750457763672, - "learning_rate": 1e-05, - "loss": 0.2315, - "step": 334900 - }, - { - "epoch": 0.00335, - "grad_norm": 1.2839666604995728, - "learning_rate": 1e-05, - "loss": 0.2303, - "step": 335000 - }, - { - "epoch": 0.003351, - "grad_norm": 1.3155107498168945, - "learning_rate": 1e-05, - "loss": 0.2338, - "step": 335100 - }, - { - "epoch": 0.003352, - "grad_norm": 1.273241639137268, - "learning_rate": 1e-05, - "loss": 0.2282, - "step": 335200 - }, - { - "epoch": 0.003353, - "grad_norm": 1.396444320678711, - "learning_rate": 1e-05, - "loss": 0.2299, - "step": 335300 - }, - { - "epoch": 0.003354, - "grad_norm": 1.2242732048034668, - "learning_rate": 1e-05, - "loss": 0.2333, - "step": 335400 - }, - { - "epoch": 0.003355, - "grad_norm": 1.3949216604232788, - "learning_rate": 1e-05, - "loss": 0.2341, - "step": 335500 - }, - { - "epoch": 0.003356, - "grad_norm": 1.2203855514526367, - "learning_rate": 1e-05, - "loss": 0.2297, - "step": 335600 - }, - { - "epoch": 0.003357, - "grad_norm": 1.4057589769363403, - "learning_rate": 1e-05, - "loss": 0.229, - "step": 335700 - }, - { - "epoch": 0.003358, - "grad_norm": 1.3634288311004639, - "learning_rate": 1e-05, - "loss": 0.2329, - "step": 335800 - }, - { - "epoch": 0.003359, - "grad_norm": 1.3653627634048462, - "learning_rate": 1e-05, - "loss": 0.2345, - "step": 335900 - }, - { - "epoch": 0.00336, - "grad_norm": 1.3090736865997314, - "learning_rate": 1e-05, - "loss": 0.2322, - "step": 336000 - }, - { - "epoch": 0.003361, - "grad_norm": 1.3299808502197266, - "learning_rate": 1e-05, - "loss": 0.2233, - "step": 336100 - }, - { - "epoch": 0.003362, - "grad_norm": 1.3597570657730103, - "learning_rate": 1e-05, - "loss": 0.2347, - "step": 336200 - }, - { - "epoch": 0.003363, - "grad_norm": 1.7366302013397217, - "learning_rate": 1e-05, - "loss": 0.2329, - "step": 336300 - }, - { - "epoch": 0.003364, - "grad_norm": 1.2465051412582397, - "learning_rate": 1e-05, - "loss": 0.2365, - "step": 336400 - }, - { - "epoch": 0.003365, - "grad_norm": 1.4841548204421997, - "learning_rate": 1e-05, - "loss": 0.2339, - "step": 336500 - }, - { - "epoch": 0.003366, - "grad_norm": 1.0925931930541992, - "learning_rate": 1e-05, - "loss": 0.2343, - "step": 336600 - }, - { - "epoch": 0.003367, - "grad_norm": 1.780450701713562, - "learning_rate": 1e-05, - "loss": 0.2304, - "step": 336700 - }, - { - "epoch": 0.003368, - "grad_norm": 1.1564908027648926, - "learning_rate": 1e-05, - "loss": 0.2287, - "step": 336800 - }, - { - "epoch": 0.003369, - "grad_norm": 1.2795488834381104, - "learning_rate": 1e-05, - "loss": 0.2401, - "step": 336900 - }, - { - "epoch": 0.00337, - "grad_norm": 1.3799196481704712, - "learning_rate": 1e-05, - "loss": 0.2321, - "step": 337000 - }, - { - "epoch": 0.003371, - "grad_norm": 1.3650449514389038, - "learning_rate": 1e-05, - "loss": 0.2294, - "step": 337100 - }, - { - "epoch": 0.003372, - "grad_norm": 1.2433546781539917, - "learning_rate": 1e-05, - "loss": 0.2342, - "step": 337200 - }, - { - "epoch": 0.003373, - "grad_norm": 1.1827058792114258, - "learning_rate": 1e-05, - "loss": 0.2337, - "step": 337300 - }, - { - "epoch": 0.003374, - "grad_norm": 1.2716456651687622, - "learning_rate": 1e-05, - "loss": 0.2258, - "step": 337400 - }, - { - "epoch": 0.003375, - "grad_norm": 1.1015453338623047, - "learning_rate": 1e-05, - "loss": 0.2275, - "step": 337500 - }, - { - "epoch": 0.003376, - "grad_norm": 1.186984658241272, - "learning_rate": 1e-05, - "loss": 0.2283, - "step": 337600 - }, - { - "epoch": 0.003377, - "grad_norm": 1.269175410270691, - "learning_rate": 1e-05, - "loss": 0.2367, - "step": 337700 - }, - { - "epoch": 0.003378, - "grad_norm": 1.1895431280136108, - "learning_rate": 1e-05, - "loss": 0.2323, - "step": 337800 - }, - { - "epoch": 0.003379, - "grad_norm": 1.284507393836975, - "learning_rate": 1e-05, - "loss": 0.2296, - "step": 337900 - }, - { - "epoch": 0.00338, - "grad_norm": 1.4350554943084717, - "learning_rate": 1e-05, - "loss": 0.2367, - "step": 338000 - }, - { - "epoch": 0.003381, - "grad_norm": 1.3566421270370483, - "learning_rate": 1e-05, - "loss": 0.2278, - "step": 338100 - }, - { - "epoch": 0.003382, - "grad_norm": 1.2975407838821411, - "learning_rate": 1e-05, - "loss": 0.2353, - "step": 338200 - }, - { - "epoch": 0.003383, - "grad_norm": 1.2475285530090332, - "learning_rate": 1e-05, - "loss": 0.2343, - "step": 338300 - }, - { - "epoch": 0.003384, - "grad_norm": 1.248896837234497, - "learning_rate": 1e-05, - "loss": 0.2345, - "step": 338400 - }, - { - "epoch": 0.003385, - "grad_norm": 1.2539887428283691, - "learning_rate": 1e-05, - "loss": 0.2288, - "step": 338500 - }, - { - "epoch": 0.003386, - "grad_norm": 1.3050204515457153, - "learning_rate": 1e-05, - "loss": 0.2289, - "step": 338600 - }, - { - "epoch": 0.003387, - "grad_norm": 1.2642877101898193, - "learning_rate": 1e-05, - "loss": 0.2367, - "step": 338700 - }, - { - "epoch": 0.003388, - "grad_norm": 1.3642864227294922, - "learning_rate": 1e-05, - "loss": 0.2272, - "step": 338800 - }, - { - "epoch": 0.003389, - "grad_norm": 1.221392035484314, - "learning_rate": 1e-05, - "loss": 0.2312, - "step": 338900 - }, - { - "epoch": 0.00339, - "grad_norm": 1.2033590078353882, - "learning_rate": 1e-05, - "loss": 0.2313, - "step": 339000 - }, - { - "epoch": 0.003391, - "grad_norm": 1.2614527940750122, - "learning_rate": 1e-05, - "loss": 0.2319, - "step": 339100 - }, - { - "epoch": 0.003392, - "grad_norm": 1.2570033073425293, - "learning_rate": 1e-05, - "loss": 0.2273, - "step": 339200 - }, - { - "epoch": 0.003393, - "grad_norm": 1.3876926898956299, - "learning_rate": 1e-05, - "loss": 0.2306, - "step": 339300 - }, - { - "epoch": 0.003394, - "grad_norm": 1.050087809562683, - "learning_rate": 1e-05, - "loss": 0.2398, - "step": 339400 - }, - { - "epoch": 0.003395, - "grad_norm": 1.2645732164382935, - "learning_rate": 1e-05, - "loss": 0.237, - "step": 339500 - }, - { - "epoch": 0.003396, - "grad_norm": 1.2092682123184204, - "learning_rate": 1e-05, - "loss": 0.2294, - "step": 339600 - }, - { - "epoch": 0.003397, - "grad_norm": 2.184636354446411, - "learning_rate": 1e-05, - "loss": 0.2353, - "step": 339700 - }, - { - "epoch": 0.003398, - "grad_norm": 1.2815287113189697, - "learning_rate": 1e-05, - "loss": 0.2284, - "step": 339800 - }, - { - "epoch": 0.003399, - "grad_norm": 1.2131634950637817, - "learning_rate": 1e-05, - "loss": 0.2272, - "step": 339900 - }, - { - "epoch": 0.0034, - "grad_norm": 1.1350979804992676, - "learning_rate": 1e-05, - "loss": 0.2258, - "step": 340000 - }, - { - "epoch": 0.0034, - "eval_loss": 0.207763671875, - "eval_runtime": 117.6953, - "eval_samples_per_second": 424.826, - "eval_steps_per_second": 26.552, - "step": 340000 } ], "logging_steps": 100, @@ -23971,7 +16931,7 @@ "attributes": {} } }, - "total_flos": 7.1071432704e+17, + "total_flos": 5.0168070144e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null