diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,42034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.4670460118162641, + "eval_steps": 500, + "global_step": 30000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 7.784100196937735e-05, + "grad_norm": 18.677021026611328, + "learning_rate": 1.2453300124533002e-09, + "loss": 1.0606, + "step": 5 + }, + { + "epoch": 0.0001556820039387547, + "grad_norm": 8.49953842163086, + "learning_rate": 2.801992528019925e-09, + "loss": 1.0495, + "step": 10 + }, + { + "epoch": 0.00023352300590813205, + "grad_norm": 4.764372825622559, + "learning_rate": 4.358655043586551e-09, + "loss": 1.0324, + "step": 15 + }, + { + "epoch": 0.0003113640078775094, + "grad_norm": 6.350065231323242, + "learning_rate": 5.915317559153175e-09, + "loss": 1.0236, + "step": 20 + }, + { + "epoch": 0.00038920500984688676, + "grad_norm": 21.48935317993164, + "learning_rate": 7.471980074719801e-09, + "loss": 1.0156, + "step": 25 + }, + { + "epoch": 0.0004670460118162641, + "grad_norm": 11.964753150939941, + "learning_rate": 9.028642590286426e-09, + "loss": 1.1045, + "step": 30 + }, + { + "epoch": 0.0005448870137856414, + "grad_norm": 16.780696868896484, + "learning_rate": 1.0585305105853052e-08, + "loss": 0.9926, + "step": 35 + }, + { + "epoch": 0.0006227280157550188, + "grad_norm": 9.311758041381836, + "learning_rate": 1.2141967621419675e-08, + "loss": 1.0672, + "step": 40 + }, + { + "epoch": 0.0007005690177243961, + "grad_norm": 10.671490669250488, + "learning_rate": 1.36986301369863e-08, + "loss": 1.1432, + "step": 45 + }, + { + "epoch": 0.0007784100196937735, + "grad_norm": 6.056899547576904, + "learning_rate": 1.5255292652552926e-08, + "loss": 1.0207, + "step": 50 + }, + { + "epoch": 0.0008562510216631508, + "grad_norm": 12.727471351623535, + "learning_rate": 1.6811955168119553e-08, + "loss": 1.0626, + "step": 55 + }, + { + "epoch": 0.0009340920236325282, + "grad_norm": 3.2297894954681396, + "learning_rate": 1.8368617683686178e-08, + "loss": 1.0551, + "step": 60 + }, + { + "epoch": 0.0010119330256019056, + "grad_norm": 5.09151554107666, + "learning_rate": 1.9925280199252803e-08, + "loss": 1.0051, + "step": 65 + }, + { + "epoch": 0.0010897740275712829, + "grad_norm": 9.829240798950195, + "learning_rate": 2.1481942714819424e-08, + "loss": 1.0936, + "step": 70 + }, + { + "epoch": 0.0011676150295406602, + "grad_norm": 4.1169023513793945, + "learning_rate": 2.3038605230386048e-08, + "loss": 0.9367, + "step": 75 + }, + { + "epoch": 0.0012454560315100377, + "grad_norm": 22.784198760986328, + "learning_rate": 2.4595267745952676e-08, + "loss": 1.1475, + "step": 80 + }, + { + "epoch": 0.001323297033479415, + "grad_norm": 6.216701507568359, + "learning_rate": 2.61519302615193e-08, + "loss": 1.1738, + "step": 85 + }, + { + "epoch": 0.0014011380354487922, + "grad_norm": 8.767633438110352, + "learning_rate": 2.7708592777085925e-08, + "loss": 1.0995, + "step": 90 + }, + { + "epoch": 0.0014789790374181697, + "grad_norm": 6.444150447845459, + "learning_rate": 2.926525529265255e-08, + "loss": 1.1674, + "step": 95 + }, + { + "epoch": 0.001556820039387547, + "grad_norm": 21.939842224121094, + "learning_rate": 3.082191780821918e-08, + "loss": 0.9658, + "step": 100 + }, + { + "epoch": 0.0016346610413569243, + "grad_norm": 6.07455587387085, + "learning_rate": 3.23785803237858e-08, + "loss": 1.3969, + "step": 105 + }, + { + "epoch": 0.0017125020433263016, + "grad_norm": 7.66893196105957, + "learning_rate": 3.3935242839352427e-08, + "loss": 1.0274, + "step": 110 + }, + { + "epoch": 0.0017903430452956791, + "grad_norm": 6.411283016204834, + "learning_rate": 3.549190535491906e-08, + "loss": 1.0922, + "step": 115 + }, + { + "epoch": 0.0018681840472650564, + "grad_norm": 15.535103797912598, + "learning_rate": 3.704856787048568e-08, + "loss": 1.058, + "step": 120 + }, + { + "epoch": 0.0019460250492344337, + "grad_norm": 13.108068466186523, + "learning_rate": 3.860523038605231e-08, + "loss": 1.1347, + "step": 125 + }, + { + "epoch": 0.002023866051203811, + "grad_norm": 5.452599048614502, + "learning_rate": 4.016189290161893e-08, + "loss": 1.07, + "step": 130 + }, + { + "epoch": 0.0021017070531731885, + "grad_norm": 13.57522964477539, + "learning_rate": 4.1718555417185556e-08, + "loss": 1.0556, + "step": 135 + }, + { + "epoch": 0.0021795480551425658, + "grad_norm": 4.844541072845459, + "learning_rate": 4.3275217932752174e-08, + "loss": 1.0538, + "step": 140 + }, + { + "epoch": 0.002257389057111943, + "grad_norm": 7.6000800132751465, + "learning_rate": 4.48318804483188e-08, + "loss": 1.0884, + "step": 145 + }, + { + "epoch": 0.0023352300590813203, + "grad_norm": 6.445258617401123, + "learning_rate": 4.638854296388542e-08, + "loss": 1.0247, + "step": 150 + }, + { + "epoch": 0.002413071061050698, + "grad_norm": 4.861091136932373, + "learning_rate": 4.794520547945205e-08, + "loss": 1.0553, + "step": 155 + }, + { + "epoch": 0.0024909120630200753, + "grad_norm": 6.040435314178467, + "learning_rate": 4.950186799501867e-08, + "loss": 1.0682, + "step": 160 + }, + { + "epoch": 0.0025687530649894526, + "grad_norm": 10.561899185180664, + "learning_rate": 5.10585305105853e-08, + "loss": 0.9904, + "step": 165 + }, + { + "epoch": 0.00264659406695883, + "grad_norm": 8.238300323486328, + "learning_rate": 5.261519302615193e-08, + "loss": 1.0576, + "step": 170 + }, + { + "epoch": 0.002724435068928207, + "grad_norm": 10.821751594543457, + "learning_rate": 5.417185554171855e-08, + "loss": 1.0507, + "step": 175 + }, + { + "epoch": 0.0028022760708975845, + "grad_norm": 20.215164184570312, + "learning_rate": 5.5728518057285177e-08, + "loss": 1.2059, + "step": 180 + }, + { + "epoch": 0.0028801170728669618, + "grad_norm": 15.447042465209961, + "learning_rate": 5.72851805728518e-08, + "loss": 1.1047, + "step": 185 + }, + { + "epoch": 0.0029579580748363395, + "grad_norm": 13.472341537475586, + "learning_rate": 5.8841843088418426e-08, + "loss": 1.0508, + "step": 190 + }, + { + "epoch": 0.003035799076805717, + "grad_norm": 16.09784507751465, + "learning_rate": 6.039850560398505e-08, + "loss": 1.1144, + "step": 195 + }, + { + "epoch": 0.003113640078775094, + "grad_norm": 5.519948959350586, + "learning_rate": 6.195516811955167e-08, + "loss": 1.1127, + "step": 200 + }, + { + "epoch": 0.0031914810807444714, + "grad_norm": 9.467545509338379, + "learning_rate": 6.351183063511831e-08, + "loss": 1.0882, + "step": 205 + }, + { + "epoch": 0.0032693220827138486, + "grad_norm": 8.895452499389648, + "learning_rate": 6.506849315068492e-08, + "loss": 1.0845, + "step": 210 + }, + { + "epoch": 0.003347163084683226, + "grad_norm": 10.007709503173828, + "learning_rate": 6.662515566625156e-08, + "loss": 1.1585, + "step": 215 + }, + { + "epoch": 0.0034250040866526032, + "grad_norm": 10.499605178833008, + "learning_rate": 6.818181818181817e-08, + "loss": 1.103, + "step": 220 + }, + { + "epoch": 0.003502845088621981, + "grad_norm": 5.367983818054199, + "learning_rate": 6.973848069738481e-08, + "loss": 1.1586, + "step": 225 + }, + { + "epoch": 0.0035806860905913582, + "grad_norm": 19.27895164489746, + "learning_rate": 7.129514321295142e-08, + "loss": 1.0898, + "step": 230 + }, + { + "epoch": 0.0036585270925607355, + "grad_norm": 3.7263176441192627, + "learning_rate": 7.285180572851806e-08, + "loss": 1.0541, + "step": 235 + }, + { + "epoch": 0.003736368094530113, + "grad_norm": 21.48790740966797, + "learning_rate": 7.440846824408468e-08, + "loss": 1.2173, + "step": 240 + }, + { + "epoch": 0.00381420909649949, + "grad_norm": 5.5661702156066895, + "learning_rate": 7.596513075965131e-08, + "loss": 1.1816, + "step": 245 + }, + { + "epoch": 0.0038920500984688674, + "grad_norm": 13.601526260375977, + "learning_rate": 7.752179327521793e-08, + "loss": 1.0989, + "step": 250 + }, + { + "epoch": 0.003969891100438245, + "grad_norm": 9.873005867004395, + "learning_rate": 7.907845579078456e-08, + "loss": 1.1859, + "step": 255 + }, + { + "epoch": 0.004047732102407622, + "grad_norm": 4.8417277336120605, + "learning_rate": 8.063511830635118e-08, + "loss": 0.9859, + "step": 260 + }, + { + "epoch": 0.004125573104377, + "grad_norm": 3.8291945457458496, + "learning_rate": 8.21917808219178e-08, + "loss": 0.9709, + "step": 265 + }, + { + "epoch": 0.004203414106346377, + "grad_norm": 5.504295349121094, + "learning_rate": 8.374844333748443e-08, + "loss": 1.1271, + "step": 270 + }, + { + "epoch": 0.004281255108315754, + "grad_norm": 10.665711402893066, + "learning_rate": 8.530510585305104e-08, + "loss": 1.1773, + "step": 275 + }, + { + "epoch": 0.0043590961102851315, + "grad_norm": 8.259835243225098, + "learning_rate": 8.686176836861768e-08, + "loss": 1.1428, + "step": 280 + }, + { + "epoch": 0.004436937112254509, + "grad_norm": 15.531925201416016, + "learning_rate": 8.84184308841843e-08, + "loss": 1.1423, + "step": 285 + }, + { + "epoch": 0.004514778114223886, + "grad_norm": 17.920616149902344, + "learning_rate": 8.997509339975093e-08, + "loss": 1.3731, + "step": 290 + }, + { + "epoch": 0.004592619116193263, + "grad_norm": 5.740132808685303, + "learning_rate": 9.153175591531755e-08, + "loss": 0.9373, + "step": 295 + }, + { + "epoch": 0.004670460118162641, + "grad_norm": 6.698586463928223, + "learning_rate": 9.308841843088418e-08, + "loss": 1.0603, + "step": 300 + }, + { + "epoch": 0.004748301120132018, + "grad_norm": 4.851785182952881, + "learning_rate": 9.46450809464508e-08, + "loss": 1.004, + "step": 305 + }, + { + "epoch": 0.004826142122101396, + "grad_norm": 7.876951217651367, + "learning_rate": 9.620174346201743e-08, + "loss": 1.1607, + "step": 310 + }, + { + "epoch": 0.004903983124070773, + "grad_norm": 9.093779563903809, + "learning_rate": 9.775840597758405e-08, + "loss": 1.1045, + "step": 315 + }, + { + "epoch": 0.004981824126040151, + "grad_norm": 16.582103729248047, + "learning_rate": 9.931506849315068e-08, + "loss": 1.1154, + "step": 320 + }, + { + "epoch": 0.005059665128009528, + "grad_norm": 13.140198707580566, + "learning_rate": 1.008717310087173e-07, + "loss": 1.17, + "step": 325 + }, + { + "epoch": 0.005137506129978905, + "grad_norm": 3.4895646572113037, + "learning_rate": 1.0242839352428394e-07, + "loss": 0.9535, + "step": 330 + }, + { + "epoch": 0.0052153471319482825, + "grad_norm": 6.645687103271484, + "learning_rate": 1.0398505603985055e-07, + "loss": 1.05, + "step": 335 + }, + { + "epoch": 0.00529318813391766, + "grad_norm": 7.615957736968994, + "learning_rate": 1.0554171855541719e-07, + "loss": 0.9174, + "step": 340 + }, + { + "epoch": 0.005371029135887037, + "grad_norm": 8.536812782287598, + "learning_rate": 1.070983810709838e-07, + "loss": 1.1338, + "step": 345 + }, + { + "epoch": 0.005448870137856414, + "grad_norm": 4.573184967041016, + "learning_rate": 1.0865504358655044e-07, + "loss": 1.2018, + "step": 350 + }, + { + "epoch": 0.005526711139825792, + "grad_norm": 11.614198684692383, + "learning_rate": 1.1021170610211705e-07, + "loss": 1.1853, + "step": 355 + }, + { + "epoch": 0.005604552141795169, + "grad_norm": 12.930988311767578, + "learning_rate": 1.1176836861768369e-07, + "loss": 1.1772, + "step": 360 + }, + { + "epoch": 0.005682393143764546, + "grad_norm": 5.334465980529785, + "learning_rate": 1.133250311332503e-07, + "loss": 1.1402, + "step": 365 + }, + { + "epoch": 0.0057602341457339236, + "grad_norm": 19.55135726928711, + "learning_rate": 1.1488169364881693e-07, + "loss": 0.9569, + "step": 370 + }, + { + "epoch": 0.005838075147703301, + "grad_norm": 14.209831237792969, + "learning_rate": 1.1643835616438355e-07, + "loss": 1.1239, + "step": 375 + }, + { + "epoch": 0.005915916149672679, + "grad_norm": 5.5656352043151855, + "learning_rate": 1.1799501867995018e-07, + "loss": 1.1074, + "step": 380 + }, + { + "epoch": 0.005993757151642056, + "grad_norm": 10.571775436401367, + "learning_rate": 1.1955168119551682e-07, + "loss": 1.1663, + "step": 385 + }, + { + "epoch": 0.006071598153611434, + "grad_norm": 5.807967662811279, + "learning_rate": 1.2110834371108342e-07, + "loss": 1.1868, + "step": 390 + }, + { + "epoch": 0.006149439155580811, + "grad_norm": 7.003355503082275, + "learning_rate": 1.2266500622665007e-07, + "loss": 1.0249, + "step": 395 + }, + { + "epoch": 0.006227280157550188, + "grad_norm": 14.337294578552246, + "learning_rate": 1.2422166874221667e-07, + "loss": 1.0405, + "step": 400 + }, + { + "epoch": 0.006305121159519565, + "grad_norm": 12.388212203979492, + "learning_rate": 1.2577833125778332e-07, + "loss": 1.1801, + "step": 405 + }, + { + "epoch": 0.006382962161488943, + "grad_norm": 11.25795841217041, + "learning_rate": 1.2733499377334994e-07, + "loss": 1.1672, + "step": 410 + }, + { + "epoch": 0.00646080316345832, + "grad_norm": 15.970906257629395, + "learning_rate": 1.2889165628891654e-07, + "loss": 1.0815, + "step": 415 + }, + { + "epoch": 0.006538644165427697, + "grad_norm": 16.4951114654541, + "learning_rate": 1.3044831880448317e-07, + "loss": 1.039, + "step": 420 + }, + { + "epoch": 0.006616485167397075, + "grad_norm": 16.199981689453125, + "learning_rate": 1.3200498132004982e-07, + "loss": 1.1636, + "step": 425 + }, + { + "epoch": 0.006694326169366452, + "grad_norm": 7.787930965423584, + "learning_rate": 1.3356164383561644e-07, + "loss": 1.0949, + "step": 430 + }, + { + "epoch": 0.006772167171335829, + "grad_norm": 4.226932525634766, + "learning_rate": 1.3511830635118307e-07, + "loss": 1.0409, + "step": 435 + }, + { + "epoch": 0.0068500081733052064, + "grad_norm": 19.068387985229492, + "learning_rate": 1.3667496886674967e-07, + "loss": 0.9881, + "step": 440 + }, + { + "epoch": 0.006927849175274584, + "grad_norm": 3.8829450607299805, + "learning_rate": 1.3823163138231632e-07, + "loss": 0.9989, + "step": 445 + }, + { + "epoch": 0.007005690177243962, + "grad_norm": 5.948785305023193, + "learning_rate": 1.3978829389788294e-07, + "loss": 1.007, + "step": 450 + }, + { + "epoch": 0.007083531179213339, + "grad_norm": 5.125, + "learning_rate": 1.4134495641344957e-07, + "loss": 0.8301, + "step": 455 + }, + { + "epoch": 0.0071613721811827164, + "grad_norm": 12.499361038208008, + "learning_rate": 1.4290161892901616e-07, + "loss": 1.0923, + "step": 460 + }, + { + "epoch": 0.007239213183152094, + "grad_norm": 6.266834735870361, + "learning_rate": 1.4445828144458281e-07, + "loss": 0.88, + "step": 465 + }, + { + "epoch": 0.007317054185121471, + "grad_norm": 9.417441368103027, + "learning_rate": 1.4601494396014944e-07, + "loss": 0.9669, + "step": 470 + }, + { + "epoch": 0.007394895187090848, + "grad_norm": 9.376644134521484, + "learning_rate": 1.4757160647571606e-07, + "loss": 1.0241, + "step": 475 + }, + { + "epoch": 0.007472736189060226, + "grad_norm": 10.515301704406738, + "learning_rate": 1.491282689912827e-07, + "loss": 1.0468, + "step": 480 + }, + { + "epoch": 0.007550577191029603, + "grad_norm": 8.439921379089355, + "learning_rate": 1.506849315068493e-07, + "loss": 0.9356, + "step": 485 + }, + { + "epoch": 0.00762841819299898, + "grad_norm": 8.198512077331543, + "learning_rate": 1.5224159402241594e-07, + "loss": 1.1591, + "step": 490 + }, + { + "epoch": 0.0077062591949683575, + "grad_norm": 6.289046287536621, + "learning_rate": 1.5379825653798256e-07, + "loss": 1.0259, + "step": 495 + }, + { + "epoch": 0.007784100196937735, + "grad_norm": 18.078012466430664, + "learning_rate": 1.5535491905354919e-07, + "loss": 1.1059, + "step": 500 + }, + { + "epoch": 0.007861941198907112, + "grad_norm": 8.508400917053223, + "learning_rate": 1.569115815691158e-07, + "loss": 1.001, + "step": 505 + }, + { + "epoch": 0.00793978220087649, + "grad_norm": 6.552981853485107, + "learning_rate": 1.5846824408468243e-07, + "loss": 1.0819, + "step": 510 + }, + { + "epoch": 0.008017623202845867, + "grad_norm": 5.941412925720215, + "learning_rate": 1.6002490660024906e-07, + "loss": 1.0143, + "step": 515 + }, + { + "epoch": 0.008095464204815245, + "grad_norm": 10.764496803283691, + "learning_rate": 1.6158156911581568e-07, + "loss": 1.0228, + "step": 520 + }, + { + "epoch": 0.008173305206784621, + "grad_norm": 5.186371326446533, + "learning_rate": 1.6313823163138233e-07, + "loss": 0.9794, + "step": 525 + }, + { + "epoch": 0.008251146208754, + "grad_norm": 11.401899337768555, + "learning_rate": 1.6469489414694893e-07, + "loss": 1.0773, + "step": 530 + }, + { + "epoch": 0.008328987210723376, + "grad_norm": 5.4313788414001465, + "learning_rate": 1.6625155666251556e-07, + "loss": 0.9984, + "step": 535 + }, + { + "epoch": 0.008406828212692754, + "grad_norm": 7.18859338760376, + "learning_rate": 1.6780821917808218e-07, + "loss": 1.0894, + "step": 540 + }, + { + "epoch": 0.00848466921466213, + "grad_norm": 5.814337253570557, + "learning_rate": 1.6936488169364883e-07, + "loss": 0.983, + "step": 545 + }, + { + "epoch": 0.008562510216631508, + "grad_norm": 11.842198371887207, + "learning_rate": 1.7092154420921543e-07, + "loss": 0.994, + "step": 550 + }, + { + "epoch": 0.008640351218600887, + "grad_norm": 10.12619400024414, + "learning_rate": 1.7247820672478206e-07, + "loss": 1.015, + "step": 555 + }, + { + "epoch": 0.008718192220570263, + "grad_norm": 7.895757675170898, + "learning_rate": 1.7403486924034868e-07, + "loss": 1.1194, + "step": 560 + }, + { + "epoch": 0.008796033222539641, + "grad_norm": 5.340054512023926, + "learning_rate": 1.755915317559153e-07, + "loss": 1.0283, + "step": 565 + }, + { + "epoch": 0.008873874224509018, + "grad_norm": 13.950590133666992, + "learning_rate": 1.7714819427148193e-07, + "loss": 1.155, + "step": 570 + }, + { + "epoch": 0.008951715226478396, + "grad_norm": 10.90434741973877, + "learning_rate": 1.7870485678704855e-07, + "loss": 1.0135, + "step": 575 + }, + { + "epoch": 0.009029556228447772, + "grad_norm": 4.94070291519165, + "learning_rate": 1.8026151930261518e-07, + "loss": 0.9843, + "step": 580 + }, + { + "epoch": 0.00910739723041715, + "grad_norm": 9.50981616973877, + "learning_rate": 1.818181818181818e-07, + "loss": 1.0777, + "step": 585 + }, + { + "epoch": 0.009185238232386527, + "grad_norm": 9.218316078186035, + "learning_rate": 1.8337484433374845e-07, + "loss": 1.1192, + "step": 590 + }, + { + "epoch": 0.009263079234355905, + "grad_norm": 17.782791137695312, + "learning_rate": 1.8493150684931505e-07, + "loss": 1.0513, + "step": 595 + }, + { + "epoch": 0.009340920236325281, + "grad_norm": 5.774691581726074, + "learning_rate": 1.8648816936488168e-07, + "loss": 0.872, + "step": 600 + }, + { + "epoch": 0.00941876123829466, + "grad_norm": 6.310098171234131, + "learning_rate": 1.880448318804483e-07, + "loss": 0.9395, + "step": 605 + }, + { + "epoch": 0.009496602240264036, + "grad_norm": 6.68503999710083, + "learning_rate": 1.8960149439601495e-07, + "loss": 1.0394, + "step": 610 + }, + { + "epoch": 0.009574443242233414, + "grad_norm": 6.972198486328125, + "learning_rate": 1.9115815691158155e-07, + "loss": 1.066, + "step": 615 + }, + { + "epoch": 0.009652284244202792, + "grad_norm": 6.581061363220215, + "learning_rate": 1.9271481942714817e-07, + "loss": 0.9428, + "step": 620 + }, + { + "epoch": 0.009730125246172169, + "grad_norm": 10.010781288146973, + "learning_rate": 1.942714819427148e-07, + "loss": 1.097, + "step": 625 + }, + { + "epoch": 0.009807966248141547, + "grad_norm": 10.270834922790527, + "learning_rate": 1.9582814445828145e-07, + "loss": 1.0373, + "step": 630 + }, + { + "epoch": 0.009885807250110923, + "grad_norm": 7.189127445220947, + "learning_rate": 1.9738480697384807e-07, + "loss": 1.2356, + "step": 635 + }, + { + "epoch": 0.009963648252080301, + "grad_norm": 11.2526216506958, + "learning_rate": 1.9894146948941467e-07, + "loss": 0.8422, + "step": 640 + }, + { + "epoch": 0.010041489254049678, + "grad_norm": 5.1716203689575195, + "learning_rate": 2.004981320049813e-07, + "loss": 1.0132, + "step": 645 + }, + { + "epoch": 0.010119330256019056, + "grad_norm": 4.592648983001709, + "learning_rate": 2.0205479452054795e-07, + "loss": 1.0677, + "step": 650 + }, + { + "epoch": 0.010197171257988432, + "grad_norm": 4.74710750579834, + "learning_rate": 2.0361145703611457e-07, + "loss": 0.9573, + "step": 655 + }, + { + "epoch": 0.01027501225995781, + "grad_norm": 5.075165748596191, + "learning_rate": 2.0516811955168117e-07, + "loss": 0.919, + "step": 660 + }, + { + "epoch": 0.010352853261927187, + "grad_norm": 5.705000400543213, + "learning_rate": 2.067247820672478e-07, + "loss": 0.9722, + "step": 665 + }, + { + "epoch": 0.010430694263896565, + "grad_norm": 8.337606430053711, + "learning_rate": 2.0828144458281445e-07, + "loss": 1.0511, + "step": 670 + }, + { + "epoch": 0.010508535265865942, + "grad_norm": 7.93868350982666, + "learning_rate": 2.0983810709838107e-07, + "loss": 1.0511, + "step": 675 + }, + { + "epoch": 0.01058637626783532, + "grad_norm": 7.5352325439453125, + "learning_rate": 2.113947696139477e-07, + "loss": 1.0529, + "step": 680 + }, + { + "epoch": 0.010664217269804696, + "grad_norm": 12.067502975463867, + "learning_rate": 2.129514321295143e-07, + "loss": 0.9818, + "step": 685 + }, + { + "epoch": 0.010742058271774074, + "grad_norm": 4.793339729309082, + "learning_rate": 2.1450809464508094e-07, + "loss": 0.9757, + "step": 690 + }, + { + "epoch": 0.010819899273743452, + "grad_norm": 5.648492336273193, + "learning_rate": 2.1606475716064757e-07, + "loss": 0.9838, + "step": 695 + }, + { + "epoch": 0.010897740275712829, + "grad_norm": 14.50791072845459, + "learning_rate": 2.176214196762142e-07, + "loss": 1.025, + "step": 700 + }, + { + "epoch": 0.010975581277682207, + "grad_norm": 6.976552486419678, + "learning_rate": 2.191780821917808e-07, + "loss": 1.0636, + "step": 705 + }, + { + "epoch": 0.011053422279651583, + "grad_norm": 8.440703392028809, + "learning_rate": 2.2073474470734744e-07, + "loss": 0.8979, + "step": 710 + }, + { + "epoch": 0.011131263281620962, + "grad_norm": 17.822824478149414, + "learning_rate": 2.2229140722291407e-07, + "loss": 1.126, + "step": 715 + }, + { + "epoch": 0.011209104283590338, + "grad_norm": 3.6384825706481934, + "learning_rate": 2.238480697384807e-07, + "loss": 0.9756, + "step": 720 + }, + { + "epoch": 0.011286945285559716, + "grad_norm": 9.758706092834473, + "learning_rate": 2.2540473225404732e-07, + "loss": 1.0776, + "step": 725 + }, + { + "epoch": 0.011364786287529093, + "grad_norm": 6.821314334869385, + "learning_rate": 2.2696139476961394e-07, + "loss": 1.0809, + "step": 730 + }, + { + "epoch": 0.01144262728949847, + "grad_norm": 5.796785831451416, + "learning_rate": 2.2851805728518056e-07, + "loss": 1.0388, + "step": 735 + }, + { + "epoch": 0.011520468291467847, + "grad_norm": 14.487456321716309, + "learning_rate": 2.300747198007472e-07, + "loss": 0.9906, + "step": 740 + }, + { + "epoch": 0.011598309293437225, + "grad_norm": 5.587100505828857, + "learning_rate": 2.3163138231631381e-07, + "loss": 0.955, + "step": 745 + }, + { + "epoch": 0.011676150295406602, + "grad_norm": 5.029387474060059, + "learning_rate": 2.3318804483188044e-07, + "loss": 0.9509, + "step": 750 + }, + { + "epoch": 0.01175399129737598, + "grad_norm": 16.782621383666992, + "learning_rate": 2.3474470734744706e-07, + "loss": 1.0278, + "step": 755 + }, + { + "epoch": 0.011831832299345358, + "grad_norm": 8.211995124816895, + "learning_rate": 2.363013698630137e-07, + "loss": 1.1128, + "step": 760 + }, + { + "epoch": 0.011909673301314734, + "grad_norm": 8.179312705993652, + "learning_rate": 2.378580323785803e-07, + "loss": 0.9175, + "step": 765 + }, + { + "epoch": 0.011987514303284113, + "grad_norm": 3.8183233737945557, + "learning_rate": 2.3941469489414696e-07, + "loss": 1.0116, + "step": 770 + }, + { + "epoch": 0.012065355305253489, + "grad_norm": 11.87375545501709, + "learning_rate": 2.4097135740971356e-07, + "loss": 0.975, + "step": 775 + }, + { + "epoch": 0.012143196307222867, + "grad_norm": 20.000045776367188, + "learning_rate": 2.425280199252802e-07, + "loss": 1.0561, + "step": 780 + }, + { + "epoch": 0.012221037309192244, + "grad_norm": 4.025638103485107, + "learning_rate": 2.440846824408468e-07, + "loss": 0.977, + "step": 785 + }, + { + "epoch": 0.012298878311161622, + "grad_norm": 8.214958190917969, + "learning_rate": 2.4564134495641346e-07, + "loss": 0.9728, + "step": 790 + }, + { + "epoch": 0.012376719313130998, + "grad_norm": 13.562061309814453, + "learning_rate": 2.4719800747198006e-07, + "loss": 0.8595, + "step": 795 + }, + { + "epoch": 0.012454560315100376, + "grad_norm": 4.473455905914307, + "learning_rate": 2.4875466998754666e-07, + "loss": 0.8972, + "step": 800 + }, + { + "epoch": 0.012532401317069753, + "grad_norm": 5.311202049255371, + "learning_rate": 2.503113325031133e-07, + "loss": 0.8578, + "step": 805 + }, + { + "epoch": 0.01261024231903913, + "grad_norm": 11.063155174255371, + "learning_rate": 2.5186799501867996e-07, + "loss": 0.9938, + "step": 810 + }, + { + "epoch": 0.012688083321008507, + "grad_norm": 7.260047435760498, + "learning_rate": 2.5342465753424656e-07, + "loss": 0.9745, + "step": 815 + }, + { + "epoch": 0.012765924322977885, + "grad_norm": 5.3101067543029785, + "learning_rate": 2.549813200498132e-07, + "loss": 1.0001, + "step": 820 + }, + { + "epoch": 0.012843765324947264, + "grad_norm": 4.430516719818115, + "learning_rate": 2.5653798256537986e-07, + "loss": 0.9283, + "step": 825 + }, + { + "epoch": 0.01292160632691664, + "grad_norm": 4.081624507904053, + "learning_rate": 2.580946450809464e-07, + "loss": 0.9694, + "step": 830 + }, + { + "epoch": 0.012999447328886018, + "grad_norm": 6.339404106140137, + "learning_rate": 2.5965130759651306e-07, + "loss": 1.0485, + "step": 835 + }, + { + "epoch": 0.013077288330855395, + "grad_norm": 8.773398399353027, + "learning_rate": 2.6120797011207965e-07, + "loss": 0.8494, + "step": 840 + }, + { + "epoch": 0.013155129332824773, + "grad_norm": 9.235841751098633, + "learning_rate": 2.627646326276463e-07, + "loss": 0.9223, + "step": 845 + }, + { + "epoch": 0.01323297033479415, + "grad_norm": 5.350943565368652, + "learning_rate": 2.6432129514321296e-07, + "loss": 1.0852, + "step": 850 + }, + { + "epoch": 0.013310811336763527, + "grad_norm": 5.6170268058776855, + "learning_rate": 2.6587795765877955e-07, + "loss": 0.9529, + "step": 855 + }, + { + "epoch": 0.013388652338732904, + "grad_norm": 6.033858776092529, + "learning_rate": 2.674346201743462e-07, + "loss": 0.9264, + "step": 860 + }, + { + "epoch": 0.013466493340702282, + "grad_norm": 10.408087730407715, + "learning_rate": 2.6899128268991286e-07, + "loss": 0.9435, + "step": 865 + }, + { + "epoch": 0.013544334342671658, + "grad_norm": 3.902411460876465, + "learning_rate": 2.7054794520547945e-07, + "loss": 1.0717, + "step": 870 + }, + { + "epoch": 0.013622175344641036, + "grad_norm": 6.315438270568848, + "learning_rate": 2.7210460772104605e-07, + "loss": 1.0222, + "step": 875 + }, + { + "epoch": 0.013700016346610413, + "grad_norm": 9.6283540725708, + "learning_rate": 2.7366127023661265e-07, + "loss": 0.9607, + "step": 880 + }, + { + "epoch": 0.013777857348579791, + "grad_norm": 8.017468452453613, + "learning_rate": 2.752179327521793e-07, + "loss": 0.908, + "step": 885 + }, + { + "epoch": 0.013855698350549167, + "grad_norm": 4.06109094619751, + "learning_rate": 2.7677459526774595e-07, + "loss": 0.9779, + "step": 890 + }, + { + "epoch": 0.013933539352518546, + "grad_norm": 4.540249347686768, + "learning_rate": 2.7833125778331255e-07, + "loss": 1.0685, + "step": 895 + }, + { + "epoch": 0.014011380354487924, + "grad_norm": 5.971028804779053, + "learning_rate": 2.798879202988792e-07, + "loss": 1.0487, + "step": 900 + }, + { + "epoch": 0.0140892213564573, + "grad_norm": 7.365455150604248, + "learning_rate": 2.8144458281444585e-07, + "loss": 1.0255, + "step": 905 + }, + { + "epoch": 0.014167062358426678, + "grad_norm": 5.49646520614624, + "learning_rate": 2.8300124533001245e-07, + "loss": 1.0066, + "step": 910 + }, + { + "epoch": 0.014244903360396055, + "grad_norm": 5.0211615562438965, + "learning_rate": 2.845579078455791e-07, + "loss": 0.9018, + "step": 915 + }, + { + "epoch": 0.014322744362365433, + "grad_norm": 3.7670419216156006, + "learning_rate": 2.8611457036114565e-07, + "loss": 1.0793, + "step": 920 + }, + { + "epoch": 0.01440058536433481, + "grad_norm": 10.098974227905273, + "learning_rate": 2.876712328767123e-07, + "loss": 1.0537, + "step": 925 + }, + { + "epoch": 0.014478426366304187, + "grad_norm": 8.83332633972168, + "learning_rate": 2.8922789539227895e-07, + "loss": 0.9274, + "step": 930 + }, + { + "epoch": 0.014556267368273564, + "grad_norm": 13.259550094604492, + "learning_rate": 2.9078455790784555e-07, + "loss": 0.9684, + "step": 935 + }, + { + "epoch": 0.014634108370242942, + "grad_norm": 9.241827964782715, + "learning_rate": 2.923412204234122e-07, + "loss": 1.031, + "step": 940 + }, + { + "epoch": 0.014711949372212318, + "grad_norm": 7.292890548706055, + "learning_rate": 2.9389788293897885e-07, + "loss": 0.9006, + "step": 945 + }, + { + "epoch": 0.014789790374181697, + "grad_norm": 4.794684886932373, + "learning_rate": 2.9545454545454545e-07, + "loss": 1.0308, + "step": 950 + }, + { + "epoch": 0.014867631376151073, + "grad_norm": 4.3201518058776855, + "learning_rate": 2.970112079701121e-07, + "loss": 0.9706, + "step": 955 + }, + { + "epoch": 0.014945472378120451, + "grad_norm": 3.5388669967651367, + "learning_rate": 2.985678704856787e-07, + "loss": 0.9782, + "step": 960 + }, + { + "epoch": 0.01502331338008983, + "grad_norm": 10.980652809143066, + "learning_rate": 3.001245330012453e-07, + "loss": 1.1015, + "step": 965 + }, + { + "epoch": 0.015101154382059206, + "grad_norm": 7.639592170715332, + "learning_rate": 3.0168119551681194e-07, + "loss": 0.9046, + "step": 970 + }, + { + "epoch": 0.015178995384028584, + "grad_norm": 5.50681734085083, + "learning_rate": 3.0323785803237854e-07, + "loss": 1.0192, + "step": 975 + }, + { + "epoch": 0.01525683638599796, + "grad_norm": 4.924655437469482, + "learning_rate": 3.047945205479452e-07, + "loss": 1.0545, + "step": 980 + }, + { + "epoch": 0.015334677387967338, + "grad_norm": 6.294414043426514, + "learning_rate": 3.0635118306351184e-07, + "loss": 0.9807, + "step": 985 + }, + { + "epoch": 0.015412518389936715, + "grad_norm": 4.609034538269043, + "learning_rate": 3.0790784557907844e-07, + "loss": 1.0205, + "step": 990 + }, + { + "epoch": 0.015490359391906093, + "grad_norm": 3.4544599056243896, + "learning_rate": 3.094645080946451e-07, + "loss": 0.9761, + "step": 995 + }, + { + "epoch": 0.01556820039387547, + "grad_norm": 5.186591148376465, + "learning_rate": 3.110211706102117e-07, + "loss": 0.9933, + "step": 1000 + }, + { + "epoch": 0.015646041395844846, + "grad_norm": 4.516424179077148, + "learning_rate": 3.125778331257783e-07, + "loss": 0.9057, + "step": 1005 + }, + { + "epoch": 0.015723882397814224, + "grad_norm": 4.458924293518066, + "learning_rate": 3.1413449564134494e-07, + "loss": 0.9526, + "step": 1010 + }, + { + "epoch": 0.015801723399783602, + "grad_norm": 5.840490341186523, + "learning_rate": 3.1569115815691154e-07, + "loss": 1.0032, + "step": 1015 + }, + { + "epoch": 0.01587956440175298, + "grad_norm": 13.803277015686035, + "learning_rate": 3.172478206724782e-07, + "loss": 1.0162, + "step": 1020 + }, + { + "epoch": 0.015957405403722355, + "grad_norm": 3.742831230163574, + "learning_rate": 3.1880448318804484e-07, + "loss": 0.9714, + "step": 1025 + }, + { + "epoch": 0.016035246405691733, + "grad_norm": 5.748800277709961, + "learning_rate": 3.2036114570361144e-07, + "loss": 0.9545, + "step": 1030 + }, + { + "epoch": 0.01611308740766111, + "grad_norm": 4.5021491050720215, + "learning_rate": 3.219178082191781e-07, + "loss": 0.9503, + "step": 1035 + }, + { + "epoch": 0.01619092840963049, + "grad_norm": 6.095613956451416, + "learning_rate": 3.234744707347447e-07, + "loss": 0.991, + "step": 1040 + }, + { + "epoch": 0.016268769411599868, + "grad_norm": 4.993571758270264, + "learning_rate": 3.2503113325031134e-07, + "loss": 0.9221, + "step": 1045 + }, + { + "epoch": 0.016346610413569242, + "grad_norm": 5.949316501617432, + "learning_rate": 3.2658779576587794e-07, + "loss": 0.9897, + "step": 1050 + }, + { + "epoch": 0.01642445141553862, + "grad_norm": 5.225283622741699, + "learning_rate": 3.2814445828144453e-07, + "loss": 0.9719, + "step": 1055 + }, + { + "epoch": 0.016502292417508, + "grad_norm": 15.378800392150879, + "learning_rate": 3.297011207970112e-07, + "loss": 0.8633, + "step": 1060 + }, + { + "epoch": 0.016580133419477377, + "grad_norm": 4.347599506378174, + "learning_rate": 3.312577833125778e-07, + "loss": 0.9054, + "step": 1065 + }, + { + "epoch": 0.01665797442144675, + "grad_norm": 5.208911895751953, + "learning_rate": 3.3281444582814443e-07, + "loss": 0.899, + "step": 1070 + }, + { + "epoch": 0.01673581542341613, + "grad_norm": 6.316863059997559, + "learning_rate": 3.343711083437111e-07, + "loss": 1.0165, + "step": 1075 + }, + { + "epoch": 0.016813656425385508, + "grad_norm": 5.477814197540283, + "learning_rate": 3.359277708592777e-07, + "loss": 0.96, + "step": 1080 + }, + { + "epoch": 0.016891497427354886, + "grad_norm": 4.848371505737305, + "learning_rate": 3.3748443337484433e-07, + "loss": 0.9703, + "step": 1085 + }, + { + "epoch": 0.01696933842932426, + "grad_norm": 9.025872230529785, + "learning_rate": 3.39041095890411e-07, + "loss": 0.9239, + "step": 1090 + }, + { + "epoch": 0.01704717943129364, + "grad_norm": 3.3916220664978027, + "learning_rate": 3.4059775840597753e-07, + "loss": 1.0136, + "step": 1095 + }, + { + "epoch": 0.017125020433263017, + "grad_norm": 9.25607967376709, + "learning_rate": 3.421544209215442e-07, + "loss": 0.9626, + "step": 1100 + }, + { + "epoch": 0.017202861435232395, + "grad_norm": 7.245452880859375, + "learning_rate": 3.437110834371108e-07, + "loss": 1.0026, + "step": 1105 + }, + { + "epoch": 0.017280702437201773, + "grad_norm": 3.3463306427001953, + "learning_rate": 3.4526774595267743e-07, + "loss": 0.9243, + "step": 1110 + }, + { + "epoch": 0.017358543439171148, + "grad_norm": 5.334697723388672, + "learning_rate": 3.468244084682441e-07, + "loss": 1.1001, + "step": 1115 + }, + { + "epoch": 0.017436384441140526, + "grad_norm": 4.7469305992126465, + "learning_rate": 3.483810709838107e-07, + "loss": 1.0421, + "step": 1120 + }, + { + "epoch": 0.017514225443109904, + "grad_norm": 4.398116111755371, + "learning_rate": 3.4993773349937733e-07, + "loss": 0.9502, + "step": 1125 + }, + { + "epoch": 0.017592066445079282, + "grad_norm": 3.972031831741333, + "learning_rate": 3.51494396014944e-07, + "loss": 0.8974, + "step": 1130 + }, + { + "epoch": 0.017669907447048657, + "grad_norm": 5.13526725769043, + "learning_rate": 3.530510585305106e-07, + "loss": 0.9668, + "step": 1135 + }, + { + "epoch": 0.017747748449018035, + "grad_norm": 3.752171754837036, + "learning_rate": 3.546077210460772e-07, + "loss": 1.0589, + "step": 1140 + }, + { + "epoch": 0.017825589450987413, + "grad_norm": 6.005197048187256, + "learning_rate": 3.561643835616438e-07, + "loss": 0.9786, + "step": 1145 + }, + { + "epoch": 0.01790343045295679, + "grad_norm": 5.12382173538208, + "learning_rate": 3.5772104607721043e-07, + "loss": 0.936, + "step": 1150 + }, + { + "epoch": 0.017981271454926166, + "grad_norm": 7.456275939941406, + "learning_rate": 3.592777085927771e-07, + "loss": 0.9261, + "step": 1155 + }, + { + "epoch": 0.018059112456895544, + "grad_norm": 3.7287797927856445, + "learning_rate": 3.608343711083437e-07, + "loss": 0.9291, + "step": 1160 + }, + { + "epoch": 0.018136953458864923, + "grad_norm": 3.916651725769043, + "learning_rate": 3.6239103362391033e-07, + "loss": 0.9193, + "step": 1165 + }, + { + "epoch": 0.0182147944608343, + "grad_norm": 4.2813720703125, + "learning_rate": 3.63947696139477e-07, + "loss": 1.0079, + "step": 1170 + }, + { + "epoch": 0.01829263546280368, + "grad_norm": 8.352608680725098, + "learning_rate": 3.655043586550436e-07, + "loss": 0.9901, + "step": 1175 + }, + { + "epoch": 0.018370476464773054, + "grad_norm": 5.297429084777832, + "learning_rate": 3.6706102117061023e-07, + "loss": 0.9049, + "step": 1180 + }, + { + "epoch": 0.01844831746674243, + "grad_norm": 4.064713478088379, + "learning_rate": 3.6861768368617677e-07, + "loss": 1.1307, + "step": 1185 + }, + { + "epoch": 0.01852615846871181, + "grad_norm": 6.08450174331665, + "learning_rate": 3.701743462017434e-07, + "loss": 1.0267, + "step": 1190 + }, + { + "epoch": 0.018603999470681188, + "grad_norm": 4.351869106292725, + "learning_rate": 3.717310087173101e-07, + "loss": 1.0315, + "step": 1195 + }, + { + "epoch": 0.018681840472650563, + "grad_norm": 7.120603084564209, + "learning_rate": 3.7328767123287667e-07, + "loss": 1.0092, + "step": 1200 + }, + { + "epoch": 0.01875968147461994, + "grad_norm": 4.8134660720825195, + "learning_rate": 3.748443337484433e-07, + "loss": 0.9491, + "step": 1205 + }, + { + "epoch": 0.01883752247658932, + "grad_norm": 5.852837085723877, + "learning_rate": 3.7640099626401e-07, + "loss": 1.0029, + "step": 1210 + }, + { + "epoch": 0.018915363478558697, + "grad_norm": 5.291375160217285, + "learning_rate": 3.7795765877957657e-07, + "loss": 0.8198, + "step": 1215 + }, + { + "epoch": 0.018993204480528072, + "grad_norm": 3.2667717933654785, + "learning_rate": 3.795143212951432e-07, + "loss": 0.9155, + "step": 1220 + }, + { + "epoch": 0.01907104548249745, + "grad_norm": 4.952467918395996, + "learning_rate": 3.810709838107098e-07, + "loss": 0.9144, + "step": 1225 + }, + { + "epoch": 0.019148886484466828, + "grad_norm": 4.495504379272461, + "learning_rate": 3.826276463262764e-07, + "loss": 0.9236, + "step": 1230 + }, + { + "epoch": 0.019226727486436206, + "grad_norm": 5.554149627685547, + "learning_rate": 3.8418430884184307e-07, + "loss": 0.7856, + "step": 1235 + }, + { + "epoch": 0.019304568488405584, + "grad_norm": 6.092937469482422, + "learning_rate": 3.8574097135740967e-07, + "loss": 0.8681, + "step": 1240 + }, + { + "epoch": 0.01938240949037496, + "grad_norm": 3.9643170833587646, + "learning_rate": 3.872976338729763e-07, + "loss": 0.836, + "step": 1245 + }, + { + "epoch": 0.019460250492344337, + "grad_norm": 3.9617724418640137, + "learning_rate": 3.8885429638854297e-07, + "loss": 1.015, + "step": 1250 + }, + { + "epoch": 0.019538091494313715, + "grad_norm": 8.572834014892578, + "learning_rate": 3.9041095890410957e-07, + "loss": 0.9308, + "step": 1255 + }, + { + "epoch": 0.019615932496283094, + "grad_norm": 6.380552291870117, + "learning_rate": 3.919676214196762e-07, + "loss": 0.9056, + "step": 1260 + }, + { + "epoch": 0.01969377349825247, + "grad_norm": 5.703736782073975, + "learning_rate": 3.935242839352428e-07, + "loss": 0.9864, + "step": 1265 + }, + { + "epoch": 0.019771614500221846, + "grad_norm": 4.2661824226379395, + "learning_rate": 3.9508094645080947e-07, + "loss": 0.8801, + "step": 1270 + }, + { + "epoch": 0.019849455502191225, + "grad_norm": 3.4654171466827393, + "learning_rate": 3.9663760896637607e-07, + "loss": 0.8954, + "step": 1275 + }, + { + "epoch": 0.019927296504160603, + "grad_norm": 5.910457611083984, + "learning_rate": 3.9819427148194266e-07, + "loss": 0.8456, + "step": 1280 + }, + { + "epoch": 0.020005137506129977, + "grad_norm": 6.345880031585693, + "learning_rate": 3.997509339975093e-07, + "loss": 0.9825, + "step": 1285 + }, + { + "epoch": 0.020082978508099356, + "grad_norm": 11.178544044494629, + "learning_rate": 4.0130759651307597e-07, + "loss": 0.9716, + "step": 1290 + }, + { + "epoch": 0.020160819510068734, + "grad_norm": 3.9438936710357666, + "learning_rate": 4.0286425902864256e-07, + "loss": 0.9629, + "step": 1295 + }, + { + "epoch": 0.020238660512038112, + "grad_norm": 7.4510273933410645, + "learning_rate": 4.044209215442092e-07, + "loss": 0.9815, + "step": 1300 + }, + { + "epoch": 0.020316501514007487, + "grad_norm": 6.15594482421875, + "learning_rate": 4.059775840597758e-07, + "loss": 0.9805, + "step": 1305 + }, + { + "epoch": 0.020394342515976865, + "grad_norm": 5.105663776397705, + "learning_rate": 4.0753424657534246e-07, + "loss": 1.0047, + "step": 1310 + }, + { + "epoch": 0.020472183517946243, + "grad_norm": 4.2579779624938965, + "learning_rate": 4.090909090909091e-07, + "loss": 0.797, + "step": 1315 + }, + { + "epoch": 0.02055002451991562, + "grad_norm": 3.6263747215270996, + "learning_rate": 4.1064757160647566e-07, + "loss": 0.9526, + "step": 1320 + }, + { + "epoch": 0.020627865521885, + "grad_norm": 4.003891944885254, + "learning_rate": 4.122042341220423e-07, + "loss": 0.862, + "step": 1325 + }, + { + "epoch": 0.020705706523854374, + "grad_norm": 4.833682060241699, + "learning_rate": 4.137608966376089e-07, + "loss": 1.0438, + "step": 1330 + }, + { + "epoch": 0.020783547525823752, + "grad_norm": 8.875425338745117, + "learning_rate": 4.1531755915317556e-07, + "loss": 1.0013, + "step": 1335 + }, + { + "epoch": 0.02086138852779313, + "grad_norm": 5.356649398803711, + "learning_rate": 4.168742216687422e-07, + "loss": 0.8956, + "step": 1340 + }, + { + "epoch": 0.02093922952976251, + "grad_norm": 5.640366554260254, + "learning_rate": 4.184308841843088e-07, + "loss": 0.8346, + "step": 1345 + }, + { + "epoch": 0.021017070531731883, + "grad_norm": 3.717663288116455, + "learning_rate": 4.1998754669987546e-07, + "loss": 0.8512, + "step": 1350 + }, + { + "epoch": 0.02109491153370126, + "grad_norm": 3.557542324066162, + "learning_rate": 4.215442092154421e-07, + "loss": 0.9313, + "step": 1355 + }, + { + "epoch": 0.02117275253567064, + "grad_norm": 5.178566932678223, + "learning_rate": 4.231008717310087e-07, + "loss": 0.9086, + "step": 1360 + }, + { + "epoch": 0.021250593537640017, + "grad_norm": 5.773383140563965, + "learning_rate": 4.246575342465753e-07, + "loss": 0.9678, + "step": 1365 + }, + { + "epoch": 0.021328434539609392, + "grad_norm": 4.725634574890137, + "learning_rate": 4.262141967621419e-07, + "loss": 0.9356, + "step": 1370 + }, + { + "epoch": 0.02140627554157877, + "grad_norm": 3.0198757648468018, + "learning_rate": 4.2777085927770856e-07, + "loss": 0.9342, + "step": 1375 + }, + { + "epoch": 0.02148411654354815, + "grad_norm": 5.704006195068359, + "learning_rate": 4.293275217932752e-07, + "loss": 1.0469, + "step": 1380 + }, + { + "epoch": 0.021561957545517527, + "grad_norm": 4.559571743011475, + "learning_rate": 4.308841843088418e-07, + "loss": 0.845, + "step": 1385 + }, + { + "epoch": 0.021639798547486905, + "grad_norm": 9.018213272094727, + "learning_rate": 4.3244084682440846e-07, + "loss": 0.9221, + "step": 1390 + }, + { + "epoch": 0.02171763954945628, + "grad_norm": 6.414641380310059, + "learning_rate": 4.339975093399751e-07, + "loss": 1.0877, + "step": 1395 + }, + { + "epoch": 0.021795480551425658, + "grad_norm": 4.217600345611572, + "learning_rate": 4.355541718555417e-07, + "loss": 0.8761, + "step": 1400 + }, + { + "epoch": 0.021873321553395036, + "grad_norm": 5.274855136871338, + "learning_rate": 4.3711083437110836e-07, + "loss": 0.9046, + "step": 1405 + }, + { + "epoch": 0.021951162555364414, + "grad_norm": 11.607494354248047, + "learning_rate": 4.386674968866749e-07, + "loss": 0.981, + "step": 1410 + }, + { + "epoch": 0.02202900355733379, + "grad_norm": 5.442785263061523, + "learning_rate": 4.4022415940224155e-07, + "loss": 0.9357, + "step": 1415 + }, + { + "epoch": 0.022106844559303167, + "grad_norm": 4.934208869934082, + "learning_rate": 4.417808219178082e-07, + "loss": 0.9124, + "step": 1420 + }, + { + "epoch": 0.022184685561272545, + "grad_norm": 5.2812933921813965, + "learning_rate": 4.433374844333748e-07, + "loss": 0.9655, + "step": 1425 + }, + { + "epoch": 0.022262526563241923, + "grad_norm": 3.1578216552734375, + "learning_rate": 4.4489414694894145e-07, + "loss": 0.9452, + "step": 1430 + }, + { + "epoch": 0.022340367565211298, + "grad_norm": 10.148691177368164, + "learning_rate": 4.464508094645081e-07, + "loss": 0.9015, + "step": 1435 + }, + { + "epoch": 0.022418208567180676, + "grad_norm": 3.150479793548584, + "learning_rate": 4.480074719800747e-07, + "loss": 0.8744, + "step": 1440 + }, + { + "epoch": 0.022496049569150054, + "grad_norm": 5.963056564331055, + "learning_rate": 4.4956413449564135e-07, + "loss": 1.0312, + "step": 1445 + }, + { + "epoch": 0.022573890571119432, + "grad_norm": 5.098721981048584, + "learning_rate": 4.5112079701120795e-07, + "loss": 0.8031, + "step": 1450 + }, + { + "epoch": 0.02265173157308881, + "grad_norm": 5.2625017166137695, + "learning_rate": 4.5267745952677455e-07, + "loss": 0.8132, + "step": 1455 + }, + { + "epoch": 0.022729572575058185, + "grad_norm": 8.537793159484863, + "learning_rate": 4.542341220423412e-07, + "loss": 0.8296, + "step": 1460 + }, + { + "epoch": 0.022807413577027563, + "grad_norm": 6.819812774658203, + "learning_rate": 4.557907845579078e-07, + "loss": 0.9476, + "step": 1465 + }, + { + "epoch": 0.02288525457899694, + "grad_norm": 4.941056251525879, + "learning_rate": 4.5734744707347445e-07, + "loss": 0.8785, + "step": 1470 + }, + { + "epoch": 0.02296309558096632, + "grad_norm": 5.378219127655029, + "learning_rate": 4.589041095890411e-07, + "loss": 0.8987, + "step": 1475 + }, + { + "epoch": 0.023040936582935694, + "grad_norm": 4.793314456939697, + "learning_rate": 4.604607721046077e-07, + "loss": 0.8309, + "step": 1480 + }, + { + "epoch": 0.023118777584905072, + "grad_norm": 7.7251434326171875, + "learning_rate": 4.6201743462017435e-07, + "loss": 0.959, + "step": 1485 + }, + { + "epoch": 0.02319661858687445, + "grad_norm": 3.7208149433135986, + "learning_rate": 4.6357409713574095e-07, + "loss": 0.9126, + "step": 1490 + }, + { + "epoch": 0.02327445958884383, + "grad_norm": 4.322316646575928, + "learning_rate": 4.651307596513076e-07, + "loss": 0.9567, + "step": 1495 + }, + { + "epoch": 0.023352300590813203, + "grad_norm": 5.451142311096191, + "learning_rate": 4.666874221668742e-07, + "loss": 0.9943, + "step": 1500 + }, + { + "epoch": 0.02343014159278258, + "grad_norm": 6.478999614715576, + "learning_rate": 4.682440846824408e-07, + "loss": 0.8577, + "step": 1505 + }, + { + "epoch": 0.02350798259475196, + "grad_norm": 5.626023292541504, + "learning_rate": 4.6980074719800745e-07, + "loss": 0.9176, + "step": 1510 + }, + { + "epoch": 0.023585823596721338, + "grad_norm": 9.153360366821289, + "learning_rate": 4.713574097135741e-07, + "loss": 1.0269, + "step": 1515 + }, + { + "epoch": 0.023663664598690716, + "grad_norm": 11.129598617553711, + "learning_rate": 4.729140722291407e-07, + "loss": 0.9428, + "step": 1520 + }, + { + "epoch": 0.02374150560066009, + "grad_norm": 5.8177313804626465, + "learning_rate": 4.7447073474470735e-07, + "loss": 1.0107, + "step": 1525 + }, + { + "epoch": 0.02381934660262947, + "grad_norm": 6.537820816040039, + "learning_rate": 4.7602739726027394e-07, + "loss": 0.8104, + "step": 1530 + }, + { + "epoch": 0.023897187604598847, + "grad_norm": 4.420594692230225, + "learning_rate": 4.775840597758406e-07, + "loss": 0.8484, + "step": 1535 + }, + { + "epoch": 0.023975028606568225, + "grad_norm": 6.306564807891846, + "learning_rate": 4.791407222914072e-07, + "loss": 0.796, + "step": 1540 + }, + { + "epoch": 0.0240528696085376, + "grad_norm": 11.836288452148438, + "learning_rate": 4.806973848069738e-07, + "loss": 0.8949, + "step": 1545 + }, + { + "epoch": 0.024130710610506978, + "grad_norm": 4.565202713012695, + "learning_rate": 4.822540473225404e-07, + "loss": 0.8881, + "step": 1550 + }, + { + "epoch": 0.024208551612476356, + "grad_norm": 4.610184669494629, + "learning_rate": 4.83810709838107e-07, + "loss": 1.0267, + "step": 1555 + }, + { + "epoch": 0.024286392614445734, + "grad_norm": 4.136282444000244, + "learning_rate": 4.853673723536737e-07, + "loss": 0.9593, + "step": 1560 + }, + { + "epoch": 0.02436423361641511, + "grad_norm": 4.203325271606445, + "learning_rate": 4.869240348692403e-07, + "loss": 0.9377, + "step": 1565 + }, + { + "epoch": 0.024442074618384487, + "grad_norm": 8.468722343444824, + "learning_rate": 4.88480697384807e-07, + "loss": 0.9949, + "step": 1570 + }, + { + "epoch": 0.024519915620353865, + "grad_norm": 7.116949558258057, + "learning_rate": 4.900373599003736e-07, + "loss": 0.92, + "step": 1575 + }, + { + "epoch": 0.024597756622323243, + "grad_norm": 4.857876777648926, + "learning_rate": 4.915940224159402e-07, + "loss": 0.8945, + "step": 1580 + }, + { + "epoch": 0.02467559762429262, + "grad_norm": 7.421228408813477, + "learning_rate": 4.931506849315068e-07, + "loss": 0.8807, + "step": 1585 + }, + { + "epoch": 0.024753438626261996, + "grad_norm": 7.203330993652344, + "learning_rate": 4.947073474470734e-07, + "loss": 0.8998, + "step": 1590 + }, + { + "epoch": 0.024831279628231374, + "grad_norm": 12.598939895629883, + "learning_rate": 4.9626400996264e-07, + "loss": 0.8843, + "step": 1595 + }, + { + "epoch": 0.024909120630200753, + "grad_norm": 6.573790073394775, + "learning_rate": 4.978206724782067e-07, + "loss": 0.8954, + "step": 1600 + }, + { + "epoch": 0.02498696163217013, + "grad_norm": 5.063882350921631, + "learning_rate": 4.993773349937733e-07, + "loss": 0.8539, + "step": 1605 + }, + { + "epoch": 0.025064802634139505, + "grad_norm": 5.859914779663086, + "learning_rate": 5.0093399750934e-07, + "loss": 0.9102, + "step": 1610 + }, + { + "epoch": 0.025142643636108884, + "grad_norm": 4.542943954467773, + "learning_rate": 5.024906600249066e-07, + "loss": 0.876, + "step": 1615 + }, + { + "epoch": 0.02522048463807826, + "grad_norm": 6.943472862243652, + "learning_rate": 5.040473225404732e-07, + "loss": 0.8886, + "step": 1620 + }, + { + "epoch": 0.02529832564004764, + "grad_norm": 5.794211387634277, + "learning_rate": 5.056039850560398e-07, + "loss": 0.9136, + "step": 1625 + }, + { + "epoch": 0.025376166642017015, + "grad_norm": 3.58612322807312, + "learning_rate": 5.071606475716065e-07, + "loss": 0.8483, + "step": 1630 + }, + { + "epoch": 0.025454007643986393, + "grad_norm": 8.513461112976074, + "learning_rate": 5.087173100871731e-07, + "loss": 0.9484, + "step": 1635 + }, + { + "epoch": 0.02553184864595577, + "grad_norm": 3.152209997177124, + "learning_rate": 5.102739726027398e-07, + "loss": 0.8564, + "step": 1640 + }, + { + "epoch": 0.02560968964792515, + "grad_norm": 11.711703300476074, + "learning_rate": 5.118306351183063e-07, + "loss": 0.7842, + "step": 1645 + }, + { + "epoch": 0.025687530649894527, + "grad_norm": 4.101468086242676, + "learning_rate": 5.13387297633873e-07, + "loss": 0.8646, + "step": 1650 + }, + { + "epoch": 0.025765371651863902, + "grad_norm": 3.844512462615967, + "learning_rate": 5.149439601494395e-07, + "loss": 0.8094, + "step": 1655 + }, + { + "epoch": 0.02584321265383328, + "grad_norm": 3.546029567718506, + "learning_rate": 5.165006226650062e-07, + "loss": 0.87, + "step": 1660 + }, + { + "epoch": 0.025921053655802658, + "grad_norm": 3.3729195594787598, + "learning_rate": 5.180572851805728e-07, + "loss": 0.9467, + "step": 1665 + }, + { + "epoch": 0.025998894657772036, + "grad_norm": 3.984131336212158, + "learning_rate": 5.196139476961394e-07, + "loss": 0.8571, + "step": 1670 + }, + { + "epoch": 0.02607673565974141, + "grad_norm": 5.9442291259765625, + "learning_rate": 5.21170610211706e-07, + "loss": 0.892, + "step": 1675 + }, + { + "epoch": 0.02615457666171079, + "grad_norm": 6.404414653778076, + "learning_rate": 5.227272727272727e-07, + "loss": 0.8978, + "step": 1680 + }, + { + "epoch": 0.026232417663680167, + "grad_norm": 8.53201961517334, + "learning_rate": 5.242839352428393e-07, + "loss": 0.8403, + "step": 1685 + }, + { + "epoch": 0.026310258665649545, + "grad_norm": 7.944653511047363, + "learning_rate": 5.25840597758406e-07, + "loss": 0.7752, + "step": 1690 + }, + { + "epoch": 0.02638809966761892, + "grad_norm": 4.13915491104126, + "learning_rate": 5.273972602739725e-07, + "loss": 1.053, + "step": 1695 + }, + { + "epoch": 0.0264659406695883, + "grad_norm": 9.199925422668457, + "learning_rate": 5.289539227895392e-07, + "loss": 1.0808, + "step": 1700 + }, + { + "epoch": 0.026543781671557676, + "grad_norm": 4.507978439331055, + "learning_rate": 5.305105853051058e-07, + "loss": 0.9654, + "step": 1705 + }, + { + "epoch": 0.026621622673527055, + "grad_norm": 5.004615783691406, + "learning_rate": 5.320672478206725e-07, + "loss": 0.9003, + "step": 1710 + }, + { + "epoch": 0.02669946367549643, + "grad_norm": 9.572540283203125, + "learning_rate": 5.336239103362391e-07, + "loss": 1.0496, + "step": 1715 + }, + { + "epoch": 0.026777304677465807, + "grad_norm": 6.494607925415039, + "learning_rate": 5.351805728518058e-07, + "loss": 0.9283, + "step": 1720 + }, + { + "epoch": 0.026855145679435186, + "grad_norm": 6.419877529144287, + "learning_rate": 5.367372353673723e-07, + "loss": 0.9118, + "step": 1725 + }, + { + "epoch": 0.026932986681404564, + "grad_norm": 8.065162658691406, + "learning_rate": 5.38293897882939e-07, + "loss": 0.9376, + "step": 1730 + }, + { + "epoch": 0.027010827683373942, + "grad_norm": 11.1658935546875, + "learning_rate": 5.398505603985056e-07, + "loss": 0.9734, + "step": 1735 + }, + { + "epoch": 0.027088668685343317, + "grad_norm": 8.80482006072998, + "learning_rate": 5.414072229140723e-07, + "loss": 0.9357, + "step": 1740 + }, + { + "epoch": 0.027166509687312695, + "grad_norm": 9.545907974243164, + "learning_rate": 5.429638854296388e-07, + "loss": 1.0168, + "step": 1745 + }, + { + "epoch": 0.027244350689282073, + "grad_norm": 3.2502315044403076, + "learning_rate": 5.445205479452054e-07, + "loss": 0.8814, + "step": 1750 + }, + { + "epoch": 0.02732219169125145, + "grad_norm": 7.160440921783447, + "learning_rate": 5.46077210460772e-07, + "loss": 0.8058, + "step": 1755 + }, + { + "epoch": 0.027400032693220826, + "grad_norm": 4.625821113586426, + "learning_rate": 5.476338729763387e-07, + "loss": 0.8834, + "step": 1760 + }, + { + "epoch": 0.027477873695190204, + "grad_norm": 6.714595317840576, + "learning_rate": 5.491905354919053e-07, + "loss": 0.9709, + "step": 1765 + }, + { + "epoch": 0.027555714697159582, + "grad_norm": 5.669415473937988, + "learning_rate": 5.50747198007472e-07, + "loss": 0.9646, + "step": 1770 + }, + { + "epoch": 0.02763355569912896, + "grad_norm": 6.046622276306152, + "learning_rate": 5.523038605230385e-07, + "loss": 0.8469, + "step": 1775 + }, + { + "epoch": 0.027711396701098335, + "grad_norm": 16.526947021484375, + "learning_rate": 5.538605230386052e-07, + "loss": 0.877, + "step": 1780 + }, + { + "epoch": 0.027789237703067713, + "grad_norm": 4.415500164031982, + "learning_rate": 5.554171855541718e-07, + "loss": 0.8356, + "step": 1785 + }, + { + "epoch": 0.02786707870503709, + "grad_norm": 4.823260307312012, + "learning_rate": 5.569738480697385e-07, + "loss": 0.9628, + "step": 1790 + }, + { + "epoch": 0.02794491970700647, + "grad_norm": 8.501585006713867, + "learning_rate": 5.585305105853051e-07, + "loss": 0.7859, + "step": 1795 + }, + { + "epoch": 0.028022760708975848, + "grad_norm": 10.616768836975098, + "learning_rate": 5.600871731008718e-07, + "loss": 0.9145, + "step": 1800 + }, + { + "epoch": 0.028100601710945222, + "grad_norm": 6.610407829284668, + "learning_rate": 5.616438356164383e-07, + "loss": 0.8058, + "step": 1805 + }, + { + "epoch": 0.0281784427129146, + "grad_norm": 4.978299617767334, + "learning_rate": 5.63200498132005e-07, + "loss": 0.8, + "step": 1810 + }, + { + "epoch": 0.02825628371488398, + "grad_norm": 4.922807693481445, + "learning_rate": 5.647571606475716e-07, + "loss": 1.0388, + "step": 1815 + }, + { + "epoch": 0.028334124716853357, + "grad_norm": 6.71333122253418, + "learning_rate": 5.663138231631383e-07, + "loss": 0.9221, + "step": 1820 + }, + { + "epoch": 0.02841196571882273, + "grad_norm": 4.787428379058838, + "learning_rate": 5.678704856787049e-07, + "loss": 0.8824, + "step": 1825 + }, + { + "epoch": 0.02848980672079211, + "grad_norm": 8.047598838806152, + "learning_rate": 5.694271481942715e-07, + "loss": 0.8187, + "step": 1830 + }, + { + "epoch": 0.028567647722761488, + "grad_norm": 6.064495086669922, + "learning_rate": 5.70983810709838e-07, + "loss": 0.7595, + "step": 1835 + }, + { + "epoch": 0.028645488724730866, + "grad_norm": 4.46295690536499, + "learning_rate": 5.725404732254047e-07, + "loss": 0.9397, + "step": 1840 + }, + { + "epoch": 0.02872332972670024, + "grad_norm": 7.761974334716797, + "learning_rate": 5.740971357409713e-07, + "loss": 0.915, + "step": 1845 + }, + { + "epoch": 0.02880117072866962, + "grad_norm": 5.134248733520508, + "learning_rate": 5.75653798256538e-07, + "loss": 0.862, + "step": 1850 + }, + { + "epoch": 0.028879011730638997, + "grad_norm": 5.424485206604004, + "learning_rate": 5.772104607721045e-07, + "loss": 0.8359, + "step": 1855 + }, + { + "epoch": 0.028956852732608375, + "grad_norm": 2.9714298248291016, + "learning_rate": 5.787671232876712e-07, + "loss": 0.7999, + "step": 1860 + }, + { + "epoch": 0.029034693734577753, + "grad_norm": 6.131465911865234, + "learning_rate": 5.803237858032378e-07, + "loss": 0.8619, + "step": 1865 + }, + { + "epoch": 0.029112534736547128, + "grad_norm": 7.894665241241455, + "learning_rate": 5.818804483188045e-07, + "loss": 0.771, + "step": 1870 + }, + { + "epoch": 0.029190375738516506, + "grad_norm": 3.163548469543457, + "learning_rate": 5.834371108343711e-07, + "loss": 0.7482, + "step": 1875 + }, + { + "epoch": 0.029268216740485884, + "grad_norm": 5.383469581604004, + "learning_rate": 5.849937733499378e-07, + "loss": 0.895, + "step": 1880 + }, + { + "epoch": 0.029346057742455262, + "grad_norm": 6.841033935546875, + "learning_rate": 5.865504358655043e-07, + "loss": 0.8822, + "step": 1885 + }, + { + "epoch": 0.029423898744424637, + "grad_norm": 9.069436073303223, + "learning_rate": 5.88107098381071e-07, + "loss": 0.8947, + "step": 1890 + }, + { + "epoch": 0.029501739746394015, + "grad_norm": 5.3066725730896, + "learning_rate": 5.896637608966376e-07, + "loss": 0.8046, + "step": 1895 + }, + { + "epoch": 0.029579580748363393, + "grad_norm": 5.761783599853516, + "learning_rate": 5.912204234122043e-07, + "loss": 0.9087, + "step": 1900 + }, + { + "epoch": 0.02965742175033277, + "grad_norm": 3.4487996101379395, + "learning_rate": 5.927770859277709e-07, + "loss": 0.9291, + "step": 1905 + }, + { + "epoch": 0.029735262752302146, + "grad_norm": 5.8793816566467285, + "learning_rate": 5.943337484433375e-07, + "loss": 0.8244, + "step": 1910 + }, + { + "epoch": 0.029813103754271524, + "grad_norm": 6.812746047973633, + "learning_rate": 5.958904109589041e-07, + "loss": 0.8169, + "step": 1915 + }, + { + "epoch": 0.029890944756240902, + "grad_norm": 5.695523738861084, + "learning_rate": 5.974470734744707e-07, + "loss": 0.9072, + "step": 1920 + }, + { + "epoch": 0.02996878575821028, + "grad_norm": 3.443061590194702, + "learning_rate": 5.990037359900373e-07, + "loss": 0.8709, + "step": 1925 + }, + { + "epoch": 0.03004662676017966, + "grad_norm": 6.014828681945801, + "learning_rate": 6.00560398505604e-07, + "loss": 0.917, + "step": 1930 + }, + { + "epoch": 0.030124467762149033, + "grad_norm": 4.14946985244751, + "learning_rate": 6.021170610211705e-07, + "loss": 0.8827, + "step": 1935 + }, + { + "epoch": 0.03020230876411841, + "grad_norm": 4.128273963928223, + "learning_rate": 6.036737235367372e-07, + "loss": 0.8062, + "step": 1940 + }, + { + "epoch": 0.03028014976608779, + "grad_norm": 5.5036115646362305, + "learning_rate": 6.052303860523038e-07, + "loss": 0.9832, + "step": 1945 + }, + { + "epoch": 0.030357990768057168, + "grad_norm": 5.694386005401611, + "learning_rate": 6.067870485678705e-07, + "loss": 0.7735, + "step": 1950 + }, + { + "epoch": 0.030435831770026543, + "grad_norm": 3.861293315887451, + "learning_rate": 6.083437110834371e-07, + "loss": 0.8782, + "step": 1955 + }, + { + "epoch": 0.03051367277199592, + "grad_norm": 5.179184436798096, + "learning_rate": 6.099003735990037e-07, + "loss": 0.908, + "step": 1960 + }, + { + "epoch": 0.0305915137739653, + "grad_norm": 4.929222106933594, + "learning_rate": 6.114570361145703e-07, + "loss": 0.8967, + "step": 1965 + }, + { + "epoch": 0.030669354775934677, + "grad_norm": 3.300053596496582, + "learning_rate": 6.13013698630137e-07, + "loss": 0.9517, + "step": 1970 + }, + { + "epoch": 0.03074719577790405, + "grad_norm": 4.976810932159424, + "learning_rate": 6.145703611457036e-07, + "loss": 0.8676, + "step": 1975 + }, + { + "epoch": 0.03082503677987343, + "grad_norm": 3.866328477859497, + "learning_rate": 6.161270236612703e-07, + "loss": 0.9735, + "step": 1980 + }, + { + "epoch": 0.030902877781842808, + "grad_norm": 4.272680759429932, + "learning_rate": 6.176836861768369e-07, + "loss": 0.9716, + "step": 1985 + }, + { + "epoch": 0.030980718783812186, + "grad_norm": 6.74641752243042, + "learning_rate": 6.192403486924035e-07, + "loss": 0.815, + "step": 1990 + }, + { + "epoch": 0.031058559785781564, + "grad_norm": 3.4278452396392822, + "learning_rate": 6.207970112079701e-07, + "loss": 0.9165, + "step": 1995 + }, + { + "epoch": 0.03113640078775094, + "grad_norm": 7.538846492767334, + "learning_rate": 6.223536737235368e-07, + "loss": 0.9513, + "step": 2000 + }, + { + "epoch": 0.031214241789720317, + "grad_norm": 4.969770431518555, + "learning_rate": 6.239103362391034e-07, + "loss": 0.816, + "step": 2005 + }, + { + "epoch": 0.03129208279168969, + "grad_norm": 9.244134902954102, + "learning_rate": 6.2546699875467e-07, + "loss": 0.9293, + "step": 2010 + }, + { + "epoch": 0.03136992379365907, + "grad_norm": 5.617055416107178, + "learning_rate": 6.270236612702365e-07, + "loss": 0.8553, + "step": 2015 + }, + { + "epoch": 0.03144776479562845, + "grad_norm": 4.888432502746582, + "learning_rate": 6.285803237858031e-07, + "loss": 0.8679, + "step": 2020 + }, + { + "epoch": 0.031525605797597826, + "grad_norm": 4.528554916381836, + "learning_rate": 6.301369863013698e-07, + "loss": 0.9086, + "step": 2025 + }, + { + "epoch": 0.031603446799567204, + "grad_norm": 6.504762172698975, + "learning_rate": 6.316936488169364e-07, + "loss": 0.9501, + "step": 2030 + }, + { + "epoch": 0.03168128780153658, + "grad_norm": 3.974257230758667, + "learning_rate": 6.332503113325031e-07, + "loss": 0.888, + "step": 2035 + }, + { + "epoch": 0.03175912880350596, + "grad_norm": 8.628198623657227, + "learning_rate": 6.348069738480696e-07, + "loss": 0.8664, + "step": 2040 + }, + { + "epoch": 0.03183696980547534, + "grad_norm": 4.1892805099487305, + "learning_rate": 6.363636363636363e-07, + "loss": 0.9896, + "step": 2045 + }, + { + "epoch": 0.03191481080744471, + "grad_norm": 5.350588321685791, + "learning_rate": 6.37920298879203e-07, + "loss": 0.8815, + "step": 2050 + }, + { + "epoch": 0.03199265180941409, + "grad_norm": 5.569740295410156, + "learning_rate": 6.394769613947696e-07, + "loss": 0.8785, + "step": 2055 + }, + { + "epoch": 0.032070492811383466, + "grad_norm": 7.358509063720703, + "learning_rate": 6.410336239103362e-07, + "loss": 0.8415, + "step": 2060 + }, + { + "epoch": 0.032148333813352845, + "grad_norm": 5.384446144104004, + "learning_rate": 6.425902864259029e-07, + "loss": 0.9108, + "step": 2065 + }, + { + "epoch": 0.03222617481532222, + "grad_norm": 4.48892068862915, + "learning_rate": 6.441469489414694e-07, + "loss": 0.9423, + "step": 2070 + }, + { + "epoch": 0.0323040158172916, + "grad_norm": 4.302936553955078, + "learning_rate": 6.457036114570361e-07, + "loss": 0.8849, + "step": 2075 + }, + { + "epoch": 0.03238185681926098, + "grad_norm": 5.185121536254883, + "learning_rate": 6.472602739726027e-07, + "loss": 0.8177, + "step": 2080 + }, + { + "epoch": 0.03245969782123036, + "grad_norm": 3.2999234199523926, + "learning_rate": 6.488169364881694e-07, + "loss": 0.9199, + "step": 2085 + }, + { + "epoch": 0.032537538823199735, + "grad_norm": 19.133163452148438, + "learning_rate": 6.50373599003736e-07, + "loss": 0.9283, + "step": 2090 + }, + { + "epoch": 0.03261537982516911, + "grad_norm": 3.4535083770751953, + "learning_rate": 6.519302615193026e-07, + "loss": 0.9707, + "step": 2095 + }, + { + "epoch": 0.032693220827138485, + "grad_norm": 11.507316589355469, + "learning_rate": 6.534869240348691e-07, + "loss": 0.8878, + "step": 2100 + }, + { + "epoch": 0.03277106182910786, + "grad_norm": 21.363101959228516, + "learning_rate": 6.550435865504358e-07, + "loss": 0.9761, + "step": 2105 + }, + { + "epoch": 0.03284890283107724, + "grad_norm": 4.29213285446167, + "learning_rate": 6.566002490660024e-07, + "loss": 0.7789, + "step": 2110 + }, + { + "epoch": 0.03292674383304662, + "grad_norm": 7.540319442749023, + "learning_rate": 6.581569115815691e-07, + "loss": 0.7991, + "step": 2115 + }, + { + "epoch": 0.033004584835016, + "grad_norm": 3.658780097961426, + "learning_rate": 6.597135740971356e-07, + "loss": 0.9211, + "step": 2120 + }, + { + "epoch": 0.033082425836985375, + "grad_norm": 8.205567359924316, + "learning_rate": 6.612702366127023e-07, + "loss": 0.8083, + "step": 2125 + }, + { + "epoch": 0.033160266838954754, + "grad_norm": 6.272342681884766, + "learning_rate": 6.628268991282689e-07, + "loss": 0.9724, + "step": 2130 + }, + { + "epoch": 0.03323810784092413, + "grad_norm": 7.037917137145996, + "learning_rate": 6.643835616438356e-07, + "loss": 0.8821, + "step": 2135 + }, + { + "epoch": 0.0333159488428935, + "grad_norm": 6.3946027755737305, + "learning_rate": 6.659402241594022e-07, + "loss": 0.9065, + "step": 2140 + }, + { + "epoch": 0.03339378984486288, + "grad_norm": 7.079307556152344, + "learning_rate": 6.674968866749689e-07, + "loss": 0.993, + "step": 2145 + }, + { + "epoch": 0.03347163084683226, + "grad_norm": 6.372123718261719, + "learning_rate": 6.690535491905354e-07, + "loss": 0.9372, + "step": 2150 + }, + { + "epoch": 0.03354947184880164, + "grad_norm": 2.9949862957000732, + "learning_rate": 6.706102117061021e-07, + "loss": 0.7785, + "step": 2155 + }, + { + "epoch": 0.033627312850771016, + "grad_norm": 5.278440475463867, + "learning_rate": 6.721668742216687e-07, + "loss": 0.8646, + "step": 2160 + }, + { + "epoch": 0.033705153852740394, + "grad_norm": 3.972559928894043, + "learning_rate": 6.737235367372354e-07, + "loss": 0.8816, + "step": 2165 + }, + { + "epoch": 0.03378299485470977, + "grad_norm": 7.038811683654785, + "learning_rate": 6.75280199252802e-07, + "loss": 0.8586, + "step": 2170 + }, + { + "epoch": 0.03386083585667915, + "grad_norm": 4.659327507019043, + "learning_rate": 6.768368617683686e-07, + "loss": 0.8671, + "step": 2175 + }, + { + "epoch": 0.03393867685864852, + "grad_norm": 3.272244453430176, + "learning_rate": 6.783935242839352e-07, + "loss": 0.8553, + "step": 2180 + }, + { + "epoch": 0.0340165178606179, + "grad_norm": 4.486519813537598, + "learning_rate": 6.799501867995019e-07, + "loss": 0.9569, + "step": 2185 + }, + { + "epoch": 0.03409435886258728, + "grad_norm": 3.5172436237335205, + "learning_rate": 6.815068493150684e-07, + "loss": 0.9172, + "step": 2190 + }, + { + "epoch": 0.034172199864556656, + "grad_norm": 8.919556617736816, + "learning_rate": 6.830635118306351e-07, + "loss": 0.9005, + "step": 2195 + }, + { + "epoch": 0.034250040866526034, + "grad_norm": 3.1688411235809326, + "learning_rate": 6.846201743462016e-07, + "loss": 0.9052, + "step": 2200 + }, + { + "epoch": 0.03432788186849541, + "grad_norm": 8.181324005126953, + "learning_rate": 6.861768368617683e-07, + "loss": 0.9344, + "step": 2205 + }, + { + "epoch": 0.03440572287046479, + "grad_norm": 14.188647270202637, + "learning_rate": 6.877334993773349e-07, + "loss": 0.8221, + "step": 2210 + }, + { + "epoch": 0.03448356387243417, + "grad_norm": 2.8779571056365967, + "learning_rate": 6.892901618929016e-07, + "loss": 0.8213, + "step": 2215 + }, + { + "epoch": 0.034561404874403547, + "grad_norm": 4.762483596801758, + "learning_rate": 6.908468244084682e-07, + "loss": 0.9539, + "step": 2220 + }, + { + "epoch": 0.03463924587637292, + "grad_norm": 5.372674942016602, + "learning_rate": 6.924034869240348e-07, + "loss": 0.9323, + "step": 2225 + }, + { + "epoch": 0.034717086878342296, + "grad_norm": 4.73727560043335, + "learning_rate": 6.939601494396014e-07, + "loss": 0.9555, + "step": 2230 + }, + { + "epoch": 0.034794927880311674, + "grad_norm": 2.479062557220459, + "learning_rate": 6.955168119551681e-07, + "loss": 0.8026, + "step": 2235 + }, + { + "epoch": 0.03487276888228105, + "grad_norm": 4.98023796081543, + "learning_rate": 6.970734744707347e-07, + "loss": 0.9514, + "step": 2240 + }, + { + "epoch": 0.03495060988425043, + "grad_norm": 4.072389125823975, + "learning_rate": 6.986301369863014e-07, + "loss": 0.9739, + "step": 2245 + }, + { + "epoch": 0.03502845088621981, + "grad_norm": 3.26598858833313, + "learning_rate": 7.00186799501868e-07, + "loss": 0.8112, + "step": 2250 + }, + { + "epoch": 0.03510629188818919, + "grad_norm": 10.324394226074219, + "learning_rate": 7.017434620174346e-07, + "loss": 0.8578, + "step": 2255 + }, + { + "epoch": 0.035184132890158565, + "grad_norm": 7.579793453216553, + "learning_rate": 7.033001245330012e-07, + "loss": 0.8586, + "step": 2260 + }, + { + "epoch": 0.035261973892127936, + "grad_norm": 3.6266613006591797, + "learning_rate": 7.048567870485679e-07, + "loss": 0.8904, + "step": 2265 + }, + { + "epoch": 0.035339814894097314, + "grad_norm": 4.336295127868652, + "learning_rate": 7.064134495641345e-07, + "loss": 0.84, + "step": 2270 + }, + { + "epoch": 0.03541765589606669, + "grad_norm": 3.5872817039489746, + "learning_rate": 7.079701120797012e-07, + "loss": 0.7951, + "step": 2275 + }, + { + "epoch": 0.03549549689803607, + "grad_norm": 4.598228454589844, + "learning_rate": 7.095267745952676e-07, + "loss": 0.7467, + "step": 2280 + }, + { + "epoch": 0.03557333790000545, + "grad_norm": 3.560222625732422, + "learning_rate": 7.110834371108343e-07, + "loss": 0.9047, + "step": 2285 + }, + { + "epoch": 0.03565117890197483, + "grad_norm": 2.8487563133239746, + "learning_rate": 7.126400996264009e-07, + "loss": 0.9243, + "step": 2290 + }, + { + "epoch": 0.035729019903944205, + "grad_norm": 5.525490760803223, + "learning_rate": 7.141967621419676e-07, + "loss": 0.8549, + "step": 2295 + }, + { + "epoch": 0.03580686090591358, + "grad_norm": 3.5428950786590576, + "learning_rate": 7.157534246575342e-07, + "loss": 0.9336, + "step": 2300 + }, + { + "epoch": 0.03588470190788296, + "grad_norm": 8.396724700927734, + "learning_rate": 7.173100871731008e-07, + "loss": 0.9101, + "step": 2305 + }, + { + "epoch": 0.03596254290985233, + "grad_norm": 6.355068206787109, + "learning_rate": 7.188667496886674e-07, + "loss": 0.8673, + "step": 2310 + }, + { + "epoch": 0.03604038391182171, + "grad_norm": 8.388739585876465, + "learning_rate": 7.204234122042341e-07, + "loss": 0.9225, + "step": 2315 + }, + { + "epoch": 0.03611822491379109, + "grad_norm": 4.088027477264404, + "learning_rate": 7.219800747198007e-07, + "loss": 0.8003, + "step": 2320 + }, + { + "epoch": 0.03619606591576047, + "grad_norm": 3.6764137744903564, + "learning_rate": 7.235367372353674e-07, + "loss": 0.782, + "step": 2325 + }, + { + "epoch": 0.036273906917729845, + "grad_norm": 3.6554110050201416, + "learning_rate": 7.25093399750934e-07, + "loss": 0.9257, + "step": 2330 + }, + { + "epoch": 0.03635174791969922, + "grad_norm": 6.99379301071167, + "learning_rate": 7.266500622665006e-07, + "loss": 0.9329, + "step": 2335 + }, + { + "epoch": 0.0364295889216686, + "grad_norm": 3.984800100326538, + "learning_rate": 7.282067247820672e-07, + "loss": 0.9596, + "step": 2340 + }, + { + "epoch": 0.03650742992363798, + "grad_norm": 7.992112159729004, + "learning_rate": 7.297633872976339e-07, + "loss": 0.8945, + "step": 2345 + }, + { + "epoch": 0.03658527092560736, + "grad_norm": 3.314192295074463, + "learning_rate": 7.313200498132005e-07, + "loss": 0.817, + "step": 2350 + }, + { + "epoch": 0.03666311192757673, + "grad_norm": 5.738452434539795, + "learning_rate": 7.328767123287672e-07, + "loss": 0.8133, + "step": 2355 + }, + { + "epoch": 0.03674095292954611, + "grad_norm": 4.364063739776611, + "learning_rate": 7.344333748443337e-07, + "loss": 0.7555, + "step": 2360 + }, + { + "epoch": 0.036818793931515485, + "grad_norm": 6.397834777832031, + "learning_rate": 7.359900373599004e-07, + "loss": 0.9289, + "step": 2365 + }, + { + "epoch": 0.03689663493348486, + "grad_norm": 4.602386951446533, + "learning_rate": 7.375466998754669e-07, + "loss": 0.8466, + "step": 2370 + }, + { + "epoch": 0.03697447593545424, + "grad_norm": 4.438021659851074, + "learning_rate": 7.391033623910336e-07, + "loss": 0.8155, + "step": 2375 + }, + { + "epoch": 0.03705231693742362, + "grad_norm": 5.829861164093018, + "learning_rate": 7.406600249066002e-07, + "loss": 0.9119, + "step": 2380 + }, + { + "epoch": 0.037130157939393, + "grad_norm": 3.999397039413452, + "learning_rate": 7.422166874221668e-07, + "loss": 0.9544, + "step": 2385 + }, + { + "epoch": 0.037207998941362376, + "grad_norm": 7.094069480895996, + "learning_rate": 7.437733499377334e-07, + "loss": 0.8562, + "step": 2390 + }, + { + "epoch": 0.03728583994333175, + "grad_norm": 7.502668857574463, + "learning_rate": 7.453300124533001e-07, + "loss": 0.7626, + "step": 2395 + }, + { + "epoch": 0.037363680945301125, + "grad_norm": 4.224865913391113, + "learning_rate": 7.468866749688667e-07, + "loss": 0.8287, + "step": 2400 + }, + { + "epoch": 0.037441521947270504, + "grad_norm": 4.2678046226501465, + "learning_rate": 7.484433374844334e-07, + "loss": 0.9631, + "step": 2405 + }, + { + "epoch": 0.03751936294923988, + "grad_norm": 4.143566608428955, + "learning_rate": 7.5e-07, + "loss": 0.9013, + "step": 2410 + }, + { + "epoch": 0.03759720395120926, + "grad_norm": 3.8706650733947754, + "learning_rate": 7.515566625155666e-07, + "loss": 0.8463, + "step": 2415 + }, + { + "epoch": 0.03767504495317864, + "grad_norm": 6.372035503387451, + "learning_rate": 7.531133250311332e-07, + "loss": 0.8966, + "step": 2420 + }, + { + "epoch": 0.037752885955148016, + "grad_norm": 4.3398613929748535, + "learning_rate": 7.546699875466999e-07, + "loss": 0.8045, + "step": 2425 + }, + { + "epoch": 0.037830726957117394, + "grad_norm": 2.7824904918670654, + "learning_rate": 7.562266500622665e-07, + "loss": 0.8311, + "step": 2430 + }, + { + "epoch": 0.03790856795908677, + "grad_norm": 3.9570069313049316, + "learning_rate": 7.577833125778332e-07, + "loss": 0.9548, + "step": 2435 + }, + { + "epoch": 0.037986408961056144, + "grad_norm": 4.316530227661133, + "learning_rate": 7.593399750933997e-07, + "loss": 0.7945, + "step": 2440 + }, + { + "epoch": 0.03806424996302552, + "grad_norm": 4.4045844078063965, + "learning_rate": 7.608966376089664e-07, + "loss": 0.9145, + "step": 2445 + }, + { + "epoch": 0.0381420909649949, + "grad_norm": 3.736820697784424, + "learning_rate": 7.62453300124533e-07, + "loss": 0.9292, + "step": 2450 + }, + { + "epoch": 0.03821993196696428, + "grad_norm": 3.8448410034179688, + "learning_rate": 7.640099626400996e-07, + "loss": 0.8863, + "step": 2455 + }, + { + "epoch": 0.038297772968933656, + "grad_norm": 7.468678951263428, + "learning_rate": 7.655666251556662e-07, + "loss": 0.8776, + "step": 2460 + }, + { + "epoch": 0.038375613970903034, + "grad_norm": 4.066128253936768, + "learning_rate": 7.671232876712328e-07, + "loss": 0.906, + "step": 2465 + }, + { + "epoch": 0.03845345497287241, + "grad_norm": 8.009504318237305, + "learning_rate": 7.686799501867994e-07, + "loss": 0.9295, + "step": 2470 + }, + { + "epoch": 0.03853129597484179, + "grad_norm": 3.9662601947784424, + "learning_rate": 7.702366127023661e-07, + "loss": 0.927, + "step": 2475 + }, + { + "epoch": 0.03860913697681117, + "grad_norm": 3.94587779045105, + "learning_rate": 7.717932752179327e-07, + "loss": 0.9679, + "step": 2480 + }, + { + "epoch": 0.03868697797878054, + "grad_norm": 3.856196641921997, + "learning_rate": 7.733499377334994e-07, + "loss": 0.981, + "step": 2485 + }, + { + "epoch": 0.03876481898074992, + "grad_norm": 12.542234420776367, + "learning_rate": 7.749066002490659e-07, + "loss": 0.8065, + "step": 2490 + }, + { + "epoch": 0.038842659982719296, + "grad_norm": 5.717936038970947, + "learning_rate": 7.764632627646326e-07, + "loss": 0.8912, + "step": 2495 + }, + { + "epoch": 0.038920500984688675, + "grad_norm": 9.94604206085205, + "learning_rate": 7.780199252801992e-07, + "loss": 0.9442, + "step": 2500 + }, + { + "epoch": 0.03899834198665805, + "grad_norm": 5.26216983795166, + "learning_rate": 7.795765877957659e-07, + "loss": 0.9552, + "step": 2505 + }, + { + "epoch": 0.03907618298862743, + "grad_norm": 6.468954563140869, + "learning_rate": 7.811332503113325e-07, + "loss": 0.9372, + "step": 2510 + }, + { + "epoch": 0.03915402399059681, + "grad_norm": 2.9301857948303223, + "learning_rate": 7.826899128268992e-07, + "loss": 0.8479, + "step": 2515 + }, + { + "epoch": 0.03923186499256619, + "grad_norm": 6.389108657836914, + "learning_rate": 7.842465753424657e-07, + "loss": 0.9255, + "step": 2520 + }, + { + "epoch": 0.03930970599453556, + "grad_norm": 4.842959880828857, + "learning_rate": 7.858032378580324e-07, + "loss": 0.8478, + "step": 2525 + }, + { + "epoch": 0.03938754699650494, + "grad_norm": 3.118706464767456, + "learning_rate": 7.87359900373599e-07, + "loss": 0.8668, + "step": 2530 + }, + { + "epoch": 0.039465387998474315, + "grad_norm": 6.257364273071289, + "learning_rate": 7.889165628891657e-07, + "loss": 0.873, + "step": 2535 + }, + { + "epoch": 0.03954322900044369, + "grad_norm": 4.405180931091309, + "learning_rate": 7.904732254047323e-07, + "loss": 0.841, + "step": 2540 + }, + { + "epoch": 0.03962107000241307, + "grad_norm": 9.870434761047363, + "learning_rate": 7.920298879202987e-07, + "loss": 0.937, + "step": 2545 + }, + { + "epoch": 0.03969891100438245, + "grad_norm": 3.4615135192871094, + "learning_rate": 7.935865504358654e-07, + "loss": 0.9465, + "step": 2550 + }, + { + "epoch": 0.03977675200635183, + "grad_norm": 4.9833760261535645, + "learning_rate": 7.95143212951432e-07, + "loss": 0.8261, + "step": 2555 + }, + { + "epoch": 0.039854593008321205, + "grad_norm": 4.042236804962158, + "learning_rate": 7.966998754669987e-07, + "loss": 0.9602, + "step": 2560 + }, + { + "epoch": 0.039932434010290584, + "grad_norm": 4.549630641937256, + "learning_rate": 7.982565379825654e-07, + "loss": 0.8674, + "step": 2565 + }, + { + "epoch": 0.040010275012259955, + "grad_norm": 3.67543363571167, + "learning_rate": 7.998132004981319e-07, + "loss": 0.9073, + "step": 2570 + }, + { + "epoch": 0.04008811601422933, + "grad_norm": 6.078221321105957, + "learning_rate": 8.013698630136985e-07, + "loss": 0.9359, + "step": 2575 + }, + { + "epoch": 0.04016595701619871, + "grad_norm": 5.599534034729004, + "learning_rate": 8.029265255292652e-07, + "loss": 0.8628, + "step": 2580 + }, + { + "epoch": 0.04024379801816809, + "grad_norm": 5.098958492279053, + "learning_rate": 8.044831880448319e-07, + "loss": 0.9412, + "step": 2585 + }, + { + "epoch": 0.04032163902013747, + "grad_norm": 7.108897686004639, + "learning_rate": 8.060398505603985e-07, + "loss": 0.9517, + "step": 2590 + }, + { + "epoch": 0.040399480022106846, + "grad_norm": 4.495419979095459, + "learning_rate": 8.075965130759652e-07, + "loss": 0.8605, + "step": 2595 + }, + { + "epoch": 0.040477321024076224, + "grad_norm": 4.583033084869385, + "learning_rate": 8.091531755915317e-07, + "loss": 0.8956, + "step": 2600 + }, + { + "epoch": 0.0405551620260456, + "grad_norm": 5.067065238952637, + "learning_rate": 8.107098381070983e-07, + "loss": 0.8306, + "step": 2605 + }, + { + "epoch": 0.04063300302801497, + "grad_norm": 7.724658012390137, + "learning_rate": 8.12266500622665e-07, + "loss": 0.8716, + "step": 2610 + }, + { + "epoch": 0.04071084402998435, + "grad_norm": 2.8972911834716797, + "learning_rate": 8.138231631382317e-07, + "loss": 0.8987, + "step": 2615 + }, + { + "epoch": 0.04078868503195373, + "grad_norm": 7.840747833251953, + "learning_rate": 8.153798256537983e-07, + "loss": 0.8418, + "step": 2620 + }, + { + "epoch": 0.04086652603392311, + "grad_norm": 7.727685928344727, + "learning_rate": 8.169364881693648e-07, + "loss": 0.9107, + "step": 2625 + }, + { + "epoch": 0.040944367035892486, + "grad_norm": 3.801807165145874, + "learning_rate": 8.184931506849315e-07, + "loss": 0.8083, + "step": 2630 + }, + { + "epoch": 0.041022208037861864, + "grad_norm": 12.985006332397461, + "learning_rate": 8.20049813200498e-07, + "loss": 0.9866, + "step": 2635 + }, + { + "epoch": 0.04110004903983124, + "grad_norm": 3.2062785625457764, + "learning_rate": 8.216064757160647e-07, + "loss": 0.8945, + "step": 2640 + }, + { + "epoch": 0.04117789004180062, + "grad_norm": 4.6915459632873535, + "learning_rate": 8.231631382316313e-07, + "loss": 0.8236, + "step": 2645 + }, + { + "epoch": 0.04125573104377, + "grad_norm": 3.5803701877593994, + "learning_rate": 8.247198007471979e-07, + "loss": 0.8776, + "step": 2650 + }, + { + "epoch": 0.04133357204573937, + "grad_norm": 12.053580284118652, + "learning_rate": 8.262764632627645e-07, + "loss": 0.9105, + "step": 2655 + }, + { + "epoch": 0.04141141304770875, + "grad_norm": 6.285280227661133, + "learning_rate": 8.278331257783312e-07, + "loss": 0.9066, + "step": 2660 + }, + { + "epoch": 0.041489254049678126, + "grad_norm": 5.232326984405518, + "learning_rate": 8.293897882938978e-07, + "loss": 0.8508, + "step": 2665 + }, + { + "epoch": 0.041567095051647504, + "grad_norm": 3.038318395614624, + "learning_rate": 8.309464508094645e-07, + "loss": 0.8356, + "step": 2670 + }, + { + "epoch": 0.04164493605361688, + "grad_norm": 7.6262335777282715, + "learning_rate": 8.32503113325031e-07, + "loss": 0.9568, + "step": 2675 + }, + { + "epoch": 0.04172277705558626, + "grad_norm": 3.0321080684661865, + "learning_rate": 8.340597758405977e-07, + "loss": 0.7881, + "step": 2680 + }, + { + "epoch": 0.04180061805755564, + "grad_norm": 9.739387512207031, + "learning_rate": 8.356164383561643e-07, + "loss": 0.74, + "step": 2685 + }, + { + "epoch": 0.04187845905952502, + "grad_norm": 8.000276565551758, + "learning_rate": 8.37173100871731e-07, + "loss": 0.9413, + "step": 2690 + }, + { + "epoch": 0.041956300061494395, + "grad_norm": 6.706925868988037, + "learning_rate": 8.387297633872976e-07, + "loss": 0.7788, + "step": 2695 + }, + { + "epoch": 0.042034141063463766, + "grad_norm": 6.568419933319092, + "learning_rate": 8.402864259028643e-07, + "loss": 0.8554, + "step": 2700 + }, + { + "epoch": 0.042111982065433144, + "grad_norm": 3.8879165649414062, + "learning_rate": 8.418430884184308e-07, + "loss": 0.9076, + "step": 2705 + }, + { + "epoch": 0.04218982306740252, + "grad_norm": 5.89036226272583, + "learning_rate": 8.433997509339975e-07, + "loss": 0.91, + "step": 2710 + }, + { + "epoch": 0.0422676640693719, + "grad_norm": 5.522625923156738, + "learning_rate": 8.449564134495641e-07, + "loss": 0.9102, + "step": 2715 + }, + { + "epoch": 0.04234550507134128, + "grad_norm": 4.862393379211426, + "learning_rate": 8.465130759651308e-07, + "loss": 0.8503, + "step": 2720 + }, + { + "epoch": 0.04242334607331066, + "grad_norm": 8.545342445373535, + "learning_rate": 8.480697384806973e-07, + "loss": 0.7882, + "step": 2725 + }, + { + "epoch": 0.042501187075280035, + "grad_norm": 3.1325466632843018, + "learning_rate": 8.496264009962639e-07, + "loss": 0.7993, + "step": 2730 + }, + { + "epoch": 0.04257902807724941, + "grad_norm": 3.6244635581970215, + "learning_rate": 8.511830635118305e-07, + "loss": 0.7498, + "step": 2735 + }, + { + "epoch": 0.042656869079218784, + "grad_norm": 7.154248237609863, + "learning_rate": 8.527397260273972e-07, + "loss": 0.856, + "step": 2740 + }, + { + "epoch": 0.04273471008118816, + "grad_norm": 4.3253960609436035, + "learning_rate": 8.542963885429638e-07, + "loss": 1.0214, + "step": 2745 + }, + { + "epoch": 0.04281255108315754, + "grad_norm": 4.56231164932251, + "learning_rate": 8.558530510585305e-07, + "loss": 0.8501, + "step": 2750 + }, + { + "epoch": 0.04289039208512692, + "grad_norm": 3.396204710006714, + "learning_rate": 8.57409713574097e-07, + "loss": 0.9002, + "step": 2755 + }, + { + "epoch": 0.0429682330870963, + "grad_norm": 5.2896952629089355, + "learning_rate": 8.589663760896637e-07, + "loss": 0.8824, + "step": 2760 + }, + { + "epoch": 0.043046074089065675, + "grad_norm": 2.9441330432891846, + "learning_rate": 8.605230386052303e-07, + "loss": 0.915, + "step": 2765 + }, + { + "epoch": 0.04312391509103505, + "grad_norm": 3.7935092449188232, + "learning_rate": 8.62079701120797e-07, + "loss": 0.9273, + "step": 2770 + }, + { + "epoch": 0.04320175609300443, + "grad_norm": 2.87821102142334, + "learning_rate": 8.636363636363636e-07, + "loss": 0.7991, + "step": 2775 + }, + { + "epoch": 0.04327959709497381, + "grad_norm": 6.359185218811035, + "learning_rate": 8.651930261519303e-07, + "loss": 0.8009, + "step": 2780 + }, + { + "epoch": 0.04335743809694318, + "grad_norm": 4.339592456817627, + "learning_rate": 8.667496886674968e-07, + "loss": 0.9357, + "step": 2785 + }, + { + "epoch": 0.04343527909891256, + "grad_norm": 5.373045921325684, + "learning_rate": 8.683063511830635e-07, + "loss": 0.9278, + "step": 2790 + }, + { + "epoch": 0.04351312010088194, + "grad_norm": 4.687058448791504, + "learning_rate": 8.698630136986301e-07, + "loss": 0.8102, + "step": 2795 + }, + { + "epoch": 0.043590961102851315, + "grad_norm": 3.0270180702209473, + "learning_rate": 8.714196762141968e-07, + "loss": 0.7115, + "step": 2800 + }, + { + "epoch": 0.04366880210482069, + "grad_norm": 4.379403114318848, + "learning_rate": 8.729763387297634e-07, + "loss": 0.9258, + "step": 2805 + }, + { + "epoch": 0.04374664310679007, + "grad_norm": 5.339996814727783, + "learning_rate": 8.7453300124533e-07, + "loss": 0.8254, + "step": 2810 + }, + { + "epoch": 0.04382448410875945, + "grad_norm": 8.269269943237305, + "learning_rate": 8.760896637608965e-07, + "loss": 0.8743, + "step": 2815 + }, + { + "epoch": 0.04390232511072883, + "grad_norm": 3.314060688018799, + "learning_rate": 8.776463262764632e-07, + "loss": 0.8462, + "step": 2820 + }, + { + "epoch": 0.043980166112698206, + "grad_norm": 5.628077507019043, + "learning_rate": 8.792029887920298e-07, + "loss": 0.8659, + "step": 2825 + }, + { + "epoch": 0.04405800711466758, + "grad_norm": 5.050654888153076, + "learning_rate": 8.807596513075965e-07, + "loss": 0.9523, + "step": 2830 + }, + { + "epoch": 0.044135848116636955, + "grad_norm": 3.8180229663848877, + "learning_rate": 8.82316313823163e-07, + "loss": 0.8504, + "step": 2835 + }, + { + "epoch": 0.044213689118606334, + "grad_norm": 3.3269238471984863, + "learning_rate": 8.838729763387297e-07, + "loss": 0.8544, + "step": 2840 + }, + { + "epoch": 0.04429153012057571, + "grad_norm": 5.524733066558838, + "learning_rate": 8.854296388542963e-07, + "loss": 0.8963, + "step": 2845 + }, + { + "epoch": 0.04436937112254509, + "grad_norm": 4.741829872131348, + "learning_rate": 8.86986301369863e-07, + "loss": 0.8262, + "step": 2850 + }, + { + "epoch": 0.04444721212451447, + "grad_norm": 4.213449001312256, + "learning_rate": 8.885429638854296e-07, + "loss": 0.8722, + "step": 2855 + }, + { + "epoch": 0.044525053126483846, + "grad_norm": 3.3883323669433594, + "learning_rate": 8.900996264009963e-07, + "loss": 0.8378, + "step": 2860 + }, + { + "epoch": 0.044602894128453224, + "grad_norm": 5.192069053649902, + "learning_rate": 8.916562889165628e-07, + "loss": 0.8204, + "step": 2865 + }, + { + "epoch": 0.044680735130422596, + "grad_norm": 3.5852484703063965, + "learning_rate": 8.932129514321295e-07, + "loss": 0.9008, + "step": 2870 + }, + { + "epoch": 0.044758576132391974, + "grad_norm": 5.334090709686279, + "learning_rate": 8.947696139476961e-07, + "loss": 0.7885, + "step": 2875 + }, + { + "epoch": 0.04483641713436135, + "grad_norm": 5.502117156982422, + "learning_rate": 8.963262764632628e-07, + "loss": 0.8715, + "step": 2880 + }, + { + "epoch": 0.04491425813633073, + "grad_norm": 3.577226400375366, + "learning_rate": 8.978829389788294e-07, + "loss": 0.8076, + "step": 2885 + }, + { + "epoch": 0.04499209913830011, + "grad_norm": 2.6925950050354004, + "learning_rate": 8.99439601494396e-07, + "loss": 0.8624, + "step": 2890 + }, + { + "epoch": 0.045069940140269486, + "grad_norm": 15.992047309875488, + "learning_rate": 9.009962640099626e-07, + "loss": 0.8628, + "step": 2895 + }, + { + "epoch": 0.045147781142238864, + "grad_norm": 4.682984352111816, + "learning_rate": 9.025529265255293e-07, + "loss": 0.876, + "step": 2900 + }, + { + "epoch": 0.04522562214420824, + "grad_norm": 3.694166421890259, + "learning_rate": 9.041095890410958e-07, + "loss": 0.7707, + "step": 2905 + }, + { + "epoch": 0.04530346314617762, + "grad_norm": 6.382852077484131, + "learning_rate": 9.056662515566625e-07, + "loss": 0.8372, + "step": 2910 + }, + { + "epoch": 0.04538130414814699, + "grad_norm": 21.609174728393555, + "learning_rate": 9.07222914072229e-07, + "loss": 0.9099, + "step": 2915 + }, + { + "epoch": 0.04545914515011637, + "grad_norm": 2.4359121322631836, + "learning_rate": 9.087795765877957e-07, + "loss": 0.9096, + "step": 2920 + }, + { + "epoch": 0.04553698615208575, + "grad_norm": 3.6651458740234375, + "learning_rate": 9.103362391033623e-07, + "loss": 0.9408, + "step": 2925 + }, + { + "epoch": 0.045614827154055126, + "grad_norm": 5.385332107543945, + "learning_rate": 9.11892901618929e-07, + "loss": 0.9312, + "step": 2930 + }, + { + "epoch": 0.045692668156024505, + "grad_norm": 3.9548020362854004, + "learning_rate": 9.134495641344956e-07, + "loss": 0.8158, + "step": 2935 + }, + { + "epoch": 0.04577050915799388, + "grad_norm": 2.7402524948120117, + "learning_rate": 9.150062266500622e-07, + "loss": 0.7451, + "step": 2940 + }, + { + "epoch": 0.04584835015996326, + "grad_norm": 6.259926795959473, + "learning_rate": 9.165628891656288e-07, + "loss": 0.7399, + "step": 2945 + }, + { + "epoch": 0.04592619116193264, + "grad_norm": 2.590571403503418, + "learning_rate": 9.181195516811955e-07, + "loss": 0.9088, + "step": 2950 + }, + { + "epoch": 0.04600403216390201, + "grad_norm": 4.331900596618652, + "learning_rate": 9.196762141967621e-07, + "loss": 0.8636, + "step": 2955 + }, + { + "epoch": 0.04608187316587139, + "grad_norm": 5.567667007446289, + "learning_rate": 9.212328767123288e-07, + "loss": 0.9059, + "step": 2960 + }, + { + "epoch": 0.04615971416784077, + "grad_norm": 4.826610565185547, + "learning_rate": 9.227895392278954e-07, + "loss": 0.7942, + "step": 2965 + }, + { + "epoch": 0.046237555169810145, + "grad_norm": 7.561775207519531, + "learning_rate": 9.24346201743462e-07, + "loss": 0.8461, + "step": 2970 + }, + { + "epoch": 0.04631539617177952, + "grad_norm": 4.251841068267822, + "learning_rate": 9.259028642590286e-07, + "loss": 0.8541, + "step": 2975 + }, + { + "epoch": 0.0463932371737489, + "grad_norm": 5.157746315002441, + "learning_rate": 9.274595267745953e-07, + "loss": 0.8863, + "step": 2980 + }, + { + "epoch": 0.04647107817571828, + "grad_norm": 4.906675815582275, + "learning_rate": 9.290161892901619e-07, + "loss": 0.8432, + "step": 2985 + }, + { + "epoch": 0.04654891917768766, + "grad_norm": 4.339780807495117, + "learning_rate": 9.305728518057285e-07, + "loss": 0.8448, + "step": 2990 + }, + { + "epoch": 0.046626760179657036, + "grad_norm": 7.889379024505615, + "learning_rate": 9.32129514321295e-07, + "loss": 0.9431, + "step": 2995 + }, + { + "epoch": 0.04670460118162641, + "grad_norm": 3.7697620391845703, + "learning_rate": 9.336861768368617e-07, + "loss": 0.8569, + "step": 3000 + }, + { + "epoch": 0.046782442183595785, + "grad_norm": 8.277153968811035, + "learning_rate": 9.352428393524283e-07, + "loss": 0.7687, + "step": 3005 + }, + { + "epoch": 0.04686028318556516, + "grad_norm": 6.6820149421691895, + "learning_rate": 9.36799501867995e-07, + "loss": 0.8283, + "step": 3010 + }, + { + "epoch": 0.04693812418753454, + "grad_norm": 6.457581996917725, + "learning_rate": 9.383561643835616e-07, + "loss": 0.8447, + "step": 3015 + }, + { + "epoch": 0.04701596518950392, + "grad_norm": 8.55042552947998, + "learning_rate": 9.399128268991282e-07, + "loss": 0.8986, + "step": 3020 + }, + { + "epoch": 0.0470938061914733, + "grad_norm": 8.297921180725098, + "learning_rate": 9.414694894146948e-07, + "loss": 0.9012, + "step": 3025 + }, + { + "epoch": 0.047171647193442676, + "grad_norm": 7.09883975982666, + "learning_rate": 9.430261519302615e-07, + "loss": 0.7454, + "step": 3030 + }, + { + "epoch": 0.047249488195412054, + "grad_norm": 8.166378021240234, + "learning_rate": 9.445828144458281e-07, + "loss": 0.8405, + "step": 3035 + }, + { + "epoch": 0.04732732919738143, + "grad_norm": 4.509795665740967, + "learning_rate": 9.461394769613948e-07, + "loss": 0.8076, + "step": 3040 + }, + { + "epoch": 0.0474051701993508, + "grad_norm": 8.811022758483887, + "learning_rate": 9.476961394769614e-07, + "loss": 0.8387, + "step": 3045 + }, + { + "epoch": 0.04748301120132018, + "grad_norm": 3.122080087661743, + "learning_rate": 9.49252801992528e-07, + "loss": 0.7069, + "step": 3050 + }, + { + "epoch": 0.04756085220328956, + "grad_norm": 6.84942626953125, + "learning_rate": 9.508094645080946e-07, + "loss": 0.7412, + "step": 3055 + }, + { + "epoch": 0.04763869320525894, + "grad_norm": 7.2728424072265625, + "learning_rate": 9.523661270236613e-07, + "loss": 0.8156, + "step": 3060 + }, + { + "epoch": 0.047716534207228316, + "grad_norm": 4.008538246154785, + "learning_rate": 9.539227895392278e-07, + "loss": 0.8618, + "step": 3065 + }, + { + "epoch": 0.047794375209197694, + "grad_norm": 3.9167230129241943, + "learning_rate": 9.554794520547946e-07, + "loss": 0.8026, + "step": 3070 + }, + { + "epoch": 0.04787221621116707, + "grad_norm": 3.683629274368286, + "learning_rate": 9.570361145703611e-07, + "loss": 0.9918, + "step": 3075 + }, + { + "epoch": 0.04795005721313645, + "grad_norm": 4.745260238647461, + "learning_rate": 9.585927770859277e-07, + "loss": 0.8148, + "step": 3080 + }, + { + "epoch": 0.04802789821510582, + "grad_norm": 2.839996814727783, + "learning_rate": 9.601494396014944e-07, + "loss": 0.8176, + "step": 3085 + }, + { + "epoch": 0.0481057392170752, + "grad_norm": 5.715896129608154, + "learning_rate": 9.61706102117061e-07, + "loss": 0.9266, + "step": 3090 + }, + { + "epoch": 0.04818358021904458, + "grad_norm": 3.44376540184021, + "learning_rate": 9.632627646326275e-07, + "loss": 0.7851, + "step": 3095 + }, + { + "epoch": 0.048261421221013956, + "grad_norm": 4.266874313354492, + "learning_rate": 9.648194271481943e-07, + "loss": 0.8701, + "step": 3100 + }, + { + "epoch": 0.048339262222983334, + "grad_norm": 3.6180949211120605, + "learning_rate": 9.663760896637608e-07, + "loss": 0.9206, + "step": 3105 + }, + { + "epoch": 0.04841710322495271, + "grad_norm": 4.103425979614258, + "learning_rate": 9.679327521793276e-07, + "loss": 0.9025, + "step": 3110 + }, + { + "epoch": 0.04849494422692209, + "grad_norm": 3.339601516723633, + "learning_rate": 9.69489414694894e-07, + "loss": 0.7176, + "step": 3115 + }, + { + "epoch": 0.04857278522889147, + "grad_norm": 5.723580360412598, + "learning_rate": 9.710460772104606e-07, + "loss": 0.7712, + "step": 3120 + }, + { + "epoch": 0.04865062623086085, + "grad_norm": 9.84174919128418, + "learning_rate": 9.726027397260274e-07, + "loss": 0.8301, + "step": 3125 + }, + { + "epoch": 0.04872846723283022, + "grad_norm": 4.004911422729492, + "learning_rate": 9.74159402241594e-07, + "loss": 0.8232, + "step": 3130 + }, + { + "epoch": 0.048806308234799596, + "grad_norm": 4.22821044921875, + "learning_rate": 9.757160647571607e-07, + "loss": 0.8408, + "step": 3135 + }, + { + "epoch": 0.048884149236768974, + "grad_norm": 3.268477439880371, + "learning_rate": 9.772727272727273e-07, + "loss": 0.854, + "step": 3140 + }, + { + "epoch": 0.04896199023873835, + "grad_norm": 3.3312723636627197, + "learning_rate": 9.788293897882938e-07, + "loss": 0.7584, + "step": 3145 + }, + { + "epoch": 0.04903983124070773, + "grad_norm": 2.6721420288085938, + "learning_rate": 9.803860523038606e-07, + "loss": 0.7101, + "step": 3150 + }, + { + "epoch": 0.04911767224267711, + "grad_norm": 3.1036221981048584, + "learning_rate": 9.81942714819427e-07, + "loss": 0.8943, + "step": 3155 + }, + { + "epoch": 0.04919551324464649, + "grad_norm": 4.581750869750977, + "learning_rate": 9.834993773349939e-07, + "loss": 0.956, + "step": 3160 + }, + { + "epoch": 0.049273354246615865, + "grad_norm": 5.273120880126953, + "learning_rate": 9.850560398505604e-07, + "loss": 0.8077, + "step": 3165 + }, + { + "epoch": 0.04935119524858524, + "grad_norm": 7.310013771057129, + "learning_rate": 9.86612702366127e-07, + "loss": 0.8506, + "step": 3170 + }, + { + "epoch": 0.049429036250554614, + "grad_norm": 5.949068069458008, + "learning_rate": 9.881693648816935e-07, + "loss": 0.8132, + "step": 3175 + }, + { + "epoch": 0.04950687725252399, + "grad_norm": 5.228186130523682, + "learning_rate": 9.897260273972602e-07, + "loss": 0.9303, + "step": 3180 + }, + { + "epoch": 0.04958471825449337, + "grad_norm": 4.407035827636719, + "learning_rate": 9.912826899128268e-07, + "loss": 0.8184, + "step": 3185 + }, + { + "epoch": 0.04966255925646275, + "grad_norm": 4.605864524841309, + "learning_rate": 9.928393524283936e-07, + "loss": 0.9336, + "step": 3190 + }, + { + "epoch": 0.04974040025843213, + "grad_norm": 3.0708847045898438, + "learning_rate": 9.9439601494396e-07, + "loss": 0.8725, + "step": 3195 + }, + { + "epoch": 0.049818241260401505, + "grad_norm": 3.3742926120758057, + "learning_rate": 9.959526774595266e-07, + "loss": 0.8121, + "step": 3200 + }, + { + "epoch": 0.04989608226237088, + "grad_norm": 2.685382843017578, + "learning_rate": 9.975093399750934e-07, + "loss": 0.7798, + "step": 3205 + }, + { + "epoch": 0.04997392326434026, + "grad_norm": 4.932633876800537, + "learning_rate": 9.9906600249066e-07, + "loss": 0.8492, + "step": 3210 + }, + { + "epoch": 0.05005176426630963, + "grad_norm": 8.489307403564453, + "learning_rate": 9.999672243981579e-07, + "loss": 0.8355, + "step": 3215 + }, + { + "epoch": 0.05012960526827901, + "grad_norm": 4.679005146026611, + "learning_rate": 9.99885285393553e-07, + "loss": 0.9012, + "step": 3220 + }, + { + "epoch": 0.05020744627024839, + "grad_norm": 6.65717887878418, + "learning_rate": 9.99803346388948e-07, + "loss": 1.0277, + "step": 3225 + }, + { + "epoch": 0.05028528727221777, + "grad_norm": 5.373363494873047, + "learning_rate": 9.99721407384343e-07, + "loss": 1.0007, + "step": 3230 + }, + { + "epoch": 0.050363128274187145, + "grad_norm": 3.9103312492370605, + "learning_rate": 9.996394683797382e-07, + "loss": 0.8015, + "step": 3235 + }, + { + "epoch": 0.05044096927615652, + "grad_norm": 6.019688606262207, + "learning_rate": 9.995575293751332e-07, + "loss": 0.8575, + "step": 3240 + }, + { + "epoch": 0.0505188102781259, + "grad_norm": 17.253416061401367, + "learning_rate": 9.99475590370528e-07, + "loss": 0.7818, + "step": 3245 + }, + { + "epoch": 0.05059665128009528, + "grad_norm": 9.291438102722168, + "learning_rate": 9.993936513659232e-07, + "loss": 0.9093, + "step": 3250 + }, + { + "epoch": 0.05067449228206466, + "grad_norm": 4.7031121253967285, + "learning_rate": 9.993117123613182e-07, + "loss": 0.792, + "step": 3255 + }, + { + "epoch": 0.05075233328403403, + "grad_norm": 3.9141600131988525, + "learning_rate": 9.992297733567131e-07, + "loss": 0.8803, + "step": 3260 + }, + { + "epoch": 0.05083017428600341, + "grad_norm": 5.731180191040039, + "learning_rate": 9.991478343521082e-07, + "loss": 0.9267, + "step": 3265 + }, + { + "epoch": 0.050908015287972785, + "grad_norm": 4.963929653167725, + "learning_rate": 9.990658953475033e-07, + "loss": 0.8762, + "step": 3270 + }, + { + "epoch": 0.050985856289942164, + "grad_norm": 5.126701831817627, + "learning_rate": 9.989839563428983e-07, + "loss": 0.9762, + "step": 3275 + }, + { + "epoch": 0.05106369729191154, + "grad_norm": 5.1071953773498535, + "learning_rate": 9.989020173382934e-07, + "loss": 0.9504, + "step": 3280 + }, + { + "epoch": 0.05114153829388092, + "grad_norm": 4.061114311218262, + "learning_rate": 9.988200783336883e-07, + "loss": 0.821, + "step": 3285 + }, + { + "epoch": 0.0512193792958503, + "grad_norm": 3.604483127593994, + "learning_rate": 9.987381393290833e-07, + "loss": 0.9565, + "step": 3290 + }, + { + "epoch": 0.051297220297819676, + "grad_norm": 4.070693016052246, + "learning_rate": 9.986562003244784e-07, + "loss": 0.7469, + "step": 3295 + }, + { + "epoch": 0.051375061299789054, + "grad_norm": 3.4125092029571533, + "learning_rate": 9.985742613198735e-07, + "loss": 0.7926, + "step": 3300 + }, + { + "epoch": 0.051452902301758426, + "grad_norm": 7.950231075286865, + "learning_rate": 9.984923223152686e-07, + "loss": 0.8422, + "step": 3305 + }, + { + "epoch": 0.051530743303727804, + "grad_norm": 3.185955762863159, + "learning_rate": 9.984103833106634e-07, + "loss": 0.7959, + "step": 3310 + }, + { + "epoch": 0.05160858430569718, + "grad_norm": 4.626750946044922, + "learning_rate": 9.983284443060585e-07, + "loss": 0.9932, + "step": 3315 + }, + { + "epoch": 0.05168642530766656, + "grad_norm": 2.5758249759674072, + "learning_rate": 9.982465053014536e-07, + "loss": 0.7739, + "step": 3320 + }, + { + "epoch": 0.05176426630963594, + "grad_norm": 3.6274349689483643, + "learning_rate": 9.981645662968484e-07, + "loss": 0.8351, + "step": 3325 + }, + { + "epoch": 0.051842107311605316, + "grad_norm": 3.520857572555542, + "learning_rate": 9.980826272922435e-07, + "loss": 0.8815, + "step": 3330 + }, + { + "epoch": 0.051919948313574694, + "grad_norm": 4.665640354156494, + "learning_rate": 9.980006882876386e-07, + "loss": 0.8575, + "step": 3335 + }, + { + "epoch": 0.05199778931554407, + "grad_norm": 5.597052574157715, + "learning_rate": 9.979187492830337e-07, + "loss": 0.8373, + "step": 3340 + }, + { + "epoch": 0.052075630317513444, + "grad_norm": 5.660586357116699, + "learning_rate": 9.978368102784287e-07, + "loss": 0.9164, + "step": 3345 + }, + { + "epoch": 0.05215347131948282, + "grad_norm": 11.376925468444824, + "learning_rate": 9.977548712738238e-07, + "loss": 0.8779, + "step": 3350 + }, + { + "epoch": 0.0522313123214522, + "grad_norm": 3.930678606033325, + "learning_rate": 9.976729322692187e-07, + "loss": 0.8638, + "step": 3355 + }, + { + "epoch": 0.05230915332342158, + "grad_norm": 4.059145450592041, + "learning_rate": 9.975909932646138e-07, + "loss": 0.7965, + "step": 3360 + }, + { + "epoch": 0.052386994325390956, + "grad_norm": 4.585720539093018, + "learning_rate": 9.975090542600088e-07, + "loss": 0.8034, + "step": 3365 + }, + { + "epoch": 0.052464835327360335, + "grad_norm": 5.015563488006592, + "learning_rate": 9.974271152554037e-07, + "loss": 0.8109, + "step": 3370 + }, + { + "epoch": 0.05254267632932971, + "grad_norm": 3.2969090938568115, + "learning_rate": 9.973451762507988e-07, + "loss": 0.9502, + "step": 3375 + }, + { + "epoch": 0.05262051733129909, + "grad_norm": 3.2702388763427734, + "learning_rate": 9.972632372461938e-07, + "loss": 0.8148, + "step": 3380 + }, + { + "epoch": 0.05269835833326847, + "grad_norm": 2.95889949798584, + "learning_rate": 9.97181298241589e-07, + "loss": 0.8935, + "step": 3385 + }, + { + "epoch": 0.05277619933523784, + "grad_norm": 5.157326698303223, + "learning_rate": 9.97099359236984e-07, + "loss": 0.9001, + "step": 3390 + }, + { + "epoch": 0.05285404033720722, + "grad_norm": 3.6577107906341553, + "learning_rate": 9.97017420232379e-07, + "loss": 0.7983, + "step": 3395 + }, + { + "epoch": 0.0529318813391766, + "grad_norm": 2.539867401123047, + "learning_rate": 9.969354812277741e-07, + "loss": 0.732, + "step": 3400 + }, + { + "epoch": 0.053009722341145975, + "grad_norm": 6.6847076416015625, + "learning_rate": 9.96853542223169e-07, + "loss": 0.8909, + "step": 3405 + }, + { + "epoch": 0.05308756334311535, + "grad_norm": 3.6293387413024902, + "learning_rate": 9.96771603218564e-07, + "loss": 0.7757, + "step": 3410 + }, + { + "epoch": 0.05316540434508473, + "grad_norm": 9.500846862792969, + "learning_rate": 9.966896642139592e-07, + "loss": 0.8709, + "step": 3415 + }, + { + "epoch": 0.05324324534705411, + "grad_norm": 8.317655563354492, + "learning_rate": 9.96607725209354e-07, + "loss": 0.8833, + "step": 3420 + }, + { + "epoch": 0.05332108634902349, + "grad_norm": 6.386698246002197, + "learning_rate": 9.96525786204749e-07, + "loss": 0.9136, + "step": 3425 + }, + { + "epoch": 0.05339892735099286, + "grad_norm": 3.567600965499878, + "learning_rate": 9.964438472001442e-07, + "loss": 0.8465, + "step": 3430 + }, + { + "epoch": 0.05347676835296224, + "grad_norm": 7.062701225280762, + "learning_rate": 9.963619081955392e-07, + "loss": 0.8179, + "step": 3435 + }, + { + "epoch": 0.053554609354931615, + "grad_norm": 3.983492851257324, + "learning_rate": 9.962799691909343e-07, + "loss": 0.899, + "step": 3440 + }, + { + "epoch": 0.05363245035690099, + "grad_norm": 7.150521278381348, + "learning_rate": 9.961980301863292e-07, + "loss": 0.7949, + "step": 3445 + }, + { + "epoch": 0.05371029135887037, + "grad_norm": 5.3643107414245605, + "learning_rate": 9.961160911817243e-07, + "loss": 0.893, + "step": 3450 + }, + { + "epoch": 0.05378813236083975, + "grad_norm": 7.8569440841674805, + "learning_rate": 9.960341521771193e-07, + "loss": 0.7597, + "step": 3455 + }, + { + "epoch": 0.05386597336280913, + "grad_norm": 2.990384817123413, + "learning_rate": 9.959522131725144e-07, + "loss": 0.8968, + "step": 3460 + }, + { + "epoch": 0.053943814364778506, + "grad_norm": 11.023333549499512, + "learning_rate": 9.958702741679093e-07, + "loss": 0.8577, + "step": 3465 + }, + { + "epoch": 0.054021655366747884, + "grad_norm": 3.8599610328674316, + "learning_rate": 9.957883351633043e-07, + "loss": 0.8187, + "step": 3470 + }, + { + "epoch": 0.054099496368717255, + "grad_norm": 4.514223575592041, + "learning_rate": 9.957063961586994e-07, + "loss": 0.8948, + "step": 3475 + }, + { + "epoch": 0.05417733737068663, + "grad_norm": 5.561735153198242, + "learning_rate": 9.956244571540945e-07, + "loss": 0.7144, + "step": 3480 + }, + { + "epoch": 0.05425517837265601, + "grad_norm": 2.5921874046325684, + "learning_rate": 9.955425181494896e-07, + "loss": 0.8599, + "step": 3485 + }, + { + "epoch": 0.05433301937462539, + "grad_norm": 4.871161937713623, + "learning_rate": 9.954605791448844e-07, + "loss": 0.9644, + "step": 3490 + }, + { + "epoch": 0.05441086037659477, + "grad_norm": 6.471960544586182, + "learning_rate": 9.953786401402795e-07, + "loss": 0.764, + "step": 3495 + }, + { + "epoch": 0.054488701378564146, + "grad_norm": 5.133829593658447, + "learning_rate": 9.952967011356746e-07, + "loss": 0.8484, + "step": 3500 + }, + { + "epoch": 0.054566542380533524, + "grad_norm": 15.294747352600098, + "learning_rate": 9.952147621310697e-07, + "loss": 0.9278, + "step": 3505 + }, + { + "epoch": 0.0546443833825029, + "grad_norm": 4.0458526611328125, + "learning_rate": 9.951328231264645e-07, + "loss": 0.8015, + "step": 3510 + }, + { + "epoch": 0.05472222438447228, + "grad_norm": 3.96840238571167, + "learning_rate": 9.950508841218596e-07, + "loss": 0.9182, + "step": 3515 + }, + { + "epoch": 0.05480006538644165, + "grad_norm": 3.493230104446411, + "learning_rate": 9.949689451172547e-07, + "loss": 0.7351, + "step": 3520 + }, + { + "epoch": 0.05487790638841103, + "grad_norm": 6.453081130981445, + "learning_rate": 9.948870061126497e-07, + "loss": 0.6706, + "step": 3525 + }, + { + "epoch": 0.05495574739038041, + "grad_norm": 4.883228302001953, + "learning_rate": 9.948050671080446e-07, + "loss": 0.8926, + "step": 3530 + }, + { + "epoch": 0.055033588392349786, + "grad_norm": 8.88487434387207, + "learning_rate": 9.947231281034397e-07, + "loss": 0.7815, + "step": 3535 + }, + { + "epoch": 0.055111429394319164, + "grad_norm": 3.5414915084838867, + "learning_rate": 9.946411890988348e-07, + "loss": 0.8066, + "step": 3540 + }, + { + "epoch": 0.05518927039628854, + "grad_norm": 3.3924477100372314, + "learning_rate": 9.945592500942298e-07, + "loss": 0.8942, + "step": 3545 + }, + { + "epoch": 0.05526711139825792, + "grad_norm": 8.606155395507812, + "learning_rate": 9.94477311089625e-07, + "loss": 0.8062, + "step": 3550 + }, + { + "epoch": 0.0553449524002273, + "grad_norm": 3.5798611640930176, + "learning_rate": 9.9439537208502e-07, + "loss": 0.8145, + "step": 3555 + }, + { + "epoch": 0.05542279340219667, + "grad_norm": 4.816424369812012, + "learning_rate": 9.943134330804148e-07, + "loss": 0.9767, + "step": 3560 + }, + { + "epoch": 0.05550063440416605, + "grad_norm": 3.161212682723999, + "learning_rate": 9.9423149407581e-07, + "loss": 0.7526, + "step": 3565 + }, + { + "epoch": 0.055578475406135426, + "grad_norm": 5.3241143226623535, + "learning_rate": 9.94149555071205e-07, + "loss": 0.8756, + "step": 3570 + }, + { + "epoch": 0.055656316408104804, + "grad_norm": 4.702089786529541, + "learning_rate": 9.940676160665999e-07, + "loss": 0.7844, + "step": 3575 + }, + { + "epoch": 0.05573415741007418, + "grad_norm": 3.6324615478515625, + "learning_rate": 9.93985677061995e-07, + "loss": 0.9757, + "step": 3580 + }, + { + "epoch": 0.05581199841204356, + "grad_norm": 5.574779510498047, + "learning_rate": 9.9390373805739e-07, + "loss": 0.8368, + "step": 3585 + }, + { + "epoch": 0.05588983941401294, + "grad_norm": 3.3760433197021484, + "learning_rate": 9.93821799052785e-07, + "loss": 0.9753, + "step": 3590 + }, + { + "epoch": 0.05596768041598232, + "grad_norm": 3.6447086334228516, + "learning_rate": 9.937398600481802e-07, + "loss": 0.8654, + "step": 3595 + }, + { + "epoch": 0.056045521417951695, + "grad_norm": 10.935750007629395, + "learning_rate": 9.936579210435752e-07, + "loss": 0.8504, + "step": 3600 + }, + { + "epoch": 0.056123362419921066, + "grad_norm": 5.356347560882568, + "learning_rate": 9.9357598203897e-07, + "loss": 0.8439, + "step": 3605 + }, + { + "epoch": 0.056201203421890444, + "grad_norm": 7.737555027008057, + "learning_rate": 9.934940430343652e-07, + "loss": 0.8997, + "step": 3610 + }, + { + "epoch": 0.05627904442385982, + "grad_norm": 4.059571266174316, + "learning_rate": 9.934121040297602e-07, + "loss": 0.736, + "step": 3615 + }, + { + "epoch": 0.0563568854258292, + "grad_norm": 10.28212833404541, + "learning_rate": 9.933301650251551e-07, + "loss": 0.8219, + "step": 3620 + }, + { + "epoch": 0.05643472642779858, + "grad_norm": 7.522468090057373, + "learning_rate": 9.932482260205502e-07, + "loss": 0.7058, + "step": 3625 + }, + { + "epoch": 0.05651256742976796, + "grad_norm": 4.0811872482299805, + "learning_rate": 9.931662870159453e-07, + "loss": 0.8334, + "step": 3630 + }, + { + "epoch": 0.056590408431737335, + "grad_norm": 2.533539295196533, + "learning_rate": 9.930843480113403e-07, + "loss": 0.8185, + "step": 3635 + }, + { + "epoch": 0.05666824943370671, + "grad_norm": 2.272587776184082, + "learning_rate": 9.930024090067354e-07, + "loss": 0.8294, + "step": 3640 + }, + { + "epoch": 0.05674609043567609, + "grad_norm": 4.402963638305664, + "learning_rate": 9.929204700021305e-07, + "loss": 0.8253, + "step": 3645 + }, + { + "epoch": 0.05682393143764546, + "grad_norm": 4.450977802276611, + "learning_rate": 9.928385309975253e-07, + "loss": 0.7287, + "step": 3650 + }, + { + "epoch": 0.05690177243961484, + "grad_norm": 4.995216369628906, + "learning_rate": 9.927565919929204e-07, + "loss": 0.7744, + "step": 3655 + }, + { + "epoch": 0.05697961344158422, + "grad_norm": 4.42352294921875, + "learning_rate": 9.926746529883155e-07, + "loss": 0.8216, + "step": 3660 + }, + { + "epoch": 0.0570574544435536, + "grad_norm": 5.005922317504883, + "learning_rate": 9.925927139837106e-07, + "loss": 0.88, + "step": 3665 + }, + { + "epoch": 0.057135295445522975, + "grad_norm": 4.319427013397217, + "learning_rate": 9.925107749791054e-07, + "loss": 0.9386, + "step": 3670 + }, + { + "epoch": 0.05721313644749235, + "grad_norm": 4.61904239654541, + "learning_rate": 9.924288359745005e-07, + "loss": 0.8248, + "step": 3675 + }, + { + "epoch": 0.05729097744946173, + "grad_norm": 3.656996250152588, + "learning_rate": 9.923468969698956e-07, + "loss": 0.8898, + "step": 3680 + }, + { + "epoch": 0.05736881845143111, + "grad_norm": 10.73847484588623, + "learning_rate": 9.922649579652907e-07, + "loss": 0.7295, + "step": 3685 + }, + { + "epoch": 0.05744665945340048, + "grad_norm": 3.2956910133361816, + "learning_rate": 9.921830189606855e-07, + "loss": 0.7937, + "step": 3690 + }, + { + "epoch": 0.05752450045536986, + "grad_norm": 3.310476541519165, + "learning_rate": 9.921010799560806e-07, + "loss": 0.7597, + "step": 3695 + }, + { + "epoch": 0.05760234145733924, + "grad_norm": 6.073892116546631, + "learning_rate": 9.920191409514757e-07, + "loss": 0.775, + "step": 3700 + }, + { + "epoch": 0.057680182459308615, + "grad_norm": 4.651096820831299, + "learning_rate": 9.919372019468707e-07, + "loss": 0.9085, + "step": 3705 + }, + { + "epoch": 0.057758023461277994, + "grad_norm": 5.112009048461914, + "learning_rate": 9.918552629422658e-07, + "loss": 0.854, + "step": 3710 + }, + { + "epoch": 0.05783586446324737, + "grad_norm": 3.9226460456848145, + "learning_rate": 9.917733239376607e-07, + "loss": 0.8815, + "step": 3715 + }, + { + "epoch": 0.05791370546521675, + "grad_norm": 5.9531707763671875, + "learning_rate": 9.916913849330558e-07, + "loss": 0.8794, + "step": 3720 + }, + { + "epoch": 0.05799154646718613, + "grad_norm": 7.749881744384766, + "learning_rate": 9.916094459284508e-07, + "loss": 0.8347, + "step": 3725 + }, + { + "epoch": 0.058069387469155506, + "grad_norm": 3.2161874771118164, + "learning_rate": 9.915275069238457e-07, + "loss": 0.8297, + "step": 3730 + }, + { + "epoch": 0.05814722847112488, + "grad_norm": 3.4381978511810303, + "learning_rate": 9.914455679192408e-07, + "loss": 0.8016, + "step": 3735 + }, + { + "epoch": 0.058225069473094256, + "grad_norm": 6.175289630889893, + "learning_rate": 9.913636289146358e-07, + "loss": 0.7378, + "step": 3740 + }, + { + "epoch": 0.058302910475063634, + "grad_norm": 7.808245658874512, + "learning_rate": 9.91281689910031e-07, + "loss": 0.8631, + "step": 3745 + }, + { + "epoch": 0.05838075147703301, + "grad_norm": 8.13048267364502, + "learning_rate": 9.91199750905426e-07, + "loss": 0.7241, + "step": 3750 + }, + { + "epoch": 0.05845859247900239, + "grad_norm": 14.47769546508789, + "learning_rate": 9.91117811900821e-07, + "loss": 0.7604, + "step": 3755 + }, + { + "epoch": 0.05853643348097177, + "grad_norm": 13.544578552246094, + "learning_rate": 9.91035872896216e-07, + "loss": 0.9168, + "step": 3760 + }, + { + "epoch": 0.058614274482941146, + "grad_norm": 3.012338638305664, + "learning_rate": 9.90953933891611e-07, + "loss": 0.8438, + "step": 3765 + }, + { + "epoch": 0.058692115484910524, + "grad_norm": 8.543879508972168, + "learning_rate": 9.90871994887006e-07, + "loss": 0.8027, + "step": 3770 + }, + { + "epoch": 0.058769956486879896, + "grad_norm": 3.5552265644073486, + "learning_rate": 9.907900558824012e-07, + "loss": 0.9394, + "step": 3775 + }, + { + "epoch": 0.058847797488849274, + "grad_norm": 2.7634129524230957, + "learning_rate": 9.90708116877796e-07, + "loss": 0.8544, + "step": 3780 + }, + { + "epoch": 0.05892563849081865, + "grad_norm": 4.050414085388184, + "learning_rate": 9.90626177873191e-07, + "loss": 0.8405, + "step": 3785 + }, + { + "epoch": 0.05900347949278803, + "grad_norm": 3.3038461208343506, + "learning_rate": 9.905442388685862e-07, + "loss": 0.763, + "step": 3790 + }, + { + "epoch": 0.05908132049475741, + "grad_norm": 5.79196834564209, + "learning_rate": 9.904622998639812e-07, + "loss": 0.8174, + "step": 3795 + }, + { + "epoch": 0.059159161496726786, + "grad_norm": 4.359936714172363, + "learning_rate": 9.903803608593763e-07, + "loss": 0.8229, + "step": 3800 + }, + { + "epoch": 0.059237002498696165, + "grad_norm": 6.546017169952393, + "learning_rate": 9.902984218547714e-07, + "loss": 0.835, + "step": 3805 + }, + { + "epoch": 0.05931484350066554, + "grad_norm": 6.203246593475342, + "learning_rate": 9.902164828501663e-07, + "loss": 0.9859, + "step": 3810 + }, + { + "epoch": 0.05939268450263492, + "grad_norm": 3.92028546333313, + "learning_rate": 9.901345438455613e-07, + "loss": 0.84, + "step": 3815 + }, + { + "epoch": 0.05947052550460429, + "grad_norm": 4.098803520202637, + "learning_rate": 9.900526048409564e-07, + "loss": 0.8088, + "step": 3820 + }, + { + "epoch": 0.05954836650657367, + "grad_norm": 4.060965061187744, + "learning_rate": 9.899706658363513e-07, + "loss": 0.8048, + "step": 3825 + }, + { + "epoch": 0.05962620750854305, + "grad_norm": 7.130313873291016, + "learning_rate": 9.898887268317463e-07, + "loss": 0.991, + "step": 3830 + }, + { + "epoch": 0.05970404851051243, + "grad_norm": 4.355027198791504, + "learning_rate": 9.898067878271414e-07, + "loss": 0.9168, + "step": 3835 + }, + { + "epoch": 0.059781889512481805, + "grad_norm": 4.409844398498535, + "learning_rate": 9.897248488225365e-07, + "loss": 0.7811, + "step": 3840 + }, + { + "epoch": 0.05985973051445118, + "grad_norm": 4.593713283538818, + "learning_rate": 9.896429098179316e-07, + "loss": 0.9282, + "step": 3845 + }, + { + "epoch": 0.05993757151642056, + "grad_norm": 3.813417911529541, + "learning_rate": 9.895609708133266e-07, + "loss": 0.8671, + "step": 3850 + }, + { + "epoch": 0.06001541251838994, + "grad_norm": 9.554966926574707, + "learning_rate": 9.894790318087215e-07, + "loss": 0.8516, + "step": 3855 + }, + { + "epoch": 0.06009325352035932, + "grad_norm": 3.616415500640869, + "learning_rate": 9.893970928041166e-07, + "loss": 0.8382, + "step": 3860 + }, + { + "epoch": 0.06017109452232869, + "grad_norm": 3.379333019256592, + "learning_rate": 9.893151537995117e-07, + "loss": 0.9661, + "step": 3865 + }, + { + "epoch": 0.06024893552429807, + "grad_norm": 2.6693906784057617, + "learning_rate": 9.892332147949065e-07, + "loss": 0.8133, + "step": 3870 + }, + { + "epoch": 0.060326776526267445, + "grad_norm": 4.557685375213623, + "learning_rate": 9.891512757903016e-07, + "loss": 0.8617, + "step": 3875 + }, + { + "epoch": 0.06040461752823682, + "grad_norm": 2.69423770904541, + "learning_rate": 9.890693367856967e-07, + "loss": 0.7904, + "step": 3880 + }, + { + "epoch": 0.0604824585302062, + "grad_norm": 3.213026762008667, + "learning_rate": 9.889873977810917e-07, + "loss": 0.7852, + "step": 3885 + }, + { + "epoch": 0.06056029953217558, + "grad_norm": 3.25534725189209, + "learning_rate": 9.889054587764868e-07, + "loss": 0.8165, + "step": 3890 + }, + { + "epoch": 0.06063814053414496, + "grad_norm": 5.834784984588623, + "learning_rate": 9.888235197718817e-07, + "loss": 0.9304, + "step": 3895 + }, + { + "epoch": 0.060715981536114336, + "grad_norm": 3.369537353515625, + "learning_rate": 9.887415807672768e-07, + "loss": 0.7562, + "step": 3900 + }, + { + "epoch": 0.06079382253808371, + "grad_norm": 5.367571830749512, + "learning_rate": 9.886596417626718e-07, + "loss": 0.8158, + "step": 3905 + }, + { + "epoch": 0.060871663540053085, + "grad_norm": 4.397671222686768, + "learning_rate": 9.88577702758067e-07, + "loss": 0.8699, + "step": 3910 + }, + { + "epoch": 0.06094950454202246, + "grad_norm": 3.270768404006958, + "learning_rate": 9.88495763753462e-07, + "loss": 0.9022, + "step": 3915 + }, + { + "epoch": 0.06102734554399184, + "grad_norm": 4.194687366485596, + "learning_rate": 9.884138247488568e-07, + "loss": 0.937, + "step": 3920 + }, + { + "epoch": 0.06110518654596122, + "grad_norm": 3.5028905868530273, + "learning_rate": 9.88331885744252e-07, + "loss": 0.9853, + "step": 3925 + }, + { + "epoch": 0.0611830275479306, + "grad_norm": 9.81811237335205, + "learning_rate": 9.88249946739647e-07, + "loss": 0.9332, + "step": 3930 + }, + { + "epoch": 0.061260868549899976, + "grad_norm": 9.531314849853516, + "learning_rate": 9.881680077350419e-07, + "loss": 0.8402, + "step": 3935 + }, + { + "epoch": 0.061338709551869354, + "grad_norm": 6.465907096862793, + "learning_rate": 9.88086068730437e-07, + "loss": 0.9443, + "step": 3940 + }, + { + "epoch": 0.06141655055383873, + "grad_norm": 9.462715148925781, + "learning_rate": 9.88004129725832e-07, + "loss": 0.7544, + "step": 3945 + }, + { + "epoch": 0.0614943915558081, + "grad_norm": 4.005988121032715, + "learning_rate": 9.87922190721227e-07, + "loss": 0.8655, + "step": 3950 + }, + { + "epoch": 0.06157223255777748, + "grad_norm": 6.533730983734131, + "learning_rate": 9.878402517166222e-07, + "loss": 0.9202, + "step": 3955 + }, + { + "epoch": 0.06165007355974686, + "grad_norm": 4.695230484008789, + "learning_rate": 9.877583127120172e-07, + "loss": 0.7832, + "step": 3960 + }, + { + "epoch": 0.06172791456171624, + "grad_norm": 4.281477451324463, + "learning_rate": 9.87676373707412e-07, + "loss": 0.8885, + "step": 3965 + }, + { + "epoch": 0.061805755563685616, + "grad_norm": 4.162761688232422, + "learning_rate": 9.875944347028072e-07, + "loss": 0.8782, + "step": 3970 + }, + { + "epoch": 0.061883596565654994, + "grad_norm": 3.2788217067718506, + "learning_rate": 9.875124956982022e-07, + "loss": 0.901, + "step": 3975 + }, + { + "epoch": 0.06196143756762437, + "grad_norm": 3.823699951171875, + "learning_rate": 9.874305566935971e-07, + "loss": 0.8811, + "step": 3980 + }, + { + "epoch": 0.06203927856959375, + "grad_norm": 5.366037368774414, + "learning_rate": 9.873486176889922e-07, + "loss": 0.9297, + "step": 3985 + }, + { + "epoch": 0.06211711957156313, + "grad_norm": 3.4064414501190186, + "learning_rate": 9.872666786843873e-07, + "loss": 0.8329, + "step": 3990 + }, + { + "epoch": 0.0621949605735325, + "grad_norm": 6.189504146575928, + "learning_rate": 9.871847396797823e-07, + "loss": 0.8325, + "step": 3995 + }, + { + "epoch": 0.06227280157550188, + "grad_norm": 2.825984001159668, + "learning_rate": 9.871028006751774e-07, + "loss": 0.7901, + "step": 4000 + }, + { + "epoch": 0.062350642577471256, + "grad_norm": 3.610321521759033, + "learning_rate": 9.870208616705725e-07, + "loss": 0.7974, + "step": 4005 + }, + { + "epoch": 0.062428483579440634, + "grad_norm": 4.4487128257751465, + "learning_rate": 9.869389226659676e-07, + "loss": 0.9165, + "step": 4010 + }, + { + "epoch": 0.06250632458141, + "grad_norm": 3.201486110687256, + "learning_rate": 9.868569836613624e-07, + "loss": 0.9165, + "step": 4015 + }, + { + "epoch": 0.06258416558337938, + "grad_norm": 6.013232231140137, + "learning_rate": 9.867750446567575e-07, + "loss": 0.7316, + "step": 4020 + }, + { + "epoch": 0.06266200658534876, + "grad_norm": 4.562684535980225, + "learning_rate": 9.866931056521526e-07, + "loss": 0.8648, + "step": 4025 + }, + { + "epoch": 0.06273984758731814, + "grad_norm": 3.915780544281006, + "learning_rate": 9.866111666475474e-07, + "loss": 0.7684, + "step": 4030 + }, + { + "epoch": 0.06281768858928752, + "grad_norm": 13.098698616027832, + "learning_rate": 9.865292276429425e-07, + "loss": 0.8222, + "step": 4035 + }, + { + "epoch": 0.0628955295912569, + "grad_norm": 5.85524320602417, + "learning_rate": 9.864472886383376e-07, + "loss": 0.8593, + "step": 4040 + }, + { + "epoch": 0.06297337059322627, + "grad_norm": 12.446966171264648, + "learning_rate": 9.863653496337327e-07, + "loss": 0.6881, + "step": 4045 + }, + { + "epoch": 0.06305121159519565, + "grad_norm": 3.663348436355591, + "learning_rate": 9.862834106291277e-07, + "loss": 0.6791, + "step": 4050 + }, + { + "epoch": 0.06312905259716503, + "grad_norm": 5.9468159675598145, + "learning_rate": 9.862014716245226e-07, + "loss": 0.883, + "step": 4055 + }, + { + "epoch": 0.06320689359913441, + "grad_norm": 4.544028282165527, + "learning_rate": 9.861195326199177e-07, + "loss": 0.6979, + "step": 4060 + }, + { + "epoch": 0.06328473460110379, + "grad_norm": 4.25548791885376, + "learning_rate": 9.860375936153127e-07, + "loss": 0.757, + "step": 4065 + }, + { + "epoch": 0.06336257560307317, + "grad_norm": 4.892475128173828, + "learning_rate": 9.859556546107078e-07, + "loss": 0.8346, + "step": 4070 + }, + { + "epoch": 0.06344041660504254, + "grad_norm": 3.967132091522217, + "learning_rate": 9.858737156061027e-07, + "loss": 0.7614, + "step": 4075 + }, + { + "epoch": 0.06351825760701192, + "grad_norm": 9.065237998962402, + "learning_rate": 9.857917766014978e-07, + "loss": 0.8471, + "step": 4080 + }, + { + "epoch": 0.0635960986089813, + "grad_norm": 5.109429359436035, + "learning_rate": 9.857098375968928e-07, + "loss": 0.7441, + "step": 4085 + }, + { + "epoch": 0.06367393961095068, + "grad_norm": 13.242950439453125, + "learning_rate": 9.85627898592288e-07, + "loss": 0.7784, + "step": 4090 + }, + { + "epoch": 0.06375178061292006, + "grad_norm": 7.870430946350098, + "learning_rate": 9.855459595876828e-07, + "loss": 0.9225, + "step": 4095 + }, + { + "epoch": 0.06382962161488942, + "grad_norm": 6.2109761238098145, + "learning_rate": 9.854640205830778e-07, + "loss": 0.8741, + "step": 4100 + }, + { + "epoch": 0.0639074626168588, + "grad_norm": 4.566768169403076, + "learning_rate": 9.85382081578473e-07, + "loss": 0.7312, + "step": 4105 + }, + { + "epoch": 0.06398530361882818, + "grad_norm": 4.343275547027588, + "learning_rate": 9.85300142573868e-07, + "loss": 0.8077, + "step": 4110 + }, + { + "epoch": 0.06406314462079755, + "grad_norm": 3.710590124130249, + "learning_rate": 9.85218203569263e-07, + "loss": 0.8512, + "step": 4115 + }, + { + "epoch": 0.06414098562276693, + "grad_norm": 5.875495433807373, + "learning_rate": 9.85136264564658e-07, + "loss": 0.8588, + "step": 4120 + }, + { + "epoch": 0.06421882662473631, + "grad_norm": 5.609859943389893, + "learning_rate": 9.85054325560053e-07, + "loss": 0.9756, + "step": 4125 + }, + { + "epoch": 0.06429666762670569, + "grad_norm": 3.695260763168335, + "learning_rate": 9.84972386555448e-07, + "loss": 0.8677, + "step": 4130 + }, + { + "epoch": 0.06437450862867507, + "grad_norm": 4.265758991241455, + "learning_rate": 9.848904475508432e-07, + "loss": 0.868, + "step": 4135 + }, + { + "epoch": 0.06445234963064445, + "grad_norm": 5.0540361404418945, + "learning_rate": 9.84808508546238e-07, + "loss": 0.7448, + "step": 4140 + }, + { + "epoch": 0.06453019063261382, + "grad_norm": 3.1422078609466553, + "learning_rate": 9.84726569541633e-07, + "loss": 0.8085, + "step": 4145 + }, + { + "epoch": 0.0646080316345832, + "grad_norm": 3.257333755493164, + "learning_rate": 9.846446305370282e-07, + "loss": 0.904, + "step": 4150 + }, + { + "epoch": 0.06468587263655258, + "grad_norm": 6.303824424743652, + "learning_rate": 9.845626915324232e-07, + "loss": 0.6844, + "step": 4155 + }, + { + "epoch": 0.06476371363852196, + "grad_norm": 7.541611194610596, + "learning_rate": 9.844807525278183e-07, + "loss": 0.9385, + "step": 4160 + }, + { + "epoch": 0.06484155464049134, + "grad_norm": 3.217496633529663, + "learning_rate": 9.843988135232134e-07, + "loss": 0.8201, + "step": 4165 + }, + { + "epoch": 0.06491939564246071, + "grad_norm": 4.375589370727539, + "learning_rate": 9.843168745186083e-07, + "loss": 0.8042, + "step": 4170 + }, + { + "epoch": 0.06499723664443009, + "grad_norm": 6.62051248550415, + "learning_rate": 9.842349355140033e-07, + "loss": 0.7035, + "step": 4175 + }, + { + "epoch": 0.06507507764639947, + "grad_norm": 4.503577709197998, + "learning_rate": 9.841529965093984e-07, + "loss": 0.8564, + "step": 4180 + }, + { + "epoch": 0.06515291864836883, + "grad_norm": 3.583695411682129, + "learning_rate": 9.840710575047933e-07, + "loss": 0.9069, + "step": 4185 + }, + { + "epoch": 0.06523075965033821, + "grad_norm": 4.029445648193359, + "learning_rate": 9.839891185001883e-07, + "loss": 0.8835, + "step": 4190 + }, + { + "epoch": 0.06530860065230759, + "grad_norm": 3.6656410694122314, + "learning_rate": 9.839071794955834e-07, + "loss": 0.7814, + "step": 4195 + }, + { + "epoch": 0.06538644165427697, + "grad_norm": 3.0505213737487793, + "learning_rate": 9.838252404909785e-07, + "loss": 0.7942, + "step": 4200 + }, + { + "epoch": 0.06546428265624635, + "grad_norm": 4.775297164916992, + "learning_rate": 9.837433014863736e-07, + "loss": 0.8875, + "step": 4205 + }, + { + "epoch": 0.06554212365821573, + "grad_norm": 5.490566253662109, + "learning_rate": 9.836613624817686e-07, + "loss": 0.7635, + "step": 4210 + }, + { + "epoch": 0.0656199646601851, + "grad_norm": 3.202033519744873, + "learning_rate": 9.835794234771635e-07, + "loss": 0.7558, + "step": 4215 + }, + { + "epoch": 0.06569780566215448, + "grad_norm": 5.484325408935547, + "learning_rate": 9.834974844725586e-07, + "loss": 0.8066, + "step": 4220 + }, + { + "epoch": 0.06577564666412386, + "grad_norm": 2.903610944747925, + "learning_rate": 9.834155454679537e-07, + "loss": 0.7833, + "step": 4225 + }, + { + "epoch": 0.06585348766609324, + "grad_norm": 3.188546895980835, + "learning_rate": 9.833336064633485e-07, + "loss": 0.7774, + "step": 4230 + }, + { + "epoch": 0.06593132866806262, + "grad_norm": 3.055574655532837, + "learning_rate": 9.832516674587436e-07, + "loss": 0.6551, + "step": 4235 + }, + { + "epoch": 0.066009169670032, + "grad_norm": 4.439972877502441, + "learning_rate": 9.831697284541387e-07, + "loss": 0.7456, + "step": 4240 + }, + { + "epoch": 0.06608701067200137, + "grad_norm": 2.4513139724731445, + "learning_rate": 9.830877894495337e-07, + "loss": 0.7752, + "step": 4245 + }, + { + "epoch": 0.06616485167397075, + "grad_norm": 4.66846227645874, + "learning_rate": 9.830058504449288e-07, + "loss": 0.9322, + "step": 4250 + }, + { + "epoch": 0.06624269267594013, + "grad_norm": 4.819527626037598, + "learning_rate": 9.82923911440324e-07, + "loss": 0.9361, + "step": 4255 + }, + { + "epoch": 0.06632053367790951, + "grad_norm": 8.028414726257324, + "learning_rate": 9.828419724357188e-07, + "loss": 0.8214, + "step": 4260 + }, + { + "epoch": 0.06639837467987889, + "grad_norm": 3.565459728240967, + "learning_rate": 9.827600334311138e-07, + "loss": 0.7668, + "step": 4265 + }, + { + "epoch": 0.06647621568184826, + "grad_norm": 2.9492602348327637, + "learning_rate": 9.82678094426509e-07, + "loss": 0.7513, + "step": 4270 + }, + { + "epoch": 0.06655405668381763, + "grad_norm": 4.8683576583862305, + "learning_rate": 9.82596155421904e-07, + "loss": 0.8725, + "step": 4275 + }, + { + "epoch": 0.066631897685787, + "grad_norm": 4.162265300750732, + "learning_rate": 9.825142164172989e-07, + "loss": 0.8275, + "step": 4280 + }, + { + "epoch": 0.06670973868775638, + "grad_norm": 3.6537702083587646, + "learning_rate": 9.82432277412694e-07, + "loss": 0.7264, + "step": 4285 + }, + { + "epoch": 0.06678757968972576, + "grad_norm": 3.9282073974609375, + "learning_rate": 9.82350338408089e-07, + "loss": 0.977, + "step": 4290 + }, + { + "epoch": 0.06686542069169514, + "grad_norm": 5.129037857055664, + "learning_rate": 9.82268399403484e-07, + "loss": 0.8609, + "step": 4295 + }, + { + "epoch": 0.06694326169366452, + "grad_norm": 4.563994884490967, + "learning_rate": 9.82186460398879e-07, + "loss": 0.8303, + "step": 4300 + }, + { + "epoch": 0.0670211026956339, + "grad_norm": 3.177889585494995, + "learning_rate": 9.82104521394274e-07, + "loss": 0.9134, + "step": 4305 + }, + { + "epoch": 0.06709894369760327, + "grad_norm": 4.675817966461182, + "learning_rate": 9.82022582389669e-07, + "loss": 0.7188, + "step": 4310 + }, + { + "epoch": 0.06717678469957265, + "grad_norm": 6.9661173820495605, + "learning_rate": 9.819406433850642e-07, + "loss": 0.7871, + "step": 4315 + }, + { + "epoch": 0.06725462570154203, + "grad_norm": 6.177728176116943, + "learning_rate": 9.818587043804592e-07, + "loss": 0.7438, + "step": 4320 + }, + { + "epoch": 0.06733246670351141, + "grad_norm": 3.9021103382110596, + "learning_rate": 9.81776765375854e-07, + "loss": 0.8456, + "step": 4325 + }, + { + "epoch": 0.06741030770548079, + "grad_norm": 6.576573371887207, + "learning_rate": 9.816948263712492e-07, + "loss": 0.8173, + "step": 4330 + }, + { + "epoch": 0.06748814870745017, + "grad_norm": 3.117799997329712, + "learning_rate": 9.816128873666442e-07, + "loss": 0.8552, + "step": 4335 + }, + { + "epoch": 0.06756598970941954, + "grad_norm": 5.52931022644043, + "learning_rate": 9.815309483620391e-07, + "loss": 0.7353, + "step": 4340 + }, + { + "epoch": 0.06764383071138892, + "grad_norm": 3.3571298122406006, + "learning_rate": 9.814490093574342e-07, + "loss": 0.7253, + "step": 4345 + }, + { + "epoch": 0.0677216717133583, + "grad_norm": 4.7125468254089355, + "learning_rate": 9.813670703528293e-07, + "loss": 0.8708, + "step": 4350 + }, + { + "epoch": 0.06779951271532768, + "grad_norm": 3.7811620235443115, + "learning_rate": 9.812851313482243e-07, + "loss": 0.744, + "step": 4355 + }, + { + "epoch": 0.06787735371729704, + "grad_norm": 4.079869270324707, + "learning_rate": 9.812031923436194e-07, + "loss": 0.8291, + "step": 4360 + }, + { + "epoch": 0.06795519471926642, + "grad_norm": 2.9714179039001465, + "learning_rate": 9.811212533390145e-07, + "loss": 0.879, + "step": 4365 + }, + { + "epoch": 0.0680330357212358, + "grad_norm": 4.301975250244141, + "learning_rate": 9.810393143344094e-07, + "loss": 0.7528, + "step": 4370 + }, + { + "epoch": 0.06811087672320518, + "grad_norm": 4.707742214202881, + "learning_rate": 9.809573753298044e-07, + "loss": 0.7686, + "step": 4375 + }, + { + "epoch": 0.06818871772517456, + "grad_norm": 2.911092758178711, + "learning_rate": 9.808754363251995e-07, + "loss": 0.8224, + "step": 4380 + }, + { + "epoch": 0.06826655872714393, + "grad_norm": 3.809354543685913, + "learning_rate": 9.807934973205944e-07, + "loss": 0.9337, + "step": 4385 + }, + { + "epoch": 0.06834439972911331, + "grad_norm": 3.0105934143066406, + "learning_rate": 9.807115583159894e-07, + "loss": 0.7952, + "step": 4390 + }, + { + "epoch": 0.06842224073108269, + "grad_norm": 4.267519474029541, + "learning_rate": 9.806296193113845e-07, + "loss": 0.9312, + "step": 4395 + }, + { + "epoch": 0.06850008173305207, + "grad_norm": 13.714824676513672, + "learning_rate": 9.805476803067796e-07, + "loss": 0.7266, + "step": 4400 + }, + { + "epoch": 0.06857792273502145, + "grad_norm": 5.861302852630615, + "learning_rate": 9.804657413021747e-07, + "loss": 0.8267, + "step": 4405 + }, + { + "epoch": 0.06865576373699082, + "grad_norm": 4.226170539855957, + "learning_rate": 9.803838022975697e-07, + "loss": 0.6108, + "step": 4410 + }, + { + "epoch": 0.0687336047389602, + "grad_norm": 4.260887145996094, + "learning_rate": 9.803018632929648e-07, + "loss": 0.7917, + "step": 4415 + }, + { + "epoch": 0.06881144574092958, + "grad_norm": 2.1800050735473633, + "learning_rate": 9.802199242883597e-07, + "loss": 0.7279, + "step": 4420 + }, + { + "epoch": 0.06888928674289896, + "grad_norm": 4.386568069458008, + "learning_rate": 9.801379852837548e-07, + "loss": 0.7997, + "step": 4425 + }, + { + "epoch": 0.06896712774486834, + "grad_norm": 7.1831135749816895, + "learning_rate": 9.800560462791498e-07, + "loss": 0.9703, + "step": 4430 + }, + { + "epoch": 0.06904496874683771, + "grad_norm": 7.631860733032227, + "learning_rate": 9.799741072745447e-07, + "loss": 0.7836, + "step": 4435 + }, + { + "epoch": 0.06912280974880709, + "grad_norm": 3.6150078773498535, + "learning_rate": 9.798921682699398e-07, + "loss": 0.6982, + "step": 4440 + }, + { + "epoch": 0.06920065075077646, + "grad_norm": 5.267273902893066, + "learning_rate": 9.798102292653348e-07, + "loss": 0.9292, + "step": 4445 + }, + { + "epoch": 0.06927849175274584, + "grad_norm": 6.139009952545166, + "learning_rate": 9.7972829026073e-07, + "loss": 0.776, + "step": 4450 + }, + { + "epoch": 0.06935633275471521, + "grad_norm": 6.20229959487915, + "learning_rate": 9.79646351256125e-07, + "loss": 0.8239, + "step": 4455 + }, + { + "epoch": 0.06943417375668459, + "grad_norm": 3.204371929168701, + "learning_rate": 9.7956441225152e-07, + "loss": 0.8123, + "step": 4460 + }, + { + "epoch": 0.06951201475865397, + "grad_norm": 4.521599769592285, + "learning_rate": 9.79482473246915e-07, + "loss": 0.7049, + "step": 4465 + }, + { + "epoch": 0.06958985576062335, + "grad_norm": 5.0935750007629395, + "learning_rate": 9.7940053424231e-07, + "loss": 0.8673, + "step": 4470 + }, + { + "epoch": 0.06966769676259273, + "grad_norm": 7.926290512084961, + "learning_rate": 9.79318595237705e-07, + "loss": 0.8195, + "step": 4475 + }, + { + "epoch": 0.0697455377645621, + "grad_norm": 4.315165042877197, + "learning_rate": 9.792366562331e-07, + "loss": 0.8674, + "step": 4480 + }, + { + "epoch": 0.06982337876653148, + "grad_norm": 3.775836706161499, + "learning_rate": 9.79154717228495e-07, + "loss": 0.8334, + "step": 4485 + }, + { + "epoch": 0.06990121976850086, + "grad_norm": 2.560904026031494, + "learning_rate": 9.7907277822389e-07, + "loss": 0.8271, + "step": 4490 + }, + { + "epoch": 0.06997906077047024, + "grad_norm": 11.29925537109375, + "learning_rate": 9.789908392192852e-07, + "loss": 0.9633, + "step": 4495 + }, + { + "epoch": 0.07005690177243962, + "grad_norm": 4.101975917816162, + "learning_rate": 9.789089002146802e-07, + "loss": 0.8858, + "step": 4500 + }, + { + "epoch": 0.070134742774409, + "grad_norm": 2.970782518386841, + "learning_rate": 9.78826961210075e-07, + "loss": 0.8608, + "step": 4505 + }, + { + "epoch": 0.07021258377637837, + "grad_norm": 7.289088726043701, + "learning_rate": 9.787450222054702e-07, + "loss": 0.8347, + "step": 4510 + }, + { + "epoch": 0.07029042477834775, + "grad_norm": 7.107760429382324, + "learning_rate": 9.786630832008653e-07, + "loss": 0.771, + "step": 4515 + }, + { + "epoch": 0.07036826578031713, + "grad_norm": 3.630275249481201, + "learning_rate": 9.785811441962603e-07, + "loss": 0.7113, + "step": 4520 + }, + { + "epoch": 0.07044610678228651, + "grad_norm": 4.681270122528076, + "learning_rate": 9.784992051916554e-07, + "loss": 0.8278, + "step": 4525 + }, + { + "epoch": 0.07052394778425587, + "grad_norm": 3.6923000812530518, + "learning_rate": 9.784172661870503e-07, + "loss": 0.8067, + "step": 4530 + }, + { + "epoch": 0.07060178878622525, + "grad_norm": 3.538496255874634, + "learning_rate": 9.783353271824453e-07, + "loss": 0.7577, + "step": 4535 + }, + { + "epoch": 0.07067962978819463, + "grad_norm": 3.3996520042419434, + "learning_rate": 9.782533881778404e-07, + "loss": 0.9051, + "step": 4540 + }, + { + "epoch": 0.070757470790164, + "grad_norm": 4.107473850250244, + "learning_rate": 9.781714491732353e-07, + "loss": 0.8982, + "step": 4545 + }, + { + "epoch": 0.07083531179213338, + "grad_norm": 2.9986937046051025, + "learning_rate": 9.780895101686304e-07, + "loss": 0.9025, + "step": 4550 + }, + { + "epoch": 0.07091315279410276, + "grad_norm": 3.413224697113037, + "learning_rate": 9.780075711640254e-07, + "loss": 0.7699, + "step": 4555 + }, + { + "epoch": 0.07099099379607214, + "grad_norm": 3.332380771636963, + "learning_rate": 9.779256321594205e-07, + "loss": 0.6913, + "step": 4560 + }, + { + "epoch": 0.07106883479804152, + "grad_norm": 3.161701202392578, + "learning_rate": 9.778436931548156e-07, + "loss": 0.7502, + "step": 4565 + }, + { + "epoch": 0.0711466758000109, + "grad_norm": 3.6863913536071777, + "learning_rate": 9.777617541502107e-07, + "loss": 0.7959, + "step": 4570 + }, + { + "epoch": 0.07122451680198028, + "grad_norm": 4.537403583526611, + "learning_rate": 9.776798151456055e-07, + "loss": 0.8646, + "step": 4575 + }, + { + "epoch": 0.07130235780394965, + "grad_norm": 4.111873149871826, + "learning_rate": 9.775978761410006e-07, + "loss": 0.8503, + "step": 4580 + }, + { + "epoch": 0.07138019880591903, + "grad_norm": 8.788448333740234, + "learning_rate": 9.775159371363957e-07, + "loss": 0.8046, + "step": 4585 + }, + { + "epoch": 0.07145803980788841, + "grad_norm": 5.538233757019043, + "learning_rate": 9.774339981317905e-07, + "loss": 0.8018, + "step": 4590 + }, + { + "epoch": 0.07153588080985779, + "grad_norm": 6.06341028213501, + "learning_rate": 9.773520591271856e-07, + "loss": 0.8073, + "step": 4595 + }, + { + "epoch": 0.07161372181182717, + "grad_norm": 3.6553616523742676, + "learning_rate": 9.772701201225807e-07, + "loss": 0.8142, + "step": 4600 + }, + { + "epoch": 0.07169156281379654, + "grad_norm": 4.252196311950684, + "learning_rate": 9.771881811179758e-07, + "loss": 0.7505, + "step": 4605 + }, + { + "epoch": 0.07176940381576592, + "grad_norm": 3.3813109397888184, + "learning_rate": 9.771062421133708e-07, + "loss": 0.7076, + "step": 4610 + }, + { + "epoch": 0.0718472448177353, + "grad_norm": 9.012163162231445, + "learning_rate": 9.77024303108766e-07, + "loss": 0.8812, + "step": 4615 + }, + { + "epoch": 0.07192508581970466, + "grad_norm": 6.12354040145874, + "learning_rate": 9.769423641041608e-07, + "loss": 0.8218, + "step": 4620 + }, + { + "epoch": 0.07200292682167404, + "grad_norm": 3.364898681640625, + "learning_rate": 9.768604250995558e-07, + "loss": 0.8775, + "step": 4625 + }, + { + "epoch": 0.07208076782364342, + "grad_norm": 13.047234535217285, + "learning_rate": 9.76778486094951e-07, + "loss": 0.7578, + "step": 4630 + }, + { + "epoch": 0.0721586088256128, + "grad_norm": 6.722197532653809, + "learning_rate": 9.766965470903458e-07, + "loss": 0.8446, + "step": 4635 + }, + { + "epoch": 0.07223644982758218, + "grad_norm": 4.028960227966309, + "learning_rate": 9.766146080857409e-07, + "loss": 0.8284, + "step": 4640 + }, + { + "epoch": 0.07231429082955156, + "grad_norm": 3.668736219406128, + "learning_rate": 9.76532669081136e-07, + "loss": 0.8298, + "step": 4645 + }, + { + "epoch": 0.07239213183152093, + "grad_norm": 3.391463041305542, + "learning_rate": 9.76450730076531e-07, + "loss": 0.8038, + "step": 4650 + }, + { + "epoch": 0.07246997283349031, + "grad_norm": 2.8080356121063232, + "learning_rate": 9.76368791071926e-07, + "loss": 0.8614, + "step": 4655 + }, + { + "epoch": 0.07254781383545969, + "grad_norm": 3.9080796241760254, + "learning_rate": 9.762868520673212e-07, + "loss": 0.8239, + "step": 4660 + }, + { + "epoch": 0.07262565483742907, + "grad_norm": 3.0968992710113525, + "learning_rate": 9.76204913062716e-07, + "loss": 0.8751, + "step": 4665 + }, + { + "epoch": 0.07270349583939845, + "grad_norm": 6.975797176361084, + "learning_rate": 9.76122974058111e-07, + "loss": 0.7877, + "step": 4670 + }, + { + "epoch": 0.07278133684136782, + "grad_norm": 5.175839424133301, + "learning_rate": 9.760410350535062e-07, + "loss": 0.7242, + "step": 4675 + }, + { + "epoch": 0.0728591778433372, + "grad_norm": 3.86811900138855, + "learning_rate": 9.759590960489012e-07, + "loss": 0.8628, + "step": 4680 + }, + { + "epoch": 0.07293701884530658, + "grad_norm": 4.670974254608154, + "learning_rate": 9.75877157044296e-07, + "loss": 0.7741, + "step": 4685 + }, + { + "epoch": 0.07301485984727596, + "grad_norm": 3.4863369464874268, + "learning_rate": 9.757952180396912e-07, + "loss": 0.9401, + "step": 4690 + }, + { + "epoch": 0.07309270084924534, + "grad_norm": 4.012441158294678, + "learning_rate": 9.757132790350863e-07, + "loss": 0.8949, + "step": 4695 + }, + { + "epoch": 0.07317054185121472, + "grad_norm": 3.7120773792266846, + "learning_rate": 9.756313400304813e-07, + "loss": 0.912, + "step": 4700 + }, + { + "epoch": 0.07324838285318408, + "grad_norm": 4.149153232574463, + "learning_rate": 9.755494010258762e-07, + "loss": 0.7284, + "step": 4705 + }, + { + "epoch": 0.07332622385515346, + "grad_norm": 3.724862813949585, + "learning_rate": 9.754674620212713e-07, + "loss": 0.8486, + "step": 4710 + }, + { + "epoch": 0.07340406485712284, + "grad_norm": 5.275464057922363, + "learning_rate": 9.753855230166663e-07, + "loss": 0.7661, + "step": 4715 + }, + { + "epoch": 0.07348190585909221, + "grad_norm": 8.389967918395996, + "learning_rate": 9.753035840120614e-07, + "loss": 0.8646, + "step": 4720 + }, + { + "epoch": 0.07355974686106159, + "grad_norm": 4.1537017822265625, + "learning_rate": 9.752216450074565e-07, + "loss": 0.8596, + "step": 4725 + }, + { + "epoch": 0.07363758786303097, + "grad_norm": 3.4971349239349365, + "learning_rate": 9.751397060028514e-07, + "loss": 0.7888, + "step": 4730 + }, + { + "epoch": 0.07371542886500035, + "grad_norm": 2.9952375888824463, + "learning_rate": 9.750577669982464e-07, + "loss": 0.8127, + "step": 4735 + }, + { + "epoch": 0.07379326986696973, + "grad_norm": 3.0156424045562744, + "learning_rate": 9.749758279936415e-07, + "loss": 0.6775, + "step": 4740 + }, + { + "epoch": 0.0738711108689391, + "grad_norm": 4.386186122894287, + "learning_rate": 9.748938889890366e-07, + "loss": 0.8813, + "step": 4745 + }, + { + "epoch": 0.07394895187090848, + "grad_norm": 8.352777481079102, + "learning_rate": 9.748119499844314e-07, + "loss": 0.841, + "step": 4750 + }, + { + "epoch": 0.07402679287287786, + "grad_norm": 3.9071156978607178, + "learning_rate": 9.747300109798265e-07, + "loss": 0.8124, + "step": 4755 + }, + { + "epoch": 0.07410463387484724, + "grad_norm": 6.337040901184082, + "learning_rate": 9.746480719752216e-07, + "loss": 0.7601, + "step": 4760 + }, + { + "epoch": 0.07418247487681662, + "grad_norm": 4.740725040435791, + "learning_rate": 9.745661329706167e-07, + "loss": 0.8621, + "step": 4765 + }, + { + "epoch": 0.074260315878786, + "grad_norm": 3.6366703510284424, + "learning_rate": 9.744841939660117e-07, + "loss": 0.7521, + "step": 4770 + }, + { + "epoch": 0.07433815688075537, + "grad_norm": 5.869968891143799, + "learning_rate": 9.744022549614068e-07, + "loss": 0.8219, + "step": 4775 + }, + { + "epoch": 0.07441599788272475, + "grad_norm": 4.8249006271362305, + "learning_rate": 9.743203159568017e-07, + "loss": 0.8733, + "step": 4780 + }, + { + "epoch": 0.07449383888469413, + "grad_norm": 3.9930624961853027, + "learning_rate": 9.742383769521968e-07, + "loss": 0.7968, + "step": 4785 + }, + { + "epoch": 0.0745716798866635, + "grad_norm": 5.8335418701171875, + "learning_rate": 9.741564379475918e-07, + "loss": 0.8224, + "step": 4790 + }, + { + "epoch": 0.07464952088863287, + "grad_norm": 5.657021522521973, + "learning_rate": 9.740744989429867e-07, + "loss": 0.7896, + "step": 4795 + }, + { + "epoch": 0.07472736189060225, + "grad_norm": 6.225119590759277, + "learning_rate": 9.739925599383818e-07, + "loss": 0.8297, + "step": 4800 + }, + { + "epoch": 0.07480520289257163, + "grad_norm": 3.373596429824829, + "learning_rate": 9.739106209337768e-07, + "loss": 0.8255, + "step": 4805 + }, + { + "epoch": 0.07488304389454101, + "grad_norm": 2.2436752319335938, + "learning_rate": 9.73828681929172e-07, + "loss": 0.7097, + "step": 4810 + }, + { + "epoch": 0.07496088489651039, + "grad_norm": 3.6879262924194336, + "learning_rate": 9.73746742924567e-07, + "loss": 0.6912, + "step": 4815 + }, + { + "epoch": 0.07503872589847976, + "grad_norm": 3.399632692337036, + "learning_rate": 9.73664803919962e-07, + "loss": 0.8371, + "step": 4820 + }, + { + "epoch": 0.07511656690044914, + "grad_norm": 9.186985969543457, + "learning_rate": 9.73582864915357e-07, + "loss": 0.8157, + "step": 4825 + }, + { + "epoch": 0.07519440790241852, + "grad_norm": 3.6295411586761475, + "learning_rate": 9.73500925910752e-07, + "loss": 0.7785, + "step": 4830 + }, + { + "epoch": 0.0752722489043879, + "grad_norm": 3.534175395965576, + "learning_rate": 9.73418986906147e-07, + "loss": 0.878, + "step": 4835 + }, + { + "epoch": 0.07535008990635728, + "grad_norm": 3.0165436267852783, + "learning_rate": 9.73337047901542e-07, + "loss": 0.7593, + "step": 4840 + }, + { + "epoch": 0.07542793090832665, + "grad_norm": 4.980969429016113, + "learning_rate": 9.73255108896937e-07, + "loss": 0.7592, + "step": 4845 + }, + { + "epoch": 0.07550577191029603, + "grad_norm": 3.377429723739624, + "learning_rate": 9.73173169892332e-07, + "loss": 0.7849, + "step": 4850 + }, + { + "epoch": 0.07558361291226541, + "grad_norm": 5.916225910186768, + "learning_rate": 9.730912308877272e-07, + "loss": 0.8283, + "step": 4855 + }, + { + "epoch": 0.07566145391423479, + "grad_norm": 6.396664619445801, + "learning_rate": 9.730092918831222e-07, + "loss": 0.8801, + "step": 4860 + }, + { + "epoch": 0.07573929491620417, + "grad_norm": 3.2279000282287598, + "learning_rate": 9.729273528785173e-07, + "loss": 0.7746, + "step": 4865 + }, + { + "epoch": 0.07581713591817354, + "grad_norm": 3.3522236347198486, + "learning_rate": 9.728454138739122e-07, + "loss": 1.0139, + "step": 4870 + }, + { + "epoch": 0.07589497692014291, + "grad_norm": 7.16496467590332, + "learning_rate": 9.727634748693073e-07, + "loss": 0.8203, + "step": 4875 + }, + { + "epoch": 0.07597281792211229, + "grad_norm": 3.7520346641540527, + "learning_rate": 9.726815358647023e-07, + "loss": 0.7859, + "step": 4880 + }, + { + "epoch": 0.07605065892408167, + "grad_norm": 5.511653900146484, + "learning_rate": 9.725995968600972e-07, + "loss": 0.8594, + "step": 4885 + }, + { + "epoch": 0.07612849992605104, + "grad_norm": 5.89841365814209, + "learning_rate": 9.725176578554923e-07, + "loss": 0.6535, + "step": 4890 + }, + { + "epoch": 0.07620634092802042, + "grad_norm": 4.694098949432373, + "learning_rate": 9.724357188508873e-07, + "loss": 0.8221, + "step": 4895 + }, + { + "epoch": 0.0762841819299898, + "grad_norm": 4.192508220672607, + "learning_rate": 9.723537798462824e-07, + "loss": 0.8594, + "step": 4900 + }, + { + "epoch": 0.07636202293195918, + "grad_norm": 4.4052534103393555, + "learning_rate": 9.722718408416775e-07, + "loss": 0.7326, + "step": 4905 + }, + { + "epoch": 0.07643986393392856, + "grad_norm": 4.1674299240112305, + "learning_rate": 9.721899018370724e-07, + "loss": 0.7525, + "step": 4910 + }, + { + "epoch": 0.07651770493589793, + "grad_norm": 4.367162227630615, + "learning_rate": 9.721079628324674e-07, + "loss": 0.9038, + "step": 4915 + }, + { + "epoch": 0.07659554593786731, + "grad_norm": 3.0957272052764893, + "learning_rate": 9.720260238278625e-07, + "loss": 0.8247, + "step": 4920 + }, + { + "epoch": 0.07667338693983669, + "grad_norm": 6.5740532875061035, + "learning_rate": 9.719440848232576e-07, + "loss": 0.7707, + "step": 4925 + }, + { + "epoch": 0.07675122794180607, + "grad_norm": 5.08697509765625, + "learning_rate": 9.718621458186527e-07, + "loss": 0.7561, + "step": 4930 + }, + { + "epoch": 0.07682906894377545, + "grad_norm": 6.9134063720703125, + "learning_rate": 9.717802068140475e-07, + "loss": 0.7645, + "step": 4935 + }, + { + "epoch": 0.07690690994574483, + "grad_norm": 3.2047367095947266, + "learning_rate": 9.716982678094426e-07, + "loss": 0.854, + "step": 4940 + }, + { + "epoch": 0.0769847509477142, + "grad_norm": 3.4643442630767822, + "learning_rate": 9.716163288048377e-07, + "loss": 0.7598, + "step": 4945 + }, + { + "epoch": 0.07706259194968358, + "grad_norm": 5.208106517791748, + "learning_rate": 9.715343898002325e-07, + "loss": 0.7699, + "step": 4950 + }, + { + "epoch": 0.07714043295165296, + "grad_norm": 4.935080051422119, + "learning_rate": 9.714524507956276e-07, + "loss": 0.8839, + "step": 4955 + }, + { + "epoch": 0.07721827395362234, + "grad_norm": 4.052170753479004, + "learning_rate": 9.713705117910227e-07, + "loss": 0.841, + "step": 4960 + }, + { + "epoch": 0.0772961149555917, + "grad_norm": 3.409742593765259, + "learning_rate": 9.712885727864178e-07, + "loss": 0.8768, + "step": 4965 + }, + { + "epoch": 0.07737395595756108, + "grad_norm": 5.577835559844971, + "learning_rate": 9.712066337818128e-07, + "loss": 0.7609, + "step": 4970 + }, + { + "epoch": 0.07745179695953046, + "grad_norm": 4.966437816619873, + "learning_rate": 9.71124694777208e-07, + "loss": 0.9478, + "step": 4975 + }, + { + "epoch": 0.07752963796149984, + "grad_norm": 5.092791557312012, + "learning_rate": 9.710427557726028e-07, + "loss": 0.8237, + "step": 4980 + }, + { + "epoch": 0.07760747896346921, + "grad_norm": 2.992233991622925, + "learning_rate": 9.709608167679978e-07, + "loss": 0.8486, + "step": 4985 + }, + { + "epoch": 0.07768531996543859, + "grad_norm": 3.947547197341919, + "learning_rate": 9.70878877763393e-07, + "loss": 1.0368, + "step": 4990 + }, + { + "epoch": 0.07776316096740797, + "grad_norm": 4.660312652587891, + "learning_rate": 9.707969387587878e-07, + "loss": 0.7254, + "step": 4995 + }, + { + "epoch": 0.07784100196937735, + "grad_norm": 5.542099475860596, + "learning_rate": 9.707149997541829e-07, + "loss": 0.7871, + "step": 5000 + }, + { + "epoch": 0.07791884297134673, + "grad_norm": 2.8513717651367188, + "learning_rate": 9.70633060749578e-07, + "loss": 0.7405, + "step": 5005 + }, + { + "epoch": 0.0779966839733161, + "grad_norm": 3.067697286605835, + "learning_rate": 9.70551121744973e-07, + "loss": 0.8251, + "step": 5010 + }, + { + "epoch": 0.07807452497528548, + "grad_norm": 4.706809043884277, + "learning_rate": 9.70469182740368e-07, + "loss": 0.894, + "step": 5015 + }, + { + "epoch": 0.07815236597725486, + "grad_norm": 3.1183722019195557, + "learning_rate": 9.703872437357632e-07, + "loss": 0.7985, + "step": 5020 + }, + { + "epoch": 0.07823020697922424, + "grad_norm": 3.95314884185791, + "learning_rate": 9.703053047311582e-07, + "loss": 0.8673, + "step": 5025 + }, + { + "epoch": 0.07830804798119362, + "grad_norm": 5.186405658721924, + "learning_rate": 9.70223365726553e-07, + "loss": 0.8834, + "step": 5030 + }, + { + "epoch": 0.078385888983163, + "grad_norm": 6.825442790985107, + "learning_rate": 9.701414267219482e-07, + "loss": 0.8638, + "step": 5035 + }, + { + "epoch": 0.07846372998513237, + "grad_norm": 4.547275543212891, + "learning_rate": 9.700594877173432e-07, + "loss": 0.7591, + "step": 5040 + }, + { + "epoch": 0.07854157098710175, + "grad_norm": 3.055347204208374, + "learning_rate": 9.69977548712738e-07, + "loss": 0.7955, + "step": 5045 + }, + { + "epoch": 0.07861941198907112, + "grad_norm": 6.120547294616699, + "learning_rate": 9.698956097081332e-07, + "loss": 0.8606, + "step": 5050 + }, + { + "epoch": 0.0786972529910405, + "grad_norm": 8.125998497009277, + "learning_rate": 9.698136707035283e-07, + "loss": 0.7676, + "step": 5055 + }, + { + "epoch": 0.07877509399300987, + "grad_norm": 3.794414758682251, + "learning_rate": 9.697317316989233e-07, + "loss": 0.8594, + "step": 5060 + }, + { + "epoch": 0.07885293499497925, + "grad_norm": 4.892978191375732, + "learning_rate": 9.696497926943184e-07, + "loss": 0.774, + "step": 5065 + }, + { + "epoch": 0.07893077599694863, + "grad_norm": 4.139584064483643, + "learning_rate": 9.695678536897135e-07, + "loss": 0.83, + "step": 5070 + }, + { + "epoch": 0.07900861699891801, + "grad_norm": 7.144068241119385, + "learning_rate": 9.694859146851083e-07, + "loss": 0.8541, + "step": 5075 + }, + { + "epoch": 0.07908645800088739, + "grad_norm": 10.283439636230469, + "learning_rate": 9.694039756805034e-07, + "loss": 0.9355, + "step": 5080 + }, + { + "epoch": 0.07916429900285676, + "grad_norm": 3.0185656547546387, + "learning_rate": 9.693220366758985e-07, + "loss": 0.8761, + "step": 5085 + }, + { + "epoch": 0.07924214000482614, + "grad_norm": 3.299808979034424, + "learning_rate": 9.692400976712934e-07, + "loss": 0.715, + "step": 5090 + }, + { + "epoch": 0.07931998100679552, + "grad_norm": 7.163717746734619, + "learning_rate": 9.691581586666884e-07, + "loss": 0.6974, + "step": 5095 + }, + { + "epoch": 0.0793978220087649, + "grad_norm": 3.0995216369628906, + "learning_rate": 9.690762196620835e-07, + "loss": 0.8159, + "step": 5100 + }, + { + "epoch": 0.07947566301073428, + "grad_norm": 2.7312302589416504, + "learning_rate": 9.689942806574786e-07, + "loss": 0.8009, + "step": 5105 + }, + { + "epoch": 0.07955350401270365, + "grad_norm": 4.884325981140137, + "learning_rate": 9.689123416528737e-07, + "loss": 0.8839, + "step": 5110 + }, + { + "epoch": 0.07963134501467303, + "grad_norm": 3.1511213779449463, + "learning_rate": 9.688304026482685e-07, + "loss": 0.6633, + "step": 5115 + }, + { + "epoch": 0.07970918601664241, + "grad_norm": 3.034996271133423, + "learning_rate": 9.687484636436636e-07, + "loss": 0.8636, + "step": 5120 + }, + { + "epoch": 0.07978702701861179, + "grad_norm": 6.756342887878418, + "learning_rate": 9.686665246390587e-07, + "loss": 0.844, + "step": 5125 + }, + { + "epoch": 0.07986486802058117, + "grad_norm": 4.012609958648682, + "learning_rate": 9.685845856344537e-07, + "loss": 1.0347, + "step": 5130 + }, + { + "epoch": 0.07994270902255053, + "grad_norm": 4.039714336395264, + "learning_rate": 9.685026466298486e-07, + "loss": 0.8549, + "step": 5135 + }, + { + "epoch": 0.08002055002451991, + "grad_norm": 4.654749393463135, + "learning_rate": 9.684207076252437e-07, + "loss": 0.7294, + "step": 5140 + }, + { + "epoch": 0.08009839102648929, + "grad_norm": 5.652122497558594, + "learning_rate": 9.683387686206388e-07, + "loss": 0.8808, + "step": 5145 + }, + { + "epoch": 0.08017623202845867, + "grad_norm": 5.13718318939209, + "learning_rate": 9.682568296160338e-07, + "loss": 0.8012, + "step": 5150 + }, + { + "epoch": 0.08025407303042804, + "grad_norm": 4.274785995483398, + "learning_rate": 9.681748906114287e-07, + "loss": 0.9497, + "step": 5155 + }, + { + "epoch": 0.08033191403239742, + "grad_norm": 3.5715765953063965, + "learning_rate": 9.680929516068238e-07, + "loss": 0.6932, + "step": 5160 + }, + { + "epoch": 0.0804097550343668, + "grad_norm": 3.721369504928589, + "learning_rate": 9.680110126022188e-07, + "loss": 0.7604, + "step": 5165 + }, + { + "epoch": 0.08048759603633618, + "grad_norm": 4.815948486328125, + "learning_rate": 9.67929073597614e-07, + "loss": 0.8056, + "step": 5170 + }, + { + "epoch": 0.08056543703830556, + "grad_norm": 3.9973649978637695, + "learning_rate": 9.67847134593009e-07, + "loss": 0.8245, + "step": 5175 + }, + { + "epoch": 0.08064327804027493, + "grad_norm": 6.30864143371582, + "learning_rate": 9.67765195588404e-07, + "loss": 0.8158, + "step": 5180 + }, + { + "epoch": 0.08072111904224431, + "grad_norm": 3.627049207687378, + "learning_rate": 9.67683256583799e-07, + "loss": 0.6924, + "step": 5185 + }, + { + "epoch": 0.08079896004421369, + "grad_norm": 3.445680618286133, + "learning_rate": 9.67601317579194e-07, + "loss": 0.7619, + "step": 5190 + }, + { + "epoch": 0.08087680104618307, + "grad_norm": 5.6612868309021, + "learning_rate": 9.67519378574589e-07, + "loss": 0.8712, + "step": 5195 + }, + { + "epoch": 0.08095464204815245, + "grad_norm": 8.172099113464355, + "learning_rate": 9.67437439569984e-07, + "loss": 0.875, + "step": 5200 + }, + { + "epoch": 0.08103248305012183, + "grad_norm": 3.6549482345581055, + "learning_rate": 9.67355500565379e-07, + "loss": 0.7416, + "step": 5205 + }, + { + "epoch": 0.0811103240520912, + "grad_norm": 4.237252712249756, + "learning_rate": 9.67273561560774e-07, + "loss": 0.7864, + "step": 5210 + }, + { + "epoch": 0.08118816505406058, + "grad_norm": 3.6416895389556885, + "learning_rate": 9.671916225561692e-07, + "loss": 0.8346, + "step": 5215 + }, + { + "epoch": 0.08126600605602995, + "grad_norm": 7.055088996887207, + "learning_rate": 9.671096835515642e-07, + "loss": 0.8257, + "step": 5220 + }, + { + "epoch": 0.08134384705799932, + "grad_norm": 4.3031511306762695, + "learning_rate": 9.670277445469593e-07, + "loss": 0.9243, + "step": 5225 + }, + { + "epoch": 0.0814216880599687, + "grad_norm": 12.051529884338379, + "learning_rate": 9.669458055423542e-07, + "loss": 0.8012, + "step": 5230 + }, + { + "epoch": 0.08149952906193808, + "grad_norm": 3.5274226665496826, + "learning_rate": 9.668638665377493e-07, + "loss": 0.8752, + "step": 5235 + }, + { + "epoch": 0.08157737006390746, + "grad_norm": 3.1642568111419678, + "learning_rate": 9.667819275331443e-07, + "loss": 0.7385, + "step": 5240 + }, + { + "epoch": 0.08165521106587684, + "grad_norm": 3.645951271057129, + "learning_rate": 9.666999885285392e-07, + "loss": 0.7538, + "step": 5245 + }, + { + "epoch": 0.08173305206784622, + "grad_norm": 5.045301914215088, + "learning_rate": 9.666180495239343e-07, + "loss": 0.8496, + "step": 5250 + }, + { + "epoch": 0.0818108930698156, + "grad_norm": 3.8335864543914795, + "learning_rate": 9.665361105193293e-07, + "loss": 0.8149, + "step": 5255 + }, + { + "epoch": 0.08188873407178497, + "grad_norm": 5.525310516357422, + "learning_rate": 9.664541715147244e-07, + "loss": 0.8061, + "step": 5260 + }, + { + "epoch": 0.08196657507375435, + "grad_norm": 3.721007823944092, + "learning_rate": 9.663722325101195e-07, + "loss": 0.7835, + "step": 5265 + }, + { + "epoch": 0.08204441607572373, + "grad_norm": 4.0820393562316895, + "learning_rate": 9.662902935055146e-07, + "loss": 0.8629, + "step": 5270 + }, + { + "epoch": 0.0821222570776931, + "grad_norm": 2.5007712841033936, + "learning_rate": 9.662083545009094e-07, + "loss": 0.8987, + "step": 5275 + }, + { + "epoch": 0.08220009807966248, + "grad_norm": 5.49976110458374, + "learning_rate": 9.661264154963045e-07, + "loss": 0.8664, + "step": 5280 + }, + { + "epoch": 0.08227793908163186, + "grad_norm": 3.953249931335449, + "learning_rate": 9.660444764916996e-07, + "loss": 0.8364, + "step": 5285 + }, + { + "epoch": 0.08235578008360124, + "grad_norm": 5.422050476074219, + "learning_rate": 9.659625374870947e-07, + "loss": 0.825, + "step": 5290 + }, + { + "epoch": 0.08243362108557062, + "grad_norm": 6.019737720489502, + "learning_rate": 9.658805984824895e-07, + "loss": 0.7429, + "step": 5295 + }, + { + "epoch": 0.08251146208754, + "grad_norm": 4.360890865325928, + "learning_rate": 9.657986594778846e-07, + "loss": 0.8384, + "step": 5300 + }, + { + "epoch": 0.08258930308950937, + "grad_norm": 2.676135540008545, + "learning_rate": 9.657167204732797e-07, + "loss": 0.7928, + "step": 5305 + }, + { + "epoch": 0.08266714409147874, + "grad_norm": 2.602173328399658, + "learning_rate": 9.656347814686747e-07, + "loss": 0.8301, + "step": 5310 + }, + { + "epoch": 0.08274498509344812, + "grad_norm": 3.2521512508392334, + "learning_rate": 9.655528424640696e-07, + "loss": 0.8491, + "step": 5315 + }, + { + "epoch": 0.0828228260954175, + "grad_norm": 3.9603660106658936, + "learning_rate": 9.654709034594647e-07, + "loss": 0.9349, + "step": 5320 + }, + { + "epoch": 0.08290066709738687, + "grad_norm": 3.615999698638916, + "learning_rate": 9.653889644548598e-07, + "loss": 0.7493, + "step": 5325 + }, + { + "epoch": 0.08297850809935625, + "grad_norm": 4.19753360748291, + "learning_rate": 9.653070254502548e-07, + "loss": 0.8548, + "step": 5330 + }, + { + "epoch": 0.08305634910132563, + "grad_norm": 3.5472726821899414, + "learning_rate": 9.6522508644565e-07, + "loss": 0.6982, + "step": 5335 + }, + { + "epoch": 0.08313419010329501, + "grad_norm": 8.160552024841309, + "learning_rate": 9.651431474410448e-07, + "loss": 0.801, + "step": 5340 + }, + { + "epoch": 0.08321203110526439, + "grad_norm": 5.538876056671143, + "learning_rate": 9.650612084364398e-07, + "loss": 0.7735, + "step": 5345 + }, + { + "epoch": 0.08328987210723376, + "grad_norm": 5.047536849975586, + "learning_rate": 9.64979269431835e-07, + "loss": 0.9173, + "step": 5350 + }, + { + "epoch": 0.08336771310920314, + "grad_norm": 3.526073932647705, + "learning_rate": 9.648973304272298e-07, + "loss": 0.9703, + "step": 5355 + }, + { + "epoch": 0.08344555411117252, + "grad_norm": 12.305222511291504, + "learning_rate": 9.648153914226249e-07, + "loss": 0.8025, + "step": 5360 + }, + { + "epoch": 0.0835233951131419, + "grad_norm": 4.003148078918457, + "learning_rate": 9.6473345241802e-07, + "loss": 0.8206, + "step": 5365 + }, + { + "epoch": 0.08360123611511128, + "grad_norm": 3.3531124591827393, + "learning_rate": 9.64651513413415e-07, + "loss": 0.7922, + "step": 5370 + }, + { + "epoch": 0.08367907711708066, + "grad_norm": 3.3483853340148926, + "learning_rate": 9.6456957440881e-07, + "loss": 0.805, + "step": 5375 + }, + { + "epoch": 0.08375691811905003, + "grad_norm": 3.580211639404297, + "learning_rate": 9.644876354042052e-07, + "loss": 0.8038, + "step": 5380 + }, + { + "epoch": 0.08383475912101941, + "grad_norm": 4.441928863525391, + "learning_rate": 9.644056963996e-07, + "loss": 0.8188, + "step": 5385 + }, + { + "epoch": 0.08391260012298879, + "grad_norm": 4.342660903930664, + "learning_rate": 9.64323757394995e-07, + "loss": 0.9376, + "step": 5390 + }, + { + "epoch": 0.08399044112495815, + "grad_norm": 3.5513997077941895, + "learning_rate": 9.642418183903902e-07, + "loss": 0.8407, + "step": 5395 + }, + { + "epoch": 0.08406828212692753, + "grad_norm": 3.7131507396698, + "learning_rate": 9.64159879385785e-07, + "loss": 0.8832, + "step": 5400 + }, + { + "epoch": 0.08414612312889691, + "grad_norm": 4.675576686859131, + "learning_rate": 9.6407794038118e-07, + "loss": 0.8137, + "step": 5405 + }, + { + "epoch": 0.08422396413086629, + "grad_norm": 5.775442600250244, + "learning_rate": 9.639960013765752e-07, + "loss": 0.8705, + "step": 5410 + }, + { + "epoch": 0.08430180513283567, + "grad_norm": 3.2232508659362793, + "learning_rate": 9.639140623719703e-07, + "loss": 0.8266, + "step": 5415 + }, + { + "epoch": 0.08437964613480504, + "grad_norm": 3.5636298656463623, + "learning_rate": 9.638321233673653e-07, + "loss": 0.739, + "step": 5420 + }, + { + "epoch": 0.08445748713677442, + "grad_norm": 6.0133442878723145, + "learning_rate": 9.637501843627604e-07, + "loss": 0.7714, + "step": 5425 + }, + { + "epoch": 0.0845353281387438, + "grad_norm": 3.2928476333618164, + "learning_rate": 9.636682453581555e-07, + "loss": 0.7455, + "step": 5430 + }, + { + "epoch": 0.08461316914071318, + "grad_norm": 3.734174966812134, + "learning_rate": 9.635863063535503e-07, + "loss": 0.8668, + "step": 5435 + }, + { + "epoch": 0.08469101014268256, + "grad_norm": 3.125318765640259, + "learning_rate": 9.635043673489454e-07, + "loss": 0.8308, + "step": 5440 + }, + { + "epoch": 0.08476885114465194, + "grad_norm": 4.821923732757568, + "learning_rate": 9.634224283443405e-07, + "loss": 0.7993, + "step": 5445 + }, + { + "epoch": 0.08484669214662131, + "grad_norm": 3.52372407913208, + "learning_rate": 9.633404893397354e-07, + "loss": 0.9816, + "step": 5450 + }, + { + "epoch": 0.08492453314859069, + "grad_norm": 4.727661609649658, + "learning_rate": 9.632585503351304e-07, + "loss": 0.8819, + "step": 5455 + }, + { + "epoch": 0.08500237415056007, + "grad_norm": 6.401661396026611, + "learning_rate": 9.631766113305255e-07, + "loss": 0.879, + "step": 5460 + }, + { + "epoch": 0.08508021515252945, + "grad_norm": 3.203312873840332, + "learning_rate": 9.630946723259206e-07, + "loss": 0.7982, + "step": 5465 + }, + { + "epoch": 0.08515805615449883, + "grad_norm": 4.19862174987793, + "learning_rate": 9.630127333213157e-07, + "loss": 0.8441, + "step": 5470 + }, + { + "epoch": 0.0852358971564682, + "grad_norm": 3.9910812377929688, + "learning_rate": 9.629307943167107e-07, + "loss": 0.9595, + "step": 5475 + }, + { + "epoch": 0.08531373815843757, + "grad_norm": 3.739917755126953, + "learning_rate": 9.628488553121056e-07, + "loss": 0.9027, + "step": 5480 + }, + { + "epoch": 0.08539157916040695, + "grad_norm": 3.8963537216186523, + "learning_rate": 9.627669163075007e-07, + "loss": 0.7635, + "step": 5485 + }, + { + "epoch": 0.08546942016237633, + "grad_norm": 11.066873550415039, + "learning_rate": 9.626849773028957e-07, + "loss": 0.828, + "step": 5490 + }, + { + "epoch": 0.0855472611643457, + "grad_norm": 5.069997310638428, + "learning_rate": 9.626030382982906e-07, + "loss": 0.892, + "step": 5495 + }, + { + "epoch": 0.08562510216631508, + "grad_norm": 2.1011128425598145, + "learning_rate": 9.625210992936857e-07, + "loss": 0.7157, + "step": 5500 + }, + { + "epoch": 0.08570294316828446, + "grad_norm": 5.490849494934082, + "learning_rate": 9.624391602890808e-07, + "loss": 0.7912, + "step": 5505 + }, + { + "epoch": 0.08578078417025384, + "grad_norm": 5.189328670501709, + "learning_rate": 9.623572212844758e-07, + "loss": 0.6751, + "step": 5510 + }, + { + "epoch": 0.08585862517222322, + "grad_norm": 3.257615089416504, + "learning_rate": 9.62275282279871e-07, + "loss": 0.7616, + "step": 5515 + }, + { + "epoch": 0.0859364661741926, + "grad_norm": 3.8244619369506836, + "learning_rate": 9.621933432752658e-07, + "loss": 0.7356, + "step": 5520 + }, + { + "epoch": 0.08601430717616197, + "grad_norm": 4.616507530212402, + "learning_rate": 9.621114042706608e-07, + "loss": 0.8844, + "step": 5525 + }, + { + "epoch": 0.08609214817813135, + "grad_norm": 8.950932502746582, + "learning_rate": 9.62029465266056e-07, + "loss": 0.8493, + "step": 5530 + }, + { + "epoch": 0.08616998918010073, + "grad_norm": 3.257582187652588, + "learning_rate": 9.61947526261451e-07, + "loss": 0.7523, + "step": 5535 + }, + { + "epoch": 0.0862478301820701, + "grad_norm": 9.792999267578125, + "learning_rate": 9.61865587256846e-07, + "loss": 0.8037, + "step": 5540 + }, + { + "epoch": 0.08632567118403948, + "grad_norm": 3.294633626937866, + "learning_rate": 9.61783648252241e-07, + "loss": 0.7864, + "step": 5545 + }, + { + "epoch": 0.08640351218600886, + "grad_norm": 3.527974843978882, + "learning_rate": 9.61701709247636e-07, + "loss": 0.7226, + "step": 5550 + }, + { + "epoch": 0.08648135318797824, + "grad_norm": 10.905069351196289, + "learning_rate": 9.61619770243031e-07, + "loss": 0.8916, + "step": 5555 + }, + { + "epoch": 0.08655919418994762, + "grad_norm": 5.191342830657959, + "learning_rate": 9.61537831238426e-07, + "loss": 0.7855, + "step": 5560 + }, + { + "epoch": 0.08663703519191698, + "grad_norm": 4.45928430557251, + "learning_rate": 9.61455892233821e-07, + "loss": 0.6989, + "step": 5565 + }, + { + "epoch": 0.08671487619388636, + "grad_norm": 3.573596954345703, + "learning_rate": 9.61373953229216e-07, + "loss": 0.718, + "step": 5570 + }, + { + "epoch": 0.08679271719585574, + "grad_norm": 3.490968942642212, + "learning_rate": 9.612920142246112e-07, + "loss": 0.9475, + "step": 5575 + }, + { + "epoch": 0.08687055819782512, + "grad_norm": 5.315331935882568, + "learning_rate": 9.612100752200062e-07, + "loss": 0.7643, + "step": 5580 + }, + { + "epoch": 0.0869483991997945, + "grad_norm": 5.576305389404297, + "learning_rate": 9.611281362154013e-07, + "loss": 0.8273, + "step": 5585 + }, + { + "epoch": 0.08702624020176387, + "grad_norm": 3.3249528408050537, + "learning_rate": 9.610461972107962e-07, + "loss": 0.7327, + "step": 5590 + }, + { + "epoch": 0.08710408120373325, + "grad_norm": 5.021561622619629, + "learning_rate": 9.609642582061913e-07, + "loss": 0.7693, + "step": 5595 + }, + { + "epoch": 0.08718192220570263, + "grad_norm": 4.7560834884643555, + "learning_rate": 9.608823192015863e-07, + "loss": 0.7056, + "step": 5600 + }, + { + "epoch": 0.08725976320767201, + "grad_norm": 4.182785987854004, + "learning_rate": 9.608003801969812e-07, + "loss": 0.7807, + "step": 5605 + }, + { + "epoch": 0.08733760420964139, + "grad_norm": 3.104510545730591, + "learning_rate": 9.607184411923763e-07, + "loss": 0.771, + "step": 5610 + }, + { + "epoch": 0.08741544521161076, + "grad_norm": 4.316323280334473, + "learning_rate": 9.606365021877713e-07, + "loss": 0.8881, + "step": 5615 + }, + { + "epoch": 0.08749328621358014, + "grad_norm": 4.004445552825928, + "learning_rate": 9.605545631831664e-07, + "loss": 0.8125, + "step": 5620 + }, + { + "epoch": 0.08757112721554952, + "grad_norm": 5.998608112335205, + "learning_rate": 9.604726241785615e-07, + "loss": 0.8896, + "step": 5625 + }, + { + "epoch": 0.0876489682175189, + "grad_norm": 6.251708507537842, + "learning_rate": 9.603906851739566e-07, + "loss": 0.97, + "step": 5630 + }, + { + "epoch": 0.08772680921948828, + "grad_norm": 4.202377796173096, + "learning_rate": 9.603087461693514e-07, + "loss": 0.9106, + "step": 5635 + }, + { + "epoch": 0.08780465022145766, + "grad_norm": 7.748138427734375, + "learning_rate": 9.602268071647465e-07, + "loss": 0.8634, + "step": 5640 + }, + { + "epoch": 0.08788249122342703, + "grad_norm": 5.498707294464111, + "learning_rate": 9.601448681601416e-07, + "loss": 0.7508, + "step": 5645 + }, + { + "epoch": 0.08796033222539641, + "grad_norm": 3.504171133041382, + "learning_rate": 9.600629291555364e-07, + "loss": 0.8638, + "step": 5650 + }, + { + "epoch": 0.08803817322736578, + "grad_norm": 4.243772983551025, + "learning_rate": 9.599809901509315e-07, + "loss": 0.8651, + "step": 5655 + }, + { + "epoch": 0.08811601422933515, + "grad_norm": 2.332878589630127, + "learning_rate": 9.598990511463266e-07, + "loss": 0.6884, + "step": 5660 + }, + { + "epoch": 0.08819385523130453, + "grad_norm": 5.470850944519043, + "learning_rate": 9.598171121417217e-07, + "loss": 0.9159, + "step": 5665 + }, + { + "epoch": 0.08827169623327391, + "grad_norm": 3.167588710784912, + "learning_rate": 9.597351731371167e-07, + "loss": 0.7925, + "step": 5670 + }, + { + "epoch": 0.08834953723524329, + "grad_norm": 8.463876724243164, + "learning_rate": 9.596532341325118e-07, + "loss": 0.773, + "step": 5675 + }, + { + "epoch": 0.08842737823721267, + "grad_norm": 5.318755626678467, + "learning_rate": 9.595712951279067e-07, + "loss": 0.8523, + "step": 5680 + }, + { + "epoch": 0.08850521923918205, + "grad_norm": 8.276546478271484, + "learning_rate": 9.594893561233018e-07, + "loss": 0.7573, + "step": 5685 + }, + { + "epoch": 0.08858306024115142, + "grad_norm": 3.6410884857177734, + "learning_rate": 9.594074171186968e-07, + "loss": 1.0501, + "step": 5690 + }, + { + "epoch": 0.0886609012431208, + "grad_norm": 4.722231388092041, + "learning_rate": 9.59325478114092e-07, + "loss": 0.8005, + "step": 5695 + }, + { + "epoch": 0.08873874224509018, + "grad_norm": 4.808355808258057, + "learning_rate": 9.592435391094868e-07, + "loss": 0.8858, + "step": 5700 + }, + { + "epoch": 0.08881658324705956, + "grad_norm": 3.3222663402557373, + "learning_rate": 9.591616001048818e-07, + "loss": 0.8279, + "step": 5705 + }, + { + "epoch": 0.08889442424902894, + "grad_norm": 6.019637584686279, + "learning_rate": 9.59079661100277e-07, + "loss": 0.8425, + "step": 5710 + }, + { + "epoch": 0.08897226525099831, + "grad_norm": 3.4430840015411377, + "learning_rate": 9.58997722095672e-07, + "loss": 0.8277, + "step": 5715 + }, + { + "epoch": 0.08905010625296769, + "grad_norm": 2.4599595069885254, + "learning_rate": 9.58915783091067e-07, + "loss": 0.8047, + "step": 5720 + }, + { + "epoch": 0.08912794725493707, + "grad_norm": 5.123390197753906, + "learning_rate": 9.58833844086462e-07, + "loss": 0.7134, + "step": 5725 + }, + { + "epoch": 0.08920578825690645, + "grad_norm": 5.264007568359375, + "learning_rate": 9.58751905081857e-07, + "loss": 0.914, + "step": 5730 + }, + { + "epoch": 0.08928362925887583, + "grad_norm": 2.6615512371063232, + "learning_rate": 9.58669966077252e-07, + "loss": 0.8613, + "step": 5735 + }, + { + "epoch": 0.08936147026084519, + "grad_norm": 2.7306411266326904, + "learning_rate": 9.585880270726472e-07, + "loss": 0.6556, + "step": 5740 + }, + { + "epoch": 0.08943931126281457, + "grad_norm": 3.124546766281128, + "learning_rate": 9.58506088068042e-07, + "loss": 0.7402, + "step": 5745 + }, + { + "epoch": 0.08951715226478395, + "grad_norm": 3.2253921031951904, + "learning_rate": 9.58424149063437e-07, + "loss": 0.8324, + "step": 5750 + }, + { + "epoch": 0.08959499326675333, + "grad_norm": 4.765871524810791, + "learning_rate": 9.583422100588322e-07, + "loss": 0.8187, + "step": 5755 + }, + { + "epoch": 0.0896728342687227, + "grad_norm": 5.348093509674072, + "learning_rate": 9.582602710542272e-07, + "loss": 0.8607, + "step": 5760 + }, + { + "epoch": 0.08975067527069208, + "grad_norm": 3.306044340133667, + "learning_rate": 9.581783320496221e-07, + "loss": 0.8503, + "step": 5765 + }, + { + "epoch": 0.08982851627266146, + "grad_norm": 5.045707702636719, + "learning_rate": 9.580963930450172e-07, + "loss": 0.8423, + "step": 5770 + }, + { + "epoch": 0.08990635727463084, + "grad_norm": 6.576409816741943, + "learning_rate": 9.580144540404123e-07, + "loss": 0.9193, + "step": 5775 + }, + { + "epoch": 0.08998419827660022, + "grad_norm": 7.775379180908203, + "learning_rate": 9.579325150358073e-07, + "loss": 0.8257, + "step": 5780 + }, + { + "epoch": 0.0900620392785696, + "grad_norm": 8.183690071105957, + "learning_rate": 9.578505760312024e-07, + "loss": 0.7978, + "step": 5785 + }, + { + "epoch": 0.09013988028053897, + "grad_norm": 5.8509087562561035, + "learning_rate": 9.577686370265975e-07, + "loss": 0.7486, + "step": 5790 + }, + { + "epoch": 0.09021772128250835, + "grad_norm": 7.354578971862793, + "learning_rate": 9.576866980219923e-07, + "loss": 0.8432, + "step": 5795 + }, + { + "epoch": 0.09029556228447773, + "grad_norm": 3.6449766159057617, + "learning_rate": 9.576047590173874e-07, + "loss": 0.8296, + "step": 5800 + }, + { + "epoch": 0.09037340328644711, + "grad_norm": 9.557231903076172, + "learning_rate": 9.575228200127825e-07, + "loss": 0.7974, + "step": 5805 + }, + { + "epoch": 0.09045124428841649, + "grad_norm": 4.868302345275879, + "learning_rate": 9.574408810081774e-07, + "loss": 0.9218, + "step": 5810 + }, + { + "epoch": 0.09052908529038586, + "grad_norm": 4.260608196258545, + "learning_rate": 9.573589420035724e-07, + "loss": 0.7428, + "step": 5815 + }, + { + "epoch": 0.09060692629235524, + "grad_norm": 3.023204803466797, + "learning_rate": 9.572770029989675e-07, + "loss": 0.8903, + "step": 5820 + }, + { + "epoch": 0.0906847672943246, + "grad_norm": 3.036348819732666, + "learning_rate": 9.571950639943626e-07, + "loss": 0.89, + "step": 5825 + }, + { + "epoch": 0.09076260829629398, + "grad_norm": 4.273719310760498, + "learning_rate": 9.571131249897577e-07, + "loss": 0.8611, + "step": 5830 + }, + { + "epoch": 0.09084044929826336, + "grad_norm": 3.51576828956604, + "learning_rate": 9.570311859851527e-07, + "loss": 0.829, + "step": 5835 + }, + { + "epoch": 0.09091829030023274, + "grad_norm": 3.904651641845703, + "learning_rate": 9.569492469805476e-07, + "loss": 0.9588, + "step": 5840 + }, + { + "epoch": 0.09099613130220212, + "grad_norm": 3.824842691421509, + "learning_rate": 9.568673079759427e-07, + "loss": 0.7191, + "step": 5845 + }, + { + "epoch": 0.0910739723041715, + "grad_norm": 5.210089683532715, + "learning_rate": 9.567853689713377e-07, + "loss": 0.8695, + "step": 5850 + }, + { + "epoch": 0.09115181330614087, + "grad_norm": 2.847330093383789, + "learning_rate": 9.567034299667326e-07, + "loss": 0.8151, + "step": 5855 + }, + { + "epoch": 0.09122965430811025, + "grad_norm": 4.297481060028076, + "learning_rate": 9.566214909621277e-07, + "loss": 0.91, + "step": 5860 + }, + { + "epoch": 0.09130749531007963, + "grad_norm": 5.124939918518066, + "learning_rate": 9.565395519575228e-07, + "loss": 0.7646, + "step": 5865 + }, + { + "epoch": 0.09138533631204901, + "grad_norm": 4.084904193878174, + "learning_rate": 9.564576129529178e-07, + "loss": 0.8639, + "step": 5870 + }, + { + "epoch": 0.09146317731401839, + "grad_norm": 7.2979254722595215, + "learning_rate": 9.56375673948313e-07, + "loss": 0.909, + "step": 5875 + }, + { + "epoch": 0.09154101831598777, + "grad_norm": 3.893127202987671, + "learning_rate": 9.56293734943708e-07, + "loss": 0.7701, + "step": 5880 + }, + { + "epoch": 0.09161885931795714, + "grad_norm": 3.9665653705596924, + "learning_rate": 9.562117959391028e-07, + "loss": 0.8257, + "step": 5885 + }, + { + "epoch": 0.09169670031992652, + "grad_norm": 3.298375129699707, + "learning_rate": 9.56129856934498e-07, + "loss": 0.8005, + "step": 5890 + }, + { + "epoch": 0.0917745413218959, + "grad_norm": 3.643336057662964, + "learning_rate": 9.56047917929893e-07, + "loss": 0.8725, + "step": 5895 + }, + { + "epoch": 0.09185238232386528, + "grad_norm": 11.060576438903809, + "learning_rate": 9.559659789252879e-07, + "loss": 0.8681, + "step": 5900 + }, + { + "epoch": 0.09193022332583466, + "grad_norm": 4.433795928955078, + "learning_rate": 9.55884039920683e-07, + "loss": 0.8007, + "step": 5905 + }, + { + "epoch": 0.09200806432780402, + "grad_norm": 6.115171909332275, + "learning_rate": 9.55802100916078e-07, + "loss": 0.73, + "step": 5910 + }, + { + "epoch": 0.0920859053297734, + "grad_norm": 4.329387187957764, + "learning_rate": 9.55720161911473e-07, + "loss": 0.7698, + "step": 5915 + }, + { + "epoch": 0.09216374633174278, + "grad_norm": 4.206638813018799, + "learning_rate": 9.556382229068682e-07, + "loss": 0.7973, + "step": 5920 + }, + { + "epoch": 0.09224158733371216, + "grad_norm": 3.0813913345336914, + "learning_rate": 9.55556283902263e-07, + "loss": 0.7804, + "step": 5925 + }, + { + "epoch": 0.09231942833568153, + "grad_norm": 6.411551475524902, + "learning_rate": 9.55474344897658e-07, + "loss": 0.8881, + "step": 5930 + }, + { + "epoch": 0.09239726933765091, + "grad_norm": 2.5208792686462402, + "learning_rate": 9.553924058930532e-07, + "loss": 0.841, + "step": 5935 + }, + { + "epoch": 0.09247511033962029, + "grad_norm": 2.8447041511535645, + "learning_rate": 9.553104668884482e-07, + "loss": 0.791, + "step": 5940 + }, + { + "epoch": 0.09255295134158967, + "grad_norm": 4.374822616577148, + "learning_rate": 9.552285278838433e-07, + "loss": 0.8153, + "step": 5945 + }, + { + "epoch": 0.09263079234355905, + "grad_norm": 6.252150058746338, + "learning_rate": 9.551465888792382e-07, + "loss": 0.8223, + "step": 5950 + }, + { + "epoch": 0.09270863334552842, + "grad_norm": 3.3018994331359863, + "learning_rate": 9.550646498746333e-07, + "loss": 0.9067, + "step": 5955 + }, + { + "epoch": 0.0927864743474978, + "grad_norm": 4.026679515838623, + "learning_rate": 9.549827108700283e-07, + "loss": 0.7962, + "step": 5960 + }, + { + "epoch": 0.09286431534946718, + "grad_norm": 3.2476413249969482, + "learning_rate": 9.549007718654232e-07, + "loss": 0.8074, + "step": 5965 + }, + { + "epoch": 0.09294215635143656, + "grad_norm": 2.852954149246216, + "learning_rate": 9.548188328608183e-07, + "loss": 0.8071, + "step": 5970 + }, + { + "epoch": 0.09301999735340594, + "grad_norm": 3.4490416049957275, + "learning_rate": 9.547368938562133e-07, + "loss": 0.9519, + "step": 5975 + }, + { + "epoch": 0.09309783835537531, + "grad_norm": 2.473008155822754, + "learning_rate": 9.546549548516084e-07, + "loss": 0.8446, + "step": 5980 + }, + { + "epoch": 0.09317567935734469, + "grad_norm": 7.381313800811768, + "learning_rate": 9.545730158470035e-07, + "loss": 0.802, + "step": 5985 + }, + { + "epoch": 0.09325352035931407, + "grad_norm": 4.133596897125244, + "learning_rate": 9.544910768423986e-07, + "loss": 0.7167, + "step": 5990 + }, + { + "epoch": 0.09333136136128345, + "grad_norm": 4.466127872467041, + "learning_rate": 9.544091378377934e-07, + "loss": 0.9329, + "step": 5995 + }, + { + "epoch": 0.09340920236325281, + "grad_norm": 4.252684593200684, + "learning_rate": 9.543271988331885e-07, + "loss": 0.856, + "step": 6000 + }, + { + "epoch": 0.09348704336522219, + "grad_norm": 3.630127429962158, + "learning_rate": 9.542452598285836e-07, + "loss": 0.8738, + "step": 6005 + }, + { + "epoch": 0.09356488436719157, + "grad_norm": 8.133733749389648, + "learning_rate": 9.541633208239784e-07, + "loss": 0.8339, + "step": 6010 + }, + { + "epoch": 0.09364272536916095, + "grad_norm": 11.18271541595459, + "learning_rate": 9.540813818193735e-07, + "loss": 0.7978, + "step": 6015 + }, + { + "epoch": 0.09372056637113033, + "grad_norm": 6.3515214920043945, + "learning_rate": 9.539994428147686e-07, + "loss": 0.7742, + "step": 6020 + }, + { + "epoch": 0.0937984073730997, + "grad_norm": 3.030446767807007, + "learning_rate": 9.539175038101637e-07, + "loss": 0.6648, + "step": 6025 + }, + { + "epoch": 0.09387624837506908, + "grad_norm": 5.10403299331665, + "learning_rate": 9.538355648055587e-07, + "loss": 0.9393, + "step": 6030 + }, + { + "epoch": 0.09395408937703846, + "grad_norm": 2.5483992099761963, + "learning_rate": 9.537536258009538e-07, + "loss": 0.8155, + "step": 6035 + }, + { + "epoch": 0.09403193037900784, + "grad_norm": 6.4216814041137695, + "learning_rate": 9.536716867963488e-07, + "loss": 0.9805, + "step": 6040 + }, + { + "epoch": 0.09410977138097722, + "grad_norm": 8.188408851623535, + "learning_rate": 9.535897477917439e-07, + "loss": 0.8092, + "step": 6045 + }, + { + "epoch": 0.0941876123829466, + "grad_norm": 4.022781848907471, + "learning_rate": 9.535078087871387e-07, + "loss": 0.8009, + "step": 6050 + }, + { + "epoch": 0.09426545338491597, + "grad_norm": 3.303135871887207, + "learning_rate": 9.534258697825338e-07, + "loss": 0.7763, + "step": 6055 + }, + { + "epoch": 0.09434329438688535, + "grad_norm": 5.49419641494751, + "learning_rate": 9.533439307779289e-07, + "loss": 0.8246, + "step": 6060 + }, + { + "epoch": 0.09442113538885473, + "grad_norm": 4.208410739898682, + "learning_rate": 9.532619917733238e-07, + "loss": 0.8875, + "step": 6065 + }, + { + "epoch": 0.09449897639082411, + "grad_norm": 2.5194616317749023, + "learning_rate": 9.531800527687189e-07, + "loss": 0.8782, + "step": 6070 + }, + { + "epoch": 0.09457681739279349, + "grad_norm": 3.4753055572509766, + "learning_rate": 9.53098113764114e-07, + "loss": 0.7719, + "step": 6075 + }, + { + "epoch": 0.09465465839476286, + "grad_norm": 4.319244861602783, + "learning_rate": 9.53016174759509e-07, + "loss": 0.9141, + "step": 6080 + }, + { + "epoch": 0.09473249939673223, + "grad_norm": 2.9613096714019775, + "learning_rate": 9.52934235754904e-07, + "loss": 0.7196, + "step": 6085 + }, + { + "epoch": 0.0948103403987016, + "grad_norm": 6.506518840789795, + "learning_rate": 9.52852296750299e-07, + "loss": 0.8589, + "step": 6090 + }, + { + "epoch": 0.09488818140067098, + "grad_norm": 7.105751037597656, + "learning_rate": 9.527703577456941e-07, + "loss": 0.8392, + "step": 6095 + }, + { + "epoch": 0.09496602240264036, + "grad_norm": 3.0105667114257812, + "learning_rate": 9.52688418741089e-07, + "loss": 0.6962, + "step": 6100 + }, + { + "epoch": 0.09504386340460974, + "grad_norm": 7.148667812347412, + "learning_rate": 9.526064797364841e-07, + "loss": 0.7958, + "step": 6105 + }, + { + "epoch": 0.09512170440657912, + "grad_norm": 11.00757122039795, + "learning_rate": 9.525245407318792e-07, + "loss": 0.8516, + "step": 6110 + }, + { + "epoch": 0.0951995454085485, + "grad_norm": 5.520313739776611, + "learning_rate": 9.524426017272742e-07, + "loss": 0.9142, + "step": 6115 + }, + { + "epoch": 0.09527738641051788, + "grad_norm": 3.418109893798828, + "learning_rate": 9.523606627226692e-07, + "loss": 0.7326, + "step": 6120 + }, + { + "epoch": 0.09535522741248725, + "grad_norm": 3.9674932956695557, + "learning_rate": 9.522787237180643e-07, + "loss": 0.8561, + "step": 6125 + }, + { + "epoch": 0.09543306841445663, + "grad_norm": 3.5758800506591797, + "learning_rate": 9.521967847134592e-07, + "loss": 0.8119, + "step": 6130 + }, + { + "epoch": 0.09551090941642601, + "grad_norm": 4.39679479598999, + "learning_rate": 9.521148457088543e-07, + "loss": 0.8427, + "step": 6135 + }, + { + "epoch": 0.09558875041839539, + "grad_norm": 5.498544216156006, + "learning_rate": 9.520329067042493e-07, + "loss": 0.8786, + "step": 6140 + }, + { + "epoch": 0.09566659142036477, + "grad_norm": 3.5914194583892822, + "learning_rate": 9.519509676996443e-07, + "loss": 0.8465, + "step": 6145 + }, + { + "epoch": 0.09574443242233414, + "grad_norm": 5.639887809753418, + "learning_rate": 9.518690286950394e-07, + "loss": 0.764, + "step": 6150 + }, + { + "epoch": 0.09582227342430352, + "grad_norm": 7.638282299041748, + "learning_rate": 9.517870896904345e-07, + "loss": 0.858, + "step": 6155 + }, + { + "epoch": 0.0959001144262729, + "grad_norm": 3.7768096923828125, + "learning_rate": 9.517051506858294e-07, + "loss": 0.8739, + "step": 6160 + }, + { + "epoch": 0.09597795542824228, + "grad_norm": 3.388122081756592, + "learning_rate": 9.516232116812245e-07, + "loss": 0.7646, + "step": 6165 + }, + { + "epoch": 0.09605579643021164, + "grad_norm": 3.0460891723632812, + "learning_rate": 9.515412726766195e-07, + "loss": 0.9288, + "step": 6170 + }, + { + "epoch": 0.09613363743218102, + "grad_norm": 3.9041805267333984, + "learning_rate": 9.514593336720144e-07, + "loss": 0.735, + "step": 6175 + }, + { + "epoch": 0.0962114784341504, + "grad_norm": 3.894850254058838, + "learning_rate": 9.513773946674095e-07, + "loss": 0.7104, + "step": 6180 + }, + { + "epoch": 0.09628931943611978, + "grad_norm": 3.6872172355651855, + "learning_rate": 9.512954556628046e-07, + "loss": 0.7743, + "step": 6185 + }, + { + "epoch": 0.09636716043808916, + "grad_norm": 2.9574503898620605, + "learning_rate": 9.512135166581996e-07, + "loss": 0.8923, + "step": 6190 + }, + { + "epoch": 0.09644500144005853, + "grad_norm": 3.6874301433563232, + "learning_rate": 9.511315776535946e-07, + "loss": 0.7798, + "step": 6195 + }, + { + "epoch": 0.09652284244202791, + "grad_norm": 5.110114574432373, + "learning_rate": 9.510496386489897e-07, + "loss": 0.8202, + "step": 6200 + }, + { + "epoch": 0.09660068344399729, + "grad_norm": 4.243130683898926, + "learning_rate": 9.509676996443848e-07, + "loss": 0.9956, + "step": 6205 + }, + { + "epoch": 0.09667852444596667, + "grad_norm": 9.388118743896484, + "learning_rate": 9.508857606397796e-07, + "loss": 0.8103, + "step": 6210 + }, + { + "epoch": 0.09675636544793605, + "grad_norm": 4.399020671844482, + "learning_rate": 9.508038216351747e-07, + "loss": 0.9899, + "step": 6215 + }, + { + "epoch": 0.09683420644990542, + "grad_norm": 5.260294437408447, + "learning_rate": 9.507218826305698e-07, + "loss": 0.7776, + "step": 6220 + }, + { + "epoch": 0.0969120474518748, + "grad_norm": 2.903243064880371, + "learning_rate": 9.506399436259648e-07, + "loss": 0.8065, + "step": 6225 + }, + { + "epoch": 0.09698988845384418, + "grad_norm": 7.704418182373047, + "learning_rate": 9.505580046213598e-07, + "loss": 0.6625, + "step": 6230 + }, + { + "epoch": 0.09706772945581356, + "grad_norm": 3.7152814865112305, + "learning_rate": 9.504760656167549e-07, + "loss": 0.8702, + "step": 6235 + }, + { + "epoch": 0.09714557045778294, + "grad_norm": 6.636418342590332, + "learning_rate": 9.503941266121499e-07, + "loss": 0.7592, + "step": 6240 + }, + { + "epoch": 0.09722341145975232, + "grad_norm": 5.01901388168335, + "learning_rate": 9.50312187607545e-07, + "loss": 0.8377, + "step": 6245 + }, + { + "epoch": 0.0973012524617217, + "grad_norm": 6.149816989898682, + "learning_rate": 9.502302486029399e-07, + "loss": 0.7698, + "step": 6250 + }, + { + "epoch": 0.09737909346369107, + "grad_norm": 4.017423152923584, + "learning_rate": 9.501483095983349e-07, + "loss": 0.7522, + "step": 6255 + }, + { + "epoch": 0.09745693446566044, + "grad_norm": 3.395038366317749, + "learning_rate": 9.5006637059373e-07, + "loss": 0.7814, + "step": 6260 + }, + { + "epoch": 0.09753477546762981, + "grad_norm": 8.359529495239258, + "learning_rate": 9.49984431589125e-07, + "loss": 0.8817, + "step": 6265 + }, + { + "epoch": 0.09761261646959919, + "grad_norm": 5.801593780517578, + "learning_rate": 9.4990249258452e-07, + "loss": 0.8346, + "step": 6270 + }, + { + "epoch": 0.09769045747156857, + "grad_norm": 3.440136432647705, + "learning_rate": 9.498205535799151e-07, + "loss": 0.7793, + "step": 6275 + }, + { + "epoch": 0.09776829847353795, + "grad_norm": 3.3891918659210205, + "learning_rate": 9.497386145753102e-07, + "loss": 0.7846, + "step": 6280 + }, + { + "epoch": 0.09784613947550733, + "grad_norm": 3.6862120628356934, + "learning_rate": 9.496566755707051e-07, + "loss": 0.8316, + "step": 6285 + }, + { + "epoch": 0.0979239804774767, + "grad_norm": 4.2608642578125, + "learning_rate": 9.495747365661001e-07, + "loss": 0.8244, + "step": 6290 + }, + { + "epoch": 0.09800182147944608, + "grad_norm": 4.8404459953308105, + "learning_rate": 9.494927975614952e-07, + "loss": 0.7919, + "step": 6295 + }, + { + "epoch": 0.09807966248141546, + "grad_norm": 2.3203227519989014, + "learning_rate": 9.494108585568901e-07, + "loss": 0.7593, + "step": 6300 + }, + { + "epoch": 0.09815750348338484, + "grad_norm": 2.870492935180664, + "learning_rate": 9.493289195522852e-07, + "loss": 0.7388, + "step": 6305 + }, + { + "epoch": 0.09823534448535422, + "grad_norm": 3.6634552478790283, + "learning_rate": 9.492469805476803e-07, + "loss": 0.6632, + "step": 6310 + }, + { + "epoch": 0.0983131854873236, + "grad_norm": 4.569755554199219, + "learning_rate": 9.491650415430753e-07, + "loss": 0.866, + "step": 6315 + }, + { + "epoch": 0.09839102648929297, + "grad_norm": 3.319843053817749, + "learning_rate": 9.490831025384703e-07, + "loss": 0.8812, + "step": 6320 + }, + { + "epoch": 0.09846886749126235, + "grad_norm": 5.647189140319824, + "learning_rate": 9.490011635338654e-07, + "loss": 0.893, + "step": 6325 + }, + { + "epoch": 0.09854670849323173, + "grad_norm": 4.285895347595215, + "learning_rate": 9.489192245292603e-07, + "loss": 0.9623, + "step": 6330 + }, + { + "epoch": 0.09862454949520111, + "grad_norm": 4.257463455200195, + "learning_rate": 9.488372855246553e-07, + "loss": 0.7959, + "step": 6335 + }, + { + "epoch": 0.09870239049717049, + "grad_norm": 4.747158050537109, + "learning_rate": 9.487553465200504e-07, + "loss": 0.6869, + "step": 6340 + }, + { + "epoch": 0.09878023149913985, + "grad_norm": 4.191068172454834, + "learning_rate": 9.486734075154455e-07, + "loss": 0.8163, + "step": 6345 + }, + { + "epoch": 0.09885807250110923, + "grad_norm": 4.583565711975098, + "learning_rate": 9.485914685108405e-07, + "loss": 0.7348, + "step": 6350 + }, + { + "epoch": 0.0989359135030786, + "grad_norm": 3.9108059406280518, + "learning_rate": 9.485095295062355e-07, + "loss": 0.7968, + "step": 6355 + }, + { + "epoch": 0.09901375450504799, + "grad_norm": 5.722688674926758, + "learning_rate": 9.484275905016306e-07, + "loss": 0.8138, + "step": 6360 + }, + { + "epoch": 0.09909159550701736, + "grad_norm": 2.972755193710327, + "learning_rate": 9.483456514970256e-07, + "loss": 0.795, + "step": 6365 + }, + { + "epoch": 0.09916943650898674, + "grad_norm": 8.901226997375488, + "learning_rate": 9.482637124924207e-07, + "loss": 0.7491, + "step": 6370 + }, + { + "epoch": 0.09924727751095612, + "grad_norm": 7.961559772491455, + "learning_rate": 9.481817734878156e-07, + "loss": 1.0205, + "step": 6375 + }, + { + "epoch": 0.0993251185129255, + "grad_norm": 6.042298316955566, + "learning_rate": 9.480998344832106e-07, + "loss": 0.7984, + "step": 6380 + }, + { + "epoch": 0.09940295951489488, + "grad_norm": 10.556697845458984, + "learning_rate": 9.480178954786057e-07, + "loss": 0.8639, + "step": 6385 + }, + { + "epoch": 0.09948080051686425, + "grad_norm": 2.7401647567749023, + "learning_rate": 9.479359564740007e-07, + "loss": 0.799, + "step": 6390 + }, + { + "epoch": 0.09955864151883363, + "grad_norm": 6.288196086883545, + "learning_rate": 9.478540174693957e-07, + "loss": 0.9338, + "step": 6395 + }, + { + "epoch": 0.09963648252080301, + "grad_norm": 4.34282112121582, + "learning_rate": 9.477720784647908e-07, + "loss": 0.8355, + "step": 6400 + }, + { + "epoch": 0.09971432352277239, + "grad_norm": 3.5038483142852783, + "learning_rate": 9.476901394601859e-07, + "loss": 0.7796, + "step": 6405 + }, + { + "epoch": 0.09979216452474177, + "grad_norm": 4.715381622314453, + "learning_rate": 9.476082004555808e-07, + "loss": 0.815, + "step": 6410 + }, + { + "epoch": 0.09987000552671114, + "grad_norm": 3.7928483486175537, + "learning_rate": 9.475262614509758e-07, + "loss": 0.8346, + "step": 6415 + }, + { + "epoch": 0.09994784652868052, + "grad_norm": 5.622752666473389, + "learning_rate": 9.474443224463709e-07, + "loss": 0.8558, + "step": 6420 + }, + { + "epoch": 0.1000256875306499, + "grad_norm": 5.325289726257324, + "learning_rate": 9.473623834417658e-07, + "loss": 0.8078, + "step": 6425 + }, + { + "epoch": 0.10010352853261927, + "grad_norm": 3.5389554500579834, + "learning_rate": 9.472804444371609e-07, + "loss": 0.7461, + "step": 6430 + }, + { + "epoch": 0.10018136953458864, + "grad_norm": 8.74923038482666, + "learning_rate": 9.47198505432556e-07, + "loss": 0.7609, + "step": 6435 + }, + { + "epoch": 0.10025921053655802, + "grad_norm": 4.170187473297119, + "learning_rate": 9.47116566427951e-07, + "loss": 0.8243, + "step": 6440 + }, + { + "epoch": 0.1003370515385274, + "grad_norm": 12.243910789489746, + "learning_rate": 9.47034627423346e-07, + "loss": 0.7365, + "step": 6445 + }, + { + "epoch": 0.10041489254049678, + "grad_norm": 6.56355094909668, + "learning_rate": 9.469526884187411e-07, + "loss": 0.8693, + "step": 6450 + }, + { + "epoch": 0.10049273354246616, + "grad_norm": 4.917191982269287, + "learning_rate": 9.46870749414136e-07, + "loss": 0.8323, + "step": 6455 + }, + { + "epoch": 0.10057057454443553, + "grad_norm": 4.455476760864258, + "learning_rate": 9.467888104095311e-07, + "loss": 0.7539, + "step": 6460 + }, + { + "epoch": 0.10064841554640491, + "grad_norm": 4.135006904602051, + "learning_rate": 9.467068714049261e-07, + "loss": 0.772, + "step": 6465 + }, + { + "epoch": 0.10072625654837429, + "grad_norm": 5.814565658569336, + "learning_rate": 9.466249324003212e-07, + "loss": 0.8389, + "step": 6470 + }, + { + "epoch": 0.10080409755034367, + "grad_norm": 3.4807469844818115, + "learning_rate": 9.465429933957162e-07, + "loss": 0.8365, + "step": 6475 + }, + { + "epoch": 0.10088193855231305, + "grad_norm": 5.241673946380615, + "learning_rate": 9.464610543911112e-07, + "loss": 0.6689, + "step": 6480 + }, + { + "epoch": 0.10095977955428243, + "grad_norm": 6.0900678634643555, + "learning_rate": 9.463791153865063e-07, + "loss": 0.813, + "step": 6485 + }, + { + "epoch": 0.1010376205562518, + "grad_norm": 4.659064769744873, + "learning_rate": 9.462971763819013e-07, + "loss": 0.8624, + "step": 6490 + }, + { + "epoch": 0.10111546155822118, + "grad_norm": 7.9358320236206055, + "learning_rate": 9.462152373772963e-07, + "loss": 0.7209, + "step": 6495 + }, + { + "epoch": 0.10119330256019056, + "grad_norm": 3.9600491523742676, + "learning_rate": 9.461332983726913e-07, + "loss": 0.7995, + "step": 6500 + }, + { + "epoch": 0.10127114356215994, + "grad_norm": 4.832655906677246, + "learning_rate": 9.460513593680863e-07, + "loss": 0.8935, + "step": 6505 + }, + { + "epoch": 0.10134898456412932, + "grad_norm": 4.184332370758057, + "learning_rate": 9.459694203634814e-07, + "loss": 0.7468, + "step": 6510 + }, + { + "epoch": 0.10142682556609868, + "grad_norm": 3.2403645515441895, + "learning_rate": 9.458874813588765e-07, + "loss": 0.7502, + "step": 6515 + }, + { + "epoch": 0.10150466656806806, + "grad_norm": 6.45439338684082, + "learning_rate": 9.458055423542714e-07, + "loss": 0.8359, + "step": 6520 + }, + { + "epoch": 0.10158250757003744, + "grad_norm": 3.9225785732269287, + "learning_rate": 9.457236033496665e-07, + "loss": 0.822, + "step": 6525 + }, + { + "epoch": 0.10166034857200681, + "grad_norm": 6.211043834686279, + "learning_rate": 9.456416643450616e-07, + "loss": 0.7675, + "step": 6530 + }, + { + "epoch": 0.10173818957397619, + "grad_norm": 5.109851360321045, + "learning_rate": 9.455597253404564e-07, + "loss": 0.8539, + "step": 6535 + }, + { + "epoch": 0.10181603057594557, + "grad_norm": 3.1654608249664307, + "learning_rate": 9.454777863358515e-07, + "loss": 0.851, + "step": 6540 + }, + { + "epoch": 0.10189387157791495, + "grad_norm": 6.30355167388916, + "learning_rate": 9.453958473312466e-07, + "loss": 0.8668, + "step": 6545 + }, + { + "epoch": 0.10197171257988433, + "grad_norm": 2.9073293209075928, + "learning_rate": 9.453139083266416e-07, + "loss": 0.8354, + "step": 6550 + }, + { + "epoch": 0.1020495535818537, + "grad_norm": 4.239645481109619, + "learning_rate": 9.452319693220366e-07, + "loss": 0.8394, + "step": 6555 + }, + { + "epoch": 0.10212739458382308, + "grad_norm": 4.341432094573975, + "learning_rate": 9.451500303174317e-07, + "loss": 0.8248, + "step": 6560 + }, + { + "epoch": 0.10220523558579246, + "grad_norm": 5.958523273468018, + "learning_rate": 9.450680913128267e-07, + "loss": 0.7696, + "step": 6565 + }, + { + "epoch": 0.10228307658776184, + "grad_norm": 2.9546141624450684, + "learning_rate": 9.449861523082217e-07, + "loss": 0.8008, + "step": 6570 + }, + { + "epoch": 0.10236091758973122, + "grad_norm": 3.216296672821045, + "learning_rate": 9.449042133036167e-07, + "loss": 0.8286, + "step": 6575 + }, + { + "epoch": 0.1024387585917006, + "grad_norm": 5.784662246704102, + "learning_rate": 9.448222742990117e-07, + "loss": 0.8118, + "step": 6580 + }, + { + "epoch": 0.10251659959366997, + "grad_norm": 18.038049697875977, + "learning_rate": 9.447403352944068e-07, + "loss": 0.7844, + "step": 6585 + }, + { + "epoch": 0.10259444059563935, + "grad_norm": 3.434221029281616, + "learning_rate": 9.446583962898018e-07, + "loss": 0.7745, + "step": 6590 + }, + { + "epoch": 0.10267228159760873, + "grad_norm": 3.0332863330841064, + "learning_rate": 9.445764572851969e-07, + "loss": 0.9026, + "step": 6595 + }, + { + "epoch": 0.10275012259957811, + "grad_norm": 4.459526538848877, + "learning_rate": 9.444945182805919e-07, + "loss": 0.8231, + "step": 6600 + }, + { + "epoch": 0.10282796360154747, + "grad_norm": 11.914100646972656, + "learning_rate": 9.44412579275987e-07, + "loss": 0.7191, + "step": 6605 + }, + { + "epoch": 0.10290580460351685, + "grad_norm": 9.491118431091309, + "learning_rate": 9.44330640271382e-07, + "loss": 0.8438, + "step": 6610 + }, + { + "epoch": 0.10298364560548623, + "grad_norm": 3.312546968460083, + "learning_rate": 9.442487012667769e-07, + "loss": 0.7754, + "step": 6615 + }, + { + "epoch": 0.10306148660745561, + "grad_norm": 2.4198150634765625, + "learning_rate": 9.44166762262172e-07, + "loss": 0.7373, + "step": 6620 + }, + { + "epoch": 0.10313932760942499, + "grad_norm": 3.8953001499176025, + "learning_rate": 9.44084823257567e-07, + "loss": 0.6972, + "step": 6625 + }, + { + "epoch": 0.10321716861139436, + "grad_norm": 3.971245050430298, + "learning_rate": 9.44002884252962e-07, + "loss": 0.7959, + "step": 6630 + }, + { + "epoch": 0.10329500961336374, + "grad_norm": 3.9119505882263184, + "learning_rate": 9.439209452483571e-07, + "loss": 0.8072, + "step": 6635 + }, + { + "epoch": 0.10337285061533312, + "grad_norm": 3.6322784423828125, + "learning_rate": 9.438390062437522e-07, + "loss": 0.8389, + "step": 6640 + }, + { + "epoch": 0.1034506916173025, + "grad_norm": 3.221548557281494, + "learning_rate": 9.437570672391471e-07, + "loss": 0.8113, + "step": 6645 + }, + { + "epoch": 0.10352853261927188, + "grad_norm": 3.768453598022461, + "learning_rate": 9.436751282345422e-07, + "loss": 0.8859, + "step": 6650 + }, + { + "epoch": 0.10360637362124125, + "grad_norm": 3.436704635620117, + "learning_rate": 9.435931892299372e-07, + "loss": 0.9308, + "step": 6655 + }, + { + "epoch": 0.10368421462321063, + "grad_norm": 3.5479848384857178, + "learning_rate": 9.435112502253321e-07, + "loss": 0.8624, + "step": 6660 + }, + { + "epoch": 0.10376205562518001, + "grad_norm": 5.35614538192749, + "learning_rate": 9.434293112207272e-07, + "loss": 0.8095, + "step": 6665 + }, + { + "epoch": 0.10383989662714939, + "grad_norm": 3.0815038681030273, + "learning_rate": 9.433473722161223e-07, + "loss": 0.6846, + "step": 6670 + }, + { + "epoch": 0.10391773762911877, + "grad_norm": 5.047412872314453, + "learning_rate": 9.432654332115173e-07, + "loss": 0.7785, + "step": 6675 + }, + { + "epoch": 0.10399557863108815, + "grad_norm": 4.322173595428467, + "learning_rate": 9.431834942069123e-07, + "loss": 0.8362, + "step": 6680 + }, + { + "epoch": 0.10407341963305752, + "grad_norm": 4.15039587020874, + "learning_rate": 9.431015552023074e-07, + "loss": 0.7475, + "step": 6685 + }, + { + "epoch": 0.10415126063502689, + "grad_norm": 3.9758059978485107, + "learning_rate": 9.430196161977024e-07, + "loss": 0.8058, + "step": 6690 + }, + { + "epoch": 0.10422910163699627, + "grad_norm": 3.778308629989624, + "learning_rate": 9.429376771930975e-07, + "loss": 0.9209, + "step": 6695 + }, + { + "epoch": 0.10430694263896564, + "grad_norm": 7.6523566246032715, + "learning_rate": 9.428557381884924e-07, + "loss": 0.765, + "step": 6700 + }, + { + "epoch": 0.10438478364093502, + "grad_norm": 4.295438289642334, + "learning_rate": 9.427737991838874e-07, + "loss": 0.8556, + "step": 6705 + }, + { + "epoch": 0.1044626246429044, + "grad_norm": 9.01634407043457, + "learning_rate": 9.426918601792825e-07, + "loss": 0.7078, + "step": 6710 + }, + { + "epoch": 0.10454046564487378, + "grad_norm": 4.1538987159729, + "learning_rate": 9.426099211746775e-07, + "loss": 1.0486, + "step": 6715 + }, + { + "epoch": 0.10461830664684316, + "grad_norm": 5.460824489593506, + "learning_rate": 9.425279821700726e-07, + "loss": 0.7296, + "step": 6720 + }, + { + "epoch": 0.10469614764881253, + "grad_norm": 8.995347023010254, + "learning_rate": 9.424460431654676e-07, + "loss": 0.7228, + "step": 6725 + }, + { + "epoch": 0.10477398865078191, + "grad_norm": 3.254420042037964, + "learning_rate": 9.423641041608627e-07, + "loss": 0.8683, + "step": 6730 + }, + { + "epoch": 0.10485182965275129, + "grad_norm": 3.987894058227539, + "learning_rate": 9.422821651562577e-07, + "loss": 0.7663, + "step": 6735 + }, + { + "epoch": 0.10492967065472067, + "grad_norm": 3.244363307952881, + "learning_rate": 9.422002261516526e-07, + "loss": 0.8443, + "step": 6740 + }, + { + "epoch": 0.10500751165669005, + "grad_norm": 4.715000152587891, + "learning_rate": 9.421182871470477e-07, + "loss": 0.8059, + "step": 6745 + }, + { + "epoch": 0.10508535265865943, + "grad_norm": 5.014405727386475, + "learning_rate": 9.420363481424427e-07, + "loss": 0.8381, + "step": 6750 + }, + { + "epoch": 0.1051631936606288, + "grad_norm": 4.085587978363037, + "learning_rate": 9.419544091378377e-07, + "loss": 0.8104, + "step": 6755 + }, + { + "epoch": 0.10524103466259818, + "grad_norm": 12.998879432678223, + "learning_rate": 9.418724701332328e-07, + "loss": 0.8814, + "step": 6760 + }, + { + "epoch": 0.10531887566456756, + "grad_norm": 5.057702541351318, + "learning_rate": 9.417905311286279e-07, + "loss": 0.7355, + "step": 6765 + }, + { + "epoch": 0.10539671666653694, + "grad_norm": 4.116156578063965, + "learning_rate": 9.417085921240228e-07, + "loss": 0.907, + "step": 6770 + }, + { + "epoch": 0.1054745576685063, + "grad_norm": 3.0526468753814697, + "learning_rate": 9.416266531194179e-07, + "loss": 0.832, + "step": 6775 + }, + { + "epoch": 0.10555239867047568, + "grad_norm": 5.218168258666992, + "learning_rate": 9.415447141148129e-07, + "loss": 0.7835, + "step": 6780 + }, + { + "epoch": 0.10563023967244506, + "grad_norm": 7.534468650817871, + "learning_rate": 9.414627751102078e-07, + "loss": 0.8966, + "step": 6785 + }, + { + "epoch": 0.10570808067441444, + "grad_norm": 14.327566146850586, + "learning_rate": 9.413808361056029e-07, + "loss": 0.789, + "step": 6790 + }, + { + "epoch": 0.10578592167638382, + "grad_norm": 4.58953332901001, + "learning_rate": 9.41298897100998e-07, + "loss": 0.8134, + "step": 6795 + }, + { + "epoch": 0.1058637626783532, + "grad_norm": 3.4943652153015137, + "learning_rate": 9.41216958096393e-07, + "loss": 0.9615, + "step": 6800 + }, + { + "epoch": 0.10594160368032257, + "grad_norm": 5.815013408660889, + "learning_rate": 9.41135019091788e-07, + "loss": 0.7671, + "step": 6805 + }, + { + "epoch": 0.10601944468229195, + "grad_norm": 4.9490580558776855, + "learning_rate": 9.410530800871831e-07, + "loss": 0.7993, + "step": 6810 + }, + { + "epoch": 0.10609728568426133, + "grad_norm": 3.030304193496704, + "learning_rate": 9.409711410825781e-07, + "loss": 0.7573, + "step": 6815 + }, + { + "epoch": 0.1061751266862307, + "grad_norm": 3.3977646827697754, + "learning_rate": 9.408892020779731e-07, + "loss": 0.7966, + "step": 6820 + }, + { + "epoch": 0.10625296768820008, + "grad_norm": 9.117260932922363, + "learning_rate": 9.408072630733681e-07, + "loss": 0.8123, + "step": 6825 + }, + { + "epoch": 0.10633080869016946, + "grad_norm": 3.8861453533172607, + "learning_rate": 9.407253240687631e-07, + "loss": 0.8245, + "step": 6830 + }, + { + "epoch": 0.10640864969213884, + "grad_norm": 3.4242775440216064, + "learning_rate": 9.406433850641582e-07, + "loss": 0.7318, + "step": 6835 + }, + { + "epoch": 0.10648649069410822, + "grad_norm": 4.729854106903076, + "learning_rate": 9.405614460595532e-07, + "loss": 0.7959, + "step": 6840 + }, + { + "epoch": 0.1065643316960776, + "grad_norm": 3.1164026260375977, + "learning_rate": 9.404795070549483e-07, + "loss": 0.8327, + "step": 6845 + }, + { + "epoch": 0.10664217269804697, + "grad_norm": 4.031877040863037, + "learning_rate": 9.403975680503433e-07, + "loss": 0.807, + "step": 6850 + }, + { + "epoch": 0.10672001370001635, + "grad_norm": 3.205714702606201, + "learning_rate": 9.403156290457384e-07, + "loss": 0.7962, + "step": 6855 + }, + { + "epoch": 0.10679785470198572, + "grad_norm": 3.2358205318450928, + "learning_rate": 9.402336900411333e-07, + "loss": 0.8694, + "step": 6860 + }, + { + "epoch": 0.1068756957039551, + "grad_norm": 2.7498748302459717, + "learning_rate": 9.401517510365283e-07, + "loss": 0.7554, + "step": 6865 + }, + { + "epoch": 0.10695353670592447, + "grad_norm": 7.2536420822143555, + "learning_rate": 9.400698120319234e-07, + "loss": 0.9135, + "step": 6870 + }, + { + "epoch": 0.10703137770789385, + "grad_norm": 5.090606689453125, + "learning_rate": 9.399878730273185e-07, + "loss": 0.8358, + "step": 6875 + }, + { + "epoch": 0.10710921870986323, + "grad_norm": 3.6972696781158447, + "learning_rate": 9.399059340227134e-07, + "loss": 0.9106, + "step": 6880 + }, + { + "epoch": 0.10718705971183261, + "grad_norm": 3.833972692489624, + "learning_rate": 9.398239950181085e-07, + "loss": 0.9021, + "step": 6885 + }, + { + "epoch": 0.10726490071380199, + "grad_norm": 6.692166805267334, + "learning_rate": 9.397420560135036e-07, + "loss": 0.9014, + "step": 6890 + }, + { + "epoch": 0.10734274171577136, + "grad_norm": 3.5323872566223145, + "learning_rate": 9.396601170088985e-07, + "loss": 0.8122, + "step": 6895 + }, + { + "epoch": 0.10742058271774074, + "grad_norm": 5.148552894592285, + "learning_rate": 9.395781780042935e-07, + "loss": 0.7712, + "step": 6900 + }, + { + "epoch": 0.10749842371971012, + "grad_norm": 4.791245460510254, + "learning_rate": 9.394962389996886e-07, + "loss": 0.8229, + "step": 6905 + }, + { + "epoch": 0.1075762647216795, + "grad_norm": 7.922582149505615, + "learning_rate": 9.394142999950836e-07, + "loss": 0.8641, + "step": 6910 + }, + { + "epoch": 0.10765410572364888, + "grad_norm": 4.787046432495117, + "learning_rate": 9.393323609904786e-07, + "loss": 0.8567, + "step": 6915 + }, + { + "epoch": 0.10773194672561826, + "grad_norm": 7.581035137176514, + "learning_rate": 9.392504219858737e-07, + "loss": 0.9418, + "step": 6920 + }, + { + "epoch": 0.10780978772758763, + "grad_norm": 3.7408881187438965, + "learning_rate": 9.391684829812687e-07, + "loss": 0.7569, + "step": 6925 + }, + { + "epoch": 0.10788762872955701, + "grad_norm": 4.957324981689453, + "learning_rate": 9.390865439766637e-07, + "loss": 0.9865, + "step": 6930 + }, + { + "epoch": 0.10796546973152639, + "grad_norm": 4.249368190765381, + "learning_rate": 9.390046049720588e-07, + "loss": 0.7513, + "step": 6935 + }, + { + "epoch": 0.10804331073349577, + "grad_norm": 4.029480934143066, + "learning_rate": 9.389226659674537e-07, + "loss": 0.8097, + "step": 6940 + }, + { + "epoch": 0.10812115173546515, + "grad_norm": 3.9717726707458496, + "learning_rate": 9.388407269628488e-07, + "loss": 0.8199, + "step": 6945 + }, + { + "epoch": 0.10819899273743451, + "grad_norm": 4.825889587402344, + "learning_rate": 9.387587879582438e-07, + "loss": 0.8086, + "step": 6950 + }, + { + "epoch": 0.10827683373940389, + "grad_norm": 6.288622856140137, + "learning_rate": 9.386768489536388e-07, + "loss": 0.764, + "step": 6955 + }, + { + "epoch": 0.10835467474137327, + "grad_norm": 4.316305637359619, + "learning_rate": 9.385949099490339e-07, + "loss": 0.8855, + "step": 6960 + }, + { + "epoch": 0.10843251574334264, + "grad_norm": 2.9733645915985107, + "learning_rate": 9.38512970944429e-07, + "loss": 0.8997, + "step": 6965 + }, + { + "epoch": 0.10851035674531202, + "grad_norm": 7.586787700653076, + "learning_rate": 9.38431031939824e-07, + "loss": 0.7612, + "step": 6970 + }, + { + "epoch": 0.1085881977472814, + "grad_norm": 6.496944904327393, + "learning_rate": 9.38349092935219e-07, + "loss": 0.8064, + "step": 6975 + }, + { + "epoch": 0.10866603874925078, + "grad_norm": 10.352307319641113, + "learning_rate": 9.382671539306141e-07, + "loss": 0.8054, + "step": 6980 + }, + { + "epoch": 0.10874387975122016, + "grad_norm": 3.3039493560791016, + "learning_rate": 9.38185214926009e-07, + "loss": 0.7777, + "step": 6985 + }, + { + "epoch": 0.10882172075318954, + "grad_norm": 2.823133945465088, + "learning_rate": 9.38103275921404e-07, + "loss": 0.8755, + "step": 6990 + }, + { + "epoch": 0.10889956175515891, + "grad_norm": 5.029725074768066, + "learning_rate": 9.380213369167991e-07, + "loss": 0.7765, + "step": 6995 + }, + { + "epoch": 0.10897740275712829, + "grad_norm": 5.5392889976501465, + "learning_rate": 9.379393979121942e-07, + "loss": 0.8166, + "step": 7000 + }, + { + "epoch": 0.10905524375909767, + "grad_norm": 5.657524585723877, + "learning_rate": 9.378574589075891e-07, + "loss": 0.7781, + "step": 7005 + }, + { + "epoch": 0.10913308476106705, + "grad_norm": 5.030917167663574, + "learning_rate": 9.377755199029842e-07, + "loss": 0.8928, + "step": 7010 + }, + { + "epoch": 0.10921092576303643, + "grad_norm": 4.488454341888428, + "learning_rate": 9.376935808983793e-07, + "loss": 0.835, + "step": 7015 + }, + { + "epoch": 0.1092887667650058, + "grad_norm": 3.592827081680298, + "learning_rate": 9.376116418937743e-07, + "loss": 0.7534, + "step": 7020 + }, + { + "epoch": 0.10936660776697518, + "grad_norm": 6.4170331954956055, + "learning_rate": 9.375297028891692e-07, + "loss": 0.7978, + "step": 7025 + }, + { + "epoch": 0.10944444876894456, + "grad_norm": 3.166126251220703, + "learning_rate": 9.374477638845643e-07, + "loss": 0.8483, + "step": 7030 + }, + { + "epoch": 0.10952228977091392, + "grad_norm": 2.965501070022583, + "learning_rate": 9.373658248799593e-07, + "loss": 0.7824, + "step": 7035 + }, + { + "epoch": 0.1096001307728833, + "grad_norm": 4.2378058433532715, + "learning_rate": 9.372838858753543e-07, + "loss": 0.9217, + "step": 7040 + }, + { + "epoch": 0.10967797177485268, + "grad_norm": 5.209421634674072, + "learning_rate": 9.372019468707494e-07, + "loss": 0.835, + "step": 7045 + }, + { + "epoch": 0.10975581277682206, + "grad_norm": 4.27461576461792, + "learning_rate": 9.371200078661444e-07, + "loss": 0.9258, + "step": 7050 + }, + { + "epoch": 0.10983365377879144, + "grad_norm": 2.5676474571228027, + "learning_rate": 9.370380688615395e-07, + "loss": 0.8691, + "step": 7055 + }, + { + "epoch": 0.10991149478076082, + "grad_norm": 3.422879934310913, + "learning_rate": 9.369561298569345e-07, + "loss": 0.7527, + "step": 7060 + }, + { + "epoch": 0.1099893357827302, + "grad_norm": 4.083531379699707, + "learning_rate": 9.368741908523294e-07, + "loss": 0.7531, + "step": 7065 + }, + { + "epoch": 0.11006717678469957, + "grad_norm": 4.684252738952637, + "learning_rate": 9.367922518477245e-07, + "loss": 0.7772, + "step": 7070 + }, + { + "epoch": 0.11014501778666895, + "grad_norm": 3.0496606826782227, + "learning_rate": 9.367103128431195e-07, + "loss": 0.7993, + "step": 7075 + }, + { + "epoch": 0.11022285878863833, + "grad_norm": 3.641996145248413, + "learning_rate": 9.366283738385145e-07, + "loss": 0.7938, + "step": 7080 + }, + { + "epoch": 0.1103006997906077, + "grad_norm": 3.0637736320495605, + "learning_rate": 9.365464348339096e-07, + "loss": 0.8944, + "step": 7085 + }, + { + "epoch": 0.11037854079257708, + "grad_norm": 4.9412455558776855, + "learning_rate": 9.364644958293047e-07, + "loss": 0.7653, + "step": 7090 + }, + { + "epoch": 0.11045638179454646, + "grad_norm": 4.0071516036987305, + "learning_rate": 9.363825568246997e-07, + "loss": 0.8582, + "step": 7095 + }, + { + "epoch": 0.11053422279651584, + "grad_norm": 3.297551155090332, + "learning_rate": 9.363006178200947e-07, + "loss": 0.6726, + "step": 7100 + }, + { + "epoch": 0.11061206379848522, + "grad_norm": 6.013480186462402, + "learning_rate": 9.362186788154897e-07, + "loss": 0.777, + "step": 7105 + }, + { + "epoch": 0.1106899048004546, + "grad_norm": 4.557566165924072, + "learning_rate": 9.361367398108848e-07, + "loss": 0.9073, + "step": 7110 + }, + { + "epoch": 0.11076774580242398, + "grad_norm": 3.922395706176758, + "learning_rate": 9.360548008062797e-07, + "loss": 0.7608, + "step": 7115 + }, + { + "epoch": 0.11084558680439334, + "grad_norm": 4.4782867431640625, + "learning_rate": 9.359728618016748e-07, + "loss": 0.8138, + "step": 7120 + }, + { + "epoch": 0.11092342780636272, + "grad_norm": 3.320688486099243, + "learning_rate": 9.358909227970699e-07, + "loss": 0.7954, + "step": 7125 + }, + { + "epoch": 0.1110012688083321, + "grad_norm": 14.582179069519043, + "learning_rate": 9.358089837924648e-07, + "loss": 0.8295, + "step": 7130 + }, + { + "epoch": 0.11107910981030147, + "grad_norm": 5.56791877746582, + "learning_rate": 9.357270447878599e-07, + "loss": 0.7778, + "step": 7135 + }, + { + "epoch": 0.11115695081227085, + "grad_norm": 4.387538909912109, + "learning_rate": 9.35645105783255e-07, + "loss": 0.7354, + "step": 7140 + }, + { + "epoch": 0.11123479181424023, + "grad_norm": 3.590179443359375, + "learning_rate": 9.355631667786499e-07, + "loss": 0.7492, + "step": 7145 + }, + { + "epoch": 0.11131263281620961, + "grad_norm": 2.9361941814422607, + "learning_rate": 9.354812277740449e-07, + "loss": 0.9192, + "step": 7150 + }, + { + "epoch": 0.11139047381817899, + "grad_norm": 4.104539394378662, + "learning_rate": 9.3539928876944e-07, + "loss": 0.7818, + "step": 7155 + }, + { + "epoch": 0.11146831482014836, + "grad_norm": 3.3516862392425537, + "learning_rate": 9.35317349764835e-07, + "loss": 0.7937, + "step": 7160 + }, + { + "epoch": 0.11154615582211774, + "grad_norm": 3.5534565448760986, + "learning_rate": 9.3523541076023e-07, + "loss": 0.7168, + "step": 7165 + }, + { + "epoch": 0.11162399682408712, + "grad_norm": 3.8620402812957764, + "learning_rate": 9.351534717556251e-07, + "loss": 0.7666, + "step": 7170 + }, + { + "epoch": 0.1117018378260565, + "grad_norm": 5.330255031585693, + "learning_rate": 9.350715327510201e-07, + "loss": 0.7302, + "step": 7175 + }, + { + "epoch": 0.11177967882802588, + "grad_norm": 3.8225488662719727, + "learning_rate": 9.349895937464152e-07, + "loss": 0.7087, + "step": 7180 + }, + { + "epoch": 0.11185751982999526, + "grad_norm": 4.536187648773193, + "learning_rate": 9.349076547418101e-07, + "loss": 0.6757, + "step": 7185 + }, + { + "epoch": 0.11193536083196463, + "grad_norm": 3.3316333293914795, + "learning_rate": 9.348257157372051e-07, + "loss": 0.8506, + "step": 7190 + }, + { + "epoch": 0.11201320183393401, + "grad_norm": 4.451030731201172, + "learning_rate": 9.347437767326002e-07, + "loss": 0.8501, + "step": 7195 + }, + { + "epoch": 0.11209104283590339, + "grad_norm": 6.453036308288574, + "learning_rate": 9.346618377279953e-07, + "loss": 0.7665, + "step": 7200 + }, + { + "epoch": 0.11216888383787275, + "grad_norm": 3.7804341316223145, + "learning_rate": 9.345798987233902e-07, + "loss": 0.8482, + "step": 7205 + }, + { + "epoch": 0.11224672483984213, + "grad_norm": 4.703028678894043, + "learning_rate": 9.344979597187853e-07, + "loss": 0.8001, + "step": 7210 + }, + { + "epoch": 0.11232456584181151, + "grad_norm": 3.5996668338775635, + "learning_rate": 9.344160207141804e-07, + "loss": 0.8508, + "step": 7215 + }, + { + "epoch": 0.11240240684378089, + "grad_norm": 3.470485210418701, + "learning_rate": 9.343340817095754e-07, + "loss": 0.8929, + "step": 7220 + }, + { + "epoch": 0.11248024784575027, + "grad_norm": 9.36178970336914, + "learning_rate": 9.342521427049703e-07, + "loss": 0.7707, + "step": 7225 + }, + { + "epoch": 0.11255808884771965, + "grad_norm": 7.091135025024414, + "learning_rate": 9.341702037003654e-07, + "loss": 0.9335, + "step": 7230 + }, + { + "epoch": 0.11263592984968902, + "grad_norm": 4.552675247192383, + "learning_rate": 9.340882646957605e-07, + "loss": 0.9234, + "step": 7235 + }, + { + "epoch": 0.1127137708516584, + "grad_norm": 2.9877877235412598, + "learning_rate": 9.340063256911554e-07, + "loss": 0.8213, + "step": 7240 + }, + { + "epoch": 0.11279161185362778, + "grad_norm": 3.49109148979187, + "learning_rate": 9.339243866865505e-07, + "loss": 0.7713, + "step": 7245 + }, + { + "epoch": 0.11286945285559716, + "grad_norm": 3.662997245788574, + "learning_rate": 9.338424476819456e-07, + "loss": 0.8424, + "step": 7250 + }, + { + "epoch": 0.11294729385756654, + "grad_norm": 3.4681499004364014, + "learning_rate": 9.337605086773405e-07, + "loss": 0.9026, + "step": 7255 + }, + { + "epoch": 0.11302513485953591, + "grad_norm": 3.360700845718384, + "learning_rate": 9.336785696727356e-07, + "loss": 0.9131, + "step": 7260 + }, + { + "epoch": 0.11310297586150529, + "grad_norm": 3.470808982849121, + "learning_rate": 9.335966306681306e-07, + "loss": 0.8477, + "step": 7265 + }, + { + "epoch": 0.11318081686347467, + "grad_norm": 4.002136707305908, + "learning_rate": 9.335146916635256e-07, + "loss": 0.8753, + "step": 7270 + }, + { + "epoch": 0.11325865786544405, + "grad_norm": 3.301177978515625, + "learning_rate": 9.334327526589206e-07, + "loss": 0.8266, + "step": 7275 + }, + { + "epoch": 0.11333649886741343, + "grad_norm": 3.6634960174560547, + "learning_rate": 9.333508136543157e-07, + "loss": 0.6648, + "step": 7280 + }, + { + "epoch": 0.1134143398693828, + "grad_norm": 4.124035835266113, + "learning_rate": 9.332688746497107e-07, + "loss": 0.8151, + "step": 7285 + }, + { + "epoch": 0.11349218087135218, + "grad_norm": 5.273459434509277, + "learning_rate": 9.331869356451058e-07, + "loss": 0.7747, + "step": 7290 + }, + { + "epoch": 0.11357002187332155, + "grad_norm": 3.6614978313446045, + "learning_rate": 9.331049966405008e-07, + "loss": 0.8157, + "step": 7295 + }, + { + "epoch": 0.11364786287529093, + "grad_norm": 5.9759368896484375, + "learning_rate": 9.330230576358958e-07, + "loss": 0.7475, + "step": 7300 + }, + { + "epoch": 0.1137257038772603, + "grad_norm": 3.271934747695923, + "learning_rate": 9.329411186312909e-07, + "loss": 0.8176, + "step": 7305 + }, + { + "epoch": 0.11380354487922968, + "grad_norm": 6.224942207336426, + "learning_rate": 9.328591796266858e-07, + "loss": 0.7874, + "step": 7310 + }, + { + "epoch": 0.11388138588119906, + "grad_norm": 4.060842990875244, + "learning_rate": 9.327772406220808e-07, + "loss": 0.7104, + "step": 7315 + }, + { + "epoch": 0.11395922688316844, + "grad_norm": 3.273303985595703, + "learning_rate": 9.326953016174759e-07, + "loss": 0.9288, + "step": 7320 + }, + { + "epoch": 0.11403706788513782, + "grad_norm": 5.721134662628174, + "learning_rate": 9.32613362612871e-07, + "loss": 0.9447, + "step": 7325 + }, + { + "epoch": 0.1141149088871072, + "grad_norm": 3.942401885986328, + "learning_rate": 9.325314236082659e-07, + "loss": 0.6934, + "step": 7330 + }, + { + "epoch": 0.11419274988907657, + "grad_norm": 8.272555351257324, + "learning_rate": 9.32449484603661e-07, + "loss": 0.7917, + "step": 7335 + }, + { + "epoch": 0.11427059089104595, + "grad_norm": 9.704336166381836, + "learning_rate": 9.323675455990561e-07, + "loss": 0.816, + "step": 7340 + }, + { + "epoch": 0.11434843189301533, + "grad_norm": 3.2745420932769775, + "learning_rate": 9.322856065944512e-07, + "loss": 0.8432, + "step": 7345 + }, + { + "epoch": 0.1144262728949847, + "grad_norm": 4.300100803375244, + "learning_rate": 9.32203667589846e-07, + "loss": 0.8954, + "step": 7350 + }, + { + "epoch": 0.11450411389695409, + "grad_norm": 3.520085334777832, + "learning_rate": 9.321217285852411e-07, + "loss": 0.6849, + "step": 7355 + }, + { + "epoch": 0.11458195489892346, + "grad_norm": 3.1472392082214355, + "learning_rate": 9.320397895806362e-07, + "loss": 0.9081, + "step": 7360 + }, + { + "epoch": 0.11465979590089284, + "grad_norm": 5.217727184295654, + "learning_rate": 9.319578505760311e-07, + "loss": 0.9009, + "step": 7365 + }, + { + "epoch": 0.11473763690286222, + "grad_norm": 2.461811065673828, + "learning_rate": 9.318759115714262e-07, + "loss": 0.8018, + "step": 7370 + }, + { + "epoch": 0.1148154779048316, + "grad_norm": 4.464178562164307, + "learning_rate": 9.317939725668213e-07, + "loss": 0.7952, + "step": 7375 + }, + { + "epoch": 0.11489331890680096, + "grad_norm": 8.113191604614258, + "learning_rate": 9.317120335622163e-07, + "loss": 0.8338, + "step": 7380 + }, + { + "epoch": 0.11497115990877034, + "grad_norm": 3.7389204502105713, + "learning_rate": 9.316300945576113e-07, + "loss": 0.7343, + "step": 7385 + }, + { + "epoch": 0.11504900091073972, + "grad_norm": 3.955479145050049, + "learning_rate": 9.315481555530063e-07, + "loss": 0.8868, + "step": 7390 + }, + { + "epoch": 0.1151268419127091, + "grad_norm": 3.5472121238708496, + "learning_rate": 9.314662165484013e-07, + "loss": 0.7299, + "step": 7395 + }, + { + "epoch": 0.11520468291467847, + "grad_norm": 3.3682563304901123, + "learning_rate": 9.313842775437963e-07, + "loss": 0.8079, + "step": 7400 + }, + { + "epoch": 0.11528252391664785, + "grad_norm": 4.840713977813721, + "learning_rate": 9.313023385391914e-07, + "loss": 0.8041, + "step": 7405 + }, + { + "epoch": 0.11536036491861723, + "grad_norm": 6.368528366088867, + "learning_rate": 9.312203995345864e-07, + "loss": 0.8727, + "step": 7410 + }, + { + "epoch": 0.11543820592058661, + "grad_norm": 8.412065505981445, + "learning_rate": 9.311384605299815e-07, + "loss": 0.7993, + "step": 7415 + }, + { + "epoch": 0.11551604692255599, + "grad_norm": 4.714696884155273, + "learning_rate": 9.310565215253765e-07, + "loss": 0.7354, + "step": 7420 + }, + { + "epoch": 0.11559388792452537, + "grad_norm": 5.104187488555908, + "learning_rate": 9.309745825207715e-07, + "loss": 0.8301, + "step": 7425 + }, + { + "epoch": 0.11567172892649474, + "grad_norm": 2.960247039794922, + "learning_rate": 9.308926435161665e-07, + "loss": 0.7949, + "step": 7430 + }, + { + "epoch": 0.11574956992846412, + "grad_norm": 5.484702110290527, + "learning_rate": 9.308107045115615e-07, + "loss": 0.7747, + "step": 7435 + }, + { + "epoch": 0.1158274109304335, + "grad_norm": 3.192422866821289, + "learning_rate": 9.307287655069565e-07, + "loss": 0.8138, + "step": 7440 + }, + { + "epoch": 0.11590525193240288, + "grad_norm": 4.892508029937744, + "learning_rate": 9.306468265023516e-07, + "loss": 0.8059, + "step": 7445 + }, + { + "epoch": 0.11598309293437226, + "grad_norm": 3.488111972808838, + "learning_rate": 9.305648874977467e-07, + "loss": 0.7975, + "step": 7450 + }, + { + "epoch": 0.11606093393634163, + "grad_norm": 4.7030029296875, + "learning_rate": 9.304829484931416e-07, + "loss": 0.752, + "step": 7455 + }, + { + "epoch": 0.11613877493831101, + "grad_norm": 5.344095706939697, + "learning_rate": 9.304010094885367e-07, + "loss": 0.774, + "step": 7460 + }, + { + "epoch": 0.11621661594028038, + "grad_norm": 2.944584846496582, + "learning_rate": 9.303190704839318e-07, + "loss": 0.7948, + "step": 7465 + }, + { + "epoch": 0.11629445694224975, + "grad_norm": 2.8411998748779297, + "learning_rate": 9.302371314793266e-07, + "loss": 0.7698, + "step": 7470 + }, + { + "epoch": 0.11637229794421913, + "grad_norm": 6.780023097991943, + "learning_rate": 9.301551924747217e-07, + "loss": 0.8843, + "step": 7475 + }, + { + "epoch": 0.11645013894618851, + "grad_norm": 4.495726108551025, + "learning_rate": 9.300732534701168e-07, + "loss": 0.9019, + "step": 7480 + }, + { + "epoch": 0.11652797994815789, + "grad_norm": 4.61745023727417, + "learning_rate": 9.299913144655119e-07, + "loss": 0.8717, + "step": 7485 + }, + { + "epoch": 0.11660582095012727, + "grad_norm": 3.0337278842926025, + "learning_rate": 9.299093754609068e-07, + "loss": 0.791, + "step": 7490 + }, + { + "epoch": 0.11668366195209665, + "grad_norm": 6.76452112197876, + "learning_rate": 9.298274364563019e-07, + "loss": 0.7373, + "step": 7495 + }, + { + "epoch": 0.11676150295406602, + "grad_norm": 2.7412028312683105, + "learning_rate": 9.29745497451697e-07, + "loss": 0.7638, + "step": 7500 + }, + { + "epoch": 0.1168393439560354, + "grad_norm": 4.742910861968994, + "learning_rate": 9.29663558447092e-07, + "loss": 0.8951, + "step": 7505 + }, + { + "epoch": 0.11691718495800478, + "grad_norm": 2.368957042694092, + "learning_rate": 9.295816194424869e-07, + "loss": 0.7237, + "step": 7510 + }, + { + "epoch": 0.11699502595997416, + "grad_norm": 4.270255088806152, + "learning_rate": 9.29499680437882e-07, + "loss": 0.789, + "step": 7515 + }, + { + "epoch": 0.11707286696194354, + "grad_norm": 4.186617851257324, + "learning_rate": 9.29417741433277e-07, + "loss": 0.8655, + "step": 7520 + }, + { + "epoch": 0.11715070796391291, + "grad_norm": 2.4891741275787354, + "learning_rate": 9.29335802428672e-07, + "loss": 0.8008, + "step": 7525 + }, + { + "epoch": 0.11722854896588229, + "grad_norm": 3.5113582611083984, + "learning_rate": 9.292538634240671e-07, + "loss": 0.8465, + "step": 7530 + }, + { + "epoch": 0.11730638996785167, + "grad_norm": 4.365012168884277, + "learning_rate": 9.291719244194621e-07, + "loss": 0.8378, + "step": 7535 + }, + { + "epoch": 0.11738423096982105, + "grad_norm": 4.17802619934082, + "learning_rate": 9.290899854148572e-07, + "loss": 0.7605, + "step": 7540 + }, + { + "epoch": 0.11746207197179043, + "grad_norm": 3.046833038330078, + "learning_rate": 9.290080464102522e-07, + "loss": 0.6796, + "step": 7545 + }, + { + "epoch": 0.11753991297375979, + "grad_norm": 2.9827167987823486, + "learning_rate": 9.289261074056471e-07, + "loss": 0.7666, + "step": 7550 + }, + { + "epoch": 0.11761775397572917, + "grad_norm": 2.8555498123168945, + "learning_rate": 9.288441684010422e-07, + "loss": 0.8337, + "step": 7555 + }, + { + "epoch": 0.11769559497769855, + "grad_norm": 4.013402462005615, + "learning_rate": 9.287622293964373e-07, + "loss": 0.9353, + "step": 7560 + }, + { + "epoch": 0.11777343597966793, + "grad_norm": 6.579110622406006, + "learning_rate": 9.286802903918322e-07, + "loss": 0.7464, + "step": 7565 + }, + { + "epoch": 0.1178512769816373, + "grad_norm": 6.643786907196045, + "learning_rate": 9.285983513872273e-07, + "loss": 0.8852, + "step": 7570 + }, + { + "epoch": 0.11792911798360668, + "grad_norm": 3.837496042251587, + "learning_rate": 9.285164123826224e-07, + "loss": 0.7909, + "step": 7575 + }, + { + "epoch": 0.11800695898557606, + "grad_norm": 3.4954562187194824, + "learning_rate": 9.284344733780173e-07, + "loss": 0.8559, + "step": 7580 + }, + { + "epoch": 0.11808479998754544, + "grad_norm": 3.962890148162842, + "learning_rate": 9.283525343734124e-07, + "loss": 0.833, + "step": 7585 + }, + { + "epoch": 0.11816264098951482, + "grad_norm": 7.724937438964844, + "learning_rate": 9.282705953688074e-07, + "loss": 0.9339, + "step": 7590 + }, + { + "epoch": 0.1182404819914842, + "grad_norm": 5.779001235961914, + "learning_rate": 9.281886563642024e-07, + "loss": 0.6968, + "step": 7595 + }, + { + "epoch": 0.11831832299345357, + "grad_norm": 6.85791540145874, + "learning_rate": 9.281067173595974e-07, + "loss": 0.8279, + "step": 7600 + }, + { + "epoch": 0.11839616399542295, + "grad_norm": 3.720306158065796, + "learning_rate": 9.280247783549925e-07, + "loss": 0.7751, + "step": 7605 + }, + { + "epoch": 0.11847400499739233, + "grad_norm": 4.524914741516113, + "learning_rate": 9.279428393503876e-07, + "loss": 0.7227, + "step": 7610 + }, + { + "epoch": 0.11855184599936171, + "grad_norm": 3.6757309436798096, + "learning_rate": 9.278609003457825e-07, + "loss": 0.8816, + "step": 7615 + }, + { + "epoch": 0.11862968700133109, + "grad_norm": 3.6419453620910645, + "learning_rate": 9.277789613411776e-07, + "loss": 0.7741, + "step": 7620 + }, + { + "epoch": 0.11870752800330046, + "grad_norm": 3.0829155445098877, + "learning_rate": 9.276970223365727e-07, + "loss": 0.7712, + "step": 7625 + }, + { + "epoch": 0.11878536900526984, + "grad_norm": 6.524753570556641, + "learning_rate": 9.276150833319677e-07, + "loss": 0.8025, + "step": 7630 + }, + { + "epoch": 0.11886321000723922, + "grad_norm": 2.959907054901123, + "learning_rate": 9.275331443273626e-07, + "loss": 0.932, + "step": 7635 + }, + { + "epoch": 0.11894105100920858, + "grad_norm": 3.6245338916778564, + "learning_rate": 9.274512053227577e-07, + "loss": 0.6997, + "step": 7640 + }, + { + "epoch": 0.11901889201117796, + "grad_norm": 4.328200340270996, + "learning_rate": 9.273692663181527e-07, + "loss": 0.8786, + "step": 7645 + }, + { + "epoch": 0.11909673301314734, + "grad_norm": 4.048555374145508, + "learning_rate": 9.272873273135478e-07, + "loss": 0.6936, + "step": 7650 + }, + { + "epoch": 0.11917457401511672, + "grad_norm": 3.2797539234161377, + "learning_rate": 9.272053883089428e-07, + "loss": 0.7915, + "step": 7655 + }, + { + "epoch": 0.1192524150170861, + "grad_norm": 9.63089656829834, + "learning_rate": 9.271234493043378e-07, + "loss": 0.8417, + "step": 7660 + }, + { + "epoch": 0.11933025601905548, + "grad_norm": 4.521198272705078, + "learning_rate": 9.270415102997329e-07, + "loss": 0.8831, + "step": 7665 + }, + { + "epoch": 0.11940809702102485, + "grad_norm": 16.66404151916504, + "learning_rate": 9.269595712951279e-07, + "loss": 0.8582, + "step": 7670 + }, + { + "epoch": 0.11948593802299423, + "grad_norm": 8.25177001953125, + "learning_rate": 9.268776322905228e-07, + "loss": 0.818, + "step": 7675 + }, + { + "epoch": 0.11956377902496361, + "grad_norm": 3.3690059185028076, + "learning_rate": 9.267956932859179e-07, + "loss": 0.689, + "step": 7680 + }, + { + "epoch": 0.11964162002693299, + "grad_norm": 4.987194061279297, + "learning_rate": 9.26713754281313e-07, + "loss": 0.8771, + "step": 7685 + }, + { + "epoch": 0.11971946102890237, + "grad_norm": 5.207099914550781, + "learning_rate": 9.266318152767079e-07, + "loss": 0.898, + "step": 7690 + }, + { + "epoch": 0.11979730203087174, + "grad_norm": 4.396019458770752, + "learning_rate": 9.26549876272103e-07, + "loss": 0.7181, + "step": 7695 + }, + { + "epoch": 0.11987514303284112, + "grad_norm": 3.1832330226898193, + "learning_rate": 9.264679372674981e-07, + "loss": 0.8264, + "step": 7700 + }, + { + "epoch": 0.1199529840348105, + "grad_norm": 3.0821919441223145, + "learning_rate": 9.26385998262893e-07, + "loss": 0.8412, + "step": 7705 + }, + { + "epoch": 0.12003082503677988, + "grad_norm": 4.4039626121521, + "learning_rate": 9.263040592582881e-07, + "loss": 0.7891, + "step": 7710 + }, + { + "epoch": 0.12010866603874926, + "grad_norm": 3.0394608974456787, + "learning_rate": 9.262221202536831e-07, + "loss": 0.7594, + "step": 7715 + }, + { + "epoch": 0.12018650704071863, + "grad_norm": 3.430525779724121, + "learning_rate": 9.261401812490781e-07, + "loss": 0.9487, + "step": 7720 + }, + { + "epoch": 0.120264348042688, + "grad_norm": 3.6181464195251465, + "learning_rate": 9.260582422444731e-07, + "loss": 0.881, + "step": 7725 + }, + { + "epoch": 0.12034218904465738, + "grad_norm": 9.917842864990234, + "learning_rate": 9.259763032398682e-07, + "loss": 0.7738, + "step": 7730 + }, + { + "epoch": 0.12042003004662676, + "grad_norm": 5.7242255210876465, + "learning_rate": 9.258943642352633e-07, + "loss": 0.9353, + "step": 7735 + }, + { + "epoch": 0.12049787104859613, + "grad_norm": 5.354982852935791, + "learning_rate": 9.258124252306583e-07, + "loss": 0.7719, + "step": 7740 + }, + { + "epoch": 0.12057571205056551, + "grad_norm": 5.967920780181885, + "learning_rate": 9.257304862260533e-07, + "loss": 0.7304, + "step": 7745 + }, + { + "epoch": 0.12065355305253489, + "grad_norm": 3.245041847229004, + "learning_rate": 9.256485472214484e-07, + "loss": 0.6752, + "step": 7750 + }, + { + "epoch": 0.12073139405450427, + "grad_norm": 6.362682342529297, + "learning_rate": 9.255666082168433e-07, + "loss": 0.8159, + "step": 7755 + }, + { + "epoch": 0.12080923505647365, + "grad_norm": 5.759777069091797, + "learning_rate": 9.254846692122383e-07, + "loss": 0.6947, + "step": 7760 + }, + { + "epoch": 0.12088707605844302, + "grad_norm": 3.7885186672210693, + "learning_rate": 9.254027302076334e-07, + "loss": 0.8279, + "step": 7765 + }, + { + "epoch": 0.1209649170604124, + "grad_norm": 2.970975399017334, + "learning_rate": 9.253207912030284e-07, + "loss": 0.8366, + "step": 7770 + }, + { + "epoch": 0.12104275806238178, + "grad_norm": 3.9208946228027344, + "learning_rate": 9.252388521984235e-07, + "loss": 0.8122, + "step": 7775 + }, + { + "epoch": 0.12112059906435116, + "grad_norm": 3.411137580871582, + "learning_rate": 9.251569131938185e-07, + "loss": 0.736, + "step": 7780 + }, + { + "epoch": 0.12119844006632054, + "grad_norm": 10.479853630065918, + "learning_rate": 9.250749741892135e-07, + "loss": 0.9075, + "step": 7785 + }, + { + "epoch": 0.12127628106828992, + "grad_norm": 4.920512676239014, + "learning_rate": 9.249930351846086e-07, + "loss": 0.8346, + "step": 7790 + }, + { + "epoch": 0.1213541220702593, + "grad_norm": 5.350609302520752, + "learning_rate": 9.249110961800035e-07, + "loss": 0.8034, + "step": 7795 + }, + { + "epoch": 0.12143196307222867, + "grad_norm": 6.461511611938477, + "learning_rate": 9.248291571753985e-07, + "loss": 0.6603, + "step": 7800 + }, + { + "epoch": 0.12150980407419805, + "grad_norm": 4.6280837059021, + "learning_rate": 9.247472181707936e-07, + "loss": 0.801, + "step": 7805 + }, + { + "epoch": 0.12158764507616741, + "grad_norm": 8.22148323059082, + "learning_rate": 9.246652791661887e-07, + "loss": 0.8335, + "step": 7810 + }, + { + "epoch": 0.12166548607813679, + "grad_norm": 7.433074951171875, + "learning_rate": 9.245833401615836e-07, + "loss": 0.7171, + "step": 7815 + }, + { + "epoch": 0.12174332708010617, + "grad_norm": 5.692368507385254, + "learning_rate": 9.245014011569787e-07, + "loss": 0.8649, + "step": 7820 + }, + { + "epoch": 0.12182116808207555, + "grad_norm": 3.0104875564575195, + "learning_rate": 9.244194621523738e-07, + "loss": 0.6021, + "step": 7825 + }, + { + "epoch": 0.12189900908404493, + "grad_norm": 6.160313606262207, + "learning_rate": 9.243375231477688e-07, + "loss": 0.8652, + "step": 7830 + }, + { + "epoch": 0.1219768500860143, + "grad_norm": 4.373056411743164, + "learning_rate": 9.242555841431637e-07, + "loss": 0.7689, + "step": 7835 + }, + { + "epoch": 0.12205469108798368, + "grad_norm": 5.760363578796387, + "learning_rate": 9.241736451385588e-07, + "loss": 0.8888, + "step": 7840 + }, + { + "epoch": 0.12213253208995306, + "grad_norm": 4.07029914855957, + "learning_rate": 9.240917061339538e-07, + "loss": 0.7693, + "step": 7845 + }, + { + "epoch": 0.12221037309192244, + "grad_norm": 3.7920992374420166, + "learning_rate": 9.240097671293488e-07, + "loss": 0.8221, + "step": 7850 + }, + { + "epoch": 0.12228821409389182, + "grad_norm": 7.572727203369141, + "learning_rate": 9.239278281247439e-07, + "loss": 0.8255, + "step": 7855 + }, + { + "epoch": 0.1223660550958612, + "grad_norm": 5.047962188720703, + "learning_rate": 9.23845889120139e-07, + "loss": 0.8189, + "step": 7860 + }, + { + "epoch": 0.12244389609783057, + "grad_norm": 6.445250988006592, + "learning_rate": 9.23763950115534e-07, + "loss": 0.9137, + "step": 7865 + }, + { + "epoch": 0.12252173709979995, + "grad_norm": 2.9722390174865723, + "learning_rate": 9.23682011110929e-07, + "loss": 0.895, + "step": 7870 + }, + { + "epoch": 0.12259957810176933, + "grad_norm": 3.0519466400146484, + "learning_rate": 9.23600072106324e-07, + "loss": 0.8048, + "step": 7875 + }, + { + "epoch": 0.12267741910373871, + "grad_norm": 5.44154167175293, + "learning_rate": 9.23518133101719e-07, + "loss": 0.6992, + "step": 7880 + }, + { + "epoch": 0.12275526010570809, + "grad_norm": 11.307611465454102, + "learning_rate": 9.23436194097114e-07, + "loss": 0.8595, + "step": 7885 + }, + { + "epoch": 0.12283310110767746, + "grad_norm": 6.504110336303711, + "learning_rate": 9.233542550925091e-07, + "loss": 0.7882, + "step": 7890 + }, + { + "epoch": 0.12291094210964683, + "grad_norm": 2.5726773738861084, + "learning_rate": 9.232723160879041e-07, + "loss": 0.8535, + "step": 7895 + }, + { + "epoch": 0.1229887831116162, + "grad_norm": 3.1990294456481934, + "learning_rate": 9.231903770832992e-07, + "loss": 0.7941, + "step": 7900 + }, + { + "epoch": 0.12306662411358558, + "grad_norm": 3.614875555038452, + "learning_rate": 9.231084380786942e-07, + "loss": 0.6803, + "step": 7905 + }, + { + "epoch": 0.12314446511555496, + "grad_norm": 5.973575592041016, + "learning_rate": 9.230264990740892e-07, + "loss": 0.7994, + "step": 7910 + }, + { + "epoch": 0.12322230611752434, + "grad_norm": 3.558375358581543, + "learning_rate": 9.229445600694842e-07, + "loss": 0.8292, + "step": 7915 + }, + { + "epoch": 0.12330014711949372, + "grad_norm": 4.37818717956543, + "learning_rate": 9.228626210648793e-07, + "loss": 0.8512, + "step": 7920 + }, + { + "epoch": 0.1233779881214631, + "grad_norm": 7.052612781524658, + "learning_rate": 9.227806820602742e-07, + "loss": 0.7453, + "step": 7925 + }, + { + "epoch": 0.12345582912343248, + "grad_norm": 9.181270599365234, + "learning_rate": 9.226987430556693e-07, + "loss": 0.6847, + "step": 7930 + }, + { + "epoch": 0.12353367012540185, + "grad_norm": 4.499727249145508, + "learning_rate": 9.226168040510644e-07, + "loss": 0.7718, + "step": 7935 + }, + { + "epoch": 0.12361151112737123, + "grad_norm": 3.2490360736846924, + "learning_rate": 9.225348650464593e-07, + "loss": 0.7631, + "step": 7940 + }, + { + "epoch": 0.12368935212934061, + "grad_norm": 3.26198410987854, + "learning_rate": 9.224529260418544e-07, + "loss": 0.8257, + "step": 7945 + }, + { + "epoch": 0.12376719313130999, + "grad_norm": 7.192509174346924, + "learning_rate": 9.223709870372495e-07, + "loss": 0.809, + "step": 7950 + }, + { + "epoch": 0.12384503413327937, + "grad_norm": 3.1407008171081543, + "learning_rate": 9.222890480326445e-07, + "loss": 0.8444, + "step": 7955 + }, + { + "epoch": 0.12392287513524874, + "grad_norm": 4.12625789642334, + "learning_rate": 9.222071090280394e-07, + "loss": 0.8275, + "step": 7960 + }, + { + "epoch": 0.12400071613721812, + "grad_norm": 5.525479316711426, + "learning_rate": 9.221251700234345e-07, + "loss": 0.8999, + "step": 7965 + }, + { + "epoch": 0.1240785571391875, + "grad_norm": 6.302455902099609, + "learning_rate": 9.220432310188295e-07, + "loss": 0.7642, + "step": 7970 + }, + { + "epoch": 0.12415639814115688, + "grad_norm": 4.649979114532471, + "learning_rate": 9.219612920142245e-07, + "loss": 0.7002, + "step": 7975 + }, + { + "epoch": 0.12423423914312626, + "grad_norm": 5.463395595550537, + "learning_rate": 9.218793530096196e-07, + "loss": 0.8473, + "step": 7980 + }, + { + "epoch": 0.12431208014509562, + "grad_norm": 6.704756736755371, + "learning_rate": 9.217974140050147e-07, + "loss": 0.6967, + "step": 7985 + }, + { + "epoch": 0.124389921147065, + "grad_norm": 4.5808539390563965, + "learning_rate": 9.217154750004097e-07, + "loss": 0.7898, + "step": 7990 + }, + { + "epoch": 0.12446776214903438, + "grad_norm": 2.9757680892944336, + "learning_rate": 9.216335359958047e-07, + "loss": 0.698, + "step": 7995 + }, + { + "epoch": 0.12454560315100376, + "grad_norm": 5.5388617515563965, + "learning_rate": 9.215515969911997e-07, + "loss": 0.8798, + "step": 8000 + }, + { + "epoch": 0.12462344415297313, + "grad_norm": 5.424650192260742, + "learning_rate": 9.214696579865947e-07, + "loss": 0.814, + "step": 8005 + }, + { + "epoch": 0.12470128515494251, + "grad_norm": 4.0730109214782715, + "learning_rate": 9.213877189819898e-07, + "loss": 0.6547, + "step": 8010 + }, + { + "epoch": 0.12477912615691189, + "grad_norm": 5.436715126037598, + "learning_rate": 9.213057799773848e-07, + "loss": 0.776, + "step": 8015 + }, + { + "epoch": 0.12485696715888127, + "grad_norm": 3.6354191303253174, + "learning_rate": 9.212238409727798e-07, + "loss": 0.7196, + "step": 8020 + }, + { + "epoch": 0.12493480816085065, + "grad_norm": 3.5990068912506104, + "learning_rate": 9.211419019681749e-07, + "loss": 0.8079, + "step": 8025 + }, + { + "epoch": 0.12501264916282, + "grad_norm": 4.007763385772705, + "learning_rate": 9.2105996296357e-07, + "loss": 0.7711, + "step": 8030 + }, + { + "epoch": 0.1250904901647894, + "grad_norm": 4.223349571228027, + "learning_rate": 9.209780239589649e-07, + "loss": 0.8481, + "step": 8035 + }, + { + "epoch": 0.12516833116675877, + "grad_norm": 4.40108060836792, + "learning_rate": 9.208960849543599e-07, + "loss": 0.7981, + "step": 8040 + }, + { + "epoch": 0.12524617216872816, + "grad_norm": 2.7487142086029053, + "learning_rate": 9.20814145949755e-07, + "loss": 0.6693, + "step": 8045 + }, + { + "epoch": 0.12532401317069752, + "grad_norm": 3.568763017654419, + "learning_rate": 9.207322069451499e-07, + "loss": 0.7716, + "step": 8050 + }, + { + "epoch": 0.12540185417266692, + "grad_norm": 3.7271010875701904, + "learning_rate": 9.20650267940545e-07, + "loss": 0.7969, + "step": 8055 + }, + { + "epoch": 0.12547969517463628, + "grad_norm": 4.352176189422607, + "learning_rate": 9.205683289359401e-07, + "loss": 0.8513, + "step": 8060 + }, + { + "epoch": 0.12555753617660567, + "grad_norm": 3.4279236793518066, + "learning_rate": 9.20486389931335e-07, + "loss": 0.8315, + "step": 8065 + }, + { + "epoch": 0.12563537717857504, + "grad_norm": 5.192807197570801, + "learning_rate": 9.204044509267301e-07, + "loss": 0.6887, + "step": 8070 + }, + { + "epoch": 0.12571321818054443, + "grad_norm": 7.44572114944458, + "learning_rate": 9.203225119221252e-07, + "loss": 0.8179, + "step": 8075 + }, + { + "epoch": 0.1257910591825138, + "grad_norm": 2.792656183242798, + "learning_rate": 9.202405729175201e-07, + "loss": 0.8136, + "step": 8080 + }, + { + "epoch": 0.12586890018448318, + "grad_norm": 3.840090036392212, + "learning_rate": 9.201586339129151e-07, + "loss": 0.7216, + "step": 8085 + }, + { + "epoch": 0.12594674118645255, + "grad_norm": 6.609809398651123, + "learning_rate": 9.200766949083102e-07, + "loss": 0.8109, + "step": 8090 + }, + { + "epoch": 0.12602458218842194, + "grad_norm": 3.8064773082733154, + "learning_rate": 9.199947559037052e-07, + "loss": 0.7874, + "step": 8095 + }, + { + "epoch": 0.1261024231903913, + "grad_norm": 4.612837791442871, + "learning_rate": 9.199128168991003e-07, + "loss": 0.8989, + "step": 8100 + }, + { + "epoch": 0.1261802641923607, + "grad_norm": 3.837954044342041, + "learning_rate": 9.198308778944953e-07, + "loss": 0.7964, + "step": 8105 + }, + { + "epoch": 0.12625810519433006, + "grad_norm": 5.080657482147217, + "learning_rate": 9.197489388898904e-07, + "loss": 0.7835, + "step": 8110 + }, + { + "epoch": 0.12633594619629943, + "grad_norm": 5.589513301849365, + "learning_rate": 9.196669998852854e-07, + "loss": 0.8027, + "step": 8115 + }, + { + "epoch": 0.12641378719826882, + "grad_norm": 5.392527103424072, + "learning_rate": 9.195850608806803e-07, + "loss": 0.7721, + "step": 8120 + }, + { + "epoch": 0.12649162820023818, + "grad_norm": 8.648225784301758, + "learning_rate": 9.195031218760754e-07, + "loss": 0.8396, + "step": 8125 + }, + { + "epoch": 0.12656946920220757, + "grad_norm": 3.0926201343536377, + "learning_rate": 9.194211828714704e-07, + "loss": 0.8282, + "step": 8130 + }, + { + "epoch": 0.12664731020417694, + "grad_norm": 4.515932083129883, + "learning_rate": 9.193392438668655e-07, + "loss": 0.8238, + "step": 8135 + }, + { + "epoch": 0.12672515120614633, + "grad_norm": 6.131096839904785, + "learning_rate": 9.192573048622605e-07, + "loss": 0.8638, + "step": 8140 + }, + { + "epoch": 0.1268029922081157, + "grad_norm": 3.634267807006836, + "learning_rate": 9.191753658576555e-07, + "loss": 0.716, + "step": 8145 + }, + { + "epoch": 0.1268808332100851, + "grad_norm": 4.676586627960205, + "learning_rate": 9.190934268530506e-07, + "loss": 0.8273, + "step": 8150 + }, + { + "epoch": 0.12695867421205445, + "grad_norm": 3.584019422531128, + "learning_rate": 9.190114878484457e-07, + "loss": 0.805, + "step": 8155 + }, + { + "epoch": 0.12703651521402384, + "grad_norm": 3.7464358806610107, + "learning_rate": 9.189295488438405e-07, + "loss": 0.7724, + "step": 8160 + }, + { + "epoch": 0.1271143562159932, + "grad_norm": 3.974726438522339, + "learning_rate": 9.188476098392356e-07, + "loss": 0.807, + "step": 8165 + }, + { + "epoch": 0.1271921972179626, + "grad_norm": 5.144652843475342, + "learning_rate": 9.187656708346307e-07, + "loss": 0.8721, + "step": 8170 + }, + { + "epoch": 0.12727003821993196, + "grad_norm": 7.679945945739746, + "learning_rate": 9.186837318300256e-07, + "loss": 0.7945, + "step": 8175 + }, + { + "epoch": 0.12734787922190136, + "grad_norm": 5.530436992645264, + "learning_rate": 9.186017928254207e-07, + "loss": 0.9354, + "step": 8180 + }, + { + "epoch": 0.12742572022387072, + "grad_norm": 3.981515884399414, + "learning_rate": 9.185198538208158e-07, + "loss": 0.6024, + "step": 8185 + }, + { + "epoch": 0.1275035612258401, + "grad_norm": 3.5425384044647217, + "learning_rate": 9.184379148162108e-07, + "loss": 0.8061, + "step": 8190 + }, + { + "epoch": 0.12758140222780948, + "grad_norm": 3.2570059299468994, + "learning_rate": 9.183559758116058e-07, + "loss": 0.7803, + "step": 8195 + }, + { + "epoch": 0.12765924322977884, + "grad_norm": 3.267265558242798, + "learning_rate": 9.182740368070008e-07, + "loss": 0.7927, + "step": 8200 + }, + { + "epoch": 0.12773708423174823, + "grad_norm": 3.4302942752838135, + "learning_rate": 9.181920978023958e-07, + "loss": 0.8104, + "step": 8205 + }, + { + "epoch": 0.1278149252337176, + "grad_norm": 4.691220760345459, + "learning_rate": 9.181101587977908e-07, + "loss": 0.7826, + "step": 8210 + }, + { + "epoch": 0.127892766235687, + "grad_norm": 4.321291446685791, + "learning_rate": 9.180282197931859e-07, + "loss": 0.8503, + "step": 8215 + }, + { + "epoch": 0.12797060723765635, + "grad_norm": 6.356113433837891, + "learning_rate": 9.179462807885809e-07, + "loss": 0.8026, + "step": 8220 + }, + { + "epoch": 0.12804844823962575, + "grad_norm": 6.146854400634766, + "learning_rate": 9.17864341783976e-07, + "loss": 0.7779, + "step": 8225 + }, + { + "epoch": 0.1281262892415951, + "grad_norm": 4.845507621765137, + "learning_rate": 9.17782402779371e-07, + "loss": 0.855, + "step": 8230 + }, + { + "epoch": 0.1282041302435645, + "grad_norm": 4.020603179931641, + "learning_rate": 9.177004637747661e-07, + "loss": 0.8117, + "step": 8235 + }, + { + "epoch": 0.12828197124553387, + "grad_norm": 3.1681554317474365, + "learning_rate": 9.17618524770161e-07, + "loss": 1.0093, + "step": 8240 + }, + { + "epoch": 0.12835981224750326, + "grad_norm": 12.957433700561523, + "learning_rate": 9.17536585765556e-07, + "loss": 0.8047, + "step": 8245 + }, + { + "epoch": 0.12843765324947262, + "grad_norm": 3.9429116249084473, + "learning_rate": 9.174546467609511e-07, + "loss": 0.7415, + "step": 8250 + }, + { + "epoch": 0.12851549425144201, + "grad_norm": 5.232780933380127, + "learning_rate": 9.173727077563461e-07, + "loss": 0.9999, + "step": 8255 + }, + { + "epoch": 0.12859333525341138, + "grad_norm": 5.215351104736328, + "learning_rate": 9.172907687517412e-07, + "loss": 0.7994, + "step": 8260 + }, + { + "epoch": 0.12867117625538077, + "grad_norm": 3.917405366897583, + "learning_rate": 9.172088297471362e-07, + "loss": 0.7882, + "step": 8265 + }, + { + "epoch": 0.12874901725735013, + "grad_norm": 5.005404949188232, + "learning_rate": 9.171268907425312e-07, + "loss": 0.8112, + "step": 8270 + }, + { + "epoch": 0.12882685825931953, + "grad_norm": 3.008211851119995, + "learning_rate": 9.170449517379263e-07, + "loss": 0.8249, + "step": 8275 + }, + { + "epoch": 0.1289046992612889, + "grad_norm": 9.127318382263184, + "learning_rate": 9.169630127333214e-07, + "loss": 0.8224, + "step": 8280 + }, + { + "epoch": 0.12898254026325826, + "grad_norm": 4.283410549163818, + "learning_rate": 9.168810737287162e-07, + "loss": 0.8319, + "step": 8285 + }, + { + "epoch": 0.12906038126522765, + "grad_norm": 9.957722663879395, + "learning_rate": 9.167991347241113e-07, + "loss": 0.7951, + "step": 8290 + }, + { + "epoch": 0.129138222267197, + "grad_norm": 3.6089370250701904, + "learning_rate": 9.167171957195064e-07, + "loss": 0.7409, + "step": 8295 + }, + { + "epoch": 0.1292160632691664, + "grad_norm": 6.038301467895508, + "learning_rate": 9.166352567149013e-07, + "loss": 0.802, + "step": 8300 + }, + { + "epoch": 0.12929390427113577, + "grad_norm": 5.123950481414795, + "learning_rate": 9.165533177102964e-07, + "loss": 0.723, + "step": 8305 + }, + { + "epoch": 0.12937174527310516, + "grad_norm": 3.86824369430542, + "learning_rate": 9.164713787056915e-07, + "loss": 0.811, + "step": 8310 + }, + { + "epoch": 0.12944958627507452, + "grad_norm": 4.312297821044922, + "learning_rate": 9.163894397010865e-07, + "loss": 0.7, + "step": 8315 + }, + { + "epoch": 0.12952742727704392, + "grad_norm": 2.920485258102417, + "learning_rate": 9.163075006964815e-07, + "loss": 0.8012, + "step": 8320 + }, + { + "epoch": 0.12960526827901328, + "grad_norm": 3.8033828735351562, + "learning_rate": 9.162255616918765e-07, + "loss": 0.8236, + "step": 8325 + }, + { + "epoch": 0.12968310928098267, + "grad_norm": 7.963630199432373, + "learning_rate": 9.161436226872715e-07, + "loss": 0.8041, + "step": 8330 + }, + { + "epoch": 0.12976095028295204, + "grad_norm": 3.964761734008789, + "learning_rate": 9.160616836826666e-07, + "loss": 0.8897, + "step": 8335 + }, + { + "epoch": 0.12983879128492143, + "grad_norm": 3.360156297683716, + "learning_rate": 9.159797446780616e-07, + "loss": 0.7087, + "step": 8340 + }, + { + "epoch": 0.1299166322868908, + "grad_norm": 4.731776237487793, + "learning_rate": 9.158978056734567e-07, + "loss": 0.7926, + "step": 8345 + }, + { + "epoch": 0.12999447328886019, + "grad_norm": 3.076554775238037, + "learning_rate": 9.158158666688517e-07, + "loss": 0.7662, + "step": 8350 + }, + { + "epoch": 0.13007231429082955, + "grad_norm": 3.2537529468536377, + "learning_rate": 9.157339276642467e-07, + "loss": 0.8568, + "step": 8355 + }, + { + "epoch": 0.13015015529279894, + "grad_norm": 4.160289287567139, + "learning_rate": 9.156519886596418e-07, + "loss": 0.7748, + "step": 8360 + }, + { + "epoch": 0.1302279962947683, + "grad_norm": 3.388763666152954, + "learning_rate": 9.155700496550367e-07, + "loss": 0.7411, + "step": 8365 + }, + { + "epoch": 0.13030583729673767, + "grad_norm": 3.7193074226379395, + "learning_rate": 9.154881106504318e-07, + "loss": 0.7553, + "step": 8370 + }, + { + "epoch": 0.13038367829870706, + "grad_norm": 4.124868392944336, + "learning_rate": 9.154061716458268e-07, + "loss": 0.8061, + "step": 8375 + }, + { + "epoch": 0.13046151930067643, + "grad_norm": 3.1243176460266113, + "learning_rate": 9.153242326412218e-07, + "loss": 0.7673, + "step": 8380 + }, + { + "epoch": 0.13053936030264582, + "grad_norm": 3.175187587738037, + "learning_rate": 9.152422936366169e-07, + "loss": 0.636, + "step": 8385 + }, + { + "epoch": 0.13061720130461518, + "grad_norm": 3.486941337585449, + "learning_rate": 9.15160354632012e-07, + "loss": 0.768, + "step": 8390 + }, + { + "epoch": 0.13069504230658457, + "grad_norm": 3.408848762512207, + "learning_rate": 9.150784156274069e-07, + "loss": 0.8326, + "step": 8395 + }, + { + "epoch": 0.13077288330855394, + "grad_norm": 5.37129545211792, + "learning_rate": 9.14996476622802e-07, + "loss": 0.7712, + "step": 8400 + }, + { + "epoch": 0.13085072431052333, + "grad_norm": 2.642165422439575, + "learning_rate": 9.14914537618197e-07, + "loss": 0.8611, + "step": 8405 + }, + { + "epoch": 0.1309285653124927, + "grad_norm": 3.8213489055633545, + "learning_rate": 9.148325986135919e-07, + "loss": 0.7938, + "step": 8410 + }, + { + "epoch": 0.1310064063144621, + "grad_norm": 3.893542528152466, + "learning_rate": 9.14750659608987e-07, + "loss": 0.7761, + "step": 8415 + }, + { + "epoch": 0.13108424731643145, + "grad_norm": 3.6185567378997803, + "learning_rate": 9.146687206043821e-07, + "loss": 0.776, + "step": 8420 + }, + { + "epoch": 0.13116208831840084, + "grad_norm": 7.76255989074707, + "learning_rate": 9.14586781599777e-07, + "loss": 0.8723, + "step": 8425 + }, + { + "epoch": 0.1312399293203702, + "grad_norm": 3.2361936569213867, + "learning_rate": 9.145048425951721e-07, + "loss": 0.7151, + "step": 8430 + }, + { + "epoch": 0.1313177703223396, + "grad_norm": 7.259923458099365, + "learning_rate": 9.144229035905672e-07, + "loss": 0.7714, + "step": 8435 + }, + { + "epoch": 0.13139561132430896, + "grad_norm": 8.949355125427246, + "learning_rate": 9.143409645859622e-07, + "loss": 0.8829, + "step": 8440 + }, + { + "epoch": 0.13147345232627836, + "grad_norm": 5.755862712860107, + "learning_rate": 9.142590255813571e-07, + "loss": 0.8672, + "step": 8445 + }, + { + "epoch": 0.13155129332824772, + "grad_norm": 4.543202877044678, + "learning_rate": 9.141770865767522e-07, + "loss": 0.8165, + "step": 8450 + }, + { + "epoch": 0.1316291343302171, + "grad_norm": 3.6541123390197754, + "learning_rate": 9.140951475721472e-07, + "loss": 0.8017, + "step": 8455 + }, + { + "epoch": 0.13170697533218648, + "grad_norm": 3.5702321529388428, + "learning_rate": 9.140132085675423e-07, + "loss": 0.8063, + "step": 8460 + }, + { + "epoch": 0.13178481633415584, + "grad_norm": 3.831411361694336, + "learning_rate": 9.139312695629373e-07, + "loss": 0.832, + "step": 8465 + }, + { + "epoch": 0.13186265733612523, + "grad_norm": 3.0359880924224854, + "learning_rate": 9.138493305583324e-07, + "loss": 0.6776, + "step": 8470 + }, + { + "epoch": 0.1319404983380946, + "grad_norm": 4.264082908630371, + "learning_rate": 9.137673915537274e-07, + "loss": 0.6582, + "step": 8475 + }, + { + "epoch": 0.132018339340064, + "grad_norm": 4.2727508544921875, + "learning_rate": 9.136854525491225e-07, + "loss": 0.7432, + "step": 8480 + }, + { + "epoch": 0.13209618034203335, + "grad_norm": 7.4130539894104, + "learning_rate": 9.136035135445174e-07, + "loss": 0.7692, + "step": 8485 + }, + { + "epoch": 0.13217402134400275, + "grad_norm": 5.631756782531738, + "learning_rate": 9.135215745399124e-07, + "loss": 0.7297, + "step": 8490 + }, + { + "epoch": 0.1322518623459721, + "grad_norm": 4.975935459136963, + "learning_rate": 9.134396355353075e-07, + "loss": 0.8036, + "step": 8495 + }, + { + "epoch": 0.1323297033479415, + "grad_norm": 3.5903608798980713, + "learning_rate": 9.133576965307025e-07, + "loss": 0.8833, + "step": 8500 + }, + { + "epoch": 0.13240754434991087, + "grad_norm": 4.563084602355957, + "learning_rate": 9.132757575260975e-07, + "loss": 0.7386, + "step": 8505 + }, + { + "epoch": 0.13248538535188026, + "grad_norm": 5.170467853546143, + "learning_rate": 9.131938185214926e-07, + "loss": 0.8154, + "step": 8510 + }, + { + "epoch": 0.13256322635384962, + "grad_norm": 6.0783233642578125, + "learning_rate": 9.131118795168877e-07, + "loss": 0.8525, + "step": 8515 + }, + { + "epoch": 0.13264106735581901, + "grad_norm": 3.2167932987213135, + "learning_rate": 9.130299405122826e-07, + "loss": 0.7605, + "step": 8520 + }, + { + "epoch": 0.13271890835778838, + "grad_norm": 4.35626745223999, + "learning_rate": 9.129480015076776e-07, + "loss": 0.7577, + "step": 8525 + }, + { + "epoch": 0.13279674935975777, + "grad_norm": 6.491021633148193, + "learning_rate": 9.128660625030727e-07, + "loss": 0.8463, + "step": 8530 + }, + { + "epoch": 0.13287459036172714, + "grad_norm": 3.174940347671509, + "learning_rate": 9.127841234984676e-07, + "loss": 0.7382, + "step": 8535 + }, + { + "epoch": 0.13295243136369653, + "grad_norm": 4.856359004974365, + "learning_rate": 9.127021844938627e-07, + "loss": 0.7025, + "step": 8540 + }, + { + "epoch": 0.1330302723656659, + "grad_norm": 3.6859755516052246, + "learning_rate": 9.126202454892578e-07, + "loss": 0.642, + "step": 8545 + }, + { + "epoch": 0.13310811336763526, + "grad_norm": 3.6811821460723877, + "learning_rate": 9.125383064846528e-07, + "loss": 0.828, + "step": 8550 + }, + { + "epoch": 0.13318595436960465, + "grad_norm": 7.908188819885254, + "learning_rate": 9.124563674800478e-07, + "loss": 0.8043, + "step": 8555 + }, + { + "epoch": 0.133263795371574, + "grad_norm": 3.508521795272827, + "learning_rate": 9.123744284754429e-07, + "loss": 0.8052, + "step": 8560 + }, + { + "epoch": 0.1333416363735434, + "grad_norm": 4.004594326019287, + "learning_rate": 9.122924894708378e-07, + "loss": 0.8126, + "step": 8565 + }, + { + "epoch": 0.13341947737551277, + "grad_norm": 4.229026794433594, + "learning_rate": 9.122105504662328e-07, + "loss": 0.7793, + "step": 8570 + }, + { + "epoch": 0.13349731837748216, + "grad_norm": 5.319640159606934, + "learning_rate": 9.121286114616279e-07, + "loss": 0.765, + "step": 8575 + }, + { + "epoch": 0.13357515937945152, + "grad_norm": 3.8783299922943115, + "learning_rate": 9.120466724570229e-07, + "loss": 0.7036, + "step": 8580 + }, + { + "epoch": 0.13365300038142092, + "grad_norm": 5.243676662445068, + "learning_rate": 9.11964733452418e-07, + "loss": 0.8525, + "step": 8585 + }, + { + "epoch": 0.13373084138339028, + "grad_norm": 6.118826866149902, + "learning_rate": 9.11882794447813e-07, + "loss": 0.8391, + "step": 8590 + }, + { + "epoch": 0.13380868238535967, + "grad_norm": 2.1617751121520996, + "learning_rate": 9.118008554432081e-07, + "loss": 0.7353, + "step": 8595 + }, + { + "epoch": 0.13388652338732904, + "grad_norm": 7.590507507324219, + "learning_rate": 9.117189164386031e-07, + "loss": 0.8919, + "step": 8600 + }, + { + "epoch": 0.13396436438929843, + "grad_norm": 3.345130443572998, + "learning_rate": 9.116369774339982e-07, + "loss": 0.7615, + "step": 8605 + }, + { + "epoch": 0.1340422053912678, + "grad_norm": 3.144171953201294, + "learning_rate": 9.115550384293931e-07, + "loss": 0.7714, + "step": 8610 + }, + { + "epoch": 0.13412004639323719, + "grad_norm": 5.711132049560547, + "learning_rate": 9.114730994247881e-07, + "loss": 0.9119, + "step": 8615 + }, + { + "epoch": 0.13419788739520655, + "grad_norm": 3.997664451599121, + "learning_rate": 9.113911604201832e-07, + "loss": 0.7821, + "step": 8620 + }, + { + "epoch": 0.13427572839717594, + "grad_norm": 5.540621757507324, + "learning_rate": 9.113092214155782e-07, + "loss": 0.8796, + "step": 8625 + }, + { + "epoch": 0.1343535693991453, + "grad_norm": 4.296466827392578, + "learning_rate": 9.112272824109732e-07, + "loss": 0.839, + "step": 8630 + }, + { + "epoch": 0.13443141040111467, + "grad_norm": 5.527231693267822, + "learning_rate": 9.111453434063683e-07, + "loss": 0.7966, + "step": 8635 + }, + { + "epoch": 0.13450925140308406, + "grad_norm": 4.798453330993652, + "learning_rate": 9.110634044017634e-07, + "loss": 0.8808, + "step": 8640 + }, + { + "epoch": 0.13458709240505343, + "grad_norm": 4.989645957946777, + "learning_rate": 9.109814653971583e-07, + "loss": 0.7845, + "step": 8645 + }, + { + "epoch": 0.13466493340702282, + "grad_norm": 5.687048435211182, + "learning_rate": 9.108995263925533e-07, + "loss": 0.8627, + "step": 8650 + }, + { + "epoch": 0.13474277440899218, + "grad_norm": 3.1098756790161133, + "learning_rate": 9.108175873879484e-07, + "loss": 0.7594, + "step": 8655 + }, + { + "epoch": 0.13482061541096158, + "grad_norm": 8.40995979309082, + "learning_rate": 9.107356483833433e-07, + "loss": 0.7599, + "step": 8660 + }, + { + "epoch": 0.13489845641293094, + "grad_norm": 6.420483589172363, + "learning_rate": 9.106537093787384e-07, + "loss": 0.8199, + "step": 8665 + }, + { + "epoch": 0.13497629741490033, + "grad_norm": 5.545563697814941, + "learning_rate": 9.105717703741335e-07, + "loss": 0.7403, + "step": 8670 + }, + { + "epoch": 0.1350541384168697, + "grad_norm": 8.34080696105957, + "learning_rate": 9.104898313695285e-07, + "loss": 0.7979, + "step": 8675 + }, + { + "epoch": 0.1351319794188391, + "grad_norm": 6.4696550369262695, + "learning_rate": 9.104078923649235e-07, + "loss": 0.6935, + "step": 8680 + }, + { + "epoch": 0.13520982042080845, + "grad_norm": 7.9759521484375, + "learning_rate": 9.103259533603186e-07, + "loss": 0.7174, + "step": 8685 + }, + { + "epoch": 0.13528766142277784, + "grad_norm": 4.094228267669678, + "learning_rate": 9.102440143557135e-07, + "loss": 0.9191, + "step": 8690 + }, + { + "epoch": 0.1353655024247472, + "grad_norm": 3.2784035205841064, + "learning_rate": 9.101620753511086e-07, + "loss": 0.8157, + "step": 8695 + }, + { + "epoch": 0.1354433434267166, + "grad_norm": 3.680067300796509, + "learning_rate": 9.100801363465036e-07, + "loss": 0.8478, + "step": 8700 + }, + { + "epoch": 0.13552118442868596, + "grad_norm": 3.991107225418091, + "learning_rate": 9.099981973418986e-07, + "loss": 0.9003, + "step": 8705 + }, + { + "epoch": 0.13559902543065536, + "grad_norm": 2.9558584690093994, + "learning_rate": 9.099162583372937e-07, + "loss": 0.7571, + "step": 8710 + }, + { + "epoch": 0.13567686643262472, + "grad_norm": 2.952221632003784, + "learning_rate": 9.098343193326887e-07, + "loss": 0.8597, + "step": 8715 + }, + { + "epoch": 0.13575470743459409, + "grad_norm": 8.548612594604492, + "learning_rate": 9.097523803280838e-07, + "loss": 0.7225, + "step": 8720 + }, + { + "epoch": 0.13583254843656348, + "grad_norm": 9.161630630493164, + "learning_rate": 9.096704413234788e-07, + "loss": 0.6646, + "step": 8725 + }, + { + "epoch": 0.13591038943853284, + "grad_norm": 4.948508262634277, + "learning_rate": 9.095885023188738e-07, + "loss": 0.6339, + "step": 8730 + }, + { + "epoch": 0.13598823044050223, + "grad_norm": 6.353402614593506, + "learning_rate": 9.095065633142688e-07, + "loss": 0.7525, + "step": 8735 + }, + { + "epoch": 0.1360660714424716, + "grad_norm": 2.8166439533233643, + "learning_rate": 9.094246243096638e-07, + "loss": 0.7942, + "step": 8740 + }, + { + "epoch": 0.136143912444441, + "grad_norm": 3.2891948223114014, + "learning_rate": 9.093426853050589e-07, + "loss": 0.7832, + "step": 8745 + }, + { + "epoch": 0.13622175344641035, + "grad_norm": 6.029998779296875, + "learning_rate": 9.09260746300454e-07, + "loss": 0.9606, + "step": 8750 + }, + { + "epoch": 0.13629959444837975, + "grad_norm": 3.9812381267547607, + "learning_rate": 9.091788072958489e-07, + "loss": 0.8119, + "step": 8755 + }, + { + "epoch": 0.1363774354503491, + "grad_norm": 2.964101791381836, + "learning_rate": 9.09096868291244e-07, + "loss": 0.7792, + "step": 8760 + }, + { + "epoch": 0.1364552764523185, + "grad_norm": 5.025110244750977, + "learning_rate": 9.090149292866391e-07, + "loss": 0.9316, + "step": 8765 + }, + { + "epoch": 0.13653311745428787, + "grad_norm": 8.131609916687012, + "learning_rate": 9.089329902820339e-07, + "loss": 0.8358, + "step": 8770 + }, + { + "epoch": 0.13661095845625726, + "grad_norm": 6.380354881286621, + "learning_rate": 9.08851051277429e-07, + "loss": 0.8002, + "step": 8775 + }, + { + "epoch": 0.13668879945822662, + "grad_norm": 3.922022581100464, + "learning_rate": 9.087691122728241e-07, + "loss": 0.7736, + "step": 8780 + }, + { + "epoch": 0.13676664046019602, + "grad_norm": 7.275602340698242, + "learning_rate": 9.08687173268219e-07, + "loss": 0.8083, + "step": 8785 + }, + { + "epoch": 0.13684448146216538, + "grad_norm": 4.583987236022949, + "learning_rate": 9.086052342636141e-07, + "loss": 0.7587, + "step": 8790 + }, + { + "epoch": 0.13692232246413477, + "grad_norm": 2.945908784866333, + "learning_rate": 9.085232952590092e-07, + "loss": 0.7746, + "step": 8795 + }, + { + "epoch": 0.13700016346610414, + "grad_norm": 3.0301320552825928, + "learning_rate": 9.084413562544042e-07, + "loss": 0.7701, + "step": 8800 + }, + { + "epoch": 0.1370780044680735, + "grad_norm": 2.6927056312561035, + "learning_rate": 9.083594172497992e-07, + "loss": 0.6471, + "step": 8805 + }, + { + "epoch": 0.1371558454700429, + "grad_norm": 3.8082404136657715, + "learning_rate": 9.082774782451942e-07, + "loss": 0.8069, + "step": 8810 + }, + { + "epoch": 0.13723368647201226, + "grad_norm": 3.635481595993042, + "learning_rate": 9.081955392405892e-07, + "loss": 0.7559, + "step": 8815 + }, + { + "epoch": 0.13731152747398165, + "grad_norm": 3.12910795211792, + "learning_rate": 9.081136002359843e-07, + "loss": 0.7618, + "step": 8820 + }, + { + "epoch": 0.137389368475951, + "grad_norm": 4.329128742218018, + "learning_rate": 9.080316612313793e-07, + "loss": 0.7491, + "step": 8825 + }, + { + "epoch": 0.1374672094779204, + "grad_norm": 3.8912899494171143, + "learning_rate": 9.079497222267743e-07, + "loss": 0.8315, + "step": 8830 + }, + { + "epoch": 0.13754505047988977, + "grad_norm": 5.182728290557861, + "learning_rate": 9.078677832221694e-07, + "loss": 0.8504, + "step": 8835 + }, + { + "epoch": 0.13762289148185916, + "grad_norm": 2.6342267990112305, + "learning_rate": 9.077858442175645e-07, + "loss": 0.8111, + "step": 8840 + }, + { + "epoch": 0.13770073248382853, + "grad_norm": 5.059619903564453, + "learning_rate": 9.077039052129595e-07, + "loss": 0.9168, + "step": 8845 + }, + { + "epoch": 0.13777857348579792, + "grad_norm": 3.4851198196411133, + "learning_rate": 9.076219662083544e-07, + "loss": 0.8299, + "step": 8850 + }, + { + "epoch": 0.13785641448776728, + "grad_norm": 8.638031959533691, + "learning_rate": 9.075400272037495e-07, + "loss": 0.6855, + "step": 8855 + }, + { + "epoch": 0.13793425548973667, + "grad_norm": 4.0812811851501465, + "learning_rate": 9.074580881991445e-07, + "loss": 0.7596, + "step": 8860 + }, + { + "epoch": 0.13801209649170604, + "grad_norm": 3.0092952251434326, + "learning_rate": 9.073761491945395e-07, + "loss": 0.7839, + "step": 8865 + }, + { + "epoch": 0.13808993749367543, + "grad_norm": 3.5875368118286133, + "learning_rate": 9.072942101899346e-07, + "loss": 0.8173, + "step": 8870 + }, + { + "epoch": 0.1381677784956448, + "grad_norm": 6.807621955871582, + "learning_rate": 9.072122711853297e-07, + "loss": 0.8204, + "step": 8875 + }, + { + "epoch": 0.13824561949761419, + "grad_norm": 5.541608810424805, + "learning_rate": 9.071303321807246e-07, + "loss": 0.7516, + "step": 8880 + }, + { + "epoch": 0.13832346049958355, + "grad_norm": 4.05478048324585, + "learning_rate": 9.070483931761197e-07, + "loss": 0.841, + "step": 8885 + }, + { + "epoch": 0.13840130150155291, + "grad_norm": 3.128432512283325, + "learning_rate": 9.069664541715148e-07, + "loss": 0.694, + "step": 8890 + }, + { + "epoch": 0.1384791425035223, + "grad_norm": 4.354421138763428, + "learning_rate": 9.068845151669096e-07, + "loss": 0.885, + "step": 8895 + }, + { + "epoch": 0.13855698350549167, + "grad_norm": 4.6781134605407715, + "learning_rate": 9.068025761623047e-07, + "loss": 0.9041, + "step": 8900 + }, + { + "epoch": 0.13863482450746106, + "grad_norm": 12.059392929077148, + "learning_rate": 9.067206371576998e-07, + "loss": 0.7997, + "step": 8905 + }, + { + "epoch": 0.13871266550943043, + "grad_norm": 2.994907855987549, + "learning_rate": 9.066386981530948e-07, + "loss": 0.7988, + "step": 8910 + }, + { + "epoch": 0.13879050651139982, + "grad_norm": 4.420156478881836, + "learning_rate": 9.065567591484898e-07, + "loss": 0.8653, + "step": 8915 + }, + { + "epoch": 0.13886834751336918, + "grad_norm": 8.454998016357422, + "learning_rate": 9.064748201438849e-07, + "loss": 0.8991, + "step": 8920 + }, + { + "epoch": 0.13894618851533858, + "grad_norm": 3.3839731216430664, + "learning_rate": 9.063928811392799e-07, + "loss": 0.7173, + "step": 8925 + }, + { + "epoch": 0.13902402951730794, + "grad_norm": 5.453253746032715, + "learning_rate": 9.06310942134675e-07, + "loss": 0.7141, + "step": 8930 + }, + { + "epoch": 0.13910187051927733, + "grad_norm": 9.229926109313965, + "learning_rate": 9.062290031300699e-07, + "loss": 0.7292, + "step": 8935 + }, + { + "epoch": 0.1391797115212467, + "grad_norm": 5.860846042633057, + "learning_rate": 9.061470641254649e-07, + "loss": 0.8563, + "step": 8940 + }, + { + "epoch": 0.1392575525232161, + "grad_norm": 4.182551383972168, + "learning_rate": 9.0606512512086e-07, + "loss": 0.7552, + "step": 8945 + }, + { + "epoch": 0.13933539352518545, + "grad_norm": 3.165614604949951, + "learning_rate": 9.05983186116255e-07, + "loss": 0.7887, + "step": 8950 + }, + { + "epoch": 0.13941323452715484, + "grad_norm": 3.805906057357788, + "learning_rate": 9.0590124711165e-07, + "loss": 0.8226, + "step": 8955 + }, + { + "epoch": 0.1394910755291242, + "grad_norm": 4.190842151641846, + "learning_rate": 9.058193081070451e-07, + "loss": 0.8951, + "step": 8960 + }, + { + "epoch": 0.1395689165310936, + "grad_norm": 3.0468575954437256, + "learning_rate": 9.057373691024402e-07, + "loss": 0.7346, + "step": 8965 + }, + { + "epoch": 0.13964675753306297, + "grad_norm": 4.917840003967285, + "learning_rate": 9.056554300978352e-07, + "loss": 0.8478, + "step": 8970 + }, + { + "epoch": 0.13972459853503233, + "grad_norm": 3.5989246368408203, + "learning_rate": 9.055734910932301e-07, + "loss": 0.702, + "step": 8975 + }, + { + "epoch": 0.13980243953700172, + "grad_norm": 3.706799030303955, + "learning_rate": 9.054915520886252e-07, + "loss": 0.6731, + "step": 8980 + }, + { + "epoch": 0.13988028053897109, + "grad_norm": 2.194471836090088, + "learning_rate": 9.054096130840202e-07, + "loss": 0.7718, + "step": 8985 + }, + { + "epoch": 0.13995812154094048, + "grad_norm": 4.610592842102051, + "learning_rate": 9.053276740794152e-07, + "loss": 0.657, + "step": 8990 + }, + { + "epoch": 0.14003596254290984, + "grad_norm": 9.641939163208008, + "learning_rate": 9.052457350748103e-07, + "loss": 0.7753, + "step": 8995 + }, + { + "epoch": 0.14011380354487923, + "grad_norm": 4.634001731872559, + "learning_rate": 9.051637960702054e-07, + "loss": 0.8575, + "step": 9000 + }, + { + "epoch": 0.1401916445468486, + "grad_norm": 4.204237461090088, + "learning_rate": 9.050818570656003e-07, + "loss": 0.8538, + "step": 9005 + }, + { + "epoch": 0.140269485548818, + "grad_norm": 4.357415199279785, + "learning_rate": 9.049999180609954e-07, + "loss": 0.7938, + "step": 9010 + }, + { + "epoch": 0.14034732655078735, + "grad_norm": 6.758501052856445, + "learning_rate": 9.049179790563904e-07, + "loss": 0.867, + "step": 9015 + }, + { + "epoch": 0.14042516755275675, + "grad_norm": 4.51594877243042, + "learning_rate": 9.048360400517853e-07, + "loss": 0.7706, + "step": 9020 + }, + { + "epoch": 0.1405030085547261, + "grad_norm": 3.3741414546966553, + "learning_rate": 9.047541010471804e-07, + "loss": 0.8655, + "step": 9025 + }, + { + "epoch": 0.1405808495566955, + "grad_norm": 5.02528190612793, + "learning_rate": 9.046721620425755e-07, + "loss": 0.7714, + "step": 9030 + }, + { + "epoch": 0.14065869055866487, + "grad_norm": 3.8374040126800537, + "learning_rate": 9.045902230379705e-07, + "loss": 0.8834, + "step": 9035 + }, + { + "epoch": 0.14073653156063426, + "grad_norm": 3.829531192779541, + "learning_rate": 9.045082840333655e-07, + "loss": 0.8477, + "step": 9040 + }, + { + "epoch": 0.14081437256260362, + "grad_norm": 7.790329456329346, + "learning_rate": 9.044263450287606e-07, + "loss": 0.8359, + "step": 9045 + }, + { + "epoch": 0.14089221356457302, + "grad_norm": 4.2702460289001465, + "learning_rate": 9.043444060241556e-07, + "loss": 0.8458, + "step": 9050 + }, + { + "epoch": 0.14097005456654238, + "grad_norm": 5.226367950439453, + "learning_rate": 9.042624670195506e-07, + "loss": 0.9577, + "step": 9055 + }, + { + "epoch": 0.14104789556851174, + "grad_norm": 4.627621650695801, + "learning_rate": 9.041805280149456e-07, + "loss": 0.8082, + "step": 9060 + }, + { + "epoch": 0.14112573657048114, + "grad_norm": 4.152894973754883, + "learning_rate": 9.040985890103406e-07, + "loss": 0.6948, + "step": 9065 + }, + { + "epoch": 0.1412035775724505, + "grad_norm": 6.219531059265137, + "learning_rate": 9.040166500057357e-07, + "loss": 0.8201, + "step": 9070 + }, + { + "epoch": 0.1412814185744199, + "grad_norm": 4.419485569000244, + "learning_rate": 9.039347110011307e-07, + "loss": 0.8803, + "step": 9075 + }, + { + "epoch": 0.14135925957638926, + "grad_norm": 4.359714508056641, + "learning_rate": 9.038527719965257e-07, + "loss": 0.7647, + "step": 9080 + }, + { + "epoch": 0.14143710057835865, + "grad_norm": 6.366148948669434, + "learning_rate": 9.037708329919208e-07, + "loss": 0.8826, + "step": 9085 + }, + { + "epoch": 0.141514941580328, + "grad_norm": 3.5685646533966064, + "learning_rate": 9.036888939873159e-07, + "loss": 0.7997, + "step": 9090 + }, + { + "epoch": 0.1415927825822974, + "grad_norm": 3.0451033115386963, + "learning_rate": 9.036069549827107e-07, + "loss": 0.7774, + "step": 9095 + }, + { + "epoch": 0.14167062358426677, + "grad_norm": 4.7669291496276855, + "learning_rate": 9.035250159781058e-07, + "loss": 0.8111, + "step": 9100 + }, + { + "epoch": 0.14174846458623616, + "grad_norm": 3.8222289085388184, + "learning_rate": 9.034430769735009e-07, + "loss": 0.8369, + "step": 9105 + }, + { + "epoch": 0.14182630558820553, + "grad_norm": 9.232769966125488, + "learning_rate": 9.03361137968896e-07, + "loss": 0.7547, + "step": 9110 + }, + { + "epoch": 0.14190414659017492, + "grad_norm": 2.8610751628875732, + "learning_rate": 9.032791989642909e-07, + "loss": 0.6919, + "step": 9115 + }, + { + "epoch": 0.14198198759214428, + "grad_norm": 5.900112152099609, + "learning_rate": 9.03197259959686e-07, + "loss": 0.8343, + "step": 9120 + }, + { + "epoch": 0.14205982859411367, + "grad_norm": 4.022780418395996, + "learning_rate": 9.031153209550811e-07, + "loss": 0.867, + "step": 9125 + }, + { + "epoch": 0.14213766959608304, + "grad_norm": 3.068723201751709, + "learning_rate": 9.03033381950476e-07, + "loss": 0.7645, + "step": 9130 + }, + { + "epoch": 0.14221551059805243, + "grad_norm": 3.156970977783203, + "learning_rate": 9.02951442945871e-07, + "loss": 0.6631, + "step": 9135 + }, + { + "epoch": 0.1422933516000218, + "grad_norm": 3.7697770595550537, + "learning_rate": 9.028695039412661e-07, + "loss": 0.7945, + "step": 9140 + }, + { + "epoch": 0.1423711926019912, + "grad_norm": 9.904714584350586, + "learning_rate": 9.027875649366611e-07, + "loss": 0.8339, + "step": 9145 + }, + { + "epoch": 0.14244903360396055, + "grad_norm": 3.7701642513275146, + "learning_rate": 9.027056259320561e-07, + "loss": 0.7597, + "step": 9150 + }, + { + "epoch": 0.14252687460592992, + "grad_norm": 4.5266828536987305, + "learning_rate": 9.026236869274512e-07, + "loss": 0.7383, + "step": 9155 + }, + { + "epoch": 0.1426047156078993, + "grad_norm": 5.614555358886719, + "learning_rate": 9.025417479228462e-07, + "loss": 0.7889, + "step": 9160 + }, + { + "epoch": 0.14268255660986867, + "grad_norm": 7.812586784362793, + "learning_rate": 9.024598089182412e-07, + "loss": 0.8535, + "step": 9165 + }, + { + "epoch": 0.14276039761183806, + "grad_norm": 5.218164443969727, + "learning_rate": 9.023778699136363e-07, + "loss": 0.814, + "step": 9170 + }, + { + "epoch": 0.14283823861380743, + "grad_norm": 2.824345588684082, + "learning_rate": 9.022959309090312e-07, + "loss": 0.8305, + "step": 9175 + }, + { + "epoch": 0.14291607961577682, + "grad_norm": 3.954688787460327, + "learning_rate": 9.022139919044263e-07, + "loss": 0.8707, + "step": 9180 + }, + { + "epoch": 0.14299392061774618, + "grad_norm": 4.215836048126221, + "learning_rate": 9.021320528998213e-07, + "loss": 0.9382, + "step": 9185 + }, + { + "epoch": 0.14307176161971558, + "grad_norm": 9.17077350616455, + "learning_rate": 9.020501138952163e-07, + "loss": 0.8275, + "step": 9190 + }, + { + "epoch": 0.14314960262168494, + "grad_norm": 6.527652740478516, + "learning_rate": 9.019681748906114e-07, + "loss": 0.7504, + "step": 9195 + }, + { + "epoch": 0.14322744362365433, + "grad_norm": 3.313584089279175, + "learning_rate": 9.018862358860065e-07, + "loss": 0.8984, + "step": 9200 + }, + { + "epoch": 0.1433052846256237, + "grad_norm": 4.872673988342285, + "learning_rate": 9.018042968814014e-07, + "loss": 0.9078, + "step": 9205 + }, + { + "epoch": 0.1433831256275931, + "grad_norm": 4.736050128936768, + "learning_rate": 9.017223578767965e-07, + "loss": 0.7167, + "step": 9210 + }, + { + "epoch": 0.14346096662956245, + "grad_norm": 3.0113682746887207, + "learning_rate": 9.016404188721916e-07, + "loss": 0.901, + "step": 9215 + }, + { + "epoch": 0.14353880763153185, + "grad_norm": 4.565042495727539, + "learning_rate": 9.015584798675864e-07, + "loss": 0.7495, + "step": 9220 + }, + { + "epoch": 0.1436166486335012, + "grad_norm": 3.0236637592315674, + "learning_rate": 9.014765408629815e-07, + "loss": 0.7655, + "step": 9225 + }, + { + "epoch": 0.1436944896354706, + "grad_norm": 5.903986930847168, + "learning_rate": 9.013946018583766e-07, + "loss": 0.8993, + "step": 9230 + }, + { + "epoch": 0.14377233063743997, + "grad_norm": 4.84224796295166, + "learning_rate": 9.013126628537717e-07, + "loss": 0.7633, + "step": 9235 + }, + { + "epoch": 0.14385017163940933, + "grad_norm": 3.5455453395843506, + "learning_rate": 9.012307238491666e-07, + "loss": 0.7984, + "step": 9240 + }, + { + "epoch": 0.14392801264137872, + "grad_norm": 9.098531723022461, + "learning_rate": 9.011487848445617e-07, + "loss": 0.8721, + "step": 9245 + }, + { + "epoch": 0.1440058536433481, + "grad_norm": 3.4469218254089355, + "learning_rate": 9.010668458399568e-07, + "loss": 0.7364, + "step": 9250 + }, + { + "epoch": 0.14408369464531748, + "grad_norm": 3.0461971759796143, + "learning_rate": 9.009849068353517e-07, + "loss": 0.8158, + "step": 9255 + }, + { + "epoch": 0.14416153564728684, + "grad_norm": 5.753521919250488, + "learning_rate": 9.009029678307467e-07, + "loss": 0.7637, + "step": 9260 + }, + { + "epoch": 0.14423937664925623, + "grad_norm": 10.035242080688477, + "learning_rate": 9.008210288261418e-07, + "loss": 0.8554, + "step": 9265 + }, + { + "epoch": 0.1443172176512256, + "grad_norm": 3.796072483062744, + "learning_rate": 9.007390898215368e-07, + "loss": 0.8241, + "step": 9270 + }, + { + "epoch": 0.144395058653195, + "grad_norm": 4.330554962158203, + "learning_rate": 9.006571508169318e-07, + "loss": 0.7456, + "step": 9275 + }, + { + "epoch": 0.14447289965516436, + "grad_norm": 3.3564655780792236, + "learning_rate": 9.005752118123269e-07, + "loss": 0.8016, + "step": 9280 + }, + { + "epoch": 0.14455074065713375, + "grad_norm": 2.9985568523406982, + "learning_rate": 9.004932728077219e-07, + "loss": 0.8099, + "step": 9285 + }, + { + "epoch": 0.1446285816591031, + "grad_norm": 2.705263376235962, + "learning_rate": 9.00411333803117e-07, + "loss": 0.7769, + "step": 9290 + }, + { + "epoch": 0.1447064226610725, + "grad_norm": 3.700831890106201, + "learning_rate": 9.00329394798512e-07, + "loss": 0.7327, + "step": 9295 + }, + { + "epoch": 0.14478426366304187, + "grad_norm": 6.56169319152832, + "learning_rate": 9.002474557939069e-07, + "loss": 0.7674, + "step": 9300 + }, + { + "epoch": 0.14486210466501126, + "grad_norm": 4.469850063323975, + "learning_rate": 9.00165516789302e-07, + "loss": 0.8658, + "step": 9305 + }, + { + "epoch": 0.14493994566698062, + "grad_norm": 8.878783226013184, + "learning_rate": 9.00083577784697e-07, + "loss": 0.7462, + "step": 9310 + }, + { + "epoch": 0.14501778666895002, + "grad_norm": 2.435040235519409, + "learning_rate": 9.00001638780092e-07, + "loss": 0.8363, + "step": 9315 + }, + { + "epoch": 0.14509562767091938, + "grad_norm": 3.0248606204986572, + "learning_rate": 8.999196997754871e-07, + "loss": 0.8078, + "step": 9320 + }, + { + "epoch": 0.14517346867288874, + "grad_norm": 3.466975450515747, + "learning_rate": 8.998377607708822e-07, + "loss": 0.7242, + "step": 9325 + }, + { + "epoch": 0.14525130967485814, + "grad_norm": 3.6273536682128906, + "learning_rate": 8.997558217662771e-07, + "loss": 0.8677, + "step": 9330 + }, + { + "epoch": 0.1453291506768275, + "grad_norm": 6.970489025115967, + "learning_rate": 8.996738827616722e-07, + "loss": 0.8854, + "step": 9335 + }, + { + "epoch": 0.1454069916787969, + "grad_norm": 3.735153913497925, + "learning_rate": 8.995919437570672e-07, + "loss": 0.7732, + "step": 9340 + }, + { + "epoch": 0.14548483268076626, + "grad_norm": 8.700521469116211, + "learning_rate": 8.995100047524621e-07, + "loss": 0.9042, + "step": 9345 + }, + { + "epoch": 0.14556267368273565, + "grad_norm": 7.9652276039123535, + "learning_rate": 8.994280657478572e-07, + "loss": 0.867, + "step": 9350 + }, + { + "epoch": 0.145640514684705, + "grad_norm": 3.8336143493652344, + "learning_rate": 8.993461267432523e-07, + "loss": 0.8283, + "step": 9355 + }, + { + "epoch": 0.1457183556866744, + "grad_norm": 7.037674903869629, + "learning_rate": 8.992641877386474e-07, + "loss": 0.7827, + "step": 9360 + }, + { + "epoch": 0.14579619668864377, + "grad_norm": 6.455174922943115, + "learning_rate": 8.991822487340423e-07, + "loss": 0.7666, + "step": 9365 + }, + { + "epoch": 0.14587403769061316, + "grad_norm": 3.610822916030884, + "learning_rate": 8.991003097294374e-07, + "loss": 0.8234, + "step": 9370 + }, + { + "epoch": 0.14595187869258253, + "grad_norm": 4.823882102966309, + "learning_rate": 8.990183707248325e-07, + "loss": 0.7568, + "step": 9375 + }, + { + "epoch": 0.14602971969455192, + "grad_norm": 5.625290393829346, + "learning_rate": 8.989364317202273e-07, + "loss": 0.7957, + "step": 9380 + }, + { + "epoch": 0.14610756069652128, + "grad_norm": 5.454258441925049, + "learning_rate": 8.988544927156224e-07, + "loss": 0.6412, + "step": 9385 + }, + { + "epoch": 0.14618540169849067, + "grad_norm": 5.896919250488281, + "learning_rate": 8.987725537110175e-07, + "loss": 0.7889, + "step": 9390 + }, + { + "epoch": 0.14626324270046004, + "grad_norm": 3.3774161338806152, + "learning_rate": 8.986906147064125e-07, + "loss": 0.9058, + "step": 9395 + }, + { + "epoch": 0.14634108370242943, + "grad_norm": 3.2311689853668213, + "learning_rate": 8.986086757018075e-07, + "loss": 0.7764, + "step": 9400 + }, + { + "epoch": 0.1464189247043988, + "grad_norm": 5.298305511474609, + "learning_rate": 8.985267366972026e-07, + "loss": 0.7435, + "step": 9405 + }, + { + "epoch": 0.14649676570636816, + "grad_norm": 4.45401668548584, + "learning_rate": 8.984447976925976e-07, + "loss": 0.7631, + "step": 9410 + }, + { + "epoch": 0.14657460670833755, + "grad_norm": 3.3566505908966064, + "learning_rate": 8.983628586879927e-07, + "loss": 0.8497, + "step": 9415 + }, + { + "epoch": 0.14665244771030692, + "grad_norm": 3.440115213394165, + "learning_rate": 8.982809196833876e-07, + "loss": 0.7614, + "step": 9420 + }, + { + "epoch": 0.1467302887122763, + "grad_norm": 3.5758979320526123, + "learning_rate": 8.981989806787826e-07, + "loss": 0.651, + "step": 9425 + }, + { + "epoch": 0.14680812971424567, + "grad_norm": 4.706757545471191, + "learning_rate": 8.981170416741777e-07, + "loss": 0.9389, + "step": 9430 + }, + { + "epoch": 0.14688597071621506, + "grad_norm": 4.142934799194336, + "learning_rate": 8.980351026695727e-07, + "loss": 0.9398, + "step": 9435 + }, + { + "epoch": 0.14696381171818443, + "grad_norm": 3.549654483795166, + "learning_rate": 8.979531636649677e-07, + "loss": 0.7552, + "step": 9440 + }, + { + "epoch": 0.14704165272015382, + "grad_norm": 3.9092509746551514, + "learning_rate": 8.978712246603628e-07, + "loss": 0.8207, + "step": 9445 + }, + { + "epoch": 0.14711949372212318, + "grad_norm": 4.208457946777344, + "learning_rate": 8.977892856557579e-07, + "loss": 0.7046, + "step": 9450 + }, + { + "epoch": 0.14719733472409258, + "grad_norm": 8.339073181152344, + "learning_rate": 8.977073466511528e-07, + "loss": 0.7736, + "step": 9455 + }, + { + "epoch": 0.14727517572606194, + "grad_norm": 2.834137439727783, + "learning_rate": 8.976254076465478e-07, + "loss": 0.893, + "step": 9460 + }, + { + "epoch": 0.14735301672803133, + "grad_norm": 4.870460033416748, + "learning_rate": 8.975434686419429e-07, + "loss": 0.819, + "step": 9465 + }, + { + "epoch": 0.1474308577300007, + "grad_norm": 4.1774582862854, + "learning_rate": 8.974615296373378e-07, + "loss": 0.7164, + "step": 9470 + }, + { + "epoch": 0.1475086987319701, + "grad_norm": 3.573882818222046, + "learning_rate": 8.973795906327329e-07, + "loss": 0.7982, + "step": 9475 + }, + { + "epoch": 0.14758653973393945, + "grad_norm": 3.5654754638671875, + "learning_rate": 8.97297651628128e-07, + "loss": 0.8293, + "step": 9480 + }, + { + "epoch": 0.14766438073590885, + "grad_norm": 3.387242078781128, + "learning_rate": 8.972157126235231e-07, + "loss": 0.7351, + "step": 9485 + }, + { + "epoch": 0.1477422217378782, + "grad_norm": 7.289836883544922, + "learning_rate": 8.97133773618918e-07, + "loss": 0.8013, + "step": 9490 + }, + { + "epoch": 0.14782006273984757, + "grad_norm": 4.657222270965576, + "learning_rate": 8.970518346143131e-07, + "loss": 0.8237, + "step": 9495 + }, + { + "epoch": 0.14789790374181697, + "grad_norm": 4.462035179138184, + "learning_rate": 8.969698956097081e-07, + "loss": 0.7025, + "step": 9500 + }, + { + "epoch": 0.14797574474378633, + "grad_norm": 4.082338809967041, + "learning_rate": 8.968879566051031e-07, + "loss": 0.7665, + "step": 9505 + }, + { + "epoch": 0.14805358574575572, + "grad_norm": 4.678539276123047, + "learning_rate": 8.968060176004981e-07, + "loss": 0.8325, + "step": 9510 + }, + { + "epoch": 0.1481314267477251, + "grad_norm": 4.395145893096924, + "learning_rate": 8.967240785958932e-07, + "loss": 0.7172, + "step": 9515 + }, + { + "epoch": 0.14820926774969448, + "grad_norm": 3.5010783672332764, + "learning_rate": 8.966421395912882e-07, + "loss": 0.7459, + "step": 9520 + }, + { + "epoch": 0.14828710875166384, + "grad_norm": 5.013630390167236, + "learning_rate": 8.965602005866832e-07, + "loss": 0.7536, + "step": 9525 + }, + { + "epoch": 0.14836494975363324, + "grad_norm": 3.0716755390167236, + "learning_rate": 8.964782615820783e-07, + "loss": 0.8559, + "step": 9530 + }, + { + "epoch": 0.1484427907556026, + "grad_norm": 8.94082260131836, + "learning_rate": 8.963963225774733e-07, + "loss": 0.8765, + "step": 9535 + }, + { + "epoch": 0.148520631757572, + "grad_norm": 5.629791736602783, + "learning_rate": 8.963143835728684e-07, + "loss": 0.9076, + "step": 9540 + }, + { + "epoch": 0.14859847275954136, + "grad_norm": 4.679615020751953, + "learning_rate": 8.962324445682633e-07, + "loss": 0.7733, + "step": 9545 + }, + { + "epoch": 0.14867631376151075, + "grad_norm": 2.7258129119873047, + "learning_rate": 8.961505055636583e-07, + "loss": 0.7082, + "step": 9550 + }, + { + "epoch": 0.1487541547634801, + "grad_norm": 4.701254844665527, + "learning_rate": 8.960685665590534e-07, + "loss": 0.6848, + "step": 9555 + }, + { + "epoch": 0.1488319957654495, + "grad_norm": 3.3973538875579834, + "learning_rate": 8.959866275544485e-07, + "loss": 0.87, + "step": 9560 + }, + { + "epoch": 0.14890983676741887, + "grad_norm": 5.04672384262085, + "learning_rate": 8.959046885498434e-07, + "loss": 0.7941, + "step": 9565 + }, + { + "epoch": 0.14898767776938826, + "grad_norm": 5.399540901184082, + "learning_rate": 8.958227495452385e-07, + "loss": 0.7677, + "step": 9570 + }, + { + "epoch": 0.14906551877135762, + "grad_norm": 3.8206582069396973, + "learning_rate": 8.957408105406336e-07, + "loss": 0.7326, + "step": 9575 + }, + { + "epoch": 0.149143359773327, + "grad_norm": 2.07293963432312, + "learning_rate": 8.956588715360285e-07, + "loss": 0.7492, + "step": 9580 + }, + { + "epoch": 0.14922120077529638, + "grad_norm": 8.431818008422852, + "learning_rate": 8.955769325314235e-07, + "loss": 0.7455, + "step": 9585 + }, + { + "epoch": 0.14929904177726575, + "grad_norm": 7.549500465393066, + "learning_rate": 8.954949935268186e-07, + "loss": 0.7746, + "step": 9590 + }, + { + "epoch": 0.14937688277923514, + "grad_norm": 5.316057205200195, + "learning_rate": 8.954130545222136e-07, + "loss": 0.7876, + "step": 9595 + }, + { + "epoch": 0.1494547237812045, + "grad_norm": 7.522194862365723, + "learning_rate": 8.953311155176086e-07, + "loss": 0.802, + "step": 9600 + }, + { + "epoch": 0.1495325647831739, + "grad_norm": 3.6271915435791016, + "learning_rate": 8.952491765130037e-07, + "loss": 0.7795, + "step": 9605 + }, + { + "epoch": 0.14961040578514326, + "grad_norm": 5.297993183135986, + "learning_rate": 8.951672375083988e-07, + "loss": 0.7834, + "step": 9610 + }, + { + "epoch": 0.14968824678711265, + "grad_norm": 3.3175265789031982, + "learning_rate": 8.950852985037937e-07, + "loss": 0.6759, + "step": 9615 + }, + { + "epoch": 0.14976608778908201, + "grad_norm": 3.857908010482788, + "learning_rate": 8.950033594991888e-07, + "loss": 0.7961, + "step": 9620 + }, + { + "epoch": 0.1498439287910514, + "grad_norm": 6.590905666351318, + "learning_rate": 8.949214204945838e-07, + "loss": 0.8008, + "step": 9625 + }, + { + "epoch": 0.14992176979302077, + "grad_norm": 3.9845802783966064, + "learning_rate": 8.948394814899788e-07, + "loss": 0.8439, + "step": 9630 + }, + { + "epoch": 0.14999961079499016, + "grad_norm": 4.234025001525879, + "learning_rate": 8.947575424853738e-07, + "loss": 0.829, + "step": 9635 + }, + { + "epoch": 0.15007745179695953, + "grad_norm": 4.937478542327881, + "learning_rate": 8.946756034807689e-07, + "loss": 0.8498, + "step": 9640 + }, + { + "epoch": 0.15015529279892892, + "grad_norm": 5.999597072601318, + "learning_rate": 8.945936644761639e-07, + "loss": 0.7593, + "step": 9645 + }, + { + "epoch": 0.15023313380089828, + "grad_norm": 5.461806774139404, + "learning_rate": 8.94511725471559e-07, + "loss": 0.8598, + "step": 9650 + }, + { + "epoch": 0.15031097480286768, + "grad_norm": 5.157901763916016, + "learning_rate": 8.94429786466954e-07, + "loss": 0.8738, + "step": 9655 + }, + { + "epoch": 0.15038881580483704, + "grad_norm": 3.0504953861236572, + "learning_rate": 8.94347847462349e-07, + "loss": 0.7486, + "step": 9660 + }, + { + "epoch": 0.1504666568068064, + "grad_norm": 10.305487632751465, + "learning_rate": 8.94265908457744e-07, + "loss": 0.7844, + "step": 9665 + }, + { + "epoch": 0.1505444978087758, + "grad_norm": 3.5192887783050537, + "learning_rate": 8.94183969453139e-07, + "loss": 0.8386, + "step": 9670 + }, + { + "epoch": 0.15062233881074516, + "grad_norm": 4.425221920013428, + "learning_rate": 8.94102030448534e-07, + "loss": 0.7428, + "step": 9675 + }, + { + "epoch": 0.15070017981271455, + "grad_norm": 5.125747203826904, + "learning_rate": 8.940200914439291e-07, + "loss": 0.7855, + "step": 9680 + }, + { + "epoch": 0.15077802081468392, + "grad_norm": 5.673059463500977, + "learning_rate": 8.939381524393242e-07, + "loss": 0.9193, + "step": 9685 + }, + { + "epoch": 0.1508558618166533, + "grad_norm": 2.934014320373535, + "learning_rate": 8.938562134347191e-07, + "loss": 0.6473, + "step": 9690 + }, + { + "epoch": 0.15093370281862267, + "grad_norm": 6.39523983001709, + "learning_rate": 8.937742744301142e-07, + "loss": 0.7309, + "step": 9695 + }, + { + "epoch": 0.15101154382059206, + "grad_norm": 3.2489750385284424, + "learning_rate": 8.936923354255093e-07, + "loss": 0.8391, + "step": 9700 + }, + { + "epoch": 0.15108938482256143, + "grad_norm": 3.2904443740844727, + "learning_rate": 8.936103964209041e-07, + "loss": 0.6639, + "step": 9705 + }, + { + "epoch": 0.15116722582453082, + "grad_norm": 5.773887634277344, + "learning_rate": 8.935284574162992e-07, + "loss": 0.7859, + "step": 9710 + }, + { + "epoch": 0.15124506682650019, + "grad_norm": 4.7919697761535645, + "learning_rate": 8.934465184116943e-07, + "loss": 0.7519, + "step": 9715 + }, + { + "epoch": 0.15132290782846958, + "grad_norm": 4.2467942237854, + "learning_rate": 8.933645794070893e-07, + "loss": 0.8234, + "step": 9720 + }, + { + "epoch": 0.15140074883043894, + "grad_norm": 3.2644877433776855, + "learning_rate": 8.932826404024843e-07, + "loss": 0.6703, + "step": 9725 + }, + { + "epoch": 0.15147858983240833, + "grad_norm": 4.7553534507751465, + "learning_rate": 8.932007013978794e-07, + "loss": 0.7273, + "step": 9730 + }, + { + "epoch": 0.1515564308343777, + "grad_norm": 3.009917974472046, + "learning_rate": 8.931187623932745e-07, + "loss": 0.8056, + "step": 9735 + }, + { + "epoch": 0.1516342718363471, + "grad_norm": 2.296459436416626, + "learning_rate": 8.930368233886695e-07, + "loss": 0.7561, + "step": 9740 + }, + { + "epoch": 0.15171211283831645, + "grad_norm": 9.049659729003906, + "learning_rate": 8.929548843840644e-07, + "loss": 0.7144, + "step": 9745 + }, + { + "epoch": 0.15178995384028582, + "grad_norm": 6.223394393920898, + "learning_rate": 8.928729453794595e-07, + "loss": 0.737, + "step": 9750 + }, + { + "epoch": 0.1518677948422552, + "grad_norm": 7.632771968841553, + "learning_rate": 8.927910063748545e-07, + "loss": 0.8498, + "step": 9755 + }, + { + "epoch": 0.15194563584422457, + "grad_norm": 6.848038196563721, + "learning_rate": 8.927090673702495e-07, + "loss": 0.806, + "step": 9760 + }, + { + "epoch": 0.15202347684619397, + "grad_norm": 4.97011137008667, + "learning_rate": 8.926271283656446e-07, + "loss": 0.7157, + "step": 9765 + }, + { + "epoch": 0.15210131784816333, + "grad_norm": 3.5741093158721924, + "learning_rate": 8.925451893610396e-07, + "loss": 0.8259, + "step": 9770 + }, + { + "epoch": 0.15217915885013272, + "grad_norm": 3.7775120735168457, + "learning_rate": 8.924632503564347e-07, + "loss": 0.7166, + "step": 9775 + }, + { + "epoch": 0.1522569998521021, + "grad_norm": 3.5487003326416016, + "learning_rate": 8.923813113518297e-07, + "loss": 0.7971, + "step": 9780 + }, + { + "epoch": 0.15233484085407148, + "grad_norm": 4.203275203704834, + "learning_rate": 8.922993723472246e-07, + "loss": 0.8493, + "step": 9785 + }, + { + "epoch": 0.15241268185604084, + "grad_norm": 3.4415955543518066, + "learning_rate": 8.922174333426197e-07, + "loss": 0.7878, + "step": 9790 + }, + { + "epoch": 0.15249052285801024, + "grad_norm": 3.142867088317871, + "learning_rate": 8.921354943380148e-07, + "loss": 0.7424, + "step": 9795 + }, + { + "epoch": 0.1525683638599796, + "grad_norm": 4.120011806488037, + "learning_rate": 8.920535553334097e-07, + "loss": 0.8535, + "step": 9800 + }, + { + "epoch": 0.152646204861949, + "grad_norm": 3.5447614192962646, + "learning_rate": 8.919716163288048e-07, + "loss": 0.7767, + "step": 9805 + }, + { + "epoch": 0.15272404586391836, + "grad_norm": 3.2715671062469482, + "learning_rate": 8.918896773241999e-07, + "loss": 0.8062, + "step": 9810 + }, + { + "epoch": 0.15280188686588775, + "grad_norm": 2.4431989192962646, + "learning_rate": 8.918077383195948e-07, + "loss": 0.7578, + "step": 9815 + }, + { + "epoch": 0.1528797278678571, + "grad_norm": 3.4583024978637695, + "learning_rate": 8.917257993149899e-07, + "loss": 0.7531, + "step": 9820 + }, + { + "epoch": 0.1529575688698265, + "grad_norm": 2.993116617202759, + "learning_rate": 8.916438603103849e-07, + "loss": 0.7672, + "step": 9825 + }, + { + "epoch": 0.15303540987179587, + "grad_norm": 3.1958775520324707, + "learning_rate": 8.915619213057799e-07, + "loss": 0.8598, + "step": 9830 + }, + { + "epoch": 0.15311325087376526, + "grad_norm": 6.617530345916748, + "learning_rate": 8.914799823011749e-07, + "loss": 0.6993, + "step": 9835 + }, + { + "epoch": 0.15319109187573463, + "grad_norm": 3.9187653064727783, + "learning_rate": 8.9139804329657e-07, + "loss": 0.7635, + "step": 9840 + }, + { + "epoch": 0.153268932877704, + "grad_norm": 3.482074499130249, + "learning_rate": 8.91316104291965e-07, + "loss": 0.7737, + "step": 9845 + }, + { + "epoch": 0.15334677387967338, + "grad_norm": 9.398933410644531, + "learning_rate": 8.9123416528736e-07, + "loss": 0.8737, + "step": 9850 + }, + { + "epoch": 0.15342461488164275, + "grad_norm": 3.637568473815918, + "learning_rate": 8.911522262827551e-07, + "loss": 0.7667, + "step": 9855 + }, + { + "epoch": 0.15350245588361214, + "grad_norm": 3.3821427822113037, + "learning_rate": 8.910702872781502e-07, + "loss": 0.7836, + "step": 9860 + }, + { + "epoch": 0.1535802968855815, + "grad_norm": 2.9914300441741943, + "learning_rate": 8.909883482735452e-07, + "loss": 0.8564, + "step": 9865 + }, + { + "epoch": 0.1536581378875509, + "grad_norm": 6.171716690063477, + "learning_rate": 8.909064092689401e-07, + "loss": 0.8064, + "step": 9870 + }, + { + "epoch": 0.15373597888952026, + "grad_norm": 3.4968278408050537, + "learning_rate": 8.908244702643352e-07, + "loss": 0.7524, + "step": 9875 + }, + { + "epoch": 0.15381381989148965, + "grad_norm": 5.08829402923584, + "learning_rate": 8.907425312597302e-07, + "loss": 0.8207, + "step": 9880 + }, + { + "epoch": 0.15389166089345901, + "grad_norm": 3.5643150806427, + "learning_rate": 8.906605922551253e-07, + "loss": 0.7776, + "step": 9885 + }, + { + "epoch": 0.1539695018954284, + "grad_norm": 3.4858274459838867, + "learning_rate": 8.905786532505203e-07, + "loss": 0.8979, + "step": 9890 + }, + { + "epoch": 0.15404734289739777, + "grad_norm": 3.9792239665985107, + "learning_rate": 8.904967142459153e-07, + "loss": 0.797, + "step": 9895 + }, + { + "epoch": 0.15412518389936716, + "grad_norm": 9.95738410949707, + "learning_rate": 8.904147752413104e-07, + "loss": 0.7492, + "step": 9900 + }, + { + "epoch": 0.15420302490133653, + "grad_norm": 4.636844158172607, + "learning_rate": 8.903328362367054e-07, + "loss": 0.7867, + "step": 9905 + }, + { + "epoch": 0.15428086590330592, + "grad_norm": 7.289409637451172, + "learning_rate": 8.902508972321003e-07, + "loss": 0.7982, + "step": 9910 + }, + { + "epoch": 0.15435870690527528, + "grad_norm": 3.7324790954589844, + "learning_rate": 8.901689582274954e-07, + "loss": 0.7001, + "step": 9915 + }, + { + "epoch": 0.15443654790724468, + "grad_norm": 4.735339641571045, + "learning_rate": 8.900870192228905e-07, + "loss": 0.7155, + "step": 9920 + }, + { + "epoch": 0.15451438890921404, + "grad_norm": 8.172463417053223, + "learning_rate": 8.900050802182854e-07, + "loss": 0.8489, + "step": 9925 + }, + { + "epoch": 0.1545922299111834, + "grad_norm": 3.3313515186309814, + "learning_rate": 8.899231412136805e-07, + "loss": 0.8064, + "step": 9930 + }, + { + "epoch": 0.1546700709131528, + "grad_norm": 9.073691368103027, + "learning_rate": 8.898412022090756e-07, + "loss": 0.8577, + "step": 9935 + }, + { + "epoch": 0.15474791191512216, + "grad_norm": 7.126258373260498, + "learning_rate": 8.897592632044705e-07, + "loss": 0.7679, + "step": 9940 + }, + { + "epoch": 0.15482575291709155, + "grad_norm": 11.254960060119629, + "learning_rate": 8.896773241998656e-07, + "loss": 0.7725, + "step": 9945 + }, + { + "epoch": 0.15490359391906092, + "grad_norm": 3.706859827041626, + "learning_rate": 8.895953851952606e-07, + "loss": 0.9131, + "step": 9950 + }, + { + "epoch": 0.1549814349210303, + "grad_norm": 3.0554311275482178, + "learning_rate": 8.895134461906556e-07, + "loss": 0.7683, + "step": 9955 + }, + { + "epoch": 0.15505927592299967, + "grad_norm": 3.64799165725708, + "learning_rate": 8.894315071860506e-07, + "loss": 0.7275, + "step": 9960 + }, + { + "epoch": 0.15513711692496907, + "grad_norm": 3.383768320083618, + "learning_rate": 8.893495681814457e-07, + "loss": 0.8155, + "step": 9965 + }, + { + "epoch": 0.15521495792693843, + "grad_norm": 3.561455726623535, + "learning_rate": 8.892676291768407e-07, + "loss": 0.7256, + "step": 9970 + }, + { + "epoch": 0.15529279892890782, + "grad_norm": 3.7967913150787354, + "learning_rate": 8.891856901722358e-07, + "loss": 0.8299, + "step": 9975 + }, + { + "epoch": 0.15537063993087719, + "grad_norm": 3.6479313373565674, + "learning_rate": 8.891037511676308e-07, + "loss": 0.7851, + "step": 9980 + }, + { + "epoch": 0.15544848093284658, + "grad_norm": 4.59113883972168, + "learning_rate": 8.890218121630259e-07, + "loss": 0.8115, + "step": 9985 + }, + { + "epoch": 0.15552632193481594, + "grad_norm": 5.832945823669434, + "learning_rate": 8.889398731584208e-07, + "loss": 0.9049, + "step": 9990 + }, + { + "epoch": 0.15560416293678533, + "grad_norm": 3.0644237995147705, + "learning_rate": 8.888579341538158e-07, + "loss": 0.6787, + "step": 9995 + }, + { + "epoch": 0.1556820039387547, + "grad_norm": 4.14565372467041, + "learning_rate": 8.887759951492109e-07, + "loss": 0.8273, + "step": 10000 + }, + { + "epoch": 0.1557598449407241, + "grad_norm": 10.58462142944336, + "learning_rate": 8.886940561446059e-07, + "loss": 0.7526, + "step": 10005 + }, + { + "epoch": 0.15583768594269345, + "grad_norm": 4.249096870422363, + "learning_rate": 8.88612117140001e-07, + "loss": 0.7715, + "step": 10010 + }, + { + "epoch": 0.15591552694466282, + "grad_norm": 6.341519355773926, + "learning_rate": 8.88530178135396e-07, + "loss": 0.7869, + "step": 10015 + }, + { + "epoch": 0.1559933679466322, + "grad_norm": 2.831510543823242, + "learning_rate": 8.88448239130791e-07, + "loss": 0.807, + "step": 10020 + }, + { + "epoch": 0.15607120894860158, + "grad_norm": 3.297983169555664, + "learning_rate": 8.883663001261861e-07, + "loss": 0.7433, + "step": 10025 + }, + { + "epoch": 0.15614904995057097, + "grad_norm": 3.2775771617889404, + "learning_rate": 8.88284361121581e-07, + "loss": 0.7378, + "step": 10030 + }, + { + "epoch": 0.15622689095254033, + "grad_norm": 4.8905768394470215, + "learning_rate": 8.88202422116976e-07, + "loss": 0.9689, + "step": 10035 + }, + { + "epoch": 0.15630473195450972, + "grad_norm": 5.260064601898193, + "learning_rate": 8.881204831123711e-07, + "loss": 0.8044, + "step": 10040 + }, + { + "epoch": 0.1563825729564791, + "grad_norm": 3.0201990604400635, + "learning_rate": 8.880385441077662e-07, + "loss": 0.7943, + "step": 10045 + }, + { + "epoch": 0.15646041395844848, + "grad_norm": 3.7755088806152344, + "learning_rate": 8.879566051031611e-07, + "loss": 0.7369, + "step": 10050 + }, + { + "epoch": 0.15653825496041784, + "grad_norm": 4.074087619781494, + "learning_rate": 8.878746660985562e-07, + "loss": 0.8229, + "step": 10055 + }, + { + "epoch": 0.15661609596238724, + "grad_norm": 4.723263263702393, + "learning_rate": 8.877927270939513e-07, + "loss": 0.6489, + "step": 10060 + }, + { + "epoch": 0.1566939369643566, + "grad_norm": 4.477192401885986, + "learning_rate": 8.877107880893463e-07, + "loss": 0.8427, + "step": 10065 + }, + { + "epoch": 0.156771777966326, + "grad_norm": 2.5688400268554688, + "learning_rate": 8.876288490847412e-07, + "loss": 0.8412, + "step": 10070 + }, + { + "epoch": 0.15684961896829536, + "grad_norm": 4.53289794921875, + "learning_rate": 8.875469100801363e-07, + "loss": 0.8295, + "step": 10075 + }, + { + "epoch": 0.15692745997026475, + "grad_norm": 7.105226993560791, + "learning_rate": 8.874649710755313e-07, + "loss": 0.8648, + "step": 10080 + }, + { + "epoch": 0.1570053009722341, + "grad_norm": 2.8844571113586426, + "learning_rate": 8.873830320709263e-07, + "loss": 0.7657, + "step": 10085 + }, + { + "epoch": 0.1570831419742035, + "grad_norm": 4.569636344909668, + "learning_rate": 8.873010930663214e-07, + "loss": 0.6868, + "step": 10090 + }, + { + "epoch": 0.15716098297617287, + "grad_norm": 3.5974533557891846, + "learning_rate": 8.872191540617164e-07, + "loss": 0.9193, + "step": 10095 + }, + { + "epoch": 0.15723882397814223, + "grad_norm": 4.02683687210083, + "learning_rate": 8.871372150571115e-07, + "loss": 0.8309, + "step": 10100 + }, + { + "epoch": 0.15731666498011163, + "grad_norm": 3.6681370735168457, + "learning_rate": 8.870552760525065e-07, + "loss": 0.7995, + "step": 10105 + }, + { + "epoch": 0.157394505982081, + "grad_norm": 3.227896213531494, + "learning_rate": 8.869733370479014e-07, + "loss": 0.7712, + "step": 10110 + }, + { + "epoch": 0.15747234698405038, + "grad_norm": 7.877114295959473, + "learning_rate": 8.868913980432965e-07, + "loss": 0.7099, + "step": 10115 + }, + { + "epoch": 0.15755018798601975, + "grad_norm": 4.573225021362305, + "learning_rate": 8.868094590386915e-07, + "loss": 0.6932, + "step": 10120 + }, + { + "epoch": 0.15762802898798914, + "grad_norm": 4.980703353881836, + "learning_rate": 8.867275200340866e-07, + "loss": 0.9058, + "step": 10125 + }, + { + "epoch": 0.1577058699899585, + "grad_norm": 4.040239334106445, + "learning_rate": 8.866455810294816e-07, + "loss": 0.813, + "step": 10130 + }, + { + "epoch": 0.1577837109919279, + "grad_norm": 3.489124059677124, + "learning_rate": 8.865636420248767e-07, + "loss": 0.7237, + "step": 10135 + }, + { + "epoch": 0.15786155199389726, + "grad_norm": 2.5940945148468018, + "learning_rate": 8.864817030202717e-07, + "loss": 0.7494, + "step": 10140 + }, + { + "epoch": 0.15793939299586665, + "grad_norm": 3.312206745147705, + "learning_rate": 8.863997640156667e-07, + "loss": 0.8447, + "step": 10145 + }, + { + "epoch": 0.15801723399783602, + "grad_norm": 3.279050350189209, + "learning_rate": 8.863178250110617e-07, + "loss": 0.8192, + "step": 10150 + }, + { + "epoch": 0.1580950749998054, + "grad_norm": 3.221031904220581, + "learning_rate": 8.862358860064568e-07, + "loss": 0.8593, + "step": 10155 + }, + { + "epoch": 0.15817291600177477, + "grad_norm": 5.654365062713623, + "learning_rate": 8.861539470018517e-07, + "loss": 0.8396, + "step": 10160 + }, + { + "epoch": 0.15825075700374416, + "grad_norm": 3.0688130855560303, + "learning_rate": 8.860720079972468e-07, + "loss": 0.6908, + "step": 10165 + }, + { + "epoch": 0.15832859800571353, + "grad_norm": 4.515695571899414, + "learning_rate": 8.859900689926419e-07, + "loss": 0.8116, + "step": 10170 + }, + { + "epoch": 0.15840643900768292, + "grad_norm": 5.167048454284668, + "learning_rate": 8.859081299880368e-07, + "loss": 0.8373, + "step": 10175 + }, + { + "epoch": 0.15848428000965228, + "grad_norm": 8.9721097946167, + "learning_rate": 8.858261909834319e-07, + "loss": 0.9478, + "step": 10180 + }, + { + "epoch": 0.15856212101162165, + "grad_norm": 3.8844072818756104, + "learning_rate": 8.85744251978827e-07, + "loss": 0.8713, + "step": 10185 + }, + { + "epoch": 0.15863996201359104, + "grad_norm": 3.705676794052124, + "learning_rate": 8.85662312974222e-07, + "loss": 0.817, + "step": 10190 + }, + { + "epoch": 0.1587178030155604, + "grad_norm": 4.741096496582031, + "learning_rate": 8.855803739696169e-07, + "loss": 0.8058, + "step": 10195 + }, + { + "epoch": 0.1587956440175298, + "grad_norm": 6.481576442718506, + "learning_rate": 8.85498434965012e-07, + "loss": 0.9077, + "step": 10200 + }, + { + "epoch": 0.15887348501949916, + "grad_norm": 5.847426414489746, + "learning_rate": 8.85416495960407e-07, + "loss": 0.7257, + "step": 10205 + }, + { + "epoch": 0.15895132602146855, + "grad_norm": 3.6452407836914062, + "learning_rate": 8.85334556955802e-07, + "loss": 0.7282, + "step": 10210 + }, + { + "epoch": 0.15902916702343792, + "grad_norm": 3.5716402530670166, + "learning_rate": 8.852526179511971e-07, + "loss": 0.7504, + "step": 10215 + }, + { + "epoch": 0.1591070080254073, + "grad_norm": 4.702507495880127, + "learning_rate": 8.851706789465921e-07, + "loss": 0.8345, + "step": 10220 + }, + { + "epoch": 0.15918484902737667, + "grad_norm": 3.6276495456695557, + "learning_rate": 8.850887399419872e-07, + "loss": 0.8267, + "step": 10225 + }, + { + "epoch": 0.15926269002934607, + "grad_norm": 3.432413101196289, + "learning_rate": 8.850068009373822e-07, + "loss": 0.9476, + "step": 10230 + }, + { + "epoch": 0.15934053103131543, + "grad_norm": 4.081861972808838, + "learning_rate": 8.849248619327771e-07, + "loss": 0.7688, + "step": 10235 + }, + { + "epoch": 0.15941837203328482, + "grad_norm": 6.799472332000732, + "learning_rate": 8.848429229281722e-07, + "loss": 0.7084, + "step": 10240 + }, + { + "epoch": 0.1594962130352542, + "grad_norm": 2.736294984817505, + "learning_rate": 8.847609839235673e-07, + "loss": 0.7835, + "step": 10245 + }, + { + "epoch": 0.15957405403722358, + "grad_norm": 4.227167129516602, + "learning_rate": 8.846790449189623e-07, + "loss": 0.7833, + "step": 10250 + }, + { + "epoch": 0.15965189503919294, + "grad_norm": 3.3678765296936035, + "learning_rate": 8.845971059143573e-07, + "loss": 0.9005, + "step": 10255 + }, + { + "epoch": 0.15972973604116233, + "grad_norm": 3.6415300369262695, + "learning_rate": 8.845151669097524e-07, + "loss": 0.8214, + "step": 10260 + }, + { + "epoch": 0.1598075770431317, + "grad_norm": 4.28493595123291, + "learning_rate": 8.844332279051474e-07, + "loss": 0.7597, + "step": 10265 + }, + { + "epoch": 0.15988541804510106, + "grad_norm": 3.2147064208984375, + "learning_rate": 8.843512889005424e-07, + "loss": 0.7652, + "step": 10270 + }, + { + "epoch": 0.15996325904707046, + "grad_norm": 7.0327959060668945, + "learning_rate": 8.842693498959374e-07, + "loss": 0.7703, + "step": 10275 + }, + { + "epoch": 0.16004110004903982, + "grad_norm": 3.714036464691162, + "learning_rate": 8.841874108913325e-07, + "loss": 0.8864, + "step": 10280 + }, + { + "epoch": 0.1601189410510092, + "grad_norm": 3.1920130252838135, + "learning_rate": 8.841054718867274e-07, + "loss": 0.8203, + "step": 10285 + }, + { + "epoch": 0.16019678205297858, + "grad_norm": 4.356620788574219, + "learning_rate": 8.840235328821225e-07, + "loss": 0.8215, + "step": 10290 + }, + { + "epoch": 0.16027462305494797, + "grad_norm": 3.031799793243408, + "learning_rate": 8.839415938775176e-07, + "loss": 0.6909, + "step": 10295 + }, + { + "epoch": 0.16035246405691733, + "grad_norm": 3.817898750305176, + "learning_rate": 8.838596548729125e-07, + "loss": 0.7237, + "step": 10300 + }, + { + "epoch": 0.16043030505888672, + "grad_norm": 4.679711818695068, + "learning_rate": 8.837777158683076e-07, + "loss": 0.7296, + "step": 10305 + }, + { + "epoch": 0.1605081460608561, + "grad_norm": 3.7384095191955566, + "learning_rate": 8.836957768637027e-07, + "loss": 0.9011, + "step": 10310 + }, + { + "epoch": 0.16058598706282548, + "grad_norm": 4.7621541023254395, + "learning_rate": 8.836138378590976e-07, + "loss": 0.9026, + "step": 10315 + }, + { + "epoch": 0.16066382806479484, + "grad_norm": 6.402016639709473, + "learning_rate": 8.835318988544926e-07, + "loss": 0.8081, + "step": 10320 + }, + { + "epoch": 0.16074166906676424, + "grad_norm": 4.472290515899658, + "learning_rate": 8.834499598498877e-07, + "loss": 0.7606, + "step": 10325 + }, + { + "epoch": 0.1608195100687336, + "grad_norm": 5.197412967681885, + "learning_rate": 8.833680208452827e-07, + "loss": 0.7824, + "step": 10330 + }, + { + "epoch": 0.160897351070703, + "grad_norm": 3.40364408493042, + "learning_rate": 8.832860818406778e-07, + "loss": 0.7884, + "step": 10335 + }, + { + "epoch": 0.16097519207267236, + "grad_norm": 6.786139965057373, + "learning_rate": 8.832041428360728e-07, + "loss": 0.8743, + "step": 10340 + }, + { + "epoch": 0.16105303307464175, + "grad_norm": 4.048831462860107, + "learning_rate": 8.831222038314678e-07, + "loss": 0.8958, + "step": 10345 + }, + { + "epoch": 0.1611308740766111, + "grad_norm": 4.102180480957031, + "learning_rate": 8.830402648268629e-07, + "loss": 0.7835, + "step": 10350 + }, + { + "epoch": 0.16120871507858048, + "grad_norm": 3.135587453842163, + "learning_rate": 8.829583258222578e-07, + "loss": 0.6362, + "step": 10355 + }, + { + "epoch": 0.16128655608054987, + "grad_norm": 3.9217071533203125, + "learning_rate": 8.828763868176528e-07, + "loss": 0.7069, + "step": 10360 + }, + { + "epoch": 0.16136439708251923, + "grad_norm": 3.276561737060547, + "learning_rate": 8.827944478130479e-07, + "loss": 0.8588, + "step": 10365 + }, + { + "epoch": 0.16144223808448863, + "grad_norm": 3.9652113914489746, + "learning_rate": 8.82712508808443e-07, + "loss": 0.8276, + "step": 10370 + }, + { + "epoch": 0.161520079086458, + "grad_norm": 5.678191184997559, + "learning_rate": 8.82630569803838e-07, + "loss": 0.7526, + "step": 10375 + }, + { + "epoch": 0.16159792008842738, + "grad_norm": 5.475320816040039, + "learning_rate": 8.82548630799233e-07, + "loss": 0.8913, + "step": 10380 + }, + { + "epoch": 0.16167576109039675, + "grad_norm": 15.008077621459961, + "learning_rate": 8.824666917946281e-07, + "loss": 0.7912, + "step": 10385 + }, + { + "epoch": 0.16175360209236614, + "grad_norm": 3.8481225967407227, + "learning_rate": 8.823847527900232e-07, + "loss": 0.7949, + "step": 10390 + }, + { + "epoch": 0.1618314430943355, + "grad_norm": 2.759436845779419, + "learning_rate": 8.82302813785418e-07, + "loss": 0.7449, + "step": 10395 + }, + { + "epoch": 0.1619092840963049, + "grad_norm": 8.001230239868164, + "learning_rate": 8.822208747808131e-07, + "loss": 0.853, + "step": 10400 + }, + { + "epoch": 0.16198712509827426, + "grad_norm": 5.410604953765869, + "learning_rate": 8.821389357762082e-07, + "loss": 0.8032, + "step": 10405 + }, + { + "epoch": 0.16206496610024365, + "grad_norm": 17.351768493652344, + "learning_rate": 8.820569967716031e-07, + "loss": 0.7418, + "step": 10410 + }, + { + "epoch": 0.16214280710221302, + "grad_norm": 3.534604549407959, + "learning_rate": 8.819750577669982e-07, + "loss": 0.7653, + "step": 10415 + }, + { + "epoch": 0.1622206481041824, + "grad_norm": 8.916220664978027, + "learning_rate": 8.818931187623933e-07, + "loss": 0.837, + "step": 10420 + }, + { + "epoch": 0.16229848910615177, + "grad_norm": 4.538114547729492, + "learning_rate": 8.818111797577883e-07, + "loss": 0.7677, + "step": 10425 + }, + { + "epoch": 0.16237633010812116, + "grad_norm": 3.6164751052856445, + "learning_rate": 8.817292407531833e-07, + "loss": 0.7675, + "step": 10430 + }, + { + "epoch": 0.16245417111009053, + "grad_norm": 4.763157844543457, + "learning_rate": 8.816473017485783e-07, + "loss": 0.7237, + "step": 10435 + }, + { + "epoch": 0.1625320121120599, + "grad_norm": 4.728906631469727, + "learning_rate": 8.815653627439733e-07, + "loss": 0.8593, + "step": 10440 + }, + { + "epoch": 0.16260985311402928, + "grad_norm": 3.878848075866699, + "learning_rate": 8.814834237393683e-07, + "loss": 0.8701, + "step": 10445 + }, + { + "epoch": 0.16268769411599865, + "grad_norm": 5.231772422790527, + "learning_rate": 8.814014847347634e-07, + "loss": 0.8108, + "step": 10450 + }, + { + "epoch": 0.16276553511796804, + "grad_norm": 9.139507293701172, + "learning_rate": 8.813195457301584e-07, + "loss": 0.7747, + "step": 10455 + }, + { + "epoch": 0.1628433761199374, + "grad_norm": 4.208261013031006, + "learning_rate": 8.812376067255535e-07, + "loss": 0.7136, + "step": 10460 + }, + { + "epoch": 0.1629212171219068, + "grad_norm": 4.353900909423828, + "learning_rate": 8.811556677209485e-07, + "loss": 0.7753, + "step": 10465 + }, + { + "epoch": 0.16299905812387616, + "grad_norm": 2.729976177215576, + "learning_rate": 8.810737287163435e-07, + "loss": 0.7926, + "step": 10470 + }, + { + "epoch": 0.16307689912584555, + "grad_norm": 2.928906202316284, + "learning_rate": 8.809917897117385e-07, + "loss": 0.7747, + "step": 10475 + }, + { + "epoch": 0.16315474012781492, + "grad_norm": 4.206002712249756, + "learning_rate": 8.809098507071335e-07, + "loss": 0.8576, + "step": 10480 + }, + { + "epoch": 0.1632325811297843, + "grad_norm": 7.724112510681152, + "learning_rate": 8.808279117025286e-07, + "loss": 0.8393, + "step": 10485 + }, + { + "epoch": 0.16331042213175367, + "grad_norm": 3.295132637023926, + "learning_rate": 8.807459726979236e-07, + "loss": 0.778, + "step": 10490 + }, + { + "epoch": 0.16338826313372307, + "grad_norm": 3.3869988918304443, + "learning_rate": 8.806640336933187e-07, + "loss": 0.7569, + "step": 10495 + }, + { + "epoch": 0.16346610413569243, + "grad_norm": 3.718599319458008, + "learning_rate": 8.805820946887137e-07, + "loss": 0.8747, + "step": 10500 + }, + { + "epoch": 0.16354394513766182, + "grad_norm": 3.8476598262786865, + "learning_rate": 8.805001556841087e-07, + "loss": 0.6924, + "step": 10505 + }, + { + "epoch": 0.1636217861396312, + "grad_norm": 3.9861888885498047, + "learning_rate": 8.804182166795038e-07, + "loss": 0.8145, + "step": 10510 + }, + { + "epoch": 0.16369962714160058, + "grad_norm": 3.1306610107421875, + "learning_rate": 8.803362776748989e-07, + "loss": 0.6579, + "step": 10515 + }, + { + "epoch": 0.16377746814356994, + "grad_norm": 4.904189586639404, + "learning_rate": 8.802543386702937e-07, + "loss": 0.8639, + "step": 10520 + }, + { + "epoch": 0.16385530914553934, + "grad_norm": 5.17719841003418, + "learning_rate": 8.801723996656888e-07, + "loss": 0.8722, + "step": 10525 + }, + { + "epoch": 0.1639331501475087, + "grad_norm": 2.9521892070770264, + "learning_rate": 8.800904606610839e-07, + "loss": 0.7665, + "step": 10530 + }, + { + "epoch": 0.16401099114947806, + "grad_norm": 3.255908966064453, + "learning_rate": 8.800085216564788e-07, + "loss": 0.7702, + "step": 10535 + }, + { + "epoch": 0.16408883215144746, + "grad_norm": 6.015521049499512, + "learning_rate": 8.799265826518739e-07, + "loss": 0.7845, + "step": 10540 + }, + { + "epoch": 0.16416667315341682, + "grad_norm": 4.091128826141357, + "learning_rate": 8.79844643647269e-07, + "loss": 0.9369, + "step": 10545 + }, + { + "epoch": 0.1642445141553862, + "grad_norm": 4.375763416290283, + "learning_rate": 8.79762704642664e-07, + "loss": 0.8318, + "step": 10550 + }, + { + "epoch": 0.16432235515735558, + "grad_norm": 6.054563522338867, + "learning_rate": 8.79680765638059e-07, + "loss": 0.8422, + "step": 10555 + }, + { + "epoch": 0.16440019615932497, + "grad_norm": 4.5666303634643555, + "learning_rate": 8.79598826633454e-07, + "loss": 0.8438, + "step": 10560 + }, + { + "epoch": 0.16447803716129433, + "grad_norm": 3.462005615234375, + "learning_rate": 8.79516887628849e-07, + "loss": 0.7772, + "step": 10565 + }, + { + "epoch": 0.16455587816326372, + "grad_norm": 3.66213321685791, + "learning_rate": 8.79434948624244e-07, + "loss": 0.9158, + "step": 10570 + }, + { + "epoch": 0.1646337191652331, + "grad_norm": 3.454305648803711, + "learning_rate": 8.793530096196391e-07, + "loss": 0.7452, + "step": 10575 + }, + { + "epoch": 0.16471156016720248, + "grad_norm": 4.568371295928955, + "learning_rate": 8.792710706150341e-07, + "loss": 0.855, + "step": 10580 + }, + { + "epoch": 0.16478940116917185, + "grad_norm": 5.541522979736328, + "learning_rate": 8.791891316104292e-07, + "loss": 0.9156, + "step": 10585 + }, + { + "epoch": 0.16486724217114124, + "grad_norm": 4.69566011428833, + "learning_rate": 8.791071926058242e-07, + "loss": 0.7321, + "step": 10590 + }, + { + "epoch": 0.1649450831731106, + "grad_norm": 7.592962741851807, + "learning_rate": 8.790252536012192e-07, + "loss": 0.7888, + "step": 10595 + }, + { + "epoch": 0.16502292417508, + "grad_norm": 5.3040080070495605, + "learning_rate": 8.789433145966142e-07, + "loss": 0.7643, + "step": 10600 + }, + { + "epoch": 0.16510076517704936, + "grad_norm": 2.616908073425293, + "learning_rate": 8.788613755920093e-07, + "loss": 0.7614, + "step": 10605 + }, + { + "epoch": 0.16517860617901875, + "grad_norm": 3.683166980743408, + "learning_rate": 8.787794365874043e-07, + "loss": 0.715, + "step": 10610 + }, + { + "epoch": 0.16525644718098811, + "grad_norm": 4.2956132888793945, + "learning_rate": 8.786974975827993e-07, + "loss": 0.9153, + "step": 10615 + }, + { + "epoch": 0.16533428818295748, + "grad_norm": 3.76724910736084, + "learning_rate": 8.786155585781944e-07, + "loss": 0.86, + "step": 10620 + }, + { + "epoch": 0.16541212918492687, + "grad_norm": 3.4380204677581787, + "learning_rate": 8.785336195735894e-07, + "loss": 0.8662, + "step": 10625 + }, + { + "epoch": 0.16548997018689623, + "grad_norm": 8.281620025634766, + "learning_rate": 8.784516805689844e-07, + "loss": 0.8607, + "step": 10630 + }, + { + "epoch": 0.16556781118886563, + "grad_norm": 7.735804557800293, + "learning_rate": 8.783697415643795e-07, + "loss": 0.7513, + "step": 10635 + }, + { + "epoch": 0.165645652190835, + "grad_norm": 6.491322040557861, + "learning_rate": 8.782878025597745e-07, + "loss": 0.824, + "step": 10640 + }, + { + "epoch": 0.16572349319280438, + "grad_norm": 2.7615108489990234, + "learning_rate": 8.782058635551694e-07, + "loss": 0.9231, + "step": 10645 + }, + { + "epoch": 0.16580133419477375, + "grad_norm": 5.219547748565674, + "learning_rate": 8.781239245505645e-07, + "loss": 0.7178, + "step": 10650 + }, + { + "epoch": 0.16587917519674314, + "grad_norm": 8.346015930175781, + "learning_rate": 8.780419855459596e-07, + "loss": 0.8779, + "step": 10655 + }, + { + "epoch": 0.1659570161987125, + "grad_norm": 9.989750862121582, + "learning_rate": 8.779600465413545e-07, + "loss": 0.7609, + "step": 10660 + }, + { + "epoch": 0.1660348572006819, + "grad_norm": 6.761223793029785, + "learning_rate": 8.778781075367496e-07, + "loss": 0.9394, + "step": 10665 + }, + { + "epoch": 0.16611269820265126, + "grad_norm": 3.055026054382324, + "learning_rate": 8.777961685321447e-07, + "loss": 0.8548, + "step": 10670 + }, + { + "epoch": 0.16619053920462065, + "grad_norm": 7.2378950119018555, + "learning_rate": 8.777142295275397e-07, + "loss": 0.8538, + "step": 10675 + }, + { + "epoch": 0.16626838020659002, + "grad_norm": 11.686674118041992, + "learning_rate": 8.776322905229346e-07, + "loss": 0.7245, + "step": 10680 + }, + { + "epoch": 0.1663462212085594, + "grad_norm": 3.3093395233154297, + "learning_rate": 8.775503515183297e-07, + "loss": 0.7876, + "step": 10685 + }, + { + "epoch": 0.16642406221052877, + "grad_norm": 5.579384803771973, + "learning_rate": 8.774684125137247e-07, + "loss": 0.8294, + "step": 10690 + }, + { + "epoch": 0.16650190321249816, + "grad_norm": 3.092195749282837, + "learning_rate": 8.773864735091198e-07, + "loss": 0.8116, + "step": 10695 + }, + { + "epoch": 0.16657974421446753, + "grad_norm": 5.8653130531311035, + "learning_rate": 8.773045345045148e-07, + "loss": 0.7827, + "step": 10700 + }, + { + "epoch": 0.1666575852164369, + "grad_norm": 8.496586799621582, + "learning_rate": 8.772225954999098e-07, + "loss": 0.871, + "step": 10705 + }, + { + "epoch": 0.16673542621840629, + "grad_norm": 3.554830312728882, + "learning_rate": 8.771406564953049e-07, + "loss": 0.7876, + "step": 10710 + }, + { + "epoch": 0.16681326722037565, + "grad_norm": 4.988160610198975, + "learning_rate": 8.770587174907e-07, + "loss": 0.8173, + "step": 10715 + }, + { + "epoch": 0.16689110822234504, + "grad_norm": 4.899982929229736, + "learning_rate": 8.769767784860948e-07, + "loss": 0.7582, + "step": 10720 + }, + { + "epoch": 0.1669689492243144, + "grad_norm": 3.1257996559143066, + "learning_rate": 8.768948394814899e-07, + "loss": 0.7396, + "step": 10725 + }, + { + "epoch": 0.1670467902262838, + "grad_norm": 7.962161064147949, + "learning_rate": 8.76812900476885e-07, + "loss": 0.8768, + "step": 10730 + }, + { + "epoch": 0.16712463122825316, + "grad_norm": 9.329161643981934, + "learning_rate": 8.7673096147228e-07, + "loss": 0.7469, + "step": 10735 + }, + { + "epoch": 0.16720247223022255, + "grad_norm": 2.5543742179870605, + "learning_rate": 8.76649022467675e-07, + "loss": 0.8198, + "step": 10740 + }, + { + "epoch": 0.16728031323219192, + "grad_norm": 3.192782402038574, + "learning_rate": 8.765670834630701e-07, + "loss": 0.7747, + "step": 10745 + }, + { + "epoch": 0.1673581542341613, + "grad_norm": 3.3908746242523193, + "learning_rate": 8.764851444584652e-07, + "loss": 0.8149, + "step": 10750 + }, + { + "epoch": 0.16743599523613067, + "grad_norm": 3.8879611492156982, + "learning_rate": 8.764032054538601e-07, + "loss": 0.774, + "step": 10755 + }, + { + "epoch": 0.16751383623810007, + "grad_norm": 5.076687335968018, + "learning_rate": 8.763212664492551e-07, + "loss": 0.799, + "step": 10760 + }, + { + "epoch": 0.16759167724006943, + "grad_norm": 5.339931964874268, + "learning_rate": 8.762393274446502e-07, + "loss": 0.756, + "step": 10765 + }, + { + "epoch": 0.16766951824203882, + "grad_norm": 4.076517581939697, + "learning_rate": 8.761573884400451e-07, + "loss": 0.8323, + "step": 10770 + }, + { + "epoch": 0.1677473592440082, + "grad_norm": 4.2539777755737305, + "learning_rate": 8.760754494354402e-07, + "loss": 0.6913, + "step": 10775 + }, + { + "epoch": 0.16782520024597758, + "grad_norm": 3.196856737136841, + "learning_rate": 8.759935104308353e-07, + "loss": 0.7324, + "step": 10780 + }, + { + "epoch": 0.16790304124794694, + "grad_norm": 13.658886909484863, + "learning_rate": 8.759115714262303e-07, + "loss": 0.7811, + "step": 10785 + }, + { + "epoch": 0.1679808822499163, + "grad_norm": 3.423370838165283, + "learning_rate": 8.758296324216253e-07, + "loss": 0.7902, + "step": 10790 + }, + { + "epoch": 0.1680587232518857, + "grad_norm": 4.14307165145874, + "learning_rate": 8.757476934170204e-07, + "loss": 0.7751, + "step": 10795 + }, + { + "epoch": 0.16813656425385506, + "grad_norm": 3.6597414016723633, + "learning_rate": 8.756657544124153e-07, + "loss": 0.7519, + "step": 10800 + }, + { + "epoch": 0.16821440525582446, + "grad_norm": 3.839320182800293, + "learning_rate": 8.755838154078103e-07, + "loss": 0.6788, + "step": 10805 + }, + { + "epoch": 0.16829224625779382, + "grad_norm": 5.377815246582031, + "learning_rate": 8.755018764032054e-07, + "loss": 0.8277, + "step": 10810 + }, + { + "epoch": 0.1683700872597632, + "grad_norm": 2.5293519496917725, + "learning_rate": 8.754199373986004e-07, + "loss": 0.7754, + "step": 10815 + }, + { + "epoch": 0.16844792826173258, + "grad_norm": 3.5276260375976562, + "learning_rate": 8.753379983939955e-07, + "loss": 0.6878, + "step": 10820 + }, + { + "epoch": 0.16852576926370197, + "grad_norm": 4.435384273529053, + "learning_rate": 8.752560593893905e-07, + "loss": 0.7848, + "step": 10825 + }, + { + "epoch": 0.16860361026567133, + "grad_norm": 4.338689804077148, + "learning_rate": 8.751741203847855e-07, + "loss": 0.7934, + "step": 10830 + }, + { + "epoch": 0.16868145126764073, + "grad_norm": 5.802286148071289, + "learning_rate": 8.750921813801806e-07, + "loss": 0.6343, + "step": 10835 + }, + { + "epoch": 0.1687592922696101, + "grad_norm": 5.789932727813721, + "learning_rate": 8.750102423755757e-07, + "loss": 0.8002, + "step": 10840 + }, + { + "epoch": 0.16883713327157948, + "grad_norm": 3.8039424419403076, + "learning_rate": 8.749283033709705e-07, + "loss": 0.7334, + "step": 10845 + }, + { + "epoch": 0.16891497427354885, + "grad_norm": 3.930701971054077, + "learning_rate": 8.748463643663656e-07, + "loss": 0.8077, + "step": 10850 + }, + { + "epoch": 0.16899281527551824, + "grad_norm": 15.397842407226562, + "learning_rate": 8.747644253617607e-07, + "loss": 0.9553, + "step": 10855 + }, + { + "epoch": 0.1690706562774876, + "grad_norm": 7.080071926116943, + "learning_rate": 8.746824863571557e-07, + "loss": 0.7472, + "step": 10860 + }, + { + "epoch": 0.169148497279457, + "grad_norm": 3.849839687347412, + "learning_rate": 8.746005473525507e-07, + "loss": 0.7191, + "step": 10865 + }, + { + "epoch": 0.16922633828142636, + "grad_norm": 3.95355486869812, + "learning_rate": 8.745186083479458e-07, + "loss": 0.661, + "step": 10870 + }, + { + "epoch": 0.16930417928339572, + "grad_norm": 2.4966964721679688, + "learning_rate": 8.744366693433409e-07, + "loss": 0.8823, + "step": 10875 + }, + { + "epoch": 0.16938202028536511, + "grad_norm": 6.468384742736816, + "learning_rate": 8.743547303387358e-07, + "loss": 0.7943, + "step": 10880 + }, + { + "epoch": 0.16945986128733448, + "grad_norm": 4.023189544677734, + "learning_rate": 8.742727913341308e-07, + "loss": 0.7107, + "step": 10885 + }, + { + "epoch": 0.16953770228930387, + "grad_norm": 3.180260419845581, + "learning_rate": 8.741908523295259e-07, + "loss": 0.7373, + "step": 10890 + }, + { + "epoch": 0.16961554329127324, + "grad_norm": 4.347747802734375, + "learning_rate": 8.741089133249208e-07, + "loss": 0.7309, + "step": 10895 + }, + { + "epoch": 0.16969338429324263, + "grad_norm": 6.062042713165283, + "learning_rate": 8.740269743203159e-07, + "loss": 0.7572, + "step": 10900 + }, + { + "epoch": 0.169771225295212, + "grad_norm": 2.9714925289154053, + "learning_rate": 8.73945035315711e-07, + "loss": 0.7924, + "step": 10905 + }, + { + "epoch": 0.16984906629718138, + "grad_norm": 3.827422618865967, + "learning_rate": 8.73863096311106e-07, + "loss": 0.912, + "step": 10910 + }, + { + "epoch": 0.16992690729915075, + "grad_norm": 5.133169174194336, + "learning_rate": 8.73781157306501e-07, + "loss": 0.7078, + "step": 10915 + }, + { + "epoch": 0.17000474830112014, + "grad_norm": 2.856977939605713, + "learning_rate": 8.736992183018961e-07, + "loss": 0.8076, + "step": 10920 + }, + { + "epoch": 0.1700825893030895, + "grad_norm": 4.684539318084717, + "learning_rate": 8.73617279297291e-07, + "loss": 0.7045, + "step": 10925 + }, + { + "epoch": 0.1701604303050589, + "grad_norm": 4.420496940612793, + "learning_rate": 8.73535340292686e-07, + "loss": 0.8248, + "step": 10930 + }, + { + "epoch": 0.17023827130702826, + "grad_norm": 3.429518461227417, + "learning_rate": 8.734534012880811e-07, + "loss": 0.7643, + "step": 10935 + }, + { + "epoch": 0.17031611230899765, + "grad_norm": 3.1888604164123535, + "learning_rate": 8.733714622834761e-07, + "loss": 0.7827, + "step": 10940 + }, + { + "epoch": 0.17039395331096702, + "grad_norm": 3.491792917251587, + "learning_rate": 8.732895232788712e-07, + "loss": 0.8594, + "step": 10945 + }, + { + "epoch": 0.1704717943129364, + "grad_norm": 4.644289493560791, + "learning_rate": 8.732075842742662e-07, + "loss": 0.7508, + "step": 10950 + }, + { + "epoch": 0.17054963531490577, + "grad_norm": 3.222562313079834, + "learning_rate": 8.731256452696612e-07, + "loss": 0.7755, + "step": 10955 + }, + { + "epoch": 0.17062747631687514, + "grad_norm": 7.415604591369629, + "learning_rate": 8.730437062650563e-07, + "loss": 0.677, + "step": 10960 + }, + { + "epoch": 0.17070531731884453, + "grad_norm": 3.6360597610473633, + "learning_rate": 8.729617672604513e-07, + "loss": 0.8338, + "step": 10965 + }, + { + "epoch": 0.1707831583208139, + "grad_norm": 9.364253044128418, + "learning_rate": 8.728798282558462e-07, + "loss": 0.7644, + "step": 10970 + }, + { + "epoch": 0.17086099932278329, + "grad_norm": 3.551168441772461, + "learning_rate": 8.727978892512413e-07, + "loss": 0.7808, + "step": 10975 + }, + { + "epoch": 0.17093884032475265, + "grad_norm": 6.72664213180542, + "learning_rate": 8.727159502466364e-07, + "loss": 0.7789, + "step": 10980 + }, + { + "epoch": 0.17101668132672204, + "grad_norm": 3.8760182857513428, + "learning_rate": 8.726340112420314e-07, + "loss": 0.7943, + "step": 10985 + }, + { + "epoch": 0.1710945223286914, + "grad_norm": 4.414963722229004, + "learning_rate": 8.725520722374264e-07, + "loss": 0.753, + "step": 10990 + }, + { + "epoch": 0.1711723633306608, + "grad_norm": 8.448708534240723, + "learning_rate": 8.724701332328215e-07, + "loss": 0.704, + "step": 10995 + }, + { + "epoch": 0.17125020433263016, + "grad_norm": 3.2329261302948, + "learning_rate": 8.723881942282166e-07, + "loss": 0.7565, + "step": 11000 + }, + { + "epoch": 0.17132804533459955, + "grad_norm": 7.1403303146362305, + "learning_rate": 8.723062552236114e-07, + "loss": 0.7924, + "step": 11005 + }, + { + "epoch": 0.17140588633656892, + "grad_norm": 2.50464129447937, + "learning_rate": 8.722243162190065e-07, + "loss": 0.8038, + "step": 11010 + }, + { + "epoch": 0.1714837273385383, + "grad_norm": 3.236157178878784, + "learning_rate": 8.721423772144016e-07, + "loss": 0.7118, + "step": 11015 + }, + { + "epoch": 0.17156156834050768, + "grad_norm": 4.607430934906006, + "learning_rate": 8.720604382097966e-07, + "loss": 0.8654, + "step": 11020 + }, + { + "epoch": 0.17163940934247707, + "grad_norm": 3.002549171447754, + "learning_rate": 8.719784992051916e-07, + "loss": 0.7738, + "step": 11025 + }, + { + "epoch": 0.17171725034444643, + "grad_norm": 3.545802116394043, + "learning_rate": 8.718965602005867e-07, + "loss": 0.7659, + "step": 11030 + }, + { + "epoch": 0.17179509134641582, + "grad_norm": 4.228000640869141, + "learning_rate": 8.718146211959817e-07, + "loss": 0.89, + "step": 11035 + }, + { + "epoch": 0.1718729323483852, + "grad_norm": 10.417171478271484, + "learning_rate": 8.717326821913767e-07, + "loss": 0.7852, + "step": 11040 + }, + { + "epoch": 0.17195077335035455, + "grad_norm": 5.213564872741699, + "learning_rate": 8.716507431867717e-07, + "loss": 0.8346, + "step": 11045 + }, + { + "epoch": 0.17202861435232394, + "grad_norm": 3.869044780731201, + "learning_rate": 8.715688041821667e-07, + "loss": 0.7275, + "step": 11050 + }, + { + "epoch": 0.1721064553542933, + "grad_norm": 3.7918996810913086, + "learning_rate": 8.714868651775618e-07, + "loss": 0.9288, + "step": 11055 + }, + { + "epoch": 0.1721842963562627, + "grad_norm": 3.620591640472412, + "learning_rate": 8.714049261729568e-07, + "loss": 0.8667, + "step": 11060 + }, + { + "epoch": 0.17226213735823206, + "grad_norm": 3.3272628784179688, + "learning_rate": 8.713229871683518e-07, + "loss": 0.7839, + "step": 11065 + }, + { + "epoch": 0.17233997836020146, + "grad_norm": 3.3981659412384033, + "learning_rate": 8.712410481637469e-07, + "loss": 0.8703, + "step": 11070 + }, + { + "epoch": 0.17241781936217082, + "grad_norm": 2.7353670597076416, + "learning_rate": 8.71159109159142e-07, + "loss": 0.8323, + "step": 11075 + }, + { + "epoch": 0.1724956603641402, + "grad_norm": 3.0876946449279785, + "learning_rate": 8.710771701545369e-07, + "loss": 0.7806, + "step": 11080 + }, + { + "epoch": 0.17257350136610958, + "grad_norm": 4.163149833679199, + "learning_rate": 8.709952311499319e-07, + "loss": 0.738, + "step": 11085 + }, + { + "epoch": 0.17265134236807897, + "grad_norm": 3.454596519470215, + "learning_rate": 8.70913292145327e-07, + "loss": 0.7692, + "step": 11090 + }, + { + "epoch": 0.17272918337004833, + "grad_norm": 5.42886209487915, + "learning_rate": 8.708313531407219e-07, + "loss": 0.8061, + "step": 11095 + }, + { + "epoch": 0.17280702437201773, + "grad_norm": 6.216919898986816, + "learning_rate": 8.70749414136117e-07, + "loss": 0.7982, + "step": 11100 + }, + { + "epoch": 0.1728848653739871, + "grad_norm": 3.19822359085083, + "learning_rate": 8.706674751315121e-07, + "loss": 0.7365, + "step": 11105 + }, + { + "epoch": 0.17296270637595648, + "grad_norm": 6.890682220458984, + "learning_rate": 8.705855361269072e-07, + "loss": 0.8121, + "step": 11110 + }, + { + "epoch": 0.17304054737792585, + "grad_norm": 3.8538219928741455, + "learning_rate": 8.705035971223021e-07, + "loss": 0.8005, + "step": 11115 + }, + { + "epoch": 0.17311838837989524, + "grad_norm": 4.351493835449219, + "learning_rate": 8.704216581176972e-07, + "loss": 0.7921, + "step": 11120 + }, + { + "epoch": 0.1731962293818646, + "grad_norm": 4.173168182373047, + "learning_rate": 8.703397191130923e-07, + "loss": 0.7759, + "step": 11125 + }, + { + "epoch": 0.17327407038383397, + "grad_norm": 3.0455987453460693, + "learning_rate": 8.702577801084871e-07, + "loss": 0.7061, + "step": 11130 + }, + { + "epoch": 0.17335191138580336, + "grad_norm": 8.838937759399414, + "learning_rate": 8.701758411038822e-07, + "loss": 0.6888, + "step": 11135 + }, + { + "epoch": 0.17342975238777272, + "grad_norm": 10.514293670654297, + "learning_rate": 8.700939020992773e-07, + "loss": 0.8403, + "step": 11140 + }, + { + "epoch": 0.17350759338974212, + "grad_norm": 2.9291927814483643, + "learning_rate": 8.700119630946723e-07, + "loss": 0.8609, + "step": 11145 + }, + { + "epoch": 0.17358543439171148, + "grad_norm": 4.258464336395264, + "learning_rate": 8.699300240900673e-07, + "loss": 0.8436, + "step": 11150 + }, + { + "epoch": 0.17366327539368087, + "grad_norm": 4.499458312988281, + "learning_rate": 8.698480850854624e-07, + "loss": 0.883, + "step": 11155 + }, + { + "epoch": 0.17374111639565024, + "grad_norm": 4.2294745445251465, + "learning_rate": 8.697661460808574e-07, + "loss": 0.711, + "step": 11160 + }, + { + "epoch": 0.17381895739761963, + "grad_norm": 2.956054925918579, + "learning_rate": 8.696842070762525e-07, + "loss": 0.8658, + "step": 11165 + }, + { + "epoch": 0.173896798399589, + "grad_norm": 6.596224308013916, + "learning_rate": 8.696022680716474e-07, + "loss": 0.8289, + "step": 11170 + }, + { + "epoch": 0.17397463940155838, + "grad_norm": 4.582150936126709, + "learning_rate": 8.695203290670424e-07, + "loss": 0.7524, + "step": 11175 + }, + { + "epoch": 0.17405248040352775, + "grad_norm": 2.815945625305176, + "learning_rate": 8.694383900624375e-07, + "loss": 0.8105, + "step": 11180 + }, + { + "epoch": 0.17413032140549714, + "grad_norm": 7.399906635284424, + "learning_rate": 8.693564510578325e-07, + "loss": 0.628, + "step": 11185 + }, + { + "epoch": 0.1742081624074665, + "grad_norm": 2.3162524700164795, + "learning_rate": 8.692745120532275e-07, + "loss": 0.6802, + "step": 11190 + }, + { + "epoch": 0.1742860034094359, + "grad_norm": 4.691956043243408, + "learning_rate": 8.691925730486226e-07, + "loss": 0.9559, + "step": 11195 + }, + { + "epoch": 0.17436384441140526, + "grad_norm": 5.132546901702881, + "learning_rate": 8.691106340440177e-07, + "loss": 0.8418, + "step": 11200 + }, + { + "epoch": 0.17444168541337465, + "grad_norm": 6.407838344573975, + "learning_rate": 8.690286950394126e-07, + "loss": 0.9149, + "step": 11205 + }, + { + "epoch": 0.17451952641534402, + "grad_norm": 2.8634774684906006, + "learning_rate": 8.689467560348076e-07, + "loss": 0.8361, + "step": 11210 + }, + { + "epoch": 0.1745973674173134, + "grad_norm": 5.304955959320068, + "learning_rate": 8.688648170302027e-07, + "loss": 0.7911, + "step": 11215 + }, + { + "epoch": 0.17467520841928277, + "grad_norm": 6.025475978851318, + "learning_rate": 8.687828780255976e-07, + "loss": 0.7571, + "step": 11220 + }, + { + "epoch": 0.17475304942125214, + "grad_norm": 4.901416301727295, + "learning_rate": 8.687009390209927e-07, + "loss": 0.7321, + "step": 11225 + }, + { + "epoch": 0.17483089042322153, + "grad_norm": 2.951046943664551, + "learning_rate": 8.686190000163878e-07, + "loss": 0.756, + "step": 11230 + }, + { + "epoch": 0.1749087314251909, + "grad_norm": 3.0569169521331787, + "learning_rate": 8.685370610117829e-07, + "loss": 0.7937, + "step": 11235 + }, + { + "epoch": 0.1749865724271603, + "grad_norm": 3.96439528465271, + "learning_rate": 8.684551220071778e-07, + "loss": 0.7065, + "step": 11240 + }, + { + "epoch": 0.17506441342912965, + "grad_norm": 5.546309947967529, + "learning_rate": 8.683731830025729e-07, + "loss": 0.7787, + "step": 11245 + }, + { + "epoch": 0.17514225443109904, + "grad_norm": 4.913859844207764, + "learning_rate": 8.682912439979679e-07, + "loss": 0.7689, + "step": 11250 + }, + { + "epoch": 0.1752200954330684, + "grad_norm": 3.5462117195129395, + "learning_rate": 8.682093049933628e-07, + "loss": 0.7766, + "step": 11255 + }, + { + "epoch": 0.1752979364350378, + "grad_norm": 5.14613676071167, + "learning_rate": 8.681273659887579e-07, + "loss": 0.8672, + "step": 11260 + }, + { + "epoch": 0.17537577743700716, + "grad_norm": 3.5507776737213135, + "learning_rate": 8.68045426984153e-07, + "loss": 0.8519, + "step": 11265 + }, + { + "epoch": 0.17545361843897656, + "grad_norm": 5.96008825302124, + "learning_rate": 8.67963487979548e-07, + "loss": 0.8072, + "step": 11270 + }, + { + "epoch": 0.17553145944094592, + "grad_norm": 3.9602410793304443, + "learning_rate": 8.67881548974943e-07, + "loss": 0.7046, + "step": 11275 + }, + { + "epoch": 0.1756093004429153, + "grad_norm": 9.932755470275879, + "learning_rate": 8.677996099703381e-07, + "loss": 0.6901, + "step": 11280 + }, + { + "epoch": 0.17568714144488468, + "grad_norm": 3.0417749881744385, + "learning_rate": 8.677176709657331e-07, + "loss": 0.8085, + "step": 11285 + }, + { + "epoch": 0.17576498244685407, + "grad_norm": 8.406500816345215, + "learning_rate": 8.67635731961128e-07, + "loss": 0.8751, + "step": 11290 + }, + { + "epoch": 0.17584282344882343, + "grad_norm": 3.1526453495025635, + "learning_rate": 8.675537929565231e-07, + "loss": 0.779, + "step": 11295 + }, + { + "epoch": 0.17592066445079282, + "grad_norm": 8.264491081237793, + "learning_rate": 8.674718539519181e-07, + "loss": 0.6696, + "step": 11300 + }, + { + "epoch": 0.1759985054527622, + "grad_norm": 4.0117011070251465, + "learning_rate": 8.673899149473132e-07, + "loss": 0.7622, + "step": 11305 + }, + { + "epoch": 0.17607634645473155, + "grad_norm": 5.8008527755737305, + "learning_rate": 8.673079759427082e-07, + "loss": 0.9383, + "step": 11310 + }, + { + "epoch": 0.17615418745670094, + "grad_norm": 3.362180709838867, + "learning_rate": 8.672260369381032e-07, + "loss": 0.7315, + "step": 11315 + }, + { + "epoch": 0.1762320284586703, + "grad_norm": 3.2175207138061523, + "learning_rate": 8.671440979334983e-07, + "loss": 0.7064, + "step": 11320 + }, + { + "epoch": 0.1763098694606397, + "grad_norm": 3.180907964706421, + "learning_rate": 8.670621589288934e-07, + "loss": 0.7622, + "step": 11325 + }, + { + "epoch": 0.17638771046260907, + "grad_norm": 4.250461578369141, + "learning_rate": 8.669802199242882e-07, + "loss": 0.9496, + "step": 11330 + }, + { + "epoch": 0.17646555146457846, + "grad_norm": 3.719573497772217, + "learning_rate": 8.668982809196833e-07, + "loss": 0.7536, + "step": 11335 + }, + { + "epoch": 0.17654339246654782, + "grad_norm": 4.155417442321777, + "learning_rate": 8.668163419150784e-07, + "loss": 0.7122, + "step": 11340 + }, + { + "epoch": 0.1766212334685172, + "grad_norm": 5.191723346710205, + "learning_rate": 8.667344029104733e-07, + "loss": 0.6908, + "step": 11345 + }, + { + "epoch": 0.17669907447048658, + "grad_norm": 6.202426910400391, + "learning_rate": 8.666524639058684e-07, + "loss": 0.8704, + "step": 11350 + }, + { + "epoch": 0.17677691547245597, + "grad_norm": 3.2510268688201904, + "learning_rate": 8.665705249012635e-07, + "loss": 0.702, + "step": 11355 + }, + { + "epoch": 0.17685475647442533, + "grad_norm": 7.769083023071289, + "learning_rate": 8.664885858966586e-07, + "loss": 0.8048, + "step": 11360 + }, + { + "epoch": 0.17693259747639473, + "grad_norm": 3.8902037143707275, + "learning_rate": 8.664066468920535e-07, + "loss": 0.7719, + "step": 11365 + }, + { + "epoch": 0.1770104384783641, + "grad_norm": 4.6013407707214355, + "learning_rate": 8.663247078874485e-07, + "loss": 0.7691, + "step": 11370 + }, + { + "epoch": 0.17708827948033348, + "grad_norm": 7.063052654266357, + "learning_rate": 8.662427688828436e-07, + "loss": 0.8091, + "step": 11375 + }, + { + "epoch": 0.17716612048230285, + "grad_norm": 4.544634819030762, + "learning_rate": 8.661608298782386e-07, + "loss": 0.9371, + "step": 11380 + }, + { + "epoch": 0.17724396148427224, + "grad_norm": 4.195473670959473, + "learning_rate": 8.660788908736336e-07, + "loss": 0.7634, + "step": 11385 + }, + { + "epoch": 0.1773218024862416, + "grad_norm": 18.608367919921875, + "learning_rate": 8.659969518690287e-07, + "loss": 0.7902, + "step": 11390 + }, + { + "epoch": 0.17739964348821097, + "grad_norm": 3.536041498184204, + "learning_rate": 8.659150128644237e-07, + "loss": 0.7095, + "step": 11395 + }, + { + "epoch": 0.17747748449018036, + "grad_norm": 3.1353678703308105, + "learning_rate": 8.658330738598187e-07, + "loss": 0.688, + "step": 11400 + }, + { + "epoch": 0.17755532549214972, + "grad_norm": 2.603710889816284, + "learning_rate": 8.657511348552138e-07, + "loss": 0.8218, + "step": 11405 + }, + { + "epoch": 0.17763316649411912, + "grad_norm": 3.3999760150909424, + "learning_rate": 8.656691958506087e-07, + "loss": 0.7279, + "step": 11410 + }, + { + "epoch": 0.17771100749608848, + "grad_norm": 3.0341594219207764, + "learning_rate": 8.655872568460038e-07, + "loss": 0.754, + "step": 11415 + }, + { + "epoch": 0.17778884849805787, + "grad_norm": 5.157776355743408, + "learning_rate": 8.655053178413988e-07, + "loss": 0.8347, + "step": 11420 + }, + { + "epoch": 0.17786668950002724, + "grad_norm": 5.881651878356934, + "learning_rate": 8.654233788367938e-07, + "loss": 0.868, + "step": 11425 + }, + { + "epoch": 0.17794453050199663, + "grad_norm": 7.980086326599121, + "learning_rate": 8.653414398321889e-07, + "loss": 0.8467, + "step": 11430 + }, + { + "epoch": 0.178022371503966, + "grad_norm": 3.4080753326416016, + "learning_rate": 8.65259500827584e-07, + "loss": 0.641, + "step": 11435 + }, + { + "epoch": 0.17810021250593538, + "grad_norm": 8.840136528015137, + "learning_rate": 8.651775618229789e-07, + "loss": 0.8391, + "step": 11440 + }, + { + "epoch": 0.17817805350790475, + "grad_norm": 6.058139324188232, + "learning_rate": 8.65095622818374e-07, + "loss": 0.6896, + "step": 11445 + }, + { + "epoch": 0.17825589450987414, + "grad_norm": 3.771533250808716, + "learning_rate": 8.650136838137691e-07, + "loss": 0.7656, + "step": 11450 + }, + { + "epoch": 0.1783337355118435, + "grad_norm": 5.53726053237915, + "learning_rate": 8.649317448091639e-07, + "loss": 0.7161, + "step": 11455 + }, + { + "epoch": 0.1784115765138129, + "grad_norm": 4.151732444763184, + "learning_rate": 8.64849805804559e-07, + "loss": 0.7258, + "step": 11460 + }, + { + "epoch": 0.17848941751578226, + "grad_norm": 5.9917731285095215, + "learning_rate": 8.647678667999541e-07, + "loss": 0.8906, + "step": 11465 + }, + { + "epoch": 0.17856725851775165, + "grad_norm": 3.5444159507751465, + "learning_rate": 8.64685927795349e-07, + "loss": 0.8006, + "step": 11470 + }, + { + "epoch": 0.17864509951972102, + "grad_norm": 7.527516841888428, + "learning_rate": 8.646039887907441e-07, + "loss": 0.837, + "step": 11475 + }, + { + "epoch": 0.17872294052169038, + "grad_norm": 3.505826711654663, + "learning_rate": 8.645220497861392e-07, + "loss": 0.7962, + "step": 11480 + }, + { + "epoch": 0.17880078152365977, + "grad_norm": 4.634178161621094, + "learning_rate": 8.644401107815343e-07, + "loss": 0.7389, + "step": 11485 + }, + { + "epoch": 0.17887862252562914, + "grad_norm": 2.834869623184204, + "learning_rate": 8.643581717769292e-07, + "loss": 0.837, + "step": 11490 + }, + { + "epoch": 0.17895646352759853, + "grad_norm": 4.649651527404785, + "learning_rate": 8.642762327723242e-07, + "loss": 0.8676, + "step": 11495 + }, + { + "epoch": 0.1790343045295679, + "grad_norm": 4.7943925857543945, + "learning_rate": 8.641942937677193e-07, + "loss": 0.7031, + "step": 11500 + }, + { + "epoch": 0.1791121455315373, + "grad_norm": 3.4490435123443604, + "learning_rate": 8.641123547631143e-07, + "loss": 0.8332, + "step": 11505 + }, + { + "epoch": 0.17918998653350665, + "grad_norm": 2.6124467849731445, + "learning_rate": 8.640304157585093e-07, + "loss": 0.8277, + "step": 11510 + }, + { + "epoch": 0.17926782753547604, + "grad_norm": 3.379868507385254, + "learning_rate": 8.639484767539044e-07, + "loss": 0.8617, + "step": 11515 + }, + { + "epoch": 0.1793456685374454, + "grad_norm": 4.773552417755127, + "learning_rate": 8.638665377492994e-07, + "loss": 0.8771, + "step": 11520 + }, + { + "epoch": 0.1794235095394148, + "grad_norm": 8.424163818359375, + "learning_rate": 8.637845987446945e-07, + "loss": 0.8067, + "step": 11525 + }, + { + "epoch": 0.17950135054138416, + "grad_norm": 4.11102294921875, + "learning_rate": 8.637026597400895e-07, + "loss": 0.7825, + "step": 11530 + }, + { + "epoch": 0.17957919154335356, + "grad_norm": 3.056248188018799, + "learning_rate": 8.636207207354844e-07, + "loss": 0.805, + "step": 11535 + }, + { + "epoch": 0.17965703254532292, + "grad_norm": 6.354325294494629, + "learning_rate": 8.635387817308795e-07, + "loss": 0.697, + "step": 11540 + }, + { + "epoch": 0.1797348735472923, + "grad_norm": 4.208707809448242, + "learning_rate": 8.634568427262745e-07, + "loss": 0.7442, + "step": 11545 + }, + { + "epoch": 0.17981271454926168, + "grad_norm": 3.256444215774536, + "learning_rate": 8.633749037216695e-07, + "loss": 0.7955, + "step": 11550 + }, + { + "epoch": 0.17989055555123107, + "grad_norm": 2.9025378227233887, + "learning_rate": 8.632929647170646e-07, + "loss": 0.8062, + "step": 11555 + }, + { + "epoch": 0.17996839655320043, + "grad_norm": 5.104341506958008, + "learning_rate": 8.632110257124597e-07, + "loss": 0.9282, + "step": 11560 + }, + { + "epoch": 0.1800462375551698, + "grad_norm": 3.7267444133758545, + "learning_rate": 8.631290867078546e-07, + "loss": 0.7416, + "step": 11565 + }, + { + "epoch": 0.1801240785571392, + "grad_norm": 2.9586052894592285, + "learning_rate": 8.630471477032497e-07, + "loss": 0.7961, + "step": 11570 + }, + { + "epoch": 0.18020191955910855, + "grad_norm": 4.650422096252441, + "learning_rate": 8.629652086986447e-07, + "loss": 0.7148, + "step": 11575 + }, + { + "epoch": 0.18027976056107795, + "grad_norm": 3.344991445541382, + "learning_rate": 8.628832696940396e-07, + "loss": 0.7804, + "step": 11580 + }, + { + "epoch": 0.1803576015630473, + "grad_norm": 3.1043858528137207, + "learning_rate": 8.628013306894347e-07, + "loss": 0.7004, + "step": 11585 + }, + { + "epoch": 0.1804354425650167, + "grad_norm": 4.275179862976074, + "learning_rate": 8.627193916848298e-07, + "loss": 0.702, + "step": 11590 + }, + { + "epoch": 0.18051328356698607, + "grad_norm": 5.884884357452393, + "learning_rate": 8.626374526802248e-07, + "loss": 0.7196, + "step": 11595 + }, + { + "epoch": 0.18059112456895546, + "grad_norm": 5.935474395751953, + "learning_rate": 8.625555136756198e-07, + "loss": 0.8548, + "step": 11600 + }, + { + "epoch": 0.18066896557092482, + "grad_norm": 3.8844761848449707, + "learning_rate": 8.624735746710149e-07, + "loss": 0.7467, + "step": 11605 + }, + { + "epoch": 0.18074680657289421, + "grad_norm": 3.4248223304748535, + "learning_rate": 8.6239163566641e-07, + "loss": 0.8664, + "step": 11610 + }, + { + "epoch": 0.18082464757486358, + "grad_norm": 4.0504679679870605, + "learning_rate": 8.623096966618048e-07, + "loss": 0.8649, + "step": 11615 + }, + { + "epoch": 0.18090248857683297, + "grad_norm": 3.0257060527801514, + "learning_rate": 8.622277576571999e-07, + "loss": 0.7091, + "step": 11620 + }, + { + "epoch": 0.18098032957880233, + "grad_norm": 4.030515193939209, + "learning_rate": 8.62145818652595e-07, + "loss": 0.8485, + "step": 11625 + }, + { + "epoch": 0.18105817058077173, + "grad_norm": 3.375437021255493, + "learning_rate": 8.6206387964799e-07, + "loss": 0.6832, + "step": 11630 + }, + { + "epoch": 0.1811360115827411, + "grad_norm": 4.179788112640381, + "learning_rate": 8.61981940643385e-07, + "loss": 0.8009, + "step": 11635 + }, + { + "epoch": 0.18121385258471048, + "grad_norm": 5.095760822296143, + "learning_rate": 8.619000016387801e-07, + "loss": 0.7984, + "step": 11640 + }, + { + "epoch": 0.18129169358667985, + "grad_norm": 3.096256732940674, + "learning_rate": 8.618180626341751e-07, + "loss": 0.8468, + "step": 11645 + }, + { + "epoch": 0.1813695345886492, + "grad_norm": 3.6533854007720947, + "learning_rate": 8.617361236295702e-07, + "loss": 0.7336, + "step": 11650 + }, + { + "epoch": 0.1814473755906186, + "grad_norm": 3.505079984664917, + "learning_rate": 8.616541846249651e-07, + "loss": 0.7025, + "step": 11655 + }, + { + "epoch": 0.18152521659258797, + "grad_norm": 3.6542341709136963, + "learning_rate": 8.615722456203601e-07, + "loss": 0.8489, + "step": 11660 + }, + { + "epoch": 0.18160305759455736, + "grad_norm": 6.4186811447143555, + "learning_rate": 8.614903066157552e-07, + "loss": 0.7712, + "step": 11665 + }, + { + "epoch": 0.18168089859652672, + "grad_norm": 5.470929145812988, + "learning_rate": 8.614083676111502e-07, + "loss": 0.819, + "step": 11670 + }, + { + "epoch": 0.18175873959849612, + "grad_norm": 3.3178632259368896, + "learning_rate": 8.613264286065452e-07, + "loss": 0.813, + "step": 11675 + }, + { + "epoch": 0.18183658060046548, + "grad_norm": 3.1316630840301514, + "learning_rate": 8.612444896019403e-07, + "loss": 0.7939, + "step": 11680 + }, + { + "epoch": 0.18191442160243487, + "grad_norm": 5.289381980895996, + "learning_rate": 8.611625505973354e-07, + "loss": 0.7354, + "step": 11685 + }, + { + "epoch": 0.18199226260440424, + "grad_norm": 2.2420859336853027, + "learning_rate": 8.610806115927303e-07, + "loss": 0.7458, + "step": 11690 + }, + { + "epoch": 0.18207010360637363, + "grad_norm": 5.712198257446289, + "learning_rate": 8.609986725881253e-07, + "loss": 0.7175, + "step": 11695 + }, + { + "epoch": 0.182147944608343, + "grad_norm": 4.2332353591918945, + "learning_rate": 8.609167335835204e-07, + "loss": 0.7712, + "step": 11700 + }, + { + "epoch": 0.18222578561031239, + "grad_norm": 3.4181714057922363, + "learning_rate": 8.608347945789153e-07, + "loss": 0.7699, + "step": 11705 + }, + { + "epoch": 0.18230362661228175, + "grad_norm": 4.554285049438477, + "learning_rate": 8.607528555743104e-07, + "loss": 0.7718, + "step": 11710 + }, + { + "epoch": 0.18238146761425114, + "grad_norm": 4.693836212158203, + "learning_rate": 8.606709165697055e-07, + "loss": 0.8118, + "step": 11715 + }, + { + "epoch": 0.1824593086162205, + "grad_norm": 5.117660999298096, + "learning_rate": 8.605889775651005e-07, + "loss": 0.7083, + "step": 11720 + }, + { + "epoch": 0.1825371496181899, + "grad_norm": 7.209866046905518, + "learning_rate": 8.605070385604955e-07, + "loss": 0.832, + "step": 11725 + }, + { + "epoch": 0.18261499062015926, + "grad_norm": 3.6614935398101807, + "learning_rate": 8.604250995558906e-07, + "loss": 0.7817, + "step": 11730 + }, + { + "epoch": 0.18269283162212863, + "grad_norm": 3.863678216934204, + "learning_rate": 8.603431605512855e-07, + "loss": 0.8006, + "step": 11735 + }, + { + "epoch": 0.18277067262409802, + "grad_norm": 3.742063522338867, + "learning_rate": 8.602612215466806e-07, + "loss": 0.7947, + "step": 11740 + }, + { + "epoch": 0.18284851362606738, + "grad_norm": 2.856513261795044, + "learning_rate": 8.601792825420756e-07, + "loss": 0.745, + "step": 11745 + }, + { + "epoch": 0.18292635462803677, + "grad_norm": 2.845435619354248, + "learning_rate": 8.600973435374707e-07, + "loss": 0.794, + "step": 11750 + }, + { + "epoch": 0.18300419563000614, + "grad_norm": 8.055949211120605, + "learning_rate": 8.600154045328657e-07, + "loss": 0.7383, + "step": 11755 + }, + { + "epoch": 0.18308203663197553, + "grad_norm": 4.0927510261535645, + "learning_rate": 8.599334655282607e-07, + "loss": 0.6759, + "step": 11760 + }, + { + "epoch": 0.1831598776339449, + "grad_norm": 3.1355292797088623, + "learning_rate": 8.598515265236558e-07, + "loss": 0.771, + "step": 11765 + }, + { + "epoch": 0.1832377186359143, + "grad_norm": 9.864462852478027, + "learning_rate": 8.597695875190508e-07, + "loss": 0.709, + "step": 11770 + }, + { + "epoch": 0.18331555963788365, + "grad_norm": 5.377257823944092, + "learning_rate": 8.596876485144459e-07, + "loss": 0.903, + "step": 11775 + }, + { + "epoch": 0.18339340063985304, + "grad_norm": 3.909209728240967, + "learning_rate": 8.596057095098408e-07, + "loss": 0.7645, + "step": 11780 + }, + { + "epoch": 0.1834712416418224, + "grad_norm": 4.229231834411621, + "learning_rate": 8.595237705052358e-07, + "loss": 0.8469, + "step": 11785 + }, + { + "epoch": 0.1835490826437918, + "grad_norm": 9.634142875671387, + "learning_rate": 8.594418315006309e-07, + "loss": 0.88, + "step": 11790 + }, + { + "epoch": 0.18362692364576116, + "grad_norm": 2.9738218784332275, + "learning_rate": 8.59359892496026e-07, + "loss": 0.689, + "step": 11795 + }, + { + "epoch": 0.18370476464773056, + "grad_norm": 3.0469038486480713, + "learning_rate": 8.592779534914209e-07, + "loss": 0.8383, + "step": 11800 + }, + { + "epoch": 0.18378260564969992, + "grad_norm": 4.129268646240234, + "learning_rate": 8.59196014486816e-07, + "loss": 0.6902, + "step": 11805 + }, + { + "epoch": 0.1838604466516693, + "grad_norm": 4.603461265563965, + "learning_rate": 8.591140754822111e-07, + "loss": 0.813, + "step": 11810 + }, + { + "epoch": 0.18393828765363868, + "grad_norm": 4.814962387084961, + "learning_rate": 8.59032136477606e-07, + "loss": 0.8819, + "step": 11815 + }, + { + "epoch": 0.18401612865560804, + "grad_norm": 7.185861110687256, + "learning_rate": 8.58950197473001e-07, + "loss": 0.8366, + "step": 11820 + }, + { + "epoch": 0.18409396965757743, + "grad_norm": 7.243460655212402, + "learning_rate": 8.588682584683961e-07, + "loss": 0.9382, + "step": 11825 + }, + { + "epoch": 0.1841718106595468, + "grad_norm": 6.482030391693115, + "learning_rate": 8.587863194637911e-07, + "loss": 0.8316, + "step": 11830 + }, + { + "epoch": 0.1842496516615162, + "grad_norm": 12.124528884887695, + "learning_rate": 8.587043804591861e-07, + "loss": 0.8205, + "step": 11835 + }, + { + "epoch": 0.18432749266348555, + "grad_norm": 4.70367431640625, + "learning_rate": 8.586224414545812e-07, + "loss": 0.6944, + "step": 11840 + }, + { + "epoch": 0.18440533366545495, + "grad_norm": 6.1089558601379395, + "learning_rate": 8.585405024499762e-07, + "loss": 0.8094, + "step": 11845 + }, + { + "epoch": 0.1844831746674243, + "grad_norm": 3.807187557220459, + "learning_rate": 8.584585634453712e-07, + "loss": 0.8715, + "step": 11850 + }, + { + "epoch": 0.1845610156693937, + "grad_norm": 4.469877243041992, + "learning_rate": 8.583766244407663e-07, + "loss": 0.7794, + "step": 11855 + }, + { + "epoch": 0.18463885667136307, + "grad_norm": 5.07852029800415, + "learning_rate": 8.582946854361612e-07, + "loss": 0.7603, + "step": 11860 + }, + { + "epoch": 0.18471669767333246, + "grad_norm": 3.7635741233825684, + "learning_rate": 8.582127464315563e-07, + "loss": 0.817, + "step": 11865 + }, + { + "epoch": 0.18479453867530182, + "grad_norm": 5.198869228363037, + "learning_rate": 8.581308074269513e-07, + "loss": 0.8442, + "step": 11870 + }, + { + "epoch": 0.18487237967727121, + "grad_norm": 3.176208019256592, + "learning_rate": 8.580488684223464e-07, + "loss": 0.8135, + "step": 11875 + }, + { + "epoch": 0.18495022067924058, + "grad_norm": 5.479611396789551, + "learning_rate": 8.579669294177414e-07, + "loss": 0.686, + "step": 11880 + }, + { + "epoch": 0.18502806168120997, + "grad_norm": 5.823884010314941, + "learning_rate": 8.578849904131365e-07, + "loss": 0.7851, + "step": 11885 + }, + { + "epoch": 0.18510590268317934, + "grad_norm": 4.050318717956543, + "learning_rate": 8.578030514085315e-07, + "loss": 0.7321, + "step": 11890 + }, + { + "epoch": 0.18518374368514873, + "grad_norm": 2.703254461288452, + "learning_rate": 8.577211124039265e-07, + "loss": 0.7569, + "step": 11895 + }, + { + "epoch": 0.1852615846871181, + "grad_norm": 6.201961517333984, + "learning_rate": 8.576391733993215e-07, + "loss": 0.8172, + "step": 11900 + }, + { + "epoch": 0.18533942568908748, + "grad_norm": 4.213263511657715, + "learning_rate": 8.575572343947165e-07, + "loss": 0.8344, + "step": 11905 + }, + { + "epoch": 0.18541726669105685, + "grad_norm": 6.157925128936768, + "learning_rate": 8.574752953901115e-07, + "loss": 0.7343, + "step": 11910 + }, + { + "epoch": 0.1854951076930262, + "grad_norm": 4.5453691482543945, + "learning_rate": 8.573933563855066e-07, + "loss": 0.7038, + "step": 11915 + }, + { + "epoch": 0.1855729486949956, + "grad_norm": 4.831943035125732, + "learning_rate": 8.573114173809017e-07, + "loss": 0.8805, + "step": 11920 + }, + { + "epoch": 0.18565078969696497, + "grad_norm": 2.977743625640869, + "learning_rate": 8.572294783762966e-07, + "loss": 0.784, + "step": 11925 + }, + { + "epoch": 0.18572863069893436, + "grad_norm": 5.694007396697998, + "learning_rate": 8.571475393716917e-07, + "loss": 0.6915, + "step": 11930 + }, + { + "epoch": 0.18580647170090372, + "grad_norm": 4.1263017654418945, + "learning_rate": 8.570656003670868e-07, + "loss": 0.7476, + "step": 11935 + }, + { + "epoch": 0.18588431270287312, + "grad_norm": 5.350509166717529, + "learning_rate": 8.569836613624816e-07, + "loss": 0.6784, + "step": 11940 + }, + { + "epoch": 0.18596215370484248, + "grad_norm": 3.7994115352630615, + "learning_rate": 8.569017223578767e-07, + "loss": 0.7191, + "step": 11945 + }, + { + "epoch": 0.18603999470681187, + "grad_norm": 5.086226940155029, + "learning_rate": 8.568197833532718e-07, + "loss": 0.7841, + "step": 11950 + }, + { + "epoch": 0.18611783570878124, + "grad_norm": 4.248946189880371, + "learning_rate": 8.567378443486668e-07, + "loss": 0.8489, + "step": 11955 + }, + { + "epoch": 0.18619567671075063, + "grad_norm": 2.877885580062866, + "learning_rate": 8.566559053440618e-07, + "loss": 0.66, + "step": 11960 + }, + { + "epoch": 0.18627351771272, + "grad_norm": 6.250997066497803, + "learning_rate": 8.565739663394569e-07, + "loss": 0.7974, + "step": 11965 + }, + { + "epoch": 0.18635135871468939, + "grad_norm": 5.646812915802002, + "learning_rate": 8.564920273348519e-07, + "loss": 0.87, + "step": 11970 + }, + { + "epoch": 0.18642919971665875, + "grad_norm": 3.944369316101074, + "learning_rate": 8.56410088330247e-07, + "loss": 0.8484, + "step": 11975 + }, + { + "epoch": 0.18650704071862814, + "grad_norm": 3.154167413711548, + "learning_rate": 8.563281493256419e-07, + "loss": 0.728, + "step": 11980 + }, + { + "epoch": 0.1865848817205975, + "grad_norm": 5.012053489685059, + "learning_rate": 8.562462103210369e-07, + "loss": 0.7125, + "step": 11985 + }, + { + "epoch": 0.1866627227225669, + "grad_norm": 5.746982097625732, + "learning_rate": 8.56164271316432e-07, + "loss": 0.7059, + "step": 11990 + }, + { + "epoch": 0.18674056372453626, + "grad_norm": 3.114208698272705, + "learning_rate": 8.56082332311827e-07, + "loss": 0.76, + "step": 11995 + }, + { + "epoch": 0.18681840472650563, + "grad_norm": 3.1285858154296875, + "learning_rate": 8.560003933072221e-07, + "loss": 0.8049, + "step": 12000 + }, + { + "epoch": 0.18689624572847502, + "grad_norm": 3.38901686668396, + "learning_rate": 8.559184543026171e-07, + "loss": 0.6621, + "step": 12005 + }, + { + "epoch": 0.18697408673044438, + "grad_norm": 3.768120050430298, + "learning_rate": 8.558365152980122e-07, + "loss": 0.7425, + "step": 12010 + }, + { + "epoch": 0.18705192773241378, + "grad_norm": 5.235153675079346, + "learning_rate": 8.557545762934072e-07, + "loss": 0.731, + "step": 12015 + }, + { + "epoch": 0.18712976873438314, + "grad_norm": 5.779262065887451, + "learning_rate": 8.556726372888021e-07, + "loss": 0.6507, + "step": 12020 + }, + { + "epoch": 0.18720760973635253, + "grad_norm": 6.876397609710693, + "learning_rate": 8.555906982841972e-07, + "loss": 0.9053, + "step": 12025 + }, + { + "epoch": 0.1872854507383219, + "grad_norm": 3.9089648723602295, + "learning_rate": 8.555087592795922e-07, + "loss": 0.887, + "step": 12030 + }, + { + "epoch": 0.1873632917402913, + "grad_norm": 4.342790603637695, + "learning_rate": 8.554268202749872e-07, + "loss": 0.8031, + "step": 12035 + }, + { + "epoch": 0.18744113274226065, + "grad_norm": 4.621131420135498, + "learning_rate": 8.553448812703823e-07, + "loss": 0.7683, + "step": 12040 + }, + { + "epoch": 0.18751897374423004, + "grad_norm": 4.215606212615967, + "learning_rate": 8.552629422657774e-07, + "loss": 0.7317, + "step": 12045 + }, + { + "epoch": 0.1875968147461994, + "grad_norm": 4.2532172203063965, + "learning_rate": 8.551810032611723e-07, + "loss": 0.8652, + "step": 12050 + }, + { + "epoch": 0.1876746557481688, + "grad_norm": 3.5596415996551514, + "learning_rate": 8.550990642565674e-07, + "loss": 0.8786, + "step": 12055 + }, + { + "epoch": 0.18775249675013816, + "grad_norm": 4.018403053283691, + "learning_rate": 8.550171252519624e-07, + "loss": 0.8089, + "step": 12060 + }, + { + "epoch": 0.18783033775210756, + "grad_norm": 3.3754570484161377, + "learning_rate": 8.549351862473573e-07, + "loss": 0.8194, + "step": 12065 + }, + { + "epoch": 0.18790817875407692, + "grad_norm": 3.737074375152588, + "learning_rate": 8.548532472427524e-07, + "loss": 0.881, + "step": 12070 + }, + { + "epoch": 0.1879860197560463, + "grad_norm": 2.991834878921509, + "learning_rate": 8.547713082381475e-07, + "loss": 0.7424, + "step": 12075 + }, + { + "epoch": 0.18806386075801568, + "grad_norm": 12.81692886352539, + "learning_rate": 8.546893692335425e-07, + "loss": 0.747, + "step": 12080 + }, + { + "epoch": 0.18814170175998504, + "grad_norm": 5.266377925872803, + "learning_rate": 8.546074302289375e-07, + "loss": 0.799, + "step": 12085 + }, + { + "epoch": 0.18821954276195443, + "grad_norm": 4.115131378173828, + "learning_rate": 8.545254912243326e-07, + "loss": 0.8458, + "step": 12090 + }, + { + "epoch": 0.1882973837639238, + "grad_norm": 2.350405216217041, + "learning_rate": 8.544435522197276e-07, + "loss": 0.8161, + "step": 12095 + }, + { + "epoch": 0.1883752247658932, + "grad_norm": 6.091660499572754, + "learning_rate": 8.543616132151227e-07, + "loss": 0.7929, + "step": 12100 + }, + { + "epoch": 0.18845306576786255, + "grad_norm": 5.162372589111328, + "learning_rate": 8.542796742105176e-07, + "loss": 0.7737, + "step": 12105 + }, + { + "epoch": 0.18853090676983195, + "grad_norm": 3.9324533939361572, + "learning_rate": 8.541977352059126e-07, + "loss": 0.7208, + "step": 12110 + }, + { + "epoch": 0.1886087477718013, + "grad_norm": 5.336818218231201, + "learning_rate": 8.541157962013077e-07, + "loss": 0.8538, + "step": 12115 + }, + { + "epoch": 0.1886865887737707, + "grad_norm": 4.819896221160889, + "learning_rate": 8.540338571967027e-07, + "loss": 0.8404, + "step": 12120 + }, + { + "epoch": 0.18876442977574007, + "grad_norm": 4.829216480255127, + "learning_rate": 8.539519181920978e-07, + "loss": 0.8186, + "step": 12125 + }, + { + "epoch": 0.18884227077770946, + "grad_norm": 3.225750684738159, + "learning_rate": 8.538699791874928e-07, + "loss": 0.605, + "step": 12130 + }, + { + "epoch": 0.18892011177967882, + "grad_norm": 5.357349872589111, + "learning_rate": 8.537880401828879e-07, + "loss": 0.8215, + "step": 12135 + }, + { + "epoch": 0.18899795278164822, + "grad_norm": 5.640933990478516, + "learning_rate": 8.537061011782829e-07, + "loss": 0.844, + "step": 12140 + }, + { + "epoch": 0.18907579378361758, + "grad_norm": 6.281761169433594, + "learning_rate": 8.536241621736778e-07, + "loss": 0.7619, + "step": 12145 + }, + { + "epoch": 0.18915363478558697, + "grad_norm": 4.4332661628723145, + "learning_rate": 8.535422231690729e-07, + "loss": 0.7782, + "step": 12150 + }, + { + "epoch": 0.18923147578755634, + "grad_norm": 6.337418556213379, + "learning_rate": 8.53460284164468e-07, + "loss": 0.8855, + "step": 12155 + }, + { + "epoch": 0.18930931678952573, + "grad_norm": 3.157975912094116, + "learning_rate": 8.533783451598629e-07, + "loss": 0.6462, + "step": 12160 + }, + { + "epoch": 0.1893871577914951, + "grad_norm": 3.5442986488342285, + "learning_rate": 8.53296406155258e-07, + "loss": 0.7451, + "step": 12165 + }, + { + "epoch": 0.18946499879346446, + "grad_norm": 3.5435712337493896, + "learning_rate": 8.532144671506531e-07, + "loss": 0.741, + "step": 12170 + }, + { + "epoch": 0.18954283979543385, + "grad_norm": 2.5443406105041504, + "learning_rate": 8.53132528146048e-07, + "loss": 0.8358, + "step": 12175 + }, + { + "epoch": 0.1896206807974032, + "grad_norm": 3.2555923461914062, + "learning_rate": 8.530505891414431e-07, + "loss": 0.7672, + "step": 12180 + }, + { + "epoch": 0.1896985217993726, + "grad_norm": 4.258267879486084, + "learning_rate": 8.529686501368381e-07, + "loss": 0.6727, + "step": 12185 + }, + { + "epoch": 0.18977636280134197, + "grad_norm": 5.2711920738220215, + "learning_rate": 8.528867111322331e-07, + "loss": 0.6505, + "step": 12190 + }, + { + "epoch": 0.18985420380331136, + "grad_norm": 3.2238636016845703, + "learning_rate": 8.528047721276281e-07, + "loss": 0.8701, + "step": 12195 + }, + { + "epoch": 0.18993204480528073, + "grad_norm": 3.0218734741210938, + "learning_rate": 8.527228331230232e-07, + "loss": 0.7897, + "step": 12200 + }, + { + "epoch": 0.19000988580725012, + "grad_norm": 3.167506694793701, + "learning_rate": 8.526408941184182e-07, + "loss": 0.7731, + "step": 12205 + }, + { + "epoch": 0.19008772680921948, + "grad_norm": 2.4751815795898438, + "learning_rate": 8.525589551138132e-07, + "loss": 0.6739, + "step": 12210 + }, + { + "epoch": 0.19016556781118887, + "grad_norm": 6.196854114532471, + "learning_rate": 8.524770161092083e-07, + "loss": 0.7885, + "step": 12215 + }, + { + "epoch": 0.19024340881315824, + "grad_norm": 3.2802228927612305, + "learning_rate": 8.523950771046033e-07, + "loss": 0.8174, + "step": 12220 + }, + { + "epoch": 0.19032124981512763, + "grad_norm": 2.8430838584899902, + "learning_rate": 8.523131380999983e-07, + "loss": 0.7345, + "step": 12225 + }, + { + "epoch": 0.190399090817097, + "grad_norm": 3.185624837875366, + "learning_rate": 8.522311990953933e-07, + "loss": 0.7387, + "step": 12230 + }, + { + "epoch": 0.1904769318190664, + "grad_norm": 4.954017639160156, + "learning_rate": 8.521492600907883e-07, + "loss": 0.7631, + "step": 12235 + }, + { + "epoch": 0.19055477282103575, + "grad_norm": 3.4082043170928955, + "learning_rate": 8.520673210861834e-07, + "loss": 0.6641, + "step": 12240 + }, + { + "epoch": 0.19063261382300514, + "grad_norm": 4.00759744644165, + "learning_rate": 8.519853820815785e-07, + "loss": 0.6956, + "step": 12245 + }, + { + "epoch": 0.1907104548249745, + "grad_norm": 3.7741944789886475, + "learning_rate": 8.519034430769735e-07, + "loss": 0.8373, + "step": 12250 + }, + { + "epoch": 0.19078829582694387, + "grad_norm": 8.064949035644531, + "learning_rate": 8.518215040723685e-07, + "loss": 0.8908, + "step": 12255 + }, + { + "epoch": 0.19086613682891326, + "grad_norm": 3.7881693840026855, + "learning_rate": 8.517395650677636e-07, + "loss": 0.7637, + "step": 12260 + }, + { + "epoch": 0.19094397783088263, + "grad_norm": 3.755758762359619, + "learning_rate": 8.516576260631585e-07, + "loss": 0.7973, + "step": 12265 + }, + { + "epoch": 0.19102181883285202, + "grad_norm": 5.2573370933532715, + "learning_rate": 8.515756870585535e-07, + "loss": 0.6995, + "step": 12270 + }, + { + "epoch": 0.19109965983482138, + "grad_norm": 3.6603024005889893, + "learning_rate": 8.514937480539486e-07, + "loss": 0.7853, + "step": 12275 + }, + { + "epoch": 0.19117750083679078, + "grad_norm": 5.118966102600098, + "learning_rate": 8.514118090493437e-07, + "loss": 0.884, + "step": 12280 + }, + { + "epoch": 0.19125534183876014, + "grad_norm": 3.1857612133026123, + "learning_rate": 8.513298700447386e-07, + "loss": 0.804, + "step": 12285 + }, + { + "epoch": 0.19133318284072953, + "grad_norm": 5.507519245147705, + "learning_rate": 8.512479310401337e-07, + "loss": 0.7884, + "step": 12290 + }, + { + "epoch": 0.1914110238426989, + "grad_norm": 8.014042854309082, + "learning_rate": 8.511659920355288e-07, + "loss": 0.9034, + "step": 12295 + }, + { + "epoch": 0.1914888648446683, + "grad_norm": 3.642604112625122, + "learning_rate": 8.510840530309238e-07, + "loss": 0.7831, + "step": 12300 + }, + { + "epoch": 0.19156670584663765, + "grad_norm": 3.5193252563476562, + "learning_rate": 8.510021140263187e-07, + "loss": 0.8202, + "step": 12305 + }, + { + "epoch": 0.19164454684860704, + "grad_norm": 4.22996187210083, + "learning_rate": 8.509201750217138e-07, + "loss": 0.6753, + "step": 12310 + }, + { + "epoch": 0.1917223878505764, + "grad_norm": 3.661057472229004, + "learning_rate": 8.508382360171088e-07, + "loss": 0.785, + "step": 12315 + }, + { + "epoch": 0.1918002288525458, + "grad_norm": 3.3151731491088867, + "learning_rate": 8.507562970125038e-07, + "loss": 0.7575, + "step": 12320 + }, + { + "epoch": 0.19187806985451517, + "grad_norm": 2.6864590644836426, + "learning_rate": 8.506743580078989e-07, + "loss": 0.8164, + "step": 12325 + }, + { + "epoch": 0.19195591085648456, + "grad_norm": 5.895023345947266, + "learning_rate": 8.505924190032939e-07, + "loss": 0.8764, + "step": 12330 + }, + { + "epoch": 0.19203375185845392, + "grad_norm": 5.192370414733887, + "learning_rate": 8.50510479998689e-07, + "loss": 0.6947, + "step": 12335 + }, + { + "epoch": 0.19211159286042329, + "grad_norm": 2.818570375442505, + "learning_rate": 8.50428540994084e-07, + "loss": 0.8609, + "step": 12340 + }, + { + "epoch": 0.19218943386239268, + "grad_norm": 4.249600887298584, + "learning_rate": 8.503466019894789e-07, + "loss": 0.7599, + "step": 12345 + }, + { + "epoch": 0.19226727486436204, + "grad_norm": 3.1460306644439697, + "learning_rate": 8.50264662984874e-07, + "loss": 0.6936, + "step": 12350 + }, + { + "epoch": 0.19234511586633143, + "grad_norm": 9.153656959533691, + "learning_rate": 8.50182723980269e-07, + "loss": 0.8209, + "step": 12355 + }, + { + "epoch": 0.1924229568683008, + "grad_norm": 4.739767074584961, + "learning_rate": 8.50100784975664e-07, + "loss": 0.8558, + "step": 12360 + }, + { + "epoch": 0.1925007978702702, + "grad_norm": 2.902334451675415, + "learning_rate": 8.500188459710591e-07, + "loss": 0.6182, + "step": 12365 + }, + { + "epoch": 0.19257863887223955, + "grad_norm": 5.967688083648682, + "learning_rate": 8.499369069664542e-07, + "loss": 0.843, + "step": 12370 + }, + { + "epoch": 0.19265647987420895, + "grad_norm": 3.8589203357696533, + "learning_rate": 8.498549679618492e-07, + "loss": 0.7795, + "step": 12375 + }, + { + "epoch": 0.1927343208761783, + "grad_norm": 3.3635146617889404, + "learning_rate": 8.497730289572442e-07, + "loss": 0.7461, + "step": 12380 + }, + { + "epoch": 0.1928121618781477, + "grad_norm": 3.772067070007324, + "learning_rate": 8.496910899526392e-07, + "loss": 0.8692, + "step": 12385 + }, + { + "epoch": 0.19289000288011707, + "grad_norm": 3.3439524173736572, + "learning_rate": 8.496091509480343e-07, + "loss": 0.6822, + "step": 12390 + }, + { + "epoch": 0.19296784388208646, + "grad_norm": 6.4128875732421875, + "learning_rate": 8.495272119434292e-07, + "loss": 0.8279, + "step": 12395 + }, + { + "epoch": 0.19304568488405582, + "grad_norm": 3.5066323280334473, + "learning_rate": 8.494452729388243e-07, + "loss": 0.8597, + "step": 12400 + }, + { + "epoch": 0.19312352588602522, + "grad_norm": 4.82898473739624, + "learning_rate": 8.493633339342194e-07, + "loss": 0.8872, + "step": 12405 + }, + { + "epoch": 0.19320136688799458, + "grad_norm": 7.195667266845703, + "learning_rate": 8.492813949296143e-07, + "loss": 0.7993, + "step": 12410 + }, + { + "epoch": 0.19327920788996397, + "grad_norm": 3.0416855812072754, + "learning_rate": 8.491994559250094e-07, + "loss": 0.8976, + "step": 12415 + }, + { + "epoch": 0.19335704889193334, + "grad_norm": 4.605866432189941, + "learning_rate": 8.491175169204045e-07, + "loss": 0.6899, + "step": 12420 + }, + { + "epoch": 0.1934348898939027, + "grad_norm": 2.6966631412506104, + "learning_rate": 8.490355779157995e-07, + "loss": 0.7145, + "step": 12425 + }, + { + "epoch": 0.1935127308958721, + "grad_norm": 4.953130722045898, + "learning_rate": 8.489536389111944e-07, + "loss": 0.789, + "step": 12430 + }, + { + "epoch": 0.19359057189784146, + "grad_norm": 2.913949489593506, + "learning_rate": 8.488716999065895e-07, + "loss": 0.7734, + "step": 12435 + }, + { + "epoch": 0.19366841289981085, + "grad_norm": 2.37611722946167, + "learning_rate": 8.487897609019845e-07, + "loss": 0.663, + "step": 12440 + }, + { + "epoch": 0.1937462539017802, + "grad_norm": 6.736103534698486, + "learning_rate": 8.487078218973795e-07, + "loss": 0.5834, + "step": 12445 + }, + { + "epoch": 0.1938240949037496, + "grad_norm": 7.212032318115234, + "learning_rate": 8.486258828927746e-07, + "loss": 0.7853, + "step": 12450 + }, + { + "epoch": 0.19390193590571897, + "grad_norm": 3.8290274143218994, + "learning_rate": 8.485439438881696e-07, + "loss": 0.7891, + "step": 12455 + }, + { + "epoch": 0.19397977690768836, + "grad_norm": 2.9193921089172363, + "learning_rate": 8.484620048835647e-07, + "loss": 0.8621, + "step": 12460 + }, + { + "epoch": 0.19405761790965773, + "grad_norm": 3.4873313903808594, + "learning_rate": 8.483800658789597e-07, + "loss": 0.6691, + "step": 12465 + }, + { + "epoch": 0.19413545891162712, + "grad_norm": 4.49635124206543, + "learning_rate": 8.482981268743546e-07, + "loss": 0.7417, + "step": 12470 + }, + { + "epoch": 0.19421329991359648, + "grad_norm": 3.002316951751709, + "learning_rate": 8.482161878697497e-07, + "loss": 0.8712, + "step": 12475 + }, + { + "epoch": 0.19429114091556587, + "grad_norm": 2.9635400772094727, + "learning_rate": 8.481342488651448e-07, + "loss": 0.7171, + "step": 12480 + }, + { + "epoch": 0.19436898191753524, + "grad_norm": 3.025629997253418, + "learning_rate": 8.480523098605397e-07, + "loss": 0.7841, + "step": 12485 + }, + { + "epoch": 0.19444682291950463, + "grad_norm": 5.012829303741455, + "learning_rate": 8.479703708559348e-07, + "loss": 0.7381, + "step": 12490 + }, + { + "epoch": 0.194524663921474, + "grad_norm": 4.070782661437988, + "learning_rate": 8.478884318513299e-07, + "loss": 0.7512, + "step": 12495 + }, + { + "epoch": 0.1946025049234434, + "grad_norm": 3.400742769241333, + "learning_rate": 8.478064928467249e-07, + "loss": 0.7998, + "step": 12500 + }, + { + "epoch": 0.19468034592541275, + "grad_norm": 6.554990768432617, + "learning_rate": 8.477245538421199e-07, + "loss": 0.8623, + "step": 12505 + }, + { + "epoch": 0.19475818692738214, + "grad_norm": 3.837843179702759, + "learning_rate": 8.476426148375149e-07, + "loss": 0.7377, + "step": 12510 + }, + { + "epoch": 0.1948360279293515, + "grad_norm": 3.646177291870117, + "learning_rate": 8.4756067583291e-07, + "loss": 0.8649, + "step": 12515 + }, + { + "epoch": 0.19491386893132087, + "grad_norm": 5.887502193450928, + "learning_rate": 8.474787368283049e-07, + "loss": 0.7227, + "step": 12520 + }, + { + "epoch": 0.19499170993329026, + "grad_norm": 3.3450663089752197, + "learning_rate": 8.473967978237e-07, + "loss": 0.7593, + "step": 12525 + }, + { + "epoch": 0.19506955093525963, + "grad_norm": 3.6260218620300293, + "learning_rate": 8.473148588190951e-07, + "loss": 0.6893, + "step": 12530 + }, + { + "epoch": 0.19514739193722902, + "grad_norm": 3.210761308670044, + "learning_rate": 8.4723291981449e-07, + "loss": 0.64, + "step": 12535 + }, + { + "epoch": 0.19522523293919838, + "grad_norm": 7.629252910614014, + "learning_rate": 8.471509808098851e-07, + "loss": 0.7134, + "step": 12540 + }, + { + "epoch": 0.19530307394116778, + "grad_norm": 3.362891435623169, + "learning_rate": 8.470690418052802e-07, + "loss": 0.6621, + "step": 12545 + }, + { + "epoch": 0.19538091494313714, + "grad_norm": 3.2264938354492188, + "learning_rate": 8.469871028006751e-07, + "loss": 0.7544, + "step": 12550 + }, + { + "epoch": 0.19545875594510653, + "grad_norm": 4.121139049530029, + "learning_rate": 8.469051637960701e-07, + "loss": 0.8005, + "step": 12555 + }, + { + "epoch": 0.1955365969470759, + "grad_norm": 4.077367305755615, + "learning_rate": 8.468232247914652e-07, + "loss": 0.7671, + "step": 12560 + }, + { + "epoch": 0.1956144379490453, + "grad_norm": 4.537203311920166, + "learning_rate": 8.467412857868602e-07, + "loss": 0.7573, + "step": 12565 + }, + { + "epoch": 0.19569227895101465, + "grad_norm": 4.151341438293457, + "learning_rate": 8.466593467822553e-07, + "loss": 0.7534, + "step": 12570 + }, + { + "epoch": 0.19577011995298405, + "grad_norm": 4.7857584953308105, + "learning_rate": 8.465774077776503e-07, + "loss": 0.7586, + "step": 12575 + }, + { + "epoch": 0.1958479609549534, + "grad_norm": 3.5333428382873535, + "learning_rate": 8.464954687730453e-07, + "loss": 0.7703, + "step": 12580 + }, + { + "epoch": 0.1959258019569228, + "grad_norm": 3.486469030380249, + "learning_rate": 8.464135297684404e-07, + "loss": 0.7808, + "step": 12585 + }, + { + "epoch": 0.19600364295889217, + "grad_norm": 5.123147487640381, + "learning_rate": 8.463315907638353e-07, + "loss": 0.7586, + "step": 12590 + }, + { + "epoch": 0.19608148396086156, + "grad_norm": 5.875349521636963, + "learning_rate": 8.462496517592303e-07, + "loss": 0.8283, + "step": 12595 + }, + { + "epoch": 0.19615932496283092, + "grad_norm": 3.429596424102783, + "learning_rate": 8.461677127546254e-07, + "loss": 0.9282, + "step": 12600 + }, + { + "epoch": 0.1962371659648003, + "grad_norm": 4.255092620849609, + "learning_rate": 8.460857737500205e-07, + "loss": 0.7489, + "step": 12605 + }, + { + "epoch": 0.19631500696676968, + "grad_norm": 5.298252582550049, + "learning_rate": 8.460038347454154e-07, + "loss": 0.8015, + "step": 12610 + }, + { + "epoch": 0.19639284796873904, + "grad_norm": 3.1947810649871826, + "learning_rate": 8.459218957408105e-07, + "loss": 0.8742, + "step": 12615 + }, + { + "epoch": 0.19647068897070843, + "grad_norm": 2.557737112045288, + "learning_rate": 8.458399567362056e-07, + "loss": 0.7221, + "step": 12620 + }, + { + "epoch": 0.1965485299726778, + "grad_norm": 3.7738475799560547, + "learning_rate": 8.457580177316007e-07, + "loss": 0.8054, + "step": 12625 + }, + { + "epoch": 0.1966263709746472, + "grad_norm": 3.820087432861328, + "learning_rate": 8.456760787269955e-07, + "loss": 0.8202, + "step": 12630 + }, + { + "epoch": 0.19670421197661656, + "grad_norm": 9.728828430175781, + "learning_rate": 8.455941397223906e-07, + "loss": 0.9442, + "step": 12635 + }, + { + "epoch": 0.19678205297858595, + "grad_norm": 2.848874807357788, + "learning_rate": 8.455122007177857e-07, + "loss": 0.8407, + "step": 12640 + }, + { + "epoch": 0.1968598939805553, + "grad_norm": 2.538989305496216, + "learning_rate": 8.454302617131806e-07, + "loss": 0.6267, + "step": 12645 + }, + { + "epoch": 0.1969377349825247, + "grad_norm": 4.164561748504639, + "learning_rate": 8.453483227085757e-07, + "loss": 0.8619, + "step": 12650 + }, + { + "epoch": 0.19701557598449407, + "grad_norm": 2.70489239692688, + "learning_rate": 8.452663837039708e-07, + "loss": 0.6901, + "step": 12655 + }, + { + "epoch": 0.19709341698646346, + "grad_norm": 5.329331398010254, + "learning_rate": 8.451844446993658e-07, + "loss": 0.8083, + "step": 12660 + }, + { + "epoch": 0.19717125798843282, + "grad_norm": 3.1759660243988037, + "learning_rate": 8.451025056947608e-07, + "loss": 0.6953, + "step": 12665 + }, + { + "epoch": 0.19724909899040222, + "grad_norm": 3.253971576690674, + "learning_rate": 8.450205666901558e-07, + "loss": 0.7275, + "step": 12670 + }, + { + "epoch": 0.19732693999237158, + "grad_norm": 4.8069634437561035, + "learning_rate": 8.449386276855508e-07, + "loss": 0.874, + "step": 12675 + }, + { + "epoch": 0.19740478099434097, + "grad_norm": 5.485657215118408, + "learning_rate": 8.448566886809458e-07, + "loss": 0.7752, + "step": 12680 + }, + { + "epoch": 0.19748262199631034, + "grad_norm": 3.5033183097839355, + "learning_rate": 8.447747496763409e-07, + "loss": 0.8751, + "step": 12685 + }, + { + "epoch": 0.1975604629982797, + "grad_norm": 3.121022939682007, + "learning_rate": 8.446928106717359e-07, + "loss": 0.7357, + "step": 12690 + }, + { + "epoch": 0.1976383040002491, + "grad_norm": 10.636984825134277, + "learning_rate": 8.44610871667131e-07, + "loss": 0.8506, + "step": 12695 + }, + { + "epoch": 0.19771614500221846, + "grad_norm": 5.662878513336182, + "learning_rate": 8.44528932662526e-07, + "loss": 0.8426, + "step": 12700 + }, + { + "epoch": 0.19779398600418785, + "grad_norm": 3.862569808959961, + "learning_rate": 8.44446993657921e-07, + "loss": 0.8391, + "step": 12705 + }, + { + "epoch": 0.1978718270061572, + "grad_norm": 5.017756938934326, + "learning_rate": 8.44365054653316e-07, + "loss": 0.8664, + "step": 12710 + }, + { + "epoch": 0.1979496680081266, + "grad_norm": 3.679381847381592, + "learning_rate": 8.44283115648711e-07, + "loss": 0.95, + "step": 12715 + }, + { + "epoch": 0.19802750901009597, + "grad_norm": 3.268360137939453, + "learning_rate": 8.44201176644106e-07, + "loss": 0.7672, + "step": 12720 + }, + { + "epoch": 0.19810535001206536, + "grad_norm": 5.429895401000977, + "learning_rate": 8.441192376395011e-07, + "loss": 0.7644, + "step": 12725 + }, + { + "epoch": 0.19818319101403473, + "grad_norm": 5.481542587280273, + "learning_rate": 8.440372986348962e-07, + "loss": 0.8911, + "step": 12730 + }, + { + "epoch": 0.19826103201600412, + "grad_norm": 3.77721905708313, + "learning_rate": 8.439553596302911e-07, + "loss": 0.7636, + "step": 12735 + }, + { + "epoch": 0.19833887301797348, + "grad_norm": 3.9851577281951904, + "learning_rate": 8.438734206256862e-07, + "loss": 0.9312, + "step": 12740 + }, + { + "epoch": 0.19841671401994287, + "grad_norm": 3.65083909034729, + "learning_rate": 8.437914816210813e-07, + "loss": 0.7529, + "step": 12745 + }, + { + "epoch": 0.19849455502191224, + "grad_norm": 3.0100839138031006, + "learning_rate": 8.437095426164764e-07, + "loss": 0.7147, + "step": 12750 + }, + { + "epoch": 0.19857239602388163, + "grad_norm": 3.7276885509490967, + "learning_rate": 8.436276036118712e-07, + "loss": 0.7436, + "step": 12755 + }, + { + "epoch": 0.198650237025851, + "grad_norm": 3.0928821563720703, + "learning_rate": 8.435456646072663e-07, + "loss": 0.7429, + "step": 12760 + }, + { + "epoch": 0.1987280780278204, + "grad_norm": 3.2721216678619385, + "learning_rate": 8.434637256026614e-07, + "loss": 0.7477, + "step": 12765 + }, + { + "epoch": 0.19880591902978975, + "grad_norm": 4.112125396728516, + "learning_rate": 8.433817865980563e-07, + "loss": 0.7392, + "step": 12770 + }, + { + "epoch": 0.19888376003175912, + "grad_norm": 3.3520290851593018, + "learning_rate": 8.432998475934514e-07, + "loss": 0.8243, + "step": 12775 + }, + { + "epoch": 0.1989616010337285, + "grad_norm": 2.7560410499572754, + "learning_rate": 8.432179085888465e-07, + "loss": 0.8496, + "step": 12780 + }, + { + "epoch": 0.19903944203569787, + "grad_norm": 5.738790988922119, + "learning_rate": 8.431359695842415e-07, + "loss": 0.8122, + "step": 12785 + }, + { + "epoch": 0.19911728303766726, + "grad_norm": 3.3104143142700195, + "learning_rate": 8.430540305796365e-07, + "loss": 0.8499, + "step": 12790 + }, + { + "epoch": 0.19919512403963663, + "grad_norm": 3.488959312438965, + "learning_rate": 8.429720915750315e-07, + "loss": 0.7974, + "step": 12795 + }, + { + "epoch": 0.19927296504160602, + "grad_norm": 5.885444164276123, + "learning_rate": 8.428901525704265e-07, + "loss": 0.8403, + "step": 12800 + }, + { + "epoch": 0.19935080604357538, + "grad_norm": 7.075294017791748, + "learning_rate": 8.428082135658215e-07, + "loss": 0.9154, + "step": 12805 + }, + { + "epoch": 0.19942864704554478, + "grad_norm": 7.824799060821533, + "learning_rate": 8.427262745612166e-07, + "loss": 0.8342, + "step": 12810 + }, + { + "epoch": 0.19950648804751414, + "grad_norm": 3.2951488494873047, + "learning_rate": 8.426443355566116e-07, + "loss": 0.8083, + "step": 12815 + }, + { + "epoch": 0.19958432904948353, + "grad_norm": 4.159008026123047, + "learning_rate": 8.425623965520067e-07, + "loss": 0.9696, + "step": 12820 + }, + { + "epoch": 0.1996621700514529, + "grad_norm": 2.9382009506225586, + "learning_rate": 8.424804575474017e-07, + "loss": 0.7681, + "step": 12825 + }, + { + "epoch": 0.1997400110534223, + "grad_norm": 4.053783893585205, + "learning_rate": 8.423985185427967e-07, + "loss": 0.8329, + "step": 12830 + }, + { + "epoch": 0.19981785205539165, + "grad_norm": 5.080565452575684, + "learning_rate": 8.423165795381917e-07, + "loss": 0.777, + "step": 12835 + }, + { + "epoch": 0.19989569305736105, + "grad_norm": 2.973886251449585, + "learning_rate": 8.422346405335868e-07, + "loss": 0.8252, + "step": 12840 + }, + { + "epoch": 0.1999735340593304, + "grad_norm": 6.690211296081543, + "learning_rate": 8.421527015289817e-07, + "loss": 0.7814, + "step": 12845 + }, + { + "epoch": 0.2000513750612998, + "grad_norm": 3.9737579822540283, + "learning_rate": 8.420707625243768e-07, + "loss": 0.8165, + "step": 12850 + }, + { + "epoch": 0.20012921606326917, + "grad_norm": 4.720776081085205, + "learning_rate": 8.419888235197719e-07, + "loss": 0.8016, + "step": 12855 + }, + { + "epoch": 0.20020705706523853, + "grad_norm": 3.9128286838531494, + "learning_rate": 8.419068845151668e-07, + "loss": 0.8222, + "step": 12860 + }, + { + "epoch": 0.20028489806720792, + "grad_norm": 2.903087854385376, + "learning_rate": 8.418249455105619e-07, + "loss": 0.8552, + "step": 12865 + }, + { + "epoch": 0.2003627390691773, + "grad_norm": 3.1490824222564697, + "learning_rate": 8.41743006505957e-07, + "loss": 0.7545, + "step": 12870 + }, + { + "epoch": 0.20044058007114668, + "grad_norm": 7.775855541229248, + "learning_rate": 8.41661067501352e-07, + "loss": 0.8454, + "step": 12875 + }, + { + "epoch": 0.20051842107311604, + "grad_norm": 4.137736797332764, + "learning_rate": 8.415791284967469e-07, + "loss": 0.8048, + "step": 12880 + }, + { + "epoch": 0.20059626207508544, + "grad_norm": 2.8976640701293945, + "learning_rate": 8.41497189492142e-07, + "loss": 0.709, + "step": 12885 + }, + { + "epoch": 0.2006741030770548, + "grad_norm": 3.4031612873077393, + "learning_rate": 8.414152504875371e-07, + "loss": 0.7075, + "step": 12890 + }, + { + "epoch": 0.2007519440790242, + "grad_norm": 4.480353832244873, + "learning_rate": 8.41333311482932e-07, + "loss": 0.7674, + "step": 12895 + }, + { + "epoch": 0.20082978508099356, + "grad_norm": 3.8333520889282227, + "learning_rate": 8.412513724783271e-07, + "loss": 0.7645, + "step": 12900 + }, + { + "epoch": 0.20090762608296295, + "grad_norm": 3.1221156120300293, + "learning_rate": 8.411694334737222e-07, + "loss": 0.7816, + "step": 12905 + }, + { + "epoch": 0.2009854670849323, + "grad_norm": 22.836925506591797, + "learning_rate": 8.410874944691172e-07, + "loss": 0.8316, + "step": 12910 + }, + { + "epoch": 0.2010633080869017, + "grad_norm": 4.997188568115234, + "learning_rate": 8.410055554645121e-07, + "loss": 0.7576, + "step": 12915 + }, + { + "epoch": 0.20114114908887107, + "grad_norm": 4.20601749420166, + "learning_rate": 8.409236164599072e-07, + "loss": 0.7546, + "step": 12920 + }, + { + "epoch": 0.20121899009084046, + "grad_norm": 4.1795783042907715, + "learning_rate": 8.408416774553022e-07, + "loss": 0.845, + "step": 12925 + }, + { + "epoch": 0.20129683109280982, + "grad_norm": 3.2515761852264404, + "learning_rate": 8.407597384506973e-07, + "loss": 0.7512, + "step": 12930 + }, + { + "epoch": 0.20137467209477922, + "grad_norm": 5.808708190917969, + "learning_rate": 8.406777994460923e-07, + "loss": 0.7683, + "step": 12935 + }, + { + "epoch": 0.20145251309674858, + "grad_norm": 2.7343006134033203, + "learning_rate": 8.405958604414873e-07, + "loss": 0.8085, + "step": 12940 + }, + { + "epoch": 0.20153035409871795, + "grad_norm": 4.954883098602295, + "learning_rate": 8.405139214368824e-07, + "loss": 0.7211, + "step": 12945 + }, + { + "epoch": 0.20160819510068734, + "grad_norm": 8.119803428649902, + "learning_rate": 8.404319824322774e-07, + "loss": 0.8099, + "step": 12950 + }, + { + "epoch": 0.2016860361026567, + "grad_norm": 3.069868803024292, + "learning_rate": 8.403500434276723e-07, + "loss": 0.7915, + "step": 12955 + }, + { + "epoch": 0.2017638771046261, + "grad_norm": 2.7691965103149414, + "learning_rate": 8.402681044230674e-07, + "loss": 0.8081, + "step": 12960 + }, + { + "epoch": 0.20184171810659546, + "grad_norm": 2.716383457183838, + "learning_rate": 8.401861654184625e-07, + "loss": 0.8375, + "step": 12965 + }, + { + "epoch": 0.20191955910856485, + "grad_norm": 4.081620693206787, + "learning_rate": 8.401042264138574e-07, + "loss": 0.8029, + "step": 12970 + }, + { + "epoch": 0.20199740011053421, + "grad_norm": 4.608429908752441, + "learning_rate": 8.400222874092525e-07, + "loss": 0.9028, + "step": 12975 + }, + { + "epoch": 0.2020752411125036, + "grad_norm": 2.8588662147521973, + "learning_rate": 8.399403484046476e-07, + "loss": 0.7334, + "step": 12980 + }, + { + "epoch": 0.20215308211447297, + "grad_norm": 5.044338703155518, + "learning_rate": 8.398584094000425e-07, + "loss": 0.7045, + "step": 12985 + }, + { + "epoch": 0.20223092311644236, + "grad_norm": 3.933218479156494, + "learning_rate": 8.397764703954376e-07, + "loss": 0.8562, + "step": 12990 + }, + { + "epoch": 0.20230876411841173, + "grad_norm": 4.926372528076172, + "learning_rate": 8.396945313908326e-07, + "loss": 0.737, + "step": 12995 + }, + { + "epoch": 0.20238660512038112, + "grad_norm": 3.583951711654663, + "learning_rate": 8.396125923862277e-07, + "loss": 0.7308, + "step": 13000 + }, + { + "epoch": 0.20246444612235048, + "grad_norm": 3.6071455478668213, + "learning_rate": 8.395306533816226e-07, + "loss": 0.7894, + "step": 13005 + }, + { + "epoch": 0.20254228712431988, + "grad_norm": 4.990063667297363, + "learning_rate": 8.394487143770177e-07, + "loss": 0.8604, + "step": 13010 + }, + { + "epoch": 0.20262012812628924, + "grad_norm": 5.4818549156188965, + "learning_rate": 8.393667753724128e-07, + "loss": 0.6678, + "step": 13015 + }, + { + "epoch": 0.20269796912825863, + "grad_norm": 7.279552936553955, + "learning_rate": 8.392848363678078e-07, + "loss": 0.8089, + "step": 13020 + }, + { + "epoch": 0.202775810130228, + "grad_norm": 5.273457050323486, + "learning_rate": 8.392028973632028e-07, + "loss": 0.6855, + "step": 13025 + }, + { + "epoch": 0.20285365113219736, + "grad_norm": 5.578176021575928, + "learning_rate": 8.391209583585979e-07, + "loss": 0.8303, + "step": 13030 + }, + { + "epoch": 0.20293149213416675, + "grad_norm": 4.444499492645264, + "learning_rate": 8.390390193539929e-07, + "loss": 0.6954, + "step": 13035 + }, + { + "epoch": 0.20300933313613612, + "grad_norm": 4.023461818695068, + "learning_rate": 8.389570803493878e-07, + "loss": 0.678, + "step": 13040 + }, + { + "epoch": 0.2030871741381055, + "grad_norm": 2.834993839263916, + "learning_rate": 8.388751413447829e-07, + "loss": 0.7115, + "step": 13045 + }, + { + "epoch": 0.20316501514007487, + "grad_norm": 4.737156391143799, + "learning_rate": 8.387932023401779e-07, + "loss": 0.7809, + "step": 13050 + }, + { + "epoch": 0.20324285614204426, + "grad_norm": 12.347189903259277, + "learning_rate": 8.38711263335573e-07, + "loss": 0.7702, + "step": 13055 + }, + { + "epoch": 0.20332069714401363, + "grad_norm": 4.668651103973389, + "learning_rate": 8.38629324330968e-07, + "loss": 0.7655, + "step": 13060 + }, + { + "epoch": 0.20339853814598302, + "grad_norm": 3.7084391117095947, + "learning_rate": 8.38547385326363e-07, + "loss": 0.714, + "step": 13065 + }, + { + "epoch": 0.20347637914795239, + "grad_norm": 2.783339262008667, + "learning_rate": 8.384654463217581e-07, + "loss": 0.6384, + "step": 13070 + }, + { + "epoch": 0.20355422014992178, + "grad_norm": 3.5190024375915527, + "learning_rate": 8.383835073171532e-07, + "loss": 0.85, + "step": 13075 + }, + { + "epoch": 0.20363206115189114, + "grad_norm": 3.285179615020752, + "learning_rate": 8.38301568312548e-07, + "loss": 0.7181, + "step": 13080 + }, + { + "epoch": 0.20370990215386053, + "grad_norm": 5.458109378814697, + "learning_rate": 8.382196293079431e-07, + "loss": 0.9302, + "step": 13085 + }, + { + "epoch": 0.2037877431558299, + "grad_norm": 4.708980083465576, + "learning_rate": 8.381376903033382e-07, + "loss": 0.8137, + "step": 13090 + }, + { + "epoch": 0.2038655841577993, + "grad_norm": 3.8359594345092773, + "learning_rate": 8.380557512987331e-07, + "loss": 0.6734, + "step": 13095 + }, + { + "epoch": 0.20394342515976865, + "grad_norm": 3.476069688796997, + "learning_rate": 8.379738122941282e-07, + "loss": 0.6435, + "step": 13100 + }, + { + "epoch": 0.20402126616173805, + "grad_norm": 4.068753242492676, + "learning_rate": 8.378918732895233e-07, + "loss": 0.7332, + "step": 13105 + }, + { + "epoch": 0.2040991071637074, + "grad_norm": 5.591004371643066, + "learning_rate": 8.378099342849184e-07, + "loss": 0.7835, + "step": 13110 + }, + { + "epoch": 0.20417694816567677, + "grad_norm": 4.07530403137207, + "learning_rate": 8.377279952803133e-07, + "loss": 0.7763, + "step": 13115 + }, + { + "epoch": 0.20425478916764617, + "grad_norm": 6.400569915771484, + "learning_rate": 8.376460562757083e-07, + "loss": 0.7816, + "step": 13120 + }, + { + "epoch": 0.20433263016961553, + "grad_norm": 4.189541339874268, + "learning_rate": 8.375641172711034e-07, + "loss": 0.8091, + "step": 13125 + }, + { + "epoch": 0.20441047117158492, + "grad_norm": 4.928549289703369, + "learning_rate": 8.374821782664983e-07, + "loss": 0.7569, + "step": 13130 + }, + { + "epoch": 0.2044883121735543, + "grad_norm": 3.725872039794922, + "learning_rate": 8.374002392618934e-07, + "loss": 0.8412, + "step": 13135 + }, + { + "epoch": 0.20456615317552368, + "grad_norm": 4.936779975891113, + "learning_rate": 8.373183002572885e-07, + "loss": 0.7803, + "step": 13140 + }, + { + "epoch": 0.20464399417749304, + "grad_norm": 3.692831039428711, + "learning_rate": 8.372363612526835e-07, + "loss": 0.7979, + "step": 13145 + }, + { + "epoch": 0.20472183517946244, + "grad_norm": 5.788731098175049, + "learning_rate": 8.371544222480785e-07, + "loss": 0.7817, + "step": 13150 + }, + { + "epoch": 0.2047996761814318, + "grad_norm": 5.3713154792785645, + "learning_rate": 8.370724832434736e-07, + "loss": 0.7444, + "step": 13155 + }, + { + "epoch": 0.2048775171834012, + "grad_norm": 3.327475070953369, + "learning_rate": 8.369905442388685e-07, + "loss": 0.853, + "step": 13160 + }, + { + "epoch": 0.20495535818537056, + "grad_norm": 4.377721309661865, + "learning_rate": 8.369086052342635e-07, + "loss": 0.7105, + "step": 13165 + }, + { + "epoch": 0.20503319918733995, + "grad_norm": 2.7629923820495605, + "learning_rate": 8.368266662296586e-07, + "loss": 0.7283, + "step": 13170 + }, + { + "epoch": 0.2051110401893093, + "grad_norm": 4.508754730224609, + "learning_rate": 8.367447272250536e-07, + "loss": 0.7238, + "step": 13175 + }, + { + "epoch": 0.2051888811912787, + "grad_norm": 3.4627845287323, + "learning_rate": 8.366627882204487e-07, + "loss": 0.7426, + "step": 13180 + }, + { + "epoch": 0.20526672219324807, + "grad_norm": 4.33838415145874, + "learning_rate": 8.365808492158437e-07, + "loss": 0.7616, + "step": 13185 + }, + { + "epoch": 0.20534456319521746, + "grad_norm": 4.036031723022461, + "learning_rate": 8.364989102112387e-07, + "loss": 0.6372, + "step": 13190 + }, + { + "epoch": 0.20542240419718683, + "grad_norm": 9.265192985534668, + "learning_rate": 8.364169712066338e-07, + "loss": 0.6713, + "step": 13195 + }, + { + "epoch": 0.20550024519915622, + "grad_norm": 5.069167613983154, + "learning_rate": 8.363350322020288e-07, + "loss": 0.8708, + "step": 13200 + }, + { + "epoch": 0.20557808620112558, + "grad_norm": 3.5143802165985107, + "learning_rate": 8.362530931974237e-07, + "loss": 0.8575, + "step": 13205 + }, + { + "epoch": 0.20565592720309495, + "grad_norm": 4.60297966003418, + "learning_rate": 8.361711541928188e-07, + "loss": 0.8428, + "step": 13210 + }, + { + "epoch": 0.20573376820506434, + "grad_norm": 4.301421642303467, + "learning_rate": 8.360892151882139e-07, + "loss": 0.7471, + "step": 13215 + }, + { + "epoch": 0.2058116092070337, + "grad_norm": 7.8147125244140625, + "learning_rate": 8.360072761836088e-07, + "loss": 0.6705, + "step": 13220 + }, + { + "epoch": 0.2058894502090031, + "grad_norm": 4.782034873962402, + "learning_rate": 8.359253371790039e-07, + "loss": 0.695, + "step": 13225 + }, + { + "epoch": 0.20596729121097246, + "grad_norm": 4.737283229827881, + "learning_rate": 8.35843398174399e-07, + "loss": 0.8739, + "step": 13230 + }, + { + "epoch": 0.20604513221294185, + "grad_norm": 4.9094014167785645, + "learning_rate": 8.357614591697941e-07, + "loss": 0.7827, + "step": 13235 + }, + { + "epoch": 0.20612297321491121, + "grad_norm": 4.747406959533691, + "learning_rate": 8.356795201651889e-07, + "loss": 0.7785, + "step": 13240 + }, + { + "epoch": 0.2062008142168806, + "grad_norm": 3.4002225399017334, + "learning_rate": 8.35597581160584e-07, + "loss": 0.8564, + "step": 13245 + }, + { + "epoch": 0.20627865521884997, + "grad_norm": 5.391127109527588, + "learning_rate": 8.355156421559791e-07, + "loss": 0.7454, + "step": 13250 + }, + { + "epoch": 0.20635649622081936, + "grad_norm": 4.001280784606934, + "learning_rate": 8.35433703151374e-07, + "loss": 0.7198, + "step": 13255 + }, + { + "epoch": 0.20643433722278873, + "grad_norm": 3.865567207336426, + "learning_rate": 8.353517641467691e-07, + "loss": 0.8462, + "step": 13260 + }, + { + "epoch": 0.20651217822475812, + "grad_norm": 5.007477283477783, + "learning_rate": 8.352698251421642e-07, + "loss": 0.9695, + "step": 13265 + }, + { + "epoch": 0.20659001922672748, + "grad_norm": 4.396978378295898, + "learning_rate": 8.351878861375592e-07, + "loss": 0.8082, + "step": 13270 + }, + { + "epoch": 0.20666786022869688, + "grad_norm": 3.5921192169189453, + "learning_rate": 8.351059471329542e-07, + "loss": 0.7191, + "step": 13275 + }, + { + "epoch": 0.20674570123066624, + "grad_norm": 3.7623448371887207, + "learning_rate": 8.350240081283492e-07, + "loss": 0.7479, + "step": 13280 + }, + { + "epoch": 0.20682354223263563, + "grad_norm": 6.096179008483887, + "learning_rate": 8.349420691237442e-07, + "loss": 0.7466, + "step": 13285 + }, + { + "epoch": 0.206901383234605, + "grad_norm": 5.423510551452637, + "learning_rate": 8.348601301191393e-07, + "loss": 0.7874, + "step": 13290 + }, + { + "epoch": 0.20697922423657436, + "grad_norm": 4.719470500946045, + "learning_rate": 8.347781911145343e-07, + "loss": 0.7438, + "step": 13295 + }, + { + "epoch": 0.20705706523854375, + "grad_norm": 3.9418418407440186, + "learning_rate": 8.346962521099293e-07, + "loss": 0.7436, + "step": 13300 + }, + { + "epoch": 0.20713490624051312, + "grad_norm": 3.420037269592285, + "learning_rate": 8.346143131053244e-07, + "loss": 0.8426, + "step": 13305 + }, + { + "epoch": 0.2072127472424825, + "grad_norm": 3.9989964962005615, + "learning_rate": 8.345323741007194e-07, + "loss": 0.8473, + "step": 13310 + }, + { + "epoch": 0.20729058824445187, + "grad_norm": 12.799483299255371, + "learning_rate": 8.344504350961144e-07, + "loss": 0.719, + "step": 13315 + }, + { + "epoch": 0.20736842924642127, + "grad_norm": 3.141099691390991, + "learning_rate": 8.343684960915094e-07, + "loss": 0.8719, + "step": 13320 + }, + { + "epoch": 0.20744627024839063, + "grad_norm": 4.788917541503906, + "learning_rate": 8.342865570869045e-07, + "loss": 0.785, + "step": 13325 + }, + { + "epoch": 0.20752411125036002, + "grad_norm": 3.2936322689056396, + "learning_rate": 8.342046180822994e-07, + "loss": 0.8711, + "step": 13330 + }, + { + "epoch": 0.20760195225232939, + "grad_norm": 5.100861072540283, + "learning_rate": 8.341226790776945e-07, + "loss": 0.79, + "step": 13335 + }, + { + "epoch": 0.20767979325429878, + "grad_norm": 2.7465577125549316, + "learning_rate": 8.340407400730896e-07, + "loss": 0.6895, + "step": 13340 + }, + { + "epoch": 0.20775763425626814, + "grad_norm": 5.790363788604736, + "learning_rate": 8.339588010684845e-07, + "loss": 0.7492, + "step": 13345 + }, + { + "epoch": 0.20783547525823753, + "grad_norm": 3.6077206134796143, + "learning_rate": 8.338768620638796e-07, + "loss": 0.6855, + "step": 13350 + }, + { + "epoch": 0.2079133162602069, + "grad_norm": 3.6149632930755615, + "learning_rate": 8.337949230592747e-07, + "loss": 0.7914, + "step": 13355 + }, + { + "epoch": 0.2079911572621763, + "grad_norm": 4.330005168914795, + "learning_rate": 8.337129840546698e-07, + "loss": 0.7572, + "step": 13360 + }, + { + "epoch": 0.20806899826414565, + "grad_norm": 6.381075859069824, + "learning_rate": 8.336310450500646e-07, + "loss": 0.8347, + "step": 13365 + }, + { + "epoch": 0.20814683926611505, + "grad_norm": 3.4584124088287354, + "learning_rate": 8.335491060454597e-07, + "loss": 0.7925, + "step": 13370 + }, + { + "epoch": 0.2082246802680844, + "grad_norm": 5.580089092254639, + "learning_rate": 8.334671670408548e-07, + "loss": 0.7925, + "step": 13375 + }, + { + "epoch": 0.20830252127005378, + "grad_norm": 3.6964595317840576, + "learning_rate": 8.333852280362498e-07, + "loss": 0.7562, + "step": 13380 + }, + { + "epoch": 0.20838036227202317, + "grad_norm": 3.2514872550964355, + "learning_rate": 8.333032890316448e-07, + "loss": 0.7701, + "step": 13385 + }, + { + "epoch": 0.20845820327399253, + "grad_norm": 4.424211502075195, + "learning_rate": 8.332213500270399e-07, + "loss": 0.7148, + "step": 13390 + }, + { + "epoch": 0.20853604427596192, + "grad_norm": 5.740350723266602, + "learning_rate": 8.331394110224349e-07, + "loss": 0.8141, + "step": 13395 + }, + { + "epoch": 0.2086138852779313, + "grad_norm": 3.7232847213745117, + "learning_rate": 8.3305747201783e-07, + "loss": 0.8316, + "step": 13400 + }, + { + "epoch": 0.20869172627990068, + "grad_norm": 4.571991443634033, + "learning_rate": 8.329755330132249e-07, + "loss": 0.6605, + "step": 13405 + }, + { + "epoch": 0.20876956728187004, + "grad_norm": 3.058192729949951, + "learning_rate": 8.328935940086199e-07, + "loss": 0.8583, + "step": 13410 + }, + { + "epoch": 0.20884740828383944, + "grad_norm": 3.834956645965576, + "learning_rate": 8.32811655004015e-07, + "loss": 0.7717, + "step": 13415 + }, + { + "epoch": 0.2089252492858088, + "grad_norm": 2.910698652267456, + "learning_rate": 8.3272971599941e-07, + "loss": 0.8361, + "step": 13420 + }, + { + "epoch": 0.2090030902877782, + "grad_norm": 4.002933025360107, + "learning_rate": 8.32647776994805e-07, + "loss": 0.9264, + "step": 13425 + }, + { + "epoch": 0.20908093128974756, + "grad_norm": 4.509486198425293, + "learning_rate": 8.325658379902001e-07, + "loss": 0.7, + "step": 13430 + }, + { + "epoch": 0.20915877229171695, + "grad_norm": 3.453099250793457, + "learning_rate": 8.324838989855952e-07, + "loss": 0.8373, + "step": 13435 + }, + { + "epoch": 0.2092366132936863, + "grad_norm": 5.16492223739624, + "learning_rate": 8.324019599809901e-07, + "loss": 0.8035, + "step": 13440 + }, + { + "epoch": 0.2093144542956557, + "grad_norm": 5.247521877288818, + "learning_rate": 8.323200209763851e-07, + "loss": 0.7963, + "step": 13445 + }, + { + "epoch": 0.20939229529762507, + "grad_norm": 2.5298125743865967, + "learning_rate": 8.322380819717802e-07, + "loss": 0.8136, + "step": 13450 + }, + { + "epoch": 0.20947013629959446, + "grad_norm": 4.630289077758789, + "learning_rate": 8.321561429671751e-07, + "loss": 0.8526, + "step": 13455 + }, + { + "epoch": 0.20954797730156383, + "grad_norm": 7.223544597625732, + "learning_rate": 8.320742039625702e-07, + "loss": 0.8431, + "step": 13460 + }, + { + "epoch": 0.2096258183035332, + "grad_norm": 4.768827438354492, + "learning_rate": 8.319922649579653e-07, + "loss": 0.9167, + "step": 13465 + }, + { + "epoch": 0.20970365930550258, + "grad_norm": 4.539663314819336, + "learning_rate": 8.319103259533603e-07, + "loss": 0.8486, + "step": 13470 + }, + { + "epoch": 0.20978150030747195, + "grad_norm": 4.0133795738220215, + "learning_rate": 8.318283869487553e-07, + "loss": 0.8473, + "step": 13475 + }, + { + "epoch": 0.20985934130944134, + "grad_norm": 3.0814623832702637, + "learning_rate": 8.317464479441504e-07, + "loss": 0.704, + "step": 13480 + }, + { + "epoch": 0.2099371823114107, + "grad_norm": 4.812824726104736, + "learning_rate": 8.316645089395453e-07, + "loss": 0.7252, + "step": 13485 + }, + { + "epoch": 0.2100150233133801, + "grad_norm": 11.381315231323242, + "learning_rate": 8.315825699349403e-07, + "loss": 0.7837, + "step": 13490 + }, + { + "epoch": 0.21009286431534946, + "grad_norm": 4.615209579467773, + "learning_rate": 8.315006309303354e-07, + "loss": 0.8026, + "step": 13495 + }, + { + "epoch": 0.21017070531731885, + "grad_norm": 4.024731159210205, + "learning_rate": 8.314186919257305e-07, + "loss": 0.8345, + "step": 13500 + }, + { + "epoch": 0.21024854631928822, + "grad_norm": 5.9903764724731445, + "learning_rate": 8.313367529211255e-07, + "loss": 0.7786, + "step": 13505 + }, + { + "epoch": 0.2103263873212576, + "grad_norm": 5.442336082458496, + "learning_rate": 8.312548139165205e-07, + "loss": 0.7564, + "step": 13510 + }, + { + "epoch": 0.21040422832322697, + "grad_norm": 3.8722071647644043, + "learning_rate": 8.311728749119156e-07, + "loss": 0.8054, + "step": 13515 + }, + { + "epoch": 0.21048206932519636, + "grad_norm": 6.410277843475342, + "learning_rate": 8.310909359073106e-07, + "loss": 0.7405, + "step": 13520 + }, + { + "epoch": 0.21055991032716573, + "grad_norm": 4.545468807220459, + "learning_rate": 8.310089969027055e-07, + "loss": 0.6794, + "step": 13525 + }, + { + "epoch": 0.21063775132913512, + "grad_norm": 5.3065667152404785, + "learning_rate": 8.309270578981006e-07, + "loss": 0.8487, + "step": 13530 + }, + { + "epoch": 0.21071559233110448, + "grad_norm": 3.9609336853027344, + "learning_rate": 8.308451188934956e-07, + "loss": 0.7859, + "step": 13535 + }, + { + "epoch": 0.21079343333307388, + "grad_norm": 8.753162384033203, + "learning_rate": 8.307631798888907e-07, + "loss": 0.7646, + "step": 13540 + }, + { + "epoch": 0.21087127433504324, + "grad_norm": 5.301315784454346, + "learning_rate": 8.306812408842857e-07, + "loss": 0.6595, + "step": 13545 + }, + { + "epoch": 0.2109491153370126, + "grad_norm": 3.3421101570129395, + "learning_rate": 8.305993018796807e-07, + "loss": 0.7312, + "step": 13550 + }, + { + "epoch": 0.211026956338982, + "grad_norm": 6.578088283538818, + "learning_rate": 8.305173628750758e-07, + "loss": 0.8821, + "step": 13555 + }, + { + "epoch": 0.21110479734095136, + "grad_norm": 4.213285446166992, + "learning_rate": 8.304354238704709e-07, + "loss": 0.8494, + "step": 13560 + }, + { + "epoch": 0.21118263834292075, + "grad_norm": 3.34028959274292, + "learning_rate": 8.303534848658657e-07, + "loss": 0.802, + "step": 13565 + }, + { + "epoch": 0.21126047934489012, + "grad_norm": 6.317659378051758, + "learning_rate": 8.302715458612608e-07, + "loss": 0.7412, + "step": 13570 + }, + { + "epoch": 0.2113383203468595, + "grad_norm": 3.612773895263672, + "learning_rate": 8.301896068566559e-07, + "loss": 0.772, + "step": 13575 + }, + { + "epoch": 0.21141616134882887, + "grad_norm": 6.666327476501465, + "learning_rate": 8.301076678520508e-07, + "loss": 0.7994, + "step": 13580 + }, + { + "epoch": 0.21149400235079827, + "grad_norm": 3.3673384189605713, + "learning_rate": 8.300257288474459e-07, + "loss": 0.8642, + "step": 13585 + }, + { + "epoch": 0.21157184335276763, + "grad_norm": 4.446502685546875, + "learning_rate": 8.29943789842841e-07, + "loss": 0.8119, + "step": 13590 + }, + { + "epoch": 0.21164968435473702, + "grad_norm": 2.946279525756836, + "learning_rate": 8.29861850838236e-07, + "loss": 0.7246, + "step": 13595 + }, + { + "epoch": 0.2117275253567064, + "grad_norm": 3.9437286853790283, + "learning_rate": 8.29779911833631e-07, + "loss": 0.7658, + "step": 13600 + }, + { + "epoch": 0.21180536635867578, + "grad_norm": 5.029818534851074, + "learning_rate": 8.29697972829026e-07, + "loss": 0.8914, + "step": 13605 + }, + { + "epoch": 0.21188320736064514, + "grad_norm": 3.175124406814575, + "learning_rate": 8.29616033824421e-07, + "loss": 0.7365, + "step": 13610 + }, + { + "epoch": 0.21196104836261453, + "grad_norm": 4.092532634735107, + "learning_rate": 8.29534094819816e-07, + "loss": 0.8208, + "step": 13615 + }, + { + "epoch": 0.2120388893645839, + "grad_norm": 4.064992427825928, + "learning_rate": 8.294521558152111e-07, + "loss": 0.7227, + "step": 13620 + }, + { + "epoch": 0.2121167303665533, + "grad_norm": 4.1546244621276855, + "learning_rate": 8.293702168106062e-07, + "loss": 0.8453, + "step": 13625 + }, + { + "epoch": 0.21219457136852266, + "grad_norm": 2.7323529720306396, + "learning_rate": 8.292882778060012e-07, + "loss": 0.7896, + "step": 13630 + }, + { + "epoch": 0.21227241237049202, + "grad_norm": 3.339479446411133, + "learning_rate": 8.292063388013962e-07, + "loss": 0.7644, + "step": 13635 + }, + { + "epoch": 0.2123502533724614, + "grad_norm": 5.718183994293213, + "learning_rate": 8.291243997967913e-07, + "loss": 0.6686, + "step": 13640 + }, + { + "epoch": 0.21242809437443078, + "grad_norm": 3.4797685146331787, + "learning_rate": 8.290424607921862e-07, + "loss": 0.6722, + "step": 13645 + }, + { + "epoch": 0.21250593537640017, + "grad_norm": 3.628688335418701, + "learning_rate": 8.289605217875813e-07, + "loss": 0.8625, + "step": 13650 + }, + { + "epoch": 0.21258377637836953, + "grad_norm": 5.237287998199463, + "learning_rate": 8.288785827829763e-07, + "loss": 0.92, + "step": 13655 + }, + { + "epoch": 0.21266161738033892, + "grad_norm": 3.847942590713501, + "learning_rate": 8.287966437783713e-07, + "loss": 0.7552, + "step": 13660 + }, + { + "epoch": 0.2127394583823083, + "grad_norm": 3.620185613632202, + "learning_rate": 8.287147047737664e-07, + "loss": 0.7319, + "step": 13665 + }, + { + "epoch": 0.21281729938427768, + "grad_norm": 2.827895402908325, + "learning_rate": 8.286327657691615e-07, + "loss": 0.8578, + "step": 13670 + }, + { + "epoch": 0.21289514038624704, + "grad_norm": 4.379232406616211, + "learning_rate": 8.285508267645564e-07, + "loss": 0.7956, + "step": 13675 + }, + { + "epoch": 0.21297298138821644, + "grad_norm": 5.4567036628723145, + "learning_rate": 8.284688877599515e-07, + "loss": 0.7348, + "step": 13680 + }, + { + "epoch": 0.2130508223901858, + "grad_norm": 3.210332155227661, + "learning_rate": 8.283869487553466e-07, + "loss": 0.7783, + "step": 13685 + }, + { + "epoch": 0.2131286633921552, + "grad_norm": 3.489346981048584, + "learning_rate": 8.283050097507414e-07, + "loss": 0.6617, + "step": 13690 + }, + { + "epoch": 0.21320650439412456, + "grad_norm": 5.191878318786621, + "learning_rate": 8.282230707461365e-07, + "loss": 0.7675, + "step": 13695 + }, + { + "epoch": 0.21328434539609395, + "grad_norm": 3.5417792797088623, + "learning_rate": 8.281411317415316e-07, + "loss": 0.8721, + "step": 13700 + }, + { + "epoch": 0.2133621863980633, + "grad_norm": 3.657003164291382, + "learning_rate": 8.280591927369266e-07, + "loss": 0.8504, + "step": 13705 + }, + { + "epoch": 0.2134400274000327, + "grad_norm": 3.983947515487671, + "learning_rate": 8.279772537323216e-07, + "loss": 0.6321, + "step": 13710 + }, + { + "epoch": 0.21351786840200207, + "grad_norm": 5.221202373504639, + "learning_rate": 8.278953147277167e-07, + "loss": 0.7542, + "step": 13715 + }, + { + "epoch": 0.21359570940397143, + "grad_norm": 3.3143723011016846, + "learning_rate": 8.278133757231117e-07, + "loss": 0.7038, + "step": 13720 + }, + { + "epoch": 0.21367355040594083, + "grad_norm": 3.3426454067230225, + "learning_rate": 8.277314367185067e-07, + "loss": 0.8252, + "step": 13725 + }, + { + "epoch": 0.2137513914079102, + "grad_norm": 4.690375328063965, + "learning_rate": 8.276494977139017e-07, + "loss": 0.9626, + "step": 13730 + }, + { + "epoch": 0.21382923240987958, + "grad_norm": 5.993745803833008, + "learning_rate": 8.275675587092967e-07, + "loss": 0.81, + "step": 13735 + }, + { + "epoch": 0.21390707341184895, + "grad_norm": 5.37796688079834, + "learning_rate": 8.274856197046918e-07, + "loss": 0.8553, + "step": 13740 + }, + { + "epoch": 0.21398491441381834, + "grad_norm": 3.129485607147217, + "learning_rate": 8.274036807000868e-07, + "loss": 0.6703, + "step": 13745 + }, + { + "epoch": 0.2140627554157877, + "grad_norm": 3.359469413757324, + "learning_rate": 8.273217416954819e-07, + "loss": 0.7105, + "step": 13750 + }, + { + "epoch": 0.2141405964177571, + "grad_norm": 5.8597869873046875, + "learning_rate": 8.272398026908769e-07, + "loss": 0.9019, + "step": 13755 + }, + { + "epoch": 0.21421843741972646, + "grad_norm": 2.56128191947937, + "learning_rate": 8.27157863686272e-07, + "loss": 0.7213, + "step": 13760 + }, + { + "epoch": 0.21429627842169585, + "grad_norm": 3.979757070541382, + "learning_rate": 8.27075924681667e-07, + "loss": 0.8644, + "step": 13765 + }, + { + "epoch": 0.21437411942366522, + "grad_norm": 4.351536273956299, + "learning_rate": 8.269939856770619e-07, + "loss": 0.8617, + "step": 13770 + }, + { + "epoch": 0.2144519604256346, + "grad_norm": 4.314666748046875, + "learning_rate": 8.26912046672457e-07, + "loss": 0.7729, + "step": 13775 + }, + { + "epoch": 0.21452980142760397, + "grad_norm": 4.594080448150635, + "learning_rate": 8.26830107667852e-07, + "loss": 0.7507, + "step": 13780 + }, + { + "epoch": 0.21460764242957336, + "grad_norm": 3.6102042198181152, + "learning_rate": 8.26748168663247e-07, + "loss": 0.8858, + "step": 13785 + }, + { + "epoch": 0.21468548343154273, + "grad_norm": 5.565709114074707, + "learning_rate": 8.266662296586421e-07, + "loss": 0.7261, + "step": 13790 + }, + { + "epoch": 0.21476332443351212, + "grad_norm": 4.304652690887451, + "learning_rate": 8.265842906540372e-07, + "loss": 0.7998, + "step": 13795 + }, + { + "epoch": 0.21484116543548148, + "grad_norm": 4.141101360321045, + "learning_rate": 8.265023516494321e-07, + "loss": 0.7796, + "step": 13800 + }, + { + "epoch": 0.21491900643745085, + "grad_norm": 3.0917515754699707, + "learning_rate": 8.264204126448272e-07, + "loss": 0.7534, + "step": 13805 + }, + { + "epoch": 0.21499684743942024, + "grad_norm": 3.7405171394348145, + "learning_rate": 8.263384736402222e-07, + "loss": 0.8425, + "step": 13810 + }, + { + "epoch": 0.2150746884413896, + "grad_norm": 8.8234224319458, + "learning_rate": 8.262565346356171e-07, + "loss": 0.7727, + "step": 13815 + }, + { + "epoch": 0.215152529443359, + "grad_norm": 3.842259645462036, + "learning_rate": 8.261745956310122e-07, + "loss": 0.7739, + "step": 13820 + }, + { + "epoch": 0.21523037044532836, + "grad_norm": 6.93010950088501, + "learning_rate": 8.260926566264073e-07, + "loss": 0.9363, + "step": 13825 + }, + { + "epoch": 0.21530821144729775, + "grad_norm": 4.047563552856445, + "learning_rate": 8.260107176218023e-07, + "loss": 0.8669, + "step": 13830 + }, + { + "epoch": 0.21538605244926712, + "grad_norm": 3.978311777114868, + "learning_rate": 8.259287786171973e-07, + "loss": 0.9023, + "step": 13835 + }, + { + "epoch": 0.2154638934512365, + "grad_norm": 5.298313140869141, + "learning_rate": 8.258468396125924e-07, + "loss": 0.7525, + "step": 13840 + }, + { + "epoch": 0.21554173445320587, + "grad_norm": 3.4469170570373535, + "learning_rate": 8.257649006079874e-07, + "loss": 0.793, + "step": 13845 + }, + { + "epoch": 0.21561957545517527, + "grad_norm": 5.635530948638916, + "learning_rate": 8.256829616033823e-07, + "loss": 0.7775, + "step": 13850 + }, + { + "epoch": 0.21569741645714463, + "grad_norm": 5.665985584259033, + "learning_rate": 8.256010225987774e-07, + "loss": 0.7704, + "step": 13855 + }, + { + "epoch": 0.21577525745911402, + "grad_norm": 6.2332024574279785, + "learning_rate": 8.255190835941724e-07, + "loss": 0.8562, + "step": 13860 + }, + { + "epoch": 0.2158530984610834, + "grad_norm": 3.517777681350708, + "learning_rate": 8.254371445895675e-07, + "loss": 0.7324, + "step": 13865 + }, + { + "epoch": 0.21593093946305278, + "grad_norm": 6.235748767852783, + "learning_rate": 8.253552055849625e-07, + "loss": 0.7729, + "step": 13870 + }, + { + "epoch": 0.21600878046502214, + "grad_norm": 5.898641109466553, + "learning_rate": 8.252732665803576e-07, + "loss": 0.7918, + "step": 13875 + }, + { + "epoch": 0.21608662146699154, + "grad_norm": 3.069035530090332, + "learning_rate": 8.251913275757526e-07, + "loss": 0.6672, + "step": 13880 + }, + { + "epoch": 0.2161644624689609, + "grad_norm": 5.234879493713379, + "learning_rate": 8.251093885711477e-07, + "loss": 0.8629, + "step": 13885 + }, + { + "epoch": 0.2162423034709303, + "grad_norm": 7.040566921234131, + "learning_rate": 8.250274495665426e-07, + "loss": 0.7679, + "step": 13890 + }, + { + "epoch": 0.21632014447289966, + "grad_norm": 2.981306552886963, + "learning_rate": 8.249455105619376e-07, + "loss": 0.8243, + "step": 13895 + }, + { + "epoch": 0.21639798547486902, + "grad_norm": 9.416054725646973, + "learning_rate": 8.248635715573327e-07, + "loss": 0.7234, + "step": 13900 + }, + { + "epoch": 0.2164758264768384, + "grad_norm": 4.333786964416504, + "learning_rate": 8.247816325527277e-07, + "loss": 0.7538, + "step": 13905 + }, + { + "epoch": 0.21655366747880778, + "grad_norm": 5.313504695892334, + "learning_rate": 8.246996935481227e-07, + "loss": 0.9174, + "step": 13910 + }, + { + "epoch": 0.21663150848077717, + "grad_norm": 2.9732582569122314, + "learning_rate": 8.246177545435178e-07, + "loss": 0.7441, + "step": 13915 + }, + { + "epoch": 0.21670934948274653, + "grad_norm": 2.9341235160827637, + "learning_rate": 8.245358155389129e-07, + "loss": 0.8753, + "step": 13920 + }, + { + "epoch": 0.21678719048471592, + "grad_norm": 12.563216209411621, + "learning_rate": 8.244538765343078e-07, + "loss": 0.8747, + "step": 13925 + }, + { + "epoch": 0.2168650314866853, + "grad_norm": 3.163398504257202, + "learning_rate": 8.243719375297028e-07, + "loss": 0.8394, + "step": 13930 + }, + { + "epoch": 0.21694287248865468, + "grad_norm": 2.861363410949707, + "learning_rate": 8.242899985250979e-07, + "loss": 0.7728, + "step": 13935 + }, + { + "epoch": 0.21702071349062405, + "grad_norm": 4.490225791931152, + "learning_rate": 8.242080595204928e-07, + "loss": 0.8157, + "step": 13940 + }, + { + "epoch": 0.21709855449259344, + "grad_norm": 4.911387920379639, + "learning_rate": 8.241261205158879e-07, + "loss": 0.8017, + "step": 13945 + }, + { + "epoch": 0.2171763954945628, + "grad_norm": 4.898251533508301, + "learning_rate": 8.24044181511283e-07, + "loss": 0.7364, + "step": 13950 + }, + { + "epoch": 0.2172542364965322, + "grad_norm": 5.873484134674072, + "learning_rate": 8.23962242506678e-07, + "loss": 0.7311, + "step": 13955 + }, + { + "epoch": 0.21733207749850156, + "grad_norm": 3.597964286804199, + "learning_rate": 8.23880303502073e-07, + "loss": 0.834, + "step": 13960 + }, + { + "epoch": 0.21740991850047095, + "grad_norm": 2.7035915851593018, + "learning_rate": 8.237983644974681e-07, + "loss": 0.6948, + "step": 13965 + }, + { + "epoch": 0.21748775950244031, + "grad_norm": 5.127129554748535, + "learning_rate": 8.23716425492863e-07, + "loss": 0.6872, + "step": 13970 + }, + { + "epoch": 0.2175656005044097, + "grad_norm": 6.334181785583496, + "learning_rate": 8.23634486488258e-07, + "loss": 0.6572, + "step": 13975 + }, + { + "epoch": 0.21764344150637907, + "grad_norm": 4.3845744132995605, + "learning_rate": 8.235525474836531e-07, + "loss": 0.8049, + "step": 13980 + }, + { + "epoch": 0.21772128250834843, + "grad_norm": 3.1326639652252197, + "learning_rate": 8.234706084790481e-07, + "loss": 0.7638, + "step": 13985 + }, + { + "epoch": 0.21779912351031783, + "grad_norm": 2.6377274990081787, + "learning_rate": 8.233886694744432e-07, + "loss": 0.6976, + "step": 13990 + }, + { + "epoch": 0.2178769645122872, + "grad_norm": 3.378631353378296, + "learning_rate": 8.233067304698382e-07, + "loss": 0.7974, + "step": 13995 + }, + { + "epoch": 0.21795480551425658, + "grad_norm": 3.709468364715576, + "learning_rate": 8.232247914652333e-07, + "loss": 0.7535, + "step": 14000 + }, + { + "epoch": 0.21803264651622595, + "grad_norm": 5.473926067352295, + "learning_rate": 8.231428524606283e-07, + "loss": 0.8026, + "step": 14005 + }, + { + "epoch": 0.21811048751819534, + "grad_norm": 3.7275307178497314, + "learning_rate": 8.230609134560234e-07, + "loss": 0.7762, + "step": 14010 + }, + { + "epoch": 0.2181883285201647, + "grad_norm": 8.003002166748047, + "learning_rate": 8.229789744514183e-07, + "loss": 0.8362, + "step": 14015 + }, + { + "epoch": 0.2182661695221341, + "grad_norm": 2.683030605316162, + "learning_rate": 8.228970354468133e-07, + "loss": 0.7537, + "step": 14020 + }, + { + "epoch": 0.21834401052410346, + "grad_norm": 23.4306583404541, + "learning_rate": 8.228150964422084e-07, + "loss": 0.8595, + "step": 14025 + }, + { + "epoch": 0.21842185152607285, + "grad_norm": 4.96809720993042, + "learning_rate": 8.227331574376035e-07, + "loss": 0.7552, + "step": 14030 + }, + { + "epoch": 0.21849969252804222, + "grad_norm": 5.7253098487854, + "learning_rate": 8.226512184329984e-07, + "loss": 0.7839, + "step": 14035 + }, + { + "epoch": 0.2185775335300116, + "grad_norm": 4.622411251068115, + "learning_rate": 8.225692794283935e-07, + "loss": 0.8006, + "step": 14040 + }, + { + "epoch": 0.21865537453198097, + "grad_norm": 4.409444332122803, + "learning_rate": 8.224873404237886e-07, + "loss": 0.8242, + "step": 14045 + }, + { + "epoch": 0.21873321553395036, + "grad_norm": 3.5355610847473145, + "learning_rate": 8.224054014191835e-07, + "loss": 0.8879, + "step": 14050 + }, + { + "epoch": 0.21881105653591973, + "grad_norm": 7.076291084289551, + "learning_rate": 8.223234624145785e-07, + "loss": 0.8007, + "step": 14055 + }, + { + "epoch": 0.21888889753788912, + "grad_norm": 2.4964358806610107, + "learning_rate": 8.222415234099736e-07, + "loss": 0.7088, + "step": 14060 + }, + { + "epoch": 0.21896673853985849, + "grad_norm": 3.8193085193634033, + "learning_rate": 8.221595844053686e-07, + "loss": 0.779, + "step": 14065 + }, + { + "epoch": 0.21904457954182785, + "grad_norm": 4.243593692779541, + "learning_rate": 8.220776454007636e-07, + "loss": 0.7483, + "step": 14070 + }, + { + "epoch": 0.21912242054379724, + "grad_norm": 3.811239719390869, + "learning_rate": 8.219957063961587e-07, + "loss": 0.8161, + "step": 14075 + }, + { + "epoch": 0.2192002615457666, + "grad_norm": 7.058951377868652, + "learning_rate": 8.219137673915537e-07, + "loss": 0.856, + "step": 14080 + }, + { + "epoch": 0.219278102547736, + "grad_norm": 4.006752014160156, + "learning_rate": 8.218318283869487e-07, + "loss": 0.8221, + "step": 14085 + }, + { + "epoch": 0.21935594354970536, + "grad_norm": 4.000454425811768, + "learning_rate": 8.217498893823438e-07, + "loss": 0.7499, + "step": 14090 + }, + { + "epoch": 0.21943378455167475, + "grad_norm": 4.066524982452393, + "learning_rate": 8.216679503777387e-07, + "loss": 0.7449, + "step": 14095 + }, + { + "epoch": 0.21951162555364412, + "grad_norm": 8.29305362701416, + "learning_rate": 8.215860113731338e-07, + "loss": 0.7864, + "step": 14100 + }, + { + "epoch": 0.2195894665556135, + "grad_norm": 5.086592674255371, + "learning_rate": 8.215040723685288e-07, + "loss": 0.904, + "step": 14105 + }, + { + "epoch": 0.21966730755758287, + "grad_norm": 5.60768985748291, + "learning_rate": 8.214221333639238e-07, + "loss": 0.6991, + "step": 14110 + }, + { + "epoch": 0.21974514855955227, + "grad_norm": 6.108041286468506, + "learning_rate": 8.213401943593189e-07, + "loss": 0.8331, + "step": 14115 + }, + { + "epoch": 0.21982298956152163, + "grad_norm": 3.089353561401367, + "learning_rate": 8.21258255354714e-07, + "loss": 0.8687, + "step": 14120 + }, + { + "epoch": 0.21990083056349102, + "grad_norm": 4.127028465270996, + "learning_rate": 8.21176316350109e-07, + "loss": 0.7391, + "step": 14125 + }, + { + "epoch": 0.2199786715654604, + "grad_norm": 7.309013843536377, + "learning_rate": 8.21094377345504e-07, + "loss": 0.8205, + "step": 14130 + }, + { + "epoch": 0.22005651256742978, + "grad_norm": 5.600733757019043, + "learning_rate": 8.21012438340899e-07, + "loss": 0.7669, + "step": 14135 + }, + { + "epoch": 0.22013435356939914, + "grad_norm": 5.287996768951416, + "learning_rate": 8.20930499336294e-07, + "loss": 0.7142, + "step": 14140 + }, + { + "epoch": 0.22021219457136854, + "grad_norm": 3.8828279972076416, + "learning_rate": 8.20848560331689e-07, + "loss": 0.8409, + "step": 14145 + }, + { + "epoch": 0.2202900355733379, + "grad_norm": 4.1061224937438965, + "learning_rate": 8.207666213270841e-07, + "loss": 0.6962, + "step": 14150 + }, + { + "epoch": 0.22036787657530726, + "grad_norm": 2.734174966812134, + "learning_rate": 8.206846823224792e-07, + "loss": 0.7433, + "step": 14155 + }, + { + "epoch": 0.22044571757727666, + "grad_norm": 2.9642739295959473, + "learning_rate": 8.206027433178741e-07, + "loss": 0.8107, + "step": 14160 + }, + { + "epoch": 0.22052355857924602, + "grad_norm": 5.745486736297607, + "learning_rate": 8.205208043132692e-07, + "loss": 0.8115, + "step": 14165 + }, + { + "epoch": 0.2206013995812154, + "grad_norm": 4.336205005645752, + "learning_rate": 8.204388653086643e-07, + "loss": 0.8461, + "step": 14170 + }, + { + "epoch": 0.22067924058318478, + "grad_norm": 2.4944536685943604, + "learning_rate": 8.203569263040591e-07, + "loss": 0.6971, + "step": 14175 + }, + { + "epoch": 0.22075708158515417, + "grad_norm": 4.578847408294678, + "learning_rate": 8.202749872994542e-07, + "loss": 0.6928, + "step": 14180 + }, + { + "epoch": 0.22083492258712353, + "grad_norm": 2.760460138320923, + "learning_rate": 8.201930482948493e-07, + "loss": 0.7466, + "step": 14185 + }, + { + "epoch": 0.22091276358909293, + "grad_norm": 3.805088520050049, + "learning_rate": 8.201111092902443e-07, + "loss": 0.8223, + "step": 14190 + }, + { + "epoch": 0.2209906045910623, + "grad_norm": 3.4560000896453857, + "learning_rate": 8.200291702856393e-07, + "loss": 0.8931, + "step": 14195 + }, + { + "epoch": 0.22106844559303168, + "grad_norm": 7.692502498626709, + "learning_rate": 8.199472312810344e-07, + "loss": 0.677, + "step": 14200 + }, + { + "epoch": 0.22114628659500105, + "grad_norm": 2.7867279052734375, + "learning_rate": 8.198652922764294e-07, + "loss": 0.7933, + "step": 14205 + }, + { + "epoch": 0.22122412759697044, + "grad_norm": 3.4029629230499268, + "learning_rate": 8.197833532718245e-07, + "loss": 0.7878, + "step": 14210 + }, + { + "epoch": 0.2213019685989398, + "grad_norm": 6.6466546058654785, + "learning_rate": 8.197014142672194e-07, + "loss": 0.6714, + "step": 14215 + }, + { + "epoch": 0.2213798096009092, + "grad_norm": 3.9622604846954346, + "learning_rate": 8.196194752626144e-07, + "loss": 0.6719, + "step": 14220 + }, + { + "epoch": 0.22145765060287856, + "grad_norm": 3.521365165710449, + "learning_rate": 8.195375362580095e-07, + "loss": 0.8028, + "step": 14225 + }, + { + "epoch": 0.22153549160484795, + "grad_norm": 3.510684013366699, + "learning_rate": 8.194555972534045e-07, + "loss": 0.7864, + "step": 14230 + }, + { + "epoch": 0.22161333260681731, + "grad_norm": 5.596205711364746, + "learning_rate": 8.193736582487995e-07, + "loss": 0.8561, + "step": 14235 + }, + { + "epoch": 0.22169117360878668, + "grad_norm": 3.478048086166382, + "learning_rate": 8.192917192441946e-07, + "loss": 0.7405, + "step": 14240 + }, + { + "epoch": 0.22176901461075607, + "grad_norm": 2.7496485710144043, + "learning_rate": 8.192097802395897e-07, + "loss": 0.7695, + "step": 14245 + }, + { + "epoch": 0.22184685561272544, + "grad_norm": 5.8107781410217285, + "learning_rate": 8.191278412349847e-07, + "loss": 0.8052, + "step": 14250 + }, + { + "epoch": 0.22192469661469483, + "grad_norm": 3.761655330657959, + "learning_rate": 8.190459022303796e-07, + "loss": 0.7027, + "step": 14255 + }, + { + "epoch": 0.2220025376166642, + "grad_norm": 3.4367716312408447, + "learning_rate": 8.189639632257747e-07, + "loss": 0.801, + "step": 14260 + }, + { + "epoch": 0.22208037861863358, + "grad_norm": 3.767960548400879, + "learning_rate": 8.188820242211697e-07, + "loss": 0.7308, + "step": 14265 + }, + { + "epoch": 0.22215821962060295, + "grad_norm": 4.997518062591553, + "learning_rate": 8.188000852165647e-07, + "loss": 0.8017, + "step": 14270 + }, + { + "epoch": 0.22223606062257234, + "grad_norm": 2.965698480606079, + "learning_rate": 8.187181462119598e-07, + "loss": 0.8322, + "step": 14275 + }, + { + "epoch": 0.2223139016245417, + "grad_norm": 4.3722662925720215, + "learning_rate": 8.186362072073549e-07, + "loss": 0.8415, + "step": 14280 + }, + { + "epoch": 0.2223917426265111, + "grad_norm": 3.201127767562866, + "learning_rate": 8.185542682027498e-07, + "loss": 0.8765, + "step": 14285 + }, + { + "epoch": 0.22246958362848046, + "grad_norm": 7.47006368637085, + "learning_rate": 8.184723291981449e-07, + "loss": 0.7547, + "step": 14290 + }, + { + "epoch": 0.22254742463044985, + "grad_norm": 4.898451805114746, + "learning_rate": 8.183903901935399e-07, + "loss": 0.7302, + "step": 14295 + }, + { + "epoch": 0.22262526563241922, + "grad_norm": 4.226808547973633, + "learning_rate": 8.183084511889348e-07, + "loss": 0.7815, + "step": 14300 + }, + { + "epoch": 0.2227031066343886, + "grad_norm": 3.9173691272735596, + "learning_rate": 8.182265121843299e-07, + "loss": 0.8184, + "step": 14305 + }, + { + "epoch": 0.22278094763635797, + "grad_norm": 4.693234443664551, + "learning_rate": 8.18144573179725e-07, + "loss": 0.7193, + "step": 14310 + }, + { + "epoch": 0.22285878863832737, + "grad_norm": 3.06361985206604, + "learning_rate": 8.1806263417512e-07, + "loss": 0.727, + "step": 14315 + }, + { + "epoch": 0.22293662964029673, + "grad_norm": 3.4421865940093994, + "learning_rate": 8.17980695170515e-07, + "loss": 0.6528, + "step": 14320 + }, + { + "epoch": 0.2230144706422661, + "grad_norm": 3.3549883365631104, + "learning_rate": 8.178987561659101e-07, + "loss": 0.8015, + "step": 14325 + }, + { + "epoch": 0.22309231164423549, + "grad_norm": 5.654025077819824, + "learning_rate": 8.178168171613051e-07, + "loss": 0.7657, + "step": 14330 + }, + { + "epoch": 0.22317015264620485, + "grad_norm": 5.917110919952393, + "learning_rate": 8.177348781567002e-07, + "loss": 0.7654, + "step": 14335 + }, + { + "epoch": 0.22324799364817424, + "grad_norm": 6.301812171936035, + "learning_rate": 8.176529391520951e-07, + "loss": 0.7356, + "step": 14340 + }, + { + "epoch": 0.2233258346501436, + "grad_norm": 7.683788776397705, + "learning_rate": 8.175710001474901e-07, + "loss": 0.7445, + "step": 14345 + }, + { + "epoch": 0.223403675652113, + "grad_norm": 3.3383662700653076, + "learning_rate": 8.174890611428852e-07, + "loss": 0.8202, + "step": 14350 + }, + { + "epoch": 0.22348151665408236, + "grad_norm": 4.314518928527832, + "learning_rate": 8.174071221382802e-07, + "loss": 0.8356, + "step": 14355 + }, + { + "epoch": 0.22355935765605175, + "grad_norm": 2.8232765197753906, + "learning_rate": 8.173251831336752e-07, + "loss": 0.7911, + "step": 14360 + }, + { + "epoch": 0.22363719865802112, + "grad_norm": 6.986234188079834, + "learning_rate": 8.172432441290703e-07, + "loss": 0.7589, + "step": 14365 + }, + { + "epoch": 0.2237150396599905, + "grad_norm": 3.733344316482544, + "learning_rate": 8.171613051244654e-07, + "loss": 0.7412, + "step": 14370 + }, + { + "epoch": 0.22379288066195988, + "grad_norm": 3.7331125736236572, + "learning_rate": 8.170793661198604e-07, + "loss": 0.8827, + "step": 14375 + }, + { + "epoch": 0.22387072166392927, + "grad_norm": 2.895965099334717, + "learning_rate": 8.169974271152553e-07, + "loss": 0.8259, + "step": 14380 + }, + { + "epoch": 0.22394856266589863, + "grad_norm": 3.6977014541625977, + "learning_rate": 8.169154881106504e-07, + "loss": 0.8569, + "step": 14385 + }, + { + "epoch": 0.22402640366786802, + "grad_norm": 6.653420925140381, + "learning_rate": 8.168335491060455e-07, + "loss": 0.841, + "step": 14390 + }, + { + "epoch": 0.2241042446698374, + "grad_norm": 3.430891275405884, + "learning_rate": 8.167516101014404e-07, + "loss": 0.7575, + "step": 14395 + }, + { + "epoch": 0.22418208567180678, + "grad_norm": 3.0250182151794434, + "learning_rate": 8.166696710968355e-07, + "loss": 0.8719, + "step": 14400 + }, + { + "epoch": 0.22425992667377614, + "grad_norm": 2.6156387329101562, + "learning_rate": 8.165877320922306e-07, + "loss": 0.6721, + "step": 14405 + }, + { + "epoch": 0.2243377676757455, + "grad_norm": 5.948540210723877, + "learning_rate": 8.165057930876255e-07, + "loss": 0.8922, + "step": 14410 + }, + { + "epoch": 0.2244156086777149, + "grad_norm": 3.9800314903259277, + "learning_rate": 8.164238540830206e-07, + "loss": 0.8119, + "step": 14415 + }, + { + "epoch": 0.22449344967968426, + "grad_norm": 4.236239433288574, + "learning_rate": 8.163419150784156e-07, + "loss": 0.8216, + "step": 14420 + }, + { + "epoch": 0.22457129068165366, + "grad_norm": 3.8411571979522705, + "learning_rate": 8.162599760738106e-07, + "loss": 0.7578, + "step": 14425 + }, + { + "epoch": 0.22464913168362302, + "grad_norm": 4.027297496795654, + "learning_rate": 8.161780370692056e-07, + "loss": 0.7417, + "step": 14430 + }, + { + "epoch": 0.2247269726855924, + "grad_norm": 2.982266902923584, + "learning_rate": 8.160960980646007e-07, + "loss": 0.6673, + "step": 14435 + }, + { + "epoch": 0.22480481368756178, + "grad_norm": 7.821840763092041, + "learning_rate": 8.160141590599957e-07, + "loss": 0.7697, + "step": 14440 + }, + { + "epoch": 0.22488265468953117, + "grad_norm": 4.410890102386475, + "learning_rate": 8.159322200553907e-07, + "loss": 0.7914, + "step": 14445 + }, + { + "epoch": 0.22496049569150053, + "grad_norm": 5.73233699798584, + "learning_rate": 8.158502810507858e-07, + "loss": 0.6921, + "step": 14450 + }, + { + "epoch": 0.22503833669346993, + "grad_norm": 3.975536346435547, + "learning_rate": 8.157683420461808e-07, + "loss": 0.7207, + "step": 14455 + }, + { + "epoch": 0.2251161776954393, + "grad_norm": 3.909505605697632, + "learning_rate": 8.156864030415758e-07, + "loss": 0.7957, + "step": 14460 + }, + { + "epoch": 0.22519401869740868, + "grad_norm": 3.489048480987549, + "learning_rate": 8.156044640369708e-07, + "loss": 0.8026, + "step": 14465 + }, + { + "epoch": 0.22527185969937805, + "grad_norm": 3.770564317703247, + "learning_rate": 8.155225250323658e-07, + "loss": 0.787, + "step": 14470 + }, + { + "epoch": 0.22534970070134744, + "grad_norm": 3.319371223449707, + "learning_rate": 8.154405860277609e-07, + "loss": 0.7437, + "step": 14475 + }, + { + "epoch": 0.2254275417033168, + "grad_norm": 3.6019351482391357, + "learning_rate": 8.15358647023156e-07, + "loss": 0.6795, + "step": 14480 + }, + { + "epoch": 0.2255053827052862, + "grad_norm": 3.6138195991516113, + "learning_rate": 8.152767080185509e-07, + "loss": 0.9009, + "step": 14485 + }, + { + "epoch": 0.22558322370725556, + "grad_norm": 2.5489044189453125, + "learning_rate": 8.15194769013946e-07, + "loss": 0.8335, + "step": 14490 + }, + { + "epoch": 0.22566106470922492, + "grad_norm": 3.9875316619873047, + "learning_rate": 8.151128300093411e-07, + "loss": 0.8167, + "step": 14495 + }, + { + "epoch": 0.22573890571119432, + "grad_norm": 3.159693956375122, + "learning_rate": 8.150308910047359e-07, + "loss": 0.7998, + "step": 14500 + }, + { + "epoch": 0.22581674671316368, + "grad_norm": 6.574869632720947, + "learning_rate": 8.14948952000131e-07, + "loss": 0.9038, + "step": 14505 + }, + { + "epoch": 0.22589458771513307, + "grad_norm": 5.537595748901367, + "learning_rate": 8.148670129955261e-07, + "loss": 0.7544, + "step": 14510 + }, + { + "epoch": 0.22597242871710244, + "grad_norm": 7.5179057121276855, + "learning_rate": 8.147850739909212e-07, + "loss": 0.7809, + "step": 14515 + }, + { + "epoch": 0.22605026971907183, + "grad_norm": 7.547605037689209, + "learning_rate": 8.147031349863161e-07, + "loss": 0.7712, + "step": 14520 + }, + { + "epoch": 0.2261281107210412, + "grad_norm": 3.386322498321533, + "learning_rate": 8.146211959817112e-07, + "loss": 0.7648, + "step": 14525 + }, + { + "epoch": 0.22620595172301058, + "grad_norm": 3.413363218307495, + "learning_rate": 8.145392569771063e-07, + "loss": 0.7465, + "step": 14530 + }, + { + "epoch": 0.22628379272497995, + "grad_norm": 4.657474040985107, + "learning_rate": 8.144573179725012e-07, + "loss": 0.755, + "step": 14535 + }, + { + "epoch": 0.22636163372694934, + "grad_norm": 3.3240861892700195, + "learning_rate": 8.143753789678962e-07, + "loss": 0.7667, + "step": 14540 + }, + { + "epoch": 0.2264394747289187, + "grad_norm": 6.471132755279541, + "learning_rate": 8.142934399632913e-07, + "loss": 0.7321, + "step": 14545 + }, + { + "epoch": 0.2265173157308881, + "grad_norm": 6.002511024475098, + "learning_rate": 8.142115009586863e-07, + "loss": 0.8867, + "step": 14550 + }, + { + "epoch": 0.22659515673285746, + "grad_norm": 4.303719997406006, + "learning_rate": 8.141295619540813e-07, + "loss": 0.6478, + "step": 14555 + }, + { + "epoch": 0.22667299773482685, + "grad_norm": 4.221592426300049, + "learning_rate": 8.140476229494764e-07, + "loss": 0.7632, + "step": 14560 + }, + { + "epoch": 0.22675083873679622, + "grad_norm": 6.414244174957275, + "learning_rate": 8.139656839448714e-07, + "loss": 0.6771, + "step": 14565 + }, + { + "epoch": 0.2268286797387656, + "grad_norm": 3.563746690750122, + "learning_rate": 8.138837449402665e-07, + "loss": 0.826, + "step": 14570 + }, + { + "epoch": 0.22690652074073497, + "grad_norm": 3.3471519947052, + "learning_rate": 8.138018059356615e-07, + "loss": 0.8127, + "step": 14575 + }, + { + "epoch": 0.22698436174270437, + "grad_norm": 3.464322328567505, + "learning_rate": 8.137198669310564e-07, + "loss": 0.851, + "step": 14580 + }, + { + "epoch": 0.22706220274467373, + "grad_norm": 2.899655342102051, + "learning_rate": 8.136379279264515e-07, + "loss": 0.8432, + "step": 14585 + }, + { + "epoch": 0.2271400437466431, + "grad_norm": 3.654853343963623, + "learning_rate": 8.135559889218465e-07, + "loss": 0.8792, + "step": 14590 + }, + { + "epoch": 0.2272178847486125, + "grad_norm": 3.7926645278930664, + "learning_rate": 8.134740499172415e-07, + "loss": 0.8283, + "step": 14595 + }, + { + "epoch": 0.22729572575058185, + "grad_norm": 2.3791191577911377, + "learning_rate": 8.133921109126366e-07, + "loss": 0.5844, + "step": 14600 + }, + { + "epoch": 0.22737356675255124, + "grad_norm": 4.844578266143799, + "learning_rate": 8.133101719080317e-07, + "loss": 0.746, + "step": 14605 + }, + { + "epoch": 0.2274514077545206, + "grad_norm": 5.1527934074401855, + "learning_rate": 8.132282329034266e-07, + "loss": 0.7452, + "step": 14610 + }, + { + "epoch": 0.22752924875649, + "grad_norm": 2.648463010787964, + "learning_rate": 8.131462938988217e-07, + "loss": 0.6665, + "step": 14615 + }, + { + "epoch": 0.22760708975845936, + "grad_norm": 4.308215618133545, + "learning_rate": 8.130643548942167e-07, + "loss": 0.814, + "step": 14620 + }, + { + "epoch": 0.22768493076042876, + "grad_norm": 4.505756378173828, + "learning_rate": 8.129824158896116e-07, + "loss": 0.7675, + "step": 14625 + }, + { + "epoch": 0.22776277176239812, + "grad_norm": 4.09812593460083, + "learning_rate": 8.129004768850067e-07, + "loss": 0.8119, + "step": 14630 + }, + { + "epoch": 0.2278406127643675, + "grad_norm": 5.634159088134766, + "learning_rate": 8.128185378804018e-07, + "loss": 0.7209, + "step": 14635 + }, + { + "epoch": 0.22791845376633688, + "grad_norm": 5.862931728363037, + "learning_rate": 8.127365988757969e-07, + "loss": 0.8805, + "step": 14640 + }, + { + "epoch": 0.22799629476830627, + "grad_norm": 3.8466310501098633, + "learning_rate": 8.126546598711918e-07, + "loss": 0.7862, + "step": 14645 + }, + { + "epoch": 0.22807413577027563, + "grad_norm": 2.5441253185272217, + "learning_rate": 8.125727208665869e-07, + "loss": 0.7283, + "step": 14650 + }, + { + "epoch": 0.22815197677224502, + "grad_norm": 2.762737274169922, + "learning_rate": 8.12490781861982e-07, + "loss": 0.7527, + "step": 14655 + }, + { + "epoch": 0.2282298177742144, + "grad_norm": 3.7195687294006348, + "learning_rate": 8.12408842857377e-07, + "loss": 0.8323, + "step": 14660 + }, + { + "epoch": 0.22830765877618378, + "grad_norm": 4.145010471343994, + "learning_rate": 8.123269038527719e-07, + "loss": 0.936, + "step": 14665 + }, + { + "epoch": 0.22838549977815314, + "grad_norm": 3.6851210594177246, + "learning_rate": 8.12244964848167e-07, + "loss": 0.8665, + "step": 14670 + }, + { + "epoch": 0.2284633407801225, + "grad_norm": 3.9991884231567383, + "learning_rate": 8.12163025843562e-07, + "loss": 0.7744, + "step": 14675 + }, + { + "epoch": 0.2285411817820919, + "grad_norm": 4.037894248962402, + "learning_rate": 8.12081086838957e-07, + "loss": 0.8248, + "step": 14680 + }, + { + "epoch": 0.22861902278406127, + "grad_norm": 3.2940948009490967, + "learning_rate": 8.119991478343521e-07, + "loss": 0.7561, + "step": 14685 + }, + { + "epoch": 0.22869686378603066, + "grad_norm": 5.218342304229736, + "learning_rate": 8.119172088297471e-07, + "loss": 0.838, + "step": 14690 + }, + { + "epoch": 0.22877470478800002, + "grad_norm": 4.6300578117370605, + "learning_rate": 8.118352698251422e-07, + "loss": 0.7774, + "step": 14695 + }, + { + "epoch": 0.2288525457899694, + "grad_norm": 6.564635753631592, + "learning_rate": 8.117533308205372e-07, + "loss": 0.7365, + "step": 14700 + }, + { + "epoch": 0.22893038679193878, + "grad_norm": 7.371841907501221, + "learning_rate": 8.116713918159321e-07, + "loss": 0.7821, + "step": 14705 + }, + { + "epoch": 0.22900822779390817, + "grad_norm": 4.6451096534729, + "learning_rate": 8.115894528113272e-07, + "loss": 0.7552, + "step": 14710 + }, + { + "epoch": 0.22908606879587753, + "grad_norm": 3.867570161819458, + "learning_rate": 8.115075138067222e-07, + "loss": 0.6779, + "step": 14715 + }, + { + "epoch": 0.22916390979784693, + "grad_norm": 7.25125789642334, + "learning_rate": 8.114255748021172e-07, + "loss": 0.7304, + "step": 14720 + }, + { + "epoch": 0.2292417507998163, + "grad_norm": 3.8599355220794678, + "learning_rate": 8.113436357975123e-07, + "loss": 0.7558, + "step": 14725 + }, + { + "epoch": 0.22931959180178568, + "grad_norm": 6.007401466369629, + "learning_rate": 8.112616967929074e-07, + "loss": 0.7334, + "step": 14730 + }, + { + "epoch": 0.22939743280375505, + "grad_norm": 6.439403057098389, + "learning_rate": 8.111797577883023e-07, + "loss": 0.8409, + "step": 14735 + }, + { + "epoch": 0.22947527380572444, + "grad_norm": 5.950405597686768, + "learning_rate": 8.110978187836974e-07, + "loss": 0.7371, + "step": 14740 + }, + { + "epoch": 0.2295531148076938, + "grad_norm": 4.082195281982422, + "learning_rate": 8.110158797790924e-07, + "loss": 0.877, + "step": 14745 + }, + { + "epoch": 0.2296309558096632, + "grad_norm": 6.829986095428467, + "learning_rate": 8.109339407744873e-07, + "loss": 0.6617, + "step": 14750 + }, + { + "epoch": 0.22970879681163256, + "grad_norm": 4.060400485992432, + "learning_rate": 8.108520017698824e-07, + "loss": 0.7444, + "step": 14755 + }, + { + "epoch": 0.22978663781360192, + "grad_norm": 3.6605048179626465, + "learning_rate": 8.107700627652775e-07, + "loss": 0.8301, + "step": 14760 + }, + { + "epoch": 0.22986447881557132, + "grad_norm": 5.560822010040283, + "learning_rate": 8.106881237606726e-07, + "loss": 0.7338, + "step": 14765 + }, + { + "epoch": 0.22994231981754068, + "grad_norm": 3.1847989559173584, + "learning_rate": 8.106061847560675e-07, + "loss": 0.6357, + "step": 14770 + }, + { + "epoch": 0.23002016081951007, + "grad_norm": 4.955717086791992, + "learning_rate": 8.105242457514626e-07, + "loss": 0.8697, + "step": 14775 + }, + { + "epoch": 0.23009800182147944, + "grad_norm": 7.307408332824707, + "learning_rate": 8.104423067468577e-07, + "loss": 0.7504, + "step": 14780 + }, + { + "epoch": 0.23017584282344883, + "grad_norm": 12.575167655944824, + "learning_rate": 8.103603677422526e-07, + "loss": 0.7651, + "step": 14785 + }, + { + "epoch": 0.2302536838254182, + "grad_norm": 3.67960524559021, + "learning_rate": 8.102784287376476e-07, + "loss": 0.7182, + "step": 14790 + }, + { + "epoch": 0.23033152482738758, + "grad_norm": 6.872060298919678, + "learning_rate": 8.101964897330427e-07, + "loss": 0.8097, + "step": 14795 + }, + { + "epoch": 0.23040936582935695, + "grad_norm": 4.601255416870117, + "learning_rate": 8.101145507284377e-07, + "loss": 0.8425, + "step": 14800 + }, + { + "epoch": 0.23048720683132634, + "grad_norm": 4.442498683929443, + "learning_rate": 8.100326117238327e-07, + "loss": 0.8298, + "step": 14805 + }, + { + "epoch": 0.2305650478332957, + "grad_norm": 5.014193058013916, + "learning_rate": 8.099506727192278e-07, + "loss": 0.7692, + "step": 14810 + }, + { + "epoch": 0.2306428888352651, + "grad_norm": 5.516265392303467, + "learning_rate": 8.098687337146228e-07, + "loss": 0.7929, + "step": 14815 + }, + { + "epoch": 0.23072072983723446, + "grad_norm": 2.710554361343384, + "learning_rate": 8.097867947100179e-07, + "loss": 0.6834, + "step": 14820 + }, + { + "epoch": 0.23079857083920385, + "grad_norm": 3.622087240219116, + "learning_rate": 8.097048557054128e-07, + "loss": 0.9235, + "step": 14825 + }, + { + "epoch": 0.23087641184117322, + "grad_norm": 3.3825948238372803, + "learning_rate": 8.096229167008078e-07, + "loss": 0.8111, + "step": 14830 + }, + { + "epoch": 0.2309542528431426, + "grad_norm": 3.4113945960998535, + "learning_rate": 8.095409776962029e-07, + "loss": 0.8036, + "step": 14835 + }, + { + "epoch": 0.23103209384511197, + "grad_norm": 5.018237590789795, + "learning_rate": 8.09459038691598e-07, + "loss": 0.7452, + "step": 14840 + }, + { + "epoch": 0.23110993484708134, + "grad_norm": 2.8325562477111816, + "learning_rate": 8.093770996869929e-07, + "loss": 0.7413, + "step": 14845 + }, + { + "epoch": 0.23118777584905073, + "grad_norm": 7.24515962600708, + "learning_rate": 8.09295160682388e-07, + "loss": 0.7887, + "step": 14850 + }, + { + "epoch": 0.2312656168510201, + "grad_norm": 2.9999563694000244, + "learning_rate": 8.092132216777831e-07, + "loss": 0.8002, + "step": 14855 + }, + { + "epoch": 0.2313434578529895, + "grad_norm": 2.6894278526306152, + "learning_rate": 8.09131282673178e-07, + "loss": 0.9197, + "step": 14860 + }, + { + "epoch": 0.23142129885495885, + "grad_norm": 4.730764865875244, + "learning_rate": 8.09049343668573e-07, + "loss": 0.7456, + "step": 14865 + }, + { + "epoch": 0.23149913985692824, + "grad_norm": 2.9281725883483887, + "learning_rate": 8.089674046639681e-07, + "loss": 0.782, + "step": 14870 + }, + { + "epoch": 0.2315769808588976, + "grad_norm": 3.086618185043335, + "learning_rate": 8.088854656593631e-07, + "loss": 0.7647, + "step": 14875 + }, + { + "epoch": 0.231654821860867, + "grad_norm": 3.007323980331421, + "learning_rate": 8.088035266547581e-07, + "loss": 0.7483, + "step": 14880 + }, + { + "epoch": 0.23173266286283636, + "grad_norm": 3.2050905227661133, + "learning_rate": 8.087215876501532e-07, + "loss": 0.7355, + "step": 14885 + }, + { + "epoch": 0.23181050386480576, + "grad_norm": 4.09403657913208, + "learning_rate": 8.086396486455483e-07, + "loss": 0.7159, + "step": 14890 + }, + { + "epoch": 0.23188834486677512, + "grad_norm": 3.1509900093078613, + "learning_rate": 8.085577096409432e-07, + "loss": 0.8726, + "step": 14895 + }, + { + "epoch": 0.2319661858687445, + "grad_norm": 3.883364200592041, + "learning_rate": 8.084757706363383e-07, + "loss": 0.8345, + "step": 14900 + }, + { + "epoch": 0.23204402687071388, + "grad_norm": 2.933501720428467, + "learning_rate": 8.083938316317333e-07, + "loss": 0.8286, + "step": 14905 + }, + { + "epoch": 0.23212186787268327, + "grad_norm": 3.5567376613616943, + "learning_rate": 8.083118926271283e-07, + "loss": 0.863, + "step": 14910 + }, + { + "epoch": 0.23219970887465263, + "grad_norm": 4.056962490081787, + "learning_rate": 8.082299536225233e-07, + "loss": 0.853, + "step": 14915 + }, + { + "epoch": 0.23227754987662202, + "grad_norm": 3.4202661514282227, + "learning_rate": 8.081480146179184e-07, + "loss": 0.7843, + "step": 14920 + }, + { + "epoch": 0.2323553908785914, + "grad_norm": 3.929119348526001, + "learning_rate": 8.080660756133134e-07, + "loss": 0.7301, + "step": 14925 + }, + { + "epoch": 0.23243323188056075, + "grad_norm": 5.521277904510498, + "learning_rate": 8.079841366087085e-07, + "loss": 0.7049, + "step": 14930 + }, + { + "epoch": 0.23251107288253015, + "grad_norm": 3.254838466644287, + "learning_rate": 8.079021976041035e-07, + "loss": 0.7977, + "step": 14935 + }, + { + "epoch": 0.2325889138844995, + "grad_norm": 4.433169364929199, + "learning_rate": 8.078202585994985e-07, + "loss": 0.8519, + "step": 14940 + }, + { + "epoch": 0.2326667548864689, + "grad_norm": 3.3750112056732178, + "learning_rate": 8.077383195948935e-07, + "loss": 0.804, + "step": 14945 + }, + { + "epoch": 0.23274459588843827, + "grad_norm": 3.0588154792785645, + "learning_rate": 8.076563805902885e-07, + "loss": 0.7398, + "step": 14950 + }, + { + "epoch": 0.23282243689040766, + "grad_norm": 2.8940608501434326, + "learning_rate": 8.075744415856835e-07, + "loss": 0.7417, + "step": 14955 + }, + { + "epoch": 0.23290027789237702, + "grad_norm": 6.039758205413818, + "learning_rate": 8.074925025810786e-07, + "loss": 0.8312, + "step": 14960 + }, + { + "epoch": 0.23297811889434641, + "grad_norm": 3.923211097717285, + "learning_rate": 8.074105635764737e-07, + "loss": 0.7279, + "step": 14965 + }, + { + "epoch": 0.23305595989631578, + "grad_norm": 2.865115165710449, + "learning_rate": 8.073286245718686e-07, + "loss": 0.8598, + "step": 14970 + }, + { + "epoch": 0.23313380089828517, + "grad_norm": 3.202042818069458, + "learning_rate": 8.072466855672637e-07, + "loss": 0.8332, + "step": 14975 + }, + { + "epoch": 0.23321164190025453, + "grad_norm": 4.588990211486816, + "learning_rate": 8.071647465626588e-07, + "loss": 0.6646, + "step": 14980 + }, + { + "epoch": 0.23328948290222393, + "grad_norm": 3.9389398097991943, + "learning_rate": 8.070828075580538e-07, + "loss": 0.8169, + "step": 14985 + }, + { + "epoch": 0.2333673239041933, + "grad_norm": 5.094730854034424, + "learning_rate": 8.070008685534487e-07, + "loss": 0.7144, + "step": 14990 + }, + { + "epoch": 0.23344516490616268, + "grad_norm": 4.285375595092773, + "learning_rate": 8.069189295488438e-07, + "loss": 0.7464, + "step": 14995 + }, + { + "epoch": 0.23352300590813205, + "grad_norm": 3.9989163875579834, + "learning_rate": 8.068369905442388e-07, + "loss": 0.7893, + "step": 15000 + }, + { + "epoch": 0.23360084691010144, + "grad_norm": 3.326927661895752, + "learning_rate": 8.067550515396338e-07, + "loss": 0.7341, + "step": 15005 + }, + { + "epoch": 0.2336786879120708, + "grad_norm": 3.371975898742676, + "learning_rate": 8.066731125350289e-07, + "loss": 0.7949, + "step": 15010 + }, + { + "epoch": 0.23375652891404017, + "grad_norm": 8.783902168273926, + "learning_rate": 8.06591173530424e-07, + "loss": 0.7686, + "step": 15015 + }, + { + "epoch": 0.23383436991600956, + "grad_norm": 5.643832206726074, + "learning_rate": 8.06509234525819e-07, + "loss": 0.6986, + "step": 15020 + }, + { + "epoch": 0.23391221091797892, + "grad_norm": 2.909189462661743, + "learning_rate": 8.06427295521214e-07, + "loss": 0.832, + "step": 15025 + }, + { + "epoch": 0.23399005191994832, + "grad_norm": 3.0144805908203125, + "learning_rate": 8.06345356516609e-07, + "loss": 0.7332, + "step": 15030 + }, + { + "epoch": 0.23406789292191768, + "grad_norm": 4.5122246742248535, + "learning_rate": 8.06263417512004e-07, + "loss": 0.7674, + "step": 15035 + }, + { + "epoch": 0.23414573392388707, + "grad_norm": 3.568117141723633, + "learning_rate": 8.06181478507399e-07, + "loss": 0.7287, + "step": 15040 + }, + { + "epoch": 0.23422357492585644, + "grad_norm": 4.4060845375061035, + "learning_rate": 8.060995395027941e-07, + "loss": 0.8157, + "step": 15045 + }, + { + "epoch": 0.23430141592782583, + "grad_norm": 4.511560440063477, + "learning_rate": 8.060176004981891e-07, + "loss": 0.7997, + "step": 15050 + }, + { + "epoch": 0.2343792569297952, + "grad_norm": 4.127213001251221, + "learning_rate": 8.059356614935842e-07, + "loss": 0.6869, + "step": 15055 + }, + { + "epoch": 0.23445709793176459, + "grad_norm": 3.2110824584960938, + "learning_rate": 8.058537224889792e-07, + "loss": 0.8188, + "step": 15060 + }, + { + "epoch": 0.23453493893373395, + "grad_norm": 6.899344444274902, + "learning_rate": 8.057717834843742e-07, + "loss": 0.7663, + "step": 15065 + }, + { + "epoch": 0.23461277993570334, + "grad_norm": 6.0621538162231445, + "learning_rate": 8.056898444797692e-07, + "loss": 0.8239, + "step": 15070 + }, + { + "epoch": 0.2346906209376727, + "grad_norm": 5.787536144256592, + "learning_rate": 8.056079054751643e-07, + "loss": 0.78, + "step": 15075 + }, + { + "epoch": 0.2347684619396421, + "grad_norm": 3.954349994659424, + "learning_rate": 8.055259664705592e-07, + "loss": 0.8584, + "step": 15080 + }, + { + "epoch": 0.23484630294161146, + "grad_norm": 4.174968719482422, + "learning_rate": 8.054440274659543e-07, + "loss": 0.7563, + "step": 15085 + }, + { + "epoch": 0.23492414394358085, + "grad_norm": 4.891374111175537, + "learning_rate": 8.053620884613494e-07, + "loss": 0.859, + "step": 15090 + }, + { + "epoch": 0.23500198494555022, + "grad_norm": 3.874782085418701, + "learning_rate": 8.052801494567443e-07, + "loss": 0.825, + "step": 15095 + }, + { + "epoch": 0.23507982594751958, + "grad_norm": 6.848973274230957, + "learning_rate": 8.051982104521394e-07, + "loss": 0.7221, + "step": 15100 + }, + { + "epoch": 0.23515766694948897, + "grad_norm": 8.270047187805176, + "learning_rate": 8.051162714475345e-07, + "loss": 0.7624, + "step": 15105 + }, + { + "epoch": 0.23523550795145834, + "grad_norm": 4.251575946807861, + "learning_rate": 8.050343324429294e-07, + "loss": 0.7126, + "step": 15110 + }, + { + "epoch": 0.23531334895342773, + "grad_norm": 3.7854721546173096, + "learning_rate": 8.049523934383244e-07, + "loss": 0.8051, + "step": 15115 + }, + { + "epoch": 0.2353911899553971, + "grad_norm": 3.765505790710449, + "learning_rate": 8.048704544337195e-07, + "loss": 0.7562, + "step": 15120 + }, + { + "epoch": 0.2354690309573665, + "grad_norm": 7.419190406799316, + "learning_rate": 8.047885154291146e-07, + "loss": 0.7952, + "step": 15125 + }, + { + "epoch": 0.23554687195933585, + "grad_norm": 3.807638168334961, + "learning_rate": 8.047065764245095e-07, + "loss": 0.88, + "step": 15130 + }, + { + "epoch": 0.23562471296130524, + "grad_norm": 3.85225248336792, + "learning_rate": 8.046246374199046e-07, + "loss": 0.8683, + "step": 15135 + }, + { + "epoch": 0.2357025539632746, + "grad_norm": 3.4142606258392334, + "learning_rate": 8.045426984152997e-07, + "loss": 0.636, + "step": 15140 + }, + { + "epoch": 0.235780394965244, + "grad_norm": 4.804329872131348, + "learning_rate": 8.044607594106947e-07, + "loss": 0.7109, + "step": 15145 + }, + { + "epoch": 0.23585823596721336, + "grad_norm": 5.378768444061279, + "learning_rate": 8.043788204060896e-07, + "loss": 0.8313, + "step": 15150 + }, + { + "epoch": 0.23593607696918276, + "grad_norm": 5.029935359954834, + "learning_rate": 8.042968814014847e-07, + "loss": 0.7756, + "step": 15155 + }, + { + "epoch": 0.23601391797115212, + "grad_norm": 3.552234649658203, + "learning_rate": 8.042149423968797e-07, + "loss": 0.7499, + "step": 15160 + }, + { + "epoch": 0.2360917589731215, + "grad_norm": 5.424056053161621, + "learning_rate": 8.041330033922748e-07, + "loss": 0.7542, + "step": 15165 + }, + { + "epoch": 0.23616959997509088, + "grad_norm": 2.835927724838257, + "learning_rate": 8.040510643876698e-07, + "loss": 0.7637, + "step": 15170 + }, + { + "epoch": 0.23624744097706027, + "grad_norm": 3.721679210662842, + "learning_rate": 8.039691253830648e-07, + "loss": 0.6655, + "step": 15175 + }, + { + "epoch": 0.23632528197902963, + "grad_norm": 3.6744985580444336, + "learning_rate": 8.038871863784599e-07, + "loss": 0.7801, + "step": 15180 + }, + { + "epoch": 0.236403122980999, + "grad_norm": 4.117432117462158, + "learning_rate": 8.038052473738549e-07, + "loss": 0.7342, + "step": 15185 + }, + { + "epoch": 0.2364809639829684, + "grad_norm": 3.8154187202453613, + "learning_rate": 8.037233083692498e-07, + "loss": 0.8315, + "step": 15190 + }, + { + "epoch": 0.23655880498493775, + "grad_norm": 2.46156907081604, + "learning_rate": 8.036413693646449e-07, + "loss": 0.7739, + "step": 15195 + }, + { + "epoch": 0.23663664598690715, + "grad_norm": 2.249382257461548, + "learning_rate": 8.0355943036004e-07, + "loss": 0.6846, + "step": 15200 + }, + { + "epoch": 0.2367144869888765, + "grad_norm": 3.7491369247436523, + "learning_rate": 8.034774913554349e-07, + "loss": 0.6684, + "step": 15205 + }, + { + "epoch": 0.2367923279908459, + "grad_norm": 5.3089599609375, + "learning_rate": 8.0339555235083e-07, + "loss": 0.7049, + "step": 15210 + }, + { + "epoch": 0.23687016899281527, + "grad_norm": 9.281523704528809, + "learning_rate": 8.033136133462251e-07, + "loss": 0.8498, + "step": 15215 + }, + { + "epoch": 0.23694800999478466, + "grad_norm": 7.066847324371338, + "learning_rate": 8.0323167434162e-07, + "loss": 0.7983, + "step": 15220 + }, + { + "epoch": 0.23702585099675402, + "grad_norm": 4.059422016143799, + "learning_rate": 8.031497353370151e-07, + "loss": 0.8096, + "step": 15225 + }, + { + "epoch": 0.23710369199872341, + "grad_norm": 3.0671002864837646, + "learning_rate": 8.030677963324101e-07, + "loss": 0.7808, + "step": 15230 + }, + { + "epoch": 0.23718153300069278, + "grad_norm": 4.942725658416748, + "learning_rate": 8.029858573278051e-07, + "loss": 0.6983, + "step": 15235 + }, + { + "epoch": 0.23725937400266217, + "grad_norm": 6.2389235496521, + "learning_rate": 8.029039183232001e-07, + "loss": 0.8416, + "step": 15240 + }, + { + "epoch": 0.23733721500463154, + "grad_norm": 5.170253753662109, + "learning_rate": 8.028219793185952e-07, + "loss": 0.698, + "step": 15245 + }, + { + "epoch": 0.23741505600660093, + "grad_norm": 3.286102294921875, + "learning_rate": 8.027400403139903e-07, + "loss": 0.6214, + "step": 15250 + }, + { + "epoch": 0.2374928970085703, + "grad_norm": 3.854853868484497, + "learning_rate": 8.026581013093853e-07, + "loss": 0.7179, + "step": 15255 + }, + { + "epoch": 0.23757073801053968, + "grad_norm": 3.141479015350342, + "learning_rate": 8.025761623047803e-07, + "loss": 0.8504, + "step": 15260 + }, + { + "epoch": 0.23764857901250905, + "grad_norm": 4.667540073394775, + "learning_rate": 8.024942233001754e-07, + "loss": 0.7591, + "step": 15265 + }, + { + "epoch": 0.23772642001447844, + "grad_norm": 4.2037882804870605, + "learning_rate": 8.024122842955704e-07, + "loss": 0.7391, + "step": 15270 + }, + { + "epoch": 0.2378042610164478, + "grad_norm": 4.822389125823975, + "learning_rate": 8.023303452909653e-07, + "loss": 0.7682, + "step": 15275 + }, + { + "epoch": 0.23788210201841717, + "grad_norm": 3.8853273391723633, + "learning_rate": 8.022484062863604e-07, + "loss": 0.7997, + "step": 15280 + }, + { + "epoch": 0.23795994302038656, + "grad_norm": 3.6971592903137207, + "learning_rate": 8.021664672817554e-07, + "loss": 0.7844, + "step": 15285 + }, + { + "epoch": 0.23803778402235592, + "grad_norm": 4.4559712409973145, + "learning_rate": 8.020845282771505e-07, + "loss": 0.7987, + "step": 15290 + }, + { + "epoch": 0.23811562502432532, + "grad_norm": 4.48579740524292, + "learning_rate": 8.020025892725455e-07, + "loss": 0.692, + "step": 15295 + }, + { + "epoch": 0.23819346602629468, + "grad_norm": 3.990757465362549, + "learning_rate": 8.019206502679405e-07, + "loss": 0.7665, + "step": 15300 + }, + { + "epoch": 0.23827130702826407, + "grad_norm": 4.188387870788574, + "learning_rate": 8.018387112633356e-07, + "loss": 0.7027, + "step": 15305 + }, + { + "epoch": 0.23834914803023344, + "grad_norm": 3.3271737098693848, + "learning_rate": 8.017567722587307e-07, + "loss": 0.7322, + "step": 15310 + }, + { + "epoch": 0.23842698903220283, + "grad_norm": 4.802268028259277, + "learning_rate": 8.016748332541255e-07, + "loss": 0.8131, + "step": 15315 + }, + { + "epoch": 0.2385048300341722, + "grad_norm": 3.292767286300659, + "learning_rate": 8.015928942495206e-07, + "loss": 0.837, + "step": 15320 + }, + { + "epoch": 0.23858267103614159, + "grad_norm": 8.88932991027832, + "learning_rate": 8.015109552449157e-07, + "loss": 0.7772, + "step": 15325 + }, + { + "epoch": 0.23866051203811095, + "grad_norm": 3.7067458629608154, + "learning_rate": 8.014290162403106e-07, + "loss": 0.7729, + "step": 15330 + }, + { + "epoch": 0.23873835304008034, + "grad_norm": 4.151866912841797, + "learning_rate": 8.013470772357057e-07, + "loss": 0.8282, + "step": 15335 + }, + { + "epoch": 0.2388161940420497, + "grad_norm": 5.464903831481934, + "learning_rate": 8.012651382311008e-07, + "loss": 0.6652, + "step": 15340 + }, + { + "epoch": 0.2388940350440191, + "grad_norm": 3.9546003341674805, + "learning_rate": 8.011831992264958e-07, + "loss": 0.7602, + "step": 15345 + }, + { + "epoch": 0.23897187604598846, + "grad_norm": 3.7713892459869385, + "learning_rate": 8.011012602218908e-07, + "loss": 0.823, + "step": 15350 + }, + { + "epoch": 0.23904971704795785, + "grad_norm": 3.8322677612304688, + "learning_rate": 8.010193212172858e-07, + "loss": 0.8645, + "step": 15355 + }, + { + "epoch": 0.23912755804992722, + "grad_norm": 3.5602362155914307, + "learning_rate": 8.009373822126808e-07, + "loss": 0.7581, + "step": 15360 + }, + { + "epoch": 0.23920539905189658, + "grad_norm": 3.7427737712860107, + "learning_rate": 8.008554432080758e-07, + "loss": 0.9271, + "step": 15365 + }, + { + "epoch": 0.23928324005386598, + "grad_norm": 3.3515186309814453, + "learning_rate": 8.007735042034709e-07, + "loss": 0.5673, + "step": 15370 + }, + { + "epoch": 0.23936108105583534, + "grad_norm": 5.299621105194092, + "learning_rate": 8.00691565198866e-07, + "loss": 0.809, + "step": 15375 + }, + { + "epoch": 0.23943892205780473, + "grad_norm": 3.726717233657837, + "learning_rate": 8.00609626194261e-07, + "loss": 0.8541, + "step": 15380 + }, + { + "epoch": 0.2395167630597741, + "grad_norm": 4.9893083572387695, + "learning_rate": 8.00527687189656e-07, + "loss": 0.6938, + "step": 15385 + }, + { + "epoch": 0.2395946040617435, + "grad_norm": 3.4375534057617188, + "learning_rate": 8.004457481850511e-07, + "loss": 0.6838, + "step": 15390 + }, + { + "epoch": 0.23967244506371285, + "grad_norm": 10.630546569824219, + "learning_rate": 8.00363809180446e-07, + "loss": 0.9148, + "step": 15395 + }, + { + "epoch": 0.23975028606568224, + "grad_norm": 4.449067115783691, + "learning_rate": 8.00281870175841e-07, + "loss": 0.7932, + "step": 15400 + }, + { + "epoch": 0.2398281270676516, + "grad_norm": 3.6890759468078613, + "learning_rate": 8.001999311712361e-07, + "loss": 0.7808, + "step": 15405 + }, + { + "epoch": 0.239905968069621, + "grad_norm": 5.842904567718506, + "learning_rate": 8.001179921666311e-07, + "loss": 0.784, + "step": 15410 + }, + { + "epoch": 0.23998380907159036, + "grad_norm": 3.3193156719207764, + "learning_rate": 8.000360531620262e-07, + "loss": 0.7356, + "step": 15415 + }, + { + "epoch": 0.24006165007355976, + "grad_norm": 3.566133737564087, + "learning_rate": 7.999541141574212e-07, + "loss": 0.77, + "step": 15420 + }, + { + "epoch": 0.24013949107552912, + "grad_norm": 3.7197113037109375, + "learning_rate": 7.998721751528162e-07, + "loss": 0.7782, + "step": 15425 + }, + { + "epoch": 0.2402173320774985, + "grad_norm": 4.48840856552124, + "learning_rate": 7.997902361482113e-07, + "loss": 0.7564, + "step": 15430 + }, + { + "epoch": 0.24029517307946788, + "grad_norm": 3.8312463760375977, + "learning_rate": 7.997082971436063e-07, + "loss": 0.7527, + "step": 15435 + }, + { + "epoch": 0.24037301408143727, + "grad_norm": 3.0325686931610107, + "learning_rate": 7.996263581390012e-07, + "loss": 0.8293, + "step": 15440 + }, + { + "epoch": 0.24045085508340663, + "grad_norm": 5.339194297790527, + "learning_rate": 7.995444191343963e-07, + "loss": 0.7972, + "step": 15445 + }, + { + "epoch": 0.240528696085376, + "grad_norm": 3.5099070072174072, + "learning_rate": 7.994624801297914e-07, + "loss": 0.6688, + "step": 15450 + }, + { + "epoch": 0.2406065370873454, + "grad_norm": 4.041937828063965, + "learning_rate": 7.993805411251863e-07, + "loss": 0.7599, + "step": 15455 + }, + { + "epoch": 0.24068437808931475, + "grad_norm": 2.9792280197143555, + "learning_rate": 7.992986021205814e-07, + "loss": 0.763, + "step": 15460 + }, + { + "epoch": 0.24076221909128415, + "grad_norm": 3.3873233795166016, + "learning_rate": 7.992166631159765e-07, + "loss": 0.7828, + "step": 15465 + }, + { + "epoch": 0.2408400600932535, + "grad_norm": 7.740002155303955, + "learning_rate": 7.991347241113715e-07, + "loss": 0.8105, + "step": 15470 + }, + { + "epoch": 0.2409179010952229, + "grad_norm": 3.067972183227539, + "learning_rate": 7.990527851067664e-07, + "loss": 0.7807, + "step": 15475 + }, + { + "epoch": 0.24099574209719227, + "grad_norm": 5.455771446228027, + "learning_rate": 7.989708461021615e-07, + "loss": 0.8325, + "step": 15480 + }, + { + "epoch": 0.24107358309916166, + "grad_norm": 5.027036190032959, + "learning_rate": 7.988889070975565e-07, + "loss": 0.8411, + "step": 15485 + }, + { + "epoch": 0.24115142410113102, + "grad_norm": 3.955979347229004, + "learning_rate": 7.988069680929515e-07, + "loss": 0.7524, + "step": 15490 + }, + { + "epoch": 0.24122926510310042, + "grad_norm": 8.366657257080078, + "learning_rate": 7.987250290883466e-07, + "loss": 0.6843, + "step": 15495 + }, + { + "epoch": 0.24130710610506978, + "grad_norm": 3.6064229011535645, + "learning_rate": 7.986430900837417e-07, + "loss": 0.6553, + "step": 15500 + }, + { + "epoch": 0.24138494710703917, + "grad_norm": 4.347894668579102, + "learning_rate": 7.985611510791367e-07, + "loss": 0.8618, + "step": 15505 + }, + { + "epoch": 0.24146278810900854, + "grad_norm": 3.172020435333252, + "learning_rate": 7.984792120745317e-07, + "loss": 0.7782, + "step": 15510 + }, + { + "epoch": 0.24154062911097793, + "grad_norm": 2.4569530487060547, + "learning_rate": 7.983972730699267e-07, + "loss": 0.7837, + "step": 15515 + }, + { + "epoch": 0.2416184701129473, + "grad_norm": 3.2561228275299072, + "learning_rate": 7.983153340653217e-07, + "loss": 0.7358, + "step": 15520 + }, + { + "epoch": 0.24169631111491668, + "grad_norm": 3.373246431350708, + "learning_rate": 7.982333950607168e-07, + "loss": 0.698, + "step": 15525 + }, + { + "epoch": 0.24177415211688605, + "grad_norm": 3.8543834686279297, + "learning_rate": 7.981514560561118e-07, + "loss": 0.8319, + "step": 15530 + }, + { + "epoch": 0.2418519931188554, + "grad_norm": 5.210475921630859, + "learning_rate": 7.980695170515068e-07, + "loss": 0.7735, + "step": 15535 + }, + { + "epoch": 0.2419298341208248, + "grad_norm": 5.1110358238220215, + "learning_rate": 7.979875780469019e-07, + "loss": 0.7848, + "step": 15540 + }, + { + "epoch": 0.24200767512279417, + "grad_norm": 4.9611663818359375, + "learning_rate": 7.979056390422969e-07, + "loss": 0.8454, + "step": 15545 + }, + { + "epoch": 0.24208551612476356, + "grad_norm": 2.9595394134521484, + "learning_rate": 7.978237000376919e-07, + "loss": 0.7968, + "step": 15550 + }, + { + "epoch": 0.24216335712673293, + "grad_norm": 5.836178302764893, + "learning_rate": 7.977417610330869e-07, + "loss": 0.8707, + "step": 15555 + }, + { + "epoch": 0.24224119812870232, + "grad_norm": 3.4866228103637695, + "learning_rate": 7.97659822028482e-07, + "loss": 0.7657, + "step": 15560 + }, + { + "epoch": 0.24231903913067168, + "grad_norm": 3.4374146461486816, + "learning_rate": 7.975778830238769e-07, + "loss": 0.7517, + "step": 15565 + }, + { + "epoch": 0.24239688013264107, + "grad_norm": 3.317115068435669, + "learning_rate": 7.97495944019272e-07, + "loss": 0.734, + "step": 15570 + }, + { + "epoch": 0.24247472113461044, + "grad_norm": 3.37998104095459, + "learning_rate": 7.974140050146671e-07, + "loss": 0.7899, + "step": 15575 + }, + { + "epoch": 0.24255256213657983, + "grad_norm": 4.553988933563232, + "learning_rate": 7.97332066010062e-07, + "loss": 0.8539, + "step": 15580 + }, + { + "epoch": 0.2426304031385492, + "grad_norm": 2.3986995220184326, + "learning_rate": 7.972501270054571e-07, + "loss": 0.6941, + "step": 15585 + }, + { + "epoch": 0.2427082441405186, + "grad_norm": 5.917320728302002, + "learning_rate": 7.971681880008522e-07, + "loss": 0.7735, + "step": 15590 + }, + { + "epoch": 0.24278608514248795, + "grad_norm": 3.8159122467041016, + "learning_rate": 7.970862489962472e-07, + "loss": 0.7445, + "step": 15595 + }, + { + "epoch": 0.24286392614445734, + "grad_norm": 4.68142032623291, + "learning_rate": 7.970043099916421e-07, + "loss": 0.7696, + "step": 15600 + }, + { + "epoch": 0.2429417671464267, + "grad_norm": 3.4699482917785645, + "learning_rate": 7.969223709870372e-07, + "loss": 0.8056, + "step": 15605 + }, + { + "epoch": 0.2430196081483961, + "grad_norm": 2.8645105361938477, + "learning_rate": 7.968404319824322e-07, + "loss": 0.8157, + "step": 15610 + }, + { + "epoch": 0.24309744915036546, + "grad_norm": 5.023718357086182, + "learning_rate": 7.967584929778273e-07, + "loss": 0.7347, + "step": 15615 + }, + { + "epoch": 0.24317529015233483, + "grad_norm": 5.544175148010254, + "learning_rate": 7.966765539732223e-07, + "loss": 0.865, + "step": 15620 + }, + { + "epoch": 0.24325313115430422, + "grad_norm": 3.5016930103302, + "learning_rate": 7.965946149686174e-07, + "loss": 0.8305, + "step": 15625 + }, + { + "epoch": 0.24333097215627358, + "grad_norm": 3.4511895179748535, + "learning_rate": 7.965126759640124e-07, + "loss": 0.7034, + "step": 15630 + }, + { + "epoch": 0.24340881315824298, + "grad_norm": 7.369199275970459, + "learning_rate": 7.964307369594074e-07, + "loss": 0.7782, + "step": 15635 + }, + { + "epoch": 0.24348665416021234, + "grad_norm": 3.0479769706726074, + "learning_rate": 7.963487979548024e-07, + "loss": 0.7787, + "step": 15640 + }, + { + "epoch": 0.24356449516218173, + "grad_norm": 3.6762473583221436, + "learning_rate": 7.962668589501974e-07, + "loss": 0.7846, + "step": 15645 + }, + { + "epoch": 0.2436423361641511, + "grad_norm": 2.366706371307373, + "learning_rate": 7.961849199455925e-07, + "loss": 0.8315, + "step": 15650 + }, + { + "epoch": 0.2437201771661205, + "grad_norm": 3.059541940689087, + "learning_rate": 7.961029809409875e-07, + "loss": 0.9155, + "step": 15655 + }, + { + "epoch": 0.24379801816808985, + "grad_norm": 3.0604653358459473, + "learning_rate": 7.960210419363825e-07, + "loss": 0.7558, + "step": 15660 + }, + { + "epoch": 0.24387585917005924, + "grad_norm": 3.0823192596435547, + "learning_rate": 7.959391029317776e-07, + "loss": 0.7389, + "step": 15665 + }, + { + "epoch": 0.2439537001720286, + "grad_norm": 4.451327800750732, + "learning_rate": 7.958571639271727e-07, + "loss": 0.7755, + "step": 15670 + }, + { + "epoch": 0.244031541173998, + "grad_norm": 2.8147003650665283, + "learning_rate": 7.957752249225676e-07, + "loss": 0.7032, + "step": 15675 + }, + { + "epoch": 0.24410938217596737, + "grad_norm": 3.849137544631958, + "learning_rate": 7.956932859179626e-07, + "loss": 0.6895, + "step": 15680 + }, + { + "epoch": 0.24418722317793676, + "grad_norm": 6.4774627685546875, + "learning_rate": 7.956113469133577e-07, + "loss": 0.7932, + "step": 15685 + }, + { + "epoch": 0.24426506417990612, + "grad_norm": 6.876297473907471, + "learning_rate": 7.955294079087526e-07, + "loss": 0.8019, + "step": 15690 + }, + { + "epoch": 0.2443429051818755, + "grad_norm": 4.014187812805176, + "learning_rate": 7.954474689041477e-07, + "loss": 0.8425, + "step": 15695 + }, + { + "epoch": 0.24442074618384488, + "grad_norm": 4.625513553619385, + "learning_rate": 7.953655298995428e-07, + "loss": 0.8028, + "step": 15700 + }, + { + "epoch": 0.24449858718581424, + "grad_norm": 7.612455368041992, + "learning_rate": 7.952835908949378e-07, + "loss": 0.9008, + "step": 15705 + }, + { + "epoch": 0.24457642818778363, + "grad_norm": 6.782050609588623, + "learning_rate": 7.952016518903328e-07, + "loss": 0.9255, + "step": 15710 + }, + { + "epoch": 0.244654269189753, + "grad_norm": 4.165322780609131, + "learning_rate": 7.951197128857279e-07, + "loss": 0.8253, + "step": 15715 + }, + { + "epoch": 0.2447321101917224, + "grad_norm": 6.379940032958984, + "learning_rate": 7.950377738811228e-07, + "loss": 0.8267, + "step": 15720 + }, + { + "epoch": 0.24480995119369175, + "grad_norm": 2.715947389602661, + "learning_rate": 7.949558348765178e-07, + "loss": 0.7533, + "step": 15725 + }, + { + "epoch": 0.24488779219566115, + "grad_norm": 5.163042068481445, + "learning_rate": 7.948738958719129e-07, + "loss": 0.8279, + "step": 15730 + }, + { + "epoch": 0.2449656331976305, + "grad_norm": 3.102045774459839, + "learning_rate": 7.947919568673079e-07, + "loss": 0.8667, + "step": 15735 + }, + { + "epoch": 0.2450434741995999, + "grad_norm": 4.849552631378174, + "learning_rate": 7.94710017862703e-07, + "loss": 0.7192, + "step": 15740 + }, + { + "epoch": 0.24512131520156927, + "grad_norm": 3.216055393218994, + "learning_rate": 7.94628078858098e-07, + "loss": 0.8751, + "step": 15745 + }, + { + "epoch": 0.24519915620353866, + "grad_norm": 7.7601237297058105, + "learning_rate": 7.945461398534931e-07, + "loss": 0.7621, + "step": 15750 + }, + { + "epoch": 0.24527699720550802, + "grad_norm": 3.9958834648132324, + "learning_rate": 7.944642008488881e-07, + "loss": 0.706, + "step": 15755 + }, + { + "epoch": 0.24535483820747742, + "grad_norm": 2.99743390083313, + "learning_rate": 7.94382261844283e-07, + "loss": 0.7574, + "step": 15760 + }, + { + "epoch": 0.24543267920944678, + "grad_norm": 2.5857388973236084, + "learning_rate": 7.943003228396781e-07, + "loss": 0.8547, + "step": 15765 + }, + { + "epoch": 0.24551052021141617, + "grad_norm": 4.666770935058594, + "learning_rate": 7.942183838350731e-07, + "loss": 0.7977, + "step": 15770 + }, + { + "epoch": 0.24558836121338554, + "grad_norm": 7.485809803009033, + "learning_rate": 7.941364448304682e-07, + "loss": 0.8136, + "step": 15775 + }, + { + "epoch": 0.24566620221535493, + "grad_norm": 3.478633165359497, + "learning_rate": 7.940545058258632e-07, + "loss": 0.7625, + "step": 15780 + }, + { + "epoch": 0.2457440432173243, + "grad_norm": 4.919050216674805, + "learning_rate": 7.939725668212582e-07, + "loss": 0.7858, + "step": 15785 + }, + { + "epoch": 0.24582188421929366, + "grad_norm": 4.7012505531311035, + "learning_rate": 7.938906278166533e-07, + "loss": 0.7982, + "step": 15790 + }, + { + "epoch": 0.24589972522126305, + "grad_norm": 3.941943883895874, + "learning_rate": 7.938086888120484e-07, + "loss": 0.7948, + "step": 15795 + }, + { + "epoch": 0.2459775662232324, + "grad_norm": 2.9950075149536133, + "learning_rate": 7.937267498074432e-07, + "loss": 0.8232, + "step": 15800 + }, + { + "epoch": 0.2460554072252018, + "grad_norm": 3.231367588043213, + "learning_rate": 7.936448108028383e-07, + "loss": 0.6793, + "step": 15805 + }, + { + "epoch": 0.24613324822717117, + "grad_norm": 6.495508670806885, + "learning_rate": 7.935628717982334e-07, + "loss": 0.7306, + "step": 15810 + }, + { + "epoch": 0.24621108922914056, + "grad_norm": 3.594294786453247, + "learning_rate": 7.934809327936283e-07, + "loss": 0.8135, + "step": 15815 + }, + { + "epoch": 0.24628893023110993, + "grad_norm": 4.430262088775635, + "learning_rate": 7.933989937890234e-07, + "loss": 0.7158, + "step": 15820 + }, + { + "epoch": 0.24636677123307932, + "grad_norm": 5.1624836921691895, + "learning_rate": 7.933170547844185e-07, + "loss": 0.7402, + "step": 15825 + }, + { + "epoch": 0.24644461223504868, + "grad_norm": 2.362281322479248, + "learning_rate": 7.932351157798135e-07, + "loss": 0.6743, + "step": 15830 + }, + { + "epoch": 0.24652245323701807, + "grad_norm": 2.9471206665039062, + "learning_rate": 7.931531767752085e-07, + "loss": 0.6006, + "step": 15835 + }, + { + "epoch": 0.24660029423898744, + "grad_norm": 3.280003786087036, + "learning_rate": 7.930712377706035e-07, + "loss": 0.7592, + "step": 15840 + }, + { + "epoch": 0.24667813524095683, + "grad_norm": 5.314359188079834, + "learning_rate": 7.929892987659985e-07, + "loss": 0.8627, + "step": 15845 + }, + { + "epoch": 0.2467559762429262, + "grad_norm": 6.30027437210083, + "learning_rate": 7.929073597613935e-07, + "loss": 0.8176, + "step": 15850 + }, + { + "epoch": 0.2468338172448956, + "grad_norm": 8.492682456970215, + "learning_rate": 7.928254207567886e-07, + "loss": 0.8596, + "step": 15855 + }, + { + "epoch": 0.24691165824686495, + "grad_norm": 2.7843966484069824, + "learning_rate": 7.927434817521836e-07, + "loss": 0.7205, + "step": 15860 + }, + { + "epoch": 0.24698949924883434, + "grad_norm": 3.647057294845581, + "learning_rate": 7.926615427475787e-07, + "loss": 0.7801, + "step": 15865 + }, + { + "epoch": 0.2470673402508037, + "grad_norm": 5.774316310882568, + "learning_rate": 7.925796037429737e-07, + "loss": 0.7323, + "step": 15870 + }, + { + "epoch": 0.2471451812527731, + "grad_norm": 2.779609441757202, + "learning_rate": 7.924976647383688e-07, + "loss": 0.7907, + "step": 15875 + }, + { + "epoch": 0.24722302225474246, + "grad_norm": 5.235540390014648, + "learning_rate": 7.924157257337637e-07, + "loss": 0.8759, + "step": 15880 + }, + { + "epoch": 0.24730086325671183, + "grad_norm": 3.683673620223999, + "learning_rate": 7.923337867291588e-07, + "loss": 0.8407, + "step": 15885 + }, + { + "epoch": 0.24737870425868122, + "grad_norm": 4.882210731506348, + "learning_rate": 7.922518477245538e-07, + "loss": 0.654, + "step": 15890 + }, + { + "epoch": 0.24745654526065058, + "grad_norm": 6.311478137969971, + "learning_rate": 7.921699087199488e-07, + "loss": 0.7533, + "step": 15895 + }, + { + "epoch": 0.24753438626261998, + "grad_norm": 3.136744260787964, + "learning_rate": 7.920879697153439e-07, + "loss": 0.7993, + "step": 15900 + }, + { + "epoch": 0.24761222726458934, + "grad_norm": 3.370462656021118, + "learning_rate": 7.92006030710739e-07, + "loss": 0.7827, + "step": 15905 + }, + { + "epoch": 0.24769006826655873, + "grad_norm": 6.1946892738342285, + "learning_rate": 7.919240917061339e-07, + "loss": 0.6617, + "step": 15910 + }, + { + "epoch": 0.2477679092685281, + "grad_norm": 4.19883918762207, + "learning_rate": 7.91842152701529e-07, + "loss": 0.6765, + "step": 15915 + }, + { + "epoch": 0.2478457502704975, + "grad_norm": 7.124974250793457, + "learning_rate": 7.917602136969241e-07, + "loss": 0.877, + "step": 15920 + }, + { + "epoch": 0.24792359127246685, + "grad_norm": 3.6568546295166016, + "learning_rate": 7.916782746923189e-07, + "loss": 0.7851, + "step": 15925 + }, + { + "epoch": 0.24800143227443625, + "grad_norm": 3.275956392288208, + "learning_rate": 7.91596335687714e-07, + "loss": 0.8097, + "step": 15930 + }, + { + "epoch": 0.2480792732764056, + "grad_norm": 3.5456111431121826, + "learning_rate": 7.915143966831091e-07, + "loss": 0.825, + "step": 15935 + }, + { + "epoch": 0.248157114278375, + "grad_norm": 5.092121601104736, + "learning_rate": 7.91432457678504e-07, + "loss": 0.7222, + "step": 15940 + }, + { + "epoch": 0.24823495528034437, + "grad_norm": 3.306962251663208, + "learning_rate": 7.913505186738991e-07, + "loss": 0.8208, + "step": 15945 + }, + { + "epoch": 0.24831279628231376, + "grad_norm": 3.8366684913635254, + "learning_rate": 7.912685796692942e-07, + "loss": 0.71, + "step": 15950 + }, + { + "epoch": 0.24839063728428312, + "grad_norm": 7.828287124633789, + "learning_rate": 7.911866406646892e-07, + "loss": 0.7674, + "step": 15955 + }, + { + "epoch": 0.24846847828625251, + "grad_norm": 6.6405510902404785, + "learning_rate": 7.911047016600842e-07, + "loss": 0.7211, + "step": 15960 + }, + { + "epoch": 0.24854631928822188, + "grad_norm": 5.808908462524414, + "learning_rate": 7.910227626554792e-07, + "loss": 0.6894, + "step": 15965 + }, + { + "epoch": 0.24862416029019124, + "grad_norm": 3.5409669876098633, + "learning_rate": 7.909408236508742e-07, + "loss": 0.953, + "step": 15970 + }, + { + "epoch": 0.24870200129216063, + "grad_norm": 4.125702381134033, + "learning_rate": 7.908588846462693e-07, + "loss": 0.8751, + "step": 15975 + }, + { + "epoch": 0.24877984229413, + "grad_norm": 5.823986053466797, + "learning_rate": 7.907769456416643e-07, + "loss": 0.7372, + "step": 15980 + }, + { + "epoch": 0.2488576832960994, + "grad_norm": 2.8857569694519043, + "learning_rate": 7.906950066370593e-07, + "loss": 0.7657, + "step": 15985 + }, + { + "epoch": 0.24893552429806876, + "grad_norm": 3.998352289199829, + "learning_rate": 7.906130676324544e-07, + "loss": 0.7888, + "step": 15990 + }, + { + "epoch": 0.24901336530003815, + "grad_norm": 3.0391101837158203, + "learning_rate": 7.905311286278494e-07, + "loss": 0.8028, + "step": 15995 + }, + { + "epoch": 0.2490912063020075, + "grad_norm": 5.301479816436768, + "learning_rate": 7.904491896232445e-07, + "loss": 0.8295, + "step": 16000 + }, + { + "epoch": 0.2491690473039769, + "grad_norm": 3.8986215591430664, + "learning_rate": 7.903672506186394e-07, + "loss": 0.7564, + "step": 16005 + }, + { + "epoch": 0.24924688830594627, + "grad_norm": 2.7448720932006836, + "learning_rate": 7.902853116140345e-07, + "loss": 0.6304, + "step": 16010 + }, + { + "epoch": 0.24932472930791566, + "grad_norm": 9.283259391784668, + "learning_rate": 7.902033726094295e-07, + "loss": 0.77, + "step": 16015 + }, + { + "epoch": 0.24940257030988502, + "grad_norm": 3.1082334518432617, + "learning_rate": 7.901214336048245e-07, + "loss": 0.7327, + "step": 16020 + }, + { + "epoch": 0.24948041131185442, + "grad_norm": 3.517364025115967, + "learning_rate": 7.900394946002196e-07, + "loss": 0.7992, + "step": 16025 + }, + { + "epoch": 0.24955825231382378, + "grad_norm": 3.1072371006011963, + "learning_rate": 7.899575555956147e-07, + "loss": 0.7692, + "step": 16030 + }, + { + "epoch": 0.24963609331579317, + "grad_norm": 4.7211527824401855, + "learning_rate": 7.898756165910096e-07, + "loss": 0.6787, + "step": 16035 + }, + { + "epoch": 0.24971393431776254, + "grad_norm": 5.028372764587402, + "learning_rate": 7.897936775864047e-07, + "loss": 0.6971, + "step": 16040 + }, + { + "epoch": 0.24979177531973193, + "grad_norm": 3.3576149940490723, + "learning_rate": 7.897117385817997e-07, + "loss": 0.7766, + "step": 16045 + }, + { + "epoch": 0.2498696163217013, + "grad_norm": 3.800384998321533, + "learning_rate": 7.896297995771946e-07, + "loss": 0.8661, + "step": 16050 + }, + { + "epoch": 0.24994745732367066, + "grad_norm": 2.6306138038635254, + "learning_rate": 7.895478605725897e-07, + "loss": 0.7385, + "step": 16055 + }, + { + "epoch": 0.25002529832564, + "grad_norm": 3.010930061340332, + "learning_rate": 7.894659215679848e-07, + "loss": 0.6919, + "step": 16060 + }, + { + "epoch": 0.2501031393276094, + "grad_norm": 5.057335376739502, + "learning_rate": 7.893839825633798e-07, + "loss": 0.8734, + "step": 16065 + }, + { + "epoch": 0.2501809803295788, + "grad_norm": 2.867112398147583, + "learning_rate": 7.893020435587748e-07, + "loss": 0.8205, + "step": 16070 + }, + { + "epoch": 0.2502588213315482, + "grad_norm": 3.206636905670166, + "learning_rate": 7.892201045541699e-07, + "loss": 0.7503, + "step": 16075 + }, + { + "epoch": 0.25033666233351753, + "grad_norm": 2.602874755859375, + "learning_rate": 7.891381655495649e-07, + "loss": 0.632, + "step": 16080 + }, + { + "epoch": 0.2504145033354869, + "grad_norm": 12.468692779541016, + "learning_rate": 7.890562265449598e-07, + "loss": 0.8202, + "step": 16085 + }, + { + "epoch": 0.2504923443374563, + "grad_norm": 5.095680236816406, + "learning_rate": 7.889742875403549e-07, + "loss": 0.6518, + "step": 16090 + }, + { + "epoch": 0.2505701853394257, + "grad_norm": 3.303833246231079, + "learning_rate": 7.888923485357499e-07, + "loss": 0.6671, + "step": 16095 + }, + { + "epoch": 0.25064802634139505, + "grad_norm": 4.023092746734619, + "learning_rate": 7.88810409531145e-07, + "loss": 0.784, + "step": 16100 + }, + { + "epoch": 0.25072586734336444, + "grad_norm": 4.499414443969727, + "learning_rate": 7.8872847052654e-07, + "loss": 0.8782, + "step": 16105 + }, + { + "epoch": 0.25080370834533383, + "grad_norm": 2.9642868041992188, + "learning_rate": 7.88646531521935e-07, + "loss": 0.7308, + "step": 16110 + }, + { + "epoch": 0.2508815493473032, + "grad_norm": 3.3162338733673096, + "learning_rate": 7.885645925173301e-07, + "loss": 0.6679, + "step": 16115 + }, + { + "epoch": 0.25095939034927256, + "grad_norm": 3.08477783203125, + "learning_rate": 7.884826535127252e-07, + "loss": 0.7544, + "step": 16120 + }, + { + "epoch": 0.25103723135124195, + "grad_norm": 3.925215482711792, + "learning_rate": 7.8840071450812e-07, + "loss": 0.7136, + "step": 16125 + }, + { + "epoch": 0.25111507235321134, + "grad_norm": 2.451855182647705, + "learning_rate": 7.883187755035151e-07, + "loss": 0.7571, + "step": 16130 + }, + { + "epoch": 0.2511929133551807, + "grad_norm": 4.570413112640381, + "learning_rate": 7.882368364989102e-07, + "loss": 0.7323, + "step": 16135 + }, + { + "epoch": 0.2512707543571501, + "grad_norm": 4.142566204071045, + "learning_rate": 7.881548974943052e-07, + "loss": 0.7532, + "step": 16140 + }, + { + "epoch": 0.25134859535911946, + "grad_norm": 3.8024239540100098, + "learning_rate": 7.880729584897002e-07, + "loss": 0.7589, + "step": 16145 + }, + { + "epoch": 0.25142643636108886, + "grad_norm": 7.670729637145996, + "learning_rate": 7.879910194850953e-07, + "loss": 0.7944, + "step": 16150 + }, + { + "epoch": 0.2515042773630582, + "grad_norm": 4.074754238128662, + "learning_rate": 7.879090804804904e-07, + "loss": 0.7572, + "step": 16155 + }, + { + "epoch": 0.2515821183650276, + "grad_norm": 4.261497974395752, + "learning_rate": 7.878271414758853e-07, + "loss": 0.7098, + "step": 16160 + }, + { + "epoch": 0.251659959366997, + "grad_norm": 3.9497246742248535, + "learning_rate": 7.877452024712803e-07, + "loss": 0.7485, + "step": 16165 + }, + { + "epoch": 0.25173780036896637, + "grad_norm": 3.1572659015655518, + "learning_rate": 7.876632634666754e-07, + "loss": 0.8074, + "step": 16170 + }, + { + "epoch": 0.2518156413709357, + "grad_norm": 5.557485580444336, + "learning_rate": 7.875813244620703e-07, + "loss": 0.8997, + "step": 16175 + }, + { + "epoch": 0.2518934823729051, + "grad_norm": 3.017874240875244, + "learning_rate": 7.874993854574654e-07, + "loss": 0.7874, + "step": 16180 + }, + { + "epoch": 0.2519713233748745, + "grad_norm": 4.374793529510498, + "learning_rate": 7.874174464528605e-07, + "loss": 0.762, + "step": 16185 + }, + { + "epoch": 0.2520491643768439, + "grad_norm": 2.7853641510009766, + "learning_rate": 7.873355074482555e-07, + "loss": 0.8455, + "step": 16190 + }, + { + "epoch": 0.2521270053788132, + "grad_norm": 3.7544867992401123, + "learning_rate": 7.872535684436505e-07, + "loss": 0.699, + "step": 16195 + }, + { + "epoch": 0.2522048463807826, + "grad_norm": 7.739635944366455, + "learning_rate": 7.871716294390456e-07, + "loss": 0.782, + "step": 16200 + }, + { + "epoch": 0.252282687382752, + "grad_norm": 3.971346855163574, + "learning_rate": 7.870896904344405e-07, + "loss": 0.6151, + "step": 16205 + }, + { + "epoch": 0.2523605283847214, + "grad_norm": 4.750772953033447, + "learning_rate": 7.870077514298355e-07, + "loss": 0.8195, + "step": 16210 + }, + { + "epoch": 0.25243836938669073, + "grad_norm": 3.0286149978637695, + "learning_rate": 7.869258124252306e-07, + "loss": 0.7301, + "step": 16215 + }, + { + "epoch": 0.2525162103886601, + "grad_norm": 4.7284932136535645, + "learning_rate": 7.868438734206256e-07, + "loss": 0.8878, + "step": 16220 + }, + { + "epoch": 0.2525940513906295, + "grad_norm": 7.5153398513793945, + "learning_rate": 7.867619344160207e-07, + "loss": 0.7866, + "step": 16225 + }, + { + "epoch": 0.25267189239259885, + "grad_norm": 5.353423595428467, + "learning_rate": 7.866799954114157e-07, + "loss": 0.7017, + "step": 16230 + }, + { + "epoch": 0.25274973339456824, + "grad_norm": 3.3869919776916504, + "learning_rate": 7.865980564068107e-07, + "loss": 0.8093, + "step": 16235 + }, + { + "epoch": 0.25282757439653764, + "grad_norm": 4.509708881378174, + "learning_rate": 7.865161174022058e-07, + "loss": 0.6865, + "step": 16240 + }, + { + "epoch": 0.252905415398507, + "grad_norm": 4.100122928619385, + "learning_rate": 7.864341783976009e-07, + "loss": 0.8192, + "step": 16245 + }, + { + "epoch": 0.25298325640047636, + "grad_norm": 6.802055358886719, + "learning_rate": 7.863522393929957e-07, + "loss": 0.775, + "step": 16250 + }, + { + "epoch": 0.25306109740244576, + "grad_norm": 5.319190502166748, + "learning_rate": 7.862703003883908e-07, + "loss": 0.7606, + "step": 16255 + }, + { + "epoch": 0.25313893840441515, + "grad_norm": 4.581395626068115, + "learning_rate": 7.861883613837859e-07, + "loss": 0.7658, + "step": 16260 + }, + { + "epoch": 0.25321677940638454, + "grad_norm": 3.787923812866211, + "learning_rate": 7.86106422379181e-07, + "loss": 0.8342, + "step": 16265 + }, + { + "epoch": 0.2532946204083539, + "grad_norm": 5.891742706298828, + "learning_rate": 7.860244833745759e-07, + "loss": 0.7354, + "step": 16270 + }, + { + "epoch": 0.25337246141032327, + "grad_norm": 3.339656114578247, + "learning_rate": 7.85942544369971e-07, + "loss": 0.7455, + "step": 16275 + }, + { + "epoch": 0.25345030241229266, + "grad_norm": 4.084258556365967, + "learning_rate": 7.858606053653661e-07, + "loss": 0.7613, + "step": 16280 + }, + { + "epoch": 0.25352814341426205, + "grad_norm": 8.612784385681152, + "learning_rate": 7.85778666360761e-07, + "loss": 0.755, + "step": 16285 + }, + { + "epoch": 0.2536059844162314, + "grad_norm": 5.70842981338501, + "learning_rate": 7.85696727356156e-07, + "loss": 0.6829, + "step": 16290 + }, + { + "epoch": 0.2536838254182008, + "grad_norm": 4.7522501945495605, + "learning_rate": 7.856147883515511e-07, + "loss": 0.7412, + "step": 16295 + }, + { + "epoch": 0.2537616664201702, + "grad_norm": 3.4117345809936523, + "learning_rate": 7.85532849346946e-07, + "loss": 0.6579, + "step": 16300 + }, + { + "epoch": 0.25383950742213957, + "grad_norm": 4.271410942077637, + "learning_rate": 7.854509103423411e-07, + "loss": 0.8531, + "step": 16305 + }, + { + "epoch": 0.2539173484241089, + "grad_norm": 4.646153450012207, + "learning_rate": 7.853689713377362e-07, + "loss": 0.8344, + "step": 16310 + }, + { + "epoch": 0.2539951894260783, + "grad_norm": 3.57694411277771, + "learning_rate": 7.852870323331312e-07, + "loss": 0.7752, + "step": 16315 + }, + { + "epoch": 0.2540730304280477, + "grad_norm": 5.518935680389404, + "learning_rate": 7.852050933285262e-07, + "loss": 0.7524, + "step": 16320 + }, + { + "epoch": 0.254150871430017, + "grad_norm": 4.475656509399414, + "learning_rate": 7.851231543239213e-07, + "loss": 0.8602, + "step": 16325 + }, + { + "epoch": 0.2542287124319864, + "grad_norm": 5.29443883895874, + "learning_rate": 7.850412153193162e-07, + "loss": 0.6992, + "step": 16330 + }, + { + "epoch": 0.2543065534339558, + "grad_norm": 4.594540119171143, + "learning_rate": 7.849592763147113e-07, + "loss": 0.823, + "step": 16335 + }, + { + "epoch": 0.2543843944359252, + "grad_norm": 4.070684909820557, + "learning_rate": 7.848773373101063e-07, + "loss": 0.8951, + "step": 16340 + }, + { + "epoch": 0.25446223543789454, + "grad_norm": 4.30888032913208, + "learning_rate": 7.847953983055013e-07, + "loss": 0.9167, + "step": 16345 + }, + { + "epoch": 0.2545400764398639, + "grad_norm": 2.9648544788360596, + "learning_rate": 7.847134593008964e-07, + "loss": 0.7235, + "step": 16350 + }, + { + "epoch": 0.2546179174418333, + "grad_norm": 7.27984094619751, + "learning_rate": 7.846315202962915e-07, + "loss": 0.7889, + "step": 16355 + }, + { + "epoch": 0.2546957584438027, + "grad_norm": 4.1877827644348145, + "learning_rate": 7.845495812916864e-07, + "loss": 0.737, + "step": 16360 + }, + { + "epoch": 0.25477359944577205, + "grad_norm": 3.393552780151367, + "learning_rate": 7.844676422870815e-07, + "loss": 0.7156, + "step": 16365 + }, + { + "epoch": 0.25485144044774144, + "grad_norm": 10.564290046691895, + "learning_rate": 7.843857032824765e-07, + "loss": 0.7583, + "step": 16370 + }, + { + "epoch": 0.25492928144971083, + "grad_norm": 5.3602800369262695, + "learning_rate": 7.843037642778714e-07, + "loss": 0.7714, + "step": 16375 + }, + { + "epoch": 0.2550071224516802, + "grad_norm": 5.475079536437988, + "learning_rate": 7.842218252732665e-07, + "loss": 0.6836, + "step": 16380 + }, + { + "epoch": 0.25508496345364956, + "grad_norm": 3.7718818187713623, + "learning_rate": 7.841398862686616e-07, + "loss": 0.6086, + "step": 16385 + }, + { + "epoch": 0.25516280445561895, + "grad_norm": 2.7791218757629395, + "learning_rate": 7.840579472640567e-07, + "loss": 0.7479, + "step": 16390 + }, + { + "epoch": 0.25524064545758834, + "grad_norm": 5.199838161468506, + "learning_rate": 7.839760082594516e-07, + "loss": 0.8453, + "step": 16395 + }, + { + "epoch": 0.2553184864595577, + "grad_norm": 5.045164585113525, + "learning_rate": 7.838940692548467e-07, + "loss": 0.7707, + "step": 16400 + }, + { + "epoch": 0.2553963274615271, + "grad_norm": 3.9481828212738037, + "learning_rate": 7.838121302502418e-07, + "loss": 0.8319, + "step": 16405 + }, + { + "epoch": 0.25547416846349646, + "grad_norm": 3.1789422035217285, + "learning_rate": 7.837301912456366e-07, + "loss": 0.891, + "step": 16410 + }, + { + "epoch": 0.25555200946546586, + "grad_norm": 3.5486695766448975, + "learning_rate": 7.836482522410317e-07, + "loss": 0.8076, + "step": 16415 + }, + { + "epoch": 0.2556298504674352, + "grad_norm": 2.756218910217285, + "learning_rate": 7.835663132364268e-07, + "loss": 0.8272, + "step": 16420 + }, + { + "epoch": 0.2557076914694046, + "grad_norm": 3.782132148742676, + "learning_rate": 7.834843742318218e-07, + "loss": 0.6728, + "step": 16425 + }, + { + "epoch": 0.255785532471374, + "grad_norm": 3.6226656436920166, + "learning_rate": 7.834024352272168e-07, + "loss": 0.7395, + "step": 16430 + }, + { + "epoch": 0.25586337347334337, + "grad_norm": 3.3157193660736084, + "learning_rate": 7.833204962226119e-07, + "loss": 0.8317, + "step": 16435 + }, + { + "epoch": 0.2559412144753127, + "grad_norm": 3.4248037338256836, + "learning_rate": 7.832385572180069e-07, + "loss": 0.7668, + "step": 16440 + }, + { + "epoch": 0.2560190554772821, + "grad_norm": 7.231293201446533, + "learning_rate": 7.83156618213402e-07, + "loss": 0.6963, + "step": 16445 + }, + { + "epoch": 0.2560968964792515, + "grad_norm": 10.40032958984375, + "learning_rate": 7.830746792087969e-07, + "loss": 0.6711, + "step": 16450 + }, + { + "epoch": 0.2561747374812209, + "grad_norm": 6.381346702575684, + "learning_rate": 7.829927402041919e-07, + "loss": 0.7014, + "step": 16455 + }, + { + "epoch": 0.2562525784831902, + "grad_norm": 2.866187572479248, + "learning_rate": 7.82910801199587e-07, + "loss": 0.841, + "step": 16460 + }, + { + "epoch": 0.2563304194851596, + "grad_norm": 5.38809061050415, + "learning_rate": 7.82828862194982e-07, + "loss": 0.751, + "step": 16465 + }, + { + "epoch": 0.256408260487129, + "grad_norm": 3.7504756450653076, + "learning_rate": 7.82746923190377e-07, + "loss": 0.7829, + "step": 16470 + }, + { + "epoch": 0.2564861014890984, + "grad_norm": 3.9122817516326904, + "learning_rate": 7.826649841857721e-07, + "loss": 0.7979, + "step": 16475 + }, + { + "epoch": 0.25656394249106773, + "grad_norm": 3.9444596767425537, + "learning_rate": 7.825830451811672e-07, + "loss": 0.8804, + "step": 16480 + }, + { + "epoch": 0.2566417834930371, + "grad_norm": 4.131224155426025, + "learning_rate": 7.825011061765621e-07, + "loss": 0.7349, + "step": 16485 + }, + { + "epoch": 0.2567196244950065, + "grad_norm": 4.119733810424805, + "learning_rate": 7.824191671719571e-07, + "loss": 0.612, + "step": 16490 + }, + { + "epoch": 0.25679746549697585, + "grad_norm": 5.177633762359619, + "learning_rate": 7.823372281673522e-07, + "loss": 0.7003, + "step": 16495 + }, + { + "epoch": 0.25687530649894524, + "grad_norm": 5.1837992668151855, + "learning_rate": 7.822552891627471e-07, + "loss": 0.7929, + "step": 16500 + }, + { + "epoch": 0.25695314750091464, + "grad_norm": 6.409270763397217, + "learning_rate": 7.821733501581422e-07, + "loss": 0.7425, + "step": 16505 + }, + { + "epoch": 0.25703098850288403, + "grad_norm": 3.130068063735962, + "learning_rate": 7.820914111535373e-07, + "loss": 0.6838, + "step": 16510 + }, + { + "epoch": 0.25710882950485336, + "grad_norm": 2.76039719581604, + "learning_rate": 7.820094721489324e-07, + "loss": 0.7023, + "step": 16515 + }, + { + "epoch": 0.25718667050682276, + "grad_norm": 4.199296474456787, + "learning_rate": 7.819275331443273e-07, + "loss": 0.777, + "step": 16520 + }, + { + "epoch": 0.25726451150879215, + "grad_norm": 2.0903480052948, + "learning_rate": 7.818455941397224e-07, + "loss": 0.6523, + "step": 16525 + }, + { + "epoch": 0.25734235251076154, + "grad_norm": 3.675006866455078, + "learning_rate": 7.817636551351174e-07, + "loss": 0.7007, + "step": 16530 + }, + { + "epoch": 0.2574201935127309, + "grad_norm": 5.696536540985107, + "learning_rate": 7.816817161305123e-07, + "loss": 0.822, + "step": 16535 + }, + { + "epoch": 0.25749803451470027, + "grad_norm": 3.226651668548584, + "learning_rate": 7.815997771259074e-07, + "loss": 0.7139, + "step": 16540 + }, + { + "epoch": 0.25757587551666966, + "grad_norm": 4.20841121673584, + "learning_rate": 7.815178381213025e-07, + "loss": 0.795, + "step": 16545 + }, + { + "epoch": 0.25765371651863905, + "grad_norm": 7.1250834465026855, + "learning_rate": 7.814358991166975e-07, + "loss": 0.7926, + "step": 16550 + }, + { + "epoch": 0.2577315575206084, + "grad_norm": 3.3494808673858643, + "learning_rate": 7.813539601120925e-07, + "loss": 0.8161, + "step": 16555 + }, + { + "epoch": 0.2578093985225778, + "grad_norm": 3.7074177265167236, + "learning_rate": 7.812720211074876e-07, + "loss": 0.8033, + "step": 16560 + }, + { + "epoch": 0.2578872395245472, + "grad_norm": 3.2563693523406982, + "learning_rate": 7.811900821028826e-07, + "loss": 0.9385, + "step": 16565 + }, + { + "epoch": 0.2579650805265165, + "grad_norm": 4.425312042236328, + "learning_rate": 7.811081430982777e-07, + "loss": 0.7872, + "step": 16570 + }, + { + "epoch": 0.2580429215284859, + "grad_norm": 4.398519515991211, + "learning_rate": 7.810262040936726e-07, + "loss": 0.6697, + "step": 16575 + }, + { + "epoch": 0.2581207625304553, + "grad_norm": 2.759260892868042, + "learning_rate": 7.809442650890676e-07, + "loss": 0.7034, + "step": 16580 + }, + { + "epoch": 0.2581986035324247, + "grad_norm": 6.658419609069824, + "learning_rate": 7.808623260844627e-07, + "loss": 0.8203, + "step": 16585 + }, + { + "epoch": 0.258276444534394, + "grad_norm": 2.738607406616211, + "learning_rate": 7.807803870798577e-07, + "loss": 0.7464, + "step": 16590 + }, + { + "epoch": 0.2583542855363634, + "grad_norm": 3.0051686763763428, + "learning_rate": 7.806984480752527e-07, + "loss": 0.7817, + "step": 16595 + }, + { + "epoch": 0.2584321265383328, + "grad_norm": 3.117178440093994, + "learning_rate": 7.806165090706478e-07, + "loss": 0.79, + "step": 16600 + }, + { + "epoch": 0.2585099675403022, + "grad_norm": 4.8895158767700195, + "learning_rate": 7.805345700660429e-07, + "loss": 0.7484, + "step": 16605 + }, + { + "epoch": 0.25858780854227154, + "grad_norm": 3.8371946811676025, + "learning_rate": 7.804526310614378e-07, + "loss": 0.6784, + "step": 16610 + }, + { + "epoch": 0.2586656495442409, + "grad_norm": 3.57291579246521, + "learning_rate": 7.803706920568328e-07, + "loss": 0.7957, + "step": 16615 + }, + { + "epoch": 0.2587434905462103, + "grad_norm": 3.827907085418701, + "learning_rate": 7.802887530522279e-07, + "loss": 0.7731, + "step": 16620 + }, + { + "epoch": 0.2588213315481797, + "grad_norm": 3.4150476455688477, + "learning_rate": 7.802068140476228e-07, + "loss": 0.9146, + "step": 16625 + }, + { + "epoch": 0.25889917255014905, + "grad_norm": 5.377663612365723, + "learning_rate": 7.801248750430179e-07, + "loss": 0.7602, + "step": 16630 + }, + { + "epoch": 0.25897701355211844, + "grad_norm": 4.645751476287842, + "learning_rate": 7.80042936038413e-07, + "loss": 0.8021, + "step": 16635 + }, + { + "epoch": 0.25905485455408783, + "grad_norm": 6.805863380432129, + "learning_rate": 7.799609970338081e-07, + "loss": 0.7908, + "step": 16640 + }, + { + "epoch": 0.2591326955560572, + "grad_norm": 2.4049344062805176, + "learning_rate": 7.79879058029203e-07, + "loss": 0.7619, + "step": 16645 + }, + { + "epoch": 0.25921053655802656, + "grad_norm": 3.3766093254089355, + "learning_rate": 7.797971190245981e-07, + "loss": 0.7428, + "step": 16650 + }, + { + "epoch": 0.25928837755999595, + "grad_norm": 4.741754531860352, + "learning_rate": 7.797151800199931e-07, + "loss": 0.8635, + "step": 16655 + }, + { + "epoch": 0.25936621856196534, + "grad_norm": 2.6133670806884766, + "learning_rate": 7.79633241015388e-07, + "loss": 0.6963, + "step": 16660 + }, + { + "epoch": 0.2594440595639347, + "grad_norm": 4.859498977661133, + "learning_rate": 7.795513020107831e-07, + "loss": 0.7334, + "step": 16665 + }, + { + "epoch": 0.2595219005659041, + "grad_norm": 3.187589406967163, + "learning_rate": 7.794693630061782e-07, + "loss": 0.7591, + "step": 16670 + }, + { + "epoch": 0.25959974156787347, + "grad_norm": 6.532161235809326, + "learning_rate": 7.793874240015732e-07, + "loss": 0.7892, + "step": 16675 + }, + { + "epoch": 0.25967758256984286, + "grad_norm": 4.1265177726745605, + "learning_rate": 7.793054849969682e-07, + "loss": 0.7637, + "step": 16680 + }, + { + "epoch": 0.2597554235718122, + "grad_norm": 5.0222649574279785, + "learning_rate": 7.792235459923633e-07, + "loss": 0.7339, + "step": 16685 + }, + { + "epoch": 0.2598332645737816, + "grad_norm": 3.554102659225464, + "learning_rate": 7.791416069877583e-07, + "loss": 0.821, + "step": 16690 + }, + { + "epoch": 0.259911105575751, + "grad_norm": 3.7950658798217773, + "learning_rate": 7.790596679831533e-07, + "loss": 0.8341, + "step": 16695 + }, + { + "epoch": 0.25998894657772037, + "grad_norm": 3.3983871936798096, + "learning_rate": 7.789777289785483e-07, + "loss": 0.7982, + "step": 16700 + }, + { + "epoch": 0.2600667875796897, + "grad_norm": 3.4833576679229736, + "learning_rate": 7.788957899739433e-07, + "loss": 0.8516, + "step": 16705 + }, + { + "epoch": 0.2601446285816591, + "grad_norm": 2.6888585090637207, + "learning_rate": 7.788138509693384e-07, + "loss": 0.6446, + "step": 16710 + }, + { + "epoch": 0.2602224695836285, + "grad_norm": 3.0143544673919678, + "learning_rate": 7.787319119647335e-07, + "loss": 0.8344, + "step": 16715 + }, + { + "epoch": 0.2603003105855979, + "grad_norm": 9.82540512084961, + "learning_rate": 7.786499729601284e-07, + "loss": 0.7017, + "step": 16720 + }, + { + "epoch": 0.2603781515875672, + "grad_norm": 3.5165607929229736, + "learning_rate": 7.785680339555235e-07, + "loss": 0.6927, + "step": 16725 + }, + { + "epoch": 0.2604559925895366, + "grad_norm": 3.0352158546447754, + "learning_rate": 7.784860949509186e-07, + "loss": 0.8134, + "step": 16730 + }, + { + "epoch": 0.260533833591506, + "grad_norm": 2.934072494506836, + "learning_rate": 7.784041559463134e-07, + "loss": 0.7767, + "step": 16735 + }, + { + "epoch": 0.26061167459347534, + "grad_norm": 5.560964107513428, + "learning_rate": 7.783222169417085e-07, + "loss": 0.835, + "step": 16740 + }, + { + "epoch": 0.26068951559544473, + "grad_norm": 6.389545917510986, + "learning_rate": 7.782402779371036e-07, + "loss": 0.8353, + "step": 16745 + }, + { + "epoch": 0.2607673565974141, + "grad_norm": 2.7560980319976807, + "learning_rate": 7.781583389324986e-07, + "loss": 0.7777, + "step": 16750 + }, + { + "epoch": 0.2608451975993835, + "grad_norm": 5.903055667877197, + "learning_rate": 7.780763999278936e-07, + "loss": 0.8077, + "step": 16755 + }, + { + "epoch": 0.26092303860135285, + "grad_norm": 3.2455592155456543, + "learning_rate": 7.779944609232887e-07, + "loss": 0.8842, + "step": 16760 + }, + { + "epoch": 0.26100087960332224, + "grad_norm": 3.2503061294555664, + "learning_rate": 7.779125219186838e-07, + "loss": 0.7527, + "step": 16765 + }, + { + "epoch": 0.26107872060529164, + "grad_norm": 13.79643440246582, + "learning_rate": 7.778305829140787e-07, + "loss": 0.7926, + "step": 16770 + }, + { + "epoch": 0.26115656160726103, + "grad_norm": 4.413746356964111, + "learning_rate": 7.777486439094737e-07, + "loss": 0.6204, + "step": 16775 + }, + { + "epoch": 0.26123440260923037, + "grad_norm": 4.079379081726074, + "learning_rate": 7.776667049048688e-07, + "loss": 0.8961, + "step": 16780 + }, + { + "epoch": 0.26131224361119976, + "grad_norm": 3.8018133640289307, + "learning_rate": 7.775847659002638e-07, + "loss": 0.7947, + "step": 16785 + }, + { + "epoch": 0.26139008461316915, + "grad_norm": 3.7187345027923584, + "learning_rate": 7.775028268956588e-07, + "loss": 0.7183, + "step": 16790 + }, + { + "epoch": 0.26146792561513854, + "grad_norm": 3.7661221027374268, + "learning_rate": 7.774208878910539e-07, + "loss": 0.7923, + "step": 16795 + }, + { + "epoch": 0.2615457666171079, + "grad_norm": 4.487337112426758, + "learning_rate": 7.773389488864489e-07, + "loss": 0.8413, + "step": 16800 + }, + { + "epoch": 0.26162360761907727, + "grad_norm": 3.0049211978912354, + "learning_rate": 7.77257009881844e-07, + "loss": 0.7203, + "step": 16805 + }, + { + "epoch": 0.26170144862104666, + "grad_norm": 9.142400741577148, + "learning_rate": 7.77175070877239e-07, + "loss": 0.7582, + "step": 16810 + }, + { + "epoch": 0.26177928962301605, + "grad_norm": 6.436301231384277, + "learning_rate": 7.770931318726339e-07, + "loss": 0.651, + "step": 16815 + }, + { + "epoch": 0.2618571306249854, + "grad_norm": 11.26479434967041, + "learning_rate": 7.77011192868029e-07, + "loss": 0.7751, + "step": 16820 + }, + { + "epoch": 0.2619349716269548, + "grad_norm": 4.235225200653076, + "learning_rate": 7.76929253863424e-07, + "loss": 0.7435, + "step": 16825 + }, + { + "epoch": 0.2620128126289242, + "grad_norm": 3.807072162628174, + "learning_rate": 7.76847314858819e-07, + "loss": 0.7725, + "step": 16830 + }, + { + "epoch": 0.2620906536308935, + "grad_norm": 3.0418660640716553, + "learning_rate": 7.767653758542141e-07, + "loss": 0.7339, + "step": 16835 + }, + { + "epoch": 0.2621684946328629, + "grad_norm": 4.268150329589844, + "learning_rate": 7.766834368496092e-07, + "loss": 0.7511, + "step": 16840 + }, + { + "epoch": 0.2622463356348323, + "grad_norm": 3.158903121948242, + "learning_rate": 7.766014978450041e-07, + "loss": 0.8017, + "step": 16845 + }, + { + "epoch": 0.2623241766368017, + "grad_norm": 2.870638132095337, + "learning_rate": 7.765195588403992e-07, + "loss": 0.8768, + "step": 16850 + }, + { + "epoch": 0.262402017638771, + "grad_norm": 5.223047733306885, + "learning_rate": 7.764376198357942e-07, + "loss": 0.7877, + "step": 16855 + }, + { + "epoch": 0.2624798586407404, + "grad_norm": 4.719412326812744, + "learning_rate": 7.763556808311891e-07, + "loss": 0.7911, + "step": 16860 + }, + { + "epoch": 0.2625576996427098, + "grad_norm": 4.325451850891113, + "learning_rate": 7.762737418265842e-07, + "loss": 0.8931, + "step": 16865 + }, + { + "epoch": 0.2626355406446792, + "grad_norm": 7.159557342529297, + "learning_rate": 7.761918028219793e-07, + "loss": 0.718, + "step": 16870 + }, + { + "epoch": 0.26271338164664854, + "grad_norm": 3.957003116607666, + "learning_rate": 7.761098638173743e-07, + "loss": 0.8571, + "step": 16875 + }, + { + "epoch": 0.26279122264861793, + "grad_norm": 3.6029467582702637, + "learning_rate": 7.760279248127693e-07, + "loss": 0.8155, + "step": 16880 + }, + { + "epoch": 0.2628690636505873, + "grad_norm": 3.1437227725982666, + "learning_rate": 7.759459858081644e-07, + "loss": 0.7031, + "step": 16885 + }, + { + "epoch": 0.2629469046525567, + "grad_norm": 8.502701759338379, + "learning_rate": 7.758640468035595e-07, + "loss": 0.7562, + "step": 16890 + }, + { + "epoch": 0.26302474565452605, + "grad_norm": 4.806173324584961, + "learning_rate": 7.757821077989545e-07, + "loss": 0.8258, + "step": 16895 + }, + { + "epoch": 0.26310258665649544, + "grad_norm": 15.015897750854492, + "learning_rate": 7.757001687943494e-07, + "loss": 0.7796, + "step": 16900 + }, + { + "epoch": 0.26318042765846483, + "grad_norm": 4.395715236663818, + "learning_rate": 7.756182297897445e-07, + "loss": 0.6802, + "step": 16905 + }, + { + "epoch": 0.2632582686604342, + "grad_norm": 4.44493293762207, + "learning_rate": 7.755362907851395e-07, + "loss": 0.6893, + "step": 16910 + }, + { + "epoch": 0.26333610966240356, + "grad_norm": 4.132498264312744, + "learning_rate": 7.754543517805345e-07, + "loss": 0.7528, + "step": 16915 + }, + { + "epoch": 0.26341395066437295, + "grad_norm": 4.151634693145752, + "learning_rate": 7.753724127759296e-07, + "loss": 0.7714, + "step": 16920 + }, + { + "epoch": 0.26349179166634235, + "grad_norm": 3.9134562015533447, + "learning_rate": 7.752904737713246e-07, + "loss": 0.7701, + "step": 16925 + }, + { + "epoch": 0.2635696326683117, + "grad_norm": 7.147616386413574, + "learning_rate": 7.752085347667197e-07, + "loss": 0.8402, + "step": 16930 + }, + { + "epoch": 0.2636474736702811, + "grad_norm": 2.874391555786133, + "learning_rate": 7.751265957621147e-07, + "loss": 0.8452, + "step": 16935 + }, + { + "epoch": 0.26372531467225047, + "grad_norm": 3.2869579792022705, + "learning_rate": 7.750446567575096e-07, + "loss": 0.7295, + "step": 16940 + }, + { + "epoch": 0.26380315567421986, + "grad_norm": 4.201857566833496, + "learning_rate": 7.749627177529047e-07, + "loss": 0.6901, + "step": 16945 + }, + { + "epoch": 0.2638809966761892, + "grad_norm": 6.836452960968018, + "learning_rate": 7.748807787482997e-07, + "loss": 0.8212, + "step": 16950 + }, + { + "epoch": 0.2639588376781586, + "grad_norm": 3.0617053508758545, + "learning_rate": 7.747988397436947e-07, + "loss": 0.6808, + "step": 16955 + }, + { + "epoch": 0.264036678680128, + "grad_norm": 3.325904607772827, + "learning_rate": 7.747169007390898e-07, + "loss": 0.7128, + "step": 16960 + }, + { + "epoch": 0.26411451968209737, + "grad_norm": 10.036579132080078, + "learning_rate": 7.746349617344849e-07, + "loss": 0.7599, + "step": 16965 + }, + { + "epoch": 0.2641923606840667, + "grad_norm": 3.4642529487609863, + "learning_rate": 7.745530227298798e-07, + "loss": 0.7073, + "step": 16970 + }, + { + "epoch": 0.2642702016860361, + "grad_norm": 4.849530220031738, + "learning_rate": 7.744710837252749e-07, + "loss": 0.7978, + "step": 16975 + }, + { + "epoch": 0.2643480426880055, + "grad_norm": 4.592496871948242, + "learning_rate": 7.743891447206699e-07, + "loss": 0.8337, + "step": 16980 + }, + { + "epoch": 0.2644258836899749, + "grad_norm": 7.480959892272949, + "learning_rate": 7.743072057160648e-07, + "loss": 0.6988, + "step": 16985 + }, + { + "epoch": 0.2645037246919442, + "grad_norm": 3.0004184246063232, + "learning_rate": 7.742252667114599e-07, + "loss": 0.706, + "step": 16990 + }, + { + "epoch": 0.2645815656939136, + "grad_norm": 5.580563545227051, + "learning_rate": 7.74143327706855e-07, + "loss": 0.7985, + "step": 16995 + }, + { + "epoch": 0.264659406695883, + "grad_norm": 3.1316614151000977, + "learning_rate": 7.7406138870225e-07, + "loss": 0.7035, + "step": 17000 + }, + { + "epoch": 0.26473724769785234, + "grad_norm": 3.28928804397583, + "learning_rate": 7.73979449697645e-07, + "loss": 0.8902, + "step": 17005 + }, + { + "epoch": 0.26481508869982173, + "grad_norm": 3.257534980773926, + "learning_rate": 7.738975106930401e-07, + "loss": 0.666, + "step": 17010 + }, + { + "epoch": 0.2648929297017911, + "grad_norm": 3.9892382621765137, + "learning_rate": 7.738155716884352e-07, + "loss": 0.7318, + "step": 17015 + }, + { + "epoch": 0.2649707707037605, + "grad_norm": 4.157562732696533, + "learning_rate": 7.737336326838301e-07, + "loss": 0.7503, + "step": 17020 + }, + { + "epoch": 0.26504861170572985, + "grad_norm": 3.515887498855591, + "learning_rate": 7.736516936792251e-07, + "loss": 0.7764, + "step": 17025 + }, + { + "epoch": 0.26512645270769924, + "grad_norm": 6.801537990570068, + "learning_rate": 7.735697546746202e-07, + "loss": 0.7394, + "step": 17030 + }, + { + "epoch": 0.26520429370966864, + "grad_norm": 5.358715534210205, + "learning_rate": 7.734878156700152e-07, + "loss": 0.7025, + "step": 17035 + }, + { + "epoch": 0.26528213471163803, + "grad_norm": 3.9038212299346924, + "learning_rate": 7.734058766654102e-07, + "loss": 0.7226, + "step": 17040 + }, + { + "epoch": 0.26535997571360737, + "grad_norm": 2.7967400550842285, + "learning_rate": 7.733239376608053e-07, + "loss": 0.7166, + "step": 17045 + }, + { + "epoch": 0.26543781671557676, + "grad_norm": 4.307606220245361, + "learning_rate": 7.732419986562003e-07, + "loss": 0.8099, + "step": 17050 + }, + { + "epoch": 0.26551565771754615, + "grad_norm": 3.752150058746338, + "learning_rate": 7.731600596515954e-07, + "loss": 0.7633, + "step": 17055 + }, + { + "epoch": 0.26559349871951554, + "grad_norm": 4.0929412841796875, + "learning_rate": 7.730781206469903e-07, + "loss": 0.8023, + "step": 17060 + }, + { + "epoch": 0.2656713397214849, + "grad_norm": 3.6266283988952637, + "learning_rate": 7.729961816423853e-07, + "loss": 0.8214, + "step": 17065 + }, + { + "epoch": 0.26574918072345427, + "grad_norm": 2.470479726791382, + "learning_rate": 7.729142426377804e-07, + "loss": 0.7045, + "step": 17070 + }, + { + "epoch": 0.26582702172542366, + "grad_norm": 3.274996519088745, + "learning_rate": 7.728323036331755e-07, + "loss": 0.7351, + "step": 17075 + }, + { + "epoch": 0.26590486272739305, + "grad_norm": 3.2078683376312256, + "learning_rate": 7.727503646285704e-07, + "loss": 0.7526, + "step": 17080 + }, + { + "epoch": 0.2659827037293624, + "grad_norm": 6.345730304718018, + "learning_rate": 7.726684256239655e-07, + "loss": 0.6919, + "step": 17085 + }, + { + "epoch": 0.2660605447313318, + "grad_norm": 3.1593596935272217, + "learning_rate": 7.725864866193606e-07, + "loss": 0.7087, + "step": 17090 + }, + { + "epoch": 0.2661383857333012, + "grad_norm": 4.418591022491455, + "learning_rate": 7.725045476147555e-07, + "loss": 0.8193, + "step": 17095 + }, + { + "epoch": 0.2662162267352705, + "grad_norm": 3.5372109413146973, + "learning_rate": 7.724226086101505e-07, + "loss": 0.7493, + "step": 17100 + }, + { + "epoch": 0.2662940677372399, + "grad_norm": 3.8144540786743164, + "learning_rate": 7.723406696055456e-07, + "loss": 0.9164, + "step": 17105 + }, + { + "epoch": 0.2663719087392093, + "grad_norm": 3.1666834354400635, + "learning_rate": 7.722587306009406e-07, + "loss": 0.8379, + "step": 17110 + }, + { + "epoch": 0.2664497497411787, + "grad_norm": 4.0041375160217285, + "learning_rate": 7.721767915963356e-07, + "loss": 0.7178, + "step": 17115 + }, + { + "epoch": 0.266527590743148, + "grad_norm": 8.216415405273438, + "learning_rate": 7.720948525917307e-07, + "loss": 0.692, + "step": 17120 + }, + { + "epoch": 0.2666054317451174, + "grad_norm": 7.888249397277832, + "learning_rate": 7.720129135871257e-07, + "loss": 0.8232, + "step": 17125 + }, + { + "epoch": 0.2666832727470868, + "grad_norm": 7.171913146972656, + "learning_rate": 7.719309745825207e-07, + "loss": 0.8611, + "step": 17130 + }, + { + "epoch": 0.2667611137490562, + "grad_norm": 3.7759389877319336, + "learning_rate": 7.718490355779158e-07, + "loss": 0.8469, + "step": 17135 + }, + { + "epoch": 0.26683895475102554, + "grad_norm": 4.734642028808594, + "learning_rate": 7.717670965733107e-07, + "loss": 0.7777, + "step": 17140 + }, + { + "epoch": 0.26691679575299493, + "grad_norm": 4.96793270111084, + "learning_rate": 7.716851575687058e-07, + "loss": 0.843, + "step": 17145 + }, + { + "epoch": 0.2669946367549643, + "grad_norm": 6.541609764099121, + "learning_rate": 7.716032185641008e-07, + "loss": 0.8751, + "step": 17150 + }, + { + "epoch": 0.2670724777569337, + "grad_norm": 4.108456134796143, + "learning_rate": 7.715212795594959e-07, + "loss": 0.8128, + "step": 17155 + }, + { + "epoch": 0.26715031875890305, + "grad_norm": 3.626671314239502, + "learning_rate": 7.714393405548909e-07, + "loss": 0.7681, + "step": 17160 + }, + { + "epoch": 0.26722815976087244, + "grad_norm": 4.041867733001709, + "learning_rate": 7.71357401550286e-07, + "loss": 0.7371, + "step": 17165 + }, + { + "epoch": 0.26730600076284183, + "grad_norm": 3.3305304050445557, + "learning_rate": 7.71275462545681e-07, + "loss": 0.8206, + "step": 17170 + }, + { + "epoch": 0.26738384176481117, + "grad_norm": 6.281455039978027, + "learning_rate": 7.71193523541076e-07, + "loss": 0.6028, + "step": 17175 + }, + { + "epoch": 0.26746168276678056, + "grad_norm": 2.861084222793579, + "learning_rate": 7.711115845364711e-07, + "loss": 0.7768, + "step": 17180 + }, + { + "epoch": 0.26753952376874995, + "grad_norm": 4.843693733215332, + "learning_rate": 7.71029645531866e-07, + "loss": 0.8696, + "step": 17185 + }, + { + "epoch": 0.26761736477071935, + "grad_norm": 2.4090218544006348, + "learning_rate": 7.70947706527261e-07, + "loss": 0.729, + "step": 17190 + }, + { + "epoch": 0.2676952057726887, + "grad_norm": 7.0648651123046875, + "learning_rate": 7.708657675226561e-07, + "loss": 0.7541, + "step": 17195 + }, + { + "epoch": 0.2677730467746581, + "grad_norm": 5.795253276824951, + "learning_rate": 7.707838285180512e-07, + "loss": 0.7521, + "step": 17200 + }, + { + "epoch": 0.26785088777662747, + "grad_norm": 2.898508310317993, + "learning_rate": 7.707018895134461e-07, + "loss": 0.6649, + "step": 17205 + }, + { + "epoch": 0.26792872877859686, + "grad_norm": 4.25831937789917, + "learning_rate": 7.706199505088412e-07, + "loss": 0.7483, + "step": 17210 + }, + { + "epoch": 0.2680065697805662, + "grad_norm": 3.3910627365112305, + "learning_rate": 7.705380115042363e-07, + "loss": 0.7649, + "step": 17215 + }, + { + "epoch": 0.2680844107825356, + "grad_norm": 3.8831562995910645, + "learning_rate": 7.704560724996312e-07, + "loss": 0.9277, + "step": 17220 + }, + { + "epoch": 0.268162251784505, + "grad_norm": 3.593287229537964, + "learning_rate": 7.703741334950262e-07, + "loss": 0.6956, + "step": 17225 + }, + { + "epoch": 0.26824009278647437, + "grad_norm": 4.377844333648682, + "learning_rate": 7.702921944904213e-07, + "loss": 0.7818, + "step": 17230 + }, + { + "epoch": 0.2683179337884437, + "grad_norm": 3.796743392944336, + "learning_rate": 7.702102554858163e-07, + "loss": 0.9001, + "step": 17235 + }, + { + "epoch": 0.2683957747904131, + "grad_norm": 3.4302916526794434, + "learning_rate": 7.701283164812113e-07, + "loss": 0.7686, + "step": 17240 + }, + { + "epoch": 0.2684736157923825, + "grad_norm": 3.4582180976867676, + "learning_rate": 7.700463774766064e-07, + "loss": 0.778, + "step": 17245 + }, + { + "epoch": 0.2685514567943519, + "grad_norm": 3.298583507537842, + "learning_rate": 7.699644384720014e-07, + "loss": 0.6798, + "step": 17250 + }, + { + "epoch": 0.2686292977963212, + "grad_norm": 3.020228624343872, + "learning_rate": 7.698824994673965e-07, + "loss": 0.7602, + "step": 17255 + }, + { + "epoch": 0.2687071387982906, + "grad_norm": 3.2028045654296875, + "learning_rate": 7.698005604627915e-07, + "loss": 0.8621, + "step": 17260 + }, + { + "epoch": 0.26878497980026, + "grad_norm": 5.402839660644531, + "learning_rate": 7.697186214581865e-07, + "loss": 0.784, + "step": 17265 + }, + { + "epoch": 0.26886282080222934, + "grad_norm": 6.431519508361816, + "learning_rate": 7.696366824535815e-07, + "loss": 0.7682, + "step": 17270 + }, + { + "epoch": 0.26894066180419873, + "grad_norm": 3.140960931777954, + "learning_rate": 7.695547434489765e-07, + "loss": 0.6265, + "step": 17275 + }, + { + "epoch": 0.2690185028061681, + "grad_norm": 2.3430335521698, + "learning_rate": 7.694728044443716e-07, + "loss": 0.7718, + "step": 17280 + }, + { + "epoch": 0.2690963438081375, + "grad_norm": 3.5215811729431152, + "learning_rate": 7.693908654397666e-07, + "loss": 0.802, + "step": 17285 + }, + { + "epoch": 0.26917418481010685, + "grad_norm": 3.7115468978881836, + "learning_rate": 7.693089264351617e-07, + "loss": 0.7033, + "step": 17290 + }, + { + "epoch": 0.26925202581207625, + "grad_norm": 4.185376167297363, + "learning_rate": 7.692269874305567e-07, + "loss": 0.7204, + "step": 17295 + }, + { + "epoch": 0.26932986681404564, + "grad_norm": 6.7096123695373535, + "learning_rate": 7.691450484259517e-07, + "loss": 0.7501, + "step": 17300 + }, + { + "epoch": 0.26940770781601503, + "grad_norm": 3.2766761779785156, + "learning_rate": 7.690631094213467e-07, + "loss": 0.7548, + "step": 17305 + }, + { + "epoch": 0.26948554881798437, + "grad_norm": 2.9786577224731445, + "learning_rate": 7.689811704167417e-07, + "loss": 0.944, + "step": 17310 + }, + { + "epoch": 0.26956338981995376, + "grad_norm": 4.937072277069092, + "learning_rate": 7.688992314121367e-07, + "loss": 0.728, + "step": 17315 + }, + { + "epoch": 0.26964123082192315, + "grad_norm": 3.2550084590911865, + "learning_rate": 7.688172924075318e-07, + "loss": 0.8478, + "step": 17320 + }, + { + "epoch": 0.26971907182389254, + "grad_norm": 11.952690124511719, + "learning_rate": 7.687353534029269e-07, + "loss": 0.7935, + "step": 17325 + }, + { + "epoch": 0.2697969128258619, + "grad_norm": 8.415304183959961, + "learning_rate": 7.686534143983218e-07, + "loss": 0.8136, + "step": 17330 + }, + { + "epoch": 0.26987475382783127, + "grad_norm": 3.8239519596099854, + "learning_rate": 7.685714753937169e-07, + "loss": 0.9237, + "step": 17335 + }, + { + "epoch": 0.26995259482980066, + "grad_norm": 6.529261112213135, + "learning_rate": 7.68489536389112e-07, + "loss": 0.8137, + "step": 17340 + }, + { + "epoch": 0.27003043583177, + "grad_norm": 4.317914009094238, + "learning_rate": 7.684075973845068e-07, + "loss": 0.9643, + "step": 17345 + }, + { + "epoch": 0.2701082768337394, + "grad_norm": 6.890023708343506, + "learning_rate": 7.683256583799019e-07, + "loss": 0.6974, + "step": 17350 + }, + { + "epoch": 0.2701861178357088, + "grad_norm": 3.332381248474121, + "learning_rate": 7.68243719375297e-07, + "loss": 0.8654, + "step": 17355 + }, + { + "epoch": 0.2702639588376782, + "grad_norm": 4.557866096496582, + "learning_rate": 7.68161780370692e-07, + "loss": 0.6106, + "step": 17360 + }, + { + "epoch": 0.2703417998396475, + "grad_norm": 4.092104434967041, + "learning_rate": 7.68079841366087e-07, + "loss": 0.7621, + "step": 17365 + }, + { + "epoch": 0.2704196408416169, + "grad_norm": 3.59782075881958, + "learning_rate": 7.679979023614821e-07, + "loss": 0.8053, + "step": 17370 + }, + { + "epoch": 0.2704974818435863, + "grad_norm": 4.809222221374512, + "learning_rate": 7.679159633568771e-07, + "loss": 0.871, + "step": 17375 + }, + { + "epoch": 0.2705753228455557, + "grad_norm": 3.0703799724578857, + "learning_rate": 7.678340243522722e-07, + "loss": 0.8575, + "step": 17380 + }, + { + "epoch": 0.270653163847525, + "grad_norm": 3.1767992973327637, + "learning_rate": 7.677520853476671e-07, + "loss": 0.8037, + "step": 17385 + }, + { + "epoch": 0.2707310048494944, + "grad_norm": 10.665190696716309, + "learning_rate": 7.676701463430622e-07, + "loss": 0.7498, + "step": 17390 + }, + { + "epoch": 0.2708088458514638, + "grad_norm": 2.9556710720062256, + "learning_rate": 7.675882073384572e-07, + "loss": 0.7574, + "step": 17395 + }, + { + "epoch": 0.2708866868534332, + "grad_norm": 5.384457588195801, + "learning_rate": 7.675062683338522e-07, + "loss": 0.7484, + "step": 17400 + }, + { + "epoch": 0.27096452785540254, + "grad_norm": 3.542102575302124, + "learning_rate": 7.674243293292473e-07, + "loss": 0.7361, + "step": 17405 + }, + { + "epoch": 0.27104236885737193, + "grad_norm": 3.0414071083068848, + "learning_rate": 7.673423903246423e-07, + "loss": 0.7546, + "step": 17410 + }, + { + "epoch": 0.2711202098593413, + "grad_norm": 3.4305264949798584, + "learning_rate": 7.672604513200374e-07, + "loss": 0.8032, + "step": 17415 + }, + { + "epoch": 0.2711980508613107, + "grad_norm": 3.261608123779297, + "learning_rate": 7.671785123154324e-07, + "loss": 0.7438, + "step": 17420 + }, + { + "epoch": 0.27127589186328005, + "grad_norm": 3.1833226680755615, + "learning_rate": 7.670965733108273e-07, + "loss": 0.7677, + "step": 17425 + }, + { + "epoch": 0.27135373286524944, + "grad_norm": 5.976661205291748, + "learning_rate": 7.670146343062224e-07, + "loss": 0.7372, + "step": 17430 + }, + { + "epoch": 0.27143157386721883, + "grad_norm": 16.816925048828125, + "learning_rate": 7.669326953016175e-07, + "loss": 0.7339, + "step": 17435 + }, + { + "epoch": 0.27150941486918817, + "grad_norm": 4.602113723754883, + "learning_rate": 7.668507562970124e-07, + "loss": 0.7989, + "step": 17440 + }, + { + "epoch": 0.27158725587115756, + "grad_norm": 5.182931423187256, + "learning_rate": 7.667688172924075e-07, + "loss": 0.7642, + "step": 17445 + }, + { + "epoch": 0.27166509687312695, + "grad_norm": 5.678241729736328, + "learning_rate": 7.666868782878026e-07, + "loss": 0.7479, + "step": 17450 + }, + { + "epoch": 0.27174293787509635, + "grad_norm": 3.3905954360961914, + "learning_rate": 7.666049392831975e-07, + "loss": 0.7594, + "step": 17455 + }, + { + "epoch": 0.2718207788770657, + "grad_norm": 8.445892333984375, + "learning_rate": 7.665230002785926e-07, + "loss": 0.8912, + "step": 17460 + }, + { + "epoch": 0.2718986198790351, + "grad_norm": 2.7368478775024414, + "learning_rate": 7.664410612739876e-07, + "loss": 0.7081, + "step": 17465 + }, + { + "epoch": 0.27197646088100447, + "grad_norm": 3.5980234146118164, + "learning_rate": 7.663591222693826e-07, + "loss": 0.7346, + "step": 17470 + }, + { + "epoch": 0.27205430188297386, + "grad_norm": 6.195345878601074, + "learning_rate": 7.662771832647776e-07, + "loss": 0.7162, + "step": 17475 + }, + { + "epoch": 0.2721321428849432, + "grad_norm": 8.303228378295898, + "learning_rate": 7.661952442601727e-07, + "loss": 0.8932, + "step": 17480 + }, + { + "epoch": 0.2722099838869126, + "grad_norm": 3.866856813430786, + "learning_rate": 7.661133052555677e-07, + "loss": 0.7422, + "step": 17485 + }, + { + "epoch": 0.272287824888882, + "grad_norm": 3.056715965270996, + "learning_rate": 7.660313662509627e-07, + "loss": 0.7901, + "step": 17490 + }, + { + "epoch": 0.27236566589085137, + "grad_norm": 4.668761730194092, + "learning_rate": 7.659494272463578e-07, + "loss": 0.7008, + "step": 17495 + }, + { + "epoch": 0.2724435068928207, + "grad_norm": 6.530035495758057, + "learning_rate": 7.658674882417528e-07, + "loss": 0.8165, + "step": 17500 + }, + { + "epoch": 0.2725213478947901, + "grad_norm": 2.7709784507751465, + "learning_rate": 7.657855492371479e-07, + "loss": 0.7451, + "step": 17505 + }, + { + "epoch": 0.2725991888967595, + "grad_norm": 2.8162500858306885, + "learning_rate": 7.657036102325428e-07, + "loss": 0.6419, + "step": 17510 + }, + { + "epoch": 0.27267702989872883, + "grad_norm": 5.088579177856445, + "learning_rate": 7.656216712279379e-07, + "loss": 0.7556, + "step": 17515 + }, + { + "epoch": 0.2727548709006982, + "grad_norm": 6.667660713195801, + "learning_rate": 7.655397322233329e-07, + "loss": 0.811, + "step": 17520 + }, + { + "epoch": 0.2728327119026676, + "grad_norm": 3.972935199737549, + "learning_rate": 7.65457793218728e-07, + "loss": 0.7962, + "step": 17525 + }, + { + "epoch": 0.272910552904637, + "grad_norm": 5.444567680358887, + "learning_rate": 7.65375854214123e-07, + "loss": 0.764, + "step": 17530 + }, + { + "epoch": 0.27298839390660634, + "grad_norm": 5.834167957305908, + "learning_rate": 7.65293915209518e-07, + "loss": 0.7616, + "step": 17535 + }, + { + "epoch": 0.27306623490857573, + "grad_norm": 3.4438259601593018, + "learning_rate": 7.652119762049131e-07, + "loss": 0.7918, + "step": 17540 + }, + { + "epoch": 0.2731440759105451, + "grad_norm": 13.500309944152832, + "learning_rate": 7.651300372003081e-07, + "loss": 0.6592, + "step": 17545 + }, + { + "epoch": 0.2732219169125145, + "grad_norm": 6.822749137878418, + "learning_rate": 7.65048098195703e-07, + "loss": 0.7099, + "step": 17550 + }, + { + "epoch": 0.27329975791448385, + "grad_norm": 3.325993537902832, + "learning_rate": 7.649661591910981e-07, + "loss": 0.702, + "step": 17555 + }, + { + "epoch": 0.27337759891645325, + "grad_norm": 3.109536647796631, + "learning_rate": 7.648842201864932e-07, + "loss": 0.6603, + "step": 17560 + }, + { + "epoch": 0.27345543991842264, + "grad_norm": 4.584033012390137, + "learning_rate": 7.648022811818881e-07, + "loss": 0.7757, + "step": 17565 + }, + { + "epoch": 0.27353328092039203, + "grad_norm": 4.5637383460998535, + "learning_rate": 7.647203421772832e-07, + "loss": 0.7424, + "step": 17570 + }, + { + "epoch": 0.27361112192236137, + "grad_norm": 3.220829486846924, + "learning_rate": 7.646384031726783e-07, + "loss": 0.7742, + "step": 17575 + }, + { + "epoch": 0.27368896292433076, + "grad_norm": 2.887474775314331, + "learning_rate": 7.645564641680733e-07, + "loss": 0.6854, + "step": 17580 + }, + { + "epoch": 0.27376680392630015, + "grad_norm": 3.1210684776306152, + "learning_rate": 7.644745251634683e-07, + "loss": 0.7429, + "step": 17585 + }, + { + "epoch": 0.27384464492826954, + "grad_norm": 3.1878061294555664, + "learning_rate": 7.643925861588633e-07, + "loss": 0.7759, + "step": 17590 + }, + { + "epoch": 0.2739224859302389, + "grad_norm": 3.132434368133545, + "learning_rate": 7.643106471542583e-07, + "loss": 0.8279, + "step": 17595 + }, + { + "epoch": 0.27400032693220827, + "grad_norm": 7.325407981872559, + "learning_rate": 7.642287081496533e-07, + "loss": 0.8546, + "step": 17600 + }, + { + "epoch": 0.27407816793417766, + "grad_norm": 2.864562749862671, + "learning_rate": 7.641467691450484e-07, + "loss": 0.7245, + "step": 17605 + }, + { + "epoch": 0.274156008936147, + "grad_norm": 4.206888675689697, + "learning_rate": 7.640648301404434e-07, + "loss": 0.8363, + "step": 17610 + }, + { + "epoch": 0.2742338499381164, + "grad_norm": 3.8253650665283203, + "learning_rate": 7.639828911358385e-07, + "loss": 0.8165, + "step": 17615 + }, + { + "epoch": 0.2743116909400858, + "grad_norm": 4.832096099853516, + "learning_rate": 7.639009521312335e-07, + "loss": 0.799, + "step": 17620 + }, + { + "epoch": 0.2743895319420552, + "grad_norm": 3.5117223262786865, + "learning_rate": 7.638190131266285e-07, + "loss": 0.7837, + "step": 17625 + }, + { + "epoch": 0.2744673729440245, + "grad_norm": 6.5699286460876465, + "learning_rate": 7.637370741220235e-07, + "loss": 0.6992, + "step": 17630 + }, + { + "epoch": 0.2745452139459939, + "grad_norm": 4.263813018798828, + "learning_rate": 7.636551351174185e-07, + "loss": 0.8177, + "step": 17635 + }, + { + "epoch": 0.2746230549479633, + "grad_norm": 3.31231427192688, + "learning_rate": 7.635731961128136e-07, + "loss": 0.7273, + "step": 17640 + }, + { + "epoch": 0.2747008959499327, + "grad_norm": 3.007326602935791, + "learning_rate": 7.634912571082086e-07, + "loss": 0.7857, + "step": 17645 + }, + { + "epoch": 0.274778736951902, + "grad_norm": 3.9053094387054443, + "learning_rate": 7.634093181036037e-07, + "loss": 0.6829, + "step": 17650 + }, + { + "epoch": 0.2748565779538714, + "grad_norm": 4.185614585876465, + "learning_rate": 7.633273790989987e-07, + "loss": 0.6869, + "step": 17655 + }, + { + "epoch": 0.2749344189558408, + "grad_norm": 2.9816734790802, + "learning_rate": 7.632454400943937e-07, + "loss": 0.7999, + "step": 17660 + }, + { + "epoch": 0.2750122599578102, + "grad_norm": 2.800800085067749, + "learning_rate": 7.631635010897888e-07, + "loss": 0.7201, + "step": 17665 + }, + { + "epoch": 0.27509010095977954, + "grad_norm": 4.0509033203125, + "learning_rate": 7.630815620851838e-07, + "loss": 0.7324, + "step": 17670 + }, + { + "epoch": 0.27516794196174893, + "grad_norm": 4.819879531860352, + "learning_rate": 7.629996230805787e-07, + "loss": 0.7498, + "step": 17675 + }, + { + "epoch": 0.2752457829637183, + "grad_norm": 4.175679683685303, + "learning_rate": 7.629176840759738e-07, + "loss": 0.7992, + "step": 17680 + }, + { + "epoch": 0.2753236239656877, + "grad_norm": 5.96135139465332, + "learning_rate": 7.628357450713689e-07, + "loss": 0.608, + "step": 17685 + }, + { + "epoch": 0.27540146496765705, + "grad_norm": 4.20062255859375, + "learning_rate": 7.627538060667638e-07, + "loss": 0.8923, + "step": 17690 + }, + { + "epoch": 0.27547930596962644, + "grad_norm": 7.188648223876953, + "learning_rate": 7.626718670621589e-07, + "loss": 0.877, + "step": 17695 + }, + { + "epoch": 0.27555714697159583, + "grad_norm": 6.603199481964111, + "learning_rate": 7.62589928057554e-07, + "loss": 0.733, + "step": 17700 + }, + { + "epoch": 0.27563498797356517, + "grad_norm": 4.94620418548584, + "learning_rate": 7.62507989052949e-07, + "loss": 0.8015, + "step": 17705 + }, + { + "epoch": 0.27571282897553456, + "grad_norm": 3.5170164108276367, + "learning_rate": 7.624260500483439e-07, + "loss": 0.6238, + "step": 17710 + }, + { + "epoch": 0.27579066997750395, + "grad_norm": 3.5444560050964355, + "learning_rate": 7.62344111043739e-07, + "loss": 0.8081, + "step": 17715 + }, + { + "epoch": 0.27586851097947335, + "grad_norm": 3.9345004558563232, + "learning_rate": 7.62262172039134e-07, + "loss": 0.67, + "step": 17720 + }, + { + "epoch": 0.2759463519814427, + "grad_norm": 3.7450127601623535, + "learning_rate": 7.62180233034529e-07, + "loss": 0.8191, + "step": 17725 + }, + { + "epoch": 0.2760241929834121, + "grad_norm": 5.089782238006592, + "learning_rate": 7.620982940299241e-07, + "loss": 0.7944, + "step": 17730 + }, + { + "epoch": 0.27610203398538147, + "grad_norm": 3.7452938556671143, + "learning_rate": 7.620163550253191e-07, + "loss": 0.8261, + "step": 17735 + }, + { + "epoch": 0.27617987498735086, + "grad_norm": 2.857208013534546, + "learning_rate": 7.619344160207142e-07, + "loss": 0.7092, + "step": 17740 + }, + { + "epoch": 0.2762577159893202, + "grad_norm": 3.1661722660064697, + "learning_rate": 7.618524770161092e-07, + "loss": 0.9596, + "step": 17745 + }, + { + "epoch": 0.2763355569912896, + "grad_norm": 4.01740837097168, + "learning_rate": 7.617705380115041e-07, + "loss": 0.7322, + "step": 17750 + }, + { + "epoch": 0.276413397993259, + "grad_norm": 3.3057632446289062, + "learning_rate": 7.616885990068992e-07, + "loss": 0.7321, + "step": 17755 + }, + { + "epoch": 0.27649123899522837, + "grad_norm": 4.020196437835693, + "learning_rate": 7.616066600022943e-07, + "loss": 0.6533, + "step": 17760 + }, + { + "epoch": 0.2765690799971977, + "grad_norm": 3.5413827896118164, + "learning_rate": 7.615247209976893e-07, + "loss": 0.7581, + "step": 17765 + }, + { + "epoch": 0.2766469209991671, + "grad_norm": 4.459136486053467, + "learning_rate": 7.614427819930843e-07, + "loss": 0.7323, + "step": 17770 + }, + { + "epoch": 0.2767247620011365, + "grad_norm": 3.8716177940368652, + "learning_rate": 7.613608429884794e-07, + "loss": 0.8683, + "step": 17775 + }, + { + "epoch": 0.27680260300310583, + "grad_norm": 2.5693435668945312, + "learning_rate": 7.612789039838744e-07, + "loss": 0.7723, + "step": 17780 + }, + { + "epoch": 0.2768804440050752, + "grad_norm": 6.711716175079346, + "learning_rate": 7.611969649792694e-07, + "loss": 0.7423, + "step": 17785 + }, + { + "epoch": 0.2769582850070446, + "grad_norm": 7.211732387542725, + "learning_rate": 7.611150259746644e-07, + "loss": 0.7655, + "step": 17790 + }, + { + "epoch": 0.277036126009014, + "grad_norm": 3.0362424850463867, + "learning_rate": 7.610330869700595e-07, + "loss": 0.7852, + "step": 17795 + }, + { + "epoch": 0.27711396701098334, + "grad_norm": 7.21616268157959, + "learning_rate": 7.609511479654544e-07, + "loss": 0.6979, + "step": 17800 + }, + { + "epoch": 0.27719180801295273, + "grad_norm": 2.9571242332458496, + "learning_rate": 7.608692089608495e-07, + "loss": 0.736, + "step": 17805 + }, + { + "epoch": 0.2772696490149221, + "grad_norm": 3.2685022354125977, + "learning_rate": 7.607872699562446e-07, + "loss": 0.7699, + "step": 17810 + }, + { + "epoch": 0.2773474900168915, + "grad_norm": 3.817214012145996, + "learning_rate": 7.607053309516395e-07, + "loss": 0.7695, + "step": 17815 + }, + { + "epoch": 0.27742533101886085, + "grad_norm": 3.7128727436065674, + "learning_rate": 7.606233919470346e-07, + "loss": 0.8229, + "step": 17820 + }, + { + "epoch": 0.27750317202083025, + "grad_norm": 4.211459636688232, + "learning_rate": 7.605414529424297e-07, + "loss": 0.7276, + "step": 17825 + }, + { + "epoch": 0.27758101302279964, + "grad_norm": 3.2515077590942383, + "learning_rate": 7.604595139378247e-07, + "loss": 0.7933, + "step": 17830 + }, + { + "epoch": 0.27765885402476903, + "grad_norm": 3.9488120079040527, + "learning_rate": 7.603775749332196e-07, + "loss": 0.6916, + "step": 17835 + }, + { + "epoch": 0.27773669502673837, + "grad_norm": 4.689972877502441, + "learning_rate": 7.602956359286147e-07, + "loss": 0.783, + "step": 17840 + }, + { + "epoch": 0.27781453602870776, + "grad_norm": 3.274508476257324, + "learning_rate": 7.602136969240097e-07, + "loss": 0.6912, + "step": 17845 + }, + { + "epoch": 0.27789237703067715, + "grad_norm": 3.4465394020080566, + "learning_rate": 7.601317579194048e-07, + "loss": 0.7393, + "step": 17850 + }, + { + "epoch": 0.27797021803264654, + "grad_norm": 2.9954683780670166, + "learning_rate": 7.600498189147998e-07, + "loss": 0.6276, + "step": 17855 + }, + { + "epoch": 0.2780480590346159, + "grad_norm": 6.088066101074219, + "learning_rate": 7.599678799101948e-07, + "loss": 0.7727, + "step": 17860 + }, + { + "epoch": 0.27812590003658527, + "grad_norm": 2.8900184631347656, + "learning_rate": 7.598859409055899e-07, + "loss": 0.7322, + "step": 17865 + }, + { + "epoch": 0.27820374103855466, + "grad_norm": 5.254889965057373, + "learning_rate": 7.598040019009849e-07, + "loss": 0.8718, + "step": 17870 + }, + { + "epoch": 0.278281582040524, + "grad_norm": 3.176635265350342, + "learning_rate": 7.597220628963798e-07, + "loss": 0.6775, + "step": 17875 + }, + { + "epoch": 0.2783594230424934, + "grad_norm": 4.2815260887146, + "learning_rate": 7.596401238917749e-07, + "loss": 0.7797, + "step": 17880 + }, + { + "epoch": 0.2784372640444628, + "grad_norm": 2.851503372192383, + "learning_rate": 7.5955818488717e-07, + "loss": 0.6992, + "step": 17885 + }, + { + "epoch": 0.2785151050464322, + "grad_norm": 4.479244232177734, + "learning_rate": 7.59476245882565e-07, + "loss": 0.8804, + "step": 17890 + }, + { + "epoch": 0.2785929460484015, + "grad_norm": 3.6790082454681396, + "learning_rate": 7.5939430687796e-07, + "loss": 0.7211, + "step": 17895 + }, + { + "epoch": 0.2786707870503709, + "grad_norm": 4.7184600830078125, + "learning_rate": 7.593123678733551e-07, + "loss": 0.862, + "step": 17900 + }, + { + "epoch": 0.2787486280523403, + "grad_norm": 3.2201459407806396, + "learning_rate": 7.592304288687502e-07, + "loss": 0.8146, + "step": 17905 + }, + { + "epoch": 0.2788264690543097, + "grad_norm": 3.8004696369171143, + "learning_rate": 7.591484898641451e-07, + "loss": 0.7318, + "step": 17910 + }, + { + "epoch": 0.278904310056279, + "grad_norm": 5.019532203674316, + "learning_rate": 7.590665508595401e-07, + "loss": 0.8417, + "step": 17915 + }, + { + "epoch": 0.2789821510582484, + "grad_norm": 3.6091842651367188, + "learning_rate": 7.589846118549352e-07, + "loss": 0.7375, + "step": 17920 + }, + { + "epoch": 0.2790599920602178, + "grad_norm": 4.4064226150512695, + "learning_rate": 7.589026728503301e-07, + "loss": 0.7558, + "step": 17925 + }, + { + "epoch": 0.2791378330621872, + "grad_norm": 2.371734619140625, + "learning_rate": 7.588207338457252e-07, + "loss": 0.6613, + "step": 17930 + }, + { + "epoch": 0.27921567406415654, + "grad_norm": 10.808391571044922, + "learning_rate": 7.587387948411203e-07, + "loss": 0.879, + "step": 17935 + }, + { + "epoch": 0.27929351506612593, + "grad_norm": 5.366029262542725, + "learning_rate": 7.586568558365153e-07, + "loss": 0.7484, + "step": 17940 + }, + { + "epoch": 0.2793713560680953, + "grad_norm": 5.2202911376953125, + "learning_rate": 7.585749168319103e-07, + "loss": 0.6601, + "step": 17945 + }, + { + "epoch": 0.27944919707006466, + "grad_norm": 5.043264865875244, + "learning_rate": 7.584929778273054e-07, + "loss": 0.8133, + "step": 17950 + }, + { + "epoch": 0.27952703807203405, + "grad_norm": 6.051628112792969, + "learning_rate": 7.584110388227003e-07, + "loss": 0.8647, + "step": 17955 + }, + { + "epoch": 0.27960487907400344, + "grad_norm": 3.5219998359680176, + "learning_rate": 7.583290998180953e-07, + "loss": 0.7895, + "step": 17960 + }, + { + "epoch": 0.27968272007597283, + "grad_norm": 4.5519232749938965, + "learning_rate": 7.582471608134904e-07, + "loss": 0.8633, + "step": 17965 + }, + { + "epoch": 0.27976056107794217, + "grad_norm": 3.4150590896606445, + "learning_rate": 7.581652218088854e-07, + "loss": 0.7193, + "step": 17970 + }, + { + "epoch": 0.27983840207991156, + "grad_norm": 4.484484672546387, + "learning_rate": 7.580832828042805e-07, + "loss": 0.7381, + "step": 17975 + }, + { + "epoch": 0.27991624308188096, + "grad_norm": 4.187557220458984, + "learning_rate": 7.580013437996755e-07, + "loss": 0.7605, + "step": 17980 + }, + { + "epoch": 0.27999408408385035, + "grad_norm": 3.3712828159332275, + "learning_rate": 7.579194047950705e-07, + "loss": 0.718, + "step": 17985 + }, + { + "epoch": 0.2800719250858197, + "grad_norm": 4.786652565002441, + "learning_rate": 7.578374657904656e-07, + "loss": 0.771, + "step": 17990 + }, + { + "epoch": 0.2801497660877891, + "grad_norm": 3.188917875289917, + "learning_rate": 7.577555267858605e-07, + "loss": 0.8198, + "step": 17995 + }, + { + "epoch": 0.28022760708975847, + "grad_norm": 3.1383514404296875, + "learning_rate": 7.576735877812555e-07, + "loss": 0.8076, + "step": 18000 + }, + { + "epoch": 0.28030544809172786, + "grad_norm": 3.447678327560425, + "learning_rate": 7.575916487766506e-07, + "loss": 0.7277, + "step": 18005 + }, + { + "epoch": 0.2803832890936972, + "grad_norm": 3.8167226314544678, + "learning_rate": 7.575097097720457e-07, + "loss": 0.6305, + "step": 18010 + }, + { + "epoch": 0.2804611300956666, + "grad_norm": 5.272345066070557, + "learning_rate": 7.574277707674407e-07, + "loss": 0.8302, + "step": 18015 + }, + { + "epoch": 0.280538971097636, + "grad_norm": 3.4920878410339355, + "learning_rate": 7.573458317628357e-07, + "loss": 0.8639, + "step": 18020 + }, + { + "epoch": 0.2806168120996054, + "grad_norm": 3.8000481128692627, + "learning_rate": 7.572638927582308e-07, + "loss": 0.6455, + "step": 18025 + }, + { + "epoch": 0.2806946531015747, + "grad_norm": 5.670722007751465, + "learning_rate": 7.571819537536259e-07, + "loss": 0.7034, + "step": 18030 + }, + { + "epoch": 0.2807724941035441, + "grad_norm": 6.655512809753418, + "learning_rate": 7.571000147490207e-07, + "loss": 0.8541, + "step": 18035 + }, + { + "epoch": 0.2808503351055135, + "grad_norm": 4.9885573387146, + "learning_rate": 7.570180757444158e-07, + "loss": 0.7457, + "step": 18040 + }, + { + "epoch": 0.28092817610748283, + "grad_norm": 3.309530019760132, + "learning_rate": 7.569361367398109e-07, + "loss": 0.7874, + "step": 18045 + }, + { + "epoch": 0.2810060171094522, + "grad_norm": 2.9954628944396973, + "learning_rate": 7.568541977352058e-07, + "loss": 0.8728, + "step": 18050 + }, + { + "epoch": 0.2810838581114216, + "grad_norm": 9.354890823364258, + "learning_rate": 7.567722587306009e-07, + "loss": 0.7902, + "step": 18055 + }, + { + "epoch": 0.281161699113391, + "grad_norm": 6.805416107177734, + "learning_rate": 7.56690319725996e-07, + "loss": 0.8763, + "step": 18060 + }, + { + "epoch": 0.28123954011536034, + "grad_norm": 11.965883255004883, + "learning_rate": 7.56608380721391e-07, + "loss": 0.8265, + "step": 18065 + }, + { + "epoch": 0.28131738111732973, + "grad_norm": 3.0321974754333496, + "learning_rate": 7.56526441716786e-07, + "loss": 0.7889, + "step": 18070 + }, + { + "epoch": 0.2813952221192991, + "grad_norm": 3.4996132850646973, + "learning_rate": 7.56444502712181e-07, + "loss": 0.7354, + "step": 18075 + }, + { + "epoch": 0.2814730631212685, + "grad_norm": 8.6669921875, + "learning_rate": 7.56362563707576e-07, + "loss": 0.7775, + "step": 18080 + }, + { + "epoch": 0.28155090412323786, + "grad_norm": 6.530038356781006, + "learning_rate": 7.56280624702971e-07, + "loss": 0.7778, + "step": 18085 + }, + { + "epoch": 0.28162874512520725, + "grad_norm": 2.507265329360962, + "learning_rate": 7.561986856983661e-07, + "loss": 0.7254, + "step": 18090 + }, + { + "epoch": 0.28170658612717664, + "grad_norm": 3.001185655593872, + "learning_rate": 7.561167466937611e-07, + "loss": 0.7498, + "step": 18095 + }, + { + "epoch": 0.28178442712914603, + "grad_norm": 4.192236423492432, + "learning_rate": 7.560348076891562e-07, + "loss": 0.7007, + "step": 18100 + }, + { + "epoch": 0.28186226813111537, + "grad_norm": 6.101157188415527, + "learning_rate": 7.559528686845512e-07, + "loss": 0.7048, + "step": 18105 + }, + { + "epoch": 0.28194010913308476, + "grad_norm": 4.111788749694824, + "learning_rate": 7.558709296799462e-07, + "loss": 0.7734, + "step": 18110 + }, + { + "epoch": 0.28201795013505415, + "grad_norm": 3.0076723098754883, + "learning_rate": 7.557889906753412e-07, + "loss": 0.7345, + "step": 18115 + }, + { + "epoch": 0.2820957911370235, + "grad_norm": 2.860314130783081, + "learning_rate": 7.557070516707363e-07, + "loss": 0.7199, + "step": 18120 + }, + { + "epoch": 0.2821736321389929, + "grad_norm": 3.355159044265747, + "learning_rate": 7.556251126661312e-07, + "loss": 0.7313, + "step": 18125 + }, + { + "epoch": 0.2822514731409623, + "grad_norm": 5.150790691375732, + "learning_rate": 7.555431736615263e-07, + "loss": 0.643, + "step": 18130 + }, + { + "epoch": 0.28232931414293166, + "grad_norm": 3.4499988555908203, + "learning_rate": 7.554612346569214e-07, + "loss": 0.8814, + "step": 18135 + }, + { + "epoch": 0.282407155144901, + "grad_norm": 4.242376804351807, + "learning_rate": 7.553792956523164e-07, + "loss": 0.7596, + "step": 18140 + }, + { + "epoch": 0.2824849961468704, + "grad_norm": 3.851548433303833, + "learning_rate": 7.552973566477114e-07, + "loss": 0.736, + "step": 18145 + }, + { + "epoch": 0.2825628371488398, + "grad_norm": 6.2824602127075195, + "learning_rate": 7.552154176431065e-07, + "loss": 0.759, + "step": 18150 + }, + { + "epoch": 0.2826406781508092, + "grad_norm": 5.529384613037109, + "learning_rate": 7.551334786385016e-07, + "loss": 0.6954, + "step": 18155 + }, + { + "epoch": 0.2827185191527785, + "grad_norm": 4.210619926452637, + "learning_rate": 7.550515396338964e-07, + "loss": 0.8275, + "step": 18160 + }, + { + "epoch": 0.2827963601547479, + "grad_norm": 4.350375175476074, + "learning_rate": 7.549696006292915e-07, + "loss": 0.7411, + "step": 18165 + }, + { + "epoch": 0.2828742011567173, + "grad_norm": 5.59158992767334, + "learning_rate": 7.548876616246866e-07, + "loss": 0.8171, + "step": 18170 + }, + { + "epoch": 0.2829520421586867, + "grad_norm": 5.024247169494629, + "learning_rate": 7.548057226200815e-07, + "loss": 0.7903, + "step": 18175 + }, + { + "epoch": 0.283029883160656, + "grad_norm": 2.6078712940216064, + "learning_rate": 7.547237836154766e-07, + "loss": 0.6803, + "step": 18180 + }, + { + "epoch": 0.2831077241626254, + "grad_norm": 4.355838775634766, + "learning_rate": 7.546418446108717e-07, + "loss": 0.7313, + "step": 18185 + }, + { + "epoch": 0.2831855651645948, + "grad_norm": 3.1983373165130615, + "learning_rate": 7.545599056062667e-07, + "loss": 0.7433, + "step": 18190 + }, + { + "epoch": 0.2832634061665642, + "grad_norm": 2.885439395904541, + "learning_rate": 7.544779666016617e-07, + "loss": 0.6819, + "step": 18195 + }, + { + "epoch": 0.28334124716853354, + "grad_norm": 9.0044527053833, + "learning_rate": 7.543960275970567e-07, + "loss": 0.8062, + "step": 18200 + }, + { + "epoch": 0.28341908817050293, + "grad_norm": 4.429920673370361, + "learning_rate": 7.543140885924517e-07, + "loss": 0.7271, + "step": 18205 + }, + { + "epoch": 0.2834969291724723, + "grad_norm": 3.505737543106079, + "learning_rate": 7.542321495878468e-07, + "loss": 0.7338, + "step": 18210 + }, + { + "epoch": 0.28357477017444166, + "grad_norm": 3.533663272857666, + "learning_rate": 7.541502105832418e-07, + "loss": 0.8234, + "step": 18215 + }, + { + "epoch": 0.28365261117641105, + "grad_norm": 3.877680540084839, + "learning_rate": 7.540682715786368e-07, + "loss": 0.7557, + "step": 18220 + }, + { + "epoch": 0.28373045217838044, + "grad_norm": 2.824127435684204, + "learning_rate": 7.539863325740319e-07, + "loss": 0.6762, + "step": 18225 + }, + { + "epoch": 0.28380829318034984, + "grad_norm": 5.132575035095215, + "learning_rate": 7.539043935694269e-07, + "loss": 0.8278, + "step": 18230 + }, + { + "epoch": 0.28388613418231917, + "grad_norm": 3.5914008617401123, + "learning_rate": 7.538224545648219e-07, + "loss": 0.7782, + "step": 18235 + }, + { + "epoch": 0.28396397518428856, + "grad_norm": 3.1311373710632324, + "learning_rate": 7.537405155602169e-07, + "loss": 0.9563, + "step": 18240 + }, + { + "epoch": 0.28404181618625796, + "grad_norm": 4.662816524505615, + "learning_rate": 7.53658576555612e-07, + "loss": 0.8451, + "step": 18245 + }, + { + "epoch": 0.28411965718822735, + "grad_norm": 4.352396011352539, + "learning_rate": 7.535766375510069e-07, + "loss": 0.7883, + "step": 18250 + }, + { + "epoch": 0.2841974981901967, + "grad_norm": 7.063146114349365, + "learning_rate": 7.53494698546402e-07, + "loss": 0.795, + "step": 18255 + }, + { + "epoch": 0.2842753391921661, + "grad_norm": 6.286665439605713, + "learning_rate": 7.534127595417971e-07, + "loss": 0.7575, + "step": 18260 + }, + { + "epoch": 0.28435318019413547, + "grad_norm": 3.584050178527832, + "learning_rate": 7.533308205371922e-07, + "loss": 0.7749, + "step": 18265 + }, + { + "epoch": 0.28443102119610486, + "grad_norm": 3.3104610443115234, + "learning_rate": 7.532488815325871e-07, + "loss": 0.8768, + "step": 18270 + }, + { + "epoch": 0.2845088621980742, + "grad_norm": 3.0027613639831543, + "learning_rate": 7.531669425279822e-07, + "loss": 0.7214, + "step": 18275 + }, + { + "epoch": 0.2845867032000436, + "grad_norm": 7.029557228088379, + "learning_rate": 7.530850035233772e-07, + "loss": 0.8627, + "step": 18280 + }, + { + "epoch": 0.284664544202013, + "grad_norm": 3.829381227493286, + "learning_rate": 7.530030645187721e-07, + "loss": 0.7763, + "step": 18285 + }, + { + "epoch": 0.2847423852039824, + "grad_norm": 3.7503976821899414, + "learning_rate": 7.529211255141672e-07, + "loss": 0.9111, + "step": 18290 + }, + { + "epoch": 0.2848202262059517, + "grad_norm": 5.3771586418151855, + "learning_rate": 7.528391865095623e-07, + "loss": 0.8367, + "step": 18295 + }, + { + "epoch": 0.2848980672079211, + "grad_norm": 2.6693806648254395, + "learning_rate": 7.527572475049573e-07, + "loss": 0.7148, + "step": 18300 + }, + { + "epoch": 0.2849759082098905, + "grad_norm": 8.563738822937012, + "learning_rate": 7.526753085003523e-07, + "loss": 0.8822, + "step": 18305 + }, + { + "epoch": 0.28505374921185983, + "grad_norm": 3.3797926902770996, + "learning_rate": 7.525933694957474e-07, + "loss": 0.7949, + "step": 18310 + }, + { + "epoch": 0.2851315902138292, + "grad_norm": 2.9566423892974854, + "learning_rate": 7.525114304911424e-07, + "loss": 0.7643, + "step": 18315 + }, + { + "epoch": 0.2852094312157986, + "grad_norm": 2.9523532390594482, + "learning_rate": 7.524294914865373e-07, + "loss": 0.6975, + "step": 18320 + }, + { + "epoch": 0.285287272217768, + "grad_norm": 7.08513069152832, + "learning_rate": 7.523475524819324e-07, + "loss": 0.772, + "step": 18325 + }, + { + "epoch": 0.28536511321973734, + "grad_norm": 3.7659263610839844, + "learning_rate": 7.522656134773274e-07, + "loss": 0.8242, + "step": 18330 + }, + { + "epoch": 0.28544295422170674, + "grad_norm": 3.876816511154175, + "learning_rate": 7.521836744727225e-07, + "loss": 0.8292, + "step": 18335 + }, + { + "epoch": 0.2855207952236761, + "grad_norm": 3.302157163619995, + "learning_rate": 7.521017354681175e-07, + "loss": 0.7672, + "step": 18340 + }, + { + "epoch": 0.2855986362256455, + "grad_norm": 3.461796760559082, + "learning_rate": 7.520197964635125e-07, + "loss": 0.7899, + "step": 18345 + }, + { + "epoch": 0.28567647722761486, + "grad_norm": 4.767899990081787, + "learning_rate": 7.519378574589076e-07, + "loss": 0.7807, + "step": 18350 + }, + { + "epoch": 0.28575431822958425, + "grad_norm": 2.773707389831543, + "learning_rate": 7.518559184543027e-07, + "loss": 0.6448, + "step": 18355 + }, + { + "epoch": 0.28583215923155364, + "grad_norm": 3.4016494750976562, + "learning_rate": 7.517739794496975e-07, + "loss": 0.697, + "step": 18360 + }, + { + "epoch": 0.28591000023352303, + "grad_norm": 3.069875955581665, + "learning_rate": 7.516920404450926e-07, + "loss": 0.7969, + "step": 18365 + }, + { + "epoch": 0.28598784123549237, + "grad_norm": 4.186999797821045, + "learning_rate": 7.516101014404877e-07, + "loss": 0.7641, + "step": 18370 + }, + { + "epoch": 0.28606568223746176, + "grad_norm": 5.006409168243408, + "learning_rate": 7.515281624358826e-07, + "loss": 0.6768, + "step": 18375 + }, + { + "epoch": 0.28614352323943115, + "grad_norm": 8.393912315368652, + "learning_rate": 7.514462234312777e-07, + "loss": 0.843, + "step": 18380 + }, + { + "epoch": 0.2862213642414005, + "grad_norm": 4.208598613739014, + "learning_rate": 7.513642844266728e-07, + "loss": 0.87, + "step": 18385 + }, + { + "epoch": 0.2862992052433699, + "grad_norm": 2.9314475059509277, + "learning_rate": 7.512823454220679e-07, + "loss": 0.8289, + "step": 18390 + }, + { + "epoch": 0.2863770462453393, + "grad_norm": 5.165114879608154, + "learning_rate": 7.512004064174628e-07, + "loss": 0.7525, + "step": 18395 + }, + { + "epoch": 0.28645488724730866, + "grad_norm": 5.7505669593811035, + "learning_rate": 7.511184674128578e-07, + "loss": 0.6644, + "step": 18400 + }, + { + "epoch": 0.286532728249278, + "grad_norm": 3.4606173038482666, + "learning_rate": 7.510365284082529e-07, + "loss": 0.7591, + "step": 18405 + }, + { + "epoch": 0.2866105692512474, + "grad_norm": 4.732656478881836, + "learning_rate": 7.509545894036478e-07, + "loss": 0.9361, + "step": 18410 + }, + { + "epoch": 0.2866884102532168, + "grad_norm": 5.098369121551514, + "learning_rate": 7.508726503990429e-07, + "loss": 0.832, + "step": 18415 + }, + { + "epoch": 0.2867662512551862, + "grad_norm": 4.499561786651611, + "learning_rate": 7.50790711394438e-07, + "loss": 0.6976, + "step": 18420 + }, + { + "epoch": 0.2868440922571555, + "grad_norm": 7.913333892822266, + "learning_rate": 7.50708772389833e-07, + "loss": 0.9198, + "step": 18425 + }, + { + "epoch": 0.2869219332591249, + "grad_norm": 2.9314780235290527, + "learning_rate": 7.50626833385228e-07, + "loss": 0.7318, + "step": 18430 + }, + { + "epoch": 0.2869997742610943, + "grad_norm": 3.969515323638916, + "learning_rate": 7.505448943806231e-07, + "loss": 0.9026, + "step": 18435 + }, + { + "epoch": 0.2870776152630637, + "grad_norm": 5.8380351066589355, + "learning_rate": 7.50462955376018e-07, + "loss": 0.7391, + "step": 18440 + }, + { + "epoch": 0.287155456265033, + "grad_norm": 3.473393201828003, + "learning_rate": 7.50381016371413e-07, + "loss": 0.863, + "step": 18445 + }, + { + "epoch": 0.2872332972670024, + "grad_norm": 5.020837783813477, + "learning_rate": 7.502990773668081e-07, + "loss": 0.864, + "step": 18450 + }, + { + "epoch": 0.2873111382689718, + "grad_norm": 3.1780622005462646, + "learning_rate": 7.502171383622031e-07, + "loss": 0.7748, + "step": 18455 + }, + { + "epoch": 0.2873889792709412, + "grad_norm": 3.086195230484009, + "learning_rate": 7.501351993575982e-07, + "loss": 0.8153, + "step": 18460 + }, + { + "epoch": 0.28746682027291054, + "grad_norm": 3.143611192703247, + "learning_rate": 7.500532603529932e-07, + "loss": 0.7826, + "step": 18465 + }, + { + "epoch": 0.28754466127487993, + "grad_norm": 3.706808090209961, + "learning_rate": 7.499713213483882e-07, + "loss": 0.6314, + "step": 18470 + }, + { + "epoch": 0.2876225022768493, + "grad_norm": 3.3525521755218506, + "learning_rate": 7.498893823437833e-07, + "loss": 0.7845, + "step": 18475 + }, + { + "epoch": 0.28770034327881866, + "grad_norm": 3.0572566986083984, + "learning_rate": 7.498074433391784e-07, + "loss": 0.7662, + "step": 18480 + }, + { + "epoch": 0.28777818428078805, + "grad_norm": 3.026519298553467, + "learning_rate": 7.497255043345732e-07, + "loss": 0.743, + "step": 18485 + }, + { + "epoch": 0.28785602528275744, + "grad_norm": 6.692337512969971, + "learning_rate": 7.496435653299683e-07, + "loss": 0.8563, + "step": 18490 + }, + { + "epoch": 0.28793386628472684, + "grad_norm": 4.200711727142334, + "learning_rate": 7.495616263253634e-07, + "loss": 0.7121, + "step": 18495 + }, + { + "epoch": 0.2880117072866962, + "grad_norm": 3.3459670543670654, + "learning_rate": 7.494796873207583e-07, + "loss": 0.8345, + "step": 18500 + }, + { + "epoch": 0.28808954828866556, + "grad_norm": 3.4595234394073486, + "learning_rate": 7.493977483161534e-07, + "loss": 0.8437, + "step": 18505 + }, + { + "epoch": 0.28816738929063496, + "grad_norm": 3.5343360900878906, + "learning_rate": 7.493158093115485e-07, + "loss": 0.8151, + "step": 18510 + }, + { + "epoch": 0.28824523029260435, + "grad_norm": 4.573742866516113, + "learning_rate": 7.492338703069436e-07, + "loss": 0.7683, + "step": 18515 + }, + { + "epoch": 0.2883230712945737, + "grad_norm": 6.558521747589111, + "learning_rate": 7.491519313023385e-07, + "loss": 0.8551, + "step": 18520 + }, + { + "epoch": 0.2884009122965431, + "grad_norm": 8.159137725830078, + "learning_rate": 7.490699922977335e-07, + "loss": 0.8556, + "step": 18525 + }, + { + "epoch": 0.28847875329851247, + "grad_norm": 4.157485008239746, + "learning_rate": 7.489880532931286e-07, + "loss": 0.7474, + "step": 18530 + }, + { + "epoch": 0.28855659430048186, + "grad_norm": 2.989870071411133, + "learning_rate": 7.489061142885235e-07, + "loss": 0.7662, + "step": 18535 + }, + { + "epoch": 0.2886344353024512, + "grad_norm": 3.084144115447998, + "learning_rate": 7.488241752839186e-07, + "loss": 0.7597, + "step": 18540 + }, + { + "epoch": 0.2887122763044206, + "grad_norm": 3.9873554706573486, + "learning_rate": 7.487422362793137e-07, + "loss": 0.6703, + "step": 18545 + }, + { + "epoch": 0.28879011730639, + "grad_norm": 2.9624643325805664, + "learning_rate": 7.486602972747087e-07, + "loss": 0.7915, + "step": 18550 + }, + { + "epoch": 0.2888679583083593, + "grad_norm": 3.346914529800415, + "learning_rate": 7.485783582701037e-07, + "loss": 0.6576, + "step": 18555 + }, + { + "epoch": 0.2889457993103287, + "grad_norm": 3.5888848304748535, + "learning_rate": 7.484964192654988e-07, + "loss": 0.7523, + "step": 18560 + }, + { + "epoch": 0.2890236403122981, + "grad_norm": 4.135461807250977, + "learning_rate": 7.484144802608937e-07, + "loss": 0.7744, + "step": 18565 + }, + { + "epoch": 0.2891014813142675, + "grad_norm": 3.826291799545288, + "learning_rate": 7.483325412562888e-07, + "loss": 0.8145, + "step": 18570 + }, + { + "epoch": 0.28917932231623683, + "grad_norm": 4.202657699584961, + "learning_rate": 7.482506022516838e-07, + "loss": 0.7333, + "step": 18575 + }, + { + "epoch": 0.2892571633182062, + "grad_norm": 3.219723701477051, + "learning_rate": 7.481686632470788e-07, + "loss": 0.7905, + "step": 18580 + }, + { + "epoch": 0.2893350043201756, + "grad_norm": 3.597161293029785, + "learning_rate": 7.480867242424739e-07, + "loss": 0.7624, + "step": 18585 + }, + { + "epoch": 0.289412845322145, + "grad_norm": 6.197475433349609, + "learning_rate": 7.48004785237869e-07, + "loss": 0.9616, + "step": 18590 + }, + { + "epoch": 0.28949068632411434, + "grad_norm": 3.9106247425079346, + "learning_rate": 7.479228462332639e-07, + "loss": 0.7329, + "step": 18595 + }, + { + "epoch": 0.28956852732608374, + "grad_norm": 5.9468255043029785, + "learning_rate": 7.47840907228659e-07, + "loss": 0.8381, + "step": 18600 + }, + { + "epoch": 0.2896463683280531, + "grad_norm": 5.991184234619141, + "learning_rate": 7.47758968224054e-07, + "loss": 0.7476, + "step": 18605 + }, + { + "epoch": 0.2897242093300225, + "grad_norm": 4.296688556671143, + "learning_rate": 7.476770292194489e-07, + "loss": 0.7739, + "step": 18610 + }, + { + "epoch": 0.28980205033199186, + "grad_norm": 4.5617146492004395, + "learning_rate": 7.47595090214844e-07, + "loss": 0.7349, + "step": 18615 + }, + { + "epoch": 0.28987989133396125, + "grad_norm": 6.767970085144043, + "learning_rate": 7.475131512102391e-07, + "loss": 0.7144, + "step": 18620 + }, + { + "epoch": 0.28995773233593064, + "grad_norm": 3.624408721923828, + "learning_rate": 7.47431212205634e-07, + "loss": 0.7996, + "step": 18625 + }, + { + "epoch": 0.29003557333790003, + "grad_norm": 2.7725491523742676, + "learning_rate": 7.473492732010291e-07, + "loss": 0.743, + "step": 18630 + }, + { + "epoch": 0.29011341433986937, + "grad_norm": 7.392967224121094, + "learning_rate": 7.472673341964242e-07, + "loss": 0.896, + "step": 18635 + }, + { + "epoch": 0.29019125534183876, + "grad_norm": 6.188554286956787, + "learning_rate": 7.471853951918193e-07, + "loss": 0.6967, + "step": 18640 + }, + { + "epoch": 0.29026909634380815, + "grad_norm": 6.979849338531494, + "learning_rate": 7.471034561872141e-07, + "loss": 0.8246, + "step": 18645 + }, + { + "epoch": 0.2903469373457775, + "grad_norm": 3.2482519149780273, + "learning_rate": 7.470215171826092e-07, + "loss": 0.8414, + "step": 18650 + }, + { + "epoch": 0.2904247783477469, + "grad_norm": 2.916731357574463, + "learning_rate": 7.469395781780043e-07, + "loss": 0.7341, + "step": 18655 + }, + { + "epoch": 0.2905026193497163, + "grad_norm": 2.8692774772644043, + "learning_rate": 7.468576391733993e-07, + "loss": 0.8345, + "step": 18660 + }, + { + "epoch": 0.29058046035168567, + "grad_norm": 4.653710842132568, + "learning_rate": 7.467757001687943e-07, + "loss": 0.7714, + "step": 18665 + }, + { + "epoch": 0.290658301353655, + "grad_norm": 3.6073105335235596, + "learning_rate": 7.466937611641894e-07, + "loss": 0.807, + "step": 18670 + }, + { + "epoch": 0.2907361423556244, + "grad_norm": 2.567059278488159, + "learning_rate": 7.466118221595844e-07, + "loss": 0.6275, + "step": 18675 + }, + { + "epoch": 0.2908139833575938, + "grad_norm": 4.383679389953613, + "learning_rate": 7.465298831549794e-07, + "loss": 0.8669, + "step": 18680 + }, + { + "epoch": 0.2908918243595632, + "grad_norm": 3.321564197540283, + "learning_rate": 7.464479441503744e-07, + "loss": 0.6724, + "step": 18685 + }, + { + "epoch": 0.2909696653615325, + "grad_norm": 6.479825973510742, + "learning_rate": 7.463660051457694e-07, + "loss": 0.7196, + "step": 18690 + }, + { + "epoch": 0.2910475063635019, + "grad_norm": 7.732433795928955, + "learning_rate": 7.462840661411645e-07, + "loss": 0.8695, + "step": 18695 + }, + { + "epoch": 0.2911253473654713, + "grad_norm": 2.9067394733428955, + "learning_rate": 7.462021271365595e-07, + "loss": 0.6753, + "step": 18700 + }, + { + "epoch": 0.2912031883674407, + "grad_norm": 3.960364580154419, + "learning_rate": 7.461201881319545e-07, + "loss": 0.9328, + "step": 18705 + }, + { + "epoch": 0.29128102936941, + "grad_norm": 2.495321750640869, + "learning_rate": 7.460382491273496e-07, + "loss": 0.8255, + "step": 18710 + }, + { + "epoch": 0.2913588703713794, + "grad_norm": 2.928755044937134, + "learning_rate": 7.459563101227447e-07, + "loss": 0.7353, + "step": 18715 + }, + { + "epoch": 0.2914367113733488, + "grad_norm": 2.7744498252868652, + "learning_rate": 7.458743711181396e-07, + "loss": 0.8438, + "step": 18720 + }, + { + "epoch": 0.29151455237531815, + "grad_norm": 5.325984001159668, + "learning_rate": 7.457924321135346e-07, + "loss": 0.6956, + "step": 18725 + }, + { + "epoch": 0.29159239337728754, + "grad_norm": 3.782843589782715, + "learning_rate": 7.457104931089297e-07, + "loss": 0.7555, + "step": 18730 + }, + { + "epoch": 0.29167023437925693, + "grad_norm": 4.154267311096191, + "learning_rate": 7.456285541043246e-07, + "loss": 0.7718, + "step": 18735 + }, + { + "epoch": 0.2917480753812263, + "grad_norm": 4.614317417144775, + "learning_rate": 7.455466150997197e-07, + "loss": 0.8251, + "step": 18740 + }, + { + "epoch": 0.29182591638319566, + "grad_norm": 3.0895204544067383, + "learning_rate": 7.454646760951148e-07, + "loss": 0.7649, + "step": 18745 + }, + { + "epoch": 0.29190375738516505, + "grad_norm": 4.956776142120361, + "learning_rate": 7.453827370905098e-07, + "loss": 0.7662, + "step": 18750 + }, + { + "epoch": 0.29198159838713444, + "grad_norm": 4.089311599731445, + "learning_rate": 7.453007980859048e-07, + "loss": 0.6745, + "step": 18755 + }, + { + "epoch": 0.29205943938910384, + "grad_norm": 4.280338764190674, + "learning_rate": 7.452188590812999e-07, + "loss": 0.8082, + "step": 18760 + }, + { + "epoch": 0.2921372803910732, + "grad_norm": 4.679727554321289, + "learning_rate": 7.451369200766948e-07, + "loss": 0.7907, + "step": 18765 + }, + { + "epoch": 0.29221512139304257, + "grad_norm": 7.030491828918457, + "learning_rate": 7.450549810720898e-07, + "loss": 0.822, + "step": 18770 + }, + { + "epoch": 0.29229296239501196, + "grad_norm": 3.752502202987671, + "learning_rate": 7.449730420674849e-07, + "loss": 0.6647, + "step": 18775 + }, + { + "epoch": 0.29237080339698135, + "grad_norm": 3.4395251274108887, + "learning_rate": 7.4489110306288e-07, + "loss": 0.9502, + "step": 18780 + }, + { + "epoch": 0.2924486443989507, + "grad_norm": 4.704014778137207, + "learning_rate": 7.44809164058275e-07, + "loss": 0.8132, + "step": 18785 + }, + { + "epoch": 0.2925264854009201, + "grad_norm": 6.6849446296691895, + "learning_rate": 7.4472722505367e-07, + "loss": 0.7998, + "step": 18790 + }, + { + "epoch": 0.29260432640288947, + "grad_norm": 5.722506046295166, + "learning_rate": 7.446452860490651e-07, + "loss": 0.7584, + "step": 18795 + }, + { + "epoch": 0.29268216740485886, + "grad_norm": 4.313940048217773, + "learning_rate": 7.445633470444601e-07, + "loss": 0.8576, + "step": 18800 + }, + { + "epoch": 0.2927600084068282, + "grad_norm": 3.7770705223083496, + "learning_rate": 7.444814080398552e-07, + "loss": 0.8339, + "step": 18805 + }, + { + "epoch": 0.2928378494087976, + "grad_norm": 7.073665618896484, + "learning_rate": 7.443994690352501e-07, + "loss": 0.8197, + "step": 18810 + }, + { + "epoch": 0.292915690410767, + "grad_norm": 4.039837837219238, + "learning_rate": 7.443175300306451e-07, + "loss": 0.7743, + "step": 18815 + }, + { + "epoch": 0.2929935314127363, + "grad_norm": 3.668461561203003, + "learning_rate": 7.442355910260402e-07, + "loss": 0.7049, + "step": 18820 + }, + { + "epoch": 0.2930713724147057, + "grad_norm": 4.572951793670654, + "learning_rate": 7.441536520214352e-07, + "loss": 0.7926, + "step": 18825 + }, + { + "epoch": 0.2931492134166751, + "grad_norm": 2.8897600173950195, + "learning_rate": 7.440717130168302e-07, + "loss": 0.885, + "step": 18830 + }, + { + "epoch": 0.2932270544186445, + "grad_norm": 6.474730014801025, + "learning_rate": 7.439897740122253e-07, + "loss": 0.5874, + "step": 18835 + }, + { + "epoch": 0.29330489542061383, + "grad_norm": 3.490199327468872, + "learning_rate": 7.439078350076204e-07, + "loss": 0.8314, + "step": 18840 + }, + { + "epoch": 0.2933827364225832, + "grad_norm": 3.0774970054626465, + "learning_rate": 7.438258960030153e-07, + "loss": 0.7794, + "step": 18845 + }, + { + "epoch": 0.2934605774245526, + "grad_norm": 4.177643299102783, + "learning_rate": 7.437439569984103e-07, + "loss": 0.8409, + "step": 18850 + }, + { + "epoch": 0.293538418426522, + "grad_norm": 4.719580173492432, + "learning_rate": 7.436620179938054e-07, + "loss": 0.8033, + "step": 18855 + }, + { + "epoch": 0.29361625942849134, + "grad_norm": 3.9040963649749756, + "learning_rate": 7.435800789892003e-07, + "loss": 0.7557, + "step": 18860 + }, + { + "epoch": 0.29369410043046074, + "grad_norm": 4.212202548980713, + "learning_rate": 7.434981399845954e-07, + "loss": 0.789, + "step": 18865 + }, + { + "epoch": 0.29377194143243013, + "grad_norm": 6.8639373779296875, + "learning_rate": 7.434162009799905e-07, + "loss": 0.885, + "step": 18870 + }, + { + "epoch": 0.2938497824343995, + "grad_norm": 3.239004611968994, + "learning_rate": 7.433342619753855e-07, + "loss": 0.7409, + "step": 18875 + }, + { + "epoch": 0.29392762343636886, + "grad_norm": 4.313357830047607, + "learning_rate": 7.432523229707805e-07, + "loss": 0.778, + "step": 18880 + }, + { + "epoch": 0.29400546443833825, + "grad_norm": 4.123286724090576, + "learning_rate": 7.431703839661756e-07, + "loss": 0.7855, + "step": 18885 + }, + { + "epoch": 0.29408330544030764, + "grad_norm": 3.8900656700134277, + "learning_rate": 7.430884449615705e-07, + "loss": 0.6672, + "step": 18890 + }, + { + "epoch": 0.294161146442277, + "grad_norm": 5.243304252624512, + "learning_rate": 7.430065059569656e-07, + "loss": 0.8295, + "step": 18895 + }, + { + "epoch": 0.29423898744424637, + "grad_norm": 7.33563756942749, + "learning_rate": 7.429245669523606e-07, + "loss": 0.6894, + "step": 18900 + }, + { + "epoch": 0.29431682844621576, + "grad_norm": 3.1834230422973633, + "learning_rate": 7.428426279477557e-07, + "loss": 0.8793, + "step": 18905 + }, + { + "epoch": 0.29439466944818515, + "grad_norm": 3.197418212890625, + "learning_rate": 7.427606889431507e-07, + "loss": 0.755, + "step": 18910 + }, + { + "epoch": 0.2944725104501545, + "grad_norm": 3.6597862243652344, + "learning_rate": 7.426787499385457e-07, + "loss": 0.8454, + "step": 18915 + }, + { + "epoch": 0.2945503514521239, + "grad_norm": 3.0385353565216064, + "learning_rate": 7.425968109339408e-07, + "loss": 0.7821, + "step": 18920 + }, + { + "epoch": 0.2946281924540933, + "grad_norm": 5.031830310821533, + "learning_rate": 7.425148719293358e-07, + "loss": 0.7794, + "step": 18925 + }, + { + "epoch": 0.29470603345606267, + "grad_norm": 4.892368793487549, + "learning_rate": 7.424329329247308e-07, + "loss": 0.6679, + "step": 18930 + }, + { + "epoch": 0.294783874458032, + "grad_norm": 8.175716400146484, + "learning_rate": 7.423509939201258e-07, + "loss": 0.7837, + "step": 18935 + }, + { + "epoch": 0.2948617154600014, + "grad_norm": 2.7154431343078613, + "learning_rate": 7.422690549155208e-07, + "loss": 0.7933, + "step": 18940 + }, + { + "epoch": 0.2949395564619708, + "grad_norm": 3.6209683418273926, + "learning_rate": 7.421871159109159e-07, + "loss": 0.8131, + "step": 18945 + }, + { + "epoch": 0.2950173974639402, + "grad_norm": 4.062808513641357, + "learning_rate": 7.42105176906311e-07, + "loss": 0.851, + "step": 18950 + }, + { + "epoch": 0.2950952384659095, + "grad_norm": 4.193578720092773, + "learning_rate": 7.420232379017059e-07, + "loss": 0.6818, + "step": 18955 + }, + { + "epoch": 0.2951730794678789, + "grad_norm": 7.759963512420654, + "learning_rate": 7.41941298897101e-07, + "loss": 0.7228, + "step": 18960 + }, + { + "epoch": 0.2952509204698483, + "grad_norm": 4.498473644256592, + "learning_rate": 7.418593598924961e-07, + "loss": 0.7439, + "step": 18965 + }, + { + "epoch": 0.2953287614718177, + "grad_norm": 5.182002067565918, + "learning_rate": 7.417774208878909e-07, + "loss": 0.8098, + "step": 18970 + }, + { + "epoch": 0.295406602473787, + "grad_norm": 4.381763458251953, + "learning_rate": 7.41695481883286e-07, + "loss": 0.6748, + "step": 18975 + }, + { + "epoch": 0.2954844434757564, + "grad_norm": 3.979992628097534, + "learning_rate": 7.416135428786811e-07, + "loss": 0.7706, + "step": 18980 + }, + { + "epoch": 0.2955622844777258, + "grad_norm": 2.989089012145996, + "learning_rate": 7.41531603874076e-07, + "loss": 0.747, + "step": 18985 + }, + { + "epoch": 0.29564012547969515, + "grad_norm": 5.946344375610352, + "learning_rate": 7.414496648694711e-07, + "loss": 0.6734, + "step": 18990 + }, + { + "epoch": 0.29571796648166454, + "grad_norm": 2.8725976943969727, + "learning_rate": 7.413677258648662e-07, + "loss": 0.7419, + "step": 18995 + }, + { + "epoch": 0.29579580748363393, + "grad_norm": 3.7277004718780518, + "learning_rate": 7.412857868602612e-07, + "loss": 0.7738, + "step": 19000 + }, + { + "epoch": 0.2958736484856033, + "grad_norm": 6.9846367835998535, + "learning_rate": 7.412038478556562e-07, + "loss": 0.6773, + "step": 19005 + }, + { + "epoch": 0.29595148948757266, + "grad_norm": 3.8853328227996826, + "learning_rate": 7.411219088510512e-07, + "loss": 0.6712, + "step": 19010 + }, + { + "epoch": 0.29602933048954205, + "grad_norm": 6.792137145996094, + "learning_rate": 7.410399698464462e-07, + "loss": 0.836, + "step": 19015 + }, + { + "epoch": 0.29610717149151145, + "grad_norm": 5.322309494018555, + "learning_rate": 7.409580308418413e-07, + "loss": 0.7975, + "step": 19020 + }, + { + "epoch": 0.29618501249348084, + "grad_norm": 3.9240760803222656, + "learning_rate": 7.408760918372363e-07, + "loss": 0.8532, + "step": 19025 + }, + { + "epoch": 0.2962628534954502, + "grad_norm": 2.847827196121216, + "learning_rate": 7.407941528326314e-07, + "loss": 0.7627, + "step": 19030 + }, + { + "epoch": 0.29634069449741957, + "grad_norm": 4.343638896942139, + "learning_rate": 7.407122138280264e-07, + "loss": 0.9215, + "step": 19035 + }, + { + "epoch": 0.29641853549938896, + "grad_norm": 2.5756824016571045, + "learning_rate": 7.406302748234215e-07, + "loss": 0.7291, + "step": 19040 + }, + { + "epoch": 0.29649637650135835, + "grad_norm": 3.4114644527435303, + "learning_rate": 7.405483358188165e-07, + "loss": 0.8098, + "step": 19045 + }, + { + "epoch": 0.2965742175033277, + "grad_norm": 3.304774522781372, + "learning_rate": 7.404663968142114e-07, + "loss": 0.7402, + "step": 19050 + }, + { + "epoch": 0.2966520585052971, + "grad_norm": 4.325243949890137, + "learning_rate": 7.403844578096065e-07, + "loss": 1.008, + "step": 19055 + }, + { + "epoch": 0.29672989950726647, + "grad_norm": 3.628368616104126, + "learning_rate": 7.403025188050015e-07, + "loss": 0.5888, + "step": 19060 + }, + { + "epoch": 0.29680774050923586, + "grad_norm": 3.809199333190918, + "learning_rate": 7.402205798003965e-07, + "loss": 0.8118, + "step": 19065 + }, + { + "epoch": 0.2968855815112052, + "grad_norm": 3.897184133529663, + "learning_rate": 7.401386407957916e-07, + "loss": 0.7218, + "step": 19070 + }, + { + "epoch": 0.2969634225131746, + "grad_norm": 4.060847759246826, + "learning_rate": 7.400567017911867e-07, + "loss": 0.803, + "step": 19075 + }, + { + "epoch": 0.297041263515144, + "grad_norm": 4.0685906410217285, + "learning_rate": 7.399747627865816e-07, + "loss": 0.7336, + "step": 19080 + }, + { + "epoch": 0.2971191045171133, + "grad_norm": 3.2365095615386963, + "learning_rate": 7.398928237819767e-07, + "loss": 0.9283, + "step": 19085 + }, + { + "epoch": 0.2971969455190827, + "grad_norm": 3.6066620349884033, + "learning_rate": 7.398108847773718e-07, + "loss": 0.7151, + "step": 19090 + }, + { + "epoch": 0.2972747865210521, + "grad_norm": 3.55867600440979, + "learning_rate": 7.397289457727666e-07, + "loss": 0.7524, + "step": 19095 + }, + { + "epoch": 0.2973526275230215, + "grad_norm": 2.7427852153778076, + "learning_rate": 7.396470067681617e-07, + "loss": 0.8824, + "step": 19100 + }, + { + "epoch": 0.29743046852499083, + "grad_norm": 10.828716278076172, + "learning_rate": 7.395650677635568e-07, + "loss": 0.6942, + "step": 19105 + }, + { + "epoch": 0.2975083095269602, + "grad_norm": 4.044482231140137, + "learning_rate": 7.394831287589518e-07, + "loss": 0.7965, + "step": 19110 + }, + { + "epoch": 0.2975861505289296, + "grad_norm": 7.023869514465332, + "learning_rate": 7.394011897543468e-07, + "loss": 0.7369, + "step": 19115 + }, + { + "epoch": 0.297663991530899, + "grad_norm": 6.251187801361084, + "learning_rate": 7.393192507497419e-07, + "loss": 0.7276, + "step": 19120 + }, + { + "epoch": 0.29774183253286834, + "grad_norm": 4.2345194816589355, + "learning_rate": 7.392373117451369e-07, + "loss": 0.8351, + "step": 19125 + }, + { + "epoch": 0.29781967353483774, + "grad_norm": 5.484807014465332, + "learning_rate": 7.39155372740532e-07, + "loss": 0.8511, + "step": 19130 + }, + { + "epoch": 0.29789751453680713, + "grad_norm": 4.656536102294922, + "learning_rate": 7.390734337359269e-07, + "loss": 0.7701, + "step": 19135 + }, + { + "epoch": 0.2979753555387765, + "grad_norm": 4.175867557525635, + "learning_rate": 7.389914947313219e-07, + "loss": 0.8358, + "step": 19140 + }, + { + "epoch": 0.29805319654074586, + "grad_norm": 5.8004279136657715, + "learning_rate": 7.38909555726717e-07, + "loss": 0.8207, + "step": 19145 + }, + { + "epoch": 0.29813103754271525, + "grad_norm": 3.534777879714966, + "learning_rate": 7.38827616722112e-07, + "loss": 0.7163, + "step": 19150 + }, + { + "epoch": 0.29820887854468464, + "grad_norm": 2.747576951980591, + "learning_rate": 7.387456777175071e-07, + "loss": 0.7338, + "step": 19155 + }, + { + "epoch": 0.298286719546654, + "grad_norm": 3.858243465423584, + "learning_rate": 7.386637387129021e-07, + "loss": 0.8268, + "step": 19160 + }, + { + "epoch": 0.29836456054862337, + "grad_norm": 3.667602300643921, + "learning_rate": 7.385817997082972e-07, + "loss": 0.6898, + "step": 19165 + }, + { + "epoch": 0.29844240155059276, + "grad_norm": 5.483829021453857, + "learning_rate": 7.384998607036922e-07, + "loss": 0.7615, + "step": 19170 + }, + { + "epoch": 0.29852024255256215, + "grad_norm": 3.279327630996704, + "learning_rate": 7.384179216990871e-07, + "loss": 0.7374, + "step": 19175 + }, + { + "epoch": 0.2985980835545315, + "grad_norm": 4.456769943237305, + "learning_rate": 7.383359826944822e-07, + "loss": 0.7688, + "step": 19180 + }, + { + "epoch": 0.2986759245565009, + "grad_norm": 5.216207981109619, + "learning_rate": 7.382540436898772e-07, + "loss": 0.7678, + "step": 19185 + }, + { + "epoch": 0.2987537655584703, + "grad_norm": 5.643771648406982, + "learning_rate": 7.381721046852722e-07, + "loss": 0.7999, + "step": 19190 + }, + { + "epoch": 0.29883160656043967, + "grad_norm": 7.622976779937744, + "learning_rate": 7.380901656806673e-07, + "loss": 0.6539, + "step": 19195 + }, + { + "epoch": 0.298909447562409, + "grad_norm": 3.570014476776123, + "learning_rate": 7.380082266760624e-07, + "loss": 0.8342, + "step": 19200 + }, + { + "epoch": 0.2989872885643784, + "grad_norm": 3.948180913925171, + "learning_rate": 7.379262876714573e-07, + "loss": 0.656, + "step": 19205 + }, + { + "epoch": 0.2990651295663478, + "grad_norm": 4.40153169631958, + "learning_rate": 7.378443486668524e-07, + "loss": 0.812, + "step": 19210 + }, + { + "epoch": 0.2991429705683172, + "grad_norm": 4.24730110168457, + "learning_rate": 7.377624096622474e-07, + "loss": 0.6723, + "step": 19215 + }, + { + "epoch": 0.2992208115702865, + "grad_norm": 2.9068751335144043, + "learning_rate": 7.376804706576423e-07, + "loss": 0.7985, + "step": 19220 + }, + { + "epoch": 0.2992986525722559, + "grad_norm": 3.062253475189209, + "learning_rate": 7.375985316530374e-07, + "loss": 0.8082, + "step": 19225 + }, + { + "epoch": 0.2993764935742253, + "grad_norm": 2.8117597103118896, + "learning_rate": 7.375165926484325e-07, + "loss": 0.7478, + "step": 19230 + }, + { + "epoch": 0.2994543345761947, + "grad_norm": 3.2682876586914062, + "learning_rate": 7.374346536438275e-07, + "loss": 0.7893, + "step": 19235 + }, + { + "epoch": 0.29953217557816403, + "grad_norm": 5.783231258392334, + "learning_rate": 7.373527146392225e-07, + "loss": 0.7702, + "step": 19240 + }, + { + "epoch": 0.2996100165801334, + "grad_norm": 4.528698444366455, + "learning_rate": 7.372707756346176e-07, + "loss": 0.7292, + "step": 19245 + }, + { + "epoch": 0.2996878575821028, + "grad_norm": 3.2343766689300537, + "learning_rate": 7.371888366300126e-07, + "loss": 0.7593, + "step": 19250 + }, + { + "epoch": 0.29976569858407215, + "grad_norm": 3.411930799484253, + "learning_rate": 7.371068976254076e-07, + "loss": 0.8373, + "step": 19255 + }, + { + "epoch": 0.29984353958604154, + "grad_norm": 4.680840969085693, + "learning_rate": 7.370249586208026e-07, + "loss": 0.8613, + "step": 19260 + }, + { + "epoch": 0.29992138058801093, + "grad_norm": 2.9823901653289795, + "learning_rate": 7.369430196161976e-07, + "loss": 0.7563, + "step": 19265 + }, + { + "epoch": 0.2999992215899803, + "grad_norm": 9.004485130310059, + "learning_rate": 7.368610806115927e-07, + "loss": 0.7912, + "step": 19270 + }, + { + "epoch": 0.30007706259194966, + "grad_norm": 4.095830917358398, + "learning_rate": 7.367791416069877e-07, + "loss": 0.7193, + "step": 19275 + }, + { + "epoch": 0.30015490359391905, + "grad_norm": 15.25143814086914, + "learning_rate": 7.366972026023828e-07, + "loss": 0.6691, + "step": 19280 + }, + { + "epoch": 0.30023274459588845, + "grad_norm": 2.6462841033935547, + "learning_rate": 7.366152635977778e-07, + "loss": 0.8597, + "step": 19285 + }, + { + "epoch": 0.30031058559785784, + "grad_norm": 3.802515983581543, + "learning_rate": 7.365333245931729e-07, + "loss": 0.9042, + "step": 19290 + }, + { + "epoch": 0.3003884265998272, + "grad_norm": 2.9349098205566406, + "learning_rate": 7.364513855885678e-07, + "loss": 0.7542, + "step": 19295 + }, + { + "epoch": 0.30046626760179657, + "grad_norm": 2.784654378890991, + "learning_rate": 7.363694465839628e-07, + "loss": 0.7769, + "step": 19300 + }, + { + "epoch": 0.30054410860376596, + "grad_norm": 5.91815710067749, + "learning_rate": 7.362875075793579e-07, + "loss": 0.7659, + "step": 19305 + }, + { + "epoch": 0.30062194960573535, + "grad_norm": 6.340529441833496, + "learning_rate": 7.36205568574753e-07, + "loss": 0.7498, + "step": 19310 + }, + { + "epoch": 0.3006997906077047, + "grad_norm": 3.2137279510498047, + "learning_rate": 7.361236295701479e-07, + "loss": 0.7722, + "step": 19315 + }, + { + "epoch": 0.3007776316096741, + "grad_norm": 6.5689473152160645, + "learning_rate": 7.36041690565543e-07, + "loss": 0.7377, + "step": 19320 + }, + { + "epoch": 0.30085547261164347, + "grad_norm": 2.988903760910034, + "learning_rate": 7.359597515609381e-07, + "loss": 0.7134, + "step": 19325 + }, + { + "epoch": 0.3009333136136128, + "grad_norm": 4.176788330078125, + "learning_rate": 7.35877812556333e-07, + "loss": 0.8399, + "step": 19330 + }, + { + "epoch": 0.3010111546155822, + "grad_norm": 3.5837290287017822, + "learning_rate": 7.35795873551728e-07, + "loss": 0.7243, + "step": 19335 + }, + { + "epoch": 0.3010889956175516, + "grad_norm": 6.051422119140625, + "learning_rate": 7.357139345471231e-07, + "loss": 0.7335, + "step": 19340 + }, + { + "epoch": 0.301166836619521, + "grad_norm": 3.454967498779297, + "learning_rate": 7.35631995542518e-07, + "loss": 0.7165, + "step": 19345 + }, + { + "epoch": 0.3012446776214903, + "grad_norm": 3.151862621307373, + "learning_rate": 7.355500565379131e-07, + "loss": 0.8532, + "step": 19350 + }, + { + "epoch": 0.3013225186234597, + "grad_norm": 6.889316558837891, + "learning_rate": 7.354681175333082e-07, + "loss": 0.6624, + "step": 19355 + }, + { + "epoch": 0.3014003596254291, + "grad_norm": 6.973013877868652, + "learning_rate": 7.353861785287032e-07, + "loss": 0.7738, + "step": 19360 + }, + { + "epoch": 0.3014782006273985, + "grad_norm": 3.1961417198181152, + "learning_rate": 7.353042395240982e-07, + "loss": 0.8734, + "step": 19365 + }, + { + "epoch": 0.30155604162936783, + "grad_norm": 4.515864372253418, + "learning_rate": 7.352223005194933e-07, + "loss": 0.8756, + "step": 19370 + }, + { + "epoch": 0.3016338826313372, + "grad_norm": 6.735131740570068, + "learning_rate": 7.351403615148882e-07, + "loss": 0.6706, + "step": 19375 + }, + { + "epoch": 0.3017117236333066, + "grad_norm": 5.002194881439209, + "learning_rate": 7.350584225102833e-07, + "loss": 0.7529, + "step": 19380 + }, + { + "epoch": 0.301789564635276, + "grad_norm": 6.737076759338379, + "learning_rate": 7.349764835056783e-07, + "loss": 0.6168, + "step": 19385 + }, + { + "epoch": 0.30186740563724535, + "grad_norm": 4.550383567810059, + "learning_rate": 7.348945445010733e-07, + "loss": 0.7475, + "step": 19390 + }, + { + "epoch": 0.30194524663921474, + "grad_norm": 6.845092296600342, + "learning_rate": 7.348126054964684e-07, + "loss": 0.7623, + "step": 19395 + }, + { + "epoch": 0.30202308764118413, + "grad_norm": 2.750274896621704, + "learning_rate": 7.347306664918635e-07, + "loss": 0.8408, + "step": 19400 + }, + { + "epoch": 0.3021009286431535, + "grad_norm": 3.3687939643859863, + "learning_rate": 7.346487274872585e-07, + "loss": 0.7362, + "step": 19405 + }, + { + "epoch": 0.30217876964512286, + "grad_norm": 4.238241195678711, + "learning_rate": 7.345667884826535e-07, + "loss": 0.7712, + "step": 19410 + }, + { + "epoch": 0.30225661064709225, + "grad_norm": 3.3636481761932373, + "learning_rate": 7.344848494780486e-07, + "loss": 0.8209, + "step": 19415 + }, + { + "epoch": 0.30233445164906164, + "grad_norm": 3.0912439823150635, + "learning_rate": 7.344029104734435e-07, + "loss": 0.7543, + "step": 19420 + }, + { + "epoch": 0.302412292651031, + "grad_norm": 4.076699733734131, + "learning_rate": 7.343209714688385e-07, + "loss": 0.7493, + "step": 19425 + }, + { + "epoch": 0.30249013365300037, + "grad_norm": 3.4594085216522217, + "learning_rate": 7.342390324642336e-07, + "loss": 0.7467, + "step": 19430 + }, + { + "epoch": 0.30256797465496976, + "grad_norm": 4.686057090759277, + "learning_rate": 7.341570934596287e-07, + "loss": 0.8548, + "step": 19435 + }, + { + "epoch": 0.30264581565693915, + "grad_norm": 2.9473624229431152, + "learning_rate": 7.340751544550236e-07, + "loss": 0.7667, + "step": 19440 + }, + { + "epoch": 0.3027236566589085, + "grad_norm": 2.9822275638580322, + "learning_rate": 7.339932154504187e-07, + "loss": 0.806, + "step": 19445 + }, + { + "epoch": 0.3028014976608779, + "grad_norm": 3.60502552986145, + "learning_rate": 7.339112764458138e-07, + "loss": 0.6964, + "step": 19450 + }, + { + "epoch": 0.3028793386628473, + "grad_norm": 4.586121082305908, + "learning_rate": 7.338293374412087e-07, + "loss": 0.7756, + "step": 19455 + }, + { + "epoch": 0.30295717966481667, + "grad_norm": 4.052720069885254, + "learning_rate": 7.337473984366037e-07, + "loss": 0.7412, + "step": 19460 + }, + { + "epoch": 0.303035020666786, + "grad_norm": 3.488071918487549, + "learning_rate": 7.336654594319988e-07, + "loss": 0.7772, + "step": 19465 + }, + { + "epoch": 0.3031128616687554, + "grad_norm": 3.216257333755493, + "learning_rate": 7.335835204273938e-07, + "loss": 0.8047, + "step": 19470 + }, + { + "epoch": 0.3031907026707248, + "grad_norm": 2.9577155113220215, + "learning_rate": 7.335015814227888e-07, + "loss": 0.77, + "step": 19475 + }, + { + "epoch": 0.3032685436726942, + "grad_norm": 3.1843841075897217, + "learning_rate": 7.334196424181839e-07, + "loss": 0.8028, + "step": 19480 + }, + { + "epoch": 0.3033463846746635, + "grad_norm": 5.264181137084961, + "learning_rate": 7.333377034135789e-07, + "loss": 0.7464, + "step": 19485 + }, + { + "epoch": 0.3034242256766329, + "grad_norm": 6.033514976501465, + "learning_rate": 7.33255764408974e-07, + "loss": 0.9238, + "step": 19490 + }, + { + "epoch": 0.3035020666786023, + "grad_norm": 3.4538733959198, + "learning_rate": 7.33173825404369e-07, + "loss": 0.8628, + "step": 19495 + }, + { + "epoch": 0.30357990768057164, + "grad_norm": 6.510023593902588, + "learning_rate": 7.330918863997639e-07, + "loss": 0.6936, + "step": 19500 + }, + { + "epoch": 0.30365774868254103, + "grad_norm": 6.218939304351807, + "learning_rate": 7.33009947395159e-07, + "loss": 0.7854, + "step": 19505 + }, + { + "epoch": 0.3037355896845104, + "grad_norm": 3.971550464630127, + "learning_rate": 7.32928008390554e-07, + "loss": 0.9312, + "step": 19510 + }, + { + "epoch": 0.3038134306864798, + "grad_norm": 3.35803484916687, + "learning_rate": 7.32846069385949e-07, + "loss": 0.7496, + "step": 19515 + }, + { + "epoch": 0.30389127168844915, + "grad_norm": 4.602055549621582, + "learning_rate": 7.327641303813441e-07, + "loss": 0.7642, + "step": 19520 + }, + { + "epoch": 0.30396911269041854, + "grad_norm": 3.0348055362701416, + "learning_rate": 7.326821913767392e-07, + "loss": 0.7273, + "step": 19525 + }, + { + "epoch": 0.30404695369238793, + "grad_norm": 4.904761791229248, + "learning_rate": 7.326002523721342e-07, + "loss": 0.715, + "step": 19530 + }, + { + "epoch": 0.3041247946943573, + "grad_norm": 7.011703968048096, + "learning_rate": 7.325183133675292e-07, + "loss": 0.7044, + "step": 19535 + }, + { + "epoch": 0.30420263569632666, + "grad_norm": 7.0719895362854, + "learning_rate": 7.324363743629242e-07, + "loss": 0.7905, + "step": 19540 + }, + { + "epoch": 0.30428047669829605, + "grad_norm": 5.640370845794678, + "learning_rate": 7.323544353583192e-07, + "loss": 0.8111, + "step": 19545 + }, + { + "epoch": 0.30435831770026545, + "grad_norm": 3.0712015628814697, + "learning_rate": 7.322724963537142e-07, + "loss": 0.7576, + "step": 19550 + }, + { + "epoch": 0.30443615870223484, + "grad_norm": 3.424154043197632, + "learning_rate": 7.321905573491093e-07, + "loss": 0.8453, + "step": 19555 + }, + { + "epoch": 0.3045139997042042, + "grad_norm": 3.3583383560180664, + "learning_rate": 7.321086183445044e-07, + "loss": 0.7752, + "step": 19560 + }, + { + "epoch": 0.30459184070617357, + "grad_norm": 4.696355819702148, + "learning_rate": 7.320266793398993e-07, + "loss": 0.7201, + "step": 19565 + }, + { + "epoch": 0.30466968170814296, + "grad_norm": 7.2936859130859375, + "learning_rate": 7.319447403352944e-07, + "loss": 0.8347, + "step": 19570 + }, + { + "epoch": 0.30474752271011235, + "grad_norm": 4.277551651000977, + "learning_rate": 7.318628013306895e-07, + "loss": 0.718, + "step": 19575 + }, + { + "epoch": 0.3048253637120817, + "grad_norm": 5.286831855773926, + "learning_rate": 7.317808623260843e-07, + "loss": 0.747, + "step": 19580 + }, + { + "epoch": 0.3049032047140511, + "grad_norm": 4.06176233291626, + "learning_rate": 7.316989233214794e-07, + "loss": 0.7597, + "step": 19585 + }, + { + "epoch": 0.30498104571602047, + "grad_norm": 5.223543643951416, + "learning_rate": 7.316169843168745e-07, + "loss": 0.7336, + "step": 19590 + }, + { + "epoch": 0.3050588867179898, + "grad_norm": 3.522918939590454, + "learning_rate": 7.315350453122695e-07, + "loss": 0.7236, + "step": 19595 + }, + { + "epoch": 0.3051367277199592, + "grad_norm": 2.9740731716156006, + "learning_rate": 7.314531063076645e-07, + "loss": 0.7833, + "step": 19600 + }, + { + "epoch": 0.3052145687219286, + "grad_norm": 3.410181760787964, + "learning_rate": 7.313711673030596e-07, + "loss": 0.8845, + "step": 19605 + }, + { + "epoch": 0.305292409723898, + "grad_norm": 4.243725299835205, + "learning_rate": 7.312892282984546e-07, + "loss": 0.7387, + "step": 19610 + }, + { + "epoch": 0.3053702507258673, + "grad_norm": 3.667628526687622, + "learning_rate": 7.312072892938497e-07, + "loss": 0.8079, + "step": 19615 + }, + { + "epoch": 0.3054480917278367, + "grad_norm": 3.6076266765594482, + "learning_rate": 7.311253502892446e-07, + "loss": 0.7033, + "step": 19620 + }, + { + "epoch": 0.3055259327298061, + "grad_norm": 5.2530999183654785, + "learning_rate": 7.310434112846396e-07, + "loss": 0.6888, + "step": 19625 + }, + { + "epoch": 0.3056037737317755, + "grad_norm": 6.23611307144165, + "learning_rate": 7.309614722800347e-07, + "loss": 0.8028, + "step": 19630 + }, + { + "epoch": 0.30568161473374483, + "grad_norm": 3.4382402896881104, + "learning_rate": 7.308795332754297e-07, + "loss": 0.7078, + "step": 19635 + }, + { + "epoch": 0.3057594557357142, + "grad_norm": 5.48118782043457, + "learning_rate": 7.307975942708247e-07, + "loss": 0.7961, + "step": 19640 + }, + { + "epoch": 0.3058372967376836, + "grad_norm": 6.199230670928955, + "learning_rate": 7.307156552662198e-07, + "loss": 0.7278, + "step": 19645 + }, + { + "epoch": 0.305915137739653, + "grad_norm": 14.622966766357422, + "learning_rate": 7.306337162616149e-07, + "loss": 0.7, + "step": 19650 + }, + { + "epoch": 0.30599297874162235, + "grad_norm": 6.439300060272217, + "learning_rate": 7.305517772570099e-07, + "loss": 0.8521, + "step": 19655 + }, + { + "epoch": 0.30607081974359174, + "grad_norm": 4.3287577629089355, + "learning_rate": 7.304698382524048e-07, + "loss": 0.6632, + "step": 19660 + }, + { + "epoch": 0.30614866074556113, + "grad_norm": 5.56475305557251, + "learning_rate": 7.303878992477999e-07, + "loss": 0.7035, + "step": 19665 + }, + { + "epoch": 0.3062265017475305, + "grad_norm": 2.9900405406951904, + "learning_rate": 7.30305960243195e-07, + "loss": 0.5952, + "step": 19670 + }, + { + "epoch": 0.30630434274949986, + "grad_norm": 3.227675676345825, + "learning_rate": 7.302240212385899e-07, + "loss": 0.8055, + "step": 19675 + }, + { + "epoch": 0.30638218375146925, + "grad_norm": 2.7153420448303223, + "learning_rate": 7.30142082233985e-07, + "loss": 0.7083, + "step": 19680 + }, + { + "epoch": 0.30646002475343864, + "grad_norm": 2.3255703449249268, + "learning_rate": 7.300601432293801e-07, + "loss": 0.7237, + "step": 19685 + }, + { + "epoch": 0.306537865755408, + "grad_norm": 3.549863576889038, + "learning_rate": 7.29978204224775e-07, + "loss": 0.69, + "step": 19690 + }, + { + "epoch": 0.30661570675737737, + "grad_norm": 3.5451266765594482, + "learning_rate": 7.298962652201701e-07, + "loss": 0.8929, + "step": 19695 + }, + { + "epoch": 0.30669354775934676, + "grad_norm": 6.672890663146973, + "learning_rate": 7.298143262155651e-07, + "loss": 0.7899, + "step": 19700 + }, + { + "epoch": 0.30677138876131615, + "grad_norm": 5.6110429763793945, + "learning_rate": 7.297323872109601e-07, + "loss": 0.7105, + "step": 19705 + }, + { + "epoch": 0.3068492297632855, + "grad_norm": 3.9284863471984863, + "learning_rate": 7.296504482063551e-07, + "loss": 0.6883, + "step": 19710 + }, + { + "epoch": 0.3069270707652549, + "grad_norm": 3.8036837577819824, + "learning_rate": 7.295685092017502e-07, + "loss": 0.7643, + "step": 19715 + }, + { + "epoch": 0.3070049117672243, + "grad_norm": 5.060457706451416, + "learning_rate": 7.294865701971452e-07, + "loss": 0.7906, + "step": 19720 + }, + { + "epoch": 0.30708275276919367, + "grad_norm": 3.4861080646514893, + "learning_rate": 7.294046311925402e-07, + "loss": 0.7877, + "step": 19725 + }, + { + "epoch": 0.307160593771163, + "grad_norm": 5.678596496582031, + "learning_rate": 7.293226921879353e-07, + "loss": 0.7542, + "step": 19730 + }, + { + "epoch": 0.3072384347731324, + "grad_norm": 3.183572769165039, + "learning_rate": 7.292407531833303e-07, + "loss": 0.8266, + "step": 19735 + }, + { + "epoch": 0.3073162757751018, + "grad_norm": 3.0133581161499023, + "learning_rate": 7.291588141787254e-07, + "loss": 0.8507, + "step": 19740 + }, + { + "epoch": 0.3073941167770712, + "grad_norm": 5.432462215423584, + "learning_rate": 7.290768751741203e-07, + "loss": 0.8463, + "step": 19745 + }, + { + "epoch": 0.3074719577790405, + "grad_norm": 4.709221363067627, + "learning_rate": 7.289949361695153e-07, + "loss": 0.7677, + "step": 19750 + }, + { + "epoch": 0.3075497987810099, + "grad_norm": 6.288868427276611, + "learning_rate": 7.289129971649104e-07, + "loss": 0.8172, + "step": 19755 + }, + { + "epoch": 0.3076276397829793, + "grad_norm": 3.6717214584350586, + "learning_rate": 7.288310581603055e-07, + "loss": 0.7405, + "step": 19760 + }, + { + "epoch": 0.30770548078494864, + "grad_norm": 3.573765277862549, + "learning_rate": 7.287491191557004e-07, + "loss": 0.7495, + "step": 19765 + }, + { + "epoch": 0.30778332178691803, + "grad_norm": 3.3548150062561035, + "learning_rate": 7.286671801510955e-07, + "loss": 0.7931, + "step": 19770 + }, + { + "epoch": 0.3078611627888874, + "grad_norm": 4.807186126708984, + "learning_rate": 7.285852411464906e-07, + "loss": 0.6837, + "step": 19775 + }, + { + "epoch": 0.3079390037908568, + "grad_norm": 4.181824684143066, + "learning_rate": 7.285033021418856e-07, + "loss": 0.8501, + "step": 19780 + }, + { + "epoch": 0.30801684479282615, + "grad_norm": 3.6679091453552246, + "learning_rate": 7.284213631372805e-07, + "loss": 0.7887, + "step": 19785 + }, + { + "epoch": 0.30809468579479554, + "grad_norm": 3.1990134716033936, + "learning_rate": 7.283394241326756e-07, + "loss": 0.7741, + "step": 19790 + }, + { + "epoch": 0.30817252679676493, + "grad_norm": 2.814786195755005, + "learning_rate": 7.282574851280707e-07, + "loss": 0.7523, + "step": 19795 + }, + { + "epoch": 0.3082503677987343, + "grad_norm": 6.019982814788818, + "learning_rate": 7.281755461234656e-07, + "loss": 0.8319, + "step": 19800 + }, + { + "epoch": 0.30832820880070366, + "grad_norm": 9.461308479309082, + "learning_rate": 7.280936071188607e-07, + "loss": 0.7141, + "step": 19805 + }, + { + "epoch": 0.30840604980267305, + "grad_norm": 5.778023719787598, + "learning_rate": 7.280116681142558e-07, + "loss": 0.6953, + "step": 19810 + }, + { + "epoch": 0.30848389080464245, + "grad_norm": 4.680640697479248, + "learning_rate": 7.279297291096507e-07, + "loss": 0.9028, + "step": 19815 + }, + { + "epoch": 0.30856173180661184, + "grad_norm": 5.0716400146484375, + "learning_rate": 7.278477901050458e-07, + "loss": 0.6978, + "step": 19820 + }, + { + "epoch": 0.3086395728085812, + "grad_norm": 5.249613285064697, + "learning_rate": 7.277658511004408e-07, + "loss": 0.8705, + "step": 19825 + }, + { + "epoch": 0.30871741381055057, + "grad_norm": 3.1706202030181885, + "learning_rate": 7.276839120958358e-07, + "loss": 0.7233, + "step": 19830 + }, + { + "epoch": 0.30879525481251996, + "grad_norm": 2.3790881633758545, + "learning_rate": 7.276019730912308e-07, + "loss": 0.7208, + "step": 19835 + }, + { + "epoch": 0.30887309581448935, + "grad_norm": 5.737145900726318, + "learning_rate": 7.275200340866259e-07, + "loss": 0.6923, + "step": 19840 + }, + { + "epoch": 0.3089509368164587, + "grad_norm": 3.3187997341156006, + "learning_rate": 7.274380950820209e-07, + "loss": 0.7003, + "step": 19845 + }, + { + "epoch": 0.3090287778184281, + "grad_norm": 3.5388314723968506, + "learning_rate": 7.27356156077416e-07, + "loss": 0.7766, + "step": 19850 + }, + { + "epoch": 0.30910661882039747, + "grad_norm": 6.672298908233643, + "learning_rate": 7.27274217072811e-07, + "loss": 0.8126, + "step": 19855 + }, + { + "epoch": 0.3091844598223668, + "grad_norm": 4.5849080085754395, + "learning_rate": 7.27192278068206e-07, + "loss": 0.8502, + "step": 19860 + }, + { + "epoch": 0.3092623008243362, + "grad_norm": 8.410574913024902, + "learning_rate": 7.27110339063601e-07, + "loss": 0.7512, + "step": 19865 + }, + { + "epoch": 0.3093401418263056, + "grad_norm": 8.312150001525879, + "learning_rate": 7.27028400058996e-07, + "loss": 0.6793, + "step": 19870 + }, + { + "epoch": 0.309417982828275, + "grad_norm": 5.622318267822266, + "learning_rate": 7.26946461054391e-07, + "loss": 0.7669, + "step": 19875 + }, + { + "epoch": 0.3094958238302443, + "grad_norm": 3.0122766494750977, + "learning_rate": 7.268645220497861e-07, + "loss": 0.7512, + "step": 19880 + }, + { + "epoch": 0.3095736648322137, + "grad_norm": 4.408302307128906, + "learning_rate": 7.267825830451812e-07, + "loss": 0.7914, + "step": 19885 + }, + { + "epoch": 0.3096515058341831, + "grad_norm": 12.037958145141602, + "learning_rate": 7.267006440405762e-07, + "loss": 0.8234, + "step": 19890 + }, + { + "epoch": 0.3097293468361525, + "grad_norm": 3.986621618270874, + "learning_rate": 7.266187050359712e-07, + "loss": 0.8296, + "step": 19895 + }, + { + "epoch": 0.30980718783812183, + "grad_norm": 6.8099212646484375, + "learning_rate": 7.265367660313663e-07, + "loss": 0.6782, + "step": 19900 + }, + { + "epoch": 0.3098850288400912, + "grad_norm": 5.6810479164123535, + "learning_rate": 7.264548270267612e-07, + "loss": 0.6785, + "step": 19905 + }, + { + "epoch": 0.3099628698420606, + "grad_norm": 3.6507179737091064, + "learning_rate": 7.263728880221562e-07, + "loss": 0.8, + "step": 19910 + }, + { + "epoch": 0.31004071084403, + "grad_norm": 3.3082025051116943, + "learning_rate": 7.262909490175513e-07, + "loss": 0.742, + "step": 19915 + }, + { + "epoch": 0.31011855184599935, + "grad_norm": 4.687744140625, + "learning_rate": 7.262090100129464e-07, + "loss": 0.722, + "step": 19920 + }, + { + "epoch": 0.31019639284796874, + "grad_norm": 4.227119445800781, + "learning_rate": 7.261270710083413e-07, + "loss": 0.8008, + "step": 19925 + }, + { + "epoch": 0.31027423384993813, + "grad_norm": 5.139937877655029, + "learning_rate": 7.260451320037364e-07, + "loss": 0.8128, + "step": 19930 + }, + { + "epoch": 0.31035207485190747, + "grad_norm": 4.09891414642334, + "learning_rate": 7.259631929991315e-07, + "loss": 0.7716, + "step": 19935 + }, + { + "epoch": 0.31042991585387686, + "grad_norm": 3.9586713314056396, + "learning_rate": 7.258812539945265e-07, + "loss": 0.7492, + "step": 19940 + }, + { + "epoch": 0.31050775685584625, + "grad_norm": 3.3138248920440674, + "learning_rate": 7.257993149899214e-07, + "loss": 0.8005, + "step": 19945 + }, + { + "epoch": 0.31058559785781564, + "grad_norm": 3.6728568077087402, + "learning_rate": 7.257173759853165e-07, + "loss": 0.7757, + "step": 19950 + }, + { + "epoch": 0.310663438859785, + "grad_norm": 3.5933239459991455, + "learning_rate": 7.256354369807115e-07, + "loss": 0.8814, + "step": 19955 + }, + { + "epoch": 0.31074127986175437, + "grad_norm": 3.2425715923309326, + "learning_rate": 7.255534979761065e-07, + "loss": 0.7549, + "step": 19960 + }, + { + "epoch": 0.31081912086372376, + "grad_norm": 7.56531286239624, + "learning_rate": 7.254715589715016e-07, + "loss": 0.7529, + "step": 19965 + }, + { + "epoch": 0.31089696186569316, + "grad_norm": 3.339479923248291, + "learning_rate": 7.253896199668966e-07, + "loss": 0.8662, + "step": 19970 + }, + { + "epoch": 0.3109748028676625, + "grad_norm": 3.970248222351074, + "learning_rate": 7.253076809622917e-07, + "loss": 0.8382, + "step": 19975 + }, + { + "epoch": 0.3110526438696319, + "grad_norm": 6.586206912994385, + "learning_rate": 7.252257419576867e-07, + "loss": 0.656, + "step": 19980 + }, + { + "epoch": 0.3111304848716013, + "grad_norm": 4.172905445098877, + "learning_rate": 7.251438029530816e-07, + "loss": 0.8625, + "step": 19985 + }, + { + "epoch": 0.31120832587357067, + "grad_norm": 3.187283754348755, + "learning_rate": 7.250618639484767e-07, + "loss": 0.7319, + "step": 19990 + }, + { + "epoch": 0.31128616687554, + "grad_norm": 3.5772223472595215, + "learning_rate": 7.249799249438717e-07, + "loss": 0.8214, + "step": 19995 + }, + { + "epoch": 0.3113640078775094, + "grad_norm": 3.162219762802124, + "learning_rate": 7.248979859392667e-07, + "loss": 0.7177, + "step": 20000 + }, + { + "epoch": 0.3114418488794788, + "grad_norm": 4.707995891571045, + "learning_rate": 7.248160469346618e-07, + "loss": 0.7666, + "step": 20005 + }, + { + "epoch": 0.3115196898814482, + "grad_norm": 4.667968273162842, + "learning_rate": 7.247341079300569e-07, + "loss": 0.877, + "step": 20010 + }, + { + "epoch": 0.3115975308834175, + "grad_norm": 8.888596534729004, + "learning_rate": 7.246521689254519e-07, + "loss": 0.7889, + "step": 20015 + }, + { + "epoch": 0.3116753718853869, + "grad_norm": 4.097687721252441, + "learning_rate": 7.245702299208469e-07, + "loss": 0.6785, + "step": 20020 + }, + { + "epoch": 0.3117532128873563, + "grad_norm": 5.855955600738525, + "learning_rate": 7.244882909162419e-07, + "loss": 0.7627, + "step": 20025 + }, + { + "epoch": 0.31183105388932564, + "grad_norm": 2.3836493492126465, + "learning_rate": 7.24406351911637e-07, + "loss": 0.6925, + "step": 20030 + }, + { + "epoch": 0.31190889489129503, + "grad_norm": 4.272909641265869, + "learning_rate": 7.243244129070319e-07, + "loss": 0.7573, + "step": 20035 + }, + { + "epoch": 0.3119867358932644, + "grad_norm": 8.327914237976074, + "learning_rate": 7.24242473902427e-07, + "loss": 0.8195, + "step": 20040 + }, + { + "epoch": 0.3120645768952338, + "grad_norm": 5.052828788757324, + "learning_rate": 7.241605348978221e-07, + "loss": 0.9444, + "step": 20045 + }, + { + "epoch": 0.31214241789720315, + "grad_norm": 6.556353569030762, + "learning_rate": 7.24078595893217e-07, + "loss": 0.6907, + "step": 20050 + }, + { + "epoch": 0.31222025889917254, + "grad_norm": 4.166321754455566, + "learning_rate": 7.239966568886121e-07, + "loss": 0.7356, + "step": 20055 + }, + { + "epoch": 0.31229809990114193, + "grad_norm": 3.270192861557007, + "learning_rate": 7.239147178840072e-07, + "loss": 0.7642, + "step": 20060 + }, + { + "epoch": 0.3123759409031113, + "grad_norm": 21.332584381103516, + "learning_rate": 7.238327788794022e-07, + "loss": 0.7392, + "step": 20065 + }, + { + "epoch": 0.31245378190508066, + "grad_norm": 3.567343235015869, + "learning_rate": 7.237508398747971e-07, + "loss": 0.7774, + "step": 20070 + }, + { + "epoch": 0.31253162290705006, + "grad_norm": 4.1884307861328125, + "learning_rate": 7.236689008701922e-07, + "loss": 0.8854, + "step": 20075 + }, + { + "epoch": 0.31260946390901945, + "grad_norm": 3.5007705688476562, + "learning_rate": 7.235869618655872e-07, + "loss": 0.6666, + "step": 20080 + }, + { + "epoch": 0.31268730491098884, + "grad_norm": 4.026138782501221, + "learning_rate": 7.235050228609822e-07, + "loss": 0.8585, + "step": 20085 + }, + { + "epoch": 0.3127651459129582, + "grad_norm": 3.0517191886901855, + "learning_rate": 7.234230838563773e-07, + "loss": 0.8152, + "step": 20090 + }, + { + "epoch": 0.31284298691492757, + "grad_norm": 8.352131843566895, + "learning_rate": 7.233411448517723e-07, + "loss": 0.7127, + "step": 20095 + }, + { + "epoch": 0.31292082791689696, + "grad_norm": 2.481409788131714, + "learning_rate": 7.232592058471674e-07, + "loss": 0.713, + "step": 20100 + }, + { + "epoch": 0.3129986689188663, + "grad_norm": 9.404962539672852, + "learning_rate": 7.231772668425624e-07, + "loss": 0.8564, + "step": 20105 + }, + { + "epoch": 0.3130765099208357, + "grad_norm": 3.086458444595337, + "learning_rate": 7.230953278379573e-07, + "loss": 0.704, + "step": 20110 + }, + { + "epoch": 0.3131543509228051, + "grad_norm": 4.2751240730285645, + "learning_rate": 7.230133888333524e-07, + "loss": 0.7979, + "step": 20115 + }, + { + "epoch": 0.3132321919247745, + "grad_norm": 3.2433555126190186, + "learning_rate": 7.229314498287475e-07, + "loss": 0.7739, + "step": 20120 + }, + { + "epoch": 0.3133100329267438, + "grad_norm": 6.711822032928467, + "learning_rate": 7.228495108241424e-07, + "loss": 0.8249, + "step": 20125 + }, + { + "epoch": 0.3133878739287132, + "grad_norm": 5.886772632598877, + "learning_rate": 7.227675718195375e-07, + "loss": 0.8063, + "step": 20130 + }, + { + "epoch": 0.3134657149306826, + "grad_norm": 5.3925018310546875, + "learning_rate": 7.226856328149326e-07, + "loss": 0.8441, + "step": 20135 + }, + { + "epoch": 0.313543555932652, + "grad_norm": 3.970362663269043, + "learning_rate": 7.226036938103276e-07, + "loss": 0.7704, + "step": 20140 + }, + { + "epoch": 0.3136213969346213, + "grad_norm": 7.354496479034424, + "learning_rate": 7.225217548057226e-07, + "loss": 0.7853, + "step": 20145 + }, + { + "epoch": 0.3136992379365907, + "grad_norm": 4.8241190910339355, + "learning_rate": 7.224398158011176e-07, + "loss": 0.8676, + "step": 20150 + }, + { + "epoch": 0.3137770789385601, + "grad_norm": 5.4294867515563965, + "learning_rate": 7.223578767965127e-07, + "loss": 0.7949, + "step": 20155 + }, + { + "epoch": 0.3138549199405295, + "grad_norm": 6.615969657897949, + "learning_rate": 7.222759377919076e-07, + "loss": 0.7947, + "step": 20160 + }, + { + "epoch": 0.31393276094249883, + "grad_norm": 4.777083396911621, + "learning_rate": 7.221939987873027e-07, + "loss": 0.8299, + "step": 20165 + }, + { + "epoch": 0.3140106019444682, + "grad_norm": 4.006468296051025, + "learning_rate": 7.221120597826978e-07, + "loss": 0.6731, + "step": 20170 + }, + { + "epoch": 0.3140884429464376, + "grad_norm": 2.7182235717773438, + "learning_rate": 7.220301207780927e-07, + "loss": 0.682, + "step": 20175 + }, + { + "epoch": 0.314166283948407, + "grad_norm": 4.555752277374268, + "learning_rate": 7.219481817734878e-07, + "loss": 0.7247, + "step": 20180 + }, + { + "epoch": 0.31424412495037635, + "grad_norm": 3.7917845249176025, + "learning_rate": 7.218662427688829e-07, + "loss": 0.745, + "step": 20185 + }, + { + "epoch": 0.31432196595234574, + "grad_norm": 4.585085391998291, + "learning_rate": 7.217843037642778e-07, + "loss": 0.7629, + "step": 20190 + }, + { + "epoch": 0.31439980695431513, + "grad_norm": 4.027462005615234, + "learning_rate": 7.217023647596728e-07, + "loss": 0.7306, + "step": 20195 + }, + { + "epoch": 0.31447764795628447, + "grad_norm": 3.540985107421875, + "learning_rate": 7.216204257550679e-07, + "loss": 0.6836, + "step": 20200 + }, + { + "epoch": 0.31455548895825386, + "grad_norm": 4.648544788360596, + "learning_rate": 7.215384867504629e-07, + "loss": 0.7504, + "step": 20205 + }, + { + "epoch": 0.31463332996022325, + "grad_norm": 3.58345103263855, + "learning_rate": 7.21456547745858e-07, + "loss": 0.8156, + "step": 20210 + }, + { + "epoch": 0.31471117096219264, + "grad_norm": 6.841537952423096, + "learning_rate": 7.21374608741253e-07, + "loss": 0.7757, + "step": 20215 + }, + { + "epoch": 0.314789011964162, + "grad_norm": 3.252333879470825, + "learning_rate": 7.21292669736648e-07, + "loss": 0.7893, + "step": 20220 + }, + { + "epoch": 0.31486685296613137, + "grad_norm": 3.391663074493408, + "learning_rate": 7.212107307320431e-07, + "loss": 0.8041, + "step": 20225 + }, + { + "epoch": 0.31494469396810076, + "grad_norm": 3.951467990875244, + "learning_rate": 7.21128791727438e-07, + "loss": 0.8238, + "step": 20230 + }, + { + "epoch": 0.31502253497007016, + "grad_norm": 2.974583864212036, + "learning_rate": 7.21046852722833e-07, + "loss": 0.7669, + "step": 20235 + }, + { + "epoch": 0.3151003759720395, + "grad_norm": 7.49880313873291, + "learning_rate": 7.209649137182281e-07, + "loss": 0.768, + "step": 20240 + }, + { + "epoch": 0.3151782169740089, + "grad_norm": 3.172010660171509, + "learning_rate": 7.208829747136232e-07, + "loss": 0.7582, + "step": 20245 + }, + { + "epoch": 0.3152560579759783, + "grad_norm": 4.700904846191406, + "learning_rate": 7.208010357090181e-07, + "loss": 0.6729, + "step": 20250 + }, + { + "epoch": 0.31533389897794767, + "grad_norm": 6.512081146240234, + "learning_rate": 7.207190967044132e-07, + "loss": 0.7653, + "step": 20255 + }, + { + "epoch": 0.315411739979917, + "grad_norm": 3.5270817279815674, + "learning_rate": 7.206371576998083e-07, + "loss": 0.6887, + "step": 20260 + }, + { + "epoch": 0.3154895809818864, + "grad_norm": 7.409488201141357, + "learning_rate": 7.205552186952034e-07, + "loss": 0.8739, + "step": 20265 + }, + { + "epoch": 0.3155674219838558, + "grad_norm": 4.6055121421813965, + "learning_rate": 7.204732796905982e-07, + "loss": 0.722, + "step": 20270 + }, + { + "epoch": 0.3156452629858252, + "grad_norm": 3.539865016937256, + "learning_rate": 7.203913406859933e-07, + "loss": 0.7651, + "step": 20275 + }, + { + "epoch": 0.3157231039877945, + "grad_norm": 12.878244400024414, + "learning_rate": 7.203094016813884e-07, + "loss": 0.8434, + "step": 20280 + }, + { + "epoch": 0.3158009449897639, + "grad_norm": 3.4090824127197266, + "learning_rate": 7.202274626767833e-07, + "loss": 0.7503, + "step": 20285 + }, + { + "epoch": 0.3158787859917333, + "grad_norm": 3.430004119873047, + "learning_rate": 7.201455236721784e-07, + "loss": 0.7558, + "step": 20290 + }, + { + "epoch": 0.31595662699370264, + "grad_norm": 4.504459857940674, + "learning_rate": 7.200635846675735e-07, + "loss": 0.7472, + "step": 20295 + }, + { + "epoch": 0.31603446799567203, + "grad_norm": 7.216919422149658, + "learning_rate": 7.199816456629685e-07, + "loss": 0.6889, + "step": 20300 + }, + { + "epoch": 0.3161123089976414, + "grad_norm": 3.8278470039367676, + "learning_rate": 7.198997066583635e-07, + "loss": 0.7783, + "step": 20305 + }, + { + "epoch": 0.3161901499996108, + "grad_norm": 4.990563869476318, + "learning_rate": 7.198177676537585e-07, + "loss": 0.7194, + "step": 20310 + }, + { + "epoch": 0.31626799100158015, + "grad_norm": 7.858270168304443, + "learning_rate": 7.197358286491535e-07, + "loss": 0.7615, + "step": 20315 + }, + { + "epoch": 0.31634583200354954, + "grad_norm": 3.449202299118042, + "learning_rate": 7.196538896445485e-07, + "loss": 0.7311, + "step": 20320 + }, + { + "epoch": 0.31642367300551894, + "grad_norm": 3.601962089538574, + "learning_rate": 7.195719506399436e-07, + "loss": 0.8149, + "step": 20325 + }, + { + "epoch": 0.3165015140074883, + "grad_norm": 4.08453893661499, + "learning_rate": 7.194900116353386e-07, + "loss": 0.7266, + "step": 20330 + }, + { + "epoch": 0.31657935500945766, + "grad_norm": 3.3417105674743652, + "learning_rate": 7.194080726307337e-07, + "loss": 0.7956, + "step": 20335 + }, + { + "epoch": 0.31665719601142706, + "grad_norm": 3.159672975540161, + "learning_rate": 7.193261336261287e-07, + "loss": 0.7221, + "step": 20340 + }, + { + "epoch": 0.31673503701339645, + "grad_norm": 5.266695022583008, + "learning_rate": 7.192441946215237e-07, + "loss": 0.8936, + "step": 20345 + }, + { + "epoch": 0.31681287801536584, + "grad_norm": 2.914886474609375, + "learning_rate": 7.191622556169187e-07, + "loss": 0.9082, + "step": 20350 + }, + { + "epoch": 0.3168907190173352, + "grad_norm": 3.562744379043579, + "learning_rate": 7.190803166123138e-07, + "loss": 0.7991, + "step": 20355 + }, + { + "epoch": 0.31696856001930457, + "grad_norm": 8.86327838897705, + "learning_rate": 7.189983776077087e-07, + "loss": 0.7872, + "step": 20360 + }, + { + "epoch": 0.31704640102127396, + "grad_norm": 3.006682872772217, + "learning_rate": 7.189164386031038e-07, + "loss": 0.7675, + "step": 20365 + }, + { + "epoch": 0.3171242420232433, + "grad_norm": 6.521688461303711, + "learning_rate": 7.188344995984989e-07, + "loss": 0.7401, + "step": 20370 + }, + { + "epoch": 0.3172020830252127, + "grad_norm": 4.019016742706299, + "learning_rate": 7.187525605938938e-07, + "loss": 0.7771, + "step": 20375 + }, + { + "epoch": 0.3172799240271821, + "grad_norm": 3.9266228675842285, + "learning_rate": 7.186706215892889e-07, + "loss": 0.6257, + "step": 20380 + }, + { + "epoch": 0.3173577650291515, + "grad_norm": 4.3516621589660645, + "learning_rate": 7.18588682584684e-07, + "loss": 0.7592, + "step": 20385 + }, + { + "epoch": 0.3174356060311208, + "grad_norm": 4.869146823883057, + "learning_rate": 7.185067435800791e-07, + "loss": 0.7773, + "step": 20390 + }, + { + "epoch": 0.3175134470330902, + "grad_norm": 4.757040500640869, + "learning_rate": 7.184248045754739e-07, + "loss": 0.8382, + "step": 20395 + }, + { + "epoch": 0.3175912880350596, + "grad_norm": 4.194018363952637, + "learning_rate": 7.18342865570869e-07, + "loss": 0.6359, + "step": 20400 + }, + { + "epoch": 0.317669129037029, + "grad_norm": 3.9095518589019775, + "learning_rate": 7.182609265662641e-07, + "loss": 0.755, + "step": 20405 + }, + { + "epoch": 0.3177469700389983, + "grad_norm": 3.327188730239868, + "learning_rate": 7.18178987561659e-07, + "loss": 0.7284, + "step": 20410 + }, + { + "epoch": 0.3178248110409677, + "grad_norm": 3.2544198036193848, + "learning_rate": 7.180970485570541e-07, + "loss": 0.7143, + "step": 20415 + }, + { + "epoch": 0.3179026520429371, + "grad_norm": 5.302325248718262, + "learning_rate": 7.180151095524492e-07, + "loss": 0.7323, + "step": 20420 + }, + { + "epoch": 0.3179804930449065, + "grad_norm": 3.3627989292144775, + "learning_rate": 7.179331705478442e-07, + "loss": 0.7139, + "step": 20425 + }, + { + "epoch": 0.31805833404687583, + "grad_norm": 2.9275786876678467, + "learning_rate": 7.178512315432392e-07, + "loss": 0.8896, + "step": 20430 + }, + { + "epoch": 0.3181361750488452, + "grad_norm": 2.586864709854126, + "learning_rate": 7.177692925386342e-07, + "loss": 0.8718, + "step": 20435 + }, + { + "epoch": 0.3182140160508146, + "grad_norm": 3.3972079753875732, + "learning_rate": 7.176873535340292e-07, + "loss": 0.7542, + "step": 20440 + }, + { + "epoch": 0.318291857052784, + "grad_norm": 3.256925344467163, + "learning_rate": 7.176054145294243e-07, + "loss": 0.815, + "step": 20445 + }, + { + "epoch": 0.31836969805475335, + "grad_norm": 3.8902106285095215, + "learning_rate": 7.175234755248193e-07, + "loss": 0.8117, + "step": 20450 + }, + { + "epoch": 0.31844753905672274, + "grad_norm": 4.046694755554199, + "learning_rate": 7.174415365202143e-07, + "loss": 0.7104, + "step": 20455 + }, + { + "epoch": 0.31852538005869213, + "grad_norm": 7.7123918533325195, + "learning_rate": 7.173595975156094e-07, + "loss": 0.6584, + "step": 20460 + }, + { + "epoch": 0.31860322106066147, + "grad_norm": 4.0524516105651855, + "learning_rate": 7.172776585110044e-07, + "loss": 0.7106, + "step": 20465 + }, + { + "epoch": 0.31868106206263086, + "grad_norm": 2.6010935306549072, + "learning_rate": 7.171957195063994e-07, + "loss": 0.8609, + "step": 20470 + }, + { + "epoch": 0.31875890306460025, + "grad_norm": 3.557629346847534, + "learning_rate": 7.171137805017944e-07, + "loss": 0.7728, + "step": 20475 + }, + { + "epoch": 0.31883674406656964, + "grad_norm": 6.395180702209473, + "learning_rate": 7.170318414971895e-07, + "loss": 0.8742, + "step": 20480 + }, + { + "epoch": 0.318914585068539, + "grad_norm": 2.464134454727173, + "learning_rate": 7.169499024925844e-07, + "loss": 0.6831, + "step": 20485 + }, + { + "epoch": 0.3189924260705084, + "grad_norm": 3.003485679626465, + "learning_rate": 7.168679634879795e-07, + "loss": 0.7702, + "step": 20490 + }, + { + "epoch": 0.31907026707247776, + "grad_norm": 3.070361614227295, + "learning_rate": 7.167860244833746e-07, + "loss": 0.7564, + "step": 20495 + }, + { + "epoch": 0.31914810807444716, + "grad_norm": 2.900399684906006, + "learning_rate": 7.167040854787695e-07, + "loss": 0.8135, + "step": 20500 + }, + { + "epoch": 0.3192259490764165, + "grad_norm": 2.8308839797973633, + "learning_rate": 7.166221464741646e-07, + "loss": 0.6554, + "step": 20505 + }, + { + "epoch": 0.3193037900783859, + "grad_norm": 3.5729901790618896, + "learning_rate": 7.165402074695597e-07, + "loss": 0.7582, + "step": 20510 + }, + { + "epoch": 0.3193816310803553, + "grad_norm": 8.747566223144531, + "learning_rate": 7.164582684649546e-07, + "loss": 0.8207, + "step": 20515 + }, + { + "epoch": 0.31945947208232467, + "grad_norm": 3.617783784866333, + "learning_rate": 7.163763294603496e-07, + "loss": 0.808, + "step": 20520 + }, + { + "epoch": 0.319537313084294, + "grad_norm": 4.175931453704834, + "learning_rate": 7.162943904557447e-07, + "loss": 0.7721, + "step": 20525 + }, + { + "epoch": 0.3196151540862634, + "grad_norm": 3.4712469577789307, + "learning_rate": 7.162124514511398e-07, + "loss": 0.7462, + "step": 20530 + }, + { + "epoch": 0.3196929950882328, + "grad_norm": 2.555041790008545, + "learning_rate": 7.161305124465348e-07, + "loss": 0.7408, + "step": 20535 + }, + { + "epoch": 0.3197708360902021, + "grad_norm": 3.7030394077301025, + "learning_rate": 7.160485734419298e-07, + "loss": 0.8261, + "step": 20540 + }, + { + "epoch": 0.3198486770921715, + "grad_norm": 4.538588047027588, + "learning_rate": 7.159666344373249e-07, + "loss": 0.8366, + "step": 20545 + }, + { + "epoch": 0.3199265180941409, + "grad_norm": 4.802341938018799, + "learning_rate": 7.158846954327199e-07, + "loss": 0.92, + "step": 20550 + }, + { + "epoch": 0.3200043590961103, + "grad_norm": 2.9012575149536133, + "learning_rate": 7.158027564281148e-07, + "loss": 0.7722, + "step": 20555 + }, + { + "epoch": 0.32008220009807964, + "grad_norm": 2.780606269836426, + "learning_rate": 7.157208174235099e-07, + "loss": 0.6411, + "step": 20560 + }, + { + "epoch": 0.32016004110004903, + "grad_norm": 3.9199488162994385, + "learning_rate": 7.156388784189049e-07, + "loss": 0.6957, + "step": 20565 + }, + { + "epoch": 0.3202378821020184, + "grad_norm": 4.829502105712891, + "learning_rate": 7.155569394143e-07, + "loss": 0.778, + "step": 20570 + }, + { + "epoch": 0.3203157231039878, + "grad_norm": 3.4407572746276855, + "learning_rate": 7.15475000409695e-07, + "loss": 0.6633, + "step": 20575 + }, + { + "epoch": 0.32039356410595715, + "grad_norm": 3.0549888610839844, + "learning_rate": 7.1539306140509e-07, + "loss": 0.6254, + "step": 20580 + }, + { + "epoch": 0.32047140510792654, + "grad_norm": 2.840653896331787, + "learning_rate": 7.153111224004851e-07, + "loss": 0.7218, + "step": 20585 + }, + { + "epoch": 0.32054924610989594, + "grad_norm": 4.250532150268555, + "learning_rate": 7.152291833958802e-07, + "loss": 0.8882, + "step": 20590 + }, + { + "epoch": 0.3206270871118653, + "grad_norm": 3.789776086807251, + "learning_rate": 7.15147244391275e-07, + "loss": 0.8756, + "step": 20595 + }, + { + "epoch": 0.32070492811383466, + "grad_norm": 3.888643503189087, + "learning_rate": 7.150653053866701e-07, + "loss": 0.7486, + "step": 20600 + }, + { + "epoch": 0.32078276911580406, + "grad_norm": 2.7952585220336914, + "learning_rate": 7.149833663820652e-07, + "loss": 0.7398, + "step": 20605 + }, + { + "epoch": 0.32086061011777345, + "grad_norm": 3.1019365787506104, + "learning_rate": 7.149014273774601e-07, + "loss": 0.6767, + "step": 20610 + }, + { + "epoch": 0.32093845111974284, + "grad_norm": 3.4878134727478027, + "learning_rate": 7.148194883728552e-07, + "loss": 0.8461, + "step": 20615 + }, + { + "epoch": 0.3210162921217122, + "grad_norm": 3.560169219970703, + "learning_rate": 7.147375493682503e-07, + "loss": 0.7525, + "step": 20620 + }, + { + "epoch": 0.32109413312368157, + "grad_norm": 6.664193153381348, + "learning_rate": 7.146556103636453e-07, + "loss": 0.8274, + "step": 20625 + }, + { + "epoch": 0.32117197412565096, + "grad_norm": 7.751073360443115, + "learning_rate": 7.145736713590403e-07, + "loss": 0.7782, + "step": 20630 + }, + { + "epoch": 0.3212498151276203, + "grad_norm": 3.1961686611175537, + "learning_rate": 7.144917323544353e-07, + "loss": 0.7371, + "step": 20635 + }, + { + "epoch": 0.3213276561295897, + "grad_norm": 3.616342782974243, + "learning_rate": 7.144097933498303e-07, + "loss": 0.7793, + "step": 20640 + }, + { + "epoch": 0.3214054971315591, + "grad_norm": 14.634657859802246, + "learning_rate": 7.143278543452253e-07, + "loss": 0.7341, + "step": 20645 + }, + { + "epoch": 0.3214833381335285, + "grad_norm": 7.2038726806640625, + "learning_rate": 7.142459153406204e-07, + "loss": 0.7319, + "step": 20650 + }, + { + "epoch": 0.3215611791354978, + "grad_norm": 3.5353848934173584, + "learning_rate": 7.141639763360155e-07, + "loss": 0.8196, + "step": 20655 + }, + { + "epoch": 0.3216390201374672, + "grad_norm": 2.9616966247558594, + "learning_rate": 7.140820373314105e-07, + "loss": 0.7542, + "step": 20660 + }, + { + "epoch": 0.3217168611394366, + "grad_norm": 6.182138442993164, + "learning_rate": 7.140000983268055e-07, + "loss": 0.7098, + "step": 20665 + }, + { + "epoch": 0.321794702141406, + "grad_norm": 3.2851719856262207, + "learning_rate": 7.139181593222006e-07, + "loss": 0.73, + "step": 20670 + }, + { + "epoch": 0.3218725431433753, + "grad_norm": 3.859102249145508, + "learning_rate": 7.138362203175955e-07, + "loss": 0.737, + "step": 20675 + }, + { + "epoch": 0.3219503841453447, + "grad_norm": 3.811932325363159, + "learning_rate": 7.137542813129905e-07, + "loss": 0.7437, + "step": 20680 + }, + { + "epoch": 0.3220282251473141, + "grad_norm": 3.6481974124908447, + "learning_rate": 7.136723423083856e-07, + "loss": 0.8089, + "step": 20685 + }, + { + "epoch": 0.3221060661492835, + "grad_norm": 7.045378684997559, + "learning_rate": 7.135904033037806e-07, + "loss": 0.787, + "step": 20690 + }, + { + "epoch": 0.32218390715125284, + "grad_norm": 5.550210952758789, + "learning_rate": 7.135084642991757e-07, + "loss": 0.8496, + "step": 20695 + }, + { + "epoch": 0.3222617481532222, + "grad_norm": 6.449124813079834, + "learning_rate": 7.134265252945707e-07, + "loss": 0.6899, + "step": 20700 + }, + { + "epoch": 0.3223395891551916, + "grad_norm": 3.796647310256958, + "learning_rate": 7.133445862899657e-07, + "loss": 0.7465, + "step": 20705 + }, + { + "epoch": 0.32241743015716096, + "grad_norm": 3.268113136291504, + "learning_rate": 7.132626472853608e-07, + "loss": 0.7982, + "step": 20710 + }, + { + "epoch": 0.32249527115913035, + "grad_norm": 15.425836563110352, + "learning_rate": 7.131807082807559e-07, + "loss": 0.7995, + "step": 20715 + }, + { + "epoch": 0.32257311216109974, + "grad_norm": 5.5737152099609375, + "learning_rate": 7.130987692761507e-07, + "loss": 0.8068, + "step": 20720 + }, + { + "epoch": 0.32265095316306913, + "grad_norm": 6.799961566925049, + "learning_rate": 7.130168302715458e-07, + "loss": 0.7693, + "step": 20725 + }, + { + "epoch": 0.32272879416503847, + "grad_norm": 6.524772644042969, + "learning_rate": 7.129348912669409e-07, + "loss": 0.8043, + "step": 20730 + }, + { + "epoch": 0.32280663516700786, + "grad_norm": 4.884058475494385, + "learning_rate": 7.128529522623358e-07, + "loss": 0.7259, + "step": 20735 + }, + { + "epoch": 0.32288447616897725, + "grad_norm": 4.115115165710449, + "learning_rate": 7.127710132577309e-07, + "loss": 0.7945, + "step": 20740 + }, + { + "epoch": 0.32296231717094664, + "grad_norm": 3.3195178508758545, + "learning_rate": 7.12689074253126e-07, + "loss": 0.8378, + "step": 20745 + }, + { + "epoch": 0.323040158172916, + "grad_norm": 2.867581367492676, + "learning_rate": 7.12607135248521e-07, + "loss": 0.7582, + "step": 20750 + }, + { + "epoch": 0.3231179991748854, + "grad_norm": 5.850396633148193, + "learning_rate": 7.12525196243916e-07, + "loss": 0.7765, + "step": 20755 + }, + { + "epoch": 0.32319584017685477, + "grad_norm": 7.244020938873291, + "learning_rate": 7.12443257239311e-07, + "loss": 0.8619, + "step": 20760 + }, + { + "epoch": 0.32327368117882416, + "grad_norm": 4.443892478942871, + "learning_rate": 7.12361318234706e-07, + "loss": 0.707, + "step": 20765 + }, + { + "epoch": 0.3233515221807935, + "grad_norm": 4.754065990447998, + "learning_rate": 7.12279379230101e-07, + "loss": 0.7362, + "step": 20770 + }, + { + "epoch": 0.3234293631827629, + "grad_norm": 4.68637228012085, + "learning_rate": 7.121974402254961e-07, + "loss": 0.8056, + "step": 20775 + }, + { + "epoch": 0.3235072041847323, + "grad_norm": 3.501113176345825, + "learning_rate": 7.121155012208912e-07, + "loss": 0.8143, + "step": 20780 + }, + { + "epoch": 0.32358504518670167, + "grad_norm": 4.2049031257629395, + "learning_rate": 7.120335622162862e-07, + "loss": 0.7402, + "step": 20785 + }, + { + "epoch": 0.323662886188671, + "grad_norm": 5.971714973449707, + "learning_rate": 7.119516232116812e-07, + "loss": 0.7949, + "step": 20790 + }, + { + "epoch": 0.3237407271906404, + "grad_norm": 5.241507053375244, + "learning_rate": 7.118696842070763e-07, + "loss": 0.8283, + "step": 20795 + }, + { + "epoch": 0.3238185681926098, + "grad_norm": 3.72343111038208, + "learning_rate": 7.117877452024712e-07, + "loss": 0.8301, + "step": 20800 + }, + { + "epoch": 0.3238964091945791, + "grad_norm": 6.797969818115234, + "learning_rate": 7.117058061978663e-07, + "loss": 0.7162, + "step": 20805 + }, + { + "epoch": 0.3239742501965485, + "grad_norm": 7.002163887023926, + "learning_rate": 7.116238671932613e-07, + "loss": 0.741, + "step": 20810 + }, + { + "epoch": 0.3240520911985179, + "grad_norm": 3.0822055339813232, + "learning_rate": 7.115419281886563e-07, + "loss": 0.7807, + "step": 20815 + }, + { + "epoch": 0.3241299322004873, + "grad_norm": 4.6375508308410645, + "learning_rate": 7.114599891840514e-07, + "loss": 0.8142, + "step": 20820 + }, + { + "epoch": 0.32420777320245664, + "grad_norm": 2.537044048309326, + "learning_rate": 7.113780501794464e-07, + "loss": 0.7271, + "step": 20825 + }, + { + "epoch": 0.32428561420442603, + "grad_norm": 4.382402420043945, + "learning_rate": 7.112961111748414e-07, + "loss": 0.7769, + "step": 20830 + }, + { + "epoch": 0.3243634552063954, + "grad_norm": 4.217374801635742, + "learning_rate": 7.112141721702365e-07, + "loss": 0.7414, + "step": 20835 + }, + { + "epoch": 0.3244412962083648, + "grad_norm": 5.162209510803223, + "learning_rate": 7.111322331656315e-07, + "loss": 0.7683, + "step": 20840 + }, + { + "epoch": 0.32451913721033415, + "grad_norm": 2.4148507118225098, + "learning_rate": 7.110502941610264e-07, + "loss": 0.7117, + "step": 20845 + }, + { + "epoch": 0.32459697821230354, + "grad_norm": 4.48330545425415, + "learning_rate": 7.109683551564215e-07, + "loss": 0.8636, + "step": 20850 + }, + { + "epoch": 0.32467481921427294, + "grad_norm": 3.14823842048645, + "learning_rate": 7.108864161518166e-07, + "loss": 0.7688, + "step": 20855 + }, + { + "epoch": 0.32475266021624233, + "grad_norm": 3.530456781387329, + "learning_rate": 7.108044771472115e-07, + "loss": 0.8039, + "step": 20860 + }, + { + "epoch": 0.32483050121821166, + "grad_norm": 3.5431201457977295, + "learning_rate": 7.107225381426066e-07, + "loss": 0.742, + "step": 20865 + }, + { + "epoch": 0.32490834222018106, + "grad_norm": 9.657413482666016, + "learning_rate": 7.106405991380017e-07, + "loss": 0.7895, + "step": 20870 + }, + { + "epoch": 0.32498618322215045, + "grad_norm": 6.042131423950195, + "learning_rate": 7.105586601333967e-07, + "loss": 0.6917, + "step": 20875 + }, + { + "epoch": 0.3250640242241198, + "grad_norm": 5.177880764007568, + "learning_rate": 7.104767211287916e-07, + "loss": 0.7696, + "step": 20880 + }, + { + "epoch": 0.3251418652260892, + "grad_norm": 2.8649775981903076, + "learning_rate": 7.103947821241867e-07, + "loss": 0.7532, + "step": 20885 + }, + { + "epoch": 0.32521970622805857, + "grad_norm": 3.3477964401245117, + "learning_rate": 7.103128431195817e-07, + "loss": 0.7381, + "step": 20890 + }, + { + "epoch": 0.32529754723002796, + "grad_norm": 3.959502935409546, + "learning_rate": 7.102309041149768e-07, + "loss": 0.8313, + "step": 20895 + }, + { + "epoch": 0.3253753882319973, + "grad_norm": 3.5778965950012207, + "learning_rate": 7.101489651103718e-07, + "loss": 0.8392, + "step": 20900 + }, + { + "epoch": 0.3254532292339667, + "grad_norm": 4.421009063720703, + "learning_rate": 7.100670261057669e-07, + "loss": 0.9063, + "step": 20905 + }, + { + "epoch": 0.3255310702359361, + "grad_norm": 4.305064678192139, + "learning_rate": 7.099850871011619e-07, + "loss": 0.8522, + "step": 20910 + }, + { + "epoch": 0.3256089112379055, + "grad_norm": 25.120641708374023, + "learning_rate": 7.099031480965569e-07, + "loss": 0.7622, + "step": 20915 + }, + { + "epoch": 0.3256867522398748, + "grad_norm": 4.1203227043151855, + "learning_rate": 7.098212090919519e-07, + "loss": 0.7762, + "step": 20920 + }, + { + "epoch": 0.3257645932418442, + "grad_norm": 3.8534672260284424, + "learning_rate": 7.097392700873469e-07, + "loss": 0.833, + "step": 20925 + }, + { + "epoch": 0.3258424342438136, + "grad_norm": 4.1801652908325195, + "learning_rate": 7.09657331082742e-07, + "loss": 0.7922, + "step": 20930 + }, + { + "epoch": 0.325920275245783, + "grad_norm": 4.395228862762451, + "learning_rate": 7.09575392078137e-07, + "loss": 0.7613, + "step": 20935 + }, + { + "epoch": 0.3259981162477523, + "grad_norm": 4.56093168258667, + "learning_rate": 7.09493453073532e-07, + "loss": 0.675, + "step": 20940 + }, + { + "epoch": 0.3260759572497217, + "grad_norm": 7.025084018707275, + "learning_rate": 7.094115140689271e-07, + "loss": 0.759, + "step": 20945 + }, + { + "epoch": 0.3261537982516911, + "grad_norm": 4.575479984283447, + "learning_rate": 7.093295750643222e-07, + "loss": 0.8409, + "step": 20950 + }, + { + "epoch": 0.3262316392536605, + "grad_norm": 5.8464837074279785, + "learning_rate": 7.092476360597171e-07, + "loss": 0.813, + "step": 20955 + }, + { + "epoch": 0.32630948025562984, + "grad_norm": 2.9095611572265625, + "learning_rate": 7.091656970551121e-07, + "loss": 0.7483, + "step": 20960 + }, + { + "epoch": 0.3263873212575992, + "grad_norm": 3.054791212081909, + "learning_rate": 7.090837580505072e-07, + "loss": 0.8178, + "step": 20965 + }, + { + "epoch": 0.3264651622595686, + "grad_norm": 7.010560512542725, + "learning_rate": 7.090018190459021e-07, + "loss": 0.7066, + "step": 20970 + }, + { + "epoch": 0.32654300326153796, + "grad_norm": 6.855307102203369, + "learning_rate": 7.089198800412972e-07, + "loss": 0.8175, + "step": 20975 + }, + { + "epoch": 0.32662084426350735, + "grad_norm": 3.9125053882598877, + "learning_rate": 7.088379410366923e-07, + "loss": 0.7047, + "step": 20980 + }, + { + "epoch": 0.32669868526547674, + "grad_norm": 2.5680830478668213, + "learning_rate": 7.087560020320873e-07, + "loss": 0.5779, + "step": 20985 + }, + { + "epoch": 0.32677652626744613, + "grad_norm": 4.910250663757324, + "learning_rate": 7.086740630274823e-07, + "loss": 0.8485, + "step": 20990 + }, + { + "epoch": 0.32685436726941547, + "grad_norm": 4.934157371520996, + "learning_rate": 7.085921240228774e-07, + "loss": 0.83, + "step": 20995 + }, + { + "epoch": 0.32693220827138486, + "grad_norm": 2.8414039611816406, + "learning_rate": 7.085101850182723e-07, + "loss": 0.8265, + "step": 21000 + }, + { + "epoch": 0.32701004927335425, + "grad_norm": 3.3391027450561523, + "learning_rate": 7.084282460136673e-07, + "loss": 0.7878, + "step": 21005 + }, + { + "epoch": 0.32708789027532365, + "grad_norm": 3.4965968132019043, + "learning_rate": 7.083463070090624e-07, + "loss": 0.7286, + "step": 21010 + }, + { + "epoch": 0.327165731277293, + "grad_norm": 4.536909580230713, + "learning_rate": 7.082643680044574e-07, + "loss": 0.8608, + "step": 21015 + }, + { + "epoch": 0.3272435722792624, + "grad_norm": 7.158962249755859, + "learning_rate": 7.081824289998525e-07, + "loss": 0.8251, + "step": 21020 + }, + { + "epoch": 0.32732141328123177, + "grad_norm": 2.9187631607055664, + "learning_rate": 7.081004899952475e-07, + "loss": 0.8394, + "step": 21025 + }, + { + "epoch": 0.32739925428320116, + "grad_norm": 3.9261114597320557, + "learning_rate": 7.080185509906426e-07, + "loss": 0.8444, + "step": 21030 + }, + { + "epoch": 0.3274770952851705, + "grad_norm": 2.9356627464294434, + "learning_rate": 7.079366119860376e-07, + "loss": 0.7229, + "step": 21035 + }, + { + "epoch": 0.3275549362871399, + "grad_norm": 3.0354654788970947, + "learning_rate": 7.078546729814327e-07, + "loss": 0.7199, + "step": 21040 + }, + { + "epoch": 0.3276327772891093, + "grad_norm": 4.891841888427734, + "learning_rate": 7.077727339768276e-07, + "loss": 0.7688, + "step": 21045 + }, + { + "epoch": 0.32771061829107867, + "grad_norm": 3.367002487182617, + "learning_rate": 7.076907949722226e-07, + "loss": 0.725, + "step": 21050 + }, + { + "epoch": 0.327788459293048, + "grad_norm": 4.214025020599365, + "learning_rate": 7.076088559676177e-07, + "loss": 0.8281, + "step": 21055 + }, + { + "epoch": 0.3278663002950174, + "grad_norm": 3.413587808609009, + "learning_rate": 7.075269169630127e-07, + "loss": 0.8031, + "step": 21060 + }, + { + "epoch": 0.3279441412969868, + "grad_norm": 3.970029592514038, + "learning_rate": 7.074449779584077e-07, + "loss": 0.7813, + "step": 21065 + }, + { + "epoch": 0.3280219822989561, + "grad_norm": 4.808574199676514, + "learning_rate": 7.073630389538028e-07, + "loss": 0.8128, + "step": 21070 + }, + { + "epoch": 0.3280998233009255, + "grad_norm": 3.092742919921875, + "learning_rate": 7.072810999491979e-07, + "loss": 0.8593, + "step": 21075 + }, + { + "epoch": 0.3281776643028949, + "grad_norm": 4.983108997344971, + "learning_rate": 7.071991609445928e-07, + "loss": 0.6809, + "step": 21080 + }, + { + "epoch": 0.3282555053048643, + "grad_norm": 3.6495978832244873, + "learning_rate": 7.071172219399878e-07, + "loss": 0.6302, + "step": 21085 + }, + { + "epoch": 0.32833334630683364, + "grad_norm": 6.250349998474121, + "learning_rate": 7.070352829353829e-07, + "loss": 0.7432, + "step": 21090 + }, + { + "epoch": 0.32841118730880303, + "grad_norm": 5.218113422393799, + "learning_rate": 7.069533439307778e-07, + "loss": 0.7807, + "step": 21095 + }, + { + "epoch": 0.3284890283107724, + "grad_norm": 4.160154819488525, + "learning_rate": 7.068714049261729e-07, + "loss": 0.7347, + "step": 21100 + }, + { + "epoch": 0.3285668693127418, + "grad_norm": 3.9513416290283203, + "learning_rate": 7.06789465921568e-07, + "loss": 0.7936, + "step": 21105 + }, + { + "epoch": 0.32864471031471115, + "grad_norm": 5.0178375244140625, + "learning_rate": 7.06707526916963e-07, + "loss": 0.7616, + "step": 21110 + }, + { + "epoch": 0.32872255131668054, + "grad_norm": 3.6662726402282715, + "learning_rate": 7.06625587912358e-07, + "loss": 0.6692, + "step": 21115 + }, + { + "epoch": 0.32880039231864994, + "grad_norm": 3.0202407836914062, + "learning_rate": 7.065436489077531e-07, + "loss": 0.7599, + "step": 21120 + }, + { + "epoch": 0.32887823332061933, + "grad_norm": 3.204550266265869, + "learning_rate": 7.06461709903148e-07, + "loss": 0.7346, + "step": 21125 + }, + { + "epoch": 0.32895607432258867, + "grad_norm": 7.8837690353393555, + "learning_rate": 7.06379770898543e-07, + "loss": 0.7442, + "step": 21130 + }, + { + "epoch": 0.32903391532455806, + "grad_norm": 4.227644443511963, + "learning_rate": 7.062978318939381e-07, + "loss": 0.7455, + "step": 21135 + }, + { + "epoch": 0.32911175632652745, + "grad_norm": 2.4810287952423096, + "learning_rate": 7.062158928893331e-07, + "loss": 0.7375, + "step": 21140 + }, + { + "epoch": 0.3291895973284968, + "grad_norm": 5.757534503936768, + "learning_rate": 7.061339538847282e-07, + "loss": 0.7165, + "step": 21145 + }, + { + "epoch": 0.3292674383304662, + "grad_norm": 6.480144500732422, + "learning_rate": 7.060520148801232e-07, + "loss": 0.8146, + "step": 21150 + }, + { + "epoch": 0.32934527933243557, + "grad_norm": 4.645882606506348, + "learning_rate": 7.059700758755183e-07, + "loss": 0.8694, + "step": 21155 + }, + { + "epoch": 0.32942312033440496, + "grad_norm": 7.792613983154297, + "learning_rate": 7.058881368709133e-07, + "loss": 0.8158, + "step": 21160 + }, + { + "epoch": 0.3295009613363743, + "grad_norm": 3.325713872909546, + "learning_rate": 7.058061978663083e-07, + "loss": 0.7242, + "step": 21165 + }, + { + "epoch": 0.3295788023383437, + "grad_norm": 5.18583345413208, + "learning_rate": 7.057242588617033e-07, + "loss": 0.8161, + "step": 21170 + }, + { + "epoch": 0.3296566433403131, + "grad_norm": 5.270481586456299, + "learning_rate": 7.056423198570983e-07, + "loss": 0.7999, + "step": 21175 + }, + { + "epoch": 0.3297344843422825, + "grad_norm": 6.009892463684082, + "learning_rate": 7.055603808524934e-07, + "loss": 0.7995, + "step": 21180 + }, + { + "epoch": 0.3298123253442518, + "grad_norm": 6.691226482391357, + "learning_rate": 7.054784418478884e-07, + "loss": 0.8364, + "step": 21185 + }, + { + "epoch": 0.3298901663462212, + "grad_norm": 2.960845947265625, + "learning_rate": 7.053965028432834e-07, + "loss": 0.6677, + "step": 21190 + }, + { + "epoch": 0.3299680073481906, + "grad_norm": 4.097884178161621, + "learning_rate": 7.053145638386785e-07, + "loss": 0.8006, + "step": 21195 + }, + { + "epoch": 0.33004584835016, + "grad_norm": 4.129356384277344, + "learning_rate": 7.052326248340736e-07, + "loss": 0.7322, + "step": 21200 + }, + { + "epoch": 0.3301236893521293, + "grad_norm": 11.891890525817871, + "learning_rate": 7.051506858294684e-07, + "loss": 0.8696, + "step": 21205 + }, + { + "epoch": 0.3302015303540987, + "grad_norm": 4.318405628204346, + "learning_rate": 7.050687468248635e-07, + "loss": 0.8034, + "step": 21210 + }, + { + "epoch": 0.3302793713560681, + "grad_norm": 3.004293441772461, + "learning_rate": 7.049868078202586e-07, + "loss": 0.9435, + "step": 21215 + }, + { + "epoch": 0.3303572123580375, + "grad_norm": 3.503347873687744, + "learning_rate": 7.049048688156535e-07, + "loss": 0.8751, + "step": 21220 + }, + { + "epoch": 0.33043505336000684, + "grad_norm": 4.205401420593262, + "learning_rate": 7.048229298110486e-07, + "loss": 0.7662, + "step": 21225 + }, + { + "epoch": 0.33051289436197623, + "grad_norm": 3.049184560775757, + "learning_rate": 7.047409908064437e-07, + "loss": 0.7602, + "step": 21230 + }, + { + "epoch": 0.3305907353639456, + "grad_norm": 2.9818828105926514, + "learning_rate": 7.046590518018387e-07, + "loss": 0.7905, + "step": 21235 + }, + { + "epoch": 0.33066857636591496, + "grad_norm": 5.376505374908447, + "learning_rate": 7.045771127972337e-07, + "loss": 0.7248, + "step": 21240 + }, + { + "epoch": 0.33074641736788435, + "grad_norm": 5.111346244812012, + "learning_rate": 7.044951737926287e-07, + "loss": 0.7423, + "step": 21245 + }, + { + "epoch": 0.33082425836985374, + "grad_norm": 5.915484428405762, + "learning_rate": 7.044132347880237e-07, + "loss": 0.8131, + "step": 21250 + }, + { + "epoch": 0.33090209937182313, + "grad_norm": 3.484762191772461, + "learning_rate": 7.043312957834188e-07, + "loss": 0.7364, + "step": 21255 + }, + { + "epoch": 0.33097994037379247, + "grad_norm": 8.695598602294922, + "learning_rate": 7.042493567788138e-07, + "loss": 0.7335, + "step": 21260 + }, + { + "epoch": 0.33105778137576186, + "grad_norm": 5.287198066711426, + "learning_rate": 7.041674177742088e-07, + "loss": 0.8465, + "step": 21265 + }, + { + "epoch": 0.33113562237773125, + "grad_norm": 5.074387550354004, + "learning_rate": 7.040854787696039e-07, + "loss": 0.736, + "step": 21270 + }, + { + "epoch": 0.33121346337970065, + "grad_norm": 4.307338714599609, + "learning_rate": 7.04003539764999e-07, + "loss": 0.8083, + "step": 21275 + }, + { + "epoch": 0.33129130438167, + "grad_norm": 4.100557804107666, + "learning_rate": 7.03921600760394e-07, + "loss": 0.7717, + "step": 21280 + }, + { + "epoch": 0.3313691453836394, + "grad_norm": 10.841692924499512, + "learning_rate": 7.038396617557889e-07, + "loss": 0.6512, + "step": 21285 + }, + { + "epoch": 0.33144698638560877, + "grad_norm": 4.395034313201904, + "learning_rate": 7.03757722751184e-07, + "loss": 0.8602, + "step": 21290 + }, + { + "epoch": 0.33152482738757816, + "grad_norm": 4.024008750915527, + "learning_rate": 7.03675783746579e-07, + "loss": 0.8101, + "step": 21295 + }, + { + "epoch": 0.3316026683895475, + "grad_norm": 2.2783641815185547, + "learning_rate": 7.03593844741974e-07, + "loss": 0.7214, + "step": 21300 + }, + { + "epoch": 0.3316805093915169, + "grad_norm": 2.686100721359253, + "learning_rate": 7.035119057373691e-07, + "loss": 0.6999, + "step": 21305 + }, + { + "epoch": 0.3317583503934863, + "grad_norm": 5.044751167297363, + "learning_rate": 7.034299667327642e-07, + "loss": 0.7782, + "step": 21310 + }, + { + "epoch": 0.3318361913954556, + "grad_norm": 4.074419021606445, + "learning_rate": 7.033480277281591e-07, + "loss": 0.8148, + "step": 21315 + }, + { + "epoch": 0.331914032397425, + "grad_norm": 4.122618198394775, + "learning_rate": 7.032660887235542e-07, + "loss": 0.853, + "step": 21320 + }, + { + "epoch": 0.3319918733993944, + "grad_norm": 3.0273494720458984, + "learning_rate": 7.031841497189493e-07, + "loss": 0.8329, + "step": 21325 + }, + { + "epoch": 0.3320697144013638, + "grad_norm": 3.330517053604126, + "learning_rate": 7.031022107143441e-07, + "loss": 0.7399, + "step": 21330 + }, + { + "epoch": 0.33214755540333313, + "grad_norm": 8.765055656433105, + "learning_rate": 7.030202717097392e-07, + "loss": 0.7669, + "step": 21335 + }, + { + "epoch": 0.3322253964053025, + "grad_norm": 2.513746500015259, + "learning_rate": 7.029383327051343e-07, + "loss": 0.7557, + "step": 21340 + }, + { + "epoch": 0.3323032374072719, + "grad_norm": 2.607100009918213, + "learning_rate": 7.028563937005293e-07, + "loss": 0.8703, + "step": 21345 + }, + { + "epoch": 0.3323810784092413, + "grad_norm": 2.982377290725708, + "learning_rate": 7.027744546959243e-07, + "loss": 0.7169, + "step": 21350 + }, + { + "epoch": 0.33245891941121064, + "grad_norm": 3.663597822189331, + "learning_rate": 7.026925156913194e-07, + "loss": 0.7615, + "step": 21355 + }, + { + "epoch": 0.33253676041318003, + "grad_norm": 3.046760320663452, + "learning_rate": 7.026105766867144e-07, + "loss": 0.7096, + "step": 21360 + }, + { + "epoch": 0.3326146014151494, + "grad_norm": 3.6248621940612793, + "learning_rate": 7.025286376821094e-07, + "loss": 0.7434, + "step": 21365 + }, + { + "epoch": 0.3326924424171188, + "grad_norm": 4.510558605194092, + "learning_rate": 7.024466986775044e-07, + "loss": 0.8041, + "step": 21370 + }, + { + "epoch": 0.33277028341908815, + "grad_norm": 4.688333511352539, + "learning_rate": 7.023647596728994e-07, + "loss": 0.727, + "step": 21375 + }, + { + "epoch": 0.33284812442105755, + "grad_norm": 5.505545616149902, + "learning_rate": 7.022828206682945e-07, + "loss": 0.8143, + "step": 21380 + }, + { + "epoch": 0.33292596542302694, + "grad_norm": 3.2059834003448486, + "learning_rate": 7.022008816636895e-07, + "loss": 0.6771, + "step": 21385 + }, + { + "epoch": 0.33300380642499633, + "grad_norm": 3.3329732418060303, + "learning_rate": 7.021189426590845e-07, + "loss": 0.7855, + "step": 21390 + }, + { + "epoch": 0.33308164742696567, + "grad_norm": 4.149011611938477, + "learning_rate": 7.020370036544796e-07, + "loss": 0.8071, + "step": 21395 + }, + { + "epoch": 0.33315948842893506, + "grad_norm": 4.561814308166504, + "learning_rate": 7.019550646498747e-07, + "loss": 0.6988, + "step": 21400 + }, + { + "epoch": 0.33323732943090445, + "grad_norm": 3.655268669128418, + "learning_rate": 7.018731256452697e-07, + "loss": 0.7151, + "step": 21405 + }, + { + "epoch": 0.3333151704328738, + "grad_norm": 4.033401012420654, + "learning_rate": 7.017911866406646e-07, + "loss": 0.7491, + "step": 21410 + }, + { + "epoch": 0.3333930114348432, + "grad_norm": 3.786790609359741, + "learning_rate": 7.017092476360597e-07, + "loss": 0.7232, + "step": 21415 + }, + { + "epoch": 0.33347085243681257, + "grad_norm": 3.5973622798919678, + "learning_rate": 7.016273086314547e-07, + "loss": 0.7981, + "step": 21420 + }, + { + "epoch": 0.33354869343878196, + "grad_norm": 4.2606682777404785, + "learning_rate": 7.015453696268497e-07, + "loss": 0.791, + "step": 21425 + }, + { + "epoch": 0.3336265344407513, + "grad_norm": 3.1162331104278564, + "learning_rate": 7.014634306222448e-07, + "loss": 0.7806, + "step": 21430 + }, + { + "epoch": 0.3337043754427207, + "grad_norm": 2.6775388717651367, + "learning_rate": 7.013814916176399e-07, + "loss": 0.7989, + "step": 21435 + }, + { + "epoch": 0.3337822164446901, + "grad_norm": 3.595301628112793, + "learning_rate": 7.012995526130348e-07, + "loss": 0.7593, + "step": 21440 + }, + { + "epoch": 0.3338600574466595, + "grad_norm": 4.060016632080078, + "learning_rate": 7.012176136084299e-07, + "loss": 0.6843, + "step": 21445 + }, + { + "epoch": 0.3339378984486288, + "grad_norm": 3.22746205329895, + "learning_rate": 7.011356746038249e-07, + "loss": 0.6879, + "step": 21450 + }, + { + "epoch": 0.3340157394505982, + "grad_norm": 2.997220993041992, + "learning_rate": 7.010537355992198e-07, + "loss": 0.7744, + "step": 21455 + }, + { + "epoch": 0.3340935804525676, + "grad_norm": 3.810737371444702, + "learning_rate": 7.009717965946149e-07, + "loss": 0.7834, + "step": 21460 + }, + { + "epoch": 0.334171421454537, + "grad_norm": 4.1732988357543945, + "learning_rate": 7.0088985759001e-07, + "loss": 0.7195, + "step": 21465 + }, + { + "epoch": 0.3342492624565063, + "grad_norm": 2.8913424015045166, + "learning_rate": 7.00807918585405e-07, + "loss": 0.6206, + "step": 21470 + }, + { + "epoch": 0.3343271034584757, + "grad_norm": 3.0899922847747803, + "learning_rate": 7.007259795808e-07, + "loss": 0.7757, + "step": 21475 + }, + { + "epoch": 0.3344049444604451, + "grad_norm": 5.782576084136963, + "learning_rate": 7.006440405761951e-07, + "loss": 0.6863, + "step": 21480 + }, + { + "epoch": 0.33448278546241444, + "grad_norm": 8.251847267150879, + "learning_rate": 7.005621015715901e-07, + "loss": 0.6977, + "step": 21485 + }, + { + "epoch": 0.33456062646438384, + "grad_norm": 3.7665064334869385, + "learning_rate": 7.00480162566985e-07, + "loss": 0.7871, + "step": 21490 + }, + { + "epoch": 0.33463846746635323, + "grad_norm": 4.690029144287109, + "learning_rate": 7.003982235623801e-07, + "loss": 0.7801, + "step": 21495 + }, + { + "epoch": 0.3347163084683226, + "grad_norm": 2.9794564247131348, + "learning_rate": 7.003162845577751e-07, + "loss": 0.7675, + "step": 21500 + }, + { + "epoch": 0.33479414947029196, + "grad_norm": 4.976625442504883, + "learning_rate": 7.002343455531702e-07, + "loss": 0.8075, + "step": 21505 + }, + { + "epoch": 0.33487199047226135, + "grad_norm": 2.9241256713867188, + "learning_rate": 7.001524065485652e-07, + "loss": 0.7282, + "step": 21510 + }, + { + "epoch": 0.33494983147423074, + "grad_norm": 4.694438457489014, + "learning_rate": 7.000704675439602e-07, + "loss": 0.8939, + "step": 21515 + }, + { + "epoch": 0.33502767247620013, + "grad_norm": 4.293147563934326, + "learning_rate": 6.999885285393553e-07, + "loss": 0.7264, + "step": 21520 + }, + { + "epoch": 0.33510551347816947, + "grad_norm": 10.977949142456055, + "learning_rate": 6.999065895347504e-07, + "loss": 0.7685, + "step": 21525 + }, + { + "epoch": 0.33518335448013886, + "grad_norm": 4.644759178161621, + "learning_rate": 6.998246505301452e-07, + "loss": 0.7754, + "step": 21530 + }, + { + "epoch": 0.33526119548210825, + "grad_norm": 5.421177864074707, + "learning_rate": 6.997427115255403e-07, + "loss": 0.7976, + "step": 21535 + }, + { + "epoch": 0.33533903648407765, + "grad_norm": 3.4674479961395264, + "learning_rate": 6.996607725209354e-07, + "loss": 0.6951, + "step": 21540 + }, + { + "epoch": 0.335416877486047, + "grad_norm": 4.617788314819336, + "learning_rate": 6.995788335163304e-07, + "loss": 0.7562, + "step": 21545 + }, + { + "epoch": 0.3354947184880164, + "grad_norm": 2.9097959995269775, + "learning_rate": 6.994968945117254e-07, + "loss": 0.6994, + "step": 21550 + }, + { + "epoch": 0.33557255948998577, + "grad_norm": 4.879605770111084, + "learning_rate": 6.994149555071205e-07, + "loss": 0.7559, + "step": 21555 + }, + { + "epoch": 0.33565040049195516, + "grad_norm": 2.8688557147979736, + "learning_rate": 6.993330165025156e-07, + "loss": 0.7733, + "step": 21560 + }, + { + "epoch": 0.3357282414939245, + "grad_norm": 3.4021968841552734, + "learning_rate": 6.992510774979105e-07, + "loss": 0.8755, + "step": 21565 + }, + { + "epoch": 0.3358060824958939, + "grad_norm": 6.432661056518555, + "learning_rate": 6.991691384933055e-07, + "loss": 0.6555, + "step": 21570 + }, + { + "epoch": 0.3358839234978633, + "grad_norm": 3.397725820541382, + "learning_rate": 6.990871994887006e-07, + "loss": 0.8091, + "step": 21575 + }, + { + "epoch": 0.3359617644998326, + "grad_norm": 2.418670177459717, + "learning_rate": 6.990052604840956e-07, + "loss": 0.7853, + "step": 21580 + }, + { + "epoch": 0.336039605501802, + "grad_norm": 3.4952597618103027, + "learning_rate": 6.989233214794906e-07, + "loss": 0.769, + "step": 21585 + }, + { + "epoch": 0.3361174465037714, + "grad_norm": 4.628364086151123, + "learning_rate": 6.988413824748857e-07, + "loss": 0.7542, + "step": 21590 + }, + { + "epoch": 0.3361952875057408, + "grad_norm": 4.102391242980957, + "learning_rate": 6.987594434702807e-07, + "loss": 0.7571, + "step": 21595 + }, + { + "epoch": 0.33627312850771013, + "grad_norm": 3.2335331439971924, + "learning_rate": 6.986775044656757e-07, + "loss": 0.6834, + "step": 21600 + }, + { + "epoch": 0.3363509695096795, + "grad_norm": 7.069674968719482, + "learning_rate": 6.985955654610708e-07, + "loss": 0.8251, + "step": 21605 + }, + { + "epoch": 0.3364288105116489, + "grad_norm": 4.058067798614502, + "learning_rate": 6.985136264564657e-07, + "loss": 0.8077, + "step": 21610 + }, + { + "epoch": 0.3365066515136183, + "grad_norm": 3.856201171875, + "learning_rate": 6.984316874518608e-07, + "loss": 0.8431, + "step": 21615 + }, + { + "epoch": 0.33658449251558764, + "grad_norm": 5.469402313232422, + "learning_rate": 6.983497484472558e-07, + "loss": 0.7463, + "step": 21620 + }, + { + "epoch": 0.33666233351755703, + "grad_norm": 4.18521785736084, + "learning_rate": 6.982678094426508e-07, + "loss": 0.9012, + "step": 21625 + }, + { + "epoch": 0.3367401745195264, + "grad_norm": 3.856855869293213, + "learning_rate": 6.981858704380459e-07, + "loss": 0.7926, + "step": 21630 + }, + { + "epoch": 0.3368180155214958, + "grad_norm": 3.008106231689453, + "learning_rate": 6.98103931433441e-07, + "loss": 0.7308, + "step": 21635 + }, + { + "epoch": 0.33689585652346515, + "grad_norm": 5.669660568237305, + "learning_rate": 6.980219924288359e-07, + "loss": 0.7562, + "step": 21640 + }, + { + "epoch": 0.33697369752543455, + "grad_norm": 4.197543144226074, + "learning_rate": 6.97940053424231e-07, + "loss": 0.8628, + "step": 21645 + }, + { + "epoch": 0.33705153852740394, + "grad_norm": 2.6384623050689697, + "learning_rate": 6.978581144196261e-07, + "loss": 0.6264, + "step": 21650 + }, + { + "epoch": 0.33712937952937333, + "grad_norm": 4.709079265594482, + "learning_rate": 6.977761754150209e-07, + "loss": 0.7781, + "step": 21655 + }, + { + "epoch": 0.33720722053134267, + "grad_norm": 5.100217342376709, + "learning_rate": 6.97694236410416e-07, + "loss": 0.7359, + "step": 21660 + }, + { + "epoch": 0.33728506153331206, + "grad_norm": 4.674426555633545, + "learning_rate": 6.976122974058111e-07, + "loss": 0.7898, + "step": 21665 + }, + { + "epoch": 0.33736290253528145, + "grad_norm": 5.750451564788818, + "learning_rate": 6.975303584012062e-07, + "loss": 0.7547, + "step": 21670 + }, + { + "epoch": 0.3374407435372508, + "grad_norm": 3.1156442165374756, + "learning_rate": 6.974484193966011e-07, + "loss": 0.7708, + "step": 21675 + }, + { + "epoch": 0.3375185845392202, + "grad_norm": 3.386111259460449, + "learning_rate": 6.973664803919962e-07, + "loss": 0.7096, + "step": 21680 + }, + { + "epoch": 0.33759642554118957, + "grad_norm": 3.170990228652954, + "learning_rate": 6.972845413873913e-07, + "loss": 0.8063, + "step": 21685 + }, + { + "epoch": 0.33767426654315896, + "grad_norm": 2.8356587886810303, + "learning_rate": 6.972026023827862e-07, + "loss": 0.7984, + "step": 21690 + }, + { + "epoch": 0.3377521075451283, + "grad_norm": 3.965768814086914, + "learning_rate": 6.971206633781812e-07, + "loss": 0.8705, + "step": 21695 + }, + { + "epoch": 0.3378299485470977, + "grad_norm": 4.400123119354248, + "learning_rate": 6.970387243735763e-07, + "loss": 0.7655, + "step": 21700 + }, + { + "epoch": 0.3379077895490671, + "grad_norm": 3.1862401962280273, + "learning_rate": 6.969567853689713e-07, + "loss": 0.8166, + "step": 21705 + }, + { + "epoch": 0.3379856305510365, + "grad_norm": 3.2615442276000977, + "learning_rate": 6.968748463643663e-07, + "loss": 0.7554, + "step": 21710 + }, + { + "epoch": 0.3380634715530058, + "grad_norm": 6.5277419090271, + "learning_rate": 6.967929073597614e-07, + "loss": 0.7581, + "step": 21715 + }, + { + "epoch": 0.3381413125549752, + "grad_norm": 6.898044109344482, + "learning_rate": 6.967109683551564e-07, + "loss": 0.826, + "step": 21720 + }, + { + "epoch": 0.3382191535569446, + "grad_norm": 3.163782835006714, + "learning_rate": 6.966290293505515e-07, + "loss": 0.7342, + "step": 21725 + }, + { + "epoch": 0.338296994558914, + "grad_norm": 4.352728843688965, + "learning_rate": 6.965470903459465e-07, + "loss": 0.7732, + "step": 21730 + }, + { + "epoch": 0.3383748355608833, + "grad_norm": 5.486903667449951, + "learning_rate": 6.964651513413414e-07, + "loss": 0.845, + "step": 21735 + }, + { + "epoch": 0.3384526765628527, + "grad_norm": 6.931983470916748, + "learning_rate": 6.963832123367365e-07, + "loss": 0.9539, + "step": 21740 + }, + { + "epoch": 0.3385305175648221, + "grad_norm": 3.2124457359313965, + "learning_rate": 6.963012733321315e-07, + "loss": 0.7011, + "step": 21745 + }, + { + "epoch": 0.33860835856679145, + "grad_norm": 4.073243618011475, + "learning_rate": 6.962193343275265e-07, + "loss": 0.8017, + "step": 21750 + }, + { + "epoch": 0.33868619956876084, + "grad_norm": 2.7231483459472656, + "learning_rate": 6.961373953229216e-07, + "loss": 0.7914, + "step": 21755 + }, + { + "epoch": 0.33876404057073023, + "grad_norm": 4.636868953704834, + "learning_rate": 6.960554563183167e-07, + "loss": 0.6745, + "step": 21760 + }, + { + "epoch": 0.3388418815726996, + "grad_norm": 3.260817289352417, + "learning_rate": 6.959735173137116e-07, + "loss": 0.7471, + "step": 21765 + }, + { + "epoch": 0.33891972257466896, + "grad_norm": 3.095573663711548, + "learning_rate": 6.958915783091067e-07, + "loss": 0.7908, + "step": 21770 + }, + { + "epoch": 0.33899756357663835, + "grad_norm": 4.436625003814697, + "learning_rate": 6.958096393045017e-07, + "loss": 0.8482, + "step": 21775 + }, + { + "epoch": 0.33907540457860774, + "grad_norm": 3.3770432472229004, + "learning_rate": 6.957277002998966e-07, + "loss": 0.6528, + "step": 21780 + }, + { + "epoch": 0.33915324558057713, + "grad_norm": 4.698873996734619, + "learning_rate": 6.956457612952917e-07, + "loss": 0.7155, + "step": 21785 + }, + { + "epoch": 0.33923108658254647, + "grad_norm": 5.815252304077148, + "learning_rate": 6.955638222906868e-07, + "loss": 0.7678, + "step": 21790 + }, + { + "epoch": 0.33930892758451586, + "grad_norm": 5.941638946533203, + "learning_rate": 6.954818832860819e-07, + "loss": 0.7872, + "step": 21795 + }, + { + "epoch": 0.33938676858648525, + "grad_norm": 4.159576416015625, + "learning_rate": 6.953999442814768e-07, + "loss": 0.8849, + "step": 21800 + }, + { + "epoch": 0.33946460958845465, + "grad_norm": 2.9337544441223145, + "learning_rate": 6.953180052768719e-07, + "loss": 0.705, + "step": 21805 + }, + { + "epoch": 0.339542450590424, + "grad_norm": 2.750241994857788, + "learning_rate": 6.95236066272267e-07, + "loss": 0.7139, + "step": 21810 + }, + { + "epoch": 0.3396202915923934, + "grad_norm": 5.717836856842041, + "learning_rate": 6.951541272676618e-07, + "loss": 0.884, + "step": 21815 + }, + { + "epoch": 0.33969813259436277, + "grad_norm": 3.4549524784088135, + "learning_rate": 6.950721882630569e-07, + "loss": 0.7177, + "step": 21820 + }, + { + "epoch": 0.33977597359633216, + "grad_norm": 9.49217700958252, + "learning_rate": 6.94990249258452e-07, + "loss": 0.7746, + "step": 21825 + }, + { + "epoch": 0.3398538145983015, + "grad_norm": 6.191872596740723, + "learning_rate": 6.94908310253847e-07, + "loss": 0.7398, + "step": 21830 + }, + { + "epoch": 0.3399316556002709, + "grad_norm": 5.124302387237549, + "learning_rate": 6.94826371249242e-07, + "loss": 0.8453, + "step": 21835 + }, + { + "epoch": 0.3400094966022403, + "grad_norm": 3.679274082183838, + "learning_rate": 6.947444322446371e-07, + "loss": 0.8208, + "step": 21840 + }, + { + "epoch": 0.3400873376042096, + "grad_norm": 4.2432541847229, + "learning_rate": 6.946624932400321e-07, + "loss": 0.7919, + "step": 21845 + }, + { + "epoch": 0.340165178606179, + "grad_norm": 4.548547267913818, + "learning_rate": 6.945805542354272e-07, + "loss": 0.7669, + "step": 21850 + }, + { + "epoch": 0.3402430196081484, + "grad_norm": 9.537131309509277, + "learning_rate": 6.944986152308221e-07, + "loss": 0.7988, + "step": 21855 + }, + { + "epoch": 0.3403208606101178, + "grad_norm": 3.267117500305176, + "learning_rate": 6.944166762262171e-07, + "loss": 0.6798, + "step": 21860 + }, + { + "epoch": 0.34039870161208713, + "grad_norm": 4.591448783874512, + "learning_rate": 6.943347372216122e-07, + "loss": 0.7488, + "step": 21865 + }, + { + "epoch": 0.3404765426140565, + "grad_norm": 2.8945159912109375, + "learning_rate": 6.942527982170072e-07, + "loss": 0.8226, + "step": 21870 + }, + { + "epoch": 0.3405543836160259, + "grad_norm": 3.5019824504852295, + "learning_rate": 6.941708592124022e-07, + "loss": 0.7698, + "step": 21875 + }, + { + "epoch": 0.3406322246179953, + "grad_norm": 3.800676107406616, + "learning_rate": 6.940889202077973e-07, + "loss": 0.7195, + "step": 21880 + }, + { + "epoch": 0.34071006561996464, + "grad_norm": 6.936948776245117, + "learning_rate": 6.940069812031924e-07, + "loss": 0.7831, + "step": 21885 + }, + { + "epoch": 0.34078790662193403, + "grad_norm": 2.993300676345825, + "learning_rate": 6.939250421985873e-07, + "loss": 0.7205, + "step": 21890 + }, + { + "epoch": 0.3408657476239034, + "grad_norm": 4.39565372467041, + "learning_rate": 6.938431031939823e-07, + "loss": 0.809, + "step": 21895 + }, + { + "epoch": 0.3409435886258728, + "grad_norm": 3.0325002670288086, + "learning_rate": 6.937611641893774e-07, + "loss": 0.6348, + "step": 21900 + }, + { + "epoch": 0.34102142962784215, + "grad_norm": 2.1405999660491943, + "learning_rate": 6.936792251847723e-07, + "loss": 0.7687, + "step": 21905 + }, + { + "epoch": 0.34109927062981155, + "grad_norm": 3.2362265586853027, + "learning_rate": 6.935972861801674e-07, + "loss": 0.9177, + "step": 21910 + }, + { + "epoch": 0.34117711163178094, + "grad_norm": 7.863826751708984, + "learning_rate": 6.935153471755625e-07, + "loss": 0.8616, + "step": 21915 + }, + { + "epoch": 0.3412549526337503, + "grad_norm": 3.197587490081787, + "learning_rate": 6.934334081709576e-07, + "loss": 0.6349, + "step": 21920 + }, + { + "epoch": 0.34133279363571967, + "grad_norm": 3.8425405025482178, + "learning_rate": 6.933514691663525e-07, + "loss": 0.7869, + "step": 21925 + }, + { + "epoch": 0.34141063463768906, + "grad_norm": 3.0469181537628174, + "learning_rate": 6.932695301617476e-07, + "loss": 0.8481, + "step": 21930 + }, + { + "epoch": 0.34148847563965845, + "grad_norm": 2.5239310264587402, + "learning_rate": 6.931875911571426e-07, + "loss": 0.7255, + "step": 21935 + }, + { + "epoch": 0.3415663166416278, + "grad_norm": 5.252364635467529, + "learning_rate": 6.931056521525376e-07, + "loss": 0.7062, + "step": 21940 + }, + { + "epoch": 0.3416441576435972, + "grad_norm": 3.408431053161621, + "learning_rate": 6.930237131479326e-07, + "loss": 0.7958, + "step": 21945 + }, + { + "epoch": 0.34172199864556657, + "grad_norm": 3.285446882247925, + "learning_rate": 6.929417741433277e-07, + "loss": 0.7819, + "step": 21950 + }, + { + "epoch": 0.34179983964753596, + "grad_norm": 4.0770368576049805, + "learning_rate": 6.928598351387227e-07, + "loss": 0.8164, + "step": 21955 + }, + { + "epoch": 0.3418776806495053, + "grad_norm": 3.2159502506256104, + "learning_rate": 6.927778961341177e-07, + "loss": 0.7799, + "step": 21960 + }, + { + "epoch": 0.3419555216514747, + "grad_norm": 3.2708003520965576, + "learning_rate": 6.926959571295128e-07, + "loss": 0.8103, + "step": 21965 + }, + { + "epoch": 0.3420333626534441, + "grad_norm": 5.362832069396973, + "learning_rate": 6.926140181249078e-07, + "loss": 0.7317, + "step": 21970 + }, + { + "epoch": 0.3421112036554135, + "grad_norm": 3.9707159996032715, + "learning_rate": 6.925320791203029e-07, + "loss": 0.7181, + "step": 21975 + }, + { + "epoch": 0.3421890446573828, + "grad_norm": 4.492808818817139, + "learning_rate": 6.924501401156978e-07, + "loss": 0.8526, + "step": 21980 + }, + { + "epoch": 0.3422668856593522, + "grad_norm": 2.531547784805298, + "learning_rate": 6.923682011110928e-07, + "loss": 0.6497, + "step": 21985 + }, + { + "epoch": 0.3423447266613216, + "grad_norm": 4.880797863006592, + "learning_rate": 6.922862621064879e-07, + "loss": 0.7759, + "step": 21990 + }, + { + "epoch": 0.342422567663291, + "grad_norm": 5.803737640380859, + "learning_rate": 6.92204323101883e-07, + "loss": 0.8622, + "step": 21995 + }, + { + "epoch": 0.3425004086652603, + "grad_norm": 4.203178882598877, + "learning_rate": 6.921223840972779e-07, + "loss": 0.8575, + "step": 22000 + }, + { + "epoch": 0.3425782496672297, + "grad_norm": 6.29517936706543, + "learning_rate": 6.92040445092673e-07, + "loss": 0.814, + "step": 22005 + }, + { + "epoch": 0.3426560906691991, + "grad_norm": 3.598208427429199, + "learning_rate": 6.919585060880681e-07, + "loss": 0.7939, + "step": 22010 + }, + { + "epoch": 0.34273393167116845, + "grad_norm": 4.675456523895264, + "learning_rate": 6.91876567083463e-07, + "loss": 0.8032, + "step": 22015 + }, + { + "epoch": 0.34281177267313784, + "grad_norm": 3.1422770023345947, + "learning_rate": 6.91794628078858e-07, + "loss": 0.8217, + "step": 22020 + }, + { + "epoch": 0.34288961367510723, + "grad_norm": 4.944550037384033, + "learning_rate": 6.917126890742531e-07, + "loss": 0.6776, + "step": 22025 + }, + { + "epoch": 0.3429674546770766, + "grad_norm": 5.1036057472229, + "learning_rate": 6.916307500696482e-07, + "loss": 0.6798, + "step": 22030 + }, + { + "epoch": 0.34304529567904596, + "grad_norm": 4.726799488067627, + "learning_rate": 6.915488110650431e-07, + "loss": 0.7783, + "step": 22035 + }, + { + "epoch": 0.34312313668101535, + "grad_norm": 3.355400323867798, + "learning_rate": 6.914668720604382e-07, + "loss": 0.7481, + "step": 22040 + }, + { + "epoch": 0.34320097768298474, + "grad_norm": 4.240068435668945, + "learning_rate": 6.913849330558333e-07, + "loss": 0.7477, + "step": 22045 + }, + { + "epoch": 0.34327881868495413, + "grad_norm": 5.435959815979004, + "learning_rate": 6.913029940512282e-07, + "loss": 0.8529, + "step": 22050 + }, + { + "epoch": 0.34335665968692347, + "grad_norm": 4.2254438400268555, + "learning_rate": 6.912210550466233e-07, + "loss": 0.6756, + "step": 22055 + }, + { + "epoch": 0.34343450068889286, + "grad_norm": 5.156310558319092, + "learning_rate": 6.911391160420183e-07, + "loss": 0.7669, + "step": 22060 + }, + { + "epoch": 0.34351234169086226, + "grad_norm": 2.5324745178222656, + "learning_rate": 6.910571770374133e-07, + "loss": 0.7165, + "step": 22065 + }, + { + "epoch": 0.34359018269283165, + "grad_norm": 5.870659828186035, + "learning_rate": 6.909752380328083e-07, + "loss": 0.7372, + "step": 22070 + }, + { + "epoch": 0.343668023694801, + "grad_norm": 3.9795782566070557, + "learning_rate": 6.908932990282034e-07, + "loss": 0.7804, + "step": 22075 + }, + { + "epoch": 0.3437458646967704, + "grad_norm": 3.463372230529785, + "learning_rate": 6.908113600235984e-07, + "loss": 0.6794, + "step": 22080 + }, + { + "epoch": 0.34382370569873977, + "grad_norm": 2.5854239463806152, + "learning_rate": 6.907294210189935e-07, + "loss": 0.8242, + "step": 22085 + }, + { + "epoch": 0.3439015467007091, + "grad_norm": 5.109464168548584, + "learning_rate": 6.906474820143885e-07, + "loss": 0.7327, + "step": 22090 + }, + { + "epoch": 0.3439793877026785, + "grad_norm": 3.044029474258423, + "learning_rate": 6.905655430097835e-07, + "loss": 0.7222, + "step": 22095 + }, + { + "epoch": 0.3440572287046479, + "grad_norm": 5.811017036437988, + "learning_rate": 6.904836040051785e-07, + "loss": 0.812, + "step": 22100 + }, + { + "epoch": 0.3441350697066173, + "grad_norm": 3.0378737449645996, + "learning_rate": 6.904016650005735e-07, + "loss": 0.6783, + "step": 22105 + }, + { + "epoch": 0.3442129107085866, + "grad_norm": 3.8113794326782227, + "learning_rate": 6.903197259959685e-07, + "loss": 0.7307, + "step": 22110 + }, + { + "epoch": 0.344290751710556, + "grad_norm": 11.150230407714844, + "learning_rate": 6.902377869913636e-07, + "loss": 0.7725, + "step": 22115 + }, + { + "epoch": 0.3443685927125254, + "grad_norm": 4.847995758056641, + "learning_rate": 6.901558479867587e-07, + "loss": 0.6859, + "step": 22120 + }, + { + "epoch": 0.3444464337144948, + "grad_norm": 3.970857620239258, + "learning_rate": 6.900739089821536e-07, + "loss": 0.859, + "step": 22125 + }, + { + "epoch": 0.34452427471646413, + "grad_norm": 4.519296646118164, + "learning_rate": 6.899919699775487e-07, + "loss": 0.664, + "step": 22130 + }, + { + "epoch": 0.3446021157184335, + "grad_norm": 6.204890251159668, + "learning_rate": 6.899100309729438e-07, + "loss": 0.6662, + "step": 22135 + }, + { + "epoch": 0.3446799567204029, + "grad_norm": 5.375654697418213, + "learning_rate": 6.898280919683386e-07, + "loss": 0.7193, + "step": 22140 + }, + { + "epoch": 0.3447577977223723, + "grad_norm": 2.971890449523926, + "learning_rate": 6.897461529637337e-07, + "loss": 0.7933, + "step": 22145 + }, + { + "epoch": 0.34483563872434164, + "grad_norm": 2.947749614715576, + "learning_rate": 6.896642139591288e-07, + "loss": 0.7621, + "step": 22150 + }, + { + "epoch": 0.34491347972631103, + "grad_norm": 9.173848152160645, + "learning_rate": 6.895822749545239e-07, + "loss": 0.8844, + "step": 22155 + }, + { + "epoch": 0.3449913207282804, + "grad_norm": 3.6220645904541016, + "learning_rate": 6.895003359499188e-07, + "loss": 0.8889, + "step": 22160 + }, + { + "epoch": 0.3450691617302498, + "grad_norm": 3.2141635417938232, + "learning_rate": 6.894183969453139e-07, + "loss": 0.752, + "step": 22165 + }, + { + "epoch": 0.34514700273221915, + "grad_norm": 3.2809619903564453, + "learning_rate": 6.89336457940709e-07, + "loss": 0.7753, + "step": 22170 + }, + { + "epoch": 0.34522484373418855, + "grad_norm": 5.106159687042236, + "learning_rate": 6.89254518936104e-07, + "loss": 0.6969, + "step": 22175 + }, + { + "epoch": 0.34530268473615794, + "grad_norm": 3.3226418495178223, + "learning_rate": 6.891725799314989e-07, + "loss": 0.7708, + "step": 22180 + }, + { + "epoch": 0.3453805257381273, + "grad_norm": 2.795375347137451, + "learning_rate": 6.89090640926894e-07, + "loss": 0.8047, + "step": 22185 + }, + { + "epoch": 0.34545836674009667, + "grad_norm": 3.6871438026428223, + "learning_rate": 6.89008701922289e-07, + "loss": 0.7531, + "step": 22190 + }, + { + "epoch": 0.34553620774206606, + "grad_norm": 2.721118211746216, + "learning_rate": 6.88926762917684e-07, + "loss": 0.7146, + "step": 22195 + }, + { + "epoch": 0.34561404874403545, + "grad_norm": 3.5846962928771973, + "learning_rate": 6.888448239130791e-07, + "loss": 0.8803, + "step": 22200 + }, + { + "epoch": 0.3456918897460048, + "grad_norm": 3.11128568649292, + "learning_rate": 6.887628849084741e-07, + "loss": 0.803, + "step": 22205 + }, + { + "epoch": 0.3457697307479742, + "grad_norm": 6.103161334991455, + "learning_rate": 6.886809459038692e-07, + "loss": 0.6508, + "step": 22210 + }, + { + "epoch": 0.34584757174994357, + "grad_norm": 4.672749996185303, + "learning_rate": 6.885990068992642e-07, + "loss": 0.6122, + "step": 22215 + }, + { + "epoch": 0.34592541275191296, + "grad_norm": 6.16982364654541, + "learning_rate": 6.885170678946591e-07, + "loss": 0.6678, + "step": 22220 + }, + { + "epoch": 0.3460032537538823, + "grad_norm": 3.5904715061187744, + "learning_rate": 6.884351288900542e-07, + "loss": 0.7361, + "step": 22225 + }, + { + "epoch": 0.3460810947558517, + "grad_norm": 3.5172922611236572, + "learning_rate": 6.883531898854492e-07, + "loss": 0.8611, + "step": 22230 + }, + { + "epoch": 0.3461589357578211, + "grad_norm": 4.074573993682861, + "learning_rate": 6.882712508808442e-07, + "loss": 0.8436, + "step": 22235 + }, + { + "epoch": 0.3462367767597905, + "grad_norm": 3.7133467197418213, + "learning_rate": 6.881893118762393e-07, + "loss": 0.8602, + "step": 22240 + }, + { + "epoch": 0.3463146177617598, + "grad_norm": 3.753530263900757, + "learning_rate": 6.881073728716344e-07, + "loss": 0.6924, + "step": 22245 + }, + { + "epoch": 0.3463924587637292, + "grad_norm": 3.2277767658233643, + "learning_rate": 6.880254338670293e-07, + "loss": 0.7164, + "step": 22250 + }, + { + "epoch": 0.3464702997656986, + "grad_norm": 4.366305828094482, + "learning_rate": 6.879434948624244e-07, + "loss": 0.7426, + "step": 22255 + }, + { + "epoch": 0.34654814076766793, + "grad_norm": 3.119943618774414, + "learning_rate": 6.878615558578194e-07, + "loss": 0.7346, + "step": 22260 + }, + { + "epoch": 0.3466259817696373, + "grad_norm": 6.399360179901123, + "learning_rate": 6.877796168532143e-07, + "loss": 0.8213, + "step": 22265 + }, + { + "epoch": 0.3467038227716067, + "grad_norm": 2.9667718410491943, + "learning_rate": 6.876976778486094e-07, + "loss": 0.778, + "step": 22270 + }, + { + "epoch": 0.3467816637735761, + "grad_norm": 12.54854679107666, + "learning_rate": 6.876157388440045e-07, + "loss": 0.71, + "step": 22275 + }, + { + "epoch": 0.34685950477554545, + "grad_norm": 3.2707033157348633, + "learning_rate": 6.875337998393996e-07, + "loss": 0.7795, + "step": 22280 + }, + { + "epoch": 0.34693734577751484, + "grad_norm": 3.859928607940674, + "learning_rate": 6.874518608347945e-07, + "loss": 0.7869, + "step": 22285 + }, + { + "epoch": 0.34701518677948423, + "grad_norm": 6.35730504989624, + "learning_rate": 6.873699218301896e-07, + "loss": 0.8143, + "step": 22290 + }, + { + "epoch": 0.3470930277814536, + "grad_norm": 6.358939170837402, + "learning_rate": 6.872879828255847e-07, + "loss": 0.8757, + "step": 22295 + }, + { + "epoch": 0.34717086878342296, + "grad_norm": 3.4988765716552734, + "learning_rate": 6.872060438209797e-07, + "loss": 0.6424, + "step": 22300 + }, + { + "epoch": 0.34724870978539235, + "grad_norm": 3.9090397357940674, + "learning_rate": 6.871241048163746e-07, + "loss": 0.8496, + "step": 22305 + }, + { + "epoch": 0.34732655078736174, + "grad_norm": 5.115474700927734, + "learning_rate": 6.870421658117697e-07, + "loss": 0.7153, + "step": 22310 + }, + { + "epoch": 0.34740439178933114, + "grad_norm": 3.7270002365112305, + "learning_rate": 6.869602268071647e-07, + "loss": 0.8032, + "step": 22315 + }, + { + "epoch": 0.34748223279130047, + "grad_norm": 3.265317678451538, + "learning_rate": 6.868782878025597e-07, + "loss": 0.7228, + "step": 22320 + }, + { + "epoch": 0.34756007379326986, + "grad_norm": 2.9664700031280518, + "learning_rate": 6.867963487979548e-07, + "loss": 0.6512, + "step": 22325 + }, + { + "epoch": 0.34763791479523926, + "grad_norm": 6.434524059295654, + "learning_rate": 6.867144097933498e-07, + "loss": 0.7779, + "step": 22330 + }, + { + "epoch": 0.34771575579720865, + "grad_norm": 7.022162437438965, + "learning_rate": 6.866324707887449e-07, + "loss": 0.8811, + "step": 22335 + }, + { + "epoch": 0.347793596799178, + "grad_norm": 4.4307427406311035, + "learning_rate": 6.865505317841399e-07, + "loss": 0.7314, + "step": 22340 + }, + { + "epoch": 0.3478714378011474, + "grad_norm": 5.711907386779785, + "learning_rate": 6.864685927795348e-07, + "loss": 0.7039, + "step": 22345 + }, + { + "epoch": 0.34794927880311677, + "grad_norm": 2.9050698280334473, + "learning_rate": 6.863866537749299e-07, + "loss": 0.816, + "step": 22350 + }, + { + "epoch": 0.3480271198050861, + "grad_norm": 2.579324960708618, + "learning_rate": 6.86304714770325e-07, + "loss": 0.7305, + "step": 22355 + }, + { + "epoch": 0.3481049608070555, + "grad_norm": 3.0604426860809326, + "learning_rate": 6.862227757657199e-07, + "loss": 0.735, + "step": 22360 + }, + { + "epoch": 0.3481828018090249, + "grad_norm": 3.0506060123443604, + "learning_rate": 6.86140836761115e-07, + "loss": 0.7676, + "step": 22365 + }, + { + "epoch": 0.3482606428109943, + "grad_norm": 3.921900510787964, + "learning_rate": 6.860588977565101e-07, + "loss": 0.7507, + "step": 22370 + }, + { + "epoch": 0.3483384838129636, + "grad_norm": 5.635601043701172, + "learning_rate": 6.85976958751905e-07, + "loss": 0.7118, + "step": 22375 + }, + { + "epoch": 0.348416324814933, + "grad_norm": 7.334948539733887, + "learning_rate": 6.858950197473001e-07, + "loss": 0.6346, + "step": 22380 + }, + { + "epoch": 0.3484941658169024, + "grad_norm": 5.019484996795654, + "learning_rate": 6.858130807426951e-07, + "loss": 0.7292, + "step": 22385 + }, + { + "epoch": 0.3485720068188718, + "grad_norm": 2.8227858543395996, + "learning_rate": 6.857311417380901e-07, + "loss": 0.8263, + "step": 22390 + }, + { + "epoch": 0.34864984782084113, + "grad_norm": 3.0834336280822754, + "learning_rate": 6.856492027334851e-07, + "loss": 0.8384, + "step": 22395 + }, + { + "epoch": 0.3487276888228105, + "grad_norm": 2.915942430496216, + "learning_rate": 6.855672637288802e-07, + "loss": 0.6825, + "step": 22400 + }, + { + "epoch": 0.3488055298247799, + "grad_norm": 10.376582145690918, + "learning_rate": 6.854853247242753e-07, + "loss": 0.7687, + "step": 22405 + }, + { + "epoch": 0.3488833708267493, + "grad_norm": 4.40883731842041, + "learning_rate": 6.854033857196702e-07, + "loss": 0.8783, + "step": 22410 + }, + { + "epoch": 0.34896121182871864, + "grad_norm": 3.5486271381378174, + "learning_rate": 6.853214467150653e-07, + "loss": 0.6884, + "step": 22415 + }, + { + "epoch": 0.34903905283068803, + "grad_norm": 3.900449514389038, + "learning_rate": 6.852395077104604e-07, + "loss": 0.7433, + "step": 22420 + }, + { + "epoch": 0.3491168938326574, + "grad_norm": 4.826340675354004, + "learning_rate": 6.851575687058553e-07, + "loss": 0.6692, + "step": 22425 + }, + { + "epoch": 0.3491947348346268, + "grad_norm": 4.2448930740356445, + "learning_rate": 6.850756297012503e-07, + "loss": 0.8648, + "step": 22430 + }, + { + "epoch": 0.34927257583659616, + "grad_norm": 2.990662097930908, + "learning_rate": 6.849936906966454e-07, + "loss": 0.7769, + "step": 22435 + }, + { + "epoch": 0.34935041683856555, + "grad_norm": 4.034456729888916, + "learning_rate": 6.849117516920404e-07, + "loss": 0.8343, + "step": 22440 + }, + { + "epoch": 0.34942825784053494, + "grad_norm": 6.867827415466309, + "learning_rate": 6.848298126874355e-07, + "loss": 0.8551, + "step": 22445 + }, + { + "epoch": 0.3495060988425043, + "grad_norm": 3.230297327041626, + "learning_rate": 6.847478736828305e-07, + "loss": 0.7914, + "step": 22450 + }, + { + "epoch": 0.34958393984447367, + "grad_norm": 4.498978137969971, + "learning_rate": 6.846659346782255e-07, + "loss": 0.7035, + "step": 22455 + }, + { + "epoch": 0.34966178084644306, + "grad_norm": 4.367350101470947, + "learning_rate": 6.845839956736206e-07, + "loss": 0.8114, + "step": 22460 + }, + { + "epoch": 0.34973962184841245, + "grad_norm": 8.373358726501465, + "learning_rate": 6.845020566690155e-07, + "loss": 0.7263, + "step": 22465 + }, + { + "epoch": 0.3498174628503818, + "grad_norm": 2.916916608810425, + "learning_rate": 6.844201176644105e-07, + "loss": 0.7741, + "step": 22470 + }, + { + "epoch": 0.3498953038523512, + "grad_norm": 3.4376893043518066, + "learning_rate": 6.843381786598056e-07, + "loss": 0.7689, + "step": 22475 + }, + { + "epoch": 0.3499731448543206, + "grad_norm": 4.603682994842529, + "learning_rate": 6.842562396552007e-07, + "loss": 0.7048, + "step": 22480 + }, + { + "epoch": 0.35005098585628996, + "grad_norm": 6.618813991546631, + "learning_rate": 6.841743006505956e-07, + "loss": 0.7252, + "step": 22485 + }, + { + "epoch": 0.3501288268582593, + "grad_norm": 2.3469929695129395, + "learning_rate": 6.840923616459907e-07, + "loss": 0.621, + "step": 22490 + }, + { + "epoch": 0.3502066678602287, + "grad_norm": 4.470626354217529, + "learning_rate": 6.840104226413858e-07, + "loss": 0.7745, + "step": 22495 + }, + { + "epoch": 0.3502845088621981, + "grad_norm": 3.6722328662872314, + "learning_rate": 6.839284836367807e-07, + "loss": 0.7833, + "step": 22500 + }, + { + "epoch": 0.3503623498641675, + "grad_norm": 3.7017297744750977, + "learning_rate": 6.838465446321757e-07, + "loss": 0.7217, + "step": 22505 + }, + { + "epoch": 0.3504401908661368, + "grad_norm": 3.2131004333496094, + "learning_rate": 6.837646056275708e-07, + "loss": 0.7248, + "step": 22510 + }, + { + "epoch": 0.3505180318681062, + "grad_norm": 3.843090772628784, + "learning_rate": 6.836826666229658e-07, + "loss": 0.7478, + "step": 22515 + }, + { + "epoch": 0.3505958728700756, + "grad_norm": 6.186388969421387, + "learning_rate": 6.836007276183608e-07, + "loss": 0.6838, + "step": 22520 + }, + { + "epoch": 0.35067371387204493, + "grad_norm": 14.013809204101562, + "learning_rate": 6.835187886137559e-07, + "loss": 0.7547, + "step": 22525 + }, + { + "epoch": 0.3507515548740143, + "grad_norm": 4.710532188415527, + "learning_rate": 6.83436849609151e-07, + "loss": 0.8102, + "step": 22530 + }, + { + "epoch": 0.3508293958759837, + "grad_norm": 3.910647392272949, + "learning_rate": 6.83354910604546e-07, + "loss": 0.6972, + "step": 22535 + }, + { + "epoch": 0.3509072368779531, + "grad_norm": 4.449835300445557, + "learning_rate": 6.83272971599941e-07, + "loss": 0.7189, + "step": 22540 + }, + { + "epoch": 0.35098507787992245, + "grad_norm": 5.269973278045654, + "learning_rate": 6.83191032595336e-07, + "loss": 0.7083, + "step": 22545 + }, + { + "epoch": 0.35106291888189184, + "grad_norm": 7.316697120666504, + "learning_rate": 6.83109093590731e-07, + "loss": 0.7048, + "step": 22550 + }, + { + "epoch": 0.35114075988386123, + "grad_norm": 3.925175189971924, + "learning_rate": 6.83027154586126e-07, + "loss": 0.7584, + "step": 22555 + }, + { + "epoch": 0.3512186008858306, + "grad_norm": 5.711633682250977, + "learning_rate": 6.829452155815211e-07, + "loss": 0.7734, + "step": 22560 + }, + { + "epoch": 0.35129644188779996, + "grad_norm": 3.5585262775421143, + "learning_rate": 6.828632765769161e-07, + "loss": 0.7737, + "step": 22565 + }, + { + "epoch": 0.35137428288976935, + "grad_norm": 2.6927120685577393, + "learning_rate": 6.827813375723112e-07, + "loss": 0.7866, + "step": 22570 + }, + { + "epoch": 0.35145212389173874, + "grad_norm": 3.7263312339782715, + "learning_rate": 6.826993985677062e-07, + "loss": 0.8001, + "step": 22575 + }, + { + "epoch": 0.35152996489370814, + "grad_norm": 5.524526119232178, + "learning_rate": 6.826174595631012e-07, + "loss": 0.7609, + "step": 22580 + }, + { + "epoch": 0.35160780589567747, + "grad_norm": 6.121705055236816, + "learning_rate": 6.825355205584962e-07, + "loss": 0.8267, + "step": 22585 + }, + { + "epoch": 0.35168564689764686, + "grad_norm": 10.191218376159668, + "learning_rate": 6.824535815538912e-07, + "loss": 0.8606, + "step": 22590 + }, + { + "epoch": 0.35176348789961626, + "grad_norm": 3.0002195835113525, + "learning_rate": 6.823716425492862e-07, + "loss": 0.644, + "step": 22595 + }, + { + "epoch": 0.35184132890158565, + "grad_norm": 9.535001754760742, + "learning_rate": 6.822897035446813e-07, + "loss": 0.7069, + "step": 22600 + }, + { + "epoch": 0.351919169903555, + "grad_norm": 2.9684154987335205, + "learning_rate": 6.822077645400764e-07, + "loss": 0.8066, + "step": 22605 + }, + { + "epoch": 0.3519970109055244, + "grad_norm": 3.142479419708252, + "learning_rate": 6.821258255354713e-07, + "loss": 0.8111, + "step": 22610 + }, + { + "epoch": 0.35207485190749377, + "grad_norm": 4.711599349975586, + "learning_rate": 6.820438865308664e-07, + "loss": 0.7644, + "step": 22615 + }, + { + "epoch": 0.3521526929094631, + "grad_norm": 5.315770149230957, + "learning_rate": 6.819619475262615e-07, + "loss": 0.6956, + "step": 22620 + }, + { + "epoch": 0.3522305339114325, + "grad_norm": 3.60571026802063, + "learning_rate": 6.818800085216565e-07, + "loss": 0.7191, + "step": 22625 + }, + { + "epoch": 0.3523083749134019, + "grad_norm": 3.6298365592956543, + "learning_rate": 6.817980695170514e-07, + "loss": 0.6508, + "step": 22630 + }, + { + "epoch": 0.3523862159153713, + "grad_norm": 4.676616191864014, + "learning_rate": 6.817161305124465e-07, + "loss": 0.8316, + "step": 22635 + }, + { + "epoch": 0.3524640569173406, + "grad_norm": 2.8332366943359375, + "learning_rate": 6.816341915078415e-07, + "loss": 0.7621, + "step": 22640 + }, + { + "epoch": 0.35254189791931, + "grad_norm": 4.0415167808532715, + "learning_rate": 6.815522525032365e-07, + "loss": 0.6998, + "step": 22645 + }, + { + "epoch": 0.3526197389212794, + "grad_norm": 3.8834304809570312, + "learning_rate": 6.814703134986316e-07, + "loss": 0.7337, + "step": 22650 + }, + { + "epoch": 0.3526975799232488, + "grad_norm": 3.6226377487182617, + "learning_rate": 6.813883744940267e-07, + "loss": 0.8377, + "step": 22655 + }, + { + "epoch": 0.35277542092521813, + "grad_norm": 6.299158573150635, + "learning_rate": 6.813064354894217e-07, + "loss": 0.8646, + "step": 22660 + }, + { + "epoch": 0.3528532619271875, + "grad_norm": 7.230133533477783, + "learning_rate": 6.812244964848167e-07, + "loss": 0.8336, + "step": 22665 + }, + { + "epoch": 0.3529311029291569, + "grad_norm": 6.897984981536865, + "learning_rate": 6.811425574802117e-07, + "loss": 0.8237, + "step": 22670 + }, + { + "epoch": 0.3530089439311263, + "grad_norm": 5.134638786315918, + "learning_rate": 6.810606184756067e-07, + "loss": 0.8048, + "step": 22675 + }, + { + "epoch": 0.35308678493309564, + "grad_norm": 2.6234192848205566, + "learning_rate": 6.809786794710017e-07, + "loss": 0.7218, + "step": 22680 + }, + { + "epoch": 0.35316462593506504, + "grad_norm": 4.5663909912109375, + "learning_rate": 6.808967404663968e-07, + "loss": 0.7262, + "step": 22685 + }, + { + "epoch": 0.3532424669370344, + "grad_norm": 3.0789310932159424, + "learning_rate": 6.808148014617918e-07, + "loss": 0.8084, + "step": 22690 + }, + { + "epoch": 0.35332030793900376, + "grad_norm": 4.5040411949157715, + "learning_rate": 6.807328624571869e-07, + "loss": 0.7338, + "step": 22695 + }, + { + "epoch": 0.35339814894097316, + "grad_norm": 3.41920804977417, + "learning_rate": 6.806509234525819e-07, + "loss": 0.8064, + "step": 22700 + }, + { + "epoch": 0.35347598994294255, + "grad_norm": 9.487191200256348, + "learning_rate": 6.805689844479769e-07, + "loss": 0.7528, + "step": 22705 + }, + { + "epoch": 0.35355383094491194, + "grad_norm": 4.45808744430542, + "learning_rate": 6.804870454433719e-07, + "loss": 0.7882, + "step": 22710 + }, + { + "epoch": 0.3536316719468813, + "grad_norm": 4.73945426940918, + "learning_rate": 6.80405106438767e-07, + "loss": 0.8084, + "step": 22715 + }, + { + "epoch": 0.35370951294885067, + "grad_norm": 4.235296726226807, + "learning_rate": 6.803231674341619e-07, + "loss": 0.8334, + "step": 22720 + }, + { + "epoch": 0.35378735395082006, + "grad_norm": 4.562685489654541, + "learning_rate": 6.80241228429557e-07, + "loss": 0.6529, + "step": 22725 + }, + { + "epoch": 0.35386519495278945, + "grad_norm": 3.7877378463745117, + "learning_rate": 6.801592894249521e-07, + "loss": 0.8522, + "step": 22730 + }, + { + "epoch": 0.3539430359547588, + "grad_norm": 7.546112537384033, + "learning_rate": 6.80077350420347e-07, + "loss": 0.6945, + "step": 22735 + }, + { + "epoch": 0.3540208769567282, + "grad_norm": 3.382904052734375, + "learning_rate": 6.799954114157421e-07, + "loss": 0.8918, + "step": 22740 + }, + { + "epoch": 0.3540987179586976, + "grad_norm": 4.363940238952637, + "learning_rate": 6.799134724111372e-07, + "loss": 0.8784, + "step": 22745 + }, + { + "epoch": 0.35417655896066697, + "grad_norm": 4.633853435516357, + "learning_rate": 6.798315334065321e-07, + "loss": 0.8348, + "step": 22750 + }, + { + "epoch": 0.3542543999626363, + "grad_norm": 3.2815988063812256, + "learning_rate": 6.797495944019271e-07, + "loss": 0.7236, + "step": 22755 + }, + { + "epoch": 0.3543322409646057, + "grad_norm": 3.208714723587036, + "learning_rate": 6.796676553973222e-07, + "loss": 0.7547, + "step": 22760 + }, + { + "epoch": 0.3544100819665751, + "grad_norm": 3.2043545246124268, + "learning_rate": 6.795857163927172e-07, + "loss": 0.7042, + "step": 22765 + }, + { + "epoch": 0.3544879229685445, + "grad_norm": 3.376326322555542, + "learning_rate": 6.795037773881122e-07, + "loss": 0.7247, + "step": 22770 + }, + { + "epoch": 0.3545657639705138, + "grad_norm": 2.1214239597320557, + "learning_rate": 6.794218383835073e-07, + "loss": 0.7341, + "step": 22775 + }, + { + "epoch": 0.3546436049724832, + "grad_norm": 3.4116671085357666, + "learning_rate": 6.793398993789024e-07, + "loss": 0.8146, + "step": 22780 + }, + { + "epoch": 0.3547214459744526, + "grad_norm": 4.884197235107422, + "learning_rate": 6.792579603742974e-07, + "loss": 0.6964, + "step": 22785 + }, + { + "epoch": 0.35479928697642193, + "grad_norm": 2.820373773574829, + "learning_rate": 6.791760213696923e-07, + "loss": 0.6723, + "step": 22790 + }, + { + "epoch": 0.3548771279783913, + "grad_norm": 3.8949737548828125, + "learning_rate": 6.790940823650874e-07, + "loss": 0.8173, + "step": 22795 + }, + { + "epoch": 0.3549549689803607, + "grad_norm": 4.030924320220947, + "learning_rate": 6.790121433604824e-07, + "loss": 0.6769, + "step": 22800 + }, + { + "epoch": 0.3550328099823301, + "grad_norm": 4.274011135101318, + "learning_rate": 6.789302043558775e-07, + "loss": 0.6461, + "step": 22805 + }, + { + "epoch": 0.35511065098429945, + "grad_norm": 2.990159749984741, + "learning_rate": 6.788482653512725e-07, + "loss": 0.8349, + "step": 22810 + }, + { + "epoch": 0.35518849198626884, + "grad_norm": 2.8439786434173584, + "learning_rate": 6.787663263466675e-07, + "loss": 0.7096, + "step": 22815 + }, + { + "epoch": 0.35526633298823823, + "grad_norm": 4.419504642486572, + "learning_rate": 6.786843873420626e-07, + "loss": 0.788, + "step": 22820 + }, + { + "epoch": 0.3553441739902076, + "grad_norm": 9.070401191711426, + "learning_rate": 6.786024483374576e-07, + "loss": 0.7732, + "step": 22825 + }, + { + "epoch": 0.35542201499217696, + "grad_norm": 4.542810440063477, + "learning_rate": 6.785205093328525e-07, + "loss": 0.7192, + "step": 22830 + }, + { + "epoch": 0.35549985599414635, + "grad_norm": 5.248653411865234, + "learning_rate": 6.784385703282476e-07, + "loss": 0.7031, + "step": 22835 + }, + { + "epoch": 0.35557769699611574, + "grad_norm": 4.995849132537842, + "learning_rate": 6.783566313236427e-07, + "loss": 0.8282, + "step": 22840 + }, + { + "epoch": 0.35565553799808514, + "grad_norm": 4.0547637939453125, + "learning_rate": 6.782746923190376e-07, + "loss": 0.7117, + "step": 22845 + }, + { + "epoch": 0.3557333790000545, + "grad_norm": 2.6417508125305176, + "learning_rate": 6.781927533144327e-07, + "loss": 0.6833, + "step": 22850 + }, + { + "epoch": 0.35581122000202386, + "grad_norm": 3.902892827987671, + "learning_rate": 6.781108143098278e-07, + "loss": 0.7101, + "step": 22855 + }, + { + "epoch": 0.35588906100399326, + "grad_norm": 6.848975658416748, + "learning_rate": 6.780288753052227e-07, + "loss": 0.7738, + "step": 22860 + }, + { + "epoch": 0.3559669020059626, + "grad_norm": 5.913900375366211, + "learning_rate": 6.779469363006178e-07, + "loss": 0.9128, + "step": 22865 + }, + { + "epoch": 0.356044743007932, + "grad_norm": 4.356508255004883, + "learning_rate": 6.778649972960128e-07, + "loss": 0.8422, + "step": 22870 + }, + { + "epoch": 0.3561225840099014, + "grad_norm": 3.5027060508728027, + "learning_rate": 6.777830582914078e-07, + "loss": 0.8309, + "step": 22875 + }, + { + "epoch": 0.35620042501187077, + "grad_norm": 3.5927176475524902, + "learning_rate": 6.777011192868028e-07, + "loss": 0.7523, + "step": 22880 + }, + { + "epoch": 0.3562782660138401, + "grad_norm": 4.901709079742432, + "learning_rate": 6.776191802821979e-07, + "loss": 0.6756, + "step": 22885 + }, + { + "epoch": 0.3563561070158095, + "grad_norm": 3.4579625129699707, + "learning_rate": 6.775372412775929e-07, + "loss": 0.7965, + "step": 22890 + }, + { + "epoch": 0.3564339480177789, + "grad_norm": 3.0406250953674316, + "learning_rate": 6.77455302272988e-07, + "loss": 0.642, + "step": 22895 + }, + { + "epoch": 0.3565117890197483, + "grad_norm": 4.66744327545166, + "learning_rate": 6.77373363268383e-07, + "loss": 0.7916, + "step": 22900 + }, + { + "epoch": 0.3565896300217176, + "grad_norm": 3.9514176845550537, + "learning_rate": 6.772914242637781e-07, + "loss": 0.7903, + "step": 22905 + }, + { + "epoch": 0.356667471023687, + "grad_norm": 3.19490385055542, + "learning_rate": 6.77209485259173e-07, + "loss": 0.8109, + "step": 22910 + }, + { + "epoch": 0.3567453120256564, + "grad_norm": 3.2852671146392822, + "learning_rate": 6.77127546254568e-07, + "loss": 0.7254, + "step": 22915 + }, + { + "epoch": 0.3568231530276258, + "grad_norm": 2.8636245727539062, + "learning_rate": 6.770456072499631e-07, + "loss": 0.8545, + "step": 22920 + }, + { + "epoch": 0.35690099402959513, + "grad_norm": 3.2930498123168945, + "learning_rate": 6.769636682453581e-07, + "loss": 0.6733, + "step": 22925 + }, + { + "epoch": 0.3569788350315645, + "grad_norm": 4.192903995513916, + "learning_rate": 6.768817292407532e-07, + "loss": 0.7281, + "step": 22930 + }, + { + "epoch": 0.3570566760335339, + "grad_norm": 3.524688482284546, + "learning_rate": 6.767997902361482e-07, + "loss": 0.773, + "step": 22935 + }, + { + "epoch": 0.3571345170355033, + "grad_norm": 3.3027379512786865, + "learning_rate": 6.767178512315432e-07, + "loss": 0.7367, + "step": 22940 + }, + { + "epoch": 0.35721235803747264, + "grad_norm": 5.94819450378418, + "learning_rate": 6.766359122269383e-07, + "loss": 0.7967, + "step": 22945 + }, + { + "epoch": 0.35729019903944204, + "grad_norm": 3.1738839149475098, + "learning_rate": 6.765539732223334e-07, + "loss": 0.8166, + "step": 22950 + }, + { + "epoch": 0.35736804004141143, + "grad_norm": 5.154567241668701, + "learning_rate": 6.764720342177282e-07, + "loss": 0.8211, + "step": 22955 + }, + { + "epoch": 0.35744588104338076, + "grad_norm": 3.465256452560425, + "learning_rate": 6.763900952131233e-07, + "loss": 0.7854, + "step": 22960 + }, + { + "epoch": 0.35752372204535016, + "grad_norm": 4.536584854125977, + "learning_rate": 6.763081562085184e-07, + "loss": 0.7004, + "step": 22965 + }, + { + "epoch": 0.35760156304731955, + "grad_norm": 8.660713195800781, + "learning_rate": 6.762262172039133e-07, + "loss": 0.7063, + "step": 22970 + }, + { + "epoch": 0.35767940404928894, + "grad_norm": 3.680833339691162, + "learning_rate": 6.761442781993084e-07, + "loss": 0.691, + "step": 22975 + }, + { + "epoch": 0.3577572450512583, + "grad_norm": 2.5678701400756836, + "learning_rate": 6.760623391947035e-07, + "loss": 0.7232, + "step": 22980 + }, + { + "epoch": 0.35783508605322767, + "grad_norm": 5.785712718963623, + "learning_rate": 6.759804001900985e-07, + "loss": 0.7438, + "step": 22985 + }, + { + "epoch": 0.35791292705519706, + "grad_norm": 4.117922782897949, + "learning_rate": 6.758984611854935e-07, + "loss": 0.7896, + "step": 22990 + }, + { + "epoch": 0.35799076805716645, + "grad_norm": 3.6091392040252686, + "learning_rate": 6.758165221808885e-07, + "loss": 0.8481, + "step": 22995 + }, + { + "epoch": 0.3580686090591358, + "grad_norm": 5.465922832489014, + "learning_rate": 6.757345831762835e-07, + "loss": 0.7218, + "step": 23000 + }, + { + "epoch": 0.3581464500611052, + "grad_norm": 5.9281206130981445, + "learning_rate": 6.756526441716785e-07, + "loss": 0.6711, + "step": 23005 + }, + { + "epoch": 0.3582242910630746, + "grad_norm": 2.9041519165039062, + "learning_rate": 6.755707051670736e-07, + "loss": 0.8283, + "step": 23010 + }, + { + "epoch": 0.35830213206504397, + "grad_norm": 4.295798301696777, + "learning_rate": 6.754887661624686e-07, + "loss": 0.8098, + "step": 23015 + }, + { + "epoch": 0.3583799730670133, + "grad_norm": 7.160951614379883, + "learning_rate": 6.754068271578637e-07, + "loss": 0.7757, + "step": 23020 + }, + { + "epoch": 0.3584578140689827, + "grad_norm": 2.566189765930176, + "learning_rate": 6.753248881532587e-07, + "loss": 0.7379, + "step": 23025 + }, + { + "epoch": 0.3585356550709521, + "grad_norm": 4.007600784301758, + "learning_rate": 6.752429491486538e-07, + "loss": 0.803, + "step": 23030 + }, + { + "epoch": 0.3586134960729215, + "grad_norm": 3.9632365703582764, + "learning_rate": 6.751610101440487e-07, + "loss": 0.7326, + "step": 23035 + }, + { + "epoch": 0.3586913370748908, + "grad_norm": 4.623491287231445, + "learning_rate": 6.750790711394438e-07, + "loss": 0.746, + "step": 23040 + }, + { + "epoch": 0.3587691780768602, + "grad_norm": 5.454947471618652, + "learning_rate": 6.749971321348388e-07, + "loss": 0.8059, + "step": 23045 + }, + { + "epoch": 0.3588470190788296, + "grad_norm": 2.595407247543335, + "learning_rate": 6.749151931302338e-07, + "loss": 0.7712, + "step": 23050 + }, + { + "epoch": 0.35892486008079894, + "grad_norm": 4.200428485870361, + "learning_rate": 6.748332541256289e-07, + "loss": 0.6896, + "step": 23055 + }, + { + "epoch": 0.3590027010827683, + "grad_norm": 3.9522597789764404, + "learning_rate": 6.747513151210239e-07, + "loss": 0.9086, + "step": 23060 + }, + { + "epoch": 0.3590805420847377, + "grad_norm": 8.130274772644043, + "learning_rate": 6.746693761164189e-07, + "loss": 0.7731, + "step": 23065 + }, + { + "epoch": 0.3591583830867071, + "grad_norm": 5.477586269378662, + "learning_rate": 6.74587437111814e-07, + "loss": 0.7276, + "step": 23070 + }, + { + "epoch": 0.35923622408867645, + "grad_norm": 5.232213973999023, + "learning_rate": 6.74505498107209e-07, + "loss": 0.728, + "step": 23075 + }, + { + "epoch": 0.35931406509064584, + "grad_norm": 4.288384437561035, + "learning_rate": 6.744235591026039e-07, + "loss": 0.7833, + "step": 23080 + }, + { + "epoch": 0.35939190609261523, + "grad_norm": 10.313909530639648, + "learning_rate": 6.74341620097999e-07, + "loss": 0.8917, + "step": 23085 + }, + { + "epoch": 0.3594697470945846, + "grad_norm": 3.7162394523620605, + "learning_rate": 6.742596810933941e-07, + "loss": 0.8603, + "step": 23090 + }, + { + "epoch": 0.35954758809655396, + "grad_norm": 4.621378421783447, + "learning_rate": 6.74177742088789e-07, + "loss": 0.6715, + "step": 23095 + }, + { + "epoch": 0.35962542909852335, + "grad_norm": 3.898655652999878, + "learning_rate": 6.740958030841841e-07, + "loss": 0.7817, + "step": 23100 + }, + { + "epoch": 0.35970327010049274, + "grad_norm": 3.6938929557800293, + "learning_rate": 6.740138640795792e-07, + "loss": 0.7954, + "step": 23105 + }, + { + "epoch": 0.35978111110246214, + "grad_norm": 3.4234812259674072, + "learning_rate": 6.739319250749742e-07, + "loss": 0.7543, + "step": 23110 + }, + { + "epoch": 0.3598589521044315, + "grad_norm": 8.834489822387695, + "learning_rate": 6.738499860703691e-07, + "loss": 0.7172, + "step": 23115 + }, + { + "epoch": 0.35993679310640087, + "grad_norm": 4.670121192932129, + "learning_rate": 6.737680470657642e-07, + "loss": 0.6233, + "step": 23120 + }, + { + "epoch": 0.36001463410837026, + "grad_norm": 5.580198287963867, + "learning_rate": 6.736861080611592e-07, + "loss": 0.8335, + "step": 23125 + }, + { + "epoch": 0.3600924751103396, + "grad_norm": 3.1863162517547607, + "learning_rate": 6.736041690565543e-07, + "loss": 0.7478, + "step": 23130 + }, + { + "epoch": 0.360170316112309, + "grad_norm": 3.62100887298584, + "learning_rate": 6.735222300519493e-07, + "loss": 0.7283, + "step": 23135 + }, + { + "epoch": 0.3602481571142784, + "grad_norm": 3.6080639362335205, + "learning_rate": 6.734402910473443e-07, + "loss": 0.7048, + "step": 23140 + }, + { + "epoch": 0.36032599811624777, + "grad_norm": 4.390148639678955, + "learning_rate": 6.733583520427394e-07, + "loss": 0.7789, + "step": 23145 + }, + { + "epoch": 0.3604038391182171, + "grad_norm": 3.059814214706421, + "learning_rate": 6.732764130381344e-07, + "loss": 0.7741, + "step": 23150 + }, + { + "epoch": 0.3604816801201865, + "grad_norm": 8.699779510498047, + "learning_rate": 6.731944740335293e-07, + "loss": 0.8658, + "step": 23155 + }, + { + "epoch": 0.3605595211221559, + "grad_norm": 2.8748719692230225, + "learning_rate": 6.731125350289244e-07, + "loss": 0.7413, + "step": 23160 + }, + { + "epoch": 0.3606373621241253, + "grad_norm": 10.56540584564209, + "learning_rate": 6.730305960243195e-07, + "loss": 0.7527, + "step": 23165 + }, + { + "epoch": 0.3607152031260946, + "grad_norm": 4.239411354064941, + "learning_rate": 6.729486570197145e-07, + "loss": 0.7167, + "step": 23170 + }, + { + "epoch": 0.360793044128064, + "grad_norm": 4.87721061706543, + "learning_rate": 6.728667180151095e-07, + "loss": 0.6902, + "step": 23175 + }, + { + "epoch": 0.3608708851300334, + "grad_norm": 4.051860809326172, + "learning_rate": 6.727847790105046e-07, + "loss": 0.7506, + "step": 23180 + }, + { + "epoch": 0.3609487261320028, + "grad_norm": 6.635641574859619, + "learning_rate": 6.727028400058997e-07, + "loss": 0.7678, + "step": 23185 + }, + { + "epoch": 0.36102656713397213, + "grad_norm": 4.147698879241943, + "learning_rate": 6.726209010012946e-07, + "loss": 0.7075, + "step": 23190 + }, + { + "epoch": 0.3611044081359415, + "grad_norm": 3.2259066104888916, + "learning_rate": 6.725389619966896e-07, + "loss": 0.6925, + "step": 23195 + }, + { + "epoch": 0.3611822491379109, + "grad_norm": 4.023670673370361, + "learning_rate": 6.724570229920847e-07, + "loss": 0.8152, + "step": 23200 + }, + { + "epoch": 0.3612600901398803, + "grad_norm": 4.489516258239746, + "learning_rate": 6.723750839874796e-07, + "loss": 0.8063, + "step": 23205 + }, + { + "epoch": 0.36133793114184964, + "grad_norm": 14.589898109436035, + "learning_rate": 6.722931449828747e-07, + "loss": 0.7012, + "step": 23210 + }, + { + "epoch": 0.36141577214381904, + "grad_norm": 7.213819980621338, + "learning_rate": 6.722112059782698e-07, + "loss": 0.7749, + "step": 23215 + }, + { + "epoch": 0.36149361314578843, + "grad_norm": 3.634456157684326, + "learning_rate": 6.721292669736648e-07, + "loss": 0.9399, + "step": 23220 + }, + { + "epoch": 0.36157145414775776, + "grad_norm": 5.684159278869629, + "learning_rate": 6.720473279690598e-07, + "loss": 0.8365, + "step": 23225 + }, + { + "epoch": 0.36164929514972716, + "grad_norm": 5.689432144165039, + "learning_rate": 6.719653889644549e-07, + "loss": 0.6303, + "step": 23230 + }, + { + "epoch": 0.36172713615169655, + "grad_norm": 4.839646816253662, + "learning_rate": 6.718834499598499e-07, + "loss": 0.8117, + "step": 23235 + }, + { + "epoch": 0.36180497715366594, + "grad_norm": 3.3537235260009766, + "learning_rate": 6.718015109552448e-07, + "loss": 0.7472, + "step": 23240 + }, + { + "epoch": 0.3618828181556353, + "grad_norm": 3.9428837299346924, + "learning_rate": 6.717195719506399e-07, + "loss": 0.8015, + "step": 23245 + }, + { + "epoch": 0.36196065915760467, + "grad_norm": 4.171321868896484, + "learning_rate": 6.716376329460349e-07, + "loss": 0.6913, + "step": 23250 + }, + { + "epoch": 0.36203850015957406, + "grad_norm": 3.294297933578491, + "learning_rate": 6.7155569394143e-07, + "loss": 0.7597, + "step": 23255 + }, + { + "epoch": 0.36211634116154345, + "grad_norm": 3.333055257797241, + "learning_rate": 6.71473754936825e-07, + "loss": 0.6821, + "step": 23260 + }, + { + "epoch": 0.3621941821635128, + "grad_norm": 3.4170455932617188, + "learning_rate": 6.7139181593222e-07, + "loss": 0.7262, + "step": 23265 + }, + { + "epoch": 0.3622720231654822, + "grad_norm": 10.967249870300293, + "learning_rate": 6.713098769276151e-07, + "loss": 0.6685, + "step": 23270 + }, + { + "epoch": 0.3623498641674516, + "grad_norm": 4.1759467124938965, + "learning_rate": 6.712279379230102e-07, + "loss": 0.712, + "step": 23275 + }, + { + "epoch": 0.36242770516942097, + "grad_norm": 3.056581735610962, + "learning_rate": 6.71145998918405e-07, + "loss": 0.7911, + "step": 23280 + }, + { + "epoch": 0.3625055461713903, + "grad_norm": 4.291056156158447, + "learning_rate": 6.710640599138001e-07, + "loss": 0.7522, + "step": 23285 + }, + { + "epoch": 0.3625833871733597, + "grad_norm": 4.416627407073975, + "learning_rate": 6.709821209091952e-07, + "loss": 0.739, + "step": 23290 + }, + { + "epoch": 0.3626612281753291, + "grad_norm": 3.039506435394287, + "learning_rate": 6.709001819045902e-07, + "loss": 0.6868, + "step": 23295 + }, + { + "epoch": 0.3627390691772984, + "grad_norm": 3.890329360961914, + "learning_rate": 6.708182428999852e-07, + "loss": 0.6522, + "step": 23300 + }, + { + "epoch": 0.3628169101792678, + "grad_norm": 3.143517017364502, + "learning_rate": 6.707363038953803e-07, + "loss": 0.7248, + "step": 23305 + }, + { + "epoch": 0.3628947511812372, + "grad_norm": 2.5914478302001953, + "learning_rate": 6.706543648907754e-07, + "loss": 0.7247, + "step": 23310 + }, + { + "epoch": 0.3629725921832066, + "grad_norm": 3.5781683921813965, + "learning_rate": 6.705724258861703e-07, + "loss": 0.6967, + "step": 23315 + }, + { + "epoch": 0.36305043318517594, + "grad_norm": 7.8637566566467285, + "learning_rate": 6.704904868815653e-07, + "loss": 0.5988, + "step": 23320 + }, + { + "epoch": 0.36312827418714533, + "grad_norm": 14.063533782958984, + "learning_rate": 6.704085478769604e-07, + "loss": 0.7561, + "step": 23325 + }, + { + "epoch": 0.3632061151891147, + "grad_norm": 4.953052043914795, + "learning_rate": 6.703266088723553e-07, + "loss": 0.8064, + "step": 23330 + }, + { + "epoch": 0.3632839561910841, + "grad_norm": 4.630489826202393, + "learning_rate": 6.702446698677504e-07, + "loss": 0.8702, + "step": 23335 + }, + { + "epoch": 0.36336179719305345, + "grad_norm": 3.3488337993621826, + "learning_rate": 6.701627308631455e-07, + "loss": 0.7062, + "step": 23340 + }, + { + "epoch": 0.36343963819502284, + "grad_norm": 4.161873817443848, + "learning_rate": 6.700807918585405e-07, + "loss": 0.8029, + "step": 23345 + }, + { + "epoch": 0.36351747919699223, + "grad_norm": 6.1759467124938965, + "learning_rate": 6.699988528539355e-07, + "loss": 0.6983, + "step": 23350 + }, + { + "epoch": 0.3635953201989616, + "grad_norm": 4.258143901824951, + "learning_rate": 6.699169138493306e-07, + "loss": 0.7348, + "step": 23355 + }, + { + "epoch": 0.36367316120093096, + "grad_norm": 3.5058975219726562, + "learning_rate": 6.698349748447255e-07, + "loss": 0.7475, + "step": 23360 + }, + { + "epoch": 0.36375100220290035, + "grad_norm": 5.003944396972656, + "learning_rate": 6.697530358401205e-07, + "loss": 0.7452, + "step": 23365 + }, + { + "epoch": 0.36382884320486975, + "grad_norm": 3.4567155838012695, + "learning_rate": 6.696710968355156e-07, + "loss": 0.6901, + "step": 23370 + }, + { + "epoch": 0.36390668420683914, + "grad_norm": 11.741866111755371, + "learning_rate": 6.695891578309106e-07, + "loss": 0.7837, + "step": 23375 + }, + { + "epoch": 0.3639845252088085, + "grad_norm": 3.1956512928009033, + "learning_rate": 6.695072188263057e-07, + "loss": 0.7313, + "step": 23380 + }, + { + "epoch": 0.36406236621077787, + "grad_norm": 3.3680739402770996, + "learning_rate": 6.694252798217007e-07, + "loss": 0.8233, + "step": 23385 + }, + { + "epoch": 0.36414020721274726, + "grad_norm": 4.3190202713012695, + "learning_rate": 6.693433408170957e-07, + "loss": 0.7765, + "step": 23390 + }, + { + "epoch": 0.3642180482147166, + "grad_norm": 3.9722366333007812, + "learning_rate": 6.692614018124908e-07, + "loss": 0.8161, + "step": 23395 + }, + { + "epoch": 0.364295889216686, + "grad_norm": 7.148501873016357, + "learning_rate": 6.691794628078858e-07, + "loss": 0.8317, + "step": 23400 + }, + { + "epoch": 0.3643737302186554, + "grad_norm": 3.410125970840454, + "learning_rate": 6.690975238032807e-07, + "loss": 0.8454, + "step": 23405 + }, + { + "epoch": 0.36445157122062477, + "grad_norm": 4.123642921447754, + "learning_rate": 6.690155847986758e-07, + "loss": 0.6667, + "step": 23410 + }, + { + "epoch": 0.3645294122225941, + "grad_norm": 2.6929523944854736, + "learning_rate": 6.689336457940709e-07, + "loss": 0.6559, + "step": 23415 + }, + { + "epoch": 0.3646072532245635, + "grad_norm": 6.187636375427246, + "learning_rate": 6.688517067894659e-07, + "loss": 0.631, + "step": 23420 + }, + { + "epoch": 0.3646850942265329, + "grad_norm": 3.486956834793091, + "learning_rate": 6.687697677848609e-07, + "loss": 0.7464, + "step": 23425 + }, + { + "epoch": 0.3647629352285023, + "grad_norm": 3.228217601776123, + "learning_rate": 6.68687828780256e-07, + "loss": 0.692, + "step": 23430 + }, + { + "epoch": 0.3648407762304716, + "grad_norm": 3.462691307067871, + "learning_rate": 6.686058897756511e-07, + "loss": 0.661, + "step": 23435 + }, + { + "epoch": 0.364918617232441, + "grad_norm": 3.6201229095458984, + "learning_rate": 6.685239507710459e-07, + "loss": 0.6984, + "step": 23440 + }, + { + "epoch": 0.3649964582344104, + "grad_norm": 3.9280552864074707, + "learning_rate": 6.68442011766441e-07, + "loss": 0.7372, + "step": 23445 + }, + { + "epoch": 0.3650742992363798, + "grad_norm": 3.9565505981445312, + "learning_rate": 6.683600727618361e-07, + "loss": 0.7406, + "step": 23450 + }, + { + "epoch": 0.36515214023834913, + "grad_norm": 3.299711227416992, + "learning_rate": 6.68278133757231e-07, + "loss": 0.8143, + "step": 23455 + }, + { + "epoch": 0.3652299812403185, + "grad_norm": 8.119706153869629, + "learning_rate": 6.681961947526261e-07, + "loss": 0.6385, + "step": 23460 + }, + { + "epoch": 0.3653078222422879, + "grad_norm": 4.427123546600342, + "learning_rate": 6.681142557480212e-07, + "loss": 0.7992, + "step": 23465 + }, + { + "epoch": 0.36538566324425725, + "grad_norm": 6.14912223815918, + "learning_rate": 6.680323167434162e-07, + "loss": 0.6641, + "step": 23470 + }, + { + "epoch": 0.36546350424622664, + "grad_norm": 3.034275770187378, + "learning_rate": 6.679503777388112e-07, + "loss": 0.8102, + "step": 23475 + }, + { + "epoch": 0.36554134524819604, + "grad_norm": 4.197099685668945, + "learning_rate": 6.678684387342062e-07, + "loss": 0.7425, + "step": 23480 + }, + { + "epoch": 0.36561918625016543, + "grad_norm": 3.902829647064209, + "learning_rate": 6.677864997296012e-07, + "loss": 0.7829, + "step": 23485 + }, + { + "epoch": 0.36569702725213477, + "grad_norm": 2.425935983657837, + "learning_rate": 6.677045607249963e-07, + "loss": 0.7347, + "step": 23490 + }, + { + "epoch": 0.36577486825410416, + "grad_norm": 5.808422565460205, + "learning_rate": 6.676226217203913e-07, + "loss": 0.7398, + "step": 23495 + }, + { + "epoch": 0.36585270925607355, + "grad_norm": 3.304903984069824, + "learning_rate": 6.675406827157863e-07, + "loss": 0.7069, + "step": 23500 + }, + { + "epoch": 0.36593055025804294, + "grad_norm": 3.726069927215576, + "learning_rate": 6.674587437111814e-07, + "loss": 0.7961, + "step": 23505 + }, + { + "epoch": 0.3660083912600123, + "grad_norm": 3.518789052963257, + "learning_rate": 6.673768047065764e-07, + "loss": 0.8082, + "step": 23510 + }, + { + "epoch": 0.36608623226198167, + "grad_norm": 5.096744537353516, + "learning_rate": 6.672948657019714e-07, + "loss": 0.6841, + "step": 23515 + }, + { + "epoch": 0.36616407326395106, + "grad_norm": 4.866468906402588, + "learning_rate": 6.672129266973664e-07, + "loss": 0.8747, + "step": 23520 + }, + { + "epoch": 0.36624191426592045, + "grad_norm": 3.5222420692443848, + "learning_rate": 6.671309876927615e-07, + "loss": 0.7526, + "step": 23525 + }, + { + "epoch": 0.3663197552678898, + "grad_norm": 4.000988960266113, + "learning_rate": 6.670490486881564e-07, + "loss": 0.7283, + "step": 23530 + }, + { + "epoch": 0.3663975962698592, + "grad_norm": 2.9310030937194824, + "learning_rate": 6.669671096835515e-07, + "loss": 0.7335, + "step": 23535 + }, + { + "epoch": 0.3664754372718286, + "grad_norm": 2.890139579772949, + "learning_rate": 6.668851706789466e-07, + "loss": 0.82, + "step": 23540 + }, + { + "epoch": 0.36655327827379797, + "grad_norm": 4.196646690368652, + "learning_rate": 6.668032316743417e-07, + "loss": 0.6881, + "step": 23545 + }, + { + "epoch": 0.3666311192757673, + "grad_norm": 3.2677407264709473, + "learning_rate": 6.667212926697366e-07, + "loss": 0.7409, + "step": 23550 + }, + { + "epoch": 0.3667089602777367, + "grad_norm": 3.126652717590332, + "learning_rate": 6.666393536651317e-07, + "loss": 0.7847, + "step": 23555 + }, + { + "epoch": 0.3667868012797061, + "grad_norm": 3.171036720275879, + "learning_rate": 6.665574146605268e-07, + "loss": 0.7718, + "step": 23560 + }, + { + "epoch": 0.3668646422816754, + "grad_norm": 6.803696155548096, + "learning_rate": 6.664754756559216e-07, + "loss": 0.8001, + "step": 23565 + }, + { + "epoch": 0.3669424832836448, + "grad_norm": 2.7900009155273438, + "learning_rate": 6.663935366513167e-07, + "loss": 0.7105, + "step": 23570 + }, + { + "epoch": 0.3670203242856142, + "grad_norm": 7.508723258972168, + "learning_rate": 6.663115976467118e-07, + "loss": 0.7273, + "step": 23575 + }, + { + "epoch": 0.3670981652875836, + "grad_norm": 6.162428379058838, + "learning_rate": 6.662296586421068e-07, + "loss": 0.7806, + "step": 23580 + }, + { + "epoch": 0.36717600628955294, + "grad_norm": 3.379800319671631, + "learning_rate": 6.661477196375018e-07, + "loss": 0.6322, + "step": 23585 + }, + { + "epoch": 0.36725384729152233, + "grad_norm": 4.739343643188477, + "learning_rate": 6.660657806328969e-07, + "loss": 0.6487, + "step": 23590 + }, + { + "epoch": 0.3673316882934917, + "grad_norm": 3.727065086364746, + "learning_rate": 6.659838416282919e-07, + "loss": 0.7775, + "step": 23595 + }, + { + "epoch": 0.3674095292954611, + "grad_norm": 12.969565391540527, + "learning_rate": 6.659019026236869e-07, + "loss": 0.8681, + "step": 23600 + }, + { + "epoch": 0.36748737029743045, + "grad_norm": 3.2617616653442383, + "learning_rate": 6.658199636190819e-07, + "loss": 0.7413, + "step": 23605 + }, + { + "epoch": 0.36756521129939984, + "grad_norm": 3.86673641204834, + "learning_rate": 6.657380246144769e-07, + "loss": 0.7602, + "step": 23610 + }, + { + "epoch": 0.36764305230136923, + "grad_norm": 3.8551628589630127, + "learning_rate": 6.65656085609872e-07, + "loss": 0.7935, + "step": 23615 + }, + { + "epoch": 0.3677208933033386, + "grad_norm": 5.538820266723633, + "learning_rate": 6.65574146605267e-07, + "loss": 0.6844, + "step": 23620 + }, + { + "epoch": 0.36779873430530796, + "grad_norm": 3.6687228679656982, + "learning_rate": 6.65492207600662e-07, + "loss": 0.667, + "step": 23625 + }, + { + "epoch": 0.36787657530727735, + "grad_norm": 3.558749198913574, + "learning_rate": 6.654102685960571e-07, + "loss": 0.7229, + "step": 23630 + }, + { + "epoch": 0.36795441630924675, + "grad_norm": 4.3059234619140625, + "learning_rate": 6.653283295914522e-07, + "loss": 0.8282, + "step": 23635 + }, + { + "epoch": 0.3680322573112161, + "grad_norm": 6.534751892089844, + "learning_rate": 6.652463905868471e-07, + "loss": 0.6982, + "step": 23640 + }, + { + "epoch": 0.3681100983131855, + "grad_norm": 2.8018741607666016, + "learning_rate": 6.651644515822421e-07, + "loss": 0.7222, + "step": 23645 + }, + { + "epoch": 0.36818793931515487, + "grad_norm": 6.206122398376465, + "learning_rate": 6.650825125776372e-07, + "loss": 0.7321, + "step": 23650 + }, + { + "epoch": 0.36826578031712426, + "grad_norm": 2.767120122909546, + "learning_rate": 6.650005735730321e-07, + "loss": 0.6645, + "step": 23655 + }, + { + "epoch": 0.3683436213190936, + "grad_norm": 3.2537684440612793, + "learning_rate": 6.649186345684272e-07, + "loss": 0.7628, + "step": 23660 + }, + { + "epoch": 0.368421462321063, + "grad_norm": 5.646450996398926, + "learning_rate": 6.648366955638223e-07, + "loss": 0.7477, + "step": 23665 + }, + { + "epoch": 0.3684993033230324, + "grad_norm": 3.675793409347534, + "learning_rate": 6.647547565592174e-07, + "loss": 0.6524, + "step": 23670 + }, + { + "epoch": 0.36857714432500177, + "grad_norm": 3.5903780460357666, + "learning_rate": 6.646728175546123e-07, + "loss": 0.7101, + "step": 23675 + }, + { + "epoch": 0.3686549853269711, + "grad_norm": 6.434808254241943, + "learning_rate": 6.645908785500074e-07, + "loss": 0.7431, + "step": 23680 + }, + { + "epoch": 0.3687328263289405, + "grad_norm": 3.1390936374664307, + "learning_rate": 6.645089395454024e-07, + "loss": 0.7688, + "step": 23685 + }, + { + "epoch": 0.3688106673309099, + "grad_norm": 3.689784526824951, + "learning_rate": 6.644270005407973e-07, + "loss": 0.7055, + "step": 23690 + }, + { + "epoch": 0.3688885083328793, + "grad_norm": 3.707280158996582, + "learning_rate": 6.643450615361924e-07, + "loss": 0.7503, + "step": 23695 + }, + { + "epoch": 0.3689663493348486, + "grad_norm": 4.782886981964111, + "learning_rate": 6.642631225315875e-07, + "loss": 0.7002, + "step": 23700 + }, + { + "epoch": 0.369044190336818, + "grad_norm": 3.4875693321228027, + "learning_rate": 6.641811835269825e-07, + "loss": 0.8609, + "step": 23705 + }, + { + "epoch": 0.3691220313387874, + "grad_norm": 4.89082670211792, + "learning_rate": 6.640992445223775e-07, + "loss": 0.7487, + "step": 23710 + }, + { + "epoch": 0.3691998723407568, + "grad_norm": 4.395251750946045, + "learning_rate": 6.640173055177726e-07, + "loss": 0.7893, + "step": 23715 + }, + { + "epoch": 0.36927771334272613, + "grad_norm": 3.1876797676086426, + "learning_rate": 6.639353665131676e-07, + "loss": 0.7706, + "step": 23720 + }, + { + "epoch": 0.3693555543446955, + "grad_norm": 4.417449951171875, + "learning_rate": 6.638534275085625e-07, + "loss": 0.7106, + "step": 23725 + }, + { + "epoch": 0.3694333953466649, + "grad_norm": 3.5318500995635986, + "learning_rate": 6.637714885039576e-07, + "loss": 0.6906, + "step": 23730 + }, + { + "epoch": 0.36951123634863425, + "grad_norm": 6.2017822265625, + "learning_rate": 6.636895494993526e-07, + "loss": 0.8055, + "step": 23735 + }, + { + "epoch": 0.36958907735060365, + "grad_norm": 3.228733539581299, + "learning_rate": 6.636076104947477e-07, + "loss": 0.8764, + "step": 23740 + }, + { + "epoch": 0.36966691835257304, + "grad_norm": 8.708043098449707, + "learning_rate": 6.635256714901427e-07, + "loss": 0.8125, + "step": 23745 + }, + { + "epoch": 0.36974475935454243, + "grad_norm": 3.2916955947875977, + "learning_rate": 6.634437324855377e-07, + "loss": 0.9001, + "step": 23750 + }, + { + "epoch": 0.36982260035651177, + "grad_norm": 4.881289958953857, + "learning_rate": 6.633617934809328e-07, + "loss": 0.6748, + "step": 23755 + }, + { + "epoch": 0.36990044135848116, + "grad_norm": 8.595666885375977, + "learning_rate": 6.632798544763279e-07, + "loss": 0.7027, + "step": 23760 + }, + { + "epoch": 0.36997828236045055, + "grad_norm": 3.132411479949951, + "learning_rate": 6.631979154717227e-07, + "loss": 0.8831, + "step": 23765 + }, + { + "epoch": 0.37005612336241994, + "grad_norm": 3.2658257484436035, + "learning_rate": 6.631159764671178e-07, + "loss": 0.8026, + "step": 23770 + }, + { + "epoch": 0.3701339643643893, + "grad_norm": 6.149971961975098, + "learning_rate": 6.630340374625129e-07, + "loss": 0.7649, + "step": 23775 + }, + { + "epoch": 0.37021180536635867, + "grad_norm": 4.137527942657471, + "learning_rate": 6.629520984579078e-07, + "loss": 0.8157, + "step": 23780 + }, + { + "epoch": 0.37028964636832806, + "grad_norm": 2.82962703704834, + "learning_rate": 6.628701594533029e-07, + "loss": 0.8197, + "step": 23785 + }, + { + "epoch": 0.37036748737029745, + "grad_norm": 6.3977370262146, + "learning_rate": 6.62788220448698e-07, + "loss": 0.6795, + "step": 23790 + }, + { + "epoch": 0.3704453283722668, + "grad_norm": 6.537234306335449, + "learning_rate": 6.627062814440931e-07, + "loss": 0.6928, + "step": 23795 + }, + { + "epoch": 0.3705231693742362, + "grad_norm": 3.5378894805908203, + "learning_rate": 6.62624342439488e-07, + "loss": 0.8536, + "step": 23800 + }, + { + "epoch": 0.3706010103762056, + "grad_norm": 8.844374656677246, + "learning_rate": 6.62542403434883e-07, + "loss": 0.8496, + "step": 23805 + }, + { + "epoch": 0.37067885137817497, + "grad_norm": 5.335997104644775, + "learning_rate": 6.624604644302781e-07, + "loss": 0.8001, + "step": 23810 + }, + { + "epoch": 0.3707566923801443, + "grad_norm": 3.984768867492676, + "learning_rate": 6.62378525425673e-07, + "loss": 0.7219, + "step": 23815 + }, + { + "epoch": 0.3708345333821137, + "grad_norm": 2.9257357120513916, + "learning_rate": 6.622965864210681e-07, + "loss": 0.8097, + "step": 23820 + }, + { + "epoch": 0.3709123743840831, + "grad_norm": 3.574244737625122, + "learning_rate": 6.622146474164632e-07, + "loss": 0.8586, + "step": 23825 + }, + { + "epoch": 0.3709902153860524, + "grad_norm": 2.2178125381469727, + "learning_rate": 6.621327084118582e-07, + "loss": 0.7355, + "step": 23830 + }, + { + "epoch": 0.3710680563880218, + "grad_norm": 3.1225383281707764, + "learning_rate": 6.620507694072532e-07, + "loss": 0.665, + "step": 23835 + }, + { + "epoch": 0.3711458973899912, + "grad_norm": 3.577746868133545, + "learning_rate": 6.619688304026483e-07, + "loss": 0.779, + "step": 23840 + }, + { + "epoch": 0.3712237383919606, + "grad_norm": 6.259574890136719, + "learning_rate": 6.618868913980432e-07, + "loss": 0.7787, + "step": 23845 + }, + { + "epoch": 0.37130157939392994, + "grad_norm": 3.1449718475341797, + "learning_rate": 6.618049523934383e-07, + "loss": 0.7209, + "step": 23850 + }, + { + "epoch": 0.37137942039589933, + "grad_norm": 3.897055149078369, + "learning_rate": 6.617230133888333e-07, + "loss": 0.6962, + "step": 23855 + }, + { + "epoch": 0.3714572613978687, + "grad_norm": 3.856555938720703, + "learning_rate": 6.616410743842283e-07, + "loss": 0.7708, + "step": 23860 + }, + { + "epoch": 0.3715351023998381, + "grad_norm": 5.206638813018799, + "learning_rate": 6.615591353796234e-07, + "loss": 0.7102, + "step": 23865 + }, + { + "epoch": 0.37161294340180745, + "grad_norm": 4.0182414054870605, + "learning_rate": 6.614771963750184e-07, + "loss": 0.7327, + "step": 23870 + }, + { + "epoch": 0.37169078440377684, + "grad_norm": 3.4601380825042725, + "learning_rate": 6.613952573704134e-07, + "loss": 0.7188, + "step": 23875 + }, + { + "epoch": 0.37176862540574623, + "grad_norm": 4.397197723388672, + "learning_rate": 6.613133183658085e-07, + "loss": 0.824, + "step": 23880 + }, + { + "epoch": 0.3718464664077156, + "grad_norm": 3.7970356941223145, + "learning_rate": 6.612313793612036e-07, + "loss": 0.7597, + "step": 23885 + }, + { + "epoch": 0.37192430740968496, + "grad_norm": 4.713395118713379, + "learning_rate": 6.611494403565984e-07, + "loss": 0.6744, + "step": 23890 + }, + { + "epoch": 0.37200214841165435, + "grad_norm": 10.343477249145508, + "learning_rate": 6.610675013519935e-07, + "loss": 0.7873, + "step": 23895 + }, + { + "epoch": 0.37207998941362375, + "grad_norm": 5.199718952178955, + "learning_rate": 6.609855623473886e-07, + "loss": 0.6932, + "step": 23900 + }, + { + "epoch": 0.3721578304155931, + "grad_norm": 8.903950691223145, + "learning_rate": 6.609036233427835e-07, + "loss": 0.7658, + "step": 23905 + }, + { + "epoch": 0.3722356714175625, + "grad_norm": 4.624990940093994, + "learning_rate": 6.608216843381786e-07, + "loss": 0.7684, + "step": 23910 + }, + { + "epoch": 0.37231351241953187, + "grad_norm": 5.178587913513184, + "learning_rate": 6.607397453335737e-07, + "loss": 0.767, + "step": 23915 + }, + { + "epoch": 0.37239135342150126, + "grad_norm": 4.226781368255615, + "learning_rate": 6.606578063289688e-07, + "loss": 0.728, + "step": 23920 + }, + { + "epoch": 0.3724691944234706, + "grad_norm": 6.461855411529541, + "learning_rate": 6.605758673243637e-07, + "loss": 0.7692, + "step": 23925 + }, + { + "epoch": 0.37254703542544, + "grad_norm": 3.171079397201538, + "learning_rate": 6.604939283197587e-07, + "loss": 0.6856, + "step": 23930 + }, + { + "epoch": 0.3726248764274094, + "grad_norm": 4.6631693840026855, + "learning_rate": 6.604119893151538e-07, + "loss": 0.8407, + "step": 23935 + }, + { + "epoch": 0.37270271742937877, + "grad_norm": 6.995047569274902, + "learning_rate": 6.603300503105488e-07, + "loss": 0.7624, + "step": 23940 + }, + { + "epoch": 0.3727805584313481, + "grad_norm": 3.8059585094451904, + "learning_rate": 6.602481113059438e-07, + "loss": 0.7307, + "step": 23945 + }, + { + "epoch": 0.3728583994333175, + "grad_norm": 5.160345554351807, + "learning_rate": 6.601661723013389e-07, + "loss": 0.7879, + "step": 23950 + }, + { + "epoch": 0.3729362404352869, + "grad_norm": 7.343621730804443, + "learning_rate": 6.600842332967339e-07, + "loss": 0.7122, + "step": 23955 + }, + { + "epoch": 0.3730140814372563, + "grad_norm": 5.80457067489624, + "learning_rate": 6.60002294292129e-07, + "loss": 0.8733, + "step": 23960 + }, + { + "epoch": 0.3730919224392256, + "grad_norm": 4.190765857696533, + "learning_rate": 6.59920355287524e-07, + "loss": 0.7712, + "step": 23965 + }, + { + "epoch": 0.373169763441195, + "grad_norm": 2.601067304611206, + "learning_rate": 6.598384162829189e-07, + "loss": 0.632, + "step": 23970 + }, + { + "epoch": 0.3732476044431644, + "grad_norm": 4.3673834800720215, + "learning_rate": 6.59756477278314e-07, + "loss": 0.8203, + "step": 23975 + }, + { + "epoch": 0.3733254454451338, + "grad_norm": 2.9783923625946045, + "learning_rate": 6.59674538273709e-07, + "loss": 0.7736, + "step": 23980 + }, + { + "epoch": 0.37340328644710313, + "grad_norm": 6.698786735534668, + "learning_rate": 6.59592599269104e-07, + "loss": 0.8546, + "step": 23985 + }, + { + "epoch": 0.3734811274490725, + "grad_norm": 3.005664348602295, + "learning_rate": 6.595106602644991e-07, + "loss": 0.7989, + "step": 23990 + }, + { + "epoch": 0.3735589684510419, + "grad_norm": 3.8986668586730957, + "learning_rate": 6.594287212598942e-07, + "loss": 0.6315, + "step": 23995 + }, + { + "epoch": 0.37363680945301125, + "grad_norm": 4.2483062744140625, + "learning_rate": 6.593467822552891e-07, + "loss": 0.8664, + "step": 24000 + }, + { + "epoch": 0.37371465045498065, + "grad_norm": 2.8898262977600098, + "learning_rate": 6.592648432506842e-07, + "loss": 0.6167, + "step": 24005 + }, + { + "epoch": 0.37379249145695004, + "grad_norm": 2.7583794593811035, + "learning_rate": 6.591829042460792e-07, + "loss": 0.8595, + "step": 24010 + }, + { + "epoch": 0.37387033245891943, + "grad_norm": 3.2141683101654053, + "learning_rate": 6.591009652414741e-07, + "loss": 0.8204, + "step": 24015 + }, + { + "epoch": 0.37394817346088877, + "grad_norm": 4.640387535095215, + "learning_rate": 6.590190262368692e-07, + "loss": 0.8476, + "step": 24020 + }, + { + "epoch": 0.37402601446285816, + "grad_norm": 8.061158180236816, + "learning_rate": 6.589370872322643e-07, + "loss": 0.6599, + "step": 24025 + }, + { + "epoch": 0.37410385546482755, + "grad_norm": 4.347311973571777, + "learning_rate": 6.588551482276593e-07, + "loss": 0.7904, + "step": 24030 + }, + { + "epoch": 0.37418169646679694, + "grad_norm": 4.115234375, + "learning_rate": 6.587732092230543e-07, + "loss": 0.8568, + "step": 24035 + }, + { + "epoch": 0.3742595374687663, + "grad_norm": 5.665265083312988, + "learning_rate": 6.586912702184494e-07, + "loss": 0.795, + "step": 24040 + }, + { + "epoch": 0.37433737847073567, + "grad_norm": 6.1077423095703125, + "learning_rate": 6.586093312138445e-07, + "loss": 0.7835, + "step": 24045 + }, + { + "epoch": 0.37441521947270506, + "grad_norm": 6.675379276275635, + "learning_rate": 6.585273922092393e-07, + "loss": 0.7311, + "step": 24050 + }, + { + "epoch": 0.37449306047467446, + "grad_norm": 3.762505054473877, + "learning_rate": 6.584454532046344e-07, + "loss": 0.6828, + "step": 24055 + }, + { + "epoch": 0.3745709014766438, + "grad_norm": 7.7305684089660645, + "learning_rate": 6.583635142000295e-07, + "loss": 0.729, + "step": 24060 + }, + { + "epoch": 0.3746487424786132, + "grad_norm": 4.176987171173096, + "learning_rate": 6.582815751954245e-07, + "loss": 0.7009, + "step": 24065 + }, + { + "epoch": 0.3747265834805826, + "grad_norm": 3.813732385635376, + "learning_rate": 6.581996361908195e-07, + "loss": 0.8001, + "step": 24070 + }, + { + "epoch": 0.3748044244825519, + "grad_norm": 5.6425604820251465, + "learning_rate": 6.581176971862146e-07, + "loss": 0.7634, + "step": 24075 + }, + { + "epoch": 0.3748822654845213, + "grad_norm": 4.143874168395996, + "learning_rate": 6.580357581816096e-07, + "loss": 0.7091, + "step": 24080 + }, + { + "epoch": 0.3749601064864907, + "grad_norm": 2.3728559017181396, + "learning_rate": 6.579538191770047e-07, + "loss": 0.7337, + "step": 24085 + }, + { + "epoch": 0.3750379474884601, + "grad_norm": 3.5923044681549072, + "learning_rate": 6.578718801723996e-07, + "loss": 0.6422, + "step": 24090 + }, + { + "epoch": 0.3751157884904294, + "grad_norm": 3.443315029144287, + "learning_rate": 6.577899411677946e-07, + "loss": 0.6181, + "step": 24095 + }, + { + "epoch": 0.3751936294923988, + "grad_norm": 5.803593635559082, + "learning_rate": 6.577080021631897e-07, + "loss": 0.6687, + "step": 24100 + }, + { + "epoch": 0.3752714704943682, + "grad_norm": 3.978525400161743, + "learning_rate": 6.576260631585847e-07, + "loss": 0.6865, + "step": 24105 + }, + { + "epoch": 0.3753493114963376, + "grad_norm": 3.4662880897521973, + "learning_rate": 6.575441241539797e-07, + "loss": 0.7028, + "step": 24110 + }, + { + "epoch": 0.37542715249830694, + "grad_norm": 3.669604539871216, + "learning_rate": 6.574621851493748e-07, + "loss": 0.7207, + "step": 24115 + }, + { + "epoch": 0.37550499350027633, + "grad_norm": 10.295849800109863, + "learning_rate": 6.573802461447699e-07, + "loss": 0.7618, + "step": 24120 + }, + { + "epoch": 0.3755828345022457, + "grad_norm": 3.143021821975708, + "learning_rate": 6.572983071401648e-07, + "loss": 0.6281, + "step": 24125 + }, + { + "epoch": 0.3756606755042151, + "grad_norm": 5.273399353027344, + "learning_rate": 6.572163681355598e-07, + "loss": 0.7412, + "step": 24130 + }, + { + "epoch": 0.37573851650618445, + "grad_norm": 5.699292182922363, + "learning_rate": 6.571344291309549e-07, + "loss": 0.6383, + "step": 24135 + }, + { + "epoch": 0.37581635750815384, + "grad_norm": 3.716749429702759, + "learning_rate": 6.570524901263498e-07, + "loss": 0.8457, + "step": 24140 + }, + { + "epoch": 0.37589419851012323, + "grad_norm": 4.004367351531982, + "learning_rate": 6.569705511217449e-07, + "loss": 0.7126, + "step": 24145 + }, + { + "epoch": 0.3759720395120926, + "grad_norm": 3.647512435913086, + "learning_rate": 6.5688861211714e-07, + "loss": 0.8438, + "step": 24150 + }, + { + "epoch": 0.37604988051406196, + "grad_norm": 2.8271801471710205, + "learning_rate": 6.56806673112535e-07, + "loss": 0.7476, + "step": 24155 + }, + { + "epoch": 0.37612772151603135, + "grad_norm": 5.783123016357422, + "learning_rate": 6.5672473410793e-07, + "loss": 0.8312, + "step": 24160 + }, + { + "epoch": 0.37620556251800075, + "grad_norm": 3.1801908016204834, + "learning_rate": 6.566427951033251e-07, + "loss": 0.8276, + "step": 24165 + }, + { + "epoch": 0.3762834035199701, + "grad_norm": 3.8137319087982178, + "learning_rate": 6.565608560987201e-07, + "loss": 0.6949, + "step": 24170 + }, + { + "epoch": 0.3763612445219395, + "grad_norm": 6.042738437652588, + "learning_rate": 6.56478917094115e-07, + "loss": 0.799, + "step": 24175 + }, + { + "epoch": 0.37643908552390887, + "grad_norm": 2.819098472595215, + "learning_rate": 6.563969780895101e-07, + "loss": 0.7652, + "step": 24180 + }, + { + "epoch": 0.37651692652587826, + "grad_norm": 6.101853370666504, + "learning_rate": 6.563150390849052e-07, + "loss": 0.7183, + "step": 24185 + }, + { + "epoch": 0.3765947675278476, + "grad_norm": 3.3164820671081543, + "learning_rate": 6.562331000803002e-07, + "loss": 0.787, + "step": 24190 + }, + { + "epoch": 0.376672608529817, + "grad_norm": 3.9163126945495605, + "learning_rate": 6.561511610756952e-07, + "loss": 0.7803, + "step": 24195 + }, + { + "epoch": 0.3767504495317864, + "grad_norm": 3.7802116870880127, + "learning_rate": 6.560692220710903e-07, + "loss": 0.718, + "step": 24200 + }, + { + "epoch": 0.37682829053375577, + "grad_norm": 3.106133460998535, + "learning_rate": 6.559872830664853e-07, + "loss": 0.7347, + "step": 24205 + }, + { + "epoch": 0.3769061315357251, + "grad_norm": 7.895877361297607, + "learning_rate": 6.559053440618804e-07, + "loss": 0.7739, + "step": 24210 + }, + { + "epoch": 0.3769839725376945, + "grad_norm": 3.9047114849090576, + "learning_rate": 6.558234050572753e-07, + "loss": 0.8125, + "step": 24215 + }, + { + "epoch": 0.3770618135396639, + "grad_norm": 3.2611491680145264, + "learning_rate": 6.557414660526703e-07, + "loss": 0.7705, + "step": 24220 + }, + { + "epoch": 0.3771396545416333, + "grad_norm": 3.606032609939575, + "learning_rate": 6.556595270480654e-07, + "loss": 0.7115, + "step": 24225 + }, + { + "epoch": 0.3772174955436026, + "grad_norm": 3.7341561317443848, + "learning_rate": 6.555775880434604e-07, + "loss": 0.7128, + "step": 24230 + }, + { + "epoch": 0.377295336545572, + "grad_norm": 3.9536352157592773, + "learning_rate": 6.554956490388554e-07, + "loss": 0.7125, + "step": 24235 + }, + { + "epoch": 0.3773731775475414, + "grad_norm": 3.7920241355895996, + "learning_rate": 6.554137100342505e-07, + "loss": 0.783, + "step": 24240 + }, + { + "epoch": 0.37745101854951074, + "grad_norm": 4.251092433929443, + "learning_rate": 6.553317710296456e-07, + "loss": 0.7633, + "step": 24245 + }, + { + "epoch": 0.37752885955148013, + "grad_norm": 3.8011977672576904, + "learning_rate": 6.552498320250405e-07, + "loss": 0.7667, + "step": 24250 + }, + { + "epoch": 0.3776067005534495, + "grad_norm": 2.624079704284668, + "learning_rate": 6.551678930204355e-07, + "loss": 0.6509, + "step": 24255 + }, + { + "epoch": 0.3776845415554189, + "grad_norm": 4.23248291015625, + "learning_rate": 6.550859540158306e-07, + "loss": 0.7741, + "step": 24260 + }, + { + "epoch": 0.37776238255738825, + "grad_norm": 2.9144062995910645, + "learning_rate": 6.550040150112256e-07, + "loss": 0.8159, + "step": 24265 + }, + { + "epoch": 0.37784022355935765, + "grad_norm": 2.4228739738464355, + "learning_rate": 6.549220760066206e-07, + "loss": 0.7285, + "step": 24270 + }, + { + "epoch": 0.37791806456132704, + "grad_norm": 2.9209840297698975, + "learning_rate": 6.548401370020157e-07, + "loss": 0.6249, + "step": 24275 + }, + { + "epoch": 0.37799590556329643, + "grad_norm": 3.9086556434631348, + "learning_rate": 6.547581979974107e-07, + "loss": 0.69, + "step": 24280 + }, + { + "epoch": 0.37807374656526577, + "grad_norm": 3.2286627292633057, + "learning_rate": 6.546762589928057e-07, + "loss": 0.8162, + "step": 24285 + }, + { + "epoch": 0.37815158756723516, + "grad_norm": 3.5103039741516113, + "learning_rate": 6.545943199882008e-07, + "loss": 0.6934, + "step": 24290 + }, + { + "epoch": 0.37822942856920455, + "grad_norm": 4.203784465789795, + "learning_rate": 6.545123809835958e-07, + "loss": 0.7594, + "step": 24295 + }, + { + "epoch": 0.37830726957117394, + "grad_norm": 3.7477505207061768, + "learning_rate": 6.544304419789908e-07, + "loss": 0.7079, + "step": 24300 + }, + { + "epoch": 0.3783851105731433, + "grad_norm": 3.812622308731079, + "learning_rate": 6.543485029743858e-07, + "loss": 0.7349, + "step": 24305 + }, + { + "epoch": 0.37846295157511267, + "grad_norm": 3.558681011199951, + "learning_rate": 6.542665639697809e-07, + "loss": 0.7898, + "step": 24310 + }, + { + "epoch": 0.37854079257708206, + "grad_norm": 6.393659591674805, + "learning_rate": 6.541846249651759e-07, + "loss": 0.6253, + "step": 24315 + }, + { + "epoch": 0.37861863357905146, + "grad_norm": 2.723544120788574, + "learning_rate": 6.54102685960571e-07, + "loss": 0.7685, + "step": 24320 + }, + { + "epoch": 0.3786964745810208, + "grad_norm": 6.085930347442627, + "learning_rate": 6.54020746955966e-07, + "loss": 0.8936, + "step": 24325 + }, + { + "epoch": 0.3787743155829902, + "grad_norm": 3.1237025260925293, + "learning_rate": 6.53938807951361e-07, + "loss": 0.679, + "step": 24330 + }, + { + "epoch": 0.3788521565849596, + "grad_norm": 5.606638431549072, + "learning_rate": 6.53856868946756e-07, + "loss": 0.8498, + "step": 24335 + }, + { + "epoch": 0.3789299975869289, + "grad_norm": 4.310702323913574, + "learning_rate": 6.53774929942151e-07, + "loss": 0.836, + "step": 24340 + }, + { + "epoch": 0.3790078385888983, + "grad_norm": 4.304116725921631, + "learning_rate": 6.53692990937546e-07, + "loss": 0.8469, + "step": 24345 + }, + { + "epoch": 0.3790856795908677, + "grad_norm": 3.8278419971466064, + "learning_rate": 6.536110519329411e-07, + "loss": 0.8109, + "step": 24350 + }, + { + "epoch": 0.3791635205928371, + "grad_norm": 4.090198516845703, + "learning_rate": 6.535291129283362e-07, + "loss": 0.8282, + "step": 24355 + }, + { + "epoch": 0.3792413615948064, + "grad_norm": 4.38947057723999, + "learning_rate": 6.534471739237311e-07, + "loss": 0.8153, + "step": 24360 + }, + { + "epoch": 0.3793192025967758, + "grad_norm": 2.918142795562744, + "learning_rate": 6.533652349191262e-07, + "loss": 0.744, + "step": 24365 + }, + { + "epoch": 0.3793970435987452, + "grad_norm": 3.9180519580841064, + "learning_rate": 6.532832959145213e-07, + "loss": 0.8869, + "step": 24370 + }, + { + "epoch": 0.3794748846007146, + "grad_norm": 2.603149890899658, + "learning_rate": 6.532013569099161e-07, + "loss": 0.8022, + "step": 24375 + }, + { + "epoch": 0.37955272560268394, + "grad_norm": 4.62812614440918, + "learning_rate": 6.531194179053112e-07, + "loss": 0.7867, + "step": 24380 + }, + { + "epoch": 0.37963056660465333, + "grad_norm": 3.68599009513855, + "learning_rate": 6.530374789007063e-07, + "loss": 0.8338, + "step": 24385 + }, + { + "epoch": 0.3797084076066227, + "grad_norm": 5.189111709594727, + "learning_rate": 6.529555398961013e-07, + "loss": 0.8457, + "step": 24390 + }, + { + "epoch": 0.3797862486085921, + "grad_norm": 3.806553840637207, + "learning_rate": 6.528736008914963e-07, + "loss": 0.7669, + "step": 24395 + }, + { + "epoch": 0.37986408961056145, + "grad_norm": 5.229100227355957, + "learning_rate": 6.527916618868914e-07, + "loss": 0.7026, + "step": 24400 + }, + { + "epoch": 0.37994193061253084, + "grad_norm": 3.422163248062134, + "learning_rate": 6.527097228822864e-07, + "loss": 0.7962, + "step": 24405 + }, + { + "epoch": 0.38001977161450023, + "grad_norm": 5.487661361694336, + "learning_rate": 6.526277838776815e-07, + "loss": 0.8384, + "step": 24410 + }, + { + "epoch": 0.3800976126164696, + "grad_norm": 3.975858449935913, + "learning_rate": 6.525458448730764e-07, + "loss": 0.7887, + "step": 24415 + }, + { + "epoch": 0.38017545361843896, + "grad_norm": 4.209732532501221, + "learning_rate": 6.524639058684715e-07, + "loss": 0.874, + "step": 24420 + }, + { + "epoch": 0.38025329462040836, + "grad_norm": 3.549058198928833, + "learning_rate": 6.523819668638665e-07, + "loss": 0.6626, + "step": 24425 + }, + { + "epoch": 0.38033113562237775, + "grad_norm": 3.021886110305786, + "learning_rate": 6.523000278592615e-07, + "loss": 0.7263, + "step": 24430 + }, + { + "epoch": 0.3804089766243471, + "grad_norm": 2.7419252395629883, + "learning_rate": 6.522180888546566e-07, + "loss": 0.8412, + "step": 24435 + }, + { + "epoch": 0.3804868176263165, + "grad_norm": 3.5946810245513916, + "learning_rate": 6.521361498500516e-07, + "loss": 0.7614, + "step": 24440 + }, + { + "epoch": 0.38056465862828587, + "grad_norm": 4.112155914306641, + "learning_rate": 6.520542108454467e-07, + "loss": 0.6456, + "step": 24445 + }, + { + "epoch": 0.38064249963025526, + "grad_norm": 2.8762810230255127, + "learning_rate": 6.519722718408417e-07, + "loss": 0.7133, + "step": 24450 + }, + { + "epoch": 0.3807203406322246, + "grad_norm": 3.534217119216919, + "learning_rate": 6.518903328362366e-07, + "loss": 0.8656, + "step": 24455 + }, + { + "epoch": 0.380798181634194, + "grad_norm": 3.1278042793273926, + "learning_rate": 6.518083938316317e-07, + "loss": 0.7489, + "step": 24460 + }, + { + "epoch": 0.3808760226361634, + "grad_norm": 3.2034077644348145, + "learning_rate": 6.517264548270267e-07, + "loss": 0.8122, + "step": 24465 + }, + { + "epoch": 0.3809538636381328, + "grad_norm": 4.331535339355469, + "learning_rate": 6.516445158224217e-07, + "loss": 0.6917, + "step": 24470 + }, + { + "epoch": 0.3810317046401021, + "grad_norm": 3.5739593505859375, + "learning_rate": 6.515625768178168e-07, + "loss": 0.7332, + "step": 24475 + }, + { + "epoch": 0.3811095456420715, + "grad_norm": 5.469711780548096, + "learning_rate": 6.514806378132119e-07, + "loss": 0.7239, + "step": 24480 + }, + { + "epoch": 0.3811873866440409, + "grad_norm": 4.376361846923828, + "learning_rate": 6.513986988086068e-07, + "loss": 0.8151, + "step": 24485 + }, + { + "epoch": 0.3812652276460103, + "grad_norm": 13.4918794631958, + "learning_rate": 6.513167598040019e-07, + "loss": 0.7069, + "step": 24490 + }, + { + "epoch": 0.3813430686479796, + "grad_norm": 3.0612945556640625, + "learning_rate": 6.512348207993969e-07, + "loss": 0.7912, + "step": 24495 + }, + { + "epoch": 0.381420909649949, + "grad_norm": 3.844869613647461, + "learning_rate": 6.511528817947918e-07, + "loss": 0.7319, + "step": 24500 + }, + { + "epoch": 0.3814987506519184, + "grad_norm": 3.146636724472046, + "learning_rate": 6.510709427901869e-07, + "loss": 0.7234, + "step": 24505 + }, + { + "epoch": 0.38157659165388774, + "grad_norm": 3.0788705348968506, + "learning_rate": 6.50989003785582e-07, + "loss": 0.9002, + "step": 24510 + }, + { + "epoch": 0.38165443265585713, + "grad_norm": 5.646673679351807, + "learning_rate": 6.50907064780977e-07, + "loss": 0.7844, + "step": 24515 + }, + { + "epoch": 0.3817322736578265, + "grad_norm": 2.48629093170166, + "learning_rate": 6.50825125776372e-07, + "loss": 0.7244, + "step": 24520 + }, + { + "epoch": 0.3818101146597959, + "grad_norm": 4.52423620223999, + "learning_rate": 6.507431867717671e-07, + "loss": 0.7823, + "step": 24525 + }, + { + "epoch": 0.38188795566176525, + "grad_norm": 3.8551182746887207, + "learning_rate": 6.506612477671622e-07, + "loss": 0.7537, + "step": 24530 + }, + { + "epoch": 0.38196579666373465, + "grad_norm": 5.3387770652771, + "learning_rate": 6.505793087625572e-07, + "loss": 0.6455, + "step": 24535 + }, + { + "epoch": 0.38204363766570404, + "grad_norm": 3.0548503398895264, + "learning_rate": 6.504973697579521e-07, + "loss": 0.7108, + "step": 24540 + }, + { + "epoch": 0.38212147866767343, + "grad_norm": 4.870777130126953, + "learning_rate": 6.504154307533472e-07, + "loss": 0.7893, + "step": 24545 + }, + { + "epoch": 0.38219931966964277, + "grad_norm": 3.0125315189361572, + "learning_rate": 6.503334917487422e-07, + "loss": 0.7098, + "step": 24550 + }, + { + "epoch": 0.38227716067161216, + "grad_norm": 3.0246524810791016, + "learning_rate": 6.502515527441372e-07, + "loss": 0.7185, + "step": 24555 + }, + { + "epoch": 0.38235500167358155, + "grad_norm": 2.8637657165527344, + "learning_rate": 6.501696137395323e-07, + "loss": 0.7185, + "step": 24560 + }, + { + "epoch": 0.38243284267555094, + "grad_norm": 2.6888625621795654, + "learning_rate": 6.500876747349273e-07, + "loss": 0.8275, + "step": 24565 + }, + { + "epoch": 0.3825106836775203, + "grad_norm": 3.1520016193389893, + "learning_rate": 6.500057357303224e-07, + "loss": 0.9101, + "step": 24570 + }, + { + "epoch": 0.38258852467948967, + "grad_norm": 6.681828498840332, + "learning_rate": 6.499237967257174e-07, + "loss": 0.8141, + "step": 24575 + }, + { + "epoch": 0.38266636568145906, + "grad_norm": 3.728142738342285, + "learning_rate": 6.498418577211123e-07, + "loss": 0.7488, + "step": 24580 + }, + { + "epoch": 0.38274420668342846, + "grad_norm": 3.855109214782715, + "learning_rate": 6.497599187165074e-07, + "loss": 0.6411, + "step": 24585 + }, + { + "epoch": 0.3828220476853978, + "grad_norm": 3.837766170501709, + "learning_rate": 6.496779797119025e-07, + "loss": 0.7441, + "step": 24590 + }, + { + "epoch": 0.3828998886873672, + "grad_norm": 3.5934677124023438, + "learning_rate": 6.495960407072974e-07, + "loss": 0.6883, + "step": 24595 + }, + { + "epoch": 0.3829777296893366, + "grad_norm": 4.515072345733643, + "learning_rate": 6.495141017026925e-07, + "loss": 0.7892, + "step": 24600 + }, + { + "epoch": 0.3830555706913059, + "grad_norm": 2.74763560295105, + "learning_rate": 6.494321626980876e-07, + "loss": 0.7184, + "step": 24605 + }, + { + "epoch": 0.3831334116932753, + "grad_norm": 5.450028896331787, + "learning_rate": 6.493502236934825e-07, + "loss": 0.9551, + "step": 24610 + }, + { + "epoch": 0.3832112526952447, + "grad_norm": 3.213296413421631, + "learning_rate": 6.492682846888776e-07, + "loss": 0.8129, + "step": 24615 + }, + { + "epoch": 0.3832890936972141, + "grad_norm": 4.963546276092529, + "learning_rate": 6.491863456842726e-07, + "loss": 0.6857, + "step": 24620 + }, + { + "epoch": 0.3833669346991834, + "grad_norm": 3.2613205909729004, + "learning_rate": 6.491044066796676e-07, + "loss": 0.6915, + "step": 24625 + }, + { + "epoch": 0.3834447757011528, + "grad_norm": 6.735360622406006, + "learning_rate": 6.490224676750626e-07, + "loss": 0.7666, + "step": 24630 + }, + { + "epoch": 0.3835226167031222, + "grad_norm": 2.654822587966919, + "learning_rate": 6.489405286704577e-07, + "loss": 0.6861, + "step": 24635 + }, + { + "epoch": 0.3836004577050916, + "grad_norm": 4.433813095092773, + "learning_rate": 6.488585896658527e-07, + "loss": 0.7513, + "step": 24640 + }, + { + "epoch": 0.38367829870706094, + "grad_norm": 5.000208377838135, + "learning_rate": 6.487766506612477e-07, + "loss": 0.9181, + "step": 24645 + }, + { + "epoch": 0.38375613970903033, + "grad_norm": 3.075925827026367, + "learning_rate": 6.486947116566428e-07, + "loss": 0.7587, + "step": 24650 + }, + { + "epoch": 0.3838339807109997, + "grad_norm": 3.4058053493499756, + "learning_rate": 6.486127726520379e-07, + "loss": 0.8517, + "step": 24655 + }, + { + "epoch": 0.3839118217129691, + "grad_norm": 3.323262929916382, + "learning_rate": 6.485308336474328e-07, + "loss": 0.7488, + "step": 24660 + }, + { + "epoch": 0.38398966271493845, + "grad_norm": 3.4578330516815186, + "learning_rate": 6.484488946428278e-07, + "loss": 0.8329, + "step": 24665 + }, + { + "epoch": 0.38406750371690784, + "grad_norm": 4.2953643798828125, + "learning_rate": 6.483669556382229e-07, + "loss": 0.8123, + "step": 24670 + }, + { + "epoch": 0.38414534471887724, + "grad_norm": 3.698613166809082, + "learning_rate": 6.482850166336179e-07, + "loss": 0.7279, + "step": 24675 + }, + { + "epoch": 0.38422318572084657, + "grad_norm": 6.446969985961914, + "learning_rate": 6.48203077629013e-07, + "loss": 0.741, + "step": 24680 + }, + { + "epoch": 0.38430102672281596, + "grad_norm": 3.7218122482299805, + "learning_rate": 6.48121138624408e-07, + "loss": 0.6806, + "step": 24685 + }, + { + "epoch": 0.38437886772478536, + "grad_norm": 3.6571853160858154, + "learning_rate": 6.48039199619803e-07, + "loss": 0.7325, + "step": 24690 + }, + { + "epoch": 0.38445670872675475, + "grad_norm": 3.122586727142334, + "learning_rate": 6.479572606151981e-07, + "loss": 0.7157, + "step": 24695 + }, + { + "epoch": 0.3845345497287241, + "grad_norm": 3.1738834381103516, + "learning_rate": 6.47875321610593e-07, + "loss": 0.7394, + "step": 24700 + }, + { + "epoch": 0.3846123907306935, + "grad_norm": 3.8704867362976074, + "learning_rate": 6.47793382605988e-07, + "loss": 0.7922, + "step": 24705 + }, + { + "epoch": 0.38469023173266287, + "grad_norm": 4.960922718048096, + "learning_rate": 6.477114436013831e-07, + "loss": 0.7774, + "step": 24710 + }, + { + "epoch": 0.38476807273463226, + "grad_norm": 9.464720726013184, + "learning_rate": 6.476295045967782e-07, + "loss": 0.7547, + "step": 24715 + }, + { + "epoch": 0.3848459137366016, + "grad_norm": 3.610100269317627, + "learning_rate": 6.475475655921731e-07, + "loss": 0.6677, + "step": 24720 + }, + { + "epoch": 0.384923754738571, + "grad_norm": 3.670114278793335, + "learning_rate": 6.474656265875682e-07, + "loss": 0.7666, + "step": 24725 + }, + { + "epoch": 0.3850015957405404, + "grad_norm": 4.544345855712891, + "learning_rate": 6.473836875829633e-07, + "loss": 0.6886, + "step": 24730 + }, + { + "epoch": 0.3850794367425098, + "grad_norm": 2.7942495346069336, + "learning_rate": 6.473017485783582e-07, + "loss": 0.6796, + "step": 24735 + }, + { + "epoch": 0.3851572777444791, + "grad_norm": 3.126971960067749, + "learning_rate": 6.472198095737532e-07, + "loss": 0.7763, + "step": 24740 + }, + { + "epoch": 0.3852351187464485, + "grad_norm": 5.752474308013916, + "learning_rate": 6.471378705691483e-07, + "loss": 0.8314, + "step": 24745 + }, + { + "epoch": 0.3853129597484179, + "grad_norm": 3.342573642730713, + "learning_rate": 6.470559315645433e-07, + "loss": 0.7669, + "step": 24750 + }, + { + "epoch": 0.3853908007503873, + "grad_norm": 2.660649061203003, + "learning_rate": 6.469739925599383e-07, + "loss": 0.7613, + "step": 24755 + }, + { + "epoch": 0.3854686417523566, + "grad_norm": 3.439711332321167, + "learning_rate": 6.468920535553334e-07, + "loss": 0.8655, + "step": 24760 + }, + { + "epoch": 0.385546482754326, + "grad_norm": 2.6334118843078613, + "learning_rate": 6.468101145507284e-07, + "loss": 0.7644, + "step": 24765 + }, + { + "epoch": 0.3856243237562954, + "grad_norm": 4.374823093414307, + "learning_rate": 6.467281755461235e-07, + "loss": 0.7128, + "step": 24770 + }, + { + "epoch": 0.38570216475826474, + "grad_norm": 5.410842418670654, + "learning_rate": 6.466462365415185e-07, + "loss": 0.8169, + "step": 24775 + }, + { + "epoch": 0.38578000576023413, + "grad_norm": 3.2832460403442383, + "learning_rate": 6.465642975369134e-07, + "loss": 0.9047, + "step": 24780 + }, + { + "epoch": 0.3858578467622035, + "grad_norm": 3.3888192176818848, + "learning_rate": 6.464823585323085e-07, + "loss": 0.8511, + "step": 24785 + }, + { + "epoch": 0.3859356877641729, + "grad_norm": 3.015672206878662, + "learning_rate": 6.464004195277035e-07, + "loss": 0.7058, + "step": 24790 + }, + { + "epoch": 0.38601352876614226, + "grad_norm": 2.8976247310638428, + "learning_rate": 6.463184805230986e-07, + "loss": 0.5977, + "step": 24795 + }, + { + "epoch": 0.38609136976811165, + "grad_norm": 4.574088096618652, + "learning_rate": 6.462365415184936e-07, + "loss": 0.7404, + "step": 24800 + }, + { + "epoch": 0.38616921077008104, + "grad_norm": 3.3250582218170166, + "learning_rate": 6.461546025138887e-07, + "loss": 0.6752, + "step": 24805 + }, + { + "epoch": 0.38624705177205043, + "grad_norm": 3.7079975605010986, + "learning_rate": 6.460726635092837e-07, + "loss": 0.7038, + "step": 24810 + }, + { + "epoch": 0.38632489277401977, + "grad_norm": 4.7266340255737305, + "learning_rate": 6.459907245046787e-07, + "loss": 0.6684, + "step": 24815 + }, + { + "epoch": 0.38640273377598916, + "grad_norm": 6.4636335372924805, + "learning_rate": 6.459087855000737e-07, + "loss": 0.7269, + "step": 24820 + }, + { + "epoch": 0.38648057477795855, + "grad_norm": 3.2442867755889893, + "learning_rate": 6.458268464954687e-07, + "loss": 0.8114, + "step": 24825 + }, + { + "epoch": 0.38655841577992794, + "grad_norm": 3.611947774887085, + "learning_rate": 6.457449074908637e-07, + "loss": 0.8077, + "step": 24830 + }, + { + "epoch": 0.3866362567818973, + "grad_norm": 3.2939252853393555, + "learning_rate": 6.456629684862588e-07, + "loss": 0.7871, + "step": 24835 + }, + { + "epoch": 0.3867140977838667, + "grad_norm": 6.327769756317139, + "learning_rate": 6.455810294816539e-07, + "loss": 0.7539, + "step": 24840 + }, + { + "epoch": 0.38679193878583606, + "grad_norm": 3.1518681049346924, + "learning_rate": 6.454990904770488e-07, + "loss": 0.8088, + "step": 24845 + }, + { + "epoch": 0.3868697797878054, + "grad_norm": 3.270970106124878, + "learning_rate": 6.454171514724439e-07, + "loss": 0.7782, + "step": 24850 + }, + { + "epoch": 0.3869476207897748, + "grad_norm": 2.967019557952881, + "learning_rate": 6.45335212467839e-07, + "loss": 0.8034, + "step": 24855 + }, + { + "epoch": 0.3870254617917442, + "grad_norm": 3.985086679458618, + "learning_rate": 6.45253273463234e-07, + "loss": 0.6594, + "step": 24860 + }, + { + "epoch": 0.3871033027937136, + "grad_norm": 4.853755950927734, + "learning_rate": 6.451713344586289e-07, + "loss": 0.752, + "step": 24865 + }, + { + "epoch": 0.3871811437956829, + "grad_norm": 4.004329681396484, + "learning_rate": 6.45089395454024e-07, + "loss": 0.8231, + "step": 24870 + }, + { + "epoch": 0.3872589847976523, + "grad_norm": 5.640361309051514, + "learning_rate": 6.45007456449419e-07, + "loss": 0.7791, + "step": 24875 + }, + { + "epoch": 0.3873368257996217, + "grad_norm": 4.084892749786377, + "learning_rate": 6.44925517444814e-07, + "loss": 0.7546, + "step": 24880 + }, + { + "epoch": 0.3874146668015911, + "grad_norm": 4.119089603424072, + "learning_rate": 6.448435784402091e-07, + "loss": 0.6331, + "step": 24885 + }, + { + "epoch": 0.3874925078035604, + "grad_norm": 2.9183003902435303, + "learning_rate": 6.447616394356041e-07, + "loss": 0.8319, + "step": 24890 + }, + { + "epoch": 0.3875703488055298, + "grad_norm": 6.5224432945251465, + "learning_rate": 6.446797004309992e-07, + "loss": 0.8136, + "step": 24895 + }, + { + "epoch": 0.3876481898074992, + "grad_norm": 3.6733174324035645, + "learning_rate": 6.445977614263942e-07, + "loss": 0.8367, + "step": 24900 + }, + { + "epoch": 0.3877260308094686, + "grad_norm": 3.2794108390808105, + "learning_rate": 6.445158224217891e-07, + "loss": 0.8072, + "step": 24905 + }, + { + "epoch": 0.38780387181143794, + "grad_norm": 2.9346797466278076, + "learning_rate": 6.444338834171842e-07, + "loss": 0.7321, + "step": 24910 + }, + { + "epoch": 0.38788171281340733, + "grad_norm": 6.637734889984131, + "learning_rate": 6.443519444125792e-07, + "loss": 0.7524, + "step": 24915 + }, + { + "epoch": 0.3879595538153767, + "grad_norm": 4.200438499450684, + "learning_rate": 6.442700054079743e-07, + "loss": 0.6879, + "step": 24920 + }, + { + "epoch": 0.3880373948173461, + "grad_norm": 4.099344730377197, + "learning_rate": 6.441880664033693e-07, + "loss": 0.8778, + "step": 24925 + }, + { + "epoch": 0.38811523581931545, + "grad_norm": 4.251168251037598, + "learning_rate": 6.441061273987644e-07, + "loss": 0.8255, + "step": 24930 + }, + { + "epoch": 0.38819307682128484, + "grad_norm": 3.3429462909698486, + "learning_rate": 6.440241883941594e-07, + "loss": 0.8582, + "step": 24935 + }, + { + "epoch": 0.38827091782325424, + "grad_norm": 2.959158182144165, + "learning_rate": 6.439422493895544e-07, + "loss": 0.8398, + "step": 24940 + }, + { + "epoch": 0.3883487588252236, + "grad_norm": 3.3628015518188477, + "learning_rate": 6.438603103849494e-07, + "loss": 0.7873, + "step": 24945 + }, + { + "epoch": 0.38842659982719296, + "grad_norm": 5.4761176109313965, + "learning_rate": 6.437783713803445e-07, + "loss": 0.716, + "step": 24950 + }, + { + "epoch": 0.38850444082916236, + "grad_norm": 3.494793176651001, + "learning_rate": 6.436964323757394e-07, + "loss": 0.7331, + "step": 24955 + }, + { + "epoch": 0.38858228183113175, + "grad_norm": 2.821584701538086, + "learning_rate": 6.436144933711345e-07, + "loss": 0.6953, + "step": 24960 + }, + { + "epoch": 0.3886601228331011, + "grad_norm": 3.9607622623443604, + "learning_rate": 6.435325543665296e-07, + "loss": 0.8024, + "step": 24965 + }, + { + "epoch": 0.3887379638350705, + "grad_norm": 4.245300769805908, + "learning_rate": 6.434506153619245e-07, + "loss": 0.6974, + "step": 24970 + }, + { + "epoch": 0.38881580483703987, + "grad_norm": 4.189142227172852, + "learning_rate": 6.433686763573196e-07, + "loss": 0.6792, + "step": 24975 + }, + { + "epoch": 0.38889364583900926, + "grad_norm": 5.561572074890137, + "learning_rate": 6.432867373527147e-07, + "loss": 0.7351, + "step": 24980 + }, + { + "epoch": 0.3889714868409786, + "grad_norm": 4.93323278427124, + "learning_rate": 6.432047983481096e-07, + "loss": 0.8008, + "step": 24985 + }, + { + "epoch": 0.389049327842948, + "grad_norm": 3.7661235332489014, + "learning_rate": 6.431228593435046e-07, + "loss": 0.768, + "step": 24990 + }, + { + "epoch": 0.3891271688449174, + "grad_norm": 9.159685134887695, + "learning_rate": 6.430409203388997e-07, + "loss": 0.8869, + "step": 24995 + }, + { + "epoch": 0.3892050098468868, + "grad_norm": 4.195098876953125, + "learning_rate": 6.429589813342947e-07, + "loss": 0.8319, + "step": 25000 + }, + { + "epoch": 0.3892828508488561, + "grad_norm": 3.203718662261963, + "learning_rate": 6.428770423296897e-07, + "loss": 0.6736, + "step": 25005 + }, + { + "epoch": 0.3893606918508255, + "grad_norm": 3.4601032733917236, + "learning_rate": 6.427951033250848e-07, + "loss": 0.7713, + "step": 25010 + }, + { + "epoch": 0.3894385328527949, + "grad_norm": 3.0744853019714355, + "learning_rate": 6.427131643204798e-07, + "loss": 0.8134, + "step": 25015 + }, + { + "epoch": 0.3895163738547643, + "grad_norm": 3.6189422607421875, + "learning_rate": 6.426312253158749e-07, + "loss": 0.7555, + "step": 25020 + }, + { + "epoch": 0.3895942148567336, + "grad_norm": 7.418415069580078, + "learning_rate": 6.425492863112698e-07, + "loss": 0.7212, + "step": 25025 + }, + { + "epoch": 0.389672055858703, + "grad_norm": 5.042339324951172, + "learning_rate": 6.424673473066648e-07, + "loss": 0.7072, + "step": 25030 + }, + { + "epoch": 0.3897498968606724, + "grad_norm": 3.4799351692199707, + "learning_rate": 6.423854083020599e-07, + "loss": 0.6974, + "step": 25035 + }, + { + "epoch": 0.38982773786264174, + "grad_norm": 3.5405311584472656, + "learning_rate": 6.42303469297455e-07, + "loss": 0.7653, + "step": 25040 + }, + { + "epoch": 0.38990557886461114, + "grad_norm": 8.769624710083008, + "learning_rate": 6.4222153029285e-07, + "loss": 0.7843, + "step": 25045 + }, + { + "epoch": 0.3899834198665805, + "grad_norm": 4.537402629852295, + "learning_rate": 6.42139591288245e-07, + "loss": 0.7039, + "step": 25050 + }, + { + "epoch": 0.3900612608685499, + "grad_norm": 9.237510681152344, + "learning_rate": 6.420576522836401e-07, + "loss": 0.7074, + "step": 25055 + }, + { + "epoch": 0.39013910187051926, + "grad_norm": 5.712589740753174, + "learning_rate": 6.419757132790351e-07, + "loss": 0.7478, + "step": 25060 + }, + { + "epoch": 0.39021694287248865, + "grad_norm": 2.4703423976898193, + "learning_rate": 6.4189377427443e-07, + "loss": 0.7171, + "step": 25065 + }, + { + "epoch": 0.39029478387445804, + "grad_norm": 4.031161308288574, + "learning_rate": 6.418118352698251e-07, + "loss": 0.8543, + "step": 25070 + }, + { + "epoch": 0.39037262487642743, + "grad_norm": 3.8306655883789062, + "learning_rate": 6.417298962652202e-07, + "loss": 0.6768, + "step": 25075 + }, + { + "epoch": 0.39045046587839677, + "grad_norm": 4.797236442565918, + "learning_rate": 6.416479572606151e-07, + "loss": 0.8301, + "step": 25080 + }, + { + "epoch": 0.39052830688036616, + "grad_norm": 5.457010746002197, + "learning_rate": 6.415660182560102e-07, + "loss": 0.8202, + "step": 25085 + }, + { + "epoch": 0.39060614788233555, + "grad_norm": 4.702629089355469, + "learning_rate": 6.414840792514053e-07, + "loss": 0.6733, + "step": 25090 + }, + { + "epoch": 0.39068398888430494, + "grad_norm": 8.158437728881836, + "learning_rate": 6.414021402468002e-07, + "loss": 0.8074, + "step": 25095 + }, + { + "epoch": 0.3907618298862743, + "grad_norm": 8.769953727722168, + "learning_rate": 6.413202012421953e-07, + "loss": 0.7251, + "step": 25100 + }, + { + "epoch": 0.3908396708882437, + "grad_norm": 3.4470908641815186, + "learning_rate": 6.412382622375903e-07, + "loss": 0.8341, + "step": 25105 + }, + { + "epoch": 0.39091751189021307, + "grad_norm": 8.288146018981934, + "learning_rate": 6.411563232329853e-07, + "loss": 0.8349, + "step": 25110 + }, + { + "epoch": 0.3909953528921824, + "grad_norm": 3.165147066116333, + "learning_rate": 6.410743842283803e-07, + "loss": 0.7797, + "step": 25115 + }, + { + "epoch": 0.3910731938941518, + "grad_norm": 3.657697916030884, + "learning_rate": 6.409924452237754e-07, + "loss": 0.763, + "step": 25120 + }, + { + "epoch": 0.3911510348961212, + "grad_norm": 5.660918712615967, + "learning_rate": 6.409105062191704e-07, + "loss": 0.7126, + "step": 25125 + }, + { + "epoch": 0.3912288758980906, + "grad_norm": 5.250837326049805, + "learning_rate": 6.408285672145655e-07, + "loss": 0.6867, + "step": 25130 + }, + { + "epoch": 0.3913067169000599, + "grad_norm": 5.486688613891602, + "learning_rate": 6.407466282099605e-07, + "loss": 0.7719, + "step": 25135 + }, + { + "epoch": 0.3913845579020293, + "grad_norm": 3.74428129196167, + "learning_rate": 6.406646892053555e-07, + "loss": 0.6571, + "step": 25140 + }, + { + "epoch": 0.3914623989039987, + "grad_norm": 4.203434944152832, + "learning_rate": 6.405827502007505e-07, + "loss": 0.8256, + "step": 25145 + }, + { + "epoch": 0.3915402399059681, + "grad_norm": 3.337033987045288, + "learning_rate": 6.405008111961455e-07, + "loss": 0.8205, + "step": 25150 + }, + { + "epoch": 0.3916180809079374, + "grad_norm": 5.5420355796813965, + "learning_rate": 6.404188721915405e-07, + "loss": 0.7959, + "step": 25155 + }, + { + "epoch": 0.3916959219099068, + "grad_norm": 3.8033721446990967, + "learning_rate": 6.403369331869356e-07, + "loss": 0.6854, + "step": 25160 + }, + { + "epoch": 0.3917737629118762, + "grad_norm": 3.5326404571533203, + "learning_rate": 6.402549941823307e-07, + "loss": 0.8763, + "step": 25165 + }, + { + "epoch": 0.3918516039138456, + "grad_norm": 2.995070219039917, + "learning_rate": 6.401730551777257e-07, + "loss": 0.8321, + "step": 25170 + }, + { + "epoch": 0.39192944491581494, + "grad_norm": 5.510183334350586, + "learning_rate": 6.400911161731207e-07, + "loss": 0.6439, + "step": 25175 + }, + { + "epoch": 0.39200728591778433, + "grad_norm": 3.0924971103668213, + "learning_rate": 6.400091771685158e-07, + "loss": 0.801, + "step": 25180 + }, + { + "epoch": 0.3920851269197537, + "grad_norm": 3.860745668411255, + "learning_rate": 6.399272381639109e-07, + "loss": 0.7218, + "step": 25185 + }, + { + "epoch": 0.3921629679217231, + "grad_norm": 3.2652904987335205, + "learning_rate": 6.398452991593057e-07, + "loss": 0.836, + "step": 25190 + }, + { + "epoch": 0.39224080892369245, + "grad_norm": 7.08906364440918, + "learning_rate": 6.397633601547008e-07, + "loss": 0.8404, + "step": 25195 + }, + { + "epoch": 0.39231864992566184, + "grad_norm": 2.7343451976776123, + "learning_rate": 6.396814211500959e-07, + "loss": 0.8146, + "step": 25200 + }, + { + "epoch": 0.39239649092763124, + "grad_norm": 10.399474143981934, + "learning_rate": 6.395994821454908e-07, + "loss": 0.7137, + "step": 25205 + }, + { + "epoch": 0.3924743319296006, + "grad_norm": 5.2434210777282715, + "learning_rate": 6.395175431408859e-07, + "loss": 0.7348, + "step": 25210 + }, + { + "epoch": 0.39255217293156996, + "grad_norm": 3.0255560874938965, + "learning_rate": 6.39435604136281e-07, + "loss": 0.6458, + "step": 25215 + }, + { + "epoch": 0.39263001393353936, + "grad_norm": 2.3744378089904785, + "learning_rate": 6.39353665131676e-07, + "loss": 0.7827, + "step": 25220 + }, + { + "epoch": 0.39270785493550875, + "grad_norm": 3.563133955001831, + "learning_rate": 6.39271726127071e-07, + "loss": 0.7984, + "step": 25225 + }, + { + "epoch": 0.3927856959374781, + "grad_norm": 4.089686393737793, + "learning_rate": 6.39189787122466e-07, + "loss": 0.7627, + "step": 25230 + }, + { + "epoch": 0.3928635369394475, + "grad_norm": 4.701676368713379, + "learning_rate": 6.39107848117861e-07, + "loss": 0.8174, + "step": 25235 + }, + { + "epoch": 0.39294137794141687, + "grad_norm": 4.46544075012207, + "learning_rate": 6.39025909113256e-07, + "loss": 0.7566, + "step": 25240 + }, + { + "epoch": 0.39301921894338626, + "grad_norm": 18.95890998840332, + "learning_rate": 6.389439701086511e-07, + "loss": 0.7883, + "step": 25245 + }, + { + "epoch": 0.3930970599453556, + "grad_norm": 7.087912559509277, + "learning_rate": 6.388620311040461e-07, + "loss": 0.7431, + "step": 25250 + }, + { + "epoch": 0.393174900947325, + "grad_norm": 11.888138771057129, + "learning_rate": 6.387800920994412e-07, + "loss": 0.7625, + "step": 25255 + }, + { + "epoch": 0.3932527419492944, + "grad_norm": 7.013394832611084, + "learning_rate": 6.386981530948362e-07, + "loss": 0.7365, + "step": 25260 + }, + { + "epoch": 0.3933305829512638, + "grad_norm": 3.1967785358428955, + "learning_rate": 6.386162140902312e-07, + "loss": 0.7336, + "step": 25265 + }, + { + "epoch": 0.3934084239532331, + "grad_norm": 3.2710204124450684, + "learning_rate": 6.385342750856262e-07, + "loss": 0.7243, + "step": 25270 + }, + { + "epoch": 0.3934862649552025, + "grad_norm": 5.661640644073486, + "learning_rate": 6.384523360810212e-07, + "loss": 0.7948, + "step": 25275 + }, + { + "epoch": 0.3935641059571719, + "grad_norm": 4.448530197143555, + "learning_rate": 6.383703970764162e-07, + "loss": 0.7979, + "step": 25280 + }, + { + "epoch": 0.39364194695914123, + "grad_norm": 3.535083770751953, + "learning_rate": 6.382884580718113e-07, + "loss": 0.7797, + "step": 25285 + }, + { + "epoch": 0.3937197879611106, + "grad_norm": 3.0342564582824707, + "learning_rate": 6.382065190672064e-07, + "loss": 0.6707, + "step": 25290 + }, + { + "epoch": 0.39379762896308, + "grad_norm": 3.252420663833618, + "learning_rate": 6.381245800626014e-07, + "loss": 0.6687, + "step": 25295 + }, + { + "epoch": 0.3938754699650494, + "grad_norm": 5.64980936050415, + "learning_rate": 6.380426410579964e-07, + "loss": 0.7643, + "step": 25300 + }, + { + "epoch": 0.39395331096701874, + "grad_norm": 2.8796017169952393, + "learning_rate": 6.379607020533915e-07, + "loss": 0.8468, + "step": 25305 + }, + { + "epoch": 0.39403115196898814, + "grad_norm": 4.910164833068848, + "learning_rate": 6.378787630487865e-07, + "loss": 0.8456, + "step": 25310 + }, + { + "epoch": 0.39410899297095753, + "grad_norm": 3.392526865005493, + "learning_rate": 6.377968240441814e-07, + "loss": 0.7253, + "step": 25315 + }, + { + "epoch": 0.3941868339729269, + "grad_norm": 3.7609364986419678, + "learning_rate": 6.377148850395765e-07, + "loss": 0.6837, + "step": 25320 + }, + { + "epoch": 0.39426467497489626, + "grad_norm": 3.4429965019226074, + "learning_rate": 6.376329460349716e-07, + "loss": 0.7907, + "step": 25325 + }, + { + "epoch": 0.39434251597686565, + "grad_norm": 3.964439630508423, + "learning_rate": 6.375510070303665e-07, + "loss": 0.6576, + "step": 25330 + }, + { + "epoch": 0.39442035697883504, + "grad_norm": 9.316553115844727, + "learning_rate": 6.374690680257616e-07, + "loss": 0.7046, + "step": 25335 + }, + { + "epoch": 0.39449819798080443, + "grad_norm": 3.9549331665039062, + "learning_rate": 6.373871290211567e-07, + "loss": 0.752, + "step": 25340 + }, + { + "epoch": 0.39457603898277377, + "grad_norm": 6.220914840698242, + "learning_rate": 6.373051900165517e-07, + "loss": 0.8579, + "step": 25345 + }, + { + "epoch": 0.39465387998474316, + "grad_norm": 5.23582124710083, + "learning_rate": 6.372232510119466e-07, + "loss": 0.757, + "step": 25350 + }, + { + "epoch": 0.39473172098671255, + "grad_norm": 6.521960735321045, + "learning_rate": 6.371413120073417e-07, + "loss": 0.8323, + "step": 25355 + }, + { + "epoch": 0.39480956198868195, + "grad_norm": 3.584139347076416, + "learning_rate": 6.370593730027367e-07, + "loss": 0.7943, + "step": 25360 + }, + { + "epoch": 0.3948874029906513, + "grad_norm": 13.306678771972656, + "learning_rate": 6.369774339981317e-07, + "loss": 0.73, + "step": 25365 + }, + { + "epoch": 0.3949652439926207, + "grad_norm": 4.9485039710998535, + "learning_rate": 6.368954949935268e-07, + "loss": 0.7932, + "step": 25370 + }, + { + "epoch": 0.39504308499459007, + "grad_norm": 3.292325735092163, + "learning_rate": 6.368135559889218e-07, + "loss": 0.8218, + "step": 25375 + }, + { + "epoch": 0.3951209259965594, + "grad_norm": 3.3515844345092773, + "learning_rate": 6.367316169843169e-07, + "loss": 0.7421, + "step": 25380 + }, + { + "epoch": 0.3951987669985288, + "grad_norm": 5.699435710906982, + "learning_rate": 6.366496779797119e-07, + "loss": 0.7649, + "step": 25385 + }, + { + "epoch": 0.3952766080004982, + "grad_norm": 8.88771915435791, + "learning_rate": 6.365677389751068e-07, + "loss": 0.7564, + "step": 25390 + }, + { + "epoch": 0.3953544490024676, + "grad_norm": 3.6982576847076416, + "learning_rate": 6.364857999705019e-07, + "loss": 0.7699, + "step": 25395 + }, + { + "epoch": 0.3954322900044369, + "grad_norm": 4.210885524749756, + "learning_rate": 6.36403860965897e-07, + "loss": 0.7438, + "step": 25400 + }, + { + "epoch": 0.3955101310064063, + "grad_norm": 3.2768607139587402, + "learning_rate": 6.363219219612919e-07, + "loss": 0.7445, + "step": 25405 + }, + { + "epoch": 0.3955879720083757, + "grad_norm": 4.871786117553711, + "learning_rate": 6.36239982956687e-07, + "loss": 0.7446, + "step": 25410 + }, + { + "epoch": 0.3956658130103451, + "grad_norm": 5.095002174377441, + "learning_rate": 6.361580439520821e-07, + "loss": 0.7323, + "step": 25415 + }, + { + "epoch": 0.3957436540123144, + "grad_norm": 6.775503158569336, + "learning_rate": 6.360761049474771e-07, + "loss": 0.7104, + "step": 25420 + }, + { + "epoch": 0.3958214950142838, + "grad_norm": 3.6449718475341797, + "learning_rate": 6.359941659428721e-07, + "loss": 0.8384, + "step": 25425 + }, + { + "epoch": 0.3958993360162532, + "grad_norm": 3.4597651958465576, + "learning_rate": 6.359122269382671e-07, + "loss": 0.7858, + "step": 25430 + }, + { + "epoch": 0.3959771770182226, + "grad_norm": 3.478415012359619, + "learning_rate": 6.358302879336622e-07, + "loss": 0.8736, + "step": 25435 + }, + { + "epoch": 0.39605501802019194, + "grad_norm": 4.179505348205566, + "learning_rate": 6.357483489290571e-07, + "loss": 0.7169, + "step": 25440 + }, + { + "epoch": 0.39613285902216133, + "grad_norm": 3.7945547103881836, + "learning_rate": 6.356664099244522e-07, + "loss": 0.7743, + "step": 25445 + }, + { + "epoch": 0.3962107000241307, + "grad_norm": 3.468790054321289, + "learning_rate": 6.355844709198473e-07, + "loss": 0.7189, + "step": 25450 + }, + { + "epoch": 0.39628854102610006, + "grad_norm": 5.45301628112793, + "learning_rate": 6.355025319152422e-07, + "loss": 0.8203, + "step": 25455 + }, + { + "epoch": 0.39636638202806945, + "grad_norm": 9.016706466674805, + "learning_rate": 6.354205929106373e-07, + "loss": 0.9262, + "step": 25460 + }, + { + "epoch": 0.39644422303003884, + "grad_norm": 3.1717607975006104, + "learning_rate": 6.353386539060324e-07, + "loss": 0.7329, + "step": 25465 + }, + { + "epoch": 0.39652206403200824, + "grad_norm": 2.699633836746216, + "learning_rate": 6.352567149014274e-07, + "loss": 0.6813, + "step": 25470 + }, + { + "epoch": 0.3965999050339776, + "grad_norm": 4.039689064025879, + "learning_rate": 6.351747758968223e-07, + "loss": 0.7226, + "step": 25475 + }, + { + "epoch": 0.39667774603594697, + "grad_norm": 4.107393741607666, + "learning_rate": 6.350928368922174e-07, + "loss": 0.824, + "step": 25480 + }, + { + "epoch": 0.39675558703791636, + "grad_norm": 4.444277763366699, + "learning_rate": 6.350108978876124e-07, + "loss": 0.8854, + "step": 25485 + }, + { + "epoch": 0.39683342803988575, + "grad_norm": 6.592438220977783, + "learning_rate": 6.349289588830075e-07, + "loss": 0.651, + "step": 25490 + }, + { + "epoch": 0.3969112690418551, + "grad_norm": 6.207258224487305, + "learning_rate": 6.348470198784025e-07, + "loss": 0.6449, + "step": 25495 + }, + { + "epoch": 0.3969891100438245, + "grad_norm": 4.503571033477783, + "learning_rate": 6.347650808737975e-07, + "loss": 0.807, + "step": 25500 + }, + { + "epoch": 0.39706695104579387, + "grad_norm": 3.707956075668335, + "learning_rate": 6.346831418691926e-07, + "loss": 0.7628, + "step": 25505 + }, + { + "epoch": 0.39714479204776326, + "grad_norm": 2.8668758869171143, + "learning_rate": 6.346012028645876e-07, + "loss": 0.6728, + "step": 25510 + }, + { + "epoch": 0.3972226330497326, + "grad_norm": 5.505023002624512, + "learning_rate": 6.345192638599825e-07, + "loss": 0.7744, + "step": 25515 + }, + { + "epoch": 0.397300474051702, + "grad_norm": 2.5895702838897705, + "learning_rate": 6.344373248553776e-07, + "loss": 0.7283, + "step": 25520 + }, + { + "epoch": 0.3973783150536714, + "grad_norm": 3.2556867599487305, + "learning_rate": 6.343553858507727e-07, + "loss": 0.6701, + "step": 25525 + }, + { + "epoch": 0.3974561560556408, + "grad_norm": 3.534254550933838, + "learning_rate": 6.342734468461676e-07, + "loss": 0.8053, + "step": 25530 + }, + { + "epoch": 0.3975339970576101, + "grad_norm": 4.892989158630371, + "learning_rate": 6.341915078415627e-07, + "loss": 0.6051, + "step": 25535 + }, + { + "epoch": 0.3976118380595795, + "grad_norm": 3.6802456378936768, + "learning_rate": 6.341095688369578e-07, + "loss": 0.7358, + "step": 25540 + }, + { + "epoch": 0.3976896790615489, + "grad_norm": 3.2066876888275146, + "learning_rate": 6.340276298323529e-07, + "loss": 0.7943, + "step": 25545 + }, + { + "epoch": 0.39776752006351823, + "grad_norm": 5.907846450805664, + "learning_rate": 6.339456908277478e-07, + "loss": 0.7533, + "step": 25550 + }, + { + "epoch": 0.3978453610654876, + "grad_norm": 4.704529762268066, + "learning_rate": 6.338637518231428e-07, + "loss": 0.6999, + "step": 25555 + }, + { + "epoch": 0.397923202067457, + "grad_norm": 3.5781517028808594, + "learning_rate": 6.337818128185379e-07, + "loss": 0.7683, + "step": 25560 + }, + { + "epoch": 0.3980010430694264, + "grad_norm": 3.7871382236480713, + "learning_rate": 6.336998738139328e-07, + "loss": 0.903, + "step": 25565 + }, + { + "epoch": 0.39807888407139574, + "grad_norm": 6.048968315124512, + "learning_rate": 6.336179348093279e-07, + "loss": 0.8356, + "step": 25570 + }, + { + "epoch": 0.39815672507336514, + "grad_norm": 4.461136341094971, + "learning_rate": 6.33535995804723e-07, + "loss": 0.819, + "step": 25575 + }, + { + "epoch": 0.39823456607533453, + "grad_norm": 3.7887139320373535, + "learning_rate": 6.33454056800118e-07, + "loss": 0.7609, + "step": 25580 + }, + { + "epoch": 0.3983124070773039, + "grad_norm": 3.172938823699951, + "learning_rate": 6.33372117795513e-07, + "loss": 0.76, + "step": 25585 + }, + { + "epoch": 0.39839024807927326, + "grad_norm": 4.127932548522949, + "learning_rate": 6.332901787909081e-07, + "loss": 0.808, + "step": 25590 + }, + { + "epoch": 0.39846808908124265, + "grad_norm": 3.771493911743164, + "learning_rate": 6.33208239786303e-07, + "loss": 0.8112, + "step": 25595 + }, + { + "epoch": 0.39854593008321204, + "grad_norm": 4.411073207855225, + "learning_rate": 6.33126300781698e-07, + "loss": 0.894, + "step": 25600 + }, + { + "epoch": 0.39862377108518143, + "grad_norm": 4.447315216064453, + "learning_rate": 6.330443617770931e-07, + "loss": 0.7567, + "step": 25605 + }, + { + "epoch": 0.39870161208715077, + "grad_norm": 6.834893703460693, + "learning_rate": 6.329624227724881e-07, + "loss": 0.9129, + "step": 25610 + }, + { + "epoch": 0.39877945308912016, + "grad_norm": 4.036401748657227, + "learning_rate": 6.328804837678832e-07, + "loss": 0.8468, + "step": 25615 + }, + { + "epoch": 0.39885729409108955, + "grad_norm": 3.791520833969116, + "learning_rate": 6.327985447632782e-07, + "loss": 0.7759, + "step": 25620 + }, + { + "epoch": 0.3989351350930589, + "grad_norm": 3.5723390579223633, + "learning_rate": 6.327166057586732e-07, + "loss": 0.6774, + "step": 25625 + }, + { + "epoch": 0.3990129760950283, + "grad_norm": 10.609052658081055, + "learning_rate": 6.326346667540683e-07, + "loss": 0.8008, + "step": 25630 + }, + { + "epoch": 0.3990908170969977, + "grad_norm": 3.5287327766418457, + "learning_rate": 6.325527277494633e-07, + "loss": 0.6881, + "step": 25635 + }, + { + "epoch": 0.39916865809896707, + "grad_norm": 3.306892156600952, + "learning_rate": 6.324707887448582e-07, + "loss": 0.7992, + "step": 25640 + }, + { + "epoch": 0.3992464991009364, + "grad_norm": 3.6389737129211426, + "learning_rate": 6.323888497402533e-07, + "loss": 0.5907, + "step": 25645 + }, + { + "epoch": 0.3993243401029058, + "grad_norm": 3.8080899715423584, + "learning_rate": 6.323069107356484e-07, + "loss": 0.7649, + "step": 25650 + }, + { + "epoch": 0.3994021811048752, + "grad_norm": 3.856809139251709, + "learning_rate": 6.322249717310433e-07, + "loss": 0.783, + "step": 25655 + }, + { + "epoch": 0.3994800221068446, + "grad_norm": 2.6995177268981934, + "learning_rate": 6.321430327264384e-07, + "loss": 0.7815, + "step": 25660 + }, + { + "epoch": 0.3995578631088139, + "grad_norm": 3.1869149208068848, + "learning_rate": 6.320610937218335e-07, + "loss": 0.7579, + "step": 25665 + }, + { + "epoch": 0.3996357041107833, + "grad_norm": 3.3936657905578613, + "learning_rate": 6.319791547172286e-07, + "loss": 0.6582, + "step": 25670 + }, + { + "epoch": 0.3997135451127527, + "grad_norm": 3.886843204498291, + "learning_rate": 6.318972157126234e-07, + "loss": 0.7062, + "step": 25675 + }, + { + "epoch": 0.3997913861147221, + "grad_norm": 5.519811153411865, + "learning_rate": 6.318152767080185e-07, + "loss": 0.7387, + "step": 25680 + }, + { + "epoch": 0.39986922711669143, + "grad_norm": 4.253294467926025, + "learning_rate": 6.317333377034136e-07, + "loss": 0.6748, + "step": 25685 + }, + { + "epoch": 0.3999470681186608, + "grad_norm": 2.7694480419158936, + "learning_rate": 6.316513986988085e-07, + "loss": 0.8194, + "step": 25690 + }, + { + "epoch": 0.4000249091206302, + "grad_norm": 3.7576897144317627, + "learning_rate": 6.315694596942036e-07, + "loss": 0.6366, + "step": 25695 + }, + { + "epoch": 0.4001027501225996, + "grad_norm": 5.782569408416748, + "learning_rate": 6.314875206895987e-07, + "loss": 0.8115, + "step": 25700 + }, + { + "epoch": 0.40018059112456894, + "grad_norm": 3.806763172149658, + "learning_rate": 6.314055816849937e-07, + "loss": 0.791, + "step": 25705 + }, + { + "epoch": 0.40025843212653833, + "grad_norm": 7.054147720336914, + "learning_rate": 6.313236426803887e-07, + "loss": 0.7312, + "step": 25710 + }, + { + "epoch": 0.4003362731285077, + "grad_norm": 3.344073534011841, + "learning_rate": 6.312417036757837e-07, + "loss": 0.7061, + "step": 25715 + }, + { + "epoch": 0.40041411413047706, + "grad_norm": 3.3587987422943115, + "learning_rate": 6.311597646711787e-07, + "loss": 0.7641, + "step": 25720 + }, + { + "epoch": 0.40049195513244645, + "grad_norm": 2.2525744438171387, + "learning_rate": 6.310778256665738e-07, + "loss": 0.8433, + "step": 25725 + }, + { + "epoch": 0.40056979613441585, + "grad_norm": 5.507537841796875, + "learning_rate": 6.309958866619688e-07, + "loss": 0.8923, + "step": 25730 + }, + { + "epoch": 0.40064763713638524, + "grad_norm": 5.3320512771606445, + "learning_rate": 6.309139476573638e-07, + "loss": 0.7727, + "step": 25735 + }, + { + "epoch": 0.4007254781383546, + "grad_norm": 3.6437861919403076, + "learning_rate": 6.308320086527589e-07, + "loss": 0.7592, + "step": 25740 + }, + { + "epoch": 0.40080331914032397, + "grad_norm": 5.713900566101074, + "learning_rate": 6.307500696481539e-07, + "loss": 0.7082, + "step": 25745 + }, + { + "epoch": 0.40088116014229336, + "grad_norm": 6.304920673370361, + "learning_rate": 6.306681306435489e-07, + "loss": 0.7573, + "step": 25750 + }, + { + "epoch": 0.40095900114426275, + "grad_norm": 3.1046628952026367, + "learning_rate": 6.305861916389439e-07, + "loss": 0.8484, + "step": 25755 + }, + { + "epoch": 0.4010368421462321, + "grad_norm": 2.9262259006500244, + "learning_rate": 6.30504252634339e-07, + "loss": 0.7134, + "step": 25760 + }, + { + "epoch": 0.4011146831482015, + "grad_norm": 5.784107208251953, + "learning_rate": 6.304223136297339e-07, + "loss": 0.8202, + "step": 25765 + }, + { + "epoch": 0.40119252415017087, + "grad_norm": 3.4631705284118652, + "learning_rate": 6.30340374625129e-07, + "loss": 0.9114, + "step": 25770 + }, + { + "epoch": 0.40127036515214026, + "grad_norm": 3.4223835468292236, + "learning_rate": 6.302584356205241e-07, + "loss": 0.7365, + "step": 25775 + }, + { + "epoch": 0.4013482061541096, + "grad_norm": 8.433497428894043, + "learning_rate": 6.30176496615919e-07, + "loss": 0.6492, + "step": 25780 + }, + { + "epoch": 0.401426047156079, + "grad_norm": 4.8615827560424805, + "learning_rate": 6.300945576113141e-07, + "loss": 0.6824, + "step": 25785 + }, + { + "epoch": 0.4015038881580484, + "grad_norm": 4.456131935119629, + "learning_rate": 6.300126186067092e-07, + "loss": 0.7575, + "step": 25790 + }, + { + "epoch": 0.4015817291600178, + "grad_norm": 3.5618176460266113, + "learning_rate": 6.299306796021043e-07, + "loss": 0.7174, + "step": 25795 + }, + { + "epoch": 0.4016595701619871, + "grad_norm": 3.5955419540405273, + "learning_rate": 6.298487405974991e-07, + "loss": 0.7488, + "step": 25800 + }, + { + "epoch": 0.4017374111639565, + "grad_norm": 3.817229986190796, + "learning_rate": 6.297668015928942e-07, + "loss": 0.7705, + "step": 25805 + }, + { + "epoch": 0.4018152521659259, + "grad_norm": 2.675896167755127, + "learning_rate": 6.296848625882893e-07, + "loss": 0.7554, + "step": 25810 + }, + { + "epoch": 0.40189309316789523, + "grad_norm": 5.006580829620361, + "learning_rate": 6.296029235836843e-07, + "loss": 0.7452, + "step": 25815 + }, + { + "epoch": 0.4019709341698646, + "grad_norm": 7.0662946701049805, + "learning_rate": 6.295209845790793e-07, + "loss": 0.7118, + "step": 25820 + }, + { + "epoch": 0.402048775171834, + "grad_norm": 4.054603099822998, + "learning_rate": 6.294390455744744e-07, + "loss": 0.7652, + "step": 25825 + }, + { + "epoch": 0.4021266161738034, + "grad_norm": 6.0550408363342285, + "learning_rate": 6.293571065698694e-07, + "loss": 0.6895, + "step": 25830 + }, + { + "epoch": 0.40220445717577274, + "grad_norm": 4.203376293182373, + "learning_rate": 6.292751675652644e-07, + "loss": 0.751, + "step": 25835 + }, + { + "epoch": 0.40228229817774214, + "grad_norm": 3.9787485599517822, + "learning_rate": 6.291932285606594e-07, + "loss": 0.6545, + "step": 25840 + }, + { + "epoch": 0.40236013917971153, + "grad_norm": 3.0336601734161377, + "learning_rate": 6.291112895560544e-07, + "loss": 0.8396, + "step": 25845 + }, + { + "epoch": 0.4024379801816809, + "grad_norm": 4.524356365203857, + "learning_rate": 6.290293505514495e-07, + "loss": 0.7489, + "step": 25850 + }, + { + "epoch": 0.40251582118365026, + "grad_norm": 2.6053860187530518, + "learning_rate": 6.289474115468445e-07, + "loss": 0.7061, + "step": 25855 + }, + { + "epoch": 0.40259366218561965, + "grad_norm": 3.6728532314300537, + "learning_rate": 6.288654725422395e-07, + "loss": 0.7726, + "step": 25860 + }, + { + "epoch": 0.40267150318758904, + "grad_norm": 10.254779815673828, + "learning_rate": 6.287835335376346e-07, + "loss": 0.7134, + "step": 25865 + }, + { + "epoch": 0.40274934418955843, + "grad_norm": 4.043647766113281, + "learning_rate": 6.287015945330297e-07, + "loss": 0.7779, + "step": 25870 + }, + { + "epoch": 0.40282718519152777, + "grad_norm": 3.0604496002197266, + "learning_rate": 6.286196555284246e-07, + "loss": 0.8593, + "step": 25875 + }, + { + "epoch": 0.40290502619349716, + "grad_norm": 6.564319610595703, + "learning_rate": 6.285377165238196e-07, + "loss": 0.6981, + "step": 25880 + }, + { + "epoch": 0.40298286719546655, + "grad_norm": 4.262159824371338, + "learning_rate": 6.284557775192147e-07, + "loss": 0.7919, + "step": 25885 + }, + { + "epoch": 0.4030607081974359, + "grad_norm": 5.1394853591918945, + "learning_rate": 6.283738385146096e-07, + "loss": 0.8422, + "step": 25890 + }, + { + "epoch": 0.4031385491994053, + "grad_norm": 3.8123087882995605, + "learning_rate": 6.282918995100047e-07, + "loss": 0.7922, + "step": 25895 + }, + { + "epoch": 0.4032163902013747, + "grad_norm": 6.320923805236816, + "learning_rate": 6.282099605053998e-07, + "loss": 0.7794, + "step": 25900 + }, + { + "epoch": 0.40329423120334407, + "grad_norm": 3.729196310043335, + "learning_rate": 6.281280215007948e-07, + "loss": 0.6516, + "step": 25905 + }, + { + "epoch": 0.4033720722053134, + "grad_norm": 4.0743255615234375, + "learning_rate": 6.280460824961898e-07, + "loss": 0.8156, + "step": 25910 + }, + { + "epoch": 0.4034499132072828, + "grad_norm": 2.5866003036499023, + "learning_rate": 6.279641434915849e-07, + "loss": 0.7341, + "step": 25915 + }, + { + "epoch": 0.4035277542092522, + "grad_norm": 3.3593242168426514, + "learning_rate": 6.278822044869798e-07, + "loss": 0.7359, + "step": 25920 + }, + { + "epoch": 0.4036055952112216, + "grad_norm": 3.165477991104126, + "learning_rate": 6.278002654823748e-07, + "loss": 0.6969, + "step": 25925 + }, + { + "epoch": 0.4036834362131909, + "grad_norm": 4.103423595428467, + "learning_rate": 6.277183264777699e-07, + "loss": 0.7591, + "step": 25930 + }, + { + "epoch": 0.4037612772151603, + "grad_norm": 6.460220813751221, + "learning_rate": 6.27636387473165e-07, + "loss": 0.8588, + "step": 25935 + }, + { + "epoch": 0.4038391182171297, + "grad_norm": 4.974887847900391, + "learning_rate": 6.2755444846856e-07, + "loss": 0.8204, + "step": 25940 + }, + { + "epoch": 0.4039169592190991, + "grad_norm": 3.6187150478363037, + "learning_rate": 6.27472509463955e-07, + "loss": 0.8373, + "step": 25945 + }, + { + "epoch": 0.40399480022106843, + "grad_norm": 4.209701061248779, + "learning_rate": 6.273905704593501e-07, + "loss": 0.7623, + "step": 25950 + }, + { + "epoch": 0.4040726412230378, + "grad_norm": 3.2338383197784424, + "learning_rate": 6.273086314547451e-07, + "loss": 0.8593, + "step": 25955 + }, + { + "epoch": 0.4041504822250072, + "grad_norm": 4.725485324859619, + "learning_rate": 6.2722669245014e-07, + "loss": 0.7365, + "step": 25960 + }, + { + "epoch": 0.4042283232269766, + "grad_norm": 7.591894149780273, + "learning_rate": 6.271447534455351e-07, + "loss": 0.6815, + "step": 25965 + }, + { + "epoch": 0.40430616422894594, + "grad_norm": 6.150928497314453, + "learning_rate": 6.270628144409301e-07, + "loss": 0.7711, + "step": 25970 + }, + { + "epoch": 0.40438400523091533, + "grad_norm": 5.162087440490723, + "learning_rate": 6.269808754363252e-07, + "loss": 0.778, + "step": 25975 + }, + { + "epoch": 0.4044618462328847, + "grad_norm": 11.32683277130127, + "learning_rate": 6.268989364317202e-07, + "loss": 0.8482, + "step": 25980 + }, + { + "epoch": 0.40453968723485406, + "grad_norm": 3.6983065605163574, + "learning_rate": 6.268169974271152e-07, + "loss": 0.7804, + "step": 25985 + }, + { + "epoch": 0.40461752823682345, + "grad_norm": 3.600092887878418, + "learning_rate": 6.267350584225103e-07, + "loss": 0.76, + "step": 25990 + }, + { + "epoch": 0.40469536923879285, + "grad_norm": 4.2920050621032715, + "learning_rate": 6.266531194179054e-07, + "loss": 0.6658, + "step": 25995 + }, + { + "epoch": 0.40477321024076224, + "grad_norm": 3.719242572784424, + "learning_rate": 6.265711804133002e-07, + "loss": 0.7362, + "step": 26000 + }, + { + "epoch": 0.4048510512427316, + "grad_norm": 4.887945175170898, + "learning_rate": 6.264892414086953e-07, + "loss": 0.8455, + "step": 26005 + }, + { + "epoch": 0.40492889224470097, + "grad_norm": 3.4599506855010986, + "learning_rate": 6.264073024040904e-07, + "loss": 0.8767, + "step": 26010 + }, + { + "epoch": 0.40500673324667036, + "grad_norm": 3.034026861190796, + "learning_rate": 6.263253633994853e-07, + "loss": 0.7008, + "step": 26015 + }, + { + "epoch": 0.40508457424863975, + "grad_norm": 4.146843433380127, + "learning_rate": 6.262434243948804e-07, + "loss": 0.7626, + "step": 26020 + }, + { + "epoch": 0.4051624152506091, + "grad_norm": 6.154612064361572, + "learning_rate": 6.261614853902755e-07, + "loss": 0.8537, + "step": 26025 + }, + { + "epoch": 0.4052402562525785, + "grad_norm": 2.850789785385132, + "learning_rate": 6.260795463856705e-07, + "loss": 0.7535, + "step": 26030 + }, + { + "epoch": 0.40531809725454787, + "grad_norm": 3.5527889728546143, + "learning_rate": 6.259976073810655e-07, + "loss": 0.8057, + "step": 26035 + }, + { + "epoch": 0.40539593825651726, + "grad_norm": 3.5174005031585693, + "learning_rate": 6.259156683764605e-07, + "loss": 0.7248, + "step": 26040 + }, + { + "epoch": 0.4054737792584866, + "grad_norm": 3.333939790725708, + "learning_rate": 6.258337293718555e-07, + "loss": 0.8365, + "step": 26045 + }, + { + "epoch": 0.405551620260456, + "grad_norm": 3.6032981872558594, + "learning_rate": 6.257517903672505e-07, + "loss": 0.7541, + "step": 26050 + }, + { + "epoch": 0.4056294612624254, + "grad_norm": 2.9027950763702393, + "learning_rate": 6.256698513626456e-07, + "loss": 0.7457, + "step": 26055 + }, + { + "epoch": 0.4057073022643947, + "grad_norm": 3.3258626461029053, + "learning_rate": 6.255879123580407e-07, + "loss": 0.8462, + "step": 26060 + }, + { + "epoch": 0.4057851432663641, + "grad_norm": 7.839389801025391, + "learning_rate": 6.255059733534357e-07, + "loss": 0.6776, + "step": 26065 + }, + { + "epoch": 0.4058629842683335, + "grad_norm": 4.488097190856934, + "learning_rate": 6.254240343488307e-07, + "loss": 0.7482, + "step": 26070 + }, + { + "epoch": 0.4059408252703029, + "grad_norm": 5.980696678161621, + "learning_rate": 6.253420953442258e-07, + "loss": 0.6656, + "step": 26075 + }, + { + "epoch": 0.40601866627227223, + "grad_norm": 2.6422781944274902, + "learning_rate": 6.252601563396207e-07, + "loss": 0.7025, + "step": 26080 + }, + { + "epoch": 0.4060965072742416, + "grad_norm": 6.426515102386475, + "learning_rate": 6.251782173350158e-07, + "loss": 0.8232, + "step": 26085 + }, + { + "epoch": 0.406174348276211, + "grad_norm": 8.42750358581543, + "learning_rate": 6.250962783304108e-07, + "loss": 0.7168, + "step": 26090 + }, + { + "epoch": 0.4062521892781804, + "grad_norm": 3.8758537769317627, + "learning_rate": 6.250143393258058e-07, + "loss": 0.8671, + "step": 26095 + }, + { + "epoch": 0.40633003028014975, + "grad_norm": 3.026585817337036, + "learning_rate": 6.249324003212009e-07, + "loss": 0.8579, + "step": 26100 + }, + { + "epoch": 0.40640787128211914, + "grad_norm": 4.004497051239014, + "learning_rate": 6.248504613165959e-07, + "loss": 0.7519, + "step": 26105 + }, + { + "epoch": 0.40648571228408853, + "grad_norm": 3.6319313049316406, + "learning_rate": 6.247685223119909e-07, + "loss": 0.8311, + "step": 26110 + }, + { + "epoch": 0.4065635532860579, + "grad_norm": 5.742655277252197, + "learning_rate": 6.24686583307386e-07, + "loss": 0.8036, + "step": 26115 + }, + { + "epoch": 0.40664139428802726, + "grad_norm": 8.020777702331543, + "learning_rate": 6.246046443027811e-07, + "loss": 0.8244, + "step": 26120 + }, + { + "epoch": 0.40671923528999665, + "grad_norm": 4.790924072265625, + "learning_rate": 6.245227052981759e-07, + "loss": 0.6761, + "step": 26125 + }, + { + "epoch": 0.40679707629196604, + "grad_norm": 4.601445198059082, + "learning_rate": 6.24440766293571e-07, + "loss": 0.7301, + "step": 26130 + }, + { + "epoch": 0.40687491729393543, + "grad_norm": 4.280304431915283, + "learning_rate": 6.243588272889661e-07, + "loss": 0.7748, + "step": 26135 + }, + { + "epoch": 0.40695275829590477, + "grad_norm": 6.949324607849121, + "learning_rate": 6.24276888284361e-07, + "loss": 0.7076, + "step": 26140 + }, + { + "epoch": 0.40703059929787416, + "grad_norm": 2.876680850982666, + "learning_rate": 6.241949492797561e-07, + "loss": 0.6675, + "step": 26145 + }, + { + "epoch": 0.40710844029984355, + "grad_norm": 3.646688461303711, + "learning_rate": 6.241130102751512e-07, + "loss": 0.7767, + "step": 26150 + }, + { + "epoch": 0.4071862813018129, + "grad_norm": 3.557560920715332, + "learning_rate": 6.240310712705462e-07, + "loss": 0.7684, + "step": 26155 + }, + { + "epoch": 0.4072641223037823, + "grad_norm": 6.114151954650879, + "learning_rate": 6.239491322659412e-07, + "loss": 0.7599, + "step": 26160 + }, + { + "epoch": 0.4073419633057517, + "grad_norm": 5.147528171539307, + "learning_rate": 6.238671932613362e-07, + "loss": 0.7908, + "step": 26165 + }, + { + "epoch": 0.40741980430772107, + "grad_norm": 3.184091091156006, + "learning_rate": 6.237852542567312e-07, + "loss": 0.6857, + "step": 26170 + }, + { + "epoch": 0.4074976453096904, + "grad_norm": 6.086164474487305, + "learning_rate": 6.237033152521263e-07, + "loss": 0.8387, + "step": 26175 + }, + { + "epoch": 0.4075754863116598, + "grad_norm": 2.5259017944335938, + "learning_rate": 6.236213762475213e-07, + "loss": 0.7188, + "step": 26180 + }, + { + "epoch": 0.4076533273136292, + "grad_norm": 5.400747776031494, + "learning_rate": 6.235394372429164e-07, + "loss": 0.7216, + "step": 26185 + }, + { + "epoch": 0.4077311683155986, + "grad_norm": 4.082566738128662, + "learning_rate": 6.234574982383114e-07, + "loss": 0.7062, + "step": 26190 + }, + { + "epoch": 0.4078090093175679, + "grad_norm": 4.59686279296875, + "learning_rate": 6.233755592337064e-07, + "loss": 0.8353, + "step": 26195 + }, + { + "epoch": 0.4078868503195373, + "grad_norm": 3.698218822479248, + "learning_rate": 6.232936202291015e-07, + "loss": 0.7123, + "step": 26200 + }, + { + "epoch": 0.4079646913215067, + "grad_norm": 3.4450857639312744, + "learning_rate": 6.232116812244964e-07, + "loss": 0.7248, + "step": 26205 + }, + { + "epoch": 0.4080425323234761, + "grad_norm": 4.156335353851318, + "learning_rate": 6.231297422198915e-07, + "loss": 0.7683, + "step": 26210 + }, + { + "epoch": 0.40812037332544543, + "grad_norm": 7.560205459594727, + "learning_rate": 6.230478032152865e-07, + "loss": 0.8649, + "step": 26215 + }, + { + "epoch": 0.4081982143274148, + "grad_norm": 5.544355869293213, + "learning_rate": 6.229658642106815e-07, + "loss": 0.6824, + "step": 26220 + }, + { + "epoch": 0.4082760553293842, + "grad_norm": 2.8157401084899902, + "learning_rate": 6.228839252060766e-07, + "loss": 0.8322, + "step": 26225 + }, + { + "epoch": 0.40835389633135355, + "grad_norm": 3.6721606254577637, + "learning_rate": 6.228019862014717e-07, + "loss": 0.7834, + "step": 26230 + }, + { + "epoch": 0.40843173733332294, + "grad_norm": 2.750293731689453, + "learning_rate": 6.227200471968666e-07, + "loss": 0.7128, + "step": 26235 + }, + { + "epoch": 0.40850957833529233, + "grad_norm": 6.421826362609863, + "learning_rate": 6.226381081922617e-07, + "loss": 0.7421, + "step": 26240 + }, + { + "epoch": 0.4085874193372617, + "grad_norm": 3.734286069869995, + "learning_rate": 6.225561691876567e-07, + "loss": 0.7068, + "step": 26245 + }, + { + "epoch": 0.40866526033923106, + "grad_norm": 3.162724494934082, + "learning_rate": 6.224742301830516e-07, + "loss": 0.7293, + "step": 26250 + }, + { + "epoch": 0.40874310134120045, + "grad_norm": 6.575493335723877, + "learning_rate": 6.223922911784467e-07, + "loss": 0.6914, + "step": 26255 + }, + { + "epoch": 0.40882094234316985, + "grad_norm": 5.713708877563477, + "learning_rate": 6.223103521738418e-07, + "loss": 0.7782, + "step": 26260 + }, + { + "epoch": 0.40889878334513924, + "grad_norm": 3.8541054725646973, + "learning_rate": 6.222284131692368e-07, + "loss": 0.8174, + "step": 26265 + }, + { + "epoch": 0.4089766243471086, + "grad_norm": 2.532059907913208, + "learning_rate": 6.221464741646318e-07, + "loss": 0.6114, + "step": 26270 + }, + { + "epoch": 0.40905446534907797, + "grad_norm": 9.393692970275879, + "learning_rate": 6.220645351600269e-07, + "loss": 0.7823, + "step": 26275 + }, + { + "epoch": 0.40913230635104736, + "grad_norm": 3.4701974391937256, + "learning_rate": 6.219825961554219e-07, + "loss": 0.7245, + "step": 26280 + }, + { + "epoch": 0.40921014735301675, + "grad_norm": 9.034595489501953, + "learning_rate": 6.219006571508168e-07, + "loss": 0.8293, + "step": 26285 + }, + { + "epoch": 0.4092879883549861, + "grad_norm": 6.221508026123047, + "learning_rate": 6.218187181462119e-07, + "loss": 0.7924, + "step": 26290 + }, + { + "epoch": 0.4093658293569555, + "grad_norm": 3.99814772605896, + "learning_rate": 6.217367791416069e-07, + "loss": 0.7358, + "step": 26295 + }, + { + "epoch": 0.40944367035892487, + "grad_norm": 2.8011677265167236, + "learning_rate": 6.21654840137002e-07, + "loss": 0.771, + "step": 26300 + }, + { + "epoch": 0.40952151136089426, + "grad_norm": 4.2195515632629395, + "learning_rate": 6.21572901132397e-07, + "loss": 0.6896, + "step": 26305 + }, + { + "epoch": 0.4095993523628636, + "grad_norm": 4.860198974609375, + "learning_rate": 6.214909621277921e-07, + "loss": 0.774, + "step": 26310 + }, + { + "epoch": 0.409677193364833, + "grad_norm": 2.483884334564209, + "learning_rate": 6.214090231231871e-07, + "loss": 0.8224, + "step": 26315 + }, + { + "epoch": 0.4097550343668024, + "grad_norm": 3.0667426586151123, + "learning_rate": 6.213270841185822e-07, + "loss": 0.7239, + "step": 26320 + }, + { + "epoch": 0.4098328753687717, + "grad_norm": 6.193851470947266, + "learning_rate": 6.212451451139771e-07, + "loss": 0.7518, + "step": 26325 + }, + { + "epoch": 0.4099107163707411, + "grad_norm": 3.35817551612854, + "learning_rate": 6.211632061093721e-07, + "loss": 0.8608, + "step": 26330 + }, + { + "epoch": 0.4099885573727105, + "grad_norm": 4.190231800079346, + "learning_rate": 6.210812671047672e-07, + "loss": 0.8772, + "step": 26335 + }, + { + "epoch": 0.4100663983746799, + "grad_norm": 3.7101292610168457, + "learning_rate": 6.209993281001622e-07, + "loss": 0.8175, + "step": 26340 + }, + { + "epoch": 0.41014423937664923, + "grad_norm": 3.1132471561431885, + "learning_rate": 6.209173890955572e-07, + "loss": 0.7291, + "step": 26345 + }, + { + "epoch": 0.4102220803786186, + "grad_norm": 4.467872142791748, + "learning_rate": 6.208354500909523e-07, + "loss": 0.8373, + "step": 26350 + }, + { + "epoch": 0.410299921380588, + "grad_norm": 3.7462499141693115, + "learning_rate": 6.207535110863474e-07, + "loss": 0.7824, + "step": 26355 + }, + { + "epoch": 0.4103777623825574, + "grad_norm": 3.6446940898895264, + "learning_rate": 6.206715720817423e-07, + "loss": 0.858, + "step": 26360 + }, + { + "epoch": 0.41045560338452675, + "grad_norm": 4.626848220825195, + "learning_rate": 6.205896330771373e-07, + "loss": 0.8112, + "step": 26365 + }, + { + "epoch": 0.41053344438649614, + "grad_norm": 12.126510620117188, + "learning_rate": 6.205076940725324e-07, + "loss": 0.6869, + "step": 26370 + }, + { + "epoch": 0.41061128538846553, + "grad_norm": 2.845858335494995, + "learning_rate": 6.204257550679273e-07, + "loss": 0.6595, + "step": 26375 + }, + { + "epoch": 0.4106891263904349, + "grad_norm": 6.0855712890625, + "learning_rate": 6.203438160633224e-07, + "loss": 0.8124, + "step": 26380 + }, + { + "epoch": 0.41076696739240426, + "grad_norm": 3.1703717708587646, + "learning_rate": 6.202618770587175e-07, + "loss": 0.7564, + "step": 26385 + }, + { + "epoch": 0.41084480839437365, + "grad_norm": 2.412677049636841, + "learning_rate": 6.201799380541125e-07, + "loss": 0.8151, + "step": 26390 + }, + { + "epoch": 0.41092264939634304, + "grad_norm": 4.135179042816162, + "learning_rate": 6.200979990495075e-07, + "loss": 0.7567, + "step": 26395 + }, + { + "epoch": 0.41100049039831243, + "grad_norm": 3.759422779083252, + "learning_rate": 6.200160600449026e-07, + "loss": 0.666, + "step": 26400 + }, + { + "epoch": 0.41107833140028177, + "grad_norm": 4.042479038238525, + "learning_rate": 6.199341210402975e-07, + "loss": 0.7447, + "step": 26405 + }, + { + "epoch": 0.41115617240225116, + "grad_norm": 3.1750473976135254, + "learning_rate": 6.198521820356925e-07, + "loss": 0.7239, + "step": 26410 + }, + { + "epoch": 0.41123401340422056, + "grad_norm": 3.660675525665283, + "learning_rate": 6.197702430310876e-07, + "loss": 0.7659, + "step": 26415 + }, + { + "epoch": 0.4113118544061899, + "grad_norm": 5.285322666168213, + "learning_rate": 6.196883040264826e-07, + "loss": 0.7527, + "step": 26420 + }, + { + "epoch": 0.4113896954081593, + "grad_norm": 4.534871578216553, + "learning_rate": 6.196063650218777e-07, + "loss": 0.7233, + "step": 26425 + }, + { + "epoch": 0.4114675364101287, + "grad_norm": 4.243113994598389, + "learning_rate": 6.195244260172727e-07, + "loss": 0.8813, + "step": 26430 + }, + { + "epoch": 0.41154537741209807, + "grad_norm": 8.809892654418945, + "learning_rate": 6.194424870126678e-07, + "loss": 0.7059, + "step": 26435 + }, + { + "epoch": 0.4116232184140674, + "grad_norm": 7.20442008972168, + "learning_rate": 6.193605480080628e-07, + "loss": 0.767, + "step": 26440 + }, + { + "epoch": 0.4117010594160368, + "grad_norm": 6.816311836242676, + "learning_rate": 6.192786090034579e-07, + "loss": 0.727, + "step": 26445 + }, + { + "epoch": 0.4117789004180062, + "grad_norm": 3.0180819034576416, + "learning_rate": 6.191966699988528e-07, + "loss": 0.6395, + "step": 26450 + }, + { + "epoch": 0.4118567414199756, + "grad_norm": 8.097386360168457, + "learning_rate": 6.191147309942478e-07, + "loss": 0.8232, + "step": 26455 + }, + { + "epoch": 0.4119345824219449, + "grad_norm": 2.4666662216186523, + "learning_rate": 6.190327919896429e-07, + "loss": 0.793, + "step": 26460 + }, + { + "epoch": 0.4120124234239143, + "grad_norm": 2.839620351791382, + "learning_rate": 6.18950852985038e-07, + "loss": 0.7188, + "step": 26465 + }, + { + "epoch": 0.4120902644258837, + "grad_norm": 4.023433685302734, + "learning_rate": 6.188689139804329e-07, + "loss": 0.719, + "step": 26470 + }, + { + "epoch": 0.4121681054278531, + "grad_norm": 6.379566192626953, + "learning_rate": 6.18786974975828e-07, + "loss": 0.8544, + "step": 26475 + }, + { + "epoch": 0.41224594642982243, + "grad_norm": 3.7102549076080322, + "learning_rate": 6.187050359712231e-07, + "loss": 0.7426, + "step": 26480 + }, + { + "epoch": 0.4123237874317918, + "grad_norm": 4.392475128173828, + "learning_rate": 6.18623096966618e-07, + "loss": 0.8274, + "step": 26485 + }, + { + "epoch": 0.4124016284337612, + "grad_norm": 3.0677952766418457, + "learning_rate": 6.18541157962013e-07, + "loss": 0.7238, + "step": 26490 + }, + { + "epoch": 0.41247946943573055, + "grad_norm": 4.5266289710998535, + "learning_rate": 6.184592189574081e-07, + "loss": 0.7116, + "step": 26495 + }, + { + "epoch": 0.41255731043769994, + "grad_norm": 7.1209025382995605, + "learning_rate": 6.18377279952803e-07, + "loss": 0.7334, + "step": 26500 + }, + { + "epoch": 0.41263515143966933, + "grad_norm": 2.6258678436279297, + "learning_rate": 6.182953409481981e-07, + "loss": 0.6962, + "step": 26505 + }, + { + "epoch": 0.4127129924416387, + "grad_norm": 2.9764554500579834, + "learning_rate": 6.182134019435932e-07, + "loss": 0.6064, + "step": 26510 + }, + { + "epoch": 0.41279083344360806, + "grad_norm": 3.3296802043914795, + "learning_rate": 6.181314629389882e-07, + "loss": 0.7621, + "step": 26515 + }, + { + "epoch": 0.41286867444557745, + "grad_norm": 4.500085830688477, + "learning_rate": 6.180495239343832e-07, + "loss": 0.5435, + "step": 26520 + }, + { + "epoch": 0.41294651544754685, + "grad_norm": 3.5746519565582275, + "learning_rate": 6.179675849297783e-07, + "loss": 0.7156, + "step": 26525 + }, + { + "epoch": 0.41302435644951624, + "grad_norm": 2.8794310092926025, + "learning_rate": 6.178856459251732e-07, + "loss": 0.7878, + "step": 26530 + }, + { + "epoch": 0.4131021974514856, + "grad_norm": 3.6298744678497314, + "learning_rate": 6.178037069205683e-07, + "loss": 0.8602, + "step": 26535 + }, + { + "epoch": 0.41318003845345497, + "grad_norm": 3.2164409160614014, + "learning_rate": 6.177217679159633e-07, + "loss": 0.756, + "step": 26540 + }, + { + "epoch": 0.41325787945542436, + "grad_norm": 3.0077872276306152, + "learning_rate": 6.176398289113583e-07, + "loss": 0.87, + "step": 26545 + }, + { + "epoch": 0.41333572045739375, + "grad_norm": 3.9495902061462402, + "learning_rate": 6.175578899067534e-07, + "loss": 0.7959, + "step": 26550 + }, + { + "epoch": 0.4134135614593631, + "grad_norm": 4.911811351776123, + "learning_rate": 6.174759509021484e-07, + "loss": 0.7468, + "step": 26555 + }, + { + "epoch": 0.4134914024613325, + "grad_norm": 4.713522434234619, + "learning_rate": 6.173940118975435e-07, + "loss": 0.7409, + "step": 26560 + }, + { + "epoch": 0.41356924346330187, + "grad_norm": 3.8988125324249268, + "learning_rate": 6.173120728929385e-07, + "loss": 0.7385, + "step": 26565 + }, + { + "epoch": 0.41364708446527126, + "grad_norm": 2.3970417976379395, + "learning_rate": 6.172301338883335e-07, + "loss": 0.8039, + "step": 26570 + }, + { + "epoch": 0.4137249254672406, + "grad_norm": 2.4468183517456055, + "learning_rate": 6.171481948837285e-07, + "loss": 0.7449, + "step": 26575 + }, + { + "epoch": 0.41380276646921, + "grad_norm": 3.3654592037200928, + "learning_rate": 6.170662558791235e-07, + "loss": 0.7903, + "step": 26580 + }, + { + "epoch": 0.4138806074711794, + "grad_norm": 3.423818349838257, + "learning_rate": 6.169843168745186e-07, + "loss": 0.7963, + "step": 26585 + }, + { + "epoch": 0.4139584484731487, + "grad_norm": 2.784715175628662, + "learning_rate": 6.169023778699137e-07, + "loss": 0.7938, + "step": 26590 + }, + { + "epoch": 0.4140362894751181, + "grad_norm": 2.766166925430298, + "learning_rate": 6.168204388653086e-07, + "loss": 0.7048, + "step": 26595 + }, + { + "epoch": 0.4141141304770875, + "grad_norm": 3.4769532680511475, + "learning_rate": 6.167384998607037e-07, + "loss": 0.7529, + "step": 26600 + }, + { + "epoch": 0.4141919714790569, + "grad_norm": 3.1103856563568115, + "learning_rate": 6.166565608560988e-07, + "loss": 0.715, + "step": 26605 + }, + { + "epoch": 0.41426981248102623, + "grad_norm": 3.8510234355926514, + "learning_rate": 6.165746218514936e-07, + "loss": 0.7761, + "step": 26610 + }, + { + "epoch": 0.4143476534829956, + "grad_norm": 6.741697788238525, + "learning_rate": 6.164926828468887e-07, + "loss": 0.7113, + "step": 26615 + }, + { + "epoch": 0.414425494484965, + "grad_norm": 2.8889408111572266, + "learning_rate": 6.164107438422838e-07, + "loss": 0.6265, + "step": 26620 + }, + { + "epoch": 0.4145033354869344, + "grad_norm": 9.615153312683105, + "learning_rate": 6.163288048376788e-07, + "loss": 0.8047, + "step": 26625 + }, + { + "epoch": 0.41458117648890375, + "grad_norm": 7.800364971160889, + "learning_rate": 6.162468658330738e-07, + "loss": 0.728, + "step": 26630 + }, + { + "epoch": 0.41465901749087314, + "grad_norm": 2.9394280910491943, + "learning_rate": 6.161649268284689e-07, + "loss": 0.7267, + "step": 26635 + }, + { + "epoch": 0.41473685849284253, + "grad_norm": 4.299229145050049, + "learning_rate": 6.160829878238639e-07, + "loss": 0.9, + "step": 26640 + }, + { + "epoch": 0.4148146994948119, + "grad_norm": 4.3845672607421875, + "learning_rate": 6.16001048819259e-07, + "loss": 0.6087, + "step": 26645 + }, + { + "epoch": 0.41489254049678126, + "grad_norm": 3.6776092052459717, + "learning_rate": 6.159191098146539e-07, + "loss": 0.6703, + "step": 26650 + }, + { + "epoch": 0.41497038149875065, + "grad_norm": 5.197491645812988, + "learning_rate": 6.158371708100489e-07, + "loss": 0.7332, + "step": 26655 + }, + { + "epoch": 0.41504822250072004, + "grad_norm": 3.1170854568481445, + "learning_rate": 6.15755231805444e-07, + "loss": 0.6487, + "step": 26660 + }, + { + "epoch": 0.4151260635026894, + "grad_norm": 4.167092323303223, + "learning_rate": 6.15673292800839e-07, + "loss": 0.8017, + "step": 26665 + }, + { + "epoch": 0.41520390450465877, + "grad_norm": 4.455737113952637, + "learning_rate": 6.155913537962341e-07, + "loss": 0.767, + "step": 26670 + }, + { + "epoch": 0.41528174550662816, + "grad_norm": 5.93499755859375, + "learning_rate": 6.155094147916291e-07, + "loss": 0.8527, + "step": 26675 + }, + { + "epoch": 0.41535958650859756, + "grad_norm": 3.7191712856292725, + "learning_rate": 6.154274757870242e-07, + "loss": 0.8184, + "step": 26680 + }, + { + "epoch": 0.4154374275105669, + "grad_norm": 6.877071380615234, + "learning_rate": 6.153455367824192e-07, + "loss": 0.7743, + "step": 26685 + }, + { + "epoch": 0.4155152685125363, + "grad_norm": 7.615885257720947, + "learning_rate": 6.152635977778141e-07, + "loss": 0.9191, + "step": 26690 + }, + { + "epoch": 0.4155931095145057, + "grad_norm": 3.3617007732391357, + "learning_rate": 6.151816587732092e-07, + "loss": 0.7714, + "step": 26695 + }, + { + "epoch": 0.41567095051647507, + "grad_norm": 2.519935369491577, + "learning_rate": 6.150997197686042e-07, + "loss": 0.765, + "step": 26700 + }, + { + "epoch": 0.4157487915184444, + "grad_norm": 3.2422094345092773, + "learning_rate": 6.150177807639992e-07, + "loss": 0.7896, + "step": 26705 + }, + { + "epoch": 0.4158266325204138, + "grad_norm": 2.624389410018921, + "learning_rate": 6.149358417593943e-07, + "loss": 0.6363, + "step": 26710 + }, + { + "epoch": 0.4159044735223832, + "grad_norm": 3.49373197555542, + "learning_rate": 6.148539027547894e-07, + "loss": 0.8335, + "step": 26715 + }, + { + "epoch": 0.4159823145243526, + "grad_norm": 4.526536464691162, + "learning_rate": 6.147719637501843e-07, + "loss": 0.7632, + "step": 26720 + }, + { + "epoch": 0.4160601555263219, + "grad_norm": 7.346248149871826, + "learning_rate": 6.146900247455794e-07, + "loss": 0.6511, + "step": 26725 + }, + { + "epoch": 0.4161379965282913, + "grad_norm": 6.548068523406982, + "learning_rate": 6.146080857409744e-07, + "loss": 0.7563, + "step": 26730 + }, + { + "epoch": 0.4162158375302607, + "grad_norm": 3.753570079803467, + "learning_rate": 6.145261467363693e-07, + "loss": 0.7684, + "step": 26735 + }, + { + "epoch": 0.4162936785322301, + "grad_norm": 8.335871696472168, + "learning_rate": 6.144442077317644e-07, + "loss": 0.8658, + "step": 26740 + }, + { + "epoch": 0.41637151953419943, + "grad_norm": 4.141759395599365, + "learning_rate": 6.143622687271595e-07, + "loss": 0.8941, + "step": 26745 + }, + { + "epoch": 0.4164493605361688, + "grad_norm": 5.216050148010254, + "learning_rate": 6.142803297225545e-07, + "loss": 0.7769, + "step": 26750 + }, + { + "epoch": 0.4165272015381382, + "grad_norm": 3.003042697906494, + "learning_rate": 6.141983907179495e-07, + "loss": 0.6379, + "step": 26755 + }, + { + "epoch": 0.41660504254010755, + "grad_norm": 5.297163963317871, + "learning_rate": 6.141164517133446e-07, + "loss": 0.8346, + "step": 26760 + }, + { + "epoch": 0.41668288354207694, + "grad_norm": 6.788291931152344, + "learning_rate": 6.140345127087396e-07, + "loss": 0.8368, + "step": 26765 + }, + { + "epoch": 0.41676072454404633, + "grad_norm": 13.022028923034668, + "learning_rate": 6.139525737041347e-07, + "loss": 0.8212, + "step": 26770 + }, + { + "epoch": 0.4168385655460157, + "grad_norm": 3.262367010116577, + "learning_rate": 6.138706346995296e-07, + "loss": 0.6826, + "step": 26775 + }, + { + "epoch": 0.41691640654798506, + "grad_norm": 3.736050844192505, + "learning_rate": 6.137886956949246e-07, + "loss": 0.7972, + "step": 26780 + }, + { + "epoch": 0.41699424754995446, + "grad_norm": 8.119766235351562, + "learning_rate": 6.137067566903197e-07, + "loss": 0.739, + "step": 26785 + }, + { + "epoch": 0.41707208855192385, + "grad_norm": 4.275023460388184, + "learning_rate": 6.136248176857147e-07, + "loss": 0.8204, + "step": 26790 + }, + { + "epoch": 0.41714992955389324, + "grad_norm": 3.132134437561035, + "learning_rate": 6.135428786811098e-07, + "loss": 0.7618, + "step": 26795 + }, + { + "epoch": 0.4172277705558626, + "grad_norm": 3.638437271118164, + "learning_rate": 6.134609396765048e-07, + "loss": 0.7926, + "step": 26800 + }, + { + "epoch": 0.41730561155783197, + "grad_norm": 3.03761625289917, + "learning_rate": 6.133790006718999e-07, + "loss": 0.7097, + "step": 26805 + }, + { + "epoch": 0.41738345255980136, + "grad_norm": 5.049458026885986, + "learning_rate": 6.132970616672949e-07, + "loss": 0.7085, + "step": 26810 + }, + { + "epoch": 0.41746129356177075, + "grad_norm": 3.4833250045776367, + "learning_rate": 6.132151226626898e-07, + "loss": 0.7813, + "step": 26815 + }, + { + "epoch": 0.4175391345637401, + "grad_norm": 3.814908027648926, + "learning_rate": 6.131331836580849e-07, + "loss": 0.7807, + "step": 26820 + }, + { + "epoch": 0.4176169755657095, + "grad_norm": 3.30277419090271, + "learning_rate": 6.1305124465348e-07, + "loss": 0.7181, + "step": 26825 + }, + { + "epoch": 0.4176948165676789, + "grad_norm": 3.3809800148010254, + "learning_rate": 6.129693056488749e-07, + "loss": 0.7821, + "step": 26830 + }, + { + "epoch": 0.4177726575696482, + "grad_norm": 7.558841705322266, + "learning_rate": 6.1288736664427e-07, + "loss": 0.6552, + "step": 26835 + }, + { + "epoch": 0.4178504985716176, + "grad_norm": 5.2530975341796875, + "learning_rate": 6.128054276396651e-07, + "loss": 0.7189, + "step": 26840 + }, + { + "epoch": 0.417928339573587, + "grad_norm": 6.695279121398926, + "learning_rate": 6.1272348863506e-07, + "loss": 0.6808, + "step": 26845 + }, + { + "epoch": 0.4180061805755564, + "grad_norm": 4.834892749786377, + "learning_rate": 6.126415496304551e-07, + "loss": 0.7211, + "step": 26850 + }, + { + "epoch": 0.4180840215775257, + "grad_norm": 4.291613578796387, + "learning_rate": 6.125596106258501e-07, + "loss": 0.7151, + "step": 26855 + }, + { + "epoch": 0.4181618625794951, + "grad_norm": 6.921629428863525, + "learning_rate": 6.12477671621245e-07, + "loss": 0.811, + "step": 26860 + }, + { + "epoch": 0.4182397035814645, + "grad_norm": 3.956561326980591, + "learning_rate": 6.123957326166401e-07, + "loss": 0.782, + "step": 26865 + }, + { + "epoch": 0.4183175445834339, + "grad_norm": 2.7356879711151123, + "learning_rate": 6.123137936120352e-07, + "loss": 0.6748, + "step": 26870 + }, + { + "epoch": 0.41839538558540323, + "grad_norm": 3.5307228565216064, + "learning_rate": 6.122318546074302e-07, + "loss": 0.7285, + "step": 26875 + }, + { + "epoch": 0.4184732265873726, + "grad_norm": 3.408752679824829, + "learning_rate": 6.121499156028252e-07, + "loss": 0.6309, + "step": 26880 + }, + { + "epoch": 0.418551067589342, + "grad_norm": 3.754222869873047, + "learning_rate": 6.120679765982203e-07, + "loss": 0.721, + "step": 26885 + }, + { + "epoch": 0.4186289085913114, + "grad_norm": 3.735696315765381, + "learning_rate": 6.119860375936153e-07, + "loss": 0.7713, + "step": 26890 + }, + { + "epoch": 0.41870674959328075, + "grad_norm": 4.097355365753174, + "learning_rate": 6.119040985890103e-07, + "loss": 0.7273, + "step": 26895 + }, + { + "epoch": 0.41878459059525014, + "grad_norm": 9.404135704040527, + "learning_rate": 6.118221595844053e-07, + "loss": 0.8026, + "step": 26900 + }, + { + "epoch": 0.41886243159721953, + "grad_norm": 4.844088077545166, + "learning_rate": 6.117402205798003e-07, + "loss": 0.8528, + "step": 26905 + }, + { + "epoch": 0.4189402725991889, + "grad_norm": 3.7302608489990234, + "learning_rate": 6.116582815751954e-07, + "loss": 0.8019, + "step": 26910 + }, + { + "epoch": 0.41901811360115826, + "grad_norm": 5.881189823150635, + "learning_rate": 6.115763425705905e-07, + "loss": 0.6222, + "step": 26915 + }, + { + "epoch": 0.41909595460312765, + "grad_norm": 2.738311529159546, + "learning_rate": 6.114944035659855e-07, + "loss": 0.6529, + "step": 26920 + }, + { + "epoch": 0.41917379560509704, + "grad_norm": 3.3108832836151123, + "learning_rate": 6.114124645613805e-07, + "loss": 0.7377, + "step": 26925 + }, + { + "epoch": 0.4192516366070664, + "grad_norm": 3.1821160316467285, + "learning_rate": 6.113305255567756e-07, + "loss": 0.7788, + "step": 26930 + }, + { + "epoch": 0.4193294776090358, + "grad_norm": 2.7442448139190674, + "learning_rate": 6.112485865521705e-07, + "loss": 0.7053, + "step": 26935 + }, + { + "epoch": 0.41940731861100516, + "grad_norm": 2.8890326023101807, + "learning_rate": 6.111666475475655e-07, + "loss": 0.7762, + "step": 26940 + }, + { + "epoch": 0.41948515961297456, + "grad_norm": 4.088584899902344, + "learning_rate": 6.110847085429606e-07, + "loss": 0.8523, + "step": 26945 + }, + { + "epoch": 0.4195630006149439, + "grad_norm": 6.8917341232299805, + "learning_rate": 6.110027695383557e-07, + "loss": 0.8087, + "step": 26950 + }, + { + "epoch": 0.4196408416169133, + "grad_norm": 5.052600383758545, + "learning_rate": 6.109208305337506e-07, + "loss": 0.7649, + "step": 26955 + }, + { + "epoch": 0.4197186826188827, + "grad_norm": 6.6367621421813965, + "learning_rate": 6.108388915291457e-07, + "loss": 0.7778, + "step": 26960 + }, + { + "epoch": 0.41979652362085207, + "grad_norm": 7.393866539001465, + "learning_rate": 6.107569525245408e-07, + "loss": 0.6788, + "step": 26965 + }, + { + "epoch": 0.4198743646228214, + "grad_norm": 11.517690658569336, + "learning_rate": 6.106750135199357e-07, + "loss": 0.7605, + "step": 26970 + }, + { + "epoch": 0.4199522056247908, + "grad_norm": 5.074392795562744, + "learning_rate": 6.105930745153307e-07, + "loss": 0.6848, + "step": 26975 + }, + { + "epoch": 0.4200300466267602, + "grad_norm": 6.310015678405762, + "learning_rate": 6.105111355107258e-07, + "loss": 0.8061, + "step": 26980 + }, + { + "epoch": 0.4201078876287296, + "grad_norm": 3.8176333904266357, + "learning_rate": 6.104291965061208e-07, + "loss": 0.8268, + "step": 26985 + }, + { + "epoch": 0.4201857286306989, + "grad_norm": 3.844820737838745, + "learning_rate": 6.103472575015158e-07, + "loss": 0.6873, + "step": 26990 + }, + { + "epoch": 0.4202635696326683, + "grad_norm": 5.099147796630859, + "learning_rate": 6.102653184969109e-07, + "loss": 0.6911, + "step": 26995 + }, + { + "epoch": 0.4203414106346377, + "grad_norm": 4.644072532653809, + "learning_rate": 6.101833794923059e-07, + "loss": 0.7186, + "step": 27000 + }, + { + "epoch": 0.42041925163660704, + "grad_norm": 5.660889625549316, + "learning_rate": 6.10101440487701e-07, + "loss": 0.8471, + "step": 27005 + }, + { + "epoch": 0.42049709263857643, + "grad_norm": 3.6035196781158447, + "learning_rate": 6.10019501483096e-07, + "loss": 0.7778, + "step": 27010 + }, + { + "epoch": 0.4205749336405458, + "grad_norm": 3.680065393447876, + "learning_rate": 6.099375624784909e-07, + "loss": 0.8525, + "step": 27015 + }, + { + "epoch": 0.4206527746425152, + "grad_norm": 4.620081901550293, + "learning_rate": 6.09855623473886e-07, + "loss": 0.7906, + "step": 27020 + }, + { + "epoch": 0.42073061564448455, + "grad_norm": 2.889920473098755, + "learning_rate": 6.09773684469281e-07, + "loss": 0.7267, + "step": 27025 + }, + { + "epoch": 0.42080845664645394, + "grad_norm": 5.654252052307129, + "learning_rate": 6.09691745464676e-07, + "loss": 0.7148, + "step": 27030 + }, + { + "epoch": 0.42088629764842334, + "grad_norm": 6.712372303009033, + "learning_rate": 6.096098064600711e-07, + "loss": 0.7575, + "step": 27035 + }, + { + "epoch": 0.4209641386503927, + "grad_norm": 2.887517213821411, + "learning_rate": 6.095278674554662e-07, + "loss": 0.7287, + "step": 27040 + }, + { + "epoch": 0.42104197965236206, + "grad_norm": 3.611146926879883, + "learning_rate": 6.094459284508612e-07, + "loss": 0.8218, + "step": 27045 + }, + { + "epoch": 0.42111982065433146, + "grad_norm": 3.5884053707122803, + "learning_rate": 6.093639894462562e-07, + "loss": 0.7259, + "step": 27050 + }, + { + "epoch": 0.42119766165630085, + "grad_norm": 6.537292003631592, + "learning_rate": 6.092820504416512e-07, + "loss": 0.7751, + "step": 27055 + }, + { + "epoch": 0.42127550265827024, + "grad_norm": 3.2337849140167236, + "learning_rate": 6.092001114370462e-07, + "loss": 0.722, + "step": 27060 + }, + { + "epoch": 0.4213533436602396, + "grad_norm": 3.9660308361053467, + "learning_rate": 6.091181724324412e-07, + "loss": 0.7699, + "step": 27065 + }, + { + "epoch": 0.42143118466220897, + "grad_norm": 3.0891098976135254, + "learning_rate": 6.090362334278363e-07, + "loss": 0.8119, + "step": 27070 + }, + { + "epoch": 0.42150902566417836, + "grad_norm": 4.106495380401611, + "learning_rate": 6.089542944232314e-07, + "loss": 0.7763, + "step": 27075 + }, + { + "epoch": 0.42158686666614775, + "grad_norm": 3.0975704193115234, + "learning_rate": 6.088723554186263e-07, + "loss": 0.819, + "step": 27080 + }, + { + "epoch": 0.4216647076681171, + "grad_norm": 3.7811708450317383, + "learning_rate": 6.087904164140214e-07, + "loss": 0.8506, + "step": 27085 + }, + { + "epoch": 0.4217425486700865, + "grad_norm": 4.70298957824707, + "learning_rate": 6.087084774094165e-07, + "loss": 0.8322, + "step": 27090 + }, + { + "epoch": 0.4218203896720559, + "grad_norm": 2.7160446643829346, + "learning_rate": 6.086265384048115e-07, + "loss": 0.7179, + "step": 27095 + }, + { + "epoch": 0.4218982306740252, + "grad_norm": 5.256832122802734, + "learning_rate": 6.085445994002064e-07, + "loss": 0.7015, + "step": 27100 + }, + { + "epoch": 0.4219760716759946, + "grad_norm": 6.849483013153076, + "learning_rate": 6.084626603956015e-07, + "loss": 0.7913, + "step": 27105 + }, + { + "epoch": 0.422053912677964, + "grad_norm": 2.9917688369750977, + "learning_rate": 6.083807213909965e-07, + "loss": 0.7566, + "step": 27110 + }, + { + "epoch": 0.4221317536799334, + "grad_norm": 3.3257033824920654, + "learning_rate": 6.082987823863915e-07, + "loss": 0.6861, + "step": 27115 + }, + { + "epoch": 0.4222095946819027, + "grad_norm": 3.04402232170105, + "learning_rate": 6.082168433817866e-07, + "loss": 0.779, + "step": 27120 + }, + { + "epoch": 0.4222874356838721, + "grad_norm": 2.7107558250427246, + "learning_rate": 6.081349043771816e-07, + "loss": 0.7857, + "step": 27125 + }, + { + "epoch": 0.4223652766858415, + "grad_norm": 8.305509567260742, + "learning_rate": 6.080529653725767e-07, + "loss": 0.8263, + "step": 27130 + }, + { + "epoch": 0.4224431176878109, + "grad_norm": 3.3207385540008545, + "learning_rate": 6.079710263679717e-07, + "loss": 0.7337, + "step": 27135 + }, + { + "epoch": 0.42252095868978023, + "grad_norm": 5.155167102813721, + "learning_rate": 6.078890873633666e-07, + "loss": 0.8061, + "step": 27140 + }, + { + "epoch": 0.4225987996917496, + "grad_norm": 7.670403003692627, + "learning_rate": 6.078071483587617e-07, + "loss": 0.7951, + "step": 27145 + }, + { + "epoch": 0.422676640693719, + "grad_norm": 3.5299322605133057, + "learning_rate": 6.077252093541567e-07, + "loss": 0.7142, + "step": 27150 + }, + { + "epoch": 0.4227544816956884, + "grad_norm": 2.901541233062744, + "learning_rate": 6.076432703495517e-07, + "loss": 0.7646, + "step": 27155 + }, + { + "epoch": 0.42283232269765775, + "grad_norm": 2.260049343109131, + "learning_rate": 6.075613313449468e-07, + "loss": 0.7741, + "step": 27160 + }, + { + "epoch": 0.42291016369962714, + "grad_norm": 2.8874802589416504, + "learning_rate": 6.074793923403419e-07, + "loss": 0.6199, + "step": 27165 + }, + { + "epoch": 0.42298800470159653, + "grad_norm": 5.7339301109313965, + "learning_rate": 6.073974533357369e-07, + "loss": 0.7064, + "step": 27170 + }, + { + "epoch": 0.4230658457035659, + "grad_norm": 3.6913132667541504, + "learning_rate": 6.073155143311319e-07, + "loss": 0.8068, + "step": 27175 + }, + { + "epoch": 0.42314368670553526, + "grad_norm": 3.6943602561950684, + "learning_rate": 6.072335753265269e-07, + "loss": 0.804, + "step": 27180 + }, + { + "epoch": 0.42322152770750465, + "grad_norm": 3.9005420207977295, + "learning_rate": 6.07151636321922e-07, + "loss": 0.6257, + "step": 27185 + }, + { + "epoch": 0.42329936870947404, + "grad_norm": 5.157369613647461, + "learning_rate": 6.070696973173169e-07, + "loss": 0.7783, + "step": 27190 + }, + { + "epoch": 0.4233772097114434, + "grad_norm": 7.655777931213379, + "learning_rate": 6.06987758312712e-07, + "loss": 0.7452, + "step": 27195 + }, + { + "epoch": 0.4234550507134128, + "grad_norm": 5.108643054962158, + "learning_rate": 6.069058193081071e-07, + "loss": 0.6303, + "step": 27200 + }, + { + "epoch": 0.42353289171538216, + "grad_norm": 3.364788293838501, + "learning_rate": 6.06823880303502e-07, + "loss": 0.7605, + "step": 27205 + }, + { + "epoch": 0.42361073271735156, + "grad_norm": 7.872039794921875, + "learning_rate": 6.067419412988971e-07, + "loss": 0.7943, + "step": 27210 + }, + { + "epoch": 0.4236885737193209, + "grad_norm": 3.3262274265289307, + "learning_rate": 6.066600022942922e-07, + "loss": 0.7413, + "step": 27215 + }, + { + "epoch": 0.4237664147212903, + "grad_norm": 6.564119815826416, + "learning_rate": 6.06578063289687e-07, + "loss": 0.751, + "step": 27220 + }, + { + "epoch": 0.4238442557232597, + "grad_norm": 3.2822859287261963, + "learning_rate": 6.064961242850821e-07, + "loss": 0.7151, + "step": 27225 + }, + { + "epoch": 0.42392209672522907, + "grad_norm": 3.66558575630188, + "learning_rate": 6.064141852804772e-07, + "loss": 0.7304, + "step": 27230 + }, + { + "epoch": 0.4239999377271984, + "grad_norm": 4.87346887588501, + "learning_rate": 6.063322462758722e-07, + "loss": 0.6916, + "step": 27235 + }, + { + "epoch": 0.4240777787291678, + "grad_norm": 6.911497592926025, + "learning_rate": 6.062503072712672e-07, + "loss": 0.8004, + "step": 27240 + }, + { + "epoch": 0.4241556197311372, + "grad_norm": 6.917334079742432, + "learning_rate": 6.061683682666623e-07, + "loss": 0.8152, + "step": 27245 + }, + { + "epoch": 0.4242334607331066, + "grad_norm": 4.561256408691406, + "learning_rate": 6.060864292620573e-07, + "loss": 0.6921, + "step": 27250 + }, + { + "epoch": 0.4243113017350759, + "grad_norm": 3.4769482612609863, + "learning_rate": 6.060044902574524e-07, + "loss": 0.6964, + "step": 27255 + }, + { + "epoch": 0.4243891427370453, + "grad_norm": 6.042318344116211, + "learning_rate": 6.059225512528473e-07, + "loss": 0.6912, + "step": 27260 + }, + { + "epoch": 0.4244669837390147, + "grad_norm": 10.03291130065918, + "learning_rate": 6.058406122482423e-07, + "loss": 0.7412, + "step": 27265 + }, + { + "epoch": 0.42454482474098404, + "grad_norm": 4.805675983428955, + "learning_rate": 6.057586732436374e-07, + "loss": 0.6488, + "step": 27270 + }, + { + "epoch": 0.42462266574295343, + "grad_norm": 3.6138381958007812, + "learning_rate": 6.056767342390325e-07, + "loss": 0.6861, + "step": 27275 + }, + { + "epoch": 0.4247005067449228, + "grad_norm": 3.0130345821380615, + "learning_rate": 6.055947952344274e-07, + "loss": 0.7606, + "step": 27280 + }, + { + "epoch": 0.4247783477468922, + "grad_norm": 7.257308006286621, + "learning_rate": 6.055128562298225e-07, + "loss": 0.724, + "step": 27285 + }, + { + "epoch": 0.42485618874886155, + "grad_norm": 3.798450469970703, + "learning_rate": 6.054309172252176e-07, + "loss": 0.8094, + "step": 27290 + }, + { + "epoch": 0.42493402975083094, + "grad_norm": 4.286552429199219, + "learning_rate": 6.053489782206126e-07, + "loss": 0.7206, + "step": 27295 + }, + { + "epoch": 0.42501187075280034, + "grad_norm": 7.024591445922852, + "learning_rate": 6.052670392160075e-07, + "loss": 0.693, + "step": 27300 + }, + { + "epoch": 0.42508971175476973, + "grad_norm": 2.7912750244140625, + "learning_rate": 6.051851002114026e-07, + "loss": 0.7163, + "step": 27305 + }, + { + "epoch": 0.42516755275673906, + "grad_norm": 4.904073715209961, + "learning_rate": 6.051031612067977e-07, + "loss": 0.7379, + "step": 27310 + }, + { + "epoch": 0.42524539375870846, + "grad_norm": 4.198714256286621, + "learning_rate": 6.050212222021926e-07, + "loss": 0.6968, + "step": 27315 + }, + { + "epoch": 0.42532323476067785, + "grad_norm": 8.80378532409668, + "learning_rate": 6.049392831975877e-07, + "loss": 0.7086, + "step": 27320 + }, + { + "epoch": 0.42540107576264724, + "grad_norm": 2.636580467224121, + "learning_rate": 6.048573441929828e-07, + "loss": 0.6348, + "step": 27325 + }, + { + "epoch": 0.4254789167646166, + "grad_norm": 2.9006733894348145, + "learning_rate": 6.047754051883777e-07, + "loss": 0.7363, + "step": 27330 + }, + { + "epoch": 0.42555675776658597, + "grad_norm": 10.586029052734375, + "learning_rate": 6.046934661837728e-07, + "loss": 0.8535, + "step": 27335 + }, + { + "epoch": 0.42563459876855536, + "grad_norm": 7.683661937713623, + "learning_rate": 6.046115271791678e-07, + "loss": 0.6484, + "step": 27340 + }, + { + "epoch": 0.42571243977052475, + "grad_norm": 4.323159217834473, + "learning_rate": 6.045295881745628e-07, + "loss": 0.8111, + "step": 27345 + }, + { + "epoch": 0.4257902807724941, + "grad_norm": 3.4367847442626953, + "learning_rate": 6.044476491699578e-07, + "loss": 0.7728, + "step": 27350 + }, + { + "epoch": 0.4258681217744635, + "grad_norm": 4.658853530883789, + "learning_rate": 6.043657101653529e-07, + "loss": 0.8023, + "step": 27355 + }, + { + "epoch": 0.4259459627764329, + "grad_norm": 4.2418036460876465, + "learning_rate": 6.042837711607479e-07, + "loss": 0.8127, + "step": 27360 + }, + { + "epoch": 0.4260238037784022, + "grad_norm": 5.615492343902588, + "learning_rate": 6.04201832156143e-07, + "loss": 0.7324, + "step": 27365 + }, + { + "epoch": 0.4261016447803716, + "grad_norm": 5.688205718994141, + "learning_rate": 6.04119893151538e-07, + "loss": 0.7359, + "step": 27370 + }, + { + "epoch": 0.426179485782341, + "grad_norm": 2.9322752952575684, + "learning_rate": 6.04037954146933e-07, + "loss": 0.7647, + "step": 27375 + }, + { + "epoch": 0.4262573267843104, + "grad_norm": 4.038652420043945, + "learning_rate": 6.039560151423281e-07, + "loss": 0.7031, + "step": 27380 + }, + { + "epoch": 0.4263351677862797, + "grad_norm": 3.9757728576660156, + "learning_rate": 6.03874076137723e-07, + "loss": 0.7481, + "step": 27385 + }, + { + "epoch": 0.4264130087882491, + "grad_norm": 3.1250228881835938, + "learning_rate": 6.03792137133118e-07, + "loss": 0.8038, + "step": 27390 + }, + { + "epoch": 0.4264908497902185, + "grad_norm": 3.7392306327819824, + "learning_rate": 6.037101981285131e-07, + "loss": 0.7122, + "step": 27395 + }, + { + "epoch": 0.4265686907921879, + "grad_norm": 3.21966814994812, + "learning_rate": 6.036282591239082e-07, + "loss": 0.7205, + "step": 27400 + }, + { + "epoch": 0.42664653179415724, + "grad_norm": 3.8706374168395996, + "learning_rate": 6.035463201193031e-07, + "loss": 0.8379, + "step": 27405 + }, + { + "epoch": 0.4267243727961266, + "grad_norm": 3.5687901973724365, + "learning_rate": 6.034643811146982e-07, + "loss": 0.7195, + "step": 27410 + }, + { + "epoch": 0.426802213798096, + "grad_norm": 3.3177990913391113, + "learning_rate": 6.033824421100933e-07, + "loss": 0.7446, + "step": 27415 + }, + { + "epoch": 0.4268800548000654, + "grad_norm": 3.827878475189209, + "learning_rate": 6.033005031054884e-07, + "loss": 0.8047, + "step": 27420 + }, + { + "epoch": 0.42695789580203475, + "grad_norm": 5.776741027832031, + "learning_rate": 6.032185641008832e-07, + "loss": 0.7703, + "step": 27425 + }, + { + "epoch": 0.42703573680400414, + "grad_norm": 2.8187882900238037, + "learning_rate": 6.031366250962783e-07, + "loss": 0.6875, + "step": 27430 + }, + { + "epoch": 0.42711357780597353, + "grad_norm": 3.621317148208618, + "learning_rate": 6.030546860916734e-07, + "loss": 0.8284, + "step": 27435 + }, + { + "epoch": 0.42719141880794287, + "grad_norm": 8.660701751708984, + "learning_rate": 6.029727470870683e-07, + "loss": 0.877, + "step": 27440 + }, + { + "epoch": 0.42726925980991226, + "grad_norm": 4.218385696411133, + "learning_rate": 6.028908080824634e-07, + "loss": 0.7182, + "step": 27445 + }, + { + "epoch": 0.42734710081188165, + "grad_norm": 3.3534364700317383, + "learning_rate": 6.028088690778585e-07, + "loss": 0.7942, + "step": 27450 + }, + { + "epoch": 0.42742494181385104, + "grad_norm": 7.34171724319458, + "learning_rate": 6.027269300732535e-07, + "loss": 0.7839, + "step": 27455 + }, + { + "epoch": 0.4275027828158204, + "grad_norm": 4.382108211517334, + "learning_rate": 6.026449910686485e-07, + "loss": 0.7193, + "step": 27460 + }, + { + "epoch": 0.4275806238177898, + "grad_norm": 9.386519432067871, + "learning_rate": 6.025630520640435e-07, + "loss": 0.7651, + "step": 27465 + }, + { + "epoch": 0.42765846481975917, + "grad_norm": 5.13021183013916, + "learning_rate": 6.024811130594385e-07, + "loss": 0.7346, + "step": 27470 + }, + { + "epoch": 0.42773630582172856, + "grad_norm": 3.906785249710083, + "learning_rate": 6.023991740548335e-07, + "loss": 0.9051, + "step": 27475 + }, + { + "epoch": 0.4278141468236979, + "grad_norm": 3.4074766635894775, + "learning_rate": 6.023172350502286e-07, + "loss": 0.633, + "step": 27480 + }, + { + "epoch": 0.4278919878256673, + "grad_norm": 3.513617753982544, + "learning_rate": 6.022352960456236e-07, + "loss": 0.7182, + "step": 27485 + }, + { + "epoch": 0.4279698288276367, + "grad_norm": 3.953442335128784, + "learning_rate": 6.021533570410187e-07, + "loss": 0.8214, + "step": 27490 + }, + { + "epoch": 0.42804766982960607, + "grad_norm": 2.662166118621826, + "learning_rate": 6.020714180364137e-07, + "loss": 0.809, + "step": 27495 + }, + { + "epoch": 0.4281255108315754, + "grad_norm": 3.2333011627197266, + "learning_rate": 6.019894790318087e-07, + "loss": 0.8559, + "step": 27500 + }, + { + "epoch": 0.4282033518335448, + "grad_norm": 4.552109718322754, + "learning_rate": 6.019075400272037e-07, + "loss": 0.8152, + "step": 27505 + }, + { + "epoch": 0.4282811928355142, + "grad_norm": 3.784883737564087, + "learning_rate": 6.018256010225987e-07, + "loss": 0.6774, + "step": 27510 + }, + { + "epoch": 0.4283590338374836, + "grad_norm": 3.5555851459503174, + "learning_rate": 6.017436620179937e-07, + "loss": 0.7077, + "step": 27515 + }, + { + "epoch": 0.4284368748394529, + "grad_norm": 3.111800193786621, + "learning_rate": 6.016617230133888e-07, + "loss": 0.742, + "step": 27520 + }, + { + "epoch": 0.4285147158414223, + "grad_norm": 4.403416633605957, + "learning_rate": 6.015797840087839e-07, + "loss": 0.793, + "step": 27525 + }, + { + "epoch": 0.4285925568433917, + "grad_norm": 3.6969947814941406, + "learning_rate": 6.014978450041788e-07, + "loss": 0.7078, + "step": 27530 + }, + { + "epoch": 0.42867039784536104, + "grad_norm": 7.605321407318115, + "learning_rate": 6.014159059995739e-07, + "loss": 0.8086, + "step": 27535 + }, + { + "epoch": 0.42874823884733043, + "grad_norm": 3.8635454177856445, + "learning_rate": 6.01333966994969e-07, + "loss": 0.7304, + "step": 27540 + }, + { + "epoch": 0.4288260798492998, + "grad_norm": 4.390786647796631, + "learning_rate": 6.012520279903638e-07, + "loss": 0.8371, + "step": 27545 + }, + { + "epoch": 0.4289039208512692, + "grad_norm": 3.810314655303955, + "learning_rate": 6.011700889857589e-07, + "loss": 0.7858, + "step": 27550 + }, + { + "epoch": 0.42898176185323855, + "grad_norm": 3.335005283355713, + "learning_rate": 6.01088149981154e-07, + "loss": 0.7657, + "step": 27555 + }, + { + "epoch": 0.42905960285520794, + "grad_norm": 3.9059786796569824, + "learning_rate": 6.010062109765491e-07, + "loss": 0.6121, + "step": 27560 + }, + { + "epoch": 0.42913744385717734, + "grad_norm": 7.90656042098999, + "learning_rate": 6.00924271971944e-07, + "loss": 0.7579, + "step": 27565 + }, + { + "epoch": 0.42921528485914673, + "grad_norm": 2.571089506149292, + "learning_rate": 6.008423329673391e-07, + "loss": 0.7464, + "step": 27570 + }, + { + "epoch": 0.42929312586111606, + "grad_norm": 3.8170342445373535, + "learning_rate": 6.007603939627342e-07, + "loss": 0.8353, + "step": 27575 + }, + { + "epoch": 0.42937096686308546, + "grad_norm": 3.4820406436920166, + "learning_rate": 6.006784549581292e-07, + "loss": 0.7535, + "step": 27580 + }, + { + "epoch": 0.42944880786505485, + "grad_norm": 2.870199203491211, + "learning_rate": 6.005965159535241e-07, + "loss": 0.8004, + "step": 27585 + }, + { + "epoch": 0.42952664886702424, + "grad_norm": 3.029043674468994, + "learning_rate": 6.005145769489192e-07, + "loss": 0.7063, + "step": 27590 + }, + { + "epoch": 0.4296044898689936, + "grad_norm": 5.453882694244385, + "learning_rate": 6.004326379443142e-07, + "loss": 0.814, + "step": 27595 + }, + { + "epoch": 0.42968233087096297, + "grad_norm": 3.331883430480957, + "learning_rate": 6.003506989397092e-07, + "loss": 0.787, + "step": 27600 + }, + { + "epoch": 0.42976017187293236, + "grad_norm": 2.395042657852173, + "learning_rate": 6.002687599351043e-07, + "loss": 0.6732, + "step": 27605 + }, + { + "epoch": 0.4298380128749017, + "grad_norm": 7.470876216888428, + "learning_rate": 6.001868209304993e-07, + "loss": 0.6835, + "step": 27610 + }, + { + "epoch": 0.4299158538768711, + "grad_norm": 3.4728715419769287, + "learning_rate": 6.001048819258944e-07, + "loss": 0.6952, + "step": 27615 + }, + { + "epoch": 0.4299936948788405, + "grad_norm": 2.973357677459717, + "learning_rate": 6.000229429212894e-07, + "loss": 0.6941, + "step": 27620 + }, + { + "epoch": 0.4300715358808099, + "grad_norm": 2.8671586513519287, + "learning_rate": 5.999410039166843e-07, + "loss": 0.6964, + "step": 27625 + }, + { + "epoch": 0.4301493768827792, + "grad_norm": 3.881767749786377, + "learning_rate": 5.998590649120794e-07, + "loss": 0.6912, + "step": 27630 + }, + { + "epoch": 0.4302272178847486, + "grad_norm": 3.46494460105896, + "learning_rate": 5.997771259074745e-07, + "loss": 0.6453, + "step": 27635 + }, + { + "epoch": 0.430305058886718, + "grad_norm": 6.151224613189697, + "learning_rate": 5.996951869028694e-07, + "loss": 0.8322, + "step": 27640 + }, + { + "epoch": 0.4303828998886874, + "grad_norm": 4.667808532714844, + "learning_rate": 5.996132478982645e-07, + "loss": 0.8206, + "step": 27645 + }, + { + "epoch": 0.4304607408906567, + "grad_norm": 5.188647747039795, + "learning_rate": 5.995313088936596e-07, + "loss": 0.663, + "step": 27650 + }, + { + "epoch": 0.4305385818926261, + "grad_norm": 9.459535598754883, + "learning_rate": 5.994493698890545e-07, + "loss": 0.665, + "step": 27655 + }, + { + "epoch": 0.4306164228945955, + "grad_norm": 2.7770302295684814, + "learning_rate": 5.993674308844496e-07, + "loss": 0.696, + "step": 27660 + }, + { + "epoch": 0.4306942638965649, + "grad_norm": 3.644118070602417, + "learning_rate": 5.992854918798446e-07, + "loss": 0.5877, + "step": 27665 + }, + { + "epoch": 0.43077210489853424, + "grad_norm": 3.66776967048645, + "learning_rate": 5.992035528752396e-07, + "loss": 0.8004, + "step": 27670 + }, + { + "epoch": 0.43084994590050363, + "grad_norm": 3.4178576469421387, + "learning_rate": 5.991216138706346e-07, + "loss": 0.8062, + "step": 27675 + }, + { + "epoch": 0.430927786902473, + "grad_norm": 7.568190574645996, + "learning_rate": 5.990396748660297e-07, + "loss": 0.7219, + "step": 27680 + }, + { + "epoch": 0.4310056279044424, + "grad_norm": 3.1899938583374023, + "learning_rate": 5.989577358614248e-07, + "loss": 0.6335, + "step": 27685 + }, + { + "epoch": 0.43108346890641175, + "grad_norm": 4.04885721206665, + "learning_rate": 5.988757968568197e-07, + "loss": 0.8045, + "step": 27690 + }, + { + "epoch": 0.43116130990838114, + "grad_norm": 3.311727285385132, + "learning_rate": 5.987938578522148e-07, + "loss": 0.8141, + "step": 27695 + }, + { + "epoch": 0.43123915091035053, + "grad_norm": 2.6867873668670654, + "learning_rate": 5.987119188476099e-07, + "loss": 0.7318, + "step": 27700 + }, + { + "epoch": 0.43131699191231987, + "grad_norm": 3.6827845573425293, + "learning_rate": 5.986299798430049e-07, + "loss": 0.7296, + "step": 27705 + }, + { + "epoch": 0.43139483291428926, + "grad_norm": 4.631564140319824, + "learning_rate": 5.985480408383998e-07, + "loss": 0.714, + "step": 27710 + }, + { + "epoch": 0.43147267391625865, + "grad_norm": 4.947175979614258, + "learning_rate": 5.984661018337949e-07, + "loss": 0.892, + "step": 27715 + }, + { + "epoch": 0.43155051491822805, + "grad_norm": 2.701998233795166, + "learning_rate": 5.983841628291899e-07, + "loss": 0.7073, + "step": 27720 + }, + { + "epoch": 0.4316283559201974, + "grad_norm": 4.04004430770874, + "learning_rate": 5.98302223824585e-07, + "loss": 0.8043, + "step": 27725 + }, + { + "epoch": 0.4317061969221668, + "grad_norm": 9.032465934753418, + "learning_rate": 5.9822028481998e-07, + "loss": 0.8812, + "step": 27730 + }, + { + "epoch": 0.43178403792413617, + "grad_norm": 4.29990816116333, + "learning_rate": 5.98138345815375e-07, + "loss": 0.671, + "step": 27735 + }, + { + "epoch": 0.43186187892610556, + "grad_norm": 5.574892520904541, + "learning_rate": 5.980564068107701e-07, + "loss": 0.7746, + "step": 27740 + }, + { + "epoch": 0.4319397199280749, + "grad_norm": 4.6538214683532715, + "learning_rate": 5.979744678061651e-07, + "loss": 0.9419, + "step": 27745 + }, + { + "epoch": 0.4320175609300443, + "grad_norm": 3.485159158706665, + "learning_rate": 5.9789252880156e-07, + "loss": 0.8016, + "step": 27750 + }, + { + "epoch": 0.4320954019320137, + "grad_norm": 4.120754718780518, + "learning_rate": 5.978105897969551e-07, + "loss": 0.8587, + "step": 27755 + }, + { + "epoch": 0.43217324293398307, + "grad_norm": 4.850719451904297, + "learning_rate": 5.977286507923502e-07, + "loss": 0.8688, + "step": 27760 + }, + { + "epoch": 0.4322510839359524, + "grad_norm": 5.697882175445557, + "learning_rate": 5.976467117877451e-07, + "loss": 0.7384, + "step": 27765 + }, + { + "epoch": 0.4323289249379218, + "grad_norm": 3.8523309230804443, + "learning_rate": 5.975647727831402e-07, + "loss": 0.8453, + "step": 27770 + }, + { + "epoch": 0.4324067659398912, + "grad_norm": 4.005265235900879, + "learning_rate": 5.974828337785353e-07, + "loss": 0.7361, + "step": 27775 + }, + { + "epoch": 0.4324846069418606, + "grad_norm": 4.518993377685547, + "learning_rate": 5.974008947739302e-07, + "loss": 0.7459, + "step": 27780 + }, + { + "epoch": 0.4325624479438299, + "grad_norm": 3.953925371170044, + "learning_rate": 5.973189557693253e-07, + "loss": 0.7403, + "step": 27785 + }, + { + "epoch": 0.4326402889457993, + "grad_norm": 4.339183807373047, + "learning_rate": 5.972370167647203e-07, + "loss": 0.7239, + "step": 27790 + }, + { + "epoch": 0.4327181299477687, + "grad_norm": 6.377171039581299, + "learning_rate": 5.971550777601153e-07, + "loss": 0.7259, + "step": 27795 + }, + { + "epoch": 0.43279597094973804, + "grad_norm": 4.8265180587768555, + "learning_rate": 5.970731387555103e-07, + "loss": 0.8928, + "step": 27800 + }, + { + "epoch": 0.43287381195170743, + "grad_norm": 3.585028648376465, + "learning_rate": 5.969911997509054e-07, + "loss": 0.727, + "step": 27805 + }, + { + "epoch": 0.4329516529536768, + "grad_norm": 4.21317195892334, + "learning_rate": 5.969092607463005e-07, + "loss": 0.761, + "step": 27810 + }, + { + "epoch": 0.4330294939556462, + "grad_norm": 3.281339645385742, + "learning_rate": 5.968273217416955e-07, + "loss": 0.8078, + "step": 27815 + }, + { + "epoch": 0.43310733495761555, + "grad_norm": 4.054388523101807, + "learning_rate": 5.967453827370905e-07, + "loss": 0.8311, + "step": 27820 + }, + { + "epoch": 0.43318517595958494, + "grad_norm": 3.831932544708252, + "learning_rate": 5.966634437324856e-07, + "loss": 0.7771, + "step": 27825 + }, + { + "epoch": 0.43326301696155434, + "grad_norm": 6.436086654663086, + "learning_rate": 5.965815047278805e-07, + "loss": 0.8272, + "step": 27830 + }, + { + "epoch": 0.43334085796352373, + "grad_norm": 3.4208126068115234, + "learning_rate": 5.964995657232755e-07, + "loss": 0.6237, + "step": 27835 + }, + { + "epoch": 0.43341869896549307, + "grad_norm": 2.5811595916748047, + "learning_rate": 5.964176267186706e-07, + "loss": 0.7275, + "step": 27840 + }, + { + "epoch": 0.43349653996746246, + "grad_norm": 4.617489814758301, + "learning_rate": 5.963356877140656e-07, + "loss": 0.7164, + "step": 27845 + }, + { + "epoch": 0.43357438096943185, + "grad_norm": 3.223447561264038, + "learning_rate": 5.962537487094607e-07, + "loss": 0.7278, + "step": 27850 + }, + { + "epoch": 0.43365222197140124, + "grad_norm": 3.8689448833465576, + "learning_rate": 5.961718097048557e-07, + "loss": 0.8391, + "step": 27855 + }, + { + "epoch": 0.4337300629733706, + "grad_norm": 3.374972343444824, + "learning_rate": 5.960898707002507e-07, + "loss": 0.7914, + "step": 27860 + }, + { + "epoch": 0.43380790397533997, + "grad_norm": 6.62541389465332, + "learning_rate": 5.960079316956458e-07, + "loss": 0.7388, + "step": 27865 + }, + { + "epoch": 0.43388574497730936, + "grad_norm": 3.3307340145111084, + "learning_rate": 5.959259926910407e-07, + "loss": 0.7181, + "step": 27870 + }, + { + "epoch": 0.4339635859792787, + "grad_norm": 3.8283004760742188, + "learning_rate": 5.958440536864357e-07, + "loss": 0.6944, + "step": 27875 + }, + { + "epoch": 0.4340414269812481, + "grad_norm": 3.793822765350342, + "learning_rate": 5.957621146818308e-07, + "loss": 0.6795, + "step": 27880 + }, + { + "epoch": 0.4341192679832175, + "grad_norm": 3.196544647216797, + "learning_rate": 5.956801756772259e-07, + "loss": 0.8672, + "step": 27885 + }, + { + "epoch": 0.4341971089851869, + "grad_norm": 4.767712116241455, + "learning_rate": 5.955982366726208e-07, + "loss": 0.8061, + "step": 27890 + }, + { + "epoch": 0.4342749499871562, + "grad_norm": 2.1641054153442383, + "learning_rate": 5.955162976680159e-07, + "loss": 0.7623, + "step": 27895 + }, + { + "epoch": 0.4343527909891256, + "grad_norm": 3.6267597675323486, + "learning_rate": 5.95434358663411e-07, + "loss": 0.6784, + "step": 27900 + }, + { + "epoch": 0.434430631991095, + "grad_norm": 4.674212455749512, + "learning_rate": 5.95352419658806e-07, + "loss": 0.759, + "step": 27905 + }, + { + "epoch": 0.4345084729930644, + "grad_norm": 6.812900543212891, + "learning_rate": 5.952704806542009e-07, + "loss": 0.7098, + "step": 27910 + }, + { + "epoch": 0.4345863139950337, + "grad_norm": 3.000892162322998, + "learning_rate": 5.95188541649596e-07, + "loss": 0.6528, + "step": 27915 + }, + { + "epoch": 0.4346641549970031, + "grad_norm": 13.540499687194824, + "learning_rate": 5.95106602644991e-07, + "loss": 0.7468, + "step": 27920 + }, + { + "epoch": 0.4347419959989725, + "grad_norm": 4.585391998291016, + "learning_rate": 5.95024663640386e-07, + "loss": 0.6364, + "step": 27925 + }, + { + "epoch": 0.4348198370009419, + "grad_norm": 5.876636505126953, + "learning_rate": 5.949427246357811e-07, + "loss": 0.763, + "step": 27930 + }, + { + "epoch": 0.43489767800291124, + "grad_norm": 4.930020809173584, + "learning_rate": 5.948607856311762e-07, + "loss": 0.5975, + "step": 27935 + }, + { + "epoch": 0.43497551900488063, + "grad_norm": 4.648181438446045, + "learning_rate": 5.947788466265712e-07, + "loss": 0.7269, + "step": 27940 + }, + { + "epoch": 0.43505336000685, + "grad_norm": 4.776635646820068, + "learning_rate": 5.946969076219662e-07, + "loss": 0.6465, + "step": 27945 + }, + { + "epoch": 0.4351312010088194, + "grad_norm": 5.510016918182373, + "learning_rate": 5.946149686173612e-07, + "loss": 0.7738, + "step": 27950 + }, + { + "epoch": 0.43520904201078875, + "grad_norm": 2.281200647354126, + "learning_rate": 5.945330296127562e-07, + "loss": 0.7802, + "step": 27955 + }, + { + "epoch": 0.43528688301275814, + "grad_norm": 3.1887929439544678, + "learning_rate": 5.944510906081512e-07, + "loss": 0.7501, + "step": 27960 + }, + { + "epoch": 0.43536472401472753, + "grad_norm": 3.804816961288452, + "learning_rate": 5.943691516035463e-07, + "loss": 0.6792, + "step": 27965 + }, + { + "epoch": 0.43544256501669687, + "grad_norm": 3.4015350341796875, + "learning_rate": 5.942872125989413e-07, + "loss": 0.8475, + "step": 27970 + }, + { + "epoch": 0.43552040601866626, + "grad_norm": 4.119654178619385, + "learning_rate": 5.942052735943364e-07, + "loss": 0.8177, + "step": 27975 + }, + { + "epoch": 0.43559824702063565, + "grad_norm": 8.41972827911377, + "learning_rate": 5.941233345897314e-07, + "loss": 0.7545, + "step": 27980 + }, + { + "epoch": 0.43567608802260505, + "grad_norm": 5.477794647216797, + "learning_rate": 5.940413955851264e-07, + "loss": 0.7307, + "step": 27985 + }, + { + "epoch": 0.4357539290245744, + "grad_norm": 7.2264838218688965, + "learning_rate": 5.939594565805214e-07, + "loss": 0.8613, + "step": 27990 + }, + { + "epoch": 0.4358317700265438, + "grad_norm": 3.4156153202056885, + "learning_rate": 5.938775175759165e-07, + "loss": 0.7259, + "step": 27995 + }, + { + "epoch": 0.43590961102851317, + "grad_norm": 2.4434916973114014, + "learning_rate": 5.937955785713114e-07, + "loss": 0.7627, + "step": 28000 + }, + { + "epoch": 0.43598745203048256, + "grad_norm": 3.167893171310425, + "learning_rate": 5.937136395667065e-07, + "loss": 0.5847, + "step": 28005 + }, + { + "epoch": 0.4360652930324519, + "grad_norm": 16.382583618164062, + "learning_rate": 5.936317005621016e-07, + "loss": 0.7994, + "step": 28010 + }, + { + "epoch": 0.4361431340344213, + "grad_norm": 3.138211250305176, + "learning_rate": 5.935497615574965e-07, + "loss": 0.7655, + "step": 28015 + }, + { + "epoch": 0.4362209750363907, + "grad_norm": 3.0444304943084717, + "learning_rate": 5.934678225528916e-07, + "loss": 0.7488, + "step": 28020 + }, + { + "epoch": 0.43629881603836007, + "grad_norm": 3.296393394470215, + "learning_rate": 5.933858835482867e-07, + "loss": 0.685, + "step": 28025 + }, + { + "epoch": 0.4363766570403294, + "grad_norm": 3.1975717544555664, + "learning_rate": 5.933039445436817e-07, + "loss": 0.7817, + "step": 28030 + }, + { + "epoch": 0.4364544980422988, + "grad_norm": 2.862736701965332, + "learning_rate": 5.932220055390766e-07, + "loss": 0.8753, + "step": 28035 + }, + { + "epoch": 0.4365323390442682, + "grad_norm": 10.180088996887207, + "learning_rate": 5.931400665344717e-07, + "loss": 0.6301, + "step": 28040 + }, + { + "epoch": 0.43661018004623753, + "grad_norm": 3.7742133140563965, + "learning_rate": 5.930581275298667e-07, + "loss": 0.6582, + "step": 28045 + }, + { + "epoch": 0.4366880210482069, + "grad_norm": 3.1581547260284424, + "learning_rate": 5.929761885252617e-07, + "loss": 0.7242, + "step": 28050 + }, + { + "epoch": 0.4367658620501763, + "grad_norm": 10.063048362731934, + "learning_rate": 5.928942495206568e-07, + "loss": 1.0004, + "step": 28055 + }, + { + "epoch": 0.4368437030521457, + "grad_norm": 3.4235334396362305, + "learning_rate": 5.928123105160519e-07, + "loss": 0.5652, + "step": 28060 + }, + { + "epoch": 0.43692154405411504, + "grad_norm": 3.312119722366333, + "learning_rate": 5.927303715114469e-07, + "loss": 0.7674, + "step": 28065 + }, + { + "epoch": 0.43699938505608443, + "grad_norm": 3.313004970550537, + "learning_rate": 5.926484325068419e-07, + "loss": 0.7633, + "step": 28070 + }, + { + "epoch": 0.4370772260580538, + "grad_norm": 4.131656169891357, + "learning_rate": 5.925664935022369e-07, + "loss": 0.8007, + "step": 28075 + }, + { + "epoch": 0.4371550670600232, + "grad_norm": 3.5195024013519287, + "learning_rate": 5.924845544976319e-07, + "loss": 0.7885, + "step": 28080 + }, + { + "epoch": 0.43723290806199255, + "grad_norm": 3.162827253341675, + "learning_rate": 5.92402615493027e-07, + "loss": 0.7527, + "step": 28085 + }, + { + "epoch": 0.43731074906396195, + "grad_norm": 4.721434593200684, + "learning_rate": 5.92320676488422e-07, + "loss": 0.8578, + "step": 28090 + }, + { + "epoch": 0.43738859006593134, + "grad_norm": 3.5338213443756104, + "learning_rate": 5.92238737483817e-07, + "loss": 0.804, + "step": 28095 + }, + { + "epoch": 0.43746643106790073, + "grad_norm": 3.930555582046509, + "learning_rate": 5.921567984792121e-07, + "loss": 0.7481, + "step": 28100 + }, + { + "epoch": 0.43754427206987007, + "grad_norm": 5.562685966491699, + "learning_rate": 5.920748594746071e-07, + "loss": 0.8776, + "step": 28105 + }, + { + "epoch": 0.43762211307183946, + "grad_norm": 3.6246979236602783, + "learning_rate": 5.919929204700021e-07, + "loss": 0.7283, + "step": 28110 + }, + { + "epoch": 0.43769995407380885, + "grad_norm": 5.065835475921631, + "learning_rate": 5.919109814653971e-07, + "loss": 0.7177, + "step": 28115 + }, + { + "epoch": 0.43777779507577824, + "grad_norm": 3.75182843208313, + "learning_rate": 5.918290424607922e-07, + "loss": 0.7741, + "step": 28120 + }, + { + "epoch": 0.4378556360777476, + "grad_norm": 3.845771312713623, + "learning_rate": 5.917471034561871e-07, + "loss": 0.8227, + "step": 28125 + }, + { + "epoch": 0.43793347707971697, + "grad_norm": 2.205892562866211, + "learning_rate": 5.916651644515822e-07, + "loss": 0.6918, + "step": 28130 + }, + { + "epoch": 0.43801131808168636, + "grad_norm": 3.447526693344116, + "learning_rate": 5.915832254469773e-07, + "loss": 0.8242, + "step": 28135 + }, + { + "epoch": 0.4380891590836557, + "grad_norm": 4.775432586669922, + "learning_rate": 5.915012864423722e-07, + "loss": 0.7072, + "step": 28140 + }, + { + "epoch": 0.4381670000856251, + "grad_norm": 4.1686787605285645, + "learning_rate": 5.914193474377673e-07, + "loss": 0.7002, + "step": 28145 + }, + { + "epoch": 0.4382448410875945, + "grad_norm": 4.378955841064453, + "learning_rate": 5.913374084331624e-07, + "loss": 0.794, + "step": 28150 + }, + { + "epoch": 0.4383226820895639, + "grad_norm": 3.673280715942383, + "learning_rate": 5.912554694285573e-07, + "loss": 0.7652, + "step": 28155 + }, + { + "epoch": 0.4384005230915332, + "grad_norm": 4.4424848556518555, + "learning_rate": 5.911735304239523e-07, + "loss": 0.6848, + "step": 28160 + }, + { + "epoch": 0.4384783640935026, + "grad_norm": 3.245546579360962, + "learning_rate": 5.910915914193474e-07, + "loss": 0.741, + "step": 28165 + }, + { + "epoch": 0.438556205095472, + "grad_norm": 2.628819227218628, + "learning_rate": 5.910096524147424e-07, + "loss": 0.6993, + "step": 28170 + }, + { + "epoch": 0.4386340460974414, + "grad_norm": 4.761402130126953, + "learning_rate": 5.909277134101375e-07, + "loss": 0.732, + "step": 28175 + }, + { + "epoch": 0.4387118870994107, + "grad_norm": 3.5356898307800293, + "learning_rate": 5.908457744055325e-07, + "loss": 0.7505, + "step": 28180 + }, + { + "epoch": 0.4387897281013801, + "grad_norm": 2.67647647857666, + "learning_rate": 5.907638354009276e-07, + "loss": 0.7687, + "step": 28185 + }, + { + "epoch": 0.4388675691033495, + "grad_norm": 9.661590576171875, + "learning_rate": 5.906818963963226e-07, + "loss": 0.7266, + "step": 28190 + }, + { + "epoch": 0.4389454101053189, + "grad_norm": 3.347217321395874, + "learning_rate": 5.905999573917175e-07, + "loss": 0.7665, + "step": 28195 + }, + { + "epoch": 0.43902325110728824, + "grad_norm": 2.7007672786712646, + "learning_rate": 5.905180183871126e-07, + "loss": 0.7923, + "step": 28200 + }, + { + "epoch": 0.43910109210925763, + "grad_norm": 4.161789894104004, + "learning_rate": 5.904360793825076e-07, + "loss": 0.7244, + "step": 28205 + }, + { + "epoch": 0.439178933111227, + "grad_norm": 2.561704158782959, + "learning_rate": 5.903541403779027e-07, + "loss": 0.6945, + "step": 28210 + }, + { + "epoch": 0.43925677411319636, + "grad_norm": 4.047752380371094, + "learning_rate": 5.902722013732977e-07, + "loss": 0.6831, + "step": 28215 + }, + { + "epoch": 0.43933461511516575, + "grad_norm": 5.509623050689697, + "learning_rate": 5.901902623686927e-07, + "loss": 0.8383, + "step": 28220 + }, + { + "epoch": 0.43941245611713514, + "grad_norm": 2.814044237136841, + "learning_rate": 5.901083233640878e-07, + "loss": 0.711, + "step": 28225 + }, + { + "epoch": 0.43949029711910453, + "grad_norm": 6.586339473724365, + "learning_rate": 5.900263843594829e-07, + "loss": 0.8168, + "step": 28230 + }, + { + "epoch": 0.43956813812107387, + "grad_norm": 3.233229637145996, + "learning_rate": 5.899444453548777e-07, + "loss": 0.663, + "step": 28235 + }, + { + "epoch": 0.43964597912304326, + "grad_norm": 3.1166365146636963, + "learning_rate": 5.898625063502728e-07, + "loss": 0.7402, + "step": 28240 + }, + { + "epoch": 0.43972382012501265, + "grad_norm": 3.2115893363952637, + "learning_rate": 5.897805673456679e-07, + "loss": 0.8284, + "step": 28245 + }, + { + "epoch": 0.43980166112698205, + "grad_norm": 4.563340187072754, + "learning_rate": 5.896986283410628e-07, + "loss": 0.6664, + "step": 28250 + }, + { + "epoch": 0.4398795021289514, + "grad_norm": 2.2660348415374756, + "learning_rate": 5.896166893364579e-07, + "loss": 0.7314, + "step": 28255 + }, + { + "epoch": 0.4399573431309208, + "grad_norm": 4.088837623596191, + "learning_rate": 5.89534750331853e-07, + "loss": 0.7606, + "step": 28260 + }, + { + "epoch": 0.44003518413289017, + "grad_norm": 6.516998767852783, + "learning_rate": 5.89452811327248e-07, + "loss": 0.7674, + "step": 28265 + }, + { + "epoch": 0.44011302513485956, + "grad_norm": 4.032287120819092, + "learning_rate": 5.89370872322643e-07, + "loss": 0.7683, + "step": 28270 + }, + { + "epoch": 0.4401908661368289, + "grad_norm": 4.658278465270996, + "learning_rate": 5.89288933318038e-07, + "loss": 0.8606, + "step": 28275 + }, + { + "epoch": 0.4402687071387983, + "grad_norm": 7.811373710632324, + "learning_rate": 5.89206994313433e-07, + "loss": 0.715, + "step": 28280 + }, + { + "epoch": 0.4403465481407677, + "grad_norm": 4.1481547355651855, + "learning_rate": 5.89125055308828e-07, + "loss": 0.6848, + "step": 28285 + }, + { + "epoch": 0.44042438914273707, + "grad_norm": 10.9120512008667, + "learning_rate": 5.890431163042231e-07, + "loss": 0.7103, + "step": 28290 + }, + { + "epoch": 0.4405022301447064, + "grad_norm": 4.8491129875183105, + "learning_rate": 5.889611772996181e-07, + "loss": 0.6921, + "step": 28295 + }, + { + "epoch": 0.4405800711466758, + "grad_norm": 3.8429486751556396, + "learning_rate": 5.888792382950132e-07, + "loss": 0.7103, + "step": 28300 + }, + { + "epoch": 0.4406579121486452, + "grad_norm": 3.482661008834839, + "learning_rate": 5.887972992904082e-07, + "loss": 0.8208, + "step": 28305 + }, + { + "epoch": 0.44073575315061453, + "grad_norm": 3.3518240451812744, + "learning_rate": 5.887153602858033e-07, + "loss": 0.7582, + "step": 28310 + }, + { + "epoch": 0.4408135941525839, + "grad_norm": 3.1931521892547607, + "learning_rate": 5.886334212811982e-07, + "loss": 0.6514, + "step": 28315 + }, + { + "epoch": 0.4408914351545533, + "grad_norm": 3.3006231784820557, + "learning_rate": 5.885514822765933e-07, + "loss": 0.7889, + "step": 28320 + }, + { + "epoch": 0.4409692761565227, + "grad_norm": 3.569957733154297, + "learning_rate": 5.884695432719883e-07, + "loss": 0.6939, + "step": 28325 + }, + { + "epoch": 0.44104711715849204, + "grad_norm": 4.447323799133301, + "learning_rate": 5.883876042673833e-07, + "loss": 0.6871, + "step": 28330 + }, + { + "epoch": 0.44112495816046143, + "grad_norm": 4.069333076477051, + "learning_rate": 5.883056652627784e-07, + "loss": 0.7319, + "step": 28335 + }, + { + "epoch": 0.4412027991624308, + "grad_norm": 2.6958563327789307, + "learning_rate": 5.882237262581734e-07, + "loss": 0.6825, + "step": 28340 + }, + { + "epoch": 0.4412806401644002, + "grad_norm": 5.831766605377197, + "learning_rate": 5.881417872535684e-07, + "loss": 0.6912, + "step": 28345 + }, + { + "epoch": 0.44135848116636955, + "grad_norm": 4.509929656982422, + "learning_rate": 5.880598482489635e-07, + "loss": 0.7606, + "step": 28350 + }, + { + "epoch": 0.44143632216833895, + "grad_norm": 4.06305456161499, + "learning_rate": 5.879779092443586e-07, + "loss": 0.716, + "step": 28355 + }, + { + "epoch": 0.44151416317030834, + "grad_norm": 5.638819217681885, + "learning_rate": 5.878959702397534e-07, + "loss": 0.7627, + "step": 28360 + }, + { + "epoch": 0.44159200417227773, + "grad_norm": 5.382050514221191, + "learning_rate": 5.878140312351485e-07, + "loss": 0.859, + "step": 28365 + }, + { + "epoch": 0.44166984517424707, + "grad_norm": 3.79328989982605, + "learning_rate": 5.877320922305436e-07, + "loss": 0.7537, + "step": 28370 + }, + { + "epoch": 0.44174768617621646, + "grad_norm": 5.255930423736572, + "learning_rate": 5.876501532259385e-07, + "loss": 0.8518, + "step": 28375 + }, + { + "epoch": 0.44182552717818585, + "grad_norm": 3.785076141357422, + "learning_rate": 5.875682142213336e-07, + "loss": 0.6591, + "step": 28380 + }, + { + "epoch": 0.44190336818015524, + "grad_norm": 9.082311630249023, + "learning_rate": 5.874862752167287e-07, + "loss": 0.6978, + "step": 28385 + }, + { + "epoch": 0.4419812091821246, + "grad_norm": 4.221014976501465, + "learning_rate": 5.874043362121237e-07, + "loss": 0.6371, + "step": 28390 + }, + { + "epoch": 0.44205905018409397, + "grad_norm": 4.4617767333984375, + "learning_rate": 5.873223972075187e-07, + "loss": 0.7363, + "step": 28395 + }, + { + "epoch": 0.44213689118606336, + "grad_norm": 4.976316928863525, + "learning_rate": 5.872404582029137e-07, + "loss": 0.758, + "step": 28400 + }, + { + "epoch": 0.4422147321880327, + "grad_norm": 4.9800310134887695, + "learning_rate": 5.871585191983087e-07, + "loss": 0.8536, + "step": 28405 + }, + { + "epoch": 0.4422925731900021, + "grad_norm": 3.337171792984009, + "learning_rate": 5.870765801937038e-07, + "loss": 0.7016, + "step": 28410 + }, + { + "epoch": 0.4423704141919715, + "grad_norm": 3.1711952686309814, + "learning_rate": 5.869946411890988e-07, + "loss": 0.6803, + "step": 28415 + }, + { + "epoch": 0.4424482551939409, + "grad_norm": 4.335486888885498, + "learning_rate": 5.869127021844938e-07, + "loss": 0.8334, + "step": 28420 + }, + { + "epoch": 0.4425260961959102, + "grad_norm": 8.560162544250488, + "learning_rate": 5.868307631798889e-07, + "loss": 0.7931, + "step": 28425 + }, + { + "epoch": 0.4426039371978796, + "grad_norm": 5.151147842407227, + "learning_rate": 5.867488241752839e-07, + "loss": 0.7387, + "step": 28430 + }, + { + "epoch": 0.442681778199849, + "grad_norm": 6.47930908203125, + "learning_rate": 5.86666885170679e-07, + "loss": 0.6809, + "step": 28435 + }, + { + "epoch": 0.4427596192018184, + "grad_norm": 3.0402133464813232, + "learning_rate": 5.865849461660739e-07, + "loss": 0.6932, + "step": 28440 + }, + { + "epoch": 0.4428374602037877, + "grad_norm": 5.68237829208374, + "learning_rate": 5.86503007161469e-07, + "loss": 0.7512, + "step": 28445 + }, + { + "epoch": 0.4429153012057571, + "grad_norm": 7.695286273956299, + "learning_rate": 5.86421068156864e-07, + "loss": 0.6204, + "step": 28450 + }, + { + "epoch": 0.4429931422077265, + "grad_norm": 3.5092580318450928, + "learning_rate": 5.86339129152259e-07, + "loss": 0.7814, + "step": 28455 + }, + { + "epoch": 0.4430709832096959, + "grad_norm": 2.973787784576416, + "learning_rate": 5.862571901476541e-07, + "loss": 0.7079, + "step": 28460 + }, + { + "epoch": 0.44314882421166524, + "grad_norm": 2.6469204425811768, + "learning_rate": 5.861752511430492e-07, + "loss": 0.701, + "step": 28465 + }, + { + "epoch": 0.44322666521363463, + "grad_norm": 3.336364507675171, + "learning_rate": 5.860933121384441e-07, + "loss": 0.6272, + "step": 28470 + }, + { + "epoch": 0.443304506215604, + "grad_norm": 3.0087203979492188, + "learning_rate": 5.860113731338392e-07, + "loss": 0.6707, + "step": 28475 + }, + { + "epoch": 0.44338234721757336, + "grad_norm": 2.546515941619873, + "learning_rate": 5.859294341292342e-07, + "loss": 0.7941, + "step": 28480 + }, + { + "epoch": 0.44346018821954275, + "grad_norm": 7.917011260986328, + "learning_rate": 5.858474951246291e-07, + "loss": 0.8553, + "step": 28485 + }, + { + "epoch": 0.44353802922151214, + "grad_norm": 7.305311679840088, + "learning_rate": 5.857655561200242e-07, + "loss": 0.695, + "step": 28490 + }, + { + "epoch": 0.44361587022348153, + "grad_norm": 5.976718425750732, + "learning_rate": 5.856836171154193e-07, + "loss": 0.6468, + "step": 28495 + }, + { + "epoch": 0.44369371122545087, + "grad_norm": 3.3048276901245117, + "learning_rate": 5.856016781108143e-07, + "loss": 0.7979, + "step": 28500 + }, + { + "epoch": 0.44377155222742026, + "grad_norm": 3.263644218444824, + "learning_rate": 5.855197391062093e-07, + "loss": 0.8195, + "step": 28505 + }, + { + "epoch": 0.44384939322938965, + "grad_norm": 2.6208372116088867, + "learning_rate": 5.854378001016044e-07, + "loss": 0.7456, + "step": 28510 + }, + { + "epoch": 0.44392723423135905, + "grad_norm": 3.8031177520751953, + "learning_rate": 5.853558610969994e-07, + "loss": 0.7559, + "step": 28515 + }, + { + "epoch": 0.4440050752333284, + "grad_norm": 3.0899171829223633, + "learning_rate": 5.852739220923943e-07, + "loss": 0.6635, + "step": 28520 + }, + { + "epoch": 0.4440829162352978, + "grad_norm": 6.144171237945557, + "learning_rate": 5.851919830877894e-07, + "loss": 0.7759, + "step": 28525 + }, + { + "epoch": 0.44416075723726717, + "grad_norm": 4.998770236968994, + "learning_rate": 5.851100440831844e-07, + "loss": 0.8274, + "step": 28530 + }, + { + "epoch": 0.44423859823923656, + "grad_norm": 3.759521007537842, + "learning_rate": 5.850281050785795e-07, + "loss": 0.7592, + "step": 28535 + }, + { + "epoch": 0.4443164392412059, + "grad_norm": 4.322113990783691, + "learning_rate": 5.849461660739745e-07, + "loss": 0.8804, + "step": 28540 + }, + { + "epoch": 0.4443942802431753, + "grad_norm": 10.286017417907715, + "learning_rate": 5.848642270693695e-07, + "loss": 0.7539, + "step": 28545 + }, + { + "epoch": 0.4444721212451447, + "grad_norm": 4.129453659057617, + "learning_rate": 5.847822880647646e-07, + "loss": 0.8594, + "step": 28550 + }, + { + "epoch": 0.44454996224711407, + "grad_norm": 3.813709020614624, + "learning_rate": 5.847003490601597e-07, + "loss": 0.8825, + "step": 28555 + }, + { + "epoch": 0.4446278032490834, + "grad_norm": 4.043671607971191, + "learning_rate": 5.846184100555545e-07, + "loss": 0.7422, + "step": 28560 + }, + { + "epoch": 0.4447056442510528, + "grad_norm": 5.5891008377075195, + "learning_rate": 5.845364710509496e-07, + "loss": 0.8352, + "step": 28565 + }, + { + "epoch": 0.4447834852530222, + "grad_norm": 3.9800727367401123, + "learning_rate": 5.844545320463447e-07, + "loss": 0.8648, + "step": 28570 + }, + { + "epoch": 0.44486132625499153, + "grad_norm": 4.688074588775635, + "learning_rate": 5.843725930417397e-07, + "loss": 0.6971, + "step": 28575 + }, + { + "epoch": 0.4449391672569609, + "grad_norm": 3.784926176071167, + "learning_rate": 5.842906540371347e-07, + "loss": 0.7281, + "step": 28580 + }, + { + "epoch": 0.4450170082589303, + "grad_norm": 2.675313711166382, + "learning_rate": 5.842087150325298e-07, + "loss": 0.7733, + "step": 28585 + }, + { + "epoch": 0.4450948492608997, + "grad_norm": 2.694478988647461, + "learning_rate": 5.841267760279249e-07, + "loss": 0.7713, + "step": 28590 + }, + { + "epoch": 0.44517269026286904, + "grad_norm": 5.597519397735596, + "learning_rate": 5.840448370233198e-07, + "loss": 0.615, + "step": 28595 + }, + { + "epoch": 0.44525053126483843, + "grad_norm": 3.3392319679260254, + "learning_rate": 5.839628980187148e-07, + "loss": 0.7699, + "step": 28600 + }, + { + "epoch": 0.4453283722668078, + "grad_norm": 4.946972846984863, + "learning_rate": 5.838809590141099e-07, + "loss": 0.7339, + "step": 28605 + }, + { + "epoch": 0.4454062132687772, + "grad_norm": 3.593247413635254, + "learning_rate": 5.837990200095048e-07, + "loss": 0.7862, + "step": 28610 + }, + { + "epoch": 0.44548405427074655, + "grad_norm": 5.393089771270752, + "learning_rate": 5.837170810048999e-07, + "loss": 0.8712, + "step": 28615 + }, + { + "epoch": 0.44556189527271595, + "grad_norm": 2.7674152851104736, + "learning_rate": 5.83635142000295e-07, + "loss": 0.7376, + "step": 28620 + }, + { + "epoch": 0.44563973627468534, + "grad_norm": 3.860445499420166, + "learning_rate": 5.8355320299569e-07, + "loss": 0.8108, + "step": 28625 + }, + { + "epoch": 0.44571757727665473, + "grad_norm": 3.221179485321045, + "learning_rate": 5.83471263991085e-07, + "loss": 0.7394, + "step": 28630 + }, + { + "epoch": 0.44579541827862407, + "grad_norm": 3.5511248111724854, + "learning_rate": 5.833893249864801e-07, + "loss": 0.7899, + "step": 28635 + }, + { + "epoch": 0.44587325928059346, + "grad_norm": 2.731159210205078, + "learning_rate": 5.83307385981875e-07, + "loss": 0.6319, + "step": 28640 + }, + { + "epoch": 0.44595110028256285, + "grad_norm": 4.476842403411865, + "learning_rate": 5.8322544697727e-07, + "loss": 0.8062, + "step": 28645 + }, + { + "epoch": 0.4460289412845322, + "grad_norm": 3.592449426651001, + "learning_rate": 5.831435079726651e-07, + "loss": 0.7144, + "step": 28650 + }, + { + "epoch": 0.4461067822865016, + "grad_norm": 4.755378723144531, + "learning_rate": 5.830615689680601e-07, + "loss": 0.757, + "step": 28655 + }, + { + "epoch": 0.44618462328847097, + "grad_norm": 2.9378182888031006, + "learning_rate": 5.829796299634552e-07, + "loss": 0.754, + "step": 28660 + }, + { + "epoch": 0.44626246429044036, + "grad_norm": 4.925267696380615, + "learning_rate": 5.828976909588502e-07, + "loss": 0.8127, + "step": 28665 + }, + { + "epoch": 0.4463403052924097, + "grad_norm": 4.041872978210449, + "learning_rate": 5.828157519542452e-07, + "loss": 0.6518, + "step": 28670 + }, + { + "epoch": 0.4464181462943791, + "grad_norm": 6.165174961090088, + "learning_rate": 5.827338129496403e-07, + "loss": 0.7984, + "step": 28675 + }, + { + "epoch": 0.4464959872963485, + "grad_norm": 4.192864418029785, + "learning_rate": 5.826518739450354e-07, + "loss": 0.7825, + "step": 28680 + }, + { + "epoch": 0.4465738282983179, + "grad_norm": 3.121920108795166, + "learning_rate": 5.825699349404302e-07, + "loss": 0.6702, + "step": 28685 + }, + { + "epoch": 0.4466516693002872, + "grad_norm": 7.205260276794434, + "learning_rate": 5.824879959358253e-07, + "loss": 0.8273, + "step": 28690 + }, + { + "epoch": 0.4467295103022566, + "grad_norm": 3.040034532546997, + "learning_rate": 5.824060569312204e-07, + "loss": 0.822, + "step": 28695 + }, + { + "epoch": 0.446807351304226, + "grad_norm": 2.335698127746582, + "learning_rate": 5.823241179266154e-07, + "loss": 0.6392, + "step": 28700 + }, + { + "epoch": 0.4468851923061954, + "grad_norm": 4.273251533508301, + "learning_rate": 5.822421789220104e-07, + "loss": 0.6925, + "step": 28705 + }, + { + "epoch": 0.4469630333081647, + "grad_norm": 3.9456238746643066, + "learning_rate": 5.821602399174055e-07, + "loss": 0.7481, + "step": 28710 + }, + { + "epoch": 0.4470408743101341, + "grad_norm": 4.079767227172852, + "learning_rate": 5.820783009128006e-07, + "loss": 0.7397, + "step": 28715 + }, + { + "epoch": 0.4471187153121035, + "grad_norm": 4.003576755523682, + "learning_rate": 5.819963619081955e-07, + "loss": 0.8125, + "step": 28720 + }, + { + "epoch": 0.4471965563140729, + "grad_norm": 4.537726402282715, + "learning_rate": 5.819144229035905e-07, + "loss": 0.7417, + "step": 28725 + }, + { + "epoch": 0.44727439731604224, + "grad_norm": 7.388315200805664, + "learning_rate": 5.818324838989856e-07, + "loss": 0.8641, + "step": 28730 + }, + { + "epoch": 0.44735223831801163, + "grad_norm": 3.0487756729125977, + "learning_rate": 5.817505448943805e-07, + "loss": 0.8928, + "step": 28735 + }, + { + "epoch": 0.447430079319981, + "grad_norm": 3.1655802726745605, + "learning_rate": 5.816686058897756e-07, + "loss": 0.707, + "step": 28740 + }, + { + "epoch": 0.44750792032195036, + "grad_norm": 4.178553104400635, + "learning_rate": 5.815866668851707e-07, + "loss": 0.7312, + "step": 28745 + }, + { + "epoch": 0.44758576132391975, + "grad_norm": 7.626601696014404, + "learning_rate": 5.815047278805657e-07, + "loss": 0.7456, + "step": 28750 + }, + { + "epoch": 0.44766360232588914, + "grad_norm": 4.492088794708252, + "learning_rate": 5.814227888759607e-07, + "loss": 0.7463, + "step": 28755 + }, + { + "epoch": 0.44774144332785853, + "grad_norm": 3.4210524559020996, + "learning_rate": 5.813408498713558e-07, + "loss": 0.7552, + "step": 28760 + }, + { + "epoch": 0.44781928432982787, + "grad_norm": 4.002391338348389, + "learning_rate": 5.812589108667507e-07, + "loss": 0.7915, + "step": 28765 + }, + { + "epoch": 0.44789712533179726, + "grad_norm": 2.579582691192627, + "learning_rate": 5.811769718621458e-07, + "loss": 0.845, + "step": 28770 + }, + { + "epoch": 0.44797496633376666, + "grad_norm": 5.933312892913818, + "learning_rate": 5.810950328575408e-07, + "loss": 0.7263, + "step": 28775 + }, + { + "epoch": 0.44805280733573605, + "grad_norm": 6.678994655609131, + "learning_rate": 5.810130938529358e-07, + "loss": 0.9789, + "step": 28780 + }, + { + "epoch": 0.4481306483377054, + "grad_norm": 2.9170658588409424, + "learning_rate": 5.809311548483309e-07, + "loss": 0.7674, + "step": 28785 + }, + { + "epoch": 0.4482084893396748, + "grad_norm": 2.6036124229431152, + "learning_rate": 5.808492158437259e-07, + "loss": 0.5481, + "step": 28790 + }, + { + "epoch": 0.44828633034164417, + "grad_norm": 4.275371551513672, + "learning_rate": 5.807672768391209e-07, + "loss": 0.7133, + "step": 28795 + }, + { + "epoch": 0.44836417134361356, + "grad_norm": 4.063039302825928, + "learning_rate": 5.80685337834516e-07, + "loss": 0.7645, + "step": 28800 + }, + { + "epoch": 0.4484420123455829, + "grad_norm": 4.390212059020996, + "learning_rate": 5.80603398829911e-07, + "loss": 0.7717, + "step": 28805 + }, + { + "epoch": 0.4485198533475523, + "grad_norm": 2.9731879234313965, + "learning_rate": 5.80521459825306e-07, + "loss": 0.8827, + "step": 28810 + }, + { + "epoch": 0.4485976943495217, + "grad_norm": 5.1143035888671875, + "learning_rate": 5.80439520820701e-07, + "loss": 0.6741, + "step": 28815 + }, + { + "epoch": 0.448675535351491, + "grad_norm": 4.85829496383667, + "learning_rate": 5.803575818160961e-07, + "loss": 0.7148, + "step": 28820 + }, + { + "epoch": 0.4487533763534604, + "grad_norm": 5.24547004699707, + "learning_rate": 5.802756428114912e-07, + "loss": 0.7149, + "step": 28825 + }, + { + "epoch": 0.4488312173554298, + "grad_norm": 4.583830833435059, + "learning_rate": 5.801937038068861e-07, + "loss": 0.7881, + "step": 28830 + }, + { + "epoch": 0.4489090583573992, + "grad_norm": 3.0546112060546875, + "learning_rate": 5.801117648022812e-07, + "loss": 0.7414, + "step": 28835 + }, + { + "epoch": 0.44898689935936853, + "grad_norm": 4.399287700653076, + "learning_rate": 5.800298257976763e-07, + "loss": 0.7823, + "step": 28840 + }, + { + "epoch": 0.4490647403613379, + "grad_norm": 3.3814289569854736, + "learning_rate": 5.799478867930711e-07, + "loss": 0.6798, + "step": 28845 + }, + { + "epoch": 0.4491425813633073, + "grad_norm": 4.2093963623046875, + "learning_rate": 5.798659477884662e-07, + "loss": 0.7128, + "step": 28850 + }, + { + "epoch": 0.4492204223652767, + "grad_norm": 3.488926887512207, + "learning_rate": 5.797840087838613e-07, + "loss": 0.7998, + "step": 28855 + }, + { + "epoch": 0.44929826336724604, + "grad_norm": 3.5785272121429443, + "learning_rate": 5.797020697792563e-07, + "loss": 0.7752, + "step": 28860 + }, + { + "epoch": 0.44937610436921543, + "grad_norm": 4.4823150634765625, + "learning_rate": 5.796201307746513e-07, + "loss": 0.7829, + "step": 28865 + }, + { + "epoch": 0.4494539453711848, + "grad_norm": 4.716001033782959, + "learning_rate": 5.795381917700464e-07, + "loss": 0.8278, + "step": 28870 + }, + { + "epoch": 0.4495317863731542, + "grad_norm": 5.834087371826172, + "learning_rate": 5.794562527654414e-07, + "loss": 0.8443, + "step": 28875 + }, + { + "epoch": 0.44960962737512356, + "grad_norm": 3.227731227874756, + "learning_rate": 5.793743137608364e-07, + "loss": 0.7663, + "step": 28880 + }, + { + "epoch": 0.44968746837709295, + "grad_norm": 4.241919994354248, + "learning_rate": 5.792923747562314e-07, + "loss": 0.7926, + "step": 28885 + }, + { + "epoch": 0.44976530937906234, + "grad_norm": 3.0052309036254883, + "learning_rate": 5.792104357516264e-07, + "loss": 0.7092, + "step": 28890 + }, + { + "epoch": 0.44984315038103173, + "grad_norm": 4.822492599487305, + "learning_rate": 5.791284967470215e-07, + "loss": 0.781, + "step": 28895 + }, + { + "epoch": 0.44992099138300107, + "grad_norm": 7.8753180503845215, + "learning_rate": 5.790465577424165e-07, + "loss": 0.7651, + "step": 28900 + }, + { + "epoch": 0.44999883238497046, + "grad_norm": 3.306781053543091, + "learning_rate": 5.789646187378115e-07, + "loss": 0.7367, + "step": 28905 + }, + { + "epoch": 0.45007667338693985, + "grad_norm": 3.1504251956939697, + "learning_rate": 5.788826797332066e-07, + "loss": 0.706, + "step": 28910 + }, + { + "epoch": 0.4501545143889092, + "grad_norm": 4.191858291625977, + "learning_rate": 5.788007407286017e-07, + "loss": 0.7441, + "step": 28915 + }, + { + "epoch": 0.4502323553908786, + "grad_norm": 3.7259771823883057, + "learning_rate": 5.787188017239966e-07, + "loss": 0.8353, + "step": 28920 + }, + { + "epoch": 0.450310196392848, + "grad_norm": 4.479262351989746, + "learning_rate": 5.786368627193916e-07, + "loss": 0.7023, + "step": 28925 + }, + { + "epoch": 0.45038803739481736, + "grad_norm": 4.838881015777588, + "learning_rate": 5.785549237147867e-07, + "loss": 0.7328, + "step": 28930 + }, + { + "epoch": 0.4504658783967867, + "grad_norm": 3.202648401260376, + "learning_rate": 5.784729847101817e-07, + "loss": 0.8272, + "step": 28935 + }, + { + "epoch": 0.4505437193987561, + "grad_norm": 3.269976854324341, + "learning_rate": 5.783910457055767e-07, + "loss": 0.7599, + "step": 28940 + }, + { + "epoch": 0.4506215604007255, + "grad_norm": 3.516141653060913, + "learning_rate": 5.783091067009718e-07, + "loss": 0.7602, + "step": 28945 + }, + { + "epoch": 0.4506994014026949, + "grad_norm": 4.920629501342773, + "learning_rate": 5.782271676963669e-07, + "loss": 0.7543, + "step": 28950 + }, + { + "epoch": 0.4507772424046642, + "grad_norm": 5.533834457397461, + "learning_rate": 5.781452286917618e-07, + "loss": 0.7442, + "step": 28955 + }, + { + "epoch": 0.4508550834066336, + "grad_norm": 5.30636739730835, + "learning_rate": 5.780632896871569e-07, + "loss": 0.676, + "step": 28960 + }, + { + "epoch": 0.450932924408603, + "grad_norm": 10.619142532348633, + "learning_rate": 5.779813506825519e-07, + "loss": 0.8129, + "step": 28965 + }, + { + "epoch": 0.4510107654105724, + "grad_norm": 6.480765342712402, + "learning_rate": 5.778994116779468e-07, + "loss": 0.7446, + "step": 28970 + }, + { + "epoch": 0.4510886064125417, + "grad_norm": 5.204502582550049, + "learning_rate": 5.778174726733419e-07, + "loss": 0.7069, + "step": 28975 + }, + { + "epoch": 0.4511664474145111, + "grad_norm": 4.343836784362793, + "learning_rate": 5.77735533668737e-07, + "loss": 0.7275, + "step": 28980 + }, + { + "epoch": 0.4512442884164805, + "grad_norm": 3.211695671081543, + "learning_rate": 5.77653594664132e-07, + "loss": 0.7882, + "step": 28985 + }, + { + "epoch": 0.45132212941844985, + "grad_norm": 4.3351569175720215, + "learning_rate": 5.77571655659527e-07, + "loss": 0.8266, + "step": 28990 + }, + { + "epoch": 0.45139997042041924, + "grad_norm": 2.9921987056732178, + "learning_rate": 5.774897166549221e-07, + "loss": 0.7562, + "step": 28995 + }, + { + "epoch": 0.45147781142238863, + "grad_norm": 4.759602069854736, + "learning_rate": 5.774077776503171e-07, + "loss": 0.8041, + "step": 29000 + }, + { + "epoch": 0.451555652424358, + "grad_norm": 3.083439826965332, + "learning_rate": 5.773258386457122e-07, + "loss": 0.7023, + "step": 29005 + }, + { + "epoch": 0.45163349342632736, + "grad_norm": 8.841672897338867, + "learning_rate": 5.772438996411071e-07, + "loss": 0.729, + "step": 29010 + }, + { + "epoch": 0.45171133442829675, + "grad_norm": 3.8009612560272217, + "learning_rate": 5.771619606365021e-07, + "loss": 0.8011, + "step": 29015 + }, + { + "epoch": 0.45178917543026614, + "grad_norm": 16.207178115844727, + "learning_rate": 5.770800216318972e-07, + "loss": 0.8641, + "step": 29020 + }, + { + "epoch": 0.45186701643223554, + "grad_norm": 12.364839553833008, + "learning_rate": 5.769980826272922e-07, + "loss": 0.772, + "step": 29025 + }, + { + "epoch": 0.45194485743420487, + "grad_norm": 8.666084289550781, + "learning_rate": 5.769161436226872e-07, + "loss": 0.8252, + "step": 29030 + }, + { + "epoch": 0.45202269843617426, + "grad_norm": 4.036910057067871, + "learning_rate": 5.768342046180823e-07, + "loss": 0.6861, + "step": 29035 + }, + { + "epoch": 0.45210053943814366, + "grad_norm": 4.029730796813965, + "learning_rate": 5.767522656134774e-07, + "loss": 0.6195, + "step": 29040 + }, + { + "epoch": 0.45217838044011305, + "grad_norm": 5.024090766906738, + "learning_rate": 5.766703266088723e-07, + "loss": 0.6818, + "step": 29045 + }, + { + "epoch": 0.4522562214420824, + "grad_norm": 4.02849817276001, + "learning_rate": 5.765883876042673e-07, + "loss": 0.8053, + "step": 29050 + }, + { + "epoch": 0.4523340624440518, + "grad_norm": 4.624151229858398, + "learning_rate": 5.765064485996624e-07, + "loss": 0.8042, + "step": 29055 + }, + { + "epoch": 0.45241190344602117, + "grad_norm": 24.174957275390625, + "learning_rate": 5.764245095950574e-07, + "loss": 0.7939, + "step": 29060 + }, + { + "epoch": 0.45248974444799056, + "grad_norm": 4.263962745666504, + "learning_rate": 5.763425705904524e-07, + "loss": 0.6899, + "step": 29065 + }, + { + "epoch": 0.4525675854499599, + "grad_norm": 2.8683125972747803, + "learning_rate": 5.762606315858475e-07, + "loss": 0.7562, + "step": 29070 + }, + { + "epoch": 0.4526454264519293, + "grad_norm": 5.541211128234863, + "learning_rate": 5.761786925812426e-07, + "loss": 0.7891, + "step": 29075 + }, + { + "epoch": 0.4527232674538987, + "grad_norm": 6.852053642272949, + "learning_rate": 5.760967535766375e-07, + "loss": 0.7556, + "step": 29080 + }, + { + "epoch": 0.452801108455868, + "grad_norm": 3.5538642406463623, + "learning_rate": 5.760148145720326e-07, + "loss": 0.8761, + "step": 29085 + }, + { + "epoch": 0.4528789494578374, + "grad_norm": 4.408565998077393, + "learning_rate": 5.759328755674276e-07, + "loss": 0.712, + "step": 29090 + }, + { + "epoch": 0.4529567904598068, + "grad_norm": 5.765478610992432, + "learning_rate": 5.758509365628225e-07, + "loss": 0.7246, + "step": 29095 + }, + { + "epoch": 0.4530346314617762, + "grad_norm": 4.650556564331055, + "learning_rate": 5.757689975582176e-07, + "loss": 0.8426, + "step": 29100 + }, + { + "epoch": 0.45311247246374553, + "grad_norm": 4.318330764770508, + "learning_rate": 5.756870585536127e-07, + "loss": 0.7968, + "step": 29105 + }, + { + "epoch": 0.4531903134657149, + "grad_norm": 4.4654388427734375, + "learning_rate": 5.756051195490077e-07, + "loss": 0.7472, + "step": 29110 + }, + { + "epoch": 0.4532681544676843, + "grad_norm": 6.117023468017578, + "learning_rate": 5.755231805444027e-07, + "loss": 0.7138, + "step": 29115 + }, + { + "epoch": 0.4533459954696537, + "grad_norm": 7.660935401916504, + "learning_rate": 5.754412415397978e-07, + "loss": 0.808, + "step": 29120 + }, + { + "epoch": 0.45342383647162304, + "grad_norm": 5.994586944580078, + "learning_rate": 5.753593025351928e-07, + "loss": 0.8673, + "step": 29125 + }, + { + "epoch": 0.45350167747359243, + "grad_norm": 3.525341033935547, + "learning_rate": 5.752773635305878e-07, + "loss": 0.7404, + "step": 29130 + }, + { + "epoch": 0.4535795184755618, + "grad_norm": 3.208970308303833, + "learning_rate": 5.751954245259828e-07, + "loss": 0.7721, + "step": 29135 + }, + { + "epoch": 0.4536573594775312, + "grad_norm": 3.5546417236328125, + "learning_rate": 5.751134855213778e-07, + "loss": 0.8085, + "step": 29140 + }, + { + "epoch": 0.45373520047950056, + "grad_norm": 3.8426883220672607, + "learning_rate": 5.750315465167729e-07, + "loss": 0.8667, + "step": 29145 + }, + { + "epoch": 0.45381304148146995, + "grad_norm": 4.148952484130859, + "learning_rate": 5.74949607512168e-07, + "loss": 0.8004, + "step": 29150 + }, + { + "epoch": 0.45389088248343934, + "grad_norm": 4.1008992195129395, + "learning_rate": 5.748676685075629e-07, + "loss": 0.7331, + "step": 29155 + }, + { + "epoch": 0.45396872348540873, + "grad_norm": 5.24363899230957, + "learning_rate": 5.74785729502958e-07, + "loss": 0.8068, + "step": 29160 + }, + { + "epoch": 0.45404656448737807, + "grad_norm": 3.7679924964904785, + "learning_rate": 5.747037904983531e-07, + "loss": 0.7042, + "step": 29165 + }, + { + "epoch": 0.45412440548934746, + "grad_norm": 6.450275897979736, + "learning_rate": 5.746218514937479e-07, + "loss": 0.725, + "step": 29170 + }, + { + "epoch": 0.45420224649131685, + "grad_norm": 4.289034366607666, + "learning_rate": 5.74539912489143e-07, + "loss": 0.8314, + "step": 29175 + }, + { + "epoch": 0.4542800874932862, + "grad_norm": 4.317148208618164, + "learning_rate": 5.744579734845381e-07, + "loss": 0.8628, + "step": 29180 + }, + { + "epoch": 0.4543579284952556, + "grad_norm": 3.4826083183288574, + "learning_rate": 5.743760344799332e-07, + "loss": 0.7807, + "step": 29185 + }, + { + "epoch": 0.454435769497225, + "grad_norm": 4.1573872566223145, + "learning_rate": 5.742940954753281e-07, + "loss": 0.7241, + "step": 29190 + }, + { + "epoch": 0.45451361049919436, + "grad_norm": 2.6551735401153564, + "learning_rate": 5.742121564707232e-07, + "loss": 0.7656, + "step": 29195 + }, + { + "epoch": 0.4545914515011637, + "grad_norm": 3.7584831714630127, + "learning_rate": 5.741302174661183e-07, + "loss": 0.693, + "step": 29200 + }, + { + "epoch": 0.4546692925031331, + "grad_norm": 3.4537553787231445, + "learning_rate": 5.740482784615132e-07, + "loss": 0.7632, + "step": 29205 + }, + { + "epoch": 0.4547471335051025, + "grad_norm": 9.524008750915527, + "learning_rate": 5.739663394569082e-07, + "loss": 0.781, + "step": 29210 + }, + { + "epoch": 0.4548249745070719, + "grad_norm": 4.735363960266113, + "learning_rate": 5.738844004523033e-07, + "loss": 0.7214, + "step": 29215 + }, + { + "epoch": 0.4549028155090412, + "grad_norm": 4.976006031036377, + "learning_rate": 5.738024614476983e-07, + "loss": 0.6781, + "step": 29220 + }, + { + "epoch": 0.4549806565110106, + "grad_norm": 2.9076504707336426, + "learning_rate": 5.737205224430933e-07, + "loss": 0.7562, + "step": 29225 + }, + { + "epoch": 0.45505849751298, + "grad_norm": 6.729519844055176, + "learning_rate": 5.736385834384884e-07, + "loss": 0.7656, + "step": 29230 + }, + { + "epoch": 0.4551363385149494, + "grad_norm": 3.2870726585388184, + "learning_rate": 5.735566444338834e-07, + "loss": 0.753, + "step": 29235 + }, + { + "epoch": 0.4552141795169187, + "grad_norm": 4.06284236907959, + "learning_rate": 5.734747054292784e-07, + "loss": 0.8596, + "step": 29240 + }, + { + "epoch": 0.4552920205188881, + "grad_norm": 3.725203514099121, + "learning_rate": 5.733927664246735e-07, + "loss": 0.6136, + "step": 29245 + }, + { + "epoch": 0.4553698615208575, + "grad_norm": 2.6347107887268066, + "learning_rate": 5.733108274200684e-07, + "loss": 0.7469, + "step": 29250 + }, + { + "epoch": 0.45544770252282685, + "grad_norm": 4.7031636238098145, + "learning_rate": 5.732288884154635e-07, + "loss": 0.8537, + "step": 29255 + }, + { + "epoch": 0.45552554352479624, + "grad_norm": 2.4778549671173096, + "learning_rate": 5.731469494108585e-07, + "loss": 0.8185, + "step": 29260 + }, + { + "epoch": 0.45560338452676563, + "grad_norm": 4.985352516174316, + "learning_rate": 5.730650104062535e-07, + "loss": 0.8419, + "step": 29265 + }, + { + "epoch": 0.455681225528735, + "grad_norm": 3.5948269367218018, + "learning_rate": 5.729830714016486e-07, + "loss": 0.7006, + "step": 29270 + }, + { + "epoch": 0.45575906653070436, + "grad_norm": 3.4470608234405518, + "learning_rate": 5.729011323970437e-07, + "loss": 0.797, + "step": 29275 + }, + { + "epoch": 0.45583690753267375, + "grad_norm": 4.510948657989502, + "learning_rate": 5.728191933924386e-07, + "loss": 0.5894, + "step": 29280 + }, + { + "epoch": 0.45591474853464314, + "grad_norm": 3.0016355514526367, + "learning_rate": 5.727372543878337e-07, + "loss": 0.7484, + "step": 29285 + }, + { + "epoch": 0.45599258953661254, + "grad_norm": 3.7839488983154297, + "learning_rate": 5.726553153832287e-07, + "loss": 0.6431, + "step": 29290 + }, + { + "epoch": 0.4560704305385819, + "grad_norm": 3.340162515640259, + "learning_rate": 5.725733763786236e-07, + "loss": 0.734, + "step": 29295 + }, + { + "epoch": 0.45614827154055126, + "grad_norm": 3.7294671535491943, + "learning_rate": 5.724914373740187e-07, + "loss": 0.7254, + "step": 29300 + }, + { + "epoch": 0.45622611254252066, + "grad_norm": 3.578665018081665, + "learning_rate": 5.724094983694138e-07, + "loss": 0.7948, + "step": 29305 + }, + { + "epoch": 0.45630395354449005, + "grad_norm": 4.261101722717285, + "learning_rate": 5.723275593648089e-07, + "loss": 0.7266, + "step": 29310 + }, + { + "epoch": 0.4563817945464594, + "grad_norm": 5.133454322814941, + "learning_rate": 5.722456203602038e-07, + "loss": 0.7848, + "step": 29315 + }, + { + "epoch": 0.4564596355484288, + "grad_norm": 2.6158576011657715, + "learning_rate": 5.721636813555989e-07, + "loss": 0.7003, + "step": 29320 + }, + { + "epoch": 0.45653747655039817, + "grad_norm": 3.4494142532348633, + "learning_rate": 5.72081742350994e-07, + "loss": 0.7098, + "step": 29325 + }, + { + "epoch": 0.45661531755236756, + "grad_norm": 4.859612941741943, + "learning_rate": 5.71999803346389e-07, + "loss": 0.8115, + "step": 29330 + }, + { + "epoch": 0.4566931585543369, + "grad_norm": 6.412881374359131, + "learning_rate": 5.719178643417839e-07, + "loss": 0.7827, + "step": 29335 + }, + { + "epoch": 0.4567709995563063, + "grad_norm": 2.4222824573516846, + "learning_rate": 5.71835925337179e-07, + "loss": 0.7147, + "step": 29340 + }, + { + "epoch": 0.4568488405582757, + "grad_norm": 3.288569927215576, + "learning_rate": 5.71753986332574e-07, + "loss": 0.8637, + "step": 29345 + }, + { + "epoch": 0.456926681560245, + "grad_norm": 3.8363094329833984, + "learning_rate": 5.71672047327969e-07, + "loss": 0.627, + "step": 29350 + }, + { + "epoch": 0.4570045225622144, + "grad_norm": 2.8272054195404053, + "learning_rate": 5.715901083233641e-07, + "loss": 0.7331, + "step": 29355 + }, + { + "epoch": 0.4570823635641838, + "grad_norm": 7.778134822845459, + "learning_rate": 5.715081693187591e-07, + "loss": 0.6883, + "step": 29360 + }, + { + "epoch": 0.4571602045661532, + "grad_norm": 2.7845077514648438, + "learning_rate": 5.714262303141542e-07, + "loss": 0.782, + "step": 29365 + }, + { + "epoch": 0.45723804556812253, + "grad_norm": 3.7273714542388916, + "learning_rate": 5.713442913095492e-07, + "loss": 0.7656, + "step": 29370 + }, + { + "epoch": 0.4573158865700919, + "grad_norm": 4.313918590545654, + "learning_rate": 5.712623523049441e-07, + "loss": 0.7244, + "step": 29375 + }, + { + "epoch": 0.4573937275720613, + "grad_norm": 2.5442774295806885, + "learning_rate": 5.711804133003392e-07, + "loss": 0.6705, + "step": 29380 + }, + { + "epoch": 0.4574715685740307, + "grad_norm": 2.8726694583892822, + "learning_rate": 5.710984742957342e-07, + "loss": 0.7727, + "step": 29385 + }, + { + "epoch": 0.45754940957600004, + "grad_norm": 4.24089241027832, + "learning_rate": 5.710165352911292e-07, + "loss": 0.6543, + "step": 29390 + }, + { + "epoch": 0.45762725057796944, + "grad_norm": 2.787386417388916, + "learning_rate": 5.709345962865243e-07, + "loss": 0.7181, + "step": 29395 + }, + { + "epoch": 0.4577050915799388, + "grad_norm": 4.498680114746094, + "learning_rate": 5.708526572819194e-07, + "loss": 0.846, + "step": 29400 + }, + { + "epoch": 0.4577829325819082, + "grad_norm": 4.32037878036499, + "learning_rate": 5.707707182773143e-07, + "loss": 0.8141, + "step": 29405 + }, + { + "epoch": 0.45786077358387756, + "grad_norm": 3.402519464492798, + "learning_rate": 5.706887792727094e-07, + "loss": 0.8875, + "step": 29410 + }, + { + "epoch": 0.45793861458584695, + "grad_norm": 3.7643373012542725, + "learning_rate": 5.706068402681044e-07, + "loss": 0.7522, + "step": 29415 + }, + { + "epoch": 0.45801645558781634, + "grad_norm": 3.1787290573120117, + "learning_rate": 5.705249012634993e-07, + "loss": 0.6755, + "step": 29420 + }, + { + "epoch": 0.4580942965897857, + "grad_norm": 6.672213554382324, + "learning_rate": 5.704429622588944e-07, + "loss": 0.8007, + "step": 29425 + }, + { + "epoch": 0.45817213759175507, + "grad_norm": 4.505547523498535, + "learning_rate": 5.703610232542895e-07, + "loss": 0.8463, + "step": 29430 + }, + { + "epoch": 0.45824997859372446, + "grad_norm": 3.9626431465148926, + "learning_rate": 5.702790842496846e-07, + "loss": 0.7459, + "step": 29435 + }, + { + "epoch": 0.45832781959569385, + "grad_norm": 3.782198905944824, + "learning_rate": 5.701971452450795e-07, + "loss": 0.847, + "step": 29440 + }, + { + "epoch": 0.4584056605976632, + "grad_norm": 6.641538143157959, + "learning_rate": 5.701152062404746e-07, + "loss": 0.8242, + "step": 29445 + }, + { + "epoch": 0.4584835015996326, + "grad_norm": 3.9570329189300537, + "learning_rate": 5.700332672358697e-07, + "loss": 0.6582, + "step": 29450 + }, + { + "epoch": 0.458561342601602, + "grad_norm": 2.9618704319000244, + "learning_rate": 5.699513282312645e-07, + "loss": 0.6949, + "step": 29455 + }, + { + "epoch": 0.45863918360357137, + "grad_norm": 3.5960450172424316, + "learning_rate": 5.698693892266596e-07, + "loss": 0.7812, + "step": 29460 + }, + { + "epoch": 0.4587170246055407, + "grad_norm": 3.4689462184906006, + "learning_rate": 5.697874502220547e-07, + "loss": 0.7432, + "step": 29465 + }, + { + "epoch": 0.4587948656075101, + "grad_norm": 3.1630213260650635, + "learning_rate": 5.697055112174497e-07, + "loss": 0.7249, + "step": 29470 + }, + { + "epoch": 0.4588727066094795, + "grad_norm": 4.015108585357666, + "learning_rate": 5.696235722128447e-07, + "loss": 0.7426, + "step": 29475 + }, + { + "epoch": 0.4589505476114489, + "grad_norm": 3.334855079650879, + "learning_rate": 5.695416332082398e-07, + "loss": 0.6557, + "step": 29480 + }, + { + "epoch": 0.4590283886134182, + "grad_norm": 5.416299819946289, + "learning_rate": 5.694596942036348e-07, + "loss": 0.8126, + "step": 29485 + }, + { + "epoch": 0.4591062296153876, + "grad_norm": 5.627259254455566, + "learning_rate": 5.693777551990299e-07, + "loss": 0.7561, + "step": 29490 + }, + { + "epoch": 0.459184070617357, + "grad_norm": 4.5271477699279785, + "learning_rate": 5.692958161944248e-07, + "loss": 0.6919, + "step": 29495 + }, + { + "epoch": 0.4592619116193264, + "grad_norm": 5.4130167961120605, + "learning_rate": 5.692138771898198e-07, + "loss": 0.8622, + "step": 29500 + }, + { + "epoch": 0.4593397526212957, + "grad_norm": 4.641055583953857, + "learning_rate": 5.691319381852149e-07, + "loss": 0.7803, + "step": 29505 + }, + { + "epoch": 0.4594175936232651, + "grad_norm": 5.009194850921631, + "learning_rate": 5.6904999918061e-07, + "loss": 0.8414, + "step": 29510 + }, + { + "epoch": 0.4594954346252345, + "grad_norm": 3.12497878074646, + "learning_rate": 5.689680601760049e-07, + "loss": 0.7733, + "step": 29515 + }, + { + "epoch": 0.45957327562720385, + "grad_norm": 4.629786014556885, + "learning_rate": 5.688861211714e-07, + "loss": 0.8534, + "step": 29520 + }, + { + "epoch": 0.45965111662917324, + "grad_norm": 3.6191458702087402, + "learning_rate": 5.688041821667951e-07, + "loss": 0.8469, + "step": 29525 + }, + { + "epoch": 0.45972895763114263, + "grad_norm": 3.2484846115112305, + "learning_rate": 5.6872224316219e-07, + "loss": 0.83, + "step": 29530 + }, + { + "epoch": 0.459806798633112, + "grad_norm": 5.290815830230713, + "learning_rate": 5.68640304157585e-07, + "loss": 0.6943, + "step": 29535 + }, + { + "epoch": 0.45988463963508136, + "grad_norm": 4.702110290527344, + "learning_rate": 5.685583651529801e-07, + "loss": 0.8024, + "step": 29540 + }, + { + "epoch": 0.45996248063705075, + "grad_norm": 5.266806125640869, + "learning_rate": 5.68476426148375e-07, + "loss": 0.7295, + "step": 29545 + }, + { + "epoch": 0.46004032163902014, + "grad_norm": 3.227226972579956, + "learning_rate": 5.683944871437701e-07, + "loss": 0.7055, + "step": 29550 + }, + { + "epoch": 0.46011816264098954, + "grad_norm": 3.443768262863159, + "learning_rate": 5.683125481391652e-07, + "loss": 0.7523, + "step": 29555 + }, + { + "epoch": 0.4601960036429589, + "grad_norm": 3.3546595573425293, + "learning_rate": 5.682306091345603e-07, + "loss": 0.7309, + "step": 29560 + }, + { + "epoch": 0.46027384464492826, + "grad_norm": 6.671410083770752, + "learning_rate": 5.681486701299552e-07, + "loss": 0.8336, + "step": 29565 + }, + { + "epoch": 0.46035168564689766, + "grad_norm": 6.067061424255371, + "learning_rate": 5.680667311253503e-07, + "loss": 0.7152, + "step": 29570 + }, + { + "epoch": 0.46042952664886705, + "grad_norm": 8.262660026550293, + "learning_rate": 5.679847921207453e-07, + "loss": 0.7065, + "step": 29575 + }, + { + "epoch": 0.4605073676508364, + "grad_norm": 2.8983047008514404, + "learning_rate": 5.679028531161403e-07, + "loss": 0.8039, + "step": 29580 + }, + { + "epoch": 0.4605852086528058, + "grad_norm": 3.577810525894165, + "learning_rate": 5.678209141115353e-07, + "loss": 0.7898, + "step": 29585 + }, + { + "epoch": 0.46066304965477517, + "grad_norm": 3.0140891075134277, + "learning_rate": 5.677389751069304e-07, + "loss": 0.8619, + "step": 29590 + }, + { + "epoch": 0.4607408906567445, + "grad_norm": 3.443023204803467, + "learning_rate": 5.676570361023254e-07, + "loss": 0.7451, + "step": 29595 + }, + { + "epoch": 0.4608187316587139, + "grad_norm": 7.99015998840332, + "learning_rate": 5.675750970977205e-07, + "loss": 0.8006, + "step": 29600 + }, + { + "epoch": 0.4608965726606833, + "grad_norm": 4.371585845947266, + "learning_rate": 5.674931580931155e-07, + "loss": 0.7726, + "step": 29605 + }, + { + "epoch": 0.4609744136626527, + "grad_norm": 3.129302501678467, + "learning_rate": 5.674112190885105e-07, + "loss": 0.7019, + "step": 29610 + }, + { + "epoch": 0.461052254664622, + "grad_norm": 2.687493324279785, + "learning_rate": 5.673292800839056e-07, + "loss": 0.6881, + "step": 29615 + }, + { + "epoch": 0.4611300956665914, + "grad_norm": 9.908051490783691, + "learning_rate": 5.672473410793005e-07, + "loss": 0.8048, + "step": 29620 + }, + { + "epoch": 0.4612079366685608, + "grad_norm": 4.389907360076904, + "learning_rate": 5.671654020746955e-07, + "loss": 0.7758, + "step": 29625 + }, + { + "epoch": 0.4612857776705302, + "grad_norm": 3.2980127334594727, + "learning_rate": 5.670834630700906e-07, + "loss": 0.7126, + "step": 29630 + }, + { + "epoch": 0.46136361867249953, + "grad_norm": 3.6595046520233154, + "learning_rate": 5.670015240654857e-07, + "loss": 0.8298, + "step": 29635 + }, + { + "epoch": 0.4614414596744689, + "grad_norm": 10.296154022216797, + "learning_rate": 5.669195850608806e-07, + "loss": 0.6761, + "step": 29640 + }, + { + "epoch": 0.4615193006764383, + "grad_norm": 3.4469966888427734, + "learning_rate": 5.668376460562757e-07, + "loss": 0.7524, + "step": 29645 + }, + { + "epoch": 0.4615971416784077, + "grad_norm": 3.731187582015991, + "learning_rate": 5.667557070516708e-07, + "loss": 0.7712, + "step": 29650 + }, + { + "epoch": 0.46167498268037704, + "grad_norm": 2.4918253421783447, + "learning_rate": 5.666737680470657e-07, + "loss": 0.6676, + "step": 29655 + }, + { + "epoch": 0.46175282368234644, + "grad_norm": 9.466460227966309, + "learning_rate": 5.665918290424607e-07, + "loss": 0.8928, + "step": 29660 + }, + { + "epoch": 0.46183066468431583, + "grad_norm": 2.682614803314209, + "learning_rate": 5.665098900378558e-07, + "loss": 0.6856, + "step": 29665 + }, + { + "epoch": 0.4619085056862852, + "grad_norm": 3.7968413829803467, + "learning_rate": 5.664279510332508e-07, + "loss": 0.8142, + "step": 29670 + }, + { + "epoch": 0.46198634668825456, + "grad_norm": 3.2053468227386475, + "learning_rate": 5.663460120286458e-07, + "loss": 0.8449, + "step": 29675 + }, + { + "epoch": 0.46206418769022395, + "grad_norm": 3.584822177886963, + "learning_rate": 5.662640730240409e-07, + "loss": 0.7932, + "step": 29680 + }, + { + "epoch": 0.46214202869219334, + "grad_norm": 5.949429512023926, + "learning_rate": 5.66182134019436e-07, + "loss": 0.7658, + "step": 29685 + }, + { + "epoch": 0.4622198696941627, + "grad_norm": 5.678579330444336, + "learning_rate": 5.66100195014831e-07, + "loss": 0.8584, + "step": 29690 + }, + { + "epoch": 0.46229771069613207, + "grad_norm": 3.8462507724761963, + "learning_rate": 5.66018256010226e-07, + "loss": 0.856, + "step": 29695 + }, + { + "epoch": 0.46237555169810146, + "grad_norm": 2.270792245864868, + "learning_rate": 5.65936317005621e-07, + "loss": 0.6093, + "step": 29700 + }, + { + "epoch": 0.46245339270007085, + "grad_norm": 2.746656894683838, + "learning_rate": 5.65854378001016e-07, + "loss": 0.7386, + "step": 29705 + }, + { + "epoch": 0.4625312337020402, + "grad_norm": 4.786650657653809, + "learning_rate": 5.65772438996411e-07, + "loss": 0.8235, + "step": 29710 + }, + { + "epoch": 0.4626090747040096, + "grad_norm": 7.245391368865967, + "learning_rate": 5.656904999918061e-07, + "loss": 0.7279, + "step": 29715 + }, + { + "epoch": 0.462686915705979, + "grad_norm": 5.278894901275635, + "learning_rate": 5.656085609872011e-07, + "loss": 0.8377, + "step": 29720 + }, + { + "epoch": 0.46276475670794837, + "grad_norm": 3.9391064643859863, + "learning_rate": 5.655266219825962e-07, + "loss": 0.7426, + "step": 29725 + }, + { + "epoch": 0.4628425977099177, + "grad_norm": 5.838922500610352, + "learning_rate": 5.654446829779912e-07, + "loss": 0.7045, + "step": 29730 + }, + { + "epoch": 0.4629204387118871, + "grad_norm": 3.234442710876465, + "learning_rate": 5.653627439733862e-07, + "loss": 0.8333, + "step": 29735 + }, + { + "epoch": 0.4629982797138565, + "grad_norm": 2.542278528213501, + "learning_rate": 5.652808049687812e-07, + "loss": 0.6799, + "step": 29740 + }, + { + "epoch": 0.4630761207158259, + "grad_norm": 4.046177387237549, + "learning_rate": 5.651988659641762e-07, + "loss": 0.7613, + "step": 29745 + }, + { + "epoch": 0.4631539617177952, + "grad_norm": 3.80517315864563, + "learning_rate": 5.651169269595712e-07, + "loss": 0.7418, + "step": 29750 + }, + { + "epoch": 0.4632318027197646, + "grad_norm": 5.722404956817627, + "learning_rate": 5.650349879549663e-07, + "loss": 0.7325, + "step": 29755 + }, + { + "epoch": 0.463309643721734, + "grad_norm": 3.710421085357666, + "learning_rate": 5.649530489503614e-07, + "loss": 0.845, + "step": 29760 + }, + { + "epoch": 0.4633874847237034, + "grad_norm": 3.7477540969848633, + "learning_rate": 5.648711099457563e-07, + "loss": 0.6633, + "step": 29765 + }, + { + "epoch": 0.4634653257256727, + "grad_norm": 3.7613565921783447, + "learning_rate": 5.647891709411514e-07, + "loss": 0.8116, + "step": 29770 + }, + { + "epoch": 0.4635431667276421, + "grad_norm": 3.897918224334717, + "learning_rate": 5.647072319365465e-07, + "loss": 0.7957, + "step": 29775 + }, + { + "epoch": 0.4636210077296115, + "grad_norm": 4.354563236236572, + "learning_rate": 5.646252929319413e-07, + "loss": 0.8552, + "step": 29780 + }, + { + "epoch": 0.46369884873158085, + "grad_norm": 4.190893173217773, + "learning_rate": 5.645433539273364e-07, + "loss": 0.6798, + "step": 29785 + }, + { + "epoch": 0.46377668973355024, + "grad_norm": 6.748042583465576, + "learning_rate": 5.644614149227315e-07, + "loss": 0.7531, + "step": 29790 + }, + { + "epoch": 0.46385453073551963, + "grad_norm": 3.1443727016448975, + "learning_rate": 5.643794759181265e-07, + "loss": 0.6858, + "step": 29795 + }, + { + "epoch": 0.463932371737489, + "grad_norm": 4.152675151824951, + "learning_rate": 5.642975369135215e-07, + "loss": 0.7706, + "step": 29800 + }, + { + "epoch": 0.46401021273945836, + "grad_norm": 3.2192883491516113, + "learning_rate": 5.642155979089166e-07, + "loss": 0.8153, + "step": 29805 + }, + { + "epoch": 0.46408805374142775, + "grad_norm": 3.3782994747161865, + "learning_rate": 5.641336589043117e-07, + "loss": 0.6968, + "step": 29810 + }, + { + "epoch": 0.46416589474339714, + "grad_norm": 2.9708139896392822, + "learning_rate": 5.640517198997067e-07, + "loss": 0.7539, + "step": 29815 + }, + { + "epoch": 0.46424373574536654, + "grad_norm": 2.698129177093506, + "learning_rate": 5.639697808951016e-07, + "loss": 0.7498, + "step": 29820 + }, + { + "epoch": 0.4643215767473359, + "grad_norm": 3.8263444900512695, + "learning_rate": 5.638878418904967e-07, + "loss": 0.6767, + "step": 29825 + }, + { + "epoch": 0.46439941774930527, + "grad_norm": 3.469433069229126, + "learning_rate": 5.638059028858917e-07, + "loss": 0.6213, + "step": 29830 + }, + { + "epoch": 0.46447725875127466, + "grad_norm": 10.169317245483398, + "learning_rate": 5.637239638812867e-07, + "loss": 0.7964, + "step": 29835 + }, + { + "epoch": 0.46455509975324405, + "grad_norm": 3.737732410430908, + "learning_rate": 5.636420248766818e-07, + "loss": 0.6369, + "step": 29840 + }, + { + "epoch": 0.4646329407552134, + "grad_norm": 4.020811080932617, + "learning_rate": 5.635600858720768e-07, + "loss": 0.7707, + "step": 29845 + }, + { + "epoch": 0.4647107817571828, + "grad_norm": 3.8647196292877197, + "learning_rate": 5.634781468674719e-07, + "loss": 0.68, + "step": 29850 + }, + { + "epoch": 0.46478862275915217, + "grad_norm": 8.446564674377441, + "learning_rate": 5.633962078628669e-07, + "loss": 0.8873, + "step": 29855 + }, + { + "epoch": 0.4648664637611215, + "grad_norm": 3.6714611053466797, + "learning_rate": 5.633142688582618e-07, + "loss": 0.6804, + "step": 29860 + }, + { + "epoch": 0.4649443047630909, + "grad_norm": 4.163028717041016, + "learning_rate": 5.632323298536569e-07, + "loss": 0.7578, + "step": 29865 + }, + { + "epoch": 0.4650221457650603, + "grad_norm": 3.9823803901672363, + "learning_rate": 5.63150390849052e-07, + "loss": 0.8175, + "step": 29870 + }, + { + "epoch": 0.4650999867670297, + "grad_norm": 3.8202476501464844, + "learning_rate": 5.630684518444469e-07, + "loss": 0.734, + "step": 29875 + }, + { + "epoch": 0.465177827768999, + "grad_norm": 8.29537296295166, + "learning_rate": 5.62986512839842e-07, + "loss": 0.7623, + "step": 29880 + }, + { + "epoch": 0.4652556687709684, + "grad_norm": 3.22029709815979, + "learning_rate": 5.629045738352371e-07, + "loss": 0.9522, + "step": 29885 + }, + { + "epoch": 0.4653335097729378, + "grad_norm": 3.0242271423339844, + "learning_rate": 5.62822634830632e-07, + "loss": 0.7163, + "step": 29890 + }, + { + "epoch": 0.4654113507749072, + "grad_norm": 3.0410945415496826, + "learning_rate": 5.627406958260271e-07, + "loss": 0.7557, + "step": 29895 + }, + { + "epoch": 0.46548919177687653, + "grad_norm": 4.94197940826416, + "learning_rate": 5.626587568214221e-07, + "loss": 0.6429, + "step": 29900 + }, + { + "epoch": 0.4655670327788459, + "grad_norm": 3.7214863300323486, + "learning_rate": 5.62576817816817e-07, + "loss": 0.7183, + "step": 29905 + }, + { + "epoch": 0.4656448737808153, + "grad_norm": 4.76154899597168, + "learning_rate": 5.624948788122121e-07, + "loss": 0.7022, + "step": 29910 + }, + { + "epoch": 0.4657227147827847, + "grad_norm": 3.7771337032318115, + "learning_rate": 5.624129398076072e-07, + "loss": 0.6775, + "step": 29915 + }, + { + "epoch": 0.46580055578475404, + "grad_norm": 2.8825271129608154, + "learning_rate": 5.623310008030022e-07, + "loss": 0.7057, + "step": 29920 + }, + { + "epoch": 0.46587839678672344, + "grad_norm": 3.221827507019043, + "learning_rate": 5.622490617983972e-07, + "loss": 0.8056, + "step": 29925 + }, + { + "epoch": 0.46595623778869283, + "grad_norm": 6.3704938888549805, + "learning_rate": 5.621671227937923e-07, + "loss": 0.8132, + "step": 29930 + }, + { + "epoch": 0.4660340787906622, + "grad_norm": 3.94346284866333, + "learning_rate": 5.620851837891874e-07, + "loss": 0.6742, + "step": 29935 + }, + { + "epoch": 0.46611191979263156, + "grad_norm": 2.9604880809783936, + "learning_rate": 5.620032447845824e-07, + "loss": 0.7277, + "step": 29940 + }, + { + "epoch": 0.46618976079460095, + "grad_norm": 6.1392083168029785, + "learning_rate": 5.619213057799773e-07, + "loss": 0.7164, + "step": 29945 + }, + { + "epoch": 0.46626760179657034, + "grad_norm": 2.6434457302093506, + "learning_rate": 5.618393667753724e-07, + "loss": 0.7129, + "step": 29950 + }, + { + "epoch": 0.4663454427985397, + "grad_norm": 2.7201597690582275, + "learning_rate": 5.617574277707674e-07, + "loss": 0.7211, + "step": 29955 + }, + { + "epoch": 0.46642328380050907, + "grad_norm": 3.1504993438720703, + "learning_rate": 5.616754887661625e-07, + "loss": 0.7811, + "step": 29960 + }, + { + "epoch": 0.46650112480247846, + "grad_norm": 4.6782097816467285, + "learning_rate": 5.615935497615575e-07, + "loss": 0.7411, + "step": 29965 + }, + { + "epoch": 0.46657896580444785, + "grad_norm": 3.7057347297668457, + "learning_rate": 5.615116107569525e-07, + "loss": 0.822, + "step": 29970 + }, + { + "epoch": 0.4666568068064172, + "grad_norm": 2.954636573791504, + "learning_rate": 5.614296717523476e-07, + "loss": 0.7547, + "step": 29975 + }, + { + "epoch": 0.4667346478083866, + "grad_norm": 3.5806500911712646, + "learning_rate": 5.613477327477426e-07, + "loss": 0.7754, + "step": 29980 + }, + { + "epoch": 0.466812488810356, + "grad_norm": 4.228501319885254, + "learning_rate": 5.612657937431375e-07, + "loss": 0.7847, + "step": 29985 + }, + { + "epoch": 0.46689032981232537, + "grad_norm": 3.845973014831543, + "learning_rate": 5.611838547385326e-07, + "loss": 0.7468, + "step": 29990 + }, + { + "epoch": 0.4669681708142947, + "grad_norm": 4.924877166748047, + "learning_rate": 5.611019157339277e-07, + "loss": 0.8489, + "step": 29995 + }, + { + "epoch": 0.4670460118162641, + "grad_norm": 2.9617180824279785, + "learning_rate": 5.610199767293226e-07, + "loss": 0.7707, + "step": 30000 + } + ], + "logging_steps": 5, + "max_steps": 64233, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.202070515725684e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}