{ "best_metric": 0.0021924919669198163, "best_model_checkpoint": "./results-cc/code-t5/codet5_ia3_official_0.0001/checkpoint-14718", "epoch": 1.0, "eval_steps": 500, "global_step": 14718, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003397200706617747, "grad_norm": 2.2195968627929688, "learning_rate": 9.999575349911673e-05, "loss": 10.0277, "step": 5 }, { "epoch": 0.0006794401413235494, "grad_norm": 1.40236496925354, "learning_rate": 9.999320559858677e-05, "loss": 10.112, "step": 10 }, { "epoch": 0.0010191602119853241, "grad_norm": 2.544349431991577, "learning_rate": 9.99889590977035e-05, "loss": 10.7856, "step": 15 }, { "epoch": 0.001358880282647099, "grad_norm": 1.8622939586639404, "learning_rate": 9.998471259682023e-05, "loss": 9.73, "step": 20 }, { "epoch": 0.0016986003533088735, "grad_norm": 2.0292515754699707, "learning_rate": 9.998046609593695e-05, "loss": 10.2627, "step": 25 }, { "epoch": 0.0020383204239706482, "grad_norm": 1.862714171409607, "learning_rate": 9.997621959505368e-05, "loss": 9.8577, "step": 30 }, { "epoch": 0.002378040494632423, "grad_norm": 3.0175716876983643, "learning_rate": 9.997197309417041e-05, "loss": 9.6583, "step": 35 }, { "epoch": 0.002717760565294198, "grad_norm": 2.3832008838653564, "learning_rate": 9.996772659328714e-05, "loss": 9.8661, "step": 40 }, { "epoch": 0.0030574806359559724, "grad_norm": 1.7646359205245972, "learning_rate": 9.996348009240387e-05, "loss": 10.5005, "step": 45 }, { "epoch": 0.003397200706617747, "grad_norm": 1.7444077730178833, "learning_rate": 9.996008289169725e-05, "loss": 9.5758, "step": 50 }, { "epoch": 0.0037369207772795215, "grad_norm": 1.9149781465530396, "learning_rate": 9.995583639081398e-05, "loss": 9.735, "step": 55 }, { "epoch": 0.0040766408479412965, "grad_norm": 1.6488384008407593, "learning_rate": 9.995158988993069e-05, "loss": 9.8686, "step": 60 }, { "epoch": 0.0044163609186030715, "grad_norm": 2.184131622314453, "learning_rate": 9.994734338904743e-05, "loss": 10.1822, "step": 65 }, { "epoch": 0.004756080989264846, "grad_norm": 2.6063928604125977, "learning_rate": 9.994309688816416e-05, "loss": 9.9386, "step": 70 }, { "epoch": 0.005095801059926621, "grad_norm": 1.925402045249939, "learning_rate": 9.993885038728087e-05, "loss": 9.3722, "step": 75 }, { "epoch": 0.005435521130588396, "grad_norm": 3.882549524307251, "learning_rate": 9.993545318657427e-05, "loss": 9.4705, "step": 80 }, { "epoch": 0.00577524120125017, "grad_norm": 2.1654701232910156, "learning_rate": 9.9931206685691e-05, "loss": 9.5574, "step": 85 }, { "epoch": 0.006114961271911945, "grad_norm": 2.0329928398132324, "learning_rate": 9.992696018480773e-05, "loss": 9.7831, "step": 90 }, { "epoch": 0.006454681342573719, "grad_norm": 2.408450126647949, "learning_rate": 9.992271368392445e-05, "loss": 10.2149, "step": 95 }, { "epoch": 0.006794401413235494, "grad_norm": 2.039151430130005, "learning_rate": 9.991846718304117e-05, "loss": 10.2672, "step": 100 }, { "epoch": 0.007134121483897269, "grad_norm": 2.3165321350097656, "learning_rate": 9.991422068215791e-05, "loss": 9.4804, "step": 105 }, { "epoch": 0.007473841554559043, "grad_norm": 2.117964029312134, "learning_rate": 9.990997418127464e-05, "loss": 10.1318, "step": 110 }, { "epoch": 0.007813561625220818, "grad_norm": 2.1264026165008545, "learning_rate": 9.990572768039135e-05, "loss": 9.6686, "step": 115 }, { "epoch": 0.008153281695882593, "grad_norm": 1.2933008670806885, "learning_rate": 9.99014811795081e-05, "loss": 9.4348, "step": 120 }, { "epoch": 0.008493001766544368, "grad_norm": 2.1963112354278564, "learning_rate": 9.989723467862482e-05, "loss": 9.7095, "step": 125 }, { "epoch": 0.008832721837206143, "grad_norm": 1.4545302391052246, "learning_rate": 9.989298817774154e-05, "loss": 9.5571, "step": 130 }, { "epoch": 0.009172441907867916, "grad_norm": 1.4792237281799316, "learning_rate": 9.988874167685828e-05, "loss": 8.9859, "step": 135 }, { "epoch": 0.009512161978529691, "grad_norm": 1.2513773441314697, "learning_rate": 9.9884495175975e-05, "loss": 9.1648, "step": 140 }, { "epoch": 0.009851882049191466, "grad_norm": 2.559937000274658, "learning_rate": 9.988024867509172e-05, "loss": 9.7622, "step": 145 }, { "epoch": 0.010191602119853241, "grad_norm": 1.9447046518325806, "learning_rate": 9.987600217420846e-05, "loss": 9.8491, "step": 150 }, { "epoch": 0.010531322190515016, "grad_norm": 1.3316339254379272, "learning_rate": 9.987175567332519e-05, "loss": 9.0772, "step": 155 }, { "epoch": 0.010871042261176791, "grad_norm": 1.4368464946746826, "learning_rate": 9.98675091724419e-05, "loss": 8.9018, "step": 160 }, { "epoch": 0.011210762331838564, "grad_norm": 1.5808727741241455, "learning_rate": 9.986326267155865e-05, "loss": 8.9733, "step": 165 }, { "epoch": 0.01155048240250034, "grad_norm": 1.7572826147079468, "learning_rate": 9.985901617067537e-05, "loss": 8.7883, "step": 170 }, { "epoch": 0.011890202473162114, "grad_norm": 1.6672322750091553, "learning_rate": 9.985476966979209e-05, "loss": 8.8474, "step": 175 }, { "epoch": 0.01222992254382389, "grad_norm": 1.3700741529464722, "learning_rate": 9.985052316890883e-05, "loss": 9.4713, "step": 180 }, { "epoch": 0.012569642614485664, "grad_norm": 1.9305622577667236, "learning_rate": 9.984627666802555e-05, "loss": 9.2708, "step": 185 }, { "epoch": 0.012909362685147438, "grad_norm": 1.964107871055603, "learning_rate": 9.984203016714227e-05, "loss": 8.8052, "step": 190 }, { "epoch": 0.013249082755809213, "grad_norm": 1.9907019138336182, "learning_rate": 9.983778366625901e-05, "loss": 9.6229, "step": 195 }, { "epoch": 0.013588802826470988, "grad_norm": 1.7012939453125, "learning_rate": 9.983353716537573e-05, "loss": 8.9995, "step": 200 }, { "epoch": 0.013928522897132763, "grad_norm": 1.7597671747207642, "learning_rate": 9.982929066449246e-05, "loss": 8.8808, "step": 205 }, { "epoch": 0.014268242967794538, "grad_norm": 1.6379801034927368, "learning_rate": 9.98250441636092e-05, "loss": 8.8094, "step": 210 }, { "epoch": 0.014607963038456313, "grad_norm": 1.9670891761779785, "learning_rate": 9.982079766272591e-05, "loss": 9.522, "step": 215 }, { "epoch": 0.014947683109118086, "grad_norm": 1.9001445770263672, "learning_rate": 9.981655116184264e-05, "loss": 9.3781, "step": 220 }, { "epoch": 0.015287403179779861, "grad_norm": 1.9136974811553955, "learning_rate": 9.981230466095938e-05, "loss": 8.8219, "step": 225 }, { "epoch": 0.015627123250441636, "grad_norm": 2.4746735095977783, "learning_rate": 9.98080581600761e-05, "loss": 8.9191, "step": 230 }, { "epoch": 0.01596684332110341, "grad_norm": 2.372750759124756, "learning_rate": 9.980381165919283e-05, "loss": 9.1705, "step": 235 }, { "epoch": 0.016306563391765186, "grad_norm": 1.418637752532959, "learning_rate": 9.980041445848621e-05, "loss": 8.6098, "step": 240 }, { "epoch": 0.01664628346242696, "grad_norm": 2.103688955307007, "learning_rate": 9.979616795760294e-05, "loss": 9.1948, "step": 245 }, { "epoch": 0.016986003533088736, "grad_norm": 2.14066481590271, "learning_rate": 9.979192145671968e-05, "loss": 9.0493, "step": 250 }, { "epoch": 0.01732572360375051, "grad_norm": 1.4272230863571167, "learning_rate": 9.978767495583639e-05, "loss": 8.7393, "step": 255 }, { "epoch": 0.017665443674412286, "grad_norm": 1.5108071565628052, "learning_rate": 9.978342845495312e-05, "loss": 9.3101, "step": 260 }, { "epoch": 0.01800516374507406, "grad_norm": 2.017267942428589, "learning_rate": 9.977918195406986e-05, "loss": 9.0584, "step": 265 }, { "epoch": 0.018344883815735832, "grad_norm": 1.4429893493652344, "learning_rate": 9.977493545318658e-05, "loss": 8.5374, "step": 270 }, { "epoch": 0.01868460388639761, "grad_norm": 1.7082629203796387, "learning_rate": 9.97706889523033e-05, "loss": 8.2905, "step": 275 }, { "epoch": 0.019024323957059382, "grad_norm": 1.9540777206420898, "learning_rate": 9.976644245142005e-05, "loss": 8.9633, "step": 280 }, { "epoch": 0.01936404402772116, "grad_norm": 1.4489926099777222, "learning_rate": 9.976219595053676e-05, "loss": 8.7871, "step": 285 }, { "epoch": 0.019703764098382932, "grad_norm": 1.5926896333694458, "learning_rate": 9.975794944965349e-05, "loss": 8.5374, "step": 290 }, { "epoch": 0.020043484169044706, "grad_norm": 1.8608131408691406, "learning_rate": 9.975370294877022e-05, "loss": 8.4816, "step": 295 }, { "epoch": 0.020383204239706482, "grad_norm": 1.6402130126953125, "learning_rate": 9.974945644788694e-05, "loss": 8.3059, "step": 300 }, { "epoch": 0.020722924310368256, "grad_norm": 1.0616756677627563, "learning_rate": 9.974520994700367e-05, "loss": 8.5302, "step": 305 }, { "epoch": 0.021062644381030032, "grad_norm": 2.0486464500427246, "learning_rate": 9.97409634461204e-05, "loss": 8.6573, "step": 310 }, { "epoch": 0.021402364451691806, "grad_norm": 2.764594316482544, "learning_rate": 9.973671694523713e-05, "loss": 9.0297, "step": 315 }, { "epoch": 0.021742084522353582, "grad_norm": 1.457748293876648, "learning_rate": 9.973247044435386e-05, "loss": 8.6019, "step": 320 }, { "epoch": 0.022081804593015356, "grad_norm": 1.5769116878509521, "learning_rate": 9.972822394347058e-05, "loss": 8.2577, "step": 325 }, { "epoch": 0.02242152466367713, "grad_norm": 1.7383826971054077, "learning_rate": 9.972397744258731e-05, "loss": 8.1914, "step": 330 }, { "epoch": 0.022761244734338906, "grad_norm": 1.6818331480026245, "learning_rate": 9.971973094170404e-05, "loss": 8.8289, "step": 335 }, { "epoch": 0.02310096480500068, "grad_norm": 1.429895281791687, "learning_rate": 9.971548444082077e-05, "loss": 8.3524, "step": 340 }, { "epoch": 0.023440684875662456, "grad_norm": 1.2669081687927246, "learning_rate": 9.97112379399375e-05, "loss": 8.289, "step": 345 }, { "epoch": 0.02378040494632423, "grad_norm": 1.399524211883545, "learning_rate": 9.970699143905422e-05, "loss": 7.8466, "step": 350 }, { "epoch": 0.024120125016986002, "grad_norm": 1.6960299015045166, "learning_rate": 9.970274493817095e-05, "loss": 8.6849, "step": 355 }, { "epoch": 0.02445984508764778, "grad_norm": 1.7302825450897217, "learning_rate": 9.969849843728768e-05, "loss": 8.6366, "step": 360 }, { "epoch": 0.024799565158309552, "grad_norm": 2.6233043670654297, "learning_rate": 9.969425193640441e-05, "loss": 8.3961, "step": 365 }, { "epoch": 0.02513928522897133, "grad_norm": 1.480035424232483, "learning_rate": 9.969000543552114e-05, "loss": 8.6224, "step": 370 }, { "epoch": 0.025479005299633102, "grad_norm": 1.156540870666504, "learning_rate": 9.968575893463786e-05, "loss": 7.9226, "step": 375 }, { "epoch": 0.025818725370294875, "grad_norm": 1.7962318658828735, "learning_rate": 9.968151243375459e-05, "loss": 8.5519, "step": 380 }, { "epoch": 0.026158445440956652, "grad_norm": 1.8737194538116455, "learning_rate": 9.967726593287132e-05, "loss": 8.4364, "step": 385 }, { "epoch": 0.026498165511618425, "grad_norm": 1.6001181602478027, "learning_rate": 9.967301943198805e-05, "loss": 7.8641, "step": 390 }, { "epoch": 0.026837885582280202, "grad_norm": 1.6181342601776123, "learning_rate": 9.966877293110478e-05, "loss": 7.3995, "step": 395 }, { "epoch": 0.027177605652941975, "grad_norm": 1.5771849155426025, "learning_rate": 9.96645264302215e-05, "loss": 7.8526, "step": 400 }, { "epoch": 0.027517325723603752, "grad_norm": 1.1884416341781616, "learning_rate": 9.966027992933823e-05, "loss": 8.117, "step": 405 }, { "epoch": 0.027857045794265525, "grad_norm": 2.015026092529297, "learning_rate": 9.965603342845496e-05, "loss": 8.5386, "step": 410 }, { "epoch": 0.0281967658649273, "grad_norm": 1.2226543426513672, "learning_rate": 9.965178692757169e-05, "loss": 8.1885, "step": 415 }, { "epoch": 0.028536485935589075, "grad_norm": 0.8924500942230225, "learning_rate": 9.964754042668842e-05, "loss": 7.9768, "step": 420 }, { "epoch": 0.02887620600625085, "grad_norm": 1.3788869380950928, "learning_rate": 9.964329392580513e-05, "loss": 8.2231, "step": 425 }, { "epoch": 0.029215926076912625, "grad_norm": 1.4598475694656372, "learning_rate": 9.963904742492187e-05, "loss": 7.9246, "step": 430 }, { "epoch": 0.0295556461475744, "grad_norm": 1.8520469665527344, "learning_rate": 9.96348009240386e-05, "loss": 8.0527, "step": 435 }, { "epoch": 0.029895366218236172, "grad_norm": 1.4544007778167725, "learning_rate": 9.963055442315531e-05, "loss": 7.9365, "step": 440 }, { "epoch": 0.03023508628889795, "grad_norm": 1.632197380065918, "learning_rate": 9.962630792227206e-05, "loss": 7.6173, "step": 445 }, { "epoch": 0.030574806359559722, "grad_norm": 1.7875256538391113, "learning_rate": 9.962206142138878e-05, "loss": 7.8095, "step": 450 }, { "epoch": 0.0309145264302215, "grad_norm": 1.496638298034668, "learning_rate": 9.961866422068217e-05, "loss": 8.3371, "step": 455 }, { "epoch": 0.03125424650088327, "grad_norm": 1.915732741355896, "learning_rate": 9.96144177197989e-05, "loss": 8.5992, "step": 460 }, { "epoch": 0.03159396657154505, "grad_norm": 1.1497036218643188, "learning_rate": 9.961017121891561e-05, "loss": 7.7974, "step": 465 }, { "epoch": 0.03193368664220682, "grad_norm": 1.5241578817367554, "learning_rate": 9.960592471803235e-05, "loss": 7.9306, "step": 470 }, { "epoch": 0.032273406712868595, "grad_norm": 1.8431288003921509, "learning_rate": 9.960167821714908e-05, "loss": 8.0421, "step": 475 }, { "epoch": 0.03261312678353037, "grad_norm": 1.520870566368103, "learning_rate": 9.959743171626579e-05, "loss": 7.443, "step": 480 }, { "epoch": 0.03295284685419215, "grad_norm": 1.9101468324661255, "learning_rate": 9.959318521538253e-05, "loss": 7.5383, "step": 485 }, { "epoch": 0.03329256692485392, "grad_norm": 1.2860801219940186, "learning_rate": 9.958893871449926e-05, "loss": 7.4753, "step": 490 }, { "epoch": 0.033632286995515695, "grad_norm": 1.3139928579330444, "learning_rate": 9.958469221361598e-05, "loss": 8.0711, "step": 495 }, { "epoch": 0.03397200706617747, "grad_norm": 2.382516622543335, "learning_rate": 9.958044571273272e-05, "loss": 7.9498, "step": 500 }, { "epoch": 0.03431172713683924, "grad_norm": 1.4085739850997925, "learning_rate": 9.957619921184945e-05, "loss": 7.6138, "step": 505 }, { "epoch": 0.03465144720750102, "grad_norm": 1.34367835521698, "learning_rate": 9.957195271096616e-05, "loss": 7.7689, "step": 510 }, { "epoch": 0.034991167278162795, "grad_norm": 1.3598331212997437, "learning_rate": 9.95677062100829e-05, "loss": 7.4086, "step": 515 }, { "epoch": 0.03533088734882457, "grad_norm": 1.3421567678451538, "learning_rate": 9.956345970919963e-05, "loss": 7.8855, "step": 520 }, { "epoch": 0.03567060741948634, "grad_norm": 1.5375795364379883, "learning_rate": 9.955921320831634e-05, "loss": 7.642, "step": 525 }, { "epoch": 0.03601032749014812, "grad_norm": 1.6586527824401855, "learning_rate": 9.955496670743309e-05, "loss": 7.1819, "step": 530 }, { "epoch": 0.036350047560809895, "grad_norm": 1.179348349571228, "learning_rate": 9.955072020654981e-05, "loss": 7.4156, "step": 535 }, { "epoch": 0.036689767631471665, "grad_norm": 2.1848621368408203, "learning_rate": 9.954647370566653e-05, "loss": 7.4587, "step": 540 }, { "epoch": 0.03702948770213344, "grad_norm": 1.3930761814117432, "learning_rate": 9.954222720478327e-05, "loss": 7.3459, "step": 545 }, { "epoch": 0.03736920777279522, "grad_norm": 1.6147781610488892, "learning_rate": 9.953798070389998e-05, "loss": 7.913, "step": 550 }, { "epoch": 0.03770892784345699, "grad_norm": 1.2318421602249146, "learning_rate": 9.953373420301671e-05, "loss": 7.4613, "step": 555 }, { "epoch": 0.038048647914118765, "grad_norm": 14.445450782775879, "learning_rate": 9.952948770213345e-05, "loss": 7.4797, "step": 560 }, { "epoch": 0.03838836798478054, "grad_norm": 1.447785496711731, "learning_rate": 9.952524120125017e-05, "loss": 7.663, "step": 565 }, { "epoch": 0.03872808805544232, "grad_norm": 1.2133177518844604, "learning_rate": 9.95209947003669e-05, "loss": 7.5911, "step": 570 }, { "epoch": 0.03906780812610409, "grad_norm": 1.6101973056793213, "learning_rate": 9.951674819948364e-05, "loss": 7.7617, "step": 575 }, { "epoch": 0.039407528196765865, "grad_norm": 1.5707918405532837, "learning_rate": 9.951250169860035e-05, "loss": 7.2746, "step": 580 }, { "epoch": 0.03974724826742764, "grad_norm": 1.4747017621994019, "learning_rate": 9.950825519771708e-05, "loss": 7.559, "step": 585 }, { "epoch": 0.04008696833808941, "grad_norm": 1.2751151323318481, "learning_rate": 9.950400869683382e-05, "loss": 7.0742, "step": 590 }, { "epoch": 0.04042668840875119, "grad_norm": 1.4686486721038818, "learning_rate": 9.949976219595054e-05, "loss": 7.179, "step": 595 }, { "epoch": 0.040766408479412965, "grad_norm": 1.333364725112915, "learning_rate": 9.949551569506727e-05, "loss": 7.3349, "step": 600 }, { "epoch": 0.04110612855007474, "grad_norm": 1.2560040950775146, "learning_rate": 9.9491269194184e-05, "loss": 7.2152, "step": 605 }, { "epoch": 0.04144584862073651, "grad_norm": 5.9243011474609375, "learning_rate": 9.948702269330072e-05, "loss": 7.253, "step": 610 }, { "epoch": 0.04178556869139829, "grad_norm": 1.3705462217330933, "learning_rate": 9.948277619241745e-05, "loss": 7.2954, "step": 615 }, { "epoch": 0.042125288762060065, "grad_norm": 1.3280870914459229, "learning_rate": 9.947852969153418e-05, "loss": 7.0023, "step": 620 }, { "epoch": 0.042465008832721834, "grad_norm": 1.5480890274047852, "learning_rate": 9.94742831906509e-05, "loss": 6.6209, "step": 625 }, { "epoch": 0.04280472890338361, "grad_norm": 1.4617500305175781, "learning_rate": 9.947003668976763e-05, "loss": 6.6055, "step": 630 }, { "epoch": 0.04314444897404539, "grad_norm": 1.5756878852844238, "learning_rate": 9.946579018888436e-05, "loss": 7.0135, "step": 635 }, { "epoch": 0.043484169044707165, "grad_norm": 1.4289640188217163, "learning_rate": 9.946154368800109e-05, "loss": 7.1441, "step": 640 }, { "epoch": 0.043823889115368934, "grad_norm": 1.3657900094985962, "learning_rate": 9.945729718711782e-05, "loss": 7.5154, "step": 645 }, { "epoch": 0.04416360918603071, "grad_norm": 1.971498966217041, "learning_rate": 9.94538999864112e-05, "loss": 7.2665, "step": 650 }, { "epoch": 0.04450332925669249, "grad_norm": 1.4446492195129395, "learning_rate": 9.944965348552793e-05, "loss": 7.0674, "step": 655 }, { "epoch": 0.04484304932735426, "grad_norm": 1.0143150091171265, "learning_rate": 9.944540698464467e-05, "loss": 6.7426, "step": 660 }, { "epoch": 0.045182769398016034, "grad_norm": 1.3732986450195312, "learning_rate": 9.944116048376138e-05, "loss": 7.0236, "step": 665 }, { "epoch": 0.04552248946867781, "grad_norm": 1.5511842966079712, "learning_rate": 9.943691398287811e-05, "loss": 7.2107, "step": 670 }, { "epoch": 0.04586220953933958, "grad_norm": 1.4255778789520264, "learning_rate": 9.943266748199484e-05, "loss": 6.817, "step": 675 }, { "epoch": 0.04620192961000136, "grad_norm": 1.0669182538986206, "learning_rate": 9.942842098111157e-05, "loss": 6.9347, "step": 680 }, { "epoch": 0.046541649680663134, "grad_norm": 1.512604832649231, "learning_rate": 9.94241744802283e-05, "loss": 7.0817, "step": 685 }, { "epoch": 0.04688136975132491, "grad_norm": 1.3859061002731323, "learning_rate": 9.941992797934502e-05, "loss": 7.1805, "step": 690 }, { "epoch": 0.04722108982198668, "grad_norm": 3.252913236618042, "learning_rate": 9.941568147846175e-05, "loss": 6.7985, "step": 695 }, { "epoch": 0.04756080989264846, "grad_norm": 1.4156177043914795, "learning_rate": 9.941143497757848e-05, "loss": 6.9955, "step": 700 }, { "epoch": 0.047900529963310234, "grad_norm": 1.5510213375091553, "learning_rate": 9.940718847669521e-05, "loss": 7.0235, "step": 705 }, { "epoch": 0.048240250033972004, "grad_norm": 1.3725285530090332, "learning_rate": 9.940294197581194e-05, "loss": 6.9692, "step": 710 }, { "epoch": 0.04857997010463378, "grad_norm": 1.4986199140548706, "learning_rate": 9.939869547492866e-05, "loss": 6.6778, "step": 715 }, { "epoch": 0.04891969017529556, "grad_norm": 1.2320705652236938, "learning_rate": 9.939444897404539e-05, "loss": 6.8953, "step": 720 }, { "epoch": 0.049259410245957334, "grad_norm": 0.9118322134017944, "learning_rate": 9.939020247316212e-05, "loss": 6.7496, "step": 725 }, { "epoch": 0.049599130316619104, "grad_norm": 3.4886631965637207, "learning_rate": 9.938595597227885e-05, "loss": 7.0242, "step": 730 }, { "epoch": 0.04993885038728088, "grad_norm": 0.9548838138580322, "learning_rate": 9.938170947139558e-05, "loss": 6.6306, "step": 735 }, { "epoch": 0.05027857045794266, "grad_norm": 0.8389047980308533, "learning_rate": 9.93774629705123e-05, "loss": 7.134, "step": 740 }, { "epoch": 0.05061829052860443, "grad_norm": 5.4491801261901855, "learning_rate": 9.937321646962903e-05, "loss": 6.698, "step": 745 }, { "epoch": 0.050958010599266204, "grad_norm": 1.3063551187515259, "learning_rate": 9.936896996874576e-05, "loss": 7.0113, "step": 750 }, { "epoch": 0.05129773066992798, "grad_norm": 1.470941424369812, "learning_rate": 9.936472346786249e-05, "loss": 6.606, "step": 755 }, { "epoch": 0.05163745074058975, "grad_norm": 1.9392439126968384, "learning_rate": 9.936047696697922e-05, "loss": 7.07, "step": 760 }, { "epoch": 0.05197717081125153, "grad_norm": 0.9688730239868164, "learning_rate": 9.935623046609594e-05, "loss": 6.5451, "step": 765 }, { "epoch": 0.052316890881913304, "grad_norm": 1.4289032220840454, "learning_rate": 9.935198396521267e-05, "loss": 6.8784, "step": 770 }, { "epoch": 0.05265661095257508, "grad_norm": 1.4620697498321533, "learning_rate": 9.93477374643294e-05, "loss": 6.5151, "step": 775 }, { "epoch": 0.05299633102323685, "grad_norm": 2.3521432876586914, "learning_rate": 9.934349096344613e-05, "loss": 6.5384, "step": 780 }, { "epoch": 0.05333605109389863, "grad_norm": 3.160248041152954, "learning_rate": 9.933924446256286e-05, "loss": 6.7476, "step": 785 }, { "epoch": 0.053675771164560404, "grad_norm": 1.3147598505020142, "learning_rate": 9.933499796167957e-05, "loss": 6.8376, "step": 790 }, { "epoch": 0.054015491235222174, "grad_norm": 1.6566650867462158, "learning_rate": 9.933075146079631e-05, "loss": 6.2506, "step": 795 }, { "epoch": 0.05435521130588395, "grad_norm": 0.9440861344337463, "learning_rate": 9.932650495991304e-05, "loss": 6.9212, "step": 800 }, { "epoch": 0.05469493137654573, "grad_norm": 1.1842477321624756, "learning_rate": 9.932225845902975e-05, "loss": 6.6892, "step": 805 }, { "epoch": 0.055034651447207504, "grad_norm": 1.1736949682235718, "learning_rate": 9.93180119581465e-05, "loss": 6.9224, "step": 810 }, { "epoch": 0.055374371517869274, "grad_norm": 0.7181898951530457, "learning_rate": 9.931376545726322e-05, "loss": 6.5164, "step": 815 }, { "epoch": 0.05571409158853105, "grad_norm": 0.9374647736549377, "learning_rate": 9.930951895637994e-05, "loss": 6.5026, "step": 820 }, { "epoch": 0.05605381165919283, "grad_norm": 1.2754137516021729, "learning_rate": 9.930527245549668e-05, "loss": 6.4676, "step": 825 }, { "epoch": 0.0563935317298546, "grad_norm": 1.0159765481948853, "learning_rate": 9.930102595461341e-05, "loss": 6.3455, "step": 830 }, { "epoch": 0.056733251800516374, "grad_norm": 1.0118136405944824, "learning_rate": 9.929677945373012e-05, "loss": 6.4984, "step": 835 }, { "epoch": 0.05707297187117815, "grad_norm": 0.9740552306175232, "learning_rate": 9.929253295284686e-05, "loss": 6.665, "step": 840 }, { "epoch": 0.05741269194183993, "grad_norm": 2.6464507579803467, "learning_rate": 9.928828645196359e-05, "loss": 6.5277, "step": 845 }, { "epoch": 0.0577524120125017, "grad_norm": 1.1687380075454712, "learning_rate": 9.92840399510803e-05, "loss": 6.5141, "step": 850 }, { "epoch": 0.058092132083163474, "grad_norm": 1.0684833526611328, "learning_rate": 9.927979345019705e-05, "loss": 6.2975, "step": 855 }, { "epoch": 0.05843185215382525, "grad_norm": 3.8520267009735107, "learning_rate": 9.927554694931378e-05, "loss": 5.9187, "step": 860 }, { "epoch": 0.05877157222448702, "grad_norm": 1.048731803894043, "learning_rate": 9.927130044843049e-05, "loss": 6.3076, "step": 865 }, { "epoch": 0.0591112922951488, "grad_norm": 0.8034812808036804, "learning_rate": 9.926705394754723e-05, "loss": 6.5594, "step": 870 }, { "epoch": 0.059451012365810574, "grad_norm": 0.9210667610168457, "learning_rate": 9.926280744666395e-05, "loss": 6.0014, "step": 875 }, { "epoch": 0.059790732436472344, "grad_norm": 1.0861904621124268, "learning_rate": 9.925856094578067e-05, "loss": 6.2432, "step": 880 }, { "epoch": 0.06013045250713412, "grad_norm": 0.8701607584953308, "learning_rate": 9.925431444489742e-05, "loss": 6.1582, "step": 885 }, { "epoch": 0.0604701725777959, "grad_norm": 0.691939651966095, "learning_rate": 9.925006794401413e-05, "loss": 6.487, "step": 890 }, { "epoch": 0.060809892648457674, "grad_norm": 1.1983147859573364, "learning_rate": 9.924582144313086e-05, "loss": 6.1766, "step": 895 }, { "epoch": 0.061149612719119444, "grad_norm": 1.1613506078720093, "learning_rate": 9.92415749422476e-05, "loss": 6.5303, "step": 900 }, { "epoch": 0.06148933278978122, "grad_norm": 0.8394651412963867, "learning_rate": 9.923732844136431e-05, "loss": 6.1502, "step": 905 }, { "epoch": 0.061829052860443, "grad_norm": 1.2242004871368408, "learning_rate": 9.923308194048104e-05, "loss": 6.2054, "step": 910 }, { "epoch": 0.06216877293110477, "grad_norm": 1.1255033016204834, "learning_rate": 9.922883543959778e-05, "loss": 6.4181, "step": 915 }, { "epoch": 0.06250849300176654, "grad_norm": 0.7849110960960388, "learning_rate": 9.92245889387145e-05, "loss": 6.1411, "step": 920 }, { "epoch": 0.06284821307242831, "grad_norm": 1.0032676458358765, "learning_rate": 9.922034243783123e-05, "loss": 6.4892, "step": 925 }, { "epoch": 0.0631879331430901, "grad_norm": 1.127551555633545, "learning_rate": 9.921609593694797e-05, "loss": 6.3768, "step": 930 }, { "epoch": 0.06352765321375187, "grad_norm": 1.0425925254821777, "learning_rate": 9.921184943606468e-05, "loss": 6.4448, "step": 935 }, { "epoch": 0.06386737328441364, "grad_norm": 1.1642504930496216, "learning_rate": 9.920760293518142e-05, "loss": 5.7809, "step": 940 }, { "epoch": 0.06420709335507542, "grad_norm": 1.8521403074264526, "learning_rate": 9.920335643429814e-05, "loss": 5.9249, "step": 945 }, { "epoch": 0.06454681342573719, "grad_norm": 1.073219895362854, "learning_rate": 9.919910993341487e-05, "loss": 6.0477, "step": 950 }, { "epoch": 0.06488653349639897, "grad_norm": 1.2109575271606445, "learning_rate": 9.919486343253161e-05, "loss": 6.0364, "step": 955 }, { "epoch": 0.06522625356706074, "grad_norm": 1.1780409812927246, "learning_rate": 9.919061693164832e-05, "loss": 6.4147, "step": 960 }, { "epoch": 0.06556597363772251, "grad_norm": 0.8810535073280334, "learning_rate": 9.918637043076505e-05, "loss": 6.0036, "step": 965 }, { "epoch": 0.0659056937083843, "grad_norm": 0.7648366093635559, "learning_rate": 9.918212392988179e-05, "loss": 6.2567, "step": 970 }, { "epoch": 0.06624541377904607, "grad_norm": 2.192458391189575, "learning_rate": 9.91778774289985e-05, "loss": 6.144, "step": 975 }, { "epoch": 0.06658513384970784, "grad_norm": 1.2390516996383667, "learning_rate": 9.917363092811523e-05, "loss": 6.0287, "step": 980 }, { "epoch": 0.06692485392036962, "grad_norm": 0.8258079886436462, "learning_rate": 9.916938442723198e-05, "loss": 6.2043, "step": 985 }, { "epoch": 0.06726457399103139, "grad_norm": 0.9516924023628235, "learning_rate": 9.916513792634869e-05, "loss": 6.0747, "step": 990 }, { "epoch": 0.06760429406169316, "grad_norm": 0.856916069984436, "learning_rate": 9.916089142546542e-05, "loss": 5.926, "step": 995 }, { "epoch": 0.06794401413235494, "grad_norm": 0.8324723839759827, "learning_rate": 9.915664492458216e-05, "loss": 6.2707, "step": 1000 }, { "epoch": 0.06828373420301671, "grad_norm": 0.7908216714859009, "learning_rate": 9.915239842369887e-05, "loss": 5.967, "step": 1005 }, { "epoch": 0.06862345427367848, "grad_norm": 0.9094476103782654, "learning_rate": 9.91481519228156e-05, "loss": 5.6132, "step": 1010 }, { "epoch": 0.06896317434434027, "grad_norm": 0.9734240770339966, "learning_rate": 9.914390542193233e-05, "loss": 5.9794, "step": 1015 }, { "epoch": 0.06930289441500204, "grad_norm": 0.8310399055480957, "learning_rate": 9.913965892104906e-05, "loss": 6.0146, "step": 1020 }, { "epoch": 0.0696426144856638, "grad_norm": 0.9436420798301697, "learning_rate": 9.913541242016579e-05, "loss": 5.9812, "step": 1025 }, { "epoch": 0.06998233455632559, "grad_norm": 1.2283395528793335, "learning_rate": 9.913116591928251e-05, "loss": 5.9537, "step": 1030 }, { "epoch": 0.07032205462698736, "grad_norm": 0.8751355409622192, "learning_rate": 9.912691941839924e-05, "loss": 5.9992, "step": 1035 }, { "epoch": 0.07066177469764914, "grad_norm": 0.6706697344779968, "learning_rate": 9.912267291751597e-05, "loss": 6.0753, "step": 1040 }, { "epoch": 0.07100149476831091, "grad_norm": 0.7029627561569214, "learning_rate": 9.91184264166327e-05, "loss": 6.0136, "step": 1045 }, { "epoch": 0.07134121483897268, "grad_norm": 0.6171499490737915, "learning_rate": 9.911417991574943e-05, "loss": 6.0223, "step": 1050 }, { "epoch": 0.07168093490963447, "grad_norm": 2.5934255123138428, "learning_rate": 9.910993341486615e-05, "loss": 5.807, "step": 1055 }, { "epoch": 0.07202065498029624, "grad_norm": 0.9291547536849976, "learning_rate": 9.910568691398288e-05, "loss": 5.7086, "step": 1060 }, { "epoch": 0.072360375050958, "grad_norm": 1.4394763708114624, "learning_rate": 9.910144041309961e-05, "loss": 5.5023, "step": 1065 }, { "epoch": 0.07270009512161979, "grad_norm": 0.6298092603683472, "learning_rate": 9.909719391221634e-05, "loss": 5.9972, "step": 1070 }, { "epoch": 0.07303981519228156, "grad_norm": 0.6151909232139587, "learning_rate": 9.909294741133307e-05, "loss": 5.6389, "step": 1075 }, { "epoch": 0.07337953526294333, "grad_norm": 1.1861008405685425, "learning_rate": 9.90887009104498e-05, "loss": 6.2689, "step": 1080 }, { "epoch": 0.07371925533360511, "grad_norm": 0.7876234650611877, "learning_rate": 9.908445440956652e-05, "loss": 5.5448, "step": 1085 }, { "epoch": 0.07405897540426688, "grad_norm": 0.592897891998291, "learning_rate": 9.908020790868325e-05, "loss": 5.8057, "step": 1090 }, { "epoch": 0.07439869547492865, "grad_norm": 0.9189316034317017, "learning_rate": 9.907596140779998e-05, "loss": 6.0782, "step": 1095 }, { "epoch": 0.07473841554559044, "grad_norm": 0.6605345010757446, "learning_rate": 9.90717149069167e-05, "loss": 5.6442, "step": 1100 }, { "epoch": 0.0750781356162522, "grad_norm": 0.6724756956100464, "learning_rate": 9.906746840603343e-05, "loss": 6.2757, "step": 1105 }, { "epoch": 0.07541785568691398, "grad_norm": 0.8074867725372314, "learning_rate": 9.906322190515016e-05, "loss": 5.6868, "step": 1110 }, { "epoch": 0.07575757575757576, "grad_norm": 2.2021851539611816, "learning_rate": 9.905897540426689e-05, "loss": 5.755, "step": 1115 }, { "epoch": 0.07609729582823753, "grad_norm": 0.7941934466362, "learning_rate": 9.905472890338362e-05, "loss": 5.6318, "step": 1120 }, { "epoch": 0.07643701589889931, "grad_norm": 0.9947513937950134, "learning_rate": 9.905048240250035e-05, "loss": 5.9247, "step": 1125 }, { "epoch": 0.07677673596956108, "grad_norm": 0.6511673927307129, "learning_rate": 9.904623590161707e-05, "loss": 5.6326, "step": 1130 }, { "epoch": 0.07711645604022285, "grad_norm": 0.6497818231582642, "learning_rate": 9.90419894007338e-05, "loss": 5.8753, "step": 1135 }, { "epoch": 0.07745617611088464, "grad_norm": 0.5531424880027771, "learning_rate": 9.903774289985053e-05, "loss": 5.2715, "step": 1140 }, { "epoch": 0.0777958961815464, "grad_norm": 0.7292714715003967, "learning_rate": 9.903349639896725e-05, "loss": 5.7824, "step": 1145 }, { "epoch": 0.07813561625220818, "grad_norm": 0.6802114248275757, "learning_rate": 9.902924989808399e-05, "loss": 5.7912, "step": 1150 }, { "epoch": 0.07847533632286996, "grad_norm": 0.680204451084137, "learning_rate": 9.902500339720071e-05, "loss": 5.961, "step": 1155 }, { "epoch": 0.07881505639353173, "grad_norm": 0.596501350402832, "learning_rate": 9.902075689631743e-05, "loss": 5.5833, "step": 1160 }, { "epoch": 0.0791547764641935, "grad_norm": 0.622715950012207, "learning_rate": 9.901651039543417e-05, "loss": 5.933, "step": 1165 }, { "epoch": 0.07949449653485528, "grad_norm": 0.9008530974388123, "learning_rate": 9.90122638945509e-05, "loss": 5.568, "step": 1170 }, { "epoch": 0.07983421660551705, "grad_norm": 0.5429263114929199, "learning_rate": 9.900801739366761e-05, "loss": 5.2962, "step": 1175 }, { "epoch": 0.08017393667617882, "grad_norm": 0.6079940795898438, "learning_rate": 9.900377089278435e-05, "loss": 5.8575, "step": 1180 }, { "epoch": 0.0805136567468406, "grad_norm": 0.6796315312385559, "learning_rate": 9.899952439190108e-05, "loss": 5.6058, "step": 1185 }, { "epoch": 0.08085337681750238, "grad_norm": 0.6909620761871338, "learning_rate": 9.89952778910178e-05, "loss": 5.6206, "step": 1190 }, { "epoch": 0.08119309688816416, "grad_norm": 0.8451843857765198, "learning_rate": 9.899103139013454e-05, "loss": 5.5017, "step": 1195 }, { "epoch": 0.08153281695882593, "grad_norm": 0.6521558165550232, "learning_rate": 9.898678488925127e-05, "loss": 5.6647, "step": 1200 }, { "epoch": 0.0818725370294877, "grad_norm": 0.4295422434806824, "learning_rate": 9.898253838836798e-05, "loss": 5.96, "step": 1205 }, { "epoch": 0.08221225710014948, "grad_norm": 1.2797423601150513, "learning_rate": 9.897829188748472e-05, "loss": 5.5158, "step": 1210 }, { "epoch": 0.08255197717081125, "grad_norm": 0.663374125957489, "learning_rate": 9.897404538660144e-05, "loss": 5.5387, "step": 1215 }, { "epoch": 0.08289169724147302, "grad_norm": 0.6130101084709167, "learning_rate": 9.896979888571817e-05, "loss": 5.3825, "step": 1220 }, { "epoch": 0.0832314173121348, "grad_norm": 0.9059043526649475, "learning_rate": 9.896555238483491e-05, "loss": 5.6168, "step": 1225 }, { "epoch": 0.08357113738279658, "grad_norm": 0.9198205471038818, "learning_rate": 9.896130588395162e-05, "loss": 5.7126, "step": 1230 }, { "epoch": 0.08391085745345835, "grad_norm": 0.6826533079147339, "learning_rate": 9.895705938306835e-05, "loss": 5.5344, "step": 1235 }, { "epoch": 0.08425057752412013, "grad_norm": 0.6488471031188965, "learning_rate": 9.895281288218509e-05, "loss": 5.6311, "step": 1240 }, { "epoch": 0.0845902975947819, "grad_norm": 1.4997718334197998, "learning_rate": 9.89485663813018e-05, "loss": 5.3847, "step": 1245 }, { "epoch": 0.08493001766544367, "grad_norm": 0.8614689111709595, "learning_rate": 9.894431988041853e-05, "loss": 5.7996, "step": 1250 }, { "epoch": 0.08526973773610545, "grad_norm": 0.910275936126709, "learning_rate": 9.894007337953527e-05, "loss": 5.6557, "step": 1255 }, { "epoch": 0.08560945780676722, "grad_norm": 0.8584810495376587, "learning_rate": 9.893582687865199e-05, "loss": 5.6384, "step": 1260 }, { "epoch": 0.08594917787742899, "grad_norm": 0.5326058864593506, "learning_rate": 9.893158037776872e-05, "loss": 5.4521, "step": 1265 }, { "epoch": 0.08628889794809078, "grad_norm": 1.008244276046753, "learning_rate": 9.892733387688546e-05, "loss": 5.6282, "step": 1270 }, { "epoch": 0.08662861801875255, "grad_norm": 0.9059062600135803, "learning_rate": 9.892308737600217e-05, "loss": 5.7698, "step": 1275 }, { "epoch": 0.08696833808941433, "grad_norm": 0.754760205745697, "learning_rate": 9.891884087511891e-05, "loss": 5.7735, "step": 1280 }, { "epoch": 0.0873080581600761, "grad_norm": 0.6785455346107483, "learning_rate": 9.891459437423564e-05, "loss": 5.8881, "step": 1285 }, { "epoch": 0.08764777823073787, "grad_norm": 0.8128915429115295, "learning_rate": 9.891034787335236e-05, "loss": 5.5169, "step": 1290 }, { "epoch": 0.08798749830139965, "grad_norm": 0.912551760673523, "learning_rate": 9.89061013724691e-05, "loss": 5.5467, "step": 1295 }, { "epoch": 0.08832721837206142, "grad_norm": 2.0163161754608154, "learning_rate": 9.890185487158581e-05, "loss": 5.5448, "step": 1300 }, { "epoch": 0.08866693844272319, "grad_norm": 0.5310774445533752, "learning_rate": 9.889760837070254e-05, "loss": 5.5268, "step": 1305 }, { "epoch": 0.08900665851338498, "grad_norm": 0.603813886642456, "learning_rate": 9.889336186981928e-05, "loss": 5.6777, "step": 1310 }, { "epoch": 0.08934637858404675, "grad_norm": 0.431869238615036, "learning_rate": 9.8889115368936e-05, "loss": 5.5403, "step": 1315 }, { "epoch": 0.08968609865470852, "grad_norm": 0.7721471190452576, "learning_rate": 9.888486886805273e-05, "loss": 5.2792, "step": 1320 }, { "epoch": 0.0900258187253703, "grad_norm": 1.0440256595611572, "learning_rate": 9.888062236716947e-05, "loss": 5.1811, "step": 1325 }, { "epoch": 0.09036553879603207, "grad_norm": 0.5089631080627441, "learning_rate": 9.887637586628618e-05, "loss": 5.5219, "step": 1330 }, { "epoch": 0.09070525886669384, "grad_norm": 0.5013499855995178, "learning_rate": 9.887212936540291e-05, "loss": 5.5286, "step": 1335 }, { "epoch": 0.09104497893735562, "grad_norm": 0.8858599662780762, "learning_rate": 9.886788286451965e-05, "loss": 5.6356, "step": 1340 }, { "epoch": 0.09138469900801739, "grad_norm": 0.45014268159866333, "learning_rate": 9.886363636363637e-05, "loss": 5.1792, "step": 1345 }, { "epoch": 0.09172441907867916, "grad_norm": 0.6357929706573486, "learning_rate": 9.88593898627531e-05, "loss": 5.6234, "step": 1350 }, { "epoch": 0.09206413914934095, "grad_norm": 4.850500106811523, "learning_rate": 9.885514336186984e-05, "loss": 5.3482, "step": 1355 }, { "epoch": 0.09240385922000272, "grad_norm": 0.477006196975708, "learning_rate": 9.885089686098655e-05, "loss": 5.2999, "step": 1360 }, { "epoch": 0.0927435792906645, "grad_norm": 0.5572239756584167, "learning_rate": 9.884665036010328e-05, "loss": 5.4801, "step": 1365 }, { "epoch": 0.09308329936132627, "grad_norm": 0.3890332877635956, "learning_rate": 9.884240385922e-05, "loss": 5.3536, "step": 1370 }, { "epoch": 0.09342301943198804, "grad_norm": 0.43894070386886597, "learning_rate": 9.883815735833673e-05, "loss": 5.6419, "step": 1375 }, { "epoch": 0.09376273950264982, "grad_norm": 0.48212140798568726, "learning_rate": 9.883391085745346e-05, "loss": 5.431, "step": 1380 }, { "epoch": 0.09410245957331159, "grad_norm": 0.5145598649978638, "learning_rate": 9.882966435657019e-05, "loss": 5.7753, "step": 1385 }, { "epoch": 0.09444217964397336, "grad_norm": 0.4795299172401428, "learning_rate": 9.882541785568692e-05, "loss": 5.5992, "step": 1390 }, { "epoch": 0.09478189971463515, "grad_norm": 0.4439328610897064, "learning_rate": 9.882117135480365e-05, "loss": 5.3292, "step": 1395 }, { "epoch": 0.09512161978529692, "grad_norm": 0.6526133418083191, "learning_rate": 9.881692485392037e-05, "loss": 5.3765, "step": 1400 }, { "epoch": 0.09546133985595869, "grad_norm": 0.8982023596763611, "learning_rate": 9.88126783530371e-05, "loss": 5.4783, "step": 1405 }, { "epoch": 0.09580105992662047, "grad_norm": 0.46160343289375305, "learning_rate": 9.880843185215383e-05, "loss": 5.502, "step": 1410 }, { "epoch": 0.09614077999728224, "grad_norm": 0.6750124096870422, "learning_rate": 9.880418535127056e-05, "loss": 5.4252, "step": 1415 }, { "epoch": 0.09648050006794401, "grad_norm": 0.42901554703712463, "learning_rate": 9.879993885038729e-05, "loss": 5.3442, "step": 1420 }, { "epoch": 0.09682022013860579, "grad_norm": 0.6184918284416199, "learning_rate": 9.879569234950401e-05, "loss": 5.3717, "step": 1425 }, { "epoch": 0.09715994020926756, "grad_norm": 0.5006517171859741, "learning_rate": 9.879144584862074e-05, "loss": 5.4458, "step": 1430 }, { "epoch": 0.09749966027992933, "grad_norm": 0.4495384693145752, "learning_rate": 9.878719934773747e-05, "loss": 5.263, "step": 1435 }, { "epoch": 0.09783938035059112, "grad_norm": 0.4285268783569336, "learning_rate": 9.87829528468542e-05, "loss": 5.2207, "step": 1440 }, { "epoch": 0.09817910042125289, "grad_norm": 1.0460352897644043, "learning_rate": 9.877870634597093e-05, "loss": 5.3448, "step": 1445 }, { "epoch": 0.09851882049191467, "grad_norm": 0.6291869878768921, "learning_rate": 9.877445984508765e-05, "loss": 5.3182, "step": 1450 }, { "epoch": 0.09885854056257644, "grad_norm": 1.0043153762817383, "learning_rate": 9.877021334420438e-05, "loss": 5.4226, "step": 1455 }, { "epoch": 0.09919826063323821, "grad_norm": 0.7458539009094238, "learning_rate": 9.876596684332111e-05, "loss": 5.6298, "step": 1460 }, { "epoch": 0.09953798070389999, "grad_norm": 0.45767852663993835, "learning_rate": 9.876172034243784e-05, "loss": 5.5378, "step": 1465 }, { "epoch": 0.09987770077456176, "grad_norm": 0.4586849510669708, "learning_rate": 9.875747384155457e-05, "loss": 5.7105, "step": 1470 }, { "epoch": 0.10021742084522353, "grad_norm": 0.37701913714408875, "learning_rate": 9.87532273406713e-05, "loss": 5.1507, "step": 1475 }, { "epoch": 0.10055714091588532, "grad_norm": 0.42394065856933594, "learning_rate": 9.874898083978802e-05, "loss": 5.5164, "step": 1480 }, { "epoch": 0.10089686098654709, "grad_norm": 0.6020697355270386, "learning_rate": 9.874473433890475e-05, "loss": 5.3247, "step": 1485 }, { "epoch": 0.10123658105720885, "grad_norm": 0.5261949300765991, "learning_rate": 9.874048783802148e-05, "loss": 5.5159, "step": 1490 }, { "epoch": 0.10157630112787064, "grad_norm": 0.4316195547580719, "learning_rate": 9.87362413371382e-05, "loss": 5.4615, "step": 1495 }, { "epoch": 0.10191602119853241, "grad_norm": 0.3691781759262085, "learning_rate": 9.873199483625492e-05, "loss": 5.5583, "step": 1500 }, { "epoch": 0.10225574126919418, "grad_norm": 0.5686614513397217, "learning_rate": 9.872774833537166e-05, "loss": 5.5936, "step": 1505 }, { "epoch": 0.10259546133985596, "grad_norm": 0.3296063542366028, "learning_rate": 9.872350183448839e-05, "loss": 5.4636, "step": 1510 }, { "epoch": 0.10293518141051773, "grad_norm": 0.46032190322875977, "learning_rate": 9.87192553336051e-05, "loss": 5.1794, "step": 1515 }, { "epoch": 0.1032749014811795, "grad_norm": 0.4383249282836914, "learning_rate": 9.871500883272185e-05, "loss": 5.3524, "step": 1520 }, { "epoch": 0.10361462155184128, "grad_norm": 0.5257749557495117, "learning_rate": 9.871076233183857e-05, "loss": 5.5045, "step": 1525 }, { "epoch": 0.10395434162250305, "grad_norm": 0.5201256275177002, "learning_rate": 9.870651583095529e-05, "loss": 5.3214, "step": 1530 }, { "epoch": 0.10429406169316484, "grad_norm": 0.7715117931365967, "learning_rate": 9.870226933007203e-05, "loss": 5.3583, "step": 1535 }, { "epoch": 0.10463378176382661, "grad_norm": 0.34586212038993835, "learning_rate": 9.869802282918876e-05, "loss": 5.3337, "step": 1540 }, { "epoch": 0.10497350183448838, "grad_norm": 0.45313313603401184, "learning_rate": 9.869377632830547e-05, "loss": 5.2788, "step": 1545 }, { "epoch": 0.10531322190515016, "grad_norm": 0.4078027904033661, "learning_rate": 9.868952982742221e-05, "loss": 5.4364, "step": 1550 }, { "epoch": 0.10565294197581193, "grad_norm": 0.4670262038707733, "learning_rate": 9.868528332653894e-05, "loss": 5.308, "step": 1555 }, { "epoch": 0.1059926620464737, "grad_norm": 0.4951310455799103, "learning_rate": 9.868103682565566e-05, "loss": 5.5171, "step": 1560 }, { "epoch": 0.10633238211713548, "grad_norm": 0.7351198792457581, "learning_rate": 9.86767903247724e-05, "loss": 5.6146, "step": 1565 }, { "epoch": 0.10667210218779725, "grad_norm": 0.5064637660980225, "learning_rate": 9.867254382388911e-05, "loss": 5.3591, "step": 1570 }, { "epoch": 0.10701182225845902, "grad_norm": 0.39143896102905273, "learning_rate": 9.866829732300584e-05, "loss": 5.3523, "step": 1575 }, { "epoch": 0.10735154232912081, "grad_norm": 1.2670384645462036, "learning_rate": 9.866405082212258e-05, "loss": 5.2359, "step": 1580 }, { "epoch": 0.10769126239978258, "grad_norm": 0.3745839297771454, "learning_rate": 9.86598043212393e-05, "loss": 5.2194, "step": 1585 }, { "epoch": 0.10803098247044435, "grad_norm": 0.26325303316116333, "learning_rate": 9.865555782035602e-05, "loss": 5.1649, "step": 1590 }, { "epoch": 0.10837070254110613, "grad_norm": 0.3311369717121124, "learning_rate": 9.865131131947277e-05, "loss": 5.326, "step": 1595 }, { "epoch": 0.1087104226117679, "grad_norm": 0.4302009046077728, "learning_rate": 9.864706481858948e-05, "loss": 5.1757, "step": 1600 }, { "epoch": 0.10905014268242967, "grad_norm": 0.5953149795532227, "learning_rate": 9.864281831770621e-05, "loss": 5.3937, "step": 1605 }, { "epoch": 0.10938986275309145, "grad_norm": 0.4650028645992279, "learning_rate": 9.863857181682295e-05, "loss": 5.4321, "step": 1610 }, { "epoch": 0.10972958282375322, "grad_norm": 1.5760172605514526, "learning_rate": 9.863432531593966e-05, "loss": 5.1935, "step": 1615 }, { "epoch": 0.11006930289441501, "grad_norm": 0.4063778221607208, "learning_rate": 9.86300788150564e-05, "loss": 5.0892, "step": 1620 }, { "epoch": 0.11040902296507678, "grad_norm": 0.3407844603061676, "learning_rate": 9.862583231417313e-05, "loss": 5.2536, "step": 1625 }, { "epoch": 0.11074874303573855, "grad_norm": 0.5247534513473511, "learning_rate": 9.862158581328985e-05, "loss": 5.3708, "step": 1630 }, { "epoch": 0.11108846310640033, "grad_norm": 1.0360844135284424, "learning_rate": 9.861733931240659e-05, "loss": 5.3132, "step": 1635 }, { "epoch": 0.1114281831770621, "grad_norm": 0.5687776803970337, "learning_rate": 9.86130928115233e-05, "loss": 5.2554, "step": 1640 }, { "epoch": 0.11176790324772387, "grad_norm": 0.3441666066646576, "learning_rate": 9.860884631064003e-05, "loss": 5.1661, "step": 1645 }, { "epoch": 0.11210762331838565, "grad_norm": 0.6809844970703125, "learning_rate": 9.860459980975677e-05, "loss": 5.3175, "step": 1650 }, { "epoch": 0.11244734338904742, "grad_norm": 0.4880785644054413, "learning_rate": 9.860035330887349e-05, "loss": 5.3368, "step": 1655 }, { "epoch": 0.1127870634597092, "grad_norm": 0.5996628999710083, "learning_rate": 9.859610680799022e-05, "loss": 5.2202, "step": 1660 }, { "epoch": 0.11312678353037098, "grad_norm": 0.3819567561149597, "learning_rate": 9.859186030710696e-05, "loss": 5.2171, "step": 1665 }, { "epoch": 0.11346650360103275, "grad_norm": 0.8039321899414062, "learning_rate": 9.858761380622367e-05, "loss": 5.1053, "step": 1670 }, { "epoch": 0.11380622367169452, "grad_norm": 0.7948293685913086, "learning_rate": 9.85833673053404e-05, "loss": 5.1902, "step": 1675 }, { "epoch": 0.1141459437423563, "grad_norm": 0.45758846402168274, "learning_rate": 9.857912080445714e-05, "loss": 5.2766, "step": 1680 }, { "epoch": 0.11448566381301807, "grad_norm": 0.33638903498649597, "learning_rate": 9.857487430357386e-05, "loss": 5.1641, "step": 1685 }, { "epoch": 0.11482538388367985, "grad_norm": 0.3370652496814728, "learning_rate": 9.857062780269058e-05, "loss": 4.8306, "step": 1690 }, { "epoch": 0.11516510395434162, "grad_norm": 0.30151012539863586, "learning_rate": 9.856638130180733e-05, "loss": 4.9977, "step": 1695 }, { "epoch": 0.1155048240250034, "grad_norm": 0.4379021227359772, "learning_rate": 9.856213480092404e-05, "loss": 5.2388, "step": 1700 }, { "epoch": 0.11584454409566518, "grad_norm": 0.5139544606208801, "learning_rate": 9.855788830004077e-05, "loss": 5.0948, "step": 1705 }, { "epoch": 0.11618426416632695, "grad_norm": 0.2696703374385834, "learning_rate": 9.855364179915751e-05, "loss": 5.1955, "step": 1710 }, { "epoch": 0.11652398423698872, "grad_norm": 0.43758052587509155, "learning_rate": 9.854939529827422e-05, "loss": 5.1093, "step": 1715 }, { "epoch": 0.1168637043076505, "grad_norm": 0.45877712965011597, "learning_rate": 9.854514879739095e-05, "loss": 5.1859, "step": 1720 }, { "epoch": 0.11720342437831227, "grad_norm": 0.5116316676139832, "learning_rate": 9.854090229650768e-05, "loss": 4.9734, "step": 1725 }, { "epoch": 0.11754314444897404, "grad_norm": 0.3733248710632324, "learning_rate": 9.853665579562441e-05, "loss": 4.9164, "step": 1730 }, { "epoch": 0.11788286451963582, "grad_norm": 0.9365966320037842, "learning_rate": 9.853240929474114e-05, "loss": 5.1925, "step": 1735 }, { "epoch": 0.1182225845902976, "grad_norm": 0.43242642283439636, "learning_rate": 9.852816279385786e-05, "loss": 5.2445, "step": 1740 }, { "epoch": 0.11856230466095936, "grad_norm": 0.28928515315055847, "learning_rate": 9.852391629297459e-05, "loss": 5.2208, "step": 1745 }, { "epoch": 0.11890202473162115, "grad_norm": 0.4645937383174896, "learning_rate": 9.851966979209132e-05, "loss": 5.1848, "step": 1750 }, { "epoch": 0.11924174480228292, "grad_norm": 0.3553106486797333, "learning_rate": 9.851542329120805e-05, "loss": 5.0109, "step": 1755 }, { "epoch": 0.11958146487294469, "grad_norm": 0.47933322191238403, "learning_rate": 9.851117679032478e-05, "loss": 5.2334, "step": 1760 }, { "epoch": 0.11992118494360647, "grad_norm": 0.3383587896823883, "learning_rate": 9.85069302894415e-05, "loss": 5.3224, "step": 1765 }, { "epoch": 0.12026090501426824, "grad_norm": 0.41393041610717773, "learning_rate": 9.850268378855823e-05, "loss": 5.062, "step": 1770 }, { "epoch": 0.12060062508493002, "grad_norm": 0.43941059708595276, "learning_rate": 9.849843728767496e-05, "loss": 5.0013, "step": 1775 }, { "epoch": 0.1209403451555918, "grad_norm": 0.35179632902145386, "learning_rate": 9.849419078679169e-05, "loss": 5.1989, "step": 1780 }, { "epoch": 0.12128006522625356, "grad_norm": 0.7730126976966858, "learning_rate": 9.848994428590842e-05, "loss": 5.1821, "step": 1785 }, { "epoch": 0.12161978529691535, "grad_norm": 0.45934972167015076, "learning_rate": 9.848569778502514e-05, "loss": 5.2454, "step": 1790 }, { "epoch": 0.12195950536757712, "grad_norm": 0.6049938797950745, "learning_rate": 9.848145128414187e-05, "loss": 5.0269, "step": 1795 }, { "epoch": 0.12229922543823889, "grad_norm": 0.9805595874786377, "learning_rate": 9.84772047832586e-05, "loss": 4.885, "step": 1800 }, { "epoch": 0.12263894550890067, "grad_norm": 0.47447869181632996, "learning_rate": 9.847295828237533e-05, "loss": 4.9461, "step": 1805 }, { "epoch": 0.12297866557956244, "grad_norm": 0.4037536382675171, "learning_rate": 9.846871178149206e-05, "loss": 4.79, "step": 1810 }, { "epoch": 0.12331838565022421, "grad_norm": 0.6516850590705872, "learning_rate": 9.846446528060878e-05, "loss": 4.8244, "step": 1815 }, { "epoch": 0.123658105720886, "grad_norm": 0.46356433629989624, "learning_rate": 9.846021877972551e-05, "loss": 4.8264, "step": 1820 }, { "epoch": 0.12399782579154776, "grad_norm": 1.0530160665512085, "learning_rate": 9.845597227884224e-05, "loss": 4.7702, "step": 1825 }, { "epoch": 0.12433754586220953, "grad_norm": 1.3012051582336426, "learning_rate": 9.845172577795897e-05, "loss": 4.9146, "step": 1830 }, { "epoch": 0.12467726593287132, "grad_norm": 0.6158355474472046, "learning_rate": 9.84474792770757e-05, "loss": 5.0082, "step": 1835 }, { "epoch": 0.1250169860035331, "grad_norm": 4.688101768493652, "learning_rate": 9.844323277619241e-05, "loss": 4.9467, "step": 1840 }, { "epoch": 0.12535670607419486, "grad_norm": 0.5098426342010498, "learning_rate": 9.843898627530915e-05, "loss": 4.7443, "step": 1845 }, { "epoch": 0.12569642614485663, "grad_norm": 0.6203608512878418, "learning_rate": 9.843473977442588e-05, "loss": 5.0459, "step": 1850 }, { "epoch": 0.12603614621551842, "grad_norm": 0.5965786576271057, "learning_rate": 9.84304932735426e-05, "loss": 5.0615, "step": 1855 }, { "epoch": 0.1263758662861802, "grad_norm": 0.7298919558525085, "learning_rate": 9.842624677265934e-05, "loss": 4.9333, "step": 1860 }, { "epoch": 0.12671558635684196, "grad_norm": 0.514262318611145, "learning_rate": 9.842200027177606e-05, "loss": 4.9152, "step": 1865 }, { "epoch": 0.12705530642750373, "grad_norm": 0.49908140301704407, "learning_rate": 9.841775377089278e-05, "loss": 4.8825, "step": 1870 }, { "epoch": 0.1273950264981655, "grad_norm": 0.3929906487464905, "learning_rate": 9.841350727000952e-05, "loss": 4.6288, "step": 1875 }, { "epoch": 0.12773474656882727, "grad_norm": 0.6221901178359985, "learning_rate": 9.840926076912625e-05, "loss": 4.6867, "step": 1880 }, { "epoch": 0.12807446663948907, "grad_norm": 0.4237980544567108, "learning_rate": 9.840501426824296e-05, "loss": 5.0275, "step": 1885 }, { "epoch": 0.12841418671015084, "grad_norm": 0.5076737403869629, "learning_rate": 9.84007677673597e-05, "loss": 4.3183, "step": 1890 }, { "epoch": 0.1287539067808126, "grad_norm": 0.562611997127533, "learning_rate": 9.839652126647643e-05, "loss": 4.8492, "step": 1895 }, { "epoch": 0.12909362685147438, "grad_norm": 0.43838977813720703, "learning_rate": 9.839227476559315e-05, "loss": 4.7559, "step": 1900 }, { "epoch": 0.12943334692213615, "grad_norm": 1.7356271743774414, "learning_rate": 9.838802826470989e-05, "loss": 4.4937, "step": 1905 }, { "epoch": 0.12977306699279795, "grad_norm": 0.35975855588912964, "learning_rate": 9.838378176382662e-05, "loss": 4.8153, "step": 1910 }, { "epoch": 0.13011278706345972, "grad_norm": 0.46843382716178894, "learning_rate": 9.837953526294333e-05, "loss": 4.7742, "step": 1915 }, { "epoch": 0.1304525071341215, "grad_norm": 0.49429741501808167, "learning_rate": 9.837528876206007e-05, "loss": 4.5403, "step": 1920 }, { "epoch": 0.13079222720478326, "grad_norm": 0.496423602104187, "learning_rate": 9.837104226117679e-05, "loss": 4.8032, "step": 1925 }, { "epoch": 0.13113194727544503, "grad_norm": 0.7953855395317078, "learning_rate": 9.836679576029352e-05, "loss": 4.8191, "step": 1930 }, { "epoch": 0.1314716673461068, "grad_norm": 0.5093162655830383, "learning_rate": 9.836254925941026e-05, "loss": 4.794, "step": 1935 }, { "epoch": 0.1318113874167686, "grad_norm": 0.37883055210113525, "learning_rate": 9.835830275852697e-05, "loss": 4.3129, "step": 1940 }, { "epoch": 0.13215110748743036, "grad_norm": 0.6972466707229614, "learning_rate": 9.83540562576437e-05, "loss": 4.6677, "step": 1945 }, { "epoch": 0.13249082755809213, "grad_norm": 0.4960924983024597, "learning_rate": 9.834980975676044e-05, "loss": 4.7554, "step": 1950 }, { "epoch": 0.1328305476287539, "grad_norm": 0.3313211500644684, "learning_rate": 9.834556325587716e-05, "loss": 4.5136, "step": 1955 }, { "epoch": 0.13317026769941567, "grad_norm": 0.37889447808265686, "learning_rate": 9.83413167549939e-05, "loss": 4.6352, "step": 1960 }, { "epoch": 0.13350998777007744, "grad_norm": 0.2897196412086487, "learning_rate": 9.833707025411062e-05, "loss": 4.2392, "step": 1965 }, { "epoch": 0.13384970784073924, "grad_norm": 0.4556117653846741, "learning_rate": 9.833282375322734e-05, "loss": 4.4802, "step": 1970 }, { "epoch": 0.134189427911401, "grad_norm": 0.24939770996570587, "learning_rate": 9.832857725234408e-05, "loss": 4.8028, "step": 1975 }, { "epoch": 0.13452914798206278, "grad_norm": 0.5589706301689148, "learning_rate": 9.832433075146081e-05, "loss": 4.5484, "step": 1980 }, { "epoch": 0.13486886805272455, "grad_norm": 0.403367817401886, "learning_rate": 9.832008425057752e-05, "loss": 4.6303, "step": 1985 }, { "epoch": 0.13520858812338632, "grad_norm": 0.2891002595424652, "learning_rate": 9.831583774969426e-05, "loss": 4.6125, "step": 1990 }, { "epoch": 0.13554830819404812, "grad_norm": 0.4545519948005676, "learning_rate": 9.831159124881098e-05, "loss": 4.6801, "step": 1995 }, { "epoch": 0.1358880282647099, "grad_norm": 0.2752302289009094, "learning_rate": 9.830734474792771e-05, "loss": 4.5331, "step": 2000 }, { "epoch": 0.13622774833537166, "grad_norm": 0.4735427498817444, "learning_rate": 9.830309824704445e-05, "loss": 4.5487, "step": 2005 }, { "epoch": 0.13656746840603343, "grad_norm": 0.2892632782459259, "learning_rate": 9.829885174616116e-05, "loss": 4.7872, "step": 2010 }, { "epoch": 0.1369071884766952, "grad_norm": 0.3587241768836975, "learning_rate": 9.829460524527789e-05, "loss": 4.8017, "step": 2015 }, { "epoch": 0.13724690854735697, "grad_norm": 0.8643600940704346, "learning_rate": 9.829035874439463e-05, "loss": 4.9978, "step": 2020 }, { "epoch": 0.13758662861801876, "grad_norm": 0.3995005786418915, "learning_rate": 9.828611224351135e-05, "loss": 4.7966, "step": 2025 }, { "epoch": 0.13792634868868053, "grad_norm": 0.5287114381790161, "learning_rate": 9.828186574262808e-05, "loss": 4.6836, "step": 2030 }, { "epoch": 0.1382660687593423, "grad_norm": 0.356660932302475, "learning_rate": 9.827761924174482e-05, "loss": 4.6598, "step": 2035 }, { "epoch": 0.13860578883000407, "grad_norm": 0.3594839572906494, "learning_rate": 9.827337274086153e-05, "loss": 4.7932, "step": 2040 }, { "epoch": 0.13894550890066584, "grad_norm": 0.460989385843277, "learning_rate": 9.826912623997826e-05, "loss": 4.8404, "step": 2045 }, { "epoch": 0.1392852289713276, "grad_norm": 0.3044515550136566, "learning_rate": 9.8264879739095e-05, "loss": 4.4804, "step": 2050 }, { "epoch": 0.1396249490419894, "grad_norm": 0.2440759837627411, "learning_rate": 9.826063323821172e-05, "loss": 4.6584, "step": 2055 }, { "epoch": 0.13996466911265118, "grad_norm": 0.39719679951667786, "learning_rate": 9.825638673732844e-05, "loss": 4.6913, "step": 2060 }, { "epoch": 0.14030438918331295, "grad_norm": 0.2519219219684601, "learning_rate": 9.825214023644517e-05, "loss": 4.7914, "step": 2065 }, { "epoch": 0.14064410925397472, "grad_norm": 0.27213895320892334, "learning_rate": 9.82478937355619e-05, "loss": 4.4571, "step": 2070 }, { "epoch": 0.1409838293246365, "grad_norm": 0.31952184438705444, "learning_rate": 9.824364723467863e-05, "loss": 4.4334, "step": 2075 }, { "epoch": 0.1413235493952983, "grad_norm": 0.2466011643409729, "learning_rate": 9.823940073379536e-05, "loss": 4.623, "step": 2080 }, { "epoch": 0.14166326946596006, "grad_norm": 0.41923725605010986, "learning_rate": 9.823515423291208e-05, "loss": 4.5557, "step": 2085 }, { "epoch": 0.14200298953662183, "grad_norm": 0.23959270119667053, "learning_rate": 9.823090773202881e-05, "loss": 4.5756, "step": 2090 }, { "epoch": 0.1423427096072836, "grad_norm": 0.7019773721694946, "learning_rate": 9.822666123114554e-05, "loss": 4.74, "step": 2095 }, { "epoch": 0.14268242967794537, "grad_norm": 0.6014403700828552, "learning_rate": 9.822241473026227e-05, "loss": 4.4456, "step": 2100 }, { "epoch": 0.14302214974860714, "grad_norm": 0.2578621804714203, "learning_rate": 9.8218168229379e-05, "loss": 4.6776, "step": 2105 }, { "epoch": 0.14336186981926893, "grad_norm": 0.24368084967136383, "learning_rate": 9.821392172849572e-05, "loss": 4.7798, "step": 2110 }, { "epoch": 0.1437015898899307, "grad_norm": 0.4451867938041687, "learning_rate": 9.820967522761245e-05, "loss": 4.2507, "step": 2115 }, { "epoch": 0.14404130996059247, "grad_norm": 0.27697330713272095, "learning_rate": 9.820542872672918e-05, "loss": 4.6886, "step": 2120 }, { "epoch": 0.14438103003125424, "grad_norm": 0.8379690647125244, "learning_rate": 9.820118222584591e-05, "loss": 4.5629, "step": 2125 }, { "epoch": 0.144720750101916, "grad_norm": 0.9834319353103638, "learning_rate": 9.819693572496264e-05, "loss": 4.4945, "step": 2130 }, { "epoch": 0.14506047017257778, "grad_norm": 0.45272937417030334, "learning_rate": 9.819268922407936e-05, "loss": 4.6099, "step": 2135 }, { "epoch": 0.14540019024323958, "grad_norm": 0.517729640007019, "learning_rate": 9.818844272319609e-05, "loss": 4.6808, "step": 2140 }, { "epoch": 0.14573991031390135, "grad_norm": 0.26133647561073303, "learning_rate": 9.818419622231282e-05, "loss": 4.4916, "step": 2145 }, { "epoch": 0.14607963038456312, "grad_norm": 0.31160035729408264, "learning_rate": 9.817994972142955e-05, "loss": 4.4746, "step": 2150 }, { "epoch": 0.1464193504552249, "grad_norm": 0.3950839936733246, "learning_rate": 9.817570322054628e-05, "loss": 4.8946, "step": 2155 }, { "epoch": 0.14675907052588666, "grad_norm": 0.254171758890152, "learning_rate": 9.8171456719663e-05, "loss": 4.5237, "step": 2160 }, { "epoch": 0.14709879059654846, "grad_norm": 0.4314219653606415, "learning_rate": 9.816721021877973e-05, "loss": 4.6116, "step": 2165 }, { "epoch": 0.14743851066721023, "grad_norm": 0.2894288897514343, "learning_rate": 9.816296371789646e-05, "loss": 4.4748, "step": 2170 }, { "epoch": 0.147778230737872, "grad_norm": 0.2681034207344055, "learning_rate": 9.815871721701319e-05, "loss": 4.5926, "step": 2175 }, { "epoch": 0.14811795080853377, "grad_norm": 0.27911391854286194, "learning_rate": 9.815447071612992e-05, "loss": 4.5391, "step": 2180 }, { "epoch": 0.14845767087919554, "grad_norm": 0.3182697296142578, "learning_rate": 9.815022421524664e-05, "loss": 4.5708, "step": 2185 }, { "epoch": 0.1487973909498573, "grad_norm": 0.2478509396314621, "learning_rate": 9.814597771436337e-05, "loss": 4.3974, "step": 2190 }, { "epoch": 0.1491371110205191, "grad_norm": 0.3418025076389313, "learning_rate": 9.814173121348009e-05, "loss": 4.5312, "step": 2195 }, { "epoch": 0.14947683109118087, "grad_norm": 0.2670694887638092, "learning_rate": 9.813748471259683e-05, "loss": 4.5906, "step": 2200 }, { "epoch": 0.14981655116184264, "grad_norm": 0.29988008737564087, "learning_rate": 9.813323821171356e-05, "loss": 4.4151, "step": 2205 }, { "epoch": 0.1501562712325044, "grad_norm": 0.2230396866798401, "learning_rate": 9.812899171083027e-05, "loss": 4.6196, "step": 2210 }, { "epoch": 0.15049599130316618, "grad_norm": 0.2940434515476227, "learning_rate": 9.812474520994701e-05, "loss": 4.5765, "step": 2215 }, { "epoch": 0.15083571137382795, "grad_norm": 0.2943139672279358, "learning_rate": 9.812049870906374e-05, "loss": 4.7439, "step": 2220 }, { "epoch": 0.15117543144448975, "grad_norm": 0.5938501954078674, "learning_rate": 9.811625220818045e-05, "loss": 4.6233, "step": 2225 }, { "epoch": 0.15151515151515152, "grad_norm": 0.29499292373657227, "learning_rate": 9.81120057072972e-05, "loss": 4.43, "step": 2230 }, { "epoch": 0.1518548715858133, "grad_norm": 0.21327312290668488, "learning_rate": 9.810775920641392e-05, "loss": 4.5865, "step": 2235 }, { "epoch": 0.15219459165647506, "grad_norm": 0.4112052917480469, "learning_rate": 9.810351270553064e-05, "loss": 4.7013, "step": 2240 }, { "epoch": 0.15253431172713683, "grad_norm": 0.40261027216911316, "learning_rate": 9.809926620464738e-05, "loss": 4.6419, "step": 2245 }, { "epoch": 0.15287403179779863, "grad_norm": 0.2737533748149872, "learning_rate": 9.809501970376411e-05, "loss": 4.3994, "step": 2250 }, { "epoch": 0.1532137518684604, "grad_norm": 0.24050559103488922, "learning_rate": 9.809077320288082e-05, "loss": 4.5648, "step": 2255 }, { "epoch": 0.15355347193912217, "grad_norm": 0.3781549036502838, "learning_rate": 9.808652670199756e-05, "loss": 4.4987, "step": 2260 }, { "epoch": 0.15389319200978394, "grad_norm": 0.46098098158836365, "learning_rate": 9.808228020111428e-05, "loss": 4.5322, "step": 2265 }, { "epoch": 0.1542329120804457, "grad_norm": 0.32969388365745544, "learning_rate": 9.8078033700231e-05, "loss": 4.59, "step": 2270 }, { "epoch": 0.15457263215110748, "grad_norm": 0.28195780515670776, "learning_rate": 9.807378719934775e-05, "loss": 4.4548, "step": 2275 }, { "epoch": 0.15491235222176927, "grad_norm": 0.2665387690067291, "learning_rate": 9.806954069846446e-05, "loss": 4.4353, "step": 2280 }, { "epoch": 0.15525207229243104, "grad_norm": 0.3116438686847687, "learning_rate": 9.806529419758119e-05, "loss": 4.3557, "step": 2285 }, { "epoch": 0.1555917923630928, "grad_norm": 0.42467501759529114, "learning_rate": 9.806104769669793e-05, "loss": 4.3241, "step": 2290 }, { "epoch": 0.15593151243375458, "grad_norm": 0.24590204656124115, "learning_rate": 9.805680119581465e-05, "loss": 4.3719, "step": 2295 }, { "epoch": 0.15627123250441635, "grad_norm": 0.7295488119125366, "learning_rate": 9.805255469493139e-05, "loss": 4.5891, "step": 2300 }, { "epoch": 0.15661095257507815, "grad_norm": 0.24560780823230743, "learning_rate": 9.804830819404812e-05, "loss": 4.6124, "step": 2305 }, { "epoch": 0.15695067264573992, "grad_norm": 0.2907837927341461, "learning_rate": 9.804406169316483e-05, "loss": 4.2532, "step": 2310 }, { "epoch": 0.1572903927164017, "grad_norm": 1.0109922885894775, "learning_rate": 9.803981519228157e-05, "loss": 4.5454, "step": 2315 }, { "epoch": 0.15763011278706346, "grad_norm": 0.2637081444263458, "learning_rate": 9.80355686913983e-05, "loss": 4.5952, "step": 2320 }, { "epoch": 0.15796983285772523, "grad_norm": 0.2559982240200043, "learning_rate": 9.803132219051501e-05, "loss": 4.6078, "step": 2325 }, { "epoch": 0.158309552928387, "grad_norm": 0.4410446882247925, "learning_rate": 9.802707568963176e-05, "loss": 4.6202, "step": 2330 }, { "epoch": 0.1586492729990488, "grad_norm": 0.20168878138065338, "learning_rate": 9.802282918874848e-05, "loss": 4.4023, "step": 2335 }, { "epoch": 0.15898899306971057, "grad_norm": 0.29185861349105835, "learning_rate": 9.80185826878652e-05, "loss": 4.543, "step": 2340 }, { "epoch": 0.15932871314037234, "grad_norm": 0.22290275990962982, "learning_rate": 9.801433618698194e-05, "loss": 4.6697, "step": 2345 }, { "epoch": 0.1596684332110341, "grad_norm": 0.7529789805412292, "learning_rate": 9.801008968609865e-05, "loss": 4.737, "step": 2350 }, { "epoch": 0.16000815328169588, "grad_norm": 0.3712422549724579, "learning_rate": 9.800584318521538e-05, "loss": 4.6241, "step": 2355 }, { "epoch": 0.16034787335235764, "grad_norm": 0.23941993713378906, "learning_rate": 9.800159668433212e-05, "loss": 4.6871, "step": 2360 }, { "epoch": 0.16068759342301944, "grad_norm": 0.37533217668533325, "learning_rate": 9.799735018344884e-05, "loss": 4.467, "step": 2365 }, { "epoch": 0.1610273134936812, "grad_norm": 0.2338525950908661, "learning_rate": 9.799310368256557e-05, "loss": 4.435, "step": 2370 }, { "epoch": 0.16136703356434298, "grad_norm": 0.26814886927604675, "learning_rate": 9.798885718168231e-05, "loss": 4.4838, "step": 2375 }, { "epoch": 0.16170675363500475, "grad_norm": 0.3187100887298584, "learning_rate": 9.798461068079902e-05, "loss": 4.354, "step": 2380 }, { "epoch": 0.16204647370566652, "grad_norm": 0.7054830193519592, "learning_rate": 9.798036417991575e-05, "loss": 4.489, "step": 2385 }, { "epoch": 0.16238619377632832, "grad_norm": 0.25023216009140015, "learning_rate": 9.797611767903249e-05, "loss": 4.6024, "step": 2390 }, { "epoch": 0.1627259138469901, "grad_norm": 0.24370110034942627, "learning_rate": 9.79718711781492e-05, "loss": 4.3951, "step": 2395 }, { "epoch": 0.16306563391765186, "grad_norm": 0.23113249242305756, "learning_rate": 9.796762467726593e-05, "loss": 4.4352, "step": 2400 }, { "epoch": 0.16340535398831363, "grad_norm": 0.4448549747467041, "learning_rate": 9.796337817638268e-05, "loss": 4.4063, "step": 2405 }, { "epoch": 0.1637450740589754, "grad_norm": 0.20236225426197052, "learning_rate": 9.795913167549939e-05, "loss": 4.6175, "step": 2410 }, { "epoch": 0.16408479412963717, "grad_norm": 0.5627440810203552, "learning_rate": 9.795488517461612e-05, "loss": 4.5675, "step": 2415 }, { "epoch": 0.16442451420029897, "grad_norm": 0.28272920846939087, "learning_rate": 9.795063867373285e-05, "loss": 4.6146, "step": 2420 }, { "epoch": 0.16476423427096074, "grad_norm": 0.2605418264865875, "learning_rate": 9.794639217284957e-05, "loss": 4.5697, "step": 2425 }, { "epoch": 0.1651039543416225, "grad_norm": 0.23570238053798676, "learning_rate": 9.79421456719663e-05, "loss": 4.5072, "step": 2430 }, { "epoch": 0.16544367441228428, "grad_norm": 0.20745481550693512, "learning_rate": 9.793789917108303e-05, "loss": 4.5735, "step": 2435 }, { "epoch": 0.16578339448294604, "grad_norm": 0.23489026725292206, "learning_rate": 9.793365267019976e-05, "loss": 4.4731, "step": 2440 }, { "epoch": 0.16612311455360781, "grad_norm": 0.4274902939796448, "learning_rate": 9.792940616931649e-05, "loss": 4.6706, "step": 2445 }, { "epoch": 0.1664628346242696, "grad_norm": 0.25951382517814636, "learning_rate": 9.792515966843321e-05, "loss": 4.441, "step": 2450 }, { "epoch": 0.16680255469493138, "grad_norm": 1.9463924169540405, "learning_rate": 9.792091316754994e-05, "loss": 4.4691, "step": 2455 }, { "epoch": 0.16714227476559315, "grad_norm": 0.4177579879760742, "learning_rate": 9.791666666666667e-05, "loss": 4.4903, "step": 2460 }, { "epoch": 0.16748199483625492, "grad_norm": 0.533138632774353, "learning_rate": 9.79124201657834e-05, "loss": 4.3311, "step": 2465 }, { "epoch": 0.1678217149069167, "grad_norm": 0.2822255790233612, "learning_rate": 9.790817366490013e-05, "loss": 4.5948, "step": 2470 }, { "epoch": 0.1681614349775785, "grad_norm": 0.29035472869873047, "learning_rate": 9.790392716401685e-05, "loss": 4.5585, "step": 2475 }, { "epoch": 0.16850115504824026, "grad_norm": 2.6457104682922363, "learning_rate": 9.789968066313358e-05, "loss": 4.5255, "step": 2480 }, { "epoch": 0.16884087511890203, "grad_norm": 0.21925875544548035, "learning_rate": 9.789543416225031e-05, "loss": 4.5955, "step": 2485 }, { "epoch": 0.1691805951895638, "grad_norm": 0.3095509707927704, "learning_rate": 9.789118766136704e-05, "loss": 4.5427, "step": 2490 }, { "epoch": 0.16952031526022557, "grad_norm": 1.3866817951202393, "learning_rate": 9.788694116048377e-05, "loss": 4.3407, "step": 2495 }, { "epoch": 0.16986003533088734, "grad_norm": 0.31529414653778076, "learning_rate": 9.78826946596005e-05, "loss": 4.614, "step": 2500 }, { "epoch": 0.17019975540154914, "grad_norm": 0.25377875566482544, "learning_rate": 9.787844815871722e-05, "loss": 4.5838, "step": 2505 }, { "epoch": 0.1705394754722109, "grad_norm": 0.7861871123313904, "learning_rate": 9.787420165783395e-05, "loss": 4.5731, "step": 2510 }, { "epoch": 0.17087919554287267, "grad_norm": 0.19743318855762482, "learning_rate": 9.786995515695068e-05, "loss": 4.3947, "step": 2515 }, { "epoch": 0.17121891561353444, "grad_norm": 0.3416430950164795, "learning_rate": 9.78657086560674e-05, "loss": 4.5711, "step": 2520 }, { "epoch": 0.17155863568419621, "grad_norm": 0.3679373562335968, "learning_rate": 9.786146215518413e-05, "loss": 4.6518, "step": 2525 }, { "epoch": 0.17189835575485798, "grad_norm": 0.23833996057510376, "learning_rate": 9.785721565430086e-05, "loss": 4.4339, "step": 2530 }, { "epoch": 0.17223807582551978, "grad_norm": 0.25589922070503235, "learning_rate": 9.785296915341759e-05, "loss": 4.6889, "step": 2535 }, { "epoch": 0.17257779589618155, "grad_norm": 0.27489981055259705, "learning_rate": 9.784872265253432e-05, "loss": 4.3215, "step": 2540 }, { "epoch": 0.17291751596684332, "grad_norm": 0.23039469122886658, "learning_rate": 9.784447615165105e-05, "loss": 4.4543, "step": 2545 }, { "epoch": 0.1732572360375051, "grad_norm": 0.3405773341655731, "learning_rate": 9.784022965076776e-05, "loss": 4.5138, "step": 2550 }, { "epoch": 0.17359695610816686, "grad_norm": 0.8154670000076294, "learning_rate": 9.78359831498845e-05, "loss": 4.4716, "step": 2555 }, { "epoch": 0.17393667617882866, "grad_norm": 0.30465012788772583, "learning_rate": 9.783173664900123e-05, "loss": 4.2262, "step": 2560 }, { "epoch": 0.17427639624949043, "grad_norm": 0.3995078504085541, "learning_rate": 9.782749014811795e-05, "loss": 4.5862, "step": 2565 }, { "epoch": 0.1746161163201522, "grad_norm": 0.2636319398880005, "learning_rate": 9.782324364723469e-05, "loss": 4.6933, "step": 2570 }, { "epoch": 0.17495583639081397, "grad_norm": 0.3614608943462372, "learning_rate": 9.781899714635141e-05, "loss": 4.6504, "step": 2575 }, { "epoch": 0.17529555646147574, "grad_norm": 0.3470248878002167, "learning_rate": 9.781475064546813e-05, "loss": 4.7038, "step": 2580 }, { "epoch": 0.1756352765321375, "grad_norm": 0.39428719878196716, "learning_rate": 9.781050414458487e-05, "loss": 4.2662, "step": 2585 }, { "epoch": 0.1759749966027993, "grad_norm": 0.22955843806266785, "learning_rate": 9.78062576437016e-05, "loss": 4.4044, "step": 2590 }, { "epoch": 0.17631471667346107, "grad_norm": 0.2899189293384552, "learning_rate": 9.780201114281831e-05, "loss": 4.4223, "step": 2595 }, { "epoch": 0.17665443674412284, "grad_norm": 0.4230986535549164, "learning_rate": 9.779776464193505e-05, "loss": 4.1781, "step": 2600 }, { "epoch": 0.17699415681478461, "grad_norm": 0.32788804173469543, "learning_rate": 9.779351814105178e-05, "loss": 4.616, "step": 2605 }, { "epoch": 0.17733387688544638, "grad_norm": 0.2200581431388855, "learning_rate": 9.77892716401685e-05, "loss": 4.2604, "step": 2610 }, { "epoch": 0.17767359695610815, "grad_norm": 0.30823394656181335, "learning_rate": 9.778502513928524e-05, "loss": 4.2978, "step": 2615 }, { "epoch": 0.17801331702676995, "grad_norm": 0.22299472987651825, "learning_rate": 9.778077863840195e-05, "loss": 4.3587, "step": 2620 }, { "epoch": 0.17835303709743172, "grad_norm": 0.22951941192150116, "learning_rate": 9.777653213751868e-05, "loss": 4.2573, "step": 2625 }, { "epoch": 0.1786927571680935, "grad_norm": 0.35953882336616516, "learning_rate": 9.777228563663542e-05, "loss": 4.3515, "step": 2630 }, { "epoch": 0.17903247723875526, "grad_norm": 0.4688868522644043, "learning_rate": 9.776803913575214e-05, "loss": 4.5713, "step": 2635 }, { "epoch": 0.17937219730941703, "grad_norm": 0.21083256602287292, "learning_rate": 9.776379263486888e-05, "loss": 4.4969, "step": 2640 }, { "epoch": 0.17971191738007883, "grad_norm": 0.36712825298309326, "learning_rate": 9.775954613398561e-05, "loss": 4.5619, "step": 2645 }, { "epoch": 0.1800516374507406, "grad_norm": 0.2260504513978958, "learning_rate": 9.775529963310232e-05, "loss": 4.6722, "step": 2650 }, { "epoch": 0.18039135752140237, "grad_norm": 0.36943840980529785, "learning_rate": 9.775105313221906e-05, "loss": 4.4934, "step": 2655 }, { "epoch": 0.18073107759206414, "grad_norm": 0.4936888515949249, "learning_rate": 9.774680663133579e-05, "loss": 4.348, "step": 2660 }, { "epoch": 0.1810707976627259, "grad_norm": 0.21958352625370026, "learning_rate": 9.77425601304525e-05, "loss": 4.4938, "step": 2665 }, { "epoch": 0.18141051773338768, "grad_norm": 1.1148053407669067, "learning_rate": 9.773831362956925e-05, "loss": 4.2976, "step": 2670 }, { "epoch": 0.18175023780404947, "grad_norm": 0.39846473932266235, "learning_rate": 9.773406712868597e-05, "loss": 4.3586, "step": 2675 }, { "epoch": 0.18208995787471124, "grad_norm": 0.28287413716316223, "learning_rate": 9.772982062780269e-05, "loss": 4.4437, "step": 2680 }, { "epoch": 0.18242967794537301, "grad_norm": 0.3402862846851349, "learning_rate": 9.772557412691943e-05, "loss": 4.5291, "step": 2685 }, { "epoch": 0.18276939801603478, "grad_norm": 0.3358980715274811, "learning_rate": 9.772132762603616e-05, "loss": 4.3738, "step": 2690 }, { "epoch": 0.18310911808669655, "grad_norm": 0.19017407298088074, "learning_rate": 9.771708112515287e-05, "loss": 4.1701, "step": 2695 }, { "epoch": 0.18344883815735832, "grad_norm": 0.2291361540555954, "learning_rate": 9.771283462426961e-05, "loss": 4.6092, "step": 2700 }, { "epoch": 0.18378855822802012, "grad_norm": 0.42033877968788147, "learning_rate": 9.770858812338633e-05, "loss": 4.3813, "step": 2705 }, { "epoch": 0.1841282782986819, "grad_norm": 0.22784222662448883, "learning_rate": 9.770434162250306e-05, "loss": 4.4408, "step": 2710 }, { "epoch": 0.18446799836934366, "grad_norm": 0.23395958542823792, "learning_rate": 9.77000951216198e-05, "loss": 4.4935, "step": 2715 }, { "epoch": 0.18480771844000543, "grad_norm": 0.2610359191894531, "learning_rate": 9.769584862073651e-05, "loss": 4.4932, "step": 2720 }, { "epoch": 0.1851474385106672, "grad_norm": 0.2646908164024353, "learning_rate": 9.769160211985324e-05, "loss": 4.4914, "step": 2725 }, { "epoch": 0.185487158581329, "grad_norm": 0.31001701951026917, "learning_rate": 9.768735561896998e-05, "loss": 4.5656, "step": 2730 }, { "epoch": 0.18582687865199077, "grad_norm": 0.3422091007232666, "learning_rate": 9.76831091180867e-05, "loss": 4.4946, "step": 2735 }, { "epoch": 0.18616659872265254, "grad_norm": 0.4761231243610382, "learning_rate": 9.767886261720343e-05, "loss": 4.1494, "step": 2740 }, { "epoch": 0.1865063187933143, "grad_norm": 0.23646193742752075, "learning_rate": 9.767461611632017e-05, "loss": 4.3254, "step": 2745 }, { "epoch": 0.18684603886397608, "grad_norm": 1.6517447233200073, "learning_rate": 9.767036961543688e-05, "loss": 4.5333, "step": 2750 }, { "epoch": 0.18718575893463785, "grad_norm": 0.2012016475200653, "learning_rate": 9.766612311455361e-05, "loss": 4.7069, "step": 2755 }, { "epoch": 0.18752547900529964, "grad_norm": 0.20281845331192017, "learning_rate": 9.766187661367035e-05, "loss": 4.4399, "step": 2760 }, { "epoch": 0.18786519907596141, "grad_norm": 0.1804925948381424, "learning_rate": 9.765763011278707e-05, "loss": 4.5354, "step": 2765 }, { "epoch": 0.18820491914662318, "grad_norm": 0.4761740267276764, "learning_rate": 9.76533836119038e-05, "loss": 4.6633, "step": 2770 }, { "epoch": 0.18854463921728495, "grad_norm": 0.22267234325408936, "learning_rate": 9.764913711102052e-05, "loss": 4.5686, "step": 2775 }, { "epoch": 0.18888435928794672, "grad_norm": 0.5881355404853821, "learning_rate": 9.764489061013725e-05, "loss": 4.4554, "step": 2780 }, { "epoch": 0.1892240793586085, "grad_norm": 0.43992605805397034, "learning_rate": 9.764064410925398e-05, "loss": 4.1684, "step": 2785 }, { "epoch": 0.1895637994292703, "grad_norm": 0.21498017013072968, "learning_rate": 9.76363976083707e-05, "loss": 4.5047, "step": 2790 }, { "epoch": 0.18990351949993206, "grad_norm": 0.37874165177345276, "learning_rate": 9.763215110748743e-05, "loss": 4.2255, "step": 2795 }, { "epoch": 0.19024323957059383, "grad_norm": 0.2565677762031555, "learning_rate": 9.762790460660416e-05, "loss": 4.4333, "step": 2800 }, { "epoch": 0.1905829596412556, "grad_norm": 0.2246963530778885, "learning_rate": 9.762365810572089e-05, "loss": 4.4198, "step": 2805 }, { "epoch": 0.19092267971191737, "grad_norm": 0.946719229221344, "learning_rate": 9.761941160483762e-05, "loss": 3.9055, "step": 2810 }, { "epoch": 0.19126239978257917, "grad_norm": 1.0544602870941162, "learning_rate": 9.761516510395435e-05, "loss": 4.4512, "step": 2815 }, { "epoch": 0.19160211985324094, "grad_norm": 0.21298794448375702, "learning_rate": 9.761091860307107e-05, "loss": 4.4729, "step": 2820 }, { "epoch": 0.1919418399239027, "grad_norm": 1.3822523355484009, "learning_rate": 9.76066721021878e-05, "loss": 4.3136, "step": 2825 }, { "epoch": 0.19228155999456448, "grad_norm": 0.1828567236661911, "learning_rate": 9.760242560130453e-05, "loss": 4.5116, "step": 2830 }, { "epoch": 0.19262128006522625, "grad_norm": 0.28580307960510254, "learning_rate": 9.759817910042126e-05, "loss": 4.4903, "step": 2835 }, { "epoch": 0.19296100013588802, "grad_norm": 0.39433717727661133, "learning_rate": 9.759393259953799e-05, "loss": 4.4487, "step": 2840 }, { "epoch": 0.19330072020654981, "grad_norm": 0.49140483140945435, "learning_rate": 9.758968609865471e-05, "loss": 4.4639, "step": 2845 }, { "epoch": 0.19364044027721158, "grad_norm": 0.3383556306362152, "learning_rate": 9.758543959777144e-05, "loss": 4.3328, "step": 2850 }, { "epoch": 0.19398016034787335, "grad_norm": 0.7367972135543823, "learning_rate": 9.758119309688817e-05, "loss": 4.5457, "step": 2855 }, { "epoch": 0.19431988041853512, "grad_norm": 0.19852545857429504, "learning_rate": 9.75769465960049e-05, "loss": 4.2916, "step": 2860 }, { "epoch": 0.1946596004891969, "grad_norm": 0.3379197120666504, "learning_rate": 9.757270009512163e-05, "loss": 4.4011, "step": 2865 }, { "epoch": 0.19499932055985866, "grad_norm": 0.4577140212059021, "learning_rate": 9.756845359423835e-05, "loss": 4.2438, "step": 2870 }, { "epoch": 0.19533904063052046, "grad_norm": 0.32074615359306335, "learning_rate": 9.756420709335508e-05, "loss": 4.3429, "step": 2875 }, { "epoch": 0.19567876070118223, "grad_norm": 0.4993734359741211, "learning_rate": 9.755996059247181e-05, "loss": 4.6458, "step": 2880 }, { "epoch": 0.196018480771844, "grad_norm": 0.21413934230804443, "learning_rate": 9.755571409158854e-05, "loss": 4.3236, "step": 2885 }, { "epoch": 0.19635820084250577, "grad_norm": 0.39588046073913574, "learning_rate": 9.755146759070527e-05, "loss": 4.2725, "step": 2890 }, { "epoch": 0.19669792091316754, "grad_norm": 0.23066450655460358, "learning_rate": 9.7547221089822e-05, "loss": 4.2981, "step": 2895 }, { "epoch": 0.19703764098382934, "grad_norm": 0.24343866109848022, "learning_rate": 9.754297458893872e-05, "loss": 4.4485, "step": 2900 }, { "epoch": 0.1973773610544911, "grad_norm": 0.2774411141872406, "learning_rate": 9.753872808805544e-05, "loss": 4.2737, "step": 2905 }, { "epoch": 0.19771708112515288, "grad_norm": 0.3360697329044342, "learning_rate": 9.753448158717218e-05, "loss": 4.2094, "step": 2910 }, { "epoch": 0.19805680119581465, "grad_norm": 0.3886429965496063, "learning_rate": 9.75302350862889e-05, "loss": 4.4864, "step": 2915 }, { "epoch": 0.19839652126647642, "grad_norm": 0.5242161154747009, "learning_rate": 9.752598858540562e-05, "loss": 4.3877, "step": 2920 }, { "epoch": 0.1987362413371382, "grad_norm": 0.2082594633102417, "learning_rate": 9.752174208452236e-05, "loss": 4.3006, "step": 2925 }, { "epoch": 0.19907596140779998, "grad_norm": 1.1216654777526855, "learning_rate": 9.751749558363909e-05, "loss": 4.2916, "step": 2930 }, { "epoch": 0.19941568147846175, "grad_norm": 0.1812744289636612, "learning_rate": 9.75132490827558e-05, "loss": 4.4246, "step": 2935 }, { "epoch": 0.19975540154912352, "grad_norm": 0.316278874874115, "learning_rate": 9.750900258187255e-05, "loss": 4.2248, "step": 2940 }, { "epoch": 0.2000951216197853, "grad_norm": 0.2795095443725586, "learning_rate": 9.750475608098927e-05, "loss": 4.4285, "step": 2945 }, { "epoch": 0.20043484169044706, "grad_norm": 0.25871169567108154, "learning_rate": 9.750050958010599e-05, "loss": 4.3629, "step": 2950 }, { "epoch": 0.20077456176110883, "grad_norm": 0.3203955888748169, "learning_rate": 9.749626307922273e-05, "loss": 4.8022, "step": 2955 }, { "epoch": 0.20111428183177063, "grad_norm": 0.897880494594574, "learning_rate": 9.749201657833946e-05, "loss": 4.444, "step": 2960 }, { "epoch": 0.2014540019024324, "grad_norm": 0.6095696687698364, "learning_rate": 9.748777007745617e-05, "loss": 4.3442, "step": 2965 }, { "epoch": 0.20179372197309417, "grad_norm": 0.8089606761932373, "learning_rate": 9.748352357657291e-05, "loss": 4.3223, "step": 2970 }, { "epoch": 0.20213344204375594, "grad_norm": 0.5481230616569519, "learning_rate": 9.747927707568963e-05, "loss": 4.3059, "step": 2975 }, { "epoch": 0.2024731621144177, "grad_norm": 0.24502769112586975, "learning_rate": 9.747503057480637e-05, "loss": 4.2946, "step": 2980 }, { "epoch": 0.2028128821850795, "grad_norm": 0.20267254114151, "learning_rate": 9.74707840739231e-05, "loss": 4.3506, "step": 2985 }, { "epoch": 0.20315260225574128, "grad_norm": 1.4581079483032227, "learning_rate": 9.746653757303981e-05, "loss": 4.3024, "step": 2990 }, { "epoch": 0.20349232232640305, "grad_norm": 0.3428595960140228, "learning_rate": 9.746229107215655e-05, "loss": 4.4748, "step": 2995 }, { "epoch": 0.20383204239706482, "grad_norm": 0.3032056391239166, "learning_rate": 9.745804457127328e-05, "loss": 4.3262, "step": 3000 }, { "epoch": 0.2041717624677266, "grad_norm": 0.32317832112312317, "learning_rate": 9.745379807039e-05, "loss": 4.475, "step": 3005 }, { "epoch": 0.20451148253838836, "grad_norm": 0.21782419085502625, "learning_rate": 9.744955156950674e-05, "loss": 4.5021, "step": 3010 }, { "epoch": 0.20485120260905015, "grad_norm": 0.17983724176883698, "learning_rate": 9.744530506862347e-05, "loss": 4.3826, "step": 3015 }, { "epoch": 0.20519092267971192, "grad_norm": 0.3824704587459564, "learning_rate": 9.744105856774018e-05, "loss": 4.4883, "step": 3020 }, { "epoch": 0.2055306427503737, "grad_norm": 0.6417528390884399, "learning_rate": 9.743681206685692e-05, "loss": 4.3623, "step": 3025 }, { "epoch": 0.20587036282103546, "grad_norm": 0.31229910254478455, "learning_rate": 9.743256556597365e-05, "loss": 4.3362, "step": 3030 }, { "epoch": 0.20621008289169723, "grad_norm": 0.35579913854599, "learning_rate": 9.742831906509036e-05, "loss": 4.3119, "step": 3035 }, { "epoch": 0.206549802962359, "grad_norm": 0.21225492656230927, "learning_rate": 9.74240725642071e-05, "loss": 4.4159, "step": 3040 }, { "epoch": 0.2068895230330208, "grad_norm": 0.5204954147338867, "learning_rate": 9.741982606332382e-05, "loss": 4.254, "step": 3045 }, { "epoch": 0.20722924310368257, "grad_norm": 0.22649656236171722, "learning_rate": 9.741557956244055e-05, "loss": 4.3604, "step": 3050 }, { "epoch": 0.20756896317434434, "grad_norm": 0.22533409297466278, "learning_rate": 9.741133306155729e-05, "loss": 4.1594, "step": 3055 }, { "epoch": 0.2079086832450061, "grad_norm": 0.2681191563606262, "learning_rate": 9.7407086560674e-05, "loss": 4.553, "step": 3060 }, { "epoch": 0.20824840331566788, "grad_norm": 1.2959145307540894, "learning_rate": 9.740284005979073e-05, "loss": 4.2071, "step": 3065 }, { "epoch": 0.20858812338632968, "grad_norm": 0.21679522097110748, "learning_rate": 9.739859355890747e-05, "loss": 4.5038, "step": 3070 }, { "epoch": 0.20892784345699145, "grad_norm": 0.36338409781455994, "learning_rate": 9.739434705802419e-05, "loss": 4.3356, "step": 3075 }, { "epoch": 0.20926756352765322, "grad_norm": 0.2271890938282013, "learning_rate": 9.739010055714092e-05, "loss": 4.3573, "step": 3080 }, { "epoch": 0.209607283598315, "grad_norm": 0.2753996253013611, "learning_rate": 9.738585405625766e-05, "loss": 4.4467, "step": 3085 }, { "epoch": 0.20994700366897676, "grad_norm": 0.32643699645996094, "learning_rate": 9.738160755537437e-05, "loss": 4.2911, "step": 3090 }, { "epoch": 0.21028672373963853, "grad_norm": 0.1794055551290512, "learning_rate": 9.73773610544911e-05, "loss": 4.3936, "step": 3095 }, { "epoch": 0.21062644381030032, "grad_norm": 0.2121143937110901, "learning_rate": 9.737311455360784e-05, "loss": 4.4391, "step": 3100 }, { "epoch": 0.2109661638809621, "grad_norm": 0.6584509015083313, "learning_rate": 9.736886805272456e-05, "loss": 4.3936, "step": 3105 }, { "epoch": 0.21130588395162386, "grad_norm": 0.2863527834415436, "learning_rate": 9.736462155184128e-05, "loss": 4.4768, "step": 3110 }, { "epoch": 0.21164560402228563, "grad_norm": 1.1741371154785156, "learning_rate": 9.736037505095803e-05, "loss": 4.3373, "step": 3115 }, { "epoch": 0.2119853240929474, "grad_norm": 0.3653934597969055, "learning_rate": 9.735612855007474e-05, "loss": 4.6397, "step": 3120 }, { "epoch": 0.21232504416360917, "grad_norm": 0.369391530752182, "learning_rate": 9.735188204919147e-05, "loss": 4.1315, "step": 3125 }, { "epoch": 0.21266476423427097, "grad_norm": 0.22272364795207977, "learning_rate": 9.73476355483082e-05, "loss": 4.4174, "step": 3130 }, { "epoch": 0.21300448430493274, "grad_norm": 0.26364120841026306, "learning_rate": 9.734338904742492e-05, "loss": 4.3034, "step": 3135 }, { "epoch": 0.2133442043755945, "grad_norm": 0.2755309045314789, "learning_rate": 9.733914254654165e-05, "loss": 4.3209, "step": 3140 }, { "epoch": 0.21368392444625628, "grad_norm": 0.27905556559562683, "learning_rate": 9.733489604565838e-05, "loss": 4.4046, "step": 3145 }, { "epoch": 0.21402364451691805, "grad_norm": 0.25759658217430115, "learning_rate": 9.733064954477511e-05, "loss": 4.1602, "step": 3150 }, { "epoch": 0.21436336458757985, "grad_norm": 1.0761340856552124, "learning_rate": 9.732640304389184e-05, "loss": 4.2794, "step": 3155 }, { "epoch": 0.21470308465824162, "grad_norm": 0.18029484152793884, "learning_rate": 9.732215654300856e-05, "loss": 4.3554, "step": 3160 }, { "epoch": 0.2150428047289034, "grad_norm": 0.373797208070755, "learning_rate": 9.731791004212529e-05, "loss": 4.3863, "step": 3165 }, { "epoch": 0.21538252479956516, "grad_norm": 0.6202191710472107, "learning_rate": 9.731366354124202e-05, "loss": 4.2955, "step": 3170 }, { "epoch": 0.21572224487022693, "grad_norm": 0.20301900804042816, "learning_rate": 9.730941704035875e-05, "loss": 4.2914, "step": 3175 }, { "epoch": 0.2160619649408887, "grad_norm": 0.17571194469928741, "learning_rate": 9.730517053947548e-05, "loss": 4.276, "step": 3180 }, { "epoch": 0.2164016850115505, "grad_norm": 0.3209381401538849, "learning_rate": 9.73009240385922e-05, "loss": 4.3957, "step": 3185 }, { "epoch": 0.21674140508221226, "grad_norm": 0.2638840079307556, "learning_rate": 9.729667753770893e-05, "loss": 4.6224, "step": 3190 }, { "epoch": 0.21708112515287403, "grad_norm": 0.19320239126682281, "learning_rate": 9.729243103682566e-05, "loss": 4.3039, "step": 3195 }, { "epoch": 0.2174208452235358, "grad_norm": 0.43768310546875, "learning_rate": 9.728818453594239e-05, "loss": 4.3898, "step": 3200 }, { "epoch": 0.21776056529419757, "grad_norm": 0.35756048560142517, "learning_rate": 9.728393803505912e-05, "loss": 4.5883, "step": 3205 }, { "epoch": 0.21810028536485934, "grad_norm": 0.2380749136209488, "learning_rate": 9.727969153417584e-05, "loss": 4.4224, "step": 3210 }, { "epoch": 0.21844000543552114, "grad_norm": 0.23136284947395325, "learning_rate": 9.727544503329257e-05, "loss": 4.3141, "step": 3215 }, { "epoch": 0.2187797255061829, "grad_norm": 0.3109607398509979, "learning_rate": 9.72711985324093e-05, "loss": 4.295, "step": 3220 }, { "epoch": 0.21911944557684468, "grad_norm": 0.4062863290309906, "learning_rate": 9.726695203152603e-05, "loss": 4.2421, "step": 3225 }, { "epoch": 0.21945916564750645, "grad_norm": 0.20023144781589508, "learning_rate": 9.726270553064276e-05, "loss": 4.3144, "step": 3230 }, { "epoch": 0.21979888571816822, "grad_norm": 0.8297600150108337, "learning_rate": 9.725845902975948e-05, "loss": 4.2986, "step": 3235 }, { "epoch": 0.22013860578883002, "grad_norm": 0.6315371990203857, "learning_rate": 9.725421252887621e-05, "loss": 4.4641, "step": 3240 }, { "epoch": 0.22047832585949179, "grad_norm": 0.2108875811100006, "learning_rate": 9.724996602799293e-05, "loss": 4.2764, "step": 3245 }, { "epoch": 0.22081804593015356, "grad_norm": 0.20751313865184784, "learning_rate": 9.724571952710967e-05, "loss": 4.3553, "step": 3250 }, { "epoch": 0.22115776600081533, "grad_norm": 0.24425362050533295, "learning_rate": 9.72414730262264e-05, "loss": 4.243, "step": 3255 }, { "epoch": 0.2214974860714771, "grad_norm": 0.22244137525558472, "learning_rate": 9.723722652534311e-05, "loss": 4.2259, "step": 3260 }, { "epoch": 0.22183720614213887, "grad_norm": 1.1119288206100464, "learning_rate": 9.723298002445985e-05, "loss": 4.322, "step": 3265 }, { "epoch": 0.22217692621280066, "grad_norm": 0.3089415729045868, "learning_rate": 9.722873352357658e-05, "loss": 4.5175, "step": 3270 }, { "epoch": 0.22251664628346243, "grad_norm": 0.2517615556716919, "learning_rate": 9.72244870226933e-05, "loss": 4.4571, "step": 3275 }, { "epoch": 0.2228563663541242, "grad_norm": 0.17470265924930573, "learning_rate": 9.722024052181004e-05, "loss": 4.2085, "step": 3280 }, { "epoch": 0.22319608642478597, "grad_norm": 0.22137637436389923, "learning_rate": 9.721599402092676e-05, "loss": 4.597, "step": 3285 }, { "epoch": 0.22353580649544774, "grad_norm": 0.3537333309650421, "learning_rate": 9.721174752004348e-05, "loss": 4.4751, "step": 3290 }, { "epoch": 0.22387552656610954, "grad_norm": 0.24241957068443298, "learning_rate": 9.720750101916022e-05, "loss": 4.0842, "step": 3295 }, { "epoch": 0.2242152466367713, "grad_norm": 0.2881457805633545, "learning_rate": 9.720325451827695e-05, "loss": 4.1991, "step": 3300 }, { "epoch": 0.22455496670743308, "grad_norm": 0.23095691204071045, "learning_rate": 9.719900801739366e-05, "loss": 4.4024, "step": 3305 }, { "epoch": 0.22489468677809485, "grad_norm": 0.25291046500205994, "learning_rate": 9.71947615165104e-05, "loss": 4.1577, "step": 3310 }, { "epoch": 0.22523440684875662, "grad_norm": 0.2241574227809906, "learning_rate": 9.719051501562713e-05, "loss": 4.2593, "step": 3315 }, { "epoch": 0.2255741269194184, "grad_norm": 0.19019931554794312, "learning_rate": 9.718626851474386e-05, "loss": 4.1674, "step": 3320 }, { "epoch": 0.22591384699008019, "grad_norm": 0.2490902543067932, "learning_rate": 9.718202201386059e-05, "loss": 4.201, "step": 3325 }, { "epoch": 0.22625356706074196, "grad_norm": 0.2902776896953583, "learning_rate": 9.71777755129773e-05, "loss": 4.5505, "step": 3330 }, { "epoch": 0.22659328713140373, "grad_norm": 0.22167052328586578, "learning_rate": 9.717352901209404e-05, "loss": 4.282, "step": 3335 }, { "epoch": 0.2269330072020655, "grad_norm": 3.241713523864746, "learning_rate": 9.716928251121077e-05, "loss": 4.095, "step": 3340 }, { "epoch": 0.22727272727272727, "grad_norm": 0.2534385323524475, "learning_rate": 9.716503601032749e-05, "loss": 4.325, "step": 3345 }, { "epoch": 0.22761244734338903, "grad_norm": 0.2039516121149063, "learning_rate": 9.716078950944423e-05, "loss": 4.3627, "step": 3350 }, { "epoch": 0.22795216741405083, "grad_norm": 0.20797346532344818, "learning_rate": 9.715654300856096e-05, "loss": 4.1518, "step": 3355 }, { "epoch": 0.2282918874847126, "grad_norm": 0.2560058534145355, "learning_rate": 9.715229650767767e-05, "loss": 4.2377, "step": 3360 }, { "epoch": 0.22863160755537437, "grad_norm": 0.20020583271980286, "learning_rate": 9.714805000679441e-05, "loss": 4.4088, "step": 3365 }, { "epoch": 0.22897132762603614, "grad_norm": 0.32701200246810913, "learning_rate": 9.714380350591114e-05, "loss": 4.4603, "step": 3370 }, { "epoch": 0.2293110476966979, "grad_norm": 0.16908589005470276, "learning_rate": 9.713955700502786e-05, "loss": 4.4159, "step": 3375 }, { "epoch": 0.2296507677673597, "grad_norm": 10.86708927154541, "learning_rate": 9.71353105041446e-05, "loss": 4.373, "step": 3380 }, { "epoch": 0.22999048783802148, "grad_norm": 0.2162582278251648, "learning_rate": 9.713106400326132e-05, "loss": 4.3303, "step": 3385 }, { "epoch": 0.23033020790868325, "grad_norm": 0.1772332489490509, "learning_rate": 9.712681750237804e-05, "loss": 4.2396, "step": 3390 }, { "epoch": 0.23066992797934502, "grad_norm": 0.36134952306747437, "learning_rate": 9.712257100149478e-05, "loss": 4.3838, "step": 3395 }, { "epoch": 0.2310096480500068, "grad_norm": 0.32894113659858704, "learning_rate": 9.71183245006115e-05, "loss": 4.177, "step": 3400 }, { "epoch": 0.23134936812066856, "grad_norm": 0.2267148792743683, "learning_rate": 9.711407799972822e-05, "loss": 4.3994, "step": 3405 }, { "epoch": 0.23168908819133036, "grad_norm": 0.22997945547103882, "learning_rate": 9.710983149884496e-05, "loss": 4.2585, "step": 3410 }, { "epoch": 0.23202880826199213, "grad_norm": 0.21913081407546997, "learning_rate": 9.710558499796168e-05, "loss": 4.495, "step": 3415 }, { "epoch": 0.2323685283326539, "grad_norm": 0.2355417013168335, "learning_rate": 9.710133849707841e-05, "loss": 4.4619, "step": 3420 }, { "epoch": 0.23270824840331567, "grad_norm": 0.29134589433670044, "learning_rate": 9.709709199619515e-05, "loss": 4.3438, "step": 3425 }, { "epoch": 0.23304796847397743, "grad_norm": 0.4645059108734131, "learning_rate": 9.709284549531186e-05, "loss": 4.467, "step": 3430 }, { "epoch": 0.2333876885446392, "grad_norm": 0.3466382324695587, "learning_rate": 9.708859899442859e-05, "loss": 4.4636, "step": 3435 }, { "epoch": 0.233727408615301, "grad_norm": 0.2788010835647583, "learning_rate": 9.708435249354533e-05, "loss": 4.1318, "step": 3440 }, { "epoch": 0.23406712868596277, "grad_norm": 0.4784042537212372, "learning_rate": 9.708010599266205e-05, "loss": 4.2089, "step": 3445 }, { "epoch": 0.23440684875662454, "grad_norm": 0.45934122800827026, "learning_rate": 9.707585949177878e-05, "loss": 4.3023, "step": 3450 }, { "epoch": 0.2347465688272863, "grad_norm": 0.25707322359085083, "learning_rate": 9.707161299089552e-05, "loss": 4.2901, "step": 3455 }, { "epoch": 0.23508628889794808, "grad_norm": 0.4797256290912628, "learning_rate": 9.706736649001223e-05, "loss": 4.2574, "step": 3460 }, { "epoch": 0.23542600896860988, "grad_norm": 0.2368171215057373, "learning_rate": 9.706311998912896e-05, "loss": 4.3998, "step": 3465 }, { "epoch": 0.23576572903927165, "grad_norm": 1.7958965301513672, "learning_rate": 9.705887348824569e-05, "loss": 4.212, "step": 3470 }, { "epoch": 0.23610544910993342, "grad_norm": 0.24695445597171783, "learning_rate": 9.705462698736242e-05, "loss": 4.3325, "step": 3475 }, { "epoch": 0.2364451691805952, "grad_norm": 0.24877724051475525, "learning_rate": 9.705038048647914e-05, "loss": 4.4167, "step": 3480 }, { "epoch": 0.23678488925125696, "grad_norm": 0.2147648185491562, "learning_rate": 9.704613398559587e-05, "loss": 4.3511, "step": 3485 }, { "epoch": 0.23712460932191873, "grad_norm": 0.38735896348953247, "learning_rate": 9.70418874847126e-05, "loss": 4.054, "step": 3490 }, { "epoch": 0.23746432939258053, "grad_norm": 0.28407546877861023, "learning_rate": 9.703764098382933e-05, "loss": 4.1299, "step": 3495 }, { "epoch": 0.2378040494632423, "grad_norm": 0.4963781237602234, "learning_rate": 9.703339448294606e-05, "loss": 4.3751, "step": 3500 }, { "epoch": 0.23814376953390406, "grad_norm": 0.2363215535879135, "learning_rate": 9.702914798206278e-05, "loss": 4.3631, "step": 3505 }, { "epoch": 0.23848348960456583, "grad_norm": 0.2752895951271057, "learning_rate": 9.702490148117951e-05, "loss": 4.3192, "step": 3510 }, { "epoch": 0.2388232096752276, "grad_norm": 0.2011261135339737, "learning_rate": 9.702065498029624e-05, "loss": 4.3019, "step": 3515 }, { "epoch": 0.23916292974588937, "grad_norm": 0.30605676770210266, "learning_rate": 9.701640847941297e-05, "loss": 4.2733, "step": 3520 }, { "epoch": 0.23950264981655117, "grad_norm": 0.23777063190937042, "learning_rate": 9.70121619785297e-05, "loss": 4.4391, "step": 3525 }, { "epoch": 0.23984236988721294, "grad_norm": 0.19578081369400024, "learning_rate": 9.700791547764642e-05, "loss": 4.3464, "step": 3530 }, { "epoch": 0.2401820899578747, "grad_norm": 0.43479400873184204, "learning_rate": 9.700366897676315e-05, "loss": 4.1509, "step": 3535 }, { "epoch": 0.24052181002853648, "grad_norm": 0.23320983350276947, "learning_rate": 9.699942247587988e-05, "loss": 4.2031, "step": 3540 }, { "epoch": 0.24086153009919825, "grad_norm": 0.395224004983902, "learning_rate": 9.699517597499661e-05, "loss": 4.3575, "step": 3545 }, { "epoch": 0.24120125016986005, "grad_norm": 0.22553794085979462, "learning_rate": 9.699092947411334e-05, "loss": 4.4234, "step": 3550 }, { "epoch": 0.24154097024052182, "grad_norm": 0.21396693587303162, "learning_rate": 9.698668297323006e-05, "loss": 4.3933, "step": 3555 }, { "epoch": 0.2418806903111836, "grad_norm": 0.2883829176425934, "learning_rate": 9.698243647234679e-05, "loss": 4.1656, "step": 3560 }, { "epoch": 0.24222041038184536, "grad_norm": 0.3761749267578125, "learning_rate": 9.697818997146352e-05, "loss": 4.254, "step": 3565 }, { "epoch": 0.24256013045250713, "grad_norm": 0.2654179036617279, "learning_rate": 9.697394347058025e-05, "loss": 4.0754, "step": 3570 }, { "epoch": 0.2428998505231689, "grad_norm": 0.20404711365699768, "learning_rate": 9.696969696969698e-05, "loss": 4.4404, "step": 3575 }, { "epoch": 0.2432395705938307, "grad_norm": 0.661677360534668, "learning_rate": 9.69654504688137e-05, "loss": 4.4084, "step": 3580 }, { "epoch": 0.24357929066449246, "grad_norm": 0.19168756902217865, "learning_rate": 9.696120396793043e-05, "loss": 4.3991, "step": 3585 }, { "epoch": 0.24391901073515423, "grad_norm": 0.21689128875732422, "learning_rate": 9.695695746704716e-05, "loss": 4.2182, "step": 3590 }, { "epoch": 0.244258730805816, "grad_norm": 0.1910148561000824, "learning_rate": 9.695271096616389e-05, "loss": 4.2426, "step": 3595 }, { "epoch": 0.24459845087647777, "grad_norm": 0.463371604681015, "learning_rate": 9.69484644652806e-05, "loss": 4.1501, "step": 3600 }, { "epoch": 0.24493817094713954, "grad_norm": 0.2187051922082901, "learning_rate": 9.694421796439734e-05, "loss": 4.2383, "step": 3605 }, { "epoch": 0.24527789101780134, "grad_norm": 0.7701082229614258, "learning_rate": 9.693997146351407e-05, "loss": 4.2811, "step": 3610 }, { "epoch": 0.2456176110884631, "grad_norm": 0.2454994171857834, "learning_rate": 9.693572496263079e-05, "loss": 4.3994, "step": 3615 }, { "epoch": 0.24595733115912488, "grad_norm": 0.22179093956947327, "learning_rate": 9.693147846174753e-05, "loss": 4.1261, "step": 3620 }, { "epoch": 0.24629705122978665, "grad_norm": 0.23975835740566254, "learning_rate": 9.692723196086426e-05, "loss": 4.308, "step": 3625 }, { "epoch": 0.24663677130044842, "grad_norm": 0.21660096943378448, "learning_rate": 9.692298545998097e-05, "loss": 4.2105, "step": 3630 }, { "epoch": 0.24697649137111022, "grad_norm": 0.22534438967704773, "learning_rate": 9.691873895909771e-05, "loss": 4.2923, "step": 3635 }, { "epoch": 0.247316211441772, "grad_norm": 0.19649091362953186, "learning_rate": 9.691449245821444e-05, "loss": 4.4036, "step": 3640 }, { "epoch": 0.24765593151243376, "grad_norm": 0.32042601704597473, "learning_rate": 9.691024595733115e-05, "loss": 4.4266, "step": 3645 }, { "epoch": 0.24799565158309553, "grad_norm": 0.6859878301620483, "learning_rate": 9.69059994564479e-05, "loss": 4.2232, "step": 3650 }, { "epoch": 0.2483353716537573, "grad_norm": 0.23352079093456268, "learning_rate": 9.690175295556462e-05, "loss": 4.0696, "step": 3655 }, { "epoch": 0.24867509172441907, "grad_norm": 0.272712767124176, "learning_rate": 9.689750645468135e-05, "loss": 4.1759, "step": 3660 }, { "epoch": 0.24901481179508086, "grad_norm": 0.22009974718093872, "learning_rate": 9.689325995379808e-05, "loss": 4.1973, "step": 3665 }, { "epoch": 0.24935453186574263, "grad_norm": 1.4543390274047852, "learning_rate": 9.68890134529148e-05, "loss": 4.314, "step": 3670 }, { "epoch": 0.2496942519364044, "grad_norm": 0.3941098153591156, "learning_rate": 9.688476695203154e-05, "loss": 4.4651, "step": 3675 }, { "epoch": 0.2500339720070662, "grad_norm": 0.28159454464912415, "learning_rate": 9.688052045114826e-05, "loss": 4.2579, "step": 3680 }, { "epoch": 0.25037369207772797, "grad_norm": 0.22340060770511627, "learning_rate": 9.687627395026498e-05, "loss": 4.2427, "step": 3685 }, { "epoch": 0.2507134121483897, "grad_norm": 0.24438177049160004, "learning_rate": 9.687202744938172e-05, "loss": 4.1173, "step": 3690 }, { "epoch": 0.2510531322190515, "grad_norm": 0.19045932590961456, "learning_rate": 9.686778094849845e-05, "loss": 4.3531, "step": 3695 }, { "epoch": 0.25139285228971325, "grad_norm": 0.21072247624397278, "learning_rate": 9.686353444761516e-05, "loss": 4.3211, "step": 3700 }, { "epoch": 0.25173257236037505, "grad_norm": 0.20157082378864288, "learning_rate": 9.68592879467319e-05, "loss": 4.3939, "step": 3705 }, { "epoch": 0.25207229243103685, "grad_norm": 0.23919062316417694, "learning_rate": 9.685504144584863e-05, "loss": 4.2216, "step": 3710 }, { "epoch": 0.2524120125016986, "grad_norm": 0.3379192352294922, "learning_rate": 9.685079494496535e-05, "loss": 4.15, "step": 3715 }, { "epoch": 0.2527517325723604, "grad_norm": 0.2691631615161896, "learning_rate": 9.684654844408209e-05, "loss": 4.178, "step": 3720 }, { "epoch": 0.25309145264302213, "grad_norm": 0.2460995614528656, "learning_rate": 9.684230194319882e-05, "loss": 4.1714, "step": 3725 }, { "epoch": 0.2534311727136839, "grad_norm": 0.24896664917469025, "learning_rate": 9.683805544231553e-05, "loss": 4.5137, "step": 3730 }, { "epoch": 0.2537708927843457, "grad_norm": 0.2998896837234497, "learning_rate": 9.683380894143227e-05, "loss": 4.2024, "step": 3735 }, { "epoch": 0.25411061285500747, "grad_norm": 0.2170042097568512, "learning_rate": 9.6829562440549e-05, "loss": 4.2967, "step": 3740 }, { "epoch": 0.25445033292566926, "grad_norm": 0.2534918487071991, "learning_rate": 9.682531593966571e-05, "loss": 4.1857, "step": 3745 }, { "epoch": 0.254790052996331, "grad_norm": 0.2045327126979828, "learning_rate": 9.682106943878246e-05, "loss": 3.9961, "step": 3750 }, { "epoch": 0.2551297730669928, "grad_norm": 0.23638245463371277, "learning_rate": 9.681682293789917e-05, "loss": 4.3547, "step": 3755 }, { "epoch": 0.25546949313765455, "grad_norm": 0.22549360990524292, "learning_rate": 9.68125764370159e-05, "loss": 4.1726, "step": 3760 }, { "epoch": 0.25580921320831634, "grad_norm": 0.24715493619441986, "learning_rate": 9.680832993613264e-05, "loss": 4.2836, "step": 3765 }, { "epoch": 0.25614893327897814, "grad_norm": 0.33308762311935425, "learning_rate": 9.680408343524935e-05, "loss": 4.2667, "step": 3770 }, { "epoch": 0.2564886533496399, "grad_norm": 0.4240279197692871, "learning_rate": 9.679983693436608e-05, "loss": 4.2174, "step": 3775 }, { "epoch": 0.2568283734203017, "grad_norm": 0.26198095083236694, "learning_rate": 9.679559043348282e-05, "loss": 4.33, "step": 3780 }, { "epoch": 0.2571680934909634, "grad_norm": 0.21898075938224792, "learning_rate": 9.679134393259954e-05, "loss": 4.2754, "step": 3785 }, { "epoch": 0.2575078135616252, "grad_norm": 0.1936497837305069, "learning_rate": 9.678709743171627e-05, "loss": 4.2178, "step": 3790 }, { "epoch": 0.257847533632287, "grad_norm": 0.3042401075363159, "learning_rate": 9.678285093083301e-05, "loss": 4.2444, "step": 3795 }, { "epoch": 0.25818725370294876, "grad_norm": 0.22089192271232605, "learning_rate": 9.677860442994972e-05, "loss": 4.1209, "step": 3800 }, { "epoch": 0.25852697377361056, "grad_norm": 0.26595672965049744, "learning_rate": 9.677435792906645e-05, "loss": 4.3664, "step": 3805 }, { "epoch": 0.2588666938442723, "grad_norm": 0.38972827792167664, "learning_rate": 9.677011142818319e-05, "loss": 4.3291, "step": 3810 }, { "epoch": 0.2592064139149341, "grad_norm": 0.8308687210083008, "learning_rate": 9.67658649272999e-05, "loss": 4.3588, "step": 3815 }, { "epoch": 0.2595461339855959, "grad_norm": 0.29095426201820374, "learning_rate": 9.676161842641663e-05, "loss": 4.2095, "step": 3820 }, { "epoch": 0.25988585405625764, "grad_norm": 0.6666823625564575, "learning_rate": 9.675737192553336e-05, "loss": 4.249, "step": 3825 }, { "epoch": 0.26022557412691943, "grad_norm": 0.2800503373146057, "learning_rate": 9.675312542465009e-05, "loss": 4.0724, "step": 3830 }, { "epoch": 0.2605652941975812, "grad_norm": 0.31251639127731323, "learning_rate": 9.674887892376682e-05, "loss": 4.2177, "step": 3835 }, { "epoch": 0.260905014268243, "grad_norm": 0.19203290343284607, "learning_rate": 9.674463242288355e-05, "loss": 4.1164, "step": 3840 }, { "epoch": 0.2612447343389047, "grad_norm": 0.21506604552268982, "learning_rate": 9.674038592200027e-05, "loss": 4.3448, "step": 3845 }, { "epoch": 0.2615844544095665, "grad_norm": 0.2286742478609085, "learning_rate": 9.6736139421117e-05, "loss": 4.3154, "step": 3850 }, { "epoch": 0.2619241744802283, "grad_norm": 0.22341595590114594, "learning_rate": 9.673189292023373e-05, "loss": 4.0496, "step": 3855 }, { "epoch": 0.26226389455089005, "grad_norm": 3.2631723880767822, "learning_rate": 9.672764641935046e-05, "loss": 4.2739, "step": 3860 }, { "epoch": 0.26260361462155185, "grad_norm": 0.21692293882369995, "learning_rate": 9.672339991846719e-05, "loss": 4.2239, "step": 3865 }, { "epoch": 0.2629433346922136, "grad_norm": 0.24772769212722778, "learning_rate": 9.671915341758391e-05, "loss": 4.0354, "step": 3870 }, { "epoch": 0.2632830547628754, "grad_norm": 0.2190844863653183, "learning_rate": 9.671490691670064e-05, "loss": 4.2554, "step": 3875 }, { "epoch": 0.2636227748335372, "grad_norm": 0.19608178734779358, "learning_rate": 9.671066041581737e-05, "loss": 4.2199, "step": 3880 }, { "epoch": 0.26396249490419893, "grad_norm": 0.22313562035560608, "learning_rate": 9.67064139149341e-05, "loss": 4.1976, "step": 3885 }, { "epoch": 0.2643022149748607, "grad_norm": 0.25129613280296326, "learning_rate": 9.670216741405083e-05, "loss": 4.2595, "step": 3890 }, { "epoch": 0.26464193504552247, "grad_norm": 0.19212405383586884, "learning_rate": 9.669792091316755e-05, "loss": 4.3704, "step": 3895 }, { "epoch": 0.26498165511618427, "grad_norm": 0.21401169896125793, "learning_rate": 9.669367441228428e-05, "loss": 4.1088, "step": 3900 }, { "epoch": 0.26532137518684606, "grad_norm": 0.2625492811203003, "learning_rate": 9.668942791140101e-05, "loss": 4.4765, "step": 3905 }, { "epoch": 0.2656610952575078, "grad_norm": 0.23690305650234222, "learning_rate": 9.668518141051774e-05, "loss": 4.1193, "step": 3910 }, { "epoch": 0.2660008153281696, "grad_norm": 0.2038702368736267, "learning_rate": 9.668093490963447e-05, "loss": 4.4361, "step": 3915 }, { "epoch": 0.26634053539883135, "grad_norm": 1.9976972341537476, "learning_rate": 9.66766884087512e-05, "loss": 4.448, "step": 3920 }, { "epoch": 0.26668025546949314, "grad_norm": 0.2619224488735199, "learning_rate": 9.667244190786792e-05, "loss": 4.3061, "step": 3925 }, { "epoch": 0.2670199755401549, "grad_norm": 0.17488695681095123, "learning_rate": 9.666819540698465e-05, "loss": 3.924, "step": 3930 }, { "epoch": 0.2673596956108167, "grad_norm": 0.3555572032928467, "learning_rate": 9.666394890610138e-05, "loss": 4.4889, "step": 3935 }, { "epoch": 0.2676994156814785, "grad_norm": 0.18651026487350464, "learning_rate": 9.66597024052181e-05, "loss": 4.2601, "step": 3940 }, { "epoch": 0.2680391357521402, "grad_norm": 0.4118260145187378, "learning_rate": 9.665545590433483e-05, "loss": 4.0048, "step": 3945 }, { "epoch": 0.268378855822802, "grad_norm": 0.21420472860336304, "learning_rate": 9.665120940345156e-05, "loss": 4.5124, "step": 3950 }, { "epoch": 0.26871857589346376, "grad_norm": 0.25867247581481934, "learning_rate": 9.664696290256828e-05, "loss": 3.9366, "step": 3955 }, { "epoch": 0.26905829596412556, "grad_norm": 0.9560242295265198, "learning_rate": 9.664271640168502e-05, "loss": 4.273, "step": 3960 }, { "epoch": 0.26939801603478736, "grad_norm": 0.22547510266304016, "learning_rate": 9.663846990080175e-05, "loss": 4.4065, "step": 3965 }, { "epoch": 0.2697377361054491, "grad_norm": 0.4761745035648346, "learning_rate": 9.663422339991846e-05, "loss": 4.0622, "step": 3970 }, { "epoch": 0.2700774561761109, "grad_norm": 0.26078933477401733, "learning_rate": 9.66299768990352e-05, "loss": 4.4622, "step": 3975 }, { "epoch": 0.27041717624677264, "grad_norm": 0.21970224380493164, "learning_rate": 9.662573039815193e-05, "loss": 4.2412, "step": 3980 }, { "epoch": 0.27075689631743444, "grad_norm": 0.5211921334266663, "learning_rate": 9.662148389726865e-05, "loss": 4.3848, "step": 3985 }, { "epoch": 0.27109661638809623, "grad_norm": 0.31244269013404846, "learning_rate": 9.661723739638539e-05, "loss": 4.3852, "step": 3990 }, { "epoch": 0.271436336458758, "grad_norm": 0.29353293776512146, "learning_rate": 9.661299089550211e-05, "loss": 4.091, "step": 3995 }, { "epoch": 0.2717760565294198, "grad_norm": 0.23753587901592255, "learning_rate": 9.660874439461884e-05, "loss": 4.3828, "step": 4000 }, { "epoch": 0.2721157766000815, "grad_norm": 0.2865026593208313, "learning_rate": 9.660449789373557e-05, "loss": 4.2366, "step": 4005 }, { "epoch": 0.2724554966707433, "grad_norm": 0.32267966866493225, "learning_rate": 9.66002513928523e-05, "loss": 4.1455, "step": 4010 }, { "epoch": 0.27279521674140506, "grad_norm": 0.25711655616760254, "learning_rate": 9.659600489196903e-05, "loss": 4.4231, "step": 4015 }, { "epoch": 0.27313493681206685, "grad_norm": 0.21570606529712677, "learning_rate": 9.659175839108575e-05, "loss": 4.2092, "step": 4020 }, { "epoch": 0.27347465688272865, "grad_norm": 0.22739212214946747, "learning_rate": 9.658751189020247e-05, "loss": 4.2107, "step": 4025 }, { "epoch": 0.2738143769533904, "grad_norm": 0.22698377072811127, "learning_rate": 9.658326538931921e-05, "loss": 3.965, "step": 4030 }, { "epoch": 0.2741540970240522, "grad_norm": 0.2108132541179657, "learning_rate": 9.657901888843594e-05, "loss": 4.3312, "step": 4035 }, { "epoch": 0.27449381709471393, "grad_norm": 0.3985457122325897, "learning_rate": 9.657477238755265e-05, "loss": 4.1752, "step": 4040 }, { "epoch": 0.27483353716537573, "grad_norm": 0.2395816147327423, "learning_rate": 9.65705258866694e-05, "loss": 4.6349, "step": 4045 }, { "epoch": 0.2751732572360375, "grad_norm": 0.24473239481449127, "learning_rate": 9.656627938578612e-05, "loss": 4.4361, "step": 4050 }, { "epoch": 0.27551297730669927, "grad_norm": 0.3756929636001587, "learning_rate": 9.656203288490284e-05, "loss": 4.2976, "step": 4055 }, { "epoch": 0.27585269737736107, "grad_norm": 0.2284708023071289, "learning_rate": 9.655778638401958e-05, "loss": 4.2554, "step": 4060 }, { "epoch": 0.2761924174480228, "grad_norm": 0.26325544714927673, "learning_rate": 9.655353988313631e-05, "loss": 4.152, "step": 4065 }, { "epoch": 0.2765321375186846, "grad_norm": 0.20146878063678741, "learning_rate": 9.654929338225302e-05, "loss": 4.1147, "step": 4070 }, { "epoch": 0.2768718575893464, "grad_norm": 0.21572470664978027, "learning_rate": 9.654504688136976e-05, "loss": 4.2792, "step": 4075 }, { "epoch": 0.27721157766000815, "grad_norm": 0.32967862486839294, "learning_rate": 9.654080038048649e-05, "loss": 4.1379, "step": 4080 }, { "epoch": 0.27755129773066994, "grad_norm": 0.23224560916423798, "learning_rate": 9.65365538796032e-05, "loss": 4.0244, "step": 4085 }, { "epoch": 0.2778910178013317, "grad_norm": 0.2101649045944214, "learning_rate": 9.653230737871995e-05, "loss": 4.2301, "step": 4090 }, { "epoch": 0.2782307378719935, "grad_norm": 0.22994142770767212, "learning_rate": 9.652806087783666e-05, "loss": 4.1505, "step": 4095 }, { "epoch": 0.2785704579426552, "grad_norm": 0.23458316922187805, "learning_rate": 9.652381437695339e-05, "loss": 4.2603, "step": 4100 }, { "epoch": 0.278910178013317, "grad_norm": 1.6248669624328613, "learning_rate": 9.651956787607013e-05, "loss": 4.1157, "step": 4105 }, { "epoch": 0.2792498980839788, "grad_norm": 0.49504292011260986, "learning_rate": 9.651532137518685e-05, "loss": 4.2854, "step": 4110 }, { "epoch": 0.27958961815464056, "grad_norm": 0.2064412534236908, "learning_rate": 9.651107487430357e-05, "loss": 4.2062, "step": 4115 }, { "epoch": 0.27992933822530236, "grad_norm": 0.26098746061325073, "learning_rate": 9.650682837342031e-05, "loss": 4.1904, "step": 4120 }, { "epoch": 0.2802690582959641, "grad_norm": 1.5820512771606445, "learning_rate": 9.650258187253703e-05, "loss": 4.0721, "step": 4125 }, { "epoch": 0.2806087783666259, "grad_norm": 0.28080296516418457, "learning_rate": 9.649833537165376e-05, "loss": 4.1324, "step": 4130 }, { "epoch": 0.2809484984372877, "grad_norm": 0.186203733086586, "learning_rate": 9.64940888707705e-05, "loss": 4.0794, "step": 4135 }, { "epoch": 0.28128821850794944, "grad_norm": 0.3637949824333191, "learning_rate": 9.648984236988721e-05, "loss": 4.135, "step": 4140 }, { "epoch": 0.28162793857861124, "grad_norm": 0.24603451788425446, "learning_rate": 9.648559586900394e-05, "loss": 4.3026, "step": 4145 }, { "epoch": 0.281967658649273, "grad_norm": 0.21384142339229584, "learning_rate": 9.648134936812068e-05, "loss": 4.5439, "step": 4150 }, { "epoch": 0.2823073787199348, "grad_norm": 0.23172922432422638, "learning_rate": 9.64771028672374e-05, "loss": 4.2762, "step": 4155 }, { "epoch": 0.2826470987905966, "grad_norm": 0.22065469622612, "learning_rate": 9.647285636635413e-05, "loss": 4.3842, "step": 4160 }, { "epoch": 0.2829868188612583, "grad_norm": 3.40319561958313, "learning_rate": 9.646860986547087e-05, "loss": 4.1898, "step": 4165 }, { "epoch": 0.2833265389319201, "grad_norm": 0.20233049988746643, "learning_rate": 9.646436336458758e-05, "loss": 4.0764, "step": 4170 }, { "epoch": 0.28366625900258186, "grad_norm": 0.586283802986145, "learning_rate": 9.646011686370431e-05, "loss": 4.314, "step": 4175 }, { "epoch": 0.28400597907324365, "grad_norm": 2.9164395332336426, "learning_rate": 9.645587036282104e-05, "loss": 4.0146, "step": 4180 }, { "epoch": 0.2843456991439054, "grad_norm": 0.28996986150741577, "learning_rate": 9.645162386193777e-05, "loss": 4.2867, "step": 4185 }, { "epoch": 0.2846854192145672, "grad_norm": 0.5857749581336975, "learning_rate": 9.64473773610545e-05, "loss": 3.9711, "step": 4190 }, { "epoch": 0.285025139285229, "grad_norm": 0.2920601963996887, "learning_rate": 9.644313086017122e-05, "loss": 4.2076, "step": 4195 }, { "epoch": 0.28536485935589073, "grad_norm": 0.5181906223297119, "learning_rate": 9.643888435928795e-05, "loss": 4.2119, "step": 4200 }, { "epoch": 0.28570457942655253, "grad_norm": 0.28782758116722107, "learning_rate": 9.643463785840468e-05, "loss": 4.3009, "step": 4205 }, { "epoch": 0.28604429949721427, "grad_norm": 0.23932106792926788, "learning_rate": 9.64303913575214e-05, "loss": 4.2094, "step": 4210 }, { "epoch": 0.28638401956787607, "grad_norm": 0.3413240611553192, "learning_rate": 9.642614485663813e-05, "loss": 4.2305, "step": 4215 }, { "epoch": 0.28672373963853787, "grad_norm": 2.542914390563965, "learning_rate": 9.642189835575486e-05, "loss": 4.4582, "step": 4220 }, { "epoch": 0.2870634597091996, "grad_norm": 0.21945473551750183, "learning_rate": 9.641765185487159e-05, "loss": 4.3744, "step": 4225 }, { "epoch": 0.2874031797798614, "grad_norm": 0.209177166223526, "learning_rate": 9.641340535398832e-05, "loss": 4.2808, "step": 4230 }, { "epoch": 0.28774289985052315, "grad_norm": 0.2568071782588959, "learning_rate": 9.640915885310505e-05, "loss": 3.9962, "step": 4235 }, { "epoch": 0.28808261992118495, "grad_norm": 0.31277233362197876, "learning_rate": 9.640491235222177e-05, "loss": 4.3054, "step": 4240 }, { "epoch": 0.28842233999184674, "grad_norm": 0.24698586761951447, "learning_rate": 9.64006658513385e-05, "loss": 4.5058, "step": 4245 }, { "epoch": 0.2887620600625085, "grad_norm": 0.23559850454330444, "learning_rate": 9.639641935045523e-05, "loss": 4.0805, "step": 4250 }, { "epoch": 0.2891017801331703, "grad_norm": 0.22997041046619415, "learning_rate": 9.639217284957196e-05, "loss": 3.9677, "step": 4255 }, { "epoch": 0.289441500203832, "grad_norm": 0.2484733760356903, "learning_rate": 9.638792634868869e-05, "loss": 4.4475, "step": 4260 }, { "epoch": 0.2897812202744938, "grad_norm": 0.19737331569194794, "learning_rate": 9.638367984780541e-05, "loss": 4.0734, "step": 4265 }, { "epoch": 0.29012094034515556, "grad_norm": 0.21674834191799164, "learning_rate": 9.637943334692214e-05, "loss": 4.2674, "step": 4270 }, { "epoch": 0.29046066041581736, "grad_norm": 1.3918811082839966, "learning_rate": 9.637518684603887e-05, "loss": 4.2559, "step": 4275 }, { "epoch": 0.29080038048647916, "grad_norm": 0.3141735792160034, "learning_rate": 9.63709403451556e-05, "loss": 4.2088, "step": 4280 }, { "epoch": 0.2911401005571409, "grad_norm": 0.17463235557079315, "learning_rate": 9.636669384427233e-05, "loss": 4.2172, "step": 4285 }, { "epoch": 0.2914798206278027, "grad_norm": 0.28866469860076904, "learning_rate": 9.636244734338905e-05, "loss": 4.1058, "step": 4290 }, { "epoch": 0.29181954069846444, "grad_norm": 0.19368840754032135, "learning_rate": 9.635820084250577e-05, "loss": 4.3816, "step": 4295 }, { "epoch": 0.29215926076912624, "grad_norm": 0.5456646680831909, "learning_rate": 9.635395434162251e-05, "loss": 4.004, "step": 4300 }, { "epoch": 0.29249898083978804, "grad_norm": 0.5791100263595581, "learning_rate": 9.634970784073924e-05, "loss": 3.9493, "step": 4305 }, { "epoch": 0.2928387009104498, "grad_norm": 0.18864502012729645, "learning_rate": 9.634546133985595e-05, "loss": 4.2906, "step": 4310 }, { "epoch": 0.2931784209811116, "grad_norm": 0.6542890071868896, "learning_rate": 9.63412148389727e-05, "loss": 4.2943, "step": 4315 }, { "epoch": 0.2935181410517733, "grad_norm": 0.2639864683151245, "learning_rate": 9.633696833808942e-05, "loss": 4.1306, "step": 4320 }, { "epoch": 0.2938578611224351, "grad_norm": 0.24884024262428284, "learning_rate": 9.633272183720614e-05, "loss": 4.1749, "step": 4325 }, { "epoch": 0.2941975811930969, "grad_norm": 1.6146323680877686, "learning_rate": 9.632847533632288e-05, "loss": 4.1622, "step": 4330 }, { "epoch": 0.29453730126375866, "grad_norm": 0.19550690054893494, "learning_rate": 9.63242288354396e-05, "loss": 4.2481, "step": 4335 }, { "epoch": 0.29487702133442045, "grad_norm": 0.48053327202796936, "learning_rate": 9.631998233455633e-05, "loss": 4.1654, "step": 4340 }, { "epoch": 0.2952167414050822, "grad_norm": 0.6082022190093994, "learning_rate": 9.631573583367306e-05, "loss": 3.922, "step": 4345 }, { "epoch": 0.295556461475744, "grad_norm": 0.410819411277771, "learning_rate": 9.631148933278979e-05, "loss": 4.1838, "step": 4350 }, { "epoch": 0.29589618154640573, "grad_norm": 0.20050150156021118, "learning_rate": 9.630724283190652e-05, "loss": 4.2083, "step": 4355 }, { "epoch": 0.29623590161706753, "grad_norm": 0.2641303539276123, "learning_rate": 9.630299633102325e-05, "loss": 4.0804, "step": 4360 }, { "epoch": 0.29657562168772933, "grad_norm": 0.1961575597524643, "learning_rate": 9.629874983013997e-05, "loss": 4.3261, "step": 4365 }, { "epoch": 0.29691534175839107, "grad_norm": 0.17782385647296906, "learning_rate": 9.62945033292567e-05, "loss": 4.3017, "step": 4370 }, { "epoch": 0.29725506182905287, "grad_norm": 1.2139571905136108, "learning_rate": 9.629025682837343e-05, "loss": 4.1051, "step": 4375 }, { "epoch": 0.2975947818997146, "grad_norm": 0.2687116265296936, "learning_rate": 9.628601032749014e-05, "loss": 4.0757, "step": 4380 }, { "epoch": 0.2979345019703764, "grad_norm": 0.19845756888389587, "learning_rate": 9.628176382660689e-05, "loss": 4.0948, "step": 4385 }, { "epoch": 0.2982742220410382, "grad_norm": 0.2517881989479065, "learning_rate": 9.627751732572361e-05, "loss": 4.3492, "step": 4390 }, { "epoch": 0.29861394211169995, "grad_norm": 0.19443516433238983, "learning_rate": 9.627327082484033e-05, "loss": 4.2639, "step": 4395 }, { "epoch": 0.29895366218236175, "grad_norm": 0.21518000960350037, "learning_rate": 9.626902432395707e-05, "loss": 4.0157, "step": 4400 }, { "epoch": 0.2992933822530235, "grad_norm": 0.3461875915527344, "learning_rate": 9.62647778230738e-05, "loss": 4.2115, "step": 4405 }, { "epoch": 0.2996331023236853, "grad_norm": 0.39930984377861023, "learning_rate": 9.626053132219051e-05, "loss": 4.3391, "step": 4410 }, { "epoch": 0.2999728223943471, "grad_norm": 0.22730666399002075, "learning_rate": 9.625628482130725e-05, "loss": 4.4092, "step": 4415 }, { "epoch": 0.3003125424650088, "grad_norm": 0.2596425414085388, "learning_rate": 9.625203832042398e-05, "loss": 4.2127, "step": 4420 }, { "epoch": 0.3006522625356706, "grad_norm": 0.19453459978103638, "learning_rate": 9.62477918195407e-05, "loss": 4.338, "step": 4425 }, { "epoch": 0.30099198260633236, "grad_norm": 0.5006263852119446, "learning_rate": 9.624354531865744e-05, "loss": 4.3887, "step": 4430 }, { "epoch": 0.30133170267699416, "grad_norm": 0.5625196099281311, "learning_rate": 9.623929881777417e-05, "loss": 4.411, "step": 4435 }, { "epoch": 0.3016714227476559, "grad_norm": 0.2295573651790619, "learning_rate": 9.623505231689088e-05, "loss": 4.0798, "step": 4440 }, { "epoch": 0.3020111428183177, "grad_norm": 0.22928239405155182, "learning_rate": 9.623080581600762e-05, "loss": 4.2019, "step": 4445 }, { "epoch": 0.3023508628889795, "grad_norm": 0.7567266225814819, "learning_rate": 9.622655931512434e-05, "loss": 4.069, "step": 4450 }, { "epoch": 0.30269058295964124, "grad_norm": 0.3791631758213043, "learning_rate": 9.622231281424106e-05, "loss": 4.0739, "step": 4455 }, { "epoch": 0.30303030303030304, "grad_norm": 0.21165654063224792, "learning_rate": 9.62180663133578e-05, "loss": 4.0467, "step": 4460 }, { "epoch": 0.3033700231009648, "grad_norm": 0.2274232804775238, "learning_rate": 9.621381981247452e-05, "loss": 4.1655, "step": 4465 }, { "epoch": 0.3037097431716266, "grad_norm": 0.2882256507873535, "learning_rate": 9.620957331159125e-05, "loss": 4.0162, "step": 4470 }, { "epoch": 0.3040494632422884, "grad_norm": 0.21718928217887878, "learning_rate": 9.620532681070799e-05, "loss": 4.1421, "step": 4475 }, { "epoch": 0.3043891833129501, "grad_norm": 0.20482736825942993, "learning_rate": 9.62010803098247e-05, "loss": 4.3067, "step": 4480 }, { "epoch": 0.3047289033836119, "grad_norm": 0.8644359111785889, "learning_rate": 9.619683380894143e-05, "loss": 4.1044, "step": 4485 }, { "epoch": 0.30506862345427366, "grad_norm": 0.2353629469871521, "learning_rate": 9.619258730805817e-05, "loss": 4.6311, "step": 4490 }, { "epoch": 0.30540834352493546, "grad_norm": 0.4746248722076416, "learning_rate": 9.618834080717489e-05, "loss": 4.2593, "step": 4495 }, { "epoch": 0.30574806359559725, "grad_norm": 0.412517786026001, "learning_rate": 9.618409430629162e-05, "loss": 4.2203, "step": 4500 }, { "epoch": 0.306087783666259, "grad_norm": 0.1828744113445282, "learning_rate": 9.617984780540836e-05, "loss": 4.0572, "step": 4505 }, { "epoch": 0.3064275037369208, "grad_norm": 0.2963522672653198, "learning_rate": 9.617560130452507e-05, "loss": 4.3707, "step": 4510 }, { "epoch": 0.30676722380758253, "grad_norm": 0.20454883575439453, "learning_rate": 9.61713548036418e-05, "loss": 4.1609, "step": 4515 }, { "epoch": 0.30710694387824433, "grad_norm": 0.9719656109809875, "learning_rate": 9.616710830275853e-05, "loss": 4.3215, "step": 4520 }, { "epoch": 0.3074466639489061, "grad_norm": 0.6544187664985657, "learning_rate": 9.616286180187526e-05, "loss": 3.9987, "step": 4525 }, { "epoch": 0.30778638401956787, "grad_norm": 0.3224717080593109, "learning_rate": 9.615861530099198e-05, "loss": 4.2707, "step": 4530 }, { "epoch": 0.30812610409022967, "grad_norm": 0.3716621398925781, "learning_rate": 9.615436880010871e-05, "loss": 3.9584, "step": 4535 }, { "epoch": 0.3084658241608914, "grad_norm": 0.23874157667160034, "learning_rate": 9.615012229922544e-05, "loss": 4.0501, "step": 4540 }, { "epoch": 0.3088055442315532, "grad_norm": 0.30314287543296814, "learning_rate": 9.614587579834217e-05, "loss": 3.8515, "step": 4545 }, { "epoch": 0.30914526430221495, "grad_norm": 0.19416074454784393, "learning_rate": 9.61416292974589e-05, "loss": 4.0812, "step": 4550 }, { "epoch": 0.30948498437287675, "grad_norm": 0.5242770910263062, "learning_rate": 9.613738279657562e-05, "loss": 4.37, "step": 4555 }, { "epoch": 0.30982470444353855, "grad_norm": 0.20864085853099823, "learning_rate": 9.613313629569235e-05, "loss": 4.2711, "step": 4560 }, { "epoch": 0.3101644245142003, "grad_norm": 0.20202593505382538, "learning_rate": 9.612888979480908e-05, "loss": 4.2423, "step": 4565 }, { "epoch": 0.3105041445848621, "grad_norm": 0.5426074266433716, "learning_rate": 9.612464329392581e-05, "loss": 4.2333, "step": 4570 }, { "epoch": 0.3108438646555238, "grad_norm": 0.21013125777244568, "learning_rate": 9.612039679304254e-05, "loss": 4.0275, "step": 4575 }, { "epoch": 0.3111835847261856, "grad_norm": 0.3289850056171417, "learning_rate": 9.611615029215926e-05, "loss": 4.3321, "step": 4580 }, { "epoch": 0.3115233047968474, "grad_norm": 0.20981498062610626, "learning_rate": 9.611190379127599e-05, "loss": 4.3237, "step": 4585 }, { "epoch": 0.31186302486750916, "grad_norm": 0.2699143886566162, "learning_rate": 9.610765729039272e-05, "loss": 4.1158, "step": 4590 }, { "epoch": 0.31220274493817096, "grad_norm": 0.27582621574401855, "learning_rate": 9.610341078950945e-05, "loss": 4.1325, "step": 4595 }, { "epoch": 0.3125424650088327, "grad_norm": 0.2031656950712204, "learning_rate": 9.609916428862618e-05, "loss": 4.2928, "step": 4600 }, { "epoch": 0.3128821850794945, "grad_norm": 0.17339491844177246, "learning_rate": 9.60949177877429e-05, "loss": 4.2046, "step": 4605 }, { "epoch": 0.3132219051501563, "grad_norm": 0.2266245186328888, "learning_rate": 9.609067128685963e-05, "loss": 4.0126, "step": 4610 }, { "epoch": 0.31356162522081804, "grad_norm": 0.2599181830883026, "learning_rate": 9.608642478597636e-05, "loss": 4.2055, "step": 4615 }, { "epoch": 0.31390134529147984, "grad_norm": 0.27137070894241333, "learning_rate": 9.608217828509309e-05, "loss": 4.3499, "step": 4620 }, { "epoch": 0.3142410653621416, "grad_norm": 0.22108152508735657, "learning_rate": 9.607793178420982e-05, "loss": 4.257, "step": 4625 }, { "epoch": 0.3145807854328034, "grad_norm": 0.2058125138282776, "learning_rate": 9.607368528332654e-05, "loss": 3.919, "step": 4630 }, { "epoch": 0.3149205055034651, "grad_norm": 0.4298430383205414, "learning_rate": 9.606943878244327e-05, "loss": 4.0876, "step": 4635 }, { "epoch": 0.3152602255741269, "grad_norm": 0.19494764506816864, "learning_rate": 9.606519228156e-05, "loss": 4.2404, "step": 4640 }, { "epoch": 0.3155999456447887, "grad_norm": 0.35915589332580566, "learning_rate": 9.606094578067673e-05, "loss": 4.1185, "step": 4645 }, { "epoch": 0.31593966571545046, "grad_norm": 0.395074725151062, "learning_rate": 9.605669927979344e-05, "loss": 4.2395, "step": 4650 }, { "epoch": 0.31627938578611225, "grad_norm": 0.3247855007648468, "learning_rate": 9.605245277891018e-05, "loss": 3.9987, "step": 4655 }, { "epoch": 0.316619105856774, "grad_norm": 0.18057437241077423, "learning_rate": 9.604820627802691e-05, "loss": 4.3912, "step": 4660 }, { "epoch": 0.3169588259274358, "grad_norm": 0.25986525416374207, "learning_rate": 9.604395977714363e-05, "loss": 4.2514, "step": 4665 }, { "epoch": 0.3172985459980976, "grad_norm": 0.2058873325586319, "learning_rate": 9.603971327626037e-05, "loss": 4.3795, "step": 4670 }, { "epoch": 0.31763826606875933, "grad_norm": 0.30512815713882446, "learning_rate": 9.60354667753771e-05, "loss": 4.2392, "step": 4675 }, { "epoch": 0.31797798613942113, "grad_norm": 0.18412257730960846, "learning_rate": 9.603122027449382e-05, "loss": 4.2086, "step": 4680 }, { "epoch": 0.3183177062100829, "grad_norm": 0.2081792950630188, "learning_rate": 9.602697377361055e-05, "loss": 4.3573, "step": 4685 }, { "epoch": 0.31865742628074467, "grad_norm": 0.3510904014110565, "learning_rate": 9.602272727272728e-05, "loss": 4.3618, "step": 4690 }, { "epoch": 0.31899714635140647, "grad_norm": 0.37145447731018066, "learning_rate": 9.601848077184401e-05, "loss": 4.301, "step": 4695 }, { "epoch": 0.3193368664220682, "grad_norm": 0.23616161942481995, "learning_rate": 9.601423427096074e-05, "loss": 4.2158, "step": 4700 }, { "epoch": 0.31967658649273, "grad_norm": 1.332667589187622, "learning_rate": 9.600998777007746e-05, "loss": 4.0569, "step": 4705 }, { "epoch": 0.32001630656339175, "grad_norm": 0.21901053190231323, "learning_rate": 9.600574126919419e-05, "loss": 4.0555, "step": 4710 }, { "epoch": 0.32035602663405355, "grad_norm": 0.18509267270565033, "learning_rate": 9.600149476831092e-05, "loss": 4.1219, "step": 4715 }, { "epoch": 0.3206957467047153, "grad_norm": 0.17567375302314758, "learning_rate": 9.599724826742764e-05, "loss": 4.2921, "step": 4720 }, { "epoch": 0.3210354667753771, "grad_norm": 0.17881886661052704, "learning_rate": 9.599300176654438e-05, "loss": 4.2577, "step": 4725 }, { "epoch": 0.3213751868460389, "grad_norm": 0.2836432158946991, "learning_rate": 9.59887552656611e-05, "loss": 4.1362, "step": 4730 }, { "epoch": 0.3217149069167006, "grad_norm": 0.43188366293907166, "learning_rate": 9.598450876477782e-05, "loss": 3.7939, "step": 4735 }, { "epoch": 0.3220546269873624, "grad_norm": 0.21462103724479675, "learning_rate": 9.598026226389456e-05, "loss": 4.166, "step": 4740 }, { "epoch": 0.32239434705802417, "grad_norm": 0.16883856058120728, "learning_rate": 9.597601576301129e-05, "loss": 3.9182, "step": 4745 }, { "epoch": 0.32273406712868596, "grad_norm": 0.32869285345077515, "learning_rate": 9.5971769262128e-05, "loss": 4.0832, "step": 4750 }, { "epoch": 0.32307378719934776, "grad_norm": 0.22861923277378082, "learning_rate": 9.596752276124474e-05, "loss": 4.344, "step": 4755 }, { "epoch": 0.3234135072700095, "grad_norm": 0.3567066192626953, "learning_rate": 9.596327626036147e-05, "loss": 4.0191, "step": 4760 }, { "epoch": 0.3237532273406713, "grad_norm": 0.23706111311912537, "learning_rate": 9.595902975947819e-05, "loss": 4.1505, "step": 4765 }, { "epoch": 0.32409294741133304, "grad_norm": 0.22384607791900635, "learning_rate": 9.595478325859493e-05, "loss": 4.1113, "step": 4770 }, { "epoch": 0.32443266748199484, "grad_norm": 0.1979631930589676, "learning_rate": 9.595053675771166e-05, "loss": 4.0686, "step": 4775 }, { "epoch": 0.32477238755265664, "grad_norm": 0.3480912148952484, "learning_rate": 9.594629025682837e-05, "loss": 4.2106, "step": 4780 }, { "epoch": 0.3251121076233184, "grad_norm": 0.23982703685760498, "learning_rate": 9.594204375594511e-05, "loss": 4.2114, "step": 4785 }, { "epoch": 0.3254518276939802, "grad_norm": 1.1133886575698853, "learning_rate": 9.593779725506184e-05, "loss": 3.894, "step": 4790 }, { "epoch": 0.3257915477646419, "grad_norm": 0.1953042596578598, "learning_rate": 9.593355075417856e-05, "loss": 4.1392, "step": 4795 }, { "epoch": 0.3261312678353037, "grad_norm": 0.24198144674301147, "learning_rate": 9.59293042532953e-05, "loss": 3.9054, "step": 4800 }, { "epoch": 0.32647098790596546, "grad_norm": 0.5570387840270996, "learning_rate": 9.592505775241201e-05, "loss": 3.9928, "step": 4805 }, { "epoch": 0.32681070797662726, "grad_norm": 0.3616771399974823, "learning_rate": 9.592081125152874e-05, "loss": 4.3527, "step": 4810 }, { "epoch": 0.32715042804728905, "grad_norm": 0.7734106183052063, "learning_rate": 9.591656475064548e-05, "loss": 3.9594, "step": 4815 }, { "epoch": 0.3274901481179508, "grad_norm": 0.23176033794879913, "learning_rate": 9.59123182497622e-05, "loss": 4.2269, "step": 4820 }, { "epoch": 0.3278298681886126, "grad_norm": 0.22131270170211792, "learning_rate": 9.590807174887892e-05, "loss": 4.205, "step": 4825 }, { "epoch": 0.32816958825927434, "grad_norm": 0.1967305839061737, "learning_rate": 9.590382524799566e-05, "loss": 4.1081, "step": 4830 }, { "epoch": 0.32850930832993613, "grad_norm": 0.20413081347942352, "learning_rate": 9.589957874711238e-05, "loss": 4.2146, "step": 4835 }, { "epoch": 0.32884902840059793, "grad_norm": 0.237589493393898, "learning_rate": 9.589533224622911e-05, "loss": 4.321, "step": 4840 }, { "epoch": 0.3291887484712597, "grad_norm": 0.3263246417045593, "learning_rate": 9.589108574534585e-05, "loss": 4.2572, "step": 4845 }, { "epoch": 0.32952846854192147, "grad_norm": 0.2639991343021393, "learning_rate": 9.588683924446256e-05, "loss": 4.3049, "step": 4850 }, { "epoch": 0.3298681886125832, "grad_norm": 0.33223286271095276, "learning_rate": 9.588259274357929e-05, "loss": 4.3715, "step": 4855 }, { "epoch": 0.330207908683245, "grad_norm": 0.22897298634052277, "learning_rate": 9.587834624269603e-05, "loss": 3.93, "step": 4860 }, { "epoch": 0.3305476287539068, "grad_norm": 0.3667212128639221, "learning_rate": 9.587409974181275e-05, "loss": 4.3149, "step": 4865 }, { "epoch": 0.33088734882456855, "grad_norm": 0.22442007064819336, "learning_rate": 9.586985324092948e-05, "loss": 4.4189, "step": 4870 }, { "epoch": 0.33122706889523035, "grad_norm": 0.18334710597991943, "learning_rate": 9.58656067400462e-05, "loss": 3.9457, "step": 4875 }, { "epoch": 0.3315667889658921, "grad_norm": 0.34593579173088074, "learning_rate": 9.586136023916293e-05, "loss": 3.9007, "step": 4880 }, { "epoch": 0.3319065090365539, "grad_norm": 0.24884456396102905, "learning_rate": 9.585711373827966e-05, "loss": 4.198, "step": 4885 }, { "epoch": 0.33224622910721563, "grad_norm": 0.6150047183036804, "learning_rate": 9.585286723739639e-05, "loss": 4.1738, "step": 4890 }, { "epoch": 0.3325859491778774, "grad_norm": 0.18449634313583374, "learning_rate": 9.584862073651312e-05, "loss": 4.3916, "step": 4895 }, { "epoch": 0.3329256692485392, "grad_norm": 0.45811331272125244, "learning_rate": 9.584437423562984e-05, "loss": 4.0894, "step": 4900 }, { "epoch": 0.33326538931920097, "grad_norm": 0.9670056700706482, "learning_rate": 9.584012773474657e-05, "loss": 4.0947, "step": 4905 }, { "epoch": 0.33360510938986276, "grad_norm": 0.2828699052333832, "learning_rate": 9.58358812338633e-05, "loss": 4.27, "step": 4910 }, { "epoch": 0.3339448294605245, "grad_norm": 0.22989730536937714, "learning_rate": 9.583163473298003e-05, "loss": 4.2658, "step": 4915 }, { "epoch": 0.3342845495311863, "grad_norm": 0.4018714427947998, "learning_rate": 9.582738823209676e-05, "loss": 4.1278, "step": 4920 }, { "epoch": 0.3346242696018481, "grad_norm": 0.5296480059623718, "learning_rate": 9.582314173121348e-05, "loss": 4.1865, "step": 4925 }, { "epoch": 0.33496398967250984, "grad_norm": 0.2477627843618393, "learning_rate": 9.581889523033021e-05, "loss": 4.4925, "step": 4930 }, { "epoch": 0.33530370974317164, "grad_norm": 0.24414370954036713, "learning_rate": 9.581464872944694e-05, "loss": 4.1706, "step": 4935 }, { "epoch": 0.3356434298138334, "grad_norm": 0.15603503584861755, "learning_rate": 9.581040222856367e-05, "loss": 4.0303, "step": 4940 }, { "epoch": 0.3359831498844952, "grad_norm": 0.17460083961486816, "learning_rate": 9.58061557276804e-05, "loss": 4.1986, "step": 4945 }, { "epoch": 0.336322869955157, "grad_norm": 0.2035687118768692, "learning_rate": 9.580190922679712e-05, "loss": 4.3514, "step": 4950 }, { "epoch": 0.3366625900258187, "grad_norm": 0.4037059545516968, "learning_rate": 9.579766272591385e-05, "loss": 4.1645, "step": 4955 }, { "epoch": 0.3370023100964805, "grad_norm": 0.22527378797531128, "learning_rate": 9.579341622503058e-05, "loss": 4.1399, "step": 4960 }, { "epoch": 0.33734203016714226, "grad_norm": 0.27611520886421204, "learning_rate": 9.578916972414731e-05, "loss": 4.1679, "step": 4965 }, { "epoch": 0.33768175023780406, "grad_norm": 0.26130980253219604, "learning_rate": 9.578492322326404e-05, "loss": 3.9786, "step": 4970 }, { "epoch": 0.3380214703084658, "grad_norm": 0.20753854513168335, "learning_rate": 9.578067672238076e-05, "loss": 4.0629, "step": 4975 }, { "epoch": 0.3383611903791276, "grad_norm": 0.198018416762352, "learning_rate": 9.577643022149749e-05, "loss": 4.0956, "step": 4980 }, { "epoch": 0.3387009104497894, "grad_norm": 0.21650417149066925, "learning_rate": 9.577218372061422e-05, "loss": 4.0965, "step": 4985 }, { "epoch": 0.33904063052045114, "grad_norm": 0.3832937777042389, "learning_rate": 9.576793721973095e-05, "loss": 4.1086, "step": 4990 }, { "epoch": 0.33938035059111293, "grad_norm": 0.23973116278648376, "learning_rate": 9.576369071884768e-05, "loss": 4.3533, "step": 4995 }, { "epoch": 0.3397200706617747, "grad_norm": 0.3817537724971771, "learning_rate": 9.57594442179644e-05, "loss": 4.2127, "step": 5000 }, { "epoch": 0.3400597907324365, "grad_norm": 0.2038896679878235, "learning_rate": 9.575519771708112e-05, "loss": 4.3264, "step": 5005 }, { "epoch": 0.34039951080309827, "grad_norm": 0.23982146382331848, "learning_rate": 9.575095121619786e-05, "loss": 4.0702, "step": 5010 }, { "epoch": 0.34073923087376, "grad_norm": 0.30504170060157776, "learning_rate": 9.574670471531459e-05, "loss": 3.9892, "step": 5015 }, { "epoch": 0.3410789509444218, "grad_norm": 0.2355673611164093, "learning_rate": 9.574245821443132e-05, "loss": 3.9423, "step": 5020 }, { "epoch": 0.34141867101508355, "grad_norm": 0.22874650359153748, "learning_rate": 9.573821171354804e-05, "loss": 3.9142, "step": 5025 }, { "epoch": 0.34175839108574535, "grad_norm": 0.21437452733516693, "learning_rate": 9.573396521266477e-05, "loss": 4.1365, "step": 5030 }, { "epoch": 0.34209811115640715, "grad_norm": 0.3095625936985016, "learning_rate": 9.57297187117815e-05, "loss": 4.1468, "step": 5035 }, { "epoch": 0.3424378312270689, "grad_norm": 0.2202177494764328, "learning_rate": 9.572547221089823e-05, "loss": 4.01, "step": 5040 }, { "epoch": 0.3427775512977307, "grad_norm": 0.32474327087402344, "learning_rate": 9.572122571001496e-05, "loss": 4.0868, "step": 5045 }, { "epoch": 0.34311727136839243, "grad_norm": 0.2622280716896057, "learning_rate": 9.571697920913168e-05, "loss": 4.2103, "step": 5050 }, { "epoch": 0.3434569914390542, "grad_norm": 0.7448468208312988, "learning_rate": 9.571273270824841e-05, "loss": 4.2945, "step": 5055 }, { "epoch": 0.34379671150971597, "grad_norm": 0.2269594371318817, "learning_rate": 9.570848620736514e-05, "loss": 3.9127, "step": 5060 }, { "epoch": 0.34413643158037777, "grad_norm": 0.2645524740219116, "learning_rate": 9.570423970648187e-05, "loss": 3.9423, "step": 5065 }, { "epoch": 0.34447615165103956, "grad_norm": 0.246607705950737, "learning_rate": 9.56999932055986e-05, "loss": 4.1347, "step": 5070 }, { "epoch": 0.3448158717217013, "grad_norm": 0.19342374801635742, "learning_rate": 9.569574670471531e-05, "loss": 4.0787, "step": 5075 }, { "epoch": 0.3451555917923631, "grad_norm": 0.31122297048568726, "learning_rate": 9.569150020383205e-05, "loss": 4.1522, "step": 5080 }, { "epoch": 0.34549531186302485, "grad_norm": 1.0425968170166016, "learning_rate": 9.568725370294878e-05, "loss": 3.995, "step": 5085 }, { "epoch": 0.34583503193368664, "grad_norm": 0.22739467024803162, "learning_rate": 9.56830072020655e-05, "loss": 3.9042, "step": 5090 }, { "epoch": 0.34617475200434844, "grad_norm": 0.21344348788261414, "learning_rate": 9.567876070118224e-05, "loss": 4.1351, "step": 5095 }, { "epoch": 0.3465144720750102, "grad_norm": 0.2140883356332779, "learning_rate": 9.567451420029896e-05, "loss": 3.9542, "step": 5100 }, { "epoch": 0.346854192145672, "grad_norm": 0.1795925498008728, "learning_rate": 9.567026769941568e-05, "loss": 4.061, "step": 5105 }, { "epoch": 0.3471939122163337, "grad_norm": 0.37569665908813477, "learning_rate": 9.566602119853242e-05, "loss": 4.3051, "step": 5110 }, { "epoch": 0.3475336322869955, "grad_norm": 0.19528646767139435, "learning_rate": 9.566177469764915e-05, "loss": 4.1591, "step": 5115 }, { "epoch": 0.3478733523576573, "grad_norm": 0.1688258945941925, "learning_rate": 9.565752819676586e-05, "loss": 3.9876, "step": 5120 }, { "epoch": 0.34821307242831906, "grad_norm": 0.17164097726345062, "learning_rate": 9.56532816958826e-05, "loss": 4.2081, "step": 5125 }, { "epoch": 0.34855279249898086, "grad_norm": 0.18012307584285736, "learning_rate": 9.564903519499933e-05, "loss": 3.9195, "step": 5130 }, { "epoch": 0.3488925125696426, "grad_norm": 0.2118469625711441, "learning_rate": 9.564478869411605e-05, "loss": 4.1027, "step": 5135 }, { "epoch": 0.3492322326403044, "grad_norm": 0.21049754321575165, "learning_rate": 9.564054219323279e-05, "loss": 4.0254, "step": 5140 }, { "epoch": 0.34957195271096614, "grad_norm": 0.5723572969436646, "learning_rate": 9.563629569234952e-05, "loss": 3.8725, "step": 5145 }, { "epoch": 0.34991167278162794, "grad_norm": 0.22618575394153595, "learning_rate": 9.563204919146623e-05, "loss": 4.1794, "step": 5150 }, { "epoch": 0.35025139285228973, "grad_norm": 0.1874348223209381, "learning_rate": 9.562780269058297e-05, "loss": 3.9515, "step": 5155 }, { "epoch": 0.3505911129229515, "grad_norm": 0.2910906672477722, "learning_rate": 9.562355618969969e-05, "loss": 4.2863, "step": 5160 }, { "epoch": 0.3509308329936133, "grad_norm": 0.19384640455245972, "learning_rate": 9.561930968881641e-05, "loss": 4.2367, "step": 5165 }, { "epoch": 0.351270553064275, "grad_norm": 0.15815407037734985, "learning_rate": 9.561506318793316e-05, "loss": 3.9907, "step": 5170 }, { "epoch": 0.3516102731349368, "grad_norm": 0.2070821076631546, "learning_rate": 9.561081668704987e-05, "loss": 3.9338, "step": 5175 }, { "epoch": 0.3519499932055986, "grad_norm": 0.24302656948566437, "learning_rate": 9.56065701861666e-05, "loss": 4.2737, "step": 5180 }, { "epoch": 0.35228971327626035, "grad_norm": 0.24706007540225983, "learning_rate": 9.560232368528334e-05, "loss": 4.23, "step": 5185 }, { "epoch": 0.35262943334692215, "grad_norm": 0.5972357988357544, "learning_rate": 9.559807718440005e-05, "loss": 4.157, "step": 5190 }, { "epoch": 0.3529691534175839, "grad_norm": 1.1296205520629883, "learning_rate": 9.559383068351678e-05, "loss": 4.1556, "step": 5195 }, { "epoch": 0.3533088734882457, "grad_norm": 0.19614671170711517, "learning_rate": 9.558958418263352e-05, "loss": 4.1069, "step": 5200 }, { "epoch": 0.3536485935589075, "grad_norm": 0.2329636514186859, "learning_rate": 9.558533768175024e-05, "loss": 4.0195, "step": 5205 }, { "epoch": 0.35398831362956923, "grad_norm": 0.3606981635093689, "learning_rate": 9.558109118086697e-05, "loss": 4.0518, "step": 5210 }, { "epoch": 0.354328033700231, "grad_norm": 0.212651789188385, "learning_rate": 9.557684467998371e-05, "loss": 4.1561, "step": 5215 }, { "epoch": 0.35466775377089277, "grad_norm": 0.2813778817653656, "learning_rate": 9.557259817910042e-05, "loss": 4.1709, "step": 5220 }, { "epoch": 0.35500747384155457, "grad_norm": 0.1836264729499817, "learning_rate": 9.556835167821715e-05, "loss": 3.9968, "step": 5225 }, { "epoch": 0.3553471939122163, "grad_norm": 0.24313072860240936, "learning_rate": 9.556410517733388e-05, "loss": 4.1021, "step": 5230 }, { "epoch": 0.3556869139828781, "grad_norm": 0.21879182755947113, "learning_rate": 9.55598586764506e-05, "loss": 4.0678, "step": 5235 }, { "epoch": 0.3560266340535399, "grad_norm": 0.19957928359508514, "learning_rate": 9.555561217556733e-05, "loss": 4.1579, "step": 5240 }, { "epoch": 0.35636635412420165, "grad_norm": 0.2043609321117401, "learning_rate": 9.555136567468406e-05, "loss": 3.9197, "step": 5245 }, { "epoch": 0.35670607419486344, "grad_norm": 0.1743493527173996, "learning_rate": 9.554711917380079e-05, "loss": 4.3003, "step": 5250 }, { "epoch": 0.3570457942655252, "grad_norm": 0.3488079309463501, "learning_rate": 9.554287267291752e-05, "loss": 4.0098, "step": 5255 }, { "epoch": 0.357385514336187, "grad_norm": 0.2585020959377289, "learning_rate": 9.553862617203425e-05, "loss": 4.0749, "step": 5260 }, { "epoch": 0.3577252344068488, "grad_norm": 0.22201067209243774, "learning_rate": 9.553437967115097e-05, "loss": 3.7807, "step": 5265 }, { "epoch": 0.3580649544775105, "grad_norm": 0.4632178843021393, "learning_rate": 9.55301331702677e-05, "loss": 4.1557, "step": 5270 }, { "epoch": 0.3584046745481723, "grad_norm": 0.4491996765136719, "learning_rate": 9.552588666938443e-05, "loss": 4.2286, "step": 5275 }, { "epoch": 0.35874439461883406, "grad_norm": 0.22126582264900208, "learning_rate": 9.552164016850116e-05, "loss": 3.9587, "step": 5280 }, { "epoch": 0.35908411468949586, "grad_norm": 0.20614346861839294, "learning_rate": 9.551739366761789e-05, "loss": 4.1402, "step": 5285 }, { "epoch": 0.35942383476015766, "grad_norm": 0.2311069220304489, "learning_rate": 9.551314716673461e-05, "loss": 4.1504, "step": 5290 }, { "epoch": 0.3597635548308194, "grad_norm": 0.20152784883975983, "learning_rate": 9.550890066585134e-05, "loss": 4.2783, "step": 5295 }, { "epoch": 0.3601032749014812, "grad_norm": 0.2334737479686737, "learning_rate": 9.550465416496807e-05, "loss": 4.2849, "step": 5300 }, { "epoch": 0.36044299497214294, "grad_norm": 0.17994678020477295, "learning_rate": 9.55004076640848e-05, "loss": 4.3848, "step": 5305 }, { "epoch": 0.36078271504280474, "grad_norm": 0.2141488939523697, "learning_rate": 9.549616116320153e-05, "loss": 4.309, "step": 5310 }, { "epoch": 0.3611224351134665, "grad_norm": 0.2028026133775711, "learning_rate": 9.549191466231825e-05, "loss": 4.4349, "step": 5315 }, { "epoch": 0.3614621551841283, "grad_norm": 0.1849725842475891, "learning_rate": 9.548766816143498e-05, "loss": 3.929, "step": 5320 }, { "epoch": 0.3618018752547901, "grad_norm": 0.20538243651390076, "learning_rate": 9.548342166055171e-05, "loss": 4.2426, "step": 5325 }, { "epoch": 0.3621415953254518, "grad_norm": 0.22145512700080872, "learning_rate": 9.547917515966844e-05, "loss": 4.1321, "step": 5330 }, { "epoch": 0.3624813153961136, "grad_norm": 0.24293570220470428, "learning_rate": 9.547492865878517e-05, "loss": 4.4132, "step": 5335 }, { "epoch": 0.36282103546677535, "grad_norm": 3.3797924518585205, "learning_rate": 9.54706821579019e-05, "loss": 3.9775, "step": 5340 }, { "epoch": 0.36316075553743715, "grad_norm": 0.22349077463150024, "learning_rate": 9.546643565701862e-05, "loss": 4.095, "step": 5345 }, { "epoch": 0.36350047560809895, "grad_norm": 0.22708293795585632, "learning_rate": 9.546218915613535e-05, "loss": 4.1219, "step": 5350 }, { "epoch": 0.3638401956787607, "grad_norm": 0.40382349491119385, "learning_rate": 9.545794265525208e-05, "loss": 3.8157, "step": 5355 }, { "epoch": 0.3641799157494225, "grad_norm": 0.1983574777841568, "learning_rate": 9.54536961543688e-05, "loss": 4.0346, "step": 5360 }, { "epoch": 0.36451963582008423, "grad_norm": 0.3324495851993561, "learning_rate": 9.544944965348553e-05, "loss": 4.2765, "step": 5365 }, { "epoch": 0.36485935589074603, "grad_norm": 0.2055937945842743, "learning_rate": 9.544520315260226e-05, "loss": 3.8416, "step": 5370 }, { "epoch": 0.3651990759614078, "grad_norm": 0.18161867558956146, "learning_rate": 9.544095665171899e-05, "loss": 4.0468, "step": 5375 }, { "epoch": 0.36553879603206957, "grad_norm": 0.2383970320224762, "learning_rate": 9.543671015083572e-05, "loss": 4.0359, "step": 5380 }, { "epoch": 0.36587851610273137, "grad_norm": 0.1611696481704712, "learning_rate": 9.543246364995245e-05, "loss": 4.1817, "step": 5385 }, { "epoch": 0.3662182361733931, "grad_norm": 0.3070268929004669, "learning_rate": 9.542821714906917e-05, "loss": 4.0901, "step": 5390 }, { "epoch": 0.3665579562440549, "grad_norm": 0.17862237989902496, "learning_rate": 9.54239706481859e-05, "loss": 3.9993, "step": 5395 }, { "epoch": 0.36689767631471665, "grad_norm": 0.30012592673301697, "learning_rate": 9.541972414730263e-05, "loss": 4.0737, "step": 5400 }, { "epoch": 0.36723739638537845, "grad_norm": 15.268974304199219, "learning_rate": 9.541547764641936e-05, "loss": 3.8387, "step": 5405 }, { "epoch": 0.36757711645604024, "grad_norm": 0.19847442209720612, "learning_rate": 9.541123114553609e-05, "loss": 4.0158, "step": 5410 }, { "epoch": 0.367916836526702, "grad_norm": 1.345680832862854, "learning_rate": 9.540698464465281e-05, "loss": 4.1187, "step": 5415 }, { "epoch": 0.3682565565973638, "grad_norm": 0.15424399077892303, "learning_rate": 9.540273814376954e-05, "loss": 4.1746, "step": 5420 }, { "epoch": 0.3685962766680255, "grad_norm": 0.47641104459762573, "learning_rate": 9.539849164288627e-05, "loss": 3.8821, "step": 5425 }, { "epoch": 0.3689359967386873, "grad_norm": 0.27253925800323486, "learning_rate": 9.539424514200299e-05, "loss": 4.1306, "step": 5430 }, { "epoch": 0.3692757168093491, "grad_norm": 0.5784019231796265, "learning_rate": 9.538999864111973e-05, "loss": 4.1907, "step": 5435 }, { "epoch": 0.36961543688001086, "grad_norm": 0.21910730004310608, "learning_rate": 9.538575214023645e-05, "loss": 3.9055, "step": 5440 }, { "epoch": 0.36995515695067266, "grad_norm": 0.195495143532753, "learning_rate": 9.538150563935317e-05, "loss": 4.1521, "step": 5445 }, { "epoch": 0.3702948770213344, "grad_norm": 0.20794479548931122, "learning_rate": 9.537725913846991e-05, "loss": 3.9962, "step": 5450 }, { "epoch": 0.3706345970919962, "grad_norm": 1.305681586265564, "learning_rate": 9.537301263758664e-05, "loss": 4.173, "step": 5455 }, { "epoch": 0.370974317162658, "grad_norm": 0.1818116158246994, "learning_rate": 9.536876613670335e-05, "loss": 4.0849, "step": 5460 }, { "epoch": 0.37131403723331974, "grad_norm": 0.38611772656440735, "learning_rate": 9.53645196358201e-05, "loss": 3.9796, "step": 5465 }, { "epoch": 0.37165375730398154, "grad_norm": 0.2650381922721863, "learning_rate": 9.536027313493682e-05, "loss": 4.1452, "step": 5470 }, { "epoch": 0.3719934773746433, "grad_norm": 0.2208934873342514, "learning_rate": 9.535602663405354e-05, "loss": 4.2661, "step": 5475 }, { "epoch": 0.3723331974453051, "grad_norm": 0.20266486704349518, "learning_rate": 9.535178013317028e-05, "loss": 4.207, "step": 5480 }, { "epoch": 0.3726729175159668, "grad_norm": 0.21677860617637634, "learning_rate": 9.5347533632287e-05, "loss": 4.3784, "step": 5485 }, { "epoch": 0.3730126375866286, "grad_norm": 0.33356210589408875, "learning_rate": 9.534328713140372e-05, "loss": 4.0518, "step": 5490 }, { "epoch": 0.3733523576572904, "grad_norm": 0.2748437225818634, "learning_rate": 9.533904063052046e-05, "loss": 4.0669, "step": 5495 }, { "epoch": 0.37369207772795215, "grad_norm": 0.22416415810585022, "learning_rate": 9.533479412963718e-05, "loss": 4.1892, "step": 5500 }, { "epoch": 0.37403179779861395, "grad_norm": 0.20975516736507416, "learning_rate": 9.53305476287539e-05, "loss": 4.2146, "step": 5505 }, { "epoch": 0.3743715178692757, "grad_norm": 0.1820031851530075, "learning_rate": 9.532630112787065e-05, "loss": 4.1054, "step": 5510 }, { "epoch": 0.3747112379399375, "grad_norm": 0.2546124756336212, "learning_rate": 9.532205462698736e-05, "loss": 4.1923, "step": 5515 }, { "epoch": 0.3750509580105993, "grad_norm": 0.5577950477600098, "learning_rate": 9.531780812610409e-05, "loss": 3.851, "step": 5520 }, { "epoch": 0.37539067808126103, "grad_norm": 0.2909904718399048, "learning_rate": 9.531356162522083e-05, "loss": 3.8513, "step": 5525 }, { "epoch": 0.37573039815192283, "grad_norm": 0.20286774635314941, "learning_rate": 9.530931512433755e-05, "loss": 3.8725, "step": 5530 }, { "epoch": 0.37607011822258457, "grad_norm": 0.20398856699466705, "learning_rate": 9.530506862345427e-05, "loss": 4.3243, "step": 5535 }, { "epoch": 0.37640983829324637, "grad_norm": 0.1849180907011032, "learning_rate": 9.530082212257101e-05, "loss": 4.1106, "step": 5540 }, { "epoch": 0.37674955836390817, "grad_norm": 0.1672249287366867, "learning_rate": 9.529657562168773e-05, "loss": 4.4955, "step": 5545 }, { "epoch": 0.3770892784345699, "grad_norm": 0.7186090350151062, "learning_rate": 9.529232912080446e-05, "loss": 4.2498, "step": 5550 }, { "epoch": 0.3774289985052317, "grad_norm": 0.17973625659942627, "learning_rate": 9.52880826199212e-05, "loss": 4.2932, "step": 5555 }, { "epoch": 0.37776871857589345, "grad_norm": 0.23119674623012543, "learning_rate": 9.528383611903791e-05, "loss": 4.2597, "step": 5560 }, { "epoch": 0.37810843864655524, "grad_norm": 0.27012819051742554, "learning_rate": 9.527958961815464e-05, "loss": 4.0075, "step": 5565 }, { "epoch": 0.378448158717217, "grad_norm": 0.22133472561836243, "learning_rate": 9.527534311727138e-05, "loss": 4.1785, "step": 5570 }, { "epoch": 0.3787878787878788, "grad_norm": 0.17481616139411926, "learning_rate": 9.52710966163881e-05, "loss": 3.9544, "step": 5575 }, { "epoch": 0.3791275988585406, "grad_norm": 0.20295321941375732, "learning_rate": 9.526685011550483e-05, "loss": 3.8667, "step": 5580 }, { "epoch": 0.3794673189292023, "grad_norm": 0.3702150881290436, "learning_rate": 9.526260361462155e-05, "loss": 4.012, "step": 5585 }, { "epoch": 0.3798070389998641, "grad_norm": 0.3844399154186249, "learning_rate": 9.525835711373828e-05, "loss": 4.0456, "step": 5590 }, { "epoch": 0.38014675907052586, "grad_norm": 0.27248868346214294, "learning_rate": 9.525411061285501e-05, "loss": 4.0001, "step": 5595 }, { "epoch": 0.38048647914118766, "grad_norm": 0.4196895360946655, "learning_rate": 9.524986411197174e-05, "loss": 4.2368, "step": 5600 }, { "epoch": 0.38082619921184946, "grad_norm": 0.2525693476200104, "learning_rate": 9.524561761108847e-05, "loss": 4.1713, "step": 5605 }, { "epoch": 0.3811659192825112, "grad_norm": 0.19002725183963776, "learning_rate": 9.52413711102052e-05, "loss": 3.9637, "step": 5610 }, { "epoch": 0.381505639353173, "grad_norm": 0.21603484451770782, "learning_rate": 9.523712460932192e-05, "loss": 4.0533, "step": 5615 }, { "epoch": 0.38184535942383474, "grad_norm": 0.1926129311323166, "learning_rate": 9.523287810843865e-05, "loss": 4.0691, "step": 5620 }, { "epoch": 0.38218507949449654, "grad_norm": 0.41377758979797363, "learning_rate": 9.522863160755538e-05, "loss": 4.0141, "step": 5625 }, { "epoch": 0.38252479956515834, "grad_norm": 0.2249571830034256, "learning_rate": 9.52243851066721e-05, "loss": 4.1834, "step": 5630 }, { "epoch": 0.3828645196358201, "grad_norm": 0.1857426017522812, "learning_rate": 9.522013860578883e-05, "loss": 4.0473, "step": 5635 }, { "epoch": 0.3832042397064819, "grad_norm": 0.20423340797424316, "learning_rate": 9.521589210490556e-05, "loss": 4.1039, "step": 5640 }, { "epoch": 0.3835439597771436, "grad_norm": 0.20530074834823608, "learning_rate": 9.521164560402229e-05, "loss": 4.1668, "step": 5645 }, { "epoch": 0.3838836798478054, "grad_norm": 0.21148528158664703, "learning_rate": 9.520739910313902e-05, "loss": 4.0141, "step": 5650 }, { "epoch": 0.38422339991846716, "grad_norm": 0.23851540684700012, "learning_rate": 9.520315260225575e-05, "loss": 4.2311, "step": 5655 }, { "epoch": 0.38456311998912895, "grad_norm": 0.17751434445381165, "learning_rate": 9.519890610137247e-05, "loss": 4.0698, "step": 5660 }, { "epoch": 0.38490284005979075, "grad_norm": 0.4216800332069397, "learning_rate": 9.51946596004892e-05, "loss": 4.0324, "step": 5665 }, { "epoch": 0.3852425601304525, "grad_norm": 0.19707581400871277, "learning_rate": 9.519041309960593e-05, "loss": 4.0026, "step": 5670 }, { "epoch": 0.3855822802011143, "grad_norm": 0.21056298911571503, "learning_rate": 9.518616659872266e-05, "loss": 4.0293, "step": 5675 }, { "epoch": 0.38592200027177603, "grad_norm": 0.18237900733947754, "learning_rate": 9.518192009783939e-05, "loss": 3.9467, "step": 5680 }, { "epoch": 0.38626172034243783, "grad_norm": 0.1838427186012268, "learning_rate": 9.517767359695611e-05, "loss": 3.925, "step": 5685 }, { "epoch": 0.38660144041309963, "grad_norm": 0.21086731553077698, "learning_rate": 9.517342709607284e-05, "loss": 4.2034, "step": 5690 }, { "epoch": 0.38694116048376137, "grad_norm": 0.17555493116378784, "learning_rate": 9.516918059518957e-05, "loss": 3.8963, "step": 5695 }, { "epoch": 0.38728088055442317, "grad_norm": 0.23491710424423218, "learning_rate": 9.51649340943063e-05, "loss": 4.116, "step": 5700 }, { "epoch": 0.3876206006250849, "grad_norm": 0.18439505994319916, "learning_rate": 9.516068759342303e-05, "loss": 4.1069, "step": 5705 }, { "epoch": 0.3879603206957467, "grad_norm": 0.18807227909564972, "learning_rate": 9.515644109253975e-05, "loss": 4.104, "step": 5710 }, { "epoch": 0.3883000407664085, "grad_norm": 0.4963626265525818, "learning_rate": 9.515219459165648e-05, "loss": 4.1994, "step": 5715 }, { "epoch": 0.38863976083707025, "grad_norm": 0.24339251220226288, "learning_rate": 9.514794809077321e-05, "loss": 4.1082, "step": 5720 }, { "epoch": 0.38897948090773204, "grad_norm": 0.17436154186725616, "learning_rate": 9.514370158988994e-05, "loss": 4.1256, "step": 5725 }, { "epoch": 0.3893192009783938, "grad_norm": 0.2445308268070221, "learning_rate": 9.513945508900667e-05, "loss": 4.0222, "step": 5730 }, { "epoch": 0.3896589210490556, "grad_norm": 0.6241475939750671, "learning_rate": 9.51352085881234e-05, "loss": 4.1132, "step": 5735 }, { "epoch": 0.3899986411197173, "grad_norm": 0.16763907670974731, "learning_rate": 9.513096208724012e-05, "loss": 4.3379, "step": 5740 }, { "epoch": 0.3903383611903791, "grad_norm": 0.1730974316596985, "learning_rate": 9.512671558635685e-05, "loss": 4.1179, "step": 5745 }, { "epoch": 0.3906780812610409, "grad_norm": 0.19016407430171967, "learning_rate": 9.512246908547358e-05, "loss": 4.0632, "step": 5750 }, { "epoch": 0.39101780133170266, "grad_norm": 0.19713403284549713, "learning_rate": 9.51182225845903e-05, "loss": 4.088, "step": 5755 }, { "epoch": 0.39135752140236446, "grad_norm": 0.26910024881362915, "learning_rate": 9.511397608370703e-05, "loss": 4.2572, "step": 5760 }, { "epoch": 0.3916972414730262, "grad_norm": 0.20750823616981506, "learning_rate": 9.510972958282376e-05, "loss": 4.0017, "step": 5765 }, { "epoch": 0.392036961543688, "grad_norm": 0.18255822360515594, "learning_rate": 9.510548308194049e-05, "loss": 4.4284, "step": 5770 }, { "epoch": 0.3923766816143498, "grad_norm": 0.19282065331935883, "learning_rate": 9.510123658105722e-05, "loss": 4.1651, "step": 5775 }, { "epoch": 0.39271640168501154, "grad_norm": 0.22694478929042816, "learning_rate": 9.509699008017395e-05, "loss": 4.0152, "step": 5780 }, { "epoch": 0.39305612175567334, "grad_norm": 0.2607446014881134, "learning_rate": 9.509274357929066e-05, "loss": 4.3358, "step": 5785 }, { "epoch": 0.3933958418263351, "grad_norm": 0.22173616290092468, "learning_rate": 9.50884970784074e-05, "loss": 4.2362, "step": 5790 }, { "epoch": 0.3937355618969969, "grad_norm": 0.20545057952404022, "learning_rate": 9.508425057752413e-05, "loss": 4.0821, "step": 5795 }, { "epoch": 0.3940752819676587, "grad_norm": 0.23421698808670044, "learning_rate": 9.508000407664084e-05, "loss": 4.2178, "step": 5800 }, { "epoch": 0.3944150020383204, "grad_norm": 0.2095632702112198, "learning_rate": 9.507575757575759e-05, "loss": 4.2833, "step": 5805 }, { "epoch": 0.3947547221089822, "grad_norm": 0.23404939472675323, "learning_rate": 9.507151107487431e-05, "loss": 4.0823, "step": 5810 }, { "epoch": 0.39509444217964396, "grad_norm": 0.23966114223003387, "learning_rate": 9.506726457399103e-05, "loss": 3.8059, "step": 5815 }, { "epoch": 0.39543416225030575, "grad_norm": 0.2027054876089096, "learning_rate": 9.506301807310777e-05, "loss": 4.2229, "step": 5820 }, { "epoch": 0.3957738823209675, "grad_norm": 0.18689711391925812, "learning_rate": 9.50587715722245e-05, "loss": 4.26, "step": 5825 }, { "epoch": 0.3961136023916293, "grad_norm": 0.263927698135376, "learning_rate": 9.505452507134121e-05, "loss": 4.021, "step": 5830 }, { "epoch": 0.3964533224622911, "grad_norm": 0.18399837613105774, "learning_rate": 9.505027857045795e-05, "loss": 4.218, "step": 5835 }, { "epoch": 0.39679304253295283, "grad_norm": 0.17031966149806976, "learning_rate": 9.504603206957468e-05, "loss": 3.9079, "step": 5840 }, { "epoch": 0.39713276260361463, "grad_norm": 0.3210891783237457, "learning_rate": 9.50417855686914e-05, "loss": 3.9466, "step": 5845 }, { "epoch": 0.3974724826742764, "grad_norm": 0.1981404423713684, "learning_rate": 9.503753906780814e-05, "loss": 4.001, "step": 5850 }, { "epoch": 0.39781220274493817, "grad_norm": 0.3136885464191437, "learning_rate": 9.503329256692485e-05, "loss": 4.058, "step": 5855 }, { "epoch": 0.39815192281559997, "grad_norm": 2.190765857696533, "learning_rate": 9.502904606604158e-05, "loss": 4.0696, "step": 5860 }, { "epoch": 0.3984916428862617, "grad_norm": 0.17315542697906494, "learning_rate": 9.502479956515832e-05, "loss": 4.2626, "step": 5865 }, { "epoch": 0.3988313629569235, "grad_norm": 0.33235201239585876, "learning_rate": 9.502055306427504e-05, "loss": 3.9844, "step": 5870 }, { "epoch": 0.39917108302758525, "grad_norm": 0.23391257226467133, "learning_rate": 9.501630656339176e-05, "loss": 3.8401, "step": 5875 }, { "epoch": 0.39951080309824705, "grad_norm": 0.24006325006484985, "learning_rate": 9.50120600625085e-05, "loss": 4.0724, "step": 5880 }, { "epoch": 0.39985052316890884, "grad_norm": 0.17999830842018127, "learning_rate": 9.500781356162522e-05, "loss": 4.2167, "step": 5885 }, { "epoch": 0.4001902432395706, "grad_norm": 0.18070223927497864, "learning_rate": 9.500356706074195e-05, "loss": 4.105, "step": 5890 }, { "epoch": 0.4005299633102324, "grad_norm": 0.19634656608104706, "learning_rate": 9.499932055985869e-05, "loss": 3.989, "step": 5895 }, { "epoch": 0.4008696833808941, "grad_norm": 0.23722241818904877, "learning_rate": 9.49950740589754e-05, "loss": 4.1728, "step": 5900 }, { "epoch": 0.4012094034515559, "grad_norm": 0.19146768748760223, "learning_rate": 9.499082755809213e-05, "loss": 4.1732, "step": 5905 }, { "epoch": 0.40154912352221767, "grad_norm": 0.21835100650787354, "learning_rate": 9.498658105720887e-05, "loss": 4.0836, "step": 5910 }, { "epoch": 0.40188884359287946, "grad_norm": 0.17060807347297668, "learning_rate": 9.498233455632559e-05, "loss": 4.0794, "step": 5915 }, { "epoch": 0.40222856366354126, "grad_norm": 0.9648451805114746, "learning_rate": 9.497808805544232e-05, "loss": 4.1605, "step": 5920 }, { "epoch": 0.402568283734203, "grad_norm": 0.1983519345521927, "learning_rate": 9.497384155455904e-05, "loss": 3.9843, "step": 5925 }, { "epoch": 0.4029080038048648, "grad_norm": 0.23070013523101807, "learning_rate": 9.496959505367577e-05, "loss": 3.9679, "step": 5930 }, { "epoch": 0.40324772387552654, "grad_norm": 1.1979519128799438, "learning_rate": 9.49653485527925e-05, "loss": 4.1856, "step": 5935 }, { "epoch": 0.40358744394618834, "grad_norm": 0.7521958947181702, "learning_rate": 9.496110205190923e-05, "loss": 4.082, "step": 5940 }, { "epoch": 0.40392716401685014, "grad_norm": 0.192143052816391, "learning_rate": 9.495685555102596e-05, "loss": 4.2121, "step": 5945 }, { "epoch": 0.4042668840875119, "grad_norm": 0.2611311972141266, "learning_rate": 9.495260905014268e-05, "loss": 4.2154, "step": 5950 }, { "epoch": 0.4046066041581737, "grad_norm": 0.18073415756225586, "learning_rate": 9.494836254925941e-05, "loss": 4.1762, "step": 5955 }, { "epoch": 0.4049463242288354, "grad_norm": 0.1921936720609665, "learning_rate": 9.494411604837614e-05, "loss": 4.0705, "step": 5960 }, { "epoch": 0.4052860442994972, "grad_norm": 0.16377374529838562, "learning_rate": 9.493986954749287e-05, "loss": 4.1785, "step": 5965 }, { "epoch": 0.405625764370159, "grad_norm": 0.21104343235492706, "learning_rate": 9.49356230466096e-05, "loss": 4.2715, "step": 5970 }, { "epoch": 0.40596548444082076, "grad_norm": 0.2071741223335266, "learning_rate": 9.493137654572632e-05, "loss": 3.9067, "step": 5975 }, { "epoch": 0.40630520451148255, "grad_norm": 0.22247660160064697, "learning_rate": 9.492713004484305e-05, "loss": 4.0561, "step": 5980 }, { "epoch": 0.4066449245821443, "grad_norm": 0.20433616638183594, "learning_rate": 9.492288354395978e-05, "loss": 4.0168, "step": 5985 }, { "epoch": 0.4069846446528061, "grad_norm": 0.2049606889486313, "learning_rate": 9.491863704307651e-05, "loss": 4.3927, "step": 5990 }, { "epoch": 0.40732436472346784, "grad_norm": 0.22720476984977722, "learning_rate": 9.491439054219324e-05, "loss": 4.2648, "step": 5995 }, { "epoch": 0.40766408479412963, "grad_norm": 0.25233665108680725, "learning_rate": 9.491014404130996e-05, "loss": 3.9343, "step": 6000 }, { "epoch": 0.40800380486479143, "grad_norm": 0.21542391180992126, "learning_rate": 9.490589754042669e-05, "loss": 4.2302, "step": 6005 }, { "epoch": 0.4083435249354532, "grad_norm": 0.8740308284759521, "learning_rate": 9.490165103954342e-05, "loss": 4.0408, "step": 6010 }, { "epoch": 0.40868324500611497, "grad_norm": 0.18519490957260132, "learning_rate": 9.489740453866015e-05, "loss": 4.0773, "step": 6015 }, { "epoch": 0.4090229650767767, "grad_norm": 0.2651638388633728, "learning_rate": 9.489315803777688e-05, "loss": 4.1498, "step": 6020 }, { "epoch": 0.4093626851474385, "grad_norm": 0.4929540157318115, "learning_rate": 9.48889115368936e-05, "loss": 3.9128, "step": 6025 }, { "epoch": 0.4097024052181003, "grad_norm": 0.20049385726451874, "learning_rate": 9.488466503601033e-05, "loss": 4.0097, "step": 6030 }, { "epoch": 0.41004212528876205, "grad_norm": 0.17493902146816254, "learning_rate": 9.488041853512706e-05, "loss": 4.1654, "step": 6035 }, { "epoch": 0.41038184535942385, "grad_norm": 0.30577751994132996, "learning_rate": 9.487617203424379e-05, "loss": 4.0563, "step": 6040 }, { "epoch": 0.4107215654300856, "grad_norm": 0.2669510543346405, "learning_rate": 9.487192553336052e-05, "loss": 4.0721, "step": 6045 }, { "epoch": 0.4110612855007474, "grad_norm": 0.18722812831401825, "learning_rate": 9.486767903247724e-05, "loss": 4.1749, "step": 6050 }, { "epoch": 0.4114010055714092, "grad_norm": 0.1755664199590683, "learning_rate": 9.486343253159397e-05, "loss": 4.1804, "step": 6055 }, { "epoch": 0.4117407256420709, "grad_norm": 0.27995565533638, "learning_rate": 9.48591860307107e-05, "loss": 3.9794, "step": 6060 }, { "epoch": 0.4120804457127327, "grad_norm": 0.2518627345561981, "learning_rate": 9.485493952982743e-05, "loss": 4.2129, "step": 6065 }, { "epoch": 0.41242016578339447, "grad_norm": 0.21856893599033356, "learning_rate": 9.485069302894416e-05, "loss": 4.3444, "step": 6070 }, { "epoch": 0.41275988585405626, "grad_norm": 1.2819626331329346, "learning_rate": 9.484644652806088e-05, "loss": 3.6595, "step": 6075 }, { "epoch": 0.413099605924718, "grad_norm": 0.25162968039512634, "learning_rate": 9.484220002717761e-05, "loss": 4.2169, "step": 6080 }, { "epoch": 0.4134393259953798, "grad_norm": 0.2659519910812378, "learning_rate": 9.483795352629434e-05, "loss": 4.0789, "step": 6085 }, { "epoch": 0.4137790460660416, "grad_norm": 0.20415237545967102, "learning_rate": 9.483370702541107e-05, "loss": 4.2934, "step": 6090 }, { "epoch": 0.41411876613670334, "grad_norm": 0.33283525705337524, "learning_rate": 9.48294605245278e-05, "loss": 4.0536, "step": 6095 }, { "epoch": 0.41445848620736514, "grad_norm": 0.21782909333705902, "learning_rate": 9.482521402364452e-05, "loss": 3.9827, "step": 6100 }, { "epoch": 0.4147982062780269, "grad_norm": 4.150507926940918, "learning_rate": 9.482096752276125e-05, "loss": 3.9654, "step": 6105 }, { "epoch": 0.4151379263486887, "grad_norm": 0.20316611230373383, "learning_rate": 9.481672102187798e-05, "loss": 4.045, "step": 6110 }, { "epoch": 0.4154776464193505, "grad_norm": 1.9692164659500122, "learning_rate": 9.481247452099471e-05, "loss": 4.0912, "step": 6115 }, { "epoch": 0.4158173664900122, "grad_norm": 0.21312934160232544, "learning_rate": 9.480822802011144e-05, "loss": 4.2765, "step": 6120 }, { "epoch": 0.416157086560674, "grad_norm": 0.1886243224143982, "learning_rate": 9.480398151922815e-05, "loss": 4.0305, "step": 6125 }, { "epoch": 0.41649680663133576, "grad_norm": 0.22211480140686035, "learning_rate": 9.479973501834489e-05, "loss": 3.906, "step": 6130 }, { "epoch": 0.41683652670199756, "grad_norm": 0.24448561668395996, "learning_rate": 9.479548851746162e-05, "loss": 3.9652, "step": 6135 }, { "epoch": 0.41717624677265935, "grad_norm": 0.2330089956521988, "learning_rate": 9.479124201657834e-05, "loss": 4.0935, "step": 6140 }, { "epoch": 0.4175159668433211, "grad_norm": 0.354692280292511, "learning_rate": 9.478699551569508e-05, "loss": 4.1893, "step": 6145 }, { "epoch": 0.4178556869139829, "grad_norm": 0.31616339087486267, "learning_rate": 9.47827490148118e-05, "loss": 4.1393, "step": 6150 }, { "epoch": 0.41819540698464464, "grad_norm": 0.6363674402236938, "learning_rate": 9.477850251392852e-05, "loss": 4.1578, "step": 6155 }, { "epoch": 0.41853512705530643, "grad_norm": 0.19385862350463867, "learning_rate": 9.477425601304526e-05, "loss": 4.0994, "step": 6160 }, { "epoch": 0.4188748471259682, "grad_norm": 0.20381571352481842, "learning_rate": 9.477000951216199e-05, "loss": 4.1873, "step": 6165 }, { "epoch": 0.41921456719663, "grad_norm": 0.20795594155788422, "learning_rate": 9.47657630112787e-05, "loss": 4.0544, "step": 6170 }, { "epoch": 0.41955428726729177, "grad_norm": 0.3839801847934723, "learning_rate": 9.476151651039544e-05, "loss": 4.1777, "step": 6175 }, { "epoch": 0.4198940073379535, "grad_norm": 0.2491442710161209, "learning_rate": 9.475727000951217e-05, "loss": 3.9298, "step": 6180 }, { "epoch": 0.4202337274086153, "grad_norm": 0.1739528328180313, "learning_rate": 9.475302350862889e-05, "loss": 4.0482, "step": 6185 }, { "epoch": 0.42057344747927705, "grad_norm": 0.19341996312141418, "learning_rate": 9.474877700774563e-05, "loss": 3.911, "step": 6190 }, { "epoch": 0.42091316754993885, "grad_norm": 0.16241292655467987, "learning_rate": 9.474453050686236e-05, "loss": 4.0751, "step": 6195 }, { "epoch": 0.42125288762060065, "grad_norm": 0.16985565423965454, "learning_rate": 9.474028400597907e-05, "loss": 3.9359, "step": 6200 }, { "epoch": 0.4215926076912624, "grad_norm": 0.21724484860897064, "learning_rate": 9.473603750509581e-05, "loss": 4.0409, "step": 6205 }, { "epoch": 0.4219323277619242, "grad_norm": 0.21480692923069, "learning_rate": 9.473179100421253e-05, "loss": 3.987, "step": 6210 }, { "epoch": 0.42227204783258593, "grad_norm": 0.2604687809944153, "learning_rate": 9.472754450332926e-05, "loss": 3.89, "step": 6215 }, { "epoch": 0.4226117679032477, "grad_norm": 0.22292381525039673, "learning_rate": 9.4723298002446e-05, "loss": 4.029, "step": 6220 }, { "epoch": 0.4229514879739095, "grad_norm": 0.2695325016975403, "learning_rate": 9.471905150156271e-05, "loss": 4.0199, "step": 6225 }, { "epoch": 0.42329120804457127, "grad_norm": 0.17921492457389832, "learning_rate": 9.471480500067944e-05, "loss": 4.0195, "step": 6230 }, { "epoch": 0.42363092811523306, "grad_norm": 0.29654955863952637, "learning_rate": 9.471055849979618e-05, "loss": 4.2356, "step": 6235 }, { "epoch": 0.4239706481858948, "grad_norm": 0.3091282844543457, "learning_rate": 9.47063119989129e-05, "loss": 3.8808, "step": 6240 }, { "epoch": 0.4243103682565566, "grad_norm": 0.20580576360225677, "learning_rate": 9.470206549802962e-05, "loss": 4.0886, "step": 6245 }, { "epoch": 0.42465008832721834, "grad_norm": 0.19273176789283752, "learning_rate": 9.469781899714636e-05, "loss": 3.9993, "step": 6250 }, { "epoch": 0.42498980839788014, "grad_norm": 0.18639002740383148, "learning_rate": 9.469357249626308e-05, "loss": 4.0441, "step": 6255 }, { "epoch": 0.42532952846854194, "grad_norm": 2.978999614715576, "learning_rate": 9.468932599537981e-05, "loss": 3.7926, "step": 6260 }, { "epoch": 0.4256692485392037, "grad_norm": 0.17055857181549072, "learning_rate": 9.468507949449655e-05, "loss": 4.0499, "step": 6265 }, { "epoch": 0.4260089686098655, "grad_norm": 1.0542086362838745, "learning_rate": 9.468083299361326e-05, "loss": 4.2183, "step": 6270 }, { "epoch": 0.4263486886805272, "grad_norm": 1.1902449131011963, "learning_rate": 9.467658649272999e-05, "loss": 3.9083, "step": 6275 }, { "epoch": 0.426688408751189, "grad_norm": 0.26639068126678467, "learning_rate": 9.467233999184672e-05, "loss": 3.9113, "step": 6280 }, { "epoch": 0.4270281288218508, "grad_norm": 0.2732694149017334, "learning_rate": 9.466809349096345e-05, "loss": 4.1468, "step": 6285 }, { "epoch": 0.42736784889251256, "grad_norm": 0.27546462416648865, "learning_rate": 9.466384699008018e-05, "loss": 3.9733, "step": 6290 }, { "epoch": 0.42770756896317436, "grad_norm": 0.1994089037179947, "learning_rate": 9.46596004891969e-05, "loss": 4.1123, "step": 6295 }, { "epoch": 0.4280472890338361, "grad_norm": 0.20160436630249023, "learning_rate": 9.465535398831363e-05, "loss": 4.0467, "step": 6300 }, { "epoch": 0.4283870091044979, "grad_norm": 0.2255067527294159, "learning_rate": 9.465110748743036e-05, "loss": 3.7712, "step": 6305 }, { "epoch": 0.4287267291751597, "grad_norm": 0.1755346655845642, "learning_rate": 9.464686098654709e-05, "loss": 4.3119, "step": 6310 }, { "epoch": 0.42906644924582144, "grad_norm": 0.16779784858226776, "learning_rate": 9.464261448566382e-05, "loss": 4.1447, "step": 6315 }, { "epoch": 0.42940616931648323, "grad_norm": 0.2779237926006317, "learning_rate": 9.463836798478054e-05, "loss": 4.1285, "step": 6320 }, { "epoch": 0.429745889387145, "grad_norm": 0.21850286424160004, "learning_rate": 9.463412148389727e-05, "loss": 4.118, "step": 6325 }, { "epoch": 0.4300856094578068, "grad_norm": 0.2127694934606552, "learning_rate": 9.4629874983014e-05, "loss": 4.1073, "step": 6330 }, { "epoch": 0.4304253295284685, "grad_norm": 0.24460366368293762, "learning_rate": 9.462562848213073e-05, "loss": 4.0523, "step": 6335 }, { "epoch": 0.4307650495991303, "grad_norm": 0.18039904534816742, "learning_rate": 9.462138198124746e-05, "loss": 4.205, "step": 6340 }, { "epoch": 0.4311047696697921, "grad_norm": 0.23940406739711761, "learning_rate": 9.461713548036418e-05, "loss": 4.0222, "step": 6345 }, { "epoch": 0.43144448974045385, "grad_norm": 0.223390132188797, "learning_rate": 9.461288897948091e-05, "loss": 4.0133, "step": 6350 }, { "epoch": 0.43178420981111565, "grad_norm": 0.20645423233509064, "learning_rate": 9.460864247859764e-05, "loss": 3.9967, "step": 6355 }, { "epoch": 0.4321239298817774, "grad_norm": 0.7943032383918762, "learning_rate": 9.460439597771437e-05, "loss": 3.927, "step": 6360 }, { "epoch": 0.4324636499524392, "grad_norm": 0.1684955656528473, "learning_rate": 9.46001494768311e-05, "loss": 4.0099, "step": 6365 }, { "epoch": 0.432803370023101, "grad_norm": 0.21439214050769806, "learning_rate": 9.459590297594782e-05, "loss": 3.7963, "step": 6370 }, { "epoch": 0.43314309009376273, "grad_norm": 0.21478402614593506, "learning_rate": 9.459165647506455e-05, "loss": 4.0739, "step": 6375 }, { "epoch": 0.4334828101644245, "grad_norm": 0.8936269283294678, "learning_rate": 9.458740997418128e-05, "loss": 4.1838, "step": 6380 }, { "epoch": 0.43382253023508627, "grad_norm": 0.5096725821495056, "learning_rate": 9.458316347329801e-05, "loss": 4.047, "step": 6385 }, { "epoch": 0.43416225030574807, "grad_norm": 0.285972535610199, "learning_rate": 9.457891697241474e-05, "loss": 4.1083, "step": 6390 }, { "epoch": 0.43450197037640986, "grad_norm": 0.1650124043226242, "learning_rate": 9.457467047153146e-05, "loss": 4.157, "step": 6395 }, { "epoch": 0.4348416904470716, "grad_norm": 0.19191978871822357, "learning_rate": 9.457042397064819e-05, "loss": 3.9201, "step": 6400 }, { "epoch": 0.4351814105177334, "grad_norm": 0.20855942368507385, "learning_rate": 9.456617746976492e-05, "loss": 4.1146, "step": 6405 }, { "epoch": 0.43552113058839514, "grad_norm": 0.17791011929512024, "learning_rate": 9.456193096888165e-05, "loss": 3.9971, "step": 6410 }, { "epoch": 0.43586085065905694, "grad_norm": 0.2120276242494583, "learning_rate": 9.455768446799838e-05, "loss": 4.0967, "step": 6415 }, { "epoch": 0.4362005707297187, "grad_norm": 0.20230713486671448, "learning_rate": 9.45534379671151e-05, "loss": 3.902, "step": 6420 }, { "epoch": 0.4365402908003805, "grad_norm": 0.5752553343772888, "learning_rate": 9.454919146623183e-05, "loss": 3.9801, "step": 6425 }, { "epoch": 0.4368800108710423, "grad_norm": 0.19792981445789337, "learning_rate": 9.454494496534856e-05, "loss": 4.2249, "step": 6430 }, { "epoch": 0.437219730941704, "grad_norm": 0.20685099065303802, "learning_rate": 9.454069846446529e-05, "loss": 4.2953, "step": 6435 }, { "epoch": 0.4375594510123658, "grad_norm": 0.22175006568431854, "learning_rate": 9.453645196358202e-05, "loss": 3.9621, "step": 6440 }, { "epoch": 0.43789917108302756, "grad_norm": 0.4952181875705719, "learning_rate": 9.453220546269874e-05, "loss": 3.986, "step": 6445 }, { "epoch": 0.43823889115368936, "grad_norm": 0.20618560910224915, "learning_rate": 9.452795896181547e-05, "loss": 3.9802, "step": 6450 }, { "epoch": 0.43857861122435116, "grad_norm": 1.136326551437378, "learning_rate": 9.45237124609322e-05, "loss": 4.2695, "step": 6455 }, { "epoch": 0.4389183312950129, "grad_norm": 0.22814400494098663, "learning_rate": 9.451946596004893e-05, "loss": 3.8791, "step": 6460 }, { "epoch": 0.4392580513656747, "grad_norm": 0.24193866550922394, "learning_rate": 9.451521945916566e-05, "loss": 3.9838, "step": 6465 }, { "epoch": 0.43959777143633644, "grad_norm": 0.25064903497695923, "learning_rate": 9.451097295828238e-05, "loss": 3.9355, "step": 6470 }, { "epoch": 0.43993749150699824, "grad_norm": 1.2106342315673828, "learning_rate": 9.450672645739911e-05, "loss": 4.1844, "step": 6475 }, { "epoch": 0.44027721157766003, "grad_norm": 0.16492827236652374, "learning_rate": 9.450247995651583e-05, "loss": 4.2447, "step": 6480 }, { "epoch": 0.4406169316483218, "grad_norm": 0.1795361191034317, "learning_rate": 9.449823345563257e-05, "loss": 3.8589, "step": 6485 }, { "epoch": 0.44095665171898357, "grad_norm": 0.19358831644058228, "learning_rate": 9.44939869547493e-05, "loss": 4.2963, "step": 6490 }, { "epoch": 0.4412963717896453, "grad_norm": 0.2732826769351959, "learning_rate": 9.448974045386601e-05, "loss": 3.9789, "step": 6495 }, { "epoch": 0.4416360918603071, "grad_norm": 0.23638220131397247, "learning_rate": 9.448549395298275e-05, "loss": 4.2448, "step": 6500 }, { "epoch": 0.4419758119309689, "grad_norm": 0.2072085738182068, "learning_rate": 9.448124745209948e-05, "loss": 3.8356, "step": 6505 }, { "epoch": 0.44231553200163065, "grad_norm": 3.1101341247558594, "learning_rate": 9.44770009512162e-05, "loss": 4.2411, "step": 6510 }, { "epoch": 0.44265525207229245, "grad_norm": 0.4264751374721527, "learning_rate": 9.447275445033294e-05, "loss": 3.9676, "step": 6515 }, { "epoch": 0.4429949721429542, "grad_norm": 0.20776435732841492, "learning_rate": 9.446850794944966e-05, "loss": 3.8493, "step": 6520 }, { "epoch": 0.443334692213616, "grad_norm": 0.3044533133506775, "learning_rate": 9.446426144856638e-05, "loss": 4.1147, "step": 6525 }, { "epoch": 0.44367441228427773, "grad_norm": 0.16665169596672058, "learning_rate": 9.446001494768312e-05, "loss": 3.9521, "step": 6530 }, { "epoch": 0.44401413235493953, "grad_norm": 0.2023710161447525, "learning_rate": 9.445576844679985e-05, "loss": 3.9206, "step": 6535 }, { "epoch": 0.4443538524256013, "grad_norm": 0.4145415425300598, "learning_rate": 9.445152194591656e-05, "loss": 4.0138, "step": 6540 }, { "epoch": 0.44469357249626307, "grad_norm": 0.16682837903499603, "learning_rate": 9.44472754450333e-05, "loss": 4.0957, "step": 6545 }, { "epoch": 0.44503329256692487, "grad_norm": 0.2003334015607834, "learning_rate": 9.444302894415002e-05, "loss": 3.8427, "step": 6550 }, { "epoch": 0.4453730126375866, "grad_norm": 0.29585328698158264, "learning_rate": 9.443878244326675e-05, "loss": 4.2463, "step": 6555 }, { "epoch": 0.4457127327082484, "grad_norm": 0.20154406130313873, "learning_rate": 9.443453594238349e-05, "loss": 4.2644, "step": 6560 }, { "epoch": 0.4460524527789102, "grad_norm": 0.23148468136787415, "learning_rate": 9.44302894415002e-05, "loss": 3.9205, "step": 6565 }, { "epoch": 0.44639217284957194, "grad_norm": 0.1762179434299469, "learning_rate": 9.442604294061693e-05, "loss": 4.0293, "step": 6570 }, { "epoch": 0.44673189292023374, "grad_norm": 0.4714028835296631, "learning_rate": 9.442179643973367e-05, "loss": 4.2011, "step": 6575 }, { "epoch": 0.4470716129908955, "grad_norm": 0.368407666683197, "learning_rate": 9.441754993885039e-05, "loss": 4.0047, "step": 6580 }, { "epoch": 0.4474113330615573, "grad_norm": 0.28887784481048584, "learning_rate": 9.441330343796711e-05, "loss": 4.0332, "step": 6585 }, { "epoch": 0.4477510531322191, "grad_norm": 0.25729164481163025, "learning_rate": 9.440905693708386e-05, "loss": 4.0735, "step": 6590 }, { "epoch": 0.4480907732028808, "grad_norm": 0.1723019927740097, "learning_rate": 9.440481043620057e-05, "loss": 4.0399, "step": 6595 }, { "epoch": 0.4484304932735426, "grad_norm": 0.2043658047914505, "learning_rate": 9.44005639353173e-05, "loss": 4.2346, "step": 6600 }, { "epoch": 0.44877021334420436, "grad_norm": 0.15108297765254974, "learning_rate": 9.439631743443404e-05, "loss": 3.9109, "step": 6605 }, { "epoch": 0.44910993341486616, "grad_norm": 0.18265971541404724, "learning_rate": 9.439207093355075e-05, "loss": 3.845, "step": 6610 }, { "epoch": 0.4494496534855279, "grad_norm": 0.9520887732505798, "learning_rate": 9.438782443266748e-05, "loss": 4.2181, "step": 6615 }, { "epoch": 0.4497893735561897, "grad_norm": 0.28121015429496765, "learning_rate": 9.438357793178422e-05, "loss": 4.0234, "step": 6620 }, { "epoch": 0.4501290936268515, "grad_norm": 0.21010081470012665, "learning_rate": 9.437933143090094e-05, "loss": 4.1509, "step": 6625 }, { "epoch": 0.45046881369751324, "grad_norm": 0.23798321187496185, "learning_rate": 9.437508493001767e-05, "loss": 4.2879, "step": 6630 }, { "epoch": 0.45080853376817503, "grad_norm": 0.20470625162124634, "learning_rate": 9.43708384291344e-05, "loss": 4.0134, "step": 6635 }, { "epoch": 0.4511482538388368, "grad_norm": 0.19223268330097198, "learning_rate": 9.436659192825112e-05, "loss": 4.1928, "step": 6640 }, { "epoch": 0.4514879739094986, "grad_norm": 0.1938367336988449, "learning_rate": 9.436234542736785e-05, "loss": 3.923, "step": 6645 }, { "epoch": 0.45182769398016037, "grad_norm": 0.2842964231967926, "learning_rate": 9.435809892648458e-05, "loss": 4.2081, "step": 6650 }, { "epoch": 0.4521674140508221, "grad_norm": 0.22615915536880493, "learning_rate": 9.43538524256013e-05, "loss": 4.0994, "step": 6655 }, { "epoch": 0.4525071341214839, "grad_norm": 0.23465953767299652, "learning_rate": 9.434960592471803e-05, "loss": 3.8209, "step": 6660 }, { "epoch": 0.45284685419214565, "grad_norm": 0.17599263787269592, "learning_rate": 9.434535942383476e-05, "loss": 3.8742, "step": 6665 }, { "epoch": 0.45318657426280745, "grad_norm": 0.5463417172431946, "learning_rate": 9.434111292295149e-05, "loss": 4.1574, "step": 6670 }, { "epoch": 0.45352629433346925, "grad_norm": 0.21346516907215118, "learning_rate": 9.433686642206822e-05, "loss": 4.3701, "step": 6675 }, { "epoch": 0.453866014404131, "grad_norm": 0.2235599011182785, "learning_rate": 9.433261992118495e-05, "loss": 4.1951, "step": 6680 }, { "epoch": 0.4542057344747928, "grad_norm": 0.2730211615562439, "learning_rate": 9.432837342030167e-05, "loss": 3.8224, "step": 6685 }, { "epoch": 0.45454545454545453, "grad_norm": 0.1868310272693634, "learning_rate": 9.43241269194184e-05, "loss": 3.7951, "step": 6690 }, { "epoch": 0.45488517461611633, "grad_norm": 0.3626730442047119, "learning_rate": 9.431988041853513e-05, "loss": 4.1127, "step": 6695 }, { "epoch": 0.45522489468677807, "grad_norm": 0.22474856674671173, "learning_rate": 9.431563391765186e-05, "loss": 3.8253, "step": 6700 }, { "epoch": 0.45556461475743987, "grad_norm": 0.3556784689426422, "learning_rate": 9.431138741676859e-05, "loss": 4.0104, "step": 6705 }, { "epoch": 0.45590433482810166, "grad_norm": 1.793366551399231, "learning_rate": 9.430714091588531e-05, "loss": 4.1386, "step": 6710 }, { "epoch": 0.4562440548987634, "grad_norm": 0.18291668593883514, "learning_rate": 9.430289441500204e-05, "loss": 4.118, "step": 6715 }, { "epoch": 0.4565837749694252, "grad_norm": 0.3133949339389801, "learning_rate": 9.429864791411877e-05, "loss": 3.9527, "step": 6720 }, { "epoch": 0.45692349504008695, "grad_norm": 0.2146921306848526, "learning_rate": 9.42944014132355e-05, "loss": 3.9596, "step": 6725 }, { "epoch": 0.45726321511074874, "grad_norm": 0.17036296427249908, "learning_rate": 9.429015491235223e-05, "loss": 4.0381, "step": 6730 }, { "epoch": 0.45760293518141054, "grad_norm": 0.32481151819229126, "learning_rate": 9.428590841146895e-05, "loss": 4.0932, "step": 6735 }, { "epoch": 0.4579426552520723, "grad_norm": 0.18955262005329132, "learning_rate": 9.428166191058568e-05, "loss": 4.1081, "step": 6740 }, { "epoch": 0.4582823753227341, "grad_norm": 0.2482958883047104, "learning_rate": 9.427741540970241e-05, "loss": 4.036, "step": 6745 }, { "epoch": 0.4586220953933958, "grad_norm": 0.1786300390958786, "learning_rate": 9.427316890881914e-05, "loss": 4.2296, "step": 6750 }, { "epoch": 0.4589618154640576, "grad_norm": 0.19114576280117035, "learning_rate": 9.426892240793587e-05, "loss": 4.1781, "step": 6755 }, { "epoch": 0.4593015355347194, "grad_norm": 0.1777360886335373, "learning_rate": 9.42646759070526e-05, "loss": 3.7195, "step": 6760 }, { "epoch": 0.45964125560538116, "grad_norm": 0.2255825698375702, "learning_rate": 9.426042940616932e-05, "loss": 4.0727, "step": 6765 }, { "epoch": 0.45998097567604296, "grad_norm": 2.2815988063812256, "learning_rate": 9.425618290528605e-05, "loss": 4.1177, "step": 6770 }, { "epoch": 0.4603206957467047, "grad_norm": 0.18281744420528412, "learning_rate": 9.425193640440278e-05, "loss": 4.1818, "step": 6775 }, { "epoch": 0.4606604158173665, "grad_norm": 0.1932012438774109, "learning_rate": 9.42476899035195e-05, "loss": 4.207, "step": 6780 }, { "epoch": 0.46100013588802824, "grad_norm": 0.3870634138584137, "learning_rate": 9.424344340263623e-05, "loss": 4.0997, "step": 6785 }, { "epoch": 0.46133985595869004, "grad_norm": 0.20440851151943207, "learning_rate": 9.423919690175296e-05, "loss": 3.9454, "step": 6790 }, { "epoch": 0.46167957602935183, "grad_norm": 0.21663671731948853, "learning_rate": 9.423495040086969e-05, "loss": 4.013, "step": 6795 }, { "epoch": 0.4620192961000136, "grad_norm": 0.18921007215976715, "learning_rate": 9.423070389998642e-05, "loss": 4.0641, "step": 6800 }, { "epoch": 0.4623590161706754, "grad_norm": 0.21008360385894775, "learning_rate": 9.422645739910315e-05, "loss": 3.8814, "step": 6805 }, { "epoch": 0.4626987362413371, "grad_norm": 1.4397491216659546, "learning_rate": 9.422221089821987e-05, "loss": 3.977, "step": 6810 }, { "epoch": 0.4630384563119989, "grad_norm": 0.1650581657886505, "learning_rate": 9.42179643973366e-05, "loss": 3.8511, "step": 6815 }, { "epoch": 0.4633781763826607, "grad_norm": 0.24074020981788635, "learning_rate": 9.421371789645333e-05, "loss": 4.4683, "step": 6820 }, { "epoch": 0.46371789645332245, "grad_norm": 0.2204151600599289, "learning_rate": 9.420947139557006e-05, "loss": 4.077, "step": 6825 }, { "epoch": 0.46405761652398425, "grad_norm": 0.24461984634399414, "learning_rate": 9.420522489468679e-05, "loss": 3.9247, "step": 6830 }, { "epoch": 0.464397336594646, "grad_norm": 0.19434142112731934, "learning_rate": 9.42009783938035e-05, "loss": 3.8453, "step": 6835 }, { "epoch": 0.4647370566653078, "grad_norm": 0.2689877450466156, "learning_rate": 9.419673189292024e-05, "loss": 4.0808, "step": 6840 }, { "epoch": 0.4650767767359696, "grad_norm": 0.2343098372220993, "learning_rate": 9.419248539203697e-05, "loss": 4.0781, "step": 6845 }, { "epoch": 0.46541649680663133, "grad_norm": 0.19415900111198425, "learning_rate": 9.418823889115369e-05, "loss": 4.1067, "step": 6850 }, { "epoch": 0.4657562168772931, "grad_norm": 0.20309092104434967, "learning_rate": 9.418399239027043e-05, "loss": 4.0426, "step": 6855 }, { "epoch": 0.46609593694795487, "grad_norm": 0.35404348373413086, "learning_rate": 9.417974588938715e-05, "loss": 4.0913, "step": 6860 }, { "epoch": 0.46643565701861667, "grad_norm": 0.8057840466499329, "learning_rate": 9.417549938850387e-05, "loss": 4.0028, "step": 6865 }, { "epoch": 0.4667753770892784, "grad_norm": 0.23458503186702728, "learning_rate": 9.417125288762061e-05, "loss": 3.878, "step": 6870 }, { "epoch": 0.4671150971599402, "grad_norm": 0.2019844651222229, "learning_rate": 9.416700638673734e-05, "loss": 3.7162, "step": 6875 }, { "epoch": 0.467454817230602, "grad_norm": 0.19805146753787994, "learning_rate": 9.416275988585405e-05, "loss": 3.8687, "step": 6880 }, { "epoch": 0.46779453730126375, "grad_norm": 0.2559395730495453, "learning_rate": 9.41585133849708e-05, "loss": 4.0791, "step": 6885 }, { "epoch": 0.46813425737192554, "grad_norm": 0.3069989085197449, "learning_rate": 9.415426688408752e-05, "loss": 4.1204, "step": 6890 }, { "epoch": 0.4684739774425873, "grad_norm": 0.5808936953544617, "learning_rate": 9.415002038320424e-05, "loss": 4.078, "step": 6895 }, { "epoch": 0.4688136975132491, "grad_norm": 0.2510988414287567, "learning_rate": 9.414577388232098e-05, "loss": 4.23, "step": 6900 }, { "epoch": 0.4691534175839109, "grad_norm": 0.2112618386745453, "learning_rate": 9.414152738143769e-05, "loss": 4.1868, "step": 6905 }, { "epoch": 0.4694931376545726, "grad_norm": 0.22074821591377258, "learning_rate": 9.413728088055442e-05, "loss": 4.116, "step": 6910 }, { "epoch": 0.4698328577252344, "grad_norm": 0.18397152423858643, "learning_rate": 9.413303437967116e-05, "loss": 4.0599, "step": 6915 }, { "epoch": 0.47017257779589616, "grad_norm": 0.23679019510746002, "learning_rate": 9.412878787878788e-05, "loss": 3.8418, "step": 6920 }, { "epoch": 0.47051229786655796, "grad_norm": 0.16602519154548645, "learning_rate": 9.41245413779046e-05, "loss": 4.0308, "step": 6925 }, { "epoch": 0.47085201793721976, "grad_norm": 0.2898738384246826, "learning_rate": 9.412029487702135e-05, "loss": 4.2135, "step": 6930 }, { "epoch": 0.4711917380078815, "grad_norm": 0.21048447489738464, "learning_rate": 9.411604837613806e-05, "loss": 3.9954, "step": 6935 }, { "epoch": 0.4715314580785433, "grad_norm": 0.16546538472175598, "learning_rate": 9.411180187525479e-05, "loss": 4.1153, "step": 6940 }, { "epoch": 0.47187117814920504, "grad_norm": 0.5077167749404907, "learning_rate": 9.410755537437153e-05, "loss": 4.1654, "step": 6945 }, { "epoch": 0.47221089821986684, "grad_norm": 0.20563165843486786, "learning_rate": 9.410330887348825e-05, "loss": 3.9387, "step": 6950 }, { "epoch": 0.4725506182905286, "grad_norm": 0.2332395762205124, "learning_rate": 9.409906237260497e-05, "loss": 4.2803, "step": 6955 }, { "epoch": 0.4728903383611904, "grad_norm": 0.9494916796684265, "learning_rate": 9.409481587172171e-05, "loss": 4.1342, "step": 6960 }, { "epoch": 0.4732300584318522, "grad_norm": 0.4015176296234131, "learning_rate": 9.409056937083843e-05, "loss": 4.1532, "step": 6965 }, { "epoch": 0.4735697785025139, "grad_norm": 0.17285263538360596, "learning_rate": 9.408632286995516e-05, "loss": 3.8368, "step": 6970 }, { "epoch": 0.4739094985731757, "grad_norm": 0.24160470068454742, "learning_rate": 9.408207636907189e-05, "loss": 4.0496, "step": 6975 }, { "epoch": 0.47424921864383746, "grad_norm": 0.16916899383068085, "learning_rate": 9.407782986818861e-05, "loss": 4.3213, "step": 6980 }, { "epoch": 0.47458893871449925, "grad_norm": 0.1876440942287445, "learning_rate": 9.407358336730534e-05, "loss": 4.0836, "step": 6985 }, { "epoch": 0.47492865878516105, "grad_norm": 0.20803870260715485, "learning_rate": 9.406933686642207e-05, "loss": 4.0582, "step": 6990 }, { "epoch": 0.4752683788558228, "grad_norm": 0.5098522305488586, "learning_rate": 9.40650903655388e-05, "loss": 3.7007, "step": 6995 }, { "epoch": 0.4756080989264846, "grad_norm": 0.28446561098098755, "learning_rate": 9.406084386465553e-05, "loss": 3.7383, "step": 7000 }, { "epoch": 0.47594781899714633, "grad_norm": 1.344814658164978, "learning_rate": 9.405659736377225e-05, "loss": 4.3367, "step": 7005 }, { "epoch": 0.47628753906780813, "grad_norm": 0.3608788549900055, "learning_rate": 9.405235086288898e-05, "loss": 4.0906, "step": 7010 }, { "epoch": 0.4766272591384699, "grad_norm": 0.2733428478240967, "learning_rate": 9.404810436200571e-05, "loss": 4.079, "step": 7015 }, { "epoch": 0.47696697920913167, "grad_norm": 0.2144654393196106, "learning_rate": 9.404385786112244e-05, "loss": 3.5828, "step": 7020 }, { "epoch": 0.47730669927979347, "grad_norm": 0.21566329896450043, "learning_rate": 9.403961136023917e-05, "loss": 4.1559, "step": 7025 }, { "epoch": 0.4776464193504552, "grad_norm": 0.19979317486286163, "learning_rate": 9.40353648593559e-05, "loss": 4.078, "step": 7030 }, { "epoch": 0.477986139421117, "grad_norm": 0.179921954870224, "learning_rate": 9.403111835847262e-05, "loss": 3.9188, "step": 7035 }, { "epoch": 0.47832585949177875, "grad_norm": 0.17742060124874115, "learning_rate": 9.402687185758935e-05, "loss": 4.023, "step": 7040 }, { "epoch": 0.47866557956244055, "grad_norm": 0.3981129229068756, "learning_rate": 9.402262535670608e-05, "loss": 4.0171, "step": 7045 }, { "epoch": 0.47900529963310234, "grad_norm": 0.206672802567482, "learning_rate": 9.40183788558228e-05, "loss": 4.175, "step": 7050 }, { "epoch": 0.4793450197037641, "grad_norm": 0.1959320604801178, "learning_rate": 9.401413235493953e-05, "loss": 4.1638, "step": 7055 }, { "epoch": 0.4796847397744259, "grad_norm": 0.2009037733078003, "learning_rate": 9.400988585405626e-05, "loss": 4.1122, "step": 7060 }, { "epoch": 0.4800244598450876, "grad_norm": 0.17592014372348785, "learning_rate": 9.400563935317299e-05, "loss": 4.0237, "step": 7065 }, { "epoch": 0.4803641799157494, "grad_norm": 0.26748034358024597, "learning_rate": 9.400139285228972e-05, "loss": 3.8357, "step": 7070 }, { "epoch": 0.4807038999864112, "grad_norm": 0.16173365712165833, "learning_rate": 9.399714635140645e-05, "loss": 4.0592, "step": 7075 }, { "epoch": 0.48104362005707296, "grad_norm": 0.3452107906341553, "learning_rate": 9.399289985052317e-05, "loss": 4.1379, "step": 7080 }, { "epoch": 0.48138334012773476, "grad_norm": 0.20402079820632935, "learning_rate": 9.39886533496399e-05, "loss": 4.2539, "step": 7085 }, { "epoch": 0.4817230601983965, "grad_norm": 0.3040589690208435, "learning_rate": 9.398440684875663e-05, "loss": 4.0941, "step": 7090 }, { "epoch": 0.4820627802690583, "grad_norm": 0.18901465833187103, "learning_rate": 9.398016034787336e-05, "loss": 4.0795, "step": 7095 }, { "epoch": 0.4824025003397201, "grad_norm": 0.1665019541978836, "learning_rate": 9.397591384699009e-05, "loss": 4.0337, "step": 7100 }, { "epoch": 0.48274222041038184, "grad_norm": 0.42847058176994324, "learning_rate": 9.397166734610681e-05, "loss": 4.108, "step": 7105 }, { "epoch": 0.48308194048104364, "grad_norm": 0.21919941902160645, "learning_rate": 9.396742084522354e-05, "loss": 4.2498, "step": 7110 }, { "epoch": 0.4834216605517054, "grad_norm": 0.7178875803947449, "learning_rate": 9.396317434434027e-05, "loss": 3.938, "step": 7115 }, { "epoch": 0.4837613806223672, "grad_norm": 0.15290340781211853, "learning_rate": 9.3958927843457e-05, "loss": 4.1507, "step": 7120 }, { "epoch": 0.4841011006930289, "grad_norm": 0.20199231803417206, "learning_rate": 9.395468134257373e-05, "loss": 4.165, "step": 7125 }, { "epoch": 0.4844408207636907, "grad_norm": 0.4050713777542114, "learning_rate": 9.395043484169045e-05, "loss": 4.1098, "step": 7130 }, { "epoch": 0.4847805408343525, "grad_norm": 0.22871138155460358, "learning_rate": 9.394618834080718e-05, "loss": 3.8713, "step": 7135 }, { "epoch": 0.48512026090501426, "grad_norm": 0.2092018574476242, "learning_rate": 9.394194183992391e-05, "loss": 4.13, "step": 7140 }, { "epoch": 0.48545998097567605, "grad_norm": 0.23514516651630402, "learning_rate": 9.393769533904064e-05, "loss": 4.1008, "step": 7145 }, { "epoch": 0.4857997010463378, "grad_norm": 0.19990748167037964, "learning_rate": 9.393344883815737e-05, "loss": 4.1277, "step": 7150 }, { "epoch": 0.4861394211169996, "grad_norm": 0.23331451416015625, "learning_rate": 9.39292023372741e-05, "loss": 4.1307, "step": 7155 }, { "epoch": 0.4864791411876614, "grad_norm": 0.4659624397754669, "learning_rate": 9.392495583639082e-05, "loss": 4.0373, "step": 7160 }, { "epoch": 0.48681886125832313, "grad_norm": 0.350339412689209, "learning_rate": 9.392070933550755e-05, "loss": 4.0145, "step": 7165 }, { "epoch": 0.48715858132898493, "grad_norm": 0.29896309971809387, "learning_rate": 9.391646283462428e-05, "loss": 3.7892, "step": 7170 }, { "epoch": 0.48749830139964667, "grad_norm": 0.2809394896030426, "learning_rate": 9.391221633374099e-05, "loss": 4.0247, "step": 7175 }, { "epoch": 0.48783802147030847, "grad_norm": 0.2700156271457672, "learning_rate": 9.390796983285773e-05, "loss": 4.0203, "step": 7180 }, { "epoch": 0.48817774154097027, "grad_norm": 0.2099238932132721, "learning_rate": 9.390372333197446e-05, "loss": 3.8992, "step": 7185 }, { "epoch": 0.488517461611632, "grad_norm": 0.1657472550868988, "learning_rate": 9.389947683109118e-05, "loss": 4.0689, "step": 7190 }, { "epoch": 0.4888571816822938, "grad_norm": 2.5052669048309326, "learning_rate": 9.389523033020792e-05, "loss": 4.2311, "step": 7195 }, { "epoch": 0.48919690175295555, "grad_norm": 0.18674081563949585, "learning_rate": 9.389098382932465e-05, "loss": 4.0554, "step": 7200 }, { "epoch": 0.48953662182361735, "grad_norm": 0.17581969499588013, "learning_rate": 9.388673732844136e-05, "loss": 3.9992, "step": 7205 }, { "epoch": 0.4898763418942791, "grad_norm": 0.2378435879945755, "learning_rate": 9.38824908275581e-05, "loss": 4.1965, "step": 7210 }, { "epoch": 0.4902160619649409, "grad_norm": 0.21895438432693481, "learning_rate": 9.387824432667483e-05, "loss": 3.9994, "step": 7215 }, { "epoch": 0.4905557820356027, "grad_norm": 0.18622034788131714, "learning_rate": 9.387399782579154e-05, "loss": 4.2384, "step": 7220 }, { "epoch": 0.4908955021062644, "grad_norm": 0.21598079800605774, "learning_rate": 9.386975132490829e-05, "loss": 4.2434, "step": 7225 }, { "epoch": 0.4912352221769262, "grad_norm": 1.403287410736084, "learning_rate": 9.386550482402501e-05, "loss": 3.9085, "step": 7230 }, { "epoch": 0.49157494224758796, "grad_norm": 0.31629684567451477, "learning_rate": 9.386125832314173e-05, "loss": 4.0834, "step": 7235 }, { "epoch": 0.49191466231824976, "grad_norm": 0.503648042678833, "learning_rate": 9.385701182225847e-05, "loss": 4.0346, "step": 7240 }, { "epoch": 0.49225438238891156, "grad_norm": 0.16960012912750244, "learning_rate": 9.38527653213752e-05, "loss": 4.0295, "step": 7245 }, { "epoch": 0.4925941024595733, "grad_norm": 0.2992265224456787, "learning_rate": 9.384851882049191e-05, "loss": 4.147, "step": 7250 }, { "epoch": 0.4929338225302351, "grad_norm": 0.37070542573928833, "learning_rate": 9.384427231960865e-05, "loss": 3.6991, "step": 7255 }, { "epoch": 0.49327354260089684, "grad_norm": 0.2252090573310852, "learning_rate": 9.384002581872537e-05, "loss": 4.2128, "step": 7260 }, { "epoch": 0.49361326267155864, "grad_norm": 0.1695706844329834, "learning_rate": 9.38357793178421e-05, "loss": 4.0321, "step": 7265 }, { "epoch": 0.49395298274222044, "grad_norm": 0.23216302692890167, "learning_rate": 9.383153281695884e-05, "loss": 3.9758, "step": 7270 }, { "epoch": 0.4942927028128822, "grad_norm": 0.9034651517868042, "learning_rate": 9.382728631607555e-05, "loss": 3.9329, "step": 7275 }, { "epoch": 0.494632422883544, "grad_norm": 0.21417531371116638, "learning_rate": 9.382303981519228e-05, "loss": 3.8799, "step": 7280 }, { "epoch": 0.4949721429542057, "grad_norm": 0.18411661684513092, "learning_rate": 9.381879331430902e-05, "loss": 4.2956, "step": 7285 }, { "epoch": 0.4953118630248675, "grad_norm": 0.23416036367416382, "learning_rate": 9.381454681342574e-05, "loss": 3.8779, "step": 7290 }, { "epoch": 0.49565158309552926, "grad_norm": 0.2349618524312973, "learning_rate": 9.381030031254246e-05, "loss": 4.027, "step": 7295 }, { "epoch": 0.49599130316619106, "grad_norm": 0.2522861659526825, "learning_rate": 9.38060538116592e-05, "loss": 4.1115, "step": 7300 }, { "epoch": 0.49633102323685285, "grad_norm": 0.18916946649551392, "learning_rate": 9.380180731077592e-05, "loss": 4.1002, "step": 7305 }, { "epoch": 0.4966707433075146, "grad_norm": 0.20227526128292084, "learning_rate": 9.379756080989265e-05, "loss": 4.0984, "step": 7310 }, { "epoch": 0.4970104633781764, "grad_norm": 0.20442931354045868, "learning_rate": 9.379331430900939e-05, "loss": 4.0096, "step": 7315 }, { "epoch": 0.49735018344883813, "grad_norm": 0.19708792865276337, "learning_rate": 9.37890678081261e-05, "loss": 4.2264, "step": 7320 }, { "epoch": 0.49768990351949993, "grad_norm": 0.2058994174003601, "learning_rate": 9.378482130724283e-05, "loss": 3.9661, "step": 7325 }, { "epoch": 0.49802962359016173, "grad_norm": 0.18831300735473633, "learning_rate": 9.378057480635956e-05, "loss": 4.0001, "step": 7330 }, { "epoch": 0.49836934366082347, "grad_norm": 0.5251606702804565, "learning_rate": 9.377632830547629e-05, "loss": 4.1706, "step": 7335 }, { "epoch": 0.49870906373148527, "grad_norm": 0.17007534205913544, "learning_rate": 9.377208180459302e-05, "loss": 4.1699, "step": 7340 }, { "epoch": 0.499048783802147, "grad_norm": 0.3484830856323242, "learning_rate": 9.376783530370974e-05, "loss": 3.944, "step": 7345 }, { "epoch": 0.4993885038728088, "grad_norm": 0.20382869243621826, "learning_rate": 9.376358880282647e-05, "loss": 3.7016, "step": 7350 }, { "epoch": 0.4997282239434706, "grad_norm": 0.2002745270729065, "learning_rate": 9.37593423019432e-05, "loss": 4.1238, "step": 7355 }, { "epoch": 0.5000679440141323, "grad_norm": 0.17399045825004578, "learning_rate": 9.375509580105993e-05, "loss": 4.0909, "step": 7360 }, { "epoch": 0.5004076640847941, "grad_norm": 0.24848084151744843, "learning_rate": 9.375084930017666e-05, "loss": 4.1379, "step": 7365 }, { "epoch": 0.5007473841554559, "grad_norm": 0.5024029016494751, "learning_rate": 9.374660279929338e-05, "loss": 4.2591, "step": 7370 }, { "epoch": 0.5010871042261177, "grad_norm": 0.22552575170993805, "learning_rate": 9.374235629841011e-05, "loss": 4.049, "step": 7375 }, { "epoch": 0.5014268242967794, "grad_norm": 0.2249031662940979, "learning_rate": 9.373810979752684e-05, "loss": 4.2421, "step": 7380 }, { "epoch": 0.5017665443674413, "grad_norm": 0.22408431768417358, "learning_rate": 9.373386329664357e-05, "loss": 4.3666, "step": 7385 }, { "epoch": 0.502106264438103, "grad_norm": 0.16393537819385529, "learning_rate": 9.37296167957603e-05, "loss": 3.7884, "step": 7390 }, { "epoch": 0.5024459845087648, "grad_norm": 0.25391802191734314, "learning_rate": 9.372537029487702e-05, "loss": 4.1085, "step": 7395 }, { "epoch": 0.5027857045794265, "grad_norm": 0.25248852372169495, "learning_rate": 9.372112379399375e-05, "loss": 4.0558, "step": 7400 }, { "epoch": 0.5031254246500884, "grad_norm": 0.2197033017873764, "learning_rate": 9.371687729311048e-05, "loss": 3.979, "step": 7405 }, { "epoch": 0.5034651447207501, "grad_norm": 0.20195040106773376, "learning_rate": 9.371263079222721e-05, "loss": 3.8918, "step": 7410 }, { "epoch": 0.5038048647914118, "grad_norm": 0.1969507336616516, "learning_rate": 9.370838429134394e-05, "loss": 3.9211, "step": 7415 }, { "epoch": 0.5041445848620737, "grad_norm": 0.26221612095832825, "learning_rate": 9.370413779046066e-05, "loss": 4.067, "step": 7420 }, { "epoch": 0.5044843049327354, "grad_norm": 0.20401246845722198, "learning_rate": 9.369989128957739e-05, "loss": 3.9535, "step": 7425 }, { "epoch": 0.5048240250033972, "grad_norm": 0.5490508675575256, "learning_rate": 9.369564478869412e-05, "loss": 3.9772, "step": 7430 }, { "epoch": 0.5051637450740589, "grad_norm": 0.1551188975572586, "learning_rate": 9.369139828781085e-05, "loss": 4.1125, "step": 7435 }, { "epoch": 0.5055034651447208, "grad_norm": 0.31335705518722534, "learning_rate": 9.368715178692758e-05, "loss": 4.0064, "step": 7440 }, { "epoch": 0.5058431852153825, "grad_norm": 0.23974861204624176, "learning_rate": 9.36829052860443e-05, "loss": 3.9679, "step": 7445 }, { "epoch": 0.5061829052860443, "grad_norm": 0.15649579465389252, "learning_rate": 9.367865878516103e-05, "loss": 4.005, "step": 7450 }, { "epoch": 0.5065226253567061, "grad_norm": 0.18585532903671265, "learning_rate": 9.367441228427776e-05, "loss": 3.8728, "step": 7455 }, { "epoch": 0.5068623454273679, "grad_norm": 0.2581709623336792, "learning_rate": 9.367016578339449e-05, "loss": 4.0771, "step": 7460 }, { "epoch": 0.5072020654980296, "grad_norm": 0.2468833178281784, "learning_rate": 9.366591928251122e-05, "loss": 4.054, "step": 7465 }, { "epoch": 0.5075417855686915, "grad_norm": 0.2019713968038559, "learning_rate": 9.366167278162794e-05, "loss": 3.6126, "step": 7470 }, { "epoch": 0.5078815056393532, "grad_norm": 0.25231555104255676, "learning_rate": 9.365742628074467e-05, "loss": 4.4004, "step": 7475 }, { "epoch": 0.5082212257100149, "grad_norm": 0.14785149693489075, "learning_rate": 9.36531797798614e-05, "loss": 3.7965, "step": 7480 }, { "epoch": 0.5085609457806767, "grad_norm": 0.21854764223098755, "learning_rate": 9.364893327897813e-05, "loss": 4.3081, "step": 7485 }, { "epoch": 0.5089006658513385, "grad_norm": 0.35712555050849915, "learning_rate": 9.364468677809486e-05, "loss": 3.7848, "step": 7490 }, { "epoch": 0.5092403859220003, "grad_norm": 0.9170289039611816, "learning_rate": 9.364044027721158e-05, "loss": 4.0845, "step": 7495 }, { "epoch": 0.509580105992662, "grad_norm": 0.19155257940292358, "learning_rate": 9.363619377632831e-05, "loss": 4.0807, "step": 7500 }, { "epoch": 0.5099198260633239, "grad_norm": 0.20362383127212524, "learning_rate": 9.363194727544504e-05, "loss": 4.1181, "step": 7505 }, { "epoch": 0.5102595461339856, "grad_norm": 1.610526204109192, "learning_rate": 9.362770077456177e-05, "loss": 4.1332, "step": 7510 }, { "epoch": 0.5105992662046474, "grad_norm": 0.36077892780303955, "learning_rate": 9.36234542736785e-05, "loss": 4.0379, "step": 7515 }, { "epoch": 0.5109389862753091, "grad_norm": 0.34184160828590393, "learning_rate": 9.361920777279522e-05, "loss": 4.0309, "step": 7520 }, { "epoch": 0.511278706345971, "grad_norm": 0.266156941652298, "learning_rate": 9.361496127191195e-05, "loss": 4.0715, "step": 7525 }, { "epoch": 0.5116184264166327, "grad_norm": 0.21247410774230957, "learning_rate": 9.361071477102867e-05, "loss": 4.1461, "step": 7530 }, { "epoch": 0.5119581464872944, "grad_norm": 0.3173115849494934, "learning_rate": 9.360646827014541e-05, "loss": 4.0909, "step": 7535 }, { "epoch": 0.5122978665579563, "grad_norm": 0.1932353973388672, "learning_rate": 9.360222176926214e-05, "loss": 4.0699, "step": 7540 }, { "epoch": 0.512637586628618, "grad_norm": 0.34887808561325073, "learning_rate": 9.359797526837885e-05, "loss": 3.8442, "step": 7545 }, { "epoch": 0.5129773066992798, "grad_norm": 0.1603212207555771, "learning_rate": 9.359372876749559e-05, "loss": 3.9252, "step": 7550 }, { "epoch": 0.5133170267699416, "grad_norm": 0.18673382699489594, "learning_rate": 9.358948226661232e-05, "loss": 4.132, "step": 7555 }, { "epoch": 0.5136567468406034, "grad_norm": 0.17931464314460754, "learning_rate": 9.358523576572904e-05, "loss": 4.2045, "step": 7560 }, { "epoch": 0.5139964669112651, "grad_norm": 0.20832332968711853, "learning_rate": 9.358098926484578e-05, "loss": 4.1134, "step": 7565 }, { "epoch": 0.5143361869819268, "grad_norm": 0.1900588423013687, "learning_rate": 9.35767427639625e-05, "loss": 4.1737, "step": 7570 }, { "epoch": 0.5146759070525887, "grad_norm": 0.25555694103240967, "learning_rate": 9.357249626307922e-05, "loss": 3.9879, "step": 7575 }, { "epoch": 0.5150156271232504, "grad_norm": 0.3741958737373352, "learning_rate": 9.356824976219596e-05, "loss": 4.0922, "step": 7580 }, { "epoch": 0.5153553471939122, "grad_norm": 0.25290772318840027, "learning_rate": 9.356400326131269e-05, "loss": 4.1101, "step": 7585 }, { "epoch": 0.515695067264574, "grad_norm": 0.23055055737495422, "learning_rate": 9.35597567604294e-05, "loss": 3.9841, "step": 7590 }, { "epoch": 0.5160347873352358, "grad_norm": 0.2004847675561905, "learning_rate": 9.355551025954614e-05, "loss": 4.0573, "step": 7595 }, { "epoch": 0.5163745074058975, "grad_norm": 0.17430374026298523, "learning_rate": 9.355126375866287e-05, "loss": 4.1038, "step": 7600 }, { "epoch": 0.5167142274765593, "grad_norm": 0.19129951298236847, "learning_rate": 9.354701725777959e-05, "loss": 3.9517, "step": 7605 }, { "epoch": 0.5170539475472211, "grad_norm": 0.24866996705532074, "learning_rate": 9.354277075689633e-05, "loss": 3.9927, "step": 7610 }, { "epoch": 0.5173936676178829, "grad_norm": 0.2479562759399414, "learning_rate": 9.353852425601304e-05, "loss": 3.824, "step": 7615 }, { "epoch": 0.5177333876885446, "grad_norm": 0.25535717606544495, "learning_rate": 9.353427775512977e-05, "loss": 4.1203, "step": 7620 }, { "epoch": 0.5180731077592065, "grad_norm": 0.19401293992996216, "learning_rate": 9.353003125424651e-05, "loss": 4.2723, "step": 7625 }, { "epoch": 0.5184128278298682, "grad_norm": 0.25242581963539124, "learning_rate": 9.352578475336323e-05, "loss": 3.7278, "step": 7630 }, { "epoch": 0.5187525479005299, "grad_norm": 0.24895191192626953, "learning_rate": 9.352153825247996e-05, "loss": 4.0693, "step": 7635 }, { "epoch": 0.5190922679711918, "grad_norm": 0.208522230386734, "learning_rate": 9.35172917515967e-05, "loss": 4.105, "step": 7640 }, { "epoch": 0.5194319880418535, "grad_norm": 0.21266648173332214, "learning_rate": 9.351304525071341e-05, "loss": 4.0295, "step": 7645 }, { "epoch": 0.5197717081125153, "grad_norm": 0.1788320690393448, "learning_rate": 9.350879874983014e-05, "loss": 3.9959, "step": 7650 }, { "epoch": 0.520111428183177, "grad_norm": 0.36194828152656555, "learning_rate": 9.350455224894688e-05, "loss": 4.0003, "step": 7655 }, { "epoch": 0.5204511482538389, "grad_norm": 0.22471646964550018, "learning_rate": 9.35003057480636e-05, "loss": 4.0001, "step": 7660 }, { "epoch": 0.5207908683245006, "grad_norm": 0.20140118896961212, "learning_rate": 9.349605924718032e-05, "loss": 3.802, "step": 7665 }, { "epoch": 0.5211305883951624, "grad_norm": 0.2610224485397339, "learning_rate": 9.349181274629706e-05, "loss": 4.0378, "step": 7670 }, { "epoch": 0.5214703084658242, "grad_norm": 0.19528135657310486, "learning_rate": 9.348756624541378e-05, "loss": 4.059, "step": 7675 }, { "epoch": 0.521810028536486, "grad_norm": 0.1775432527065277, "learning_rate": 9.348331974453051e-05, "loss": 4.0532, "step": 7680 }, { "epoch": 0.5221497486071477, "grad_norm": 4.0261311531066895, "learning_rate": 9.347907324364724e-05, "loss": 4.1247, "step": 7685 }, { "epoch": 0.5224894686778094, "grad_norm": 0.18927231431007385, "learning_rate": 9.347482674276396e-05, "loss": 4.1597, "step": 7690 }, { "epoch": 0.5228291887484713, "grad_norm": 0.17525963485240936, "learning_rate": 9.347058024188069e-05, "loss": 3.7783, "step": 7695 }, { "epoch": 0.523168908819133, "grad_norm": 0.15147970616817474, "learning_rate": 9.346633374099742e-05, "loss": 4.017, "step": 7700 }, { "epoch": 0.5235086288897948, "grad_norm": 0.17710480093955994, "learning_rate": 9.346208724011415e-05, "loss": 4.3239, "step": 7705 }, { "epoch": 0.5238483489604566, "grad_norm": 0.1875525414943695, "learning_rate": 9.345784073923088e-05, "loss": 3.9417, "step": 7710 }, { "epoch": 0.5241880690311184, "grad_norm": 0.2575678825378418, "learning_rate": 9.34535942383476e-05, "loss": 4.1485, "step": 7715 }, { "epoch": 0.5245277891017801, "grad_norm": 0.8326651453971863, "learning_rate": 9.344934773746433e-05, "loss": 4.0598, "step": 7720 }, { "epoch": 0.524867509172442, "grad_norm": 0.1673835813999176, "learning_rate": 9.344510123658106e-05, "loss": 4.1255, "step": 7725 }, { "epoch": 0.5252072292431037, "grad_norm": 0.764521598815918, "learning_rate": 9.344085473569779e-05, "loss": 3.9019, "step": 7730 }, { "epoch": 0.5255469493137654, "grad_norm": 0.17800885438919067, "learning_rate": 9.343660823481452e-05, "loss": 3.7869, "step": 7735 }, { "epoch": 0.5258866693844272, "grad_norm": 0.1920061856508255, "learning_rate": 9.343236173393124e-05, "loss": 4.1229, "step": 7740 }, { "epoch": 0.526226389455089, "grad_norm": 3.5604405403137207, "learning_rate": 9.342811523304797e-05, "loss": 3.9478, "step": 7745 }, { "epoch": 0.5265661095257508, "grad_norm": 0.194805309176445, "learning_rate": 9.34238687321647e-05, "loss": 4.2643, "step": 7750 }, { "epoch": 0.5269058295964125, "grad_norm": 0.22051791846752167, "learning_rate": 9.341962223128143e-05, "loss": 4.1483, "step": 7755 }, { "epoch": 0.5272455496670744, "grad_norm": 0.22619082033634186, "learning_rate": 9.341622503057481e-05, "loss": 3.5109, "step": 7760 }, { "epoch": 0.5275852697377361, "grad_norm": 0.2676302194595337, "learning_rate": 9.341197852969154e-05, "loss": 4.174, "step": 7765 }, { "epoch": 0.5279249898083979, "grad_norm": 0.25767549872398376, "learning_rate": 9.340773202880827e-05, "loss": 3.9739, "step": 7770 }, { "epoch": 0.5282647098790596, "grad_norm": 0.18924906849861145, "learning_rate": 9.3403485527925e-05, "loss": 4.2547, "step": 7775 }, { "epoch": 0.5286044299497215, "grad_norm": 0.1977754682302475, "learning_rate": 9.339923902704172e-05, "loss": 3.8264, "step": 7780 }, { "epoch": 0.5289441500203832, "grad_norm": 0.1865202784538269, "learning_rate": 9.339499252615845e-05, "loss": 4.2391, "step": 7785 }, { "epoch": 0.5292838700910449, "grad_norm": 0.22030463814735413, "learning_rate": 9.339074602527518e-05, "loss": 4.1641, "step": 7790 }, { "epoch": 0.5296235901617068, "grad_norm": 0.1868383139371872, "learning_rate": 9.33864995243919e-05, "loss": 4.247, "step": 7795 }, { "epoch": 0.5299633102323685, "grad_norm": 0.19364790618419647, "learning_rate": 9.338225302350863e-05, "loss": 3.9474, "step": 7800 }, { "epoch": 0.5303030303030303, "grad_norm": 0.1819707751274109, "learning_rate": 9.337800652262536e-05, "loss": 4.0559, "step": 7805 }, { "epoch": 0.5306427503736921, "grad_norm": 0.25897395610809326, "learning_rate": 9.337376002174209e-05, "loss": 3.9502, "step": 7810 }, { "epoch": 0.5309824704443539, "grad_norm": 0.23874245584011078, "learning_rate": 9.336951352085882e-05, "loss": 3.89, "step": 7815 }, { "epoch": 0.5313221905150156, "grad_norm": 0.16747428476810455, "learning_rate": 9.336526701997555e-05, "loss": 4.0011, "step": 7820 }, { "epoch": 0.5316619105856774, "grad_norm": 0.17043693363666534, "learning_rate": 9.336102051909227e-05, "loss": 4.0938, "step": 7825 }, { "epoch": 0.5320016306563392, "grad_norm": 0.19424232840538025, "learning_rate": 9.3356774018209e-05, "loss": 3.9479, "step": 7830 }, { "epoch": 0.532341350727001, "grad_norm": 0.1900462657213211, "learning_rate": 9.335252751732573e-05, "loss": 4.2105, "step": 7835 }, { "epoch": 0.5326810707976627, "grad_norm": 0.2120985984802246, "learning_rate": 9.334828101644246e-05, "loss": 4.2686, "step": 7840 }, { "epoch": 0.5330207908683245, "grad_norm": 0.24349243938922882, "learning_rate": 9.334403451555919e-05, "loss": 4.09, "step": 7845 }, { "epoch": 0.5333605109389863, "grad_norm": 0.22167713940143585, "learning_rate": 9.333978801467591e-05, "loss": 4.0405, "step": 7850 }, { "epoch": 0.533700231009648, "grad_norm": 0.2967727780342102, "learning_rate": 9.333554151379263e-05, "loss": 4.3056, "step": 7855 }, { "epoch": 0.5340399510803098, "grad_norm": 0.1543935239315033, "learning_rate": 9.333129501290937e-05, "loss": 4.0946, "step": 7860 }, { "epoch": 0.5343796711509716, "grad_norm": 0.27082520723342896, "learning_rate": 9.33270485120261e-05, "loss": 3.9952, "step": 7865 }, { "epoch": 0.5347193912216334, "grad_norm": 0.27814018726348877, "learning_rate": 9.332280201114281e-05, "loss": 4.2333, "step": 7870 }, { "epoch": 0.5350591112922951, "grad_norm": 0.18961313366889954, "learning_rate": 9.331855551025955e-05, "loss": 4.1017, "step": 7875 }, { "epoch": 0.535398831362957, "grad_norm": 0.19121582806110382, "learning_rate": 9.331430900937628e-05, "loss": 4.1663, "step": 7880 }, { "epoch": 0.5357385514336187, "grad_norm": 0.20209085941314697, "learning_rate": 9.3310062508493e-05, "loss": 3.9602, "step": 7885 }, { "epoch": 0.5360782715042804, "grad_norm": 0.222623810172081, "learning_rate": 9.330581600760974e-05, "loss": 4.208, "step": 7890 }, { "epoch": 0.5364179915749423, "grad_norm": 0.2834756672382355, "learning_rate": 9.330156950672647e-05, "loss": 3.8017, "step": 7895 }, { "epoch": 0.536757711645604, "grad_norm": 0.26975885033607483, "learning_rate": 9.329732300584318e-05, "loss": 3.8618, "step": 7900 }, { "epoch": 0.5370974317162658, "grad_norm": 0.24554960429668427, "learning_rate": 9.329307650495992e-05, "loss": 4.1339, "step": 7905 }, { "epoch": 0.5374371517869275, "grad_norm": 0.19073788821697235, "learning_rate": 9.328883000407665e-05, "loss": 4.1798, "step": 7910 }, { "epoch": 0.5377768718575894, "grad_norm": 0.1700531542301178, "learning_rate": 9.328458350319336e-05, "loss": 4.1152, "step": 7915 }, { "epoch": 0.5381165919282511, "grad_norm": 0.2178450971841812, "learning_rate": 9.32803370023101e-05, "loss": 3.9352, "step": 7920 }, { "epoch": 0.5384563119989129, "grad_norm": 0.20611537992954254, "learning_rate": 9.327609050142682e-05, "loss": 4.0871, "step": 7925 }, { "epoch": 0.5387960320695747, "grad_norm": 0.20074544847011566, "learning_rate": 9.327184400054355e-05, "loss": 4.1689, "step": 7930 }, { "epoch": 0.5391357521402365, "grad_norm": 0.18785405158996582, "learning_rate": 9.326759749966029e-05, "loss": 4.0912, "step": 7935 }, { "epoch": 0.5394754722108982, "grad_norm": 0.2205738127231598, "learning_rate": 9.3263350998777e-05, "loss": 4.0251, "step": 7940 }, { "epoch": 0.5398151922815599, "grad_norm": 0.2106117606163025, "learning_rate": 9.325910449789373e-05, "loss": 3.9616, "step": 7945 }, { "epoch": 0.5401549123522218, "grad_norm": 0.20841112732887268, "learning_rate": 9.325485799701047e-05, "loss": 4.1365, "step": 7950 }, { "epoch": 0.5404946324228835, "grad_norm": 0.20364391803741455, "learning_rate": 9.325061149612719e-05, "loss": 3.8211, "step": 7955 }, { "epoch": 0.5408343524935453, "grad_norm": 0.1856883019208908, "learning_rate": 9.324636499524393e-05, "loss": 4.102, "step": 7960 }, { "epoch": 0.5411740725642071, "grad_norm": 0.22017249464988708, "learning_rate": 9.324211849436066e-05, "loss": 3.9317, "step": 7965 }, { "epoch": 0.5415137926348689, "grad_norm": 0.2391664832830429, "learning_rate": 9.323787199347737e-05, "loss": 3.975, "step": 7970 }, { "epoch": 0.5418535127055306, "grad_norm": 0.15632979571819305, "learning_rate": 9.323362549259411e-05, "loss": 4.1384, "step": 7975 }, { "epoch": 0.5421932327761925, "grad_norm": 0.18496164679527283, "learning_rate": 9.322937899171084e-05, "loss": 4.1871, "step": 7980 }, { "epoch": 0.5425329528468542, "grad_norm": 0.18015219271183014, "learning_rate": 9.322513249082756e-05, "loss": 3.9501, "step": 7985 }, { "epoch": 0.542872672917516, "grad_norm": 0.19946110248565674, "learning_rate": 9.32208859899443e-05, "loss": 3.928, "step": 7990 }, { "epoch": 0.5432123929881777, "grad_norm": 0.18122997879981995, "learning_rate": 9.321663948906103e-05, "loss": 4.0794, "step": 7995 }, { "epoch": 0.5435521130588395, "grad_norm": 0.2898665964603424, "learning_rate": 9.321239298817774e-05, "loss": 4.1946, "step": 8000 }, { "epoch": 0.5438918331295013, "grad_norm": 0.19237551093101501, "learning_rate": 9.320814648729448e-05, "loss": 3.8687, "step": 8005 }, { "epoch": 0.544231553200163, "grad_norm": 0.18704599142074585, "learning_rate": 9.32038999864112e-05, "loss": 4.0483, "step": 8010 }, { "epoch": 0.5445712732708249, "grad_norm": 0.16550405323505402, "learning_rate": 9.319965348552792e-05, "loss": 3.927, "step": 8015 }, { "epoch": 0.5449109933414866, "grad_norm": 0.20201174914836884, "learning_rate": 9.319540698464467e-05, "loss": 4.1593, "step": 8020 }, { "epoch": 0.5452507134121484, "grad_norm": 0.19555409252643585, "learning_rate": 9.319116048376138e-05, "loss": 4.2645, "step": 8025 }, { "epoch": 0.5455904334828101, "grad_norm": 0.20266516506671906, "learning_rate": 9.318691398287811e-05, "loss": 4.165, "step": 8030 }, { "epoch": 0.545930153553472, "grad_norm": 0.18069981038570404, "learning_rate": 9.318266748199485e-05, "loss": 4.0198, "step": 8035 }, { "epoch": 0.5462698736241337, "grad_norm": 0.1856188029050827, "learning_rate": 9.317842098111156e-05, "loss": 3.9087, "step": 8040 }, { "epoch": 0.5466095936947954, "grad_norm": 0.19204466044902802, "learning_rate": 9.317417448022829e-05, "loss": 4.0723, "step": 8045 }, { "epoch": 0.5469493137654573, "grad_norm": 0.19473916292190552, "learning_rate": 9.316992797934503e-05, "loss": 3.9512, "step": 8050 }, { "epoch": 0.547289033836119, "grad_norm": 0.23411086201667786, "learning_rate": 9.316568147846175e-05, "loss": 4.019, "step": 8055 }, { "epoch": 0.5476287539067808, "grad_norm": 0.17357757687568665, "learning_rate": 9.316143497757848e-05, "loss": 3.9088, "step": 8060 }, { "epoch": 0.5479684739774426, "grad_norm": 0.19055502116680145, "learning_rate": 9.315718847669522e-05, "loss": 3.9099, "step": 8065 }, { "epoch": 0.5483081940481044, "grad_norm": 0.21026012301445007, "learning_rate": 9.315294197581193e-05, "loss": 4.0825, "step": 8070 }, { "epoch": 0.5486479141187661, "grad_norm": 0.1714550107717514, "learning_rate": 9.314869547492866e-05, "loss": 4.1415, "step": 8075 }, { "epoch": 0.5489876341894279, "grad_norm": 0.18270696699619293, "learning_rate": 9.314444897404539e-05, "loss": 4.0709, "step": 8080 }, { "epoch": 0.5493273542600897, "grad_norm": 0.1924707442522049, "learning_rate": 9.314020247316212e-05, "loss": 4.0765, "step": 8085 }, { "epoch": 0.5496670743307515, "grad_norm": 1.1790002584457397, "learning_rate": 9.313595597227884e-05, "loss": 4.0, "step": 8090 }, { "epoch": 0.5500067944014132, "grad_norm": 0.2330000251531601, "learning_rate": 9.313170947139557e-05, "loss": 4.0367, "step": 8095 }, { "epoch": 0.550346514472075, "grad_norm": 0.21324478089809418, "learning_rate": 9.31274629705123e-05, "loss": 3.9346, "step": 8100 }, { "epoch": 0.5506862345427368, "grad_norm": 0.1989852339029312, "learning_rate": 9.312321646962903e-05, "loss": 3.883, "step": 8105 }, { "epoch": 0.5510259546133985, "grad_norm": 0.2093362659215927, "learning_rate": 9.311896996874576e-05, "loss": 3.945, "step": 8110 }, { "epoch": 0.5513656746840603, "grad_norm": 0.17465944588184357, "learning_rate": 9.311472346786248e-05, "loss": 3.9291, "step": 8115 }, { "epoch": 0.5517053947547221, "grad_norm": 0.7979787588119507, "learning_rate": 9.311047696697921e-05, "loss": 3.9622, "step": 8120 }, { "epoch": 0.5520451148253839, "grad_norm": 0.20573332905769348, "learning_rate": 9.310623046609594e-05, "loss": 4.0717, "step": 8125 }, { "epoch": 0.5523848348960456, "grad_norm": 0.8882215023040771, "learning_rate": 9.310198396521267e-05, "loss": 4.045, "step": 8130 }, { "epoch": 0.5527245549667075, "grad_norm": 0.1780032366514206, "learning_rate": 9.30977374643294e-05, "loss": 3.8284, "step": 8135 }, { "epoch": 0.5530642750373692, "grad_norm": 0.19671432673931122, "learning_rate": 9.309349096344612e-05, "loss": 4.0132, "step": 8140 }, { "epoch": 0.553403995108031, "grad_norm": 0.17597247660160065, "learning_rate": 9.308924446256285e-05, "loss": 4.0527, "step": 8145 }, { "epoch": 0.5537437151786928, "grad_norm": 0.19633089005947113, "learning_rate": 9.308499796167958e-05, "loss": 3.8693, "step": 8150 }, { "epoch": 0.5540834352493546, "grad_norm": 0.19133026897907257, "learning_rate": 9.308075146079631e-05, "loss": 3.9765, "step": 8155 }, { "epoch": 0.5544231553200163, "grad_norm": 0.18762901425361633, "learning_rate": 9.307650495991304e-05, "loss": 3.9366, "step": 8160 }, { "epoch": 0.554762875390678, "grad_norm": 0.21953530609607697, "learning_rate": 9.307225845902976e-05, "loss": 4.1145, "step": 8165 }, { "epoch": 0.5551025954613399, "grad_norm": 0.1631409376859665, "learning_rate": 9.306801195814649e-05, "loss": 4.3299, "step": 8170 }, { "epoch": 0.5554423155320016, "grad_norm": 0.1782941222190857, "learning_rate": 9.306376545726322e-05, "loss": 4.0921, "step": 8175 }, { "epoch": 0.5557820356026634, "grad_norm": 0.18272286653518677, "learning_rate": 9.305951895637995e-05, "loss": 4.0609, "step": 8180 }, { "epoch": 0.5561217556733252, "grad_norm": 0.5140985250473022, "learning_rate": 9.305527245549668e-05, "loss": 3.9297, "step": 8185 }, { "epoch": 0.556461475743987, "grad_norm": 1.4922987222671509, "learning_rate": 9.30510259546134e-05, "loss": 4.0238, "step": 8190 }, { "epoch": 0.5568011958146487, "grad_norm": 3.8372085094451904, "learning_rate": 9.304677945373013e-05, "loss": 4.1705, "step": 8195 }, { "epoch": 0.5571409158853105, "grad_norm": 0.1523323357105255, "learning_rate": 9.304253295284686e-05, "loss": 4.0109, "step": 8200 }, { "epoch": 0.5574806359559723, "grad_norm": 0.2263563573360443, "learning_rate": 9.303828645196359e-05, "loss": 4.0453, "step": 8205 }, { "epoch": 0.557820356026634, "grad_norm": 0.20858174562454224, "learning_rate": 9.30340399510803e-05, "loss": 4.0197, "step": 8210 }, { "epoch": 0.5581600760972958, "grad_norm": 0.2111169844865799, "learning_rate": 9.302979345019704e-05, "loss": 3.9775, "step": 8215 }, { "epoch": 0.5584997961679576, "grad_norm": 0.16785529255867004, "learning_rate": 9.302554694931377e-05, "loss": 4.1782, "step": 8220 }, { "epoch": 0.5588395162386194, "grad_norm": 0.21160061657428741, "learning_rate": 9.302130044843049e-05, "loss": 3.7809, "step": 8225 }, { "epoch": 0.5591792363092811, "grad_norm": 0.34842145442962646, "learning_rate": 9.301705394754723e-05, "loss": 4.0454, "step": 8230 }, { "epoch": 0.559518956379943, "grad_norm": 0.22332549095153809, "learning_rate": 9.301280744666396e-05, "loss": 3.9447, "step": 8235 }, { "epoch": 0.5598586764506047, "grad_norm": 0.33598944544792175, "learning_rate": 9.300856094578067e-05, "loss": 4.2742, "step": 8240 }, { "epoch": 0.5601983965212665, "grad_norm": 0.18852943181991577, "learning_rate": 9.300431444489741e-05, "loss": 3.9494, "step": 8245 }, { "epoch": 0.5605381165919282, "grad_norm": 0.1902894675731659, "learning_rate": 9.300006794401414e-05, "loss": 4.0114, "step": 8250 }, { "epoch": 0.5608778366625901, "grad_norm": 0.3319913446903229, "learning_rate": 9.299582144313086e-05, "loss": 4.1077, "step": 8255 }, { "epoch": 0.5612175567332518, "grad_norm": 1.5811398029327393, "learning_rate": 9.29915749422476e-05, "loss": 3.9531, "step": 8260 }, { "epoch": 0.5615572768039135, "grad_norm": 0.18648011982440948, "learning_rate": 9.298732844136433e-05, "loss": 4.0406, "step": 8265 }, { "epoch": 0.5618969968745754, "grad_norm": 0.20678003132343292, "learning_rate": 9.298308194048104e-05, "loss": 4.0141, "step": 8270 }, { "epoch": 0.5622367169452371, "grad_norm": 0.284509539604187, "learning_rate": 9.297883543959778e-05, "loss": 3.7511, "step": 8275 }, { "epoch": 0.5625764370158989, "grad_norm": 0.23593567311763763, "learning_rate": 9.29745889387145e-05, "loss": 3.9993, "step": 8280 }, { "epoch": 0.5629161570865606, "grad_norm": 0.221805140376091, "learning_rate": 9.297034243783122e-05, "loss": 4.0023, "step": 8285 }, { "epoch": 0.5632558771572225, "grad_norm": 0.1965785026550293, "learning_rate": 9.296609593694797e-05, "loss": 4.0849, "step": 8290 }, { "epoch": 0.5635955972278842, "grad_norm": 0.3942621648311615, "learning_rate": 9.296184943606468e-05, "loss": 4.0986, "step": 8295 }, { "epoch": 0.563935317298546, "grad_norm": 0.14950452744960785, "learning_rate": 9.295760293518142e-05, "loss": 4.0598, "step": 8300 }, { "epoch": 0.5642750373692078, "grad_norm": 0.19888624548912048, "learning_rate": 9.295335643429815e-05, "loss": 4.1756, "step": 8305 }, { "epoch": 0.5646147574398696, "grad_norm": 0.18374282121658325, "learning_rate": 9.294910993341486e-05, "loss": 4.0213, "step": 8310 }, { "epoch": 0.5649544775105313, "grad_norm": 0.19329077005386353, "learning_rate": 9.29448634325316e-05, "loss": 3.8402, "step": 8315 }, { "epoch": 0.5652941975811931, "grad_norm": 0.4161980152130127, "learning_rate": 9.294061693164833e-05, "loss": 3.8012, "step": 8320 }, { "epoch": 0.5656339176518549, "grad_norm": 0.20671139657497406, "learning_rate": 9.293637043076505e-05, "loss": 4.335, "step": 8325 }, { "epoch": 0.5659736377225166, "grad_norm": 0.20330487191677094, "learning_rate": 9.293212392988179e-05, "loss": 4.0325, "step": 8330 }, { "epoch": 0.5663133577931784, "grad_norm": 0.19175422191619873, "learning_rate": 9.292787742899852e-05, "loss": 4.1178, "step": 8335 }, { "epoch": 0.5666530778638402, "grad_norm": 0.22935859858989716, "learning_rate": 9.292363092811523e-05, "loss": 4.1405, "step": 8340 }, { "epoch": 0.566992797934502, "grad_norm": 0.17749212682247162, "learning_rate": 9.291938442723197e-05, "loss": 3.9575, "step": 8345 }, { "epoch": 0.5673325180051637, "grad_norm": 1.0764920711517334, "learning_rate": 9.291513792634869e-05, "loss": 4.3775, "step": 8350 }, { "epoch": 0.5676722380758256, "grad_norm": 0.15739497542381287, "learning_rate": 9.291089142546542e-05, "loss": 4.1305, "step": 8355 }, { "epoch": 0.5680119581464873, "grad_norm": 0.19351720809936523, "learning_rate": 9.290664492458216e-05, "loss": 3.8318, "step": 8360 }, { "epoch": 0.568351678217149, "grad_norm": 0.16042912006378174, "learning_rate": 9.290239842369887e-05, "loss": 4.1028, "step": 8365 }, { "epoch": 0.5686913982878108, "grad_norm": 0.1768733263015747, "learning_rate": 9.28981519228156e-05, "loss": 3.8895, "step": 8370 }, { "epoch": 0.5690311183584726, "grad_norm": 0.15216295421123505, "learning_rate": 9.289390542193234e-05, "loss": 4.1829, "step": 8375 }, { "epoch": 0.5693708384291344, "grad_norm": 0.20485453307628632, "learning_rate": 9.288965892104906e-05, "loss": 4.1128, "step": 8380 }, { "epoch": 0.5697105584997961, "grad_norm": 0.2530359625816345, "learning_rate": 9.288541242016578e-05, "loss": 4.0308, "step": 8385 }, { "epoch": 0.570050278570458, "grad_norm": 0.24390393495559692, "learning_rate": 9.288116591928253e-05, "loss": 3.9455, "step": 8390 }, { "epoch": 0.5703899986411197, "grad_norm": 0.15447057783603668, "learning_rate": 9.287691941839924e-05, "loss": 4.2323, "step": 8395 }, { "epoch": 0.5707297187117815, "grad_norm": 0.262494295835495, "learning_rate": 9.287267291751597e-05, "loss": 3.8537, "step": 8400 }, { "epoch": 0.5710694387824433, "grad_norm": 0.2734646499156952, "learning_rate": 9.286842641663271e-05, "loss": 4.0975, "step": 8405 }, { "epoch": 0.5714091588531051, "grad_norm": 0.3060397803783417, "learning_rate": 9.286417991574942e-05, "loss": 4.1002, "step": 8410 }, { "epoch": 0.5717488789237668, "grad_norm": 0.194105863571167, "learning_rate": 9.285993341486615e-05, "loss": 3.7785, "step": 8415 }, { "epoch": 0.5720885989944285, "grad_norm": 0.19388654828071594, "learning_rate": 9.28556869139829e-05, "loss": 4.0577, "step": 8420 }, { "epoch": 0.5724283190650904, "grad_norm": 0.4632340669631958, "learning_rate": 9.285144041309961e-05, "loss": 4.0301, "step": 8425 }, { "epoch": 0.5727680391357521, "grad_norm": 0.23817621171474457, "learning_rate": 9.284719391221634e-05, "loss": 3.9506, "step": 8430 }, { "epoch": 0.5731077592064139, "grad_norm": 0.26604166626930237, "learning_rate": 9.284294741133306e-05, "loss": 3.7005, "step": 8435 }, { "epoch": 0.5734474792770757, "grad_norm": 0.21348132193088531, "learning_rate": 9.283870091044979e-05, "loss": 4.0274, "step": 8440 }, { "epoch": 0.5737871993477375, "grad_norm": 0.3157108724117279, "learning_rate": 9.283445440956652e-05, "loss": 3.8789, "step": 8445 }, { "epoch": 0.5741269194183992, "grad_norm": 0.6310774087905884, "learning_rate": 9.283020790868325e-05, "loss": 4.0599, "step": 8450 }, { "epoch": 0.574466639489061, "grad_norm": 0.18738152086734772, "learning_rate": 9.282596140779998e-05, "loss": 3.8024, "step": 8455 }, { "epoch": 0.5748063595597228, "grad_norm": 0.3210128843784332, "learning_rate": 9.28217149069167e-05, "loss": 4.1008, "step": 8460 }, { "epoch": 0.5751460796303846, "grad_norm": 0.180195152759552, "learning_rate": 9.281746840603343e-05, "loss": 4.0769, "step": 8465 }, { "epoch": 0.5754857997010463, "grad_norm": 0.17868779599666595, "learning_rate": 9.281322190515016e-05, "loss": 3.9685, "step": 8470 }, { "epoch": 0.5758255197717081, "grad_norm": 0.5827326774597168, "learning_rate": 9.280897540426689e-05, "loss": 3.9109, "step": 8475 }, { "epoch": 0.5761652398423699, "grad_norm": 0.3124859631061554, "learning_rate": 9.280472890338362e-05, "loss": 3.8527, "step": 8480 }, { "epoch": 0.5765049599130316, "grad_norm": 0.42251700162887573, "learning_rate": 9.280048240250034e-05, "loss": 4.1552, "step": 8485 }, { "epoch": 0.5768446799836935, "grad_norm": 0.1950671374797821, "learning_rate": 9.279623590161707e-05, "loss": 3.836, "step": 8490 }, { "epoch": 0.5771844000543552, "grad_norm": 0.8381164073944092, "learning_rate": 9.27919894007338e-05, "loss": 4.0999, "step": 8495 }, { "epoch": 0.577524120125017, "grad_norm": 0.15643952786922455, "learning_rate": 9.278774289985053e-05, "loss": 3.9804, "step": 8500 }, { "epoch": 0.5778638401956787, "grad_norm": 0.21564146876335144, "learning_rate": 9.278349639896726e-05, "loss": 3.9151, "step": 8505 }, { "epoch": 0.5782035602663406, "grad_norm": 0.21207468211650848, "learning_rate": 9.277924989808398e-05, "loss": 3.9481, "step": 8510 }, { "epoch": 0.5785432803370023, "grad_norm": 0.19761881232261658, "learning_rate": 9.277500339720071e-05, "loss": 4.0216, "step": 8515 }, { "epoch": 0.578883000407664, "grad_norm": 0.18729913234710693, "learning_rate": 9.277075689631744e-05, "loss": 3.9112, "step": 8520 }, { "epoch": 0.5792227204783259, "grad_norm": 0.13949252665042877, "learning_rate": 9.276651039543417e-05, "loss": 3.928, "step": 8525 }, { "epoch": 0.5795624405489876, "grad_norm": 0.1481368988752365, "learning_rate": 9.27622638945509e-05, "loss": 4.1387, "step": 8530 }, { "epoch": 0.5799021606196494, "grad_norm": 0.2295740246772766, "learning_rate": 9.275801739366762e-05, "loss": 4.0267, "step": 8535 }, { "epoch": 0.5802418806903111, "grad_norm": 0.22337546944618225, "learning_rate": 9.275377089278435e-05, "loss": 3.927, "step": 8540 }, { "epoch": 0.580581600760973, "grad_norm": 0.41308972239494324, "learning_rate": 9.274952439190108e-05, "loss": 4.1804, "step": 8545 }, { "epoch": 0.5809213208316347, "grad_norm": 0.15988604724407196, "learning_rate": 9.27452778910178e-05, "loss": 3.8497, "step": 8550 }, { "epoch": 0.5812610409022965, "grad_norm": 0.17153027653694153, "learning_rate": 9.274103139013454e-05, "loss": 4.1164, "step": 8555 }, { "epoch": 0.5816007609729583, "grad_norm": 0.2415970116853714, "learning_rate": 9.273678488925126e-05, "loss": 4.1781, "step": 8560 }, { "epoch": 0.5819404810436201, "grad_norm": 2.764481544494629, "learning_rate": 9.273253838836798e-05, "loss": 3.9708, "step": 8565 }, { "epoch": 0.5822802011142818, "grad_norm": 0.1980726718902588, "learning_rate": 9.272829188748472e-05, "loss": 4.0011, "step": 8570 }, { "epoch": 0.5826199211849437, "grad_norm": 0.23337212204933167, "learning_rate": 9.272404538660145e-05, "loss": 3.9122, "step": 8575 }, { "epoch": 0.5829596412556054, "grad_norm": 0.1616441160440445, "learning_rate": 9.271979888571816e-05, "loss": 3.8085, "step": 8580 }, { "epoch": 0.5832993613262671, "grad_norm": 0.19544613361358643, "learning_rate": 9.27155523848349e-05, "loss": 4.0634, "step": 8585 }, { "epoch": 0.5836390813969289, "grad_norm": 0.19798190891742706, "learning_rate": 9.271130588395163e-05, "loss": 3.9989, "step": 8590 }, { "epoch": 0.5839788014675907, "grad_norm": 0.1608157753944397, "learning_rate": 9.270705938306835e-05, "loss": 4.3514, "step": 8595 }, { "epoch": 0.5843185215382525, "grad_norm": 0.23190398514270782, "learning_rate": 9.270281288218509e-05, "loss": 3.9989, "step": 8600 }, { "epoch": 0.5846582416089142, "grad_norm": 0.694850742816925, "learning_rate": 9.269856638130182e-05, "loss": 4.3087, "step": 8605 }, { "epoch": 0.5849979616795761, "grad_norm": 0.2711631655693054, "learning_rate": 9.269431988041853e-05, "loss": 3.9979, "step": 8610 }, { "epoch": 0.5853376817502378, "grad_norm": 0.17717806994915009, "learning_rate": 9.269007337953527e-05, "loss": 3.9771, "step": 8615 }, { "epoch": 0.5856774018208996, "grad_norm": 0.258022665977478, "learning_rate": 9.2685826878652e-05, "loss": 3.9518, "step": 8620 }, { "epoch": 0.5860171218915613, "grad_norm": 1.0233123302459717, "learning_rate": 9.268158037776871e-05, "loss": 4.0393, "step": 8625 }, { "epoch": 0.5863568419622232, "grad_norm": 0.19119100272655487, "learning_rate": 9.267733387688546e-05, "loss": 4.2572, "step": 8630 }, { "epoch": 0.5866965620328849, "grad_norm": 0.16495579481124878, "learning_rate": 9.267308737600217e-05, "loss": 4.0212, "step": 8635 }, { "epoch": 0.5870362821035466, "grad_norm": 0.3087138831615448, "learning_rate": 9.266884087511891e-05, "loss": 3.6527, "step": 8640 }, { "epoch": 0.5873760021742085, "grad_norm": 0.16957888007164001, "learning_rate": 9.266459437423564e-05, "loss": 4.2809, "step": 8645 }, { "epoch": 0.5877157222448702, "grad_norm": 0.15367551147937775, "learning_rate": 9.266034787335235e-05, "loss": 4.2469, "step": 8650 }, { "epoch": 0.588055442315532, "grad_norm": 0.2356184720993042, "learning_rate": 9.26561013724691e-05, "loss": 3.8713, "step": 8655 }, { "epoch": 0.5883951623861938, "grad_norm": 6.177441120147705, "learning_rate": 9.265185487158582e-05, "loss": 3.8173, "step": 8660 }, { "epoch": 0.5887348824568556, "grad_norm": 0.15291577577590942, "learning_rate": 9.264760837070254e-05, "loss": 4.0039, "step": 8665 }, { "epoch": 0.5890746025275173, "grad_norm": 0.1881755292415619, "learning_rate": 9.264336186981928e-05, "loss": 4.1521, "step": 8670 }, { "epoch": 0.589414322598179, "grad_norm": 0.28705185651779175, "learning_rate": 9.263911536893601e-05, "loss": 4.1136, "step": 8675 }, { "epoch": 0.5897540426688409, "grad_norm": 0.8784993886947632, "learning_rate": 9.263486886805272e-05, "loss": 4.0763, "step": 8680 }, { "epoch": 0.5900937627395026, "grad_norm": 0.5077646970748901, "learning_rate": 9.263062236716946e-05, "loss": 3.818, "step": 8685 }, { "epoch": 0.5904334828101644, "grad_norm": 0.17957673966884613, "learning_rate": 9.262637586628619e-05, "loss": 3.8784, "step": 8690 }, { "epoch": 0.5907732028808262, "grad_norm": 0.2007267326116562, "learning_rate": 9.26221293654029e-05, "loss": 4.0291, "step": 8695 }, { "epoch": 0.591112922951488, "grad_norm": 0.3630281686782837, "learning_rate": 9.261788286451965e-05, "loss": 4.0589, "step": 8700 }, { "epoch": 0.5914526430221497, "grad_norm": 0.33113589882850647, "learning_rate": 9.261363636363636e-05, "loss": 4.0724, "step": 8705 }, { "epoch": 0.5917923630928115, "grad_norm": 0.9370392560958862, "learning_rate": 9.260938986275309e-05, "loss": 3.9571, "step": 8710 }, { "epoch": 0.5921320831634733, "grad_norm": 0.7348127365112305, "learning_rate": 9.260514336186983e-05, "loss": 3.8521, "step": 8715 }, { "epoch": 0.5924718032341351, "grad_norm": 0.20114554464817047, "learning_rate": 9.260089686098655e-05, "loss": 4.0328, "step": 8720 }, { "epoch": 0.5928115233047968, "grad_norm": 0.375806987285614, "learning_rate": 9.259665036010327e-05, "loss": 4.045, "step": 8725 }, { "epoch": 0.5931512433754587, "grad_norm": 0.17481490969657898, "learning_rate": 9.259240385922002e-05, "loss": 4.1616, "step": 8730 }, { "epoch": 0.5934909634461204, "grad_norm": 0.3022591471672058, "learning_rate": 9.258815735833673e-05, "loss": 4.0886, "step": 8735 }, { "epoch": 0.5938306835167821, "grad_norm": 0.17564785480499268, "learning_rate": 9.258391085745346e-05, "loss": 3.9194, "step": 8740 }, { "epoch": 0.594170403587444, "grad_norm": 0.19886420667171478, "learning_rate": 9.25796643565702e-05, "loss": 3.9606, "step": 8745 }, { "epoch": 0.5945101236581057, "grad_norm": 0.18436846137046814, "learning_rate": 9.257541785568691e-05, "loss": 4.0952, "step": 8750 }, { "epoch": 0.5948498437287675, "grad_norm": 0.39597442746162415, "learning_rate": 9.257117135480364e-05, "loss": 4.0044, "step": 8755 }, { "epoch": 0.5951895637994292, "grad_norm": 0.2468959391117096, "learning_rate": 9.256692485392038e-05, "loss": 4.2667, "step": 8760 }, { "epoch": 0.5955292838700911, "grad_norm": 0.21869252622127533, "learning_rate": 9.25626783530371e-05, "loss": 3.862, "step": 8765 }, { "epoch": 0.5958690039407528, "grad_norm": 0.18693305552005768, "learning_rate": 9.255843185215383e-05, "loss": 4.1253, "step": 8770 }, { "epoch": 0.5962087240114146, "grad_norm": 0.2016768604516983, "learning_rate": 9.255418535127055e-05, "loss": 4.0373, "step": 8775 }, { "epoch": 0.5965484440820764, "grad_norm": 0.20323170721530914, "learning_rate": 9.254993885038728e-05, "loss": 3.9236, "step": 8780 }, { "epoch": 0.5968881641527382, "grad_norm": 0.17881441116333008, "learning_rate": 9.254569234950401e-05, "loss": 3.9083, "step": 8785 }, { "epoch": 0.5972278842233999, "grad_norm": 0.22243642807006836, "learning_rate": 9.254144584862074e-05, "loss": 4.0484, "step": 8790 }, { "epoch": 0.5975676042940616, "grad_norm": 0.160291388630867, "learning_rate": 9.253719934773747e-05, "loss": 4.0718, "step": 8795 }, { "epoch": 0.5979073243647235, "grad_norm": 0.18238534033298492, "learning_rate": 9.25329528468542e-05, "loss": 3.8954, "step": 8800 }, { "epoch": 0.5982470444353852, "grad_norm": 0.2048415243625641, "learning_rate": 9.252870634597092e-05, "loss": 3.9912, "step": 8805 }, { "epoch": 0.598586764506047, "grad_norm": 0.17319513857364655, "learning_rate": 9.252445984508765e-05, "loss": 4.0986, "step": 8810 }, { "epoch": 0.5989264845767088, "grad_norm": 0.24065633118152618, "learning_rate": 9.252021334420438e-05, "loss": 4.1444, "step": 8815 }, { "epoch": 0.5992662046473706, "grad_norm": 0.20234987139701843, "learning_rate": 9.251596684332111e-05, "loss": 3.8242, "step": 8820 }, { "epoch": 0.5996059247180323, "grad_norm": 0.21660032868385315, "learning_rate": 9.251172034243783e-05, "loss": 3.8864, "step": 8825 }, { "epoch": 0.5999456447886942, "grad_norm": 0.17871896922588348, "learning_rate": 9.250747384155456e-05, "loss": 3.8432, "step": 8830 }, { "epoch": 0.6002853648593559, "grad_norm": 0.16359539330005646, "learning_rate": 9.250322734067129e-05, "loss": 4.0026, "step": 8835 }, { "epoch": 0.6006250849300176, "grad_norm": 0.2683166265487671, "learning_rate": 9.249898083978802e-05, "loss": 4.1686, "step": 8840 }, { "epoch": 0.6009648050006794, "grad_norm": 0.16088762879371643, "learning_rate": 9.249473433890475e-05, "loss": 3.9448, "step": 8845 }, { "epoch": 0.6013045250713412, "grad_norm": 0.21767516434192657, "learning_rate": 9.249048783802147e-05, "loss": 4.1642, "step": 8850 }, { "epoch": 0.601644245142003, "grad_norm": 1.050563097000122, "learning_rate": 9.24862413371382e-05, "loss": 4.0446, "step": 8855 }, { "epoch": 0.6019839652126647, "grad_norm": 0.16180701553821564, "learning_rate": 9.248199483625493e-05, "loss": 4.014, "step": 8860 }, { "epoch": 0.6023236852833266, "grad_norm": 0.1821652054786682, "learning_rate": 9.247774833537166e-05, "loss": 4.0171, "step": 8865 }, { "epoch": 0.6026634053539883, "grad_norm": 1.0722553730010986, "learning_rate": 9.247350183448839e-05, "loss": 3.8606, "step": 8870 }, { "epoch": 0.6030031254246501, "grad_norm": 0.22805406153202057, "learning_rate": 9.246925533360511e-05, "loss": 3.993, "step": 8875 }, { "epoch": 0.6033428454953118, "grad_norm": 0.2157077044248581, "learning_rate": 9.246500883272184e-05, "loss": 3.7777, "step": 8880 }, { "epoch": 0.6036825655659737, "grad_norm": 0.18143793940544128, "learning_rate": 9.246076233183857e-05, "loss": 4.127, "step": 8885 }, { "epoch": 0.6040222856366354, "grad_norm": 0.18271377682685852, "learning_rate": 9.24565158309553e-05, "loss": 4.0305, "step": 8890 }, { "epoch": 0.6043620057072971, "grad_norm": 0.2080451250076294, "learning_rate": 9.245226933007203e-05, "loss": 3.8488, "step": 8895 }, { "epoch": 0.604701725777959, "grad_norm": 1.494086742401123, "learning_rate": 9.244802282918875e-05, "loss": 4.0619, "step": 8900 }, { "epoch": 0.6050414458486207, "grad_norm": 0.2166607528924942, "learning_rate": 9.244377632830547e-05, "loss": 3.8448, "step": 8905 }, { "epoch": 0.6053811659192825, "grad_norm": 0.24948440492153168, "learning_rate": 9.243952982742221e-05, "loss": 3.9839, "step": 8910 }, { "epoch": 0.6057208859899443, "grad_norm": 0.33411622047424316, "learning_rate": 9.243528332653894e-05, "loss": 4.0442, "step": 8915 }, { "epoch": 0.6060606060606061, "grad_norm": 0.17864017188549042, "learning_rate": 9.243103682565565e-05, "loss": 3.996, "step": 8920 }, { "epoch": 0.6064003261312678, "grad_norm": 0.18109479546546936, "learning_rate": 9.24267903247724e-05, "loss": 4.1308, "step": 8925 }, { "epoch": 0.6067400462019296, "grad_norm": 0.21822641789913177, "learning_rate": 9.242254382388912e-05, "loss": 4.1508, "step": 8930 }, { "epoch": 0.6070797662725914, "grad_norm": 0.37898629903793335, "learning_rate": 9.241829732300584e-05, "loss": 4.1584, "step": 8935 }, { "epoch": 0.6074194863432532, "grad_norm": 0.30412939190864563, "learning_rate": 9.241405082212258e-05, "loss": 3.948, "step": 8940 }, { "epoch": 0.6077592064139149, "grad_norm": 0.20068703591823578, "learning_rate": 9.240980432123931e-05, "loss": 4.1206, "step": 8945 }, { "epoch": 0.6080989264845768, "grad_norm": 0.25400644540786743, "learning_rate": 9.240555782035602e-05, "loss": 4.3133, "step": 8950 }, { "epoch": 0.6084386465552385, "grad_norm": 0.19139589369297028, "learning_rate": 9.240131131947276e-05, "loss": 3.9386, "step": 8955 }, { "epoch": 0.6087783666259002, "grad_norm": 0.16936640441417694, "learning_rate": 9.239706481858949e-05, "loss": 4.0571, "step": 8960 }, { "epoch": 0.609118086696562, "grad_norm": 0.20352940261363983, "learning_rate": 9.23928183177062e-05, "loss": 3.7657, "step": 8965 }, { "epoch": 0.6094578067672238, "grad_norm": 0.1573779433965683, "learning_rate": 9.238857181682295e-05, "loss": 4.1298, "step": 8970 }, { "epoch": 0.6097975268378856, "grad_norm": 0.20279887318611145, "learning_rate": 9.238432531593966e-05, "loss": 3.9271, "step": 8975 }, { "epoch": 0.6101372469085473, "grad_norm": 16.52877426147461, "learning_rate": 9.23800788150564e-05, "loss": 3.8319, "step": 8980 }, { "epoch": 0.6104769669792092, "grad_norm": 0.17606152594089508, "learning_rate": 9.237583231417313e-05, "loss": 3.7453, "step": 8985 }, { "epoch": 0.6108166870498709, "grad_norm": 0.23678942024707794, "learning_rate": 9.237158581328985e-05, "loss": 3.9744, "step": 8990 }, { "epoch": 0.6111564071205327, "grad_norm": 0.21576398611068726, "learning_rate": 9.236733931240659e-05, "loss": 4.1691, "step": 8995 }, { "epoch": 0.6114961271911945, "grad_norm": 0.2256469577550888, "learning_rate": 9.236309281152332e-05, "loss": 3.9419, "step": 9000 }, { "epoch": 0.6118358472618562, "grad_norm": 0.23562292754650116, "learning_rate": 9.235884631064003e-05, "loss": 4.1049, "step": 9005 }, { "epoch": 0.612175567332518, "grad_norm": 0.2894531190395355, "learning_rate": 9.235459980975677e-05, "loss": 3.9535, "step": 9010 }, { "epoch": 0.6125152874031797, "grad_norm": 0.22967609763145447, "learning_rate": 9.23503533088735e-05, "loss": 3.9537, "step": 9015 }, { "epoch": 0.6128550074738416, "grad_norm": 1.3449379205703735, "learning_rate": 9.234610680799021e-05, "loss": 4.08, "step": 9020 }, { "epoch": 0.6131947275445033, "grad_norm": 0.2513696849346161, "learning_rate": 9.234186030710696e-05, "loss": 3.8626, "step": 9025 }, { "epoch": 0.6135344476151651, "grad_norm": 0.19834820926189423, "learning_rate": 9.233761380622368e-05, "loss": 3.9427, "step": 9030 }, { "epoch": 0.6138741676858269, "grad_norm": 0.2118111550807953, "learning_rate": 9.23333673053404e-05, "loss": 3.8768, "step": 9035 }, { "epoch": 0.6142138877564887, "grad_norm": 0.15969218313694, "learning_rate": 9.232912080445714e-05, "loss": 4.1679, "step": 9040 }, { "epoch": 0.6145536078271504, "grad_norm": 0.184014692902565, "learning_rate": 9.232487430357387e-05, "loss": 4.1939, "step": 9045 }, { "epoch": 0.6148933278978121, "grad_norm": 0.17408449947834015, "learning_rate": 9.232062780269058e-05, "loss": 4.0627, "step": 9050 }, { "epoch": 0.615233047968474, "grad_norm": 0.3706364929676056, "learning_rate": 9.231638130180732e-05, "loss": 4.1244, "step": 9055 }, { "epoch": 0.6155727680391357, "grad_norm": 0.2606452703475952, "learning_rate": 9.231213480092404e-05, "loss": 4.0637, "step": 9060 }, { "epoch": 0.6159124881097975, "grad_norm": 0.2730484902858734, "learning_rate": 9.230788830004077e-05, "loss": 4.227, "step": 9065 }, { "epoch": 0.6162522081804593, "grad_norm": 0.17146620154380798, "learning_rate": 9.230364179915751e-05, "loss": 3.9837, "step": 9070 }, { "epoch": 0.6165919282511211, "grad_norm": 0.23479421436786652, "learning_rate": 9.229939529827422e-05, "loss": 3.9549, "step": 9075 }, { "epoch": 0.6169316483217828, "grad_norm": 0.17351080477237701, "learning_rate": 9.229514879739095e-05, "loss": 4.0837, "step": 9080 }, { "epoch": 0.6172713683924447, "grad_norm": 0.18759216368198395, "learning_rate": 9.229090229650769e-05, "loss": 4.1507, "step": 9085 }, { "epoch": 0.6176110884631064, "grad_norm": 0.373471736907959, "learning_rate": 9.22866557956244e-05, "loss": 3.8382, "step": 9090 }, { "epoch": 0.6179508085337682, "grad_norm": 0.16654084622859955, "learning_rate": 9.228240929474113e-05, "loss": 3.9211, "step": 9095 }, { "epoch": 0.6182905286044299, "grad_norm": 2.6412665843963623, "learning_rate": 9.227816279385788e-05, "loss": 4.1437, "step": 9100 }, { "epoch": 0.6186302486750918, "grad_norm": 0.22232688963413239, "learning_rate": 9.227391629297459e-05, "loss": 4.3322, "step": 9105 }, { "epoch": 0.6189699687457535, "grad_norm": 0.19792711734771729, "learning_rate": 9.226966979209132e-05, "loss": 3.6801, "step": 9110 }, { "epoch": 0.6193096888164152, "grad_norm": 0.19452767074108124, "learning_rate": 9.226542329120806e-05, "loss": 4.1374, "step": 9115 }, { "epoch": 0.6196494088870771, "grad_norm": 0.17766821384429932, "learning_rate": 9.226117679032477e-05, "loss": 3.998, "step": 9120 }, { "epoch": 0.6199891289577388, "grad_norm": 0.21171297132968903, "learning_rate": 9.22569302894415e-05, "loss": 4.0408, "step": 9125 }, { "epoch": 0.6203288490284006, "grad_norm": 0.2353495955467224, "learning_rate": 9.225268378855823e-05, "loss": 4.2569, "step": 9130 }, { "epoch": 0.6206685690990623, "grad_norm": 0.1814018189907074, "learning_rate": 9.224843728767496e-05, "loss": 4.1783, "step": 9135 }, { "epoch": 0.6210082891697242, "grad_norm": 0.16691021621227264, "learning_rate": 9.224419078679169e-05, "loss": 3.95, "step": 9140 }, { "epoch": 0.6213480092403859, "grad_norm": 0.24587292969226837, "learning_rate": 9.223994428590841e-05, "loss": 4.187, "step": 9145 }, { "epoch": 0.6216877293110477, "grad_norm": 0.12846866250038147, "learning_rate": 9.223569778502514e-05, "loss": 3.7833, "step": 9150 }, { "epoch": 0.6220274493817095, "grad_norm": 0.15816563367843628, "learning_rate": 9.223145128414187e-05, "loss": 3.9622, "step": 9155 }, { "epoch": 0.6223671694523712, "grad_norm": 0.5960243344306946, "learning_rate": 9.22272047832586e-05, "loss": 4.0261, "step": 9160 }, { "epoch": 0.622706889523033, "grad_norm": 0.21092690527439117, "learning_rate": 9.222295828237533e-05, "loss": 4.0417, "step": 9165 }, { "epoch": 0.6230466095936948, "grad_norm": 0.21227766573429108, "learning_rate": 9.221871178149205e-05, "loss": 4.0396, "step": 9170 }, { "epoch": 0.6233863296643566, "grad_norm": 0.1947152018547058, "learning_rate": 9.221446528060878e-05, "loss": 3.9731, "step": 9175 }, { "epoch": 0.6237260497350183, "grad_norm": 0.264658123254776, "learning_rate": 9.221021877972551e-05, "loss": 3.7214, "step": 9180 }, { "epoch": 0.6240657698056801, "grad_norm": 0.14752690494060516, "learning_rate": 9.220597227884224e-05, "loss": 4.262, "step": 9185 }, { "epoch": 0.6244054898763419, "grad_norm": 0.20559826493263245, "learning_rate": 9.220172577795897e-05, "loss": 4.0443, "step": 9190 }, { "epoch": 0.6247452099470037, "grad_norm": 0.17190653085708618, "learning_rate": 9.21974792770757e-05, "loss": 4.246, "step": 9195 }, { "epoch": 0.6250849300176654, "grad_norm": 0.20981894433498383, "learning_rate": 9.219323277619242e-05, "loss": 4.0482, "step": 9200 }, { "epoch": 0.6254246500883273, "grad_norm": 0.20660968124866486, "learning_rate": 9.218898627530915e-05, "loss": 3.9868, "step": 9205 }, { "epoch": 0.625764370158989, "grad_norm": 0.21324321627616882, "learning_rate": 9.218473977442588e-05, "loss": 4.3044, "step": 9210 }, { "epoch": 0.6261040902296507, "grad_norm": 0.45024943351745605, "learning_rate": 9.21804932735426e-05, "loss": 3.945, "step": 9215 }, { "epoch": 0.6264438103003126, "grad_norm": 0.19029852747917175, "learning_rate": 9.217624677265933e-05, "loss": 4.2293, "step": 9220 }, { "epoch": 0.6267835303709743, "grad_norm": 0.17948591709136963, "learning_rate": 9.217200027177606e-05, "loss": 4.4357, "step": 9225 }, { "epoch": 0.6271232504416361, "grad_norm": 0.30317583680152893, "learning_rate": 9.216775377089279e-05, "loss": 3.9639, "step": 9230 }, { "epoch": 0.6274629705122978, "grad_norm": 0.1628941148519516, "learning_rate": 9.216350727000952e-05, "loss": 4.0654, "step": 9235 }, { "epoch": 0.6278026905829597, "grad_norm": 0.20330828428268433, "learning_rate": 9.215926076912625e-05, "loss": 4.0975, "step": 9240 }, { "epoch": 0.6281424106536214, "grad_norm": 0.19045989215373993, "learning_rate": 9.215501426824297e-05, "loss": 4.0935, "step": 9245 }, { "epoch": 0.6284821307242832, "grad_norm": 1.2690106630325317, "learning_rate": 9.21507677673597e-05, "loss": 3.8794, "step": 9250 }, { "epoch": 0.628821850794945, "grad_norm": 0.3011123538017273, "learning_rate": 9.214652126647643e-05, "loss": 3.8597, "step": 9255 }, { "epoch": 0.6291615708656068, "grad_norm": 0.17079856991767883, "learning_rate": 9.214227476559314e-05, "loss": 3.9687, "step": 9260 }, { "epoch": 0.6295012909362685, "grad_norm": 0.18377001583576202, "learning_rate": 9.213802826470989e-05, "loss": 3.8344, "step": 9265 }, { "epoch": 0.6298410110069302, "grad_norm": 0.26580023765563965, "learning_rate": 9.213378176382661e-05, "loss": 3.9417, "step": 9270 }, { "epoch": 0.6301807310775921, "grad_norm": 0.24509429931640625, "learning_rate": 9.212953526294333e-05, "loss": 3.9923, "step": 9275 }, { "epoch": 0.6305204511482538, "grad_norm": 0.1840851753950119, "learning_rate": 9.212528876206007e-05, "loss": 4.1023, "step": 9280 }, { "epoch": 0.6308601712189156, "grad_norm": 0.16701363027095795, "learning_rate": 9.21210422611768e-05, "loss": 4.1101, "step": 9285 }, { "epoch": 0.6311998912895774, "grad_norm": 0.5241246819496155, "learning_rate": 9.211679576029351e-05, "loss": 3.5958, "step": 9290 }, { "epoch": 0.6315396113602392, "grad_norm": 0.17940564453601837, "learning_rate": 9.211254925941025e-05, "loss": 3.9681, "step": 9295 }, { "epoch": 0.6318793314309009, "grad_norm": 0.2497144490480423, "learning_rate": 9.210830275852698e-05, "loss": 4.0491, "step": 9300 }, { "epoch": 0.6322190515015628, "grad_norm": 0.48636677861213684, "learning_rate": 9.21040562576437e-05, "loss": 3.9699, "step": 9305 }, { "epoch": 0.6325587715722245, "grad_norm": 0.2682591676712036, "learning_rate": 9.209980975676044e-05, "loss": 3.8142, "step": 9310 }, { "epoch": 0.6328984916428863, "grad_norm": 0.5112836360931396, "learning_rate": 9.209556325587717e-05, "loss": 3.8701, "step": 9315 }, { "epoch": 0.633238211713548, "grad_norm": 0.37686988711357117, "learning_rate": 9.20913167549939e-05, "loss": 3.9299, "step": 9320 }, { "epoch": 0.6335779317842098, "grad_norm": 0.3224041759967804, "learning_rate": 9.208707025411062e-05, "loss": 4.0601, "step": 9325 }, { "epoch": 0.6339176518548716, "grad_norm": 0.241369366645813, "learning_rate": 9.208282375322734e-05, "loss": 4.1394, "step": 9330 }, { "epoch": 0.6342573719255333, "grad_norm": 0.1902744024991989, "learning_rate": 9.207857725234408e-05, "loss": 4.1937, "step": 9335 }, { "epoch": 0.6345970919961952, "grad_norm": 0.20103438198566437, "learning_rate": 9.20743307514608e-05, "loss": 4.1394, "step": 9340 }, { "epoch": 0.6349368120668569, "grad_norm": 0.17396235466003418, "learning_rate": 9.207008425057752e-05, "loss": 4.0629, "step": 9345 }, { "epoch": 0.6352765321375187, "grad_norm": 0.1853020042181015, "learning_rate": 9.206583774969426e-05, "loss": 4.0948, "step": 9350 }, { "epoch": 0.6356162522081804, "grad_norm": 0.15830761194229126, "learning_rate": 9.206159124881099e-05, "loss": 4.0189, "step": 9355 }, { "epoch": 0.6359559722788423, "grad_norm": 0.18493899703025818, "learning_rate": 9.20573447479277e-05, "loss": 3.9398, "step": 9360 }, { "epoch": 0.636295692349504, "grad_norm": 0.19994007050991058, "learning_rate": 9.205309824704445e-05, "loss": 4.1339, "step": 9365 }, { "epoch": 0.6366354124201657, "grad_norm": 0.16409240663051605, "learning_rate": 9.204885174616117e-05, "loss": 4.3045, "step": 9370 }, { "epoch": 0.6369751324908276, "grad_norm": 0.34056273102760315, "learning_rate": 9.204460524527789e-05, "loss": 3.9817, "step": 9375 }, { "epoch": 0.6373148525614893, "grad_norm": 0.19899314641952515, "learning_rate": 9.204035874439463e-05, "loss": 3.8643, "step": 9380 }, { "epoch": 0.6376545726321511, "grad_norm": 0.2605150043964386, "learning_rate": 9.203611224351136e-05, "loss": 4.0556, "step": 9385 }, { "epoch": 0.6379942927028129, "grad_norm": 0.23088324069976807, "learning_rate": 9.203186574262807e-05, "loss": 3.9551, "step": 9390 }, { "epoch": 0.6383340127734747, "grad_norm": 0.181942418217659, "learning_rate": 9.202761924174481e-05, "loss": 4.0324, "step": 9395 }, { "epoch": 0.6386737328441364, "grad_norm": 0.1608920693397522, "learning_rate": 9.202337274086154e-05, "loss": 3.8843, "step": 9400 }, { "epoch": 0.6390134529147982, "grad_norm": 0.23045985400676727, "learning_rate": 9.201912623997826e-05, "loss": 3.8905, "step": 9405 }, { "epoch": 0.63935317298546, "grad_norm": 0.20380303263664246, "learning_rate": 9.2014879739095e-05, "loss": 3.988, "step": 9410 }, { "epoch": 0.6396928930561218, "grad_norm": 0.6223300695419312, "learning_rate": 9.201063323821171e-05, "loss": 3.9791, "step": 9415 }, { "epoch": 0.6400326131267835, "grad_norm": 0.8067218065261841, "learning_rate": 9.200638673732844e-05, "loss": 4.1077, "step": 9420 }, { "epoch": 0.6403723331974454, "grad_norm": 0.20715036988258362, "learning_rate": 9.200214023644518e-05, "loss": 4.138, "step": 9425 }, { "epoch": 0.6407120532681071, "grad_norm": 0.20288419723510742, "learning_rate": 9.19978937355619e-05, "loss": 3.9553, "step": 9430 }, { "epoch": 0.6410517733387688, "grad_norm": 0.18636272847652435, "learning_rate": 9.199364723467862e-05, "loss": 3.9276, "step": 9435 }, { "epoch": 0.6413914934094306, "grad_norm": 0.18782015144824982, "learning_rate": 9.198940073379537e-05, "loss": 4.0336, "step": 9440 }, { "epoch": 0.6417312134800924, "grad_norm": 0.19050072133541107, "learning_rate": 9.198515423291208e-05, "loss": 4.0227, "step": 9445 }, { "epoch": 0.6420709335507542, "grad_norm": 0.30288147926330566, "learning_rate": 9.198090773202881e-05, "loss": 3.8769, "step": 9450 }, { "epoch": 0.6424106536214159, "grad_norm": 0.22393420338630676, "learning_rate": 9.197666123114555e-05, "loss": 3.9509, "step": 9455 }, { "epoch": 0.6427503736920778, "grad_norm": 0.21114152669906616, "learning_rate": 9.197241473026226e-05, "loss": 4.1008, "step": 9460 }, { "epoch": 0.6430900937627395, "grad_norm": 0.238412544131279, "learning_rate": 9.196816822937899e-05, "loss": 3.8772, "step": 9465 }, { "epoch": 0.6434298138334013, "grad_norm": 0.6189041137695312, "learning_rate": 9.196392172849573e-05, "loss": 3.9286, "step": 9470 }, { "epoch": 0.6437695339040631, "grad_norm": 0.26222628355026245, "learning_rate": 9.195967522761245e-05, "loss": 4.1328, "step": 9475 }, { "epoch": 0.6441092539747248, "grad_norm": 0.21858513355255127, "learning_rate": 9.195542872672918e-05, "loss": 3.9626, "step": 9480 }, { "epoch": 0.6444489740453866, "grad_norm": 0.21814042329788208, "learning_rate": 9.19511822258459e-05, "loss": 4.3128, "step": 9485 }, { "epoch": 0.6447886941160483, "grad_norm": 0.19709447026252747, "learning_rate": 9.194693572496263e-05, "loss": 3.9969, "step": 9490 }, { "epoch": 0.6451284141867102, "grad_norm": 0.172958642244339, "learning_rate": 9.194268922407936e-05, "loss": 4.0359, "step": 9495 }, { "epoch": 0.6454681342573719, "grad_norm": 0.17147018015384674, "learning_rate": 9.193844272319609e-05, "loss": 3.8998, "step": 9500 }, { "epoch": 0.6458078543280337, "grad_norm": 0.16436798870563507, "learning_rate": 9.193419622231282e-05, "loss": 3.8929, "step": 9505 }, { "epoch": 0.6461475743986955, "grad_norm": 0.17490500211715698, "learning_rate": 9.192994972142954e-05, "loss": 3.9468, "step": 9510 }, { "epoch": 0.6464872944693573, "grad_norm": 0.5229065418243408, "learning_rate": 9.192570322054627e-05, "loss": 4.1358, "step": 9515 }, { "epoch": 0.646827014540019, "grad_norm": 0.1972028911113739, "learning_rate": 9.1921456719663e-05, "loss": 4.1462, "step": 9520 }, { "epoch": 0.6471667346106807, "grad_norm": 0.18450862169265747, "learning_rate": 9.191721021877973e-05, "loss": 4.0512, "step": 9525 }, { "epoch": 0.6475064546813426, "grad_norm": 0.24547728896141052, "learning_rate": 9.191296371789646e-05, "loss": 4.0998, "step": 9530 }, { "epoch": 0.6478461747520043, "grad_norm": 0.19123290479183197, "learning_rate": 9.190871721701318e-05, "loss": 3.8672, "step": 9535 }, { "epoch": 0.6481858948226661, "grad_norm": 0.22196923196315765, "learning_rate": 9.190447071612991e-05, "loss": 4.1529, "step": 9540 }, { "epoch": 0.6485256148933279, "grad_norm": 0.21963347494602203, "learning_rate": 9.190022421524664e-05, "loss": 4.1048, "step": 9545 }, { "epoch": 0.6488653349639897, "grad_norm": 0.19842089712619781, "learning_rate": 9.189597771436337e-05, "loss": 4.0297, "step": 9550 }, { "epoch": 0.6492050550346514, "grad_norm": 0.2944156527519226, "learning_rate": 9.18917312134801e-05, "loss": 4.0261, "step": 9555 }, { "epoch": 0.6495447751053133, "grad_norm": 0.1940944492816925, "learning_rate": 9.188748471259682e-05, "loss": 3.9962, "step": 9560 }, { "epoch": 0.649884495175975, "grad_norm": 0.1459953486919403, "learning_rate": 9.188323821171355e-05, "loss": 4.0297, "step": 9565 }, { "epoch": 0.6502242152466368, "grad_norm": 0.18244166672229767, "learning_rate": 9.187899171083028e-05, "loss": 3.9038, "step": 9570 }, { "epoch": 0.6505639353172985, "grad_norm": 0.5632447004318237, "learning_rate": 9.187474520994701e-05, "loss": 3.8968, "step": 9575 }, { "epoch": 0.6509036553879604, "grad_norm": 0.21160653233528137, "learning_rate": 9.187049870906374e-05, "loss": 4.0296, "step": 9580 }, { "epoch": 0.6512433754586221, "grad_norm": 0.5150845646858215, "learning_rate": 9.186625220818046e-05, "loss": 3.9448, "step": 9585 }, { "epoch": 0.6515830955292838, "grad_norm": 0.16149458289146423, "learning_rate": 9.186200570729719e-05, "loss": 3.9315, "step": 9590 }, { "epoch": 0.6519228155999457, "grad_norm": 0.19759464263916016, "learning_rate": 9.185775920641392e-05, "loss": 3.9934, "step": 9595 }, { "epoch": 0.6522625356706074, "grad_norm": 0.4384717047214508, "learning_rate": 9.185351270553065e-05, "loss": 3.932, "step": 9600 }, { "epoch": 0.6526022557412692, "grad_norm": 0.17127420008182526, "learning_rate": 9.184926620464738e-05, "loss": 4.038, "step": 9605 }, { "epoch": 0.6529419758119309, "grad_norm": 0.1805383861064911, "learning_rate": 9.18450197037641e-05, "loss": 4.077, "step": 9610 }, { "epoch": 0.6532816958825928, "grad_norm": 0.17218123376369476, "learning_rate": 9.184077320288082e-05, "loss": 4.0027, "step": 9615 }, { "epoch": 0.6536214159532545, "grad_norm": 0.21027123928070068, "learning_rate": 9.183652670199756e-05, "loss": 4.004, "step": 9620 }, { "epoch": 0.6539611360239163, "grad_norm": 0.18752577900886536, "learning_rate": 9.183228020111429e-05, "loss": 4.1132, "step": 9625 }, { "epoch": 0.6543008560945781, "grad_norm": 0.2274181991815567, "learning_rate": 9.1828033700231e-05, "loss": 4.0189, "step": 9630 }, { "epoch": 0.6546405761652399, "grad_norm": 0.5568689107894897, "learning_rate": 9.182378719934774e-05, "loss": 3.9176, "step": 9635 }, { "epoch": 0.6549802962359016, "grad_norm": 0.20141251385211945, "learning_rate": 9.181954069846447e-05, "loss": 4.1433, "step": 9640 }, { "epoch": 0.6553200163065634, "grad_norm": 0.1734774261713028, "learning_rate": 9.181529419758119e-05, "loss": 3.8714, "step": 9645 }, { "epoch": 0.6556597363772252, "grad_norm": 0.20372764766216278, "learning_rate": 9.181104769669793e-05, "loss": 3.6352, "step": 9650 }, { "epoch": 0.6559994564478869, "grad_norm": 0.22679565846920013, "learning_rate": 9.180680119581466e-05, "loss": 4.0213, "step": 9655 }, { "epoch": 0.6563391765185487, "grad_norm": 0.19201093912124634, "learning_rate": 9.180255469493138e-05, "loss": 4.3365, "step": 9660 }, { "epoch": 0.6566788965892105, "grad_norm": 0.23552648723125458, "learning_rate": 9.179830819404811e-05, "loss": 3.827, "step": 9665 }, { "epoch": 0.6570186166598723, "grad_norm": 0.2363472580909729, "learning_rate": 9.179406169316484e-05, "loss": 3.5363, "step": 9670 }, { "epoch": 0.657358336730534, "grad_norm": 0.1936025321483612, "learning_rate": 9.178981519228157e-05, "loss": 3.6846, "step": 9675 }, { "epoch": 0.6576980568011959, "grad_norm": 0.16396436095237732, "learning_rate": 9.17855686913983e-05, "loss": 4.2385, "step": 9680 }, { "epoch": 0.6580377768718576, "grad_norm": 0.19219662249088287, "learning_rate": 9.178132219051501e-05, "loss": 3.8246, "step": 9685 }, { "epoch": 0.6583774969425193, "grad_norm": 0.3243776857852936, "learning_rate": 9.177707568963175e-05, "loss": 3.9252, "step": 9690 }, { "epoch": 0.6587172170131811, "grad_norm": 0.2118336260318756, "learning_rate": 9.177282918874848e-05, "loss": 3.9518, "step": 9695 }, { "epoch": 0.6590569370838429, "grad_norm": 0.2181885987520218, "learning_rate": 9.17685826878652e-05, "loss": 3.8817, "step": 9700 }, { "epoch": 0.6593966571545047, "grad_norm": 0.2063203752040863, "learning_rate": 9.176433618698194e-05, "loss": 4.0715, "step": 9705 }, { "epoch": 0.6597363772251664, "grad_norm": 0.16221602261066437, "learning_rate": 9.176008968609867e-05, "loss": 3.8972, "step": 9710 }, { "epoch": 0.6600760972958283, "grad_norm": 0.2043311893939972, "learning_rate": 9.175584318521538e-05, "loss": 3.9982, "step": 9715 }, { "epoch": 0.66041581736649, "grad_norm": 0.15358102321624756, "learning_rate": 9.175159668433212e-05, "loss": 4.4382, "step": 9720 }, { "epoch": 0.6607555374371518, "grad_norm": 0.1804971694946289, "learning_rate": 9.174735018344885e-05, "loss": 4.0575, "step": 9725 }, { "epoch": 0.6610952575078136, "grad_norm": 0.8969916701316833, "learning_rate": 9.174310368256556e-05, "loss": 3.8002, "step": 9730 }, { "epoch": 0.6614349775784754, "grad_norm": 0.17219315469264984, "learning_rate": 9.17388571816823e-05, "loss": 3.8753, "step": 9735 }, { "epoch": 0.6617746976491371, "grad_norm": 1.4561220407485962, "learning_rate": 9.173461068079903e-05, "loss": 3.8808, "step": 9740 }, { "epoch": 0.6621144177197988, "grad_norm": 0.3004949688911438, "learning_rate": 9.173036417991575e-05, "loss": 4.01, "step": 9745 }, { "epoch": 0.6624541377904607, "grad_norm": 0.4193626642227173, "learning_rate": 9.172611767903249e-05, "loss": 4.1539, "step": 9750 }, { "epoch": 0.6627938578611224, "grad_norm": 0.5232547521591187, "learning_rate": 9.17218711781492e-05, "loss": 4.1406, "step": 9755 }, { "epoch": 0.6631335779317842, "grad_norm": 0.2067098319530487, "learning_rate": 9.171762467726593e-05, "loss": 3.9981, "step": 9760 }, { "epoch": 0.663473298002446, "grad_norm": 0.2100781351327896, "learning_rate": 9.171337817638267e-05, "loss": 3.7371, "step": 9765 }, { "epoch": 0.6638130180731078, "grad_norm": 0.28894108533859253, "learning_rate": 9.170913167549939e-05, "loss": 4.1151, "step": 9770 }, { "epoch": 0.6641527381437695, "grad_norm": 0.2366105318069458, "learning_rate": 9.170488517461612e-05, "loss": 3.7963, "step": 9775 }, { "epoch": 0.6644924582144313, "grad_norm": 0.15720832347869873, "learning_rate": 9.170063867373286e-05, "loss": 4.1177, "step": 9780 }, { "epoch": 0.6648321782850931, "grad_norm": 0.16979362070560455, "learning_rate": 9.169639217284957e-05, "loss": 4.1974, "step": 9785 }, { "epoch": 0.6651718983557549, "grad_norm": 0.1718185693025589, "learning_rate": 9.16921456719663e-05, "loss": 3.8146, "step": 9790 }, { "epoch": 0.6655116184264166, "grad_norm": 0.1998133510351181, "learning_rate": 9.168789917108304e-05, "loss": 3.8062, "step": 9795 }, { "epoch": 0.6658513384970784, "grad_norm": 0.19915224611759186, "learning_rate": 9.168365267019976e-05, "loss": 3.8217, "step": 9800 }, { "epoch": 0.6661910585677402, "grad_norm": 0.19672515988349915, "learning_rate": 9.167940616931648e-05, "loss": 3.9699, "step": 9805 }, { "epoch": 0.6665307786384019, "grad_norm": 0.18894067406654358, "learning_rate": 9.167515966843323e-05, "loss": 3.9818, "step": 9810 }, { "epoch": 0.6668704987090638, "grad_norm": 0.2761344611644745, "learning_rate": 9.167091316754994e-05, "loss": 4.0251, "step": 9815 }, { "epoch": 0.6672102187797255, "grad_norm": 0.20272742211818695, "learning_rate": 9.166666666666667e-05, "loss": 4.1925, "step": 9820 }, { "epoch": 0.6675499388503873, "grad_norm": 1.1285771131515503, "learning_rate": 9.166242016578341e-05, "loss": 4.1825, "step": 9825 }, { "epoch": 0.667889658921049, "grad_norm": 0.1726224571466446, "learning_rate": 9.165817366490012e-05, "loss": 3.9708, "step": 9830 }, { "epoch": 0.6682293789917109, "grad_norm": 0.1987651288509369, "learning_rate": 9.165392716401685e-05, "loss": 4.1101, "step": 9835 }, { "epoch": 0.6685690990623726, "grad_norm": 0.19084063172340393, "learning_rate": 9.164968066313358e-05, "loss": 3.9633, "step": 9840 }, { "epoch": 0.6689088191330343, "grad_norm": 0.20953474938869476, "learning_rate": 9.164543416225031e-05, "loss": 4.0714, "step": 9845 }, { "epoch": 0.6692485392036962, "grad_norm": 0.1689091920852661, "learning_rate": 9.164118766136704e-05, "loss": 4.0451, "step": 9850 }, { "epoch": 0.6695882592743579, "grad_norm": 0.38470450043678284, "learning_rate": 9.163694116048376e-05, "loss": 4.2373, "step": 9855 }, { "epoch": 0.6699279793450197, "grad_norm": 0.18749617040157318, "learning_rate": 9.163269465960049e-05, "loss": 4.0489, "step": 9860 }, { "epoch": 0.6702676994156814, "grad_norm": 0.1558235138654709, "learning_rate": 9.162844815871722e-05, "loss": 4.0883, "step": 9865 }, { "epoch": 0.6706074194863433, "grad_norm": 0.17656366527080536, "learning_rate": 9.162420165783395e-05, "loss": 3.8735, "step": 9870 }, { "epoch": 0.670947139557005, "grad_norm": 0.16513271629810333, "learning_rate": 9.161995515695068e-05, "loss": 4.3651, "step": 9875 }, { "epoch": 0.6712868596276668, "grad_norm": 0.369552880525589, "learning_rate": 9.16157086560674e-05, "loss": 4.1642, "step": 9880 }, { "epoch": 0.6716265796983286, "grad_norm": 0.19037148356437683, "learning_rate": 9.161146215518413e-05, "loss": 4.0021, "step": 9885 }, { "epoch": 0.6719662997689904, "grad_norm": 0.1722508817911148, "learning_rate": 9.160721565430086e-05, "loss": 3.8246, "step": 9890 }, { "epoch": 0.6723060198396521, "grad_norm": 0.16816957294940948, "learning_rate": 9.160296915341759e-05, "loss": 3.7951, "step": 9895 }, { "epoch": 0.672645739910314, "grad_norm": 0.18479801714420319, "learning_rate": 9.159872265253432e-05, "loss": 4.0533, "step": 9900 }, { "epoch": 0.6729854599809757, "grad_norm": 0.1568441092967987, "learning_rate": 9.159447615165104e-05, "loss": 3.7568, "step": 9905 }, { "epoch": 0.6733251800516374, "grad_norm": 0.15381227433681488, "learning_rate": 9.159022965076777e-05, "loss": 4.0221, "step": 9910 }, { "epoch": 0.6736649001222992, "grad_norm": 0.22633537650108337, "learning_rate": 9.15859831498845e-05, "loss": 4.0213, "step": 9915 }, { "epoch": 0.674004620192961, "grad_norm": 0.22000354528427124, "learning_rate": 9.158173664900123e-05, "loss": 3.9787, "step": 9920 }, { "epoch": 0.6743443402636228, "grad_norm": 0.18316060304641724, "learning_rate": 9.157749014811796e-05, "loss": 3.939, "step": 9925 }, { "epoch": 0.6746840603342845, "grad_norm": 0.3888174891471863, "learning_rate": 9.157324364723468e-05, "loss": 3.9585, "step": 9930 }, { "epoch": 0.6750237804049464, "grad_norm": 0.24031803011894226, "learning_rate": 9.156899714635141e-05, "loss": 3.9616, "step": 9935 }, { "epoch": 0.6753635004756081, "grad_norm": 0.22554604709148407, "learning_rate": 9.156475064546814e-05, "loss": 4.13, "step": 9940 }, { "epoch": 0.6757032205462699, "grad_norm": 0.15670569241046906, "learning_rate": 9.156050414458487e-05, "loss": 4.0036, "step": 9945 }, { "epoch": 0.6760429406169316, "grad_norm": 0.18224500119686127, "learning_rate": 9.15562576437016e-05, "loss": 4.1177, "step": 9950 }, { "epoch": 0.6763826606875935, "grad_norm": 0.46049654483795166, "learning_rate": 9.155201114281831e-05, "loss": 3.8908, "step": 9955 }, { "epoch": 0.6767223807582552, "grad_norm": 0.1925237625837326, "learning_rate": 9.154776464193505e-05, "loss": 4.0503, "step": 9960 }, { "epoch": 0.6770621008289169, "grad_norm": 0.2237987518310547, "learning_rate": 9.154351814105178e-05, "loss": 4.0872, "step": 9965 }, { "epoch": 0.6774018208995788, "grad_norm": 0.5446917414665222, "learning_rate": 9.15392716401685e-05, "loss": 3.9465, "step": 9970 }, { "epoch": 0.6777415409702405, "grad_norm": 0.19259484112262726, "learning_rate": 9.153502513928524e-05, "loss": 4.0308, "step": 9975 }, { "epoch": 0.6780812610409023, "grad_norm": 0.16928361356258392, "learning_rate": 9.153077863840196e-05, "loss": 3.8983, "step": 9980 }, { "epoch": 0.6784209811115641, "grad_norm": 0.20402824878692627, "learning_rate": 9.152653213751868e-05, "loss": 4.0315, "step": 9985 }, { "epoch": 0.6787607011822259, "grad_norm": 0.18891318142414093, "learning_rate": 9.152228563663542e-05, "loss": 3.9693, "step": 9990 }, { "epoch": 0.6791004212528876, "grad_norm": 0.21101170778274536, "learning_rate": 9.151803913575215e-05, "loss": 3.8452, "step": 9995 }, { "epoch": 0.6794401413235494, "grad_norm": 0.20866774022579193, "learning_rate": 9.151379263486888e-05, "loss": 3.7796, "step": 10000 }, { "epoch": 0.6797798613942112, "grad_norm": 0.22108693420886993, "learning_rate": 9.15095461339856e-05, "loss": 4.1394, "step": 10005 }, { "epoch": 0.680119581464873, "grad_norm": 0.18283884227275848, "learning_rate": 9.150529963310233e-05, "loss": 4.0837, "step": 10010 }, { "epoch": 0.6804593015355347, "grad_norm": 0.3496660888195038, "learning_rate": 9.150105313221906e-05, "loss": 3.9772, "step": 10015 }, { "epoch": 0.6807990216061965, "grad_norm": 0.21944838762283325, "learning_rate": 9.149680663133579e-05, "loss": 3.9497, "step": 10020 }, { "epoch": 0.6811387416768583, "grad_norm": 0.9018047451972961, "learning_rate": 9.149256013045252e-05, "loss": 4.1269, "step": 10025 }, { "epoch": 0.68147846174752, "grad_norm": 0.1882868856191635, "learning_rate": 9.148831362956924e-05, "loss": 3.9114, "step": 10030 }, { "epoch": 0.6818181818181818, "grad_norm": 0.9261885285377502, "learning_rate": 9.148406712868597e-05, "loss": 4.1938, "step": 10035 }, { "epoch": 0.6821579018888436, "grad_norm": 0.20487643778324127, "learning_rate": 9.147982062780269e-05, "loss": 4.029, "step": 10040 }, { "epoch": 0.6824976219595054, "grad_norm": 0.20628653466701508, "learning_rate": 9.147557412691943e-05, "loss": 3.9453, "step": 10045 }, { "epoch": 0.6828373420301671, "grad_norm": 0.18048962950706482, "learning_rate": 9.147132762603616e-05, "loss": 3.7238, "step": 10050 }, { "epoch": 0.683177062100829, "grad_norm": 0.20143994688987732, "learning_rate": 9.146708112515287e-05, "loss": 4.0817, "step": 10055 }, { "epoch": 0.6835167821714907, "grad_norm": 0.1772017776966095, "learning_rate": 9.146283462426961e-05, "loss": 4.3308, "step": 10060 }, { "epoch": 0.6838565022421524, "grad_norm": 0.17534330487251282, "learning_rate": 9.145858812338634e-05, "loss": 3.9769, "step": 10065 }, { "epoch": 0.6841962223128143, "grad_norm": 0.3508715331554413, "learning_rate": 9.145434162250305e-05, "loss": 4.1162, "step": 10070 }, { "epoch": 0.684535942383476, "grad_norm": 0.22710181772708893, "learning_rate": 9.14500951216198e-05, "loss": 3.7313, "step": 10075 }, { "epoch": 0.6848756624541378, "grad_norm": 0.1621914505958557, "learning_rate": 9.144584862073652e-05, "loss": 4.0611, "step": 10080 }, { "epoch": 0.6852153825247995, "grad_norm": 0.26334255933761597, "learning_rate": 9.144160211985324e-05, "loss": 4.0556, "step": 10085 }, { "epoch": 0.6855551025954614, "grad_norm": 0.6280850768089294, "learning_rate": 9.143735561896998e-05, "loss": 4.0588, "step": 10090 }, { "epoch": 0.6858948226661231, "grad_norm": 0.3040946424007416, "learning_rate": 9.143310911808671e-05, "loss": 3.9212, "step": 10095 }, { "epoch": 0.6862345427367849, "grad_norm": 0.30600547790527344, "learning_rate": 9.142886261720342e-05, "loss": 4.0689, "step": 10100 }, { "epoch": 0.6865742628074467, "grad_norm": 0.18050602078437805, "learning_rate": 9.142461611632016e-05, "loss": 3.9464, "step": 10105 }, { "epoch": 0.6869139828781085, "grad_norm": 0.48490941524505615, "learning_rate": 9.142036961543688e-05, "loss": 4.025, "step": 10110 }, { "epoch": 0.6872537029487702, "grad_norm": 0.24022653698921204, "learning_rate": 9.14161231145536e-05, "loss": 4.0635, "step": 10115 }, { "epoch": 0.6875934230194319, "grad_norm": 0.5546215772628784, "learning_rate": 9.141187661367035e-05, "loss": 3.9159, "step": 10120 }, { "epoch": 0.6879331430900938, "grad_norm": 0.1747526228427887, "learning_rate": 9.140763011278706e-05, "loss": 4.0271, "step": 10125 }, { "epoch": 0.6882728631607555, "grad_norm": 0.8281142115592957, "learning_rate": 9.140338361190379e-05, "loss": 3.8248, "step": 10130 }, { "epoch": 0.6886125832314173, "grad_norm": 0.19988751411437988, "learning_rate": 9.139913711102053e-05, "loss": 4.022, "step": 10135 }, { "epoch": 0.6889523033020791, "grad_norm": 0.5465024709701538, "learning_rate": 9.139489061013725e-05, "loss": 3.9433, "step": 10140 }, { "epoch": 0.6892920233727409, "grad_norm": 0.704890787601471, "learning_rate": 9.139064410925397e-05, "loss": 4.2021, "step": 10145 }, { "epoch": 0.6896317434434026, "grad_norm": 0.1683177798986435, "learning_rate": 9.138639760837072e-05, "loss": 4.0005, "step": 10150 }, { "epoch": 0.6899714635140645, "grad_norm": 0.2499643862247467, "learning_rate": 9.138215110748743e-05, "loss": 4.0452, "step": 10155 }, { "epoch": 0.6903111835847262, "grad_norm": 0.20261327922344208, "learning_rate": 9.137790460660416e-05, "loss": 3.9332, "step": 10160 }, { "epoch": 0.690650903655388, "grad_norm": 0.19001875817775726, "learning_rate": 9.13736581057209e-05, "loss": 3.9008, "step": 10165 }, { "epoch": 0.6909906237260497, "grad_norm": 0.18063665926456451, "learning_rate": 9.136941160483761e-05, "loss": 3.8843, "step": 10170 }, { "epoch": 0.6913303437967115, "grad_norm": 0.1968679428100586, "learning_rate": 9.136516510395434e-05, "loss": 4.0298, "step": 10175 }, { "epoch": 0.6916700638673733, "grad_norm": 0.21212510764598846, "learning_rate": 9.136091860307107e-05, "loss": 4.1819, "step": 10180 }, { "epoch": 0.692009783938035, "grad_norm": 0.2317853420972824, "learning_rate": 9.13566721021878e-05, "loss": 3.8627, "step": 10185 }, { "epoch": 0.6923495040086969, "grad_norm": 0.17582780122756958, "learning_rate": 9.135242560130453e-05, "loss": 4.044, "step": 10190 }, { "epoch": 0.6926892240793586, "grad_norm": 0.1663304716348648, "learning_rate": 9.134817910042125e-05, "loss": 3.9946, "step": 10195 }, { "epoch": 0.6930289441500204, "grad_norm": 0.2554979920387268, "learning_rate": 9.134393259953798e-05, "loss": 3.9845, "step": 10200 }, { "epoch": 0.6933686642206821, "grad_norm": 0.19808262586593628, "learning_rate": 9.133968609865471e-05, "loss": 3.7411, "step": 10205 }, { "epoch": 0.693708384291344, "grad_norm": 0.16207602620124817, "learning_rate": 9.133543959777144e-05, "loss": 3.9169, "step": 10210 }, { "epoch": 0.6940481043620057, "grad_norm": 0.19255559146404266, "learning_rate": 9.133119309688817e-05, "loss": 4.0015, "step": 10215 }, { "epoch": 0.6943878244326674, "grad_norm": 0.40374597907066345, "learning_rate": 9.13269465960049e-05, "loss": 4.0568, "step": 10220 }, { "epoch": 0.6947275445033293, "grad_norm": 0.5542440414428711, "learning_rate": 9.132270009512162e-05, "loss": 4.0383, "step": 10225 }, { "epoch": 0.695067264573991, "grad_norm": 0.2855238914489746, "learning_rate": 9.131845359423835e-05, "loss": 3.957, "step": 10230 }, { "epoch": 0.6954069846446528, "grad_norm": 0.30983904004096985, "learning_rate": 9.131420709335508e-05, "loss": 3.9347, "step": 10235 }, { "epoch": 0.6957467047153146, "grad_norm": 0.1820448637008667, "learning_rate": 9.13099605924718e-05, "loss": 4.0424, "step": 10240 }, { "epoch": 0.6960864247859764, "grad_norm": 0.1688624769449234, "learning_rate": 9.130571409158853e-05, "loss": 4.1492, "step": 10245 }, { "epoch": 0.6964261448566381, "grad_norm": 0.18944065272808075, "learning_rate": 9.130146759070526e-05, "loss": 4.0306, "step": 10250 }, { "epoch": 0.6967658649272999, "grad_norm": 0.23642663657665253, "learning_rate": 9.129722108982199e-05, "loss": 3.9713, "step": 10255 }, { "epoch": 0.6971055849979617, "grad_norm": 0.30841895937919617, "learning_rate": 9.129297458893872e-05, "loss": 4.0823, "step": 10260 }, { "epoch": 0.6974453050686235, "grad_norm": 0.20149889588356018, "learning_rate": 9.128872808805545e-05, "loss": 3.9524, "step": 10265 }, { "epoch": 0.6977850251392852, "grad_norm": 0.18833234906196594, "learning_rate": 9.128448158717217e-05, "loss": 4.1513, "step": 10270 }, { "epoch": 0.698124745209947, "grad_norm": 0.1727929264307022, "learning_rate": 9.12802350862889e-05, "loss": 3.928, "step": 10275 }, { "epoch": 0.6984644652806088, "grad_norm": 0.19599127769470215, "learning_rate": 9.127598858540563e-05, "loss": 3.9861, "step": 10280 }, { "epoch": 0.6988041853512705, "grad_norm": 0.19696658849716187, "learning_rate": 9.127174208452236e-05, "loss": 3.9878, "step": 10285 }, { "epoch": 0.6991439054219323, "grad_norm": 0.24747943878173828, "learning_rate": 9.126749558363909e-05, "loss": 3.9343, "step": 10290 }, { "epoch": 0.6994836254925941, "grad_norm": 0.23848794400691986, "learning_rate": 9.126324908275581e-05, "loss": 3.7876, "step": 10295 }, { "epoch": 0.6998233455632559, "grad_norm": 0.18184241652488708, "learning_rate": 9.125900258187254e-05, "loss": 4.003, "step": 10300 }, { "epoch": 0.7001630656339176, "grad_norm": 0.21852192282676697, "learning_rate": 9.125475608098927e-05, "loss": 3.9115, "step": 10305 }, { "epoch": 0.7005027857045795, "grad_norm": 0.18718090653419495, "learning_rate": 9.125050958010599e-05, "loss": 3.8846, "step": 10310 }, { "epoch": 0.7008425057752412, "grad_norm": 0.4584771394729614, "learning_rate": 9.124626307922273e-05, "loss": 3.9954, "step": 10315 }, { "epoch": 0.701182225845903, "grad_norm": 0.19998236000537872, "learning_rate": 9.124201657833945e-05, "loss": 3.9594, "step": 10320 }, { "epoch": 0.7015219459165648, "grad_norm": 0.1826915442943573, "learning_rate": 9.123777007745617e-05, "loss": 3.8949, "step": 10325 }, { "epoch": 0.7018616659872265, "grad_norm": 0.31630200147628784, "learning_rate": 9.123352357657291e-05, "loss": 4.0863, "step": 10330 }, { "epoch": 0.7022013860578883, "grad_norm": 0.22190065681934357, "learning_rate": 9.122927707568964e-05, "loss": 3.9006, "step": 10335 }, { "epoch": 0.70254110612855, "grad_norm": 0.14430834352970123, "learning_rate": 9.122503057480637e-05, "loss": 3.7831, "step": 10340 }, { "epoch": 0.7028808261992119, "grad_norm": 0.23812465369701385, "learning_rate": 9.12207840739231e-05, "loss": 4.07, "step": 10345 }, { "epoch": 0.7032205462698736, "grad_norm": 0.26677510142326355, "learning_rate": 9.121653757303982e-05, "loss": 4.0027, "step": 10350 }, { "epoch": 0.7035602663405354, "grad_norm": 1.3501478433609009, "learning_rate": 9.121229107215655e-05, "loss": 3.987, "step": 10355 }, { "epoch": 0.7038999864111972, "grad_norm": 0.20116499066352844, "learning_rate": 9.120804457127328e-05, "loss": 4.2462, "step": 10360 }, { "epoch": 0.704239706481859, "grad_norm": 0.22384878993034363, "learning_rate": 9.120379807039001e-05, "loss": 3.7601, "step": 10365 }, { "epoch": 0.7045794265525207, "grad_norm": 0.2325512170791626, "learning_rate": 9.119955156950673e-05, "loss": 4.0985, "step": 10370 }, { "epoch": 0.7049191466231824, "grad_norm": 0.16118580102920532, "learning_rate": 9.119530506862346e-05, "loss": 4.0038, "step": 10375 }, { "epoch": 0.7052588666938443, "grad_norm": 0.16809016466140747, "learning_rate": 9.119105856774018e-05, "loss": 4.0822, "step": 10380 }, { "epoch": 0.705598586764506, "grad_norm": 0.29214194416999817, "learning_rate": 9.118681206685692e-05, "loss": 3.9853, "step": 10385 }, { "epoch": 0.7059383068351678, "grad_norm": 0.17380909621715546, "learning_rate": 9.118256556597365e-05, "loss": 3.9326, "step": 10390 }, { "epoch": 0.7062780269058296, "grad_norm": 0.19394385814666748, "learning_rate": 9.117831906509036e-05, "loss": 4.0416, "step": 10395 }, { "epoch": 0.7066177469764914, "grad_norm": 0.16878750920295715, "learning_rate": 9.11740725642071e-05, "loss": 4.2154, "step": 10400 }, { "epoch": 0.7069574670471531, "grad_norm": 0.17969490587711334, "learning_rate": 9.116982606332383e-05, "loss": 3.8792, "step": 10405 }, { "epoch": 0.707297187117815, "grad_norm": 0.18729190528392792, "learning_rate": 9.116557956244055e-05, "loss": 4.0986, "step": 10410 }, { "epoch": 0.7076369071884767, "grad_norm": 0.16963765025138855, "learning_rate": 9.116133306155729e-05, "loss": 4.062, "step": 10415 }, { "epoch": 0.7079766272591385, "grad_norm": 0.2884555757045746, "learning_rate": 9.115708656067402e-05, "loss": 4.078, "step": 10420 }, { "epoch": 0.7083163473298002, "grad_norm": 0.29540711641311646, "learning_rate": 9.115284005979073e-05, "loss": 4.0736, "step": 10425 }, { "epoch": 0.708656067400462, "grad_norm": 0.41383740305900574, "learning_rate": 9.114859355890747e-05, "loss": 4.2034, "step": 10430 }, { "epoch": 0.7089957874711238, "grad_norm": 0.17644955217838287, "learning_rate": 9.11443470580242e-05, "loss": 4.1137, "step": 10435 }, { "epoch": 0.7093355075417855, "grad_norm": 0.1741337925195694, "learning_rate": 9.114010055714091e-05, "loss": 4.0893, "step": 10440 }, { "epoch": 0.7096752276124474, "grad_norm": 0.2527826130390167, "learning_rate": 9.113585405625766e-05, "loss": 4.091, "step": 10445 }, { "epoch": 0.7100149476831091, "grad_norm": 0.1837528795003891, "learning_rate": 9.113160755537438e-05, "loss": 4.0449, "step": 10450 }, { "epoch": 0.7103546677537709, "grad_norm": 0.18858109414577484, "learning_rate": 9.11273610544911e-05, "loss": 4.0575, "step": 10455 }, { "epoch": 0.7106943878244326, "grad_norm": 0.1998942494392395, "learning_rate": 9.112311455360784e-05, "loss": 3.9053, "step": 10460 }, { "epoch": 0.7110341078950945, "grad_norm": 0.20997202396392822, "learning_rate": 9.111886805272455e-05, "loss": 3.8818, "step": 10465 }, { "epoch": 0.7113738279657562, "grad_norm": 0.17449168860912323, "learning_rate": 9.111462155184128e-05, "loss": 4.0812, "step": 10470 }, { "epoch": 0.711713548036418, "grad_norm": 0.20188404619693756, "learning_rate": 9.111037505095802e-05, "loss": 3.918, "step": 10475 }, { "epoch": 0.7120532681070798, "grad_norm": 0.16575513780117035, "learning_rate": 9.110612855007474e-05, "loss": 4.053, "step": 10480 }, { "epoch": 0.7123929881777415, "grad_norm": 0.20615115761756897, "learning_rate": 9.110188204919147e-05, "loss": 3.7282, "step": 10485 }, { "epoch": 0.7127327082484033, "grad_norm": 0.17066192626953125, "learning_rate": 9.109763554830821e-05, "loss": 4.0129, "step": 10490 }, { "epoch": 0.7130724283190651, "grad_norm": 0.2495145946741104, "learning_rate": 9.109338904742492e-05, "loss": 3.9959, "step": 10495 }, { "epoch": 0.7134121483897269, "grad_norm": 0.19648021459579468, "learning_rate": 9.108914254654165e-05, "loss": 4.0106, "step": 10500 }, { "epoch": 0.7137518684603886, "grad_norm": 0.20270267128944397, "learning_rate": 9.108489604565839e-05, "loss": 4.0734, "step": 10505 }, { "epoch": 0.7140915885310504, "grad_norm": 0.1632177233695984, "learning_rate": 9.10806495447751e-05, "loss": 4.2488, "step": 10510 }, { "epoch": 0.7144313086017122, "grad_norm": 0.1604064404964447, "learning_rate": 9.107640304389183e-05, "loss": 3.8896, "step": 10515 }, { "epoch": 0.714771028672374, "grad_norm": 0.21193253993988037, "learning_rate": 9.107215654300858e-05, "loss": 3.9983, "step": 10520 }, { "epoch": 0.7151107487430357, "grad_norm": 0.3716839551925659, "learning_rate": 9.106791004212529e-05, "loss": 3.9367, "step": 10525 }, { "epoch": 0.7154504688136976, "grad_norm": 0.1587960124015808, "learning_rate": 9.106366354124202e-05, "loss": 3.7641, "step": 10530 }, { "epoch": 0.7157901888843593, "grad_norm": 4.4356184005737305, "learning_rate": 9.105941704035875e-05, "loss": 4.0203, "step": 10535 }, { "epoch": 0.716129908955021, "grad_norm": 0.2456178516149521, "learning_rate": 9.105517053947547e-05, "loss": 4.0491, "step": 10540 }, { "epoch": 0.7164696290256828, "grad_norm": 0.5795451402664185, "learning_rate": 9.10509240385922e-05, "loss": 3.8471, "step": 10545 }, { "epoch": 0.7168093490963446, "grad_norm": 0.20285823941230774, "learning_rate": 9.104667753770893e-05, "loss": 4.1314, "step": 10550 }, { "epoch": 0.7171490691670064, "grad_norm": 0.16531841456890106, "learning_rate": 9.104243103682566e-05, "loss": 4.1928, "step": 10555 }, { "epoch": 0.7174887892376681, "grad_norm": 0.1814994513988495, "learning_rate": 9.103818453594239e-05, "loss": 4.0319, "step": 10560 }, { "epoch": 0.71782850930833, "grad_norm": 0.16023671627044678, "learning_rate": 9.103393803505911e-05, "loss": 4.1279, "step": 10565 }, { "epoch": 0.7181682293789917, "grad_norm": 0.2011050581932068, "learning_rate": 9.102969153417584e-05, "loss": 4.0957, "step": 10570 }, { "epoch": 0.7185079494496535, "grad_norm": 0.21786335110664368, "learning_rate": 9.102544503329257e-05, "loss": 3.9609, "step": 10575 }, { "epoch": 0.7188476695203153, "grad_norm": 0.22446627914905548, "learning_rate": 9.10211985324093e-05, "loss": 3.9967, "step": 10580 }, { "epoch": 0.719187389590977, "grad_norm": 0.301862508058548, "learning_rate": 9.101695203152603e-05, "loss": 3.7026, "step": 10585 }, { "epoch": 0.7195271096616388, "grad_norm": 0.19402875006198883, "learning_rate": 9.101270553064275e-05, "loss": 4.1842, "step": 10590 }, { "epoch": 0.7198668297323005, "grad_norm": 0.2339441329240799, "learning_rate": 9.100845902975948e-05, "loss": 3.8018, "step": 10595 }, { "epoch": 0.7202065498029624, "grad_norm": 0.1565333753824234, "learning_rate": 9.100421252887621e-05, "loss": 3.8915, "step": 10600 }, { "epoch": 0.7205462698736241, "grad_norm": 0.16925933957099915, "learning_rate": 9.099996602799294e-05, "loss": 4.1202, "step": 10605 }, { "epoch": 0.7208859899442859, "grad_norm": 0.13388794660568237, "learning_rate": 9.099571952710967e-05, "loss": 3.9878, "step": 10610 }, { "epoch": 0.7212257100149477, "grad_norm": 0.15455901622772217, "learning_rate": 9.09914730262264e-05, "loss": 3.9719, "step": 10615 }, { "epoch": 0.7215654300856095, "grad_norm": 0.19704410433769226, "learning_rate": 9.098722652534312e-05, "loss": 3.9981, "step": 10620 }, { "epoch": 0.7219051501562712, "grad_norm": 0.1932808756828308, "learning_rate": 9.098298002445985e-05, "loss": 4.0945, "step": 10625 }, { "epoch": 0.722244870226933, "grad_norm": 0.2587969899177551, "learning_rate": 9.097873352357658e-05, "loss": 4.0047, "step": 10630 }, { "epoch": 0.7225845902975948, "grad_norm": 0.24855893850326538, "learning_rate": 9.09744870226933e-05, "loss": 3.927, "step": 10635 }, { "epoch": 0.7229243103682566, "grad_norm": 0.2058570235967636, "learning_rate": 9.097024052181003e-05, "loss": 4.0545, "step": 10640 }, { "epoch": 0.7232640304389183, "grad_norm": 0.20533056557178497, "learning_rate": 9.096599402092676e-05, "loss": 4.0696, "step": 10645 }, { "epoch": 0.7236037505095801, "grad_norm": 0.2629019320011139, "learning_rate": 9.096174752004349e-05, "loss": 3.9769, "step": 10650 }, { "epoch": 0.7239434705802419, "grad_norm": 0.2127770483493805, "learning_rate": 9.095750101916022e-05, "loss": 3.9245, "step": 10655 }, { "epoch": 0.7242831906509036, "grad_norm": 0.1867874562740326, "learning_rate": 9.095325451827695e-05, "loss": 3.5818, "step": 10660 }, { "epoch": 0.7246229107215655, "grad_norm": 0.25175002217292786, "learning_rate": 9.094900801739366e-05, "loss": 4.0028, "step": 10665 }, { "epoch": 0.7249626307922272, "grad_norm": 0.18207040429115295, "learning_rate": 9.09447615165104e-05, "loss": 3.9293, "step": 10670 }, { "epoch": 0.725302350862889, "grad_norm": 0.2175348699092865, "learning_rate": 9.094051501562713e-05, "loss": 4.116, "step": 10675 }, { "epoch": 0.7256420709335507, "grad_norm": 0.1736600250005722, "learning_rate": 9.093626851474386e-05, "loss": 4.0422, "step": 10680 }, { "epoch": 0.7259817910042126, "grad_norm": 0.2036193609237671, "learning_rate": 9.093202201386059e-05, "loss": 4.0416, "step": 10685 }, { "epoch": 0.7263215110748743, "grad_norm": 0.5849753618240356, "learning_rate": 9.092777551297731e-05, "loss": 3.9602, "step": 10690 }, { "epoch": 0.726661231145536, "grad_norm": 0.15708011388778687, "learning_rate": 9.092352901209404e-05, "loss": 3.9746, "step": 10695 }, { "epoch": 0.7270009512161979, "grad_norm": 0.21586155891418457, "learning_rate": 9.091928251121077e-05, "loss": 4.1214, "step": 10700 }, { "epoch": 0.7273406712868596, "grad_norm": 0.19140325486660004, "learning_rate": 9.09150360103275e-05, "loss": 4.0866, "step": 10705 }, { "epoch": 0.7276803913575214, "grad_norm": 0.6538243889808655, "learning_rate": 9.091078950944423e-05, "loss": 4.1316, "step": 10710 }, { "epoch": 0.7280201114281831, "grad_norm": 0.1779155284166336, "learning_rate": 9.090654300856095e-05, "loss": 4.0354, "step": 10715 }, { "epoch": 0.728359831498845, "grad_norm": 0.24357867240905762, "learning_rate": 9.090229650767768e-05, "loss": 3.7766, "step": 10720 }, { "epoch": 0.7286995515695067, "grad_norm": 0.15026213228702545, "learning_rate": 9.089805000679441e-05, "loss": 3.8955, "step": 10725 }, { "epoch": 0.7290392716401685, "grad_norm": 0.19349145889282227, "learning_rate": 9.089380350591114e-05, "loss": 3.7735, "step": 10730 }, { "epoch": 0.7293789917108303, "grad_norm": 0.25646016001701355, "learning_rate": 9.088955700502785e-05, "loss": 4.0811, "step": 10735 }, { "epoch": 0.7297187117814921, "grad_norm": 0.17351272702217102, "learning_rate": 9.08853105041446e-05, "loss": 4.1555, "step": 10740 }, { "epoch": 0.7300584318521538, "grad_norm": 0.17100609838962555, "learning_rate": 9.088106400326132e-05, "loss": 4.0126, "step": 10745 }, { "epoch": 0.7303981519228157, "grad_norm": 0.25536659359931946, "learning_rate": 9.087681750237804e-05, "loss": 4.0176, "step": 10750 }, { "epoch": 0.7307378719934774, "grad_norm": 0.2601194977760315, "learning_rate": 9.087257100149478e-05, "loss": 4.0722, "step": 10755 }, { "epoch": 0.7310775920641391, "grad_norm": 0.19794826209545135, "learning_rate": 9.08683245006115e-05, "loss": 4.2004, "step": 10760 }, { "epoch": 0.7314173121348009, "grad_norm": 0.2230055183172226, "learning_rate": 9.086407799972822e-05, "loss": 3.9865, "step": 10765 }, { "epoch": 0.7317570322054627, "grad_norm": 0.24480870366096497, "learning_rate": 9.085983149884496e-05, "loss": 3.7441, "step": 10770 }, { "epoch": 0.7320967522761245, "grad_norm": 0.15868893265724182, "learning_rate": 9.085558499796169e-05, "loss": 4.0704, "step": 10775 }, { "epoch": 0.7324364723467862, "grad_norm": 0.3648226857185364, "learning_rate": 9.08513384970784e-05, "loss": 4.0741, "step": 10780 }, { "epoch": 0.7327761924174481, "grad_norm": 1.1779170036315918, "learning_rate": 9.084709199619515e-05, "loss": 4.0508, "step": 10785 }, { "epoch": 0.7331159124881098, "grad_norm": 0.5466019511222839, "learning_rate": 9.084284549531187e-05, "loss": 3.7824, "step": 10790 }, { "epoch": 0.7334556325587716, "grad_norm": 0.2697416841983795, "learning_rate": 9.083859899442859e-05, "loss": 3.8887, "step": 10795 }, { "epoch": 0.7337953526294333, "grad_norm": 0.1877298504114151, "learning_rate": 9.083435249354533e-05, "loss": 4.0495, "step": 10800 }, { "epoch": 0.7341350727000951, "grad_norm": 0.21535362303256989, "learning_rate": 9.083010599266204e-05, "loss": 4.1075, "step": 10805 }, { "epoch": 0.7344747927707569, "grad_norm": 0.15432433784008026, "learning_rate": 9.082585949177877e-05, "loss": 4.1321, "step": 10810 }, { "epoch": 0.7348145128414186, "grad_norm": 0.17613931000232697, "learning_rate": 9.082161299089551e-05, "loss": 4.0152, "step": 10815 }, { "epoch": 0.7351542329120805, "grad_norm": 0.17943201959133148, "learning_rate": 9.081736649001223e-05, "loss": 4.0291, "step": 10820 }, { "epoch": 0.7354939529827422, "grad_norm": 0.17358125746250153, "learning_rate": 9.081311998912896e-05, "loss": 4.0228, "step": 10825 }, { "epoch": 0.735833673053404, "grad_norm": 0.16327157616615295, "learning_rate": 9.08088734882457e-05, "loss": 3.8393, "step": 10830 }, { "epoch": 0.7361733931240658, "grad_norm": 0.38127797842025757, "learning_rate": 9.080462698736241e-05, "loss": 4.0771, "step": 10835 }, { "epoch": 0.7365131131947276, "grad_norm": 0.17917278409004211, "learning_rate": 9.080038048647914e-05, "loss": 3.9258, "step": 10840 }, { "epoch": 0.7368528332653893, "grad_norm": 0.1688838005065918, "learning_rate": 9.079613398559588e-05, "loss": 3.8387, "step": 10845 }, { "epoch": 0.737192553336051, "grad_norm": 0.1823907047510147, "learning_rate": 9.07918874847126e-05, "loss": 3.9959, "step": 10850 }, { "epoch": 0.7375322734067129, "grad_norm": 0.20357432961463928, "learning_rate": 9.078764098382932e-05, "loss": 3.99, "step": 10855 }, { "epoch": 0.7378719934773746, "grad_norm": 0.18823125958442688, "learning_rate": 9.078339448294607e-05, "loss": 4.0908, "step": 10860 }, { "epoch": 0.7382117135480364, "grad_norm": 0.16174191236495972, "learning_rate": 9.077914798206278e-05, "loss": 4.0641, "step": 10865 }, { "epoch": 0.7385514336186982, "grad_norm": 0.1720336228609085, "learning_rate": 9.077490148117951e-05, "loss": 3.8454, "step": 10870 }, { "epoch": 0.73889115368936, "grad_norm": 0.23603537678718567, "learning_rate": 9.077065498029625e-05, "loss": 3.9752, "step": 10875 }, { "epoch": 0.7392308737600217, "grad_norm": 0.20614124834537506, "learning_rate": 9.076640847941296e-05, "loss": 4.045, "step": 10880 }, { "epoch": 0.7395705938306835, "grad_norm": 0.30947062373161316, "learning_rate": 9.076216197852969e-05, "loss": 3.9429, "step": 10885 }, { "epoch": 0.7399103139013453, "grad_norm": 0.2017713338136673, "learning_rate": 9.075791547764642e-05, "loss": 3.9205, "step": 10890 }, { "epoch": 0.7402500339720071, "grad_norm": 1.3917611837387085, "learning_rate": 9.075366897676315e-05, "loss": 4.0456, "step": 10895 }, { "epoch": 0.7405897540426688, "grad_norm": 0.4103597104549408, "learning_rate": 9.074942247587988e-05, "loss": 3.9107, "step": 10900 }, { "epoch": 0.7409294741133307, "grad_norm": 0.5144510269165039, "learning_rate": 9.07451759749966e-05, "loss": 4.0178, "step": 10905 }, { "epoch": 0.7412691941839924, "grad_norm": 0.16965581476688385, "learning_rate": 9.074092947411333e-05, "loss": 3.9729, "step": 10910 }, { "epoch": 0.7416089142546541, "grad_norm": 0.38037505745887756, "learning_rate": 9.073668297323006e-05, "loss": 4.2043, "step": 10915 }, { "epoch": 0.741948634325316, "grad_norm": 0.26255086064338684, "learning_rate": 9.073243647234679e-05, "loss": 3.8663, "step": 10920 }, { "epoch": 0.7422883543959777, "grad_norm": 0.3262099623680115, "learning_rate": 9.072818997146352e-05, "loss": 4.0803, "step": 10925 }, { "epoch": 0.7426280744666395, "grad_norm": 0.21773168444633484, "learning_rate": 9.072394347058024e-05, "loss": 3.9992, "step": 10930 }, { "epoch": 0.7429677945373012, "grad_norm": 0.22857216000556946, "learning_rate": 9.071969696969697e-05, "loss": 3.8582, "step": 10935 }, { "epoch": 0.7433075146079631, "grad_norm": 0.312259316444397, "learning_rate": 9.07154504688137e-05, "loss": 4.0318, "step": 10940 }, { "epoch": 0.7436472346786248, "grad_norm": 0.1695690155029297, "learning_rate": 9.071120396793043e-05, "loss": 3.9077, "step": 10945 }, { "epoch": 0.7439869547492866, "grad_norm": 0.29498061537742615, "learning_rate": 9.070695746704716e-05, "loss": 4.3087, "step": 10950 }, { "epoch": 0.7443266748199484, "grad_norm": 0.24566805362701416, "learning_rate": 9.070271096616388e-05, "loss": 3.8372, "step": 10955 }, { "epoch": 0.7446663948906102, "grad_norm": 0.163113072514534, "learning_rate": 9.069846446528061e-05, "loss": 4.0402, "step": 10960 }, { "epoch": 0.7450061149612719, "grad_norm": 0.18011754751205444, "learning_rate": 9.069421796439734e-05, "loss": 4.1377, "step": 10965 }, { "epoch": 0.7453458350319336, "grad_norm": 0.8807979822158813, "learning_rate": 9.068997146351407e-05, "loss": 3.8495, "step": 10970 }, { "epoch": 0.7456855551025955, "grad_norm": 0.22865957021713257, "learning_rate": 9.06857249626308e-05, "loss": 4.1146, "step": 10975 }, { "epoch": 0.7460252751732572, "grad_norm": 0.2118086814880371, "learning_rate": 9.068147846174752e-05, "loss": 3.8964, "step": 10980 }, { "epoch": 0.746364995243919, "grad_norm": 0.18207617104053497, "learning_rate": 9.067723196086425e-05, "loss": 3.9706, "step": 10985 }, { "epoch": 0.7467047153145808, "grad_norm": 0.16859905421733856, "learning_rate": 9.067298545998098e-05, "loss": 3.7855, "step": 10990 }, { "epoch": 0.7470444353852426, "grad_norm": 0.16500097513198853, "learning_rate": 9.066873895909771e-05, "loss": 3.7684, "step": 10995 }, { "epoch": 0.7473841554559043, "grad_norm": 0.1520179957151413, "learning_rate": 9.066449245821444e-05, "loss": 3.8084, "step": 11000 }, { "epoch": 0.7477238755265662, "grad_norm": 0.21755331754684448, "learning_rate": 9.066024595733115e-05, "loss": 3.8863, "step": 11005 }, { "epoch": 0.7480635955972279, "grad_norm": 0.20671890676021576, "learning_rate": 9.065599945644789e-05, "loss": 3.9006, "step": 11010 }, { "epoch": 0.7484033156678896, "grad_norm": 0.16787393391132355, "learning_rate": 9.065175295556462e-05, "loss": 3.7611, "step": 11015 }, { "epoch": 0.7487430357385514, "grad_norm": 0.22157283127307892, "learning_rate": 9.064750645468135e-05, "loss": 4.1048, "step": 11020 }, { "epoch": 0.7490827558092132, "grad_norm": 0.22022277116775513, "learning_rate": 9.064325995379808e-05, "loss": 3.8746, "step": 11025 }, { "epoch": 0.749422475879875, "grad_norm": 0.2435934692621231, "learning_rate": 9.06390134529148e-05, "loss": 3.9949, "step": 11030 }, { "epoch": 0.7497621959505367, "grad_norm": 0.18187767267227173, "learning_rate": 9.063476695203153e-05, "loss": 4.0584, "step": 11035 }, { "epoch": 0.7501019160211986, "grad_norm": 0.18477857112884521, "learning_rate": 9.063052045114826e-05, "loss": 4.1217, "step": 11040 }, { "epoch": 0.7504416360918603, "grad_norm": 0.1471758335828781, "learning_rate": 9.062627395026499e-05, "loss": 4.0513, "step": 11045 }, { "epoch": 0.7507813561625221, "grad_norm": 0.20632903277873993, "learning_rate": 9.062202744938172e-05, "loss": 3.9865, "step": 11050 }, { "epoch": 0.7511210762331838, "grad_norm": 0.21105721592903137, "learning_rate": 9.061778094849844e-05, "loss": 3.8277, "step": 11055 }, { "epoch": 0.7514607963038457, "grad_norm": 0.19280456006526947, "learning_rate": 9.061353444761517e-05, "loss": 4.0091, "step": 11060 }, { "epoch": 0.7518005163745074, "grad_norm": 0.1918146163225174, "learning_rate": 9.06092879467319e-05, "loss": 4.0648, "step": 11065 }, { "epoch": 0.7521402364451691, "grad_norm": 0.22963494062423706, "learning_rate": 9.060504144584863e-05, "loss": 3.9061, "step": 11070 }, { "epoch": 0.752479956515831, "grad_norm": 0.16479997336864471, "learning_rate": 9.060079494496536e-05, "loss": 4.0303, "step": 11075 }, { "epoch": 0.7528196765864927, "grad_norm": 0.18432816863059998, "learning_rate": 9.059654844408208e-05, "loss": 3.8773, "step": 11080 }, { "epoch": 0.7531593966571545, "grad_norm": 0.22336050868034363, "learning_rate": 9.059230194319881e-05, "loss": 3.7997, "step": 11085 }, { "epoch": 0.7534991167278163, "grad_norm": 0.242068812251091, "learning_rate": 9.058805544231553e-05, "loss": 3.9268, "step": 11090 }, { "epoch": 0.7538388367984781, "grad_norm": 0.14753904938697815, "learning_rate": 9.058380894143227e-05, "loss": 4.0985, "step": 11095 }, { "epoch": 0.7541785568691398, "grad_norm": 0.19245490431785583, "learning_rate": 9.0579562440549e-05, "loss": 3.8995, "step": 11100 }, { "epoch": 0.7545182769398016, "grad_norm": 0.18615277111530304, "learning_rate": 9.057531593966571e-05, "loss": 4.0072, "step": 11105 }, { "epoch": 0.7548579970104634, "grad_norm": 0.19581812620162964, "learning_rate": 9.057106943878245e-05, "loss": 3.8758, "step": 11110 }, { "epoch": 0.7551977170811252, "grad_norm": 0.15949614346027374, "learning_rate": 9.056682293789918e-05, "loss": 3.9473, "step": 11115 }, { "epoch": 0.7555374371517869, "grad_norm": 1.359558343887329, "learning_rate": 9.05625764370159e-05, "loss": 3.9416, "step": 11120 }, { "epoch": 0.7558771572224487, "grad_norm": 0.1593676656484604, "learning_rate": 9.055832993613264e-05, "loss": 4.0518, "step": 11125 }, { "epoch": 0.7562168772931105, "grad_norm": 0.1715662181377411, "learning_rate": 9.055408343524937e-05, "loss": 3.982, "step": 11130 }, { "epoch": 0.7565565973637722, "grad_norm": 0.1934783011674881, "learning_rate": 9.054983693436608e-05, "loss": 4.1628, "step": 11135 }, { "epoch": 0.756896317434434, "grad_norm": 0.16365660727024078, "learning_rate": 9.054559043348282e-05, "loss": 4.0758, "step": 11140 }, { "epoch": 0.7572360375050958, "grad_norm": 0.2995030879974365, "learning_rate": 9.054134393259955e-05, "loss": 3.915, "step": 11145 }, { "epoch": 0.7575757575757576, "grad_norm": 0.22450530529022217, "learning_rate": 9.053709743171626e-05, "loss": 4.2514, "step": 11150 }, { "epoch": 0.7579154776464193, "grad_norm": 3.4520628452301025, "learning_rate": 9.0532850930833e-05, "loss": 4.0989, "step": 11155 }, { "epoch": 0.7582551977170812, "grad_norm": 0.31930363178253174, "learning_rate": 9.052860442994972e-05, "loss": 4.0048, "step": 11160 }, { "epoch": 0.7585949177877429, "grad_norm": 0.21884280443191528, "learning_rate": 9.052435792906645e-05, "loss": 3.8112, "step": 11165 }, { "epoch": 0.7589346378584046, "grad_norm": 0.1630697101354599, "learning_rate": 9.052011142818319e-05, "loss": 3.9614, "step": 11170 }, { "epoch": 0.7592743579290665, "grad_norm": 0.2021295428276062, "learning_rate": 9.05158649272999e-05, "loss": 3.9433, "step": 11175 }, { "epoch": 0.7596140779997282, "grad_norm": 0.21674887835979462, "learning_rate": 9.051161842641663e-05, "loss": 3.9271, "step": 11180 }, { "epoch": 0.75995379807039, "grad_norm": 0.2624286413192749, "learning_rate": 9.050737192553337e-05, "loss": 3.8687, "step": 11185 }, { "epoch": 0.7602935181410517, "grad_norm": 0.15679609775543213, "learning_rate": 9.050312542465009e-05, "loss": 4.0386, "step": 11190 }, { "epoch": 0.7606332382117136, "grad_norm": 0.298999547958374, "learning_rate": 9.049887892376682e-05, "loss": 4.039, "step": 11195 }, { "epoch": 0.7609729582823753, "grad_norm": 0.1916528344154358, "learning_rate": 9.049463242288356e-05, "loss": 3.9924, "step": 11200 }, { "epoch": 0.7613126783530371, "grad_norm": 0.20813806354999542, "learning_rate": 9.049038592200027e-05, "loss": 3.9728, "step": 11205 }, { "epoch": 0.7616523984236989, "grad_norm": 0.26955127716064453, "learning_rate": 9.0486139421117e-05, "loss": 3.9563, "step": 11210 }, { "epoch": 0.7619921184943607, "grad_norm": 0.22610753774642944, "learning_rate": 9.048189292023374e-05, "loss": 3.9385, "step": 11215 }, { "epoch": 0.7623318385650224, "grad_norm": 0.1515149027109146, "learning_rate": 9.047764641935046e-05, "loss": 4.0009, "step": 11220 }, { "epoch": 0.7626715586356841, "grad_norm": 0.15270265936851501, "learning_rate": 9.047339991846718e-05, "loss": 3.8132, "step": 11225 }, { "epoch": 0.763011278706346, "grad_norm": 0.256085067987442, "learning_rate": 9.046915341758391e-05, "loss": 4.1206, "step": 11230 }, { "epoch": 0.7633509987770077, "grad_norm": 0.19662001729011536, "learning_rate": 9.046490691670064e-05, "loss": 4.0799, "step": 11235 }, { "epoch": 0.7636907188476695, "grad_norm": 0.18421690165996552, "learning_rate": 9.046066041581737e-05, "loss": 4.0328, "step": 11240 }, { "epoch": 0.7640304389183313, "grad_norm": 0.19748954474925995, "learning_rate": 9.04564139149341e-05, "loss": 3.7995, "step": 11245 }, { "epoch": 0.7643701589889931, "grad_norm": 0.15954630076885223, "learning_rate": 9.045216741405082e-05, "loss": 3.8286, "step": 11250 }, { "epoch": 0.7647098790596548, "grad_norm": 0.5489984154701233, "learning_rate": 9.044792091316755e-05, "loss": 4.2052, "step": 11255 }, { "epoch": 0.7650495991303167, "grad_norm": 0.44495344161987305, "learning_rate": 9.044367441228428e-05, "loss": 4.1036, "step": 11260 }, { "epoch": 0.7653893192009784, "grad_norm": 0.16555814445018768, "learning_rate": 9.043942791140101e-05, "loss": 3.9737, "step": 11265 }, { "epoch": 0.7657290392716402, "grad_norm": 0.21692036092281342, "learning_rate": 9.043518141051774e-05, "loss": 4.0437, "step": 11270 }, { "epoch": 0.7660687593423019, "grad_norm": 1.8791022300720215, "learning_rate": 9.043093490963446e-05, "loss": 4.0974, "step": 11275 }, { "epoch": 0.7664084794129638, "grad_norm": 0.19241303205490112, "learning_rate": 9.042668840875119e-05, "loss": 3.9209, "step": 11280 }, { "epoch": 0.7667481994836255, "grad_norm": 0.17803336679935455, "learning_rate": 9.042244190786792e-05, "loss": 3.842, "step": 11285 }, { "epoch": 0.7670879195542872, "grad_norm": 0.20129168033599854, "learning_rate": 9.041819540698465e-05, "loss": 4.0462, "step": 11290 }, { "epoch": 0.7674276396249491, "grad_norm": 0.18283264338970184, "learning_rate": 9.041394890610138e-05, "loss": 3.9094, "step": 11295 }, { "epoch": 0.7677673596956108, "grad_norm": 0.20721754431724548, "learning_rate": 9.04097024052181e-05, "loss": 3.9013, "step": 11300 }, { "epoch": 0.7681070797662726, "grad_norm": 0.43089064955711365, "learning_rate": 9.040545590433483e-05, "loss": 3.964, "step": 11305 }, { "epoch": 0.7684467998369343, "grad_norm": 0.24873760342597961, "learning_rate": 9.040120940345156e-05, "loss": 4.0062, "step": 11310 }, { "epoch": 0.7687865199075962, "grad_norm": 0.22243714332580566, "learning_rate": 9.039696290256829e-05, "loss": 3.8801, "step": 11315 }, { "epoch": 0.7691262399782579, "grad_norm": 0.41750073432922363, "learning_rate": 9.039271640168502e-05, "loss": 4.2682, "step": 11320 }, { "epoch": 0.7694659600489197, "grad_norm": 0.369742214679718, "learning_rate": 9.038846990080174e-05, "loss": 3.8898, "step": 11325 }, { "epoch": 0.7698056801195815, "grad_norm": 0.26420828700065613, "learning_rate": 9.038422339991847e-05, "loss": 3.9038, "step": 11330 }, { "epoch": 0.7701454001902432, "grad_norm": 0.2597283720970154, "learning_rate": 9.03799768990352e-05, "loss": 3.9406, "step": 11335 }, { "epoch": 0.770485120260905, "grad_norm": 0.17518769204616547, "learning_rate": 9.037573039815193e-05, "loss": 3.6741, "step": 11340 }, { "epoch": 0.7708248403315668, "grad_norm": 0.6777191758155823, "learning_rate": 9.037148389726866e-05, "loss": 4.0673, "step": 11345 }, { "epoch": 0.7711645604022286, "grad_norm": 0.201960951089859, "learning_rate": 9.036723739638538e-05, "loss": 4.0074, "step": 11350 }, { "epoch": 0.7715042804728903, "grad_norm": 0.4381665587425232, "learning_rate": 9.036299089550211e-05, "loss": 3.8382, "step": 11355 }, { "epoch": 0.7718440005435521, "grad_norm": 0.1966671347618103, "learning_rate": 9.035874439461884e-05, "loss": 4.1766, "step": 11360 }, { "epoch": 0.7721837206142139, "grad_norm": 0.16876500844955444, "learning_rate": 9.035449789373557e-05, "loss": 3.9402, "step": 11365 }, { "epoch": 0.7725234406848757, "grad_norm": 4.147640705108643, "learning_rate": 9.03502513928523e-05, "loss": 4.109, "step": 11370 }, { "epoch": 0.7728631607555374, "grad_norm": 0.2072206437587738, "learning_rate": 9.034600489196902e-05, "loss": 4.0296, "step": 11375 }, { "epoch": 0.7732028808261993, "grad_norm": 0.2016468346118927, "learning_rate": 9.034175839108575e-05, "loss": 3.7331, "step": 11380 }, { "epoch": 0.773542600896861, "grad_norm": 0.47726747393608093, "learning_rate": 9.033751189020248e-05, "loss": 4.0178, "step": 11385 }, { "epoch": 0.7738823209675227, "grad_norm": 0.17172425985336304, "learning_rate": 9.033326538931921e-05, "loss": 4.1782, "step": 11390 }, { "epoch": 0.7742220410381845, "grad_norm": 0.1911281943321228, "learning_rate": 9.032901888843594e-05, "loss": 3.9972, "step": 11395 }, { "epoch": 0.7745617611088463, "grad_norm": 0.14127899706363678, "learning_rate": 9.032477238755266e-05, "loss": 3.9405, "step": 11400 }, { "epoch": 0.7749014811795081, "grad_norm": 0.1841440349817276, "learning_rate": 9.032052588666939e-05, "loss": 4.0964, "step": 11405 }, { "epoch": 0.7752412012501698, "grad_norm": 0.17826926708221436, "learning_rate": 9.031627938578612e-05, "loss": 4.2658, "step": 11410 }, { "epoch": 0.7755809213208317, "grad_norm": 0.2882947325706482, "learning_rate": 9.031203288490285e-05, "loss": 4.1508, "step": 11415 }, { "epoch": 0.7759206413914934, "grad_norm": 0.21743571758270264, "learning_rate": 9.030778638401958e-05, "loss": 4.1659, "step": 11420 }, { "epoch": 0.7762603614621552, "grad_norm": 0.23792824149131775, "learning_rate": 9.03035398831363e-05, "loss": 3.8291, "step": 11425 }, { "epoch": 0.776600081532817, "grad_norm": 0.2318025827407837, "learning_rate": 9.029929338225302e-05, "loss": 3.8444, "step": 11430 }, { "epoch": 0.7769398016034788, "grad_norm": 0.17949531972408295, "learning_rate": 9.029504688136976e-05, "loss": 3.9841, "step": 11435 }, { "epoch": 0.7772795216741405, "grad_norm": 0.22435292601585388, "learning_rate": 9.029080038048649e-05, "loss": 3.9821, "step": 11440 }, { "epoch": 0.7776192417448022, "grad_norm": 0.1865406632423401, "learning_rate": 9.02865538796032e-05, "loss": 3.935, "step": 11445 }, { "epoch": 0.7779589618154641, "grad_norm": 0.2090293914079666, "learning_rate": 9.028230737871994e-05, "loss": 3.9991, "step": 11450 }, { "epoch": 0.7782986818861258, "grad_norm": 0.18024842441082, "learning_rate": 9.027806087783667e-05, "loss": 4.076, "step": 11455 }, { "epoch": 0.7786384019567876, "grad_norm": 0.17997018992900848, "learning_rate": 9.027381437695339e-05, "loss": 3.7585, "step": 11460 }, { "epoch": 0.7789781220274494, "grad_norm": 0.16544857621192932, "learning_rate": 9.026956787607013e-05, "loss": 4.2184, "step": 11465 }, { "epoch": 0.7793178420981112, "grad_norm": 0.17606359720230103, "learning_rate": 9.026532137518686e-05, "loss": 3.9412, "step": 11470 }, { "epoch": 0.7796575621687729, "grad_norm": 0.2205812931060791, "learning_rate": 9.026107487430357e-05, "loss": 4.0843, "step": 11475 }, { "epoch": 0.7799972822394347, "grad_norm": 0.25740867853164673, "learning_rate": 9.025682837342031e-05, "loss": 3.9544, "step": 11480 }, { "epoch": 0.7803370023100965, "grad_norm": 0.14909543097019196, "learning_rate": 9.025258187253704e-05, "loss": 3.8342, "step": 11485 }, { "epoch": 0.7806767223807582, "grad_norm": 0.24682089686393738, "learning_rate": 9.024833537165375e-05, "loss": 3.9944, "step": 11490 }, { "epoch": 0.78101644245142, "grad_norm": 0.15707463026046753, "learning_rate": 9.02440888707705e-05, "loss": 3.9675, "step": 11495 }, { "epoch": 0.7813561625220818, "grad_norm": 0.22718797624111176, "learning_rate": 9.023984236988722e-05, "loss": 3.6624, "step": 11500 }, { "epoch": 0.7816958825927436, "grad_norm": 0.15948626399040222, "learning_rate": 9.023559586900394e-05, "loss": 3.9947, "step": 11505 }, { "epoch": 0.7820356026634053, "grad_norm": 0.16061913967132568, "learning_rate": 9.023134936812068e-05, "loss": 4.1845, "step": 11510 }, { "epoch": 0.7823753227340672, "grad_norm": 0.25919288396835327, "learning_rate": 9.02271028672374e-05, "loss": 3.9301, "step": 11515 }, { "epoch": 0.7827150428047289, "grad_norm": 0.21657872200012207, "learning_rate": 9.022285636635412e-05, "loss": 4.0299, "step": 11520 }, { "epoch": 0.7830547628753907, "grad_norm": 0.18826550245285034, "learning_rate": 9.021860986547086e-05, "loss": 4.212, "step": 11525 }, { "epoch": 0.7833944829460524, "grad_norm": 0.2549474835395813, "learning_rate": 9.021436336458758e-05, "loss": 4.0705, "step": 11530 }, { "epoch": 0.7837342030167143, "grad_norm": 0.6155955195426941, "learning_rate": 9.02101168637043e-05, "loss": 3.79, "step": 11535 }, { "epoch": 0.784073923087376, "grad_norm": 0.1635499894618988, "learning_rate": 9.020587036282105e-05, "loss": 4.1024, "step": 11540 }, { "epoch": 0.7844136431580377, "grad_norm": 0.15726587176322937, "learning_rate": 9.020162386193776e-05, "loss": 3.9499, "step": 11545 }, { "epoch": 0.7847533632286996, "grad_norm": 0.16258913278579712, "learning_rate": 9.019737736105449e-05, "loss": 3.9322, "step": 11550 }, { "epoch": 0.7850930832993613, "grad_norm": 0.2376587688922882, "learning_rate": 9.019313086017123e-05, "loss": 3.9946, "step": 11555 }, { "epoch": 0.7854328033700231, "grad_norm": 0.1641000360250473, "learning_rate": 9.018888435928795e-05, "loss": 4.1131, "step": 11560 }, { "epoch": 0.7857725234406848, "grad_norm": 0.18432609736919403, "learning_rate": 9.018463785840467e-05, "loss": 3.8617, "step": 11565 }, { "epoch": 0.7861122435113467, "grad_norm": 0.31025978922843933, "learning_rate": 9.018039135752142e-05, "loss": 3.7286, "step": 11570 }, { "epoch": 0.7864519635820084, "grad_norm": 0.18590706586837769, "learning_rate": 9.017614485663813e-05, "loss": 3.9636, "step": 11575 }, { "epoch": 0.7867916836526702, "grad_norm": 0.18814896047115326, "learning_rate": 9.017189835575486e-05, "loss": 4.3321, "step": 11580 }, { "epoch": 0.787131403723332, "grad_norm": 0.17569060623645782, "learning_rate": 9.016765185487159e-05, "loss": 4.0207, "step": 11585 }, { "epoch": 0.7874711237939938, "grad_norm": 0.8084515333175659, "learning_rate": 9.016340535398831e-05, "loss": 3.8702, "step": 11590 }, { "epoch": 0.7878108438646555, "grad_norm": 0.1719738245010376, "learning_rate": 9.015915885310504e-05, "loss": 3.9472, "step": 11595 }, { "epoch": 0.7881505639353173, "grad_norm": 0.3733132481575012, "learning_rate": 9.015491235222177e-05, "loss": 3.9621, "step": 11600 }, { "epoch": 0.7884902840059791, "grad_norm": 0.1931469440460205, "learning_rate": 9.01506658513385e-05, "loss": 4.1482, "step": 11605 }, { "epoch": 0.7888300040766408, "grad_norm": 0.6097820997238159, "learning_rate": 9.014641935045523e-05, "loss": 4.2861, "step": 11610 }, { "epoch": 0.7891697241473026, "grad_norm": 0.23092709481716156, "learning_rate": 9.014217284957195e-05, "loss": 3.7435, "step": 11615 }, { "epoch": 0.7895094442179644, "grad_norm": 0.20437659323215485, "learning_rate": 9.013792634868868e-05, "loss": 4.0259, "step": 11620 }, { "epoch": 0.7898491642886262, "grad_norm": 0.19561974704265594, "learning_rate": 9.013367984780541e-05, "loss": 3.8396, "step": 11625 }, { "epoch": 0.7901888843592879, "grad_norm": 0.22799140214920044, "learning_rate": 9.012943334692214e-05, "loss": 4.0335, "step": 11630 }, { "epoch": 0.7905286044299498, "grad_norm": 0.1820353865623474, "learning_rate": 9.012518684603887e-05, "loss": 3.9942, "step": 11635 }, { "epoch": 0.7908683245006115, "grad_norm": 0.217819482088089, "learning_rate": 9.01209403451556e-05, "loss": 4.1374, "step": 11640 }, { "epoch": 0.7912080445712733, "grad_norm": 0.20061899721622467, "learning_rate": 9.011669384427232e-05, "loss": 3.6462, "step": 11645 }, { "epoch": 0.791547764641935, "grad_norm": 0.21914707124233246, "learning_rate": 9.011244734338905e-05, "loss": 4.0377, "step": 11650 }, { "epoch": 0.7918874847125968, "grad_norm": 0.2225886732339859, "learning_rate": 9.010820084250578e-05, "loss": 4.1587, "step": 11655 }, { "epoch": 0.7922272047832586, "grad_norm": 0.23360738158226013, "learning_rate": 9.01039543416225e-05, "loss": 4.0995, "step": 11660 }, { "epoch": 0.7925669248539203, "grad_norm": 0.20647506415843964, "learning_rate": 9.009970784073923e-05, "loss": 4.0943, "step": 11665 }, { "epoch": 0.7929066449245822, "grad_norm": 0.17202545702457428, "learning_rate": 9.009546133985596e-05, "loss": 3.8883, "step": 11670 }, { "epoch": 0.7932463649952439, "grad_norm": 1.380285382270813, "learning_rate": 9.009121483897269e-05, "loss": 4.0137, "step": 11675 }, { "epoch": 0.7935860850659057, "grad_norm": 0.23098598420619965, "learning_rate": 9.008696833808942e-05, "loss": 4.0737, "step": 11680 }, { "epoch": 0.7939258051365675, "grad_norm": 0.17068329453468323, "learning_rate": 9.008272183720615e-05, "loss": 4.2746, "step": 11685 }, { "epoch": 0.7942655252072293, "grad_norm": 0.23422260582447052, "learning_rate": 9.007847533632287e-05, "loss": 3.7582, "step": 11690 }, { "epoch": 0.794605245277891, "grad_norm": 0.1885872483253479, "learning_rate": 9.00742288354396e-05, "loss": 4.0525, "step": 11695 }, { "epoch": 0.7949449653485527, "grad_norm": 0.18177750706672668, "learning_rate": 9.006998233455633e-05, "loss": 4.0782, "step": 11700 }, { "epoch": 0.7952846854192146, "grad_norm": 2.576247453689575, "learning_rate": 9.006573583367306e-05, "loss": 3.8726, "step": 11705 }, { "epoch": 0.7956244054898763, "grad_norm": 0.16896361112594604, "learning_rate": 9.006148933278979e-05, "loss": 3.8062, "step": 11710 }, { "epoch": 0.7959641255605381, "grad_norm": 0.1680668294429779, "learning_rate": 9.005724283190651e-05, "loss": 4.0136, "step": 11715 }, { "epoch": 0.7963038456311999, "grad_norm": 0.18001356720924377, "learning_rate": 9.005299633102324e-05, "loss": 4.1243, "step": 11720 }, { "epoch": 0.7966435657018617, "grad_norm": 0.19494907557964325, "learning_rate": 9.004874983013997e-05, "loss": 3.9437, "step": 11725 }, { "epoch": 0.7969832857725234, "grad_norm": 0.18916480243206024, "learning_rate": 9.00445033292567e-05, "loss": 4.0779, "step": 11730 }, { "epoch": 0.7973230058431852, "grad_norm": 0.211675226688385, "learning_rate": 9.004025682837343e-05, "loss": 3.8902, "step": 11735 }, { "epoch": 0.797662725913847, "grad_norm": 0.2676939368247986, "learning_rate": 9.003601032749015e-05, "loss": 4.0427, "step": 11740 }, { "epoch": 0.7980024459845088, "grad_norm": 0.20862559974193573, "learning_rate": 9.003176382660688e-05, "loss": 4.2145, "step": 11745 }, { "epoch": 0.7983421660551705, "grad_norm": 0.20464570820331573, "learning_rate": 9.002751732572361e-05, "loss": 4.113, "step": 11750 }, { "epoch": 0.7986818861258324, "grad_norm": 0.17028920352458954, "learning_rate": 9.002327082484034e-05, "loss": 3.6957, "step": 11755 }, { "epoch": 0.7990216061964941, "grad_norm": 0.24813637137413025, "learning_rate": 9.001902432395707e-05, "loss": 4.1226, "step": 11760 }, { "epoch": 0.7993613262671558, "grad_norm": 1.6223915815353394, "learning_rate": 9.00147778230738e-05, "loss": 3.9943, "step": 11765 }, { "epoch": 0.7997010463378177, "grad_norm": 0.1639162003993988, "learning_rate": 9.001053132219052e-05, "loss": 3.9359, "step": 11770 }, { "epoch": 0.8000407664084794, "grad_norm": 0.16888979077339172, "learning_rate": 9.000628482130725e-05, "loss": 3.8101, "step": 11775 }, { "epoch": 0.8003804864791412, "grad_norm": 0.1576785147190094, "learning_rate": 9.000203832042398e-05, "loss": 4.062, "step": 11780 }, { "epoch": 0.8007202065498029, "grad_norm": 0.19945354759693146, "learning_rate": 8.99977918195407e-05, "loss": 4.2053, "step": 11785 }, { "epoch": 0.8010599266204648, "grad_norm": 0.13953137397766113, "learning_rate": 8.999354531865743e-05, "loss": 3.6578, "step": 11790 }, { "epoch": 0.8013996466911265, "grad_norm": 0.1995120495557785, "learning_rate": 8.998929881777416e-05, "loss": 3.9994, "step": 11795 }, { "epoch": 0.8017393667617883, "grad_norm": 0.22360244393348694, "learning_rate": 8.998505231689088e-05, "loss": 4.1224, "step": 11800 }, { "epoch": 0.8020790868324501, "grad_norm": 0.20481501519680023, "learning_rate": 8.998080581600762e-05, "loss": 4.2824, "step": 11805 }, { "epoch": 0.8024188069031118, "grad_norm": 0.39974507689476013, "learning_rate": 8.997655931512435e-05, "loss": 4.1875, "step": 11810 }, { "epoch": 0.8027585269737736, "grad_norm": 0.32297125458717346, "learning_rate": 8.997231281424106e-05, "loss": 3.8753, "step": 11815 }, { "epoch": 0.8030982470444353, "grad_norm": 0.2076197862625122, "learning_rate": 8.996891561353446e-05, "loss": 4.0354, "step": 11820 }, { "epoch": 0.8034379671150972, "grad_norm": 0.17974001169204712, "learning_rate": 8.996466911265119e-05, "loss": 4.0395, "step": 11825 }, { "epoch": 0.8037776871857589, "grad_norm": 0.18468719720840454, "learning_rate": 8.99604226117679e-05, "loss": 4.0589, "step": 11830 }, { "epoch": 0.8041174072564207, "grad_norm": 0.2775026857852936, "learning_rate": 8.995617611088464e-05, "loss": 4.0899, "step": 11835 }, { "epoch": 0.8044571273270825, "grad_norm": 0.20769546926021576, "learning_rate": 8.995192961000136e-05, "loss": 4.1438, "step": 11840 }, { "epoch": 0.8047968473977443, "grad_norm": 0.18603581190109253, "learning_rate": 8.994768310911808e-05, "loss": 3.9822, "step": 11845 }, { "epoch": 0.805136567468406, "grad_norm": 0.3347295820713043, "learning_rate": 8.994343660823483e-05, "loss": 4.042, "step": 11850 }, { "epoch": 0.8054762875390679, "grad_norm": 0.26557305455207825, "learning_rate": 8.993919010735154e-05, "loss": 3.9212, "step": 11855 }, { "epoch": 0.8058160076097296, "grad_norm": 0.27433109283447266, "learning_rate": 8.993494360646827e-05, "loss": 3.7039, "step": 11860 }, { "epoch": 0.8061557276803913, "grad_norm": 0.1835566610097885, "learning_rate": 8.993069710558501e-05, "loss": 3.8467, "step": 11865 }, { "epoch": 0.8064954477510531, "grad_norm": 0.15933853387832642, "learning_rate": 8.992645060470172e-05, "loss": 4.035, "step": 11870 }, { "epoch": 0.8068351678217149, "grad_norm": 0.1779545098543167, "learning_rate": 8.992220410381845e-05, "loss": 4.0, "step": 11875 }, { "epoch": 0.8071748878923767, "grad_norm": 0.19771164655685425, "learning_rate": 8.99179576029352e-05, "loss": 3.5028, "step": 11880 }, { "epoch": 0.8075146079630384, "grad_norm": 0.17675349116325378, "learning_rate": 8.991371110205191e-05, "loss": 3.9989, "step": 11885 }, { "epoch": 0.8078543280337003, "grad_norm": 0.23120000958442688, "learning_rate": 8.990946460116864e-05, "loss": 3.7091, "step": 11890 }, { "epoch": 0.808194048104362, "grad_norm": 0.18149258196353912, "learning_rate": 8.990521810028538e-05, "loss": 3.9628, "step": 11895 }, { "epoch": 0.8085337681750238, "grad_norm": 0.2674315571784973, "learning_rate": 8.990097159940209e-05, "loss": 3.7765, "step": 11900 }, { "epoch": 0.8088734882456855, "grad_norm": 0.21212173998355865, "learning_rate": 8.989672509851883e-05, "loss": 3.8405, "step": 11905 }, { "epoch": 0.8092132083163474, "grad_norm": 0.16879509389400482, "learning_rate": 8.989247859763555e-05, "loss": 4.0809, "step": 11910 }, { "epoch": 0.8095529283870091, "grad_norm": 0.14125306904315948, "learning_rate": 8.988823209675228e-05, "loss": 4.0725, "step": 11915 }, { "epoch": 0.8098926484576708, "grad_norm": 0.1498613953590393, "learning_rate": 8.988398559586902e-05, "loss": 3.7912, "step": 11920 }, { "epoch": 0.8102323685283327, "grad_norm": 0.16456682980060577, "learning_rate": 8.987973909498573e-05, "loss": 4.0187, "step": 11925 }, { "epoch": 0.8105720885989944, "grad_norm": 0.3114604949951172, "learning_rate": 8.987549259410246e-05, "loss": 4.0009, "step": 11930 }, { "epoch": 0.8109118086696562, "grad_norm": 0.5615077018737793, "learning_rate": 8.98712460932192e-05, "loss": 4.0609, "step": 11935 }, { "epoch": 0.811251528740318, "grad_norm": 0.27753254771232605, "learning_rate": 8.986699959233592e-05, "loss": 3.8107, "step": 11940 }, { "epoch": 0.8115912488109798, "grad_norm": 0.21950267255306244, "learning_rate": 8.986275309145264e-05, "loss": 3.8093, "step": 11945 }, { "epoch": 0.8119309688816415, "grad_norm": 0.17988736927509308, "learning_rate": 8.985850659056939e-05, "loss": 3.9343, "step": 11950 }, { "epoch": 0.8122706889523033, "grad_norm": 0.23350049555301666, "learning_rate": 8.98542600896861e-05, "loss": 4.0691, "step": 11955 }, { "epoch": 0.8126104090229651, "grad_norm": 0.19277788698673248, "learning_rate": 8.985001358880283e-05, "loss": 3.9519, "step": 11960 }, { "epoch": 0.8129501290936268, "grad_norm": 0.21622268855571747, "learning_rate": 8.984576708791957e-05, "loss": 3.8689, "step": 11965 }, { "epoch": 0.8132898491642886, "grad_norm": 1.2102338075637817, "learning_rate": 8.984152058703628e-05, "loss": 3.9296, "step": 11970 }, { "epoch": 0.8136295692349504, "grad_norm": 0.2097243219614029, "learning_rate": 8.983727408615301e-05, "loss": 4.2595, "step": 11975 }, { "epoch": 0.8139692893056122, "grad_norm": 0.3595362603664398, "learning_rate": 8.983302758526974e-05, "loss": 3.9542, "step": 11980 }, { "epoch": 0.8143090093762739, "grad_norm": 0.18622025847434998, "learning_rate": 8.982878108438647e-05, "loss": 3.8882, "step": 11985 }, { "epoch": 0.8146487294469357, "grad_norm": 0.19790107011795044, "learning_rate": 8.98245345835032e-05, "loss": 4.1604, "step": 11990 }, { "epoch": 0.8149884495175975, "grad_norm": 0.21050450205802917, "learning_rate": 8.982028808261992e-05, "loss": 3.7831, "step": 11995 }, { "epoch": 0.8153281695882593, "grad_norm": 0.2178838849067688, "learning_rate": 8.981604158173665e-05, "loss": 3.8846, "step": 12000 }, { "epoch": 0.815667889658921, "grad_norm": 0.20060613751411438, "learning_rate": 8.981179508085338e-05, "loss": 4.0856, "step": 12005 }, { "epoch": 0.8160076097295829, "grad_norm": 0.19663146138191223, "learning_rate": 8.980754857997011e-05, "loss": 3.9376, "step": 12010 }, { "epoch": 0.8163473298002446, "grad_norm": 0.36938565969467163, "learning_rate": 8.980330207908684e-05, "loss": 4.121, "step": 12015 }, { "epoch": 0.8166870498709063, "grad_norm": 0.17913353443145752, "learning_rate": 8.979905557820356e-05, "loss": 3.9507, "step": 12020 }, { "epoch": 0.8170267699415682, "grad_norm": 0.18103277683258057, "learning_rate": 8.979480907732029e-05, "loss": 3.651, "step": 12025 }, { "epoch": 0.8173664900122299, "grad_norm": 0.1673816740512848, "learning_rate": 8.979056257643702e-05, "loss": 3.9309, "step": 12030 }, { "epoch": 0.8177062100828917, "grad_norm": 0.4948117434978485, "learning_rate": 8.978631607555375e-05, "loss": 3.9102, "step": 12035 }, { "epoch": 0.8180459301535534, "grad_norm": 0.16142868995666504, "learning_rate": 8.978206957467048e-05, "loss": 3.8234, "step": 12040 }, { "epoch": 0.8183856502242153, "grad_norm": 0.2791318893432617, "learning_rate": 8.97778230737872e-05, "loss": 4.1417, "step": 12045 }, { "epoch": 0.818725370294877, "grad_norm": 0.17257554829120636, "learning_rate": 8.977357657290393e-05, "loss": 4.043, "step": 12050 }, { "epoch": 0.8190650903655388, "grad_norm": 0.2123369425535202, "learning_rate": 8.976933007202066e-05, "loss": 3.9297, "step": 12055 }, { "epoch": 0.8194048104362006, "grad_norm": 0.19854533672332764, "learning_rate": 8.976508357113739e-05, "loss": 3.8441, "step": 12060 }, { "epoch": 0.8197445305068624, "grad_norm": 0.24193492531776428, "learning_rate": 8.976083707025412e-05, "loss": 4.0607, "step": 12065 }, { "epoch": 0.8200842505775241, "grad_norm": 0.1961483359336853, "learning_rate": 8.975659056937084e-05, "loss": 3.8928, "step": 12070 }, { "epoch": 0.8204239706481858, "grad_norm": 0.14383824169635773, "learning_rate": 8.975234406848757e-05, "loss": 4.0603, "step": 12075 }, { "epoch": 0.8207636907188477, "grad_norm": 0.2458658516407013, "learning_rate": 8.97480975676043e-05, "loss": 3.8334, "step": 12080 }, { "epoch": 0.8211034107895094, "grad_norm": 0.17008869349956512, "learning_rate": 8.974385106672103e-05, "loss": 3.9803, "step": 12085 }, { "epoch": 0.8214431308601712, "grad_norm": 0.20078590512275696, "learning_rate": 8.973960456583776e-05, "loss": 4.1641, "step": 12090 }, { "epoch": 0.821782850930833, "grad_norm": 0.1937909722328186, "learning_rate": 8.973535806495448e-05, "loss": 4.0954, "step": 12095 }, { "epoch": 0.8221225710014948, "grad_norm": 0.18328414857387543, "learning_rate": 8.973111156407121e-05, "loss": 3.9427, "step": 12100 }, { "epoch": 0.8224622910721565, "grad_norm": 0.2016650289297104, "learning_rate": 8.972686506318794e-05, "loss": 3.9728, "step": 12105 }, { "epoch": 0.8228020111428184, "grad_norm": 0.17548047006130219, "learning_rate": 8.972261856230465e-05, "loss": 4.1828, "step": 12110 }, { "epoch": 0.8231417312134801, "grad_norm": 0.39229270815849304, "learning_rate": 8.97183720614214e-05, "loss": 4.0582, "step": 12115 }, { "epoch": 0.8234814512841419, "grad_norm": 0.18692153692245483, "learning_rate": 8.971412556053812e-05, "loss": 3.9979, "step": 12120 }, { "epoch": 0.8238211713548036, "grad_norm": 0.22566412389278412, "learning_rate": 8.970987905965484e-05, "loss": 4.0853, "step": 12125 }, { "epoch": 0.8241608914254654, "grad_norm": 0.2699925899505615, "learning_rate": 8.970563255877158e-05, "loss": 4.2611, "step": 12130 }, { "epoch": 0.8245006114961272, "grad_norm": 0.2766724228858948, "learning_rate": 8.970138605788831e-05, "loss": 3.8676, "step": 12135 }, { "epoch": 0.8248403315667889, "grad_norm": 0.149053156375885, "learning_rate": 8.969713955700502e-05, "loss": 4.0457, "step": 12140 }, { "epoch": 0.8251800516374508, "grad_norm": 0.29666438698768616, "learning_rate": 8.969289305612176e-05, "loss": 4.0984, "step": 12145 }, { "epoch": 0.8255197717081125, "grad_norm": 0.1891719549894333, "learning_rate": 8.968864655523849e-05, "loss": 4.009, "step": 12150 }, { "epoch": 0.8258594917787743, "grad_norm": 0.1801346093416214, "learning_rate": 8.968440005435521e-05, "loss": 4.1132, "step": 12155 }, { "epoch": 0.826199211849436, "grad_norm": 1.4226734638214111, "learning_rate": 8.968015355347195e-05, "loss": 3.9887, "step": 12160 }, { "epoch": 0.8265389319200979, "grad_norm": 0.3138851523399353, "learning_rate": 8.967590705258868e-05, "loss": 4.0721, "step": 12165 }, { "epoch": 0.8268786519907596, "grad_norm": 0.19921836256980896, "learning_rate": 8.967166055170539e-05, "loss": 4.0086, "step": 12170 }, { "epoch": 0.8272183720614213, "grad_norm": 0.2232120782136917, "learning_rate": 8.966741405082213e-05, "loss": 3.8876, "step": 12175 }, { "epoch": 0.8275580921320832, "grad_norm": 0.21191275119781494, "learning_rate": 8.966316754993885e-05, "loss": 3.9509, "step": 12180 }, { "epoch": 0.8278978122027449, "grad_norm": 1.0071362257003784, "learning_rate": 8.965892104905557e-05, "loss": 4.0208, "step": 12185 }, { "epoch": 0.8282375322734067, "grad_norm": 0.30778366327285767, "learning_rate": 8.965467454817232e-05, "loss": 4.0989, "step": 12190 }, { "epoch": 0.8285772523440685, "grad_norm": 0.15304256975650787, "learning_rate": 8.965042804728903e-05, "loss": 4.1485, "step": 12195 }, { "epoch": 0.8289169724147303, "grad_norm": 0.1691897213459015, "learning_rate": 8.964618154640576e-05, "loss": 4.1178, "step": 12200 }, { "epoch": 0.829256692485392, "grad_norm": 0.2017151266336441, "learning_rate": 8.96419350455225e-05, "loss": 4.1713, "step": 12205 }, { "epoch": 0.8295964125560538, "grad_norm": 0.45046570897102356, "learning_rate": 8.963768854463921e-05, "loss": 3.8627, "step": 12210 }, { "epoch": 0.8299361326267156, "grad_norm": 0.21693216264247894, "learning_rate": 8.963344204375594e-05, "loss": 3.969, "step": 12215 }, { "epoch": 0.8302758526973774, "grad_norm": 0.22828508913516998, "learning_rate": 8.962919554287268e-05, "loss": 3.9584, "step": 12220 }, { "epoch": 0.8306155727680391, "grad_norm": 0.2518628239631653, "learning_rate": 8.96249490419894e-05, "loss": 4.0381, "step": 12225 }, { "epoch": 0.830955292838701, "grad_norm": 0.1994330883026123, "learning_rate": 8.962070254110613e-05, "loss": 3.8909, "step": 12230 }, { "epoch": 0.8312950129093627, "grad_norm": 0.1634039431810379, "learning_rate": 8.961645604022287e-05, "loss": 3.5102, "step": 12235 }, { "epoch": 0.8316347329800244, "grad_norm": 0.17670801281929016, "learning_rate": 8.961220953933958e-05, "loss": 3.8131, "step": 12240 }, { "epoch": 0.8319744530506862, "grad_norm": 0.19512879848480225, "learning_rate": 8.960796303845632e-05, "loss": 3.5444, "step": 12245 }, { "epoch": 0.832314173121348, "grad_norm": 0.5432287454605103, "learning_rate": 8.960371653757305e-05, "loss": 3.8723, "step": 12250 }, { "epoch": 0.8326538931920098, "grad_norm": 0.21648722887039185, "learning_rate": 8.959947003668977e-05, "loss": 3.7991, "step": 12255 }, { "epoch": 0.8329936132626715, "grad_norm": 0.3105649948120117, "learning_rate": 8.959522353580651e-05, "loss": 3.8407, "step": 12260 }, { "epoch": 0.8333333333333334, "grad_norm": 0.32569703459739685, "learning_rate": 8.959097703492322e-05, "loss": 4.2322, "step": 12265 }, { "epoch": 0.8336730534039951, "grad_norm": 0.2628330588340759, "learning_rate": 8.958673053403995e-05, "loss": 3.599, "step": 12270 }, { "epoch": 0.8340127734746569, "grad_norm": 0.15857040882110596, "learning_rate": 8.958248403315669e-05, "loss": 4.0897, "step": 12275 }, { "epoch": 0.8343524935453187, "grad_norm": 0.20659282803535461, "learning_rate": 8.957823753227341e-05, "loss": 3.8562, "step": 12280 }, { "epoch": 0.8346922136159804, "grad_norm": 0.21449913084506989, "learning_rate": 8.957399103139014e-05, "loss": 3.9756, "step": 12285 }, { "epoch": 0.8350319336866422, "grad_norm": 0.1901203691959381, "learning_rate": 8.956974453050688e-05, "loss": 4.025, "step": 12290 }, { "epoch": 0.8353716537573039, "grad_norm": 0.20290473103523254, "learning_rate": 8.956549802962359e-05, "loss": 3.8794, "step": 12295 }, { "epoch": 0.8357113738279658, "grad_norm": 0.1798480749130249, "learning_rate": 8.956125152874032e-05, "loss": 3.9994, "step": 12300 }, { "epoch": 0.8360510938986275, "grad_norm": 0.25437653064727783, "learning_rate": 8.955700502785706e-05, "loss": 3.6909, "step": 12305 }, { "epoch": 0.8363908139692893, "grad_norm": 0.3736377954483032, "learning_rate": 8.955275852697378e-05, "loss": 4.0622, "step": 12310 }, { "epoch": 0.8367305340399511, "grad_norm": 0.18843521177768707, "learning_rate": 8.95485120260905e-05, "loss": 3.7766, "step": 12315 }, { "epoch": 0.8370702541106129, "grad_norm": 0.18586871027946472, "learning_rate": 8.954426552520724e-05, "loss": 4.0302, "step": 12320 }, { "epoch": 0.8374099741812746, "grad_norm": 0.17058813571929932, "learning_rate": 8.954001902432396e-05, "loss": 4.0756, "step": 12325 }, { "epoch": 0.8377496942519363, "grad_norm": 0.21250483393669128, "learning_rate": 8.953577252344069e-05, "loss": 3.6402, "step": 12330 }, { "epoch": 0.8380894143225982, "grad_norm": 0.18693235516548157, "learning_rate": 8.953152602255742e-05, "loss": 4.2171, "step": 12335 }, { "epoch": 0.83842913439326, "grad_norm": 0.20142552256584167, "learning_rate": 8.952727952167414e-05, "loss": 4.0116, "step": 12340 }, { "epoch": 0.8387688544639217, "grad_norm": 0.5936012864112854, "learning_rate": 8.952303302079087e-05, "loss": 4.0355, "step": 12345 }, { "epoch": 0.8391085745345835, "grad_norm": 0.3252449631690979, "learning_rate": 8.95187865199076e-05, "loss": 4.0143, "step": 12350 }, { "epoch": 0.8394482946052453, "grad_norm": 0.18693962693214417, "learning_rate": 8.951454001902433e-05, "loss": 3.938, "step": 12355 }, { "epoch": 0.839788014675907, "grad_norm": 0.36720210313796997, "learning_rate": 8.951029351814106e-05, "loss": 3.9352, "step": 12360 }, { "epoch": 0.8401277347465689, "grad_norm": 0.14825104176998138, "learning_rate": 8.950604701725778e-05, "loss": 4.2168, "step": 12365 }, { "epoch": 0.8404674548172306, "grad_norm": 0.23677104711532593, "learning_rate": 8.950180051637451e-05, "loss": 3.967, "step": 12370 }, { "epoch": 0.8408071748878924, "grad_norm": 0.5124611258506775, "learning_rate": 8.949755401549124e-05, "loss": 3.9194, "step": 12375 }, { "epoch": 0.8411468949585541, "grad_norm": 0.3029448688030243, "learning_rate": 8.949330751460797e-05, "loss": 3.9354, "step": 12380 }, { "epoch": 0.841486615029216, "grad_norm": 0.20730510354042053, "learning_rate": 8.94890610137247e-05, "loss": 4.1058, "step": 12385 }, { "epoch": 0.8418263350998777, "grad_norm": 0.22315102815628052, "learning_rate": 8.948481451284142e-05, "loss": 3.893, "step": 12390 }, { "epoch": 0.8421660551705394, "grad_norm": 0.47029411792755127, "learning_rate": 8.948056801195815e-05, "loss": 3.8572, "step": 12395 }, { "epoch": 0.8425057752412013, "grad_norm": 0.19684122502803802, "learning_rate": 8.947632151107488e-05, "loss": 4.0835, "step": 12400 }, { "epoch": 0.842845495311863, "grad_norm": 0.18742690980434418, "learning_rate": 8.947207501019161e-05, "loss": 3.9798, "step": 12405 }, { "epoch": 0.8431852153825248, "grad_norm": 0.177710622549057, "learning_rate": 8.946782850930834e-05, "loss": 4.1085, "step": 12410 }, { "epoch": 0.8435249354531865, "grad_norm": 0.18476006388664246, "learning_rate": 8.946358200842506e-05, "loss": 4.0936, "step": 12415 }, { "epoch": 0.8438646555238484, "grad_norm": 0.16293834149837494, "learning_rate": 8.946018480771845e-05, "loss": 3.9854, "step": 12420 }, { "epoch": 0.8442043755945101, "grad_norm": 0.22308799624443054, "learning_rate": 8.945593830683517e-05, "loss": 4.1042, "step": 12425 }, { "epoch": 0.8445440956651719, "grad_norm": 0.16046442091464996, "learning_rate": 8.94516918059519e-05, "loss": 4.0012, "step": 12430 }, { "epoch": 0.8448838157358337, "grad_norm": 0.26154249906539917, "learning_rate": 8.944744530506862e-05, "loss": 4.0459, "step": 12435 }, { "epoch": 0.8452235358064955, "grad_norm": 0.23383556306362152, "learning_rate": 8.944319880418536e-05, "loss": 4.0728, "step": 12440 }, { "epoch": 0.8455632558771572, "grad_norm": 0.22498470544815063, "learning_rate": 8.943895230330209e-05, "loss": 3.979, "step": 12445 }, { "epoch": 0.845902975947819, "grad_norm": 0.18164518475532532, "learning_rate": 8.943470580241881e-05, "loss": 3.8648, "step": 12450 }, { "epoch": 0.8462426960184808, "grad_norm": 0.21877458691596985, "learning_rate": 8.943045930153554e-05, "loss": 4.2495, "step": 12455 }, { "epoch": 0.8465824160891425, "grad_norm": 0.1924646645784378, "learning_rate": 8.942621280065227e-05, "loss": 3.8453, "step": 12460 }, { "epoch": 0.8469221361598043, "grad_norm": 0.20620742440223694, "learning_rate": 8.9421966299769e-05, "loss": 3.9962, "step": 12465 }, { "epoch": 0.8472618562304661, "grad_norm": 0.290763795375824, "learning_rate": 8.941771979888573e-05, "loss": 3.8139, "step": 12470 }, { "epoch": 0.8476015763011279, "grad_norm": 0.16620713472366333, "learning_rate": 8.941347329800245e-05, "loss": 3.7297, "step": 12475 }, { "epoch": 0.8479412963717896, "grad_norm": 2.936108112335205, "learning_rate": 8.940922679711918e-05, "loss": 3.8085, "step": 12480 }, { "epoch": 0.8482810164424515, "grad_norm": 0.1852518916130066, "learning_rate": 8.940498029623591e-05, "loss": 3.9015, "step": 12485 }, { "epoch": 0.8486207365131132, "grad_norm": 0.159807950258255, "learning_rate": 8.940073379535264e-05, "loss": 3.9683, "step": 12490 }, { "epoch": 0.848960456583775, "grad_norm": 0.15564261376857758, "learning_rate": 8.939648729446937e-05, "loss": 4.0309, "step": 12495 }, { "epoch": 0.8493001766544367, "grad_norm": 0.18451645970344543, "learning_rate": 8.93922407935861e-05, "loss": 4.002, "step": 12500 }, { "epoch": 0.8496398967250985, "grad_norm": 0.4409978687763214, "learning_rate": 8.938799429270281e-05, "loss": 3.9582, "step": 12505 }, { "epoch": 0.8499796167957603, "grad_norm": 0.21767017245292664, "learning_rate": 8.938374779181955e-05, "loss": 3.7942, "step": 12510 }, { "epoch": 0.850319336866422, "grad_norm": 0.29611897468566895, "learning_rate": 8.937950129093628e-05, "loss": 3.8641, "step": 12515 }, { "epoch": 0.8506590569370839, "grad_norm": 0.15617810189723969, "learning_rate": 8.937525479005299e-05, "loss": 4.0887, "step": 12520 }, { "epoch": 0.8509987770077456, "grad_norm": 0.19923017919063568, "learning_rate": 8.937100828916973e-05, "loss": 3.9796, "step": 12525 }, { "epoch": 0.8513384970784074, "grad_norm": 0.178538978099823, "learning_rate": 8.936676178828646e-05, "loss": 4.1373, "step": 12530 }, { "epoch": 0.8516782171490692, "grad_norm": 0.20157839357852936, "learning_rate": 8.936251528740318e-05, "loss": 4.1656, "step": 12535 }, { "epoch": 0.852017937219731, "grad_norm": 0.21917836368083954, "learning_rate": 8.935826878651992e-05, "loss": 3.9735, "step": 12540 }, { "epoch": 0.8523576572903927, "grad_norm": 0.1743486225605011, "learning_rate": 8.935402228563665e-05, "loss": 4.0561, "step": 12545 }, { "epoch": 0.8526973773610544, "grad_norm": 0.3735666573047638, "learning_rate": 8.934977578475336e-05, "loss": 3.9863, "step": 12550 }, { "epoch": 0.8530370974317163, "grad_norm": 0.21841804683208466, "learning_rate": 8.93455292838701e-05, "loss": 4.0283, "step": 12555 }, { "epoch": 0.853376817502378, "grad_norm": 0.4104025065898895, "learning_rate": 8.934128278298683e-05, "loss": 3.7465, "step": 12560 }, { "epoch": 0.8537165375730398, "grad_norm": 0.15292176604270935, "learning_rate": 8.933703628210354e-05, "loss": 4.0306, "step": 12565 }, { "epoch": 0.8540562576437016, "grad_norm": 0.30871346592903137, "learning_rate": 8.933278978122029e-05, "loss": 3.9504, "step": 12570 }, { "epoch": 0.8543959777143634, "grad_norm": 0.18270432949066162, "learning_rate": 8.9328543280337e-05, "loss": 4.1066, "step": 12575 }, { "epoch": 0.8547356977850251, "grad_norm": 0.5479381680488586, "learning_rate": 8.932429677945373e-05, "loss": 3.9433, "step": 12580 }, { "epoch": 0.8550754178556869, "grad_norm": 0.19208583235740662, "learning_rate": 8.932005027857047e-05, "loss": 3.7949, "step": 12585 }, { "epoch": 0.8554151379263487, "grad_norm": 0.19413863122463226, "learning_rate": 8.931580377768718e-05, "loss": 4.07, "step": 12590 }, { "epoch": 0.8557548579970105, "grad_norm": 0.19963692128658295, "learning_rate": 8.931155727680391e-05, "loss": 4.038, "step": 12595 }, { "epoch": 0.8560945780676722, "grad_norm": 0.18308939039707184, "learning_rate": 8.930731077592065e-05, "loss": 4.0086, "step": 12600 }, { "epoch": 0.856434298138334, "grad_norm": 0.23906344175338745, "learning_rate": 8.930306427503737e-05, "loss": 3.835, "step": 12605 }, { "epoch": 0.8567740182089958, "grad_norm": 0.15061314404010773, "learning_rate": 8.92988177741541e-05, "loss": 3.9051, "step": 12610 }, { "epoch": 0.8571137382796575, "grad_norm": 0.4929114282131195, "learning_rate": 8.929457127327084e-05, "loss": 3.8619, "step": 12615 }, { "epoch": 0.8574534583503194, "grad_norm": 0.23578637838363647, "learning_rate": 8.929032477238755e-05, "loss": 4.2782, "step": 12620 }, { "epoch": 0.8577931784209811, "grad_norm": 0.2066326141357422, "learning_rate": 8.928607827150428e-05, "loss": 4.2725, "step": 12625 }, { "epoch": 0.8581328984916429, "grad_norm": 0.22157415747642517, "learning_rate": 8.928183177062102e-05, "loss": 4.0701, "step": 12630 }, { "epoch": 0.8584726185623046, "grad_norm": 0.9750187397003174, "learning_rate": 8.927758526973774e-05, "loss": 3.8094, "step": 12635 }, { "epoch": 0.8588123386329665, "grad_norm": 0.17541570961475372, "learning_rate": 8.927333876885446e-05, "loss": 4.1766, "step": 12640 }, { "epoch": 0.8591520587036282, "grad_norm": 0.1566866785287857, "learning_rate": 8.92690922679712e-05, "loss": 4.0965, "step": 12645 }, { "epoch": 0.85949177877429, "grad_norm": 0.18229223787784576, "learning_rate": 8.926484576708792e-05, "loss": 3.9081, "step": 12650 }, { "epoch": 0.8598314988449518, "grad_norm": 0.19233596324920654, "learning_rate": 8.926059926620465e-05, "loss": 3.9146, "step": 12655 }, { "epoch": 0.8601712189156135, "grad_norm": 0.26399555802345276, "learning_rate": 8.925635276532138e-05, "loss": 3.8794, "step": 12660 }, { "epoch": 0.8605109389862753, "grad_norm": 0.18803419172763824, "learning_rate": 8.92521062644381e-05, "loss": 4.2288, "step": 12665 }, { "epoch": 0.860850659056937, "grad_norm": 0.2003091722726822, "learning_rate": 8.924785976355483e-05, "loss": 3.9376, "step": 12670 }, { "epoch": 0.8611903791275989, "grad_norm": 2.6071043014526367, "learning_rate": 8.924361326267156e-05, "loss": 3.9786, "step": 12675 }, { "epoch": 0.8615300991982606, "grad_norm": 0.21313896775245667, "learning_rate": 8.923936676178829e-05, "loss": 4.0317, "step": 12680 }, { "epoch": 0.8618698192689224, "grad_norm": 0.17100752890110016, "learning_rate": 8.923512026090502e-05, "loss": 3.9988, "step": 12685 }, { "epoch": 0.8622095393395842, "grad_norm": 0.23430535197257996, "learning_rate": 8.923087376002174e-05, "loss": 3.7328, "step": 12690 }, { "epoch": 0.862549259410246, "grad_norm": 0.1643848717212677, "learning_rate": 8.922662725913847e-05, "loss": 4.109, "step": 12695 }, { "epoch": 0.8628889794809077, "grad_norm": 0.2526448369026184, "learning_rate": 8.92223807582552e-05, "loss": 4.0189, "step": 12700 }, { "epoch": 0.8632286995515696, "grad_norm": 0.24552328884601593, "learning_rate": 8.921813425737193e-05, "loss": 3.9226, "step": 12705 }, { "epoch": 0.8635684196222313, "grad_norm": 0.17898543179035187, "learning_rate": 8.921388775648866e-05, "loss": 3.7268, "step": 12710 }, { "epoch": 0.863908139692893, "grad_norm": 1.15958833694458, "learning_rate": 8.920964125560538e-05, "loss": 3.9694, "step": 12715 }, { "epoch": 0.8642478597635548, "grad_norm": 0.15424911677837372, "learning_rate": 8.920539475472211e-05, "loss": 4.0695, "step": 12720 }, { "epoch": 0.8645875798342166, "grad_norm": 0.1732960045337677, "learning_rate": 8.920114825383884e-05, "loss": 4.0694, "step": 12725 }, { "epoch": 0.8649272999048784, "grad_norm": 0.25387898087501526, "learning_rate": 8.919690175295557e-05, "loss": 3.9112, "step": 12730 }, { "epoch": 0.8652670199755401, "grad_norm": 0.18854272365570068, "learning_rate": 8.91926552520723e-05, "loss": 4.1747, "step": 12735 }, { "epoch": 0.865606740046202, "grad_norm": 0.18318019807338715, "learning_rate": 8.918840875118902e-05, "loss": 4.0418, "step": 12740 }, { "epoch": 0.8659464601168637, "grad_norm": 0.1997986137866974, "learning_rate": 8.918416225030575e-05, "loss": 3.9741, "step": 12745 }, { "epoch": 0.8662861801875255, "grad_norm": 0.1834951937198639, "learning_rate": 8.917991574942248e-05, "loss": 4.0931, "step": 12750 }, { "epoch": 0.8666259002581872, "grad_norm": 0.1741950660943985, "learning_rate": 8.917566924853921e-05, "loss": 3.8176, "step": 12755 }, { "epoch": 0.866965620328849, "grad_norm": 0.18136630952358246, "learning_rate": 8.917142274765594e-05, "loss": 4.0065, "step": 12760 }, { "epoch": 0.8673053403995108, "grad_norm": 0.22858762741088867, "learning_rate": 8.916717624677266e-05, "loss": 3.8951, "step": 12765 }, { "epoch": 0.8676450604701725, "grad_norm": 0.15833310782909393, "learning_rate": 8.916292974588939e-05, "loss": 3.9966, "step": 12770 }, { "epoch": 0.8679847805408344, "grad_norm": 0.16821962594985962, "learning_rate": 8.915868324500611e-05, "loss": 4.1885, "step": 12775 }, { "epoch": 0.8683245006114961, "grad_norm": 0.16304543614387512, "learning_rate": 8.915443674412285e-05, "loss": 4.054, "step": 12780 }, { "epoch": 0.8686642206821579, "grad_norm": 0.19145479798316956, "learning_rate": 8.915019024323958e-05, "loss": 4.0134, "step": 12785 }, { "epoch": 0.8690039407528197, "grad_norm": 0.20590396225452423, "learning_rate": 8.91459437423563e-05, "loss": 3.9, "step": 12790 }, { "epoch": 0.8693436608234815, "grad_norm": 0.1521267145872116, "learning_rate": 8.914169724147303e-05, "loss": 3.9639, "step": 12795 }, { "epoch": 0.8696833808941432, "grad_norm": 1.8004759550094604, "learning_rate": 8.913745074058976e-05, "loss": 4.1324, "step": 12800 }, { "epoch": 0.870023100964805, "grad_norm": 0.1751425862312317, "learning_rate": 8.913320423970649e-05, "loss": 4.0504, "step": 12805 }, { "epoch": 0.8703628210354668, "grad_norm": 0.21332374215126038, "learning_rate": 8.912895773882322e-05, "loss": 4.095, "step": 12810 }, { "epoch": 0.8707025411061285, "grad_norm": 0.2132454216480255, "learning_rate": 8.912471123793994e-05, "loss": 3.9654, "step": 12815 }, { "epoch": 0.8710422611767903, "grad_norm": 0.1583162248134613, "learning_rate": 8.912046473705667e-05, "loss": 3.9617, "step": 12820 }, { "epoch": 0.8713819812474521, "grad_norm": 0.19870373606681824, "learning_rate": 8.91162182361734e-05, "loss": 4.2326, "step": 12825 }, { "epoch": 0.8717217013181139, "grad_norm": 0.18757444620132446, "learning_rate": 8.911197173529013e-05, "loss": 3.9562, "step": 12830 }, { "epoch": 0.8720614213887756, "grad_norm": 0.15151342749595642, "learning_rate": 8.910772523440686e-05, "loss": 3.8253, "step": 12835 }, { "epoch": 0.8724011414594374, "grad_norm": 0.23465317487716675, "learning_rate": 8.910347873352358e-05, "loss": 3.8039, "step": 12840 }, { "epoch": 0.8727408615300992, "grad_norm": 0.3286290168762207, "learning_rate": 8.909923223264031e-05, "loss": 3.9344, "step": 12845 }, { "epoch": 0.873080581600761, "grad_norm": 0.3796158730983734, "learning_rate": 8.909498573175704e-05, "loss": 4.2968, "step": 12850 }, { "epoch": 0.8734203016714227, "grad_norm": 0.17161943018436432, "learning_rate": 8.909073923087377e-05, "loss": 4.0475, "step": 12855 }, { "epoch": 0.8737600217420846, "grad_norm": 0.18927636742591858, "learning_rate": 8.908649272999048e-05, "loss": 3.9, "step": 12860 }, { "epoch": 0.8740997418127463, "grad_norm": 0.17746715247631073, "learning_rate": 8.908224622910722e-05, "loss": 4.0236, "step": 12865 }, { "epoch": 0.874439461883408, "grad_norm": 0.2432946115732193, "learning_rate": 8.907799972822395e-05, "loss": 3.595, "step": 12870 }, { "epoch": 0.8747791819540699, "grad_norm": 0.18916042149066925, "learning_rate": 8.907375322734067e-05, "loss": 3.9668, "step": 12875 }, { "epoch": 0.8751189020247316, "grad_norm": 0.18429867923259735, "learning_rate": 8.906950672645741e-05, "loss": 3.8215, "step": 12880 }, { "epoch": 0.8754586220953934, "grad_norm": 0.30447253584861755, "learning_rate": 8.906526022557414e-05, "loss": 3.94, "step": 12885 }, { "epoch": 0.8757983421660551, "grad_norm": 0.1514458954334259, "learning_rate": 8.906101372469085e-05, "loss": 3.8559, "step": 12890 }, { "epoch": 0.876138062236717, "grad_norm": 0.1644824743270874, "learning_rate": 8.905676722380759e-05, "loss": 4.136, "step": 12895 }, { "epoch": 0.8764777823073787, "grad_norm": 0.1758906990289688, "learning_rate": 8.905252072292432e-05, "loss": 4.2008, "step": 12900 }, { "epoch": 0.8768175023780405, "grad_norm": 0.19972476363182068, "learning_rate": 8.904827422204104e-05, "loss": 3.9077, "step": 12905 }, { "epoch": 0.8771572224487023, "grad_norm": 0.21778126060962677, "learning_rate": 8.904402772115778e-05, "loss": 3.9808, "step": 12910 }, { "epoch": 0.877496942519364, "grad_norm": 0.19557218253612518, "learning_rate": 8.90397812202745e-05, "loss": 3.7706, "step": 12915 }, { "epoch": 0.8778366625900258, "grad_norm": 0.1903916746377945, "learning_rate": 8.903553471939122e-05, "loss": 3.8328, "step": 12920 }, { "epoch": 0.8781763826606876, "grad_norm": 0.21190743148326874, "learning_rate": 8.903128821850796e-05, "loss": 3.8016, "step": 12925 }, { "epoch": 0.8785161027313494, "grad_norm": 0.22646445035934448, "learning_rate": 8.902704171762468e-05, "loss": 3.9702, "step": 12930 }, { "epoch": 0.8788558228020111, "grad_norm": 0.2211994081735611, "learning_rate": 8.90227952167414e-05, "loss": 4.0118, "step": 12935 }, { "epoch": 0.8791955428726729, "grad_norm": 0.23225241899490356, "learning_rate": 8.901854871585814e-05, "loss": 4.1305, "step": 12940 }, { "epoch": 0.8795352629433347, "grad_norm": 0.20378831028938293, "learning_rate": 8.901430221497486e-05, "loss": 3.9627, "step": 12945 }, { "epoch": 0.8798749830139965, "grad_norm": 0.15453274548053741, "learning_rate": 8.901005571409159e-05, "loss": 3.8527, "step": 12950 }, { "epoch": 0.8802147030846582, "grad_norm": 0.19792801141738892, "learning_rate": 8.900580921320833e-05, "loss": 4.0273, "step": 12955 }, { "epoch": 0.8805544231553201, "grad_norm": 0.17211174964904785, "learning_rate": 8.900156271232504e-05, "loss": 3.9116, "step": 12960 }, { "epoch": 0.8808941432259818, "grad_norm": 0.5672011375427246, "learning_rate": 8.899731621144177e-05, "loss": 3.9642, "step": 12965 }, { "epoch": 0.8812338632966435, "grad_norm": 0.18356992304325104, "learning_rate": 8.899306971055851e-05, "loss": 3.9994, "step": 12970 }, { "epoch": 0.8815735833673053, "grad_norm": 0.18002454936504364, "learning_rate": 8.898882320967523e-05, "loss": 3.9536, "step": 12975 }, { "epoch": 0.8819133034379671, "grad_norm": 0.18387572467327118, "learning_rate": 8.898457670879196e-05, "loss": 4.0552, "step": 12980 }, { "epoch": 0.8822530235086289, "grad_norm": 0.27319806814193726, "learning_rate": 8.89803302079087e-05, "loss": 3.9619, "step": 12985 }, { "epoch": 0.8825927435792906, "grad_norm": 0.2080732136964798, "learning_rate": 8.897608370702541e-05, "loss": 4.1625, "step": 12990 }, { "epoch": 0.8829324636499525, "grad_norm": 0.17268094420433044, "learning_rate": 8.897183720614214e-05, "loss": 3.9614, "step": 12995 }, { "epoch": 0.8832721837206142, "grad_norm": 0.15917377173900604, "learning_rate": 8.896759070525888e-05, "loss": 3.9529, "step": 13000 }, { "epoch": 0.883611903791276, "grad_norm": 0.22826112806797028, "learning_rate": 8.89633442043756e-05, "loss": 4.0541, "step": 13005 }, { "epoch": 0.8839516238619378, "grad_norm": 0.2643020749092102, "learning_rate": 8.895909770349232e-05, "loss": 3.8896, "step": 13010 }, { "epoch": 0.8842913439325996, "grad_norm": 0.19719457626342773, "learning_rate": 8.895485120260905e-05, "loss": 3.8283, "step": 13015 }, { "epoch": 0.8846310640032613, "grad_norm": 0.18295039236545563, "learning_rate": 8.895060470172578e-05, "loss": 3.8955, "step": 13020 }, { "epoch": 0.884970784073923, "grad_norm": 0.23282389342784882, "learning_rate": 8.894635820084251e-05, "loss": 3.9094, "step": 13025 }, { "epoch": 0.8853105041445849, "grad_norm": 0.1835237592458725, "learning_rate": 8.894211169995924e-05, "loss": 3.9402, "step": 13030 }, { "epoch": 0.8856502242152466, "grad_norm": 0.2141278088092804, "learning_rate": 8.893786519907596e-05, "loss": 4.0951, "step": 13035 }, { "epoch": 0.8859899442859084, "grad_norm": 0.16725867986679077, "learning_rate": 8.893361869819269e-05, "loss": 3.8523, "step": 13040 }, { "epoch": 0.8863296643565702, "grad_norm": 0.15616454184055328, "learning_rate": 8.892937219730942e-05, "loss": 3.9671, "step": 13045 }, { "epoch": 0.886669384427232, "grad_norm": 0.17522963881492615, "learning_rate": 8.892512569642615e-05, "loss": 4.0624, "step": 13050 }, { "epoch": 0.8870091044978937, "grad_norm": 0.19032767415046692, "learning_rate": 8.892087919554288e-05, "loss": 3.8977, "step": 13055 }, { "epoch": 0.8873488245685555, "grad_norm": 0.4839637279510498, "learning_rate": 8.89166326946596e-05, "loss": 3.9498, "step": 13060 }, { "epoch": 0.8876885446392173, "grad_norm": 0.15772879123687744, "learning_rate": 8.891238619377633e-05, "loss": 3.9909, "step": 13065 }, { "epoch": 0.8880282647098791, "grad_norm": 0.23280905187129974, "learning_rate": 8.890813969289306e-05, "loss": 3.9899, "step": 13070 }, { "epoch": 0.8883679847805408, "grad_norm": 0.21939218044281006, "learning_rate": 8.890389319200979e-05, "loss": 4.0157, "step": 13075 }, { "epoch": 0.8887077048512027, "grad_norm": 0.19345150887966156, "learning_rate": 8.889964669112652e-05, "loss": 3.8884, "step": 13080 }, { "epoch": 0.8890474249218644, "grad_norm": 0.18044187128543854, "learning_rate": 8.889540019024324e-05, "loss": 3.9464, "step": 13085 }, { "epoch": 0.8893871449925261, "grad_norm": 0.7762069702148438, "learning_rate": 8.889115368935997e-05, "loss": 4.0273, "step": 13090 }, { "epoch": 0.889726865063188, "grad_norm": 0.20319564640522003, "learning_rate": 8.88869071884767e-05, "loss": 3.943, "step": 13095 }, { "epoch": 0.8900665851338497, "grad_norm": 0.4176552891731262, "learning_rate": 8.888266068759343e-05, "loss": 3.9463, "step": 13100 }, { "epoch": 0.8904063052045115, "grad_norm": 0.6137194633483887, "learning_rate": 8.887841418671016e-05, "loss": 3.9732, "step": 13105 }, { "epoch": 0.8907460252751732, "grad_norm": 0.3522084355354309, "learning_rate": 8.887416768582688e-05, "loss": 4.083, "step": 13110 }, { "epoch": 0.8910857453458351, "grad_norm": 0.1649700552225113, "learning_rate": 8.886992118494361e-05, "loss": 3.8638, "step": 13115 }, { "epoch": 0.8914254654164968, "grad_norm": 0.1624513566493988, "learning_rate": 8.886567468406034e-05, "loss": 4.1418, "step": 13120 }, { "epoch": 0.8917651854871586, "grad_norm": 0.18356308341026306, "learning_rate": 8.886142818317707e-05, "loss": 3.8366, "step": 13125 }, { "epoch": 0.8921049055578204, "grad_norm": 0.22682762145996094, "learning_rate": 8.88571816822938e-05, "loss": 3.8805, "step": 13130 }, { "epoch": 0.8924446256284821, "grad_norm": 0.3890301287174225, "learning_rate": 8.885293518141052e-05, "loss": 3.8615, "step": 13135 }, { "epoch": 0.8927843456991439, "grad_norm": 0.1815677434206009, "learning_rate": 8.884868868052725e-05, "loss": 3.9861, "step": 13140 }, { "epoch": 0.8931240657698056, "grad_norm": 0.5793675780296326, "learning_rate": 8.884444217964398e-05, "loss": 4.0116, "step": 13145 }, { "epoch": 0.8934637858404675, "grad_norm": 0.30733972787857056, "learning_rate": 8.884019567876071e-05, "loss": 3.8564, "step": 13150 }, { "epoch": 0.8938035059111292, "grad_norm": 0.15819989144802094, "learning_rate": 8.883594917787744e-05, "loss": 4.2131, "step": 13155 }, { "epoch": 0.894143225981791, "grad_norm": 0.1839209347963333, "learning_rate": 8.883170267699416e-05, "loss": 3.8485, "step": 13160 }, { "epoch": 0.8944829460524528, "grad_norm": 0.2315215915441513, "learning_rate": 8.882745617611089e-05, "loss": 3.7713, "step": 13165 }, { "epoch": 0.8948226661231146, "grad_norm": 0.18673592805862427, "learning_rate": 8.882320967522762e-05, "loss": 3.7431, "step": 13170 }, { "epoch": 0.8951623861937763, "grad_norm": 0.1436724215745926, "learning_rate": 8.881896317434435e-05, "loss": 4.0778, "step": 13175 }, { "epoch": 0.8955021062644382, "grad_norm": 0.23739652335643768, "learning_rate": 8.881471667346108e-05, "loss": 3.925, "step": 13180 }, { "epoch": 0.8958418263350999, "grad_norm": 0.17830795049667358, "learning_rate": 8.88104701725778e-05, "loss": 4.0997, "step": 13185 }, { "epoch": 0.8961815464057616, "grad_norm": 0.20703446865081787, "learning_rate": 8.880622367169453e-05, "loss": 3.8833, "step": 13190 }, { "epoch": 0.8965212664764234, "grad_norm": 0.18058006465435028, "learning_rate": 8.880197717081126e-05, "loss": 4.1206, "step": 13195 }, { "epoch": 0.8968609865470852, "grad_norm": 0.16602760553359985, "learning_rate": 8.879773066992799e-05, "loss": 4.0043, "step": 13200 }, { "epoch": 0.897200706617747, "grad_norm": 0.2073521465063095, "learning_rate": 8.879348416904472e-05, "loss": 3.8992, "step": 13205 }, { "epoch": 0.8975404266884087, "grad_norm": 0.1674661785364151, "learning_rate": 8.878923766816144e-05, "loss": 3.9651, "step": 13210 }, { "epoch": 0.8978801467590706, "grad_norm": 0.19709919393062592, "learning_rate": 8.878499116727816e-05, "loss": 3.8494, "step": 13215 }, { "epoch": 0.8982198668297323, "grad_norm": 0.18262673914432526, "learning_rate": 8.87807446663949e-05, "loss": 3.9929, "step": 13220 }, { "epoch": 0.8985595869003941, "grad_norm": 0.1922248899936676, "learning_rate": 8.877649816551163e-05, "loss": 4.1698, "step": 13225 }, { "epoch": 0.8988993069710558, "grad_norm": 0.22797107696533203, "learning_rate": 8.877225166462834e-05, "loss": 4.1415, "step": 13230 }, { "epoch": 0.8992390270417177, "grad_norm": 0.21377937495708466, "learning_rate": 8.876800516374508e-05, "loss": 3.7365, "step": 13235 }, { "epoch": 0.8995787471123794, "grad_norm": 0.15349645912647247, "learning_rate": 8.876375866286181e-05, "loss": 4.21, "step": 13240 }, { "epoch": 0.8999184671830411, "grad_norm": 0.17594188451766968, "learning_rate": 8.875951216197853e-05, "loss": 3.6277, "step": 13245 }, { "epoch": 0.900258187253703, "grad_norm": 0.21915188431739807, "learning_rate": 8.875526566109527e-05, "loss": 3.8465, "step": 13250 }, { "epoch": 0.9005979073243647, "grad_norm": 0.1609984189271927, "learning_rate": 8.8751019160212e-05, "loss": 3.9646, "step": 13255 }, { "epoch": 0.9009376273950265, "grad_norm": 0.22858203947544098, "learning_rate": 8.874677265932871e-05, "loss": 3.9217, "step": 13260 }, { "epoch": 0.9012773474656883, "grad_norm": 0.1865098774433136, "learning_rate": 8.874252615844545e-05, "loss": 4.0665, "step": 13265 }, { "epoch": 0.9016170675363501, "grad_norm": 0.27133429050445557, "learning_rate": 8.873827965756218e-05, "loss": 4.1901, "step": 13270 }, { "epoch": 0.9019567876070118, "grad_norm": 0.26253220438957214, "learning_rate": 8.87340331566789e-05, "loss": 4.0598, "step": 13275 }, { "epoch": 0.9022965076776736, "grad_norm": 0.19666416943073273, "learning_rate": 8.872978665579564e-05, "loss": 4.1986, "step": 13280 }, { "epoch": 0.9026362277483354, "grad_norm": 0.16205628216266632, "learning_rate": 8.872554015491235e-05, "loss": 3.9244, "step": 13285 }, { "epoch": 0.9029759478189971, "grad_norm": 0.21423132717609406, "learning_rate": 8.872129365402908e-05, "loss": 3.9477, "step": 13290 }, { "epoch": 0.9033156678896589, "grad_norm": 0.18254421651363373, "learning_rate": 8.871704715314582e-05, "loss": 4.0589, "step": 13295 }, { "epoch": 0.9036553879603207, "grad_norm": 0.17389804124832153, "learning_rate": 8.871280065226253e-05, "loss": 4.0397, "step": 13300 }, { "epoch": 0.9039951080309825, "grad_norm": 0.23967847228050232, "learning_rate": 8.870855415137926e-05, "loss": 3.9713, "step": 13305 }, { "epoch": 0.9043348281016442, "grad_norm": 0.15660813450813293, "learning_rate": 8.8704307650496e-05, "loss": 4.155, "step": 13310 }, { "epoch": 0.904674548172306, "grad_norm": 0.1497335135936737, "learning_rate": 8.870006114961272e-05, "loss": 4.2737, "step": 13315 }, { "epoch": 0.9050142682429678, "grad_norm": 0.16438940167427063, "learning_rate": 8.869581464872945e-05, "loss": 3.7956, "step": 13320 }, { "epoch": 0.9053539883136296, "grad_norm": 0.19728811085224152, "learning_rate": 8.869156814784619e-05, "loss": 3.8597, "step": 13325 }, { "epoch": 0.9056937083842913, "grad_norm": 0.19347457587718964, "learning_rate": 8.86873216469629e-05, "loss": 3.9375, "step": 13330 }, { "epoch": 0.9060334284549532, "grad_norm": 0.1782982498407364, "learning_rate": 8.868307514607963e-05, "loss": 4.1215, "step": 13335 }, { "epoch": 0.9063731485256149, "grad_norm": 0.16826027631759644, "learning_rate": 8.867882864519637e-05, "loss": 3.9993, "step": 13340 }, { "epoch": 0.9067128685962766, "grad_norm": 0.1599569171667099, "learning_rate": 8.867458214431309e-05, "loss": 3.8991, "step": 13345 }, { "epoch": 0.9070525886669385, "grad_norm": 2.387441396713257, "learning_rate": 8.867033564342981e-05, "loss": 4.0009, "step": 13350 }, { "epoch": 0.9073923087376002, "grad_norm": 0.16353747248649597, "learning_rate": 8.866608914254654e-05, "loss": 3.8368, "step": 13355 }, { "epoch": 0.907732028808262, "grad_norm": 0.18139252066612244, "learning_rate": 8.866184264166327e-05, "loss": 3.8567, "step": 13360 }, { "epoch": 0.9080717488789237, "grad_norm": 0.28488507866859436, "learning_rate": 8.865759614078e-05, "loss": 3.9918, "step": 13365 }, { "epoch": 0.9084114689495856, "grad_norm": 0.193324476480484, "learning_rate": 8.865334963989673e-05, "loss": 4.0164, "step": 13370 }, { "epoch": 0.9087511890202473, "grad_norm": 0.18897615373134613, "learning_rate": 8.864910313901345e-05, "loss": 3.6622, "step": 13375 }, { "epoch": 0.9090909090909091, "grad_norm": 0.18190960586071014, "learning_rate": 8.864485663813018e-05, "loss": 4.0802, "step": 13380 }, { "epoch": 0.9094306291615709, "grad_norm": 0.3872554302215576, "learning_rate": 8.864061013724691e-05, "loss": 4.1349, "step": 13385 }, { "epoch": 0.9097703492322327, "grad_norm": 0.18860095739364624, "learning_rate": 8.863636363636364e-05, "loss": 4.1308, "step": 13390 }, { "epoch": 0.9101100693028944, "grad_norm": 0.1607290804386139, "learning_rate": 8.863211713548037e-05, "loss": 3.9187, "step": 13395 }, { "epoch": 0.9104497893735561, "grad_norm": 0.24063172936439514, "learning_rate": 8.86278706345971e-05, "loss": 3.7071, "step": 13400 }, { "epoch": 0.910789509444218, "grad_norm": 0.2261088341474533, "learning_rate": 8.862362413371382e-05, "loss": 4.3313, "step": 13405 }, { "epoch": 0.9111292295148797, "grad_norm": 0.2024299055337906, "learning_rate": 8.861937763283055e-05, "loss": 3.7482, "step": 13410 }, { "epoch": 0.9114689495855415, "grad_norm": 0.32624539732933044, "learning_rate": 8.861513113194728e-05, "loss": 3.9679, "step": 13415 }, { "epoch": 0.9118086696562033, "grad_norm": 0.2082969844341278, "learning_rate": 8.8610884631064e-05, "loss": 4.1913, "step": 13420 }, { "epoch": 0.9121483897268651, "grad_norm": 0.20776890218257904, "learning_rate": 8.860663813018073e-05, "loss": 4.0744, "step": 13425 }, { "epoch": 0.9124881097975268, "grad_norm": 0.16512270271778107, "learning_rate": 8.860239162929746e-05, "loss": 4.0043, "step": 13430 }, { "epoch": 0.9128278298681887, "grad_norm": 1.4505500793457031, "learning_rate": 8.859814512841419e-05, "loss": 3.8782, "step": 13435 }, { "epoch": 0.9131675499388504, "grad_norm": 0.25911930203437805, "learning_rate": 8.859389862753092e-05, "loss": 3.6646, "step": 13440 }, { "epoch": 0.9135072700095122, "grad_norm": 0.23000332713127136, "learning_rate": 8.858965212664765e-05, "loss": 4.0076, "step": 13445 }, { "epoch": 0.9138469900801739, "grad_norm": 0.19737814366817474, "learning_rate": 8.858540562576437e-05, "loss": 3.8124, "step": 13450 }, { "epoch": 0.9141867101508357, "grad_norm": 0.1637062281370163, "learning_rate": 8.85811591248811e-05, "loss": 3.8238, "step": 13455 }, { "epoch": 0.9145264302214975, "grad_norm": 0.24078230559825897, "learning_rate": 8.857691262399783e-05, "loss": 4.2022, "step": 13460 }, { "epoch": 0.9148661502921592, "grad_norm": 0.17801740765571594, "learning_rate": 8.857266612311456e-05, "loss": 4.11, "step": 13465 }, { "epoch": 0.9152058703628211, "grad_norm": 0.18943698704242706, "learning_rate": 8.856841962223129e-05, "loss": 4.0061, "step": 13470 }, { "epoch": 0.9155455904334828, "grad_norm": 0.1784680187702179, "learning_rate": 8.856417312134801e-05, "loss": 3.9399, "step": 13475 }, { "epoch": 0.9158853105041446, "grad_norm": 0.19242218136787415, "learning_rate": 8.855992662046474e-05, "loss": 3.7147, "step": 13480 }, { "epoch": 0.9162250305748063, "grad_norm": 0.1983332335948944, "learning_rate": 8.855568011958147e-05, "loss": 3.7828, "step": 13485 }, { "epoch": 0.9165647506454682, "grad_norm": 0.1877221167087555, "learning_rate": 8.85514336186982e-05, "loss": 3.9979, "step": 13490 }, { "epoch": 0.9169044707161299, "grad_norm": 0.287514328956604, "learning_rate": 8.854718711781493e-05, "loss": 4.0574, "step": 13495 }, { "epoch": 0.9172441907867916, "grad_norm": 0.1971914917230606, "learning_rate": 8.854294061693165e-05, "loss": 3.8822, "step": 13500 }, { "epoch": 0.9175839108574535, "grad_norm": 0.19974425435066223, "learning_rate": 8.853869411604838e-05, "loss": 4.0866, "step": 13505 }, { "epoch": 0.9179236309281152, "grad_norm": 0.1471916139125824, "learning_rate": 8.853444761516511e-05, "loss": 4.2775, "step": 13510 }, { "epoch": 0.918263350998777, "grad_norm": 0.2695204019546509, "learning_rate": 8.853020111428184e-05, "loss": 3.5593, "step": 13515 }, { "epoch": 0.9186030710694388, "grad_norm": 0.3328780233860016, "learning_rate": 8.852595461339857e-05, "loss": 4.0474, "step": 13520 }, { "epoch": 0.9189427911401006, "grad_norm": 0.22376228868961334, "learning_rate": 8.85217081125153e-05, "loss": 3.8726, "step": 13525 }, { "epoch": 0.9192825112107623, "grad_norm": 1.0047399997711182, "learning_rate": 8.851746161163202e-05, "loss": 3.6569, "step": 13530 }, { "epoch": 0.9196222312814241, "grad_norm": 0.20537297427654266, "learning_rate": 8.851321511074875e-05, "loss": 4.0576, "step": 13535 }, { "epoch": 0.9199619513520859, "grad_norm": 0.19384372234344482, "learning_rate": 8.850896860986548e-05, "loss": 4.1421, "step": 13540 }, { "epoch": 0.9203016714227477, "grad_norm": 0.24288725852966309, "learning_rate": 8.85047221089822e-05, "loss": 3.9712, "step": 13545 }, { "epoch": 0.9206413914934094, "grad_norm": 0.17010895907878876, "learning_rate": 8.850047560809893e-05, "loss": 4.1179, "step": 13550 }, { "epoch": 0.9209811115640713, "grad_norm": 0.2580501139163971, "learning_rate": 8.849622910721565e-05, "loss": 3.963, "step": 13555 }, { "epoch": 0.921320831634733, "grad_norm": 0.19051308929920197, "learning_rate": 8.849198260633239e-05, "loss": 3.9626, "step": 13560 }, { "epoch": 0.9216605517053947, "grad_norm": 0.18489578366279602, "learning_rate": 8.848773610544912e-05, "loss": 4.0235, "step": 13565 }, { "epoch": 0.9220002717760565, "grad_norm": 0.1751706898212433, "learning_rate": 8.848348960456583e-05, "loss": 4.03, "step": 13570 }, { "epoch": 0.9223399918467183, "grad_norm": 0.15869379043579102, "learning_rate": 8.847924310368257e-05, "loss": 3.9887, "step": 13575 }, { "epoch": 0.9226797119173801, "grad_norm": 0.18910722434520721, "learning_rate": 8.84749966027993e-05, "loss": 3.9694, "step": 13580 }, { "epoch": 0.9230194319880418, "grad_norm": 0.18458014726638794, "learning_rate": 8.847075010191602e-05, "loss": 3.7103, "step": 13585 }, { "epoch": 0.9233591520587037, "grad_norm": 0.15150383114814758, "learning_rate": 8.846650360103276e-05, "loss": 3.9727, "step": 13590 }, { "epoch": 0.9236988721293654, "grad_norm": 0.1747668832540512, "learning_rate": 8.846225710014949e-05, "loss": 4.0081, "step": 13595 }, { "epoch": 0.9240385922000272, "grad_norm": 0.1757059097290039, "learning_rate": 8.84580105992662e-05, "loss": 3.8936, "step": 13600 }, { "epoch": 0.924378312270689, "grad_norm": 0.17729350924491882, "learning_rate": 8.845376409838294e-05, "loss": 4.1143, "step": 13605 }, { "epoch": 0.9247180323413507, "grad_norm": 0.18305832147598267, "learning_rate": 8.844951759749967e-05, "loss": 4.1294, "step": 13610 }, { "epoch": 0.9250577524120125, "grad_norm": 1.0373708009719849, "learning_rate": 8.844527109661639e-05, "loss": 4.0552, "step": 13615 }, { "epoch": 0.9253974724826742, "grad_norm": 0.18093042075634003, "learning_rate": 8.844102459573313e-05, "loss": 4.0405, "step": 13620 }, { "epoch": 0.9257371925533361, "grad_norm": 0.1686364710330963, "learning_rate": 8.843677809484985e-05, "loss": 4.0729, "step": 13625 }, { "epoch": 0.9260769126239978, "grad_norm": 0.12190663069486618, "learning_rate": 8.843253159396657e-05, "loss": 3.958, "step": 13630 }, { "epoch": 0.9264166326946596, "grad_norm": 0.18013213574886322, "learning_rate": 8.842828509308331e-05, "loss": 4.0457, "step": 13635 }, { "epoch": 0.9267563527653214, "grad_norm": 0.47123172879219055, "learning_rate": 8.842403859220003e-05, "loss": 3.9378, "step": 13640 }, { "epoch": 0.9270960728359832, "grad_norm": 0.17332005500793457, "learning_rate": 8.841979209131675e-05, "loss": 3.9994, "step": 13645 }, { "epoch": 0.9274357929066449, "grad_norm": 0.2330494374036789, "learning_rate": 8.84155455904335e-05, "loss": 4.2059, "step": 13650 }, { "epoch": 0.9277755129773066, "grad_norm": 0.1844603419303894, "learning_rate": 8.841129908955021e-05, "loss": 3.9922, "step": 13655 }, { "epoch": 0.9281152330479685, "grad_norm": 0.19840815663337708, "learning_rate": 8.840705258866694e-05, "loss": 4.0601, "step": 13660 }, { "epoch": 0.9284549531186302, "grad_norm": 0.25802430510520935, "learning_rate": 8.840280608778368e-05, "loss": 3.8963, "step": 13665 }, { "epoch": 0.928794673189292, "grad_norm": 0.1573478877544403, "learning_rate": 8.83985595869004e-05, "loss": 4.1013, "step": 13670 }, { "epoch": 0.9291343932599538, "grad_norm": 0.20257075130939484, "learning_rate": 8.839431308601712e-05, "loss": 4.0064, "step": 13675 }, { "epoch": 0.9294741133306156, "grad_norm": 0.2046387791633606, "learning_rate": 8.839006658513386e-05, "loss": 3.948, "step": 13680 }, { "epoch": 0.9298138334012773, "grad_norm": 0.29030993580818176, "learning_rate": 8.838582008425058e-05, "loss": 3.7141, "step": 13685 }, { "epoch": 0.9301535534719392, "grad_norm": 0.1701250672340393, "learning_rate": 8.83815735833673e-05, "loss": 3.8852, "step": 13690 }, { "epoch": 0.9304932735426009, "grad_norm": 0.18545231223106384, "learning_rate": 8.837732708248405e-05, "loss": 3.8857, "step": 13695 }, { "epoch": 0.9308329936132627, "grad_norm": 0.22790156304836273, "learning_rate": 8.837308058160076e-05, "loss": 3.9376, "step": 13700 }, { "epoch": 0.9311727136839244, "grad_norm": 0.2060774713754654, "learning_rate": 8.836883408071749e-05, "loss": 4.0047, "step": 13705 }, { "epoch": 0.9315124337545863, "grad_norm": 0.17100512981414795, "learning_rate": 8.836458757983422e-05, "loss": 4.0347, "step": 13710 }, { "epoch": 0.931852153825248, "grad_norm": 0.18439409136772156, "learning_rate": 8.836034107895095e-05, "loss": 4.0597, "step": 13715 }, { "epoch": 0.9321918738959097, "grad_norm": 0.20171356201171875, "learning_rate": 8.835609457806767e-05, "loss": 3.8904, "step": 13720 }, { "epoch": 0.9325315939665716, "grad_norm": 0.2539232075214386, "learning_rate": 8.83518480771844e-05, "loss": 3.7127, "step": 13725 }, { "epoch": 0.9328713140372333, "grad_norm": 0.303066611289978, "learning_rate": 8.834760157630113e-05, "loss": 3.858, "step": 13730 }, { "epoch": 0.9332110341078951, "grad_norm": 10.150884628295898, "learning_rate": 8.834335507541786e-05, "loss": 4.214, "step": 13735 }, { "epoch": 0.9335507541785568, "grad_norm": 0.17438891530036926, "learning_rate": 8.833910857453459e-05, "loss": 3.9798, "step": 13740 }, { "epoch": 0.9338904742492187, "grad_norm": 0.1963614672422409, "learning_rate": 8.833486207365131e-05, "loss": 4.0225, "step": 13745 }, { "epoch": 0.9342301943198804, "grad_norm": 0.33438101410865784, "learning_rate": 8.833061557276804e-05, "loss": 3.8237, "step": 13750 }, { "epoch": 0.9345699143905422, "grad_norm": 0.15952834486961365, "learning_rate": 8.832636907188477e-05, "loss": 4.0374, "step": 13755 }, { "epoch": 0.934909634461204, "grad_norm": 0.20450946688652039, "learning_rate": 8.83221225710015e-05, "loss": 3.8739, "step": 13760 }, { "epoch": 0.9352493545318658, "grad_norm": 0.2039669156074524, "learning_rate": 8.831787607011823e-05, "loss": 3.953, "step": 13765 }, { "epoch": 0.9355890746025275, "grad_norm": 0.15678437054157257, "learning_rate": 8.831362956923495e-05, "loss": 3.9398, "step": 13770 }, { "epoch": 0.9359287946731893, "grad_norm": 0.5224528312683105, "learning_rate": 8.830938306835168e-05, "loss": 3.8759, "step": 13775 }, { "epoch": 0.9362685147438511, "grad_norm": 0.2511097192764282, "learning_rate": 8.830513656746841e-05, "loss": 3.9613, "step": 13780 }, { "epoch": 0.9366082348145128, "grad_norm": 0.14598341286182404, "learning_rate": 8.830089006658514e-05, "loss": 4.0519, "step": 13785 }, { "epoch": 0.9369479548851746, "grad_norm": 0.18947695195674896, "learning_rate": 8.829664356570187e-05, "loss": 3.9679, "step": 13790 }, { "epoch": 0.9372876749558364, "grad_norm": 0.19946783781051636, "learning_rate": 8.82923970648186e-05, "loss": 4.0397, "step": 13795 }, { "epoch": 0.9376273950264982, "grad_norm": 0.1777060627937317, "learning_rate": 8.828815056393532e-05, "loss": 3.8039, "step": 13800 }, { "epoch": 0.9379671150971599, "grad_norm": 0.2687058746814728, "learning_rate": 8.828390406305205e-05, "loss": 3.9514, "step": 13805 }, { "epoch": 0.9383068351678218, "grad_norm": 0.17168082296848297, "learning_rate": 8.827965756216878e-05, "loss": 3.6143, "step": 13810 }, { "epoch": 0.9386465552384835, "grad_norm": 0.2019256055355072, "learning_rate": 8.82754110612855e-05, "loss": 3.9093, "step": 13815 }, { "epoch": 0.9389862753091452, "grad_norm": 0.15410295128822327, "learning_rate": 8.827116456040223e-05, "loss": 4.0334, "step": 13820 }, { "epoch": 0.939325995379807, "grad_norm": 0.18953141570091248, "learning_rate": 8.826691805951896e-05, "loss": 3.5909, "step": 13825 }, { "epoch": 0.9396657154504688, "grad_norm": 0.4388710856437683, "learning_rate": 8.826267155863569e-05, "loss": 4.0782, "step": 13830 }, { "epoch": 0.9400054355211306, "grad_norm": 0.5807581543922424, "learning_rate": 8.825842505775242e-05, "loss": 3.7892, "step": 13835 }, { "epoch": 0.9403451555917923, "grad_norm": 0.14411011338233948, "learning_rate": 8.825417855686915e-05, "loss": 3.9576, "step": 13840 }, { "epoch": 0.9406848756624542, "grad_norm": 0.21537935733795166, "learning_rate": 8.824993205598587e-05, "loss": 3.8762, "step": 13845 }, { "epoch": 0.9410245957331159, "grad_norm": 1.785944938659668, "learning_rate": 8.82456855551026e-05, "loss": 4.103, "step": 13850 }, { "epoch": 0.9413643158037777, "grad_norm": 0.13651463389396667, "learning_rate": 8.824143905421933e-05, "loss": 4.032, "step": 13855 }, { "epoch": 0.9417040358744395, "grad_norm": 0.16791968047618866, "learning_rate": 8.823719255333606e-05, "loss": 3.9523, "step": 13860 }, { "epoch": 0.9420437559451013, "grad_norm": 0.22517231106758118, "learning_rate": 8.823294605245279e-05, "loss": 3.8512, "step": 13865 }, { "epoch": 0.942383476015763, "grad_norm": 0.2402997463941574, "learning_rate": 8.822869955156951e-05, "loss": 3.9759, "step": 13870 }, { "epoch": 0.9427231960864247, "grad_norm": 0.20395690202713013, "learning_rate": 8.822445305068624e-05, "loss": 3.9377, "step": 13875 }, { "epoch": 0.9430629161570866, "grad_norm": 0.24165527522563934, "learning_rate": 8.822020654980297e-05, "loss": 3.9313, "step": 13880 }, { "epoch": 0.9434026362277483, "grad_norm": 0.3844189941883087, "learning_rate": 8.82159600489197e-05, "loss": 3.7314, "step": 13885 }, { "epoch": 0.9437423562984101, "grad_norm": 0.2007877379655838, "learning_rate": 8.821171354803643e-05, "loss": 4.0631, "step": 13890 }, { "epoch": 0.9440820763690719, "grad_norm": 0.18726769089698792, "learning_rate": 8.820746704715315e-05, "loss": 4.055, "step": 13895 }, { "epoch": 0.9444217964397337, "grad_norm": 0.38226640224456787, "learning_rate": 8.820322054626988e-05, "loss": 4.0028, "step": 13900 }, { "epoch": 0.9447615165103954, "grad_norm": 0.1904405802488327, "learning_rate": 8.819897404538661e-05, "loss": 3.9834, "step": 13905 }, { "epoch": 0.9451012365810572, "grad_norm": 0.1809813529253006, "learning_rate": 8.819472754450332e-05, "loss": 4.0637, "step": 13910 }, { "epoch": 0.945440956651719, "grad_norm": 0.4116685092449188, "learning_rate": 8.819048104362007e-05, "loss": 3.9916, "step": 13915 }, { "epoch": 0.9457806767223808, "grad_norm": 1.9458621740341187, "learning_rate": 8.81862345427368e-05, "loss": 4.0563, "step": 13920 }, { "epoch": 0.9461203967930425, "grad_norm": 0.15543721616268158, "learning_rate": 8.818198804185351e-05, "loss": 4.0744, "step": 13925 }, { "epoch": 0.9464601168637043, "grad_norm": 0.42934080958366394, "learning_rate": 8.817774154097025e-05, "loss": 3.8221, "step": 13930 }, { "epoch": 0.9467998369343661, "grad_norm": 0.228485107421875, "learning_rate": 8.817349504008698e-05, "loss": 3.9268, "step": 13935 }, { "epoch": 0.9471395570050278, "grad_norm": 0.21295882761478424, "learning_rate": 8.816924853920369e-05, "loss": 4.2737, "step": 13940 }, { "epoch": 0.9474792770756897, "grad_norm": 0.2023978978395462, "learning_rate": 8.816500203832043e-05, "loss": 4.0087, "step": 13945 }, { "epoch": 0.9478189971463514, "grad_norm": 0.1767956167459488, "learning_rate": 8.816075553743716e-05, "loss": 3.9129, "step": 13950 }, { "epoch": 0.9481587172170132, "grad_norm": 1.3790664672851562, "learning_rate": 8.815650903655388e-05, "loss": 3.6768, "step": 13955 }, { "epoch": 0.9484984372876749, "grad_norm": 0.583037793636322, "learning_rate": 8.815226253567062e-05, "loss": 3.8518, "step": 13960 }, { "epoch": 0.9488381573583368, "grad_norm": 0.16457735002040863, "learning_rate": 8.814801603478735e-05, "loss": 3.7035, "step": 13965 }, { "epoch": 0.9491778774289985, "grad_norm": 0.2307073473930359, "learning_rate": 8.814376953390406e-05, "loss": 4.1654, "step": 13970 }, { "epoch": 0.9495175974996602, "grad_norm": 0.3080320358276367, "learning_rate": 8.81395230330208e-05, "loss": 3.975, "step": 13975 }, { "epoch": 0.9498573175703221, "grad_norm": 0.17688848078250885, "learning_rate": 8.813527653213752e-05, "loss": 4.0859, "step": 13980 }, { "epoch": 0.9501970376409838, "grad_norm": 0.24346010386943817, "learning_rate": 8.813103003125424e-05, "loss": 3.9282, "step": 13985 }, { "epoch": 0.9505367577116456, "grad_norm": 0.1712096929550171, "learning_rate": 8.812678353037099e-05, "loss": 3.9035, "step": 13990 }, { "epoch": 0.9508764777823073, "grad_norm": 0.19020821154117584, "learning_rate": 8.81225370294877e-05, "loss": 4.1025, "step": 13995 }, { "epoch": 0.9512161978529692, "grad_norm": 0.21272438764572144, "learning_rate": 8.811829052860443e-05, "loss": 3.9025, "step": 14000 }, { "epoch": 0.9515559179236309, "grad_norm": 0.14927330613136292, "learning_rate": 8.811404402772117e-05, "loss": 3.6392, "step": 14005 }, { "epoch": 0.9518956379942927, "grad_norm": 0.14234548807144165, "learning_rate": 8.810979752683788e-05, "loss": 3.7538, "step": 14010 }, { "epoch": 0.9522353580649545, "grad_norm": 0.19329407811164856, "learning_rate": 8.810555102595461e-05, "loss": 4.0138, "step": 14015 }, { "epoch": 0.9525750781356163, "grad_norm": 1.4322816133499146, "learning_rate": 8.810130452507135e-05, "loss": 3.8935, "step": 14020 }, { "epoch": 0.952914798206278, "grad_norm": 0.19049431383609772, "learning_rate": 8.809705802418807e-05, "loss": 4.0516, "step": 14025 }, { "epoch": 0.9532545182769399, "grad_norm": 0.2139282375574112, "learning_rate": 8.80928115233048e-05, "loss": 3.8331, "step": 14030 }, { "epoch": 0.9535942383476016, "grad_norm": 0.17371760308742523, "learning_rate": 8.808856502242154e-05, "loss": 3.9821, "step": 14035 }, { "epoch": 0.9539339584182633, "grad_norm": 0.16707171499729156, "learning_rate": 8.808431852153825e-05, "loss": 4.1538, "step": 14040 }, { "epoch": 0.9542736784889251, "grad_norm": 0.2354259192943573, "learning_rate": 8.808007202065498e-05, "loss": 3.9828, "step": 14045 }, { "epoch": 0.9546133985595869, "grad_norm": 0.26454171538352966, "learning_rate": 8.807582551977172e-05, "loss": 4.1493, "step": 14050 }, { "epoch": 0.9549531186302487, "grad_norm": 0.26322662830352783, "learning_rate": 8.807157901888844e-05, "loss": 4.0161, "step": 14055 }, { "epoch": 0.9552928387009104, "grad_norm": 0.5517749786376953, "learning_rate": 8.806733251800516e-05, "loss": 3.8552, "step": 14060 }, { "epoch": 0.9556325587715723, "grad_norm": 0.34178662300109863, "learning_rate": 8.806308601712189e-05, "loss": 3.9374, "step": 14065 }, { "epoch": 0.955972278842234, "grad_norm": 0.4318985044956207, "learning_rate": 8.805883951623862e-05, "loss": 3.9312, "step": 14070 }, { "epoch": 0.9563119989128958, "grad_norm": 0.7835567593574524, "learning_rate": 8.805459301535535e-05, "loss": 4.0294, "step": 14075 }, { "epoch": 0.9566517189835575, "grad_norm": 0.248532235622406, "learning_rate": 8.805034651447208e-05, "loss": 4.0144, "step": 14080 }, { "epoch": 0.9569914390542194, "grad_norm": 0.17486423254013062, "learning_rate": 8.80461000135888e-05, "loss": 3.8262, "step": 14085 }, { "epoch": 0.9573311591248811, "grad_norm": 0.1894778460264206, "learning_rate": 8.804185351270553e-05, "loss": 3.989, "step": 14090 }, { "epoch": 0.9576708791955428, "grad_norm": 0.1900128871202469, "learning_rate": 8.803760701182226e-05, "loss": 4.0652, "step": 14095 }, { "epoch": 0.9580105992662047, "grad_norm": 0.21038229763507843, "learning_rate": 8.803336051093899e-05, "loss": 3.7708, "step": 14100 }, { "epoch": 0.9583503193368664, "grad_norm": 0.22659200429916382, "learning_rate": 8.802911401005572e-05, "loss": 3.701, "step": 14105 }, { "epoch": 0.9586900394075282, "grad_norm": 0.18774689733982086, "learning_rate": 8.802486750917244e-05, "loss": 3.8036, "step": 14110 }, { "epoch": 0.95902975947819, "grad_norm": 0.7720076441764832, "learning_rate": 8.802062100828917e-05, "loss": 4.0017, "step": 14115 }, { "epoch": 0.9593694795488518, "grad_norm": 0.17789320647716522, "learning_rate": 8.80163745074059e-05, "loss": 4.3506, "step": 14120 }, { "epoch": 0.9597091996195135, "grad_norm": 0.23013809323310852, "learning_rate": 8.801212800652263e-05, "loss": 4.1502, "step": 14125 }, { "epoch": 0.9600489196901753, "grad_norm": 0.20013386011123657, "learning_rate": 8.800788150563936e-05, "loss": 4.002, "step": 14130 }, { "epoch": 0.9603886397608371, "grad_norm": 0.331853449344635, "learning_rate": 8.800363500475608e-05, "loss": 3.9042, "step": 14135 }, { "epoch": 0.9607283598314988, "grad_norm": 0.19631457328796387, "learning_rate": 8.799938850387281e-05, "loss": 4.1109, "step": 14140 }, { "epoch": 0.9610680799021606, "grad_norm": 0.38952094316482544, "learning_rate": 8.799514200298954e-05, "loss": 3.7741, "step": 14145 }, { "epoch": 0.9614077999728224, "grad_norm": 0.15865278244018555, "learning_rate": 8.799089550210627e-05, "loss": 4.0511, "step": 14150 }, { "epoch": 0.9617475200434842, "grad_norm": 0.20270100235939026, "learning_rate": 8.7986649001223e-05, "loss": 3.9072, "step": 14155 }, { "epoch": 0.9620872401141459, "grad_norm": 0.17968709766864777, "learning_rate": 8.798240250033972e-05, "loss": 4.0201, "step": 14160 }, { "epoch": 0.9624269601848077, "grad_norm": 0.19091679155826569, "learning_rate": 8.797815599945645e-05, "loss": 4.0135, "step": 14165 }, { "epoch": 0.9627666802554695, "grad_norm": 0.18477921187877655, "learning_rate": 8.797390949857318e-05, "loss": 4.0118, "step": 14170 }, { "epoch": 0.9631064003261313, "grad_norm": 0.1884998232126236, "learning_rate": 8.796966299768991e-05, "loss": 3.9669, "step": 14175 }, { "epoch": 0.963446120396793, "grad_norm": 0.18764863908290863, "learning_rate": 8.796541649680664e-05, "loss": 3.9139, "step": 14180 }, { "epoch": 0.9637858404674549, "grad_norm": 0.3251302242279053, "learning_rate": 8.796116999592336e-05, "loss": 4.1504, "step": 14185 }, { "epoch": 0.9641255605381166, "grad_norm": 0.17968013882637024, "learning_rate": 8.795692349504009e-05, "loss": 3.8325, "step": 14190 }, { "epoch": 0.9644652806087783, "grad_norm": 0.28362056612968445, "learning_rate": 8.795267699415682e-05, "loss": 3.945, "step": 14195 }, { "epoch": 0.9648050006794402, "grad_norm": 0.18230807781219482, "learning_rate": 8.794843049327355e-05, "loss": 3.8641, "step": 14200 }, { "epoch": 0.9651447207501019, "grad_norm": 0.20012174546718597, "learning_rate": 8.794418399239028e-05, "loss": 3.8536, "step": 14205 }, { "epoch": 0.9654844408207637, "grad_norm": 0.16306108236312866, "learning_rate": 8.7939937491507e-05, "loss": 3.9824, "step": 14210 }, { "epoch": 0.9658241608914254, "grad_norm": 0.19044755399227142, "learning_rate": 8.793569099062373e-05, "loss": 4.0477, "step": 14215 }, { "epoch": 0.9661638809620873, "grad_norm": 0.9334998726844788, "learning_rate": 8.793144448974046e-05, "loss": 4.0839, "step": 14220 }, { "epoch": 0.966503601032749, "grad_norm": 0.2080710530281067, "learning_rate": 8.792719798885719e-05, "loss": 3.935, "step": 14225 }, { "epoch": 0.9668433211034108, "grad_norm": 0.20228326320648193, "learning_rate": 8.792295148797392e-05, "loss": 3.8579, "step": 14230 }, { "epoch": 0.9671830411740726, "grad_norm": 0.14868243038654327, "learning_rate": 8.791870498709064e-05, "loss": 3.9681, "step": 14235 }, { "epoch": 0.9675227612447344, "grad_norm": 0.3080695569515228, "learning_rate": 8.791445848620737e-05, "loss": 3.7428, "step": 14240 }, { "epoch": 0.9678624813153961, "grad_norm": 0.3069162666797638, "learning_rate": 8.79102119853241e-05, "loss": 3.9151, "step": 14245 }, { "epoch": 0.9682022013860578, "grad_norm": 0.18564817309379578, "learning_rate": 8.790596548444083e-05, "loss": 3.9755, "step": 14250 }, { "epoch": 0.9685419214567197, "grad_norm": 0.26427149772644043, "learning_rate": 8.790171898355756e-05, "loss": 4.0877, "step": 14255 }, { "epoch": 0.9688816415273814, "grad_norm": 0.17514817416667938, "learning_rate": 8.789747248267428e-05, "loss": 3.9761, "step": 14260 }, { "epoch": 0.9692213615980432, "grad_norm": 0.17348946630954742, "learning_rate": 8.7893225981791e-05, "loss": 3.9038, "step": 14265 }, { "epoch": 0.969561081668705, "grad_norm": 0.20074446499347687, "learning_rate": 8.788897948090774e-05, "loss": 3.9772, "step": 14270 }, { "epoch": 0.9699008017393668, "grad_norm": 0.24490071833133698, "learning_rate": 8.788473298002447e-05, "loss": 4.1249, "step": 14275 }, { "epoch": 0.9702405218100285, "grad_norm": 0.2058936506509781, "learning_rate": 8.788048647914118e-05, "loss": 3.8624, "step": 14280 }, { "epoch": 0.9705802418806904, "grad_norm": 0.16198603808879852, "learning_rate": 8.787623997825792e-05, "loss": 3.9449, "step": 14285 }, { "epoch": 0.9709199619513521, "grad_norm": 0.15939363837242126, "learning_rate": 8.787199347737465e-05, "loss": 3.9548, "step": 14290 }, { "epoch": 0.9712596820220138, "grad_norm": 0.15633496642112732, "learning_rate": 8.786774697649137e-05, "loss": 3.8646, "step": 14295 }, { "epoch": 0.9715994020926756, "grad_norm": 0.18158230185508728, "learning_rate": 8.786350047560811e-05, "loss": 4.0045, "step": 14300 }, { "epoch": 0.9719391221633374, "grad_norm": 0.2111271321773529, "learning_rate": 8.785925397472484e-05, "loss": 4.1019, "step": 14305 }, { "epoch": 0.9722788422339992, "grad_norm": 0.19582590460777283, "learning_rate": 8.785500747384155e-05, "loss": 4.0289, "step": 14310 }, { "epoch": 0.9726185623046609, "grad_norm": 0.18569405376911163, "learning_rate": 8.785076097295829e-05, "loss": 3.8372, "step": 14315 }, { "epoch": 0.9729582823753228, "grad_norm": 0.1685798466205597, "learning_rate": 8.784651447207502e-05, "loss": 3.9108, "step": 14320 }, { "epoch": 0.9732980024459845, "grad_norm": 0.236485555768013, "learning_rate": 8.784226797119174e-05, "loss": 3.6307, "step": 14325 }, { "epoch": 0.9736377225166463, "grad_norm": 0.17849500477313995, "learning_rate": 8.783802147030848e-05, "loss": 4.0166, "step": 14330 }, { "epoch": 0.973977442587308, "grad_norm": 0.15316098928451538, "learning_rate": 8.783377496942519e-05, "loss": 4.0223, "step": 14335 }, { "epoch": 0.9743171626579699, "grad_norm": 0.14150911569595337, "learning_rate": 8.782952846854192e-05, "loss": 3.9861, "step": 14340 }, { "epoch": 0.9746568827286316, "grad_norm": 0.15742090344429016, "learning_rate": 8.782528196765866e-05, "loss": 4.0853, "step": 14345 }, { "epoch": 0.9749966027992933, "grad_norm": 0.1783253401517868, "learning_rate": 8.782103546677538e-05, "loss": 3.8919, "step": 14350 }, { "epoch": 0.9753363228699552, "grad_norm": 0.16757941246032715, "learning_rate": 8.78167889658921e-05, "loss": 3.5916, "step": 14355 }, { "epoch": 0.9756760429406169, "grad_norm": 0.25573256611824036, "learning_rate": 8.781254246500884e-05, "loss": 3.9848, "step": 14360 }, { "epoch": 0.9760157630112787, "grad_norm": 0.1471344381570816, "learning_rate": 8.780829596412556e-05, "loss": 3.9848, "step": 14365 }, { "epoch": 0.9763554830819405, "grad_norm": 0.17102879285812378, "learning_rate": 8.780404946324229e-05, "loss": 3.9778, "step": 14370 }, { "epoch": 0.9766952031526023, "grad_norm": 0.21928320825099945, "learning_rate": 8.779980296235903e-05, "loss": 3.7909, "step": 14375 }, { "epoch": 0.977034923223264, "grad_norm": 0.1851445883512497, "learning_rate": 8.779555646147574e-05, "loss": 4.0371, "step": 14380 }, { "epoch": 0.9773746432939258, "grad_norm": 0.15043723583221436, "learning_rate": 8.779130996059247e-05, "loss": 3.7731, "step": 14385 }, { "epoch": 0.9777143633645876, "grad_norm": 0.19180312752723694, "learning_rate": 8.778706345970921e-05, "loss": 3.8016, "step": 14390 }, { "epoch": 0.9780540834352494, "grad_norm": 0.18517717719078064, "learning_rate": 8.778281695882593e-05, "loss": 4.0766, "step": 14395 }, { "epoch": 0.9783938035059111, "grad_norm": 0.2040787637233734, "learning_rate": 8.777857045794266e-05, "loss": 3.9668, "step": 14400 }, { "epoch": 0.978733523576573, "grad_norm": 0.17522788047790527, "learning_rate": 8.777432395705938e-05, "loss": 3.7057, "step": 14405 }, { "epoch": 0.9790732436472347, "grad_norm": 0.258577823638916, "learning_rate": 8.777007745617611e-05, "loss": 3.8998, "step": 14410 }, { "epoch": 0.9794129637178964, "grad_norm": 0.16101132333278656, "learning_rate": 8.776583095529284e-05, "loss": 4.024, "step": 14415 }, { "epoch": 0.9797526837885582, "grad_norm": 0.23143424093723297, "learning_rate": 8.776158445440957e-05, "loss": 3.876, "step": 14420 }, { "epoch": 0.98009240385922, "grad_norm": 0.5347188711166382, "learning_rate": 8.77573379535263e-05, "loss": 3.721, "step": 14425 }, { "epoch": 0.9804321239298818, "grad_norm": 3.5879626274108887, "learning_rate": 8.775309145264302e-05, "loss": 3.9406, "step": 14430 }, { "epoch": 0.9807718440005435, "grad_norm": 0.48771944642066956, "learning_rate": 8.774884495175975e-05, "loss": 3.8004, "step": 14435 }, { "epoch": 0.9811115640712054, "grad_norm": 0.1710396111011505, "learning_rate": 8.774459845087648e-05, "loss": 3.9633, "step": 14440 }, { "epoch": 0.9814512841418671, "grad_norm": 0.16899172961711884, "learning_rate": 8.774035194999321e-05, "loss": 4.0779, "step": 14445 }, { "epoch": 0.9817910042125289, "grad_norm": 0.18617889285087585, "learning_rate": 8.773610544910994e-05, "loss": 3.8466, "step": 14450 }, { "epoch": 0.9821307242831907, "grad_norm": 0.13299688696861267, "learning_rate": 8.773185894822666e-05, "loss": 4.0135, "step": 14455 }, { "epoch": 0.9824704443538524, "grad_norm": 0.20678380131721497, "learning_rate": 8.772761244734339e-05, "loss": 3.8453, "step": 14460 }, { "epoch": 0.9828101644245142, "grad_norm": 0.14222410321235657, "learning_rate": 8.772336594646012e-05, "loss": 3.9648, "step": 14465 }, { "epoch": 0.9831498844951759, "grad_norm": 0.21488900482654572, "learning_rate": 8.771911944557685e-05, "loss": 3.9613, "step": 14470 }, { "epoch": 0.9834896045658378, "grad_norm": 0.2058933824300766, "learning_rate": 8.771487294469358e-05, "loss": 3.7448, "step": 14475 }, { "epoch": 0.9838293246364995, "grad_norm": 0.25969386100769043, "learning_rate": 8.77106264438103e-05, "loss": 3.8523, "step": 14480 }, { "epoch": 0.9841690447071613, "grad_norm": 0.17804904282093048, "learning_rate": 8.770637994292703e-05, "loss": 4.0136, "step": 14485 }, { "epoch": 0.9845087647778231, "grad_norm": 0.17092011868953705, "learning_rate": 8.770213344204376e-05, "loss": 3.9277, "step": 14490 }, { "epoch": 0.9848484848484849, "grad_norm": 0.26749852299690247, "learning_rate": 8.769788694116049e-05, "loss": 4.1337, "step": 14495 }, { "epoch": 0.9851882049191466, "grad_norm": 1.6659806966781616, "learning_rate": 8.769364044027722e-05, "loss": 3.9058, "step": 14500 }, { "epoch": 0.9855279249898083, "grad_norm": 0.16872169077396393, "learning_rate": 8.768939393939394e-05, "loss": 3.9551, "step": 14505 }, { "epoch": 0.9858676450604702, "grad_norm": 0.25871741771698, "learning_rate": 8.768514743851067e-05, "loss": 3.9576, "step": 14510 }, { "epoch": 0.9862073651311319, "grad_norm": 0.2599276006221771, "learning_rate": 8.76809009376274e-05, "loss": 4.0486, "step": 14515 }, { "epoch": 0.9865470852017937, "grad_norm": 0.17734144628047943, "learning_rate": 8.767665443674413e-05, "loss": 3.9271, "step": 14520 }, { "epoch": 0.9868868052724555, "grad_norm": 0.15988993644714355, "learning_rate": 8.767240793586086e-05, "loss": 3.7398, "step": 14525 }, { "epoch": 0.9872265253431173, "grad_norm": 0.3864258825778961, "learning_rate": 8.766816143497758e-05, "loss": 4.1109, "step": 14530 }, { "epoch": 0.987566245413779, "grad_norm": 0.1917543262243271, "learning_rate": 8.766391493409431e-05, "loss": 3.9459, "step": 14535 }, { "epoch": 0.9879059654844409, "grad_norm": 0.2340976893901825, "learning_rate": 8.765966843321104e-05, "loss": 3.9126, "step": 14540 }, { "epoch": 0.9882456855551026, "grad_norm": 0.1463710069656372, "learning_rate": 8.765542193232777e-05, "loss": 3.8255, "step": 14545 }, { "epoch": 0.9885854056257644, "grad_norm": 0.20803597569465637, "learning_rate": 8.76511754314445e-05, "loss": 3.9285, "step": 14550 }, { "epoch": 0.9889251256964261, "grad_norm": 0.23730699717998505, "learning_rate": 8.764692893056122e-05, "loss": 3.9045, "step": 14555 }, { "epoch": 0.989264845767088, "grad_norm": 0.17138421535491943, "learning_rate": 8.764268242967795e-05, "loss": 3.8542, "step": 14560 }, { "epoch": 0.9896045658377497, "grad_norm": 0.16893406212329865, "learning_rate": 8.763843592879468e-05, "loss": 4.0535, "step": 14565 }, { "epoch": 0.9899442859084114, "grad_norm": 0.13037355244159698, "learning_rate": 8.763418942791141e-05, "loss": 3.7079, "step": 14570 }, { "epoch": 0.9902840059790733, "grad_norm": 0.1940174400806427, "learning_rate": 8.762994292702814e-05, "loss": 3.9701, "step": 14575 }, { "epoch": 0.990623726049735, "grad_norm": 0.19718214869499207, "learning_rate": 8.762569642614486e-05, "loss": 3.8213, "step": 14580 }, { "epoch": 0.9909634461203968, "grad_norm": 0.6371154189109802, "learning_rate": 8.762144992526159e-05, "loss": 3.8541, "step": 14585 }, { "epoch": 0.9913031661910585, "grad_norm": 0.15423880517482758, "learning_rate": 8.761720342437832e-05, "loss": 4.0511, "step": 14590 }, { "epoch": 0.9916428862617204, "grad_norm": 0.1736377775669098, "learning_rate": 8.761295692349505e-05, "loss": 3.8275, "step": 14595 }, { "epoch": 0.9919826063323821, "grad_norm": 0.19258588552474976, "learning_rate": 8.760871042261178e-05, "loss": 4.1035, "step": 14600 }, { "epoch": 0.9923223264030439, "grad_norm": 0.43297508358955383, "learning_rate": 8.760446392172849e-05, "loss": 4.0978, "step": 14605 }, { "epoch": 0.9926620464737057, "grad_norm": 0.20580710470676422, "learning_rate": 8.760021742084523e-05, "loss": 3.8437, "step": 14610 }, { "epoch": 0.9930017665443674, "grad_norm": 1.218690037727356, "learning_rate": 8.759597091996196e-05, "loss": 4.0635, "step": 14615 }, { "epoch": 0.9933414866150292, "grad_norm": 0.1623927801847458, "learning_rate": 8.759172441907867e-05, "loss": 3.8147, "step": 14620 }, { "epoch": 0.993681206685691, "grad_norm": 0.17431679368019104, "learning_rate": 8.758747791819542e-05, "loss": 3.9406, "step": 14625 }, { "epoch": 0.9940209267563528, "grad_norm": 0.18215017020702362, "learning_rate": 8.758323141731214e-05, "loss": 4.1441, "step": 14630 }, { "epoch": 0.9943606468270145, "grad_norm": 0.20073819160461426, "learning_rate": 8.757898491642886e-05, "loss": 3.9045, "step": 14635 }, { "epoch": 0.9947003668976763, "grad_norm": 0.17208696901798248, "learning_rate": 8.75747384155456e-05, "loss": 4.223, "step": 14640 }, { "epoch": 0.9950400869683381, "grad_norm": 0.18610428273677826, "learning_rate": 8.757049191466233e-05, "loss": 3.9652, "step": 14645 }, { "epoch": 0.9953798070389999, "grad_norm": 0.2461862713098526, "learning_rate": 8.756624541377904e-05, "loss": 3.8623, "step": 14650 }, { "epoch": 0.9957195271096616, "grad_norm": 0.37638500332832336, "learning_rate": 8.756199891289578e-05, "loss": 3.896, "step": 14655 }, { "epoch": 0.9960592471803235, "grad_norm": 0.2814083993434906, "learning_rate": 8.755775241201251e-05, "loss": 3.8634, "step": 14660 }, { "epoch": 0.9963989672509852, "grad_norm": 0.2636764645576477, "learning_rate": 8.755350591112923e-05, "loss": 4.1024, "step": 14665 }, { "epoch": 0.9967386873216469, "grad_norm": 0.1730240136384964, "learning_rate": 8.754925941024597e-05, "loss": 4.0616, "step": 14670 }, { "epoch": 0.9970784073923087, "grad_norm": 0.17876474559307098, "learning_rate": 8.75450129093627e-05, "loss": 4.0613, "step": 14675 }, { "epoch": 0.9974181274629705, "grad_norm": 0.1803043931722641, "learning_rate": 8.754076640847941e-05, "loss": 4.0982, "step": 14680 }, { "epoch": 0.9977578475336323, "grad_norm": 0.2912727892398834, "learning_rate": 8.753651990759615e-05, "loss": 3.8865, "step": 14685 }, { "epoch": 0.998097567604294, "grad_norm": 0.21436181664466858, "learning_rate": 8.753227340671287e-05, "loss": 4.0897, "step": 14690 }, { "epoch": 0.9984372876749559, "grad_norm": 0.1772686094045639, "learning_rate": 8.75280269058296e-05, "loss": 4.1523, "step": 14695 }, { "epoch": 0.9987770077456176, "grad_norm": 0.269540935754776, "learning_rate": 8.752378040494634e-05, "loss": 3.9891, "step": 14700 }, { "epoch": 0.9991167278162794, "grad_norm": 0.20457632839679718, "learning_rate": 8.751953390406305e-05, "loss": 3.8832, "step": 14705 }, { "epoch": 0.9994564478869412, "grad_norm": 0.2145458459854126, "learning_rate": 8.751528740317978e-05, "loss": 4.0273, "step": 14710 }, { "epoch": 0.999796167957603, "grad_norm": 0.18700024485588074, "learning_rate": 8.751104090229652e-05, "loss": 4.1366, "step": 14715 }, { "epoch": 1.0, "eval_bertscore": { "f1": 0.8525333878305233, "precision": 0.8753549892468697, "recall": 0.8312223081395127 }, "eval_bleu_4": 0.0021924919669198163, "eval_exact_match": 0.0, "eval_loss": 3.751943349838257, "eval_meteor": 0.07837825616589945, "eval_rouge": { "rouge1": 0.12766942289604644, "rouge2": 0.01466477971693661, "rougeL": 0.11110403147048781, "rougeLsum": 0.11113999503571864 }, "eval_runtime": 404.0965, "eval_samples_per_second": 25.536, "eval_steps_per_second": 3.192, "step": 14718 } ], "logging_steps": 5, "max_steps": 117744, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.174476031013683e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }