{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1777, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005627462014631402, "grad_norm": 0.96278315782547, "learning_rate": 1.1235955056179775e-06, "loss": 0.8197, "step": 1 }, { "epoch": 0.0028137310073157004, "grad_norm": 1.2738590240478516, "learning_rate": 5.617977528089888e-06, "loss": 1.1947, "step": 5 }, { "epoch": 0.005627462014631401, "grad_norm": 2.892261028289795, "learning_rate": 1.1235955056179776e-05, "loss": 1.2592, "step": 10 }, { "epoch": 0.008441193021947102, "grad_norm": 1.4733870029449463, "learning_rate": 1.6853932584269665e-05, "loss": 1.2391, "step": 15 }, { "epoch": 0.011254924029262802, "grad_norm": 1.0532560348510742, "learning_rate": 2.2471910112359552e-05, "loss": 1.5268, "step": 20 }, { "epoch": 0.014068655036578503, "grad_norm": 1.5811922550201416, "learning_rate": 2.8089887640449443e-05, "loss": 1.0691, "step": 25 }, { "epoch": 0.016882386043894203, "grad_norm": 1.4303256273269653, "learning_rate": 3.370786516853933e-05, "loss": 0.6871, "step": 30 }, { "epoch": 0.019696117051209903, "grad_norm": 2.0926148891448975, "learning_rate": 3.9325842696629214e-05, "loss": 0.6074, "step": 35 }, { "epoch": 0.022509848058525603, "grad_norm": 2.149613857269287, "learning_rate": 4.4943820224719104e-05, "loss": 0.4506, "step": 40 }, { "epoch": 0.025323579065841307, "grad_norm": 1.4852650165557861, "learning_rate": 5.0561797752808995e-05, "loss": 0.6363, "step": 45 }, { "epoch": 0.028137310073157007, "grad_norm": 0.8294332027435303, "learning_rate": 5.6179775280898885e-05, "loss": 0.3258, "step": 50 }, { "epoch": 0.030951041080472707, "grad_norm": 0.9797491431236267, "learning_rate": 6.179775280898876e-05, "loss": 0.4315, "step": 55 }, { "epoch": 0.03376477208778841, "grad_norm": 0.8157183527946472, "learning_rate": 6.741573033707866e-05, "loss": 0.5252, "step": 60 }, { "epoch": 0.03657850309510411, "grad_norm": 0.988738477230072, "learning_rate": 7.303370786516854e-05, "loss": 0.3002, "step": 65 }, { "epoch": 0.03939223410241981, "grad_norm": 1.035030484199524, "learning_rate": 7.865168539325843e-05, "loss": 0.3721, "step": 70 }, { "epoch": 0.04220596510973551, "grad_norm": 1.2941393852233887, "learning_rate": 8.426966292134831e-05, "loss": 0.3631, "step": 75 }, { "epoch": 0.04501969611705121, "grad_norm": 0.8939509391784668, "learning_rate": 8.988764044943821e-05, "loss": 0.3648, "step": 80 }, { "epoch": 0.04783342712436691, "grad_norm": 1.7211397886276245, "learning_rate": 9.550561797752809e-05, "loss": 0.3593, "step": 85 }, { "epoch": 0.050647158131682614, "grad_norm": 0.7832581996917725, "learning_rate": 0.00010112359550561799, "loss": 0.2104, "step": 90 }, { "epoch": 0.05346088913899831, "grad_norm": 1.1701756715774536, "learning_rate": 0.00010674157303370786, "loss": 0.3957, "step": 95 }, { "epoch": 0.056274620146314014, "grad_norm": 0.7531750798225403, "learning_rate": 0.00011235955056179777, "loss": 0.3513, "step": 100 }, { "epoch": 0.05908835115362971, "grad_norm": 0.7287600636482239, "learning_rate": 0.00011797752808988764, "loss": 0.3221, "step": 105 }, { "epoch": 0.061902082160945414, "grad_norm": 1.7473770380020142, "learning_rate": 0.00012359550561797752, "loss": 0.1973, "step": 110 }, { "epoch": 0.06471581316826111, "grad_norm": 1.0901485681533813, "learning_rate": 0.00012921348314606744, "loss": 0.4353, "step": 115 }, { "epoch": 0.06752954417557681, "grad_norm": 0.7513278722763062, "learning_rate": 0.00013483146067415732, "loss": 0.3628, "step": 120 }, { "epoch": 0.07034327518289252, "grad_norm": 1.2991347312927246, "learning_rate": 0.0001404494382022472, "loss": 0.3113, "step": 125 }, { "epoch": 0.07315700619020822, "grad_norm": 0.6642701029777527, "learning_rate": 0.0001460674157303371, "loss": 0.4762, "step": 130 }, { "epoch": 0.07597073719752391, "grad_norm": 1.3134933710098267, "learning_rate": 0.00015168539325842697, "loss": 0.3526, "step": 135 }, { "epoch": 0.07878446820483961, "grad_norm": 1.0081052780151367, "learning_rate": 0.00015730337078651685, "loss": 0.3115, "step": 140 }, { "epoch": 0.08159819921215532, "grad_norm": 0.2705545723438263, "learning_rate": 0.00016292134831460674, "loss": 0.2489, "step": 145 }, { "epoch": 0.08441193021947102, "grad_norm": 1.097110390663147, "learning_rate": 0.00016853932584269662, "loss": 0.4727, "step": 150 }, { "epoch": 0.08722566122678672, "grad_norm": 0.8656176328659058, "learning_rate": 0.00017415730337078653, "loss": 0.4242, "step": 155 }, { "epoch": 0.09003939223410241, "grad_norm": 0.8114811778068542, "learning_rate": 0.00017977528089887642, "loss": 0.3033, "step": 160 }, { "epoch": 0.09285312324141812, "grad_norm": 1.072106957435608, "learning_rate": 0.0001853932584269663, "loss": 0.3359, "step": 165 }, { "epoch": 0.09566685424873382, "grad_norm": 0.6422829031944275, "learning_rate": 0.00019101123595505618, "loss": 0.2874, "step": 170 }, { "epoch": 0.09848058525604952, "grad_norm": 1.2826429605484009, "learning_rate": 0.00019662921348314607, "loss": 0.3826, "step": 175 }, { "epoch": 0.10129431626336523, "grad_norm": 0.24743008613586426, "learning_rate": 0.00019999922797341667, "loss": 0.1672, "step": 180 }, { "epoch": 0.10410804727068092, "grad_norm": 0.9253625273704529, "learning_rate": 0.00019999054281125283, "loss": 0.2247, "step": 185 }, { "epoch": 0.10692177827799662, "grad_norm": 0.6312052011489868, "learning_rate": 0.0001999722082946312, "loss": 0.4353, "step": 190 }, { "epoch": 0.10973550928531232, "grad_norm": 0.7200600504875183, "learning_rate": 0.00019994422619288159, "loss": 0.2801, "step": 195 }, { "epoch": 0.11254924029262803, "grad_norm": 0.46093428134918213, "learning_rate": 0.00019990659920635152, "loss": 0.2552, "step": 200 }, { "epoch": 0.11536297129994373, "grad_norm": 2.7881016731262207, "learning_rate": 0.00019985933096614578, "loss": 0.4449, "step": 205 }, { "epoch": 0.11817670230725942, "grad_norm": 0.7893804311752319, "learning_rate": 0.00019980242603377573, "loss": 0.2687, "step": 210 }, { "epoch": 0.12099043331457512, "grad_norm": 0.644159197807312, "learning_rate": 0.00019973588990071937, "loss": 0.2525, "step": 215 }, { "epoch": 0.12380416432189083, "grad_norm": 0.7028160095214844, "learning_rate": 0.00019965972898789125, "loss": 0.4349, "step": 220 }, { "epoch": 0.12661789532920653, "grad_norm": 0.9951076507568359, "learning_rate": 0.0001995739506450229, "loss": 0.3944, "step": 225 }, { "epoch": 0.12943162633652222, "grad_norm": 0.9317317605018616, "learning_rate": 0.00019947856314995349, "loss": 0.2633, "step": 230 }, { "epoch": 0.13224535734383794, "grad_norm": 0.426104336977005, "learning_rate": 0.00019937357570783107, "loss": 0.2667, "step": 235 }, { "epoch": 0.13505908835115363, "grad_norm": 0.7403711676597595, "learning_rate": 0.0001992589984502243, "loss": 0.4544, "step": 240 }, { "epoch": 0.13787281935846932, "grad_norm": 0.7271831035614014, "learning_rate": 0.0001991348424341445, "loss": 0.3209, "step": 245 }, { "epoch": 0.14068655036578503, "grad_norm": 2.071850299835205, "learning_rate": 0.00019900111964097893, "loss": 0.4721, "step": 250 }, { "epoch": 0.14350028137310072, "grad_norm": 0.5502871870994568, "learning_rate": 0.0001988578429753342, "loss": 0.2698, "step": 255 }, { "epoch": 0.14631401238041644, "grad_norm": 1.0685575008392334, "learning_rate": 0.00019870502626379127, "loss": 0.3656, "step": 260 }, { "epoch": 0.14912774338773213, "grad_norm": 0.521375834941864, "learning_rate": 0.00019854268425357105, "loss": 0.3065, "step": 265 }, { "epoch": 0.15194147439504782, "grad_norm": 0.7491894960403442, "learning_rate": 0.0001983708326111111, "loss": 0.3863, "step": 270 }, { "epoch": 0.15475520540236354, "grad_norm": 0.8233822584152222, "learning_rate": 0.0001981894879205539, "loss": 0.3434, "step": 275 }, { "epoch": 0.15756893640967923, "grad_norm": 0.9702492952346802, "learning_rate": 0.0001979986676821465, "loss": 0.3103, "step": 280 }, { "epoch": 0.16038266741699495, "grad_norm": 1.0381535291671753, "learning_rate": 0.00019779839031055157, "loss": 0.4879, "step": 285 }, { "epoch": 0.16319639842431063, "grad_norm": 0.594953179359436, "learning_rate": 0.00019758867513307047, "loss": 0.2185, "step": 290 }, { "epoch": 0.16601012943162632, "grad_norm": 0.9089880585670471, "learning_rate": 0.00019736954238777792, "loss": 0.4932, "step": 295 }, { "epoch": 0.16882386043894204, "grad_norm": 0.8703072667121887, "learning_rate": 0.00019714101322156915, "loss": 0.3456, "step": 300 }, { "epoch": 0.17163759144625773, "grad_norm": 0.9151054620742798, "learning_rate": 0.00019690310968811914, "loss": 0.3413, "step": 305 }, { "epoch": 0.17445132245357345, "grad_norm": 0.7803131341934204, "learning_rate": 0.0001966558547457543, "loss": 0.2144, "step": 310 }, { "epoch": 0.17726505346088914, "grad_norm": 4.146880626678467, "learning_rate": 0.00019639927225523698, "loss": 0.2838, "step": 315 }, { "epoch": 0.18007878446820483, "grad_norm": 0.8946036696434021, "learning_rate": 0.00019613338697746285, "loss": 0.3069, "step": 320 }, { "epoch": 0.18289251547552055, "grad_norm": 1.0504130125045776, "learning_rate": 0.00019585822457107138, "loss": 0.1849, "step": 325 }, { "epoch": 0.18570624648283623, "grad_norm": 0.5320996642112732, "learning_rate": 0.0001955738115899698, "loss": 0.4705, "step": 330 }, { "epoch": 0.18851997749015195, "grad_norm": 1.0972635746002197, "learning_rate": 0.00019528017548077045, "loss": 0.1279, "step": 335 }, { "epoch": 0.19133370849746764, "grad_norm": 0.9836655855178833, "learning_rate": 0.00019497734458014216, "loss": 0.3454, "step": 340 }, { "epoch": 0.19414743950478333, "grad_norm": 0.9435672163963318, "learning_rate": 0.00019466534811207569, "loss": 0.1713, "step": 345 }, { "epoch": 0.19696117051209905, "grad_norm": 0.8675717115402222, "learning_rate": 0.00019434421618506358, "loss": 0.4799, "step": 350 }, { "epoch": 0.19977490151941474, "grad_norm": 0.5335102677345276, "learning_rate": 0.00019401397978919453, "loss": 0.242, "step": 355 }, { "epoch": 0.20258863252673046, "grad_norm": 0.7480678558349609, "learning_rate": 0.00019367467079316279, "loss": 0.3568, "step": 360 }, { "epoch": 0.20540236353404615, "grad_norm": 0.23846450448036194, "learning_rate": 0.0001933263219411928, "loss": 0.4519, "step": 365 }, { "epoch": 0.20821609454136183, "grad_norm": 0.24460311233997345, "learning_rate": 0.00019296896684987925, "loss": 0.3763, "step": 370 }, { "epoch": 0.21102982554867755, "grad_norm": 0.6336620450019836, "learning_rate": 0.0001926026400049429, "loss": 0.3754, "step": 375 }, { "epoch": 0.21384355655599324, "grad_norm": 0.4195510745048523, "learning_rate": 0.00019222737675790276, "loss": 0.2576, "step": 380 }, { "epoch": 0.21665728756330896, "grad_norm": 1.3396929502487183, "learning_rate": 0.00019184321332266452, "loss": 0.3267, "step": 385 }, { "epoch": 0.21947101857062465, "grad_norm": 0.49325576424598694, "learning_rate": 0.0001914501867720258, "loss": 0.2602, "step": 390 }, { "epoch": 0.22228474957794034, "grad_norm": 0.5282377004623413, "learning_rate": 0.00019104833503409848, "loss": 0.3498, "step": 395 }, { "epoch": 0.22509848058525606, "grad_norm": 0.6364492774009705, "learning_rate": 0.00019063769688864866, "loss": 0.2147, "step": 400 }, { "epoch": 0.22791221159257175, "grad_norm": 0.8976377248764038, "learning_rate": 0.00019021831196335418, "loss": 0.328, "step": 405 }, { "epoch": 0.23072594259988746, "grad_norm": 0.4209904372692108, "learning_rate": 0.0001897902207299805, "loss": 0.2822, "step": 410 }, { "epoch": 0.23353967360720315, "grad_norm": 0.5531566739082336, "learning_rate": 0.0001893534645004751, "loss": 0.4366, "step": 415 }, { "epoch": 0.23635340461451884, "grad_norm": 0.592050313949585, "learning_rate": 0.00018890808542298073, "loss": 0.381, "step": 420 }, { "epoch": 0.23916713562183456, "grad_norm": 0.8051882982254028, "learning_rate": 0.00018845412647776794, "loss": 0.3602, "step": 425 }, { "epoch": 0.24198086662915025, "grad_norm": 0.7593362331390381, "learning_rate": 0.0001879916314730875, "loss": 0.3809, "step": 430 }, { "epoch": 0.24479459763646597, "grad_norm": 1.2135759592056274, "learning_rate": 0.00018752064504094272, "loss": 0.2138, "step": 435 }, { "epoch": 0.24760832864378166, "grad_norm": 0.14413990080356598, "learning_rate": 0.00018704121263278227, "loss": 0.3506, "step": 440 }, { "epoch": 0.2504220596510974, "grad_norm": 0.6321181058883667, "learning_rate": 0.00018655338051511413, "loss": 0.3232, "step": 445 }, { "epoch": 0.25323579065841306, "grad_norm": 0.7276772856712341, "learning_rate": 0.00018605719576504065, "loss": 0.2345, "step": 450 }, { "epoch": 0.25604952166572875, "grad_norm": 0.324861079454422, "learning_rate": 0.00018555270626571555, "loss": 0.1345, "step": 455 }, { "epoch": 0.25886325267304444, "grad_norm": 0.7779459953308105, "learning_rate": 0.000185039960701723, "loss": 0.2958, "step": 460 }, { "epoch": 0.26167698368036013, "grad_norm": 0.6974682211875916, "learning_rate": 0.0001845190085543795, "loss": 0.2257, "step": 465 }, { "epoch": 0.2644907146876759, "grad_norm": 0.9312912821769714, "learning_rate": 0.0001839899000969587, "loss": 0.353, "step": 470 }, { "epoch": 0.26730444569499157, "grad_norm": 0.49484914541244507, "learning_rate": 0.00018345268638984003, "loss": 0.2321, "step": 475 }, { "epoch": 0.27011817670230726, "grad_norm": 0.24110960960388184, "learning_rate": 0.00018290741927558113, "loss": 0.2501, "step": 480 }, { "epoch": 0.27293190770962295, "grad_norm": 0.5313132405281067, "learning_rate": 0.00018235415137391497, "loss": 0.2477, "step": 485 }, { "epoch": 0.27574563871693863, "grad_norm": 0.6360633373260498, "learning_rate": 0.00018179293607667178, "loss": 0.2846, "step": 490 }, { "epoch": 0.2785593697242544, "grad_norm": 0.5810567140579224, "learning_rate": 0.00018122382754262681, "loss": 0.2196, "step": 495 }, { "epoch": 0.28137310073157007, "grad_norm": 0.7277317047119141, "learning_rate": 0.00018064688069227368, "loss": 0.2656, "step": 500 }, { "epoch": 0.28418683173888576, "grad_norm": 0.7561081051826477, "learning_rate": 0.00018006215120252453, "loss": 0.3004, "step": 505 }, { "epoch": 0.28700056274620145, "grad_norm": 0.8930642604827881, "learning_rate": 0.0001794696955013369, "loss": 0.445, "step": 510 }, { "epoch": 0.28981429375351714, "grad_norm": 0.9028257727622986, "learning_rate": 0.00017886957076226838, "loss": 0.3362, "step": 515 }, { "epoch": 0.2926280247608329, "grad_norm": 0.6070359945297241, "learning_rate": 0.0001782618348989593, "loss": 0.1993, "step": 520 }, { "epoch": 0.2954417557681486, "grad_norm": 0.9078888893127441, "learning_rate": 0.0001776465465595437, "loss": 0.2554, "step": 525 }, { "epoch": 0.29825548677546426, "grad_norm": 0.7235105633735657, "learning_rate": 0.0001770237651209898, "loss": 0.1807, "step": 530 }, { "epoch": 0.30106921778277995, "grad_norm": 0.37401115894317627, "learning_rate": 0.00017639355068336987, "loss": 0.153, "step": 535 }, { "epoch": 0.30388294879009564, "grad_norm": 0.4220016896724701, "learning_rate": 0.00017575596406406048, "loss": 0.2249, "step": 540 }, { "epoch": 0.3066966797974114, "grad_norm": 0.6732789278030396, "learning_rate": 0.00017511106679187334, "loss": 0.1663, "step": 545 }, { "epoch": 0.3095104108047271, "grad_norm": 0.8022940754890442, "learning_rate": 0.00017445892110111783, "loss": 0.3083, "step": 550 }, { "epoch": 0.31232414181204277, "grad_norm": 2.0369653701782227, "learning_rate": 0.00017379958992559493, "loss": 0.3939, "step": 555 }, { "epoch": 0.31513787281935846, "grad_norm": 1.5206650495529175, "learning_rate": 0.00017313313689252418, "loss": 0.3079, "step": 560 }, { "epoch": 0.31795160382667415, "grad_norm": 0.343148410320282, "learning_rate": 0.00017245962631640341, "loss": 0.2408, "step": 565 }, { "epoch": 0.3207653348339899, "grad_norm": 0.7586761116981506, "learning_rate": 0.00017177912319280217, "loss": 0.2298, "step": 570 }, { "epoch": 0.3235790658413056, "grad_norm": 0.5147440433502197, "learning_rate": 0.00017109169319208948, "loss": 0.3067, "step": 575 }, { "epoch": 0.32639279684862127, "grad_norm": 0.5967961549758911, "learning_rate": 0.0001703974026530966, "loss": 0.3291, "step": 580 }, { "epoch": 0.32920652785593696, "grad_norm": 0.8705066442489624, "learning_rate": 0.00016969631857671497, "loss": 0.3395, "step": 585 }, { "epoch": 0.33202025886325265, "grad_norm": 0.36913836002349854, "learning_rate": 0.00016898850861943058, "loss": 0.15, "step": 590 }, { "epoch": 0.3348339898705684, "grad_norm": 1.0335655212402344, "learning_rate": 0.00016827404108679485, "loss": 0.2577, "step": 595 }, { "epoch": 0.3376477208778841, "grad_norm": 0.44228196144104004, "learning_rate": 0.00016755298492683308, "loss": 0.3735, "step": 600 }, { "epoch": 0.3404614518851998, "grad_norm": 0.7567837238311768, "learning_rate": 0.0001668254097233907, "loss": 0.233, "step": 605 }, { "epoch": 0.34327518289251546, "grad_norm": 0.7530750632286072, "learning_rate": 0.00016609138568941809, "loss": 0.34, "step": 610 }, { "epoch": 0.34608891389983115, "grad_norm": 0.5381020307540894, "learning_rate": 0.0001653509836601952, "loss": 0.3743, "step": 615 }, { "epoch": 0.3489026449071469, "grad_norm": 1.5347527265548706, "learning_rate": 0.00016460427508649546, "loss": 0.3224, "step": 620 }, { "epoch": 0.3517163759144626, "grad_norm": 0.7328157424926758, "learning_rate": 0.0001638513320276907, "loss": 0.4606, "step": 625 }, { "epoch": 0.3545301069217783, "grad_norm": 0.5422099232673645, "learning_rate": 0.0001630922271447972, "loss": 0.239, "step": 630 }, { "epoch": 0.35734383792909397, "grad_norm": 0.12207705527544022, "learning_rate": 0.0001623270336934638, "loss": 0.1777, "step": 635 }, { "epoch": 0.36015756893640966, "grad_norm": 0.7163983583450317, "learning_rate": 0.00016155582551690236, "loss": 0.308, "step": 640 }, { "epoch": 0.3629712999437254, "grad_norm": 0.5855613350868225, "learning_rate": 0.00016077867703876182, "loss": 0.3146, "step": 645 }, { "epoch": 0.3657850309510411, "grad_norm": 0.3779029846191406, "learning_rate": 0.0001599956632559461, "loss": 0.2512, "step": 650 }, { "epoch": 0.3685987619583568, "grad_norm": 0.4623885452747345, "learning_rate": 0.00015920685973137673, "loss": 0.3344, "step": 655 }, { "epoch": 0.37141249296567247, "grad_norm": 0.5071548819541931, "learning_rate": 0.00015841234258670065, "loss": 0.2301, "step": 660 }, { "epoch": 0.37422622397298816, "grad_norm": 0.8452264070510864, "learning_rate": 0.0001576121884949446, "loss": 0.3051, "step": 665 }, { "epoch": 0.3770399549803039, "grad_norm": 0.3451974391937256, "learning_rate": 0.00015680647467311557, "loss": 0.2358, "step": 670 }, { "epoch": 0.3798536859876196, "grad_norm": 0.7894652485847473, "learning_rate": 0.0001559952788747495, "loss": 0.2995, "step": 675 }, { "epoch": 0.3826674169949353, "grad_norm": 0.6503499746322632, "learning_rate": 0.00015517867938240763, "loss": 0.3838, "step": 680 }, { "epoch": 0.385481148002251, "grad_norm": 0.8255408406257629, "learning_rate": 0.00015435675500012212, "loss": 0.3326, "step": 685 }, { "epoch": 0.38829487900956666, "grad_norm": 0.9840317368507385, "learning_rate": 0.00015352958504579123, "loss": 0.3544, "step": 690 }, { "epoch": 0.3911086100168824, "grad_norm": 0.5513814687728882, "learning_rate": 0.00015269724934352497, "loss": 0.3565, "step": 695 }, { "epoch": 0.3939223410241981, "grad_norm": 0.9206532835960388, "learning_rate": 0.00015185982821594175, "loss": 0.6995, "step": 700 }, { "epoch": 0.3967360720315138, "grad_norm": 0.6624305248260498, "learning_rate": 0.00015101740247641714, "loss": 0.236, "step": 705 }, { "epoch": 0.3995498030388295, "grad_norm": 0.4557129442691803, "learning_rate": 0.00015017005342128517, "loss": 0.2185, "step": 710 }, { "epoch": 0.40236353404614517, "grad_norm": 0.5107508301734924, "learning_rate": 0.000149317862821993, "loss": 0.2376, "step": 715 }, { "epoch": 0.4051772650534609, "grad_norm": 0.6542500257492065, "learning_rate": 0.00014846091291720957, "loss": 0.3315, "step": 720 }, { "epoch": 0.4079909960607766, "grad_norm": 1.1817783117294312, "learning_rate": 0.00014759928640488965, "loss": 0.27, "step": 725 }, { "epoch": 0.4108047270680923, "grad_norm": 0.47976672649383545, "learning_rate": 0.00014673306643429314, "loss": 0.2458, "step": 730 }, { "epoch": 0.413618458075408, "grad_norm": 0.9391474723815918, "learning_rate": 0.00014586233659796087, "loss": 0.424, "step": 735 }, { "epoch": 0.41643218908272367, "grad_norm": 0.6755409240722656, "learning_rate": 0.0001449871809236478, "loss": 0.3009, "step": 740 }, { "epoch": 0.4192459200900394, "grad_norm": 1.1786988973617554, "learning_rate": 0.000144107683866214, "loss": 0.2926, "step": 745 }, { "epoch": 0.4220596510973551, "grad_norm": 0.4564046561717987, "learning_rate": 0.00014322393029947468, "loss": 0.3663, "step": 750 }, { "epoch": 0.4248733821046708, "grad_norm": 0.650117814540863, "learning_rate": 0.00014233600550800962, "loss": 0.2522, "step": 755 }, { "epoch": 0.4276871131119865, "grad_norm": 0.35542795062065125, "learning_rate": 0.0001414439951789328, "loss": 0.1902, "step": 760 }, { "epoch": 0.4305008441193022, "grad_norm": 0.6666870713233948, "learning_rate": 0.00014054798539362356, "loss": 0.3625, "step": 765 }, { "epoch": 0.4333145751266179, "grad_norm": 1.3364002704620361, "learning_rate": 0.00013964806261941944, "loss": 0.2144, "step": 770 }, { "epoch": 0.4361283061339336, "grad_norm": 0.30019038915634155, "learning_rate": 0.00013874431370127188, "loss": 0.205, "step": 775 }, { "epoch": 0.4389420371412493, "grad_norm": 0.7001076936721802, "learning_rate": 0.0001378368258533654, "loss": 0.4068, "step": 780 }, { "epoch": 0.441755768148565, "grad_norm": 1.1424932479858398, "learning_rate": 0.0001369256866507012, "loss": 0.2298, "step": 785 }, { "epoch": 0.4445694991558807, "grad_norm": 0.8599133491516113, "learning_rate": 0.00013601098402064607, "loss": 0.2843, "step": 790 }, { "epoch": 0.4473832301631964, "grad_norm": 0.8263369798660278, "learning_rate": 0.00013509280623444695, "loss": 0.2451, "step": 795 }, { "epoch": 0.4501969611705121, "grad_norm": 0.9365822672843933, "learning_rate": 0.00013417124189871272, "loss": 0.2621, "step": 800 }, { "epoch": 0.4530106921778278, "grad_norm": 0.47065469622612, "learning_rate": 0.00013324637994686326, "loss": 0.191, "step": 805 }, { "epoch": 0.4558244231851435, "grad_norm": 0.6455582976341248, "learning_rate": 0.00013231830963054722, "loss": 0.2073, "step": 810 }, { "epoch": 0.4586381541924592, "grad_norm": 0.8905434012413025, "learning_rate": 0.0001313871205110291, "loss": 0.3213, "step": 815 }, { "epoch": 0.4614518851997749, "grad_norm": 0.44368186593055725, "learning_rate": 0.0001304529024505461, "loss": 0.4487, "step": 820 }, { "epoch": 0.4642656162070906, "grad_norm": 0.3236369788646698, "learning_rate": 0.00012951574560363636, "loss": 0.2343, "step": 825 }, { "epoch": 0.4670793472144063, "grad_norm": 0.3316313624382019, "learning_rate": 0.00012857574040843876, "loss": 0.1704, "step": 830 }, { "epoch": 0.469893078221722, "grad_norm": 1.180114984512329, "learning_rate": 0.0001276329775779655, "loss": 0.2944, "step": 835 }, { "epoch": 0.4727068092290377, "grad_norm": 0.4699708819389343, "learning_rate": 0.00012668754809134773, "loss": 0.1709, "step": 840 }, { "epoch": 0.47552054023635343, "grad_norm": 0.5087912082672119, "learning_rate": 0.00012573954318505624, "loss": 0.2753, "step": 845 }, { "epoch": 0.4783342712436691, "grad_norm": 0.21406421065330505, "learning_rate": 0.00012478905434409662, "loss": 0.2955, "step": 850 }, { "epoch": 0.4811480022509848, "grad_norm": 0.8056962490081787, "learning_rate": 0.0001238361732931808, "loss": 0.275, "step": 855 }, { "epoch": 0.4839617332583005, "grad_norm": 0.7347704768180847, "learning_rate": 0.00012288099198787532, "loss": 0.2448, "step": 860 }, { "epoch": 0.4867754642656162, "grad_norm": 0.43679895997047424, "learning_rate": 0.0001219236026057275, "loss": 0.4004, "step": 865 }, { "epoch": 0.48958919527293193, "grad_norm": 0.4202831983566284, "learning_rate": 0.00012096409753736991, "loss": 0.1963, "step": 870 }, { "epoch": 0.4924029262802476, "grad_norm": 0.8716102838516235, "learning_rate": 0.00012000256937760445, "loss": 0.225, "step": 875 }, { "epoch": 0.4952166572875633, "grad_norm": 0.2482863813638687, "learning_rate": 0.00011903911091646684, "loss": 0.2338, "step": 880 }, { "epoch": 0.498030388294879, "grad_norm": 0.6226937174797058, "learning_rate": 0.000118073815130272, "loss": 0.3606, "step": 885 }, { "epoch": 0.5008441193021947, "grad_norm": 0.4387325942516327, "learning_rate": 0.0001171067751726416, "loss": 0.231, "step": 890 }, { "epoch": 0.5036578503095104, "grad_norm": 0.26261425018310547, "learning_rate": 0.00011613808436551454, "loss": 0.1239, "step": 895 }, { "epoch": 0.5064715813168261, "grad_norm": 1.2383506298065186, "learning_rate": 0.00011516783619014109, "loss": 0.2496, "step": 900 }, { "epoch": 0.5092853123241418, "grad_norm": 1.853761911392212, "learning_rate": 0.00011419612427806172, "loss": 0.3915, "step": 905 }, { "epoch": 0.5120990433314575, "grad_norm": 0.330138623714447, "learning_rate": 0.00011322304240207145, "loss": 0.0917, "step": 910 }, { "epoch": 0.5149127743387732, "grad_norm": 0.41656142473220825, "learning_rate": 0.00011224868446717036, "loss": 0.1754, "step": 915 }, { "epoch": 0.5177265053460889, "grad_norm": 0.6251401901245117, "learning_rate": 0.00011127314450150175, "loss": 0.3901, "step": 920 }, { "epoch": 0.5205402363534046, "grad_norm": 1.254900336265564, "learning_rate": 0.00011029651664727798, "loss": 0.2828, "step": 925 }, { "epoch": 0.5233539673607203, "grad_norm": 0.9572696089744568, "learning_rate": 0.00010931889515169555, "loss": 0.2235, "step": 930 }, { "epoch": 0.526167698368036, "grad_norm": 0.8414142727851868, "learning_rate": 0.00010834037435784008, "loss": 0.2718, "step": 935 }, { "epoch": 0.5289814293753518, "grad_norm": 0.4331166744232178, "learning_rate": 0.00010736104869558176, "loss": 0.2558, "step": 940 }, { "epoch": 0.5317951603826674, "grad_norm": 0.32980430126190186, "learning_rate": 0.00010638101267246283, "loss": 0.1117, "step": 945 }, { "epoch": 0.5346088913899831, "grad_norm": 0.7335298657417297, "learning_rate": 0.00010540036086457723, "loss": 0.4412, "step": 950 }, { "epoch": 0.5374226223972988, "grad_norm": 0.6139857769012451, "learning_rate": 0.00010441918790744372, "loss": 0.2925, "step": 955 }, { "epoch": 0.5402363534046145, "grad_norm": 0.3401097059249878, "learning_rate": 0.00010343758848687341, "loss": 0.2625, "step": 960 }, { "epoch": 0.5430500844119303, "grad_norm": 0.3688424229621887, "learning_rate": 0.00010245565732983227, "loss": 0.211, "step": 965 }, { "epoch": 0.5458638154192459, "grad_norm": 1.3460103273391724, "learning_rate": 0.00010147348919529969, "loss": 0.3091, "step": 970 }, { "epoch": 0.5486775464265616, "grad_norm": 0.4599795937538147, "learning_rate": 0.00010049117886512404, "loss": 0.2301, "step": 975 }, { "epoch": 0.5514912774338773, "grad_norm": 0.5787628293037415, "learning_rate": 9.950882113487598e-05, "loss": 0.258, "step": 980 }, { "epoch": 0.554305008441193, "grad_norm": 0.8748778104782104, "learning_rate": 9.852651080470033e-05, "loss": 0.3606, "step": 985 }, { "epoch": 0.5571187394485088, "grad_norm": 0.4328353703022003, "learning_rate": 9.754434267016775e-05, "loss": 0.2004, "step": 990 }, { "epoch": 0.5599324704558244, "grad_norm": 0.9542059898376465, "learning_rate": 9.656241151312661e-05, "loss": 0.2206, "step": 995 }, { "epoch": 0.5627462014631401, "grad_norm": 0.3367530405521393, "learning_rate": 9.558081209255629e-05, "loss": 0.1936, "step": 1000 }, { "epoch": 0.5655599324704558, "grad_norm": 0.3511320650577545, "learning_rate": 9.459963913542279e-05, "loss": 0.1467, "step": 1005 }, { "epoch": 0.5683736634777715, "grad_norm": 0.5722060799598694, "learning_rate": 9.361898732753716e-05, "loss": 0.3173, "step": 1010 }, { "epoch": 0.5711873944850873, "grad_norm": 0.5380959510803223, "learning_rate": 9.263895130441826e-05, "loss": 0.2697, "step": 1015 }, { "epoch": 0.5740011254924029, "grad_norm": 0.7701444625854492, "learning_rate": 9.165962564215993e-05, "loss": 0.2513, "step": 1020 }, { "epoch": 0.5768148564997186, "grad_norm": 0.44029852747917175, "learning_rate": 9.068110484830447e-05, "loss": 0.3467, "step": 1025 }, { "epoch": 0.5796285875070343, "grad_norm": 0.6167469620704651, "learning_rate": 8.970348335272203e-05, "loss": 0.3191, "step": 1030 }, { "epoch": 0.58244231851435, "grad_norm": 0.8046761751174927, "learning_rate": 8.872685549849827e-05, "loss": 0.2874, "step": 1035 }, { "epoch": 0.5852560495216658, "grad_norm": 0.6045218110084534, "learning_rate": 8.775131553282965e-05, "loss": 0.2724, "step": 1040 }, { "epoch": 0.5880697805289814, "grad_norm": 0.23991712927818298, "learning_rate": 8.67769575979286e-05, "loss": 0.0839, "step": 1045 }, { "epoch": 0.5908835115362971, "grad_norm": 0.5629101395606995, "learning_rate": 8.580387572193829e-05, "loss": 0.2608, "step": 1050 }, { "epoch": 0.5936972425436128, "grad_norm": 0.7069487571716309, "learning_rate": 8.483216380985895e-05, "loss": 0.359, "step": 1055 }, { "epoch": 0.5965109735509285, "grad_norm": 1.0714657306671143, "learning_rate": 8.386191563448548e-05, "loss": 0.2144, "step": 1060 }, { "epoch": 0.5993247045582443, "grad_norm": 0.8178947567939758, "learning_rate": 8.289322482735844e-05, "loss": 0.3105, "step": 1065 }, { "epoch": 0.6021384355655599, "grad_norm": 0.7573699951171875, "learning_rate": 8.192618486972803e-05, "loss": 0.2918, "step": 1070 }, { "epoch": 0.6049521665728756, "grad_norm": 0.3417803645133972, "learning_rate": 8.096088908353315e-05, "loss": 0.1382, "step": 1075 }, { "epoch": 0.6077658975801913, "grad_norm": 0.7476038336753845, "learning_rate": 7.999743062239557e-05, "loss": 0.4213, "step": 1080 }, { "epoch": 0.610579628587507, "grad_norm": 0.3231750726699829, "learning_rate": 7.90359024626301e-05, "loss": 0.2874, "step": 1085 }, { "epoch": 0.6133933595948228, "grad_norm": 0.5958102345466614, "learning_rate": 7.807639739427251e-05, "loss": 0.2656, "step": 1090 }, { "epoch": 0.6162070906021384, "grad_norm": 0.4588276743888855, "learning_rate": 7.711900801212466e-05, "loss": 0.1933, "step": 1095 }, { "epoch": 0.6190208216094542, "grad_norm": 0.5570498704910278, "learning_rate": 7.616382670681924e-05, "loss": 0.3897, "step": 1100 }, { "epoch": 0.6218345526167698, "grad_norm": 0.41902509331703186, "learning_rate": 7.521094565590338e-05, "loss": 0.2403, "step": 1105 }, { "epoch": 0.6246482836240855, "grad_norm": 0.9511467814445496, "learning_rate": 7.426045681494378e-05, "loss": 0.3146, "step": 1110 }, { "epoch": 0.6274620146314013, "grad_norm": 1.1212773323059082, "learning_rate": 7.33124519086523e-05, "loss": 0.2424, "step": 1115 }, { "epoch": 0.6302757456387169, "grad_norm": 0.8666883111000061, "learning_rate": 7.236702242203457e-05, "loss": 0.319, "step": 1120 }, { "epoch": 0.6330894766460327, "grad_norm": 1.2638081312179565, "learning_rate": 7.142425959156125e-05, "loss": 0.1587, "step": 1125 }, { "epoch": 0.6359032076533483, "grad_norm": 0.33488303422927856, "learning_rate": 7.04842543963637e-05, "loss": 0.2139, "step": 1130 }, { "epoch": 0.638716938660664, "grad_norm": 0.6867479681968689, "learning_rate": 6.954709754945394e-05, "loss": 0.2332, "step": 1135 }, { "epoch": 0.6415306696679798, "grad_norm": 0.4780934751033783, "learning_rate": 6.861287948897091e-05, "loss": 0.1127, "step": 1140 }, { "epoch": 0.6443444006752954, "grad_norm": 0.8691847920417786, "learning_rate": 6.768169036945277e-05, "loss": 0.3039, "step": 1145 }, { "epoch": 0.6471581316826112, "grad_norm": 0.4771972894668579, "learning_rate": 6.675362005313677e-05, "loss": 0.2787, "step": 1150 }, { "epoch": 0.6499718626899268, "grad_norm": 0.5366829037666321, "learning_rate": 6.58287581012873e-05, "loss": 0.1824, "step": 1155 }, { "epoch": 0.6527855936972425, "grad_norm": 0.28026753664016724, "learning_rate": 6.490719376555305e-05, "loss": 0.2074, "step": 1160 }, { "epoch": 0.6555993247045583, "grad_norm": 0.9920913577079773, "learning_rate": 6.398901597935393e-05, "loss": 0.3188, "step": 1165 }, { "epoch": 0.6584130557118739, "grad_norm": 0.5217199921607971, "learning_rate": 6.30743133492988e-05, "loss": 0.2846, "step": 1170 }, { "epoch": 0.6612267867191897, "grad_norm": 0.5738883018493652, "learning_rate": 6.216317414663463e-05, "loss": 0.1972, "step": 1175 }, { "epoch": 0.6640405177265053, "grad_norm": 0.3134082853794098, "learning_rate": 6.125568629872813e-05, "loss": 0.1806, "step": 1180 }, { "epoch": 0.666854248733821, "grad_norm": 0.4762999415397644, "learning_rate": 6.035193738058056e-05, "loss": 0.2386, "step": 1185 }, { "epoch": 0.6696679797411368, "grad_norm": 0.48775815963745117, "learning_rate": 5.945201460637645e-05, "loss": 0.1261, "step": 1190 }, { "epoch": 0.6724817107484524, "grad_norm": 0.5460477471351624, "learning_rate": 5.855600482106721e-05, "loss": 0.3201, "step": 1195 }, { "epoch": 0.6752954417557682, "grad_norm": 0.41563519835472107, "learning_rate": 5.766399449199037e-05, "loss": 0.2287, "step": 1200 }, { "epoch": 0.6781091727630838, "grad_norm": 0.832744300365448, "learning_rate": 5.677606970052529e-05, "loss": 0.5409, "step": 1205 }, { "epoch": 0.6809229037703995, "grad_norm": 0.8101387023925781, "learning_rate": 5.5892316133786005e-05, "loss": 0.1934, "step": 1210 }, { "epoch": 0.6837366347777153, "grad_norm": 0.9781274795532227, "learning_rate": 5.501281907635223e-05, "loss": 0.1842, "step": 1215 }, { "epoch": 0.6865503657850309, "grad_norm": 0.36751049757003784, "learning_rate": 5.413766340203914e-05, "loss": 0.2631, "step": 1220 }, { "epoch": 0.6893640967923467, "grad_norm": 0.3681579828262329, "learning_rate": 5.3266933565706865e-05, "loss": 0.2639, "step": 1225 }, { "epoch": 0.6921778277996623, "grad_norm": 0.7795785069465637, "learning_rate": 5.240071359511035e-05, "loss": 0.3817, "step": 1230 }, { "epoch": 0.694991558806978, "grad_norm": 0.6714096069335938, "learning_rate": 5.153908708279045e-05, "loss": 0.2655, "step": 1235 }, { "epoch": 0.6978052898142938, "grad_norm": 0.6018862724304199, "learning_rate": 5.0682137178007025e-05, "loss": 0.2517, "step": 1240 }, { "epoch": 0.7006190208216094, "grad_norm": 0.7170803546905518, "learning_rate": 4.9829946578714825e-05, "loss": 0.3097, "step": 1245 }, { "epoch": 0.7034327518289252, "grad_norm": 0.37394005060195923, "learning_rate": 4.898259752358287e-05, "loss": 0.1883, "step": 1250 }, { "epoch": 0.7062464828362408, "grad_norm": 2.812126874923706, "learning_rate": 4.814017178405829e-05, "loss": 0.1431, "step": 1255 }, { "epoch": 0.7090602138435566, "grad_norm": 0.5540988445281982, "learning_rate": 4.730275065647506e-05, "loss": 0.2109, "step": 1260 }, { "epoch": 0.7118739448508723, "grad_norm": 0.9915019869804382, "learning_rate": 4.6470414954208785e-05, "loss": 0.2857, "step": 1265 }, { "epoch": 0.7146876758581879, "grad_norm": 1.248504400253296, "learning_rate": 4.56432449998779e-05, "loss": 0.4287, "step": 1270 }, { "epoch": 0.7175014068655037, "grad_norm": 0.5127077102661133, "learning_rate": 4.482132061759239e-05, "loss": 0.204, "step": 1275 }, { "epoch": 0.7203151378728193, "grad_norm": 0.6224874258041382, "learning_rate": 4.400472112525051e-05, "loss": 0.2376, "step": 1280 }, { "epoch": 0.7231288688801351, "grad_norm": 0.3423043191432953, "learning_rate": 4.3193525326884435e-05, "loss": 0.1957, "step": 1285 }, { "epoch": 0.7259425998874508, "grad_norm": 0.41790780425071716, "learning_rate": 4.238781150505542e-05, "loss": 0.3171, "step": 1290 }, { "epoch": 0.7287563308947664, "grad_norm": 0.550262451171875, "learning_rate": 4.158765741329935e-05, "loss": 0.2016, "step": 1295 }, { "epoch": 0.7315700619020822, "grad_norm": 0.5064123868942261, "learning_rate": 4.079314026862331e-05, "loss": 0.2747, "step": 1300 }, { "epoch": 0.7343837929093978, "grad_norm": 0.6976874470710754, "learning_rate": 4.000433674405392e-05, "loss": 0.2478, "step": 1305 }, { "epoch": 0.7371975239167136, "grad_norm": 0.8951148986816406, "learning_rate": 3.9221322961238213e-05, "loss": 0.1747, "step": 1310 }, { "epoch": 0.7400112549240293, "grad_norm": 0.5888150930404663, "learning_rate": 3.8444174483097675e-05, "loss": 0.2342, "step": 1315 }, { "epoch": 0.7428249859313449, "grad_norm": 0.39759594202041626, "learning_rate": 3.7672966306536226e-05, "loss": 0.2402, "step": 1320 }, { "epoch": 0.7456387169386607, "grad_norm": 1.4384478330612183, "learning_rate": 3.690777285520281e-05, "loss": 0.3132, "step": 1325 }, { "epoch": 0.7484524479459763, "grad_norm": 0.4053248465061188, "learning_rate": 3.614866797230935e-05, "loss": 0.1615, "step": 1330 }, { "epoch": 0.7512661789532921, "grad_norm": 0.4696710407733917, "learning_rate": 3.5395724913504545e-05, "loss": 0.1633, "step": 1335 }, { "epoch": 0.7540799099606078, "grad_norm": 0.3567434847354889, "learning_rate": 3.464901633980484e-05, "loss": 0.2388, "step": 1340 }, { "epoch": 0.7568936409679234, "grad_norm": 0.8810656070709229, "learning_rate": 3.3908614310581924e-05, "loss": 0.3078, "step": 1345 }, { "epoch": 0.7597073719752392, "grad_norm": 0.9257289171218872, "learning_rate": 3.3174590276609355e-05, "loss": 0.4227, "step": 1350 }, { "epoch": 0.7625211029825548, "grad_norm": 0.3970353305339813, "learning_rate": 3.24470150731669e-05, "loss": 0.108, "step": 1355 }, { "epoch": 0.7653348339898706, "grad_norm": 0.4147047996520996, "learning_rate": 3.1725958913205166e-05, "loss": 0.3138, "step": 1360 }, { "epoch": 0.7681485649971863, "grad_norm": 0.29604053497314453, "learning_rate": 3.1011491380569425e-05, "loss": 0.1246, "step": 1365 }, { "epoch": 0.770962296004502, "grad_norm": 0.7796684503555298, "learning_rate": 3.0303681423285068e-05, "loss": 0.3338, "step": 1370 }, { "epoch": 0.7737760270118177, "grad_norm": 0.5329720973968506, "learning_rate": 2.9602597346903406e-05, "loss": 0.3101, "step": 1375 }, { "epoch": 0.7765897580191333, "grad_norm": 0.6327192187309265, "learning_rate": 2.8908306807910534e-05, "loss": 0.1832, "step": 1380 }, { "epoch": 0.7794034890264491, "grad_norm": 0.6063408851623535, "learning_rate": 2.822087680719783e-05, "loss": 0.2447, "step": 1385 }, { "epoch": 0.7822172200337648, "grad_norm": 0.3461267650127411, "learning_rate": 2.754037368359661e-05, "loss": 0.274, "step": 1390 }, { "epoch": 0.7850309510410804, "grad_norm": 0.598047137260437, "learning_rate": 2.6866863107475803e-05, "loss": 0.173, "step": 1395 }, { "epoch": 0.7878446820483962, "grad_norm": 0.7208033800125122, "learning_rate": 2.620041007440508e-05, "loss": 0.2908, "step": 1400 }, { "epoch": 0.7906584130557118, "grad_norm": 0.3856890797615051, "learning_rate": 2.5541078898882187e-05, "loss": 0.1546, "step": 1405 }, { "epoch": 0.7934721440630276, "grad_norm": 0.9550760388374329, "learning_rate": 2.4888933208126663e-05, "loss": 0.2096, "step": 1410 }, { "epoch": 0.7962858750703433, "grad_norm": 0.4413495361804962, "learning_rate": 2.4244035935939547e-05, "loss": 0.2607, "step": 1415 }, { "epoch": 0.799099606077659, "grad_norm": 1.8407310247421265, "learning_rate": 2.360644931663014e-05, "loss": 0.3178, "step": 1420 }, { "epoch": 0.8019133370849747, "grad_norm": 0.6831107139587402, "learning_rate": 2.2976234879010218e-05, "loss": 0.2287, "step": 1425 }, { "epoch": 0.8047270680922903, "grad_norm": 0.7173850536346436, "learning_rate": 2.2353453440456316e-05, "loss": 0.2534, "step": 1430 }, { "epoch": 0.8075407990996061, "grad_norm": 0.5183877944946289, "learning_rate": 2.173816510104073e-05, "loss": 0.2009, "step": 1435 }, { "epoch": 0.8103545301069218, "grad_norm": 0.50481116771698, "learning_rate": 2.113042923773164e-05, "loss": 0.088, "step": 1440 }, { "epoch": 0.8131682611142375, "grad_norm": 0.648539125919342, "learning_rate": 2.0530304498663143e-05, "loss": 0.1828, "step": 1445 }, { "epoch": 0.8159819921215532, "grad_norm": 0.6068935990333557, "learning_rate": 1.9937848797475488e-05, "loss": 0.2655, "step": 1450 }, { "epoch": 0.8187957231288688, "grad_norm": 0.36234456300735474, "learning_rate": 1.935311930772632e-05, "loss": 0.1955, "step": 1455 }, { "epoch": 0.8216094541361846, "grad_norm": 0.5231152176856995, "learning_rate": 1.877617245737321e-05, "loss": 0.2071, "step": 1460 }, { "epoch": 0.8244231851435003, "grad_norm": 0.21570482850074768, "learning_rate": 1.8207063923328237e-05, "loss": 0.1853, "step": 1465 }, { "epoch": 0.827236916150816, "grad_norm": 0.4301048815250397, "learning_rate": 1.764584862608507e-05, "loss": 0.2832, "step": 1470 }, { "epoch": 0.8300506471581317, "grad_norm": 0.44830775260925293, "learning_rate": 1.7092580724418882e-05, "loss": 0.2344, "step": 1475 }, { "epoch": 0.8328643781654473, "grad_norm": 0.42212042212486267, "learning_rate": 1.6547313610159986e-05, "loss": 0.2679, "step": 1480 }, { "epoch": 0.8356781091727631, "grad_norm": 0.7017850875854492, "learning_rate": 1.6010099903041332e-05, "loss": 0.2124, "step": 1485 }, { "epoch": 0.8384918401800788, "grad_norm": 0.8840892910957336, "learning_rate": 1.5480991445620542e-05, "loss": 0.2939, "step": 1490 }, { "epoch": 0.8413055711873945, "grad_norm": 0.8503584265708923, "learning_rate": 1.4960039298277029e-05, "loss": 0.2429, "step": 1495 }, { "epoch": 0.8441193021947102, "grad_norm": 1.0028765201568604, "learning_rate": 1.4447293734284474e-05, "loss": 0.3548, "step": 1500 }, { "epoch": 0.8469330332020258, "grad_norm": 0.5684967637062073, "learning_rate": 1.3942804234959373e-05, "loss": 0.2871, "step": 1505 }, { "epoch": 0.8497467642093416, "grad_norm": 0.7405120730400085, "learning_rate": 1.3446619484885903e-05, "loss": 0.1683, "step": 1510 }, { "epoch": 0.8525604952166573, "grad_norm": 0.6290687918663025, "learning_rate": 1.2958787367217751e-05, "loss": 0.2926, "step": 1515 }, { "epoch": 0.855374226223973, "grad_norm": 0.7487866878509521, "learning_rate": 1.2479354959057298e-05, "loss": 0.4543, "step": 1520 }, { "epoch": 0.8581879572312887, "grad_norm": 0.2865360379219055, "learning_rate": 1.2008368526912506e-05, "loss": 0.1499, "step": 1525 }, { "epoch": 0.8610016882386043, "grad_norm": 0.7912615537643433, "learning_rate": 1.1545873522232053e-05, "loss": 0.4047, "step": 1530 }, { "epoch": 0.8638154192459201, "grad_norm": 0.4929727017879486, "learning_rate": 1.1091914577019302e-05, "loss": 0.3078, "step": 1535 }, { "epoch": 0.8666291502532358, "grad_norm": 0.590064287185669, "learning_rate": 1.0646535499524902e-05, "loss": 0.2642, "step": 1540 }, { "epoch": 0.8694428812605515, "grad_norm": 0.7222818732261658, "learning_rate": 1.0209779270019525e-05, "loss": 0.241, "step": 1545 }, { "epoch": 0.8722566122678672, "grad_norm": 0.8655977249145508, "learning_rate": 9.781688036645842e-06, "loss": 0.2761, "step": 1550 }, { "epoch": 0.8750703432751828, "grad_norm": 0.4627645015716553, "learning_rate": 9.362303111351378e-06, "loss": 0.1023, "step": 1555 }, { "epoch": 0.8778840742824986, "grad_norm": 0.07925199717283249, "learning_rate": 8.95166496590153e-06, "loss": 0.1914, "step": 1560 }, { "epoch": 0.8806978052898143, "grad_norm": 0.3665456771850586, "learning_rate": 8.549813227974247e-06, "loss": 0.2617, "step": 1565 }, { "epoch": 0.88351153629713, "grad_norm": 0.8194103240966797, "learning_rate": 8.156786677335493e-06, "loss": 0.3987, "step": 1570 }, { "epoch": 0.8863252673044457, "grad_norm": 0.4855242669582367, "learning_rate": 7.772623242097277e-06, "loss": 0.2692, "step": 1575 }, { "epoch": 0.8891389983117614, "grad_norm": 0.531732976436615, "learning_rate": 7.397359995057118e-06, "loss": 0.2017, "step": 1580 }, { "epoch": 0.8919527293190771, "grad_norm": 0.23579372465610504, "learning_rate": 7.03103315012078e-06, "loss": 0.183, "step": 1585 }, { "epoch": 0.8947664603263928, "grad_norm": 0.38668301701545715, "learning_rate": 6.673678058807198e-06, "loss": 0.1825, "step": 1590 }, { "epoch": 0.8975801913337085, "grad_norm": 0.5998560190200806, "learning_rate": 6.325329206837216e-06, "loss": 0.3018, "step": 1595 }, { "epoch": 0.9003939223410242, "grad_norm": 0.96495521068573, "learning_rate": 5.986020210805488e-06, "loss": 0.2978, "step": 1600 }, { "epoch": 0.9032076533483399, "grad_norm": 0.6704295873641968, "learning_rate": 5.655783814936433e-06, "loss": 0.1745, "step": 1605 }, { "epoch": 0.9060213843556556, "grad_norm": 0.502069890499115, "learning_rate": 5.334651887924324e-06, "loss": 0.1923, "step": 1610 }, { "epoch": 0.9088351153629713, "grad_norm": 0.1441662758588791, "learning_rate": 5.0226554198578576e-06, "loss": 0.1279, "step": 1615 }, { "epoch": 0.911648846370287, "grad_norm": 0.6545499563217163, "learning_rate": 4.719824519229554e-06, "loss": 0.2999, "step": 1620 }, { "epoch": 0.9144625773776027, "grad_norm": 0.4436165690422058, "learning_rate": 4.426188410030196e-06, "loss": 0.2404, "step": 1625 }, { "epoch": 0.9172763083849184, "grad_norm": 0.2450067549943924, "learning_rate": 4.1417754289286184e-06, "loss": 0.3257, "step": 1630 }, { "epoch": 0.9200900393922341, "grad_norm": 0.5148952603340149, "learning_rate": 3.866613022537169e-06, "loss": 0.1532, "step": 1635 }, { "epoch": 0.9229037703995498, "grad_norm": 0.39606383442878723, "learning_rate": 3.600727744763044e-06, "loss": 0.3313, "step": 1640 }, { "epoch": 0.9257175014068655, "grad_norm": 0.5422732830047607, "learning_rate": 3.344145254245723e-06, "loss": 0.1593, "step": 1645 }, { "epoch": 0.9285312324141812, "grad_norm": 0.565556526184082, "learning_rate": 3.0968903118808622e-06, "loss": 0.3292, "step": 1650 }, { "epoch": 0.9313449634214969, "grad_norm": 0.9481168389320374, "learning_rate": 2.85898677843085e-06, "loss": 0.1793, "step": 1655 }, { "epoch": 0.9341586944288126, "grad_norm": 0.5848947167396545, "learning_rate": 2.6304576122221035e-06, "loss": 0.2746, "step": 1660 }, { "epoch": 0.9369724254361284, "grad_norm": 0.840390145778656, "learning_rate": 2.411324866929543e-06, "loss": 0.3995, "step": 1665 }, { "epoch": 0.939786156443444, "grad_norm": 0.5747278928756714, "learning_rate": 2.201609689448425e-06, "loss": 0.2925, "step": 1670 }, { "epoch": 0.9425998874507597, "grad_norm": 0.518104612827301, "learning_rate": 2.0013323178535102e-06, "loss": 0.2362, "step": 1675 }, { "epoch": 0.9454136184580754, "grad_norm": 0.5568994879722595, "learning_rate": 1.810512079446125e-06, "loss": 0.2395, "step": 1680 }, { "epoch": 0.9482273494653911, "grad_norm": 0.2408752143383026, "learning_rate": 1.6291673888889302e-06, "loss": 0.1682, "step": 1685 }, { "epoch": 0.9510410804727069, "grad_norm": 0.3361740708351135, "learning_rate": 1.4573157464289554e-06, "loss": 0.1792, "step": 1690 }, { "epoch": 0.9538548114800225, "grad_norm": 0.46246424317359924, "learning_rate": 1.2949737362087156e-06, "loss": 0.1895, "step": 1695 }, { "epoch": 0.9566685424873382, "grad_norm": 0.6043664813041687, "learning_rate": 1.1421570246658242e-06, "loss": 0.2753, "step": 1700 }, { "epoch": 0.9594822734946539, "grad_norm": 0.5759782195091248, "learning_rate": 9.988803590211037e-07, "loss": 0.2, "step": 1705 }, { "epoch": 0.9622960045019696, "grad_norm": 0.7253143787384033, "learning_rate": 8.6515756585549e-07, "loss": 0.3226, "step": 1710 }, { "epoch": 0.9651097355092854, "grad_norm": 0.4216267466545105, "learning_rate": 7.410015497756994e-07, "loss": 0.2211, "step": 1715 }, { "epoch": 0.967923466516601, "grad_norm": 0.6309015154838562, "learning_rate": 6.264242921689257e-07, "loss": 0.2258, "step": 1720 }, { "epoch": 0.9707371975239167, "grad_norm": 0.4388352930545807, "learning_rate": 5.214368500465305e-07, "loss": 0.2074, "step": 1725 }, { "epoch": 0.9735509285312324, "grad_norm": 0.36881500482559204, "learning_rate": 4.260493549771316e-07, "loss": 0.3136, "step": 1730 }, { "epoch": 0.9763646595385481, "grad_norm": 0.612010657787323, "learning_rate": 3.4027101210876155e-07, "loss": 0.29, "step": 1735 }, { "epoch": 0.9791783905458639, "grad_norm": 0.5004868507385254, "learning_rate": 2.6411009928064556e-07, "loss": 0.2251, "step": 1740 }, { "epoch": 0.9819921215531795, "grad_norm": 0.3668440878391266, "learning_rate": 1.9757396622428482e-07, "loss": 0.1813, "step": 1745 }, { "epoch": 0.9848058525604952, "grad_norm": 0.45063304901123047, "learning_rate": 1.406690338542349e-07, "loss": 0.1792, "step": 1750 }, { "epoch": 0.9876195835678109, "grad_norm": 0.5779865384101868, "learning_rate": 9.340079364847931e-08, "loss": 0.2583, "step": 1755 }, { "epoch": 0.9904333145751266, "grad_norm": 0.5747032165527344, "learning_rate": 5.5773807118442154e-08, "loss": 0.2567, "step": 1760 }, { "epoch": 0.9932470455824424, "grad_norm": 0.49229690432548523, "learning_rate": 2.7791705368818143e-08, "loss": 0.1704, "step": 1765 }, { "epoch": 0.996060776589758, "grad_norm": 0.24515922367572784, "learning_rate": 9.457188747186151e-09, "loss": 0.1709, "step": 1770 }, { "epoch": 0.9988745075970737, "grad_norm": 0.45681869983673096, "learning_rate": 7.720265833510709e-10, "loss": 0.2311, "step": 1775 }, { "epoch": 1.0, "step": 1777, "total_flos": 6.548002937199657e+17, "train_loss": 0.29464965595847514, "train_runtime": 5546.9812, "train_samples_per_second": 2.563, "train_steps_per_second": 0.32 } ], "logging_steps": 5, "max_steps": 1777, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.548002937199657e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }