diff --git "a/codet5_ia3_official_0.0001/checkpoint-14718/trainer_state.json" "b/codet5_ia3_official_0.0001/checkpoint-14718/trainer_state.json" new file mode 100644--- /dev/null +++ "b/codet5_ia3_official_0.0001/checkpoint-14718/trainer_state.json" @@ -0,0 +1,20656 @@ +{ + "best_metric": 0.0021924919669198163, + "best_model_checkpoint": "./results-cc/code-t5/codet5_ia3_official_0.0001/checkpoint-14718", + "epoch": 1.0, + "eval_steps": 500, + "global_step": 14718, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003397200706617747, + "grad_norm": 2.2195968627929688, + "learning_rate": 9.999575349911673e-05, + "loss": 10.0277, + "step": 5 + }, + { + "epoch": 0.0006794401413235494, + "grad_norm": 1.40236496925354, + "learning_rate": 9.999320559858677e-05, + "loss": 10.112, + "step": 10 + }, + { + "epoch": 0.0010191602119853241, + "grad_norm": 2.544349431991577, + "learning_rate": 9.99889590977035e-05, + "loss": 10.7856, + "step": 15 + }, + { + "epoch": 0.001358880282647099, + "grad_norm": 1.8622939586639404, + "learning_rate": 9.998471259682023e-05, + "loss": 9.73, + "step": 20 + }, + { + "epoch": 0.0016986003533088735, + "grad_norm": 2.0292515754699707, + "learning_rate": 9.998046609593695e-05, + "loss": 10.2627, + "step": 25 + }, + { + "epoch": 0.0020383204239706482, + "grad_norm": 1.862714171409607, + "learning_rate": 9.997621959505368e-05, + "loss": 9.8577, + "step": 30 + }, + { + "epoch": 0.002378040494632423, + "grad_norm": 3.0175716876983643, + "learning_rate": 9.997197309417041e-05, + "loss": 9.6583, + "step": 35 + }, + { + "epoch": 0.002717760565294198, + "grad_norm": 2.3832008838653564, + "learning_rate": 9.996772659328714e-05, + "loss": 9.8661, + "step": 40 + }, + { + "epoch": 0.0030574806359559724, + "grad_norm": 1.7646359205245972, + "learning_rate": 9.996348009240387e-05, + "loss": 10.5005, + "step": 45 + }, + { + "epoch": 0.003397200706617747, + "grad_norm": 1.7444077730178833, + "learning_rate": 9.996008289169725e-05, + "loss": 9.5758, + "step": 50 + }, + { + "epoch": 0.0037369207772795215, + "grad_norm": 1.9149781465530396, + "learning_rate": 9.995583639081398e-05, + "loss": 9.735, + "step": 55 + }, + { + "epoch": 0.0040766408479412965, + "grad_norm": 1.6488384008407593, + "learning_rate": 9.995158988993069e-05, + "loss": 9.8686, + "step": 60 + }, + { + "epoch": 0.0044163609186030715, + "grad_norm": 2.184131622314453, + "learning_rate": 9.994734338904743e-05, + "loss": 10.1822, + "step": 65 + }, + { + "epoch": 0.004756080989264846, + "grad_norm": 2.6063928604125977, + "learning_rate": 9.994309688816416e-05, + "loss": 9.9386, + "step": 70 + }, + { + "epoch": 0.005095801059926621, + "grad_norm": 1.925402045249939, + "learning_rate": 9.993885038728087e-05, + "loss": 9.3722, + "step": 75 + }, + { + "epoch": 0.005435521130588396, + "grad_norm": 3.882549524307251, + "learning_rate": 9.993545318657427e-05, + "loss": 9.4705, + "step": 80 + }, + { + "epoch": 0.00577524120125017, + "grad_norm": 2.1654701232910156, + "learning_rate": 9.9931206685691e-05, + "loss": 9.5574, + "step": 85 + }, + { + "epoch": 0.006114961271911945, + "grad_norm": 2.0329928398132324, + "learning_rate": 9.992696018480773e-05, + "loss": 9.7831, + "step": 90 + }, + { + "epoch": 0.006454681342573719, + "grad_norm": 2.408450126647949, + "learning_rate": 9.992271368392445e-05, + "loss": 10.2149, + "step": 95 + }, + { + "epoch": 0.006794401413235494, + "grad_norm": 2.039151430130005, + "learning_rate": 9.991846718304117e-05, + "loss": 10.2672, + "step": 100 + }, + { + "epoch": 0.007134121483897269, + "grad_norm": 2.3165321350097656, + "learning_rate": 9.991422068215791e-05, + "loss": 9.4804, + "step": 105 + }, + { + "epoch": 0.007473841554559043, + "grad_norm": 2.117964029312134, + "learning_rate": 9.990997418127464e-05, + "loss": 10.1318, + "step": 110 + }, + { + "epoch": 0.007813561625220818, + "grad_norm": 2.1264026165008545, + "learning_rate": 9.990572768039135e-05, + "loss": 9.6686, + "step": 115 + }, + { + "epoch": 0.008153281695882593, + "grad_norm": 1.2933008670806885, + "learning_rate": 9.99014811795081e-05, + "loss": 9.4348, + "step": 120 + }, + { + "epoch": 0.008493001766544368, + "grad_norm": 2.1963112354278564, + "learning_rate": 9.989723467862482e-05, + "loss": 9.7095, + "step": 125 + }, + { + "epoch": 0.008832721837206143, + "grad_norm": 1.4545302391052246, + "learning_rate": 9.989298817774154e-05, + "loss": 9.5571, + "step": 130 + }, + { + "epoch": 0.009172441907867916, + "grad_norm": 1.4792237281799316, + "learning_rate": 9.988874167685828e-05, + "loss": 8.9859, + "step": 135 + }, + { + "epoch": 0.009512161978529691, + "grad_norm": 1.2513773441314697, + "learning_rate": 9.9884495175975e-05, + "loss": 9.1648, + "step": 140 + }, + { + "epoch": 0.009851882049191466, + "grad_norm": 2.559937000274658, + "learning_rate": 9.988024867509172e-05, + "loss": 9.7622, + "step": 145 + }, + { + "epoch": 0.010191602119853241, + "grad_norm": 1.9447046518325806, + "learning_rate": 9.987600217420846e-05, + "loss": 9.8491, + "step": 150 + }, + { + "epoch": 0.010531322190515016, + "grad_norm": 1.3316339254379272, + "learning_rate": 9.987175567332519e-05, + "loss": 9.0772, + "step": 155 + }, + { + "epoch": 0.010871042261176791, + "grad_norm": 1.4368464946746826, + "learning_rate": 9.98675091724419e-05, + "loss": 8.9018, + "step": 160 + }, + { + "epoch": 0.011210762331838564, + "grad_norm": 1.5808727741241455, + "learning_rate": 9.986326267155865e-05, + "loss": 8.9733, + "step": 165 + }, + { + "epoch": 0.01155048240250034, + "grad_norm": 1.7572826147079468, + "learning_rate": 9.985901617067537e-05, + "loss": 8.7883, + "step": 170 + }, + { + "epoch": 0.011890202473162114, + "grad_norm": 1.6672322750091553, + "learning_rate": 9.985476966979209e-05, + "loss": 8.8474, + "step": 175 + }, + { + "epoch": 0.01222992254382389, + "grad_norm": 1.3700741529464722, + "learning_rate": 9.985052316890883e-05, + "loss": 9.4713, + "step": 180 + }, + { + "epoch": 0.012569642614485664, + "grad_norm": 1.9305622577667236, + "learning_rate": 9.984627666802555e-05, + "loss": 9.2708, + "step": 185 + }, + { + "epoch": 0.012909362685147438, + "grad_norm": 1.964107871055603, + "learning_rate": 9.984203016714227e-05, + "loss": 8.8052, + "step": 190 + }, + { + "epoch": 0.013249082755809213, + "grad_norm": 1.9907019138336182, + "learning_rate": 9.983778366625901e-05, + "loss": 9.6229, + "step": 195 + }, + { + "epoch": 0.013588802826470988, + "grad_norm": 1.7012939453125, + "learning_rate": 9.983353716537573e-05, + "loss": 8.9995, + "step": 200 + }, + { + "epoch": 0.013928522897132763, + "grad_norm": 1.7597671747207642, + "learning_rate": 9.982929066449246e-05, + "loss": 8.8808, + "step": 205 + }, + { + "epoch": 0.014268242967794538, + "grad_norm": 1.6379801034927368, + "learning_rate": 9.98250441636092e-05, + "loss": 8.8094, + "step": 210 + }, + { + "epoch": 0.014607963038456313, + "grad_norm": 1.9670891761779785, + "learning_rate": 9.982079766272591e-05, + "loss": 9.522, + "step": 215 + }, + { + "epoch": 0.014947683109118086, + "grad_norm": 1.9001445770263672, + "learning_rate": 9.981655116184264e-05, + "loss": 9.3781, + "step": 220 + }, + { + "epoch": 0.015287403179779861, + "grad_norm": 1.9136974811553955, + "learning_rate": 9.981230466095938e-05, + "loss": 8.8219, + "step": 225 + }, + { + "epoch": 0.015627123250441636, + "grad_norm": 2.4746735095977783, + "learning_rate": 9.98080581600761e-05, + "loss": 8.9191, + "step": 230 + }, + { + "epoch": 0.01596684332110341, + "grad_norm": 2.372750759124756, + "learning_rate": 9.980381165919283e-05, + "loss": 9.1705, + "step": 235 + }, + { + "epoch": 0.016306563391765186, + "grad_norm": 1.418637752532959, + "learning_rate": 9.980041445848621e-05, + "loss": 8.6098, + "step": 240 + }, + { + "epoch": 0.01664628346242696, + "grad_norm": 2.103688955307007, + "learning_rate": 9.979616795760294e-05, + "loss": 9.1948, + "step": 245 + }, + { + "epoch": 0.016986003533088736, + "grad_norm": 2.14066481590271, + "learning_rate": 9.979192145671968e-05, + "loss": 9.0493, + "step": 250 + }, + { + "epoch": 0.01732572360375051, + "grad_norm": 1.4272230863571167, + "learning_rate": 9.978767495583639e-05, + "loss": 8.7393, + "step": 255 + }, + { + "epoch": 0.017665443674412286, + "grad_norm": 1.5108071565628052, + "learning_rate": 9.978342845495312e-05, + "loss": 9.3101, + "step": 260 + }, + { + "epoch": 0.01800516374507406, + "grad_norm": 2.017267942428589, + "learning_rate": 9.977918195406986e-05, + "loss": 9.0584, + "step": 265 + }, + { + "epoch": 0.018344883815735832, + "grad_norm": 1.4429893493652344, + "learning_rate": 9.977493545318658e-05, + "loss": 8.5374, + "step": 270 + }, + { + "epoch": 0.01868460388639761, + "grad_norm": 1.7082629203796387, + "learning_rate": 9.97706889523033e-05, + "loss": 8.2905, + "step": 275 + }, + { + "epoch": 0.019024323957059382, + "grad_norm": 1.9540777206420898, + "learning_rate": 9.976644245142005e-05, + "loss": 8.9633, + "step": 280 + }, + { + "epoch": 0.01936404402772116, + "grad_norm": 1.4489926099777222, + "learning_rate": 9.976219595053676e-05, + "loss": 8.7871, + "step": 285 + }, + { + "epoch": 0.019703764098382932, + "grad_norm": 1.5926896333694458, + "learning_rate": 9.975794944965349e-05, + "loss": 8.5374, + "step": 290 + }, + { + "epoch": 0.020043484169044706, + "grad_norm": 1.8608131408691406, + "learning_rate": 9.975370294877022e-05, + "loss": 8.4816, + "step": 295 + }, + { + "epoch": 0.020383204239706482, + "grad_norm": 1.6402130126953125, + "learning_rate": 9.974945644788694e-05, + "loss": 8.3059, + "step": 300 + }, + { + "epoch": 0.020722924310368256, + "grad_norm": 1.0616756677627563, + "learning_rate": 9.974520994700367e-05, + "loss": 8.5302, + "step": 305 + }, + { + "epoch": 0.021062644381030032, + "grad_norm": 2.0486464500427246, + "learning_rate": 9.97409634461204e-05, + "loss": 8.6573, + "step": 310 + }, + { + "epoch": 0.021402364451691806, + "grad_norm": 2.764594316482544, + "learning_rate": 9.973671694523713e-05, + "loss": 9.0297, + "step": 315 + }, + { + "epoch": 0.021742084522353582, + "grad_norm": 1.457748293876648, + "learning_rate": 9.973247044435386e-05, + "loss": 8.6019, + "step": 320 + }, + { + "epoch": 0.022081804593015356, + "grad_norm": 1.5769116878509521, + "learning_rate": 9.972822394347058e-05, + "loss": 8.2577, + "step": 325 + }, + { + "epoch": 0.02242152466367713, + "grad_norm": 1.7383826971054077, + "learning_rate": 9.972397744258731e-05, + "loss": 8.1914, + "step": 330 + }, + { + "epoch": 0.022761244734338906, + "grad_norm": 1.6818331480026245, + "learning_rate": 9.971973094170404e-05, + "loss": 8.8289, + "step": 335 + }, + { + "epoch": 0.02310096480500068, + "grad_norm": 1.429895281791687, + "learning_rate": 9.971548444082077e-05, + "loss": 8.3524, + "step": 340 + }, + { + "epoch": 0.023440684875662456, + "grad_norm": 1.2669081687927246, + "learning_rate": 9.97112379399375e-05, + "loss": 8.289, + "step": 345 + }, + { + "epoch": 0.02378040494632423, + "grad_norm": 1.399524211883545, + "learning_rate": 9.970699143905422e-05, + "loss": 7.8466, + "step": 350 + }, + { + "epoch": 0.024120125016986002, + "grad_norm": 1.6960299015045166, + "learning_rate": 9.970274493817095e-05, + "loss": 8.6849, + "step": 355 + }, + { + "epoch": 0.02445984508764778, + "grad_norm": 1.7302825450897217, + "learning_rate": 9.969849843728768e-05, + "loss": 8.6366, + "step": 360 + }, + { + "epoch": 0.024799565158309552, + "grad_norm": 2.6233043670654297, + "learning_rate": 9.969425193640441e-05, + "loss": 8.3961, + "step": 365 + }, + { + "epoch": 0.02513928522897133, + "grad_norm": 1.480035424232483, + "learning_rate": 9.969000543552114e-05, + "loss": 8.6224, + "step": 370 + }, + { + "epoch": 0.025479005299633102, + "grad_norm": 1.156540870666504, + "learning_rate": 9.968575893463786e-05, + "loss": 7.9226, + "step": 375 + }, + { + "epoch": 0.025818725370294875, + "grad_norm": 1.7962318658828735, + "learning_rate": 9.968151243375459e-05, + "loss": 8.5519, + "step": 380 + }, + { + "epoch": 0.026158445440956652, + "grad_norm": 1.8737194538116455, + "learning_rate": 9.967726593287132e-05, + "loss": 8.4364, + "step": 385 + }, + { + "epoch": 0.026498165511618425, + "grad_norm": 1.6001181602478027, + "learning_rate": 9.967301943198805e-05, + "loss": 7.8641, + "step": 390 + }, + { + "epoch": 0.026837885582280202, + "grad_norm": 1.6181342601776123, + "learning_rate": 9.966877293110478e-05, + "loss": 7.3995, + "step": 395 + }, + { + "epoch": 0.027177605652941975, + "grad_norm": 1.5771849155426025, + "learning_rate": 9.96645264302215e-05, + "loss": 7.8526, + "step": 400 + }, + { + "epoch": 0.027517325723603752, + "grad_norm": 1.1884416341781616, + "learning_rate": 9.966027992933823e-05, + "loss": 8.117, + "step": 405 + }, + { + "epoch": 0.027857045794265525, + "grad_norm": 2.015026092529297, + "learning_rate": 9.965603342845496e-05, + "loss": 8.5386, + "step": 410 + }, + { + "epoch": 0.0281967658649273, + "grad_norm": 1.2226543426513672, + "learning_rate": 9.965178692757169e-05, + "loss": 8.1885, + "step": 415 + }, + { + "epoch": 0.028536485935589075, + "grad_norm": 0.8924500942230225, + "learning_rate": 9.964754042668842e-05, + "loss": 7.9768, + "step": 420 + }, + { + "epoch": 0.02887620600625085, + "grad_norm": 1.3788869380950928, + "learning_rate": 9.964329392580513e-05, + "loss": 8.2231, + "step": 425 + }, + { + "epoch": 0.029215926076912625, + "grad_norm": 1.4598475694656372, + "learning_rate": 9.963904742492187e-05, + "loss": 7.9246, + "step": 430 + }, + { + "epoch": 0.0295556461475744, + "grad_norm": 1.8520469665527344, + "learning_rate": 9.96348009240386e-05, + "loss": 8.0527, + "step": 435 + }, + { + "epoch": 0.029895366218236172, + "grad_norm": 1.4544007778167725, + "learning_rate": 9.963055442315531e-05, + "loss": 7.9365, + "step": 440 + }, + { + "epoch": 0.03023508628889795, + "grad_norm": 1.632197380065918, + "learning_rate": 9.962630792227206e-05, + "loss": 7.6173, + "step": 445 + }, + { + "epoch": 0.030574806359559722, + "grad_norm": 1.7875256538391113, + "learning_rate": 9.962206142138878e-05, + "loss": 7.8095, + "step": 450 + }, + { + "epoch": 0.0309145264302215, + "grad_norm": 1.496638298034668, + "learning_rate": 9.961866422068217e-05, + "loss": 8.3371, + "step": 455 + }, + { + "epoch": 0.03125424650088327, + "grad_norm": 1.915732741355896, + "learning_rate": 9.96144177197989e-05, + "loss": 8.5992, + "step": 460 + }, + { + "epoch": 0.03159396657154505, + "grad_norm": 1.1497036218643188, + "learning_rate": 9.961017121891561e-05, + "loss": 7.7974, + "step": 465 + }, + { + "epoch": 0.03193368664220682, + "grad_norm": 1.5241578817367554, + "learning_rate": 9.960592471803235e-05, + "loss": 7.9306, + "step": 470 + }, + { + "epoch": 0.032273406712868595, + "grad_norm": 1.8431288003921509, + "learning_rate": 9.960167821714908e-05, + "loss": 8.0421, + "step": 475 + }, + { + "epoch": 0.03261312678353037, + "grad_norm": 1.520870566368103, + "learning_rate": 9.959743171626579e-05, + "loss": 7.443, + "step": 480 + }, + { + "epoch": 0.03295284685419215, + "grad_norm": 1.9101468324661255, + "learning_rate": 9.959318521538253e-05, + "loss": 7.5383, + "step": 485 + }, + { + "epoch": 0.03329256692485392, + "grad_norm": 1.2860801219940186, + "learning_rate": 9.958893871449926e-05, + "loss": 7.4753, + "step": 490 + }, + { + "epoch": 0.033632286995515695, + "grad_norm": 1.3139928579330444, + "learning_rate": 9.958469221361598e-05, + "loss": 8.0711, + "step": 495 + }, + { + "epoch": 0.03397200706617747, + "grad_norm": 2.382516622543335, + "learning_rate": 9.958044571273272e-05, + "loss": 7.9498, + "step": 500 + }, + { + "epoch": 0.03431172713683924, + "grad_norm": 1.4085739850997925, + "learning_rate": 9.957619921184945e-05, + "loss": 7.6138, + "step": 505 + }, + { + "epoch": 0.03465144720750102, + "grad_norm": 1.34367835521698, + "learning_rate": 9.957195271096616e-05, + "loss": 7.7689, + "step": 510 + }, + { + "epoch": 0.034991167278162795, + "grad_norm": 1.3598331212997437, + "learning_rate": 9.95677062100829e-05, + "loss": 7.4086, + "step": 515 + }, + { + "epoch": 0.03533088734882457, + "grad_norm": 1.3421567678451538, + "learning_rate": 9.956345970919963e-05, + "loss": 7.8855, + "step": 520 + }, + { + "epoch": 0.03567060741948634, + "grad_norm": 1.5375795364379883, + "learning_rate": 9.955921320831634e-05, + "loss": 7.642, + "step": 525 + }, + { + "epoch": 0.03601032749014812, + "grad_norm": 1.6586527824401855, + "learning_rate": 9.955496670743309e-05, + "loss": 7.1819, + "step": 530 + }, + { + "epoch": 0.036350047560809895, + "grad_norm": 1.179348349571228, + "learning_rate": 9.955072020654981e-05, + "loss": 7.4156, + "step": 535 + }, + { + "epoch": 0.036689767631471665, + "grad_norm": 2.1848621368408203, + "learning_rate": 9.954647370566653e-05, + "loss": 7.4587, + "step": 540 + }, + { + "epoch": 0.03702948770213344, + "grad_norm": 1.3930761814117432, + "learning_rate": 9.954222720478327e-05, + "loss": 7.3459, + "step": 545 + }, + { + "epoch": 0.03736920777279522, + "grad_norm": 1.6147781610488892, + "learning_rate": 9.953798070389998e-05, + "loss": 7.913, + "step": 550 + }, + { + "epoch": 0.03770892784345699, + "grad_norm": 1.2318421602249146, + "learning_rate": 9.953373420301671e-05, + "loss": 7.4613, + "step": 555 + }, + { + "epoch": 0.038048647914118765, + "grad_norm": 14.445450782775879, + "learning_rate": 9.952948770213345e-05, + "loss": 7.4797, + "step": 560 + }, + { + "epoch": 0.03838836798478054, + "grad_norm": 1.447785496711731, + "learning_rate": 9.952524120125017e-05, + "loss": 7.663, + "step": 565 + }, + { + "epoch": 0.03872808805544232, + "grad_norm": 1.2133177518844604, + "learning_rate": 9.95209947003669e-05, + "loss": 7.5911, + "step": 570 + }, + { + "epoch": 0.03906780812610409, + "grad_norm": 1.6101973056793213, + "learning_rate": 9.951674819948364e-05, + "loss": 7.7617, + "step": 575 + }, + { + "epoch": 0.039407528196765865, + "grad_norm": 1.5707918405532837, + "learning_rate": 9.951250169860035e-05, + "loss": 7.2746, + "step": 580 + }, + { + "epoch": 0.03974724826742764, + "grad_norm": 1.4747017621994019, + "learning_rate": 9.950825519771708e-05, + "loss": 7.559, + "step": 585 + }, + { + "epoch": 0.04008696833808941, + "grad_norm": 1.2751151323318481, + "learning_rate": 9.950400869683382e-05, + "loss": 7.0742, + "step": 590 + }, + { + "epoch": 0.04042668840875119, + "grad_norm": 1.4686486721038818, + "learning_rate": 9.949976219595054e-05, + "loss": 7.179, + "step": 595 + }, + { + "epoch": 0.040766408479412965, + "grad_norm": 1.333364725112915, + "learning_rate": 9.949551569506727e-05, + "loss": 7.3349, + "step": 600 + }, + { + "epoch": 0.04110612855007474, + "grad_norm": 1.2560040950775146, + "learning_rate": 9.9491269194184e-05, + "loss": 7.2152, + "step": 605 + }, + { + "epoch": 0.04144584862073651, + "grad_norm": 5.9243011474609375, + "learning_rate": 9.948702269330072e-05, + "loss": 7.253, + "step": 610 + }, + { + "epoch": 0.04178556869139829, + "grad_norm": 1.3705462217330933, + "learning_rate": 9.948277619241745e-05, + "loss": 7.2954, + "step": 615 + }, + { + "epoch": 0.042125288762060065, + "grad_norm": 1.3280870914459229, + "learning_rate": 9.947852969153418e-05, + "loss": 7.0023, + "step": 620 + }, + { + "epoch": 0.042465008832721834, + "grad_norm": 1.5480890274047852, + "learning_rate": 9.94742831906509e-05, + "loss": 6.6209, + "step": 625 + }, + { + "epoch": 0.04280472890338361, + "grad_norm": 1.4617500305175781, + "learning_rate": 9.947003668976763e-05, + "loss": 6.6055, + "step": 630 + }, + { + "epoch": 0.04314444897404539, + "grad_norm": 1.5756878852844238, + "learning_rate": 9.946579018888436e-05, + "loss": 7.0135, + "step": 635 + }, + { + "epoch": 0.043484169044707165, + "grad_norm": 1.4289640188217163, + "learning_rate": 9.946154368800109e-05, + "loss": 7.1441, + "step": 640 + }, + { + "epoch": 0.043823889115368934, + "grad_norm": 1.3657900094985962, + "learning_rate": 9.945729718711782e-05, + "loss": 7.5154, + "step": 645 + }, + { + "epoch": 0.04416360918603071, + "grad_norm": 1.971498966217041, + "learning_rate": 9.94538999864112e-05, + "loss": 7.2665, + "step": 650 + }, + { + "epoch": 0.04450332925669249, + "grad_norm": 1.4446492195129395, + "learning_rate": 9.944965348552793e-05, + "loss": 7.0674, + "step": 655 + }, + { + "epoch": 0.04484304932735426, + "grad_norm": 1.0143150091171265, + "learning_rate": 9.944540698464467e-05, + "loss": 6.7426, + "step": 660 + }, + { + "epoch": 0.045182769398016034, + "grad_norm": 1.3732986450195312, + "learning_rate": 9.944116048376138e-05, + "loss": 7.0236, + "step": 665 + }, + { + "epoch": 0.04552248946867781, + "grad_norm": 1.5511842966079712, + "learning_rate": 9.943691398287811e-05, + "loss": 7.2107, + "step": 670 + }, + { + "epoch": 0.04586220953933958, + "grad_norm": 1.4255778789520264, + "learning_rate": 9.943266748199484e-05, + "loss": 6.817, + "step": 675 + }, + { + "epoch": 0.04620192961000136, + "grad_norm": 1.0669182538986206, + "learning_rate": 9.942842098111157e-05, + "loss": 6.9347, + "step": 680 + }, + { + "epoch": 0.046541649680663134, + "grad_norm": 1.512604832649231, + "learning_rate": 9.94241744802283e-05, + "loss": 7.0817, + "step": 685 + }, + { + "epoch": 0.04688136975132491, + "grad_norm": 1.3859061002731323, + "learning_rate": 9.941992797934502e-05, + "loss": 7.1805, + "step": 690 + }, + { + "epoch": 0.04722108982198668, + "grad_norm": 3.252913236618042, + "learning_rate": 9.941568147846175e-05, + "loss": 6.7985, + "step": 695 + }, + { + "epoch": 0.04756080989264846, + "grad_norm": 1.4156177043914795, + "learning_rate": 9.941143497757848e-05, + "loss": 6.9955, + "step": 700 + }, + { + "epoch": 0.047900529963310234, + "grad_norm": 1.5510213375091553, + "learning_rate": 9.940718847669521e-05, + "loss": 7.0235, + "step": 705 + }, + { + "epoch": 0.048240250033972004, + "grad_norm": 1.3725285530090332, + "learning_rate": 9.940294197581194e-05, + "loss": 6.9692, + "step": 710 + }, + { + "epoch": 0.04857997010463378, + "grad_norm": 1.4986199140548706, + "learning_rate": 9.939869547492866e-05, + "loss": 6.6778, + "step": 715 + }, + { + "epoch": 0.04891969017529556, + "grad_norm": 1.2320705652236938, + "learning_rate": 9.939444897404539e-05, + "loss": 6.8953, + "step": 720 + }, + { + "epoch": 0.049259410245957334, + "grad_norm": 0.9118322134017944, + "learning_rate": 9.939020247316212e-05, + "loss": 6.7496, + "step": 725 + }, + { + "epoch": 0.049599130316619104, + "grad_norm": 3.4886631965637207, + "learning_rate": 9.938595597227885e-05, + "loss": 7.0242, + "step": 730 + }, + { + "epoch": 0.04993885038728088, + "grad_norm": 0.9548838138580322, + "learning_rate": 9.938170947139558e-05, + "loss": 6.6306, + "step": 735 + }, + { + "epoch": 0.05027857045794266, + "grad_norm": 0.8389047980308533, + "learning_rate": 9.93774629705123e-05, + "loss": 7.134, + "step": 740 + }, + { + "epoch": 0.05061829052860443, + "grad_norm": 5.4491801261901855, + "learning_rate": 9.937321646962903e-05, + "loss": 6.698, + "step": 745 + }, + { + "epoch": 0.050958010599266204, + "grad_norm": 1.3063551187515259, + "learning_rate": 9.936896996874576e-05, + "loss": 7.0113, + "step": 750 + }, + { + "epoch": 0.05129773066992798, + "grad_norm": 1.470941424369812, + "learning_rate": 9.936472346786249e-05, + "loss": 6.606, + "step": 755 + }, + { + "epoch": 0.05163745074058975, + "grad_norm": 1.9392439126968384, + "learning_rate": 9.936047696697922e-05, + "loss": 7.07, + "step": 760 + }, + { + "epoch": 0.05197717081125153, + "grad_norm": 0.9688730239868164, + "learning_rate": 9.935623046609594e-05, + "loss": 6.5451, + "step": 765 + }, + { + "epoch": 0.052316890881913304, + "grad_norm": 1.4289032220840454, + "learning_rate": 9.935198396521267e-05, + "loss": 6.8784, + "step": 770 + }, + { + "epoch": 0.05265661095257508, + "grad_norm": 1.4620697498321533, + "learning_rate": 9.93477374643294e-05, + "loss": 6.5151, + "step": 775 + }, + { + "epoch": 0.05299633102323685, + "grad_norm": 2.3521432876586914, + "learning_rate": 9.934349096344613e-05, + "loss": 6.5384, + "step": 780 + }, + { + "epoch": 0.05333605109389863, + "grad_norm": 3.160248041152954, + "learning_rate": 9.933924446256286e-05, + "loss": 6.7476, + "step": 785 + }, + { + "epoch": 0.053675771164560404, + "grad_norm": 1.3147598505020142, + "learning_rate": 9.933499796167957e-05, + "loss": 6.8376, + "step": 790 + }, + { + "epoch": 0.054015491235222174, + "grad_norm": 1.6566650867462158, + "learning_rate": 9.933075146079631e-05, + "loss": 6.2506, + "step": 795 + }, + { + "epoch": 0.05435521130588395, + "grad_norm": 0.9440861344337463, + "learning_rate": 9.932650495991304e-05, + "loss": 6.9212, + "step": 800 + }, + { + "epoch": 0.05469493137654573, + "grad_norm": 1.1842477321624756, + "learning_rate": 9.932225845902975e-05, + "loss": 6.6892, + "step": 805 + }, + { + "epoch": 0.055034651447207504, + "grad_norm": 1.1736949682235718, + "learning_rate": 9.93180119581465e-05, + "loss": 6.9224, + "step": 810 + }, + { + "epoch": 0.055374371517869274, + "grad_norm": 0.7181898951530457, + "learning_rate": 9.931376545726322e-05, + "loss": 6.5164, + "step": 815 + }, + { + "epoch": 0.05571409158853105, + "grad_norm": 0.9374647736549377, + "learning_rate": 9.930951895637994e-05, + "loss": 6.5026, + "step": 820 + }, + { + "epoch": 0.05605381165919283, + "grad_norm": 1.2754137516021729, + "learning_rate": 9.930527245549668e-05, + "loss": 6.4676, + "step": 825 + }, + { + "epoch": 0.0563935317298546, + "grad_norm": 1.0159765481948853, + "learning_rate": 9.930102595461341e-05, + "loss": 6.3455, + "step": 830 + }, + { + "epoch": 0.056733251800516374, + "grad_norm": 1.0118136405944824, + "learning_rate": 9.929677945373012e-05, + "loss": 6.4984, + "step": 835 + }, + { + "epoch": 0.05707297187117815, + "grad_norm": 0.9740552306175232, + "learning_rate": 9.929253295284686e-05, + "loss": 6.665, + "step": 840 + }, + { + "epoch": 0.05741269194183993, + "grad_norm": 2.6464507579803467, + "learning_rate": 9.928828645196359e-05, + "loss": 6.5277, + "step": 845 + }, + { + "epoch": 0.0577524120125017, + "grad_norm": 1.1687380075454712, + "learning_rate": 9.92840399510803e-05, + "loss": 6.5141, + "step": 850 + }, + { + "epoch": 0.058092132083163474, + "grad_norm": 1.0684833526611328, + "learning_rate": 9.927979345019705e-05, + "loss": 6.2975, + "step": 855 + }, + { + "epoch": 0.05843185215382525, + "grad_norm": 3.8520267009735107, + "learning_rate": 9.927554694931378e-05, + "loss": 5.9187, + "step": 860 + }, + { + "epoch": 0.05877157222448702, + "grad_norm": 1.048731803894043, + "learning_rate": 9.927130044843049e-05, + "loss": 6.3076, + "step": 865 + }, + { + "epoch": 0.0591112922951488, + "grad_norm": 0.8034812808036804, + "learning_rate": 9.926705394754723e-05, + "loss": 6.5594, + "step": 870 + }, + { + "epoch": 0.059451012365810574, + "grad_norm": 0.9210667610168457, + "learning_rate": 9.926280744666395e-05, + "loss": 6.0014, + "step": 875 + }, + { + "epoch": 0.059790732436472344, + "grad_norm": 1.0861904621124268, + "learning_rate": 9.925856094578067e-05, + "loss": 6.2432, + "step": 880 + }, + { + "epoch": 0.06013045250713412, + "grad_norm": 0.8701607584953308, + "learning_rate": 9.925431444489742e-05, + "loss": 6.1582, + "step": 885 + }, + { + "epoch": 0.0604701725777959, + "grad_norm": 0.691939651966095, + "learning_rate": 9.925006794401413e-05, + "loss": 6.487, + "step": 890 + }, + { + "epoch": 0.060809892648457674, + "grad_norm": 1.1983147859573364, + "learning_rate": 9.924582144313086e-05, + "loss": 6.1766, + "step": 895 + }, + { + "epoch": 0.061149612719119444, + "grad_norm": 1.1613506078720093, + "learning_rate": 9.92415749422476e-05, + "loss": 6.5303, + "step": 900 + }, + { + "epoch": 0.06148933278978122, + "grad_norm": 0.8394651412963867, + "learning_rate": 9.923732844136431e-05, + "loss": 6.1502, + "step": 905 + }, + { + "epoch": 0.061829052860443, + "grad_norm": 1.2242004871368408, + "learning_rate": 9.923308194048104e-05, + "loss": 6.2054, + "step": 910 + }, + { + "epoch": 0.06216877293110477, + "grad_norm": 1.1255033016204834, + "learning_rate": 9.922883543959778e-05, + "loss": 6.4181, + "step": 915 + }, + { + "epoch": 0.06250849300176654, + "grad_norm": 0.7849110960960388, + "learning_rate": 9.92245889387145e-05, + "loss": 6.1411, + "step": 920 + }, + { + "epoch": 0.06284821307242831, + "grad_norm": 1.0032676458358765, + "learning_rate": 9.922034243783123e-05, + "loss": 6.4892, + "step": 925 + }, + { + "epoch": 0.0631879331430901, + "grad_norm": 1.127551555633545, + "learning_rate": 9.921609593694797e-05, + "loss": 6.3768, + "step": 930 + }, + { + "epoch": 0.06352765321375187, + "grad_norm": 1.0425925254821777, + "learning_rate": 9.921184943606468e-05, + "loss": 6.4448, + "step": 935 + }, + { + "epoch": 0.06386737328441364, + "grad_norm": 1.1642504930496216, + "learning_rate": 9.920760293518142e-05, + "loss": 5.7809, + "step": 940 + }, + { + "epoch": 0.06420709335507542, + "grad_norm": 1.8521403074264526, + "learning_rate": 9.920335643429814e-05, + "loss": 5.9249, + "step": 945 + }, + { + "epoch": 0.06454681342573719, + "grad_norm": 1.073219895362854, + "learning_rate": 9.919910993341487e-05, + "loss": 6.0477, + "step": 950 + }, + { + "epoch": 0.06488653349639897, + "grad_norm": 1.2109575271606445, + "learning_rate": 9.919486343253161e-05, + "loss": 6.0364, + "step": 955 + }, + { + "epoch": 0.06522625356706074, + "grad_norm": 1.1780409812927246, + "learning_rate": 9.919061693164832e-05, + "loss": 6.4147, + "step": 960 + }, + { + "epoch": 0.06556597363772251, + "grad_norm": 0.8810535073280334, + "learning_rate": 9.918637043076505e-05, + "loss": 6.0036, + "step": 965 + }, + { + "epoch": 0.0659056937083843, + "grad_norm": 0.7648366093635559, + "learning_rate": 9.918212392988179e-05, + "loss": 6.2567, + "step": 970 + }, + { + "epoch": 0.06624541377904607, + "grad_norm": 2.192458391189575, + "learning_rate": 9.91778774289985e-05, + "loss": 6.144, + "step": 975 + }, + { + "epoch": 0.06658513384970784, + "grad_norm": 1.2390516996383667, + "learning_rate": 9.917363092811523e-05, + "loss": 6.0287, + "step": 980 + }, + { + "epoch": 0.06692485392036962, + "grad_norm": 0.8258079886436462, + "learning_rate": 9.916938442723198e-05, + "loss": 6.2043, + "step": 985 + }, + { + "epoch": 0.06726457399103139, + "grad_norm": 0.9516924023628235, + "learning_rate": 9.916513792634869e-05, + "loss": 6.0747, + "step": 990 + }, + { + "epoch": 0.06760429406169316, + "grad_norm": 0.856916069984436, + "learning_rate": 9.916089142546542e-05, + "loss": 5.926, + "step": 995 + }, + { + "epoch": 0.06794401413235494, + "grad_norm": 0.8324723839759827, + "learning_rate": 9.915664492458216e-05, + "loss": 6.2707, + "step": 1000 + }, + { + "epoch": 0.06828373420301671, + "grad_norm": 0.7908216714859009, + "learning_rate": 9.915239842369887e-05, + "loss": 5.967, + "step": 1005 + }, + { + "epoch": 0.06862345427367848, + "grad_norm": 0.9094476103782654, + "learning_rate": 9.91481519228156e-05, + "loss": 5.6132, + "step": 1010 + }, + { + "epoch": 0.06896317434434027, + "grad_norm": 0.9734240770339966, + "learning_rate": 9.914390542193233e-05, + "loss": 5.9794, + "step": 1015 + }, + { + "epoch": 0.06930289441500204, + "grad_norm": 0.8310399055480957, + "learning_rate": 9.913965892104906e-05, + "loss": 6.0146, + "step": 1020 + }, + { + "epoch": 0.0696426144856638, + "grad_norm": 0.9436420798301697, + "learning_rate": 9.913541242016579e-05, + "loss": 5.9812, + "step": 1025 + }, + { + "epoch": 0.06998233455632559, + "grad_norm": 1.2283395528793335, + "learning_rate": 9.913116591928251e-05, + "loss": 5.9537, + "step": 1030 + }, + { + "epoch": 0.07032205462698736, + "grad_norm": 0.8751355409622192, + "learning_rate": 9.912691941839924e-05, + "loss": 5.9992, + "step": 1035 + }, + { + "epoch": 0.07066177469764914, + "grad_norm": 0.6706697344779968, + "learning_rate": 9.912267291751597e-05, + "loss": 6.0753, + "step": 1040 + }, + { + "epoch": 0.07100149476831091, + "grad_norm": 0.7029627561569214, + "learning_rate": 9.91184264166327e-05, + "loss": 6.0136, + "step": 1045 + }, + { + "epoch": 0.07134121483897268, + "grad_norm": 0.6171499490737915, + "learning_rate": 9.911417991574943e-05, + "loss": 6.0223, + "step": 1050 + }, + { + "epoch": 0.07168093490963447, + "grad_norm": 2.5934255123138428, + "learning_rate": 9.910993341486615e-05, + "loss": 5.807, + "step": 1055 + }, + { + "epoch": 0.07202065498029624, + "grad_norm": 0.9291547536849976, + "learning_rate": 9.910568691398288e-05, + "loss": 5.7086, + "step": 1060 + }, + { + "epoch": 0.072360375050958, + "grad_norm": 1.4394763708114624, + "learning_rate": 9.910144041309961e-05, + "loss": 5.5023, + "step": 1065 + }, + { + "epoch": 0.07270009512161979, + "grad_norm": 0.6298092603683472, + "learning_rate": 9.909719391221634e-05, + "loss": 5.9972, + "step": 1070 + }, + { + "epoch": 0.07303981519228156, + "grad_norm": 0.6151909232139587, + "learning_rate": 9.909294741133307e-05, + "loss": 5.6389, + "step": 1075 + }, + { + "epoch": 0.07337953526294333, + "grad_norm": 1.1861008405685425, + "learning_rate": 9.90887009104498e-05, + "loss": 6.2689, + "step": 1080 + }, + { + "epoch": 0.07371925533360511, + "grad_norm": 0.7876234650611877, + "learning_rate": 9.908445440956652e-05, + "loss": 5.5448, + "step": 1085 + }, + { + "epoch": 0.07405897540426688, + "grad_norm": 0.592897891998291, + "learning_rate": 9.908020790868325e-05, + "loss": 5.8057, + "step": 1090 + }, + { + "epoch": 0.07439869547492865, + "grad_norm": 0.9189316034317017, + "learning_rate": 9.907596140779998e-05, + "loss": 6.0782, + "step": 1095 + }, + { + "epoch": 0.07473841554559044, + "grad_norm": 0.6605345010757446, + "learning_rate": 9.90717149069167e-05, + "loss": 5.6442, + "step": 1100 + }, + { + "epoch": 0.0750781356162522, + "grad_norm": 0.6724756956100464, + "learning_rate": 9.906746840603343e-05, + "loss": 6.2757, + "step": 1105 + }, + { + "epoch": 0.07541785568691398, + "grad_norm": 0.8074867725372314, + "learning_rate": 9.906322190515016e-05, + "loss": 5.6868, + "step": 1110 + }, + { + "epoch": 0.07575757575757576, + "grad_norm": 2.2021851539611816, + "learning_rate": 9.905897540426689e-05, + "loss": 5.755, + "step": 1115 + }, + { + "epoch": 0.07609729582823753, + "grad_norm": 0.7941934466362, + "learning_rate": 9.905472890338362e-05, + "loss": 5.6318, + "step": 1120 + }, + { + "epoch": 0.07643701589889931, + "grad_norm": 0.9947513937950134, + "learning_rate": 9.905048240250035e-05, + "loss": 5.9247, + "step": 1125 + }, + { + "epoch": 0.07677673596956108, + "grad_norm": 0.6511673927307129, + "learning_rate": 9.904623590161707e-05, + "loss": 5.6326, + "step": 1130 + }, + { + "epoch": 0.07711645604022285, + "grad_norm": 0.6497818231582642, + "learning_rate": 9.90419894007338e-05, + "loss": 5.8753, + "step": 1135 + }, + { + "epoch": 0.07745617611088464, + "grad_norm": 0.5531424880027771, + "learning_rate": 9.903774289985053e-05, + "loss": 5.2715, + "step": 1140 + }, + { + "epoch": 0.0777958961815464, + "grad_norm": 0.7292714715003967, + "learning_rate": 9.903349639896725e-05, + "loss": 5.7824, + "step": 1145 + }, + { + "epoch": 0.07813561625220818, + "grad_norm": 0.6802114248275757, + "learning_rate": 9.902924989808399e-05, + "loss": 5.7912, + "step": 1150 + }, + { + "epoch": 0.07847533632286996, + "grad_norm": 0.680204451084137, + "learning_rate": 9.902500339720071e-05, + "loss": 5.961, + "step": 1155 + }, + { + "epoch": 0.07881505639353173, + "grad_norm": 0.596501350402832, + "learning_rate": 9.902075689631743e-05, + "loss": 5.5833, + "step": 1160 + }, + { + "epoch": 0.0791547764641935, + "grad_norm": 0.622715950012207, + "learning_rate": 9.901651039543417e-05, + "loss": 5.933, + "step": 1165 + }, + { + "epoch": 0.07949449653485528, + "grad_norm": 0.9008530974388123, + "learning_rate": 9.90122638945509e-05, + "loss": 5.568, + "step": 1170 + }, + { + "epoch": 0.07983421660551705, + "grad_norm": 0.5429263114929199, + "learning_rate": 9.900801739366761e-05, + "loss": 5.2962, + "step": 1175 + }, + { + "epoch": 0.08017393667617882, + "grad_norm": 0.6079940795898438, + "learning_rate": 9.900377089278435e-05, + "loss": 5.8575, + "step": 1180 + }, + { + "epoch": 0.0805136567468406, + "grad_norm": 0.6796315312385559, + "learning_rate": 9.899952439190108e-05, + "loss": 5.6058, + "step": 1185 + }, + { + "epoch": 0.08085337681750238, + "grad_norm": 0.6909620761871338, + "learning_rate": 9.89952778910178e-05, + "loss": 5.6206, + "step": 1190 + }, + { + "epoch": 0.08119309688816416, + "grad_norm": 0.8451843857765198, + "learning_rate": 9.899103139013454e-05, + "loss": 5.5017, + "step": 1195 + }, + { + "epoch": 0.08153281695882593, + "grad_norm": 0.6521558165550232, + "learning_rate": 9.898678488925127e-05, + "loss": 5.6647, + "step": 1200 + }, + { + "epoch": 0.0818725370294877, + "grad_norm": 0.4295422434806824, + "learning_rate": 9.898253838836798e-05, + "loss": 5.96, + "step": 1205 + }, + { + "epoch": 0.08221225710014948, + "grad_norm": 1.2797423601150513, + "learning_rate": 9.897829188748472e-05, + "loss": 5.5158, + "step": 1210 + }, + { + "epoch": 0.08255197717081125, + "grad_norm": 0.663374125957489, + "learning_rate": 9.897404538660144e-05, + "loss": 5.5387, + "step": 1215 + }, + { + "epoch": 0.08289169724147302, + "grad_norm": 0.6130101084709167, + "learning_rate": 9.896979888571817e-05, + "loss": 5.3825, + "step": 1220 + }, + { + "epoch": 0.0832314173121348, + "grad_norm": 0.9059043526649475, + "learning_rate": 9.896555238483491e-05, + "loss": 5.6168, + "step": 1225 + }, + { + "epoch": 0.08357113738279658, + "grad_norm": 0.9198205471038818, + "learning_rate": 9.896130588395162e-05, + "loss": 5.7126, + "step": 1230 + }, + { + "epoch": 0.08391085745345835, + "grad_norm": 0.6826533079147339, + "learning_rate": 9.895705938306835e-05, + "loss": 5.5344, + "step": 1235 + }, + { + "epoch": 0.08425057752412013, + "grad_norm": 0.6488471031188965, + "learning_rate": 9.895281288218509e-05, + "loss": 5.6311, + "step": 1240 + }, + { + "epoch": 0.0845902975947819, + "grad_norm": 1.4997718334197998, + "learning_rate": 9.89485663813018e-05, + "loss": 5.3847, + "step": 1245 + }, + { + "epoch": 0.08493001766544367, + "grad_norm": 0.8614689111709595, + "learning_rate": 9.894431988041853e-05, + "loss": 5.7996, + "step": 1250 + }, + { + "epoch": 0.08526973773610545, + "grad_norm": 0.910275936126709, + "learning_rate": 9.894007337953527e-05, + "loss": 5.6557, + "step": 1255 + }, + { + "epoch": 0.08560945780676722, + "grad_norm": 0.8584810495376587, + "learning_rate": 9.893582687865199e-05, + "loss": 5.6384, + "step": 1260 + }, + { + "epoch": 0.08594917787742899, + "grad_norm": 0.5326058864593506, + "learning_rate": 9.893158037776872e-05, + "loss": 5.4521, + "step": 1265 + }, + { + "epoch": 0.08628889794809078, + "grad_norm": 1.008244276046753, + "learning_rate": 9.892733387688546e-05, + "loss": 5.6282, + "step": 1270 + }, + { + "epoch": 0.08662861801875255, + "grad_norm": 0.9059062600135803, + "learning_rate": 9.892308737600217e-05, + "loss": 5.7698, + "step": 1275 + }, + { + "epoch": 0.08696833808941433, + "grad_norm": 0.754760205745697, + "learning_rate": 9.891884087511891e-05, + "loss": 5.7735, + "step": 1280 + }, + { + "epoch": 0.0873080581600761, + "grad_norm": 0.6785455346107483, + "learning_rate": 9.891459437423564e-05, + "loss": 5.8881, + "step": 1285 + }, + { + "epoch": 0.08764777823073787, + "grad_norm": 0.8128915429115295, + "learning_rate": 9.891034787335236e-05, + "loss": 5.5169, + "step": 1290 + }, + { + "epoch": 0.08798749830139965, + "grad_norm": 0.912551760673523, + "learning_rate": 9.89061013724691e-05, + "loss": 5.5467, + "step": 1295 + }, + { + "epoch": 0.08832721837206142, + "grad_norm": 2.0163161754608154, + "learning_rate": 9.890185487158581e-05, + "loss": 5.5448, + "step": 1300 + }, + { + "epoch": 0.08866693844272319, + "grad_norm": 0.5310774445533752, + "learning_rate": 9.889760837070254e-05, + "loss": 5.5268, + "step": 1305 + }, + { + "epoch": 0.08900665851338498, + "grad_norm": 0.603813886642456, + "learning_rate": 9.889336186981928e-05, + "loss": 5.6777, + "step": 1310 + }, + { + "epoch": 0.08934637858404675, + "grad_norm": 0.431869238615036, + "learning_rate": 9.8889115368936e-05, + "loss": 5.5403, + "step": 1315 + }, + { + "epoch": 0.08968609865470852, + "grad_norm": 0.7721471190452576, + "learning_rate": 9.888486886805273e-05, + "loss": 5.2792, + "step": 1320 + }, + { + "epoch": 0.0900258187253703, + "grad_norm": 1.0440256595611572, + "learning_rate": 9.888062236716947e-05, + "loss": 5.1811, + "step": 1325 + }, + { + "epoch": 0.09036553879603207, + "grad_norm": 0.5089631080627441, + "learning_rate": 9.887637586628618e-05, + "loss": 5.5219, + "step": 1330 + }, + { + "epoch": 0.09070525886669384, + "grad_norm": 0.5013499855995178, + "learning_rate": 9.887212936540291e-05, + "loss": 5.5286, + "step": 1335 + }, + { + "epoch": 0.09104497893735562, + "grad_norm": 0.8858599662780762, + "learning_rate": 9.886788286451965e-05, + "loss": 5.6356, + "step": 1340 + }, + { + "epoch": 0.09138469900801739, + "grad_norm": 0.45014268159866333, + "learning_rate": 9.886363636363637e-05, + "loss": 5.1792, + "step": 1345 + }, + { + "epoch": 0.09172441907867916, + "grad_norm": 0.6357929706573486, + "learning_rate": 9.88593898627531e-05, + "loss": 5.6234, + "step": 1350 + }, + { + "epoch": 0.09206413914934095, + "grad_norm": 4.850500106811523, + "learning_rate": 9.885514336186984e-05, + "loss": 5.3482, + "step": 1355 + }, + { + "epoch": 0.09240385922000272, + "grad_norm": 0.477006196975708, + "learning_rate": 9.885089686098655e-05, + "loss": 5.2999, + "step": 1360 + }, + { + "epoch": 0.0927435792906645, + "grad_norm": 0.5572239756584167, + "learning_rate": 9.884665036010328e-05, + "loss": 5.4801, + "step": 1365 + }, + { + "epoch": 0.09308329936132627, + "grad_norm": 0.3890332877635956, + "learning_rate": 9.884240385922e-05, + "loss": 5.3536, + "step": 1370 + }, + { + "epoch": 0.09342301943198804, + "grad_norm": 0.43894070386886597, + "learning_rate": 9.883815735833673e-05, + "loss": 5.6419, + "step": 1375 + }, + { + "epoch": 0.09376273950264982, + "grad_norm": 0.48212140798568726, + "learning_rate": 9.883391085745346e-05, + "loss": 5.431, + "step": 1380 + }, + { + "epoch": 0.09410245957331159, + "grad_norm": 0.5145598649978638, + "learning_rate": 9.882966435657019e-05, + "loss": 5.7753, + "step": 1385 + }, + { + "epoch": 0.09444217964397336, + "grad_norm": 0.4795299172401428, + "learning_rate": 9.882541785568692e-05, + "loss": 5.5992, + "step": 1390 + }, + { + "epoch": 0.09478189971463515, + "grad_norm": 0.4439328610897064, + "learning_rate": 9.882117135480365e-05, + "loss": 5.3292, + "step": 1395 + }, + { + "epoch": 0.09512161978529692, + "grad_norm": 0.6526133418083191, + "learning_rate": 9.881692485392037e-05, + "loss": 5.3765, + "step": 1400 + }, + { + "epoch": 0.09546133985595869, + "grad_norm": 0.8982023596763611, + "learning_rate": 9.88126783530371e-05, + "loss": 5.4783, + "step": 1405 + }, + { + "epoch": 0.09580105992662047, + "grad_norm": 0.46160343289375305, + "learning_rate": 9.880843185215383e-05, + "loss": 5.502, + "step": 1410 + }, + { + "epoch": 0.09614077999728224, + "grad_norm": 0.6750124096870422, + "learning_rate": 9.880418535127056e-05, + "loss": 5.4252, + "step": 1415 + }, + { + "epoch": 0.09648050006794401, + "grad_norm": 0.42901554703712463, + "learning_rate": 9.879993885038729e-05, + "loss": 5.3442, + "step": 1420 + }, + { + "epoch": 0.09682022013860579, + "grad_norm": 0.6184918284416199, + "learning_rate": 9.879569234950401e-05, + "loss": 5.3717, + "step": 1425 + }, + { + "epoch": 0.09715994020926756, + "grad_norm": 0.5006517171859741, + "learning_rate": 9.879144584862074e-05, + "loss": 5.4458, + "step": 1430 + }, + { + "epoch": 0.09749966027992933, + "grad_norm": 0.4495384693145752, + "learning_rate": 9.878719934773747e-05, + "loss": 5.263, + "step": 1435 + }, + { + "epoch": 0.09783938035059112, + "grad_norm": 0.4285268783569336, + "learning_rate": 9.87829528468542e-05, + "loss": 5.2207, + "step": 1440 + }, + { + "epoch": 0.09817910042125289, + "grad_norm": 1.0460352897644043, + "learning_rate": 9.877870634597093e-05, + "loss": 5.3448, + "step": 1445 + }, + { + "epoch": 0.09851882049191467, + "grad_norm": 0.6291869878768921, + "learning_rate": 9.877445984508765e-05, + "loss": 5.3182, + "step": 1450 + }, + { + "epoch": 0.09885854056257644, + "grad_norm": 1.0043153762817383, + "learning_rate": 9.877021334420438e-05, + "loss": 5.4226, + "step": 1455 + }, + { + "epoch": 0.09919826063323821, + "grad_norm": 0.7458539009094238, + "learning_rate": 9.876596684332111e-05, + "loss": 5.6298, + "step": 1460 + }, + { + "epoch": 0.09953798070389999, + "grad_norm": 0.45767852663993835, + "learning_rate": 9.876172034243784e-05, + "loss": 5.5378, + "step": 1465 + }, + { + "epoch": 0.09987770077456176, + "grad_norm": 0.4586849510669708, + "learning_rate": 9.875747384155457e-05, + "loss": 5.7105, + "step": 1470 + }, + { + "epoch": 0.10021742084522353, + "grad_norm": 0.37701913714408875, + "learning_rate": 9.87532273406713e-05, + "loss": 5.1507, + "step": 1475 + }, + { + "epoch": 0.10055714091588532, + "grad_norm": 0.42394065856933594, + "learning_rate": 9.874898083978802e-05, + "loss": 5.5164, + "step": 1480 + }, + { + "epoch": 0.10089686098654709, + "grad_norm": 0.6020697355270386, + "learning_rate": 9.874473433890475e-05, + "loss": 5.3247, + "step": 1485 + }, + { + "epoch": 0.10123658105720885, + "grad_norm": 0.5261949300765991, + "learning_rate": 9.874048783802148e-05, + "loss": 5.5159, + "step": 1490 + }, + { + "epoch": 0.10157630112787064, + "grad_norm": 0.4316195547580719, + "learning_rate": 9.87362413371382e-05, + "loss": 5.4615, + "step": 1495 + }, + { + "epoch": 0.10191602119853241, + "grad_norm": 0.3691781759262085, + "learning_rate": 9.873199483625492e-05, + "loss": 5.5583, + "step": 1500 + }, + { + "epoch": 0.10225574126919418, + "grad_norm": 0.5686614513397217, + "learning_rate": 9.872774833537166e-05, + "loss": 5.5936, + "step": 1505 + }, + { + "epoch": 0.10259546133985596, + "grad_norm": 0.3296063542366028, + "learning_rate": 9.872350183448839e-05, + "loss": 5.4636, + "step": 1510 + }, + { + "epoch": 0.10293518141051773, + "grad_norm": 0.46032190322875977, + "learning_rate": 9.87192553336051e-05, + "loss": 5.1794, + "step": 1515 + }, + { + "epoch": 0.1032749014811795, + "grad_norm": 0.4383249282836914, + "learning_rate": 9.871500883272185e-05, + "loss": 5.3524, + "step": 1520 + }, + { + "epoch": 0.10361462155184128, + "grad_norm": 0.5257749557495117, + "learning_rate": 9.871076233183857e-05, + "loss": 5.5045, + "step": 1525 + }, + { + "epoch": 0.10395434162250305, + "grad_norm": 0.5201256275177002, + "learning_rate": 9.870651583095529e-05, + "loss": 5.3214, + "step": 1530 + }, + { + "epoch": 0.10429406169316484, + "grad_norm": 0.7715117931365967, + "learning_rate": 9.870226933007203e-05, + "loss": 5.3583, + "step": 1535 + }, + { + "epoch": 0.10463378176382661, + "grad_norm": 0.34586212038993835, + "learning_rate": 9.869802282918876e-05, + "loss": 5.3337, + "step": 1540 + }, + { + "epoch": 0.10497350183448838, + "grad_norm": 0.45313313603401184, + "learning_rate": 9.869377632830547e-05, + "loss": 5.2788, + "step": 1545 + }, + { + "epoch": 0.10531322190515016, + "grad_norm": 0.4078027904033661, + "learning_rate": 9.868952982742221e-05, + "loss": 5.4364, + "step": 1550 + }, + { + "epoch": 0.10565294197581193, + "grad_norm": 0.4670262038707733, + "learning_rate": 9.868528332653894e-05, + "loss": 5.308, + "step": 1555 + }, + { + "epoch": 0.1059926620464737, + "grad_norm": 0.4951310455799103, + "learning_rate": 9.868103682565566e-05, + "loss": 5.5171, + "step": 1560 + }, + { + "epoch": 0.10633238211713548, + "grad_norm": 0.7351198792457581, + "learning_rate": 9.86767903247724e-05, + "loss": 5.6146, + "step": 1565 + }, + { + "epoch": 0.10667210218779725, + "grad_norm": 0.5064637660980225, + "learning_rate": 9.867254382388911e-05, + "loss": 5.3591, + "step": 1570 + }, + { + "epoch": 0.10701182225845902, + "grad_norm": 0.39143896102905273, + "learning_rate": 9.866829732300584e-05, + "loss": 5.3523, + "step": 1575 + }, + { + "epoch": 0.10735154232912081, + "grad_norm": 1.2670384645462036, + "learning_rate": 9.866405082212258e-05, + "loss": 5.2359, + "step": 1580 + }, + { + "epoch": 0.10769126239978258, + "grad_norm": 0.3745839297771454, + "learning_rate": 9.86598043212393e-05, + "loss": 5.2194, + "step": 1585 + }, + { + "epoch": 0.10803098247044435, + "grad_norm": 0.26325303316116333, + "learning_rate": 9.865555782035602e-05, + "loss": 5.1649, + "step": 1590 + }, + { + "epoch": 0.10837070254110613, + "grad_norm": 0.3311369717121124, + "learning_rate": 9.865131131947277e-05, + "loss": 5.326, + "step": 1595 + }, + { + "epoch": 0.1087104226117679, + "grad_norm": 0.4302009046077728, + "learning_rate": 9.864706481858948e-05, + "loss": 5.1757, + "step": 1600 + }, + { + "epoch": 0.10905014268242967, + "grad_norm": 0.5953149795532227, + "learning_rate": 9.864281831770621e-05, + "loss": 5.3937, + "step": 1605 + }, + { + "epoch": 0.10938986275309145, + "grad_norm": 0.4650028645992279, + "learning_rate": 9.863857181682295e-05, + "loss": 5.4321, + "step": 1610 + }, + { + "epoch": 0.10972958282375322, + "grad_norm": 1.5760172605514526, + "learning_rate": 9.863432531593966e-05, + "loss": 5.1935, + "step": 1615 + }, + { + "epoch": 0.11006930289441501, + "grad_norm": 0.4063778221607208, + "learning_rate": 9.86300788150564e-05, + "loss": 5.0892, + "step": 1620 + }, + { + "epoch": 0.11040902296507678, + "grad_norm": 0.3407844603061676, + "learning_rate": 9.862583231417313e-05, + "loss": 5.2536, + "step": 1625 + }, + { + "epoch": 0.11074874303573855, + "grad_norm": 0.5247534513473511, + "learning_rate": 9.862158581328985e-05, + "loss": 5.3708, + "step": 1630 + }, + { + "epoch": 0.11108846310640033, + "grad_norm": 1.0360844135284424, + "learning_rate": 9.861733931240659e-05, + "loss": 5.3132, + "step": 1635 + }, + { + "epoch": 0.1114281831770621, + "grad_norm": 0.5687776803970337, + "learning_rate": 9.86130928115233e-05, + "loss": 5.2554, + "step": 1640 + }, + { + "epoch": 0.11176790324772387, + "grad_norm": 0.3441666066646576, + "learning_rate": 9.860884631064003e-05, + "loss": 5.1661, + "step": 1645 + }, + { + "epoch": 0.11210762331838565, + "grad_norm": 0.6809844970703125, + "learning_rate": 9.860459980975677e-05, + "loss": 5.3175, + "step": 1650 + }, + { + "epoch": 0.11244734338904742, + "grad_norm": 0.4880785644054413, + "learning_rate": 9.860035330887349e-05, + "loss": 5.3368, + "step": 1655 + }, + { + "epoch": 0.1127870634597092, + "grad_norm": 0.5996628999710083, + "learning_rate": 9.859610680799022e-05, + "loss": 5.2202, + "step": 1660 + }, + { + "epoch": 0.11312678353037098, + "grad_norm": 0.3819567561149597, + "learning_rate": 9.859186030710696e-05, + "loss": 5.2171, + "step": 1665 + }, + { + "epoch": 0.11346650360103275, + "grad_norm": 0.8039321899414062, + "learning_rate": 9.858761380622367e-05, + "loss": 5.1053, + "step": 1670 + }, + { + "epoch": 0.11380622367169452, + "grad_norm": 0.7948293685913086, + "learning_rate": 9.85833673053404e-05, + "loss": 5.1902, + "step": 1675 + }, + { + "epoch": 0.1141459437423563, + "grad_norm": 0.45758846402168274, + "learning_rate": 9.857912080445714e-05, + "loss": 5.2766, + "step": 1680 + }, + { + "epoch": 0.11448566381301807, + "grad_norm": 0.33638903498649597, + "learning_rate": 9.857487430357386e-05, + "loss": 5.1641, + "step": 1685 + }, + { + "epoch": 0.11482538388367985, + "grad_norm": 0.3370652496814728, + "learning_rate": 9.857062780269058e-05, + "loss": 4.8306, + "step": 1690 + }, + { + "epoch": 0.11516510395434162, + "grad_norm": 0.30151012539863586, + "learning_rate": 9.856638130180733e-05, + "loss": 4.9977, + "step": 1695 + }, + { + "epoch": 0.1155048240250034, + "grad_norm": 0.4379021227359772, + "learning_rate": 9.856213480092404e-05, + "loss": 5.2388, + "step": 1700 + }, + { + "epoch": 0.11584454409566518, + "grad_norm": 0.5139544606208801, + "learning_rate": 9.855788830004077e-05, + "loss": 5.0948, + "step": 1705 + }, + { + "epoch": 0.11618426416632695, + "grad_norm": 0.2696703374385834, + "learning_rate": 9.855364179915751e-05, + "loss": 5.1955, + "step": 1710 + }, + { + "epoch": 0.11652398423698872, + "grad_norm": 0.43758052587509155, + "learning_rate": 9.854939529827422e-05, + "loss": 5.1093, + "step": 1715 + }, + { + "epoch": 0.1168637043076505, + "grad_norm": 0.45877712965011597, + "learning_rate": 9.854514879739095e-05, + "loss": 5.1859, + "step": 1720 + }, + { + "epoch": 0.11720342437831227, + "grad_norm": 0.5116316676139832, + "learning_rate": 9.854090229650768e-05, + "loss": 4.9734, + "step": 1725 + }, + { + "epoch": 0.11754314444897404, + "grad_norm": 0.3733248710632324, + "learning_rate": 9.853665579562441e-05, + "loss": 4.9164, + "step": 1730 + }, + { + "epoch": 0.11788286451963582, + "grad_norm": 0.9365966320037842, + "learning_rate": 9.853240929474114e-05, + "loss": 5.1925, + "step": 1735 + }, + { + "epoch": 0.1182225845902976, + "grad_norm": 0.43242642283439636, + "learning_rate": 9.852816279385786e-05, + "loss": 5.2445, + "step": 1740 + }, + { + "epoch": 0.11856230466095936, + "grad_norm": 0.28928515315055847, + "learning_rate": 9.852391629297459e-05, + "loss": 5.2208, + "step": 1745 + }, + { + "epoch": 0.11890202473162115, + "grad_norm": 0.4645937383174896, + "learning_rate": 9.851966979209132e-05, + "loss": 5.1848, + "step": 1750 + }, + { + "epoch": 0.11924174480228292, + "grad_norm": 0.3553106486797333, + "learning_rate": 9.851542329120805e-05, + "loss": 5.0109, + "step": 1755 + }, + { + "epoch": 0.11958146487294469, + "grad_norm": 0.47933322191238403, + "learning_rate": 9.851117679032478e-05, + "loss": 5.2334, + "step": 1760 + }, + { + "epoch": 0.11992118494360647, + "grad_norm": 0.3383587896823883, + "learning_rate": 9.85069302894415e-05, + "loss": 5.3224, + "step": 1765 + }, + { + "epoch": 0.12026090501426824, + "grad_norm": 0.41393041610717773, + "learning_rate": 9.850268378855823e-05, + "loss": 5.062, + "step": 1770 + }, + { + "epoch": 0.12060062508493002, + "grad_norm": 0.43941059708595276, + "learning_rate": 9.849843728767496e-05, + "loss": 5.0013, + "step": 1775 + }, + { + "epoch": 0.1209403451555918, + "grad_norm": 0.35179632902145386, + "learning_rate": 9.849419078679169e-05, + "loss": 5.1989, + "step": 1780 + }, + { + "epoch": 0.12128006522625356, + "grad_norm": 0.7730126976966858, + "learning_rate": 9.848994428590842e-05, + "loss": 5.1821, + "step": 1785 + }, + { + "epoch": 0.12161978529691535, + "grad_norm": 0.45934972167015076, + "learning_rate": 9.848569778502514e-05, + "loss": 5.2454, + "step": 1790 + }, + { + "epoch": 0.12195950536757712, + "grad_norm": 0.6049938797950745, + "learning_rate": 9.848145128414187e-05, + "loss": 5.0269, + "step": 1795 + }, + { + "epoch": 0.12229922543823889, + "grad_norm": 0.9805595874786377, + "learning_rate": 9.84772047832586e-05, + "loss": 4.885, + "step": 1800 + }, + { + "epoch": 0.12263894550890067, + "grad_norm": 0.47447869181632996, + "learning_rate": 9.847295828237533e-05, + "loss": 4.9461, + "step": 1805 + }, + { + "epoch": 0.12297866557956244, + "grad_norm": 0.4037536382675171, + "learning_rate": 9.846871178149206e-05, + "loss": 4.79, + "step": 1810 + }, + { + "epoch": 0.12331838565022421, + "grad_norm": 0.6516850590705872, + "learning_rate": 9.846446528060878e-05, + "loss": 4.8244, + "step": 1815 + }, + { + "epoch": 0.123658105720886, + "grad_norm": 0.46356433629989624, + "learning_rate": 9.846021877972551e-05, + "loss": 4.8264, + "step": 1820 + }, + { + "epoch": 0.12399782579154776, + "grad_norm": 1.0530160665512085, + "learning_rate": 9.845597227884224e-05, + "loss": 4.7702, + "step": 1825 + }, + { + "epoch": 0.12433754586220953, + "grad_norm": 1.3012051582336426, + "learning_rate": 9.845172577795897e-05, + "loss": 4.9146, + "step": 1830 + }, + { + "epoch": 0.12467726593287132, + "grad_norm": 0.6158355474472046, + "learning_rate": 9.84474792770757e-05, + "loss": 5.0082, + "step": 1835 + }, + { + "epoch": 0.1250169860035331, + "grad_norm": 4.688101768493652, + "learning_rate": 9.844323277619241e-05, + "loss": 4.9467, + "step": 1840 + }, + { + "epoch": 0.12535670607419486, + "grad_norm": 0.5098426342010498, + "learning_rate": 9.843898627530915e-05, + "loss": 4.7443, + "step": 1845 + }, + { + "epoch": 0.12569642614485663, + "grad_norm": 0.6203608512878418, + "learning_rate": 9.843473977442588e-05, + "loss": 5.0459, + "step": 1850 + }, + { + "epoch": 0.12603614621551842, + "grad_norm": 0.5965786576271057, + "learning_rate": 9.84304932735426e-05, + "loss": 5.0615, + "step": 1855 + }, + { + "epoch": 0.1263758662861802, + "grad_norm": 0.7298919558525085, + "learning_rate": 9.842624677265934e-05, + "loss": 4.9333, + "step": 1860 + }, + { + "epoch": 0.12671558635684196, + "grad_norm": 0.514262318611145, + "learning_rate": 9.842200027177606e-05, + "loss": 4.9152, + "step": 1865 + }, + { + "epoch": 0.12705530642750373, + "grad_norm": 0.49908140301704407, + "learning_rate": 9.841775377089278e-05, + "loss": 4.8825, + "step": 1870 + }, + { + "epoch": 0.1273950264981655, + "grad_norm": 0.3929906487464905, + "learning_rate": 9.841350727000952e-05, + "loss": 4.6288, + "step": 1875 + }, + { + "epoch": 0.12773474656882727, + "grad_norm": 0.6221901178359985, + "learning_rate": 9.840926076912625e-05, + "loss": 4.6867, + "step": 1880 + }, + { + "epoch": 0.12807446663948907, + "grad_norm": 0.4237980544567108, + "learning_rate": 9.840501426824296e-05, + "loss": 5.0275, + "step": 1885 + }, + { + "epoch": 0.12841418671015084, + "grad_norm": 0.5076737403869629, + "learning_rate": 9.84007677673597e-05, + "loss": 4.3183, + "step": 1890 + }, + { + "epoch": 0.1287539067808126, + "grad_norm": 0.562611997127533, + "learning_rate": 9.839652126647643e-05, + "loss": 4.8492, + "step": 1895 + }, + { + "epoch": 0.12909362685147438, + "grad_norm": 0.43838977813720703, + "learning_rate": 9.839227476559315e-05, + "loss": 4.7559, + "step": 1900 + }, + { + "epoch": 0.12943334692213615, + "grad_norm": 1.7356271743774414, + "learning_rate": 9.838802826470989e-05, + "loss": 4.4937, + "step": 1905 + }, + { + "epoch": 0.12977306699279795, + "grad_norm": 0.35975855588912964, + "learning_rate": 9.838378176382662e-05, + "loss": 4.8153, + "step": 1910 + }, + { + "epoch": 0.13011278706345972, + "grad_norm": 0.46843382716178894, + "learning_rate": 9.837953526294333e-05, + "loss": 4.7742, + "step": 1915 + }, + { + "epoch": 0.1304525071341215, + "grad_norm": 0.49429741501808167, + "learning_rate": 9.837528876206007e-05, + "loss": 4.5403, + "step": 1920 + }, + { + "epoch": 0.13079222720478326, + "grad_norm": 0.496423602104187, + "learning_rate": 9.837104226117679e-05, + "loss": 4.8032, + "step": 1925 + }, + { + "epoch": 0.13113194727544503, + "grad_norm": 0.7953855395317078, + "learning_rate": 9.836679576029352e-05, + "loss": 4.8191, + "step": 1930 + }, + { + "epoch": 0.1314716673461068, + "grad_norm": 0.5093162655830383, + "learning_rate": 9.836254925941026e-05, + "loss": 4.794, + "step": 1935 + }, + { + "epoch": 0.1318113874167686, + "grad_norm": 0.37883055210113525, + "learning_rate": 9.835830275852697e-05, + "loss": 4.3129, + "step": 1940 + }, + { + "epoch": 0.13215110748743036, + "grad_norm": 0.6972466707229614, + "learning_rate": 9.83540562576437e-05, + "loss": 4.6677, + "step": 1945 + }, + { + "epoch": 0.13249082755809213, + "grad_norm": 0.4960924983024597, + "learning_rate": 9.834980975676044e-05, + "loss": 4.7554, + "step": 1950 + }, + { + "epoch": 0.1328305476287539, + "grad_norm": 0.3313211500644684, + "learning_rate": 9.834556325587716e-05, + "loss": 4.5136, + "step": 1955 + }, + { + "epoch": 0.13317026769941567, + "grad_norm": 0.37889447808265686, + "learning_rate": 9.83413167549939e-05, + "loss": 4.6352, + "step": 1960 + }, + { + "epoch": 0.13350998777007744, + "grad_norm": 0.2897196412086487, + "learning_rate": 9.833707025411062e-05, + "loss": 4.2392, + "step": 1965 + }, + { + "epoch": 0.13384970784073924, + "grad_norm": 0.4556117653846741, + "learning_rate": 9.833282375322734e-05, + "loss": 4.4802, + "step": 1970 + }, + { + "epoch": 0.134189427911401, + "grad_norm": 0.24939770996570587, + "learning_rate": 9.832857725234408e-05, + "loss": 4.8028, + "step": 1975 + }, + { + "epoch": 0.13452914798206278, + "grad_norm": 0.5589706301689148, + "learning_rate": 9.832433075146081e-05, + "loss": 4.5484, + "step": 1980 + }, + { + "epoch": 0.13486886805272455, + "grad_norm": 0.403367817401886, + "learning_rate": 9.832008425057752e-05, + "loss": 4.6303, + "step": 1985 + }, + { + "epoch": 0.13520858812338632, + "grad_norm": 0.2891002595424652, + "learning_rate": 9.831583774969426e-05, + "loss": 4.6125, + "step": 1990 + }, + { + "epoch": 0.13554830819404812, + "grad_norm": 0.4545519948005676, + "learning_rate": 9.831159124881098e-05, + "loss": 4.6801, + "step": 1995 + }, + { + "epoch": 0.1358880282647099, + "grad_norm": 0.2752302289009094, + "learning_rate": 9.830734474792771e-05, + "loss": 4.5331, + "step": 2000 + }, + { + "epoch": 0.13622774833537166, + "grad_norm": 0.4735427498817444, + "learning_rate": 9.830309824704445e-05, + "loss": 4.5487, + "step": 2005 + }, + { + "epoch": 0.13656746840603343, + "grad_norm": 0.2892632782459259, + "learning_rate": 9.829885174616116e-05, + "loss": 4.7872, + "step": 2010 + }, + { + "epoch": 0.1369071884766952, + "grad_norm": 0.3587241768836975, + "learning_rate": 9.829460524527789e-05, + "loss": 4.8017, + "step": 2015 + }, + { + "epoch": 0.13724690854735697, + "grad_norm": 0.8643600940704346, + "learning_rate": 9.829035874439463e-05, + "loss": 4.9978, + "step": 2020 + }, + { + "epoch": 0.13758662861801876, + "grad_norm": 0.3995005786418915, + "learning_rate": 9.828611224351135e-05, + "loss": 4.7966, + "step": 2025 + }, + { + "epoch": 0.13792634868868053, + "grad_norm": 0.5287114381790161, + "learning_rate": 9.828186574262808e-05, + "loss": 4.6836, + "step": 2030 + }, + { + "epoch": 0.1382660687593423, + "grad_norm": 0.356660932302475, + "learning_rate": 9.827761924174482e-05, + "loss": 4.6598, + "step": 2035 + }, + { + "epoch": 0.13860578883000407, + "grad_norm": 0.3594839572906494, + "learning_rate": 9.827337274086153e-05, + "loss": 4.7932, + "step": 2040 + }, + { + "epoch": 0.13894550890066584, + "grad_norm": 0.460989385843277, + "learning_rate": 9.826912623997826e-05, + "loss": 4.8404, + "step": 2045 + }, + { + "epoch": 0.1392852289713276, + "grad_norm": 0.3044515550136566, + "learning_rate": 9.8264879739095e-05, + "loss": 4.4804, + "step": 2050 + }, + { + "epoch": 0.1396249490419894, + "grad_norm": 0.2440759837627411, + "learning_rate": 9.826063323821172e-05, + "loss": 4.6584, + "step": 2055 + }, + { + "epoch": 0.13996466911265118, + "grad_norm": 0.39719679951667786, + "learning_rate": 9.825638673732844e-05, + "loss": 4.6913, + "step": 2060 + }, + { + "epoch": 0.14030438918331295, + "grad_norm": 0.2519219219684601, + "learning_rate": 9.825214023644517e-05, + "loss": 4.7914, + "step": 2065 + }, + { + "epoch": 0.14064410925397472, + "grad_norm": 0.27213895320892334, + "learning_rate": 9.82478937355619e-05, + "loss": 4.4571, + "step": 2070 + }, + { + "epoch": 0.1409838293246365, + "grad_norm": 0.31952184438705444, + "learning_rate": 9.824364723467863e-05, + "loss": 4.4334, + "step": 2075 + }, + { + "epoch": 0.1413235493952983, + "grad_norm": 0.2466011643409729, + "learning_rate": 9.823940073379536e-05, + "loss": 4.623, + "step": 2080 + }, + { + "epoch": 0.14166326946596006, + "grad_norm": 0.41923725605010986, + "learning_rate": 9.823515423291208e-05, + "loss": 4.5557, + "step": 2085 + }, + { + "epoch": 0.14200298953662183, + "grad_norm": 0.23959270119667053, + "learning_rate": 9.823090773202881e-05, + "loss": 4.5756, + "step": 2090 + }, + { + "epoch": 0.1423427096072836, + "grad_norm": 0.7019773721694946, + "learning_rate": 9.822666123114554e-05, + "loss": 4.74, + "step": 2095 + }, + { + "epoch": 0.14268242967794537, + "grad_norm": 0.6014403700828552, + "learning_rate": 9.822241473026227e-05, + "loss": 4.4456, + "step": 2100 + }, + { + "epoch": 0.14302214974860714, + "grad_norm": 0.2578621804714203, + "learning_rate": 9.8218168229379e-05, + "loss": 4.6776, + "step": 2105 + }, + { + "epoch": 0.14336186981926893, + "grad_norm": 0.24368084967136383, + "learning_rate": 9.821392172849572e-05, + "loss": 4.7798, + "step": 2110 + }, + { + "epoch": 0.1437015898899307, + "grad_norm": 0.4451867938041687, + "learning_rate": 9.820967522761245e-05, + "loss": 4.2507, + "step": 2115 + }, + { + "epoch": 0.14404130996059247, + "grad_norm": 0.27697330713272095, + "learning_rate": 9.820542872672918e-05, + "loss": 4.6886, + "step": 2120 + }, + { + "epoch": 0.14438103003125424, + "grad_norm": 0.8379690647125244, + "learning_rate": 9.820118222584591e-05, + "loss": 4.5629, + "step": 2125 + }, + { + "epoch": 0.144720750101916, + "grad_norm": 0.9834319353103638, + "learning_rate": 9.819693572496264e-05, + "loss": 4.4945, + "step": 2130 + }, + { + "epoch": 0.14506047017257778, + "grad_norm": 0.45272937417030334, + "learning_rate": 9.819268922407936e-05, + "loss": 4.6099, + "step": 2135 + }, + { + "epoch": 0.14540019024323958, + "grad_norm": 0.517729640007019, + "learning_rate": 9.818844272319609e-05, + "loss": 4.6808, + "step": 2140 + }, + { + "epoch": 0.14573991031390135, + "grad_norm": 0.26133647561073303, + "learning_rate": 9.818419622231282e-05, + "loss": 4.4916, + "step": 2145 + }, + { + "epoch": 0.14607963038456312, + "grad_norm": 0.31160035729408264, + "learning_rate": 9.817994972142955e-05, + "loss": 4.4746, + "step": 2150 + }, + { + "epoch": 0.1464193504552249, + "grad_norm": 0.3950839936733246, + "learning_rate": 9.817570322054628e-05, + "loss": 4.8946, + "step": 2155 + }, + { + "epoch": 0.14675907052588666, + "grad_norm": 0.254171758890152, + "learning_rate": 9.8171456719663e-05, + "loss": 4.5237, + "step": 2160 + }, + { + "epoch": 0.14709879059654846, + "grad_norm": 0.4314219653606415, + "learning_rate": 9.816721021877973e-05, + "loss": 4.6116, + "step": 2165 + }, + { + "epoch": 0.14743851066721023, + "grad_norm": 0.2894288897514343, + "learning_rate": 9.816296371789646e-05, + "loss": 4.4748, + "step": 2170 + }, + { + "epoch": 0.147778230737872, + "grad_norm": 0.2681034207344055, + "learning_rate": 9.815871721701319e-05, + "loss": 4.5926, + "step": 2175 + }, + { + "epoch": 0.14811795080853377, + "grad_norm": 0.27911391854286194, + "learning_rate": 9.815447071612992e-05, + "loss": 4.5391, + "step": 2180 + }, + { + "epoch": 0.14845767087919554, + "grad_norm": 0.3182697296142578, + "learning_rate": 9.815022421524664e-05, + "loss": 4.5708, + "step": 2185 + }, + { + "epoch": 0.1487973909498573, + "grad_norm": 0.2478509396314621, + "learning_rate": 9.814597771436337e-05, + "loss": 4.3974, + "step": 2190 + }, + { + "epoch": 0.1491371110205191, + "grad_norm": 0.3418025076389313, + "learning_rate": 9.814173121348009e-05, + "loss": 4.5312, + "step": 2195 + }, + { + "epoch": 0.14947683109118087, + "grad_norm": 0.2670694887638092, + "learning_rate": 9.813748471259683e-05, + "loss": 4.5906, + "step": 2200 + }, + { + "epoch": 0.14981655116184264, + "grad_norm": 0.29988008737564087, + "learning_rate": 9.813323821171356e-05, + "loss": 4.4151, + "step": 2205 + }, + { + "epoch": 0.1501562712325044, + "grad_norm": 0.2230396866798401, + "learning_rate": 9.812899171083027e-05, + "loss": 4.6196, + "step": 2210 + }, + { + "epoch": 0.15049599130316618, + "grad_norm": 0.2940434515476227, + "learning_rate": 9.812474520994701e-05, + "loss": 4.5765, + "step": 2215 + }, + { + "epoch": 0.15083571137382795, + "grad_norm": 0.2943139672279358, + "learning_rate": 9.812049870906374e-05, + "loss": 4.7439, + "step": 2220 + }, + { + "epoch": 0.15117543144448975, + "grad_norm": 0.5938501954078674, + "learning_rate": 9.811625220818045e-05, + "loss": 4.6233, + "step": 2225 + }, + { + "epoch": 0.15151515151515152, + "grad_norm": 0.29499292373657227, + "learning_rate": 9.81120057072972e-05, + "loss": 4.43, + "step": 2230 + }, + { + "epoch": 0.1518548715858133, + "grad_norm": 0.21327312290668488, + "learning_rate": 9.810775920641392e-05, + "loss": 4.5865, + "step": 2235 + }, + { + "epoch": 0.15219459165647506, + "grad_norm": 0.4112052917480469, + "learning_rate": 9.810351270553064e-05, + "loss": 4.7013, + "step": 2240 + }, + { + "epoch": 0.15253431172713683, + "grad_norm": 0.40261027216911316, + "learning_rate": 9.809926620464738e-05, + "loss": 4.6419, + "step": 2245 + }, + { + "epoch": 0.15287403179779863, + "grad_norm": 0.2737533748149872, + "learning_rate": 9.809501970376411e-05, + "loss": 4.3994, + "step": 2250 + }, + { + "epoch": 0.1532137518684604, + "grad_norm": 0.24050559103488922, + "learning_rate": 9.809077320288082e-05, + "loss": 4.5648, + "step": 2255 + }, + { + "epoch": 0.15355347193912217, + "grad_norm": 0.3781549036502838, + "learning_rate": 9.808652670199756e-05, + "loss": 4.4987, + "step": 2260 + }, + { + "epoch": 0.15389319200978394, + "grad_norm": 0.46098098158836365, + "learning_rate": 9.808228020111428e-05, + "loss": 4.5322, + "step": 2265 + }, + { + "epoch": 0.1542329120804457, + "grad_norm": 0.32969388365745544, + "learning_rate": 9.8078033700231e-05, + "loss": 4.59, + "step": 2270 + }, + { + "epoch": 0.15457263215110748, + "grad_norm": 0.28195780515670776, + "learning_rate": 9.807378719934775e-05, + "loss": 4.4548, + "step": 2275 + }, + { + "epoch": 0.15491235222176927, + "grad_norm": 0.2665387690067291, + "learning_rate": 9.806954069846446e-05, + "loss": 4.4353, + "step": 2280 + }, + { + "epoch": 0.15525207229243104, + "grad_norm": 0.3116438686847687, + "learning_rate": 9.806529419758119e-05, + "loss": 4.3557, + "step": 2285 + }, + { + "epoch": 0.1555917923630928, + "grad_norm": 0.42467501759529114, + "learning_rate": 9.806104769669793e-05, + "loss": 4.3241, + "step": 2290 + }, + { + "epoch": 0.15593151243375458, + "grad_norm": 0.24590204656124115, + "learning_rate": 9.805680119581465e-05, + "loss": 4.3719, + "step": 2295 + }, + { + "epoch": 0.15627123250441635, + "grad_norm": 0.7295488119125366, + "learning_rate": 9.805255469493139e-05, + "loss": 4.5891, + "step": 2300 + }, + { + "epoch": 0.15661095257507815, + "grad_norm": 0.24560780823230743, + "learning_rate": 9.804830819404812e-05, + "loss": 4.6124, + "step": 2305 + }, + { + "epoch": 0.15695067264573992, + "grad_norm": 0.2907837927341461, + "learning_rate": 9.804406169316483e-05, + "loss": 4.2532, + "step": 2310 + }, + { + "epoch": 0.1572903927164017, + "grad_norm": 1.0109922885894775, + "learning_rate": 9.803981519228157e-05, + "loss": 4.5454, + "step": 2315 + }, + { + "epoch": 0.15763011278706346, + "grad_norm": 0.2637081444263458, + "learning_rate": 9.80355686913983e-05, + "loss": 4.5952, + "step": 2320 + }, + { + "epoch": 0.15796983285772523, + "grad_norm": 0.2559982240200043, + "learning_rate": 9.803132219051501e-05, + "loss": 4.6078, + "step": 2325 + }, + { + "epoch": 0.158309552928387, + "grad_norm": 0.4410446882247925, + "learning_rate": 9.802707568963176e-05, + "loss": 4.6202, + "step": 2330 + }, + { + "epoch": 0.1586492729990488, + "grad_norm": 0.20168878138065338, + "learning_rate": 9.802282918874848e-05, + "loss": 4.4023, + "step": 2335 + }, + { + "epoch": 0.15898899306971057, + "grad_norm": 0.29185861349105835, + "learning_rate": 9.80185826878652e-05, + "loss": 4.543, + "step": 2340 + }, + { + "epoch": 0.15932871314037234, + "grad_norm": 0.22290275990962982, + "learning_rate": 9.801433618698194e-05, + "loss": 4.6697, + "step": 2345 + }, + { + "epoch": 0.1596684332110341, + "grad_norm": 0.7529789805412292, + "learning_rate": 9.801008968609865e-05, + "loss": 4.737, + "step": 2350 + }, + { + "epoch": 0.16000815328169588, + "grad_norm": 0.3712422549724579, + "learning_rate": 9.800584318521538e-05, + "loss": 4.6241, + "step": 2355 + }, + { + "epoch": 0.16034787335235764, + "grad_norm": 0.23941993713378906, + "learning_rate": 9.800159668433212e-05, + "loss": 4.6871, + "step": 2360 + }, + { + "epoch": 0.16068759342301944, + "grad_norm": 0.37533217668533325, + "learning_rate": 9.799735018344884e-05, + "loss": 4.467, + "step": 2365 + }, + { + "epoch": 0.1610273134936812, + "grad_norm": 0.2338525950908661, + "learning_rate": 9.799310368256557e-05, + "loss": 4.435, + "step": 2370 + }, + { + "epoch": 0.16136703356434298, + "grad_norm": 0.26814886927604675, + "learning_rate": 9.798885718168231e-05, + "loss": 4.4838, + "step": 2375 + }, + { + "epoch": 0.16170675363500475, + "grad_norm": 0.3187100887298584, + "learning_rate": 9.798461068079902e-05, + "loss": 4.354, + "step": 2380 + }, + { + "epoch": 0.16204647370566652, + "grad_norm": 0.7054830193519592, + "learning_rate": 9.798036417991575e-05, + "loss": 4.489, + "step": 2385 + }, + { + "epoch": 0.16238619377632832, + "grad_norm": 0.25023216009140015, + "learning_rate": 9.797611767903249e-05, + "loss": 4.6024, + "step": 2390 + }, + { + "epoch": 0.1627259138469901, + "grad_norm": 0.24370110034942627, + "learning_rate": 9.79718711781492e-05, + "loss": 4.3951, + "step": 2395 + }, + { + "epoch": 0.16306563391765186, + "grad_norm": 0.23113249242305756, + "learning_rate": 9.796762467726593e-05, + "loss": 4.4352, + "step": 2400 + }, + { + "epoch": 0.16340535398831363, + "grad_norm": 0.4448549747467041, + "learning_rate": 9.796337817638268e-05, + "loss": 4.4063, + "step": 2405 + }, + { + "epoch": 0.1637450740589754, + "grad_norm": 0.20236225426197052, + "learning_rate": 9.795913167549939e-05, + "loss": 4.6175, + "step": 2410 + }, + { + "epoch": 0.16408479412963717, + "grad_norm": 0.5627440810203552, + "learning_rate": 9.795488517461612e-05, + "loss": 4.5675, + "step": 2415 + }, + { + "epoch": 0.16442451420029897, + "grad_norm": 0.28272920846939087, + "learning_rate": 9.795063867373285e-05, + "loss": 4.6146, + "step": 2420 + }, + { + "epoch": 0.16476423427096074, + "grad_norm": 0.2605418264865875, + "learning_rate": 9.794639217284957e-05, + "loss": 4.5697, + "step": 2425 + }, + { + "epoch": 0.1651039543416225, + "grad_norm": 0.23570238053798676, + "learning_rate": 9.79421456719663e-05, + "loss": 4.5072, + "step": 2430 + }, + { + "epoch": 0.16544367441228428, + "grad_norm": 0.20745481550693512, + "learning_rate": 9.793789917108303e-05, + "loss": 4.5735, + "step": 2435 + }, + { + "epoch": 0.16578339448294604, + "grad_norm": 0.23489026725292206, + "learning_rate": 9.793365267019976e-05, + "loss": 4.4731, + "step": 2440 + }, + { + "epoch": 0.16612311455360781, + "grad_norm": 0.4274902939796448, + "learning_rate": 9.792940616931649e-05, + "loss": 4.6706, + "step": 2445 + }, + { + "epoch": 0.1664628346242696, + "grad_norm": 0.25951382517814636, + "learning_rate": 9.792515966843321e-05, + "loss": 4.441, + "step": 2450 + }, + { + "epoch": 0.16680255469493138, + "grad_norm": 1.9463924169540405, + "learning_rate": 9.792091316754994e-05, + "loss": 4.4691, + "step": 2455 + }, + { + "epoch": 0.16714227476559315, + "grad_norm": 0.4177579879760742, + "learning_rate": 9.791666666666667e-05, + "loss": 4.4903, + "step": 2460 + }, + { + "epoch": 0.16748199483625492, + "grad_norm": 0.533138632774353, + "learning_rate": 9.79124201657834e-05, + "loss": 4.3311, + "step": 2465 + }, + { + "epoch": 0.1678217149069167, + "grad_norm": 0.2822255790233612, + "learning_rate": 9.790817366490013e-05, + "loss": 4.5948, + "step": 2470 + }, + { + "epoch": 0.1681614349775785, + "grad_norm": 0.29035472869873047, + "learning_rate": 9.790392716401685e-05, + "loss": 4.5585, + "step": 2475 + }, + { + "epoch": 0.16850115504824026, + "grad_norm": 2.6457104682922363, + "learning_rate": 9.789968066313358e-05, + "loss": 4.5255, + "step": 2480 + }, + { + "epoch": 0.16884087511890203, + "grad_norm": 0.21925875544548035, + "learning_rate": 9.789543416225031e-05, + "loss": 4.5955, + "step": 2485 + }, + { + "epoch": 0.1691805951895638, + "grad_norm": 0.3095509707927704, + "learning_rate": 9.789118766136704e-05, + "loss": 4.5427, + "step": 2490 + }, + { + "epoch": 0.16952031526022557, + "grad_norm": 1.3866817951202393, + "learning_rate": 9.788694116048377e-05, + "loss": 4.3407, + "step": 2495 + }, + { + "epoch": 0.16986003533088734, + "grad_norm": 0.31529414653778076, + "learning_rate": 9.78826946596005e-05, + "loss": 4.614, + "step": 2500 + }, + { + "epoch": 0.17019975540154914, + "grad_norm": 0.25377875566482544, + "learning_rate": 9.787844815871722e-05, + "loss": 4.5838, + "step": 2505 + }, + { + "epoch": 0.1705394754722109, + "grad_norm": 0.7861871123313904, + "learning_rate": 9.787420165783395e-05, + "loss": 4.5731, + "step": 2510 + }, + { + "epoch": 0.17087919554287267, + "grad_norm": 0.19743318855762482, + "learning_rate": 9.786995515695068e-05, + "loss": 4.3947, + "step": 2515 + }, + { + "epoch": 0.17121891561353444, + "grad_norm": 0.3416430950164795, + "learning_rate": 9.78657086560674e-05, + "loss": 4.5711, + "step": 2520 + }, + { + "epoch": 0.17155863568419621, + "grad_norm": 0.3679373562335968, + "learning_rate": 9.786146215518413e-05, + "loss": 4.6518, + "step": 2525 + }, + { + "epoch": 0.17189835575485798, + "grad_norm": 0.23833996057510376, + "learning_rate": 9.785721565430086e-05, + "loss": 4.4339, + "step": 2530 + }, + { + "epoch": 0.17223807582551978, + "grad_norm": 0.25589922070503235, + "learning_rate": 9.785296915341759e-05, + "loss": 4.6889, + "step": 2535 + }, + { + "epoch": 0.17257779589618155, + "grad_norm": 0.27489981055259705, + "learning_rate": 9.784872265253432e-05, + "loss": 4.3215, + "step": 2540 + }, + { + "epoch": 0.17291751596684332, + "grad_norm": 0.23039469122886658, + "learning_rate": 9.784447615165105e-05, + "loss": 4.4543, + "step": 2545 + }, + { + "epoch": 0.1732572360375051, + "grad_norm": 0.3405773341655731, + "learning_rate": 9.784022965076776e-05, + "loss": 4.5138, + "step": 2550 + }, + { + "epoch": 0.17359695610816686, + "grad_norm": 0.8154670000076294, + "learning_rate": 9.78359831498845e-05, + "loss": 4.4716, + "step": 2555 + }, + { + "epoch": 0.17393667617882866, + "grad_norm": 0.30465012788772583, + "learning_rate": 9.783173664900123e-05, + "loss": 4.2262, + "step": 2560 + }, + { + "epoch": 0.17427639624949043, + "grad_norm": 0.3995078504085541, + "learning_rate": 9.782749014811795e-05, + "loss": 4.5862, + "step": 2565 + }, + { + "epoch": 0.1746161163201522, + "grad_norm": 0.2636319398880005, + "learning_rate": 9.782324364723469e-05, + "loss": 4.6933, + "step": 2570 + }, + { + "epoch": 0.17495583639081397, + "grad_norm": 0.3614608943462372, + "learning_rate": 9.781899714635141e-05, + "loss": 4.6504, + "step": 2575 + }, + { + "epoch": 0.17529555646147574, + "grad_norm": 0.3470248878002167, + "learning_rate": 9.781475064546813e-05, + "loss": 4.7038, + "step": 2580 + }, + { + "epoch": 0.1756352765321375, + "grad_norm": 0.39428719878196716, + "learning_rate": 9.781050414458487e-05, + "loss": 4.2662, + "step": 2585 + }, + { + "epoch": 0.1759749966027993, + "grad_norm": 0.22955843806266785, + "learning_rate": 9.78062576437016e-05, + "loss": 4.4044, + "step": 2590 + }, + { + "epoch": 0.17631471667346107, + "grad_norm": 0.2899189293384552, + "learning_rate": 9.780201114281831e-05, + "loss": 4.4223, + "step": 2595 + }, + { + "epoch": 0.17665443674412284, + "grad_norm": 0.4230986535549164, + "learning_rate": 9.779776464193505e-05, + "loss": 4.1781, + "step": 2600 + }, + { + "epoch": 0.17699415681478461, + "grad_norm": 0.32788804173469543, + "learning_rate": 9.779351814105178e-05, + "loss": 4.616, + "step": 2605 + }, + { + "epoch": 0.17733387688544638, + "grad_norm": 0.2200581431388855, + "learning_rate": 9.77892716401685e-05, + "loss": 4.2604, + "step": 2610 + }, + { + "epoch": 0.17767359695610815, + "grad_norm": 0.30823394656181335, + "learning_rate": 9.778502513928524e-05, + "loss": 4.2978, + "step": 2615 + }, + { + "epoch": 0.17801331702676995, + "grad_norm": 0.22299472987651825, + "learning_rate": 9.778077863840195e-05, + "loss": 4.3587, + "step": 2620 + }, + { + "epoch": 0.17835303709743172, + "grad_norm": 0.22951941192150116, + "learning_rate": 9.777653213751868e-05, + "loss": 4.2573, + "step": 2625 + }, + { + "epoch": 0.1786927571680935, + "grad_norm": 0.35953882336616516, + "learning_rate": 9.777228563663542e-05, + "loss": 4.3515, + "step": 2630 + }, + { + "epoch": 0.17903247723875526, + "grad_norm": 0.4688868522644043, + "learning_rate": 9.776803913575214e-05, + "loss": 4.5713, + "step": 2635 + }, + { + "epoch": 0.17937219730941703, + "grad_norm": 0.21083256602287292, + "learning_rate": 9.776379263486888e-05, + "loss": 4.4969, + "step": 2640 + }, + { + "epoch": 0.17971191738007883, + "grad_norm": 0.36712825298309326, + "learning_rate": 9.775954613398561e-05, + "loss": 4.5619, + "step": 2645 + }, + { + "epoch": 0.1800516374507406, + "grad_norm": 0.2260504513978958, + "learning_rate": 9.775529963310232e-05, + "loss": 4.6722, + "step": 2650 + }, + { + "epoch": 0.18039135752140237, + "grad_norm": 0.36943840980529785, + "learning_rate": 9.775105313221906e-05, + "loss": 4.4934, + "step": 2655 + }, + { + "epoch": 0.18073107759206414, + "grad_norm": 0.4936888515949249, + "learning_rate": 9.774680663133579e-05, + "loss": 4.348, + "step": 2660 + }, + { + "epoch": 0.1810707976627259, + "grad_norm": 0.21958352625370026, + "learning_rate": 9.77425601304525e-05, + "loss": 4.4938, + "step": 2665 + }, + { + "epoch": 0.18141051773338768, + "grad_norm": 1.1148053407669067, + "learning_rate": 9.773831362956925e-05, + "loss": 4.2976, + "step": 2670 + }, + { + "epoch": 0.18175023780404947, + "grad_norm": 0.39846473932266235, + "learning_rate": 9.773406712868597e-05, + "loss": 4.3586, + "step": 2675 + }, + { + "epoch": 0.18208995787471124, + "grad_norm": 0.28287413716316223, + "learning_rate": 9.772982062780269e-05, + "loss": 4.4437, + "step": 2680 + }, + { + "epoch": 0.18242967794537301, + "grad_norm": 0.3402862846851349, + "learning_rate": 9.772557412691943e-05, + "loss": 4.5291, + "step": 2685 + }, + { + "epoch": 0.18276939801603478, + "grad_norm": 0.3358980715274811, + "learning_rate": 9.772132762603616e-05, + "loss": 4.3738, + "step": 2690 + }, + { + "epoch": 0.18310911808669655, + "grad_norm": 0.19017407298088074, + "learning_rate": 9.771708112515287e-05, + "loss": 4.1701, + "step": 2695 + }, + { + "epoch": 0.18344883815735832, + "grad_norm": 0.2291361540555954, + "learning_rate": 9.771283462426961e-05, + "loss": 4.6092, + "step": 2700 + }, + { + "epoch": 0.18378855822802012, + "grad_norm": 0.42033877968788147, + "learning_rate": 9.770858812338633e-05, + "loss": 4.3813, + "step": 2705 + }, + { + "epoch": 0.1841282782986819, + "grad_norm": 0.22784222662448883, + "learning_rate": 9.770434162250306e-05, + "loss": 4.4408, + "step": 2710 + }, + { + "epoch": 0.18446799836934366, + "grad_norm": 0.23395958542823792, + "learning_rate": 9.77000951216198e-05, + "loss": 4.4935, + "step": 2715 + }, + { + "epoch": 0.18480771844000543, + "grad_norm": 0.2610359191894531, + "learning_rate": 9.769584862073651e-05, + "loss": 4.4932, + "step": 2720 + }, + { + "epoch": 0.1851474385106672, + "grad_norm": 0.2646908164024353, + "learning_rate": 9.769160211985324e-05, + "loss": 4.4914, + "step": 2725 + }, + { + "epoch": 0.185487158581329, + "grad_norm": 0.31001701951026917, + "learning_rate": 9.768735561896998e-05, + "loss": 4.5656, + "step": 2730 + }, + { + "epoch": 0.18582687865199077, + "grad_norm": 0.3422091007232666, + "learning_rate": 9.76831091180867e-05, + "loss": 4.4946, + "step": 2735 + }, + { + "epoch": 0.18616659872265254, + "grad_norm": 0.4761231243610382, + "learning_rate": 9.767886261720343e-05, + "loss": 4.1494, + "step": 2740 + }, + { + "epoch": 0.1865063187933143, + "grad_norm": 0.23646193742752075, + "learning_rate": 9.767461611632017e-05, + "loss": 4.3254, + "step": 2745 + }, + { + "epoch": 0.18684603886397608, + "grad_norm": 1.6517447233200073, + "learning_rate": 9.767036961543688e-05, + "loss": 4.5333, + "step": 2750 + }, + { + "epoch": 0.18718575893463785, + "grad_norm": 0.2012016475200653, + "learning_rate": 9.766612311455361e-05, + "loss": 4.7069, + "step": 2755 + }, + { + "epoch": 0.18752547900529964, + "grad_norm": 0.20281845331192017, + "learning_rate": 9.766187661367035e-05, + "loss": 4.4399, + "step": 2760 + }, + { + "epoch": 0.18786519907596141, + "grad_norm": 0.1804925948381424, + "learning_rate": 9.765763011278707e-05, + "loss": 4.5354, + "step": 2765 + }, + { + "epoch": 0.18820491914662318, + "grad_norm": 0.4761740267276764, + "learning_rate": 9.76533836119038e-05, + "loss": 4.6633, + "step": 2770 + }, + { + "epoch": 0.18854463921728495, + "grad_norm": 0.22267234325408936, + "learning_rate": 9.764913711102052e-05, + "loss": 4.5686, + "step": 2775 + }, + { + "epoch": 0.18888435928794672, + "grad_norm": 0.5881355404853821, + "learning_rate": 9.764489061013725e-05, + "loss": 4.4554, + "step": 2780 + }, + { + "epoch": 0.1892240793586085, + "grad_norm": 0.43992605805397034, + "learning_rate": 9.764064410925398e-05, + "loss": 4.1684, + "step": 2785 + }, + { + "epoch": 0.1895637994292703, + "grad_norm": 0.21498017013072968, + "learning_rate": 9.76363976083707e-05, + "loss": 4.5047, + "step": 2790 + }, + { + "epoch": 0.18990351949993206, + "grad_norm": 0.37874165177345276, + "learning_rate": 9.763215110748743e-05, + "loss": 4.2255, + "step": 2795 + }, + { + "epoch": 0.19024323957059383, + "grad_norm": 0.2565677762031555, + "learning_rate": 9.762790460660416e-05, + "loss": 4.4333, + "step": 2800 + }, + { + "epoch": 0.1905829596412556, + "grad_norm": 0.2246963530778885, + "learning_rate": 9.762365810572089e-05, + "loss": 4.4198, + "step": 2805 + }, + { + "epoch": 0.19092267971191737, + "grad_norm": 0.946719229221344, + "learning_rate": 9.761941160483762e-05, + "loss": 3.9055, + "step": 2810 + }, + { + "epoch": 0.19126239978257917, + "grad_norm": 1.0544602870941162, + "learning_rate": 9.761516510395435e-05, + "loss": 4.4512, + "step": 2815 + }, + { + "epoch": 0.19160211985324094, + "grad_norm": 0.21298794448375702, + "learning_rate": 9.761091860307107e-05, + "loss": 4.4729, + "step": 2820 + }, + { + "epoch": 0.1919418399239027, + "grad_norm": 1.3822523355484009, + "learning_rate": 9.76066721021878e-05, + "loss": 4.3136, + "step": 2825 + }, + { + "epoch": 0.19228155999456448, + "grad_norm": 0.1828567236661911, + "learning_rate": 9.760242560130453e-05, + "loss": 4.5116, + "step": 2830 + }, + { + "epoch": 0.19262128006522625, + "grad_norm": 0.28580307960510254, + "learning_rate": 9.759817910042126e-05, + "loss": 4.4903, + "step": 2835 + }, + { + "epoch": 0.19296100013588802, + "grad_norm": 0.39433717727661133, + "learning_rate": 9.759393259953799e-05, + "loss": 4.4487, + "step": 2840 + }, + { + "epoch": 0.19330072020654981, + "grad_norm": 0.49140483140945435, + "learning_rate": 9.758968609865471e-05, + "loss": 4.4639, + "step": 2845 + }, + { + "epoch": 0.19364044027721158, + "grad_norm": 0.3383556306362152, + "learning_rate": 9.758543959777144e-05, + "loss": 4.3328, + "step": 2850 + }, + { + "epoch": 0.19398016034787335, + "grad_norm": 0.7367972135543823, + "learning_rate": 9.758119309688817e-05, + "loss": 4.5457, + "step": 2855 + }, + { + "epoch": 0.19431988041853512, + "grad_norm": 0.19852545857429504, + "learning_rate": 9.75769465960049e-05, + "loss": 4.2916, + "step": 2860 + }, + { + "epoch": 0.1946596004891969, + "grad_norm": 0.3379197120666504, + "learning_rate": 9.757270009512163e-05, + "loss": 4.4011, + "step": 2865 + }, + { + "epoch": 0.19499932055985866, + "grad_norm": 0.4577140212059021, + "learning_rate": 9.756845359423835e-05, + "loss": 4.2438, + "step": 2870 + }, + { + "epoch": 0.19533904063052046, + "grad_norm": 0.32074615359306335, + "learning_rate": 9.756420709335508e-05, + "loss": 4.3429, + "step": 2875 + }, + { + "epoch": 0.19567876070118223, + "grad_norm": 0.4993734359741211, + "learning_rate": 9.755996059247181e-05, + "loss": 4.6458, + "step": 2880 + }, + { + "epoch": 0.196018480771844, + "grad_norm": 0.21413934230804443, + "learning_rate": 9.755571409158854e-05, + "loss": 4.3236, + "step": 2885 + }, + { + "epoch": 0.19635820084250577, + "grad_norm": 0.39588046073913574, + "learning_rate": 9.755146759070527e-05, + "loss": 4.2725, + "step": 2890 + }, + { + "epoch": 0.19669792091316754, + "grad_norm": 0.23066450655460358, + "learning_rate": 9.7547221089822e-05, + "loss": 4.2981, + "step": 2895 + }, + { + "epoch": 0.19703764098382934, + "grad_norm": 0.24343866109848022, + "learning_rate": 9.754297458893872e-05, + "loss": 4.4485, + "step": 2900 + }, + { + "epoch": 0.1973773610544911, + "grad_norm": 0.2774411141872406, + "learning_rate": 9.753872808805544e-05, + "loss": 4.2737, + "step": 2905 + }, + { + "epoch": 0.19771708112515288, + "grad_norm": 0.3360697329044342, + "learning_rate": 9.753448158717218e-05, + "loss": 4.2094, + "step": 2910 + }, + { + "epoch": 0.19805680119581465, + "grad_norm": 0.3886429965496063, + "learning_rate": 9.75302350862889e-05, + "loss": 4.4864, + "step": 2915 + }, + { + "epoch": 0.19839652126647642, + "grad_norm": 0.5242161154747009, + "learning_rate": 9.752598858540562e-05, + "loss": 4.3877, + "step": 2920 + }, + { + "epoch": 0.1987362413371382, + "grad_norm": 0.2082594633102417, + "learning_rate": 9.752174208452236e-05, + "loss": 4.3006, + "step": 2925 + }, + { + "epoch": 0.19907596140779998, + "grad_norm": 1.1216654777526855, + "learning_rate": 9.751749558363909e-05, + "loss": 4.2916, + "step": 2930 + }, + { + "epoch": 0.19941568147846175, + "grad_norm": 0.1812744289636612, + "learning_rate": 9.75132490827558e-05, + "loss": 4.4246, + "step": 2935 + }, + { + "epoch": 0.19975540154912352, + "grad_norm": 0.316278874874115, + "learning_rate": 9.750900258187255e-05, + "loss": 4.2248, + "step": 2940 + }, + { + "epoch": 0.2000951216197853, + "grad_norm": 0.2795095443725586, + "learning_rate": 9.750475608098927e-05, + "loss": 4.4285, + "step": 2945 + }, + { + "epoch": 0.20043484169044706, + "grad_norm": 0.25871169567108154, + "learning_rate": 9.750050958010599e-05, + "loss": 4.3629, + "step": 2950 + }, + { + "epoch": 0.20077456176110883, + "grad_norm": 0.3203955888748169, + "learning_rate": 9.749626307922273e-05, + "loss": 4.8022, + "step": 2955 + }, + { + "epoch": 0.20111428183177063, + "grad_norm": 0.897880494594574, + "learning_rate": 9.749201657833946e-05, + "loss": 4.444, + "step": 2960 + }, + { + "epoch": 0.2014540019024324, + "grad_norm": 0.6095696687698364, + "learning_rate": 9.748777007745617e-05, + "loss": 4.3442, + "step": 2965 + }, + { + "epoch": 0.20179372197309417, + "grad_norm": 0.8089606761932373, + "learning_rate": 9.748352357657291e-05, + "loss": 4.3223, + "step": 2970 + }, + { + "epoch": 0.20213344204375594, + "grad_norm": 0.5481230616569519, + "learning_rate": 9.747927707568963e-05, + "loss": 4.3059, + "step": 2975 + }, + { + "epoch": 0.2024731621144177, + "grad_norm": 0.24502769112586975, + "learning_rate": 9.747503057480637e-05, + "loss": 4.2946, + "step": 2980 + }, + { + "epoch": 0.2028128821850795, + "grad_norm": 0.20267254114151, + "learning_rate": 9.74707840739231e-05, + "loss": 4.3506, + "step": 2985 + }, + { + "epoch": 0.20315260225574128, + "grad_norm": 1.4581079483032227, + "learning_rate": 9.746653757303981e-05, + "loss": 4.3024, + "step": 2990 + }, + { + "epoch": 0.20349232232640305, + "grad_norm": 0.3428595960140228, + "learning_rate": 9.746229107215655e-05, + "loss": 4.4748, + "step": 2995 + }, + { + "epoch": 0.20383204239706482, + "grad_norm": 0.3032056391239166, + "learning_rate": 9.745804457127328e-05, + "loss": 4.3262, + "step": 3000 + }, + { + "epoch": 0.2041717624677266, + "grad_norm": 0.32317832112312317, + "learning_rate": 9.745379807039e-05, + "loss": 4.475, + "step": 3005 + }, + { + "epoch": 0.20451148253838836, + "grad_norm": 0.21782419085502625, + "learning_rate": 9.744955156950674e-05, + "loss": 4.5021, + "step": 3010 + }, + { + "epoch": 0.20485120260905015, + "grad_norm": 0.17983724176883698, + "learning_rate": 9.744530506862347e-05, + "loss": 4.3826, + "step": 3015 + }, + { + "epoch": 0.20519092267971192, + "grad_norm": 0.3824704587459564, + "learning_rate": 9.744105856774018e-05, + "loss": 4.4883, + "step": 3020 + }, + { + "epoch": 0.2055306427503737, + "grad_norm": 0.6417528390884399, + "learning_rate": 9.743681206685692e-05, + "loss": 4.3623, + "step": 3025 + }, + { + "epoch": 0.20587036282103546, + "grad_norm": 0.31229910254478455, + "learning_rate": 9.743256556597365e-05, + "loss": 4.3362, + "step": 3030 + }, + { + "epoch": 0.20621008289169723, + "grad_norm": 0.35579913854599, + "learning_rate": 9.742831906509036e-05, + "loss": 4.3119, + "step": 3035 + }, + { + "epoch": 0.206549802962359, + "grad_norm": 0.21225492656230927, + "learning_rate": 9.74240725642071e-05, + "loss": 4.4159, + "step": 3040 + }, + { + "epoch": 0.2068895230330208, + "grad_norm": 0.5204954147338867, + "learning_rate": 9.741982606332382e-05, + "loss": 4.254, + "step": 3045 + }, + { + "epoch": 0.20722924310368257, + "grad_norm": 0.22649656236171722, + "learning_rate": 9.741557956244055e-05, + "loss": 4.3604, + "step": 3050 + }, + { + "epoch": 0.20756896317434434, + "grad_norm": 0.22533409297466278, + "learning_rate": 9.741133306155729e-05, + "loss": 4.1594, + "step": 3055 + }, + { + "epoch": 0.2079086832450061, + "grad_norm": 0.2681191563606262, + "learning_rate": 9.7407086560674e-05, + "loss": 4.553, + "step": 3060 + }, + { + "epoch": 0.20824840331566788, + "grad_norm": 1.2959145307540894, + "learning_rate": 9.740284005979073e-05, + "loss": 4.2071, + "step": 3065 + }, + { + "epoch": 0.20858812338632968, + "grad_norm": 0.21679522097110748, + "learning_rate": 9.739859355890747e-05, + "loss": 4.5038, + "step": 3070 + }, + { + "epoch": 0.20892784345699145, + "grad_norm": 0.36338409781455994, + "learning_rate": 9.739434705802419e-05, + "loss": 4.3356, + "step": 3075 + }, + { + "epoch": 0.20926756352765322, + "grad_norm": 0.2271890938282013, + "learning_rate": 9.739010055714092e-05, + "loss": 4.3573, + "step": 3080 + }, + { + "epoch": 0.209607283598315, + "grad_norm": 0.2753996253013611, + "learning_rate": 9.738585405625766e-05, + "loss": 4.4467, + "step": 3085 + }, + { + "epoch": 0.20994700366897676, + "grad_norm": 0.32643699645996094, + "learning_rate": 9.738160755537437e-05, + "loss": 4.2911, + "step": 3090 + }, + { + "epoch": 0.21028672373963853, + "grad_norm": 0.1794055551290512, + "learning_rate": 9.73773610544911e-05, + "loss": 4.3936, + "step": 3095 + }, + { + "epoch": 0.21062644381030032, + "grad_norm": 0.2121143937110901, + "learning_rate": 9.737311455360784e-05, + "loss": 4.4391, + "step": 3100 + }, + { + "epoch": 0.2109661638809621, + "grad_norm": 0.6584509015083313, + "learning_rate": 9.736886805272456e-05, + "loss": 4.3936, + "step": 3105 + }, + { + "epoch": 0.21130588395162386, + "grad_norm": 0.2863527834415436, + "learning_rate": 9.736462155184128e-05, + "loss": 4.4768, + "step": 3110 + }, + { + "epoch": 0.21164560402228563, + "grad_norm": 1.1741371154785156, + "learning_rate": 9.736037505095803e-05, + "loss": 4.3373, + "step": 3115 + }, + { + "epoch": 0.2119853240929474, + "grad_norm": 0.3653934597969055, + "learning_rate": 9.735612855007474e-05, + "loss": 4.6397, + "step": 3120 + }, + { + "epoch": 0.21232504416360917, + "grad_norm": 0.369391530752182, + "learning_rate": 9.735188204919147e-05, + "loss": 4.1315, + "step": 3125 + }, + { + "epoch": 0.21266476423427097, + "grad_norm": 0.22272364795207977, + "learning_rate": 9.73476355483082e-05, + "loss": 4.4174, + "step": 3130 + }, + { + "epoch": 0.21300448430493274, + "grad_norm": 0.26364120841026306, + "learning_rate": 9.734338904742492e-05, + "loss": 4.3034, + "step": 3135 + }, + { + "epoch": 0.2133442043755945, + "grad_norm": 0.2755309045314789, + "learning_rate": 9.733914254654165e-05, + "loss": 4.3209, + "step": 3140 + }, + { + "epoch": 0.21368392444625628, + "grad_norm": 0.27905556559562683, + "learning_rate": 9.733489604565838e-05, + "loss": 4.4046, + "step": 3145 + }, + { + "epoch": 0.21402364451691805, + "grad_norm": 0.25759658217430115, + "learning_rate": 9.733064954477511e-05, + "loss": 4.1602, + "step": 3150 + }, + { + "epoch": 0.21436336458757985, + "grad_norm": 1.0761340856552124, + "learning_rate": 9.732640304389184e-05, + "loss": 4.2794, + "step": 3155 + }, + { + "epoch": 0.21470308465824162, + "grad_norm": 0.18029484152793884, + "learning_rate": 9.732215654300856e-05, + "loss": 4.3554, + "step": 3160 + }, + { + "epoch": 0.2150428047289034, + "grad_norm": 0.373797208070755, + "learning_rate": 9.731791004212529e-05, + "loss": 4.3863, + "step": 3165 + }, + { + "epoch": 0.21538252479956516, + "grad_norm": 0.6202191710472107, + "learning_rate": 9.731366354124202e-05, + "loss": 4.2955, + "step": 3170 + }, + { + "epoch": 0.21572224487022693, + "grad_norm": 0.20301900804042816, + "learning_rate": 9.730941704035875e-05, + "loss": 4.2914, + "step": 3175 + }, + { + "epoch": 0.2160619649408887, + "grad_norm": 0.17571194469928741, + "learning_rate": 9.730517053947548e-05, + "loss": 4.276, + "step": 3180 + }, + { + "epoch": 0.2164016850115505, + "grad_norm": 0.3209381401538849, + "learning_rate": 9.73009240385922e-05, + "loss": 4.3957, + "step": 3185 + }, + { + "epoch": 0.21674140508221226, + "grad_norm": 0.2638840079307556, + "learning_rate": 9.729667753770893e-05, + "loss": 4.6224, + "step": 3190 + }, + { + "epoch": 0.21708112515287403, + "grad_norm": 0.19320239126682281, + "learning_rate": 9.729243103682566e-05, + "loss": 4.3039, + "step": 3195 + }, + { + "epoch": 0.2174208452235358, + "grad_norm": 0.43768310546875, + "learning_rate": 9.728818453594239e-05, + "loss": 4.3898, + "step": 3200 + }, + { + "epoch": 0.21776056529419757, + "grad_norm": 0.35756048560142517, + "learning_rate": 9.728393803505912e-05, + "loss": 4.5883, + "step": 3205 + }, + { + "epoch": 0.21810028536485934, + "grad_norm": 0.2380749136209488, + "learning_rate": 9.727969153417584e-05, + "loss": 4.4224, + "step": 3210 + }, + { + "epoch": 0.21844000543552114, + "grad_norm": 0.23136284947395325, + "learning_rate": 9.727544503329257e-05, + "loss": 4.3141, + "step": 3215 + }, + { + "epoch": 0.2187797255061829, + "grad_norm": 0.3109607398509979, + "learning_rate": 9.72711985324093e-05, + "loss": 4.295, + "step": 3220 + }, + { + "epoch": 0.21911944557684468, + "grad_norm": 0.4062863290309906, + "learning_rate": 9.726695203152603e-05, + "loss": 4.2421, + "step": 3225 + }, + { + "epoch": 0.21945916564750645, + "grad_norm": 0.20023144781589508, + "learning_rate": 9.726270553064276e-05, + "loss": 4.3144, + "step": 3230 + }, + { + "epoch": 0.21979888571816822, + "grad_norm": 0.8297600150108337, + "learning_rate": 9.725845902975948e-05, + "loss": 4.2986, + "step": 3235 + }, + { + "epoch": 0.22013860578883002, + "grad_norm": 0.6315371990203857, + "learning_rate": 9.725421252887621e-05, + "loss": 4.4641, + "step": 3240 + }, + { + "epoch": 0.22047832585949179, + "grad_norm": 0.2108875811100006, + "learning_rate": 9.724996602799293e-05, + "loss": 4.2764, + "step": 3245 + }, + { + "epoch": 0.22081804593015356, + "grad_norm": 0.20751313865184784, + "learning_rate": 9.724571952710967e-05, + "loss": 4.3553, + "step": 3250 + }, + { + "epoch": 0.22115776600081533, + "grad_norm": 0.24425362050533295, + "learning_rate": 9.72414730262264e-05, + "loss": 4.243, + "step": 3255 + }, + { + "epoch": 0.2214974860714771, + "grad_norm": 0.22244137525558472, + "learning_rate": 9.723722652534311e-05, + "loss": 4.2259, + "step": 3260 + }, + { + "epoch": 0.22183720614213887, + "grad_norm": 1.1119288206100464, + "learning_rate": 9.723298002445985e-05, + "loss": 4.322, + "step": 3265 + }, + { + "epoch": 0.22217692621280066, + "grad_norm": 0.3089415729045868, + "learning_rate": 9.722873352357658e-05, + "loss": 4.5175, + "step": 3270 + }, + { + "epoch": 0.22251664628346243, + "grad_norm": 0.2517615556716919, + "learning_rate": 9.72244870226933e-05, + "loss": 4.4571, + "step": 3275 + }, + { + "epoch": 0.2228563663541242, + "grad_norm": 0.17470265924930573, + "learning_rate": 9.722024052181004e-05, + "loss": 4.2085, + "step": 3280 + }, + { + "epoch": 0.22319608642478597, + "grad_norm": 0.22137637436389923, + "learning_rate": 9.721599402092676e-05, + "loss": 4.597, + "step": 3285 + }, + { + "epoch": 0.22353580649544774, + "grad_norm": 0.3537333309650421, + "learning_rate": 9.721174752004348e-05, + "loss": 4.4751, + "step": 3290 + }, + { + "epoch": 0.22387552656610954, + "grad_norm": 0.24241957068443298, + "learning_rate": 9.720750101916022e-05, + "loss": 4.0842, + "step": 3295 + }, + { + "epoch": 0.2242152466367713, + "grad_norm": 0.2881457805633545, + "learning_rate": 9.720325451827695e-05, + "loss": 4.1991, + "step": 3300 + }, + { + "epoch": 0.22455496670743308, + "grad_norm": 0.23095691204071045, + "learning_rate": 9.719900801739366e-05, + "loss": 4.4024, + "step": 3305 + }, + { + "epoch": 0.22489468677809485, + "grad_norm": 0.25291046500205994, + "learning_rate": 9.71947615165104e-05, + "loss": 4.1577, + "step": 3310 + }, + { + "epoch": 0.22523440684875662, + "grad_norm": 0.2241574227809906, + "learning_rate": 9.719051501562713e-05, + "loss": 4.2593, + "step": 3315 + }, + { + "epoch": 0.2255741269194184, + "grad_norm": 0.19019931554794312, + "learning_rate": 9.718626851474386e-05, + "loss": 4.1674, + "step": 3320 + }, + { + "epoch": 0.22591384699008019, + "grad_norm": 0.2490902543067932, + "learning_rate": 9.718202201386059e-05, + "loss": 4.201, + "step": 3325 + }, + { + "epoch": 0.22625356706074196, + "grad_norm": 0.2902776896953583, + "learning_rate": 9.71777755129773e-05, + "loss": 4.5505, + "step": 3330 + }, + { + "epoch": 0.22659328713140373, + "grad_norm": 0.22167052328586578, + "learning_rate": 9.717352901209404e-05, + "loss": 4.282, + "step": 3335 + }, + { + "epoch": 0.2269330072020655, + "grad_norm": 3.241713523864746, + "learning_rate": 9.716928251121077e-05, + "loss": 4.095, + "step": 3340 + }, + { + "epoch": 0.22727272727272727, + "grad_norm": 0.2534385323524475, + "learning_rate": 9.716503601032749e-05, + "loss": 4.325, + "step": 3345 + }, + { + "epoch": 0.22761244734338903, + "grad_norm": 0.2039516121149063, + "learning_rate": 9.716078950944423e-05, + "loss": 4.3627, + "step": 3350 + }, + { + "epoch": 0.22795216741405083, + "grad_norm": 0.20797346532344818, + "learning_rate": 9.715654300856096e-05, + "loss": 4.1518, + "step": 3355 + }, + { + "epoch": 0.2282918874847126, + "grad_norm": 0.2560058534145355, + "learning_rate": 9.715229650767767e-05, + "loss": 4.2377, + "step": 3360 + }, + { + "epoch": 0.22863160755537437, + "grad_norm": 0.20020583271980286, + "learning_rate": 9.714805000679441e-05, + "loss": 4.4088, + "step": 3365 + }, + { + "epoch": 0.22897132762603614, + "grad_norm": 0.32701200246810913, + "learning_rate": 9.714380350591114e-05, + "loss": 4.4603, + "step": 3370 + }, + { + "epoch": 0.2293110476966979, + "grad_norm": 0.16908589005470276, + "learning_rate": 9.713955700502786e-05, + "loss": 4.4159, + "step": 3375 + }, + { + "epoch": 0.2296507677673597, + "grad_norm": 10.86708927154541, + "learning_rate": 9.71353105041446e-05, + "loss": 4.373, + "step": 3380 + }, + { + "epoch": 0.22999048783802148, + "grad_norm": 0.2162582278251648, + "learning_rate": 9.713106400326132e-05, + "loss": 4.3303, + "step": 3385 + }, + { + "epoch": 0.23033020790868325, + "grad_norm": 0.1772332489490509, + "learning_rate": 9.712681750237804e-05, + "loss": 4.2396, + "step": 3390 + }, + { + "epoch": 0.23066992797934502, + "grad_norm": 0.36134952306747437, + "learning_rate": 9.712257100149478e-05, + "loss": 4.3838, + "step": 3395 + }, + { + "epoch": 0.2310096480500068, + "grad_norm": 0.32894113659858704, + "learning_rate": 9.71183245006115e-05, + "loss": 4.177, + "step": 3400 + }, + { + "epoch": 0.23134936812066856, + "grad_norm": 0.2267148792743683, + "learning_rate": 9.711407799972822e-05, + "loss": 4.3994, + "step": 3405 + }, + { + "epoch": 0.23168908819133036, + "grad_norm": 0.22997945547103882, + "learning_rate": 9.710983149884496e-05, + "loss": 4.2585, + "step": 3410 + }, + { + "epoch": 0.23202880826199213, + "grad_norm": 0.21913081407546997, + "learning_rate": 9.710558499796168e-05, + "loss": 4.495, + "step": 3415 + }, + { + "epoch": 0.2323685283326539, + "grad_norm": 0.2355417013168335, + "learning_rate": 9.710133849707841e-05, + "loss": 4.4619, + "step": 3420 + }, + { + "epoch": 0.23270824840331567, + "grad_norm": 0.29134589433670044, + "learning_rate": 9.709709199619515e-05, + "loss": 4.3438, + "step": 3425 + }, + { + "epoch": 0.23304796847397743, + "grad_norm": 0.4645059108734131, + "learning_rate": 9.709284549531186e-05, + "loss": 4.467, + "step": 3430 + }, + { + "epoch": 0.2333876885446392, + "grad_norm": 0.3466382324695587, + "learning_rate": 9.708859899442859e-05, + "loss": 4.4636, + "step": 3435 + }, + { + "epoch": 0.233727408615301, + "grad_norm": 0.2788010835647583, + "learning_rate": 9.708435249354533e-05, + "loss": 4.1318, + "step": 3440 + }, + { + "epoch": 0.23406712868596277, + "grad_norm": 0.4784042537212372, + "learning_rate": 9.708010599266205e-05, + "loss": 4.2089, + "step": 3445 + }, + { + "epoch": 0.23440684875662454, + "grad_norm": 0.45934122800827026, + "learning_rate": 9.707585949177878e-05, + "loss": 4.3023, + "step": 3450 + }, + { + "epoch": 0.2347465688272863, + "grad_norm": 0.25707322359085083, + "learning_rate": 9.707161299089552e-05, + "loss": 4.2901, + "step": 3455 + }, + { + "epoch": 0.23508628889794808, + "grad_norm": 0.4797256290912628, + "learning_rate": 9.706736649001223e-05, + "loss": 4.2574, + "step": 3460 + }, + { + "epoch": 0.23542600896860988, + "grad_norm": 0.2368171215057373, + "learning_rate": 9.706311998912896e-05, + "loss": 4.3998, + "step": 3465 + }, + { + "epoch": 0.23576572903927165, + "grad_norm": 1.7958965301513672, + "learning_rate": 9.705887348824569e-05, + "loss": 4.212, + "step": 3470 + }, + { + "epoch": 0.23610544910993342, + "grad_norm": 0.24695445597171783, + "learning_rate": 9.705462698736242e-05, + "loss": 4.3325, + "step": 3475 + }, + { + "epoch": 0.2364451691805952, + "grad_norm": 0.24877724051475525, + "learning_rate": 9.705038048647914e-05, + "loss": 4.4167, + "step": 3480 + }, + { + "epoch": 0.23678488925125696, + "grad_norm": 0.2147648185491562, + "learning_rate": 9.704613398559587e-05, + "loss": 4.3511, + "step": 3485 + }, + { + "epoch": 0.23712460932191873, + "grad_norm": 0.38735896348953247, + "learning_rate": 9.70418874847126e-05, + "loss": 4.054, + "step": 3490 + }, + { + "epoch": 0.23746432939258053, + "grad_norm": 0.28407546877861023, + "learning_rate": 9.703764098382933e-05, + "loss": 4.1299, + "step": 3495 + }, + { + "epoch": 0.2378040494632423, + "grad_norm": 0.4963781237602234, + "learning_rate": 9.703339448294606e-05, + "loss": 4.3751, + "step": 3500 + }, + { + "epoch": 0.23814376953390406, + "grad_norm": 0.2363215535879135, + "learning_rate": 9.702914798206278e-05, + "loss": 4.3631, + "step": 3505 + }, + { + "epoch": 0.23848348960456583, + "grad_norm": 0.2752895951271057, + "learning_rate": 9.702490148117951e-05, + "loss": 4.3192, + "step": 3510 + }, + { + "epoch": 0.2388232096752276, + "grad_norm": 0.2011261135339737, + "learning_rate": 9.702065498029624e-05, + "loss": 4.3019, + "step": 3515 + }, + { + "epoch": 0.23916292974588937, + "grad_norm": 0.30605676770210266, + "learning_rate": 9.701640847941297e-05, + "loss": 4.2733, + "step": 3520 + }, + { + "epoch": 0.23950264981655117, + "grad_norm": 0.23777063190937042, + "learning_rate": 9.70121619785297e-05, + "loss": 4.4391, + "step": 3525 + }, + { + "epoch": 0.23984236988721294, + "grad_norm": 0.19578081369400024, + "learning_rate": 9.700791547764642e-05, + "loss": 4.3464, + "step": 3530 + }, + { + "epoch": 0.2401820899578747, + "grad_norm": 0.43479400873184204, + "learning_rate": 9.700366897676315e-05, + "loss": 4.1509, + "step": 3535 + }, + { + "epoch": 0.24052181002853648, + "grad_norm": 0.23320983350276947, + "learning_rate": 9.699942247587988e-05, + "loss": 4.2031, + "step": 3540 + }, + { + "epoch": 0.24086153009919825, + "grad_norm": 0.395224004983902, + "learning_rate": 9.699517597499661e-05, + "loss": 4.3575, + "step": 3545 + }, + { + "epoch": 0.24120125016986005, + "grad_norm": 0.22553794085979462, + "learning_rate": 9.699092947411334e-05, + "loss": 4.4234, + "step": 3550 + }, + { + "epoch": 0.24154097024052182, + "grad_norm": 0.21396693587303162, + "learning_rate": 9.698668297323006e-05, + "loss": 4.3933, + "step": 3555 + }, + { + "epoch": 0.2418806903111836, + "grad_norm": 0.2883829176425934, + "learning_rate": 9.698243647234679e-05, + "loss": 4.1656, + "step": 3560 + }, + { + "epoch": 0.24222041038184536, + "grad_norm": 0.3761749267578125, + "learning_rate": 9.697818997146352e-05, + "loss": 4.254, + "step": 3565 + }, + { + "epoch": 0.24256013045250713, + "grad_norm": 0.2654179036617279, + "learning_rate": 9.697394347058025e-05, + "loss": 4.0754, + "step": 3570 + }, + { + "epoch": 0.2428998505231689, + "grad_norm": 0.20404711365699768, + "learning_rate": 9.696969696969698e-05, + "loss": 4.4404, + "step": 3575 + }, + { + "epoch": 0.2432395705938307, + "grad_norm": 0.661677360534668, + "learning_rate": 9.69654504688137e-05, + "loss": 4.4084, + "step": 3580 + }, + { + "epoch": 0.24357929066449246, + "grad_norm": 0.19168756902217865, + "learning_rate": 9.696120396793043e-05, + "loss": 4.3991, + "step": 3585 + }, + { + "epoch": 0.24391901073515423, + "grad_norm": 0.21689128875732422, + "learning_rate": 9.695695746704716e-05, + "loss": 4.2182, + "step": 3590 + }, + { + "epoch": 0.244258730805816, + "grad_norm": 0.1910148561000824, + "learning_rate": 9.695271096616389e-05, + "loss": 4.2426, + "step": 3595 + }, + { + "epoch": 0.24459845087647777, + "grad_norm": 0.463371604681015, + "learning_rate": 9.69484644652806e-05, + "loss": 4.1501, + "step": 3600 + }, + { + "epoch": 0.24493817094713954, + "grad_norm": 0.2187051922082901, + "learning_rate": 9.694421796439734e-05, + "loss": 4.2383, + "step": 3605 + }, + { + "epoch": 0.24527789101780134, + "grad_norm": 0.7701082229614258, + "learning_rate": 9.693997146351407e-05, + "loss": 4.2811, + "step": 3610 + }, + { + "epoch": 0.2456176110884631, + "grad_norm": 0.2454994171857834, + "learning_rate": 9.693572496263079e-05, + "loss": 4.3994, + "step": 3615 + }, + { + "epoch": 0.24595733115912488, + "grad_norm": 0.22179093956947327, + "learning_rate": 9.693147846174753e-05, + "loss": 4.1261, + "step": 3620 + }, + { + "epoch": 0.24629705122978665, + "grad_norm": 0.23975835740566254, + "learning_rate": 9.692723196086426e-05, + "loss": 4.308, + "step": 3625 + }, + { + "epoch": 0.24663677130044842, + "grad_norm": 0.21660096943378448, + "learning_rate": 9.692298545998097e-05, + "loss": 4.2105, + "step": 3630 + }, + { + "epoch": 0.24697649137111022, + "grad_norm": 0.22534438967704773, + "learning_rate": 9.691873895909771e-05, + "loss": 4.2923, + "step": 3635 + }, + { + "epoch": 0.247316211441772, + "grad_norm": 0.19649091362953186, + "learning_rate": 9.691449245821444e-05, + "loss": 4.4036, + "step": 3640 + }, + { + "epoch": 0.24765593151243376, + "grad_norm": 0.32042601704597473, + "learning_rate": 9.691024595733115e-05, + "loss": 4.4266, + "step": 3645 + }, + { + "epoch": 0.24799565158309553, + "grad_norm": 0.6859878301620483, + "learning_rate": 9.69059994564479e-05, + "loss": 4.2232, + "step": 3650 + }, + { + "epoch": 0.2483353716537573, + "grad_norm": 0.23352079093456268, + "learning_rate": 9.690175295556462e-05, + "loss": 4.0696, + "step": 3655 + }, + { + "epoch": 0.24867509172441907, + "grad_norm": 0.272712767124176, + "learning_rate": 9.689750645468135e-05, + "loss": 4.1759, + "step": 3660 + }, + { + "epoch": 0.24901481179508086, + "grad_norm": 0.22009974718093872, + "learning_rate": 9.689325995379808e-05, + "loss": 4.1973, + "step": 3665 + }, + { + "epoch": 0.24935453186574263, + "grad_norm": 1.4543390274047852, + "learning_rate": 9.68890134529148e-05, + "loss": 4.314, + "step": 3670 + }, + { + "epoch": 0.2496942519364044, + "grad_norm": 0.3941098153591156, + "learning_rate": 9.688476695203154e-05, + "loss": 4.4651, + "step": 3675 + }, + { + "epoch": 0.2500339720070662, + "grad_norm": 0.28159454464912415, + "learning_rate": 9.688052045114826e-05, + "loss": 4.2579, + "step": 3680 + }, + { + "epoch": 0.25037369207772797, + "grad_norm": 0.22340060770511627, + "learning_rate": 9.687627395026498e-05, + "loss": 4.2427, + "step": 3685 + }, + { + "epoch": 0.2507134121483897, + "grad_norm": 0.24438177049160004, + "learning_rate": 9.687202744938172e-05, + "loss": 4.1173, + "step": 3690 + }, + { + "epoch": 0.2510531322190515, + "grad_norm": 0.19045932590961456, + "learning_rate": 9.686778094849845e-05, + "loss": 4.3531, + "step": 3695 + }, + { + "epoch": 0.25139285228971325, + "grad_norm": 0.21072247624397278, + "learning_rate": 9.686353444761516e-05, + "loss": 4.3211, + "step": 3700 + }, + { + "epoch": 0.25173257236037505, + "grad_norm": 0.20157082378864288, + "learning_rate": 9.68592879467319e-05, + "loss": 4.3939, + "step": 3705 + }, + { + "epoch": 0.25207229243103685, + "grad_norm": 0.23919062316417694, + "learning_rate": 9.685504144584863e-05, + "loss": 4.2216, + "step": 3710 + }, + { + "epoch": 0.2524120125016986, + "grad_norm": 0.3379192352294922, + "learning_rate": 9.685079494496535e-05, + "loss": 4.15, + "step": 3715 + }, + { + "epoch": 0.2527517325723604, + "grad_norm": 0.2691631615161896, + "learning_rate": 9.684654844408209e-05, + "loss": 4.178, + "step": 3720 + }, + { + "epoch": 0.25309145264302213, + "grad_norm": 0.2460995614528656, + "learning_rate": 9.684230194319882e-05, + "loss": 4.1714, + "step": 3725 + }, + { + "epoch": 0.2534311727136839, + "grad_norm": 0.24896664917469025, + "learning_rate": 9.683805544231553e-05, + "loss": 4.5137, + "step": 3730 + }, + { + "epoch": 0.2537708927843457, + "grad_norm": 0.2998896837234497, + "learning_rate": 9.683380894143227e-05, + "loss": 4.2024, + "step": 3735 + }, + { + "epoch": 0.25411061285500747, + "grad_norm": 0.2170042097568512, + "learning_rate": 9.6829562440549e-05, + "loss": 4.2967, + "step": 3740 + }, + { + "epoch": 0.25445033292566926, + "grad_norm": 0.2534918487071991, + "learning_rate": 9.682531593966571e-05, + "loss": 4.1857, + "step": 3745 + }, + { + "epoch": 0.254790052996331, + "grad_norm": 0.2045327126979828, + "learning_rate": 9.682106943878246e-05, + "loss": 3.9961, + "step": 3750 + }, + { + "epoch": 0.2551297730669928, + "grad_norm": 0.23638245463371277, + "learning_rate": 9.681682293789917e-05, + "loss": 4.3547, + "step": 3755 + }, + { + "epoch": 0.25546949313765455, + "grad_norm": 0.22549360990524292, + "learning_rate": 9.68125764370159e-05, + "loss": 4.1726, + "step": 3760 + }, + { + "epoch": 0.25580921320831634, + "grad_norm": 0.24715493619441986, + "learning_rate": 9.680832993613264e-05, + "loss": 4.2836, + "step": 3765 + }, + { + "epoch": 0.25614893327897814, + "grad_norm": 0.33308762311935425, + "learning_rate": 9.680408343524935e-05, + "loss": 4.2667, + "step": 3770 + }, + { + "epoch": 0.2564886533496399, + "grad_norm": 0.4240279197692871, + "learning_rate": 9.679983693436608e-05, + "loss": 4.2174, + "step": 3775 + }, + { + "epoch": 0.2568283734203017, + "grad_norm": 0.26198095083236694, + "learning_rate": 9.679559043348282e-05, + "loss": 4.33, + "step": 3780 + }, + { + "epoch": 0.2571680934909634, + "grad_norm": 0.21898075938224792, + "learning_rate": 9.679134393259954e-05, + "loss": 4.2754, + "step": 3785 + }, + { + "epoch": 0.2575078135616252, + "grad_norm": 0.1936497837305069, + "learning_rate": 9.678709743171627e-05, + "loss": 4.2178, + "step": 3790 + }, + { + "epoch": 0.257847533632287, + "grad_norm": 0.3042401075363159, + "learning_rate": 9.678285093083301e-05, + "loss": 4.2444, + "step": 3795 + }, + { + "epoch": 0.25818725370294876, + "grad_norm": 0.22089192271232605, + "learning_rate": 9.677860442994972e-05, + "loss": 4.1209, + "step": 3800 + }, + { + "epoch": 0.25852697377361056, + "grad_norm": 0.26595672965049744, + "learning_rate": 9.677435792906645e-05, + "loss": 4.3664, + "step": 3805 + }, + { + "epoch": 0.2588666938442723, + "grad_norm": 0.38972827792167664, + "learning_rate": 9.677011142818319e-05, + "loss": 4.3291, + "step": 3810 + }, + { + "epoch": 0.2592064139149341, + "grad_norm": 0.8308687210083008, + "learning_rate": 9.67658649272999e-05, + "loss": 4.3588, + "step": 3815 + }, + { + "epoch": 0.2595461339855959, + "grad_norm": 0.29095426201820374, + "learning_rate": 9.676161842641663e-05, + "loss": 4.2095, + "step": 3820 + }, + { + "epoch": 0.25988585405625764, + "grad_norm": 0.6666823625564575, + "learning_rate": 9.675737192553336e-05, + "loss": 4.249, + "step": 3825 + }, + { + "epoch": 0.26022557412691943, + "grad_norm": 0.2800503373146057, + "learning_rate": 9.675312542465009e-05, + "loss": 4.0724, + "step": 3830 + }, + { + "epoch": 0.2605652941975812, + "grad_norm": 0.31251639127731323, + "learning_rate": 9.674887892376682e-05, + "loss": 4.2177, + "step": 3835 + }, + { + "epoch": 0.260905014268243, + "grad_norm": 0.19203290343284607, + "learning_rate": 9.674463242288355e-05, + "loss": 4.1164, + "step": 3840 + }, + { + "epoch": 0.2612447343389047, + "grad_norm": 0.21506604552268982, + "learning_rate": 9.674038592200027e-05, + "loss": 4.3448, + "step": 3845 + }, + { + "epoch": 0.2615844544095665, + "grad_norm": 0.2286742478609085, + "learning_rate": 9.6736139421117e-05, + "loss": 4.3154, + "step": 3850 + }, + { + "epoch": 0.2619241744802283, + "grad_norm": 0.22341595590114594, + "learning_rate": 9.673189292023373e-05, + "loss": 4.0496, + "step": 3855 + }, + { + "epoch": 0.26226389455089005, + "grad_norm": 3.2631723880767822, + "learning_rate": 9.672764641935046e-05, + "loss": 4.2739, + "step": 3860 + }, + { + "epoch": 0.26260361462155185, + "grad_norm": 0.21692293882369995, + "learning_rate": 9.672339991846719e-05, + "loss": 4.2239, + "step": 3865 + }, + { + "epoch": 0.2629433346922136, + "grad_norm": 0.24772769212722778, + "learning_rate": 9.671915341758391e-05, + "loss": 4.0354, + "step": 3870 + }, + { + "epoch": 0.2632830547628754, + "grad_norm": 0.2190844863653183, + "learning_rate": 9.671490691670064e-05, + "loss": 4.2554, + "step": 3875 + }, + { + "epoch": 0.2636227748335372, + "grad_norm": 0.19608178734779358, + "learning_rate": 9.671066041581737e-05, + "loss": 4.2199, + "step": 3880 + }, + { + "epoch": 0.26396249490419893, + "grad_norm": 0.22313562035560608, + "learning_rate": 9.67064139149341e-05, + "loss": 4.1976, + "step": 3885 + }, + { + "epoch": 0.2643022149748607, + "grad_norm": 0.25129613280296326, + "learning_rate": 9.670216741405083e-05, + "loss": 4.2595, + "step": 3890 + }, + { + "epoch": 0.26464193504552247, + "grad_norm": 0.19212405383586884, + "learning_rate": 9.669792091316755e-05, + "loss": 4.3704, + "step": 3895 + }, + { + "epoch": 0.26498165511618427, + "grad_norm": 0.21401169896125793, + "learning_rate": 9.669367441228428e-05, + "loss": 4.1088, + "step": 3900 + }, + { + "epoch": 0.26532137518684606, + "grad_norm": 0.2625492811203003, + "learning_rate": 9.668942791140101e-05, + "loss": 4.4765, + "step": 3905 + }, + { + "epoch": 0.2656610952575078, + "grad_norm": 0.23690305650234222, + "learning_rate": 9.668518141051774e-05, + "loss": 4.1193, + "step": 3910 + }, + { + "epoch": 0.2660008153281696, + "grad_norm": 0.2038702368736267, + "learning_rate": 9.668093490963447e-05, + "loss": 4.4361, + "step": 3915 + }, + { + "epoch": 0.26634053539883135, + "grad_norm": 1.9976972341537476, + "learning_rate": 9.66766884087512e-05, + "loss": 4.448, + "step": 3920 + }, + { + "epoch": 0.26668025546949314, + "grad_norm": 0.2619224488735199, + "learning_rate": 9.667244190786792e-05, + "loss": 4.3061, + "step": 3925 + }, + { + "epoch": 0.2670199755401549, + "grad_norm": 0.17488695681095123, + "learning_rate": 9.666819540698465e-05, + "loss": 3.924, + "step": 3930 + }, + { + "epoch": 0.2673596956108167, + "grad_norm": 0.3555572032928467, + "learning_rate": 9.666394890610138e-05, + "loss": 4.4889, + "step": 3935 + }, + { + "epoch": 0.2676994156814785, + "grad_norm": 0.18651026487350464, + "learning_rate": 9.66597024052181e-05, + "loss": 4.2601, + "step": 3940 + }, + { + "epoch": 0.2680391357521402, + "grad_norm": 0.4118260145187378, + "learning_rate": 9.665545590433483e-05, + "loss": 4.0048, + "step": 3945 + }, + { + "epoch": 0.268378855822802, + "grad_norm": 0.21420472860336304, + "learning_rate": 9.665120940345156e-05, + "loss": 4.5124, + "step": 3950 + }, + { + "epoch": 0.26871857589346376, + "grad_norm": 0.25867247581481934, + "learning_rate": 9.664696290256828e-05, + "loss": 3.9366, + "step": 3955 + }, + { + "epoch": 0.26905829596412556, + "grad_norm": 0.9560242295265198, + "learning_rate": 9.664271640168502e-05, + "loss": 4.273, + "step": 3960 + }, + { + "epoch": 0.26939801603478736, + "grad_norm": 0.22547510266304016, + "learning_rate": 9.663846990080175e-05, + "loss": 4.4065, + "step": 3965 + }, + { + "epoch": 0.2697377361054491, + "grad_norm": 0.4761745035648346, + "learning_rate": 9.663422339991846e-05, + "loss": 4.0622, + "step": 3970 + }, + { + "epoch": 0.2700774561761109, + "grad_norm": 0.26078933477401733, + "learning_rate": 9.66299768990352e-05, + "loss": 4.4622, + "step": 3975 + }, + { + "epoch": 0.27041717624677264, + "grad_norm": 0.21970224380493164, + "learning_rate": 9.662573039815193e-05, + "loss": 4.2412, + "step": 3980 + }, + { + "epoch": 0.27075689631743444, + "grad_norm": 0.5211921334266663, + "learning_rate": 9.662148389726865e-05, + "loss": 4.3848, + "step": 3985 + }, + { + "epoch": 0.27109661638809623, + "grad_norm": 0.31244269013404846, + "learning_rate": 9.661723739638539e-05, + "loss": 4.3852, + "step": 3990 + }, + { + "epoch": 0.271436336458758, + "grad_norm": 0.29353293776512146, + "learning_rate": 9.661299089550211e-05, + "loss": 4.091, + "step": 3995 + }, + { + "epoch": 0.2717760565294198, + "grad_norm": 0.23753587901592255, + "learning_rate": 9.660874439461884e-05, + "loss": 4.3828, + "step": 4000 + }, + { + "epoch": 0.2721157766000815, + "grad_norm": 0.2865026593208313, + "learning_rate": 9.660449789373557e-05, + "loss": 4.2366, + "step": 4005 + }, + { + "epoch": 0.2724554966707433, + "grad_norm": 0.32267966866493225, + "learning_rate": 9.66002513928523e-05, + "loss": 4.1455, + "step": 4010 + }, + { + "epoch": 0.27279521674140506, + "grad_norm": 0.25711655616760254, + "learning_rate": 9.659600489196903e-05, + "loss": 4.4231, + "step": 4015 + }, + { + "epoch": 0.27313493681206685, + "grad_norm": 0.21570606529712677, + "learning_rate": 9.659175839108575e-05, + "loss": 4.2092, + "step": 4020 + }, + { + "epoch": 0.27347465688272865, + "grad_norm": 0.22739212214946747, + "learning_rate": 9.658751189020247e-05, + "loss": 4.2107, + "step": 4025 + }, + { + "epoch": 0.2738143769533904, + "grad_norm": 0.22698377072811127, + "learning_rate": 9.658326538931921e-05, + "loss": 3.965, + "step": 4030 + }, + { + "epoch": 0.2741540970240522, + "grad_norm": 0.2108132541179657, + "learning_rate": 9.657901888843594e-05, + "loss": 4.3312, + "step": 4035 + }, + { + "epoch": 0.27449381709471393, + "grad_norm": 0.3985457122325897, + "learning_rate": 9.657477238755265e-05, + "loss": 4.1752, + "step": 4040 + }, + { + "epoch": 0.27483353716537573, + "grad_norm": 0.2395816147327423, + "learning_rate": 9.65705258866694e-05, + "loss": 4.6349, + "step": 4045 + }, + { + "epoch": 0.2751732572360375, + "grad_norm": 0.24473239481449127, + "learning_rate": 9.656627938578612e-05, + "loss": 4.4361, + "step": 4050 + }, + { + "epoch": 0.27551297730669927, + "grad_norm": 0.3756929636001587, + "learning_rate": 9.656203288490284e-05, + "loss": 4.2976, + "step": 4055 + }, + { + "epoch": 0.27585269737736107, + "grad_norm": 0.2284708023071289, + "learning_rate": 9.655778638401958e-05, + "loss": 4.2554, + "step": 4060 + }, + { + "epoch": 0.2761924174480228, + "grad_norm": 0.26325544714927673, + "learning_rate": 9.655353988313631e-05, + "loss": 4.152, + "step": 4065 + }, + { + "epoch": 0.2765321375186846, + "grad_norm": 0.20146878063678741, + "learning_rate": 9.654929338225302e-05, + "loss": 4.1147, + "step": 4070 + }, + { + "epoch": 0.2768718575893464, + "grad_norm": 0.21572470664978027, + "learning_rate": 9.654504688136976e-05, + "loss": 4.2792, + "step": 4075 + }, + { + "epoch": 0.27721157766000815, + "grad_norm": 0.32967862486839294, + "learning_rate": 9.654080038048649e-05, + "loss": 4.1379, + "step": 4080 + }, + { + "epoch": 0.27755129773066994, + "grad_norm": 0.23224560916423798, + "learning_rate": 9.65365538796032e-05, + "loss": 4.0244, + "step": 4085 + }, + { + "epoch": 0.2778910178013317, + "grad_norm": 0.2101649045944214, + "learning_rate": 9.653230737871995e-05, + "loss": 4.2301, + "step": 4090 + }, + { + "epoch": 0.2782307378719935, + "grad_norm": 0.22994142770767212, + "learning_rate": 9.652806087783666e-05, + "loss": 4.1505, + "step": 4095 + }, + { + "epoch": 0.2785704579426552, + "grad_norm": 0.23458316922187805, + "learning_rate": 9.652381437695339e-05, + "loss": 4.2603, + "step": 4100 + }, + { + "epoch": 0.278910178013317, + "grad_norm": 1.6248669624328613, + "learning_rate": 9.651956787607013e-05, + "loss": 4.1157, + "step": 4105 + }, + { + "epoch": 0.2792498980839788, + "grad_norm": 0.49504292011260986, + "learning_rate": 9.651532137518685e-05, + "loss": 4.2854, + "step": 4110 + }, + { + "epoch": 0.27958961815464056, + "grad_norm": 0.2064412534236908, + "learning_rate": 9.651107487430357e-05, + "loss": 4.2062, + "step": 4115 + }, + { + "epoch": 0.27992933822530236, + "grad_norm": 0.26098746061325073, + "learning_rate": 9.650682837342031e-05, + "loss": 4.1904, + "step": 4120 + }, + { + "epoch": 0.2802690582959641, + "grad_norm": 1.5820512771606445, + "learning_rate": 9.650258187253703e-05, + "loss": 4.0721, + "step": 4125 + }, + { + "epoch": 0.2806087783666259, + "grad_norm": 0.28080296516418457, + "learning_rate": 9.649833537165376e-05, + "loss": 4.1324, + "step": 4130 + }, + { + "epoch": 0.2809484984372877, + "grad_norm": 0.186203733086586, + "learning_rate": 9.64940888707705e-05, + "loss": 4.0794, + "step": 4135 + }, + { + "epoch": 0.28128821850794944, + "grad_norm": 0.3637949824333191, + "learning_rate": 9.648984236988721e-05, + "loss": 4.135, + "step": 4140 + }, + { + "epoch": 0.28162793857861124, + "grad_norm": 0.24603451788425446, + "learning_rate": 9.648559586900394e-05, + "loss": 4.3026, + "step": 4145 + }, + { + "epoch": 0.281967658649273, + "grad_norm": 0.21384142339229584, + "learning_rate": 9.648134936812068e-05, + "loss": 4.5439, + "step": 4150 + }, + { + "epoch": 0.2823073787199348, + "grad_norm": 0.23172922432422638, + "learning_rate": 9.64771028672374e-05, + "loss": 4.2762, + "step": 4155 + }, + { + "epoch": 0.2826470987905966, + "grad_norm": 0.22065469622612, + "learning_rate": 9.647285636635413e-05, + "loss": 4.3842, + "step": 4160 + }, + { + "epoch": 0.2829868188612583, + "grad_norm": 3.40319561958313, + "learning_rate": 9.646860986547087e-05, + "loss": 4.1898, + "step": 4165 + }, + { + "epoch": 0.2833265389319201, + "grad_norm": 0.20233049988746643, + "learning_rate": 9.646436336458758e-05, + "loss": 4.0764, + "step": 4170 + }, + { + "epoch": 0.28366625900258186, + "grad_norm": 0.586283802986145, + "learning_rate": 9.646011686370431e-05, + "loss": 4.314, + "step": 4175 + }, + { + "epoch": 0.28400597907324365, + "grad_norm": 2.9164395332336426, + "learning_rate": 9.645587036282104e-05, + "loss": 4.0146, + "step": 4180 + }, + { + "epoch": 0.2843456991439054, + "grad_norm": 0.28996986150741577, + "learning_rate": 9.645162386193777e-05, + "loss": 4.2867, + "step": 4185 + }, + { + "epoch": 0.2846854192145672, + "grad_norm": 0.5857749581336975, + "learning_rate": 9.64473773610545e-05, + "loss": 3.9711, + "step": 4190 + }, + { + "epoch": 0.285025139285229, + "grad_norm": 0.2920601963996887, + "learning_rate": 9.644313086017122e-05, + "loss": 4.2076, + "step": 4195 + }, + { + "epoch": 0.28536485935589073, + "grad_norm": 0.5181906223297119, + "learning_rate": 9.643888435928795e-05, + "loss": 4.2119, + "step": 4200 + }, + { + "epoch": 0.28570457942655253, + "grad_norm": 0.28782758116722107, + "learning_rate": 9.643463785840468e-05, + "loss": 4.3009, + "step": 4205 + }, + { + "epoch": 0.28604429949721427, + "grad_norm": 0.23932106792926788, + "learning_rate": 9.64303913575214e-05, + "loss": 4.2094, + "step": 4210 + }, + { + "epoch": 0.28638401956787607, + "grad_norm": 0.3413240611553192, + "learning_rate": 9.642614485663813e-05, + "loss": 4.2305, + "step": 4215 + }, + { + "epoch": 0.28672373963853787, + "grad_norm": 2.542914390563965, + "learning_rate": 9.642189835575486e-05, + "loss": 4.4582, + "step": 4220 + }, + { + "epoch": 0.2870634597091996, + "grad_norm": 0.21945473551750183, + "learning_rate": 9.641765185487159e-05, + "loss": 4.3744, + "step": 4225 + }, + { + "epoch": 0.2874031797798614, + "grad_norm": 0.209177166223526, + "learning_rate": 9.641340535398832e-05, + "loss": 4.2808, + "step": 4230 + }, + { + "epoch": 0.28774289985052315, + "grad_norm": 0.2568071782588959, + "learning_rate": 9.640915885310505e-05, + "loss": 3.9962, + "step": 4235 + }, + { + "epoch": 0.28808261992118495, + "grad_norm": 0.31277233362197876, + "learning_rate": 9.640491235222177e-05, + "loss": 4.3054, + "step": 4240 + }, + { + "epoch": 0.28842233999184674, + "grad_norm": 0.24698586761951447, + "learning_rate": 9.64006658513385e-05, + "loss": 4.5058, + "step": 4245 + }, + { + "epoch": 0.2887620600625085, + "grad_norm": 0.23559850454330444, + "learning_rate": 9.639641935045523e-05, + "loss": 4.0805, + "step": 4250 + }, + { + "epoch": 0.2891017801331703, + "grad_norm": 0.22997041046619415, + "learning_rate": 9.639217284957196e-05, + "loss": 3.9677, + "step": 4255 + }, + { + "epoch": 0.289441500203832, + "grad_norm": 0.2484733760356903, + "learning_rate": 9.638792634868869e-05, + "loss": 4.4475, + "step": 4260 + }, + { + "epoch": 0.2897812202744938, + "grad_norm": 0.19737331569194794, + "learning_rate": 9.638367984780541e-05, + "loss": 4.0734, + "step": 4265 + }, + { + "epoch": 0.29012094034515556, + "grad_norm": 0.21674834191799164, + "learning_rate": 9.637943334692214e-05, + "loss": 4.2674, + "step": 4270 + }, + { + "epoch": 0.29046066041581736, + "grad_norm": 1.3918811082839966, + "learning_rate": 9.637518684603887e-05, + "loss": 4.2559, + "step": 4275 + }, + { + "epoch": 0.29080038048647916, + "grad_norm": 0.3141735792160034, + "learning_rate": 9.63709403451556e-05, + "loss": 4.2088, + "step": 4280 + }, + { + "epoch": 0.2911401005571409, + "grad_norm": 0.17463235557079315, + "learning_rate": 9.636669384427233e-05, + "loss": 4.2172, + "step": 4285 + }, + { + "epoch": 0.2914798206278027, + "grad_norm": 0.28866469860076904, + "learning_rate": 9.636244734338905e-05, + "loss": 4.1058, + "step": 4290 + }, + { + "epoch": 0.29181954069846444, + "grad_norm": 0.19368840754032135, + "learning_rate": 9.635820084250577e-05, + "loss": 4.3816, + "step": 4295 + }, + { + "epoch": 0.29215926076912624, + "grad_norm": 0.5456646680831909, + "learning_rate": 9.635395434162251e-05, + "loss": 4.004, + "step": 4300 + }, + { + "epoch": 0.29249898083978804, + "grad_norm": 0.5791100263595581, + "learning_rate": 9.634970784073924e-05, + "loss": 3.9493, + "step": 4305 + }, + { + "epoch": 0.2928387009104498, + "grad_norm": 0.18864502012729645, + "learning_rate": 9.634546133985595e-05, + "loss": 4.2906, + "step": 4310 + }, + { + "epoch": 0.2931784209811116, + "grad_norm": 0.6542890071868896, + "learning_rate": 9.63412148389727e-05, + "loss": 4.2943, + "step": 4315 + }, + { + "epoch": 0.2935181410517733, + "grad_norm": 0.2639864683151245, + "learning_rate": 9.633696833808942e-05, + "loss": 4.1306, + "step": 4320 + }, + { + "epoch": 0.2938578611224351, + "grad_norm": 0.24884024262428284, + "learning_rate": 9.633272183720614e-05, + "loss": 4.1749, + "step": 4325 + }, + { + "epoch": 0.2941975811930969, + "grad_norm": 1.6146323680877686, + "learning_rate": 9.632847533632288e-05, + "loss": 4.1622, + "step": 4330 + }, + { + "epoch": 0.29453730126375866, + "grad_norm": 0.19550690054893494, + "learning_rate": 9.63242288354396e-05, + "loss": 4.2481, + "step": 4335 + }, + { + "epoch": 0.29487702133442045, + "grad_norm": 0.48053327202796936, + "learning_rate": 9.631998233455633e-05, + "loss": 4.1654, + "step": 4340 + }, + { + "epoch": 0.2952167414050822, + "grad_norm": 0.6082022190093994, + "learning_rate": 9.631573583367306e-05, + "loss": 3.922, + "step": 4345 + }, + { + "epoch": 0.295556461475744, + "grad_norm": 0.410819411277771, + "learning_rate": 9.631148933278979e-05, + "loss": 4.1838, + "step": 4350 + }, + { + "epoch": 0.29589618154640573, + "grad_norm": 0.20050150156021118, + "learning_rate": 9.630724283190652e-05, + "loss": 4.2083, + "step": 4355 + }, + { + "epoch": 0.29623590161706753, + "grad_norm": 0.2641303539276123, + "learning_rate": 9.630299633102325e-05, + "loss": 4.0804, + "step": 4360 + }, + { + "epoch": 0.29657562168772933, + "grad_norm": 0.1961575597524643, + "learning_rate": 9.629874983013997e-05, + "loss": 4.3261, + "step": 4365 + }, + { + "epoch": 0.29691534175839107, + "grad_norm": 0.17782385647296906, + "learning_rate": 9.62945033292567e-05, + "loss": 4.3017, + "step": 4370 + }, + { + "epoch": 0.29725506182905287, + "grad_norm": 1.2139571905136108, + "learning_rate": 9.629025682837343e-05, + "loss": 4.1051, + "step": 4375 + }, + { + "epoch": 0.2975947818997146, + "grad_norm": 0.2687116265296936, + "learning_rate": 9.628601032749014e-05, + "loss": 4.0757, + "step": 4380 + }, + { + "epoch": 0.2979345019703764, + "grad_norm": 0.19845756888389587, + "learning_rate": 9.628176382660689e-05, + "loss": 4.0948, + "step": 4385 + }, + { + "epoch": 0.2982742220410382, + "grad_norm": 0.2517881989479065, + "learning_rate": 9.627751732572361e-05, + "loss": 4.3492, + "step": 4390 + }, + { + "epoch": 0.29861394211169995, + "grad_norm": 0.19443516433238983, + "learning_rate": 9.627327082484033e-05, + "loss": 4.2639, + "step": 4395 + }, + { + "epoch": 0.29895366218236175, + "grad_norm": 0.21518000960350037, + "learning_rate": 9.626902432395707e-05, + "loss": 4.0157, + "step": 4400 + }, + { + "epoch": 0.2992933822530235, + "grad_norm": 0.3461875915527344, + "learning_rate": 9.62647778230738e-05, + "loss": 4.2115, + "step": 4405 + }, + { + "epoch": 0.2996331023236853, + "grad_norm": 0.39930984377861023, + "learning_rate": 9.626053132219051e-05, + "loss": 4.3391, + "step": 4410 + }, + { + "epoch": 0.2999728223943471, + "grad_norm": 0.22730666399002075, + "learning_rate": 9.625628482130725e-05, + "loss": 4.4092, + "step": 4415 + }, + { + "epoch": 0.3003125424650088, + "grad_norm": 0.2596425414085388, + "learning_rate": 9.625203832042398e-05, + "loss": 4.2127, + "step": 4420 + }, + { + "epoch": 0.3006522625356706, + "grad_norm": 0.19453459978103638, + "learning_rate": 9.62477918195407e-05, + "loss": 4.338, + "step": 4425 + }, + { + "epoch": 0.30099198260633236, + "grad_norm": 0.5006263852119446, + "learning_rate": 9.624354531865744e-05, + "loss": 4.3887, + "step": 4430 + }, + { + "epoch": 0.30133170267699416, + "grad_norm": 0.5625196099281311, + "learning_rate": 9.623929881777417e-05, + "loss": 4.411, + "step": 4435 + }, + { + "epoch": 0.3016714227476559, + "grad_norm": 0.2295573651790619, + "learning_rate": 9.623505231689088e-05, + "loss": 4.0798, + "step": 4440 + }, + { + "epoch": 0.3020111428183177, + "grad_norm": 0.22928239405155182, + "learning_rate": 9.623080581600762e-05, + "loss": 4.2019, + "step": 4445 + }, + { + "epoch": 0.3023508628889795, + "grad_norm": 0.7567266225814819, + "learning_rate": 9.622655931512434e-05, + "loss": 4.069, + "step": 4450 + }, + { + "epoch": 0.30269058295964124, + "grad_norm": 0.3791631758213043, + "learning_rate": 9.622231281424106e-05, + "loss": 4.0739, + "step": 4455 + }, + { + "epoch": 0.30303030303030304, + "grad_norm": 0.21165654063224792, + "learning_rate": 9.62180663133578e-05, + "loss": 4.0467, + "step": 4460 + }, + { + "epoch": 0.3033700231009648, + "grad_norm": 0.2274232804775238, + "learning_rate": 9.621381981247452e-05, + "loss": 4.1655, + "step": 4465 + }, + { + "epoch": 0.3037097431716266, + "grad_norm": 0.2882256507873535, + "learning_rate": 9.620957331159125e-05, + "loss": 4.0162, + "step": 4470 + }, + { + "epoch": 0.3040494632422884, + "grad_norm": 0.21718928217887878, + "learning_rate": 9.620532681070799e-05, + "loss": 4.1421, + "step": 4475 + }, + { + "epoch": 0.3043891833129501, + "grad_norm": 0.20482736825942993, + "learning_rate": 9.62010803098247e-05, + "loss": 4.3067, + "step": 4480 + }, + { + "epoch": 0.3047289033836119, + "grad_norm": 0.8644359111785889, + "learning_rate": 9.619683380894143e-05, + "loss": 4.1044, + "step": 4485 + }, + { + "epoch": 0.30506862345427366, + "grad_norm": 0.2353629469871521, + "learning_rate": 9.619258730805817e-05, + "loss": 4.6311, + "step": 4490 + }, + { + "epoch": 0.30540834352493546, + "grad_norm": 0.4746248722076416, + "learning_rate": 9.618834080717489e-05, + "loss": 4.2593, + "step": 4495 + }, + { + "epoch": 0.30574806359559725, + "grad_norm": 0.412517786026001, + "learning_rate": 9.618409430629162e-05, + "loss": 4.2203, + "step": 4500 + }, + { + "epoch": 0.306087783666259, + "grad_norm": 0.1828744113445282, + "learning_rate": 9.617984780540836e-05, + "loss": 4.0572, + "step": 4505 + }, + { + "epoch": 0.3064275037369208, + "grad_norm": 0.2963522672653198, + "learning_rate": 9.617560130452507e-05, + "loss": 4.3707, + "step": 4510 + }, + { + "epoch": 0.30676722380758253, + "grad_norm": 0.20454883575439453, + "learning_rate": 9.61713548036418e-05, + "loss": 4.1609, + "step": 4515 + }, + { + "epoch": 0.30710694387824433, + "grad_norm": 0.9719656109809875, + "learning_rate": 9.616710830275853e-05, + "loss": 4.3215, + "step": 4520 + }, + { + "epoch": 0.3074466639489061, + "grad_norm": 0.6544187664985657, + "learning_rate": 9.616286180187526e-05, + "loss": 3.9987, + "step": 4525 + }, + { + "epoch": 0.30778638401956787, + "grad_norm": 0.3224717080593109, + "learning_rate": 9.615861530099198e-05, + "loss": 4.2707, + "step": 4530 + }, + { + "epoch": 0.30812610409022967, + "grad_norm": 0.3716621398925781, + "learning_rate": 9.615436880010871e-05, + "loss": 3.9584, + "step": 4535 + }, + { + "epoch": 0.3084658241608914, + "grad_norm": 0.23874157667160034, + "learning_rate": 9.615012229922544e-05, + "loss": 4.0501, + "step": 4540 + }, + { + "epoch": 0.3088055442315532, + "grad_norm": 0.30314287543296814, + "learning_rate": 9.614587579834217e-05, + "loss": 3.8515, + "step": 4545 + }, + { + "epoch": 0.30914526430221495, + "grad_norm": 0.19416074454784393, + "learning_rate": 9.61416292974589e-05, + "loss": 4.0812, + "step": 4550 + }, + { + "epoch": 0.30948498437287675, + "grad_norm": 0.5242770910263062, + "learning_rate": 9.613738279657562e-05, + "loss": 4.37, + "step": 4555 + }, + { + "epoch": 0.30982470444353855, + "grad_norm": 0.20864085853099823, + "learning_rate": 9.613313629569235e-05, + "loss": 4.2711, + "step": 4560 + }, + { + "epoch": 0.3101644245142003, + "grad_norm": 0.20202593505382538, + "learning_rate": 9.612888979480908e-05, + "loss": 4.2423, + "step": 4565 + }, + { + "epoch": 0.3105041445848621, + "grad_norm": 0.5426074266433716, + "learning_rate": 9.612464329392581e-05, + "loss": 4.2333, + "step": 4570 + }, + { + "epoch": 0.3108438646555238, + "grad_norm": 0.21013125777244568, + "learning_rate": 9.612039679304254e-05, + "loss": 4.0275, + "step": 4575 + }, + { + "epoch": 0.3111835847261856, + "grad_norm": 0.3289850056171417, + "learning_rate": 9.611615029215926e-05, + "loss": 4.3321, + "step": 4580 + }, + { + "epoch": 0.3115233047968474, + "grad_norm": 0.20981498062610626, + "learning_rate": 9.611190379127599e-05, + "loss": 4.3237, + "step": 4585 + }, + { + "epoch": 0.31186302486750916, + "grad_norm": 0.2699143886566162, + "learning_rate": 9.610765729039272e-05, + "loss": 4.1158, + "step": 4590 + }, + { + "epoch": 0.31220274493817096, + "grad_norm": 0.27582621574401855, + "learning_rate": 9.610341078950945e-05, + "loss": 4.1325, + "step": 4595 + }, + { + "epoch": 0.3125424650088327, + "grad_norm": 0.2031656950712204, + "learning_rate": 9.609916428862618e-05, + "loss": 4.2928, + "step": 4600 + }, + { + "epoch": 0.3128821850794945, + "grad_norm": 0.17339491844177246, + "learning_rate": 9.60949177877429e-05, + "loss": 4.2046, + "step": 4605 + }, + { + "epoch": 0.3132219051501563, + "grad_norm": 0.2266245186328888, + "learning_rate": 9.609067128685963e-05, + "loss": 4.0126, + "step": 4610 + }, + { + "epoch": 0.31356162522081804, + "grad_norm": 0.2599181830883026, + "learning_rate": 9.608642478597636e-05, + "loss": 4.2055, + "step": 4615 + }, + { + "epoch": 0.31390134529147984, + "grad_norm": 0.27137070894241333, + "learning_rate": 9.608217828509309e-05, + "loss": 4.3499, + "step": 4620 + }, + { + "epoch": 0.3142410653621416, + "grad_norm": 0.22108152508735657, + "learning_rate": 9.607793178420982e-05, + "loss": 4.257, + "step": 4625 + }, + { + "epoch": 0.3145807854328034, + "grad_norm": 0.2058125138282776, + "learning_rate": 9.607368528332654e-05, + "loss": 3.919, + "step": 4630 + }, + { + "epoch": 0.3149205055034651, + "grad_norm": 0.4298430383205414, + "learning_rate": 9.606943878244327e-05, + "loss": 4.0876, + "step": 4635 + }, + { + "epoch": 0.3152602255741269, + "grad_norm": 0.19494764506816864, + "learning_rate": 9.606519228156e-05, + "loss": 4.2404, + "step": 4640 + }, + { + "epoch": 0.3155999456447887, + "grad_norm": 0.35915589332580566, + "learning_rate": 9.606094578067673e-05, + "loss": 4.1185, + "step": 4645 + }, + { + "epoch": 0.31593966571545046, + "grad_norm": 0.395074725151062, + "learning_rate": 9.605669927979344e-05, + "loss": 4.2395, + "step": 4650 + }, + { + "epoch": 0.31627938578611225, + "grad_norm": 0.3247855007648468, + "learning_rate": 9.605245277891018e-05, + "loss": 3.9987, + "step": 4655 + }, + { + "epoch": 0.316619105856774, + "grad_norm": 0.18057437241077423, + "learning_rate": 9.604820627802691e-05, + "loss": 4.3912, + "step": 4660 + }, + { + "epoch": 0.3169588259274358, + "grad_norm": 0.25986525416374207, + "learning_rate": 9.604395977714363e-05, + "loss": 4.2514, + "step": 4665 + }, + { + "epoch": 0.3172985459980976, + "grad_norm": 0.2058873325586319, + "learning_rate": 9.603971327626037e-05, + "loss": 4.3795, + "step": 4670 + }, + { + "epoch": 0.31763826606875933, + "grad_norm": 0.30512815713882446, + "learning_rate": 9.60354667753771e-05, + "loss": 4.2392, + "step": 4675 + }, + { + "epoch": 0.31797798613942113, + "grad_norm": 0.18412257730960846, + "learning_rate": 9.603122027449382e-05, + "loss": 4.2086, + "step": 4680 + }, + { + "epoch": 0.3183177062100829, + "grad_norm": 0.2081792950630188, + "learning_rate": 9.602697377361055e-05, + "loss": 4.3573, + "step": 4685 + }, + { + "epoch": 0.31865742628074467, + "grad_norm": 0.3510904014110565, + "learning_rate": 9.602272727272728e-05, + "loss": 4.3618, + "step": 4690 + }, + { + "epoch": 0.31899714635140647, + "grad_norm": 0.37145447731018066, + "learning_rate": 9.601848077184401e-05, + "loss": 4.301, + "step": 4695 + }, + { + "epoch": 0.3193368664220682, + "grad_norm": 0.23616161942481995, + "learning_rate": 9.601423427096074e-05, + "loss": 4.2158, + "step": 4700 + }, + { + "epoch": 0.31967658649273, + "grad_norm": 1.332667589187622, + "learning_rate": 9.600998777007746e-05, + "loss": 4.0569, + "step": 4705 + }, + { + "epoch": 0.32001630656339175, + "grad_norm": 0.21901053190231323, + "learning_rate": 9.600574126919419e-05, + "loss": 4.0555, + "step": 4710 + }, + { + "epoch": 0.32035602663405355, + "grad_norm": 0.18509267270565033, + "learning_rate": 9.600149476831092e-05, + "loss": 4.1219, + "step": 4715 + }, + { + "epoch": 0.3206957467047153, + "grad_norm": 0.17567375302314758, + "learning_rate": 9.599724826742764e-05, + "loss": 4.2921, + "step": 4720 + }, + { + "epoch": 0.3210354667753771, + "grad_norm": 0.17881886661052704, + "learning_rate": 9.599300176654438e-05, + "loss": 4.2577, + "step": 4725 + }, + { + "epoch": 0.3213751868460389, + "grad_norm": 0.2836432158946991, + "learning_rate": 9.59887552656611e-05, + "loss": 4.1362, + "step": 4730 + }, + { + "epoch": 0.3217149069167006, + "grad_norm": 0.43188366293907166, + "learning_rate": 9.598450876477782e-05, + "loss": 3.7939, + "step": 4735 + }, + { + "epoch": 0.3220546269873624, + "grad_norm": 0.21462103724479675, + "learning_rate": 9.598026226389456e-05, + "loss": 4.166, + "step": 4740 + }, + { + "epoch": 0.32239434705802417, + "grad_norm": 0.16883856058120728, + "learning_rate": 9.597601576301129e-05, + "loss": 3.9182, + "step": 4745 + }, + { + "epoch": 0.32273406712868596, + "grad_norm": 0.32869285345077515, + "learning_rate": 9.5971769262128e-05, + "loss": 4.0832, + "step": 4750 + }, + { + "epoch": 0.32307378719934776, + "grad_norm": 0.22861923277378082, + "learning_rate": 9.596752276124474e-05, + "loss": 4.344, + "step": 4755 + }, + { + "epoch": 0.3234135072700095, + "grad_norm": 0.3567066192626953, + "learning_rate": 9.596327626036147e-05, + "loss": 4.0191, + "step": 4760 + }, + { + "epoch": 0.3237532273406713, + "grad_norm": 0.23706111311912537, + "learning_rate": 9.595902975947819e-05, + "loss": 4.1505, + "step": 4765 + }, + { + "epoch": 0.32409294741133304, + "grad_norm": 0.22384607791900635, + "learning_rate": 9.595478325859493e-05, + "loss": 4.1113, + "step": 4770 + }, + { + "epoch": 0.32443266748199484, + "grad_norm": 0.1979631930589676, + "learning_rate": 9.595053675771166e-05, + "loss": 4.0686, + "step": 4775 + }, + { + "epoch": 0.32477238755265664, + "grad_norm": 0.3480912148952484, + "learning_rate": 9.594629025682837e-05, + "loss": 4.2106, + "step": 4780 + }, + { + "epoch": 0.3251121076233184, + "grad_norm": 0.23982703685760498, + "learning_rate": 9.594204375594511e-05, + "loss": 4.2114, + "step": 4785 + }, + { + "epoch": 0.3254518276939802, + "grad_norm": 1.1133886575698853, + "learning_rate": 9.593779725506184e-05, + "loss": 3.894, + "step": 4790 + }, + { + "epoch": 0.3257915477646419, + "grad_norm": 0.1953042596578598, + "learning_rate": 9.593355075417856e-05, + "loss": 4.1392, + "step": 4795 + }, + { + "epoch": 0.3261312678353037, + "grad_norm": 0.24198144674301147, + "learning_rate": 9.59293042532953e-05, + "loss": 3.9054, + "step": 4800 + }, + { + "epoch": 0.32647098790596546, + "grad_norm": 0.5570387840270996, + "learning_rate": 9.592505775241201e-05, + "loss": 3.9928, + "step": 4805 + }, + { + "epoch": 0.32681070797662726, + "grad_norm": 0.3616771399974823, + "learning_rate": 9.592081125152874e-05, + "loss": 4.3527, + "step": 4810 + }, + { + "epoch": 0.32715042804728905, + "grad_norm": 0.7734106183052063, + "learning_rate": 9.591656475064548e-05, + "loss": 3.9594, + "step": 4815 + }, + { + "epoch": 0.3274901481179508, + "grad_norm": 0.23176033794879913, + "learning_rate": 9.59123182497622e-05, + "loss": 4.2269, + "step": 4820 + }, + { + "epoch": 0.3278298681886126, + "grad_norm": 0.22131270170211792, + "learning_rate": 9.590807174887892e-05, + "loss": 4.205, + "step": 4825 + }, + { + "epoch": 0.32816958825927434, + "grad_norm": 0.1967305839061737, + "learning_rate": 9.590382524799566e-05, + "loss": 4.1081, + "step": 4830 + }, + { + "epoch": 0.32850930832993613, + "grad_norm": 0.20413081347942352, + "learning_rate": 9.589957874711238e-05, + "loss": 4.2146, + "step": 4835 + }, + { + "epoch": 0.32884902840059793, + "grad_norm": 0.237589493393898, + "learning_rate": 9.589533224622911e-05, + "loss": 4.321, + "step": 4840 + }, + { + "epoch": 0.3291887484712597, + "grad_norm": 0.3263246417045593, + "learning_rate": 9.589108574534585e-05, + "loss": 4.2572, + "step": 4845 + }, + { + "epoch": 0.32952846854192147, + "grad_norm": 0.2639991343021393, + "learning_rate": 9.588683924446256e-05, + "loss": 4.3049, + "step": 4850 + }, + { + "epoch": 0.3298681886125832, + "grad_norm": 0.33223286271095276, + "learning_rate": 9.588259274357929e-05, + "loss": 4.3715, + "step": 4855 + }, + { + "epoch": 0.330207908683245, + "grad_norm": 0.22897298634052277, + "learning_rate": 9.587834624269603e-05, + "loss": 3.93, + "step": 4860 + }, + { + "epoch": 0.3305476287539068, + "grad_norm": 0.3667212128639221, + "learning_rate": 9.587409974181275e-05, + "loss": 4.3149, + "step": 4865 + }, + { + "epoch": 0.33088734882456855, + "grad_norm": 0.22442007064819336, + "learning_rate": 9.586985324092948e-05, + "loss": 4.4189, + "step": 4870 + }, + { + "epoch": 0.33122706889523035, + "grad_norm": 0.18334710597991943, + "learning_rate": 9.58656067400462e-05, + "loss": 3.9457, + "step": 4875 + }, + { + "epoch": 0.3315667889658921, + "grad_norm": 0.34593579173088074, + "learning_rate": 9.586136023916293e-05, + "loss": 3.9007, + "step": 4880 + }, + { + "epoch": 0.3319065090365539, + "grad_norm": 0.24884456396102905, + "learning_rate": 9.585711373827966e-05, + "loss": 4.198, + "step": 4885 + }, + { + "epoch": 0.33224622910721563, + "grad_norm": 0.6150047183036804, + "learning_rate": 9.585286723739639e-05, + "loss": 4.1738, + "step": 4890 + }, + { + "epoch": 0.3325859491778774, + "grad_norm": 0.18449634313583374, + "learning_rate": 9.584862073651312e-05, + "loss": 4.3916, + "step": 4895 + }, + { + "epoch": 0.3329256692485392, + "grad_norm": 0.45811331272125244, + "learning_rate": 9.584437423562984e-05, + "loss": 4.0894, + "step": 4900 + }, + { + "epoch": 0.33326538931920097, + "grad_norm": 0.9670056700706482, + "learning_rate": 9.584012773474657e-05, + "loss": 4.0947, + "step": 4905 + }, + { + "epoch": 0.33360510938986276, + "grad_norm": 0.2828699052333832, + "learning_rate": 9.58358812338633e-05, + "loss": 4.27, + "step": 4910 + }, + { + "epoch": 0.3339448294605245, + "grad_norm": 0.22989730536937714, + "learning_rate": 9.583163473298003e-05, + "loss": 4.2658, + "step": 4915 + }, + { + "epoch": 0.3342845495311863, + "grad_norm": 0.4018714427947998, + "learning_rate": 9.582738823209676e-05, + "loss": 4.1278, + "step": 4920 + }, + { + "epoch": 0.3346242696018481, + "grad_norm": 0.5296480059623718, + "learning_rate": 9.582314173121348e-05, + "loss": 4.1865, + "step": 4925 + }, + { + "epoch": 0.33496398967250984, + "grad_norm": 0.2477627843618393, + "learning_rate": 9.581889523033021e-05, + "loss": 4.4925, + "step": 4930 + }, + { + "epoch": 0.33530370974317164, + "grad_norm": 0.24414370954036713, + "learning_rate": 9.581464872944694e-05, + "loss": 4.1706, + "step": 4935 + }, + { + "epoch": 0.3356434298138334, + "grad_norm": 0.15603503584861755, + "learning_rate": 9.581040222856367e-05, + "loss": 4.0303, + "step": 4940 + }, + { + "epoch": 0.3359831498844952, + "grad_norm": 0.17460083961486816, + "learning_rate": 9.58061557276804e-05, + "loss": 4.1986, + "step": 4945 + }, + { + "epoch": 0.336322869955157, + "grad_norm": 0.2035687118768692, + "learning_rate": 9.580190922679712e-05, + "loss": 4.3514, + "step": 4950 + }, + { + "epoch": 0.3366625900258187, + "grad_norm": 0.4037059545516968, + "learning_rate": 9.579766272591385e-05, + "loss": 4.1645, + "step": 4955 + }, + { + "epoch": 0.3370023100964805, + "grad_norm": 0.22527378797531128, + "learning_rate": 9.579341622503058e-05, + "loss": 4.1399, + "step": 4960 + }, + { + "epoch": 0.33734203016714226, + "grad_norm": 0.27611520886421204, + "learning_rate": 9.578916972414731e-05, + "loss": 4.1679, + "step": 4965 + }, + { + "epoch": 0.33768175023780406, + "grad_norm": 0.26130980253219604, + "learning_rate": 9.578492322326404e-05, + "loss": 3.9786, + "step": 4970 + }, + { + "epoch": 0.3380214703084658, + "grad_norm": 0.20753854513168335, + "learning_rate": 9.578067672238076e-05, + "loss": 4.0629, + "step": 4975 + }, + { + "epoch": 0.3383611903791276, + "grad_norm": 0.198018416762352, + "learning_rate": 9.577643022149749e-05, + "loss": 4.0956, + "step": 4980 + }, + { + "epoch": 0.3387009104497894, + "grad_norm": 0.21650417149066925, + "learning_rate": 9.577218372061422e-05, + "loss": 4.0965, + "step": 4985 + }, + { + "epoch": 0.33904063052045114, + "grad_norm": 0.3832937777042389, + "learning_rate": 9.576793721973095e-05, + "loss": 4.1086, + "step": 4990 + }, + { + "epoch": 0.33938035059111293, + "grad_norm": 0.23973116278648376, + "learning_rate": 9.576369071884768e-05, + "loss": 4.3533, + "step": 4995 + }, + { + "epoch": 0.3397200706617747, + "grad_norm": 0.3817537724971771, + "learning_rate": 9.57594442179644e-05, + "loss": 4.2127, + "step": 5000 + }, + { + "epoch": 0.3400597907324365, + "grad_norm": 0.2038896679878235, + "learning_rate": 9.575519771708112e-05, + "loss": 4.3264, + "step": 5005 + }, + { + "epoch": 0.34039951080309827, + "grad_norm": 0.23982146382331848, + "learning_rate": 9.575095121619786e-05, + "loss": 4.0702, + "step": 5010 + }, + { + "epoch": 0.34073923087376, + "grad_norm": 0.30504170060157776, + "learning_rate": 9.574670471531459e-05, + "loss": 3.9892, + "step": 5015 + }, + { + "epoch": 0.3410789509444218, + "grad_norm": 0.2355673611164093, + "learning_rate": 9.574245821443132e-05, + "loss": 3.9423, + "step": 5020 + }, + { + "epoch": 0.34141867101508355, + "grad_norm": 0.22874650359153748, + "learning_rate": 9.573821171354804e-05, + "loss": 3.9142, + "step": 5025 + }, + { + "epoch": 0.34175839108574535, + "grad_norm": 0.21437452733516693, + "learning_rate": 9.573396521266477e-05, + "loss": 4.1365, + "step": 5030 + }, + { + "epoch": 0.34209811115640715, + "grad_norm": 0.3095625936985016, + "learning_rate": 9.57297187117815e-05, + "loss": 4.1468, + "step": 5035 + }, + { + "epoch": 0.3424378312270689, + "grad_norm": 0.2202177494764328, + "learning_rate": 9.572547221089823e-05, + "loss": 4.01, + "step": 5040 + }, + { + "epoch": 0.3427775512977307, + "grad_norm": 0.32474327087402344, + "learning_rate": 9.572122571001496e-05, + "loss": 4.0868, + "step": 5045 + }, + { + "epoch": 0.34311727136839243, + "grad_norm": 0.2622280716896057, + "learning_rate": 9.571697920913168e-05, + "loss": 4.2103, + "step": 5050 + }, + { + "epoch": 0.3434569914390542, + "grad_norm": 0.7448468208312988, + "learning_rate": 9.571273270824841e-05, + "loss": 4.2945, + "step": 5055 + }, + { + "epoch": 0.34379671150971597, + "grad_norm": 0.2269594371318817, + "learning_rate": 9.570848620736514e-05, + "loss": 3.9127, + "step": 5060 + }, + { + "epoch": 0.34413643158037777, + "grad_norm": 0.2645524740219116, + "learning_rate": 9.570423970648187e-05, + "loss": 3.9423, + "step": 5065 + }, + { + "epoch": 0.34447615165103956, + "grad_norm": 0.246607705950737, + "learning_rate": 9.56999932055986e-05, + "loss": 4.1347, + "step": 5070 + }, + { + "epoch": 0.3448158717217013, + "grad_norm": 0.19342374801635742, + "learning_rate": 9.569574670471531e-05, + "loss": 4.0787, + "step": 5075 + }, + { + "epoch": 0.3451555917923631, + "grad_norm": 0.31122297048568726, + "learning_rate": 9.569150020383205e-05, + "loss": 4.1522, + "step": 5080 + }, + { + "epoch": 0.34549531186302485, + "grad_norm": 1.0425968170166016, + "learning_rate": 9.568725370294878e-05, + "loss": 3.995, + "step": 5085 + }, + { + "epoch": 0.34583503193368664, + "grad_norm": 0.22739467024803162, + "learning_rate": 9.56830072020655e-05, + "loss": 3.9042, + "step": 5090 + }, + { + "epoch": 0.34617475200434844, + "grad_norm": 0.21344348788261414, + "learning_rate": 9.567876070118224e-05, + "loss": 4.1351, + "step": 5095 + }, + { + "epoch": 0.3465144720750102, + "grad_norm": 0.2140883356332779, + "learning_rate": 9.567451420029896e-05, + "loss": 3.9542, + "step": 5100 + }, + { + "epoch": 0.346854192145672, + "grad_norm": 0.1795925498008728, + "learning_rate": 9.567026769941568e-05, + "loss": 4.061, + "step": 5105 + }, + { + "epoch": 0.3471939122163337, + "grad_norm": 0.37569665908813477, + "learning_rate": 9.566602119853242e-05, + "loss": 4.3051, + "step": 5110 + }, + { + "epoch": 0.3475336322869955, + "grad_norm": 0.19528646767139435, + "learning_rate": 9.566177469764915e-05, + "loss": 4.1591, + "step": 5115 + }, + { + "epoch": 0.3478733523576573, + "grad_norm": 0.1688258945941925, + "learning_rate": 9.565752819676586e-05, + "loss": 3.9876, + "step": 5120 + }, + { + "epoch": 0.34821307242831906, + "grad_norm": 0.17164097726345062, + "learning_rate": 9.56532816958826e-05, + "loss": 4.2081, + "step": 5125 + }, + { + "epoch": 0.34855279249898086, + "grad_norm": 0.18012307584285736, + "learning_rate": 9.564903519499933e-05, + "loss": 3.9195, + "step": 5130 + }, + { + "epoch": 0.3488925125696426, + "grad_norm": 0.2118469625711441, + "learning_rate": 9.564478869411605e-05, + "loss": 4.1027, + "step": 5135 + }, + { + "epoch": 0.3492322326403044, + "grad_norm": 0.21049754321575165, + "learning_rate": 9.564054219323279e-05, + "loss": 4.0254, + "step": 5140 + }, + { + "epoch": 0.34957195271096614, + "grad_norm": 0.5723572969436646, + "learning_rate": 9.563629569234952e-05, + "loss": 3.8725, + "step": 5145 + }, + { + "epoch": 0.34991167278162794, + "grad_norm": 0.22618575394153595, + "learning_rate": 9.563204919146623e-05, + "loss": 4.1794, + "step": 5150 + }, + { + "epoch": 0.35025139285228973, + "grad_norm": 0.1874348223209381, + "learning_rate": 9.562780269058297e-05, + "loss": 3.9515, + "step": 5155 + }, + { + "epoch": 0.3505911129229515, + "grad_norm": 0.2910906672477722, + "learning_rate": 9.562355618969969e-05, + "loss": 4.2863, + "step": 5160 + }, + { + "epoch": 0.3509308329936133, + "grad_norm": 0.19384640455245972, + "learning_rate": 9.561930968881641e-05, + "loss": 4.2367, + "step": 5165 + }, + { + "epoch": 0.351270553064275, + "grad_norm": 0.15815407037734985, + "learning_rate": 9.561506318793316e-05, + "loss": 3.9907, + "step": 5170 + }, + { + "epoch": 0.3516102731349368, + "grad_norm": 0.2070821076631546, + "learning_rate": 9.561081668704987e-05, + "loss": 3.9338, + "step": 5175 + }, + { + "epoch": 0.3519499932055986, + "grad_norm": 0.24302656948566437, + "learning_rate": 9.56065701861666e-05, + "loss": 4.2737, + "step": 5180 + }, + { + "epoch": 0.35228971327626035, + "grad_norm": 0.24706007540225983, + "learning_rate": 9.560232368528334e-05, + "loss": 4.23, + "step": 5185 + }, + { + "epoch": 0.35262943334692215, + "grad_norm": 0.5972357988357544, + "learning_rate": 9.559807718440005e-05, + "loss": 4.157, + "step": 5190 + }, + { + "epoch": 0.3529691534175839, + "grad_norm": 1.1296205520629883, + "learning_rate": 9.559383068351678e-05, + "loss": 4.1556, + "step": 5195 + }, + { + "epoch": 0.3533088734882457, + "grad_norm": 0.19614671170711517, + "learning_rate": 9.558958418263352e-05, + "loss": 4.1069, + "step": 5200 + }, + { + "epoch": 0.3536485935589075, + "grad_norm": 0.2329636514186859, + "learning_rate": 9.558533768175024e-05, + "loss": 4.0195, + "step": 5205 + }, + { + "epoch": 0.35398831362956923, + "grad_norm": 0.3606981635093689, + "learning_rate": 9.558109118086697e-05, + "loss": 4.0518, + "step": 5210 + }, + { + "epoch": 0.354328033700231, + "grad_norm": 0.212651789188385, + "learning_rate": 9.557684467998371e-05, + "loss": 4.1561, + "step": 5215 + }, + { + "epoch": 0.35466775377089277, + "grad_norm": 0.2813778817653656, + "learning_rate": 9.557259817910042e-05, + "loss": 4.1709, + "step": 5220 + }, + { + "epoch": 0.35500747384155457, + "grad_norm": 0.1836264729499817, + "learning_rate": 9.556835167821715e-05, + "loss": 3.9968, + "step": 5225 + }, + { + "epoch": 0.3553471939122163, + "grad_norm": 0.24313072860240936, + "learning_rate": 9.556410517733388e-05, + "loss": 4.1021, + "step": 5230 + }, + { + "epoch": 0.3556869139828781, + "grad_norm": 0.21879182755947113, + "learning_rate": 9.55598586764506e-05, + "loss": 4.0678, + "step": 5235 + }, + { + "epoch": 0.3560266340535399, + "grad_norm": 0.19957928359508514, + "learning_rate": 9.555561217556733e-05, + "loss": 4.1579, + "step": 5240 + }, + { + "epoch": 0.35636635412420165, + "grad_norm": 0.2043609321117401, + "learning_rate": 9.555136567468406e-05, + "loss": 3.9197, + "step": 5245 + }, + { + "epoch": 0.35670607419486344, + "grad_norm": 0.1743493527173996, + "learning_rate": 9.554711917380079e-05, + "loss": 4.3003, + "step": 5250 + }, + { + "epoch": 0.3570457942655252, + "grad_norm": 0.3488079309463501, + "learning_rate": 9.554287267291752e-05, + "loss": 4.0098, + "step": 5255 + }, + { + "epoch": 0.357385514336187, + "grad_norm": 0.2585020959377289, + "learning_rate": 9.553862617203425e-05, + "loss": 4.0749, + "step": 5260 + }, + { + "epoch": 0.3577252344068488, + "grad_norm": 0.22201067209243774, + "learning_rate": 9.553437967115097e-05, + "loss": 3.7807, + "step": 5265 + }, + { + "epoch": 0.3580649544775105, + "grad_norm": 0.4632178843021393, + "learning_rate": 9.55301331702677e-05, + "loss": 4.1557, + "step": 5270 + }, + { + "epoch": 0.3584046745481723, + "grad_norm": 0.4491996765136719, + "learning_rate": 9.552588666938443e-05, + "loss": 4.2286, + "step": 5275 + }, + { + "epoch": 0.35874439461883406, + "grad_norm": 0.22126582264900208, + "learning_rate": 9.552164016850116e-05, + "loss": 3.9587, + "step": 5280 + }, + { + "epoch": 0.35908411468949586, + "grad_norm": 0.20614346861839294, + "learning_rate": 9.551739366761789e-05, + "loss": 4.1402, + "step": 5285 + }, + { + "epoch": 0.35942383476015766, + "grad_norm": 0.2311069220304489, + "learning_rate": 9.551314716673461e-05, + "loss": 4.1504, + "step": 5290 + }, + { + "epoch": 0.3597635548308194, + "grad_norm": 0.20152784883975983, + "learning_rate": 9.550890066585134e-05, + "loss": 4.2783, + "step": 5295 + }, + { + "epoch": 0.3601032749014812, + "grad_norm": 0.2334737479686737, + "learning_rate": 9.550465416496807e-05, + "loss": 4.2849, + "step": 5300 + }, + { + "epoch": 0.36044299497214294, + "grad_norm": 0.17994678020477295, + "learning_rate": 9.55004076640848e-05, + "loss": 4.3848, + "step": 5305 + }, + { + "epoch": 0.36078271504280474, + "grad_norm": 0.2141488939523697, + "learning_rate": 9.549616116320153e-05, + "loss": 4.309, + "step": 5310 + }, + { + "epoch": 0.3611224351134665, + "grad_norm": 0.2028026133775711, + "learning_rate": 9.549191466231825e-05, + "loss": 4.4349, + "step": 5315 + }, + { + "epoch": 0.3614621551841283, + "grad_norm": 0.1849725842475891, + "learning_rate": 9.548766816143498e-05, + "loss": 3.929, + "step": 5320 + }, + { + "epoch": 0.3618018752547901, + "grad_norm": 0.20538243651390076, + "learning_rate": 9.548342166055171e-05, + "loss": 4.2426, + "step": 5325 + }, + { + "epoch": 0.3621415953254518, + "grad_norm": 0.22145512700080872, + "learning_rate": 9.547917515966844e-05, + "loss": 4.1321, + "step": 5330 + }, + { + "epoch": 0.3624813153961136, + "grad_norm": 0.24293570220470428, + "learning_rate": 9.547492865878517e-05, + "loss": 4.4132, + "step": 5335 + }, + { + "epoch": 0.36282103546677535, + "grad_norm": 3.3797924518585205, + "learning_rate": 9.54706821579019e-05, + "loss": 3.9775, + "step": 5340 + }, + { + "epoch": 0.36316075553743715, + "grad_norm": 0.22349077463150024, + "learning_rate": 9.546643565701862e-05, + "loss": 4.095, + "step": 5345 + }, + { + "epoch": 0.36350047560809895, + "grad_norm": 0.22708293795585632, + "learning_rate": 9.546218915613535e-05, + "loss": 4.1219, + "step": 5350 + }, + { + "epoch": 0.3638401956787607, + "grad_norm": 0.40382349491119385, + "learning_rate": 9.545794265525208e-05, + "loss": 3.8157, + "step": 5355 + }, + { + "epoch": 0.3641799157494225, + "grad_norm": 0.1983574777841568, + "learning_rate": 9.54536961543688e-05, + "loss": 4.0346, + "step": 5360 + }, + { + "epoch": 0.36451963582008423, + "grad_norm": 0.3324495851993561, + "learning_rate": 9.544944965348553e-05, + "loss": 4.2765, + "step": 5365 + }, + { + "epoch": 0.36485935589074603, + "grad_norm": 0.2055937945842743, + "learning_rate": 9.544520315260226e-05, + "loss": 3.8416, + "step": 5370 + }, + { + "epoch": 0.3651990759614078, + "grad_norm": 0.18161867558956146, + "learning_rate": 9.544095665171899e-05, + "loss": 4.0468, + "step": 5375 + }, + { + "epoch": 0.36553879603206957, + "grad_norm": 0.2383970320224762, + "learning_rate": 9.543671015083572e-05, + "loss": 4.0359, + "step": 5380 + }, + { + "epoch": 0.36587851610273137, + "grad_norm": 0.1611696481704712, + "learning_rate": 9.543246364995245e-05, + "loss": 4.1817, + "step": 5385 + }, + { + "epoch": 0.3662182361733931, + "grad_norm": 0.3070268929004669, + "learning_rate": 9.542821714906917e-05, + "loss": 4.0901, + "step": 5390 + }, + { + "epoch": 0.3665579562440549, + "grad_norm": 0.17862237989902496, + "learning_rate": 9.54239706481859e-05, + "loss": 3.9993, + "step": 5395 + }, + { + "epoch": 0.36689767631471665, + "grad_norm": 0.30012592673301697, + "learning_rate": 9.541972414730263e-05, + "loss": 4.0737, + "step": 5400 + }, + { + "epoch": 0.36723739638537845, + "grad_norm": 15.268974304199219, + "learning_rate": 9.541547764641936e-05, + "loss": 3.8387, + "step": 5405 + }, + { + "epoch": 0.36757711645604024, + "grad_norm": 0.19847442209720612, + "learning_rate": 9.541123114553609e-05, + "loss": 4.0158, + "step": 5410 + }, + { + "epoch": 0.367916836526702, + "grad_norm": 1.345680832862854, + "learning_rate": 9.540698464465281e-05, + "loss": 4.1187, + "step": 5415 + }, + { + "epoch": 0.3682565565973638, + "grad_norm": 0.15424399077892303, + "learning_rate": 9.540273814376954e-05, + "loss": 4.1746, + "step": 5420 + }, + { + "epoch": 0.3685962766680255, + "grad_norm": 0.47641104459762573, + "learning_rate": 9.539849164288627e-05, + "loss": 3.8821, + "step": 5425 + }, + { + "epoch": 0.3689359967386873, + "grad_norm": 0.27253925800323486, + "learning_rate": 9.539424514200299e-05, + "loss": 4.1306, + "step": 5430 + }, + { + "epoch": 0.3692757168093491, + "grad_norm": 0.5784019231796265, + "learning_rate": 9.538999864111973e-05, + "loss": 4.1907, + "step": 5435 + }, + { + "epoch": 0.36961543688001086, + "grad_norm": 0.21910730004310608, + "learning_rate": 9.538575214023645e-05, + "loss": 3.9055, + "step": 5440 + }, + { + "epoch": 0.36995515695067266, + "grad_norm": 0.195495143532753, + "learning_rate": 9.538150563935317e-05, + "loss": 4.1521, + "step": 5445 + }, + { + "epoch": 0.3702948770213344, + "grad_norm": 0.20794479548931122, + "learning_rate": 9.537725913846991e-05, + "loss": 3.9962, + "step": 5450 + }, + { + "epoch": 0.3706345970919962, + "grad_norm": 1.305681586265564, + "learning_rate": 9.537301263758664e-05, + "loss": 4.173, + "step": 5455 + }, + { + "epoch": 0.370974317162658, + "grad_norm": 0.1818116158246994, + "learning_rate": 9.536876613670335e-05, + "loss": 4.0849, + "step": 5460 + }, + { + "epoch": 0.37131403723331974, + "grad_norm": 0.38611772656440735, + "learning_rate": 9.53645196358201e-05, + "loss": 3.9796, + "step": 5465 + }, + { + "epoch": 0.37165375730398154, + "grad_norm": 0.2650381922721863, + "learning_rate": 9.536027313493682e-05, + "loss": 4.1452, + "step": 5470 + }, + { + "epoch": 0.3719934773746433, + "grad_norm": 0.2208934873342514, + "learning_rate": 9.535602663405354e-05, + "loss": 4.2661, + "step": 5475 + }, + { + "epoch": 0.3723331974453051, + "grad_norm": 0.20266486704349518, + "learning_rate": 9.535178013317028e-05, + "loss": 4.207, + "step": 5480 + }, + { + "epoch": 0.3726729175159668, + "grad_norm": 0.21677860617637634, + "learning_rate": 9.5347533632287e-05, + "loss": 4.3784, + "step": 5485 + }, + { + "epoch": 0.3730126375866286, + "grad_norm": 0.33356210589408875, + "learning_rate": 9.534328713140372e-05, + "loss": 4.0518, + "step": 5490 + }, + { + "epoch": 0.3733523576572904, + "grad_norm": 0.2748437225818634, + "learning_rate": 9.533904063052046e-05, + "loss": 4.0669, + "step": 5495 + }, + { + "epoch": 0.37369207772795215, + "grad_norm": 0.22416415810585022, + "learning_rate": 9.533479412963718e-05, + "loss": 4.1892, + "step": 5500 + }, + { + "epoch": 0.37403179779861395, + "grad_norm": 0.20975516736507416, + "learning_rate": 9.53305476287539e-05, + "loss": 4.2146, + "step": 5505 + }, + { + "epoch": 0.3743715178692757, + "grad_norm": 0.1820031851530075, + "learning_rate": 9.532630112787065e-05, + "loss": 4.1054, + "step": 5510 + }, + { + "epoch": 0.3747112379399375, + "grad_norm": 0.2546124756336212, + "learning_rate": 9.532205462698736e-05, + "loss": 4.1923, + "step": 5515 + }, + { + "epoch": 0.3750509580105993, + "grad_norm": 0.5577950477600098, + "learning_rate": 9.531780812610409e-05, + "loss": 3.851, + "step": 5520 + }, + { + "epoch": 0.37539067808126103, + "grad_norm": 0.2909904718399048, + "learning_rate": 9.531356162522083e-05, + "loss": 3.8513, + "step": 5525 + }, + { + "epoch": 0.37573039815192283, + "grad_norm": 0.20286774635314941, + "learning_rate": 9.530931512433755e-05, + "loss": 3.8725, + "step": 5530 + }, + { + "epoch": 0.37607011822258457, + "grad_norm": 0.20398856699466705, + "learning_rate": 9.530506862345427e-05, + "loss": 4.3243, + "step": 5535 + }, + { + "epoch": 0.37640983829324637, + "grad_norm": 0.1849180907011032, + "learning_rate": 9.530082212257101e-05, + "loss": 4.1106, + "step": 5540 + }, + { + "epoch": 0.37674955836390817, + "grad_norm": 0.1672249287366867, + "learning_rate": 9.529657562168773e-05, + "loss": 4.4955, + "step": 5545 + }, + { + "epoch": 0.3770892784345699, + "grad_norm": 0.7186090350151062, + "learning_rate": 9.529232912080446e-05, + "loss": 4.2498, + "step": 5550 + }, + { + "epoch": 0.3774289985052317, + "grad_norm": 0.17973625659942627, + "learning_rate": 9.52880826199212e-05, + "loss": 4.2932, + "step": 5555 + }, + { + "epoch": 0.37776871857589345, + "grad_norm": 0.23119674623012543, + "learning_rate": 9.528383611903791e-05, + "loss": 4.2597, + "step": 5560 + }, + { + "epoch": 0.37810843864655524, + "grad_norm": 0.27012819051742554, + "learning_rate": 9.527958961815464e-05, + "loss": 4.0075, + "step": 5565 + }, + { + "epoch": 0.378448158717217, + "grad_norm": 0.22133472561836243, + "learning_rate": 9.527534311727138e-05, + "loss": 4.1785, + "step": 5570 + }, + { + "epoch": 0.3787878787878788, + "grad_norm": 0.17481616139411926, + "learning_rate": 9.52710966163881e-05, + "loss": 3.9544, + "step": 5575 + }, + { + "epoch": 0.3791275988585406, + "grad_norm": 0.20295321941375732, + "learning_rate": 9.526685011550483e-05, + "loss": 3.8667, + "step": 5580 + }, + { + "epoch": 0.3794673189292023, + "grad_norm": 0.3702150881290436, + "learning_rate": 9.526260361462155e-05, + "loss": 4.012, + "step": 5585 + }, + { + "epoch": 0.3798070389998641, + "grad_norm": 0.3844399154186249, + "learning_rate": 9.525835711373828e-05, + "loss": 4.0456, + "step": 5590 + }, + { + "epoch": 0.38014675907052586, + "grad_norm": 0.27248868346214294, + "learning_rate": 9.525411061285501e-05, + "loss": 4.0001, + "step": 5595 + }, + { + "epoch": 0.38048647914118766, + "grad_norm": 0.4196895360946655, + "learning_rate": 9.524986411197174e-05, + "loss": 4.2368, + "step": 5600 + }, + { + "epoch": 0.38082619921184946, + "grad_norm": 0.2525693476200104, + "learning_rate": 9.524561761108847e-05, + "loss": 4.1713, + "step": 5605 + }, + { + "epoch": 0.3811659192825112, + "grad_norm": 0.19002725183963776, + "learning_rate": 9.52413711102052e-05, + "loss": 3.9637, + "step": 5610 + }, + { + "epoch": 0.381505639353173, + "grad_norm": 0.21603484451770782, + "learning_rate": 9.523712460932192e-05, + "loss": 4.0533, + "step": 5615 + }, + { + "epoch": 0.38184535942383474, + "grad_norm": 0.1926129311323166, + "learning_rate": 9.523287810843865e-05, + "loss": 4.0691, + "step": 5620 + }, + { + "epoch": 0.38218507949449654, + "grad_norm": 0.41377758979797363, + "learning_rate": 9.522863160755538e-05, + "loss": 4.0141, + "step": 5625 + }, + { + "epoch": 0.38252479956515834, + "grad_norm": 0.2249571830034256, + "learning_rate": 9.52243851066721e-05, + "loss": 4.1834, + "step": 5630 + }, + { + "epoch": 0.3828645196358201, + "grad_norm": 0.1857426017522812, + "learning_rate": 9.522013860578883e-05, + "loss": 4.0473, + "step": 5635 + }, + { + "epoch": 0.3832042397064819, + "grad_norm": 0.20423340797424316, + "learning_rate": 9.521589210490556e-05, + "loss": 4.1039, + "step": 5640 + }, + { + "epoch": 0.3835439597771436, + "grad_norm": 0.20530074834823608, + "learning_rate": 9.521164560402229e-05, + "loss": 4.1668, + "step": 5645 + }, + { + "epoch": 0.3838836798478054, + "grad_norm": 0.21148528158664703, + "learning_rate": 9.520739910313902e-05, + "loss": 4.0141, + "step": 5650 + }, + { + "epoch": 0.38422339991846716, + "grad_norm": 0.23851540684700012, + "learning_rate": 9.520315260225575e-05, + "loss": 4.2311, + "step": 5655 + }, + { + "epoch": 0.38456311998912895, + "grad_norm": 0.17751434445381165, + "learning_rate": 9.519890610137247e-05, + "loss": 4.0698, + "step": 5660 + }, + { + "epoch": 0.38490284005979075, + "grad_norm": 0.4216800332069397, + "learning_rate": 9.51946596004892e-05, + "loss": 4.0324, + "step": 5665 + }, + { + "epoch": 0.3852425601304525, + "grad_norm": 0.19707581400871277, + "learning_rate": 9.519041309960593e-05, + "loss": 4.0026, + "step": 5670 + }, + { + "epoch": 0.3855822802011143, + "grad_norm": 0.21056298911571503, + "learning_rate": 9.518616659872266e-05, + "loss": 4.0293, + "step": 5675 + }, + { + "epoch": 0.38592200027177603, + "grad_norm": 0.18237900733947754, + "learning_rate": 9.518192009783939e-05, + "loss": 3.9467, + "step": 5680 + }, + { + "epoch": 0.38626172034243783, + "grad_norm": 0.1838427186012268, + "learning_rate": 9.517767359695611e-05, + "loss": 3.925, + "step": 5685 + }, + { + "epoch": 0.38660144041309963, + "grad_norm": 0.21086731553077698, + "learning_rate": 9.517342709607284e-05, + "loss": 4.2034, + "step": 5690 + }, + { + "epoch": 0.38694116048376137, + "grad_norm": 0.17555493116378784, + "learning_rate": 9.516918059518957e-05, + "loss": 3.8963, + "step": 5695 + }, + { + "epoch": 0.38728088055442317, + "grad_norm": 0.23491710424423218, + "learning_rate": 9.51649340943063e-05, + "loss": 4.116, + "step": 5700 + }, + { + "epoch": 0.3876206006250849, + "grad_norm": 0.18439505994319916, + "learning_rate": 9.516068759342303e-05, + "loss": 4.1069, + "step": 5705 + }, + { + "epoch": 0.3879603206957467, + "grad_norm": 0.18807227909564972, + "learning_rate": 9.515644109253975e-05, + "loss": 4.104, + "step": 5710 + }, + { + "epoch": 0.3883000407664085, + "grad_norm": 0.4963626265525818, + "learning_rate": 9.515219459165648e-05, + "loss": 4.1994, + "step": 5715 + }, + { + "epoch": 0.38863976083707025, + "grad_norm": 0.24339251220226288, + "learning_rate": 9.514794809077321e-05, + "loss": 4.1082, + "step": 5720 + }, + { + "epoch": 0.38897948090773204, + "grad_norm": 0.17436154186725616, + "learning_rate": 9.514370158988994e-05, + "loss": 4.1256, + "step": 5725 + }, + { + "epoch": 0.3893192009783938, + "grad_norm": 0.2445308268070221, + "learning_rate": 9.513945508900667e-05, + "loss": 4.0222, + "step": 5730 + }, + { + "epoch": 0.3896589210490556, + "grad_norm": 0.6241475939750671, + "learning_rate": 9.51352085881234e-05, + "loss": 4.1132, + "step": 5735 + }, + { + "epoch": 0.3899986411197173, + "grad_norm": 0.16763907670974731, + "learning_rate": 9.513096208724012e-05, + "loss": 4.3379, + "step": 5740 + }, + { + "epoch": 0.3903383611903791, + "grad_norm": 0.1730974316596985, + "learning_rate": 9.512671558635685e-05, + "loss": 4.1179, + "step": 5745 + }, + { + "epoch": 0.3906780812610409, + "grad_norm": 0.19016407430171967, + "learning_rate": 9.512246908547358e-05, + "loss": 4.0632, + "step": 5750 + }, + { + "epoch": 0.39101780133170266, + "grad_norm": 0.19713403284549713, + "learning_rate": 9.51182225845903e-05, + "loss": 4.088, + "step": 5755 + }, + { + "epoch": 0.39135752140236446, + "grad_norm": 0.26910024881362915, + "learning_rate": 9.511397608370703e-05, + "loss": 4.2572, + "step": 5760 + }, + { + "epoch": 0.3916972414730262, + "grad_norm": 0.20750823616981506, + "learning_rate": 9.510972958282376e-05, + "loss": 4.0017, + "step": 5765 + }, + { + "epoch": 0.392036961543688, + "grad_norm": 0.18255822360515594, + "learning_rate": 9.510548308194049e-05, + "loss": 4.4284, + "step": 5770 + }, + { + "epoch": 0.3923766816143498, + "grad_norm": 0.19282065331935883, + "learning_rate": 9.510123658105722e-05, + "loss": 4.1651, + "step": 5775 + }, + { + "epoch": 0.39271640168501154, + "grad_norm": 0.22694478929042816, + "learning_rate": 9.509699008017395e-05, + "loss": 4.0152, + "step": 5780 + }, + { + "epoch": 0.39305612175567334, + "grad_norm": 0.2607446014881134, + "learning_rate": 9.509274357929066e-05, + "loss": 4.3358, + "step": 5785 + }, + { + "epoch": 0.3933958418263351, + "grad_norm": 0.22173616290092468, + "learning_rate": 9.50884970784074e-05, + "loss": 4.2362, + "step": 5790 + }, + { + "epoch": 0.3937355618969969, + "grad_norm": 0.20545057952404022, + "learning_rate": 9.508425057752413e-05, + "loss": 4.0821, + "step": 5795 + }, + { + "epoch": 0.3940752819676587, + "grad_norm": 0.23421698808670044, + "learning_rate": 9.508000407664084e-05, + "loss": 4.2178, + "step": 5800 + }, + { + "epoch": 0.3944150020383204, + "grad_norm": 0.2095632702112198, + "learning_rate": 9.507575757575759e-05, + "loss": 4.2833, + "step": 5805 + }, + { + "epoch": 0.3947547221089822, + "grad_norm": 0.23404939472675323, + "learning_rate": 9.507151107487431e-05, + "loss": 4.0823, + "step": 5810 + }, + { + "epoch": 0.39509444217964396, + "grad_norm": 0.23966114223003387, + "learning_rate": 9.506726457399103e-05, + "loss": 3.8059, + "step": 5815 + }, + { + "epoch": 0.39543416225030575, + "grad_norm": 0.2027054876089096, + "learning_rate": 9.506301807310777e-05, + "loss": 4.2229, + "step": 5820 + }, + { + "epoch": 0.3957738823209675, + "grad_norm": 0.18689711391925812, + "learning_rate": 9.50587715722245e-05, + "loss": 4.26, + "step": 5825 + }, + { + "epoch": 0.3961136023916293, + "grad_norm": 0.263927698135376, + "learning_rate": 9.505452507134121e-05, + "loss": 4.021, + "step": 5830 + }, + { + "epoch": 0.3964533224622911, + "grad_norm": 0.18399837613105774, + "learning_rate": 9.505027857045795e-05, + "loss": 4.218, + "step": 5835 + }, + { + "epoch": 0.39679304253295283, + "grad_norm": 0.17031966149806976, + "learning_rate": 9.504603206957468e-05, + "loss": 3.9079, + "step": 5840 + }, + { + "epoch": 0.39713276260361463, + "grad_norm": 0.3210891783237457, + "learning_rate": 9.50417855686914e-05, + "loss": 3.9466, + "step": 5845 + }, + { + "epoch": 0.3974724826742764, + "grad_norm": 0.1981404423713684, + "learning_rate": 9.503753906780814e-05, + "loss": 4.001, + "step": 5850 + }, + { + "epoch": 0.39781220274493817, + "grad_norm": 0.3136885464191437, + "learning_rate": 9.503329256692485e-05, + "loss": 4.058, + "step": 5855 + }, + { + "epoch": 0.39815192281559997, + "grad_norm": 2.190765857696533, + "learning_rate": 9.502904606604158e-05, + "loss": 4.0696, + "step": 5860 + }, + { + "epoch": 0.3984916428862617, + "grad_norm": 0.17315542697906494, + "learning_rate": 9.502479956515832e-05, + "loss": 4.2626, + "step": 5865 + }, + { + "epoch": 0.3988313629569235, + "grad_norm": 0.33235201239585876, + "learning_rate": 9.502055306427504e-05, + "loss": 3.9844, + "step": 5870 + }, + { + "epoch": 0.39917108302758525, + "grad_norm": 0.23391257226467133, + "learning_rate": 9.501630656339176e-05, + "loss": 3.8401, + "step": 5875 + }, + { + "epoch": 0.39951080309824705, + "grad_norm": 0.24006325006484985, + "learning_rate": 9.50120600625085e-05, + "loss": 4.0724, + "step": 5880 + }, + { + "epoch": 0.39985052316890884, + "grad_norm": 0.17999830842018127, + "learning_rate": 9.500781356162522e-05, + "loss": 4.2167, + "step": 5885 + }, + { + "epoch": 0.4001902432395706, + "grad_norm": 0.18070223927497864, + "learning_rate": 9.500356706074195e-05, + "loss": 4.105, + "step": 5890 + }, + { + "epoch": 0.4005299633102324, + "grad_norm": 0.19634656608104706, + "learning_rate": 9.499932055985869e-05, + "loss": 3.989, + "step": 5895 + }, + { + "epoch": 0.4008696833808941, + "grad_norm": 0.23722241818904877, + "learning_rate": 9.49950740589754e-05, + "loss": 4.1728, + "step": 5900 + }, + { + "epoch": 0.4012094034515559, + "grad_norm": 0.19146768748760223, + "learning_rate": 9.499082755809213e-05, + "loss": 4.1732, + "step": 5905 + }, + { + "epoch": 0.40154912352221767, + "grad_norm": 0.21835100650787354, + "learning_rate": 9.498658105720887e-05, + "loss": 4.0836, + "step": 5910 + }, + { + "epoch": 0.40188884359287946, + "grad_norm": 0.17060807347297668, + "learning_rate": 9.498233455632559e-05, + "loss": 4.0794, + "step": 5915 + }, + { + "epoch": 0.40222856366354126, + "grad_norm": 0.9648451805114746, + "learning_rate": 9.497808805544232e-05, + "loss": 4.1605, + "step": 5920 + }, + { + "epoch": 0.402568283734203, + "grad_norm": 0.1983519345521927, + "learning_rate": 9.497384155455904e-05, + "loss": 3.9843, + "step": 5925 + }, + { + "epoch": 0.4029080038048648, + "grad_norm": 0.23070013523101807, + "learning_rate": 9.496959505367577e-05, + "loss": 3.9679, + "step": 5930 + }, + { + "epoch": 0.40324772387552654, + "grad_norm": 1.1979519128799438, + "learning_rate": 9.49653485527925e-05, + "loss": 4.1856, + "step": 5935 + }, + { + "epoch": 0.40358744394618834, + "grad_norm": 0.7521958947181702, + "learning_rate": 9.496110205190923e-05, + "loss": 4.082, + "step": 5940 + }, + { + "epoch": 0.40392716401685014, + "grad_norm": 0.192143052816391, + "learning_rate": 9.495685555102596e-05, + "loss": 4.2121, + "step": 5945 + }, + { + "epoch": 0.4042668840875119, + "grad_norm": 0.2611311972141266, + "learning_rate": 9.495260905014268e-05, + "loss": 4.2154, + "step": 5950 + }, + { + "epoch": 0.4046066041581737, + "grad_norm": 0.18073415756225586, + "learning_rate": 9.494836254925941e-05, + "loss": 4.1762, + "step": 5955 + }, + { + "epoch": 0.4049463242288354, + "grad_norm": 0.1921936720609665, + "learning_rate": 9.494411604837614e-05, + "loss": 4.0705, + "step": 5960 + }, + { + "epoch": 0.4052860442994972, + "grad_norm": 0.16377374529838562, + "learning_rate": 9.493986954749287e-05, + "loss": 4.1785, + "step": 5965 + }, + { + "epoch": 0.405625764370159, + "grad_norm": 0.21104343235492706, + "learning_rate": 9.49356230466096e-05, + "loss": 4.2715, + "step": 5970 + }, + { + "epoch": 0.40596548444082076, + "grad_norm": 0.2071741223335266, + "learning_rate": 9.493137654572632e-05, + "loss": 3.9067, + "step": 5975 + }, + { + "epoch": 0.40630520451148255, + "grad_norm": 0.22247660160064697, + "learning_rate": 9.492713004484305e-05, + "loss": 4.0561, + "step": 5980 + }, + { + "epoch": 0.4066449245821443, + "grad_norm": 0.20433616638183594, + "learning_rate": 9.492288354395978e-05, + "loss": 4.0168, + "step": 5985 + }, + { + "epoch": 0.4069846446528061, + "grad_norm": 0.2049606889486313, + "learning_rate": 9.491863704307651e-05, + "loss": 4.3927, + "step": 5990 + }, + { + "epoch": 0.40732436472346784, + "grad_norm": 0.22720476984977722, + "learning_rate": 9.491439054219324e-05, + "loss": 4.2648, + "step": 5995 + }, + { + "epoch": 0.40766408479412963, + "grad_norm": 0.25233665108680725, + "learning_rate": 9.491014404130996e-05, + "loss": 3.9343, + "step": 6000 + }, + { + "epoch": 0.40800380486479143, + "grad_norm": 0.21542391180992126, + "learning_rate": 9.490589754042669e-05, + "loss": 4.2302, + "step": 6005 + }, + { + "epoch": 0.4083435249354532, + "grad_norm": 0.8740308284759521, + "learning_rate": 9.490165103954342e-05, + "loss": 4.0408, + "step": 6010 + }, + { + "epoch": 0.40868324500611497, + "grad_norm": 0.18519490957260132, + "learning_rate": 9.489740453866015e-05, + "loss": 4.0773, + "step": 6015 + }, + { + "epoch": 0.4090229650767767, + "grad_norm": 0.2651638388633728, + "learning_rate": 9.489315803777688e-05, + "loss": 4.1498, + "step": 6020 + }, + { + "epoch": 0.4093626851474385, + "grad_norm": 0.4929540157318115, + "learning_rate": 9.48889115368936e-05, + "loss": 3.9128, + "step": 6025 + }, + { + "epoch": 0.4097024052181003, + "grad_norm": 0.20049385726451874, + "learning_rate": 9.488466503601033e-05, + "loss": 4.0097, + "step": 6030 + }, + { + "epoch": 0.41004212528876205, + "grad_norm": 0.17493902146816254, + "learning_rate": 9.488041853512706e-05, + "loss": 4.1654, + "step": 6035 + }, + { + "epoch": 0.41038184535942385, + "grad_norm": 0.30577751994132996, + "learning_rate": 9.487617203424379e-05, + "loss": 4.0563, + "step": 6040 + }, + { + "epoch": 0.4107215654300856, + "grad_norm": 0.2669510543346405, + "learning_rate": 9.487192553336052e-05, + "loss": 4.0721, + "step": 6045 + }, + { + "epoch": 0.4110612855007474, + "grad_norm": 0.18722812831401825, + "learning_rate": 9.486767903247724e-05, + "loss": 4.1749, + "step": 6050 + }, + { + "epoch": 0.4114010055714092, + "grad_norm": 0.1755664199590683, + "learning_rate": 9.486343253159397e-05, + "loss": 4.1804, + "step": 6055 + }, + { + "epoch": 0.4117407256420709, + "grad_norm": 0.27995565533638, + "learning_rate": 9.48591860307107e-05, + "loss": 3.9794, + "step": 6060 + }, + { + "epoch": 0.4120804457127327, + "grad_norm": 0.2518627345561981, + "learning_rate": 9.485493952982743e-05, + "loss": 4.2129, + "step": 6065 + }, + { + "epoch": 0.41242016578339447, + "grad_norm": 0.21856893599033356, + "learning_rate": 9.485069302894416e-05, + "loss": 4.3444, + "step": 6070 + }, + { + "epoch": 0.41275988585405626, + "grad_norm": 1.2819626331329346, + "learning_rate": 9.484644652806088e-05, + "loss": 3.6595, + "step": 6075 + }, + { + "epoch": 0.413099605924718, + "grad_norm": 0.25162968039512634, + "learning_rate": 9.484220002717761e-05, + "loss": 4.2169, + "step": 6080 + }, + { + "epoch": 0.4134393259953798, + "grad_norm": 0.2659519910812378, + "learning_rate": 9.483795352629434e-05, + "loss": 4.0789, + "step": 6085 + }, + { + "epoch": 0.4137790460660416, + "grad_norm": 0.20415237545967102, + "learning_rate": 9.483370702541107e-05, + "loss": 4.2934, + "step": 6090 + }, + { + "epoch": 0.41411876613670334, + "grad_norm": 0.33283525705337524, + "learning_rate": 9.48294605245278e-05, + "loss": 4.0536, + "step": 6095 + }, + { + "epoch": 0.41445848620736514, + "grad_norm": 0.21782909333705902, + "learning_rate": 9.482521402364452e-05, + "loss": 3.9827, + "step": 6100 + }, + { + "epoch": 0.4147982062780269, + "grad_norm": 4.150507926940918, + "learning_rate": 9.482096752276125e-05, + "loss": 3.9654, + "step": 6105 + }, + { + "epoch": 0.4151379263486887, + "grad_norm": 0.20316611230373383, + "learning_rate": 9.481672102187798e-05, + "loss": 4.045, + "step": 6110 + }, + { + "epoch": 0.4154776464193505, + "grad_norm": 1.9692164659500122, + "learning_rate": 9.481247452099471e-05, + "loss": 4.0912, + "step": 6115 + }, + { + "epoch": 0.4158173664900122, + "grad_norm": 0.21312934160232544, + "learning_rate": 9.480822802011144e-05, + "loss": 4.2765, + "step": 6120 + }, + { + "epoch": 0.416157086560674, + "grad_norm": 0.1886243224143982, + "learning_rate": 9.480398151922815e-05, + "loss": 4.0305, + "step": 6125 + }, + { + "epoch": 0.41649680663133576, + "grad_norm": 0.22211480140686035, + "learning_rate": 9.479973501834489e-05, + "loss": 3.906, + "step": 6130 + }, + { + "epoch": 0.41683652670199756, + "grad_norm": 0.24448561668395996, + "learning_rate": 9.479548851746162e-05, + "loss": 3.9652, + "step": 6135 + }, + { + "epoch": 0.41717624677265935, + "grad_norm": 0.2330089956521988, + "learning_rate": 9.479124201657834e-05, + "loss": 4.0935, + "step": 6140 + }, + { + "epoch": 0.4175159668433211, + "grad_norm": 0.354692280292511, + "learning_rate": 9.478699551569508e-05, + "loss": 4.1893, + "step": 6145 + }, + { + "epoch": 0.4178556869139829, + "grad_norm": 0.31616339087486267, + "learning_rate": 9.47827490148118e-05, + "loss": 4.1393, + "step": 6150 + }, + { + "epoch": 0.41819540698464464, + "grad_norm": 0.6363674402236938, + "learning_rate": 9.477850251392852e-05, + "loss": 4.1578, + "step": 6155 + }, + { + "epoch": 0.41853512705530643, + "grad_norm": 0.19385862350463867, + "learning_rate": 9.477425601304526e-05, + "loss": 4.0994, + "step": 6160 + }, + { + "epoch": 0.4188748471259682, + "grad_norm": 0.20381571352481842, + "learning_rate": 9.477000951216199e-05, + "loss": 4.1873, + "step": 6165 + }, + { + "epoch": 0.41921456719663, + "grad_norm": 0.20795594155788422, + "learning_rate": 9.47657630112787e-05, + "loss": 4.0544, + "step": 6170 + }, + { + "epoch": 0.41955428726729177, + "grad_norm": 0.3839801847934723, + "learning_rate": 9.476151651039544e-05, + "loss": 4.1777, + "step": 6175 + }, + { + "epoch": 0.4198940073379535, + "grad_norm": 0.2491442710161209, + "learning_rate": 9.475727000951217e-05, + "loss": 3.9298, + "step": 6180 + }, + { + "epoch": 0.4202337274086153, + "grad_norm": 0.1739528328180313, + "learning_rate": 9.475302350862889e-05, + "loss": 4.0482, + "step": 6185 + }, + { + "epoch": 0.42057344747927705, + "grad_norm": 0.19341996312141418, + "learning_rate": 9.474877700774563e-05, + "loss": 3.911, + "step": 6190 + }, + { + "epoch": 0.42091316754993885, + "grad_norm": 0.16241292655467987, + "learning_rate": 9.474453050686236e-05, + "loss": 4.0751, + "step": 6195 + }, + { + "epoch": 0.42125288762060065, + "grad_norm": 0.16985565423965454, + "learning_rate": 9.474028400597907e-05, + "loss": 3.9359, + "step": 6200 + }, + { + "epoch": 0.4215926076912624, + "grad_norm": 0.21724484860897064, + "learning_rate": 9.473603750509581e-05, + "loss": 4.0409, + "step": 6205 + }, + { + "epoch": 0.4219323277619242, + "grad_norm": 0.21480692923069, + "learning_rate": 9.473179100421253e-05, + "loss": 3.987, + "step": 6210 + }, + { + "epoch": 0.42227204783258593, + "grad_norm": 0.2604687809944153, + "learning_rate": 9.472754450332926e-05, + "loss": 3.89, + "step": 6215 + }, + { + "epoch": 0.4226117679032477, + "grad_norm": 0.22292381525039673, + "learning_rate": 9.4723298002446e-05, + "loss": 4.029, + "step": 6220 + }, + { + "epoch": 0.4229514879739095, + "grad_norm": 0.2695325016975403, + "learning_rate": 9.471905150156271e-05, + "loss": 4.0199, + "step": 6225 + }, + { + "epoch": 0.42329120804457127, + "grad_norm": 0.17921492457389832, + "learning_rate": 9.471480500067944e-05, + "loss": 4.0195, + "step": 6230 + }, + { + "epoch": 0.42363092811523306, + "grad_norm": 0.29654955863952637, + "learning_rate": 9.471055849979618e-05, + "loss": 4.2356, + "step": 6235 + }, + { + "epoch": 0.4239706481858948, + "grad_norm": 0.3091282844543457, + "learning_rate": 9.47063119989129e-05, + "loss": 3.8808, + "step": 6240 + }, + { + "epoch": 0.4243103682565566, + "grad_norm": 0.20580576360225677, + "learning_rate": 9.470206549802962e-05, + "loss": 4.0886, + "step": 6245 + }, + { + "epoch": 0.42465008832721834, + "grad_norm": 0.19273176789283752, + "learning_rate": 9.469781899714636e-05, + "loss": 3.9993, + "step": 6250 + }, + { + "epoch": 0.42498980839788014, + "grad_norm": 0.18639002740383148, + "learning_rate": 9.469357249626308e-05, + "loss": 4.0441, + "step": 6255 + }, + { + "epoch": 0.42532952846854194, + "grad_norm": 2.978999614715576, + "learning_rate": 9.468932599537981e-05, + "loss": 3.7926, + "step": 6260 + }, + { + "epoch": 0.4256692485392037, + "grad_norm": 0.17055857181549072, + "learning_rate": 9.468507949449655e-05, + "loss": 4.0499, + "step": 6265 + }, + { + "epoch": 0.4260089686098655, + "grad_norm": 1.0542086362838745, + "learning_rate": 9.468083299361326e-05, + "loss": 4.2183, + "step": 6270 + }, + { + "epoch": 0.4263486886805272, + "grad_norm": 1.1902449131011963, + "learning_rate": 9.467658649272999e-05, + "loss": 3.9083, + "step": 6275 + }, + { + "epoch": 0.426688408751189, + "grad_norm": 0.26639068126678467, + "learning_rate": 9.467233999184672e-05, + "loss": 3.9113, + "step": 6280 + }, + { + "epoch": 0.4270281288218508, + "grad_norm": 0.2732694149017334, + "learning_rate": 9.466809349096345e-05, + "loss": 4.1468, + "step": 6285 + }, + { + "epoch": 0.42736784889251256, + "grad_norm": 0.27546462416648865, + "learning_rate": 9.466384699008018e-05, + "loss": 3.9733, + "step": 6290 + }, + { + "epoch": 0.42770756896317436, + "grad_norm": 0.1994089037179947, + "learning_rate": 9.46596004891969e-05, + "loss": 4.1123, + "step": 6295 + }, + { + "epoch": 0.4280472890338361, + "grad_norm": 0.20160436630249023, + "learning_rate": 9.465535398831363e-05, + "loss": 4.0467, + "step": 6300 + }, + { + "epoch": 0.4283870091044979, + "grad_norm": 0.2255067527294159, + "learning_rate": 9.465110748743036e-05, + "loss": 3.7712, + "step": 6305 + }, + { + "epoch": 0.4287267291751597, + "grad_norm": 0.1755346655845642, + "learning_rate": 9.464686098654709e-05, + "loss": 4.3119, + "step": 6310 + }, + { + "epoch": 0.42906644924582144, + "grad_norm": 0.16779784858226776, + "learning_rate": 9.464261448566382e-05, + "loss": 4.1447, + "step": 6315 + }, + { + "epoch": 0.42940616931648323, + "grad_norm": 0.2779237926006317, + "learning_rate": 9.463836798478054e-05, + "loss": 4.1285, + "step": 6320 + }, + { + "epoch": 0.429745889387145, + "grad_norm": 0.21850286424160004, + "learning_rate": 9.463412148389727e-05, + "loss": 4.118, + "step": 6325 + }, + { + "epoch": 0.4300856094578068, + "grad_norm": 0.2127694934606552, + "learning_rate": 9.4629874983014e-05, + "loss": 4.1073, + "step": 6330 + }, + { + "epoch": 0.4304253295284685, + "grad_norm": 0.24460366368293762, + "learning_rate": 9.462562848213073e-05, + "loss": 4.0523, + "step": 6335 + }, + { + "epoch": 0.4307650495991303, + "grad_norm": 0.18039904534816742, + "learning_rate": 9.462138198124746e-05, + "loss": 4.205, + "step": 6340 + }, + { + "epoch": 0.4311047696697921, + "grad_norm": 0.23940406739711761, + "learning_rate": 9.461713548036418e-05, + "loss": 4.0222, + "step": 6345 + }, + { + "epoch": 0.43144448974045385, + "grad_norm": 0.223390132188797, + "learning_rate": 9.461288897948091e-05, + "loss": 4.0133, + "step": 6350 + }, + { + "epoch": 0.43178420981111565, + "grad_norm": 0.20645423233509064, + "learning_rate": 9.460864247859764e-05, + "loss": 3.9967, + "step": 6355 + }, + { + "epoch": 0.4321239298817774, + "grad_norm": 0.7943032383918762, + "learning_rate": 9.460439597771437e-05, + "loss": 3.927, + "step": 6360 + }, + { + "epoch": 0.4324636499524392, + "grad_norm": 0.1684955656528473, + "learning_rate": 9.46001494768311e-05, + "loss": 4.0099, + "step": 6365 + }, + { + "epoch": 0.432803370023101, + "grad_norm": 0.21439214050769806, + "learning_rate": 9.459590297594782e-05, + "loss": 3.7963, + "step": 6370 + }, + { + "epoch": 0.43314309009376273, + "grad_norm": 0.21478402614593506, + "learning_rate": 9.459165647506455e-05, + "loss": 4.0739, + "step": 6375 + }, + { + "epoch": 0.4334828101644245, + "grad_norm": 0.8936269283294678, + "learning_rate": 9.458740997418128e-05, + "loss": 4.1838, + "step": 6380 + }, + { + "epoch": 0.43382253023508627, + "grad_norm": 0.5096725821495056, + "learning_rate": 9.458316347329801e-05, + "loss": 4.047, + "step": 6385 + }, + { + "epoch": 0.43416225030574807, + "grad_norm": 0.285972535610199, + "learning_rate": 9.457891697241474e-05, + "loss": 4.1083, + "step": 6390 + }, + { + "epoch": 0.43450197037640986, + "grad_norm": 0.1650124043226242, + "learning_rate": 9.457467047153146e-05, + "loss": 4.157, + "step": 6395 + }, + { + "epoch": 0.4348416904470716, + "grad_norm": 0.19191978871822357, + "learning_rate": 9.457042397064819e-05, + "loss": 3.9201, + "step": 6400 + }, + { + "epoch": 0.4351814105177334, + "grad_norm": 0.20855942368507385, + "learning_rate": 9.456617746976492e-05, + "loss": 4.1146, + "step": 6405 + }, + { + "epoch": 0.43552113058839514, + "grad_norm": 0.17791011929512024, + "learning_rate": 9.456193096888165e-05, + "loss": 3.9971, + "step": 6410 + }, + { + "epoch": 0.43586085065905694, + "grad_norm": 0.2120276242494583, + "learning_rate": 9.455768446799838e-05, + "loss": 4.0967, + "step": 6415 + }, + { + "epoch": 0.4362005707297187, + "grad_norm": 0.20230713486671448, + "learning_rate": 9.45534379671151e-05, + "loss": 3.902, + "step": 6420 + }, + { + "epoch": 0.4365402908003805, + "grad_norm": 0.5752553343772888, + "learning_rate": 9.454919146623183e-05, + "loss": 3.9801, + "step": 6425 + }, + { + "epoch": 0.4368800108710423, + "grad_norm": 0.19792981445789337, + "learning_rate": 9.454494496534856e-05, + "loss": 4.2249, + "step": 6430 + }, + { + "epoch": 0.437219730941704, + "grad_norm": 0.20685099065303802, + "learning_rate": 9.454069846446529e-05, + "loss": 4.2953, + "step": 6435 + }, + { + "epoch": 0.4375594510123658, + "grad_norm": 0.22175006568431854, + "learning_rate": 9.453645196358202e-05, + "loss": 3.9621, + "step": 6440 + }, + { + "epoch": 0.43789917108302756, + "grad_norm": 0.4952181875705719, + "learning_rate": 9.453220546269874e-05, + "loss": 3.986, + "step": 6445 + }, + { + "epoch": 0.43823889115368936, + "grad_norm": 0.20618560910224915, + "learning_rate": 9.452795896181547e-05, + "loss": 3.9802, + "step": 6450 + }, + { + "epoch": 0.43857861122435116, + "grad_norm": 1.136326551437378, + "learning_rate": 9.45237124609322e-05, + "loss": 4.2695, + "step": 6455 + }, + { + "epoch": 0.4389183312950129, + "grad_norm": 0.22814400494098663, + "learning_rate": 9.451946596004893e-05, + "loss": 3.8791, + "step": 6460 + }, + { + "epoch": 0.4392580513656747, + "grad_norm": 0.24193866550922394, + "learning_rate": 9.451521945916566e-05, + "loss": 3.9838, + "step": 6465 + }, + { + "epoch": 0.43959777143633644, + "grad_norm": 0.25064903497695923, + "learning_rate": 9.451097295828238e-05, + "loss": 3.9355, + "step": 6470 + }, + { + "epoch": 0.43993749150699824, + "grad_norm": 1.2106342315673828, + "learning_rate": 9.450672645739911e-05, + "loss": 4.1844, + "step": 6475 + }, + { + "epoch": 0.44027721157766003, + "grad_norm": 0.16492827236652374, + "learning_rate": 9.450247995651583e-05, + "loss": 4.2447, + "step": 6480 + }, + { + "epoch": 0.4406169316483218, + "grad_norm": 0.1795361191034317, + "learning_rate": 9.449823345563257e-05, + "loss": 3.8589, + "step": 6485 + }, + { + "epoch": 0.44095665171898357, + "grad_norm": 0.19358831644058228, + "learning_rate": 9.44939869547493e-05, + "loss": 4.2963, + "step": 6490 + }, + { + "epoch": 0.4412963717896453, + "grad_norm": 0.2732826769351959, + "learning_rate": 9.448974045386601e-05, + "loss": 3.9789, + "step": 6495 + }, + { + "epoch": 0.4416360918603071, + "grad_norm": 0.23638220131397247, + "learning_rate": 9.448549395298275e-05, + "loss": 4.2448, + "step": 6500 + }, + { + "epoch": 0.4419758119309689, + "grad_norm": 0.2072085738182068, + "learning_rate": 9.448124745209948e-05, + "loss": 3.8356, + "step": 6505 + }, + { + "epoch": 0.44231553200163065, + "grad_norm": 3.1101341247558594, + "learning_rate": 9.44770009512162e-05, + "loss": 4.2411, + "step": 6510 + }, + { + "epoch": 0.44265525207229245, + "grad_norm": 0.4264751374721527, + "learning_rate": 9.447275445033294e-05, + "loss": 3.9676, + "step": 6515 + }, + { + "epoch": 0.4429949721429542, + "grad_norm": 0.20776435732841492, + "learning_rate": 9.446850794944966e-05, + "loss": 3.8493, + "step": 6520 + }, + { + "epoch": 0.443334692213616, + "grad_norm": 0.3044533133506775, + "learning_rate": 9.446426144856638e-05, + "loss": 4.1147, + "step": 6525 + }, + { + "epoch": 0.44367441228427773, + "grad_norm": 0.16665169596672058, + "learning_rate": 9.446001494768312e-05, + "loss": 3.9521, + "step": 6530 + }, + { + "epoch": 0.44401413235493953, + "grad_norm": 0.2023710161447525, + "learning_rate": 9.445576844679985e-05, + "loss": 3.9206, + "step": 6535 + }, + { + "epoch": 0.4443538524256013, + "grad_norm": 0.4145415425300598, + "learning_rate": 9.445152194591656e-05, + "loss": 4.0138, + "step": 6540 + }, + { + "epoch": 0.44469357249626307, + "grad_norm": 0.16682837903499603, + "learning_rate": 9.44472754450333e-05, + "loss": 4.0957, + "step": 6545 + }, + { + "epoch": 0.44503329256692487, + "grad_norm": 0.2003334015607834, + "learning_rate": 9.444302894415002e-05, + "loss": 3.8427, + "step": 6550 + }, + { + "epoch": 0.4453730126375866, + "grad_norm": 0.29585328698158264, + "learning_rate": 9.443878244326675e-05, + "loss": 4.2463, + "step": 6555 + }, + { + "epoch": 0.4457127327082484, + "grad_norm": 0.20154406130313873, + "learning_rate": 9.443453594238349e-05, + "loss": 4.2644, + "step": 6560 + }, + { + "epoch": 0.4460524527789102, + "grad_norm": 0.23148468136787415, + "learning_rate": 9.44302894415002e-05, + "loss": 3.9205, + "step": 6565 + }, + { + "epoch": 0.44639217284957194, + "grad_norm": 0.1762179434299469, + "learning_rate": 9.442604294061693e-05, + "loss": 4.0293, + "step": 6570 + }, + { + "epoch": 0.44673189292023374, + "grad_norm": 0.4714028835296631, + "learning_rate": 9.442179643973367e-05, + "loss": 4.2011, + "step": 6575 + }, + { + "epoch": 0.4470716129908955, + "grad_norm": 0.368407666683197, + "learning_rate": 9.441754993885039e-05, + "loss": 4.0047, + "step": 6580 + }, + { + "epoch": 0.4474113330615573, + "grad_norm": 0.28887784481048584, + "learning_rate": 9.441330343796711e-05, + "loss": 4.0332, + "step": 6585 + }, + { + "epoch": 0.4477510531322191, + "grad_norm": 0.25729164481163025, + "learning_rate": 9.440905693708386e-05, + "loss": 4.0735, + "step": 6590 + }, + { + "epoch": 0.4480907732028808, + "grad_norm": 0.1723019927740097, + "learning_rate": 9.440481043620057e-05, + "loss": 4.0399, + "step": 6595 + }, + { + "epoch": 0.4484304932735426, + "grad_norm": 0.2043658047914505, + "learning_rate": 9.44005639353173e-05, + "loss": 4.2346, + "step": 6600 + }, + { + "epoch": 0.44877021334420436, + "grad_norm": 0.15108297765254974, + "learning_rate": 9.439631743443404e-05, + "loss": 3.9109, + "step": 6605 + }, + { + "epoch": 0.44910993341486616, + "grad_norm": 0.18265971541404724, + "learning_rate": 9.439207093355075e-05, + "loss": 3.845, + "step": 6610 + }, + { + "epoch": 0.4494496534855279, + "grad_norm": 0.9520887732505798, + "learning_rate": 9.438782443266748e-05, + "loss": 4.2181, + "step": 6615 + }, + { + "epoch": 0.4497893735561897, + "grad_norm": 0.28121015429496765, + "learning_rate": 9.438357793178422e-05, + "loss": 4.0234, + "step": 6620 + }, + { + "epoch": 0.4501290936268515, + "grad_norm": 0.21010081470012665, + "learning_rate": 9.437933143090094e-05, + "loss": 4.1509, + "step": 6625 + }, + { + "epoch": 0.45046881369751324, + "grad_norm": 0.23798321187496185, + "learning_rate": 9.437508493001767e-05, + "loss": 4.2879, + "step": 6630 + }, + { + "epoch": 0.45080853376817503, + "grad_norm": 0.20470625162124634, + "learning_rate": 9.43708384291344e-05, + "loss": 4.0134, + "step": 6635 + }, + { + "epoch": 0.4511482538388368, + "grad_norm": 0.19223268330097198, + "learning_rate": 9.436659192825112e-05, + "loss": 4.1928, + "step": 6640 + }, + { + "epoch": 0.4514879739094986, + "grad_norm": 0.1938367336988449, + "learning_rate": 9.436234542736785e-05, + "loss": 3.923, + "step": 6645 + }, + { + "epoch": 0.45182769398016037, + "grad_norm": 0.2842964231967926, + "learning_rate": 9.435809892648458e-05, + "loss": 4.2081, + "step": 6650 + }, + { + "epoch": 0.4521674140508221, + "grad_norm": 0.22615915536880493, + "learning_rate": 9.43538524256013e-05, + "loss": 4.0994, + "step": 6655 + }, + { + "epoch": 0.4525071341214839, + "grad_norm": 0.23465953767299652, + "learning_rate": 9.434960592471803e-05, + "loss": 3.8209, + "step": 6660 + }, + { + "epoch": 0.45284685419214565, + "grad_norm": 0.17599263787269592, + "learning_rate": 9.434535942383476e-05, + "loss": 3.8742, + "step": 6665 + }, + { + "epoch": 0.45318657426280745, + "grad_norm": 0.5463417172431946, + "learning_rate": 9.434111292295149e-05, + "loss": 4.1574, + "step": 6670 + }, + { + "epoch": 0.45352629433346925, + "grad_norm": 0.21346516907215118, + "learning_rate": 9.433686642206822e-05, + "loss": 4.3701, + "step": 6675 + }, + { + "epoch": 0.453866014404131, + "grad_norm": 0.2235599011182785, + "learning_rate": 9.433261992118495e-05, + "loss": 4.1951, + "step": 6680 + }, + { + "epoch": 0.4542057344747928, + "grad_norm": 0.2730211615562439, + "learning_rate": 9.432837342030167e-05, + "loss": 3.8224, + "step": 6685 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 0.1868310272693634, + "learning_rate": 9.43241269194184e-05, + "loss": 3.7951, + "step": 6690 + }, + { + "epoch": 0.45488517461611633, + "grad_norm": 0.3626730442047119, + "learning_rate": 9.431988041853513e-05, + "loss": 4.1127, + "step": 6695 + }, + { + "epoch": 0.45522489468677807, + "grad_norm": 0.22474856674671173, + "learning_rate": 9.431563391765186e-05, + "loss": 3.8253, + "step": 6700 + }, + { + "epoch": 0.45556461475743987, + "grad_norm": 0.3556784689426422, + "learning_rate": 9.431138741676859e-05, + "loss": 4.0104, + "step": 6705 + }, + { + "epoch": 0.45590433482810166, + "grad_norm": 1.793366551399231, + "learning_rate": 9.430714091588531e-05, + "loss": 4.1386, + "step": 6710 + }, + { + "epoch": 0.4562440548987634, + "grad_norm": 0.18291668593883514, + "learning_rate": 9.430289441500204e-05, + "loss": 4.118, + "step": 6715 + }, + { + "epoch": 0.4565837749694252, + "grad_norm": 0.3133949339389801, + "learning_rate": 9.429864791411877e-05, + "loss": 3.9527, + "step": 6720 + }, + { + "epoch": 0.45692349504008695, + "grad_norm": 0.2146921306848526, + "learning_rate": 9.42944014132355e-05, + "loss": 3.9596, + "step": 6725 + }, + { + "epoch": 0.45726321511074874, + "grad_norm": 0.17036296427249908, + "learning_rate": 9.429015491235223e-05, + "loss": 4.0381, + "step": 6730 + }, + { + "epoch": 0.45760293518141054, + "grad_norm": 0.32481151819229126, + "learning_rate": 9.428590841146895e-05, + "loss": 4.0932, + "step": 6735 + }, + { + "epoch": 0.4579426552520723, + "grad_norm": 0.18955262005329132, + "learning_rate": 9.428166191058568e-05, + "loss": 4.1081, + "step": 6740 + }, + { + "epoch": 0.4582823753227341, + "grad_norm": 0.2482958883047104, + "learning_rate": 9.427741540970241e-05, + "loss": 4.036, + "step": 6745 + }, + { + "epoch": 0.4586220953933958, + "grad_norm": 0.1786300390958786, + "learning_rate": 9.427316890881914e-05, + "loss": 4.2296, + "step": 6750 + }, + { + "epoch": 0.4589618154640576, + "grad_norm": 0.19114576280117035, + "learning_rate": 9.426892240793587e-05, + "loss": 4.1781, + "step": 6755 + }, + { + "epoch": 0.4593015355347194, + "grad_norm": 0.1777360886335373, + "learning_rate": 9.42646759070526e-05, + "loss": 3.7195, + "step": 6760 + }, + { + "epoch": 0.45964125560538116, + "grad_norm": 0.2255825698375702, + "learning_rate": 9.426042940616932e-05, + "loss": 4.0727, + "step": 6765 + }, + { + "epoch": 0.45998097567604296, + "grad_norm": 2.2815988063812256, + "learning_rate": 9.425618290528605e-05, + "loss": 4.1177, + "step": 6770 + }, + { + "epoch": 0.4603206957467047, + "grad_norm": 0.18281744420528412, + "learning_rate": 9.425193640440278e-05, + "loss": 4.1818, + "step": 6775 + }, + { + "epoch": 0.4606604158173665, + "grad_norm": 0.1932012438774109, + "learning_rate": 9.42476899035195e-05, + "loss": 4.207, + "step": 6780 + }, + { + "epoch": 0.46100013588802824, + "grad_norm": 0.3870634138584137, + "learning_rate": 9.424344340263623e-05, + "loss": 4.0997, + "step": 6785 + }, + { + "epoch": 0.46133985595869004, + "grad_norm": 0.20440851151943207, + "learning_rate": 9.423919690175296e-05, + "loss": 3.9454, + "step": 6790 + }, + { + "epoch": 0.46167957602935183, + "grad_norm": 0.21663671731948853, + "learning_rate": 9.423495040086969e-05, + "loss": 4.013, + "step": 6795 + }, + { + "epoch": 0.4620192961000136, + "grad_norm": 0.18921007215976715, + "learning_rate": 9.423070389998642e-05, + "loss": 4.0641, + "step": 6800 + }, + { + "epoch": 0.4623590161706754, + "grad_norm": 0.21008360385894775, + "learning_rate": 9.422645739910315e-05, + "loss": 3.8814, + "step": 6805 + }, + { + "epoch": 0.4626987362413371, + "grad_norm": 1.4397491216659546, + "learning_rate": 9.422221089821987e-05, + "loss": 3.977, + "step": 6810 + }, + { + "epoch": 0.4630384563119989, + "grad_norm": 0.1650581657886505, + "learning_rate": 9.42179643973366e-05, + "loss": 3.8511, + "step": 6815 + }, + { + "epoch": 0.4633781763826607, + "grad_norm": 0.24074020981788635, + "learning_rate": 9.421371789645333e-05, + "loss": 4.4683, + "step": 6820 + }, + { + "epoch": 0.46371789645332245, + "grad_norm": 0.2204151600599289, + "learning_rate": 9.420947139557006e-05, + "loss": 4.077, + "step": 6825 + }, + { + "epoch": 0.46405761652398425, + "grad_norm": 0.24461984634399414, + "learning_rate": 9.420522489468679e-05, + "loss": 3.9247, + "step": 6830 + }, + { + "epoch": 0.464397336594646, + "grad_norm": 0.19434142112731934, + "learning_rate": 9.42009783938035e-05, + "loss": 3.8453, + "step": 6835 + }, + { + "epoch": 0.4647370566653078, + "grad_norm": 0.2689877450466156, + "learning_rate": 9.419673189292024e-05, + "loss": 4.0808, + "step": 6840 + }, + { + "epoch": 0.4650767767359696, + "grad_norm": 0.2343098372220993, + "learning_rate": 9.419248539203697e-05, + "loss": 4.0781, + "step": 6845 + }, + { + "epoch": 0.46541649680663133, + "grad_norm": 0.19415900111198425, + "learning_rate": 9.418823889115369e-05, + "loss": 4.1067, + "step": 6850 + }, + { + "epoch": 0.4657562168772931, + "grad_norm": 0.20309092104434967, + "learning_rate": 9.418399239027043e-05, + "loss": 4.0426, + "step": 6855 + }, + { + "epoch": 0.46609593694795487, + "grad_norm": 0.35404348373413086, + "learning_rate": 9.417974588938715e-05, + "loss": 4.0913, + "step": 6860 + }, + { + "epoch": 0.46643565701861667, + "grad_norm": 0.8057840466499329, + "learning_rate": 9.417549938850387e-05, + "loss": 4.0028, + "step": 6865 + }, + { + "epoch": 0.4667753770892784, + "grad_norm": 0.23458503186702728, + "learning_rate": 9.417125288762061e-05, + "loss": 3.878, + "step": 6870 + }, + { + "epoch": 0.4671150971599402, + "grad_norm": 0.2019844651222229, + "learning_rate": 9.416700638673734e-05, + "loss": 3.7162, + "step": 6875 + }, + { + "epoch": 0.467454817230602, + "grad_norm": 0.19805146753787994, + "learning_rate": 9.416275988585405e-05, + "loss": 3.8687, + "step": 6880 + }, + { + "epoch": 0.46779453730126375, + "grad_norm": 0.2559395730495453, + "learning_rate": 9.41585133849708e-05, + "loss": 4.0791, + "step": 6885 + }, + { + "epoch": 0.46813425737192554, + "grad_norm": 0.3069989085197449, + "learning_rate": 9.415426688408752e-05, + "loss": 4.1204, + "step": 6890 + }, + { + "epoch": 0.4684739774425873, + "grad_norm": 0.5808936953544617, + "learning_rate": 9.415002038320424e-05, + "loss": 4.078, + "step": 6895 + }, + { + "epoch": 0.4688136975132491, + "grad_norm": 0.2510988414287567, + "learning_rate": 9.414577388232098e-05, + "loss": 4.23, + "step": 6900 + }, + { + "epoch": 0.4691534175839109, + "grad_norm": 0.2112618386745453, + "learning_rate": 9.414152738143769e-05, + "loss": 4.1868, + "step": 6905 + }, + { + "epoch": 0.4694931376545726, + "grad_norm": 0.22074821591377258, + "learning_rate": 9.413728088055442e-05, + "loss": 4.116, + "step": 6910 + }, + { + "epoch": 0.4698328577252344, + "grad_norm": 0.18397152423858643, + "learning_rate": 9.413303437967116e-05, + "loss": 4.0599, + "step": 6915 + }, + { + "epoch": 0.47017257779589616, + "grad_norm": 0.23679019510746002, + "learning_rate": 9.412878787878788e-05, + "loss": 3.8418, + "step": 6920 + }, + { + "epoch": 0.47051229786655796, + "grad_norm": 0.16602519154548645, + "learning_rate": 9.41245413779046e-05, + "loss": 4.0308, + "step": 6925 + }, + { + "epoch": 0.47085201793721976, + "grad_norm": 0.2898738384246826, + "learning_rate": 9.412029487702135e-05, + "loss": 4.2135, + "step": 6930 + }, + { + "epoch": 0.4711917380078815, + "grad_norm": 0.21048447489738464, + "learning_rate": 9.411604837613806e-05, + "loss": 3.9954, + "step": 6935 + }, + { + "epoch": 0.4715314580785433, + "grad_norm": 0.16546538472175598, + "learning_rate": 9.411180187525479e-05, + "loss": 4.1153, + "step": 6940 + }, + { + "epoch": 0.47187117814920504, + "grad_norm": 0.5077167749404907, + "learning_rate": 9.410755537437153e-05, + "loss": 4.1654, + "step": 6945 + }, + { + "epoch": 0.47221089821986684, + "grad_norm": 0.20563165843486786, + "learning_rate": 9.410330887348825e-05, + "loss": 3.9387, + "step": 6950 + }, + { + "epoch": 0.4725506182905286, + "grad_norm": 0.2332395762205124, + "learning_rate": 9.409906237260497e-05, + "loss": 4.2803, + "step": 6955 + }, + { + "epoch": 0.4728903383611904, + "grad_norm": 0.9494916796684265, + "learning_rate": 9.409481587172171e-05, + "loss": 4.1342, + "step": 6960 + }, + { + "epoch": 0.4732300584318522, + "grad_norm": 0.4015176296234131, + "learning_rate": 9.409056937083843e-05, + "loss": 4.1532, + "step": 6965 + }, + { + "epoch": 0.4735697785025139, + "grad_norm": 0.17285263538360596, + "learning_rate": 9.408632286995516e-05, + "loss": 3.8368, + "step": 6970 + }, + { + "epoch": 0.4739094985731757, + "grad_norm": 0.24160470068454742, + "learning_rate": 9.408207636907189e-05, + "loss": 4.0496, + "step": 6975 + }, + { + "epoch": 0.47424921864383746, + "grad_norm": 0.16916899383068085, + "learning_rate": 9.407782986818861e-05, + "loss": 4.3213, + "step": 6980 + }, + { + "epoch": 0.47458893871449925, + "grad_norm": 0.1876440942287445, + "learning_rate": 9.407358336730534e-05, + "loss": 4.0836, + "step": 6985 + }, + { + "epoch": 0.47492865878516105, + "grad_norm": 0.20803870260715485, + "learning_rate": 9.406933686642207e-05, + "loss": 4.0582, + "step": 6990 + }, + { + "epoch": 0.4752683788558228, + "grad_norm": 0.5098522305488586, + "learning_rate": 9.40650903655388e-05, + "loss": 3.7007, + "step": 6995 + }, + { + "epoch": 0.4756080989264846, + "grad_norm": 0.28446561098098755, + "learning_rate": 9.406084386465553e-05, + "loss": 3.7383, + "step": 7000 + }, + { + "epoch": 0.47594781899714633, + "grad_norm": 1.344814658164978, + "learning_rate": 9.405659736377225e-05, + "loss": 4.3367, + "step": 7005 + }, + { + "epoch": 0.47628753906780813, + "grad_norm": 0.3608788549900055, + "learning_rate": 9.405235086288898e-05, + "loss": 4.0906, + "step": 7010 + }, + { + "epoch": 0.4766272591384699, + "grad_norm": 0.2733428478240967, + "learning_rate": 9.404810436200571e-05, + "loss": 4.079, + "step": 7015 + }, + { + "epoch": 0.47696697920913167, + "grad_norm": 0.2144654393196106, + "learning_rate": 9.404385786112244e-05, + "loss": 3.5828, + "step": 7020 + }, + { + "epoch": 0.47730669927979347, + "grad_norm": 0.21566329896450043, + "learning_rate": 9.403961136023917e-05, + "loss": 4.1559, + "step": 7025 + }, + { + "epoch": 0.4776464193504552, + "grad_norm": 0.19979317486286163, + "learning_rate": 9.40353648593559e-05, + "loss": 4.078, + "step": 7030 + }, + { + "epoch": 0.477986139421117, + "grad_norm": 0.179921954870224, + "learning_rate": 9.403111835847262e-05, + "loss": 3.9188, + "step": 7035 + }, + { + "epoch": 0.47832585949177875, + "grad_norm": 0.17742060124874115, + "learning_rate": 9.402687185758935e-05, + "loss": 4.023, + "step": 7040 + }, + { + "epoch": 0.47866557956244055, + "grad_norm": 0.3981129229068756, + "learning_rate": 9.402262535670608e-05, + "loss": 4.0171, + "step": 7045 + }, + { + "epoch": 0.47900529963310234, + "grad_norm": 0.206672802567482, + "learning_rate": 9.40183788558228e-05, + "loss": 4.175, + "step": 7050 + }, + { + "epoch": 0.4793450197037641, + "grad_norm": 0.1959320604801178, + "learning_rate": 9.401413235493953e-05, + "loss": 4.1638, + "step": 7055 + }, + { + "epoch": 0.4796847397744259, + "grad_norm": 0.2009037733078003, + "learning_rate": 9.400988585405626e-05, + "loss": 4.1122, + "step": 7060 + }, + { + "epoch": 0.4800244598450876, + "grad_norm": 0.17592014372348785, + "learning_rate": 9.400563935317299e-05, + "loss": 4.0237, + "step": 7065 + }, + { + "epoch": 0.4803641799157494, + "grad_norm": 0.26748034358024597, + "learning_rate": 9.400139285228972e-05, + "loss": 3.8357, + "step": 7070 + }, + { + "epoch": 0.4807038999864112, + "grad_norm": 0.16173365712165833, + "learning_rate": 9.399714635140645e-05, + "loss": 4.0592, + "step": 7075 + }, + { + "epoch": 0.48104362005707296, + "grad_norm": 0.3452107906341553, + "learning_rate": 9.399289985052317e-05, + "loss": 4.1379, + "step": 7080 + }, + { + "epoch": 0.48138334012773476, + "grad_norm": 0.20402079820632935, + "learning_rate": 9.39886533496399e-05, + "loss": 4.2539, + "step": 7085 + }, + { + "epoch": 0.4817230601983965, + "grad_norm": 0.3040589690208435, + "learning_rate": 9.398440684875663e-05, + "loss": 4.0941, + "step": 7090 + }, + { + "epoch": 0.4820627802690583, + "grad_norm": 0.18901465833187103, + "learning_rate": 9.398016034787336e-05, + "loss": 4.0795, + "step": 7095 + }, + { + "epoch": 0.4824025003397201, + "grad_norm": 0.1665019541978836, + "learning_rate": 9.397591384699009e-05, + "loss": 4.0337, + "step": 7100 + }, + { + "epoch": 0.48274222041038184, + "grad_norm": 0.42847058176994324, + "learning_rate": 9.397166734610681e-05, + "loss": 4.108, + "step": 7105 + }, + { + "epoch": 0.48308194048104364, + "grad_norm": 0.21919941902160645, + "learning_rate": 9.396742084522354e-05, + "loss": 4.2498, + "step": 7110 + }, + { + "epoch": 0.4834216605517054, + "grad_norm": 0.7178875803947449, + "learning_rate": 9.396317434434027e-05, + "loss": 3.938, + "step": 7115 + }, + { + "epoch": 0.4837613806223672, + "grad_norm": 0.15290340781211853, + "learning_rate": 9.3958927843457e-05, + "loss": 4.1507, + "step": 7120 + }, + { + "epoch": 0.4841011006930289, + "grad_norm": 0.20199231803417206, + "learning_rate": 9.395468134257373e-05, + "loss": 4.165, + "step": 7125 + }, + { + "epoch": 0.4844408207636907, + "grad_norm": 0.4050713777542114, + "learning_rate": 9.395043484169045e-05, + "loss": 4.1098, + "step": 7130 + }, + { + "epoch": 0.4847805408343525, + "grad_norm": 0.22871138155460358, + "learning_rate": 9.394618834080718e-05, + "loss": 3.8713, + "step": 7135 + }, + { + "epoch": 0.48512026090501426, + "grad_norm": 0.2092018574476242, + "learning_rate": 9.394194183992391e-05, + "loss": 4.13, + "step": 7140 + }, + { + "epoch": 0.48545998097567605, + "grad_norm": 0.23514516651630402, + "learning_rate": 9.393769533904064e-05, + "loss": 4.1008, + "step": 7145 + }, + { + "epoch": 0.4857997010463378, + "grad_norm": 0.19990748167037964, + "learning_rate": 9.393344883815737e-05, + "loss": 4.1277, + "step": 7150 + }, + { + "epoch": 0.4861394211169996, + "grad_norm": 0.23331451416015625, + "learning_rate": 9.39292023372741e-05, + "loss": 4.1307, + "step": 7155 + }, + { + "epoch": 0.4864791411876614, + "grad_norm": 0.4659624397754669, + "learning_rate": 9.392495583639082e-05, + "loss": 4.0373, + "step": 7160 + }, + { + "epoch": 0.48681886125832313, + "grad_norm": 0.350339412689209, + "learning_rate": 9.392070933550755e-05, + "loss": 4.0145, + "step": 7165 + }, + { + "epoch": 0.48715858132898493, + "grad_norm": 0.29896309971809387, + "learning_rate": 9.391646283462428e-05, + "loss": 3.7892, + "step": 7170 + }, + { + "epoch": 0.48749830139964667, + "grad_norm": 0.2809394896030426, + "learning_rate": 9.391221633374099e-05, + "loss": 4.0247, + "step": 7175 + }, + { + "epoch": 0.48783802147030847, + "grad_norm": 0.2700156271457672, + "learning_rate": 9.390796983285773e-05, + "loss": 4.0203, + "step": 7180 + }, + { + "epoch": 0.48817774154097027, + "grad_norm": 0.2099238932132721, + "learning_rate": 9.390372333197446e-05, + "loss": 3.8992, + "step": 7185 + }, + { + "epoch": 0.488517461611632, + "grad_norm": 0.1657472550868988, + "learning_rate": 9.389947683109118e-05, + "loss": 4.0689, + "step": 7190 + }, + { + "epoch": 0.4888571816822938, + "grad_norm": 2.5052669048309326, + "learning_rate": 9.389523033020792e-05, + "loss": 4.2311, + "step": 7195 + }, + { + "epoch": 0.48919690175295555, + "grad_norm": 0.18674081563949585, + "learning_rate": 9.389098382932465e-05, + "loss": 4.0554, + "step": 7200 + }, + { + "epoch": 0.48953662182361735, + "grad_norm": 0.17581969499588013, + "learning_rate": 9.388673732844136e-05, + "loss": 3.9992, + "step": 7205 + }, + { + "epoch": 0.4898763418942791, + "grad_norm": 0.2378435879945755, + "learning_rate": 9.38824908275581e-05, + "loss": 4.1965, + "step": 7210 + }, + { + "epoch": 0.4902160619649409, + "grad_norm": 0.21895438432693481, + "learning_rate": 9.387824432667483e-05, + "loss": 3.9994, + "step": 7215 + }, + { + "epoch": 0.4905557820356027, + "grad_norm": 0.18622034788131714, + "learning_rate": 9.387399782579154e-05, + "loss": 4.2384, + "step": 7220 + }, + { + "epoch": 0.4908955021062644, + "grad_norm": 0.21598079800605774, + "learning_rate": 9.386975132490829e-05, + "loss": 4.2434, + "step": 7225 + }, + { + "epoch": 0.4912352221769262, + "grad_norm": 1.403287410736084, + "learning_rate": 9.386550482402501e-05, + "loss": 3.9085, + "step": 7230 + }, + { + "epoch": 0.49157494224758796, + "grad_norm": 0.31629684567451477, + "learning_rate": 9.386125832314173e-05, + "loss": 4.0834, + "step": 7235 + }, + { + "epoch": 0.49191466231824976, + "grad_norm": 0.503648042678833, + "learning_rate": 9.385701182225847e-05, + "loss": 4.0346, + "step": 7240 + }, + { + "epoch": 0.49225438238891156, + "grad_norm": 0.16960012912750244, + "learning_rate": 9.38527653213752e-05, + "loss": 4.0295, + "step": 7245 + }, + { + "epoch": 0.4925941024595733, + "grad_norm": 0.2992265224456787, + "learning_rate": 9.384851882049191e-05, + "loss": 4.147, + "step": 7250 + }, + { + "epoch": 0.4929338225302351, + "grad_norm": 0.37070542573928833, + "learning_rate": 9.384427231960865e-05, + "loss": 3.6991, + "step": 7255 + }, + { + "epoch": 0.49327354260089684, + "grad_norm": 0.2252090573310852, + "learning_rate": 9.384002581872537e-05, + "loss": 4.2128, + "step": 7260 + }, + { + "epoch": 0.49361326267155864, + "grad_norm": 0.1695706844329834, + "learning_rate": 9.38357793178421e-05, + "loss": 4.0321, + "step": 7265 + }, + { + "epoch": 0.49395298274222044, + "grad_norm": 0.23216302692890167, + "learning_rate": 9.383153281695884e-05, + "loss": 3.9758, + "step": 7270 + }, + { + "epoch": 0.4942927028128822, + "grad_norm": 0.9034651517868042, + "learning_rate": 9.382728631607555e-05, + "loss": 3.9329, + "step": 7275 + }, + { + "epoch": 0.494632422883544, + "grad_norm": 0.21417531371116638, + "learning_rate": 9.382303981519228e-05, + "loss": 3.8799, + "step": 7280 + }, + { + "epoch": 0.4949721429542057, + "grad_norm": 0.18411661684513092, + "learning_rate": 9.381879331430902e-05, + "loss": 4.2956, + "step": 7285 + }, + { + "epoch": 0.4953118630248675, + "grad_norm": 0.23416036367416382, + "learning_rate": 9.381454681342574e-05, + "loss": 3.8779, + "step": 7290 + }, + { + "epoch": 0.49565158309552926, + "grad_norm": 0.2349618524312973, + "learning_rate": 9.381030031254246e-05, + "loss": 4.027, + "step": 7295 + }, + { + "epoch": 0.49599130316619106, + "grad_norm": 0.2522861659526825, + "learning_rate": 9.38060538116592e-05, + "loss": 4.1115, + "step": 7300 + }, + { + "epoch": 0.49633102323685285, + "grad_norm": 0.18916946649551392, + "learning_rate": 9.380180731077592e-05, + "loss": 4.1002, + "step": 7305 + }, + { + "epoch": 0.4966707433075146, + "grad_norm": 0.20227526128292084, + "learning_rate": 9.379756080989265e-05, + "loss": 4.0984, + "step": 7310 + }, + { + "epoch": 0.4970104633781764, + "grad_norm": 0.20442931354045868, + "learning_rate": 9.379331430900939e-05, + "loss": 4.0096, + "step": 7315 + }, + { + "epoch": 0.49735018344883813, + "grad_norm": 0.19708792865276337, + "learning_rate": 9.37890678081261e-05, + "loss": 4.2264, + "step": 7320 + }, + { + "epoch": 0.49768990351949993, + "grad_norm": 0.2058994174003601, + "learning_rate": 9.378482130724283e-05, + "loss": 3.9661, + "step": 7325 + }, + { + "epoch": 0.49802962359016173, + "grad_norm": 0.18831300735473633, + "learning_rate": 9.378057480635956e-05, + "loss": 4.0001, + "step": 7330 + }, + { + "epoch": 0.49836934366082347, + "grad_norm": 0.5251606702804565, + "learning_rate": 9.377632830547629e-05, + "loss": 4.1706, + "step": 7335 + }, + { + "epoch": 0.49870906373148527, + "grad_norm": 0.17007534205913544, + "learning_rate": 9.377208180459302e-05, + "loss": 4.1699, + "step": 7340 + }, + { + "epoch": 0.499048783802147, + "grad_norm": 0.3484830856323242, + "learning_rate": 9.376783530370974e-05, + "loss": 3.944, + "step": 7345 + }, + { + "epoch": 0.4993885038728088, + "grad_norm": 0.20382869243621826, + "learning_rate": 9.376358880282647e-05, + "loss": 3.7016, + "step": 7350 + }, + { + "epoch": 0.4997282239434706, + "grad_norm": 0.2002745270729065, + "learning_rate": 9.37593423019432e-05, + "loss": 4.1238, + "step": 7355 + }, + { + "epoch": 0.5000679440141323, + "grad_norm": 0.17399045825004578, + "learning_rate": 9.375509580105993e-05, + "loss": 4.0909, + "step": 7360 + }, + { + "epoch": 0.5004076640847941, + "grad_norm": 0.24848084151744843, + "learning_rate": 9.375084930017666e-05, + "loss": 4.1379, + "step": 7365 + }, + { + "epoch": 0.5007473841554559, + "grad_norm": 0.5024029016494751, + "learning_rate": 9.374660279929338e-05, + "loss": 4.2591, + "step": 7370 + }, + { + "epoch": 0.5010871042261177, + "grad_norm": 0.22552575170993805, + "learning_rate": 9.374235629841011e-05, + "loss": 4.049, + "step": 7375 + }, + { + "epoch": 0.5014268242967794, + "grad_norm": 0.2249031662940979, + "learning_rate": 9.373810979752684e-05, + "loss": 4.2421, + "step": 7380 + }, + { + "epoch": 0.5017665443674413, + "grad_norm": 0.22408431768417358, + "learning_rate": 9.373386329664357e-05, + "loss": 4.3666, + "step": 7385 + }, + { + "epoch": 0.502106264438103, + "grad_norm": 0.16393537819385529, + "learning_rate": 9.37296167957603e-05, + "loss": 3.7884, + "step": 7390 + }, + { + "epoch": 0.5024459845087648, + "grad_norm": 0.25391802191734314, + "learning_rate": 9.372537029487702e-05, + "loss": 4.1085, + "step": 7395 + }, + { + "epoch": 0.5027857045794265, + "grad_norm": 0.25248852372169495, + "learning_rate": 9.372112379399375e-05, + "loss": 4.0558, + "step": 7400 + }, + { + "epoch": 0.5031254246500884, + "grad_norm": 0.2197033017873764, + "learning_rate": 9.371687729311048e-05, + "loss": 3.979, + "step": 7405 + }, + { + "epoch": 0.5034651447207501, + "grad_norm": 0.20195040106773376, + "learning_rate": 9.371263079222721e-05, + "loss": 3.8918, + "step": 7410 + }, + { + "epoch": 0.5038048647914118, + "grad_norm": 0.1969507336616516, + "learning_rate": 9.370838429134394e-05, + "loss": 3.9211, + "step": 7415 + }, + { + "epoch": 0.5041445848620737, + "grad_norm": 0.26221612095832825, + "learning_rate": 9.370413779046066e-05, + "loss": 4.067, + "step": 7420 + }, + { + "epoch": 0.5044843049327354, + "grad_norm": 0.20401246845722198, + "learning_rate": 9.369989128957739e-05, + "loss": 3.9535, + "step": 7425 + }, + { + "epoch": 0.5048240250033972, + "grad_norm": 0.5490508675575256, + "learning_rate": 9.369564478869412e-05, + "loss": 3.9772, + "step": 7430 + }, + { + "epoch": 0.5051637450740589, + "grad_norm": 0.1551188975572586, + "learning_rate": 9.369139828781085e-05, + "loss": 4.1125, + "step": 7435 + }, + { + "epoch": 0.5055034651447208, + "grad_norm": 0.31335705518722534, + "learning_rate": 9.368715178692758e-05, + "loss": 4.0064, + "step": 7440 + }, + { + "epoch": 0.5058431852153825, + "grad_norm": 0.23974861204624176, + "learning_rate": 9.36829052860443e-05, + "loss": 3.9679, + "step": 7445 + }, + { + "epoch": 0.5061829052860443, + "grad_norm": 0.15649579465389252, + "learning_rate": 9.367865878516103e-05, + "loss": 4.005, + "step": 7450 + }, + { + "epoch": 0.5065226253567061, + "grad_norm": 0.18585532903671265, + "learning_rate": 9.367441228427776e-05, + "loss": 3.8728, + "step": 7455 + }, + { + "epoch": 0.5068623454273679, + "grad_norm": 0.2581709623336792, + "learning_rate": 9.367016578339449e-05, + "loss": 4.0771, + "step": 7460 + }, + { + "epoch": 0.5072020654980296, + "grad_norm": 0.2468833178281784, + "learning_rate": 9.366591928251122e-05, + "loss": 4.054, + "step": 7465 + }, + { + "epoch": 0.5075417855686915, + "grad_norm": 0.2019713968038559, + "learning_rate": 9.366167278162794e-05, + "loss": 3.6126, + "step": 7470 + }, + { + "epoch": 0.5078815056393532, + "grad_norm": 0.25231555104255676, + "learning_rate": 9.365742628074467e-05, + "loss": 4.4004, + "step": 7475 + }, + { + "epoch": 0.5082212257100149, + "grad_norm": 0.14785149693489075, + "learning_rate": 9.36531797798614e-05, + "loss": 3.7965, + "step": 7480 + }, + { + "epoch": 0.5085609457806767, + "grad_norm": 0.21854764223098755, + "learning_rate": 9.364893327897813e-05, + "loss": 4.3081, + "step": 7485 + }, + { + "epoch": 0.5089006658513385, + "grad_norm": 0.35712555050849915, + "learning_rate": 9.364468677809486e-05, + "loss": 3.7848, + "step": 7490 + }, + { + "epoch": 0.5092403859220003, + "grad_norm": 0.9170289039611816, + "learning_rate": 9.364044027721158e-05, + "loss": 4.0845, + "step": 7495 + }, + { + "epoch": 0.509580105992662, + "grad_norm": 0.19155257940292358, + "learning_rate": 9.363619377632831e-05, + "loss": 4.0807, + "step": 7500 + }, + { + "epoch": 0.5099198260633239, + "grad_norm": 0.20362383127212524, + "learning_rate": 9.363194727544504e-05, + "loss": 4.1181, + "step": 7505 + }, + { + "epoch": 0.5102595461339856, + "grad_norm": 1.610526204109192, + "learning_rate": 9.362770077456177e-05, + "loss": 4.1332, + "step": 7510 + }, + { + "epoch": 0.5105992662046474, + "grad_norm": 0.36077892780303955, + "learning_rate": 9.36234542736785e-05, + "loss": 4.0379, + "step": 7515 + }, + { + "epoch": 0.5109389862753091, + "grad_norm": 0.34184160828590393, + "learning_rate": 9.361920777279522e-05, + "loss": 4.0309, + "step": 7520 + }, + { + "epoch": 0.511278706345971, + "grad_norm": 0.266156941652298, + "learning_rate": 9.361496127191195e-05, + "loss": 4.0715, + "step": 7525 + }, + { + "epoch": 0.5116184264166327, + "grad_norm": 0.21247410774230957, + "learning_rate": 9.361071477102867e-05, + "loss": 4.1461, + "step": 7530 + }, + { + "epoch": 0.5119581464872944, + "grad_norm": 0.3173115849494934, + "learning_rate": 9.360646827014541e-05, + "loss": 4.0909, + "step": 7535 + }, + { + "epoch": 0.5122978665579563, + "grad_norm": 0.1932353973388672, + "learning_rate": 9.360222176926214e-05, + "loss": 4.0699, + "step": 7540 + }, + { + "epoch": 0.512637586628618, + "grad_norm": 0.34887808561325073, + "learning_rate": 9.359797526837885e-05, + "loss": 3.8442, + "step": 7545 + }, + { + "epoch": 0.5129773066992798, + "grad_norm": 0.1603212207555771, + "learning_rate": 9.359372876749559e-05, + "loss": 3.9252, + "step": 7550 + }, + { + "epoch": 0.5133170267699416, + "grad_norm": 0.18673382699489594, + "learning_rate": 9.358948226661232e-05, + "loss": 4.132, + "step": 7555 + }, + { + "epoch": 0.5136567468406034, + "grad_norm": 0.17931464314460754, + "learning_rate": 9.358523576572904e-05, + "loss": 4.2045, + "step": 7560 + }, + { + "epoch": 0.5139964669112651, + "grad_norm": 0.20832332968711853, + "learning_rate": 9.358098926484578e-05, + "loss": 4.1134, + "step": 7565 + }, + { + "epoch": 0.5143361869819268, + "grad_norm": 0.1900588423013687, + "learning_rate": 9.35767427639625e-05, + "loss": 4.1737, + "step": 7570 + }, + { + "epoch": 0.5146759070525887, + "grad_norm": 0.25555694103240967, + "learning_rate": 9.357249626307922e-05, + "loss": 3.9879, + "step": 7575 + }, + { + "epoch": 0.5150156271232504, + "grad_norm": 0.3741958737373352, + "learning_rate": 9.356824976219596e-05, + "loss": 4.0922, + "step": 7580 + }, + { + "epoch": 0.5153553471939122, + "grad_norm": 0.25290772318840027, + "learning_rate": 9.356400326131269e-05, + "loss": 4.1101, + "step": 7585 + }, + { + "epoch": 0.515695067264574, + "grad_norm": 0.23055055737495422, + "learning_rate": 9.35597567604294e-05, + "loss": 3.9841, + "step": 7590 + }, + { + "epoch": 0.5160347873352358, + "grad_norm": 0.2004847675561905, + "learning_rate": 9.355551025954614e-05, + "loss": 4.0573, + "step": 7595 + }, + { + "epoch": 0.5163745074058975, + "grad_norm": 0.17430374026298523, + "learning_rate": 9.355126375866287e-05, + "loss": 4.1038, + "step": 7600 + }, + { + "epoch": 0.5167142274765593, + "grad_norm": 0.19129951298236847, + "learning_rate": 9.354701725777959e-05, + "loss": 3.9517, + "step": 7605 + }, + { + "epoch": 0.5170539475472211, + "grad_norm": 0.24866996705532074, + "learning_rate": 9.354277075689633e-05, + "loss": 3.9927, + "step": 7610 + }, + { + "epoch": 0.5173936676178829, + "grad_norm": 0.2479562759399414, + "learning_rate": 9.353852425601304e-05, + "loss": 3.824, + "step": 7615 + }, + { + "epoch": 0.5177333876885446, + "grad_norm": 0.25535717606544495, + "learning_rate": 9.353427775512977e-05, + "loss": 4.1203, + "step": 7620 + }, + { + "epoch": 0.5180731077592065, + "grad_norm": 0.19401293992996216, + "learning_rate": 9.353003125424651e-05, + "loss": 4.2723, + "step": 7625 + }, + { + "epoch": 0.5184128278298682, + "grad_norm": 0.25242581963539124, + "learning_rate": 9.352578475336323e-05, + "loss": 3.7278, + "step": 7630 + }, + { + "epoch": 0.5187525479005299, + "grad_norm": 0.24895191192626953, + "learning_rate": 9.352153825247996e-05, + "loss": 4.0693, + "step": 7635 + }, + { + "epoch": 0.5190922679711918, + "grad_norm": 0.208522230386734, + "learning_rate": 9.35172917515967e-05, + "loss": 4.105, + "step": 7640 + }, + { + "epoch": 0.5194319880418535, + "grad_norm": 0.21266648173332214, + "learning_rate": 9.351304525071341e-05, + "loss": 4.0295, + "step": 7645 + }, + { + "epoch": 0.5197717081125153, + "grad_norm": 0.1788320690393448, + "learning_rate": 9.350879874983014e-05, + "loss": 3.9959, + "step": 7650 + }, + { + "epoch": 0.520111428183177, + "grad_norm": 0.36194828152656555, + "learning_rate": 9.350455224894688e-05, + "loss": 4.0003, + "step": 7655 + }, + { + "epoch": 0.5204511482538389, + "grad_norm": 0.22471646964550018, + "learning_rate": 9.35003057480636e-05, + "loss": 4.0001, + "step": 7660 + }, + { + "epoch": 0.5207908683245006, + "grad_norm": 0.20140118896961212, + "learning_rate": 9.349605924718032e-05, + "loss": 3.802, + "step": 7665 + }, + { + "epoch": 0.5211305883951624, + "grad_norm": 0.2610224485397339, + "learning_rate": 9.349181274629706e-05, + "loss": 4.0378, + "step": 7670 + }, + { + "epoch": 0.5214703084658242, + "grad_norm": 0.19528135657310486, + "learning_rate": 9.348756624541378e-05, + "loss": 4.059, + "step": 7675 + }, + { + "epoch": 0.521810028536486, + "grad_norm": 0.1775432527065277, + "learning_rate": 9.348331974453051e-05, + "loss": 4.0532, + "step": 7680 + }, + { + "epoch": 0.5221497486071477, + "grad_norm": 4.0261311531066895, + "learning_rate": 9.347907324364724e-05, + "loss": 4.1247, + "step": 7685 + }, + { + "epoch": 0.5224894686778094, + "grad_norm": 0.18927231431007385, + "learning_rate": 9.347482674276396e-05, + "loss": 4.1597, + "step": 7690 + }, + { + "epoch": 0.5228291887484713, + "grad_norm": 0.17525963485240936, + "learning_rate": 9.347058024188069e-05, + "loss": 3.7783, + "step": 7695 + }, + { + "epoch": 0.523168908819133, + "grad_norm": 0.15147970616817474, + "learning_rate": 9.346633374099742e-05, + "loss": 4.017, + "step": 7700 + }, + { + "epoch": 0.5235086288897948, + "grad_norm": 0.17710480093955994, + "learning_rate": 9.346208724011415e-05, + "loss": 4.3239, + "step": 7705 + }, + { + "epoch": 0.5238483489604566, + "grad_norm": 0.1875525414943695, + "learning_rate": 9.345784073923088e-05, + "loss": 3.9417, + "step": 7710 + }, + { + "epoch": 0.5241880690311184, + "grad_norm": 0.2575678825378418, + "learning_rate": 9.34535942383476e-05, + "loss": 4.1485, + "step": 7715 + }, + { + "epoch": 0.5245277891017801, + "grad_norm": 0.8326651453971863, + "learning_rate": 9.344934773746433e-05, + "loss": 4.0598, + "step": 7720 + }, + { + "epoch": 0.524867509172442, + "grad_norm": 0.1673835813999176, + "learning_rate": 9.344510123658106e-05, + "loss": 4.1255, + "step": 7725 + }, + { + "epoch": 0.5252072292431037, + "grad_norm": 0.764521598815918, + "learning_rate": 9.344085473569779e-05, + "loss": 3.9019, + "step": 7730 + }, + { + "epoch": 0.5255469493137654, + "grad_norm": 0.17800885438919067, + "learning_rate": 9.343660823481452e-05, + "loss": 3.7869, + "step": 7735 + }, + { + "epoch": 0.5258866693844272, + "grad_norm": 0.1920061856508255, + "learning_rate": 9.343236173393124e-05, + "loss": 4.1229, + "step": 7740 + }, + { + "epoch": 0.526226389455089, + "grad_norm": 3.5604405403137207, + "learning_rate": 9.342811523304797e-05, + "loss": 3.9478, + "step": 7745 + }, + { + "epoch": 0.5265661095257508, + "grad_norm": 0.194805309176445, + "learning_rate": 9.34238687321647e-05, + "loss": 4.2643, + "step": 7750 + }, + { + "epoch": 0.5269058295964125, + "grad_norm": 0.22051791846752167, + "learning_rate": 9.341962223128143e-05, + "loss": 4.1483, + "step": 7755 + }, + { + "epoch": 0.5272455496670744, + "grad_norm": 0.22619082033634186, + "learning_rate": 9.341622503057481e-05, + "loss": 3.5109, + "step": 7760 + }, + { + "epoch": 0.5275852697377361, + "grad_norm": 0.2676302194595337, + "learning_rate": 9.341197852969154e-05, + "loss": 4.174, + "step": 7765 + }, + { + "epoch": 0.5279249898083979, + "grad_norm": 0.25767549872398376, + "learning_rate": 9.340773202880827e-05, + "loss": 3.9739, + "step": 7770 + }, + { + "epoch": 0.5282647098790596, + "grad_norm": 0.18924906849861145, + "learning_rate": 9.3403485527925e-05, + "loss": 4.2547, + "step": 7775 + }, + { + "epoch": 0.5286044299497215, + "grad_norm": 0.1977754682302475, + "learning_rate": 9.339923902704172e-05, + "loss": 3.8264, + "step": 7780 + }, + { + "epoch": 0.5289441500203832, + "grad_norm": 0.1865202784538269, + "learning_rate": 9.339499252615845e-05, + "loss": 4.2391, + "step": 7785 + }, + { + "epoch": 0.5292838700910449, + "grad_norm": 0.22030463814735413, + "learning_rate": 9.339074602527518e-05, + "loss": 4.1641, + "step": 7790 + }, + { + "epoch": 0.5296235901617068, + "grad_norm": 0.1868383139371872, + "learning_rate": 9.33864995243919e-05, + "loss": 4.247, + "step": 7795 + }, + { + "epoch": 0.5299633102323685, + "grad_norm": 0.19364790618419647, + "learning_rate": 9.338225302350863e-05, + "loss": 3.9474, + "step": 7800 + }, + { + "epoch": 0.5303030303030303, + "grad_norm": 0.1819707751274109, + "learning_rate": 9.337800652262536e-05, + "loss": 4.0559, + "step": 7805 + }, + { + "epoch": 0.5306427503736921, + "grad_norm": 0.25897395610809326, + "learning_rate": 9.337376002174209e-05, + "loss": 3.9502, + "step": 7810 + }, + { + "epoch": 0.5309824704443539, + "grad_norm": 0.23874245584011078, + "learning_rate": 9.336951352085882e-05, + "loss": 3.89, + "step": 7815 + }, + { + "epoch": 0.5313221905150156, + "grad_norm": 0.16747428476810455, + "learning_rate": 9.336526701997555e-05, + "loss": 4.0011, + "step": 7820 + }, + { + "epoch": 0.5316619105856774, + "grad_norm": 0.17043693363666534, + "learning_rate": 9.336102051909227e-05, + "loss": 4.0938, + "step": 7825 + }, + { + "epoch": 0.5320016306563392, + "grad_norm": 0.19424232840538025, + "learning_rate": 9.3356774018209e-05, + "loss": 3.9479, + "step": 7830 + }, + { + "epoch": 0.532341350727001, + "grad_norm": 0.1900462657213211, + "learning_rate": 9.335252751732573e-05, + "loss": 4.2105, + "step": 7835 + }, + { + "epoch": 0.5326810707976627, + "grad_norm": 0.2120985984802246, + "learning_rate": 9.334828101644246e-05, + "loss": 4.2686, + "step": 7840 + }, + { + "epoch": 0.5330207908683245, + "grad_norm": 0.24349243938922882, + "learning_rate": 9.334403451555919e-05, + "loss": 4.09, + "step": 7845 + }, + { + "epoch": 0.5333605109389863, + "grad_norm": 0.22167713940143585, + "learning_rate": 9.333978801467591e-05, + "loss": 4.0405, + "step": 7850 + }, + { + "epoch": 0.533700231009648, + "grad_norm": 0.2967727780342102, + "learning_rate": 9.333554151379263e-05, + "loss": 4.3056, + "step": 7855 + }, + { + "epoch": 0.5340399510803098, + "grad_norm": 0.1543935239315033, + "learning_rate": 9.333129501290937e-05, + "loss": 4.0946, + "step": 7860 + }, + { + "epoch": 0.5343796711509716, + "grad_norm": 0.27082520723342896, + "learning_rate": 9.33270485120261e-05, + "loss": 3.9952, + "step": 7865 + }, + { + "epoch": 0.5347193912216334, + "grad_norm": 0.27814018726348877, + "learning_rate": 9.332280201114281e-05, + "loss": 4.2333, + "step": 7870 + }, + { + "epoch": 0.5350591112922951, + "grad_norm": 0.18961313366889954, + "learning_rate": 9.331855551025955e-05, + "loss": 4.1017, + "step": 7875 + }, + { + "epoch": 0.535398831362957, + "grad_norm": 0.19121582806110382, + "learning_rate": 9.331430900937628e-05, + "loss": 4.1663, + "step": 7880 + }, + { + "epoch": 0.5357385514336187, + "grad_norm": 0.20209085941314697, + "learning_rate": 9.3310062508493e-05, + "loss": 3.9602, + "step": 7885 + }, + { + "epoch": 0.5360782715042804, + "grad_norm": 0.222623810172081, + "learning_rate": 9.330581600760974e-05, + "loss": 4.208, + "step": 7890 + }, + { + "epoch": 0.5364179915749423, + "grad_norm": 0.2834756672382355, + "learning_rate": 9.330156950672647e-05, + "loss": 3.8017, + "step": 7895 + }, + { + "epoch": 0.536757711645604, + "grad_norm": 0.26975885033607483, + "learning_rate": 9.329732300584318e-05, + "loss": 3.8618, + "step": 7900 + }, + { + "epoch": 0.5370974317162658, + "grad_norm": 0.24554960429668427, + "learning_rate": 9.329307650495992e-05, + "loss": 4.1339, + "step": 7905 + }, + { + "epoch": 0.5374371517869275, + "grad_norm": 0.19073788821697235, + "learning_rate": 9.328883000407665e-05, + "loss": 4.1798, + "step": 7910 + }, + { + "epoch": 0.5377768718575894, + "grad_norm": 0.1700531542301178, + "learning_rate": 9.328458350319336e-05, + "loss": 4.1152, + "step": 7915 + }, + { + "epoch": 0.5381165919282511, + "grad_norm": 0.2178450971841812, + "learning_rate": 9.32803370023101e-05, + "loss": 3.9352, + "step": 7920 + }, + { + "epoch": 0.5384563119989129, + "grad_norm": 0.20611537992954254, + "learning_rate": 9.327609050142682e-05, + "loss": 4.0871, + "step": 7925 + }, + { + "epoch": 0.5387960320695747, + "grad_norm": 0.20074544847011566, + "learning_rate": 9.327184400054355e-05, + "loss": 4.1689, + "step": 7930 + }, + { + "epoch": 0.5391357521402365, + "grad_norm": 0.18785405158996582, + "learning_rate": 9.326759749966029e-05, + "loss": 4.0912, + "step": 7935 + }, + { + "epoch": 0.5394754722108982, + "grad_norm": 0.2205738127231598, + "learning_rate": 9.3263350998777e-05, + "loss": 4.0251, + "step": 7940 + }, + { + "epoch": 0.5398151922815599, + "grad_norm": 0.2106117606163025, + "learning_rate": 9.325910449789373e-05, + "loss": 3.9616, + "step": 7945 + }, + { + "epoch": 0.5401549123522218, + "grad_norm": 0.20841112732887268, + "learning_rate": 9.325485799701047e-05, + "loss": 4.1365, + "step": 7950 + }, + { + "epoch": 0.5404946324228835, + "grad_norm": 0.20364391803741455, + "learning_rate": 9.325061149612719e-05, + "loss": 3.8211, + "step": 7955 + }, + { + "epoch": 0.5408343524935453, + "grad_norm": 0.1856883019208908, + "learning_rate": 9.324636499524393e-05, + "loss": 4.102, + "step": 7960 + }, + { + "epoch": 0.5411740725642071, + "grad_norm": 0.22017249464988708, + "learning_rate": 9.324211849436066e-05, + "loss": 3.9317, + "step": 7965 + }, + { + "epoch": 0.5415137926348689, + "grad_norm": 0.2391664832830429, + "learning_rate": 9.323787199347737e-05, + "loss": 3.975, + "step": 7970 + }, + { + "epoch": 0.5418535127055306, + "grad_norm": 0.15632979571819305, + "learning_rate": 9.323362549259411e-05, + "loss": 4.1384, + "step": 7975 + }, + { + "epoch": 0.5421932327761925, + "grad_norm": 0.18496164679527283, + "learning_rate": 9.322937899171084e-05, + "loss": 4.1871, + "step": 7980 + }, + { + "epoch": 0.5425329528468542, + "grad_norm": 0.18015219271183014, + "learning_rate": 9.322513249082756e-05, + "loss": 3.9501, + "step": 7985 + }, + { + "epoch": 0.542872672917516, + "grad_norm": 0.19946110248565674, + "learning_rate": 9.32208859899443e-05, + "loss": 3.928, + "step": 7990 + }, + { + "epoch": 0.5432123929881777, + "grad_norm": 0.18122997879981995, + "learning_rate": 9.321663948906103e-05, + "loss": 4.0794, + "step": 7995 + }, + { + "epoch": 0.5435521130588395, + "grad_norm": 0.2898665964603424, + "learning_rate": 9.321239298817774e-05, + "loss": 4.1946, + "step": 8000 + }, + { + "epoch": 0.5438918331295013, + "grad_norm": 0.19237551093101501, + "learning_rate": 9.320814648729448e-05, + "loss": 3.8687, + "step": 8005 + }, + { + "epoch": 0.544231553200163, + "grad_norm": 0.18704599142074585, + "learning_rate": 9.32038999864112e-05, + "loss": 4.0483, + "step": 8010 + }, + { + "epoch": 0.5445712732708249, + "grad_norm": 0.16550405323505402, + "learning_rate": 9.319965348552792e-05, + "loss": 3.927, + "step": 8015 + }, + { + "epoch": 0.5449109933414866, + "grad_norm": 0.20201174914836884, + "learning_rate": 9.319540698464467e-05, + "loss": 4.1593, + "step": 8020 + }, + { + "epoch": 0.5452507134121484, + "grad_norm": 0.19555409252643585, + "learning_rate": 9.319116048376138e-05, + "loss": 4.2645, + "step": 8025 + }, + { + "epoch": 0.5455904334828101, + "grad_norm": 0.20266516506671906, + "learning_rate": 9.318691398287811e-05, + "loss": 4.165, + "step": 8030 + }, + { + "epoch": 0.545930153553472, + "grad_norm": 0.18069981038570404, + "learning_rate": 9.318266748199485e-05, + "loss": 4.0198, + "step": 8035 + }, + { + "epoch": 0.5462698736241337, + "grad_norm": 0.1856188029050827, + "learning_rate": 9.317842098111156e-05, + "loss": 3.9087, + "step": 8040 + }, + { + "epoch": 0.5466095936947954, + "grad_norm": 0.19204466044902802, + "learning_rate": 9.317417448022829e-05, + "loss": 4.0723, + "step": 8045 + }, + { + "epoch": 0.5469493137654573, + "grad_norm": 0.19473916292190552, + "learning_rate": 9.316992797934503e-05, + "loss": 3.9512, + "step": 8050 + }, + { + "epoch": 0.547289033836119, + "grad_norm": 0.23411086201667786, + "learning_rate": 9.316568147846175e-05, + "loss": 4.019, + "step": 8055 + }, + { + "epoch": 0.5476287539067808, + "grad_norm": 0.17357757687568665, + "learning_rate": 9.316143497757848e-05, + "loss": 3.9088, + "step": 8060 + }, + { + "epoch": 0.5479684739774426, + "grad_norm": 0.19055502116680145, + "learning_rate": 9.315718847669522e-05, + "loss": 3.9099, + "step": 8065 + }, + { + "epoch": 0.5483081940481044, + "grad_norm": 0.21026012301445007, + "learning_rate": 9.315294197581193e-05, + "loss": 4.0825, + "step": 8070 + }, + { + "epoch": 0.5486479141187661, + "grad_norm": 0.1714550107717514, + "learning_rate": 9.314869547492866e-05, + "loss": 4.1415, + "step": 8075 + }, + { + "epoch": 0.5489876341894279, + "grad_norm": 0.18270696699619293, + "learning_rate": 9.314444897404539e-05, + "loss": 4.0709, + "step": 8080 + }, + { + "epoch": 0.5493273542600897, + "grad_norm": 0.1924707442522049, + "learning_rate": 9.314020247316212e-05, + "loss": 4.0765, + "step": 8085 + }, + { + "epoch": 0.5496670743307515, + "grad_norm": 1.1790002584457397, + "learning_rate": 9.313595597227884e-05, + "loss": 4.0, + "step": 8090 + }, + { + "epoch": 0.5500067944014132, + "grad_norm": 0.2330000251531601, + "learning_rate": 9.313170947139557e-05, + "loss": 4.0367, + "step": 8095 + }, + { + "epoch": 0.550346514472075, + "grad_norm": 0.21324478089809418, + "learning_rate": 9.31274629705123e-05, + "loss": 3.9346, + "step": 8100 + }, + { + "epoch": 0.5506862345427368, + "grad_norm": 0.1989852339029312, + "learning_rate": 9.312321646962903e-05, + "loss": 3.883, + "step": 8105 + }, + { + "epoch": 0.5510259546133985, + "grad_norm": 0.2093362659215927, + "learning_rate": 9.311896996874576e-05, + "loss": 3.945, + "step": 8110 + }, + { + "epoch": 0.5513656746840603, + "grad_norm": 0.17465944588184357, + "learning_rate": 9.311472346786248e-05, + "loss": 3.9291, + "step": 8115 + }, + { + "epoch": 0.5517053947547221, + "grad_norm": 0.7979787588119507, + "learning_rate": 9.311047696697921e-05, + "loss": 3.9622, + "step": 8120 + }, + { + "epoch": 0.5520451148253839, + "grad_norm": 0.20573332905769348, + "learning_rate": 9.310623046609594e-05, + "loss": 4.0717, + "step": 8125 + }, + { + "epoch": 0.5523848348960456, + "grad_norm": 0.8882215023040771, + "learning_rate": 9.310198396521267e-05, + "loss": 4.045, + "step": 8130 + }, + { + "epoch": 0.5527245549667075, + "grad_norm": 0.1780032366514206, + "learning_rate": 9.30977374643294e-05, + "loss": 3.8284, + "step": 8135 + }, + { + "epoch": 0.5530642750373692, + "grad_norm": 0.19671432673931122, + "learning_rate": 9.309349096344612e-05, + "loss": 4.0132, + "step": 8140 + }, + { + "epoch": 0.553403995108031, + "grad_norm": 0.17597247660160065, + "learning_rate": 9.308924446256285e-05, + "loss": 4.0527, + "step": 8145 + }, + { + "epoch": 0.5537437151786928, + "grad_norm": 0.19633089005947113, + "learning_rate": 9.308499796167958e-05, + "loss": 3.8693, + "step": 8150 + }, + { + "epoch": 0.5540834352493546, + "grad_norm": 0.19133026897907257, + "learning_rate": 9.308075146079631e-05, + "loss": 3.9765, + "step": 8155 + }, + { + "epoch": 0.5544231553200163, + "grad_norm": 0.18762901425361633, + "learning_rate": 9.307650495991304e-05, + "loss": 3.9366, + "step": 8160 + }, + { + "epoch": 0.554762875390678, + "grad_norm": 0.21953530609607697, + "learning_rate": 9.307225845902976e-05, + "loss": 4.1145, + "step": 8165 + }, + { + "epoch": 0.5551025954613399, + "grad_norm": 0.1631409376859665, + "learning_rate": 9.306801195814649e-05, + "loss": 4.3299, + "step": 8170 + }, + { + "epoch": 0.5554423155320016, + "grad_norm": 0.1782941222190857, + "learning_rate": 9.306376545726322e-05, + "loss": 4.0921, + "step": 8175 + }, + { + "epoch": 0.5557820356026634, + "grad_norm": 0.18272286653518677, + "learning_rate": 9.305951895637995e-05, + "loss": 4.0609, + "step": 8180 + }, + { + "epoch": 0.5561217556733252, + "grad_norm": 0.5140985250473022, + "learning_rate": 9.305527245549668e-05, + "loss": 3.9297, + "step": 8185 + }, + { + "epoch": 0.556461475743987, + "grad_norm": 1.4922987222671509, + "learning_rate": 9.30510259546134e-05, + "loss": 4.0238, + "step": 8190 + }, + { + "epoch": 0.5568011958146487, + "grad_norm": 3.8372085094451904, + "learning_rate": 9.304677945373013e-05, + "loss": 4.1705, + "step": 8195 + }, + { + "epoch": 0.5571409158853105, + "grad_norm": 0.1523323357105255, + "learning_rate": 9.304253295284686e-05, + "loss": 4.0109, + "step": 8200 + }, + { + "epoch": 0.5574806359559723, + "grad_norm": 0.2263563573360443, + "learning_rate": 9.303828645196359e-05, + "loss": 4.0453, + "step": 8205 + }, + { + "epoch": 0.557820356026634, + "grad_norm": 0.20858174562454224, + "learning_rate": 9.30340399510803e-05, + "loss": 4.0197, + "step": 8210 + }, + { + "epoch": 0.5581600760972958, + "grad_norm": 0.2111169844865799, + "learning_rate": 9.302979345019704e-05, + "loss": 3.9775, + "step": 8215 + }, + { + "epoch": 0.5584997961679576, + "grad_norm": 0.16785529255867004, + "learning_rate": 9.302554694931377e-05, + "loss": 4.1782, + "step": 8220 + }, + { + "epoch": 0.5588395162386194, + "grad_norm": 0.21160061657428741, + "learning_rate": 9.302130044843049e-05, + "loss": 3.7809, + "step": 8225 + }, + { + "epoch": 0.5591792363092811, + "grad_norm": 0.34842145442962646, + "learning_rate": 9.301705394754723e-05, + "loss": 4.0454, + "step": 8230 + }, + { + "epoch": 0.559518956379943, + "grad_norm": 0.22332549095153809, + "learning_rate": 9.301280744666396e-05, + "loss": 3.9447, + "step": 8235 + }, + { + "epoch": 0.5598586764506047, + "grad_norm": 0.33598944544792175, + "learning_rate": 9.300856094578067e-05, + "loss": 4.2742, + "step": 8240 + }, + { + "epoch": 0.5601983965212665, + "grad_norm": 0.18852943181991577, + "learning_rate": 9.300431444489741e-05, + "loss": 3.9494, + "step": 8245 + }, + { + "epoch": 0.5605381165919282, + "grad_norm": 0.1902894675731659, + "learning_rate": 9.300006794401414e-05, + "loss": 4.0114, + "step": 8250 + }, + { + "epoch": 0.5608778366625901, + "grad_norm": 0.3319913446903229, + "learning_rate": 9.299582144313086e-05, + "loss": 4.1077, + "step": 8255 + }, + { + "epoch": 0.5612175567332518, + "grad_norm": 1.5811398029327393, + "learning_rate": 9.29915749422476e-05, + "loss": 3.9531, + "step": 8260 + }, + { + "epoch": 0.5615572768039135, + "grad_norm": 0.18648011982440948, + "learning_rate": 9.298732844136433e-05, + "loss": 4.0406, + "step": 8265 + }, + { + "epoch": 0.5618969968745754, + "grad_norm": 0.20678003132343292, + "learning_rate": 9.298308194048104e-05, + "loss": 4.0141, + "step": 8270 + }, + { + "epoch": 0.5622367169452371, + "grad_norm": 0.284509539604187, + "learning_rate": 9.297883543959778e-05, + "loss": 3.7511, + "step": 8275 + }, + { + "epoch": 0.5625764370158989, + "grad_norm": 0.23593567311763763, + "learning_rate": 9.29745889387145e-05, + "loss": 3.9993, + "step": 8280 + }, + { + "epoch": 0.5629161570865606, + "grad_norm": 0.221805140376091, + "learning_rate": 9.297034243783122e-05, + "loss": 4.0023, + "step": 8285 + }, + { + "epoch": 0.5632558771572225, + "grad_norm": 0.1965785026550293, + "learning_rate": 9.296609593694797e-05, + "loss": 4.0849, + "step": 8290 + }, + { + "epoch": 0.5635955972278842, + "grad_norm": 0.3942621648311615, + "learning_rate": 9.296184943606468e-05, + "loss": 4.0986, + "step": 8295 + }, + { + "epoch": 0.563935317298546, + "grad_norm": 0.14950452744960785, + "learning_rate": 9.295760293518142e-05, + "loss": 4.0598, + "step": 8300 + }, + { + "epoch": 0.5642750373692078, + "grad_norm": 0.19888624548912048, + "learning_rate": 9.295335643429815e-05, + "loss": 4.1756, + "step": 8305 + }, + { + "epoch": 0.5646147574398696, + "grad_norm": 0.18374282121658325, + "learning_rate": 9.294910993341486e-05, + "loss": 4.0213, + "step": 8310 + }, + { + "epoch": 0.5649544775105313, + "grad_norm": 0.19329077005386353, + "learning_rate": 9.29448634325316e-05, + "loss": 3.8402, + "step": 8315 + }, + { + "epoch": 0.5652941975811931, + "grad_norm": 0.4161980152130127, + "learning_rate": 9.294061693164833e-05, + "loss": 3.8012, + "step": 8320 + }, + { + "epoch": 0.5656339176518549, + "grad_norm": 0.20671139657497406, + "learning_rate": 9.293637043076505e-05, + "loss": 4.335, + "step": 8325 + }, + { + "epoch": 0.5659736377225166, + "grad_norm": 0.20330487191677094, + "learning_rate": 9.293212392988179e-05, + "loss": 4.0325, + "step": 8330 + }, + { + "epoch": 0.5663133577931784, + "grad_norm": 0.19175422191619873, + "learning_rate": 9.292787742899852e-05, + "loss": 4.1178, + "step": 8335 + }, + { + "epoch": 0.5666530778638402, + "grad_norm": 0.22935859858989716, + "learning_rate": 9.292363092811523e-05, + "loss": 4.1405, + "step": 8340 + }, + { + "epoch": 0.566992797934502, + "grad_norm": 0.17749212682247162, + "learning_rate": 9.291938442723197e-05, + "loss": 3.9575, + "step": 8345 + }, + { + "epoch": 0.5673325180051637, + "grad_norm": 1.0764920711517334, + "learning_rate": 9.291513792634869e-05, + "loss": 4.3775, + "step": 8350 + }, + { + "epoch": 0.5676722380758256, + "grad_norm": 0.15739497542381287, + "learning_rate": 9.291089142546542e-05, + "loss": 4.1305, + "step": 8355 + }, + { + "epoch": 0.5680119581464873, + "grad_norm": 0.19351720809936523, + "learning_rate": 9.290664492458216e-05, + "loss": 3.8318, + "step": 8360 + }, + { + "epoch": 0.568351678217149, + "grad_norm": 0.16042912006378174, + "learning_rate": 9.290239842369887e-05, + "loss": 4.1028, + "step": 8365 + }, + { + "epoch": 0.5686913982878108, + "grad_norm": 0.1768733263015747, + "learning_rate": 9.28981519228156e-05, + "loss": 3.8895, + "step": 8370 + }, + { + "epoch": 0.5690311183584726, + "grad_norm": 0.15216295421123505, + "learning_rate": 9.289390542193234e-05, + "loss": 4.1829, + "step": 8375 + }, + { + "epoch": 0.5693708384291344, + "grad_norm": 0.20485453307628632, + "learning_rate": 9.288965892104906e-05, + "loss": 4.1128, + "step": 8380 + }, + { + "epoch": 0.5697105584997961, + "grad_norm": 0.2530359625816345, + "learning_rate": 9.288541242016578e-05, + "loss": 4.0308, + "step": 8385 + }, + { + "epoch": 0.570050278570458, + "grad_norm": 0.24390393495559692, + "learning_rate": 9.288116591928253e-05, + "loss": 3.9455, + "step": 8390 + }, + { + "epoch": 0.5703899986411197, + "grad_norm": 0.15447057783603668, + "learning_rate": 9.287691941839924e-05, + "loss": 4.2323, + "step": 8395 + }, + { + "epoch": 0.5707297187117815, + "grad_norm": 0.262494295835495, + "learning_rate": 9.287267291751597e-05, + "loss": 3.8537, + "step": 8400 + }, + { + "epoch": 0.5710694387824433, + "grad_norm": 0.2734646499156952, + "learning_rate": 9.286842641663271e-05, + "loss": 4.0975, + "step": 8405 + }, + { + "epoch": 0.5714091588531051, + "grad_norm": 0.3060397803783417, + "learning_rate": 9.286417991574942e-05, + "loss": 4.1002, + "step": 8410 + }, + { + "epoch": 0.5717488789237668, + "grad_norm": 0.194105863571167, + "learning_rate": 9.285993341486615e-05, + "loss": 3.7785, + "step": 8415 + }, + { + "epoch": 0.5720885989944285, + "grad_norm": 0.19388654828071594, + "learning_rate": 9.28556869139829e-05, + "loss": 4.0577, + "step": 8420 + }, + { + "epoch": 0.5724283190650904, + "grad_norm": 0.4632340669631958, + "learning_rate": 9.285144041309961e-05, + "loss": 4.0301, + "step": 8425 + }, + { + "epoch": 0.5727680391357521, + "grad_norm": 0.23817621171474457, + "learning_rate": 9.284719391221634e-05, + "loss": 3.9506, + "step": 8430 + }, + { + "epoch": 0.5731077592064139, + "grad_norm": 0.26604166626930237, + "learning_rate": 9.284294741133306e-05, + "loss": 3.7005, + "step": 8435 + }, + { + "epoch": 0.5734474792770757, + "grad_norm": 0.21348132193088531, + "learning_rate": 9.283870091044979e-05, + "loss": 4.0274, + "step": 8440 + }, + { + "epoch": 0.5737871993477375, + "grad_norm": 0.3157108724117279, + "learning_rate": 9.283445440956652e-05, + "loss": 3.8789, + "step": 8445 + }, + { + "epoch": 0.5741269194183992, + "grad_norm": 0.6310774087905884, + "learning_rate": 9.283020790868325e-05, + "loss": 4.0599, + "step": 8450 + }, + { + "epoch": 0.574466639489061, + "grad_norm": 0.18738152086734772, + "learning_rate": 9.282596140779998e-05, + "loss": 3.8024, + "step": 8455 + }, + { + "epoch": 0.5748063595597228, + "grad_norm": 0.3210128843784332, + "learning_rate": 9.28217149069167e-05, + "loss": 4.1008, + "step": 8460 + }, + { + "epoch": 0.5751460796303846, + "grad_norm": 0.180195152759552, + "learning_rate": 9.281746840603343e-05, + "loss": 4.0769, + "step": 8465 + }, + { + "epoch": 0.5754857997010463, + "grad_norm": 0.17868779599666595, + "learning_rate": 9.281322190515016e-05, + "loss": 3.9685, + "step": 8470 + }, + { + "epoch": 0.5758255197717081, + "grad_norm": 0.5827326774597168, + "learning_rate": 9.280897540426689e-05, + "loss": 3.9109, + "step": 8475 + }, + { + "epoch": 0.5761652398423699, + "grad_norm": 0.3124859631061554, + "learning_rate": 9.280472890338362e-05, + "loss": 3.8527, + "step": 8480 + }, + { + "epoch": 0.5765049599130316, + "grad_norm": 0.42251700162887573, + "learning_rate": 9.280048240250034e-05, + "loss": 4.1552, + "step": 8485 + }, + { + "epoch": 0.5768446799836935, + "grad_norm": 0.1950671374797821, + "learning_rate": 9.279623590161707e-05, + "loss": 3.836, + "step": 8490 + }, + { + "epoch": 0.5771844000543552, + "grad_norm": 0.8381164073944092, + "learning_rate": 9.27919894007338e-05, + "loss": 4.0999, + "step": 8495 + }, + { + "epoch": 0.577524120125017, + "grad_norm": 0.15643952786922455, + "learning_rate": 9.278774289985053e-05, + "loss": 3.9804, + "step": 8500 + }, + { + "epoch": 0.5778638401956787, + "grad_norm": 0.21564146876335144, + "learning_rate": 9.278349639896726e-05, + "loss": 3.9151, + "step": 8505 + }, + { + "epoch": 0.5782035602663406, + "grad_norm": 0.21207468211650848, + "learning_rate": 9.277924989808398e-05, + "loss": 3.9481, + "step": 8510 + }, + { + "epoch": 0.5785432803370023, + "grad_norm": 0.19761881232261658, + "learning_rate": 9.277500339720071e-05, + "loss": 4.0216, + "step": 8515 + }, + { + "epoch": 0.578883000407664, + "grad_norm": 0.18729913234710693, + "learning_rate": 9.277075689631744e-05, + "loss": 3.9112, + "step": 8520 + }, + { + "epoch": 0.5792227204783259, + "grad_norm": 0.13949252665042877, + "learning_rate": 9.276651039543417e-05, + "loss": 3.928, + "step": 8525 + }, + { + "epoch": 0.5795624405489876, + "grad_norm": 0.1481368988752365, + "learning_rate": 9.27622638945509e-05, + "loss": 4.1387, + "step": 8530 + }, + { + "epoch": 0.5799021606196494, + "grad_norm": 0.2295740246772766, + "learning_rate": 9.275801739366762e-05, + "loss": 4.0267, + "step": 8535 + }, + { + "epoch": 0.5802418806903111, + "grad_norm": 0.22337546944618225, + "learning_rate": 9.275377089278435e-05, + "loss": 3.927, + "step": 8540 + }, + { + "epoch": 0.580581600760973, + "grad_norm": 0.41308972239494324, + "learning_rate": 9.274952439190108e-05, + "loss": 4.1804, + "step": 8545 + }, + { + "epoch": 0.5809213208316347, + "grad_norm": 0.15988604724407196, + "learning_rate": 9.27452778910178e-05, + "loss": 3.8497, + "step": 8550 + }, + { + "epoch": 0.5812610409022965, + "grad_norm": 0.17153027653694153, + "learning_rate": 9.274103139013454e-05, + "loss": 4.1164, + "step": 8555 + }, + { + "epoch": 0.5816007609729583, + "grad_norm": 0.2415970116853714, + "learning_rate": 9.273678488925126e-05, + "loss": 4.1781, + "step": 8560 + }, + { + "epoch": 0.5819404810436201, + "grad_norm": 2.764481544494629, + "learning_rate": 9.273253838836798e-05, + "loss": 3.9708, + "step": 8565 + }, + { + "epoch": 0.5822802011142818, + "grad_norm": 0.1980726718902588, + "learning_rate": 9.272829188748472e-05, + "loss": 4.0011, + "step": 8570 + }, + { + "epoch": 0.5826199211849437, + "grad_norm": 0.23337212204933167, + "learning_rate": 9.272404538660145e-05, + "loss": 3.9122, + "step": 8575 + }, + { + "epoch": 0.5829596412556054, + "grad_norm": 0.1616441160440445, + "learning_rate": 9.271979888571816e-05, + "loss": 3.8085, + "step": 8580 + }, + { + "epoch": 0.5832993613262671, + "grad_norm": 0.19544613361358643, + "learning_rate": 9.27155523848349e-05, + "loss": 4.0634, + "step": 8585 + }, + { + "epoch": 0.5836390813969289, + "grad_norm": 0.19798190891742706, + "learning_rate": 9.271130588395163e-05, + "loss": 3.9989, + "step": 8590 + }, + { + "epoch": 0.5839788014675907, + "grad_norm": 0.1608157753944397, + "learning_rate": 9.270705938306835e-05, + "loss": 4.3514, + "step": 8595 + }, + { + "epoch": 0.5843185215382525, + "grad_norm": 0.23190398514270782, + "learning_rate": 9.270281288218509e-05, + "loss": 3.9989, + "step": 8600 + }, + { + "epoch": 0.5846582416089142, + "grad_norm": 0.694850742816925, + "learning_rate": 9.269856638130182e-05, + "loss": 4.3087, + "step": 8605 + }, + { + "epoch": 0.5849979616795761, + "grad_norm": 0.2711631655693054, + "learning_rate": 9.269431988041853e-05, + "loss": 3.9979, + "step": 8610 + }, + { + "epoch": 0.5853376817502378, + "grad_norm": 0.17717806994915009, + "learning_rate": 9.269007337953527e-05, + "loss": 3.9771, + "step": 8615 + }, + { + "epoch": 0.5856774018208996, + "grad_norm": 0.258022665977478, + "learning_rate": 9.2685826878652e-05, + "loss": 3.9518, + "step": 8620 + }, + { + "epoch": 0.5860171218915613, + "grad_norm": 1.0233123302459717, + "learning_rate": 9.268158037776871e-05, + "loss": 4.0393, + "step": 8625 + }, + { + "epoch": 0.5863568419622232, + "grad_norm": 0.19119100272655487, + "learning_rate": 9.267733387688546e-05, + "loss": 4.2572, + "step": 8630 + }, + { + "epoch": 0.5866965620328849, + "grad_norm": 0.16495579481124878, + "learning_rate": 9.267308737600217e-05, + "loss": 4.0212, + "step": 8635 + }, + { + "epoch": 0.5870362821035466, + "grad_norm": 0.3087138831615448, + "learning_rate": 9.266884087511891e-05, + "loss": 3.6527, + "step": 8640 + }, + { + "epoch": 0.5873760021742085, + "grad_norm": 0.16957888007164001, + "learning_rate": 9.266459437423564e-05, + "loss": 4.2809, + "step": 8645 + }, + { + "epoch": 0.5877157222448702, + "grad_norm": 0.15367551147937775, + "learning_rate": 9.266034787335235e-05, + "loss": 4.2469, + "step": 8650 + }, + { + "epoch": 0.588055442315532, + "grad_norm": 0.2356184720993042, + "learning_rate": 9.26561013724691e-05, + "loss": 3.8713, + "step": 8655 + }, + { + "epoch": 0.5883951623861938, + "grad_norm": 6.177441120147705, + "learning_rate": 9.265185487158582e-05, + "loss": 3.8173, + "step": 8660 + }, + { + "epoch": 0.5887348824568556, + "grad_norm": 0.15291577577590942, + "learning_rate": 9.264760837070254e-05, + "loss": 4.0039, + "step": 8665 + }, + { + "epoch": 0.5890746025275173, + "grad_norm": 0.1881755292415619, + "learning_rate": 9.264336186981928e-05, + "loss": 4.1521, + "step": 8670 + }, + { + "epoch": 0.589414322598179, + "grad_norm": 0.28705185651779175, + "learning_rate": 9.263911536893601e-05, + "loss": 4.1136, + "step": 8675 + }, + { + "epoch": 0.5897540426688409, + "grad_norm": 0.8784993886947632, + "learning_rate": 9.263486886805272e-05, + "loss": 4.0763, + "step": 8680 + }, + { + "epoch": 0.5900937627395026, + "grad_norm": 0.5077646970748901, + "learning_rate": 9.263062236716946e-05, + "loss": 3.818, + "step": 8685 + }, + { + "epoch": 0.5904334828101644, + "grad_norm": 0.17957673966884613, + "learning_rate": 9.262637586628619e-05, + "loss": 3.8784, + "step": 8690 + }, + { + "epoch": 0.5907732028808262, + "grad_norm": 0.2007267326116562, + "learning_rate": 9.26221293654029e-05, + "loss": 4.0291, + "step": 8695 + }, + { + "epoch": 0.591112922951488, + "grad_norm": 0.3630281686782837, + "learning_rate": 9.261788286451965e-05, + "loss": 4.0589, + "step": 8700 + }, + { + "epoch": 0.5914526430221497, + "grad_norm": 0.33113589882850647, + "learning_rate": 9.261363636363636e-05, + "loss": 4.0724, + "step": 8705 + }, + { + "epoch": 0.5917923630928115, + "grad_norm": 0.9370392560958862, + "learning_rate": 9.260938986275309e-05, + "loss": 3.9571, + "step": 8710 + }, + { + "epoch": 0.5921320831634733, + "grad_norm": 0.7348127365112305, + "learning_rate": 9.260514336186983e-05, + "loss": 3.8521, + "step": 8715 + }, + { + "epoch": 0.5924718032341351, + "grad_norm": 0.20114554464817047, + "learning_rate": 9.260089686098655e-05, + "loss": 4.0328, + "step": 8720 + }, + { + "epoch": 0.5928115233047968, + "grad_norm": 0.375806987285614, + "learning_rate": 9.259665036010327e-05, + "loss": 4.045, + "step": 8725 + }, + { + "epoch": 0.5931512433754587, + "grad_norm": 0.17481490969657898, + "learning_rate": 9.259240385922002e-05, + "loss": 4.1616, + "step": 8730 + }, + { + "epoch": 0.5934909634461204, + "grad_norm": 0.3022591471672058, + "learning_rate": 9.258815735833673e-05, + "loss": 4.0886, + "step": 8735 + }, + { + "epoch": 0.5938306835167821, + "grad_norm": 0.17564785480499268, + "learning_rate": 9.258391085745346e-05, + "loss": 3.9194, + "step": 8740 + }, + { + "epoch": 0.594170403587444, + "grad_norm": 0.19886420667171478, + "learning_rate": 9.25796643565702e-05, + "loss": 3.9606, + "step": 8745 + }, + { + "epoch": 0.5945101236581057, + "grad_norm": 0.18436846137046814, + "learning_rate": 9.257541785568691e-05, + "loss": 4.0952, + "step": 8750 + }, + { + "epoch": 0.5948498437287675, + "grad_norm": 0.39597442746162415, + "learning_rate": 9.257117135480364e-05, + "loss": 4.0044, + "step": 8755 + }, + { + "epoch": 0.5951895637994292, + "grad_norm": 0.2468959391117096, + "learning_rate": 9.256692485392038e-05, + "loss": 4.2667, + "step": 8760 + }, + { + "epoch": 0.5955292838700911, + "grad_norm": 0.21869252622127533, + "learning_rate": 9.25626783530371e-05, + "loss": 3.862, + "step": 8765 + }, + { + "epoch": 0.5958690039407528, + "grad_norm": 0.18693305552005768, + "learning_rate": 9.255843185215383e-05, + "loss": 4.1253, + "step": 8770 + }, + { + "epoch": 0.5962087240114146, + "grad_norm": 0.2016768604516983, + "learning_rate": 9.255418535127055e-05, + "loss": 4.0373, + "step": 8775 + }, + { + "epoch": 0.5965484440820764, + "grad_norm": 0.20323170721530914, + "learning_rate": 9.254993885038728e-05, + "loss": 3.9236, + "step": 8780 + }, + { + "epoch": 0.5968881641527382, + "grad_norm": 0.17881441116333008, + "learning_rate": 9.254569234950401e-05, + "loss": 3.9083, + "step": 8785 + }, + { + "epoch": 0.5972278842233999, + "grad_norm": 0.22243642807006836, + "learning_rate": 9.254144584862074e-05, + "loss": 4.0484, + "step": 8790 + }, + { + "epoch": 0.5975676042940616, + "grad_norm": 0.160291388630867, + "learning_rate": 9.253719934773747e-05, + "loss": 4.0718, + "step": 8795 + }, + { + "epoch": 0.5979073243647235, + "grad_norm": 0.18238534033298492, + "learning_rate": 9.25329528468542e-05, + "loss": 3.8954, + "step": 8800 + }, + { + "epoch": 0.5982470444353852, + "grad_norm": 0.2048415243625641, + "learning_rate": 9.252870634597092e-05, + "loss": 3.9912, + "step": 8805 + }, + { + "epoch": 0.598586764506047, + "grad_norm": 0.17319513857364655, + "learning_rate": 9.252445984508765e-05, + "loss": 4.0986, + "step": 8810 + }, + { + "epoch": 0.5989264845767088, + "grad_norm": 0.24065633118152618, + "learning_rate": 9.252021334420438e-05, + "loss": 4.1444, + "step": 8815 + }, + { + "epoch": 0.5992662046473706, + "grad_norm": 0.20234987139701843, + "learning_rate": 9.251596684332111e-05, + "loss": 3.8242, + "step": 8820 + }, + { + "epoch": 0.5996059247180323, + "grad_norm": 0.21660032868385315, + "learning_rate": 9.251172034243783e-05, + "loss": 3.8864, + "step": 8825 + }, + { + "epoch": 0.5999456447886942, + "grad_norm": 0.17871896922588348, + "learning_rate": 9.250747384155456e-05, + "loss": 3.8432, + "step": 8830 + }, + { + "epoch": 0.6002853648593559, + "grad_norm": 0.16359539330005646, + "learning_rate": 9.250322734067129e-05, + "loss": 4.0026, + "step": 8835 + }, + { + "epoch": 0.6006250849300176, + "grad_norm": 0.2683166265487671, + "learning_rate": 9.249898083978802e-05, + "loss": 4.1686, + "step": 8840 + }, + { + "epoch": 0.6009648050006794, + "grad_norm": 0.16088762879371643, + "learning_rate": 9.249473433890475e-05, + "loss": 3.9448, + "step": 8845 + }, + { + "epoch": 0.6013045250713412, + "grad_norm": 0.21767516434192657, + "learning_rate": 9.249048783802147e-05, + "loss": 4.1642, + "step": 8850 + }, + { + "epoch": 0.601644245142003, + "grad_norm": 1.050563097000122, + "learning_rate": 9.24862413371382e-05, + "loss": 4.0446, + "step": 8855 + }, + { + "epoch": 0.6019839652126647, + "grad_norm": 0.16180701553821564, + "learning_rate": 9.248199483625493e-05, + "loss": 4.014, + "step": 8860 + }, + { + "epoch": 0.6023236852833266, + "grad_norm": 0.1821652054786682, + "learning_rate": 9.247774833537166e-05, + "loss": 4.0171, + "step": 8865 + }, + { + "epoch": 0.6026634053539883, + "grad_norm": 1.0722553730010986, + "learning_rate": 9.247350183448839e-05, + "loss": 3.8606, + "step": 8870 + }, + { + "epoch": 0.6030031254246501, + "grad_norm": 0.22805406153202057, + "learning_rate": 9.246925533360511e-05, + "loss": 3.993, + "step": 8875 + }, + { + "epoch": 0.6033428454953118, + "grad_norm": 0.2157077044248581, + "learning_rate": 9.246500883272184e-05, + "loss": 3.7777, + "step": 8880 + }, + { + "epoch": 0.6036825655659737, + "grad_norm": 0.18143793940544128, + "learning_rate": 9.246076233183857e-05, + "loss": 4.127, + "step": 8885 + }, + { + "epoch": 0.6040222856366354, + "grad_norm": 0.18271377682685852, + "learning_rate": 9.24565158309553e-05, + "loss": 4.0305, + "step": 8890 + }, + { + "epoch": 0.6043620057072971, + "grad_norm": 0.2080451250076294, + "learning_rate": 9.245226933007203e-05, + "loss": 3.8488, + "step": 8895 + }, + { + "epoch": 0.604701725777959, + "grad_norm": 1.494086742401123, + "learning_rate": 9.244802282918875e-05, + "loss": 4.0619, + "step": 8900 + }, + { + "epoch": 0.6050414458486207, + "grad_norm": 0.2166607528924942, + "learning_rate": 9.244377632830547e-05, + "loss": 3.8448, + "step": 8905 + }, + { + "epoch": 0.6053811659192825, + "grad_norm": 0.24948440492153168, + "learning_rate": 9.243952982742221e-05, + "loss": 3.9839, + "step": 8910 + }, + { + "epoch": 0.6057208859899443, + "grad_norm": 0.33411622047424316, + "learning_rate": 9.243528332653894e-05, + "loss": 4.0442, + "step": 8915 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.17864017188549042, + "learning_rate": 9.243103682565565e-05, + "loss": 3.996, + "step": 8920 + }, + { + "epoch": 0.6064003261312678, + "grad_norm": 0.18109479546546936, + "learning_rate": 9.24267903247724e-05, + "loss": 4.1308, + "step": 8925 + }, + { + "epoch": 0.6067400462019296, + "grad_norm": 0.21822641789913177, + "learning_rate": 9.242254382388912e-05, + "loss": 4.1508, + "step": 8930 + }, + { + "epoch": 0.6070797662725914, + "grad_norm": 0.37898629903793335, + "learning_rate": 9.241829732300584e-05, + "loss": 4.1584, + "step": 8935 + }, + { + "epoch": 0.6074194863432532, + "grad_norm": 0.30412939190864563, + "learning_rate": 9.241405082212258e-05, + "loss": 3.948, + "step": 8940 + }, + { + "epoch": 0.6077592064139149, + "grad_norm": 0.20068703591823578, + "learning_rate": 9.240980432123931e-05, + "loss": 4.1206, + "step": 8945 + }, + { + "epoch": 0.6080989264845768, + "grad_norm": 0.25400644540786743, + "learning_rate": 9.240555782035602e-05, + "loss": 4.3133, + "step": 8950 + }, + { + "epoch": 0.6084386465552385, + "grad_norm": 0.19139589369297028, + "learning_rate": 9.240131131947276e-05, + "loss": 3.9386, + "step": 8955 + }, + { + "epoch": 0.6087783666259002, + "grad_norm": 0.16936640441417694, + "learning_rate": 9.239706481858949e-05, + "loss": 4.0571, + "step": 8960 + }, + { + "epoch": 0.609118086696562, + "grad_norm": 0.20352940261363983, + "learning_rate": 9.23928183177062e-05, + "loss": 3.7657, + "step": 8965 + }, + { + "epoch": 0.6094578067672238, + "grad_norm": 0.1573779433965683, + "learning_rate": 9.238857181682295e-05, + "loss": 4.1298, + "step": 8970 + }, + { + "epoch": 0.6097975268378856, + "grad_norm": 0.20279887318611145, + "learning_rate": 9.238432531593966e-05, + "loss": 3.9271, + "step": 8975 + }, + { + "epoch": 0.6101372469085473, + "grad_norm": 16.52877426147461, + "learning_rate": 9.23800788150564e-05, + "loss": 3.8319, + "step": 8980 + }, + { + "epoch": 0.6104769669792092, + "grad_norm": 0.17606152594089508, + "learning_rate": 9.237583231417313e-05, + "loss": 3.7453, + "step": 8985 + }, + { + "epoch": 0.6108166870498709, + "grad_norm": 0.23678942024707794, + "learning_rate": 9.237158581328985e-05, + "loss": 3.9744, + "step": 8990 + }, + { + "epoch": 0.6111564071205327, + "grad_norm": 0.21576398611068726, + "learning_rate": 9.236733931240659e-05, + "loss": 4.1691, + "step": 8995 + }, + { + "epoch": 0.6114961271911945, + "grad_norm": 0.2256469577550888, + "learning_rate": 9.236309281152332e-05, + "loss": 3.9419, + "step": 9000 + }, + { + "epoch": 0.6118358472618562, + "grad_norm": 0.23562292754650116, + "learning_rate": 9.235884631064003e-05, + "loss": 4.1049, + "step": 9005 + }, + { + "epoch": 0.612175567332518, + "grad_norm": 0.2894531190395355, + "learning_rate": 9.235459980975677e-05, + "loss": 3.9535, + "step": 9010 + }, + { + "epoch": 0.6125152874031797, + "grad_norm": 0.22967609763145447, + "learning_rate": 9.23503533088735e-05, + "loss": 3.9537, + "step": 9015 + }, + { + "epoch": 0.6128550074738416, + "grad_norm": 1.3449379205703735, + "learning_rate": 9.234610680799021e-05, + "loss": 4.08, + "step": 9020 + }, + { + "epoch": 0.6131947275445033, + "grad_norm": 0.2513696849346161, + "learning_rate": 9.234186030710696e-05, + "loss": 3.8626, + "step": 9025 + }, + { + "epoch": 0.6135344476151651, + "grad_norm": 0.19834820926189423, + "learning_rate": 9.233761380622368e-05, + "loss": 3.9427, + "step": 9030 + }, + { + "epoch": 0.6138741676858269, + "grad_norm": 0.2118111550807953, + "learning_rate": 9.23333673053404e-05, + "loss": 3.8768, + "step": 9035 + }, + { + "epoch": 0.6142138877564887, + "grad_norm": 0.15969218313694, + "learning_rate": 9.232912080445714e-05, + "loss": 4.1679, + "step": 9040 + }, + { + "epoch": 0.6145536078271504, + "grad_norm": 0.184014692902565, + "learning_rate": 9.232487430357387e-05, + "loss": 4.1939, + "step": 9045 + }, + { + "epoch": 0.6148933278978121, + "grad_norm": 0.17408449947834015, + "learning_rate": 9.232062780269058e-05, + "loss": 4.0627, + "step": 9050 + }, + { + "epoch": 0.615233047968474, + "grad_norm": 0.3706364929676056, + "learning_rate": 9.231638130180732e-05, + "loss": 4.1244, + "step": 9055 + }, + { + "epoch": 0.6155727680391357, + "grad_norm": 0.2606452703475952, + "learning_rate": 9.231213480092404e-05, + "loss": 4.0637, + "step": 9060 + }, + { + "epoch": 0.6159124881097975, + "grad_norm": 0.2730484902858734, + "learning_rate": 9.230788830004077e-05, + "loss": 4.227, + "step": 9065 + }, + { + "epoch": 0.6162522081804593, + "grad_norm": 0.17146620154380798, + "learning_rate": 9.230364179915751e-05, + "loss": 3.9837, + "step": 9070 + }, + { + "epoch": 0.6165919282511211, + "grad_norm": 0.23479421436786652, + "learning_rate": 9.229939529827422e-05, + "loss": 3.9549, + "step": 9075 + }, + { + "epoch": 0.6169316483217828, + "grad_norm": 0.17351080477237701, + "learning_rate": 9.229514879739095e-05, + "loss": 4.0837, + "step": 9080 + }, + { + "epoch": 0.6172713683924447, + "grad_norm": 0.18759216368198395, + "learning_rate": 9.229090229650769e-05, + "loss": 4.1507, + "step": 9085 + }, + { + "epoch": 0.6176110884631064, + "grad_norm": 0.373471736907959, + "learning_rate": 9.22866557956244e-05, + "loss": 3.8382, + "step": 9090 + }, + { + "epoch": 0.6179508085337682, + "grad_norm": 0.16654084622859955, + "learning_rate": 9.228240929474113e-05, + "loss": 3.9211, + "step": 9095 + }, + { + "epoch": 0.6182905286044299, + "grad_norm": 2.6412665843963623, + "learning_rate": 9.227816279385788e-05, + "loss": 4.1437, + "step": 9100 + }, + { + "epoch": 0.6186302486750918, + "grad_norm": 0.22232688963413239, + "learning_rate": 9.227391629297459e-05, + "loss": 4.3322, + "step": 9105 + }, + { + "epoch": 0.6189699687457535, + "grad_norm": 0.19792711734771729, + "learning_rate": 9.226966979209132e-05, + "loss": 3.6801, + "step": 9110 + }, + { + "epoch": 0.6193096888164152, + "grad_norm": 0.19452767074108124, + "learning_rate": 9.226542329120806e-05, + "loss": 4.1374, + "step": 9115 + }, + { + "epoch": 0.6196494088870771, + "grad_norm": 0.17766821384429932, + "learning_rate": 9.226117679032477e-05, + "loss": 3.998, + "step": 9120 + }, + { + "epoch": 0.6199891289577388, + "grad_norm": 0.21171297132968903, + "learning_rate": 9.22569302894415e-05, + "loss": 4.0408, + "step": 9125 + }, + { + "epoch": 0.6203288490284006, + "grad_norm": 0.2353495955467224, + "learning_rate": 9.225268378855823e-05, + "loss": 4.2569, + "step": 9130 + }, + { + "epoch": 0.6206685690990623, + "grad_norm": 0.1814018189907074, + "learning_rate": 9.224843728767496e-05, + "loss": 4.1783, + "step": 9135 + }, + { + "epoch": 0.6210082891697242, + "grad_norm": 0.16691021621227264, + "learning_rate": 9.224419078679169e-05, + "loss": 3.95, + "step": 9140 + }, + { + "epoch": 0.6213480092403859, + "grad_norm": 0.24587292969226837, + "learning_rate": 9.223994428590841e-05, + "loss": 4.187, + "step": 9145 + }, + { + "epoch": 0.6216877293110477, + "grad_norm": 0.12846866250038147, + "learning_rate": 9.223569778502514e-05, + "loss": 3.7833, + "step": 9150 + }, + { + "epoch": 0.6220274493817095, + "grad_norm": 0.15816563367843628, + "learning_rate": 9.223145128414187e-05, + "loss": 3.9622, + "step": 9155 + }, + { + "epoch": 0.6223671694523712, + "grad_norm": 0.5960243344306946, + "learning_rate": 9.22272047832586e-05, + "loss": 4.0261, + "step": 9160 + }, + { + "epoch": 0.622706889523033, + "grad_norm": 0.21092690527439117, + "learning_rate": 9.222295828237533e-05, + "loss": 4.0417, + "step": 9165 + }, + { + "epoch": 0.6230466095936948, + "grad_norm": 0.21227766573429108, + "learning_rate": 9.221871178149205e-05, + "loss": 4.0396, + "step": 9170 + }, + { + "epoch": 0.6233863296643566, + "grad_norm": 0.1947152018547058, + "learning_rate": 9.221446528060878e-05, + "loss": 3.9731, + "step": 9175 + }, + { + "epoch": 0.6237260497350183, + "grad_norm": 0.264658123254776, + "learning_rate": 9.221021877972551e-05, + "loss": 3.7214, + "step": 9180 + }, + { + "epoch": 0.6240657698056801, + "grad_norm": 0.14752690494060516, + "learning_rate": 9.220597227884224e-05, + "loss": 4.262, + "step": 9185 + }, + { + "epoch": 0.6244054898763419, + "grad_norm": 0.20559826493263245, + "learning_rate": 9.220172577795897e-05, + "loss": 4.0443, + "step": 9190 + }, + { + "epoch": 0.6247452099470037, + "grad_norm": 0.17190653085708618, + "learning_rate": 9.21974792770757e-05, + "loss": 4.246, + "step": 9195 + }, + { + "epoch": 0.6250849300176654, + "grad_norm": 0.20981894433498383, + "learning_rate": 9.219323277619242e-05, + "loss": 4.0482, + "step": 9200 + }, + { + "epoch": 0.6254246500883273, + "grad_norm": 0.20660968124866486, + "learning_rate": 9.218898627530915e-05, + "loss": 3.9868, + "step": 9205 + }, + { + "epoch": 0.625764370158989, + "grad_norm": 0.21324321627616882, + "learning_rate": 9.218473977442588e-05, + "loss": 4.3044, + "step": 9210 + }, + { + "epoch": 0.6261040902296507, + "grad_norm": 0.45024943351745605, + "learning_rate": 9.21804932735426e-05, + "loss": 3.945, + "step": 9215 + }, + { + "epoch": 0.6264438103003126, + "grad_norm": 0.19029852747917175, + "learning_rate": 9.217624677265933e-05, + "loss": 4.2293, + "step": 9220 + }, + { + "epoch": 0.6267835303709743, + "grad_norm": 0.17948591709136963, + "learning_rate": 9.217200027177606e-05, + "loss": 4.4357, + "step": 9225 + }, + { + "epoch": 0.6271232504416361, + "grad_norm": 0.30317583680152893, + "learning_rate": 9.216775377089279e-05, + "loss": 3.9639, + "step": 9230 + }, + { + "epoch": 0.6274629705122978, + "grad_norm": 0.1628941148519516, + "learning_rate": 9.216350727000952e-05, + "loss": 4.0654, + "step": 9235 + }, + { + "epoch": 0.6278026905829597, + "grad_norm": 0.20330828428268433, + "learning_rate": 9.215926076912625e-05, + "loss": 4.0975, + "step": 9240 + }, + { + "epoch": 0.6281424106536214, + "grad_norm": 0.19045989215373993, + "learning_rate": 9.215501426824297e-05, + "loss": 4.0935, + "step": 9245 + }, + { + "epoch": 0.6284821307242832, + "grad_norm": 1.2690106630325317, + "learning_rate": 9.21507677673597e-05, + "loss": 3.8794, + "step": 9250 + }, + { + "epoch": 0.628821850794945, + "grad_norm": 0.3011123538017273, + "learning_rate": 9.214652126647643e-05, + "loss": 3.8597, + "step": 9255 + }, + { + "epoch": 0.6291615708656068, + "grad_norm": 0.17079856991767883, + "learning_rate": 9.214227476559314e-05, + "loss": 3.9687, + "step": 9260 + }, + { + "epoch": 0.6295012909362685, + "grad_norm": 0.18377001583576202, + "learning_rate": 9.213802826470989e-05, + "loss": 3.8344, + "step": 9265 + }, + { + "epoch": 0.6298410110069302, + "grad_norm": 0.26580023765563965, + "learning_rate": 9.213378176382661e-05, + "loss": 3.9417, + "step": 9270 + }, + { + "epoch": 0.6301807310775921, + "grad_norm": 0.24509429931640625, + "learning_rate": 9.212953526294333e-05, + "loss": 3.9923, + "step": 9275 + }, + { + "epoch": 0.6305204511482538, + "grad_norm": 0.1840851753950119, + "learning_rate": 9.212528876206007e-05, + "loss": 4.1023, + "step": 9280 + }, + { + "epoch": 0.6308601712189156, + "grad_norm": 0.16701363027095795, + "learning_rate": 9.21210422611768e-05, + "loss": 4.1101, + "step": 9285 + }, + { + "epoch": 0.6311998912895774, + "grad_norm": 0.5241246819496155, + "learning_rate": 9.211679576029351e-05, + "loss": 3.5958, + "step": 9290 + }, + { + "epoch": 0.6315396113602392, + "grad_norm": 0.17940564453601837, + "learning_rate": 9.211254925941025e-05, + "loss": 3.9681, + "step": 9295 + }, + { + "epoch": 0.6318793314309009, + "grad_norm": 0.2497144490480423, + "learning_rate": 9.210830275852698e-05, + "loss": 4.0491, + "step": 9300 + }, + { + "epoch": 0.6322190515015628, + "grad_norm": 0.48636677861213684, + "learning_rate": 9.21040562576437e-05, + "loss": 3.9699, + "step": 9305 + }, + { + "epoch": 0.6325587715722245, + "grad_norm": 0.2682591676712036, + "learning_rate": 9.209980975676044e-05, + "loss": 3.8142, + "step": 9310 + }, + { + "epoch": 0.6328984916428863, + "grad_norm": 0.5112836360931396, + "learning_rate": 9.209556325587717e-05, + "loss": 3.8701, + "step": 9315 + }, + { + "epoch": 0.633238211713548, + "grad_norm": 0.37686988711357117, + "learning_rate": 9.20913167549939e-05, + "loss": 3.9299, + "step": 9320 + }, + { + "epoch": 0.6335779317842098, + "grad_norm": 0.3224041759967804, + "learning_rate": 9.208707025411062e-05, + "loss": 4.0601, + "step": 9325 + }, + { + "epoch": 0.6339176518548716, + "grad_norm": 0.241369366645813, + "learning_rate": 9.208282375322734e-05, + "loss": 4.1394, + "step": 9330 + }, + { + "epoch": 0.6342573719255333, + "grad_norm": 0.1902744024991989, + "learning_rate": 9.207857725234408e-05, + "loss": 4.1937, + "step": 9335 + }, + { + "epoch": 0.6345970919961952, + "grad_norm": 0.20103438198566437, + "learning_rate": 9.20743307514608e-05, + "loss": 4.1394, + "step": 9340 + }, + { + "epoch": 0.6349368120668569, + "grad_norm": 0.17396235466003418, + "learning_rate": 9.207008425057752e-05, + "loss": 4.0629, + "step": 9345 + }, + { + "epoch": 0.6352765321375187, + "grad_norm": 0.1853020042181015, + "learning_rate": 9.206583774969426e-05, + "loss": 4.0948, + "step": 9350 + }, + { + "epoch": 0.6356162522081804, + "grad_norm": 0.15830761194229126, + "learning_rate": 9.206159124881099e-05, + "loss": 4.0189, + "step": 9355 + }, + { + "epoch": 0.6359559722788423, + "grad_norm": 0.18493899703025818, + "learning_rate": 9.20573447479277e-05, + "loss": 3.9398, + "step": 9360 + }, + { + "epoch": 0.636295692349504, + "grad_norm": 0.19994007050991058, + "learning_rate": 9.205309824704445e-05, + "loss": 4.1339, + "step": 9365 + }, + { + "epoch": 0.6366354124201657, + "grad_norm": 0.16409240663051605, + "learning_rate": 9.204885174616117e-05, + "loss": 4.3045, + "step": 9370 + }, + { + "epoch": 0.6369751324908276, + "grad_norm": 0.34056273102760315, + "learning_rate": 9.204460524527789e-05, + "loss": 3.9817, + "step": 9375 + }, + { + "epoch": 0.6373148525614893, + "grad_norm": 0.19899314641952515, + "learning_rate": 9.204035874439463e-05, + "loss": 3.8643, + "step": 9380 + }, + { + "epoch": 0.6376545726321511, + "grad_norm": 0.2605150043964386, + "learning_rate": 9.203611224351136e-05, + "loss": 4.0556, + "step": 9385 + }, + { + "epoch": 0.6379942927028129, + "grad_norm": 0.23088324069976807, + "learning_rate": 9.203186574262807e-05, + "loss": 3.9551, + "step": 9390 + }, + { + "epoch": 0.6383340127734747, + "grad_norm": 0.181942418217659, + "learning_rate": 9.202761924174481e-05, + "loss": 4.0324, + "step": 9395 + }, + { + "epoch": 0.6386737328441364, + "grad_norm": 0.1608920693397522, + "learning_rate": 9.202337274086154e-05, + "loss": 3.8843, + "step": 9400 + }, + { + "epoch": 0.6390134529147982, + "grad_norm": 0.23045985400676727, + "learning_rate": 9.201912623997826e-05, + "loss": 3.8905, + "step": 9405 + }, + { + "epoch": 0.63935317298546, + "grad_norm": 0.20380303263664246, + "learning_rate": 9.2014879739095e-05, + "loss": 3.988, + "step": 9410 + }, + { + "epoch": 0.6396928930561218, + "grad_norm": 0.6223300695419312, + "learning_rate": 9.201063323821171e-05, + "loss": 3.9791, + "step": 9415 + }, + { + "epoch": 0.6400326131267835, + "grad_norm": 0.8067218065261841, + "learning_rate": 9.200638673732844e-05, + "loss": 4.1077, + "step": 9420 + }, + { + "epoch": 0.6403723331974454, + "grad_norm": 0.20715036988258362, + "learning_rate": 9.200214023644518e-05, + "loss": 4.138, + "step": 9425 + }, + { + "epoch": 0.6407120532681071, + "grad_norm": 0.20288419723510742, + "learning_rate": 9.19978937355619e-05, + "loss": 3.9553, + "step": 9430 + }, + { + "epoch": 0.6410517733387688, + "grad_norm": 0.18636272847652435, + "learning_rate": 9.199364723467862e-05, + "loss": 3.9276, + "step": 9435 + }, + { + "epoch": 0.6413914934094306, + "grad_norm": 0.18782015144824982, + "learning_rate": 9.198940073379537e-05, + "loss": 4.0336, + "step": 9440 + }, + { + "epoch": 0.6417312134800924, + "grad_norm": 0.19050072133541107, + "learning_rate": 9.198515423291208e-05, + "loss": 4.0227, + "step": 9445 + }, + { + "epoch": 0.6420709335507542, + "grad_norm": 0.30288147926330566, + "learning_rate": 9.198090773202881e-05, + "loss": 3.8769, + "step": 9450 + }, + { + "epoch": 0.6424106536214159, + "grad_norm": 0.22393420338630676, + "learning_rate": 9.197666123114555e-05, + "loss": 3.9509, + "step": 9455 + }, + { + "epoch": 0.6427503736920778, + "grad_norm": 0.21114152669906616, + "learning_rate": 9.197241473026226e-05, + "loss": 4.1008, + "step": 9460 + }, + { + "epoch": 0.6430900937627395, + "grad_norm": 0.238412544131279, + "learning_rate": 9.196816822937899e-05, + "loss": 3.8772, + "step": 9465 + }, + { + "epoch": 0.6434298138334013, + "grad_norm": 0.6189041137695312, + "learning_rate": 9.196392172849573e-05, + "loss": 3.9286, + "step": 9470 + }, + { + "epoch": 0.6437695339040631, + "grad_norm": 0.26222628355026245, + "learning_rate": 9.195967522761245e-05, + "loss": 4.1328, + "step": 9475 + }, + { + "epoch": 0.6441092539747248, + "grad_norm": 0.21858513355255127, + "learning_rate": 9.195542872672918e-05, + "loss": 3.9626, + "step": 9480 + }, + { + "epoch": 0.6444489740453866, + "grad_norm": 0.21814042329788208, + "learning_rate": 9.19511822258459e-05, + "loss": 4.3128, + "step": 9485 + }, + { + "epoch": 0.6447886941160483, + "grad_norm": 0.19709447026252747, + "learning_rate": 9.194693572496263e-05, + "loss": 3.9969, + "step": 9490 + }, + { + "epoch": 0.6451284141867102, + "grad_norm": 0.172958642244339, + "learning_rate": 9.194268922407936e-05, + "loss": 4.0359, + "step": 9495 + }, + { + "epoch": 0.6454681342573719, + "grad_norm": 0.17147018015384674, + "learning_rate": 9.193844272319609e-05, + "loss": 3.8998, + "step": 9500 + }, + { + "epoch": 0.6458078543280337, + "grad_norm": 0.16436798870563507, + "learning_rate": 9.193419622231282e-05, + "loss": 3.8929, + "step": 9505 + }, + { + "epoch": 0.6461475743986955, + "grad_norm": 0.17490500211715698, + "learning_rate": 9.192994972142954e-05, + "loss": 3.9468, + "step": 9510 + }, + { + "epoch": 0.6464872944693573, + "grad_norm": 0.5229065418243408, + "learning_rate": 9.192570322054627e-05, + "loss": 4.1358, + "step": 9515 + }, + { + "epoch": 0.646827014540019, + "grad_norm": 0.1972028911113739, + "learning_rate": 9.1921456719663e-05, + "loss": 4.1462, + "step": 9520 + }, + { + "epoch": 0.6471667346106807, + "grad_norm": 0.18450862169265747, + "learning_rate": 9.191721021877973e-05, + "loss": 4.0512, + "step": 9525 + }, + { + "epoch": 0.6475064546813426, + "grad_norm": 0.24547728896141052, + "learning_rate": 9.191296371789646e-05, + "loss": 4.0998, + "step": 9530 + }, + { + "epoch": 0.6478461747520043, + "grad_norm": 0.19123290479183197, + "learning_rate": 9.190871721701318e-05, + "loss": 3.8672, + "step": 9535 + }, + { + "epoch": 0.6481858948226661, + "grad_norm": 0.22196923196315765, + "learning_rate": 9.190447071612991e-05, + "loss": 4.1529, + "step": 9540 + }, + { + "epoch": 0.6485256148933279, + "grad_norm": 0.21963347494602203, + "learning_rate": 9.190022421524664e-05, + "loss": 4.1048, + "step": 9545 + }, + { + "epoch": 0.6488653349639897, + "grad_norm": 0.19842089712619781, + "learning_rate": 9.189597771436337e-05, + "loss": 4.0297, + "step": 9550 + }, + { + "epoch": 0.6492050550346514, + "grad_norm": 0.2944156527519226, + "learning_rate": 9.18917312134801e-05, + "loss": 4.0261, + "step": 9555 + }, + { + "epoch": 0.6495447751053133, + "grad_norm": 0.1940944492816925, + "learning_rate": 9.188748471259682e-05, + "loss": 3.9962, + "step": 9560 + }, + { + "epoch": 0.649884495175975, + "grad_norm": 0.1459953486919403, + "learning_rate": 9.188323821171355e-05, + "loss": 4.0297, + "step": 9565 + }, + { + "epoch": 0.6502242152466368, + "grad_norm": 0.18244166672229767, + "learning_rate": 9.187899171083028e-05, + "loss": 3.9038, + "step": 9570 + }, + { + "epoch": 0.6505639353172985, + "grad_norm": 0.5632447004318237, + "learning_rate": 9.187474520994701e-05, + "loss": 3.8968, + "step": 9575 + }, + { + "epoch": 0.6509036553879604, + "grad_norm": 0.21160653233528137, + "learning_rate": 9.187049870906374e-05, + "loss": 4.0296, + "step": 9580 + }, + { + "epoch": 0.6512433754586221, + "grad_norm": 0.5150845646858215, + "learning_rate": 9.186625220818046e-05, + "loss": 3.9448, + "step": 9585 + }, + { + "epoch": 0.6515830955292838, + "grad_norm": 0.16149458289146423, + "learning_rate": 9.186200570729719e-05, + "loss": 3.9315, + "step": 9590 + }, + { + "epoch": 0.6519228155999457, + "grad_norm": 0.19759464263916016, + "learning_rate": 9.185775920641392e-05, + "loss": 3.9934, + "step": 9595 + }, + { + "epoch": 0.6522625356706074, + "grad_norm": 0.4384717047214508, + "learning_rate": 9.185351270553065e-05, + "loss": 3.932, + "step": 9600 + }, + { + "epoch": 0.6526022557412692, + "grad_norm": 0.17127420008182526, + "learning_rate": 9.184926620464738e-05, + "loss": 4.038, + "step": 9605 + }, + { + "epoch": 0.6529419758119309, + "grad_norm": 0.1805383861064911, + "learning_rate": 9.18450197037641e-05, + "loss": 4.077, + "step": 9610 + }, + { + "epoch": 0.6532816958825928, + "grad_norm": 0.17218123376369476, + "learning_rate": 9.184077320288082e-05, + "loss": 4.0027, + "step": 9615 + }, + { + "epoch": 0.6536214159532545, + "grad_norm": 0.21027123928070068, + "learning_rate": 9.183652670199756e-05, + "loss": 4.004, + "step": 9620 + }, + { + "epoch": 0.6539611360239163, + "grad_norm": 0.18752577900886536, + "learning_rate": 9.183228020111429e-05, + "loss": 4.1132, + "step": 9625 + }, + { + "epoch": 0.6543008560945781, + "grad_norm": 0.2274181991815567, + "learning_rate": 9.1828033700231e-05, + "loss": 4.0189, + "step": 9630 + }, + { + "epoch": 0.6546405761652399, + "grad_norm": 0.5568689107894897, + "learning_rate": 9.182378719934774e-05, + "loss": 3.9176, + "step": 9635 + }, + { + "epoch": 0.6549802962359016, + "grad_norm": 0.20141251385211945, + "learning_rate": 9.181954069846447e-05, + "loss": 4.1433, + "step": 9640 + }, + { + "epoch": 0.6553200163065634, + "grad_norm": 0.1734774261713028, + "learning_rate": 9.181529419758119e-05, + "loss": 3.8714, + "step": 9645 + }, + { + "epoch": 0.6556597363772252, + "grad_norm": 0.20372764766216278, + "learning_rate": 9.181104769669793e-05, + "loss": 3.6352, + "step": 9650 + }, + { + "epoch": 0.6559994564478869, + "grad_norm": 0.22679565846920013, + "learning_rate": 9.180680119581466e-05, + "loss": 4.0213, + "step": 9655 + }, + { + "epoch": 0.6563391765185487, + "grad_norm": 0.19201093912124634, + "learning_rate": 9.180255469493138e-05, + "loss": 4.3365, + "step": 9660 + }, + { + "epoch": 0.6566788965892105, + "grad_norm": 0.23552648723125458, + "learning_rate": 9.179830819404811e-05, + "loss": 3.827, + "step": 9665 + }, + { + "epoch": 0.6570186166598723, + "grad_norm": 0.2363472580909729, + "learning_rate": 9.179406169316484e-05, + "loss": 3.5363, + "step": 9670 + }, + { + "epoch": 0.657358336730534, + "grad_norm": 0.1936025321483612, + "learning_rate": 9.178981519228157e-05, + "loss": 3.6846, + "step": 9675 + }, + { + "epoch": 0.6576980568011959, + "grad_norm": 0.16396436095237732, + "learning_rate": 9.17855686913983e-05, + "loss": 4.2385, + "step": 9680 + }, + { + "epoch": 0.6580377768718576, + "grad_norm": 0.19219662249088287, + "learning_rate": 9.178132219051501e-05, + "loss": 3.8246, + "step": 9685 + }, + { + "epoch": 0.6583774969425193, + "grad_norm": 0.3243776857852936, + "learning_rate": 9.177707568963175e-05, + "loss": 3.9252, + "step": 9690 + }, + { + "epoch": 0.6587172170131811, + "grad_norm": 0.2118336260318756, + "learning_rate": 9.177282918874848e-05, + "loss": 3.9518, + "step": 9695 + }, + { + "epoch": 0.6590569370838429, + "grad_norm": 0.2181885987520218, + "learning_rate": 9.17685826878652e-05, + "loss": 3.8817, + "step": 9700 + }, + { + "epoch": 0.6593966571545047, + "grad_norm": 0.2063203752040863, + "learning_rate": 9.176433618698194e-05, + "loss": 4.0715, + "step": 9705 + }, + { + "epoch": 0.6597363772251664, + "grad_norm": 0.16221602261066437, + "learning_rate": 9.176008968609867e-05, + "loss": 3.8972, + "step": 9710 + }, + { + "epoch": 0.6600760972958283, + "grad_norm": 0.2043311893939972, + "learning_rate": 9.175584318521538e-05, + "loss": 3.9982, + "step": 9715 + }, + { + "epoch": 0.66041581736649, + "grad_norm": 0.15358102321624756, + "learning_rate": 9.175159668433212e-05, + "loss": 4.4382, + "step": 9720 + }, + { + "epoch": 0.6607555374371518, + "grad_norm": 0.1804971694946289, + "learning_rate": 9.174735018344885e-05, + "loss": 4.0575, + "step": 9725 + }, + { + "epoch": 0.6610952575078136, + "grad_norm": 0.8969916701316833, + "learning_rate": 9.174310368256556e-05, + "loss": 3.8002, + "step": 9730 + }, + { + "epoch": 0.6614349775784754, + "grad_norm": 0.17219315469264984, + "learning_rate": 9.17388571816823e-05, + "loss": 3.8753, + "step": 9735 + }, + { + "epoch": 0.6617746976491371, + "grad_norm": 1.4561220407485962, + "learning_rate": 9.173461068079903e-05, + "loss": 3.8808, + "step": 9740 + }, + { + "epoch": 0.6621144177197988, + "grad_norm": 0.3004949688911438, + "learning_rate": 9.173036417991575e-05, + "loss": 4.01, + "step": 9745 + }, + { + "epoch": 0.6624541377904607, + "grad_norm": 0.4193626642227173, + "learning_rate": 9.172611767903249e-05, + "loss": 4.1539, + "step": 9750 + }, + { + "epoch": 0.6627938578611224, + "grad_norm": 0.5232547521591187, + "learning_rate": 9.17218711781492e-05, + "loss": 4.1406, + "step": 9755 + }, + { + "epoch": 0.6631335779317842, + "grad_norm": 0.2067098319530487, + "learning_rate": 9.171762467726593e-05, + "loss": 3.9981, + "step": 9760 + }, + { + "epoch": 0.663473298002446, + "grad_norm": 0.2100781351327896, + "learning_rate": 9.171337817638267e-05, + "loss": 3.7371, + "step": 9765 + }, + { + "epoch": 0.6638130180731078, + "grad_norm": 0.28894108533859253, + "learning_rate": 9.170913167549939e-05, + "loss": 4.1151, + "step": 9770 + }, + { + "epoch": 0.6641527381437695, + "grad_norm": 0.2366105318069458, + "learning_rate": 9.170488517461612e-05, + "loss": 3.7963, + "step": 9775 + }, + { + "epoch": 0.6644924582144313, + "grad_norm": 0.15720832347869873, + "learning_rate": 9.170063867373286e-05, + "loss": 4.1177, + "step": 9780 + }, + { + "epoch": 0.6648321782850931, + "grad_norm": 0.16979362070560455, + "learning_rate": 9.169639217284957e-05, + "loss": 4.1974, + "step": 9785 + }, + { + "epoch": 0.6651718983557549, + "grad_norm": 0.1718185693025589, + "learning_rate": 9.16921456719663e-05, + "loss": 3.8146, + "step": 9790 + }, + { + "epoch": 0.6655116184264166, + "grad_norm": 0.1998133510351181, + "learning_rate": 9.168789917108304e-05, + "loss": 3.8062, + "step": 9795 + }, + { + "epoch": 0.6658513384970784, + "grad_norm": 0.19915224611759186, + "learning_rate": 9.168365267019976e-05, + "loss": 3.8217, + "step": 9800 + }, + { + "epoch": 0.6661910585677402, + "grad_norm": 0.19672515988349915, + "learning_rate": 9.167940616931648e-05, + "loss": 3.9699, + "step": 9805 + }, + { + "epoch": 0.6665307786384019, + "grad_norm": 0.18894067406654358, + "learning_rate": 9.167515966843323e-05, + "loss": 3.9818, + "step": 9810 + }, + { + "epoch": 0.6668704987090638, + "grad_norm": 0.2761344611644745, + "learning_rate": 9.167091316754994e-05, + "loss": 4.0251, + "step": 9815 + }, + { + "epoch": 0.6672102187797255, + "grad_norm": 0.20272742211818695, + "learning_rate": 9.166666666666667e-05, + "loss": 4.1925, + "step": 9820 + }, + { + "epoch": 0.6675499388503873, + "grad_norm": 1.1285771131515503, + "learning_rate": 9.166242016578341e-05, + "loss": 4.1825, + "step": 9825 + }, + { + "epoch": 0.667889658921049, + "grad_norm": 0.1726224571466446, + "learning_rate": 9.165817366490012e-05, + "loss": 3.9708, + "step": 9830 + }, + { + "epoch": 0.6682293789917109, + "grad_norm": 0.1987651288509369, + "learning_rate": 9.165392716401685e-05, + "loss": 4.1101, + "step": 9835 + }, + { + "epoch": 0.6685690990623726, + "grad_norm": 0.19084063172340393, + "learning_rate": 9.164968066313358e-05, + "loss": 3.9633, + "step": 9840 + }, + { + "epoch": 0.6689088191330343, + "grad_norm": 0.20953474938869476, + "learning_rate": 9.164543416225031e-05, + "loss": 4.0714, + "step": 9845 + }, + { + "epoch": 0.6692485392036962, + "grad_norm": 0.1689091920852661, + "learning_rate": 9.164118766136704e-05, + "loss": 4.0451, + "step": 9850 + }, + { + "epoch": 0.6695882592743579, + "grad_norm": 0.38470450043678284, + "learning_rate": 9.163694116048376e-05, + "loss": 4.2373, + "step": 9855 + }, + { + "epoch": 0.6699279793450197, + "grad_norm": 0.18749617040157318, + "learning_rate": 9.163269465960049e-05, + "loss": 4.0489, + "step": 9860 + }, + { + "epoch": 0.6702676994156814, + "grad_norm": 0.1558235138654709, + "learning_rate": 9.162844815871722e-05, + "loss": 4.0883, + "step": 9865 + }, + { + "epoch": 0.6706074194863433, + "grad_norm": 0.17656366527080536, + "learning_rate": 9.162420165783395e-05, + "loss": 3.8735, + "step": 9870 + }, + { + "epoch": 0.670947139557005, + "grad_norm": 0.16513271629810333, + "learning_rate": 9.161995515695068e-05, + "loss": 4.3651, + "step": 9875 + }, + { + "epoch": 0.6712868596276668, + "grad_norm": 0.369552880525589, + "learning_rate": 9.16157086560674e-05, + "loss": 4.1642, + "step": 9880 + }, + { + "epoch": 0.6716265796983286, + "grad_norm": 0.19037148356437683, + "learning_rate": 9.161146215518413e-05, + "loss": 4.0021, + "step": 9885 + }, + { + "epoch": 0.6719662997689904, + "grad_norm": 0.1722508817911148, + "learning_rate": 9.160721565430086e-05, + "loss": 3.8246, + "step": 9890 + }, + { + "epoch": 0.6723060198396521, + "grad_norm": 0.16816957294940948, + "learning_rate": 9.160296915341759e-05, + "loss": 3.7951, + "step": 9895 + }, + { + "epoch": 0.672645739910314, + "grad_norm": 0.18479801714420319, + "learning_rate": 9.159872265253432e-05, + "loss": 4.0533, + "step": 9900 + }, + { + "epoch": 0.6729854599809757, + "grad_norm": 0.1568441092967987, + "learning_rate": 9.159447615165104e-05, + "loss": 3.7568, + "step": 9905 + }, + { + "epoch": 0.6733251800516374, + "grad_norm": 0.15381227433681488, + "learning_rate": 9.159022965076777e-05, + "loss": 4.0221, + "step": 9910 + }, + { + "epoch": 0.6736649001222992, + "grad_norm": 0.22633537650108337, + "learning_rate": 9.15859831498845e-05, + "loss": 4.0213, + "step": 9915 + }, + { + "epoch": 0.674004620192961, + "grad_norm": 0.22000354528427124, + "learning_rate": 9.158173664900123e-05, + "loss": 3.9787, + "step": 9920 + }, + { + "epoch": 0.6743443402636228, + "grad_norm": 0.18316060304641724, + "learning_rate": 9.157749014811796e-05, + "loss": 3.939, + "step": 9925 + }, + { + "epoch": 0.6746840603342845, + "grad_norm": 0.3888174891471863, + "learning_rate": 9.157324364723468e-05, + "loss": 3.9585, + "step": 9930 + }, + { + "epoch": 0.6750237804049464, + "grad_norm": 0.24031803011894226, + "learning_rate": 9.156899714635141e-05, + "loss": 3.9616, + "step": 9935 + }, + { + "epoch": 0.6753635004756081, + "grad_norm": 0.22554604709148407, + "learning_rate": 9.156475064546814e-05, + "loss": 4.13, + "step": 9940 + }, + { + "epoch": 0.6757032205462699, + "grad_norm": 0.15670569241046906, + "learning_rate": 9.156050414458487e-05, + "loss": 4.0036, + "step": 9945 + }, + { + "epoch": 0.6760429406169316, + "grad_norm": 0.18224500119686127, + "learning_rate": 9.15562576437016e-05, + "loss": 4.1177, + "step": 9950 + }, + { + "epoch": 0.6763826606875935, + "grad_norm": 0.46049654483795166, + "learning_rate": 9.155201114281831e-05, + "loss": 3.8908, + "step": 9955 + }, + { + "epoch": 0.6767223807582552, + "grad_norm": 0.1925237625837326, + "learning_rate": 9.154776464193505e-05, + "loss": 4.0503, + "step": 9960 + }, + { + "epoch": 0.6770621008289169, + "grad_norm": 0.2237987518310547, + "learning_rate": 9.154351814105178e-05, + "loss": 4.0872, + "step": 9965 + }, + { + "epoch": 0.6774018208995788, + "grad_norm": 0.5446917414665222, + "learning_rate": 9.15392716401685e-05, + "loss": 3.9465, + "step": 9970 + }, + { + "epoch": 0.6777415409702405, + "grad_norm": 0.19259484112262726, + "learning_rate": 9.153502513928524e-05, + "loss": 4.0308, + "step": 9975 + }, + { + "epoch": 0.6780812610409023, + "grad_norm": 0.16928361356258392, + "learning_rate": 9.153077863840196e-05, + "loss": 3.8983, + "step": 9980 + }, + { + "epoch": 0.6784209811115641, + "grad_norm": 0.20402824878692627, + "learning_rate": 9.152653213751868e-05, + "loss": 4.0315, + "step": 9985 + }, + { + "epoch": 0.6787607011822259, + "grad_norm": 0.18891318142414093, + "learning_rate": 9.152228563663542e-05, + "loss": 3.9693, + "step": 9990 + }, + { + "epoch": 0.6791004212528876, + "grad_norm": 0.21101170778274536, + "learning_rate": 9.151803913575215e-05, + "loss": 3.8452, + "step": 9995 + }, + { + "epoch": 0.6794401413235494, + "grad_norm": 0.20866774022579193, + "learning_rate": 9.151379263486888e-05, + "loss": 3.7796, + "step": 10000 + }, + { + "epoch": 0.6797798613942112, + "grad_norm": 0.22108693420886993, + "learning_rate": 9.15095461339856e-05, + "loss": 4.1394, + "step": 10005 + }, + { + "epoch": 0.680119581464873, + "grad_norm": 0.18283884227275848, + "learning_rate": 9.150529963310233e-05, + "loss": 4.0837, + "step": 10010 + }, + { + "epoch": 0.6804593015355347, + "grad_norm": 0.3496660888195038, + "learning_rate": 9.150105313221906e-05, + "loss": 3.9772, + "step": 10015 + }, + { + "epoch": 0.6807990216061965, + "grad_norm": 0.21944838762283325, + "learning_rate": 9.149680663133579e-05, + "loss": 3.9497, + "step": 10020 + }, + { + "epoch": 0.6811387416768583, + "grad_norm": 0.9018047451972961, + "learning_rate": 9.149256013045252e-05, + "loss": 4.1269, + "step": 10025 + }, + { + "epoch": 0.68147846174752, + "grad_norm": 0.1882868856191635, + "learning_rate": 9.148831362956924e-05, + "loss": 3.9114, + "step": 10030 + }, + { + "epoch": 0.6818181818181818, + "grad_norm": 0.9261885285377502, + "learning_rate": 9.148406712868597e-05, + "loss": 4.1938, + "step": 10035 + }, + { + "epoch": 0.6821579018888436, + "grad_norm": 0.20487643778324127, + "learning_rate": 9.147982062780269e-05, + "loss": 4.029, + "step": 10040 + }, + { + "epoch": 0.6824976219595054, + "grad_norm": 0.20628653466701508, + "learning_rate": 9.147557412691943e-05, + "loss": 3.9453, + "step": 10045 + }, + { + "epoch": 0.6828373420301671, + "grad_norm": 0.18048962950706482, + "learning_rate": 9.147132762603616e-05, + "loss": 3.7238, + "step": 10050 + }, + { + "epoch": 0.683177062100829, + "grad_norm": 0.20143994688987732, + "learning_rate": 9.146708112515287e-05, + "loss": 4.0817, + "step": 10055 + }, + { + "epoch": 0.6835167821714907, + "grad_norm": 0.1772017776966095, + "learning_rate": 9.146283462426961e-05, + "loss": 4.3308, + "step": 10060 + }, + { + "epoch": 0.6838565022421524, + "grad_norm": 0.17534330487251282, + "learning_rate": 9.145858812338634e-05, + "loss": 3.9769, + "step": 10065 + }, + { + "epoch": 0.6841962223128143, + "grad_norm": 0.3508715331554413, + "learning_rate": 9.145434162250305e-05, + "loss": 4.1162, + "step": 10070 + }, + { + "epoch": 0.684535942383476, + "grad_norm": 0.22710181772708893, + "learning_rate": 9.14500951216198e-05, + "loss": 3.7313, + "step": 10075 + }, + { + "epoch": 0.6848756624541378, + "grad_norm": 0.1621914505958557, + "learning_rate": 9.144584862073652e-05, + "loss": 4.0611, + "step": 10080 + }, + { + "epoch": 0.6852153825247995, + "grad_norm": 0.26334255933761597, + "learning_rate": 9.144160211985324e-05, + "loss": 4.0556, + "step": 10085 + }, + { + "epoch": 0.6855551025954614, + "grad_norm": 0.6280850768089294, + "learning_rate": 9.143735561896998e-05, + "loss": 4.0588, + "step": 10090 + }, + { + "epoch": 0.6858948226661231, + "grad_norm": 0.3040946424007416, + "learning_rate": 9.143310911808671e-05, + "loss": 3.9212, + "step": 10095 + }, + { + "epoch": 0.6862345427367849, + "grad_norm": 0.30600547790527344, + "learning_rate": 9.142886261720342e-05, + "loss": 4.0689, + "step": 10100 + }, + { + "epoch": 0.6865742628074467, + "grad_norm": 0.18050602078437805, + "learning_rate": 9.142461611632016e-05, + "loss": 3.9464, + "step": 10105 + }, + { + "epoch": 0.6869139828781085, + "grad_norm": 0.48490941524505615, + "learning_rate": 9.142036961543688e-05, + "loss": 4.025, + "step": 10110 + }, + { + "epoch": 0.6872537029487702, + "grad_norm": 0.24022653698921204, + "learning_rate": 9.14161231145536e-05, + "loss": 4.0635, + "step": 10115 + }, + { + "epoch": 0.6875934230194319, + "grad_norm": 0.5546215772628784, + "learning_rate": 9.141187661367035e-05, + "loss": 3.9159, + "step": 10120 + }, + { + "epoch": 0.6879331430900938, + "grad_norm": 0.1747526228427887, + "learning_rate": 9.140763011278706e-05, + "loss": 4.0271, + "step": 10125 + }, + { + "epoch": 0.6882728631607555, + "grad_norm": 0.8281142115592957, + "learning_rate": 9.140338361190379e-05, + "loss": 3.8248, + "step": 10130 + }, + { + "epoch": 0.6886125832314173, + "grad_norm": 0.19988751411437988, + "learning_rate": 9.139913711102053e-05, + "loss": 4.022, + "step": 10135 + }, + { + "epoch": 0.6889523033020791, + "grad_norm": 0.5465024709701538, + "learning_rate": 9.139489061013725e-05, + "loss": 3.9433, + "step": 10140 + }, + { + "epoch": 0.6892920233727409, + "grad_norm": 0.704890787601471, + "learning_rate": 9.139064410925397e-05, + "loss": 4.2021, + "step": 10145 + }, + { + "epoch": 0.6896317434434026, + "grad_norm": 0.1683177798986435, + "learning_rate": 9.138639760837072e-05, + "loss": 4.0005, + "step": 10150 + }, + { + "epoch": 0.6899714635140645, + "grad_norm": 0.2499643862247467, + "learning_rate": 9.138215110748743e-05, + "loss": 4.0452, + "step": 10155 + }, + { + "epoch": 0.6903111835847262, + "grad_norm": 0.20261327922344208, + "learning_rate": 9.137790460660416e-05, + "loss": 3.9332, + "step": 10160 + }, + { + "epoch": 0.690650903655388, + "grad_norm": 0.19001875817775726, + "learning_rate": 9.13736581057209e-05, + "loss": 3.9008, + "step": 10165 + }, + { + "epoch": 0.6909906237260497, + "grad_norm": 0.18063665926456451, + "learning_rate": 9.136941160483761e-05, + "loss": 3.8843, + "step": 10170 + }, + { + "epoch": 0.6913303437967115, + "grad_norm": 0.1968679428100586, + "learning_rate": 9.136516510395434e-05, + "loss": 4.0298, + "step": 10175 + }, + { + "epoch": 0.6916700638673733, + "grad_norm": 0.21212510764598846, + "learning_rate": 9.136091860307107e-05, + "loss": 4.1819, + "step": 10180 + }, + { + "epoch": 0.692009783938035, + "grad_norm": 0.2317853420972824, + "learning_rate": 9.13566721021878e-05, + "loss": 3.8627, + "step": 10185 + }, + { + "epoch": 0.6923495040086969, + "grad_norm": 0.17582780122756958, + "learning_rate": 9.135242560130453e-05, + "loss": 4.044, + "step": 10190 + }, + { + "epoch": 0.6926892240793586, + "grad_norm": 0.1663304716348648, + "learning_rate": 9.134817910042125e-05, + "loss": 3.9946, + "step": 10195 + }, + { + "epoch": 0.6930289441500204, + "grad_norm": 0.2554979920387268, + "learning_rate": 9.134393259953798e-05, + "loss": 3.9845, + "step": 10200 + }, + { + "epoch": 0.6933686642206821, + "grad_norm": 0.19808262586593628, + "learning_rate": 9.133968609865471e-05, + "loss": 3.7411, + "step": 10205 + }, + { + "epoch": 0.693708384291344, + "grad_norm": 0.16207602620124817, + "learning_rate": 9.133543959777144e-05, + "loss": 3.9169, + "step": 10210 + }, + { + "epoch": 0.6940481043620057, + "grad_norm": 0.19255559146404266, + "learning_rate": 9.133119309688817e-05, + "loss": 4.0015, + "step": 10215 + }, + { + "epoch": 0.6943878244326674, + "grad_norm": 0.40374597907066345, + "learning_rate": 9.13269465960049e-05, + "loss": 4.0568, + "step": 10220 + }, + { + "epoch": 0.6947275445033293, + "grad_norm": 0.5542440414428711, + "learning_rate": 9.132270009512162e-05, + "loss": 4.0383, + "step": 10225 + }, + { + "epoch": 0.695067264573991, + "grad_norm": 0.2855238914489746, + "learning_rate": 9.131845359423835e-05, + "loss": 3.957, + "step": 10230 + }, + { + "epoch": 0.6954069846446528, + "grad_norm": 0.30983904004096985, + "learning_rate": 9.131420709335508e-05, + "loss": 3.9347, + "step": 10235 + }, + { + "epoch": 0.6957467047153146, + "grad_norm": 0.1820448637008667, + "learning_rate": 9.13099605924718e-05, + "loss": 4.0424, + "step": 10240 + }, + { + "epoch": 0.6960864247859764, + "grad_norm": 0.1688624769449234, + "learning_rate": 9.130571409158853e-05, + "loss": 4.1492, + "step": 10245 + }, + { + "epoch": 0.6964261448566381, + "grad_norm": 0.18944065272808075, + "learning_rate": 9.130146759070526e-05, + "loss": 4.0306, + "step": 10250 + }, + { + "epoch": 0.6967658649272999, + "grad_norm": 0.23642663657665253, + "learning_rate": 9.129722108982199e-05, + "loss": 3.9713, + "step": 10255 + }, + { + "epoch": 0.6971055849979617, + "grad_norm": 0.30841895937919617, + "learning_rate": 9.129297458893872e-05, + "loss": 4.0823, + "step": 10260 + }, + { + "epoch": 0.6974453050686235, + "grad_norm": 0.20149889588356018, + "learning_rate": 9.128872808805545e-05, + "loss": 3.9524, + "step": 10265 + }, + { + "epoch": 0.6977850251392852, + "grad_norm": 0.18833234906196594, + "learning_rate": 9.128448158717217e-05, + "loss": 4.1513, + "step": 10270 + }, + { + "epoch": 0.698124745209947, + "grad_norm": 0.1727929264307022, + "learning_rate": 9.12802350862889e-05, + "loss": 3.928, + "step": 10275 + }, + { + "epoch": 0.6984644652806088, + "grad_norm": 0.19599127769470215, + "learning_rate": 9.127598858540563e-05, + "loss": 3.9861, + "step": 10280 + }, + { + "epoch": 0.6988041853512705, + "grad_norm": 0.19696658849716187, + "learning_rate": 9.127174208452236e-05, + "loss": 3.9878, + "step": 10285 + }, + { + "epoch": 0.6991439054219323, + "grad_norm": 0.24747943878173828, + "learning_rate": 9.126749558363909e-05, + "loss": 3.9343, + "step": 10290 + }, + { + "epoch": 0.6994836254925941, + "grad_norm": 0.23848794400691986, + "learning_rate": 9.126324908275581e-05, + "loss": 3.7876, + "step": 10295 + }, + { + "epoch": 0.6998233455632559, + "grad_norm": 0.18184241652488708, + "learning_rate": 9.125900258187254e-05, + "loss": 4.003, + "step": 10300 + }, + { + "epoch": 0.7001630656339176, + "grad_norm": 0.21852192282676697, + "learning_rate": 9.125475608098927e-05, + "loss": 3.9115, + "step": 10305 + }, + { + "epoch": 0.7005027857045795, + "grad_norm": 0.18718090653419495, + "learning_rate": 9.125050958010599e-05, + "loss": 3.8846, + "step": 10310 + }, + { + "epoch": 0.7008425057752412, + "grad_norm": 0.4584771394729614, + "learning_rate": 9.124626307922273e-05, + "loss": 3.9954, + "step": 10315 + }, + { + "epoch": 0.701182225845903, + "grad_norm": 0.19998236000537872, + "learning_rate": 9.124201657833945e-05, + "loss": 3.9594, + "step": 10320 + }, + { + "epoch": 0.7015219459165648, + "grad_norm": 0.1826915442943573, + "learning_rate": 9.123777007745617e-05, + "loss": 3.8949, + "step": 10325 + }, + { + "epoch": 0.7018616659872265, + "grad_norm": 0.31630200147628784, + "learning_rate": 9.123352357657291e-05, + "loss": 4.0863, + "step": 10330 + }, + { + "epoch": 0.7022013860578883, + "grad_norm": 0.22190065681934357, + "learning_rate": 9.122927707568964e-05, + "loss": 3.9006, + "step": 10335 + }, + { + "epoch": 0.70254110612855, + "grad_norm": 0.14430834352970123, + "learning_rate": 9.122503057480637e-05, + "loss": 3.7831, + "step": 10340 + }, + { + "epoch": 0.7028808261992119, + "grad_norm": 0.23812465369701385, + "learning_rate": 9.12207840739231e-05, + "loss": 4.07, + "step": 10345 + }, + { + "epoch": 0.7032205462698736, + "grad_norm": 0.26677510142326355, + "learning_rate": 9.121653757303982e-05, + "loss": 4.0027, + "step": 10350 + }, + { + "epoch": 0.7035602663405354, + "grad_norm": 1.3501478433609009, + "learning_rate": 9.121229107215655e-05, + "loss": 3.987, + "step": 10355 + }, + { + "epoch": 0.7038999864111972, + "grad_norm": 0.20116499066352844, + "learning_rate": 9.120804457127328e-05, + "loss": 4.2462, + "step": 10360 + }, + { + "epoch": 0.704239706481859, + "grad_norm": 0.22384878993034363, + "learning_rate": 9.120379807039001e-05, + "loss": 3.7601, + "step": 10365 + }, + { + "epoch": 0.7045794265525207, + "grad_norm": 0.2325512170791626, + "learning_rate": 9.119955156950673e-05, + "loss": 4.0985, + "step": 10370 + }, + { + "epoch": 0.7049191466231824, + "grad_norm": 0.16118580102920532, + "learning_rate": 9.119530506862346e-05, + "loss": 4.0038, + "step": 10375 + }, + { + "epoch": 0.7052588666938443, + "grad_norm": 0.16809016466140747, + "learning_rate": 9.119105856774018e-05, + "loss": 4.0822, + "step": 10380 + }, + { + "epoch": 0.705598586764506, + "grad_norm": 0.29214194416999817, + "learning_rate": 9.118681206685692e-05, + "loss": 3.9853, + "step": 10385 + }, + { + "epoch": 0.7059383068351678, + "grad_norm": 0.17380909621715546, + "learning_rate": 9.118256556597365e-05, + "loss": 3.9326, + "step": 10390 + }, + { + "epoch": 0.7062780269058296, + "grad_norm": 0.19394385814666748, + "learning_rate": 9.117831906509036e-05, + "loss": 4.0416, + "step": 10395 + }, + { + "epoch": 0.7066177469764914, + "grad_norm": 0.16878750920295715, + "learning_rate": 9.11740725642071e-05, + "loss": 4.2154, + "step": 10400 + }, + { + "epoch": 0.7069574670471531, + "grad_norm": 0.17969490587711334, + "learning_rate": 9.116982606332383e-05, + "loss": 3.8792, + "step": 10405 + }, + { + "epoch": 0.707297187117815, + "grad_norm": 0.18729190528392792, + "learning_rate": 9.116557956244055e-05, + "loss": 4.0986, + "step": 10410 + }, + { + "epoch": 0.7076369071884767, + "grad_norm": 0.16963765025138855, + "learning_rate": 9.116133306155729e-05, + "loss": 4.062, + "step": 10415 + }, + { + "epoch": 0.7079766272591385, + "grad_norm": 0.2884555757045746, + "learning_rate": 9.115708656067402e-05, + "loss": 4.078, + "step": 10420 + }, + { + "epoch": 0.7083163473298002, + "grad_norm": 0.29540711641311646, + "learning_rate": 9.115284005979073e-05, + "loss": 4.0736, + "step": 10425 + }, + { + "epoch": 0.708656067400462, + "grad_norm": 0.41383740305900574, + "learning_rate": 9.114859355890747e-05, + "loss": 4.2034, + "step": 10430 + }, + { + "epoch": 0.7089957874711238, + "grad_norm": 0.17644955217838287, + "learning_rate": 9.11443470580242e-05, + "loss": 4.1137, + "step": 10435 + }, + { + "epoch": 0.7093355075417855, + "grad_norm": 0.1741337925195694, + "learning_rate": 9.114010055714091e-05, + "loss": 4.0893, + "step": 10440 + }, + { + "epoch": 0.7096752276124474, + "grad_norm": 0.2527826130390167, + "learning_rate": 9.113585405625766e-05, + "loss": 4.091, + "step": 10445 + }, + { + "epoch": 0.7100149476831091, + "grad_norm": 0.1837528795003891, + "learning_rate": 9.113160755537438e-05, + "loss": 4.0449, + "step": 10450 + }, + { + "epoch": 0.7103546677537709, + "grad_norm": 0.18858109414577484, + "learning_rate": 9.11273610544911e-05, + "loss": 4.0575, + "step": 10455 + }, + { + "epoch": 0.7106943878244326, + "grad_norm": 0.1998942494392395, + "learning_rate": 9.112311455360784e-05, + "loss": 3.9053, + "step": 10460 + }, + { + "epoch": 0.7110341078950945, + "grad_norm": 0.20997202396392822, + "learning_rate": 9.111886805272455e-05, + "loss": 3.8818, + "step": 10465 + }, + { + "epoch": 0.7113738279657562, + "grad_norm": 0.17449168860912323, + "learning_rate": 9.111462155184128e-05, + "loss": 4.0812, + "step": 10470 + }, + { + "epoch": 0.711713548036418, + "grad_norm": 0.20188404619693756, + "learning_rate": 9.111037505095802e-05, + "loss": 3.918, + "step": 10475 + }, + { + "epoch": 0.7120532681070798, + "grad_norm": 0.16575513780117035, + "learning_rate": 9.110612855007474e-05, + "loss": 4.053, + "step": 10480 + }, + { + "epoch": 0.7123929881777415, + "grad_norm": 0.20615115761756897, + "learning_rate": 9.110188204919147e-05, + "loss": 3.7282, + "step": 10485 + }, + { + "epoch": 0.7127327082484033, + "grad_norm": 0.17066192626953125, + "learning_rate": 9.109763554830821e-05, + "loss": 4.0129, + "step": 10490 + }, + { + "epoch": 0.7130724283190651, + "grad_norm": 0.2495145946741104, + "learning_rate": 9.109338904742492e-05, + "loss": 3.9959, + "step": 10495 + }, + { + "epoch": 0.7134121483897269, + "grad_norm": 0.19648021459579468, + "learning_rate": 9.108914254654165e-05, + "loss": 4.0106, + "step": 10500 + }, + { + "epoch": 0.7137518684603886, + "grad_norm": 0.20270267128944397, + "learning_rate": 9.108489604565839e-05, + "loss": 4.0734, + "step": 10505 + }, + { + "epoch": 0.7140915885310504, + "grad_norm": 0.1632177233695984, + "learning_rate": 9.10806495447751e-05, + "loss": 4.2488, + "step": 10510 + }, + { + "epoch": 0.7144313086017122, + "grad_norm": 0.1604064404964447, + "learning_rate": 9.107640304389183e-05, + "loss": 3.8896, + "step": 10515 + }, + { + "epoch": 0.714771028672374, + "grad_norm": 0.21193253993988037, + "learning_rate": 9.107215654300858e-05, + "loss": 3.9983, + "step": 10520 + }, + { + "epoch": 0.7151107487430357, + "grad_norm": 0.3716839551925659, + "learning_rate": 9.106791004212529e-05, + "loss": 3.9367, + "step": 10525 + }, + { + "epoch": 0.7154504688136976, + "grad_norm": 0.1587960124015808, + "learning_rate": 9.106366354124202e-05, + "loss": 3.7641, + "step": 10530 + }, + { + "epoch": 0.7157901888843593, + "grad_norm": 4.4356184005737305, + "learning_rate": 9.105941704035875e-05, + "loss": 4.0203, + "step": 10535 + }, + { + "epoch": 0.716129908955021, + "grad_norm": 0.2456178516149521, + "learning_rate": 9.105517053947547e-05, + "loss": 4.0491, + "step": 10540 + }, + { + "epoch": 0.7164696290256828, + "grad_norm": 0.5795451402664185, + "learning_rate": 9.10509240385922e-05, + "loss": 3.8471, + "step": 10545 + }, + { + "epoch": 0.7168093490963446, + "grad_norm": 0.20285823941230774, + "learning_rate": 9.104667753770893e-05, + "loss": 4.1314, + "step": 10550 + }, + { + "epoch": 0.7171490691670064, + "grad_norm": 0.16531841456890106, + "learning_rate": 9.104243103682566e-05, + "loss": 4.1928, + "step": 10555 + }, + { + "epoch": 0.7174887892376681, + "grad_norm": 0.1814994513988495, + "learning_rate": 9.103818453594239e-05, + "loss": 4.0319, + "step": 10560 + }, + { + "epoch": 0.71782850930833, + "grad_norm": 0.16023671627044678, + "learning_rate": 9.103393803505911e-05, + "loss": 4.1279, + "step": 10565 + }, + { + "epoch": 0.7181682293789917, + "grad_norm": 0.2011050581932068, + "learning_rate": 9.102969153417584e-05, + "loss": 4.0957, + "step": 10570 + }, + { + "epoch": 0.7185079494496535, + "grad_norm": 0.21786335110664368, + "learning_rate": 9.102544503329257e-05, + "loss": 3.9609, + "step": 10575 + }, + { + "epoch": 0.7188476695203153, + "grad_norm": 0.22446627914905548, + "learning_rate": 9.10211985324093e-05, + "loss": 3.9967, + "step": 10580 + }, + { + "epoch": 0.719187389590977, + "grad_norm": 0.301862508058548, + "learning_rate": 9.101695203152603e-05, + "loss": 3.7026, + "step": 10585 + }, + { + "epoch": 0.7195271096616388, + "grad_norm": 0.19402875006198883, + "learning_rate": 9.101270553064275e-05, + "loss": 4.1842, + "step": 10590 + }, + { + "epoch": 0.7198668297323005, + "grad_norm": 0.2339441329240799, + "learning_rate": 9.100845902975948e-05, + "loss": 3.8018, + "step": 10595 + }, + { + "epoch": 0.7202065498029624, + "grad_norm": 0.1565333753824234, + "learning_rate": 9.100421252887621e-05, + "loss": 3.8915, + "step": 10600 + }, + { + "epoch": 0.7205462698736241, + "grad_norm": 0.16925933957099915, + "learning_rate": 9.099996602799294e-05, + "loss": 4.1202, + "step": 10605 + }, + { + "epoch": 0.7208859899442859, + "grad_norm": 0.13388794660568237, + "learning_rate": 9.099571952710967e-05, + "loss": 3.9878, + "step": 10610 + }, + { + "epoch": 0.7212257100149477, + "grad_norm": 0.15455901622772217, + "learning_rate": 9.09914730262264e-05, + "loss": 3.9719, + "step": 10615 + }, + { + "epoch": 0.7215654300856095, + "grad_norm": 0.19704410433769226, + "learning_rate": 9.098722652534312e-05, + "loss": 3.9981, + "step": 10620 + }, + { + "epoch": 0.7219051501562712, + "grad_norm": 0.1932808756828308, + "learning_rate": 9.098298002445985e-05, + "loss": 4.0945, + "step": 10625 + }, + { + "epoch": 0.722244870226933, + "grad_norm": 0.2587969899177551, + "learning_rate": 9.097873352357658e-05, + "loss": 4.0047, + "step": 10630 + }, + { + "epoch": 0.7225845902975948, + "grad_norm": 0.24855893850326538, + "learning_rate": 9.09744870226933e-05, + "loss": 3.927, + "step": 10635 + }, + { + "epoch": 0.7229243103682566, + "grad_norm": 0.2058570235967636, + "learning_rate": 9.097024052181003e-05, + "loss": 4.0545, + "step": 10640 + }, + { + "epoch": 0.7232640304389183, + "grad_norm": 0.20533056557178497, + "learning_rate": 9.096599402092676e-05, + "loss": 4.0696, + "step": 10645 + }, + { + "epoch": 0.7236037505095801, + "grad_norm": 0.2629019320011139, + "learning_rate": 9.096174752004349e-05, + "loss": 3.9769, + "step": 10650 + }, + { + "epoch": 0.7239434705802419, + "grad_norm": 0.2127770483493805, + "learning_rate": 9.095750101916022e-05, + "loss": 3.9245, + "step": 10655 + }, + { + "epoch": 0.7242831906509036, + "grad_norm": 0.1867874562740326, + "learning_rate": 9.095325451827695e-05, + "loss": 3.5818, + "step": 10660 + }, + { + "epoch": 0.7246229107215655, + "grad_norm": 0.25175002217292786, + "learning_rate": 9.094900801739366e-05, + "loss": 4.0028, + "step": 10665 + }, + { + "epoch": 0.7249626307922272, + "grad_norm": 0.18207040429115295, + "learning_rate": 9.09447615165104e-05, + "loss": 3.9293, + "step": 10670 + }, + { + "epoch": 0.725302350862889, + "grad_norm": 0.2175348699092865, + "learning_rate": 9.094051501562713e-05, + "loss": 4.116, + "step": 10675 + }, + { + "epoch": 0.7256420709335507, + "grad_norm": 0.1736600250005722, + "learning_rate": 9.093626851474386e-05, + "loss": 4.0422, + "step": 10680 + }, + { + "epoch": 0.7259817910042126, + "grad_norm": 0.2036193609237671, + "learning_rate": 9.093202201386059e-05, + "loss": 4.0416, + "step": 10685 + }, + { + "epoch": 0.7263215110748743, + "grad_norm": 0.5849753618240356, + "learning_rate": 9.092777551297731e-05, + "loss": 3.9602, + "step": 10690 + }, + { + "epoch": 0.726661231145536, + "grad_norm": 0.15708011388778687, + "learning_rate": 9.092352901209404e-05, + "loss": 3.9746, + "step": 10695 + }, + { + "epoch": 0.7270009512161979, + "grad_norm": 0.21586155891418457, + "learning_rate": 9.091928251121077e-05, + "loss": 4.1214, + "step": 10700 + }, + { + "epoch": 0.7273406712868596, + "grad_norm": 0.19140325486660004, + "learning_rate": 9.09150360103275e-05, + "loss": 4.0866, + "step": 10705 + }, + { + "epoch": 0.7276803913575214, + "grad_norm": 0.6538243889808655, + "learning_rate": 9.091078950944423e-05, + "loss": 4.1316, + "step": 10710 + }, + { + "epoch": 0.7280201114281831, + "grad_norm": 0.1779155284166336, + "learning_rate": 9.090654300856095e-05, + "loss": 4.0354, + "step": 10715 + }, + { + "epoch": 0.728359831498845, + "grad_norm": 0.24357867240905762, + "learning_rate": 9.090229650767768e-05, + "loss": 3.7766, + "step": 10720 + }, + { + "epoch": 0.7286995515695067, + "grad_norm": 0.15026213228702545, + "learning_rate": 9.089805000679441e-05, + "loss": 3.8955, + "step": 10725 + }, + { + "epoch": 0.7290392716401685, + "grad_norm": 0.19349145889282227, + "learning_rate": 9.089380350591114e-05, + "loss": 3.7735, + "step": 10730 + }, + { + "epoch": 0.7293789917108303, + "grad_norm": 0.25646016001701355, + "learning_rate": 9.088955700502785e-05, + "loss": 4.0811, + "step": 10735 + }, + { + "epoch": 0.7297187117814921, + "grad_norm": 0.17351272702217102, + "learning_rate": 9.08853105041446e-05, + "loss": 4.1555, + "step": 10740 + }, + { + "epoch": 0.7300584318521538, + "grad_norm": 0.17100609838962555, + "learning_rate": 9.088106400326132e-05, + "loss": 4.0126, + "step": 10745 + }, + { + "epoch": 0.7303981519228157, + "grad_norm": 0.25536659359931946, + "learning_rate": 9.087681750237804e-05, + "loss": 4.0176, + "step": 10750 + }, + { + "epoch": 0.7307378719934774, + "grad_norm": 0.2601194977760315, + "learning_rate": 9.087257100149478e-05, + "loss": 4.0722, + "step": 10755 + }, + { + "epoch": 0.7310775920641391, + "grad_norm": 0.19794826209545135, + "learning_rate": 9.08683245006115e-05, + "loss": 4.2004, + "step": 10760 + }, + { + "epoch": 0.7314173121348009, + "grad_norm": 0.2230055183172226, + "learning_rate": 9.086407799972822e-05, + "loss": 3.9865, + "step": 10765 + }, + { + "epoch": 0.7317570322054627, + "grad_norm": 0.24480870366096497, + "learning_rate": 9.085983149884496e-05, + "loss": 3.7441, + "step": 10770 + }, + { + "epoch": 0.7320967522761245, + "grad_norm": 0.15868893265724182, + "learning_rate": 9.085558499796169e-05, + "loss": 4.0704, + "step": 10775 + }, + { + "epoch": 0.7324364723467862, + "grad_norm": 0.3648226857185364, + "learning_rate": 9.08513384970784e-05, + "loss": 4.0741, + "step": 10780 + }, + { + "epoch": 0.7327761924174481, + "grad_norm": 1.1779170036315918, + "learning_rate": 9.084709199619515e-05, + "loss": 4.0508, + "step": 10785 + }, + { + "epoch": 0.7331159124881098, + "grad_norm": 0.5466019511222839, + "learning_rate": 9.084284549531187e-05, + "loss": 3.7824, + "step": 10790 + }, + { + "epoch": 0.7334556325587716, + "grad_norm": 0.2697416841983795, + "learning_rate": 9.083859899442859e-05, + "loss": 3.8887, + "step": 10795 + }, + { + "epoch": 0.7337953526294333, + "grad_norm": 0.1877298504114151, + "learning_rate": 9.083435249354533e-05, + "loss": 4.0495, + "step": 10800 + }, + { + "epoch": 0.7341350727000951, + "grad_norm": 0.21535362303256989, + "learning_rate": 9.083010599266204e-05, + "loss": 4.1075, + "step": 10805 + }, + { + "epoch": 0.7344747927707569, + "grad_norm": 0.15432433784008026, + "learning_rate": 9.082585949177877e-05, + "loss": 4.1321, + "step": 10810 + }, + { + "epoch": 0.7348145128414186, + "grad_norm": 0.17613931000232697, + "learning_rate": 9.082161299089551e-05, + "loss": 4.0152, + "step": 10815 + }, + { + "epoch": 0.7351542329120805, + "grad_norm": 0.17943201959133148, + "learning_rate": 9.081736649001223e-05, + "loss": 4.0291, + "step": 10820 + }, + { + "epoch": 0.7354939529827422, + "grad_norm": 0.17358125746250153, + "learning_rate": 9.081311998912896e-05, + "loss": 4.0228, + "step": 10825 + }, + { + "epoch": 0.735833673053404, + "grad_norm": 0.16327157616615295, + "learning_rate": 9.08088734882457e-05, + "loss": 3.8393, + "step": 10830 + }, + { + "epoch": 0.7361733931240658, + "grad_norm": 0.38127797842025757, + "learning_rate": 9.080462698736241e-05, + "loss": 4.0771, + "step": 10835 + }, + { + "epoch": 0.7365131131947276, + "grad_norm": 0.17917278409004211, + "learning_rate": 9.080038048647914e-05, + "loss": 3.9258, + "step": 10840 + }, + { + "epoch": 0.7368528332653893, + "grad_norm": 0.1688838005065918, + "learning_rate": 9.079613398559588e-05, + "loss": 3.8387, + "step": 10845 + }, + { + "epoch": 0.737192553336051, + "grad_norm": 0.1823907047510147, + "learning_rate": 9.07918874847126e-05, + "loss": 3.9959, + "step": 10850 + }, + { + "epoch": 0.7375322734067129, + "grad_norm": 0.20357432961463928, + "learning_rate": 9.078764098382932e-05, + "loss": 3.99, + "step": 10855 + }, + { + "epoch": 0.7378719934773746, + "grad_norm": 0.18823125958442688, + "learning_rate": 9.078339448294607e-05, + "loss": 4.0908, + "step": 10860 + }, + { + "epoch": 0.7382117135480364, + "grad_norm": 0.16174191236495972, + "learning_rate": 9.077914798206278e-05, + "loss": 4.0641, + "step": 10865 + }, + { + "epoch": 0.7385514336186982, + "grad_norm": 0.1720336228609085, + "learning_rate": 9.077490148117951e-05, + "loss": 3.8454, + "step": 10870 + }, + { + "epoch": 0.73889115368936, + "grad_norm": 0.23603537678718567, + "learning_rate": 9.077065498029625e-05, + "loss": 3.9752, + "step": 10875 + }, + { + "epoch": 0.7392308737600217, + "grad_norm": 0.20614124834537506, + "learning_rate": 9.076640847941296e-05, + "loss": 4.045, + "step": 10880 + }, + { + "epoch": 0.7395705938306835, + "grad_norm": 0.30947062373161316, + "learning_rate": 9.076216197852969e-05, + "loss": 3.9429, + "step": 10885 + }, + { + "epoch": 0.7399103139013453, + "grad_norm": 0.2017713338136673, + "learning_rate": 9.075791547764642e-05, + "loss": 3.9205, + "step": 10890 + }, + { + "epoch": 0.7402500339720071, + "grad_norm": 1.3917611837387085, + "learning_rate": 9.075366897676315e-05, + "loss": 4.0456, + "step": 10895 + }, + { + "epoch": 0.7405897540426688, + "grad_norm": 0.4103597104549408, + "learning_rate": 9.074942247587988e-05, + "loss": 3.9107, + "step": 10900 + }, + { + "epoch": 0.7409294741133307, + "grad_norm": 0.5144510269165039, + "learning_rate": 9.07451759749966e-05, + "loss": 4.0178, + "step": 10905 + }, + { + "epoch": 0.7412691941839924, + "grad_norm": 0.16965581476688385, + "learning_rate": 9.074092947411333e-05, + "loss": 3.9729, + "step": 10910 + }, + { + "epoch": 0.7416089142546541, + "grad_norm": 0.38037505745887756, + "learning_rate": 9.073668297323006e-05, + "loss": 4.2043, + "step": 10915 + }, + { + "epoch": 0.741948634325316, + "grad_norm": 0.26255086064338684, + "learning_rate": 9.073243647234679e-05, + "loss": 3.8663, + "step": 10920 + }, + { + "epoch": 0.7422883543959777, + "grad_norm": 0.3262099623680115, + "learning_rate": 9.072818997146352e-05, + "loss": 4.0803, + "step": 10925 + }, + { + "epoch": 0.7426280744666395, + "grad_norm": 0.21773168444633484, + "learning_rate": 9.072394347058024e-05, + "loss": 3.9992, + "step": 10930 + }, + { + "epoch": 0.7429677945373012, + "grad_norm": 0.22857216000556946, + "learning_rate": 9.071969696969697e-05, + "loss": 3.8582, + "step": 10935 + }, + { + "epoch": 0.7433075146079631, + "grad_norm": 0.312259316444397, + "learning_rate": 9.07154504688137e-05, + "loss": 4.0318, + "step": 10940 + }, + { + "epoch": 0.7436472346786248, + "grad_norm": 0.1695690155029297, + "learning_rate": 9.071120396793043e-05, + "loss": 3.9077, + "step": 10945 + }, + { + "epoch": 0.7439869547492866, + "grad_norm": 0.29498061537742615, + "learning_rate": 9.070695746704716e-05, + "loss": 4.3087, + "step": 10950 + }, + { + "epoch": 0.7443266748199484, + "grad_norm": 0.24566805362701416, + "learning_rate": 9.070271096616388e-05, + "loss": 3.8372, + "step": 10955 + }, + { + "epoch": 0.7446663948906102, + "grad_norm": 0.163113072514534, + "learning_rate": 9.069846446528061e-05, + "loss": 4.0402, + "step": 10960 + }, + { + "epoch": 0.7450061149612719, + "grad_norm": 0.18011754751205444, + "learning_rate": 9.069421796439734e-05, + "loss": 4.1377, + "step": 10965 + }, + { + "epoch": 0.7453458350319336, + "grad_norm": 0.8807979822158813, + "learning_rate": 9.068997146351407e-05, + "loss": 3.8495, + "step": 10970 + }, + { + "epoch": 0.7456855551025955, + "grad_norm": 0.22865957021713257, + "learning_rate": 9.06857249626308e-05, + "loss": 4.1146, + "step": 10975 + }, + { + "epoch": 0.7460252751732572, + "grad_norm": 0.2118086814880371, + "learning_rate": 9.068147846174752e-05, + "loss": 3.8964, + "step": 10980 + }, + { + "epoch": 0.746364995243919, + "grad_norm": 0.18207617104053497, + "learning_rate": 9.067723196086425e-05, + "loss": 3.9706, + "step": 10985 + }, + { + "epoch": 0.7467047153145808, + "grad_norm": 0.16859905421733856, + "learning_rate": 9.067298545998098e-05, + "loss": 3.7855, + "step": 10990 + }, + { + "epoch": 0.7470444353852426, + "grad_norm": 0.16500097513198853, + "learning_rate": 9.066873895909771e-05, + "loss": 3.7684, + "step": 10995 + }, + { + "epoch": 0.7473841554559043, + "grad_norm": 0.1520179957151413, + "learning_rate": 9.066449245821444e-05, + "loss": 3.8084, + "step": 11000 + }, + { + "epoch": 0.7477238755265662, + "grad_norm": 0.21755331754684448, + "learning_rate": 9.066024595733115e-05, + "loss": 3.8863, + "step": 11005 + }, + { + "epoch": 0.7480635955972279, + "grad_norm": 0.20671890676021576, + "learning_rate": 9.065599945644789e-05, + "loss": 3.9006, + "step": 11010 + }, + { + "epoch": 0.7484033156678896, + "grad_norm": 0.16787393391132355, + "learning_rate": 9.065175295556462e-05, + "loss": 3.7611, + "step": 11015 + }, + { + "epoch": 0.7487430357385514, + "grad_norm": 0.22157283127307892, + "learning_rate": 9.064750645468135e-05, + "loss": 4.1048, + "step": 11020 + }, + { + "epoch": 0.7490827558092132, + "grad_norm": 0.22022277116775513, + "learning_rate": 9.064325995379808e-05, + "loss": 3.8746, + "step": 11025 + }, + { + "epoch": 0.749422475879875, + "grad_norm": 0.2435934692621231, + "learning_rate": 9.06390134529148e-05, + "loss": 3.9949, + "step": 11030 + }, + { + "epoch": 0.7497621959505367, + "grad_norm": 0.18187767267227173, + "learning_rate": 9.063476695203153e-05, + "loss": 4.0584, + "step": 11035 + }, + { + "epoch": 0.7501019160211986, + "grad_norm": 0.18477857112884521, + "learning_rate": 9.063052045114826e-05, + "loss": 4.1217, + "step": 11040 + }, + { + "epoch": 0.7504416360918603, + "grad_norm": 0.1471758335828781, + "learning_rate": 9.062627395026499e-05, + "loss": 4.0513, + "step": 11045 + }, + { + "epoch": 0.7507813561625221, + "grad_norm": 0.20632903277873993, + "learning_rate": 9.062202744938172e-05, + "loss": 3.9865, + "step": 11050 + }, + { + "epoch": 0.7511210762331838, + "grad_norm": 0.21105721592903137, + "learning_rate": 9.061778094849844e-05, + "loss": 3.8277, + "step": 11055 + }, + { + "epoch": 0.7514607963038457, + "grad_norm": 0.19280456006526947, + "learning_rate": 9.061353444761517e-05, + "loss": 4.0091, + "step": 11060 + }, + { + "epoch": 0.7518005163745074, + "grad_norm": 0.1918146163225174, + "learning_rate": 9.06092879467319e-05, + "loss": 4.0648, + "step": 11065 + }, + { + "epoch": 0.7521402364451691, + "grad_norm": 0.22963494062423706, + "learning_rate": 9.060504144584863e-05, + "loss": 3.9061, + "step": 11070 + }, + { + "epoch": 0.752479956515831, + "grad_norm": 0.16479997336864471, + "learning_rate": 9.060079494496536e-05, + "loss": 4.0303, + "step": 11075 + }, + { + "epoch": 0.7528196765864927, + "grad_norm": 0.18432816863059998, + "learning_rate": 9.059654844408208e-05, + "loss": 3.8773, + "step": 11080 + }, + { + "epoch": 0.7531593966571545, + "grad_norm": 0.22336050868034363, + "learning_rate": 9.059230194319881e-05, + "loss": 3.7997, + "step": 11085 + }, + { + "epoch": 0.7534991167278163, + "grad_norm": 0.242068812251091, + "learning_rate": 9.058805544231553e-05, + "loss": 3.9268, + "step": 11090 + }, + { + "epoch": 0.7538388367984781, + "grad_norm": 0.14753904938697815, + "learning_rate": 9.058380894143227e-05, + "loss": 4.0985, + "step": 11095 + }, + { + "epoch": 0.7541785568691398, + "grad_norm": 0.19245490431785583, + "learning_rate": 9.0579562440549e-05, + "loss": 3.8995, + "step": 11100 + }, + { + "epoch": 0.7545182769398016, + "grad_norm": 0.18615277111530304, + "learning_rate": 9.057531593966571e-05, + "loss": 4.0072, + "step": 11105 + }, + { + "epoch": 0.7548579970104634, + "grad_norm": 0.19581812620162964, + "learning_rate": 9.057106943878245e-05, + "loss": 3.8758, + "step": 11110 + }, + { + "epoch": 0.7551977170811252, + "grad_norm": 0.15949614346027374, + "learning_rate": 9.056682293789918e-05, + "loss": 3.9473, + "step": 11115 + }, + { + "epoch": 0.7555374371517869, + "grad_norm": 1.359558343887329, + "learning_rate": 9.05625764370159e-05, + "loss": 3.9416, + "step": 11120 + }, + { + "epoch": 0.7558771572224487, + "grad_norm": 0.1593676656484604, + "learning_rate": 9.055832993613264e-05, + "loss": 4.0518, + "step": 11125 + }, + { + "epoch": 0.7562168772931105, + "grad_norm": 0.1715662181377411, + "learning_rate": 9.055408343524937e-05, + "loss": 3.982, + "step": 11130 + }, + { + "epoch": 0.7565565973637722, + "grad_norm": 0.1934783011674881, + "learning_rate": 9.054983693436608e-05, + "loss": 4.1628, + "step": 11135 + }, + { + "epoch": 0.756896317434434, + "grad_norm": 0.16365660727024078, + "learning_rate": 9.054559043348282e-05, + "loss": 4.0758, + "step": 11140 + }, + { + "epoch": 0.7572360375050958, + "grad_norm": 0.2995030879974365, + "learning_rate": 9.054134393259955e-05, + "loss": 3.915, + "step": 11145 + }, + { + "epoch": 0.7575757575757576, + "grad_norm": 0.22450530529022217, + "learning_rate": 9.053709743171626e-05, + "loss": 4.2514, + "step": 11150 + }, + { + "epoch": 0.7579154776464193, + "grad_norm": 3.4520628452301025, + "learning_rate": 9.0532850930833e-05, + "loss": 4.0989, + "step": 11155 + }, + { + "epoch": 0.7582551977170812, + "grad_norm": 0.31930363178253174, + "learning_rate": 9.052860442994972e-05, + "loss": 4.0048, + "step": 11160 + }, + { + "epoch": 0.7585949177877429, + "grad_norm": 0.21884280443191528, + "learning_rate": 9.052435792906645e-05, + "loss": 3.8112, + "step": 11165 + }, + { + "epoch": 0.7589346378584046, + "grad_norm": 0.1630697101354599, + "learning_rate": 9.052011142818319e-05, + "loss": 3.9614, + "step": 11170 + }, + { + "epoch": 0.7592743579290665, + "grad_norm": 0.2021295428276062, + "learning_rate": 9.05158649272999e-05, + "loss": 3.9433, + "step": 11175 + }, + { + "epoch": 0.7596140779997282, + "grad_norm": 0.21674887835979462, + "learning_rate": 9.051161842641663e-05, + "loss": 3.9271, + "step": 11180 + }, + { + "epoch": 0.75995379807039, + "grad_norm": 0.2624286413192749, + "learning_rate": 9.050737192553337e-05, + "loss": 3.8687, + "step": 11185 + }, + { + "epoch": 0.7602935181410517, + "grad_norm": 0.15679609775543213, + "learning_rate": 9.050312542465009e-05, + "loss": 4.0386, + "step": 11190 + }, + { + "epoch": 0.7606332382117136, + "grad_norm": 0.298999547958374, + "learning_rate": 9.049887892376682e-05, + "loss": 4.039, + "step": 11195 + }, + { + "epoch": 0.7609729582823753, + "grad_norm": 0.1916528344154358, + "learning_rate": 9.049463242288356e-05, + "loss": 3.9924, + "step": 11200 + }, + { + "epoch": 0.7613126783530371, + "grad_norm": 0.20813806354999542, + "learning_rate": 9.049038592200027e-05, + "loss": 3.9728, + "step": 11205 + }, + { + "epoch": 0.7616523984236989, + "grad_norm": 0.26955127716064453, + "learning_rate": 9.0486139421117e-05, + "loss": 3.9563, + "step": 11210 + }, + { + "epoch": 0.7619921184943607, + "grad_norm": 0.22610753774642944, + "learning_rate": 9.048189292023374e-05, + "loss": 3.9385, + "step": 11215 + }, + { + "epoch": 0.7623318385650224, + "grad_norm": 0.1515149027109146, + "learning_rate": 9.047764641935046e-05, + "loss": 4.0009, + "step": 11220 + }, + { + "epoch": 0.7626715586356841, + "grad_norm": 0.15270265936851501, + "learning_rate": 9.047339991846718e-05, + "loss": 3.8132, + "step": 11225 + }, + { + "epoch": 0.763011278706346, + "grad_norm": 0.256085067987442, + "learning_rate": 9.046915341758391e-05, + "loss": 4.1206, + "step": 11230 + }, + { + "epoch": 0.7633509987770077, + "grad_norm": 0.19662001729011536, + "learning_rate": 9.046490691670064e-05, + "loss": 4.0799, + "step": 11235 + }, + { + "epoch": 0.7636907188476695, + "grad_norm": 0.18421690165996552, + "learning_rate": 9.046066041581737e-05, + "loss": 4.0328, + "step": 11240 + }, + { + "epoch": 0.7640304389183313, + "grad_norm": 0.19748954474925995, + "learning_rate": 9.04564139149341e-05, + "loss": 3.7995, + "step": 11245 + }, + { + "epoch": 0.7643701589889931, + "grad_norm": 0.15954630076885223, + "learning_rate": 9.045216741405082e-05, + "loss": 3.8286, + "step": 11250 + }, + { + "epoch": 0.7647098790596548, + "grad_norm": 0.5489984154701233, + "learning_rate": 9.044792091316755e-05, + "loss": 4.2052, + "step": 11255 + }, + { + "epoch": 0.7650495991303167, + "grad_norm": 0.44495344161987305, + "learning_rate": 9.044367441228428e-05, + "loss": 4.1036, + "step": 11260 + }, + { + "epoch": 0.7653893192009784, + "grad_norm": 0.16555814445018768, + "learning_rate": 9.043942791140101e-05, + "loss": 3.9737, + "step": 11265 + }, + { + "epoch": 0.7657290392716402, + "grad_norm": 0.21692036092281342, + "learning_rate": 9.043518141051774e-05, + "loss": 4.0437, + "step": 11270 + }, + { + "epoch": 0.7660687593423019, + "grad_norm": 1.8791022300720215, + "learning_rate": 9.043093490963446e-05, + "loss": 4.0974, + "step": 11275 + }, + { + "epoch": 0.7664084794129638, + "grad_norm": 0.19241303205490112, + "learning_rate": 9.042668840875119e-05, + "loss": 3.9209, + "step": 11280 + }, + { + "epoch": 0.7667481994836255, + "grad_norm": 0.17803336679935455, + "learning_rate": 9.042244190786792e-05, + "loss": 3.842, + "step": 11285 + }, + { + "epoch": 0.7670879195542872, + "grad_norm": 0.20129168033599854, + "learning_rate": 9.041819540698465e-05, + "loss": 4.0462, + "step": 11290 + }, + { + "epoch": 0.7674276396249491, + "grad_norm": 0.18283264338970184, + "learning_rate": 9.041394890610138e-05, + "loss": 3.9094, + "step": 11295 + }, + { + "epoch": 0.7677673596956108, + "grad_norm": 0.20721754431724548, + "learning_rate": 9.04097024052181e-05, + "loss": 3.9013, + "step": 11300 + }, + { + "epoch": 0.7681070797662726, + "grad_norm": 0.43089064955711365, + "learning_rate": 9.040545590433483e-05, + "loss": 3.964, + "step": 11305 + }, + { + "epoch": 0.7684467998369343, + "grad_norm": 0.24873760342597961, + "learning_rate": 9.040120940345156e-05, + "loss": 4.0062, + "step": 11310 + }, + { + "epoch": 0.7687865199075962, + "grad_norm": 0.22243714332580566, + "learning_rate": 9.039696290256829e-05, + "loss": 3.8801, + "step": 11315 + }, + { + "epoch": 0.7691262399782579, + "grad_norm": 0.41750073432922363, + "learning_rate": 9.039271640168502e-05, + "loss": 4.2682, + "step": 11320 + }, + { + "epoch": 0.7694659600489197, + "grad_norm": 0.369742214679718, + "learning_rate": 9.038846990080174e-05, + "loss": 3.8898, + "step": 11325 + }, + { + "epoch": 0.7698056801195815, + "grad_norm": 0.26420828700065613, + "learning_rate": 9.038422339991847e-05, + "loss": 3.9038, + "step": 11330 + }, + { + "epoch": 0.7701454001902432, + "grad_norm": 0.2597283720970154, + "learning_rate": 9.03799768990352e-05, + "loss": 3.9406, + "step": 11335 + }, + { + "epoch": 0.770485120260905, + "grad_norm": 0.17518769204616547, + "learning_rate": 9.037573039815193e-05, + "loss": 3.6741, + "step": 11340 + }, + { + "epoch": 0.7708248403315668, + "grad_norm": 0.6777191758155823, + "learning_rate": 9.037148389726866e-05, + "loss": 4.0673, + "step": 11345 + }, + { + "epoch": 0.7711645604022286, + "grad_norm": 0.201960951089859, + "learning_rate": 9.036723739638538e-05, + "loss": 4.0074, + "step": 11350 + }, + { + "epoch": 0.7715042804728903, + "grad_norm": 0.4381665587425232, + "learning_rate": 9.036299089550211e-05, + "loss": 3.8382, + "step": 11355 + }, + { + "epoch": 0.7718440005435521, + "grad_norm": 0.1966671347618103, + "learning_rate": 9.035874439461884e-05, + "loss": 4.1766, + "step": 11360 + }, + { + "epoch": 0.7721837206142139, + "grad_norm": 0.16876500844955444, + "learning_rate": 9.035449789373557e-05, + "loss": 3.9402, + "step": 11365 + }, + { + "epoch": 0.7725234406848757, + "grad_norm": 4.147640705108643, + "learning_rate": 9.03502513928523e-05, + "loss": 4.109, + "step": 11370 + }, + { + "epoch": 0.7728631607555374, + "grad_norm": 0.2072206437587738, + "learning_rate": 9.034600489196902e-05, + "loss": 4.0296, + "step": 11375 + }, + { + "epoch": 0.7732028808261993, + "grad_norm": 0.2016468346118927, + "learning_rate": 9.034175839108575e-05, + "loss": 3.7331, + "step": 11380 + }, + { + "epoch": 0.773542600896861, + "grad_norm": 0.47726747393608093, + "learning_rate": 9.033751189020248e-05, + "loss": 4.0178, + "step": 11385 + }, + { + "epoch": 0.7738823209675227, + "grad_norm": 0.17172425985336304, + "learning_rate": 9.033326538931921e-05, + "loss": 4.1782, + "step": 11390 + }, + { + "epoch": 0.7742220410381845, + "grad_norm": 0.1911281943321228, + "learning_rate": 9.032901888843594e-05, + "loss": 3.9972, + "step": 11395 + }, + { + "epoch": 0.7745617611088463, + "grad_norm": 0.14127899706363678, + "learning_rate": 9.032477238755266e-05, + "loss": 3.9405, + "step": 11400 + }, + { + "epoch": 0.7749014811795081, + "grad_norm": 0.1841440349817276, + "learning_rate": 9.032052588666939e-05, + "loss": 4.0964, + "step": 11405 + }, + { + "epoch": 0.7752412012501698, + "grad_norm": 0.17826926708221436, + "learning_rate": 9.031627938578612e-05, + "loss": 4.2658, + "step": 11410 + }, + { + "epoch": 0.7755809213208317, + "grad_norm": 0.2882947325706482, + "learning_rate": 9.031203288490285e-05, + "loss": 4.1508, + "step": 11415 + }, + { + "epoch": 0.7759206413914934, + "grad_norm": 0.21743571758270264, + "learning_rate": 9.030778638401958e-05, + "loss": 4.1659, + "step": 11420 + }, + { + "epoch": 0.7762603614621552, + "grad_norm": 0.23792824149131775, + "learning_rate": 9.03035398831363e-05, + "loss": 3.8291, + "step": 11425 + }, + { + "epoch": 0.776600081532817, + "grad_norm": 0.2318025827407837, + "learning_rate": 9.029929338225302e-05, + "loss": 3.8444, + "step": 11430 + }, + { + "epoch": 0.7769398016034788, + "grad_norm": 0.17949531972408295, + "learning_rate": 9.029504688136976e-05, + "loss": 3.9841, + "step": 11435 + }, + { + "epoch": 0.7772795216741405, + "grad_norm": 0.22435292601585388, + "learning_rate": 9.029080038048649e-05, + "loss": 3.9821, + "step": 11440 + }, + { + "epoch": 0.7776192417448022, + "grad_norm": 0.1865406632423401, + "learning_rate": 9.02865538796032e-05, + "loss": 3.935, + "step": 11445 + }, + { + "epoch": 0.7779589618154641, + "grad_norm": 0.2090293914079666, + "learning_rate": 9.028230737871994e-05, + "loss": 3.9991, + "step": 11450 + }, + { + "epoch": 0.7782986818861258, + "grad_norm": 0.18024842441082, + "learning_rate": 9.027806087783667e-05, + "loss": 4.076, + "step": 11455 + }, + { + "epoch": 0.7786384019567876, + "grad_norm": 0.17997018992900848, + "learning_rate": 9.027381437695339e-05, + "loss": 3.7585, + "step": 11460 + }, + { + "epoch": 0.7789781220274494, + "grad_norm": 0.16544857621192932, + "learning_rate": 9.026956787607013e-05, + "loss": 4.2184, + "step": 11465 + }, + { + "epoch": 0.7793178420981112, + "grad_norm": 0.17606359720230103, + "learning_rate": 9.026532137518686e-05, + "loss": 3.9412, + "step": 11470 + }, + { + "epoch": 0.7796575621687729, + "grad_norm": 0.2205812931060791, + "learning_rate": 9.026107487430357e-05, + "loss": 4.0843, + "step": 11475 + }, + { + "epoch": 0.7799972822394347, + "grad_norm": 0.25740867853164673, + "learning_rate": 9.025682837342031e-05, + "loss": 3.9544, + "step": 11480 + }, + { + "epoch": 0.7803370023100965, + "grad_norm": 0.14909543097019196, + "learning_rate": 9.025258187253704e-05, + "loss": 3.8342, + "step": 11485 + }, + { + "epoch": 0.7806767223807582, + "grad_norm": 0.24682089686393738, + "learning_rate": 9.024833537165375e-05, + "loss": 3.9944, + "step": 11490 + }, + { + "epoch": 0.78101644245142, + "grad_norm": 0.15707463026046753, + "learning_rate": 9.02440888707705e-05, + "loss": 3.9675, + "step": 11495 + }, + { + "epoch": 0.7813561625220818, + "grad_norm": 0.22718797624111176, + "learning_rate": 9.023984236988722e-05, + "loss": 3.6624, + "step": 11500 + }, + { + "epoch": 0.7816958825927436, + "grad_norm": 0.15948626399040222, + "learning_rate": 9.023559586900394e-05, + "loss": 3.9947, + "step": 11505 + }, + { + "epoch": 0.7820356026634053, + "grad_norm": 0.16061913967132568, + "learning_rate": 9.023134936812068e-05, + "loss": 4.1845, + "step": 11510 + }, + { + "epoch": 0.7823753227340672, + "grad_norm": 0.25919288396835327, + "learning_rate": 9.02271028672374e-05, + "loss": 3.9301, + "step": 11515 + }, + { + "epoch": 0.7827150428047289, + "grad_norm": 0.21657872200012207, + "learning_rate": 9.022285636635412e-05, + "loss": 4.0299, + "step": 11520 + }, + { + "epoch": 0.7830547628753907, + "grad_norm": 0.18826550245285034, + "learning_rate": 9.021860986547086e-05, + "loss": 4.212, + "step": 11525 + }, + { + "epoch": 0.7833944829460524, + "grad_norm": 0.2549474835395813, + "learning_rate": 9.021436336458758e-05, + "loss": 4.0705, + "step": 11530 + }, + { + "epoch": 0.7837342030167143, + "grad_norm": 0.6155955195426941, + "learning_rate": 9.02101168637043e-05, + "loss": 3.79, + "step": 11535 + }, + { + "epoch": 0.784073923087376, + "grad_norm": 0.1635499894618988, + "learning_rate": 9.020587036282105e-05, + "loss": 4.1024, + "step": 11540 + }, + { + "epoch": 0.7844136431580377, + "grad_norm": 0.15726587176322937, + "learning_rate": 9.020162386193776e-05, + "loss": 3.9499, + "step": 11545 + }, + { + "epoch": 0.7847533632286996, + "grad_norm": 0.16258913278579712, + "learning_rate": 9.019737736105449e-05, + "loss": 3.9322, + "step": 11550 + }, + { + "epoch": 0.7850930832993613, + "grad_norm": 0.2376587688922882, + "learning_rate": 9.019313086017123e-05, + "loss": 3.9946, + "step": 11555 + }, + { + "epoch": 0.7854328033700231, + "grad_norm": 0.1641000360250473, + "learning_rate": 9.018888435928795e-05, + "loss": 4.1131, + "step": 11560 + }, + { + "epoch": 0.7857725234406848, + "grad_norm": 0.18432609736919403, + "learning_rate": 9.018463785840467e-05, + "loss": 3.8617, + "step": 11565 + }, + { + "epoch": 0.7861122435113467, + "grad_norm": 0.31025978922843933, + "learning_rate": 9.018039135752142e-05, + "loss": 3.7286, + "step": 11570 + }, + { + "epoch": 0.7864519635820084, + "grad_norm": 0.18590706586837769, + "learning_rate": 9.017614485663813e-05, + "loss": 3.9636, + "step": 11575 + }, + { + "epoch": 0.7867916836526702, + "grad_norm": 0.18814896047115326, + "learning_rate": 9.017189835575486e-05, + "loss": 4.3321, + "step": 11580 + }, + { + "epoch": 0.787131403723332, + "grad_norm": 0.17569060623645782, + "learning_rate": 9.016765185487159e-05, + "loss": 4.0207, + "step": 11585 + }, + { + "epoch": 0.7874711237939938, + "grad_norm": 0.8084515333175659, + "learning_rate": 9.016340535398831e-05, + "loss": 3.8702, + "step": 11590 + }, + { + "epoch": 0.7878108438646555, + "grad_norm": 0.1719738245010376, + "learning_rate": 9.015915885310504e-05, + "loss": 3.9472, + "step": 11595 + }, + { + "epoch": 0.7881505639353173, + "grad_norm": 0.3733132481575012, + "learning_rate": 9.015491235222177e-05, + "loss": 3.9621, + "step": 11600 + }, + { + "epoch": 0.7884902840059791, + "grad_norm": 0.1931469440460205, + "learning_rate": 9.01506658513385e-05, + "loss": 4.1482, + "step": 11605 + }, + { + "epoch": 0.7888300040766408, + "grad_norm": 0.6097820997238159, + "learning_rate": 9.014641935045523e-05, + "loss": 4.2861, + "step": 11610 + }, + { + "epoch": 0.7891697241473026, + "grad_norm": 0.23092709481716156, + "learning_rate": 9.014217284957195e-05, + "loss": 3.7435, + "step": 11615 + }, + { + "epoch": 0.7895094442179644, + "grad_norm": 0.20437659323215485, + "learning_rate": 9.013792634868868e-05, + "loss": 4.0259, + "step": 11620 + }, + { + "epoch": 0.7898491642886262, + "grad_norm": 0.19561974704265594, + "learning_rate": 9.013367984780541e-05, + "loss": 3.8396, + "step": 11625 + }, + { + "epoch": 0.7901888843592879, + "grad_norm": 0.22799140214920044, + "learning_rate": 9.012943334692214e-05, + "loss": 4.0335, + "step": 11630 + }, + { + "epoch": 0.7905286044299498, + "grad_norm": 0.1820353865623474, + "learning_rate": 9.012518684603887e-05, + "loss": 3.9942, + "step": 11635 + }, + { + "epoch": 0.7908683245006115, + "grad_norm": 0.217819482088089, + "learning_rate": 9.01209403451556e-05, + "loss": 4.1374, + "step": 11640 + }, + { + "epoch": 0.7912080445712733, + "grad_norm": 0.20061899721622467, + "learning_rate": 9.011669384427232e-05, + "loss": 3.6462, + "step": 11645 + }, + { + "epoch": 0.791547764641935, + "grad_norm": 0.21914707124233246, + "learning_rate": 9.011244734338905e-05, + "loss": 4.0377, + "step": 11650 + }, + { + "epoch": 0.7918874847125968, + "grad_norm": 0.2225886732339859, + "learning_rate": 9.010820084250578e-05, + "loss": 4.1587, + "step": 11655 + }, + { + "epoch": 0.7922272047832586, + "grad_norm": 0.23360738158226013, + "learning_rate": 9.01039543416225e-05, + "loss": 4.0995, + "step": 11660 + }, + { + "epoch": 0.7925669248539203, + "grad_norm": 0.20647506415843964, + "learning_rate": 9.009970784073923e-05, + "loss": 4.0943, + "step": 11665 + }, + { + "epoch": 0.7929066449245822, + "grad_norm": 0.17202545702457428, + "learning_rate": 9.009546133985596e-05, + "loss": 3.8883, + "step": 11670 + }, + { + "epoch": 0.7932463649952439, + "grad_norm": 1.380285382270813, + "learning_rate": 9.009121483897269e-05, + "loss": 4.0137, + "step": 11675 + }, + { + "epoch": 0.7935860850659057, + "grad_norm": 0.23098598420619965, + "learning_rate": 9.008696833808942e-05, + "loss": 4.0737, + "step": 11680 + }, + { + "epoch": 0.7939258051365675, + "grad_norm": 0.17068329453468323, + "learning_rate": 9.008272183720615e-05, + "loss": 4.2746, + "step": 11685 + }, + { + "epoch": 0.7942655252072293, + "grad_norm": 0.23422260582447052, + "learning_rate": 9.007847533632287e-05, + "loss": 3.7582, + "step": 11690 + }, + { + "epoch": 0.794605245277891, + "grad_norm": 0.1885872483253479, + "learning_rate": 9.00742288354396e-05, + "loss": 4.0525, + "step": 11695 + }, + { + "epoch": 0.7949449653485527, + "grad_norm": 0.18177750706672668, + "learning_rate": 9.006998233455633e-05, + "loss": 4.0782, + "step": 11700 + }, + { + "epoch": 0.7952846854192146, + "grad_norm": 2.576247453689575, + "learning_rate": 9.006573583367306e-05, + "loss": 3.8726, + "step": 11705 + }, + { + "epoch": 0.7956244054898763, + "grad_norm": 0.16896361112594604, + "learning_rate": 9.006148933278979e-05, + "loss": 3.8062, + "step": 11710 + }, + { + "epoch": 0.7959641255605381, + "grad_norm": 0.1680668294429779, + "learning_rate": 9.005724283190651e-05, + "loss": 4.0136, + "step": 11715 + }, + { + "epoch": 0.7963038456311999, + "grad_norm": 0.18001356720924377, + "learning_rate": 9.005299633102324e-05, + "loss": 4.1243, + "step": 11720 + }, + { + "epoch": 0.7966435657018617, + "grad_norm": 0.19494907557964325, + "learning_rate": 9.004874983013997e-05, + "loss": 3.9437, + "step": 11725 + }, + { + "epoch": 0.7969832857725234, + "grad_norm": 0.18916480243206024, + "learning_rate": 9.00445033292567e-05, + "loss": 4.0779, + "step": 11730 + }, + { + "epoch": 0.7973230058431852, + "grad_norm": 0.211675226688385, + "learning_rate": 9.004025682837343e-05, + "loss": 3.8902, + "step": 11735 + }, + { + "epoch": 0.797662725913847, + "grad_norm": 0.2676939368247986, + "learning_rate": 9.003601032749015e-05, + "loss": 4.0427, + "step": 11740 + }, + { + "epoch": 0.7980024459845088, + "grad_norm": 0.20862559974193573, + "learning_rate": 9.003176382660688e-05, + "loss": 4.2145, + "step": 11745 + }, + { + "epoch": 0.7983421660551705, + "grad_norm": 0.20464570820331573, + "learning_rate": 9.002751732572361e-05, + "loss": 4.113, + "step": 11750 + }, + { + "epoch": 0.7986818861258324, + "grad_norm": 0.17028920352458954, + "learning_rate": 9.002327082484034e-05, + "loss": 3.6957, + "step": 11755 + }, + { + "epoch": 0.7990216061964941, + "grad_norm": 0.24813637137413025, + "learning_rate": 9.001902432395707e-05, + "loss": 4.1226, + "step": 11760 + }, + { + "epoch": 0.7993613262671558, + "grad_norm": 1.6223915815353394, + "learning_rate": 9.00147778230738e-05, + "loss": 3.9943, + "step": 11765 + }, + { + "epoch": 0.7997010463378177, + "grad_norm": 0.1639162003993988, + "learning_rate": 9.001053132219052e-05, + "loss": 3.9359, + "step": 11770 + }, + { + "epoch": 0.8000407664084794, + "grad_norm": 0.16888979077339172, + "learning_rate": 9.000628482130725e-05, + "loss": 3.8101, + "step": 11775 + }, + { + "epoch": 0.8003804864791412, + "grad_norm": 0.1576785147190094, + "learning_rate": 9.000203832042398e-05, + "loss": 4.062, + "step": 11780 + }, + { + "epoch": 0.8007202065498029, + "grad_norm": 0.19945354759693146, + "learning_rate": 8.99977918195407e-05, + "loss": 4.2053, + "step": 11785 + }, + { + "epoch": 0.8010599266204648, + "grad_norm": 0.13953137397766113, + "learning_rate": 8.999354531865743e-05, + "loss": 3.6578, + "step": 11790 + }, + { + "epoch": 0.8013996466911265, + "grad_norm": 0.1995120495557785, + "learning_rate": 8.998929881777416e-05, + "loss": 3.9994, + "step": 11795 + }, + { + "epoch": 0.8017393667617883, + "grad_norm": 0.22360244393348694, + "learning_rate": 8.998505231689088e-05, + "loss": 4.1224, + "step": 11800 + }, + { + "epoch": 0.8020790868324501, + "grad_norm": 0.20481501519680023, + "learning_rate": 8.998080581600762e-05, + "loss": 4.2824, + "step": 11805 + }, + { + "epoch": 0.8024188069031118, + "grad_norm": 0.39974507689476013, + "learning_rate": 8.997655931512435e-05, + "loss": 4.1875, + "step": 11810 + }, + { + "epoch": 0.8027585269737736, + "grad_norm": 0.32297125458717346, + "learning_rate": 8.997231281424106e-05, + "loss": 3.8753, + "step": 11815 + }, + { + "epoch": 0.8030982470444353, + "grad_norm": 0.2076197862625122, + "learning_rate": 8.996891561353446e-05, + "loss": 4.0354, + "step": 11820 + }, + { + "epoch": 0.8034379671150972, + "grad_norm": 0.17974001169204712, + "learning_rate": 8.996466911265119e-05, + "loss": 4.0395, + "step": 11825 + }, + { + "epoch": 0.8037776871857589, + "grad_norm": 0.18468719720840454, + "learning_rate": 8.99604226117679e-05, + "loss": 4.0589, + "step": 11830 + }, + { + "epoch": 0.8041174072564207, + "grad_norm": 0.2775026857852936, + "learning_rate": 8.995617611088464e-05, + "loss": 4.0899, + "step": 11835 + }, + { + "epoch": 0.8044571273270825, + "grad_norm": 0.20769546926021576, + "learning_rate": 8.995192961000136e-05, + "loss": 4.1438, + "step": 11840 + }, + { + "epoch": 0.8047968473977443, + "grad_norm": 0.18603581190109253, + "learning_rate": 8.994768310911808e-05, + "loss": 3.9822, + "step": 11845 + }, + { + "epoch": 0.805136567468406, + "grad_norm": 0.3347295820713043, + "learning_rate": 8.994343660823483e-05, + "loss": 4.042, + "step": 11850 + }, + { + "epoch": 0.8054762875390679, + "grad_norm": 0.26557305455207825, + "learning_rate": 8.993919010735154e-05, + "loss": 3.9212, + "step": 11855 + }, + { + "epoch": 0.8058160076097296, + "grad_norm": 0.27433109283447266, + "learning_rate": 8.993494360646827e-05, + "loss": 3.7039, + "step": 11860 + }, + { + "epoch": 0.8061557276803913, + "grad_norm": 0.1835566610097885, + "learning_rate": 8.993069710558501e-05, + "loss": 3.8467, + "step": 11865 + }, + { + "epoch": 0.8064954477510531, + "grad_norm": 0.15933853387832642, + "learning_rate": 8.992645060470172e-05, + "loss": 4.035, + "step": 11870 + }, + { + "epoch": 0.8068351678217149, + "grad_norm": 0.1779545098543167, + "learning_rate": 8.992220410381845e-05, + "loss": 4.0, + "step": 11875 + }, + { + "epoch": 0.8071748878923767, + "grad_norm": 0.19771164655685425, + "learning_rate": 8.99179576029352e-05, + "loss": 3.5028, + "step": 11880 + }, + { + "epoch": 0.8075146079630384, + "grad_norm": 0.17675349116325378, + "learning_rate": 8.991371110205191e-05, + "loss": 3.9989, + "step": 11885 + }, + { + "epoch": 0.8078543280337003, + "grad_norm": 0.23120000958442688, + "learning_rate": 8.990946460116864e-05, + "loss": 3.7091, + "step": 11890 + }, + { + "epoch": 0.808194048104362, + "grad_norm": 0.18149258196353912, + "learning_rate": 8.990521810028538e-05, + "loss": 3.9628, + "step": 11895 + }, + { + "epoch": 0.8085337681750238, + "grad_norm": 0.2674315571784973, + "learning_rate": 8.990097159940209e-05, + "loss": 3.7765, + "step": 11900 + }, + { + "epoch": 0.8088734882456855, + "grad_norm": 0.21212173998355865, + "learning_rate": 8.989672509851883e-05, + "loss": 3.8405, + "step": 11905 + }, + { + "epoch": 0.8092132083163474, + "grad_norm": 0.16879509389400482, + "learning_rate": 8.989247859763555e-05, + "loss": 4.0809, + "step": 11910 + }, + { + "epoch": 0.8095529283870091, + "grad_norm": 0.14125306904315948, + "learning_rate": 8.988823209675228e-05, + "loss": 4.0725, + "step": 11915 + }, + { + "epoch": 0.8098926484576708, + "grad_norm": 0.1498613953590393, + "learning_rate": 8.988398559586902e-05, + "loss": 3.7912, + "step": 11920 + }, + { + "epoch": 0.8102323685283327, + "grad_norm": 0.16456682980060577, + "learning_rate": 8.987973909498573e-05, + "loss": 4.0187, + "step": 11925 + }, + { + "epoch": 0.8105720885989944, + "grad_norm": 0.3114604949951172, + "learning_rate": 8.987549259410246e-05, + "loss": 4.0009, + "step": 11930 + }, + { + "epoch": 0.8109118086696562, + "grad_norm": 0.5615077018737793, + "learning_rate": 8.98712460932192e-05, + "loss": 4.0609, + "step": 11935 + }, + { + "epoch": 0.811251528740318, + "grad_norm": 0.27753254771232605, + "learning_rate": 8.986699959233592e-05, + "loss": 3.8107, + "step": 11940 + }, + { + "epoch": 0.8115912488109798, + "grad_norm": 0.21950267255306244, + "learning_rate": 8.986275309145264e-05, + "loss": 3.8093, + "step": 11945 + }, + { + "epoch": 0.8119309688816415, + "grad_norm": 0.17988736927509308, + "learning_rate": 8.985850659056939e-05, + "loss": 3.9343, + "step": 11950 + }, + { + "epoch": 0.8122706889523033, + "grad_norm": 0.23350049555301666, + "learning_rate": 8.98542600896861e-05, + "loss": 4.0691, + "step": 11955 + }, + { + "epoch": 0.8126104090229651, + "grad_norm": 0.19277788698673248, + "learning_rate": 8.985001358880283e-05, + "loss": 3.9519, + "step": 11960 + }, + { + "epoch": 0.8129501290936268, + "grad_norm": 0.21622268855571747, + "learning_rate": 8.984576708791957e-05, + "loss": 3.8689, + "step": 11965 + }, + { + "epoch": 0.8132898491642886, + "grad_norm": 1.2102338075637817, + "learning_rate": 8.984152058703628e-05, + "loss": 3.9296, + "step": 11970 + }, + { + "epoch": 0.8136295692349504, + "grad_norm": 0.2097243219614029, + "learning_rate": 8.983727408615301e-05, + "loss": 4.2595, + "step": 11975 + }, + { + "epoch": 0.8139692893056122, + "grad_norm": 0.3595362603664398, + "learning_rate": 8.983302758526974e-05, + "loss": 3.9542, + "step": 11980 + }, + { + "epoch": 0.8143090093762739, + "grad_norm": 0.18622025847434998, + "learning_rate": 8.982878108438647e-05, + "loss": 3.8882, + "step": 11985 + }, + { + "epoch": 0.8146487294469357, + "grad_norm": 0.19790107011795044, + "learning_rate": 8.98245345835032e-05, + "loss": 4.1604, + "step": 11990 + }, + { + "epoch": 0.8149884495175975, + "grad_norm": 0.21050450205802917, + "learning_rate": 8.982028808261992e-05, + "loss": 3.7831, + "step": 11995 + }, + { + "epoch": 0.8153281695882593, + "grad_norm": 0.2178838849067688, + "learning_rate": 8.981604158173665e-05, + "loss": 3.8846, + "step": 12000 + }, + { + "epoch": 0.815667889658921, + "grad_norm": 0.20060613751411438, + "learning_rate": 8.981179508085338e-05, + "loss": 4.0856, + "step": 12005 + }, + { + "epoch": 0.8160076097295829, + "grad_norm": 0.19663146138191223, + "learning_rate": 8.980754857997011e-05, + "loss": 3.9376, + "step": 12010 + }, + { + "epoch": 0.8163473298002446, + "grad_norm": 0.36938565969467163, + "learning_rate": 8.980330207908684e-05, + "loss": 4.121, + "step": 12015 + }, + { + "epoch": 0.8166870498709063, + "grad_norm": 0.17913353443145752, + "learning_rate": 8.979905557820356e-05, + "loss": 3.9507, + "step": 12020 + }, + { + "epoch": 0.8170267699415682, + "grad_norm": 0.18103277683258057, + "learning_rate": 8.979480907732029e-05, + "loss": 3.651, + "step": 12025 + }, + { + "epoch": 0.8173664900122299, + "grad_norm": 0.1673816740512848, + "learning_rate": 8.979056257643702e-05, + "loss": 3.9309, + "step": 12030 + }, + { + "epoch": 0.8177062100828917, + "grad_norm": 0.4948117434978485, + "learning_rate": 8.978631607555375e-05, + "loss": 3.9102, + "step": 12035 + }, + { + "epoch": 0.8180459301535534, + "grad_norm": 0.16142868995666504, + "learning_rate": 8.978206957467048e-05, + "loss": 3.8234, + "step": 12040 + }, + { + "epoch": 0.8183856502242153, + "grad_norm": 0.2791318893432617, + "learning_rate": 8.97778230737872e-05, + "loss": 4.1417, + "step": 12045 + }, + { + "epoch": 0.818725370294877, + "grad_norm": 0.17257554829120636, + "learning_rate": 8.977357657290393e-05, + "loss": 4.043, + "step": 12050 + }, + { + "epoch": 0.8190650903655388, + "grad_norm": 0.2123369425535202, + "learning_rate": 8.976933007202066e-05, + "loss": 3.9297, + "step": 12055 + }, + { + "epoch": 0.8194048104362006, + "grad_norm": 0.19854533672332764, + "learning_rate": 8.976508357113739e-05, + "loss": 3.8441, + "step": 12060 + }, + { + "epoch": 0.8197445305068624, + "grad_norm": 0.24193492531776428, + "learning_rate": 8.976083707025412e-05, + "loss": 4.0607, + "step": 12065 + }, + { + "epoch": 0.8200842505775241, + "grad_norm": 0.1961483359336853, + "learning_rate": 8.975659056937084e-05, + "loss": 3.8928, + "step": 12070 + }, + { + "epoch": 0.8204239706481858, + "grad_norm": 0.14383824169635773, + "learning_rate": 8.975234406848757e-05, + "loss": 4.0603, + "step": 12075 + }, + { + "epoch": 0.8207636907188477, + "grad_norm": 0.2458658516407013, + "learning_rate": 8.97480975676043e-05, + "loss": 3.8334, + "step": 12080 + }, + { + "epoch": 0.8211034107895094, + "grad_norm": 0.17008869349956512, + "learning_rate": 8.974385106672103e-05, + "loss": 3.9803, + "step": 12085 + }, + { + "epoch": 0.8214431308601712, + "grad_norm": 0.20078590512275696, + "learning_rate": 8.973960456583776e-05, + "loss": 4.1641, + "step": 12090 + }, + { + "epoch": 0.821782850930833, + "grad_norm": 0.1937909722328186, + "learning_rate": 8.973535806495448e-05, + "loss": 4.0954, + "step": 12095 + }, + { + "epoch": 0.8221225710014948, + "grad_norm": 0.18328414857387543, + "learning_rate": 8.973111156407121e-05, + "loss": 3.9427, + "step": 12100 + }, + { + "epoch": 0.8224622910721565, + "grad_norm": 0.2016650289297104, + "learning_rate": 8.972686506318794e-05, + "loss": 3.9728, + "step": 12105 + }, + { + "epoch": 0.8228020111428184, + "grad_norm": 0.17548047006130219, + "learning_rate": 8.972261856230465e-05, + "loss": 4.1828, + "step": 12110 + }, + { + "epoch": 0.8231417312134801, + "grad_norm": 0.39229270815849304, + "learning_rate": 8.97183720614214e-05, + "loss": 4.0582, + "step": 12115 + }, + { + "epoch": 0.8234814512841419, + "grad_norm": 0.18692153692245483, + "learning_rate": 8.971412556053812e-05, + "loss": 3.9979, + "step": 12120 + }, + { + "epoch": 0.8238211713548036, + "grad_norm": 0.22566412389278412, + "learning_rate": 8.970987905965484e-05, + "loss": 4.0853, + "step": 12125 + }, + { + "epoch": 0.8241608914254654, + "grad_norm": 0.2699925899505615, + "learning_rate": 8.970563255877158e-05, + "loss": 4.2611, + "step": 12130 + }, + { + "epoch": 0.8245006114961272, + "grad_norm": 0.2766724228858948, + "learning_rate": 8.970138605788831e-05, + "loss": 3.8676, + "step": 12135 + }, + { + "epoch": 0.8248403315667889, + "grad_norm": 0.149053156375885, + "learning_rate": 8.969713955700502e-05, + "loss": 4.0457, + "step": 12140 + }, + { + "epoch": 0.8251800516374508, + "grad_norm": 0.29666438698768616, + "learning_rate": 8.969289305612176e-05, + "loss": 4.0984, + "step": 12145 + }, + { + "epoch": 0.8255197717081125, + "grad_norm": 0.1891719549894333, + "learning_rate": 8.968864655523849e-05, + "loss": 4.009, + "step": 12150 + }, + { + "epoch": 0.8258594917787743, + "grad_norm": 0.1801346093416214, + "learning_rate": 8.968440005435521e-05, + "loss": 4.1132, + "step": 12155 + }, + { + "epoch": 0.826199211849436, + "grad_norm": 1.4226734638214111, + "learning_rate": 8.968015355347195e-05, + "loss": 3.9887, + "step": 12160 + }, + { + "epoch": 0.8265389319200979, + "grad_norm": 0.3138851523399353, + "learning_rate": 8.967590705258868e-05, + "loss": 4.0721, + "step": 12165 + }, + { + "epoch": 0.8268786519907596, + "grad_norm": 0.19921836256980896, + "learning_rate": 8.967166055170539e-05, + "loss": 4.0086, + "step": 12170 + }, + { + "epoch": 0.8272183720614213, + "grad_norm": 0.2232120782136917, + "learning_rate": 8.966741405082213e-05, + "loss": 3.8876, + "step": 12175 + }, + { + "epoch": 0.8275580921320832, + "grad_norm": 0.21191275119781494, + "learning_rate": 8.966316754993885e-05, + "loss": 3.9509, + "step": 12180 + }, + { + "epoch": 0.8278978122027449, + "grad_norm": 1.0071362257003784, + "learning_rate": 8.965892104905557e-05, + "loss": 4.0208, + "step": 12185 + }, + { + "epoch": 0.8282375322734067, + "grad_norm": 0.30778366327285767, + "learning_rate": 8.965467454817232e-05, + "loss": 4.0989, + "step": 12190 + }, + { + "epoch": 0.8285772523440685, + "grad_norm": 0.15304256975650787, + "learning_rate": 8.965042804728903e-05, + "loss": 4.1485, + "step": 12195 + }, + { + "epoch": 0.8289169724147303, + "grad_norm": 0.1691897213459015, + "learning_rate": 8.964618154640576e-05, + "loss": 4.1178, + "step": 12200 + }, + { + "epoch": 0.829256692485392, + "grad_norm": 0.2017151266336441, + "learning_rate": 8.96419350455225e-05, + "loss": 4.1713, + "step": 12205 + }, + { + "epoch": 0.8295964125560538, + "grad_norm": 0.45046570897102356, + "learning_rate": 8.963768854463921e-05, + "loss": 3.8627, + "step": 12210 + }, + { + "epoch": 0.8299361326267156, + "grad_norm": 0.21693216264247894, + "learning_rate": 8.963344204375594e-05, + "loss": 3.969, + "step": 12215 + }, + { + "epoch": 0.8302758526973774, + "grad_norm": 0.22828508913516998, + "learning_rate": 8.962919554287268e-05, + "loss": 3.9584, + "step": 12220 + }, + { + "epoch": 0.8306155727680391, + "grad_norm": 0.2518628239631653, + "learning_rate": 8.96249490419894e-05, + "loss": 4.0381, + "step": 12225 + }, + { + "epoch": 0.830955292838701, + "grad_norm": 0.1994330883026123, + "learning_rate": 8.962070254110613e-05, + "loss": 3.8909, + "step": 12230 + }, + { + "epoch": 0.8312950129093627, + "grad_norm": 0.1634039431810379, + "learning_rate": 8.961645604022287e-05, + "loss": 3.5102, + "step": 12235 + }, + { + "epoch": 0.8316347329800244, + "grad_norm": 0.17670801281929016, + "learning_rate": 8.961220953933958e-05, + "loss": 3.8131, + "step": 12240 + }, + { + "epoch": 0.8319744530506862, + "grad_norm": 0.19512879848480225, + "learning_rate": 8.960796303845632e-05, + "loss": 3.5444, + "step": 12245 + }, + { + "epoch": 0.832314173121348, + "grad_norm": 0.5432287454605103, + "learning_rate": 8.960371653757305e-05, + "loss": 3.8723, + "step": 12250 + }, + { + "epoch": 0.8326538931920098, + "grad_norm": 0.21648722887039185, + "learning_rate": 8.959947003668977e-05, + "loss": 3.7991, + "step": 12255 + }, + { + "epoch": 0.8329936132626715, + "grad_norm": 0.3105649948120117, + "learning_rate": 8.959522353580651e-05, + "loss": 3.8407, + "step": 12260 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.32569703459739685, + "learning_rate": 8.959097703492322e-05, + "loss": 4.2322, + "step": 12265 + }, + { + "epoch": 0.8336730534039951, + "grad_norm": 0.2628330588340759, + "learning_rate": 8.958673053403995e-05, + "loss": 3.599, + "step": 12270 + }, + { + "epoch": 0.8340127734746569, + "grad_norm": 0.15857040882110596, + "learning_rate": 8.958248403315669e-05, + "loss": 4.0897, + "step": 12275 + }, + { + "epoch": 0.8343524935453187, + "grad_norm": 0.20659282803535461, + "learning_rate": 8.957823753227341e-05, + "loss": 3.8562, + "step": 12280 + }, + { + "epoch": 0.8346922136159804, + "grad_norm": 0.21449913084506989, + "learning_rate": 8.957399103139014e-05, + "loss": 3.9756, + "step": 12285 + }, + { + "epoch": 0.8350319336866422, + "grad_norm": 0.1901203691959381, + "learning_rate": 8.956974453050688e-05, + "loss": 4.025, + "step": 12290 + }, + { + "epoch": 0.8353716537573039, + "grad_norm": 0.20290473103523254, + "learning_rate": 8.956549802962359e-05, + "loss": 3.8794, + "step": 12295 + }, + { + "epoch": 0.8357113738279658, + "grad_norm": 0.1798480749130249, + "learning_rate": 8.956125152874032e-05, + "loss": 3.9994, + "step": 12300 + }, + { + "epoch": 0.8360510938986275, + "grad_norm": 0.25437653064727783, + "learning_rate": 8.955700502785706e-05, + "loss": 3.6909, + "step": 12305 + }, + { + "epoch": 0.8363908139692893, + "grad_norm": 0.3736377954483032, + "learning_rate": 8.955275852697378e-05, + "loss": 4.0622, + "step": 12310 + }, + { + "epoch": 0.8367305340399511, + "grad_norm": 0.18843521177768707, + "learning_rate": 8.95485120260905e-05, + "loss": 3.7766, + "step": 12315 + }, + { + "epoch": 0.8370702541106129, + "grad_norm": 0.18586871027946472, + "learning_rate": 8.954426552520724e-05, + "loss": 4.0302, + "step": 12320 + }, + { + "epoch": 0.8374099741812746, + "grad_norm": 0.17058813571929932, + "learning_rate": 8.954001902432396e-05, + "loss": 4.0756, + "step": 12325 + }, + { + "epoch": 0.8377496942519363, + "grad_norm": 0.21250483393669128, + "learning_rate": 8.953577252344069e-05, + "loss": 3.6402, + "step": 12330 + }, + { + "epoch": 0.8380894143225982, + "grad_norm": 0.18693235516548157, + "learning_rate": 8.953152602255742e-05, + "loss": 4.2171, + "step": 12335 + }, + { + "epoch": 0.83842913439326, + "grad_norm": 0.20142552256584167, + "learning_rate": 8.952727952167414e-05, + "loss": 4.0116, + "step": 12340 + }, + { + "epoch": 0.8387688544639217, + "grad_norm": 0.5936012864112854, + "learning_rate": 8.952303302079087e-05, + "loss": 4.0355, + "step": 12345 + }, + { + "epoch": 0.8391085745345835, + "grad_norm": 0.3252449631690979, + "learning_rate": 8.95187865199076e-05, + "loss": 4.0143, + "step": 12350 + }, + { + "epoch": 0.8394482946052453, + "grad_norm": 0.18693962693214417, + "learning_rate": 8.951454001902433e-05, + "loss": 3.938, + "step": 12355 + }, + { + "epoch": 0.839788014675907, + "grad_norm": 0.36720210313796997, + "learning_rate": 8.951029351814106e-05, + "loss": 3.9352, + "step": 12360 + }, + { + "epoch": 0.8401277347465689, + "grad_norm": 0.14825104176998138, + "learning_rate": 8.950604701725778e-05, + "loss": 4.2168, + "step": 12365 + }, + { + "epoch": 0.8404674548172306, + "grad_norm": 0.23677104711532593, + "learning_rate": 8.950180051637451e-05, + "loss": 3.967, + "step": 12370 + }, + { + "epoch": 0.8408071748878924, + "grad_norm": 0.5124611258506775, + "learning_rate": 8.949755401549124e-05, + "loss": 3.9194, + "step": 12375 + }, + { + "epoch": 0.8411468949585541, + "grad_norm": 0.3029448688030243, + "learning_rate": 8.949330751460797e-05, + "loss": 3.9354, + "step": 12380 + }, + { + "epoch": 0.841486615029216, + "grad_norm": 0.20730510354042053, + "learning_rate": 8.94890610137247e-05, + "loss": 4.1058, + "step": 12385 + }, + { + "epoch": 0.8418263350998777, + "grad_norm": 0.22315102815628052, + "learning_rate": 8.948481451284142e-05, + "loss": 3.893, + "step": 12390 + }, + { + "epoch": 0.8421660551705394, + "grad_norm": 0.47029411792755127, + "learning_rate": 8.948056801195815e-05, + "loss": 3.8572, + "step": 12395 + }, + { + "epoch": 0.8425057752412013, + "grad_norm": 0.19684122502803802, + "learning_rate": 8.947632151107488e-05, + "loss": 4.0835, + "step": 12400 + }, + { + "epoch": 0.842845495311863, + "grad_norm": 0.18742690980434418, + "learning_rate": 8.947207501019161e-05, + "loss": 3.9798, + "step": 12405 + }, + { + "epoch": 0.8431852153825248, + "grad_norm": 0.177710622549057, + "learning_rate": 8.946782850930834e-05, + "loss": 4.1085, + "step": 12410 + }, + { + "epoch": 0.8435249354531865, + "grad_norm": 0.18476006388664246, + "learning_rate": 8.946358200842506e-05, + "loss": 4.0936, + "step": 12415 + }, + { + "epoch": 0.8438646555238484, + "grad_norm": 0.16293834149837494, + "learning_rate": 8.946018480771845e-05, + "loss": 3.9854, + "step": 12420 + }, + { + "epoch": 0.8442043755945101, + "grad_norm": 0.22308799624443054, + "learning_rate": 8.945593830683517e-05, + "loss": 4.1042, + "step": 12425 + }, + { + "epoch": 0.8445440956651719, + "grad_norm": 0.16046442091464996, + "learning_rate": 8.94516918059519e-05, + "loss": 4.0012, + "step": 12430 + }, + { + "epoch": 0.8448838157358337, + "grad_norm": 0.26154249906539917, + "learning_rate": 8.944744530506862e-05, + "loss": 4.0459, + "step": 12435 + }, + { + "epoch": 0.8452235358064955, + "grad_norm": 0.23383556306362152, + "learning_rate": 8.944319880418536e-05, + "loss": 4.0728, + "step": 12440 + }, + { + "epoch": 0.8455632558771572, + "grad_norm": 0.22498470544815063, + "learning_rate": 8.943895230330209e-05, + "loss": 3.979, + "step": 12445 + }, + { + "epoch": 0.845902975947819, + "grad_norm": 0.18164518475532532, + "learning_rate": 8.943470580241881e-05, + "loss": 3.8648, + "step": 12450 + }, + { + "epoch": 0.8462426960184808, + "grad_norm": 0.21877458691596985, + "learning_rate": 8.943045930153554e-05, + "loss": 4.2495, + "step": 12455 + }, + { + "epoch": 0.8465824160891425, + "grad_norm": 0.1924646645784378, + "learning_rate": 8.942621280065227e-05, + "loss": 3.8453, + "step": 12460 + }, + { + "epoch": 0.8469221361598043, + "grad_norm": 0.20620742440223694, + "learning_rate": 8.9421966299769e-05, + "loss": 3.9962, + "step": 12465 + }, + { + "epoch": 0.8472618562304661, + "grad_norm": 0.290763795375824, + "learning_rate": 8.941771979888573e-05, + "loss": 3.8139, + "step": 12470 + }, + { + "epoch": 0.8476015763011279, + "grad_norm": 0.16620713472366333, + "learning_rate": 8.941347329800245e-05, + "loss": 3.7297, + "step": 12475 + }, + { + "epoch": 0.8479412963717896, + "grad_norm": 2.936108112335205, + "learning_rate": 8.940922679711918e-05, + "loss": 3.8085, + "step": 12480 + }, + { + "epoch": 0.8482810164424515, + "grad_norm": 0.1852518916130066, + "learning_rate": 8.940498029623591e-05, + "loss": 3.9015, + "step": 12485 + }, + { + "epoch": 0.8486207365131132, + "grad_norm": 0.159807950258255, + "learning_rate": 8.940073379535264e-05, + "loss": 3.9683, + "step": 12490 + }, + { + "epoch": 0.848960456583775, + "grad_norm": 0.15564261376857758, + "learning_rate": 8.939648729446937e-05, + "loss": 4.0309, + "step": 12495 + }, + { + "epoch": 0.8493001766544367, + "grad_norm": 0.18451645970344543, + "learning_rate": 8.93922407935861e-05, + "loss": 4.002, + "step": 12500 + }, + { + "epoch": 0.8496398967250985, + "grad_norm": 0.4409978687763214, + "learning_rate": 8.938799429270281e-05, + "loss": 3.9582, + "step": 12505 + }, + { + "epoch": 0.8499796167957603, + "grad_norm": 0.21767017245292664, + "learning_rate": 8.938374779181955e-05, + "loss": 3.7942, + "step": 12510 + }, + { + "epoch": 0.850319336866422, + "grad_norm": 0.29611897468566895, + "learning_rate": 8.937950129093628e-05, + "loss": 3.8641, + "step": 12515 + }, + { + "epoch": 0.8506590569370839, + "grad_norm": 0.15617810189723969, + "learning_rate": 8.937525479005299e-05, + "loss": 4.0887, + "step": 12520 + }, + { + "epoch": 0.8509987770077456, + "grad_norm": 0.19923017919063568, + "learning_rate": 8.937100828916973e-05, + "loss": 3.9796, + "step": 12525 + }, + { + "epoch": 0.8513384970784074, + "grad_norm": 0.178538978099823, + "learning_rate": 8.936676178828646e-05, + "loss": 4.1373, + "step": 12530 + }, + { + "epoch": 0.8516782171490692, + "grad_norm": 0.20157839357852936, + "learning_rate": 8.936251528740318e-05, + "loss": 4.1656, + "step": 12535 + }, + { + "epoch": 0.852017937219731, + "grad_norm": 0.21917836368083954, + "learning_rate": 8.935826878651992e-05, + "loss": 3.9735, + "step": 12540 + }, + { + "epoch": 0.8523576572903927, + "grad_norm": 0.1743486225605011, + "learning_rate": 8.935402228563665e-05, + "loss": 4.0561, + "step": 12545 + }, + { + "epoch": 0.8526973773610544, + "grad_norm": 0.3735666573047638, + "learning_rate": 8.934977578475336e-05, + "loss": 3.9863, + "step": 12550 + }, + { + "epoch": 0.8530370974317163, + "grad_norm": 0.21841804683208466, + "learning_rate": 8.93455292838701e-05, + "loss": 4.0283, + "step": 12555 + }, + { + "epoch": 0.853376817502378, + "grad_norm": 0.4104025065898895, + "learning_rate": 8.934128278298683e-05, + "loss": 3.7465, + "step": 12560 + }, + { + "epoch": 0.8537165375730398, + "grad_norm": 0.15292176604270935, + "learning_rate": 8.933703628210354e-05, + "loss": 4.0306, + "step": 12565 + }, + { + "epoch": 0.8540562576437016, + "grad_norm": 0.30871346592903137, + "learning_rate": 8.933278978122029e-05, + "loss": 3.9504, + "step": 12570 + }, + { + "epoch": 0.8543959777143634, + "grad_norm": 0.18270432949066162, + "learning_rate": 8.9328543280337e-05, + "loss": 4.1066, + "step": 12575 + }, + { + "epoch": 0.8547356977850251, + "grad_norm": 0.5479381680488586, + "learning_rate": 8.932429677945373e-05, + "loss": 3.9433, + "step": 12580 + }, + { + "epoch": 0.8550754178556869, + "grad_norm": 0.19208583235740662, + "learning_rate": 8.932005027857047e-05, + "loss": 3.7949, + "step": 12585 + }, + { + "epoch": 0.8554151379263487, + "grad_norm": 0.19413863122463226, + "learning_rate": 8.931580377768718e-05, + "loss": 4.07, + "step": 12590 + }, + { + "epoch": 0.8557548579970105, + "grad_norm": 0.19963692128658295, + "learning_rate": 8.931155727680391e-05, + "loss": 4.038, + "step": 12595 + }, + { + "epoch": 0.8560945780676722, + "grad_norm": 0.18308939039707184, + "learning_rate": 8.930731077592065e-05, + "loss": 4.0086, + "step": 12600 + }, + { + "epoch": 0.856434298138334, + "grad_norm": 0.23906344175338745, + "learning_rate": 8.930306427503737e-05, + "loss": 3.835, + "step": 12605 + }, + { + "epoch": 0.8567740182089958, + "grad_norm": 0.15061314404010773, + "learning_rate": 8.92988177741541e-05, + "loss": 3.9051, + "step": 12610 + }, + { + "epoch": 0.8571137382796575, + "grad_norm": 0.4929114282131195, + "learning_rate": 8.929457127327084e-05, + "loss": 3.8619, + "step": 12615 + }, + { + "epoch": 0.8574534583503194, + "grad_norm": 0.23578637838363647, + "learning_rate": 8.929032477238755e-05, + "loss": 4.2782, + "step": 12620 + }, + { + "epoch": 0.8577931784209811, + "grad_norm": 0.2066326141357422, + "learning_rate": 8.928607827150428e-05, + "loss": 4.2725, + "step": 12625 + }, + { + "epoch": 0.8581328984916429, + "grad_norm": 0.22157415747642517, + "learning_rate": 8.928183177062102e-05, + "loss": 4.0701, + "step": 12630 + }, + { + "epoch": 0.8584726185623046, + "grad_norm": 0.9750187397003174, + "learning_rate": 8.927758526973774e-05, + "loss": 3.8094, + "step": 12635 + }, + { + "epoch": 0.8588123386329665, + "grad_norm": 0.17541570961475372, + "learning_rate": 8.927333876885446e-05, + "loss": 4.1766, + "step": 12640 + }, + { + "epoch": 0.8591520587036282, + "grad_norm": 0.1566866785287857, + "learning_rate": 8.92690922679712e-05, + "loss": 4.0965, + "step": 12645 + }, + { + "epoch": 0.85949177877429, + "grad_norm": 0.18229223787784576, + "learning_rate": 8.926484576708792e-05, + "loss": 3.9081, + "step": 12650 + }, + { + "epoch": 0.8598314988449518, + "grad_norm": 0.19233596324920654, + "learning_rate": 8.926059926620465e-05, + "loss": 3.9146, + "step": 12655 + }, + { + "epoch": 0.8601712189156135, + "grad_norm": 0.26399555802345276, + "learning_rate": 8.925635276532138e-05, + "loss": 3.8794, + "step": 12660 + }, + { + "epoch": 0.8605109389862753, + "grad_norm": 0.18803419172763824, + "learning_rate": 8.92521062644381e-05, + "loss": 4.2288, + "step": 12665 + }, + { + "epoch": 0.860850659056937, + "grad_norm": 0.2003091722726822, + "learning_rate": 8.924785976355483e-05, + "loss": 3.9376, + "step": 12670 + }, + { + "epoch": 0.8611903791275989, + "grad_norm": 2.6071043014526367, + "learning_rate": 8.924361326267156e-05, + "loss": 3.9786, + "step": 12675 + }, + { + "epoch": 0.8615300991982606, + "grad_norm": 0.21313896775245667, + "learning_rate": 8.923936676178829e-05, + "loss": 4.0317, + "step": 12680 + }, + { + "epoch": 0.8618698192689224, + "grad_norm": 0.17100752890110016, + "learning_rate": 8.923512026090502e-05, + "loss": 3.9988, + "step": 12685 + }, + { + "epoch": 0.8622095393395842, + "grad_norm": 0.23430535197257996, + "learning_rate": 8.923087376002174e-05, + "loss": 3.7328, + "step": 12690 + }, + { + "epoch": 0.862549259410246, + "grad_norm": 0.1643848717212677, + "learning_rate": 8.922662725913847e-05, + "loss": 4.109, + "step": 12695 + }, + { + "epoch": 0.8628889794809077, + "grad_norm": 0.2526448369026184, + "learning_rate": 8.92223807582552e-05, + "loss": 4.0189, + "step": 12700 + }, + { + "epoch": 0.8632286995515696, + "grad_norm": 0.24552328884601593, + "learning_rate": 8.921813425737193e-05, + "loss": 3.9226, + "step": 12705 + }, + { + "epoch": 0.8635684196222313, + "grad_norm": 0.17898543179035187, + "learning_rate": 8.921388775648866e-05, + "loss": 3.7268, + "step": 12710 + }, + { + "epoch": 0.863908139692893, + "grad_norm": 1.15958833694458, + "learning_rate": 8.920964125560538e-05, + "loss": 3.9694, + "step": 12715 + }, + { + "epoch": 0.8642478597635548, + "grad_norm": 0.15424911677837372, + "learning_rate": 8.920539475472211e-05, + "loss": 4.0695, + "step": 12720 + }, + { + "epoch": 0.8645875798342166, + "grad_norm": 0.1732960045337677, + "learning_rate": 8.920114825383884e-05, + "loss": 4.0694, + "step": 12725 + }, + { + "epoch": 0.8649272999048784, + "grad_norm": 0.25387898087501526, + "learning_rate": 8.919690175295557e-05, + "loss": 3.9112, + "step": 12730 + }, + { + "epoch": 0.8652670199755401, + "grad_norm": 0.18854272365570068, + "learning_rate": 8.91926552520723e-05, + "loss": 4.1747, + "step": 12735 + }, + { + "epoch": 0.865606740046202, + "grad_norm": 0.18318019807338715, + "learning_rate": 8.918840875118902e-05, + "loss": 4.0418, + "step": 12740 + }, + { + "epoch": 0.8659464601168637, + "grad_norm": 0.1997986137866974, + "learning_rate": 8.918416225030575e-05, + "loss": 3.9741, + "step": 12745 + }, + { + "epoch": 0.8662861801875255, + "grad_norm": 0.1834951937198639, + "learning_rate": 8.917991574942248e-05, + "loss": 4.0931, + "step": 12750 + }, + { + "epoch": 0.8666259002581872, + "grad_norm": 0.1741950660943985, + "learning_rate": 8.917566924853921e-05, + "loss": 3.8176, + "step": 12755 + }, + { + "epoch": 0.866965620328849, + "grad_norm": 0.18136630952358246, + "learning_rate": 8.917142274765594e-05, + "loss": 4.0065, + "step": 12760 + }, + { + "epoch": 0.8673053403995108, + "grad_norm": 0.22858762741088867, + "learning_rate": 8.916717624677266e-05, + "loss": 3.8951, + "step": 12765 + }, + { + "epoch": 0.8676450604701725, + "grad_norm": 0.15833310782909393, + "learning_rate": 8.916292974588939e-05, + "loss": 3.9966, + "step": 12770 + }, + { + "epoch": 0.8679847805408344, + "grad_norm": 0.16821962594985962, + "learning_rate": 8.915868324500611e-05, + "loss": 4.1885, + "step": 12775 + }, + { + "epoch": 0.8683245006114961, + "grad_norm": 0.16304543614387512, + "learning_rate": 8.915443674412285e-05, + "loss": 4.054, + "step": 12780 + }, + { + "epoch": 0.8686642206821579, + "grad_norm": 0.19145479798316956, + "learning_rate": 8.915019024323958e-05, + "loss": 4.0134, + "step": 12785 + }, + { + "epoch": 0.8690039407528197, + "grad_norm": 0.20590396225452423, + "learning_rate": 8.91459437423563e-05, + "loss": 3.9, + "step": 12790 + }, + { + "epoch": 0.8693436608234815, + "grad_norm": 0.1521267145872116, + "learning_rate": 8.914169724147303e-05, + "loss": 3.9639, + "step": 12795 + }, + { + "epoch": 0.8696833808941432, + "grad_norm": 1.8004759550094604, + "learning_rate": 8.913745074058976e-05, + "loss": 4.1324, + "step": 12800 + }, + { + "epoch": 0.870023100964805, + "grad_norm": 0.1751425862312317, + "learning_rate": 8.913320423970649e-05, + "loss": 4.0504, + "step": 12805 + }, + { + "epoch": 0.8703628210354668, + "grad_norm": 0.21332374215126038, + "learning_rate": 8.912895773882322e-05, + "loss": 4.095, + "step": 12810 + }, + { + "epoch": 0.8707025411061285, + "grad_norm": 0.2132454216480255, + "learning_rate": 8.912471123793994e-05, + "loss": 3.9654, + "step": 12815 + }, + { + "epoch": 0.8710422611767903, + "grad_norm": 0.1583162248134613, + "learning_rate": 8.912046473705667e-05, + "loss": 3.9617, + "step": 12820 + }, + { + "epoch": 0.8713819812474521, + "grad_norm": 0.19870373606681824, + "learning_rate": 8.91162182361734e-05, + "loss": 4.2326, + "step": 12825 + }, + { + "epoch": 0.8717217013181139, + "grad_norm": 0.18757444620132446, + "learning_rate": 8.911197173529013e-05, + "loss": 3.9562, + "step": 12830 + }, + { + "epoch": 0.8720614213887756, + "grad_norm": 0.15151342749595642, + "learning_rate": 8.910772523440686e-05, + "loss": 3.8253, + "step": 12835 + }, + { + "epoch": 0.8724011414594374, + "grad_norm": 0.23465317487716675, + "learning_rate": 8.910347873352358e-05, + "loss": 3.8039, + "step": 12840 + }, + { + "epoch": 0.8727408615300992, + "grad_norm": 0.3286290168762207, + "learning_rate": 8.909923223264031e-05, + "loss": 3.9344, + "step": 12845 + }, + { + "epoch": 0.873080581600761, + "grad_norm": 0.3796158730983734, + "learning_rate": 8.909498573175704e-05, + "loss": 4.2968, + "step": 12850 + }, + { + "epoch": 0.8734203016714227, + "grad_norm": 0.17161943018436432, + "learning_rate": 8.909073923087377e-05, + "loss": 4.0475, + "step": 12855 + }, + { + "epoch": 0.8737600217420846, + "grad_norm": 0.18927636742591858, + "learning_rate": 8.908649272999048e-05, + "loss": 3.9, + "step": 12860 + }, + { + "epoch": 0.8740997418127463, + "grad_norm": 0.17746715247631073, + "learning_rate": 8.908224622910722e-05, + "loss": 4.0236, + "step": 12865 + }, + { + "epoch": 0.874439461883408, + "grad_norm": 0.2432946115732193, + "learning_rate": 8.907799972822395e-05, + "loss": 3.595, + "step": 12870 + }, + { + "epoch": 0.8747791819540699, + "grad_norm": 0.18916042149066925, + "learning_rate": 8.907375322734067e-05, + "loss": 3.9668, + "step": 12875 + }, + { + "epoch": 0.8751189020247316, + "grad_norm": 0.18429867923259735, + "learning_rate": 8.906950672645741e-05, + "loss": 3.8215, + "step": 12880 + }, + { + "epoch": 0.8754586220953934, + "grad_norm": 0.30447253584861755, + "learning_rate": 8.906526022557414e-05, + "loss": 3.94, + "step": 12885 + }, + { + "epoch": 0.8757983421660551, + "grad_norm": 0.1514458954334259, + "learning_rate": 8.906101372469085e-05, + "loss": 3.8559, + "step": 12890 + }, + { + "epoch": 0.876138062236717, + "grad_norm": 0.1644824743270874, + "learning_rate": 8.905676722380759e-05, + "loss": 4.136, + "step": 12895 + }, + { + "epoch": 0.8764777823073787, + "grad_norm": 0.1758906990289688, + "learning_rate": 8.905252072292432e-05, + "loss": 4.2008, + "step": 12900 + }, + { + "epoch": 0.8768175023780405, + "grad_norm": 0.19972476363182068, + "learning_rate": 8.904827422204104e-05, + "loss": 3.9077, + "step": 12905 + }, + { + "epoch": 0.8771572224487023, + "grad_norm": 0.21778126060962677, + "learning_rate": 8.904402772115778e-05, + "loss": 3.9808, + "step": 12910 + }, + { + "epoch": 0.877496942519364, + "grad_norm": 0.19557218253612518, + "learning_rate": 8.90397812202745e-05, + "loss": 3.7706, + "step": 12915 + }, + { + "epoch": 0.8778366625900258, + "grad_norm": 0.1903916746377945, + "learning_rate": 8.903553471939122e-05, + "loss": 3.8328, + "step": 12920 + }, + { + "epoch": 0.8781763826606876, + "grad_norm": 0.21190743148326874, + "learning_rate": 8.903128821850796e-05, + "loss": 3.8016, + "step": 12925 + }, + { + "epoch": 0.8785161027313494, + "grad_norm": 0.22646445035934448, + "learning_rate": 8.902704171762468e-05, + "loss": 3.9702, + "step": 12930 + }, + { + "epoch": 0.8788558228020111, + "grad_norm": 0.2211994081735611, + "learning_rate": 8.90227952167414e-05, + "loss": 4.0118, + "step": 12935 + }, + { + "epoch": 0.8791955428726729, + "grad_norm": 0.23225241899490356, + "learning_rate": 8.901854871585814e-05, + "loss": 4.1305, + "step": 12940 + }, + { + "epoch": 0.8795352629433347, + "grad_norm": 0.20378831028938293, + "learning_rate": 8.901430221497486e-05, + "loss": 3.9627, + "step": 12945 + }, + { + "epoch": 0.8798749830139965, + "grad_norm": 0.15453274548053741, + "learning_rate": 8.901005571409159e-05, + "loss": 3.8527, + "step": 12950 + }, + { + "epoch": 0.8802147030846582, + "grad_norm": 0.19792801141738892, + "learning_rate": 8.900580921320833e-05, + "loss": 4.0273, + "step": 12955 + }, + { + "epoch": 0.8805544231553201, + "grad_norm": 0.17211174964904785, + "learning_rate": 8.900156271232504e-05, + "loss": 3.9116, + "step": 12960 + }, + { + "epoch": 0.8808941432259818, + "grad_norm": 0.5672011375427246, + "learning_rate": 8.899731621144177e-05, + "loss": 3.9642, + "step": 12965 + }, + { + "epoch": 0.8812338632966435, + "grad_norm": 0.18356992304325104, + "learning_rate": 8.899306971055851e-05, + "loss": 3.9994, + "step": 12970 + }, + { + "epoch": 0.8815735833673053, + "grad_norm": 0.18002454936504364, + "learning_rate": 8.898882320967523e-05, + "loss": 3.9536, + "step": 12975 + }, + { + "epoch": 0.8819133034379671, + "grad_norm": 0.18387572467327118, + "learning_rate": 8.898457670879196e-05, + "loss": 4.0552, + "step": 12980 + }, + { + "epoch": 0.8822530235086289, + "grad_norm": 0.27319806814193726, + "learning_rate": 8.89803302079087e-05, + "loss": 3.9619, + "step": 12985 + }, + { + "epoch": 0.8825927435792906, + "grad_norm": 0.2080732136964798, + "learning_rate": 8.897608370702541e-05, + "loss": 4.1625, + "step": 12990 + }, + { + "epoch": 0.8829324636499525, + "grad_norm": 0.17268094420433044, + "learning_rate": 8.897183720614214e-05, + "loss": 3.9614, + "step": 12995 + }, + { + "epoch": 0.8832721837206142, + "grad_norm": 0.15917377173900604, + "learning_rate": 8.896759070525888e-05, + "loss": 3.9529, + "step": 13000 + }, + { + "epoch": 0.883611903791276, + "grad_norm": 0.22826112806797028, + "learning_rate": 8.89633442043756e-05, + "loss": 4.0541, + "step": 13005 + }, + { + "epoch": 0.8839516238619378, + "grad_norm": 0.2643020749092102, + "learning_rate": 8.895909770349232e-05, + "loss": 3.8896, + "step": 13010 + }, + { + "epoch": 0.8842913439325996, + "grad_norm": 0.19719457626342773, + "learning_rate": 8.895485120260905e-05, + "loss": 3.8283, + "step": 13015 + }, + { + "epoch": 0.8846310640032613, + "grad_norm": 0.18295039236545563, + "learning_rate": 8.895060470172578e-05, + "loss": 3.8955, + "step": 13020 + }, + { + "epoch": 0.884970784073923, + "grad_norm": 0.23282389342784882, + "learning_rate": 8.894635820084251e-05, + "loss": 3.9094, + "step": 13025 + }, + { + "epoch": 0.8853105041445849, + "grad_norm": 0.1835237592458725, + "learning_rate": 8.894211169995924e-05, + "loss": 3.9402, + "step": 13030 + }, + { + "epoch": 0.8856502242152466, + "grad_norm": 0.2141278088092804, + "learning_rate": 8.893786519907596e-05, + "loss": 4.0951, + "step": 13035 + }, + { + "epoch": 0.8859899442859084, + "grad_norm": 0.16725867986679077, + "learning_rate": 8.893361869819269e-05, + "loss": 3.8523, + "step": 13040 + }, + { + "epoch": 0.8863296643565702, + "grad_norm": 0.15616454184055328, + "learning_rate": 8.892937219730942e-05, + "loss": 3.9671, + "step": 13045 + }, + { + "epoch": 0.886669384427232, + "grad_norm": 0.17522963881492615, + "learning_rate": 8.892512569642615e-05, + "loss": 4.0624, + "step": 13050 + }, + { + "epoch": 0.8870091044978937, + "grad_norm": 0.19032767415046692, + "learning_rate": 8.892087919554288e-05, + "loss": 3.8977, + "step": 13055 + }, + { + "epoch": 0.8873488245685555, + "grad_norm": 0.4839637279510498, + "learning_rate": 8.89166326946596e-05, + "loss": 3.9498, + "step": 13060 + }, + { + "epoch": 0.8876885446392173, + "grad_norm": 0.15772879123687744, + "learning_rate": 8.891238619377633e-05, + "loss": 3.9909, + "step": 13065 + }, + { + "epoch": 0.8880282647098791, + "grad_norm": 0.23280905187129974, + "learning_rate": 8.890813969289306e-05, + "loss": 3.9899, + "step": 13070 + }, + { + "epoch": 0.8883679847805408, + "grad_norm": 0.21939218044281006, + "learning_rate": 8.890389319200979e-05, + "loss": 4.0157, + "step": 13075 + }, + { + "epoch": 0.8887077048512027, + "grad_norm": 0.19345150887966156, + "learning_rate": 8.889964669112652e-05, + "loss": 3.8884, + "step": 13080 + }, + { + "epoch": 0.8890474249218644, + "grad_norm": 0.18044187128543854, + "learning_rate": 8.889540019024324e-05, + "loss": 3.9464, + "step": 13085 + }, + { + "epoch": 0.8893871449925261, + "grad_norm": 0.7762069702148438, + "learning_rate": 8.889115368935997e-05, + "loss": 4.0273, + "step": 13090 + }, + { + "epoch": 0.889726865063188, + "grad_norm": 0.20319564640522003, + "learning_rate": 8.88869071884767e-05, + "loss": 3.943, + "step": 13095 + }, + { + "epoch": 0.8900665851338497, + "grad_norm": 0.4176552891731262, + "learning_rate": 8.888266068759343e-05, + "loss": 3.9463, + "step": 13100 + }, + { + "epoch": 0.8904063052045115, + "grad_norm": 0.6137194633483887, + "learning_rate": 8.887841418671016e-05, + "loss": 3.9732, + "step": 13105 + }, + { + "epoch": 0.8907460252751732, + "grad_norm": 0.3522084355354309, + "learning_rate": 8.887416768582688e-05, + "loss": 4.083, + "step": 13110 + }, + { + "epoch": 0.8910857453458351, + "grad_norm": 0.1649700552225113, + "learning_rate": 8.886992118494361e-05, + "loss": 3.8638, + "step": 13115 + }, + { + "epoch": 0.8914254654164968, + "grad_norm": 0.1624513566493988, + "learning_rate": 8.886567468406034e-05, + "loss": 4.1418, + "step": 13120 + }, + { + "epoch": 0.8917651854871586, + "grad_norm": 0.18356308341026306, + "learning_rate": 8.886142818317707e-05, + "loss": 3.8366, + "step": 13125 + }, + { + "epoch": 0.8921049055578204, + "grad_norm": 0.22682762145996094, + "learning_rate": 8.88571816822938e-05, + "loss": 3.8805, + "step": 13130 + }, + { + "epoch": 0.8924446256284821, + "grad_norm": 0.3890301287174225, + "learning_rate": 8.885293518141052e-05, + "loss": 3.8615, + "step": 13135 + }, + { + "epoch": 0.8927843456991439, + "grad_norm": 0.1815677434206009, + "learning_rate": 8.884868868052725e-05, + "loss": 3.9861, + "step": 13140 + }, + { + "epoch": 0.8931240657698056, + "grad_norm": 0.5793675780296326, + "learning_rate": 8.884444217964398e-05, + "loss": 4.0116, + "step": 13145 + }, + { + "epoch": 0.8934637858404675, + "grad_norm": 0.30733972787857056, + "learning_rate": 8.884019567876071e-05, + "loss": 3.8564, + "step": 13150 + }, + { + "epoch": 0.8938035059111292, + "grad_norm": 0.15819989144802094, + "learning_rate": 8.883594917787744e-05, + "loss": 4.2131, + "step": 13155 + }, + { + "epoch": 0.894143225981791, + "grad_norm": 0.1839209347963333, + "learning_rate": 8.883170267699416e-05, + "loss": 3.8485, + "step": 13160 + }, + { + "epoch": 0.8944829460524528, + "grad_norm": 0.2315215915441513, + "learning_rate": 8.882745617611089e-05, + "loss": 3.7713, + "step": 13165 + }, + { + "epoch": 0.8948226661231146, + "grad_norm": 0.18673592805862427, + "learning_rate": 8.882320967522762e-05, + "loss": 3.7431, + "step": 13170 + }, + { + "epoch": 0.8951623861937763, + "grad_norm": 0.1436724215745926, + "learning_rate": 8.881896317434435e-05, + "loss": 4.0778, + "step": 13175 + }, + { + "epoch": 0.8955021062644382, + "grad_norm": 0.23739652335643768, + "learning_rate": 8.881471667346108e-05, + "loss": 3.925, + "step": 13180 + }, + { + "epoch": 0.8958418263350999, + "grad_norm": 0.17830795049667358, + "learning_rate": 8.88104701725778e-05, + "loss": 4.0997, + "step": 13185 + }, + { + "epoch": 0.8961815464057616, + "grad_norm": 0.20703446865081787, + "learning_rate": 8.880622367169453e-05, + "loss": 3.8833, + "step": 13190 + }, + { + "epoch": 0.8965212664764234, + "grad_norm": 0.18058006465435028, + "learning_rate": 8.880197717081126e-05, + "loss": 4.1206, + "step": 13195 + }, + { + "epoch": 0.8968609865470852, + "grad_norm": 0.16602760553359985, + "learning_rate": 8.879773066992799e-05, + "loss": 4.0043, + "step": 13200 + }, + { + "epoch": 0.897200706617747, + "grad_norm": 0.2073521465063095, + "learning_rate": 8.879348416904472e-05, + "loss": 3.8992, + "step": 13205 + }, + { + "epoch": 0.8975404266884087, + "grad_norm": 0.1674661785364151, + "learning_rate": 8.878923766816144e-05, + "loss": 3.9651, + "step": 13210 + }, + { + "epoch": 0.8978801467590706, + "grad_norm": 0.19709919393062592, + "learning_rate": 8.878499116727816e-05, + "loss": 3.8494, + "step": 13215 + }, + { + "epoch": 0.8982198668297323, + "grad_norm": 0.18262673914432526, + "learning_rate": 8.87807446663949e-05, + "loss": 3.9929, + "step": 13220 + }, + { + "epoch": 0.8985595869003941, + "grad_norm": 0.1922248899936676, + "learning_rate": 8.877649816551163e-05, + "loss": 4.1698, + "step": 13225 + }, + { + "epoch": 0.8988993069710558, + "grad_norm": 0.22797107696533203, + "learning_rate": 8.877225166462834e-05, + "loss": 4.1415, + "step": 13230 + }, + { + "epoch": 0.8992390270417177, + "grad_norm": 0.21377937495708466, + "learning_rate": 8.876800516374508e-05, + "loss": 3.7365, + "step": 13235 + }, + { + "epoch": 0.8995787471123794, + "grad_norm": 0.15349645912647247, + "learning_rate": 8.876375866286181e-05, + "loss": 4.21, + "step": 13240 + }, + { + "epoch": 0.8999184671830411, + "grad_norm": 0.17594188451766968, + "learning_rate": 8.875951216197853e-05, + "loss": 3.6277, + "step": 13245 + }, + { + "epoch": 0.900258187253703, + "grad_norm": 0.21915188431739807, + "learning_rate": 8.875526566109527e-05, + "loss": 3.8465, + "step": 13250 + }, + { + "epoch": 0.9005979073243647, + "grad_norm": 0.1609984189271927, + "learning_rate": 8.8751019160212e-05, + "loss": 3.9646, + "step": 13255 + }, + { + "epoch": 0.9009376273950265, + "grad_norm": 0.22858203947544098, + "learning_rate": 8.874677265932871e-05, + "loss": 3.9217, + "step": 13260 + }, + { + "epoch": 0.9012773474656883, + "grad_norm": 0.1865098774433136, + "learning_rate": 8.874252615844545e-05, + "loss": 4.0665, + "step": 13265 + }, + { + "epoch": 0.9016170675363501, + "grad_norm": 0.27133429050445557, + "learning_rate": 8.873827965756218e-05, + "loss": 4.1901, + "step": 13270 + }, + { + "epoch": 0.9019567876070118, + "grad_norm": 0.26253220438957214, + "learning_rate": 8.87340331566789e-05, + "loss": 4.0598, + "step": 13275 + }, + { + "epoch": 0.9022965076776736, + "grad_norm": 0.19666416943073273, + "learning_rate": 8.872978665579564e-05, + "loss": 4.1986, + "step": 13280 + }, + { + "epoch": 0.9026362277483354, + "grad_norm": 0.16205628216266632, + "learning_rate": 8.872554015491235e-05, + "loss": 3.9244, + "step": 13285 + }, + { + "epoch": 0.9029759478189971, + "grad_norm": 0.21423132717609406, + "learning_rate": 8.872129365402908e-05, + "loss": 3.9477, + "step": 13290 + }, + { + "epoch": 0.9033156678896589, + "grad_norm": 0.18254421651363373, + "learning_rate": 8.871704715314582e-05, + "loss": 4.0589, + "step": 13295 + }, + { + "epoch": 0.9036553879603207, + "grad_norm": 0.17389804124832153, + "learning_rate": 8.871280065226253e-05, + "loss": 4.0397, + "step": 13300 + }, + { + "epoch": 0.9039951080309825, + "grad_norm": 0.23967847228050232, + "learning_rate": 8.870855415137926e-05, + "loss": 3.9713, + "step": 13305 + }, + { + "epoch": 0.9043348281016442, + "grad_norm": 0.15660813450813293, + "learning_rate": 8.8704307650496e-05, + "loss": 4.155, + "step": 13310 + }, + { + "epoch": 0.904674548172306, + "grad_norm": 0.1497335135936737, + "learning_rate": 8.870006114961272e-05, + "loss": 4.2737, + "step": 13315 + }, + { + "epoch": 0.9050142682429678, + "grad_norm": 0.16438940167427063, + "learning_rate": 8.869581464872945e-05, + "loss": 3.7956, + "step": 13320 + }, + { + "epoch": 0.9053539883136296, + "grad_norm": 0.19728811085224152, + "learning_rate": 8.869156814784619e-05, + "loss": 3.8597, + "step": 13325 + }, + { + "epoch": 0.9056937083842913, + "grad_norm": 0.19347457587718964, + "learning_rate": 8.86873216469629e-05, + "loss": 3.9375, + "step": 13330 + }, + { + "epoch": 0.9060334284549532, + "grad_norm": 0.1782982498407364, + "learning_rate": 8.868307514607963e-05, + "loss": 4.1215, + "step": 13335 + }, + { + "epoch": 0.9063731485256149, + "grad_norm": 0.16826027631759644, + "learning_rate": 8.867882864519637e-05, + "loss": 3.9993, + "step": 13340 + }, + { + "epoch": 0.9067128685962766, + "grad_norm": 0.1599569171667099, + "learning_rate": 8.867458214431309e-05, + "loss": 3.8991, + "step": 13345 + }, + { + "epoch": 0.9070525886669385, + "grad_norm": 2.387441396713257, + "learning_rate": 8.867033564342981e-05, + "loss": 4.0009, + "step": 13350 + }, + { + "epoch": 0.9073923087376002, + "grad_norm": 0.16353747248649597, + "learning_rate": 8.866608914254654e-05, + "loss": 3.8368, + "step": 13355 + }, + { + "epoch": 0.907732028808262, + "grad_norm": 0.18139252066612244, + "learning_rate": 8.866184264166327e-05, + "loss": 3.8567, + "step": 13360 + }, + { + "epoch": 0.9080717488789237, + "grad_norm": 0.28488507866859436, + "learning_rate": 8.865759614078e-05, + "loss": 3.9918, + "step": 13365 + }, + { + "epoch": 0.9084114689495856, + "grad_norm": 0.193324476480484, + "learning_rate": 8.865334963989673e-05, + "loss": 4.0164, + "step": 13370 + }, + { + "epoch": 0.9087511890202473, + "grad_norm": 0.18897615373134613, + "learning_rate": 8.864910313901345e-05, + "loss": 3.6622, + "step": 13375 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.18190960586071014, + "learning_rate": 8.864485663813018e-05, + "loss": 4.0802, + "step": 13380 + }, + { + "epoch": 0.9094306291615709, + "grad_norm": 0.3872554302215576, + "learning_rate": 8.864061013724691e-05, + "loss": 4.1349, + "step": 13385 + }, + { + "epoch": 0.9097703492322327, + "grad_norm": 0.18860095739364624, + "learning_rate": 8.863636363636364e-05, + "loss": 4.1308, + "step": 13390 + }, + { + "epoch": 0.9101100693028944, + "grad_norm": 0.1607290804386139, + "learning_rate": 8.863211713548037e-05, + "loss": 3.9187, + "step": 13395 + }, + { + "epoch": 0.9104497893735561, + "grad_norm": 0.24063172936439514, + "learning_rate": 8.86278706345971e-05, + "loss": 3.7071, + "step": 13400 + }, + { + "epoch": 0.910789509444218, + "grad_norm": 0.2261088341474533, + "learning_rate": 8.862362413371382e-05, + "loss": 4.3313, + "step": 13405 + }, + { + "epoch": 0.9111292295148797, + "grad_norm": 0.2024299055337906, + "learning_rate": 8.861937763283055e-05, + "loss": 3.7482, + "step": 13410 + }, + { + "epoch": 0.9114689495855415, + "grad_norm": 0.32624539732933044, + "learning_rate": 8.861513113194728e-05, + "loss": 3.9679, + "step": 13415 + }, + { + "epoch": 0.9118086696562033, + "grad_norm": 0.2082969844341278, + "learning_rate": 8.8610884631064e-05, + "loss": 4.1913, + "step": 13420 + }, + { + "epoch": 0.9121483897268651, + "grad_norm": 0.20776890218257904, + "learning_rate": 8.860663813018073e-05, + "loss": 4.0744, + "step": 13425 + }, + { + "epoch": 0.9124881097975268, + "grad_norm": 0.16512270271778107, + "learning_rate": 8.860239162929746e-05, + "loss": 4.0043, + "step": 13430 + }, + { + "epoch": 0.9128278298681887, + "grad_norm": 1.4505500793457031, + "learning_rate": 8.859814512841419e-05, + "loss": 3.8782, + "step": 13435 + }, + { + "epoch": 0.9131675499388504, + "grad_norm": 0.25911930203437805, + "learning_rate": 8.859389862753092e-05, + "loss": 3.6646, + "step": 13440 + }, + { + "epoch": 0.9135072700095122, + "grad_norm": 0.23000332713127136, + "learning_rate": 8.858965212664765e-05, + "loss": 4.0076, + "step": 13445 + }, + { + "epoch": 0.9138469900801739, + "grad_norm": 0.19737814366817474, + "learning_rate": 8.858540562576437e-05, + "loss": 3.8124, + "step": 13450 + }, + { + "epoch": 0.9141867101508357, + "grad_norm": 0.1637062281370163, + "learning_rate": 8.85811591248811e-05, + "loss": 3.8238, + "step": 13455 + }, + { + "epoch": 0.9145264302214975, + "grad_norm": 0.24078230559825897, + "learning_rate": 8.857691262399783e-05, + "loss": 4.2022, + "step": 13460 + }, + { + "epoch": 0.9148661502921592, + "grad_norm": 0.17801740765571594, + "learning_rate": 8.857266612311456e-05, + "loss": 4.11, + "step": 13465 + }, + { + "epoch": 0.9152058703628211, + "grad_norm": 0.18943698704242706, + "learning_rate": 8.856841962223129e-05, + "loss": 4.0061, + "step": 13470 + }, + { + "epoch": 0.9155455904334828, + "grad_norm": 0.1784680187702179, + "learning_rate": 8.856417312134801e-05, + "loss": 3.9399, + "step": 13475 + }, + { + "epoch": 0.9158853105041446, + "grad_norm": 0.19242218136787415, + "learning_rate": 8.855992662046474e-05, + "loss": 3.7147, + "step": 13480 + }, + { + "epoch": 0.9162250305748063, + "grad_norm": 0.1983332335948944, + "learning_rate": 8.855568011958147e-05, + "loss": 3.7828, + "step": 13485 + }, + { + "epoch": 0.9165647506454682, + "grad_norm": 0.1877221167087555, + "learning_rate": 8.85514336186982e-05, + "loss": 3.9979, + "step": 13490 + }, + { + "epoch": 0.9169044707161299, + "grad_norm": 0.287514328956604, + "learning_rate": 8.854718711781493e-05, + "loss": 4.0574, + "step": 13495 + }, + { + "epoch": 0.9172441907867916, + "grad_norm": 0.1971914917230606, + "learning_rate": 8.854294061693165e-05, + "loss": 3.8822, + "step": 13500 + }, + { + "epoch": 0.9175839108574535, + "grad_norm": 0.19974425435066223, + "learning_rate": 8.853869411604838e-05, + "loss": 4.0866, + "step": 13505 + }, + { + "epoch": 0.9179236309281152, + "grad_norm": 0.1471916139125824, + "learning_rate": 8.853444761516511e-05, + "loss": 4.2775, + "step": 13510 + }, + { + "epoch": 0.918263350998777, + "grad_norm": 0.2695204019546509, + "learning_rate": 8.853020111428184e-05, + "loss": 3.5593, + "step": 13515 + }, + { + "epoch": 0.9186030710694388, + "grad_norm": 0.3328780233860016, + "learning_rate": 8.852595461339857e-05, + "loss": 4.0474, + "step": 13520 + }, + { + "epoch": 0.9189427911401006, + "grad_norm": 0.22376228868961334, + "learning_rate": 8.85217081125153e-05, + "loss": 3.8726, + "step": 13525 + }, + { + "epoch": 0.9192825112107623, + "grad_norm": 1.0047399997711182, + "learning_rate": 8.851746161163202e-05, + "loss": 3.6569, + "step": 13530 + }, + { + "epoch": 0.9196222312814241, + "grad_norm": 0.20537297427654266, + "learning_rate": 8.851321511074875e-05, + "loss": 4.0576, + "step": 13535 + }, + { + "epoch": 0.9199619513520859, + "grad_norm": 0.19384372234344482, + "learning_rate": 8.850896860986548e-05, + "loss": 4.1421, + "step": 13540 + }, + { + "epoch": 0.9203016714227477, + "grad_norm": 0.24288725852966309, + "learning_rate": 8.85047221089822e-05, + "loss": 3.9712, + "step": 13545 + }, + { + "epoch": 0.9206413914934094, + "grad_norm": 0.17010895907878876, + "learning_rate": 8.850047560809893e-05, + "loss": 4.1179, + "step": 13550 + }, + { + "epoch": 0.9209811115640713, + "grad_norm": 0.2580501139163971, + "learning_rate": 8.849622910721565e-05, + "loss": 3.963, + "step": 13555 + }, + { + "epoch": 0.921320831634733, + "grad_norm": 0.19051308929920197, + "learning_rate": 8.849198260633239e-05, + "loss": 3.9626, + "step": 13560 + }, + { + "epoch": 0.9216605517053947, + "grad_norm": 0.18489578366279602, + "learning_rate": 8.848773610544912e-05, + "loss": 4.0235, + "step": 13565 + }, + { + "epoch": 0.9220002717760565, + "grad_norm": 0.1751706898212433, + "learning_rate": 8.848348960456583e-05, + "loss": 4.03, + "step": 13570 + }, + { + "epoch": 0.9223399918467183, + "grad_norm": 0.15869379043579102, + "learning_rate": 8.847924310368257e-05, + "loss": 3.9887, + "step": 13575 + }, + { + "epoch": 0.9226797119173801, + "grad_norm": 0.18910722434520721, + "learning_rate": 8.84749966027993e-05, + "loss": 3.9694, + "step": 13580 + }, + { + "epoch": 0.9230194319880418, + "grad_norm": 0.18458014726638794, + "learning_rate": 8.847075010191602e-05, + "loss": 3.7103, + "step": 13585 + }, + { + "epoch": 0.9233591520587037, + "grad_norm": 0.15150383114814758, + "learning_rate": 8.846650360103276e-05, + "loss": 3.9727, + "step": 13590 + }, + { + "epoch": 0.9236988721293654, + "grad_norm": 0.1747668832540512, + "learning_rate": 8.846225710014949e-05, + "loss": 4.0081, + "step": 13595 + }, + { + "epoch": 0.9240385922000272, + "grad_norm": 0.1757059097290039, + "learning_rate": 8.84580105992662e-05, + "loss": 3.8936, + "step": 13600 + }, + { + "epoch": 0.924378312270689, + "grad_norm": 0.17729350924491882, + "learning_rate": 8.845376409838294e-05, + "loss": 4.1143, + "step": 13605 + }, + { + "epoch": 0.9247180323413507, + "grad_norm": 0.18305832147598267, + "learning_rate": 8.844951759749967e-05, + "loss": 4.1294, + "step": 13610 + }, + { + "epoch": 0.9250577524120125, + "grad_norm": 1.0373708009719849, + "learning_rate": 8.844527109661639e-05, + "loss": 4.0552, + "step": 13615 + }, + { + "epoch": 0.9253974724826742, + "grad_norm": 0.18093042075634003, + "learning_rate": 8.844102459573313e-05, + "loss": 4.0405, + "step": 13620 + }, + { + "epoch": 0.9257371925533361, + "grad_norm": 0.1686364710330963, + "learning_rate": 8.843677809484985e-05, + "loss": 4.0729, + "step": 13625 + }, + { + "epoch": 0.9260769126239978, + "grad_norm": 0.12190663069486618, + "learning_rate": 8.843253159396657e-05, + "loss": 3.958, + "step": 13630 + }, + { + "epoch": 0.9264166326946596, + "grad_norm": 0.18013213574886322, + "learning_rate": 8.842828509308331e-05, + "loss": 4.0457, + "step": 13635 + }, + { + "epoch": 0.9267563527653214, + "grad_norm": 0.47123172879219055, + "learning_rate": 8.842403859220003e-05, + "loss": 3.9378, + "step": 13640 + }, + { + "epoch": 0.9270960728359832, + "grad_norm": 0.17332005500793457, + "learning_rate": 8.841979209131675e-05, + "loss": 3.9994, + "step": 13645 + }, + { + "epoch": 0.9274357929066449, + "grad_norm": 0.2330494374036789, + "learning_rate": 8.84155455904335e-05, + "loss": 4.2059, + "step": 13650 + }, + { + "epoch": 0.9277755129773066, + "grad_norm": 0.1844603419303894, + "learning_rate": 8.841129908955021e-05, + "loss": 3.9922, + "step": 13655 + }, + { + "epoch": 0.9281152330479685, + "grad_norm": 0.19840815663337708, + "learning_rate": 8.840705258866694e-05, + "loss": 4.0601, + "step": 13660 + }, + { + "epoch": 0.9284549531186302, + "grad_norm": 0.25802430510520935, + "learning_rate": 8.840280608778368e-05, + "loss": 3.8963, + "step": 13665 + }, + { + "epoch": 0.928794673189292, + "grad_norm": 0.1573478877544403, + "learning_rate": 8.83985595869004e-05, + "loss": 4.1013, + "step": 13670 + }, + { + "epoch": 0.9291343932599538, + "grad_norm": 0.20257075130939484, + "learning_rate": 8.839431308601712e-05, + "loss": 4.0064, + "step": 13675 + }, + { + "epoch": 0.9294741133306156, + "grad_norm": 0.2046387791633606, + "learning_rate": 8.839006658513386e-05, + "loss": 3.948, + "step": 13680 + }, + { + "epoch": 0.9298138334012773, + "grad_norm": 0.29030993580818176, + "learning_rate": 8.838582008425058e-05, + "loss": 3.7141, + "step": 13685 + }, + { + "epoch": 0.9301535534719392, + "grad_norm": 0.1701250672340393, + "learning_rate": 8.83815735833673e-05, + "loss": 3.8852, + "step": 13690 + }, + { + "epoch": 0.9304932735426009, + "grad_norm": 0.18545231223106384, + "learning_rate": 8.837732708248405e-05, + "loss": 3.8857, + "step": 13695 + }, + { + "epoch": 0.9308329936132627, + "grad_norm": 0.22790156304836273, + "learning_rate": 8.837308058160076e-05, + "loss": 3.9376, + "step": 13700 + }, + { + "epoch": 0.9311727136839244, + "grad_norm": 0.2060774713754654, + "learning_rate": 8.836883408071749e-05, + "loss": 4.0047, + "step": 13705 + }, + { + "epoch": 0.9315124337545863, + "grad_norm": 0.17100512981414795, + "learning_rate": 8.836458757983422e-05, + "loss": 4.0347, + "step": 13710 + }, + { + "epoch": 0.931852153825248, + "grad_norm": 0.18439409136772156, + "learning_rate": 8.836034107895095e-05, + "loss": 4.0597, + "step": 13715 + }, + { + "epoch": 0.9321918738959097, + "grad_norm": 0.20171356201171875, + "learning_rate": 8.835609457806767e-05, + "loss": 3.8904, + "step": 13720 + }, + { + "epoch": 0.9325315939665716, + "grad_norm": 0.2539232075214386, + "learning_rate": 8.83518480771844e-05, + "loss": 3.7127, + "step": 13725 + }, + { + "epoch": 0.9328713140372333, + "grad_norm": 0.303066611289978, + "learning_rate": 8.834760157630113e-05, + "loss": 3.858, + "step": 13730 + }, + { + "epoch": 0.9332110341078951, + "grad_norm": 10.150884628295898, + "learning_rate": 8.834335507541786e-05, + "loss": 4.214, + "step": 13735 + }, + { + "epoch": 0.9335507541785568, + "grad_norm": 0.17438891530036926, + "learning_rate": 8.833910857453459e-05, + "loss": 3.9798, + "step": 13740 + }, + { + "epoch": 0.9338904742492187, + "grad_norm": 0.1963614672422409, + "learning_rate": 8.833486207365131e-05, + "loss": 4.0225, + "step": 13745 + }, + { + "epoch": 0.9342301943198804, + "grad_norm": 0.33438101410865784, + "learning_rate": 8.833061557276804e-05, + "loss": 3.8237, + "step": 13750 + }, + { + "epoch": 0.9345699143905422, + "grad_norm": 0.15952834486961365, + "learning_rate": 8.832636907188477e-05, + "loss": 4.0374, + "step": 13755 + }, + { + "epoch": 0.934909634461204, + "grad_norm": 0.20450946688652039, + "learning_rate": 8.83221225710015e-05, + "loss": 3.8739, + "step": 13760 + }, + { + "epoch": 0.9352493545318658, + "grad_norm": 0.2039669156074524, + "learning_rate": 8.831787607011823e-05, + "loss": 3.953, + "step": 13765 + }, + { + "epoch": 0.9355890746025275, + "grad_norm": 0.15678437054157257, + "learning_rate": 8.831362956923495e-05, + "loss": 3.9398, + "step": 13770 + }, + { + "epoch": 0.9359287946731893, + "grad_norm": 0.5224528312683105, + "learning_rate": 8.830938306835168e-05, + "loss": 3.8759, + "step": 13775 + }, + { + "epoch": 0.9362685147438511, + "grad_norm": 0.2511097192764282, + "learning_rate": 8.830513656746841e-05, + "loss": 3.9613, + "step": 13780 + }, + { + "epoch": 0.9366082348145128, + "grad_norm": 0.14598341286182404, + "learning_rate": 8.830089006658514e-05, + "loss": 4.0519, + "step": 13785 + }, + { + "epoch": 0.9369479548851746, + "grad_norm": 0.18947695195674896, + "learning_rate": 8.829664356570187e-05, + "loss": 3.9679, + "step": 13790 + }, + { + "epoch": 0.9372876749558364, + "grad_norm": 0.19946783781051636, + "learning_rate": 8.82923970648186e-05, + "loss": 4.0397, + "step": 13795 + }, + { + "epoch": 0.9376273950264982, + "grad_norm": 0.1777060627937317, + "learning_rate": 8.828815056393532e-05, + "loss": 3.8039, + "step": 13800 + }, + { + "epoch": 0.9379671150971599, + "grad_norm": 0.2687058746814728, + "learning_rate": 8.828390406305205e-05, + "loss": 3.9514, + "step": 13805 + }, + { + "epoch": 0.9383068351678218, + "grad_norm": 0.17168082296848297, + "learning_rate": 8.827965756216878e-05, + "loss": 3.6143, + "step": 13810 + }, + { + "epoch": 0.9386465552384835, + "grad_norm": 0.2019256055355072, + "learning_rate": 8.82754110612855e-05, + "loss": 3.9093, + "step": 13815 + }, + { + "epoch": 0.9389862753091452, + "grad_norm": 0.15410295128822327, + "learning_rate": 8.827116456040223e-05, + "loss": 4.0334, + "step": 13820 + }, + { + "epoch": 0.939325995379807, + "grad_norm": 0.18953141570091248, + "learning_rate": 8.826691805951896e-05, + "loss": 3.5909, + "step": 13825 + }, + { + "epoch": 0.9396657154504688, + "grad_norm": 0.4388710856437683, + "learning_rate": 8.826267155863569e-05, + "loss": 4.0782, + "step": 13830 + }, + { + "epoch": 0.9400054355211306, + "grad_norm": 0.5807581543922424, + "learning_rate": 8.825842505775242e-05, + "loss": 3.7892, + "step": 13835 + }, + { + "epoch": 0.9403451555917923, + "grad_norm": 0.14411011338233948, + "learning_rate": 8.825417855686915e-05, + "loss": 3.9576, + "step": 13840 + }, + { + "epoch": 0.9406848756624542, + "grad_norm": 0.21537935733795166, + "learning_rate": 8.824993205598587e-05, + "loss": 3.8762, + "step": 13845 + }, + { + "epoch": 0.9410245957331159, + "grad_norm": 1.785944938659668, + "learning_rate": 8.82456855551026e-05, + "loss": 4.103, + "step": 13850 + }, + { + "epoch": 0.9413643158037777, + "grad_norm": 0.13651463389396667, + "learning_rate": 8.824143905421933e-05, + "loss": 4.032, + "step": 13855 + }, + { + "epoch": 0.9417040358744395, + "grad_norm": 0.16791968047618866, + "learning_rate": 8.823719255333606e-05, + "loss": 3.9523, + "step": 13860 + }, + { + "epoch": 0.9420437559451013, + "grad_norm": 0.22517231106758118, + "learning_rate": 8.823294605245279e-05, + "loss": 3.8512, + "step": 13865 + }, + { + "epoch": 0.942383476015763, + "grad_norm": 0.2402997463941574, + "learning_rate": 8.822869955156951e-05, + "loss": 3.9759, + "step": 13870 + }, + { + "epoch": 0.9427231960864247, + "grad_norm": 0.20395690202713013, + "learning_rate": 8.822445305068624e-05, + "loss": 3.9377, + "step": 13875 + }, + { + "epoch": 0.9430629161570866, + "grad_norm": 0.24165527522563934, + "learning_rate": 8.822020654980297e-05, + "loss": 3.9313, + "step": 13880 + }, + { + "epoch": 0.9434026362277483, + "grad_norm": 0.3844189941883087, + "learning_rate": 8.82159600489197e-05, + "loss": 3.7314, + "step": 13885 + }, + { + "epoch": 0.9437423562984101, + "grad_norm": 0.2007877379655838, + "learning_rate": 8.821171354803643e-05, + "loss": 4.0631, + "step": 13890 + }, + { + "epoch": 0.9440820763690719, + "grad_norm": 0.18726769089698792, + "learning_rate": 8.820746704715315e-05, + "loss": 4.055, + "step": 13895 + }, + { + "epoch": 0.9444217964397337, + "grad_norm": 0.38226640224456787, + "learning_rate": 8.820322054626988e-05, + "loss": 4.0028, + "step": 13900 + }, + { + "epoch": 0.9447615165103954, + "grad_norm": 0.1904405802488327, + "learning_rate": 8.819897404538661e-05, + "loss": 3.9834, + "step": 13905 + }, + { + "epoch": 0.9451012365810572, + "grad_norm": 0.1809813529253006, + "learning_rate": 8.819472754450332e-05, + "loss": 4.0637, + "step": 13910 + }, + { + "epoch": 0.945440956651719, + "grad_norm": 0.4116685092449188, + "learning_rate": 8.819048104362007e-05, + "loss": 3.9916, + "step": 13915 + }, + { + "epoch": 0.9457806767223808, + "grad_norm": 1.9458621740341187, + "learning_rate": 8.81862345427368e-05, + "loss": 4.0563, + "step": 13920 + }, + { + "epoch": 0.9461203967930425, + "grad_norm": 0.15543721616268158, + "learning_rate": 8.818198804185351e-05, + "loss": 4.0744, + "step": 13925 + }, + { + "epoch": 0.9464601168637043, + "grad_norm": 0.42934080958366394, + "learning_rate": 8.817774154097025e-05, + "loss": 3.8221, + "step": 13930 + }, + { + "epoch": 0.9467998369343661, + "grad_norm": 0.228485107421875, + "learning_rate": 8.817349504008698e-05, + "loss": 3.9268, + "step": 13935 + }, + { + "epoch": 0.9471395570050278, + "grad_norm": 0.21295882761478424, + "learning_rate": 8.816924853920369e-05, + "loss": 4.2737, + "step": 13940 + }, + { + "epoch": 0.9474792770756897, + "grad_norm": 0.2023978978395462, + "learning_rate": 8.816500203832043e-05, + "loss": 4.0087, + "step": 13945 + }, + { + "epoch": 0.9478189971463514, + "grad_norm": 0.1767956167459488, + "learning_rate": 8.816075553743716e-05, + "loss": 3.9129, + "step": 13950 + }, + { + "epoch": 0.9481587172170132, + "grad_norm": 1.3790664672851562, + "learning_rate": 8.815650903655388e-05, + "loss": 3.6768, + "step": 13955 + }, + { + "epoch": 0.9484984372876749, + "grad_norm": 0.583037793636322, + "learning_rate": 8.815226253567062e-05, + "loss": 3.8518, + "step": 13960 + }, + { + "epoch": 0.9488381573583368, + "grad_norm": 0.16457735002040863, + "learning_rate": 8.814801603478735e-05, + "loss": 3.7035, + "step": 13965 + }, + { + "epoch": 0.9491778774289985, + "grad_norm": 0.2307073473930359, + "learning_rate": 8.814376953390406e-05, + "loss": 4.1654, + "step": 13970 + }, + { + "epoch": 0.9495175974996602, + "grad_norm": 0.3080320358276367, + "learning_rate": 8.81395230330208e-05, + "loss": 3.975, + "step": 13975 + }, + { + "epoch": 0.9498573175703221, + "grad_norm": 0.17688848078250885, + "learning_rate": 8.813527653213752e-05, + "loss": 4.0859, + "step": 13980 + }, + { + "epoch": 0.9501970376409838, + "grad_norm": 0.24346010386943817, + "learning_rate": 8.813103003125424e-05, + "loss": 3.9282, + "step": 13985 + }, + { + "epoch": 0.9505367577116456, + "grad_norm": 0.1712096929550171, + "learning_rate": 8.812678353037099e-05, + "loss": 3.9035, + "step": 13990 + }, + { + "epoch": 0.9508764777823073, + "grad_norm": 0.19020821154117584, + "learning_rate": 8.81225370294877e-05, + "loss": 4.1025, + "step": 13995 + }, + { + "epoch": 0.9512161978529692, + "grad_norm": 0.21272438764572144, + "learning_rate": 8.811829052860443e-05, + "loss": 3.9025, + "step": 14000 + }, + { + "epoch": 0.9515559179236309, + "grad_norm": 0.14927330613136292, + "learning_rate": 8.811404402772117e-05, + "loss": 3.6392, + "step": 14005 + }, + { + "epoch": 0.9518956379942927, + "grad_norm": 0.14234548807144165, + "learning_rate": 8.810979752683788e-05, + "loss": 3.7538, + "step": 14010 + }, + { + "epoch": 0.9522353580649545, + "grad_norm": 0.19329407811164856, + "learning_rate": 8.810555102595461e-05, + "loss": 4.0138, + "step": 14015 + }, + { + "epoch": 0.9525750781356163, + "grad_norm": 1.4322816133499146, + "learning_rate": 8.810130452507135e-05, + "loss": 3.8935, + "step": 14020 + }, + { + "epoch": 0.952914798206278, + "grad_norm": 0.19049431383609772, + "learning_rate": 8.809705802418807e-05, + "loss": 4.0516, + "step": 14025 + }, + { + "epoch": 0.9532545182769399, + "grad_norm": 0.2139282375574112, + "learning_rate": 8.80928115233048e-05, + "loss": 3.8331, + "step": 14030 + }, + { + "epoch": 0.9535942383476016, + "grad_norm": 0.17371760308742523, + "learning_rate": 8.808856502242154e-05, + "loss": 3.9821, + "step": 14035 + }, + { + "epoch": 0.9539339584182633, + "grad_norm": 0.16707171499729156, + "learning_rate": 8.808431852153825e-05, + "loss": 4.1538, + "step": 14040 + }, + { + "epoch": 0.9542736784889251, + "grad_norm": 0.2354259192943573, + "learning_rate": 8.808007202065498e-05, + "loss": 3.9828, + "step": 14045 + }, + { + "epoch": 0.9546133985595869, + "grad_norm": 0.26454171538352966, + "learning_rate": 8.807582551977172e-05, + "loss": 4.1493, + "step": 14050 + }, + { + "epoch": 0.9549531186302487, + "grad_norm": 0.26322662830352783, + "learning_rate": 8.807157901888844e-05, + "loss": 4.0161, + "step": 14055 + }, + { + "epoch": 0.9552928387009104, + "grad_norm": 0.5517749786376953, + "learning_rate": 8.806733251800516e-05, + "loss": 3.8552, + "step": 14060 + }, + { + "epoch": 0.9556325587715723, + "grad_norm": 0.34178662300109863, + "learning_rate": 8.806308601712189e-05, + "loss": 3.9374, + "step": 14065 + }, + { + "epoch": 0.955972278842234, + "grad_norm": 0.4318985044956207, + "learning_rate": 8.805883951623862e-05, + "loss": 3.9312, + "step": 14070 + }, + { + "epoch": 0.9563119989128958, + "grad_norm": 0.7835567593574524, + "learning_rate": 8.805459301535535e-05, + "loss": 4.0294, + "step": 14075 + }, + { + "epoch": 0.9566517189835575, + "grad_norm": 0.248532235622406, + "learning_rate": 8.805034651447208e-05, + "loss": 4.0144, + "step": 14080 + }, + { + "epoch": 0.9569914390542194, + "grad_norm": 0.17486423254013062, + "learning_rate": 8.80461000135888e-05, + "loss": 3.8262, + "step": 14085 + }, + { + "epoch": 0.9573311591248811, + "grad_norm": 0.1894778460264206, + "learning_rate": 8.804185351270553e-05, + "loss": 3.989, + "step": 14090 + }, + { + "epoch": 0.9576708791955428, + "grad_norm": 0.1900128871202469, + "learning_rate": 8.803760701182226e-05, + "loss": 4.0652, + "step": 14095 + }, + { + "epoch": 0.9580105992662047, + "grad_norm": 0.21038229763507843, + "learning_rate": 8.803336051093899e-05, + "loss": 3.7708, + "step": 14100 + }, + { + "epoch": 0.9583503193368664, + "grad_norm": 0.22659200429916382, + "learning_rate": 8.802911401005572e-05, + "loss": 3.701, + "step": 14105 + }, + { + "epoch": 0.9586900394075282, + "grad_norm": 0.18774689733982086, + "learning_rate": 8.802486750917244e-05, + "loss": 3.8036, + "step": 14110 + }, + { + "epoch": 0.95902975947819, + "grad_norm": 0.7720076441764832, + "learning_rate": 8.802062100828917e-05, + "loss": 4.0017, + "step": 14115 + }, + { + "epoch": 0.9593694795488518, + "grad_norm": 0.17789320647716522, + "learning_rate": 8.80163745074059e-05, + "loss": 4.3506, + "step": 14120 + }, + { + "epoch": 0.9597091996195135, + "grad_norm": 0.23013809323310852, + "learning_rate": 8.801212800652263e-05, + "loss": 4.1502, + "step": 14125 + }, + { + "epoch": 0.9600489196901753, + "grad_norm": 0.20013386011123657, + "learning_rate": 8.800788150563936e-05, + "loss": 4.002, + "step": 14130 + }, + { + "epoch": 0.9603886397608371, + "grad_norm": 0.331853449344635, + "learning_rate": 8.800363500475608e-05, + "loss": 3.9042, + "step": 14135 + }, + { + "epoch": 0.9607283598314988, + "grad_norm": 0.19631457328796387, + "learning_rate": 8.799938850387281e-05, + "loss": 4.1109, + "step": 14140 + }, + { + "epoch": 0.9610680799021606, + "grad_norm": 0.38952094316482544, + "learning_rate": 8.799514200298954e-05, + "loss": 3.7741, + "step": 14145 + }, + { + "epoch": 0.9614077999728224, + "grad_norm": 0.15865278244018555, + "learning_rate": 8.799089550210627e-05, + "loss": 4.0511, + "step": 14150 + }, + { + "epoch": 0.9617475200434842, + "grad_norm": 0.20270100235939026, + "learning_rate": 8.7986649001223e-05, + "loss": 3.9072, + "step": 14155 + }, + { + "epoch": 0.9620872401141459, + "grad_norm": 0.17968709766864777, + "learning_rate": 8.798240250033972e-05, + "loss": 4.0201, + "step": 14160 + }, + { + "epoch": 0.9624269601848077, + "grad_norm": 0.19091679155826569, + "learning_rate": 8.797815599945645e-05, + "loss": 4.0135, + "step": 14165 + }, + { + "epoch": 0.9627666802554695, + "grad_norm": 0.18477921187877655, + "learning_rate": 8.797390949857318e-05, + "loss": 4.0118, + "step": 14170 + }, + { + "epoch": 0.9631064003261313, + "grad_norm": 0.1884998232126236, + "learning_rate": 8.796966299768991e-05, + "loss": 3.9669, + "step": 14175 + }, + { + "epoch": 0.963446120396793, + "grad_norm": 0.18764863908290863, + "learning_rate": 8.796541649680664e-05, + "loss": 3.9139, + "step": 14180 + }, + { + "epoch": 0.9637858404674549, + "grad_norm": 0.3251302242279053, + "learning_rate": 8.796116999592336e-05, + "loss": 4.1504, + "step": 14185 + }, + { + "epoch": 0.9641255605381166, + "grad_norm": 0.17968013882637024, + "learning_rate": 8.795692349504009e-05, + "loss": 3.8325, + "step": 14190 + }, + { + "epoch": 0.9644652806087783, + "grad_norm": 0.28362056612968445, + "learning_rate": 8.795267699415682e-05, + "loss": 3.945, + "step": 14195 + }, + { + "epoch": 0.9648050006794402, + "grad_norm": 0.18230807781219482, + "learning_rate": 8.794843049327355e-05, + "loss": 3.8641, + "step": 14200 + }, + { + "epoch": 0.9651447207501019, + "grad_norm": 0.20012174546718597, + "learning_rate": 8.794418399239028e-05, + "loss": 3.8536, + "step": 14205 + }, + { + "epoch": 0.9654844408207637, + "grad_norm": 0.16306108236312866, + "learning_rate": 8.7939937491507e-05, + "loss": 3.9824, + "step": 14210 + }, + { + "epoch": 0.9658241608914254, + "grad_norm": 0.19044755399227142, + "learning_rate": 8.793569099062373e-05, + "loss": 4.0477, + "step": 14215 + }, + { + "epoch": 0.9661638809620873, + "grad_norm": 0.9334998726844788, + "learning_rate": 8.793144448974046e-05, + "loss": 4.0839, + "step": 14220 + }, + { + "epoch": 0.966503601032749, + "grad_norm": 0.2080710530281067, + "learning_rate": 8.792719798885719e-05, + "loss": 3.935, + "step": 14225 + }, + { + "epoch": 0.9668433211034108, + "grad_norm": 0.20228326320648193, + "learning_rate": 8.792295148797392e-05, + "loss": 3.8579, + "step": 14230 + }, + { + "epoch": 0.9671830411740726, + "grad_norm": 0.14868243038654327, + "learning_rate": 8.791870498709064e-05, + "loss": 3.9681, + "step": 14235 + }, + { + "epoch": 0.9675227612447344, + "grad_norm": 0.3080695569515228, + "learning_rate": 8.791445848620737e-05, + "loss": 3.7428, + "step": 14240 + }, + { + "epoch": 0.9678624813153961, + "grad_norm": 0.3069162666797638, + "learning_rate": 8.79102119853241e-05, + "loss": 3.9151, + "step": 14245 + }, + { + "epoch": 0.9682022013860578, + "grad_norm": 0.18564817309379578, + "learning_rate": 8.790596548444083e-05, + "loss": 3.9755, + "step": 14250 + }, + { + "epoch": 0.9685419214567197, + "grad_norm": 0.26427149772644043, + "learning_rate": 8.790171898355756e-05, + "loss": 4.0877, + "step": 14255 + }, + { + "epoch": 0.9688816415273814, + "grad_norm": 0.17514817416667938, + "learning_rate": 8.789747248267428e-05, + "loss": 3.9761, + "step": 14260 + }, + { + "epoch": 0.9692213615980432, + "grad_norm": 0.17348946630954742, + "learning_rate": 8.7893225981791e-05, + "loss": 3.9038, + "step": 14265 + }, + { + "epoch": 0.969561081668705, + "grad_norm": 0.20074446499347687, + "learning_rate": 8.788897948090774e-05, + "loss": 3.9772, + "step": 14270 + }, + { + "epoch": 0.9699008017393668, + "grad_norm": 0.24490071833133698, + "learning_rate": 8.788473298002447e-05, + "loss": 4.1249, + "step": 14275 + }, + { + "epoch": 0.9702405218100285, + "grad_norm": 0.2058936506509781, + "learning_rate": 8.788048647914118e-05, + "loss": 3.8624, + "step": 14280 + }, + { + "epoch": 0.9705802418806904, + "grad_norm": 0.16198603808879852, + "learning_rate": 8.787623997825792e-05, + "loss": 3.9449, + "step": 14285 + }, + { + "epoch": 0.9709199619513521, + "grad_norm": 0.15939363837242126, + "learning_rate": 8.787199347737465e-05, + "loss": 3.9548, + "step": 14290 + }, + { + "epoch": 0.9712596820220138, + "grad_norm": 0.15633496642112732, + "learning_rate": 8.786774697649137e-05, + "loss": 3.8646, + "step": 14295 + }, + { + "epoch": 0.9715994020926756, + "grad_norm": 0.18158230185508728, + "learning_rate": 8.786350047560811e-05, + "loss": 4.0045, + "step": 14300 + }, + { + "epoch": 0.9719391221633374, + "grad_norm": 0.2111271321773529, + "learning_rate": 8.785925397472484e-05, + "loss": 4.1019, + "step": 14305 + }, + { + "epoch": 0.9722788422339992, + "grad_norm": 0.19582590460777283, + "learning_rate": 8.785500747384155e-05, + "loss": 4.0289, + "step": 14310 + }, + { + "epoch": 0.9726185623046609, + "grad_norm": 0.18569405376911163, + "learning_rate": 8.785076097295829e-05, + "loss": 3.8372, + "step": 14315 + }, + { + "epoch": 0.9729582823753228, + "grad_norm": 0.1685798466205597, + "learning_rate": 8.784651447207502e-05, + "loss": 3.9108, + "step": 14320 + }, + { + "epoch": 0.9732980024459845, + "grad_norm": 0.236485555768013, + "learning_rate": 8.784226797119174e-05, + "loss": 3.6307, + "step": 14325 + }, + { + "epoch": 0.9736377225166463, + "grad_norm": 0.17849500477313995, + "learning_rate": 8.783802147030848e-05, + "loss": 4.0166, + "step": 14330 + }, + { + "epoch": 0.973977442587308, + "grad_norm": 0.15316098928451538, + "learning_rate": 8.783377496942519e-05, + "loss": 4.0223, + "step": 14335 + }, + { + "epoch": 0.9743171626579699, + "grad_norm": 0.14150911569595337, + "learning_rate": 8.782952846854192e-05, + "loss": 3.9861, + "step": 14340 + }, + { + "epoch": 0.9746568827286316, + "grad_norm": 0.15742090344429016, + "learning_rate": 8.782528196765866e-05, + "loss": 4.0853, + "step": 14345 + }, + { + "epoch": 0.9749966027992933, + "grad_norm": 0.1783253401517868, + "learning_rate": 8.782103546677538e-05, + "loss": 3.8919, + "step": 14350 + }, + { + "epoch": 0.9753363228699552, + "grad_norm": 0.16757941246032715, + "learning_rate": 8.78167889658921e-05, + "loss": 3.5916, + "step": 14355 + }, + { + "epoch": 0.9756760429406169, + "grad_norm": 0.25573256611824036, + "learning_rate": 8.781254246500884e-05, + "loss": 3.9848, + "step": 14360 + }, + { + "epoch": 0.9760157630112787, + "grad_norm": 0.1471344381570816, + "learning_rate": 8.780829596412556e-05, + "loss": 3.9848, + "step": 14365 + }, + { + "epoch": 0.9763554830819405, + "grad_norm": 0.17102879285812378, + "learning_rate": 8.780404946324229e-05, + "loss": 3.9778, + "step": 14370 + }, + { + "epoch": 0.9766952031526023, + "grad_norm": 0.21928320825099945, + "learning_rate": 8.779980296235903e-05, + "loss": 3.7909, + "step": 14375 + }, + { + "epoch": 0.977034923223264, + "grad_norm": 0.1851445883512497, + "learning_rate": 8.779555646147574e-05, + "loss": 4.0371, + "step": 14380 + }, + { + "epoch": 0.9773746432939258, + "grad_norm": 0.15043723583221436, + "learning_rate": 8.779130996059247e-05, + "loss": 3.7731, + "step": 14385 + }, + { + "epoch": 0.9777143633645876, + "grad_norm": 0.19180312752723694, + "learning_rate": 8.778706345970921e-05, + "loss": 3.8016, + "step": 14390 + }, + { + "epoch": 0.9780540834352494, + "grad_norm": 0.18517717719078064, + "learning_rate": 8.778281695882593e-05, + "loss": 4.0766, + "step": 14395 + }, + { + "epoch": 0.9783938035059111, + "grad_norm": 0.2040787637233734, + "learning_rate": 8.777857045794266e-05, + "loss": 3.9668, + "step": 14400 + }, + { + "epoch": 0.978733523576573, + "grad_norm": 0.17522788047790527, + "learning_rate": 8.777432395705938e-05, + "loss": 3.7057, + "step": 14405 + }, + { + "epoch": 0.9790732436472347, + "grad_norm": 0.258577823638916, + "learning_rate": 8.777007745617611e-05, + "loss": 3.8998, + "step": 14410 + }, + { + "epoch": 0.9794129637178964, + "grad_norm": 0.16101132333278656, + "learning_rate": 8.776583095529284e-05, + "loss": 4.024, + "step": 14415 + }, + { + "epoch": 0.9797526837885582, + "grad_norm": 0.23143424093723297, + "learning_rate": 8.776158445440957e-05, + "loss": 3.876, + "step": 14420 + }, + { + "epoch": 0.98009240385922, + "grad_norm": 0.5347188711166382, + "learning_rate": 8.77573379535263e-05, + "loss": 3.721, + "step": 14425 + }, + { + "epoch": 0.9804321239298818, + "grad_norm": 3.5879626274108887, + "learning_rate": 8.775309145264302e-05, + "loss": 3.9406, + "step": 14430 + }, + { + "epoch": 0.9807718440005435, + "grad_norm": 0.48771944642066956, + "learning_rate": 8.774884495175975e-05, + "loss": 3.8004, + "step": 14435 + }, + { + "epoch": 0.9811115640712054, + "grad_norm": 0.1710396111011505, + "learning_rate": 8.774459845087648e-05, + "loss": 3.9633, + "step": 14440 + }, + { + "epoch": 0.9814512841418671, + "grad_norm": 0.16899172961711884, + "learning_rate": 8.774035194999321e-05, + "loss": 4.0779, + "step": 14445 + }, + { + "epoch": 0.9817910042125289, + "grad_norm": 0.18617889285087585, + "learning_rate": 8.773610544910994e-05, + "loss": 3.8466, + "step": 14450 + }, + { + "epoch": 0.9821307242831907, + "grad_norm": 0.13299688696861267, + "learning_rate": 8.773185894822666e-05, + "loss": 4.0135, + "step": 14455 + }, + { + "epoch": 0.9824704443538524, + "grad_norm": 0.20678380131721497, + "learning_rate": 8.772761244734339e-05, + "loss": 3.8453, + "step": 14460 + }, + { + "epoch": 0.9828101644245142, + "grad_norm": 0.14222410321235657, + "learning_rate": 8.772336594646012e-05, + "loss": 3.9648, + "step": 14465 + }, + { + "epoch": 0.9831498844951759, + "grad_norm": 0.21488900482654572, + "learning_rate": 8.771911944557685e-05, + "loss": 3.9613, + "step": 14470 + }, + { + "epoch": 0.9834896045658378, + "grad_norm": 0.2058933824300766, + "learning_rate": 8.771487294469358e-05, + "loss": 3.7448, + "step": 14475 + }, + { + "epoch": 0.9838293246364995, + "grad_norm": 0.25969386100769043, + "learning_rate": 8.77106264438103e-05, + "loss": 3.8523, + "step": 14480 + }, + { + "epoch": 0.9841690447071613, + "grad_norm": 0.17804904282093048, + "learning_rate": 8.770637994292703e-05, + "loss": 4.0136, + "step": 14485 + }, + { + "epoch": 0.9845087647778231, + "grad_norm": 0.17092011868953705, + "learning_rate": 8.770213344204376e-05, + "loss": 3.9277, + "step": 14490 + }, + { + "epoch": 0.9848484848484849, + "grad_norm": 0.26749852299690247, + "learning_rate": 8.769788694116049e-05, + "loss": 4.1337, + "step": 14495 + }, + { + "epoch": 0.9851882049191466, + "grad_norm": 1.6659806966781616, + "learning_rate": 8.769364044027722e-05, + "loss": 3.9058, + "step": 14500 + }, + { + "epoch": 0.9855279249898083, + "grad_norm": 0.16872169077396393, + "learning_rate": 8.768939393939394e-05, + "loss": 3.9551, + "step": 14505 + }, + { + "epoch": 0.9858676450604702, + "grad_norm": 0.25871741771698, + "learning_rate": 8.768514743851067e-05, + "loss": 3.9576, + "step": 14510 + }, + { + "epoch": 0.9862073651311319, + "grad_norm": 0.2599276006221771, + "learning_rate": 8.76809009376274e-05, + "loss": 4.0486, + "step": 14515 + }, + { + "epoch": 0.9865470852017937, + "grad_norm": 0.17734144628047943, + "learning_rate": 8.767665443674413e-05, + "loss": 3.9271, + "step": 14520 + }, + { + "epoch": 0.9868868052724555, + "grad_norm": 0.15988993644714355, + "learning_rate": 8.767240793586086e-05, + "loss": 3.7398, + "step": 14525 + }, + { + "epoch": 0.9872265253431173, + "grad_norm": 0.3864258825778961, + "learning_rate": 8.766816143497758e-05, + "loss": 4.1109, + "step": 14530 + }, + { + "epoch": 0.987566245413779, + "grad_norm": 0.1917543262243271, + "learning_rate": 8.766391493409431e-05, + "loss": 3.9459, + "step": 14535 + }, + { + "epoch": 0.9879059654844409, + "grad_norm": 0.2340976893901825, + "learning_rate": 8.765966843321104e-05, + "loss": 3.9126, + "step": 14540 + }, + { + "epoch": 0.9882456855551026, + "grad_norm": 0.1463710069656372, + "learning_rate": 8.765542193232777e-05, + "loss": 3.8255, + "step": 14545 + }, + { + "epoch": 0.9885854056257644, + "grad_norm": 0.20803597569465637, + "learning_rate": 8.76511754314445e-05, + "loss": 3.9285, + "step": 14550 + }, + { + "epoch": 0.9889251256964261, + "grad_norm": 0.23730699717998505, + "learning_rate": 8.764692893056122e-05, + "loss": 3.9045, + "step": 14555 + }, + { + "epoch": 0.989264845767088, + "grad_norm": 0.17138421535491943, + "learning_rate": 8.764268242967795e-05, + "loss": 3.8542, + "step": 14560 + }, + { + "epoch": 0.9896045658377497, + "grad_norm": 0.16893406212329865, + "learning_rate": 8.763843592879468e-05, + "loss": 4.0535, + "step": 14565 + }, + { + "epoch": 0.9899442859084114, + "grad_norm": 0.13037355244159698, + "learning_rate": 8.763418942791141e-05, + "loss": 3.7079, + "step": 14570 + }, + { + "epoch": 0.9902840059790733, + "grad_norm": 0.1940174400806427, + "learning_rate": 8.762994292702814e-05, + "loss": 3.9701, + "step": 14575 + }, + { + "epoch": 0.990623726049735, + "grad_norm": 0.19718214869499207, + "learning_rate": 8.762569642614486e-05, + "loss": 3.8213, + "step": 14580 + }, + { + "epoch": 0.9909634461203968, + "grad_norm": 0.6371154189109802, + "learning_rate": 8.762144992526159e-05, + "loss": 3.8541, + "step": 14585 + }, + { + "epoch": 0.9913031661910585, + "grad_norm": 0.15423880517482758, + "learning_rate": 8.761720342437832e-05, + "loss": 4.0511, + "step": 14590 + }, + { + "epoch": 0.9916428862617204, + "grad_norm": 0.1736377775669098, + "learning_rate": 8.761295692349505e-05, + "loss": 3.8275, + "step": 14595 + }, + { + "epoch": 0.9919826063323821, + "grad_norm": 0.19258588552474976, + "learning_rate": 8.760871042261178e-05, + "loss": 4.1035, + "step": 14600 + }, + { + "epoch": 0.9923223264030439, + "grad_norm": 0.43297508358955383, + "learning_rate": 8.760446392172849e-05, + "loss": 4.0978, + "step": 14605 + }, + { + "epoch": 0.9926620464737057, + "grad_norm": 0.20580710470676422, + "learning_rate": 8.760021742084523e-05, + "loss": 3.8437, + "step": 14610 + }, + { + "epoch": 0.9930017665443674, + "grad_norm": 1.218690037727356, + "learning_rate": 8.759597091996196e-05, + "loss": 4.0635, + "step": 14615 + }, + { + "epoch": 0.9933414866150292, + "grad_norm": 0.1623927801847458, + "learning_rate": 8.759172441907867e-05, + "loss": 3.8147, + "step": 14620 + }, + { + "epoch": 0.993681206685691, + "grad_norm": 0.17431679368019104, + "learning_rate": 8.758747791819542e-05, + "loss": 3.9406, + "step": 14625 + }, + { + "epoch": 0.9940209267563528, + "grad_norm": 0.18215017020702362, + "learning_rate": 8.758323141731214e-05, + "loss": 4.1441, + "step": 14630 + }, + { + "epoch": 0.9943606468270145, + "grad_norm": 0.20073819160461426, + "learning_rate": 8.757898491642886e-05, + "loss": 3.9045, + "step": 14635 + }, + { + "epoch": 0.9947003668976763, + "grad_norm": 0.17208696901798248, + "learning_rate": 8.75747384155456e-05, + "loss": 4.223, + "step": 14640 + }, + { + "epoch": 0.9950400869683381, + "grad_norm": 0.18610428273677826, + "learning_rate": 8.757049191466233e-05, + "loss": 3.9652, + "step": 14645 + }, + { + "epoch": 0.9953798070389999, + "grad_norm": 0.2461862713098526, + "learning_rate": 8.756624541377904e-05, + "loss": 3.8623, + "step": 14650 + }, + { + "epoch": 0.9957195271096616, + "grad_norm": 0.37638500332832336, + "learning_rate": 8.756199891289578e-05, + "loss": 3.896, + "step": 14655 + }, + { + "epoch": 0.9960592471803235, + "grad_norm": 0.2814083993434906, + "learning_rate": 8.755775241201251e-05, + "loss": 3.8634, + "step": 14660 + }, + { + "epoch": 0.9963989672509852, + "grad_norm": 0.2636764645576477, + "learning_rate": 8.755350591112923e-05, + "loss": 4.1024, + "step": 14665 + }, + { + "epoch": 0.9967386873216469, + "grad_norm": 0.1730240136384964, + "learning_rate": 8.754925941024597e-05, + "loss": 4.0616, + "step": 14670 + }, + { + "epoch": 0.9970784073923087, + "grad_norm": 0.17876474559307098, + "learning_rate": 8.75450129093627e-05, + "loss": 4.0613, + "step": 14675 + }, + { + "epoch": 0.9974181274629705, + "grad_norm": 0.1803043931722641, + "learning_rate": 8.754076640847941e-05, + "loss": 4.0982, + "step": 14680 + }, + { + "epoch": 0.9977578475336323, + "grad_norm": 0.2912727892398834, + "learning_rate": 8.753651990759615e-05, + "loss": 3.8865, + "step": 14685 + }, + { + "epoch": 0.998097567604294, + "grad_norm": 0.21436181664466858, + "learning_rate": 8.753227340671287e-05, + "loss": 4.0897, + "step": 14690 + }, + { + "epoch": 0.9984372876749559, + "grad_norm": 0.1772686094045639, + "learning_rate": 8.75280269058296e-05, + "loss": 4.1523, + "step": 14695 + }, + { + "epoch": 0.9987770077456176, + "grad_norm": 0.269540935754776, + "learning_rate": 8.752378040494634e-05, + "loss": 3.9891, + "step": 14700 + }, + { + "epoch": 0.9991167278162794, + "grad_norm": 0.20457632839679718, + "learning_rate": 8.751953390406305e-05, + "loss": 3.8832, + "step": 14705 + }, + { + "epoch": 0.9994564478869412, + "grad_norm": 0.2145458459854126, + "learning_rate": 8.751528740317978e-05, + "loss": 4.0273, + "step": 14710 + }, + { + "epoch": 0.999796167957603, + "grad_norm": 0.18700024485588074, + "learning_rate": 8.751104090229652e-05, + "loss": 4.1366, + "step": 14715 + }, + { + "epoch": 1.0, + "eval_bertscore": { + "f1": 0.8525333878305233, + "precision": 0.8753549892468697, + "recall": 0.8312223081395127 + }, + "eval_bleu_4": 0.0021924919669198163, + "eval_exact_match": 0.0, + "eval_loss": 3.751943349838257, + "eval_meteor": 0.07837825616589945, + "eval_rouge": { + "rouge1": 0.12766942289604644, + "rouge2": 0.01466477971693661, + "rougeL": 0.11110403147048781, + "rougeLsum": 0.11113999503571864 + }, + "eval_runtime": 404.0965, + "eval_samples_per_second": 25.536, + "eval_steps_per_second": 3.192, + "step": 14718 + } + ], + "logging_steps": 5, + "max_steps": 117744, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.174476031013683e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}