{ "best_global_step": 49390, "best_metric": 0.4634726643562317, "best_model_checkpoint": "saves/prefix-tuning/llama-3-8b-instruct/train_hellaswag_1754652170/checkpoint-49390", "epoch": 10.0, "eval_steps": 4490, "global_step": 89790, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005568548836173293, "grad_norm": 2.546940326690674, "learning_rate": 2.2274195344693173e-08, "loss": 13.2252, "num_input_tokens_seen": 5568, "step": 5 }, { "epoch": 0.0011137097672346587, "grad_norm": 2.4502360820770264, "learning_rate": 5.011693952555964e-08, "loss": 13.1923, "num_input_tokens_seen": 11680, "step": 10 }, { "epoch": 0.001670564650851988, "grad_norm": 2.302497148513794, "learning_rate": 7.79596837064261e-08, "loss": 13.139, "num_input_tokens_seen": 17536, "step": 15 }, { "epoch": 0.0022274195344693173, "grad_norm": 2.7239198684692383, "learning_rate": 1.0580242788729258e-07, "loss": 13.0934, "num_input_tokens_seen": 23904, "step": 20 }, { "epoch": 0.0027842744180866467, "grad_norm": 2.687640905380249, "learning_rate": 1.3364517206815904e-07, "loss": 13.0109, "num_input_tokens_seen": 29536, "step": 25 }, { "epoch": 0.003341129301703976, "grad_norm": 2.540816068649292, "learning_rate": 1.614879162490255e-07, "loss": 13.2331, "num_input_tokens_seen": 35776, "step": 30 }, { "epoch": 0.0038979841853213053, "grad_norm": 3.0277206897735596, "learning_rate": 1.8933066042989199e-07, "loss": 13.4366, "num_input_tokens_seen": 41664, "step": 35 }, { "epoch": 0.004454839068938635, "grad_norm": 2.1640467643737793, "learning_rate": 2.1717340461075842e-07, "loss": 12.8857, "num_input_tokens_seen": 48032, "step": 40 }, { "epoch": 0.005011693952555964, "grad_norm": 2.448720693588257, "learning_rate": 2.450161487916249e-07, "loss": 13.0254, "num_input_tokens_seen": 54464, "step": 45 }, { "epoch": 0.005568548836173293, "grad_norm": 2.354466199874878, "learning_rate": 2.7285889297249136e-07, "loss": 13.2791, "num_input_tokens_seen": 60928, "step": 50 }, { "epoch": 0.006125403719790623, "grad_norm": 2.3498528003692627, "learning_rate": 3.0070163715335785e-07, "loss": 13.3573, "num_input_tokens_seen": 66400, "step": 55 }, { "epoch": 0.006682258603407952, "grad_norm": 2.3828771114349365, "learning_rate": 3.285443813342243e-07, "loss": 13.0198, "num_input_tokens_seen": 72352, "step": 60 }, { "epoch": 0.007239113487025281, "grad_norm": 2.3965868949890137, "learning_rate": 3.5638712551509077e-07, "loss": 13.0145, "num_input_tokens_seen": 78560, "step": 65 }, { "epoch": 0.007795968370642611, "grad_norm": 2.5475893020629883, "learning_rate": 3.8422986969595726e-07, "loss": 13.1891, "num_input_tokens_seen": 84768, "step": 70 }, { "epoch": 0.00835282325425994, "grad_norm": 2.554625988006592, "learning_rate": 4.1207261387682374e-07, "loss": 12.8671, "num_input_tokens_seen": 91072, "step": 75 }, { "epoch": 0.00890967813787727, "grad_norm": 2.4653894901275635, "learning_rate": 4.3991535805769023e-07, "loss": 13.1407, "num_input_tokens_seen": 97152, "step": 80 }, { "epoch": 0.009466533021494599, "grad_norm": 2.3816070556640625, "learning_rate": 4.6775810223855666e-07, "loss": 13.0888, "num_input_tokens_seen": 103488, "step": 85 }, { "epoch": 0.010023387905111928, "grad_norm": 2.4425861835479736, "learning_rate": 4.956008464194231e-07, "loss": 13.2236, "num_input_tokens_seen": 109568, "step": 90 }, { "epoch": 0.010580242788729257, "grad_norm": 2.6584105491638184, "learning_rate": 5.234435906002896e-07, "loss": 12.9221, "num_input_tokens_seen": 115584, "step": 95 }, { "epoch": 0.011137097672346587, "grad_norm": 2.3979296684265137, "learning_rate": 5.512863347811561e-07, "loss": 13.2484, "num_input_tokens_seen": 121248, "step": 100 }, { "epoch": 0.011693952555963916, "grad_norm": 2.3905651569366455, "learning_rate": 5.791290789620226e-07, "loss": 13.0557, "num_input_tokens_seen": 127648, "step": 105 }, { "epoch": 0.012250807439581245, "grad_norm": 2.4730939865112305, "learning_rate": 6.06971823142889e-07, "loss": 13.1107, "num_input_tokens_seen": 133472, "step": 110 }, { "epoch": 0.012807662323198575, "grad_norm": 2.437185525894165, "learning_rate": 6.348145673237555e-07, "loss": 13.1453, "num_input_tokens_seen": 139712, "step": 115 }, { "epoch": 0.013364517206815904, "grad_norm": 2.3455934524536133, "learning_rate": 6.62657311504622e-07, "loss": 13.0126, "num_input_tokens_seen": 146048, "step": 120 }, { "epoch": 0.013921372090433233, "grad_norm": 2.300398111343384, "learning_rate": 6.905000556854884e-07, "loss": 13.1746, "num_input_tokens_seen": 151680, "step": 125 }, { "epoch": 0.014478226974050563, "grad_norm": 2.3890275955200195, "learning_rate": 7.183427998663549e-07, "loss": 13.0069, "num_input_tokens_seen": 157888, "step": 130 }, { "epoch": 0.015035081857667892, "grad_norm": 2.5610475540161133, "learning_rate": 7.461855440472214e-07, "loss": 13.0064, "num_input_tokens_seen": 163840, "step": 135 }, { "epoch": 0.015591936741285221, "grad_norm": 2.3567254543304443, "learning_rate": 7.740282882280878e-07, "loss": 13.1842, "num_input_tokens_seen": 170048, "step": 140 }, { "epoch": 0.01614879162490255, "grad_norm": 2.4759767055511475, "learning_rate": 8.018710324089542e-07, "loss": 12.9313, "num_input_tokens_seen": 176064, "step": 145 }, { "epoch": 0.01670564650851988, "grad_norm": 2.3696987628936768, "learning_rate": 8.297137765898208e-07, "loss": 12.8865, "num_input_tokens_seen": 181984, "step": 150 }, { "epoch": 0.017262501392137208, "grad_norm": 2.277526378631592, "learning_rate": 8.575565207706871e-07, "loss": 13.1177, "num_input_tokens_seen": 188256, "step": 155 }, { "epoch": 0.01781935627575454, "grad_norm": 2.836920738220215, "learning_rate": 8.853992649515537e-07, "loss": 12.9035, "num_input_tokens_seen": 194528, "step": 160 }, { "epoch": 0.018376211159371866, "grad_norm": 2.343158483505249, "learning_rate": 9.132420091324201e-07, "loss": 13.3313, "num_input_tokens_seen": 200448, "step": 165 }, { "epoch": 0.018933066042989197, "grad_norm": 2.4851772785186768, "learning_rate": 9.410847533132866e-07, "loss": 13.0117, "num_input_tokens_seen": 206560, "step": 170 }, { "epoch": 0.019489920926606525, "grad_norm": 2.7608935832977295, "learning_rate": 9.68927497494153e-07, "loss": 12.7614, "num_input_tokens_seen": 212736, "step": 175 }, { "epoch": 0.020046775810223856, "grad_norm": 2.4547955989837646, "learning_rate": 9.967702416750195e-07, "loss": 13.0898, "num_input_tokens_seen": 218880, "step": 180 }, { "epoch": 0.020603630693841184, "grad_norm": 2.550628662109375, "learning_rate": 1.024612985855886e-06, "loss": 12.9805, "num_input_tokens_seen": 224832, "step": 185 }, { "epoch": 0.021160485577458515, "grad_norm": 2.367000102996826, "learning_rate": 1.0524557300367526e-06, "loss": 12.9583, "num_input_tokens_seen": 231328, "step": 190 }, { "epoch": 0.021717340461075842, "grad_norm": 2.6558997631073, "learning_rate": 1.0802984742176189e-06, "loss": 13.0372, "num_input_tokens_seen": 237632, "step": 195 }, { "epoch": 0.022274195344693173, "grad_norm": 2.278343915939331, "learning_rate": 1.1081412183984854e-06, "loss": 13.2193, "num_input_tokens_seen": 243840, "step": 200 }, { "epoch": 0.0228310502283105, "grad_norm": 2.304875373840332, "learning_rate": 1.1359839625793518e-06, "loss": 12.9746, "num_input_tokens_seen": 249728, "step": 205 }, { "epoch": 0.023387905111927832, "grad_norm": 2.4761316776275635, "learning_rate": 1.1638267067602183e-06, "loss": 12.8482, "num_input_tokens_seen": 256128, "step": 210 }, { "epoch": 0.02394475999554516, "grad_norm": 2.638070583343506, "learning_rate": 1.1916694509410848e-06, "loss": 12.9425, "num_input_tokens_seen": 262336, "step": 215 }, { "epoch": 0.02450161487916249, "grad_norm": 2.2835183143615723, "learning_rate": 1.2195121951219514e-06, "loss": 12.9635, "num_input_tokens_seen": 268576, "step": 220 }, { "epoch": 0.02505846976277982, "grad_norm": 2.2768056392669678, "learning_rate": 1.2473549393028177e-06, "loss": 12.935, "num_input_tokens_seen": 274496, "step": 225 }, { "epoch": 0.02561532464639715, "grad_norm": 2.3519420623779297, "learning_rate": 1.2751976834836842e-06, "loss": 12.8671, "num_input_tokens_seen": 280896, "step": 230 }, { "epoch": 0.026172179530014477, "grad_norm": 2.476146697998047, "learning_rate": 1.3030404276645506e-06, "loss": 13.0269, "num_input_tokens_seen": 286304, "step": 235 }, { "epoch": 0.026729034413631808, "grad_norm": 2.512741804122925, "learning_rate": 1.330883171845417e-06, "loss": 12.8597, "num_input_tokens_seen": 292576, "step": 240 }, { "epoch": 0.027285889297249136, "grad_norm": 2.421868324279785, "learning_rate": 1.3587259160262836e-06, "loss": 13.0268, "num_input_tokens_seen": 298720, "step": 245 }, { "epoch": 0.027842744180866467, "grad_norm": 2.4381837844848633, "learning_rate": 1.3865686602071502e-06, "loss": 12.924, "num_input_tokens_seen": 304992, "step": 250 }, { "epoch": 0.028399599064483794, "grad_norm": 2.5244863033294678, "learning_rate": 1.4144114043880165e-06, "loss": 13.1326, "num_input_tokens_seen": 310848, "step": 255 }, { "epoch": 0.028956453948101125, "grad_norm": 2.629243850708008, "learning_rate": 1.442254148568883e-06, "loss": 12.8942, "num_input_tokens_seen": 317248, "step": 260 }, { "epoch": 0.029513308831718453, "grad_norm": 2.723191261291504, "learning_rate": 1.4700968927497494e-06, "loss": 12.8725, "num_input_tokens_seen": 323552, "step": 265 }, { "epoch": 0.030070163715335784, "grad_norm": 2.6404054164886475, "learning_rate": 1.497939636930616e-06, "loss": 12.8111, "num_input_tokens_seen": 329216, "step": 270 }, { "epoch": 0.03062701859895311, "grad_norm": 2.5664846897125244, "learning_rate": 1.5257823811114825e-06, "loss": 12.8965, "num_input_tokens_seen": 334976, "step": 275 }, { "epoch": 0.031183873482570443, "grad_norm": 2.3485770225524902, "learning_rate": 1.553625125292349e-06, "loss": 12.7398, "num_input_tokens_seen": 341376, "step": 280 }, { "epoch": 0.03174072836618777, "grad_norm": 2.4188194274902344, "learning_rate": 1.5814678694732153e-06, "loss": 12.6709, "num_input_tokens_seen": 347552, "step": 285 }, { "epoch": 0.0322975832498051, "grad_norm": 2.665802478790283, "learning_rate": 1.6093106136540817e-06, "loss": 12.8753, "num_input_tokens_seen": 353536, "step": 290 }, { "epoch": 0.03285443813342243, "grad_norm": 2.428997278213501, "learning_rate": 1.6371533578349484e-06, "loss": 12.6178, "num_input_tokens_seen": 359968, "step": 295 }, { "epoch": 0.03341129301703976, "grad_norm": 2.3733811378479004, "learning_rate": 1.6649961020158147e-06, "loss": 12.801, "num_input_tokens_seen": 365888, "step": 300 }, { "epoch": 0.03396814790065709, "grad_norm": 2.7335445880889893, "learning_rate": 1.6928388461966813e-06, "loss": 12.6985, "num_input_tokens_seen": 372192, "step": 305 }, { "epoch": 0.034525002784274415, "grad_norm": 2.4541573524475098, "learning_rate": 1.7206815903775476e-06, "loss": 12.5399, "num_input_tokens_seen": 378304, "step": 310 }, { "epoch": 0.03508185766789175, "grad_norm": 2.4288723468780518, "learning_rate": 1.7485243345584143e-06, "loss": 12.6359, "num_input_tokens_seen": 384640, "step": 315 }, { "epoch": 0.03563871255150908, "grad_norm": 2.3930246829986572, "learning_rate": 1.7763670787392807e-06, "loss": 12.6166, "num_input_tokens_seen": 390688, "step": 320 }, { "epoch": 0.036195567435126405, "grad_norm": 2.243838310241699, "learning_rate": 1.804209822920147e-06, "loss": 12.3858, "num_input_tokens_seen": 396960, "step": 325 }, { "epoch": 0.03675242231874373, "grad_norm": 2.408198833465576, "learning_rate": 1.8320525671010135e-06, "loss": 12.8258, "num_input_tokens_seen": 402784, "step": 330 }, { "epoch": 0.03730927720236107, "grad_norm": 2.7404673099517822, "learning_rate": 1.85989531128188e-06, "loss": 12.8128, "num_input_tokens_seen": 408896, "step": 335 }, { "epoch": 0.037866132085978395, "grad_norm": 2.351649045944214, "learning_rate": 1.8877380554627466e-06, "loss": 12.541, "num_input_tokens_seen": 415136, "step": 340 }, { "epoch": 0.03842298696959572, "grad_norm": 2.5694825649261475, "learning_rate": 1.9155807996436127e-06, "loss": 12.711, "num_input_tokens_seen": 421184, "step": 345 }, { "epoch": 0.03897984185321305, "grad_norm": 3.03684401512146, "learning_rate": 1.9434235438244793e-06, "loss": 12.8361, "num_input_tokens_seen": 426944, "step": 350 }, { "epoch": 0.039536696736830385, "grad_norm": 2.402273178100586, "learning_rate": 1.9712662880053462e-06, "loss": 12.6209, "num_input_tokens_seen": 433312, "step": 355 }, { "epoch": 0.04009355162044771, "grad_norm": 2.487967014312744, "learning_rate": 1.9991090321862124e-06, "loss": 12.6529, "num_input_tokens_seen": 439264, "step": 360 }, { "epoch": 0.04065040650406504, "grad_norm": 2.67594313621521, "learning_rate": 2.026951776367079e-06, "loss": 12.7204, "num_input_tokens_seen": 445376, "step": 365 }, { "epoch": 0.04120726138768237, "grad_norm": 2.3373639583587646, "learning_rate": 2.054794520547945e-06, "loss": 12.6844, "num_input_tokens_seen": 451392, "step": 370 }, { "epoch": 0.0417641162712997, "grad_norm": 2.868830919265747, "learning_rate": 2.082637264728812e-06, "loss": 12.5094, "num_input_tokens_seen": 457600, "step": 375 }, { "epoch": 0.04232097115491703, "grad_norm": 2.3314106464385986, "learning_rate": 2.1104800089096785e-06, "loss": 12.6835, "num_input_tokens_seen": 463872, "step": 380 }, { "epoch": 0.04287782603853436, "grad_norm": 2.283095598220825, "learning_rate": 2.1383227530905446e-06, "loss": 12.6505, "num_input_tokens_seen": 469952, "step": 385 }, { "epoch": 0.043434680922151685, "grad_norm": 2.485727548599243, "learning_rate": 2.166165497271411e-06, "loss": 12.6528, "num_input_tokens_seen": 476096, "step": 390 }, { "epoch": 0.04399153580576902, "grad_norm": 2.436326503753662, "learning_rate": 2.1940082414522777e-06, "loss": 12.6584, "num_input_tokens_seen": 481952, "step": 395 }, { "epoch": 0.04454839068938635, "grad_norm": 3.066110610961914, "learning_rate": 2.2218509856331442e-06, "loss": 12.5301, "num_input_tokens_seen": 488032, "step": 400 }, { "epoch": 0.045105245573003674, "grad_norm": 2.9513731002807617, "learning_rate": 2.2496937298140104e-06, "loss": 12.592, "num_input_tokens_seen": 494112, "step": 405 }, { "epoch": 0.045662100456621, "grad_norm": 2.550832509994507, "learning_rate": 2.277536473994877e-06, "loss": 12.5608, "num_input_tokens_seen": 500320, "step": 410 }, { "epoch": 0.04621895534023834, "grad_norm": 2.3011205196380615, "learning_rate": 2.305379218175744e-06, "loss": 12.3916, "num_input_tokens_seen": 506464, "step": 415 }, { "epoch": 0.046775810223855664, "grad_norm": 2.2401609420776367, "learning_rate": 2.33322196235661e-06, "loss": 12.489, "num_input_tokens_seen": 512480, "step": 420 }, { "epoch": 0.04733266510747299, "grad_norm": 2.6921348571777344, "learning_rate": 2.3610647065374765e-06, "loss": 12.411, "num_input_tokens_seen": 518624, "step": 425 }, { "epoch": 0.04788951999109032, "grad_norm": 2.3888981342315674, "learning_rate": 2.3889074507183426e-06, "loss": 12.1877, "num_input_tokens_seen": 524896, "step": 430 }, { "epoch": 0.048446374874707654, "grad_norm": 2.4912898540496826, "learning_rate": 2.4167501948992096e-06, "loss": 12.2585, "num_input_tokens_seen": 531456, "step": 435 }, { "epoch": 0.04900322975832498, "grad_norm": 2.315516233444214, "learning_rate": 2.4445929390800757e-06, "loss": 12.2698, "num_input_tokens_seen": 537472, "step": 440 }, { "epoch": 0.04956008464194231, "grad_norm": 2.564375400543213, "learning_rate": 2.4724356832609423e-06, "loss": 12.3838, "num_input_tokens_seen": 543488, "step": 445 }, { "epoch": 0.05011693952555964, "grad_norm": 2.363136053085327, "learning_rate": 2.500278427441809e-06, "loss": 12.4077, "num_input_tokens_seen": 548384, "step": 450 }, { "epoch": 0.05067379440917697, "grad_norm": 2.4115657806396484, "learning_rate": 2.5281211716226753e-06, "loss": 12.384, "num_input_tokens_seen": 554752, "step": 455 }, { "epoch": 0.0512306492927943, "grad_norm": 2.518820285797119, "learning_rate": 2.555963915803542e-06, "loss": 12.175, "num_input_tokens_seen": 560768, "step": 460 }, { "epoch": 0.051787504176411626, "grad_norm": 2.4597654342651367, "learning_rate": 2.583806659984408e-06, "loss": 12.3189, "num_input_tokens_seen": 567360, "step": 465 }, { "epoch": 0.052344359060028954, "grad_norm": 2.883101224899292, "learning_rate": 2.6116494041652745e-06, "loss": 12.2453, "num_input_tokens_seen": 573312, "step": 470 }, { "epoch": 0.05290121394364629, "grad_norm": 2.5009748935699463, "learning_rate": 2.6394921483461415e-06, "loss": 12.2632, "num_input_tokens_seen": 579456, "step": 475 }, { "epoch": 0.053458068827263616, "grad_norm": 2.4002673625946045, "learning_rate": 2.6673348925270076e-06, "loss": 12.1544, "num_input_tokens_seen": 585728, "step": 480 }, { "epoch": 0.054014923710880944, "grad_norm": 2.58561372756958, "learning_rate": 2.695177636707874e-06, "loss": 12.1958, "num_input_tokens_seen": 591712, "step": 485 }, { "epoch": 0.05457177859449827, "grad_norm": 2.4649252891540527, "learning_rate": 2.7230203808887403e-06, "loss": 12.067, "num_input_tokens_seen": 597408, "step": 490 }, { "epoch": 0.055128633478115606, "grad_norm": 2.484246253967285, "learning_rate": 2.7508631250696072e-06, "loss": 11.9813, "num_input_tokens_seen": 603520, "step": 495 }, { "epoch": 0.055685488361732934, "grad_norm": 2.5803215503692627, "learning_rate": 2.7787058692504733e-06, "loss": 12.0212, "num_input_tokens_seen": 609600, "step": 500 }, { "epoch": 0.05624234324535026, "grad_norm": 2.256615400314331, "learning_rate": 2.80654861343134e-06, "loss": 11.9646, "num_input_tokens_seen": 615776, "step": 505 }, { "epoch": 0.05679919812896759, "grad_norm": 2.456786870956421, "learning_rate": 2.8343913576122064e-06, "loss": 12.1238, "num_input_tokens_seen": 621888, "step": 510 }, { "epoch": 0.05735605301258492, "grad_norm": 2.5329744815826416, "learning_rate": 2.862234101793073e-06, "loss": 11.9867, "num_input_tokens_seen": 627808, "step": 515 }, { "epoch": 0.05791290789620225, "grad_norm": 2.4561455249786377, "learning_rate": 2.8900768459739395e-06, "loss": 12.0737, "num_input_tokens_seen": 633696, "step": 520 }, { "epoch": 0.05846976277981958, "grad_norm": 2.5204923152923584, "learning_rate": 2.9179195901548056e-06, "loss": 11.9504, "num_input_tokens_seen": 639264, "step": 525 }, { "epoch": 0.059026617663436906, "grad_norm": 2.4842875003814697, "learning_rate": 2.945762334335672e-06, "loss": 12.0123, "num_input_tokens_seen": 645280, "step": 530 }, { "epoch": 0.05958347254705424, "grad_norm": 2.6958837509155273, "learning_rate": 2.9736050785165387e-06, "loss": 12.0324, "num_input_tokens_seen": 651488, "step": 535 }, { "epoch": 0.06014032743067157, "grad_norm": 2.329965829849243, "learning_rate": 3.0014478226974052e-06, "loss": 11.9048, "num_input_tokens_seen": 657984, "step": 540 }, { "epoch": 0.060697182314288896, "grad_norm": 2.784182548522949, "learning_rate": 3.0292905668782718e-06, "loss": 12.0906, "num_input_tokens_seen": 663744, "step": 545 }, { "epoch": 0.06125403719790622, "grad_norm": 2.457132339477539, "learning_rate": 3.057133311059138e-06, "loss": 11.7276, "num_input_tokens_seen": 669952, "step": 550 }, { "epoch": 0.06181089208152356, "grad_norm": 2.595261812210083, "learning_rate": 3.084976055240005e-06, "loss": 11.8837, "num_input_tokens_seen": 675488, "step": 555 }, { "epoch": 0.062367746965140886, "grad_norm": 2.4536337852478027, "learning_rate": 3.112818799420871e-06, "loss": 11.8175, "num_input_tokens_seen": 681856, "step": 560 }, { "epoch": 0.06292460184875821, "grad_norm": 2.6754536628723145, "learning_rate": 3.1406615436017375e-06, "loss": 11.7744, "num_input_tokens_seen": 688032, "step": 565 }, { "epoch": 0.06348145673237554, "grad_norm": 2.4193191528320312, "learning_rate": 3.168504287782604e-06, "loss": 11.732, "num_input_tokens_seen": 693664, "step": 570 }, { "epoch": 0.06403831161599287, "grad_norm": 2.349564790725708, "learning_rate": 3.19634703196347e-06, "loss": 11.7285, "num_input_tokens_seen": 699968, "step": 575 }, { "epoch": 0.0645951664996102, "grad_norm": 2.3512489795684814, "learning_rate": 3.2241897761443367e-06, "loss": 11.6217, "num_input_tokens_seen": 706144, "step": 580 }, { "epoch": 0.06515202138322754, "grad_norm": 2.563352108001709, "learning_rate": 3.2520325203252037e-06, "loss": 11.8962, "num_input_tokens_seen": 712544, "step": 585 }, { "epoch": 0.06570887626684487, "grad_norm": 2.5343029499053955, "learning_rate": 3.27987526450607e-06, "loss": 11.6518, "num_input_tokens_seen": 718656, "step": 590 }, { "epoch": 0.06626573115046219, "grad_norm": 2.5419037342071533, "learning_rate": 3.3077180086869363e-06, "loss": 11.652, "num_input_tokens_seen": 724800, "step": 595 }, { "epoch": 0.06682258603407952, "grad_norm": 2.2399890422821045, "learning_rate": 3.335560752867803e-06, "loss": 11.4878, "num_input_tokens_seen": 730784, "step": 600 }, { "epoch": 0.06737944091769685, "grad_norm": 2.3699021339416504, "learning_rate": 3.3634034970486694e-06, "loss": 11.6757, "num_input_tokens_seen": 736928, "step": 605 }, { "epoch": 0.06793629580131418, "grad_norm": 2.416633129119873, "learning_rate": 3.3912462412295355e-06, "loss": 11.3775, "num_input_tokens_seen": 743168, "step": 610 }, { "epoch": 0.0684931506849315, "grad_norm": 2.3520400524139404, "learning_rate": 3.419088985410402e-06, "loss": 11.1835, "num_input_tokens_seen": 749440, "step": 615 }, { "epoch": 0.06905000556854883, "grad_norm": 3.141356945037842, "learning_rate": 3.4469317295912686e-06, "loss": 11.494, "num_input_tokens_seen": 755360, "step": 620 }, { "epoch": 0.06960686045216617, "grad_norm": 2.6556808948516846, "learning_rate": 3.4747744737721347e-06, "loss": 11.3602, "num_input_tokens_seen": 761824, "step": 625 }, { "epoch": 0.0701637153357835, "grad_norm": 2.396432399749756, "learning_rate": 3.502617217953002e-06, "loss": 11.2961, "num_input_tokens_seen": 768064, "step": 630 }, { "epoch": 0.07072057021940083, "grad_norm": 2.3271491527557373, "learning_rate": 3.530459962133868e-06, "loss": 11.319, "num_input_tokens_seen": 773440, "step": 635 }, { "epoch": 0.07127742510301815, "grad_norm": 2.4260337352752686, "learning_rate": 3.5583027063147347e-06, "loss": 11.6828, "num_input_tokens_seen": 779616, "step": 640 }, { "epoch": 0.07183427998663548, "grad_norm": 2.3529956340789795, "learning_rate": 3.586145450495601e-06, "loss": 11.4276, "num_input_tokens_seen": 785824, "step": 645 }, { "epoch": 0.07239113487025281, "grad_norm": 2.4359872341156006, "learning_rate": 3.6139881946764674e-06, "loss": 11.0968, "num_input_tokens_seen": 791872, "step": 650 }, { "epoch": 0.07294798975387014, "grad_norm": 2.3603909015655518, "learning_rate": 3.641830938857334e-06, "loss": 11.2239, "num_input_tokens_seen": 797952, "step": 655 }, { "epoch": 0.07350484463748747, "grad_norm": 2.338613748550415, "learning_rate": 3.6696736830382e-06, "loss": 11.4076, "num_input_tokens_seen": 803744, "step": 660 }, { "epoch": 0.0740616995211048, "grad_norm": 2.939858913421631, "learning_rate": 3.6975164272190666e-06, "loss": 11.2548, "num_input_tokens_seen": 809792, "step": 665 }, { "epoch": 0.07461855440472213, "grad_norm": 2.3889107704162598, "learning_rate": 3.7253591713999336e-06, "loss": 11.2871, "num_input_tokens_seen": 815328, "step": 670 }, { "epoch": 0.07517540928833946, "grad_norm": 2.8695385456085205, "learning_rate": 3.7532019155808e-06, "loss": 11.1848, "num_input_tokens_seen": 821120, "step": 675 }, { "epoch": 0.07573226417195679, "grad_norm": 2.4545488357543945, "learning_rate": 3.781044659761666e-06, "loss": 11.1745, "num_input_tokens_seen": 827424, "step": 680 }, { "epoch": 0.07628911905557412, "grad_norm": 2.620771884918213, "learning_rate": 3.8088874039425327e-06, "loss": 10.6874, "num_input_tokens_seen": 833632, "step": 685 }, { "epoch": 0.07684597393919144, "grad_norm": 2.501732110977173, "learning_rate": 3.836730148123399e-06, "loss": 11.0573, "num_input_tokens_seen": 839840, "step": 690 }, { "epoch": 0.07740282882280877, "grad_norm": 2.7575740814208984, "learning_rate": 3.864572892304266e-06, "loss": 11.0203, "num_input_tokens_seen": 845568, "step": 695 }, { "epoch": 0.0779596837064261, "grad_norm": 2.5743472576141357, "learning_rate": 3.892415636485132e-06, "loss": 10.9523, "num_input_tokens_seen": 851360, "step": 700 }, { "epoch": 0.07851653859004344, "grad_norm": 2.6553328037261963, "learning_rate": 3.920258380665998e-06, "loss": 11.1112, "num_input_tokens_seen": 857376, "step": 705 }, { "epoch": 0.07907339347366077, "grad_norm": 2.396116256713867, "learning_rate": 3.948101124846865e-06, "loss": 10.955, "num_input_tokens_seen": 863520, "step": 710 }, { "epoch": 0.0796302483572781, "grad_norm": 2.715806007385254, "learning_rate": 3.975943869027732e-06, "loss": 10.9816, "num_input_tokens_seen": 869696, "step": 715 }, { "epoch": 0.08018710324089542, "grad_norm": 2.4425861835479736, "learning_rate": 4.003786613208598e-06, "loss": 10.7277, "num_input_tokens_seen": 875904, "step": 720 }, { "epoch": 0.08074395812451275, "grad_norm": 2.4505162239074707, "learning_rate": 4.031629357389464e-06, "loss": 10.8004, "num_input_tokens_seen": 882080, "step": 725 }, { "epoch": 0.08130081300813008, "grad_norm": 2.5438647270202637, "learning_rate": 4.059472101570331e-06, "loss": 10.8994, "num_input_tokens_seen": 888256, "step": 730 }, { "epoch": 0.08185766789174741, "grad_norm": 2.4831128120422363, "learning_rate": 4.087314845751197e-06, "loss": 11.0104, "num_input_tokens_seen": 894304, "step": 735 }, { "epoch": 0.08241452277536473, "grad_norm": 2.6233131885528564, "learning_rate": 4.115157589932063e-06, "loss": 10.8288, "num_input_tokens_seen": 899680, "step": 740 }, { "epoch": 0.08297137765898208, "grad_norm": 2.3833932876586914, "learning_rate": 4.14300033411293e-06, "loss": 10.6124, "num_input_tokens_seen": 906112, "step": 745 }, { "epoch": 0.0835282325425994, "grad_norm": 2.5077638626098633, "learning_rate": 4.170843078293797e-06, "loss": 10.6421, "num_input_tokens_seen": 912160, "step": 750 }, { "epoch": 0.08408508742621673, "grad_norm": 2.3574445247650146, "learning_rate": 4.1986858224746635e-06, "loss": 10.36, "num_input_tokens_seen": 918272, "step": 755 }, { "epoch": 0.08464194230983406, "grad_norm": 2.2986369132995605, "learning_rate": 4.2265285666555296e-06, "loss": 10.8198, "num_input_tokens_seen": 924288, "step": 760 }, { "epoch": 0.08519879719345139, "grad_norm": 2.366379737854004, "learning_rate": 4.2543713108363965e-06, "loss": 10.5194, "num_input_tokens_seen": 930336, "step": 765 }, { "epoch": 0.08575565207706871, "grad_norm": 2.3848702907562256, "learning_rate": 4.282214055017263e-06, "loss": 10.7938, "num_input_tokens_seen": 936256, "step": 770 }, { "epoch": 0.08631250696068604, "grad_norm": 2.7159669399261475, "learning_rate": 4.310056799198129e-06, "loss": 10.4493, "num_input_tokens_seen": 942176, "step": 775 }, { "epoch": 0.08686936184430337, "grad_norm": 2.237990379333496, "learning_rate": 4.337899543378996e-06, "loss": 10.5984, "num_input_tokens_seen": 948096, "step": 780 }, { "epoch": 0.0874262167279207, "grad_norm": 2.4848742485046387, "learning_rate": 4.365742287559862e-06, "loss": 10.3484, "num_input_tokens_seen": 953952, "step": 785 }, { "epoch": 0.08798307161153804, "grad_norm": 2.433652400970459, "learning_rate": 4.393585031740729e-06, "loss": 10.4811, "num_input_tokens_seen": 960352, "step": 790 }, { "epoch": 0.08853992649515537, "grad_norm": 2.827319860458374, "learning_rate": 4.421427775921595e-06, "loss": 10.0731, "num_input_tokens_seen": 966656, "step": 795 }, { "epoch": 0.0890967813787727, "grad_norm": 2.169790267944336, "learning_rate": 4.449270520102462e-06, "loss": 10.3926, "num_input_tokens_seen": 973024, "step": 800 }, { "epoch": 0.08965363626239002, "grad_norm": 2.2864294052124023, "learning_rate": 4.477113264283328e-06, "loss": 10.234, "num_input_tokens_seen": 979296, "step": 805 }, { "epoch": 0.09021049114600735, "grad_norm": 2.5114591121673584, "learning_rate": 4.504956008464194e-06, "loss": 10.6733, "num_input_tokens_seen": 984640, "step": 810 }, { "epoch": 0.09076734602962468, "grad_norm": 2.2907185554504395, "learning_rate": 4.532798752645061e-06, "loss": 10.3146, "num_input_tokens_seen": 991104, "step": 815 }, { "epoch": 0.091324200913242, "grad_norm": 2.341165781021118, "learning_rate": 4.560641496825927e-06, "loss": 10.3541, "num_input_tokens_seen": 997120, "step": 820 }, { "epoch": 0.09188105579685933, "grad_norm": 2.2826123237609863, "learning_rate": 4.588484241006793e-06, "loss": 10.0098, "num_input_tokens_seen": 1003232, "step": 825 }, { "epoch": 0.09243791068047667, "grad_norm": 2.6780571937561035, "learning_rate": 4.61632698518766e-06, "loss": 10.1695, "num_input_tokens_seen": 1009440, "step": 830 }, { "epoch": 0.092994765564094, "grad_norm": 2.3724870681762695, "learning_rate": 4.644169729368527e-06, "loss": 10.043, "num_input_tokens_seen": 1015648, "step": 835 }, { "epoch": 0.09355162044771133, "grad_norm": 2.2883050441741943, "learning_rate": 4.672012473549393e-06, "loss": 10.4349, "num_input_tokens_seen": 1021152, "step": 840 }, { "epoch": 0.09410847533132866, "grad_norm": 2.427647352218628, "learning_rate": 4.6998552177302595e-06, "loss": 10.1022, "num_input_tokens_seen": 1027360, "step": 845 }, { "epoch": 0.09466533021494598, "grad_norm": 2.6186206340789795, "learning_rate": 4.7276979619111264e-06, "loss": 10.1467, "num_input_tokens_seen": 1033184, "step": 850 }, { "epoch": 0.09522218509856331, "grad_norm": 2.4819865226745605, "learning_rate": 4.7555407060919925e-06, "loss": 9.8289, "num_input_tokens_seen": 1039328, "step": 855 }, { "epoch": 0.09577903998218064, "grad_norm": 2.6304588317871094, "learning_rate": 4.783383450272859e-06, "loss": 9.9947, "num_input_tokens_seen": 1045600, "step": 860 }, { "epoch": 0.09633589486579797, "grad_norm": 2.0535049438476562, "learning_rate": 4.811226194453726e-06, "loss": 9.97, "num_input_tokens_seen": 1051680, "step": 865 }, { "epoch": 0.09689274974941531, "grad_norm": 2.428365468978882, "learning_rate": 4.839068938634593e-06, "loss": 9.9204, "num_input_tokens_seen": 1057792, "step": 870 }, { "epoch": 0.09744960463303264, "grad_norm": 2.45074725151062, "learning_rate": 4.866911682815459e-06, "loss": 9.8282, "num_input_tokens_seen": 1063552, "step": 875 }, { "epoch": 0.09800645951664996, "grad_norm": 2.396008014678955, "learning_rate": 4.894754426996325e-06, "loss": 9.6727, "num_input_tokens_seen": 1069472, "step": 880 }, { "epoch": 0.09856331440026729, "grad_norm": 2.164999008178711, "learning_rate": 4.922597171177192e-06, "loss": 9.9268, "num_input_tokens_seen": 1075776, "step": 885 }, { "epoch": 0.09912016928388462, "grad_norm": 2.276677370071411, "learning_rate": 4.950439915358058e-06, "loss": 9.5984, "num_input_tokens_seen": 1081984, "step": 890 }, { "epoch": 0.09967702416750195, "grad_norm": 2.1699087619781494, "learning_rate": 4.978282659538924e-06, "loss": 9.7025, "num_input_tokens_seen": 1088192, "step": 895 }, { "epoch": 0.10023387905111927, "grad_norm": 2.3574507236480713, "learning_rate": 5.006125403719791e-06, "loss": 10.0116, "num_input_tokens_seen": 1093632, "step": 900 }, { "epoch": 0.1007907339347366, "grad_norm": 2.365715265274048, "learning_rate": 5.033968147900657e-06, "loss": 9.5477, "num_input_tokens_seen": 1099776, "step": 905 }, { "epoch": 0.10134758881835394, "grad_norm": 2.1061952114105225, "learning_rate": 5.061810892081524e-06, "loss": 9.5097, "num_input_tokens_seen": 1105920, "step": 910 }, { "epoch": 0.10190444370197127, "grad_norm": 2.339353084564209, "learning_rate": 5.08965363626239e-06, "loss": 9.6819, "num_input_tokens_seen": 1112224, "step": 915 }, { "epoch": 0.1024612985855886, "grad_norm": 2.6191506385803223, "learning_rate": 5.117496380443257e-06, "loss": 9.4982, "num_input_tokens_seen": 1118304, "step": 920 }, { "epoch": 0.10301815346920593, "grad_norm": 2.180079221725464, "learning_rate": 5.145339124624123e-06, "loss": 9.4684, "num_input_tokens_seen": 1124480, "step": 925 }, { "epoch": 0.10357500835282325, "grad_norm": 2.395709753036499, "learning_rate": 5.173181868804989e-06, "loss": 9.1078, "num_input_tokens_seen": 1130752, "step": 930 }, { "epoch": 0.10413186323644058, "grad_norm": 2.264390230178833, "learning_rate": 5.201024612985856e-06, "loss": 9.2516, "num_input_tokens_seen": 1136256, "step": 935 }, { "epoch": 0.10468871812005791, "grad_norm": 2.3432669639587402, "learning_rate": 5.2288673571667224e-06, "loss": 9.1816, "num_input_tokens_seen": 1142240, "step": 940 }, { "epoch": 0.10524557300367524, "grad_norm": 2.1852288246154785, "learning_rate": 5.2567101013475886e-06, "loss": 9.0916, "num_input_tokens_seen": 1148416, "step": 945 }, { "epoch": 0.10580242788729258, "grad_norm": 2.137331485748291, "learning_rate": 5.2845528455284555e-06, "loss": 9.2952, "num_input_tokens_seen": 1154688, "step": 950 }, { "epoch": 0.1063592827709099, "grad_norm": 2.3481836318969727, "learning_rate": 5.3123955897093225e-06, "loss": 9.5272, "num_input_tokens_seen": 1160736, "step": 955 }, { "epoch": 0.10691613765452723, "grad_norm": 2.3458328247070312, "learning_rate": 5.340238333890189e-06, "loss": 9.2868, "num_input_tokens_seen": 1166912, "step": 960 }, { "epoch": 0.10747299253814456, "grad_norm": 2.245295524597168, "learning_rate": 5.368081078071055e-06, "loss": 9.2969, "num_input_tokens_seen": 1173344, "step": 965 }, { "epoch": 0.10802984742176189, "grad_norm": 2.1848220825195312, "learning_rate": 5.395923822251922e-06, "loss": 9.2359, "num_input_tokens_seen": 1179552, "step": 970 }, { "epoch": 0.10858670230537922, "grad_norm": 2.2470004558563232, "learning_rate": 5.423766566432788e-06, "loss": 9.2785, "num_input_tokens_seen": 1185664, "step": 975 }, { "epoch": 0.10914355718899654, "grad_norm": 2.257481098175049, "learning_rate": 5.451609310613654e-06, "loss": 8.9266, "num_input_tokens_seen": 1192032, "step": 980 }, { "epoch": 0.10970041207261387, "grad_norm": 2.362637758255005, "learning_rate": 5.479452054794521e-06, "loss": 8.9338, "num_input_tokens_seen": 1197504, "step": 985 }, { "epoch": 0.11025726695623121, "grad_norm": 2.105440139770508, "learning_rate": 5.507294798975388e-06, "loss": 9.1075, "num_input_tokens_seen": 1203552, "step": 990 }, { "epoch": 0.11081412183984854, "grad_norm": 2.3446905612945557, "learning_rate": 5.535137543156254e-06, "loss": 9.1649, "num_input_tokens_seen": 1209696, "step": 995 }, { "epoch": 0.11137097672346587, "grad_norm": 2.386305570602417, "learning_rate": 5.56298028733712e-06, "loss": 8.9837, "num_input_tokens_seen": 1215776, "step": 1000 }, { "epoch": 0.1119278316070832, "grad_norm": 2.0195536613464355, "learning_rate": 5.590823031517987e-06, "loss": 9.0784, "num_input_tokens_seen": 1222304, "step": 1005 }, { "epoch": 0.11248468649070052, "grad_norm": 2.2465128898620605, "learning_rate": 5.618665775698853e-06, "loss": 9.0063, "num_input_tokens_seen": 1227776, "step": 1010 }, { "epoch": 0.11304154137431785, "grad_norm": 2.4668664932250977, "learning_rate": 5.646508519879719e-06, "loss": 8.7675, "num_input_tokens_seen": 1233536, "step": 1015 }, { "epoch": 0.11359839625793518, "grad_norm": 2.213097333908081, "learning_rate": 5.674351264060586e-06, "loss": 8.6942, "num_input_tokens_seen": 1239808, "step": 1020 }, { "epoch": 0.1141552511415525, "grad_norm": 2.0918190479278564, "learning_rate": 5.702194008241452e-06, "loss": 8.6212, "num_input_tokens_seen": 1246208, "step": 1025 }, { "epoch": 0.11471210602516985, "grad_norm": 2.1159064769744873, "learning_rate": 5.730036752422319e-06, "loss": 8.5168, "num_input_tokens_seen": 1251968, "step": 1030 }, { "epoch": 0.11526896090878717, "grad_norm": 2.4009013175964355, "learning_rate": 5.757879496603185e-06, "loss": 8.602, "num_input_tokens_seen": 1258304, "step": 1035 }, { "epoch": 0.1158258157924045, "grad_norm": 2.416562557220459, "learning_rate": 5.785722240784052e-06, "loss": 8.081, "num_input_tokens_seen": 1264480, "step": 1040 }, { "epoch": 0.11638267067602183, "grad_norm": 2.154738664627075, "learning_rate": 5.8135649849649185e-06, "loss": 8.2825, "num_input_tokens_seen": 1270336, "step": 1045 }, { "epoch": 0.11693952555963916, "grad_norm": 2.2256457805633545, "learning_rate": 5.841407729145785e-06, "loss": 8.6247, "num_input_tokens_seen": 1275936, "step": 1050 }, { "epoch": 0.11749638044325648, "grad_norm": 2.0236549377441406, "learning_rate": 5.8692504733266516e-06, "loss": 8.1432, "num_input_tokens_seen": 1282080, "step": 1055 }, { "epoch": 0.11805323532687381, "grad_norm": 2.1254312992095947, "learning_rate": 5.897093217507518e-06, "loss": 8.4239, "num_input_tokens_seen": 1288032, "step": 1060 }, { "epoch": 0.11861009021049114, "grad_norm": 2.186913251876831, "learning_rate": 5.924935961688384e-06, "loss": 8.3574, "num_input_tokens_seen": 1293792, "step": 1065 }, { "epoch": 0.11916694509410848, "grad_norm": 2.316000461578369, "learning_rate": 5.952778705869251e-06, "loss": 8.139, "num_input_tokens_seen": 1300000, "step": 1070 }, { "epoch": 0.11972379997772581, "grad_norm": 2.0559678077697754, "learning_rate": 5.980621450050118e-06, "loss": 8.5089, "num_input_tokens_seen": 1306592, "step": 1075 }, { "epoch": 0.12028065486134314, "grad_norm": 2.1640820503234863, "learning_rate": 6.008464194230984e-06, "loss": 8.1909, "num_input_tokens_seen": 1312032, "step": 1080 }, { "epoch": 0.12083750974496046, "grad_norm": 2.1751577854156494, "learning_rate": 6.03630693841185e-06, "loss": 8.1283, "num_input_tokens_seen": 1318400, "step": 1085 }, { "epoch": 0.12139436462857779, "grad_norm": 2.5087380409240723, "learning_rate": 6.064149682592717e-06, "loss": 8.1249, "num_input_tokens_seen": 1324448, "step": 1090 }, { "epoch": 0.12195121951219512, "grad_norm": 2.3136379718780518, "learning_rate": 6.091992426773583e-06, "loss": 8.3245, "num_input_tokens_seen": 1330496, "step": 1095 }, { "epoch": 0.12250807439581245, "grad_norm": 2.181807279586792, "learning_rate": 6.119835170954449e-06, "loss": 7.8922, "num_input_tokens_seen": 1336608, "step": 1100 }, { "epoch": 0.12306492927942977, "grad_norm": 2.0599124431610107, "learning_rate": 6.147677915135316e-06, "loss": 8.0662, "num_input_tokens_seen": 1342496, "step": 1105 }, { "epoch": 0.12362178416304712, "grad_norm": 1.978994607925415, "learning_rate": 6.175520659316183e-06, "loss": 8.1167, "num_input_tokens_seen": 1348800, "step": 1110 }, { "epoch": 0.12417863904666444, "grad_norm": 2.1076724529266357, "learning_rate": 6.203363403497049e-06, "loss": 7.8705, "num_input_tokens_seen": 1355072, "step": 1115 }, { "epoch": 0.12473549393028177, "grad_norm": 2.087460517883301, "learning_rate": 6.231206147677915e-06, "loss": 8.1205, "num_input_tokens_seen": 1361024, "step": 1120 }, { "epoch": 0.12529234881389908, "grad_norm": 2.2952208518981934, "learning_rate": 6.259048891858782e-06, "loss": 7.6895, "num_input_tokens_seen": 1367296, "step": 1125 }, { "epoch": 0.12584920369751643, "grad_norm": 2.0407159328460693, "learning_rate": 6.286891636039649e-06, "loss": 7.8716, "num_input_tokens_seen": 1373600, "step": 1130 }, { "epoch": 0.12640605858113377, "grad_norm": 2.0583043098449707, "learning_rate": 6.3147343802205145e-06, "loss": 7.7823, "num_input_tokens_seen": 1379648, "step": 1135 }, { "epoch": 0.12696291346475108, "grad_norm": 1.9689429998397827, "learning_rate": 6.3425771244013815e-06, "loss": 7.5166, "num_input_tokens_seen": 1385728, "step": 1140 }, { "epoch": 0.12751976834836842, "grad_norm": 2.0465540885925293, "learning_rate": 6.370419868582248e-06, "loss": 7.3912, "num_input_tokens_seen": 1391840, "step": 1145 }, { "epoch": 0.12807662323198574, "grad_norm": 1.958626627922058, "learning_rate": 6.3982626127631145e-06, "loss": 7.6542, "num_input_tokens_seen": 1398144, "step": 1150 }, { "epoch": 0.12863347811560308, "grad_norm": 1.863857388496399, "learning_rate": 6.426105356943981e-06, "loss": 7.5301, "num_input_tokens_seen": 1404320, "step": 1155 }, { "epoch": 0.1291903329992204, "grad_norm": 2.023599624633789, "learning_rate": 6.453948101124848e-06, "loss": 7.587, "num_input_tokens_seen": 1410624, "step": 1160 }, { "epoch": 0.12974718788283773, "grad_norm": 2.0408313274383545, "learning_rate": 6.481790845305713e-06, "loss": 7.2762, "num_input_tokens_seen": 1416512, "step": 1165 }, { "epoch": 0.13030404276645507, "grad_norm": 1.777665376663208, "learning_rate": 6.50963358948658e-06, "loss": 7.4466, "num_input_tokens_seen": 1421760, "step": 1170 }, { "epoch": 0.1308608976500724, "grad_norm": 1.762708067893982, "learning_rate": 6.537476333667447e-06, "loss": 7.3352, "num_input_tokens_seen": 1427616, "step": 1175 }, { "epoch": 0.13141775253368973, "grad_norm": 1.92583167552948, "learning_rate": 6.565319077848313e-06, "loss": 7.4523, "num_input_tokens_seen": 1433920, "step": 1180 }, { "epoch": 0.13197460741730704, "grad_norm": 2.3175384998321533, "learning_rate": 6.59316182202918e-06, "loss": 7.3196, "num_input_tokens_seen": 1439648, "step": 1185 }, { "epoch": 0.13253146230092439, "grad_norm": 1.9264743328094482, "learning_rate": 6.621004566210046e-06, "loss": 7.1783, "num_input_tokens_seen": 1446048, "step": 1190 }, { "epoch": 0.1330883171845417, "grad_norm": 2.1209917068481445, "learning_rate": 6.648847310390913e-06, "loss": 7.3447, "num_input_tokens_seen": 1452096, "step": 1195 }, { "epoch": 0.13364517206815904, "grad_norm": 1.4937735795974731, "learning_rate": 6.676690054571778e-06, "loss": 7.1198, "num_input_tokens_seen": 1458336, "step": 1200 }, { "epoch": 0.13420202695177635, "grad_norm": 1.8520196676254272, "learning_rate": 6.704532798752645e-06, "loss": 7.2734, "num_input_tokens_seen": 1463840, "step": 1205 }, { "epoch": 0.1347588818353937, "grad_norm": 1.8185993432998657, "learning_rate": 6.732375542933512e-06, "loss": 7.1079, "num_input_tokens_seen": 1469952, "step": 1210 }, { "epoch": 0.13531573671901104, "grad_norm": 1.9140506982803345, "learning_rate": 6.760218287114378e-06, "loss": 7.2569, "num_input_tokens_seen": 1476512, "step": 1215 }, { "epoch": 0.13587259160262835, "grad_norm": 1.8869870901107788, "learning_rate": 6.788061031295245e-06, "loss": 7.2883, "num_input_tokens_seen": 1482624, "step": 1220 }, { "epoch": 0.1364294464862457, "grad_norm": 1.6817476749420166, "learning_rate": 6.815903775476111e-06, "loss": 7.1292, "num_input_tokens_seen": 1488832, "step": 1225 }, { "epoch": 0.136986301369863, "grad_norm": 1.4499295949935913, "learning_rate": 6.843746519656978e-06, "loss": 7.1345, "num_input_tokens_seen": 1494976, "step": 1230 }, { "epoch": 0.13754315625348035, "grad_norm": 1.8007254600524902, "learning_rate": 6.871589263837844e-06, "loss": 7.1385, "num_input_tokens_seen": 1501280, "step": 1235 }, { "epoch": 0.13810001113709766, "grad_norm": 1.5057588815689087, "learning_rate": 6.8994320080187106e-06, "loss": 6.6345, "num_input_tokens_seen": 1507456, "step": 1240 }, { "epoch": 0.138656866020715, "grad_norm": 1.4331663846969604, "learning_rate": 6.927274752199577e-06, "loss": 6.8963, "num_input_tokens_seen": 1513600, "step": 1245 }, { "epoch": 0.13921372090433234, "grad_norm": 2.0460472106933594, "learning_rate": 6.955117496380444e-06, "loss": 6.9497, "num_input_tokens_seen": 1519648, "step": 1250 }, { "epoch": 0.13977057578794966, "grad_norm": 1.9041415452957153, "learning_rate": 6.982960240561311e-06, "loss": 6.9689, "num_input_tokens_seen": 1525984, "step": 1255 }, { "epoch": 0.140327430671567, "grad_norm": 1.5303281545639038, "learning_rate": 7.010802984742177e-06, "loss": 6.7215, "num_input_tokens_seen": 1532192, "step": 1260 }, { "epoch": 0.1408842855551843, "grad_norm": 1.6940420866012573, "learning_rate": 7.038645728923044e-06, "loss": 7.0044, "num_input_tokens_seen": 1538016, "step": 1265 }, { "epoch": 0.14144114043880165, "grad_norm": 1.5853171348571777, "learning_rate": 7.066488473103909e-06, "loss": 7.0003, "num_input_tokens_seen": 1544288, "step": 1270 }, { "epoch": 0.14199799532241897, "grad_norm": 1.674375295639038, "learning_rate": 7.094331217284776e-06, "loss": 6.5254, "num_input_tokens_seen": 1550240, "step": 1275 }, { "epoch": 0.1425548502060363, "grad_norm": 1.389853835105896, "learning_rate": 7.122173961465642e-06, "loss": 6.8552, "num_input_tokens_seen": 1556288, "step": 1280 }, { "epoch": 0.14311170508965362, "grad_norm": 1.5318281650543213, "learning_rate": 7.150016705646509e-06, "loss": 6.6678, "num_input_tokens_seen": 1562336, "step": 1285 }, { "epoch": 0.14366855997327097, "grad_norm": 1.5855792760849, "learning_rate": 7.177859449827376e-06, "loss": 6.4714, "num_input_tokens_seen": 1568448, "step": 1290 }, { "epoch": 0.1442254148568883, "grad_norm": 1.9128305912017822, "learning_rate": 7.205702194008242e-06, "loss": 6.9267, "num_input_tokens_seen": 1573984, "step": 1295 }, { "epoch": 0.14478226974050562, "grad_norm": 1.4507752656936646, "learning_rate": 7.233544938189109e-06, "loss": 6.5264, "num_input_tokens_seen": 1580000, "step": 1300 }, { "epoch": 0.14533912462412296, "grad_norm": 1.4253405332565308, "learning_rate": 7.261387682369974e-06, "loss": 6.562, "num_input_tokens_seen": 1586112, "step": 1305 }, { "epoch": 0.14589597950774028, "grad_norm": 1.9759693145751953, "learning_rate": 7.289230426550841e-06, "loss": 6.7112, "num_input_tokens_seen": 1591616, "step": 1310 }, { "epoch": 0.14645283439135762, "grad_norm": 1.5256497859954834, "learning_rate": 7.317073170731707e-06, "loss": 6.8545, "num_input_tokens_seen": 1598144, "step": 1315 }, { "epoch": 0.14700968927497493, "grad_norm": 2.04923939704895, "learning_rate": 7.344915914912574e-06, "loss": 6.4498, "num_input_tokens_seen": 1603520, "step": 1320 }, { "epoch": 0.14756654415859227, "grad_norm": 1.3138978481292725, "learning_rate": 7.37275865909344e-06, "loss": 6.6069, "num_input_tokens_seen": 1608992, "step": 1325 }, { "epoch": 0.1481233990422096, "grad_norm": 1.5943448543548584, "learning_rate": 7.400601403274307e-06, "loss": 6.5943, "num_input_tokens_seen": 1615232, "step": 1330 }, { "epoch": 0.14868025392582693, "grad_norm": 1.5329538583755493, "learning_rate": 7.428444147455174e-06, "loss": 6.3568, "num_input_tokens_seen": 1621408, "step": 1335 }, { "epoch": 0.14923710880944427, "grad_norm": 1.2910065650939941, "learning_rate": 7.45628689163604e-06, "loss": 6.466, "num_input_tokens_seen": 1627712, "step": 1340 }, { "epoch": 0.14979396369306158, "grad_norm": 1.3659697771072388, "learning_rate": 7.484129635816907e-06, "loss": 6.4624, "num_input_tokens_seen": 1633760, "step": 1345 }, { "epoch": 0.15035081857667892, "grad_norm": 1.2792892456054688, "learning_rate": 7.511972379997773e-06, "loss": 6.2751, "num_input_tokens_seen": 1639808, "step": 1350 }, { "epoch": 0.15090767346029624, "grad_norm": 1.418913722038269, "learning_rate": 7.53981512417864e-06, "loss": 6.5746, "num_input_tokens_seen": 1645824, "step": 1355 }, { "epoch": 0.15146452834391358, "grad_norm": 1.5592684745788574, "learning_rate": 7.567657868359505e-06, "loss": 6.2796, "num_input_tokens_seen": 1652128, "step": 1360 }, { "epoch": 0.1520213832275309, "grad_norm": 1.4726649522781372, "learning_rate": 7.595500612540373e-06, "loss": 6.3694, "num_input_tokens_seen": 1658688, "step": 1365 }, { "epoch": 0.15257823811114823, "grad_norm": 1.4806846380233765, "learning_rate": 7.62334335672124e-06, "loss": 6.3913, "num_input_tokens_seen": 1664512, "step": 1370 }, { "epoch": 0.15313509299476558, "grad_norm": 1.259272575378418, "learning_rate": 7.651186100902106e-06, "loss": 6.3891, "num_input_tokens_seen": 1670432, "step": 1375 }, { "epoch": 0.1536919478783829, "grad_norm": 1.293160319328308, "learning_rate": 7.679028845082973e-06, "loss": 6.4046, "num_input_tokens_seen": 1676480, "step": 1380 }, { "epoch": 0.15424880276200023, "grad_norm": 1.4075392484664917, "learning_rate": 7.706871589263838e-06, "loss": 6.3071, "num_input_tokens_seen": 1683040, "step": 1385 }, { "epoch": 0.15480565764561754, "grad_norm": 1.316273808479309, "learning_rate": 7.734714333444705e-06, "loss": 6.2287, "num_input_tokens_seen": 1688864, "step": 1390 }, { "epoch": 0.1553625125292349, "grad_norm": 1.2192962169647217, "learning_rate": 7.76255707762557e-06, "loss": 6.1515, "num_input_tokens_seen": 1694976, "step": 1395 }, { "epoch": 0.1559193674128522, "grad_norm": 1.5750267505645752, "learning_rate": 7.790399821806437e-06, "loss": 6.1214, "num_input_tokens_seen": 1701152, "step": 1400 }, { "epoch": 0.15647622229646954, "grad_norm": 1.4135167598724365, "learning_rate": 7.818242565987304e-06, "loss": 6.2257, "num_input_tokens_seen": 1707264, "step": 1405 }, { "epoch": 0.15703307718008688, "grad_norm": 1.3406933546066284, "learning_rate": 7.846085310168171e-06, "loss": 6.3114, "num_input_tokens_seen": 1713504, "step": 1410 }, { "epoch": 0.1575899320637042, "grad_norm": 1.2122575044631958, "learning_rate": 7.873928054349038e-06, "loss": 6.17, "num_input_tokens_seen": 1719200, "step": 1415 }, { "epoch": 0.15814678694732154, "grad_norm": 1.3057941198349, "learning_rate": 7.901770798529903e-06, "loss": 6.0329, "num_input_tokens_seen": 1725248, "step": 1420 }, { "epoch": 0.15870364183093885, "grad_norm": 1.3394852876663208, "learning_rate": 7.92961354271077e-06, "loss": 6.2364, "num_input_tokens_seen": 1731360, "step": 1425 }, { "epoch": 0.1592604967145562, "grad_norm": 1.5908410549163818, "learning_rate": 7.957456286891636e-06, "loss": 6.0874, "num_input_tokens_seen": 1736960, "step": 1430 }, { "epoch": 0.1598173515981735, "grad_norm": 1.2787188291549683, "learning_rate": 7.985299031072503e-06, "loss": 6.0714, "num_input_tokens_seen": 1742944, "step": 1435 }, { "epoch": 0.16037420648179085, "grad_norm": 1.2797513008117676, "learning_rate": 8.01314177525337e-06, "loss": 6.195, "num_input_tokens_seen": 1748960, "step": 1440 }, { "epoch": 0.16093106136540816, "grad_norm": 1.3606212139129639, "learning_rate": 8.040984519434237e-06, "loss": 6.0782, "num_input_tokens_seen": 1755296, "step": 1445 }, { "epoch": 0.1614879162490255, "grad_norm": 1.4129090309143066, "learning_rate": 8.068827263615102e-06, "loss": 6.0004, "num_input_tokens_seen": 1761024, "step": 1450 }, { "epoch": 0.16204477113264285, "grad_norm": 1.366123080253601, "learning_rate": 8.096670007795969e-06, "loss": 5.8263, "num_input_tokens_seen": 1766976, "step": 1455 }, { "epoch": 0.16260162601626016, "grad_norm": 1.169368863105774, "learning_rate": 8.124512751976836e-06, "loss": 5.9654, "num_input_tokens_seen": 1772896, "step": 1460 }, { "epoch": 0.1631584808998775, "grad_norm": 1.2449846267700195, "learning_rate": 8.152355496157701e-06, "loss": 5.8781, "num_input_tokens_seen": 1778912, "step": 1465 }, { "epoch": 0.16371533578349481, "grad_norm": 1.1253387928009033, "learning_rate": 8.180198240338568e-06, "loss": 5.6889, "num_input_tokens_seen": 1784864, "step": 1470 }, { "epoch": 0.16427219066711216, "grad_norm": 1.3033592700958252, "learning_rate": 8.208040984519435e-06, "loss": 5.8283, "num_input_tokens_seen": 1790816, "step": 1475 }, { "epoch": 0.16482904555072947, "grad_norm": 1.1121363639831543, "learning_rate": 8.235883728700302e-06, "loss": 5.7984, "num_input_tokens_seen": 1796928, "step": 1480 }, { "epoch": 0.1653859004343468, "grad_norm": 1.2356096506118774, "learning_rate": 8.263726472881167e-06, "loss": 5.898, "num_input_tokens_seen": 1803296, "step": 1485 }, { "epoch": 0.16594275531796415, "grad_norm": 1.4502595663070679, "learning_rate": 8.291569217062034e-06, "loss": 5.8402, "num_input_tokens_seen": 1809280, "step": 1490 }, { "epoch": 0.16649961020158147, "grad_norm": 1.363883376121521, "learning_rate": 8.319411961242901e-06, "loss": 5.6236, "num_input_tokens_seen": 1815424, "step": 1495 }, { "epoch": 0.1670564650851988, "grad_norm": 1.3194993734359741, "learning_rate": 8.347254705423766e-06, "loss": 5.7867, "num_input_tokens_seen": 1821568, "step": 1500 }, { "epoch": 0.16761331996881612, "grad_norm": 1.2856260538101196, "learning_rate": 8.375097449604633e-06, "loss": 5.7225, "num_input_tokens_seen": 1827616, "step": 1505 }, { "epoch": 0.16817017485243346, "grad_norm": 1.0706396102905273, "learning_rate": 8.4029401937855e-06, "loss": 5.6702, "num_input_tokens_seen": 1833408, "step": 1510 }, { "epoch": 0.16872702973605078, "grad_norm": 1.443325161933899, "learning_rate": 8.430782937966367e-06, "loss": 5.6352, "num_input_tokens_seen": 1839712, "step": 1515 }, { "epoch": 0.16928388461966812, "grad_norm": 1.1372264623641968, "learning_rate": 8.458625682147233e-06, "loss": 5.739, "num_input_tokens_seen": 1846336, "step": 1520 }, { "epoch": 0.16984073950328543, "grad_norm": 1.3064546585083008, "learning_rate": 8.4864684263281e-06, "loss": 5.7781, "num_input_tokens_seen": 1852480, "step": 1525 }, { "epoch": 0.17039759438690277, "grad_norm": 1.0597317218780518, "learning_rate": 8.514311170508965e-06, "loss": 5.7262, "num_input_tokens_seen": 1858016, "step": 1530 }, { "epoch": 0.17095444927052011, "grad_norm": 1.3685699701309204, "learning_rate": 8.542153914689832e-06, "loss": 5.4718, "num_input_tokens_seen": 1863872, "step": 1535 }, { "epoch": 0.17151130415413743, "grad_norm": 1.3895121812820435, "learning_rate": 8.569996658870699e-06, "loss": 5.4184, "num_input_tokens_seen": 1870016, "step": 1540 }, { "epoch": 0.17206815903775477, "grad_norm": 1.2400966882705688, "learning_rate": 8.597839403051566e-06, "loss": 5.6186, "num_input_tokens_seen": 1875968, "step": 1545 }, { "epoch": 0.17262501392137208, "grad_norm": 1.3762930631637573, "learning_rate": 8.625682147232433e-06, "loss": 5.5403, "num_input_tokens_seen": 1882144, "step": 1550 }, { "epoch": 0.17318186880498943, "grad_norm": 1.1453626155853271, "learning_rate": 8.653524891413298e-06, "loss": 5.3207, "num_input_tokens_seen": 1888192, "step": 1555 }, { "epoch": 0.17373872368860674, "grad_norm": 1.2598291635513306, "learning_rate": 8.681367635594165e-06, "loss": 5.4628, "num_input_tokens_seen": 1894368, "step": 1560 }, { "epoch": 0.17429557857222408, "grad_norm": 1.2642457485198975, "learning_rate": 8.70921037977503e-06, "loss": 5.4166, "num_input_tokens_seen": 1900544, "step": 1565 }, { "epoch": 0.1748524334558414, "grad_norm": 1.0456486940383911, "learning_rate": 8.737053123955897e-06, "loss": 5.4662, "num_input_tokens_seen": 1906560, "step": 1570 }, { "epoch": 0.17540928833945874, "grad_norm": 1.365109920501709, "learning_rate": 8.764895868136764e-06, "loss": 5.3621, "num_input_tokens_seen": 1912704, "step": 1575 }, { "epoch": 0.17596614322307608, "grad_norm": 1.154096007347107, "learning_rate": 8.792738612317631e-06, "loss": 5.2621, "num_input_tokens_seen": 1918944, "step": 1580 }, { "epoch": 0.1765229981066934, "grad_norm": 1.232010006904602, "learning_rate": 8.820581356498498e-06, "loss": 5.3046, "num_input_tokens_seen": 1924864, "step": 1585 }, { "epoch": 0.17707985299031073, "grad_norm": 1.0597479343414307, "learning_rate": 8.848424100679363e-06, "loss": 5.3048, "num_input_tokens_seen": 1931072, "step": 1590 }, { "epoch": 0.17763670787392805, "grad_norm": 1.0893194675445557, "learning_rate": 8.87626684486023e-06, "loss": 5.4317, "num_input_tokens_seen": 1937184, "step": 1595 }, { "epoch": 0.1781935627575454, "grad_norm": 1.200007438659668, "learning_rate": 8.904109589041095e-06, "loss": 5.2659, "num_input_tokens_seen": 1943296, "step": 1600 }, { "epoch": 0.1787504176411627, "grad_norm": 1.215143084526062, "learning_rate": 8.931952333221962e-06, "loss": 5.3164, "num_input_tokens_seen": 1949248, "step": 1605 }, { "epoch": 0.17930727252478004, "grad_norm": 1.4703996181488037, "learning_rate": 8.95979507740283e-06, "loss": 5.2226, "num_input_tokens_seen": 1954816, "step": 1610 }, { "epoch": 0.17986412740839738, "grad_norm": 1.320659875869751, "learning_rate": 8.987637821583696e-06, "loss": 5.1626, "num_input_tokens_seen": 1961056, "step": 1615 }, { "epoch": 0.1804209822920147, "grad_norm": 1.2611972093582153, "learning_rate": 9.015480565764563e-06, "loss": 5.2678, "num_input_tokens_seen": 1966912, "step": 1620 }, { "epoch": 0.18097783717563204, "grad_norm": 1.006300926208496, "learning_rate": 9.043323309945429e-06, "loss": 5.1531, "num_input_tokens_seen": 1972480, "step": 1625 }, { "epoch": 0.18153469205924935, "grad_norm": 1.2880504131317139, "learning_rate": 9.071166054126296e-06, "loss": 5.2584, "num_input_tokens_seen": 1978752, "step": 1630 }, { "epoch": 0.1820915469428667, "grad_norm": 1.2899565696716309, "learning_rate": 9.09900879830716e-06, "loss": 5.2569, "num_input_tokens_seen": 1984928, "step": 1635 }, { "epoch": 0.182648401826484, "grad_norm": 1.2097878456115723, "learning_rate": 9.126851542488028e-06, "loss": 5.24, "num_input_tokens_seen": 1990816, "step": 1640 }, { "epoch": 0.18320525671010135, "grad_norm": 1.3883436918258667, "learning_rate": 9.154694286668895e-06, "loss": 5.3306, "num_input_tokens_seen": 1996448, "step": 1645 }, { "epoch": 0.18376211159371866, "grad_norm": 1.1918879747390747, "learning_rate": 9.182537030849762e-06, "loss": 5.3293, "num_input_tokens_seen": 2002784, "step": 1650 }, { "epoch": 0.184318966477336, "grad_norm": 1.3049746751785278, "learning_rate": 9.210379775030629e-06, "loss": 5.0141, "num_input_tokens_seen": 2008800, "step": 1655 }, { "epoch": 0.18487582136095335, "grad_norm": 1.333756446838379, "learning_rate": 9.238222519211494e-06, "loss": 5.1824, "num_input_tokens_seen": 2015072, "step": 1660 }, { "epoch": 0.18543267624457066, "grad_norm": 1.127835750579834, "learning_rate": 9.266065263392361e-06, "loss": 5.1205, "num_input_tokens_seen": 2021216, "step": 1665 }, { "epoch": 0.185989531128188, "grad_norm": 1.2361680269241333, "learning_rate": 9.293908007573226e-06, "loss": 4.8811, "num_input_tokens_seen": 2027360, "step": 1670 }, { "epoch": 0.18654638601180532, "grad_norm": 1.273303508758545, "learning_rate": 9.321750751754093e-06, "loss": 5.0524, "num_input_tokens_seen": 2033120, "step": 1675 }, { "epoch": 0.18710324089542266, "grad_norm": 1.2466565370559692, "learning_rate": 9.34959349593496e-06, "loss": 4.9251, "num_input_tokens_seen": 2039232, "step": 1680 }, { "epoch": 0.18766009577903997, "grad_norm": 1.2591596841812134, "learning_rate": 9.377436240115827e-06, "loss": 4.8907, "num_input_tokens_seen": 2044896, "step": 1685 }, { "epoch": 0.1882169506626573, "grad_norm": 1.390591025352478, "learning_rate": 9.405278984296692e-06, "loss": 4.8847, "num_input_tokens_seen": 2051040, "step": 1690 }, { "epoch": 0.18877380554627465, "grad_norm": 1.1396487951278687, "learning_rate": 9.43312172847756e-06, "loss": 4.8628, "num_input_tokens_seen": 2056992, "step": 1695 }, { "epoch": 0.18933066042989197, "grad_norm": 1.2303922176361084, "learning_rate": 9.460964472658426e-06, "loss": 4.7597, "num_input_tokens_seen": 2063232, "step": 1700 }, { "epoch": 0.1898875153135093, "grad_norm": 1.1652283668518066, "learning_rate": 9.488807216839292e-06, "loss": 4.8265, "num_input_tokens_seen": 2069376, "step": 1705 }, { "epoch": 0.19044437019712662, "grad_norm": 1.304732084274292, "learning_rate": 9.516649961020158e-06, "loss": 4.8573, "num_input_tokens_seen": 2074944, "step": 1710 }, { "epoch": 0.19100122508074396, "grad_norm": 1.4193181991577148, "learning_rate": 9.544492705201025e-06, "loss": 4.7986, "num_input_tokens_seen": 2080960, "step": 1715 }, { "epoch": 0.19155807996436128, "grad_norm": 1.0630719661712646, "learning_rate": 9.572335449381892e-06, "loss": 4.69, "num_input_tokens_seen": 2086944, "step": 1720 }, { "epoch": 0.19211493484797862, "grad_norm": 1.0671154260635376, "learning_rate": 9.600178193562758e-06, "loss": 4.6865, "num_input_tokens_seen": 2093056, "step": 1725 }, { "epoch": 0.19267178973159593, "grad_norm": 1.2402950525283813, "learning_rate": 9.628020937743625e-06, "loss": 4.7758, "num_input_tokens_seen": 2099136, "step": 1730 }, { "epoch": 0.19322864461521327, "grad_norm": 1.0735135078430176, "learning_rate": 9.655863681924492e-06, "loss": 4.6226, "num_input_tokens_seen": 2105184, "step": 1735 }, { "epoch": 0.19378549949883062, "grad_norm": 1.8294992446899414, "learning_rate": 9.683706426105357e-06, "loss": 4.7391, "num_input_tokens_seen": 2111200, "step": 1740 }, { "epoch": 0.19434235438244793, "grad_norm": 1.1348036527633667, "learning_rate": 9.711549170286224e-06, "loss": 4.7454, "num_input_tokens_seen": 2117536, "step": 1745 }, { "epoch": 0.19489920926606527, "grad_norm": 1.4439369440078735, "learning_rate": 9.73939191446709e-06, "loss": 4.5388, "num_input_tokens_seen": 2123328, "step": 1750 }, { "epoch": 0.19545606414968258, "grad_norm": 1.1123504638671875, "learning_rate": 9.767234658647958e-06, "loss": 4.3424, "num_input_tokens_seen": 2129504, "step": 1755 }, { "epoch": 0.19601291903329993, "grad_norm": 1.2383921146392822, "learning_rate": 9.795077402828823e-06, "loss": 4.6335, "num_input_tokens_seen": 2135456, "step": 1760 }, { "epoch": 0.19656977391691724, "grad_norm": 1.1334290504455566, "learning_rate": 9.82292014700969e-06, "loss": 4.4976, "num_input_tokens_seen": 2141920, "step": 1765 }, { "epoch": 0.19712662880053458, "grad_norm": 1.1616417169570923, "learning_rate": 9.850762891190555e-06, "loss": 4.6222, "num_input_tokens_seen": 2147936, "step": 1770 }, { "epoch": 0.19768348368415192, "grad_norm": 1.241438388824463, "learning_rate": 9.878605635371422e-06, "loss": 4.6183, "num_input_tokens_seen": 2154016, "step": 1775 }, { "epoch": 0.19824033856776924, "grad_norm": 1.121077299118042, "learning_rate": 9.90644837955229e-06, "loss": 4.4791, "num_input_tokens_seen": 2160288, "step": 1780 }, { "epoch": 0.19879719345138658, "grad_norm": 1.5024535655975342, "learning_rate": 9.934291123733156e-06, "loss": 4.5024, "num_input_tokens_seen": 2166624, "step": 1785 }, { "epoch": 0.1993540483350039, "grad_norm": 1.0455766916275024, "learning_rate": 9.962133867914023e-06, "loss": 4.3941, "num_input_tokens_seen": 2172576, "step": 1790 }, { "epoch": 0.19991090321862123, "grad_norm": 1.1343914270401, "learning_rate": 9.989976612094888e-06, "loss": 4.4105, "num_input_tokens_seen": 2178592, "step": 1795 }, { "epoch": 0.20046775810223855, "grad_norm": 1.222386360168457, "learning_rate": 1.0017819356275755e-05, "loss": 4.2622, "num_input_tokens_seen": 2184480, "step": 1800 }, { "epoch": 0.2010246129858559, "grad_norm": 1.260638952255249, "learning_rate": 1.004566210045662e-05, "loss": 4.4528, "num_input_tokens_seen": 2190208, "step": 1805 }, { "epoch": 0.2015814678694732, "grad_norm": 1.3032090663909912, "learning_rate": 1.0073504844637488e-05, "loss": 4.2159, "num_input_tokens_seen": 2196608, "step": 1810 }, { "epoch": 0.20213832275309054, "grad_norm": 1.0701489448547363, "learning_rate": 1.0101347588818355e-05, "loss": 4.1976, "num_input_tokens_seen": 2202816, "step": 1815 }, { "epoch": 0.20269517763670789, "grad_norm": 1.2863328456878662, "learning_rate": 1.0129190332999221e-05, "loss": 4.2751, "num_input_tokens_seen": 2208672, "step": 1820 }, { "epoch": 0.2032520325203252, "grad_norm": 1.1929634809494019, "learning_rate": 1.0157033077180088e-05, "loss": 4.1961, "num_input_tokens_seen": 2215104, "step": 1825 }, { "epoch": 0.20380888740394254, "grad_norm": 1.2851202487945557, "learning_rate": 1.0184875821360954e-05, "loss": 4.1224, "num_input_tokens_seen": 2221024, "step": 1830 }, { "epoch": 0.20436574228755985, "grad_norm": 1.210977554321289, "learning_rate": 1.021271856554182e-05, "loss": 4.2027, "num_input_tokens_seen": 2226944, "step": 1835 }, { "epoch": 0.2049225971711772, "grad_norm": 1.180336833000183, "learning_rate": 1.0240561309722686e-05, "loss": 3.989, "num_input_tokens_seen": 2233120, "step": 1840 }, { "epoch": 0.2054794520547945, "grad_norm": 1.398392677307129, "learning_rate": 1.0268404053903553e-05, "loss": 4.0863, "num_input_tokens_seen": 2239168, "step": 1845 }, { "epoch": 0.20603630693841185, "grad_norm": 1.2294245958328247, "learning_rate": 1.029624679808442e-05, "loss": 4.1084, "num_input_tokens_seen": 2245536, "step": 1850 }, { "epoch": 0.2065931618220292, "grad_norm": 1.2649083137512207, "learning_rate": 1.0324089542265287e-05, "loss": 4.1476, "num_input_tokens_seen": 2251712, "step": 1855 }, { "epoch": 0.2071500167056465, "grad_norm": 1.1153972148895264, "learning_rate": 1.0351932286446154e-05, "loss": 4.0931, "num_input_tokens_seen": 2256992, "step": 1860 }, { "epoch": 0.20770687158926385, "grad_norm": 1.152316927909851, "learning_rate": 1.0379775030627019e-05, "loss": 3.9285, "num_input_tokens_seen": 2263232, "step": 1865 }, { "epoch": 0.20826372647288116, "grad_norm": 1.1285210847854614, "learning_rate": 1.0407617774807886e-05, "loss": 3.845, "num_input_tokens_seen": 2269120, "step": 1870 }, { "epoch": 0.2088205813564985, "grad_norm": 1.1218235492706299, "learning_rate": 1.0435460518988751e-05, "loss": 3.8205, "num_input_tokens_seen": 2275040, "step": 1875 }, { "epoch": 0.20937743624011582, "grad_norm": 1.2161211967468262, "learning_rate": 1.0463303263169618e-05, "loss": 4.1361, "num_input_tokens_seen": 2281312, "step": 1880 }, { "epoch": 0.20993429112373316, "grad_norm": 1.203430414199829, "learning_rate": 1.0491146007350485e-05, "loss": 3.8958, "num_input_tokens_seen": 2287232, "step": 1885 }, { "epoch": 0.21049114600735047, "grad_norm": 1.1632081270217896, "learning_rate": 1.0518988751531352e-05, "loss": 3.9635, "num_input_tokens_seen": 2293568, "step": 1890 }, { "epoch": 0.2110480008909678, "grad_norm": 1.1294569969177246, "learning_rate": 1.054683149571222e-05, "loss": 3.7267, "num_input_tokens_seen": 2299776, "step": 1895 }, { "epoch": 0.21160485577458515, "grad_norm": 1.166387915611267, "learning_rate": 1.0574674239893084e-05, "loss": 3.818, "num_input_tokens_seen": 2305728, "step": 1900 }, { "epoch": 0.21216171065820247, "grad_norm": 1.1200344562530518, "learning_rate": 1.0602516984073951e-05, "loss": 3.7918, "num_input_tokens_seen": 2311904, "step": 1905 }, { "epoch": 0.2127185655418198, "grad_norm": 1.089632272720337, "learning_rate": 1.0630359728254817e-05, "loss": 3.834, "num_input_tokens_seen": 2318016, "step": 1910 }, { "epoch": 0.21327542042543712, "grad_norm": 1.4569860696792603, "learning_rate": 1.0658202472435684e-05, "loss": 3.7382, "num_input_tokens_seen": 2323456, "step": 1915 }, { "epoch": 0.21383227530905446, "grad_norm": 1.0596919059753418, "learning_rate": 1.068604521661655e-05, "loss": 3.7881, "num_input_tokens_seen": 2329632, "step": 1920 }, { "epoch": 0.21438913019267178, "grad_norm": 1.0633735656738281, "learning_rate": 1.0713887960797418e-05, "loss": 3.8238, "num_input_tokens_seen": 2336000, "step": 1925 }, { "epoch": 0.21494598507628912, "grad_norm": 1.0695241689682007, "learning_rate": 1.0741730704978283e-05, "loss": 3.7063, "num_input_tokens_seen": 2342336, "step": 1930 }, { "epoch": 0.21550283995990646, "grad_norm": 1.0370508432388306, "learning_rate": 1.076957344915915e-05, "loss": 3.641, "num_input_tokens_seen": 2348288, "step": 1935 }, { "epoch": 0.21605969484352378, "grad_norm": 1.0764390230178833, "learning_rate": 1.0797416193340017e-05, "loss": 3.6522, "num_input_tokens_seen": 2354176, "step": 1940 }, { "epoch": 0.21661654972714112, "grad_norm": 1.0874210596084595, "learning_rate": 1.0825258937520882e-05, "loss": 3.641, "num_input_tokens_seen": 2360512, "step": 1945 }, { "epoch": 0.21717340461075843, "grad_norm": 1.0052776336669922, "learning_rate": 1.0853101681701749e-05, "loss": 3.7348, "num_input_tokens_seen": 2367104, "step": 1950 }, { "epoch": 0.21773025949437577, "grad_norm": 1.0818160772323608, "learning_rate": 1.0880944425882616e-05, "loss": 3.3338, "num_input_tokens_seen": 2372992, "step": 1955 }, { "epoch": 0.21828711437799309, "grad_norm": 1.2439230680465698, "learning_rate": 1.0908787170063483e-05, "loss": 3.5329, "num_input_tokens_seen": 2378624, "step": 1960 }, { "epoch": 0.21884396926161043, "grad_norm": 1.5063925981521606, "learning_rate": 1.0936629914244348e-05, "loss": 3.5693, "num_input_tokens_seen": 2384896, "step": 1965 }, { "epoch": 0.21940082414522774, "grad_norm": 1.132888674736023, "learning_rate": 1.0964472658425215e-05, "loss": 3.4242, "num_input_tokens_seen": 2391072, "step": 1970 }, { "epoch": 0.21995767902884508, "grad_norm": 1.2218122482299805, "learning_rate": 1.0992315402606082e-05, "loss": 3.5293, "num_input_tokens_seen": 2397184, "step": 1975 }, { "epoch": 0.22051453391246242, "grad_norm": 1.127777338027954, "learning_rate": 1.1020158146786947e-05, "loss": 3.4654, "num_input_tokens_seen": 2403392, "step": 1980 }, { "epoch": 0.22107138879607974, "grad_norm": 1.3829678297042847, "learning_rate": 1.1048000890967814e-05, "loss": 3.4228, "num_input_tokens_seen": 2409376, "step": 1985 }, { "epoch": 0.22162824367969708, "grad_norm": 1.026633381843567, "learning_rate": 1.1075843635148681e-05, "loss": 3.262, "num_input_tokens_seen": 2415296, "step": 1990 }, { "epoch": 0.2221850985633144, "grad_norm": 1.2849223613739014, "learning_rate": 1.1103686379329548e-05, "loss": 3.1424, "num_input_tokens_seen": 2421088, "step": 1995 }, { "epoch": 0.22274195344693173, "grad_norm": 1.0453261137008667, "learning_rate": 1.1131529123510414e-05, "loss": 3.3245, "num_input_tokens_seen": 2427072, "step": 2000 }, { "epoch": 0.22329880833054905, "grad_norm": 1.1158897876739502, "learning_rate": 1.115937186769128e-05, "loss": 3.32, "num_input_tokens_seen": 2433152, "step": 2005 }, { "epoch": 0.2238556632141664, "grad_norm": 1.0809091329574585, "learning_rate": 1.1187214611872146e-05, "loss": 3.1876, "num_input_tokens_seen": 2439072, "step": 2010 }, { "epoch": 0.22441251809778373, "grad_norm": 1.0237963199615479, "learning_rate": 1.1215057356053013e-05, "loss": 3.2422, "num_input_tokens_seen": 2444704, "step": 2015 }, { "epoch": 0.22496937298140104, "grad_norm": 1.017773985862732, "learning_rate": 1.124290010023388e-05, "loss": 3.1061, "num_input_tokens_seen": 2450944, "step": 2020 }, { "epoch": 0.22552622786501839, "grad_norm": 1.1687920093536377, "learning_rate": 1.1270742844414747e-05, "loss": 3.1741, "num_input_tokens_seen": 2457184, "step": 2025 }, { "epoch": 0.2260830827486357, "grad_norm": 1.0770233869552612, "learning_rate": 1.1298585588595614e-05, "loss": 3.0734, "num_input_tokens_seen": 2463712, "step": 2030 }, { "epoch": 0.22663993763225304, "grad_norm": 1.08290433883667, "learning_rate": 1.1326428332776479e-05, "loss": 2.9782, "num_input_tokens_seen": 2469792, "step": 2035 }, { "epoch": 0.22719679251587035, "grad_norm": 0.9947497248649597, "learning_rate": 1.1354271076957346e-05, "loss": 2.8792, "num_input_tokens_seen": 2475936, "step": 2040 }, { "epoch": 0.2277536473994877, "grad_norm": 1.009564757347107, "learning_rate": 1.1382113821138211e-05, "loss": 2.8458, "num_input_tokens_seen": 2481952, "step": 2045 }, { "epoch": 0.228310502283105, "grad_norm": 1.2695235013961792, "learning_rate": 1.1409956565319078e-05, "loss": 2.9741, "num_input_tokens_seen": 2487552, "step": 2050 }, { "epoch": 0.22886735716672235, "grad_norm": 1.0563592910766602, "learning_rate": 1.1437799309499945e-05, "loss": 3.0829, "num_input_tokens_seen": 2493856, "step": 2055 }, { "epoch": 0.2294242120503397, "grad_norm": 1.107713222503662, "learning_rate": 1.1465642053680812e-05, "loss": 2.7415, "num_input_tokens_seen": 2499264, "step": 2060 }, { "epoch": 0.229981066933957, "grad_norm": 1.0566704273223877, "learning_rate": 1.1493484797861679e-05, "loss": 3.047, "num_input_tokens_seen": 2505088, "step": 2065 }, { "epoch": 0.23053792181757435, "grad_norm": 1.1893231868743896, "learning_rate": 1.1521327542042544e-05, "loss": 2.978, "num_input_tokens_seen": 2511456, "step": 2070 }, { "epoch": 0.23109477670119166, "grad_norm": 1.0787843465805054, "learning_rate": 1.1549170286223411e-05, "loss": 2.8337, "num_input_tokens_seen": 2517568, "step": 2075 }, { "epoch": 0.231651631584809, "grad_norm": 1.148560643196106, "learning_rate": 1.1577013030404276e-05, "loss": 2.7897, "num_input_tokens_seen": 2523968, "step": 2080 }, { "epoch": 0.23220848646842632, "grad_norm": 1.0116441249847412, "learning_rate": 1.1604855774585143e-05, "loss": 2.6835, "num_input_tokens_seen": 2529792, "step": 2085 }, { "epoch": 0.23276534135204366, "grad_norm": 1.009239912033081, "learning_rate": 1.1632698518766009e-05, "loss": 2.7096, "num_input_tokens_seen": 2535648, "step": 2090 }, { "epoch": 0.233322196235661, "grad_norm": 1.116148591041565, "learning_rate": 1.1660541262946877e-05, "loss": 2.7069, "num_input_tokens_seen": 2541792, "step": 2095 }, { "epoch": 0.23387905111927831, "grad_norm": 1.0033038854599, "learning_rate": 1.1688384007127744e-05, "loss": 2.5292, "num_input_tokens_seen": 2547936, "step": 2100 }, { "epoch": 0.23443590600289566, "grad_norm": 1.0970187187194824, "learning_rate": 1.171622675130861e-05, "loss": 2.865, "num_input_tokens_seen": 2553824, "step": 2105 }, { "epoch": 0.23499276088651297, "grad_norm": 0.9939842224121094, "learning_rate": 1.1744069495489477e-05, "loss": 2.7096, "num_input_tokens_seen": 2559744, "step": 2110 }, { "epoch": 0.2355496157701303, "grad_norm": 1.0421687364578247, "learning_rate": 1.1771912239670342e-05, "loss": 2.6623, "num_input_tokens_seen": 2565696, "step": 2115 }, { "epoch": 0.23610647065374762, "grad_norm": 1.0140630006790161, "learning_rate": 1.1799754983851209e-05, "loss": 2.629, "num_input_tokens_seen": 2572096, "step": 2120 }, { "epoch": 0.23666332553736497, "grad_norm": 0.9903172850608826, "learning_rate": 1.1827597728032076e-05, "loss": 2.4531, "num_input_tokens_seen": 2578528, "step": 2125 }, { "epoch": 0.23722018042098228, "grad_norm": 1.356926441192627, "learning_rate": 1.1855440472212943e-05, "loss": 2.5107, "num_input_tokens_seen": 2584704, "step": 2130 }, { "epoch": 0.23777703530459962, "grad_norm": 0.9249180555343628, "learning_rate": 1.1883283216393808e-05, "loss": 2.5087, "num_input_tokens_seen": 2590464, "step": 2135 }, { "epoch": 0.23833389018821696, "grad_norm": 1.0006029605865479, "learning_rate": 1.1911125960574675e-05, "loss": 2.2466, "num_input_tokens_seen": 2596672, "step": 2140 }, { "epoch": 0.23889074507183428, "grad_norm": 0.9545153379440308, "learning_rate": 1.1938968704755542e-05, "loss": 2.2626, "num_input_tokens_seen": 2602752, "step": 2145 }, { "epoch": 0.23944759995545162, "grad_norm": 1.169034481048584, "learning_rate": 1.1966811448936407e-05, "loss": 2.419, "num_input_tokens_seen": 2608608, "step": 2150 }, { "epoch": 0.24000445483906893, "grad_norm": 1.0103627443313599, "learning_rate": 1.1994654193117274e-05, "loss": 2.2975, "num_input_tokens_seen": 2614592, "step": 2155 }, { "epoch": 0.24056130972268627, "grad_norm": 1.1529446840286255, "learning_rate": 1.2022496937298141e-05, "loss": 2.5132, "num_input_tokens_seen": 2620672, "step": 2160 }, { "epoch": 0.2411181646063036, "grad_norm": 1.029642939567566, "learning_rate": 1.2050339681479008e-05, "loss": 2.2377, "num_input_tokens_seen": 2626688, "step": 2165 }, { "epoch": 0.24167501948992093, "grad_norm": 0.9792430996894836, "learning_rate": 1.2078182425659873e-05, "loss": 2.4582, "num_input_tokens_seen": 2631840, "step": 2170 }, { "epoch": 0.24223187437353827, "grad_norm": 1.135970115661621, "learning_rate": 1.210602516984074e-05, "loss": 2.1969, "num_input_tokens_seen": 2637952, "step": 2175 }, { "epoch": 0.24278872925715558, "grad_norm": 0.8302116990089417, "learning_rate": 1.2133867914021607e-05, "loss": 2.248, "num_input_tokens_seen": 2644032, "step": 2180 }, { "epoch": 0.24334558414077292, "grad_norm": 0.9894834756851196, "learning_rate": 1.2161710658202473e-05, "loss": 2.0713, "num_input_tokens_seen": 2650016, "step": 2185 }, { "epoch": 0.24390243902439024, "grad_norm": 1.0682622194290161, "learning_rate": 1.218955340238334e-05, "loss": 2.1968, "num_input_tokens_seen": 2656288, "step": 2190 }, { "epoch": 0.24445929390800758, "grad_norm": 1.0086075067520142, "learning_rate": 1.2217396146564206e-05, "loss": 2.0552, "num_input_tokens_seen": 2662528, "step": 2195 }, { "epoch": 0.2450161487916249, "grad_norm": 0.9726009964942932, "learning_rate": 1.2245238890745073e-05, "loss": 1.9602, "num_input_tokens_seen": 2668672, "step": 2200 }, { "epoch": 0.24557300367524224, "grad_norm": 1.2705564498901367, "learning_rate": 1.2273081634925939e-05, "loss": 2.2563, "num_input_tokens_seen": 2674560, "step": 2205 }, { "epoch": 0.24612985855885955, "grad_norm": 0.9949143528938293, "learning_rate": 1.2300924379106806e-05, "loss": 2.2437, "num_input_tokens_seen": 2680800, "step": 2210 }, { "epoch": 0.2466867134424769, "grad_norm": 1.3042817115783691, "learning_rate": 1.2328767123287671e-05, "loss": 2.2131, "num_input_tokens_seen": 2686688, "step": 2215 }, { "epoch": 0.24724356832609423, "grad_norm": 0.9578734636306763, "learning_rate": 1.2356609867468538e-05, "loss": 2.1812, "num_input_tokens_seen": 2692544, "step": 2220 }, { "epoch": 0.24780042320971155, "grad_norm": 1.0420918464660645, "learning_rate": 1.2384452611649405e-05, "loss": 2.0942, "num_input_tokens_seen": 2698848, "step": 2225 }, { "epoch": 0.2483572780933289, "grad_norm": 0.9387763738632202, "learning_rate": 1.2412295355830272e-05, "loss": 2.0916, "num_input_tokens_seen": 2704896, "step": 2230 }, { "epoch": 0.2489141329769462, "grad_norm": 0.9800372123718262, "learning_rate": 1.2440138100011139e-05, "loss": 2.0008, "num_input_tokens_seen": 2711136, "step": 2235 }, { "epoch": 0.24947098786056354, "grad_norm": 1.0355908870697021, "learning_rate": 1.2467980844192004e-05, "loss": 2.0112, "num_input_tokens_seen": 2717120, "step": 2240 }, { "epoch": 0.2500278427441809, "grad_norm": 0.9208986759185791, "learning_rate": 1.2495823588372871e-05, "loss": 1.8565, "num_input_tokens_seen": 2723136, "step": 2245 }, { "epoch": 0.25058469762779817, "grad_norm": 1.2817115783691406, "learning_rate": 1.2523666332553738e-05, "loss": 2.0781, "num_input_tokens_seen": 2729216, "step": 2250 }, { "epoch": 0.2511415525114155, "grad_norm": 1.1769691705703735, "learning_rate": 1.2551509076734602e-05, "loss": 1.8651, "num_input_tokens_seen": 2735488, "step": 2255 }, { "epoch": 0.25169840739503285, "grad_norm": 0.9843739867210388, "learning_rate": 1.2579351820915468e-05, "loss": 1.7427, "num_input_tokens_seen": 2741632, "step": 2260 }, { "epoch": 0.2522552622786502, "grad_norm": 0.864913284778595, "learning_rate": 1.2607194565096337e-05, "loss": 1.809, "num_input_tokens_seen": 2747648, "step": 2265 }, { "epoch": 0.25281211716226754, "grad_norm": 0.8927803635597229, "learning_rate": 1.2635037309277204e-05, "loss": 1.8918, "num_input_tokens_seen": 2753952, "step": 2270 }, { "epoch": 0.2533689720458848, "grad_norm": 0.92397540807724, "learning_rate": 1.2662880053458071e-05, "loss": 1.8691, "num_input_tokens_seen": 2759904, "step": 2275 }, { "epoch": 0.25392582692950216, "grad_norm": 0.9828578233718872, "learning_rate": 1.2690722797638935e-05, "loss": 1.8684, "num_input_tokens_seen": 2766112, "step": 2280 }, { "epoch": 0.2544826818131195, "grad_norm": 1.1089473962783813, "learning_rate": 1.2718565541819802e-05, "loss": 1.9486, "num_input_tokens_seen": 2772160, "step": 2285 }, { "epoch": 0.25503953669673685, "grad_norm": 1.0996496677398682, "learning_rate": 1.2746408286000669e-05, "loss": 1.8176, "num_input_tokens_seen": 2778496, "step": 2290 }, { "epoch": 0.25559639158035413, "grad_norm": 1.0029692649841309, "learning_rate": 1.2774251030181536e-05, "loss": 1.7418, "num_input_tokens_seen": 2784640, "step": 2295 }, { "epoch": 0.2561532464639715, "grad_norm": 1.0896837711334229, "learning_rate": 1.2802093774362402e-05, "loss": 1.7612, "num_input_tokens_seen": 2790816, "step": 2300 }, { "epoch": 0.2567101013475888, "grad_norm": 0.9020825028419495, "learning_rate": 1.2829936518543268e-05, "loss": 1.7592, "num_input_tokens_seen": 2796896, "step": 2305 }, { "epoch": 0.25726695623120616, "grad_norm": 1.012915015220642, "learning_rate": 1.2857779262724135e-05, "loss": 1.5834, "num_input_tokens_seen": 2802272, "step": 2310 }, { "epoch": 0.2578238111148235, "grad_norm": 0.9291650652885437, "learning_rate": 1.2885622006905002e-05, "loss": 1.8854, "num_input_tokens_seen": 2808352, "step": 2315 }, { "epoch": 0.2583806659984408, "grad_norm": 0.8879285454750061, "learning_rate": 1.2913464751085869e-05, "loss": 1.7033, "num_input_tokens_seen": 2814784, "step": 2320 }, { "epoch": 0.2589375208820581, "grad_norm": 1.0774869918823242, "learning_rate": 1.2941307495266732e-05, "loss": 1.7211, "num_input_tokens_seen": 2820960, "step": 2325 }, { "epoch": 0.25949437576567547, "grad_norm": 0.8904101252555847, "learning_rate": 1.29691502394476e-05, "loss": 1.6995, "num_input_tokens_seen": 2827168, "step": 2330 }, { "epoch": 0.2600512306492928, "grad_norm": 0.9027001857757568, "learning_rate": 1.2996992983628468e-05, "loss": 1.6032, "num_input_tokens_seen": 2833248, "step": 2335 }, { "epoch": 0.26060808553291015, "grad_norm": 0.9444562196731567, "learning_rate": 1.3024835727809335e-05, "loss": 1.7904, "num_input_tokens_seen": 2839360, "step": 2340 }, { "epoch": 0.26116494041652744, "grad_norm": 1.1698534488677979, "learning_rate": 1.3052678471990202e-05, "loss": 1.6725, "num_input_tokens_seen": 2845472, "step": 2345 }, { "epoch": 0.2617217953001448, "grad_norm": 0.797822117805481, "learning_rate": 1.3080521216171065e-05, "loss": 1.7018, "num_input_tokens_seen": 2851712, "step": 2350 }, { "epoch": 0.2622786501837621, "grad_norm": 0.8720460534095764, "learning_rate": 1.3108363960351932e-05, "loss": 1.4606, "num_input_tokens_seen": 2857952, "step": 2355 }, { "epoch": 0.26283550506737946, "grad_norm": 0.9254328608512878, "learning_rate": 1.31362067045328e-05, "loss": 1.6678, "num_input_tokens_seen": 2864128, "step": 2360 }, { "epoch": 0.26339235995099675, "grad_norm": 1.1368076801300049, "learning_rate": 1.3164049448713666e-05, "loss": 1.759, "num_input_tokens_seen": 2870336, "step": 2365 }, { "epoch": 0.2639492148346141, "grad_norm": 0.8126136064529419, "learning_rate": 1.3191892192894531e-05, "loss": 1.5932, "num_input_tokens_seen": 2876288, "step": 2370 }, { "epoch": 0.26450606971823143, "grad_norm": 1.056198239326477, "learning_rate": 1.3219734937075398e-05, "loss": 1.4847, "num_input_tokens_seen": 2882400, "step": 2375 }, { "epoch": 0.26506292460184877, "grad_norm": 0.9477003812789917, "learning_rate": 1.3247577681256265e-05, "loss": 1.4544, "num_input_tokens_seen": 2888768, "step": 2380 }, { "epoch": 0.2656197794854661, "grad_norm": 1.0282378196716309, "learning_rate": 1.3275420425437132e-05, "loss": 1.5221, "num_input_tokens_seen": 2894496, "step": 2385 }, { "epoch": 0.2661766343690834, "grad_norm": 1.1930720806121826, "learning_rate": 1.3303263169618e-05, "loss": 1.4134, "num_input_tokens_seen": 2900608, "step": 2390 }, { "epoch": 0.26673348925270074, "grad_norm": 1.029958963394165, "learning_rate": 1.3331105913798863e-05, "loss": 1.5426, "num_input_tokens_seen": 2906848, "step": 2395 }, { "epoch": 0.2672903441363181, "grad_norm": 0.9583202600479126, "learning_rate": 1.335894865797973e-05, "loss": 1.3752, "num_input_tokens_seen": 2912864, "step": 2400 }, { "epoch": 0.2678471990199354, "grad_norm": 1.1974016427993774, "learning_rate": 1.3386791402160599e-05, "loss": 1.5297, "num_input_tokens_seen": 2919456, "step": 2405 }, { "epoch": 0.2684040539035527, "grad_norm": 0.7798573970794678, "learning_rate": 1.3414634146341466e-05, "loss": 1.5465, "num_input_tokens_seen": 2925632, "step": 2410 }, { "epoch": 0.26896090878717005, "grad_norm": 0.7921409606933594, "learning_rate": 1.3442476890522329e-05, "loss": 1.4288, "num_input_tokens_seen": 2931648, "step": 2415 }, { "epoch": 0.2695177636707874, "grad_norm": 0.8883889317512512, "learning_rate": 1.3470319634703196e-05, "loss": 1.4494, "num_input_tokens_seen": 2937632, "step": 2420 }, { "epoch": 0.27007461855440473, "grad_norm": 0.8213306665420532, "learning_rate": 1.3498162378884063e-05, "loss": 1.4168, "num_input_tokens_seen": 2943488, "step": 2425 }, { "epoch": 0.2706314734380221, "grad_norm": 0.8741083145141602, "learning_rate": 1.352600512306493e-05, "loss": 1.4122, "num_input_tokens_seen": 2949376, "step": 2430 }, { "epoch": 0.27118832832163936, "grad_norm": 0.7362812757492065, "learning_rate": 1.3553847867245797e-05, "loss": 1.3403, "num_input_tokens_seen": 2955008, "step": 2435 }, { "epoch": 0.2717451832052567, "grad_norm": 1.0260258913040161, "learning_rate": 1.3581690611426662e-05, "loss": 1.3679, "num_input_tokens_seen": 2961408, "step": 2440 }, { "epoch": 0.27230203808887404, "grad_norm": 0.9425215721130371, "learning_rate": 1.360953335560753e-05, "loss": 1.2671, "num_input_tokens_seen": 2967264, "step": 2445 }, { "epoch": 0.2728588929724914, "grad_norm": 0.6412781476974487, "learning_rate": 1.3637376099788396e-05, "loss": 1.1747, "num_input_tokens_seen": 2973408, "step": 2450 }, { "epoch": 0.27341574785610867, "grad_norm": 1.1088789701461792, "learning_rate": 1.3665218843969263e-05, "loss": 1.3011, "num_input_tokens_seen": 2979392, "step": 2455 }, { "epoch": 0.273972602739726, "grad_norm": 0.8271797299385071, "learning_rate": 1.369306158815013e-05, "loss": 1.2663, "num_input_tokens_seen": 2985440, "step": 2460 }, { "epoch": 0.27452945762334335, "grad_norm": 0.9087303280830383, "learning_rate": 1.3720904332330994e-05, "loss": 1.3726, "num_input_tokens_seen": 2991456, "step": 2465 }, { "epoch": 0.2750863125069607, "grad_norm": 0.8108103275299072, "learning_rate": 1.3748747076511862e-05, "loss": 1.2141, "num_input_tokens_seen": 2997600, "step": 2470 }, { "epoch": 0.27564316739057804, "grad_norm": 0.9404833912849426, "learning_rate": 1.377658982069273e-05, "loss": 1.2913, "num_input_tokens_seen": 3003488, "step": 2475 }, { "epoch": 0.2762000222741953, "grad_norm": 0.7948901057243347, "learning_rate": 1.3804432564873596e-05, "loss": 1.1991, "num_input_tokens_seen": 3009760, "step": 2480 }, { "epoch": 0.27675687715781266, "grad_norm": 0.7272253632545471, "learning_rate": 1.383227530905446e-05, "loss": 1.3239, "num_input_tokens_seen": 3016096, "step": 2485 }, { "epoch": 0.27731373204143, "grad_norm": 0.6465078592300415, "learning_rate": 1.3860118053235327e-05, "loss": 1.2834, "num_input_tokens_seen": 3022048, "step": 2490 }, { "epoch": 0.27787058692504735, "grad_norm": 1.0849320888519287, "learning_rate": 1.3887960797416194e-05, "loss": 1.2544, "num_input_tokens_seen": 3028256, "step": 2495 }, { "epoch": 0.2784274418086647, "grad_norm": 0.9729897975921631, "learning_rate": 1.391580354159706e-05, "loss": 1.1802, "num_input_tokens_seen": 3034560, "step": 2500 }, { "epoch": 0.278984296692282, "grad_norm": 0.7948745489120483, "learning_rate": 1.3943646285777928e-05, "loss": 1.2228, "num_input_tokens_seen": 3040416, "step": 2505 }, { "epoch": 0.2795411515758993, "grad_norm": 1.0307199954986572, "learning_rate": 1.3971489029958793e-05, "loss": 1.237, "num_input_tokens_seen": 3046624, "step": 2510 }, { "epoch": 0.28009800645951666, "grad_norm": 0.8080379962921143, "learning_rate": 1.399933177413966e-05, "loss": 1.3357, "num_input_tokens_seen": 3052960, "step": 2515 }, { "epoch": 0.280654861343134, "grad_norm": 0.8540215492248535, "learning_rate": 1.4027174518320527e-05, "loss": 1.2693, "num_input_tokens_seen": 3059232, "step": 2520 }, { "epoch": 0.2812117162267513, "grad_norm": 0.6874392628669739, "learning_rate": 1.4055017262501394e-05, "loss": 1.1381, "num_input_tokens_seen": 3065472, "step": 2525 }, { "epoch": 0.2817685711103686, "grad_norm": 0.6622824668884277, "learning_rate": 1.4082860006682257e-05, "loss": 1.248, "num_input_tokens_seen": 3071488, "step": 2530 }, { "epoch": 0.28232542599398597, "grad_norm": 0.6634848117828369, "learning_rate": 1.4110702750863124e-05, "loss": 1.1569, "num_input_tokens_seen": 3076800, "step": 2535 }, { "epoch": 0.2828822808776033, "grad_norm": 0.7419052720069885, "learning_rate": 1.4138545495043993e-05, "loss": 1.203, "num_input_tokens_seen": 3082944, "step": 2540 }, { "epoch": 0.28343913576122065, "grad_norm": 0.766862690448761, "learning_rate": 1.416638823922486e-05, "loss": 1.1188, "num_input_tokens_seen": 3088896, "step": 2545 }, { "epoch": 0.28399599064483794, "grad_norm": 0.6743480563163757, "learning_rate": 1.4194230983405727e-05, "loss": 1.138, "num_input_tokens_seen": 3095008, "step": 2550 }, { "epoch": 0.2845528455284553, "grad_norm": 0.6835014820098877, "learning_rate": 1.422207372758659e-05, "loss": 1.2359, "num_input_tokens_seen": 3100992, "step": 2555 }, { "epoch": 0.2851097004120726, "grad_norm": 0.8756155967712402, "learning_rate": 1.4249916471767457e-05, "loss": 1.2863, "num_input_tokens_seen": 3107168, "step": 2560 }, { "epoch": 0.28566655529568996, "grad_norm": 0.9893397092819214, "learning_rate": 1.4277759215948324e-05, "loss": 1.2783, "num_input_tokens_seen": 3112544, "step": 2565 }, { "epoch": 0.28622341017930725, "grad_norm": 1.057767629623413, "learning_rate": 1.4305601960129191e-05, "loss": 1.2864, "num_input_tokens_seen": 3118944, "step": 2570 }, { "epoch": 0.2867802650629246, "grad_norm": 0.7238280177116394, "learning_rate": 1.4333444704310057e-05, "loss": 1.1447, "num_input_tokens_seen": 3125152, "step": 2575 }, { "epoch": 0.28733711994654193, "grad_norm": 1.331809163093567, "learning_rate": 1.4361287448490924e-05, "loss": 1.1128, "num_input_tokens_seen": 3131104, "step": 2580 }, { "epoch": 0.28789397483015927, "grad_norm": 0.6798022389411926, "learning_rate": 1.438913019267179e-05, "loss": 1.115, "num_input_tokens_seen": 3136928, "step": 2585 }, { "epoch": 0.2884508297137766, "grad_norm": 0.7632676362991333, "learning_rate": 1.4416972936852658e-05, "loss": 1.127, "num_input_tokens_seen": 3143072, "step": 2590 }, { "epoch": 0.2890076845973939, "grad_norm": 0.8338603377342224, "learning_rate": 1.4444815681033524e-05, "loss": 1.0185, "num_input_tokens_seen": 3148960, "step": 2595 }, { "epoch": 0.28956453948101124, "grad_norm": 0.8562396764755249, "learning_rate": 1.4472658425214388e-05, "loss": 1.1372, "num_input_tokens_seen": 3154720, "step": 2600 }, { "epoch": 0.2901213943646286, "grad_norm": 0.9219415783882141, "learning_rate": 1.4500501169395255e-05, "loss": 1.0899, "num_input_tokens_seen": 3160960, "step": 2605 }, { "epoch": 0.2906782492482459, "grad_norm": 0.8553642630577087, "learning_rate": 1.4528343913576124e-05, "loss": 1.1317, "num_input_tokens_seen": 3166912, "step": 2610 }, { "epoch": 0.2912351041318632, "grad_norm": 0.7650666236877441, "learning_rate": 1.455618665775699e-05, "loss": 1.1533, "num_input_tokens_seen": 3173216, "step": 2615 }, { "epoch": 0.29179195901548055, "grad_norm": 0.7854198813438416, "learning_rate": 1.4584029401937858e-05, "loss": 1.1153, "num_input_tokens_seen": 3179392, "step": 2620 }, { "epoch": 0.2923488138990979, "grad_norm": 1.262644648551941, "learning_rate": 1.4611872146118721e-05, "loss": 1.196, "num_input_tokens_seen": 3185536, "step": 2625 }, { "epoch": 0.29290566878271523, "grad_norm": 1.1299797296524048, "learning_rate": 1.4639714890299588e-05, "loss": 1.1637, "num_input_tokens_seen": 3191648, "step": 2630 }, { "epoch": 0.2934625236663326, "grad_norm": 0.7247252464294434, "learning_rate": 1.4667557634480455e-05, "loss": 1.1056, "num_input_tokens_seen": 3197760, "step": 2635 }, { "epoch": 0.29401937854994986, "grad_norm": 0.9591662287712097, "learning_rate": 1.4695400378661322e-05, "loss": 1.1632, "num_input_tokens_seen": 3203840, "step": 2640 }, { "epoch": 0.2945762334335672, "grad_norm": 0.7036020755767822, "learning_rate": 1.4723243122842187e-05, "loss": 1.0432, "num_input_tokens_seen": 3209920, "step": 2645 }, { "epoch": 0.29513308831718454, "grad_norm": 0.8827358484268188, "learning_rate": 1.4751085867023054e-05, "loss": 1.0339, "num_input_tokens_seen": 3216256, "step": 2650 }, { "epoch": 0.2956899432008019, "grad_norm": 0.939228355884552, "learning_rate": 1.4778928611203921e-05, "loss": 1.0533, "num_input_tokens_seen": 3222336, "step": 2655 }, { "epoch": 0.2962467980844192, "grad_norm": 0.8224652409553528, "learning_rate": 1.4806771355384788e-05, "loss": 1.0802, "num_input_tokens_seen": 3228320, "step": 2660 }, { "epoch": 0.2968036529680365, "grad_norm": 0.780156135559082, "learning_rate": 1.4834614099565655e-05, "loss": 0.9778, "num_input_tokens_seen": 3234304, "step": 2665 }, { "epoch": 0.29736050785165385, "grad_norm": 0.9237071871757507, "learning_rate": 1.4862456843746519e-05, "loss": 1.0759, "num_input_tokens_seen": 3240576, "step": 2670 }, { "epoch": 0.2979173627352712, "grad_norm": 0.7696670293807983, "learning_rate": 1.4890299587927386e-05, "loss": 1.0677, "num_input_tokens_seen": 3246336, "step": 2675 }, { "epoch": 0.29847421761888854, "grad_norm": 0.6582838296890259, "learning_rate": 1.4918142332108254e-05, "loss": 0.9302, "num_input_tokens_seen": 3252224, "step": 2680 }, { "epoch": 0.2990310725025058, "grad_norm": 0.5873396396636963, "learning_rate": 1.4945985076289121e-05, "loss": 0.9442, "num_input_tokens_seen": 3258336, "step": 2685 }, { "epoch": 0.29958792738612317, "grad_norm": 1.159311056137085, "learning_rate": 1.4973827820469985e-05, "loss": 1.0515, "num_input_tokens_seen": 3264000, "step": 2690 }, { "epoch": 0.3001447822697405, "grad_norm": 0.747973620891571, "learning_rate": 1.5001670564650852e-05, "loss": 1.0262, "num_input_tokens_seen": 3270112, "step": 2695 }, { "epoch": 0.30070163715335785, "grad_norm": 0.7253919839859009, "learning_rate": 1.5029513308831719e-05, "loss": 0.9773, "num_input_tokens_seen": 3276384, "step": 2700 }, { "epoch": 0.3012584920369752, "grad_norm": 0.7728376388549805, "learning_rate": 1.5057356053012586e-05, "loss": 0.9767, "num_input_tokens_seen": 3281984, "step": 2705 }, { "epoch": 0.3018153469205925, "grad_norm": 0.9778435230255127, "learning_rate": 1.5085198797193453e-05, "loss": 1.0309, "num_input_tokens_seen": 3287936, "step": 2710 }, { "epoch": 0.3023722018042098, "grad_norm": 1.0216442346572876, "learning_rate": 1.5113041541374318e-05, "loss": 0.968, "num_input_tokens_seen": 3293952, "step": 2715 }, { "epoch": 0.30292905668782716, "grad_norm": 0.7602256536483765, "learning_rate": 1.5140884285555185e-05, "loss": 1.0029, "num_input_tokens_seen": 3299744, "step": 2720 }, { "epoch": 0.3034859115714445, "grad_norm": 0.6576650738716125, "learning_rate": 1.5168727029736052e-05, "loss": 1.0187, "num_input_tokens_seen": 3306048, "step": 2725 }, { "epoch": 0.3040427664550618, "grad_norm": 0.8443461656570435, "learning_rate": 1.5196569773916919e-05, "loss": 0.8537, "num_input_tokens_seen": 3311968, "step": 2730 }, { "epoch": 0.3045996213386791, "grad_norm": 0.6261522173881531, "learning_rate": 1.5224412518097783e-05, "loss": 0.929, "num_input_tokens_seen": 3317856, "step": 2735 }, { "epoch": 0.30515647622229647, "grad_norm": 0.9835910201072693, "learning_rate": 1.525225526227865e-05, "loss": 1.013, "num_input_tokens_seen": 3323904, "step": 2740 }, { "epoch": 0.3057133311059138, "grad_norm": 0.8550453782081604, "learning_rate": 1.5280098006459518e-05, "loss": 0.8988, "num_input_tokens_seen": 3329824, "step": 2745 }, { "epoch": 0.30627018598953115, "grad_norm": 0.8622199892997742, "learning_rate": 1.5307940750640383e-05, "loss": 0.9689, "num_input_tokens_seen": 3336160, "step": 2750 }, { "epoch": 0.30682704087314844, "grad_norm": 0.7270780801773071, "learning_rate": 1.5335783494821252e-05, "loss": 0.8031, "num_input_tokens_seen": 3341888, "step": 2755 }, { "epoch": 0.3073838957567658, "grad_norm": 0.7092320322990417, "learning_rate": 1.5363626239002117e-05, "loss": 0.9622, "num_input_tokens_seen": 3347968, "step": 2760 }, { "epoch": 0.3079407506403831, "grad_norm": 0.7292302846908569, "learning_rate": 1.5391468983182983e-05, "loss": 0.9317, "num_input_tokens_seen": 3354016, "step": 2765 }, { "epoch": 0.30849760552400046, "grad_norm": 1.2052135467529297, "learning_rate": 1.541931172736385e-05, "loss": 0.9629, "num_input_tokens_seen": 3360384, "step": 2770 }, { "epoch": 0.30905446040761775, "grad_norm": 1.1028984785079956, "learning_rate": 1.5447154471544717e-05, "loss": 0.9343, "num_input_tokens_seen": 3366400, "step": 2775 }, { "epoch": 0.3096113152912351, "grad_norm": 0.8676727414131165, "learning_rate": 1.5474997215725585e-05, "loss": 0.9724, "num_input_tokens_seen": 3372704, "step": 2780 }, { "epoch": 0.31016817017485243, "grad_norm": 0.7946200370788574, "learning_rate": 1.5502839959906447e-05, "loss": 0.8842, "num_input_tokens_seen": 3378752, "step": 2785 }, { "epoch": 0.3107250250584698, "grad_norm": 0.9702312350273132, "learning_rate": 1.5530682704087316e-05, "loss": 0.9027, "num_input_tokens_seen": 3384992, "step": 2790 }, { "epoch": 0.3112818799420871, "grad_norm": 0.5389456748962402, "learning_rate": 1.555852544826818e-05, "loss": 0.8621, "num_input_tokens_seen": 3390560, "step": 2795 }, { "epoch": 0.3118387348257044, "grad_norm": 0.888525664806366, "learning_rate": 1.558636819244905e-05, "loss": 0.9088, "num_input_tokens_seen": 3396928, "step": 2800 }, { "epoch": 0.31239558970932174, "grad_norm": 0.9244258999824524, "learning_rate": 1.5614210936629915e-05, "loss": 0.9203, "num_input_tokens_seen": 3403328, "step": 2805 }, { "epoch": 0.3129524445929391, "grad_norm": 0.8504070043563843, "learning_rate": 1.564205368081078e-05, "loss": 0.8586, "num_input_tokens_seen": 3408704, "step": 2810 }, { "epoch": 0.3135092994765564, "grad_norm": 1.2111321687698364, "learning_rate": 1.566989642499165e-05, "loss": 0.848, "num_input_tokens_seen": 3414592, "step": 2815 }, { "epoch": 0.31406615436017377, "grad_norm": 0.6355211734771729, "learning_rate": 1.5697739169172514e-05, "loss": 0.7799, "num_input_tokens_seen": 3420640, "step": 2820 }, { "epoch": 0.31462300924379105, "grad_norm": 0.6604866981506348, "learning_rate": 1.5725581913353383e-05, "loss": 0.9545, "num_input_tokens_seen": 3427104, "step": 2825 }, { "epoch": 0.3151798641274084, "grad_norm": 1.116936206817627, "learning_rate": 1.5753424657534248e-05, "loss": 0.8541, "num_input_tokens_seen": 3433280, "step": 2830 }, { "epoch": 0.31573671901102573, "grad_norm": 0.8128998875617981, "learning_rate": 1.5781267401715113e-05, "loss": 0.9663, "num_input_tokens_seen": 3439360, "step": 2835 }, { "epoch": 0.3162935738946431, "grad_norm": 0.6958063244819641, "learning_rate": 1.5809110145895982e-05, "loss": 0.8721, "num_input_tokens_seen": 3445600, "step": 2840 }, { "epoch": 0.31685042877826036, "grad_norm": 0.7604252099990845, "learning_rate": 1.5836952890076847e-05, "loss": 1.0087, "num_input_tokens_seen": 3451744, "step": 2845 }, { "epoch": 0.3174072836618777, "grad_norm": 0.833640992641449, "learning_rate": 1.5864795634257712e-05, "loss": 1.0019, "num_input_tokens_seen": 3457984, "step": 2850 }, { "epoch": 0.31796413854549505, "grad_norm": 0.8361174464225769, "learning_rate": 1.5892638378438578e-05, "loss": 0.8075, "num_input_tokens_seen": 3464192, "step": 2855 }, { "epoch": 0.3185209934291124, "grad_norm": 0.6933679580688477, "learning_rate": 1.5920481122619446e-05, "loss": 0.8925, "num_input_tokens_seen": 3470592, "step": 2860 }, { "epoch": 0.31907784831272973, "grad_norm": 1.1433703899383545, "learning_rate": 1.594832386680031e-05, "loss": 0.8689, "num_input_tokens_seen": 3476512, "step": 2865 }, { "epoch": 0.319634703196347, "grad_norm": 0.7552950978279114, "learning_rate": 1.597616661098118e-05, "loss": 0.8699, "num_input_tokens_seen": 3482656, "step": 2870 }, { "epoch": 0.32019155807996436, "grad_norm": 0.6430396437644958, "learning_rate": 1.6004009355162046e-05, "loss": 0.9467, "num_input_tokens_seen": 3488832, "step": 2875 }, { "epoch": 0.3207484129635817, "grad_norm": 0.6959503293037415, "learning_rate": 1.603185209934291e-05, "loss": 0.7864, "num_input_tokens_seen": 3495104, "step": 2880 }, { "epoch": 0.32130526784719904, "grad_norm": 0.7289158701896667, "learning_rate": 1.605969484352378e-05, "loss": 0.8912, "num_input_tokens_seen": 3501152, "step": 2885 }, { "epoch": 0.3218621227308163, "grad_norm": 1.1332286596298218, "learning_rate": 1.6087537587704645e-05, "loss": 0.8776, "num_input_tokens_seen": 3507136, "step": 2890 }, { "epoch": 0.32241897761443367, "grad_norm": 0.8876113891601562, "learning_rate": 1.611538033188551e-05, "loss": 0.7865, "num_input_tokens_seen": 3513088, "step": 2895 }, { "epoch": 0.322975832498051, "grad_norm": 0.7384260892868042, "learning_rate": 1.614322307606638e-05, "loss": 0.8567, "num_input_tokens_seen": 3518944, "step": 2900 }, { "epoch": 0.32353268738166835, "grad_norm": 1.2286438941955566, "learning_rate": 1.6171065820247244e-05, "loss": 0.8213, "num_input_tokens_seen": 3525088, "step": 2905 }, { "epoch": 0.3240895422652857, "grad_norm": 0.706199049949646, "learning_rate": 1.6198908564428113e-05, "loss": 0.8344, "num_input_tokens_seen": 3531168, "step": 2910 }, { "epoch": 0.324646397148903, "grad_norm": 0.8894254565238953, "learning_rate": 1.6226751308608978e-05, "loss": 0.8771, "num_input_tokens_seen": 3536992, "step": 2915 }, { "epoch": 0.3252032520325203, "grad_norm": 0.8318414092063904, "learning_rate": 1.6254594052789843e-05, "loss": 0.8119, "num_input_tokens_seen": 3542912, "step": 2920 }, { "epoch": 0.32576010691613766, "grad_norm": 0.6565179824829102, "learning_rate": 1.628243679697071e-05, "loss": 0.7799, "num_input_tokens_seen": 3549056, "step": 2925 }, { "epoch": 0.326316961799755, "grad_norm": 0.6590330004692078, "learning_rate": 1.6310279541151577e-05, "loss": 0.8368, "num_input_tokens_seen": 3554688, "step": 2930 }, { "epoch": 0.3268738166833723, "grad_norm": 0.7032907605171204, "learning_rate": 1.6338122285332442e-05, "loss": 0.8048, "num_input_tokens_seen": 3560672, "step": 2935 }, { "epoch": 0.32743067156698963, "grad_norm": 0.6997689008712769, "learning_rate": 1.6365965029513308e-05, "loss": 0.8206, "num_input_tokens_seen": 3566944, "step": 2940 }, { "epoch": 0.32798752645060697, "grad_norm": 1.0106511116027832, "learning_rate": 1.6393807773694176e-05, "loss": 0.8319, "num_input_tokens_seen": 3573024, "step": 2945 }, { "epoch": 0.3285443813342243, "grad_norm": 0.7239242792129517, "learning_rate": 1.642165051787504e-05, "loss": 0.7339, "num_input_tokens_seen": 3579072, "step": 2950 }, { "epoch": 0.32910123621784165, "grad_norm": 0.6858749985694885, "learning_rate": 1.644949326205591e-05, "loss": 0.7911, "num_input_tokens_seen": 3585120, "step": 2955 }, { "epoch": 0.32965809110145894, "grad_norm": 0.8416125178337097, "learning_rate": 1.6477336006236776e-05, "loss": 0.7441, "num_input_tokens_seen": 3590656, "step": 2960 }, { "epoch": 0.3302149459850763, "grad_norm": 1.1180367469787598, "learning_rate": 1.650517875041764e-05, "loss": 0.8395, "num_input_tokens_seen": 3596416, "step": 2965 }, { "epoch": 0.3307718008686936, "grad_norm": 0.7456669807434082, "learning_rate": 1.653302149459851e-05, "loss": 0.7463, "num_input_tokens_seen": 3602176, "step": 2970 }, { "epoch": 0.33132865575231096, "grad_norm": 0.7539106011390686, "learning_rate": 1.6560864238779375e-05, "loss": 0.7837, "num_input_tokens_seen": 3608256, "step": 2975 }, { "epoch": 0.3318855106359283, "grad_norm": 0.7022818922996521, "learning_rate": 1.6588706982960243e-05, "loss": 0.7905, "num_input_tokens_seen": 3614112, "step": 2980 }, { "epoch": 0.3324423655195456, "grad_norm": 0.8741825819015503, "learning_rate": 1.661654972714111e-05, "loss": 0.7716, "num_input_tokens_seen": 3620352, "step": 2985 }, { "epoch": 0.33299922040316293, "grad_norm": 0.6387346982955933, "learning_rate": 1.6644392471321974e-05, "loss": 0.6208, "num_input_tokens_seen": 3626496, "step": 2990 }, { "epoch": 0.3335560752867803, "grad_norm": 0.7227789759635925, "learning_rate": 1.667223521550284e-05, "loss": 0.77, "num_input_tokens_seen": 3632256, "step": 2995 }, { "epoch": 0.3341129301703976, "grad_norm": 0.7043372988700867, "learning_rate": 1.6700077959683708e-05, "loss": 0.6781, "num_input_tokens_seen": 3638400, "step": 3000 }, { "epoch": 0.3346697850540149, "grad_norm": 0.7883501052856445, "learning_rate": 1.6727920703864573e-05, "loss": 0.7567, "num_input_tokens_seen": 3644480, "step": 3005 }, { "epoch": 0.33522663993763224, "grad_norm": 0.9741755127906799, "learning_rate": 1.675576344804544e-05, "loss": 0.8919, "num_input_tokens_seen": 3650880, "step": 3010 }, { "epoch": 0.3357834948212496, "grad_norm": 0.754317045211792, "learning_rate": 1.6783606192226307e-05, "loss": 0.7109, "num_input_tokens_seen": 3656736, "step": 3015 }, { "epoch": 0.3363403497048669, "grad_norm": 0.7798634767532349, "learning_rate": 1.6811448936407172e-05, "loss": 0.7299, "num_input_tokens_seen": 3662912, "step": 3020 }, { "epoch": 0.33689720458848427, "grad_norm": 0.9960898160934448, "learning_rate": 1.683929168058804e-05, "loss": 0.8325, "num_input_tokens_seen": 3668864, "step": 3025 }, { "epoch": 0.33745405947210155, "grad_norm": 0.6723915338516235, "learning_rate": 1.6867134424768906e-05, "loss": 0.7277, "num_input_tokens_seen": 3674976, "step": 3030 }, { "epoch": 0.3380109143557189, "grad_norm": 0.6873594522476196, "learning_rate": 1.689497716894977e-05, "loss": 0.7642, "num_input_tokens_seen": 3681152, "step": 3035 }, { "epoch": 0.33856776923933624, "grad_norm": 1.082980990409851, "learning_rate": 1.692281991313064e-05, "loss": 0.7308, "num_input_tokens_seen": 3686560, "step": 3040 }, { "epoch": 0.3391246241229536, "grad_norm": 0.645869255065918, "learning_rate": 1.6950662657311505e-05, "loss": 0.7392, "num_input_tokens_seen": 3692608, "step": 3045 }, { "epoch": 0.33968147900657086, "grad_norm": 0.7145723104476929, "learning_rate": 1.6978505401492374e-05, "loss": 0.6951, "num_input_tokens_seen": 3699104, "step": 3050 }, { "epoch": 0.3402383338901882, "grad_norm": 0.797207236289978, "learning_rate": 1.7006348145673236e-05, "loss": 0.8767, "num_input_tokens_seen": 3705152, "step": 3055 }, { "epoch": 0.34079518877380555, "grad_norm": 0.7011590003967285, "learning_rate": 1.7034190889854105e-05, "loss": 0.7062, "num_input_tokens_seen": 3710848, "step": 3060 }, { "epoch": 0.3413520436574229, "grad_norm": 1.3318392038345337, "learning_rate": 1.706203363403497e-05, "loss": 0.7684, "num_input_tokens_seen": 3716960, "step": 3065 }, { "epoch": 0.34190889854104023, "grad_norm": 0.7001156806945801, "learning_rate": 1.708987637821584e-05, "loss": 0.6451, "num_input_tokens_seen": 3723040, "step": 3070 }, { "epoch": 0.3424657534246575, "grad_norm": 0.5246134996414185, "learning_rate": 1.7117719122396704e-05, "loss": 0.7753, "num_input_tokens_seen": 3729376, "step": 3075 }, { "epoch": 0.34302260830827486, "grad_norm": 0.6072102189064026, "learning_rate": 1.714556186657757e-05, "loss": 0.7127, "num_input_tokens_seen": 3735456, "step": 3080 }, { "epoch": 0.3435794631918922, "grad_norm": 0.6230674386024475, "learning_rate": 1.7173404610758438e-05, "loss": 0.8599, "num_input_tokens_seen": 3741696, "step": 3085 }, { "epoch": 0.34413631807550954, "grad_norm": 1.0005154609680176, "learning_rate": 1.7201247354939303e-05, "loss": 0.8345, "num_input_tokens_seen": 3747808, "step": 3090 }, { "epoch": 0.3446931729591268, "grad_norm": 0.746637225151062, "learning_rate": 1.722909009912017e-05, "loss": 0.7613, "num_input_tokens_seen": 3754080, "step": 3095 }, { "epoch": 0.34525002784274417, "grad_norm": 0.7311837673187256, "learning_rate": 1.7256932843301034e-05, "loss": 0.7835, "num_input_tokens_seen": 3760416, "step": 3100 }, { "epoch": 0.3458068827263615, "grad_norm": 0.5767139792442322, "learning_rate": 1.7284775587481902e-05, "loss": 0.6862, "num_input_tokens_seen": 3766592, "step": 3105 }, { "epoch": 0.34636373760997885, "grad_norm": 0.5902727842330933, "learning_rate": 1.731261833166277e-05, "loss": 0.6705, "num_input_tokens_seen": 3772768, "step": 3110 }, { "epoch": 0.3469205924935962, "grad_norm": 0.916875422000885, "learning_rate": 1.7340461075843636e-05, "loss": 0.6688, "num_input_tokens_seen": 3778848, "step": 3115 }, { "epoch": 0.3474774473772135, "grad_norm": 1.022446632385254, "learning_rate": 1.7368303820024505e-05, "loss": 0.7067, "num_input_tokens_seen": 3784992, "step": 3120 }, { "epoch": 0.3480343022608308, "grad_norm": 0.9478371143341064, "learning_rate": 1.7396146564205367e-05, "loss": 0.6739, "num_input_tokens_seen": 3791136, "step": 3125 }, { "epoch": 0.34859115714444816, "grad_norm": 0.5778316259384155, "learning_rate": 1.7423989308386235e-05, "loss": 0.8054, "num_input_tokens_seen": 3797280, "step": 3130 }, { "epoch": 0.3491480120280655, "grad_norm": 0.6948468089103699, "learning_rate": 1.74518320525671e-05, "loss": 0.7189, "num_input_tokens_seen": 3803392, "step": 3135 }, { "epoch": 0.3497048669116828, "grad_norm": 0.8553916215896606, "learning_rate": 1.747967479674797e-05, "loss": 0.7814, "num_input_tokens_seen": 3809344, "step": 3140 }, { "epoch": 0.35026172179530013, "grad_norm": 0.5981284379959106, "learning_rate": 1.7507517540928834e-05, "loss": 0.6801, "num_input_tokens_seen": 3815360, "step": 3145 }, { "epoch": 0.35081857667891747, "grad_norm": 0.7133454084396362, "learning_rate": 1.75353602851097e-05, "loss": 0.6537, "num_input_tokens_seen": 3821344, "step": 3150 }, { "epoch": 0.3513754315625348, "grad_norm": 0.597109317779541, "learning_rate": 1.756320302929057e-05, "loss": 0.713, "num_input_tokens_seen": 3827456, "step": 3155 }, { "epoch": 0.35193228644615215, "grad_norm": 0.646174430847168, "learning_rate": 1.7591045773471434e-05, "loss": 0.562, "num_input_tokens_seen": 3833504, "step": 3160 }, { "epoch": 0.35248914132976944, "grad_norm": 0.740085244178772, "learning_rate": 1.7618888517652302e-05, "loss": 0.6958, "num_input_tokens_seen": 3839584, "step": 3165 }, { "epoch": 0.3530459962133868, "grad_norm": 0.784919798374176, "learning_rate": 1.7646731261833168e-05, "loss": 0.7177, "num_input_tokens_seen": 3846016, "step": 3170 }, { "epoch": 0.3536028510970041, "grad_norm": 0.5879893898963928, "learning_rate": 1.7674574006014033e-05, "loss": 0.7111, "num_input_tokens_seen": 3851968, "step": 3175 }, { "epoch": 0.35415970598062146, "grad_norm": 0.9629414081573486, "learning_rate": 1.77024167501949e-05, "loss": 0.7371, "num_input_tokens_seen": 3858304, "step": 3180 }, { "epoch": 0.3547165608642388, "grad_norm": 1.0962774753570557, "learning_rate": 1.7730259494375767e-05, "loss": 0.6978, "num_input_tokens_seen": 3864416, "step": 3185 }, { "epoch": 0.3552734157478561, "grad_norm": 0.7542714476585388, "learning_rate": 1.7758102238556635e-05, "loss": 0.6989, "num_input_tokens_seen": 3870432, "step": 3190 }, { "epoch": 0.35583027063147343, "grad_norm": 0.6016682386398315, "learning_rate": 1.7785944982737497e-05, "loss": 0.7247, "num_input_tokens_seen": 3876320, "step": 3195 }, { "epoch": 0.3563871255150908, "grad_norm": 0.6868104338645935, "learning_rate": 1.7813787726918366e-05, "loss": 0.6966, "num_input_tokens_seen": 3882496, "step": 3200 }, { "epoch": 0.3569439803987081, "grad_norm": 0.8813942670822144, "learning_rate": 1.784163047109923e-05, "loss": 0.6996, "num_input_tokens_seen": 3888672, "step": 3205 }, { "epoch": 0.3575008352823254, "grad_norm": 0.8284479379653931, "learning_rate": 1.78694732152801e-05, "loss": 0.7177, "num_input_tokens_seen": 3894752, "step": 3210 }, { "epoch": 0.35805769016594274, "grad_norm": 0.9552041888237, "learning_rate": 1.7897315959460965e-05, "loss": 0.7415, "num_input_tokens_seen": 3900640, "step": 3215 }, { "epoch": 0.3586145450495601, "grad_norm": 0.7068825364112854, "learning_rate": 1.792515870364183e-05, "loss": 0.6055, "num_input_tokens_seen": 3906592, "step": 3220 }, { "epoch": 0.3591713999331774, "grad_norm": 2.0214600563049316, "learning_rate": 1.79530014478227e-05, "loss": 0.6817, "num_input_tokens_seen": 3912576, "step": 3225 }, { "epoch": 0.35972825481679477, "grad_norm": 0.8040777444839478, "learning_rate": 1.7980844192003564e-05, "loss": 0.6492, "num_input_tokens_seen": 3918912, "step": 3230 }, { "epoch": 0.36028510970041205, "grad_norm": 0.6712054014205933, "learning_rate": 1.8008686936184433e-05, "loss": 0.8442, "num_input_tokens_seen": 3925088, "step": 3235 }, { "epoch": 0.3608419645840294, "grad_norm": 0.8030280470848083, "learning_rate": 1.80365296803653e-05, "loss": 0.6948, "num_input_tokens_seen": 3931136, "step": 3240 }, { "epoch": 0.36139881946764674, "grad_norm": 0.6449285745620728, "learning_rate": 1.8064372424546164e-05, "loss": 0.6553, "num_input_tokens_seen": 3937344, "step": 3245 }, { "epoch": 0.3619556743512641, "grad_norm": 1.003219485282898, "learning_rate": 1.8092215168727032e-05, "loss": 0.7023, "num_input_tokens_seen": 3943872, "step": 3250 }, { "epoch": 0.36251252923488136, "grad_norm": 0.6382261514663696, "learning_rate": 1.8120057912907898e-05, "loss": 0.6543, "num_input_tokens_seen": 3949920, "step": 3255 }, { "epoch": 0.3630693841184987, "grad_norm": 0.5262326002120972, "learning_rate": 1.8147900657088763e-05, "loss": 0.6872, "num_input_tokens_seen": 3955968, "step": 3260 }, { "epoch": 0.36362623900211605, "grad_norm": 0.5104308128356934, "learning_rate": 1.8175743401269628e-05, "loss": 0.6408, "num_input_tokens_seen": 3962208, "step": 3265 }, { "epoch": 0.3641830938857334, "grad_norm": 0.6452802419662476, "learning_rate": 1.8203586145450497e-05, "loss": 0.6811, "num_input_tokens_seen": 3968032, "step": 3270 }, { "epoch": 0.36473994876935073, "grad_norm": 0.6599747538566589, "learning_rate": 1.8231428889631362e-05, "loss": 0.6531, "num_input_tokens_seen": 3973728, "step": 3275 }, { "epoch": 0.365296803652968, "grad_norm": 0.5117035508155823, "learning_rate": 1.825927163381223e-05, "loss": 0.6689, "num_input_tokens_seen": 3979904, "step": 3280 }, { "epoch": 0.36585365853658536, "grad_norm": 0.6155813336372375, "learning_rate": 1.8287114377993096e-05, "loss": 0.6585, "num_input_tokens_seen": 3985664, "step": 3285 }, { "epoch": 0.3664105134202027, "grad_norm": 0.6607415676116943, "learning_rate": 1.831495712217396e-05, "loss": 0.6798, "num_input_tokens_seen": 3991584, "step": 3290 }, { "epoch": 0.36696736830382004, "grad_norm": 0.7009616494178772, "learning_rate": 1.834279986635483e-05, "loss": 0.6568, "num_input_tokens_seen": 3997344, "step": 3295 }, { "epoch": 0.3675242231874373, "grad_norm": 1.3344988822937012, "learning_rate": 1.8370642610535695e-05, "loss": 0.682, "num_input_tokens_seen": 4002720, "step": 3300 }, { "epoch": 0.36808107807105467, "grad_norm": 0.655612587928772, "learning_rate": 1.8398485354716564e-05, "loss": 0.6858, "num_input_tokens_seen": 4008640, "step": 3305 }, { "epoch": 0.368637932954672, "grad_norm": 0.6551353335380554, "learning_rate": 1.842632809889743e-05, "loss": 0.6575, "num_input_tokens_seen": 4014752, "step": 3310 }, { "epoch": 0.36919478783828935, "grad_norm": 0.8650931715965271, "learning_rate": 1.8454170843078294e-05, "loss": 0.6923, "num_input_tokens_seen": 4020992, "step": 3315 }, { "epoch": 0.3697516427219067, "grad_norm": 0.6311479806900024, "learning_rate": 1.8482013587259163e-05, "loss": 0.6268, "num_input_tokens_seen": 4026752, "step": 3320 }, { "epoch": 0.370308497605524, "grad_norm": 0.656761884689331, "learning_rate": 1.8509856331440028e-05, "loss": 0.6167, "num_input_tokens_seen": 4032928, "step": 3325 }, { "epoch": 0.3708653524891413, "grad_norm": 0.6596508026123047, "learning_rate": 1.8537699075620893e-05, "loss": 0.6013, "num_input_tokens_seen": 4038784, "step": 3330 }, { "epoch": 0.37142220737275866, "grad_norm": 0.5497972369194031, "learning_rate": 1.856554181980176e-05, "loss": 0.6971, "num_input_tokens_seen": 4044992, "step": 3335 }, { "epoch": 0.371979062256376, "grad_norm": 0.8759901523590088, "learning_rate": 1.8593384563982627e-05, "loss": 0.712, "num_input_tokens_seen": 4051232, "step": 3340 }, { "epoch": 0.37253591713999334, "grad_norm": 0.7428078055381775, "learning_rate": 1.8621227308163493e-05, "loss": 0.6254, "num_input_tokens_seen": 4057376, "step": 3345 }, { "epoch": 0.37309277202361063, "grad_norm": 0.579107403755188, "learning_rate": 1.864907005234436e-05, "loss": 0.5719, "num_input_tokens_seen": 4063584, "step": 3350 }, { "epoch": 0.37364962690722797, "grad_norm": 0.8539456725120544, "learning_rate": 1.8676912796525227e-05, "loss": 0.6472, "num_input_tokens_seen": 4069664, "step": 3355 }, { "epoch": 0.3742064817908453, "grad_norm": 0.5895262360572815, "learning_rate": 1.8704755540706092e-05, "loss": 0.6757, "num_input_tokens_seen": 4075456, "step": 3360 }, { "epoch": 0.37476333667446265, "grad_norm": 0.5680410861968994, "learning_rate": 1.873259828488696e-05, "loss": 0.6224, "num_input_tokens_seen": 4081440, "step": 3365 }, { "epoch": 0.37532019155807994, "grad_norm": 1.365017294883728, "learning_rate": 1.8760441029067826e-05, "loss": 0.7741, "num_input_tokens_seen": 4087936, "step": 3370 }, { "epoch": 0.3758770464416973, "grad_norm": 0.5964043140411377, "learning_rate": 1.878828377324869e-05, "loss": 0.621, "num_input_tokens_seen": 4094240, "step": 3375 }, { "epoch": 0.3764339013253146, "grad_norm": 0.7095731496810913, "learning_rate": 1.881612651742956e-05, "loss": 0.6847, "num_input_tokens_seen": 4100352, "step": 3380 }, { "epoch": 0.37699075620893197, "grad_norm": 0.6927310228347778, "learning_rate": 1.8843969261610425e-05, "loss": 0.6506, "num_input_tokens_seen": 4106656, "step": 3385 }, { "epoch": 0.3775476110925493, "grad_norm": 0.5785253643989563, "learning_rate": 1.8871812005791294e-05, "loss": 0.5868, "num_input_tokens_seen": 4112896, "step": 3390 }, { "epoch": 0.3781044659761666, "grad_norm": 0.6710240840911865, "learning_rate": 1.889965474997216e-05, "loss": 0.7016, "num_input_tokens_seen": 4119200, "step": 3395 }, { "epoch": 0.37866132085978393, "grad_norm": 0.7210187911987305, "learning_rate": 1.8927497494153024e-05, "loss": 0.7342, "num_input_tokens_seen": 4124544, "step": 3400 }, { "epoch": 0.3792181757434013, "grad_norm": 0.684636116027832, "learning_rate": 1.895534023833389e-05, "loss": 0.5689, "num_input_tokens_seen": 4130496, "step": 3405 }, { "epoch": 0.3797750306270186, "grad_norm": 0.6498088836669922, "learning_rate": 1.8983182982514758e-05, "loss": 0.6141, "num_input_tokens_seen": 4136704, "step": 3410 }, { "epoch": 0.3803318855106359, "grad_norm": 3.025407075881958, "learning_rate": 1.9011025726695623e-05, "loss": 0.7846, "num_input_tokens_seen": 4143168, "step": 3415 }, { "epoch": 0.38088874039425324, "grad_norm": 0.7455375790596008, "learning_rate": 1.903886847087649e-05, "loss": 0.647, "num_input_tokens_seen": 4149408, "step": 3420 }, { "epoch": 0.3814455952778706, "grad_norm": 0.8761007785797119, "learning_rate": 1.9066711215057357e-05, "loss": 0.6433, "num_input_tokens_seen": 4155616, "step": 3425 }, { "epoch": 0.3820024501614879, "grad_norm": 0.7369247078895569, "learning_rate": 1.9094553959238223e-05, "loss": 0.6529, "num_input_tokens_seen": 4161792, "step": 3430 }, { "epoch": 0.38255930504510527, "grad_norm": 0.5794888734817505, "learning_rate": 1.912239670341909e-05, "loss": 0.631, "num_input_tokens_seen": 4167584, "step": 3435 }, { "epoch": 0.38311615992872255, "grad_norm": 0.7952249646186829, "learning_rate": 1.9150239447599957e-05, "loss": 0.6407, "num_input_tokens_seen": 4173696, "step": 3440 }, { "epoch": 0.3836730148123399, "grad_norm": 0.6974244117736816, "learning_rate": 1.9178082191780822e-05, "loss": 0.6297, "num_input_tokens_seen": 4179296, "step": 3445 }, { "epoch": 0.38422986969595724, "grad_norm": 0.7224573493003845, "learning_rate": 1.920592493596169e-05, "loss": 0.5714, "num_input_tokens_seen": 4185408, "step": 3450 }, { "epoch": 0.3847867245795746, "grad_norm": 0.5471491813659668, "learning_rate": 1.9233767680142556e-05, "loss": 0.5419, "num_input_tokens_seen": 4191712, "step": 3455 }, { "epoch": 0.38534357946319187, "grad_norm": 0.5449718236923218, "learning_rate": 1.9261610424323424e-05, "loss": 0.5916, "num_input_tokens_seen": 4197568, "step": 3460 }, { "epoch": 0.3859004343468092, "grad_norm": 0.6337326765060425, "learning_rate": 1.928945316850429e-05, "loss": 0.5439, "num_input_tokens_seen": 4203296, "step": 3465 }, { "epoch": 0.38645728923042655, "grad_norm": 0.5813928246498108, "learning_rate": 1.9317295912685155e-05, "loss": 0.6445, "num_input_tokens_seen": 4209408, "step": 3470 }, { "epoch": 0.3870141441140439, "grad_norm": 0.6164448857307434, "learning_rate": 1.934513865686602e-05, "loss": 0.58, "num_input_tokens_seen": 4215712, "step": 3475 }, { "epoch": 0.38757099899766123, "grad_norm": 0.5039767026901245, "learning_rate": 1.937298140104689e-05, "loss": 0.5906, "num_input_tokens_seen": 4221952, "step": 3480 }, { "epoch": 0.3881278538812785, "grad_norm": 0.8607691526412964, "learning_rate": 1.9400824145227754e-05, "loss": 0.637, "num_input_tokens_seen": 4228224, "step": 3485 }, { "epoch": 0.38868470876489586, "grad_norm": 0.5879281759262085, "learning_rate": 1.942866688940862e-05, "loss": 0.6138, "num_input_tokens_seen": 4234560, "step": 3490 }, { "epoch": 0.3892415636485132, "grad_norm": 0.5463665723800659, "learning_rate": 1.9456509633589488e-05, "loss": 0.5099, "num_input_tokens_seen": 4240576, "step": 3495 }, { "epoch": 0.38979841853213054, "grad_norm": 0.6000627279281616, "learning_rate": 1.9484352377770353e-05, "loss": 0.6515, "num_input_tokens_seen": 4246720, "step": 3500 }, { "epoch": 0.3903552734157479, "grad_norm": 0.7411845326423645, "learning_rate": 1.9512195121951222e-05, "loss": 0.6391, "num_input_tokens_seen": 4252576, "step": 3505 }, { "epoch": 0.39091212829936517, "grad_norm": 1.3947862386703491, "learning_rate": 1.9540037866132087e-05, "loss": 0.6009, "num_input_tokens_seen": 4258400, "step": 3510 }, { "epoch": 0.3914689831829825, "grad_norm": 0.7570947408676147, "learning_rate": 1.9567880610312952e-05, "loss": 0.6343, "num_input_tokens_seen": 4264672, "step": 3515 }, { "epoch": 0.39202583806659985, "grad_norm": 0.6590403318405151, "learning_rate": 1.959572335449382e-05, "loss": 0.646, "num_input_tokens_seen": 4271040, "step": 3520 }, { "epoch": 0.3925826929502172, "grad_norm": 0.8347069621086121, "learning_rate": 1.9623566098674686e-05, "loss": 0.6757, "num_input_tokens_seen": 4276832, "step": 3525 }, { "epoch": 0.3931395478338345, "grad_norm": 0.8542050123214722, "learning_rate": 1.9651408842855555e-05, "loss": 0.5727, "num_input_tokens_seen": 4283040, "step": 3530 }, { "epoch": 0.3936964027174518, "grad_norm": 0.4897926151752472, "learning_rate": 1.9679251587036417e-05, "loss": 0.5863, "num_input_tokens_seen": 4289120, "step": 3535 }, { "epoch": 0.39425325760106916, "grad_norm": 0.4838256537914276, "learning_rate": 1.9707094331217286e-05, "loss": 0.5767, "num_input_tokens_seen": 4295392, "step": 3540 }, { "epoch": 0.3948101124846865, "grad_norm": 1.063706874847412, "learning_rate": 1.973493707539815e-05, "loss": 0.6157, "num_input_tokens_seen": 4301472, "step": 3545 }, { "epoch": 0.39536696736830385, "grad_norm": 0.44149547815322876, "learning_rate": 1.976277981957902e-05, "loss": 0.6017, "num_input_tokens_seen": 4307552, "step": 3550 }, { "epoch": 0.39592382225192113, "grad_norm": 1.631095290184021, "learning_rate": 1.9790622563759885e-05, "loss": 0.6201, "num_input_tokens_seen": 4313568, "step": 3555 }, { "epoch": 0.3964806771355385, "grad_norm": 0.7899639010429382, "learning_rate": 1.981846530794075e-05, "loss": 0.699, "num_input_tokens_seen": 4320064, "step": 3560 }, { "epoch": 0.3970375320191558, "grad_norm": 0.9677003026008606, "learning_rate": 1.984630805212162e-05, "loss": 0.5717, "num_input_tokens_seen": 4326176, "step": 3565 }, { "epoch": 0.39759438690277316, "grad_norm": 0.6291847229003906, "learning_rate": 1.9874150796302484e-05, "loss": 0.5796, "num_input_tokens_seen": 4332384, "step": 3570 }, { "epoch": 0.39815124178639044, "grad_norm": 0.7799382209777832, "learning_rate": 1.9901993540483353e-05, "loss": 0.6349, "num_input_tokens_seen": 4338624, "step": 3575 }, { "epoch": 0.3987080966700078, "grad_norm": 0.651913583278656, "learning_rate": 1.9929836284664215e-05, "loss": 0.5927, "num_input_tokens_seen": 4344608, "step": 3580 }, { "epoch": 0.3992649515536251, "grad_norm": 0.5329757332801819, "learning_rate": 1.9957679028845083e-05, "loss": 0.5765, "num_input_tokens_seen": 4350784, "step": 3585 }, { "epoch": 0.39982180643724247, "grad_norm": 0.6107724905014038, "learning_rate": 1.9985521773025952e-05, "loss": 0.5719, "num_input_tokens_seen": 4357056, "step": 3590 }, { "epoch": 0.4003786613208598, "grad_norm": 0.5657049417495728, "learning_rate": 2.0013364517206817e-05, "loss": 0.5777, "num_input_tokens_seen": 4362624, "step": 3595 }, { "epoch": 0.4009355162044771, "grad_norm": 0.5067118406295776, "learning_rate": 2.0041207261387686e-05, "loss": 0.6322, "num_input_tokens_seen": 4369216, "step": 3600 }, { "epoch": 0.40149237108809444, "grad_norm": 0.68659907579422, "learning_rate": 2.0069050005568548e-05, "loss": 0.5434, "num_input_tokens_seen": 4375296, "step": 3605 }, { "epoch": 0.4020492259717118, "grad_norm": 0.5772555470466614, "learning_rate": 2.0096892749749416e-05, "loss": 0.6309, "num_input_tokens_seen": 4380928, "step": 3610 }, { "epoch": 0.4026060808553291, "grad_norm": 1.0572892427444458, "learning_rate": 2.012473549393028e-05, "loss": 0.549, "num_input_tokens_seen": 4386528, "step": 3615 }, { "epoch": 0.4031629357389464, "grad_norm": 0.6188274621963501, "learning_rate": 2.015257823811115e-05, "loss": 0.5534, "num_input_tokens_seen": 4392192, "step": 3620 }, { "epoch": 0.40371979062256375, "grad_norm": 0.6468358039855957, "learning_rate": 2.0180420982292015e-05, "loss": 0.5965, "num_input_tokens_seen": 4398624, "step": 3625 }, { "epoch": 0.4042766455061811, "grad_norm": 0.5942001938819885, "learning_rate": 2.020826372647288e-05, "loss": 0.5941, "num_input_tokens_seen": 4404992, "step": 3630 }, { "epoch": 0.40483350038979843, "grad_norm": 0.6883320212364197, "learning_rate": 2.023610647065375e-05, "loss": 0.5917, "num_input_tokens_seen": 4411008, "step": 3635 }, { "epoch": 0.40539035527341577, "grad_norm": 0.693879246711731, "learning_rate": 2.0263949214834615e-05, "loss": 0.601, "num_input_tokens_seen": 4417408, "step": 3640 }, { "epoch": 0.40594721015703306, "grad_norm": 0.5887772440910339, "learning_rate": 2.0291791959015483e-05, "loss": 0.6653, "num_input_tokens_seen": 4423744, "step": 3645 }, { "epoch": 0.4065040650406504, "grad_norm": 0.6162872314453125, "learning_rate": 2.0319634703196345e-05, "loss": 0.5622, "num_input_tokens_seen": 4429632, "step": 3650 }, { "epoch": 0.40706091992426774, "grad_norm": 0.8725115060806274, "learning_rate": 2.0347477447377214e-05, "loss": 0.5967, "num_input_tokens_seen": 4435648, "step": 3655 }, { "epoch": 0.4076177748078851, "grad_norm": 0.5574665665626526, "learning_rate": 2.0375320191558083e-05, "loss": 0.5517, "num_input_tokens_seen": 4441632, "step": 3660 }, { "epoch": 0.4081746296915024, "grad_norm": 0.7224874496459961, "learning_rate": 2.0403162935738948e-05, "loss": 0.5937, "num_input_tokens_seen": 4447616, "step": 3665 }, { "epoch": 0.4087314845751197, "grad_norm": 0.8832809925079346, "learning_rate": 2.0431005679919816e-05, "loss": 0.6545, "num_input_tokens_seen": 4454016, "step": 3670 }, { "epoch": 0.40928833945873705, "grad_norm": 0.7832716703414917, "learning_rate": 2.045884842410068e-05, "loss": 0.6268, "num_input_tokens_seen": 4459872, "step": 3675 }, { "epoch": 0.4098451943423544, "grad_norm": 0.5291270613670349, "learning_rate": 2.0486691168281547e-05, "loss": 0.5727, "num_input_tokens_seen": 4466272, "step": 3680 }, { "epoch": 0.41040204922597173, "grad_norm": 0.6049500107765198, "learning_rate": 2.0514533912462412e-05, "loss": 0.5751, "num_input_tokens_seen": 4472224, "step": 3685 }, { "epoch": 0.410958904109589, "grad_norm": 0.7180649638175964, "learning_rate": 2.054237665664328e-05, "loss": 0.5694, "num_input_tokens_seen": 4478368, "step": 3690 }, { "epoch": 0.41151575899320636, "grad_norm": 0.6458039879798889, "learning_rate": 2.0570219400824146e-05, "loss": 0.6127, "num_input_tokens_seen": 4484384, "step": 3695 }, { "epoch": 0.4120726138768237, "grad_norm": 0.4491298198699951, "learning_rate": 2.059806214500501e-05, "loss": 0.561, "num_input_tokens_seen": 4490528, "step": 3700 }, { "epoch": 0.41262946876044104, "grad_norm": 0.5210258364677429, "learning_rate": 2.062590488918588e-05, "loss": 0.5752, "num_input_tokens_seen": 4496640, "step": 3705 }, { "epoch": 0.4131863236440584, "grad_norm": 0.7473164200782776, "learning_rate": 2.0653747633366745e-05, "loss": 0.6051, "num_input_tokens_seen": 4503136, "step": 3710 }, { "epoch": 0.41374317852767567, "grad_norm": 0.4035009443759918, "learning_rate": 2.0681590377547614e-05, "loss": 0.5598, "num_input_tokens_seen": 4509248, "step": 3715 }, { "epoch": 0.414300033411293, "grad_norm": 0.5333433747291565, "learning_rate": 2.0709433121728476e-05, "loss": 0.6294, "num_input_tokens_seen": 4515808, "step": 3720 }, { "epoch": 0.41485688829491035, "grad_norm": 0.5919249057769775, "learning_rate": 2.0737275865909345e-05, "loss": 0.5322, "num_input_tokens_seen": 4521472, "step": 3725 }, { "epoch": 0.4154137431785277, "grad_norm": 0.4682462513446808, "learning_rate": 2.0765118610090213e-05, "loss": 0.5845, "num_input_tokens_seen": 4527584, "step": 3730 }, { "epoch": 0.415970598062145, "grad_norm": 0.7192133069038391, "learning_rate": 2.079296135427108e-05, "loss": 0.5723, "num_input_tokens_seen": 4533696, "step": 3735 }, { "epoch": 0.4165274529457623, "grad_norm": 0.46795934438705444, "learning_rate": 2.0820804098451944e-05, "loss": 0.5318, "num_input_tokens_seen": 4539840, "step": 3740 }, { "epoch": 0.41708430782937966, "grad_norm": 0.6280922889709473, "learning_rate": 2.084864684263281e-05, "loss": 0.5767, "num_input_tokens_seen": 4545824, "step": 3745 }, { "epoch": 0.417641162712997, "grad_norm": 0.6158500909805298, "learning_rate": 2.0876489586813678e-05, "loss": 0.553, "num_input_tokens_seen": 4552096, "step": 3750 }, { "epoch": 0.41819801759661435, "grad_norm": 0.514862596988678, "learning_rate": 2.0904332330994543e-05, "loss": 0.5717, "num_input_tokens_seen": 4558272, "step": 3755 }, { "epoch": 0.41875487248023163, "grad_norm": 0.5900183916091919, "learning_rate": 2.093217507517541e-05, "loss": 0.5552, "num_input_tokens_seen": 4564512, "step": 3760 }, { "epoch": 0.419311727363849, "grad_norm": 0.5266793966293335, "learning_rate": 2.0960017819356277e-05, "loss": 0.5061, "num_input_tokens_seen": 4570368, "step": 3765 }, { "epoch": 0.4198685822474663, "grad_norm": 0.680773913860321, "learning_rate": 2.0987860563537142e-05, "loss": 0.5701, "num_input_tokens_seen": 4576288, "step": 3770 }, { "epoch": 0.42042543713108366, "grad_norm": 0.5812661647796631, "learning_rate": 2.101570330771801e-05, "loss": 0.5442, "num_input_tokens_seen": 4582432, "step": 3775 }, { "epoch": 0.42098229201470094, "grad_norm": 0.861759603023529, "learning_rate": 2.1043546051898876e-05, "loss": 0.6006, "num_input_tokens_seen": 4588416, "step": 3780 }, { "epoch": 0.4215391468983183, "grad_norm": 0.5906546711921692, "learning_rate": 2.107138879607974e-05, "loss": 0.5473, "num_input_tokens_seen": 4594592, "step": 3785 }, { "epoch": 0.4220960017819356, "grad_norm": 0.6461995840072632, "learning_rate": 2.109923154026061e-05, "loss": 0.609, "num_input_tokens_seen": 4600448, "step": 3790 }, { "epoch": 0.42265285666555297, "grad_norm": 0.5721824169158936, "learning_rate": 2.1127074284441475e-05, "loss": 0.4864, "num_input_tokens_seen": 4606720, "step": 3795 }, { "epoch": 0.4232097115491703, "grad_norm": 0.6975905299186707, "learning_rate": 2.1154917028622344e-05, "loss": 0.5417, "num_input_tokens_seen": 4613056, "step": 3800 }, { "epoch": 0.4237665664327876, "grad_norm": 0.38110190629959106, "learning_rate": 2.118275977280321e-05, "loss": 0.5803, "num_input_tokens_seen": 4619328, "step": 3805 }, { "epoch": 0.42432342131640494, "grad_norm": 0.462920606136322, "learning_rate": 2.1210602516984074e-05, "loss": 0.5652, "num_input_tokens_seen": 4625408, "step": 3810 }, { "epoch": 0.4248802762000223, "grad_norm": 0.5332291126251221, "learning_rate": 2.123844526116494e-05, "loss": 0.5922, "num_input_tokens_seen": 4631488, "step": 3815 }, { "epoch": 0.4254371310836396, "grad_norm": 0.8170026540756226, "learning_rate": 2.126628800534581e-05, "loss": 0.6047, "num_input_tokens_seen": 4637376, "step": 3820 }, { "epoch": 0.4259939859672569, "grad_norm": 0.48023122549057007, "learning_rate": 2.1294130749526674e-05, "loss": 0.5234, "num_input_tokens_seen": 4643328, "step": 3825 }, { "epoch": 0.42655084085087425, "grad_norm": 0.45945581793785095, "learning_rate": 2.1321973493707542e-05, "loss": 0.586, "num_input_tokens_seen": 4649376, "step": 3830 }, { "epoch": 0.4271076957344916, "grad_norm": 0.6750725507736206, "learning_rate": 2.1349816237888408e-05, "loss": 0.5252, "num_input_tokens_seen": 4655264, "step": 3835 }, { "epoch": 0.42766455061810893, "grad_norm": 0.6904374361038208, "learning_rate": 2.1377658982069273e-05, "loss": 0.6262, "num_input_tokens_seen": 4660864, "step": 3840 }, { "epoch": 0.42822140550172627, "grad_norm": 1.276698112487793, "learning_rate": 2.140550172625014e-05, "loss": 0.5909, "num_input_tokens_seen": 4666912, "step": 3845 }, { "epoch": 0.42877826038534356, "grad_norm": 0.6976867914199829, "learning_rate": 2.1433344470431007e-05, "loss": 0.5497, "num_input_tokens_seen": 4672928, "step": 3850 }, { "epoch": 0.4293351152689609, "grad_norm": 0.8028689622879028, "learning_rate": 2.1461187214611872e-05, "loss": 0.5152, "num_input_tokens_seen": 4678976, "step": 3855 }, { "epoch": 0.42989197015257824, "grad_norm": 0.6588928699493408, "learning_rate": 2.148902995879274e-05, "loss": 0.598, "num_input_tokens_seen": 4684992, "step": 3860 }, { "epoch": 0.4304488250361956, "grad_norm": 0.47664397954940796, "learning_rate": 2.1516872702973606e-05, "loss": 0.5655, "num_input_tokens_seen": 4690848, "step": 3865 }, { "epoch": 0.4310056799198129, "grad_norm": 0.4250854551792145, "learning_rate": 2.1544715447154475e-05, "loss": 0.5623, "num_input_tokens_seen": 4696896, "step": 3870 }, { "epoch": 0.4315625348034302, "grad_norm": 0.5364441275596619, "learning_rate": 2.157255819133534e-05, "loss": 0.4901, "num_input_tokens_seen": 4703328, "step": 3875 }, { "epoch": 0.43211938968704755, "grad_norm": 0.585854709148407, "learning_rate": 2.1600400935516205e-05, "loss": 0.5189, "num_input_tokens_seen": 4709504, "step": 3880 }, { "epoch": 0.4326762445706649, "grad_norm": 0.9005963802337646, "learning_rate": 2.162824367969707e-05, "loss": 0.616, "num_input_tokens_seen": 4715904, "step": 3885 }, { "epoch": 0.43323309945428223, "grad_norm": 0.6852855086326599, "learning_rate": 2.165608642387794e-05, "loss": 0.5524, "num_input_tokens_seen": 4721440, "step": 3890 }, { "epoch": 0.4337899543378995, "grad_norm": 0.5016515851020813, "learning_rate": 2.1683929168058804e-05, "loss": 0.5726, "num_input_tokens_seen": 4727232, "step": 3895 }, { "epoch": 0.43434680922151686, "grad_norm": 0.5365946292877197, "learning_rate": 2.171177191223967e-05, "loss": 0.598, "num_input_tokens_seen": 4733536, "step": 3900 }, { "epoch": 0.4349036641051342, "grad_norm": 0.8172489404678345, "learning_rate": 2.1739614656420538e-05, "loss": 0.5215, "num_input_tokens_seen": 4739584, "step": 3905 }, { "epoch": 0.43546051898875154, "grad_norm": 0.574018120765686, "learning_rate": 2.1767457400601404e-05, "loss": 0.5333, "num_input_tokens_seen": 4745696, "step": 3910 }, { "epoch": 0.4360173738723689, "grad_norm": 0.6151098608970642, "learning_rate": 2.1795300144782272e-05, "loss": 0.579, "num_input_tokens_seen": 4752000, "step": 3915 }, { "epoch": 0.43657422875598617, "grad_norm": 0.6040294170379639, "learning_rate": 2.1823142888963137e-05, "loss": 0.5653, "num_input_tokens_seen": 4758080, "step": 3920 }, { "epoch": 0.4371310836396035, "grad_norm": 0.45241132378578186, "learning_rate": 2.1850985633144003e-05, "loss": 0.5499, "num_input_tokens_seen": 4763776, "step": 3925 }, { "epoch": 0.43768793852322085, "grad_norm": 0.8528025150299072, "learning_rate": 2.187882837732487e-05, "loss": 0.5966, "num_input_tokens_seen": 4770208, "step": 3930 }, { "epoch": 0.4382447934068382, "grad_norm": 0.5225035548210144, "learning_rate": 2.1906671121505737e-05, "loss": 0.5355, "num_input_tokens_seen": 4775904, "step": 3935 }, { "epoch": 0.4388016482904555, "grad_norm": 0.6373410224914551, "learning_rate": 2.1934513865686605e-05, "loss": 0.5162, "num_input_tokens_seen": 4781984, "step": 3940 }, { "epoch": 0.4393585031740728, "grad_norm": 0.5328344106674194, "learning_rate": 2.1962356609867467e-05, "loss": 0.5632, "num_input_tokens_seen": 4788064, "step": 3945 }, { "epoch": 0.43991535805769016, "grad_norm": 0.6345775723457336, "learning_rate": 2.1990199354048336e-05, "loss": 0.5672, "num_input_tokens_seen": 4793888, "step": 3950 }, { "epoch": 0.4404722129413075, "grad_norm": 0.6237121224403381, "learning_rate": 2.20180420982292e-05, "loss": 0.5405, "num_input_tokens_seen": 4799872, "step": 3955 }, { "epoch": 0.44102906782492485, "grad_norm": 0.8123361468315125, "learning_rate": 2.204588484241007e-05, "loss": 0.5376, "num_input_tokens_seen": 4806048, "step": 3960 }, { "epoch": 0.44158592270854213, "grad_norm": 0.435773104429245, "learning_rate": 2.2073727586590935e-05, "loss": 0.5567, "num_input_tokens_seen": 4812096, "step": 3965 }, { "epoch": 0.4421427775921595, "grad_norm": 1.3037782907485962, "learning_rate": 2.21015703307718e-05, "loss": 0.5367, "num_input_tokens_seen": 4817984, "step": 3970 }, { "epoch": 0.4426996324757768, "grad_norm": 0.5085246562957764, "learning_rate": 2.212941307495267e-05, "loss": 0.5565, "num_input_tokens_seen": 4824256, "step": 3975 }, { "epoch": 0.44325648735939416, "grad_norm": 0.4481351375579834, "learning_rate": 2.2157255819133534e-05, "loss": 0.5313, "num_input_tokens_seen": 4830432, "step": 3980 }, { "epoch": 0.44381334224301144, "grad_norm": 0.5444828867912292, "learning_rate": 2.2185098563314403e-05, "loss": 0.4836, "num_input_tokens_seen": 4836224, "step": 3985 }, { "epoch": 0.4443701971266288, "grad_norm": 0.5004183053970337, "learning_rate": 2.2212941307495268e-05, "loss": 0.5196, "num_input_tokens_seen": 4841664, "step": 3990 }, { "epoch": 0.4449270520102461, "grad_norm": 0.5266890525817871, "learning_rate": 2.2240784051676133e-05, "loss": 0.5311, "num_input_tokens_seen": 4847232, "step": 3995 }, { "epoch": 0.44548390689386347, "grad_norm": 0.6098106503486633, "learning_rate": 2.2268626795857002e-05, "loss": 0.5128, "num_input_tokens_seen": 4853216, "step": 4000 }, { "epoch": 0.4460407617774808, "grad_norm": 0.5048491358757019, "learning_rate": 2.2296469540037867e-05, "loss": 0.5222, "num_input_tokens_seen": 4859392, "step": 4005 }, { "epoch": 0.4465976166610981, "grad_norm": 0.6657102704048157, "learning_rate": 2.2324312284218736e-05, "loss": 0.5532, "num_input_tokens_seen": 4865408, "step": 4010 }, { "epoch": 0.44715447154471544, "grad_norm": 0.49514108896255493, "learning_rate": 2.2352155028399598e-05, "loss": 0.5776, "num_input_tokens_seen": 4871328, "step": 4015 }, { "epoch": 0.4477113264283328, "grad_norm": 0.4984544515609741, "learning_rate": 2.2379997772580467e-05, "loss": 0.5454, "num_input_tokens_seen": 4877568, "step": 4020 }, { "epoch": 0.4482681813119501, "grad_norm": 0.6363658308982849, "learning_rate": 2.2407840516761332e-05, "loss": 0.5723, "num_input_tokens_seen": 4883712, "step": 4025 }, { "epoch": 0.44882503619556746, "grad_norm": 0.4433944523334503, "learning_rate": 2.24356832609422e-05, "loss": 0.4972, "num_input_tokens_seen": 4889536, "step": 4030 }, { "epoch": 0.44938189107918475, "grad_norm": 0.5405867695808411, "learning_rate": 2.2463526005123066e-05, "loss": 0.5972, "num_input_tokens_seen": 4895520, "step": 4035 }, { "epoch": 0.4499387459628021, "grad_norm": 1.3138625621795654, "learning_rate": 2.249136874930393e-05, "loss": 0.5622, "num_input_tokens_seen": 4901376, "step": 4040 }, { "epoch": 0.45049560084641943, "grad_norm": 0.5278947353363037, "learning_rate": 2.25192114934848e-05, "loss": 0.5122, "num_input_tokens_seen": 4907584, "step": 4045 }, { "epoch": 0.45105245573003677, "grad_norm": 0.47189491987228394, "learning_rate": 2.2547054237665665e-05, "loss": 0.5036, "num_input_tokens_seen": 4913632, "step": 4050 }, { "epoch": 0.45160931061365406, "grad_norm": 0.6987490653991699, "learning_rate": 2.2574896981846534e-05, "loss": 0.5059, "num_input_tokens_seen": 4919616, "step": 4055 }, { "epoch": 0.4521661654972714, "grad_norm": 0.442399263381958, "learning_rate": 2.2602739726027396e-05, "loss": 0.4981, "num_input_tokens_seen": 4925568, "step": 4060 }, { "epoch": 0.45272302038088874, "grad_norm": 0.46676573157310486, "learning_rate": 2.2630582470208264e-05, "loss": 0.5087, "num_input_tokens_seen": 4931616, "step": 4065 }, { "epoch": 0.4532798752645061, "grad_norm": 0.46530991792678833, "learning_rate": 2.2658425214389133e-05, "loss": 0.5013, "num_input_tokens_seen": 4937952, "step": 4070 }, { "epoch": 0.4538367301481234, "grad_norm": 0.5614447593688965, "learning_rate": 2.2686267958569998e-05, "loss": 0.6157, "num_input_tokens_seen": 4944096, "step": 4075 }, { "epoch": 0.4543935850317407, "grad_norm": 0.5306949019432068, "learning_rate": 2.2714110702750867e-05, "loss": 0.5601, "num_input_tokens_seen": 4950368, "step": 4080 }, { "epoch": 0.45495043991535805, "grad_norm": 0.5544019341468811, "learning_rate": 2.274195344693173e-05, "loss": 0.5501, "num_input_tokens_seen": 4956512, "step": 4085 }, { "epoch": 0.4555072947989754, "grad_norm": 0.5008285641670227, "learning_rate": 2.2769796191112597e-05, "loss": 0.5345, "num_input_tokens_seen": 4962624, "step": 4090 }, { "epoch": 0.45606414968259273, "grad_norm": 0.722334623336792, "learning_rate": 2.2797638935293463e-05, "loss": 0.6354, "num_input_tokens_seen": 4968768, "step": 4095 }, { "epoch": 0.45662100456621, "grad_norm": 0.5504441261291504, "learning_rate": 2.282548167947433e-05, "loss": 0.5102, "num_input_tokens_seen": 4975072, "step": 4100 }, { "epoch": 0.45717785944982736, "grad_norm": 0.6458106637001038, "learning_rate": 2.2853324423655196e-05, "loss": 0.5148, "num_input_tokens_seen": 4981184, "step": 4105 }, { "epoch": 0.4577347143334447, "grad_norm": 0.5253353714942932, "learning_rate": 2.2881167167836062e-05, "loss": 0.5201, "num_input_tokens_seen": 4987584, "step": 4110 }, { "epoch": 0.45829156921706204, "grad_norm": 0.740638017654419, "learning_rate": 2.290900991201693e-05, "loss": 0.507, "num_input_tokens_seen": 4993856, "step": 4115 }, { "epoch": 0.4588484241006794, "grad_norm": 0.5176502466201782, "learning_rate": 2.2936852656197796e-05, "loss": 0.5714, "num_input_tokens_seen": 4999936, "step": 4120 }, { "epoch": 0.45940527898429667, "grad_norm": 0.8188176155090332, "learning_rate": 2.2964695400378664e-05, "loss": 0.488, "num_input_tokens_seen": 5006080, "step": 4125 }, { "epoch": 0.459962133867914, "grad_norm": 0.7110271453857422, "learning_rate": 2.2992538144559526e-05, "loss": 0.5207, "num_input_tokens_seen": 5012160, "step": 4130 }, { "epoch": 0.46051898875153136, "grad_norm": 0.524470865726471, "learning_rate": 2.3020380888740395e-05, "loss": 0.535, "num_input_tokens_seen": 5018400, "step": 4135 }, { "epoch": 0.4610758436351487, "grad_norm": 0.7547463178634644, "learning_rate": 2.3048223632921264e-05, "loss": 0.5517, "num_input_tokens_seen": 5024256, "step": 4140 }, { "epoch": 0.461632698518766, "grad_norm": 0.7006763815879822, "learning_rate": 2.307606637710213e-05, "loss": 0.53, "num_input_tokens_seen": 5030368, "step": 4145 }, { "epoch": 0.4621895534023833, "grad_norm": 0.6575197577476501, "learning_rate": 2.3103909121282997e-05, "loss": 0.5192, "num_input_tokens_seen": 5036928, "step": 4150 }, { "epoch": 0.46274640828600067, "grad_norm": 0.7573459148406982, "learning_rate": 2.313175186546386e-05, "loss": 0.5349, "num_input_tokens_seen": 5043392, "step": 4155 }, { "epoch": 0.463303263169618, "grad_norm": 0.6041881442070007, "learning_rate": 2.3159594609644728e-05, "loss": 0.5106, "num_input_tokens_seen": 5049440, "step": 4160 }, { "epoch": 0.46386011805323535, "grad_norm": 0.5992774963378906, "learning_rate": 2.3187437353825593e-05, "loss": 0.5136, "num_input_tokens_seen": 5055648, "step": 4165 }, { "epoch": 0.46441697293685263, "grad_norm": 0.6798260807991028, "learning_rate": 2.3215280098006462e-05, "loss": 0.5443, "num_input_tokens_seen": 5062144, "step": 4170 }, { "epoch": 0.46497382782047, "grad_norm": 0.5205903649330139, "learning_rate": 2.3243122842187327e-05, "loss": 0.4764, "num_input_tokens_seen": 5067776, "step": 4175 }, { "epoch": 0.4655306827040873, "grad_norm": 0.49327921867370605, "learning_rate": 2.3270965586368192e-05, "loss": 0.5536, "num_input_tokens_seen": 5073920, "step": 4180 }, { "epoch": 0.46608753758770466, "grad_norm": 0.5059968829154968, "learning_rate": 2.329880833054906e-05, "loss": 0.5874, "num_input_tokens_seen": 5080064, "step": 4185 }, { "epoch": 0.466644392471322, "grad_norm": 0.3751976788043976, "learning_rate": 2.3326651074729926e-05, "loss": 0.5456, "num_input_tokens_seen": 5086368, "step": 4190 }, { "epoch": 0.4672012473549393, "grad_norm": 0.7099978923797607, "learning_rate": 2.3354493818910795e-05, "loss": 0.5524, "num_input_tokens_seen": 5092800, "step": 4195 }, { "epoch": 0.46775810223855663, "grad_norm": 0.7254162430763245, "learning_rate": 2.3382336563091657e-05, "loss": 0.5248, "num_input_tokens_seen": 5098944, "step": 4200 }, { "epoch": 0.46831495712217397, "grad_norm": 0.6305949091911316, "learning_rate": 2.3410179307272526e-05, "loss": 0.5436, "num_input_tokens_seen": 5105024, "step": 4205 }, { "epoch": 0.4688718120057913, "grad_norm": 0.5184758901596069, "learning_rate": 2.3438022051453394e-05, "loss": 0.4853, "num_input_tokens_seen": 5110944, "step": 4210 }, { "epoch": 0.4694286668894086, "grad_norm": 0.584761917591095, "learning_rate": 2.346586479563426e-05, "loss": 0.5349, "num_input_tokens_seen": 5116928, "step": 4215 }, { "epoch": 0.46998552177302594, "grad_norm": 0.5969018340110779, "learning_rate": 2.3493707539815125e-05, "loss": 0.5465, "num_input_tokens_seen": 5122848, "step": 4220 }, { "epoch": 0.4705423766566433, "grad_norm": 0.5323885679244995, "learning_rate": 2.352155028399599e-05, "loss": 0.4972, "num_input_tokens_seen": 5128992, "step": 4225 }, { "epoch": 0.4710992315402606, "grad_norm": 0.6184549927711487, "learning_rate": 2.354939302817686e-05, "loss": 0.5535, "num_input_tokens_seen": 5134944, "step": 4230 }, { "epoch": 0.47165608642387796, "grad_norm": 0.7375818490982056, "learning_rate": 2.3577235772357724e-05, "loss": 0.4915, "num_input_tokens_seen": 5141248, "step": 4235 }, { "epoch": 0.47221294130749525, "grad_norm": 0.5092220902442932, "learning_rate": 2.3605078516538593e-05, "loss": 0.5122, "num_input_tokens_seen": 5147104, "step": 4240 }, { "epoch": 0.4727697961911126, "grad_norm": 0.5420388579368591, "learning_rate": 2.3632921260719458e-05, "loss": 0.488, "num_input_tokens_seen": 5153440, "step": 4245 }, { "epoch": 0.47332665107472993, "grad_norm": 0.537372350692749, "learning_rate": 2.3660764004900323e-05, "loss": 0.5464, "num_input_tokens_seen": 5159360, "step": 4250 }, { "epoch": 0.4738835059583473, "grad_norm": 0.6947973966598511, "learning_rate": 2.3688606749081192e-05, "loss": 0.5288, "num_input_tokens_seen": 5165376, "step": 4255 }, { "epoch": 0.47444036084196456, "grad_norm": 0.5911089777946472, "learning_rate": 2.3716449493262057e-05, "loss": 0.6154, "num_input_tokens_seen": 5171744, "step": 4260 }, { "epoch": 0.4749972157255819, "grad_norm": 0.867561936378479, "learning_rate": 2.3744292237442922e-05, "loss": 0.5673, "num_input_tokens_seen": 5178208, "step": 4265 }, { "epoch": 0.47555407060919924, "grad_norm": 0.6278514862060547, "learning_rate": 2.3772134981623788e-05, "loss": 0.5546, "num_input_tokens_seen": 5184384, "step": 4270 }, { "epoch": 0.4761109254928166, "grad_norm": 0.6555668115615845, "learning_rate": 2.3799977725804656e-05, "loss": 0.5055, "num_input_tokens_seen": 5190176, "step": 4275 }, { "epoch": 0.4766677803764339, "grad_norm": 0.625932514667511, "learning_rate": 2.3827820469985525e-05, "loss": 0.5685, "num_input_tokens_seen": 5196608, "step": 4280 }, { "epoch": 0.4772246352600512, "grad_norm": 0.45602211356163025, "learning_rate": 2.385566321416639e-05, "loss": 0.5154, "num_input_tokens_seen": 5202624, "step": 4285 }, { "epoch": 0.47778149014366855, "grad_norm": 0.6526634693145752, "learning_rate": 2.3883505958347255e-05, "loss": 0.5917, "num_input_tokens_seen": 5208896, "step": 4290 }, { "epoch": 0.4783383450272859, "grad_norm": 0.5469481945037842, "learning_rate": 2.391134870252812e-05, "loss": 0.518, "num_input_tokens_seen": 5214816, "step": 4295 }, { "epoch": 0.47889519991090324, "grad_norm": 0.44256091117858887, "learning_rate": 2.393919144670899e-05, "loss": 0.5997, "num_input_tokens_seen": 5220672, "step": 4300 }, { "epoch": 0.4794520547945205, "grad_norm": 0.3796844780445099, "learning_rate": 2.3967034190889855e-05, "loss": 0.5012, "num_input_tokens_seen": 5226848, "step": 4305 }, { "epoch": 0.48000890967813786, "grad_norm": 0.47139355540275574, "learning_rate": 2.3994876935070723e-05, "loss": 0.5416, "num_input_tokens_seen": 5232960, "step": 4310 }, { "epoch": 0.4805657645617552, "grad_norm": 0.6543121337890625, "learning_rate": 2.402271967925159e-05, "loss": 0.4824, "num_input_tokens_seen": 5238880, "step": 4315 }, { "epoch": 0.48112261944537255, "grad_norm": 0.6886802911758423, "learning_rate": 2.4050562423432454e-05, "loss": 0.5185, "num_input_tokens_seen": 5244992, "step": 4320 }, { "epoch": 0.4816794743289899, "grad_norm": 0.47042930126190186, "learning_rate": 2.4078405167613323e-05, "loss": 0.4744, "num_input_tokens_seen": 5250976, "step": 4325 }, { "epoch": 0.4822363292126072, "grad_norm": 0.4541086256504059, "learning_rate": 2.4106247911794188e-05, "loss": 0.5218, "num_input_tokens_seen": 5256992, "step": 4330 }, { "epoch": 0.4827931840962245, "grad_norm": 0.5011399388313293, "learning_rate": 2.4134090655975053e-05, "loss": 0.5338, "num_input_tokens_seen": 5262080, "step": 4335 }, { "epoch": 0.48335003897984186, "grad_norm": 0.5383691191673279, "learning_rate": 2.416193340015592e-05, "loss": 0.5593, "num_input_tokens_seen": 5268352, "step": 4340 }, { "epoch": 0.4839068938634592, "grad_norm": 0.6331676244735718, "learning_rate": 2.4189776144336787e-05, "loss": 0.5723, "num_input_tokens_seen": 5274240, "step": 4345 }, { "epoch": 0.48446374874707654, "grad_norm": 0.5208234786987305, "learning_rate": 2.4217618888517656e-05, "loss": 0.5589, "num_input_tokens_seen": 5280416, "step": 4350 }, { "epoch": 0.4850206036306938, "grad_norm": 0.5620369911193848, "learning_rate": 2.424546163269852e-05, "loss": 0.4986, "num_input_tokens_seen": 5286496, "step": 4355 }, { "epoch": 0.48557745851431117, "grad_norm": 0.5609205961227417, "learning_rate": 2.4273304376879386e-05, "loss": 0.4972, "num_input_tokens_seen": 5292640, "step": 4360 }, { "epoch": 0.4861343133979285, "grad_norm": 0.5250622630119324, "learning_rate": 2.430114712106025e-05, "loss": 0.575, "num_input_tokens_seen": 5298176, "step": 4365 }, { "epoch": 0.48669116828154585, "grad_norm": 0.4351769983768463, "learning_rate": 2.432898986524112e-05, "loss": 0.5469, "num_input_tokens_seen": 5303712, "step": 4370 }, { "epoch": 0.48724802316516314, "grad_norm": 0.5712926387786865, "learning_rate": 2.4356832609421985e-05, "loss": 0.5367, "num_input_tokens_seen": 5309280, "step": 4375 }, { "epoch": 0.4878048780487805, "grad_norm": 0.4500388205051422, "learning_rate": 2.438467535360285e-05, "loss": 0.5179, "num_input_tokens_seen": 5315232, "step": 4380 }, { "epoch": 0.4883617329323978, "grad_norm": 0.5775341987609863, "learning_rate": 2.441251809778372e-05, "loss": 0.6109, "num_input_tokens_seen": 5321824, "step": 4385 }, { "epoch": 0.48891858781601516, "grad_norm": 0.6252064108848572, "learning_rate": 2.4440360841964585e-05, "loss": 0.48, "num_input_tokens_seen": 5327904, "step": 4390 }, { "epoch": 0.4894754426996325, "grad_norm": 0.6080237030982971, "learning_rate": 2.4468203586145453e-05, "loss": 0.5335, "num_input_tokens_seen": 5333888, "step": 4395 }, { "epoch": 0.4900322975832498, "grad_norm": 0.7089096903800964, "learning_rate": 2.449604633032632e-05, "loss": 0.5102, "num_input_tokens_seen": 5340352, "step": 4400 }, { "epoch": 0.49058915246686713, "grad_norm": 0.6364566087722778, "learning_rate": 2.4523889074507184e-05, "loss": 0.5275, "num_input_tokens_seen": 5346336, "step": 4405 }, { "epoch": 0.49114600735048447, "grad_norm": 0.3520835340023041, "learning_rate": 2.4551731818688052e-05, "loss": 0.568, "num_input_tokens_seen": 5352544, "step": 4410 }, { "epoch": 0.4917028622341018, "grad_norm": 0.5264956951141357, "learning_rate": 2.4579574562868918e-05, "loss": 0.529, "num_input_tokens_seen": 5358688, "step": 4415 }, { "epoch": 0.4922597171177191, "grad_norm": 0.4809894859790802, "learning_rate": 2.4607417307049786e-05, "loss": 0.513, "num_input_tokens_seen": 5365056, "step": 4420 }, { "epoch": 0.49281657200133644, "grad_norm": 0.5016565322875977, "learning_rate": 2.4635260051230648e-05, "loss": 0.5048, "num_input_tokens_seen": 5371328, "step": 4425 }, { "epoch": 0.4933734268849538, "grad_norm": 0.4715058207511902, "learning_rate": 2.4663102795411517e-05, "loss": 0.4589, "num_input_tokens_seen": 5377600, "step": 4430 }, { "epoch": 0.4939302817685711, "grad_norm": 0.47167107462882996, "learning_rate": 2.4690945539592382e-05, "loss": 0.5187, "num_input_tokens_seen": 5384192, "step": 4435 }, { "epoch": 0.49448713665218846, "grad_norm": 0.5744552612304688, "learning_rate": 2.471878828377325e-05, "loss": 0.5255, "num_input_tokens_seen": 5390464, "step": 4440 }, { "epoch": 0.49504399153580575, "grad_norm": 0.44475051760673523, "learning_rate": 2.4746631027954116e-05, "loss": 0.5175, "num_input_tokens_seen": 5396384, "step": 4445 }, { "epoch": 0.4956008464194231, "grad_norm": 0.43257370591163635, "learning_rate": 2.477447377213498e-05, "loss": 0.4968, "num_input_tokens_seen": 5402432, "step": 4450 }, { "epoch": 0.49615770130304043, "grad_norm": 0.6997108459472656, "learning_rate": 2.480231651631585e-05, "loss": 0.533, "num_input_tokens_seen": 5408448, "step": 4455 }, { "epoch": 0.4967145561866578, "grad_norm": 0.4157126247882843, "learning_rate": 2.4830159260496715e-05, "loss": 0.5072, "num_input_tokens_seen": 5414464, "step": 4460 }, { "epoch": 0.49727141107027506, "grad_norm": 0.6457968354225159, "learning_rate": 2.4858002004677584e-05, "loss": 0.5492, "num_input_tokens_seen": 5420448, "step": 4465 }, { "epoch": 0.4978282659538924, "grad_norm": 0.6834990978240967, "learning_rate": 2.4885844748858446e-05, "loss": 0.4911, "num_input_tokens_seen": 5426528, "step": 4470 }, { "epoch": 0.49838512083750974, "grad_norm": 0.8440697193145752, "learning_rate": 2.4913687493039314e-05, "loss": 0.5275, "num_input_tokens_seen": 5432640, "step": 4475 }, { "epoch": 0.4989419757211271, "grad_norm": 0.5376283526420593, "learning_rate": 2.4941530237220183e-05, "loss": 0.5448, "num_input_tokens_seen": 5438720, "step": 4480 }, { "epoch": 0.4994988306047444, "grad_norm": 0.45717060565948486, "learning_rate": 2.496937298140105e-05, "loss": 0.4822, "num_input_tokens_seen": 5444832, "step": 4485 }, { "epoch": 0.5000556854883618, "grad_norm": 0.48572221398353577, "learning_rate": 2.4997215725581917e-05, "loss": 0.5018, "num_input_tokens_seen": 5450816, "step": 4490 }, { "epoch": 0.5000556854883618, "eval_loss": 0.5130664706230164, "eval_runtime": 113.2078, "eval_samples_per_second": 35.254, "eval_steps_per_second": 8.816, "num_input_tokens_seen": 5450816, "step": 4490 }, { "epoch": 0.5006125403719791, "grad_norm": 0.4971100389957428, "learning_rate": 2.502505846976278e-05, "loss": 0.5047, "num_input_tokens_seen": 5456896, "step": 4495 }, { "epoch": 0.5011693952555963, "grad_norm": 0.4661465287208557, "learning_rate": 2.5052901213943648e-05, "loss": 0.4649, "num_input_tokens_seen": 5462336, "step": 4500 }, { "epoch": 0.5017262501392137, "grad_norm": 0.6230202317237854, "learning_rate": 2.5080743958124513e-05, "loss": 0.4606, "num_input_tokens_seen": 5468320, "step": 4505 }, { "epoch": 0.502283105022831, "grad_norm": 0.7722415328025818, "learning_rate": 2.510858670230538e-05, "loss": 0.5454, "num_input_tokens_seen": 5474720, "step": 4510 }, { "epoch": 0.5028399599064484, "grad_norm": 0.6156852841377258, "learning_rate": 2.5136429446486247e-05, "loss": 0.4837, "num_input_tokens_seen": 5480960, "step": 4515 }, { "epoch": 0.5033968147900657, "grad_norm": 0.6528518199920654, "learning_rate": 2.5164272190667115e-05, "loss": 0.4811, "num_input_tokens_seen": 5486720, "step": 4520 }, { "epoch": 0.503953669673683, "grad_norm": 0.7957804203033447, "learning_rate": 2.519211493484798e-05, "loss": 0.4618, "num_input_tokens_seen": 5492480, "step": 4525 }, { "epoch": 0.5045105245573004, "grad_norm": 0.5955324172973633, "learning_rate": 2.5219957679028843e-05, "loss": 0.5249, "num_input_tokens_seen": 5498752, "step": 4530 }, { "epoch": 0.5050673794409177, "grad_norm": 0.4673166573047638, "learning_rate": 2.524780042320971e-05, "loss": 0.5035, "num_input_tokens_seen": 5504896, "step": 4535 }, { "epoch": 0.5056242343245351, "grad_norm": 0.6022423505783081, "learning_rate": 2.5275643167390577e-05, "loss": 0.5244, "num_input_tokens_seen": 5511232, "step": 4540 }, { "epoch": 0.5061810892081523, "grad_norm": 0.7004314661026001, "learning_rate": 2.5303485911571445e-05, "loss": 0.4915, "num_input_tokens_seen": 5517280, "step": 4545 }, { "epoch": 0.5067379440917696, "grad_norm": 0.4931386709213257, "learning_rate": 2.5331328655752314e-05, "loss": 0.5079, "num_input_tokens_seen": 5523296, "step": 4550 }, { "epoch": 0.507294798975387, "grad_norm": 0.49452972412109375, "learning_rate": 2.535917139993318e-05, "loss": 0.5215, "num_input_tokens_seen": 5529568, "step": 4555 }, { "epoch": 0.5078516538590043, "grad_norm": 0.47778797149658203, "learning_rate": 2.5387014144114048e-05, "loss": 0.459, "num_input_tokens_seen": 5535264, "step": 4560 }, { "epoch": 0.5084085087426217, "grad_norm": 0.5123693346977234, "learning_rate": 2.5414856888294913e-05, "loss": 0.5316, "num_input_tokens_seen": 5541760, "step": 4565 }, { "epoch": 0.508965363626239, "grad_norm": 0.5348398685455322, "learning_rate": 2.544269963247578e-05, "loss": 0.5771, "num_input_tokens_seen": 5547808, "step": 4570 }, { "epoch": 0.5095222185098564, "grad_norm": 0.8126906156539917, "learning_rate": 2.5470542376656644e-05, "loss": 0.5011, "num_input_tokens_seen": 5553120, "step": 4575 }, { "epoch": 0.5100790733934737, "grad_norm": 0.4621800482273102, "learning_rate": 2.549838512083751e-05, "loss": 0.5084, "num_input_tokens_seen": 5559328, "step": 4580 }, { "epoch": 0.510635928277091, "grad_norm": 0.46847453713417053, "learning_rate": 2.5526227865018377e-05, "loss": 0.4933, "num_input_tokens_seen": 5565376, "step": 4585 }, { "epoch": 0.5111927831607083, "grad_norm": 0.49331745505332947, "learning_rate": 2.5554070609199243e-05, "loss": 0.5419, "num_input_tokens_seen": 5571200, "step": 4590 }, { "epoch": 0.5117496380443256, "grad_norm": 0.42283734679222107, "learning_rate": 2.558191335338011e-05, "loss": 0.5411, "num_input_tokens_seen": 5577664, "step": 4595 }, { "epoch": 0.512306492927943, "grad_norm": 0.689016580581665, "learning_rate": 2.5609756097560977e-05, "loss": 0.5907, "num_input_tokens_seen": 5584064, "step": 4600 }, { "epoch": 0.5128633478115603, "grad_norm": 0.713565468788147, "learning_rate": 2.5637598841741845e-05, "loss": 0.4898, "num_input_tokens_seen": 5590304, "step": 4605 }, { "epoch": 0.5134202026951776, "grad_norm": 0.7340446710586548, "learning_rate": 2.566544158592271e-05, "loss": 0.5039, "num_input_tokens_seen": 5596576, "step": 4610 }, { "epoch": 0.513977057578795, "grad_norm": 0.4846142828464508, "learning_rate": 2.569328433010358e-05, "loss": 0.5035, "num_input_tokens_seen": 5602752, "step": 4615 }, { "epoch": 0.5145339124624123, "grad_norm": 0.48834845423698425, "learning_rate": 2.572112707428444e-05, "loss": 0.4907, "num_input_tokens_seen": 5608928, "step": 4620 }, { "epoch": 0.5150907673460297, "grad_norm": 0.5844003558158875, "learning_rate": 2.5748969818465306e-05, "loss": 0.5153, "num_input_tokens_seen": 5615040, "step": 4625 }, { "epoch": 0.515647622229647, "grad_norm": 0.4821818470954895, "learning_rate": 2.5776812562646175e-05, "loss": 0.4934, "num_input_tokens_seen": 5620800, "step": 4630 }, { "epoch": 0.5162044771132643, "grad_norm": 0.9346848726272583, "learning_rate": 2.580465530682704e-05, "loss": 0.5149, "num_input_tokens_seen": 5626944, "step": 4635 }, { "epoch": 0.5167613319968816, "grad_norm": 0.437473863363266, "learning_rate": 2.583249805100791e-05, "loss": 0.5039, "num_input_tokens_seen": 5633024, "step": 4640 }, { "epoch": 0.5173181868804989, "grad_norm": 0.5657913684844971, "learning_rate": 2.5860340795188774e-05, "loss": 0.5094, "num_input_tokens_seen": 5639328, "step": 4645 }, { "epoch": 0.5178750417641163, "grad_norm": 0.8435340523719788, "learning_rate": 2.5888183539369643e-05, "loss": 0.5154, "num_input_tokens_seen": 5645312, "step": 4650 }, { "epoch": 0.5184318966477336, "grad_norm": 0.5063620209693909, "learning_rate": 2.5916026283550508e-05, "loss": 0.5178, "num_input_tokens_seen": 5651456, "step": 4655 }, { "epoch": 0.5189887515313509, "grad_norm": 0.43608054518699646, "learning_rate": 2.5943869027731377e-05, "loss": 0.4989, "num_input_tokens_seen": 5657600, "step": 4660 }, { "epoch": 0.5195456064149683, "grad_norm": 0.5486329197883606, "learning_rate": 2.597171177191224e-05, "loss": 0.5299, "num_input_tokens_seen": 5663776, "step": 4665 }, { "epoch": 0.5201024612985856, "grad_norm": 0.6572924256324768, "learning_rate": 2.5999554516093104e-05, "loss": 0.4756, "num_input_tokens_seen": 5669856, "step": 4670 }, { "epoch": 0.520659316182203, "grad_norm": 0.4046572744846344, "learning_rate": 2.6027397260273973e-05, "loss": 0.4686, "num_input_tokens_seen": 5675936, "step": 4675 }, { "epoch": 0.5212161710658203, "grad_norm": 0.6878944635391235, "learning_rate": 2.6055240004454838e-05, "loss": 0.492, "num_input_tokens_seen": 5681952, "step": 4680 }, { "epoch": 0.5217730259494375, "grad_norm": 0.48948293924331665, "learning_rate": 2.6083082748635707e-05, "loss": 0.5101, "num_input_tokens_seen": 5688000, "step": 4685 }, { "epoch": 0.5223298808330549, "grad_norm": 0.607239842414856, "learning_rate": 2.6110925492816575e-05, "loss": 0.5198, "num_input_tokens_seen": 5694144, "step": 4690 }, { "epoch": 0.5228867357166722, "grad_norm": 0.4834328293800354, "learning_rate": 2.613876823699744e-05, "loss": 0.5044, "num_input_tokens_seen": 5700128, "step": 4695 }, { "epoch": 0.5234435906002896, "grad_norm": 0.42528635263442993, "learning_rate": 2.616661098117831e-05, "loss": 0.4966, "num_input_tokens_seen": 5706112, "step": 4700 }, { "epoch": 0.5240004454839069, "grad_norm": 0.6829854249954224, "learning_rate": 2.6194453725359174e-05, "loss": 0.4857, "num_input_tokens_seen": 5712288, "step": 4705 }, { "epoch": 0.5245573003675242, "grad_norm": 0.3983464241027832, "learning_rate": 2.6222296469540036e-05, "loss": 0.4903, "num_input_tokens_seen": 5718656, "step": 4710 }, { "epoch": 0.5251141552511416, "grad_norm": 0.7747660875320435, "learning_rate": 2.6250139213720905e-05, "loss": 0.54, "num_input_tokens_seen": 5724608, "step": 4715 }, { "epoch": 0.5256710101347589, "grad_norm": 0.6605187058448792, "learning_rate": 2.627798195790177e-05, "loss": 0.5313, "num_input_tokens_seen": 5730816, "step": 4720 }, { "epoch": 0.5262278650183763, "grad_norm": 0.45960476994514465, "learning_rate": 2.630582470208264e-05, "loss": 0.5036, "num_input_tokens_seen": 5736832, "step": 4725 }, { "epoch": 0.5267847199019935, "grad_norm": 0.4802801012992859, "learning_rate": 2.6333667446263504e-05, "loss": 0.4974, "num_input_tokens_seen": 5742912, "step": 4730 }, { "epoch": 0.5273415747856108, "grad_norm": 0.4292217791080475, "learning_rate": 2.6361510190444373e-05, "loss": 0.5173, "num_input_tokens_seen": 5748992, "step": 4735 }, { "epoch": 0.5278984296692282, "grad_norm": 0.6615040898323059, "learning_rate": 2.6389352934625238e-05, "loss": 0.4768, "num_input_tokens_seen": 5755392, "step": 4740 }, { "epoch": 0.5284552845528455, "grad_norm": 0.4220753014087677, "learning_rate": 2.6417195678806107e-05, "loss": 0.4886, "num_input_tokens_seen": 5761440, "step": 4745 }, { "epoch": 0.5290121394364629, "grad_norm": 0.5341446399688721, "learning_rate": 2.6445038422986972e-05, "loss": 0.4991, "num_input_tokens_seen": 5767616, "step": 4750 }, { "epoch": 0.5295689943200802, "grad_norm": 0.5006824135780334, "learning_rate": 2.647288116716784e-05, "loss": 0.5053, "num_input_tokens_seen": 5773664, "step": 4755 }, { "epoch": 0.5301258492036975, "grad_norm": 0.6047151684761047, "learning_rate": 2.6500723911348703e-05, "loss": 0.5233, "num_input_tokens_seen": 5779776, "step": 4760 }, { "epoch": 0.5306827040873149, "grad_norm": 0.6556442975997925, "learning_rate": 2.6528566655529568e-05, "loss": 0.5202, "num_input_tokens_seen": 5786144, "step": 4765 }, { "epoch": 0.5312395589709322, "grad_norm": 0.5366353392601013, "learning_rate": 2.6556409399710436e-05, "loss": 0.5633, "num_input_tokens_seen": 5792256, "step": 4770 }, { "epoch": 0.5317964138545495, "grad_norm": 0.48922497034072876, "learning_rate": 2.6584252143891302e-05, "loss": 0.4881, "num_input_tokens_seen": 5798176, "step": 4775 }, { "epoch": 0.5323532687381668, "grad_norm": 0.5057798027992249, "learning_rate": 2.661209488807217e-05, "loss": 0.5028, "num_input_tokens_seen": 5804416, "step": 4780 }, { "epoch": 0.5329101236217841, "grad_norm": 0.46317604184150696, "learning_rate": 2.6639937632253036e-05, "loss": 0.5259, "num_input_tokens_seen": 5810528, "step": 4785 }, { "epoch": 0.5334669785054015, "grad_norm": 0.531015157699585, "learning_rate": 2.6667780376433904e-05, "loss": 0.5414, "num_input_tokens_seen": 5816416, "step": 4790 }, { "epoch": 0.5340238333890188, "grad_norm": 0.6332029700279236, "learning_rate": 2.669562312061477e-05, "loss": 0.5432, "num_input_tokens_seen": 5822656, "step": 4795 }, { "epoch": 0.5345806882726362, "grad_norm": 0.44080162048339844, "learning_rate": 2.6723465864795638e-05, "loss": 0.5161, "num_input_tokens_seen": 5828512, "step": 4800 }, { "epoch": 0.5351375431562535, "grad_norm": 0.5104998350143433, "learning_rate": 2.67513086089765e-05, "loss": 0.5055, "num_input_tokens_seen": 5834432, "step": 4805 }, { "epoch": 0.5356943980398708, "grad_norm": 0.5192365050315857, "learning_rate": 2.6779151353157365e-05, "loss": 0.4838, "num_input_tokens_seen": 5839808, "step": 4810 }, { "epoch": 0.5362512529234882, "grad_norm": 0.4790550172328949, "learning_rate": 2.6806994097338234e-05, "loss": 0.4542, "num_input_tokens_seen": 5845984, "step": 4815 }, { "epoch": 0.5368081078071054, "grad_norm": 0.8390695452690125, "learning_rate": 2.68348368415191e-05, "loss": 0.5082, "num_input_tokens_seen": 5852032, "step": 4820 }, { "epoch": 0.5373649626907228, "grad_norm": 0.4957265853881836, "learning_rate": 2.6862679585699968e-05, "loss": 0.543, "num_input_tokens_seen": 5858176, "step": 4825 }, { "epoch": 0.5379218175743401, "grad_norm": 0.5074983239173889, "learning_rate": 2.6890522329880837e-05, "loss": 0.4746, "num_input_tokens_seen": 5864352, "step": 4830 }, { "epoch": 0.5384786724579574, "grad_norm": 0.5724844932556152, "learning_rate": 2.6918365074061702e-05, "loss": 0.5012, "num_input_tokens_seen": 5870528, "step": 4835 }, { "epoch": 0.5390355273415748, "grad_norm": 0.47581014037132263, "learning_rate": 2.694620781824257e-05, "loss": 0.5234, "num_input_tokens_seen": 5876640, "step": 4840 }, { "epoch": 0.5395923822251921, "grad_norm": 0.5234749913215637, "learning_rate": 2.6974050562423436e-05, "loss": 0.5046, "num_input_tokens_seen": 5882272, "step": 4845 }, { "epoch": 0.5401492371088095, "grad_norm": 0.4404429793357849, "learning_rate": 2.7001893306604298e-05, "loss": 0.488, "num_input_tokens_seen": 5888288, "step": 4850 }, { "epoch": 0.5407060919924268, "grad_norm": 0.4020816683769226, "learning_rate": 2.7029736050785166e-05, "loss": 0.4802, "num_input_tokens_seen": 5894656, "step": 4855 }, { "epoch": 0.5412629468760441, "grad_norm": 0.4114299714565277, "learning_rate": 2.705757879496603e-05, "loss": 0.5328, "num_input_tokens_seen": 5900832, "step": 4860 }, { "epoch": 0.5418198017596614, "grad_norm": 0.4211278557777405, "learning_rate": 2.70854215391469e-05, "loss": 0.4845, "num_input_tokens_seen": 5906848, "step": 4865 }, { "epoch": 0.5423766566432787, "grad_norm": 0.5576079487800598, "learning_rate": 2.7113264283327766e-05, "loss": 0.4857, "num_input_tokens_seen": 5912832, "step": 4870 }, { "epoch": 0.5429335115268961, "grad_norm": 0.769550085067749, "learning_rate": 2.7141107027508634e-05, "loss": 0.4835, "num_input_tokens_seen": 5918880, "step": 4875 }, { "epoch": 0.5434903664105134, "grad_norm": 0.6430606842041016, "learning_rate": 2.71689497716895e-05, "loss": 0.49, "num_input_tokens_seen": 5925120, "step": 4880 }, { "epoch": 0.5440472212941307, "grad_norm": 0.4859071671962738, "learning_rate": 2.7196792515870368e-05, "loss": 0.4937, "num_input_tokens_seen": 5931360, "step": 4885 }, { "epoch": 0.5446040761777481, "grad_norm": 0.6694382429122925, "learning_rate": 2.7224635260051233e-05, "loss": 0.4887, "num_input_tokens_seen": 5937728, "step": 4890 }, { "epoch": 0.5451609310613654, "grad_norm": 0.5157943367958069, "learning_rate": 2.7252478004232095e-05, "loss": 0.5181, "num_input_tokens_seen": 5944384, "step": 4895 }, { "epoch": 0.5457177859449828, "grad_norm": 0.6861395835876465, "learning_rate": 2.7280320748412964e-05, "loss": 0.5353, "num_input_tokens_seen": 5950336, "step": 4900 }, { "epoch": 0.5462746408286001, "grad_norm": 0.4970448613166809, "learning_rate": 2.730816349259383e-05, "loss": 0.4742, "num_input_tokens_seen": 5956704, "step": 4905 }, { "epoch": 0.5468314957122173, "grad_norm": 0.6933923959732056, "learning_rate": 2.7336006236774698e-05, "loss": 0.5202, "num_input_tokens_seen": 5962752, "step": 4910 }, { "epoch": 0.5473883505958347, "grad_norm": 0.6027467250823975, "learning_rate": 2.7363848980955563e-05, "loss": 0.4992, "num_input_tokens_seen": 5968000, "step": 4915 }, { "epoch": 0.547945205479452, "grad_norm": 0.510402500629425, "learning_rate": 2.7391691725136432e-05, "loss": 0.5487, "num_input_tokens_seen": 5974464, "step": 4920 }, { "epoch": 0.5485020603630694, "grad_norm": 0.5019358992576599, "learning_rate": 2.7419534469317297e-05, "loss": 0.5295, "num_input_tokens_seen": 5979968, "step": 4925 }, { "epoch": 0.5490589152466867, "grad_norm": 0.40406426787376404, "learning_rate": 2.7447377213498166e-05, "loss": 0.513, "num_input_tokens_seen": 5985856, "step": 4930 }, { "epoch": 0.549615770130304, "grad_norm": 0.5646610856056213, "learning_rate": 2.747521995767903e-05, "loss": 0.4999, "num_input_tokens_seen": 5992480, "step": 4935 }, { "epoch": 0.5501726250139214, "grad_norm": 0.5144348740577698, "learning_rate": 2.7503062701859893e-05, "loss": 0.5019, "num_input_tokens_seen": 5998624, "step": 4940 }, { "epoch": 0.5507294798975387, "grad_norm": 0.47970902919769287, "learning_rate": 2.753090544604076e-05, "loss": 0.5053, "num_input_tokens_seen": 6004576, "step": 4945 }, { "epoch": 0.5512863347811561, "grad_norm": 0.44682371616363525, "learning_rate": 2.7558748190221627e-05, "loss": 0.4873, "num_input_tokens_seen": 6010944, "step": 4950 }, { "epoch": 0.5518431896647734, "grad_norm": 0.48236197233200073, "learning_rate": 2.7586590934402495e-05, "loss": 0.478, "num_input_tokens_seen": 6017088, "step": 4955 }, { "epoch": 0.5524000445483906, "grad_norm": 0.6679386496543884, "learning_rate": 2.761443367858336e-05, "loss": 0.4959, "num_input_tokens_seen": 6023360, "step": 4960 }, { "epoch": 0.552956899432008, "grad_norm": 0.48968416452407837, "learning_rate": 2.764227642276423e-05, "loss": 0.4961, "num_input_tokens_seen": 6029408, "step": 4965 }, { "epoch": 0.5535137543156253, "grad_norm": 0.49185431003570557, "learning_rate": 2.7670119166945098e-05, "loss": 0.4422, "num_input_tokens_seen": 6035488, "step": 4970 }, { "epoch": 0.5540706091992427, "grad_norm": 0.7633704543113708, "learning_rate": 2.7697961911125963e-05, "loss": 0.5189, "num_input_tokens_seen": 6041600, "step": 4975 }, { "epoch": 0.55462746408286, "grad_norm": 0.5369128584861755, "learning_rate": 2.7725804655306832e-05, "loss": 0.5438, "num_input_tokens_seen": 6048032, "step": 4980 }, { "epoch": 0.5551843189664774, "grad_norm": 0.6318516731262207, "learning_rate": 2.7753647399487694e-05, "loss": 0.5346, "num_input_tokens_seen": 6054112, "step": 4985 }, { "epoch": 0.5557411738500947, "grad_norm": 0.6468069553375244, "learning_rate": 2.778149014366856e-05, "loss": 0.4855, "num_input_tokens_seen": 6060128, "step": 4990 }, { "epoch": 0.556298028733712, "grad_norm": 0.40884271264076233, "learning_rate": 2.7809332887849428e-05, "loss": 0.5025, "num_input_tokens_seen": 6066144, "step": 4995 }, { "epoch": 0.5568548836173294, "grad_norm": 0.6577207446098328, "learning_rate": 2.7837175632030293e-05, "loss": 0.5074, "num_input_tokens_seen": 6072384, "step": 5000 }, { "epoch": 0.5574117385009466, "grad_norm": 0.5672372579574585, "learning_rate": 2.7865018376211162e-05, "loss": 0.5172, "num_input_tokens_seen": 6078208, "step": 5005 }, { "epoch": 0.557968593384564, "grad_norm": 0.5999768376350403, "learning_rate": 2.7892861120392027e-05, "loss": 0.4867, "num_input_tokens_seen": 6084384, "step": 5010 }, { "epoch": 0.5585254482681813, "grad_norm": 0.44607359170913696, "learning_rate": 2.7920703864572896e-05, "loss": 0.5149, "num_input_tokens_seen": 6090432, "step": 5015 }, { "epoch": 0.5590823031517986, "grad_norm": 0.6348535418510437, "learning_rate": 2.794854660875376e-05, "loss": 0.5174, "num_input_tokens_seen": 6096384, "step": 5020 }, { "epoch": 0.559639158035416, "grad_norm": 0.41428855061531067, "learning_rate": 2.797638935293463e-05, "loss": 0.4691, "num_input_tokens_seen": 6102464, "step": 5025 }, { "epoch": 0.5601960129190333, "grad_norm": 0.36423149704933167, "learning_rate": 2.800423209711549e-05, "loss": 0.4911, "num_input_tokens_seen": 6107904, "step": 5030 }, { "epoch": 0.5607528678026507, "grad_norm": 0.4773004353046417, "learning_rate": 2.8032074841296357e-05, "loss": 0.501, "num_input_tokens_seen": 6113952, "step": 5035 }, { "epoch": 0.561309722686268, "grad_norm": 0.5163204669952393, "learning_rate": 2.8059917585477225e-05, "loss": 0.5044, "num_input_tokens_seen": 6119904, "step": 5040 }, { "epoch": 0.5618665775698853, "grad_norm": 2.1686880588531494, "learning_rate": 2.808776032965809e-05, "loss": 0.4857, "num_input_tokens_seen": 6126240, "step": 5045 }, { "epoch": 0.5624234324535026, "grad_norm": 0.43552112579345703, "learning_rate": 2.811560307383896e-05, "loss": 0.4895, "num_input_tokens_seen": 6132320, "step": 5050 }, { "epoch": 0.5629802873371199, "grad_norm": 0.5435423254966736, "learning_rate": 2.8143445818019825e-05, "loss": 0.5011, "num_input_tokens_seen": 6138272, "step": 5055 }, { "epoch": 0.5635371422207373, "grad_norm": 0.5629425644874573, "learning_rate": 2.8171288562200693e-05, "loss": 0.5049, "num_input_tokens_seen": 6144160, "step": 5060 }, { "epoch": 0.5640939971043546, "grad_norm": 0.46313488483428955, "learning_rate": 2.819913130638156e-05, "loss": 0.4907, "num_input_tokens_seen": 6150080, "step": 5065 }, { "epoch": 0.5646508519879719, "grad_norm": 0.5227314233779907, "learning_rate": 2.8226974050562427e-05, "loss": 0.4851, "num_input_tokens_seen": 6156256, "step": 5070 }, { "epoch": 0.5652077068715893, "grad_norm": 0.5501190423965454, "learning_rate": 2.8254816794743292e-05, "loss": 0.5005, "num_input_tokens_seen": 6161888, "step": 5075 }, { "epoch": 0.5657645617552066, "grad_norm": 0.6625358462333679, "learning_rate": 2.8282659538924154e-05, "loss": 0.5316, "num_input_tokens_seen": 6167776, "step": 5080 }, { "epoch": 0.566321416638824, "grad_norm": 0.36297643184661865, "learning_rate": 2.8310502283105023e-05, "loss": 0.4911, "num_input_tokens_seen": 6173696, "step": 5085 }, { "epoch": 0.5668782715224413, "grad_norm": 0.586957573890686, "learning_rate": 2.8338345027285888e-05, "loss": 0.4586, "num_input_tokens_seen": 6179008, "step": 5090 }, { "epoch": 0.5674351264060585, "grad_norm": 0.6143355965614319, "learning_rate": 2.8366187771466757e-05, "loss": 0.4741, "num_input_tokens_seen": 6185184, "step": 5095 }, { "epoch": 0.5679919812896759, "grad_norm": 0.5469594597816467, "learning_rate": 2.8394030515647626e-05, "loss": 0.5127, "num_input_tokens_seen": 6190816, "step": 5100 }, { "epoch": 0.5685488361732932, "grad_norm": 0.5611235499382019, "learning_rate": 2.842187325982849e-05, "loss": 0.4847, "num_input_tokens_seen": 6196768, "step": 5105 }, { "epoch": 0.5691056910569106, "grad_norm": 0.44568201899528503, "learning_rate": 2.844971600400936e-05, "loss": 0.5147, "num_input_tokens_seen": 6202304, "step": 5110 }, { "epoch": 0.5696625459405279, "grad_norm": 0.52515709400177, "learning_rate": 2.8477558748190225e-05, "loss": 0.4981, "num_input_tokens_seen": 6208512, "step": 5115 }, { "epoch": 0.5702194008241452, "grad_norm": 0.453872948884964, "learning_rate": 2.8505401492371093e-05, "loss": 0.4979, "num_input_tokens_seen": 6214368, "step": 5120 }, { "epoch": 0.5707762557077626, "grad_norm": 0.5075692534446716, "learning_rate": 2.8533244236551955e-05, "loss": 0.4937, "num_input_tokens_seen": 6220224, "step": 5125 }, { "epoch": 0.5713331105913799, "grad_norm": 0.4649978280067444, "learning_rate": 2.856108698073282e-05, "loss": 0.492, "num_input_tokens_seen": 6226368, "step": 5130 }, { "epoch": 0.5718899654749973, "grad_norm": 0.4727384150028229, "learning_rate": 2.858892972491369e-05, "loss": 0.6039, "num_input_tokens_seen": 6232800, "step": 5135 }, { "epoch": 0.5724468203586145, "grad_norm": 0.5622254610061646, "learning_rate": 2.8616772469094554e-05, "loss": 0.4641, "num_input_tokens_seen": 6238912, "step": 5140 }, { "epoch": 0.5730036752422318, "grad_norm": 0.43272778391838074, "learning_rate": 2.8644615213275423e-05, "loss": 0.4928, "num_input_tokens_seen": 6245152, "step": 5145 }, { "epoch": 0.5735605301258492, "grad_norm": 0.6933011412620544, "learning_rate": 2.867245795745629e-05, "loss": 0.4735, "num_input_tokens_seen": 6250976, "step": 5150 }, { "epoch": 0.5741173850094665, "grad_norm": 0.4965987801551819, "learning_rate": 2.8700300701637157e-05, "loss": 0.4622, "num_input_tokens_seen": 6256992, "step": 5155 }, { "epoch": 0.5746742398930839, "grad_norm": 0.5069572329521179, "learning_rate": 2.8728143445818022e-05, "loss": 0.4542, "num_input_tokens_seen": 6262944, "step": 5160 }, { "epoch": 0.5752310947767012, "grad_norm": 0.49763262271881104, "learning_rate": 2.875598618999889e-05, "loss": 0.4758, "num_input_tokens_seen": 6269088, "step": 5165 }, { "epoch": 0.5757879496603185, "grad_norm": 0.594703197479248, "learning_rate": 2.8783828934179753e-05, "loss": 0.4742, "num_input_tokens_seen": 6274720, "step": 5170 }, { "epoch": 0.5763448045439359, "grad_norm": 0.4610941410064697, "learning_rate": 2.8811671678360618e-05, "loss": 0.5181, "num_input_tokens_seen": 6280640, "step": 5175 }, { "epoch": 0.5769016594275532, "grad_norm": 0.4745159149169922, "learning_rate": 2.8839514422541487e-05, "loss": 0.505, "num_input_tokens_seen": 6286656, "step": 5180 }, { "epoch": 0.5774585143111705, "grad_norm": 0.8118071556091309, "learning_rate": 2.8867357166722352e-05, "loss": 0.4825, "num_input_tokens_seen": 6292736, "step": 5185 }, { "epoch": 0.5780153691947878, "grad_norm": 0.5403907299041748, "learning_rate": 2.889519991090322e-05, "loss": 0.473, "num_input_tokens_seen": 6298848, "step": 5190 }, { "epoch": 0.5785722240784051, "grad_norm": 0.6116636395454407, "learning_rate": 2.8923042655084086e-05, "loss": 0.5006, "num_input_tokens_seen": 6305120, "step": 5195 }, { "epoch": 0.5791290789620225, "grad_norm": 0.5314605236053467, "learning_rate": 2.8950885399264955e-05, "loss": 0.4847, "num_input_tokens_seen": 6311264, "step": 5200 }, { "epoch": 0.5796859338456398, "grad_norm": 0.5333969593048096, "learning_rate": 2.897872814344582e-05, "loss": 0.5057, "num_input_tokens_seen": 6317088, "step": 5205 }, { "epoch": 0.5802427887292572, "grad_norm": 0.8610332012176514, "learning_rate": 2.900657088762669e-05, "loss": 0.4988, "num_input_tokens_seen": 6322624, "step": 5210 }, { "epoch": 0.5807996436128745, "grad_norm": 0.6224839091300964, "learning_rate": 2.903441363180755e-05, "loss": 0.4684, "num_input_tokens_seen": 6328160, "step": 5215 }, { "epoch": 0.5813564984964918, "grad_norm": 0.4835337698459625, "learning_rate": 2.9062256375988416e-05, "loss": 0.5035, "num_input_tokens_seen": 6333728, "step": 5220 }, { "epoch": 0.5819133533801092, "grad_norm": 0.4799236059188843, "learning_rate": 2.9090099120169284e-05, "loss": 0.4854, "num_input_tokens_seen": 6339872, "step": 5225 }, { "epoch": 0.5824702082637264, "grad_norm": 0.48343542218208313, "learning_rate": 2.911794186435015e-05, "loss": 0.4758, "num_input_tokens_seen": 6345856, "step": 5230 }, { "epoch": 0.5830270631473438, "grad_norm": 0.4750179946422577, "learning_rate": 2.9145784608531018e-05, "loss": 0.4844, "num_input_tokens_seen": 6351840, "step": 5235 }, { "epoch": 0.5835839180309611, "grad_norm": 0.562828779220581, "learning_rate": 2.9173627352711887e-05, "loss": 0.4747, "num_input_tokens_seen": 6357952, "step": 5240 }, { "epoch": 0.5841407729145784, "grad_norm": 0.4219663441181183, "learning_rate": 2.9201470096892752e-05, "loss": 0.4866, "num_input_tokens_seen": 6363968, "step": 5245 }, { "epoch": 0.5846976277981958, "grad_norm": 0.49724993109703064, "learning_rate": 2.922931284107362e-05, "loss": 0.4962, "num_input_tokens_seen": 6370336, "step": 5250 }, { "epoch": 0.5852544826818131, "grad_norm": 0.6368589401245117, "learning_rate": 2.9257155585254486e-05, "loss": 0.5054, "num_input_tokens_seen": 6376320, "step": 5255 }, { "epoch": 0.5858113375654305, "grad_norm": 0.5198273658752441, "learning_rate": 2.9284998329435348e-05, "loss": 0.4799, "num_input_tokens_seen": 6382592, "step": 5260 }, { "epoch": 0.5863681924490478, "grad_norm": 0.44435709714889526, "learning_rate": 2.9312841073616217e-05, "loss": 0.4649, "num_input_tokens_seen": 6388512, "step": 5265 }, { "epoch": 0.5869250473326652, "grad_norm": 0.4444536566734314, "learning_rate": 2.9340683817797082e-05, "loss": 0.513, "num_input_tokens_seen": 6394560, "step": 5270 }, { "epoch": 0.5874819022162825, "grad_norm": 0.44501158595085144, "learning_rate": 2.936852656197795e-05, "loss": 0.4832, "num_input_tokens_seen": 6400480, "step": 5275 }, { "epoch": 0.5880387570998997, "grad_norm": 0.6461226344108582, "learning_rate": 2.9396369306158816e-05, "loss": 0.5208, "num_input_tokens_seen": 6406048, "step": 5280 }, { "epoch": 0.5885956119835171, "grad_norm": 0.5085161328315735, "learning_rate": 2.9424212050339685e-05, "loss": 0.497, "num_input_tokens_seen": 6412256, "step": 5285 }, { "epoch": 0.5891524668671344, "grad_norm": 0.45155349373817444, "learning_rate": 2.945205479452055e-05, "loss": 0.4803, "num_input_tokens_seen": 6418400, "step": 5290 }, { "epoch": 0.5897093217507517, "grad_norm": 0.4852399230003357, "learning_rate": 2.947989753870142e-05, "loss": 0.5024, "num_input_tokens_seen": 6424640, "step": 5295 }, { "epoch": 0.5902661766343691, "grad_norm": 0.3507018983364105, "learning_rate": 2.9507740282882284e-05, "loss": 0.4863, "num_input_tokens_seen": 6430720, "step": 5300 }, { "epoch": 0.5908230315179864, "grad_norm": 0.48733097314834595, "learning_rate": 2.9535583027063146e-05, "loss": 0.4944, "num_input_tokens_seen": 6436256, "step": 5305 }, { "epoch": 0.5913798864016038, "grad_norm": 0.44163402915000916, "learning_rate": 2.9563425771244014e-05, "loss": 0.478, "num_input_tokens_seen": 6442208, "step": 5310 }, { "epoch": 0.5919367412852211, "grad_norm": 0.4821125268936157, "learning_rate": 2.959126851542488e-05, "loss": 0.4657, "num_input_tokens_seen": 6447840, "step": 5315 }, { "epoch": 0.5924935961688385, "grad_norm": 0.6865218877792358, "learning_rate": 2.9619111259605748e-05, "loss": 0.5023, "num_input_tokens_seen": 6454208, "step": 5320 }, { "epoch": 0.5930504510524557, "grad_norm": 0.4763175845146179, "learning_rate": 2.9646954003786613e-05, "loss": 0.488, "num_input_tokens_seen": 6460480, "step": 5325 }, { "epoch": 0.593607305936073, "grad_norm": 0.47064656019210815, "learning_rate": 2.9674796747967482e-05, "loss": 0.5137, "num_input_tokens_seen": 6466176, "step": 5330 }, { "epoch": 0.5941641608196904, "grad_norm": 0.5310936570167542, "learning_rate": 2.9702639492148347e-05, "loss": 0.5225, "num_input_tokens_seen": 6472064, "step": 5335 }, { "epoch": 0.5947210157033077, "grad_norm": 0.5191647410392761, "learning_rate": 2.9730482236329216e-05, "loss": 0.497, "num_input_tokens_seen": 6478176, "step": 5340 }, { "epoch": 0.595277870586925, "grad_norm": 0.6458681225776672, "learning_rate": 2.975832498051008e-05, "loss": 0.4966, "num_input_tokens_seen": 6484480, "step": 5345 }, { "epoch": 0.5958347254705424, "grad_norm": 0.49928873777389526, "learning_rate": 2.9786167724690943e-05, "loss": 0.4682, "num_input_tokens_seen": 6490368, "step": 5350 }, { "epoch": 0.5963915803541597, "grad_norm": 0.4800792634487152, "learning_rate": 2.9814010468871812e-05, "loss": 0.4996, "num_input_tokens_seen": 6496352, "step": 5355 }, { "epoch": 0.5969484352377771, "grad_norm": 0.49730658531188965, "learning_rate": 2.9841853213052677e-05, "loss": 0.4843, "num_input_tokens_seen": 6502048, "step": 5360 }, { "epoch": 0.5975052901213944, "grad_norm": 0.5556466579437256, "learning_rate": 2.9869695957233546e-05, "loss": 0.4895, "num_input_tokens_seen": 6508384, "step": 5365 }, { "epoch": 0.5980621450050116, "grad_norm": 0.5109179019927979, "learning_rate": 2.989753870141441e-05, "loss": 0.5172, "num_input_tokens_seen": 6514560, "step": 5370 }, { "epoch": 0.598618999888629, "grad_norm": 0.4140401780605316, "learning_rate": 2.992538144559528e-05, "loss": 0.5006, "num_input_tokens_seen": 6520960, "step": 5375 }, { "epoch": 0.5991758547722463, "grad_norm": 0.48383617401123047, "learning_rate": 2.995322418977615e-05, "loss": 0.5002, "num_input_tokens_seen": 6527040, "step": 5380 }, { "epoch": 0.5997327096558637, "grad_norm": 0.4619150459766388, "learning_rate": 2.9981066933957014e-05, "loss": 0.5075, "num_input_tokens_seen": 6533440, "step": 5385 }, { "epoch": 0.600289564539481, "grad_norm": 0.3481505513191223, "learning_rate": 3.0008909678137882e-05, "loss": 0.4977, "num_input_tokens_seen": 6539520, "step": 5390 }, { "epoch": 0.6008464194230984, "grad_norm": 0.6326341032981873, "learning_rate": 3.003675242231874e-05, "loss": 0.4842, "num_input_tokens_seen": 6545408, "step": 5395 }, { "epoch": 0.6014032743067157, "grad_norm": 0.5540825128555298, "learning_rate": 3.006459516649961e-05, "loss": 0.498, "num_input_tokens_seen": 6551424, "step": 5400 }, { "epoch": 0.601960129190333, "grad_norm": 0.4675358533859253, "learning_rate": 3.0092437910680478e-05, "loss": 0.5046, "num_input_tokens_seen": 6557248, "step": 5405 }, { "epoch": 0.6025169840739504, "grad_norm": 0.4630054235458374, "learning_rate": 3.0120280654861343e-05, "loss": 0.5122, "num_input_tokens_seen": 6563584, "step": 5410 }, { "epoch": 0.6030738389575676, "grad_norm": 0.42999890446662903, "learning_rate": 3.0148123399042212e-05, "loss": 0.4871, "num_input_tokens_seen": 6569440, "step": 5415 }, { "epoch": 0.603630693841185, "grad_norm": 0.4344916045665741, "learning_rate": 3.0175966143223077e-05, "loss": 0.4747, "num_input_tokens_seen": 6575712, "step": 5420 }, { "epoch": 0.6041875487248023, "grad_norm": 0.5199522376060486, "learning_rate": 3.0203808887403946e-05, "loss": 0.482, "num_input_tokens_seen": 6581792, "step": 5425 }, { "epoch": 0.6047444036084196, "grad_norm": 0.5810474753379822, "learning_rate": 3.023165163158481e-05, "loss": 0.4588, "num_input_tokens_seen": 6587232, "step": 5430 }, { "epoch": 0.605301258492037, "grad_norm": 0.4579968750476837, "learning_rate": 3.025949437576568e-05, "loss": 0.5025, "num_input_tokens_seen": 6593248, "step": 5435 }, { "epoch": 0.6058581133756543, "grad_norm": 0.4502491056919098, "learning_rate": 3.0287337119946545e-05, "loss": 0.5199, "num_input_tokens_seen": 6599424, "step": 5440 }, { "epoch": 0.6064149682592717, "grad_norm": 0.5488900542259216, "learning_rate": 3.0315179864127407e-05, "loss": 0.5052, "num_input_tokens_seen": 6604864, "step": 5445 }, { "epoch": 0.606971823142889, "grad_norm": 0.5173280239105225, "learning_rate": 3.0343022608308276e-05, "loss": 0.4969, "num_input_tokens_seen": 6611072, "step": 5450 }, { "epoch": 0.6075286780265063, "grad_norm": 0.6459969878196716, "learning_rate": 3.037086535248914e-05, "loss": 0.4911, "num_input_tokens_seen": 6617216, "step": 5455 }, { "epoch": 0.6080855329101236, "grad_norm": 0.49686458706855774, "learning_rate": 3.039870809667001e-05, "loss": 0.4575, "num_input_tokens_seen": 6623360, "step": 5460 }, { "epoch": 0.6086423877937409, "grad_norm": 0.4572870135307312, "learning_rate": 3.0426550840850875e-05, "loss": 0.493, "num_input_tokens_seen": 6629504, "step": 5465 }, { "epoch": 0.6091992426773583, "grad_norm": 0.41412636637687683, "learning_rate": 3.0454393585031743e-05, "loss": 0.469, "num_input_tokens_seen": 6635456, "step": 5470 }, { "epoch": 0.6097560975609756, "grad_norm": 0.6608960032463074, "learning_rate": 3.048223632921261e-05, "loss": 0.4554, "num_input_tokens_seen": 6641376, "step": 5475 }, { "epoch": 0.6103129524445929, "grad_norm": 0.5120620727539062, "learning_rate": 3.0510079073393477e-05, "loss": 0.4817, "num_input_tokens_seen": 6647680, "step": 5480 }, { "epoch": 0.6108698073282103, "grad_norm": 0.42689695954322815, "learning_rate": 3.053792181757434e-05, "loss": 0.4858, "num_input_tokens_seen": 6653728, "step": 5485 }, { "epoch": 0.6114266622118276, "grad_norm": 0.47716960310935974, "learning_rate": 3.0565764561755205e-05, "loss": 0.4725, "num_input_tokens_seen": 6659872, "step": 5490 }, { "epoch": 0.611983517095445, "grad_norm": 0.5690329670906067, "learning_rate": 3.059360730593607e-05, "loss": 0.4931, "num_input_tokens_seen": 6666432, "step": 5495 }, { "epoch": 0.6125403719790623, "grad_norm": 0.5522893071174622, "learning_rate": 3.062145005011694e-05, "loss": 0.4698, "num_input_tokens_seen": 6672608, "step": 5500 }, { "epoch": 0.6130972268626795, "grad_norm": 0.3693057596683502, "learning_rate": 3.0649292794297804e-05, "loss": 0.4845, "num_input_tokens_seen": 6678368, "step": 5505 }, { "epoch": 0.6136540817462969, "grad_norm": 0.5857660174369812, "learning_rate": 3.067713553847867e-05, "loss": 0.4675, "num_input_tokens_seen": 6684736, "step": 5510 }, { "epoch": 0.6142109366299142, "grad_norm": 0.5477136373519897, "learning_rate": 3.070497828265954e-05, "loss": 0.4882, "num_input_tokens_seen": 6690848, "step": 5515 }, { "epoch": 0.6147677915135316, "grad_norm": 0.5610957741737366, "learning_rate": 3.073282102684041e-05, "loss": 0.502, "num_input_tokens_seen": 6697216, "step": 5520 }, { "epoch": 0.6153246463971489, "grad_norm": 0.5132215619087219, "learning_rate": 3.076066377102128e-05, "loss": 0.5032, "num_input_tokens_seen": 6703456, "step": 5525 }, { "epoch": 0.6158815012807662, "grad_norm": 0.6415368318557739, "learning_rate": 3.078850651520214e-05, "loss": 0.5162, "num_input_tokens_seen": 6709728, "step": 5530 }, { "epoch": 0.6164383561643836, "grad_norm": 0.5051767826080322, "learning_rate": 3.0816349259383e-05, "loss": 0.525, "num_input_tokens_seen": 6715616, "step": 5535 }, { "epoch": 0.6169952110480009, "grad_norm": 0.5775355696678162, "learning_rate": 3.084419200356387e-05, "loss": 0.4902, "num_input_tokens_seen": 6721376, "step": 5540 }, { "epoch": 0.6175520659316183, "grad_norm": 0.4799974858760834, "learning_rate": 3.087203474774474e-05, "loss": 0.5189, "num_input_tokens_seen": 6727648, "step": 5545 }, { "epoch": 0.6181089208152355, "grad_norm": 0.5300256013870239, "learning_rate": 3.089987749192561e-05, "loss": 0.519, "num_input_tokens_seen": 6733696, "step": 5550 }, { "epoch": 0.6186657756988528, "grad_norm": 0.44152188301086426, "learning_rate": 3.092772023610647e-05, "loss": 0.4707, "num_input_tokens_seen": 6739872, "step": 5555 }, { "epoch": 0.6192226305824702, "grad_norm": 0.3804726004600525, "learning_rate": 3.095556298028734e-05, "loss": 0.486, "num_input_tokens_seen": 6746016, "step": 5560 }, { "epoch": 0.6197794854660875, "grad_norm": 1.8793643712997437, "learning_rate": 3.098340572446821e-05, "loss": 0.5292, "num_input_tokens_seen": 6752416, "step": 5565 }, { "epoch": 0.6203363403497049, "grad_norm": 0.4463757872581482, "learning_rate": 3.1011248468649076e-05, "loss": 0.4747, "num_input_tokens_seen": 6758688, "step": 5570 }, { "epoch": 0.6208931952333222, "grad_norm": 0.501703679561615, "learning_rate": 3.103909121282994e-05, "loss": 0.473, "num_input_tokens_seen": 6764896, "step": 5575 }, { "epoch": 0.6214500501169395, "grad_norm": 0.4373491108417511, "learning_rate": 3.10669339570108e-05, "loss": 0.4817, "num_input_tokens_seen": 6770976, "step": 5580 }, { "epoch": 0.6220069050005569, "grad_norm": 0.5438941121101379, "learning_rate": 3.109477670119167e-05, "loss": 0.4751, "num_input_tokens_seen": 6776960, "step": 5585 }, { "epoch": 0.6225637598841742, "grad_norm": 0.568481981754303, "learning_rate": 3.112261944537254e-05, "loss": 0.4933, "num_input_tokens_seen": 6783200, "step": 5590 }, { "epoch": 0.6231206147677915, "grad_norm": 0.6089180707931519, "learning_rate": 3.1150462189553406e-05, "loss": 0.5137, "num_input_tokens_seen": 6789344, "step": 5595 }, { "epoch": 0.6236774696514088, "grad_norm": 0.42653656005859375, "learning_rate": 3.117830493373427e-05, "loss": 0.4798, "num_input_tokens_seen": 6795616, "step": 5600 }, { "epoch": 0.6242343245350261, "grad_norm": 0.6277459859848022, "learning_rate": 3.1206147677915136e-05, "loss": 0.4781, "num_input_tokens_seen": 6801600, "step": 5605 }, { "epoch": 0.6247911794186435, "grad_norm": 0.5739933252334595, "learning_rate": 3.1233990422096005e-05, "loss": 0.5228, "num_input_tokens_seen": 6806912, "step": 5610 }, { "epoch": 0.6253480343022608, "grad_norm": 0.5521772503852844, "learning_rate": 3.1261833166276874e-05, "loss": 0.4742, "num_input_tokens_seen": 6813184, "step": 5615 }, { "epoch": 0.6259048891858782, "grad_norm": 0.4434637427330017, "learning_rate": 3.1289675910457735e-05, "loss": 0.4934, "num_input_tokens_seen": 6818976, "step": 5620 }, { "epoch": 0.6264617440694955, "grad_norm": 0.4022592008113861, "learning_rate": 3.13175186546386e-05, "loss": 0.5135, "num_input_tokens_seen": 6824992, "step": 5625 }, { "epoch": 0.6270185989531128, "grad_norm": 0.4153516888618469, "learning_rate": 3.1345361398819466e-05, "loss": 0.4727, "num_input_tokens_seen": 6830752, "step": 5630 }, { "epoch": 0.6275754538367302, "grad_norm": 0.3987622857093811, "learning_rate": 3.1373204143000335e-05, "loss": 0.4894, "num_input_tokens_seen": 6836992, "step": 5635 }, { "epoch": 0.6281323087203475, "grad_norm": 0.5045483112335205, "learning_rate": 3.14010468871812e-05, "loss": 0.4581, "num_input_tokens_seen": 6842880, "step": 5640 }, { "epoch": 0.6286891636039648, "grad_norm": 0.47754576802253723, "learning_rate": 3.1428889631362065e-05, "loss": 0.5048, "num_input_tokens_seen": 6849184, "step": 5645 }, { "epoch": 0.6292460184875821, "grad_norm": 0.4775291085243225, "learning_rate": 3.1456732375542934e-05, "loss": 0.5008, "num_input_tokens_seen": 6855200, "step": 5650 }, { "epoch": 0.6298028733711994, "grad_norm": 0.3989838659763336, "learning_rate": 3.14845751197238e-05, "loss": 0.5041, "num_input_tokens_seen": 6861344, "step": 5655 }, { "epoch": 0.6303597282548168, "grad_norm": 0.5784767866134644, "learning_rate": 3.151241786390467e-05, "loss": 0.4921, "num_input_tokens_seen": 6867616, "step": 5660 }, { "epoch": 0.6309165831384341, "grad_norm": 0.4615471363067627, "learning_rate": 3.154026060808554e-05, "loss": 0.5016, "num_input_tokens_seen": 6873952, "step": 5665 }, { "epoch": 0.6314734380220515, "grad_norm": 0.4638471007347107, "learning_rate": 3.1568103352266395e-05, "loss": 0.5018, "num_input_tokens_seen": 6879968, "step": 5670 }, { "epoch": 0.6320302929056688, "grad_norm": 0.34080010652542114, "learning_rate": 3.1595946096447264e-05, "loss": 0.5511, "num_input_tokens_seen": 6885888, "step": 5675 }, { "epoch": 0.6325871477892862, "grad_norm": 0.40432795882225037, "learning_rate": 3.162378884062813e-05, "loss": 0.4969, "num_input_tokens_seen": 6892288, "step": 5680 }, { "epoch": 0.6331440026729035, "grad_norm": 0.4416799545288086, "learning_rate": 3.1651631584809e-05, "loss": 0.4624, "num_input_tokens_seen": 6898528, "step": 5685 }, { "epoch": 0.6337008575565207, "grad_norm": 0.5388666987419128, "learning_rate": 3.167947432898987e-05, "loss": 0.4914, "num_input_tokens_seen": 6904736, "step": 5690 }, { "epoch": 0.6342577124401381, "grad_norm": 0.37019702792167664, "learning_rate": 3.170731707317073e-05, "loss": 0.481, "num_input_tokens_seen": 6910784, "step": 5695 }, { "epoch": 0.6348145673237554, "grad_norm": 0.392190545797348, "learning_rate": 3.17351598173516e-05, "loss": 0.4541, "num_input_tokens_seen": 6916992, "step": 5700 }, { "epoch": 0.6353714222073727, "grad_norm": 0.6717867255210876, "learning_rate": 3.176300256153247e-05, "loss": 0.4669, "num_input_tokens_seen": 6922624, "step": 5705 }, { "epoch": 0.6359282770909901, "grad_norm": 0.7049024701118469, "learning_rate": 3.179084530571334e-05, "loss": 0.5067, "num_input_tokens_seen": 6928736, "step": 5710 }, { "epoch": 0.6364851319746074, "grad_norm": 0.5475661158561707, "learning_rate": 3.18186880498942e-05, "loss": 0.4872, "num_input_tokens_seen": 6935040, "step": 5715 }, { "epoch": 0.6370419868582248, "grad_norm": 0.6379671096801758, "learning_rate": 3.184653079407506e-05, "loss": 0.4687, "num_input_tokens_seen": 6941440, "step": 5720 }, { "epoch": 0.6375988417418421, "grad_norm": 0.7163261771202087, "learning_rate": 3.187437353825593e-05, "loss": 0.5117, "num_input_tokens_seen": 6946720, "step": 5725 }, { "epoch": 0.6381556966254595, "grad_norm": 0.47914737462997437, "learning_rate": 3.19022162824368e-05, "loss": 0.4967, "num_input_tokens_seen": 6952896, "step": 5730 }, { "epoch": 0.6387125515090767, "grad_norm": 0.639531135559082, "learning_rate": 3.193005902661767e-05, "loss": 0.4406, "num_input_tokens_seen": 6959232, "step": 5735 }, { "epoch": 0.639269406392694, "grad_norm": 0.45594051480293274, "learning_rate": 3.195790177079853e-05, "loss": 0.4809, "num_input_tokens_seen": 6965376, "step": 5740 }, { "epoch": 0.6398262612763114, "grad_norm": 0.5564161539077759, "learning_rate": 3.19857445149794e-05, "loss": 0.4938, "num_input_tokens_seen": 6971520, "step": 5745 }, { "epoch": 0.6403831161599287, "grad_norm": 0.5901573300361633, "learning_rate": 3.2013587259160266e-05, "loss": 0.4702, "num_input_tokens_seen": 6978112, "step": 5750 }, { "epoch": 0.640939971043546, "grad_norm": 0.5138410329818726, "learning_rate": 3.2041430003341135e-05, "loss": 0.5109, "num_input_tokens_seen": 6984256, "step": 5755 }, { "epoch": 0.6414968259271634, "grad_norm": 0.46005579829216003, "learning_rate": 3.2069272747522e-05, "loss": 0.5077, "num_input_tokens_seen": 6990368, "step": 5760 }, { "epoch": 0.6420536808107807, "grad_norm": 0.6073315739631653, "learning_rate": 3.209711549170286e-05, "loss": 0.5209, "num_input_tokens_seen": 6995904, "step": 5765 }, { "epoch": 0.6426105356943981, "grad_norm": 0.47792813181877136, "learning_rate": 3.212495823588373e-05, "loss": 0.4887, "num_input_tokens_seen": 7001824, "step": 5770 }, { "epoch": 0.6431673905780154, "grad_norm": 0.8111823797225952, "learning_rate": 3.2152800980064596e-05, "loss": 0.5036, "num_input_tokens_seen": 7007936, "step": 5775 }, { "epoch": 0.6437242454616326, "grad_norm": 0.4874756336212158, "learning_rate": 3.2180643724245465e-05, "loss": 0.439, "num_input_tokens_seen": 7013984, "step": 5780 }, { "epoch": 0.64428110034525, "grad_norm": 0.36671915650367737, "learning_rate": 3.2208486468426327e-05, "loss": 0.4765, "num_input_tokens_seen": 7020224, "step": 5785 }, { "epoch": 0.6448379552288673, "grad_norm": 0.4722338318824768, "learning_rate": 3.2236329212607195e-05, "loss": 0.4431, "num_input_tokens_seen": 7026240, "step": 5790 }, { "epoch": 0.6453948101124847, "grad_norm": 0.43509641289711, "learning_rate": 3.2264171956788064e-05, "loss": 0.4916, "num_input_tokens_seen": 7032576, "step": 5795 }, { "epoch": 0.645951664996102, "grad_norm": 0.3957189917564392, "learning_rate": 3.229201470096893e-05, "loss": 0.4784, "num_input_tokens_seen": 7038656, "step": 5800 }, { "epoch": 0.6465085198797194, "grad_norm": 0.4429645240306854, "learning_rate": 3.23198574451498e-05, "loss": 0.4848, "num_input_tokens_seen": 7044960, "step": 5805 }, { "epoch": 0.6470653747633367, "grad_norm": 0.4533993601799011, "learning_rate": 3.234770018933066e-05, "loss": 0.494, "num_input_tokens_seen": 7050944, "step": 5810 }, { "epoch": 0.647622229646954, "grad_norm": 0.6995505690574646, "learning_rate": 3.2375542933511525e-05, "loss": 0.4979, "num_input_tokens_seen": 7056896, "step": 5815 }, { "epoch": 0.6481790845305714, "grad_norm": 0.41163185238838196, "learning_rate": 3.2403385677692394e-05, "loss": 0.4872, "num_input_tokens_seen": 7063328, "step": 5820 }, { "epoch": 0.6487359394141886, "grad_norm": 0.46677225828170776, "learning_rate": 3.243122842187326e-05, "loss": 0.4778, "num_input_tokens_seen": 7069216, "step": 5825 }, { "epoch": 0.649292794297806, "grad_norm": 0.4193129539489746, "learning_rate": 3.245907116605413e-05, "loss": 0.5209, "num_input_tokens_seen": 7075264, "step": 5830 }, { "epoch": 0.6498496491814233, "grad_norm": 0.5026870369911194, "learning_rate": 3.248691391023499e-05, "loss": 0.5118, "num_input_tokens_seen": 7081312, "step": 5835 }, { "epoch": 0.6504065040650406, "grad_norm": 0.5737720131874084, "learning_rate": 3.251475665441586e-05, "loss": 0.5137, "num_input_tokens_seen": 7087744, "step": 5840 }, { "epoch": 0.650963358948658, "grad_norm": 0.46333277225494385, "learning_rate": 3.254259939859673e-05, "loss": 0.4653, "num_input_tokens_seen": 7093632, "step": 5845 }, { "epoch": 0.6515202138322753, "grad_norm": 0.49167299270629883, "learning_rate": 3.25704421427776e-05, "loss": 0.4693, "num_input_tokens_seen": 7099808, "step": 5850 }, { "epoch": 0.6520770687158927, "grad_norm": 0.4445135295391083, "learning_rate": 3.259828488695846e-05, "loss": 0.4912, "num_input_tokens_seen": 7105888, "step": 5855 }, { "epoch": 0.65263392359951, "grad_norm": 0.30020928382873535, "learning_rate": 3.262612763113932e-05, "loss": 0.4764, "num_input_tokens_seen": 7112000, "step": 5860 }, { "epoch": 0.6531907784831273, "grad_norm": 0.5113039612770081, "learning_rate": 3.265397037532019e-05, "loss": 0.4751, "num_input_tokens_seen": 7118176, "step": 5865 }, { "epoch": 0.6537476333667446, "grad_norm": 0.5323472023010254, "learning_rate": 3.268181311950106e-05, "loss": 0.479, "num_input_tokens_seen": 7124320, "step": 5870 }, { "epoch": 0.6543044882503619, "grad_norm": 0.5574404001235962, "learning_rate": 3.270965586368193e-05, "loss": 0.4986, "num_input_tokens_seen": 7130528, "step": 5875 }, { "epoch": 0.6548613431339793, "grad_norm": 0.4293875992298126, "learning_rate": 3.273749860786279e-05, "loss": 0.4652, "num_input_tokens_seen": 7136512, "step": 5880 }, { "epoch": 0.6554181980175966, "grad_norm": 0.47468897700309753, "learning_rate": 3.276534135204366e-05, "loss": 0.5152, "num_input_tokens_seen": 7142624, "step": 5885 }, { "epoch": 0.6559750529012139, "grad_norm": 0.6218559145927429, "learning_rate": 3.279318409622453e-05, "loss": 0.471, "num_input_tokens_seen": 7148896, "step": 5890 }, { "epoch": 0.6565319077848313, "grad_norm": 0.43916040658950806, "learning_rate": 3.2821026840405396e-05, "loss": 0.5341, "num_input_tokens_seen": 7154784, "step": 5895 }, { "epoch": 0.6570887626684486, "grad_norm": 0.4843903183937073, "learning_rate": 3.284886958458626e-05, "loss": 0.4635, "num_input_tokens_seen": 7161088, "step": 5900 }, { "epoch": 0.657645617552066, "grad_norm": 0.4685269892215729, "learning_rate": 3.287671232876712e-05, "loss": 0.4952, "num_input_tokens_seen": 7167232, "step": 5905 }, { "epoch": 0.6582024724356833, "grad_norm": 0.4082203805446625, "learning_rate": 3.290455507294799e-05, "loss": 0.4872, "num_input_tokens_seen": 7173376, "step": 5910 }, { "epoch": 0.6587593273193005, "grad_norm": 0.3428295850753784, "learning_rate": 3.293239781712886e-05, "loss": 0.4931, "num_input_tokens_seen": 7179488, "step": 5915 }, { "epoch": 0.6593161822029179, "grad_norm": 0.4586571156978607, "learning_rate": 3.2960240561309726e-05, "loss": 0.4594, "num_input_tokens_seen": 7185856, "step": 5920 }, { "epoch": 0.6598730370865352, "grad_norm": 0.35622265934944153, "learning_rate": 3.298808330549059e-05, "loss": 0.4703, "num_input_tokens_seen": 7192288, "step": 5925 }, { "epoch": 0.6604298919701526, "grad_norm": 0.36295631527900696, "learning_rate": 3.301592604967146e-05, "loss": 0.4937, "num_input_tokens_seen": 7198400, "step": 5930 }, { "epoch": 0.6609867468537699, "grad_norm": 0.5201131701469421, "learning_rate": 3.3043768793852325e-05, "loss": 0.477, "num_input_tokens_seen": 7204576, "step": 5935 }, { "epoch": 0.6615436017373872, "grad_norm": 0.4472949504852295, "learning_rate": 3.3071611538033194e-05, "loss": 0.51, "num_input_tokens_seen": 7211040, "step": 5940 }, { "epoch": 0.6621004566210046, "grad_norm": 0.4364466667175293, "learning_rate": 3.3099454282214056e-05, "loss": 0.4885, "num_input_tokens_seen": 7216928, "step": 5945 }, { "epoch": 0.6626573115046219, "grad_norm": 0.4713364243507385, "learning_rate": 3.3127297026394924e-05, "loss": 0.4807, "num_input_tokens_seen": 7223168, "step": 5950 }, { "epoch": 0.6632141663882393, "grad_norm": 0.4989962577819824, "learning_rate": 3.3155139770575786e-05, "loss": 0.5047, "num_input_tokens_seen": 7229056, "step": 5955 }, { "epoch": 0.6637710212718566, "grad_norm": 0.2891460955142975, "learning_rate": 3.3182982514756655e-05, "loss": 0.4771, "num_input_tokens_seen": 7234464, "step": 5960 }, { "epoch": 0.6643278761554738, "grad_norm": 0.6408382058143616, "learning_rate": 3.3210825258937524e-05, "loss": 0.4675, "num_input_tokens_seen": 7240448, "step": 5965 }, { "epoch": 0.6648847310390912, "grad_norm": 0.3429107666015625, "learning_rate": 3.323866800311839e-05, "loss": 0.4745, "num_input_tokens_seen": 7246464, "step": 5970 }, { "epoch": 0.6654415859227085, "grad_norm": 0.5969050526618958, "learning_rate": 3.3266510747299254e-05, "loss": 0.4769, "num_input_tokens_seen": 7252768, "step": 5975 }, { "epoch": 0.6659984408063259, "grad_norm": 0.42423900961875916, "learning_rate": 3.329435349148012e-05, "loss": 0.4941, "num_input_tokens_seen": 7258976, "step": 5980 }, { "epoch": 0.6665552956899432, "grad_norm": 0.41733041405677795, "learning_rate": 3.332219623566099e-05, "loss": 0.4467, "num_input_tokens_seen": 7265312, "step": 5985 }, { "epoch": 0.6671121505735605, "grad_norm": 0.5016948580741882, "learning_rate": 3.3350038979841853e-05, "loss": 0.506, "num_input_tokens_seen": 7271552, "step": 5990 }, { "epoch": 0.6676690054571779, "grad_norm": 0.3964536786079407, "learning_rate": 3.337788172402272e-05, "loss": 0.4761, "num_input_tokens_seen": 7277664, "step": 5995 }, { "epoch": 0.6682258603407952, "grad_norm": 0.3934280276298523, "learning_rate": 3.3405724468203584e-05, "loss": 0.4771, "num_input_tokens_seen": 7283936, "step": 6000 }, { "epoch": 0.6687827152244126, "grad_norm": 0.44122251868247986, "learning_rate": 3.343356721238445e-05, "loss": 0.4987, "num_input_tokens_seen": 7290016, "step": 6005 }, { "epoch": 0.6693395701080298, "grad_norm": 0.4452816843986511, "learning_rate": 3.346140995656532e-05, "loss": 0.5057, "num_input_tokens_seen": 7296480, "step": 6010 }, { "epoch": 0.6698964249916471, "grad_norm": 0.59247887134552, "learning_rate": 3.348925270074619e-05, "loss": 0.4795, "num_input_tokens_seen": 7302976, "step": 6015 }, { "epoch": 0.6704532798752645, "grad_norm": 0.4532551169395447, "learning_rate": 3.351709544492705e-05, "loss": 0.5402, "num_input_tokens_seen": 7308992, "step": 6020 }, { "epoch": 0.6710101347588818, "grad_norm": 0.47741371393203735, "learning_rate": 3.354493818910792e-05, "loss": 0.4864, "num_input_tokens_seen": 7314880, "step": 6025 }, { "epoch": 0.6715669896424992, "grad_norm": 0.549946665763855, "learning_rate": 3.357278093328879e-05, "loss": 0.4707, "num_input_tokens_seen": 7320864, "step": 6030 }, { "epoch": 0.6721238445261165, "grad_norm": 0.4912715554237366, "learning_rate": 3.360062367746965e-05, "loss": 0.4901, "num_input_tokens_seen": 7327200, "step": 6035 }, { "epoch": 0.6726806994097339, "grad_norm": 0.5600190162658691, "learning_rate": 3.362846642165052e-05, "loss": 0.5103, "num_input_tokens_seen": 7333344, "step": 6040 }, { "epoch": 0.6732375542933512, "grad_norm": 0.6754578948020935, "learning_rate": 3.365630916583138e-05, "loss": 0.505, "num_input_tokens_seen": 7339456, "step": 6045 }, { "epoch": 0.6737944091769685, "grad_norm": 0.5847668647766113, "learning_rate": 3.368415191001225e-05, "loss": 0.4809, "num_input_tokens_seen": 7345728, "step": 6050 }, { "epoch": 0.6743512640605858, "grad_norm": 0.5209317803382874, "learning_rate": 3.371199465419312e-05, "loss": 0.5261, "num_input_tokens_seen": 7351680, "step": 6055 }, { "epoch": 0.6749081189442031, "grad_norm": 0.422370046377182, "learning_rate": 3.373983739837399e-05, "loss": 0.4738, "num_input_tokens_seen": 7357760, "step": 6060 }, { "epoch": 0.6754649738278204, "grad_norm": 0.4241487383842468, "learning_rate": 3.3767680142554856e-05, "loss": 0.4532, "num_input_tokens_seen": 7363968, "step": 6065 }, { "epoch": 0.6760218287114378, "grad_norm": 0.6256452202796936, "learning_rate": 3.379552288673572e-05, "loss": 0.4772, "num_input_tokens_seen": 7370080, "step": 6070 }, { "epoch": 0.6765786835950551, "grad_norm": 0.5198910236358643, "learning_rate": 3.382336563091659e-05, "loss": 0.4555, "num_input_tokens_seen": 7376288, "step": 6075 }, { "epoch": 0.6771355384786725, "grad_norm": 0.4373447895050049, "learning_rate": 3.385120837509745e-05, "loss": 0.5009, "num_input_tokens_seen": 7382400, "step": 6080 }, { "epoch": 0.6776923933622898, "grad_norm": 0.4385882318019867, "learning_rate": 3.387905111927832e-05, "loss": 0.4945, "num_input_tokens_seen": 7388512, "step": 6085 }, { "epoch": 0.6782492482459072, "grad_norm": 0.3378819227218628, "learning_rate": 3.3906893863459186e-05, "loss": 0.4669, "num_input_tokens_seen": 7394464, "step": 6090 }, { "epoch": 0.6788061031295245, "grad_norm": 0.9402928352355957, "learning_rate": 3.393473660764005e-05, "loss": 0.4871, "num_input_tokens_seen": 7400576, "step": 6095 }, { "epoch": 0.6793629580131417, "grad_norm": 0.3469609022140503, "learning_rate": 3.3962579351820916e-05, "loss": 0.4832, "num_input_tokens_seen": 7406528, "step": 6100 }, { "epoch": 0.6799198128967591, "grad_norm": 0.40196144580841064, "learning_rate": 3.3990422096001785e-05, "loss": 0.4646, "num_input_tokens_seen": 7412768, "step": 6105 }, { "epoch": 0.6804766677803764, "grad_norm": 0.5892338156700134, "learning_rate": 3.4018264840182654e-05, "loss": 0.4917, "num_input_tokens_seen": 7418944, "step": 6110 }, { "epoch": 0.6810335226639938, "grad_norm": 0.3937591314315796, "learning_rate": 3.4046107584363516e-05, "loss": 0.4879, "num_input_tokens_seen": 7424896, "step": 6115 }, { "epoch": 0.6815903775476111, "grad_norm": 0.43949294090270996, "learning_rate": 3.4073950328544384e-05, "loss": 0.4797, "num_input_tokens_seen": 7430720, "step": 6120 }, { "epoch": 0.6821472324312284, "grad_norm": 0.44785258173942566, "learning_rate": 3.410179307272525e-05, "loss": 0.4639, "num_input_tokens_seen": 7436704, "step": 6125 }, { "epoch": 0.6827040873148458, "grad_norm": 0.40081408619880676, "learning_rate": 3.4129635816906115e-05, "loss": 0.4568, "num_input_tokens_seen": 7442880, "step": 6130 }, { "epoch": 0.6832609421984631, "grad_norm": 0.3770079016685486, "learning_rate": 3.4157478561086983e-05, "loss": 0.4613, "num_input_tokens_seen": 7448640, "step": 6135 }, { "epoch": 0.6838177970820805, "grad_norm": 0.46453166007995605, "learning_rate": 3.4185321305267845e-05, "loss": 0.5112, "num_input_tokens_seen": 7454656, "step": 6140 }, { "epoch": 0.6843746519656977, "grad_norm": 0.5602030158042908, "learning_rate": 3.4213164049448714e-05, "loss": 0.4452, "num_input_tokens_seen": 7460736, "step": 6145 }, { "epoch": 0.684931506849315, "grad_norm": 0.4836292266845703, "learning_rate": 3.424100679362958e-05, "loss": 0.4684, "num_input_tokens_seen": 7466816, "step": 6150 }, { "epoch": 0.6854883617329324, "grad_norm": 0.3592773377895355, "learning_rate": 3.426884953781045e-05, "loss": 0.5092, "num_input_tokens_seen": 7472928, "step": 6155 }, { "epoch": 0.6860452166165497, "grad_norm": 0.561394214630127, "learning_rate": 3.429669228199131e-05, "loss": 0.4963, "num_input_tokens_seen": 7478496, "step": 6160 }, { "epoch": 0.686602071500167, "grad_norm": 0.5368025302886963, "learning_rate": 3.432453502617218e-05, "loss": 0.5028, "num_input_tokens_seen": 7484768, "step": 6165 }, { "epoch": 0.6871589263837844, "grad_norm": 0.6478268504142761, "learning_rate": 3.435237777035305e-05, "loss": 0.4569, "num_input_tokens_seen": 7490880, "step": 6170 }, { "epoch": 0.6877157812674017, "grad_norm": 0.46223747730255127, "learning_rate": 3.438022051453391e-05, "loss": 0.4801, "num_input_tokens_seen": 7496352, "step": 6175 }, { "epoch": 0.6882726361510191, "grad_norm": 0.42270877957344055, "learning_rate": 3.440806325871478e-05, "loss": 0.4758, "num_input_tokens_seen": 7502400, "step": 6180 }, { "epoch": 0.6888294910346364, "grad_norm": 0.396804541349411, "learning_rate": 3.443590600289564e-05, "loss": 0.4898, "num_input_tokens_seen": 7508544, "step": 6185 }, { "epoch": 0.6893863459182537, "grad_norm": 0.4305221736431122, "learning_rate": 3.446374874707651e-05, "loss": 0.5264, "num_input_tokens_seen": 7514784, "step": 6190 }, { "epoch": 0.689943200801871, "grad_norm": 0.3582288324832916, "learning_rate": 3.449159149125738e-05, "loss": 0.4809, "num_input_tokens_seen": 7520832, "step": 6195 }, { "epoch": 0.6905000556854883, "grad_norm": 0.6200929880142212, "learning_rate": 3.451943423543825e-05, "loss": 0.4852, "num_input_tokens_seen": 7527040, "step": 6200 }, { "epoch": 0.6910569105691057, "grad_norm": 0.3416684567928314, "learning_rate": 3.454727697961912e-05, "loss": 0.5263, "num_input_tokens_seen": 7533632, "step": 6205 }, { "epoch": 0.691613765452723, "grad_norm": 0.42645683884620667, "learning_rate": 3.457511972379998e-05, "loss": 0.4805, "num_input_tokens_seen": 7539776, "step": 6210 }, { "epoch": 0.6921706203363404, "grad_norm": 0.3375701606273651, "learning_rate": 3.460296246798085e-05, "loss": 0.4908, "num_input_tokens_seen": 7545824, "step": 6215 }, { "epoch": 0.6927274752199577, "grad_norm": 0.36162903904914856, "learning_rate": 3.463080521216171e-05, "loss": 0.4726, "num_input_tokens_seen": 7552096, "step": 6220 }, { "epoch": 0.693284330103575, "grad_norm": 0.4820559322834015, "learning_rate": 3.465864795634258e-05, "loss": 0.5138, "num_input_tokens_seen": 7557888, "step": 6225 }, { "epoch": 0.6938411849871924, "grad_norm": 0.5019935369491577, "learning_rate": 3.468649070052345e-05, "loss": 0.4714, "num_input_tokens_seen": 7563840, "step": 6230 }, { "epoch": 0.6943980398708096, "grad_norm": 0.4821746349334717, "learning_rate": 3.471433344470431e-05, "loss": 0.4945, "num_input_tokens_seen": 7569888, "step": 6235 }, { "epoch": 0.694954894754427, "grad_norm": 0.44794097542762756, "learning_rate": 3.474217618888518e-05, "loss": 0.4856, "num_input_tokens_seen": 7576160, "step": 6240 }, { "epoch": 0.6955117496380443, "grad_norm": 0.5432856678962708, "learning_rate": 3.4770018933066046e-05, "loss": 0.4907, "num_input_tokens_seen": 7581440, "step": 6245 }, { "epoch": 0.6960686045216616, "grad_norm": 0.45272910594940186, "learning_rate": 3.4797861677246915e-05, "loss": 0.4992, "num_input_tokens_seen": 7587776, "step": 6250 }, { "epoch": 0.696625459405279, "grad_norm": 0.342334508895874, "learning_rate": 3.482570442142778e-05, "loss": 0.4833, "num_input_tokens_seen": 7594272, "step": 6255 }, { "epoch": 0.6971823142888963, "grad_norm": 0.3708665072917938, "learning_rate": 3.4853547165608646e-05, "loss": 0.5156, "num_input_tokens_seen": 7600736, "step": 6260 }, { "epoch": 0.6977391691725137, "grad_norm": 0.41102874279022217, "learning_rate": 3.488138990978951e-05, "loss": 0.4769, "num_input_tokens_seen": 7606784, "step": 6265 }, { "epoch": 0.698296024056131, "grad_norm": 0.5397707223892212, "learning_rate": 3.4909232653970376e-05, "loss": 0.491, "num_input_tokens_seen": 7613088, "step": 6270 }, { "epoch": 0.6988528789397483, "grad_norm": 0.433945894241333, "learning_rate": 3.4937075398151245e-05, "loss": 0.4925, "num_input_tokens_seen": 7619520, "step": 6275 }, { "epoch": 0.6994097338233656, "grad_norm": 0.46356844902038574, "learning_rate": 3.496491814233211e-05, "loss": 0.46, "num_input_tokens_seen": 7625824, "step": 6280 }, { "epoch": 0.6999665887069829, "grad_norm": 0.5555761456489563, "learning_rate": 3.4992760886512975e-05, "loss": 0.4669, "num_input_tokens_seen": 7632064, "step": 6285 }, { "epoch": 0.7005234435906003, "grad_norm": 0.48775714635849, "learning_rate": 3.5020603630693844e-05, "loss": 0.4845, "num_input_tokens_seen": 7638400, "step": 6290 }, { "epoch": 0.7010802984742176, "grad_norm": 0.32190659642219543, "learning_rate": 3.504844637487471e-05, "loss": 0.4516, "num_input_tokens_seen": 7644480, "step": 6295 }, { "epoch": 0.7016371533578349, "grad_norm": 0.29630690813064575, "learning_rate": 3.5076289119055575e-05, "loss": 0.4954, "num_input_tokens_seen": 7650208, "step": 6300 }, { "epoch": 0.7021940082414523, "grad_norm": 0.5741602778434753, "learning_rate": 3.510413186323644e-05, "loss": 0.4994, "num_input_tokens_seen": 7656128, "step": 6305 }, { "epoch": 0.7027508631250696, "grad_norm": 0.5631380677223206, "learning_rate": 3.5131974607417305e-05, "loss": 0.4979, "num_input_tokens_seen": 7662112, "step": 6310 }, { "epoch": 0.703307718008687, "grad_norm": 0.5273990035057068, "learning_rate": 3.5159817351598174e-05, "loss": 0.4789, "num_input_tokens_seen": 7668352, "step": 6315 }, { "epoch": 0.7038645728923043, "grad_norm": 0.4377085864543915, "learning_rate": 3.518766009577904e-05, "loss": 0.4892, "num_input_tokens_seen": 7674080, "step": 6320 }, { "epoch": 0.7044214277759216, "grad_norm": 0.4591693878173828, "learning_rate": 3.5215502839959904e-05, "loss": 0.49, "num_input_tokens_seen": 7680384, "step": 6325 }, { "epoch": 0.7049782826595389, "grad_norm": 0.5114748477935791, "learning_rate": 3.524334558414077e-05, "loss": 0.5166, "num_input_tokens_seen": 7686528, "step": 6330 }, { "epoch": 0.7055351375431562, "grad_norm": 0.36136317253112793, "learning_rate": 3.527118832832164e-05, "loss": 0.4771, "num_input_tokens_seen": 7692544, "step": 6335 }, { "epoch": 0.7060919924267736, "grad_norm": 0.45111435651779175, "learning_rate": 3.529903107250251e-05, "loss": 0.4652, "num_input_tokens_seen": 7698816, "step": 6340 }, { "epoch": 0.7066488473103909, "grad_norm": 0.6487326622009277, "learning_rate": 3.532687381668338e-05, "loss": 0.4763, "num_input_tokens_seen": 7704992, "step": 6345 }, { "epoch": 0.7072057021940082, "grad_norm": 0.4597870111465454, "learning_rate": 3.535471656086424e-05, "loss": 0.491, "num_input_tokens_seen": 7711296, "step": 6350 }, { "epoch": 0.7077625570776256, "grad_norm": 0.3667698800563812, "learning_rate": 3.53825593050451e-05, "loss": 0.4653, "num_input_tokens_seen": 7717632, "step": 6355 }, { "epoch": 0.7083194119612429, "grad_norm": 0.37333807349205017, "learning_rate": 3.541040204922597e-05, "loss": 0.474, "num_input_tokens_seen": 7723808, "step": 6360 }, { "epoch": 0.7088762668448603, "grad_norm": 0.47278842329978943, "learning_rate": 3.543824479340684e-05, "loss": 0.4619, "num_input_tokens_seen": 7729984, "step": 6365 }, { "epoch": 0.7094331217284776, "grad_norm": 0.46078628301620483, "learning_rate": 3.546608753758771e-05, "loss": 0.4682, "num_input_tokens_seen": 7736000, "step": 6370 }, { "epoch": 0.7099899766120948, "grad_norm": 0.47993212938308716, "learning_rate": 3.549393028176857e-05, "loss": 0.4644, "num_input_tokens_seen": 7742048, "step": 6375 }, { "epoch": 0.7105468314957122, "grad_norm": 0.6709398627281189, "learning_rate": 3.552177302594944e-05, "loss": 0.4774, "num_input_tokens_seen": 7748192, "step": 6380 }, { "epoch": 0.7111036863793295, "grad_norm": 0.34863874316215515, "learning_rate": 3.554961577013031e-05, "loss": 0.4708, "num_input_tokens_seen": 7754208, "step": 6385 }, { "epoch": 0.7116605412629469, "grad_norm": 0.4314553737640381, "learning_rate": 3.5577458514311177e-05, "loss": 0.5125, "num_input_tokens_seen": 7760224, "step": 6390 }, { "epoch": 0.7122173961465642, "grad_norm": 0.4013245403766632, "learning_rate": 3.560530125849204e-05, "loss": 0.4739, "num_input_tokens_seen": 7765952, "step": 6395 }, { "epoch": 0.7127742510301815, "grad_norm": 0.5252406001091003, "learning_rate": 3.56331440026729e-05, "loss": 0.5213, "num_input_tokens_seen": 7772032, "step": 6400 }, { "epoch": 0.7133311059137989, "grad_norm": 0.4259035289287567, "learning_rate": 3.566098674685377e-05, "loss": 0.4959, "num_input_tokens_seen": 7777952, "step": 6405 }, { "epoch": 0.7138879607974162, "grad_norm": 0.4437812566757202, "learning_rate": 3.568882949103464e-05, "loss": 0.4727, "num_input_tokens_seen": 7784352, "step": 6410 }, { "epoch": 0.7144448156810336, "grad_norm": 0.3786076307296753, "learning_rate": 3.5716672235215506e-05, "loss": 0.4778, "num_input_tokens_seen": 7790656, "step": 6415 }, { "epoch": 0.7150016705646508, "grad_norm": 0.41185805201530457, "learning_rate": 3.574451497939637e-05, "loss": 0.4899, "num_input_tokens_seen": 7797024, "step": 6420 }, { "epoch": 0.7155585254482681, "grad_norm": 0.387393593788147, "learning_rate": 3.577235772357724e-05, "loss": 0.4915, "num_input_tokens_seen": 7803232, "step": 6425 }, { "epoch": 0.7161153803318855, "grad_norm": 0.4907417595386505, "learning_rate": 3.5800200467758105e-05, "loss": 0.5027, "num_input_tokens_seen": 7809792, "step": 6430 }, { "epoch": 0.7166722352155028, "grad_norm": 0.42760398983955383, "learning_rate": 3.5828043211938974e-05, "loss": 0.4668, "num_input_tokens_seen": 7816064, "step": 6435 }, { "epoch": 0.7172290900991202, "grad_norm": 0.46253612637519836, "learning_rate": 3.5855885956119836e-05, "loss": 0.4642, "num_input_tokens_seen": 7822144, "step": 6440 }, { "epoch": 0.7177859449827375, "grad_norm": 0.4666707515716553, "learning_rate": 3.5883728700300705e-05, "loss": 0.5035, "num_input_tokens_seen": 7828064, "step": 6445 }, { "epoch": 0.7183427998663549, "grad_norm": 0.4143347442150116, "learning_rate": 3.5911571444481567e-05, "loss": 0.4951, "num_input_tokens_seen": 7834336, "step": 6450 }, { "epoch": 0.7188996547499722, "grad_norm": 0.5306417346000671, "learning_rate": 3.5939414188662435e-05, "loss": 0.455, "num_input_tokens_seen": 7840448, "step": 6455 }, { "epoch": 0.7194565096335895, "grad_norm": 0.46760958433151245, "learning_rate": 3.5967256932843304e-05, "loss": 0.4701, "num_input_tokens_seen": 7847040, "step": 6460 }, { "epoch": 0.7200133645172068, "grad_norm": 0.4581376314163208, "learning_rate": 3.5995099677024166e-05, "loss": 0.4813, "num_input_tokens_seen": 7853120, "step": 6465 }, { "epoch": 0.7205702194008241, "grad_norm": 0.33825448155403137, "learning_rate": 3.6022942421205034e-05, "loss": 0.5139, "num_input_tokens_seen": 7859296, "step": 6470 }, { "epoch": 0.7211270742844414, "grad_norm": 0.7647358775138855, "learning_rate": 3.60507851653859e-05, "loss": 0.4847, "num_input_tokens_seen": 7865568, "step": 6475 }, { "epoch": 0.7216839291680588, "grad_norm": 0.3286718428134918, "learning_rate": 3.607862790956677e-05, "loss": 0.4738, "num_input_tokens_seen": 7871552, "step": 6480 }, { "epoch": 0.7222407840516761, "grad_norm": 0.40965306758880615, "learning_rate": 3.610647065374764e-05, "loss": 0.4578, "num_input_tokens_seen": 7877920, "step": 6485 }, { "epoch": 0.7227976389352935, "grad_norm": 0.32923024892807007, "learning_rate": 3.61343133979285e-05, "loss": 0.4659, "num_input_tokens_seen": 7884352, "step": 6490 }, { "epoch": 0.7233544938189108, "grad_norm": 0.4202875792980194, "learning_rate": 3.6162156142109364e-05, "loss": 0.4867, "num_input_tokens_seen": 7890496, "step": 6495 }, { "epoch": 0.7239113487025282, "grad_norm": 0.3361358344554901, "learning_rate": 3.618999888629023e-05, "loss": 0.4981, "num_input_tokens_seen": 7896288, "step": 6500 }, { "epoch": 0.7244682035861455, "grad_norm": 0.39253273606300354, "learning_rate": 3.62178416304711e-05, "loss": 0.5028, "num_input_tokens_seen": 7902272, "step": 6505 }, { "epoch": 0.7250250584697627, "grad_norm": 0.44577351212501526, "learning_rate": 3.624568437465197e-05, "loss": 0.4884, "num_input_tokens_seen": 7908128, "step": 6510 }, { "epoch": 0.7255819133533801, "grad_norm": 0.6313996315002441, "learning_rate": 3.627352711883283e-05, "loss": 0.478, "num_input_tokens_seen": 7914336, "step": 6515 }, { "epoch": 0.7261387682369974, "grad_norm": 0.754777729511261, "learning_rate": 3.63013698630137e-05, "loss": 0.5033, "num_input_tokens_seen": 7920448, "step": 6520 }, { "epoch": 0.7266956231206148, "grad_norm": 0.4248761832714081, "learning_rate": 3.632921260719457e-05, "loss": 0.4495, "num_input_tokens_seen": 7926272, "step": 6525 }, { "epoch": 0.7272524780042321, "grad_norm": 0.4537213444709778, "learning_rate": 3.635705535137544e-05, "loss": 0.4931, "num_input_tokens_seen": 7932384, "step": 6530 }, { "epoch": 0.7278093328878494, "grad_norm": 0.5640491247177124, "learning_rate": 3.63848980955563e-05, "loss": 0.4676, "num_input_tokens_seen": 7938592, "step": 6535 }, { "epoch": 0.7283661877714668, "grad_norm": 0.5909960865974426, "learning_rate": 3.641274083973716e-05, "loss": 0.507, "num_input_tokens_seen": 7944448, "step": 6540 }, { "epoch": 0.7289230426550841, "grad_norm": 0.4952598214149475, "learning_rate": 3.644058358391803e-05, "loss": 0.482, "num_input_tokens_seen": 7950592, "step": 6545 }, { "epoch": 0.7294798975387015, "grad_norm": 0.6037720441818237, "learning_rate": 3.64684263280989e-05, "loss": 0.4914, "num_input_tokens_seen": 7957056, "step": 6550 }, { "epoch": 0.7300367524223187, "grad_norm": 0.46267184615135193, "learning_rate": 3.649626907227977e-05, "loss": 0.4603, "num_input_tokens_seen": 7963360, "step": 6555 }, { "epoch": 0.730593607305936, "grad_norm": 0.4012250304222107, "learning_rate": 3.652411181646063e-05, "loss": 0.4746, "num_input_tokens_seen": 7969600, "step": 6560 }, { "epoch": 0.7311504621895534, "grad_norm": 0.40991494059562683, "learning_rate": 3.65519545606415e-05, "loss": 0.4759, "num_input_tokens_seen": 7975872, "step": 6565 }, { "epoch": 0.7317073170731707, "grad_norm": 0.404439240694046, "learning_rate": 3.657979730482237e-05, "loss": 0.4719, "num_input_tokens_seen": 7982112, "step": 6570 }, { "epoch": 0.7322641719567881, "grad_norm": 0.3421826958656311, "learning_rate": 3.6607640049003236e-05, "loss": 0.495, "num_input_tokens_seen": 7988288, "step": 6575 }, { "epoch": 0.7328210268404054, "grad_norm": 0.41975247859954834, "learning_rate": 3.66354827931841e-05, "loss": 0.4819, "num_input_tokens_seen": 7994304, "step": 6580 }, { "epoch": 0.7333778817240227, "grad_norm": 0.4193038046360016, "learning_rate": 3.666332553736496e-05, "loss": 0.4701, "num_input_tokens_seen": 8000384, "step": 6585 }, { "epoch": 0.7339347366076401, "grad_norm": 0.47007402777671814, "learning_rate": 3.669116828154583e-05, "loss": 0.4864, "num_input_tokens_seen": 8006432, "step": 6590 }, { "epoch": 0.7344915914912574, "grad_norm": 0.37846997380256653, "learning_rate": 3.67190110257267e-05, "loss": 0.4831, "num_input_tokens_seen": 8012512, "step": 6595 }, { "epoch": 0.7350484463748747, "grad_norm": 0.546959638595581, "learning_rate": 3.6746853769907565e-05, "loss": 0.474, "num_input_tokens_seen": 8018624, "step": 6600 }, { "epoch": 0.735605301258492, "grad_norm": 0.337039053440094, "learning_rate": 3.677469651408843e-05, "loss": 0.4674, "num_input_tokens_seen": 8024704, "step": 6605 }, { "epoch": 0.7361621561421093, "grad_norm": 0.3519502878189087, "learning_rate": 3.6802539258269296e-05, "loss": 0.4753, "num_input_tokens_seen": 8030592, "step": 6610 }, { "epoch": 0.7367190110257267, "grad_norm": 0.4889698326587677, "learning_rate": 3.6830382002450164e-05, "loss": 0.4912, "num_input_tokens_seen": 8036576, "step": 6615 }, { "epoch": 0.737275865909344, "grad_norm": 0.3322230875492096, "learning_rate": 3.685822474663103e-05, "loss": 0.4614, "num_input_tokens_seen": 8042912, "step": 6620 }, { "epoch": 0.7378327207929614, "grad_norm": 0.4933854937553406, "learning_rate": 3.68860674908119e-05, "loss": 0.4678, "num_input_tokens_seen": 8048864, "step": 6625 }, { "epoch": 0.7383895756765787, "grad_norm": 0.3555624783039093, "learning_rate": 3.691391023499276e-05, "loss": 0.4829, "num_input_tokens_seen": 8054272, "step": 6630 }, { "epoch": 0.738946430560196, "grad_norm": 0.38172152638435364, "learning_rate": 3.6941752979173626e-05, "loss": 0.5242, "num_input_tokens_seen": 8060448, "step": 6635 }, { "epoch": 0.7395032854438134, "grad_norm": 0.3349594175815582, "learning_rate": 3.6969595723354494e-05, "loss": 0.483, "num_input_tokens_seen": 8066624, "step": 6640 }, { "epoch": 0.7400601403274307, "grad_norm": 0.5012108683586121, "learning_rate": 3.699743846753536e-05, "loss": 0.4867, "num_input_tokens_seen": 8072704, "step": 6645 }, { "epoch": 0.740616995211048, "grad_norm": 0.4571806788444519, "learning_rate": 3.702528121171623e-05, "loss": 0.462, "num_input_tokens_seen": 8079072, "step": 6650 }, { "epoch": 0.7411738500946653, "grad_norm": 0.3638306260108948, "learning_rate": 3.7053123955897093e-05, "loss": 0.4599, "num_input_tokens_seen": 8084736, "step": 6655 }, { "epoch": 0.7417307049782826, "grad_norm": 0.3241967558860779, "learning_rate": 3.708096670007796e-05, "loss": 0.4473, "num_input_tokens_seen": 8090624, "step": 6660 }, { "epoch": 0.7422875598619, "grad_norm": 0.4500578045845032, "learning_rate": 3.710880944425883e-05, "loss": 0.4605, "num_input_tokens_seen": 8096576, "step": 6665 }, { "epoch": 0.7428444147455173, "grad_norm": 0.3870372176170349, "learning_rate": 3.71366521884397e-05, "loss": 0.4842, "num_input_tokens_seen": 8102848, "step": 6670 }, { "epoch": 0.7434012696291347, "grad_norm": 0.3658025562763214, "learning_rate": 3.716449493262056e-05, "loss": 0.4713, "num_input_tokens_seen": 8108992, "step": 6675 }, { "epoch": 0.743958124512752, "grad_norm": 0.5773656964302063, "learning_rate": 3.719233767680142e-05, "loss": 0.4748, "num_input_tokens_seen": 8115456, "step": 6680 }, { "epoch": 0.7445149793963693, "grad_norm": 0.527798056602478, "learning_rate": 3.722018042098229e-05, "loss": 0.5138, "num_input_tokens_seen": 8121728, "step": 6685 }, { "epoch": 0.7450718342799867, "grad_norm": 0.3840281665325165, "learning_rate": 3.724802316516316e-05, "loss": 0.4687, "num_input_tokens_seen": 8127168, "step": 6690 }, { "epoch": 0.7456286891636039, "grad_norm": 0.7424427270889282, "learning_rate": 3.727586590934403e-05, "loss": 0.5142, "num_input_tokens_seen": 8133152, "step": 6695 }, { "epoch": 0.7461855440472213, "grad_norm": 0.5258194208145142, "learning_rate": 3.730370865352489e-05, "loss": 0.456, "num_input_tokens_seen": 8139488, "step": 6700 }, { "epoch": 0.7467423989308386, "grad_norm": 0.4583008885383606, "learning_rate": 3.733155139770576e-05, "loss": 0.5211, "num_input_tokens_seen": 8145696, "step": 6705 }, { "epoch": 0.7472992538144559, "grad_norm": 0.48811036348342896, "learning_rate": 3.735939414188663e-05, "loss": 0.4616, "num_input_tokens_seen": 8151648, "step": 6710 }, { "epoch": 0.7478561086980733, "grad_norm": 0.4325175881385803, "learning_rate": 3.73872368860675e-05, "loss": 0.4772, "num_input_tokens_seen": 8158112, "step": 6715 }, { "epoch": 0.7484129635816906, "grad_norm": 0.4062194526195526, "learning_rate": 3.741507963024836e-05, "loss": 0.4777, "num_input_tokens_seen": 8163936, "step": 6720 }, { "epoch": 0.748969818465308, "grad_norm": 0.5079994797706604, "learning_rate": 3.744292237442922e-05, "loss": 0.4478, "num_input_tokens_seen": 8170144, "step": 6725 }, { "epoch": 0.7495266733489253, "grad_norm": 0.36339130997657776, "learning_rate": 3.747076511861009e-05, "loss": 0.4964, "num_input_tokens_seen": 8176256, "step": 6730 }, { "epoch": 0.7500835282325427, "grad_norm": 0.6219043135643005, "learning_rate": 3.749860786279096e-05, "loss": 0.471, "num_input_tokens_seen": 8182304, "step": 6735 }, { "epoch": 0.7506403831161599, "grad_norm": 0.4897614121437073, "learning_rate": 3.752645060697183e-05, "loss": 0.4931, "num_input_tokens_seen": 8188160, "step": 6740 }, { "epoch": 0.7511972379997772, "grad_norm": 0.45322564244270325, "learning_rate": 3.755429335115269e-05, "loss": 0.4714, "num_input_tokens_seen": 8194240, "step": 6745 }, { "epoch": 0.7517540928833946, "grad_norm": 0.7186262607574463, "learning_rate": 3.758213609533356e-05, "loss": 0.4955, "num_input_tokens_seen": 8200320, "step": 6750 }, { "epoch": 0.7523109477670119, "grad_norm": 0.3927483558654785, "learning_rate": 3.7609978839514426e-05, "loss": 0.486, "num_input_tokens_seen": 8206688, "step": 6755 }, { "epoch": 0.7528678026506292, "grad_norm": 0.40012025833129883, "learning_rate": 3.7637821583695295e-05, "loss": 0.4974, "num_input_tokens_seen": 8212896, "step": 6760 }, { "epoch": 0.7534246575342466, "grad_norm": 0.41365745663642883, "learning_rate": 3.7665664327876156e-05, "loss": 0.4722, "num_input_tokens_seen": 8218496, "step": 6765 }, { "epoch": 0.7539815124178639, "grad_norm": 0.5086566805839539, "learning_rate": 3.769350707205702e-05, "loss": 0.4687, "num_input_tokens_seen": 8224800, "step": 6770 }, { "epoch": 0.7545383673014813, "grad_norm": 0.3602585792541504, "learning_rate": 3.772134981623789e-05, "loss": 0.4599, "num_input_tokens_seen": 8230944, "step": 6775 }, { "epoch": 0.7550952221850986, "grad_norm": 0.6729400157928467, "learning_rate": 3.7749192560418756e-05, "loss": 0.5095, "num_input_tokens_seen": 8237024, "step": 6780 }, { "epoch": 0.7556520770687158, "grad_norm": 0.44420793652534485, "learning_rate": 3.7777035304599624e-05, "loss": 0.4741, "num_input_tokens_seen": 8243232, "step": 6785 }, { "epoch": 0.7562089319523332, "grad_norm": 0.33808788657188416, "learning_rate": 3.780487804878049e-05, "loss": 0.4839, "num_input_tokens_seen": 8249088, "step": 6790 }, { "epoch": 0.7567657868359505, "grad_norm": 0.5300577282905579, "learning_rate": 3.7832720792961355e-05, "loss": 0.4784, "num_input_tokens_seen": 8255296, "step": 6795 }, { "epoch": 0.7573226417195679, "grad_norm": 0.39151883125305176, "learning_rate": 3.7860563537142223e-05, "loss": 0.5047, "num_input_tokens_seen": 8260928, "step": 6800 }, { "epoch": 0.7578794966031852, "grad_norm": 0.27770838141441345, "learning_rate": 3.788840628132309e-05, "loss": 0.4613, "num_input_tokens_seen": 8267040, "step": 6805 }, { "epoch": 0.7584363514868026, "grad_norm": 0.35590896010398865, "learning_rate": 3.791624902550396e-05, "loss": 0.4592, "num_input_tokens_seen": 8273088, "step": 6810 }, { "epoch": 0.7589932063704199, "grad_norm": 0.446626752614975, "learning_rate": 3.794409176968482e-05, "loss": 0.4979, "num_input_tokens_seen": 8278944, "step": 6815 }, { "epoch": 0.7595500612540372, "grad_norm": 0.3787863552570343, "learning_rate": 3.7971934513865685e-05, "loss": 0.4986, "num_input_tokens_seen": 8284960, "step": 6820 }, { "epoch": 0.7601069161376546, "grad_norm": 0.5223546624183655, "learning_rate": 3.799977725804655e-05, "loss": 0.4883, "num_input_tokens_seen": 8290624, "step": 6825 }, { "epoch": 0.7606637710212718, "grad_norm": 0.5004909634590149, "learning_rate": 3.802762000222742e-05, "loss": 0.4756, "num_input_tokens_seen": 8296864, "step": 6830 }, { "epoch": 0.7612206259048891, "grad_norm": 0.43708547949790955, "learning_rate": 3.805546274640829e-05, "loss": 0.4573, "num_input_tokens_seen": 8303072, "step": 6835 }, { "epoch": 0.7617774807885065, "grad_norm": 0.41723906993865967, "learning_rate": 3.808330549058915e-05, "loss": 0.4706, "num_input_tokens_seen": 8309152, "step": 6840 }, { "epoch": 0.7623343356721238, "grad_norm": 0.4065011143684387, "learning_rate": 3.811114823477002e-05, "loss": 0.4792, "num_input_tokens_seen": 8315200, "step": 6845 }, { "epoch": 0.7628911905557412, "grad_norm": 0.5028240084648132, "learning_rate": 3.813899097895089e-05, "loss": 0.4627, "num_input_tokens_seen": 8320832, "step": 6850 }, { "epoch": 0.7634480454393585, "grad_norm": 0.35388466715812683, "learning_rate": 3.816683372313176e-05, "loss": 0.4756, "num_input_tokens_seen": 8326592, "step": 6855 }, { "epoch": 0.7640049003229759, "grad_norm": 0.4197545051574707, "learning_rate": 3.819467646731262e-05, "loss": 0.4667, "num_input_tokens_seen": 8332672, "step": 6860 }, { "epoch": 0.7645617552065932, "grad_norm": 0.5045791268348694, "learning_rate": 3.822251921149348e-05, "loss": 0.4946, "num_input_tokens_seen": 8338720, "step": 6865 }, { "epoch": 0.7651186100902105, "grad_norm": 0.363972932100296, "learning_rate": 3.825036195567435e-05, "loss": 0.4494, "num_input_tokens_seen": 8344448, "step": 6870 }, { "epoch": 0.7656754649738278, "grad_norm": 0.3830297887325287, "learning_rate": 3.827820469985522e-05, "loss": 0.4967, "num_input_tokens_seen": 8350656, "step": 6875 }, { "epoch": 0.7662323198574451, "grad_norm": 0.4878532588481903, "learning_rate": 3.830604744403609e-05, "loss": 0.4447, "num_input_tokens_seen": 8356576, "step": 6880 }, { "epoch": 0.7667891747410625, "grad_norm": 0.499851256608963, "learning_rate": 3.833389018821695e-05, "loss": 0.5178, "num_input_tokens_seen": 8362720, "step": 6885 }, { "epoch": 0.7673460296246798, "grad_norm": 0.4234555959701538, "learning_rate": 3.836173293239782e-05, "loss": 0.4605, "num_input_tokens_seen": 8368800, "step": 6890 }, { "epoch": 0.7679028845082971, "grad_norm": 0.3565358817577362, "learning_rate": 3.838957567657869e-05, "loss": 0.4579, "num_input_tokens_seen": 8374240, "step": 6895 }, { "epoch": 0.7684597393919145, "grad_norm": 0.39325422048568726, "learning_rate": 3.8417418420759556e-05, "loss": 0.4916, "num_input_tokens_seen": 8380288, "step": 6900 }, { "epoch": 0.7690165942755318, "grad_norm": 0.5641790628433228, "learning_rate": 3.844526116494042e-05, "loss": 0.4706, "num_input_tokens_seen": 8386432, "step": 6905 }, { "epoch": 0.7695734491591492, "grad_norm": 0.36475664377212524, "learning_rate": 3.847310390912128e-05, "loss": 0.4501, "num_input_tokens_seen": 8392448, "step": 6910 }, { "epoch": 0.7701303040427665, "grad_norm": 0.4066353142261505, "learning_rate": 3.850094665330215e-05, "loss": 0.4713, "num_input_tokens_seen": 8398240, "step": 6915 }, { "epoch": 0.7706871589263837, "grad_norm": 0.41233497858047485, "learning_rate": 3.852878939748302e-05, "loss": 0.4807, "num_input_tokens_seen": 8404352, "step": 6920 }, { "epoch": 0.7712440138100011, "grad_norm": 0.4466035068035126, "learning_rate": 3.8556632141663886e-05, "loss": 0.4453, "num_input_tokens_seen": 8410208, "step": 6925 }, { "epoch": 0.7718008686936184, "grad_norm": 0.4507395327091217, "learning_rate": 3.8584474885844754e-05, "loss": 0.4881, "num_input_tokens_seen": 8416128, "step": 6930 }, { "epoch": 0.7723577235772358, "grad_norm": 0.3389889597892761, "learning_rate": 3.8612317630025616e-05, "loss": 0.4798, "num_input_tokens_seen": 8422400, "step": 6935 }, { "epoch": 0.7729145784608531, "grad_norm": 0.3754684627056122, "learning_rate": 3.8640160374206485e-05, "loss": 0.4682, "num_input_tokens_seen": 8428544, "step": 6940 }, { "epoch": 0.7734714333444704, "grad_norm": 0.463097482919693, "learning_rate": 3.8668003118387354e-05, "loss": 0.4885, "num_input_tokens_seen": 8434304, "step": 6945 }, { "epoch": 0.7740282882280878, "grad_norm": 0.3762580454349518, "learning_rate": 3.8695845862568215e-05, "loss": 0.4794, "num_input_tokens_seen": 8440512, "step": 6950 }, { "epoch": 0.7745851431117051, "grad_norm": 0.44417932629585266, "learning_rate": 3.8723688606749084e-05, "loss": 0.492, "num_input_tokens_seen": 8446720, "step": 6955 }, { "epoch": 0.7751419979953225, "grad_norm": 0.43597814440727234, "learning_rate": 3.8751531350929946e-05, "loss": 0.4662, "num_input_tokens_seen": 8452640, "step": 6960 }, { "epoch": 0.7756988528789397, "grad_norm": 0.34440961480140686, "learning_rate": 3.8779374095110815e-05, "loss": 0.4822, "num_input_tokens_seen": 8458784, "step": 6965 }, { "epoch": 0.776255707762557, "grad_norm": 0.48122355341911316, "learning_rate": 3.880721683929168e-05, "loss": 0.4782, "num_input_tokens_seen": 8464832, "step": 6970 }, { "epoch": 0.7768125626461744, "grad_norm": 0.49686184525489807, "learning_rate": 3.883505958347255e-05, "loss": 0.5055, "num_input_tokens_seen": 8470912, "step": 6975 }, { "epoch": 0.7773694175297917, "grad_norm": 0.33862099051475525, "learning_rate": 3.8862902327653414e-05, "loss": 0.4922, "num_input_tokens_seen": 8477056, "step": 6980 }, { "epoch": 0.7779262724134091, "grad_norm": 0.4473043978214264, "learning_rate": 3.889074507183428e-05, "loss": 0.4802, "num_input_tokens_seen": 8483360, "step": 6985 }, { "epoch": 0.7784831272970264, "grad_norm": 0.4064438045024872, "learning_rate": 3.891858781601515e-05, "loss": 0.5283, "num_input_tokens_seen": 8489824, "step": 6990 }, { "epoch": 0.7790399821806437, "grad_norm": 0.48544564843177795, "learning_rate": 3.894643056019601e-05, "loss": 0.4612, "num_input_tokens_seen": 8496032, "step": 6995 }, { "epoch": 0.7795968370642611, "grad_norm": 0.38470780849456787, "learning_rate": 3.897427330437688e-05, "loss": 0.4797, "num_input_tokens_seen": 8501728, "step": 7000 }, { "epoch": 0.7801536919478784, "grad_norm": 0.36427047848701477, "learning_rate": 3.9002116048557744e-05, "loss": 0.4797, "num_input_tokens_seen": 8507520, "step": 7005 }, { "epoch": 0.7807105468314958, "grad_norm": 0.4779553711414337, "learning_rate": 3.902995879273861e-05, "loss": 0.5264, "num_input_tokens_seen": 8514208, "step": 7010 }, { "epoch": 0.781267401715113, "grad_norm": 0.40870460867881775, "learning_rate": 3.905780153691948e-05, "loss": 0.4451, "num_input_tokens_seen": 8520320, "step": 7015 }, { "epoch": 0.7818242565987303, "grad_norm": 0.3742215931415558, "learning_rate": 3.908564428110035e-05, "loss": 0.4501, "num_input_tokens_seen": 8526272, "step": 7020 }, { "epoch": 0.7823811114823477, "grad_norm": 0.4858332574367523, "learning_rate": 3.911348702528121e-05, "loss": 0.4984, "num_input_tokens_seen": 8532352, "step": 7025 }, { "epoch": 0.782937966365965, "grad_norm": 0.4077288508415222, "learning_rate": 3.914132976946208e-05, "loss": 0.5072, "num_input_tokens_seen": 8538432, "step": 7030 }, { "epoch": 0.7834948212495824, "grad_norm": 0.5642166137695312, "learning_rate": 3.916917251364295e-05, "loss": 0.4887, "num_input_tokens_seen": 8544704, "step": 7035 }, { "epoch": 0.7840516761331997, "grad_norm": 0.39633098244667053, "learning_rate": 3.919701525782381e-05, "loss": 0.4699, "num_input_tokens_seen": 8551008, "step": 7040 }, { "epoch": 0.784608531016817, "grad_norm": 0.368455708026886, "learning_rate": 3.922485800200468e-05, "loss": 0.4723, "num_input_tokens_seen": 8557280, "step": 7045 }, { "epoch": 0.7851653859004344, "grad_norm": 0.4113759696483612, "learning_rate": 3.925270074618554e-05, "loss": 0.4674, "num_input_tokens_seen": 8563168, "step": 7050 }, { "epoch": 0.7857222407840517, "grad_norm": 0.5463143587112427, "learning_rate": 3.928054349036641e-05, "loss": 0.4646, "num_input_tokens_seen": 8568992, "step": 7055 }, { "epoch": 0.786279095667669, "grad_norm": 0.3252580463886261, "learning_rate": 3.930838623454728e-05, "loss": 0.4816, "num_input_tokens_seen": 8574816, "step": 7060 }, { "epoch": 0.7868359505512863, "grad_norm": 0.38149505853652954, "learning_rate": 3.933622897872815e-05, "loss": 0.466, "num_input_tokens_seen": 8580960, "step": 7065 }, { "epoch": 0.7873928054349036, "grad_norm": 0.4887966215610504, "learning_rate": 3.9364071722909016e-05, "loss": 0.4949, "num_input_tokens_seen": 8587072, "step": 7070 }, { "epoch": 0.787949660318521, "grad_norm": 0.41581419110298157, "learning_rate": 3.939191446708988e-05, "loss": 0.4603, "num_input_tokens_seen": 8593024, "step": 7075 }, { "epoch": 0.7885065152021383, "grad_norm": 0.3546419143676758, "learning_rate": 3.9419757211270746e-05, "loss": 0.4856, "num_input_tokens_seen": 8599296, "step": 7080 }, { "epoch": 0.7890633700857557, "grad_norm": 0.4119083881378174, "learning_rate": 3.944759995545161e-05, "loss": 0.4789, "num_input_tokens_seen": 8605664, "step": 7085 }, { "epoch": 0.789620224969373, "grad_norm": 0.6919230818748474, "learning_rate": 3.947544269963248e-05, "loss": 0.4663, "num_input_tokens_seen": 8611904, "step": 7090 }, { "epoch": 0.7901770798529903, "grad_norm": 0.41665396094322205, "learning_rate": 3.9503285443813345e-05, "loss": 0.4778, "num_input_tokens_seen": 8617408, "step": 7095 }, { "epoch": 0.7907339347366077, "grad_norm": 0.45157092809677124, "learning_rate": 3.953112818799421e-05, "loss": 0.479, "num_input_tokens_seen": 8623424, "step": 7100 }, { "epoch": 0.7912907896202249, "grad_norm": 0.5534867644309998, "learning_rate": 3.9558970932175076e-05, "loss": 0.4702, "num_input_tokens_seen": 8629472, "step": 7105 }, { "epoch": 0.7918476445038423, "grad_norm": 0.4380035698413849, "learning_rate": 3.9586813676355945e-05, "loss": 0.4486, "num_input_tokens_seen": 8635744, "step": 7110 }, { "epoch": 0.7924044993874596, "grad_norm": 0.3854268193244934, "learning_rate": 3.961465642053681e-05, "loss": 0.4914, "num_input_tokens_seen": 8642112, "step": 7115 }, { "epoch": 0.792961354271077, "grad_norm": 0.440082311630249, "learning_rate": 3.9642499164717675e-05, "loss": 0.4673, "num_input_tokens_seen": 8648384, "step": 7120 }, { "epoch": 0.7935182091546943, "grad_norm": 0.36815449595451355, "learning_rate": 3.9670341908898544e-05, "loss": 0.4671, "num_input_tokens_seen": 8654304, "step": 7125 }, { "epoch": 0.7940750640383116, "grad_norm": 0.35873377323150635, "learning_rate": 3.969818465307941e-05, "loss": 0.4609, "num_input_tokens_seen": 8660448, "step": 7130 }, { "epoch": 0.794631918921929, "grad_norm": 0.39610913395881653, "learning_rate": 3.9726027397260274e-05, "loss": 0.4946, "num_input_tokens_seen": 8666432, "step": 7135 }, { "epoch": 0.7951887738055463, "grad_norm": 0.40447863936424255, "learning_rate": 3.975387014144114e-05, "loss": 0.485, "num_input_tokens_seen": 8672608, "step": 7140 }, { "epoch": 0.7957456286891637, "grad_norm": 0.4848209321498871, "learning_rate": 3.9781712885622005e-05, "loss": 0.4652, "num_input_tokens_seen": 8678464, "step": 7145 }, { "epoch": 0.7963024835727809, "grad_norm": 0.35872527956962585, "learning_rate": 3.9809555629802874e-05, "loss": 0.4976, "num_input_tokens_seen": 8684512, "step": 7150 }, { "epoch": 0.7968593384563982, "grad_norm": 0.47054851055145264, "learning_rate": 3.983739837398374e-05, "loss": 0.4619, "num_input_tokens_seen": 8690432, "step": 7155 }, { "epoch": 0.7974161933400156, "grad_norm": 0.32316499948501587, "learning_rate": 3.986524111816461e-05, "loss": 0.4671, "num_input_tokens_seen": 8696608, "step": 7160 }, { "epoch": 0.7979730482236329, "grad_norm": 0.5312647819519043, "learning_rate": 3.989308386234547e-05, "loss": 0.4934, "num_input_tokens_seen": 8702720, "step": 7165 }, { "epoch": 0.7985299031072502, "grad_norm": 0.3081958293914795, "learning_rate": 3.992092660652634e-05, "loss": 0.4578, "num_input_tokens_seen": 8708128, "step": 7170 }, { "epoch": 0.7990867579908676, "grad_norm": 0.36848971247673035, "learning_rate": 3.994876935070721e-05, "loss": 0.4882, "num_input_tokens_seen": 8714208, "step": 7175 }, { "epoch": 0.7996436128744849, "grad_norm": 0.46614763140678406, "learning_rate": 3.997661209488807e-05, "loss": 0.4852, "num_input_tokens_seen": 8720160, "step": 7180 }, { "epoch": 0.8002004677581023, "grad_norm": 0.5528506636619568, "learning_rate": 4.000445483906894e-05, "loss": 0.507, "num_input_tokens_seen": 8726400, "step": 7185 }, { "epoch": 0.8007573226417196, "grad_norm": 0.7723687887191772, "learning_rate": 4.003229758324981e-05, "loss": 0.469, "num_input_tokens_seen": 8732672, "step": 7190 }, { "epoch": 0.8013141775253368, "grad_norm": 0.3609229326248169, "learning_rate": 4.006014032743067e-05, "loss": 0.4975, "num_input_tokens_seen": 8738848, "step": 7195 }, { "epoch": 0.8018710324089542, "grad_norm": 0.46473532915115356, "learning_rate": 4.008798307161154e-05, "loss": 0.4726, "num_input_tokens_seen": 8744864, "step": 7200 }, { "epoch": 0.8024278872925715, "grad_norm": 0.6071436405181885, "learning_rate": 4.011582581579241e-05, "loss": 0.4767, "num_input_tokens_seen": 8751168, "step": 7205 }, { "epoch": 0.8029847421761889, "grad_norm": 0.4778107702732086, "learning_rate": 4.014366855997328e-05, "loss": 0.4856, "num_input_tokens_seen": 8757152, "step": 7210 }, { "epoch": 0.8035415970598062, "grad_norm": 0.3278772532939911, "learning_rate": 4.017151130415414e-05, "loss": 0.4894, "num_input_tokens_seen": 8763136, "step": 7215 }, { "epoch": 0.8040984519434236, "grad_norm": 0.5297033786773682, "learning_rate": 4.019935404833501e-05, "loss": 0.4826, "num_input_tokens_seen": 8769376, "step": 7220 }, { "epoch": 0.8046553068270409, "grad_norm": 0.4083932042121887, "learning_rate": 4.022719679251587e-05, "loss": 0.4862, "num_input_tokens_seen": 8775456, "step": 7225 }, { "epoch": 0.8052121617106582, "grad_norm": 0.3598753809928894, "learning_rate": 4.025503953669674e-05, "loss": 0.4715, "num_input_tokens_seen": 8781504, "step": 7230 }, { "epoch": 0.8057690165942756, "grad_norm": 0.5517621040344238, "learning_rate": 4.028288228087761e-05, "loss": 0.4617, "num_input_tokens_seen": 8787168, "step": 7235 }, { "epoch": 0.8063258714778928, "grad_norm": 0.46736758947372437, "learning_rate": 4.031072502505847e-05, "loss": 0.4633, "num_input_tokens_seen": 8793216, "step": 7240 }, { "epoch": 0.8068827263615101, "grad_norm": 0.39138227701187134, "learning_rate": 4.033856776923934e-05, "loss": 0.5028, "num_input_tokens_seen": 8799200, "step": 7245 }, { "epoch": 0.8074395812451275, "grad_norm": 0.3510797917842865, "learning_rate": 4.0366410513420206e-05, "loss": 0.454, "num_input_tokens_seen": 8805216, "step": 7250 }, { "epoch": 0.8079964361287448, "grad_norm": 0.3840314447879791, "learning_rate": 4.0394253257601075e-05, "loss": 0.4538, "num_input_tokens_seen": 8811264, "step": 7255 }, { "epoch": 0.8085532910123622, "grad_norm": 0.3409920632839203, "learning_rate": 4.0422096001781937e-05, "loss": 0.4702, "num_input_tokens_seen": 8816832, "step": 7260 }, { "epoch": 0.8091101458959795, "grad_norm": 0.4914396405220032, "learning_rate": 4.0449938745962805e-05, "loss": 0.4958, "num_input_tokens_seen": 8823072, "step": 7265 }, { "epoch": 0.8096670007795969, "grad_norm": 0.38989758491516113, "learning_rate": 4.047778149014367e-05, "loss": 0.4614, "num_input_tokens_seen": 8828736, "step": 7270 }, { "epoch": 0.8102238556632142, "grad_norm": 0.36329352855682373, "learning_rate": 4.0505624234324536e-05, "loss": 0.465, "num_input_tokens_seen": 8834976, "step": 7275 }, { "epoch": 0.8107807105468315, "grad_norm": 0.3878443241119385, "learning_rate": 4.0533466978505404e-05, "loss": 0.4925, "num_input_tokens_seen": 8841088, "step": 7280 }, { "epoch": 0.8113375654304488, "grad_norm": 0.27525919675827026, "learning_rate": 4.0561309722686266e-05, "loss": 0.5144, "num_input_tokens_seen": 8847136, "step": 7285 }, { "epoch": 0.8118944203140661, "grad_norm": 0.31034979224205017, "learning_rate": 4.0589152466867135e-05, "loss": 0.4651, "num_input_tokens_seen": 8853280, "step": 7290 }, { "epoch": 0.8124512751976835, "grad_norm": 0.4416530728340149, "learning_rate": 4.0616995211048004e-05, "loss": 0.4702, "num_input_tokens_seen": 8859264, "step": 7295 }, { "epoch": 0.8130081300813008, "grad_norm": 0.29428547620773315, "learning_rate": 4.064483795522887e-05, "loss": 0.4549, "num_input_tokens_seen": 8865248, "step": 7300 }, { "epoch": 0.8135649849649181, "grad_norm": 0.5503513813018799, "learning_rate": 4.067268069940974e-05, "loss": 0.4813, "num_input_tokens_seen": 8871392, "step": 7305 }, { "epoch": 0.8141218398485355, "grad_norm": 0.5620527267456055, "learning_rate": 4.07005234435906e-05, "loss": 0.4519, "num_input_tokens_seen": 8877312, "step": 7310 }, { "epoch": 0.8146786947321528, "grad_norm": 0.37452346086502075, "learning_rate": 4.0728366187771465e-05, "loss": 0.4819, "num_input_tokens_seen": 8883264, "step": 7315 }, { "epoch": 0.8152355496157702, "grad_norm": 0.31483951210975647, "learning_rate": 4.075620893195233e-05, "loss": 0.4686, "num_input_tokens_seen": 8889504, "step": 7320 }, { "epoch": 0.8157924044993875, "grad_norm": 0.43434348702430725, "learning_rate": 4.07840516761332e-05, "loss": 0.4828, "num_input_tokens_seen": 8894752, "step": 7325 }, { "epoch": 0.8163492593830048, "grad_norm": 0.41560447216033936, "learning_rate": 4.081189442031407e-05, "loss": 0.4785, "num_input_tokens_seen": 8900480, "step": 7330 }, { "epoch": 0.8169061142666221, "grad_norm": 0.43182578682899475, "learning_rate": 4.083973716449493e-05, "loss": 0.455, "num_input_tokens_seen": 8906464, "step": 7335 }, { "epoch": 0.8174629691502394, "grad_norm": 0.4465363621711731, "learning_rate": 4.08675799086758e-05, "loss": 0.4868, "num_input_tokens_seen": 8912640, "step": 7340 }, { "epoch": 0.8180198240338568, "grad_norm": 0.4836183190345764, "learning_rate": 4.089542265285667e-05, "loss": 0.5058, "num_input_tokens_seen": 8919040, "step": 7345 }, { "epoch": 0.8185766789174741, "grad_norm": 0.45918509364128113, "learning_rate": 4.092326539703754e-05, "loss": 0.4932, "num_input_tokens_seen": 8924864, "step": 7350 }, { "epoch": 0.8191335338010914, "grad_norm": 0.532551646232605, "learning_rate": 4.09511081412184e-05, "loss": 0.5074, "num_input_tokens_seen": 8930720, "step": 7355 }, { "epoch": 0.8196903886847088, "grad_norm": 0.5315629243850708, "learning_rate": 4.097895088539926e-05, "loss": 0.466, "num_input_tokens_seen": 8936768, "step": 7360 }, { "epoch": 0.8202472435683261, "grad_norm": 0.45131534337997437, "learning_rate": 4.100679362958013e-05, "loss": 0.4491, "num_input_tokens_seen": 8942368, "step": 7365 }, { "epoch": 0.8208040984519435, "grad_norm": 0.4580237865447998, "learning_rate": 4.1034636373761e-05, "loss": 0.4794, "num_input_tokens_seen": 8948256, "step": 7370 }, { "epoch": 0.8213609533355608, "grad_norm": 0.4335106313228607, "learning_rate": 4.106247911794187e-05, "loss": 0.477, "num_input_tokens_seen": 8954496, "step": 7375 }, { "epoch": 0.821917808219178, "grad_norm": 0.3740100562572479, "learning_rate": 4.109032186212273e-05, "loss": 0.4583, "num_input_tokens_seen": 8960480, "step": 7380 }, { "epoch": 0.8224746631027954, "grad_norm": 0.39177748560905457, "learning_rate": 4.11181646063036e-05, "loss": 0.4671, "num_input_tokens_seen": 8966720, "step": 7385 }, { "epoch": 0.8230315179864127, "grad_norm": 0.4023469090461731, "learning_rate": 4.114600735048447e-05, "loss": 0.4893, "num_input_tokens_seen": 8972768, "step": 7390 }, { "epoch": 0.8235883728700301, "grad_norm": 0.4004787504673004, "learning_rate": 4.1173850094665336e-05, "loss": 0.4934, "num_input_tokens_seen": 8978816, "step": 7395 }, { "epoch": 0.8241452277536474, "grad_norm": 0.5091335773468018, "learning_rate": 4.12016928388462e-05, "loss": 0.5119, "num_input_tokens_seen": 8984928, "step": 7400 }, { "epoch": 0.8247020826372647, "grad_norm": 0.3602542281150818, "learning_rate": 4.122953558302706e-05, "loss": 0.4685, "num_input_tokens_seen": 8991136, "step": 7405 }, { "epoch": 0.8252589375208821, "grad_norm": 0.387464702129364, "learning_rate": 4.125737832720793e-05, "loss": 0.4863, "num_input_tokens_seen": 8997344, "step": 7410 }, { "epoch": 0.8258157924044994, "grad_norm": 0.3534359037876129, "learning_rate": 4.12852210713888e-05, "loss": 0.4781, "num_input_tokens_seen": 9003232, "step": 7415 }, { "epoch": 0.8263726472881168, "grad_norm": 0.39485496282577515, "learning_rate": 4.1313063815569666e-05, "loss": 0.4709, "num_input_tokens_seen": 9008704, "step": 7420 }, { "epoch": 0.826929502171734, "grad_norm": 0.5447079539299011, "learning_rate": 4.134090655975053e-05, "loss": 0.5075, "num_input_tokens_seen": 9015040, "step": 7425 }, { "epoch": 0.8274863570553513, "grad_norm": 0.35580745339393616, "learning_rate": 4.1368749303931396e-05, "loss": 0.4623, "num_input_tokens_seen": 9020832, "step": 7430 }, { "epoch": 0.8280432119389687, "grad_norm": 0.4076179563999176, "learning_rate": 4.1396592048112265e-05, "loss": 0.4881, "num_input_tokens_seen": 9027104, "step": 7435 }, { "epoch": 0.828600066822586, "grad_norm": 0.60045325756073, "learning_rate": 4.1424434792293134e-05, "loss": 0.491, "num_input_tokens_seen": 9033216, "step": 7440 }, { "epoch": 0.8291569217062034, "grad_norm": 0.3536754846572876, "learning_rate": 4.1452277536474e-05, "loss": 0.4659, "num_input_tokens_seen": 9039264, "step": 7445 }, { "epoch": 0.8297137765898207, "grad_norm": 0.4261096119880676, "learning_rate": 4.1480120280654864e-05, "loss": 0.4909, "num_input_tokens_seen": 9045280, "step": 7450 }, { "epoch": 0.830270631473438, "grad_norm": 0.4034510850906372, "learning_rate": 4.1507963024835726e-05, "loss": 0.4793, "num_input_tokens_seen": 9051424, "step": 7455 }, { "epoch": 0.8308274863570554, "grad_norm": 0.3094559609889984, "learning_rate": 4.1535805769016595e-05, "loss": 0.4696, "num_input_tokens_seen": 9056896, "step": 7460 }, { "epoch": 0.8313843412406727, "grad_norm": 0.3675262928009033, "learning_rate": 4.1563648513197463e-05, "loss": 0.467, "num_input_tokens_seen": 9063104, "step": 7465 }, { "epoch": 0.83194119612429, "grad_norm": 0.4023696184158325, "learning_rate": 4.159149125737833e-05, "loss": 0.4869, "num_input_tokens_seen": 9069184, "step": 7470 }, { "epoch": 0.8324980510079073, "grad_norm": 0.42542126774787903, "learning_rate": 4.1619334001559194e-05, "loss": 0.4964, "num_input_tokens_seen": 9075360, "step": 7475 }, { "epoch": 0.8330549058915246, "grad_norm": 0.48159611225128174, "learning_rate": 4.164717674574006e-05, "loss": 0.4592, "num_input_tokens_seen": 9081312, "step": 7480 }, { "epoch": 0.833611760775142, "grad_norm": 0.32551464438438416, "learning_rate": 4.167501948992093e-05, "loss": 0.4748, "num_input_tokens_seen": 9087392, "step": 7485 }, { "epoch": 0.8341686156587593, "grad_norm": 0.4729333519935608, "learning_rate": 4.17028622341018e-05, "loss": 0.485, "num_input_tokens_seen": 9093568, "step": 7490 }, { "epoch": 0.8347254705423767, "grad_norm": 0.39463022351264954, "learning_rate": 4.173070497828266e-05, "loss": 0.4954, "num_input_tokens_seen": 9099552, "step": 7495 }, { "epoch": 0.835282325425994, "grad_norm": 0.49131304025650024, "learning_rate": 4.1758547722463524e-05, "loss": 0.4744, "num_input_tokens_seen": 9105536, "step": 7500 }, { "epoch": 0.8358391803096114, "grad_norm": 0.31461232900619507, "learning_rate": 4.178639046664439e-05, "loss": 0.4773, "num_input_tokens_seen": 9111584, "step": 7505 }, { "epoch": 0.8363960351932287, "grad_norm": 0.38535845279693604, "learning_rate": 4.181423321082526e-05, "loss": 0.4506, "num_input_tokens_seen": 9117056, "step": 7510 }, { "epoch": 0.8369528900768459, "grad_norm": 0.4697727859020233, "learning_rate": 4.184207595500613e-05, "loss": 0.4902, "num_input_tokens_seen": 9123328, "step": 7515 }, { "epoch": 0.8375097449604633, "grad_norm": 0.512149453163147, "learning_rate": 4.186991869918699e-05, "loss": 0.4756, "num_input_tokens_seen": 9129696, "step": 7520 }, { "epoch": 0.8380665998440806, "grad_norm": 0.3938658833503723, "learning_rate": 4.189776144336786e-05, "loss": 0.487, "num_input_tokens_seen": 9135520, "step": 7525 }, { "epoch": 0.838623454727698, "grad_norm": 0.5190106630325317, "learning_rate": 4.192560418754873e-05, "loss": 0.4921, "num_input_tokens_seen": 9141536, "step": 7530 }, { "epoch": 0.8391803096113153, "grad_norm": 0.428853839635849, "learning_rate": 4.19534469317296e-05, "loss": 0.4387, "num_input_tokens_seen": 9147584, "step": 7535 }, { "epoch": 0.8397371644949326, "grad_norm": 0.43377235531806946, "learning_rate": 4.198128967591046e-05, "loss": 0.4905, "num_input_tokens_seen": 9153888, "step": 7540 }, { "epoch": 0.84029401937855, "grad_norm": 0.4101695716381073, "learning_rate": 4.200913242009132e-05, "loss": 0.503, "num_input_tokens_seen": 9159840, "step": 7545 }, { "epoch": 0.8408508742621673, "grad_norm": 0.5759966373443604, "learning_rate": 4.203697516427219e-05, "loss": 0.4761, "num_input_tokens_seen": 9165664, "step": 7550 }, { "epoch": 0.8414077291457847, "grad_norm": 0.33644458651542664, "learning_rate": 4.206481790845306e-05, "loss": 0.4616, "num_input_tokens_seen": 9171808, "step": 7555 }, { "epoch": 0.8419645840294019, "grad_norm": 0.37157171964645386, "learning_rate": 4.209266065263393e-05, "loss": 0.4812, "num_input_tokens_seen": 9178240, "step": 7560 }, { "epoch": 0.8425214389130192, "grad_norm": 0.47008344531059265, "learning_rate": 4.212050339681479e-05, "loss": 0.4602, "num_input_tokens_seen": 9184224, "step": 7565 }, { "epoch": 0.8430782937966366, "grad_norm": 0.3550148904323578, "learning_rate": 4.214834614099566e-05, "loss": 0.466, "num_input_tokens_seen": 9188704, "step": 7570 }, { "epoch": 0.8436351486802539, "grad_norm": 0.33809107542037964, "learning_rate": 4.2176188885176526e-05, "loss": 0.4684, "num_input_tokens_seen": 9194464, "step": 7575 }, { "epoch": 0.8441920035638713, "grad_norm": 0.4332287907600403, "learning_rate": 4.2204031629357395e-05, "loss": 0.4905, "num_input_tokens_seen": 9200416, "step": 7580 }, { "epoch": 0.8447488584474886, "grad_norm": 0.44676467776298523, "learning_rate": 4.2231874373538264e-05, "loss": 0.4587, "num_input_tokens_seen": 9206336, "step": 7585 }, { "epoch": 0.8453057133311059, "grad_norm": 0.5630241632461548, "learning_rate": 4.225971711771912e-05, "loss": 0.4964, "num_input_tokens_seen": 9212672, "step": 7590 }, { "epoch": 0.8458625682147233, "grad_norm": 0.3713912069797516, "learning_rate": 4.228755986189999e-05, "loss": 0.4671, "num_input_tokens_seen": 9218624, "step": 7595 }, { "epoch": 0.8464194230983406, "grad_norm": 0.34330448508262634, "learning_rate": 4.2315402606080856e-05, "loss": 0.4846, "num_input_tokens_seen": 9224928, "step": 7600 }, { "epoch": 0.8469762779819578, "grad_norm": 0.5186704993247986, "learning_rate": 4.2343245350261725e-05, "loss": 0.483, "num_input_tokens_seen": 9231232, "step": 7605 }, { "epoch": 0.8475331328655752, "grad_norm": 0.41592922806739807, "learning_rate": 4.2371088094442594e-05, "loss": 0.4977, "num_input_tokens_seen": 9237568, "step": 7610 }, { "epoch": 0.8480899877491925, "grad_norm": 0.36631229519844055, "learning_rate": 4.2398930838623455e-05, "loss": 0.481, "num_input_tokens_seen": 9243648, "step": 7615 }, { "epoch": 0.8486468426328099, "grad_norm": 0.34762004017829895, "learning_rate": 4.2426773582804324e-05, "loss": 0.4789, "num_input_tokens_seen": 9249760, "step": 7620 }, { "epoch": 0.8492036975164272, "grad_norm": 0.36535221338272095, "learning_rate": 4.245461632698519e-05, "loss": 0.4874, "num_input_tokens_seen": 9255776, "step": 7625 }, { "epoch": 0.8497605524000446, "grad_norm": 0.623512327671051, "learning_rate": 4.248245907116606e-05, "loss": 0.5222, "num_input_tokens_seen": 9261600, "step": 7630 }, { "epoch": 0.8503174072836619, "grad_norm": 0.377207487821579, "learning_rate": 4.251030181534692e-05, "loss": 0.4704, "num_input_tokens_seen": 9267584, "step": 7635 }, { "epoch": 0.8508742621672792, "grad_norm": 0.4740607738494873, "learning_rate": 4.2538144559527785e-05, "loss": 0.5099, "num_input_tokens_seen": 9273984, "step": 7640 }, { "epoch": 0.8514311170508966, "grad_norm": 0.39418184757232666, "learning_rate": 4.2565987303708654e-05, "loss": 0.4603, "num_input_tokens_seen": 9279872, "step": 7645 }, { "epoch": 0.8519879719345138, "grad_norm": 0.3622950315475464, "learning_rate": 4.259383004788952e-05, "loss": 0.4699, "num_input_tokens_seen": 9285856, "step": 7650 }, { "epoch": 0.8525448268181312, "grad_norm": 0.5395426750183105, "learning_rate": 4.262167279207039e-05, "loss": 0.4684, "num_input_tokens_seen": 9292000, "step": 7655 }, { "epoch": 0.8531016817017485, "grad_norm": 0.3792736530303955, "learning_rate": 4.264951553625125e-05, "loss": 0.4874, "num_input_tokens_seen": 9297952, "step": 7660 }, { "epoch": 0.8536585365853658, "grad_norm": 0.38942912220954895, "learning_rate": 4.267735828043212e-05, "loss": 0.46, "num_input_tokens_seen": 9303776, "step": 7665 }, { "epoch": 0.8542153914689832, "grad_norm": 0.4329650402069092, "learning_rate": 4.270520102461299e-05, "loss": 0.4637, "num_input_tokens_seen": 9310080, "step": 7670 }, { "epoch": 0.8547722463526005, "grad_norm": 0.5175822377204895, "learning_rate": 4.273304376879386e-05, "loss": 0.478, "num_input_tokens_seen": 9316000, "step": 7675 }, { "epoch": 0.8553291012362179, "grad_norm": 0.4264994263648987, "learning_rate": 4.276088651297472e-05, "loss": 0.4853, "num_input_tokens_seen": 9322432, "step": 7680 }, { "epoch": 0.8558859561198352, "grad_norm": 0.38340842723846436, "learning_rate": 4.278872925715558e-05, "loss": 0.5236, "num_input_tokens_seen": 9328320, "step": 7685 }, { "epoch": 0.8564428110034525, "grad_norm": 0.37633654475212097, "learning_rate": 4.281657200133645e-05, "loss": 0.4688, "num_input_tokens_seen": 9334336, "step": 7690 }, { "epoch": 0.8569996658870699, "grad_norm": 0.39592018723487854, "learning_rate": 4.284441474551732e-05, "loss": 0.4769, "num_input_tokens_seen": 9340480, "step": 7695 }, { "epoch": 0.8575565207706871, "grad_norm": 0.44359028339385986, "learning_rate": 4.287225748969819e-05, "loss": 0.4621, "num_input_tokens_seen": 9346400, "step": 7700 }, { "epoch": 0.8581133756543045, "grad_norm": 0.3341268301010132, "learning_rate": 4.290010023387905e-05, "loss": 0.4588, "num_input_tokens_seen": 9352160, "step": 7705 }, { "epoch": 0.8586702305379218, "grad_norm": 0.43781229853630066, "learning_rate": 4.292794297805992e-05, "loss": 0.4764, "num_input_tokens_seen": 9358496, "step": 7710 }, { "epoch": 0.8592270854215391, "grad_norm": 0.418281227350235, "learning_rate": 4.295578572224079e-05, "loss": 0.4644, "num_input_tokens_seen": 9364576, "step": 7715 }, { "epoch": 0.8597839403051565, "grad_norm": 0.3087022006511688, "learning_rate": 4.2983628466421657e-05, "loss": 0.4872, "num_input_tokens_seen": 9370464, "step": 7720 }, { "epoch": 0.8603407951887738, "grad_norm": 0.4114629626274109, "learning_rate": 4.301147121060252e-05, "loss": 0.4686, "num_input_tokens_seen": 9376928, "step": 7725 }, { "epoch": 0.8608976500723912, "grad_norm": 0.3962671458721161, "learning_rate": 4.303931395478338e-05, "loss": 0.4706, "num_input_tokens_seen": 9383296, "step": 7730 }, { "epoch": 0.8614545049560085, "grad_norm": 0.3844263553619385, "learning_rate": 4.306715669896425e-05, "loss": 0.4775, "num_input_tokens_seen": 9389280, "step": 7735 }, { "epoch": 0.8620113598396258, "grad_norm": 0.5425528287887573, "learning_rate": 4.309499944314512e-05, "loss": 0.4873, "num_input_tokens_seen": 9394848, "step": 7740 }, { "epoch": 0.8625682147232431, "grad_norm": 0.49495550990104675, "learning_rate": 4.3122842187325986e-05, "loss": 0.4822, "num_input_tokens_seen": 9400928, "step": 7745 }, { "epoch": 0.8631250696068604, "grad_norm": 0.3094795346260071, "learning_rate": 4.3150684931506855e-05, "loss": 0.4638, "num_input_tokens_seen": 9406848, "step": 7750 }, { "epoch": 0.8636819244904778, "grad_norm": 0.382757306098938, "learning_rate": 4.317852767568772e-05, "loss": 0.464, "num_input_tokens_seen": 9413056, "step": 7755 }, { "epoch": 0.8642387793740951, "grad_norm": 0.3411957621574402, "learning_rate": 4.3206370419868585e-05, "loss": 0.4816, "num_input_tokens_seen": 9418880, "step": 7760 }, { "epoch": 0.8647956342577124, "grad_norm": 0.43123677372932434, "learning_rate": 4.3234213164049454e-05, "loss": 0.4545, "num_input_tokens_seen": 9424704, "step": 7765 }, { "epoch": 0.8653524891413298, "grad_norm": 0.44397231936454773, "learning_rate": 4.3262055908230316e-05, "loss": 0.4733, "num_input_tokens_seen": 9431264, "step": 7770 }, { "epoch": 0.8659093440249471, "grad_norm": 0.29557132720947266, "learning_rate": 4.3289898652411185e-05, "loss": 0.4823, "num_input_tokens_seen": 9437056, "step": 7775 }, { "epoch": 0.8664661989085645, "grad_norm": 0.3234345018863678, "learning_rate": 4.3317741396592047e-05, "loss": 0.4735, "num_input_tokens_seen": 9443392, "step": 7780 }, { "epoch": 0.8670230537921818, "grad_norm": 0.35812485218048096, "learning_rate": 4.3345584140772915e-05, "loss": 0.4818, "num_input_tokens_seen": 9449472, "step": 7785 }, { "epoch": 0.867579908675799, "grad_norm": 0.3676682710647583, "learning_rate": 4.3373426884953784e-05, "loss": 0.472, "num_input_tokens_seen": 9455872, "step": 7790 }, { "epoch": 0.8681367635594164, "grad_norm": 0.41543319821357727, "learning_rate": 4.340126962913465e-05, "loss": 0.4868, "num_input_tokens_seen": 9462304, "step": 7795 }, { "epoch": 0.8686936184430337, "grad_norm": 0.3536570072174072, "learning_rate": 4.3429112373315514e-05, "loss": 0.495, "num_input_tokens_seen": 9468192, "step": 7800 }, { "epoch": 0.8692504733266511, "grad_norm": 0.3736005425453186, "learning_rate": 4.345695511749638e-05, "loss": 0.4771, "num_input_tokens_seen": 9474112, "step": 7805 }, { "epoch": 0.8698073282102684, "grad_norm": 0.4790147840976715, "learning_rate": 4.348479786167725e-05, "loss": 0.4852, "num_input_tokens_seen": 9479712, "step": 7810 }, { "epoch": 0.8703641830938857, "grad_norm": 0.3200148642063141, "learning_rate": 4.351264060585812e-05, "loss": 0.4617, "num_input_tokens_seen": 9485696, "step": 7815 }, { "epoch": 0.8709210379775031, "grad_norm": 0.2967025935649872, "learning_rate": 4.354048335003898e-05, "loss": 0.4562, "num_input_tokens_seen": 9491360, "step": 7820 }, { "epoch": 0.8714778928611204, "grad_norm": 0.3466189503669739, "learning_rate": 4.3568326094219844e-05, "loss": 0.4728, "num_input_tokens_seen": 9497792, "step": 7825 }, { "epoch": 0.8720347477447378, "grad_norm": 0.3598310947418213, "learning_rate": 4.359616883840071e-05, "loss": 0.4997, "num_input_tokens_seen": 9503712, "step": 7830 }, { "epoch": 0.872591602628355, "grad_norm": 0.39131274819374084, "learning_rate": 4.362401158258158e-05, "loss": 0.4619, "num_input_tokens_seen": 9510016, "step": 7835 }, { "epoch": 0.8731484575119723, "grad_norm": 0.3627302348613739, "learning_rate": 4.365185432676245e-05, "loss": 0.4784, "num_input_tokens_seen": 9516000, "step": 7840 }, { "epoch": 0.8737053123955897, "grad_norm": 0.34137463569641113, "learning_rate": 4.367969707094331e-05, "loss": 0.4718, "num_input_tokens_seen": 9522240, "step": 7845 }, { "epoch": 0.874262167279207, "grad_norm": 0.40602490305900574, "learning_rate": 4.370753981512418e-05, "loss": 0.4722, "num_input_tokens_seen": 9528384, "step": 7850 }, { "epoch": 0.8748190221628244, "grad_norm": 0.2882041931152344, "learning_rate": 4.373538255930505e-05, "loss": 0.4685, "num_input_tokens_seen": 9534400, "step": 7855 }, { "epoch": 0.8753758770464417, "grad_norm": 0.46626025438308716, "learning_rate": 4.376322530348592e-05, "loss": 0.5001, "num_input_tokens_seen": 9540576, "step": 7860 }, { "epoch": 0.875932731930059, "grad_norm": 0.41933175921440125, "learning_rate": 4.379106804766678e-05, "loss": 0.4891, "num_input_tokens_seen": 9546432, "step": 7865 }, { "epoch": 0.8764895868136764, "grad_norm": 0.41274240612983704, "learning_rate": 4.381891079184764e-05, "loss": 0.4842, "num_input_tokens_seen": 9552640, "step": 7870 }, { "epoch": 0.8770464416972937, "grad_norm": 0.39890575408935547, "learning_rate": 4.384675353602851e-05, "loss": 0.4858, "num_input_tokens_seen": 9559264, "step": 7875 }, { "epoch": 0.877603296580911, "grad_norm": 0.3497903347015381, "learning_rate": 4.387459628020938e-05, "loss": 0.4842, "num_input_tokens_seen": 9565376, "step": 7880 }, { "epoch": 0.8781601514645283, "grad_norm": 0.5259379744529724, "learning_rate": 4.390243902439025e-05, "loss": 0.4782, "num_input_tokens_seen": 9571584, "step": 7885 }, { "epoch": 0.8787170063481456, "grad_norm": 0.34629595279693604, "learning_rate": 4.3930281768571116e-05, "loss": 0.4759, "num_input_tokens_seen": 9577088, "step": 7890 }, { "epoch": 0.879273861231763, "grad_norm": 0.2999376058578491, "learning_rate": 4.395812451275198e-05, "loss": 0.4637, "num_input_tokens_seen": 9583296, "step": 7895 }, { "epoch": 0.8798307161153803, "grad_norm": 0.33644378185272217, "learning_rate": 4.398596725693285e-05, "loss": 0.4652, "num_input_tokens_seen": 9589696, "step": 7900 }, { "epoch": 0.8803875709989977, "grad_norm": 0.2779053747653961, "learning_rate": 4.4013810001113716e-05, "loss": 0.447, "num_input_tokens_seen": 9595808, "step": 7905 }, { "epoch": 0.880944425882615, "grad_norm": 0.31596842408180237, "learning_rate": 4.404165274529458e-05, "loss": 0.4697, "num_input_tokens_seen": 9602080, "step": 7910 }, { "epoch": 0.8815012807662324, "grad_norm": 0.34395942091941833, "learning_rate": 4.4069495489475446e-05, "loss": 0.4993, "num_input_tokens_seen": 9607968, "step": 7915 }, { "epoch": 0.8820581356498497, "grad_norm": 0.4080057740211487, "learning_rate": 4.409733823365631e-05, "loss": 0.4465, "num_input_tokens_seen": 9614368, "step": 7920 }, { "epoch": 0.8826149905334669, "grad_norm": 0.3868236243724823, "learning_rate": 4.4125180977837177e-05, "loss": 0.4882, "num_input_tokens_seen": 9620384, "step": 7925 }, { "epoch": 0.8831718454170843, "grad_norm": 0.2801050841808319, "learning_rate": 4.4153023722018045e-05, "loss": 0.4581, "num_input_tokens_seen": 9626208, "step": 7930 }, { "epoch": 0.8837287003007016, "grad_norm": 0.414303719997406, "learning_rate": 4.4180866466198914e-05, "loss": 0.474, "num_input_tokens_seen": 9632384, "step": 7935 }, { "epoch": 0.884285555184319, "grad_norm": 0.2634739577770233, "learning_rate": 4.4208709210379776e-05, "loss": 0.4684, "num_input_tokens_seen": 9637952, "step": 7940 }, { "epoch": 0.8848424100679363, "grad_norm": 0.47999244928359985, "learning_rate": 4.4236551954560644e-05, "loss": 0.4615, "num_input_tokens_seen": 9643968, "step": 7945 }, { "epoch": 0.8853992649515536, "grad_norm": 0.3267538249492645, "learning_rate": 4.426439469874151e-05, "loss": 0.4605, "num_input_tokens_seen": 9650112, "step": 7950 }, { "epoch": 0.885956119835171, "grad_norm": 0.5590184926986694, "learning_rate": 4.4292237442922375e-05, "loss": 0.4915, "num_input_tokens_seen": 9656352, "step": 7955 }, { "epoch": 0.8865129747187883, "grad_norm": 0.3418710231781006, "learning_rate": 4.4320080187103244e-05, "loss": 0.4898, "num_input_tokens_seen": 9662400, "step": 7960 }, { "epoch": 0.8870698296024057, "grad_norm": 0.42579665780067444, "learning_rate": 4.4347922931284106e-05, "loss": 0.448, "num_input_tokens_seen": 9668416, "step": 7965 }, { "epoch": 0.8876266844860229, "grad_norm": 0.3304736018180847, "learning_rate": 4.4375765675464974e-05, "loss": 0.4901, "num_input_tokens_seen": 9674560, "step": 7970 }, { "epoch": 0.8881835393696402, "grad_norm": 0.38951319456100464, "learning_rate": 4.440360841964584e-05, "loss": 0.4937, "num_input_tokens_seen": 9680544, "step": 7975 }, { "epoch": 0.8887403942532576, "grad_norm": 0.4606209099292755, "learning_rate": 4.443145116382671e-05, "loss": 0.493, "num_input_tokens_seen": 9686880, "step": 7980 }, { "epoch": 0.8892972491368749, "grad_norm": 0.5317670106887817, "learning_rate": 4.445929390800757e-05, "loss": 0.476, "num_input_tokens_seen": 9693024, "step": 7985 }, { "epoch": 0.8898541040204923, "grad_norm": 0.4927161931991577, "learning_rate": 4.448713665218844e-05, "loss": 0.4845, "num_input_tokens_seen": 9699168, "step": 7990 }, { "epoch": 0.8904109589041096, "grad_norm": 0.39240604639053345, "learning_rate": 4.451497939636931e-05, "loss": 0.4981, "num_input_tokens_seen": 9705184, "step": 7995 }, { "epoch": 0.8909678137877269, "grad_norm": 0.3328987658023834, "learning_rate": 4.454282214055017e-05, "loss": 0.4886, "num_input_tokens_seen": 9711200, "step": 8000 }, { "epoch": 0.8915246686713443, "grad_norm": 0.40281742811203003, "learning_rate": 4.457066488473104e-05, "loss": 0.4619, "num_input_tokens_seen": 9716736, "step": 8005 }, { "epoch": 0.8920815235549616, "grad_norm": 0.40739765763282776, "learning_rate": 4.45985076289119e-05, "loss": 0.4746, "num_input_tokens_seen": 9723040, "step": 8010 }, { "epoch": 0.892638378438579, "grad_norm": 0.3197421729564667, "learning_rate": 4.462635037309277e-05, "loss": 0.4656, "num_input_tokens_seen": 9729248, "step": 8015 }, { "epoch": 0.8931952333221962, "grad_norm": 0.40513136982917786, "learning_rate": 4.465419311727364e-05, "loss": 0.4808, "num_input_tokens_seen": 9735296, "step": 8020 }, { "epoch": 0.8937520882058135, "grad_norm": 0.42518696188926697, "learning_rate": 4.468203586145451e-05, "loss": 0.4715, "num_input_tokens_seen": 9741536, "step": 8025 }, { "epoch": 0.8943089430894309, "grad_norm": 0.5251770615577698, "learning_rate": 4.470987860563538e-05, "loss": 0.4845, "num_input_tokens_seen": 9747392, "step": 8030 }, { "epoch": 0.8948657979730482, "grad_norm": 0.3904273211956024, "learning_rate": 4.473772134981624e-05, "loss": 0.4692, "num_input_tokens_seen": 9753952, "step": 8035 }, { "epoch": 0.8954226528566656, "grad_norm": 0.3582710325717926, "learning_rate": 4.476556409399711e-05, "loss": 0.484, "num_input_tokens_seen": 9760160, "step": 8040 }, { "epoch": 0.8959795077402829, "grad_norm": 0.6207363605499268, "learning_rate": 4.479340683817797e-05, "loss": 0.4857, "num_input_tokens_seen": 9766176, "step": 8045 }, { "epoch": 0.8965363626239002, "grad_norm": 0.35419997572898865, "learning_rate": 4.482124958235884e-05, "loss": 0.5156, "num_input_tokens_seen": 9772256, "step": 8050 }, { "epoch": 0.8970932175075176, "grad_norm": 0.38592392206192017, "learning_rate": 4.484909232653971e-05, "loss": 0.464, "num_input_tokens_seen": 9778496, "step": 8055 }, { "epoch": 0.8976500723911349, "grad_norm": 0.4159089922904968, "learning_rate": 4.487693507072057e-05, "loss": 0.481, "num_input_tokens_seen": 9784192, "step": 8060 }, { "epoch": 0.8982069272747522, "grad_norm": 0.30846497416496277, "learning_rate": 4.490477781490144e-05, "loss": 0.4674, "num_input_tokens_seen": 9790304, "step": 8065 }, { "epoch": 0.8987637821583695, "grad_norm": 0.3196124732494354, "learning_rate": 4.493262055908231e-05, "loss": 0.4805, "num_input_tokens_seen": 9796384, "step": 8070 }, { "epoch": 0.8993206370419868, "grad_norm": 0.25984257459640503, "learning_rate": 4.4960463303263175e-05, "loss": 0.4674, "num_input_tokens_seen": 9802528, "step": 8075 }, { "epoch": 0.8998774919256042, "grad_norm": 0.4425402879714966, "learning_rate": 4.498830604744404e-05, "loss": 0.5056, "num_input_tokens_seen": 9809056, "step": 8080 }, { "epoch": 0.9004343468092215, "grad_norm": 0.3311581015586853, "learning_rate": 4.5016148791624906e-05, "loss": 0.4594, "num_input_tokens_seen": 9815616, "step": 8085 }, { "epoch": 0.9009912016928389, "grad_norm": 0.43365156650543213, "learning_rate": 4.504399153580577e-05, "loss": 0.4834, "num_input_tokens_seen": 9821856, "step": 8090 }, { "epoch": 0.9015480565764562, "grad_norm": 0.29459527134895325, "learning_rate": 4.5071834279986636e-05, "loss": 0.4982, "num_input_tokens_seen": 9828576, "step": 8095 }, { "epoch": 0.9021049114600735, "grad_norm": 0.4080735445022583, "learning_rate": 4.5099677024167505e-05, "loss": 0.4697, "num_input_tokens_seen": 9834176, "step": 8100 }, { "epoch": 0.9026617663436909, "grad_norm": 0.2792063057422638, "learning_rate": 4.512751976834837e-05, "loss": 0.463, "num_input_tokens_seen": 9840224, "step": 8105 }, { "epoch": 0.9032186212273081, "grad_norm": 0.4973234236240387, "learning_rate": 4.5155362512529236e-05, "loss": 0.5154, "num_input_tokens_seen": 9846656, "step": 8110 }, { "epoch": 0.9037754761109255, "grad_norm": 0.306742399930954, "learning_rate": 4.5183205256710104e-05, "loss": 0.4642, "num_input_tokens_seen": 9852896, "step": 8115 }, { "epoch": 0.9043323309945428, "grad_norm": 0.3653077185153961, "learning_rate": 4.521104800089097e-05, "loss": 0.4884, "num_input_tokens_seen": 9859104, "step": 8120 }, { "epoch": 0.9048891858781601, "grad_norm": 0.37895745038986206, "learning_rate": 4.5238890745071835e-05, "loss": 0.4738, "num_input_tokens_seen": 9864896, "step": 8125 }, { "epoch": 0.9054460407617775, "grad_norm": 0.5189328193664551, "learning_rate": 4.5266733489252703e-05, "loss": 0.4516, "num_input_tokens_seen": 9870848, "step": 8130 }, { "epoch": 0.9060028956453948, "grad_norm": 0.29220297932624817, "learning_rate": 4.529457623343357e-05, "loss": 0.4799, "num_input_tokens_seen": 9876992, "step": 8135 }, { "epoch": 0.9065597505290122, "grad_norm": 0.44418027997016907, "learning_rate": 4.5322418977614434e-05, "loss": 0.4548, "num_input_tokens_seen": 9883040, "step": 8140 }, { "epoch": 0.9071166054126295, "grad_norm": 0.2408866286277771, "learning_rate": 4.53502617217953e-05, "loss": 0.4816, "num_input_tokens_seen": 9889312, "step": 8145 }, { "epoch": 0.9076734602962468, "grad_norm": 0.3355024456977844, "learning_rate": 4.5378104465976164e-05, "loss": 0.5004, "num_input_tokens_seen": 9895648, "step": 8150 }, { "epoch": 0.9082303151798641, "grad_norm": 0.35569167137145996, "learning_rate": 4.540594721015703e-05, "loss": 0.4741, "num_input_tokens_seen": 9901952, "step": 8155 }, { "epoch": 0.9087871700634814, "grad_norm": 0.3277117609977722, "learning_rate": 4.54337899543379e-05, "loss": 0.4586, "num_input_tokens_seen": 9907904, "step": 8160 }, { "epoch": 0.9093440249470988, "grad_norm": 0.36814936995506287, "learning_rate": 4.546163269851877e-05, "loss": 0.4612, "num_input_tokens_seen": 9913824, "step": 8165 }, { "epoch": 0.9099008798307161, "grad_norm": 0.44100311398506165, "learning_rate": 4.548947544269964e-05, "loss": 0.4652, "num_input_tokens_seen": 9919904, "step": 8170 }, { "epoch": 0.9104577347143334, "grad_norm": 0.38532042503356934, "learning_rate": 4.55173181868805e-05, "loss": 0.4741, "num_input_tokens_seen": 9925824, "step": 8175 }, { "epoch": 0.9110145895979508, "grad_norm": 0.506805956363678, "learning_rate": 4.554516093106137e-05, "loss": 0.4539, "num_input_tokens_seen": 9931808, "step": 8180 }, { "epoch": 0.9115714444815681, "grad_norm": 0.2827002704143524, "learning_rate": 4.557300367524223e-05, "loss": 0.4671, "num_input_tokens_seen": 9937984, "step": 8185 }, { "epoch": 0.9121282993651855, "grad_norm": 0.30180636048316956, "learning_rate": 4.56008464194231e-05, "loss": 0.4559, "num_input_tokens_seen": 9943968, "step": 8190 }, { "epoch": 0.9126851542488028, "grad_norm": 0.27412891387939453, "learning_rate": 4.562868916360397e-05, "loss": 0.4406, "num_input_tokens_seen": 9950048, "step": 8195 }, { "epoch": 0.91324200913242, "grad_norm": 0.4625951945781708, "learning_rate": 4.565653190778483e-05, "loss": 0.4773, "num_input_tokens_seen": 9956064, "step": 8200 }, { "epoch": 0.9137988640160374, "grad_norm": 0.37108245491981506, "learning_rate": 4.56843746519657e-05, "loss": 0.4852, "num_input_tokens_seen": 9962016, "step": 8205 }, { "epoch": 0.9143557188996547, "grad_norm": 0.28477391600608826, "learning_rate": 4.571221739614657e-05, "loss": 0.4882, "num_input_tokens_seen": 9968160, "step": 8210 }, { "epoch": 0.9149125737832721, "grad_norm": 0.5308660268783569, "learning_rate": 4.574006014032744e-05, "loss": 0.4889, "num_input_tokens_seen": 9973952, "step": 8215 }, { "epoch": 0.9154694286668894, "grad_norm": 0.292636513710022, "learning_rate": 4.57679028845083e-05, "loss": 0.476, "num_input_tokens_seen": 9979968, "step": 8220 }, { "epoch": 0.9160262835505067, "grad_norm": 0.4525790810585022, "learning_rate": 4.579574562868917e-05, "loss": 0.4835, "num_input_tokens_seen": 9986112, "step": 8225 }, { "epoch": 0.9165831384341241, "grad_norm": 0.4407338500022888, "learning_rate": 4.582358837287003e-05, "loss": 0.4679, "num_input_tokens_seen": 9991744, "step": 8230 }, { "epoch": 0.9171399933177414, "grad_norm": 0.5110400915145874, "learning_rate": 4.58514311170509e-05, "loss": 0.4864, "num_input_tokens_seen": 9997760, "step": 8235 }, { "epoch": 0.9176968482013588, "grad_norm": 0.3269154727458954, "learning_rate": 4.5879273861231766e-05, "loss": 0.4798, "num_input_tokens_seen": 10004128, "step": 8240 }, { "epoch": 0.918253703084976, "grad_norm": 0.4460153579711914, "learning_rate": 4.590711660541263e-05, "loss": 0.4753, "num_input_tokens_seen": 10009984, "step": 8245 }, { "epoch": 0.9188105579685933, "grad_norm": 0.4041164815425873, "learning_rate": 4.59349593495935e-05, "loss": 0.4767, "num_input_tokens_seen": 10015936, "step": 8250 }, { "epoch": 0.9193674128522107, "grad_norm": 0.32421430945396423, "learning_rate": 4.5962802093774366e-05, "loss": 0.4854, "num_input_tokens_seen": 10022048, "step": 8255 }, { "epoch": 0.919924267735828, "grad_norm": 0.5189085602760315, "learning_rate": 4.5990644837955234e-05, "loss": 0.4664, "num_input_tokens_seen": 10027904, "step": 8260 }, { "epoch": 0.9204811226194454, "grad_norm": 0.2743194103240967, "learning_rate": 4.6018487582136096e-05, "loss": 0.4691, "num_input_tokens_seen": 10033920, "step": 8265 }, { "epoch": 0.9210379775030627, "grad_norm": 0.2935468256473541, "learning_rate": 4.6046330326316965e-05, "loss": 0.4648, "num_input_tokens_seen": 10039744, "step": 8270 }, { "epoch": 0.92159483238668, "grad_norm": 0.4192486107349396, "learning_rate": 4.607417307049783e-05, "loss": 0.4674, "num_input_tokens_seen": 10045824, "step": 8275 }, { "epoch": 0.9221516872702974, "grad_norm": 0.29183679819107056, "learning_rate": 4.6102015814678695e-05, "loss": 0.4906, "num_input_tokens_seen": 10052352, "step": 8280 }, { "epoch": 0.9227085421539147, "grad_norm": 0.4088189899921417, "learning_rate": 4.6129858558859564e-05, "loss": 0.474, "num_input_tokens_seen": 10058464, "step": 8285 }, { "epoch": 0.923265397037532, "grad_norm": 0.33736324310302734, "learning_rate": 4.6157701303040426e-05, "loss": 0.4644, "num_input_tokens_seen": 10064672, "step": 8290 }, { "epoch": 0.9238222519211493, "grad_norm": 0.4424191415309906, "learning_rate": 4.6185544047221295e-05, "loss": 0.4916, "num_input_tokens_seen": 10070560, "step": 8295 }, { "epoch": 0.9243791068047666, "grad_norm": 0.37229910492897034, "learning_rate": 4.621338679140216e-05, "loss": 0.4846, "num_input_tokens_seen": 10076448, "step": 8300 }, { "epoch": 0.924935961688384, "grad_norm": 0.3006739318370819, "learning_rate": 4.624122953558303e-05, "loss": 0.4742, "num_input_tokens_seen": 10082656, "step": 8305 }, { "epoch": 0.9254928165720013, "grad_norm": 0.3059521019458771, "learning_rate": 4.62690722797639e-05, "loss": 0.4881, "num_input_tokens_seen": 10088640, "step": 8310 }, { "epoch": 0.9260496714556187, "grad_norm": 0.523389995098114, "learning_rate": 4.629691502394476e-05, "loss": 0.4855, "num_input_tokens_seen": 10094944, "step": 8315 }, { "epoch": 0.926606526339236, "grad_norm": 0.3795607388019562, "learning_rate": 4.6324757768125624e-05, "loss": 0.4737, "num_input_tokens_seen": 10101216, "step": 8320 }, { "epoch": 0.9271633812228534, "grad_norm": 0.297667920589447, "learning_rate": 4.635260051230649e-05, "loss": 0.4731, "num_input_tokens_seen": 10107552, "step": 8325 }, { "epoch": 0.9277202361064707, "grad_norm": 0.37083691358566284, "learning_rate": 4.638044325648736e-05, "loss": 0.4535, "num_input_tokens_seen": 10113344, "step": 8330 }, { "epoch": 0.928277090990088, "grad_norm": 0.3000487685203552, "learning_rate": 4.640828600066823e-05, "loss": 0.4803, "num_input_tokens_seen": 10119712, "step": 8335 }, { "epoch": 0.9288339458737053, "grad_norm": 0.4869742691516876, "learning_rate": 4.643612874484909e-05, "loss": 0.508, "num_input_tokens_seen": 10125568, "step": 8340 }, { "epoch": 0.9293908007573226, "grad_norm": 0.3354882299900055, "learning_rate": 4.646397148902996e-05, "loss": 0.4856, "num_input_tokens_seen": 10131616, "step": 8345 }, { "epoch": 0.92994765564094, "grad_norm": 0.3466295301914215, "learning_rate": 4.649181423321083e-05, "loss": 0.4357, "num_input_tokens_seen": 10137632, "step": 8350 }, { "epoch": 0.9305045105245573, "grad_norm": 0.36191046237945557, "learning_rate": 4.65196569773917e-05, "loss": 0.4585, "num_input_tokens_seen": 10143584, "step": 8355 }, { "epoch": 0.9310613654081746, "grad_norm": 0.3332503139972687, "learning_rate": 4.654749972157256e-05, "loss": 0.4696, "num_input_tokens_seen": 10149632, "step": 8360 }, { "epoch": 0.931618220291792, "grad_norm": 0.3600594997406006, "learning_rate": 4.657534246575342e-05, "loss": 0.498, "num_input_tokens_seen": 10155680, "step": 8365 }, { "epoch": 0.9321750751754093, "grad_norm": 0.30136701464653015, "learning_rate": 4.660318520993429e-05, "loss": 0.4584, "num_input_tokens_seen": 10161696, "step": 8370 }, { "epoch": 0.9327319300590267, "grad_norm": 0.3294644355773926, "learning_rate": 4.663102795411516e-05, "loss": 0.5048, "num_input_tokens_seen": 10167776, "step": 8375 }, { "epoch": 0.933288784942644, "grad_norm": 0.38459062576293945, "learning_rate": 4.665887069829603e-05, "loss": 0.4952, "num_input_tokens_seen": 10173408, "step": 8380 }, { "epoch": 0.9338456398262612, "grad_norm": 0.4818444848060608, "learning_rate": 4.668671344247689e-05, "loss": 0.4984, "num_input_tokens_seen": 10179424, "step": 8385 }, { "epoch": 0.9344024947098786, "grad_norm": 0.33298611640930176, "learning_rate": 4.671455618665776e-05, "loss": 0.4866, "num_input_tokens_seen": 10185760, "step": 8390 }, { "epoch": 0.9349593495934959, "grad_norm": 0.3988354504108429, "learning_rate": 4.674239893083863e-05, "loss": 0.4788, "num_input_tokens_seen": 10191872, "step": 8395 }, { "epoch": 0.9355162044771133, "grad_norm": 0.2669338583946228, "learning_rate": 4.6770241675019496e-05, "loss": 0.4628, "num_input_tokens_seen": 10197664, "step": 8400 }, { "epoch": 0.9360730593607306, "grad_norm": 0.37269794940948486, "learning_rate": 4.679808441920036e-05, "loss": 0.4486, "num_input_tokens_seen": 10203776, "step": 8405 }, { "epoch": 0.9366299142443479, "grad_norm": 0.27370795607566833, "learning_rate": 4.682592716338122e-05, "loss": 0.468, "num_input_tokens_seen": 10209952, "step": 8410 }, { "epoch": 0.9371867691279653, "grad_norm": 0.2734769582748413, "learning_rate": 4.685376990756209e-05, "loss": 0.447, "num_input_tokens_seen": 10215744, "step": 8415 }, { "epoch": 0.9377436240115826, "grad_norm": 0.4098392128944397, "learning_rate": 4.688161265174296e-05, "loss": 0.4747, "num_input_tokens_seen": 10221440, "step": 8420 }, { "epoch": 0.9383004788952, "grad_norm": 0.3690304756164551, "learning_rate": 4.6909455395923825e-05, "loss": 0.4786, "num_input_tokens_seen": 10227552, "step": 8425 }, { "epoch": 0.9388573337788172, "grad_norm": 0.3220333158969879, "learning_rate": 4.6937298140104694e-05, "loss": 0.4773, "num_input_tokens_seen": 10233024, "step": 8430 }, { "epoch": 0.9394141886624345, "grad_norm": 0.3937741816043854, "learning_rate": 4.6965140884285556e-05, "loss": 0.486, "num_input_tokens_seen": 10239456, "step": 8435 }, { "epoch": 0.9399710435460519, "grad_norm": 0.4262223243713379, "learning_rate": 4.6992983628466425e-05, "loss": 0.4836, "num_input_tokens_seen": 10245280, "step": 8440 }, { "epoch": 0.9405278984296692, "grad_norm": 0.3324336111545563, "learning_rate": 4.702082637264729e-05, "loss": 0.4675, "num_input_tokens_seen": 10251616, "step": 8445 }, { "epoch": 0.9410847533132866, "grad_norm": 0.2887587249279022, "learning_rate": 4.704866911682816e-05, "loss": 0.4964, "num_input_tokens_seen": 10257792, "step": 8450 }, { "epoch": 0.9416416081969039, "grad_norm": 0.36837664246559143, "learning_rate": 4.7076511861009024e-05, "loss": 0.4885, "num_input_tokens_seen": 10263904, "step": 8455 }, { "epoch": 0.9421984630805212, "grad_norm": 0.3547905683517456, "learning_rate": 4.7104354605189886e-05, "loss": 0.4801, "num_input_tokens_seen": 10270272, "step": 8460 }, { "epoch": 0.9427553179641386, "grad_norm": 0.4474828541278839, "learning_rate": 4.7132197349370754e-05, "loss": 0.4664, "num_input_tokens_seen": 10276512, "step": 8465 }, { "epoch": 0.9433121728477559, "grad_norm": 0.38909712433815, "learning_rate": 4.716004009355162e-05, "loss": 0.4587, "num_input_tokens_seen": 10282496, "step": 8470 }, { "epoch": 0.9438690277313732, "grad_norm": 0.3460066020488739, "learning_rate": 4.718788283773249e-05, "loss": 0.4659, "num_input_tokens_seen": 10287872, "step": 8475 }, { "epoch": 0.9444258826149905, "grad_norm": 0.29385727643966675, "learning_rate": 4.7215725581913354e-05, "loss": 0.4975, "num_input_tokens_seen": 10293952, "step": 8480 }, { "epoch": 0.9449827374986078, "grad_norm": 0.30064836144447327, "learning_rate": 4.724356832609422e-05, "loss": 0.4538, "num_input_tokens_seen": 10300128, "step": 8485 }, { "epoch": 0.9455395923822252, "grad_norm": 0.3616045415401459, "learning_rate": 4.727141107027509e-05, "loss": 0.4794, "num_input_tokens_seen": 10306048, "step": 8490 }, { "epoch": 0.9460964472658425, "grad_norm": 0.45400533080101013, "learning_rate": 4.729925381445596e-05, "loss": 0.4621, "num_input_tokens_seen": 10312544, "step": 8495 }, { "epoch": 0.9466533021494599, "grad_norm": 0.31887325644493103, "learning_rate": 4.732709655863682e-05, "loss": 0.4775, "num_input_tokens_seen": 10318752, "step": 8500 }, { "epoch": 0.9472101570330772, "grad_norm": 0.36146512627601624, "learning_rate": 4.735493930281768e-05, "loss": 0.4594, "num_input_tokens_seen": 10324672, "step": 8505 }, { "epoch": 0.9477670119166945, "grad_norm": 0.4020194113254547, "learning_rate": 4.738278204699855e-05, "loss": 0.4672, "num_input_tokens_seen": 10330880, "step": 8510 }, { "epoch": 0.9483238668003119, "grad_norm": 0.40021488070487976, "learning_rate": 4.741062479117942e-05, "loss": 0.4675, "num_input_tokens_seen": 10336832, "step": 8515 }, { "epoch": 0.9488807216839291, "grad_norm": 0.3072737753391266, "learning_rate": 4.743846753536029e-05, "loss": 0.4723, "num_input_tokens_seen": 10343040, "step": 8520 }, { "epoch": 0.9494375765675465, "grad_norm": 0.38329067826271057, "learning_rate": 4.746631027954115e-05, "loss": 0.4821, "num_input_tokens_seen": 10349504, "step": 8525 }, { "epoch": 0.9499944314511638, "grad_norm": 0.5226035714149475, "learning_rate": 4.749415302372202e-05, "loss": 0.488, "num_input_tokens_seen": 10355808, "step": 8530 }, { "epoch": 0.9505512863347811, "grad_norm": 0.3098219335079193, "learning_rate": 4.752199576790289e-05, "loss": 0.4711, "num_input_tokens_seen": 10361920, "step": 8535 }, { "epoch": 0.9511081412183985, "grad_norm": 0.3630794286727905, "learning_rate": 4.754983851208376e-05, "loss": 0.4862, "num_input_tokens_seen": 10368448, "step": 8540 }, { "epoch": 0.9516649961020158, "grad_norm": 0.5315576195716858, "learning_rate": 4.757768125626462e-05, "loss": 0.4833, "num_input_tokens_seen": 10374496, "step": 8545 }, { "epoch": 0.9522218509856332, "grad_norm": 0.33885735273361206, "learning_rate": 4.760552400044548e-05, "loss": 0.4583, "num_input_tokens_seen": 10380256, "step": 8550 }, { "epoch": 0.9527787058692505, "grad_norm": 0.4192189872264862, "learning_rate": 4.763336674462635e-05, "loss": 0.4876, "num_input_tokens_seen": 10386720, "step": 8555 }, { "epoch": 0.9533355607528678, "grad_norm": 0.3726590871810913, "learning_rate": 4.766120948880722e-05, "loss": 0.479, "num_input_tokens_seen": 10393184, "step": 8560 }, { "epoch": 0.9538924156364851, "grad_norm": 0.21768757700920105, "learning_rate": 4.768905223298809e-05, "loss": 0.4685, "num_input_tokens_seen": 10399520, "step": 8565 }, { "epoch": 0.9544492705201024, "grad_norm": 0.25716516375541687, "learning_rate": 4.7716894977168955e-05, "loss": 0.47, "num_input_tokens_seen": 10405504, "step": 8570 }, { "epoch": 0.9550061254037198, "grad_norm": 0.3283163607120514, "learning_rate": 4.774473772134982e-05, "loss": 0.4362, "num_input_tokens_seen": 10411648, "step": 8575 }, { "epoch": 0.9555629802873371, "grad_norm": 0.3283238112926483, "learning_rate": 4.7772580465530686e-05, "loss": 0.4697, "num_input_tokens_seen": 10417824, "step": 8580 }, { "epoch": 0.9561198351709544, "grad_norm": 0.3515586853027344, "learning_rate": 4.7800423209711555e-05, "loss": 0.4732, "num_input_tokens_seen": 10424160, "step": 8585 }, { "epoch": 0.9566766900545718, "grad_norm": 0.37270209193229675, "learning_rate": 4.782826595389242e-05, "loss": 0.4914, "num_input_tokens_seen": 10430144, "step": 8590 }, { "epoch": 0.9572335449381891, "grad_norm": 0.39823320508003235, "learning_rate": 4.7856108698073285e-05, "loss": 0.4783, "num_input_tokens_seen": 10435936, "step": 8595 }, { "epoch": 0.9577903998218065, "grad_norm": 0.35138067603111267, "learning_rate": 4.788395144225415e-05, "loss": 0.4777, "num_input_tokens_seen": 10442016, "step": 8600 }, { "epoch": 0.9583472547054238, "grad_norm": 0.34959638118743896, "learning_rate": 4.7911794186435016e-05, "loss": 0.4734, "num_input_tokens_seen": 10448192, "step": 8605 }, { "epoch": 0.958904109589041, "grad_norm": 0.3781542479991913, "learning_rate": 4.7939636930615884e-05, "loss": 0.4528, "num_input_tokens_seen": 10454560, "step": 8610 }, { "epoch": 0.9594609644726584, "grad_norm": 0.39757242798805237, "learning_rate": 4.796747967479675e-05, "loss": 0.4799, "num_input_tokens_seen": 10460672, "step": 8615 }, { "epoch": 0.9600178193562757, "grad_norm": 0.3697200119495392, "learning_rate": 4.7995322418977615e-05, "loss": 0.4765, "num_input_tokens_seen": 10466592, "step": 8620 }, { "epoch": 0.9605746742398931, "grad_norm": 0.40894100069999695, "learning_rate": 4.8023165163158484e-05, "loss": 0.4673, "num_input_tokens_seen": 10472672, "step": 8625 }, { "epoch": 0.9611315291235104, "grad_norm": 0.305869996547699, "learning_rate": 4.805100790733935e-05, "loss": 0.4424, "num_input_tokens_seen": 10478752, "step": 8630 }, { "epoch": 0.9616883840071278, "grad_norm": 0.4119090735912323, "learning_rate": 4.807885065152022e-05, "loss": 0.4693, "num_input_tokens_seen": 10484480, "step": 8635 }, { "epoch": 0.9622452388907451, "grad_norm": 0.359260618686676, "learning_rate": 4.810669339570108e-05, "loss": 0.4827, "num_input_tokens_seen": 10490560, "step": 8640 }, { "epoch": 0.9628020937743624, "grad_norm": 0.4187556207180023, "learning_rate": 4.8134536139881945e-05, "loss": 0.4757, "num_input_tokens_seen": 10496608, "step": 8645 }, { "epoch": 0.9633589486579798, "grad_norm": 0.4153227210044861, "learning_rate": 4.816237888406281e-05, "loss": 0.487, "num_input_tokens_seen": 10503200, "step": 8650 }, { "epoch": 0.963915803541597, "grad_norm": 0.31132280826568604, "learning_rate": 4.819022162824368e-05, "loss": 0.4828, "num_input_tokens_seen": 10509120, "step": 8655 }, { "epoch": 0.9644726584252143, "grad_norm": 0.34857746958732605, "learning_rate": 4.821806437242455e-05, "loss": 0.4479, "num_input_tokens_seen": 10514816, "step": 8660 }, { "epoch": 0.9650295133088317, "grad_norm": 0.37864789366722107, "learning_rate": 4.824590711660541e-05, "loss": 0.4911, "num_input_tokens_seen": 10520896, "step": 8665 }, { "epoch": 0.965586368192449, "grad_norm": 0.2990054786205292, "learning_rate": 4.827374986078628e-05, "loss": 0.4794, "num_input_tokens_seen": 10527008, "step": 8670 }, { "epoch": 0.9661432230760664, "grad_norm": 0.33057090640068054, "learning_rate": 4.830159260496715e-05, "loss": 0.479, "num_input_tokens_seen": 10533536, "step": 8675 }, { "epoch": 0.9667000779596837, "grad_norm": 0.3776763081550598, "learning_rate": 4.832943534914802e-05, "loss": 0.4802, "num_input_tokens_seen": 10539872, "step": 8680 }, { "epoch": 0.967256932843301, "grad_norm": 0.34583866596221924, "learning_rate": 4.835727809332888e-05, "loss": 0.4761, "num_input_tokens_seen": 10546240, "step": 8685 }, { "epoch": 0.9678137877269184, "grad_norm": 0.4488617777824402, "learning_rate": 4.838512083750974e-05, "loss": 0.4851, "num_input_tokens_seen": 10551904, "step": 8690 }, { "epoch": 0.9683706426105357, "grad_norm": 0.31481122970581055, "learning_rate": 4.841296358169061e-05, "loss": 0.4673, "num_input_tokens_seen": 10558080, "step": 8695 }, { "epoch": 0.9689274974941531, "grad_norm": 0.2985800504684448, "learning_rate": 4.844080632587148e-05, "loss": 0.4784, "num_input_tokens_seen": 10564128, "step": 8700 }, { "epoch": 0.9694843523777703, "grad_norm": 0.3660793602466583, "learning_rate": 4.846864907005235e-05, "loss": 0.4826, "num_input_tokens_seen": 10570368, "step": 8705 }, { "epoch": 0.9700412072613877, "grad_norm": 0.2390000969171524, "learning_rate": 4.849649181423322e-05, "loss": 0.4726, "num_input_tokens_seen": 10575776, "step": 8710 }, { "epoch": 0.970598062145005, "grad_norm": 0.5795178413391113, "learning_rate": 4.852433455841408e-05, "loss": 0.4753, "num_input_tokens_seen": 10582048, "step": 8715 }, { "epoch": 0.9711549170286223, "grad_norm": 0.44147151708602905, "learning_rate": 4.855217730259495e-05, "loss": 0.4607, "num_input_tokens_seen": 10587808, "step": 8720 }, { "epoch": 0.9717117719122397, "grad_norm": 0.3354390263557434, "learning_rate": 4.8580020046775816e-05, "loss": 0.4664, "num_input_tokens_seen": 10593280, "step": 8725 }, { "epoch": 0.972268626795857, "grad_norm": 0.25598353147506714, "learning_rate": 4.860786279095668e-05, "loss": 0.4608, "num_input_tokens_seen": 10599392, "step": 8730 }, { "epoch": 0.9728254816794744, "grad_norm": 0.36420726776123047, "learning_rate": 4.863570553513755e-05, "loss": 0.4615, "num_input_tokens_seen": 10605568, "step": 8735 }, { "epoch": 0.9733823365630917, "grad_norm": 0.23857074975967407, "learning_rate": 4.866354827931841e-05, "loss": 0.4633, "num_input_tokens_seen": 10611552, "step": 8740 }, { "epoch": 0.973939191446709, "grad_norm": 0.5359347462654114, "learning_rate": 4.869139102349928e-05, "loss": 0.5053, "num_input_tokens_seen": 10617312, "step": 8745 }, { "epoch": 0.9744960463303263, "grad_norm": 0.35017839074134827, "learning_rate": 4.8719233767680146e-05, "loss": 0.4728, "num_input_tokens_seen": 10622912, "step": 8750 }, { "epoch": 0.9750529012139436, "grad_norm": 0.4030253291130066, "learning_rate": 4.8747076511861014e-05, "loss": 0.4589, "num_input_tokens_seen": 10628992, "step": 8755 }, { "epoch": 0.975609756097561, "grad_norm": 0.2870570123195648, "learning_rate": 4.8774919256041876e-05, "loss": 0.4733, "num_input_tokens_seen": 10635168, "step": 8760 }, { "epoch": 0.9761666109811783, "grad_norm": 0.33861038088798523, "learning_rate": 4.8802762000222745e-05, "loss": 0.4478, "num_input_tokens_seen": 10640864, "step": 8765 }, { "epoch": 0.9767234658647956, "grad_norm": 0.37329450249671936, "learning_rate": 4.8830604744403614e-05, "loss": 0.4624, "num_input_tokens_seen": 10647104, "step": 8770 }, { "epoch": 0.977280320748413, "grad_norm": 0.3113998770713806, "learning_rate": 4.8858447488584476e-05, "loss": 0.456, "num_input_tokens_seen": 10652608, "step": 8775 }, { "epoch": 0.9778371756320303, "grad_norm": 0.30198583006858826, "learning_rate": 4.8886290232765344e-05, "loss": 0.4579, "num_input_tokens_seen": 10658368, "step": 8780 }, { "epoch": 0.9783940305156477, "grad_norm": 0.4060273766517639, "learning_rate": 4.8914132976946206e-05, "loss": 0.4842, "num_input_tokens_seen": 10664064, "step": 8785 }, { "epoch": 0.978950885399265, "grad_norm": 0.375532329082489, "learning_rate": 4.8941975721127075e-05, "loss": 0.466, "num_input_tokens_seen": 10670048, "step": 8790 }, { "epoch": 0.9795077402828822, "grad_norm": 0.271136999130249, "learning_rate": 4.8969818465307943e-05, "loss": 0.4733, "num_input_tokens_seen": 10676320, "step": 8795 }, { "epoch": 0.9800645951664996, "grad_norm": 0.28003016114234924, "learning_rate": 4.899766120948881e-05, "loss": 0.4626, "num_input_tokens_seen": 10682304, "step": 8800 }, { "epoch": 0.9806214500501169, "grad_norm": 0.3140426278114319, "learning_rate": 4.9025503953669674e-05, "loss": 0.4831, "num_input_tokens_seen": 10688224, "step": 8805 }, { "epoch": 0.9811783049337343, "grad_norm": 0.42479026317596436, "learning_rate": 4.905334669785054e-05, "loss": 0.4739, "num_input_tokens_seen": 10693568, "step": 8810 }, { "epoch": 0.9817351598173516, "grad_norm": 0.41885626316070557, "learning_rate": 4.908118944203141e-05, "loss": 0.4622, "num_input_tokens_seen": 10699360, "step": 8815 }, { "epoch": 0.9822920147009689, "grad_norm": 0.36485084891319275, "learning_rate": 4.910903218621228e-05, "loss": 0.4724, "num_input_tokens_seen": 10705376, "step": 8820 }, { "epoch": 0.9828488695845863, "grad_norm": 0.3479929268360138, "learning_rate": 4.913687493039314e-05, "loss": 0.5098, "num_input_tokens_seen": 10711744, "step": 8825 }, { "epoch": 0.9834057244682036, "grad_norm": 0.2614976763725281, "learning_rate": 4.9164717674574004e-05, "loss": 0.4681, "num_input_tokens_seen": 10718048, "step": 8830 }, { "epoch": 0.983962579351821, "grad_norm": 0.2906840741634369, "learning_rate": 4.919256041875487e-05, "loss": 0.4666, "num_input_tokens_seen": 10724352, "step": 8835 }, { "epoch": 0.9845194342354382, "grad_norm": 0.28630736470222473, "learning_rate": 4.922040316293574e-05, "loss": 0.4959, "num_input_tokens_seen": 10730560, "step": 8840 }, { "epoch": 0.9850762891190555, "grad_norm": 0.2979012727737427, "learning_rate": 4.924824590711661e-05, "loss": 0.4604, "num_input_tokens_seen": 10736768, "step": 8845 }, { "epoch": 0.9856331440026729, "grad_norm": 0.17131076753139496, "learning_rate": 4.927608865129748e-05, "loss": 0.4767, "num_input_tokens_seen": 10742688, "step": 8850 }, { "epoch": 0.9861899988862902, "grad_norm": 0.3228204846382141, "learning_rate": 4.930393139547834e-05, "loss": 0.4874, "num_input_tokens_seen": 10748640, "step": 8855 }, { "epoch": 0.9867468537699076, "grad_norm": 0.36365410685539246, "learning_rate": 4.933177413965921e-05, "loss": 0.4608, "num_input_tokens_seen": 10754624, "step": 8860 }, { "epoch": 0.9873037086535249, "grad_norm": 0.2904936373233795, "learning_rate": 4.935961688384008e-05, "loss": 0.4562, "num_input_tokens_seen": 10760864, "step": 8865 }, { "epoch": 0.9878605635371422, "grad_norm": 0.3728382885456085, "learning_rate": 4.938745962802094e-05, "loss": 0.4672, "num_input_tokens_seen": 10766752, "step": 8870 }, { "epoch": 0.9884174184207596, "grad_norm": 0.29874491691589355, "learning_rate": 4.941530237220181e-05, "loss": 0.4707, "num_input_tokens_seen": 10772800, "step": 8875 }, { "epoch": 0.9889742733043769, "grad_norm": 0.3868101239204407, "learning_rate": 4.944314511638267e-05, "loss": 0.4542, "num_input_tokens_seen": 10778720, "step": 8880 }, { "epoch": 0.9895311281879942, "grad_norm": 0.2586563229560852, "learning_rate": 4.947098786056354e-05, "loss": 0.4707, "num_input_tokens_seen": 10784992, "step": 8885 }, { "epoch": 0.9900879830716115, "grad_norm": 0.4349076747894287, "learning_rate": 4.949883060474441e-05, "loss": 0.4656, "num_input_tokens_seen": 10791104, "step": 8890 }, { "epoch": 0.9906448379552288, "grad_norm": 0.2554020583629608, "learning_rate": 4.9526673348925276e-05, "loss": 0.4786, "num_input_tokens_seen": 10797152, "step": 8895 }, { "epoch": 0.9912016928388462, "grad_norm": 0.2866586148738861, "learning_rate": 4.955451609310614e-05, "loss": 0.4832, "num_input_tokens_seen": 10803648, "step": 8900 }, { "epoch": 0.9917585477224635, "grad_norm": 0.32403555512428284, "learning_rate": 4.9582358837287006e-05, "loss": 0.4613, "num_input_tokens_seen": 10809664, "step": 8905 }, { "epoch": 0.9923154026060809, "grad_norm": 0.3068290054798126, "learning_rate": 4.9610201581467875e-05, "loss": 0.4724, "num_input_tokens_seen": 10815264, "step": 8910 }, { "epoch": 0.9928722574896982, "grad_norm": 0.2097199559211731, "learning_rate": 4.963804432564874e-05, "loss": 0.4498, "num_input_tokens_seen": 10821248, "step": 8915 }, { "epoch": 0.9934291123733155, "grad_norm": 0.28822624683380127, "learning_rate": 4.9665887069829606e-05, "loss": 0.4685, "num_input_tokens_seen": 10827456, "step": 8920 }, { "epoch": 0.9939859672569329, "grad_norm": 0.31065669655799866, "learning_rate": 4.969372981401047e-05, "loss": 0.4735, "num_input_tokens_seen": 10833504, "step": 8925 }, { "epoch": 0.9945428221405501, "grad_norm": 0.36438000202178955, "learning_rate": 4.9721572558191336e-05, "loss": 0.4939, "num_input_tokens_seen": 10839680, "step": 8930 }, { "epoch": 0.9950996770241675, "grad_norm": 0.40135282278060913, "learning_rate": 4.9749415302372205e-05, "loss": 0.4767, "num_input_tokens_seen": 10846144, "step": 8935 }, { "epoch": 0.9956565319077848, "grad_norm": 0.34386616945266724, "learning_rate": 4.9777258046553073e-05, "loss": 0.4618, "num_input_tokens_seen": 10852160, "step": 8940 }, { "epoch": 0.9962133867914021, "grad_norm": 0.39918041229248047, "learning_rate": 4.9805100790733935e-05, "loss": 0.4735, "num_input_tokens_seen": 10858240, "step": 8945 }, { "epoch": 0.9967702416750195, "grad_norm": 0.3918875455856323, "learning_rate": 4.9832943534914804e-05, "loss": 0.4869, "num_input_tokens_seen": 10864480, "step": 8950 }, { "epoch": 0.9973270965586368, "grad_norm": 0.3288344144821167, "learning_rate": 4.986078627909567e-05, "loss": 0.4742, "num_input_tokens_seen": 10870176, "step": 8955 }, { "epoch": 0.9978839514422542, "grad_norm": 0.3808996081352234, "learning_rate": 4.9888629023276535e-05, "loss": 0.4873, "num_input_tokens_seen": 10876256, "step": 8960 }, { "epoch": 0.9984408063258715, "grad_norm": 0.31026536226272583, "learning_rate": 4.99164717674574e-05, "loss": 0.4679, "num_input_tokens_seen": 10882336, "step": 8965 }, { "epoch": 0.9989976612094889, "grad_norm": 0.26746368408203125, "learning_rate": 4.9944314511638265e-05, "loss": 0.4582, "num_input_tokens_seen": 10888352, "step": 8970 }, { "epoch": 0.9995545160931061, "grad_norm": 0.3111901581287384, "learning_rate": 4.9972157255819134e-05, "loss": 0.484, "num_input_tokens_seen": 10894368, "step": 8975 }, { "epoch": 1.0001113709767235, "grad_norm": 0.3461911380290985, "learning_rate": 5e-05, "loss": 0.4862, "num_input_tokens_seen": 10899840, "step": 8980 }, { "epoch": 1.0001113709767235, "eval_loss": 0.47376638650894165, "eval_runtime": 113.15, "eval_samples_per_second": 35.272, "eval_steps_per_second": 8.82, "num_input_tokens_seen": 10899840, "step": 8980 }, { "epoch": 1.0006682258603408, "grad_norm": 0.2939804196357727, "learning_rate": 4.999999952770995e-05, "loss": 0.4638, "num_input_tokens_seen": 10905952, "step": 8985 }, { "epoch": 1.0012250807439582, "grad_norm": 0.27038639783859253, "learning_rate": 4.999999811083979e-05, "loss": 0.4582, "num_input_tokens_seen": 10912224, "step": 8990 }, { "epoch": 1.0017819356275754, "grad_norm": 0.3323693871498108, "learning_rate": 4.9999995749389586e-05, "loss": 0.4891, "num_input_tokens_seen": 10918240, "step": 8995 }, { "epoch": 1.0023387905111927, "grad_norm": 0.3535955250263214, "learning_rate": 4.999999244335943e-05, "loss": 0.4695, "num_input_tokens_seen": 10924256, "step": 9000 }, { "epoch": 1.0028956453948101, "grad_norm": 0.44429051876068115, "learning_rate": 4.999998819274944e-05, "loss": 0.5202, "num_input_tokens_seen": 10929888, "step": 9005 }, { "epoch": 1.0034525002784274, "grad_norm": 0.2537460923194885, "learning_rate": 4.999998299755978e-05, "loss": 0.4847, "num_input_tokens_seen": 10935840, "step": 9010 }, { "epoch": 1.0040093551620448, "grad_norm": 0.3708558678627014, "learning_rate": 4.999997685779065e-05, "loss": 0.4848, "num_input_tokens_seen": 10941952, "step": 9015 }, { "epoch": 1.004566210045662, "grad_norm": 0.245637908577919, "learning_rate": 4.9999969773442275e-05, "loss": 0.4555, "num_input_tokens_seen": 10948032, "step": 9020 }, { "epoch": 1.0051230649292795, "grad_norm": 0.2543051242828369, "learning_rate": 4.9999961744514926e-05, "loss": 0.4883, "num_input_tokens_seen": 10954176, "step": 9025 }, { "epoch": 1.0056799198128967, "grad_norm": 0.35651835799217224, "learning_rate": 4.9999952771008905e-05, "loss": 0.4765, "num_input_tokens_seen": 10960192, "step": 9030 }, { "epoch": 1.0062367746965142, "grad_norm": 0.3647967278957367, "learning_rate": 4.999994285292455e-05, "loss": 0.4728, "num_input_tokens_seen": 10966528, "step": 9035 }, { "epoch": 1.0067936295801314, "grad_norm": 0.2793629765510559, "learning_rate": 4.999993199026224e-05, "loss": 0.4648, "num_input_tokens_seen": 10972736, "step": 9040 }, { "epoch": 1.0073504844637486, "grad_norm": 0.21957945823669434, "learning_rate": 4.999992018302238e-05, "loss": 0.453, "num_input_tokens_seen": 10978880, "step": 9045 }, { "epoch": 1.007907339347366, "grad_norm": 0.2977792024612427, "learning_rate": 4.999990743120543e-05, "loss": 0.4706, "num_input_tokens_seen": 10984928, "step": 9050 }, { "epoch": 1.0084641942309833, "grad_norm": 0.26030585169792175, "learning_rate": 4.999989373481185e-05, "loss": 0.48, "num_input_tokens_seen": 10990656, "step": 9055 }, { "epoch": 1.0090210491146008, "grad_norm": 0.303546279668808, "learning_rate": 4.9999879093842175e-05, "loss": 0.4753, "num_input_tokens_seen": 10996992, "step": 9060 }, { "epoch": 1.009577903998218, "grad_norm": 0.2822627127170563, "learning_rate": 4.999986350829695e-05, "loss": 0.4791, "num_input_tokens_seen": 11003200, "step": 9065 }, { "epoch": 1.0101347588818355, "grad_norm": 0.340243935585022, "learning_rate": 4.999984697817676e-05, "loss": 0.4642, "num_input_tokens_seen": 11009184, "step": 9070 }, { "epoch": 1.0106916137654527, "grad_norm": 0.3188554644584656, "learning_rate": 4.999982950348224e-05, "loss": 0.471, "num_input_tokens_seen": 11015360, "step": 9075 }, { "epoch": 1.0112484686490701, "grad_norm": 0.2749904692173004, "learning_rate": 4.999981108421405e-05, "loss": 0.4745, "num_input_tokens_seen": 11021856, "step": 9080 }, { "epoch": 1.0118053235326874, "grad_norm": 0.3788217604160309, "learning_rate": 4.999979172037288e-05, "loss": 0.463, "num_input_tokens_seen": 11027840, "step": 9085 }, { "epoch": 1.0123621784163046, "grad_norm": 0.3121677339076996, "learning_rate": 4.999977141195946e-05, "loss": 0.4697, "num_input_tokens_seen": 11033920, "step": 9090 }, { "epoch": 1.012919033299922, "grad_norm": 0.24127160012722015, "learning_rate": 4.999975015897456e-05, "loss": 0.4768, "num_input_tokens_seen": 11040000, "step": 9095 }, { "epoch": 1.0134758881835393, "grad_norm": 0.3209507465362549, "learning_rate": 4.999972796141898e-05, "loss": 0.4768, "num_input_tokens_seen": 11046048, "step": 9100 }, { "epoch": 1.0140327430671567, "grad_norm": 0.2572680711746216, "learning_rate": 4.9999704819293566e-05, "loss": 0.4693, "num_input_tokens_seen": 11051808, "step": 9105 }, { "epoch": 1.014589597950774, "grad_norm": 0.34132617712020874, "learning_rate": 4.999968073259918e-05, "loss": 0.4691, "num_input_tokens_seen": 11057536, "step": 9110 }, { "epoch": 1.0151464528343914, "grad_norm": 0.29722222685813904, "learning_rate": 4.999965570133675e-05, "loss": 0.4729, "num_input_tokens_seen": 11064000, "step": 9115 }, { "epoch": 1.0157033077180087, "grad_norm": 0.3031977117061615, "learning_rate": 4.999962972550722e-05, "loss": 0.4816, "num_input_tokens_seen": 11069888, "step": 9120 }, { "epoch": 1.016260162601626, "grad_norm": 0.3445582389831543, "learning_rate": 4.999960280511156e-05, "loss": 0.486, "num_input_tokens_seen": 11075648, "step": 9125 }, { "epoch": 1.0168170174852433, "grad_norm": 0.3950665593147278, "learning_rate": 4.9999574940150786e-05, "loss": 0.4599, "num_input_tokens_seen": 11081472, "step": 9130 }, { "epoch": 1.0173738723688606, "grad_norm": 0.3260103762149811, "learning_rate": 4.999954613062595e-05, "loss": 0.4673, "num_input_tokens_seen": 11087808, "step": 9135 }, { "epoch": 1.017930727252478, "grad_norm": 0.3353246748447418, "learning_rate": 4.9999516376538154e-05, "loss": 0.4727, "num_input_tokens_seen": 11093984, "step": 9140 }, { "epoch": 1.0184875821360952, "grad_norm": 0.3113926649093628, "learning_rate": 4.999948567788851e-05, "loss": 0.4515, "num_input_tokens_seen": 11100160, "step": 9145 }, { "epoch": 1.0190444370197127, "grad_norm": 0.3684960901737213, "learning_rate": 4.999945403467818e-05, "loss": 0.4748, "num_input_tokens_seen": 11106304, "step": 9150 }, { "epoch": 1.01960129190333, "grad_norm": 0.4804513454437256, "learning_rate": 4.9999421446908364e-05, "loss": 0.4938, "num_input_tokens_seen": 11112096, "step": 9155 }, { "epoch": 1.0201581467869474, "grad_norm": 0.420428067445755, "learning_rate": 4.99993879145803e-05, "loss": 0.4754, "num_input_tokens_seen": 11117632, "step": 9160 }, { "epoch": 1.0207150016705646, "grad_norm": 0.26546552777290344, "learning_rate": 4.999935343769524e-05, "loss": 0.4717, "num_input_tokens_seen": 11123744, "step": 9165 }, { "epoch": 1.021271856554182, "grad_norm": 0.28307753801345825, "learning_rate": 4.999931801625449e-05, "loss": 0.4856, "num_input_tokens_seen": 11129664, "step": 9170 }, { "epoch": 1.0218287114377993, "grad_norm": 0.3163868486881256, "learning_rate": 4.9999281650259396e-05, "loss": 0.4857, "num_input_tokens_seen": 11135776, "step": 9175 }, { "epoch": 1.0223855663214167, "grad_norm": 0.42583543062210083, "learning_rate": 4.9999244339711326e-05, "loss": 0.4398, "num_input_tokens_seen": 11141984, "step": 9180 }, { "epoch": 1.022942421205034, "grad_norm": 0.30786582827568054, "learning_rate": 4.999920608461169e-05, "loss": 0.4837, "num_input_tokens_seen": 11147904, "step": 9185 }, { "epoch": 1.0234992760886512, "grad_norm": 0.37167975306510925, "learning_rate": 4.999916688496193e-05, "loss": 0.4407, "num_input_tokens_seen": 11153824, "step": 9190 }, { "epoch": 1.0240561309722687, "grad_norm": 0.3623873293399811, "learning_rate": 4.9999126740763535e-05, "loss": 0.4691, "num_input_tokens_seen": 11160000, "step": 9195 }, { "epoch": 1.024612985855886, "grad_norm": 0.3074789047241211, "learning_rate": 4.999908565201802e-05, "loss": 0.4634, "num_input_tokens_seen": 11165824, "step": 9200 }, { "epoch": 1.0251698407395033, "grad_norm": 0.349698930978775, "learning_rate": 4.999904361872693e-05, "loss": 0.4693, "num_input_tokens_seen": 11171616, "step": 9205 }, { "epoch": 1.0257266956231206, "grad_norm": 0.31930020451545715, "learning_rate": 4.999900064089187e-05, "loss": 0.4612, "num_input_tokens_seen": 11177952, "step": 9210 }, { "epoch": 1.026283550506738, "grad_norm": 0.31717002391815186, "learning_rate": 4.999895671851444e-05, "loss": 0.4736, "num_input_tokens_seen": 11184192, "step": 9215 }, { "epoch": 1.0268404053903553, "grad_norm": 0.32568010687828064, "learning_rate": 4.999891185159632e-05, "loss": 0.4816, "num_input_tokens_seen": 11189920, "step": 9220 }, { "epoch": 1.0273972602739727, "grad_norm": 0.38847261667251587, "learning_rate": 4.999886604013919e-05, "loss": 0.4849, "num_input_tokens_seen": 11196096, "step": 9225 }, { "epoch": 1.02795411515759, "grad_norm": 0.3611518442630768, "learning_rate": 4.999881928414479e-05, "loss": 0.4579, "num_input_tokens_seen": 11202464, "step": 9230 }, { "epoch": 1.0285109700412072, "grad_norm": 0.35567694902420044, "learning_rate": 4.999877158361489e-05, "loss": 0.4605, "num_input_tokens_seen": 11208480, "step": 9235 }, { "epoch": 1.0290678249248246, "grad_norm": 0.34064409136772156, "learning_rate": 4.9998722938551286e-05, "loss": 0.4849, "num_input_tokens_seen": 11214528, "step": 9240 }, { "epoch": 1.0296246798084419, "grad_norm": 0.45057979226112366, "learning_rate": 4.9998673348955815e-05, "loss": 0.481, "num_input_tokens_seen": 11220448, "step": 9245 }, { "epoch": 1.0301815346920593, "grad_norm": 0.2993238866329193, "learning_rate": 4.999862281483035e-05, "loss": 0.4284, "num_input_tokens_seen": 11226528, "step": 9250 }, { "epoch": 1.0307383895756765, "grad_norm": 0.3359127342700958, "learning_rate": 4.9998571336176806e-05, "loss": 0.4661, "num_input_tokens_seen": 11232928, "step": 9255 }, { "epoch": 1.031295244459294, "grad_norm": 0.2895233929157257, "learning_rate": 4.999851891299713e-05, "loss": 0.4613, "num_input_tokens_seen": 11239072, "step": 9260 }, { "epoch": 1.0318520993429112, "grad_norm": 0.278195321559906, "learning_rate": 4.999846554529329e-05, "loss": 0.4835, "num_input_tokens_seen": 11244352, "step": 9265 }, { "epoch": 1.0324089542265287, "grad_norm": 0.2763300836086273, "learning_rate": 4.999841123306731e-05, "loss": 0.4926, "num_input_tokens_seen": 11250144, "step": 9270 }, { "epoch": 1.032965809110146, "grad_norm": 0.2388274371623993, "learning_rate": 4.999835597632125e-05, "loss": 0.4556, "num_input_tokens_seen": 11256256, "step": 9275 }, { "epoch": 1.0335226639937631, "grad_norm": 0.27934345602989197, "learning_rate": 4.9998299775057186e-05, "loss": 0.4842, "num_input_tokens_seen": 11262464, "step": 9280 }, { "epoch": 1.0340795188773806, "grad_norm": 0.504536509513855, "learning_rate": 4.999824262927725e-05, "loss": 0.4758, "num_input_tokens_seen": 11268544, "step": 9285 }, { "epoch": 1.0346363737609978, "grad_norm": 0.3261488676071167, "learning_rate": 4.9998184538983586e-05, "loss": 0.4759, "num_input_tokens_seen": 11274560, "step": 9290 }, { "epoch": 1.0351932286446153, "grad_norm": 0.5009506940841675, "learning_rate": 4.9998125504178405e-05, "loss": 0.468, "num_input_tokens_seen": 11280640, "step": 9295 }, { "epoch": 1.0357500835282325, "grad_norm": 0.3079269230365753, "learning_rate": 4.9998065524863934e-05, "loss": 0.4553, "num_input_tokens_seen": 11286784, "step": 9300 }, { "epoch": 1.03630693841185, "grad_norm": 0.31095564365386963, "learning_rate": 4.999800460104244e-05, "loss": 0.474, "num_input_tokens_seen": 11293088, "step": 9305 }, { "epoch": 1.0368637932954672, "grad_norm": 0.255806028842926, "learning_rate": 4.999794273271621e-05, "loss": 0.4628, "num_input_tokens_seen": 11298912, "step": 9310 }, { "epoch": 1.0374206481790846, "grad_norm": 0.24567565321922302, "learning_rate": 4.99978799198876e-05, "loss": 0.4699, "num_input_tokens_seen": 11304832, "step": 9315 }, { "epoch": 1.0379775030627019, "grad_norm": 0.3406122326850891, "learning_rate": 4.999781616255898e-05, "loss": 0.4692, "num_input_tokens_seen": 11310784, "step": 9320 }, { "epoch": 1.038534357946319, "grad_norm": 0.3603411316871643, "learning_rate": 4.9997751460732754e-05, "loss": 0.4735, "num_input_tokens_seen": 11317152, "step": 9325 }, { "epoch": 1.0390912128299366, "grad_norm": 0.2951117753982544, "learning_rate": 4.999768581441137e-05, "loss": 0.4563, "num_input_tokens_seen": 11323328, "step": 9330 }, { "epoch": 1.0396480677135538, "grad_norm": 0.2449282854795456, "learning_rate": 4.99976192235973e-05, "loss": 0.464, "num_input_tokens_seen": 11329056, "step": 9335 }, { "epoch": 1.0402049225971712, "grad_norm": 0.2058057188987732, "learning_rate": 4.9997551688293074e-05, "loss": 0.4467, "num_input_tokens_seen": 11335168, "step": 9340 }, { "epoch": 1.0407617774807885, "grad_norm": 0.3527044355869293, "learning_rate": 4.9997483208501236e-05, "loss": 0.4626, "num_input_tokens_seen": 11341568, "step": 9345 }, { "epoch": 1.041318632364406, "grad_norm": 0.4345138967037201, "learning_rate": 4.9997413784224376e-05, "loss": 0.4494, "num_input_tokens_seen": 11347456, "step": 9350 }, { "epoch": 1.0418754872480231, "grad_norm": 0.2821640074253082, "learning_rate": 4.9997343415465115e-05, "loss": 0.4518, "num_input_tokens_seen": 11353536, "step": 9355 }, { "epoch": 1.0424323421316406, "grad_norm": 0.3019622266292572, "learning_rate": 4.999727210222611e-05, "loss": 0.4612, "num_input_tokens_seen": 11359776, "step": 9360 }, { "epoch": 1.0429891970152578, "grad_norm": 0.26724621653556824, "learning_rate": 4.9997199844510056e-05, "loss": 0.4709, "num_input_tokens_seen": 11366240, "step": 9365 }, { "epoch": 1.043546051898875, "grad_norm": 0.25438621640205383, "learning_rate": 4.999712664231968e-05, "loss": 0.4707, "num_input_tokens_seen": 11372544, "step": 9370 }, { "epoch": 1.0441029067824925, "grad_norm": 0.3332720100879669, "learning_rate": 4.999705249565776e-05, "loss": 0.4831, "num_input_tokens_seen": 11378464, "step": 9375 }, { "epoch": 1.0446597616661097, "grad_norm": 0.34429776668548584, "learning_rate": 4.999697740452709e-05, "loss": 0.4725, "num_input_tokens_seen": 11384576, "step": 9380 }, { "epoch": 1.0452166165497272, "grad_norm": 0.23015989363193512, "learning_rate": 4.99969013689305e-05, "loss": 0.4773, "num_input_tokens_seen": 11390688, "step": 9385 }, { "epoch": 1.0457734714333444, "grad_norm": 0.39608487486839294, "learning_rate": 4.999682438887087e-05, "loss": 0.4871, "num_input_tokens_seen": 11396864, "step": 9390 }, { "epoch": 1.0463303263169619, "grad_norm": 0.30716025829315186, "learning_rate": 4.999674646435112e-05, "loss": 0.4793, "num_input_tokens_seen": 11403104, "step": 9395 }, { "epoch": 1.046887181200579, "grad_norm": 0.33958593010902405, "learning_rate": 4.999666759537417e-05, "loss": 0.4826, "num_input_tokens_seen": 11409280, "step": 9400 }, { "epoch": 1.0474440360841966, "grad_norm": 0.2821786105632782, "learning_rate": 4.999658778194302e-05, "loss": 0.4718, "num_input_tokens_seen": 11415104, "step": 9405 }, { "epoch": 1.0480008909678138, "grad_norm": 0.3009037971496582, "learning_rate": 4.999650702406067e-05, "loss": 0.454, "num_input_tokens_seen": 11421056, "step": 9410 }, { "epoch": 1.048557745851431, "grad_norm": 0.35360851883888245, "learning_rate": 4.999642532173019e-05, "loss": 0.4603, "num_input_tokens_seen": 11427296, "step": 9415 }, { "epoch": 1.0491146007350485, "grad_norm": 0.33628442883491516, "learning_rate": 4.999634267495464e-05, "loss": 0.5049, "num_input_tokens_seen": 11433440, "step": 9420 }, { "epoch": 1.0496714556186657, "grad_norm": 0.378343790769577, "learning_rate": 4.999625908373717e-05, "loss": 0.4787, "num_input_tokens_seen": 11439680, "step": 9425 }, { "epoch": 1.0502283105022832, "grad_norm": 0.3948984742164612, "learning_rate": 4.999617454808093e-05, "loss": 0.4832, "num_input_tokens_seen": 11445440, "step": 9430 }, { "epoch": 1.0507851653859004, "grad_norm": 0.3563389480113983, "learning_rate": 4.9996089067989116e-05, "loss": 0.4741, "num_input_tokens_seen": 11451616, "step": 9435 }, { "epoch": 1.0513420202695178, "grad_norm": 0.4442249536514282, "learning_rate": 4.999600264346494e-05, "loss": 0.4714, "num_input_tokens_seen": 11457536, "step": 9440 }, { "epoch": 1.051898875153135, "grad_norm": 0.25417622923851013, "learning_rate": 4.9995915274511684e-05, "loss": 0.4684, "num_input_tokens_seen": 11463840, "step": 9445 }, { "epoch": 1.0524557300367525, "grad_norm": 0.31541404128074646, "learning_rate": 4.9995826961132654e-05, "loss": 0.4568, "num_input_tokens_seen": 11470048, "step": 9450 }, { "epoch": 1.0530125849203698, "grad_norm": 0.3137074112892151, "learning_rate": 4.9995737703331166e-05, "loss": 0.4775, "num_input_tokens_seen": 11476448, "step": 9455 }, { "epoch": 1.053569439803987, "grad_norm": 0.4221612811088562, "learning_rate": 4.9995647501110616e-05, "loss": 0.4847, "num_input_tokens_seen": 11482016, "step": 9460 }, { "epoch": 1.0541262946876044, "grad_norm": 0.49226507544517517, "learning_rate": 4.999555635447439e-05, "loss": 0.4665, "num_input_tokens_seen": 11488384, "step": 9465 }, { "epoch": 1.0546831495712217, "grad_norm": 0.3380506932735443, "learning_rate": 4.999546426342595e-05, "loss": 0.4652, "num_input_tokens_seen": 11494368, "step": 9470 }, { "epoch": 1.0552400044548391, "grad_norm": 0.35737860202789307, "learning_rate": 4.999537122796877e-05, "loss": 0.4615, "num_input_tokens_seen": 11500416, "step": 9475 }, { "epoch": 1.0557968593384564, "grad_norm": 0.37799641489982605, "learning_rate": 4.999527724810637e-05, "loss": 0.4686, "num_input_tokens_seen": 11506688, "step": 9480 }, { "epoch": 1.0563537142220738, "grad_norm": 0.34739646315574646, "learning_rate": 4.9995182323842274e-05, "loss": 0.4744, "num_input_tokens_seen": 11512864, "step": 9485 }, { "epoch": 1.056910569105691, "grad_norm": 0.22885853052139282, "learning_rate": 4.9995086455180104e-05, "loss": 0.4627, "num_input_tokens_seen": 11519040, "step": 9490 }, { "epoch": 1.0574674239893085, "grad_norm": 0.27056899666786194, "learning_rate": 4.9994989642123454e-05, "loss": 0.4694, "num_input_tokens_seen": 11525344, "step": 9495 }, { "epoch": 1.0580242788729257, "grad_norm": 0.3681062161922455, "learning_rate": 4.9994891884676006e-05, "loss": 0.4656, "num_input_tokens_seen": 11531552, "step": 9500 }, { "epoch": 1.058581133756543, "grad_norm": 0.24941235780715942, "learning_rate": 4.9994793182841445e-05, "loss": 0.4601, "num_input_tokens_seen": 11537696, "step": 9505 }, { "epoch": 1.0591379886401604, "grad_norm": 0.27313774824142456, "learning_rate": 4.999469353662349e-05, "loss": 0.4806, "num_input_tokens_seen": 11543872, "step": 9510 }, { "epoch": 1.0596948435237776, "grad_norm": 0.28088632225990295, "learning_rate": 4.999459294602592e-05, "loss": 0.4711, "num_input_tokens_seen": 11550016, "step": 9515 }, { "epoch": 1.060251698407395, "grad_norm": 0.29018014669418335, "learning_rate": 4.9994491411052525e-05, "loss": 0.466, "num_input_tokens_seen": 11556256, "step": 9520 }, { "epoch": 1.0608085532910123, "grad_norm": 0.3311847150325775, "learning_rate": 4.999438893170714e-05, "loss": 0.4842, "num_input_tokens_seen": 11562496, "step": 9525 }, { "epoch": 1.0613654081746298, "grad_norm": 0.243488609790802, "learning_rate": 4.9994285507993654e-05, "loss": 0.4593, "num_input_tokens_seen": 11568416, "step": 9530 }, { "epoch": 1.061922263058247, "grad_norm": 0.4580373466014862, "learning_rate": 4.9994181139915956e-05, "loss": 0.4865, "num_input_tokens_seen": 11574592, "step": 9535 }, { "epoch": 1.0624791179418644, "grad_norm": 0.2939301133155823, "learning_rate": 4.999407582747801e-05, "loss": 0.4799, "num_input_tokens_seen": 11580000, "step": 9540 }, { "epoch": 1.0630359728254817, "grad_norm": 0.26850154995918274, "learning_rate": 4.9993969570683764e-05, "loss": 0.4553, "num_input_tokens_seen": 11585824, "step": 9545 }, { "epoch": 1.063592827709099, "grad_norm": 0.3289361000061035, "learning_rate": 4.999386236953726e-05, "loss": 0.4746, "num_input_tokens_seen": 11592128, "step": 9550 }, { "epoch": 1.0641496825927164, "grad_norm": 0.3038017749786377, "learning_rate": 4.9993754224042535e-05, "loss": 0.4984, "num_input_tokens_seen": 11598080, "step": 9555 }, { "epoch": 1.0647065374763336, "grad_norm": 0.38338756561279297, "learning_rate": 4.999364513420368e-05, "loss": 0.4885, "num_input_tokens_seen": 11604320, "step": 9560 }, { "epoch": 1.065263392359951, "grad_norm": 0.28341639041900635, "learning_rate": 4.999353510002481e-05, "loss": 0.4769, "num_input_tokens_seen": 11610240, "step": 9565 }, { "epoch": 1.0658202472435683, "grad_norm": 0.33111000061035156, "learning_rate": 4.99934241215101e-05, "loss": 0.4685, "num_input_tokens_seen": 11616192, "step": 9570 }, { "epoch": 1.0663771021271857, "grad_norm": 0.3766707181930542, "learning_rate": 4.9993312198663725e-05, "loss": 0.4712, "num_input_tokens_seen": 11622560, "step": 9575 }, { "epoch": 1.066933957010803, "grad_norm": 0.35644668340682983, "learning_rate": 4.999319933148992e-05, "loss": 0.4718, "num_input_tokens_seen": 11628544, "step": 9580 }, { "epoch": 1.0674908118944204, "grad_norm": 0.2800096571445465, "learning_rate": 4.999308551999296e-05, "loss": 0.4584, "num_input_tokens_seen": 11634496, "step": 9585 }, { "epoch": 1.0680476667780376, "grad_norm": 0.3346911668777466, "learning_rate": 4.999297076417712e-05, "loss": 0.4648, "num_input_tokens_seen": 11639968, "step": 9590 }, { "epoch": 1.0686045216616549, "grad_norm": 0.24083565175533295, "learning_rate": 4.9992855064046754e-05, "loss": 0.4682, "num_input_tokens_seen": 11646208, "step": 9595 }, { "epoch": 1.0691613765452723, "grad_norm": 0.36111053824424744, "learning_rate": 4.9992738419606235e-05, "loss": 0.4552, "num_input_tokens_seen": 11652288, "step": 9600 }, { "epoch": 1.0697182314288896, "grad_norm": 0.2719552516937256, "learning_rate": 4.999262083085996e-05, "loss": 0.4759, "num_input_tokens_seen": 11658176, "step": 9605 }, { "epoch": 1.070275086312507, "grad_norm": 0.2668299973011017, "learning_rate": 4.999250229781238e-05, "loss": 0.4836, "num_input_tokens_seen": 11664640, "step": 9610 }, { "epoch": 1.0708319411961242, "grad_norm": 0.31316182017326355, "learning_rate": 4.9992382820467965e-05, "loss": 0.4385, "num_input_tokens_seen": 11670816, "step": 9615 }, { "epoch": 1.0713887960797417, "grad_norm": 0.3144727051258087, "learning_rate": 4.9992262398831245e-05, "loss": 0.485, "num_input_tokens_seen": 11676640, "step": 9620 }, { "epoch": 1.071945650963359, "grad_norm": 0.2968153953552246, "learning_rate": 4.999214103290675e-05, "loss": 0.4773, "num_input_tokens_seen": 11682592, "step": 9625 }, { "epoch": 1.0725025058469764, "grad_norm": 0.2410743236541748, "learning_rate": 4.999201872269908e-05, "loss": 0.4446, "num_input_tokens_seen": 11688800, "step": 9630 }, { "epoch": 1.0730593607305936, "grad_norm": 0.3432556390762329, "learning_rate": 4.9991895468212855e-05, "loss": 0.4537, "num_input_tokens_seen": 11694784, "step": 9635 }, { "epoch": 1.0736162156142108, "grad_norm": 0.3294404149055481, "learning_rate": 4.999177126945273e-05, "loss": 0.4615, "num_input_tokens_seen": 11700928, "step": 9640 }, { "epoch": 1.0741730704978283, "grad_norm": 0.3176203668117523, "learning_rate": 4.999164612642339e-05, "loss": 0.5016, "num_input_tokens_seen": 11707040, "step": 9645 }, { "epoch": 1.0747299253814455, "grad_norm": 0.4230417013168335, "learning_rate": 4.9991520039129573e-05, "loss": 0.4697, "num_input_tokens_seen": 11712896, "step": 9650 }, { "epoch": 1.075286780265063, "grad_norm": 0.423503041267395, "learning_rate": 4.999139300757604e-05, "loss": 0.4638, "num_input_tokens_seen": 11718272, "step": 9655 }, { "epoch": 1.0758436351486802, "grad_norm": 0.3924633860588074, "learning_rate": 4.9991265031767586e-05, "loss": 0.4794, "num_input_tokens_seen": 11724672, "step": 9660 }, { "epoch": 1.0764004900322977, "grad_norm": 0.307273805141449, "learning_rate": 4.999113611170906e-05, "loss": 0.4677, "num_input_tokens_seen": 11730816, "step": 9665 }, { "epoch": 1.0769573449159149, "grad_norm": 0.3938625156879425, "learning_rate": 4.999100624740531e-05, "loss": 0.4593, "num_input_tokens_seen": 11737024, "step": 9670 }, { "epoch": 1.0775141997995323, "grad_norm": 0.4064650237560272, "learning_rate": 4.9990875438861276e-05, "loss": 0.4724, "num_input_tokens_seen": 11742976, "step": 9675 }, { "epoch": 1.0780710546831496, "grad_norm": 0.42457982897758484, "learning_rate": 4.999074368608187e-05, "loss": 0.4856, "num_input_tokens_seen": 11749248, "step": 9680 }, { "epoch": 1.0786279095667668, "grad_norm": 0.36300531029701233, "learning_rate": 4.9990610989072074e-05, "loss": 0.4647, "num_input_tokens_seen": 11755520, "step": 9685 }, { "epoch": 1.0791847644503842, "grad_norm": 0.4235304296016693, "learning_rate": 4.999047734783692e-05, "loss": 0.4986, "num_input_tokens_seen": 11761632, "step": 9690 }, { "epoch": 1.0797416193340015, "grad_norm": 0.2595328688621521, "learning_rate": 4.999034276238144e-05, "loss": 0.4897, "num_input_tokens_seen": 11767840, "step": 9695 }, { "epoch": 1.080298474217619, "grad_norm": 0.2981114685535431, "learning_rate": 4.9990207232710715e-05, "loss": 0.4771, "num_input_tokens_seen": 11773888, "step": 9700 }, { "epoch": 1.0808553291012362, "grad_norm": 0.42771807312965393, "learning_rate": 4.999007075882989e-05, "loss": 0.4778, "num_input_tokens_seen": 11780160, "step": 9705 }, { "epoch": 1.0814121839848536, "grad_norm": 0.32366132736206055, "learning_rate": 4.99899333407441e-05, "loss": 0.4737, "num_input_tokens_seen": 11786240, "step": 9710 }, { "epoch": 1.0819690388684708, "grad_norm": 0.2223161906003952, "learning_rate": 4.998979497845855e-05, "loss": 0.4781, "num_input_tokens_seen": 11792288, "step": 9715 }, { "epoch": 1.0825258937520883, "grad_norm": 0.2966096103191376, "learning_rate": 4.998965567197846e-05, "loss": 0.4635, "num_input_tokens_seen": 11798464, "step": 9720 }, { "epoch": 1.0830827486357055, "grad_norm": 0.2684892416000366, "learning_rate": 4.99895154213091e-05, "loss": 0.4671, "num_input_tokens_seen": 11804512, "step": 9725 }, { "epoch": 1.0836396035193228, "grad_norm": 0.2680552303791046, "learning_rate": 4.998937422645575e-05, "loss": 0.4709, "num_input_tokens_seen": 11810688, "step": 9730 }, { "epoch": 1.0841964584029402, "grad_norm": 0.25752294063568115, "learning_rate": 4.998923208742377e-05, "loss": 0.4691, "num_input_tokens_seen": 11816640, "step": 9735 }, { "epoch": 1.0847533132865574, "grad_norm": 0.31278833746910095, "learning_rate": 4.998908900421852e-05, "loss": 0.4627, "num_input_tokens_seen": 11822720, "step": 9740 }, { "epoch": 1.085310168170175, "grad_norm": 0.28063154220581055, "learning_rate": 4.99889449768454e-05, "loss": 0.4957, "num_input_tokens_seen": 11828864, "step": 9745 }, { "epoch": 1.0858670230537921, "grad_norm": 0.25211429595947266, "learning_rate": 4.9988800005309865e-05, "loss": 0.4797, "num_input_tokens_seen": 11834752, "step": 9750 }, { "epoch": 1.0864238779374096, "grad_norm": 0.272523432970047, "learning_rate": 4.998865408961738e-05, "loss": 0.4815, "num_input_tokens_seen": 11841184, "step": 9755 }, { "epoch": 1.0869807328210268, "grad_norm": 0.3106854259967804, "learning_rate": 4.9988507229773466e-05, "loss": 0.4797, "num_input_tokens_seen": 11846912, "step": 9760 }, { "epoch": 1.0875375877046443, "grad_norm": 0.24905428290367126, "learning_rate": 4.998835942578367e-05, "loss": 0.4775, "num_input_tokens_seen": 11853024, "step": 9765 }, { "epoch": 1.0880944425882615, "grad_norm": 0.30669546127319336, "learning_rate": 4.998821067765358e-05, "loss": 0.4628, "num_input_tokens_seen": 11859360, "step": 9770 }, { "epoch": 1.0886512974718787, "grad_norm": 0.39905086159706116, "learning_rate": 4.998806098538881e-05, "loss": 0.4893, "num_input_tokens_seen": 11865344, "step": 9775 }, { "epoch": 1.0892081523554962, "grad_norm": 0.3175061345100403, "learning_rate": 4.9987910348995006e-05, "loss": 0.4751, "num_input_tokens_seen": 11871584, "step": 9780 }, { "epoch": 1.0897650072391134, "grad_norm": 0.2659115493297577, "learning_rate": 4.998775876847788e-05, "loss": 0.4666, "num_input_tokens_seen": 11877152, "step": 9785 }, { "epoch": 1.0903218621227309, "grad_norm": 0.27838319540023804, "learning_rate": 4.998760624384315e-05, "loss": 0.4668, "num_input_tokens_seen": 11883104, "step": 9790 }, { "epoch": 1.090878717006348, "grad_norm": 0.3213210105895996, "learning_rate": 4.9987452775096584e-05, "loss": 0.4729, "num_input_tokens_seen": 11889024, "step": 9795 }, { "epoch": 1.0914355718899655, "grad_norm": 0.45803242921829224, "learning_rate": 4.998729836224397e-05, "loss": 0.4741, "num_input_tokens_seen": 11895200, "step": 9800 }, { "epoch": 1.0919924267735828, "grad_norm": 0.35453271865844727, "learning_rate": 4.998714300529115e-05, "loss": 0.4608, "num_input_tokens_seen": 11901216, "step": 9805 }, { "epoch": 1.0925492816572002, "grad_norm": 0.2151520997285843, "learning_rate": 4.9986986704243985e-05, "loss": 0.4618, "num_input_tokens_seen": 11907552, "step": 9810 }, { "epoch": 1.0931061365408175, "grad_norm": 0.35505610704421997, "learning_rate": 4.99868294591084e-05, "loss": 0.4787, "num_input_tokens_seen": 11913792, "step": 9815 }, { "epoch": 1.0936629914244347, "grad_norm": 0.43549251556396484, "learning_rate": 4.998667126989032e-05, "loss": 0.4702, "num_input_tokens_seen": 11920192, "step": 9820 }, { "epoch": 1.0942198463080521, "grad_norm": 0.2965008020401001, "learning_rate": 4.998651213659572e-05, "loss": 0.4678, "num_input_tokens_seen": 11926208, "step": 9825 }, { "epoch": 1.0947767011916694, "grad_norm": 0.28355273604393005, "learning_rate": 4.9986352059230624e-05, "loss": 0.4796, "num_input_tokens_seen": 11932192, "step": 9830 }, { "epoch": 1.0953335560752868, "grad_norm": 0.29343220591545105, "learning_rate": 4.998619103780107e-05, "loss": 0.5078, "num_input_tokens_seen": 11938304, "step": 9835 }, { "epoch": 1.095890410958904, "grad_norm": 0.3309214413166046, "learning_rate": 4.998602907231315e-05, "loss": 0.4914, "num_input_tokens_seen": 11944352, "step": 9840 }, { "epoch": 1.0964472658425215, "grad_norm": 0.2464793473482132, "learning_rate": 4.998586616277298e-05, "loss": 0.462, "num_input_tokens_seen": 11950528, "step": 9845 }, { "epoch": 1.0970041207261387, "grad_norm": 0.45920103788375854, "learning_rate": 4.9985702309186714e-05, "loss": 0.4683, "num_input_tokens_seen": 11956192, "step": 9850 }, { "epoch": 1.0975609756097562, "grad_norm": 0.28860732913017273, "learning_rate": 4.998553751156054e-05, "loss": 0.4639, "num_input_tokens_seen": 11962144, "step": 9855 }, { "epoch": 1.0981178304933734, "grad_norm": 0.22898252308368683, "learning_rate": 4.99853717699007e-05, "loss": 0.4564, "num_input_tokens_seen": 11968256, "step": 9860 }, { "epoch": 1.0986746853769906, "grad_norm": 0.2296140342950821, "learning_rate": 4.9985205084213436e-05, "loss": 0.4785, "num_input_tokens_seen": 11974368, "step": 9865 }, { "epoch": 1.099231540260608, "grad_norm": 0.3369043469429016, "learning_rate": 4.998503745450506e-05, "loss": 0.4659, "num_input_tokens_seen": 11980448, "step": 9870 }, { "epoch": 1.0997883951442253, "grad_norm": 0.33397209644317627, "learning_rate": 4.99848688807819e-05, "loss": 0.4607, "num_input_tokens_seen": 11985632, "step": 9875 }, { "epoch": 1.1003452500278428, "grad_norm": 0.31352365016937256, "learning_rate": 4.998469936305032e-05, "loss": 0.4743, "num_input_tokens_seen": 11991136, "step": 9880 }, { "epoch": 1.10090210491146, "grad_norm": 0.29012802243232727, "learning_rate": 4.9984528901316726e-05, "loss": 0.4702, "num_input_tokens_seen": 11997344, "step": 9885 }, { "epoch": 1.1014589597950775, "grad_norm": 0.3621923625469208, "learning_rate": 4.998435749558758e-05, "loss": 0.4698, "num_input_tokens_seen": 12003328, "step": 9890 }, { "epoch": 1.1020158146786947, "grad_norm": 0.30951404571533203, "learning_rate": 4.998418514586933e-05, "loss": 0.4676, "num_input_tokens_seen": 12009536, "step": 9895 }, { "epoch": 1.1025726695623121, "grad_norm": 0.3614826798439026, "learning_rate": 4.99840118521685e-05, "loss": 0.47, "num_input_tokens_seen": 12015872, "step": 9900 }, { "epoch": 1.1031295244459294, "grad_norm": 0.32134419679641724, "learning_rate": 4.998383761449164e-05, "loss": 0.4469, "num_input_tokens_seen": 12022336, "step": 9905 }, { "epoch": 1.1036863793295466, "grad_norm": 0.33680781722068787, "learning_rate": 4.998366243284533e-05, "loss": 0.459, "num_input_tokens_seen": 12028480, "step": 9910 }, { "epoch": 1.104243234213164, "grad_norm": 0.2899726927280426, "learning_rate": 4.998348630723619e-05, "loss": 0.474, "num_input_tokens_seen": 12034848, "step": 9915 }, { "epoch": 1.1048000890967813, "grad_norm": 0.36032330989837646, "learning_rate": 4.998330923767087e-05, "loss": 0.4771, "num_input_tokens_seen": 12040992, "step": 9920 }, { "epoch": 1.1053569439803987, "grad_norm": 0.3693784177303314, "learning_rate": 4.9983131224156066e-05, "loss": 0.4575, "num_input_tokens_seen": 12046528, "step": 9925 }, { "epoch": 1.105913798864016, "grad_norm": 0.30183231830596924, "learning_rate": 4.99829522666985e-05, "loss": 0.4602, "num_input_tokens_seen": 12052544, "step": 9930 }, { "epoch": 1.1064706537476334, "grad_norm": 0.3097701966762543, "learning_rate": 4.998277236530494e-05, "loss": 0.4849, "num_input_tokens_seen": 12058496, "step": 9935 }, { "epoch": 1.1070275086312507, "grad_norm": 0.25815197825431824, "learning_rate": 4.998259151998218e-05, "loss": 0.4698, "num_input_tokens_seen": 12064608, "step": 9940 }, { "epoch": 1.107584363514868, "grad_norm": 0.2850498855113983, "learning_rate": 4.998240973073705e-05, "loss": 0.4769, "num_input_tokens_seen": 12070752, "step": 9945 }, { "epoch": 1.1081412183984853, "grad_norm": 0.2254849225282669, "learning_rate": 4.998222699757642e-05, "loss": 0.4642, "num_input_tokens_seen": 12076928, "step": 9950 }, { "epoch": 1.1086980732821026, "grad_norm": 0.3398391008377075, "learning_rate": 4.9982043320507185e-05, "loss": 0.4811, "num_input_tokens_seen": 12082848, "step": 9955 }, { "epoch": 1.10925492816572, "grad_norm": 0.32824939489364624, "learning_rate": 4.998185869953631e-05, "loss": 0.4925, "num_input_tokens_seen": 12088864, "step": 9960 }, { "epoch": 1.1098117830493373, "grad_norm": 0.3065056800842285, "learning_rate": 4.9981673134670746e-05, "loss": 0.4814, "num_input_tokens_seen": 12094912, "step": 9965 }, { "epoch": 1.1103686379329547, "grad_norm": 0.32654857635498047, "learning_rate": 4.9981486625917515e-05, "loss": 0.4563, "num_input_tokens_seen": 12100768, "step": 9970 }, { "epoch": 1.110925492816572, "grad_norm": 0.3062353730201721, "learning_rate": 4.998129917328366e-05, "loss": 0.4587, "num_input_tokens_seen": 12106944, "step": 9975 }, { "epoch": 1.1114823477001894, "grad_norm": 0.2566289007663727, "learning_rate": 4.9981110776776276e-05, "loss": 0.4887, "num_input_tokens_seen": 12112896, "step": 9980 }, { "epoch": 1.1120392025838066, "grad_norm": 0.22221457958221436, "learning_rate": 4.998092143640246e-05, "loss": 0.4768, "num_input_tokens_seen": 12119136, "step": 9985 }, { "epoch": 1.112596057467424, "grad_norm": 0.251160204410553, "learning_rate": 4.998073115216938e-05, "loss": 0.4599, "num_input_tokens_seen": 12125120, "step": 9990 }, { "epoch": 1.1131529123510413, "grad_norm": 0.21419654786586761, "learning_rate": 4.998053992408422e-05, "loss": 0.4607, "num_input_tokens_seen": 12131392, "step": 9995 }, { "epoch": 1.1137097672346585, "grad_norm": 0.39644598960876465, "learning_rate": 4.9980347752154214e-05, "loss": 0.464, "num_input_tokens_seen": 12137408, "step": 10000 }, { "epoch": 1.114266622118276, "grad_norm": 0.2640261948108673, "learning_rate": 4.9980154636386614e-05, "loss": 0.467, "num_input_tokens_seen": 12143328, "step": 10005 }, { "epoch": 1.1148234770018932, "grad_norm": 0.19803565740585327, "learning_rate": 4.997996057678871e-05, "loss": 0.4598, "num_input_tokens_seen": 12149312, "step": 10010 }, { "epoch": 1.1153803318855107, "grad_norm": 0.33819228410720825, "learning_rate": 4.9979765573367854e-05, "loss": 0.4509, "num_input_tokens_seen": 12155456, "step": 10015 }, { "epoch": 1.115937186769128, "grad_norm": 0.2762315273284912, "learning_rate": 4.9979569626131397e-05, "loss": 0.4537, "num_input_tokens_seen": 12161408, "step": 10020 }, { "epoch": 1.1164940416527454, "grad_norm": 0.2627171277999878, "learning_rate": 4.997937273508675e-05, "loss": 0.4854, "num_input_tokens_seen": 12167552, "step": 10025 }, { "epoch": 1.1170508965363626, "grad_norm": 0.24505329132080078, "learning_rate": 4.9979174900241354e-05, "loss": 0.4646, "num_input_tokens_seen": 12173440, "step": 10030 }, { "epoch": 1.11760775141998, "grad_norm": 0.24681709706783295, "learning_rate": 4.9978976121602684e-05, "loss": 0.4725, "num_input_tokens_seen": 12179648, "step": 10035 }, { "epoch": 1.1181646063035973, "grad_norm": 0.27244964241981506, "learning_rate": 4.997877639917824e-05, "loss": 0.4675, "num_input_tokens_seen": 12185760, "step": 10040 }, { "epoch": 1.1187214611872145, "grad_norm": 0.23595547676086426, "learning_rate": 4.997857573297557e-05, "loss": 0.472, "num_input_tokens_seen": 12192000, "step": 10045 }, { "epoch": 1.119278316070832, "grad_norm": 0.2734663188457489, "learning_rate": 4.997837412300227e-05, "loss": 0.4678, "num_input_tokens_seen": 12198048, "step": 10050 }, { "epoch": 1.1198351709544492, "grad_norm": 0.2803158760070801, "learning_rate": 4.9978171569265944e-05, "loss": 0.4347, "num_input_tokens_seen": 12204320, "step": 10055 }, { "epoch": 1.1203920258380666, "grad_norm": 0.3398904800415039, "learning_rate": 4.997796807177426e-05, "loss": 0.4803, "num_input_tokens_seen": 12210528, "step": 10060 }, { "epoch": 1.1209488807216839, "grad_norm": 0.33440420031547546, "learning_rate": 4.9977763630534883e-05, "loss": 0.469, "num_input_tokens_seen": 12216704, "step": 10065 }, { "epoch": 1.1215057356053013, "grad_norm": 0.25697487592697144, "learning_rate": 4.9977558245555555e-05, "loss": 0.4639, "num_input_tokens_seen": 12222688, "step": 10070 }, { "epoch": 1.1220625904889185, "grad_norm": 0.3557835817337036, "learning_rate": 4.997735191684404e-05, "loss": 0.464, "num_input_tokens_seen": 12228768, "step": 10075 }, { "epoch": 1.122619445372536, "grad_norm": 0.35170820355415344, "learning_rate": 4.997714464440811e-05, "loss": 0.4878, "num_input_tokens_seen": 12234560, "step": 10080 }, { "epoch": 1.1231763002561532, "grad_norm": 0.35614675283432007, "learning_rate": 4.997693642825563e-05, "loss": 0.4659, "num_input_tokens_seen": 12240544, "step": 10085 }, { "epoch": 1.1237331551397707, "grad_norm": 0.3377153277397156, "learning_rate": 4.997672726839444e-05, "loss": 0.4879, "num_input_tokens_seen": 12246752, "step": 10090 }, { "epoch": 1.124290010023388, "grad_norm": 0.243442103266716, "learning_rate": 4.997651716483245e-05, "loss": 0.4764, "num_input_tokens_seen": 12252736, "step": 10095 }, { "epoch": 1.1248468649070051, "grad_norm": 0.2378338724374771, "learning_rate": 4.99763061175776e-05, "loss": 0.4769, "num_input_tokens_seen": 12258784, "step": 10100 }, { "epoch": 1.1254037197906226, "grad_norm": 0.25801944732666016, "learning_rate": 4.997609412663787e-05, "loss": 0.4567, "num_input_tokens_seen": 12265056, "step": 10105 }, { "epoch": 1.1259605746742398, "grad_norm": 0.2809634208679199, "learning_rate": 4.9975881192021256e-05, "loss": 0.4952, "num_input_tokens_seen": 12271360, "step": 10110 }, { "epoch": 1.1265174295578573, "grad_norm": 0.2789306044578552, "learning_rate": 4.997566731373582e-05, "loss": 0.4619, "num_input_tokens_seen": 12277312, "step": 10115 }, { "epoch": 1.1270742844414745, "grad_norm": 0.32233864068984985, "learning_rate": 4.997545249178963e-05, "loss": 0.4665, "num_input_tokens_seen": 12283680, "step": 10120 }, { "epoch": 1.127631139325092, "grad_norm": 0.2596501111984253, "learning_rate": 4.997523672619081e-05, "loss": 0.485, "num_input_tokens_seen": 12289760, "step": 10125 }, { "epoch": 1.1281879942087092, "grad_norm": 0.3012183606624603, "learning_rate": 4.9975020016947506e-05, "loss": 0.4698, "num_input_tokens_seen": 12296000, "step": 10130 }, { "epoch": 1.1287448490923264, "grad_norm": 0.3137606084346771, "learning_rate": 4.997480236406791e-05, "loss": 0.47, "num_input_tokens_seen": 12301632, "step": 10135 }, { "epoch": 1.1293017039759439, "grad_norm": 0.2734540104866028, "learning_rate": 4.9974583767560245e-05, "loss": 0.4866, "num_input_tokens_seen": 12307776, "step": 10140 }, { "epoch": 1.129858558859561, "grad_norm": 0.3662734031677246, "learning_rate": 4.9974364227432766e-05, "loss": 0.4725, "num_input_tokens_seen": 12313984, "step": 10145 }, { "epoch": 1.1304154137431786, "grad_norm": 0.2639657258987427, "learning_rate": 4.997414374369378e-05, "loss": 0.4566, "num_input_tokens_seen": 12320096, "step": 10150 }, { "epoch": 1.1309722686267958, "grad_norm": 0.26333898305892944, "learning_rate": 4.9973922316351606e-05, "loss": 0.4608, "num_input_tokens_seen": 12325920, "step": 10155 }, { "epoch": 1.1315291235104132, "grad_norm": 0.2737269699573517, "learning_rate": 4.997369994541462e-05, "loss": 0.4595, "num_input_tokens_seen": 12331808, "step": 10160 }, { "epoch": 1.1320859783940305, "grad_norm": 0.28371116518974304, "learning_rate": 4.997347663089121e-05, "loss": 0.477, "num_input_tokens_seen": 12337792, "step": 10165 }, { "epoch": 1.132642833277648, "grad_norm": 0.294866144657135, "learning_rate": 4.9973252372789825e-05, "loss": 0.4759, "num_input_tokens_seen": 12342784, "step": 10170 }, { "epoch": 1.1331996881612652, "grad_norm": 0.35951340198516846, "learning_rate": 4.9973027171118936e-05, "loss": 0.4867, "num_input_tokens_seen": 12349280, "step": 10175 }, { "epoch": 1.1337565430448824, "grad_norm": 0.33065685629844666, "learning_rate": 4.997280102588705e-05, "loss": 0.4643, "num_input_tokens_seen": 12355360, "step": 10180 }, { "epoch": 1.1343133979284998, "grad_norm": 0.33773353695869446, "learning_rate": 4.997257393710271e-05, "loss": 0.4633, "num_input_tokens_seen": 12361600, "step": 10185 }, { "epoch": 1.1348702528121173, "grad_norm": 0.29226288199424744, "learning_rate": 4.99723459047745e-05, "loss": 0.4752, "num_input_tokens_seen": 12367712, "step": 10190 }, { "epoch": 1.1354271076957345, "grad_norm": 0.2627042531967163, "learning_rate": 4.997211692891103e-05, "loss": 0.452, "num_input_tokens_seen": 12374016, "step": 10195 }, { "epoch": 1.1359839625793517, "grad_norm": 0.45067843794822693, "learning_rate": 4.9971887009520955e-05, "loss": 0.4695, "num_input_tokens_seen": 12379360, "step": 10200 }, { "epoch": 1.1365408174629692, "grad_norm": 0.2811082601547241, "learning_rate": 4.997165614661297e-05, "loss": 0.4961, "num_input_tokens_seen": 12385408, "step": 10205 }, { "epoch": 1.1370976723465864, "grad_norm": 0.37591442465782166, "learning_rate": 4.997142434019578e-05, "loss": 0.4441, "num_input_tokens_seen": 12391200, "step": 10210 }, { "epoch": 1.1376545272302039, "grad_norm": 0.2588190734386444, "learning_rate": 4.997119159027817e-05, "loss": 0.4733, "num_input_tokens_seen": 12397504, "step": 10215 }, { "epoch": 1.1382113821138211, "grad_norm": 0.3047463893890381, "learning_rate": 4.99709578968689e-05, "loss": 0.4439, "num_input_tokens_seen": 12403456, "step": 10220 }, { "epoch": 1.1387682369974383, "grad_norm": 0.22761821746826172, "learning_rate": 4.997072325997682e-05, "loss": 0.4588, "num_input_tokens_seen": 12409568, "step": 10225 }, { "epoch": 1.1393250918810558, "grad_norm": 0.21525529026985168, "learning_rate": 4.99704876796108e-05, "loss": 0.4598, "num_input_tokens_seen": 12415680, "step": 10230 }, { "epoch": 1.1398819467646732, "grad_norm": 0.27302324771881104, "learning_rate": 4.997025115577973e-05, "loss": 0.4435, "num_input_tokens_seen": 12421536, "step": 10235 }, { "epoch": 1.1404388016482905, "grad_norm": 0.2647002637386322, "learning_rate": 4.997001368849255e-05, "loss": 0.4892, "num_input_tokens_seen": 12427456, "step": 10240 }, { "epoch": 1.1409956565319077, "grad_norm": 0.3270103931427002, "learning_rate": 4.996977527775823e-05, "loss": 0.4651, "num_input_tokens_seen": 12433280, "step": 10245 }, { "epoch": 1.1415525114155252, "grad_norm": 0.16488751769065857, "learning_rate": 4.9969535923585785e-05, "loss": 0.4599, "num_input_tokens_seen": 12439328, "step": 10250 }, { "epoch": 1.1421093662991424, "grad_norm": 0.33698728680610657, "learning_rate": 4.996929562598426e-05, "loss": 0.4595, "num_input_tokens_seen": 12445696, "step": 10255 }, { "epoch": 1.1426662211827598, "grad_norm": 0.24608829617500305, "learning_rate": 4.9969054384962715e-05, "loss": 0.4442, "num_input_tokens_seen": 12451520, "step": 10260 }, { "epoch": 1.143223076066377, "grad_norm": 0.2758459150791168, "learning_rate": 4.996881220053029e-05, "loss": 0.469, "num_input_tokens_seen": 12458112, "step": 10265 }, { "epoch": 1.1437799309499945, "grad_norm": 0.27030590176582336, "learning_rate": 4.996856907269611e-05, "loss": 0.4732, "num_input_tokens_seen": 12463904, "step": 10270 }, { "epoch": 1.1443367858336118, "grad_norm": 0.2383967787027359, "learning_rate": 4.996832500146939e-05, "loss": 0.48, "num_input_tokens_seen": 12469952, "step": 10275 }, { "epoch": 1.1448936407172292, "grad_norm": 0.27070996165275574, "learning_rate": 4.996807998685932e-05, "loss": 0.4574, "num_input_tokens_seen": 12476384, "step": 10280 }, { "epoch": 1.1454504956008464, "grad_norm": 0.28189826011657715, "learning_rate": 4.996783402887518e-05, "loss": 0.4538, "num_input_tokens_seen": 12482656, "step": 10285 }, { "epoch": 1.1460073504844637, "grad_norm": 0.25153791904449463, "learning_rate": 4.996758712752626e-05, "loss": 0.4567, "num_input_tokens_seen": 12488896, "step": 10290 }, { "epoch": 1.1465642053680811, "grad_norm": 0.35251104831695557, "learning_rate": 4.996733928282189e-05, "loss": 0.458, "num_input_tokens_seen": 12495040, "step": 10295 }, { "epoch": 1.1471210602516984, "grad_norm": 0.25778254866600037, "learning_rate": 4.9967090494771424e-05, "loss": 0.4659, "num_input_tokens_seen": 12501088, "step": 10300 }, { "epoch": 1.1476779151353158, "grad_norm": 0.29590362310409546, "learning_rate": 4.996684076338427e-05, "loss": 0.4751, "num_input_tokens_seen": 12507040, "step": 10305 }, { "epoch": 1.148234770018933, "grad_norm": 0.31743288040161133, "learning_rate": 4.9966590088669865e-05, "loss": 0.4828, "num_input_tokens_seen": 12513152, "step": 10310 }, { "epoch": 1.1487916249025505, "grad_norm": 0.2957337200641632, "learning_rate": 4.996633847063767e-05, "loss": 0.4657, "num_input_tokens_seen": 12519264, "step": 10315 }, { "epoch": 1.1493484797861677, "grad_norm": 0.3851555585861206, "learning_rate": 4.99660859092972e-05, "loss": 0.477, "num_input_tokens_seen": 12525056, "step": 10320 }, { "epoch": 1.1499053346697852, "grad_norm": 0.35852959752082825, "learning_rate": 4.9965832404658e-05, "loss": 0.4524, "num_input_tokens_seen": 12530816, "step": 10325 }, { "epoch": 1.1504621895534024, "grad_norm": 0.2487042248249054, "learning_rate": 4.996557795672965e-05, "loss": 0.4582, "num_input_tokens_seen": 12536896, "step": 10330 }, { "epoch": 1.1510190444370196, "grad_norm": 0.24482114613056183, "learning_rate": 4.9965322565521745e-05, "loss": 0.4859, "num_input_tokens_seen": 12542880, "step": 10335 }, { "epoch": 1.151575899320637, "grad_norm": 0.3029915392398834, "learning_rate": 4.996506623104396e-05, "loss": 0.4378, "num_input_tokens_seen": 12549408, "step": 10340 }, { "epoch": 1.1521327542042543, "grad_norm": 0.20304065942764282, "learning_rate": 4.9964808953305965e-05, "loss": 0.442, "num_input_tokens_seen": 12555584, "step": 10345 }, { "epoch": 1.1526896090878718, "grad_norm": 0.23949584364891052, "learning_rate": 4.996455073231748e-05, "loss": 0.4643, "num_input_tokens_seen": 12561888, "step": 10350 }, { "epoch": 1.153246463971489, "grad_norm": 0.30984950065612793, "learning_rate": 4.996429156808827e-05, "loss": 0.4894, "num_input_tokens_seen": 12568160, "step": 10355 }, { "epoch": 1.1538033188551065, "grad_norm": 0.35121431946754456, "learning_rate": 4.996403146062812e-05, "loss": 0.4925, "num_input_tokens_seen": 12574528, "step": 10360 }, { "epoch": 1.1543601737387237, "grad_norm": 0.4326859414577484, "learning_rate": 4.9963770409946866e-05, "loss": 0.4777, "num_input_tokens_seen": 12580448, "step": 10365 }, { "epoch": 1.1549170286223411, "grad_norm": 0.24182774126529694, "learning_rate": 4.9963508416054356e-05, "loss": 0.4547, "num_input_tokens_seen": 12586496, "step": 10370 }, { "epoch": 1.1554738835059584, "grad_norm": 0.2745213508605957, "learning_rate": 4.9963245478960505e-05, "loss": 0.4677, "num_input_tokens_seen": 12592512, "step": 10375 }, { "epoch": 1.1560307383895756, "grad_norm": 0.22744274139404297, "learning_rate": 4.996298159867524e-05, "loss": 0.4478, "num_input_tokens_seen": 12598048, "step": 10380 }, { "epoch": 1.156587593273193, "grad_norm": 0.3135499954223633, "learning_rate": 4.996271677520853e-05, "loss": 0.4963, "num_input_tokens_seen": 12604064, "step": 10385 }, { "epoch": 1.1571444481568103, "grad_norm": 0.31148579716682434, "learning_rate": 4.996245100857039e-05, "loss": 0.4752, "num_input_tokens_seen": 12609856, "step": 10390 }, { "epoch": 1.1577013030404277, "grad_norm": 0.26265671849250793, "learning_rate": 4.9962184298770845e-05, "loss": 0.469, "num_input_tokens_seen": 12616032, "step": 10395 }, { "epoch": 1.158258157924045, "grad_norm": 0.202032208442688, "learning_rate": 4.996191664581998e-05, "loss": 0.4595, "num_input_tokens_seen": 12621952, "step": 10400 }, { "epoch": 1.1588150128076624, "grad_norm": 0.2491583377122879, "learning_rate": 4.9961648049727926e-05, "loss": 0.4575, "num_input_tokens_seen": 12628288, "step": 10405 }, { "epoch": 1.1593718676912796, "grad_norm": 0.25316789746284485, "learning_rate": 4.996137851050481e-05, "loss": 0.4556, "num_input_tokens_seen": 12634464, "step": 10410 }, { "epoch": 1.159928722574897, "grad_norm": 0.29338932037353516, "learning_rate": 4.996110802816081e-05, "loss": 0.466, "num_input_tokens_seen": 12640576, "step": 10415 }, { "epoch": 1.1604855774585143, "grad_norm": 0.21708525717258453, "learning_rate": 4.996083660270616e-05, "loss": 0.4705, "num_input_tokens_seen": 12646560, "step": 10420 }, { "epoch": 1.1610424323421316, "grad_norm": 0.30835166573524475, "learning_rate": 4.996056423415112e-05, "loss": 0.4627, "num_input_tokens_seen": 12652576, "step": 10425 }, { "epoch": 1.161599287225749, "grad_norm": 0.29788440465927124, "learning_rate": 4.996029092250597e-05, "loss": 0.4484, "num_input_tokens_seen": 12658784, "step": 10430 }, { "epoch": 1.1621561421093662, "grad_norm": 0.22591063380241394, "learning_rate": 4.996001666778104e-05, "loss": 0.4583, "num_input_tokens_seen": 12664832, "step": 10435 }, { "epoch": 1.1627129969929837, "grad_norm": 0.24443037807941437, "learning_rate": 4.9959741469986694e-05, "loss": 0.4682, "num_input_tokens_seen": 12670848, "step": 10440 }, { "epoch": 1.163269851876601, "grad_norm": 0.34248045086860657, "learning_rate": 4.9959465329133326e-05, "loss": 0.4688, "num_input_tokens_seen": 12676320, "step": 10445 }, { "epoch": 1.1638267067602184, "grad_norm": 0.32312870025634766, "learning_rate": 4.995918824523137e-05, "loss": 0.461, "num_input_tokens_seen": 12682240, "step": 10450 }, { "epoch": 1.1643835616438356, "grad_norm": 0.21205390989780426, "learning_rate": 4.9958910218291295e-05, "loss": 0.4569, "num_input_tokens_seen": 12688416, "step": 10455 }, { "epoch": 1.164940416527453, "grad_norm": 0.2582765817642212, "learning_rate": 4.9958631248323603e-05, "loss": 0.4477, "num_input_tokens_seen": 12694720, "step": 10460 }, { "epoch": 1.1654972714110703, "grad_norm": 0.30687224864959717, "learning_rate": 4.995835133533885e-05, "loss": 0.4514, "num_input_tokens_seen": 12700960, "step": 10465 }, { "epoch": 1.1660541262946875, "grad_norm": 0.2825932800769806, "learning_rate": 4.9958070479347597e-05, "loss": 0.4962, "num_input_tokens_seen": 12707264, "step": 10470 }, { "epoch": 1.166610981178305, "grad_norm": 0.30334725975990295, "learning_rate": 4.995778868036046e-05, "loss": 0.4745, "num_input_tokens_seen": 12713600, "step": 10475 }, { "epoch": 1.1671678360619222, "grad_norm": 0.2649296820163727, "learning_rate": 4.9957505938388084e-05, "loss": 0.4618, "num_input_tokens_seen": 12720000, "step": 10480 }, { "epoch": 1.1677246909455397, "grad_norm": 0.40975257754325867, "learning_rate": 4.995722225344115e-05, "loss": 0.4666, "num_input_tokens_seen": 12726016, "step": 10485 }, { "epoch": 1.1682815458291569, "grad_norm": 0.2868102490901947, "learning_rate": 4.995693762553037e-05, "loss": 0.4913, "num_input_tokens_seen": 12732064, "step": 10490 }, { "epoch": 1.1688384007127743, "grad_norm": 0.20972397923469543, "learning_rate": 4.995665205466653e-05, "loss": 0.4752, "num_input_tokens_seen": 12738176, "step": 10495 }, { "epoch": 1.1693952555963916, "grad_norm": 0.4136090874671936, "learning_rate": 4.995636554086039e-05, "loss": 0.4581, "num_input_tokens_seen": 12744576, "step": 10500 }, { "epoch": 1.169952110480009, "grad_norm": 0.29881471395492554, "learning_rate": 4.995607808412278e-05, "loss": 0.4785, "num_input_tokens_seen": 12750624, "step": 10505 }, { "epoch": 1.1705089653636263, "grad_norm": 0.2948661744594574, "learning_rate": 4.995578968446457e-05, "loss": 0.4849, "num_input_tokens_seen": 12756512, "step": 10510 }, { "epoch": 1.1710658202472435, "grad_norm": 0.24766281247138977, "learning_rate": 4.995550034189664e-05, "loss": 0.4668, "num_input_tokens_seen": 12762624, "step": 10515 }, { "epoch": 1.171622675130861, "grad_norm": 0.2792423665523529, "learning_rate": 4.9955210056429945e-05, "loss": 0.4811, "num_input_tokens_seen": 12768864, "step": 10520 }, { "epoch": 1.1721795300144782, "grad_norm": 0.2320905476808548, "learning_rate": 4.9954918828075436e-05, "loss": 0.4828, "num_input_tokens_seen": 12775104, "step": 10525 }, { "epoch": 1.1727363848980956, "grad_norm": 0.26632916927337646, "learning_rate": 4.9954626656844115e-05, "loss": 0.4698, "num_input_tokens_seen": 12781184, "step": 10530 }, { "epoch": 1.1732932397817128, "grad_norm": 0.26540639996528625, "learning_rate": 4.995433354274704e-05, "loss": 0.489, "num_input_tokens_seen": 12787168, "step": 10535 }, { "epoch": 1.1738500946653303, "grad_norm": 0.22423987090587616, "learning_rate": 4.9954039485795265e-05, "loss": 0.4508, "num_input_tokens_seen": 12793120, "step": 10540 }, { "epoch": 1.1744069495489475, "grad_norm": 0.25075408816337585, "learning_rate": 4.9953744485999915e-05, "loss": 0.4577, "num_input_tokens_seen": 12799104, "step": 10545 }, { "epoch": 1.174963804432565, "grad_norm": 0.2177194058895111, "learning_rate": 4.995344854337213e-05, "loss": 0.4633, "num_input_tokens_seen": 12805440, "step": 10550 }, { "epoch": 1.1755206593161822, "grad_norm": 0.2913030982017517, "learning_rate": 4.9953151657923085e-05, "loss": 0.4709, "num_input_tokens_seen": 12811136, "step": 10555 }, { "epoch": 1.1760775141997994, "grad_norm": 0.2373611479997635, "learning_rate": 4.9952853829664014e-05, "loss": 0.4771, "num_input_tokens_seen": 12817376, "step": 10560 }, { "epoch": 1.176634369083417, "grad_norm": 0.36504197120666504, "learning_rate": 4.995255505860615e-05, "loss": 0.4746, "num_input_tokens_seen": 12822944, "step": 10565 }, { "epoch": 1.1771912239670341, "grad_norm": 0.28797057271003723, "learning_rate": 4.99522553447608e-05, "loss": 0.4594, "num_input_tokens_seen": 12829312, "step": 10570 }, { "epoch": 1.1777480788506516, "grad_norm": 0.2160327136516571, "learning_rate": 4.995195468813927e-05, "loss": 0.4808, "num_input_tokens_seen": 12835296, "step": 10575 }, { "epoch": 1.1783049337342688, "grad_norm": 0.30495110154151917, "learning_rate": 4.9951653088752935e-05, "loss": 0.4771, "num_input_tokens_seen": 12841216, "step": 10580 }, { "epoch": 1.1788617886178863, "grad_norm": 0.27394574880599976, "learning_rate": 4.9951350546613184e-05, "loss": 0.4783, "num_input_tokens_seen": 12846944, "step": 10585 }, { "epoch": 1.1794186435015035, "grad_norm": 0.22150346636772156, "learning_rate": 4.9951047061731455e-05, "loss": 0.4554, "num_input_tokens_seen": 12853120, "step": 10590 }, { "epoch": 1.179975498385121, "grad_norm": 0.2845766246318817, "learning_rate": 4.99507426341192e-05, "loss": 0.4669, "num_input_tokens_seen": 12859392, "step": 10595 }, { "epoch": 1.1805323532687382, "grad_norm": 0.2469296008348465, "learning_rate": 4.995043726378793e-05, "loss": 0.4838, "num_input_tokens_seen": 12865536, "step": 10600 }, { "epoch": 1.1810892081523554, "grad_norm": 0.3921383321285248, "learning_rate": 4.995013095074919e-05, "loss": 0.4718, "num_input_tokens_seen": 12871552, "step": 10605 }, { "epoch": 1.1816460630359729, "grad_norm": 0.29509520530700684, "learning_rate": 4.994982369501454e-05, "loss": 0.4258, "num_input_tokens_seen": 12877888, "step": 10610 }, { "epoch": 1.18220291791959, "grad_norm": 0.2640121281147003, "learning_rate": 4.99495154965956e-05, "loss": 0.4176, "num_input_tokens_seen": 12884064, "step": 10615 }, { "epoch": 1.1827597728032075, "grad_norm": 0.23355065286159515, "learning_rate": 4.9949206355504e-05, "loss": 0.447, "num_input_tokens_seen": 12890304, "step": 10620 }, { "epoch": 1.1833166276868248, "grad_norm": 0.2286069542169571, "learning_rate": 4.994889627175145e-05, "loss": 0.4771, "num_input_tokens_seen": 12896352, "step": 10625 }, { "epoch": 1.1838734825704422, "grad_norm": 0.2667323350906372, "learning_rate": 4.994858524534962e-05, "loss": 0.4814, "num_input_tokens_seen": 12902784, "step": 10630 }, { "epoch": 1.1844303374540595, "grad_norm": 0.4006792902946472, "learning_rate": 4.99482732763103e-05, "loss": 0.4812, "num_input_tokens_seen": 12908928, "step": 10635 }, { "epoch": 1.184987192337677, "grad_norm": 0.3151608109474182, "learning_rate": 4.9947960364645265e-05, "loss": 0.5064, "num_input_tokens_seen": 12915040, "step": 10640 }, { "epoch": 1.1855440472212941, "grad_norm": 0.2874777019023895, "learning_rate": 4.994764651036634e-05, "loss": 0.466, "num_input_tokens_seen": 12920928, "step": 10645 }, { "epoch": 1.1861009021049114, "grad_norm": 0.28636226058006287, "learning_rate": 4.9947331713485376e-05, "loss": 0.4458, "num_input_tokens_seen": 12927072, "step": 10650 }, { "epoch": 1.1866577569885288, "grad_norm": 0.2074556201696396, "learning_rate": 4.994701597401428e-05, "loss": 0.477, "num_input_tokens_seen": 12932608, "step": 10655 }, { "epoch": 1.187214611872146, "grad_norm": 0.33359241485595703, "learning_rate": 4.994669929196497e-05, "loss": 0.4705, "num_input_tokens_seen": 12938784, "step": 10660 }, { "epoch": 1.1877714667557635, "grad_norm": 0.31605419516563416, "learning_rate": 4.994638166734942e-05, "loss": 0.4866, "num_input_tokens_seen": 12945056, "step": 10665 }, { "epoch": 1.1883283216393807, "grad_norm": 0.2793971300125122, "learning_rate": 4.994606310017962e-05, "loss": 0.4786, "num_input_tokens_seen": 12951104, "step": 10670 }, { "epoch": 1.1888851765229982, "grad_norm": 0.3485352694988251, "learning_rate": 4.9945743590467606e-05, "loss": 0.4982, "num_input_tokens_seen": 12957472, "step": 10675 }, { "epoch": 1.1894420314066154, "grad_norm": 0.3131130039691925, "learning_rate": 4.9945423138225464e-05, "loss": 0.47, "num_input_tokens_seen": 12963808, "step": 10680 }, { "epoch": 1.1899988862902329, "grad_norm": 0.2583577036857605, "learning_rate": 4.99451017434653e-05, "loss": 0.4754, "num_input_tokens_seen": 12969856, "step": 10685 }, { "epoch": 1.19055574117385, "grad_norm": 0.21345072984695435, "learning_rate": 4.994477940619924e-05, "loss": 0.459, "num_input_tokens_seen": 12975936, "step": 10690 }, { "epoch": 1.1911125960574673, "grad_norm": 0.3363204896450043, "learning_rate": 4.994445612643949e-05, "loss": 0.4758, "num_input_tokens_seen": 12982464, "step": 10695 }, { "epoch": 1.1916694509410848, "grad_norm": 0.41873905062675476, "learning_rate": 4.994413190419823e-05, "loss": 0.4829, "num_input_tokens_seen": 12987936, "step": 10700 }, { "epoch": 1.192226305824702, "grad_norm": 0.28654536604881287, "learning_rate": 4.994380673948774e-05, "loss": 0.4833, "num_input_tokens_seen": 12994208, "step": 10705 }, { "epoch": 1.1927831607083195, "grad_norm": 0.2719871699810028, "learning_rate": 4.994348063232029e-05, "loss": 0.468, "num_input_tokens_seen": 12999872, "step": 10710 }, { "epoch": 1.1933400155919367, "grad_norm": 0.2968844175338745, "learning_rate": 4.9943153582708216e-05, "loss": 0.4722, "num_input_tokens_seen": 13006080, "step": 10715 }, { "epoch": 1.1938968704755542, "grad_norm": 0.27427342534065247, "learning_rate": 4.9942825590663864e-05, "loss": 0.4505, "num_input_tokens_seen": 13012192, "step": 10720 }, { "epoch": 1.1944537253591714, "grad_norm": 0.2760929465293884, "learning_rate": 4.994249665619962e-05, "loss": 0.4901, "num_input_tokens_seen": 13018528, "step": 10725 }, { "epoch": 1.1950105802427888, "grad_norm": 0.31503862142562866, "learning_rate": 4.994216677932793e-05, "loss": 0.4577, "num_input_tokens_seen": 13025088, "step": 10730 }, { "epoch": 1.195567435126406, "grad_norm": 0.29225218296051025, "learning_rate": 4.994183596006123e-05, "loss": 0.4905, "num_input_tokens_seen": 13031104, "step": 10735 }, { "epoch": 1.1961242900100233, "grad_norm": 0.2856377065181732, "learning_rate": 4.9941504198412054e-05, "loss": 0.4746, "num_input_tokens_seen": 13037312, "step": 10740 }, { "epoch": 1.1966811448936407, "grad_norm": 0.20144392549991608, "learning_rate": 4.9941171494392914e-05, "loss": 0.4549, "num_input_tokens_seen": 13042880, "step": 10745 }, { "epoch": 1.197237999777258, "grad_norm": 0.255787193775177, "learning_rate": 4.994083784801639e-05, "loss": 0.4729, "num_input_tokens_seen": 13049024, "step": 10750 }, { "epoch": 1.1977948546608754, "grad_norm": 0.26916247606277466, "learning_rate": 4.994050325929508e-05, "loss": 0.4624, "num_input_tokens_seen": 13055328, "step": 10755 }, { "epoch": 1.1983517095444927, "grad_norm": 0.21226142346858978, "learning_rate": 4.994016772824164e-05, "loss": 0.4771, "num_input_tokens_seen": 13061280, "step": 10760 }, { "epoch": 1.1989085644281101, "grad_norm": 0.2649023234844208, "learning_rate": 4.993983125486873e-05, "loss": 0.4637, "num_input_tokens_seen": 13067264, "step": 10765 }, { "epoch": 1.1994654193117273, "grad_norm": 0.23812878131866455, "learning_rate": 4.9939493839189076e-05, "loss": 0.4703, "num_input_tokens_seen": 13073312, "step": 10770 }, { "epoch": 1.2000222741953448, "grad_norm": 0.35675644874572754, "learning_rate": 4.9939155481215424e-05, "loss": 0.4654, "num_input_tokens_seen": 13079008, "step": 10775 }, { "epoch": 1.200579129078962, "grad_norm": 0.2583823800086975, "learning_rate": 4.993881618096055e-05, "loss": 0.4499, "num_input_tokens_seen": 13085024, "step": 10780 }, { "epoch": 1.2011359839625793, "grad_norm": 0.3440406024456024, "learning_rate": 4.993847593843729e-05, "loss": 0.4677, "num_input_tokens_seen": 13090848, "step": 10785 }, { "epoch": 1.2016928388461967, "grad_norm": 0.34675151109695435, "learning_rate": 4.993813475365848e-05, "loss": 0.4706, "num_input_tokens_seen": 13097088, "step": 10790 }, { "epoch": 1.202249693729814, "grad_norm": 0.2226720154285431, "learning_rate": 4.993779262663703e-05, "loss": 0.4789, "num_input_tokens_seen": 13102912, "step": 10795 }, { "epoch": 1.2028065486134314, "grad_norm": 0.34938305616378784, "learning_rate": 4.9937449557385845e-05, "loss": 0.4498, "num_input_tokens_seen": 13108896, "step": 10800 }, { "epoch": 1.2033634034970486, "grad_norm": 0.29716891050338745, "learning_rate": 4.9937105545917905e-05, "loss": 0.4856, "num_input_tokens_seen": 13115040, "step": 10805 }, { "epoch": 1.203920258380666, "grad_norm": 0.3102687895298004, "learning_rate": 4.993676059224621e-05, "loss": 0.4618, "num_input_tokens_seen": 13121216, "step": 10810 }, { "epoch": 1.2044771132642833, "grad_norm": 0.20695842802524567, "learning_rate": 4.9936414696383784e-05, "loss": 0.472, "num_input_tokens_seen": 13127040, "step": 10815 }, { "epoch": 1.2050339681479008, "grad_norm": 0.22416497766971588, "learning_rate": 4.993606785834369e-05, "loss": 0.475, "num_input_tokens_seen": 13132864, "step": 10820 }, { "epoch": 1.205590823031518, "grad_norm": 0.22936558723449707, "learning_rate": 4.9935720078139045e-05, "loss": 0.4639, "num_input_tokens_seen": 13139072, "step": 10825 }, { "epoch": 1.2061476779151352, "grad_norm": 0.24513497948646545, "learning_rate": 4.9935371355782986e-05, "loss": 0.4793, "num_input_tokens_seen": 13144832, "step": 10830 }, { "epoch": 1.2067045327987527, "grad_norm": 0.24798762798309326, "learning_rate": 4.993502169128869e-05, "loss": 0.4508, "num_input_tokens_seen": 13150848, "step": 10835 }, { "epoch": 1.20726138768237, "grad_norm": 0.2568886876106262, "learning_rate": 4.993467108466936e-05, "loss": 0.472, "num_input_tokens_seen": 13156704, "step": 10840 }, { "epoch": 1.2078182425659874, "grad_norm": 0.27240657806396484, "learning_rate": 4.993431953593826e-05, "loss": 0.45, "num_input_tokens_seen": 13163072, "step": 10845 }, { "epoch": 1.2083750974496046, "grad_norm": 0.25998061895370483, "learning_rate": 4.993396704510865e-05, "loss": 0.4702, "num_input_tokens_seen": 13169056, "step": 10850 }, { "epoch": 1.208931952333222, "grad_norm": 0.2811622619628906, "learning_rate": 4.9933613612193864e-05, "loss": 0.4766, "num_input_tokens_seen": 13175264, "step": 10855 }, { "epoch": 1.2094888072168393, "grad_norm": 0.2748408019542694, "learning_rate": 4.993325923720725e-05, "loss": 0.4825, "num_input_tokens_seen": 13181568, "step": 10860 }, { "epoch": 1.2100456621004567, "grad_norm": 0.30448976159095764, "learning_rate": 4.993290392016221e-05, "loss": 0.4746, "num_input_tokens_seen": 13187712, "step": 10865 }, { "epoch": 1.210602516984074, "grad_norm": 0.3395589590072632, "learning_rate": 4.993254766107215e-05, "loss": 0.4764, "num_input_tokens_seen": 13193952, "step": 10870 }, { "epoch": 1.2111593718676912, "grad_norm": 0.2746986746788025, "learning_rate": 4.9932190459950534e-05, "loss": 0.4944, "num_input_tokens_seen": 13199968, "step": 10875 }, { "epoch": 1.2117162267513086, "grad_norm": 0.3240812420845032, "learning_rate": 4.9931832316810864e-05, "loss": 0.478, "num_input_tokens_seen": 13206112, "step": 10880 }, { "epoch": 1.2122730816349259, "grad_norm": 0.36417222023010254, "learning_rate": 4.993147323166668e-05, "loss": 0.4763, "num_input_tokens_seen": 13212256, "step": 10885 }, { "epoch": 1.2128299365185433, "grad_norm": 0.24035489559173584, "learning_rate": 4.9931113204531544e-05, "loss": 0.4539, "num_input_tokens_seen": 13218240, "step": 10890 }, { "epoch": 1.2133867914021605, "grad_norm": 0.33184337615966797, "learning_rate": 4.993075223541904e-05, "loss": 0.4784, "num_input_tokens_seen": 13224256, "step": 10895 }, { "epoch": 1.213943646285778, "grad_norm": 0.28213465213775635, "learning_rate": 4.993039032434283e-05, "loss": 0.4837, "num_input_tokens_seen": 13230880, "step": 10900 }, { "epoch": 1.2145005011693952, "grad_norm": 0.25265970826148987, "learning_rate": 4.993002747131659e-05, "loss": 0.4714, "num_input_tokens_seen": 13237344, "step": 10905 }, { "epoch": 1.2150573560530127, "grad_norm": 0.28852108120918274, "learning_rate": 4.992966367635401e-05, "loss": 0.4549, "num_input_tokens_seen": 13243520, "step": 10910 }, { "epoch": 1.21561421093663, "grad_norm": 0.3840990960597992, "learning_rate": 4.9929298939468844e-05, "loss": 0.4717, "num_input_tokens_seen": 13249440, "step": 10915 }, { "epoch": 1.2161710658202471, "grad_norm": 0.2945835292339325, "learning_rate": 4.992893326067488e-05, "loss": 0.467, "num_input_tokens_seen": 13255168, "step": 10920 }, { "epoch": 1.2167279207038646, "grad_norm": 0.24183642864227295, "learning_rate": 4.9928566639985916e-05, "loss": 0.4767, "num_input_tokens_seen": 13260896, "step": 10925 }, { "epoch": 1.2172847755874818, "grad_norm": 0.3559440076351166, "learning_rate": 4.992819907741583e-05, "loss": 0.4819, "num_input_tokens_seen": 13266880, "step": 10930 }, { "epoch": 1.2178416304710993, "grad_norm": 0.3002265691757202, "learning_rate": 4.9927830572978495e-05, "loss": 0.4971, "num_input_tokens_seen": 13272736, "step": 10935 }, { "epoch": 1.2183984853547165, "grad_norm": 0.3185635805130005, "learning_rate": 4.992746112668783e-05, "loss": 0.4725, "num_input_tokens_seen": 13278208, "step": 10940 }, { "epoch": 1.218955340238334, "grad_norm": 0.23062042891979218, "learning_rate": 4.99270907385578e-05, "loss": 0.458, "num_input_tokens_seen": 13284352, "step": 10945 }, { "epoch": 1.2195121951219512, "grad_norm": 0.30409735441207886, "learning_rate": 4.992671940860241e-05, "loss": 0.4444, "num_input_tokens_seen": 13290624, "step": 10950 }, { "epoch": 1.2200690500055686, "grad_norm": 0.2038951963186264, "learning_rate": 4.9926347136835664e-05, "loss": 0.4839, "num_input_tokens_seen": 13296576, "step": 10955 }, { "epoch": 1.2206259048891859, "grad_norm": 0.20157839357852936, "learning_rate": 4.9925973923271654e-05, "loss": 0.4518, "num_input_tokens_seen": 13302080, "step": 10960 }, { "epoch": 1.221182759772803, "grad_norm": 0.24432970583438873, "learning_rate": 4.992559976792447e-05, "loss": 0.4786, "num_input_tokens_seen": 13308480, "step": 10965 }, { "epoch": 1.2217396146564206, "grad_norm": 0.2626798450946808, "learning_rate": 4.992522467080824e-05, "loss": 0.4768, "num_input_tokens_seen": 13314688, "step": 10970 }, { "epoch": 1.2222964695400378, "grad_norm": 0.29337772727012634, "learning_rate": 4.992484863193715e-05, "loss": 0.4916, "num_input_tokens_seen": 13319968, "step": 10975 }, { "epoch": 1.2228533244236552, "grad_norm": 0.34259462356567383, "learning_rate": 4.99244716513254e-05, "loss": 0.4945, "num_input_tokens_seen": 13325824, "step": 10980 }, { "epoch": 1.2234101793072725, "grad_norm": 0.2218751609325409, "learning_rate": 4.9924093728987244e-05, "loss": 0.4659, "num_input_tokens_seen": 13331904, "step": 10985 }, { "epoch": 1.22396703419089, "grad_norm": 0.2933860719203949, "learning_rate": 4.992371486493694e-05, "loss": 0.4814, "num_input_tokens_seen": 13336928, "step": 10990 }, { "epoch": 1.2245238890745072, "grad_norm": 0.2989635169506073, "learning_rate": 4.9923335059188825e-05, "loss": 0.4803, "num_input_tokens_seen": 13342784, "step": 10995 }, { "epoch": 1.2250807439581246, "grad_norm": 0.30715277791023254, "learning_rate": 4.992295431175724e-05, "loss": 0.482, "num_input_tokens_seen": 13349248, "step": 11000 }, { "epoch": 1.2256375988417418, "grad_norm": 0.23241253197193146, "learning_rate": 4.9922572622656575e-05, "loss": 0.4756, "num_input_tokens_seen": 13355296, "step": 11005 }, { "epoch": 1.226194453725359, "grad_norm": 0.2746027410030365, "learning_rate": 4.992218999190125e-05, "loss": 0.4613, "num_input_tokens_seen": 13361376, "step": 11010 }, { "epoch": 1.2267513086089765, "grad_norm": 0.3067416846752167, "learning_rate": 4.9921806419505715e-05, "loss": 0.4665, "num_input_tokens_seen": 13367584, "step": 11015 }, { "epoch": 1.2273081634925938, "grad_norm": 0.46129557490348816, "learning_rate": 4.9921421905484465e-05, "loss": 0.4577, "num_input_tokens_seen": 13374016, "step": 11020 }, { "epoch": 1.2278650183762112, "grad_norm": 0.254402220249176, "learning_rate": 4.992103644985204e-05, "loss": 0.4647, "num_input_tokens_seen": 13380064, "step": 11025 }, { "epoch": 1.2284218732598284, "grad_norm": 0.27045100927352905, "learning_rate": 4.992065005262299e-05, "loss": 0.492, "num_input_tokens_seen": 13386016, "step": 11030 }, { "epoch": 1.2289787281434459, "grad_norm": 0.22349552810192108, "learning_rate": 4.992026271381192e-05, "loss": 0.4871, "num_input_tokens_seen": 13391616, "step": 11035 }, { "epoch": 1.2295355830270631, "grad_norm": 0.31679823994636536, "learning_rate": 4.991987443343345e-05, "loss": 0.4703, "num_input_tokens_seen": 13397568, "step": 11040 }, { "epoch": 1.2300924379106806, "grad_norm": 0.2632501423358917, "learning_rate": 4.9919485211502285e-05, "loss": 0.4541, "num_input_tokens_seen": 13403808, "step": 11045 }, { "epoch": 1.2306492927942978, "grad_norm": 0.3254912197589874, "learning_rate": 4.991909504803309e-05, "loss": 0.4567, "num_input_tokens_seen": 13409920, "step": 11050 }, { "epoch": 1.231206147677915, "grad_norm": 0.23572753369808197, "learning_rate": 4.9918703943040644e-05, "loss": 0.4621, "num_input_tokens_seen": 13416000, "step": 11055 }, { "epoch": 1.2317630025615325, "grad_norm": 0.2725323438644409, "learning_rate": 4.99183118965397e-05, "loss": 0.4677, "num_input_tokens_seen": 13422176, "step": 11060 }, { "epoch": 1.2323198574451497, "grad_norm": 0.1975044161081314, "learning_rate": 4.991791890854508e-05, "loss": 0.4652, "num_input_tokens_seen": 13428352, "step": 11065 }, { "epoch": 1.2328767123287672, "grad_norm": 0.23677363991737366, "learning_rate": 4.991752497907163e-05, "loss": 0.4571, "num_input_tokens_seen": 13434336, "step": 11070 }, { "epoch": 1.2334335672123844, "grad_norm": 0.2278011590242386, "learning_rate": 4.991713010813424e-05, "loss": 0.4578, "num_input_tokens_seen": 13440416, "step": 11075 }, { "epoch": 1.2339904220960018, "grad_norm": 0.22057950496673584, "learning_rate": 4.991673429574781e-05, "loss": 0.4765, "num_input_tokens_seen": 13446496, "step": 11080 }, { "epoch": 1.234547276979619, "grad_norm": 0.24512557685375214, "learning_rate": 4.991633754192732e-05, "loss": 0.4881, "num_input_tokens_seen": 13452800, "step": 11085 }, { "epoch": 1.2351041318632365, "grad_norm": 0.29223424196243286, "learning_rate": 4.9915939846687745e-05, "loss": 0.4678, "num_input_tokens_seen": 13458688, "step": 11090 }, { "epoch": 1.2356609867468538, "grad_norm": 0.2587597072124481, "learning_rate": 4.991554121004411e-05, "loss": 0.4637, "num_input_tokens_seen": 13464736, "step": 11095 }, { "epoch": 1.236217841630471, "grad_norm": 0.2549957036972046, "learning_rate": 4.9915141632011484e-05, "loss": 0.4887, "num_input_tokens_seen": 13470720, "step": 11100 }, { "epoch": 1.2367746965140884, "grad_norm": 0.3115040957927704, "learning_rate": 4.991474111260497e-05, "loss": 0.4724, "num_input_tokens_seen": 13477120, "step": 11105 }, { "epoch": 1.2373315513977057, "grad_norm": 0.17470388114452362, "learning_rate": 4.991433965183969e-05, "loss": 0.4592, "num_input_tokens_seen": 13483264, "step": 11110 }, { "epoch": 1.2378884062813231, "grad_norm": 0.2361675351858139, "learning_rate": 4.9913937249730817e-05, "loss": 0.4807, "num_input_tokens_seen": 13489344, "step": 11115 }, { "epoch": 1.2384452611649404, "grad_norm": 0.15746554732322693, "learning_rate": 4.9913533906293554e-05, "loss": 0.4492, "num_input_tokens_seen": 13495392, "step": 11120 }, { "epoch": 1.2390021160485578, "grad_norm": 0.2893928289413452, "learning_rate": 4.991312962154314e-05, "loss": 0.4892, "num_input_tokens_seen": 13501760, "step": 11125 }, { "epoch": 1.239558970932175, "grad_norm": 0.3145378828048706, "learning_rate": 4.991272439549485e-05, "loss": 0.4669, "num_input_tokens_seen": 13507904, "step": 11130 }, { "epoch": 1.2401158258157925, "grad_norm": 0.21532346308231354, "learning_rate": 4.9912318228163994e-05, "loss": 0.454, "num_input_tokens_seen": 13513888, "step": 11135 }, { "epoch": 1.2406726806994097, "grad_norm": 0.3131033778190613, "learning_rate": 4.9911911119565926e-05, "loss": 0.488, "num_input_tokens_seen": 13519904, "step": 11140 }, { "epoch": 1.241229535583027, "grad_norm": 0.2709996700286865, "learning_rate": 4.9911503069716015e-05, "loss": 0.4682, "num_input_tokens_seen": 13525856, "step": 11145 }, { "epoch": 1.2417863904666444, "grad_norm": 0.28899139165878296, "learning_rate": 4.991109407862969e-05, "loss": 0.47, "num_input_tokens_seen": 13532000, "step": 11150 }, { "epoch": 1.2423432453502616, "grad_norm": 0.2717733681201935, "learning_rate": 4.991068414632239e-05, "loss": 0.462, "num_input_tokens_seen": 13537984, "step": 11155 }, { "epoch": 1.242900100233879, "grad_norm": 0.32372453808784485, "learning_rate": 4.991027327280963e-05, "loss": 0.4877, "num_input_tokens_seen": 13544192, "step": 11160 }, { "epoch": 1.2434569551174963, "grad_norm": 0.2230662703514099, "learning_rate": 4.99098614581069e-05, "loss": 0.4701, "num_input_tokens_seen": 13550368, "step": 11165 }, { "epoch": 1.2440138100011138, "grad_norm": 0.2974305748939514, "learning_rate": 4.990944870222979e-05, "loss": 0.4553, "num_input_tokens_seen": 13556480, "step": 11170 }, { "epoch": 1.244570664884731, "grad_norm": 0.325433611869812, "learning_rate": 4.990903500519387e-05, "loss": 0.4674, "num_input_tokens_seen": 13562400, "step": 11175 }, { "epoch": 1.2451275197683485, "grad_norm": 0.3263052701950073, "learning_rate": 4.990862036701479e-05, "loss": 0.4612, "num_input_tokens_seen": 13568576, "step": 11180 }, { "epoch": 1.2456843746519657, "grad_norm": 0.2598559260368347, "learning_rate": 4.9908204787708205e-05, "loss": 0.4979, "num_input_tokens_seen": 13574784, "step": 11185 }, { "epoch": 1.246241229535583, "grad_norm": 0.28116703033447266, "learning_rate": 4.990778826728982e-05, "loss": 0.4822, "num_input_tokens_seen": 13580864, "step": 11190 }, { "epoch": 1.2467980844192004, "grad_norm": 0.2135390341281891, "learning_rate": 4.9907370805775376e-05, "loss": 0.4489, "num_input_tokens_seen": 13586912, "step": 11195 }, { "epoch": 1.2473549393028176, "grad_norm": 0.23643679916858673, "learning_rate": 4.9906952403180646e-05, "loss": 0.4634, "num_input_tokens_seen": 13593152, "step": 11200 }, { "epoch": 1.247911794186435, "grad_norm": 0.27310875058174133, "learning_rate": 4.990653305952143e-05, "loss": 0.4566, "num_input_tokens_seen": 13599264, "step": 11205 }, { "epoch": 1.2484686490700523, "grad_norm": 0.22710338234901428, "learning_rate": 4.990611277481358e-05, "loss": 0.461, "num_input_tokens_seen": 13605504, "step": 11210 }, { "epoch": 1.2490255039536697, "grad_norm": 0.2951323091983795, "learning_rate": 4.990569154907298e-05, "loss": 0.4909, "num_input_tokens_seen": 13611776, "step": 11215 }, { "epoch": 1.249582358837287, "grad_norm": 0.26834508776664734, "learning_rate": 4.990526938231553e-05, "loss": 0.4916, "num_input_tokens_seen": 13617920, "step": 11220 }, { "epoch": 1.2501392137209044, "grad_norm": 0.2847573459148407, "learning_rate": 4.99048462745572e-05, "loss": 0.475, "num_input_tokens_seen": 13624000, "step": 11225 }, { "epoch": 1.2506960686045216, "grad_norm": 0.41488271951675415, "learning_rate": 4.9904422225813966e-05, "loss": 0.4965, "num_input_tokens_seen": 13630432, "step": 11230 }, { "epoch": 1.2512529234881389, "grad_norm": 0.4295205771923065, "learning_rate": 4.990399723610184e-05, "loss": 0.4511, "num_input_tokens_seen": 13636672, "step": 11235 }, { "epoch": 1.2518097783717563, "grad_norm": 0.21236564218997955, "learning_rate": 4.990357130543689e-05, "loss": 0.4929, "num_input_tokens_seen": 13642176, "step": 11240 }, { "epoch": 1.2523666332553738, "grad_norm": 0.22911667823791504, "learning_rate": 4.990314443383521e-05, "loss": 0.4748, "num_input_tokens_seen": 13648256, "step": 11245 }, { "epoch": 1.252923488138991, "grad_norm": 0.26111939549446106, "learning_rate": 4.990271662131294e-05, "loss": 0.4488, "num_input_tokens_seen": 13654336, "step": 11250 }, { "epoch": 1.2534803430226082, "grad_norm": 0.8012601137161255, "learning_rate": 4.990228786788622e-05, "loss": 0.4823, "num_input_tokens_seen": 13660704, "step": 11255 }, { "epoch": 1.2540371979062257, "grad_norm": 0.225472092628479, "learning_rate": 4.9901858173571256e-05, "loss": 0.4761, "num_input_tokens_seen": 13666880, "step": 11260 }, { "epoch": 1.254594052789843, "grad_norm": 0.20174935460090637, "learning_rate": 4.9901427538384296e-05, "loss": 0.4646, "num_input_tokens_seen": 13672224, "step": 11265 }, { "epoch": 1.2551509076734604, "grad_norm": 0.2058456987142563, "learning_rate": 4.9900995962341604e-05, "loss": 0.4595, "num_input_tokens_seen": 13678592, "step": 11270 }, { "epoch": 1.2557077625570776, "grad_norm": 0.25865229964256287, "learning_rate": 4.990056344545948e-05, "loss": 0.4726, "num_input_tokens_seen": 13684512, "step": 11275 }, { "epoch": 1.2562646174406948, "grad_norm": 0.3311750888824463, "learning_rate": 4.990012998775428e-05, "loss": 0.4748, "num_input_tokens_seen": 13690688, "step": 11280 }, { "epoch": 1.2568214723243123, "grad_norm": 0.251003623008728, "learning_rate": 4.9899695589242365e-05, "loss": 0.4655, "num_input_tokens_seen": 13696608, "step": 11285 }, { "epoch": 1.2573783272079297, "grad_norm": 0.2624037563800812, "learning_rate": 4.989926024994016e-05, "loss": 0.4688, "num_input_tokens_seen": 13702208, "step": 11290 }, { "epoch": 1.257935182091547, "grad_norm": 0.20662391185760498, "learning_rate": 4.989882396986411e-05, "loss": 0.4751, "num_input_tokens_seen": 13708160, "step": 11295 }, { "epoch": 1.2584920369751642, "grad_norm": 0.2148197740316391, "learning_rate": 4.989838674903069e-05, "loss": 0.4577, "num_input_tokens_seen": 13713664, "step": 11300 }, { "epoch": 1.2590488918587817, "grad_norm": 0.24715584516525269, "learning_rate": 4.9897948587456434e-05, "loss": 0.4822, "num_input_tokens_seen": 13719776, "step": 11305 }, { "epoch": 1.259605746742399, "grad_norm": 0.4193404018878937, "learning_rate": 4.989750948515789e-05, "loss": 0.471, "num_input_tokens_seen": 13726080, "step": 11310 }, { "epoch": 1.2601626016260163, "grad_norm": 0.23603856563568115, "learning_rate": 4.989706944215165e-05, "loss": 0.492, "num_input_tokens_seen": 13732160, "step": 11315 }, { "epoch": 1.2607194565096336, "grad_norm": 0.2597726583480835, "learning_rate": 4.989662845845434e-05, "loss": 0.4528, "num_input_tokens_seen": 13738560, "step": 11320 }, { "epoch": 1.2612763113932508, "grad_norm": 0.32341268658638, "learning_rate": 4.9896186534082625e-05, "loss": 0.4667, "num_input_tokens_seen": 13744576, "step": 11325 }, { "epoch": 1.2618331662768683, "grad_norm": 0.2799995541572571, "learning_rate": 4.9895743669053193e-05, "loss": 0.4832, "num_input_tokens_seen": 13750624, "step": 11330 }, { "epoch": 1.2623900211604857, "grad_norm": 0.2991476058959961, "learning_rate": 4.9895299863382784e-05, "loss": 0.4674, "num_input_tokens_seen": 13756384, "step": 11335 }, { "epoch": 1.262946876044103, "grad_norm": 0.24633638560771942, "learning_rate": 4.989485511708817e-05, "loss": 0.4616, "num_input_tokens_seen": 13761952, "step": 11340 }, { "epoch": 1.2635037309277202, "grad_norm": 0.33173519372940063, "learning_rate": 4.989440943018614e-05, "loss": 0.4793, "num_input_tokens_seen": 13767968, "step": 11345 }, { "epoch": 1.2640605858113376, "grad_norm": 0.2878459393978119, "learning_rate": 4.989396280269355e-05, "loss": 0.4694, "num_input_tokens_seen": 13774016, "step": 11350 }, { "epoch": 1.2646174406949549, "grad_norm": 0.21328717470169067, "learning_rate": 4.989351523462727e-05, "loss": 0.4628, "num_input_tokens_seen": 13780000, "step": 11355 }, { "epoch": 1.2651742955785723, "grad_norm": 0.2108580321073532, "learning_rate": 4.989306672600421e-05, "loss": 0.4713, "num_input_tokens_seen": 13785984, "step": 11360 }, { "epoch": 1.2657311504621895, "grad_norm": 0.21664166450500488, "learning_rate": 4.989261727684131e-05, "loss": 0.4712, "num_input_tokens_seen": 13791968, "step": 11365 }, { "epoch": 1.2662880053458068, "grad_norm": 0.20919135212898254, "learning_rate": 4.9892166887155567e-05, "loss": 0.4498, "num_input_tokens_seen": 13797856, "step": 11370 }, { "epoch": 1.2668448602294242, "grad_norm": 0.3018151521682739, "learning_rate": 4.989171555696398e-05, "loss": 0.4529, "num_input_tokens_seen": 13804064, "step": 11375 }, { "epoch": 1.2674017151130417, "grad_norm": 0.33304712176322937, "learning_rate": 4.9891263286283604e-05, "loss": 0.4793, "num_input_tokens_seen": 13810304, "step": 11380 }, { "epoch": 1.267958569996659, "grad_norm": 0.26807573437690735, "learning_rate": 4.9890810075131545e-05, "loss": 0.4721, "num_input_tokens_seen": 13816416, "step": 11385 }, { "epoch": 1.2685154248802761, "grad_norm": 0.20839184522628784, "learning_rate": 4.989035592352491e-05, "loss": 0.4754, "num_input_tokens_seen": 13822560, "step": 11390 }, { "epoch": 1.2690722797638936, "grad_norm": 0.2767036259174347, "learning_rate": 4.9889900831480865e-05, "loss": 0.4881, "num_input_tokens_seen": 13828800, "step": 11395 }, { "epoch": 1.2696291346475108, "grad_norm": 0.2529980540275574, "learning_rate": 4.9889444799016605e-05, "loss": 0.4647, "num_input_tokens_seen": 13834880, "step": 11400 }, { "epoch": 1.2701859895311283, "grad_norm": 0.27018776535987854, "learning_rate": 4.988898782614935e-05, "loss": 0.4596, "num_input_tokens_seen": 13841248, "step": 11405 }, { "epoch": 1.2707428444147455, "grad_norm": 0.2044999897480011, "learning_rate": 4.988852991289638e-05, "loss": 0.5012, "num_input_tokens_seen": 13847200, "step": 11410 }, { "epoch": 1.2712996992983627, "grad_norm": 0.2517634332180023, "learning_rate": 4.988807105927499e-05, "loss": 0.4816, "num_input_tokens_seen": 13852992, "step": 11415 }, { "epoch": 1.2718565541819802, "grad_norm": 0.23293541371822357, "learning_rate": 4.988761126530252e-05, "loss": 0.4594, "num_input_tokens_seen": 13858912, "step": 11420 }, { "epoch": 1.2724134090655976, "grad_norm": 0.29703953862190247, "learning_rate": 4.988715053099634e-05, "loss": 0.4814, "num_input_tokens_seen": 13864992, "step": 11425 }, { "epoch": 1.2729702639492149, "grad_norm": 0.2924721837043762, "learning_rate": 4.9886688856373856e-05, "loss": 0.4748, "num_input_tokens_seen": 13871104, "step": 11430 }, { "epoch": 1.273527118832832, "grad_norm": 0.22151783108711243, "learning_rate": 4.988622624145251e-05, "loss": 0.4699, "num_input_tokens_seen": 13877376, "step": 11435 }, { "epoch": 1.2740839737164495, "grad_norm": 0.20826131105422974, "learning_rate": 4.988576268624979e-05, "loss": 0.4753, "num_input_tokens_seen": 13883712, "step": 11440 }, { "epoch": 1.2746408286000668, "grad_norm": 0.21417716145515442, "learning_rate": 4.9885298190783203e-05, "loss": 0.4681, "num_input_tokens_seen": 13889824, "step": 11445 }, { "epoch": 1.2751976834836842, "grad_norm": 0.17153310775756836, "learning_rate": 4.98848327550703e-05, "loss": 0.4691, "num_input_tokens_seen": 13895776, "step": 11450 }, { "epoch": 1.2757545383673015, "grad_norm": 0.23399479687213898, "learning_rate": 4.988436637912867e-05, "loss": 0.4563, "num_input_tokens_seen": 13901984, "step": 11455 }, { "epoch": 1.2763113932509187, "grad_norm": 0.21300853788852692, "learning_rate": 4.988389906297593e-05, "loss": 0.4721, "num_input_tokens_seen": 13908352, "step": 11460 }, { "epoch": 1.2768682481345361, "grad_norm": 0.33765965700149536, "learning_rate": 4.9883430806629746e-05, "loss": 0.4822, "num_input_tokens_seen": 13914496, "step": 11465 }, { "epoch": 1.2774251030181536, "grad_norm": 0.3430984914302826, "learning_rate": 4.98829616101078e-05, "loss": 0.4599, "num_input_tokens_seen": 13920960, "step": 11470 }, { "epoch": 1.2779819579017708, "grad_norm": 0.3277798295021057, "learning_rate": 4.988249147342782e-05, "loss": 0.4716, "num_input_tokens_seen": 13927008, "step": 11475 }, { "epoch": 1.278538812785388, "grad_norm": 0.2355988770723343, "learning_rate": 4.9882020396607574e-05, "loss": 0.4687, "num_input_tokens_seen": 13932992, "step": 11480 }, { "epoch": 1.2790956676690055, "grad_norm": 0.3588848412036896, "learning_rate": 4.988154837966486e-05, "loss": 0.4761, "num_input_tokens_seen": 13939072, "step": 11485 }, { "epoch": 1.2796525225526227, "grad_norm": 0.24528659880161285, "learning_rate": 4.9881075422617505e-05, "loss": 0.4543, "num_input_tokens_seen": 13945248, "step": 11490 }, { "epoch": 1.2802093774362402, "grad_norm": 0.29645809531211853, "learning_rate": 4.988060152548339e-05, "loss": 0.455, "num_input_tokens_seen": 13951712, "step": 11495 }, { "epoch": 1.2807662323198574, "grad_norm": 0.202579528093338, "learning_rate": 4.988012668828042e-05, "loss": 0.4654, "num_input_tokens_seen": 13957728, "step": 11500 }, { "epoch": 1.2813230872034747, "grad_norm": 0.24200640618801117, "learning_rate": 4.987965091102653e-05, "loss": 0.4518, "num_input_tokens_seen": 13963936, "step": 11505 }, { "epoch": 1.281879942087092, "grad_norm": 0.21064700186252594, "learning_rate": 4.9879174193739695e-05, "loss": 0.4694, "num_input_tokens_seen": 13970144, "step": 11510 }, { "epoch": 1.2824367969707096, "grad_norm": 0.2652631103992462, "learning_rate": 4.987869653643793e-05, "loss": 0.46, "num_input_tokens_seen": 13976064, "step": 11515 }, { "epoch": 1.2829936518543268, "grad_norm": 0.25229817628860474, "learning_rate": 4.987821793913928e-05, "loss": 0.4671, "num_input_tokens_seen": 13982080, "step": 11520 }, { "epoch": 1.283550506737944, "grad_norm": 0.2206130176782608, "learning_rate": 4.9877738401861835e-05, "loss": 0.4953, "num_input_tokens_seen": 13987936, "step": 11525 }, { "epoch": 1.2841073616215615, "grad_norm": 0.2103908360004425, "learning_rate": 4.987725792462371e-05, "loss": 0.4728, "num_input_tokens_seen": 13993856, "step": 11530 }, { "epoch": 1.2846642165051787, "grad_norm": 0.19293411076068878, "learning_rate": 4.987677650744305e-05, "loss": 0.471, "num_input_tokens_seen": 14000480, "step": 11535 }, { "epoch": 1.2852210713887962, "grad_norm": 0.22956649959087372, "learning_rate": 4.9876294150338054e-05, "loss": 0.4681, "num_input_tokens_seen": 14006752, "step": 11540 }, { "epoch": 1.2857779262724134, "grad_norm": 0.26625150442123413, "learning_rate": 4.9875810853326946e-05, "loss": 0.49, "num_input_tokens_seen": 14012704, "step": 11545 }, { "epoch": 1.2863347811560306, "grad_norm": 0.20174209773540497, "learning_rate": 4.987532661642799e-05, "loss": 0.4595, "num_input_tokens_seen": 14018272, "step": 11550 }, { "epoch": 1.286891636039648, "grad_norm": 0.269908607006073, "learning_rate": 4.987484143965947e-05, "loss": 0.4791, "num_input_tokens_seen": 14024544, "step": 11555 }, { "epoch": 1.2874484909232655, "grad_norm": 0.2834090292453766, "learning_rate": 4.987435532303973e-05, "loss": 0.4587, "num_input_tokens_seen": 14030656, "step": 11560 }, { "epoch": 1.2880053458068828, "grad_norm": 0.2284105122089386, "learning_rate": 4.987386826658713e-05, "loss": 0.4773, "num_input_tokens_seen": 14036864, "step": 11565 }, { "epoch": 1.2885622006905, "grad_norm": 0.18867556750774384, "learning_rate": 4.9873380270320083e-05, "loss": 0.4818, "num_input_tokens_seen": 14042976, "step": 11570 }, { "epoch": 1.2891190555741174, "grad_norm": 0.2799842357635498, "learning_rate": 4.987289133425701e-05, "loss": 0.4841, "num_input_tokens_seen": 14048928, "step": 11575 }, { "epoch": 1.2896759104577347, "grad_norm": 0.26165932416915894, "learning_rate": 4.987240145841639e-05, "loss": 0.4804, "num_input_tokens_seen": 14054976, "step": 11580 }, { "epoch": 1.2902327653413521, "grad_norm": 0.24368970096111298, "learning_rate": 4.9871910642816746e-05, "loss": 0.4833, "num_input_tokens_seen": 14060896, "step": 11585 }, { "epoch": 1.2907896202249693, "grad_norm": 0.22465603053569794, "learning_rate": 4.98714188874766e-05, "loss": 0.472, "num_input_tokens_seen": 14066368, "step": 11590 }, { "epoch": 1.2913464751085866, "grad_norm": 0.2872973382472992, "learning_rate": 4.9870926192414554e-05, "loss": 0.4411, "num_input_tokens_seen": 14072352, "step": 11595 }, { "epoch": 1.291903329992204, "grad_norm": 0.26016125082969666, "learning_rate": 4.9870432557649206e-05, "loss": 0.4997, "num_input_tokens_seen": 14078688, "step": 11600 }, { "epoch": 1.2924601848758215, "grad_norm": 0.2454078197479248, "learning_rate": 4.986993798319922e-05, "loss": 0.4678, "num_input_tokens_seen": 14084512, "step": 11605 }, { "epoch": 1.2930170397594387, "grad_norm": 0.187686026096344, "learning_rate": 4.986944246908327e-05, "loss": 0.4631, "num_input_tokens_seen": 14090432, "step": 11610 }, { "epoch": 1.293573894643056, "grad_norm": 0.25181370973587036, "learning_rate": 4.9868946015320094e-05, "loss": 0.4776, "num_input_tokens_seen": 14095904, "step": 11615 }, { "epoch": 1.2941307495266734, "grad_norm": 0.39244258403778076, "learning_rate": 4.986844862192844e-05, "loss": 0.4683, "num_input_tokens_seen": 14102240, "step": 11620 }, { "epoch": 1.2946876044102906, "grad_norm": 0.2127581089735031, "learning_rate": 4.98679502889271e-05, "loss": 0.4668, "num_input_tokens_seen": 14108672, "step": 11625 }, { "epoch": 1.295244459293908, "grad_norm": 0.2597352862358093, "learning_rate": 4.986745101633491e-05, "loss": 0.4574, "num_input_tokens_seen": 14114784, "step": 11630 }, { "epoch": 1.2958013141775253, "grad_norm": 0.22413471341133118, "learning_rate": 4.986695080417072e-05, "loss": 0.47, "num_input_tokens_seen": 14120992, "step": 11635 }, { "epoch": 1.2963581690611425, "grad_norm": 0.1531079113483429, "learning_rate": 4.986644965245345e-05, "loss": 0.4671, "num_input_tokens_seen": 14127392, "step": 11640 }, { "epoch": 1.29691502394476, "grad_norm": 0.24933837354183197, "learning_rate": 4.9865947561202006e-05, "loss": 0.4766, "num_input_tokens_seen": 14133376, "step": 11645 }, { "epoch": 1.2974718788283774, "grad_norm": 0.20129670202732086, "learning_rate": 4.986544453043539e-05, "loss": 0.4846, "num_input_tokens_seen": 14139232, "step": 11650 }, { "epoch": 1.2980287337119947, "grad_norm": 0.2241920381784439, "learning_rate": 4.986494056017259e-05, "loss": 0.4655, "num_input_tokens_seen": 14145280, "step": 11655 }, { "epoch": 1.298585588595612, "grad_norm": 0.2579415738582611, "learning_rate": 4.9864435650432655e-05, "loss": 0.4649, "num_input_tokens_seen": 14151136, "step": 11660 }, { "epoch": 1.2991424434792294, "grad_norm": 0.3068053424358368, "learning_rate": 4.9863929801234657e-05, "loss": 0.4874, "num_input_tokens_seen": 14157024, "step": 11665 }, { "epoch": 1.2996992983628466, "grad_norm": 0.21706870198249817, "learning_rate": 4.986342301259771e-05, "loss": 0.4528, "num_input_tokens_seen": 14163008, "step": 11670 }, { "epoch": 1.300256153246464, "grad_norm": 0.24265585839748383, "learning_rate": 4.986291528454097e-05, "loss": 0.4687, "num_input_tokens_seen": 14169248, "step": 11675 }, { "epoch": 1.3008130081300813, "grad_norm": 0.2421877533197403, "learning_rate": 4.98624066170836e-05, "loss": 0.4307, "num_input_tokens_seen": 14175456, "step": 11680 }, { "epoch": 1.3013698630136985, "grad_norm": 0.23560169339179993, "learning_rate": 4.986189701024484e-05, "loss": 0.4797, "num_input_tokens_seen": 14181504, "step": 11685 }, { "epoch": 1.301926717897316, "grad_norm": 0.18602430820465088, "learning_rate": 4.986138646404394e-05, "loss": 0.4728, "num_input_tokens_seen": 14187104, "step": 11690 }, { "epoch": 1.3024835727809334, "grad_norm": 0.19994299113750458, "learning_rate": 4.9860874978500183e-05, "loss": 0.4614, "num_input_tokens_seen": 14193376, "step": 11695 }, { "epoch": 1.3030404276645506, "grad_norm": 0.18557032942771912, "learning_rate": 4.98603625536329e-05, "loss": 0.4763, "num_input_tokens_seen": 14198560, "step": 11700 }, { "epoch": 1.3035972825481679, "grad_norm": 0.2763175964355469, "learning_rate": 4.985984918946145e-05, "loss": 0.4715, "num_input_tokens_seen": 14204512, "step": 11705 }, { "epoch": 1.3041541374317853, "grad_norm": 0.32657983899116516, "learning_rate": 4.9859334886005237e-05, "loss": 0.4773, "num_input_tokens_seen": 14210528, "step": 11710 }, { "epoch": 1.3047109923154026, "grad_norm": 0.16607539355754852, "learning_rate": 4.9858819643283674e-05, "loss": 0.4542, "num_input_tokens_seen": 14216576, "step": 11715 }, { "epoch": 1.30526784719902, "grad_norm": 0.21320970356464386, "learning_rate": 4.9858303461316245e-05, "loss": 0.4865, "num_input_tokens_seen": 14222272, "step": 11720 }, { "epoch": 1.3058247020826372, "grad_norm": 0.29923906922340393, "learning_rate": 4.985778634012246e-05, "loss": 0.4854, "num_input_tokens_seen": 14228096, "step": 11725 }, { "epoch": 1.3063815569662545, "grad_norm": 0.22350849211215973, "learning_rate": 4.9857268279721837e-05, "loss": 0.4515, "num_input_tokens_seen": 14234016, "step": 11730 }, { "epoch": 1.306938411849872, "grad_norm": 0.2362016886472702, "learning_rate": 4.9856749280133956e-05, "loss": 0.4486, "num_input_tokens_seen": 14239744, "step": 11735 }, { "epoch": 1.3074952667334894, "grad_norm": 0.20430977642536163, "learning_rate": 4.985622934137843e-05, "loss": 0.4684, "num_input_tokens_seen": 14245760, "step": 11740 }, { "epoch": 1.3080521216171066, "grad_norm": 0.1864004284143448, "learning_rate": 4.9855708463474914e-05, "loss": 0.4728, "num_input_tokens_seen": 14252064, "step": 11745 }, { "epoch": 1.3086089765007238, "grad_norm": 0.2286028414964676, "learning_rate": 4.985518664644307e-05, "loss": 0.4639, "num_input_tokens_seen": 14258016, "step": 11750 }, { "epoch": 1.3091658313843413, "grad_norm": 0.24802425503730774, "learning_rate": 4.9854663890302624e-05, "loss": 0.4469, "num_input_tokens_seen": 14264192, "step": 11755 }, { "epoch": 1.3097226862679585, "grad_norm": 0.3396087884902954, "learning_rate": 4.985414019507333e-05, "loss": 0.473, "num_input_tokens_seen": 14270176, "step": 11760 }, { "epoch": 1.310279541151576, "grad_norm": 0.22660627961158752, "learning_rate": 4.985361556077496e-05, "loss": 0.4667, "num_input_tokens_seen": 14276288, "step": 11765 }, { "epoch": 1.3108363960351932, "grad_norm": 0.2918297052383423, "learning_rate": 4.985308998742735e-05, "loss": 0.4685, "num_input_tokens_seen": 14282016, "step": 11770 }, { "epoch": 1.3113932509188104, "grad_norm": 0.21009309589862823, "learning_rate": 4.985256347505036e-05, "loss": 0.4965, "num_input_tokens_seen": 14287200, "step": 11775 }, { "epoch": 1.3119501058024279, "grad_norm": 0.20308177173137665, "learning_rate": 4.985203602366388e-05, "loss": 0.4582, "num_input_tokens_seen": 14293280, "step": 11780 }, { "epoch": 1.3125069606860453, "grad_norm": 0.32400578260421753, "learning_rate": 4.985150763328783e-05, "loss": 0.4798, "num_input_tokens_seen": 14299040, "step": 11785 }, { "epoch": 1.3130638155696626, "grad_norm": 0.24581336975097656, "learning_rate": 4.985097830394219e-05, "loss": 0.4585, "num_input_tokens_seen": 14305120, "step": 11790 }, { "epoch": 1.3136206704532798, "grad_norm": 0.3248359262943268, "learning_rate": 4.985044803564693e-05, "loss": 0.4514, "num_input_tokens_seen": 14311328, "step": 11795 }, { "epoch": 1.3141775253368972, "grad_norm": 0.3826272487640381, "learning_rate": 4.9849916828422125e-05, "loss": 0.4742, "num_input_tokens_seen": 14317440, "step": 11800 }, { "epoch": 1.3147343802205145, "grad_norm": 0.2714981138706207, "learning_rate": 4.9849384682287824e-05, "loss": 0.4843, "num_input_tokens_seen": 14323456, "step": 11805 }, { "epoch": 1.315291235104132, "grad_norm": 0.3164939880371094, "learning_rate": 4.9848851597264136e-05, "loss": 0.4682, "num_input_tokens_seen": 14329184, "step": 11810 }, { "epoch": 1.3158480899877492, "grad_norm": 0.27789196372032166, "learning_rate": 4.984831757337119e-05, "loss": 0.4774, "num_input_tokens_seen": 14335456, "step": 11815 }, { "epoch": 1.3164049448713664, "grad_norm": 0.2368329018354416, "learning_rate": 4.984778261062919e-05, "loss": 0.4912, "num_input_tokens_seen": 14341632, "step": 11820 }, { "epoch": 1.3169617997549838, "grad_norm": 0.2091287523508072, "learning_rate": 4.984724670905833e-05, "loss": 0.4668, "num_input_tokens_seen": 14347584, "step": 11825 }, { "epoch": 1.3175186546386013, "grad_norm": 0.23848888278007507, "learning_rate": 4.9846709868678864e-05, "loss": 0.4538, "num_input_tokens_seen": 14353408, "step": 11830 }, { "epoch": 1.3180755095222185, "grad_norm": 0.23620761930942535, "learning_rate": 4.9846172089511075e-05, "loss": 0.4673, "num_input_tokens_seen": 14359424, "step": 11835 }, { "epoch": 1.3186323644058358, "grad_norm": 0.24698802828788757, "learning_rate": 4.984563337157528e-05, "loss": 0.4882, "num_input_tokens_seen": 14365344, "step": 11840 }, { "epoch": 1.3191892192894532, "grad_norm": 0.235881969332695, "learning_rate": 4.984509371489183e-05, "loss": 0.4737, "num_input_tokens_seen": 14371488, "step": 11845 }, { "epoch": 1.3197460741730704, "grad_norm": 0.2966587543487549, "learning_rate": 4.9844553119481124e-05, "loss": 0.4753, "num_input_tokens_seen": 14377504, "step": 11850 }, { "epoch": 1.320302929056688, "grad_norm": 0.2028995007276535, "learning_rate": 4.984401158536358e-05, "loss": 0.449, "num_input_tokens_seen": 14383744, "step": 11855 }, { "epoch": 1.3208597839403051, "grad_norm": 0.2197382152080536, "learning_rate": 4.984346911255967e-05, "loss": 0.4706, "num_input_tokens_seen": 14389792, "step": 11860 }, { "epoch": 1.3214166388239224, "grad_norm": 0.2635113000869751, "learning_rate": 4.9842925701089874e-05, "loss": 0.4861, "num_input_tokens_seen": 14395968, "step": 11865 }, { "epoch": 1.3219734937075398, "grad_norm": 0.20303623378276825, "learning_rate": 4.984238135097473e-05, "loss": 0.4654, "num_input_tokens_seen": 14401760, "step": 11870 }, { "epoch": 1.3225303485911573, "grad_norm": 0.27548834681510925, "learning_rate": 4.984183606223481e-05, "loss": 0.477, "num_input_tokens_seen": 14407744, "step": 11875 }, { "epoch": 1.3230872034747745, "grad_norm": 0.22829551994800568, "learning_rate": 4.984128983489071e-05, "loss": 0.4757, "num_input_tokens_seen": 14413312, "step": 11880 }, { "epoch": 1.3236440583583917, "grad_norm": 0.23161758482456207, "learning_rate": 4.984074266896308e-05, "loss": 0.4656, "num_input_tokens_seen": 14419520, "step": 11885 }, { "epoch": 1.3242009132420092, "grad_norm": 0.2594762444496155, "learning_rate": 4.984019456447259e-05, "loss": 0.4821, "num_input_tokens_seen": 14425824, "step": 11890 }, { "epoch": 1.3247577681256264, "grad_norm": 0.18075396120548248, "learning_rate": 4.983964552143993e-05, "loss": 0.4677, "num_input_tokens_seen": 14432032, "step": 11895 }, { "epoch": 1.3253146230092439, "grad_norm": 0.233800008893013, "learning_rate": 4.983909553988587e-05, "loss": 0.4731, "num_input_tokens_seen": 14438112, "step": 11900 }, { "epoch": 1.325871477892861, "grad_norm": 0.21744385361671448, "learning_rate": 4.983854461983117e-05, "loss": 0.4766, "num_input_tokens_seen": 14444128, "step": 11905 }, { "epoch": 1.3264283327764783, "grad_norm": 0.16074249148368835, "learning_rate": 4.983799276129667e-05, "loss": 0.4528, "num_input_tokens_seen": 14450336, "step": 11910 }, { "epoch": 1.3269851876600958, "grad_norm": 0.26701757311820984, "learning_rate": 4.983743996430319e-05, "loss": 0.4527, "num_input_tokens_seen": 14455808, "step": 11915 }, { "epoch": 1.3275420425437132, "grad_norm": 0.23569893836975098, "learning_rate": 4.983688622887165e-05, "loss": 0.4727, "num_input_tokens_seen": 14461888, "step": 11920 }, { "epoch": 1.3280988974273304, "grad_norm": 0.3606279790401459, "learning_rate": 4.983633155502294e-05, "loss": 0.468, "num_input_tokens_seen": 14467904, "step": 11925 }, { "epoch": 1.3286557523109477, "grad_norm": 0.21934519708156586, "learning_rate": 4.983577594277804e-05, "loss": 0.4753, "num_input_tokens_seen": 14473760, "step": 11930 }, { "epoch": 1.3292126071945651, "grad_norm": 0.22758245468139648, "learning_rate": 4.983521939215793e-05, "loss": 0.4672, "num_input_tokens_seen": 14480064, "step": 11935 }, { "epoch": 1.3297694620781824, "grad_norm": 0.1949688345193863, "learning_rate": 4.983466190318365e-05, "loss": 0.4567, "num_input_tokens_seen": 14485984, "step": 11940 }, { "epoch": 1.3303263169617998, "grad_norm": 0.24018347263336182, "learning_rate": 4.983410347587625e-05, "loss": 0.4712, "num_input_tokens_seen": 14491968, "step": 11945 }, { "epoch": 1.330883171845417, "grad_norm": 0.27119526267051697, "learning_rate": 4.983354411025684e-05, "loss": 0.4555, "num_input_tokens_seen": 14498176, "step": 11950 }, { "epoch": 1.3314400267290343, "grad_norm": 0.3076115548610687, "learning_rate": 4.983298380634654e-05, "loss": 0.46, "num_input_tokens_seen": 14504224, "step": 11955 }, { "epoch": 1.3319968816126517, "grad_norm": 0.26277753710746765, "learning_rate": 4.9832422564166546e-05, "loss": 0.4513, "num_input_tokens_seen": 14510720, "step": 11960 }, { "epoch": 1.3325537364962692, "grad_norm": 0.19318434596061707, "learning_rate": 4.983186038373804e-05, "loss": 0.4511, "num_input_tokens_seen": 14517056, "step": 11965 }, { "epoch": 1.3331105913798864, "grad_norm": 0.26703622937202454, "learning_rate": 4.983129726508228e-05, "loss": 0.4507, "num_input_tokens_seen": 14523200, "step": 11970 }, { "epoch": 1.3336674462635036, "grad_norm": 0.2919849455356598, "learning_rate": 4.983073320822052e-05, "loss": 0.4671, "num_input_tokens_seen": 14528960, "step": 11975 }, { "epoch": 1.334224301147121, "grad_norm": 0.31035083532333374, "learning_rate": 4.983016821317409e-05, "loss": 0.4846, "num_input_tokens_seen": 14534848, "step": 11980 }, { "epoch": 1.3347811560307383, "grad_norm": 0.3335912823677063, "learning_rate": 4.982960227996434e-05, "loss": 0.4614, "num_input_tokens_seen": 14541088, "step": 11985 }, { "epoch": 1.3353380109143558, "grad_norm": 0.2507714033126831, "learning_rate": 4.982903540861263e-05, "loss": 0.4696, "num_input_tokens_seen": 14547072, "step": 11990 }, { "epoch": 1.335894865797973, "grad_norm": 0.2082262486219406, "learning_rate": 4.98284675991404e-05, "loss": 0.4723, "num_input_tokens_seen": 14553184, "step": 11995 }, { "epoch": 1.3364517206815905, "grad_norm": 0.2251964956521988, "learning_rate": 4.982789885156911e-05, "loss": 0.4876, "num_input_tokens_seen": 14559232, "step": 12000 }, { "epoch": 1.3370085755652077, "grad_norm": 0.20016849040985107, "learning_rate": 4.9827329165920225e-05, "loss": 0.4713, "num_input_tokens_seen": 14565472, "step": 12005 }, { "epoch": 1.3375654304488251, "grad_norm": 0.2519107460975647, "learning_rate": 4.982675854221529e-05, "loss": 0.464, "num_input_tokens_seen": 14571616, "step": 12010 }, { "epoch": 1.3381222853324424, "grad_norm": 0.21315822005271912, "learning_rate": 4.982618698047584e-05, "loss": 0.4687, "num_input_tokens_seen": 14577632, "step": 12015 }, { "epoch": 1.3386791402160596, "grad_norm": 0.18980112671852112, "learning_rate": 4.98256144807235e-05, "loss": 0.4648, "num_input_tokens_seen": 14583104, "step": 12020 }, { "epoch": 1.339235995099677, "grad_norm": 0.30857399106025696, "learning_rate": 4.982504104297988e-05, "loss": 0.4935, "num_input_tokens_seen": 14589120, "step": 12025 }, { "epoch": 1.3397928499832943, "grad_norm": 0.18620748817920685, "learning_rate": 4.9824466667266654e-05, "loss": 0.4829, "num_input_tokens_seen": 14595168, "step": 12030 }, { "epoch": 1.3403497048669117, "grad_norm": 0.19223131239414215, "learning_rate": 4.9823891353605536e-05, "loss": 0.4508, "num_input_tokens_seen": 14601408, "step": 12035 }, { "epoch": 1.340906559750529, "grad_norm": 0.2264835387468338, "learning_rate": 4.982331510201823e-05, "loss": 0.4701, "num_input_tokens_seen": 14607456, "step": 12040 }, { "epoch": 1.3414634146341464, "grad_norm": 0.18033504486083984, "learning_rate": 4.982273791252654e-05, "loss": 0.4535, "num_input_tokens_seen": 14613888, "step": 12045 }, { "epoch": 1.3420202695177637, "grad_norm": 0.24519017338752747, "learning_rate": 4.9822159785152264e-05, "loss": 0.4621, "num_input_tokens_seen": 14619840, "step": 12050 }, { "epoch": 1.342577124401381, "grad_norm": 0.33545586466789246, "learning_rate": 4.982158071991725e-05, "loss": 0.4547, "num_input_tokens_seen": 14625696, "step": 12055 }, { "epoch": 1.3431339792849983, "grad_norm": 0.22546221315860748, "learning_rate": 4.982100071684336e-05, "loss": 0.4733, "num_input_tokens_seen": 14630976, "step": 12060 }, { "epoch": 1.3436908341686156, "grad_norm": 0.27280956506729126, "learning_rate": 4.982041977595253e-05, "loss": 0.4694, "num_input_tokens_seen": 14637280, "step": 12065 }, { "epoch": 1.344247689052233, "grad_norm": 0.2378423810005188, "learning_rate": 4.98198378972667e-05, "loss": 0.4703, "num_input_tokens_seen": 14643264, "step": 12070 }, { "epoch": 1.3448045439358502, "grad_norm": 0.27387216687202454, "learning_rate": 4.981925508080785e-05, "loss": 0.4563, "num_input_tokens_seen": 14649600, "step": 12075 }, { "epoch": 1.3453613988194677, "grad_norm": 0.22076722979545593, "learning_rate": 4.9818671326598e-05, "loss": 0.4746, "num_input_tokens_seen": 14655552, "step": 12080 }, { "epoch": 1.345918253703085, "grad_norm": 0.22225365042686462, "learning_rate": 4.981808663465922e-05, "loss": 0.4545, "num_input_tokens_seen": 14661600, "step": 12085 }, { "epoch": 1.3464751085867024, "grad_norm": 0.26934367418289185, "learning_rate": 4.9817501005013586e-05, "loss": 0.4715, "num_input_tokens_seen": 14667648, "step": 12090 }, { "epoch": 1.3470319634703196, "grad_norm": 0.30438342690467834, "learning_rate": 4.981691443768325e-05, "loss": 0.4672, "num_input_tokens_seen": 14673632, "step": 12095 }, { "epoch": 1.347588818353937, "grad_norm": 0.20379813015460968, "learning_rate": 4.981632693269034e-05, "loss": 0.473, "num_input_tokens_seen": 14679680, "step": 12100 }, { "epoch": 1.3481456732375543, "grad_norm": 0.22599896788597107, "learning_rate": 4.981573849005707e-05, "loss": 0.4637, "num_input_tokens_seen": 14685888, "step": 12105 }, { "epoch": 1.3487025281211715, "grad_norm": 0.2278035581111908, "learning_rate": 4.981514910980569e-05, "loss": 0.4814, "num_input_tokens_seen": 14692000, "step": 12110 }, { "epoch": 1.349259383004789, "grad_norm": 0.23513706028461456, "learning_rate": 4.981455879195844e-05, "loss": 0.4715, "num_input_tokens_seen": 14698400, "step": 12115 }, { "epoch": 1.3498162378884062, "grad_norm": 0.28821223974227905, "learning_rate": 4.9813967536537636e-05, "loss": 0.4765, "num_input_tokens_seen": 14704000, "step": 12120 }, { "epoch": 1.3503730927720237, "grad_norm": 0.23569057881832123, "learning_rate": 4.981337534356563e-05, "loss": 0.4638, "num_input_tokens_seen": 14710048, "step": 12125 }, { "epoch": 1.350929947655641, "grad_norm": 0.196964293718338, "learning_rate": 4.981278221306477e-05, "loss": 0.4656, "num_input_tokens_seen": 14716416, "step": 12130 }, { "epoch": 1.3514868025392583, "grad_norm": 0.2046276032924652, "learning_rate": 4.981218814505749e-05, "loss": 0.4592, "num_input_tokens_seen": 14722144, "step": 12135 }, { "epoch": 1.3520436574228756, "grad_norm": 0.2932204604148865, "learning_rate": 4.9811593139566225e-05, "loss": 0.4579, "num_input_tokens_seen": 14728288, "step": 12140 }, { "epoch": 1.352600512306493, "grad_norm": 0.28430142998695374, "learning_rate": 4.981099719661346e-05, "loss": 0.4833, "num_input_tokens_seen": 14734400, "step": 12145 }, { "epoch": 1.3531573671901103, "grad_norm": 0.2330351620912552, "learning_rate": 4.9810400316221706e-05, "loss": 0.4477, "num_input_tokens_seen": 14740544, "step": 12150 }, { "epoch": 1.3537142220737275, "grad_norm": 0.23006106913089752, "learning_rate": 4.980980249841352e-05, "loss": 0.4777, "num_input_tokens_seen": 14746688, "step": 12155 }, { "epoch": 1.354271076957345, "grad_norm": 0.2329273223876953, "learning_rate": 4.9809203743211494e-05, "loss": 0.484, "num_input_tokens_seen": 14752416, "step": 12160 }, { "epoch": 1.3548279318409622, "grad_norm": 0.2744317054748535, "learning_rate": 4.980860405063824e-05, "loss": 0.4581, "num_input_tokens_seen": 14758560, "step": 12165 }, { "epoch": 1.3553847867245796, "grad_norm": 0.23731958866119385, "learning_rate": 4.980800342071642e-05, "loss": 0.459, "num_input_tokens_seen": 14764192, "step": 12170 }, { "epoch": 1.3559416416081969, "grad_norm": 0.20806393027305603, "learning_rate": 4.980740185346874e-05, "loss": 0.4771, "num_input_tokens_seen": 14770112, "step": 12175 }, { "epoch": 1.3564984964918143, "grad_norm": 0.20769353210926056, "learning_rate": 4.980679934891791e-05, "loss": 0.443, "num_input_tokens_seen": 14776256, "step": 12180 }, { "epoch": 1.3570553513754315, "grad_norm": 0.33531615138053894, "learning_rate": 4.9806195907086706e-05, "loss": 0.4749, "num_input_tokens_seen": 14782368, "step": 12185 }, { "epoch": 1.357612206259049, "grad_norm": 0.2050391435623169, "learning_rate": 4.980559152799793e-05, "loss": 0.4631, "num_input_tokens_seen": 14788512, "step": 12190 }, { "epoch": 1.3581690611426662, "grad_norm": 0.28516876697540283, "learning_rate": 4.980498621167441e-05, "loss": 0.4765, "num_input_tokens_seen": 14794752, "step": 12195 }, { "epoch": 1.3587259160262835, "grad_norm": 0.21916896104812622, "learning_rate": 4.9804379958139014e-05, "loss": 0.471, "num_input_tokens_seen": 14800896, "step": 12200 }, { "epoch": 1.359282770909901, "grad_norm": 0.24358780682086945, "learning_rate": 4.980377276741466e-05, "loss": 0.4437, "num_input_tokens_seen": 14807072, "step": 12205 }, { "epoch": 1.3598396257935181, "grad_norm": 0.2861059606075287, "learning_rate": 4.980316463952428e-05, "loss": 0.4873, "num_input_tokens_seen": 14813056, "step": 12210 }, { "epoch": 1.3603964806771356, "grad_norm": 0.2820834815502167, "learning_rate": 4.980255557449085e-05, "loss": 0.4628, "num_input_tokens_seen": 14818240, "step": 12215 }, { "epoch": 1.3609533355607528, "grad_norm": 0.2034296840429306, "learning_rate": 4.9801945572337396e-05, "loss": 0.4576, "num_input_tokens_seen": 14824160, "step": 12220 }, { "epoch": 1.3615101904443703, "grad_norm": 0.3199249505996704, "learning_rate": 4.9801334633086946e-05, "loss": 0.4731, "num_input_tokens_seen": 14830272, "step": 12225 }, { "epoch": 1.3620670453279875, "grad_norm": 0.19788917899131775, "learning_rate": 4.9800722756762606e-05, "loss": 0.4548, "num_input_tokens_seen": 14836544, "step": 12230 }, { "epoch": 1.362623900211605, "grad_norm": 0.2054743468761444, "learning_rate": 4.9800109943387474e-05, "loss": 0.459, "num_input_tokens_seen": 14842528, "step": 12235 }, { "epoch": 1.3631807550952222, "grad_norm": 0.3057534396648407, "learning_rate": 4.979949619298472e-05, "loss": 0.4805, "num_input_tokens_seen": 14848608, "step": 12240 }, { "epoch": 1.3637376099788394, "grad_norm": 0.22020135819911957, "learning_rate": 4.979888150557752e-05, "loss": 0.4815, "num_input_tokens_seen": 14854848, "step": 12245 }, { "epoch": 1.3642944648624569, "grad_norm": 0.20917944610118866, "learning_rate": 4.97982658811891e-05, "loss": 0.4785, "num_input_tokens_seen": 14860768, "step": 12250 }, { "epoch": 1.364851319746074, "grad_norm": 0.23488548398017883, "learning_rate": 4.979764931984273e-05, "loss": 0.4796, "num_input_tokens_seen": 14866688, "step": 12255 }, { "epoch": 1.3654081746296916, "grad_norm": 0.20096728205680847, "learning_rate": 4.97970318215617e-05, "loss": 0.4667, "num_input_tokens_seen": 14872832, "step": 12260 }, { "epoch": 1.3659650295133088, "grad_norm": 0.2230343520641327, "learning_rate": 4.979641338636935e-05, "loss": 0.4627, "num_input_tokens_seen": 14878848, "step": 12265 }, { "epoch": 1.3665218843969262, "grad_norm": 0.1975427120923996, "learning_rate": 4.9795794014289024e-05, "loss": 0.447, "num_input_tokens_seen": 14884768, "step": 12270 }, { "epoch": 1.3670787392805435, "grad_norm": 0.277742862701416, "learning_rate": 4.979517370534414e-05, "loss": 0.4756, "num_input_tokens_seen": 14891008, "step": 12275 }, { "epoch": 1.367635594164161, "grad_norm": 0.23093341290950775, "learning_rate": 4.979455245955815e-05, "loss": 0.4646, "num_input_tokens_seen": 14896512, "step": 12280 }, { "epoch": 1.3681924490477781, "grad_norm": 0.19276823103427887, "learning_rate": 4.979393027695449e-05, "loss": 0.4902, "num_input_tokens_seen": 14903008, "step": 12285 }, { "epoch": 1.3687493039313954, "grad_norm": 0.20495082437992096, "learning_rate": 4.97933071575567e-05, "loss": 0.4711, "num_input_tokens_seen": 14909152, "step": 12290 }, { "epoch": 1.3693061588150128, "grad_norm": 0.22921377420425415, "learning_rate": 4.979268310138831e-05, "loss": 0.4847, "num_input_tokens_seen": 14914368, "step": 12295 }, { "epoch": 1.36986301369863, "grad_norm": 0.18834377825260162, "learning_rate": 4.97920581084729e-05, "loss": 0.454, "num_input_tokens_seen": 14920480, "step": 12300 }, { "epoch": 1.3704198685822475, "grad_norm": 0.179185688495636, "learning_rate": 4.9791432178834086e-05, "loss": 0.4674, "num_input_tokens_seen": 14926688, "step": 12305 }, { "epoch": 1.3709767234658647, "grad_norm": 0.18755683302879333, "learning_rate": 4.979080531249551e-05, "loss": 0.4638, "num_input_tokens_seen": 14932672, "step": 12310 }, { "epoch": 1.3715335783494822, "grad_norm": 0.18868085741996765, "learning_rate": 4.9790177509480865e-05, "loss": 0.4829, "num_input_tokens_seen": 14938592, "step": 12315 }, { "epoch": 1.3720904332330994, "grad_norm": 0.22155103087425232, "learning_rate": 4.978954876981388e-05, "loss": 0.4747, "num_input_tokens_seen": 14943936, "step": 12320 }, { "epoch": 1.3726472881167169, "grad_norm": 0.21462132036685944, "learning_rate": 4.9788919093518284e-05, "loss": 0.4421, "num_input_tokens_seen": 14950016, "step": 12325 }, { "epoch": 1.373204143000334, "grad_norm": 0.3175213038921356, "learning_rate": 4.97882884806179e-05, "loss": 0.4821, "num_input_tokens_seen": 14956352, "step": 12330 }, { "epoch": 1.3737609978839513, "grad_norm": 0.2344258576631546, "learning_rate": 4.9787656931136526e-05, "loss": 0.478, "num_input_tokens_seen": 14962432, "step": 12335 }, { "epoch": 1.3743178527675688, "grad_norm": 0.2192341536283493, "learning_rate": 4.978702444509804e-05, "loss": 0.4753, "num_input_tokens_seen": 14968576, "step": 12340 }, { "epoch": 1.374874707651186, "grad_norm": 0.20633597671985626, "learning_rate": 4.9786391022526343e-05, "loss": 0.4716, "num_input_tokens_seen": 14974688, "step": 12345 }, { "epoch": 1.3754315625348035, "grad_norm": 0.24203002452850342, "learning_rate": 4.978575666344536e-05, "loss": 0.461, "num_input_tokens_seen": 14980832, "step": 12350 }, { "epoch": 1.3759884174184207, "grad_norm": 0.23495109379291534, "learning_rate": 4.9785121367879056e-05, "loss": 0.4693, "num_input_tokens_seen": 14986816, "step": 12355 }, { "epoch": 1.3765452723020382, "grad_norm": 0.2384873628616333, "learning_rate": 4.978448513585144e-05, "loss": 0.4595, "num_input_tokens_seen": 14992096, "step": 12360 }, { "epoch": 1.3771021271856554, "grad_norm": 0.24588355422019958, "learning_rate": 4.9783847967386556e-05, "loss": 0.4771, "num_input_tokens_seen": 14998272, "step": 12365 }, { "epoch": 1.3776589820692728, "grad_norm": 0.18581975996494293, "learning_rate": 4.978320986250846e-05, "loss": 0.4833, "num_input_tokens_seen": 15004544, "step": 12370 }, { "epoch": 1.37821583695289, "grad_norm": 0.19292974472045898, "learning_rate": 4.978257082124128e-05, "loss": 0.4733, "num_input_tokens_seen": 15010688, "step": 12375 }, { "epoch": 1.3787726918365073, "grad_norm": 0.2335052192211151, "learning_rate": 4.978193084360916e-05, "loss": 0.4661, "num_input_tokens_seen": 15016608, "step": 12380 }, { "epoch": 1.3793295467201248, "grad_norm": 0.2892996370792389, "learning_rate": 4.978128992963627e-05, "loss": 0.4722, "num_input_tokens_seen": 15022816, "step": 12385 }, { "epoch": 1.379886401603742, "grad_norm": 0.24929559230804443, "learning_rate": 4.978064807934683e-05, "loss": 0.4665, "num_input_tokens_seen": 15028512, "step": 12390 }, { "epoch": 1.3804432564873594, "grad_norm": 0.22893425822257996, "learning_rate": 4.9780005292765095e-05, "loss": 0.4766, "num_input_tokens_seen": 15034752, "step": 12395 }, { "epoch": 1.3810001113709767, "grad_norm": 0.22052745521068573, "learning_rate": 4.977936156991535e-05, "loss": 0.4591, "num_input_tokens_seen": 15040960, "step": 12400 }, { "epoch": 1.3815569662545941, "grad_norm": 0.33078330755233765, "learning_rate": 4.977871691082191e-05, "loss": 0.4787, "num_input_tokens_seen": 15046560, "step": 12405 }, { "epoch": 1.3821138211382114, "grad_norm": 0.3384208381175995, "learning_rate": 4.9778071315509145e-05, "loss": 0.4822, "num_input_tokens_seen": 15052736, "step": 12410 }, { "epoch": 1.3826706760218288, "grad_norm": 0.23896539211273193, "learning_rate": 4.977742478400144e-05, "loss": 0.4636, "num_input_tokens_seen": 15058560, "step": 12415 }, { "epoch": 1.383227530905446, "grad_norm": 0.18866875767707825, "learning_rate": 4.977677731632321e-05, "loss": 0.4644, "num_input_tokens_seen": 15064704, "step": 12420 }, { "epoch": 1.3837843857890633, "grad_norm": 0.29865700006484985, "learning_rate": 4.977612891249894e-05, "loss": 0.4619, "num_input_tokens_seen": 15070688, "step": 12425 }, { "epoch": 1.3843412406726807, "grad_norm": 0.21043257415294647, "learning_rate": 4.977547957255313e-05, "loss": 0.4739, "num_input_tokens_seen": 15076416, "step": 12430 }, { "epoch": 1.384898095556298, "grad_norm": 0.21097059547901154, "learning_rate": 4.9774829296510296e-05, "loss": 0.445, "num_input_tokens_seen": 15082368, "step": 12435 }, { "epoch": 1.3854549504399154, "grad_norm": 0.20771466195583344, "learning_rate": 4.977417808439501e-05, "loss": 0.4827, "num_input_tokens_seen": 15088672, "step": 12440 }, { "epoch": 1.3860118053235326, "grad_norm": 0.237727552652359, "learning_rate": 4.977352593623189e-05, "loss": 0.4576, "num_input_tokens_seen": 15094720, "step": 12445 }, { "epoch": 1.38656866020715, "grad_norm": 0.3129393458366394, "learning_rate": 4.977287285204557e-05, "loss": 0.4778, "num_input_tokens_seen": 15100768, "step": 12450 }, { "epoch": 1.3871255150907673, "grad_norm": 0.20642951130867004, "learning_rate": 4.977221883186073e-05, "loss": 0.4693, "num_input_tokens_seen": 15106720, "step": 12455 }, { "epoch": 1.3876823699743848, "grad_norm": 0.2108335793018341, "learning_rate": 4.9771563875702073e-05, "loss": 0.4508, "num_input_tokens_seen": 15112896, "step": 12460 }, { "epoch": 1.388239224858002, "grad_norm": 0.2938227951526642, "learning_rate": 4.9770907983594344e-05, "loss": 0.4716, "num_input_tokens_seen": 15119136, "step": 12465 }, { "epoch": 1.3887960797416192, "grad_norm": 0.2741129398345947, "learning_rate": 4.977025115556233e-05, "loss": 0.4609, "num_input_tokens_seen": 15124896, "step": 12470 }, { "epoch": 1.3893529346252367, "grad_norm": 0.19870324432849884, "learning_rate": 4.976959339163085e-05, "loss": 0.4648, "num_input_tokens_seen": 15130816, "step": 12475 }, { "epoch": 1.389909789508854, "grad_norm": 0.325600802898407, "learning_rate": 4.9768934691824755e-05, "loss": 0.4736, "num_input_tokens_seen": 15137248, "step": 12480 }, { "epoch": 1.3904666443924714, "grad_norm": 0.2934962213039398, "learning_rate": 4.9768275056168924e-05, "loss": 0.4941, "num_input_tokens_seen": 15143296, "step": 12485 }, { "epoch": 1.3910234992760886, "grad_norm": 0.2440243810415268, "learning_rate": 4.97676144846883e-05, "loss": 0.4754, "num_input_tokens_seen": 15149408, "step": 12490 }, { "epoch": 1.391580354159706, "grad_norm": 0.21015802025794983, "learning_rate": 4.976695297740781e-05, "loss": 0.4743, "num_input_tokens_seen": 15155840, "step": 12495 }, { "epoch": 1.3921372090433233, "grad_norm": 0.20779122412204742, "learning_rate": 4.976629053435248e-05, "loss": 0.4606, "num_input_tokens_seen": 15161792, "step": 12500 }, { "epoch": 1.3926940639269407, "grad_norm": 0.2085626721382141, "learning_rate": 4.976562715554733e-05, "loss": 0.4569, "num_input_tokens_seen": 15167264, "step": 12505 }, { "epoch": 1.393250918810558, "grad_norm": 0.3327125906944275, "learning_rate": 4.976496284101741e-05, "loss": 0.4806, "num_input_tokens_seen": 15173152, "step": 12510 }, { "epoch": 1.3938077736941752, "grad_norm": 0.33804556727409363, "learning_rate": 4.976429759078784e-05, "loss": 0.4511, "num_input_tokens_seen": 15179392, "step": 12515 }, { "epoch": 1.3943646285777926, "grad_norm": 0.24661485850811005, "learning_rate": 4.9763631404883745e-05, "loss": 0.4782, "num_input_tokens_seen": 15185440, "step": 12520 }, { "epoch": 1.39492148346141, "grad_norm": 0.26521846652030945, "learning_rate": 4.976296428333029e-05, "loss": 0.4606, "num_input_tokens_seen": 15191744, "step": 12525 }, { "epoch": 1.3954783383450273, "grad_norm": 0.2419975996017456, "learning_rate": 4.976229622615269e-05, "loss": 0.4748, "num_input_tokens_seen": 15197824, "step": 12530 }, { "epoch": 1.3960351932286446, "grad_norm": 0.2306520789861679, "learning_rate": 4.9761627233376187e-05, "loss": 0.454, "num_input_tokens_seen": 15203936, "step": 12535 }, { "epoch": 1.396592048112262, "grad_norm": 0.3000767230987549, "learning_rate": 4.9760957305026054e-05, "loss": 0.4728, "num_input_tokens_seen": 15209440, "step": 12540 }, { "epoch": 1.3971489029958792, "grad_norm": 0.30686187744140625, "learning_rate": 4.9760286441127604e-05, "loss": 0.4436, "num_input_tokens_seen": 15215328, "step": 12545 }, { "epoch": 1.3977057578794967, "grad_norm": 0.2631342113018036, "learning_rate": 4.975961464170617e-05, "loss": 0.4709, "num_input_tokens_seen": 15221440, "step": 12550 }, { "epoch": 1.398262612763114, "grad_norm": 0.2596518397331238, "learning_rate": 4.975894190678717e-05, "loss": 0.4714, "num_input_tokens_seen": 15227424, "step": 12555 }, { "epoch": 1.3988194676467312, "grad_norm": 0.26690101623535156, "learning_rate": 4.9758268236395986e-05, "loss": 0.4774, "num_input_tokens_seen": 15234080, "step": 12560 }, { "epoch": 1.3993763225303486, "grad_norm": 0.26793503761291504, "learning_rate": 4.97575936305581e-05, "loss": 0.4712, "num_input_tokens_seen": 15240192, "step": 12565 }, { "epoch": 1.399933177413966, "grad_norm": 0.24830493330955505, "learning_rate": 4.975691808929898e-05, "loss": 0.4631, "num_input_tokens_seen": 15246528, "step": 12570 }, { "epoch": 1.4004900322975833, "grad_norm": 0.24268738925457, "learning_rate": 4.975624161264415e-05, "loss": 0.4782, "num_input_tokens_seen": 15252832, "step": 12575 }, { "epoch": 1.4010468871812005, "grad_norm": 0.27187851071357727, "learning_rate": 4.975556420061919e-05, "loss": 0.4564, "num_input_tokens_seen": 15258944, "step": 12580 }, { "epoch": 1.401603742064818, "grad_norm": 0.2418971061706543, "learning_rate": 4.975488585324967e-05, "loss": 0.4673, "num_input_tokens_seen": 15264832, "step": 12585 }, { "epoch": 1.4021605969484352, "grad_norm": 0.2788501977920532, "learning_rate": 4.975420657056123e-05, "loss": 0.4615, "num_input_tokens_seen": 15271040, "step": 12590 }, { "epoch": 1.4027174518320527, "grad_norm": 0.206117182970047, "learning_rate": 4.9753526352579546e-05, "loss": 0.4603, "num_input_tokens_seen": 15276960, "step": 12595 }, { "epoch": 1.4032743067156699, "grad_norm": 0.22908154129981995, "learning_rate": 4.9752845199330304e-05, "loss": 0.4758, "num_input_tokens_seen": 15283072, "step": 12600 }, { "epoch": 1.4038311615992871, "grad_norm": 0.307932049036026, "learning_rate": 4.975216311083925e-05, "loss": 0.4753, "num_input_tokens_seen": 15289184, "step": 12605 }, { "epoch": 1.4043880164829046, "grad_norm": 0.27381330728530884, "learning_rate": 4.975148008713215e-05, "loss": 0.4898, "num_input_tokens_seen": 15295200, "step": 12610 }, { "epoch": 1.404944871366522, "grad_norm": 0.29189252853393555, "learning_rate": 4.975079612823481e-05, "loss": 0.4734, "num_input_tokens_seen": 15301024, "step": 12615 }, { "epoch": 1.4055017262501392, "grad_norm": 0.19278402626514435, "learning_rate": 4.975011123417308e-05, "loss": 0.4498, "num_input_tokens_seen": 15306656, "step": 12620 }, { "epoch": 1.4060585811337565, "grad_norm": 0.24346421658992767, "learning_rate": 4.9749425404972825e-05, "loss": 0.4494, "num_input_tokens_seen": 15312608, "step": 12625 }, { "epoch": 1.406615436017374, "grad_norm": 0.23024027049541473, "learning_rate": 4.974873864065997e-05, "loss": 0.4584, "num_input_tokens_seen": 15318560, "step": 12630 }, { "epoch": 1.4071722909009912, "grad_norm": 0.21588844060897827, "learning_rate": 4.974805094126046e-05, "loss": 0.4608, "num_input_tokens_seen": 15324736, "step": 12635 }, { "epoch": 1.4077291457846086, "grad_norm": 0.20339731872081757, "learning_rate": 4.9747362306800274e-05, "loss": 0.4681, "num_input_tokens_seen": 15330848, "step": 12640 }, { "epoch": 1.4082860006682258, "grad_norm": 0.2649179995059967, "learning_rate": 4.9746672737305425e-05, "loss": 0.4711, "num_input_tokens_seen": 15337088, "step": 12645 }, { "epoch": 1.408842855551843, "grad_norm": 0.17575396597385406, "learning_rate": 4.974598223280198e-05, "loss": 0.469, "num_input_tokens_seen": 15343392, "step": 12650 }, { "epoch": 1.4093997104354605, "grad_norm": 0.23452603816986084, "learning_rate": 4.9745290793316034e-05, "loss": 0.474, "num_input_tokens_seen": 15349152, "step": 12655 }, { "epoch": 1.409956565319078, "grad_norm": 0.29144710302352905, "learning_rate": 4.974459841887369e-05, "loss": 0.4725, "num_input_tokens_seen": 15355168, "step": 12660 }, { "epoch": 1.4105134202026952, "grad_norm": 0.2561938166618347, "learning_rate": 4.974390510950112e-05, "loss": 0.4813, "num_input_tokens_seen": 15360352, "step": 12665 }, { "epoch": 1.4110702750863124, "grad_norm": 0.21355929970741272, "learning_rate": 4.974321086522453e-05, "loss": 0.4574, "num_input_tokens_seen": 15366528, "step": 12670 }, { "epoch": 1.41162712996993, "grad_norm": 0.22589905560016632, "learning_rate": 4.974251568607013e-05, "loss": 0.487, "num_input_tokens_seen": 15372736, "step": 12675 }, { "epoch": 1.4121839848535471, "grad_norm": 0.19042910635471344, "learning_rate": 4.9741819572064196e-05, "loss": 0.4531, "num_input_tokens_seen": 15379040, "step": 12680 }, { "epoch": 1.4127408397371646, "grad_norm": 0.1800546795129776, "learning_rate": 4.974112252323304e-05, "loss": 0.4735, "num_input_tokens_seen": 15385344, "step": 12685 }, { "epoch": 1.4132976946207818, "grad_norm": 0.22012777626514435, "learning_rate": 4.974042453960298e-05, "loss": 0.4764, "num_input_tokens_seen": 15391072, "step": 12690 }, { "epoch": 1.413854549504399, "grad_norm": 0.22913461923599243, "learning_rate": 4.973972562120039e-05, "loss": 0.4591, "num_input_tokens_seen": 15397024, "step": 12695 }, { "epoch": 1.4144114043880165, "grad_norm": 0.26739710569381714, "learning_rate": 4.973902576805169e-05, "loss": 0.4805, "num_input_tokens_seen": 15403296, "step": 12700 }, { "epoch": 1.414968259271634, "grad_norm": 0.23704540729522705, "learning_rate": 4.973832498018332e-05, "loss": 0.466, "num_input_tokens_seen": 15409376, "step": 12705 }, { "epoch": 1.4155251141552512, "grad_norm": 0.24517524242401123, "learning_rate": 4.9737623257621757e-05, "loss": 0.4548, "num_input_tokens_seen": 15415744, "step": 12710 }, { "epoch": 1.4160819690388684, "grad_norm": 0.19465993344783783, "learning_rate": 4.97369206003935e-05, "loss": 0.4657, "num_input_tokens_seen": 15421760, "step": 12715 }, { "epoch": 1.4166388239224859, "grad_norm": 0.22114267945289612, "learning_rate": 4.973621700852512e-05, "loss": 0.4656, "num_input_tokens_seen": 15427904, "step": 12720 }, { "epoch": 1.417195678806103, "grad_norm": 0.24706128239631653, "learning_rate": 4.973551248204318e-05, "loss": 0.4807, "num_input_tokens_seen": 15433856, "step": 12725 }, { "epoch": 1.4177525336897205, "grad_norm": 0.26258859038352966, "learning_rate": 4.9734807020974314e-05, "loss": 0.4702, "num_input_tokens_seen": 15440256, "step": 12730 }, { "epoch": 1.4183093885733378, "grad_norm": 0.23637723922729492, "learning_rate": 4.973410062534518e-05, "loss": 0.4767, "num_input_tokens_seen": 15446496, "step": 12735 }, { "epoch": 1.418866243456955, "grad_norm": 0.20742911100387573, "learning_rate": 4.973339329518245e-05, "loss": 0.4584, "num_input_tokens_seen": 15452608, "step": 12740 }, { "epoch": 1.4194230983405725, "grad_norm": 0.23171308636665344, "learning_rate": 4.973268503051286e-05, "loss": 0.4701, "num_input_tokens_seen": 15458720, "step": 12745 }, { "epoch": 1.41997995322419, "grad_norm": 0.24064931273460388, "learning_rate": 4.973197583136317e-05, "loss": 0.4605, "num_input_tokens_seen": 15464640, "step": 12750 }, { "epoch": 1.4205368081078071, "grad_norm": 0.3153110444545746, "learning_rate": 4.973126569776019e-05, "loss": 0.468, "num_input_tokens_seen": 15470944, "step": 12755 }, { "epoch": 1.4210936629914244, "grad_norm": 0.2551974356174469, "learning_rate": 4.973055462973072e-05, "loss": 0.4605, "num_input_tokens_seen": 15477152, "step": 12760 }, { "epoch": 1.4216505178750418, "grad_norm": 0.23528659343719482, "learning_rate": 4.972984262730165e-05, "loss": 0.4713, "num_input_tokens_seen": 15483136, "step": 12765 }, { "epoch": 1.422207372758659, "grad_norm": 0.22148862481117249, "learning_rate": 4.9729129690499873e-05, "loss": 0.4722, "num_input_tokens_seen": 15489120, "step": 12770 }, { "epoch": 1.4227642276422765, "grad_norm": 0.2799599766731262, "learning_rate": 4.972841581935233e-05, "loss": 0.4724, "num_input_tokens_seen": 15495264, "step": 12775 }, { "epoch": 1.4233210825258937, "grad_norm": 0.1673518270254135, "learning_rate": 4.9727701013885985e-05, "loss": 0.4848, "num_input_tokens_seen": 15500704, "step": 12780 }, { "epoch": 1.423877937409511, "grad_norm": 0.22936896979808807, "learning_rate": 4.972698527412786e-05, "loss": 0.4741, "num_input_tokens_seen": 15507072, "step": 12785 }, { "epoch": 1.4244347922931284, "grad_norm": 0.2307756245136261, "learning_rate": 4.9726268600104985e-05, "loss": 0.4673, "num_input_tokens_seen": 15513216, "step": 12790 }, { "epoch": 1.4249916471767459, "grad_norm": 0.16265270113945007, "learning_rate": 4.9725550991844445e-05, "loss": 0.4661, "num_input_tokens_seen": 15519296, "step": 12795 }, { "epoch": 1.425548502060363, "grad_norm": 0.19295109808444977, "learning_rate": 4.972483244937335e-05, "loss": 0.4582, "num_input_tokens_seen": 15525760, "step": 12800 }, { "epoch": 1.4261053569439803, "grad_norm": 0.18669472634792328, "learning_rate": 4.9724112972718854e-05, "loss": 0.4757, "num_input_tokens_seen": 15531776, "step": 12805 }, { "epoch": 1.4266622118275978, "grad_norm": 0.2581567168235779, "learning_rate": 4.9723392561908136e-05, "loss": 0.4931, "num_input_tokens_seen": 15537504, "step": 12810 }, { "epoch": 1.427219066711215, "grad_norm": 0.15887530148029327, "learning_rate": 4.972267121696841e-05, "loss": 0.4515, "num_input_tokens_seen": 15543936, "step": 12815 }, { "epoch": 1.4277759215948325, "grad_norm": 0.25656771659851074, "learning_rate": 4.972194893792694e-05, "loss": 0.4599, "num_input_tokens_seen": 15549920, "step": 12820 }, { "epoch": 1.4283327764784497, "grad_norm": 0.2295692414045334, "learning_rate": 4.9721225724811015e-05, "loss": 0.4734, "num_input_tokens_seen": 15556192, "step": 12825 }, { "epoch": 1.428889631362067, "grad_norm": 0.2054796665906906, "learning_rate": 4.9720501577647964e-05, "loss": 0.4723, "num_input_tokens_seen": 15562400, "step": 12830 }, { "epoch": 1.4294464862456844, "grad_norm": 0.31231412291526794, "learning_rate": 4.9719776496465145e-05, "loss": 0.4699, "num_input_tokens_seen": 15568192, "step": 12835 }, { "epoch": 1.4300033411293018, "grad_norm": 0.23373684287071228, "learning_rate": 4.971905048128994e-05, "loss": 0.4637, "num_input_tokens_seen": 15574208, "step": 12840 }, { "epoch": 1.430560196012919, "grad_norm": 0.17765864729881287, "learning_rate": 4.97183235321498e-05, "loss": 0.4571, "num_input_tokens_seen": 15579840, "step": 12845 }, { "epoch": 1.4311170508965363, "grad_norm": 0.2050299197435379, "learning_rate": 4.971759564907218e-05, "loss": 0.4687, "num_input_tokens_seen": 15586112, "step": 12850 }, { "epoch": 1.4316739057801537, "grad_norm": 0.23380208015441895, "learning_rate": 4.971686683208458e-05, "loss": 0.4896, "num_input_tokens_seen": 15591968, "step": 12855 }, { "epoch": 1.432230760663771, "grad_norm": 0.21424993872642517, "learning_rate": 4.9716137081214554e-05, "loss": 0.4614, "num_input_tokens_seen": 15597792, "step": 12860 }, { "epoch": 1.4327876155473884, "grad_norm": 0.43899813294410706, "learning_rate": 4.971540639648966e-05, "loss": 0.4856, "num_input_tokens_seen": 15603840, "step": 12865 }, { "epoch": 1.4333444704310057, "grad_norm": 0.21774880588054657, "learning_rate": 4.9714674777937504e-05, "loss": 0.462, "num_input_tokens_seen": 15610144, "step": 12870 }, { "epoch": 1.4339013253146229, "grad_norm": 0.29568979144096375, "learning_rate": 4.971394222558573e-05, "loss": 0.4677, "num_input_tokens_seen": 15615936, "step": 12875 }, { "epoch": 1.4344581801982403, "grad_norm": 0.238073468208313, "learning_rate": 4.9713208739462025e-05, "loss": 0.4725, "num_input_tokens_seen": 15622208, "step": 12880 }, { "epoch": 1.4350150350818578, "grad_norm": 0.2045416533946991, "learning_rate": 4.9712474319594094e-05, "loss": 0.4619, "num_input_tokens_seen": 15628416, "step": 12885 }, { "epoch": 1.435571889965475, "grad_norm": 0.2304871380329132, "learning_rate": 4.971173896600969e-05, "loss": 0.4699, "num_input_tokens_seen": 15634400, "step": 12890 }, { "epoch": 1.4361287448490923, "grad_norm": 0.16964305937290192, "learning_rate": 4.97110026787366e-05, "loss": 0.4602, "num_input_tokens_seen": 15640608, "step": 12895 }, { "epoch": 1.4366855997327097, "grad_norm": 0.1802413910627365, "learning_rate": 4.971026545780262e-05, "loss": 0.4569, "num_input_tokens_seen": 15646944, "step": 12900 }, { "epoch": 1.437242454616327, "grad_norm": 0.24563318490982056, "learning_rate": 4.9709527303235636e-05, "loss": 0.4614, "num_input_tokens_seen": 15652992, "step": 12905 }, { "epoch": 1.4377993094999444, "grad_norm": 0.3176572322845459, "learning_rate": 4.970878821506353e-05, "loss": 0.4785, "num_input_tokens_seen": 15659104, "step": 12910 }, { "epoch": 1.4383561643835616, "grad_norm": 0.22047708928585052, "learning_rate": 4.9708048193314213e-05, "loss": 0.4601, "num_input_tokens_seen": 15665312, "step": 12915 }, { "epoch": 1.4389130192671788, "grad_norm": 0.270159512758255, "learning_rate": 4.970730723801565e-05, "loss": 0.4725, "num_input_tokens_seen": 15671424, "step": 12920 }, { "epoch": 1.4394698741507963, "grad_norm": 0.25934478640556335, "learning_rate": 4.970656534919585e-05, "loss": 0.5026, "num_input_tokens_seen": 15677408, "step": 12925 }, { "epoch": 1.4400267290344138, "grad_norm": 0.2379816323518753, "learning_rate": 4.970582252688284e-05, "loss": 0.4641, "num_input_tokens_seen": 15683552, "step": 12930 }, { "epoch": 1.440583583918031, "grad_norm": 0.28376221656799316, "learning_rate": 4.9705078771104676e-05, "loss": 0.4422, "num_input_tokens_seen": 15689568, "step": 12935 }, { "epoch": 1.4411404388016482, "grad_norm": 0.26714053750038147, "learning_rate": 4.9704334081889455e-05, "loss": 0.4592, "num_input_tokens_seen": 15695456, "step": 12940 }, { "epoch": 1.4416972936852657, "grad_norm": 0.2028704434633255, "learning_rate": 4.970358845926534e-05, "loss": 0.4797, "num_input_tokens_seen": 15701728, "step": 12945 }, { "epoch": 1.442254148568883, "grad_norm": 0.7931179404258728, "learning_rate": 4.970284190326048e-05, "loss": 0.499, "num_input_tokens_seen": 15708320, "step": 12950 }, { "epoch": 1.4428110034525004, "grad_norm": 0.22885973751544952, "learning_rate": 4.970209441390308e-05, "loss": 0.4771, "num_input_tokens_seen": 15714752, "step": 12955 }, { "epoch": 1.4433678583361176, "grad_norm": 0.1563037931919098, "learning_rate": 4.97013459912214e-05, "loss": 0.4485, "num_input_tokens_seen": 15720800, "step": 12960 }, { "epoch": 1.4439247132197348, "grad_norm": 0.22571653127670288, "learning_rate": 4.970059663524371e-05, "loss": 0.4584, "num_input_tokens_seen": 15726432, "step": 12965 }, { "epoch": 1.4444815681033523, "grad_norm": 0.17046543955802917, "learning_rate": 4.9699846345998316e-05, "loss": 0.465, "num_input_tokens_seen": 15732608, "step": 12970 }, { "epoch": 1.4450384229869697, "grad_norm": 0.218940868973732, "learning_rate": 4.969909512351357e-05, "loss": 0.4805, "num_input_tokens_seen": 15738720, "step": 12975 }, { "epoch": 1.445595277870587, "grad_norm": 0.173763245344162, "learning_rate": 4.9698342967817867e-05, "loss": 0.4652, "num_input_tokens_seen": 15744864, "step": 12980 }, { "epoch": 1.4461521327542042, "grad_norm": 0.17549683153629303, "learning_rate": 4.9697589878939614e-05, "loss": 0.4603, "num_input_tokens_seen": 15751136, "step": 12985 }, { "epoch": 1.4467089876378216, "grad_norm": 0.2878304421901703, "learning_rate": 4.969683585690727e-05, "loss": 0.498, "num_input_tokens_seen": 15757472, "step": 12990 }, { "epoch": 1.4472658425214389, "grad_norm": 0.25474151968955994, "learning_rate": 4.969608090174932e-05, "loss": 0.4894, "num_input_tokens_seen": 15763776, "step": 12995 }, { "epoch": 1.4478226974050563, "grad_norm": 0.23561257123947144, "learning_rate": 4.969532501349429e-05, "loss": 0.4714, "num_input_tokens_seen": 15769760, "step": 13000 }, { "epoch": 1.4483795522886735, "grad_norm": 0.1779203563928604, "learning_rate": 4.9694568192170745e-05, "loss": 0.4808, "num_input_tokens_seen": 15775648, "step": 13005 }, { "epoch": 1.4489364071722908, "grad_norm": 0.20522131025791168, "learning_rate": 4.969381043780728e-05, "loss": 0.4539, "num_input_tokens_seen": 15781888, "step": 13010 }, { "epoch": 1.4494932620559082, "grad_norm": 0.237454354763031, "learning_rate": 4.9693051750432505e-05, "loss": 0.4677, "num_input_tokens_seen": 15787968, "step": 13015 }, { "epoch": 1.4500501169395257, "grad_norm": 0.1896018534898758, "learning_rate": 4.969229213007511e-05, "loss": 0.479, "num_input_tokens_seen": 15793792, "step": 13020 }, { "epoch": 1.450606971823143, "grad_norm": 0.21693579852581024, "learning_rate": 4.96915315767638e-05, "loss": 0.4683, "num_input_tokens_seen": 15799840, "step": 13025 }, { "epoch": 1.4511638267067601, "grad_norm": 0.16969679296016693, "learning_rate": 4.969077009052728e-05, "loss": 0.4655, "num_input_tokens_seen": 15806304, "step": 13030 }, { "epoch": 1.4517206815903776, "grad_norm": 0.27787715196609497, "learning_rate": 4.9690007671394356e-05, "loss": 0.463, "num_input_tokens_seen": 15812544, "step": 13035 }, { "epoch": 1.4522775364739948, "grad_norm": 0.2210705727338791, "learning_rate": 4.9689244319393816e-05, "loss": 0.4568, "num_input_tokens_seen": 15818304, "step": 13040 }, { "epoch": 1.4528343913576123, "grad_norm": 0.17680838704109192, "learning_rate": 4.9688480034554495e-05, "loss": 0.4886, "num_input_tokens_seen": 15824672, "step": 13045 }, { "epoch": 1.4533912462412295, "grad_norm": 0.1912899613380432, "learning_rate": 4.968771481690529e-05, "loss": 0.4638, "num_input_tokens_seen": 15830912, "step": 13050 }, { "epoch": 1.4539481011248467, "grad_norm": 0.20635610818862915, "learning_rate": 4.96869486664751e-05, "loss": 0.4483, "num_input_tokens_seen": 15837152, "step": 13055 }, { "epoch": 1.4545049560084642, "grad_norm": 0.16771462559700012, "learning_rate": 4.968618158329288e-05, "loss": 0.4661, "num_input_tokens_seen": 15842976, "step": 13060 }, { "epoch": 1.4550618108920816, "grad_norm": 0.24012549221515656, "learning_rate": 4.968541356738761e-05, "loss": 0.4651, "num_input_tokens_seen": 15848896, "step": 13065 }, { "epoch": 1.4556186657756989, "grad_norm": 0.18093477189540863, "learning_rate": 4.96846446187883e-05, "loss": 0.4787, "num_input_tokens_seen": 15854976, "step": 13070 }, { "epoch": 1.456175520659316, "grad_norm": 0.20166124403476715, "learning_rate": 4.968387473752401e-05, "loss": 0.4722, "num_input_tokens_seen": 15860832, "step": 13075 }, { "epoch": 1.4567323755429336, "grad_norm": 0.17610466480255127, "learning_rate": 4.968310392362383e-05, "loss": 0.4719, "num_input_tokens_seen": 15867040, "step": 13080 }, { "epoch": 1.4572892304265508, "grad_norm": 0.2999647259712219, "learning_rate": 4.9682332177116884e-05, "loss": 0.4763, "num_input_tokens_seen": 15872960, "step": 13085 }, { "epoch": 1.4578460853101682, "grad_norm": 0.15782907605171204, "learning_rate": 4.968155949803234e-05, "loss": 0.474, "num_input_tokens_seen": 15879168, "step": 13090 }, { "epoch": 1.4584029401937855, "grad_norm": 0.2078309953212738, "learning_rate": 4.968078588639937e-05, "loss": 0.4444, "num_input_tokens_seen": 15885440, "step": 13095 }, { "epoch": 1.4589597950774027, "grad_norm": 0.18082614243030548, "learning_rate": 4.9680011342247215e-05, "loss": 0.4687, "num_input_tokens_seen": 15891712, "step": 13100 }, { "epoch": 1.4595166499610202, "grad_norm": 0.19050069153308868, "learning_rate": 4.967923586560515e-05, "loss": 0.4666, "num_input_tokens_seen": 15898080, "step": 13105 }, { "epoch": 1.4600735048446376, "grad_norm": 0.17484158277511597, "learning_rate": 4.967845945650246e-05, "loss": 0.4725, "num_input_tokens_seen": 15904128, "step": 13110 }, { "epoch": 1.4606303597282548, "grad_norm": 0.20985956490039825, "learning_rate": 4.9677682114968484e-05, "loss": 0.4758, "num_input_tokens_seen": 15910208, "step": 13115 }, { "epoch": 1.461187214611872, "grad_norm": 0.247705340385437, "learning_rate": 4.967690384103259e-05, "loss": 0.4595, "num_input_tokens_seen": 15916224, "step": 13120 }, { "epoch": 1.4617440694954895, "grad_norm": 0.25325608253479004, "learning_rate": 4.96761246347242e-05, "loss": 0.4648, "num_input_tokens_seen": 15921952, "step": 13125 }, { "epoch": 1.4623009243791067, "grad_norm": 0.1986461877822876, "learning_rate": 4.967534449607274e-05, "loss": 0.4765, "num_input_tokens_seen": 15927680, "step": 13130 }, { "epoch": 1.4628577792627242, "grad_norm": 0.2474573403596878, "learning_rate": 4.967456342510768e-05, "loss": 0.4801, "num_input_tokens_seen": 15933824, "step": 13135 }, { "epoch": 1.4634146341463414, "grad_norm": 0.2864537835121155, "learning_rate": 4.967378142185855e-05, "loss": 0.476, "num_input_tokens_seen": 15940032, "step": 13140 }, { "epoch": 1.4639714890299587, "grad_norm": 0.19657912850379944, "learning_rate": 4.9672998486354885e-05, "loss": 0.4761, "num_input_tokens_seen": 15945920, "step": 13145 }, { "epoch": 1.4645283439135761, "grad_norm": 0.20324978232383728, "learning_rate": 4.9672214618626264e-05, "loss": 0.4603, "num_input_tokens_seen": 15951648, "step": 13150 }, { "epoch": 1.4650851987971936, "grad_norm": 0.21759328246116638, "learning_rate": 4.967142981870232e-05, "loss": 0.4936, "num_input_tokens_seen": 15957728, "step": 13155 }, { "epoch": 1.4656420536808108, "grad_norm": 0.21581625938415527, "learning_rate": 4.9670644086612675e-05, "loss": 0.4847, "num_input_tokens_seen": 15963904, "step": 13160 }, { "epoch": 1.466198908564428, "grad_norm": 0.2826812267303467, "learning_rate": 4.9669857422387046e-05, "loss": 0.4686, "num_input_tokens_seen": 15970048, "step": 13165 }, { "epoch": 1.4667557634480455, "grad_norm": 0.206398144364357, "learning_rate": 4.966906982605515e-05, "loss": 0.469, "num_input_tokens_seen": 15976224, "step": 13170 }, { "epoch": 1.4673126183316627, "grad_norm": 0.15622159838676453, "learning_rate": 4.9668281297646735e-05, "loss": 0.465, "num_input_tokens_seen": 15982464, "step": 13175 }, { "epoch": 1.4678694732152802, "grad_norm": 0.18625310063362122, "learning_rate": 4.966749183719159e-05, "loss": 0.4551, "num_input_tokens_seen": 15988544, "step": 13180 }, { "epoch": 1.4684263280988974, "grad_norm": 0.17059685289859772, "learning_rate": 4.9666701444719564e-05, "loss": 0.463, "num_input_tokens_seen": 15994688, "step": 13185 }, { "epoch": 1.4689831829825146, "grad_norm": 0.23510515689849854, "learning_rate": 4.9665910120260514e-05, "loss": 0.4606, "num_input_tokens_seen": 16001152, "step": 13190 }, { "epoch": 1.469540037866132, "grad_norm": 0.2300063669681549, "learning_rate": 4.966511786384432e-05, "loss": 0.4664, "num_input_tokens_seen": 16007360, "step": 13195 }, { "epoch": 1.4700968927497495, "grad_norm": 0.19457396864891052, "learning_rate": 4.966432467550094e-05, "loss": 0.4777, "num_input_tokens_seen": 16013824, "step": 13200 }, { "epoch": 1.4706537476333668, "grad_norm": 0.1945277601480484, "learning_rate": 4.966353055526033e-05, "loss": 0.4569, "num_input_tokens_seen": 16019840, "step": 13205 }, { "epoch": 1.471210602516984, "grad_norm": 0.21397104859352112, "learning_rate": 4.9662735503152496e-05, "loss": 0.4665, "num_input_tokens_seen": 16025952, "step": 13210 }, { "epoch": 1.4717674574006014, "grad_norm": 0.23114712536334991, "learning_rate": 4.966193951920748e-05, "loss": 0.4431, "num_input_tokens_seen": 16032128, "step": 13215 }, { "epoch": 1.4723243122842187, "grad_norm": 0.26057693362236023, "learning_rate": 4.9661142603455355e-05, "loss": 0.4909, "num_input_tokens_seen": 16037728, "step": 13220 }, { "epoch": 1.4728811671678361, "grad_norm": 0.23337610065937042, "learning_rate": 4.9660344755926236e-05, "loss": 0.4795, "num_input_tokens_seen": 16043968, "step": 13225 }, { "epoch": 1.4734380220514534, "grad_norm": 0.22671525180339813, "learning_rate": 4.9659545976650256e-05, "loss": 0.4721, "num_input_tokens_seen": 16049952, "step": 13230 }, { "epoch": 1.4739948769350706, "grad_norm": 0.23360911011695862, "learning_rate": 4.965874626565761e-05, "loss": 0.4817, "num_input_tokens_seen": 16056224, "step": 13235 }, { "epoch": 1.474551731818688, "grad_norm": 0.22252964973449707, "learning_rate": 4.965794562297851e-05, "loss": 0.4622, "num_input_tokens_seen": 16062368, "step": 13240 }, { "epoch": 1.4751085867023055, "grad_norm": 0.15943823754787445, "learning_rate": 4.96571440486432e-05, "loss": 0.4554, "num_input_tokens_seen": 16067904, "step": 13245 }, { "epoch": 1.4756654415859227, "grad_norm": 0.24558164179325104, "learning_rate": 4.9656341542681974e-05, "loss": 0.4753, "num_input_tokens_seen": 16073792, "step": 13250 }, { "epoch": 1.47622229646954, "grad_norm": 0.21418243646621704, "learning_rate": 4.965553810512514e-05, "loss": 0.4641, "num_input_tokens_seen": 16079776, "step": 13255 }, { "epoch": 1.4767791513531574, "grad_norm": 0.13396316766738892, "learning_rate": 4.965473373600307e-05, "loss": 0.4653, "num_input_tokens_seen": 16085856, "step": 13260 }, { "epoch": 1.4773360062367746, "grad_norm": 0.21186944842338562, "learning_rate": 4.9653928435346155e-05, "loss": 0.4761, "num_input_tokens_seen": 16091328, "step": 13265 }, { "epoch": 1.477892861120392, "grad_norm": 0.1687348335981369, "learning_rate": 4.965312220318481e-05, "loss": 0.4794, "num_input_tokens_seen": 16097248, "step": 13270 }, { "epoch": 1.4784497160040093, "grad_norm": 0.2003619372844696, "learning_rate": 4.96523150395495e-05, "loss": 0.4756, "num_input_tokens_seen": 16103328, "step": 13275 }, { "epoch": 1.4790065708876265, "grad_norm": 0.15769065916538239, "learning_rate": 4.9651506944470725e-05, "loss": 0.4551, "num_input_tokens_seen": 16109280, "step": 13280 }, { "epoch": 1.479563425771244, "grad_norm": 0.195944681763649, "learning_rate": 4.9650697917979025e-05, "loss": 0.4752, "num_input_tokens_seen": 16114976, "step": 13285 }, { "epoch": 1.4801202806548615, "grad_norm": 0.1653687208890915, "learning_rate": 4.964988796010496e-05, "loss": 0.4601, "num_input_tokens_seen": 16121152, "step": 13290 }, { "epoch": 1.4806771355384787, "grad_norm": 0.17715203762054443, "learning_rate": 4.964907707087913e-05, "loss": 0.4624, "num_input_tokens_seen": 16127328, "step": 13295 }, { "epoch": 1.481233990422096, "grad_norm": 0.22641834616661072, "learning_rate": 4.964826525033218e-05, "loss": 0.4689, "num_input_tokens_seen": 16133600, "step": 13300 }, { "epoch": 1.4817908453057134, "grad_norm": 0.21533162891864777, "learning_rate": 4.964745249849477e-05, "loss": 0.4701, "num_input_tokens_seen": 16139776, "step": 13305 }, { "epoch": 1.4823477001893306, "grad_norm": 0.2691459655761719, "learning_rate": 4.9646638815397626e-05, "loss": 0.4683, "num_input_tokens_seen": 16146144, "step": 13310 }, { "epoch": 1.482904555072948, "grad_norm": 0.16896598041057587, "learning_rate": 4.964582420107149e-05, "loss": 0.4985, "num_input_tokens_seen": 16152320, "step": 13315 }, { "epoch": 1.4834614099565653, "grad_norm": 0.2000550478696823, "learning_rate": 4.964500865554712e-05, "loss": 0.4743, "num_input_tokens_seen": 16158432, "step": 13320 }, { "epoch": 1.4840182648401825, "grad_norm": 0.23497097194194794, "learning_rate": 4.964419217885535e-05, "loss": 0.4637, "num_input_tokens_seen": 16164576, "step": 13325 }, { "epoch": 1.4845751197238, "grad_norm": 0.14567840099334717, "learning_rate": 4.964337477102703e-05, "loss": 0.4709, "num_input_tokens_seen": 16170720, "step": 13330 }, { "epoch": 1.4851319746074174, "grad_norm": 0.16751866042613983, "learning_rate": 4.964255643209303e-05, "loss": 0.4506, "num_input_tokens_seen": 16176768, "step": 13335 }, { "epoch": 1.4856888294910346, "grad_norm": 0.23481595516204834, "learning_rate": 4.964173716208428e-05, "loss": 0.4587, "num_input_tokens_seen": 16182848, "step": 13340 }, { "epoch": 1.4862456843746519, "grad_norm": 0.1938714236021042, "learning_rate": 4.964091696103173e-05, "loss": 0.4598, "num_input_tokens_seen": 16189056, "step": 13345 }, { "epoch": 1.4868025392582693, "grad_norm": 0.20796050131320953, "learning_rate": 4.964009582896637e-05, "loss": 0.4714, "num_input_tokens_seen": 16195328, "step": 13350 }, { "epoch": 1.4873593941418866, "grad_norm": 0.19930347800254822, "learning_rate": 4.963927376591923e-05, "loss": 0.4628, "num_input_tokens_seen": 16201504, "step": 13355 }, { "epoch": 1.487916249025504, "grad_norm": 0.16697685420513153, "learning_rate": 4.9638450771921365e-05, "loss": 0.4603, "num_input_tokens_seen": 16207744, "step": 13360 }, { "epoch": 1.4884731039091212, "grad_norm": 0.19735607504844666, "learning_rate": 4.963762684700387e-05, "loss": 0.4643, "num_input_tokens_seen": 16213152, "step": 13365 }, { "epoch": 1.4890299587927387, "grad_norm": 0.21296238899230957, "learning_rate": 4.963680199119788e-05, "loss": 0.4469, "num_input_tokens_seen": 16219488, "step": 13370 }, { "epoch": 1.489586813676356, "grad_norm": 0.17504757642745972, "learning_rate": 4.963597620453456e-05, "loss": 0.4718, "num_input_tokens_seen": 16225472, "step": 13375 }, { "epoch": 1.4901436685599734, "grad_norm": 0.21744516491889954, "learning_rate": 4.9635149487045106e-05, "loss": 0.4827, "num_input_tokens_seen": 16231776, "step": 13380 }, { "epoch": 1.4907005234435906, "grad_norm": 0.40656471252441406, "learning_rate": 4.963432183876077e-05, "loss": 0.4901, "num_input_tokens_seen": 16236672, "step": 13385 }, { "epoch": 1.4912573783272078, "grad_norm": 0.2089151293039322, "learning_rate": 4.9633493259712796e-05, "loss": 0.4608, "num_input_tokens_seen": 16242688, "step": 13390 }, { "epoch": 1.4918142332108253, "grad_norm": 0.15521593391895294, "learning_rate": 4.963266374993251e-05, "loss": 0.462, "num_input_tokens_seen": 16248640, "step": 13395 }, { "epoch": 1.4923710880944425, "grad_norm": 0.1912350058555603, "learning_rate": 4.9631833309451234e-05, "loss": 0.4716, "num_input_tokens_seen": 16254976, "step": 13400 }, { "epoch": 1.49292794297806, "grad_norm": 0.16657978296279907, "learning_rate": 4.9631001938300384e-05, "loss": 0.4895, "num_input_tokens_seen": 16260960, "step": 13405 }, { "epoch": 1.4934847978616772, "grad_norm": 0.18772393465042114, "learning_rate": 4.9630169636511324e-05, "loss": 0.4648, "num_input_tokens_seen": 16266976, "step": 13410 }, { "epoch": 1.4940416527452947, "grad_norm": 0.19280731678009033, "learning_rate": 4.962933640411553e-05, "loss": 0.4585, "num_input_tokens_seen": 16272736, "step": 13415 }, { "epoch": 1.4945985076289119, "grad_norm": 0.1629134714603424, "learning_rate": 4.962850224114449e-05, "loss": 0.4643, "num_input_tokens_seen": 16278592, "step": 13420 }, { "epoch": 1.4951553625125293, "grad_norm": 0.1688709408044815, "learning_rate": 4.96276671476297e-05, "loss": 0.459, "num_input_tokens_seen": 16284960, "step": 13425 }, { "epoch": 1.4957122173961466, "grad_norm": 0.2302827686071396, "learning_rate": 4.962683112360272e-05, "loss": 0.4659, "num_input_tokens_seen": 16290976, "step": 13430 }, { "epoch": 1.4962690722797638, "grad_norm": 0.2996968626976013, "learning_rate": 4.9625994169095145e-05, "loss": 0.4591, "num_input_tokens_seen": 16296544, "step": 13435 }, { "epoch": 1.4968259271633813, "grad_norm": 0.308608740568161, "learning_rate": 4.962515628413859e-05, "loss": 0.4658, "num_input_tokens_seen": 16302048, "step": 13440 }, { "epoch": 1.4973827820469985, "grad_norm": 0.19833418726921082, "learning_rate": 4.9624317468764715e-05, "loss": 0.4573, "num_input_tokens_seen": 16308608, "step": 13445 }, { "epoch": 1.497939636930616, "grad_norm": 0.20677262544631958, "learning_rate": 4.962347772300522e-05, "loss": 0.4593, "num_input_tokens_seen": 16314560, "step": 13450 }, { "epoch": 1.4984964918142332, "grad_norm": 0.25324201583862305, "learning_rate": 4.9622637046891815e-05, "loss": 0.4845, "num_input_tokens_seen": 16320704, "step": 13455 }, { "epoch": 1.4990533466978506, "grad_norm": 0.2674877345561981, "learning_rate": 4.9621795440456285e-05, "loss": 0.4588, "num_input_tokens_seen": 16326496, "step": 13460 }, { "epoch": 1.4996102015814678, "grad_norm": 0.18316686153411865, "learning_rate": 4.962095290373041e-05, "loss": 0.4845, "num_input_tokens_seen": 16332896, "step": 13465 }, { "epoch": 1.5001670564650853, "grad_norm": 0.28252094984054565, "learning_rate": 4.9620109436746045e-05, "loss": 0.4703, "num_input_tokens_seen": 16338976, "step": 13470 }, { "epoch": 1.5001670564650853, "eval_loss": 0.46727171540260315, "eval_runtime": 113.2213, "eval_samples_per_second": 35.25, "eval_steps_per_second": 8.815, "num_input_tokens_seen": 16338976, "step": 13470 }, { "epoch": 1.5007239113487025, "grad_norm": 0.18783624470233917, "learning_rate": 4.961926503953503e-05, "loss": 0.4476, "num_input_tokens_seen": 16344800, "step": 13475 }, { "epoch": 1.5012807662323198, "grad_norm": 0.23591849207878113, "learning_rate": 4.961841971212931e-05, "loss": 0.4791, "num_input_tokens_seen": 16350912, "step": 13480 }, { "epoch": 1.5018376211159372, "grad_norm": 0.24307429790496826, "learning_rate": 4.961757345456077e-05, "loss": 0.4696, "num_input_tokens_seen": 16356672, "step": 13485 }, { "epoch": 1.5023944759995547, "grad_norm": 0.25161826610565186, "learning_rate": 4.9616726266861437e-05, "loss": 0.4658, "num_input_tokens_seen": 16362848, "step": 13490 }, { "epoch": 1.502951330883172, "grad_norm": 0.19313572347164154, "learning_rate": 4.961587814906329e-05, "loss": 0.4726, "num_input_tokens_seen": 16369120, "step": 13495 }, { "epoch": 1.5035081857667891, "grad_norm": 0.16530340909957886, "learning_rate": 4.961502910119837e-05, "loss": 0.4462, "num_input_tokens_seen": 16375168, "step": 13500 }, { "epoch": 1.5040650406504064, "grad_norm": 0.20025262236595154, "learning_rate": 4.9614179123298784e-05, "loss": 0.4648, "num_input_tokens_seen": 16381536, "step": 13505 }, { "epoch": 1.5046218955340238, "grad_norm": 0.17789609730243683, "learning_rate": 4.961332821539663e-05, "loss": 0.4713, "num_input_tokens_seen": 16387648, "step": 13510 }, { "epoch": 1.5051787504176413, "grad_norm": 0.21691599488258362, "learning_rate": 4.961247637752405e-05, "loss": 0.4552, "num_input_tokens_seen": 16393248, "step": 13515 }, { "epoch": 1.5057356053012585, "grad_norm": 0.19477041065692902, "learning_rate": 4.9611623609713236e-05, "loss": 0.4633, "num_input_tokens_seen": 16399232, "step": 13520 }, { "epoch": 1.5062924601848757, "grad_norm": 0.1485375463962555, "learning_rate": 4.9610769911996415e-05, "loss": 0.4566, "num_input_tokens_seen": 16405920, "step": 13525 }, { "epoch": 1.5068493150684932, "grad_norm": 0.18098264932632446, "learning_rate": 4.960991528440584e-05, "loss": 0.4612, "num_input_tokens_seen": 16411296, "step": 13530 }, { "epoch": 1.5074061699521106, "grad_norm": 0.16149909794330597, "learning_rate": 4.960905972697379e-05, "loss": 0.4615, "num_input_tokens_seen": 16417568, "step": 13535 }, { "epoch": 1.5079630248357279, "grad_norm": 0.2286270558834076, "learning_rate": 4.960820323973262e-05, "loss": 0.4871, "num_input_tokens_seen": 16423968, "step": 13540 }, { "epoch": 1.508519879719345, "grad_norm": 0.20373530685901642, "learning_rate": 4.960734582271465e-05, "loss": 0.4715, "num_input_tokens_seen": 16430304, "step": 13545 }, { "epoch": 1.5090767346029623, "grad_norm": 0.21143703162670135, "learning_rate": 4.9606487475952304e-05, "loss": 0.4805, "num_input_tokens_seen": 16435776, "step": 13550 }, { "epoch": 1.5096335894865798, "grad_norm": 0.19515448808670044, "learning_rate": 4.9605628199478e-05, "loss": 0.4755, "num_input_tokens_seen": 16442016, "step": 13555 }, { "epoch": 1.5101904443701972, "grad_norm": 0.2047482132911682, "learning_rate": 4.9604767993324224e-05, "loss": 0.4672, "num_input_tokens_seen": 16448032, "step": 13560 }, { "epoch": 1.5107472992538145, "grad_norm": 0.17931213974952698, "learning_rate": 4.9603906857523455e-05, "loss": 0.463, "num_input_tokens_seen": 16453856, "step": 13565 }, { "epoch": 1.5113041541374317, "grad_norm": 0.22975367307662964, "learning_rate": 4.960304479210824e-05, "loss": 0.4617, "num_input_tokens_seen": 16460160, "step": 13570 }, { "epoch": 1.5118610090210491, "grad_norm": 0.2564483880996704, "learning_rate": 4.960218179711115e-05, "loss": 0.4681, "num_input_tokens_seen": 16466784, "step": 13575 }, { "epoch": 1.5124178639046666, "grad_norm": 0.15935996174812317, "learning_rate": 4.9601317872564786e-05, "loss": 0.4557, "num_input_tokens_seen": 16473120, "step": 13580 }, { "epoch": 1.5129747187882838, "grad_norm": 0.2492247372865677, "learning_rate": 4.9600453018501793e-05, "loss": 0.466, "num_input_tokens_seen": 16479392, "step": 13585 }, { "epoch": 1.513531573671901, "grad_norm": 0.19946664571762085, "learning_rate": 4.959958723495485e-05, "loss": 0.4549, "num_input_tokens_seen": 16485824, "step": 13590 }, { "epoch": 1.5140884285555183, "grad_norm": 0.26642122864723206, "learning_rate": 4.959872052195666e-05, "loss": 0.473, "num_input_tokens_seen": 16492192, "step": 13595 }, { "epoch": 1.5146452834391357, "grad_norm": 0.2193562388420105, "learning_rate": 4.9597852879539995e-05, "loss": 0.4765, "num_input_tokens_seen": 16498368, "step": 13600 }, { "epoch": 1.5152021383227532, "grad_norm": 0.20751002430915833, "learning_rate": 4.9596984307737616e-05, "loss": 0.4618, "num_input_tokens_seen": 16504256, "step": 13605 }, { "epoch": 1.5157589932063704, "grad_norm": 0.19441531598567963, "learning_rate": 4.9596114806582336e-05, "loss": 0.4509, "num_input_tokens_seen": 16510272, "step": 13610 }, { "epoch": 1.5163158480899877, "grad_norm": 0.24104677140712738, "learning_rate": 4.9595244376107026e-05, "loss": 0.4849, "num_input_tokens_seen": 16516416, "step": 13615 }, { "epoch": 1.516872702973605, "grad_norm": 0.18380680680274963, "learning_rate": 4.9594373016344565e-05, "loss": 0.4767, "num_input_tokens_seen": 16522656, "step": 13620 }, { "epoch": 1.5174295578572226, "grad_norm": 0.2759155035018921, "learning_rate": 4.959350072732787e-05, "loss": 0.4679, "num_input_tokens_seen": 16528672, "step": 13625 }, { "epoch": 1.5179864127408398, "grad_norm": 0.2690718472003937, "learning_rate": 4.95926275090899e-05, "loss": 0.4988, "num_input_tokens_seen": 16534848, "step": 13630 }, { "epoch": 1.518543267624457, "grad_norm": 0.2509145736694336, "learning_rate": 4.9591753361663654e-05, "loss": 0.4747, "num_input_tokens_seen": 16541024, "step": 13635 }, { "epoch": 1.5191001225080742, "grad_norm": 0.20190833508968353, "learning_rate": 4.959087828508216e-05, "loss": 0.4835, "num_input_tokens_seen": 16547264, "step": 13640 }, { "epoch": 1.5196569773916917, "grad_norm": 0.16185618937015533, "learning_rate": 4.959000227937848e-05, "loss": 0.4452, "num_input_tokens_seen": 16553216, "step": 13645 }, { "epoch": 1.5202138322753092, "grad_norm": 0.1734624207019806, "learning_rate": 4.9589125344585706e-05, "loss": 0.4505, "num_input_tokens_seen": 16559296, "step": 13650 }, { "epoch": 1.5207706871589264, "grad_norm": 0.21050681173801422, "learning_rate": 4.958824748073698e-05, "loss": 0.4559, "num_input_tokens_seen": 16565344, "step": 13655 }, { "epoch": 1.5213275420425436, "grad_norm": 0.21195384860038757, "learning_rate": 4.958736868786547e-05, "loss": 0.4768, "num_input_tokens_seen": 16571104, "step": 13660 }, { "epoch": 1.521884396926161, "grad_norm": 0.12173876166343689, "learning_rate": 4.9586488966004374e-05, "loss": 0.4749, "num_input_tokens_seen": 16577056, "step": 13665 }, { "epoch": 1.5224412518097785, "grad_norm": 0.28870585560798645, "learning_rate": 4.958560831518694e-05, "loss": 0.4897, "num_input_tokens_seen": 16583360, "step": 13670 }, { "epoch": 1.5229981066933957, "grad_norm": 0.2196332961320877, "learning_rate": 4.958472673544643e-05, "loss": 0.4748, "num_input_tokens_seen": 16589600, "step": 13675 }, { "epoch": 1.523554961577013, "grad_norm": 0.2378702312707901, "learning_rate": 4.9583844226816157e-05, "loss": 0.4565, "num_input_tokens_seen": 16595904, "step": 13680 }, { "epoch": 1.5241118164606302, "grad_norm": 0.221322163939476, "learning_rate": 4.958296078932947e-05, "loss": 0.4669, "num_input_tokens_seen": 16602016, "step": 13685 }, { "epoch": 1.5246686713442477, "grad_norm": 0.18591240048408508, "learning_rate": 4.958207642301974e-05, "loss": 0.4665, "num_input_tokens_seen": 16608224, "step": 13690 }, { "epoch": 1.5252255262278651, "grad_norm": 0.27296867966651917, "learning_rate": 4.958119112792039e-05, "loss": 0.4708, "num_input_tokens_seen": 16614112, "step": 13695 }, { "epoch": 1.5257823811114823, "grad_norm": 0.22783678770065308, "learning_rate": 4.9580304904064856e-05, "loss": 0.4517, "num_input_tokens_seen": 16620224, "step": 13700 }, { "epoch": 1.5263392359950996, "grad_norm": 0.25035980343818665, "learning_rate": 4.957941775148664e-05, "loss": 0.4766, "num_input_tokens_seen": 16626048, "step": 13705 }, { "epoch": 1.526896090878717, "grad_norm": 0.23427025973796844, "learning_rate": 4.957852967021926e-05, "loss": 0.4597, "num_input_tokens_seen": 16632416, "step": 13710 }, { "epoch": 1.5274529457623345, "grad_norm": 0.20865732431411743, "learning_rate": 4.957764066029624e-05, "loss": 0.4523, "num_input_tokens_seen": 16638528, "step": 13715 }, { "epoch": 1.5280098006459517, "grad_norm": 0.2380865216255188, "learning_rate": 4.957675072175121e-05, "loss": 0.4696, "num_input_tokens_seen": 16643712, "step": 13720 }, { "epoch": 1.528566655529569, "grad_norm": 0.2147020846605301, "learning_rate": 4.957585985461778e-05, "loss": 0.4524, "num_input_tokens_seen": 16649568, "step": 13725 }, { "epoch": 1.5291235104131862, "grad_norm": 0.22251194715499878, "learning_rate": 4.957496805892959e-05, "loss": 0.4624, "num_input_tokens_seen": 16655744, "step": 13730 }, { "epoch": 1.5296803652968036, "grad_norm": 0.21415288746356964, "learning_rate": 4.957407533472037e-05, "loss": 0.4763, "num_input_tokens_seen": 16661984, "step": 13735 }, { "epoch": 1.530237220180421, "grad_norm": 0.18933100998401642, "learning_rate": 4.957318168202383e-05, "loss": 0.4585, "num_input_tokens_seen": 16668096, "step": 13740 }, { "epoch": 1.5307940750640383, "grad_norm": 0.2179015427827835, "learning_rate": 4.9572287100873734e-05, "loss": 0.4779, "num_input_tokens_seen": 16674048, "step": 13745 }, { "epoch": 1.5313509299476555, "grad_norm": 0.3057660758495331, "learning_rate": 4.957139159130388e-05, "loss": 0.496, "num_input_tokens_seen": 16680256, "step": 13750 }, { "epoch": 1.531907784831273, "grad_norm": 0.18132635951042175, "learning_rate": 4.957049515334812e-05, "loss": 0.4596, "num_input_tokens_seen": 16686496, "step": 13755 }, { "epoch": 1.5324646397148904, "grad_norm": 0.26002076268196106, "learning_rate": 4.9569597787040306e-05, "loss": 0.4713, "num_input_tokens_seen": 16692288, "step": 13760 }, { "epoch": 1.5330214945985077, "grad_norm": 0.20560471713542938, "learning_rate": 4.956869949241435e-05, "loss": 0.4617, "num_input_tokens_seen": 16698368, "step": 13765 }, { "epoch": 1.533578349482125, "grad_norm": 0.16786223649978638, "learning_rate": 4.9567800269504196e-05, "loss": 0.4702, "num_input_tokens_seen": 16704224, "step": 13770 }, { "epoch": 1.5341352043657421, "grad_norm": 0.16068701446056366, "learning_rate": 4.956690011834382e-05, "loss": 0.4541, "num_input_tokens_seen": 16710208, "step": 13775 }, { "epoch": 1.5346920592493596, "grad_norm": 0.1806153953075409, "learning_rate": 4.956599903896722e-05, "loss": 0.4639, "num_input_tokens_seen": 16716128, "step": 13780 }, { "epoch": 1.535248914132977, "grad_norm": 0.20294618606567383, "learning_rate": 4.956509703140845e-05, "loss": 0.4591, "num_input_tokens_seen": 16721664, "step": 13785 }, { "epoch": 1.5358057690165943, "grad_norm": 0.251713365316391, "learning_rate": 4.9564194095701597e-05, "loss": 0.4738, "num_input_tokens_seen": 16727680, "step": 13790 }, { "epoch": 1.5363626239002115, "grad_norm": 0.3135780394077301, "learning_rate": 4.956329023188078e-05, "loss": 0.4733, "num_input_tokens_seen": 16733632, "step": 13795 }, { "epoch": 1.536919478783829, "grad_norm": 0.21060369908809662, "learning_rate": 4.956238543998013e-05, "loss": 0.4607, "num_input_tokens_seen": 16738976, "step": 13800 }, { "epoch": 1.5374763336674464, "grad_norm": 0.18110030889511108, "learning_rate": 4.956147972003384e-05, "loss": 0.4671, "num_input_tokens_seen": 16744960, "step": 13805 }, { "epoch": 1.5380331885510636, "grad_norm": 0.2684793472290039, "learning_rate": 4.956057307207614e-05, "loss": 0.4945, "num_input_tokens_seen": 16750848, "step": 13810 }, { "epoch": 1.5385900434346809, "grad_norm": 0.19479554891586304, "learning_rate": 4.9559665496141285e-05, "loss": 0.4601, "num_input_tokens_seen": 16756480, "step": 13815 }, { "epoch": 1.539146898318298, "grad_norm": 0.30324873328208923, "learning_rate": 4.955875699226355e-05, "loss": 0.4691, "num_input_tokens_seen": 16762176, "step": 13820 }, { "epoch": 1.5397037532019155, "grad_norm": 0.18410420417785645, "learning_rate": 4.955784756047729e-05, "loss": 0.4381, "num_input_tokens_seen": 16768544, "step": 13825 }, { "epoch": 1.540260608085533, "grad_norm": 0.28811904788017273, "learning_rate": 4.955693720081684e-05, "loss": 0.4216, "num_input_tokens_seen": 16774144, "step": 13830 }, { "epoch": 1.5408174629691502, "grad_norm": 0.2800420820713043, "learning_rate": 4.955602591331661e-05, "loss": 0.4716, "num_input_tokens_seen": 16780352, "step": 13835 }, { "epoch": 1.5413743178527675, "grad_norm": 0.18970979750156403, "learning_rate": 4.9555113698011024e-05, "loss": 0.45, "num_input_tokens_seen": 16786496, "step": 13840 }, { "epoch": 1.541931172736385, "grad_norm": 0.3192828893661499, "learning_rate": 4.955420055493456e-05, "loss": 0.4937, "num_input_tokens_seen": 16792416, "step": 13845 }, { "epoch": 1.5424880276200024, "grad_norm": 0.33984747529029846, "learning_rate": 4.95532864841217e-05, "loss": 0.4844, "num_input_tokens_seen": 16798784, "step": 13850 }, { "epoch": 1.5430448825036196, "grad_norm": 0.2083112597465515, "learning_rate": 4.9552371485606995e-05, "loss": 0.4854, "num_input_tokens_seen": 16804864, "step": 13855 }, { "epoch": 1.5436017373872368, "grad_norm": 0.19831326603889465, "learning_rate": 4.955145555942502e-05, "loss": 0.4496, "num_input_tokens_seen": 16811136, "step": 13860 }, { "epoch": 1.544158592270854, "grad_norm": 0.2317361831665039, "learning_rate": 4.955053870561037e-05, "loss": 0.4671, "num_input_tokens_seen": 16816864, "step": 13865 }, { "epoch": 1.5447154471544715, "grad_norm": 0.20737187564373016, "learning_rate": 4.9549620924197695e-05, "loss": 0.4611, "num_input_tokens_seen": 16823264, "step": 13870 }, { "epoch": 1.545272302038089, "grad_norm": 0.24215474724769592, "learning_rate": 4.954870221522166e-05, "loss": 0.4747, "num_input_tokens_seen": 16829504, "step": 13875 }, { "epoch": 1.5458291569217062, "grad_norm": 0.20182698965072632, "learning_rate": 4.954778257871699e-05, "loss": 0.4471, "num_input_tokens_seen": 16835552, "step": 13880 }, { "epoch": 1.5463860118053234, "grad_norm": 0.1843152493238449, "learning_rate": 4.9546862014718425e-05, "loss": 0.4582, "num_input_tokens_seen": 16841408, "step": 13885 }, { "epoch": 1.5469428666889409, "grad_norm": 0.19498224556446075, "learning_rate": 4.954594052326075e-05, "loss": 0.4755, "num_input_tokens_seen": 16847744, "step": 13890 }, { "epoch": 1.5474997215725583, "grad_norm": 0.19223086535930634, "learning_rate": 4.954501810437878e-05, "loss": 0.4569, "num_input_tokens_seen": 16853120, "step": 13895 }, { "epoch": 1.5480565764561756, "grad_norm": 0.1708400994539261, "learning_rate": 4.954409475810737e-05, "loss": 0.4496, "num_input_tokens_seen": 16859072, "step": 13900 }, { "epoch": 1.5486134313397928, "grad_norm": 0.1964581310749054, "learning_rate": 4.954317048448141e-05, "loss": 0.4587, "num_input_tokens_seen": 16865184, "step": 13905 }, { "epoch": 1.54917028622341, "grad_norm": 0.24533392488956451, "learning_rate": 4.9542245283535807e-05, "loss": 0.4535, "num_input_tokens_seen": 16870912, "step": 13910 }, { "epoch": 1.5497271411070275, "grad_norm": 0.1773114800453186, "learning_rate": 4.9541319155305535e-05, "loss": 0.4436, "num_input_tokens_seen": 16876608, "step": 13915 }, { "epoch": 1.550283995990645, "grad_norm": 0.21505288779735565, "learning_rate": 4.954039209982557e-05, "loss": 0.4635, "num_input_tokens_seen": 16881408, "step": 13920 }, { "epoch": 1.5508408508742622, "grad_norm": 0.14594800770282745, "learning_rate": 4.953946411713095e-05, "loss": 0.4634, "num_input_tokens_seen": 16887520, "step": 13925 }, { "epoch": 1.5513977057578794, "grad_norm": 0.1869124323129654, "learning_rate": 4.953853520725674e-05, "loss": 0.4775, "num_input_tokens_seen": 16893696, "step": 13930 }, { "epoch": 1.5519545606414968, "grad_norm": 0.20429731905460358, "learning_rate": 4.9537605370238025e-05, "loss": 0.48, "num_input_tokens_seen": 16899936, "step": 13935 }, { "epoch": 1.5525114155251143, "grad_norm": 0.18766747415065765, "learning_rate": 4.9536674606109955e-05, "loss": 0.434, "num_input_tokens_seen": 16905824, "step": 13940 }, { "epoch": 1.5530682704087315, "grad_norm": 0.20270563662052155, "learning_rate": 4.953574291490768e-05, "loss": 0.4939, "num_input_tokens_seen": 16912032, "step": 13945 }, { "epoch": 1.5536251252923488, "grad_norm": 0.2002352625131607, "learning_rate": 4.953481029666641e-05, "loss": 0.4702, "num_input_tokens_seen": 16917600, "step": 13950 }, { "epoch": 1.554181980175966, "grad_norm": 0.21705716848373413, "learning_rate": 4.9533876751421384e-05, "loss": 0.4471, "num_input_tokens_seen": 16923808, "step": 13955 }, { "epoch": 1.5547388350595834, "grad_norm": 0.23021118342876434, "learning_rate": 4.9532942279207866e-05, "loss": 0.4559, "num_input_tokens_seen": 16929888, "step": 13960 }, { "epoch": 1.5552956899432009, "grad_norm": 0.23660911619663239, "learning_rate": 4.953200688006118e-05, "loss": 0.4693, "num_input_tokens_seen": 16936128, "step": 13965 }, { "epoch": 1.5558525448268181, "grad_norm": 0.25649696588516235, "learning_rate": 4.9531070554016646e-05, "loss": 0.4583, "num_input_tokens_seen": 16942240, "step": 13970 }, { "epoch": 1.5564093997104353, "grad_norm": 0.2125299721956253, "learning_rate": 4.9530133301109654e-05, "loss": 0.4573, "num_input_tokens_seen": 16948160, "step": 13975 }, { "epoch": 1.5569662545940528, "grad_norm": 0.2047385424375534, "learning_rate": 4.952919512137561e-05, "loss": 0.4592, "num_input_tokens_seen": 16954016, "step": 13980 }, { "epoch": 1.5575231094776703, "grad_norm": 0.17558321356773376, "learning_rate": 4.952825601484998e-05, "loss": 0.4587, "num_input_tokens_seen": 16959872, "step": 13985 }, { "epoch": 1.5580799643612875, "grad_norm": 0.18372742831707, "learning_rate": 4.952731598156823e-05, "loss": 0.454, "num_input_tokens_seen": 16966208, "step": 13990 }, { "epoch": 1.5586368192449047, "grad_norm": 0.17178769409656525, "learning_rate": 4.952637502156587e-05, "loss": 0.4732, "num_input_tokens_seen": 16972544, "step": 13995 }, { "epoch": 1.559193674128522, "grad_norm": 0.28687840700149536, "learning_rate": 4.952543313487848e-05, "loss": 0.491, "num_input_tokens_seen": 16978944, "step": 14000 }, { "epoch": 1.5597505290121394, "grad_norm": 0.27703437209129333, "learning_rate": 4.952449032154162e-05, "loss": 0.4749, "num_input_tokens_seen": 16984768, "step": 14005 }, { "epoch": 1.5603073838957568, "grad_norm": 0.16589106619358063, "learning_rate": 4.952354658159093e-05, "loss": 0.4761, "num_input_tokens_seen": 16991200, "step": 14010 }, { "epoch": 1.560864238779374, "grad_norm": 0.190923810005188, "learning_rate": 4.952260191506205e-05, "loss": 0.4589, "num_input_tokens_seen": 16997312, "step": 14015 }, { "epoch": 1.5614210936629913, "grad_norm": 0.20727092027664185, "learning_rate": 4.952165632199069e-05, "loss": 0.4696, "num_input_tokens_seen": 17003296, "step": 14020 }, { "epoch": 1.5619779485466088, "grad_norm": 0.25101640820503235, "learning_rate": 4.9520709802412566e-05, "loss": 0.4536, "num_input_tokens_seen": 17009216, "step": 14025 }, { "epoch": 1.5625348034302262, "grad_norm": 0.19109435379505157, "learning_rate": 4.951976235636345e-05, "loss": 0.4623, "num_input_tokens_seen": 17015200, "step": 14030 }, { "epoch": 1.5630916583138434, "grad_norm": 0.1725245863199234, "learning_rate": 4.951881398387913e-05, "loss": 0.4636, "num_input_tokens_seen": 17021248, "step": 14035 }, { "epoch": 1.5636485131974607, "grad_norm": 0.19326990842819214, "learning_rate": 4.9517864684995454e-05, "loss": 0.471, "num_input_tokens_seen": 17027552, "step": 14040 }, { "epoch": 1.564205368081078, "grad_norm": 0.21443694829940796, "learning_rate": 4.9516914459748266e-05, "loss": 0.4867, "num_input_tokens_seen": 17033056, "step": 14045 }, { "epoch": 1.5647622229646954, "grad_norm": 0.24337781965732574, "learning_rate": 4.951596330817349e-05, "loss": 0.4693, "num_input_tokens_seen": 17039360, "step": 14050 }, { "epoch": 1.5653190778483128, "grad_norm": 0.24370470643043518, "learning_rate": 4.951501123030705e-05, "loss": 0.4878, "num_input_tokens_seen": 17045824, "step": 14055 }, { "epoch": 1.56587593273193, "grad_norm": 0.17794832587242126, "learning_rate": 4.951405822618492e-05, "loss": 0.4584, "num_input_tokens_seen": 17052096, "step": 14060 }, { "epoch": 1.5664327876155473, "grad_norm": 0.20390287041664124, "learning_rate": 4.951310429584311e-05, "loss": 0.4759, "num_input_tokens_seen": 17058144, "step": 14065 }, { "epoch": 1.5669896424991647, "grad_norm": 0.20436535775661469, "learning_rate": 4.951214943931768e-05, "loss": 0.4488, "num_input_tokens_seen": 17064320, "step": 14070 }, { "epoch": 1.5675464973827822, "grad_norm": 0.5661044120788574, "learning_rate": 4.951119365664468e-05, "loss": 0.4568, "num_input_tokens_seen": 17070464, "step": 14075 }, { "epoch": 1.5681033522663994, "grad_norm": 0.15577572584152222, "learning_rate": 4.951023694786024e-05, "loss": 0.4624, "num_input_tokens_seen": 17076352, "step": 14080 }, { "epoch": 1.5686602071500166, "grad_norm": 0.3042193651199341, "learning_rate": 4.9509279313000486e-05, "loss": 0.4595, "num_input_tokens_seen": 17082272, "step": 14085 }, { "epoch": 1.5692170620336339, "grad_norm": 0.2287653684616089, "learning_rate": 4.950832075210163e-05, "loss": 0.4557, "num_input_tokens_seen": 17087744, "step": 14090 }, { "epoch": 1.5697739169172513, "grad_norm": 0.2940143048763275, "learning_rate": 4.950736126519987e-05, "loss": 0.47, "num_input_tokens_seen": 17093728, "step": 14095 }, { "epoch": 1.5703307718008688, "grad_norm": 0.16503740847110748, "learning_rate": 4.950640085233146e-05, "loss": 0.4314, "num_input_tokens_seen": 17099776, "step": 14100 }, { "epoch": 1.570887626684486, "grad_norm": 0.2677422761917114, "learning_rate": 4.95054395135327e-05, "loss": 0.465, "num_input_tokens_seen": 17105216, "step": 14105 }, { "epoch": 1.5714444815681032, "grad_norm": 0.19313322007656097, "learning_rate": 4.95044772488399e-05, "loss": 0.4739, "num_input_tokens_seen": 17111392, "step": 14110 }, { "epoch": 1.5720013364517207, "grad_norm": 0.17255021631717682, "learning_rate": 4.9503514058289415e-05, "loss": 0.4531, "num_input_tokens_seen": 17117312, "step": 14115 }, { "epoch": 1.5725581913353381, "grad_norm": 0.18119168281555176, "learning_rate": 4.9502549941917656e-05, "loss": 0.4671, "num_input_tokens_seen": 17123616, "step": 14120 }, { "epoch": 1.5731150462189554, "grad_norm": 0.20504146814346313, "learning_rate": 4.950158489976103e-05, "loss": 0.4766, "num_input_tokens_seen": 17129920, "step": 14125 }, { "epoch": 1.5736719011025726, "grad_norm": 0.24876731634140015, "learning_rate": 4.9500618931856e-05, "loss": 0.4672, "num_input_tokens_seen": 17135904, "step": 14130 }, { "epoch": 1.57422875598619, "grad_norm": 0.19660064578056335, "learning_rate": 4.9499652038239084e-05, "loss": 0.4792, "num_input_tokens_seen": 17141760, "step": 14135 }, { "epoch": 1.5747856108698073, "grad_norm": 0.1972694844007492, "learning_rate": 4.949868421894679e-05, "loss": 0.4694, "num_input_tokens_seen": 17147584, "step": 14140 }, { "epoch": 1.5753424657534247, "grad_norm": 0.21092233061790466, "learning_rate": 4.9497715474015704e-05, "loss": 0.4704, "num_input_tokens_seen": 17153696, "step": 14145 }, { "epoch": 1.575899320637042, "grad_norm": 0.24099469184875488, "learning_rate": 4.949674580348242e-05, "loss": 0.4747, "num_input_tokens_seen": 17159904, "step": 14150 }, { "epoch": 1.5764561755206592, "grad_norm": 0.20381762087345123, "learning_rate": 4.9495775207383576e-05, "loss": 0.4749, "num_input_tokens_seen": 17166016, "step": 14155 }, { "epoch": 1.5770130304042767, "grad_norm": 0.2671506404876709, "learning_rate": 4.949480368575584e-05, "loss": 0.4666, "num_input_tokens_seen": 17172256, "step": 14160 }, { "epoch": 1.577569885287894, "grad_norm": 0.25130024552345276, "learning_rate": 4.949383123863593e-05, "loss": 0.4665, "num_input_tokens_seen": 17178336, "step": 14165 }, { "epoch": 1.5781267401715113, "grad_norm": 0.2434871643781662, "learning_rate": 4.949285786606058e-05, "loss": 0.4538, "num_input_tokens_seen": 17184224, "step": 14170 }, { "epoch": 1.5786835950551286, "grad_norm": 0.23739129304885864, "learning_rate": 4.949188356806657e-05, "loss": 0.4729, "num_input_tokens_seen": 17190432, "step": 14175 }, { "epoch": 1.579240449938746, "grad_norm": 0.30675068497657776, "learning_rate": 4.94909083446907e-05, "loss": 0.4633, "num_input_tokens_seen": 17196608, "step": 14180 }, { "epoch": 1.5797973048223632, "grad_norm": 0.2107297033071518, "learning_rate": 4.948993219596984e-05, "loss": 0.4596, "num_input_tokens_seen": 17202784, "step": 14185 }, { "epoch": 1.5803541597059807, "grad_norm": 0.20061348378658295, "learning_rate": 4.948895512194085e-05, "loss": 0.4664, "num_input_tokens_seen": 17208960, "step": 14190 }, { "epoch": 1.580911014589598, "grad_norm": 0.17215664684772491, "learning_rate": 4.948797712264066e-05, "loss": 0.4702, "num_input_tokens_seen": 17215328, "step": 14195 }, { "epoch": 1.5814678694732152, "grad_norm": 0.19982881844043732, "learning_rate": 4.948699819810622e-05, "loss": 0.4639, "num_input_tokens_seen": 17221632, "step": 14200 }, { "epoch": 1.5820247243568326, "grad_norm": 0.2989717423915863, "learning_rate": 4.948601834837451e-05, "loss": 0.4518, "num_input_tokens_seen": 17227392, "step": 14205 }, { "epoch": 1.58258157924045, "grad_norm": 0.18476268649101257, "learning_rate": 4.9485037573482565e-05, "loss": 0.4723, "num_input_tokens_seen": 17233216, "step": 14210 }, { "epoch": 1.5831384341240673, "grad_norm": 0.16968896985054016, "learning_rate": 4.9484055873467436e-05, "loss": 0.4447, "num_input_tokens_seen": 17239136, "step": 14215 }, { "epoch": 1.5836952890076845, "grad_norm": 0.1735527068376541, "learning_rate": 4.948307324836621e-05, "loss": 0.4691, "num_input_tokens_seen": 17245344, "step": 14220 }, { "epoch": 1.584252143891302, "grad_norm": 0.22833257913589478, "learning_rate": 4.9482089698216016e-05, "loss": 0.4614, "num_input_tokens_seen": 17251776, "step": 14225 }, { "epoch": 1.5848089987749192, "grad_norm": 0.25991955399513245, "learning_rate": 4.9481105223054014e-05, "loss": 0.4824, "num_input_tokens_seen": 17258016, "step": 14230 }, { "epoch": 1.5853658536585367, "grad_norm": 0.1902209222316742, "learning_rate": 4.9480119822917404e-05, "loss": 0.4636, "num_input_tokens_seen": 17264064, "step": 14235 }, { "epoch": 1.585922708542154, "grad_norm": 0.25300344824790955, "learning_rate": 4.947913349784342e-05, "loss": 0.4628, "num_input_tokens_seen": 17269696, "step": 14240 }, { "epoch": 1.5864795634257711, "grad_norm": 0.19530457258224487, "learning_rate": 4.947814624786932e-05, "loss": 0.4674, "num_input_tokens_seen": 17275712, "step": 14245 }, { "epoch": 1.5870364183093886, "grad_norm": 0.2086520940065384, "learning_rate": 4.9477158073032414e-05, "loss": 0.4543, "num_input_tokens_seen": 17281824, "step": 14250 }, { "epoch": 1.587593273193006, "grad_norm": 0.18148672580718994, "learning_rate": 4.9476168973370044e-05, "loss": 0.4679, "num_input_tokens_seen": 17287808, "step": 14255 }, { "epoch": 1.5881501280766233, "grad_norm": 0.2630629241466522, "learning_rate": 4.947517894891956e-05, "loss": 0.4629, "num_input_tokens_seen": 17293856, "step": 14260 }, { "epoch": 1.5887069829602405, "grad_norm": 0.2058643400669098, "learning_rate": 4.947418799971838e-05, "loss": 0.4671, "num_input_tokens_seen": 17299904, "step": 14265 }, { "epoch": 1.589263837843858, "grad_norm": 0.242979496717453, "learning_rate": 4.947319612580396e-05, "loss": 0.4658, "num_input_tokens_seen": 17306112, "step": 14270 }, { "epoch": 1.5898206927274752, "grad_norm": 0.17041859030723572, "learning_rate": 4.9472203327213746e-05, "loss": 0.4555, "num_input_tokens_seen": 17312064, "step": 14275 }, { "epoch": 1.5903775476110926, "grad_norm": 0.23467829823493958, "learning_rate": 4.947120960398527e-05, "loss": 0.4601, "num_input_tokens_seen": 17317984, "step": 14280 }, { "epoch": 1.5909344024947099, "grad_norm": 0.28710365295410156, "learning_rate": 4.947021495615608e-05, "loss": 0.4777, "num_input_tokens_seen": 17323936, "step": 14285 }, { "epoch": 1.591491257378327, "grad_norm": 0.18020033836364746, "learning_rate": 4.946921938376374e-05, "loss": 0.4841, "num_input_tokens_seen": 17329888, "step": 14290 }, { "epoch": 1.5920481122619445, "grad_norm": 0.14708705246448517, "learning_rate": 4.946822288684588e-05, "loss": 0.467, "num_input_tokens_seen": 17335904, "step": 14295 }, { "epoch": 1.592604967145562, "grad_norm": 0.30965113639831543, "learning_rate": 4.9467225465440154e-05, "loss": 0.4745, "num_input_tokens_seen": 17341664, "step": 14300 }, { "epoch": 1.5931618220291792, "grad_norm": 0.20003573596477509, "learning_rate": 4.946622711958424e-05, "loss": 0.4728, "num_input_tokens_seen": 17347936, "step": 14305 }, { "epoch": 1.5937186769127965, "grad_norm": 0.18011438846588135, "learning_rate": 4.946522784931585e-05, "loss": 0.4641, "num_input_tokens_seen": 17353408, "step": 14310 }, { "epoch": 1.594275531796414, "grad_norm": 0.14753687381744385, "learning_rate": 4.9464227654672755e-05, "loss": 0.4558, "num_input_tokens_seen": 17359584, "step": 14315 }, { "epoch": 1.5948323866800311, "grad_norm": 0.24335134029388428, "learning_rate": 4.946322653569274e-05, "loss": 0.4802, "num_input_tokens_seen": 17366144, "step": 14320 }, { "epoch": 1.5953892415636486, "grad_norm": 0.2415645718574524, "learning_rate": 4.9462224492413627e-05, "loss": 0.4792, "num_input_tokens_seen": 17372288, "step": 14325 }, { "epoch": 1.5959460964472658, "grad_norm": 0.2231404036283493, "learning_rate": 4.946122152487328e-05, "loss": 0.4781, "num_input_tokens_seen": 17378464, "step": 14330 }, { "epoch": 1.596502951330883, "grad_norm": 0.28692853450775146, "learning_rate": 4.94602176331096e-05, "loss": 0.4599, "num_input_tokens_seen": 17384224, "step": 14335 }, { "epoch": 1.5970598062145005, "grad_norm": 0.1785765439271927, "learning_rate": 4.94592128171605e-05, "loss": 0.4611, "num_input_tokens_seen": 17390112, "step": 14340 }, { "epoch": 1.597616661098118, "grad_norm": 0.17544187605381012, "learning_rate": 4.9458207077063965e-05, "loss": 0.4552, "num_input_tokens_seen": 17396160, "step": 14345 }, { "epoch": 1.5981735159817352, "grad_norm": 0.28287991881370544, "learning_rate": 4.945720041285799e-05, "loss": 0.462, "num_input_tokens_seen": 17402400, "step": 14350 }, { "epoch": 1.5987303708653524, "grad_norm": 0.17260226607322693, "learning_rate": 4.9456192824580605e-05, "loss": 0.4549, "num_input_tokens_seen": 17408672, "step": 14355 }, { "epoch": 1.5992872257489699, "grad_norm": 0.19559140503406525, "learning_rate": 4.9455184312269873e-05, "loss": 0.4641, "num_input_tokens_seen": 17414432, "step": 14360 }, { "epoch": 1.599844080632587, "grad_norm": 0.21544942259788513, "learning_rate": 4.945417487596391e-05, "loss": 0.4626, "num_input_tokens_seen": 17420096, "step": 14365 }, { "epoch": 1.6004009355162045, "grad_norm": 0.2046653926372528, "learning_rate": 4.9453164515700856e-05, "loss": 0.4799, "num_input_tokens_seen": 17426304, "step": 14370 }, { "epoch": 1.6009577903998218, "grad_norm": 0.2015429139137268, "learning_rate": 4.9452153231518884e-05, "loss": 0.4576, "num_input_tokens_seen": 17432416, "step": 14375 }, { "epoch": 1.601514645283439, "grad_norm": 0.27455687522888184, "learning_rate": 4.9451141023456196e-05, "loss": 0.4735, "num_input_tokens_seen": 17438560, "step": 14380 }, { "epoch": 1.6020715001670565, "grad_norm": 0.17140668630599976, "learning_rate": 4.945012789155105e-05, "loss": 0.464, "num_input_tokens_seen": 17444544, "step": 14385 }, { "epoch": 1.602628355050674, "grad_norm": 0.21592991054058075, "learning_rate": 4.9449113835841714e-05, "loss": 0.4724, "num_input_tokens_seen": 17450400, "step": 14390 }, { "epoch": 1.6031852099342911, "grad_norm": 0.2304200977087021, "learning_rate": 4.944809885636651e-05, "loss": 0.4815, "num_input_tokens_seen": 17456352, "step": 14395 }, { "epoch": 1.6037420648179084, "grad_norm": 0.2076960653066635, "learning_rate": 4.9447082953163784e-05, "loss": 0.4718, "num_input_tokens_seen": 17461984, "step": 14400 }, { "epoch": 1.6042989197015258, "grad_norm": 0.1918114870786667, "learning_rate": 4.9446066126271905e-05, "loss": 0.4582, "num_input_tokens_seen": 17468096, "step": 14405 }, { "epoch": 1.604855774585143, "grad_norm": 0.15665899217128754, "learning_rate": 4.944504837572932e-05, "loss": 0.4644, "num_input_tokens_seen": 17474080, "step": 14410 }, { "epoch": 1.6054126294687605, "grad_norm": 0.2276458740234375, "learning_rate": 4.944402970157447e-05, "loss": 0.4731, "num_input_tokens_seen": 17480064, "step": 14415 }, { "epoch": 1.6059694843523777, "grad_norm": 0.18411566317081451, "learning_rate": 4.9443010103845834e-05, "loss": 0.4751, "num_input_tokens_seen": 17486016, "step": 14420 }, { "epoch": 1.606526339235995, "grad_norm": 0.20165173709392548, "learning_rate": 4.944198958258195e-05, "loss": 0.4754, "num_input_tokens_seen": 17492448, "step": 14425 }, { "epoch": 1.6070831941196124, "grad_norm": 0.2509423494338989, "learning_rate": 4.944096813782138e-05, "loss": 0.464, "num_input_tokens_seen": 17498528, "step": 14430 }, { "epoch": 1.6076400490032299, "grad_norm": 0.16183310747146606, "learning_rate": 4.9439945769602695e-05, "loss": 0.4437, "num_input_tokens_seen": 17504800, "step": 14435 }, { "epoch": 1.608196903886847, "grad_norm": 0.17737779021263123, "learning_rate": 4.943892247796454e-05, "loss": 0.4631, "num_input_tokens_seen": 17510880, "step": 14440 }, { "epoch": 1.6087537587704643, "grad_norm": 0.3041091859340668, "learning_rate": 4.943789826294558e-05, "loss": 0.4665, "num_input_tokens_seen": 17516608, "step": 14445 }, { "epoch": 1.6093106136540818, "grad_norm": 0.1572476178407669, "learning_rate": 4.9436873124584507e-05, "loss": 0.4539, "num_input_tokens_seen": 17522496, "step": 14450 }, { "epoch": 1.6098674685376992, "grad_norm": 0.24706393480300903, "learning_rate": 4.9435847062920054e-05, "loss": 0.4584, "num_input_tokens_seen": 17528000, "step": 14455 }, { "epoch": 1.6104243234213165, "grad_norm": 0.20463398098945618, "learning_rate": 4.9434820077991e-05, "loss": 0.4728, "num_input_tokens_seen": 17533856, "step": 14460 }, { "epoch": 1.6109811783049337, "grad_norm": 0.20104524493217468, "learning_rate": 4.943379216983612e-05, "loss": 0.4769, "num_input_tokens_seen": 17540192, "step": 14465 }, { "epoch": 1.611538033188551, "grad_norm": 0.20719583332538605, "learning_rate": 4.943276333849428e-05, "loss": 0.4637, "num_input_tokens_seen": 17546272, "step": 14470 }, { "epoch": 1.6120948880721684, "grad_norm": 0.19076833128929138, "learning_rate": 4.943173358400435e-05, "loss": 0.4893, "num_input_tokens_seen": 17552320, "step": 14475 }, { "epoch": 1.6126517429557858, "grad_norm": 0.2078697383403778, "learning_rate": 4.943070290640521e-05, "loss": 0.458, "num_input_tokens_seen": 17558208, "step": 14480 }, { "epoch": 1.613208597839403, "grad_norm": 0.2709624767303467, "learning_rate": 4.9429671305735844e-05, "loss": 0.476, "num_input_tokens_seen": 17564320, "step": 14485 }, { "epoch": 1.6137654527230203, "grad_norm": 0.19793108105659485, "learning_rate": 4.94286387820352e-05, "loss": 0.4822, "num_input_tokens_seen": 17570528, "step": 14490 }, { "epoch": 1.6143223076066378, "grad_norm": 0.18535156548023224, "learning_rate": 4.942760533534228e-05, "loss": 0.4665, "num_input_tokens_seen": 17576864, "step": 14495 }, { "epoch": 1.6148791624902552, "grad_norm": 0.1743307262659073, "learning_rate": 4.9426570965696175e-05, "loss": 0.4501, "num_input_tokens_seen": 17583232, "step": 14500 }, { "epoch": 1.6154360173738724, "grad_norm": 0.19600684940814972, "learning_rate": 4.942553567313592e-05, "loss": 0.4598, "num_input_tokens_seen": 17589216, "step": 14505 }, { "epoch": 1.6159928722574897, "grad_norm": 0.23469600081443787, "learning_rate": 4.942449945770067e-05, "loss": 0.467, "num_input_tokens_seen": 17595616, "step": 14510 }, { "epoch": 1.616549727141107, "grad_norm": 0.258473664522171, "learning_rate": 4.942346231942955e-05, "loss": 0.4684, "num_input_tokens_seen": 17601664, "step": 14515 }, { "epoch": 1.6171065820247243, "grad_norm": 0.13645972311496735, "learning_rate": 4.942242425836174e-05, "loss": 0.4662, "num_input_tokens_seen": 17607712, "step": 14520 }, { "epoch": 1.6176634369083418, "grad_norm": 0.3263697922229767, "learning_rate": 4.94213852745365e-05, "loss": 0.4638, "num_input_tokens_seen": 17613856, "step": 14525 }, { "epoch": 1.618220291791959, "grad_norm": 0.22494928538799286, "learning_rate": 4.9420345367993045e-05, "loss": 0.4898, "num_input_tokens_seen": 17619936, "step": 14530 }, { "epoch": 1.6187771466755763, "grad_norm": 0.2787463366985321, "learning_rate": 4.9419304538770696e-05, "loss": 0.4599, "num_input_tokens_seen": 17626208, "step": 14535 }, { "epoch": 1.6193340015591937, "grad_norm": 0.19890940189361572, "learning_rate": 4.9418262786908757e-05, "loss": 0.4553, "num_input_tokens_seen": 17632160, "step": 14540 }, { "epoch": 1.6198908564428112, "grad_norm": 0.22467447817325592, "learning_rate": 4.94172201124466e-05, "loss": 0.4795, "num_input_tokens_seen": 17638144, "step": 14545 }, { "epoch": 1.6204477113264284, "grad_norm": 0.1980888843536377, "learning_rate": 4.941617651542362e-05, "loss": 0.4842, "num_input_tokens_seen": 17644288, "step": 14550 }, { "epoch": 1.6210045662100456, "grad_norm": 0.19957731664180756, "learning_rate": 4.941513199587924e-05, "loss": 0.4835, "num_input_tokens_seen": 17649984, "step": 14555 }, { "epoch": 1.6215614210936629, "grad_norm": 0.15399934351444244, "learning_rate": 4.941408655385294e-05, "loss": 0.4616, "num_input_tokens_seen": 17656224, "step": 14560 }, { "epoch": 1.6221182759772803, "grad_norm": 0.1969933807849884, "learning_rate": 4.9413040189384206e-05, "loss": 0.4783, "num_input_tokens_seen": 17661984, "step": 14565 }, { "epoch": 1.6226751308608978, "grad_norm": 0.1851852685213089, "learning_rate": 4.9411992902512574e-05, "loss": 0.4587, "num_input_tokens_seen": 17668320, "step": 14570 }, { "epoch": 1.623231985744515, "grad_norm": 0.18864832818508148, "learning_rate": 4.941094469327763e-05, "loss": 0.4508, "num_input_tokens_seen": 17674464, "step": 14575 }, { "epoch": 1.6237888406281322, "grad_norm": 0.16020245850086212, "learning_rate": 4.9409895561718954e-05, "loss": 0.4667, "num_input_tokens_seen": 17680512, "step": 14580 }, { "epoch": 1.6243456955117497, "grad_norm": 0.18364140391349792, "learning_rate": 4.940884550787621e-05, "loss": 0.4677, "num_input_tokens_seen": 17686848, "step": 14585 }, { "epoch": 1.6249025503953671, "grad_norm": 0.15406256914138794, "learning_rate": 4.940779453178905e-05, "loss": 0.4649, "num_input_tokens_seen": 17692736, "step": 14590 }, { "epoch": 1.6254594052789844, "grad_norm": 0.1707935333251953, "learning_rate": 4.94067426334972e-05, "loss": 0.4718, "num_input_tokens_seen": 17699136, "step": 14595 }, { "epoch": 1.6260162601626016, "grad_norm": 0.19917742908000946, "learning_rate": 4.94056898130404e-05, "loss": 0.4787, "num_input_tokens_seen": 17705600, "step": 14600 }, { "epoch": 1.6265731150462188, "grad_norm": 0.15328441560268402, "learning_rate": 4.940463607045842e-05, "loss": 0.4656, "num_input_tokens_seen": 17711808, "step": 14605 }, { "epoch": 1.6271299699298363, "grad_norm": 0.17210112512111664, "learning_rate": 4.9403581405791087e-05, "loss": 0.4721, "num_input_tokens_seen": 17718080, "step": 14610 }, { "epoch": 1.6276868248134537, "grad_norm": 0.20147590339183807, "learning_rate": 4.940252581907824e-05, "loss": 0.47, "num_input_tokens_seen": 17724352, "step": 14615 }, { "epoch": 1.628243679697071, "grad_norm": 0.19226111471652985, "learning_rate": 4.940146931035976e-05, "loss": 0.4549, "num_input_tokens_seen": 17730592, "step": 14620 }, { "epoch": 1.6288005345806882, "grad_norm": 0.15962721407413483, "learning_rate": 4.9400411879675583e-05, "loss": 0.4753, "num_input_tokens_seen": 17736640, "step": 14625 }, { "epoch": 1.6293573894643056, "grad_norm": 0.17603924870491028, "learning_rate": 4.939935352706565e-05, "loss": 0.4572, "num_input_tokens_seen": 17742720, "step": 14630 }, { "epoch": 1.629914244347923, "grad_norm": 0.1778733730316162, "learning_rate": 4.9398294252569946e-05, "loss": 0.4803, "num_input_tokens_seen": 17748608, "step": 14635 }, { "epoch": 1.6304710992315403, "grad_norm": 0.2290545254945755, "learning_rate": 4.9397234056228495e-05, "loss": 0.4733, "num_input_tokens_seen": 17754752, "step": 14640 }, { "epoch": 1.6310279541151576, "grad_norm": 0.21020066738128662, "learning_rate": 4.9396172938081356e-05, "loss": 0.4579, "num_input_tokens_seen": 17760992, "step": 14645 }, { "epoch": 1.6315848089987748, "grad_norm": 0.20021837949752808, "learning_rate": 4.9395110898168616e-05, "loss": 0.4725, "num_input_tokens_seen": 17767104, "step": 14650 }, { "epoch": 1.6321416638823922, "grad_norm": 0.17432396113872528, "learning_rate": 4.939404793653042e-05, "loss": 0.4714, "num_input_tokens_seen": 17773408, "step": 14655 }, { "epoch": 1.6326985187660097, "grad_norm": 0.22348254919052124, "learning_rate": 4.939298405320691e-05, "loss": 0.4775, "num_input_tokens_seen": 17779488, "step": 14660 }, { "epoch": 1.633255373649627, "grad_norm": 0.15776070952415466, "learning_rate": 4.9391919248238295e-05, "loss": 0.4613, "num_input_tokens_seen": 17785792, "step": 14665 }, { "epoch": 1.6338122285332441, "grad_norm": 0.21299564838409424, "learning_rate": 4.9390853521664803e-05, "loss": 0.4585, "num_input_tokens_seen": 17792096, "step": 14670 }, { "epoch": 1.6343690834168616, "grad_norm": 0.16994386911392212, "learning_rate": 4.93897868735267e-05, "loss": 0.4727, "num_input_tokens_seen": 17798304, "step": 14675 }, { "epoch": 1.634925938300479, "grad_norm": 0.296631783246994, "learning_rate": 4.93887193038643e-05, "loss": 0.4718, "num_input_tokens_seen": 17803808, "step": 14680 }, { "epoch": 1.6354827931840963, "grad_norm": 0.14253072440624237, "learning_rate": 4.9387650812717906e-05, "loss": 0.4648, "num_input_tokens_seen": 17810336, "step": 14685 }, { "epoch": 1.6360396480677135, "grad_norm": 0.2562386393547058, "learning_rate": 4.9386581400127916e-05, "loss": 0.4619, "num_input_tokens_seen": 17816416, "step": 14690 }, { "epoch": 1.6365965029513307, "grad_norm": 0.2010965794324875, "learning_rate": 4.938551106613474e-05, "loss": 0.4853, "num_input_tokens_seen": 17822720, "step": 14695 }, { "epoch": 1.6371533578349482, "grad_norm": 0.1775406152009964, "learning_rate": 4.9384439810778807e-05, "loss": 0.4864, "num_input_tokens_seen": 17828576, "step": 14700 }, { "epoch": 1.6377102127185656, "grad_norm": 0.2586476504802704, "learning_rate": 4.938336763410058e-05, "loss": 0.4722, "num_input_tokens_seen": 17835040, "step": 14705 }, { "epoch": 1.6382670676021829, "grad_norm": 0.1469097137451172, "learning_rate": 4.93822945361406e-05, "loss": 0.4693, "num_input_tokens_seen": 17841280, "step": 14710 }, { "epoch": 1.6388239224858, "grad_norm": 0.25429248809814453, "learning_rate": 4.9381220516939385e-05, "loss": 0.4628, "num_input_tokens_seen": 17847168, "step": 14715 }, { "epoch": 1.6393807773694176, "grad_norm": 0.15604059398174286, "learning_rate": 4.938014557653753e-05, "loss": 0.4675, "num_input_tokens_seen": 17853344, "step": 14720 }, { "epoch": 1.639937632253035, "grad_norm": 0.16380558907985687, "learning_rate": 4.9379069714975644e-05, "loss": 0.4717, "num_input_tokens_seen": 17859328, "step": 14725 }, { "epoch": 1.6404944871366522, "grad_norm": 0.21589140594005585, "learning_rate": 4.937799293229438e-05, "loss": 0.4658, "num_input_tokens_seen": 17865472, "step": 14730 }, { "epoch": 1.6410513420202695, "grad_norm": 0.18987606465816498, "learning_rate": 4.9376915228534415e-05, "loss": 0.4677, "num_input_tokens_seen": 17871680, "step": 14735 }, { "epoch": 1.6416081969038867, "grad_norm": 0.14917466044425964, "learning_rate": 4.937583660373648e-05, "loss": 0.4831, "num_input_tokens_seen": 17877984, "step": 14740 }, { "epoch": 1.6421650517875042, "grad_norm": 0.1794416755437851, "learning_rate": 4.937475705794132e-05, "loss": 0.4584, "num_input_tokens_seen": 17884160, "step": 14745 }, { "epoch": 1.6427219066711216, "grad_norm": 0.17189520597457886, "learning_rate": 4.937367659118972e-05, "loss": 0.4717, "num_input_tokens_seen": 17890464, "step": 14750 }, { "epoch": 1.6432787615547388, "grad_norm": 0.20241211354732513, "learning_rate": 4.937259520352251e-05, "loss": 0.456, "num_input_tokens_seen": 17896864, "step": 14755 }, { "epoch": 1.643835616438356, "grad_norm": 0.17868313193321228, "learning_rate": 4.9371512894980554e-05, "loss": 0.4751, "num_input_tokens_seen": 17902656, "step": 14760 }, { "epoch": 1.6443924713219735, "grad_norm": 0.14486876130104065, "learning_rate": 4.9370429665604734e-05, "loss": 0.4515, "num_input_tokens_seen": 17908256, "step": 14765 }, { "epoch": 1.644949326205591, "grad_norm": 0.19516123831272125, "learning_rate": 4.9369345515435986e-05, "loss": 0.4609, "num_input_tokens_seen": 17913952, "step": 14770 }, { "epoch": 1.6455061810892082, "grad_norm": 0.2091374546289444, "learning_rate": 4.9368260444515264e-05, "loss": 0.4717, "num_input_tokens_seen": 17920256, "step": 14775 }, { "epoch": 1.6460630359728254, "grad_norm": 0.2087995409965515, "learning_rate": 4.9367174452883566e-05, "loss": 0.4688, "num_input_tokens_seen": 17925664, "step": 14780 }, { "epoch": 1.6466198908564427, "grad_norm": 0.19142328202724457, "learning_rate": 4.936608754058194e-05, "loss": 0.4741, "num_input_tokens_seen": 17931744, "step": 14785 }, { "epoch": 1.6471767457400601, "grad_norm": 0.18812811374664307, "learning_rate": 4.9364999707651436e-05, "loss": 0.4679, "num_input_tokens_seen": 17937888, "step": 14790 }, { "epoch": 1.6477336006236776, "grad_norm": 0.19466155767440796, "learning_rate": 4.9363910954133166e-05, "loss": 0.4417, "num_input_tokens_seen": 17943840, "step": 14795 }, { "epoch": 1.6482904555072948, "grad_norm": 0.15729086101055145, "learning_rate": 4.936282128006826e-05, "loss": 0.4803, "num_input_tokens_seen": 17949984, "step": 14800 }, { "epoch": 1.648847310390912, "grad_norm": 0.21789342164993286, "learning_rate": 4.936173068549789e-05, "loss": 0.4474, "num_input_tokens_seen": 17955968, "step": 14805 }, { "epoch": 1.6494041652745295, "grad_norm": 0.19396047294139862, "learning_rate": 4.936063917046326e-05, "loss": 0.45, "num_input_tokens_seen": 17962048, "step": 14810 }, { "epoch": 1.649961020158147, "grad_norm": 0.1805177628993988, "learning_rate": 4.9359546735005616e-05, "loss": 0.4833, "num_input_tokens_seen": 17968096, "step": 14815 }, { "epoch": 1.6505178750417642, "grad_norm": 0.18719510734081268, "learning_rate": 4.935845337916624e-05, "loss": 0.4892, "num_input_tokens_seen": 17974048, "step": 14820 }, { "epoch": 1.6510747299253814, "grad_norm": 0.176234632730484, "learning_rate": 4.935735910298643e-05, "loss": 0.4714, "num_input_tokens_seen": 17980256, "step": 14825 }, { "epoch": 1.6516315848089986, "grad_norm": 0.2843340039253235, "learning_rate": 4.9356263906507534e-05, "loss": 0.4782, "num_input_tokens_seen": 17986432, "step": 14830 }, { "epoch": 1.652188439692616, "grad_norm": 0.19287140667438507, "learning_rate": 4.935516778977093e-05, "loss": 0.4602, "num_input_tokens_seen": 17992320, "step": 14835 }, { "epoch": 1.6527452945762335, "grad_norm": 0.19330552220344543, "learning_rate": 4.935407075281805e-05, "loss": 0.4789, "num_input_tokens_seen": 17998720, "step": 14840 }, { "epoch": 1.6533021494598508, "grad_norm": 0.22567559778690338, "learning_rate": 4.935297279569032e-05, "loss": 0.4622, "num_input_tokens_seen": 18005312, "step": 14845 }, { "epoch": 1.653859004343468, "grad_norm": 0.22432191669940948, "learning_rate": 4.9351873918429236e-05, "loss": 0.4788, "num_input_tokens_seen": 18011328, "step": 14850 }, { "epoch": 1.6544158592270855, "grad_norm": 0.26245054602622986, "learning_rate": 4.9350774121076323e-05, "loss": 0.474, "num_input_tokens_seen": 18017504, "step": 14855 }, { "epoch": 1.654972714110703, "grad_norm": 0.25368592143058777, "learning_rate": 4.9349673403673127e-05, "loss": 0.4581, "num_input_tokens_seen": 18023680, "step": 14860 }, { "epoch": 1.6555295689943201, "grad_norm": 0.20114408433437347, "learning_rate": 4.934857176626123e-05, "loss": 0.4616, "num_input_tokens_seen": 18030016, "step": 14865 }, { "epoch": 1.6560864238779374, "grad_norm": 0.1897040456533432, "learning_rate": 4.9347469208882265e-05, "loss": 0.4827, "num_input_tokens_seen": 18036192, "step": 14870 }, { "epoch": 1.6566432787615546, "grad_norm": 0.2515348792076111, "learning_rate": 4.934636573157789e-05, "loss": 0.4758, "num_input_tokens_seen": 18042144, "step": 14875 }, { "epoch": 1.657200133645172, "grad_norm": 0.17208829522132874, "learning_rate": 4.934526133438979e-05, "loss": 0.4634, "num_input_tokens_seen": 18048544, "step": 14880 }, { "epoch": 1.6577569885287895, "grad_norm": 0.174798846244812, "learning_rate": 4.934415601735971e-05, "loss": 0.474, "num_input_tokens_seen": 18054400, "step": 14885 }, { "epoch": 1.6583138434124067, "grad_norm": 0.1587785929441452, "learning_rate": 4.9343049780529395e-05, "loss": 0.4641, "num_input_tokens_seen": 18060672, "step": 14890 }, { "epoch": 1.658870698296024, "grad_norm": 0.14506542682647705, "learning_rate": 4.934194262394065e-05, "loss": 0.4633, "num_input_tokens_seen": 18066368, "step": 14895 }, { "epoch": 1.6594275531796414, "grad_norm": 0.20568223297595978, "learning_rate": 4.9340834547635295e-05, "loss": 0.4569, "num_input_tokens_seen": 18072768, "step": 14900 }, { "epoch": 1.6599844080632589, "grad_norm": 0.23792670667171478, "learning_rate": 4.933972555165521e-05, "loss": 0.4744, "num_input_tokens_seen": 18078720, "step": 14905 }, { "epoch": 1.660541262946876, "grad_norm": 0.1718105971813202, "learning_rate": 4.933861563604231e-05, "loss": 0.4696, "num_input_tokens_seen": 18084480, "step": 14910 }, { "epoch": 1.6610981178304933, "grad_norm": 0.1737540066242218, "learning_rate": 4.93375048008385e-05, "loss": 0.4604, "num_input_tokens_seen": 18090816, "step": 14915 }, { "epoch": 1.6616549727141106, "grad_norm": 0.15505753457546234, "learning_rate": 4.933639304608577e-05, "loss": 0.459, "num_input_tokens_seen": 18097024, "step": 14920 }, { "epoch": 1.662211827597728, "grad_norm": 0.296031653881073, "learning_rate": 4.933528037182611e-05, "loss": 0.4695, "num_input_tokens_seen": 18103328, "step": 14925 }, { "epoch": 1.6627686824813455, "grad_norm": 0.16741305589675903, "learning_rate": 4.933416677810158e-05, "loss": 0.4708, "num_input_tokens_seen": 18109440, "step": 14930 }, { "epoch": 1.6633255373649627, "grad_norm": 0.1990329623222351, "learning_rate": 4.933305226495425e-05, "loss": 0.4842, "num_input_tokens_seen": 18115104, "step": 14935 }, { "epoch": 1.66388239224858, "grad_norm": 0.16938118636608124, "learning_rate": 4.9331936832426216e-05, "loss": 0.4736, "num_input_tokens_seen": 18121152, "step": 14940 }, { "epoch": 1.6644392471321974, "grad_norm": 0.17892004549503326, "learning_rate": 4.933082048055965e-05, "loss": 0.4873, "num_input_tokens_seen": 18127488, "step": 14945 }, { "epoch": 1.6649961020158148, "grad_norm": 0.18453571200370789, "learning_rate": 4.93297032093967e-05, "loss": 0.4641, "num_input_tokens_seen": 18132832, "step": 14950 }, { "epoch": 1.665552956899432, "grad_norm": 0.2136038839817047, "learning_rate": 4.93285850189796e-05, "loss": 0.4588, "num_input_tokens_seen": 18139008, "step": 14955 }, { "epoch": 1.6661098117830493, "grad_norm": 0.15099848806858063, "learning_rate": 4.932746590935059e-05, "loss": 0.4671, "num_input_tokens_seen": 18144832, "step": 14960 }, { "epoch": 1.6666666666666665, "grad_norm": 0.22569797933101654, "learning_rate": 4.932634588055196e-05, "loss": 0.4519, "num_input_tokens_seen": 18151008, "step": 14965 }, { "epoch": 1.667223521550284, "grad_norm": 0.15977740287780762, "learning_rate": 4.932522493262602e-05, "loss": 0.4712, "num_input_tokens_seen": 18156896, "step": 14970 }, { "epoch": 1.6677803764339014, "grad_norm": 0.22130760550498962, "learning_rate": 4.932410306561513e-05, "loss": 0.4569, "num_input_tokens_seen": 18163456, "step": 14975 }, { "epoch": 1.6683372313175187, "grad_norm": 0.16197462379932404, "learning_rate": 4.932298027956168e-05, "loss": 0.4566, "num_input_tokens_seen": 18169632, "step": 14980 }, { "epoch": 1.6688940862011359, "grad_norm": 0.17263738811016083, "learning_rate": 4.9321856574508085e-05, "loss": 0.4605, "num_input_tokens_seen": 18175616, "step": 14985 }, { "epoch": 1.6694509410847533, "grad_norm": 0.15507544577121735, "learning_rate": 4.9320731950496804e-05, "loss": 0.4596, "num_input_tokens_seen": 18181760, "step": 14990 }, { "epoch": 1.6700077959683708, "grad_norm": 0.22585687041282654, "learning_rate": 4.931960640757033e-05, "loss": 0.4527, "num_input_tokens_seen": 18187968, "step": 14995 }, { "epoch": 1.670564650851988, "grad_norm": 0.19519004225730896, "learning_rate": 4.931847994577119e-05, "loss": 0.451, "num_input_tokens_seen": 18194016, "step": 15000 }, { "epoch": 1.6711215057356053, "grad_norm": 0.22737732529640198, "learning_rate": 4.931735256514195e-05, "loss": 0.4659, "num_input_tokens_seen": 18200256, "step": 15005 }, { "epoch": 1.6716783606192225, "grad_norm": 0.1964365690946579, "learning_rate": 4.93162242657252e-05, "loss": 0.4706, "num_input_tokens_seen": 18206176, "step": 15010 }, { "epoch": 1.67223521550284, "grad_norm": 0.20059989392757416, "learning_rate": 4.931509504756357e-05, "loss": 0.4771, "num_input_tokens_seen": 18212416, "step": 15015 }, { "epoch": 1.6727920703864574, "grad_norm": 0.2389405369758606, "learning_rate": 4.931396491069973e-05, "loss": 0.4559, "num_input_tokens_seen": 18218528, "step": 15020 }, { "epoch": 1.6733489252700746, "grad_norm": 0.16726063191890717, "learning_rate": 4.931283385517637e-05, "loss": 0.4511, "num_input_tokens_seen": 18224064, "step": 15025 }, { "epoch": 1.6739057801536918, "grad_norm": 0.167337104678154, "learning_rate": 4.9311701881036245e-05, "loss": 0.4792, "num_input_tokens_seen": 18229888, "step": 15030 }, { "epoch": 1.6744626350373093, "grad_norm": 0.21608057618141174, "learning_rate": 4.93105689883221e-05, "loss": 0.4737, "num_input_tokens_seen": 18236160, "step": 15035 }, { "epoch": 1.6750194899209268, "grad_norm": 0.16372200846672058, "learning_rate": 4.930943517707676e-05, "loss": 0.4742, "num_input_tokens_seen": 18241824, "step": 15040 }, { "epoch": 1.675576344804544, "grad_norm": 0.16584622859954834, "learning_rate": 4.9308300447343046e-05, "loss": 0.4629, "num_input_tokens_seen": 18248224, "step": 15045 }, { "epoch": 1.6761331996881612, "grad_norm": 0.21866853535175323, "learning_rate": 4.930716479916385e-05, "loss": 0.4727, "num_input_tokens_seen": 18253984, "step": 15050 }, { "epoch": 1.6766900545717784, "grad_norm": 0.16248902678489685, "learning_rate": 4.9306028232582076e-05, "loss": 0.4748, "num_input_tokens_seen": 18259904, "step": 15055 }, { "epoch": 1.677246909455396, "grad_norm": 0.26274919509887695, "learning_rate": 4.930489074764065e-05, "loss": 0.4471, "num_input_tokens_seen": 18265344, "step": 15060 }, { "epoch": 1.6778037643390133, "grad_norm": 0.20012937486171722, "learning_rate": 4.930375234438257e-05, "loss": 0.4649, "num_input_tokens_seen": 18271200, "step": 15065 }, { "epoch": 1.6783606192226306, "grad_norm": 0.1945965439081192, "learning_rate": 4.930261302285084e-05, "loss": 0.475, "num_input_tokens_seen": 18277216, "step": 15070 }, { "epoch": 1.6789174741062478, "grad_norm": 0.17838336527347565, "learning_rate": 4.930147278308852e-05, "loss": 0.4536, "num_input_tokens_seen": 18283232, "step": 15075 }, { "epoch": 1.6794743289898653, "grad_norm": 0.23011738061904907, "learning_rate": 4.9300331625138665e-05, "loss": 0.4678, "num_input_tokens_seen": 18289184, "step": 15080 }, { "epoch": 1.6800311838734827, "grad_norm": 0.15775448083877563, "learning_rate": 4.929918954904441e-05, "loss": 0.4501, "num_input_tokens_seen": 18295456, "step": 15085 }, { "epoch": 1.6805880387571, "grad_norm": 0.18207897245883942, "learning_rate": 4.9298046554848905e-05, "loss": 0.4885, "num_input_tokens_seen": 18301824, "step": 15090 }, { "epoch": 1.6811448936407172, "grad_norm": 0.1509847342967987, "learning_rate": 4.9296902642595336e-05, "loss": 0.4766, "num_input_tokens_seen": 18308128, "step": 15095 }, { "epoch": 1.6817017485243344, "grad_norm": 0.2729351222515106, "learning_rate": 4.9295757812326916e-05, "loss": 0.4642, "num_input_tokens_seen": 18314208, "step": 15100 }, { "epoch": 1.6822586034079519, "grad_norm": 0.2065896838903427, "learning_rate": 4.9294612064086906e-05, "loss": 0.4735, "num_input_tokens_seen": 18320128, "step": 15105 }, { "epoch": 1.6828154582915693, "grad_norm": 0.2696123719215393, "learning_rate": 4.9293465397918605e-05, "loss": 0.4725, "num_input_tokens_seen": 18326496, "step": 15110 }, { "epoch": 1.6833723131751865, "grad_norm": 0.18626071512699127, "learning_rate": 4.929231781386532e-05, "loss": 0.4586, "num_input_tokens_seen": 18332608, "step": 15115 }, { "epoch": 1.6839291680588038, "grad_norm": 0.22748716175556183, "learning_rate": 4.929116931197043e-05, "loss": 0.4795, "num_input_tokens_seen": 18338688, "step": 15120 }, { "epoch": 1.6844860229424212, "grad_norm": 0.21237704157829285, "learning_rate": 4.929001989227731e-05, "loss": 0.4804, "num_input_tokens_seen": 18344896, "step": 15125 }, { "epoch": 1.6850428778260387, "grad_norm": 0.17944034934043884, "learning_rate": 4.92888695548294e-05, "loss": 0.4618, "num_input_tokens_seen": 18350944, "step": 15130 }, { "epoch": 1.685599732709656, "grad_norm": 0.1791679710149765, "learning_rate": 4.928771829967016e-05, "loss": 0.4686, "num_input_tokens_seen": 18357280, "step": 15135 }, { "epoch": 1.6861565875932731, "grad_norm": 0.21792533993721008, "learning_rate": 4.928656612684309e-05, "loss": 0.4739, "num_input_tokens_seen": 18363360, "step": 15140 }, { "epoch": 1.6867134424768904, "grad_norm": 0.21127735078334808, "learning_rate": 4.928541303639172e-05, "loss": 0.4551, "num_input_tokens_seen": 18369504, "step": 15145 }, { "epoch": 1.6872702973605078, "grad_norm": 0.17757432162761688, "learning_rate": 4.9284259028359625e-05, "loss": 0.4864, "num_input_tokens_seen": 18375168, "step": 15150 }, { "epoch": 1.6878271522441253, "grad_norm": 0.28198280930519104, "learning_rate": 4.928310410279039e-05, "loss": 0.4839, "num_input_tokens_seen": 18381120, "step": 15155 }, { "epoch": 1.6883840071277425, "grad_norm": 0.19442704319953918, "learning_rate": 4.9281948259727676e-05, "loss": 0.4957, "num_input_tokens_seen": 18387328, "step": 15160 }, { "epoch": 1.6889408620113597, "grad_norm": 0.16886688768863678, "learning_rate": 4.928079149921513e-05, "loss": 0.4588, "num_input_tokens_seen": 18393152, "step": 15165 }, { "epoch": 1.6894977168949772, "grad_norm": 0.16237612068653107, "learning_rate": 4.927963382129648e-05, "loss": 0.4617, "num_input_tokens_seen": 18399392, "step": 15170 }, { "epoch": 1.6900545717785946, "grad_norm": 0.13990192115306854, "learning_rate": 4.927847522601544e-05, "loss": 0.4683, "num_input_tokens_seen": 18405472, "step": 15175 }, { "epoch": 1.6906114266622119, "grad_norm": 0.1689816415309906, "learning_rate": 4.9277315713415815e-05, "loss": 0.4744, "num_input_tokens_seen": 18411008, "step": 15180 }, { "epoch": 1.691168281545829, "grad_norm": 0.18609684705734253, "learning_rate": 4.9276155283541404e-05, "loss": 0.4567, "num_input_tokens_seen": 18417248, "step": 15185 }, { "epoch": 1.6917251364294463, "grad_norm": 0.21239149570465088, "learning_rate": 4.927499393643604e-05, "loss": 0.4684, "num_input_tokens_seen": 18423232, "step": 15190 }, { "epoch": 1.6922819913130638, "grad_norm": 0.1857145130634308, "learning_rate": 4.927383167214362e-05, "loss": 0.4747, "num_input_tokens_seen": 18429440, "step": 15195 }, { "epoch": 1.6928388461966812, "grad_norm": 0.15365423262119293, "learning_rate": 4.9272668490708046e-05, "loss": 0.4627, "num_input_tokens_seen": 18435488, "step": 15200 }, { "epoch": 1.6933957010802985, "grad_norm": 0.1611592173576355, "learning_rate": 4.9271504392173275e-05, "loss": 0.4631, "num_input_tokens_seen": 18441408, "step": 15205 }, { "epoch": 1.6939525559639157, "grad_norm": 0.17640797793865204, "learning_rate": 4.927033937658328e-05, "loss": 0.4796, "num_input_tokens_seen": 18447360, "step": 15210 }, { "epoch": 1.6945094108475331, "grad_norm": 0.23588845133781433, "learning_rate": 4.926917344398209e-05, "loss": 0.4667, "num_input_tokens_seen": 18453248, "step": 15215 }, { "epoch": 1.6950662657311506, "grad_norm": 0.19126498699188232, "learning_rate": 4.926800659441375e-05, "loss": 0.4789, "num_input_tokens_seen": 18459424, "step": 15220 }, { "epoch": 1.6956231206147678, "grad_norm": 0.244869202375412, "learning_rate": 4.926683882792235e-05, "loss": 0.4666, "num_input_tokens_seen": 18465376, "step": 15225 }, { "epoch": 1.696179975498385, "grad_norm": 0.16515450179576874, "learning_rate": 4.926567014455201e-05, "loss": 0.4819, "num_input_tokens_seen": 18471520, "step": 15230 }, { "epoch": 1.6967368303820023, "grad_norm": 0.1742752641439438, "learning_rate": 4.926450054434689e-05, "loss": 0.4627, "num_input_tokens_seen": 18477856, "step": 15235 }, { "epoch": 1.6972936852656197, "grad_norm": 0.17253711819648743, "learning_rate": 4.9263330027351184e-05, "loss": 0.4553, "num_input_tokens_seen": 18483808, "step": 15240 }, { "epoch": 1.6978505401492372, "grad_norm": 0.22162126004695892, "learning_rate": 4.9262158593609115e-05, "loss": 0.474, "num_input_tokens_seen": 18489952, "step": 15245 }, { "epoch": 1.6984073950328544, "grad_norm": 0.20303447544574738, "learning_rate": 4.926098624316494e-05, "loss": 0.4604, "num_input_tokens_seen": 18495904, "step": 15250 }, { "epoch": 1.6989642499164717, "grad_norm": 0.34942713379859924, "learning_rate": 4.925981297606296e-05, "loss": 0.4862, "num_input_tokens_seen": 18502208, "step": 15255 }, { "epoch": 1.699521104800089, "grad_norm": 0.16303417086601257, "learning_rate": 4.92586387923475e-05, "loss": 0.462, "num_input_tokens_seen": 18508448, "step": 15260 }, { "epoch": 1.7000779596837066, "grad_norm": 0.22414883971214294, "learning_rate": 4.9257463692062916e-05, "loss": 0.4565, "num_input_tokens_seen": 18514752, "step": 15265 }, { "epoch": 1.7006348145673238, "grad_norm": 0.20635981857776642, "learning_rate": 4.9256287675253624e-05, "loss": 0.4513, "num_input_tokens_seen": 18520960, "step": 15270 }, { "epoch": 1.701191669450941, "grad_norm": 0.22614096105098724, "learning_rate": 4.925511074196405e-05, "loss": 0.4585, "num_input_tokens_seen": 18526848, "step": 15275 }, { "epoch": 1.7017485243345583, "grad_norm": 0.16773849725723267, "learning_rate": 4.925393289223866e-05, "loss": 0.4724, "num_input_tokens_seen": 18532608, "step": 15280 }, { "epoch": 1.7023053792181757, "grad_norm": 0.16955670714378357, "learning_rate": 4.925275412612197e-05, "loss": 0.4668, "num_input_tokens_seen": 18538784, "step": 15285 }, { "epoch": 1.7028622341017932, "grad_norm": 0.18605197966098785, "learning_rate": 4.92515744436585e-05, "loss": 0.4673, "num_input_tokens_seen": 18545280, "step": 15290 }, { "epoch": 1.7034190889854104, "grad_norm": 0.18145598471164703, "learning_rate": 4.9250393844892826e-05, "loss": 0.4648, "num_input_tokens_seen": 18551456, "step": 15295 }, { "epoch": 1.7039759438690276, "grad_norm": 0.2005997598171234, "learning_rate": 4.9249212329869564e-05, "loss": 0.4803, "num_input_tokens_seen": 18557376, "step": 15300 }, { "epoch": 1.704532798752645, "grad_norm": 0.18487787246704102, "learning_rate": 4.9248029898633354e-05, "loss": 0.4813, "num_input_tokens_seen": 18563680, "step": 15305 }, { "epoch": 1.7050896536362625, "grad_norm": 0.3105957508087158, "learning_rate": 4.924684655122886e-05, "loss": 0.4586, "num_input_tokens_seen": 18569632, "step": 15310 }, { "epoch": 1.7056465085198798, "grad_norm": 0.13753944635391235, "learning_rate": 4.92456622877008e-05, "loss": 0.4531, "num_input_tokens_seen": 18575712, "step": 15315 }, { "epoch": 1.706203363403497, "grad_norm": 0.2025965303182602, "learning_rate": 4.924447710809392e-05, "loss": 0.4814, "num_input_tokens_seen": 18581760, "step": 15320 }, { "epoch": 1.7067602182871142, "grad_norm": 0.24529050290584564, "learning_rate": 4.924329101245301e-05, "loss": 0.4766, "num_input_tokens_seen": 18588128, "step": 15325 }, { "epoch": 1.7073170731707317, "grad_norm": 0.20637305080890656, "learning_rate": 4.924210400082287e-05, "loss": 0.4425, "num_input_tokens_seen": 18594272, "step": 15330 }, { "epoch": 1.7078739280543491, "grad_norm": 0.2676665484905243, "learning_rate": 4.924091607324834e-05, "loss": 0.476, "num_input_tokens_seen": 18600352, "step": 15335 }, { "epoch": 1.7084307829379664, "grad_norm": 0.20389823615550995, "learning_rate": 4.9239727229774325e-05, "loss": 0.4683, "num_input_tokens_seen": 18606528, "step": 15340 }, { "epoch": 1.7089876378215836, "grad_norm": 0.2540924847126007, "learning_rate": 4.923853747044574e-05, "loss": 0.4608, "num_input_tokens_seen": 18612512, "step": 15345 }, { "epoch": 1.709544492705201, "grad_norm": 0.21344271302223206, "learning_rate": 4.923734679530753e-05, "loss": 0.4532, "num_input_tokens_seen": 18618592, "step": 15350 }, { "epoch": 1.7101013475888185, "grad_norm": 0.17368756234645844, "learning_rate": 4.923615520440468e-05, "loss": 0.4719, "num_input_tokens_seen": 18624448, "step": 15355 }, { "epoch": 1.7106582024724357, "grad_norm": 0.18503513932228088, "learning_rate": 4.9234962697782227e-05, "loss": 0.4649, "num_input_tokens_seen": 18629984, "step": 15360 }, { "epoch": 1.711215057356053, "grad_norm": 0.14930330216884613, "learning_rate": 4.923376927548521e-05, "loss": 0.4596, "num_input_tokens_seen": 18636224, "step": 15365 }, { "epoch": 1.7117719122396702, "grad_norm": 0.19436340034008026, "learning_rate": 4.923257493755874e-05, "loss": 0.4549, "num_input_tokens_seen": 18642432, "step": 15370 }, { "epoch": 1.7123287671232876, "grad_norm": 0.16385775804519653, "learning_rate": 4.9231379684047925e-05, "loss": 0.4592, "num_input_tokens_seen": 18648672, "step": 15375 }, { "epoch": 1.712885622006905, "grad_norm": 0.22041596472263336, "learning_rate": 4.923018351499793e-05, "loss": 0.4501, "num_input_tokens_seen": 18654816, "step": 15380 }, { "epoch": 1.7134424768905223, "grad_norm": 0.16540411114692688, "learning_rate": 4.922898643045395e-05, "loss": 0.4635, "num_input_tokens_seen": 18660928, "step": 15385 }, { "epoch": 1.7139993317741395, "grad_norm": 0.20277690887451172, "learning_rate": 4.922778843046123e-05, "loss": 0.4659, "num_input_tokens_seen": 18667104, "step": 15390 }, { "epoch": 1.714556186657757, "grad_norm": 0.20061194896697998, "learning_rate": 4.922658951506501e-05, "loss": 0.4708, "num_input_tokens_seen": 18673120, "step": 15395 }, { "epoch": 1.7151130415413745, "grad_norm": 0.20354712009429932, "learning_rate": 4.922538968431061e-05, "loss": 0.4671, "num_input_tokens_seen": 18679200, "step": 15400 }, { "epoch": 1.7156698964249917, "grad_norm": 0.20213156938552856, "learning_rate": 4.922418893824334e-05, "loss": 0.4671, "num_input_tokens_seen": 18685568, "step": 15405 }, { "epoch": 1.716226751308609, "grad_norm": 0.1963404268026352, "learning_rate": 4.9222987276908586e-05, "loss": 0.4483, "num_input_tokens_seen": 18691776, "step": 15410 }, { "epoch": 1.7167836061922261, "grad_norm": 0.18313896656036377, "learning_rate": 4.922178470035175e-05, "loss": 0.4548, "num_input_tokens_seen": 18697824, "step": 15415 }, { "epoch": 1.7173404610758436, "grad_norm": 0.16078828275203705, "learning_rate": 4.922058120861827e-05, "loss": 0.4793, "num_input_tokens_seen": 18703712, "step": 15420 }, { "epoch": 1.717897315959461, "grad_norm": 0.24372264742851257, "learning_rate": 4.92193768017536e-05, "loss": 0.4483, "num_input_tokens_seen": 18709568, "step": 15425 }, { "epoch": 1.7184541708430783, "grad_norm": 0.16330061852931976, "learning_rate": 4.921817147980328e-05, "loss": 0.4458, "num_input_tokens_seen": 18715552, "step": 15430 }, { "epoch": 1.7190110257266955, "grad_norm": 0.15701276063919067, "learning_rate": 4.921696524281281e-05, "loss": 0.4757, "num_input_tokens_seen": 18721760, "step": 15435 }, { "epoch": 1.719567880610313, "grad_norm": 0.20137418806552887, "learning_rate": 4.921575809082779e-05, "loss": 0.4774, "num_input_tokens_seen": 18727744, "step": 15440 }, { "epoch": 1.7201247354939304, "grad_norm": 0.1806001365184784, "learning_rate": 4.9214550023893835e-05, "loss": 0.4625, "num_input_tokens_seen": 18733792, "step": 15445 }, { "epoch": 1.7206815903775476, "grad_norm": 0.2560443580150604, "learning_rate": 4.921334104205658e-05, "loss": 0.4598, "num_input_tokens_seen": 18739744, "step": 15450 }, { "epoch": 1.7212384452611649, "grad_norm": 0.1505006104707718, "learning_rate": 4.9212131145361697e-05, "loss": 0.4679, "num_input_tokens_seen": 18745632, "step": 15455 }, { "epoch": 1.721795300144782, "grad_norm": 0.24109913408756256, "learning_rate": 4.9210920333854914e-05, "loss": 0.4648, "num_input_tokens_seen": 18751936, "step": 15460 }, { "epoch": 1.7223521550283996, "grad_norm": 0.2860967218875885, "learning_rate": 4.920970860758197e-05, "loss": 0.4596, "num_input_tokens_seen": 18758080, "step": 15465 }, { "epoch": 1.722909009912017, "grad_norm": 0.17840252816677094, "learning_rate": 4.9208495966588654e-05, "loss": 0.4674, "num_input_tokens_seen": 18764032, "step": 15470 }, { "epoch": 1.7234658647956342, "grad_norm": 0.16374140977859497, "learning_rate": 4.920728241092077e-05, "loss": 0.4444, "num_input_tokens_seen": 18770080, "step": 15475 }, { "epoch": 1.7240227196792515, "grad_norm": 0.17346832156181335, "learning_rate": 4.920606794062419e-05, "loss": 0.4678, "num_input_tokens_seen": 18776224, "step": 15480 }, { "epoch": 1.724579574562869, "grad_norm": 0.20115843415260315, "learning_rate": 4.92048525557448e-05, "loss": 0.4674, "num_input_tokens_seen": 18782400, "step": 15485 }, { "epoch": 1.7251364294464864, "grad_norm": 0.23489435017108917, "learning_rate": 4.92036362563285e-05, "loss": 0.4659, "num_input_tokens_seen": 18787904, "step": 15490 }, { "epoch": 1.7256932843301036, "grad_norm": 0.24097202718257904, "learning_rate": 4.9202419042421254e-05, "loss": 0.4637, "num_input_tokens_seen": 18793696, "step": 15495 }, { "epoch": 1.7262501392137208, "grad_norm": 0.2290819138288498, "learning_rate": 4.9201200914069065e-05, "loss": 0.4793, "num_input_tokens_seen": 18799680, "step": 15500 }, { "epoch": 1.7268069940973383, "grad_norm": 0.17640535533428192, "learning_rate": 4.919998187131795e-05, "loss": 0.4528, "num_input_tokens_seen": 18805824, "step": 15505 }, { "epoch": 1.7273638489809555, "grad_norm": 0.2152310609817505, "learning_rate": 4.919876191421396e-05, "loss": 0.4492, "num_input_tokens_seen": 18812128, "step": 15510 }, { "epoch": 1.727920703864573, "grad_norm": 0.19907566905021667, "learning_rate": 4.9197541042803194e-05, "loss": 0.4579, "num_input_tokens_seen": 18818112, "step": 15515 }, { "epoch": 1.7284775587481902, "grad_norm": 0.22513103485107422, "learning_rate": 4.9196319257131787e-05, "loss": 0.4716, "num_input_tokens_seen": 18824320, "step": 15520 }, { "epoch": 1.7290344136318074, "grad_norm": 0.1638897806406021, "learning_rate": 4.91950965572459e-05, "loss": 0.4865, "num_input_tokens_seen": 18830560, "step": 15525 }, { "epoch": 1.7295912685154249, "grad_norm": 0.2190469652414322, "learning_rate": 4.9193872943191726e-05, "loss": 0.4624, "num_input_tokens_seen": 18836512, "step": 15530 }, { "epoch": 1.7301481233990423, "grad_norm": 0.1912955939769745, "learning_rate": 4.91926484150155e-05, "loss": 0.471, "num_input_tokens_seen": 18842656, "step": 15535 }, { "epoch": 1.7307049782826596, "grad_norm": 0.24889777600765228, "learning_rate": 4.919142297276349e-05, "loss": 0.4459, "num_input_tokens_seen": 18848128, "step": 15540 }, { "epoch": 1.7312618331662768, "grad_norm": 0.2347508817911148, "learning_rate": 4.919019661648199e-05, "loss": 0.4726, "num_input_tokens_seen": 18854368, "step": 15545 }, { "epoch": 1.7318186880498943, "grad_norm": 0.2066151648759842, "learning_rate": 4.918896934621734e-05, "loss": 0.4869, "num_input_tokens_seen": 18860672, "step": 15550 }, { "epoch": 1.7323755429335115, "grad_norm": 0.224335178732872, "learning_rate": 4.918774116201592e-05, "loss": 0.4908, "num_input_tokens_seen": 18867104, "step": 15555 }, { "epoch": 1.732932397817129, "grad_norm": 0.19555425643920898, "learning_rate": 4.918651206392412e-05, "loss": 0.4693, "num_input_tokens_seen": 18872768, "step": 15560 }, { "epoch": 1.7334892527007462, "grad_norm": 0.17414313554763794, "learning_rate": 4.918528205198838e-05, "loss": 0.468, "num_input_tokens_seen": 18878496, "step": 15565 }, { "epoch": 1.7340461075843634, "grad_norm": 0.15564733743667603, "learning_rate": 4.918405112625518e-05, "loss": 0.4615, "num_input_tokens_seen": 18884448, "step": 15570 }, { "epoch": 1.7346029624679808, "grad_norm": 0.1626387983560562, "learning_rate": 4.918281928677103e-05, "loss": 0.4567, "num_input_tokens_seen": 18889760, "step": 15575 }, { "epoch": 1.7351598173515983, "grad_norm": 0.19058580696582794, "learning_rate": 4.918158653358247e-05, "loss": 0.4716, "num_input_tokens_seen": 18896000, "step": 15580 }, { "epoch": 1.7357166722352155, "grad_norm": 0.16014838218688965, "learning_rate": 4.918035286673608e-05, "loss": 0.455, "num_input_tokens_seen": 18901984, "step": 15585 }, { "epoch": 1.7362735271188328, "grad_norm": 0.21741683781147003, "learning_rate": 4.917911828627846e-05, "loss": 0.4601, "num_input_tokens_seen": 18907968, "step": 15590 }, { "epoch": 1.7368303820024502, "grad_norm": 0.16638490557670593, "learning_rate": 4.917788279225627e-05, "loss": 0.4626, "num_input_tokens_seen": 18914208, "step": 15595 }, { "epoch": 1.7373872368860674, "grad_norm": 0.24917230010032654, "learning_rate": 4.917664638471618e-05, "loss": 0.4628, "num_input_tokens_seen": 18920480, "step": 15600 }, { "epoch": 1.737944091769685, "grad_norm": 0.13598482310771942, "learning_rate": 4.917540906370492e-05, "loss": 0.4725, "num_input_tokens_seen": 18926208, "step": 15605 }, { "epoch": 1.7385009466533021, "grad_norm": 0.19429484009742737, "learning_rate": 4.9174170829269235e-05, "loss": 0.4612, "num_input_tokens_seen": 18932352, "step": 15610 }, { "epoch": 1.7390578015369194, "grad_norm": 0.18084682524204254, "learning_rate": 4.9172931681455905e-05, "loss": 0.4684, "num_input_tokens_seen": 18938368, "step": 15615 }, { "epoch": 1.7396146564205368, "grad_norm": 0.17024849355220795, "learning_rate": 4.917169162031174e-05, "loss": 0.457, "num_input_tokens_seen": 18944608, "step": 15620 }, { "epoch": 1.7401715113041543, "grad_norm": 0.22537748515605927, "learning_rate": 4.917045064588361e-05, "loss": 0.4759, "num_input_tokens_seen": 18950496, "step": 15625 }, { "epoch": 1.7407283661877715, "grad_norm": 0.20462515950202942, "learning_rate": 4.91692087582184e-05, "loss": 0.4695, "num_input_tokens_seen": 18956608, "step": 15630 }, { "epoch": 1.7412852210713887, "grad_norm": 0.16564802825450897, "learning_rate": 4.916796595736302e-05, "loss": 0.4626, "num_input_tokens_seen": 18962592, "step": 15635 }, { "epoch": 1.7418420759550062, "grad_norm": 0.1499023139476776, "learning_rate": 4.916672224336444e-05, "loss": 0.4805, "num_input_tokens_seen": 18968768, "step": 15640 }, { "epoch": 1.7423989308386234, "grad_norm": 0.20704276859760284, "learning_rate": 4.916547761626965e-05, "loss": 0.4645, "num_input_tokens_seen": 18974880, "step": 15645 }, { "epoch": 1.7429557857222409, "grad_norm": 0.242214635014534, "learning_rate": 4.916423207612567e-05, "loss": 0.4791, "num_input_tokens_seen": 18980896, "step": 15650 }, { "epoch": 1.743512640605858, "grad_norm": 0.2660747766494751, "learning_rate": 4.9162985622979576e-05, "loss": 0.4782, "num_input_tokens_seen": 18986848, "step": 15655 }, { "epoch": 1.7440694954894753, "grad_norm": 0.16351793706417084, "learning_rate": 4.916173825687844e-05, "loss": 0.4711, "num_input_tokens_seen": 18993024, "step": 15660 }, { "epoch": 1.7446263503730928, "grad_norm": 0.26001471281051636, "learning_rate": 4.9160489977869405e-05, "loss": 0.4935, "num_input_tokens_seen": 18999296, "step": 15665 }, { "epoch": 1.7451832052567102, "grad_norm": 0.15654651820659637, "learning_rate": 4.915924078599963e-05, "loss": 0.4762, "num_input_tokens_seen": 19005408, "step": 15670 }, { "epoch": 1.7457400601403275, "grad_norm": 0.15293197333812714, "learning_rate": 4.915799068131631e-05, "loss": 0.4648, "num_input_tokens_seen": 19011552, "step": 15675 }, { "epoch": 1.7462969150239447, "grad_norm": 0.19041100144386292, "learning_rate": 4.9156739663866704e-05, "loss": 0.4553, "num_input_tokens_seen": 19017504, "step": 15680 }, { "epoch": 1.7468537699075621, "grad_norm": 0.13035008311271667, "learning_rate": 4.915548773369804e-05, "loss": 0.4627, "num_input_tokens_seen": 19023648, "step": 15685 }, { "epoch": 1.7474106247911794, "grad_norm": 0.1463550627231598, "learning_rate": 4.915423489085765e-05, "loss": 0.4498, "num_input_tokens_seen": 19029600, "step": 15690 }, { "epoch": 1.7479674796747968, "grad_norm": 0.17361001670360565, "learning_rate": 4.915298113539285e-05, "loss": 0.4744, "num_input_tokens_seen": 19035744, "step": 15695 }, { "epoch": 1.748524334558414, "grad_norm": 0.19570061564445496, "learning_rate": 4.9151726467351035e-05, "loss": 0.4742, "num_input_tokens_seen": 19042144, "step": 15700 }, { "epoch": 1.7490811894420313, "grad_norm": 0.1733146607875824, "learning_rate": 4.915047088677959e-05, "loss": 0.4457, "num_input_tokens_seen": 19048160, "step": 15705 }, { "epoch": 1.7496380443256487, "grad_norm": 0.18439283967018127, "learning_rate": 4.914921439372595e-05, "loss": 0.4737, "num_input_tokens_seen": 19054272, "step": 15710 }, { "epoch": 1.7501948992092662, "grad_norm": 0.16472363471984863, "learning_rate": 4.9147956988237606e-05, "loss": 0.4799, "num_input_tokens_seen": 19060800, "step": 15715 }, { "epoch": 1.7507517540928834, "grad_norm": 0.165554016828537, "learning_rate": 4.914669867036207e-05, "loss": 0.4662, "num_input_tokens_seen": 19066464, "step": 15720 }, { "epoch": 1.7513086089765006, "grad_norm": 0.1677890121936798, "learning_rate": 4.914543944014687e-05, "loss": 0.4474, "num_input_tokens_seen": 19072608, "step": 15725 }, { "epoch": 1.751865463860118, "grad_norm": 0.21826300024986267, "learning_rate": 4.914417929763959e-05, "loss": 0.449, "num_input_tokens_seen": 19078816, "step": 15730 }, { "epoch": 1.7524223187437353, "grad_norm": 0.18463672697544098, "learning_rate": 4.9142918242887845e-05, "loss": 0.4497, "num_input_tokens_seen": 19084960, "step": 15735 }, { "epoch": 1.7529791736273528, "grad_norm": 0.1751444786787033, "learning_rate": 4.9141656275939276e-05, "loss": 0.4801, "num_input_tokens_seen": 19090912, "step": 15740 }, { "epoch": 1.75353602851097, "grad_norm": 0.16504666209220886, "learning_rate": 4.9140393396841565e-05, "loss": 0.4743, "num_input_tokens_seen": 19096704, "step": 15745 }, { "epoch": 1.7540928833945872, "grad_norm": 0.11433892697095871, "learning_rate": 4.913912960564244e-05, "loss": 0.4528, "num_input_tokens_seen": 19102912, "step": 15750 }, { "epoch": 1.7546497382782047, "grad_norm": 0.18066293001174927, "learning_rate": 4.913786490238963e-05, "loss": 0.4541, "num_input_tokens_seen": 19109024, "step": 15755 }, { "epoch": 1.7552065931618221, "grad_norm": 0.2073470503091812, "learning_rate": 4.913659928713094e-05, "loss": 0.4572, "num_input_tokens_seen": 19115040, "step": 15760 }, { "epoch": 1.7557634480454394, "grad_norm": 0.19017967581748962, "learning_rate": 4.913533275991417e-05, "loss": 0.475, "num_input_tokens_seen": 19121120, "step": 15765 }, { "epoch": 1.7563203029290566, "grad_norm": 0.18777015805244446, "learning_rate": 4.9134065320787185e-05, "loss": 0.4847, "num_input_tokens_seen": 19127104, "step": 15770 }, { "epoch": 1.756877157812674, "grad_norm": 0.19413432478904724, "learning_rate": 4.913279696979787e-05, "loss": 0.4793, "num_input_tokens_seen": 19133216, "step": 15775 }, { "epoch": 1.7574340126962913, "grad_norm": 0.22719748318195343, "learning_rate": 4.913152770699415e-05, "loss": 0.4683, "num_input_tokens_seen": 19139296, "step": 15780 }, { "epoch": 1.7579908675799087, "grad_norm": 0.19243548810482025, "learning_rate": 4.913025753242399e-05, "loss": 0.4571, "num_input_tokens_seen": 19145344, "step": 15785 }, { "epoch": 1.758547722463526, "grad_norm": 0.15475913882255554, "learning_rate": 4.9128986446135355e-05, "loss": 0.4603, "num_input_tokens_seen": 19151424, "step": 15790 }, { "epoch": 1.7591045773471432, "grad_norm": 0.281124472618103, "learning_rate": 4.91277144481763e-05, "loss": 0.4923, "num_input_tokens_seen": 19156928, "step": 15795 }, { "epoch": 1.7596614322307607, "grad_norm": 0.17311133444309235, "learning_rate": 4.912644153859486e-05, "loss": 0.4626, "num_input_tokens_seen": 19163040, "step": 15800 }, { "epoch": 1.760218287114378, "grad_norm": 0.14221371710300446, "learning_rate": 4.912516771743915e-05, "loss": 0.4711, "num_input_tokens_seen": 19169280, "step": 15805 }, { "epoch": 1.7607751419979953, "grad_norm": 0.19763629138469696, "learning_rate": 4.912389298475728e-05, "loss": 0.4546, "num_input_tokens_seen": 19175456, "step": 15810 }, { "epoch": 1.7613319968816126, "grad_norm": 0.13903136551380157, "learning_rate": 4.912261734059744e-05, "loss": 0.4732, "num_input_tokens_seen": 19181792, "step": 15815 }, { "epoch": 1.76188885176523, "grad_norm": 0.1699465960264206, "learning_rate": 4.9121340785007804e-05, "loss": 0.4756, "num_input_tokens_seen": 19188000, "step": 15820 }, { "epoch": 1.7624457066488475, "grad_norm": 0.1617504358291626, "learning_rate": 4.9120063318036616e-05, "loss": 0.4508, "num_input_tokens_seen": 19194240, "step": 15825 }, { "epoch": 1.7630025615324647, "grad_norm": 0.1412741243839264, "learning_rate": 4.911878493973213e-05, "loss": 0.4685, "num_input_tokens_seen": 19200512, "step": 15830 }, { "epoch": 1.763559416416082, "grad_norm": 0.16964834928512573, "learning_rate": 4.9117505650142665e-05, "loss": 0.4647, "num_input_tokens_seen": 19206816, "step": 15835 }, { "epoch": 1.7641162712996992, "grad_norm": 0.14077773690223694, "learning_rate": 4.9116225449316546e-05, "loss": 0.4679, "num_input_tokens_seen": 19212672, "step": 15840 }, { "epoch": 1.7646731261833166, "grad_norm": 0.12115895003080368, "learning_rate": 4.911494433730215e-05, "loss": 0.45, "num_input_tokens_seen": 19218784, "step": 15845 }, { "epoch": 1.765229981066934, "grad_norm": 0.18258033692836761, "learning_rate": 4.911366231414787e-05, "loss": 0.4583, "num_input_tokens_seen": 19224896, "step": 15850 }, { "epoch": 1.7657868359505513, "grad_norm": 0.15440919995307922, "learning_rate": 4.9112379379902154e-05, "loss": 0.4734, "num_input_tokens_seen": 19231072, "step": 15855 }, { "epoch": 1.7663436908341685, "grad_norm": 0.19584955275058746, "learning_rate": 4.911109553461347e-05, "loss": 0.4893, "num_input_tokens_seen": 19236992, "step": 15860 }, { "epoch": 1.766900545717786, "grad_norm": 0.1680433601140976, "learning_rate": 4.910981077833033e-05, "loss": 0.4825, "num_input_tokens_seen": 19242624, "step": 15865 }, { "epoch": 1.7674574006014034, "grad_norm": 0.20425944030284882, "learning_rate": 4.910852511110128e-05, "loss": 0.4887, "num_input_tokens_seen": 19248768, "step": 15870 }, { "epoch": 1.7680142554850207, "grad_norm": 0.21804191172122955, "learning_rate": 4.910723853297489e-05, "loss": 0.4711, "num_input_tokens_seen": 19254528, "step": 15875 }, { "epoch": 1.768571110368638, "grad_norm": 0.15446840226650238, "learning_rate": 4.910595104399976e-05, "loss": 0.4658, "num_input_tokens_seen": 19260576, "step": 15880 }, { "epoch": 1.7691279652522551, "grad_norm": 0.2574777901172638, "learning_rate": 4.910466264422457e-05, "loss": 0.4708, "num_input_tokens_seen": 19266688, "step": 15885 }, { "epoch": 1.7696848201358726, "grad_norm": 0.2281462699174881, "learning_rate": 4.910337333369797e-05, "loss": 0.4602, "num_input_tokens_seen": 19272512, "step": 15890 }, { "epoch": 1.77024167501949, "grad_norm": 0.1967514008283615, "learning_rate": 4.9102083112468674e-05, "loss": 0.4687, "num_input_tokens_seen": 19278592, "step": 15895 }, { "epoch": 1.7707985299031073, "grad_norm": 0.21247601509094238, "learning_rate": 4.910079198058545e-05, "loss": 0.4799, "num_input_tokens_seen": 19284448, "step": 15900 }, { "epoch": 1.7713553847867245, "grad_norm": 0.16767260432243347, "learning_rate": 4.9099499938097057e-05, "loss": 0.4495, "num_input_tokens_seen": 19290656, "step": 15905 }, { "epoch": 1.771912239670342, "grad_norm": 0.16559675335884094, "learning_rate": 4.9098206985052345e-05, "loss": 0.4765, "num_input_tokens_seen": 19296736, "step": 15910 }, { "epoch": 1.7724690945539594, "grad_norm": 0.14665596187114716, "learning_rate": 4.909691312150013e-05, "loss": 0.4733, "num_input_tokens_seen": 19302944, "step": 15915 }, { "epoch": 1.7730259494375766, "grad_norm": 0.17182599008083344, "learning_rate": 4.9095618347489324e-05, "loss": 0.4817, "num_input_tokens_seen": 19309216, "step": 15920 }, { "epoch": 1.7735828043211939, "grad_norm": 0.18737588822841644, "learning_rate": 4.909432266306884e-05, "loss": 0.4592, "num_input_tokens_seen": 19315584, "step": 15925 }, { "epoch": 1.774139659204811, "grad_norm": 0.20984618365764618, "learning_rate": 4.909302606828762e-05, "loss": 0.4655, "num_input_tokens_seen": 19322144, "step": 15930 }, { "epoch": 1.7746965140884285, "grad_norm": 0.16997328400611877, "learning_rate": 4.909172856319467e-05, "loss": 0.4818, "num_input_tokens_seen": 19328320, "step": 15935 }, { "epoch": 1.775253368972046, "grad_norm": 0.14983493089675903, "learning_rate": 4.909043014783902e-05, "loss": 0.4823, "num_input_tokens_seen": 19334208, "step": 15940 }, { "epoch": 1.7758102238556632, "grad_norm": 0.1784917116165161, "learning_rate": 4.90891308222697e-05, "loss": 0.4591, "num_input_tokens_seen": 19340512, "step": 15945 }, { "epoch": 1.7763670787392805, "grad_norm": 0.13991616666316986, "learning_rate": 4.9087830586535835e-05, "loss": 0.4635, "num_input_tokens_seen": 19346848, "step": 15950 }, { "epoch": 1.776923933622898, "grad_norm": 0.17938289046287537, "learning_rate": 4.908652944068653e-05, "loss": 0.4535, "num_input_tokens_seen": 19353216, "step": 15955 }, { "epoch": 1.7774807885065154, "grad_norm": 0.22170424461364746, "learning_rate": 4.908522738477095e-05, "loss": 0.4648, "num_input_tokens_seen": 19359232, "step": 15960 }, { "epoch": 1.7780376433901326, "grad_norm": 0.1952931135892868, "learning_rate": 4.90839244188383e-05, "loss": 0.4635, "num_input_tokens_seen": 19365376, "step": 15965 }, { "epoch": 1.7785944982737498, "grad_norm": 0.1699526607990265, "learning_rate": 4.908262054293781e-05, "loss": 0.4585, "num_input_tokens_seen": 19371744, "step": 15970 }, { "epoch": 1.779151353157367, "grad_norm": 0.14942622184753418, "learning_rate": 4.908131575711873e-05, "loss": 0.4412, "num_input_tokens_seen": 19377792, "step": 15975 }, { "epoch": 1.7797082080409845, "grad_norm": 0.20056715607643127, "learning_rate": 4.908001006143037e-05, "loss": 0.465, "num_input_tokens_seen": 19383840, "step": 15980 }, { "epoch": 1.780265062924602, "grad_norm": 0.1428048312664032, "learning_rate": 4.9078703455922074e-05, "loss": 0.4569, "num_input_tokens_seen": 19390016, "step": 15985 }, { "epoch": 1.7808219178082192, "grad_norm": 0.16048568487167358, "learning_rate": 4.907739594064318e-05, "loss": 0.469, "num_input_tokens_seen": 19396384, "step": 15990 }, { "epoch": 1.7813787726918364, "grad_norm": 0.1672116070985794, "learning_rate": 4.907608751564312e-05, "loss": 0.4831, "num_input_tokens_seen": 19402400, "step": 15995 }, { "epoch": 1.7819356275754539, "grad_norm": 0.1419568657875061, "learning_rate": 4.9074778180971306e-05, "loss": 0.4949, "num_input_tokens_seen": 19408064, "step": 16000 }, { "epoch": 1.7824924824590713, "grad_norm": 0.24570001661777496, "learning_rate": 4.907346793667723e-05, "loss": 0.4705, "num_input_tokens_seen": 19414560, "step": 16005 }, { "epoch": 1.7830493373426886, "grad_norm": 0.16415007412433624, "learning_rate": 4.907215678281039e-05, "loss": 0.4726, "num_input_tokens_seen": 19420704, "step": 16010 }, { "epoch": 1.7836061922263058, "grad_norm": 0.27075207233428955, "learning_rate": 4.907084471942032e-05, "loss": 0.4754, "num_input_tokens_seen": 19426752, "step": 16015 }, { "epoch": 1.784163047109923, "grad_norm": 0.2085830420255661, "learning_rate": 4.906953174655659e-05, "loss": 0.4773, "num_input_tokens_seen": 19433024, "step": 16020 }, { "epoch": 1.7847199019935405, "grad_norm": 0.15245258808135986, "learning_rate": 4.906821786426882e-05, "loss": 0.4686, "num_input_tokens_seen": 19439680, "step": 16025 }, { "epoch": 1.785276756877158, "grad_norm": 0.18878984451293945, "learning_rate": 4.906690307260666e-05, "loss": 0.4657, "num_input_tokens_seen": 19445824, "step": 16030 }, { "epoch": 1.7858336117607752, "grad_norm": 0.14914895594120026, "learning_rate": 4.9065587371619766e-05, "loss": 0.4696, "num_input_tokens_seen": 19452064, "step": 16035 }, { "epoch": 1.7863904666443924, "grad_norm": 0.14935919642448425, "learning_rate": 4.906427076135786e-05, "loss": 0.4681, "num_input_tokens_seen": 19458144, "step": 16040 }, { "epoch": 1.7869473215280098, "grad_norm": 0.1460181325674057, "learning_rate": 4.9062953241870684e-05, "loss": 0.4746, "num_input_tokens_seen": 19464064, "step": 16045 }, { "epoch": 1.7875041764116273, "grad_norm": 0.19328153133392334, "learning_rate": 4.9061634813208026e-05, "loss": 0.4677, "num_input_tokens_seen": 19470080, "step": 16050 }, { "epoch": 1.7880610312952445, "grad_norm": 0.24347251653671265, "learning_rate": 4.9060315475419694e-05, "loss": 0.4717, "num_input_tokens_seen": 19476192, "step": 16055 }, { "epoch": 1.7886178861788617, "grad_norm": 0.16975168883800507, "learning_rate": 4.9058995228555536e-05, "loss": 0.471, "num_input_tokens_seen": 19482464, "step": 16060 }, { "epoch": 1.789174741062479, "grad_norm": 0.20957525074481964, "learning_rate": 4.905767407266544e-05, "loss": 0.4733, "num_input_tokens_seen": 19488768, "step": 16065 }, { "epoch": 1.7897315959460964, "grad_norm": 0.18528151512145996, "learning_rate": 4.905635200779932e-05, "loss": 0.4751, "num_input_tokens_seen": 19495200, "step": 16070 }, { "epoch": 1.7902884508297139, "grad_norm": 0.16164708137512207, "learning_rate": 4.9055029034007136e-05, "loss": 0.4715, "num_input_tokens_seen": 19501216, "step": 16075 }, { "epoch": 1.7908453057133311, "grad_norm": 0.16974695026874542, "learning_rate": 4.9053705151338856e-05, "loss": 0.466, "num_input_tokens_seen": 19506688, "step": 16080 }, { "epoch": 1.7914021605969483, "grad_norm": 0.1858818233013153, "learning_rate": 4.9052380359844516e-05, "loss": 0.4751, "num_input_tokens_seen": 19512672, "step": 16085 }, { "epoch": 1.7919590154805658, "grad_norm": 0.19669878482818604, "learning_rate": 4.905105465957417e-05, "loss": 0.4642, "num_input_tokens_seen": 19518944, "step": 16090 }, { "epoch": 1.7925158703641833, "grad_norm": 0.22818206250667572, "learning_rate": 4.90497280505779e-05, "loss": 0.4651, "num_input_tokens_seen": 19525408, "step": 16095 }, { "epoch": 1.7930727252478005, "grad_norm": 0.1696530431509018, "learning_rate": 4.904840053290583e-05, "loss": 0.4659, "num_input_tokens_seen": 19531456, "step": 16100 }, { "epoch": 1.7936295801314177, "grad_norm": 0.17303642630577087, "learning_rate": 4.9047072106608126e-05, "loss": 0.4587, "num_input_tokens_seen": 19537824, "step": 16105 }, { "epoch": 1.794186435015035, "grad_norm": 0.2046930491924286, "learning_rate": 4.904574277173497e-05, "loss": 0.4609, "num_input_tokens_seen": 19543744, "step": 16110 }, { "epoch": 1.7947432898986524, "grad_norm": 0.1969791203737259, "learning_rate": 4.90444125283366e-05, "loss": 0.4727, "num_input_tokens_seen": 19549824, "step": 16115 }, { "epoch": 1.7953001447822698, "grad_norm": 0.18593481183052063, "learning_rate": 4.9043081376463264e-05, "loss": 0.4482, "num_input_tokens_seen": 19555648, "step": 16120 }, { "epoch": 1.795856999665887, "grad_norm": 0.15215829014778137, "learning_rate": 4.904174931616527e-05, "loss": 0.4592, "num_input_tokens_seen": 19561952, "step": 16125 }, { "epoch": 1.7964138545495043, "grad_norm": 0.1732311248779297, "learning_rate": 4.904041634749293e-05, "loss": 0.4614, "num_input_tokens_seen": 19568160, "step": 16130 }, { "epoch": 1.7969707094331218, "grad_norm": 0.23150734603405, "learning_rate": 4.9039082470496634e-05, "loss": 0.4785, "num_input_tokens_seen": 19574240, "step": 16135 }, { "epoch": 1.7975275643167392, "grad_norm": 0.19169160723686218, "learning_rate": 4.903774768522676e-05, "loss": 0.4641, "num_input_tokens_seen": 19580256, "step": 16140 }, { "epoch": 1.7980844192003564, "grad_norm": 0.13992738723754883, "learning_rate": 4.903641199173374e-05, "loss": 0.47, "num_input_tokens_seen": 19586176, "step": 16145 }, { "epoch": 1.7986412740839737, "grad_norm": 0.14330491423606873, "learning_rate": 4.903507539006805e-05, "loss": 0.468, "num_input_tokens_seen": 19592000, "step": 16150 }, { "epoch": 1.799198128967591, "grad_norm": 0.18874357640743256, "learning_rate": 4.9033737880280196e-05, "loss": 0.4722, "num_input_tokens_seen": 19597696, "step": 16155 }, { "epoch": 1.7997549838512084, "grad_norm": 0.219094917178154, "learning_rate": 4.9032399462420696e-05, "loss": 0.4492, "num_input_tokens_seen": 19603584, "step": 16160 }, { "epoch": 1.8003118387348258, "grad_norm": 0.19722828269004822, "learning_rate": 4.903106013654013e-05, "loss": 0.471, "num_input_tokens_seen": 19609472, "step": 16165 }, { "epoch": 1.800868693618443, "grad_norm": 0.2466457039117813, "learning_rate": 4.90297199026891e-05, "loss": 0.447, "num_input_tokens_seen": 19615424, "step": 16170 }, { "epoch": 1.8014255485020603, "grad_norm": 0.194759801030159, "learning_rate": 4.9028378760918244e-05, "loss": 0.4609, "num_input_tokens_seen": 19621760, "step": 16175 }, { "epoch": 1.8019824033856777, "grad_norm": 0.18790185451507568, "learning_rate": 4.902703671127824e-05, "loss": 0.4755, "num_input_tokens_seen": 19627584, "step": 16180 }, { "epoch": 1.8025392582692952, "grad_norm": 0.18099075555801392, "learning_rate": 4.902569375381979e-05, "loss": 0.489, "num_input_tokens_seen": 19633408, "step": 16185 }, { "epoch": 1.8030961131529124, "grad_norm": 0.1720055490732193, "learning_rate": 4.9024349888593633e-05, "loss": 0.4802, "num_input_tokens_seen": 19639552, "step": 16190 }, { "epoch": 1.8036529680365296, "grad_norm": 0.19418933987617493, "learning_rate": 4.902300511565055e-05, "loss": 0.4603, "num_input_tokens_seen": 19645280, "step": 16195 }, { "epoch": 1.8042098229201469, "grad_norm": 0.19514231383800507, "learning_rate": 4.9021659435041346e-05, "loss": 0.472, "num_input_tokens_seen": 19651040, "step": 16200 }, { "epoch": 1.8047666778037643, "grad_norm": 0.1490158587694168, "learning_rate": 4.902031284681687e-05, "loss": 0.4726, "num_input_tokens_seen": 19657024, "step": 16205 }, { "epoch": 1.8053235326873818, "grad_norm": 0.246254563331604, "learning_rate": 4.901896535102799e-05, "loss": 0.4583, "num_input_tokens_seen": 19663104, "step": 16210 }, { "epoch": 1.805880387570999, "grad_norm": 0.2139839082956314, "learning_rate": 4.9017616947725636e-05, "loss": 0.4734, "num_input_tokens_seen": 19668960, "step": 16215 }, { "epoch": 1.8064372424546162, "grad_norm": 0.16467437148094177, "learning_rate": 4.901626763696074e-05, "loss": 0.4461, "num_input_tokens_seen": 19675552, "step": 16220 }, { "epoch": 1.8069940973382337, "grad_norm": 0.1525878757238388, "learning_rate": 4.901491741878429e-05, "loss": 0.4566, "num_input_tokens_seen": 19681440, "step": 16225 }, { "epoch": 1.8075509522218511, "grad_norm": 0.16870400309562683, "learning_rate": 4.901356629324731e-05, "loss": 0.4709, "num_input_tokens_seen": 19687616, "step": 16230 }, { "epoch": 1.8081078071054684, "grad_norm": 0.17180316150188446, "learning_rate": 4.901221426040082e-05, "loss": 0.4661, "num_input_tokens_seen": 19693664, "step": 16235 }, { "epoch": 1.8086646619890856, "grad_norm": 0.26798710227012634, "learning_rate": 4.901086132029594e-05, "loss": 0.4807, "num_input_tokens_seen": 19699488, "step": 16240 }, { "epoch": 1.8092215168727028, "grad_norm": 0.17143471539020538, "learning_rate": 4.900950747298377e-05, "loss": 0.4544, "num_input_tokens_seen": 19706080, "step": 16245 }, { "epoch": 1.8097783717563203, "grad_norm": 0.18439781665802002, "learning_rate": 4.900815271851547e-05, "loss": 0.4696, "num_input_tokens_seen": 19712256, "step": 16250 }, { "epoch": 1.8103352266399377, "grad_norm": 0.15736325085163116, "learning_rate": 4.900679705694221e-05, "loss": 0.4545, "num_input_tokens_seen": 19718112, "step": 16255 }, { "epoch": 1.810892081523555, "grad_norm": 0.1716102659702301, "learning_rate": 4.900544048831523e-05, "loss": 0.4689, "num_input_tokens_seen": 19724672, "step": 16260 }, { "epoch": 1.8114489364071722, "grad_norm": 0.18095910549163818, "learning_rate": 4.9004083012685776e-05, "loss": 0.4644, "num_input_tokens_seen": 19730624, "step": 16265 }, { "epoch": 1.8120057912907896, "grad_norm": 0.2667500376701355, "learning_rate": 4.900272463010514e-05, "loss": 0.4623, "num_input_tokens_seen": 19737056, "step": 16270 }, { "epoch": 1.812562646174407, "grad_norm": 0.1561530977487564, "learning_rate": 4.9001365340624656e-05, "loss": 0.4813, "num_input_tokens_seen": 19743296, "step": 16275 }, { "epoch": 1.8131195010580243, "grad_norm": 0.16003914177417755, "learning_rate": 4.9000005144295666e-05, "loss": 0.4619, "num_input_tokens_seen": 19749408, "step": 16280 }, { "epoch": 1.8136763559416416, "grad_norm": 0.3014861047267914, "learning_rate": 4.899864404116957e-05, "loss": 0.4648, "num_input_tokens_seen": 19755776, "step": 16285 }, { "epoch": 1.8142332108252588, "grad_norm": 0.1573927253484726, "learning_rate": 4.8997282031297796e-05, "loss": 0.4661, "num_input_tokens_seen": 19761984, "step": 16290 }, { "epoch": 1.8147900657088762, "grad_norm": 0.19654537737369537, "learning_rate": 4.899591911473181e-05, "loss": 0.4744, "num_input_tokens_seen": 19768128, "step": 16295 }, { "epoch": 1.8153469205924937, "grad_norm": 0.14311614632606506, "learning_rate": 4.899455529152309e-05, "loss": 0.4403, "num_input_tokens_seen": 19774144, "step": 16300 }, { "epoch": 1.815903775476111, "grad_norm": 0.15687507390975952, "learning_rate": 4.899319056172319e-05, "loss": 0.4607, "num_input_tokens_seen": 19780448, "step": 16305 }, { "epoch": 1.8164606303597282, "grad_norm": 0.13641899824142456, "learning_rate": 4.899182492538365e-05, "loss": 0.4671, "num_input_tokens_seen": 19785824, "step": 16310 }, { "epoch": 1.8170174852433456, "grad_norm": 0.13882121443748474, "learning_rate": 4.8990458382556084e-05, "loss": 0.4774, "num_input_tokens_seen": 19791904, "step": 16315 }, { "epoch": 1.817574340126963, "grad_norm": 0.14884957671165466, "learning_rate": 4.8989090933292114e-05, "loss": 0.4857, "num_input_tokens_seen": 19798144, "step": 16320 }, { "epoch": 1.8181311950105803, "grad_norm": 0.1801171451807022, "learning_rate": 4.8987722577643414e-05, "loss": 0.4738, "num_input_tokens_seen": 19804416, "step": 16325 }, { "epoch": 1.8186880498941975, "grad_norm": 0.1992015540599823, "learning_rate": 4.898635331566168e-05, "loss": 0.4714, "num_input_tokens_seen": 19810528, "step": 16330 }, { "epoch": 1.8192449047778148, "grad_norm": 0.17337501049041748, "learning_rate": 4.8984983147398654e-05, "loss": 0.4685, "num_input_tokens_seen": 19816480, "step": 16335 }, { "epoch": 1.8198017596614322, "grad_norm": 0.15952014923095703, "learning_rate": 4.898361207290611e-05, "loss": 0.4617, "num_input_tokens_seen": 19822624, "step": 16340 }, { "epoch": 1.8203586145450497, "grad_norm": 0.20065917074680328, "learning_rate": 4.8982240092235826e-05, "loss": 0.4765, "num_input_tokens_seen": 19828736, "step": 16345 }, { "epoch": 1.8209154694286669, "grad_norm": 0.17297273874282837, "learning_rate": 4.898086720543966e-05, "loss": 0.4625, "num_input_tokens_seen": 19834880, "step": 16350 }, { "epoch": 1.8214723243122841, "grad_norm": 0.16870608925819397, "learning_rate": 4.897949341256949e-05, "loss": 0.4803, "num_input_tokens_seen": 19841216, "step": 16355 }, { "epoch": 1.8220291791959016, "grad_norm": 0.1439051330089569, "learning_rate": 4.89781187136772e-05, "loss": 0.4599, "num_input_tokens_seen": 19847072, "step": 16360 }, { "epoch": 1.822586034079519, "grad_norm": 0.2244337648153305, "learning_rate": 4.897674310881475e-05, "loss": 0.4587, "num_input_tokens_seen": 19853184, "step": 16365 }, { "epoch": 1.8231428889631363, "grad_norm": 0.20225706696510315, "learning_rate": 4.897536659803411e-05, "loss": 0.4703, "num_input_tokens_seen": 19859264, "step": 16370 }, { "epoch": 1.8236997438467535, "grad_norm": 0.15008945763111115, "learning_rate": 4.897398918138729e-05, "loss": 0.4426, "num_input_tokens_seen": 19865312, "step": 16375 }, { "epoch": 1.8242565987303707, "grad_norm": 0.16939668357372284, "learning_rate": 4.897261085892632e-05, "loss": 0.4618, "num_input_tokens_seen": 19871232, "step": 16380 }, { "epoch": 1.8248134536139882, "grad_norm": 0.12060534209012985, "learning_rate": 4.8971231630703295e-05, "loss": 0.4797, "num_input_tokens_seen": 19877376, "step": 16385 }, { "epoch": 1.8253703084976056, "grad_norm": 0.2348928302526474, "learning_rate": 4.8969851496770315e-05, "loss": 0.4619, "num_input_tokens_seen": 19883552, "step": 16390 }, { "epoch": 1.8259271633812229, "grad_norm": 0.21764497458934784, "learning_rate": 4.896847045717953e-05, "loss": 0.4686, "num_input_tokens_seen": 19889152, "step": 16395 }, { "epoch": 1.82648401826484, "grad_norm": 0.2141309380531311, "learning_rate": 4.896708851198313e-05, "loss": 0.472, "num_input_tokens_seen": 19895200, "step": 16400 }, { "epoch": 1.8270408731484575, "grad_norm": 0.17726613581180573, "learning_rate": 4.896570566123331e-05, "loss": 0.4559, "num_input_tokens_seen": 19900992, "step": 16405 }, { "epoch": 1.827597728032075, "grad_norm": 0.16836079955101013, "learning_rate": 4.896432190498233e-05, "loss": 0.4826, "num_input_tokens_seen": 19907584, "step": 16410 }, { "epoch": 1.8281545829156922, "grad_norm": 0.23750141263008118, "learning_rate": 4.896293724328247e-05, "loss": 0.4645, "num_input_tokens_seen": 19914080, "step": 16415 }, { "epoch": 1.8287114377993094, "grad_norm": 0.1454615294933319, "learning_rate": 4.896155167618605e-05, "loss": 0.4739, "num_input_tokens_seen": 19919424, "step": 16420 }, { "epoch": 1.8292682926829267, "grad_norm": 0.17843928933143616, "learning_rate": 4.8960165203745424e-05, "loss": 0.4675, "num_input_tokens_seen": 19925824, "step": 16425 }, { "epoch": 1.8298251475665441, "grad_norm": 0.16479063034057617, "learning_rate": 4.8958777826012964e-05, "loss": 0.4689, "num_input_tokens_seen": 19931616, "step": 16430 }, { "epoch": 1.8303820024501616, "grad_norm": 0.1761099249124527, "learning_rate": 4.89573895430411e-05, "loss": 0.4597, "num_input_tokens_seen": 19937024, "step": 16435 }, { "epoch": 1.8309388573337788, "grad_norm": 0.2608475983142853, "learning_rate": 4.8956000354882285e-05, "loss": 0.4576, "num_input_tokens_seen": 19943200, "step": 16440 }, { "epoch": 1.831495712217396, "grad_norm": 0.1350005716085434, "learning_rate": 4.895461026158901e-05, "loss": 0.4561, "num_input_tokens_seen": 19949440, "step": 16445 }, { "epoch": 1.8320525671010135, "grad_norm": 0.19276027381420135, "learning_rate": 4.895321926321379e-05, "loss": 0.4718, "num_input_tokens_seen": 19955584, "step": 16450 }, { "epoch": 1.832609421984631, "grad_norm": 0.18321965634822845, "learning_rate": 4.8951827359809176e-05, "loss": 0.4447, "num_input_tokens_seen": 19961472, "step": 16455 }, { "epoch": 1.8331662768682482, "grad_norm": 0.17872752249240875, "learning_rate": 4.8950434551427784e-05, "loss": 0.4669, "num_input_tokens_seen": 19966912, "step": 16460 }, { "epoch": 1.8337231317518654, "grad_norm": 0.2032269835472107, "learning_rate": 4.8949040838122206e-05, "loss": 0.462, "num_input_tokens_seen": 19972800, "step": 16465 }, { "epoch": 1.8342799866354826, "grad_norm": 0.192551389336586, "learning_rate": 4.8947646219945127e-05, "loss": 0.4698, "num_input_tokens_seen": 19979104, "step": 16470 }, { "epoch": 1.8348368415191, "grad_norm": 0.29256290197372437, "learning_rate": 4.8946250696949225e-05, "loss": 0.4637, "num_input_tokens_seen": 19985280, "step": 16475 }, { "epoch": 1.8353936964027175, "grad_norm": 0.2807256877422333, "learning_rate": 4.8944854269187234e-05, "loss": 0.4745, "num_input_tokens_seen": 19991392, "step": 16480 }, { "epoch": 1.8359505512863348, "grad_norm": 0.20422421395778656, "learning_rate": 4.8943456936711906e-05, "loss": 0.4542, "num_input_tokens_seen": 19997632, "step": 16485 }, { "epoch": 1.836507406169952, "grad_norm": 0.21653595566749573, "learning_rate": 4.8942058699576057e-05, "loss": 0.472, "num_input_tokens_seen": 20003552, "step": 16490 }, { "epoch": 1.8370642610535695, "grad_norm": 0.20467185974121094, "learning_rate": 4.894065955783249e-05, "loss": 0.4618, "num_input_tokens_seen": 20009952, "step": 16495 }, { "epoch": 1.837621115937187, "grad_norm": 0.23970237374305725, "learning_rate": 4.8939259511534095e-05, "loss": 0.4747, "num_input_tokens_seen": 20016352, "step": 16500 }, { "epoch": 1.8381779708208041, "grad_norm": 0.16176725924015045, "learning_rate": 4.8937858560733754e-05, "loss": 0.4631, "num_input_tokens_seen": 20022656, "step": 16505 }, { "epoch": 1.8387348257044214, "grad_norm": 0.1554168164730072, "learning_rate": 4.8936456705484405e-05, "loss": 0.4422, "num_input_tokens_seen": 20028640, "step": 16510 }, { "epoch": 1.8392916805880386, "grad_norm": 0.17329145967960358, "learning_rate": 4.893505394583902e-05, "loss": 0.4665, "num_input_tokens_seen": 20034496, "step": 16515 }, { "epoch": 1.839848535471656, "grad_norm": 0.1686311960220337, "learning_rate": 4.8933650281850585e-05, "loss": 0.4642, "num_input_tokens_seen": 20040608, "step": 16520 }, { "epoch": 1.8404053903552735, "grad_norm": 0.2078564614057541, "learning_rate": 4.893224571357215e-05, "loss": 0.4733, "num_input_tokens_seen": 20046688, "step": 16525 }, { "epoch": 1.8409622452388907, "grad_norm": 0.21211084723472595, "learning_rate": 4.8930840241056766e-05, "loss": 0.4788, "num_input_tokens_seen": 20052864, "step": 16530 }, { "epoch": 1.841519100122508, "grad_norm": 0.23318427801132202, "learning_rate": 4.892943386435755e-05, "loss": 0.4701, "num_input_tokens_seen": 20059168, "step": 16535 }, { "epoch": 1.8420759550061254, "grad_norm": 0.16272547841072083, "learning_rate": 4.892802658352764e-05, "loss": 0.4622, "num_input_tokens_seen": 20065472, "step": 16540 }, { "epoch": 1.8426328098897429, "grad_norm": 0.12269481271505356, "learning_rate": 4.8926618398620206e-05, "loss": 0.4811, "num_input_tokens_seen": 20071520, "step": 16545 }, { "epoch": 1.84318966477336, "grad_norm": 0.18823476135730743, "learning_rate": 4.8925209309688454e-05, "loss": 0.4486, "num_input_tokens_seen": 20077920, "step": 16550 }, { "epoch": 1.8437465196569773, "grad_norm": 0.18615931272506714, "learning_rate": 4.892379931678562e-05, "loss": 0.4789, "num_input_tokens_seen": 20083776, "step": 16555 }, { "epoch": 1.8443033745405946, "grad_norm": 0.21268539130687714, "learning_rate": 4.8922388419964985e-05, "loss": 0.4867, "num_input_tokens_seen": 20090144, "step": 16560 }, { "epoch": 1.844860229424212, "grad_norm": 0.1752193421125412, "learning_rate": 4.8920976619279844e-05, "loss": 0.4538, "num_input_tokens_seen": 20096416, "step": 16565 }, { "epoch": 1.8454170843078295, "grad_norm": 0.20846687257289886, "learning_rate": 4.891956391478355e-05, "loss": 0.4679, "num_input_tokens_seen": 20102656, "step": 16570 }, { "epoch": 1.8459739391914467, "grad_norm": 0.19919675588607788, "learning_rate": 4.891815030652948e-05, "loss": 0.4779, "num_input_tokens_seen": 20108224, "step": 16575 }, { "epoch": 1.846530794075064, "grad_norm": 0.175550639629364, "learning_rate": 4.8916735794571034e-05, "loss": 0.4613, "num_input_tokens_seen": 20113952, "step": 16580 }, { "epoch": 1.8470876489586814, "grad_norm": 0.18456396460533142, "learning_rate": 4.891532037896167e-05, "loss": 0.4664, "num_input_tokens_seen": 20119936, "step": 16585 }, { "epoch": 1.8476445038422988, "grad_norm": 0.2998395264148712, "learning_rate": 4.8913904059754865e-05, "loss": 0.4754, "num_input_tokens_seen": 20125920, "step": 16590 }, { "epoch": 1.848201358725916, "grad_norm": 0.20559096336364746, "learning_rate": 4.891248683700413e-05, "loss": 0.453, "num_input_tokens_seen": 20131488, "step": 16595 }, { "epoch": 1.8487582136095333, "grad_norm": 0.28632214665412903, "learning_rate": 4.891106871076301e-05, "loss": 0.4582, "num_input_tokens_seen": 20137536, "step": 16600 }, { "epoch": 1.8493150684931505, "grad_norm": 0.21760541200637817, "learning_rate": 4.890964968108508e-05, "loss": 0.4583, "num_input_tokens_seen": 20143552, "step": 16605 }, { "epoch": 1.849871923376768, "grad_norm": 0.17231853306293488, "learning_rate": 4.890822974802397e-05, "loss": 0.4653, "num_input_tokens_seen": 20149792, "step": 16610 }, { "epoch": 1.8504287782603854, "grad_norm": 0.20288637280464172, "learning_rate": 4.890680891163332e-05, "loss": 0.4507, "num_input_tokens_seen": 20155264, "step": 16615 }, { "epoch": 1.8509856331440027, "grad_norm": 0.17860320210456848, "learning_rate": 4.8905387171966816e-05, "loss": 0.4634, "num_input_tokens_seen": 20161568, "step": 16620 }, { "epoch": 1.85154248802762, "grad_norm": 0.24880188703536987, "learning_rate": 4.890396452907817e-05, "loss": 0.4444, "num_input_tokens_seen": 20167648, "step": 16625 }, { "epoch": 1.8520993429112373, "grad_norm": 0.1748453825712204, "learning_rate": 4.890254098302115e-05, "loss": 0.467, "num_input_tokens_seen": 20173664, "step": 16630 }, { "epoch": 1.8526561977948548, "grad_norm": 0.16401071846485138, "learning_rate": 4.890111653384953e-05, "loss": 0.4621, "num_input_tokens_seen": 20179584, "step": 16635 }, { "epoch": 1.853213052678472, "grad_norm": 0.19178423285484314, "learning_rate": 4.8899691181617136e-05, "loss": 0.453, "num_input_tokens_seen": 20185632, "step": 16640 }, { "epoch": 1.8537699075620893, "grad_norm": 0.18656130135059357, "learning_rate": 4.889826492637781e-05, "loss": 0.4831, "num_input_tokens_seen": 20191712, "step": 16645 }, { "epoch": 1.8543267624457065, "grad_norm": 0.16593395173549652, "learning_rate": 4.889683776818545e-05, "loss": 0.4528, "num_input_tokens_seen": 20197920, "step": 16650 }, { "epoch": 1.854883617329324, "grad_norm": 0.2510479688644409, "learning_rate": 4.889540970709399e-05, "loss": 0.463, "num_input_tokens_seen": 20204032, "step": 16655 }, { "epoch": 1.8554404722129414, "grad_norm": 0.23104290664196014, "learning_rate": 4.889398074315736e-05, "loss": 0.461, "num_input_tokens_seen": 20210304, "step": 16660 }, { "epoch": 1.8559973270965586, "grad_norm": 0.20702818036079407, "learning_rate": 4.8892550876429575e-05, "loss": 0.4656, "num_input_tokens_seen": 20216224, "step": 16665 }, { "epoch": 1.8565541819801759, "grad_norm": 0.14405378699302673, "learning_rate": 4.889112010696465e-05, "loss": 0.4776, "num_input_tokens_seen": 20222304, "step": 16670 }, { "epoch": 1.8571110368637933, "grad_norm": 0.17178833484649658, "learning_rate": 4.888968843481664e-05, "loss": 0.4386, "num_input_tokens_seen": 20228736, "step": 16675 }, { "epoch": 1.8576678917474108, "grad_norm": 0.2181052416563034, "learning_rate": 4.888825586003964e-05, "loss": 0.4488, "num_input_tokens_seen": 20234720, "step": 16680 }, { "epoch": 1.858224746631028, "grad_norm": 0.20169313251972198, "learning_rate": 4.888682238268778e-05, "loss": 0.4817, "num_input_tokens_seen": 20240800, "step": 16685 }, { "epoch": 1.8587816015146452, "grad_norm": 0.1411435902118683, "learning_rate": 4.8885388002815224e-05, "loss": 0.4661, "num_input_tokens_seen": 20246528, "step": 16690 }, { "epoch": 1.8593384563982625, "grad_norm": 0.13867808878421783, "learning_rate": 4.8883952720476166e-05, "loss": 0.4627, "num_input_tokens_seen": 20252800, "step": 16695 }, { "epoch": 1.85989531128188, "grad_norm": 0.1427234560251236, "learning_rate": 4.888251653572484e-05, "loss": 0.4501, "num_input_tokens_seen": 20258848, "step": 16700 }, { "epoch": 1.8604521661654974, "grad_norm": 0.1450388878583908, "learning_rate": 4.8881079448615494e-05, "loss": 0.4635, "num_input_tokens_seen": 20264960, "step": 16705 }, { "epoch": 1.8610090210491146, "grad_norm": 0.18556001782417297, "learning_rate": 4.8879641459202444e-05, "loss": 0.4823, "num_input_tokens_seen": 20270880, "step": 16710 }, { "epoch": 1.8615658759327318, "grad_norm": 0.20847851037979126, "learning_rate": 4.887820256754001e-05, "loss": 0.4747, "num_input_tokens_seen": 20277120, "step": 16715 }, { "epoch": 1.8621227308163493, "grad_norm": 0.16452795267105103, "learning_rate": 4.887676277368256e-05, "loss": 0.475, "num_input_tokens_seen": 20283328, "step": 16720 }, { "epoch": 1.8626795856999667, "grad_norm": 0.12942832708358765, "learning_rate": 4.887532207768449e-05, "loss": 0.4677, "num_input_tokens_seen": 20289664, "step": 16725 }, { "epoch": 1.863236440583584, "grad_norm": 0.20724236965179443, "learning_rate": 4.887388047960025e-05, "loss": 0.4723, "num_input_tokens_seen": 20295808, "step": 16730 }, { "epoch": 1.8637932954672012, "grad_norm": 0.17326296865940094, "learning_rate": 4.88724379794843e-05, "loss": 0.4588, "num_input_tokens_seen": 20302176, "step": 16735 }, { "epoch": 1.8643501503508184, "grad_norm": 0.15391437709331512, "learning_rate": 4.887099457739113e-05, "loss": 0.4439, "num_input_tokens_seen": 20308384, "step": 16740 }, { "epoch": 1.8649070052344359, "grad_norm": 0.2369670271873474, "learning_rate": 4.8869550273375297e-05, "loss": 0.4514, "num_input_tokens_seen": 20314784, "step": 16745 }, { "epoch": 1.8654638601180533, "grad_norm": 0.18941284716129303, "learning_rate": 4.8868105067491356e-05, "loss": 0.4665, "num_input_tokens_seen": 20320928, "step": 16750 }, { "epoch": 1.8660207150016705, "grad_norm": 0.16787773370742798, "learning_rate": 4.8866658959793906e-05, "loss": 0.4706, "num_input_tokens_seen": 20327200, "step": 16755 }, { "epoch": 1.8665775698852878, "grad_norm": 0.17840279638767242, "learning_rate": 4.8865211950337605e-05, "loss": 0.4598, "num_input_tokens_seen": 20333280, "step": 16760 }, { "epoch": 1.8671344247689052, "grad_norm": 0.12490398436784744, "learning_rate": 4.886376403917712e-05, "loss": 0.4561, "num_input_tokens_seen": 20339232, "step": 16765 }, { "epoch": 1.8676912796525227, "grad_norm": 0.15854278206825256, "learning_rate": 4.886231522636715e-05, "loss": 0.4636, "num_input_tokens_seen": 20345792, "step": 16770 }, { "epoch": 1.86824813453614, "grad_norm": 0.11742818355560303, "learning_rate": 4.886086551196245e-05, "loss": 0.4735, "num_input_tokens_seen": 20352160, "step": 16775 }, { "epoch": 1.8688049894197571, "grad_norm": 0.18181879818439484, "learning_rate": 4.885941489601778e-05, "loss": 0.4673, "num_input_tokens_seen": 20358272, "step": 16780 }, { "epoch": 1.8693618443033744, "grad_norm": 0.2353844791650772, "learning_rate": 4.8857963378587946e-05, "loss": 0.4539, "num_input_tokens_seen": 20363648, "step": 16785 }, { "epoch": 1.8699186991869918, "grad_norm": 0.19843551516532898, "learning_rate": 4.8856510959727804e-05, "loss": 0.4611, "num_input_tokens_seen": 20369376, "step": 16790 }, { "epoch": 1.8704755540706093, "grad_norm": 0.17421524226665497, "learning_rate": 4.8855057639492225e-05, "loss": 0.4661, "num_input_tokens_seen": 20375456, "step": 16795 }, { "epoch": 1.8710324089542265, "grad_norm": 0.1649184376001358, "learning_rate": 4.885360341793612e-05, "loss": 0.4643, "num_input_tokens_seen": 20381632, "step": 16800 }, { "epoch": 1.8715892638378437, "grad_norm": 0.19251176714897156, "learning_rate": 4.8852148295114435e-05, "loss": 0.478, "num_input_tokens_seen": 20387552, "step": 16805 }, { "epoch": 1.8721461187214612, "grad_norm": 0.15704193711280823, "learning_rate": 4.8850692271082156e-05, "loss": 0.475, "num_input_tokens_seen": 20393664, "step": 16810 }, { "epoch": 1.8727029736050786, "grad_norm": 0.1693791151046753, "learning_rate": 4.884923534589428e-05, "loss": 0.4539, "num_input_tokens_seen": 20399968, "step": 16815 }, { "epoch": 1.8732598284886959, "grad_norm": 0.16898244619369507, "learning_rate": 4.884777751960588e-05, "loss": 0.4744, "num_input_tokens_seen": 20406080, "step": 16820 }, { "epoch": 1.873816683372313, "grad_norm": 0.15845806896686554, "learning_rate": 4.8846318792272006e-05, "loss": 0.4606, "num_input_tokens_seen": 20412128, "step": 16825 }, { "epoch": 1.8743735382559303, "grad_norm": 0.1629527062177658, "learning_rate": 4.8844859163947786e-05, "loss": 0.4689, "num_input_tokens_seen": 20418592, "step": 16830 }, { "epoch": 1.8749303931395478, "grad_norm": 0.19897295534610748, "learning_rate": 4.884339863468839e-05, "loss": 0.473, "num_input_tokens_seen": 20424768, "step": 16835 }, { "epoch": 1.8754872480231652, "grad_norm": 0.15952324867248535, "learning_rate": 4.884193720454897e-05, "loss": 0.4436, "num_input_tokens_seen": 20430400, "step": 16840 }, { "epoch": 1.8760441029067825, "grad_norm": 0.2722989022731781, "learning_rate": 4.884047487358476e-05, "loss": 0.4748, "num_input_tokens_seen": 20436800, "step": 16845 }, { "epoch": 1.8766009577903997, "grad_norm": 0.18931201100349426, "learning_rate": 4.883901164185101e-05, "loss": 0.4859, "num_input_tokens_seen": 20442688, "step": 16850 }, { "epoch": 1.8771578126740172, "grad_norm": 0.14343143999576569, "learning_rate": 4.8837547509403e-05, "loss": 0.4718, "num_input_tokens_seen": 20448928, "step": 16855 }, { "epoch": 1.8777146675576346, "grad_norm": 0.16066758334636688, "learning_rate": 4.883608247629606e-05, "loss": 0.4593, "num_input_tokens_seen": 20455200, "step": 16860 }, { "epoch": 1.8782715224412518, "grad_norm": 0.15867573022842407, "learning_rate": 4.8834616542585534e-05, "loss": 0.4651, "num_input_tokens_seen": 20461504, "step": 16865 }, { "epoch": 1.878828377324869, "grad_norm": 0.199089914560318, "learning_rate": 4.8833149708326815e-05, "loss": 0.4658, "num_input_tokens_seen": 20467584, "step": 16870 }, { "epoch": 1.8793852322084865, "grad_norm": 0.21682767570018768, "learning_rate": 4.8831681973575316e-05, "loss": 0.4633, "num_input_tokens_seen": 20472896, "step": 16875 }, { "epoch": 1.8799420870921038, "grad_norm": 0.2389862835407257, "learning_rate": 4.883021333838651e-05, "loss": 0.4565, "num_input_tokens_seen": 20478592, "step": 16880 }, { "epoch": 1.8804989419757212, "grad_norm": 0.16006816923618317, "learning_rate": 4.882874380281587e-05, "loss": 0.4586, "num_input_tokens_seen": 20484768, "step": 16885 }, { "epoch": 1.8810557968593384, "grad_norm": 0.20785163342952728, "learning_rate": 4.882727336691893e-05, "loss": 0.4488, "num_input_tokens_seen": 20490976, "step": 16890 }, { "epoch": 1.8816126517429557, "grad_norm": 0.19961164891719818, "learning_rate": 4.882580203075124e-05, "loss": 0.4574, "num_input_tokens_seen": 20497152, "step": 16895 }, { "epoch": 1.8821695066265731, "grad_norm": 0.18504099547863007, "learning_rate": 4.88243297943684e-05, "loss": 0.4713, "num_input_tokens_seen": 20503136, "step": 16900 }, { "epoch": 1.8827263615101906, "grad_norm": 0.16705553233623505, "learning_rate": 4.882285665782603e-05, "loss": 0.4639, "num_input_tokens_seen": 20509248, "step": 16905 }, { "epoch": 1.8832832163938078, "grad_norm": 0.1879202127456665, "learning_rate": 4.882138262117979e-05, "loss": 0.4593, "num_input_tokens_seen": 20515552, "step": 16910 }, { "epoch": 1.883840071277425, "grad_norm": 0.1644972711801529, "learning_rate": 4.8819907684485376e-05, "loss": 0.4562, "num_input_tokens_seen": 20521728, "step": 16915 }, { "epoch": 1.8843969261610425, "grad_norm": 0.24583183228969574, "learning_rate": 4.881843184779852e-05, "loss": 0.4636, "num_input_tokens_seen": 20527776, "step": 16920 }, { "epoch": 1.8849537810446597, "grad_norm": 0.2306586056947708, "learning_rate": 4.881695511117497e-05, "loss": 0.4671, "num_input_tokens_seen": 20534048, "step": 16925 }, { "epoch": 1.8855106359282772, "grad_norm": 0.15694496035575867, "learning_rate": 4.881547747467053e-05, "loss": 0.4945, "num_input_tokens_seen": 20540096, "step": 16930 }, { "epoch": 1.8860674908118944, "grad_norm": 0.1631072759628296, "learning_rate": 4.8813998938341044e-05, "loss": 0.4773, "num_input_tokens_seen": 20545600, "step": 16935 }, { "epoch": 1.8866243456955116, "grad_norm": 0.23348309099674225, "learning_rate": 4.881251950224235e-05, "loss": 0.4606, "num_input_tokens_seen": 20551680, "step": 16940 }, { "epoch": 1.887181200579129, "grad_norm": 0.1685580611228943, "learning_rate": 4.8811039166430364e-05, "loss": 0.4508, "num_input_tokens_seen": 20557664, "step": 16945 }, { "epoch": 1.8877380554627465, "grad_norm": 0.16962741315364838, "learning_rate": 4.880955793096101e-05, "loss": 0.4753, "num_input_tokens_seen": 20563232, "step": 16950 }, { "epoch": 1.8882949103463638, "grad_norm": 0.20361119508743286, "learning_rate": 4.880807579589025e-05, "loss": 0.4583, "num_input_tokens_seen": 20569248, "step": 16955 }, { "epoch": 1.888851765229981, "grad_norm": 0.16530773043632507, "learning_rate": 4.88065927612741e-05, "loss": 0.4639, "num_input_tokens_seen": 20575488, "step": 16960 }, { "epoch": 1.8894086201135984, "grad_norm": 0.235495924949646, "learning_rate": 4.8805108827168574e-05, "loss": 0.4702, "num_input_tokens_seen": 20581568, "step": 16965 }, { "epoch": 1.8899654749972157, "grad_norm": 0.2160061001777649, "learning_rate": 4.880362399362976e-05, "loss": 0.4747, "num_input_tokens_seen": 20587616, "step": 16970 }, { "epoch": 1.8905223298808331, "grad_norm": 0.17660202085971832, "learning_rate": 4.880213826071375e-05, "loss": 0.4726, "num_input_tokens_seen": 20593920, "step": 16975 }, { "epoch": 1.8910791847644504, "grad_norm": 0.1699819564819336, "learning_rate": 4.880065162847667e-05, "loss": 0.4538, "num_input_tokens_seen": 20599936, "step": 16980 }, { "epoch": 1.8916360396480676, "grad_norm": 0.17524972558021545, "learning_rate": 4.8799164096974695e-05, "loss": 0.4806, "num_input_tokens_seen": 20606272, "step": 16985 }, { "epoch": 1.892192894531685, "grad_norm": 0.1714523732662201, "learning_rate": 4.879767566626404e-05, "loss": 0.4757, "num_input_tokens_seen": 20611968, "step": 16990 }, { "epoch": 1.8927497494153025, "grad_norm": 0.1912199854850769, "learning_rate": 4.879618633640093e-05, "loss": 0.4595, "num_input_tokens_seen": 20617696, "step": 16995 }, { "epoch": 1.8933066042989197, "grad_norm": 0.1467604637145996, "learning_rate": 4.879469610744165e-05, "loss": 0.4722, "num_input_tokens_seen": 20623584, "step": 17000 }, { "epoch": 1.893863459182537, "grad_norm": 0.17178159952163696, "learning_rate": 4.879320497944249e-05, "loss": 0.4794, "num_input_tokens_seen": 20629984, "step": 17005 }, { "epoch": 1.8944203140661544, "grad_norm": 0.19633163511753082, "learning_rate": 4.87917129524598e-05, "loss": 0.4581, "num_input_tokens_seen": 20636000, "step": 17010 }, { "epoch": 1.8949771689497716, "grad_norm": 0.1951829046010971, "learning_rate": 4.879022002654995e-05, "loss": 0.48, "num_input_tokens_seen": 20642272, "step": 17015 }, { "epoch": 1.895534023833389, "grad_norm": 0.23981709778308868, "learning_rate": 4.878872620176934e-05, "loss": 0.4553, "num_input_tokens_seen": 20648224, "step": 17020 }, { "epoch": 1.8960908787170063, "grad_norm": 0.23673893511295319, "learning_rate": 4.8787231478174437e-05, "loss": 0.4528, "num_input_tokens_seen": 20654336, "step": 17025 }, { "epoch": 1.8966477336006236, "grad_norm": 0.19630837440490723, "learning_rate": 4.878573585582168e-05, "loss": 0.471, "num_input_tokens_seen": 20660608, "step": 17030 }, { "epoch": 1.897204588484241, "grad_norm": 0.18267884850502014, "learning_rate": 4.878423933476761e-05, "loss": 0.4724, "num_input_tokens_seen": 20666688, "step": 17035 }, { "epoch": 1.8977614433678585, "grad_norm": 0.17409846186637878, "learning_rate": 4.878274191506875e-05, "loss": 0.4679, "num_input_tokens_seen": 20672864, "step": 17040 }, { "epoch": 1.8983182982514757, "grad_norm": 0.18786579370498657, "learning_rate": 4.878124359678169e-05, "loss": 0.4517, "num_input_tokens_seen": 20678496, "step": 17045 }, { "epoch": 1.898875153135093, "grad_norm": 0.174225315451622, "learning_rate": 4.877974437996303e-05, "loss": 0.4662, "num_input_tokens_seen": 20684768, "step": 17050 }, { "epoch": 1.8994320080187104, "grad_norm": 0.18260006606578827, "learning_rate": 4.8778244264669424e-05, "loss": 0.48, "num_input_tokens_seen": 20690272, "step": 17055 }, { "epoch": 1.8999888629023276, "grad_norm": 0.20916706323623657, "learning_rate": 4.877674325095756e-05, "loss": 0.4634, "num_input_tokens_seen": 20696512, "step": 17060 }, { "epoch": 1.900545717785945, "grad_norm": 0.16191333532333374, "learning_rate": 4.8775241338884125e-05, "loss": 0.4621, "num_input_tokens_seen": 20702720, "step": 17065 }, { "epoch": 1.9011025726695623, "grad_norm": 0.18649013340473175, "learning_rate": 4.877373852850588e-05, "loss": 0.4658, "num_input_tokens_seen": 20709376, "step": 17070 }, { "epoch": 1.9016594275531795, "grad_norm": 0.17071416974067688, "learning_rate": 4.877223481987961e-05, "loss": 0.4856, "num_input_tokens_seen": 20715552, "step": 17075 }, { "epoch": 1.902216282436797, "grad_norm": 0.17069962620735168, "learning_rate": 4.877073021306213e-05, "loss": 0.4684, "num_input_tokens_seen": 20721696, "step": 17080 }, { "epoch": 1.9027731373204144, "grad_norm": 0.13129307329654694, "learning_rate": 4.876922470811029e-05, "loss": 0.4617, "num_input_tokens_seen": 20727840, "step": 17085 }, { "epoch": 1.9033299922040317, "grad_norm": 0.18580196797847748, "learning_rate": 4.876771830508096e-05, "loss": 0.4782, "num_input_tokens_seen": 20733920, "step": 17090 }, { "epoch": 1.9038868470876489, "grad_norm": 0.2284776270389557, "learning_rate": 4.876621100403107e-05, "loss": 0.4738, "num_input_tokens_seen": 20739904, "step": 17095 }, { "epoch": 1.9044437019712663, "grad_norm": 0.16905298829078674, "learning_rate": 4.876470280501756e-05, "loss": 0.4727, "num_input_tokens_seen": 20746016, "step": 17100 }, { "epoch": 1.9050005568548836, "grad_norm": 0.18518836796283722, "learning_rate": 4.876319370809743e-05, "loss": 0.4684, "num_input_tokens_seen": 20752224, "step": 17105 }, { "epoch": 1.905557411738501, "grad_norm": 0.1385151743888855, "learning_rate": 4.8761683713327676e-05, "loss": 0.4574, "num_input_tokens_seen": 20758432, "step": 17110 }, { "epoch": 1.9061142666221182, "grad_norm": 0.13437700271606445, "learning_rate": 4.876017282076537e-05, "loss": 0.4661, "num_input_tokens_seen": 20764608, "step": 17115 }, { "epoch": 1.9066711215057355, "grad_norm": 0.14077815413475037, "learning_rate": 4.87586610304676e-05, "loss": 0.4638, "num_input_tokens_seen": 20770944, "step": 17120 }, { "epoch": 1.907227976389353, "grad_norm": 0.14451059699058533, "learning_rate": 4.875714834249146e-05, "loss": 0.4764, "num_input_tokens_seen": 20776864, "step": 17125 }, { "epoch": 1.9077848312729704, "grad_norm": 0.1704367995262146, "learning_rate": 4.875563475689414e-05, "loss": 0.4595, "num_input_tokens_seen": 20783104, "step": 17130 }, { "epoch": 1.9083416861565876, "grad_norm": 0.19262486696243286, "learning_rate": 4.8754120273732794e-05, "loss": 0.4628, "num_input_tokens_seen": 20789216, "step": 17135 }, { "epoch": 1.9088985410402048, "grad_norm": 0.15767687559127808, "learning_rate": 4.875260489306467e-05, "loss": 0.4727, "num_input_tokens_seen": 20795040, "step": 17140 }, { "epoch": 1.9094553959238223, "grad_norm": 0.19213467836380005, "learning_rate": 4.875108861494701e-05, "loss": 0.4747, "num_input_tokens_seen": 20800800, "step": 17145 }, { "epoch": 1.9100122508074395, "grad_norm": 0.14375422894954681, "learning_rate": 4.8749571439437114e-05, "loss": 0.4529, "num_input_tokens_seen": 20806784, "step": 17150 }, { "epoch": 1.910569105691057, "grad_norm": 0.14414899051189423, "learning_rate": 4.8748053366592295e-05, "loss": 0.4525, "num_input_tokens_seen": 20812832, "step": 17155 }, { "epoch": 1.9111259605746742, "grad_norm": 0.1474706083536148, "learning_rate": 4.874653439646991e-05, "loss": 0.4634, "num_input_tokens_seen": 20818912, "step": 17160 }, { "epoch": 1.9116828154582914, "grad_norm": 0.19245962798595428, "learning_rate": 4.874501452912737e-05, "loss": 0.4728, "num_input_tokens_seen": 20824960, "step": 17165 }, { "epoch": 1.912239670341909, "grad_norm": 0.14963074028491974, "learning_rate": 4.8743493764622075e-05, "loss": 0.4699, "num_input_tokens_seen": 20831136, "step": 17170 }, { "epoch": 1.9127965252255263, "grad_norm": 0.15182746946811676, "learning_rate": 4.8741972103011504e-05, "loss": 0.4586, "num_input_tokens_seen": 20837088, "step": 17175 }, { "epoch": 1.9133533801091436, "grad_norm": 0.21247659623622894, "learning_rate": 4.8740449544353136e-05, "loss": 0.4628, "num_input_tokens_seen": 20843392, "step": 17180 }, { "epoch": 1.9139102349927608, "grad_norm": 0.15217675268650055, "learning_rate": 4.873892608870451e-05, "loss": 0.4533, "num_input_tokens_seen": 20849216, "step": 17185 }, { "epoch": 1.9144670898763783, "grad_norm": 0.1484590768814087, "learning_rate": 4.8737401736123166e-05, "loss": 0.4689, "num_input_tokens_seen": 20855392, "step": 17190 }, { "epoch": 1.9150239447599957, "grad_norm": 0.24827247858047485, "learning_rate": 4.8735876486666734e-05, "loss": 0.4434, "num_input_tokens_seen": 20861344, "step": 17195 }, { "epoch": 1.915580799643613, "grad_norm": 0.1357680857181549, "learning_rate": 4.873435034039281e-05, "loss": 0.4569, "num_input_tokens_seen": 20867392, "step": 17200 }, { "epoch": 1.9161376545272302, "grad_norm": 0.15444931387901306, "learning_rate": 4.873282329735907e-05, "loss": 0.4587, "num_input_tokens_seen": 20873632, "step": 17205 }, { "epoch": 1.9166945094108474, "grad_norm": 0.1257646083831787, "learning_rate": 4.873129535762322e-05, "loss": 0.4465, "num_input_tokens_seen": 20880192, "step": 17210 }, { "epoch": 1.9172513642944649, "grad_norm": 0.18014268577098846, "learning_rate": 4.872976652124297e-05, "loss": 0.4734, "num_input_tokens_seen": 20886400, "step": 17215 }, { "epoch": 1.9178082191780823, "grad_norm": 0.18088407814502716, "learning_rate": 4.8728236788276105e-05, "loss": 0.4586, "num_input_tokens_seen": 20892736, "step": 17220 }, { "epoch": 1.9183650740616995, "grad_norm": 0.262373149394989, "learning_rate": 4.8726706158780404e-05, "loss": 0.4874, "num_input_tokens_seen": 20898848, "step": 17225 }, { "epoch": 1.9189219289453168, "grad_norm": 0.16478782892227173, "learning_rate": 4.8725174632813715e-05, "loss": 0.4814, "num_input_tokens_seen": 20904448, "step": 17230 }, { "epoch": 1.9194787838289342, "grad_norm": 0.23082517087459564, "learning_rate": 4.8723642210433894e-05, "loss": 0.4538, "num_input_tokens_seen": 20910784, "step": 17235 }, { "epoch": 1.9200356387125517, "grad_norm": 0.2190294861793518, "learning_rate": 4.8722108891698845e-05, "loss": 0.4428, "num_input_tokens_seen": 20916960, "step": 17240 }, { "epoch": 1.920592493596169, "grad_norm": 0.16287320852279663, "learning_rate": 4.87205746766665e-05, "loss": 0.461, "num_input_tokens_seen": 20923264, "step": 17245 }, { "epoch": 1.9211493484797861, "grad_norm": 0.15071263909339905, "learning_rate": 4.871903956539483e-05, "loss": 0.4383, "num_input_tokens_seen": 20928576, "step": 17250 }, { "epoch": 1.9217062033634034, "grad_norm": 0.21837614476680756, "learning_rate": 4.871750355794183e-05, "loss": 0.4843, "num_input_tokens_seen": 20934880, "step": 17255 }, { "epoch": 1.9222630582470208, "grad_norm": 0.19014623761177063, "learning_rate": 4.871596665436554e-05, "loss": 0.464, "num_input_tokens_seen": 20940032, "step": 17260 }, { "epoch": 1.9228199131306383, "grad_norm": 0.169485405087471, "learning_rate": 4.871442885472403e-05, "loss": 0.4699, "num_input_tokens_seen": 20946240, "step": 17265 }, { "epoch": 1.9233767680142555, "grad_norm": 0.15270814299583435, "learning_rate": 4.87128901590754e-05, "loss": 0.4674, "num_input_tokens_seen": 20951776, "step": 17270 }, { "epoch": 1.9239336228978727, "grad_norm": 0.17326824367046356, "learning_rate": 4.871135056747779e-05, "loss": 0.4591, "num_input_tokens_seen": 20958336, "step": 17275 }, { "epoch": 1.9244904777814902, "grad_norm": 0.1683216691017151, "learning_rate": 4.870981007998937e-05, "loss": 0.4515, "num_input_tokens_seen": 20964288, "step": 17280 }, { "epoch": 1.9250473326651076, "grad_norm": 0.13121123611927032, "learning_rate": 4.870826869666834e-05, "loss": 0.467, "num_input_tokens_seen": 20970464, "step": 17285 }, { "epoch": 1.9256041875487249, "grad_norm": 0.15004529058933258, "learning_rate": 4.870672641757295e-05, "loss": 0.4637, "num_input_tokens_seen": 20976096, "step": 17290 }, { "epoch": 1.926161042432342, "grad_norm": 0.14913097023963928, "learning_rate": 4.870518324276145e-05, "loss": 0.4723, "num_input_tokens_seen": 20981984, "step": 17295 }, { "epoch": 1.9267178973159593, "grad_norm": 0.15270563960075378, "learning_rate": 4.8703639172292165e-05, "loss": 0.4587, "num_input_tokens_seen": 20988160, "step": 17300 }, { "epoch": 1.9272747521995768, "grad_norm": 0.2031325250864029, "learning_rate": 4.870209420622344e-05, "loss": 0.4704, "num_input_tokens_seen": 20994496, "step": 17305 }, { "epoch": 1.9278316070831942, "grad_norm": 0.14145272970199585, "learning_rate": 4.8700548344613624e-05, "loss": 0.4848, "num_input_tokens_seen": 21000448, "step": 17310 }, { "epoch": 1.9283884619668115, "grad_norm": 0.13738678395748138, "learning_rate": 4.869900158752114e-05, "loss": 0.4804, "num_input_tokens_seen": 21006464, "step": 17315 }, { "epoch": 1.9289453168504287, "grad_norm": 0.16907332837581635, "learning_rate": 4.8697453935004445e-05, "loss": 0.4576, "num_input_tokens_seen": 21012416, "step": 17320 }, { "epoch": 1.9295021717340461, "grad_norm": 0.1430920660495758, "learning_rate": 4.8695905387121985e-05, "loss": 0.4595, "num_input_tokens_seen": 21018272, "step": 17325 }, { "epoch": 1.9300590266176636, "grad_norm": 0.22227221727371216, "learning_rate": 4.8694355943932286e-05, "loss": 0.4737, "num_input_tokens_seen": 21024640, "step": 17330 }, { "epoch": 1.9306158815012808, "grad_norm": 0.1320011168718338, "learning_rate": 4.869280560549389e-05, "loss": 0.4824, "num_input_tokens_seen": 21030688, "step": 17335 }, { "epoch": 1.931172736384898, "grad_norm": 0.13753190636634827, "learning_rate": 4.869125437186536e-05, "loss": 0.459, "num_input_tokens_seen": 21036736, "step": 17340 }, { "epoch": 1.9317295912685153, "grad_norm": 0.20792633295059204, "learning_rate": 4.868970224310533e-05, "loss": 0.4702, "num_input_tokens_seen": 21042752, "step": 17345 }, { "epoch": 1.9322864461521327, "grad_norm": 0.16072583198547363, "learning_rate": 4.868814921927242e-05, "loss": 0.4547, "num_input_tokens_seen": 21047968, "step": 17350 }, { "epoch": 1.9328433010357502, "grad_norm": 0.16212496161460876, "learning_rate": 4.868659530042533e-05, "loss": 0.4556, "num_input_tokens_seen": 21053600, "step": 17355 }, { "epoch": 1.9334001559193674, "grad_norm": 0.23493413627147675, "learning_rate": 4.8685040486622754e-05, "loss": 0.5047, "num_input_tokens_seen": 21059552, "step": 17360 }, { "epoch": 1.9339570108029847, "grad_norm": 0.19238507747650146, "learning_rate": 4.868348477792345e-05, "loss": 0.4784, "num_input_tokens_seen": 21065344, "step": 17365 }, { "epoch": 1.934513865686602, "grad_norm": 0.21082597970962524, "learning_rate": 4.86819281743862e-05, "loss": 0.4629, "num_input_tokens_seen": 21071488, "step": 17370 }, { "epoch": 1.9350707205702196, "grad_norm": 0.1634354442358017, "learning_rate": 4.868037067606981e-05, "loss": 0.4688, "num_input_tokens_seen": 21077632, "step": 17375 }, { "epoch": 1.9356275754538368, "grad_norm": 0.1485954374074936, "learning_rate": 4.867881228303312e-05, "loss": 0.4523, "num_input_tokens_seen": 21083808, "step": 17380 }, { "epoch": 1.936184430337454, "grad_norm": 0.14564388990402222, "learning_rate": 4.8677252995335024e-05, "loss": 0.4673, "num_input_tokens_seen": 21090144, "step": 17385 }, { "epoch": 1.9367412852210713, "grad_norm": 0.14689704775810242, "learning_rate": 4.867569281303443e-05, "loss": 0.4808, "num_input_tokens_seen": 21096288, "step": 17390 }, { "epoch": 1.9372981401046887, "grad_norm": 0.1942971795797348, "learning_rate": 4.8674131736190284e-05, "loss": 0.4725, "num_input_tokens_seen": 21102656, "step": 17395 }, { "epoch": 1.9378549949883062, "grad_norm": 0.16630366444587708, "learning_rate": 4.8672569764861584e-05, "loss": 0.4453, "num_input_tokens_seen": 21108256, "step": 17400 }, { "epoch": 1.9384118498719234, "grad_norm": 0.15313571691513062, "learning_rate": 4.867100689910734e-05, "loss": 0.4622, "num_input_tokens_seen": 21114624, "step": 17405 }, { "epoch": 1.9389687047555406, "grad_norm": 0.15838374197483063, "learning_rate": 4.866944313898658e-05, "loss": 0.47, "num_input_tokens_seen": 21120288, "step": 17410 }, { "epoch": 1.939525559639158, "grad_norm": 0.1631201207637787, "learning_rate": 4.866787848455842e-05, "loss": 0.4652, "num_input_tokens_seen": 21126464, "step": 17415 }, { "epoch": 1.9400824145227755, "grad_norm": 0.16237041354179382, "learning_rate": 4.8666312935881965e-05, "loss": 0.475, "num_input_tokens_seen": 21132416, "step": 17420 }, { "epoch": 1.9406392694063928, "grad_norm": 0.1516467034816742, "learning_rate": 4.8664746493016355e-05, "loss": 0.4541, "num_input_tokens_seen": 21138784, "step": 17425 }, { "epoch": 1.94119612429001, "grad_norm": 0.1954255849123001, "learning_rate": 4.8663179156020787e-05, "loss": 0.451, "num_input_tokens_seen": 21144960, "step": 17430 }, { "epoch": 1.9417529791736272, "grad_norm": 0.18312446773052216, "learning_rate": 4.866161092495448e-05, "loss": 0.4575, "num_input_tokens_seen": 21151200, "step": 17435 }, { "epoch": 1.9423098340572447, "grad_norm": 0.14768880605697632, "learning_rate": 4.866004179987669e-05, "loss": 0.4591, "num_input_tokens_seen": 21157504, "step": 17440 }, { "epoch": 1.9428666889408621, "grad_norm": 0.14297236502170563, "learning_rate": 4.865847178084669e-05, "loss": 0.4619, "num_input_tokens_seen": 21163680, "step": 17445 }, { "epoch": 1.9434235438244793, "grad_norm": 0.19106905162334442, "learning_rate": 4.8656900867923814e-05, "loss": 0.4619, "num_input_tokens_seen": 21169824, "step": 17450 }, { "epoch": 1.9439803987080966, "grad_norm": 0.15859049558639526, "learning_rate": 4.865532906116741e-05, "loss": 0.4567, "num_input_tokens_seen": 21175584, "step": 17455 }, { "epoch": 1.944537253591714, "grad_norm": 0.14950130879878998, "learning_rate": 4.8653756360636865e-05, "loss": 0.4681, "num_input_tokens_seen": 21181888, "step": 17460 }, { "epoch": 1.9450941084753315, "grad_norm": 0.18742279708385468, "learning_rate": 4.865218276639161e-05, "loss": 0.4599, "num_input_tokens_seen": 21188000, "step": 17465 }, { "epoch": 1.9456509633589487, "grad_norm": 0.14934450387954712, "learning_rate": 4.8650608278491084e-05, "loss": 0.478, "num_input_tokens_seen": 21194240, "step": 17470 }, { "epoch": 1.946207818242566, "grad_norm": 0.2545730471611023, "learning_rate": 4.864903289699478e-05, "loss": 0.4719, "num_input_tokens_seen": 21200544, "step": 17475 }, { "epoch": 1.9467646731261832, "grad_norm": 0.16079016029834747, "learning_rate": 4.864745662196224e-05, "loss": 0.4687, "num_input_tokens_seen": 21206528, "step": 17480 }, { "epoch": 1.9473215280098006, "grad_norm": 0.15381698310375214, "learning_rate": 4.8645879453453e-05, "loss": 0.4849, "num_input_tokens_seen": 21212608, "step": 17485 }, { "epoch": 1.947878382893418, "grad_norm": 0.15316292643547058, "learning_rate": 4.864430139152666e-05, "loss": 0.4636, "num_input_tokens_seen": 21218784, "step": 17490 }, { "epoch": 1.9484352377770353, "grad_norm": 0.14115743339061737, "learning_rate": 4.864272243624284e-05, "loss": 0.4452, "num_input_tokens_seen": 21224576, "step": 17495 }, { "epoch": 1.9489920926606525, "grad_norm": 0.18450208008289337, "learning_rate": 4.864114258766119e-05, "loss": 0.4795, "num_input_tokens_seen": 21230176, "step": 17500 }, { "epoch": 1.94954894754427, "grad_norm": 0.19936352968215942, "learning_rate": 4.8639561845841424e-05, "loss": 0.4425, "num_input_tokens_seen": 21236256, "step": 17505 }, { "epoch": 1.9501058024278874, "grad_norm": 0.21936656534671783, "learning_rate": 4.8637980210843254e-05, "loss": 0.4776, "num_input_tokens_seen": 21242496, "step": 17510 }, { "epoch": 1.9506626573115047, "grad_norm": 0.19121456146240234, "learning_rate": 4.863639768272644e-05, "loss": 0.4799, "num_input_tokens_seen": 21248448, "step": 17515 }, { "epoch": 1.951219512195122, "grad_norm": 0.2022031545639038, "learning_rate": 4.8634814261550766e-05, "loss": 0.4599, "num_input_tokens_seen": 21254336, "step": 17520 }, { "epoch": 1.9517763670787391, "grad_norm": 0.16834276914596558, "learning_rate": 4.863322994737608e-05, "loss": 0.4835, "num_input_tokens_seen": 21260704, "step": 17525 }, { "epoch": 1.9523332219623566, "grad_norm": 0.15787237882614136, "learning_rate": 4.863164474026222e-05, "loss": 0.4654, "num_input_tokens_seen": 21266432, "step": 17530 }, { "epoch": 1.952890076845974, "grad_norm": 0.19049619138240814, "learning_rate": 4.8630058640269093e-05, "loss": 0.448, "num_input_tokens_seen": 21272512, "step": 17535 }, { "epoch": 1.9534469317295913, "grad_norm": 0.19070109724998474, "learning_rate": 4.862847164745663e-05, "loss": 0.4662, "num_input_tokens_seen": 21278592, "step": 17540 }, { "epoch": 1.9540037866132085, "grad_norm": 0.18521898984909058, "learning_rate": 4.862688376188478e-05, "loss": 0.467, "num_input_tokens_seen": 21284800, "step": 17545 }, { "epoch": 1.954560641496826, "grad_norm": 0.14091630280017853, "learning_rate": 4.862529498361355e-05, "loss": 0.4684, "num_input_tokens_seen": 21290912, "step": 17550 }, { "epoch": 1.9551174963804434, "grad_norm": 0.22523795068264008, "learning_rate": 4.862370531270296e-05, "loss": 0.4558, "num_input_tokens_seen": 21296928, "step": 17555 }, { "epoch": 1.9556743512640606, "grad_norm": 0.23362761735916138, "learning_rate": 4.8622114749213085e-05, "loss": 0.4942, "num_input_tokens_seen": 21303136, "step": 17560 }, { "epoch": 1.9562312061476779, "grad_norm": 0.2010263204574585, "learning_rate": 4.862052329320401e-05, "loss": 0.4544, "num_input_tokens_seen": 21309152, "step": 17565 }, { "epoch": 1.956788061031295, "grad_norm": 0.2243632823228836, "learning_rate": 4.8618930944735865e-05, "loss": 0.4409, "num_input_tokens_seen": 21315296, "step": 17570 }, { "epoch": 1.9573449159149126, "grad_norm": 0.17145375907421112, "learning_rate": 4.861733770386882e-05, "loss": 0.4522, "num_input_tokens_seen": 21321536, "step": 17575 }, { "epoch": 1.95790177079853, "grad_norm": 0.18422053754329681, "learning_rate": 4.861574357066307e-05, "loss": 0.4546, "num_input_tokens_seen": 21327648, "step": 17580 }, { "epoch": 1.9584586256821472, "grad_norm": 0.12505649030208588, "learning_rate": 4.8614148545178856e-05, "loss": 0.4608, "num_input_tokens_seen": 21333184, "step": 17585 }, { "epoch": 1.9590154805657645, "grad_norm": 0.1708834022283554, "learning_rate": 4.861255262747643e-05, "loss": 0.4696, "num_input_tokens_seen": 21339456, "step": 17590 }, { "epoch": 1.959572335449382, "grad_norm": 0.1700022965669632, "learning_rate": 4.861095581761609e-05, "loss": 0.4648, "num_input_tokens_seen": 21345984, "step": 17595 }, { "epoch": 1.9601291903329994, "grad_norm": 0.16460084915161133, "learning_rate": 4.860935811565818e-05, "loss": 0.482, "num_input_tokens_seen": 21352320, "step": 17600 }, { "epoch": 1.9606860452166166, "grad_norm": 0.20997513830661774, "learning_rate": 4.860775952166306e-05, "loss": 0.4576, "num_input_tokens_seen": 21357984, "step": 17605 }, { "epoch": 1.9612429001002338, "grad_norm": 0.15854112803936005, "learning_rate": 4.860616003569113e-05, "loss": 0.4429, "num_input_tokens_seen": 21363904, "step": 17610 }, { "epoch": 1.961799754983851, "grad_norm": 0.15359103679656982, "learning_rate": 4.8604559657802826e-05, "loss": 0.4688, "num_input_tokens_seen": 21370144, "step": 17615 }, { "epoch": 1.9623566098674685, "grad_norm": 0.1568755954504013, "learning_rate": 4.860295838805861e-05, "loss": 0.4689, "num_input_tokens_seen": 21375904, "step": 17620 }, { "epoch": 1.962913464751086, "grad_norm": 0.1791290044784546, "learning_rate": 4.860135622651899e-05, "loss": 0.4447, "num_input_tokens_seen": 21382240, "step": 17625 }, { "epoch": 1.9634703196347032, "grad_norm": 0.16195712983608246, "learning_rate": 4.8599753173244496e-05, "loss": 0.4691, "num_input_tokens_seen": 21388256, "step": 17630 }, { "epoch": 1.9640271745183204, "grad_norm": 0.19845400750637054, "learning_rate": 4.85981492282957e-05, "loss": 0.466, "num_input_tokens_seen": 21393824, "step": 17635 }, { "epoch": 1.9645840294019379, "grad_norm": 0.17632056772708893, "learning_rate": 4.8596544391733204e-05, "loss": 0.4784, "num_input_tokens_seen": 21399840, "step": 17640 }, { "epoch": 1.9651408842855553, "grad_norm": 0.1921873539686203, "learning_rate": 4.859493866361763e-05, "loss": 0.4527, "num_input_tokens_seen": 21405824, "step": 17645 }, { "epoch": 1.9656977391691726, "grad_norm": 0.1930367797613144, "learning_rate": 4.8593332044009666e-05, "loss": 0.4673, "num_input_tokens_seen": 21412064, "step": 17650 }, { "epoch": 1.9662545940527898, "grad_norm": 0.1386537253856659, "learning_rate": 4.859172453297002e-05, "loss": 0.4651, "num_input_tokens_seen": 21418336, "step": 17655 }, { "epoch": 1.966811448936407, "grad_norm": 0.1627306193113327, "learning_rate": 4.859011613055941e-05, "loss": 0.4515, "num_input_tokens_seen": 21424384, "step": 17660 }, { "epoch": 1.9673683038200245, "grad_norm": 0.14810964465141296, "learning_rate": 4.85885068368386e-05, "loss": 0.4661, "num_input_tokens_seen": 21430656, "step": 17665 }, { "epoch": 1.967925158703642, "grad_norm": 0.16037794947624207, "learning_rate": 4.858689665186842e-05, "loss": 0.4598, "num_input_tokens_seen": 21436928, "step": 17670 }, { "epoch": 1.9684820135872592, "grad_norm": 0.16455937922000885, "learning_rate": 4.85852855757097e-05, "loss": 0.473, "num_input_tokens_seen": 21443232, "step": 17675 }, { "epoch": 1.9690388684708764, "grad_norm": 0.1834389865398407, "learning_rate": 4.8583673608423305e-05, "loss": 0.4858, "num_input_tokens_seen": 21449600, "step": 17680 }, { "epoch": 1.9695957233544938, "grad_norm": 0.17616601288318634, "learning_rate": 4.8582060750070146e-05, "loss": 0.4796, "num_input_tokens_seen": 21455680, "step": 17685 }, { "epoch": 1.9701525782381113, "grad_norm": 0.2094830721616745, "learning_rate": 4.858044700071116e-05, "loss": 0.4707, "num_input_tokens_seen": 21461856, "step": 17690 }, { "epoch": 1.9707094331217285, "grad_norm": 0.15668164193630219, "learning_rate": 4.857883236040732e-05, "loss": 0.4706, "num_input_tokens_seen": 21466848, "step": 17695 }, { "epoch": 1.9712662880053458, "grad_norm": 0.2304825782775879, "learning_rate": 4.857721682921963e-05, "loss": 0.4601, "num_input_tokens_seen": 21473088, "step": 17700 }, { "epoch": 1.971823142888963, "grad_norm": 0.15629997849464417, "learning_rate": 4.857560040720913e-05, "loss": 0.4854, "num_input_tokens_seen": 21479392, "step": 17705 }, { "epoch": 1.9723799977725804, "grad_norm": 0.16139143705368042, "learning_rate": 4.8573983094436896e-05, "loss": 0.4525, "num_input_tokens_seen": 21485408, "step": 17710 }, { "epoch": 1.972936852656198, "grad_norm": 0.18817605078220367, "learning_rate": 4.857236489096403e-05, "loss": 0.4561, "num_input_tokens_seen": 21491488, "step": 17715 }, { "epoch": 1.9734937075398151, "grad_norm": 0.19825169444084167, "learning_rate": 4.857074579685168e-05, "loss": 0.47, "num_input_tokens_seen": 21497472, "step": 17720 }, { "epoch": 1.9740505624234324, "grad_norm": 0.17098861932754517, "learning_rate": 4.856912581216102e-05, "loss": 0.4549, "num_input_tokens_seen": 21503328, "step": 17725 }, { "epoch": 1.9746074173070498, "grad_norm": 0.1620524376630783, "learning_rate": 4.856750493695324e-05, "loss": 0.4721, "num_input_tokens_seen": 21509440, "step": 17730 }, { "epoch": 1.9751642721906673, "grad_norm": 0.20640134811401367, "learning_rate": 4.8565883171289614e-05, "loss": 0.4521, "num_input_tokens_seen": 21515520, "step": 17735 }, { "epoch": 1.9757211270742845, "grad_norm": 0.1682216227054596, "learning_rate": 4.85642605152314e-05, "loss": 0.4533, "num_input_tokens_seen": 21521504, "step": 17740 }, { "epoch": 1.9762779819579017, "grad_norm": 0.1697814017534256, "learning_rate": 4.856263696883991e-05, "loss": 0.4603, "num_input_tokens_seen": 21527616, "step": 17745 }, { "epoch": 1.976834836841519, "grad_norm": 0.2044767141342163, "learning_rate": 4.856101253217648e-05, "loss": 0.4705, "num_input_tokens_seen": 21533728, "step": 17750 }, { "epoch": 1.9773916917251364, "grad_norm": 0.1594896912574768, "learning_rate": 4.8559387205302496e-05, "loss": 0.4663, "num_input_tokens_seen": 21539776, "step": 17755 }, { "epoch": 1.9779485466087539, "grad_norm": 0.2036985605955124, "learning_rate": 4.855776098827935e-05, "loss": 0.4853, "num_input_tokens_seen": 21545984, "step": 17760 }, { "epoch": 1.978505401492371, "grad_norm": 0.1803402602672577, "learning_rate": 4.855613388116852e-05, "loss": 0.497, "num_input_tokens_seen": 21552000, "step": 17765 }, { "epoch": 1.9790622563759883, "grad_norm": 0.14164279401302338, "learning_rate": 4.855450588403145e-05, "loss": 0.4661, "num_input_tokens_seen": 21558080, "step": 17770 }, { "epoch": 1.9796191112596058, "grad_norm": 0.15748555958271027, "learning_rate": 4.855287699692967e-05, "loss": 0.4688, "num_input_tokens_seen": 21564064, "step": 17775 }, { "epoch": 1.9801759661432232, "grad_norm": 0.1948240101337433, "learning_rate": 4.855124721992471e-05, "loss": 0.4584, "num_input_tokens_seen": 21570496, "step": 17780 }, { "epoch": 1.9807328210268405, "grad_norm": 0.14369115233421326, "learning_rate": 4.854961655307816e-05, "loss": 0.4651, "num_input_tokens_seen": 21576672, "step": 17785 }, { "epoch": 1.9812896759104577, "grad_norm": 0.18698619306087494, "learning_rate": 4.854798499645163e-05, "loss": 0.4895, "num_input_tokens_seen": 21583328, "step": 17790 }, { "epoch": 1.981846530794075, "grad_norm": 0.12905555963516235, "learning_rate": 4.854635255010677e-05, "loss": 0.4859, "num_input_tokens_seen": 21589568, "step": 17795 }, { "epoch": 1.9824033856776924, "grad_norm": 0.18737086653709412, "learning_rate": 4.854471921410524e-05, "loss": 0.4644, "num_input_tokens_seen": 21595264, "step": 17800 }, { "epoch": 1.9829602405613098, "grad_norm": 0.18260732293128967, "learning_rate": 4.854308498850877e-05, "loss": 0.4662, "num_input_tokens_seen": 21601888, "step": 17805 }, { "epoch": 1.983517095444927, "grad_norm": 0.13148342072963715, "learning_rate": 4.854144987337911e-05, "loss": 0.4781, "num_input_tokens_seen": 21607648, "step": 17810 }, { "epoch": 1.9840739503285443, "grad_norm": 0.16645412147045135, "learning_rate": 4.853981386877802e-05, "loss": 0.474, "num_input_tokens_seen": 21613568, "step": 17815 }, { "epoch": 1.9846308052121617, "grad_norm": 0.17974533140659332, "learning_rate": 4.853817697476732e-05, "loss": 0.4685, "num_input_tokens_seen": 21619968, "step": 17820 }, { "epoch": 1.9851876600957792, "grad_norm": 0.19057440757751465, "learning_rate": 4.853653919140888e-05, "loss": 0.4576, "num_input_tokens_seen": 21625856, "step": 17825 }, { "epoch": 1.9857445149793964, "grad_norm": 0.12486878037452698, "learning_rate": 4.853490051876455e-05, "loss": 0.46, "num_input_tokens_seen": 21631872, "step": 17830 }, { "epoch": 1.9863013698630136, "grad_norm": 0.21456511318683624, "learning_rate": 4.853326095689627e-05, "loss": 0.4705, "num_input_tokens_seen": 21637280, "step": 17835 }, { "epoch": 1.9868582247466309, "grad_norm": 0.22732719779014587, "learning_rate": 4.853162050586596e-05, "loss": 0.4713, "num_input_tokens_seen": 21643616, "step": 17840 }, { "epoch": 1.9874150796302483, "grad_norm": 0.11785648763179779, "learning_rate": 4.852997916573562e-05, "loss": 0.468, "num_input_tokens_seen": 21650016, "step": 17845 }, { "epoch": 1.9879719345138658, "grad_norm": 0.15440137684345245, "learning_rate": 4.852833693656726e-05, "loss": 0.4494, "num_input_tokens_seen": 21656320, "step": 17850 }, { "epoch": 1.988528789397483, "grad_norm": 0.1748238205909729, "learning_rate": 4.8526693818422944e-05, "loss": 0.4549, "num_input_tokens_seen": 21662592, "step": 17855 }, { "epoch": 1.9890856442811002, "grad_norm": 0.1494361311197281, "learning_rate": 4.852504981136472e-05, "loss": 0.4745, "num_input_tokens_seen": 21668256, "step": 17860 }, { "epoch": 1.9896424991647177, "grad_norm": 0.16132402420043945, "learning_rate": 4.852340491545474e-05, "loss": 0.4562, "num_input_tokens_seen": 21674080, "step": 17865 }, { "epoch": 1.9901993540483351, "grad_norm": 0.17652131617069244, "learning_rate": 4.852175913075513e-05, "loss": 0.4771, "num_input_tokens_seen": 21680416, "step": 17870 }, { "epoch": 1.9907562089319524, "grad_norm": 0.15778326988220215, "learning_rate": 4.852011245732809e-05, "loss": 0.4639, "num_input_tokens_seen": 21686496, "step": 17875 }, { "epoch": 1.9913130638155696, "grad_norm": 0.1953737884759903, "learning_rate": 4.851846489523582e-05, "loss": 0.4754, "num_input_tokens_seen": 21692928, "step": 17880 }, { "epoch": 1.9918699186991868, "grad_norm": 0.2631228268146515, "learning_rate": 4.851681644454058e-05, "loss": 0.4667, "num_input_tokens_seen": 21699488, "step": 17885 }, { "epoch": 1.9924267735828043, "grad_norm": 0.1547699123620987, "learning_rate": 4.851516710530465e-05, "loss": 0.4606, "num_input_tokens_seen": 21705536, "step": 17890 }, { "epoch": 1.9929836284664217, "grad_norm": 0.11923374980688095, "learning_rate": 4.851351687759035e-05, "loss": 0.4656, "num_input_tokens_seen": 21710976, "step": 17895 }, { "epoch": 1.993540483350039, "grad_norm": 0.1630834937095642, "learning_rate": 4.8511865761460024e-05, "loss": 0.4745, "num_input_tokens_seen": 21717024, "step": 17900 }, { "epoch": 1.9940973382336562, "grad_norm": 0.18992142379283905, "learning_rate": 4.851021375697606e-05, "loss": 0.4844, "num_input_tokens_seen": 21722976, "step": 17905 }, { "epoch": 1.9946541931172737, "grad_norm": 0.13021747767925262, "learning_rate": 4.850856086420089e-05, "loss": 0.4771, "num_input_tokens_seen": 21729056, "step": 17910 }, { "epoch": 1.995211048000891, "grad_norm": 0.2111894190311432, "learning_rate": 4.850690708319695e-05, "loss": 0.4614, "num_input_tokens_seen": 21735008, "step": 17915 }, { "epoch": 1.9957679028845083, "grad_norm": 0.20517219603061676, "learning_rate": 4.850525241402672e-05, "loss": 0.4612, "num_input_tokens_seen": 21741280, "step": 17920 }, { "epoch": 1.9963247577681256, "grad_norm": 0.15531718730926514, "learning_rate": 4.850359685675274e-05, "loss": 0.4843, "num_input_tokens_seen": 21747296, "step": 17925 }, { "epoch": 1.9968816126517428, "grad_norm": 0.15149499475955963, "learning_rate": 4.850194041143755e-05, "loss": 0.4632, "num_input_tokens_seen": 21753408, "step": 17930 }, { "epoch": 1.9974384675353603, "grad_norm": 0.1655612736940384, "learning_rate": 4.850028307814373e-05, "loss": 0.4723, "num_input_tokens_seen": 21759680, "step": 17935 }, { "epoch": 1.9979953224189777, "grad_norm": 0.16159556806087494, "learning_rate": 4.8498624856933904e-05, "loss": 0.4578, "num_input_tokens_seen": 21765120, "step": 17940 }, { "epoch": 1.998552177302595, "grad_norm": 0.19160351157188416, "learning_rate": 4.849696574787072e-05, "loss": 0.4589, "num_input_tokens_seen": 21771424, "step": 17945 }, { "epoch": 1.9991090321862122, "grad_norm": 0.17332874238491058, "learning_rate": 4.849530575101689e-05, "loss": 0.471, "num_input_tokens_seen": 21777184, "step": 17950 }, { "epoch": 1.9996658870698296, "grad_norm": 0.17035157978534698, "learning_rate": 4.84936448664351e-05, "loss": 0.4625, "num_input_tokens_seen": 21783424, "step": 17955 }, { "epoch": 2.000222741953447, "grad_norm": 0.17640502750873566, "learning_rate": 4.8491983094188115e-05, "loss": 0.4736, "num_input_tokens_seen": 21789168, "step": 17960 }, { "epoch": 2.000222741953447, "eval_loss": 0.4660240709781647, "eval_runtime": 113.1755, "eval_samples_per_second": 35.264, "eval_steps_per_second": 8.818, "num_input_tokens_seen": 21789168, "step": 17960 }, { "epoch": 2.0007795968370643, "grad_norm": 0.13269802927970886, "learning_rate": 4.849032043433873e-05, "loss": 0.46, "num_input_tokens_seen": 21795216, "step": 17965 }, { "epoch": 2.0013364517206815, "grad_norm": 0.1530141532421112, "learning_rate": 4.8488656886949756e-05, "loss": 0.457, "num_input_tokens_seen": 21801552, "step": 17970 }, { "epoch": 2.0018933066042988, "grad_norm": 0.22028788924217224, "learning_rate": 4.848699245208406e-05, "loss": 0.4682, "num_input_tokens_seen": 21807760, "step": 17975 }, { "epoch": 2.0024501614879164, "grad_norm": 0.17696011066436768, "learning_rate": 4.8485327129804515e-05, "loss": 0.4537, "num_input_tokens_seen": 21813968, "step": 17980 }, { "epoch": 2.0030070163715337, "grad_norm": 0.15656863152980804, "learning_rate": 4.848366092017405e-05, "loss": 0.4541, "num_input_tokens_seen": 21820080, "step": 17985 }, { "epoch": 2.003563871255151, "grad_norm": 0.16716888546943665, "learning_rate": 4.848199382325562e-05, "loss": 0.4735, "num_input_tokens_seen": 21826224, "step": 17990 }, { "epoch": 2.004120726138768, "grad_norm": 0.1456146389245987, "learning_rate": 4.848032583911221e-05, "loss": 0.4671, "num_input_tokens_seen": 21832144, "step": 17995 }, { "epoch": 2.0046775810223854, "grad_norm": 0.10439008474349976, "learning_rate": 4.847865696780685e-05, "loss": 0.4691, "num_input_tokens_seen": 21838064, "step": 18000 }, { "epoch": 2.005234435906003, "grad_norm": 0.15854083001613617, "learning_rate": 4.847698720940258e-05, "loss": 0.4693, "num_input_tokens_seen": 21844368, "step": 18005 }, { "epoch": 2.0057912907896203, "grad_norm": 0.13726434111595154, "learning_rate": 4.8475316563962495e-05, "loss": 0.4624, "num_input_tokens_seen": 21850640, "step": 18010 }, { "epoch": 2.0063481456732375, "grad_norm": 0.17909201979637146, "learning_rate": 4.8473645031549734e-05, "loss": 0.46, "num_input_tokens_seen": 21856368, "step": 18015 }, { "epoch": 2.0069050005568547, "grad_norm": 0.1485586166381836, "learning_rate": 4.847197261222742e-05, "loss": 0.4743, "num_input_tokens_seen": 21862352, "step": 18020 }, { "epoch": 2.0074618554404724, "grad_norm": 0.21068064868450165, "learning_rate": 4.8470299306058774e-05, "loss": 0.4668, "num_input_tokens_seen": 21868400, "step": 18025 }, { "epoch": 2.0080187103240896, "grad_norm": 0.16908268630504608, "learning_rate": 4.8468625113107e-05, "loss": 0.4528, "num_input_tokens_seen": 21874320, "step": 18030 }, { "epoch": 2.008575565207707, "grad_norm": 0.15266937017440796, "learning_rate": 4.8466950033435364e-05, "loss": 0.4717, "num_input_tokens_seen": 21880624, "step": 18035 }, { "epoch": 2.009132420091324, "grad_norm": 0.16260835528373718, "learning_rate": 4.8465274067107155e-05, "loss": 0.461, "num_input_tokens_seen": 21886512, "step": 18040 }, { "epoch": 2.0096892749749413, "grad_norm": 0.17023363709449768, "learning_rate": 4.846359721418569e-05, "loss": 0.4712, "num_input_tokens_seen": 21892528, "step": 18045 }, { "epoch": 2.010246129858559, "grad_norm": 0.15630516409873962, "learning_rate": 4.846191947473433e-05, "loss": 0.456, "num_input_tokens_seen": 21898640, "step": 18050 }, { "epoch": 2.0108029847421762, "grad_norm": 0.1356649547815323, "learning_rate": 4.846024084881646e-05, "loss": 0.4575, "num_input_tokens_seen": 21904720, "step": 18055 }, { "epoch": 2.0113598396257935, "grad_norm": 0.1662978082895279, "learning_rate": 4.845856133649552e-05, "loss": 0.4691, "num_input_tokens_seen": 21910736, "step": 18060 }, { "epoch": 2.0119166945094107, "grad_norm": 0.17387546598911285, "learning_rate": 4.8456880937834946e-05, "loss": 0.4533, "num_input_tokens_seen": 21916592, "step": 18065 }, { "epoch": 2.0124735493930284, "grad_norm": 0.15607905387878418, "learning_rate": 4.845519965289824e-05, "loss": 0.476, "num_input_tokens_seen": 21922576, "step": 18070 }, { "epoch": 2.0130304042766456, "grad_norm": 0.16207996010780334, "learning_rate": 4.845351748174893e-05, "loss": 0.4555, "num_input_tokens_seen": 21928752, "step": 18075 }, { "epoch": 2.013587259160263, "grad_norm": 0.14053791761398315, "learning_rate": 4.8451834424450563e-05, "loss": 0.466, "num_input_tokens_seen": 21934928, "step": 18080 }, { "epoch": 2.01414411404388, "grad_norm": 0.20698809623718262, "learning_rate": 4.845015048106674e-05, "loss": 0.4681, "num_input_tokens_seen": 21940720, "step": 18085 }, { "epoch": 2.0147009689274973, "grad_norm": 0.22505944967269897, "learning_rate": 4.8448465651661084e-05, "loss": 0.464, "num_input_tokens_seen": 21946768, "step": 18090 }, { "epoch": 2.015257823811115, "grad_norm": 0.1384706199169159, "learning_rate": 4.844677993629725e-05, "loss": 0.4475, "num_input_tokens_seen": 21953360, "step": 18095 }, { "epoch": 2.015814678694732, "grad_norm": 0.2348981499671936, "learning_rate": 4.8445093335038924e-05, "loss": 0.4604, "num_input_tokens_seen": 21959408, "step": 18100 }, { "epoch": 2.0163715335783494, "grad_norm": 0.17893058061599731, "learning_rate": 4.8443405847949855e-05, "loss": 0.4878, "num_input_tokens_seen": 21964912, "step": 18105 }, { "epoch": 2.0169283884619666, "grad_norm": 0.16829845309257507, "learning_rate": 4.8441717475093774e-05, "loss": 0.4641, "num_input_tokens_seen": 21971088, "step": 18110 }, { "epoch": 2.0174852433455843, "grad_norm": 0.1459583193063736, "learning_rate": 4.8440028216534487e-05, "loss": 0.4625, "num_input_tokens_seen": 21977072, "step": 18115 }, { "epoch": 2.0180420982292016, "grad_norm": 0.2016264945268631, "learning_rate": 4.843833807233581e-05, "loss": 0.4807, "num_input_tokens_seen": 21983024, "step": 18120 }, { "epoch": 2.018598953112819, "grad_norm": 0.14478717744350433, "learning_rate": 4.843664704256161e-05, "loss": 0.458, "num_input_tokens_seen": 21989040, "step": 18125 }, { "epoch": 2.019155807996436, "grad_norm": 0.11687728017568588, "learning_rate": 4.8434955127275784e-05, "loss": 0.4745, "num_input_tokens_seen": 21995472, "step": 18130 }, { "epoch": 2.0197126628800532, "grad_norm": 0.1751715987920761, "learning_rate": 4.843326232654225e-05, "loss": 0.4759, "num_input_tokens_seen": 22001552, "step": 18135 }, { "epoch": 2.020269517763671, "grad_norm": 0.1429760605096817, "learning_rate": 4.843156864042497e-05, "loss": 0.468, "num_input_tokens_seen": 22007536, "step": 18140 }, { "epoch": 2.020826372647288, "grad_norm": 0.1408516764640808, "learning_rate": 4.842987406898794e-05, "loss": 0.4575, "num_input_tokens_seen": 22013744, "step": 18145 }, { "epoch": 2.0213832275309054, "grad_norm": 0.17470985651016235, "learning_rate": 4.842817861229517e-05, "loss": 0.4471, "num_input_tokens_seen": 22020112, "step": 18150 }, { "epoch": 2.0219400824145226, "grad_norm": 0.14077159762382507, "learning_rate": 4.842648227041075e-05, "loss": 0.4615, "num_input_tokens_seen": 22026160, "step": 18155 }, { "epoch": 2.0224969372981403, "grad_norm": 0.15236367285251617, "learning_rate": 4.8424785043398746e-05, "loss": 0.4667, "num_input_tokens_seen": 22032496, "step": 18160 }, { "epoch": 2.0230537921817575, "grad_norm": 0.2009659707546234, "learning_rate": 4.842308693132329e-05, "loss": 0.4733, "num_input_tokens_seen": 22038576, "step": 18165 }, { "epoch": 2.0236106470653747, "grad_norm": 0.16984950006008148, "learning_rate": 4.842138793424855e-05, "loss": 0.4568, "num_input_tokens_seen": 22044400, "step": 18170 }, { "epoch": 2.024167501948992, "grad_norm": 0.17293678224086761, "learning_rate": 4.8419688052238725e-05, "loss": 0.4483, "num_input_tokens_seen": 22050832, "step": 18175 }, { "epoch": 2.024724356832609, "grad_norm": 0.16706222295761108, "learning_rate": 4.841798728535802e-05, "loss": 0.4823, "num_input_tokens_seen": 22057168, "step": 18180 }, { "epoch": 2.025281211716227, "grad_norm": 0.19765277206897736, "learning_rate": 4.841628563367072e-05, "loss": 0.4765, "num_input_tokens_seen": 22063408, "step": 18185 }, { "epoch": 2.025838066599844, "grad_norm": 0.14045941829681396, "learning_rate": 4.8414583097241106e-05, "loss": 0.4714, "num_input_tokens_seen": 22069392, "step": 18190 }, { "epoch": 2.0263949214834613, "grad_norm": 0.16459451615810394, "learning_rate": 4.841287967613351e-05, "loss": 0.4703, "num_input_tokens_seen": 22075024, "step": 18195 }, { "epoch": 2.0269517763670786, "grad_norm": 0.15879884362220764, "learning_rate": 4.8411175370412285e-05, "loss": 0.4636, "num_input_tokens_seen": 22080208, "step": 18200 }, { "epoch": 2.0275086312506962, "grad_norm": 0.1634090393781662, "learning_rate": 4.8409470180141827e-05, "loss": 0.4849, "num_input_tokens_seen": 22086384, "step": 18205 }, { "epoch": 2.0280654861343135, "grad_norm": 0.18745315074920654, "learning_rate": 4.840776410538657e-05, "loss": 0.4577, "num_input_tokens_seen": 22092304, "step": 18210 }, { "epoch": 2.0286223410179307, "grad_norm": 0.16603517532348633, "learning_rate": 4.840605714621097e-05, "loss": 0.4499, "num_input_tokens_seen": 22097456, "step": 18215 }, { "epoch": 2.029179195901548, "grad_norm": 0.1552019566297531, "learning_rate": 4.840434930267952e-05, "loss": 0.4614, "num_input_tokens_seen": 22103696, "step": 18220 }, { "epoch": 2.029736050785165, "grad_norm": 0.16760477423667908, "learning_rate": 4.840264057485675e-05, "loss": 0.4553, "num_input_tokens_seen": 22109776, "step": 18225 }, { "epoch": 2.030292905668783, "grad_norm": 0.17518381774425507, "learning_rate": 4.840093096280723e-05, "loss": 0.4517, "num_input_tokens_seen": 22115632, "step": 18230 }, { "epoch": 2.0308497605524, "grad_norm": 0.17259995639324188, "learning_rate": 4.839922046659554e-05, "loss": 0.4457, "num_input_tokens_seen": 22121616, "step": 18235 }, { "epoch": 2.0314066154360173, "grad_norm": 0.14305956661701202, "learning_rate": 4.8397509086286315e-05, "loss": 0.4806, "num_input_tokens_seen": 22127696, "step": 18240 }, { "epoch": 2.0319634703196345, "grad_norm": 0.18354591727256775, "learning_rate": 4.8395796821944215e-05, "loss": 0.4632, "num_input_tokens_seen": 22133648, "step": 18245 }, { "epoch": 2.032520325203252, "grad_norm": 0.16697129607200623, "learning_rate": 4.839408367363394e-05, "loss": 0.454, "num_input_tokens_seen": 22139600, "step": 18250 }, { "epoch": 2.0330771800868694, "grad_norm": 0.1516360342502594, "learning_rate": 4.839236964142021e-05, "loss": 0.4641, "num_input_tokens_seen": 22145744, "step": 18255 }, { "epoch": 2.0336340349704867, "grad_norm": 0.1523490995168686, "learning_rate": 4.839065472536779e-05, "loss": 0.4727, "num_input_tokens_seen": 22151856, "step": 18260 }, { "epoch": 2.034190889854104, "grad_norm": 0.13276757299900055, "learning_rate": 4.8388938925541474e-05, "loss": 0.4504, "num_input_tokens_seen": 22157936, "step": 18265 }, { "epoch": 2.034747744737721, "grad_norm": 0.19311363995075226, "learning_rate": 4.838722224200609e-05, "loss": 0.4457, "num_input_tokens_seen": 22164080, "step": 18270 }, { "epoch": 2.035304599621339, "grad_norm": 0.16071514785289764, "learning_rate": 4.838550467482651e-05, "loss": 0.4672, "num_input_tokens_seen": 22169968, "step": 18275 }, { "epoch": 2.035861454504956, "grad_norm": 0.14302651584148407, "learning_rate": 4.8383786224067615e-05, "loss": 0.4593, "num_input_tokens_seen": 22176176, "step": 18280 }, { "epoch": 2.0364183093885733, "grad_norm": 0.18544822931289673, "learning_rate": 4.838206688979434e-05, "loss": 0.4753, "num_input_tokens_seen": 22182288, "step": 18285 }, { "epoch": 2.0369751642721905, "grad_norm": 0.2517112195491791, "learning_rate": 4.8380346672071645e-05, "loss": 0.4564, "num_input_tokens_seen": 22188304, "step": 18290 }, { "epoch": 2.037532019155808, "grad_norm": 0.14010433852672577, "learning_rate": 4.837862557096453e-05, "loss": 0.4507, "num_input_tokens_seen": 22194128, "step": 18295 }, { "epoch": 2.0380888740394254, "grad_norm": 0.1738840937614441, "learning_rate": 4.837690358653802e-05, "loss": 0.4758, "num_input_tokens_seen": 22200240, "step": 18300 }, { "epoch": 2.0386457289230426, "grad_norm": 0.21096257865428925, "learning_rate": 4.837518071885718e-05, "loss": 0.4676, "num_input_tokens_seen": 22206672, "step": 18305 }, { "epoch": 2.03920258380666, "grad_norm": 0.14227080345153809, "learning_rate": 4.83734569679871e-05, "loss": 0.4716, "num_input_tokens_seen": 22212944, "step": 18310 }, { "epoch": 2.039759438690277, "grad_norm": 0.1725252866744995, "learning_rate": 4.837173233399291e-05, "loss": 0.4609, "num_input_tokens_seen": 22219216, "step": 18315 }, { "epoch": 2.0403162935738948, "grad_norm": 0.17170652747154236, "learning_rate": 4.837000681693977e-05, "loss": 0.443, "num_input_tokens_seen": 22225488, "step": 18320 }, { "epoch": 2.040873148457512, "grad_norm": 0.16822324693202972, "learning_rate": 4.8368280416892894e-05, "loss": 0.4451, "num_input_tokens_seen": 22231728, "step": 18325 }, { "epoch": 2.0414300033411292, "grad_norm": 0.20213884115219116, "learning_rate": 4.8366553133917485e-05, "loss": 0.4419, "num_input_tokens_seen": 22237648, "step": 18330 }, { "epoch": 2.0419868582247465, "grad_norm": 0.17507652938365936, "learning_rate": 4.836482496807882e-05, "loss": 0.462, "num_input_tokens_seen": 22243568, "step": 18335 }, { "epoch": 2.042543713108364, "grad_norm": 0.13923506438732147, "learning_rate": 4.836309591944219e-05, "loss": 0.4633, "num_input_tokens_seen": 22249552, "step": 18340 }, { "epoch": 2.0431005679919814, "grad_norm": 0.18969200551509857, "learning_rate": 4.8361365988072923e-05, "loss": 0.4755, "num_input_tokens_seen": 22255376, "step": 18345 }, { "epoch": 2.0436574228755986, "grad_norm": 0.14592225849628448, "learning_rate": 4.835963517403639e-05, "loss": 0.449, "num_input_tokens_seen": 22261424, "step": 18350 }, { "epoch": 2.044214277759216, "grad_norm": 0.1522553414106369, "learning_rate": 4.835790347739798e-05, "loss": 0.4502, "num_input_tokens_seen": 22267312, "step": 18355 }, { "epoch": 2.0447711326428335, "grad_norm": 0.1726326197385788, "learning_rate": 4.835617089822312e-05, "loss": 0.4765, "num_input_tokens_seen": 22273392, "step": 18360 }, { "epoch": 2.0453279875264507, "grad_norm": 0.2203361541032791, "learning_rate": 4.835443743657727e-05, "loss": 0.4744, "num_input_tokens_seen": 22279472, "step": 18365 }, { "epoch": 2.045884842410068, "grad_norm": 0.16293282806873322, "learning_rate": 4.835270309252594e-05, "loss": 0.4717, "num_input_tokens_seen": 22285808, "step": 18370 }, { "epoch": 2.046441697293685, "grad_norm": 0.12819869816303253, "learning_rate": 4.8350967866134646e-05, "loss": 0.4715, "num_input_tokens_seen": 22291792, "step": 18375 }, { "epoch": 2.0469985521773024, "grad_norm": 0.13387323915958405, "learning_rate": 4.834923175746896e-05, "loss": 0.4718, "num_input_tokens_seen": 22298000, "step": 18380 }, { "epoch": 2.04755540706092, "grad_norm": 0.2792089879512787, "learning_rate": 4.834749476659446e-05, "loss": 0.4589, "num_input_tokens_seen": 22304272, "step": 18385 }, { "epoch": 2.0481122619445373, "grad_norm": 0.15266941487789154, "learning_rate": 4.834575689357679e-05, "loss": 0.4614, "num_input_tokens_seen": 22310448, "step": 18390 }, { "epoch": 2.0486691168281546, "grad_norm": 0.12522563338279724, "learning_rate": 4.834401813848162e-05, "loss": 0.4678, "num_input_tokens_seen": 22316656, "step": 18395 }, { "epoch": 2.049225971711772, "grad_norm": 0.16008585691452026, "learning_rate": 4.834227850137463e-05, "loss": 0.4444, "num_input_tokens_seen": 22322608, "step": 18400 }, { "epoch": 2.049782826595389, "grad_norm": 0.1435682475566864, "learning_rate": 4.834053798232154e-05, "loss": 0.474, "num_input_tokens_seen": 22328848, "step": 18405 }, { "epoch": 2.0503396814790067, "grad_norm": 0.11552378535270691, "learning_rate": 4.833879658138815e-05, "loss": 0.4535, "num_input_tokens_seen": 22334864, "step": 18410 }, { "epoch": 2.050896536362624, "grad_norm": 0.1331489235162735, "learning_rate": 4.833705429864022e-05, "loss": 0.442, "num_input_tokens_seen": 22341200, "step": 18415 }, { "epoch": 2.051453391246241, "grad_norm": 0.15725088119506836, "learning_rate": 4.8335311134143586e-05, "loss": 0.4658, "num_input_tokens_seen": 22347664, "step": 18420 }, { "epoch": 2.0520102461298584, "grad_norm": 0.18309760093688965, "learning_rate": 4.833356708796412e-05, "loss": 0.4477, "num_input_tokens_seen": 22353488, "step": 18425 }, { "epoch": 2.052567101013476, "grad_norm": 0.18344613909721375, "learning_rate": 4.8331822160167714e-05, "loss": 0.463, "num_input_tokens_seen": 22359600, "step": 18430 }, { "epoch": 2.0531239558970933, "grad_norm": 0.17498478293418884, "learning_rate": 4.83300763508203e-05, "loss": 0.4716, "num_input_tokens_seen": 22365584, "step": 18435 }, { "epoch": 2.0536808107807105, "grad_norm": 0.13022516667842865, "learning_rate": 4.8328329659987834e-05, "loss": 0.4492, "num_input_tokens_seen": 22371504, "step": 18440 }, { "epoch": 2.0542376656643277, "grad_norm": 0.19461891055107117, "learning_rate": 4.832658208773631e-05, "loss": 0.4555, "num_input_tokens_seen": 22377712, "step": 18445 }, { "epoch": 2.0547945205479454, "grad_norm": 0.15512226521968842, "learning_rate": 4.8324833634131764e-05, "loss": 0.4614, "num_input_tokens_seen": 22384016, "step": 18450 }, { "epoch": 2.0553513754315627, "grad_norm": 0.1766042709350586, "learning_rate": 4.832308429924026e-05, "loss": 0.4646, "num_input_tokens_seen": 22389968, "step": 18455 }, { "epoch": 2.05590823031518, "grad_norm": 0.13777604699134827, "learning_rate": 4.832133408312789e-05, "loss": 0.4656, "num_input_tokens_seen": 22396368, "step": 18460 }, { "epoch": 2.056465085198797, "grad_norm": 0.22241108119487762, "learning_rate": 4.831958298586077e-05, "loss": 0.4485, "num_input_tokens_seen": 22402224, "step": 18465 }, { "epoch": 2.0570219400824143, "grad_norm": 0.1412360817193985, "learning_rate": 4.831783100750508e-05, "loss": 0.4553, "num_input_tokens_seen": 22408496, "step": 18470 }, { "epoch": 2.057578794966032, "grad_norm": 0.15292534232139587, "learning_rate": 4.831607814812701e-05, "loss": 0.4611, "num_input_tokens_seen": 22414608, "step": 18475 }, { "epoch": 2.0581356498496493, "grad_norm": 0.17774660885334015, "learning_rate": 4.831432440779279e-05, "loss": 0.475, "num_input_tokens_seen": 22420944, "step": 18480 }, { "epoch": 2.0586925047332665, "grad_norm": 0.1814614087343216, "learning_rate": 4.831256978656867e-05, "loss": 0.4554, "num_input_tokens_seen": 22427152, "step": 18485 }, { "epoch": 2.0592493596168837, "grad_norm": 0.17898905277252197, "learning_rate": 4.831081428452096e-05, "loss": 0.4613, "num_input_tokens_seen": 22433360, "step": 18490 }, { "epoch": 2.0598062145005014, "grad_norm": 0.2172711342573166, "learning_rate": 4.830905790171598e-05, "loss": 0.4506, "num_input_tokens_seen": 22439408, "step": 18495 }, { "epoch": 2.0603630693841186, "grad_norm": 0.1553412526845932, "learning_rate": 4.8307300638220106e-05, "loss": 0.4669, "num_input_tokens_seen": 22445424, "step": 18500 }, { "epoch": 2.060919924267736, "grad_norm": 0.17412878572940826, "learning_rate": 4.830554249409971e-05, "loss": 0.4865, "num_input_tokens_seen": 22450960, "step": 18505 }, { "epoch": 2.061476779151353, "grad_norm": 0.17749740183353424, "learning_rate": 4.830378346942124e-05, "loss": 0.481, "num_input_tokens_seen": 22456912, "step": 18510 }, { "epoch": 2.0620336340349703, "grad_norm": 0.1745297759771347, "learning_rate": 4.830202356425114e-05, "loss": 0.4582, "num_input_tokens_seen": 22463376, "step": 18515 }, { "epoch": 2.062590488918588, "grad_norm": 0.14378196001052856, "learning_rate": 4.8300262778655926e-05, "loss": 0.4656, "num_input_tokens_seen": 22469296, "step": 18520 }, { "epoch": 2.063147343802205, "grad_norm": 0.24331925809383392, "learning_rate": 4.829850111270211e-05, "loss": 0.4608, "num_input_tokens_seen": 22474640, "step": 18525 }, { "epoch": 2.0637041986858224, "grad_norm": 0.18518973886966705, "learning_rate": 4.829673856645626e-05, "loss": 0.4788, "num_input_tokens_seen": 22480624, "step": 18530 }, { "epoch": 2.0642610535694397, "grad_norm": 0.14983251690864563, "learning_rate": 4.8294975139984964e-05, "loss": 0.4755, "num_input_tokens_seen": 22486544, "step": 18535 }, { "epoch": 2.0648179084530573, "grad_norm": 0.14225231111049652, "learning_rate": 4.8293210833354864e-05, "loss": 0.464, "num_input_tokens_seen": 22492688, "step": 18540 }, { "epoch": 2.0653747633366746, "grad_norm": 0.16915707290172577, "learning_rate": 4.82914456466326e-05, "loss": 0.4808, "num_input_tokens_seen": 22498896, "step": 18545 }, { "epoch": 2.065931618220292, "grad_norm": 0.15154334902763367, "learning_rate": 4.828967957988489e-05, "loss": 0.4569, "num_input_tokens_seen": 22505040, "step": 18550 }, { "epoch": 2.066488473103909, "grad_norm": 0.1954539716243744, "learning_rate": 4.828791263317845e-05, "loss": 0.4561, "num_input_tokens_seen": 22511376, "step": 18555 }, { "epoch": 2.0670453279875263, "grad_norm": 0.17566746473312378, "learning_rate": 4.828614480658003e-05, "loss": 0.4522, "num_input_tokens_seen": 22517648, "step": 18560 }, { "epoch": 2.067602182871144, "grad_norm": 0.15065442025661469, "learning_rate": 4.828437610015644e-05, "loss": 0.4705, "num_input_tokens_seen": 22523696, "step": 18565 }, { "epoch": 2.068159037754761, "grad_norm": 0.14500099420547485, "learning_rate": 4.8282606513974504e-05, "loss": 0.462, "num_input_tokens_seen": 22529936, "step": 18570 }, { "epoch": 2.0687158926383784, "grad_norm": 0.1979886144399643, "learning_rate": 4.8280836048101075e-05, "loss": 0.4612, "num_input_tokens_seen": 22536144, "step": 18575 }, { "epoch": 2.0692727475219956, "grad_norm": 0.14592944085597992, "learning_rate": 4.827906470260306e-05, "loss": 0.4671, "num_input_tokens_seen": 22542288, "step": 18580 }, { "epoch": 2.0698296024056133, "grad_norm": 0.14159852266311646, "learning_rate": 4.8277292477547375e-05, "loss": 0.4745, "num_input_tokens_seen": 22548368, "step": 18585 }, { "epoch": 2.0703864572892305, "grad_norm": 0.12871462106704712, "learning_rate": 4.827551937300099e-05, "loss": 0.4634, "num_input_tokens_seen": 22554736, "step": 18590 }, { "epoch": 2.0709433121728478, "grad_norm": 0.17768710851669312, "learning_rate": 4.827374538903089e-05, "loss": 0.4655, "num_input_tokens_seen": 22560592, "step": 18595 }, { "epoch": 2.071500167056465, "grad_norm": 0.14740990102291107, "learning_rate": 4.82719705257041e-05, "loss": 0.4613, "num_input_tokens_seen": 22566928, "step": 18600 }, { "epoch": 2.0720570219400822, "grad_norm": 0.14726808667182922, "learning_rate": 4.8270194783087693e-05, "loss": 0.4641, "num_input_tokens_seen": 22572912, "step": 18605 }, { "epoch": 2.0726138768237, "grad_norm": 0.13961394131183624, "learning_rate": 4.826841816124875e-05, "loss": 0.4509, "num_input_tokens_seen": 22579248, "step": 18610 }, { "epoch": 2.073170731707317, "grad_norm": 0.16929645836353302, "learning_rate": 4.82666406602544e-05, "loss": 0.4689, "num_input_tokens_seen": 22585488, "step": 18615 }, { "epoch": 2.0737275865909344, "grad_norm": 0.16625946760177612, "learning_rate": 4.826486228017181e-05, "loss": 0.4576, "num_input_tokens_seen": 22591600, "step": 18620 }, { "epoch": 2.0742844414745516, "grad_norm": 0.1325138807296753, "learning_rate": 4.8263083021068165e-05, "loss": 0.4592, "num_input_tokens_seen": 22597712, "step": 18625 }, { "epoch": 2.0748412963581693, "grad_norm": 0.12143531441688538, "learning_rate": 4.826130288301069e-05, "loss": 0.4652, "num_input_tokens_seen": 22603824, "step": 18630 }, { "epoch": 2.0753981512417865, "grad_norm": 0.16597707569599152, "learning_rate": 4.825952186606665e-05, "loss": 0.4631, "num_input_tokens_seen": 22610000, "step": 18635 }, { "epoch": 2.0759550061254037, "grad_norm": 0.189137265086174, "learning_rate": 4.825773997030333e-05, "loss": 0.4716, "num_input_tokens_seen": 22616176, "step": 18640 }, { "epoch": 2.076511861009021, "grad_norm": 0.23846392333507538, "learning_rate": 4.825595719578807e-05, "loss": 0.4688, "num_input_tokens_seen": 22622224, "step": 18645 }, { "epoch": 2.077068715892638, "grad_norm": 0.23460476100444794, "learning_rate": 4.825417354258821e-05, "loss": 0.4623, "num_input_tokens_seen": 22628624, "step": 18650 }, { "epoch": 2.077625570776256, "grad_norm": 0.17244240641593933, "learning_rate": 4.825238901077116e-05, "loss": 0.4684, "num_input_tokens_seen": 22634704, "step": 18655 }, { "epoch": 2.078182425659873, "grad_norm": 0.1298733353614807, "learning_rate": 4.825060360040433e-05, "loss": 0.4683, "num_input_tokens_seen": 22640752, "step": 18660 }, { "epoch": 2.0787392805434903, "grad_norm": 0.17631517350673676, "learning_rate": 4.8248817311555196e-05, "loss": 0.4808, "num_input_tokens_seen": 22647024, "step": 18665 }, { "epoch": 2.0792961354271076, "grad_norm": 0.1440383940935135, "learning_rate": 4.8247030144291225e-05, "loss": 0.4527, "num_input_tokens_seen": 22653136, "step": 18670 }, { "epoch": 2.0798529903107252, "grad_norm": 0.1326429694890976, "learning_rate": 4.8245242098679974e-05, "loss": 0.4731, "num_input_tokens_seen": 22659152, "step": 18675 }, { "epoch": 2.0804098451943425, "grad_norm": 0.17245402932167053, "learning_rate": 4.8243453174788973e-05, "loss": 0.4487, "num_input_tokens_seen": 22665264, "step": 18680 }, { "epoch": 2.0809667000779597, "grad_norm": 0.1380549967288971, "learning_rate": 4.824166337268582e-05, "loss": 0.443, "num_input_tokens_seen": 22671184, "step": 18685 }, { "epoch": 2.081523554961577, "grad_norm": 0.14187565445899963, "learning_rate": 4.823987269243815e-05, "loss": 0.4504, "num_input_tokens_seen": 22677360, "step": 18690 }, { "epoch": 2.082080409845194, "grad_norm": 0.17550352215766907, "learning_rate": 4.823808113411361e-05, "loss": 0.4877, "num_input_tokens_seen": 22683248, "step": 18695 }, { "epoch": 2.082637264728812, "grad_norm": 0.16842736303806305, "learning_rate": 4.82362886977799e-05, "loss": 0.4751, "num_input_tokens_seen": 22689776, "step": 18700 }, { "epoch": 2.083194119612429, "grad_norm": 0.1939944475889206, "learning_rate": 4.823449538350474e-05, "loss": 0.476, "num_input_tokens_seen": 22695792, "step": 18705 }, { "epoch": 2.0837509744960463, "grad_norm": 0.13476628065109253, "learning_rate": 4.8232701191355875e-05, "loss": 0.461, "num_input_tokens_seen": 22701712, "step": 18710 }, { "epoch": 2.0843078293796635, "grad_norm": 0.14195406436920166, "learning_rate": 4.823090612140111e-05, "loss": 0.4523, "num_input_tokens_seen": 22707856, "step": 18715 }, { "epoch": 2.084864684263281, "grad_norm": 0.23575948178768158, "learning_rate": 4.822911017370827e-05, "loss": 0.4644, "num_input_tokens_seen": 22714032, "step": 18720 }, { "epoch": 2.0854215391468984, "grad_norm": 0.1654766947031021, "learning_rate": 4.822731334834521e-05, "loss": 0.4572, "num_input_tokens_seen": 22720080, "step": 18725 }, { "epoch": 2.0859783940305157, "grad_norm": 0.1528697907924652, "learning_rate": 4.82255156453798e-05, "loss": 0.4688, "num_input_tokens_seen": 22726512, "step": 18730 }, { "epoch": 2.086535248914133, "grad_norm": 0.15113618969917297, "learning_rate": 4.822371706487999e-05, "loss": 0.4763, "num_input_tokens_seen": 22732400, "step": 18735 }, { "epoch": 2.08709210379775, "grad_norm": 0.17853324115276337, "learning_rate": 4.822191760691372e-05, "loss": 0.4635, "num_input_tokens_seen": 22738736, "step": 18740 }, { "epoch": 2.087648958681368, "grad_norm": 0.16416028141975403, "learning_rate": 4.8220117271548994e-05, "loss": 0.472, "num_input_tokens_seen": 22744880, "step": 18745 }, { "epoch": 2.088205813564985, "grad_norm": 0.16302768886089325, "learning_rate": 4.821831605885382e-05, "loss": 0.471, "num_input_tokens_seen": 22751312, "step": 18750 }, { "epoch": 2.0887626684486023, "grad_norm": 0.22179993987083435, "learning_rate": 4.821651396889626e-05, "loss": 0.4718, "num_input_tokens_seen": 22757392, "step": 18755 }, { "epoch": 2.0893195233322195, "grad_norm": 0.13523121178150177, "learning_rate": 4.8214711001744394e-05, "loss": 0.4712, "num_input_tokens_seen": 22763568, "step": 18760 }, { "epoch": 2.089876378215837, "grad_norm": 0.17064431309700012, "learning_rate": 4.821290715746636e-05, "loss": 0.4649, "num_input_tokens_seen": 22769648, "step": 18765 }, { "epoch": 2.0904332330994544, "grad_norm": 0.15440015494823456, "learning_rate": 4.8211102436130294e-05, "loss": 0.4716, "num_input_tokens_seen": 22775888, "step": 18770 }, { "epoch": 2.0909900879830716, "grad_norm": 0.16572308540344238, "learning_rate": 4.820929683780441e-05, "loss": 0.4625, "num_input_tokens_seen": 22781872, "step": 18775 }, { "epoch": 2.091546942866689, "grad_norm": 0.19768087565898895, "learning_rate": 4.8207490362556896e-05, "loss": 0.4587, "num_input_tokens_seen": 22787952, "step": 18780 }, { "epoch": 2.092103797750306, "grad_norm": 0.164895161986351, "learning_rate": 4.820568301045604e-05, "loss": 0.4693, "num_input_tokens_seen": 22793872, "step": 18785 }, { "epoch": 2.0926606526339238, "grad_norm": 0.2745194733142853, "learning_rate": 4.8203874781570104e-05, "loss": 0.4652, "num_input_tokens_seen": 22799856, "step": 18790 }, { "epoch": 2.093217507517541, "grad_norm": 0.13988041877746582, "learning_rate": 4.8202065675967415e-05, "loss": 0.4627, "num_input_tokens_seen": 22805936, "step": 18795 }, { "epoch": 2.093774362401158, "grad_norm": 0.16243012249469757, "learning_rate": 4.8200255693716335e-05, "loss": 0.4624, "num_input_tokens_seen": 22812016, "step": 18800 }, { "epoch": 2.0943312172847754, "grad_norm": 0.16175347566604614, "learning_rate": 4.819844483488525e-05, "loss": 0.4623, "num_input_tokens_seen": 22818032, "step": 18805 }, { "epoch": 2.094888072168393, "grad_norm": 0.21346968412399292, "learning_rate": 4.819663309954257e-05, "loss": 0.4608, "num_input_tokens_seen": 22824208, "step": 18810 }, { "epoch": 2.0954449270520104, "grad_norm": 0.1802622675895691, "learning_rate": 4.8194820487756756e-05, "loss": 0.4565, "num_input_tokens_seen": 22830800, "step": 18815 }, { "epoch": 2.0960017819356276, "grad_norm": 0.15007176995277405, "learning_rate": 4.8193006999596294e-05, "loss": 0.4723, "num_input_tokens_seen": 22836848, "step": 18820 }, { "epoch": 2.096558636819245, "grad_norm": 0.16684836149215698, "learning_rate": 4.8191192635129704e-05, "loss": 0.4613, "num_input_tokens_seen": 22842928, "step": 18825 }, { "epoch": 2.097115491702862, "grad_norm": 0.18416085839271545, "learning_rate": 4.818937739442553e-05, "loss": 0.4644, "num_input_tokens_seen": 22849040, "step": 18830 }, { "epoch": 2.0976723465864797, "grad_norm": 0.16388067603111267, "learning_rate": 4.8187561277552374e-05, "loss": 0.4768, "num_input_tokens_seen": 22855088, "step": 18835 }, { "epoch": 2.098229201470097, "grad_norm": 0.16220629215240479, "learning_rate": 4.8185744284578834e-05, "loss": 0.4643, "num_input_tokens_seen": 22861136, "step": 18840 }, { "epoch": 2.098786056353714, "grad_norm": 0.14015324413776398, "learning_rate": 4.818392641557358e-05, "loss": 0.4448, "num_input_tokens_seen": 22867248, "step": 18845 }, { "epoch": 2.0993429112373314, "grad_norm": 0.17898470163345337, "learning_rate": 4.818210767060528e-05, "loss": 0.4493, "num_input_tokens_seen": 22873360, "step": 18850 }, { "epoch": 2.099899766120949, "grad_norm": 0.18632462620735168, "learning_rate": 4.818028804974267e-05, "loss": 0.4587, "num_input_tokens_seen": 22879024, "step": 18855 }, { "epoch": 2.1004566210045663, "grad_norm": 0.1536436527967453, "learning_rate": 4.817846755305449e-05, "loss": 0.4638, "num_input_tokens_seen": 22885328, "step": 18860 }, { "epoch": 2.1010134758881835, "grad_norm": 0.15224647521972656, "learning_rate": 4.817664618060953e-05, "loss": 0.4656, "num_input_tokens_seen": 22891760, "step": 18865 }, { "epoch": 2.1015703307718008, "grad_norm": 0.21744589507579803, "learning_rate": 4.81748239324766e-05, "loss": 0.4891, "num_input_tokens_seen": 22897872, "step": 18870 }, { "epoch": 2.102127185655418, "grad_norm": 0.18990778923034668, "learning_rate": 4.8173000808724555e-05, "loss": 0.4716, "num_input_tokens_seen": 22903728, "step": 18875 }, { "epoch": 2.1026840405390357, "grad_norm": 0.17542654275894165, "learning_rate": 4.8171176809422284e-05, "loss": 0.4727, "num_input_tokens_seen": 22909904, "step": 18880 }, { "epoch": 2.103240895422653, "grad_norm": 0.17510181665420532, "learning_rate": 4.8169351934638684e-05, "loss": 0.4659, "num_input_tokens_seen": 22916016, "step": 18885 }, { "epoch": 2.10379775030627, "grad_norm": 0.13822700083255768, "learning_rate": 4.816752618444273e-05, "loss": 0.4388, "num_input_tokens_seen": 22922256, "step": 18890 }, { "epoch": 2.1043546051898874, "grad_norm": 0.17003485560417175, "learning_rate": 4.816569955890339e-05, "loss": 0.4503, "num_input_tokens_seen": 22928336, "step": 18895 }, { "epoch": 2.104911460073505, "grad_norm": 0.16720731556415558, "learning_rate": 4.8163872058089674e-05, "loss": 0.4462, "num_input_tokens_seen": 22934384, "step": 18900 }, { "epoch": 2.1054683149571223, "grad_norm": 0.14371831715106964, "learning_rate": 4.816204368207066e-05, "loss": 0.4579, "num_input_tokens_seen": 22940720, "step": 18905 }, { "epoch": 2.1060251698407395, "grad_norm": 0.1638164222240448, "learning_rate": 4.8160214430915386e-05, "loss": 0.4604, "num_input_tokens_seen": 22946448, "step": 18910 }, { "epoch": 2.1065820247243567, "grad_norm": 0.13685919344425201, "learning_rate": 4.8158384304693004e-05, "loss": 0.4601, "num_input_tokens_seen": 22952528, "step": 18915 }, { "epoch": 2.107138879607974, "grad_norm": 0.141318678855896, "learning_rate": 4.815655330347264e-05, "loss": 0.4471, "num_input_tokens_seen": 22958832, "step": 18920 }, { "epoch": 2.1076957344915916, "grad_norm": 0.1323620080947876, "learning_rate": 4.815472142732349e-05, "loss": 0.4641, "num_input_tokens_seen": 22964976, "step": 18925 }, { "epoch": 2.108252589375209, "grad_norm": 0.1340130716562271, "learning_rate": 4.815288867631476e-05, "loss": 0.4714, "num_input_tokens_seen": 22971184, "step": 18930 }, { "epoch": 2.108809444258826, "grad_norm": 0.13987015187740326, "learning_rate": 4.81510550505157e-05, "loss": 0.4725, "num_input_tokens_seen": 22977648, "step": 18935 }, { "epoch": 2.1093662991424433, "grad_norm": 0.13443094491958618, "learning_rate": 4.814922054999559e-05, "loss": 0.4511, "num_input_tokens_seen": 22983760, "step": 18940 }, { "epoch": 2.109923154026061, "grad_norm": 0.13093937933444977, "learning_rate": 4.8147385174823736e-05, "loss": 0.4712, "num_input_tokens_seen": 22989808, "step": 18945 }, { "epoch": 2.1104800089096782, "grad_norm": 0.2625039219856262, "learning_rate": 4.8145548925069496e-05, "loss": 0.4588, "num_input_tokens_seen": 22995984, "step": 18950 }, { "epoch": 2.1110368637932955, "grad_norm": 0.15132693946361542, "learning_rate": 4.8143711800802236e-05, "loss": 0.4692, "num_input_tokens_seen": 23002192, "step": 18955 }, { "epoch": 2.1115937186769127, "grad_norm": 0.174399271607399, "learning_rate": 4.8141873802091385e-05, "loss": 0.4708, "num_input_tokens_seen": 23008240, "step": 18960 }, { "epoch": 2.11215057356053, "grad_norm": 0.1942911446094513, "learning_rate": 4.8140034929006386e-05, "loss": 0.4566, "num_input_tokens_seen": 23013968, "step": 18965 }, { "epoch": 2.1127074284441476, "grad_norm": 0.24292238056659698, "learning_rate": 4.81381951816167e-05, "loss": 0.4682, "num_input_tokens_seen": 23019376, "step": 18970 }, { "epoch": 2.113264283327765, "grad_norm": 0.13211633265018463, "learning_rate": 4.8136354559991855e-05, "loss": 0.4712, "num_input_tokens_seen": 23025584, "step": 18975 }, { "epoch": 2.113821138211382, "grad_norm": 0.23363140225410461, "learning_rate": 4.8134513064201384e-05, "loss": 0.4648, "num_input_tokens_seen": 23031728, "step": 18980 }, { "epoch": 2.1143779930949993, "grad_norm": 0.22904761135578156, "learning_rate": 4.813267069431488e-05, "loss": 0.4665, "num_input_tokens_seen": 23037328, "step": 18985 }, { "epoch": 2.114934847978617, "grad_norm": 0.14337100088596344, "learning_rate": 4.813082745040194e-05, "loss": 0.4505, "num_input_tokens_seen": 23043248, "step": 18990 }, { "epoch": 2.115491702862234, "grad_norm": 0.1775093972682953, "learning_rate": 4.812898333253221e-05, "loss": 0.4723, "num_input_tokens_seen": 23049392, "step": 18995 }, { "epoch": 2.1160485577458514, "grad_norm": 0.12564118206501007, "learning_rate": 4.812713834077538e-05, "loss": 0.4658, "num_input_tokens_seen": 23055472, "step": 19000 }, { "epoch": 2.1166054126294687, "grad_norm": 0.19235871732234955, "learning_rate": 4.812529247520114e-05, "loss": 0.4713, "num_input_tokens_seen": 23061552, "step": 19005 }, { "epoch": 2.117162267513086, "grad_norm": 0.16600117087364197, "learning_rate": 4.812344573587924e-05, "loss": 0.471, "num_input_tokens_seen": 23067856, "step": 19010 }, { "epoch": 2.1177191223967036, "grad_norm": 0.17978304624557495, "learning_rate": 4.812159812287946e-05, "loss": 0.4636, "num_input_tokens_seen": 23073936, "step": 19015 }, { "epoch": 2.118275977280321, "grad_norm": 0.18502558767795563, "learning_rate": 4.811974963627161e-05, "loss": 0.4903, "num_input_tokens_seen": 23079952, "step": 19020 }, { "epoch": 2.118832832163938, "grad_norm": 0.13565850257873535, "learning_rate": 4.8117900276125526e-05, "loss": 0.4728, "num_input_tokens_seen": 23086064, "step": 19025 }, { "epoch": 2.1193896870475553, "grad_norm": 0.13808132708072662, "learning_rate": 4.811605004251109e-05, "loss": 0.4762, "num_input_tokens_seen": 23091696, "step": 19030 }, { "epoch": 2.119946541931173, "grad_norm": 0.11224433779716492, "learning_rate": 4.811419893549819e-05, "loss": 0.4733, "num_input_tokens_seen": 23097712, "step": 19035 }, { "epoch": 2.12050339681479, "grad_norm": 0.1834106594324112, "learning_rate": 4.811234695515679e-05, "loss": 0.4569, "num_input_tokens_seen": 23104016, "step": 19040 }, { "epoch": 2.1210602516984074, "grad_norm": 0.1666046530008316, "learning_rate": 4.8110494101556855e-05, "loss": 0.4654, "num_input_tokens_seen": 23110000, "step": 19045 }, { "epoch": 2.1216171065820246, "grad_norm": 0.17809957265853882, "learning_rate": 4.810864037476839e-05, "loss": 0.4658, "num_input_tokens_seen": 23116240, "step": 19050 }, { "epoch": 2.122173961465642, "grad_norm": 0.12173893302679062, "learning_rate": 4.810678577486144e-05, "loss": 0.4693, "num_input_tokens_seen": 23122672, "step": 19055 }, { "epoch": 2.1227308163492595, "grad_norm": 0.17507514357566833, "learning_rate": 4.810493030190607e-05, "loss": 0.4639, "num_input_tokens_seen": 23128720, "step": 19060 }, { "epoch": 2.1232876712328768, "grad_norm": 0.20327819883823395, "learning_rate": 4.8103073955972385e-05, "loss": 0.4513, "num_input_tokens_seen": 23134864, "step": 19065 }, { "epoch": 2.123844526116494, "grad_norm": 0.18601039052009583, "learning_rate": 4.810121673713054e-05, "loss": 0.4451, "num_input_tokens_seen": 23141200, "step": 19070 }, { "epoch": 2.124401381000111, "grad_norm": 0.1229482889175415, "learning_rate": 4.809935864545069e-05, "loss": 0.4588, "num_input_tokens_seen": 23147344, "step": 19075 }, { "epoch": 2.124958235883729, "grad_norm": 0.15903837978839874, "learning_rate": 4.809749968100305e-05, "loss": 0.4696, "num_input_tokens_seen": 23153488, "step": 19080 }, { "epoch": 2.125515090767346, "grad_norm": 0.16956837475299835, "learning_rate": 4.8095639843857835e-05, "loss": 0.4716, "num_input_tokens_seen": 23159664, "step": 19085 }, { "epoch": 2.1260719456509634, "grad_norm": 0.16189152002334595, "learning_rate": 4.809377913408535e-05, "loss": 0.4653, "num_input_tokens_seen": 23166000, "step": 19090 }, { "epoch": 2.1266288005345806, "grad_norm": 0.14834117889404297, "learning_rate": 4.809191755175587e-05, "loss": 0.4617, "num_input_tokens_seen": 23172112, "step": 19095 }, { "epoch": 2.127185655418198, "grad_norm": 0.11354877054691315, "learning_rate": 4.8090055096939755e-05, "loss": 0.446, "num_input_tokens_seen": 23178160, "step": 19100 }, { "epoch": 2.1277425103018155, "grad_norm": 0.1931322067975998, "learning_rate": 4.808819176970736e-05, "loss": 0.4642, "num_input_tokens_seen": 23183920, "step": 19105 }, { "epoch": 2.1282993651854327, "grad_norm": 0.1768934279680252, "learning_rate": 4.808632757012909e-05, "loss": 0.4715, "num_input_tokens_seen": 23189968, "step": 19110 }, { "epoch": 2.12885622006905, "grad_norm": 0.1881576031446457, "learning_rate": 4.808446249827537e-05, "loss": 0.4556, "num_input_tokens_seen": 23195920, "step": 19115 }, { "epoch": 2.129413074952667, "grad_norm": 0.17778873443603516, "learning_rate": 4.808259655421669e-05, "loss": 0.4732, "num_input_tokens_seen": 23201808, "step": 19120 }, { "epoch": 2.129969929836285, "grad_norm": 0.17831271886825562, "learning_rate": 4.808072973802354e-05, "loss": 0.4526, "num_input_tokens_seen": 23208080, "step": 19125 }, { "epoch": 2.130526784719902, "grad_norm": 0.16217836737632751, "learning_rate": 4.8078862049766446e-05, "loss": 0.4699, "num_input_tokens_seen": 23214224, "step": 19130 }, { "epoch": 2.1310836396035193, "grad_norm": 0.16523487865924835, "learning_rate": 4.8076993489515995e-05, "loss": 0.4542, "num_input_tokens_seen": 23220432, "step": 19135 }, { "epoch": 2.1316404944871366, "grad_norm": 0.1680564433336258, "learning_rate": 4.807512405734277e-05, "loss": 0.4714, "num_input_tokens_seen": 23226448, "step": 19140 }, { "epoch": 2.132197349370754, "grad_norm": 0.21512961387634277, "learning_rate": 4.8073253753317416e-05, "loss": 0.4633, "num_input_tokens_seen": 23232592, "step": 19145 }, { "epoch": 2.1327542042543715, "grad_norm": 0.14585961401462555, "learning_rate": 4.807138257751058e-05, "loss": 0.4453, "num_input_tokens_seen": 23238288, "step": 19150 }, { "epoch": 2.1333110591379887, "grad_norm": 0.15631073713302612, "learning_rate": 4.806951052999299e-05, "loss": 0.4592, "num_input_tokens_seen": 23244496, "step": 19155 }, { "epoch": 2.133867914021606, "grad_norm": 0.14399252831935883, "learning_rate": 4.806763761083536e-05, "loss": 0.4735, "num_input_tokens_seen": 23250608, "step": 19160 }, { "epoch": 2.134424768905223, "grad_norm": 0.2047666311264038, "learning_rate": 4.806576382010844e-05, "loss": 0.4699, "num_input_tokens_seen": 23256592, "step": 19165 }, { "epoch": 2.134981623788841, "grad_norm": 0.13397064805030823, "learning_rate": 4.806388915788307e-05, "loss": 0.46, "num_input_tokens_seen": 23262576, "step": 19170 }, { "epoch": 2.135538478672458, "grad_norm": 0.1338481456041336, "learning_rate": 4.806201362423004e-05, "loss": 0.4568, "num_input_tokens_seen": 23268656, "step": 19175 }, { "epoch": 2.1360953335560753, "grad_norm": 0.11866849660873413, "learning_rate": 4.806013721922024e-05, "loss": 0.4673, "num_input_tokens_seen": 23275024, "step": 19180 }, { "epoch": 2.1366521884396925, "grad_norm": 0.14438281953334808, "learning_rate": 4.8058259942924546e-05, "loss": 0.4623, "num_input_tokens_seen": 23281520, "step": 19185 }, { "epoch": 2.1372090433233097, "grad_norm": 0.16134798526763916, "learning_rate": 4.8056381795413896e-05, "loss": 0.4492, "num_input_tokens_seen": 23287536, "step": 19190 }, { "epoch": 2.1377658982069274, "grad_norm": 0.1391264647245407, "learning_rate": 4.8054502776759256e-05, "loss": 0.4409, "num_input_tokens_seen": 23293552, "step": 19195 }, { "epoch": 2.1383227530905446, "grad_norm": 0.20761005580425262, "learning_rate": 4.805262288703162e-05, "loss": 0.47, "num_input_tokens_seen": 23299504, "step": 19200 }, { "epoch": 2.138879607974162, "grad_norm": 0.19936302304267883, "learning_rate": 4.8050742126302023e-05, "loss": 0.4642, "num_input_tokens_seen": 23305424, "step": 19205 }, { "epoch": 2.139436462857779, "grad_norm": 0.12174711376428604, "learning_rate": 4.804886049464151e-05, "loss": 0.4772, "num_input_tokens_seen": 23311472, "step": 19210 }, { "epoch": 2.139993317741397, "grad_norm": 0.18523003160953522, "learning_rate": 4.8046977992121187e-05, "loss": 0.5114, "num_input_tokens_seen": 23317328, "step": 19215 }, { "epoch": 2.140550172625014, "grad_norm": 0.1839974969625473, "learning_rate": 4.804509461881218e-05, "loss": 0.4523, "num_input_tokens_seen": 23323408, "step": 19220 }, { "epoch": 2.1411070275086312, "grad_norm": 0.12767156958580017, "learning_rate": 4.804321037478564e-05, "loss": 0.4665, "num_input_tokens_seen": 23329680, "step": 19225 }, { "epoch": 2.1416638823922485, "grad_norm": 0.18852180242538452, "learning_rate": 4.804132526011278e-05, "loss": 0.4646, "num_input_tokens_seen": 23335824, "step": 19230 }, { "epoch": 2.1422207372758657, "grad_norm": 0.13471944630146027, "learning_rate": 4.803943927486479e-05, "loss": 0.4546, "num_input_tokens_seen": 23341808, "step": 19235 }, { "epoch": 2.1427775921594834, "grad_norm": 0.13220928609371185, "learning_rate": 4.803755241911298e-05, "loss": 0.4659, "num_input_tokens_seen": 23347728, "step": 19240 }, { "epoch": 2.1433344470431006, "grad_norm": 0.1319906860589981, "learning_rate": 4.803566469292859e-05, "loss": 0.4749, "num_input_tokens_seen": 23354000, "step": 19245 }, { "epoch": 2.143891301926718, "grad_norm": 0.14739753305912018, "learning_rate": 4.803377609638297e-05, "loss": 0.4854, "num_input_tokens_seen": 23360144, "step": 19250 }, { "epoch": 2.144448156810335, "grad_norm": 0.2335919290781021, "learning_rate": 4.803188662954747e-05, "loss": 0.4805, "num_input_tokens_seen": 23366384, "step": 19255 }, { "epoch": 2.1450050116939527, "grad_norm": 0.175499826669693, "learning_rate": 4.8029996292493496e-05, "loss": 0.4666, "num_input_tokens_seen": 23372752, "step": 19260 }, { "epoch": 2.14556186657757, "grad_norm": 0.17562945187091827, "learning_rate": 4.802810508529245e-05, "loss": 0.4576, "num_input_tokens_seen": 23379280, "step": 19265 }, { "epoch": 2.146118721461187, "grad_norm": 0.15519852936267853, "learning_rate": 4.8026213008015805e-05, "loss": 0.4644, "num_input_tokens_seen": 23384816, "step": 19270 }, { "epoch": 2.1466755763448044, "grad_norm": 0.23395906388759613, "learning_rate": 4.8024320060735035e-05, "loss": 0.4607, "num_input_tokens_seen": 23390704, "step": 19275 }, { "epoch": 2.1472324312284217, "grad_norm": 0.19526436924934387, "learning_rate": 4.802242624352167e-05, "loss": 0.4735, "num_input_tokens_seen": 23396848, "step": 19280 }, { "epoch": 2.1477892861120393, "grad_norm": 0.1727091372013092, "learning_rate": 4.802053155644726e-05, "loss": 0.4497, "num_input_tokens_seen": 23402896, "step": 19285 }, { "epoch": 2.1483461409956566, "grad_norm": 0.20030924677848816, "learning_rate": 4.8018635999583395e-05, "loss": 0.4787, "num_input_tokens_seen": 23408880, "step": 19290 }, { "epoch": 2.148902995879274, "grad_norm": 0.20097751915454865, "learning_rate": 4.801673957300169e-05, "loss": 0.4758, "num_input_tokens_seen": 23414576, "step": 19295 }, { "epoch": 2.149459850762891, "grad_norm": 0.22076986730098724, "learning_rate": 4.801484227677382e-05, "loss": 0.4664, "num_input_tokens_seen": 23420464, "step": 19300 }, { "epoch": 2.1500167056465087, "grad_norm": 0.20634254813194275, "learning_rate": 4.8012944110971436e-05, "loss": 0.4699, "num_input_tokens_seen": 23426000, "step": 19305 }, { "epoch": 2.150573560530126, "grad_norm": 0.1786620020866394, "learning_rate": 4.801104507566628e-05, "loss": 0.4432, "num_input_tokens_seen": 23432080, "step": 19310 }, { "epoch": 2.151130415413743, "grad_norm": 0.17994649708271027, "learning_rate": 4.800914517093011e-05, "loss": 0.4702, "num_input_tokens_seen": 23438352, "step": 19315 }, { "epoch": 2.1516872702973604, "grad_norm": 0.16293078660964966, "learning_rate": 4.800724439683469e-05, "loss": 0.4716, "num_input_tokens_seen": 23444528, "step": 19320 }, { "epoch": 2.1522441251809776, "grad_norm": 0.149868905544281, "learning_rate": 4.800534275345184e-05, "loss": 0.4516, "num_input_tokens_seen": 23450544, "step": 19325 }, { "epoch": 2.1528009800645953, "grad_norm": 0.15822984278202057, "learning_rate": 4.8003440240853423e-05, "loss": 0.4647, "num_input_tokens_seen": 23456720, "step": 19330 }, { "epoch": 2.1533578349482125, "grad_norm": 0.14065778255462646, "learning_rate": 4.8001536859111315e-05, "loss": 0.4514, "num_input_tokens_seen": 23462512, "step": 19335 }, { "epoch": 2.1539146898318298, "grad_norm": 0.2219206690788269, "learning_rate": 4.799963260829744e-05, "loss": 0.4743, "num_input_tokens_seen": 23469008, "step": 19340 }, { "epoch": 2.154471544715447, "grad_norm": 0.1278694123029709, "learning_rate": 4.799772748848373e-05, "loss": 0.4668, "num_input_tokens_seen": 23475344, "step": 19345 }, { "epoch": 2.1550283995990647, "grad_norm": 0.1700880229473114, "learning_rate": 4.799582149974218e-05, "loss": 0.4603, "num_input_tokens_seen": 23481456, "step": 19350 }, { "epoch": 2.155585254482682, "grad_norm": 0.11457028985023499, "learning_rate": 4.799391464214479e-05, "loss": 0.4698, "num_input_tokens_seen": 23487792, "step": 19355 }, { "epoch": 2.156142109366299, "grad_norm": 0.12983089685440063, "learning_rate": 4.7992006915763634e-05, "loss": 0.4619, "num_input_tokens_seen": 23494128, "step": 19360 }, { "epoch": 2.1566989642499164, "grad_norm": 0.13084888458251953, "learning_rate": 4.7990098320670764e-05, "loss": 0.4544, "num_input_tokens_seen": 23500176, "step": 19365 }, { "epoch": 2.1572558191335336, "grad_norm": 0.17623180150985718, "learning_rate": 4.79881888569383e-05, "loss": 0.4639, "num_input_tokens_seen": 23506000, "step": 19370 }, { "epoch": 2.1578126740171513, "grad_norm": 0.15107423067092896, "learning_rate": 4.798627852463841e-05, "loss": 0.4594, "num_input_tokens_seen": 23512272, "step": 19375 }, { "epoch": 2.1583695289007685, "grad_norm": 0.14499686658382416, "learning_rate": 4.798436732384324e-05, "loss": 0.4633, "num_input_tokens_seen": 23518608, "step": 19380 }, { "epoch": 2.1589263837843857, "grad_norm": 0.1528526097536087, "learning_rate": 4.7982455254625016e-05, "loss": 0.4568, "num_input_tokens_seen": 23524752, "step": 19385 }, { "epoch": 2.159483238668003, "grad_norm": 0.1625157743692398, "learning_rate": 4.798054231705598e-05, "loss": 0.4658, "num_input_tokens_seen": 23530928, "step": 19390 }, { "epoch": 2.1600400935516206, "grad_norm": 0.14416182041168213, "learning_rate": 4.797862851120841e-05, "loss": 0.4705, "num_input_tokens_seen": 23536880, "step": 19395 }, { "epoch": 2.160596948435238, "grad_norm": 0.16230596601963043, "learning_rate": 4.797671383715462e-05, "loss": 0.4677, "num_input_tokens_seen": 23543056, "step": 19400 }, { "epoch": 2.161153803318855, "grad_norm": 0.17178939282894135, "learning_rate": 4.797479829496695e-05, "loss": 0.4593, "num_input_tokens_seen": 23549072, "step": 19405 }, { "epoch": 2.1617106582024723, "grad_norm": 0.1640356034040451, "learning_rate": 4.797288188471777e-05, "loss": 0.4502, "num_input_tokens_seen": 23554992, "step": 19410 }, { "epoch": 2.16226751308609, "grad_norm": 0.19936266541481018, "learning_rate": 4.7970964606479494e-05, "loss": 0.4551, "num_input_tokens_seen": 23561008, "step": 19415 }, { "epoch": 2.1628243679697072, "grad_norm": 0.16423548758029938, "learning_rate": 4.796904646032457e-05, "loss": 0.4712, "num_input_tokens_seen": 23566672, "step": 19420 }, { "epoch": 2.1633812228533245, "grad_norm": 0.12424828857183456, "learning_rate": 4.7967127446325456e-05, "loss": 0.4571, "num_input_tokens_seen": 23572304, "step": 19425 }, { "epoch": 2.1639380777369417, "grad_norm": 0.1925612986087799, "learning_rate": 4.796520756455466e-05, "loss": 0.4724, "num_input_tokens_seen": 23578512, "step": 19430 }, { "epoch": 2.164494932620559, "grad_norm": 0.17569677531719208, "learning_rate": 4.796328681508473e-05, "loss": 0.4623, "num_input_tokens_seen": 23584976, "step": 19435 }, { "epoch": 2.1650517875041766, "grad_norm": 0.18825919926166534, "learning_rate": 4.796136519798824e-05, "loss": 0.4623, "num_input_tokens_seen": 23591056, "step": 19440 }, { "epoch": 2.165608642387794, "grad_norm": 0.2221515029668808, "learning_rate": 4.7959442713337784e-05, "loss": 0.4874, "num_input_tokens_seen": 23597008, "step": 19445 }, { "epoch": 2.166165497271411, "grad_norm": 0.12494780123233795, "learning_rate": 4.795751936120601e-05, "loss": 0.473, "num_input_tokens_seen": 23603248, "step": 19450 }, { "epoch": 2.1667223521550283, "grad_norm": 0.14787638187408447, "learning_rate": 4.7955595141665574e-05, "loss": 0.4599, "num_input_tokens_seen": 23609456, "step": 19455 }, { "epoch": 2.1672792070386455, "grad_norm": 0.18638180196285248, "learning_rate": 4.795367005478919e-05, "loss": 0.4816, "num_input_tokens_seen": 23615280, "step": 19460 }, { "epoch": 2.167836061922263, "grad_norm": 0.15338516235351562, "learning_rate": 4.7951744100649595e-05, "loss": 0.4705, "num_input_tokens_seen": 23621328, "step": 19465 }, { "epoch": 2.1683929168058804, "grad_norm": 0.13220834732055664, "learning_rate": 4.794981727931956e-05, "loss": 0.4581, "num_input_tokens_seen": 23627568, "step": 19470 }, { "epoch": 2.1689497716894977, "grad_norm": 0.15587203204631805, "learning_rate": 4.794788959087189e-05, "loss": 0.4592, "num_input_tokens_seen": 23633424, "step": 19475 }, { "epoch": 2.169506626573115, "grad_norm": 0.19610172510147095, "learning_rate": 4.794596103537939e-05, "loss": 0.4549, "num_input_tokens_seen": 23638864, "step": 19480 }, { "epoch": 2.1700634814567326, "grad_norm": 0.18121840059757233, "learning_rate": 4.7944031612914964e-05, "loss": 0.4657, "num_input_tokens_seen": 23645296, "step": 19485 }, { "epoch": 2.17062033634035, "grad_norm": 0.1375814974308014, "learning_rate": 4.7942101323551494e-05, "loss": 0.4712, "num_input_tokens_seen": 23650832, "step": 19490 }, { "epoch": 2.171177191223967, "grad_norm": 0.15522950887680054, "learning_rate": 4.79401701673619e-05, "loss": 0.4941, "num_input_tokens_seen": 23656944, "step": 19495 }, { "epoch": 2.1717340461075842, "grad_norm": 0.19868385791778564, "learning_rate": 4.793823814441918e-05, "loss": 0.4768, "num_input_tokens_seen": 23663024, "step": 19500 }, { "epoch": 2.172290900991202, "grad_norm": 0.13955548405647278, "learning_rate": 4.7936305254796304e-05, "loss": 0.4748, "num_input_tokens_seen": 23669168, "step": 19505 }, { "epoch": 2.172847755874819, "grad_norm": 0.12054525315761566, "learning_rate": 4.793437149856631e-05, "loss": 0.463, "num_input_tokens_seen": 23675248, "step": 19510 }, { "epoch": 2.1734046107584364, "grad_norm": 0.12893733382225037, "learning_rate": 4.793243687580227e-05, "loss": 0.4834, "num_input_tokens_seen": 23681200, "step": 19515 }, { "epoch": 2.1739614656420536, "grad_norm": 0.21112695336341858, "learning_rate": 4.793050138657727e-05, "loss": 0.4484, "num_input_tokens_seen": 23687056, "step": 19520 }, { "epoch": 2.174518320525671, "grad_norm": 0.17607435584068298, "learning_rate": 4.792856503096445e-05, "loss": 0.4708, "num_input_tokens_seen": 23693136, "step": 19525 }, { "epoch": 2.1750751754092885, "grad_norm": 0.1726965606212616, "learning_rate": 4.7926627809036964e-05, "loss": 0.4679, "num_input_tokens_seen": 23699120, "step": 19530 }, { "epoch": 2.1756320302929057, "grad_norm": 0.14175309240818024, "learning_rate": 4.7924689720868e-05, "loss": 0.4571, "num_input_tokens_seen": 23704688, "step": 19535 }, { "epoch": 2.176188885176523, "grad_norm": 0.16252538561820984, "learning_rate": 4.79227507665308e-05, "loss": 0.4813, "num_input_tokens_seen": 23710640, "step": 19540 }, { "epoch": 2.17674574006014, "grad_norm": 0.18722914159297943, "learning_rate": 4.792081094609862e-05, "loss": 0.4696, "num_input_tokens_seen": 23716720, "step": 19545 }, { "epoch": 2.1773025949437574, "grad_norm": 0.15913134813308716, "learning_rate": 4.791887025964474e-05, "loss": 0.4727, "num_input_tokens_seen": 23722992, "step": 19550 }, { "epoch": 2.177859449827375, "grad_norm": 0.1507982313632965, "learning_rate": 4.791692870724249e-05, "loss": 0.4716, "num_input_tokens_seen": 23729104, "step": 19555 }, { "epoch": 2.1784163047109923, "grad_norm": 0.17150241136550903, "learning_rate": 4.791498628896525e-05, "loss": 0.4637, "num_input_tokens_seen": 23734832, "step": 19560 }, { "epoch": 2.1789731595946096, "grad_norm": 0.12447608262300491, "learning_rate": 4.7913043004886375e-05, "loss": 0.4515, "num_input_tokens_seen": 23740976, "step": 19565 }, { "epoch": 2.179530014478227, "grad_norm": 0.13685400784015656, "learning_rate": 4.791109885507932e-05, "loss": 0.4673, "num_input_tokens_seen": 23747088, "step": 19570 }, { "epoch": 2.1800868693618445, "grad_norm": 0.24032005667686462, "learning_rate": 4.790915383961752e-05, "loss": 0.4642, "num_input_tokens_seen": 23753328, "step": 19575 }, { "epoch": 2.1806437242454617, "grad_norm": 0.13783897459506989, "learning_rate": 4.7907207958574474e-05, "loss": 0.4687, "num_input_tokens_seen": 23759568, "step": 19580 }, { "epoch": 2.181200579129079, "grad_norm": 0.17651021480560303, "learning_rate": 4.79052612120237e-05, "loss": 0.4639, "num_input_tokens_seen": 23766032, "step": 19585 }, { "epoch": 2.181757434012696, "grad_norm": 0.15212509036064148, "learning_rate": 4.790331360003876e-05, "loss": 0.4768, "num_input_tokens_seen": 23772176, "step": 19590 }, { "epoch": 2.182314288896314, "grad_norm": 0.1624082326889038, "learning_rate": 4.790136512269323e-05, "loss": 0.4771, "num_input_tokens_seen": 23778256, "step": 19595 }, { "epoch": 2.182871143779931, "grad_norm": 0.1645497977733612, "learning_rate": 4.7899415780060734e-05, "loss": 0.466, "num_input_tokens_seen": 23784176, "step": 19600 }, { "epoch": 2.1834279986635483, "grad_norm": 0.185175359249115, "learning_rate": 4.789746557221493e-05, "loss": 0.4532, "num_input_tokens_seen": 23790192, "step": 19605 }, { "epoch": 2.1839848535471655, "grad_norm": 0.1858932077884674, "learning_rate": 4.789551449922949e-05, "loss": 0.4576, "num_input_tokens_seen": 23796272, "step": 19610 }, { "epoch": 2.1845417084307828, "grad_norm": 0.18003684282302856, "learning_rate": 4.789356256117815e-05, "loss": 0.4769, "num_input_tokens_seen": 23802640, "step": 19615 }, { "epoch": 2.1850985633144004, "grad_norm": 0.23062007129192352, "learning_rate": 4.789160975813465e-05, "loss": 0.4742, "num_input_tokens_seen": 23808784, "step": 19620 }, { "epoch": 2.1856554181980177, "grad_norm": 0.18003353476524353, "learning_rate": 4.7889656090172766e-05, "loss": 0.4721, "num_input_tokens_seen": 23815120, "step": 19625 }, { "epoch": 2.186212273081635, "grad_norm": 0.22056342661380768, "learning_rate": 4.788770155736632e-05, "loss": 0.4636, "num_input_tokens_seen": 23821328, "step": 19630 }, { "epoch": 2.186769127965252, "grad_norm": 0.2186519056558609, "learning_rate": 4.788574615978917e-05, "loss": 0.4771, "num_input_tokens_seen": 23827088, "step": 19635 }, { "epoch": 2.1873259828488694, "grad_norm": 0.1724114567041397, "learning_rate": 4.788378989751518e-05, "loss": 0.4599, "num_input_tokens_seen": 23832848, "step": 19640 }, { "epoch": 2.187882837732487, "grad_norm": 0.15496359765529633, "learning_rate": 4.7881832770618284e-05, "loss": 0.4509, "num_input_tokens_seen": 23838992, "step": 19645 }, { "epoch": 2.1884396926161043, "grad_norm": 0.2174939066171646, "learning_rate": 4.7879874779172406e-05, "loss": 0.4702, "num_input_tokens_seen": 23845392, "step": 19650 }, { "epoch": 2.1889965474997215, "grad_norm": 0.1625981330871582, "learning_rate": 4.7877915923251546e-05, "loss": 0.4578, "num_input_tokens_seen": 23851472, "step": 19655 }, { "epoch": 2.1895534023833387, "grad_norm": 0.17368577420711517, "learning_rate": 4.787595620292971e-05, "loss": 0.4676, "num_input_tokens_seen": 23857584, "step": 19660 }, { "epoch": 2.1901102572669564, "grad_norm": 0.1388157457113266, "learning_rate": 4.787399561828093e-05, "loss": 0.4629, "num_input_tokens_seen": 23863760, "step": 19665 }, { "epoch": 2.1906671121505736, "grad_norm": 0.17723217606544495, "learning_rate": 4.7872034169379286e-05, "loss": 0.4736, "num_input_tokens_seen": 23869520, "step": 19670 }, { "epoch": 2.191223967034191, "grad_norm": 0.1184983104467392, "learning_rate": 4.7870071856298907e-05, "loss": 0.4603, "num_input_tokens_seen": 23875568, "step": 19675 }, { "epoch": 2.191780821917808, "grad_norm": 0.2027008980512619, "learning_rate": 4.786810867911392e-05, "loss": 0.4678, "num_input_tokens_seen": 23881744, "step": 19680 }, { "epoch": 2.1923376768014258, "grad_norm": 0.26516401767730713, "learning_rate": 4.78661446378985e-05, "loss": 0.4779, "num_input_tokens_seen": 23887568, "step": 19685 }, { "epoch": 2.192894531685043, "grad_norm": 0.15125375986099243, "learning_rate": 4.786417973272686e-05, "loss": 0.46, "num_input_tokens_seen": 23893680, "step": 19690 }, { "epoch": 2.1934513865686602, "grad_norm": 0.17470026016235352, "learning_rate": 4.786221396367323e-05, "loss": 0.4655, "num_input_tokens_seen": 23899664, "step": 19695 }, { "epoch": 2.1940082414522775, "grad_norm": 0.15303610265254974, "learning_rate": 4.7860247330811904e-05, "loss": 0.4602, "num_input_tokens_seen": 23905776, "step": 19700 }, { "epoch": 2.1945650963358947, "grad_norm": 0.1787666529417038, "learning_rate": 4.785827983421716e-05, "loss": 0.4507, "num_input_tokens_seen": 23912048, "step": 19705 }, { "epoch": 2.1951219512195124, "grad_norm": 0.19680474698543549, "learning_rate": 4.785631147396336e-05, "loss": 0.4621, "num_input_tokens_seen": 23918160, "step": 19710 }, { "epoch": 2.1956788061031296, "grad_norm": 0.2089158296585083, "learning_rate": 4.785434225012486e-05, "loss": 0.4415, "num_input_tokens_seen": 23924240, "step": 19715 }, { "epoch": 2.196235660986747, "grad_norm": 0.14341700077056885, "learning_rate": 4.785237216277608e-05, "loss": 0.4786, "num_input_tokens_seen": 23930544, "step": 19720 }, { "epoch": 2.196792515870364, "grad_norm": 0.19627396762371063, "learning_rate": 4.7850401211991436e-05, "loss": 0.4579, "num_input_tokens_seen": 23935568, "step": 19725 }, { "epoch": 2.1973493707539813, "grad_norm": 0.19480212032794952, "learning_rate": 4.784842939784541e-05, "loss": 0.4551, "num_input_tokens_seen": 23941936, "step": 19730 }, { "epoch": 2.197906225637599, "grad_norm": 0.1666540503501892, "learning_rate": 4.78464567204125e-05, "loss": 0.454, "num_input_tokens_seen": 23947760, "step": 19735 }, { "epoch": 2.198463080521216, "grad_norm": 0.2059919834136963, "learning_rate": 4.784448317976724e-05, "loss": 0.476, "num_input_tokens_seen": 23953456, "step": 19740 }, { "epoch": 2.1990199354048334, "grad_norm": 0.2071310430765152, "learning_rate": 4.784250877598419e-05, "loss": 0.4313, "num_input_tokens_seen": 23959568, "step": 19745 }, { "epoch": 2.1995767902884507, "grad_norm": 0.17131003737449646, "learning_rate": 4.7840533509137966e-05, "loss": 0.4643, "num_input_tokens_seen": 23965936, "step": 19750 }, { "epoch": 2.2001336451720683, "grad_norm": 0.15452159941196442, "learning_rate": 4.783855737930319e-05, "loss": 0.4566, "num_input_tokens_seen": 23972688, "step": 19755 }, { "epoch": 2.2006905000556856, "grad_norm": 0.14557504653930664, "learning_rate": 4.783658038655452e-05, "loss": 0.4707, "num_input_tokens_seen": 23978800, "step": 19760 }, { "epoch": 2.201247354939303, "grad_norm": 0.1906665861606598, "learning_rate": 4.7834602530966665e-05, "loss": 0.4702, "num_input_tokens_seen": 23984912, "step": 19765 }, { "epoch": 2.20180420982292, "grad_norm": 0.1868034452199936, "learning_rate": 4.783262381261435e-05, "loss": 0.4804, "num_input_tokens_seen": 23990800, "step": 19770 }, { "epoch": 2.2023610647065377, "grad_norm": 0.19890965521335602, "learning_rate": 4.783064423157233e-05, "loss": 0.454, "num_input_tokens_seen": 23996816, "step": 19775 }, { "epoch": 2.202917919590155, "grad_norm": 0.13671034574508667, "learning_rate": 4.7828663787915405e-05, "loss": 0.4803, "num_input_tokens_seen": 24003152, "step": 19780 }, { "epoch": 2.203474774473772, "grad_norm": 0.1496267318725586, "learning_rate": 4.7826682481718416e-05, "loss": 0.4702, "num_input_tokens_seen": 24009008, "step": 19785 }, { "epoch": 2.2040316293573894, "grad_norm": 0.15884369611740112, "learning_rate": 4.7824700313056204e-05, "loss": 0.462, "num_input_tokens_seen": 24013616, "step": 19790 }, { "epoch": 2.2045884842410066, "grad_norm": 0.1374422162771225, "learning_rate": 4.7822717282003674e-05, "loss": 0.4677, "num_input_tokens_seen": 24019504, "step": 19795 }, { "epoch": 2.2051453391246243, "grad_norm": 0.1432824283838272, "learning_rate": 4.7820733388635736e-05, "loss": 0.4669, "num_input_tokens_seen": 24025648, "step": 19800 }, { "epoch": 2.2057021940082415, "grad_norm": 0.17520630359649658, "learning_rate": 4.781874863302736e-05, "loss": 0.4622, "num_input_tokens_seen": 24031728, "step": 19805 }, { "epoch": 2.2062590488918588, "grad_norm": 0.13874877989292145, "learning_rate": 4.7816763015253544e-05, "loss": 0.4674, "num_input_tokens_seen": 24037520, "step": 19810 }, { "epoch": 2.206815903775476, "grad_norm": 0.16433317959308624, "learning_rate": 4.781477653538929e-05, "loss": 0.4649, "num_input_tokens_seen": 24043856, "step": 19815 }, { "epoch": 2.207372758659093, "grad_norm": 0.13635502755641937, "learning_rate": 4.7812789193509665e-05, "loss": 0.4576, "num_input_tokens_seen": 24050000, "step": 19820 }, { "epoch": 2.207929613542711, "grad_norm": 0.16634812951087952, "learning_rate": 4.781080098968976e-05, "loss": 0.4703, "num_input_tokens_seen": 24056272, "step": 19825 }, { "epoch": 2.208486468426328, "grad_norm": 0.14581888914108276, "learning_rate": 4.780881192400471e-05, "loss": 0.4771, "num_input_tokens_seen": 24062416, "step": 19830 }, { "epoch": 2.2090433233099454, "grad_norm": 0.15783068537712097, "learning_rate": 4.780682199652963e-05, "loss": 0.468, "num_input_tokens_seen": 24068368, "step": 19835 }, { "epoch": 2.2096001781935626, "grad_norm": 0.14739328622817993, "learning_rate": 4.780483120733973e-05, "loss": 0.45, "num_input_tokens_seen": 24074608, "step": 19840 }, { "epoch": 2.2101570330771803, "grad_norm": 0.13150525093078613, "learning_rate": 4.7802839556510224e-05, "loss": 0.4723, "num_input_tokens_seen": 24080528, "step": 19845 }, { "epoch": 2.2107138879607975, "grad_norm": 0.12712037563323975, "learning_rate": 4.780084704411637e-05, "loss": 0.4511, "num_input_tokens_seen": 24086512, "step": 19850 }, { "epoch": 2.2112707428444147, "grad_norm": 0.19432292878627777, "learning_rate": 4.7798853670233446e-05, "loss": 0.4647, "num_input_tokens_seen": 24092688, "step": 19855 }, { "epoch": 2.211827597728032, "grad_norm": 0.20159171521663666, "learning_rate": 4.779685943493677e-05, "loss": 0.4576, "num_input_tokens_seen": 24098896, "step": 19860 }, { "epoch": 2.2123844526116496, "grad_norm": 0.1515418142080307, "learning_rate": 4.779486433830168e-05, "loss": 0.4458, "num_input_tokens_seen": 24104976, "step": 19865 }, { "epoch": 2.212941307495267, "grad_norm": 0.2003980129957199, "learning_rate": 4.779286838040357e-05, "loss": 0.4809, "num_input_tokens_seen": 24110928, "step": 19870 }, { "epoch": 2.213498162378884, "grad_norm": 0.22249259054660797, "learning_rate": 4.779087156131785e-05, "loss": 0.4636, "num_input_tokens_seen": 24116944, "step": 19875 }, { "epoch": 2.2140550172625013, "grad_norm": 0.18736402690410614, "learning_rate": 4.7788873881119964e-05, "loss": 0.4701, "num_input_tokens_seen": 24122864, "step": 19880 }, { "epoch": 2.2146118721461185, "grad_norm": 0.17270426452159882, "learning_rate": 4.7786875339885394e-05, "loss": 0.4775, "num_input_tokens_seen": 24128912, "step": 19885 }, { "epoch": 2.215168727029736, "grad_norm": 0.1608334630727768, "learning_rate": 4.7784875937689646e-05, "loss": 0.474, "num_input_tokens_seen": 24135440, "step": 19890 }, { "epoch": 2.2157255819133534, "grad_norm": 0.1873980015516281, "learning_rate": 4.778287567460827e-05, "loss": 0.4609, "num_input_tokens_seen": 24141776, "step": 19895 }, { "epoch": 2.2162824367969707, "grad_norm": 0.2376449853181839, "learning_rate": 4.778087455071684e-05, "loss": 0.4695, "num_input_tokens_seen": 24147664, "step": 19900 }, { "epoch": 2.216839291680588, "grad_norm": 0.13549602031707764, "learning_rate": 4.777887256609096e-05, "loss": 0.4773, "num_input_tokens_seen": 24153648, "step": 19905 }, { "epoch": 2.217396146564205, "grad_norm": 0.21512971818447113, "learning_rate": 4.777686972080628e-05, "loss": 0.4604, "num_input_tokens_seen": 24159504, "step": 19910 }, { "epoch": 2.217953001447823, "grad_norm": 0.13926787674427032, "learning_rate": 4.777486601493847e-05, "loss": 0.4717, "num_input_tokens_seen": 24165008, "step": 19915 }, { "epoch": 2.21850985633144, "grad_norm": 0.14059068262577057, "learning_rate": 4.7772861448563236e-05, "loss": 0.4545, "num_input_tokens_seen": 24171152, "step": 19920 }, { "epoch": 2.2190667112150573, "grad_norm": 0.16948944330215454, "learning_rate": 4.777085602175632e-05, "loss": 0.4637, "num_input_tokens_seen": 24177264, "step": 19925 }, { "epoch": 2.2196235660986745, "grad_norm": 0.15377983450889587, "learning_rate": 4.776884973459349e-05, "loss": 0.4453, "num_input_tokens_seen": 24183696, "step": 19930 }, { "epoch": 2.220180420982292, "grad_norm": 0.1521041989326477, "learning_rate": 4.7766842587150554e-05, "loss": 0.4809, "num_input_tokens_seen": 24189488, "step": 19935 }, { "epoch": 2.2207372758659094, "grad_norm": 0.1560124158859253, "learning_rate": 4.776483457950334e-05, "loss": 0.4575, "num_input_tokens_seen": 24195696, "step": 19940 }, { "epoch": 2.2212941307495266, "grad_norm": 0.17321549355983734, "learning_rate": 4.776282571172771e-05, "loss": 0.4572, "num_input_tokens_seen": 24201680, "step": 19945 }, { "epoch": 2.221850985633144, "grad_norm": 0.1551780104637146, "learning_rate": 4.77608159838996e-05, "loss": 0.4735, "num_input_tokens_seen": 24207600, "step": 19950 }, { "epoch": 2.2224078405167615, "grad_norm": 0.1832796037197113, "learning_rate": 4.775880539609491e-05, "loss": 0.4624, "num_input_tokens_seen": 24213552, "step": 19955 }, { "epoch": 2.2229646954003788, "grad_norm": 0.14311672747135162, "learning_rate": 4.775679394838962e-05, "loss": 0.4795, "num_input_tokens_seen": 24219600, "step": 19960 }, { "epoch": 2.223521550283996, "grad_norm": 0.1891198456287384, "learning_rate": 4.775478164085973e-05, "loss": 0.4571, "num_input_tokens_seen": 24225616, "step": 19965 }, { "epoch": 2.2240784051676132, "grad_norm": 0.2062847763299942, "learning_rate": 4.775276847358126e-05, "loss": 0.4734, "num_input_tokens_seen": 24231824, "step": 19970 }, { "epoch": 2.2246352600512305, "grad_norm": 0.1795041412115097, "learning_rate": 4.7750754446630286e-05, "loss": 0.446, "num_input_tokens_seen": 24237712, "step": 19975 }, { "epoch": 2.225192114934848, "grad_norm": 0.13292312622070312, "learning_rate": 4.77487395600829e-05, "loss": 0.4751, "num_input_tokens_seen": 24243792, "step": 19980 }, { "epoch": 2.2257489698184654, "grad_norm": 0.12529879808425903, "learning_rate": 4.774672381401523e-05, "loss": 0.4578, "num_input_tokens_seen": 24249776, "step": 19985 }, { "epoch": 2.2263058247020826, "grad_norm": 0.20232641696929932, "learning_rate": 4.7744707208503445e-05, "loss": 0.4721, "num_input_tokens_seen": 24255792, "step": 19990 }, { "epoch": 2.2268626795857, "grad_norm": 0.17131906747817993, "learning_rate": 4.7742689743623725e-05, "loss": 0.4685, "num_input_tokens_seen": 24261904, "step": 19995 }, { "epoch": 2.227419534469317, "grad_norm": 0.1556796282529831, "learning_rate": 4.774067141945231e-05, "loss": 0.4696, "num_input_tokens_seen": 24267984, "step": 20000 }, { "epoch": 2.2279763893529347, "grad_norm": 0.12829940021038055, "learning_rate": 4.773865223606545e-05, "loss": 0.4726, "num_input_tokens_seen": 24274064, "step": 20005 }, { "epoch": 2.228533244236552, "grad_norm": 0.1407666802406311, "learning_rate": 4.773663219353944e-05, "loss": 0.469, "num_input_tokens_seen": 24280240, "step": 20010 }, { "epoch": 2.229090099120169, "grad_norm": 0.16516713798046112, "learning_rate": 4.7734611291950605e-05, "loss": 0.4791, "num_input_tokens_seen": 24286352, "step": 20015 }, { "epoch": 2.2296469540037864, "grad_norm": 0.1818748265504837, "learning_rate": 4.773258953137529e-05, "loss": 0.4625, "num_input_tokens_seen": 24292432, "step": 20020 }, { "epoch": 2.230203808887404, "grad_norm": 0.1929721087217331, "learning_rate": 4.77305669118899e-05, "loss": 0.4645, "num_input_tokens_seen": 24298800, "step": 20025 }, { "epoch": 2.2307606637710213, "grad_norm": 0.21254396438598633, "learning_rate": 4.772854343357085e-05, "loss": 0.4712, "num_input_tokens_seen": 24305104, "step": 20030 }, { "epoch": 2.2313175186546386, "grad_norm": 0.15294510126113892, "learning_rate": 4.7726519096494586e-05, "loss": 0.4837, "num_input_tokens_seen": 24311472, "step": 20035 }, { "epoch": 2.231874373538256, "grad_norm": 0.20182931423187256, "learning_rate": 4.77244939007376e-05, "loss": 0.4665, "num_input_tokens_seen": 24317520, "step": 20040 }, { "epoch": 2.2324312284218735, "grad_norm": 0.13437847793102264, "learning_rate": 4.7722467846376414e-05, "loss": 0.4615, "num_input_tokens_seen": 24323856, "step": 20045 }, { "epoch": 2.2329880833054907, "grad_norm": 0.12317963689565659, "learning_rate": 4.7720440933487575e-05, "loss": 0.4497, "num_input_tokens_seen": 24329776, "step": 20050 }, { "epoch": 2.233544938189108, "grad_norm": 0.15973806381225586, "learning_rate": 4.771841316214767e-05, "loss": 0.458, "num_input_tokens_seen": 24335920, "step": 20055 }, { "epoch": 2.234101793072725, "grad_norm": 0.19033871591091156, "learning_rate": 4.77163845324333e-05, "loss": 0.4373, "num_input_tokens_seen": 24341680, "step": 20060 }, { "epoch": 2.2346586479563424, "grad_norm": 0.16060252487659454, "learning_rate": 4.7714355044421134e-05, "loss": 0.4679, "num_input_tokens_seen": 24347888, "step": 20065 }, { "epoch": 2.23521550283996, "grad_norm": 0.1641918271780014, "learning_rate": 4.771232469818784e-05, "loss": 0.4708, "num_input_tokens_seen": 24354064, "step": 20070 }, { "epoch": 2.2357723577235773, "grad_norm": 0.18633367121219635, "learning_rate": 4.771029349381013e-05, "loss": 0.4697, "num_input_tokens_seen": 24360080, "step": 20075 }, { "epoch": 2.2363292126071945, "grad_norm": 0.2169482558965683, "learning_rate": 4.7708261431364754e-05, "loss": 0.4565, "num_input_tokens_seen": 24366096, "step": 20080 }, { "epoch": 2.2368860674908118, "grad_norm": 0.12231564521789551, "learning_rate": 4.770622851092849e-05, "loss": 0.4731, "num_input_tokens_seen": 24371824, "step": 20085 }, { "epoch": 2.237442922374429, "grad_norm": 0.14755748212337494, "learning_rate": 4.770419473257815e-05, "loss": 0.4643, "num_input_tokens_seen": 24378064, "step": 20090 }, { "epoch": 2.2379997772580467, "grad_norm": 0.11765061318874359, "learning_rate": 4.770216009639057e-05, "loss": 0.4517, "num_input_tokens_seen": 24383888, "step": 20095 }, { "epoch": 2.238556632141664, "grad_norm": 0.13232116401195526, "learning_rate": 4.770012460244263e-05, "loss": 0.4797, "num_input_tokens_seen": 24389968, "step": 20100 }, { "epoch": 2.239113487025281, "grad_norm": 0.150989830493927, "learning_rate": 4.7698088250811244e-05, "loss": 0.4686, "num_input_tokens_seen": 24396080, "step": 20105 }, { "epoch": 2.2396703419088984, "grad_norm": 0.12469439208507538, "learning_rate": 4.769605104157333e-05, "loss": 0.4656, "num_input_tokens_seen": 24401680, "step": 20110 }, { "epoch": 2.240227196792516, "grad_norm": 0.1550198644399643, "learning_rate": 4.769401297480588e-05, "loss": 0.457, "num_input_tokens_seen": 24408048, "step": 20115 }, { "epoch": 2.2407840516761333, "grad_norm": 0.1546550691127777, "learning_rate": 4.76919740505859e-05, "loss": 0.4455, "num_input_tokens_seen": 24414096, "step": 20120 }, { "epoch": 2.2413409065597505, "grad_norm": 0.13568225502967834, "learning_rate": 4.7689934268990415e-05, "loss": 0.4494, "num_input_tokens_seen": 24420240, "step": 20125 }, { "epoch": 2.2418977614433677, "grad_norm": 0.1725587099790573, "learning_rate": 4.76878936300965e-05, "loss": 0.4762, "num_input_tokens_seen": 24426128, "step": 20130 }, { "epoch": 2.2424546163269854, "grad_norm": 0.1364130824804306, "learning_rate": 4.768585213398126e-05, "loss": 0.472, "num_input_tokens_seen": 24432176, "step": 20135 }, { "epoch": 2.2430114712106026, "grad_norm": 0.24346794188022614, "learning_rate": 4.7683809780721825e-05, "loss": 0.4578, "num_input_tokens_seen": 24438064, "step": 20140 }, { "epoch": 2.24356832609422, "grad_norm": 0.17043477296829224, "learning_rate": 4.7681766570395356e-05, "loss": 0.463, "num_input_tokens_seen": 24444272, "step": 20145 }, { "epoch": 2.244125180977837, "grad_norm": 0.16786892712116241, "learning_rate": 4.767972250307906e-05, "loss": 0.4639, "num_input_tokens_seen": 24450288, "step": 20150 }, { "epoch": 2.2446820358614543, "grad_norm": 0.1712445765733719, "learning_rate": 4.7677677578850176e-05, "loss": 0.475, "num_input_tokens_seen": 24456144, "step": 20155 }, { "epoch": 2.245238890745072, "grad_norm": 0.1370946168899536, "learning_rate": 4.767563179778595e-05, "loss": 0.4505, "num_input_tokens_seen": 24462096, "step": 20160 }, { "epoch": 2.245795745628689, "grad_norm": 0.14192385971546173, "learning_rate": 4.76735851599637e-05, "loss": 0.4583, "num_input_tokens_seen": 24467888, "step": 20165 }, { "epoch": 2.2463526005123065, "grad_norm": 0.19831110537052155, "learning_rate": 4.767153766546072e-05, "loss": 0.4733, "num_input_tokens_seen": 24473776, "step": 20170 }, { "epoch": 2.2469094553959237, "grad_norm": 0.14400915801525116, "learning_rate": 4.766948931435441e-05, "loss": 0.4756, "num_input_tokens_seen": 24480368, "step": 20175 }, { "epoch": 2.2474663102795414, "grad_norm": 0.15060696005821228, "learning_rate": 4.766744010672214e-05, "loss": 0.4643, "num_input_tokens_seen": 24486576, "step": 20180 }, { "epoch": 2.2480231651631586, "grad_norm": 0.1501019150018692, "learning_rate": 4.7665390042641345e-05, "loss": 0.4543, "num_input_tokens_seen": 24492592, "step": 20185 }, { "epoch": 2.248580020046776, "grad_norm": 0.16173945367336273, "learning_rate": 4.766333912218947e-05, "loss": 0.4703, "num_input_tokens_seen": 24498704, "step": 20190 }, { "epoch": 2.249136874930393, "grad_norm": 0.18144497275352478, "learning_rate": 4.7661287345444026e-05, "loss": 0.4542, "num_input_tokens_seen": 24504976, "step": 20195 }, { "epoch": 2.2496937298140103, "grad_norm": 0.21111232042312622, "learning_rate": 4.7659234712482515e-05, "loss": 0.4598, "num_input_tokens_seen": 24511152, "step": 20200 }, { "epoch": 2.250250584697628, "grad_norm": 0.15149402618408203, "learning_rate": 4.765718122338251e-05, "loss": 0.4622, "num_input_tokens_seen": 24517296, "step": 20205 }, { "epoch": 2.250807439581245, "grad_norm": 0.23539546132087708, "learning_rate": 4.7655126878221576e-05, "loss": 0.4726, "num_input_tokens_seen": 24522960, "step": 20210 }, { "epoch": 2.2513642944648624, "grad_norm": 0.12778672575950623, "learning_rate": 4.7653071677077365e-05, "loss": 0.4678, "num_input_tokens_seen": 24528752, "step": 20215 }, { "epoch": 2.2519211493484796, "grad_norm": 0.1630522459745407, "learning_rate": 4.7651015620027503e-05, "loss": 0.4615, "num_input_tokens_seen": 24534768, "step": 20220 }, { "epoch": 2.2524780042320973, "grad_norm": 0.13003754615783691, "learning_rate": 4.764895870714968e-05, "loss": 0.4514, "num_input_tokens_seen": 24540720, "step": 20225 }, { "epoch": 2.2530348591157145, "grad_norm": 0.13808654248714447, "learning_rate": 4.764690093852161e-05, "loss": 0.4547, "num_input_tokens_seen": 24546960, "step": 20230 }, { "epoch": 2.253591713999332, "grad_norm": 0.20644982159137726, "learning_rate": 4.764484231422106e-05, "loss": 0.4676, "num_input_tokens_seen": 24552784, "step": 20235 }, { "epoch": 2.254148568882949, "grad_norm": 0.17166781425476074, "learning_rate": 4.764278283432579e-05, "loss": 0.4679, "num_input_tokens_seen": 24558320, "step": 20240 }, { "epoch": 2.2547054237665662, "grad_norm": 0.11465457826852798, "learning_rate": 4.764072249891363e-05, "loss": 0.4694, "num_input_tokens_seen": 24564528, "step": 20245 }, { "epoch": 2.255262278650184, "grad_norm": 0.14837615191936493, "learning_rate": 4.76386613080624e-05, "loss": 0.4652, "num_input_tokens_seen": 24570672, "step": 20250 }, { "epoch": 2.255819133533801, "grad_norm": 0.1588924378156662, "learning_rate": 4.763659926185001e-05, "loss": 0.4642, "num_input_tokens_seen": 24576912, "step": 20255 }, { "epoch": 2.2563759884174184, "grad_norm": 0.14411301910877228, "learning_rate": 4.7634536360354356e-05, "loss": 0.4579, "num_input_tokens_seen": 24582576, "step": 20260 }, { "epoch": 2.2569328433010356, "grad_norm": 0.19530315697193146, "learning_rate": 4.763247260365338e-05, "loss": 0.4643, "num_input_tokens_seen": 24588656, "step": 20265 }, { "epoch": 2.257489698184653, "grad_norm": 0.21244744956493378, "learning_rate": 4.763040799182507e-05, "loss": 0.4653, "num_input_tokens_seen": 24594416, "step": 20270 }, { "epoch": 2.2580465530682705, "grad_norm": 0.14588503539562225, "learning_rate": 4.762834252494741e-05, "loss": 0.4691, "num_input_tokens_seen": 24600560, "step": 20275 }, { "epoch": 2.2586034079518877, "grad_norm": 0.163417249917984, "learning_rate": 4.762627620309846e-05, "loss": 0.4636, "num_input_tokens_seen": 24606544, "step": 20280 }, { "epoch": 2.259160262835505, "grad_norm": 0.15836302936077118, "learning_rate": 4.762420902635628e-05, "loss": 0.4585, "num_input_tokens_seen": 24612528, "step": 20285 }, { "epoch": 2.259717117719122, "grad_norm": 0.1867060363292694, "learning_rate": 4.762214099479899e-05, "loss": 0.4799, "num_input_tokens_seen": 24618416, "step": 20290 }, { "epoch": 2.26027397260274, "grad_norm": 0.14373596012592316, "learning_rate": 4.762007210850471e-05, "loss": 0.4754, "num_input_tokens_seen": 24624560, "step": 20295 }, { "epoch": 2.260830827486357, "grad_norm": 0.12577086687088013, "learning_rate": 4.761800236755162e-05, "loss": 0.4575, "num_input_tokens_seen": 24630224, "step": 20300 }, { "epoch": 2.2613876823699743, "grad_norm": 0.11782670766115189, "learning_rate": 4.7615931772017904e-05, "loss": 0.4591, "num_input_tokens_seen": 24636592, "step": 20305 }, { "epoch": 2.2619445372535916, "grad_norm": 0.24197418987751007, "learning_rate": 4.761386032198183e-05, "loss": 0.4628, "num_input_tokens_seen": 24642480, "step": 20310 }, { "epoch": 2.2625013921372092, "grad_norm": 0.1184685081243515, "learning_rate": 4.761178801752163e-05, "loss": 0.4661, "num_input_tokens_seen": 24648848, "step": 20315 }, { "epoch": 2.2630582470208265, "grad_norm": 0.14397768676280975, "learning_rate": 4.760971485871562e-05, "loss": 0.4563, "num_input_tokens_seen": 24654928, "step": 20320 }, { "epoch": 2.2636151019044437, "grad_norm": 0.0993097797036171, "learning_rate": 4.760764084564212e-05, "loss": 0.4697, "num_input_tokens_seen": 24661200, "step": 20325 }, { "epoch": 2.264171956788061, "grad_norm": 0.12892912328243256, "learning_rate": 4.760556597837951e-05, "loss": 0.4523, "num_input_tokens_seen": 24667120, "step": 20330 }, { "epoch": 2.264728811671678, "grad_norm": 0.12879909574985504, "learning_rate": 4.7603490257006166e-05, "loss": 0.4607, "num_input_tokens_seen": 24673168, "step": 20335 }, { "epoch": 2.265285666555296, "grad_norm": 0.12932997941970825, "learning_rate": 4.760141368160053e-05, "loss": 0.4729, "num_input_tokens_seen": 24679312, "step": 20340 }, { "epoch": 2.265842521438913, "grad_norm": 0.1386614888906479, "learning_rate": 4.7599336252241054e-05, "loss": 0.453, "num_input_tokens_seen": 24685072, "step": 20345 }, { "epoch": 2.2663993763225303, "grad_norm": 0.1420663744211197, "learning_rate": 4.759725796900623e-05, "loss": 0.4725, "num_input_tokens_seen": 24691120, "step": 20350 }, { "epoch": 2.2669562312061475, "grad_norm": 0.13763290643692017, "learning_rate": 4.759517883197458e-05, "loss": 0.4678, "num_input_tokens_seen": 24697264, "step": 20355 }, { "epoch": 2.2675130860897648, "grad_norm": 0.1359287053346634, "learning_rate": 4.759309884122467e-05, "loss": 0.4681, "num_input_tokens_seen": 24703344, "step": 20360 }, { "epoch": 2.2680699409733824, "grad_norm": 0.1269170641899109, "learning_rate": 4.759101799683507e-05, "loss": 0.4631, "num_input_tokens_seen": 24709168, "step": 20365 }, { "epoch": 2.2686267958569997, "grad_norm": 0.14874199032783508, "learning_rate": 4.758893629888443e-05, "loss": 0.454, "num_input_tokens_seen": 24715056, "step": 20370 }, { "epoch": 2.269183650740617, "grad_norm": 0.1266769915819168, "learning_rate": 4.758685374745138e-05, "loss": 0.4784, "num_input_tokens_seen": 24721360, "step": 20375 }, { "epoch": 2.2697405056242346, "grad_norm": 0.11762909591197968, "learning_rate": 4.758477034261461e-05, "loss": 0.4628, "num_input_tokens_seen": 24727824, "step": 20380 }, { "epoch": 2.270297360507852, "grad_norm": 0.15204209089279175, "learning_rate": 4.758268608445284e-05, "loss": 0.4573, "num_input_tokens_seen": 24734064, "step": 20385 }, { "epoch": 2.270854215391469, "grad_norm": 0.1204257607460022, "learning_rate": 4.758060097304482e-05, "loss": 0.4763, "num_input_tokens_seen": 24740208, "step": 20390 }, { "epoch": 2.2714110702750863, "grad_norm": 0.13546445965766907, "learning_rate": 4.757851500846934e-05, "loss": 0.4624, "num_input_tokens_seen": 24746160, "step": 20395 }, { "epoch": 2.2719679251587035, "grad_norm": 0.16332556307315826, "learning_rate": 4.75764281908052e-05, "loss": 0.4588, "num_input_tokens_seen": 24752112, "step": 20400 }, { "epoch": 2.272524780042321, "grad_norm": 0.12859879434108734, "learning_rate": 4.757434052013125e-05, "loss": 0.4703, "num_input_tokens_seen": 24757680, "step": 20405 }, { "epoch": 2.2730816349259384, "grad_norm": 0.19127896428108215, "learning_rate": 4.757225199652638e-05, "loss": 0.4712, "num_input_tokens_seen": 24763440, "step": 20410 }, { "epoch": 2.2736384898095556, "grad_norm": 0.1307288408279419, "learning_rate": 4.757016262006949e-05, "loss": 0.4841, "num_input_tokens_seen": 24768208, "step": 20415 }, { "epoch": 2.274195344693173, "grad_norm": 0.1438436359167099, "learning_rate": 4.756807239083952e-05, "loss": 0.4544, "num_input_tokens_seen": 24774288, "step": 20420 }, { "epoch": 2.27475219957679, "grad_norm": 0.13101129233837128, "learning_rate": 4.756598130891546e-05, "loss": 0.4525, "num_input_tokens_seen": 24780304, "step": 20425 }, { "epoch": 2.2753090544604078, "grad_norm": 0.13825374841690063, "learning_rate": 4.7563889374376315e-05, "loss": 0.4701, "num_input_tokens_seen": 24786032, "step": 20430 }, { "epoch": 2.275865909344025, "grad_norm": 0.14885962009429932, "learning_rate": 4.7561796587301114e-05, "loss": 0.4738, "num_input_tokens_seen": 24792336, "step": 20435 }, { "epoch": 2.2764227642276422, "grad_norm": 0.17476527392864227, "learning_rate": 4.7559702947768926e-05, "loss": 0.4714, "num_input_tokens_seen": 24798288, "step": 20440 }, { "epoch": 2.2769796191112595, "grad_norm": 0.14379236102104187, "learning_rate": 4.755760845585887e-05, "loss": 0.4583, "num_input_tokens_seen": 24804112, "step": 20445 }, { "epoch": 2.2775364739948767, "grad_norm": 0.14061567187309265, "learning_rate": 4.755551311165009e-05, "loss": 0.478, "num_input_tokens_seen": 24810384, "step": 20450 }, { "epoch": 2.2780933288784944, "grad_norm": 0.14624619483947754, "learning_rate": 4.7553416915221723e-05, "loss": 0.4538, "num_input_tokens_seen": 24816528, "step": 20455 }, { "epoch": 2.2786501837621116, "grad_norm": 0.15010319650173187, "learning_rate": 4.7551319866653e-05, "loss": 0.4735, "num_input_tokens_seen": 24822864, "step": 20460 }, { "epoch": 2.279207038645729, "grad_norm": 0.1552404910326004, "learning_rate": 4.754922196602314e-05, "loss": 0.461, "num_input_tokens_seen": 24829328, "step": 20465 }, { "epoch": 2.2797638935293465, "grad_norm": 0.1692945659160614, "learning_rate": 4.7547123213411415e-05, "loss": 0.4633, "num_input_tokens_seen": 24835504, "step": 20470 }, { "epoch": 2.2803207484129637, "grad_norm": 0.1572517454624176, "learning_rate": 4.754502360889711e-05, "loss": 0.4618, "num_input_tokens_seen": 24841584, "step": 20475 }, { "epoch": 2.280877603296581, "grad_norm": 0.11253885179758072, "learning_rate": 4.754292315255957e-05, "loss": 0.4679, "num_input_tokens_seen": 24847792, "step": 20480 }, { "epoch": 2.281434458180198, "grad_norm": 0.09866122156381607, "learning_rate": 4.7540821844478155e-05, "loss": 0.4654, "num_input_tokens_seen": 24853456, "step": 20485 }, { "epoch": 2.2819913130638154, "grad_norm": 0.16423285007476807, "learning_rate": 4.753871968473225e-05, "loss": 0.4716, "num_input_tokens_seen": 24859600, "step": 20490 }, { "epoch": 2.282548167947433, "grad_norm": 0.13626743853092194, "learning_rate": 4.753661667340129e-05, "loss": 0.4512, "num_input_tokens_seen": 24865296, "step": 20495 }, { "epoch": 2.2831050228310503, "grad_norm": 0.1223168894648552, "learning_rate": 4.7534512810564726e-05, "loss": 0.471, "num_input_tokens_seen": 24871472, "step": 20500 }, { "epoch": 2.2836618777146676, "grad_norm": 0.13187257945537567, "learning_rate": 4.753240809630206e-05, "loss": 0.4688, "num_input_tokens_seen": 24877360, "step": 20505 }, { "epoch": 2.284218732598285, "grad_norm": 0.13615171611309052, "learning_rate": 4.753030253069279e-05, "loss": 0.4609, "num_input_tokens_seen": 24883312, "step": 20510 }, { "epoch": 2.284775587481902, "grad_norm": 0.18482111394405365, "learning_rate": 4.752819611381651e-05, "loss": 0.4658, "num_input_tokens_seen": 24889776, "step": 20515 }, { "epoch": 2.2853324423655197, "grad_norm": 0.14264552295207977, "learning_rate": 4.7526088845752775e-05, "loss": 0.463, "num_input_tokens_seen": 24896112, "step": 20520 }, { "epoch": 2.285889297249137, "grad_norm": 0.12604576349258423, "learning_rate": 4.7523980726581216e-05, "loss": 0.4764, "num_input_tokens_seen": 24902384, "step": 20525 }, { "epoch": 2.286446152132754, "grad_norm": 0.13298071920871735, "learning_rate": 4.7521871756381486e-05, "loss": 0.4623, "num_input_tokens_seen": 24908336, "step": 20530 }, { "epoch": 2.2870030070163714, "grad_norm": 0.19046668708324432, "learning_rate": 4.751976193523327e-05, "loss": 0.4731, "num_input_tokens_seen": 24914320, "step": 20535 }, { "epoch": 2.287559861899989, "grad_norm": 0.1313839703798294, "learning_rate": 4.751765126321627e-05, "loss": 0.4482, "num_input_tokens_seen": 24920144, "step": 20540 }, { "epoch": 2.2881167167836063, "grad_norm": 0.11692998558282852, "learning_rate": 4.7515539740410245e-05, "loss": 0.4539, "num_input_tokens_seen": 24926032, "step": 20545 }, { "epoch": 2.2886735716672235, "grad_norm": 0.15228639543056488, "learning_rate": 4.7513427366894976e-05, "loss": 0.4454, "num_input_tokens_seen": 24932080, "step": 20550 }, { "epoch": 2.2892304265508407, "grad_norm": 0.15570099651813507, "learning_rate": 4.751131414275027e-05, "loss": 0.4629, "num_input_tokens_seen": 24938576, "step": 20555 }, { "epoch": 2.2897872814344584, "grad_norm": 0.15008039772510529, "learning_rate": 4.750920006805598e-05, "loss": 0.4622, "num_input_tokens_seen": 24944976, "step": 20560 }, { "epoch": 2.2903441363180757, "grad_norm": 0.13139767944812775, "learning_rate": 4.7507085142891975e-05, "loss": 0.4707, "num_input_tokens_seen": 24951152, "step": 20565 }, { "epoch": 2.290900991201693, "grad_norm": 0.13223932683467865, "learning_rate": 4.750496936733817e-05, "loss": 0.4496, "num_input_tokens_seen": 24957104, "step": 20570 }, { "epoch": 2.29145784608531, "grad_norm": 0.146993026137352, "learning_rate": 4.75028527414745e-05, "loss": 0.4611, "num_input_tokens_seen": 24962128, "step": 20575 }, { "epoch": 2.2920147009689273, "grad_norm": 0.27898919582366943, "learning_rate": 4.750073526538094e-05, "loss": 0.4808, "num_input_tokens_seen": 24968240, "step": 20580 }, { "epoch": 2.292571555852545, "grad_norm": 0.16338114440441132, "learning_rate": 4.7498616939137484e-05, "loss": 0.4645, "num_input_tokens_seen": 24974512, "step": 20585 }, { "epoch": 2.2931284107361622, "grad_norm": 0.19668154418468475, "learning_rate": 4.749649776282419e-05, "loss": 0.4484, "num_input_tokens_seen": 24980784, "step": 20590 }, { "epoch": 2.2936852656197795, "grad_norm": 0.16492018103599548, "learning_rate": 4.749437773652111e-05, "loss": 0.4676, "num_input_tokens_seen": 24986960, "step": 20595 }, { "epoch": 2.2942421205033967, "grad_norm": 0.1418260633945465, "learning_rate": 4.749225686030836e-05, "loss": 0.4662, "num_input_tokens_seen": 24993200, "step": 20600 }, { "epoch": 2.294798975387014, "grad_norm": 0.2209521383047104, "learning_rate": 4.749013513426607e-05, "loss": 0.4672, "num_input_tokens_seen": 24999280, "step": 20605 }, { "epoch": 2.2953558302706316, "grad_norm": 0.17657944560050964, "learning_rate": 4.748801255847439e-05, "loss": 0.4604, "num_input_tokens_seen": 25005616, "step": 20610 }, { "epoch": 2.295912685154249, "grad_norm": 0.07046051323413849, "learning_rate": 4.7485889133013536e-05, "loss": 0.454, "num_input_tokens_seen": 25011728, "step": 20615 }, { "epoch": 2.296469540037866, "grad_norm": 0.15968634188175201, "learning_rate": 4.748376485796373e-05, "loss": 0.4714, "num_input_tokens_seen": 25017712, "step": 20620 }, { "epoch": 2.2970263949214833, "grad_norm": 0.12994249165058136, "learning_rate": 4.748163973340523e-05, "loss": 0.4583, "num_input_tokens_seen": 25023440, "step": 20625 }, { "epoch": 2.297583249805101, "grad_norm": 0.13940668106079102, "learning_rate": 4.747951375941834e-05, "loss": 0.4673, "num_input_tokens_seen": 25029328, "step": 20630 }, { "epoch": 2.298140104688718, "grad_norm": 0.15325643122196198, "learning_rate": 4.747738693608338e-05, "loss": 0.4622, "num_input_tokens_seen": 25035632, "step": 20635 }, { "epoch": 2.2986969595723354, "grad_norm": 0.14089685678482056, "learning_rate": 4.747525926348071e-05, "loss": 0.4591, "num_input_tokens_seen": 25041840, "step": 20640 }, { "epoch": 2.2992538144559527, "grad_norm": 0.12605883181095123, "learning_rate": 4.747313074169071e-05, "loss": 0.4653, "num_input_tokens_seen": 25047376, "step": 20645 }, { "epoch": 2.2998106693395703, "grad_norm": 0.15498599410057068, "learning_rate": 4.747100137079382e-05, "loss": 0.469, "num_input_tokens_seen": 25053488, "step": 20650 }, { "epoch": 2.3003675242231876, "grad_norm": 0.22893834114074707, "learning_rate": 4.746887115087049e-05, "loss": 0.4587, "num_input_tokens_seen": 25059504, "step": 20655 }, { "epoch": 2.300924379106805, "grad_norm": 0.13430902361869812, "learning_rate": 4.74667400820012e-05, "loss": 0.4803, "num_input_tokens_seen": 25064944, "step": 20660 }, { "epoch": 2.301481233990422, "grad_norm": 0.16465523838996887, "learning_rate": 4.746460816426647e-05, "loss": 0.4806, "num_input_tokens_seen": 25070928, "step": 20665 }, { "epoch": 2.3020380888740393, "grad_norm": 0.13691630959510803, "learning_rate": 4.746247539774685e-05, "loss": 0.4702, "num_input_tokens_seen": 25076848, "step": 20670 }, { "epoch": 2.302594943757657, "grad_norm": 0.13459992408752441, "learning_rate": 4.7460341782522934e-05, "loss": 0.448, "num_input_tokens_seen": 25082512, "step": 20675 }, { "epoch": 2.303151798641274, "grad_norm": 0.15757951140403748, "learning_rate": 4.745820731867532e-05, "loss": 0.4755, "num_input_tokens_seen": 25088816, "step": 20680 }, { "epoch": 2.3037086535248914, "grad_norm": 0.1529238075017929, "learning_rate": 4.7456072006284664e-05, "loss": 0.4668, "num_input_tokens_seen": 25095056, "step": 20685 }, { "epoch": 2.3042655084085086, "grad_norm": 0.1763177216053009, "learning_rate": 4.745393584543164e-05, "loss": 0.4566, "num_input_tokens_seen": 25101136, "step": 20690 }, { "epoch": 2.304822363292126, "grad_norm": 0.12558285892009735, "learning_rate": 4.745179883619697e-05, "loss": 0.4616, "num_input_tokens_seen": 25107024, "step": 20695 }, { "epoch": 2.3053792181757435, "grad_norm": 0.14140474796295166, "learning_rate": 4.7449660978661384e-05, "loss": 0.4632, "num_input_tokens_seen": 25113232, "step": 20700 }, { "epoch": 2.3059360730593608, "grad_norm": 0.1367003321647644, "learning_rate": 4.744752227290566e-05, "loss": 0.4588, "num_input_tokens_seen": 25119664, "step": 20705 }, { "epoch": 2.306492927942978, "grad_norm": 0.1767776906490326, "learning_rate": 4.744538271901062e-05, "loss": 0.4659, "num_input_tokens_seen": 25125648, "step": 20710 }, { "epoch": 2.3070497828265952, "grad_norm": 0.12702393531799316, "learning_rate": 4.7443242317057085e-05, "loss": 0.4682, "num_input_tokens_seen": 25131664, "step": 20715 }, { "epoch": 2.307606637710213, "grad_norm": 0.13956530392169952, "learning_rate": 4.744110106712592e-05, "loss": 0.4561, "num_input_tokens_seen": 25137584, "step": 20720 }, { "epoch": 2.30816349259383, "grad_norm": 0.13738781213760376, "learning_rate": 4.743895896929806e-05, "loss": 0.4686, "num_input_tokens_seen": 25143600, "step": 20725 }, { "epoch": 2.3087203474774474, "grad_norm": 0.09902416914701462, "learning_rate": 4.7436816023654415e-05, "loss": 0.4762, "num_input_tokens_seen": 25149616, "step": 20730 }, { "epoch": 2.3092772023610646, "grad_norm": 0.13744240999221802, "learning_rate": 4.7434672230275954e-05, "loss": 0.4621, "num_input_tokens_seen": 25155632, "step": 20735 }, { "epoch": 2.3098340572446823, "grad_norm": 0.22811147570610046, "learning_rate": 4.7432527589243685e-05, "loss": 0.4645, "num_input_tokens_seen": 25161712, "step": 20740 }, { "epoch": 2.3103909121282995, "grad_norm": 0.12860986590385437, "learning_rate": 4.743038210063863e-05, "loss": 0.4856, "num_input_tokens_seen": 25167760, "step": 20745 }, { "epoch": 2.3109477670119167, "grad_norm": 0.17240583896636963, "learning_rate": 4.742823576454186e-05, "loss": 0.4656, "num_input_tokens_seen": 25173808, "step": 20750 }, { "epoch": 2.311504621895534, "grad_norm": 0.14104250073432922, "learning_rate": 4.742608858103447e-05, "loss": 0.4663, "num_input_tokens_seen": 25180304, "step": 20755 }, { "epoch": 2.312061476779151, "grad_norm": 0.14418154954910278, "learning_rate": 4.742394055019759e-05, "loss": 0.4556, "num_input_tokens_seen": 25186096, "step": 20760 }, { "epoch": 2.312618331662769, "grad_norm": 0.17387157678604126, "learning_rate": 4.742179167211237e-05, "loss": 0.4689, "num_input_tokens_seen": 25192272, "step": 20765 }, { "epoch": 2.313175186546386, "grad_norm": 0.12910817563533783, "learning_rate": 4.741964194686e-05, "loss": 0.4563, "num_input_tokens_seen": 25198192, "step": 20770 }, { "epoch": 2.3137320414300033, "grad_norm": 0.1312987059354782, "learning_rate": 4.7417491374521716e-05, "loss": 0.4683, "num_input_tokens_seen": 25204112, "step": 20775 }, { "epoch": 2.3142888963136206, "grad_norm": 0.13620838522911072, "learning_rate": 4.7415339955178766e-05, "loss": 0.4578, "num_input_tokens_seen": 25210320, "step": 20780 }, { "epoch": 2.314845751197238, "grad_norm": 0.14490731060504913, "learning_rate": 4.741318768891244e-05, "loss": 0.4683, "num_input_tokens_seen": 25216592, "step": 20785 }, { "epoch": 2.3154026060808555, "grad_norm": 0.14930129051208496, "learning_rate": 4.7411034575804057e-05, "loss": 0.4663, "num_input_tokens_seen": 25222992, "step": 20790 }, { "epoch": 2.3159594609644727, "grad_norm": 0.1405935287475586, "learning_rate": 4.740888061593497e-05, "loss": 0.4691, "num_input_tokens_seen": 25228976, "step": 20795 }, { "epoch": 2.31651631584809, "grad_norm": 0.1790657639503479, "learning_rate": 4.740672580938656e-05, "loss": 0.4755, "num_input_tokens_seen": 25235248, "step": 20800 }, { "epoch": 2.317073170731707, "grad_norm": 0.14789815247058868, "learning_rate": 4.7404570156240236e-05, "loss": 0.4702, "num_input_tokens_seen": 25241648, "step": 20805 }, { "epoch": 2.317630025615325, "grad_norm": 0.12251561135053635, "learning_rate": 4.740241365657746e-05, "loss": 0.4491, "num_input_tokens_seen": 25247600, "step": 20810 }, { "epoch": 2.318186880498942, "grad_norm": 0.1700904369354248, "learning_rate": 4.7400256310479704e-05, "loss": 0.4631, "num_input_tokens_seen": 25254064, "step": 20815 }, { "epoch": 2.3187437353825593, "grad_norm": 0.13769486546516418, "learning_rate": 4.7398098118028475e-05, "loss": 0.465, "num_input_tokens_seen": 25260240, "step": 20820 }, { "epoch": 2.3193005902661765, "grad_norm": 0.13698329031467438, "learning_rate": 4.739593907930532e-05, "loss": 0.4812, "num_input_tokens_seen": 25266384, "step": 20825 }, { "epoch": 2.319857445149794, "grad_norm": 0.16562402248382568, "learning_rate": 4.739377919439182e-05, "loss": 0.4584, "num_input_tokens_seen": 25272272, "step": 20830 }, { "epoch": 2.3204143000334114, "grad_norm": 0.12626944482326508, "learning_rate": 4.739161846336958e-05, "loss": 0.4652, "num_input_tokens_seen": 25278480, "step": 20835 }, { "epoch": 2.3209711549170287, "grad_norm": 0.1321791559457779, "learning_rate": 4.738945688632023e-05, "loss": 0.4665, "num_input_tokens_seen": 25284496, "step": 20840 }, { "epoch": 2.321528009800646, "grad_norm": 0.1534038782119751, "learning_rate": 4.7387294463325446e-05, "loss": 0.4553, "num_input_tokens_seen": 25289968, "step": 20845 }, { "epoch": 2.322084864684263, "grad_norm": 0.17441067099571228, "learning_rate": 4.7385131194466946e-05, "loss": 0.4673, "num_input_tokens_seen": 25296112, "step": 20850 }, { "epoch": 2.322641719567881, "grad_norm": 0.11747933179140091, "learning_rate": 4.7382967079826445e-05, "loss": 0.4812, "num_input_tokens_seen": 25302320, "step": 20855 }, { "epoch": 2.323198574451498, "grad_norm": 0.1419481337070465, "learning_rate": 4.738080211948572e-05, "loss": 0.4559, "num_input_tokens_seen": 25308752, "step": 20860 }, { "epoch": 2.3237554293351153, "grad_norm": 0.16762183606624603, "learning_rate": 4.737863631352656e-05, "loss": 0.4723, "num_input_tokens_seen": 25314768, "step": 20865 }, { "epoch": 2.3243122842187325, "grad_norm": 0.13458888232707977, "learning_rate": 4.737646966203081e-05, "loss": 0.4631, "num_input_tokens_seen": 25320848, "step": 20870 }, { "epoch": 2.3248691391023497, "grad_norm": 0.1639564037322998, "learning_rate": 4.737430216508033e-05, "loss": 0.4718, "num_input_tokens_seen": 25327120, "step": 20875 }, { "epoch": 2.3254259939859674, "grad_norm": 0.11700136214494705, "learning_rate": 4.7372133822757014e-05, "loss": 0.4656, "num_input_tokens_seen": 25333296, "step": 20880 }, { "epoch": 2.3259828488695846, "grad_norm": 0.135961651802063, "learning_rate": 4.736996463514278e-05, "loss": 0.4441, "num_input_tokens_seen": 25339664, "step": 20885 }, { "epoch": 2.326539703753202, "grad_norm": 0.12822964787483215, "learning_rate": 4.736779460231961e-05, "loss": 0.4702, "num_input_tokens_seen": 25346032, "step": 20890 }, { "epoch": 2.327096558636819, "grad_norm": 0.12012124806642532, "learning_rate": 4.736562372436946e-05, "loss": 0.4567, "num_input_tokens_seen": 25352016, "step": 20895 }, { "epoch": 2.3276534135204368, "grad_norm": 0.26616567373275757, "learning_rate": 4.736345200137437e-05, "loss": 0.4615, "num_input_tokens_seen": 25357488, "step": 20900 }, { "epoch": 2.328210268404054, "grad_norm": 0.17909176647663116, "learning_rate": 4.7361279433416404e-05, "loss": 0.4509, "num_input_tokens_seen": 25363440, "step": 20905 }, { "epoch": 2.328767123287671, "grad_norm": 0.1303715705871582, "learning_rate": 4.735910602057765e-05, "loss": 0.4528, "num_input_tokens_seen": 25369648, "step": 20910 }, { "epoch": 2.3293239781712884, "grad_norm": 0.14173613488674164, "learning_rate": 4.7356931762940206e-05, "loss": 0.47, "num_input_tokens_seen": 25375472, "step": 20915 }, { "epoch": 2.329880833054906, "grad_norm": 0.12779194116592407, "learning_rate": 4.735475666058623e-05, "loss": 0.4895, "num_input_tokens_seen": 25381520, "step": 20920 }, { "epoch": 2.3304376879385234, "grad_norm": 0.22758881747722626, "learning_rate": 4.735258071359792e-05, "loss": 0.4632, "num_input_tokens_seen": 25387024, "step": 20925 }, { "epoch": 2.3309945428221406, "grad_norm": 0.18499475717544556, "learning_rate": 4.7350403922057476e-05, "loss": 0.473, "num_input_tokens_seen": 25393328, "step": 20930 }, { "epoch": 2.331551397705758, "grad_norm": 0.1887214481830597, "learning_rate": 4.734822628604714e-05, "loss": 0.4526, "num_input_tokens_seen": 25399344, "step": 20935 }, { "epoch": 2.332108252589375, "grad_norm": 0.19979733228683472, "learning_rate": 4.734604780564919e-05, "loss": 0.4715, "num_input_tokens_seen": 25405584, "step": 20940 }, { "epoch": 2.3326651074729927, "grad_norm": 0.16601935029029846, "learning_rate": 4.7343868480945956e-05, "loss": 0.4687, "num_input_tokens_seen": 25411408, "step": 20945 }, { "epoch": 2.33322196235661, "grad_norm": 0.159181609749794, "learning_rate": 4.734168831201976e-05, "loss": 0.4714, "num_input_tokens_seen": 25417616, "step": 20950 }, { "epoch": 2.333778817240227, "grad_norm": 0.16412390768527985, "learning_rate": 4.733950729895299e-05, "loss": 0.4769, "num_input_tokens_seen": 25423728, "step": 20955 }, { "epoch": 2.3343356721238444, "grad_norm": 0.13551539182662964, "learning_rate": 4.733732544182803e-05, "loss": 0.4691, "num_input_tokens_seen": 25429680, "step": 20960 }, { "epoch": 2.3348925270074616, "grad_norm": 0.17697487771511078, "learning_rate": 4.7335142740727346e-05, "loss": 0.4706, "num_input_tokens_seen": 25435632, "step": 20965 }, { "epoch": 2.3354493818910793, "grad_norm": 0.21848218142986298, "learning_rate": 4.733295919573339e-05, "loss": 0.4829, "num_input_tokens_seen": 25441872, "step": 20970 }, { "epoch": 2.3360062367746965, "grad_norm": 0.1235026866197586, "learning_rate": 4.7330774806928654e-05, "loss": 0.435, "num_input_tokens_seen": 25448496, "step": 20975 }, { "epoch": 2.3365630916583138, "grad_norm": 0.1369198113679886, "learning_rate": 4.732858957439569e-05, "loss": 0.4565, "num_input_tokens_seen": 25454032, "step": 20980 }, { "epoch": 2.337119946541931, "grad_norm": 0.1433017998933792, "learning_rate": 4.7326403498217064e-05, "loss": 0.4505, "num_input_tokens_seen": 25460048, "step": 20985 }, { "epoch": 2.3376768014255487, "grad_norm": 0.12516635656356812, "learning_rate": 4.732421657847536e-05, "loss": 0.4735, "num_input_tokens_seen": 25465776, "step": 20990 }, { "epoch": 2.338233656309166, "grad_norm": 0.16071532666683197, "learning_rate": 4.732202881525321e-05, "loss": 0.4709, "num_input_tokens_seen": 25472016, "step": 20995 }, { "epoch": 2.338790511192783, "grad_norm": 0.1697724610567093, "learning_rate": 4.731984020863328e-05, "loss": 0.4546, "num_input_tokens_seen": 25478224, "step": 21000 }, { "epoch": 2.3393473660764004, "grad_norm": 0.12951834499835968, "learning_rate": 4.731765075869825e-05, "loss": 0.4596, "num_input_tokens_seen": 25484368, "step": 21005 }, { "epoch": 2.339904220960018, "grad_norm": 0.16098883748054504, "learning_rate": 4.731546046553086e-05, "loss": 0.4666, "num_input_tokens_seen": 25490288, "step": 21010 }, { "epoch": 2.3404610758436353, "grad_norm": 0.17307257652282715, "learning_rate": 4.7313269329213865e-05, "loss": 0.4657, "num_input_tokens_seen": 25496336, "step": 21015 }, { "epoch": 2.3410179307272525, "grad_norm": 0.1371065229177475, "learning_rate": 4.731107734983005e-05, "loss": 0.4563, "num_input_tokens_seen": 25502608, "step": 21020 }, { "epoch": 2.3415747856108697, "grad_norm": 0.17819122970104218, "learning_rate": 4.730888452746223e-05, "loss": 0.4529, "num_input_tokens_seen": 25508688, "step": 21025 }, { "epoch": 2.342131640494487, "grad_norm": 0.2488754689693451, "learning_rate": 4.7306690862193256e-05, "loss": 0.4635, "num_input_tokens_seen": 25514960, "step": 21030 }, { "epoch": 2.3426884953781046, "grad_norm": 0.14534138143062592, "learning_rate": 4.730449635410602e-05, "loss": 0.4555, "num_input_tokens_seen": 25520976, "step": 21035 }, { "epoch": 2.343245350261722, "grad_norm": 0.163502037525177, "learning_rate": 4.730230100328343e-05, "loss": 0.4663, "num_input_tokens_seen": 25527248, "step": 21040 }, { "epoch": 2.343802205145339, "grad_norm": 0.14955736696720123, "learning_rate": 4.730010480980844e-05, "loss": 0.4625, "num_input_tokens_seen": 25533488, "step": 21045 }, { "epoch": 2.3443590600289563, "grad_norm": 0.1557692140340805, "learning_rate": 4.729790777376402e-05, "loss": 0.4621, "num_input_tokens_seen": 25539472, "step": 21050 }, { "epoch": 2.3449159149125736, "grad_norm": 0.1418190449476242, "learning_rate": 4.72957098952332e-05, "loss": 0.4784, "num_input_tokens_seen": 25545488, "step": 21055 }, { "epoch": 2.3454727697961912, "grad_norm": 0.13325540721416473, "learning_rate": 4.7293511174299e-05, "loss": 0.4662, "num_input_tokens_seen": 25551632, "step": 21060 }, { "epoch": 2.3460296246798085, "grad_norm": 0.14880472421646118, "learning_rate": 4.729131161104451e-05, "loss": 0.4611, "num_input_tokens_seen": 25556592, "step": 21065 }, { "epoch": 2.3465864795634257, "grad_norm": 0.14705775678157806, "learning_rate": 4.728911120555283e-05, "loss": 0.4736, "num_input_tokens_seen": 25562576, "step": 21070 }, { "epoch": 2.347143334447043, "grad_norm": 0.2064056694507599, "learning_rate": 4.728690995790709e-05, "loss": 0.4806, "num_input_tokens_seen": 25568432, "step": 21075 }, { "epoch": 2.3477001893306606, "grad_norm": 0.2037077397108078, "learning_rate": 4.728470786819048e-05, "loss": 0.4764, "num_input_tokens_seen": 25574608, "step": 21080 }, { "epoch": 2.348257044214278, "grad_norm": 0.17548446357250214, "learning_rate": 4.728250493648619e-05, "loss": 0.4534, "num_input_tokens_seen": 25580624, "step": 21085 }, { "epoch": 2.348813899097895, "grad_norm": 0.18741142749786377, "learning_rate": 4.7280301162877454e-05, "loss": 0.4729, "num_input_tokens_seen": 25586576, "step": 21090 }, { "epoch": 2.3493707539815123, "grad_norm": 0.15304875373840332, "learning_rate": 4.7278096547447544e-05, "loss": 0.4692, "num_input_tokens_seen": 25592560, "step": 21095 }, { "epoch": 2.34992760886513, "grad_norm": 0.1888812929391861, "learning_rate": 4.727589109027975e-05, "loss": 0.4602, "num_input_tokens_seen": 25598576, "step": 21100 }, { "epoch": 2.350484463748747, "grad_norm": 0.16899147629737854, "learning_rate": 4.72736847914574e-05, "loss": 0.4652, "num_input_tokens_seen": 25604816, "step": 21105 }, { "epoch": 2.3510413186323644, "grad_norm": 0.12013541907072067, "learning_rate": 4.7271477651063865e-05, "loss": 0.4574, "num_input_tokens_seen": 25611152, "step": 21110 }, { "epoch": 2.3515981735159817, "grad_norm": 0.17700912058353424, "learning_rate": 4.7269269669182524e-05, "loss": 0.4686, "num_input_tokens_seen": 25617296, "step": 21115 }, { "epoch": 2.352155028399599, "grad_norm": 0.14784039556980133, "learning_rate": 4.7267060845896816e-05, "loss": 0.4704, "num_input_tokens_seen": 25623248, "step": 21120 }, { "epoch": 2.3527118832832166, "grad_norm": 0.1292901486158371, "learning_rate": 4.726485118129019e-05, "loss": 0.4666, "num_input_tokens_seen": 25629360, "step": 21125 }, { "epoch": 2.353268738166834, "grad_norm": 0.1741081178188324, "learning_rate": 4.726264067544614e-05, "loss": 0.456, "num_input_tokens_seen": 25634928, "step": 21130 }, { "epoch": 2.353825593050451, "grad_norm": 0.13560765981674194, "learning_rate": 4.726042932844818e-05, "loss": 0.444, "num_input_tokens_seen": 25640912, "step": 21135 }, { "epoch": 2.3543824479340683, "grad_norm": 0.16345030069351196, "learning_rate": 4.7258217140379855e-05, "loss": 0.4537, "num_input_tokens_seen": 25646800, "step": 21140 }, { "epoch": 2.3549393028176855, "grad_norm": 0.1862621307373047, "learning_rate": 4.725600411132476e-05, "loss": 0.4824, "num_input_tokens_seen": 25652816, "step": 21145 }, { "epoch": 2.355496157701303, "grad_norm": 0.15230032801628113, "learning_rate": 4.725379024136651e-05, "loss": 0.4602, "num_input_tokens_seen": 25658992, "step": 21150 }, { "epoch": 2.3560530125849204, "grad_norm": 0.1501551866531372, "learning_rate": 4.725157553058874e-05, "loss": 0.459, "num_input_tokens_seen": 25664848, "step": 21155 }, { "epoch": 2.3566098674685376, "grad_norm": 0.14396440982818604, "learning_rate": 4.724935997907515e-05, "loss": 0.4585, "num_input_tokens_seen": 25671184, "step": 21160 }, { "epoch": 2.357166722352155, "grad_norm": 0.12624433636665344, "learning_rate": 4.724714358690943e-05, "loss": 0.472, "num_input_tokens_seen": 25677328, "step": 21165 }, { "epoch": 2.3577235772357725, "grad_norm": 0.10523203760385513, "learning_rate": 4.724492635417533e-05, "loss": 0.4602, "num_input_tokens_seen": 25683504, "step": 21170 }, { "epoch": 2.3582804321193898, "grad_norm": 0.13673993945121765, "learning_rate": 4.724270828095664e-05, "loss": 0.4728, "num_input_tokens_seen": 25689680, "step": 21175 }, { "epoch": 2.358837287003007, "grad_norm": 0.1540449559688568, "learning_rate": 4.7240489367337135e-05, "loss": 0.4675, "num_input_tokens_seen": 25695408, "step": 21180 }, { "epoch": 2.359394141886624, "grad_norm": 0.13545815646648407, "learning_rate": 4.7238269613400674e-05, "loss": 0.4638, "num_input_tokens_seen": 25701232, "step": 21185 }, { "epoch": 2.359950996770242, "grad_norm": 0.1688539981842041, "learning_rate": 4.723604901923112e-05, "loss": 0.4575, "num_input_tokens_seen": 25707184, "step": 21190 }, { "epoch": 2.360507851653859, "grad_norm": 0.15816666185855865, "learning_rate": 4.723382758491237e-05, "loss": 0.464, "num_input_tokens_seen": 25713264, "step": 21195 }, { "epoch": 2.3610647065374764, "grad_norm": 0.140843003988266, "learning_rate": 4.723160531052837e-05, "loss": 0.4665, "num_input_tokens_seen": 25718672, "step": 21200 }, { "epoch": 2.3616215614210936, "grad_norm": 0.1602770835161209, "learning_rate": 4.7229382196163076e-05, "loss": 0.4619, "num_input_tokens_seen": 25724624, "step": 21205 }, { "epoch": 2.362178416304711, "grad_norm": 0.1084374338388443, "learning_rate": 4.722715824190048e-05, "loss": 0.446, "num_input_tokens_seen": 25731024, "step": 21210 }, { "epoch": 2.3627352711883285, "grad_norm": 0.19599059224128723, "learning_rate": 4.722493344782462e-05, "loss": 0.461, "num_input_tokens_seen": 25737040, "step": 21215 }, { "epoch": 2.3632921260719457, "grad_norm": 0.13477428257465363, "learning_rate": 4.7222707814019544e-05, "loss": 0.4612, "num_input_tokens_seen": 25743184, "step": 21220 }, { "epoch": 2.363848980955563, "grad_norm": 0.135838583111763, "learning_rate": 4.722048134056936e-05, "loss": 0.4623, "num_input_tokens_seen": 25749040, "step": 21225 }, { "epoch": 2.36440583583918, "grad_norm": 0.1178923100233078, "learning_rate": 4.721825402755817e-05, "loss": 0.4776, "num_input_tokens_seen": 25755408, "step": 21230 }, { "epoch": 2.3649626907227974, "grad_norm": 0.16066090762615204, "learning_rate": 4.7216025875070156e-05, "loss": 0.4666, "num_input_tokens_seen": 25761264, "step": 21235 }, { "epoch": 2.365519545606415, "grad_norm": 0.12255978584289551, "learning_rate": 4.7213796883189485e-05, "loss": 0.4517, "num_input_tokens_seen": 25767216, "step": 21240 }, { "epoch": 2.3660764004900323, "grad_norm": 0.13887181878089905, "learning_rate": 4.721156705200037e-05, "loss": 0.4702, "num_input_tokens_seen": 25773200, "step": 21245 }, { "epoch": 2.3666332553736495, "grad_norm": 0.25757741928100586, "learning_rate": 4.720933638158708e-05, "loss": 0.4666, "num_input_tokens_seen": 25779120, "step": 21250 }, { "epoch": 2.3671901102572668, "grad_norm": 0.1700187772512436, "learning_rate": 4.720710487203389e-05, "loss": 0.4723, "num_input_tokens_seen": 25785424, "step": 21255 }, { "epoch": 2.3677469651408845, "grad_norm": 0.22351714968681335, "learning_rate": 4.720487252342511e-05, "loss": 0.4478, "num_input_tokens_seen": 25791728, "step": 21260 }, { "epoch": 2.3683038200245017, "grad_norm": 0.20148500800132751, "learning_rate": 4.720263933584508e-05, "loss": 0.4834, "num_input_tokens_seen": 25797488, "step": 21265 }, { "epoch": 2.368860674908119, "grad_norm": 0.1376333385705948, "learning_rate": 4.720040530937819e-05, "loss": 0.4608, "num_input_tokens_seen": 25803888, "step": 21270 }, { "epoch": 2.369417529791736, "grad_norm": 0.1467357724905014, "learning_rate": 4.7198170444108844e-05, "loss": 0.4594, "num_input_tokens_seen": 25809648, "step": 21275 }, { "epoch": 2.369974384675354, "grad_norm": 0.13398301601409912, "learning_rate": 4.7195934740121485e-05, "loss": 0.4707, "num_input_tokens_seen": 25815568, "step": 21280 }, { "epoch": 2.370531239558971, "grad_norm": 0.1269829422235489, "learning_rate": 4.719369819750057e-05, "loss": 0.4689, "num_input_tokens_seen": 25821648, "step": 21285 }, { "epoch": 2.3710880944425883, "grad_norm": 0.12563082575798035, "learning_rate": 4.7191460816330626e-05, "loss": 0.4626, "num_input_tokens_seen": 25827824, "step": 21290 }, { "epoch": 2.3716449493262055, "grad_norm": 0.21004822850227356, "learning_rate": 4.718922259669617e-05, "loss": 0.4585, "num_input_tokens_seen": 25833648, "step": 21295 }, { "epoch": 2.3722018042098227, "grad_norm": 0.1979651302099228, "learning_rate": 4.718698353868177e-05, "loss": 0.4663, "num_input_tokens_seen": 25840048, "step": 21300 }, { "epoch": 2.3727586590934404, "grad_norm": 0.17165593802928925, "learning_rate": 4.718474364237204e-05, "loss": 0.4719, "num_input_tokens_seen": 25846032, "step": 21305 }, { "epoch": 2.3733155139770576, "grad_norm": 0.1824682056903839, "learning_rate": 4.7182502907851597e-05, "loss": 0.4554, "num_input_tokens_seen": 25852208, "step": 21310 }, { "epoch": 2.373872368860675, "grad_norm": 0.19875596463680267, "learning_rate": 4.71802613352051e-05, "loss": 0.4735, "num_input_tokens_seen": 25857648, "step": 21315 }, { "epoch": 2.374429223744292, "grad_norm": 0.1858801394701004, "learning_rate": 4.717801892451726e-05, "loss": 0.4463, "num_input_tokens_seen": 25863952, "step": 21320 }, { "epoch": 2.3749860786279093, "grad_norm": 0.13447138667106628, "learning_rate": 4.7175775675872784e-05, "loss": 0.4666, "num_input_tokens_seen": 25869936, "step": 21325 }, { "epoch": 2.375542933511527, "grad_norm": 0.10955314338207245, "learning_rate": 4.717353158935645e-05, "loss": 0.47, "num_input_tokens_seen": 25876272, "step": 21330 }, { "epoch": 2.3760997883951442, "grad_norm": 0.14153288304805756, "learning_rate": 4.7171286665053016e-05, "loss": 0.4769, "num_input_tokens_seen": 25882320, "step": 21335 }, { "epoch": 2.3766566432787615, "grad_norm": 0.16420523822307587, "learning_rate": 4.7169040903047336e-05, "loss": 0.4615, "num_input_tokens_seen": 25888016, "step": 21340 }, { "epoch": 2.3772134981623787, "grad_norm": 0.13499532639980316, "learning_rate": 4.716679430342423e-05, "loss": 0.4704, "num_input_tokens_seen": 25894288, "step": 21345 }, { "epoch": 2.3777703530459964, "grad_norm": 0.14550594985485077, "learning_rate": 4.716454686626861e-05, "loss": 0.4631, "num_input_tokens_seen": 25900592, "step": 21350 }, { "epoch": 2.3783272079296136, "grad_norm": 0.1545245349407196, "learning_rate": 4.7162298591665384e-05, "loss": 0.4696, "num_input_tokens_seen": 25906992, "step": 21355 }, { "epoch": 2.378884062813231, "grad_norm": 0.14107108116149902, "learning_rate": 4.716004947969949e-05, "loss": 0.4683, "num_input_tokens_seen": 25913040, "step": 21360 }, { "epoch": 2.379440917696848, "grad_norm": 0.1339028775691986, "learning_rate": 4.71577995304559e-05, "loss": 0.4617, "num_input_tokens_seen": 25919312, "step": 21365 }, { "epoch": 2.3799977725804657, "grad_norm": 0.08308961242437363, "learning_rate": 4.715554874401965e-05, "loss": 0.4736, "num_input_tokens_seen": 25925424, "step": 21370 }, { "epoch": 2.380554627464083, "grad_norm": 0.14915530383586884, "learning_rate": 4.7153297120475766e-05, "loss": 0.4668, "num_input_tokens_seen": 25931568, "step": 21375 }, { "epoch": 2.3811114823477, "grad_norm": 0.16607724130153656, "learning_rate": 4.715104465990932e-05, "loss": 0.4517, "num_input_tokens_seen": 25937776, "step": 21380 }, { "epoch": 2.3816683372313174, "grad_norm": 0.11335878074169159, "learning_rate": 4.714879136240542e-05, "loss": 0.4776, "num_input_tokens_seen": 25943280, "step": 21385 }, { "epoch": 2.3822251921149347, "grad_norm": 0.11512992531061172, "learning_rate": 4.71465372280492e-05, "loss": 0.4655, "num_input_tokens_seen": 25949616, "step": 21390 }, { "epoch": 2.3827820469985523, "grad_norm": 0.1204194501042366, "learning_rate": 4.7144282256925834e-05, "loss": 0.4544, "num_input_tokens_seen": 25955408, "step": 21395 }, { "epoch": 2.3833389018821696, "grad_norm": 0.14947234094142914, "learning_rate": 4.7142026449120526e-05, "loss": 0.46, "num_input_tokens_seen": 25961648, "step": 21400 }, { "epoch": 2.383895756765787, "grad_norm": 0.16918382048606873, "learning_rate": 4.713976980471849e-05, "loss": 0.4651, "num_input_tokens_seen": 25967856, "step": 21405 }, { "epoch": 2.384452611649404, "grad_norm": 0.1761907935142517, "learning_rate": 4.713751232380501e-05, "loss": 0.4645, "num_input_tokens_seen": 25973744, "step": 21410 }, { "epoch": 2.3850094665330213, "grad_norm": 0.1933089941740036, "learning_rate": 4.7135254006465365e-05, "loss": 0.456, "num_input_tokens_seen": 25979824, "step": 21415 }, { "epoch": 2.385566321416639, "grad_norm": 0.13092942535877228, "learning_rate": 4.713299485278489e-05, "loss": 0.4641, "num_input_tokens_seen": 25985872, "step": 21420 }, { "epoch": 2.386123176300256, "grad_norm": 0.13816195726394653, "learning_rate": 4.713073486284894e-05, "loss": 0.4646, "num_input_tokens_seen": 25991856, "step": 21425 }, { "epoch": 2.3866800311838734, "grad_norm": 0.23441551625728607, "learning_rate": 4.712847403674291e-05, "loss": 0.4645, "num_input_tokens_seen": 25998096, "step": 21430 }, { "epoch": 2.3872368860674906, "grad_norm": 0.16393494606018066, "learning_rate": 4.712621237455221e-05, "loss": 0.4815, "num_input_tokens_seen": 26004208, "step": 21435 }, { "epoch": 2.3877937409511083, "grad_norm": 0.10583033412694931, "learning_rate": 4.71239498763623e-05, "loss": 0.4665, "num_input_tokens_seen": 26010544, "step": 21440 }, { "epoch": 2.3883505958347255, "grad_norm": 0.14227192103862762, "learning_rate": 4.712168654225867e-05, "loss": 0.4653, "num_input_tokens_seen": 26016816, "step": 21445 }, { "epoch": 2.3889074507183428, "grad_norm": 0.15946142375469208, "learning_rate": 4.7119422372326825e-05, "loss": 0.4654, "num_input_tokens_seen": 26023152, "step": 21450 }, { "epoch": 2.38946430560196, "grad_norm": 0.13658052682876587, "learning_rate": 4.711715736665232e-05, "loss": 0.4704, "num_input_tokens_seen": 26028976, "step": 21455 }, { "epoch": 2.3900211604855777, "grad_norm": 0.11379386484622955, "learning_rate": 4.711489152532073e-05, "loss": 0.4545, "num_input_tokens_seen": 26034608, "step": 21460 }, { "epoch": 2.390578015369195, "grad_norm": 0.09007730334997177, "learning_rate": 4.711262484841766e-05, "loss": 0.4696, "num_input_tokens_seen": 26040368, "step": 21465 }, { "epoch": 2.391134870252812, "grad_norm": 0.13972200453281403, "learning_rate": 4.7110357336028776e-05, "loss": 0.4717, "num_input_tokens_seen": 26046480, "step": 21470 }, { "epoch": 2.3916917251364294, "grad_norm": 0.13047143816947937, "learning_rate": 4.710808898823973e-05, "loss": 0.4706, "num_input_tokens_seen": 26052240, "step": 21475 }, { "epoch": 2.3922485800200466, "grad_norm": 0.14703144133090973, "learning_rate": 4.710581980513623e-05, "loss": 0.4707, "num_input_tokens_seen": 26058032, "step": 21480 }, { "epoch": 2.3928054349036643, "grad_norm": 0.15709568560123444, "learning_rate": 4.7103549786804014e-05, "loss": 0.4632, "num_input_tokens_seen": 26063952, "step": 21485 }, { "epoch": 2.3933622897872815, "grad_norm": 0.14753688871860504, "learning_rate": 4.7101278933328856e-05, "loss": 0.4557, "num_input_tokens_seen": 26070096, "step": 21490 }, { "epoch": 2.3939191446708987, "grad_norm": 0.12443963438272476, "learning_rate": 4.709900724479656e-05, "loss": 0.463, "num_input_tokens_seen": 26076208, "step": 21495 }, { "epoch": 2.394475999554516, "grad_norm": 0.12166588753461838, "learning_rate": 4.709673472129293e-05, "loss": 0.4567, "num_input_tokens_seen": 26081808, "step": 21500 }, { "epoch": 2.395032854438133, "grad_norm": 0.1505039483308792, "learning_rate": 4.709446136290387e-05, "loss": 0.4619, "num_input_tokens_seen": 26088080, "step": 21505 }, { "epoch": 2.395589709321751, "grad_norm": 0.16535133123397827, "learning_rate": 4.709218716971525e-05, "loss": 0.4871, "num_input_tokens_seen": 26094256, "step": 21510 }, { "epoch": 2.396146564205368, "grad_norm": 0.1395493596792221, "learning_rate": 4.7089912141813e-05, "loss": 0.4659, "num_input_tokens_seen": 26099664, "step": 21515 }, { "epoch": 2.3967034190889853, "grad_norm": 0.13344614207744598, "learning_rate": 4.708763627928307e-05, "loss": 0.4738, "num_input_tokens_seen": 26105424, "step": 21520 }, { "epoch": 2.3972602739726026, "grad_norm": 0.12087921798229218, "learning_rate": 4.708535958221147e-05, "loss": 0.4623, "num_input_tokens_seen": 26111344, "step": 21525 }, { "epoch": 2.3978171288562202, "grad_norm": 0.12725436687469482, "learning_rate": 4.70830820506842e-05, "loss": 0.4781, "num_input_tokens_seen": 26117488, "step": 21530 }, { "epoch": 2.3983739837398375, "grad_norm": 0.26373180747032166, "learning_rate": 4.708080368478732e-05, "loss": 0.4534, "num_input_tokens_seen": 26123856, "step": 21535 }, { "epoch": 2.3989308386234547, "grad_norm": 0.140624538064003, "learning_rate": 4.707852448460693e-05, "loss": 0.4704, "num_input_tokens_seen": 26129840, "step": 21540 }, { "epoch": 2.399487693507072, "grad_norm": 0.10549548268318176, "learning_rate": 4.7076244450229117e-05, "loss": 0.4597, "num_input_tokens_seen": 26135952, "step": 21545 }, { "epoch": 2.4000445483906896, "grad_norm": 0.13096043467521667, "learning_rate": 4.707396358174005e-05, "loss": 0.4542, "num_input_tokens_seen": 26142448, "step": 21550 }, { "epoch": 2.400601403274307, "grad_norm": 0.1792604774236679, "learning_rate": 4.707168187922589e-05, "loss": 0.4607, "num_input_tokens_seen": 26149232, "step": 21555 }, { "epoch": 2.401158258157924, "grad_norm": 0.13894149661064148, "learning_rate": 4.7069399342772866e-05, "loss": 0.4747, "num_input_tokens_seen": 26155376, "step": 21560 }, { "epoch": 2.4017151130415413, "grad_norm": 0.15540386736392975, "learning_rate": 4.706711597246721e-05, "loss": 0.4628, "num_input_tokens_seen": 26161264, "step": 21565 }, { "epoch": 2.4022719679251585, "grad_norm": 0.19578330218791962, "learning_rate": 4.706483176839519e-05, "loss": 0.469, "num_input_tokens_seen": 26167600, "step": 21570 }, { "epoch": 2.402828822808776, "grad_norm": 0.19871585071086884, "learning_rate": 4.706254673064312e-05, "loss": 0.4583, "num_input_tokens_seen": 26173648, "step": 21575 }, { "epoch": 2.4033856776923934, "grad_norm": 0.11934928596019745, "learning_rate": 4.7060260859297325e-05, "loss": 0.4845, "num_input_tokens_seen": 26179888, "step": 21580 }, { "epoch": 2.4039425325760106, "grad_norm": 0.176024928689003, "learning_rate": 4.7057974154444184e-05, "loss": 0.4716, "num_input_tokens_seen": 26185712, "step": 21585 }, { "epoch": 2.404499387459628, "grad_norm": 0.2085103839635849, "learning_rate": 4.705568661617009e-05, "loss": 0.4785, "num_input_tokens_seen": 26192144, "step": 21590 }, { "epoch": 2.405056242343245, "grad_norm": 0.1200462356209755, "learning_rate": 4.705339824456147e-05, "loss": 0.4583, "num_input_tokens_seen": 26198640, "step": 21595 }, { "epoch": 2.405613097226863, "grad_norm": 0.1852162927389145, "learning_rate": 4.70511090397048e-05, "loss": 0.4614, "num_input_tokens_seen": 26204656, "step": 21600 }, { "epoch": 2.40616995211048, "grad_norm": 0.16085094213485718, "learning_rate": 4.704881900168656e-05, "loss": 0.4745, "num_input_tokens_seen": 26210832, "step": 21605 }, { "epoch": 2.4067268069940972, "grad_norm": 0.11288165301084518, "learning_rate": 4.7046528130593284e-05, "loss": 0.4771, "num_input_tokens_seen": 26217040, "step": 21610 }, { "epoch": 2.4072836618777145, "grad_norm": 0.1585851013660431, "learning_rate": 4.704423642651151e-05, "loss": 0.4625, "num_input_tokens_seen": 26223312, "step": 21615 }, { "epoch": 2.407840516761332, "grad_norm": 0.16912086308002472, "learning_rate": 4.704194388952785e-05, "loss": 0.4699, "num_input_tokens_seen": 26229520, "step": 21620 }, { "epoch": 2.4083973716449494, "grad_norm": 0.1179840937256813, "learning_rate": 4.703965051972891e-05, "loss": 0.473, "num_input_tokens_seen": 26235696, "step": 21625 }, { "epoch": 2.4089542265285666, "grad_norm": 0.1364699900150299, "learning_rate": 4.703735631720135e-05, "loss": 0.4589, "num_input_tokens_seen": 26241776, "step": 21630 }, { "epoch": 2.409511081412184, "grad_norm": 0.12409099191427231, "learning_rate": 4.703506128203184e-05, "loss": 0.4707, "num_input_tokens_seen": 26248016, "step": 21635 }, { "epoch": 2.4100679362958015, "grad_norm": 0.1311749815940857, "learning_rate": 4.7032765414307103e-05, "loss": 0.4794, "num_input_tokens_seen": 26254032, "step": 21640 }, { "epoch": 2.4106247911794187, "grad_norm": 0.1565682291984558, "learning_rate": 4.7030468714113876e-05, "loss": 0.4589, "num_input_tokens_seen": 26259984, "step": 21645 }, { "epoch": 2.411181646063036, "grad_norm": 0.12808562815189362, "learning_rate": 4.7028171181538946e-05, "loss": 0.4584, "num_input_tokens_seen": 26266320, "step": 21650 }, { "epoch": 2.411738500946653, "grad_norm": 0.12899160385131836, "learning_rate": 4.7025872816669117e-05, "loss": 0.4768, "num_input_tokens_seen": 26272336, "step": 21655 }, { "epoch": 2.4122953558302704, "grad_norm": 0.12331899255514145, "learning_rate": 4.702357361959122e-05, "loss": 0.456, "num_input_tokens_seen": 26278320, "step": 21660 }, { "epoch": 2.412852210713888, "grad_norm": 0.14150148630142212, "learning_rate": 4.702127359039213e-05, "loss": 0.4699, "num_input_tokens_seen": 26284400, "step": 21665 }, { "epoch": 2.4134090655975053, "grad_norm": 0.12141549587249756, "learning_rate": 4.701897272915877e-05, "loss": 0.4548, "num_input_tokens_seen": 26290512, "step": 21670 }, { "epoch": 2.4139659204811226, "grad_norm": 0.11447792500257492, "learning_rate": 4.7016671035978044e-05, "loss": 0.4668, "num_input_tokens_seen": 26296880, "step": 21675 }, { "epoch": 2.41452277536474, "grad_norm": 0.12817299365997314, "learning_rate": 4.701436851093694e-05, "loss": 0.4708, "num_input_tokens_seen": 26302864, "step": 21680 }, { "epoch": 2.415079630248357, "grad_norm": 0.11508198082447052, "learning_rate": 4.701206515412244e-05, "loss": 0.454, "num_input_tokens_seen": 26309008, "step": 21685 }, { "epoch": 2.4156364851319747, "grad_norm": 0.17222562432289124, "learning_rate": 4.7009760965621574e-05, "loss": 0.4606, "num_input_tokens_seen": 26314896, "step": 21690 }, { "epoch": 2.416193340015592, "grad_norm": 0.15161456167697906, "learning_rate": 4.700745594552141e-05, "loss": 0.4821, "num_input_tokens_seen": 26321264, "step": 21695 }, { "epoch": 2.416750194899209, "grad_norm": 0.20583847165107727, "learning_rate": 4.700515009390904e-05, "loss": 0.4555, "num_input_tokens_seen": 26327504, "step": 21700 }, { "epoch": 2.417307049782827, "grad_norm": 0.12412049621343613, "learning_rate": 4.700284341087157e-05, "loss": 0.4748, "num_input_tokens_seen": 26333584, "step": 21705 }, { "epoch": 2.417863904666444, "grad_norm": 0.13329505920410156, "learning_rate": 4.700053589649616e-05, "loss": 0.4597, "num_input_tokens_seen": 26339856, "step": 21710 }, { "epoch": 2.4184207595500613, "grad_norm": 0.15512114763259888, "learning_rate": 4.699822755087001e-05, "loss": 0.4564, "num_input_tokens_seen": 26345968, "step": 21715 }, { "epoch": 2.4189776144336785, "grad_norm": 0.1406044214963913, "learning_rate": 4.699591837408032e-05, "loss": 0.4721, "num_input_tokens_seen": 26351504, "step": 21720 }, { "epoch": 2.4195344693172958, "grad_norm": 0.12907803058624268, "learning_rate": 4.699360836621435e-05, "loss": 0.4797, "num_input_tokens_seen": 26357488, "step": 21725 }, { "epoch": 2.4200913242009134, "grad_norm": 0.14563260972499847, "learning_rate": 4.699129752735937e-05, "loss": 0.454, "num_input_tokens_seen": 26363728, "step": 21730 }, { "epoch": 2.4206481790845307, "grad_norm": 0.16310502588748932, "learning_rate": 4.69889858576027e-05, "loss": 0.4606, "num_input_tokens_seen": 26369616, "step": 21735 }, { "epoch": 2.421205033968148, "grad_norm": 0.13564565777778625, "learning_rate": 4.698667335703167e-05, "loss": 0.4636, "num_input_tokens_seen": 26375696, "step": 21740 }, { "epoch": 2.421761888851765, "grad_norm": 0.13124488294124603, "learning_rate": 4.698436002573366e-05, "loss": 0.4761, "num_input_tokens_seen": 26381808, "step": 21745 }, { "epoch": 2.4223187437353824, "grad_norm": 0.1413586288690567, "learning_rate": 4.698204586379609e-05, "loss": 0.483, "num_input_tokens_seen": 26388464, "step": 21750 }, { "epoch": 2.422875598619, "grad_norm": 0.11191586405038834, "learning_rate": 4.697973087130637e-05, "loss": 0.4546, "num_input_tokens_seen": 26394608, "step": 21755 }, { "epoch": 2.4234324535026173, "grad_norm": 0.11887827515602112, "learning_rate": 4.697741504835198e-05, "loss": 0.4516, "num_input_tokens_seen": 26400560, "step": 21760 }, { "epoch": 2.4239893083862345, "grad_norm": 0.12309285253286362, "learning_rate": 4.697509839502042e-05, "loss": 0.4521, "num_input_tokens_seen": 26406288, "step": 21765 }, { "epoch": 2.4245461632698517, "grad_norm": 0.13163600862026215, "learning_rate": 4.697278091139923e-05, "loss": 0.4649, "num_input_tokens_seen": 26412624, "step": 21770 }, { "epoch": 2.425103018153469, "grad_norm": 0.10784633457660675, "learning_rate": 4.697046259757595e-05, "loss": 0.4664, "num_input_tokens_seen": 26418672, "step": 21775 }, { "epoch": 2.4256598730370866, "grad_norm": 0.12761445343494415, "learning_rate": 4.696814345363819e-05, "loss": 0.47, "num_input_tokens_seen": 26424816, "step": 21780 }, { "epoch": 2.426216727920704, "grad_norm": 0.14501546323299408, "learning_rate": 4.696582347967357e-05, "loss": 0.4551, "num_input_tokens_seen": 26431120, "step": 21785 }, { "epoch": 2.426773582804321, "grad_norm": 0.19192790985107422, "learning_rate": 4.696350267576975e-05, "loss": 0.4293, "num_input_tokens_seen": 26437328, "step": 21790 }, { "epoch": 2.4273304376879388, "grad_norm": 0.16225816309452057, "learning_rate": 4.69611810420144e-05, "loss": 0.4566, "num_input_tokens_seen": 26443216, "step": 21795 }, { "epoch": 2.427887292571556, "grad_norm": 0.22818978130817413, "learning_rate": 4.695885857849527e-05, "loss": 0.4742, "num_input_tokens_seen": 26449392, "step": 21800 }, { "epoch": 2.4284441474551732, "grad_norm": 0.18554644286632538, "learning_rate": 4.695653528530008e-05, "loss": 0.491, "num_input_tokens_seen": 26455440, "step": 21805 }, { "epoch": 2.4290010023387905, "grad_norm": 0.12227799743413925, "learning_rate": 4.695421116251663e-05, "loss": 0.4692, "num_input_tokens_seen": 26461424, "step": 21810 }, { "epoch": 2.4295578572224077, "grad_norm": 0.17274542152881622, "learning_rate": 4.695188621023272e-05, "loss": 0.4563, "num_input_tokens_seen": 26467344, "step": 21815 }, { "epoch": 2.4301147121060254, "grad_norm": 0.16563007235527039, "learning_rate": 4.694956042853621e-05, "loss": 0.4754, "num_input_tokens_seen": 26473264, "step": 21820 }, { "epoch": 2.4306715669896426, "grad_norm": 0.13600149750709534, "learning_rate": 4.694723381751496e-05, "loss": 0.4943, "num_input_tokens_seen": 26479056, "step": 21825 }, { "epoch": 2.43122842187326, "grad_norm": 0.12062422931194305, "learning_rate": 4.694490637725688e-05, "loss": 0.4868, "num_input_tokens_seen": 26484816, "step": 21830 }, { "epoch": 2.431785276756877, "grad_norm": 0.13634902238845825, "learning_rate": 4.694257810784991e-05, "loss": 0.4785, "num_input_tokens_seen": 26490960, "step": 21835 }, { "epoch": 2.4323421316404943, "grad_norm": 0.17937949299812317, "learning_rate": 4.6940249009382024e-05, "loss": 0.4701, "num_input_tokens_seen": 26497136, "step": 21840 }, { "epoch": 2.432898986524112, "grad_norm": 0.1980278044939041, "learning_rate": 4.6937919081941224e-05, "loss": 0.4463, "num_input_tokens_seen": 26502992, "step": 21845 }, { "epoch": 2.433455841407729, "grad_norm": 0.135587677359581, "learning_rate": 4.693558832561553e-05, "loss": 0.4643, "num_input_tokens_seen": 26508944, "step": 21850 }, { "epoch": 2.4340126962913464, "grad_norm": 0.1354699581861496, "learning_rate": 4.693325674049303e-05, "loss": 0.4718, "num_input_tokens_seen": 26515312, "step": 21855 }, { "epoch": 2.4345695511749637, "grad_norm": 0.15025509893894196, "learning_rate": 4.693092432666178e-05, "loss": 0.4683, "num_input_tokens_seen": 26521232, "step": 21860 }, { "epoch": 2.4351264060585813, "grad_norm": 0.12842482328414917, "learning_rate": 4.692859108420994e-05, "loss": 0.4711, "num_input_tokens_seen": 26527024, "step": 21865 }, { "epoch": 2.4356832609421986, "grad_norm": 0.13191388547420502, "learning_rate": 4.692625701322565e-05, "loss": 0.475, "num_input_tokens_seen": 26533328, "step": 21870 }, { "epoch": 2.436240115825816, "grad_norm": 0.1506727933883667, "learning_rate": 4.69239221137971e-05, "loss": 0.4594, "num_input_tokens_seen": 26539376, "step": 21875 }, { "epoch": 2.436796970709433, "grad_norm": 0.14163267612457275, "learning_rate": 4.692158638601253e-05, "loss": 0.4654, "num_input_tokens_seen": 26545552, "step": 21880 }, { "epoch": 2.4373538255930507, "grad_norm": 0.13777080178260803, "learning_rate": 4.691924982996017e-05, "loss": 0.463, "num_input_tokens_seen": 26551728, "step": 21885 }, { "epoch": 2.437910680476668, "grad_norm": 0.20006009936332703, "learning_rate": 4.691691244572829e-05, "loss": 0.4602, "num_input_tokens_seen": 26557808, "step": 21890 }, { "epoch": 2.438467535360285, "grad_norm": 0.164395272731781, "learning_rate": 4.6914574233405236e-05, "loss": 0.4738, "num_input_tokens_seen": 26564304, "step": 21895 }, { "epoch": 2.4390243902439024, "grad_norm": 0.18714411556720734, "learning_rate": 4.691223519307934e-05, "loss": 0.4536, "num_input_tokens_seen": 26569712, "step": 21900 }, { "epoch": 2.4395812451275196, "grad_norm": 0.12522031366825104, "learning_rate": 4.690989532483897e-05, "loss": 0.4595, "num_input_tokens_seen": 26575216, "step": 21905 }, { "epoch": 2.4401381000111373, "grad_norm": 0.2060711681842804, "learning_rate": 4.6907554628772545e-05, "loss": 0.4641, "num_input_tokens_seen": 26580944, "step": 21910 }, { "epoch": 2.4406949548947545, "grad_norm": 0.14223472774028778, "learning_rate": 4.6905213104968505e-05, "loss": 0.4717, "num_input_tokens_seen": 26587056, "step": 21915 }, { "epoch": 2.4412518097783718, "grad_norm": 0.18410807847976685, "learning_rate": 4.690287075351531e-05, "loss": 0.4698, "num_input_tokens_seen": 26593232, "step": 21920 }, { "epoch": 2.441808664661989, "grad_norm": 0.17912468314170837, "learning_rate": 4.690052757450146e-05, "loss": 0.4604, "num_input_tokens_seen": 26599184, "step": 21925 }, { "epoch": 2.442365519545606, "grad_norm": 0.11688116192817688, "learning_rate": 4.689818356801551e-05, "loss": 0.4594, "num_input_tokens_seen": 26605584, "step": 21930 }, { "epoch": 2.442922374429224, "grad_norm": 0.17195811867713928, "learning_rate": 4.6895838734145996e-05, "loss": 0.4753, "num_input_tokens_seen": 26611888, "step": 21935 }, { "epoch": 2.443479229312841, "grad_norm": 0.15903493762016296, "learning_rate": 4.689349307298153e-05, "loss": 0.4783, "num_input_tokens_seen": 26618192, "step": 21940 }, { "epoch": 2.4440360841964583, "grad_norm": 0.24593062698841095, "learning_rate": 4.689114658461074e-05, "loss": 0.4459, "num_input_tokens_seen": 26624464, "step": 21945 }, { "epoch": 2.4445929390800756, "grad_norm": 0.18135812878608704, "learning_rate": 4.688879926912228e-05, "loss": 0.4527, "num_input_tokens_seen": 26630544, "step": 21950 }, { "epoch": 2.4451497939636933, "grad_norm": 0.12114068120718002, "learning_rate": 4.688645112660483e-05, "loss": 0.4636, "num_input_tokens_seen": 26636848, "step": 21955 }, { "epoch": 2.4457066488473105, "grad_norm": 0.16427907347679138, "learning_rate": 4.6884102157147115e-05, "loss": 0.4902, "num_input_tokens_seen": 26642512, "step": 21960 }, { "epoch": 2.4462635037309277, "grad_norm": 0.18773850798606873, "learning_rate": 4.688175236083789e-05, "loss": 0.4539, "num_input_tokens_seen": 26648560, "step": 21965 }, { "epoch": 2.446820358614545, "grad_norm": 0.17914821207523346, "learning_rate": 4.687940173776594e-05, "loss": 0.4396, "num_input_tokens_seen": 26654864, "step": 21970 }, { "epoch": 2.4473772134981626, "grad_norm": 0.1849367618560791, "learning_rate": 4.687705028802008e-05, "loss": 0.4528, "num_input_tokens_seen": 26660816, "step": 21975 }, { "epoch": 2.44793406838178, "grad_norm": 0.13534866273403168, "learning_rate": 4.6874698011689144e-05, "loss": 0.4573, "num_input_tokens_seen": 26666832, "step": 21980 }, { "epoch": 2.448490923265397, "grad_norm": 0.1509055644273758, "learning_rate": 4.687234490886202e-05, "loss": 0.4607, "num_input_tokens_seen": 26673360, "step": 21985 }, { "epoch": 2.4490477781490143, "grad_norm": 0.09890387207269669, "learning_rate": 4.6869990979627616e-05, "loss": 0.4518, "num_input_tokens_seen": 26679280, "step": 21990 }, { "epoch": 2.4496046330326315, "grad_norm": 0.15661707520484924, "learning_rate": 4.6867636224074865e-05, "loss": 0.4582, "num_input_tokens_seen": 26685168, "step": 21995 }, { "epoch": 2.450161487916249, "grad_norm": 0.20255307853221893, "learning_rate": 4.686528064229273e-05, "loss": 0.4517, "num_input_tokens_seen": 26691152, "step": 22000 }, { "epoch": 2.4507183427998664, "grad_norm": 0.12686476111412048, "learning_rate": 4.686292423437023e-05, "loss": 0.4689, "num_input_tokens_seen": 26696848, "step": 22005 }, { "epoch": 2.4512751976834837, "grad_norm": 0.1302933543920517, "learning_rate": 4.686056700039639e-05, "loss": 0.4819, "num_input_tokens_seen": 26703056, "step": 22010 }, { "epoch": 2.451832052567101, "grad_norm": 0.13581790030002594, "learning_rate": 4.685820894046027e-05, "loss": 0.4742, "num_input_tokens_seen": 26708784, "step": 22015 }, { "epoch": 2.452388907450718, "grad_norm": 0.17015279829502106, "learning_rate": 4.6855850054650975e-05, "loss": 0.4651, "num_input_tokens_seen": 26714832, "step": 22020 }, { "epoch": 2.452945762334336, "grad_norm": 0.19187316298484802, "learning_rate": 4.6853490343057614e-05, "loss": 0.4788, "num_input_tokens_seen": 26721168, "step": 22025 }, { "epoch": 2.453502617217953, "grad_norm": 0.19139735400676727, "learning_rate": 4.685112980576936e-05, "loss": 0.4432, "num_input_tokens_seen": 26727472, "step": 22030 }, { "epoch": 2.4540594721015703, "grad_norm": 0.1533704698085785, "learning_rate": 4.68487684428754e-05, "loss": 0.4705, "num_input_tokens_seen": 26733584, "step": 22035 }, { "epoch": 2.4546163269851875, "grad_norm": 0.11522882431745529, "learning_rate": 4.684640625446495e-05, "loss": 0.473, "num_input_tokens_seen": 26739792, "step": 22040 }, { "epoch": 2.455173181868805, "grad_norm": 0.1349593698978424, "learning_rate": 4.684404324062725e-05, "loss": 0.4682, "num_input_tokens_seen": 26745904, "step": 22045 }, { "epoch": 2.4557300367524224, "grad_norm": 0.18440183997154236, "learning_rate": 4.684167940145161e-05, "loss": 0.4692, "num_input_tokens_seen": 26752176, "step": 22050 }, { "epoch": 2.4562868916360396, "grad_norm": 0.16094988584518433, "learning_rate": 4.683931473702732e-05, "loss": 0.4558, "num_input_tokens_seen": 26758032, "step": 22055 }, { "epoch": 2.456843746519657, "grad_norm": 0.17013777792453766, "learning_rate": 4.683694924744373e-05, "loss": 0.4397, "num_input_tokens_seen": 26764368, "step": 22060 }, { "epoch": 2.4574006014032745, "grad_norm": 0.12245962023735046, "learning_rate": 4.683458293279021e-05, "loss": 0.4742, "num_input_tokens_seen": 26770288, "step": 22065 }, { "epoch": 2.4579574562868918, "grad_norm": 0.1425182968378067, "learning_rate": 4.6832215793156186e-05, "loss": 0.4682, "num_input_tokens_seen": 26776368, "step": 22070 }, { "epoch": 2.458514311170509, "grad_norm": 0.1515931636095047, "learning_rate": 4.682984782863109e-05, "loss": 0.4652, "num_input_tokens_seen": 26781904, "step": 22075 }, { "epoch": 2.4590711660541262, "grad_norm": 0.1891637146472931, "learning_rate": 4.682747903930437e-05, "loss": 0.454, "num_input_tokens_seen": 26787600, "step": 22080 }, { "epoch": 2.4596280209377435, "grad_norm": 0.17443397641181946, "learning_rate": 4.6825109425265554e-05, "loss": 0.4586, "num_input_tokens_seen": 26793616, "step": 22085 }, { "epoch": 2.460184875821361, "grad_norm": 0.16461406648159027, "learning_rate": 4.6822738986604155e-05, "loss": 0.4629, "num_input_tokens_seen": 26799952, "step": 22090 }, { "epoch": 2.4607417307049784, "grad_norm": 0.1417202353477478, "learning_rate": 4.682036772340975e-05, "loss": 0.4612, "num_input_tokens_seen": 26805840, "step": 22095 }, { "epoch": 2.4612985855885956, "grad_norm": 0.15105539560317993, "learning_rate": 4.6817995635771916e-05, "loss": 0.4569, "num_input_tokens_seen": 26811824, "step": 22100 }, { "epoch": 2.461855440472213, "grad_norm": 0.12192969769239426, "learning_rate": 4.6815622723780295e-05, "loss": 0.4504, "num_input_tokens_seen": 26818096, "step": 22105 }, { "epoch": 2.46241229535583, "grad_norm": 0.14468765258789062, "learning_rate": 4.6813248987524535e-05, "loss": 0.4669, "num_input_tokens_seen": 26823984, "step": 22110 }, { "epoch": 2.4629691502394477, "grad_norm": 0.24060824513435364, "learning_rate": 4.681087442709432e-05, "loss": 0.4853, "num_input_tokens_seen": 26830480, "step": 22115 }, { "epoch": 2.463526005123065, "grad_norm": 0.13710659742355347, "learning_rate": 4.680849904257938e-05, "loss": 0.4745, "num_input_tokens_seen": 26836368, "step": 22120 }, { "epoch": 2.464082860006682, "grad_norm": 0.19492633640766144, "learning_rate": 4.6806122834069446e-05, "loss": 0.4686, "num_input_tokens_seen": 26842384, "step": 22125 }, { "epoch": 2.4646397148902994, "grad_norm": 0.11409483850002289, "learning_rate": 4.680374580165432e-05, "loss": 0.4682, "num_input_tokens_seen": 26848784, "step": 22130 }, { "epoch": 2.465196569773917, "grad_norm": 0.12783364951610565, "learning_rate": 4.6801367945423805e-05, "loss": 0.4688, "num_input_tokens_seen": 26854736, "step": 22135 }, { "epoch": 2.4657534246575343, "grad_norm": 0.11148807406425476, "learning_rate": 4.6798989265467736e-05, "loss": 0.4624, "num_input_tokens_seen": 26861008, "step": 22140 }, { "epoch": 2.4663102795411516, "grad_norm": 0.14753468334674835, "learning_rate": 4.6796609761876e-05, "loss": 0.467, "num_input_tokens_seen": 26867600, "step": 22145 }, { "epoch": 2.466867134424769, "grad_norm": 0.1607394516468048, "learning_rate": 4.679422943473849e-05, "loss": 0.4739, "num_input_tokens_seen": 26873936, "step": 22150 }, { "epoch": 2.4674239893083865, "grad_norm": 0.14497967064380646, "learning_rate": 4.679184828414516e-05, "loss": 0.467, "num_input_tokens_seen": 26880112, "step": 22155 }, { "epoch": 2.4679808441920037, "grad_norm": 0.13699039816856384, "learning_rate": 4.6789466310185956e-05, "loss": 0.4648, "num_input_tokens_seen": 26886320, "step": 22160 }, { "epoch": 2.468537699075621, "grad_norm": 0.16279207170009613, "learning_rate": 4.678708351295089e-05, "loss": 0.4497, "num_input_tokens_seen": 26892464, "step": 22165 }, { "epoch": 2.469094553959238, "grad_norm": 0.12392404675483704, "learning_rate": 4.678469989252999e-05, "loss": 0.4686, "num_input_tokens_seen": 26898384, "step": 22170 }, { "epoch": 2.4696514088428554, "grad_norm": 0.14008675515651703, "learning_rate": 4.678231544901331e-05, "loss": 0.4612, "num_input_tokens_seen": 26904528, "step": 22175 }, { "epoch": 2.470208263726473, "grad_norm": 0.16826817393302917, "learning_rate": 4.6779930182490956e-05, "loss": 0.4551, "num_input_tokens_seen": 26910832, "step": 22180 }, { "epoch": 2.4707651186100903, "grad_norm": 0.13966770470142365, "learning_rate": 4.677754409305304e-05, "loss": 0.4598, "num_input_tokens_seen": 26916592, "step": 22185 }, { "epoch": 2.4713219734937075, "grad_norm": 0.18952040374279022, "learning_rate": 4.677515718078972e-05, "loss": 0.4597, "num_input_tokens_seen": 26922256, "step": 22190 }, { "epoch": 2.4718788283773248, "grad_norm": 0.15332865715026855, "learning_rate": 4.677276944579117e-05, "loss": 0.4713, "num_input_tokens_seen": 26928400, "step": 22195 }, { "epoch": 2.472435683260942, "grad_norm": 0.12950247526168823, "learning_rate": 4.677038088814762e-05, "loss": 0.4677, "num_input_tokens_seen": 26934288, "step": 22200 }, { "epoch": 2.4729925381445597, "grad_norm": 0.1368262618780136, "learning_rate": 4.6767991507949327e-05, "loss": 0.4697, "num_input_tokens_seen": 26940464, "step": 22205 }, { "epoch": 2.473549393028177, "grad_norm": 0.16223035752773285, "learning_rate": 4.6765601305286545e-05, "loss": 0.459, "num_input_tokens_seen": 26946768, "step": 22210 }, { "epoch": 2.474106247911794, "grad_norm": 0.14532054960727692, "learning_rate": 4.676321028024959e-05, "loss": 0.4752, "num_input_tokens_seen": 26952752, "step": 22215 }, { "epoch": 2.4746631027954114, "grad_norm": 0.1526002436876297, "learning_rate": 4.676081843292881e-05, "loss": 0.4648, "num_input_tokens_seen": 26959056, "step": 22220 }, { "epoch": 2.475219957679029, "grad_norm": 0.09970235079526901, "learning_rate": 4.6758425763414584e-05, "loss": 0.4597, "num_input_tokens_seen": 26965168, "step": 22225 }, { "epoch": 2.4757768125626463, "grad_norm": 0.11691924929618835, "learning_rate": 4.675603227179729e-05, "loss": 0.4461, "num_input_tokens_seen": 26971248, "step": 22230 }, { "epoch": 2.4763336674462635, "grad_norm": 0.13602544367313385, "learning_rate": 4.6753637958167384e-05, "loss": 0.4545, "num_input_tokens_seen": 26977360, "step": 22235 }, { "epoch": 2.4768905223298807, "grad_norm": 0.14121095836162567, "learning_rate": 4.675124282261532e-05, "loss": 0.4547, "num_input_tokens_seen": 26983408, "step": 22240 }, { "epoch": 2.4774473772134984, "grad_norm": 0.137426495552063, "learning_rate": 4.67488468652316e-05, "loss": 0.4615, "num_input_tokens_seen": 26989328, "step": 22245 }, { "epoch": 2.4780042320971156, "grad_norm": 0.14157213270664215, "learning_rate": 4.674645008610674e-05, "loss": 0.4477, "num_input_tokens_seen": 26995088, "step": 22250 }, { "epoch": 2.478561086980733, "grad_norm": 0.15464675426483154, "learning_rate": 4.674405248533131e-05, "loss": 0.4687, "num_input_tokens_seen": 27001008, "step": 22255 }, { "epoch": 2.47911794186435, "grad_norm": 0.2107246369123459, "learning_rate": 4.67416540629959e-05, "loss": 0.4439, "num_input_tokens_seen": 27007152, "step": 22260 }, { "epoch": 2.4796747967479673, "grad_norm": 0.1331714242696762, "learning_rate": 4.6739254819191126e-05, "loss": 0.4668, "num_input_tokens_seen": 27012976, "step": 22265 }, { "epoch": 2.480231651631585, "grad_norm": 0.1629425585269928, "learning_rate": 4.673685475400763e-05, "loss": 0.4628, "num_input_tokens_seen": 27018960, "step": 22270 }, { "epoch": 2.480788506515202, "grad_norm": 0.13917039334774017, "learning_rate": 4.673445386753611e-05, "loss": 0.466, "num_input_tokens_seen": 27025008, "step": 22275 }, { "epoch": 2.4813453613988194, "grad_norm": 0.11293776333332062, "learning_rate": 4.6732052159867265e-05, "loss": 0.4755, "num_input_tokens_seen": 27030896, "step": 22280 }, { "epoch": 2.4819022162824367, "grad_norm": 0.17918117344379425, "learning_rate": 4.6729649631091846e-05, "loss": 0.4512, "num_input_tokens_seen": 27037040, "step": 22285 }, { "epoch": 2.482459071166054, "grad_norm": 0.13980552554130554, "learning_rate": 4.672724628130063e-05, "loss": 0.464, "num_input_tokens_seen": 27043216, "step": 22290 }, { "epoch": 2.4830159260496716, "grad_norm": 0.17759564518928528, "learning_rate": 4.672484211058442e-05, "loss": 0.4707, "num_input_tokens_seen": 27049328, "step": 22295 }, { "epoch": 2.483572780933289, "grad_norm": 0.17534948885440826, "learning_rate": 4.6722437119034055e-05, "loss": 0.4609, "num_input_tokens_seen": 27055440, "step": 22300 }, { "epoch": 2.484129635816906, "grad_norm": 0.13951291143894196, "learning_rate": 4.67200313067404e-05, "loss": 0.4538, "num_input_tokens_seen": 27061520, "step": 22305 }, { "epoch": 2.4846864907005233, "grad_norm": 0.16769780218601227, "learning_rate": 4.671762467379436e-05, "loss": 0.4775, "num_input_tokens_seen": 27067504, "step": 22310 }, { "epoch": 2.485243345584141, "grad_norm": 0.10855333507061005, "learning_rate": 4.6715217220286865e-05, "loss": 0.4655, "num_input_tokens_seen": 27073360, "step": 22315 }, { "epoch": 2.485800200467758, "grad_norm": 0.14183968305587769, "learning_rate": 4.6712808946308874e-05, "loss": 0.4619, "num_input_tokens_seen": 27079600, "step": 22320 }, { "epoch": 2.4863570553513754, "grad_norm": 0.1223965659737587, "learning_rate": 4.671039985195137e-05, "loss": 0.473, "num_input_tokens_seen": 27085648, "step": 22325 }, { "epoch": 2.4869139102349926, "grad_norm": 0.11818345636129379, "learning_rate": 4.67079899373054e-05, "loss": 0.4697, "num_input_tokens_seen": 27091600, "step": 22330 }, { "epoch": 2.4874707651186103, "grad_norm": 0.14618735015392303, "learning_rate": 4.6705579202461994e-05, "loss": 0.463, "num_input_tokens_seen": 27097936, "step": 22335 }, { "epoch": 2.4880276200022275, "grad_norm": 0.14779356122016907, "learning_rate": 4.6703167647512236e-05, "loss": 0.473, "num_input_tokens_seen": 27104176, "step": 22340 }, { "epoch": 2.4885844748858448, "grad_norm": 0.12179314345121384, "learning_rate": 4.670075527254727e-05, "loss": 0.4581, "num_input_tokens_seen": 27110288, "step": 22345 }, { "epoch": 2.489141329769462, "grad_norm": 0.11003522574901581, "learning_rate": 4.669834207765822e-05, "loss": 0.4729, "num_input_tokens_seen": 27116272, "step": 22350 }, { "epoch": 2.4896981846530792, "grad_norm": 0.15513893961906433, "learning_rate": 4.669592806293627e-05, "loss": 0.4671, "num_input_tokens_seen": 27122448, "step": 22355 }, { "epoch": 2.490255039536697, "grad_norm": 0.13108409941196442, "learning_rate": 4.669351322847263e-05, "loss": 0.4714, "num_input_tokens_seen": 27128624, "step": 22360 }, { "epoch": 2.490811894420314, "grad_norm": 0.12522906064987183, "learning_rate": 4.669109757435854e-05, "loss": 0.4602, "num_input_tokens_seen": 27134576, "step": 22365 }, { "epoch": 2.4913687493039314, "grad_norm": 0.13492214679718018, "learning_rate": 4.668868110068527e-05, "loss": 0.4553, "num_input_tokens_seen": 27140048, "step": 22370 }, { "epoch": 2.4919256041875486, "grad_norm": 0.22644788026809692, "learning_rate": 4.668626380754413e-05, "loss": 0.4647, "num_input_tokens_seen": 27146384, "step": 22375 }, { "epoch": 2.492482459071166, "grad_norm": 0.1333344429731369, "learning_rate": 4.668384569502644e-05, "loss": 0.4665, "num_input_tokens_seen": 27152368, "step": 22380 }, { "epoch": 2.4930393139547835, "grad_norm": 0.16582530736923218, "learning_rate": 4.6681426763223565e-05, "loss": 0.4743, "num_input_tokens_seen": 27158416, "step": 22385 }, { "epoch": 2.4935961688384007, "grad_norm": 0.16515420377254486, "learning_rate": 4.667900701222692e-05, "loss": 0.4711, "num_input_tokens_seen": 27164848, "step": 22390 }, { "epoch": 2.494153023722018, "grad_norm": 0.1677958071231842, "learning_rate": 4.667658644212791e-05, "loss": 0.4593, "num_input_tokens_seen": 27170672, "step": 22395 }, { "epoch": 2.494709878605635, "grad_norm": 0.14012126624584198, "learning_rate": 4.667416505301799e-05, "loss": 0.4477, "num_input_tokens_seen": 27176752, "step": 22400 }, { "epoch": 2.495266733489253, "grad_norm": 0.17918215692043304, "learning_rate": 4.667174284498866e-05, "loss": 0.4554, "num_input_tokens_seen": 27182576, "step": 22405 }, { "epoch": 2.49582358837287, "grad_norm": 0.17819765210151672, "learning_rate": 4.666931981813144e-05, "loss": 0.4741, "num_input_tokens_seen": 27188080, "step": 22410 }, { "epoch": 2.4963804432564873, "grad_norm": 0.09852490574121475, "learning_rate": 4.666689597253787e-05, "loss": 0.4669, "num_input_tokens_seen": 27194352, "step": 22415 }, { "epoch": 2.4969372981401046, "grad_norm": 0.14286337792873383, "learning_rate": 4.666447130829954e-05, "loss": 0.4675, "num_input_tokens_seen": 27200720, "step": 22420 }, { "epoch": 2.4974941530237222, "grad_norm": 0.17120243608951569, "learning_rate": 4.6662045825508056e-05, "loss": 0.4537, "num_input_tokens_seen": 27206640, "step": 22425 }, { "epoch": 2.4980510079073395, "grad_norm": 0.14766757190227509, "learning_rate": 4.665961952425506e-05, "loss": 0.4568, "num_input_tokens_seen": 27212656, "step": 22430 }, { "epoch": 2.4986078627909567, "grad_norm": 0.148759663105011, "learning_rate": 4.665719240463222e-05, "loss": 0.441, "num_input_tokens_seen": 27218736, "step": 22435 }, { "epoch": 2.499164717674574, "grad_norm": 0.15764950215816498, "learning_rate": 4.665476446673126e-05, "loss": 0.4762, "num_input_tokens_seen": 27224784, "step": 22440 }, { "epoch": 2.499721572558191, "grad_norm": 0.13547389209270477, "learning_rate": 4.665233571064389e-05, "loss": 0.456, "num_input_tokens_seen": 27230544, "step": 22445 }, { "epoch": 2.500278427441809, "grad_norm": 0.1724739819765091, "learning_rate": 4.66499061364619e-05, "loss": 0.4783, "num_input_tokens_seen": 27236592, "step": 22450 }, { "epoch": 2.500278427441809, "eval_loss": 0.46576693654060364, "eval_runtime": 113.0998, "eval_samples_per_second": 35.287, "eval_steps_per_second": 8.824, "num_input_tokens_seen": 27236592, "step": 22450 }, { "epoch": 2.500835282325426, "grad_norm": 0.14684514701366425, "learning_rate": 4.6647475744277065e-05, "loss": 0.4789, "num_input_tokens_seen": 27242704, "step": 22455 }, { "epoch": 2.5013921372090433, "grad_norm": 0.15385031700134277, "learning_rate": 4.6645044534181225e-05, "loss": 0.4728, "num_input_tokens_seen": 27248752, "step": 22460 }, { "epoch": 2.5019489920926605, "grad_norm": 0.14490273594856262, "learning_rate": 4.6642612506266245e-05, "loss": 0.4596, "num_input_tokens_seen": 27254960, "step": 22465 }, { "epoch": 2.5025058469762778, "grad_norm": 0.14716234803199768, "learning_rate": 4.6640179660624e-05, "loss": 0.4536, "num_input_tokens_seen": 27261296, "step": 22470 }, { "epoch": 2.5030627018598954, "grad_norm": 0.13496944308280945, "learning_rate": 4.6637745997346424e-05, "loss": 0.4603, "num_input_tokens_seen": 27267312, "step": 22475 }, { "epoch": 2.5036195567435127, "grad_norm": 0.15534284710884094, "learning_rate": 4.663531151652546e-05, "loss": 0.4577, "num_input_tokens_seen": 27273424, "step": 22480 }, { "epoch": 2.50417641162713, "grad_norm": 0.15398330986499786, "learning_rate": 4.663287621825309e-05, "loss": 0.4748, "num_input_tokens_seen": 27279952, "step": 22485 }, { "epoch": 2.5047332665107476, "grad_norm": 0.1306920051574707, "learning_rate": 4.663044010262133e-05, "loss": 0.4643, "num_input_tokens_seen": 27285968, "step": 22490 }, { "epoch": 2.5052901213943644, "grad_norm": 0.1817661076784134, "learning_rate": 4.662800316972223e-05, "loss": 0.4816, "num_input_tokens_seen": 27292016, "step": 22495 }, { "epoch": 2.505846976277982, "grad_norm": 0.14605657756328583, "learning_rate": 4.662556541964786e-05, "loss": 0.4794, "num_input_tokens_seen": 27297968, "step": 22500 }, { "epoch": 2.5064038311615993, "grad_norm": 0.11560110002756119, "learning_rate": 4.662312685249032e-05, "loss": 0.456, "num_input_tokens_seen": 27303856, "step": 22505 }, { "epoch": 2.5069606860452165, "grad_norm": 0.1686418503522873, "learning_rate": 4.662068746834176e-05, "loss": 0.4652, "num_input_tokens_seen": 27310096, "step": 22510 }, { "epoch": 2.507517540928834, "grad_norm": 0.12013395875692368, "learning_rate": 4.6618247267294334e-05, "loss": 0.4562, "num_input_tokens_seen": 27316208, "step": 22515 }, { "epoch": 2.5080743958124514, "grad_norm": 0.11448600888252258, "learning_rate": 4.661580624944025e-05, "loss": 0.4685, "num_input_tokens_seen": 27322544, "step": 22520 }, { "epoch": 2.5086312506960686, "grad_norm": 0.18406766653060913, "learning_rate": 4.661336441487174e-05, "loss": 0.4592, "num_input_tokens_seen": 27328592, "step": 22525 }, { "epoch": 2.509188105579686, "grad_norm": 0.10467936843633652, "learning_rate": 4.661092176368105e-05, "loss": 0.4656, "num_input_tokens_seen": 27334736, "step": 22530 }, { "epoch": 2.509744960463303, "grad_norm": 0.1919611245393753, "learning_rate": 4.660847829596049e-05, "loss": 0.4729, "num_input_tokens_seen": 27340944, "step": 22535 }, { "epoch": 2.5103018153469208, "grad_norm": 0.18954776227474213, "learning_rate": 4.660603401180236e-05, "loss": 0.4727, "num_input_tokens_seen": 27346704, "step": 22540 }, { "epoch": 2.510858670230538, "grad_norm": 0.12855687737464905, "learning_rate": 4.6603588911299034e-05, "loss": 0.4726, "num_input_tokens_seen": 27352944, "step": 22545 }, { "epoch": 2.5114155251141552, "grad_norm": 0.14854609966278076, "learning_rate": 4.6601142994542886e-05, "loss": 0.4653, "num_input_tokens_seen": 27359280, "step": 22550 }, { "epoch": 2.5119723799977725, "grad_norm": 0.12459505349397659, "learning_rate": 4.6598696261626326e-05, "loss": 0.4663, "num_input_tokens_seen": 27364688, "step": 22555 }, { "epoch": 2.5125292348813897, "grad_norm": 0.18192635476589203, "learning_rate": 4.659624871264181e-05, "loss": 0.4593, "num_input_tokens_seen": 27370800, "step": 22560 }, { "epoch": 2.5130860897650074, "grad_norm": 0.14335443079471588, "learning_rate": 4.659380034768181e-05, "loss": 0.4634, "num_input_tokens_seen": 27377168, "step": 22565 }, { "epoch": 2.5136429446486246, "grad_norm": 0.13151778280735016, "learning_rate": 4.659135116683883e-05, "loss": 0.4606, "num_input_tokens_seen": 27383536, "step": 22570 }, { "epoch": 2.514199799532242, "grad_norm": 0.13739901781082153, "learning_rate": 4.658890117020541e-05, "loss": 0.4835, "num_input_tokens_seen": 27390000, "step": 22575 }, { "epoch": 2.5147566544158595, "grad_norm": 0.1381477564573288, "learning_rate": 4.658645035787412e-05, "loss": 0.4832, "num_input_tokens_seen": 27396144, "step": 22580 }, { "epoch": 2.5153135092994763, "grad_norm": 0.12353475391864777, "learning_rate": 4.658399872993755e-05, "loss": 0.4563, "num_input_tokens_seen": 27402192, "step": 22585 }, { "epoch": 2.515870364183094, "grad_norm": 0.17612704634666443, "learning_rate": 4.658154628648835e-05, "loss": 0.4795, "num_input_tokens_seen": 27408272, "step": 22590 }, { "epoch": 2.516427219066711, "grad_norm": 0.13789482414722443, "learning_rate": 4.6579093027619166e-05, "loss": 0.4543, "num_input_tokens_seen": 27414160, "step": 22595 }, { "epoch": 2.5169840739503284, "grad_norm": 0.1325899064540863, "learning_rate": 4.657663895342269e-05, "loss": 0.4849, "num_input_tokens_seen": 27420304, "step": 22600 }, { "epoch": 2.517540928833946, "grad_norm": 0.11714641749858856, "learning_rate": 4.6574184063991654e-05, "loss": 0.4634, "num_input_tokens_seen": 27426576, "step": 22605 }, { "epoch": 2.5180977837175633, "grad_norm": 0.11925751715898514, "learning_rate": 4.6571728359418804e-05, "loss": 0.4567, "num_input_tokens_seen": 27432816, "step": 22610 }, { "epoch": 2.5186546386011806, "grad_norm": 0.13415907323360443, "learning_rate": 4.656927183979692e-05, "loss": 0.4415, "num_input_tokens_seen": 27439120, "step": 22615 }, { "epoch": 2.519211493484798, "grad_norm": 0.1428707242012024, "learning_rate": 4.656681450521883e-05, "loss": 0.4543, "num_input_tokens_seen": 27445360, "step": 22620 }, { "epoch": 2.519768348368415, "grad_norm": 0.1480703055858612, "learning_rate": 4.656435635577738e-05, "loss": 0.4749, "num_input_tokens_seen": 27451536, "step": 22625 }, { "epoch": 2.5203252032520327, "grad_norm": 0.15819239616394043, "learning_rate": 4.656189739156543e-05, "loss": 0.454, "num_input_tokens_seen": 27457808, "step": 22630 }, { "epoch": 2.52088205813565, "grad_norm": 0.12496163696050644, "learning_rate": 4.655943761267591e-05, "loss": 0.4535, "num_input_tokens_seen": 27463920, "step": 22635 }, { "epoch": 2.521438913019267, "grad_norm": 0.12704230844974518, "learning_rate": 4.6556977019201734e-05, "loss": 0.4552, "num_input_tokens_seen": 27470224, "step": 22640 }, { "epoch": 2.5219957679028844, "grad_norm": 0.15760205686092377, "learning_rate": 4.655451561123589e-05, "loss": 0.4612, "num_input_tokens_seen": 27476336, "step": 22645 }, { "epoch": 2.5225526227865016, "grad_norm": 0.1098375990986824, "learning_rate": 4.655205338887137e-05, "loss": 0.4682, "num_input_tokens_seen": 27482416, "step": 22650 }, { "epoch": 2.5231094776701193, "grad_norm": 0.17277558147907257, "learning_rate": 4.6549590352201204e-05, "loss": 0.4527, "num_input_tokens_seen": 27488368, "step": 22655 }, { "epoch": 2.5236663325537365, "grad_norm": 0.10132114589214325, "learning_rate": 4.654712650131846e-05, "loss": 0.4716, "num_input_tokens_seen": 27494416, "step": 22660 }, { "epoch": 2.5242231874373537, "grad_norm": 0.1588318943977356, "learning_rate": 4.654466183631622e-05, "loss": 0.453, "num_input_tokens_seen": 27500592, "step": 22665 }, { "epoch": 2.5247800423209714, "grad_norm": 0.1153392568230629, "learning_rate": 4.654219635728762e-05, "loss": 0.4854, "num_input_tokens_seen": 27506608, "step": 22670 }, { "epoch": 2.5253368972045886, "grad_norm": 0.0914185643196106, "learning_rate": 4.65397300643258e-05, "loss": 0.4719, "num_input_tokens_seen": 27512240, "step": 22675 }, { "epoch": 2.525893752088206, "grad_norm": 0.161207914352417, "learning_rate": 4.653726295752395e-05, "loss": 0.4523, "num_input_tokens_seen": 27518768, "step": 22680 }, { "epoch": 2.526450606971823, "grad_norm": 0.12319276481866837, "learning_rate": 4.6534795036975285e-05, "loss": 0.4562, "num_input_tokens_seen": 27524848, "step": 22685 }, { "epoch": 2.5270074618554403, "grad_norm": 0.1498776376247406, "learning_rate": 4.653232630277306e-05, "loss": 0.4658, "num_input_tokens_seen": 27531216, "step": 22690 }, { "epoch": 2.527564316739058, "grad_norm": 0.15118929743766785, "learning_rate": 4.652985675501054e-05, "loss": 0.448, "num_input_tokens_seen": 27537008, "step": 22695 }, { "epoch": 2.5281211716226752, "grad_norm": 0.10656308382749557, "learning_rate": 4.6527386393781036e-05, "loss": 0.4587, "num_input_tokens_seen": 27542864, "step": 22700 }, { "epoch": 2.5286780265062925, "grad_norm": 0.18174368143081665, "learning_rate": 4.6524915219177886e-05, "loss": 0.4672, "num_input_tokens_seen": 27548688, "step": 22705 }, { "epoch": 2.5292348813899097, "grad_norm": 0.147518128156662, "learning_rate": 4.652244323129445e-05, "loss": 0.4697, "num_input_tokens_seen": 27554800, "step": 22710 }, { "epoch": 2.529791736273527, "grad_norm": 0.15552297234535217, "learning_rate": 4.651997043022415e-05, "loss": 0.466, "num_input_tokens_seen": 27560752, "step": 22715 }, { "epoch": 2.5303485911571446, "grad_norm": 0.13536220788955688, "learning_rate": 4.65174968160604e-05, "loss": 0.4709, "num_input_tokens_seen": 27566736, "step": 22720 }, { "epoch": 2.530905446040762, "grad_norm": 0.12317422777414322, "learning_rate": 4.651502238889666e-05, "loss": 0.4502, "num_input_tokens_seen": 27572848, "step": 22725 }, { "epoch": 2.531462300924379, "grad_norm": 0.12951289117336273, "learning_rate": 4.651254714882643e-05, "loss": 0.456, "num_input_tokens_seen": 27579024, "step": 22730 }, { "epoch": 2.5320191558079963, "grad_norm": 0.1725555807352066, "learning_rate": 4.6510071095943227e-05, "loss": 0.4583, "num_input_tokens_seen": 27585040, "step": 22735 }, { "epoch": 2.5325760106916135, "grad_norm": 0.15882326662540436, "learning_rate": 4.65075942303406e-05, "loss": 0.4448, "num_input_tokens_seen": 27591248, "step": 22740 }, { "epoch": 2.533132865575231, "grad_norm": 0.13369442522525787, "learning_rate": 4.650511655211214e-05, "loss": 0.4785, "num_input_tokens_seen": 27597296, "step": 22745 }, { "epoch": 2.5336897204588484, "grad_norm": 0.1190679594874382, "learning_rate": 4.6502638061351465e-05, "loss": 0.4609, "num_input_tokens_seen": 27603696, "step": 22750 }, { "epoch": 2.5342465753424657, "grad_norm": 0.13915997743606567, "learning_rate": 4.650015875815222e-05, "loss": 0.4773, "num_input_tokens_seen": 27609808, "step": 22755 }, { "epoch": 2.5348034302260833, "grad_norm": 0.10954175144433975, "learning_rate": 4.6497678642608064e-05, "loss": 0.4776, "num_input_tokens_seen": 27616176, "step": 22760 }, { "epoch": 2.5353602851097006, "grad_norm": 0.1596253663301468, "learning_rate": 4.6495197714812724e-05, "loss": 0.4602, "num_input_tokens_seen": 27622288, "step": 22765 }, { "epoch": 2.535917139993318, "grad_norm": 0.120481476187706, "learning_rate": 4.649271597485993e-05, "loss": 0.4654, "num_input_tokens_seen": 27628368, "step": 22770 }, { "epoch": 2.536473994876935, "grad_norm": 0.14226345717906952, "learning_rate": 4.649023342284344e-05, "loss": 0.4781, "num_input_tokens_seen": 27634640, "step": 22775 }, { "epoch": 2.5370308497605523, "grad_norm": 0.15745481848716736, "learning_rate": 4.6487750058857076e-05, "loss": 0.4564, "num_input_tokens_seen": 27640496, "step": 22780 }, { "epoch": 2.53758770464417, "grad_norm": 0.14583361148834229, "learning_rate": 4.648526588299465e-05, "loss": 0.4557, "num_input_tokens_seen": 27646672, "step": 22785 }, { "epoch": 2.538144559527787, "grad_norm": 0.1453745812177658, "learning_rate": 4.648278089535002e-05, "loss": 0.4871, "num_input_tokens_seen": 27653072, "step": 22790 }, { "epoch": 2.5387014144114044, "grad_norm": 0.11146695166826248, "learning_rate": 4.648029509601709e-05, "loss": 0.464, "num_input_tokens_seen": 27659088, "step": 22795 }, { "epoch": 2.5392582692950216, "grad_norm": 0.15798042714595795, "learning_rate": 4.647780848508977e-05, "loss": 0.4705, "num_input_tokens_seen": 27665488, "step": 22800 }, { "epoch": 2.539815124178639, "grad_norm": 0.21435298025608063, "learning_rate": 4.6475321062662016e-05, "loss": 0.4796, "num_input_tokens_seen": 27671600, "step": 22805 }, { "epoch": 2.5403719790622565, "grad_norm": 0.1937231868505478, "learning_rate": 4.647283282882782e-05, "loss": 0.4675, "num_input_tokens_seen": 27677040, "step": 22810 }, { "epoch": 2.5409288339458738, "grad_norm": 0.14413654804229736, "learning_rate": 4.647034378368118e-05, "loss": 0.4564, "num_input_tokens_seen": 27682992, "step": 22815 }, { "epoch": 2.541485688829491, "grad_norm": 0.1447623372077942, "learning_rate": 4.646785392731614e-05, "loss": 0.4692, "num_input_tokens_seen": 27689296, "step": 22820 }, { "epoch": 2.5420425437131082, "grad_norm": 0.13083262741565704, "learning_rate": 4.646536325982679e-05, "loss": 0.4602, "num_input_tokens_seen": 27695440, "step": 22825 }, { "epoch": 2.5425993985967255, "grad_norm": 0.1790337711572647, "learning_rate": 4.646287178130723e-05, "loss": 0.4785, "num_input_tokens_seen": 27701168, "step": 22830 }, { "epoch": 2.543156253480343, "grad_norm": 0.11863256990909576, "learning_rate": 4.646037949185158e-05, "loss": 0.4542, "num_input_tokens_seen": 27707280, "step": 22835 }, { "epoch": 2.5437131083639604, "grad_norm": 0.15515603125095367, "learning_rate": 4.645788639155403e-05, "loss": 0.4676, "num_input_tokens_seen": 27713424, "step": 22840 }, { "epoch": 2.5442699632475776, "grad_norm": 0.10474532097578049, "learning_rate": 4.645539248050876e-05, "loss": 0.4637, "num_input_tokens_seen": 27719184, "step": 22845 }, { "epoch": 2.5448268181311953, "grad_norm": 0.2462182492017746, "learning_rate": 4.645289775881001e-05, "loss": 0.4676, "num_input_tokens_seen": 27725328, "step": 22850 }, { "epoch": 2.5453836730148125, "grad_norm": 0.12125910818576813, "learning_rate": 4.645040222655204e-05, "loss": 0.4631, "num_input_tokens_seen": 27731440, "step": 22855 }, { "epoch": 2.5459405278984297, "grad_norm": 0.1226210817694664, "learning_rate": 4.6447905883829116e-05, "loss": 0.4647, "num_input_tokens_seen": 27736400, "step": 22860 }, { "epoch": 2.546497382782047, "grad_norm": 0.13081665337085724, "learning_rate": 4.6445408730735584e-05, "loss": 0.454, "num_input_tokens_seen": 27742416, "step": 22865 }, { "epoch": 2.547054237665664, "grad_norm": 0.1162160336971283, "learning_rate": 4.644291076736579e-05, "loss": 0.4606, "num_input_tokens_seen": 27748688, "step": 22870 }, { "epoch": 2.547611092549282, "grad_norm": 0.14011546969413757, "learning_rate": 4.64404119938141e-05, "loss": 0.4837, "num_input_tokens_seen": 27754800, "step": 22875 }, { "epoch": 2.548167947432899, "grad_norm": 0.13942748308181763, "learning_rate": 4.643791241017494e-05, "loss": 0.4872, "num_input_tokens_seen": 27761008, "step": 22880 }, { "epoch": 2.5487248023165163, "grad_norm": 0.1729709953069687, "learning_rate": 4.643541201654275e-05, "loss": 0.4744, "num_input_tokens_seen": 27767184, "step": 22885 }, { "epoch": 2.5492816572001336, "grad_norm": 0.12787559628486633, "learning_rate": 4.6432910813012e-05, "loss": 0.47, "num_input_tokens_seen": 27773744, "step": 22890 }, { "epoch": 2.549838512083751, "grad_norm": 0.141412153840065, "learning_rate": 4.6430408799677196e-05, "loss": 0.4631, "num_input_tokens_seen": 27779536, "step": 22895 }, { "epoch": 2.5503953669673685, "grad_norm": 0.12073172628879547, "learning_rate": 4.642790597663287e-05, "loss": 0.4636, "num_input_tokens_seen": 27786032, "step": 22900 }, { "epoch": 2.5509522218509857, "grad_norm": 0.13414371013641357, "learning_rate": 4.642540234397358e-05, "loss": 0.4712, "num_input_tokens_seen": 27792080, "step": 22905 }, { "epoch": 2.551509076734603, "grad_norm": 0.132488414645195, "learning_rate": 4.6422897901793936e-05, "loss": 0.487, "num_input_tokens_seen": 27798032, "step": 22910 }, { "epoch": 2.55206593161822, "grad_norm": 0.12079952657222748, "learning_rate": 4.642039265018855e-05, "loss": 0.4523, "num_input_tokens_seen": 27804432, "step": 22915 }, { "epoch": 2.5526227865018374, "grad_norm": 0.15804484486579895, "learning_rate": 4.641788658925209e-05, "loss": 0.4657, "num_input_tokens_seen": 27810416, "step": 22920 }, { "epoch": 2.553179641385455, "grad_norm": 0.17297492921352386, "learning_rate": 4.641537971907924e-05, "loss": 0.4591, "num_input_tokens_seen": 27816112, "step": 22925 }, { "epoch": 2.5537364962690723, "grad_norm": 0.12908512353897095, "learning_rate": 4.641287203976471e-05, "loss": 0.4468, "num_input_tokens_seen": 27822032, "step": 22930 }, { "epoch": 2.5542933511526895, "grad_norm": 0.1286504864692688, "learning_rate": 4.6410363551403254e-05, "loss": 0.4722, "num_input_tokens_seen": 27827888, "step": 22935 }, { "epoch": 2.554850206036307, "grad_norm": 0.12189805507659912, "learning_rate": 4.640785425408965e-05, "loss": 0.468, "num_input_tokens_seen": 27833872, "step": 22940 }, { "epoch": 2.5554070609199244, "grad_norm": 0.1422473043203354, "learning_rate": 4.640534414791871e-05, "loss": 0.4656, "num_input_tokens_seen": 27840112, "step": 22945 }, { "epoch": 2.5559639158035417, "grad_norm": 0.11512042582035065, "learning_rate": 4.640283323298527e-05, "loss": 0.4636, "num_input_tokens_seen": 27846224, "step": 22950 }, { "epoch": 2.556520770687159, "grad_norm": 0.14025485515594482, "learning_rate": 4.6400321509384196e-05, "loss": 0.4494, "num_input_tokens_seen": 27852144, "step": 22955 }, { "epoch": 2.557077625570776, "grad_norm": 0.09094861149787903, "learning_rate": 4.63978089772104e-05, "loss": 0.4641, "num_input_tokens_seen": 27857968, "step": 22960 }, { "epoch": 2.557634480454394, "grad_norm": 0.1890202909708023, "learning_rate": 4.639529563655881e-05, "loss": 0.4608, "num_input_tokens_seen": 27864176, "step": 22965 }, { "epoch": 2.558191335338011, "grad_norm": 0.1273912638425827, "learning_rate": 4.6392781487524384e-05, "loss": 0.4678, "num_input_tokens_seen": 27870416, "step": 22970 }, { "epoch": 2.5587481902216282, "grad_norm": 0.12965624034404755, "learning_rate": 4.639026653020212e-05, "loss": 0.4589, "num_input_tokens_seen": 27876848, "step": 22975 }, { "epoch": 2.5593050451052455, "grad_norm": 0.14027240872383118, "learning_rate": 4.638775076468703e-05, "loss": 0.4743, "num_input_tokens_seen": 27882800, "step": 22980 }, { "epoch": 2.5598618999888627, "grad_norm": 0.1517854928970337, "learning_rate": 4.638523419107419e-05, "loss": 0.4518, "num_input_tokens_seen": 27889104, "step": 22985 }, { "epoch": 2.5604187548724804, "grad_norm": 0.14641432464122772, "learning_rate": 4.6382716809458657e-05, "loss": 0.466, "num_input_tokens_seen": 27894896, "step": 22990 }, { "epoch": 2.5609756097560976, "grad_norm": 0.1535932570695877, "learning_rate": 4.638019861993556e-05, "loss": 0.462, "num_input_tokens_seen": 27900848, "step": 22995 }, { "epoch": 2.561532464639715, "grad_norm": 0.15126541256904602, "learning_rate": 4.6377679622600046e-05, "loss": 0.4477, "num_input_tokens_seen": 27906384, "step": 23000 }, { "epoch": 2.562089319523332, "grad_norm": 0.09889668971300125, "learning_rate": 4.637515981754729e-05, "loss": 0.4645, "num_input_tokens_seen": 27912432, "step": 23005 }, { "epoch": 2.5626461744069493, "grad_norm": 0.15585684776306152, "learning_rate": 4.637263920487249e-05, "loss": 0.4752, "num_input_tokens_seen": 27918160, "step": 23010 }, { "epoch": 2.563203029290567, "grad_norm": 0.10590945184230804, "learning_rate": 4.63701177846709e-05, "loss": 0.4615, "num_input_tokens_seen": 27924144, "step": 23015 }, { "epoch": 2.563759884174184, "grad_norm": 0.11773049831390381, "learning_rate": 4.6367595557037766e-05, "loss": 0.4509, "num_input_tokens_seen": 27930320, "step": 23020 }, { "epoch": 2.5643167390578014, "grad_norm": 0.13290278613567352, "learning_rate": 4.6365072522068396e-05, "loss": 0.4624, "num_input_tokens_seen": 27936144, "step": 23025 }, { "epoch": 2.564873593941419, "grad_norm": 0.12788048386573792, "learning_rate": 4.636254867985812e-05, "loss": 0.4643, "num_input_tokens_seen": 27942512, "step": 23030 }, { "epoch": 2.5654304488250363, "grad_norm": 0.1460762470960617, "learning_rate": 4.6360024030502296e-05, "loss": 0.4619, "num_input_tokens_seen": 27948912, "step": 23035 }, { "epoch": 2.5659873037086536, "grad_norm": 0.1216980591416359, "learning_rate": 4.635749857409631e-05, "loss": 0.475, "num_input_tokens_seen": 27954992, "step": 23040 }, { "epoch": 2.566544158592271, "grad_norm": 0.101622074842453, "learning_rate": 4.635497231073559e-05, "loss": 0.4545, "num_input_tokens_seen": 27960592, "step": 23045 }, { "epoch": 2.567101013475888, "grad_norm": 0.11415897309780121, "learning_rate": 4.635244524051558e-05, "loss": 0.4538, "num_input_tokens_seen": 27966640, "step": 23050 }, { "epoch": 2.5676578683595057, "grad_norm": 0.13914258778095245, "learning_rate": 4.6349917363531757e-05, "loss": 0.4702, "num_input_tokens_seen": 27972656, "step": 23055 }, { "epoch": 2.568214723243123, "grad_norm": 0.14328517019748688, "learning_rate": 4.634738867987964e-05, "loss": 0.4568, "num_input_tokens_seen": 27978928, "step": 23060 }, { "epoch": 2.56877157812674, "grad_norm": 0.1439565122127533, "learning_rate": 4.634485918965477e-05, "loss": 0.4701, "num_input_tokens_seen": 27985200, "step": 23065 }, { "epoch": 2.5693284330103574, "grad_norm": 0.13263939321041107, "learning_rate": 4.6342328892952706e-05, "loss": 0.455, "num_input_tokens_seen": 27990928, "step": 23070 }, { "epoch": 2.5698852878939746, "grad_norm": 0.1295313686132431, "learning_rate": 4.633979778986907e-05, "loss": 0.4472, "num_input_tokens_seen": 27996944, "step": 23075 }, { "epoch": 2.5704421427775923, "grad_norm": 0.11518702656030655, "learning_rate": 4.633726588049949e-05, "loss": 0.4814, "num_input_tokens_seen": 28003024, "step": 23080 }, { "epoch": 2.5709989976612095, "grad_norm": 0.11271504312753677, "learning_rate": 4.633473316493962e-05, "loss": 0.4631, "num_input_tokens_seen": 28009264, "step": 23085 }, { "epoch": 2.5715558525448268, "grad_norm": 0.15092012286186218, "learning_rate": 4.633219964328516e-05, "loss": 0.4804, "num_input_tokens_seen": 28015216, "step": 23090 }, { "epoch": 2.572112707428444, "grad_norm": 0.1440877914428711, "learning_rate": 4.632966531563184e-05, "loss": 0.4844, "num_input_tokens_seen": 28021200, "step": 23095 }, { "epoch": 2.5726695623120612, "grad_norm": 0.11540238559246063, "learning_rate": 4.6327130182075405e-05, "loss": 0.4521, "num_input_tokens_seen": 28027376, "step": 23100 }, { "epoch": 2.573226417195679, "grad_norm": 0.13035057485103607, "learning_rate": 4.632459424271165e-05, "loss": 0.4728, "num_input_tokens_seen": 28033360, "step": 23105 }, { "epoch": 2.573783272079296, "grad_norm": 0.18718339502811432, "learning_rate": 4.632205749763639e-05, "loss": 0.4774, "num_input_tokens_seen": 28039728, "step": 23110 }, { "epoch": 2.5743401269629134, "grad_norm": 0.1914779394865036, "learning_rate": 4.631951994694546e-05, "loss": 0.4683, "num_input_tokens_seen": 28045712, "step": 23115 }, { "epoch": 2.574896981846531, "grad_norm": 0.13053807616233826, "learning_rate": 4.631698159073475e-05, "loss": 0.4638, "num_input_tokens_seen": 28051728, "step": 23120 }, { "epoch": 2.5754538367301483, "grad_norm": 0.15625450015068054, "learning_rate": 4.6314442429100155e-05, "loss": 0.4704, "num_input_tokens_seen": 28057744, "step": 23125 }, { "epoch": 2.5760106916137655, "grad_norm": 0.1689159870147705, "learning_rate": 4.6311902462137624e-05, "loss": 0.4706, "num_input_tokens_seen": 28063952, "step": 23130 }, { "epoch": 2.5765675464973827, "grad_norm": 0.11538397520780563, "learning_rate": 4.630936168994312e-05, "loss": 0.4662, "num_input_tokens_seen": 28070224, "step": 23135 }, { "epoch": 2.577124401381, "grad_norm": 0.13880006968975067, "learning_rate": 4.630682011261265e-05, "loss": 0.4748, "num_input_tokens_seen": 28076560, "step": 23140 }, { "epoch": 2.5776812562646176, "grad_norm": 0.12165875732898712, "learning_rate": 4.630427773024223e-05, "loss": 0.4568, "num_input_tokens_seen": 28082864, "step": 23145 }, { "epoch": 2.578238111148235, "grad_norm": 0.17136791348457336, "learning_rate": 4.6301734542927916e-05, "loss": 0.465, "num_input_tokens_seen": 28089008, "step": 23150 }, { "epoch": 2.578794966031852, "grad_norm": 0.20519879460334778, "learning_rate": 4.629919055076581e-05, "loss": 0.4747, "num_input_tokens_seen": 28095024, "step": 23155 }, { "epoch": 2.5793518209154693, "grad_norm": 0.10713424533605576, "learning_rate": 4.6296645753852035e-05, "loss": 0.4527, "num_input_tokens_seen": 28101136, "step": 23160 }, { "epoch": 2.5799086757990866, "grad_norm": 0.14993014931678772, "learning_rate": 4.629410015228273e-05, "loss": 0.456, "num_input_tokens_seen": 28107312, "step": 23165 }, { "epoch": 2.5804655306827042, "grad_norm": 0.15376435220241547, "learning_rate": 4.629155374615408e-05, "loss": 0.4619, "num_input_tokens_seen": 28113456, "step": 23170 }, { "epoch": 2.5810223855663215, "grad_norm": 0.130108043551445, "learning_rate": 4.6289006535562303e-05, "loss": 0.4619, "num_input_tokens_seen": 28119920, "step": 23175 }, { "epoch": 2.5815792404499387, "grad_norm": 0.14286702871322632, "learning_rate": 4.6286458520603624e-05, "loss": 0.4647, "num_input_tokens_seen": 28126000, "step": 23180 }, { "epoch": 2.582136095333556, "grad_norm": 0.1304425299167633, "learning_rate": 4.628390970137434e-05, "loss": 0.4596, "num_input_tokens_seen": 28132336, "step": 23185 }, { "epoch": 2.582692950217173, "grad_norm": 0.10487425327301025, "learning_rate": 4.6281360077970736e-05, "loss": 0.4538, "num_input_tokens_seen": 28138416, "step": 23190 }, { "epoch": 2.583249805100791, "grad_norm": 0.10132234543561935, "learning_rate": 4.627880965048914e-05, "loss": 0.4653, "num_input_tokens_seen": 28144592, "step": 23195 }, { "epoch": 2.583806659984408, "grad_norm": 0.14611485600471497, "learning_rate": 4.627625841902593e-05, "loss": 0.4649, "num_input_tokens_seen": 28150736, "step": 23200 }, { "epoch": 2.5843635148680253, "grad_norm": 0.151310995221138, "learning_rate": 4.627370638367749e-05, "loss": 0.4692, "num_input_tokens_seen": 28157040, "step": 23205 }, { "epoch": 2.584920369751643, "grad_norm": 0.1427152454853058, "learning_rate": 4.627115354454026e-05, "loss": 0.4736, "num_input_tokens_seen": 28162960, "step": 23210 }, { "epoch": 2.58547722463526, "grad_norm": 0.14499163627624512, "learning_rate": 4.6268599901710676e-05, "loss": 0.4555, "num_input_tokens_seen": 28169040, "step": 23215 }, { "epoch": 2.5860340795188774, "grad_norm": 0.1471845954656601, "learning_rate": 4.626604545528522e-05, "loss": 0.4543, "num_input_tokens_seen": 28175376, "step": 23220 }, { "epoch": 2.5865909344024947, "grad_norm": 0.1643514782190323, "learning_rate": 4.626349020536043e-05, "loss": 0.4466, "num_input_tokens_seen": 28181584, "step": 23225 }, { "epoch": 2.587147789286112, "grad_norm": 0.1443173736333847, "learning_rate": 4.626093415203283e-05, "loss": 0.4652, "num_input_tokens_seen": 28187888, "step": 23230 }, { "epoch": 2.5877046441697296, "grad_norm": 0.1516975313425064, "learning_rate": 4.625837729539901e-05, "loss": 0.4757, "num_input_tokens_seen": 28194032, "step": 23235 }, { "epoch": 2.588261499053347, "grad_norm": 0.17113937437534332, "learning_rate": 4.625581963555556e-05, "loss": 0.4673, "num_input_tokens_seen": 28200272, "step": 23240 }, { "epoch": 2.588818353936964, "grad_norm": 0.14419865608215332, "learning_rate": 4.625326117259913e-05, "loss": 0.494, "num_input_tokens_seen": 28206320, "step": 23245 }, { "epoch": 2.5893752088205813, "grad_norm": 0.1432252824306488, "learning_rate": 4.6250701906626394e-05, "loss": 0.4736, "num_input_tokens_seen": 28212208, "step": 23250 }, { "epoch": 2.5899320637041985, "grad_norm": 0.17961560189723969, "learning_rate": 4.6248141837734024e-05, "loss": 0.4581, "num_input_tokens_seen": 28218224, "step": 23255 }, { "epoch": 2.590488918587816, "grad_norm": 0.1591670662164688, "learning_rate": 4.624558096601877e-05, "loss": 0.4858, "num_input_tokens_seen": 28224464, "step": 23260 }, { "epoch": 2.5910457734714334, "grad_norm": 0.110333152115345, "learning_rate": 4.624301929157738e-05, "loss": 0.4721, "num_input_tokens_seen": 28230544, "step": 23265 }, { "epoch": 2.5916026283550506, "grad_norm": 0.1315757781267166, "learning_rate": 4.624045681450665e-05, "loss": 0.4671, "num_input_tokens_seen": 28236560, "step": 23270 }, { "epoch": 2.592159483238668, "grad_norm": 0.13571467995643616, "learning_rate": 4.623789353490339e-05, "loss": 0.4542, "num_input_tokens_seen": 28242736, "step": 23275 }, { "epoch": 2.592716338122285, "grad_norm": 0.1665598452091217, "learning_rate": 4.6235329452864444e-05, "loss": 0.4502, "num_input_tokens_seen": 28248912, "step": 23280 }, { "epoch": 2.5932731930059028, "grad_norm": 0.12159568071365356, "learning_rate": 4.623276456848671e-05, "loss": 0.4642, "num_input_tokens_seen": 28255152, "step": 23285 }, { "epoch": 2.59383004788952, "grad_norm": 0.1298862099647522, "learning_rate": 4.623019888186708e-05, "loss": 0.4582, "num_input_tokens_seen": 28261360, "step": 23290 }, { "epoch": 2.594386902773137, "grad_norm": 0.12485479563474655, "learning_rate": 4.622763239310251e-05, "loss": 0.4567, "num_input_tokens_seen": 28267248, "step": 23295 }, { "epoch": 2.594943757656755, "grad_norm": 0.14375092089176178, "learning_rate": 4.6225065102289945e-05, "loss": 0.4485, "num_input_tokens_seen": 28273072, "step": 23300 }, { "epoch": 2.595500612540372, "grad_norm": 0.23428687453269958, "learning_rate": 4.622249700952641e-05, "loss": 0.4626, "num_input_tokens_seen": 28279408, "step": 23305 }, { "epoch": 2.5960574674239894, "grad_norm": 0.11686106026172638, "learning_rate": 4.621992811490893e-05, "loss": 0.4543, "num_input_tokens_seen": 28285584, "step": 23310 }, { "epoch": 2.5966143223076066, "grad_norm": 0.15840871632099152, "learning_rate": 4.621735841853456e-05, "loss": 0.4585, "num_input_tokens_seen": 28291568, "step": 23315 }, { "epoch": 2.597171177191224, "grad_norm": 0.12887875735759735, "learning_rate": 4.62147879205004e-05, "loss": 0.457, "num_input_tokens_seen": 28297584, "step": 23320 }, { "epoch": 2.5977280320748415, "grad_norm": 0.1543656885623932, "learning_rate": 4.621221662090356e-05, "loss": 0.4663, "num_input_tokens_seen": 28303632, "step": 23325 }, { "epoch": 2.5982848869584587, "grad_norm": 0.21496093273162842, "learning_rate": 4.62096445198412e-05, "loss": 0.4587, "num_input_tokens_seen": 28309040, "step": 23330 }, { "epoch": 2.598841741842076, "grad_norm": 0.1359773576259613, "learning_rate": 4.62070716174105e-05, "loss": 0.4589, "num_input_tokens_seen": 28315024, "step": 23335 }, { "epoch": 2.599398596725693, "grad_norm": 0.12646225094795227, "learning_rate": 4.620449791370868e-05, "loss": 0.4659, "num_input_tokens_seen": 28321168, "step": 23340 }, { "epoch": 2.5999554516093104, "grad_norm": 0.11024826765060425, "learning_rate": 4.6201923408832966e-05, "loss": 0.4731, "num_input_tokens_seen": 28327088, "step": 23345 }, { "epoch": 2.600512306492928, "grad_norm": 0.12041205167770386, "learning_rate": 4.6199348102880643e-05, "loss": 0.4567, "num_input_tokens_seen": 28333360, "step": 23350 }, { "epoch": 2.6010691613765453, "grad_norm": 0.15944334864616394, "learning_rate": 4.619677199594901e-05, "loss": 0.4493, "num_input_tokens_seen": 28339472, "step": 23355 }, { "epoch": 2.6016260162601625, "grad_norm": 0.116562120616436, "learning_rate": 4.61941950881354e-05, "loss": 0.4729, "num_input_tokens_seen": 28345072, "step": 23360 }, { "epoch": 2.6021828711437798, "grad_norm": 0.12104838341474533, "learning_rate": 4.619161737953719e-05, "loss": 0.4634, "num_input_tokens_seen": 28351088, "step": 23365 }, { "epoch": 2.602739726027397, "grad_norm": 0.12494779378175735, "learning_rate": 4.618903887025176e-05, "loss": 0.4621, "num_input_tokens_seen": 28357200, "step": 23370 }, { "epoch": 2.6032965809110147, "grad_norm": 0.11890285462141037, "learning_rate": 4.618645956037654e-05, "loss": 0.478, "num_input_tokens_seen": 28363056, "step": 23375 }, { "epoch": 2.603853435794632, "grad_norm": 0.10437708348035812, "learning_rate": 4.618387945000897e-05, "loss": 0.4716, "num_input_tokens_seen": 28368752, "step": 23380 }, { "epoch": 2.604410290678249, "grad_norm": 0.11341574043035507, "learning_rate": 4.6181298539246564e-05, "loss": 0.4713, "num_input_tokens_seen": 28375024, "step": 23385 }, { "epoch": 2.604967145561867, "grad_norm": 0.12099717557430267, "learning_rate": 4.6178716828186806e-05, "loss": 0.4767, "num_input_tokens_seen": 28381072, "step": 23390 }, { "epoch": 2.605524000445484, "grad_norm": 0.12723007798194885, "learning_rate": 4.617613431692726e-05, "loss": 0.4635, "num_input_tokens_seen": 28386928, "step": 23395 }, { "epoch": 2.6060808553291013, "grad_norm": 0.12695623934268951, "learning_rate": 4.617355100556551e-05, "loss": 0.4688, "num_input_tokens_seen": 28393296, "step": 23400 }, { "epoch": 2.6066377102127185, "grad_norm": 0.13116025924682617, "learning_rate": 4.617096689419913e-05, "loss": 0.4593, "num_input_tokens_seen": 28399472, "step": 23405 }, { "epoch": 2.6071945650963357, "grad_norm": 0.12691421806812286, "learning_rate": 4.616838198292579e-05, "loss": 0.4597, "num_input_tokens_seen": 28405712, "step": 23410 }, { "epoch": 2.6077514199799534, "grad_norm": 0.10541708767414093, "learning_rate": 4.616579627184313e-05, "loss": 0.472, "num_input_tokens_seen": 28411312, "step": 23415 }, { "epoch": 2.6083082748635706, "grad_norm": 0.13919511437416077, "learning_rate": 4.616320976104887e-05, "loss": 0.4742, "num_input_tokens_seen": 28417712, "step": 23420 }, { "epoch": 2.608865129747188, "grad_norm": 0.11361421644687653, "learning_rate": 4.616062245064071e-05, "loss": 0.4585, "num_input_tokens_seen": 28423696, "step": 23425 }, { "epoch": 2.609421984630805, "grad_norm": 0.10644912719726562, "learning_rate": 4.615803434071643e-05, "loss": 0.474, "num_input_tokens_seen": 28429680, "step": 23430 }, { "epoch": 2.6099788395144223, "grad_norm": 0.13025954365730286, "learning_rate": 4.615544543137381e-05, "loss": 0.4731, "num_input_tokens_seen": 28435760, "step": 23435 }, { "epoch": 2.61053569439804, "grad_norm": 0.12777449190616608, "learning_rate": 4.6152855722710656e-05, "loss": 0.4629, "num_input_tokens_seen": 28441648, "step": 23440 }, { "epoch": 2.6110925492816572, "grad_norm": 0.1583937257528305, "learning_rate": 4.615026521482484e-05, "loss": 0.457, "num_input_tokens_seen": 28448016, "step": 23445 }, { "epoch": 2.6116494041652745, "grad_norm": 0.12409865111112595, "learning_rate": 4.614767390781421e-05, "loss": 0.4677, "num_input_tokens_seen": 28453936, "step": 23450 }, { "epoch": 2.6122062590488917, "grad_norm": 0.13288624584674835, "learning_rate": 4.6145081801776696e-05, "loss": 0.4544, "num_input_tokens_seen": 28459760, "step": 23455 }, { "epoch": 2.612763113932509, "grad_norm": 0.12959249317646027, "learning_rate": 4.614248889681023e-05, "loss": 0.4511, "num_input_tokens_seen": 28465648, "step": 23460 }, { "epoch": 2.6133199688161266, "grad_norm": 0.1461659073829651, "learning_rate": 4.613989519301277e-05, "loss": 0.4694, "num_input_tokens_seen": 28471760, "step": 23465 }, { "epoch": 2.613876823699744, "grad_norm": 0.1261180341243744, "learning_rate": 4.6137300690482335e-05, "loss": 0.4579, "num_input_tokens_seen": 28477968, "step": 23470 }, { "epoch": 2.614433678583361, "grad_norm": 0.13354246318340302, "learning_rate": 4.613470538931693e-05, "loss": 0.4682, "num_input_tokens_seen": 28484368, "step": 23475 }, { "epoch": 2.6149905334669787, "grad_norm": 0.1320958286523819, "learning_rate": 4.613210928961464e-05, "loss": 0.462, "num_input_tokens_seen": 28490672, "step": 23480 }, { "epoch": 2.615547388350596, "grad_norm": 0.11988124996423721, "learning_rate": 4.612951239147353e-05, "loss": 0.4499, "num_input_tokens_seen": 28496944, "step": 23485 }, { "epoch": 2.616104243234213, "grad_norm": 0.1446286141872406, "learning_rate": 4.612691469499173e-05, "loss": 0.457, "num_input_tokens_seen": 28502896, "step": 23490 }, { "epoch": 2.6166610981178304, "grad_norm": 0.14086951315402985, "learning_rate": 4.612431620026739e-05, "loss": 0.4594, "num_input_tokens_seen": 28509040, "step": 23495 }, { "epoch": 2.6172179530014477, "grad_norm": 0.11777922511100769, "learning_rate": 4.6121716907398695e-05, "loss": 0.4539, "num_input_tokens_seen": 28515152, "step": 23500 }, { "epoch": 2.6177748078850653, "grad_norm": 0.12327949702739716, "learning_rate": 4.611911681648384e-05, "loss": 0.4657, "num_input_tokens_seen": 28521104, "step": 23505 }, { "epoch": 2.6183316627686826, "grad_norm": 0.11889517307281494, "learning_rate": 4.611651592762107e-05, "loss": 0.4695, "num_input_tokens_seen": 28526992, "step": 23510 }, { "epoch": 2.6188885176523, "grad_norm": 0.14219637215137482, "learning_rate": 4.611391424090866e-05, "loss": 0.4596, "num_input_tokens_seen": 28533200, "step": 23515 }, { "epoch": 2.619445372535917, "grad_norm": 0.1518266797065735, "learning_rate": 4.6111311756444905e-05, "loss": 0.4677, "num_input_tokens_seen": 28539376, "step": 23520 }, { "epoch": 2.6200022274195343, "grad_norm": 0.1484195441007614, "learning_rate": 4.610870847432815e-05, "loss": 0.4611, "num_input_tokens_seen": 28545552, "step": 23525 }, { "epoch": 2.620559082303152, "grad_norm": 0.12208888679742813, "learning_rate": 4.6106104394656726e-05, "loss": 0.4697, "num_input_tokens_seen": 28551472, "step": 23530 }, { "epoch": 2.621115937186769, "grad_norm": 0.11562571674585342, "learning_rate": 4.610349951752904e-05, "loss": 0.4758, "num_input_tokens_seen": 28557424, "step": 23535 }, { "epoch": 2.6216727920703864, "grad_norm": 0.1296364963054657, "learning_rate": 4.610089384304352e-05, "loss": 0.4607, "num_input_tokens_seen": 28563536, "step": 23540 }, { "epoch": 2.6222296469540036, "grad_norm": 0.1384795755147934, "learning_rate": 4.609828737129861e-05, "loss": 0.47, "num_input_tokens_seen": 28569520, "step": 23545 }, { "epoch": 2.622786501837621, "grad_norm": 0.13570177555084229, "learning_rate": 4.609568010239279e-05, "loss": 0.4558, "num_input_tokens_seen": 28575920, "step": 23550 }, { "epoch": 2.6233433567212385, "grad_norm": 0.17313562333583832, "learning_rate": 4.609307203642457e-05, "loss": 0.4734, "num_input_tokens_seen": 28581936, "step": 23555 }, { "epoch": 2.6239002116048558, "grad_norm": 0.13565412163734436, "learning_rate": 4.6090463173492494e-05, "loss": 0.4569, "num_input_tokens_seen": 28588208, "step": 23560 }, { "epoch": 2.624457066488473, "grad_norm": 0.12551449239253998, "learning_rate": 4.6087853513695126e-05, "loss": 0.4654, "num_input_tokens_seen": 28594640, "step": 23565 }, { "epoch": 2.6250139213720907, "grad_norm": 0.19700825214385986, "learning_rate": 4.608524305713108e-05, "loss": 0.4628, "num_input_tokens_seen": 28601040, "step": 23570 }, { "epoch": 2.625570776255708, "grad_norm": 0.13875354826450348, "learning_rate": 4.6082631803898977e-05, "loss": 0.4497, "num_input_tokens_seen": 28607152, "step": 23575 }, { "epoch": 2.626127631139325, "grad_norm": 0.1576012372970581, "learning_rate": 4.608001975409748e-05, "loss": 0.4682, "num_input_tokens_seen": 28613424, "step": 23580 }, { "epoch": 2.6266844860229424, "grad_norm": 0.10412176698446274, "learning_rate": 4.6077406907825285e-05, "loss": 0.4659, "num_input_tokens_seen": 28619184, "step": 23585 }, { "epoch": 2.6272413409065596, "grad_norm": 0.13392162322998047, "learning_rate": 4.607479326518111e-05, "loss": 0.4735, "num_input_tokens_seen": 28625072, "step": 23590 }, { "epoch": 2.6277981957901773, "grad_norm": 0.1148931011557579, "learning_rate": 4.607217882626371e-05, "loss": 0.4601, "num_input_tokens_seen": 28631120, "step": 23595 }, { "epoch": 2.6283550506737945, "grad_norm": 0.13628563284873962, "learning_rate": 4.606956359117186e-05, "loss": 0.47, "num_input_tokens_seen": 28636656, "step": 23600 }, { "epoch": 2.6289119055574117, "grad_norm": 0.19795382022857666, "learning_rate": 4.606694756000438e-05, "loss": 0.4739, "num_input_tokens_seen": 28642544, "step": 23605 }, { "epoch": 2.629468760441029, "grad_norm": 0.14515574276447296, "learning_rate": 4.60643307328601e-05, "loss": 0.457, "num_input_tokens_seen": 28648560, "step": 23610 }, { "epoch": 2.630025615324646, "grad_norm": 0.16734226047992706, "learning_rate": 4.6061713109837915e-05, "loss": 0.4697, "num_input_tokens_seen": 28654448, "step": 23615 }, { "epoch": 2.630582470208264, "grad_norm": 0.12965120375156403, "learning_rate": 4.605909469103671e-05, "loss": 0.4539, "num_input_tokens_seen": 28660496, "step": 23620 }, { "epoch": 2.631139325091881, "grad_norm": 0.1278269737958908, "learning_rate": 4.605647547655542e-05, "loss": 0.47, "num_input_tokens_seen": 28666640, "step": 23625 }, { "epoch": 2.6316961799754983, "grad_norm": 0.11186320334672928, "learning_rate": 4.605385546649301e-05, "loss": 0.4648, "num_input_tokens_seen": 28672848, "step": 23630 }, { "epoch": 2.6322530348591155, "grad_norm": 0.1400761604309082, "learning_rate": 4.605123466094846e-05, "loss": 0.44, "num_input_tokens_seen": 28678640, "step": 23635 }, { "epoch": 2.632809889742733, "grad_norm": 0.13762851059436798, "learning_rate": 4.604861306002081e-05, "loss": 0.4741, "num_input_tokens_seen": 28684912, "step": 23640 }, { "epoch": 2.6333667446263505, "grad_norm": 0.14688484370708466, "learning_rate": 4.60459906638091e-05, "loss": 0.4884, "num_input_tokens_seen": 28691088, "step": 23645 }, { "epoch": 2.6339235995099677, "grad_norm": 0.11571082472801208, "learning_rate": 4.604336747241242e-05, "loss": 0.457, "num_input_tokens_seen": 28697232, "step": 23650 }, { "epoch": 2.634480454393585, "grad_norm": 0.12657982110977173, "learning_rate": 4.6040743485929885e-05, "loss": 0.4596, "num_input_tokens_seen": 28703408, "step": 23655 }, { "epoch": 2.6350373092772026, "grad_norm": 0.13177333772182465, "learning_rate": 4.603811870446063e-05, "loss": 0.4726, "num_input_tokens_seen": 28709680, "step": 23660 }, { "epoch": 2.63559416416082, "grad_norm": 0.15655894577503204, "learning_rate": 4.603549312810384e-05, "loss": 0.4707, "num_input_tokens_seen": 28715696, "step": 23665 }, { "epoch": 2.636151019044437, "grad_norm": 0.14191119372844696, "learning_rate": 4.603286675695869e-05, "loss": 0.4638, "num_input_tokens_seen": 28722032, "step": 23670 }, { "epoch": 2.6367078739280543, "grad_norm": 0.12773829698562622, "learning_rate": 4.603023959112444e-05, "loss": 0.4601, "num_input_tokens_seen": 28728144, "step": 23675 }, { "epoch": 2.6372647288116715, "grad_norm": 0.17888852953910828, "learning_rate": 4.602761163070034e-05, "loss": 0.4681, "num_input_tokens_seen": 28734480, "step": 23680 }, { "epoch": 2.637821583695289, "grad_norm": 0.16529692709445953, "learning_rate": 4.602498287578569e-05, "loss": 0.4587, "num_input_tokens_seen": 28740624, "step": 23685 }, { "epoch": 2.6383784385789064, "grad_norm": 0.17768171429634094, "learning_rate": 4.60223533264798e-05, "loss": 0.463, "num_input_tokens_seen": 28746576, "step": 23690 }, { "epoch": 2.6389352934625236, "grad_norm": 0.16377070546150208, "learning_rate": 4.601972298288204e-05, "loss": 0.4672, "num_input_tokens_seen": 28752912, "step": 23695 }, { "epoch": 2.639492148346141, "grad_norm": 0.18187831342220306, "learning_rate": 4.6017091845091784e-05, "loss": 0.4583, "num_input_tokens_seen": 28758832, "step": 23700 }, { "epoch": 2.640049003229758, "grad_norm": 0.14434629678726196, "learning_rate": 4.601445991320845e-05, "loss": 0.457, "num_input_tokens_seen": 28765264, "step": 23705 }, { "epoch": 2.640605858113376, "grad_norm": 0.14918969571590424, "learning_rate": 4.6011827187331466e-05, "loss": 0.4759, "num_input_tokens_seen": 28771216, "step": 23710 }, { "epoch": 2.641162712996993, "grad_norm": 0.14803878962993622, "learning_rate": 4.600919366756033e-05, "loss": 0.4563, "num_input_tokens_seen": 28777488, "step": 23715 }, { "epoch": 2.6417195678806102, "grad_norm": 0.11480015516281128, "learning_rate": 4.600655935399452e-05, "loss": 0.4479, "num_input_tokens_seen": 28783920, "step": 23720 }, { "epoch": 2.642276422764228, "grad_norm": 0.16902948915958405, "learning_rate": 4.6003924246733576e-05, "loss": 0.4804, "num_input_tokens_seen": 28790288, "step": 23725 }, { "epoch": 2.6428332776478447, "grad_norm": 0.1797831803560257, "learning_rate": 4.600128834587707e-05, "loss": 0.4679, "num_input_tokens_seen": 28796720, "step": 23730 }, { "epoch": 2.6433901325314624, "grad_norm": 0.1293475180864334, "learning_rate": 4.599865165152458e-05, "loss": 0.461, "num_input_tokens_seen": 28802736, "step": 23735 }, { "epoch": 2.6439469874150796, "grad_norm": 0.13676607608795166, "learning_rate": 4.599601416377575e-05, "loss": 0.4707, "num_input_tokens_seen": 28808848, "step": 23740 }, { "epoch": 2.644503842298697, "grad_norm": 0.14000645279884338, "learning_rate": 4.5993375882730206e-05, "loss": 0.4658, "num_input_tokens_seen": 28815216, "step": 23745 }, { "epoch": 2.6450606971823145, "grad_norm": 0.0947408378124237, "learning_rate": 4.5990736808487654e-05, "loss": 0.4627, "num_input_tokens_seen": 28821392, "step": 23750 }, { "epoch": 2.6456175520659317, "grad_norm": 0.11655968427658081, "learning_rate": 4.59880969411478e-05, "loss": 0.4573, "num_input_tokens_seen": 28827088, "step": 23755 }, { "epoch": 2.646174406949549, "grad_norm": 0.1319262981414795, "learning_rate": 4.598545628081037e-05, "loss": 0.4567, "num_input_tokens_seen": 28833168, "step": 23760 }, { "epoch": 2.646731261833166, "grad_norm": 0.18319866061210632, "learning_rate": 4.598281482757516e-05, "loss": 0.4726, "num_input_tokens_seen": 28839536, "step": 23765 }, { "epoch": 2.6472881167167834, "grad_norm": 0.14365984499454498, "learning_rate": 4.5980172581541956e-05, "loss": 0.4624, "num_input_tokens_seen": 28845712, "step": 23770 }, { "epoch": 2.647844971600401, "grad_norm": 0.13406850397586823, "learning_rate": 4.5977529542810606e-05, "loss": 0.4655, "num_input_tokens_seen": 28852080, "step": 23775 }, { "epoch": 2.6484018264840183, "grad_norm": 0.1263541877269745, "learning_rate": 4.597488571148096e-05, "loss": 0.468, "num_input_tokens_seen": 28857680, "step": 23780 }, { "epoch": 2.6489586813676356, "grad_norm": 0.14097781479358673, "learning_rate": 4.597224108765292e-05, "loss": 0.468, "num_input_tokens_seen": 28863536, "step": 23785 }, { "epoch": 2.649515536251253, "grad_norm": 0.11925245821475983, "learning_rate": 4.59695956714264e-05, "loss": 0.4752, "num_input_tokens_seen": 28869424, "step": 23790 }, { "epoch": 2.65007239113487, "grad_norm": 0.15175855159759521, "learning_rate": 4.5966949462901354e-05, "loss": 0.4776, "num_input_tokens_seen": 28875344, "step": 23795 }, { "epoch": 2.6506292460184877, "grad_norm": 0.16387838125228882, "learning_rate": 4.596430246217776e-05, "loss": 0.469, "num_input_tokens_seen": 28881296, "step": 23800 }, { "epoch": 2.651186100902105, "grad_norm": 0.13243603706359863, "learning_rate": 4.596165466935565e-05, "loss": 0.4533, "num_input_tokens_seen": 28887088, "step": 23805 }, { "epoch": 2.651742955785722, "grad_norm": 0.17384620010852814, "learning_rate": 4.595900608453504e-05, "loss": 0.4615, "num_input_tokens_seen": 28893232, "step": 23810 }, { "epoch": 2.65229981066934, "grad_norm": 0.13299070298671722, "learning_rate": 4.595635670781602e-05, "loss": 0.4683, "num_input_tokens_seen": 28899504, "step": 23815 }, { "epoch": 2.6528566655529566, "grad_norm": 0.13769805431365967, "learning_rate": 4.595370653929868e-05, "loss": 0.4538, "num_input_tokens_seen": 28905648, "step": 23820 }, { "epoch": 2.6534135204365743, "grad_norm": 0.1849275529384613, "learning_rate": 4.595105557908316e-05, "loss": 0.4742, "num_input_tokens_seen": 28911696, "step": 23825 }, { "epoch": 2.6539703753201915, "grad_norm": 0.1171412244439125, "learning_rate": 4.594840382726961e-05, "loss": 0.4556, "num_input_tokens_seen": 28917296, "step": 23830 }, { "epoch": 2.6545272302038088, "grad_norm": 0.14216428995132446, "learning_rate": 4.5945751283958243e-05, "loss": 0.4797, "num_input_tokens_seen": 28923568, "step": 23835 }, { "epoch": 2.6550840850874264, "grad_norm": 0.11389454454183578, "learning_rate": 4.594309794924927e-05, "loss": 0.4711, "num_input_tokens_seen": 28929840, "step": 23840 }, { "epoch": 2.6556409399710437, "grad_norm": 0.1374807506799698, "learning_rate": 4.594044382324293e-05, "loss": 0.4614, "num_input_tokens_seen": 28935536, "step": 23845 }, { "epoch": 2.656197794854661, "grad_norm": 0.13089197874069214, "learning_rate": 4.5937788906039525e-05, "loss": 0.4658, "num_input_tokens_seen": 28941520, "step": 23850 }, { "epoch": 2.656754649738278, "grad_norm": 0.13197048008441925, "learning_rate": 4.593513319773936e-05, "loss": 0.4605, "num_input_tokens_seen": 28947792, "step": 23855 }, { "epoch": 2.6573115046218954, "grad_norm": 0.17152436077594757, "learning_rate": 4.5932476698442764e-05, "loss": 0.4684, "num_input_tokens_seen": 28953712, "step": 23860 }, { "epoch": 2.657868359505513, "grad_norm": 0.18304437398910522, "learning_rate": 4.5929819408250115e-05, "loss": 0.4726, "num_input_tokens_seen": 28960016, "step": 23865 }, { "epoch": 2.6584252143891303, "grad_norm": 0.10346097499132156, "learning_rate": 4.592716132726182e-05, "loss": 0.4661, "num_input_tokens_seen": 28965776, "step": 23870 }, { "epoch": 2.6589820692727475, "grad_norm": 0.14660176634788513, "learning_rate": 4.5924502455578306e-05, "loss": 0.4597, "num_input_tokens_seen": 28971632, "step": 23875 }, { "epoch": 2.6595389241563647, "grad_norm": 0.14035524427890778, "learning_rate": 4.592184279330003e-05, "loss": 0.4576, "num_input_tokens_seen": 28977936, "step": 23880 }, { "epoch": 2.660095779039982, "grad_norm": 0.15548668801784515, "learning_rate": 4.591918234052749e-05, "loss": 0.4571, "num_input_tokens_seen": 28984048, "step": 23885 }, { "epoch": 2.6606526339235996, "grad_norm": 0.20295168459415436, "learning_rate": 4.59165210973612e-05, "loss": 0.4623, "num_input_tokens_seen": 28990224, "step": 23890 }, { "epoch": 2.661209488807217, "grad_norm": 0.16538555920124054, "learning_rate": 4.591385906390171e-05, "loss": 0.4888, "num_input_tokens_seen": 28996464, "step": 23895 }, { "epoch": 2.661766343690834, "grad_norm": 0.1222800761461258, "learning_rate": 4.591119624024961e-05, "loss": 0.4665, "num_input_tokens_seen": 29002608, "step": 23900 }, { "epoch": 2.6623231985744518, "grad_norm": 0.104734867811203, "learning_rate": 4.5908532626505504e-05, "loss": 0.4545, "num_input_tokens_seen": 29008816, "step": 23905 }, { "epoch": 2.6628800534580686, "grad_norm": 0.17268182337284088, "learning_rate": 4.590586822277002e-05, "loss": 0.4541, "num_input_tokens_seen": 29014800, "step": 23910 }, { "epoch": 2.6634369083416862, "grad_norm": 0.15370939671993256, "learning_rate": 4.590320302914385e-05, "loss": 0.4721, "num_input_tokens_seen": 29021168, "step": 23915 }, { "epoch": 2.6639937632253035, "grad_norm": 0.15266090631484985, "learning_rate": 4.590053704572768e-05, "loss": 0.4698, "num_input_tokens_seen": 29027088, "step": 23920 }, { "epoch": 2.6645506181089207, "grad_norm": 0.13304781913757324, "learning_rate": 4.5897870272622234e-05, "loss": 0.4451, "num_input_tokens_seen": 29033296, "step": 23925 }, { "epoch": 2.6651074729925384, "grad_norm": 0.18773648142814636, "learning_rate": 4.5895202709928284e-05, "loss": 0.4579, "num_input_tokens_seen": 29039312, "step": 23930 }, { "epoch": 2.6656643278761556, "grad_norm": 0.1546107530593872, "learning_rate": 4.5892534357746616e-05, "loss": 0.4629, "num_input_tokens_seen": 29045776, "step": 23935 }, { "epoch": 2.666221182759773, "grad_norm": 0.14069333672523499, "learning_rate": 4.588986521617804e-05, "loss": 0.4525, "num_input_tokens_seen": 29052272, "step": 23940 }, { "epoch": 2.66677803764339, "grad_norm": 0.19790412485599518, "learning_rate": 4.588719528532342e-05, "loss": 0.4702, "num_input_tokens_seen": 29058512, "step": 23945 }, { "epoch": 2.6673348925270073, "grad_norm": 0.18298986554145813, "learning_rate": 4.5884524565283616e-05, "loss": 0.4749, "num_input_tokens_seen": 29063344, "step": 23950 }, { "epoch": 2.667891747410625, "grad_norm": 0.17476914823055267, "learning_rate": 4.588185305615955e-05, "loss": 0.4687, "num_input_tokens_seen": 29069424, "step": 23955 }, { "epoch": 2.668448602294242, "grad_norm": 0.1156434714794159, "learning_rate": 4.5879180758052155e-05, "loss": 0.4598, "num_input_tokens_seen": 29075632, "step": 23960 }, { "epoch": 2.6690054571778594, "grad_norm": 0.14806810021400452, "learning_rate": 4.58765076710624e-05, "loss": 0.4528, "num_input_tokens_seen": 29081808, "step": 23965 }, { "epoch": 2.6695623120614766, "grad_norm": 0.18359778821468353, "learning_rate": 4.587383379529129e-05, "loss": 0.4523, "num_input_tokens_seen": 29087760, "step": 23970 }, { "epoch": 2.670119166945094, "grad_norm": 0.1867581456899643, "learning_rate": 4.5871159130839834e-05, "loss": 0.4672, "num_input_tokens_seen": 29093904, "step": 23975 }, { "epoch": 2.6706760218287116, "grad_norm": 0.16606883704662323, "learning_rate": 4.586848367780911e-05, "loss": 0.4733, "num_input_tokens_seen": 29099984, "step": 23980 }, { "epoch": 2.671232876712329, "grad_norm": 0.16681919991970062, "learning_rate": 4.5865807436300196e-05, "loss": 0.4517, "num_input_tokens_seen": 29106224, "step": 23985 }, { "epoch": 2.671789731595946, "grad_norm": 0.15403512120246887, "learning_rate": 4.58631304064142e-05, "loss": 0.479, "num_input_tokens_seen": 29112400, "step": 23990 }, { "epoch": 2.6723465864795637, "grad_norm": 0.1344432532787323, "learning_rate": 4.586045258825229e-05, "loss": 0.4606, "num_input_tokens_seen": 29118704, "step": 23995 }, { "epoch": 2.672903441363181, "grad_norm": 0.13281023502349854, "learning_rate": 4.585777398191563e-05, "loss": 0.474, "num_input_tokens_seen": 29124976, "step": 24000 }, { "epoch": 2.673460296246798, "grad_norm": 0.13746260106563568, "learning_rate": 4.585509458750542e-05, "loss": 0.467, "num_input_tokens_seen": 29130768, "step": 24005 }, { "epoch": 2.6740171511304154, "grad_norm": 0.12468244135379791, "learning_rate": 4.585241440512291e-05, "loss": 0.4765, "num_input_tokens_seen": 29136816, "step": 24010 }, { "epoch": 2.6745740060140326, "grad_norm": 0.18288637697696686, "learning_rate": 4.584973343486935e-05, "loss": 0.4526, "num_input_tokens_seen": 29142960, "step": 24015 }, { "epoch": 2.6751308608976503, "grad_norm": 0.1475377082824707, "learning_rate": 4.5847051676846054e-05, "loss": 0.4538, "num_input_tokens_seen": 29149424, "step": 24020 }, { "epoch": 2.6756877157812675, "grad_norm": 0.09501579403877258, "learning_rate": 4.584436913115433e-05, "loss": 0.4728, "num_input_tokens_seen": 29154960, "step": 24025 }, { "epoch": 2.6762445706648847, "grad_norm": 0.11616598069667816, "learning_rate": 4.584168579789555e-05, "loss": 0.4665, "num_input_tokens_seen": 29160816, "step": 24030 }, { "epoch": 2.676801425548502, "grad_norm": 0.13796843588352203, "learning_rate": 4.5839001677171076e-05, "loss": 0.4722, "num_input_tokens_seen": 29166800, "step": 24035 }, { "epoch": 2.677358280432119, "grad_norm": 0.15941298007965088, "learning_rate": 4.583631676908235e-05, "loss": 0.4622, "num_input_tokens_seen": 29172656, "step": 24040 }, { "epoch": 2.677915135315737, "grad_norm": 0.1588006168603897, "learning_rate": 4.5833631073730795e-05, "loss": 0.4693, "num_input_tokens_seen": 29178960, "step": 24045 }, { "epoch": 2.678471990199354, "grad_norm": 0.19352605938911438, "learning_rate": 4.58309445912179e-05, "loss": 0.4573, "num_input_tokens_seen": 29184816, "step": 24050 }, { "epoch": 2.6790288450829713, "grad_norm": 0.13545671105384827, "learning_rate": 4.582825732164516e-05, "loss": 0.4681, "num_input_tokens_seen": 29190768, "step": 24055 }, { "epoch": 2.6795856999665886, "grad_norm": 0.14017120003700256, "learning_rate": 4.582556926511411e-05, "loss": 0.4696, "num_input_tokens_seen": 29197072, "step": 24060 }, { "epoch": 2.680142554850206, "grad_norm": 0.1429445594549179, "learning_rate": 4.582288042172632e-05, "loss": 0.4635, "num_input_tokens_seen": 29202992, "step": 24065 }, { "epoch": 2.6806994097338235, "grad_norm": 0.16783656179904938, "learning_rate": 4.582019079158337e-05, "loss": 0.4644, "num_input_tokens_seen": 29208752, "step": 24070 }, { "epoch": 2.6812562646174407, "grad_norm": 0.1469593495130539, "learning_rate": 4.581750037478689e-05, "loss": 0.4643, "num_input_tokens_seen": 29214512, "step": 24075 }, { "epoch": 2.681813119501058, "grad_norm": 0.14168742299079895, "learning_rate": 4.581480917143853e-05, "loss": 0.4555, "num_input_tokens_seen": 29220496, "step": 24080 }, { "epoch": 2.6823699743846756, "grad_norm": 0.1203145682811737, "learning_rate": 4.581211718163998e-05, "loss": 0.4549, "num_input_tokens_seen": 29226640, "step": 24085 }, { "epoch": 2.682926829268293, "grad_norm": 0.16881416738033295, "learning_rate": 4.580942440549295e-05, "loss": 0.463, "num_input_tokens_seen": 29232624, "step": 24090 }, { "epoch": 2.68348368415191, "grad_norm": 0.1679723560810089, "learning_rate": 4.5806730843099176e-05, "loss": 0.4497, "num_input_tokens_seen": 29238672, "step": 24095 }, { "epoch": 2.6840405390355273, "grad_norm": 0.14620736241340637, "learning_rate": 4.5804036494560436e-05, "loss": 0.4462, "num_input_tokens_seen": 29244816, "step": 24100 }, { "epoch": 2.6845973939191445, "grad_norm": 0.12359942495822906, "learning_rate": 4.580134135997853e-05, "loss": 0.4759, "num_input_tokens_seen": 29250384, "step": 24105 }, { "epoch": 2.685154248802762, "grad_norm": 0.12978947162628174, "learning_rate": 4.579864543945528e-05, "loss": 0.4513, "num_input_tokens_seen": 29256400, "step": 24110 }, { "epoch": 2.6857111036863794, "grad_norm": 0.16257259249687195, "learning_rate": 4.579594873309255e-05, "loss": 0.4595, "num_input_tokens_seen": 29262384, "step": 24115 }, { "epoch": 2.6862679585699967, "grad_norm": 0.1461169570684433, "learning_rate": 4.5793251240992244e-05, "loss": 0.468, "num_input_tokens_seen": 29268752, "step": 24120 }, { "epoch": 2.686824813453614, "grad_norm": 0.15272431075572968, "learning_rate": 4.579055296325625e-05, "loss": 0.4704, "num_input_tokens_seen": 29275024, "step": 24125 }, { "epoch": 2.687381668337231, "grad_norm": 0.11748488992452621, "learning_rate": 4.5787853899986554e-05, "loss": 0.4641, "num_input_tokens_seen": 29280816, "step": 24130 }, { "epoch": 2.687938523220849, "grad_norm": 0.1662353128194809, "learning_rate": 4.5785154051285125e-05, "loss": 0.4704, "num_input_tokens_seen": 29286256, "step": 24135 }, { "epoch": 2.688495378104466, "grad_norm": 0.15917238593101501, "learning_rate": 4.5782453417253955e-05, "loss": 0.4439, "num_input_tokens_seen": 29292624, "step": 24140 }, { "epoch": 2.6890522329880833, "grad_norm": 0.1610301285982132, "learning_rate": 4.57797519979951e-05, "loss": 0.4618, "num_input_tokens_seen": 29298704, "step": 24145 }, { "epoch": 2.6896090878717005, "grad_norm": 0.12581878900527954, "learning_rate": 4.577704979361062e-05, "loss": 0.4452, "num_input_tokens_seen": 29304496, "step": 24150 }, { "epoch": 2.6901659427553177, "grad_norm": 0.14926563203334808, "learning_rate": 4.577434680420262e-05, "loss": 0.4486, "num_input_tokens_seen": 29310544, "step": 24155 }, { "epoch": 2.6907227976389354, "grad_norm": 0.1689625382423401, "learning_rate": 4.577164302987322e-05, "loss": 0.4606, "num_input_tokens_seen": 29316656, "step": 24160 }, { "epoch": 2.6912796525225526, "grad_norm": 0.14875274896621704, "learning_rate": 4.576893847072457e-05, "loss": 0.4535, "num_input_tokens_seen": 29322384, "step": 24165 }, { "epoch": 2.69183650740617, "grad_norm": 0.1593739241361618, "learning_rate": 4.5766233126858884e-05, "loss": 0.4648, "num_input_tokens_seen": 29328304, "step": 24170 }, { "epoch": 2.6923933622897875, "grad_norm": 0.19038629531860352, "learning_rate": 4.5763526998378356e-05, "loss": 0.4697, "num_input_tokens_seen": 29334800, "step": 24175 }, { "epoch": 2.6929502171734048, "grad_norm": 0.12940585613250732, "learning_rate": 4.5760820085385236e-05, "loss": 0.4762, "num_input_tokens_seen": 29340752, "step": 24180 }, { "epoch": 2.693507072057022, "grad_norm": 0.11441398411989212, "learning_rate": 4.57581123879818e-05, "loss": 0.4616, "num_input_tokens_seen": 29346480, "step": 24185 }, { "epoch": 2.6940639269406392, "grad_norm": 0.1273719221353531, "learning_rate": 4.575540390627035e-05, "loss": 0.4656, "num_input_tokens_seen": 29352528, "step": 24190 }, { "epoch": 2.6946207818242565, "grad_norm": 0.1159227043390274, "learning_rate": 4.5752694640353246e-05, "loss": 0.4583, "num_input_tokens_seen": 29358672, "step": 24195 }, { "epoch": 2.695177636707874, "grad_norm": 0.13463427126407623, "learning_rate": 4.574998459033282e-05, "loss": 0.4554, "num_input_tokens_seen": 29365040, "step": 24200 }, { "epoch": 2.6957344915914914, "grad_norm": 0.12154999375343323, "learning_rate": 4.5747273756311484e-05, "loss": 0.4662, "num_input_tokens_seen": 29371152, "step": 24205 }, { "epoch": 2.6962913464751086, "grad_norm": 0.12816983461380005, "learning_rate": 4.5744562138391657e-05, "loss": 0.4797, "num_input_tokens_seen": 29377264, "step": 24210 }, { "epoch": 2.696848201358726, "grad_norm": 0.12548549473285675, "learning_rate": 4.574184973667579e-05, "loss": 0.4568, "num_input_tokens_seen": 29383440, "step": 24215 }, { "epoch": 2.697405056242343, "grad_norm": 0.2261320948600769, "learning_rate": 4.5739136551266365e-05, "loss": 0.4618, "num_input_tokens_seen": 29389456, "step": 24220 }, { "epoch": 2.6979619111259607, "grad_norm": 0.09819243103265762, "learning_rate": 4.57364225822659e-05, "loss": 0.463, "num_input_tokens_seen": 29395600, "step": 24225 }, { "epoch": 2.698518766009578, "grad_norm": 0.1522940844297409, "learning_rate": 4.573370782977695e-05, "loss": 0.471, "num_input_tokens_seen": 29401328, "step": 24230 }, { "epoch": 2.699075620893195, "grad_norm": 0.14441174268722534, "learning_rate": 4.573099229390207e-05, "loss": 0.4672, "num_input_tokens_seen": 29407376, "step": 24235 }, { "epoch": 2.6996324757768124, "grad_norm": 0.11940489709377289, "learning_rate": 4.572827597474386e-05, "loss": 0.4566, "num_input_tokens_seen": 29413424, "step": 24240 }, { "epoch": 2.7001893306604297, "grad_norm": 0.1689198613166809, "learning_rate": 4.572555887240495e-05, "loss": 0.4631, "num_input_tokens_seen": 29418768, "step": 24245 }, { "epoch": 2.7007461855440473, "grad_norm": 0.10561272501945496, "learning_rate": 4.5722840986988016e-05, "loss": 0.4584, "num_input_tokens_seen": 29424816, "step": 24250 }, { "epoch": 2.7013030404276646, "grad_norm": 0.14111213386058807, "learning_rate": 4.572012231859574e-05, "loss": 0.4571, "num_input_tokens_seen": 29430960, "step": 24255 }, { "epoch": 2.701859895311282, "grad_norm": 0.15608467161655426, "learning_rate": 4.571740286733084e-05, "loss": 0.4914, "num_input_tokens_seen": 29437296, "step": 24260 }, { "epoch": 2.7024167501948995, "grad_norm": 0.13895753026008606, "learning_rate": 4.571468263329607e-05, "loss": 0.4792, "num_input_tokens_seen": 29443088, "step": 24265 }, { "epoch": 2.7029736050785167, "grad_norm": 0.11358015239238739, "learning_rate": 4.5711961616594204e-05, "loss": 0.4525, "num_input_tokens_seen": 29449392, "step": 24270 }, { "epoch": 2.703530459962134, "grad_norm": 0.13363699615001678, "learning_rate": 4.570923981732805e-05, "loss": 0.4602, "num_input_tokens_seen": 29455856, "step": 24275 }, { "epoch": 2.704087314845751, "grad_norm": 0.09362128376960754, "learning_rate": 4.570651723560045e-05, "loss": 0.4594, "num_input_tokens_seen": 29461296, "step": 24280 }, { "epoch": 2.7046441697293684, "grad_norm": 0.13475301861763, "learning_rate": 4.570379387151428e-05, "loss": 0.4661, "num_input_tokens_seen": 29467536, "step": 24285 }, { "epoch": 2.705201024612986, "grad_norm": 0.14298109710216522, "learning_rate": 4.570106972517243e-05, "loss": 0.4647, "num_input_tokens_seen": 29473296, "step": 24290 }, { "epoch": 2.7057578794966033, "grad_norm": 0.15600961446762085, "learning_rate": 4.569834479667782e-05, "loss": 0.4588, "num_input_tokens_seen": 29479568, "step": 24295 }, { "epoch": 2.7063147343802205, "grad_norm": 0.173349991440773, "learning_rate": 4.5695619086133414e-05, "loss": 0.4661, "num_input_tokens_seen": 29485808, "step": 24300 }, { "epoch": 2.7068715892638378, "grad_norm": 0.18289104104042053, "learning_rate": 4.569289259364219e-05, "loss": 0.4815, "num_input_tokens_seen": 29492144, "step": 24305 }, { "epoch": 2.707428444147455, "grad_norm": 0.12384680658578873, "learning_rate": 4.569016531930718e-05, "loss": 0.4674, "num_input_tokens_seen": 29497616, "step": 24310 }, { "epoch": 2.7079852990310727, "grad_norm": 0.12445759773254395, "learning_rate": 4.568743726323141e-05, "loss": 0.4596, "num_input_tokens_seen": 29503568, "step": 24315 }, { "epoch": 2.70854215391469, "grad_norm": 0.11578629165887833, "learning_rate": 4.5684708425517974e-05, "loss": 0.4652, "num_input_tokens_seen": 29509936, "step": 24320 }, { "epoch": 2.709099008798307, "grad_norm": 0.10331857949495316, "learning_rate": 4.568197880626996e-05, "loss": 0.4703, "num_input_tokens_seen": 29516240, "step": 24325 }, { "epoch": 2.7096558636819243, "grad_norm": 0.11883816868066788, "learning_rate": 4.5679248405590505e-05, "loss": 0.4704, "num_input_tokens_seen": 29522416, "step": 24330 }, { "epoch": 2.7102127185655416, "grad_norm": 0.11486456543207169, "learning_rate": 4.5676517223582785e-05, "loss": 0.4645, "num_input_tokens_seen": 29528432, "step": 24335 }, { "epoch": 2.7107695734491593, "grad_norm": 0.12958979606628418, "learning_rate": 4.567378526034998e-05, "loss": 0.4618, "num_input_tokens_seen": 29534480, "step": 24340 }, { "epoch": 2.7113264283327765, "grad_norm": 0.12850046157836914, "learning_rate": 4.5671052515995306e-05, "loss": 0.4599, "num_input_tokens_seen": 29540560, "step": 24345 }, { "epoch": 2.7118832832163937, "grad_norm": 0.11978116631507874, "learning_rate": 4.566831899062204e-05, "loss": 0.4647, "num_input_tokens_seen": 29546480, "step": 24350 }, { "epoch": 2.7124401381000114, "grad_norm": 0.1512393206357956, "learning_rate": 4.566558468433344e-05, "loss": 0.4614, "num_input_tokens_seen": 29552528, "step": 24355 }, { "epoch": 2.7129969929836286, "grad_norm": 0.2199302762746811, "learning_rate": 4.5662849597232814e-05, "loss": 0.4616, "num_input_tokens_seen": 29558896, "step": 24360 }, { "epoch": 2.713553847867246, "grad_norm": 0.17933711409568787, "learning_rate": 4.566011372942352e-05, "loss": 0.4628, "num_input_tokens_seen": 29565328, "step": 24365 }, { "epoch": 2.714110702750863, "grad_norm": 0.12400899082422256, "learning_rate": 4.565737708100893e-05, "loss": 0.457, "num_input_tokens_seen": 29571152, "step": 24370 }, { "epoch": 2.7146675576344803, "grad_norm": 0.21152199804782867, "learning_rate": 4.565463965209242e-05, "loss": 0.4546, "num_input_tokens_seen": 29577392, "step": 24375 }, { "epoch": 2.715224412518098, "grad_norm": 0.11320770531892776, "learning_rate": 4.5651901442777446e-05, "loss": 0.4771, "num_input_tokens_seen": 29583408, "step": 24380 }, { "epoch": 2.715781267401715, "grad_norm": 0.12400036305189133, "learning_rate": 4.564916245316745e-05, "loss": 0.4612, "num_input_tokens_seen": 29589520, "step": 24385 }, { "epoch": 2.7163381222853324, "grad_norm": 0.13993795216083527, "learning_rate": 4.564642268336592e-05, "loss": 0.4782, "num_input_tokens_seen": 29595760, "step": 24390 }, { "epoch": 2.7168949771689497, "grad_norm": 0.20782192051410675, "learning_rate": 4.5643682133476376e-05, "loss": 0.4693, "num_input_tokens_seen": 29601680, "step": 24395 }, { "epoch": 2.717451832052567, "grad_norm": 0.1406443566083908, "learning_rate": 4.5640940803602364e-05, "loss": 0.4572, "num_input_tokens_seen": 29607632, "step": 24400 }, { "epoch": 2.7180086869361846, "grad_norm": 0.12453583627939224, "learning_rate": 4.5638198693847466e-05, "loss": 0.475, "num_input_tokens_seen": 29613392, "step": 24405 }, { "epoch": 2.718565541819802, "grad_norm": 0.11871274560689926, "learning_rate": 4.563545580431528e-05, "loss": 0.465, "num_input_tokens_seen": 29619632, "step": 24410 }, { "epoch": 2.719122396703419, "grad_norm": 0.11833543330430984, "learning_rate": 4.563271213510945e-05, "loss": 0.4761, "num_input_tokens_seen": 29625680, "step": 24415 }, { "epoch": 2.7196792515870363, "grad_norm": 0.15333275496959686, "learning_rate": 4.562996768633362e-05, "loss": 0.4668, "num_input_tokens_seen": 29632080, "step": 24420 }, { "epoch": 2.7202361064706535, "grad_norm": 0.209160715341568, "learning_rate": 4.5627222458091514e-05, "loss": 0.4689, "num_input_tokens_seen": 29638000, "step": 24425 }, { "epoch": 2.720792961354271, "grad_norm": 0.13422563672065735, "learning_rate": 4.5624476450486834e-05, "loss": 0.4627, "num_input_tokens_seen": 29644240, "step": 24430 }, { "epoch": 2.7213498162378884, "grad_norm": 0.13864010572433472, "learning_rate": 4.5621729663623334e-05, "loss": 0.4616, "num_input_tokens_seen": 29650192, "step": 24435 }, { "epoch": 2.7219066711215056, "grad_norm": 0.23621487617492676, "learning_rate": 4.5618982097604815e-05, "loss": 0.4691, "num_input_tokens_seen": 29656592, "step": 24440 }, { "epoch": 2.7224635260051233, "grad_norm": 0.11646445095539093, "learning_rate": 4.561623375253507e-05, "loss": 0.4619, "num_input_tokens_seen": 29662640, "step": 24445 }, { "epoch": 2.7230203808887405, "grad_norm": 0.13839906454086304, "learning_rate": 4.561348462851794e-05, "loss": 0.4518, "num_input_tokens_seen": 29668816, "step": 24450 }, { "epoch": 2.7235772357723578, "grad_norm": 0.09652900695800781, "learning_rate": 4.56107347256573e-05, "loss": 0.4674, "num_input_tokens_seen": 29674928, "step": 24455 }, { "epoch": 2.724134090655975, "grad_norm": 0.16405099630355835, "learning_rate": 4.560798404405706e-05, "loss": 0.4713, "num_input_tokens_seen": 29681264, "step": 24460 }, { "epoch": 2.7246909455395922, "grad_norm": 0.09889049082994461, "learning_rate": 4.5605232583821134e-05, "loss": 0.466, "num_input_tokens_seen": 29687504, "step": 24465 }, { "epoch": 2.72524780042321, "grad_norm": 0.10772306472063065, "learning_rate": 4.5602480345053495e-05, "loss": 0.4658, "num_input_tokens_seen": 29693552, "step": 24470 }, { "epoch": 2.725804655306827, "grad_norm": 0.12717990577220917, "learning_rate": 4.559972732785812e-05, "loss": 0.4638, "num_input_tokens_seen": 29699504, "step": 24475 }, { "epoch": 2.7263615101904444, "grad_norm": 0.16539379954338074, "learning_rate": 4.5596973532339035e-05, "loss": 0.4606, "num_input_tokens_seen": 29705040, "step": 24480 }, { "epoch": 2.7269183650740616, "grad_norm": 0.1315837949514389, "learning_rate": 4.5594218958600285e-05, "loss": 0.4784, "num_input_tokens_seen": 29710928, "step": 24485 }, { "epoch": 2.727475219957679, "grad_norm": 0.12663784623146057, "learning_rate": 4.559146360674594e-05, "loss": 0.4618, "num_input_tokens_seen": 29716944, "step": 24490 }, { "epoch": 2.7280320748412965, "grad_norm": 0.12654510140419006, "learning_rate": 4.558870747688012e-05, "loss": 0.4619, "num_input_tokens_seen": 29723280, "step": 24495 }, { "epoch": 2.7285889297249137, "grad_norm": 0.10964721441268921, "learning_rate": 4.558595056910694e-05, "loss": 0.4494, "num_input_tokens_seen": 29729296, "step": 24500 }, { "epoch": 2.729145784608531, "grad_norm": 0.15660162270069122, "learning_rate": 4.5583192883530586e-05, "loss": 0.454, "num_input_tokens_seen": 29734864, "step": 24505 }, { "epoch": 2.729702639492148, "grad_norm": 0.10745882242918015, "learning_rate": 4.5580434420255235e-05, "loss": 0.4728, "num_input_tokens_seen": 29741328, "step": 24510 }, { "epoch": 2.7302594943757654, "grad_norm": 0.10803286731243134, "learning_rate": 4.5577675179385126e-05, "loss": 0.4657, "num_input_tokens_seen": 29747280, "step": 24515 }, { "epoch": 2.730816349259383, "grad_norm": 0.13472650945186615, "learning_rate": 4.55749151610245e-05, "loss": 0.4657, "num_input_tokens_seen": 29753424, "step": 24520 }, { "epoch": 2.7313732041430003, "grad_norm": 0.1377919316291809, "learning_rate": 4.5572154365277644e-05, "loss": 0.4623, "num_input_tokens_seen": 29759120, "step": 24525 }, { "epoch": 2.7319300590266176, "grad_norm": 0.17099599540233612, "learning_rate": 4.556939279224887e-05, "loss": 0.4544, "num_input_tokens_seen": 29765648, "step": 24530 }, { "epoch": 2.7324869139102352, "grad_norm": 0.1147596463561058, "learning_rate": 4.556663044204252e-05, "loss": 0.4704, "num_input_tokens_seen": 29771824, "step": 24535 }, { "epoch": 2.7330437687938525, "grad_norm": 0.1273290514945984, "learning_rate": 4.556386731476296e-05, "loss": 0.4688, "num_input_tokens_seen": 29777808, "step": 24540 }, { "epoch": 2.7336006236774697, "grad_norm": 0.13759982585906982, "learning_rate": 4.5561103410514594e-05, "loss": 0.4541, "num_input_tokens_seen": 29784144, "step": 24545 }, { "epoch": 2.734157478561087, "grad_norm": 0.15240824222564697, "learning_rate": 4.555833872940185e-05, "loss": 0.4674, "num_input_tokens_seen": 29790256, "step": 24550 }, { "epoch": 2.734714333444704, "grad_norm": 0.11802337318658829, "learning_rate": 4.555557327152919e-05, "loss": 0.4641, "num_input_tokens_seen": 29795856, "step": 24555 }, { "epoch": 2.735271188328322, "grad_norm": 0.17261847853660583, "learning_rate": 4.555280703700109e-05, "loss": 0.459, "num_input_tokens_seen": 29801776, "step": 24560 }, { "epoch": 2.735828043211939, "grad_norm": 0.13518491387367249, "learning_rate": 4.555004002592208e-05, "loss": 0.4587, "num_input_tokens_seen": 29808176, "step": 24565 }, { "epoch": 2.7363848980955563, "grad_norm": 0.1339520514011383, "learning_rate": 4.5547272238396705e-05, "loss": 0.4489, "num_input_tokens_seen": 29814672, "step": 24570 }, { "epoch": 2.7369417529791735, "grad_norm": 0.12435564398765564, "learning_rate": 4.5544503674529537e-05, "loss": 0.4563, "num_input_tokens_seen": 29820400, "step": 24575 }, { "epoch": 2.7374986078627908, "grad_norm": 0.14587152004241943, "learning_rate": 4.554173433442518e-05, "loss": 0.4559, "num_input_tokens_seen": 29826480, "step": 24580 }, { "epoch": 2.7380554627464084, "grad_norm": 0.12466646730899811, "learning_rate": 4.553896421818827e-05, "loss": 0.4647, "num_input_tokens_seen": 29832624, "step": 24585 }, { "epoch": 2.7386123176300257, "grad_norm": 0.09592657536268234, "learning_rate": 4.553619332592347e-05, "loss": 0.4573, "num_input_tokens_seen": 29838672, "step": 24590 }, { "epoch": 2.739169172513643, "grad_norm": 0.15283626317977905, "learning_rate": 4.5533421657735474e-05, "loss": 0.4618, "num_input_tokens_seen": 29844944, "step": 24595 }, { "epoch": 2.73972602739726, "grad_norm": 0.1188439354300499, "learning_rate": 4.553064921372901e-05, "loss": 0.4915, "num_input_tokens_seen": 29851376, "step": 24600 }, { "epoch": 2.7402828822808774, "grad_norm": 0.08609947562217712, "learning_rate": 4.5527875994008823e-05, "loss": 0.4803, "num_input_tokens_seen": 29857488, "step": 24605 }, { "epoch": 2.740839737164495, "grad_norm": 0.12799671292304993, "learning_rate": 4.55251019986797e-05, "loss": 0.4701, "num_input_tokens_seen": 29863504, "step": 24610 }, { "epoch": 2.7413965920481123, "grad_norm": 0.14107128977775574, "learning_rate": 4.552232722784644e-05, "loss": 0.4657, "num_input_tokens_seen": 29869584, "step": 24615 }, { "epoch": 2.7419534469317295, "grad_norm": 0.12265988439321518, "learning_rate": 4.5519551681613894e-05, "loss": 0.4633, "num_input_tokens_seen": 29875632, "step": 24620 }, { "epoch": 2.742510301815347, "grad_norm": 0.13338449597358704, "learning_rate": 4.551677536008693e-05, "loss": 0.4592, "num_input_tokens_seen": 29881904, "step": 24625 }, { "epoch": 2.7430671566989644, "grad_norm": 0.12158283591270447, "learning_rate": 4.551399826337044e-05, "loss": 0.4593, "num_input_tokens_seen": 29887824, "step": 24630 }, { "epoch": 2.7436240115825816, "grad_norm": 0.11708404123783112, "learning_rate": 4.551122039156935e-05, "loss": 0.465, "num_input_tokens_seen": 29893808, "step": 24635 }, { "epoch": 2.744180866466199, "grad_norm": 0.1042545959353447, "learning_rate": 4.5508441744788635e-05, "loss": 0.4682, "num_input_tokens_seen": 29899888, "step": 24640 }, { "epoch": 2.744737721349816, "grad_norm": 0.18243420124053955, "learning_rate": 4.550566232313326e-05, "loss": 0.4656, "num_input_tokens_seen": 29906192, "step": 24645 }, { "epoch": 2.7452945762334338, "grad_norm": 0.12882055342197418, "learning_rate": 4.5502882126708247e-05, "loss": 0.4551, "num_input_tokens_seen": 29912176, "step": 24650 }, { "epoch": 2.745851431117051, "grad_norm": 0.11278720945119858, "learning_rate": 4.5500101155618646e-05, "loss": 0.4668, "num_input_tokens_seen": 29918320, "step": 24655 }, { "epoch": 2.746408286000668, "grad_norm": 0.15212076902389526, "learning_rate": 4.549731940996953e-05, "loss": 0.4597, "num_input_tokens_seen": 29924880, "step": 24660 }, { "epoch": 2.7469651408842855, "grad_norm": 0.11778410524129868, "learning_rate": 4.5494536889866e-05, "loss": 0.4802, "num_input_tokens_seen": 29930416, "step": 24665 }, { "epoch": 2.7475219957679027, "grad_norm": 0.1320665180683136, "learning_rate": 4.5491753595413186e-05, "loss": 0.4599, "num_input_tokens_seen": 29936272, "step": 24670 }, { "epoch": 2.7480788506515204, "grad_norm": 0.14095576107501984, "learning_rate": 4.548896952671625e-05, "loss": 0.4681, "num_input_tokens_seen": 29942256, "step": 24675 }, { "epoch": 2.7486357055351376, "grad_norm": 0.12319140881299973, "learning_rate": 4.548618468388038e-05, "loss": 0.4734, "num_input_tokens_seen": 29948496, "step": 24680 }, { "epoch": 2.749192560418755, "grad_norm": 0.11973083019256592, "learning_rate": 4.548339906701081e-05, "loss": 0.469, "num_input_tokens_seen": 29954864, "step": 24685 }, { "epoch": 2.749749415302372, "grad_norm": 0.10823630541563034, "learning_rate": 4.548061267621279e-05, "loss": 0.4686, "num_input_tokens_seen": 29960688, "step": 24690 }, { "epoch": 2.7503062701859893, "grad_norm": 0.10436650365591049, "learning_rate": 4.547782551159158e-05, "loss": 0.4508, "num_input_tokens_seen": 29967088, "step": 24695 }, { "epoch": 2.750863125069607, "grad_norm": 0.10083327442407608, "learning_rate": 4.5475037573252494e-05, "loss": 0.4786, "num_input_tokens_seen": 29973328, "step": 24700 }, { "epoch": 2.751419979953224, "grad_norm": 0.1777067929506302, "learning_rate": 4.547224886130087e-05, "loss": 0.455, "num_input_tokens_seen": 29979824, "step": 24705 }, { "epoch": 2.7519768348368414, "grad_norm": 0.12601225078105927, "learning_rate": 4.546945937584208e-05, "loss": 0.4654, "num_input_tokens_seen": 29986128, "step": 24710 }, { "epoch": 2.752533689720459, "grad_norm": 0.11530538648366928, "learning_rate": 4.546666911698153e-05, "loss": 0.4707, "num_input_tokens_seen": 29992208, "step": 24715 }, { "epoch": 2.7530905446040763, "grad_norm": 0.10851096361875534, "learning_rate": 4.546387808482462e-05, "loss": 0.4606, "num_input_tokens_seen": 29998160, "step": 24720 }, { "epoch": 2.7536473994876935, "grad_norm": 0.12481027096509933, "learning_rate": 4.546108627947682e-05, "loss": 0.4663, "num_input_tokens_seen": 30004272, "step": 24725 }, { "epoch": 2.754204254371311, "grad_norm": 0.10155370831489563, "learning_rate": 4.5458293701043606e-05, "loss": 0.4588, "num_input_tokens_seen": 30010480, "step": 24730 }, { "epoch": 2.754761109254928, "grad_norm": 0.1294223517179489, "learning_rate": 4.54555003496305e-05, "loss": 0.4654, "num_input_tokens_seen": 30016496, "step": 24735 }, { "epoch": 2.7553179641385457, "grad_norm": 0.1113017350435257, "learning_rate": 4.5452706225343035e-05, "loss": 0.4698, "num_input_tokens_seen": 30022448, "step": 24740 }, { "epoch": 2.755874819022163, "grad_norm": 0.1246437355875969, "learning_rate": 4.544991132828678e-05, "loss": 0.4503, "num_input_tokens_seen": 30028784, "step": 24745 }, { "epoch": 2.75643167390578, "grad_norm": 0.10834231972694397, "learning_rate": 4.5447115658567346e-05, "loss": 0.4523, "num_input_tokens_seen": 30034768, "step": 24750 }, { "epoch": 2.7569885287893974, "grad_norm": 0.12365549802780151, "learning_rate": 4.544431921629035e-05, "loss": 0.4682, "num_input_tokens_seen": 30041264, "step": 24755 }, { "epoch": 2.7575453836730146, "grad_norm": 0.11916089057922363, "learning_rate": 4.544152200156147e-05, "loss": 0.4632, "num_input_tokens_seen": 30047536, "step": 24760 }, { "epoch": 2.7581022385566323, "grad_norm": 0.13626360893249512, "learning_rate": 4.543872401448637e-05, "loss": 0.4693, "num_input_tokens_seen": 30054032, "step": 24765 }, { "epoch": 2.7586590934402495, "grad_norm": 0.11062079668045044, "learning_rate": 4.543592525517079e-05, "loss": 0.4764, "num_input_tokens_seen": 30060016, "step": 24770 }, { "epoch": 2.7592159483238667, "grad_norm": 0.13875630497932434, "learning_rate": 4.5433125723720446e-05, "loss": 0.4769, "num_input_tokens_seen": 30066128, "step": 24775 }, { "epoch": 2.759772803207484, "grad_norm": 0.10732588171958923, "learning_rate": 4.543032542024114e-05, "loss": 0.4636, "num_input_tokens_seen": 30072304, "step": 24780 }, { "epoch": 2.760329658091101, "grad_norm": 0.12545245885849, "learning_rate": 4.542752434483867e-05, "loss": 0.4699, "num_input_tokens_seen": 30078448, "step": 24785 }, { "epoch": 2.760886512974719, "grad_norm": 0.1416396051645279, "learning_rate": 4.542472249761886e-05, "loss": 0.4703, "num_input_tokens_seen": 30084720, "step": 24790 }, { "epoch": 2.761443367858336, "grad_norm": 0.1496385931968689, "learning_rate": 4.542191987868758e-05, "loss": 0.4573, "num_input_tokens_seen": 30090896, "step": 24795 }, { "epoch": 2.7620002227419533, "grad_norm": 0.12577801942825317, "learning_rate": 4.541911648815072e-05, "loss": 0.4763, "num_input_tokens_seen": 30096816, "step": 24800 }, { "epoch": 2.762557077625571, "grad_norm": 0.11235615611076355, "learning_rate": 4.541631232611422e-05, "loss": 0.4599, "num_input_tokens_seen": 30103344, "step": 24805 }, { "epoch": 2.7631139325091882, "grad_norm": 0.13971641659736633, "learning_rate": 4.5413507392683994e-05, "loss": 0.4645, "num_input_tokens_seen": 30109424, "step": 24810 }, { "epoch": 2.7636707873928055, "grad_norm": 0.09089395403862, "learning_rate": 4.541070168796605e-05, "loss": 0.468, "num_input_tokens_seen": 30115120, "step": 24815 }, { "epoch": 2.7642276422764227, "grad_norm": 0.13687893748283386, "learning_rate": 4.540789521206639e-05, "loss": 0.4685, "num_input_tokens_seen": 30120912, "step": 24820 }, { "epoch": 2.76478449716004, "grad_norm": 0.1356520801782608, "learning_rate": 4.540508796509104e-05, "loss": 0.4749, "num_input_tokens_seen": 30126832, "step": 24825 }, { "epoch": 2.7653413520436576, "grad_norm": 0.09938295930624008, "learning_rate": 4.5402279947146075e-05, "loss": 0.4651, "num_input_tokens_seen": 30132848, "step": 24830 }, { "epoch": 2.765898206927275, "grad_norm": 0.12940360605716705, "learning_rate": 4.5399471158337595e-05, "loss": 0.4745, "num_input_tokens_seen": 30138800, "step": 24835 }, { "epoch": 2.766455061810892, "grad_norm": 0.11511076241731644, "learning_rate": 4.5396661598771716e-05, "loss": 0.4584, "num_input_tokens_seen": 30144880, "step": 24840 }, { "epoch": 2.7670119166945093, "grad_norm": 0.12419652193784714, "learning_rate": 4.5393851268554607e-05, "loss": 0.4654, "num_input_tokens_seen": 30151056, "step": 24845 }, { "epoch": 2.7675687715781265, "grad_norm": 0.12329352647066116, "learning_rate": 4.539104016779243e-05, "loss": 0.452, "num_input_tokens_seen": 30157520, "step": 24850 }, { "epoch": 2.768125626461744, "grad_norm": 0.1561354398727417, "learning_rate": 4.538822829659142e-05, "loss": 0.4618, "num_input_tokens_seen": 30163824, "step": 24855 }, { "epoch": 2.7686824813453614, "grad_norm": 0.20497262477874756, "learning_rate": 4.5385415655057804e-05, "loss": 0.4681, "num_input_tokens_seen": 30169744, "step": 24860 }, { "epoch": 2.7692393362289787, "grad_norm": 0.12912949919700623, "learning_rate": 4.5382602243297854e-05, "loss": 0.4668, "num_input_tokens_seen": 30175920, "step": 24865 }, { "epoch": 2.769796191112596, "grad_norm": 0.14519798755645752, "learning_rate": 4.537978806141788e-05, "loss": 0.4582, "num_input_tokens_seen": 30182128, "step": 24870 }, { "epoch": 2.770353045996213, "grad_norm": 0.18136276304721832, "learning_rate": 4.5376973109524195e-05, "loss": 0.4682, "num_input_tokens_seen": 30187984, "step": 24875 }, { "epoch": 2.770909900879831, "grad_norm": 0.1643732488155365, "learning_rate": 4.537415738772317e-05, "loss": 0.4835, "num_input_tokens_seen": 30193296, "step": 24880 }, { "epoch": 2.771466755763448, "grad_norm": 0.12734904885292053, "learning_rate": 4.537134089612118e-05, "loss": 0.4683, "num_input_tokens_seen": 30198832, "step": 24885 }, { "epoch": 2.7720236106470653, "grad_norm": 0.16550810635089874, "learning_rate": 4.536852363482465e-05, "loss": 0.4692, "num_input_tokens_seen": 30204848, "step": 24890 }, { "epoch": 2.772580465530683, "grad_norm": 0.10908549278974533, "learning_rate": 4.5365705603940026e-05, "loss": 0.4562, "num_input_tokens_seen": 30210672, "step": 24895 }, { "epoch": 2.7731373204143, "grad_norm": 0.12492534518241882, "learning_rate": 4.536288680357378e-05, "loss": 0.4466, "num_input_tokens_seen": 30216752, "step": 24900 }, { "epoch": 2.7736941752979174, "grad_norm": 0.14294539391994476, "learning_rate": 4.5360067233832406e-05, "loss": 0.4513, "num_input_tokens_seen": 30223120, "step": 24905 }, { "epoch": 2.7742510301815346, "grad_norm": 0.11835575848817825, "learning_rate": 4.535724689482245e-05, "loss": 0.462, "num_input_tokens_seen": 30228880, "step": 24910 }, { "epoch": 2.774807885065152, "grad_norm": 0.13103364408016205, "learning_rate": 4.535442578665047e-05, "loss": 0.4778, "num_input_tokens_seen": 30235120, "step": 24915 }, { "epoch": 2.7753647399487695, "grad_norm": 0.13786937296390533, "learning_rate": 4.535160390942306e-05, "loss": 0.4802, "num_input_tokens_seen": 30241520, "step": 24920 }, { "epoch": 2.7759215948323868, "grad_norm": 0.14253702759742737, "learning_rate": 4.534878126324682e-05, "loss": 0.4641, "num_input_tokens_seen": 30247824, "step": 24925 }, { "epoch": 2.776478449716004, "grad_norm": 0.09671767055988312, "learning_rate": 4.534595784822843e-05, "loss": 0.4716, "num_input_tokens_seen": 30253616, "step": 24930 }, { "epoch": 2.7770353045996212, "grad_norm": 0.13045181334018707, "learning_rate": 4.534313366447454e-05, "loss": 0.4714, "num_input_tokens_seen": 30259504, "step": 24935 }, { "epoch": 2.7775921594832385, "grad_norm": 0.11926951259374619, "learning_rate": 4.534030871209186e-05, "loss": 0.4697, "num_input_tokens_seen": 30265520, "step": 24940 }, { "epoch": 2.778149014366856, "grad_norm": 0.15442883968353271, "learning_rate": 4.533748299118715e-05, "loss": 0.4625, "num_input_tokens_seen": 30271568, "step": 24945 }, { "epoch": 2.7787058692504734, "grad_norm": 0.14841943979263306, "learning_rate": 4.533465650186715e-05, "loss": 0.4679, "num_input_tokens_seen": 30277840, "step": 24950 }, { "epoch": 2.7792627241340906, "grad_norm": 0.13130749762058258, "learning_rate": 4.5331829244238655e-05, "loss": 0.4669, "num_input_tokens_seen": 30284240, "step": 24955 }, { "epoch": 2.779819579017708, "grad_norm": 0.12278430163860321, "learning_rate": 4.5329001218408505e-05, "loss": 0.4699, "num_input_tokens_seen": 30290352, "step": 24960 }, { "epoch": 2.780376433901325, "grad_norm": 0.16074830293655396, "learning_rate": 4.532617242448354e-05, "loss": 0.4628, "num_input_tokens_seen": 30296144, "step": 24965 }, { "epoch": 2.7809332887849427, "grad_norm": 0.11841700226068497, "learning_rate": 4.532334286257064e-05, "loss": 0.4589, "num_input_tokens_seen": 30302096, "step": 24970 }, { "epoch": 2.78149014366856, "grad_norm": 0.14045801758766174, "learning_rate": 4.532051253277672e-05, "loss": 0.4773, "num_input_tokens_seen": 30308528, "step": 24975 }, { "epoch": 2.782046998552177, "grad_norm": 0.12048639357089996, "learning_rate": 4.5317681435208716e-05, "loss": 0.4705, "num_input_tokens_seen": 30314608, "step": 24980 }, { "epoch": 2.782603853435795, "grad_norm": 0.15148437023162842, "learning_rate": 4.531484956997359e-05, "loss": 0.4579, "num_input_tokens_seen": 30320656, "step": 24985 }, { "epoch": 2.783160708319412, "grad_norm": 0.14046546816825867, "learning_rate": 4.531201693717835e-05, "loss": 0.463, "num_input_tokens_seen": 30326576, "step": 24990 }, { "epoch": 2.7837175632030293, "grad_norm": 0.12882299721240997, "learning_rate": 4.5309183536930014e-05, "loss": 0.4692, "num_input_tokens_seen": 30332752, "step": 24995 }, { "epoch": 2.7842744180866466, "grad_norm": 0.1246546283364296, "learning_rate": 4.530634936933564e-05, "loss": 0.4553, "num_input_tokens_seen": 30338896, "step": 25000 }, { "epoch": 2.784831272970264, "grad_norm": 0.135890394449234, "learning_rate": 4.530351443450232e-05, "loss": 0.4592, "num_input_tokens_seen": 30345008, "step": 25005 }, { "epoch": 2.7853881278538815, "grad_norm": 0.12472202628850937, "learning_rate": 4.530067873253715e-05, "loss": 0.4665, "num_input_tokens_seen": 30351184, "step": 25010 }, { "epoch": 2.7859449827374987, "grad_norm": 0.13656707108020782, "learning_rate": 4.529784226354729e-05, "loss": 0.4602, "num_input_tokens_seen": 30357168, "step": 25015 }, { "epoch": 2.786501837621116, "grad_norm": 0.13231410086154938, "learning_rate": 4.529500502763989e-05, "loss": 0.4533, "num_input_tokens_seen": 30363280, "step": 25020 }, { "epoch": 2.787058692504733, "grad_norm": 0.1310773491859436, "learning_rate": 4.529216702492217e-05, "loss": 0.4579, "num_input_tokens_seen": 30369648, "step": 25025 }, { "epoch": 2.7876155473883504, "grad_norm": 0.10101402550935745, "learning_rate": 4.5289328255501346e-05, "loss": 0.4753, "num_input_tokens_seen": 30374960, "step": 25030 }, { "epoch": 2.788172402271968, "grad_norm": 0.116840660572052, "learning_rate": 4.5286488719484674e-05, "loss": 0.4631, "num_input_tokens_seen": 30380976, "step": 25035 }, { "epoch": 2.7887292571555853, "grad_norm": 0.14204025268554688, "learning_rate": 4.5283648416979466e-05, "loss": 0.4776, "num_input_tokens_seen": 30386832, "step": 25040 }, { "epoch": 2.7892861120392025, "grad_norm": 0.1643020212650299, "learning_rate": 4.5280807348093e-05, "loss": 0.4755, "num_input_tokens_seen": 30393040, "step": 25045 }, { "epoch": 2.78984296692282, "grad_norm": 0.10796207189559937, "learning_rate": 4.527796551293265e-05, "loss": 0.4575, "num_input_tokens_seen": 30398928, "step": 25050 }, { "epoch": 2.790399821806437, "grad_norm": 0.12760981917381287, "learning_rate": 4.527512291160577e-05, "loss": 0.4587, "num_input_tokens_seen": 30404976, "step": 25055 }, { "epoch": 2.7909566766900546, "grad_norm": 0.17877167463302612, "learning_rate": 4.527227954421978e-05, "loss": 0.473, "num_input_tokens_seen": 30411056, "step": 25060 }, { "epoch": 2.791513531573672, "grad_norm": 0.10048125684261322, "learning_rate": 4.52694354108821e-05, "loss": 0.4668, "num_input_tokens_seen": 30417296, "step": 25065 }, { "epoch": 2.792070386457289, "grad_norm": 0.126734659075737, "learning_rate": 4.52665905117002e-05, "loss": 0.4452, "num_input_tokens_seen": 30423216, "step": 25070 }, { "epoch": 2.792627241340907, "grad_norm": 0.15550467371940613, "learning_rate": 4.526374484678155e-05, "loss": 0.4679, "num_input_tokens_seen": 30429552, "step": 25075 }, { "epoch": 2.793184096224524, "grad_norm": 0.1449543535709381, "learning_rate": 4.526089841623369e-05, "loss": 0.4658, "num_input_tokens_seen": 30435664, "step": 25080 }, { "epoch": 2.7937409511081412, "grad_norm": 0.11339521408081055, "learning_rate": 4.525805122016416e-05, "loss": 0.4549, "num_input_tokens_seen": 30441616, "step": 25085 }, { "epoch": 2.7942978059917585, "grad_norm": 0.08563031256198883, "learning_rate": 4.525520325868053e-05, "loss": 0.488, "num_input_tokens_seen": 30447664, "step": 25090 }, { "epoch": 2.7948546608753757, "grad_norm": 0.12361734360456467, "learning_rate": 4.5252354531890404e-05, "loss": 0.4571, "num_input_tokens_seen": 30453584, "step": 25095 }, { "epoch": 2.7954115157589934, "grad_norm": 0.09751442819833755, "learning_rate": 4.524950503990143e-05, "loss": 0.4639, "num_input_tokens_seen": 30459632, "step": 25100 }, { "epoch": 2.7959683706426106, "grad_norm": 0.10875528305768967, "learning_rate": 4.5246654782821266e-05, "loss": 0.461, "num_input_tokens_seen": 30465392, "step": 25105 }, { "epoch": 2.796525225526228, "grad_norm": 0.13141030073165894, "learning_rate": 4.52438037607576e-05, "loss": 0.4579, "num_input_tokens_seen": 30471312, "step": 25110 }, { "epoch": 2.797082080409845, "grad_norm": 0.12330622225999832, "learning_rate": 4.524095197381815e-05, "loss": 0.4555, "num_input_tokens_seen": 30477488, "step": 25115 }, { "epoch": 2.7976389352934623, "grad_norm": 0.1059853658080101, "learning_rate": 4.523809942211067e-05, "loss": 0.4553, "num_input_tokens_seen": 30482864, "step": 25120 }, { "epoch": 2.79819579017708, "grad_norm": 0.10887941718101501, "learning_rate": 4.523524610574293e-05, "loss": 0.456, "num_input_tokens_seen": 30488592, "step": 25125 }, { "epoch": 2.798752645060697, "grad_norm": 0.08666539192199707, "learning_rate": 4.523239202482276e-05, "loss": 0.4777, "num_input_tokens_seen": 30494640, "step": 25130 }, { "epoch": 2.7993094999443144, "grad_norm": 0.14099401235580444, "learning_rate": 4.522953717945797e-05, "loss": 0.4471, "num_input_tokens_seen": 30500816, "step": 25135 }, { "epoch": 2.799866354827932, "grad_norm": 0.1332016885280609, "learning_rate": 4.522668156975644e-05, "loss": 0.4629, "num_input_tokens_seen": 30506224, "step": 25140 }, { "epoch": 2.800423209711549, "grad_norm": 0.13264130055904388, "learning_rate": 4.522382519582606e-05, "loss": 0.4692, "num_input_tokens_seen": 30512336, "step": 25145 }, { "epoch": 2.8009800645951666, "grad_norm": 0.18214859068393707, "learning_rate": 4.522096805777476e-05, "loss": 0.4745, "num_input_tokens_seen": 30518544, "step": 25150 }, { "epoch": 2.801536919478784, "grad_norm": 0.12782366573810577, "learning_rate": 4.521811015571048e-05, "loss": 0.4797, "num_input_tokens_seen": 30525040, "step": 25155 }, { "epoch": 2.802093774362401, "grad_norm": 0.11428103595972061, "learning_rate": 4.521525148974121e-05, "loss": 0.4586, "num_input_tokens_seen": 30531440, "step": 25160 }, { "epoch": 2.8026506292460187, "grad_norm": 0.16547317802906036, "learning_rate": 4.521239205997495e-05, "loss": 0.4663, "num_input_tokens_seen": 30537200, "step": 25165 }, { "epoch": 2.803207484129636, "grad_norm": 0.15730421245098114, "learning_rate": 4.5209531866519746e-05, "loss": 0.4654, "num_input_tokens_seen": 30543312, "step": 25170 }, { "epoch": 2.803764339013253, "grad_norm": 0.10492584109306335, "learning_rate": 4.520667090948366e-05, "loss": 0.4585, "num_input_tokens_seen": 30549424, "step": 25175 }, { "epoch": 2.8043211938968704, "grad_norm": 0.12969282269477844, "learning_rate": 4.52038091889748e-05, "loss": 0.4689, "num_input_tokens_seen": 30555760, "step": 25180 }, { "epoch": 2.8048780487804876, "grad_norm": 0.14480021595954895, "learning_rate": 4.5200946705101276e-05, "loss": 0.4675, "num_input_tokens_seen": 30562000, "step": 25185 }, { "epoch": 2.8054349036641053, "grad_norm": 0.1580754965543747, "learning_rate": 4.519808345797125e-05, "loss": 0.4635, "num_input_tokens_seen": 30567792, "step": 25190 }, { "epoch": 2.8059917585477225, "grad_norm": 0.12125666439533234, "learning_rate": 4.5195219447692904e-05, "loss": 0.4572, "num_input_tokens_seen": 30573904, "step": 25195 }, { "epoch": 2.8065486134313398, "grad_norm": 0.13261733949184418, "learning_rate": 4.519235467437445e-05, "loss": 0.4806, "num_input_tokens_seen": 30579856, "step": 25200 }, { "epoch": 2.807105468314957, "grad_norm": 0.1275329738855362, "learning_rate": 4.518948913812412e-05, "loss": 0.4665, "num_input_tokens_seen": 30586032, "step": 25205 }, { "epoch": 2.8076623231985742, "grad_norm": 0.14216578006744385, "learning_rate": 4.5186622839050196e-05, "loss": 0.4631, "num_input_tokens_seen": 30592080, "step": 25210 }, { "epoch": 2.808219178082192, "grad_norm": 0.11216452717781067, "learning_rate": 4.518375577726097e-05, "loss": 0.4491, "num_input_tokens_seen": 30598288, "step": 25215 }, { "epoch": 2.808776032965809, "grad_norm": 0.12694580852985382, "learning_rate": 4.5180887952864765e-05, "loss": 0.4634, "num_input_tokens_seen": 30604656, "step": 25220 }, { "epoch": 2.8093328878494264, "grad_norm": 0.12557174265384674, "learning_rate": 4.517801936596994e-05, "loss": 0.4646, "num_input_tokens_seen": 30610960, "step": 25225 }, { "epoch": 2.809889742733044, "grad_norm": 0.13271459937095642, "learning_rate": 4.5175150016684886e-05, "loss": 0.4645, "num_input_tokens_seen": 30616848, "step": 25230 }, { "epoch": 2.810446597616661, "grad_norm": 0.14171577990055084, "learning_rate": 4.5172279905118005e-05, "loss": 0.4613, "num_input_tokens_seen": 30622736, "step": 25235 }, { "epoch": 2.8110034525002785, "grad_norm": 0.12875744700431824, "learning_rate": 4.5169409031377746e-05, "loss": 0.4554, "num_input_tokens_seen": 30628848, "step": 25240 }, { "epoch": 2.8115603073838957, "grad_norm": 0.1296698898077011, "learning_rate": 4.5166537395572575e-05, "loss": 0.4642, "num_input_tokens_seen": 30634896, "step": 25245 }, { "epoch": 2.812117162267513, "grad_norm": 0.12845487892627716, "learning_rate": 4.5163664997810996e-05, "loss": 0.4791, "num_input_tokens_seen": 30640944, "step": 25250 }, { "epoch": 2.8126740171511306, "grad_norm": 0.10913330316543579, "learning_rate": 4.516079183820153e-05, "loss": 0.4643, "num_input_tokens_seen": 30646960, "step": 25255 }, { "epoch": 2.813230872034748, "grad_norm": 0.11119750142097473, "learning_rate": 4.515791791685275e-05, "loss": 0.4632, "num_input_tokens_seen": 30653200, "step": 25260 }, { "epoch": 2.813787726918365, "grad_norm": 0.08863379061222076, "learning_rate": 4.515504323387323e-05, "loss": 0.4561, "num_input_tokens_seen": 30658768, "step": 25265 }, { "epoch": 2.8143445818019823, "grad_norm": 0.12573949992656708, "learning_rate": 4.515216778937159e-05, "loss": 0.454, "num_input_tokens_seen": 30664944, "step": 25270 }, { "epoch": 2.8149014366855996, "grad_norm": 0.13235652446746826, "learning_rate": 4.514929158345646e-05, "loss": 0.4507, "num_input_tokens_seen": 30670512, "step": 25275 }, { "epoch": 2.8154582915692172, "grad_norm": 0.10661859810352325, "learning_rate": 4.5146414616236524e-05, "loss": 0.4607, "num_input_tokens_seen": 30676944, "step": 25280 }, { "epoch": 2.8160151464528345, "grad_norm": 0.11918636411428452, "learning_rate": 4.514353688782049e-05, "loss": 0.4661, "num_input_tokens_seen": 30683088, "step": 25285 }, { "epoch": 2.8165720013364517, "grad_norm": 0.1442418247461319, "learning_rate": 4.514065839831707e-05, "loss": 0.4688, "num_input_tokens_seen": 30689488, "step": 25290 }, { "epoch": 2.817128856220069, "grad_norm": 0.13265442848205566, "learning_rate": 4.5137779147835035e-05, "loss": 0.461, "num_input_tokens_seen": 30695344, "step": 25295 }, { "epoch": 2.817685711103686, "grad_norm": 0.13284534215927124, "learning_rate": 4.513489913648317e-05, "loss": 0.4699, "num_input_tokens_seen": 30701456, "step": 25300 }, { "epoch": 2.818242565987304, "grad_norm": 0.12624222040176392, "learning_rate": 4.513201836437029e-05, "loss": 0.4639, "num_input_tokens_seen": 30707504, "step": 25305 }, { "epoch": 2.818799420870921, "grad_norm": 0.1117556095123291, "learning_rate": 4.512913683160524e-05, "loss": 0.465, "num_input_tokens_seen": 30712816, "step": 25310 }, { "epoch": 2.8193562757545383, "grad_norm": 0.1287304162979126, "learning_rate": 4.5126254538296895e-05, "loss": 0.4676, "num_input_tokens_seen": 30719152, "step": 25315 }, { "epoch": 2.819913130638156, "grad_norm": 0.12865614891052246, "learning_rate": 4.512337148455416e-05, "loss": 0.4645, "num_input_tokens_seen": 30725520, "step": 25320 }, { "epoch": 2.8204699855217727, "grad_norm": 0.1411377191543579, "learning_rate": 4.5120487670485945e-05, "loss": 0.4639, "num_input_tokens_seen": 30731792, "step": 25325 }, { "epoch": 2.8210268404053904, "grad_norm": 0.1440371423959732, "learning_rate": 4.511760309620124e-05, "loss": 0.4478, "num_input_tokens_seen": 30738096, "step": 25330 }, { "epoch": 2.8215836952890077, "grad_norm": 0.13010387122631073, "learning_rate": 4.511471776180902e-05, "loss": 0.4781, "num_input_tokens_seen": 30744176, "step": 25335 }, { "epoch": 2.822140550172625, "grad_norm": 0.11883687973022461, "learning_rate": 4.51118316674183e-05, "loss": 0.4722, "num_input_tokens_seen": 30750416, "step": 25340 }, { "epoch": 2.8226974050562426, "grad_norm": 0.16511490941047668, "learning_rate": 4.510894481313813e-05, "loss": 0.4638, "num_input_tokens_seen": 30756592, "step": 25345 }, { "epoch": 2.82325425993986, "grad_norm": 0.117094986140728, "learning_rate": 4.510605719907758e-05, "loss": 0.4658, "num_input_tokens_seen": 30762416, "step": 25350 }, { "epoch": 2.823811114823477, "grad_norm": 0.12389863282442093, "learning_rate": 4.510316882534575e-05, "loss": 0.4715, "num_input_tokens_seen": 30768528, "step": 25355 }, { "epoch": 2.8243679697070943, "grad_norm": 0.1201426088809967, "learning_rate": 4.5100279692051785e-05, "loss": 0.4695, "num_input_tokens_seen": 30774672, "step": 25360 }, { "epoch": 2.8249248245907115, "grad_norm": 0.11392300575971603, "learning_rate": 4.509738979930484e-05, "loss": 0.4412, "num_input_tokens_seen": 30780688, "step": 25365 }, { "epoch": 2.825481679474329, "grad_norm": 0.14397050440311432, "learning_rate": 4.50944991472141e-05, "loss": 0.4627, "num_input_tokens_seen": 30786672, "step": 25370 }, { "epoch": 2.8260385343579464, "grad_norm": 0.15398132801055908, "learning_rate": 4.509160773588879e-05, "loss": 0.4733, "num_input_tokens_seen": 30793392, "step": 25375 }, { "epoch": 2.8265953892415636, "grad_norm": 0.11006668955087662, "learning_rate": 4.508871556543815e-05, "loss": 0.4756, "num_input_tokens_seen": 30798256, "step": 25380 }, { "epoch": 2.827152244125181, "grad_norm": 0.11857328563928604, "learning_rate": 4.508582263597146e-05, "loss": 0.4707, "num_input_tokens_seen": 30804304, "step": 25385 }, { "epoch": 2.827709099008798, "grad_norm": 0.1335425078868866, "learning_rate": 4.5082928947598016e-05, "loss": 0.4763, "num_input_tokens_seen": 30809936, "step": 25390 }, { "epoch": 2.8282659538924158, "grad_norm": 0.10209602862596512, "learning_rate": 4.508003450042717e-05, "loss": 0.4699, "num_input_tokens_seen": 30816496, "step": 25395 }, { "epoch": 2.828822808776033, "grad_norm": 0.10984089225530624, "learning_rate": 4.507713929456826e-05, "loss": 0.4733, "num_input_tokens_seen": 30822480, "step": 25400 }, { "epoch": 2.82937966365965, "grad_norm": 0.12729981541633606, "learning_rate": 4.507424333013069e-05, "loss": 0.4717, "num_input_tokens_seen": 30828624, "step": 25405 }, { "epoch": 2.829936518543268, "grad_norm": 0.153935968875885, "learning_rate": 4.507134660722388e-05, "loss": 0.4717, "num_input_tokens_seen": 30834672, "step": 25410 }, { "epoch": 2.830493373426885, "grad_norm": 0.16618317365646362, "learning_rate": 4.5068449125957266e-05, "loss": 0.4742, "num_input_tokens_seen": 30840624, "step": 25415 }, { "epoch": 2.8310502283105023, "grad_norm": 0.11861523240804672, "learning_rate": 4.5065550886440336e-05, "loss": 0.4775, "num_input_tokens_seen": 30846512, "step": 25420 }, { "epoch": 2.8316070831941196, "grad_norm": 0.1615140587091446, "learning_rate": 4.5062651888782595e-05, "loss": 0.454, "num_input_tokens_seen": 30852624, "step": 25425 }, { "epoch": 2.832163938077737, "grad_norm": 0.13235239684581757, "learning_rate": 4.505975213309357e-05, "loss": 0.4615, "num_input_tokens_seen": 30858448, "step": 25430 }, { "epoch": 2.8327207929613545, "grad_norm": 0.1259472519159317, "learning_rate": 4.5056851619482816e-05, "loss": 0.4645, "num_input_tokens_seen": 30864368, "step": 25435 }, { "epoch": 2.8332776478449717, "grad_norm": 0.11102338880300522, "learning_rate": 4.5053950348059935e-05, "loss": 0.4544, "num_input_tokens_seen": 30870384, "step": 25440 }, { "epoch": 2.833834502728589, "grad_norm": 0.1261064112186432, "learning_rate": 4.505104831893455e-05, "loss": 0.4709, "num_input_tokens_seen": 30876368, "step": 25445 }, { "epoch": 2.834391357612206, "grad_norm": 0.10338570177555084, "learning_rate": 4.50481455322163e-05, "loss": 0.4694, "num_input_tokens_seen": 30882576, "step": 25450 }, { "epoch": 2.8349482124958234, "grad_norm": 0.10358575731515884, "learning_rate": 4.5045241988014865e-05, "loss": 0.4695, "num_input_tokens_seen": 30888848, "step": 25455 }, { "epoch": 2.835505067379441, "grad_norm": 0.11140851676464081, "learning_rate": 4.504233768643995e-05, "loss": 0.4599, "num_input_tokens_seen": 30894192, "step": 25460 }, { "epoch": 2.8360619222630583, "grad_norm": 0.11595169454813004, "learning_rate": 4.5039432627601285e-05, "loss": 0.4657, "num_input_tokens_seen": 30900368, "step": 25465 }, { "epoch": 2.8366187771466755, "grad_norm": 0.12283104658126831, "learning_rate": 4.503652681160864e-05, "loss": 0.4509, "num_input_tokens_seen": 30906256, "step": 25470 }, { "epoch": 2.8371756320302928, "grad_norm": 0.11253098398447037, "learning_rate": 4.50336202385718e-05, "loss": 0.4713, "num_input_tokens_seen": 30912272, "step": 25475 }, { "epoch": 2.83773248691391, "grad_norm": 0.11788459122180939, "learning_rate": 4.503071290860058e-05, "loss": 0.4488, "num_input_tokens_seen": 30918544, "step": 25480 }, { "epoch": 2.8382893417975277, "grad_norm": 0.1823129653930664, "learning_rate": 4.5027804821804844e-05, "loss": 0.4699, "num_input_tokens_seen": 30925008, "step": 25485 }, { "epoch": 2.838846196681145, "grad_norm": 0.14896994829177856, "learning_rate": 4.5024895978294454e-05, "loss": 0.4608, "num_input_tokens_seen": 30930448, "step": 25490 }, { "epoch": 2.839403051564762, "grad_norm": 0.12514793872833252, "learning_rate": 4.5021986378179325e-05, "loss": 0.4643, "num_input_tokens_seen": 30936272, "step": 25495 }, { "epoch": 2.83995990644838, "grad_norm": 0.10587816685438156, "learning_rate": 4.501907602156939e-05, "loss": 0.4703, "num_input_tokens_seen": 30941968, "step": 25500 }, { "epoch": 2.840516761331997, "grad_norm": 0.18150927126407623, "learning_rate": 4.50161649085746e-05, "loss": 0.469, "num_input_tokens_seen": 30947504, "step": 25505 }, { "epoch": 2.8410736162156143, "grad_norm": 0.23381924629211426, "learning_rate": 4.501325303930496e-05, "loss": 0.4624, "num_input_tokens_seen": 30953936, "step": 25510 }, { "epoch": 2.8416304710992315, "grad_norm": 0.09963611513376236, "learning_rate": 4.5010340413870484e-05, "loss": 0.4708, "num_input_tokens_seen": 30960464, "step": 25515 }, { "epoch": 2.8421873259828487, "grad_norm": 0.12330273538827896, "learning_rate": 4.500742703238122e-05, "loss": 0.4629, "num_input_tokens_seen": 30966416, "step": 25520 }, { "epoch": 2.8427441808664664, "grad_norm": 0.10362977534532547, "learning_rate": 4.500451289494725e-05, "loss": 0.4576, "num_input_tokens_seen": 30972464, "step": 25525 }, { "epoch": 2.8433010357500836, "grad_norm": 0.11093220859766006, "learning_rate": 4.500159800167867e-05, "loss": 0.4537, "num_input_tokens_seen": 30978480, "step": 25530 }, { "epoch": 2.843857890633701, "grad_norm": 0.17180117964744568, "learning_rate": 4.499868235268562e-05, "loss": 0.4684, "num_input_tokens_seen": 30984880, "step": 25535 }, { "epoch": 2.844414745517318, "grad_norm": 0.16016650199890137, "learning_rate": 4.4995765948078263e-05, "loss": 0.4593, "num_input_tokens_seen": 30990832, "step": 25540 }, { "epoch": 2.8449716004009353, "grad_norm": 0.12057653814554214, "learning_rate": 4.4992848787966784e-05, "loss": 0.4681, "num_input_tokens_seen": 30996944, "step": 25545 }, { "epoch": 2.845528455284553, "grad_norm": 0.139963299036026, "learning_rate": 4.4989930872461415e-05, "loss": 0.4683, "num_input_tokens_seen": 31003216, "step": 25550 }, { "epoch": 2.8460853101681702, "grad_norm": 0.09969723224639893, "learning_rate": 4.498701220167239e-05, "loss": 0.469, "num_input_tokens_seen": 31009296, "step": 25555 }, { "epoch": 2.8466421650517875, "grad_norm": 0.13648755848407745, "learning_rate": 4.4984092775709995e-05, "loss": 0.4711, "num_input_tokens_seen": 31015312, "step": 25560 }, { "epoch": 2.8471990199354047, "grad_norm": 0.1137038841843605, "learning_rate": 4.498117259468454e-05, "loss": 0.4747, "num_input_tokens_seen": 31021328, "step": 25565 }, { "epoch": 2.847755874819022, "grad_norm": 0.10282310098409653, "learning_rate": 4.497825165870634e-05, "loss": 0.4596, "num_input_tokens_seen": 31027536, "step": 25570 }, { "epoch": 2.8483127297026396, "grad_norm": 0.11075964570045471, "learning_rate": 4.4975329967885774e-05, "loss": 0.4703, "num_input_tokens_seen": 31033456, "step": 25575 }, { "epoch": 2.848869584586257, "grad_norm": 0.12952353060245514, "learning_rate": 4.497240752233324e-05, "loss": 0.465, "num_input_tokens_seen": 31039408, "step": 25580 }, { "epoch": 2.849426439469874, "grad_norm": 0.16318370401859283, "learning_rate": 4.496948432215913e-05, "loss": 0.4573, "num_input_tokens_seen": 31045616, "step": 25585 }, { "epoch": 2.8499832943534917, "grad_norm": 0.2633373439311981, "learning_rate": 4.496656036747391e-05, "loss": 0.4825, "num_input_tokens_seen": 31051632, "step": 25590 }, { "epoch": 2.850540149237109, "grad_norm": 0.1358044594526291, "learning_rate": 4.496363565838805e-05, "loss": 0.4675, "num_input_tokens_seen": 31057776, "step": 25595 }, { "epoch": 2.851097004120726, "grad_norm": 0.1049642339348793, "learning_rate": 4.496071019501206e-05, "loss": 0.4609, "num_input_tokens_seen": 31064080, "step": 25600 }, { "epoch": 2.8516538590043434, "grad_norm": 0.12940673530101776, "learning_rate": 4.495778397745648e-05, "loss": 0.4669, "num_input_tokens_seen": 31070288, "step": 25605 }, { "epoch": 2.8522107138879607, "grad_norm": 0.11308260262012482, "learning_rate": 4.495485700583185e-05, "loss": 0.4602, "num_input_tokens_seen": 31076272, "step": 25610 }, { "epoch": 2.8527675687715783, "grad_norm": 0.14347176253795624, "learning_rate": 4.4951929280248784e-05, "loss": 0.449, "num_input_tokens_seen": 31082832, "step": 25615 }, { "epoch": 2.8533244236551956, "grad_norm": 0.11667188256978989, "learning_rate": 4.494900080081789e-05, "loss": 0.4635, "num_input_tokens_seen": 31088944, "step": 25620 }, { "epoch": 2.853881278538813, "grad_norm": 0.11851309984922409, "learning_rate": 4.49460715676498e-05, "loss": 0.4619, "num_input_tokens_seen": 31094384, "step": 25625 }, { "epoch": 2.85443813342243, "grad_norm": 0.11874406039714813, "learning_rate": 4.494314158085523e-05, "loss": 0.4665, "num_input_tokens_seen": 31100944, "step": 25630 }, { "epoch": 2.8549949883060473, "grad_norm": 0.1105160266160965, "learning_rate": 4.494021084054485e-05, "loss": 0.4623, "num_input_tokens_seen": 31107216, "step": 25635 }, { "epoch": 2.855551843189665, "grad_norm": 0.12641891837120056, "learning_rate": 4.49372793468294e-05, "loss": 0.4496, "num_input_tokens_seen": 31113392, "step": 25640 }, { "epoch": 2.856108698073282, "grad_norm": 0.09981735050678253, "learning_rate": 4.4934347099819644e-05, "loss": 0.4515, "num_input_tokens_seen": 31118960, "step": 25645 }, { "epoch": 2.8566655529568994, "grad_norm": 0.11230212450027466, "learning_rate": 4.4931414099626375e-05, "loss": 0.4695, "num_input_tokens_seen": 31125264, "step": 25650 }, { "epoch": 2.8572224078405166, "grad_norm": 0.11574188619852066, "learning_rate": 4.4928480346360406e-05, "loss": 0.476, "num_input_tokens_seen": 31131312, "step": 25655 }, { "epoch": 2.857779262724134, "grad_norm": 0.12299790233373642, "learning_rate": 4.492554584013259e-05, "loss": 0.4854, "num_input_tokens_seen": 31137424, "step": 25660 }, { "epoch": 2.8583361176077515, "grad_norm": 0.10403178632259369, "learning_rate": 4.492261058105379e-05, "loss": 0.4764, "num_input_tokens_seen": 31143472, "step": 25665 }, { "epoch": 2.8588929724913688, "grad_norm": 0.1739654242992401, "learning_rate": 4.4919674569234925e-05, "loss": 0.459, "num_input_tokens_seen": 31149680, "step": 25670 }, { "epoch": 2.859449827374986, "grad_norm": 0.11902841180562973, "learning_rate": 4.491673780478691e-05, "loss": 0.4696, "num_input_tokens_seen": 31155824, "step": 25675 }, { "epoch": 2.8600066822586037, "grad_norm": 0.13301749527454376, "learning_rate": 4.491380028782073e-05, "loss": 0.4567, "num_input_tokens_seen": 31161840, "step": 25680 }, { "epoch": 2.860563537142221, "grad_norm": 0.13550052046775818, "learning_rate": 4.491086201844734e-05, "loss": 0.4671, "num_input_tokens_seen": 31167728, "step": 25685 }, { "epoch": 2.861120392025838, "grad_norm": 0.19119851291179657, "learning_rate": 4.4907922996777785e-05, "loss": 0.4631, "num_input_tokens_seen": 31173776, "step": 25690 }, { "epoch": 2.8616772469094554, "grad_norm": 0.12603314220905304, "learning_rate": 4.4904983222923105e-05, "loss": 0.4552, "num_input_tokens_seen": 31179792, "step": 25695 }, { "epoch": 2.8622341017930726, "grad_norm": 0.1746147871017456, "learning_rate": 4.490204269699436e-05, "loss": 0.465, "num_input_tokens_seen": 31186000, "step": 25700 }, { "epoch": 2.8627909566766903, "grad_norm": 0.12399311363697052, "learning_rate": 4.489910141910267e-05, "loss": 0.4636, "num_input_tokens_seen": 31192112, "step": 25705 }, { "epoch": 2.8633478115603075, "grad_norm": 0.10183952003717422, "learning_rate": 4.489615938935915e-05, "loss": 0.4516, "num_input_tokens_seen": 31197968, "step": 25710 }, { "epoch": 2.8639046664439247, "grad_norm": 0.18438731133937836, "learning_rate": 4.489321660787498e-05, "loss": 0.4777, "num_input_tokens_seen": 31204272, "step": 25715 }, { "epoch": 2.864461521327542, "grad_norm": 0.11242900043725967, "learning_rate": 4.489027307476132e-05, "loss": 0.4602, "num_input_tokens_seen": 31210544, "step": 25720 }, { "epoch": 2.865018376211159, "grad_norm": 0.15932241082191467, "learning_rate": 4.4887328790129426e-05, "loss": 0.4617, "num_input_tokens_seen": 31216720, "step": 25725 }, { "epoch": 2.865575231094777, "grad_norm": 0.11659611761569977, "learning_rate": 4.4884383754090496e-05, "loss": 0.4792, "num_input_tokens_seen": 31222704, "step": 25730 }, { "epoch": 2.866132085978394, "grad_norm": 0.12269680947065353, "learning_rate": 4.488143796675584e-05, "loss": 0.4588, "num_input_tokens_seen": 31228624, "step": 25735 }, { "epoch": 2.8666889408620113, "grad_norm": 0.10880133509635925, "learning_rate": 4.4878491428236734e-05, "loss": 0.4509, "num_input_tokens_seen": 31234832, "step": 25740 }, { "epoch": 2.8672457957456285, "grad_norm": 0.12949109077453613, "learning_rate": 4.487554413864452e-05, "loss": 0.4679, "num_input_tokens_seen": 31241168, "step": 25745 }, { "epoch": 2.8678026506292458, "grad_norm": 0.10761658847332001, "learning_rate": 4.487259609809055e-05, "loss": 0.4849, "num_input_tokens_seen": 31246544, "step": 25750 }, { "epoch": 2.8683595055128634, "grad_norm": 0.15478940308094025, "learning_rate": 4.4869647306686226e-05, "loss": 0.4484, "num_input_tokens_seen": 31252912, "step": 25755 }, { "epoch": 2.8689163603964807, "grad_norm": 0.12987956404685974, "learning_rate": 4.486669776454294e-05, "loss": 0.4607, "num_input_tokens_seen": 31259152, "step": 25760 }, { "epoch": 2.869473215280098, "grad_norm": 0.12770196795463562, "learning_rate": 4.4863747471772155e-05, "loss": 0.4674, "num_input_tokens_seen": 31265200, "step": 25765 }, { "epoch": 2.8700300701637156, "grad_norm": 0.2039288729429245, "learning_rate": 4.486079642848533e-05, "loss": 0.4519, "num_input_tokens_seen": 31271248, "step": 25770 }, { "epoch": 2.870586925047333, "grad_norm": 0.10992512851953506, "learning_rate": 4.4857844634793965e-05, "loss": 0.4581, "num_input_tokens_seen": 31277424, "step": 25775 }, { "epoch": 2.87114377993095, "grad_norm": 0.14783300459384918, "learning_rate": 4.48548920908096e-05, "loss": 0.4808, "num_input_tokens_seen": 31283760, "step": 25780 }, { "epoch": 2.8717006348145673, "grad_norm": 0.15633761882781982, "learning_rate": 4.4851938796643785e-05, "loss": 0.4687, "num_input_tokens_seen": 31289776, "step": 25785 }, { "epoch": 2.8722574896981845, "grad_norm": 0.12059658020734787, "learning_rate": 4.48489847524081e-05, "loss": 0.4446, "num_input_tokens_seen": 31295664, "step": 25790 }, { "epoch": 2.872814344581802, "grad_norm": 0.13309109210968018, "learning_rate": 4.4846029958214156e-05, "loss": 0.4673, "num_input_tokens_seen": 31301584, "step": 25795 }, { "epoch": 2.8733711994654194, "grad_norm": 0.13465271890163422, "learning_rate": 4.4843074414173605e-05, "loss": 0.4621, "num_input_tokens_seen": 31307856, "step": 25800 }, { "epoch": 2.8739280543490366, "grad_norm": 0.10675827413797379, "learning_rate": 4.484011812039811e-05, "loss": 0.4747, "num_input_tokens_seen": 31313936, "step": 25805 }, { "epoch": 2.874484909232654, "grad_norm": 0.10736087709665298, "learning_rate": 4.483716107699938e-05, "loss": 0.4695, "num_input_tokens_seen": 31319856, "step": 25810 }, { "epoch": 2.875041764116271, "grad_norm": 0.09270695596933365, "learning_rate": 4.4834203284089125e-05, "loss": 0.4695, "num_input_tokens_seen": 31326128, "step": 25815 }, { "epoch": 2.875598618999889, "grad_norm": 0.16316768527030945, "learning_rate": 4.483124474177911e-05, "loss": 0.4361, "num_input_tokens_seen": 31331888, "step": 25820 }, { "epoch": 2.876155473883506, "grad_norm": 0.10693590342998505, "learning_rate": 4.4828285450181116e-05, "loss": 0.4575, "num_input_tokens_seen": 31337840, "step": 25825 }, { "epoch": 2.8767123287671232, "grad_norm": 0.11936626583337784, "learning_rate": 4.482532540940696e-05, "loss": 0.4669, "num_input_tokens_seen": 31344016, "step": 25830 }, { "epoch": 2.8772691836507405, "grad_norm": 0.14147134125232697, "learning_rate": 4.482236461956847e-05, "loss": 0.4638, "num_input_tokens_seen": 31350288, "step": 25835 }, { "epoch": 2.8778260385343577, "grad_norm": 0.14199526607990265, "learning_rate": 4.481940308077752e-05, "loss": 0.4836, "num_input_tokens_seen": 31356304, "step": 25840 }, { "epoch": 2.8783828934179754, "grad_norm": 0.15232476592063904, "learning_rate": 4.4816440793146015e-05, "loss": 0.454, "num_input_tokens_seen": 31362160, "step": 25845 }, { "epoch": 2.8789397483015926, "grad_norm": 0.1067882776260376, "learning_rate": 4.4813477756785874e-05, "loss": 0.4653, "num_input_tokens_seen": 31368176, "step": 25850 }, { "epoch": 2.87949660318521, "grad_norm": 0.1676708459854126, "learning_rate": 4.4810513971809035e-05, "loss": 0.46, "num_input_tokens_seen": 31373744, "step": 25855 }, { "epoch": 2.8800534580688275, "grad_norm": 0.16245999932289124, "learning_rate": 4.4807549438327504e-05, "loss": 0.4627, "num_input_tokens_seen": 31380016, "step": 25860 }, { "epoch": 2.8806103129524447, "grad_norm": 0.1209176778793335, "learning_rate": 4.4804584156453264e-05, "loss": 0.4624, "num_input_tokens_seen": 31385744, "step": 25865 }, { "epoch": 2.881167167836062, "grad_norm": 0.12000299990177155, "learning_rate": 4.480161812629838e-05, "loss": 0.4704, "num_input_tokens_seen": 31391856, "step": 25870 }, { "epoch": 2.881724022719679, "grad_norm": 0.13824835419654846, "learning_rate": 4.4798651347974905e-05, "loss": 0.4604, "num_input_tokens_seen": 31397936, "step": 25875 }, { "epoch": 2.8822808776032964, "grad_norm": 0.11837836354970932, "learning_rate": 4.479568382159493e-05, "loss": 0.4867, "num_input_tokens_seen": 31403792, "step": 25880 }, { "epoch": 2.882837732486914, "grad_norm": 0.12313608080148697, "learning_rate": 4.4792715547270586e-05, "loss": 0.4629, "num_input_tokens_seen": 31409936, "step": 25885 }, { "epoch": 2.8833945873705313, "grad_norm": 0.10360890626907349, "learning_rate": 4.478974652511402e-05, "loss": 0.461, "num_input_tokens_seen": 31416368, "step": 25890 }, { "epoch": 2.8839514422541486, "grad_norm": 0.10711896419525146, "learning_rate": 4.47867767552374e-05, "loss": 0.4713, "num_input_tokens_seen": 31422096, "step": 25895 }, { "epoch": 2.884508297137766, "grad_norm": 0.11835471540689468, "learning_rate": 4.478380623775296e-05, "loss": 0.4648, "num_input_tokens_seen": 31428240, "step": 25900 }, { "epoch": 2.885065152021383, "grad_norm": 0.11994102597236633, "learning_rate": 4.478083497277291e-05, "loss": 0.4739, "num_input_tokens_seen": 31434320, "step": 25905 }, { "epoch": 2.8856220069050007, "grad_norm": 0.12899255752563477, "learning_rate": 4.477786296040953e-05, "loss": 0.4632, "num_input_tokens_seen": 31440464, "step": 25910 }, { "epoch": 2.886178861788618, "grad_norm": 0.23187600076198578, "learning_rate": 4.4774890200775096e-05, "loss": 0.4763, "num_input_tokens_seen": 31446672, "step": 25915 }, { "epoch": 2.886735716672235, "grad_norm": 0.11185378581285477, "learning_rate": 4.4771916693981955e-05, "loss": 0.4699, "num_input_tokens_seen": 31452784, "step": 25920 }, { "epoch": 2.8872925715558524, "grad_norm": 0.11997166275978088, "learning_rate": 4.476894244014243e-05, "loss": 0.4618, "num_input_tokens_seen": 31458320, "step": 25925 }, { "epoch": 2.8878494264394696, "grad_norm": 0.13046136498451233, "learning_rate": 4.47659674393689e-05, "loss": 0.452, "num_input_tokens_seen": 31464304, "step": 25930 }, { "epoch": 2.8884062813230873, "grad_norm": 0.12846557796001434, "learning_rate": 4.476299169177378e-05, "loss": 0.4739, "num_input_tokens_seen": 31470576, "step": 25935 }, { "epoch": 2.8889631362067045, "grad_norm": 0.12943482398986816, "learning_rate": 4.476001519746951e-05, "loss": 0.4672, "num_input_tokens_seen": 31476624, "step": 25940 }, { "epoch": 2.8895199910903218, "grad_norm": 0.10210055857896805, "learning_rate": 4.475703795656853e-05, "loss": 0.4727, "num_input_tokens_seen": 31482800, "step": 25945 }, { "epoch": 2.8900768459739394, "grad_norm": 0.14853864908218384, "learning_rate": 4.4754059969183345e-05, "loss": 0.4587, "num_input_tokens_seen": 31488752, "step": 25950 }, { "epoch": 2.8906337008575567, "grad_norm": 0.16146479547023773, "learning_rate": 4.475108123542647e-05, "loss": 0.4715, "num_input_tokens_seen": 31494992, "step": 25955 }, { "epoch": 2.891190555741174, "grad_norm": 0.1541205793619156, "learning_rate": 4.474810175541046e-05, "loss": 0.4589, "num_input_tokens_seen": 31501456, "step": 25960 }, { "epoch": 2.891747410624791, "grad_norm": 0.1100253239274025, "learning_rate": 4.4745121529247866e-05, "loss": 0.4477, "num_input_tokens_seen": 31507664, "step": 25965 }, { "epoch": 2.8923042655084084, "grad_norm": 0.1327820122241974, "learning_rate": 4.4742140557051315e-05, "loss": 0.4726, "num_input_tokens_seen": 31513712, "step": 25970 }, { "epoch": 2.892861120392026, "grad_norm": 0.17299513518810272, "learning_rate": 4.473915883893342e-05, "loss": 0.4786, "num_input_tokens_seen": 31519216, "step": 25975 }, { "epoch": 2.8934179752756433, "grad_norm": 0.10214470326900482, "learning_rate": 4.4736176375006844e-05, "loss": 0.4704, "num_input_tokens_seen": 31525424, "step": 25980 }, { "epoch": 2.8939748301592605, "grad_norm": 0.1225745752453804, "learning_rate": 4.473319316538428e-05, "loss": 0.4579, "num_input_tokens_seen": 31531472, "step": 25985 }, { "epoch": 2.8945316850428777, "grad_norm": 0.1394176334142685, "learning_rate": 4.4730209210178445e-05, "loss": 0.467, "num_input_tokens_seen": 31537520, "step": 25990 }, { "epoch": 2.895088539926495, "grad_norm": 0.14055971801280975, "learning_rate": 4.4727224509502075e-05, "loss": 0.4648, "num_input_tokens_seen": 31543792, "step": 25995 }, { "epoch": 2.8956453948101126, "grad_norm": 0.12953142821788788, "learning_rate": 4.472423906346794e-05, "loss": 0.4716, "num_input_tokens_seen": 31549904, "step": 26000 }, { "epoch": 2.89620224969373, "grad_norm": 0.09716988354921341, "learning_rate": 4.4721252872188854e-05, "loss": 0.4671, "num_input_tokens_seen": 31555984, "step": 26005 }, { "epoch": 2.896759104577347, "grad_norm": 0.1652761697769165, "learning_rate": 4.4718265935777625e-05, "loss": 0.4658, "num_input_tokens_seen": 31562000, "step": 26010 }, { "epoch": 2.8973159594609643, "grad_norm": 0.12112893909215927, "learning_rate": 4.471527825434712e-05, "loss": 0.4661, "num_input_tokens_seen": 31568144, "step": 26015 }, { "epoch": 2.8978728143445815, "grad_norm": 0.1310916543006897, "learning_rate": 4.4712289828010226e-05, "loss": 0.4594, "num_input_tokens_seen": 31574576, "step": 26020 }, { "epoch": 2.8984296692281992, "grad_norm": 0.11363250762224197, "learning_rate": 4.470930065687985e-05, "loss": 0.4541, "num_input_tokens_seen": 31580720, "step": 26025 }, { "epoch": 2.8989865241118165, "grad_norm": 0.13281817734241486, "learning_rate": 4.470631074106893e-05, "loss": 0.4765, "num_input_tokens_seen": 31586864, "step": 26030 }, { "epoch": 2.8995433789954337, "grad_norm": 0.11355122178792953, "learning_rate": 4.4703320080690436e-05, "loss": 0.4695, "num_input_tokens_seen": 31592752, "step": 26035 }, { "epoch": 2.9001002338790514, "grad_norm": 0.17849913239479065, "learning_rate": 4.470032867585737e-05, "loss": 0.4635, "num_input_tokens_seen": 31598608, "step": 26040 }, { "epoch": 2.9006570887626686, "grad_norm": 0.1460392028093338, "learning_rate": 4.469733652668275e-05, "loss": 0.4635, "num_input_tokens_seen": 31604624, "step": 26045 }, { "epoch": 2.901213943646286, "grad_norm": 0.10657859593629837, "learning_rate": 4.4694343633279644e-05, "loss": 0.4598, "num_input_tokens_seen": 31610736, "step": 26050 }, { "epoch": 2.901770798529903, "grad_norm": 0.12847867608070374, "learning_rate": 4.4691349995761115e-05, "loss": 0.459, "num_input_tokens_seen": 31616688, "step": 26055 }, { "epoch": 2.9023276534135203, "grad_norm": 0.10873091220855713, "learning_rate": 4.468835561424029e-05, "loss": 0.4659, "num_input_tokens_seen": 31622864, "step": 26060 }, { "epoch": 2.902884508297138, "grad_norm": 0.10710429400205612, "learning_rate": 4.468536048883028e-05, "loss": 0.4616, "num_input_tokens_seen": 31628912, "step": 26065 }, { "epoch": 2.903441363180755, "grad_norm": 0.11894872784614563, "learning_rate": 4.468236461964427e-05, "loss": 0.4443, "num_input_tokens_seen": 31635088, "step": 26070 }, { "epoch": 2.9039982180643724, "grad_norm": 0.09975946694612503, "learning_rate": 4.4679368006795444e-05, "loss": 0.4535, "num_input_tokens_seen": 31641520, "step": 26075 }, { "epoch": 2.9045550729479896, "grad_norm": 0.2142878770828247, "learning_rate": 4.4676370650397036e-05, "loss": 0.4574, "num_input_tokens_seen": 31647536, "step": 26080 }, { "epoch": 2.905111927831607, "grad_norm": 0.11492470651865005, "learning_rate": 4.467337255056229e-05, "loss": 0.4637, "num_input_tokens_seen": 31653680, "step": 26085 }, { "epoch": 2.9056687827152246, "grad_norm": 0.0986594706773758, "learning_rate": 4.467037370740448e-05, "loss": 0.4787, "num_input_tokens_seen": 31660016, "step": 26090 }, { "epoch": 2.906225637598842, "grad_norm": 0.1328016072511673, "learning_rate": 4.466737412103692e-05, "loss": 0.4685, "num_input_tokens_seen": 31665744, "step": 26095 }, { "epoch": 2.906782492482459, "grad_norm": 0.14606007933616638, "learning_rate": 4.4664373791572935e-05, "loss": 0.4541, "num_input_tokens_seen": 31671664, "step": 26100 }, { "epoch": 2.9073393473660762, "grad_norm": 0.12219472974538803, "learning_rate": 4.4661372719125886e-05, "loss": 0.4649, "num_input_tokens_seen": 31677456, "step": 26105 }, { "epoch": 2.9078962022496935, "grad_norm": 0.11203787475824356, "learning_rate": 4.4658370903809177e-05, "loss": 0.4641, "num_input_tokens_seen": 31683792, "step": 26110 }, { "epoch": 2.908453057133311, "grad_norm": 0.17218619585037231, "learning_rate": 4.46553683457362e-05, "loss": 0.4612, "num_input_tokens_seen": 31690032, "step": 26115 }, { "epoch": 2.9090099120169284, "grad_norm": 0.1422099620103836, "learning_rate": 4.465236504502044e-05, "loss": 0.4717, "num_input_tokens_seen": 31695408, "step": 26120 }, { "epoch": 2.9095667669005456, "grad_norm": 0.08559779822826385, "learning_rate": 4.4649361001775333e-05, "loss": 0.4717, "num_input_tokens_seen": 31701392, "step": 26125 }, { "epoch": 2.9101236217841633, "grad_norm": 0.11023924499750137, "learning_rate": 4.4646356216114405e-05, "loss": 0.466, "num_input_tokens_seen": 31707440, "step": 26130 }, { "epoch": 2.9106804766677805, "grad_norm": 0.1233861967921257, "learning_rate": 4.464335068815117e-05, "loss": 0.4587, "num_input_tokens_seen": 31713552, "step": 26135 }, { "epoch": 2.9112373315513977, "grad_norm": 0.09887737035751343, "learning_rate": 4.464034441799921e-05, "loss": 0.4641, "num_input_tokens_seen": 31719824, "step": 26140 }, { "epoch": 2.911794186435015, "grad_norm": 0.16244889795780182, "learning_rate": 4.46373374057721e-05, "loss": 0.461, "num_input_tokens_seen": 31725328, "step": 26145 }, { "epoch": 2.912351041318632, "grad_norm": 0.13485078513622284, "learning_rate": 4.4634329651583435e-05, "loss": 0.4616, "num_input_tokens_seen": 31731536, "step": 26150 }, { "epoch": 2.91290789620225, "grad_norm": 0.14463293552398682, "learning_rate": 4.4631321155546886e-05, "loss": 0.4651, "num_input_tokens_seen": 31737168, "step": 26155 }, { "epoch": 2.913464751085867, "grad_norm": 0.11969615519046783, "learning_rate": 4.462831191777611e-05, "loss": 0.4669, "num_input_tokens_seen": 31743664, "step": 26160 }, { "epoch": 2.9140216059694843, "grad_norm": 0.1064324826002121, "learning_rate": 4.46253019383848e-05, "loss": 0.4771, "num_input_tokens_seen": 31749872, "step": 26165 }, { "epoch": 2.9145784608531016, "grad_norm": 0.12109903991222382, "learning_rate": 4.46222912174867e-05, "loss": 0.4652, "num_input_tokens_seen": 31755632, "step": 26170 }, { "epoch": 2.915135315736719, "grad_norm": 0.11591313034296036, "learning_rate": 4.461927975519555e-05, "loss": 0.4567, "num_input_tokens_seen": 31761296, "step": 26175 }, { "epoch": 2.9156921706203365, "grad_norm": 0.13652801513671875, "learning_rate": 4.461626755162514e-05, "loss": 0.4573, "num_input_tokens_seen": 31767152, "step": 26180 }, { "epoch": 2.9162490255039537, "grad_norm": 0.10363011062145233, "learning_rate": 4.461325460688928e-05, "loss": 0.4729, "num_input_tokens_seen": 31773072, "step": 26185 }, { "epoch": 2.916805880387571, "grad_norm": 0.17543038725852966, "learning_rate": 4.46102409211018e-05, "loss": 0.4559, "num_input_tokens_seen": 31779312, "step": 26190 }, { "epoch": 2.917362735271188, "grad_norm": 0.12506677210330963, "learning_rate": 4.460722649437659e-05, "loss": 0.4826, "num_input_tokens_seen": 31785488, "step": 26195 }, { "epoch": 2.9179195901548054, "grad_norm": 0.1369267851114273, "learning_rate": 4.460421132682751e-05, "loss": 0.4609, "num_input_tokens_seen": 31791696, "step": 26200 }, { "epoch": 2.918476445038423, "grad_norm": 0.12024477124214172, "learning_rate": 4.460119541856851e-05, "loss": 0.4563, "num_input_tokens_seen": 31797712, "step": 26205 }, { "epoch": 2.9190332999220403, "grad_norm": 0.11315293610095978, "learning_rate": 4.4598178769713526e-05, "loss": 0.4622, "num_input_tokens_seen": 31803984, "step": 26210 }, { "epoch": 2.9195901548056575, "grad_norm": 0.16973762214183807, "learning_rate": 4.4595161380376546e-05, "loss": 0.461, "num_input_tokens_seen": 31810288, "step": 26215 }, { "epoch": 2.920147009689275, "grad_norm": 0.10191056877374649, "learning_rate": 4.459214325067158e-05, "loss": 0.4666, "num_input_tokens_seen": 31816688, "step": 26220 }, { "epoch": 2.9207038645728924, "grad_norm": 0.1397976279258728, "learning_rate": 4.4589124380712655e-05, "loss": 0.4638, "num_input_tokens_seen": 31822928, "step": 26225 }, { "epoch": 2.9212607194565097, "grad_norm": 0.11547233909368515, "learning_rate": 4.4586104770613825e-05, "loss": 0.4581, "num_input_tokens_seen": 31828688, "step": 26230 }, { "epoch": 2.921817574340127, "grad_norm": 0.10600083321332932, "learning_rate": 4.45830844204892e-05, "loss": 0.4567, "num_input_tokens_seen": 31834960, "step": 26235 }, { "epoch": 2.922374429223744, "grad_norm": 0.1302415281534195, "learning_rate": 4.458006333045288e-05, "loss": 0.463, "num_input_tokens_seen": 31841136, "step": 26240 }, { "epoch": 2.922931284107362, "grad_norm": 0.10961388796567917, "learning_rate": 4.457704150061902e-05, "loss": 0.4724, "num_input_tokens_seen": 31847344, "step": 26245 }, { "epoch": 2.923488138990979, "grad_norm": 0.10959577560424805, "learning_rate": 4.45740189311018e-05, "loss": 0.4613, "num_input_tokens_seen": 31853136, "step": 26250 }, { "epoch": 2.9240449938745963, "grad_norm": 0.11897122114896774, "learning_rate": 4.457099562201542e-05, "loss": 0.4585, "num_input_tokens_seen": 31859536, "step": 26255 }, { "epoch": 2.9246018487582135, "grad_norm": 0.12958820164203644, "learning_rate": 4.45679715734741e-05, "loss": 0.4657, "num_input_tokens_seen": 31865456, "step": 26260 }, { "epoch": 2.9251587036418307, "grad_norm": 0.1302168071269989, "learning_rate": 4.456494678559211e-05, "loss": 0.4658, "num_input_tokens_seen": 31871312, "step": 26265 }, { "epoch": 2.9257155585254484, "grad_norm": 0.10115216672420502, "learning_rate": 4.456192125848373e-05, "loss": 0.4628, "num_input_tokens_seen": 31877392, "step": 26270 }, { "epoch": 2.9262724134090656, "grad_norm": 0.15143978595733643, "learning_rate": 4.4558894992263266e-05, "loss": 0.4424, "num_input_tokens_seen": 31883504, "step": 26275 }, { "epoch": 2.926829268292683, "grad_norm": 0.1937347650527954, "learning_rate": 4.455586798704509e-05, "loss": 0.4677, "num_input_tokens_seen": 31889680, "step": 26280 }, { "epoch": 2.9273861231763, "grad_norm": 0.13174712657928467, "learning_rate": 4.455284024294353e-05, "loss": 0.4498, "num_input_tokens_seen": 31895920, "step": 26285 }, { "epoch": 2.9279429780599173, "grad_norm": 0.1646403968334198, "learning_rate": 4.454981176007301e-05, "loss": 0.4565, "num_input_tokens_seen": 31902000, "step": 26290 }, { "epoch": 2.928499832943535, "grad_norm": 0.11843489110469818, "learning_rate": 4.454678253854796e-05, "loss": 0.4612, "num_input_tokens_seen": 31907920, "step": 26295 }, { "epoch": 2.9290566878271522, "grad_norm": 0.11993960291147232, "learning_rate": 4.4543752578482826e-05, "loss": 0.4652, "num_input_tokens_seen": 31913968, "step": 26300 }, { "epoch": 2.9296135427107695, "grad_norm": 0.11049884557723999, "learning_rate": 4.4540721879992086e-05, "loss": 0.4818, "num_input_tokens_seen": 31920112, "step": 26305 }, { "epoch": 2.930170397594387, "grad_norm": 0.185198113322258, "learning_rate": 4.453769044319025e-05, "loss": 0.4638, "num_input_tokens_seen": 31926384, "step": 26310 }, { "epoch": 2.9307272524780044, "grad_norm": 0.09157755970954895, "learning_rate": 4.453465826819185e-05, "loss": 0.4614, "num_input_tokens_seen": 31932272, "step": 26315 }, { "epoch": 2.9312841073616216, "grad_norm": 0.12628145515918732, "learning_rate": 4.4531625355111476e-05, "loss": 0.4532, "num_input_tokens_seen": 31938800, "step": 26320 }, { "epoch": 2.931840962245239, "grad_norm": 0.12561830878257751, "learning_rate": 4.452859170406369e-05, "loss": 0.4595, "num_input_tokens_seen": 31944848, "step": 26325 }, { "epoch": 2.932397817128856, "grad_norm": 0.12424767762422562, "learning_rate": 4.452555731516313e-05, "loss": 0.4769, "num_input_tokens_seen": 31951344, "step": 26330 }, { "epoch": 2.9329546720124737, "grad_norm": 0.12486737221479416, "learning_rate": 4.452252218852444e-05, "loss": 0.471, "num_input_tokens_seen": 31957584, "step": 26335 }, { "epoch": 2.933511526896091, "grad_norm": 0.11323316395282745, "learning_rate": 4.4519486324262306e-05, "loss": 0.4723, "num_input_tokens_seen": 31963824, "step": 26340 }, { "epoch": 2.934068381779708, "grad_norm": 0.12735529243946075, "learning_rate": 4.4516449722491416e-05, "loss": 0.4831, "num_input_tokens_seen": 31970000, "step": 26345 }, { "epoch": 2.9346252366633254, "grad_norm": 0.14465412497520447, "learning_rate": 4.451341238332652e-05, "loss": 0.465, "num_input_tokens_seen": 31976016, "step": 26350 }, { "epoch": 2.9351820915469427, "grad_norm": 0.13118189573287964, "learning_rate": 4.451037430688236e-05, "loss": 0.4724, "num_input_tokens_seen": 31982192, "step": 26355 }, { "epoch": 2.9357389464305603, "grad_norm": 0.19527891278266907, "learning_rate": 4.450733549327375e-05, "loss": 0.4631, "num_input_tokens_seen": 31988560, "step": 26360 }, { "epoch": 2.9362958013141776, "grad_norm": 0.1670389175415039, "learning_rate": 4.450429594261548e-05, "loss": 0.4689, "num_input_tokens_seen": 31994640, "step": 26365 }, { "epoch": 2.936852656197795, "grad_norm": 0.13963107764720917, "learning_rate": 4.4501255655022413e-05, "loss": 0.4654, "num_input_tokens_seen": 32000656, "step": 26370 }, { "epoch": 2.9374095110814125, "grad_norm": 0.20072627067565918, "learning_rate": 4.44982146306094e-05, "loss": 0.4703, "num_input_tokens_seen": 32006928, "step": 26375 }, { "epoch": 2.9379663659650292, "grad_norm": 0.10522612184286118, "learning_rate": 4.4495172869491366e-05, "loss": 0.4616, "num_input_tokens_seen": 32012560, "step": 26380 }, { "epoch": 2.938523220848647, "grad_norm": 0.1443735957145691, "learning_rate": 4.4492130371783214e-05, "loss": 0.4711, "num_input_tokens_seen": 32018320, "step": 26385 }, { "epoch": 2.939080075732264, "grad_norm": 0.10961020737886429, "learning_rate": 4.448908713759992e-05, "loss": 0.4542, "num_input_tokens_seen": 32024144, "step": 26390 }, { "epoch": 2.9396369306158814, "grad_norm": 0.14836819469928741, "learning_rate": 4.448604316705646e-05, "loss": 0.444, "num_input_tokens_seen": 32030288, "step": 26395 }, { "epoch": 2.940193785499499, "grad_norm": 0.13668787479400635, "learning_rate": 4.448299846026783e-05, "loss": 0.4427, "num_input_tokens_seen": 32036528, "step": 26400 }, { "epoch": 2.9407506403831163, "grad_norm": 0.1217251867055893, "learning_rate": 4.44799530173491e-05, "loss": 0.4601, "num_input_tokens_seen": 32042832, "step": 26405 }, { "epoch": 2.9413074952667335, "grad_norm": 0.13638024032115936, "learning_rate": 4.44769068384153e-05, "loss": 0.4703, "num_input_tokens_seen": 32048976, "step": 26410 }, { "epoch": 2.9418643501503507, "grad_norm": 0.1779947131872177, "learning_rate": 4.447385992358155e-05, "loss": 0.4611, "num_input_tokens_seen": 32055056, "step": 26415 }, { "epoch": 2.942421205033968, "grad_norm": 0.10119841992855072, "learning_rate": 4.447081227296297e-05, "loss": 0.47, "num_input_tokens_seen": 32061232, "step": 26420 }, { "epoch": 2.9429780599175857, "grad_norm": 0.11632601916790009, "learning_rate": 4.44677638866747e-05, "loss": 0.4624, "num_input_tokens_seen": 32067472, "step": 26425 }, { "epoch": 2.943534914801203, "grad_norm": 0.11797702312469482, "learning_rate": 4.446471476483192e-05, "loss": 0.4586, "num_input_tokens_seen": 32073552, "step": 26430 }, { "epoch": 2.94409176968482, "grad_norm": 0.10837674140930176, "learning_rate": 4.446166490754984e-05, "loss": 0.4653, "num_input_tokens_seen": 32079888, "step": 26435 }, { "epoch": 2.9446486245684373, "grad_norm": 0.1364128440618515, "learning_rate": 4.44586143149437e-05, "loss": 0.4765, "num_input_tokens_seen": 32086192, "step": 26440 }, { "epoch": 2.9452054794520546, "grad_norm": 0.1310478001832962, "learning_rate": 4.4455562987128745e-05, "loss": 0.4675, "num_input_tokens_seen": 32092464, "step": 26445 }, { "epoch": 2.9457623343356723, "grad_norm": 0.15493173897266388, "learning_rate": 4.445251092422028e-05, "loss": 0.4607, "num_input_tokens_seen": 32098416, "step": 26450 }, { "epoch": 2.9463191892192895, "grad_norm": 0.11912544816732407, "learning_rate": 4.44494581263336e-05, "loss": 0.4579, "num_input_tokens_seen": 32104816, "step": 26455 }, { "epoch": 2.9468760441029067, "grad_norm": 0.14465445280075073, "learning_rate": 4.4446404593584074e-05, "loss": 0.4566, "num_input_tokens_seen": 32110896, "step": 26460 }, { "epoch": 2.9474328989865244, "grad_norm": 0.130111962556839, "learning_rate": 4.444335032608706e-05, "loss": 0.4694, "num_input_tokens_seen": 32116720, "step": 26465 }, { "epoch": 2.947989753870141, "grad_norm": 0.15438584983348846, "learning_rate": 4.444029532395796e-05, "loss": 0.4667, "num_input_tokens_seen": 32123088, "step": 26470 }, { "epoch": 2.948546608753759, "grad_norm": 0.1571207195520401, "learning_rate": 4.443723958731221e-05, "loss": 0.4653, "num_input_tokens_seen": 32129008, "step": 26475 }, { "epoch": 2.949103463637376, "grad_norm": 0.13847525417804718, "learning_rate": 4.443418311626525e-05, "loss": 0.4653, "num_input_tokens_seen": 32135536, "step": 26480 }, { "epoch": 2.9496603185209933, "grad_norm": 0.12883099913597107, "learning_rate": 4.4431125910932575e-05, "loss": 0.4608, "num_input_tokens_seen": 32141744, "step": 26485 }, { "epoch": 2.950217173404611, "grad_norm": 0.1113785058259964, "learning_rate": 4.4428067971429695e-05, "loss": 0.4597, "num_input_tokens_seen": 32147600, "step": 26490 }, { "epoch": 2.950774028288228, "grad_norm": 0.140869602560997, "learning_rate": 4.442500929787215e-05, "loss": 0.4664, "num_input_tokens_seen": 32153872, "step": 26495 }, { "epoch": 2.9513308831718454, "grad_norm": 0.1314309537410736, "learning_rate": 4.442194989037549e-05, "loss": 0.4609, "num_input_tokens_seen": 32159952, "step": 26500 }, { "epoch": 2.9518877380554627, "grad_norm": 0.12783990800380707, "learning_rate": 4.441888974905534e-05, "loss": 0.46, "num_input_tokens_seen": 32166160, "step": 26505 }, { "epoch": 2.95244459293908, "grad_norm": 0.14314457774162292, "learning_rate": 4.441582887402729e-05, "loss": 0.4746, "num_input_tokens_seen": 32172144, "step": 26510 }, { "epoch": 2.9530014478226976, "grad_norm": 0.143088698387146, "learning_rate": 4.4412767265407006e-05, "loss": 0.4743, "num_input_tokens_seen": 32178064, "step": 26515 }, { "epoch": 2.953558302706315, "grad_norm": 0.13690130412578583, "learning_rate": 4.440970492331017e-05, "loss": 0.4651, "num_input_tokens_seen": 32184176, "step": 26520 }, { "epoch": 2.954115157589932, "grad_norm": 0.11943569034337997, "learning_rate": 4.440664184785248e-05, "loss": 0.4609, "num_input_tokens_seen": 32190160, "step": 26525 }, { "epoch": 2.9546720124735493, "grad_norm": 0.14223553240299225, "learning_rate": 4.440357803914966e-05, "loss": 0.4619, "num_input_tokens_seen": 32196272, "step": 26530 }, { "epoch": 2.9552288673571665, "grad_norm": 0.10062262415885925, "learning_rate": 4.4400513497317484e-05, "loss": 0.4781, "num_input_tokens_seen": 32202160, "step": 26535 }, { "epoch": 2.955785722240784, "grad_norm": 0.11295009404420853, "learning_rate": 4.439744822247174e-05, "loss": 0.4659, "num_input_tokens_seen": 32208016, "step": 26540 }, { "epoch": 2.9563425771244014, "grad_norm": 0.14217150211334229, "learning_rate": 4.439438221472824e-05, "loss": 0.4714, "num_input_tokens_seen": 32213968, "step": 26545 }, { "epoch": 2.9568994320080186, "grad_norm": 0.13264451920986176, "learning_rate": 4.439131547420283e-05, "loss": 0.4588, "num_input_tokens_seen": 32220240, "step": 26550 }, { "epoch": 2.9574562868916363, "grad_norm": 0.16738013923168182, "learning_rate": 4.438824800101137e-05, "loss": 0.4719, "num_input_tokens_seen": 32226480, "step": 26555 }, { "epoch": 2.958013141775253, "grad_norm": 0.14437814056873322, "learning_rate": 4.4385179795269774e-05, "loss": 0.4671, "num_input_tokens_seen": 32232688, "step": 26560 }, { "epoch": 2.9585699966588708, "grad_norm": 0.11810792237520218, "learning_rate": 4.438211085709396e-05, "loss": 0.459, "num_input_tokens_seen": 32238896, "step": 26565 }, { "epoch": 2.959126851542488, "grad_norm": 0.1383485049009323, "learning_rate": 4.437904118659989e-05, "loss": 0.4729, "num_input_tokens_seen": 32244752, "step": 26570 }, { "epoch": 2.9596837064261052, "grad_norm": 0.14066916704177856, "learning_rate": 4.4375970783903536e-05, "loss": 0.4566, "num_input_tokens_seen": 32251152, "step": 26575 }, { "epoch": 2.960240561309723, "grad_norm": 0.12757985293865204, "learning_rate": 4.4372899649120915e-05, "loss": 0.4606, "num_input_tokens_seen": 32256656, "step": 26580 }, { "epoch": 2.96079741619334, "grad_norm": 0.11618848890066147, "learning_rate": 4.436982778236806e-05, "loss": 0.4639, "num_input_tokens_seen": 32262416, "step": 26585 }, { "epoch": 2.9613542710769574, "grad_norm": 0.12175876647233963, "learning_rate": 4.4366755183761034e-05, "loss": 0.4652, "num_input_tokens_seen": 32268720, "step": 26590 }, { "epoch": 2.9619111259605746, "grad_norm": 0.13143101334571838, "learning_rate": 4.436368185341594e-05, "loss": 0.4592, "num_input_tokens_seen": 32274096, "step": 26595 }, { "epoch": 2.962467980844192, "grad_norm": 0.12361500412225723, "learning_rate": 4.4360607791448886e-05, "loss": 0.4637, "num_input_tokens_seen": 32280240, "step": 26600 }, { "epoch": 2.9630248357278095, "grad_norm": 0.12203158438205719, "learning_rate": 4.435753299797603e-05, "loss": 0.461, "num_input_tokens_seen": 32286128, "step": 26605 }, { "epoch": 2.9635816906114267, "grad_norm": 0.12810441851615906, "learning_rate": 4.4354457473113545e-05, "loss": 0.4622, "num_input_tokens_seen": 32292112, "step": 26610 }, { "epoch": 2.964138545495044, "grad_norm": 0.13472984731197357, "learning_rate": 4.4351381216977625e-05, "loss": 0.4664, "num_input_tokens_seen": 32298128, "step": 26615 }, { "epoch": 2.964695400378661, "grad_norm": 0.13133947551250458, "learning_rate": 4.434830422968451e-05, "loss": 0.4731, "num_input_tokens_seen": 32304400, "step": 26620 }, { "epoch": 2.9652522552622784, "grad_norm": 0.18426279723644257, "learning_rate": 4.4345226511350464e-05, "loss": 0.4586, "num_input_tokens_seen": 32310640, "step": 26625 }, { "epoch": 2.965809110145896, "grad_norm": 0.1287718266248703, "learning_rate": 4.434214806209176e-05, "loss": 0.4635, "num_input_tokens_seen": 32316720, "step": 26630 }, { "epoch": 2.9663659650295133, "grad_norm": 0.13045036792755127, "learning_rate": 4.4339068882024715e-05, "loss": 0.4564, "num_input_tokens_seen": 32322736, "step": 26635 }, { "epoch": 2.9669228199131306, "grad_norm": 0.15335097908973694, "learning_rate": 4.4335988971265675e-05, "loss": 0.4751, "num_input_tokens_seen": 32329168, "step": 26640 }, { "epoch": 2.9674796747967482, "grad_norm": 0.11249557882547379, "learning_rate": 4.433290832993101e-05, "loss": 0.4591, "num_input_tokens_seen": 32335600, "step": 26645 }, { "epoch": 2.968036529680365, "grad_norm": 0.09773195534944534, "learning_rate": 4.43298269581371e-05, "loss": 0.4619, "num_input_tokens_seen": 32341296, "step": 26650 }, { "epoch": 2.9685933845639827, "grad_norm": 0.18019945919513702, "learning_rate": 4.4326744856000394e-05, "loss": 0.4829, "num_input_tokens_seen": 32347408, "step": 26655 }, { "epoch": 2.9691502394476, "grad_norm": 0.10210088640451431, "learning_rate": 4.432366202363733e-05, "loss": 0.4773, "num_input_tokens_seen": 32353552, "step": 26660 }, { "epoch": 2.969707094331217, "grad_norm": 0.1396998018026352, "learning_rate": 4.432057846116439e-05, "loss": 0.4571, "num_input_tokens_seen": 32359504, "step": 26665 }, { "epoch": 2.970263949214835, "grad_norm": 0.18251673877239227, "learning_rate": 4.431749416869808e-05, "loss": 0.4719, "num_input_tokens_seen": 32365776, "step": 26670 }, { "epoch": 2.970820804098452, "grad_norm": 0.12988652288913727, "learning_rate": 4.431440914635493e-05, "loss": 0.4752, "num_input_tokens_seen": 32371632, "step": 26675 }, { "epoch": 2.9713776589820693, "grad_norm": 0.09233719855546951, "learning_rate": 4.4311323394251506e-05, "loss": 0.4573, "num_input_tokens_seen": 32377744, "step": 26680 }, { "epoch": 2.9719345138656865, "grad_norm": 0.164152130484581, "learning_rate": 4.4308236912504395e-05, "loss": 0.4796, "num_input_tokens_seen": 32384048, "step": 26685 }, { "epoch": 2.9724913687493038, "grad_norm": 0.1501591056585312, "learning_rate": 4.430514970123023e-05, "loss": 0.4713, "num_input_tokens_seen": 32389872, "step": 26690 }, { "epoch": 2.9730482236329214, "grad_norm": 0.14033211767673492, "learning_rate": 4.430206176054563e-05, "loss": 0.4769, "num_input_tokens_seen": 32395984, "step": 26695 }, { "epoch": 2.9736050785165387, "grad_norm": 0.12081800401210785, "learning_rate": 4.429897309056729e-05, "loss": 0.4598, "num_input_tokens_seen": 32402384, "step": 26700 }, { "epoch": 2.974161933400156, "grad_norm": 0.14511702954769135, "learning_rate": 4.42958836914119e-05, "loss": 0.4716, "num_input_tokens_seen": 32408496, "step": 26705 }, { "epoch": 2.974718788283773, "grad_norm": 0.12871387600898743, "learning_rate": 4.4292793563196176e-05, "loss": 0.4648, "num_input_tokens_seen": 32414352, "step": 26710 }, { "epoch": 2.9752756431673903, "grad_norm": 0.10846145451068878, "learning_rate": 4.4289702706036896e-05, "loss": 0.4708, "num_input_tokens_seen": 32420336, "step": 26715 }, { "epoch": 2.975832498051008, "grad_norm": 0.12333358824253082, "learning_rate": 4.428661112005083e-05, "loss": 0.4654, "num_input_tokens_seen": 32426256, "step": 26720 }, { "epoch": 2.9763893529346253, "grad_norm": 0.11654239892959595, "learning_rate": 4.4283518805354786e-05, "loss": 0.459, "num_input_tokens_seen": 32432560, "step": 26725 }, { "epoch": 2.9769462078182425, "grad_norm": 0.09769675135612488, "learning_rate": 4.42804257620656e-05, "loss": 0.4666, "num_input_tokens_seen": 32438960, "step": 26730 }, { "epoch": 2.97750306270186, "grad_norm": 0.11834637075662613, "learning_rate": 4.427733199030014e-05, "loss": 0.4719, "num_input_tokens_seen": 32445072, "step": 26735 }, { "epoch": 2.9780599175854774, "grad_norm": 0.10252122581005096, "learning_rate": 4.427423749017531e-05, "loss": 0.4678, "num_input_tokens_seen": 32451152, "step": 26740 }, { "epoch": 2.9786167724690946, "grad_norm": 0.10556042939424515, "learning_rate": 4.4271142261808016e-05, "loss": 0.4653, "num_input_tokens_seen": 32457296, "step": 26745 }, { "epoch": 2.979173627352712, "grad_norm": 0.10715270042419434, "learning_rate": 4.426804630531521e-05, "loss": 0.4612, "num_input_tokens_seen": 32463376, "step": 26750 }, { "epoch": 2.979730482236329, "grad_norm": 0.10770626366138458, "learning_rate": 4.426494962081387e-05, "loss": 0.4611, "num_input_tokens_seen": 32469744, "step": 26755 }, { "epoch": 2.9802873371199468, "grad_norm": 0.10955888777971268, "learning_rate": 4.426185220842099e-05, "loss": 0.4732, "num_input_tokens_seen": 32476336, "step": 26760 }, { "epoch": 2.980844192003564, "grad_norm": 0.09242243319749832, "learning_rate": 4.425875406825361e-05, "loss": 0.4637, "num_input_tokens_seen": 32482640, "step": 26765 }, { "epoch": 2.981401046887181, "grad_norm": 0.11459057033061981, "learning_rate": 4.425565520042878e-05, "loss": 0.4656, "num_input_tokens_seen": 32488976, "step": 26770 }, { "epoch": 2.9819579017707984, "grad_norm": 0.11041963845491409, "learning_rate": 4.4252555605063594e-05, "loss": 0.4575, "num_input_tokens_seen": 32495184, "step": 26775 }, { "epoch": 2.9825147566544157, "grad_norm": 0.1288454383611679, "learning_rate": 4.424945528227515e-05, "loss": 0.4527, "num_input_tokens_seen": 32501520, "step": 26780 }, { "epoch": 2.9830716115380334, "grad_norm": 0.1113332062959671, "learning_rate": 4.424635423218061e-05, "loss": 0.4591, "num_input_tokens_seen": 32507664, "step": 26785 }, { "epoch": 2.9836284664216506, "grad_norm": 0.12996627390384674, "learning_rate": 4.424325245489712e-05, "loss": 0.47, "num_input_tokens_seen": 32513744, "step": 26790 }, { "epoch": 2.984185321305268, "grad_norm": 0.12792068719863892, "learning_rate": 4.424014995054189e-05, "loss": 0.4648, "num_input_tokens_seen": 32519984, "step": 26795 }, { "epoch": 2.984742176188885, "grad_norm": 0.10943625867366791, "learning_rate": 4.423704671923213e-05, "loss": 0.4674, "num_input_tokens_seen": 32525808, "step": 26800 }, { "epoch": 2.9852990310725023, "grad_norm": 0.1260461062192917, "learning_rate": 4.42339427610851e-05, "loss": 0.4556, "num_input_tokens_seen": 32531952, "step": 26805 }, { "epoch": 2.98585588595612, "grad_norm": 0.13225552439689636, "learning_rate": 4.423083807621808e-05, "loss": 0.473, "num_input_tokens_seen": 32538064, "step": 26810 }, { "epoch": 2.986412740839737, "grad_norm": 0.11308062076568604, "learning_rate": 4.4227732664748365e-05, "loss": 0.451, "num_input_tokens_seen": 32544016, "step": 26815 }, { "epoch": 2.9869695957233544, "grad_norm": 0.08596338331699371, "learning_rate": 4.422462652679329e-05, "loss": 0.4644, "num_input_tokens_seen": 32550064, "step": 26820 }, { "epoch": 2.987526450606972, "grad_norm": 0.12370706349611282, "learning_rate": 4.4221519662470216e-05, "loss": 0.4636, "num_input_tokens_seen": 32556528, "step": 26825 }, { "epoch": 2.9880833054905893, "grad_norm": 0.12599077820777893, "learning_rate": 4.421841207189653e-05, "loss": 0.4604, "num_input_tokens_seen": 32562064, "step": 26830 }, { "epoch": 2.9886401603742065, "grad_norm": 0.16089804470539093, "learning_rate": 4.421530375518965e-05, "loss": 0.4692, "num_input_tokens_seen": 32568272, "step": 26835 }, { "epoch": 2.9891970152578238, "grad_norm": 0.18168573081493378, "learning_rate": 4.421219471246701e-05, "loss": 0.4591, "num_input_tokens_seen": 32574416, "step": 26840 }, { "epoch": 2.989753870141441, "grad_norm": 0.13740894198417664, "learning_rate": 4.4209084943846095e-05, "loss": 0.4788, "num_input_tokens_seen": 32580464, "step": 26845 }, { "epoch": 2.9903107250250587, "grad_norm": 0.08410661667585373, "learning_rate": 4.420597444944439e-05, "loss": 0.4653, "num_input_tokens_seen": 32586608, "step": 26850 }, { "epoch": 2.990867579908676, "grad_norm": 0.1223427876830101, "learning_rate": 4.420286322937942e-05, "loss": 0.4567, "num_input_tokens_seen": 32592560, "step": 26855 }, { "epoch": 2.991424434792293, "grad_norm": 0.11045070737600327, "learning_rate": 4.4199751283768745e-05, "loss": 0.4685, "num_input_tokens_seen": 32599056, "step": 26860 }, { "epoch": 2.9919812896759104, "grad_norm": 0.10721883922815323, "learning_rate": 4.4196638612729926e-05, "loss": 0.459, "num_input_tokens_seen": 32604944, "step": 26865 }, { "epoch": 2.9925381445595276, "grad_norm": 0.09421589225530624, "learning_rate": 4.419352521638058e-05, "loss": 0.4519, "num_input_tokens_seen": 32611024, "step": 26870 }, { "epoch": 2.9930949994431453, "grad_norm": 0.14447705447673798, "learning_rate": 4.4190411094838355e-05, "loss": 0.475, "num_input_tokens_seen": 32617232, "step": 26875 }, { "epoch": 2.9936518543267625, "grad_norm": 0.1335171014070511, "learning_rate": 4.4187296248220896e-05, "loss": 0.4606, "num_input_tokens_seen": 32623312, "step": 26880 }, { "epoch": 2.9942087092103797, "grad_norm": 0.16399239003658295, "learning_rate": 4.4184180676645895e-05, "loss": 0.4674, "num_input_tokens_seen": 32629552, "step": 26885 }, { "epoch": 2.994765564093997, "grad_norm": 0.14813555777072906, "learning_rate": 4.418106438023107e-05, "loss": 0.4593, "num_input_tokens_seen": 32635920, "step": 26890 }, { "epoch": 2.995322418977614, "grad_norm": 0.12648732960224152, "learning_rate": 4.417794735909416e-05, "loss": 0.4695, "num_input_tokens_seen": 32641968, "step": 26895 }, { "epoch": 2.995879273861232, "grad_norm": 0.12053678929805756, "learning_rate": 4.4174829613352944e-05, "loss": 0.4504, "num_input_tokens_seen": 32648048, "step": 26900 }, { "epoch": 2.996436128744849, "grad_norm": 0.1232408732175827, "learning_rate": 4.4171711143125216e-05, "loss": 0.4482, "num_input_tokens_seen": 32654352, "step": 26905 }, { "epoch": 2.9969929836284663, "grad_norm": 0.12696245312690735, "learning_rate": 4.41685919485288e-05, "loss": 0.4854, "num_input_tokens_seen": 32659856, "step": 26910 }, { "epoch": 2.997549838512084, "grad_norm": 0.1381182074546814, "learning_rate": 4.416547202968155e-05, "loss": 0.4666, "num_input_tokens_seen": 32665904, "step": 26915 }, { "epoch": 2.9981066933957012, "grad_norm": 0.18310050666332245, "learning_rate": 4.4162351386701354e-05, "loss": 0.4583, "num_input_tokens_seen": 32672240, "step": 26920 }, { "epoch": 2.9986635482793185, "grad_norm": 0.11363822966814041, "learning_rate": 4.415923001970611e-05, "loss": 0.4655, "num_input_tokens_seen": 32678448, "step": 26925 }, { "epoch": 2.9992204031629357, "grad_norm": 0.1445470154285431, "learning_rate": 4.415610792881376e-05, "loss": 0.4617, "num_input_tokens_seen": 32684592, "step": 26930 }, { "epoch": 2.999777258046553, "grad_norm": 0.11559336632490158, "learning_rate": 4.4152985114142265e-05, "loss": 0.462, "num_input_tokens_seen": 32690960, "step": 26935 }, { "epoch": 3.0003341129301706, "grad_norm": 0.12809087336063385, "learning_rate": 4.4149861575809605e-05, "loss": 0.4663, "num_input_tokens_seen": 32696128, "step": 26940 }, { "epoch": 3.0003341129301706, "eval_loss": 0.4649043679237366, "eval_runtime": 113.1313, "eval_samples_per_second": 35.278, "eval_steps_per_second": 8.822, "num_input_tokens_seen": 32696128, "step": 26940 }, { "epoch": 3.000890967813788, "grad_norm": 0.09877903759479523, "learning_rate": 4.414673731393381e-05, "loss": 0.4717, "num_input_tokens_seen": 32702144, "step": 26945 }, { "epoch": 3.001447822697405, "grad_norm": 0.1329801082611084, "learning_rate": 4.414361232863292e-05, "loss": 0.4663, "num_input_tokens_seen": 32708256, "step": 26950 }, { "epoch": 3.0020046775810223, "grad_norm": 0.09058427065610886, "learning_rate": 4.414048662002501e-05, "loss": 0.4564, "num_input_tokens_seen": 32714912, "step": 26955 }, { "epoch": 3.0025615324646395, "grad_norm": 0.1328432410955429, "learning_rate": 4.4137360188228176e-05, "loss": 0.4525, "num_input_tokens_seen": 32721088, "step": 26960 }, { "epoch": 3.003118387348257, "grad_norm": 0.12065964192152023, "learning_rate": 4.413423303336055e-05, "loss": 0.4625, "num_input_tokens_seen": 32726848, "step": 26965 }, { "epoch": 3.0036752422318744, "grad_norm": 0.13221880793571472, "learning_rate": 4.4131105155540276e-05, "loss": 0.4624, "num_input_tokens_seen": 32733088, "step": 26970 }, { "epoch": 3.0042320971154917, "grad_norm": 0.12890490889549255, "learning_rate": 4.412797655488554e-05, "loss": 0.462, "num_input_tokens_seen": 32738944, "step": 26975 }, { "epoch": 3.004788951999109, "grad_norm": 0.20194567739963531, "learning_rate": 4.412484723151455e-05, "loss": 0.4665, "num_input_tokens_seen": 32745056, "step": 26980 }, { "epoch": 3.0053458068827266, "grad_norm": 0.18284456431865692, "learning_rate": 4.412171718554555e-05, "loss": 0.4566, "num_input_tokens_seen": 32751104, "step": 26985 }, { "epoch": 3.005902661766344, "grad_norm": 0.11730578541755676, "learning_rate": 4.411858641709679e-05, "loss": 0.4733, "num_input_tokens_seen": 32757280, "step": 26990 }, { "epoch": 3.006459516649961, "grad_norm": 0.16172660887241364, "learning_rate": 4.411545492628657e-05, "loss": 0.4678, "num_input_tokens_seen": 32763808, "step": 26995 }, { "epoch": 3.0070163715335783, "grad_norm": 0.13470254838466644, "learning_rate": 4.41123227132332e-05, "loss": 0.4724, "num_input_tokens_seen": 32769920, "step": 27000 }, { "epoch": 3.0075732264171955, "grad_norm": 0.12370005995035172, "learning_rate": 4.4109189778055034e-05, "loss": 0.4679, "num_input_tokens_seen": 32775872, "step": 27005 }, { "epoch": 3.008130081300813, "grad_norm": 0.1083943322300911, "learning_rate": 4.410605612087043e-05, "loss": 0.4781, "num_input_tokens_seen": 32782016, "step": 27010 }, { "epoch": 3.0086869361844304, "grad_norm": 0.14401237666606903, "learning_rate": 4.410292174179781e-05, "loss": 0.4675, "num_input_tokens_seen": 32788128, "step": 27015 }, { "epoch": 3.0092437910680476, "grad_norm": 0.09991002827882767, "learning_rate": 4.409978664095559e-05, "loss": 0.4617, "num_input_tokens_seen": 32794016, "step": 27020 }, { "epoch": 3.009800645951665, "grad_norm": 0.1379557102918625, "learning_rate": 4.409665081846222e-05, "loss": 0.4722, "num_input_tokens_seen": 32799520, "step": 27025 }, { "epoch": 3.0103575008352825, "grad_norm": 0.1056518703699112, "learning_rate": 4.409351427443617e-05, "loss": 0.4648, "num_input_tokens_seen": 32805792, "step": 27030 }, { "epoch": 3.0109143557188998, "grad_norm": 0.12058112025260925, "learning_rate": 4.4090377008995975e-05, "loss": 0.4664, "num_input_tokens_seen": 32811648, "step": 27035 }, { "epoch": 3.011471210602517, "grad_norm": 0.11479929834604263, "learning_rate": 4.4087239022260155e-05, "loss": 0.4693, "num_input_tokens_seen": 32817152, "step": 27040 }, { "epoch": 3.012028065486134, "grad_norm": 0.15494590997695923, "learning_rate": 4.4084100314347276e-05, "loss": 0.4627, "num_input_tokens_seen": 32823616, "step": 27045 }, { "epoch": 3.0125849203697515, "grad_norm": 0.11124397069215775, "learning_rate": 4.408096088537593e-05, "loss": 0.4573, "num_input_tokens_seen": 32828640, "step": 27050 }, { "epoch": 3.013141775253369, "grad_norm": 0.13198238611221313, "learning_rate": 4.407782073546474e-05, "loss": 0.4697, "num_input_tokens_seen": 32834560, "step": 27055 }, { "epoch": 3.0136986301369864, "grad_norm": 0.1338493674993515, "learning_rate": 4.407467986473234e-05, "loss": 0.4515, "num_input_tokens_seen": 32840640, "step": 27060 }, { "epoch": 3.0142554850206036, "grad_norm": 0.15927904844284058, "learning_rate": 4.40715382732974e-05, "loss": 0.4604, "num_input_tokens_seen": 32846784, "step": 27065 }, { "epoch": 3.014812339904221, "grad_norm": 0.1295970231294632, "learning_rate": 4.406839596127863e-05, "loss": 0.4476, "num_input_tokens_seen": 32853088, "step": 27070 }, { "epoch": 3.0153691947878385, "grad_norm": 0.128839910030365, "learning_rate": 4.4065252928794756e-05, "loss": 0.4695, "num_input_tokens_seen": 32859456, "step": 27075 }, { "epoch": 3.0159260496714557, "grad_norm": 0.1274501383304596, "learning_rate": 4.4062109175964526e-05, "loss": 0.4584, "num_input_tokens_seen": 32865536, "step": 27080 }, { "epoch": 3.016482904555073, "grad_norm": 0.13887572288513184, "learning_rate": 4.405896470290673e-05, "loss": 0.4775, "num_input_tokens_seen": 32871520, "step": 27085 }, { "epoch": 3.01703975943869, "grad_norm": 0.10229749977588654, "learning_rate": 4.405581950974017e-05, "loss": 0.4681, "num_input_tokens_seen": 32877696, "step": 27090 }, { "epoch": 3.0175966143223074, "grad_norm": 0.0973740667104721, "learning_rate": 4.4052673596583674e-05, "loss": 0.461, "num_input_tokens_seen": 32883840, "step": 27095 }, { "epoch": 3.018153469205925, "grad_norm": 0.1142367422580719, "learning_rate": 4.4049526963556116e-05, "loss": 0.4687, "num_input_tokens_seen": 32889984, "step": 27100 }, { "epoch": 3.0187103240895423, "grad_norm": 0.18509528040885925, "learning_rate": 4.404637961077638e-05, "loss": 0.4645, "num_input_tokens_seen": 32896448, "step": 27105 }, { "epoch": 3.0192671789731595, "grad_norm": 0.11010678112506866, "learning_rate": 4.404323153836339e-05, "loss": 0.4714, "num_input_tokens_seen": 32902560, "step": 27110 }, { "epoch": 3.019824033856777, "grad_norm": 0.1000450998544693, "learning_rate": 4.4040082746436085e-05, "loss": 0.4698, "num_input_tokens_seen": 32908512, "step": 27115 }, { "epoch": 3.0203808887403945, "grad_norm": 0.09215803444385529, "learning_rate": 4.403693323511343e-05, "loss": 0.4594, "num_input_tokens_seen": 32914624, "step": 27120 }, { "epoch": 3.0209377436240117, "grad_norm": 0.11822067201137543, "learning_rate": 4.403378300451444e-05, "loss": 0.4534, "num_input_tokens_seen": 32920640, "step": 27125 }, { "epoch": 3.021494598507629, "grad_norm": 0.09250430017709732, "learning_rate": 4.4030632054758124e-05, "loss": 0.4596, "num_input_tokens_seen": 32926016, "step": 27130 }, { "epoch": 3.022051453391246, "grad_norm": 0.12246684730052948, "learning_rate": 4.402748038596354e-05, "loss": 0.463, "num_input_tokens_seen": 32932352, "step": 27135 }, { "epoch": 3.0226083082748634, "grad_norm": 0.09390828758478165, "learning_rate": 4.402432799824978e-05, "loss": 0.4555, "num_input_tokens_seen": 32938080, "step": 27140 }, { "epoch": 3.023165163158481, "grad_norm": 0.08429073542356491, "learning_rate": 4.402117489173594e-05, "loss": 0.4598, "num_input_tokens_seen": 32944384, "step": 27145 }, { "epoch": 3.0237220180420983, "grad_norm": 0.15100432932376862, "learning_rate": 4.401802106654115e-05, "loss": 0.4631, "num_input_tokens_seen": 32950464, "step": 27150 }, { "epoch": 3.0242788729257155, "grad_norm": 0.12287843972444534, "learning_rate": 4.401486652278458e-05, "loss": 0.485, "num_input_tokens_seen": 32955840, "step": 27155 }, { "epoch": 3.0248357278093327, "grad_norm": 0.11794155836105347, "learning_rate": 4.401171126058542e-05, "loss": 0.4642, "num_input_tokens_seen": 32961984, "step": 27160 }, { "epoch": 3.0253925826929504, "grad_norm": 0.1108173057436943, "learning_rate": 4.400855528006288e-05, "loss": 0.4446, "num_input_tokens_seen": 32968064, "step": 27165 }, { "epoch": 3.0259494375765676, "grad_norm": 0.1117543950676918, "learning_rate": 4.40053985813362e-05, "loss": 0.4696, "num_input_tokens_seen": 32974464, "step": 27170 }, { "epoch": 3.026506292460185, "grad_norm": 0.0997110977768898, "learning_rate": 4.400224116452466e-05, "loss": 0.4482, "num_input_tokens_seen": 32980704, "step": 27175 }, { "epoch": 3.027063147343802, "grad_norm": 0.08149798214435577, "learning_rate": 4.399908302974756e-05, "loss": 0.4648, "num_input_tokens_seen": 32986752, "step": 27180 }, { "epoch": 3.0276200022274193, "grad_norm": 0.1305813193321228, "learning_rate": 4.3995924177124214e-05, "loss": 0.4781, "num_input_tokens_seen": 32992768, "step": 27185 }, { "epoch": 3.028176857111037, "grad_norm": 0.11791016161441803, "learning_rate": 4.3992764606773974e-05, "loss": 0.4662, "num_input_tokens_seen": 32999008, "step": 27190 }, { "epoch": 3.0287337119946542, "grad_norm": 0.14869925379753113, "learning_rate": 4.398960431881622e-05, "loss": 0.467, "num_input_tokens_seen": 33005216, "step": 27195 }, { "epoch": 3.0292905668782715, "grad_norm": 0.14484959840774536, "learning_rate": 4.398644331337036e-05, "loss": 0.4564, "num_input_tokens_seen": 33011456, "step": 27200 }, { "epoch": 3.0298474217618887, "grad_norm": 0.10225235670804977, "learning_rate": 4.3983281590555834e-05, "loss": 0.4588, "num_input_tokens_seen": 33017088, "step": 27205 }, { "epoch": 3.0304042766455064, "grad_norm": 0.11583920568227768, "learning_rate": 4.398011915049209e-05, "loss": 0.4716, "num_input_tokens_seen": 33023072, "step": 27210 }, { "epoch": 3.0309611315291236, "grad_norm": 0.08046352863311768, "learning_rate": 4.3976955993298616e-05, "loss": 0.4736, "num_input_tokens_seen": 33029408, "step": 27215 }, { "epoch": 3.031517986412741, "grad_norm": 0.09584555774927139, "learning_rate": 4.3973792119094934e-05, "loss": 0.4752, "num_input_tokens_seen": 33035680, "step": 27220 }, { "epoch": 3.032074841296358, "grad_norm": 0.12890541553497314, "learning_rate": 4.3970627528000586e-05, "loss": 0.4626, "num_input_tokens_seen": 33041728, "step": 27225 }, { "epoch": 3.0326316961799753, "grad_norm": 0.13927164673805237, "learning_rate": 4.396746222013513e-05, "loss": 0.4637, "num_input_tokens_seen": 33047808, "step": 27230 }, { "epoch": 3.033188551063593, "grad_norm": 0.12322413921356201, "learning_rate": 4.396429619561817e-05, "loss": 0.4581, "num_input_tokens_seen": 33054208, "step": 27235 }, { "epoch": 3.03374540594721, "grad_norm": 0.13710345327854156, "learning_rate": 4.396112945456932e-05, "loss": 0.4545, "num_input_tokens_seen": 33059840, "step": 27240 }, { "epoch": 3.0343022608308274, "grad_norm": 0.11200092732906342, "learning_rate": 4.395796199710824e-05, "loss": 0.4736, "num_input_tokens_seen": 33065856, "step": 27245 }, { "epoch": 3.0348591157144447, "grad_norm": 0.1286938339471817, "learning_rate": 4.39547938233546e-05, "loss": 0.4578, "num_input_tokens_seen": 33071904, "step": 27250 }, { "epoch": 3.0354159705980623, "grad_norm": 0.10334993898868561, "learning_rate": 4.3951624933428106e-05, "loss": 0.464, "num_input_tokens_seen": 33077216, "step": 27255 }, { "epoch": 3.0359728254816796, "grad_norm": 0.0987204909324646, "learning_rate": 4.394845532744849e-05, "loss": 0.4644, "num_input_tokens_seen": 33083488, "step": 27260 }, { "epoch": 3.036529680365297, "grad_norm": 0.12139520049095154, "learning_rate": 4.394528500553551e-05, "loss": 0.468, "num_input_tokens_seen": 33089504, "step": 27265 }, { "epoch": 3.037086535248914, "grad_norm": 0.09309286624193192, "learning_rate": 4.394211396780894e-05, "loss": 0.4711, "num_input_tokens_seen": 33095808, "step": 27270 }, { "epoch": 3.0376433901325313, "grad_norm": 0.12371253967285156, "learning_rate": 4.3938942214388615e-05, "loss": 0.4568, "num_input_tokens_seen": 33102016, "step": 27275 }, { "epoch": 3.038200245016149, "grad_norm": 0.13562420010566711, "learning_rate": 4.393576974539435e-05, "loss": 0.4769, "num_input_tokens_seen": 33107904, "step": 27280 }, { "epoch": 3.038757099899766, "grad_norm": 0.09935450553894043, "learning_rate": 4.393259656094603e-05, "loss": 0.4599, "num_input_tokens_seen": 33114080, "step": 27285 }, { "epoch": 3.0393139547833834, "grad_norm": 0.10369659215211868, "learning_rate": 4.392942266116353e-05, "loss": 0.4675, "num_input_tokens_seen": 33120224, "step": 27290 }, { "epoch": 3.0398708096670006, "grad_norm": 0.10609814524650574, "learning_rate": 4.392624804616678e-05, "loss": 0.4608, "num_input_tokens_seen": 33126400, "step": 27295 }, { "epoch": 3.0404276645506183, "grad_norm": 0.12072770297527313, "learning_rate": 4.392307271607573e-05, "loss": 0.4664, "num_input_tokens_seen": 33132608, "step": 27300 }, { "epoch": 3.0409845194342355, "grad_norm": 0.12783023715019226, "learning_rate": 4.3919896671010355e-05, "loss": 0.4701, "num_input_tokens_seen": 33138720, "step": 27305 }, { "epoch": 3.0415413743178528, "grad_norm": 0.12820060551166534, "learning_rate": 4.3916719911090645e-05, "loss": 0.4621, "num_input_tokens_seen": 33144896, "step": 27310 }, { "epoch": 3.04209822920147, "grad_norm": 0.16191986203193665, "learning_rate": 4.391354243643664e-05, "loss": 0.4509, "num_input_tokens_seen": 33150432, "step": 27315 }, { "epoch": 3.0426550840850872, "grad_norm": 0.08732535690069199, "learning_rate": 4.391036424716839e-05, "loss": 0.4801, "num_input_tokens_seen": 33156096, "step": 27320 }, { "epoch": 3.043211938968705, "grad_norm": 0.12205666303634644, "learning_rate": 4.390718534340598e-05, "loss": 0.4636, "num_input_tokens_seen": 33162176, "step": 27325 }, { "epoch": 3.043768793852322, "grad_norm": 0.10839028656482697, "learning_rate": 4.390400572526951e-05, "loss": 0.4738, "num_input_tokens_seen": 33167904, "step": 27330 }, { "epoch": 3.0443256487359394, "grad_norm": 0.16087788343429565, "learning_rate": 4.390082539287912e-05, "loss": 0.4568, "num_input_tokens_seen": 33174176, "step": 27335 }, { "epoch": 3.0448825036195566, "grad_norm": 0.1708328276872635, "learning_rate": 4.389764434635498e-05, "loss": 0.4564, "num_input_tokens_seen": 33180320, "step": 27340 }, { "epoch": 3.0454393585031743, "grad_norm": 0.11830007284879684, "learning_rate": 4.3894462585817284e-05, "loss": 0.4605, "num_input_tokens_seen": 33186528, "step": 27345 }, { "epoch": 3.0459962133867915, "grad_norm": 0.11918092519044876, "learning_rate": 4.389128011138623e-05, "loss": 0.4638, "num_input_tokens_seen": 33193024, "step": 27350 }, { "epoch": 3.0465530682704087, "grad_norm": 0.13530519604682922, "learning_rate": 4.388809692318209e-05, "loss": 0.4688, "num_input_tokens_seen": 33199136, "step": 27355 }, { "epoch": 3.047109923154026, "grad_norm": 0.14838296175003052, "learning_rate": 4.3884913021325104e-05, "loss": 0.4737, "num_input_tokens_seen": 33205312, "step": 27360 }, { "epoch": 3.047666778037643, "grad_norm": 0.12801936268806458, "learning_rate": 4.388172840593559e-05, "loss": 0.4561, "num_input_tokens_seen": 33210784, "step": 27365 }, { "epoch": 3.048223632921261, "grad_norm": 0.1384408324956894, "learning_rate": 4.387854307713386e-05, "loss": 0.459, "num_input_tokens_seen": 33216352, "step": 27370 }, { "epoch": 3.048780487804878, "grad_norm": 0.11425240337848663, "learning_rate": 4.3875357035040275e-05, "loss": 0.4671, "num_input_tokens_seen": 33222624, "step": 27375 }, { "epoch": 3.0493373426884953, "grad_norm": 0.1291583776473999, "learning_rate": 4.3872170279775214e-05, "loss": 0.4765, "num_input_tokens_seen": 33228800, "step": 27380 }, { "epoch": 3.0498941975721126, "grad_norm": 0.12247724831104279, "learning_rate": 4.386898281145908e-05, "loss": 0.4656, "num_input_tokens_seen": 33234496, "step": 27385 }, { "epoch": 3.0504510524557302, "grad_norm": 0.08129449188709259, "learning_rate": 4.3865794630212306e-05, "loss": 0.4725, "num_input_tokens_seen": 33239904, "step": 27390 }, { "epoch": 3.0510079073393475, "grad_norm": 0.1408262848854065, "learning_rate": 4.386260573615536e-05, "loss": 0.458, "num_input_tokens_seen": 33246016, "step": 27395 }, { "epoch": 3.0515647622229647, "grad_norm": 0.1292048543691635, "learning_rate": 4.385941612940872e-05, "loss": 0.4586, "num_input_tokens_seen": 33252288, "step": 27400 }, { "epoch": 3.052121617106582, "grad_norm": 0.13514767587184906, "learning_rate": 4.3856225810092896e-05, "loss": 0.4581, "num_input_tokens_seen": 33258592, "step": 27405 }, { "epoch": 3.052678471990199, "grad_norm": 0.12552863359451294, "learning_rate": 4.3853034778328426e-05, "loss": 0.4645, "num_input_tokens_seen": 33264928, "step": 27410 }, { "epoch": 3.053235326873817, "grad_norm": 0.18514074385166168, "learning_rate": 4.384984303423589e-05, "loss": 0.468, "num_input_tokens_seen": 33271072, "step": 27415 }, { "epoch": 3.053792181757434, "grad_norm": 0.14741556346416473, "learning_rate": 4.384665057793589e-05, "loss": 0.4563, "num_input_tokens_seen": 33277184, "step": 27420 }, { "epoch": 3.0543490366410513, "grad_norm": 0.15188594162464142, "learning_rate": 4.384345740954901e-05, "loss": 0.4733, "num_input_tokens_seen": 33283520, "step": 27425 }, { "epoch": 3.0549058915246685, "grad_norm": 0.157225102186203, "learning_rate": 4.384026352919595e-05, "loss": 0.4732, "num_input_tokens_seen": 33289472, "step": 27430 }, { "epoch": 3.055462746408286, "grad_norm": 0.12104031443595886, "learning_rate": 4.383706893699734e-05, "loss": 0.4672, "num_input_tokens_seen": 33295392, "step": 27435 }, { "epoch": 3.0560196012919034, "grad_norm": 0.17416934669017792, "learning_rate": 4.38338736330739e-05, "loss": 0.4946, "num_input_tokens_seen": 33301504, "step": 27440 }, { "epoch": 3.0565764561755207, "grad_norm": 0.10696231573820114, "learning_rate": 4.383067761754636e-05, "loss": 0.4657, "num_input_tokens_seen": 33307680, "step": 27445 }, { "epoch": 3.057133311059138, "grad_norm": 0.13414661586284637, "learning_rate": 4.3827480890535474e-05, "loss": 0.4531, "num_input_tokens_seen": 33313824, "step": 27450 }, { "epoch": 3.057690165942755, "grad_norm": 0.13680267333984375, "learning_rate": 4.382428345216203e-05, "loss": 0.464, "num_input_tokens_seen": 33319552, "step": 27455 }, { "epoch": 3.058247020826373, "grad_norm": 0.10901638120412827, "learning_rate": 4.3821085302546825e-05, "loss": 0.4654, "num_input_tokens_seen": 33325728, "step": 27460 }, { "epoch": 3.05880387570999, "grad_norm": 0.23614254593849182, "learning_rate": 4.38178864418107e-05, "loss": 0.4605, "num_input_tokens_seen": 33331936, "step": 27465 }, { "epoch": 3.0593607305936072, "grad_norm": 0.22843393683433533, "learning_rate": 4.381468687007453e-05, "loss": 0.4653, "num_input_tokens_seen": 33337888, "step": 27470 }, { "epoch": 3.0599175854772245, "grad_norm": 0.1280616968870163, "learning_rate": 4.3811486587459186e-05, "loss": 0.4645, "num_input_tokens_seen": 33343968, "step": 27475 }, { "epoch": 3.060474440360842, "grad_norm": 0.11571593582630157, "learning_rate": 4.3808285594085596e-05, "loss": 0.4587, "num_input_tokens_seen": 33350400, "step": 27480 }, { "epoch": 3.0610312952444594, "grad_norm": 0.1511157602071762, "learning_rate": 4.380508389007471e-05, "loss": 0.4646, "num_input_tokens_seen": 33356704, "step": 27485 }, { "epoch": 3.0615881501280766, "grad_norm": 0.19667325913906097, "learning_rate": 4.380188147554748e-05, "loss": 0.4351, "num_input_tokens_seen": 33362848, "step": 27490 }, { "epoch": 3.062145005011694, "grad_norm": 0.11649462580680847, "learning_rate": 4.3798678350624916e-05, "loss": 0.456, "num_input_tokens_seen": 33369088, "step": 27495 }, { "epoch": 3.062701859895311, "grad_norm": 0.10317958891391754, "learning_rate": 4.379547451542804e-05, "loss": 0.4516, "num_input_tokens_seen": 33374976, "step": 27500 }, { "epoch": 3.0632587147789287, "grad_norm": 0.12017497420310974, "learning_rate": 4.3792269970077906e-05, "loss": 0.4687, "num_input_tokens_seen": 33381184, "step": 27505 }, { "epoch": 3.063815569662546, "grad_norm": 0.12310328334569931, "learning_rate": 4.3789064714695595e-05, "loss": 0.4463, "num_input_tokens_seen": 33387104, "step": 27510 }, { "epoch": 3.064372424546163, "grad_norm": 0.10983414947986603, "learning_rate": 4.37858587494022e-05, "loss": 0.4658, "num_input_tokens_seen": 33393568, "step": 27515 }, { "epoch": 3.0649292794297804, "grad_norm": 0.1031472310423851, "learning_rate": 4.3782652074318866e-05, "loss": 0.4642, "num_input_tokens_seen": 33399296, "step": 27520 }, { "epoch": 3.065486134313398, "grad_norm": 0.1097773015499115, "learning_rate": 4.377944468956674e-05, "loss": 0.4599, "num_input_tokens_seen": 33405184, "step": 27525 }, { "epoch": 3.0660429891970153, "grad_norm": 0.16187947988510132, "learning_rate": 4.3776236595267015e-05, "loss": 0.4711, "num_input_tokens_seen": 33411808, "step": 27530 }, { "epoch": 3.0665998440806326, "grad_norm": 0.12171077728271484, "learning_rate": 4.3773027791540895e-05, "loss": 0.4576, "num_input_tokens_seen": 33418048, "step": 27535 }, { "epoch": 3.06715669896425, "grad_norm": 0.12860500812530518, "learning_rate": 4.376981827850962e-05, "loss": 0.4528, "num_input_tokens_seen": 33424032, "step": 27540 }, { "epoch": 3.067713553847867, "grad_norm": 0.12346871197223663, "learning_rate": 4.3766608056294464e-05, "loss": 0.47, "num_input_tokens_seen": 33430240, "step": 27545 }, { "epoch": 3.0682704087314847, "grad_norm": 0.10085637122392654, "learning_rate": 4.3763397125016724e-05, "loss": 0.4609, "num_input_tokens_seen": 33436288, "step": 27550 }, { "epoch": 3.068827263615102, "grad_norm": 0.1539590060710907, "learning_rate": 4.37601854847977e-05, "loss": 0.4607, "num_input_tokens_seen": 33442336, "step": 27555 }, { "epoch": 3.069384118498719, "grad_norm": 0.11401841044425964, "learning_rate": 4.375697313575875e-05, "loss": 0.4749, "num_input_tokens_seen": 33448384, "step": 27560 }, { "epoch": 3.0699409733823364, "grad_norm": 0.17894528806209564, "learning_rate": 4.375376007802125e-05, "loss": 0.4693, "num_input_tokens_seen": 33454752, "step": 27565 }, { "epoch": 3.070497828265954, "grad_norm": 0.1294867843389511, "learning_rate": 4.375054631170659e-05, "loss": 0.462, "num_input_tokens_seen": 33461024, "step": 27570 }, { "epoch": 3.0710546831495713, "grad_norm": 0.13118593394756317, "learning_rate": 4.3747331836936205e-05, "loss": 0.4563, "num_input_tokens_seen": 33467168, "step": 27575 }, { "epoch": 3.0716115380331885, "grad_norm": 0.11382874846458435, "learning_rate": 4.374411665383154e-05, "loss": 0.4759, "num_input_tokens_seen": 33473024, "step": 27580 }, { "epoch": 3.0721683929168058, "grad_norm": 0.12831780314445496, "learning_rate": 4.374090076251408e-05, "loss": 0.4679, "num_input_tokens_seen": 33478592, "step": 27585 }, { "epoch": 3.072725247800423, "grad_norm": 0.1162397712469101, "learning_rate": 4.373768416310534e-05, "loss": 0.459, "num_input_tokens_seen": 33484672, "step": 27590 }, { "epoch": 3.0732821026840407, "grad_norm": 0.11459878087043762, "learning_rate": 4.373446685572683e-05, "loss": 0.4666, "num_input_tokens_seen": 33490432, "step": 27595 }, { "epoch": 3.073838957567658, "grad_norm": 0.14921705424785614, "learning_rate": 4.373124884050014e-05, "loss": 0.4667, "num_input_tokens_seen": 33496896, "step": 27600 }, { "epoch": 3.074395812451275, "grad_norm": 0.17049837112426758, "learning_rate": 4.372803011754683e-05, "loss": 0.464, "num_input_tokens_seen": 33503168, "step": 27605 }, { "epoch": 3.0749526673348924, "grad_norm": 0.1378580629825592, "learning_rate": 4.3724810686988524e-05, "loss": 0.4617, "num_input_tokens_seen": 33509312, "step": 27610 }, { "epoch": 3.07550952221851, "grad_norm": 0.17076829075813293, "learning_rate": 4.3721590548946865e-05, "loss": 0.4656, "num_input_tokens_seen": 33515840, "step": 27615 }, { "epoch": 3.0760663771021273, "grad_norm": 0.15466071665287018, "learning_rate": 4.3718369703543526e-05, "loss": 0.4572, "num_input_tokens_seen": 33522016, "step": 27620 }, { "epoch": 3.0766232319857445, "grad_norm": 0.1516132801771164, "learning_rate": 4.371514815090019e-05, "loss": 0.4644, "num_input_tokens_seen": 33528128, "step": 27625 }, { "epoch": 3.0771800868693617, "grad_norm": 0.15740378201007843, "learning_rate": 4.371192589113858e-05, "loss": 0.4389, "num_input_tokens_seen": 33534304, "step": 27630 }, { "epoch": 3.077736941752979, "grad_norm": 0.12691630423069, "learning_rate": 4.3708702924380455e-05, "loss": 0.4753, "num_input_tokens_seen": 33540352, "step": 27635 }, { "epoch": 3.0782937966365966, "grad_norm": 0.11127908527851105, "learning_rate": 4.370547925074756e-05, "loss": 0.462, "num_input_tokens_seen": 33546368, "step": 27640 }, { "epoch": 3.078850651520214, "grad_norm": 0.1499079465866089, "learning_rate": 4.370225487036172e-05, "loss": 0.4879, "num_input_tokens_seen": 33552288, "step": 27645 }, { "epoch": 3.079407506403831, "grad_norm": 0.11141526699066162, "learning_rate": 4.369902978334477e-05, "loss": 0.4503, "num_input_tokens_seen": 33558272, "step": 27650 }, { "epoch": 3.0799643612874483, "grad_norm": 0.14083412289619446, "learning_rate": 4.369580398981854e-05, "loss": 0.4563, "num_input_tokens_seen": 33564672, "step": 27655 }, { "epoch": 3.080521216171066, "grad_norm": 0.10140449553728104, "learning_rate": 4.3692577489904917e-05, "loss": 0.4674, "num_input_tokens_seen": 33570272, "step": 27660 }, { "epoch": 3.0810780710546832, "grad_norm": 0.17088375985622406, "learning_rate": 4.368935028372582e-05, "loss": 0.4539, "num_input_tokens_seen": 33576416, "step": 27665 }, { "epoch": 3.0816349259383005, "grad_norm": 0.12473899871110916, "learning_rate": 4.368612237140317e-05, "loss": 0.4535, "num_input_tokens_seen": 33582720, "step": 27670 }, { "epoch": 3.0821917808219177, "grad_norm": 0.1270252764225006, "learning_rate": 4.368289375305895e-05, "loss": 0.4694, "num_input_tokens_seen": 33589120, "step": 27675 }, { "epoch": 3.082748635705535, "grad_norm": 0.10583173483610153, "learning_rate": 4.3679664428815115e-05, "loss": 0.4689, "num_input_tokens_seen": 33594976, "step": 27680 }, { "epoch": 3.0833054905891526, "grad_norm": 0.10094916075468063, "learning_rate": 4.3676434398793697e-05, "loss": 0.466, "num_input_tokens_seen": 33600768, "step": 27685 }, { "epoch": 3.08386234547277, "grad_norm": 0.13030895590782166, "learning_rate": 4.3673203663116747e-05, "loss": 0.4615, "num_input_tokens_seen": 33606912, "step": 27690 }, { "epoch": 3.084419200356387, "grad_norm": 0.11876971274614334, "learning_rate": 4.366997222190631e-05, "loss": 0.4549, "num_input_tokens_seen": 33613120, "step": 27695 }, { "epoch": 3.0849760552400043, "grad_norm": 0.12646372616291046, "learning_rate": 4.36667400752845e-05, "loss": 0.4695, "num_input_tokens_seen": 33619200, "step": 27700 }, { "epoch": 3.085532910123622, "grad_norm": 0.14662522077560425, "learning_rate": 4.366350722337342e-05, "loss": 0.4732, "num_input_tokens_seen": 33625344, "step": 27705 }, { "epoch": 3.086089765007239, "grad_norm": 0.11841676384210587, "learning_rate": 4.366027366629524e-05, "loss": 0.4791, "num_input_tokens_seen": 33630816, "step": 27710 }, { "epoch": 3.0866466198908564, "grad_norm": 0.12181664258241653, "learning_rate": 4.365703940417211e-05, "loss": 0.4688, "num_input_tokens_seen": 33636896, "step": 27715 }, { "epoch": 3.0872034747744737, "grad_norm": 0.11608972400426865, "learning_rate": 4.365380443712624e-05, "loss": 0.4701, "num_input_tokens_seen": 33643104, "step": 27720 }, { "epoch": 3.0877603296580913, "grad_norm": 0.11052031069993973, "learning_rate": 4.365056876527987e-05, "loss": 0.4598, "num_input_tokens_seen": 33649344, "step": 27725 }, { "epoch": 3.0883171845417086, "grad_norm": 0.11130180954933167, "learning_rate": 4.364733238875524e-05, "loss": 0.4605, "num_input_tokens_seen": 33655360, "step": 27730 }, { "epoch": 3.088874039425326, "grad_norm": 0.11217109858989716, "learning_rate": 4.364409530767464e-05, "loss": 0.4561, "num_input_tokens_seen": 33661408, "step": 27735 }, { "epoch": 3.089430894308943, "grad_norm": 0.13538169860839844, "learning_rate": 4.364085752216036e-05, "loss": 0.4679, "num_input_tokens_seen": 33667744, "step": 27740 }, { "epoch": 3.0899877491925603, "grad_norm": 0.15231767296791077, "learning_rate": 4.363761903233475e-05, "loss": 0.4598, "num_input_tokens_seen": 33673632, "step": 27745 }, { "epoch": 3.090544604076178, "grad_norm": 0.13016153872013092, "learning_rate": 4.363437983832016e-05, "loss": 0.4596, "num_input_tokens_seen": 33679680, "step": 27750 }, { "epoch": 3.091101458959795, "grad_norm": 0.13391052186489105, "learning_rate": 4.363113994023899e-05, "loss": 0.454, "num_input_tokens_seen": 33685792, "step": 27755 }, { "epoch": 3.0916583138434124, "grad_norm": 0.1503259390592575, "learning_rate": 4.362789933821365e-05, "loss": 0.4506, "num_input_tokens_seen": 33692032, "step": 27760 }, { "epoch": 3.0922151687270296, "grad_norm": 0.11465579271316528, "learning_rate": 4.3624658032366567e-05, "loss": 0.4432, "num_input_tokens_seen": 33698240, "step": 27765 }, { "epoch": 3.092772023610647, "grad_norm": 0.10580707341432571, "learning_rate": 4.362141602282022e-05, "loss": 0.4526, "num_input_tokens_seen": 33704352, "step": 27770 }, { "epoch": 3.0933288784942645, "grad_norm": 0.15171384811401367, "learning_rate": 4.361817330969711e-05, "loss": 0.4474, "num_input_tokens_seen": 33710304, "step": 27775 }, { "epoch": 3.0938857333778818, "grad_norm": 0.17250216007232666, "learning_rate": 4.361492989311974e-05, "loss": 0.4427, "num_input_tokens_seen": 33716320, "step": 27780 }, { "epoch": 3.094442588261499, "grad_norm": 0.13588249683380127, "learning_rate": 4.3611685773210665e-05, "loss": 0.4329, "num_input_tokens_seen": 33722112, "step": 27785 }, { "epoch": 3.094999443145116, "grad_norm": 0.20361341536045074, "learning_rate": 4.360844095009245e-05, "loss": 0.4656, "num_input_tokens_seen": 33727776, "step": 27790 }, { "epoch": 3.095556298028734, "grad_norm": 0.11358320713043213, "learning_rate": 4.360519542388771e-05, "loss": 0.4464, "num_input_tokens_seen": 33733856, "step": 27795 }, { "epoch": 3.096113152912351, "grad_norm": 0.12428209185600281, "learning_rate": 4.360194919471906e-05, "loss": 0.4486, "num_input_tokens_seen": 33739776, "step": 27800 }, { "epoch": 3.0966700077959683, "grad_norm": 0.16583721339702606, "learning_rate": 4.3598702262709165e-05, "loss": 0.4566, "num_input_tokens_seen": 33745504, "step": 27805 }, { "epoch": 3.0972268626795856, "grad_norm": 0.11346624791622162, "learning_rate": 4.3595454627980687e-05, "loss": 0.4735, "num_input_tokens_seen": 33751616, "step": 27810 }, { "epoch": 3.0977837175632033, "grad_norm": 0.13376618921756744, "learning_rate": 4.359220629065634e-05, "loss": 0.4588, "num_input_tokens_seen": 33757792, "step": 27815 }, { "epoch": 3.0983405724468205, "grad_norm": 0.20147110521793365, "learning_rate": 4.3588957250858865e-05, "loss": 0.453, "num_input_tokens_seen": 33763776, "step": 27820 }, { "epoch": 3.0988974273304377, "grad_norm": 0.11320606619119644, "learning_rate": 4.358570750871102e-05, "loss": 0.4686, "num_input_tokens_seen": 33769728, "step": 27825 }, { "epoch": 3.099454282214055, "grad_norm": 0.10861380398273468, "learning_rate": 4.358245706433557e-05, "loss": 0.4673, "num_input_tokens_seen": 33775456, "step": 27830 }, { "epoch": 3.100011137097672, "grad_norm": 0.15568912029266357, "learning_rate": 4.357920591785535e-05, "loss": 0.4653, "num_input_tokens_seen": 33781568, "step": 27835 }, { "epoch": 3.10056799198129, "grad_norm": 0.09687680006027222, "learning_rate": 4.3575954069393186e-05, "loss": 0.4498, "num_input_tokens_seen": 33787552, "step": 27840 }, { "epoch": 3.101124846864907, "grad_norm": 0.1376374214887619, "learning_rate": 4.357270151907195e-05, "loss": 0.4504, "num_input_tokens_seen": 33792832, "step": 27845 }, { "epoch": 3.1016817017485243, "grad_norm": 0.10315141826868057, "learning_rate": 4.356944826701453e-05, "loss": 0.4452, "num_input_tokens_seen": 33798784, "step": 27850 }, { "epoch": 3.1022385566321415, "grad_norm": 0.14421619474887848, "learning_rate": 4.356619431334385e-05, "loss": 0.4599, "num_input_tokens_seen": 33804672, "step": 27855 }, { "epoch": 3.1027954115157588, "grad_norm": 0.12101364880800247, "learning_rate": 4.356293965818284e-05, "loss": 0.4462, "num_input_tokens_seen": 33810688, "step": 27860 }, { "epoch": 3.1033522663993764, "grad_norm": 0.11330509185791016, "learning_rate": 4.355968430165449e-05, "loss": 0.4421, "num_input_tokens_seen": 33816832, "step": 27865 }, { "epoch": 3.1039091212829937, "grad_norm": 0.1251721978187561, "learning_rate": 4.355642824388179e-05, "loss": 0.4648, "num_input_tokens_seen": 33822816, "step": 27870 }, { "epoch": 3.104465976166611, "grad_norm": 0.1177096962928772, "learning_rate": 4.3553171484987764e-05, "loss": 0.4939, "num_input_tokens_seen": 33828544, "step": 27875 }, { "epoch": 3.105022831050228, "grad_norm": 0.13240565359592438, "learning_rate": 4.3549914025095465e-05, "loss": 0.4611, "num_input_tokens_seen": 33834624, "step": 27880 }, { "epoch": 3.105579685933846, "grad_norm": 0.10888246446847916, "learning_rate": 4.3546655864327955e-05, "loss": 0.4779, "num_input_tokens_seen": 33840640, "step": 27885 }, { "epoch": 3.106136540817463, "grad_norm": 0.14684805274009705, "learning_rate": 4.354339700280836e-05, "loss": 0.4771, "num_input_tokens_seen": 33846720, "step": 27890 }, { "epoch": 3.1066933957010803, "grad_norm": 0.14240384101867676, "learning_rate": 4.35401374406598e-05, "loss": 0.4661, "num_input_tokens_seen": 33853120, "step": 27895 }, { "epoch": 3.1072502505846975, "grad_norm": 0.15331827104091644, "learning_rate": 4.353687717800544e-05, "loss": 0.4582, "num_input_tokens_seen": 33859232, "step": 27900 }, { "epoch": 3.107807105468315, "grad_norm": 0.10272470116615295, "learning_rate": 4.353361621496845e-05, "loss": 0.4518, "num_input_tokens_seen": 33865376, "step": 27905 }, { "epoch": 3.1083639603519324, "grad_norm": 0.19890852272510529, "learning_rate": 4.3530354551672044e-05, "loss": 0.4799, "num_input_tokens_seen": 33871328, "step": 27910 }, { "epoch": 3.1089208152355496, "grad_norm": 0.1026298776268959, "learning_rate": 4.352709218823946e-05, "loss": 0.4611, "num_input_tokens_seen": 33877440, "step": 27915 }, { "epoch": 3.109477670119167, "grad_norm": 0.10004115849733353, "learning_rate": 4.352382912479396e-05, "loss": 0.4519, "num_input_tokens_seen": 33883392, "step": 27920 }, { "epoch": 3.110034525002784, "grad_norm": 0.11619684845209122, "learning_rate": 4.352056536145883e-05, "loss": 0.4616, "num_input_tokens_seen": 33888832, "step": 27925 }, { "epoch": 3.1105913798864018, "grad_norm": 0.15735012292861938, "learning_rate": 4.3517300898357395e-05, "loss": 0.4704, "num_input_tokens_seen": 33894304, "step": 27930 }, { "epoch": 3.111148234770019, "grad_norm": 0.12498043477535248, "learning_rate": 4.351403573561299e-05, "loss": 0.4692, "num_input_tokens_seen": 33900416, "step": 27935 }, { "epoch": 3.1117050896536362, "grad_norm": 0.11788874119520187, "learning_rate": 4.3510769873348986e-05, "loss": 0.4605, "num_input_tokens_seen": 33906304, "step": 27940 }, { "epoch": 3.1122619445372535, "grad_norm": 0.15750154852867126, "learning_rate": 4.350750331168877e-05, "loss": 0.4679, "num_input_tokens_seen": 33912288, "step": 27945 }, { "epoch": 3.1128187994208707, "grad_norm": 0.11702271550893784, "learning_rate": 4.350423605075577e-05, "loss": 0.4609, "num_input_tokens_seen": 33918432, "step": 27950 }, { "epoch": 3.1133756543044884, "grad_norm": 0.19277402758598328, "learning_rate": 4.3500968090673435e-05, "loss": 0.4747, "num_input_tokens_seen": 33924768, "step": 27955 }, { "epoch": 3.1139325091881056, "grad_norm": 0.12556341290473938, "learning_rate": 4.349769943156523e-05, "loss": 0.4635, "num_input_tokens_seen": 33930912, "step": 27960 }, { "epoch": 3.114489364071723, "grad_norm": 0.08921421319246292, "learning_rate": 4.349443007355466e-05, "loss": 0.4587, "num_input_tokens_seen": 33937120, "step": 27965 }, { "epoch": 3.11504621895534, "grad_norm": 0.10459333658218384, "learning_rate": 4.349116001676527e-05, "loss": 0.4704, "num_input_tokens_seen": 33943808, "step": 27970 }, { "epoch": 3.1156030738389577, "grad_norm": 0.1232912465929985, "learning_rate": 4.348788926132058e-05, "loss": 0.4459, "num_input_tokens_seen": 33949728, "step": 27975 }, { "epoch": 3.116159928722575, "grad_norm": 0.12263281643390656, "learning_rate": 4.348461780734419e-05, "loss": 0.4725, "num_input_tokens_seen": 33955904, "step": 27980 }, { "epoch": 3.116716783606192, "grad_norm": 0.16183717548847198, "learning_rate": 4.3481345654959705e-05, "loss": 0.4586, "num_input_tokens_seen": 33961856, "step": 27985 }, { "epoch": 3.1172736384898094, "grad_norm": 0.13802671432495117, "learning_rate": 4.347807280429075e-05, "loss": 0.4573, "num_input_tokens_seen": 33967872, "step": 27990 }, { "epoch": 3.117830493373427, "grad_norm": 0.1310061514377594, "learning_rate": 4.347479925546099e-05, "loss": 0.4713, "num_input_tokens_seen": 33974080, "step": 27995 }, { "epoch": 3.1183873482570443, "grad_norm": 0.14056169986724854, "learning_rate": 4.347152500859412e-05, "loss": 0.462, "num_input_tokens_seen": 33980352, "step": 28000 }, { "epoch": 3.1189442031406616, "grad_norm": 0.1072525754570961, "learning_rate": 4.346825006381383e-05, "loss": 0.4789, "num_input_tokens_seen": 33986176, "step": 28005 }, { "epoch": 3.119501058024279, "grad_norm": 0.11089914292097092, "learning_rate": 4.3464974421243864e-05, "loss": 0.4675, "num_input_tokens_seen": 33992320, "step": 28010 }, { "epoch": 3.120057912907896, "grad_norm": 0.11915770918130875, "learning_rate": 4.3461698081007994e-05, "loss": 0.4559, "num_input_tokens_seen": 33997760, "step": 28015 }, { "epoch": 3.1206147677915137, "grad_norm": 0.11973831802606583, "learning_rate": 4.3458421043230015e-05, "loss": 0.4523, "num_input_tokens_seen": 34003904, "step": 28020 }, { "epoch": 3.121171622675131, "grad_norm": 0.14238956570625305, "learning_rate": 4.345514330803373e-05, "loss": 0.4697, "num_input_tokens_seen": 34010016, "step": 28025 }, { "epoch": 3.121728477558748, "grad_norm": 0.11484923213720322, "learning_rate": 4.345186487554299e-05, "loss": 0.4703, "num_input_tokens_seen": 34016480, "step": 28030 }, { "epoch": 3.1222853324423654, "grad_norm": 0.1673959344625473, "learning_rate": 4.344858574588166e-05, "loss": 0.4494, "num_input_tokens_seen": 34022432, "step": 28035 }, { "epoch": 3.1228421873259826, "grad_norm": 0.13089779019355774, "learning_rate": 4.344530591917364e-05, "loss": 0.4579, "num_input_tokens_seen": 34028736, "step": 28040 }, { "epoch": 3.1233990422096003, "grad_norm": 0.12178640812635422, "learning_rate": 4.344202539554285e-05, "loss": 0.4703, "num_input_tokens_seen": 34034592, "step": 28045 }, { "epoch": 3.1239558970932175, "grad_norm": 0.09592227637767792, "learning_rate": 4.343874417511324e-05, "loss": 0.4637, "num_input_tokens_seen": 34040640, "step": 28050 }, { "epoch": 3.1245127519768348, "grad_norm": 0.1310955286026001, "learning_rate": 4.3435462258008794e-05, "loss": 0.4567, "num_input_tokens_seen": 34046848, "step": 28055 }, { "epoch": 3.125069606860452, "grad_norm": 0.11051442474126816, "learning_rate": 4.34321796443535e-05, "loss": 0.4633, "num_input_tokens_seen": 34053024, "step": 28060 }, { "epoch": 3.1256264617440697, "grad_norm": 0.15144844353199005, "learning_rate": 4.3428896334271384e-05, "loss": 0.4751, "num_input_tokens_seen": 34059072, "step": 28065 }, { "epoch": 3.126183316627687, "grad_norm": 0.12385359406471252, "learning_rate": 4.3425612327886514e-05, "loss": 0.4481, "num_input_tokens_seen": 34065120, "step": 28070 }, { "epoch": 3.126740171511304, "grad_norm": 0.13531138002872467, "learning_rate": 4.3422327625322954e-05, "loss": 0.462, "num_input_tokens_seen": 34071072, "step": 28075 }, { "epoch": 3.1272970263949214, "grad_norm": 0.15258599817752838, "learning_rate": 4.341904222670483e-05, "loss": 0.4727, "num_input_tokens_seen": 34077408, "step": 28080 }, { "epoch": 3.127853881278539, "grad_norm": 0.10881300270557404, "learning_rate": 4.341575613215626e-05, "loss": 0.4709, "num_input_tokens_seen": 34083488, "step": 28085 }, { "epoch": 3.1284107361621563, "grad_norm": 0.08778203278779984, "learning_rate": 4.34124693418014e-05, "loss": 0.4682, "num_input_tokens_seen": 34089312, "step": 28090 }, { "epoch": 3.1289675910457735, "grad_norm": 0.15650838613510132, "learning_rate": 4.340918185576445e-05, "loss": 0.4637, "num_input_tokens_seen": 34095136, "step": 28095 }, { "epoch": 3.1295244459293907, "grad_norm": 0.13553135097026825, "learning_rate": 4.3405893674169614e-05, "loss": 0.4598, "num_input_tokens_seen": 34100800, "step": 28100 }, { "epoch": 3.130081300813008, "grad_norm": 0.13093140721321106, "learning_rate": 4.340260479714113e-05, "loss": 0.4705, "num_input_tokens_seen": 34106592, "step": 28105 }, { "epoch": 3.1306381556966256, "grad_norm": 0.13919660449028015, "learning_rate": 4.339931522480326e-05, "loss": 0.4617, "num_input_tokens_seen": 34112672, "step": 28110 }, { "epoch": 3.131195010580243, "grad_norm": 0.2189219743013382, "learning_rate": 4.3396024957280304e-05, "loss": 0.4663, "num_input_tokens_seen": 34118304, "step": 28115 }, { "epoch": 3.13175186546386, "grad_norm": 0.1163911372423172, "learning_rate": 4.3392733994696566e-05, "loss": 0.4517, "num_input_tokens_seen": 34124192, "step": 28120 }, { "epoch": 3.1323087203474773, "grad_norm": 0.12347003072500229, "learning_rate": 4.338944233717639e-05, "loss": 0.4446, "num_input_tokens_seen": 34130336, "step": 28125 }, { "epoch": 3.1328655752310945, "grad_norm": 0.19499215483665466, "learning_rate": 4.338614998484416e-05, "loss": 0.4712, "num_input_tokens_seen": 34136320, "step": 28130 }, { "epoch": 3.133422430114712, "grad_norm": 0.11111737787723541, "learning_rate": 4.3382856937824255e-05, "loss": 0.4584, "num_input_tokens_seen": 34142432, "step": 28135 }, { "epoch": 3.1339792849983295, "grad_norm": 0.12317745387554169, "learning_rate": 4.3379563196241114e-05, "loss": 0.4547, "num_input_tokens_seen": 34148288, "step": 28140 }, { "epoch": 3.1345361398819467, "grad_norm": 0.13754431903362274, "learning_rate": 4.337626876021917e-05, "loss": 0.4658, "num_input_tokens_seen": 34154304, "step": 28145 }, { "epoch": 3.135092994765564, "grad_norm": 0.10397885739803314, "learning_rate": 4.33729736298829e-05, "loss": 0.4706, "num_input_tokens_seen": 34160448, "step": 28150 }, { "epoch": 3.1356498496491816, "grad_norm": 0.1070360392332077, "learning_rate": 4.3369677805356815e-05, "loss": 0.4533, "num_input_tokens_seen": 34166880, "step": 28155 }, { "epoch": 3.136206704532799, "grad_norm": 0.1065153032541275, "learning_rate": 4.3366381286765423e-05, "loss": 0.4635, "num_input_tokens_seen": 34173088, "step": 28160 }, { "epoch": 3.136763559416416, "grad_norm": 0.14939966797828674, "learning_rate": 4.33630840742333e-05, "loss": 0.4742, "num_input_tokens_seen": 34178496, "step": 28165 }, { "epoch": 3.1373204143000333, "grad_norm": 0.12349631637334824, "learning_rate": 4.3359786167885e-05, "loss": 0.4632, "num_input_tokens_seen": 34184576, "step": 28170 }, { "epoch": 3.137877269183651, "grad_norm": 0.16766904294490814, "learning_rate": 4.335648756784515e-05, "loss": 0.4638, "num_input_tokens_seen": 34190368, "step": 28175 }, { "epoch": 3.138434124067268, "grad_norm": 0.132097989320755, "learning_rate": 4.335318827423837e-05, "loss": 0.4542, "num_input_tokens_seen": 34196448, "step": 28180 }, { "epoch": 3.1389909789508854, "grad_norm": 0.1226917952299118, "learning_rate": 4.334988828718932e-05, "loss": 0.4578, "num_input_tokens_seen": 34202144, "step": 28185 }, { "epoch": 3.1395478338345026, "grad_norm": 0.12729984521865845, "learning_rate": 4.334658760682269e-05, "loss": 0.4425, "num_input_tokens_seen": 34208256, "step": 28190 }, { "epoch": 3.14010468871812, "grad_norm": 0.13396091759204865, "learning_rate": 4.334328623326319e-05, "loss": 0.4699, "num_input_tokens_seen": 34214336, "step": 28195 }, { "epoch": 3.1406615436017375, "grad_norm": 0.16527408361434937, "learning_rate": 4.3339984166635536e-05, "loss": 0.4513, "num_input_tokens_seen": 34220480, "step": 28200 }, { "epoch": 3.141218398485355, "grad_norm": 0.13447976112365723, "learning_rate": 4.333668140706452e-05, "loss": 0.4488, "num_input_tokens_seen": 34226592, "step": 28205 }, { "epoch": 3.141775253368972, "grad_norm": 0.11040487140417099, "learning_rate": 4.3333377954674915e-05, "loss": 0.4758, "num_input_tokens_seen": 34232928, "step": 28210 }, { "epoch": 3.1423321082525892, "grad_norm": 0.13354134559631348, "learning_rate": 4.333007380959154e-05, "loss": 0.4412, "num_input_tokens_seen": 34238976, "step": 28215 }, { "epoch": 3.1428889631362065, "grad_norm": 0.1169794350862503, "learning_rate": 4.332676897193924e-05, "loss": 0.4701, "num_input_tokens_seen": 34245024, "step": 28220 }, { "epoch": 3.143445818019824, "grad_norm": 0.10502474009990692, "learning_rate": 4.332346344184286e-05, "loss": 0.472, "num_input_tokens_seen": 34250816, "step": 28225 }, { "epoch": 3.1440026729034414, "grad_norm": 0.1338246762752533, "learning_rate": 4.332015721942733e-05, "loss": 0.4569, "num_input_tokens_seen": 34256736, "step": 28230 }, { "epoch": 3.1445595277870586, "grad_norm": 0.11477957665920258, "learning_rate": 4.3316850304817534e-05, "loss": 0.4657, "num_input_tokens_seen": 34262784, "step": 28235 }, { "epoch": 3.145116382670676, "grad_norm": 0.08885497599840164, "learning_rate": 4.3313542698138446e-05, "loss": 0.4695, "num_input_tokens_seen": 34268992, "step": 28240 }, { "epoch": 3.1456732375542935, "grad_norm": 0.09723988175392151, "learning_rate": 4.3310234399515026e-05, "loss": 0.4605, "num_input_tokens_seen": 34275104, "step": 28245 }, { "epoch": 3.1462300924379107, "grad_norm": 0.11399378627538681, "learning_rate": 4.3306925409072263e-05, "loss": 0.4576, "num_input_tokens_seen": 34281120, "step": 28250 }, { "epoch": 3.146786947321528, "grad_norm": 0.10517893731594086, "learning_rate": 4.33036157269352e-05, "loss": 0.4683, "num_input_tokens_seen": 34286752, "step": 28255 }, { "epoch": 3.147343802205145, "grad_norm": 0.08157717436552048, "learning_rate": 4.3300305353228876e-05, "loss": 0.4595, "num_input_tokens_seen": 34292992, "step": 28260 }, { "epoch": 3.147900657088763, "grad_norm": 0.14064176380634308, "learning_rate": 4.3296994288078364e-05, "loss": 0.4621, "num_input_tokens_seen": 34299200, "step": 28265 }, { "epoch": 3.14845751197238, "grad_norm": 0.16659945249557495, "learning_rate": 4.329368253160878e-05, "loss": 0.461, "num_input_tokens_seen": 34305312, "step": 28270 }, { "epoch": 3.1490143668559973, "grad_norm": 0.12120193988084793, "learning_rate": 4.3290370083945233e-05, "loss": 0.4582, "num_input_tokens_seen": 34311264, "step": 28275 }, { "epoch": 3.1495712217396146, "grad_norm": 0.11516106873750687, "learning_rate": 4.32870569452129e-05, "loss": 0.4504, "num_input_tokens_seen": 34317536, "step": 28280 }, { "epoch": 3.150128076623232, "grad_norm": 0.10396004468202591, "learning_rate": 4.328374311553696e-05, "loss": 0.4542, "num_input_tokens_seen": 34323616, "step": 28285 }, { "epoch": 3.1506849315068495, "grad_norm": 0.10112843662500381, "learning_rate": 4.32804285950426e-05, "loss": 0.4488, "num_input_tokens_seen": 34329664, "step": 28290 }, { "epoch": 3.1512417863904667, "grad_norm": 0.11507201194763184, "learning_rate": 4.327711338385506e-05, "loss": 0.4565, "num_input_tokens_seen": 34335584, "step": 28295 }, { "epoch": 3.151798641274084, "grad_norm": 0.10987822711467743, "learning_rate": 4.327379748209961e-05, "loss": 0.4631, "num_input_tokens_seen": 34341600, "step": 28300 }, { "epoch": 3.152355496157701, "grad_norm": 0.11840551346540451, "learning_rate": 4.327048088990153e-05, "loss": 0.4712, "num_input_tokens_seen": 34347616, "step": 28305 }, { "epoch": 3.152912351041319, "grad_norm": 0.15748944878578186, "learning_rate": 4.3267163607386134e-05, "loss": 0.4677, "num_input_tokens_seen": 34353472, "step": 28310 }, { "epoch": 3.153469205924936, "grad_norm": 0.12725339829921722, "learning_rate": 4.326384563467876e-05, "loss": 0.4534, "num_input_tokens_seen": 34359488, "step": 28315 }, { "epoch": 3.1540260608085533, "grad_norm": 0.1909237802028656, "learning_rate": 4.326052697190476e-05, "loss": 0.4655, "num_input_tokens_seen": 34365184, "step": 28320 }, { "epoch": 3.1545829156921705, "grad_norm": 0.1694549024105072, "learning_rate": 4.325720761918954e-05, "loss": 0.4599, "num_input_tokens_seen": 34371136, "step": 28325 }, { "epoch": 3.1551397705757878, "grad_norm": 0.1183214858174324, "learning_rate": 4.32538875766585e-05, "loss": 0.4545, "num_input_tokens_seen": 34377344, "step": 28330 }, { "epoch": 3.1556966254594054, "grad_norm": 0.10870831459760666, "learning_rate": 4.32505668444371e-05, "loss": 0.4755, "num_input_tokens_seen": 34383424, "step": 28335 }, { "epoch": 3.1562534803430227, "grad_norm": 0.1296456903219223, "learning_rate": 4.3247245422650784e-05, "loss": 0.4689, "num_input_tokens_seen": 34389696, "step": 28340 }, { "epoch": 3.15681033522664, "grad_norm": 0.11324621737003326, "learning_rate": 4.324392331142507e-05, "loss": 0.473, "num_input_tokens_seen": 34395904, "step": 28345 }, { "epoch": 3.157367190110257, "grad_norm": 0.17200687527656555, "learning_rate": 4.324060051088546e-05, "loss": 0.4548, "num_input_tokens_seen": 34402240, "step": 28350 }, { "epoch": 3.157924044993875, "grad_norm": 0.11964454501867294, "learning_rate": 4.323727702115752e-05, "loss": 0.4583, "num_input_tokens_seen": 34408352, "step": 28355 }, { "epoch": 3.158480899877492, "grad_norm": 0.08967360109090805, "learning_rate": 4.323395284236681e-05, "loss": 0.467, "num_input_tokens_seen": 34414528, "step": 28360 }, { "epoch": 3.1590377547611093, "grad_norm": 0.11389283090829849, "learning_rate": 4.3230627974638914e-05, "loss": 0.4755, "num_input_tokens_seen": 34420384, "step": 28365 }, { "epoch": 3.1595946096447265, "grad_norm": 0.1258040964603424, "learning_rate": 4.322730241809948e-05, "loss": 0.4511, "num_input_tokens_seen": 34426752, "step": 28370 }, { "epoch": 3.1601514645283437, "grad_norm": 0.13436844944953918, "learning_rate": 4.3223976172874145e-05, "loss": 0.4611, "num_input_tokens_seen": 34433056, "step": 28375 }, { "epoch": 3.1607083194119614, "grad_norm": 0.14782710373401642, "learning_rate": 4.322064923908859e-05, "loss": 0.4684, "num_input_tokens_seen": 34439040, "step": 28380 }, { "epoch": 3.1612651742955786, "grad_norm": 0.13830935955047607, "learning_rate": 4.3217321616868516e-05, "loss": 0.4708, "num_input_tokens_seen": 34445152, "step": 28385 }, { "epoch": 3.161822029179196, "grad_norm": 0.11648387461900711, "learning_rate": 4.321399330633965e-05, "loss": 0.4601, "num_input_tokens_seen": 34451648, "step": 28390 }, { "epoch": 3.162378884062813, "grad_norm": 0.1852940320968628, "learning_rate": 4.321066430762775e-05, "loss": 0.4627, "num_input_tokens_seen": 34457760, "step": 28395 }, { "epoch": 3.1629357389464308, "grad_norm": 0.10356373339891434, "learning_rate": 4.3207334620858584e-05, "loss": 0.4693, "num_input_tokens_seen": 34464000, "step": 28400 }, { "epoch": 3.163492593830048, "grad_norm": 0.1028912141919136, "learning_rate": 4.320400424615798e-05, "loss": 0.4508, "num_input_tokens_seen": 34470112, "step": 28405 }, { "epoch": 3.1640494487136652, "grad_norm": 0.10858127474784851, "learning_rate": 4.320067318365175e-05, "loss": 0.4659, "num_input_tokens_seen": 34476224, "step": 28410 }, { "epoch": 3.1646063035972825, "grad_norm": 0.17708829045295715, "learning_rate": 4.319734143346576e-05, "loss": 0.4731, "num_input_tokens_seen": 34482496, "step": 28415 }, { "epoch": 3.1651631584808997, "grad_norm": 0.13478462398052216, "learning_rate": 4.3194008995725904e-05, "loss": 0.467, "num_input_tokens_seen": 34488384, "step": 28420 }, { "epoch": 3.1657200133645174, "grad_norm": 0.09888643771409988, "learning_rate": 4.319067587055807e-05, "loss": 0.4564, "num_input_tokens_seen": 34493984, "step": 28425 }, { "epoch": 3.1662768682481346, "grad_norm": 0.09807983785867691, "learning_rate": 4.318734205808822e-05, "loss": 0.447, "num_input_tokens_seen": 34500000, "step": 28430 }, { "epoch": 3.166833723131752, "grad_norm": 0.16078297793865204, "learning_rate": 4.3184007558442294e-05, "loss": 0.4613, "num_input_tokens_seen": 34506048, "step": 28435 }, { "epoch": 3.167390578015369, "grad_norm": 0.13552899658679962, "learning_rate": 4.318067237174629e-05, "loss": 0.4633, "num_input_tokens_seen": 34511776, "step": 28440 }, { "epoch": 3.1679474328989867, "grad_norm": 0.11640194803476334, "learning_rate": 4.317733649812622e-05, "loss": 0.4595, "num_input_tokens_seen": 34517888, "step": 28445 }, { "epoch": 3.168504287782604, "grad_norm": 0.10823042690753937, "learning_rate": 4.317399993770813e-05, "loss": 0.4456, "num_input_tokens_seen": 34523296, "step": 28450 }, { "epoch": 3.169061142666221, "grad_norm": 0.12673242390155792, "learning_rate": 4.317066269061808e-05, "loss": 0.4641, "num_input_tokens_seen": 34529696, "step": 28455 }, { "epoch": 3.1696179975498384, "grad_norm": 0.1496077924966812, "learning_rate": 4.316732475698216e-05, "loss": 0.4592, "num_input_tokens_seen": 34535744, "step": 28460 }, { "epoch": 3.1701748524334556, "grad_norm": 0.12592248618602753, "learning_rate": 4.3163986136926485e-05, "loss": 0.4586, "num_input_tokens_seen": 34541856, "step": 28465 }, { "epoch": 3.1707317073170733, "grad_norm": 0.10673803091049194, "learning_rate": 4.316064683057721e-05, "loss": 0.4809, "num_input_tokens_seen": 34547680, "step": 28470 }, { "epoch": 3.1712885622006906, "grad_norm": 0.13889016211032867, "learning_rate": 4.315730683806051e-05, "loss": 0.4431, "num_input_tokens_seen": 34553760, "step": 28475 }, { "epoch": 3.171845417084308, "grad_norm": 0.11873536556959152, "learning_rate": 4.315396615950255e-05, "loss": 0.4725, "num_input_tokens_seen": 34559904, "step": 28480 }, { "epoch": 3.172402271967925, "grad_norm": 0.16044142842292786, "learning_rate": 4.315062479502958e-05, "loss": 0.4789, "num_input_tokens_seen": 34565440, "step": 28485 }, { "epoch": 3.1729591268515427, "grad_norm": 0.15318670868873596, "learning_rate": 4.3147282744767844e-05, "loss": 0.4661, "num_input_tokens_seen": 34571648, "step": 28490 }, { "epoch": 3.17351598173516, "grad_norm": 0.1743767112493515, "learning_rate": 4.31439400088436e-05, "loss": 0.473, "num_input_tokens_seen": 34577632, "step": 28495 }, { "epoch": 3.174072836618777, "grad_norm": 0.133060023188591, "learning_rate": 4.314059658738316e-05, "loss": 0.4717, "num_input_tokens_seen": 34583808, "step": 28500 }, { "epoch": 3.1746296915023944, "grad_norm": 0.116344153881073, "learning_rate": 4.313725248051286e-05, "loss": 0.4708, "num_input_tokens_seen": 34590080, "step": 28505 }, { "epoch": 3.1751865463860116, "grad_norm": 0.1364380270242691, "learning_rate": 4.313390768835902e-05, "loss": 0.4612, "num_input_tokens_seen": 34596480, "step": 28510 }, { "epoch": 3.1757434012696293, "grad_norm": 0.1097930371761322, "learning_rate": 4.313056221104804e-05, "loss": 0.4526, "num_input_tokens_seen": 34602560, "step": 28515 }, { "epoch": 3.1763002561532465, "grad_norm": 0.11805015057325363, "learning_rate": 4.312721604870632e-05, "loss": 0.466, "num_input_tokens_seen": 34607328, "step": 28520 }, { "epoch": 3.1768571110368637, "grad_norm": 0.1424911618232727, "learning_rate": 4.3123869201460285e-05, "loss": 0.46, "num_input_tokens_seen": 34613472, "step": 28525 }, { "epoch": 3.177413965920481, "grad_norm": 0.12875480949878693, "learning_rate": 4.3120521669436395e-05, "loss": 0.4665, "num_input_tokens_seen": 34619488, "step": 28530 }, { "epoch": 3.1779708208040987, "grad_norm": 0.13183827698230743, "learning_rate": 4.311717345276112e-05, "loss": 0.4631, "num_input_tokens_seen": 34624928, "step": 28535 }, { "epoch": 3.178527675687716, "grad_norm": 0.11208347231149673, "learning_rate": 4.311382455156098e-05, "loss": 0.4561, "num_input_tokens_seen": 34631264, "step": 28540 }, { "epoch": 3.179084530571333, "grad_norm": 0.14762316644191742, "learning_rate": 4.311047496596249e-05, "loss": 0.4668, "num_input_tokens_seen": 34637664, "step": 28545 }, { "epoch": 3.1796413854549503, "grad_norm": 0.17148372530937195, "learning_rate": 4.3107124696092216e-05, "loss": 0.4622, "num_input_tokens_seen": 34643840, "step": 28550 }, { "epoch": 3.1801982403385676, "grad_norm": 0.1445646435022354, "learning_rate": 4.3103773742076754e-05, "loss": 0.4521, "num_input_tokens_seen": 34649952, "step": 28555 }, { "epoch": 3.1807550952221852, "grad_norm": 0.12512676417827606, "learning_rate": 4.31004221040427e-05, "loss": 0.4661, "num_input_tokens_seen": 34655776, "step": 28560 }, { "epoch": 3.1813119501058025, "grad_norm": 0.13974526524543762, "learning_rate": 4.3097069782116694e-05, "loss": 0.4469, "num_input_tokens_seen": 34661952, "step": 28565 }, { "epoch": 3.1818688049894197, "grad_norm": 0.11915986239910126, "learning_rate": 4.30937167764254e-05, "loss": 0.4629, "num_input_tokens_seen": 34668384, "step": 28570 }, { "epoch": 3.182425659873037, "grad_norm": 0.14161457121372223, "learning_rate": 4.3090363087095494e-05, "loss": 0.4757, "num_input_tokens_seen": 34673856, "step": 28575 }, { "epoch": 3.1829825147566546, "grad_norm": 0.12146051973104477, "learning_rate": 4.308700871425369e-05, "loss": 0.4501, "num_input_tokens_seen": 34679712, "step": 28580 }, { "epoch": 3.183539369640272, "grad_norm": 0.17819690704345703, "learning_rate": 4.3083653658026754e-05, "loss": 0.4723, "num_input_tokens_seen": 34685856, "step": 28585 }, { "epoch": 3.184096224523889, "grad_norm": 0.13342075049877167, "learning_rate": 4.308029791854142e-05, "loss": 0.458, "num_input_tokens_seen": 34691808, "step": 28590 }, { "epoch": 3.1846530794075063, "grad_norm": 0.1565374881029129, "learning_rate": 4.3076941495924495e-05, "loss": 0.4573, "num_input_tokens_seen": 34698048, "step": 28595 }, { "epoch": 3.1852099342911235, "grad_norm": 0.13195565342903137, "learning_rate": 4.3073584390302784e-05, "loss": 0.4628, "num_input_tokens_seen": 34704096, "step": 28600 }, { "epoch": 3.185766789174741, "grad_norm": 0.11766593158245087, "learning_rate": 4.307022660180314e-05, "loss": 0.4433, "num_input_tokens_seen": 34710048, "step": 28605 }, { "epoch": 3.1863236440583584, "grad_norm": 0.1322002112865448, "learning_rate": 4.306686813055242e-05, "loss": 0.4656, "num_input_tokens_seen": 34716256, "step": 28610 }, { "epoch": 3.1868804989419757, "grad_norm": 0.14015278220176697, "learning_rate": 4.306350897667753e-05, "loss": 0.4472, "num_input_tokens_seen": 34722464, "step": 28615 }, { "epoch": 3.187437353825593, "grad_norm": 0.12983085215091705, "learning_rate": 4.3060149140305375e-05, "loss": 0.4661, "num_input_tokens_seen": 34728384, "step": 28620 }, { "epoch": 3.1879942087092106, "grad_norm": 0.14783872663974762, "learning_rate": 4.305678862156292e-05, "loss": 0.4584, "num_input_tokens_seen": 34733984, "step": 28625 }, { "epoch": 3.188551063592828, "grad_norm": 0.13096250593662262, "learning_rate": 4.305342742057712e-05, "loss": 0.4796, "num_input_tokens_seen": 34739936, "step": 28630 }, { "epoch": 3.189107918476445, "grad_norm": 0.1097068265080452, "learning_rate": 4.305006553747497e-05, "loss": 0.4539, "num_input_tokens_seen": 34746176, "step": 28635 }, { "epoch": 3.1896647733600623, "grad_norm": 0.11970420926809311, "learning_rate": 4.304670297238351e-05, "loss": 0.4585, "num_input_tokens_seen": 34752192, "step": 28640 }, { "epoch": 3.1902216282436795, "grad_norm": 0.12420641630887985, "learning_rate": 4.3043339725429775e-05, "loss": 0.4729, "num_input_tokens_seen": 34758336, "step": 28645 }, { "epoch": 3.190778483127297, "grad_norm": 0.1692015826702118, "learning_rate": 4.303997579674084e-05, "loss": 0.4629, "num_input_tokens_seen": 34764768, "step": 28650 }, { "epoch": 3.1913353380109144, "grad_norm": 0.12457901239395142, "learning_rate": 4.303661118644381e-05, "loss": 0.4709, "num_input_tokens_seen": 34770528, "step": 28655 }, { "epoch": 3.1918921928945316, "grad_norm": 0.18224555253982544, "learning_rate": 4.3033245894665814e-05, "loss": 0.4535, "num_input_tokens_seen": 34776384, "step": 28660 }, { "epoch": 3.192449047778149, "grad_norm": 0.12741056084632874, "learning_rate": 4.302987992153399e-05, "loss": 0.4632, "num_input_tokens_seen": 34782272, "step": 28665 }, { "epoch": 3.1930059026617665, "grad_norm": 0.11238522827625275, "learning_rate": 4.302651326717553e-05, "loss": 0.4587, "num_input_tokens_seen": 34788288, "step": 28670 }, { "epoch": 3.1935627575453838, "grad_norm": 0.15654587745666504, "learning_rate": 4.302314593171762e-05, "loss": 0.4581, "num_input_tokens_seen": 34794208, "step": 28675 }, { "epoch": 3.194119612429001, "grad_norm": 0.11898581683635712, "learning_rate": 4.30197779152875e-05, "loss": 0.4508, "num_input_tokens_seen": 34800384, "step": 28680 }, { "epoch": 3.1946764673126182, "grad_norm": 0.13948391377925873, "learning_rate": 4.301640921801243e-05, "loss": 0.4869, "num_input_tokens_seen": 34806144, "step": 28685 }, { "epoch": 3.1952333221962355, "grad_norm": 0.11628442257642746, "learning_rate": 4.301303984001967e-05, "loss": 0.4563, "num_input_tokens_seen": 34811680, "step": 28690 }, { "epoch": 3.195790177079853, "grad_norm": 0.20612318813800812, "learning_rate": 4.300966978143656e-05, "loss": 0.4629, "num_input_tokens_seen": 34818176, "step": 28695 }, { "epoch": 3.1963470319634704, "grad_norm": 0.12799912691116333, "learning_rate": 4.30062990423904e-05, "loss": 0.4529, "num_input_tokens_seen": 34824544, "step": 28700 }, { "epoch": 3.1969038868470876, "grad_norm": 0.0880209282040596, "learning_rate": 4.300292762300856e-05, "loss": 0.4687, "num_input_tokens_seen": 34830528, "step": 28705 }, { "epoch": 3.197460741730705, "grad_norm": 0.16237971186637878, "learning_rate": 4.299955552341841e-05, "loss": 0.4526, "num_input_tokens_seen": 34836512, "step": 28710 }, { "epoch": 3.1980175966143225, "grad_norm": 0.128991037607193, "learning_rate": 4.299618274374738e-05, "loss": 0.4705, "num_input_tokens_seen": 34842816, "step": 28715 }, { "epoch": 3.1985744514979397, "grad_norm": 0.16599759459495544, "learning_rate": 4.2992809284122895e-05, "loss": 0.4714, "num_input_tokens_seen": 34848832, "step": 28720 }, { "epoch": 3.199131306381557, "grad_norm": 0.14820118248462677, "learning_rate": 4.2989435144672415e-05, "loss": 0.4536, "num_input_tokens_seen": 34855136, "step": 28725 }, { "epoch": 3.199688161265174, "grad_norm": 0.13099981844425201, "learning_rate": 4.2986060325523424e-05, "loss": 0.4615, "num_input_tokens_seen": 34860992, "step": 28730 }, { "epoch": 3.2002450161487914, "grad_norm": 0.13171954452991486, "learning_rate": 4.298268482680343e-05, "loss": 0.4752, "num_input_tokens_seen": 34867328, "step": 28735 }, { "epoch": 3.200801871032409, "grad_norm": 0.10439538955688477, "learning_rate": 4.297930864863998e-05, "loss": 0.4637, "num_input_tokens_seen": 34873536, "step": 28740 }, { "epoch": 3.2013587259160263, "grad_norm": 0.10328970104455948, "learning_rate": 4.297593179116063e-05, "loss": 0.4726, "num_input_tokens_seen": 34879648, "step": 28745 }, { "epoch": 3.2019155807996436, "grad_norm": 0.16385114192962646, "learning_rate": 4.297255425449297e-05, "loss": 0.4675, "num_input_tokens_seen": 34885888, "step": 28750 }, { "epoch": 3.202472435683261, "grad_norm": 0.11038190871477127, "learning_rate": 4.2969176038764615e-05, "loss": 0.4506, "num_input_tokens_seen": 34891968, "step": 28755 }, { "epoch": 3.2030292905668785, "grad_norm": 0.12584595382213593, "learning_rate": 4.29657971441032e-05, "loss": 0.4643, "num_input_tokens_seen": 34897632, "step": 28760 }, { "epoch": 3.2035861454504957, "grad_norm": 0.13536937534809113, "learning_rate": 4.296241757063641e-05, "loss": 0.4629, "num_input_tokens_seen": 34903808, "step": 28765 }, { "epoch": 3.204143000334113, "grad_norm": 0.11972139775753021, "learning_rate": 4.2959037318491904e-05, "loss": 0.4575, "num_input_tokens_seen": 34909824, "step": 28770 }, { "epoch": 3.20469985521773, "grad_norm": 0.11936008185148239, "learning_rate": 4.295565638779742e-05, "loss": 0.4592, "num_input_tokens_seen": 34915616, "step": 28775 }, { "epoch": 3.2052567101013474, "grad_norm": 0.13244172930717468, "learning_rate": 4.29522747786807e-05, "loss": 0.4752, "num_input_tokens_seen": 34921920, "step": 28780 }, { "epoch": 3.205813564984965, "grad_norm": 0.15902459621429443, "learning_rate": 4.2948892491269504e-05, "loss": 0.4663, "num_input_tokens_seen": 34927520, "step": 28785 }, { "epoch": 3.2063704198685823, "grad_norm": 0.1526286005973816, "learning_rate": 4.294550952569163e-05, "loss": 0.4472, "num_input_tokens_seen": 34933664, "step": 28790 }, { "epoch": 3.2069272747521995, "grad_norm": 0.14204257726669312, "learning_rate": 4.29421258820749e-05, "loss": 0.4613, "num_input_tokens_seen": 34939872, "step": 28795 }, { "epoch": 3.2074841296358167, "grad_norm": 0.16534312069416046, "learning_rate": 4.2938741560547155e-05, "loss": 0.4641, "num_input_tokens_seen": 34945344, "step": 28800 }, { "epoch": 3.2080409845194344, "grad_norm": 0.15369577705860138, "learning_rate": 4.293535656123626e-05, "loss": 0.4654, "num_input_tokens_seen": 34951456, "step": 28805 }, { "epoch": 3.2085978394030517, "grad_norm": 0.14121156930923462, "learning_rate": 4.293197088427012e-05, "loss": 0.469, "num_input_tokens_seen": 34957696, "step": 28810 }, { "epoch": 3.209154694286669, "grad_norm": 0.11285538971424103, "learning_rate": 4.292858452977666e-05, "loss": 0.4825, "num_input_tokens_seen": 34963904, "step": 28815 }, { "epoch": 3.209711549170286, "grad_norm": 0.08809149265289307, "learning_rate": 4.292519749788382e-05, "loss": 0.4526, "num_input_tokens_seen": 34969760, "step": 28820 }, { "epoch": 3.2102684040539033, "grad_norm": 0.1494196057319641, "learning_rate": 4.292180978871957e-05, "loss": 0.4466, "num_input_tokens_seen": 34975936, "step": 28825 }, { "epoch": 3.210825258937521, "grad_norm": 0.12844178080558777, "learning_rate": 4.291842140241192e-05, "loss": 0.4722, "num_input_tokens_seen": 34982176, "step": 28830 }, { "epoch": 3.2113821138211383, "grad_norm": 0.08494175225496292, "learning_rate": 4.291503233908888e-05, "loss": 0.483, "num_input_tokens_seen": 34988256, "step": 28835 }, { "epoch": 3.2119389687047555, "grad_norm": 0.10743016004562378, "learning_rate": 4.291164259887851e-05, "loss": 0.4562, "num_input_tokens_seen": 34993632, "step": 28840 }, { "epoch": 3.2124958235883727, "grad_norm": 0.10855372995138168, "learning_rate": 4.290825218190888e-05, "loss": 0.4485, "num_input_tokens_seen": 34999264, "step": 28845 }, { "epoch": 3.2130526784719904, "grad_norm": 0.114527627825737, "learning_rate": 4.290486108830809e-05, "loss": 0.4639, "num_input_tokens_seen": 35005568, "step": 28850 }, { "epoch": 3.2136095333556076, "grad_norm": 0.10108284652233124, "learning_rate": 4.2901469318204275e-05, "loss": 0.4489, "num_input_tokens_seen": 35010624, "step": 28855 }, { "epoch": 3.214166388239225, "grad_norm": 0.10752779990434647, "learning_rate": 4.289807687172558e-05, "loss": 0.4571, "num_input_tokens_seen": 35016800, "step": 28860 }, { "epoch": 3.214723243122842, "grad_norm": 0.1401720494031906, "learning_rate": 4.289468374900019e-05, "loss": 0.4603, "num_input_tokens_seen": 35023040, "step": 28865 }, { "epoch": 3.2152800980064598, "grad_norm": 0.11886291205883026, "learning_rate": 4.2891289950156285e-05, "loss": 0.455, "num_input_tokens_seen": 35029024, "step": 28870 }, { "epoch": 3.215836952890077, "grad_norm": 0.16133588552474976, "learning_rate": 4.288789547532212e-05, "loss": 0.4483, "num_input_tokens_seen": 35035392, "step": 28875 }, { "epoch": 3.216393807773694, "grad_norm": 0.12901611626148224, "learning_rate": 4.2884500324625945e-05, "loss": 0.4719, "num_input_tokens_seen": 35041696, "step": 28880 }, { "epoch": 3.2169506626573114, "grad_norm": 0.11497361212968826, "learning_rate": 4.2881104498196024e-05, "loss": 0.4706, "num_input_tokens_seen": 35047712, "step": 28885 }, { "epoch": 3.2175075175409287, "grad_norm": 0.13334716856479645, "learning_rate": 4.287770799616068e-05, "loss": 0.4712, "num_input_tokens_seen": 35054048, "step": 28890 }, { "epoch": 3.2180643724245463, "grad_norm": 0.11993614584207535, "learning_rate": 4.287431081864823e-05, "loss": 0.4473, "num_input_tokens_seen": 35060160, "step": 28895 }, { "epoch": 3.2186212273081636, "grad_norm": 0.13112519681453705, "learning_rate": 4.287091296578704e-05, "loss": 0.4729, "num_input_tokens_seen": 35065856, "step": 28900 }, { "epoch": 3.219178082191781, "grad_norm": 0.10615148395299911, "learning_rate": 4.286751443770548e-05, "loss": 0.4585, "num_input_tokens_seen": 35071744, "step": 28905 }, { "epoch": 3.219734937075398, "grad_norm": 0.16153225302696228, "learning_rate": 4.2864115234531974e-05, "loss": 0.4923, "num_input_tokens_seen": 35077792, "step": 28910 }, { "epoch": 3.2202917919590153, "grad_norm": 0.10944363474845886, "learning_rate": 4.286071535639494e-05, "loss": 0.4573, "num_input_tokens_seen": 35083616, "step": 28915 }, { "epoch": 3.220848646842633, "grad_norm": 0.1165299117565155, "learning_rate": 4.285731480342285e-05, "loss": 0.4584, "num_input_tokens_seen": 35089024, "step": 28920 }, { "epoch": 3.22140550172625, "grad_norm": 0.17328645288944244, "learning_rate": 4.285391357574417e-05, "loss": 0.4631, "num_input_tokens_seen": 35095264, "step": 28925 }, { "epoch": 3.2219623566098674, "grad_norm": 0.13700877130031586, "learning_rate": 4.2850511673487425e-05, "loss": 0.458, "num_input_tokens_seen": 35101280, "step": 28930 }, { "epoch": 3.2225192114934846, "grad_norm": 0.1381559520959854, "learning_rate": 4.284710909678115e-05, "loss": 0.4542, "num_input_tokens_seen": 35107040, "step": 28935 }, { "epoch": 3.2230760663771023, "grad_norm": 0.15960440039634705, "learning_rate": 4.28437058457539e-05, "loss": 0.4557, "num_input_tokens_seen": 35113344, "step": 28940 }, { "epoch": 3.2236329212607195, "grad_norm": 0.10227257013320923, "learning_rate": 4.284030192053425e-05, "loss": 0.4736, "num_input_tokens_seen": 35119584, "step": 28945 }, { "epoch": 3.2241897761443368, "grad_norm": 0.13843925297260284, "learning_rate": 4.2836897321250826e-05, "loss": 0.4625, "num_input_tokens_seen": 35125536, "step": 28950 }, { "epoch": 3.224746631027954, "grad_norm": 0.13405287265777588, "learning_rate": 4.283349204803226e-05, "loss": 0.4669, "num_input_tokens_seen": 35131584, "step": 28955 }, { "epoch": 3.2253034859115717, "grad_norm": 0.12085862457752228, "learning_rate": 4.283008610100722e-05, "loss": 0.458, "num_input_tokens_seen": 35137856, "step": 28960 }, { "epoch": 3.225860340795189, "grad_norm": 0.2091686874628067, "learning_rate": 4.282667948030438e-05, "loss": 0.4726, "num_input_tokens_seen": 35143680, "step": 28965 }, { "epoch": 3.226417195678806, "grad_norm": 0.14747697114944458, "learning_rate": 4.282327218605246e-05, "loss": 0.45, "num_input_tokens_seen": 35149888, "step": 28970 }, { "epoch": 3.2269740505624234, "grad_norm": 0.1149211972951889, "learning_rate": 4.28198642183802e-05, "loss": 0.464, "num_input_tokens_seen": 35155936, "step": 28975 }, { "epoch": 3.2275309054460406, "grad_norm": 0.09892355650663376, "learning_rate": 4.2816455577416366e-05, "loss": 0.4577, "num_input_tokens_seen": 35161952, "step": 28980 }, { "epoch": 3.2280877603296583, "grad_norm": 0.14030566811561584, "learning_rate": 4.281304626328975e-05, "loss": 0.4618, "num_input_tokens_seen": 35167776, "step": 28985 }, { "epoch": 3.2286446152132755, "grad_norm": 0.11937558650970459, "learning_rate": 4.280963627612915e-05, "loss": 0.4633, "num_input_tokens_seen": 35173504, "step": 28990 }, { "epoch": 3.2292014700968927, "grad_norm": 0.14666633307933807, "learning_rate": 4.280622561606342e-05, "loss": 0.4595, "num_input_tokens_seen": 35179936, "step": 28995 }, { "epoch": 3.22975832498051, "grad_norm": 0.18720915913581848, "learning_rate": 4.2802814283221425e-05, "loss": 0.4744, "num_input_tokens_seen": 35186368, "step": 29000 }, { "epoch": 3.230315179864127, "grad_norm": 0.1026056557893753, "learning_rate": 4.279940227773206e-05, "loss": 0.4775, "num_input_tokens_seen": 35192512, "step": 29005 }, { "epoch": 3.230872034747745, "grad_norm": 0.1058921366930008, "learning_rate": 4.279598959972423e-05, "loss": 0.4614, "num_input_tokens_seen": 35198624, "step": 29010 }, { "epoch": 3.231428889631362, "grad_norm": 0.10889843851327896, "learning_rate": 4.279257624932689e-05, "loss": 0.4626, "num_input_tokens_seen": 35204736, "step": 29015 }, { "epoch": 3.2319857445149793, "grad_norm": 0.10422345250844955, "learning_rate": 4.278916222666899e-05, "loss": 0.461, "num_input_tokens_seen": 35210592, "step": 29020 }, { "epoch": 3.2325425993985966, "grad_norm": 0.15109428763389587, "learning_rate": 4.278574753187954e-05, "loss": 0.4802, "num_input_tokens_seen": 35216384, "step": 29025 }, { "epoch": 3.2330994542822142, "grad_norm": 0.13980624079704285, "learning_rate": 4.278233216508755e-05, "loss": 0.4615, "num_input_tokens_seen": 35222528, "step": 29030 }, { "epoch": 3.2336563091658315, "grad_norm": 0.10049472004175186, "learning_rate": 4.277891612642206e-05, "loss": 0.4706, "num_input_tokens_seen": 35228768, "step": 29035 }, { "epoch": 3.2342131640494487, "grad_norm": 0.12420159578323364, "learning_rate": 4.277549941601214e-05, "loss": 0.464, "num_input_tokens_seen": 35234624, "step": 29040 }, { "epoch": 3.234770018933066, "grad_norm": 0.11106277257204056, "learning_rate": 4.2772082033986906e-05, "loss": 0.4439, "num_input_tokens_seen": 35240896, "step": 29045 }, { "epoch": 3.2353268738166836, "grad_norm": 0.12026545405387878, "learning_rate": 4.276866398047544e-05, "loss": 0.468, "num_input_tokens_seen": 35246784, "step": 29050 }, { "epoch": 3.235883728700301, "grad_norm": 0.1156381294131279, "learning_rate": 4.2765245255606915e-05, "loss": 0.4457, "num_input_tokens_seen": 35253152, "step": 29055 }, { "epoch": 3.236440583583918, "grad_norm": 0.11373520642518997, "learning_rate": 4.27618258595105e-05, "loss": 0.456, "num_input_tokens_seen": 35259104, "step": 29060 }, { "epoch": 3.2369974384675353, "grad_norm": 0.10546388477087021, "learning_rate": 4.275840579231537e-05, "loss": 0.4629, "num_input_tokens_seen": 35264704, "step": 29065 }, { "epoch": 3.2375542933511525, "grad_norm": 0.10180344432592392, "learning_rate": 4.275498505415077e-05, "loss": 0.467, "num_input_tokens_seen": 35270688, "step": 29070 }, { "epoch": 3.23811114823477, "grad_norm": 0.13420750200748444, "learning_rate": 4.275156364514593e-05, "loss": 0.4643, "num_input_tokens_seen": 35276800, "step": 29075 }, { "epoch": 3.2386680031183874, "grad_norm": 0.16192877292633057, "learning_rate": 4.274814156543012e-05, "loss": 0.4602, "num_input_tokens_seen": 35283008, "step": 29080 }, { "epoch": 3.2392248580020047, "grad_norm": 0.09687771648168564, "learning_rate": 4.2744718815132657e-05, "loss": 0.4599, "num_input_tokens_seen": 35289024, "step": 29085 }, { "epoch": 3.239781712885622, "grad_norm": 0.1262630820274353, "learning_rate": 4.274129539438285e-05, "loss": 0.4583, "num_input_tokens_seen": 35295264, "step": 29090 }, { "epoch": 3.240338567769239, "grad_norm": 0.12923502922058105, "learning_rate": 4.273787130331005e-05, "loss": 0.4716, "num_input_tokens_seen": 35301504, "step": 29095 }, { "epoch": 3.240895422652857, "grad_norm": 0.1540880799293518, "learning_rate": 4.273444654204362e-05, "loss": 0.4614, "num_input_tokens_seen": 35307648, "step": 29100 }, { "epoch": 3.241452277536474, "grad_norm": 0.17756091058254242, "learning_rate": 4.273102111071297e-05, "loss": 0.4692, "num_input_tokens_seen": 35313984, "step": 29105 }, { "epoch": 3.2420091324200913, "grad_norm": 0.16224302351474762, "learning_rate": 4.272759500944752e-05, "loss": 0.4814, "num_input_tokens_seen": 35320128, "step": 29110 }, { "epoch": 3.2425659873037085, "grad_norm": 0.11127684265375137, "learning_rate": 4.272416823837673e-05, "loss": 0.4668, "num_input_tokens_seen": 35326048, "step": 29115 }, { "epoch": 3.243122842187326, "grad_norm": 0.12328912317752838, "learning_rate": 4.272074079763006e-05, "loss": 0.4545, "num_input_tokens_seen": 35332128, "step": 29120 }, { "epoch": 3.2436796970709434, "grad_norm": 0.19216062128543854, "learning_rate": 4.2717312687337e-05, "loss": 0.4657, "num_input_tokens_seen": 35338400, "step": 29125 }, { "epoch": 3.2442365519545606, "grad_norm": 0.16551551222801208, "learning_rate": 4.27138839076271e-05, "loss": 0.4773, "num_input_tokens_seen": 35344192, "step": 29130 }, { "epoch": 3.244793406838178, "grad_norm": 0.11033393442630768, "learning_rate": 4.27104544586299e-05, "loss": 0.4633, "num_input_tokens_seen": 35350368, "step": 29135 }, { "epoch": 3.2453502617217955, "grad_norm": 0.1256650686264038, "learning_rate": 4.2707024340474965e-05, "loss": 0.4612, "num_input_tokens_seen": 35356480, "step": 29140 }, { "epoch": 3.2459071166054128, "grad_norm": 0.12870676815509796, "learning_rate": 4.270359355329191e-05, "loss": 0.4716, "num_input_tokens_seen": 35362528, "step": 29145 }, { "epoch": 3.24646397148903, "grad_norm": 0.13934609293937683, "learning_rate": 4.2700162097210364e-05, "loss": 0.4708, "num_input_tokens_seen": 35368704, "step": 29150 }, { "epoch": 3.247020826372647, "grad_norm": 0.12321487814188004, "learning_rate": 4.269672997235997e-05, "loss": 0.4507, "num_input_tokens_seen": 35374816, "step": 29155 }, { "epoch": 3.2475776812562644, "grad_norm": 0.18311737477779388, "learning_rate": 4.269329717887039e-05, "loss": 0.4616, "num_input_tokens_seen": 35380960, "step": 29160 }, { "epoch": 3.248134536139882, "grad_norm": 0.12422938644886017, "learning_rate": 4.268986371687136e-05, "loss": 0.4858, "num_input_tokens_seen": 35387328, "step": 29165 }, { "epoch": 3.2486913910234994, "grad_norm": 0.10173743218183517, "learning_rate": 4.268642958649258e-05, "loss": 0.4715, "num_input_tokens_seen": 35393536, "step": 29170 }, { "epoch": 3.2492482459071166, "grad_norm": 0.1476956307888031, "learning_rate": 4.2682994787863814e-05, "loss": 0.4531, "num_input_tokens_seen": 35399616, "step": 29175 }, { "epoch": 3.249805100790734, "grad_norm": 0.13027462363243103, "learning_rate": 4.267955932111484e-05, "loss": 0.4602, "num_input_tokens_seen": 35405248, "step": 29180 }, { "epoch": 3.250361955674351, "grad_norm": 0.1270507574081421, "learning_rate": 4.267612318637545e-05, "loss": 0.4663, "num_input_tokens_seen": 35411200, "step": 29185 }, { "epoch": 3.2509188105579687, "grad_norm": 0.12299344688653946, "learning_rate": 4.267268638377549e-05, "loss": 0.4618, "num_input_tokens_seen": 35417248, "step": 29190 }, { "epoch": 3.251475665441586, "grad_norm": 0.10258504003286362, "learning_rate": 4.2669248913444795e-05, "loss": 0.4789, "num_input_tokens_seen": 35423360, "step": 29195 }, { "epoch": 3.252032520325203, "grad_norm": 0.15329134464263916, "learning_rate": 4.266581077551326e-05, "loss": 0.4613, "num_input_tokens_seen": 35429440, "step": 29200 }, { "epoch": 3.2525893752088204, "grad_norm": 0.11805534362792969, "learning_rate": 4.266237197011077e-05, "loss": 0.4832, "num_input_tokens_seen": 35435744, "step": 29205 }, { "epoch": 3.253146230092438, "grad_norm": 0.11417268961668015, "learning_rate": 4.265893249736728e-05, "loss": 0.4551, "num_input_tokens_seen": 35441696, "step": 29210 }, { "epoch": 3.2537030849760553, "grad_norm": 0.11219276487827301, "learning_rate": 4.265549235741271e-05, "loss": 0.4512, "num_input_tokens_seen": 35447872, "step": 29215 }, { "epoch": 3.2542599398596725, "grad_norm": 0.12288105487823486, "learning_rate": 4.265205155037707e-05, "loss": 0.475, "num_input_tokens_seen": 35454080, "step": 29220 }, { "epoch": 3.2548167947432898, "grad_norm": 0.11680471897125244, "learning_rate": 4.264861007639036e-05, "loss": 0.4743, "num_input_tokens_seen": 35460000, "step": 29225 }, { "epoch": 3.2553736496269075, "grad_norm": 0.13502605259418488, "learning_rate": 4.26451679355826e-05, "loss": 0.4752, "num_input_tokens_seen": 35465824, "step": 29230 }, { "epoch": 3.2559305045105247, "grad_norm": 0.1277243196964264, "learning_rate": 4.264172512808384e-05, "loss": 0.4546, "num_input_tokens_seen": 35472320, "step": 29235 }, { "epoch": 3.256487359394142, "grad_norm": 0.110042043030262, "learning_rate": 4.2638281654024174e-05, "loss": 0.4528, "num_input_tokens_seen": 35478560, "step": 29240 }, { "epoch": 3.257044214277759, "grad_norm": 0.1295386701822281, "learning_rate": 4.263483751353371e-05, "loss": 0.4592, "num_input_tokens_seen": 35484736, "step": 29245 }, { "epoch": 3.2576010691613764, "grad_norm": 0.13681001961231232, "learning_rate": 4.263139270674256e-05, "loss": 0.4686, "num_input_tokens_seen": 35490848, "step": 29250 }, { "epoch": 3.258157924044994, "grad_norm": 0.12770718336105347, "learning_rate": 4.26279472337809e-05, "loss": 0.4585, "num_input_tokens_seen": 35496960, "step": 29255 }, { "epoch": 3.2587147789286113, "grad_norm": 0.11219935864210129, "learning_rate": 4.262450109477889e-05, "loss": 0.4554, "num_input_tokens_seen": 35503104, "step": 29260 }, { "epoch": 3.2592716338122285, "grad_norm": 0.12164711952209473, "learning_rate": 4.2621054289866755e-05, "loss": 0.481, "num_input_tokens_seen": 35509088, "step": 29265 }, { "epoch": 3.2598284886958457, "grad_norm": 0.10004441440105438, "learning_rate": 4.2617606819174725e-05, "loss": 0.4667, "num_input_tokens_seen": 35515360, "step": 29270 }, { "epoch": 3.260385343579463, "grad_norm": 0.13996517658233643, "learning_rate": 4.2614158682833037e-05, "loss": 0.451, "num_input_tokens_seen": 35521376, "step": 29275 }, { "epoch": 3.2609421984630806, "grad_norm": 0.11105571687221527, "learning_rate": 4.2610709880972e-05, "loss": 0.4536, "num_input_tokens_seen": 35527488, "step": 29280 }, { "epoch": 3.261499053346698, "grad_norm": 0.1250024437904358, "learning_rate": 4.2607260413721905e-05, "loss": 0.4744, "num_input_tokens_seen": 35533728, "step": 29285 }, { "epoch": 3.262055908230315, "grad_norm": 0.12751325964927673, "learning_rate": 4.260381028121308e-05, "loss": 0.4735, "num_input_tokens_seen": 35539968, "step": 29290 }, { "epoch": 3.2626127631139323, "grad_norm": 0.1201147735118866, "learning_rate": 4.2600359483575894e-05, "loss": 0.4506, "num_input_tokens_seen": 35546560, "step": 29295 }, { "epoch": 3.26316961799755, "grad_norm": 0.11252783238887787, "learning_rate": 4.259690802094072e-05, "loss": 0.4642, "num_input_tokens_seen": 35552672, "step": 29300 }, { "epoch": 3.2637264728811672, "grad_norm": 0.11508534848690033, "learning_rate": 4.2593455893437974e-05, "loss": 0.4537, "num_input_tokens_seen": 35558624, "step": 29305 }, { "epoch": 3.2642833277647845, "grad_norm": 0.11522380262613297, "learning_rate": 4.259000310119808e-05, "loss": 0.4465, "num_input_tokens_seen": 35564960, "step": 29310 }, { "epoch": 3.2648401826484017, "grad_norm": 0.10603423416614532, "learning_rate": 4.2586549644351506e-05, "loss": 0.4695, "num_input_tokens_seen": 35570944, "step": 29315 }, { "epoch": 3.2653970375320194, "grad_norm": 0.11824005097150803, "learning_rate": 4.258309552302872e-05, "loss": 0.4441, "num_input_tokens_seen": 35577184, "step": 29320 }, { "epoch": 3.2659538924156366, "grad_norm": 0.12337010353803635, "learning_rate": 4.257964073736023e-05, "loss": 0.4481, "num_input_tokens_seen": 35583232, "step": 29325 }, { "epoch": 3.266510747299254, "grad_norm": 0.13846462965011597, "learning_rate": 4.257618528747659e-05, "loss": 0.4819, "num_input_tokens_seen": 35589632, "step": 29330 }, { "epoch": 3.267067602182871, "grad_norm": 0.10860874503850937, "learning_rate": 4.2572729173508335e-05, "loss": 0.465, "num_input_tokens_seen": 35595840, "step": 29335 }, { "epoch": 3.2676244570664883, "grad_norm": 0.10965383797883987, "learning_rate": 4.256927239558606e-05, "loss": 0.4582, "num_input_tokens_seen": 35601696, "step": 29340 }, { "epoch": 3.268181311950106, "grad_norm": 0.11901408433914185, "learning_rate": 4.256581495384037e-05, "loss": 0.4567, "num_input_tokens_seen": 35607616, "step": 29345 }, { "epoch": 3.268738166833723, "grad_norm": 0.1519501954317093, "learning_rate": 4.25623568484019e-05, "loss": 0.4403, "num_input_tokens_seen": 35613792, "step": 29350 }, { "epoch": 3.2692950217173404, "grad_norm": 0.12578999996185303, "learning_rate": 4.2558898079401314e-05, "loss": 0.4768, "num_input_tokens_seen": 35619776, "step": 29355 }, { "epoch": 3.2698518766009577, "grad_norm": 0.12675783038139343, "learning_rate": 4.2555438646969274e-05, "loss": 0.4468, "num_input_tokens_seen": 35626048, "step": 29360 }, { "epoch": 3.270408731484575, "grad_norm": 0.11652155220508575, "learning_rate": 4.255197855123651e-05, "loss": 0.4617, "num_input_tokens_seen": 35632192, "step": 29365 }, { "epoch": 3.2709655863681926, "grad_norm": 0.09709024429321289, "learning_rate": 4.254851779233375e-05, "loss": 0.4599, "num_input_tokens_seen": 35638112, "step": 29370 }, { "epoch": 3.27152244125181, "grad_norm": 0.1241350919008255, "learning_rate": 4.2545056370391746e-05, "loss": 0.4615, "num_input_tokens_seen": 35644256, "step": 29375 }, { "epoch": 3.272079296135427, "grad_norm": 0.1116945892572403, "learning_rate": 4.2541594285541294e-05, "loss": 0.49, "num_input_tokens_seen": 35650336, "step": 29380 }, { "epoch": 3.2726361510190443, "grad_norm": 0.15820278227329254, "learning_rate": 4.253813153791318e-05, "loss": 0.463, "num_input_tokens_seen": 35656832, "step": 29385 }, { "epoch": 3.273193005902662, "grad_norm": 0.1047930121421814, "learning_rate": 4.2534668127638274e-05, "loss": 0.4526, "num_input_tokens_seen": 35662848, "step": 29390 }, { "epoch": 3.273749860786279, "grad_norm": 0.09669012576341629, "learning_rate": 4.2531204054847404e-05, "loss": 0.4594, "num_input_tokens_seen": 35669120, "step": 29395 }, { "epoch": 3.2743067156698964, "grad_norm": 0.13143649697303772, "learning_rate": 4.252773931967146e-05, "loss": 0.4648, "num_input_tokens_seen": 35675040, "step": 29400 }, { "epoch": 3.2748635705535136, "grad_norm": 0.11108463257551193, "learning_rate": 4.2524273922241357e-05, "loss": 0.4691, "num_input_tokens_seen": 35680672, "step": 29405 }, { "epoch": 3.2754204254371313, "grad_norm": 0.10532815009355545, "learning_rate": 4.252080786268802e-05, "loss": 0.4647, "num_input_tokens_seen": 35686880, "step": 29410 }, { "epoch": 3.2759772803207485, "grad_norm": 0.13261306285858154, "learning_rate": 4.251734114114242e-05, "loss": 0.4704, "num_input_tokens_seen": 35693184, "step": 29415 }, { "epoch": 3.2765341352043658, "grad_norm": 0.11349114030599594, "learning_rate": 4.2513873757735535e-05, "loss": 0.4562, "num_input_tokens_seen": 35699264, "step": 29420 }, { "epoch": 3.277090990087983, "grad_norm": 0.11329434812068939, "learning_rate": 4.251040571259837e-05, "loss": 0.4531, "num_input_tokens_seen": 35705312, "step": 29425 }, { "epoch": 3.2776478449716, "grad_norm": 0.13189712166786194, "learning_rate": 4.250693700586197e-05, "loss": 0.4668, "num_input_tokens_seen": 35711232, "step": 29430 }, { "epoch": 3.278204699855218, "grad_norm": 0.09934023767709732, "learning_rate": 4.250346763765738e-05, "loss": 0.4619, "num_input_tokens_seen": 35717760, "step": 29435 }, { "epoch": 3.278761554738835, "grad_norm": 0.1189860999584198, "learning_rate": 4.249999760811569e-05, "loss": 0.4591, "num_input_tokens_seen": 35723680, "step": 29440 }, { "epoch": 3.2793184096224524, "grad_norm": 0.0835564136505127, "learning_rate": 4.249652691736801e-05, "loss": 0.4585, "num_input_tokens_seen": 35729184, "step": 29445 }, { "epoch": 3.2798752645060696, "grad_norm": 0.1079551950097084, "learning_rate": 4.249305556554547e-05, "loss": 0.478, "num_input_tokens_seen": 35735328, "step": 29450 }, { "epoch": 3.280432119389687, "grad_norm": 0.09566795080900192, "learning_rate": 4.248958355277923e-05, "loss": 0.4636, "num_input_tokens_seen": 35741664, "step": 29455 }, { "epoch": 3.2809889742733045, "grad_norm": 0.10618152469396591, "learning_rate": 4.248611087920048e-05, "loss": 0.4713, "num_input_tokens_seen": 35747872, "step": 29460 }, { "epoch": 3.2815458291569217, "grad_norm": 0.14302605390548706, "learning_rate": 4.248263754494043e-05, "loss": 0.4686, "num_input_tokens_seen": 35753888, "step": 29465 }, { "epoch": 3.282102684040539, "grad_norm": 0.09884412586688995, "learning_rate": 4.24791635501303e-05, "loss": 0.46, "num_input_tokens_seen": 35759936, "step": 29470 }, { "epoch": 3.282659538924156, "grad_norm": 0.13447925448417664, "learning_rate": 4.247568889490136e-05, "loss": 0.4641, "num_input_tokens_seen": 35766304, "step": 29475 }, { "epoch": 3.283216393807774, "grad_norm": 0.11839070916175842, "learning_rate": 4.247221357938488e-05, "loss": 0.4602, "num_input_tokens_seen": 35772704, "step": 29480 }, { "epoch": 3.283773248691391, "grad_norm": 0.0959438681602478, "learning_rate": 4.2468737603712186e-05, "loss": 0.4648, "num_input_tokens_seen": 35778816, "step": 29485 }, { "epoch": 3.2843301035750083, "grad_norm": 0.14272311329841614, "learning_rate": 4.2465260968014594e-05, "loss": 0.4671, "num_input_tokens_seen": 35785088, "step": 29490 }, { "epoch": 3.2848869584586255, "grad_norm": 0.0948639065027237, "learning_rate": 4.246178367242349e-05, "loss": 0.4797, "num_input_tokens_seen": 35791232, "step": 29495 }, { "epoch": 3.2854438133422432, "grad_norm": 0.14009279012680054, "learning_rate": 4.2458305717070236e-05, "loss": 0.4639, "num_input_tokens_seen": 35797376, "step": 29500 }, { "epoch": 3.2860006682258605, "grad_norm": 0.1703876107931137, "learning_rate": 4.245482710208624e-05, "loss": 0.4667, "num_input_tokens_seen": 35803264, "step": 29505 }, { "epoch": 3.2865575231094777, "grad_norm": 0.13427753746509552, "learning_rate": 4.245134782760293e-05, "loss": 0.4755, "num_input_tokens_seen": 35809312, "step": 29510 }, { "epoch": 3.287114377993095, "grad_norm": 0.1371704638004303, "learning_rate": 4.244786789375179e-05, "loss": 0.4698, "num_input_tokens_seen": 35814784, "step": 29515 }, { "epoch": 3.287671232876712, "grad_norm": 0.12159886211156845, "learning_rate": 4.244438730066427e-05, "loss": 0.4588, "num_input_tokens_seen": 35820064, "step": 29520 }, { "epoch": 3.28822808776033, "grad_norm": 0.1506028175354004, "learning_rate": 4.2440906048471906e-05, "loss": 0.4783, "num_input_tokens_seen": 35826080, "step": 29525 }, { "epoch": 3.288784942643947, "grad_norm": 0.11069072782993317, "learning_rate": 4.243742413730622e-05, "loss": 0.4562, "num_input_tokens_seen": 35831648, "step": 29530 }, { "epoch": 3.2893417975275643, "grad_norm": 0.1236729770898819, "learning_rate": 4.2433941567298764e-05, "loss": 0.481, "num_input_tokens_seen": 35837056, "step": 29535 }, { "epoch": 3.2898986524111815, "grad_norm": 0.12003162503242493, "learning_rate": 4.2430458338581124e-05, "loss": 0.4719, "num_input_tokens_seen": 35843008, "step": 29540 }, { "epoch": 3.2904555072947987, "grad_norm": 0.1436660885810852, "learning_rate": 4.242697445128492e-05, "loss": 0.4636, "num_input_tokens_seen": 35849024, "step": 29545 }, { "epoch": 3.2910123621784164, "grad_norm": 0.11180250346660614, "learning_rate": 4.242348990554177e-05, "loss": 0.4638, "num_input_tokens_seen": 35855008, "step": 29550 }, { "epoch": 3.2915692170620336, "grad_norm": 0.12057431042194366, "learning_rate": 4.242000470148333e-05, "loss": 0.4661, "num_input_tokens_seen": 35860736, "step": 29555 }, { "epoch": 3.292126071945651, "grad_norm": 0.10033781826496124, "learning_rate": 4.241651883924128e-05, "loss": 0.4762, "num_input_tokens_seen": 35866880, "step": 29560 }, { "epoch": 3.292682926829268, "grad_norm": 0.11552813649177551, "learning_rate": 4.241303231894735e-05, "loss": 0.449, "num_input_tokens_seen": 35873344, "step": 29565 }, { "epoch": 3.293239781712886, "grad_norm": 0.12735696136951447, "learning_rate": 4.240954514073325e-05, "loss": 0.4557, "num_input_tokens_seen": 35879712, "step": 29570 }, { "epoch": 3.293796636596503, "grad_norm": 0.11351612210273743, "learning_rate": 4.240605730473074e-05, "loss": 0.4718, "num_input_tokens_seen": 35885600, "step": 29575 }, { "epoch": 3.2943534914801202, "grad_norm": 0.1685934215784073, "learning_rate": 4.2402568811071606e-05, "loss": 0.4652, "num_input_tokens_seen": 35891456, "step": 29580 }, { "epoch": 3.2949103463637375, "grad_norm": 0.10609355568885803, "learning_rate": 4.239907965988765e-05, "loss": 0.4647, "num_input_tokens_seen": 35897536, "step": 29585 }, { "epoch": 3.295467201247355, "grad_norm": 0.10981972515583038, "learning_rate": 4.2395589851310715e-05, "loss": 0.4563, "num_input_tokens_seen": 35903648, "step": 29590 }, { "epoch": 3.2960240561309724, "grad_norm": 0.1347932517528534, "learning_rate": 4.239209938547265e-05, "loss": 0.4688, "num_input_tokens_seen": 35909792, "step": 29595 }, { "epoch": 3.2965809110145896, "grad_norm": 0.1130114421248436, "learning_rate": 4.238860826250533e-05, "loss": 0.4597, "num_input_tokens_seen": 35915968, "step": 29600 }, { "epoch": 3.297137765898207, "grad_norm": 0.09597102552652359, "learning_rate": 4.2385116482540656e-05, "loss": 0.4737, "num_input_tokens_seen": 35922368, "step": 29605 }, { "epoch": 3.297694620781824, "grad_norm": 0.09740894287824631, "learning_rate": 4.238162404571058e-05, "loss": 0.4598, "num_input_tokens_seen": 35928000, "step": 29610 }, { "epoch": 3.2982514756654417, "grad_norm": 0.09666173905134201, "learning_rate": 4.237813095214704e-05, "loss": 0.4496, "num_input_tokens_seen": 35933824, "step": 29615 }, { "epoch": 3.298808330549059, "grad_norm": 0.13475936651229858, "learning_rate": 4.237463720198202e-05, "loss": 0.4786, "num_input_tokens_seen": 35940256, "step": 29620 }, { "epoch": 3.299365185432676, "grad_norm": 0.14426052570343018, "learning_rate": 4.237114279534753e-05, "loss": 0.4586, "num_input_tokens_seen": 35946560, "step": 29625 }, { "epoch": 3.2999220403162934, "grad_norm": 0.12109409272670746, "learning_rate": 4.23676477323756e-05, "loss": 0.459, "num_input_tokens_seen": 35952832, "step": 29630 }, { "epoch": 3.3004788951999107, "grad_norm": 0.13080094754695892, "learning_rate": 4.2364152013198276e-05, "loss": 0.4546, "num_input_tokens_seen": 35958688, "step": 29635 }, { "epoch": 3.3010357500835283, "grad_norm": 0.10121232271194458, "learning_rate": 4.236065563794764e-05, "loss": 0.4525, "num_input_tokens_seen": 35964864, "step": 29640 }, { "epoch": 3.3015926049671456, "grad_norm": 0.09270922094583511, "learning_rate": 4.235715860675581e-05, "loss": 0.4631, "num_input_tokens_seen": 35971200, "step": 29645 }, { "epoch": 3.302149459850763, "grad_norm": 0.08761291205883026, "learning_rate": 4.23536609197549e-05, "loss": 0.4624, "num_input_tokens_seen": 35977408, "step": 29650 }, { "epoch": 3.30270631473438, "grad_norm": 0.12612485885620117, "learning_rate": 4.2350162577077065e-05, "loss": 0.4656, "num_input_tokens_seen": 35983168, "step": 29655 }, { "epoch": 3.3032631696179977, "grad_norm": 0.12032785266637802, "learning_rate": 4.234666357885449e-05, "loss": 0.4847, "num_input_tokens_seen": 35989504, "step": 29660 }, { "epoch": 3.303820024501615, "grad_norm": 0.11015147715806961, "learning_rate": 4.2343163925219364e-05, "loss": 0.4685, "num_input_tokens_seen": 35995776, "step": 29665 }, { "epoch": 3.304376879385232, "grad_norm": 0.09216252714395523, "learning_rate": 4.233966361630394e-05, "loss": 0.4682, "num_input_tokens_seen": 36002016, "step": 29670 }, { "epoch": 3.3049337342688494, "grad_norm": 0.13340015709400177, "learning_rate": 4.233616265224045e-05, "loss": 0.461, "num_input_tokens_seen": 36008448, "step": 29675 }, { "epoch": 3.305490589152467, "grad_norm": 0.14332540333271027, "learning_rate": 4.233266103316117e-05, "loss": 0.4568, "num_input_tokens_seen": 36015008, "step": 29680 }, { "epoch": 3.3060474440360843, "grad_norm": 0.16446836292743683, "learning_rate": 4.232915875919843e-05, "loss": 0.4745, "num_input_tokens_seen": 36021024, "step": 29685 }, { "epoch": 3.3066042989197015, "grad_norm": 0.10807877779006958, "learning_rate": 4.232565583048452e-05, "loss": 0.4496, "num_input_tokens_seen": 36027008, "step": 29690 }, { "epoch": 3.3071611538033188, "grad_norm": 0.10310488194227219, "learning_rate": 4.232215224715182e-05, "loss": 0.473, "num_input_tokens_seen": 36033504, "step": 29695 }, { "epoch": 3.307718008686936, "grad_norm": 0.1529274731874466, "learning_rate": 4.231864800933269e-05, "loss": 0.4655, "num_input_tokens_seen": 36039616, "step": 29700 }, { "epoch": 3.3082748635705537, "grad_norm": 0.12436238676309586, "learning_rate": 4.231514311715955e-05, "loss": 0.4622, "num_input_tokens_seen": 36045792, "step": 29705 }, { "epoch": 3.308831718454171, "grad_norm": 0.12150358408689499, "learning_rate": 4.231163757076481e-05, "loss": 0.4559, "num_input_tokens_seen": 36051648, "step": 29710 }, { "epoch": 3.309388573337788, "grad_norm": 0.170377716422081, "learning_rate": 4.230813137028092e-05, "loss": 0.4588, "num_input_tokens_seen": 36057952, "step": 29715 }, { "epoch": 3.3099454282214054, "grad_norm": 0.12005269527435303, "learning_rate": 4.2304624515840355e-05, "loss": 0.4671, "num_input_tokens_seen": 36064192, "step": 29720 }, { "epoch": 3.3105022831050226, "grad_norm": 0.1388518214225769, "learning_rate": 4.230111700757563e-05, "loss": 0.4676, "num_input_tokens_seen": 36069984, "step": 29725 }, { "epoch": 3.3110591379886403, "grad_norm": 0.1468159407377243, "learning_rate": 4.229760884561926e-05, "loss": 0.4618, "num_input_tokens_seen": 36076608, "step": 29730 }, { "epoch": 3.3116159928722575, "grad_norm": 0.16236406564712524, "learning_rate": 4.229410003010379e-05, "loss": 0.4711, "num_input_tokens_seen": 36082688, "step": 29735 }, { "epoch": 3.3121728477558747, "grad_norm": 0.11482227593660355, "learning_rate": 4.2290590561161805e-05, "loss": 0.4671, "num_input_tokens_seen": 36088736, "step": 29740 }, { "epoch": 3.312729702639492, "grad_norm": 0.14697229862213135, "learning_rate": 4.228708043892589e-05, "loss": 0.4504, "num_input_tokens_seen": 36094944, "step": 29745 }, { "epoch": 3.3132865575231096, "grad_norm": 0.10918533056974411, "learning_rate": 4.2283569663528685e-05, "loss": 0.4495, "num_input_tokens_seen": 36101120, "step": 29750 }, { "epoch": 3.313843412406727, "grad_norm": 0.12153332680463791, "learning_rate": 4.2280058235102824e-05, "loss": 0.4689, "num_input_tokens_seen": 36107200, "step": 29755 }, { "epoch": 3.314400267290344, "grad_norm": 0.17296352982521057, "learning_rate": 4.2276546153780986e-05, "loss": 0.4596, "num_input_tokens_seen": 36113504, "step": 29760 }, { "epoch": 3.3149571221739613, "grad_norm": 0.1425979882478714, "learning_rate": 4.227303341969587e-05, "loss": 0.4666, "num_input_tokens_seen": 36119680, "step": 29765 }, { "epoch": 3.315513977057579, "grad_norm": 0.1397036910057068, "learning_rate": 4.2269520032980204e-05, "loss": 0.4629, "num_input_tokens_seen": 36125696, "step": 29770 }, { "epoch": 3.3160708319411962, "grad_norm": 0.14122216403484344, "learning_rate": 4.226600599376672e-05, "loss": 0.4641, "num_input_tokens_seen": 36131840, "step": 29775 }, { "epoch": 3.3166276868248135, "grad_norm": 0.15740911662578583, "learning_rate": 4.22624913021882e-05, "loss": 0.4545, "num_input_tokens_seen": 36137952, "step": 29780 }, { "epoch": 3.3171845417084307, "grad_norm": 0.12036801129579544, "learning_rate": 4.225897595837744e-05, "loss": 0.4731, "num_input_tokens_seen": 36144032, "step": 29785 }, { "epoch": 3.317741396592048, "grad_norm": 0.0979459211230278, "learning_rate": 4.225545996246726e-05, "loss": 0.4534, "num_input_tokens_seen": 36150656, "step": 29790 }, { "epoch": 3.3182982514756656, "grad_norm": 0.16134080290794373, "learning_rate": 4.22519433145905e-05, "loss": 0.4831, "num_input_tokens_seen": 36156768, "step": 29795 }, { "epoch": 3.318855106359283, "grad_norm": 0.13759687542915344, "learning_rate": 4.2248426014880036e-05, "loss": 0.4524, "num_input_tokens_seen": 36162880, "step": 29800 }, { "epoch": 3.3194119612429, "grad_norm": 0.11953316628932953, "learning_rate": 4.224490806346877e-05, "loss": 0.4572, "num_input_tokens_seen": 36168768, "step": 29805 }, { "epoch": 3.3199688161265173, "grad_norm": 0.14480799436569214, "learning_rate": 4.2241389460489606e-05, "loss": 0.456, "num_input_tokens_seen": 36174784, "step": 29810 }, { "epoch": 3.3205256710101345, "grad_norm": 0.1528232991695404, "learning_rate": 4.2237870206075495e-05, "loss": 0.4686, "num_input_tokens_seen": 36180704, "step": 29815 }, { "epoch": 3.321082525893752, "grad_norm": 0.16244405508041382, "learning_rate": 4.223435030035941e-05, "loss": 0.4824, "num_input_tokens_seen": 36186752, "step": 29820 }, { "epoch": 3.3216393807773694, "grad_norm": 0.10948565602302551, "learning_rate": 4.223082974347434e-05, "loss": 0.4754, "num_input_tokens_seen": 36192960, "step": 29825 }, { "epoch": 3.3221962356609867, "grad_norm": 0.10231157392263412, "learning_rate": 4.22273085355533e-05, "loss": 0.4529, "num_input_tokens_seen": 36199008, "step": 29830 }, { "epoch": 3.322753090544604, "grad_norm": 0.09366181492805481, "learning_rate": 4.222378667672934e-05, "loss": 0.4552, "num_input_tokens_seen": 36205312, "step": 29835 }, { "epoch": 3.3233099454282216, "grad_norm": 0.12234435975551605, "learning_rate": 4.222026416713551e-05, "loss": 0.4626, "num_input_tokens_seen": 36211360, "step": 29840 }, { "epoch": 3.323866800311839, "grad_norm": 0.10822921991348267, "learning_rate": 4.2216741006904935e-05, "loss": 0.4639, "num_input_tokens_seen": 36217728, "step": 29845 }, { "epoch": 3.324423655195456, "grad_norm": 0.1351218819618225, "learning_rate": 4.221321719617071e-05, "loss": 0.4661, "num_input_tokens_seen": 36224096, "step": 29850 }, { "epoch": 3.3249805100790732, "grad_norm": 0.1218707412481308, "learning_rate": 4.220969273506597e-05, "loss": 0.4681, "num_input_tokens_seen": 36230112, "step": 29855 }, { "epoch": 3.325537364962691, "grad_norm": 0.13925603032112122, "learning_rate": 4.220616762372388e-05, "loss": 0.456, "num_input_tokens_seen": 36236256, "step": 29860 }, { "epoch": 3.326094219846308, "grad_norm": 0.11395327746868134, "learning_rate": 4.220264186227764e-05, "loss": 0.4666, "num_input_tokens_seen": 36242432, "step": 29865 }, { "epoch": 3.3266510747299254, "grad_norm": 0.08769935369491577, "learning_rate": 4.219911545086047e-05, "loss": 0.4683, "num_input_tokens_seen": 36247936, "step": 29870 }, { "epoch": 3.3272079296135426, "grad_norm": 0.1427718847990036, "learning_rate": 4.219558838960559e-05, "loss": 0.4604, "num_input_tokens_seen": 36253824, "step": 29875 }, { "epoch": 3.32776478449716, "grad_norm": 0.09251856803894043, "learning_rate": 4.219206067864628e-05, "loss": 0.46, "num_input_tokens_seen": 36260000, "step": 29880 }, { "epoch": 3.3283216393807775, "grad_norm": 0.10189501941204071, "learning_rate": 4.2188532318115826e-05, "loss": 0.4652, "num_input_tokens_seen": 36266112, "step": 29885 }, { "epoch": 3.3288784942643947, "grad_norm": 0.13396356999874115, "learning_rate": 4.218500330814753e-05, "loss": 0.4628, "num_input_tokens_seen": 36272032, "step": 29890 }, { "epoch": 3.329435349148012, "grad_norm": 0.10035090893507004, "learning_rate": 4.218147364887475e-05, "loss": 0.4562, "num_input_tokens_seen": 36277984, "step": 29895 }, { "epoch": 3.329992204031629, "grad_norm": 0.13482865691184998, "learning_rate": 4.217794334043083e-05, "loss": 0.4542, "num_input_tokens_seen": 36283808, "step": 29900 }, { "epoch": 3.3305490589152464, "grad_norm": 0.1437745839357376, "learning_rate": 4.217441238294915e-05, "loss": 0.4538, "num_input_tokens_seen": 36289984, "step": 29905 }, { "epoch": 3.331105913798864, "grad_norm": 0.12273465842008591, "learning_rate": 4.217088077656314e-05, "loss": 0.4558, "num_input_tokens_seen": 36295968, "step": 29910 }, { "epoch": 3.3316627686824813, "grad_norm": 0.11279196292161942, "learning_rate": 4.216734852140623e-05, "loss": 0.4637, "num_input_tokens_seen": 36302112, "step": 29915 }, { "epoch": 3.3322196235660986, "grad_norm": 0.1013297587633133, "learning_rate": 4.216381561761188e-05, "loss": 0.4677, "num_input_tokens_seen": 36308096, "step": 29920 }, { "epoch": 3.3327764784497163, "grad_norm": 0.13817448914051056, "learning_rate": 4.216028206531356e-05, "loss": 0.4621, "num_input_tokens_seen": 36313952, "step": 29925 }, { "epoch": 3.3333333333333335, "grad_norm": 0.14917142689228058, "learning_rate": 4.21567478646448e-05, "loss": 0.4607, "num_input_tokens_seen": 36319872, "step": 29930 }, { "epoch": 3.3338901882169507, "grad_norm": 0.10254120826721191, "learning_rate": 4.215321301573912e-05, "loss": 0.4654, "num_input_tokens_seen": 36326208, "step": 29935 }, { "epoch": 3.334447043100568, "grad_norm": 0.10799755156040192, "learning_rate": 4.214967751873008e-05, "loss": 0.4514, "num_input_tokens_seen": 36332032, "step": 29940 }, { "epoch": 3.335003897984185, "grad_norm": 0.12778319418430328, "learning_rate": 4.214614137375127e-05, "loss": 0.4563, "num_input_tokens_seen": 36338368, "step": 29945 }, { "epoch": 3.335560752867803, "grad_norm": 0.11930571496486664, "learning_rate": 4.2142604580936296e-05, "loss": 0.4692, "num_input_tokens_seen": 36344576, "step": 29950 }, { "epoch": 3.33611760775142, "grad_norm": 0.13681086897850037, "learning_rate": 4.213906714041878e-05, "loss": 0.4534, "num_input_tokens_seen": 36350656, "step": 29955 }, { "epoch": 3.3366744626350373, "grad_norm": 0.16670529544353485, "learning_rate": 4.213552905233238e-05, "loss": 0.466, "num_input_tokens_seen": 36356576, "step": 29960 }, { "epoch": 3.3372313175186545, "grad_norm": 0.10449153929948807, "learning_rate": 4.2131990316810775e-05, "loss": 0.4637, "num_input_tokens_seen": 36362784, "step": 29965 }, { "epoch": 3.3377881724022718, "grad_norm": 0.10623490810394287, "learning_rate": 4.212845093398768e-05, "loss": 0.4647, "num_input_tokens_seen": 36368768, "step": 29970 }, { "epoch": 3.3383450272858894, "grad_norm": 0.13990792632102966, "learning_rate": 4.2124910903996815e-05, "loss": 0.4565, "num_input_tokens_seen": 36374848, "step": 29975 }, { "epoch": 3.3389018821695067, "grad_norm": 0.09817210584878922, "learning_rate": 4.2121370226971935e-05, "loss": 0.4648, "num_input_tokens_seen": 36381280, "step": 29980 }, { "epoch": 3.339458737053124, "grad_norm": 0.07854503393173218, "learning_rate": 4.211782890304683e-05, "loss": 0.471, "num_input_tokens_seen": 36386816, "step": 29985 }, { "epoch": 3.340015591936741, "grad_norm": 0.1481683850288391, "learning_rate": 4.211428693235529e-05, "loss": 0.4636, "num_input_tokens_seen": 36393088, "step": 29990 }, { "epoch": 3.3405724468203584, "grad_norm": 0.13208907842636108, "learning_rate": 4.211074431503113e-05, "loss": 0.473, "num_input_tokens_seen": 36399520, "step": 29995 }, { "epoch": 3.341129301703976, "grad_norm": 0.10922414809465408, "learning_rate": 4.210720105120822e-05, "loss": 0.4594, "num_input_tokens_seen": 36405728, "step": 30000 }, { "epoch": 3.3416861565875933, "grad_norm": 0.10624771565198898, "learning_rate": 4.210365714102044e-05, "loss": 0.4661, "num_input_tokens_seen": 36411616, "step": 30005 }, { "epoch": 3.3422430114712105, "grad_norm": 0.11671468615531921, "learning_rate": 4.210011258460168e-05, "loss": 0.4711, "num_input_tokens_seen": 36417600, "step": 30010 }, { "epoch": 3.342799866354828, "grad_norm": 0.11306707561016083, "learning_rate": 4.2096567382085865e-05, "loss": 0.4608, "num_input_tokens_seen": 36423296, "step": 30015 }, { "epoch": 3.3433567212384454, "grad_norm": 0.10760153830051422, "learning_rate": 4.209302153360695e-05, "loss": 0.4637, "num_input_tokens_seen": 36429248, "step": 30020 }, { "epoch": 3.3439135761220626, "grad_norm": 0.12768998742103577, "learning_rate": 4.2089475039298896e-05, "loss": 0.4708, "num_input_tokens_seen": 36435680, "step": 30025 }, { "epoch": 3.34447043100568, "grad_norm": 0.10182685405015945, "learning_rate": 4.208592789929571e-05, "loss": 0.4646, "num_input_tokens_seen": 36441184, "step": 30030 }, { "epoch": 3.345027285889297, "grad_norm": 0.12544572353363037, "learning_rate": 4.208238011373142e-05, "loss": 0.4489, "num_input_tokens_seen": 36447168, "step": 30035 }, { "epoch": 3.3455841407729148, "grad_norm": 0.11133358627557755, "learning_rate": 4.207883168274006e-05, "loss": 0.4555, "num_input_tokens_seen": 36453120, "step": 30040 }, { "epoch": 3.346140995656532, "grad_norm": 0.16463597118854523, "learning_rate": 4.207528260645571e-05, "loss": 0.45, "num_input_tokens_seen": 36459104, "step": 30045 }, { "epoch": 3.3466978505401492, "grad_norm": 0.13520517945289612, "learning_rate": 4.207173288501247e-05, "loss": 0.4655, "num_input_tokens_seen": 36465376, "step": 30050 }, { "epoch": 3.3472547054237665, "grad_norm": 0.12446291744709015, "learning_rate": 4.206818251854444e-05, "loss": 0.4622, "num_input_tokens_seen": 36471616, "step": 30055 }, { "epoch": 3.3478115603073837, "grad_norm": 0.12218022346496582, "learning_rate": 4.206463150718578e-05, "loss": 0.4606, "num_input_tokens_seen": 36477600, "step": 30060 }, { "epoch": 3.3483684151910014, "grad_norm": 0.08874090015888214, "learning_rate": 4.2061079851070664e-05, "loss": 0.4329, "num_input_tokens_seen": 36483360, "step": 30065 }, { "epoch": 3.3489252700746186, "grad_norm": 0.15385861694812775, "learning_rate": 4.2057527550333266e-05, "loss": 0.4658, "num_input_tokens_seen": 36489216, "step": 30070 }, { "epoch": 3.349482124958236, "grad_norm": 0.09856243431568146, "learning_rate": 4.205397460510782e-05, "loss": 0.4493, "num_input_tokens_seen": 36494912, "step": 30075 }, { "epoch": 3.350038979841853, "grad_norm": 0.13518108427524567, "learning_rate": 4.205042101552855e-05, "loss": 0.472, "num_input_tokens_seen": 36500864, "step": 30080 }, { "epoch": 3.3505958347254707, "grad_norm": 0.10802628099918365, "learning_rate": 4.204686678172975e-05, "loss": 0.4629, "num_input_tokens_seen": 36506784, "step": 30085 }, { "epoch": 3.351152689609088, "grad_norm": 0.12997165322303772, "learning_rate": 4.2043311903845684e-05, "loss": 0.4713, "num_input_tokens_seen": 36512480, "step": 30090 }, { "epoch": 3.351709544492705, "grad_norm": 0.10278818756341934, "learning_rate": 4.203975638201068e-05, "loss": 0.47, "num_input_tokens_seen": 36518336, "step": 30095 }, { "epoch": 3.3522663993763224, "grad_norm": 0.12007410079240799, "learning_rate": 4.2036200216359065e-05, "loss": 0.4504, "num_input_tokens_seen": 36524576, "step": 30100 }, { "epoch": 3.35282325425994, "grad_norm": 0.1461155116558075, "learning_rate": 4.2032643407025215e-05, "loss": 0.4694, "num_input_tokens_seen": 36530432, "step": 30105 }, { "epoch": 3.3533801091435573, "grad_norm": 0.14072725176811218, "learning_rate": 4.202908595414352e-05, "loss": 0.4582, "num_input_tokens_seen": 36536416, "step": 30110 }, { "epoch": 3.3539369640271746, "grad_norm": 0.09556271880865097, "learning_rate": 4.202552785784838e-05, "loss": 0.4657, "num_input_tokens_seen": 36542400, "step": 30115 }, { "epoch": 3.354493818910792, "grad_norm": 0.12108960747718811, "learning_rate": 4.202196911827423e-05, "loss": 0.4747, "num_input_tokens_seen": 36548512, "step": 30120 }, { "epoch": 3.355050673794409, "grad_norm": 0.11670606583356857, "learning_rate": 4.2018409735555546e-05, "loss": 0.4653, "num_input_tokens_seen": 36554560, "step": 30125 }, { "epoch": 3.3556075286780267, "grad_norm": 0.11819538474082947, "learning_rate": 4.20148497098268e-05, "loss": 0.4783, "num_input_tokens_seen": 36560736, "step": 30130 }, { "epoch": 3.356164383561644, "grad_norm": 0.09779588878154755, "learning_rate": 4.201128904122251e-05, "loss": 0.4567, "num_input_tokens_seen": 36566080, "step": 30135 }, { "epoch": 3.356721238445261, "grad_norm": 0.10746373236179352, "learning_rate": 4.20077277298772e-05, "loss": 0.4625, "num_input_tokens_seen": 36572512, "step": 30140 }, { "epoch": 3.3572780933288784, "grad_norm": 0.1285521239042282, "learning_rate": 4.200416577592544e-05, "loss": 0.4503, "num_input_tokens_seen": 36578400, "step": 30145 }, { "epoch": 3.3578349482124956, "grad_norm": 0.13024458289146423, "learning_rate": 4.200060317950179e-05, "loss": 0.4685, "num_input_tokens_seen": 36584160, "step": 30150 }, { "epoch": 3.3583918030961133, "grad_norm": 0.12378101050853729, "learning_rate": 4.199703994074089e-05, "loss": 0.442, "num_input_tokens_seen": 36590272, "step": 30155 }, { "epoch": 3.3589486579797305, "grad_norm": 0.11890289932489395, "learning_rate": 4.1993476059777336e-05, "loss": 0.4629, "num_input_tokens_seen": 36596000, "step": 30160 }, { "epoch": 3.3595055128633478, "grad_norm": 0.14592857658863068, "learning_rate": 4.1989911536745804e-05, "loss": 0.4562, "num_input_tokens_seen": 36602016, "step": 30165 }, { "epoch": 3.360062367746965, "grad_norm": 0.11367523670196533, "learning_rate": 4.198634637178097e-05, "loss": 0.4667, "num_input_tokens_seen": 36608160, "step": 30170 }, { "epoch": 3.3606192226305827, "grad_norm": 0.13269071280956268, "learning_rate": 4.198278056501752e-05, "loss": 0.4756, "num_input_tokens_seen": 36614304, "step": 30175 }, { "epoch": 3.3611760775142, "grad_norm": 0.12280430644750595, "learning_rate": 4.1979214116590214e-05, "loss": 0.462, "num_input_tokens_seen": 36620320, "step": 30180 }, { "epoch": 3.361732932397817, "grad_norm": 0.10775475203990936, "learning_rate": 4.197564702663378e-05, "loss": 0.4707, "num_input_tokens_seen": 36626688, "step": 30185 }, { "epoch": 3.3622897872814344, "grad_norm": 0.1313132792711258, "learning_rate": 4.1972079295283e-05, "loss": 0.4573, "num_input_tokens_seen": 36632864, "step": 30190 }, { "epoch": 3.362846642165052, "grad_norm": 0.0938175767660141, "learning_rate": 4.196851092267267e-05, "loss": 0.4594, "num_input_tokens_seen": 36638816, "step": 30195 }, { "epoch": 3.3634034970486693, "grad_norm": 0.10490816086530685, "learning_rate": 4.1964941908937626e-05, "loss": 0.4494, "num_input_tokens_seen": 36645120, "step": 30200 }, { "epoch": 3.3639603519322865, "grad_norm": 0.09741601347923279, "learning_rate": 4.1961372254212705e-05, "loss": 0.4549, "num_input_tokens_seen": 36650784, "step": 30205 }, { "epoch": 3.3645172068159037, "grad_norm": 0.11545456200838089, "learning_rate": 4.195780195863279e-05, "loss": 0.4515, "num_input_tokens_seen": 36656480, "step": 30210 }, { "epoch": 3.365074061699521, "grad_norm": 0.08654936403036118, "learning_rate": 4.1954231022332766e-05, "loss": 0.4671, "num_input_tokens_seen": 36662208, "step": 30215 }, { "epoch": 3.3656309165831386, "grad_norm": 0.15718843042850494, "learning_rate": 4.1950659445447565e-05, "loss": 0.4754, "num_input_tokens_seen": 36667616, "step": 30220 }, { "epoch": 3.366187771466756, "grad_norm": 0.1605207324028015, "learning_rate": 4.194708722811213e-05, "loss": 0.4649, "num_input_tokens_seen": 36673952, "step": 30225 }, { "epoch": 3.366744626350373, "grad_norm": 0.11006669700145721, "learning_rate": 4.194351437046143e-05, "loss": 0.4488, "num_input_tokens_seen": 36679840, "step": 30230 }, { "epoch": 3.3673014812339903, "grad_norm": 0.16708534955978394, "learning_rate": 4.1939940872630457e-05, "loss": 0.4713, "num_input_tokens_seen": 36685824, "step": 30235 }, { "epoch": 3.3678583361176075, "grad_norm": 0.11746451258659363, "learning_rate": 4.193636673475423e-05, "loss": 0.4774, "num_input_tokens_seen": 36692224, "step": 30240 }, { "epoch": 3.368415191001225, "grad_norm": 0.12013409286737442, "learning_rate": 4.1932791956967796e-05, "loss": 0.4504, "num_input_tokens_seen": 36698528, "step": 30245 }, { "epoch": 3.3689720458848424, "grad_norm": 0.16206350922584534, "learning_rate": 4.1929216539406216e-05, "loss": 0.4685, "num_input_tokens_seen": 36704416, "step": 30250 }, { "epoch": 3.3695289007684597, "grad_norm": 0.13465897738933563, "learning_rate": 4.192564048220459e-05, "loss": 0.4692, "num_input_tokens_seen": 36710144, "step": 30255 }, { "epoch": 3.370085755652077, "grad_norm": 0.11243236064910889, "learning_rate": 4.192206378549802e-05, "loss": 0.4637, "num_input_tokens_seen": 36716288, "step": 30260 }, { "epoch": 3.3706426105356946, "grad_norm": 0.12392149120569229, "learning_rate": 4.1918486449421645e-05, "loss": 0.4658, "num_input_tokens_seen": 36722304, "step": 30265 }, { "epoch": 3.371199465419312, "grad_norm": 0.124268539249897, "learning_rate": 4.191490847411064e-05, "loss": 0.4659, "num_input_tokens_seen": 36728288, "step": 30270 }, { "epoch": 3.371756320302929, "grad_norm": 0.12199502438306808, "learning_rate": 4.191132985970019e-05, "loss": 0.4739, "num_input_tokens_seen": 36734592, "step": 30275 }, { "epoch": 3.3723131751865463, "grad_norm": 0.12196394801139832, "learning_rate": 4.190775060632549e-05, "loss": 0.4605, "num_input_tokens_seen": 36740864, "step": 30280 }, { "epoch": 3.372870030070164, "grad_norm": 0.12401818484067917, "learning_rate": 4.1904170714121805e-05, "loss": 0.4513, "num_input_tokens_seen": 36747232, "step": 30285 }, { "epoch": 3.373426884953781, "grad_norm": 0.19659100472927094, "learning_rate": 4.190059018322436e-05, "loss": 0.4702, "num_input_tokens_seen": 36753184, "step": 30290 }, { "epoch": 3.3739837398373984, "grad_norm": 0.11970939487218857, "learning_rate": 4.189700901376847e-05, "loss": 0.463, "num_input_tokens_seen": 36758976, "step": 30295 }, { "epoch": 3.3745405947210156, "grad_norm": 0.10953889787197113, "learning_rate": 4.189342720588943e-05, "loss": 0.4526, "num_input_tokens_seen": 36765216, "step": 30300 }, { "epoch": 3.375097449604633, "grad_norm": 0.10714147239923477, "learning_rate": 4.1889844759722574e-05, "loss": 0.4283, "num_input_tokens_seen": 36771552, "step": 30305 }, { "epoch": 3.3756543044882505, "grad_norm": 0.10395821183919907, "learning_rate": 4.188626167540325e-05, "loss": 0.4802, "num_input_tokens_seen": 36777856, "step": 30310 }, { "epoch": 3.3762111593718678, "grad_norm": 0.1894557625055313, "learning_rate": 4.188267795306685e-05, "loss": 0.4795, "num_input_tokens_seen": 36783744, "step": 30315 }, { "epoch": 3.376768014255485, "grad_norm": 0.13421444594860077, "learning_rate": 4.187909359284877e-05, "loss": 0.4672, "num_input_tokens_seen": 36789920, "step": 30320 }, { "epoch": 3.3773248691391022, "grad_norm": 0.10705658048391342, "learning_rate": 4.187550859488445e-05, "loss": 0.4599, "num_input_tokens_seen": 36795744, "step": 30325 }, { "epoch": 3.3778817240227195, "grad_norm": 0.16581189632415771, "learning_rate": 4.187192295930933e-05, "loss": 0.4848, "num_input_tokens_seen": 36801760, "step": 30330 }, { "epoch": 3.378438578906337, "grad_norm": 0.11896905303001404, "learning_rate": 4.186833668625889e-05, "loss": 0.4899, "num_input_tokens_seen": 36807968, "step": 30335 }, { "epoch": 3.3789954337899544, "grad_norm": 0.1374536156654358, "learning_rate": 4.1864749775868634e-05, "loss": 0.4598, "num_input_tokens_seen": 36814048, "step": 30340 }, { "epoch": 3.3795522886735716, "grad_norm": 0.10145257413387299, "learning_rate": 4.1861162228274086e-05, "loss": 0.4588, "num_input_tokens_seen": 36819968, "step": 30345 }, { "epoch": 3.380109143557189, "grad_norm": 0.10470760613679886, "learning_rate": 4.185757404361079e-05, "loss": 0.4641, "num_input_tokens_seen": 36826208, "step": 30350 }, { "epoch": 3.3806659984408065, "grad_norm": 0.1474400907754898, "learning_rate": 4.185398522201434e-05, "loss": 0.4629, "num_input_tokens_seen": 36832224, "step": 30355 }, { "epoch": 3.3812228533244237, "grad_norm": 0.12884411215782166, "learning_rate": 4.1850395763620295e-05, "loss": 0.4576, "num_input_tokens_seen": 36838272, "step": 30360 }, { "epoch": 3.381779708208041, "grad_norm": 0.11797133088111877, "learning_rate": 4.184680566856431e-05, "loss": 0.4802, "num_input_tokens_seen": 36843840, "step": 30365 }, { "epoch": 3.382336563091658, "grad_norm": 0.12765134871006012, "learning_rate": 4.184321493698202e-05, "loss": 0.4563, "num_input_tokens_seen": 36850048, "step": 30370 }, { "epoch": 3.382893417975276, "grad_norm": 0.11980622261762619, "learning_rate": 4.183962356900909e-05, "loss": 0.4655, "num_input_tokens_seen": 36856480, "step": 30375 }, { "epoch": 3.383450272858893, "grad_norm": 0.09917989373207092, "learning_rate": 4.183603156478122e-05, "loss": 0.4609, "num_input_tokens_seen": 36862656, "step": 30380 }, { "epoch": 3.3840071277425103, "grad_norm": 0.12248512357473373, "learning_rate": 4.183243892443411e-05, "loss": 0.4741, "num_input_tokens_seen": 36867904, "step": 30385 }, { "epoch": 3.3845639826261276, "grad_norm": 0.13890594244003296, "learning_rate": 4.182884564810353e-05, "loss": 0.4616, "num_input_tokens_seen": 36873888, "step": 30390 }, { "epoch": 3.385120837509745, "grad_norm": 0.14734666049480438, "learning_rate": 4.182525173592523e-05, "loss": 0.4682, "num_input_tokens_seen": 36879616, "step": 30395 }, { "epoch": 3.3856776923933625, "grad_norm": 0.12199748307466507, "learning_rate": 4.1821657188034994e-05, "loss": 0.4619, "num_input_tokens_seen": 36886048, "step": 30400 }, { "epoch": 3.3862345472769797, "grad_norm": 0.10054879635572433, "learning_rate": 4.1818062004568646e-05, "loss": 0.4494, "num_input_tokens_seen": 36892256, "step": 30405 }, { "epoch": 3.386791402160597, "grad_norm": 0.1089661717414856, "learning_rate": 4.181446618566202e-05, "loss": 0.4644, "num_input_tokens_seen": 36898624, "step": 30410 }, { "epoch": 3.387348257044214, "grad_norm": 0.18224488198757172, "learning_rate": 4.181086973145098e-05, "loss": 0.4709, "num_input_tokens_seen": 36904512, "step": 30415 }, { "epoch": 3.3879051119278314, "grad_norm": 0.13690204918384552, "learning_rate": 4.1807272642071396e-05, "loss": 0.448, "num_input_tokens_seen": 36910144, "step": 30420 }, { "epoch": 3.388461966811449, "grad_norm": 0.11424698680639267, "learning_rate": 4.1803674917659195e-05, "loss": 0.4553, "num_input_tokens_seen": 36915904, "step": 30425 }, { "epoch": 3.3890188216950663, "grad_norm": 0.15553052723407745, "learning_rate": 4.180007655835031e-05, "loss": 0.4686, "num_input_tokens_seen": 36921984, "step": 30430 }, { "epoch": 3.3895756765786835, "grad_norm": 0.12673380970954895, "learning_rate": 4.179647756428069e-05, "loss": 0.4554, "num_input_tokens_seen": 36928160, "step": 30435 }, { "epoch": 3.3901325314623008, "grad_norm": 0.09371906518936157, "learning_rate": 4.1792877935586327e-05, "loss": 0.4643, "num_input_tokens_seen": 36934304, "step": 30440 }, { "epoch": 3.3906893863459184, "grad_norm": 0.15018798410892487, "learning_rate": 4.1789277672403214e-05, "loss": 0.4679, "num_input_tokens_seen": 36940320, "step": 30445 }, { "epoch": 3.3912462412295357, "grad_norm": 0.14771105349063873, "learning_rate": 4.178567677486739e-05, "loss": 0.4758, "num_input_tokens_seen": 36946592, "step": 30450 }, { "epoch": 3.391803096113153, "grad_norm": 0.11278121173381805, "learning_rate": 4.17820752431149e-05, "loss": 0.4667, "num_input_tokens_seen": 36952768, "step": 30455 }, { "epoch": 3.39235995099677, "grad_norm": 0.1014406755566597, "learning_rate": 4.1778473077281835e-05, "loss": 0.4629, "num_input_tokens_seen": 36959040, "step": 30460 }, { "epoch": 3.392916805880388, "grad_norm": 0.13765306770801544, "learning_rate": 4.177487027750429e-05, "loss": 0.4632, "num_input_tokens_seen": 36965312, "step": 30465 }, { "epoch": 3.393473660764005, "grad_norm": 0.10323864221572876, "learning_rate": 4.177126684391838e-05, "loss": 0.4626, "num_input_tokens_seen": 36971424, "step": 30470 }, { "epoch": 3.3940305156476223, "grad_norm": 0.1005323976278305, "learning_rate": 4.1767662776660263e-05, "loss": 0.4544, "num_input_tokens_seen": 36977728, "step": 30475 }, { "epoch": 3.3945873705312395, "grad_norm": 0.1366235911846161, "learning_rate": 4.1764058075866106e-05, "loss": 0.4574, "num_input_tokens_seen": 36983616, "step": 30480 }, { "epoch": 3.3951442254148567, "grad_norm": 0.12324772030115128, "learning_rate": 4.176045274167212e-05, "loss": 0.4593, "num_input_tokens_seen": 36989504, "step": 30485 }, { "epoch": 3.3957010802984744, "grad_norm": 0.1292802393436432, "learning_rate": 4.175684677421452e-05, "loss": 0.4707, "num_input_tokens_seen": 36995872, "step": 30490 }, { "epoch": 3.3962579351820916, "grad_norm": 0.11594208329916, "learning_rate": 4.1753240173629545e-05, "loss": 0.4677, "num_input_tokens_seen": 37002048, "step": 30495 }, { "epoch": 3.396814790065709, "grad_norm": 0.11947771161794662, "learning_rate": 4.174963294005346e-05, "loss": 0.4596, "num_input_tokens_seen": 37008320, "step": 30500 }, { "epoch": 3.397371644949326, "grad_norm": 0.13986650109291077, "learning_rate": 4.174602507362258e-05, "loss": 0.4486, "num_input_tokens_seen": 37014400, "step": 30505 }, { "epoch": 3.3979284998329433, "grad_norm": 0.1589023321866989, "learning_rate": 4.17424165744732e-05, "loss": 0.4696, "num_input_tokens_seen": 37020864, "step": 30510 }, { "epoch": 3.398485354716561, "grad_norm": 0.12020252645015717, "learning_rate": 4.173880744274167e-05, "loss": 0.4659, "num_input_tokens_seen": 37027232, "step": 30515 }, { "epoch": 3.399042209600178, "grad_norm": 0.1065344288945198, "learning_rate": 4.1735197678564354e-05, "loss": 0.4567, "num_input_tokens_seen": 37033120, "step": 30520 }, { "epoch": 3.3995990644837955, "grad_norm": 0.16574998199939728, "learning_rate": 4.1731587282077635e-05, "loss": 0.4583, "num_input_tokens_seen": 37039104, "step": 30525 }, { "epoch": 3.4001559193674127, "grad_norm": 0.1365993320941925, "learning_rate": 4.1727976253417925e-05, "loss": 0.4625, "num_input_tokens_seen": 37045504, "step": 30530 }, { "epoch": 3.4007127742510304, "grad_norm": 0.15113221108913422, "learning_rate": 4.172436459272168e-05, "loss": 0.488, "num_input_tokens_seen": 37051648, "step": 30535 }, { "epoch": 3.4012696291346476, "grad_norm": 0.12480680644512177, "learning_rate": 4.172075230012533e-05, "loss": 0.4633, "num_input_tokens_seen": 37057568, "step": 30540 }, { "epoch": 3.401826484018265, "grad_norm": 0.09467379748821259, "learning_rate": 4.171713937576538e-05, "loss": 0.4614, "num_input_tokens_seen": 37063552, "step": 30545 }, { "epoch": 3.402383338901882, "grad_norm": 0.10941637307405472, "learning_rate": 4.171352581977833e-05, "loss": 0.4657, "num_input_tokens_seen": 37069760, "step": 30550 }, { "epoch": 3.4029401937854997, "grad_norm": 0.16099673509597778, "learning_rate": 4.170991163230072e-05, "loss": 0.4792, "num_input_tokens_seen": 37076096, "step": 30555 }, { "epoch": 3.403497048669117, "grad_norm": 0.08491560071706772, "learning_rate": 4.170629681346909e-05, "loss": 0.444, "num_input_tokens_seen": 37082400, "step": 30560 }, { "epoch": 3.404053903552734, "grad_norm": 0.13177944719791412, "learning_rate": 4.170268136342004e-05, "loss": 0.4678, "num_input_tokens_seen": 37087840, "step": 30565 }, { "epoch": 3.4046107584363514, "grad_norm": 0.10469050705432892, "learning_rate": 4.169906528229015e-05, "loss": 0.4736, "num_input_tokens_seen": 37093760, "step": 30570 }, { "epoch": 3.4051676133199686, "grad_norm": 0.10275815427303314, "learning_rate": 4.169544857021606e-05, "loss": 0.4684, "num_input_tokens_seen": 37100000, "step": 30575 }, { "epoch": 3.4057244682035863, "grad_norm": 0.1148403137922287, "learning_rate": 4.169183122733442e-05, "loss": 0.4666, "num_input_tokens_seen": 37106272, "step": 30580 }, { "epoch": 3.4062813230872035, "grad_norm": 0.087297722697258, "learning_rate": 4.1688213253781914e-05, "loss": 0.4552, "num_input_tokens_seen": 37112544, "step": 30585 }, { "epoch": 3.406838177970821, "grad_norm": 0.10542067885398865, "learning_rate": 4.168459464969522e-05, "loss": 0.4605, "num_input_tokens_seen": 37118496, "step": 30590 }, { "epoch": 3.407395032854438, "grad_norm": 0.1381184458732605, "learning_rate": 4.168097541521108e-05, "loss": 0.4591, "num_input_tokens_seen": 37124704, "step": 30595 }, { "epoch": 3.4079518877380552, "grad_norm": 0.14101536571979523, "learning_rate": 4.1677355550466226e-05, "loss": 0.4567, "num_input_tokens_seen": 37130624, "step": 30600 }, { "epoch": 3.408508742621673, "grad_norm": 0.12590500712394714, "learning_rate": 4.167373505559744e-05, "loss": 0.4661, "num_input_tokens_seen": 37136768, "step": 30605 }, { "epoch": 3.40906559750529, "grad_norm": 0.11184652149677277, "learning_rate": 4.167011393074151e-05, "loss": 0.4718, "num_input_tokens_seen": 37142976, "step": 30610 }, { "epoch": 3.4096224523889074, "grad_norm": 0.09890957176685333, "learning_rate": 4.166649217603524e-05, "loss": 0.4784, "num_input_tokens_seen": 37149056, "step": 30615 }, { "epoch": 3.4101793072725246, "grad_norm": 0.09408967941999435, "learning_rate": 4.16628697916155e-05, "loss": 0.4712, "num_input_tokens_seen": 37155232, "step": 30620 }, { "epoch": 3.4107361621561423, "grad_norm": 0.07644571363925934, "learning_rate": 4.1659246777619134e-05, "loss": 0.4461, "num_input_tokens_seen": 37161568, "step": 30625 }, { "epoch": 3.4112930170397595, "grad_norm": 0.103919617831707, "learning_rate": 4.165562313418304e-05, "loss": 0.4651, "num_input_tokens_seen": 37167936, "step": 30630 }, { "epoch": 3.4118498719233767, "grad_norm": 0.11894011497497559, "learning_rate": 4.165199886144413e-05, "loss": 0.4397, "num_input_tokens_seen": 37173984, "step": 30635 }, { "epoch": 3.412406726806994, "grad_norm": 0.10984660685062408, "learning_rate": 4.164837395953933e-05, "loss": 0.4572, "num_input_tokens_seen": 37179968, "step": 30640 }, { "epoch": 3.4129635816906116, "grad_norm": 0.112653449177742, "learning_rate": 4.164474842860562e-05, "loss": 0.4665, "num_input_tokens_seen": 37185728, "step": 30645 }, { "epoch": 3.413520436574229, "grad_norm": 0.12118618935346603, "learning_rate": 4.164112226877998e-05, "loss": 0.4659, "num_input_tokens_seen": 37191744, "step": 30650 }, { "epoch": 3.414077291457846, "grad_norm": 0.10745278745889664, "learning_rate": 4.16374954801994e-05, "loss": 0.4699, "num_input_tokens_seen": 37197184, "step": 30655 }, { "epoch": 3.4146341463414633, "grad_norm": 0.12166555225849152, "learning_rate": 4.1633868063000915e-05, "loss": 0.4826, "num_input_tokens_seen": 37203264, "step": 30660 }, { "epoch": 3.4151910012250806, "grad_norm": 0.0936715230345726, "learning_rate": 4.163024001732161e-05, "loss": 0.4588, "num_input_tokens_seen": 37209280, "step": 30665 }, { "epoch": 3.4157478561086982, "grad_norm": 0.11003860086202621, "learning_rate": 4.1626611343298526e-05, "loss": 0.4575, "num_input_tokens_seen": 37215296, "step": 30670 }, { "epoch": 3.4163047109923155, "grad_norm": 0.10415113717317581, "learning_rate": 4.1622982041068794e-05, "loss": 0.4619, "num_input_tokens_seen": 37220864, "step": 30675 }, { "epoch": 3.4168615658759327, "grad_norm": 0.10602463036775589, "learning_rate": 4.161935211076952e-05, "loss": 0.4669, "num_input_tokens_seen": 37226752, "step": 30680 }, { "epoch": 3.41741842075955, "grad_norm": 0.10366726666688919, "learning_rate": 4.1615721552537867e-05, "loss": 0.475, "num_input_tokens_seen": 37232640, "step": 30685 }, { "epoch": 3.417975275643167, "grad_norm": 0.13006485998630524, "learning_rate": 4.1612090366511e-05, "loss": 0.4733, "num_input_tokens_seen": 37238240, "step": 30690 }, { "epoch": 3.418532130526785, "grad_norm": 0.11947865784168243, "learning_rate": 4.1608458552826134e-05, "loss": 0.4744, "num_input_tokens_seen": 37244864, "step": 30695 }, { "epoch": 3.419088985410402, "grad_norm": 0.11013278365135193, "learning_rate": 4.160482611162047e-05, "loss": 0.4642, "num_input_tokens_seen": 37251072, "step": 30700 }, { "epoch": 3.4196458402940193, "grad_norm": 0.12285540252923965, "learning_rate": 4.160119304303127e-05, "loss": 0.4666, "num_input_tokens_seen": 37256992, "step": 30705 }, { "epoch": 3.4202026951776365, "grad_norm": 0.10935399681329727, "learning_rate": 4.15975593471958e-05, "loss": 0.4651, "num_input_tokens_seen": 37263232, "step": 30710 }, { "epoch": 3.420759550061254, "grad_norm": 0.11378439515829086, "learning_rate": 4.159392502425134e-05, "loss": 0.4701, "num_input_tokens_seen": 37269248, "step": 30715 }, { "epoch": 3.4213164049448714, "grad_norm": 0.10842728614807129, "learning_rate": 4.159029007433521e-05, "loss": 0.4734, "num_input_tokens_seen": 37275296, "step": 30720 }, { "epoch": 3.4218732598284887, "grad_norm": 0.10668566823005676, "learning_rate": 4.158665449758477e-05, "loss": 0.4597, "num_input_tokens_seen": 37280960, "step": 30725 }, { "epoch": 3.422430114712106, "grad_norm": 0.09667900204658508, "learning_rate": 4.1583018294137355e-05, "loss": 0.4551, "num_input_tokens_seen": 37286944, "step": 30730 }, { "epoch": 3.4229869695957236, "grad_norm": 0.11251901090145111, "learning_rate": 4.1579381464130374e-05, "loss": 0.448, "num_input_tokens_seen": 37293504, "step": 30735 }, { "epoch": 3.423543824479341, "grad_norm": 0.13359850645065308, "learning_rate": 4.1575744007701226e-05, "loss": 0.4679, "num_input_tokens_seen": 37299840, "step": 30740 }, { "epoch": 3.424100679362958, "grad_norm": 0.10118500888347626, "learning_rate": 4.1572105924987356e-05, "loss": 0.4637, "num_input_tokens_seen": 37305888, "step": 30745 }, { "epoch": 3.4246575342465753, "grad_norm": 0.10449964553117752, "learning_rate": 4.1568467216126206e-05, "loss": 0.4572, "num_input_tokens_seen": 37311936, "step": 30750 }, { "epoch": 3.4252143891301925, "grad_norm": 0.09674082696437836, "learning_rate": 4.156482788125527e-05, "loss": 0.4649, "num_input_tokens_seen": 37317920, "step": 30755 }, { "epoch": 3.42577124401381, "grad_norm": 0.13637298345565796, "learning_rate": 4.156118792051206e-05, "loss": 0.4492, "num_input_tokens_seen": 37323744, "step": 30760 }, { "epoch": 3.4263280988974274, "grad_norm": 0.11223940551280975, "learning_rate": 4.1557547334034095e-05, "loss": 0.4777, "num_input_tokens_seen": 37329952, "step": 30765 }, { "epoch": 3.4268849537810446, "grad_norm": 0.12686596810817719, "learning_rate": 4.155390612195893e-05, "loss": 0.4583, "num_input_tokens_seen": 37335520, "step": 30770 }, { "epoch": 3.427441808664662, "grad_norm": 0.09230650961399078, "learning_rate": 4.155026428442414e-05, "loss": 0.4515, "num_input_tokens_seen": 37341632, "step": 30775 }, { "epoch": 3.427998663548279, "grad_norm": 0.10372792184352875, "learning_rate": 4.154662182156732e-05, "loss": 0.4593, "num_input_tokens_seen": 37347808, "step": 30780 }, { "epoch": 3.4285555184318968, "grad_norm": 0.13119231164455414, "learning_rate": 4.154297873352612e-05, "loss": 0.4501, "num_input_tokens_seen": 37353824, "step": 30785 }, { "epoch": 3.429112373315514, "grad_norm": 0.12409582734107971, "learning_rate": 4.153933502043815e-05, "loss": 0.4518, "num_input_tokens_seen": 37359904, "step": 30790 }, { "epoch": 3.4296692281991312, "grad_norm": 0.11768588423728943, "learning_rate": 4.1535690682441105e-05, "loss": 0.4718, "num_input_tokens_seen": 37366080, "step": 30795 }, { "epoch": 3.4302260830827485, "grad_norm": 0.10418319702148438, "learning_rate": 4.1532045719672685e-05, "loss": 0.4633, "num_input_tokens_seen": 37372160, "step": 30800 }, { "epoch": 3.430782937966366, "grad_norm": 0.10059821605682373, "learning_rate": 4.152840013227059e-05, "loss": 0.4689, "num_input_tokens_seen": 37378688, "step": 30805 }, { "epoch": 3.4313397928499834, "grad_norm": 0.1366179883480072, "learning_rate": 4.1524753920372574e-05, "loss": 0.4754, "num_input_tokens_seen": 37384800, "step": 30810 }, { "epoch": 3.4318966477336006, "grad_norm": 0.12574100494384766, "learning_rate": 4.152110708411639e-05, "loss": 0.4698, "num_input_tokens_seen": 37390720, "step": 30815 }, { "epoch": 3.432453502617218, "grad_norm": 0.0894334465265274, "learning_rate": 4.1517459623639844e-05, "loss": 0.4652, "num_input_tokens_seen": 37396576, "step": 30820 }, { "epoch": 3.4330103575008355, "grad_norm": 0.11041752994060516, "learning_rate": 4.1513811539080735e-05, "loss": 0.4665, "num_input_tokens_seen": 37402752, "step": 30825 }, { "epoch": 3.4335672123844527, "grad_norm": 0.10801304131746292, "learning_rate": 4.151016283057692e-05, "loss": 0.4531, "num_input_tokens_seen": 37408480, "step": 30830 }, { "epoch": 3.43412406726807, "grad_norm": 0.10686042159795761, "learning_rate": 4.1506513498266225e-05, "loss": 0.4789, "num_input_tokens_seen": 37414752, "step": 30835 }, { "epoch": 3.434680922151687, "grad_norm": 0.10935226827859879, "learning_rate": 4.1502863542286564e-05, "loss": 0.474, "num_input_tokens_seen": 37420832, "step": 30840 }, { "epoch": 3.4352377770353044, "grad_norm": 0.1235881894826889, "learning_rate": 4.149921296277582e-05, "loss": 0.4758, "num_input_tokens_seen": 37427104, "step": 30845 }, { "epoch": 3.435794631918922, "grad_norm": 0.13540063798427582, "learning_rate": 4.1495561759871945e-05, "loss": 0.4587, "num_input_tokens_seen": 37433184, "step": 30850 }, { "epoch": 3.4363514868025393, "grad_norm": 0.1618013083934784, "learning_rate": 4.1491909933712884e-05, "loss": 0.4552, "num_input_tokens_seen": 37439232, "step": 30855 }, { "epoch": 3.4369083416861566, "grad_norm": 0.09591284394264221, "learning_rate": 4.148825748443662e-05, "loss": 0.4725, "num_input_tokens_seen": 37445280, "step": 30860 }, { "epoch": 3.437465196569774, "grad_norm": 0.14296342432498932, "learning_rate": 4.148460441218114e-05, "loss": 0.4693, "num_input_tokens_seen": 37451424, "step": 30865 }, { "epoch": 3.438022051453391, "grad_norm": 0.11083899438381195, "learning_rate": 4.148095071708448e-05, "loss": 0.4637, "num_input_tokens_seen": 37456736, "step": 30870 }, { "epoch": 3.4385789063370087, "grad_norm": 0.09884939342737198, "learning_rate": 4.147729639928469e-05, "loss": 0.4613, "num_input_tokens_seen": 37463008, "step": 30875 }, { "epoch": 3.439135761220626, "grad_norm": 0.13399846851825714, "learning_rate": 4.147364145891983e-05, "loss": 0.4596, "num_input_tokens_seen": 37469152, "step": 30880 }, { "epoch": 3.439692616104243, "grad_norm": 0.09375324845314026, "learning_rate": 4.1469985896128014e-05, "loss": 0.465, "num_input_tokens_seen": 37474912, "step": 30885 }, { "epoch": 3.4402494709878604, "grad_norm": 0.10862463712692261, "learning_rate": 4.146632971104734e-05, "loss": 0.469, "num_input_tokens_seen": 37480704, "step": 30890 }, { "epoch": 3.440806325871478, "grad_norm": 0.09786955267190933, "learning_rate": 4.146267290381597e-05, "loss": 0.4591, "num_input_tokens_seen": 37486976, "step": 30895 }, { "epoch": 3.4413631807550953, "grad_norm": 0.08598335087299347, "learning_rate": 4.145901547457205e-05, "loss": 0.4679, "num_input_tokens_seen": 37493248, "step": 30900 }, { "epoch": 3.4419200356387125, "grad_norm": 0.09415411204099655, "learning_rate": 4.145535742345379e-05, "loss": 0.4673, "num_input_tokens_seen": 37499328, "step": 30905 }, { "epoch": 3.4424768905223297, "grad_norm": 0.09973834455013275, "learning_rate": 4.145169875059939e-05, "loss": 0.4648, "num_input_tokens_seen": 37505568, "step": 30910 }, { "epoch": 3.4430337454059474, "grad_norm": 0.17638690769672394, "learning_rate": 4.144803945614708e-05, "loss": 0.4719, "num_input_tokens_seen": 37511872, "step": 30915 }, { "epoch": 3.4435906002895647, "grad_norm": 0.1299053430557251, "learning_rate": 4.1444379540235135e-05, "loss": 0.4628, "num_input_tokens_seen": 37518080, "step": 30920 }, { "epoch": 3.444147455173182, "grad_norm": 0.10683062672615051, "learning_rate": 4.144071900300184e-05, "loss": 0.4609, "num_input_tokens_seen": 37524192, "step": 30925 }, { "epoch": 3.444704310056799, "grad_norm": 0.11415533721446991, "learning_rate": 4.143705784458548e-05, "loss": 0.4728, "num_input_tokens_seen": 37530432, "step": 30930 }, { "epoch": 3.4452611649404163, "grad_norm": 0.08954361081123352, "learning_rate": 4.143339606512441e-05, "loss": 0.4578, "num_input_tokens_seen": 37536096, "step": 30935 }, { "epoch": 3.445818019824034, "grad_norm": 0.09814178943634033, "learning_rate": 4.142973366475697e-05, "loss": 0.4768, "num_input_tokens_seen": 37542400, "step": 30940 }, { "epoch": 3.4463748747076512, "grad_norm": 0.15051138401031494, "learning_rate": 4.142607064362154e-05, "loss": 0.4737, "num_input_tokens_seen": 37548704, "step": 30945 }, { "epoch": 3.4469317295912685, "grad_norm": 0.10903724282979965, "learning_rate": 4.1422407001856524e-05, "loss": 0.4439, "num_input_tokens_seen": 37554688, "step": 30950 }, { "epoch": 3.4474885844748857, "grad_norm": 0.10580987483263016, "learning_rate": 4.141874273960034e-05, "loss": 0.4567, "num_input_tokens_seen": 37560736, "step": 30955 }, { "epoch": 3.448045439358503, "grad_norm": 0.09113554656505585, "learning_rate": 4.141507785699144e-05, "loss": 0.4632, "num_input_tokens_seen": 37566400, "step": 30960 }, { "epoch": 3.4486022942421206, "grad_norm": 0.14053058624267578, "learning_rate": 4.1411412354168295e-05, "loss": 0.461, "num_input_tokens_seen": 37572128, "step": 30965 }, { "epoch": 3.449159149125738, "grad_norm": 0.09752240031957626, "learning_rate": 4.1407746231269396e-05, "loss": 0.4749, "num_input_tokens_seen": 37578304, "step": 30970 }, { "epoch": 3.449716004009355, "grad_norm": 0.14023005962371826, "learning_rate": 4.140407948843327e-05, "loss": 0.4739, "num_input_tokens_seen": 37584096, "step": 30975 }, { "epoch": 3.4502728588929723, "grad_norm": 0.12805189192295074, "learning_rate": 4.140041212579844e-05, "loss": 0.4633, "num_input_tokens_seen": 37589792, "step": 30980 }, { "epoch": 3.45082971377659, "grad_norm": 0.09269126504659653, "learning_rate": 4.1396744143503485e-05, "loss": 0.4661, "num_input_tokens_seen": 37595808, "step": 30985 }, { "epoch": 3.451386568660207, "grad_norm": 0.0869799479842186, "learning_rate": 4.1393075541686996e-05, "loss": 0.4589, "num_input_tokens_seen": 37602048, "step": 30990 }, { "epoch": 3.4519434235438244, "grad_norm": 0.11563707888126373, "learning_rate": 4.138940632048758e-05, "loss": 0.4507, "num_input_tokens_seen": 37608256, "step": 30995 }, { "epoch": 3.4525002784274417, "grad_norm": 0.11128189414739609, "learning_rate": 4.1385736480043866e-05, "loss": 0.472, "num_input_tokens_seen": 37614336, "step": 31000 }, { "epoch": 3.4530571333110593, "grad_norm": 0.10966916382312775, "learning_rate": 4.1382066020494516e-05, "loss": 0.4672, "num_input_tokens_seen": 37620576, "step": 31005 }, { "epoch": 3.4536139881946766, "grad_norm": 0.0933319702744484, "learning_rate": 4.137839494197821e-05, "loss": 0.469, "num_input_tokens_seen": 37626592, "step": 31010 }, { "epoch": 3.454170843078294, "grad_norm": 0.08318356424570084, "learning_rate": 4.1374723244633664e-05, "loss": 0.4569, "num_input_tokens_seen": 37632896, "step": 31015 }, { "epoch": 3.454727697961911, "grad_norm": 0.12161517143249512, "learning_rate": 4.1371050928599595e-05, "loss": 0.457, "num_input_tokens_seen": 37639072, "step": 31020 }, { "epoch": 3.4552845528455283, "grad_norm": 0.10682238638401031, "learning_rate": 4.136737799401476e-05, "loss": 0.452, "num_input_tokens_seen": 37644896, "step": 31025 }, { "epoch": 3.455841407729146, "grad_norm": 0.10444341599941254, "learning_rate": 4.136370444101793e-05, "loss": 0.4772, "num_input_tokens_seen": 37651200, "step": 31030 }, { "epoch": 3.456398262612763, "grad_norm": 0.13346439599990845, "learning_rate": 4.136003026974791e-05, "loss": 0.4648, "num_input_tokens_seen": 37657344, "step": 31035 }, { "epoch": 3.4569551174963804, "grad_norm": 0.1346011757850647, "learning_rate": 4.1356355480343514e-05, "loss": 0.452, "num_input_tokens_seen": 37663456, "step": 31040 }, { "epoch": 3.4575119723799976, "grad_norm": 0.13872098922729492, "learning_rate": 4.13526800729436e-05, "loss": 0.4614, "num_input_tokens_seen": 37669120, "step": 31045 }, { "epoch": 3.458068827263615, "grad_norm": 0.08364321291446686, "learning_rate": 4.134900404768701e-05, "loss": 0.4616, "num_input_tokens_seen": 37675136, "step": 31050 }, { "epoch": 3.4586256821472325, "grad_norm": 0.09911900013685226, "learning_rate": 4.1345327404712673e-05, "loss": 0.4672, "num_input_tokens_seen": 37681632, "step": 31055 }, { "epoch": 3.4591825370308498, "grad_norm": 0.09279921650886536, "learning_rate": 4.134165014415947e-05, "loss": 0.4686, "num_input_tokens_seen": 37687872, "step": 31060 }, { "epoch": 3.459739391914467, "grad_norm": 0.08371903747320175, "learning_rate": 4.133797226616637e-05, "loss": 0.459, "num_input_tokens_seen": 37694016, "step": 31065 }, { "epoch": 3.4602962467980842, "grad_norm": 0.10819070041179657, "learning_rate": 4.1334293770872314e-05, "loss": 0.4493, "num_input_tokens_seen": 37700096, "step": 31070 }, { "epoch": 3.460853101681702, "grad_norm": 0.11492812633514404, "learning_rate": 4.1330614658416286e-05, "loss": 0.455, "num_input_tokens_seen": 37706336, "step": 31075 }, { "epoch": 3.461409956565319, "grad_norm": 0.12232936918735504, "learning_rate": 4.132693492893732e-05, "loss": 0.4662, "num_input_tokens_seen": 37712320, "step": 31080 }, { "epoch": 3.4619668114489364, "grad_norm": 0.1045723557472229, "learning_rate": 4.132325458257441e-05, "loss": 0.4595, "num_input_tokens_seen": 37718336, "step": 31085 }, { "epoch": 3.4625236663325536, "grad_norm": 0.1259378343820572, "learning_rate": 4.1319573619466646e-05, "loss": 0.4623, "num_input_tokens_seen": 37724224, "step": 31090 }, { "epoch": 3.4630805212161713, "grad_norm": 0.11084438860416412, "learning_rate": 4.1315892039753086e-05, "loss": 0.4783, "num_input_tokens_seen": 37730368, "step": 31095 }, { "epoch": 3.4636373760997885, "grad_norm": 0.08600674569606781, "learning_rate": 4.1312209843572837e-05, "loss": 0.457, "num_input_tokens_seen": 37736416, "step": 31100 }, { "epoch": 3.4641942309834057, "grad_norm": 0.11874482035636902, "learning_rate": 4.130852703106503e-05, "loss": 0.4561, "num_input_tokens_seen": 37742432, "step": 31105 }, { "epoch": 3.464751085867023, "grad_norm": 0.10171771794557571, "learning_rate": 4.1304843602368804e-05, "loss": 0.4408, "num_input_tokens_seen": 37748608, "step": 31110 }, { "epoch": 3.46530794075064, "grad_norm": 0.0959487035870552, "learning_rate": 4.1301159557623336e-05, "loss": 0.4608, "num_input_tokens_seen": 37754880, "step": 31115 }, { "epoch": 3.465864795634258, "grad_norm": 0.13270318508148193, "learning_rate": 4.1297474896967814e-05, "loss": 0.4639, "num_input_tokens_seen": 37760992, "step": 31120 }, { "epoch": 3.466421650517875, "grad_norm": 0.10422175377607346, "learning_rate": 4.1293789620541464e-05, "loss": 0.4613, "num_input_tokens_seen": 37767136, "step": 31125 }, { "epoch": 3.4669785054014923, "grad_norm": 0.11873723566532135, "learning_rate": 4.1290103728483545e-05, "loss": 0.4667, "num_input_tokens_seen": 37773184, "step": 31130 }, { "epoch": 3.4675353602851096, "grad_norm": 0.13674123585224152, "learning_rate": 4.128641722093328e-05, "loss": 0.4684, "num_input_tokens_seen": 37779072, "step": 31135 }, { "epoch": 3.468092215168727, "grad_norm": 0.1410873532295227, "learning_rate": 4.1282730098029995e-05, "loss": 0.4663, "num_input_tokens_seen": 37785184, "step": 31140 }, { "epoch": 3.4686490700523445, "grad_norm": 0.12664319574832916, "learning_rate": 4.127904235991298e-05, "loss": 0.4687, "num_input_tokens_seen": 37791392, "step": 31145 }, { "epoch": 3.4692059249359617, "grad_norm": 0.1078873872756958, "learning_rate": 4.1275354006721574e-05, "loss": 0.4587, "num_input_tokens_seen": 37797408, "step": 31150 }, { "epoch": 3.469762779819579, "grad_norm": 0.1035580262541771, "learning_rate": 4.1271665038595135e-05, "loss": 0.4687, "num_input_tokens_seen": 37802944, "step": 31155 }, { "epoch": 3.470319634703196, "grad_norm": 0.1141660138964653, "learning_rate": 4.126797545567305e-05, "loss": 0.4544, "num_input_tokens_seen": 37808800, "step": 31160 }, { "epoch": 3.470876489586814, "grad_norm": 0.10946178436279297, "learning_rate": 4.126428525809472e-05, "loss": 0.4647, "num_input_tokens_seen": 37814496, "step": 31165 }, { "epoch": 3.471433344470431, "grad_norm": 0.10620922595262527, "learning_rate": 4.126059444599957e-05, "loss": 0.4775, "num_input_tokens_seen": 37820064, "step": 31170 }, { "epoch": 3.4719901993540483, "grad_norm": 0.10723823308944702, "learning_rate": 4.125690301952705e-05, "loss": 0.4777, "num_input_tokens_seen": 37826112, "step": 31175 }, { "epoch": 3.4725470542376655, "grad_norm": 0.14188750088214874, "learning_rate": 4.1253210978816645e-05, "loss": 0.4747, "num_input_tokens_seen": 37832352, "step": 31180 }, { "epoch": 3.473103909121283, "grad_norm": 0.12604300677776337, "learning_rate": 4.124951832400783e-05, "loss": 0.4593, "num_input_tokens_seen": 37838560, "step": 31185 }, { "epoch": 3.4736607640049004, "grad_norm": 0.11648498475551605, "learning_rate": 4.124582505524015e-05, "loss": 0.4641, "num_input_tokens_seen": 37844832, "step": 31190 }, { "epoch": 3.4742176188885177, "grad_norm": 0.11206325888633728, "learning_rate": 4.124213117265313e-05, "loss": 0.4528, "num_input_tokens_seen": 37850784, "step": 31195 }, { "epoch": 3.474774473772135, "grad_norm": 0.11308246105909348, "learning_rate": 4.123843667638635e-05, "loss": 0.463, "num_input_tokens_seen": 37856768, "step": 31200 }, { "epoch": 3.475331328655752, "grad_norm": 0.15540307760238647, "learning_rate": 4.123474156657939e-05, "loss": 0.4583, "num_input_tokens_seen": 37863008, "step": 31205 }, { "epoch": 3.47588818353937, "grad_norm": 0.15010225772857666, "learning_rate": 4.123104584337186e-05, "loss": 0.4734, "num_input_tokens_seen": 37868960, "step": 31210 }, { "epoch": 3.476445038422987, "grad_norm": 0.12045305967330933, "learning_rate": 4.122734950690341e-05, "loss": 0.4632, "num_input_tokens_seen": 37875392, "step": 31215 }, { "epoch": 3.4770018933066043, "grad_norm": 0.1301729381084442, "learning_rate": 4.1223652557313695e-05, "loss": 0.4673, "num_input_tokens_seen": 37881408, "step": 31220 }, { "epoch": 3.4775587481902215, "grad_norm": 0.12992438673973083, "learning_rate": 4.121995499474239e-05, "loss": 0.4676, "num_input_tokens_seen": 37887680, "step": 31225 }, { "epoch": 3.4781156030738387, "grad_norm": 0.1348150074481964, "learning_rate": 4.121625681932921e-05, "loss": 0.472, "num_input_tokens_seen": 37894144, "step": 31230 }, { "epoch": 3.4786724579574564, "grad_norm": 0.08932328969240189, "learning_rate": 4.121255803121388e-05, "loss": 0.4653, "num_input_tokens_seen": 37900192, "step": 31235 }, { "epoch": 3.4792293128410736, "grad_norm": 0.12406816333532333, "learning_rate": 4.1208858630536154e-05, "loss": 0.455, "num_input_tokens_seen": 37906304, "step": 31240 }, { "epoch": 3.479786167724691, "grad_norm": 0.11962592601776123, "learning_rate": 4.12051586174358e-05, "loss": 0.4647, "num_input_tokens_seen": 37912576, "step": 31245 }, { "epoch": 3.4803430226083085, "grad_norm": 0.12103553861379623, "learning_rate": 4.1201457992052615e-05, "loss": 0.4721, "num_input_tokens_seen": 37918720, "step": 31250 }, { "epoch": 3.4808998774919258, "grad_norm": 0.1259276568889618, "learning_rate": 4.119775675452644e-05, "loss": 0.4492, "num_input_tokens_seen": 37924896, "step": 31255 }, { "epoch": 3.481456732375543, "grad_norm": 0.09874371439218521, "learning_rate": 4.1194054904997106e-05, "loss": 0.4795, "num_input_tokens_seen": 37930816, "step": 31260 }, { "epoch": 3.48201358725916, "grad_norm": 0.15357638895511627, "learning_rate": 4.1190352443604477e-05, "loss": 0.4608, "num_input_tokens_seen": 37936800, "step": 31265 }, { "epoch": 3.4825704421427774, "grad_norm": 0.10741551965475082, "learning_rate": 4.118664937048844e-05, "loss": 0.4715, "num_input_tokens_seen": 37943072, "step": 31270 }, { "epoch": 3.483127297026395, "grad_norm": 0.13084077835083008, "learning_rate": 4.118294568578893e-05, "loss": 0.47, "num_input_tokens_seen": 37949376, "step": 31275 }, { "epoch": 3.4836841519100123, "grad_norm": 0.11544378846883774, "learning_rate": 4.117924138964586e-05, "loss": 0.4612, "num_input_tokens_seen": 37955616, "step": 31280 }, { "epoch": 3.4842410067936296, "grad_norm": 0.13785162568092346, "learning_rate": 4.117553648219922e-05, "loss": 0.4676, "num_input_tokens_seen": 37961984, "step": 31285 }, { "epoch": 3.484797861677247, "grad_norm": 0.12105530500411987, "learning_rate": 4.1171830963588956e-05, "loss": 0.4466, "num_input_tokens_seen": 37968096, "step": 31290 }, { "epoch": 3.485354716560864, "grad_norm": 0.12126712501049042, "learning_rate": 4.11681248339551e-05, "loss": 0.4716, "num_input_tokens_seen": 37974208, "step": 31295 }, { "epoch": 3.4859115714444817, "grad_norm": 0.12556463479995728, "learning_rate": 4.116441809343767e-05, "loss": 0.4762, "num_input_tokens_seen": 37980192, "step": 31300 }, { "epoch": 3.486468426328099, "grad_norm": 0.14243054389953613, "learning_rate": 4.116071074217673e-05, "loss": 0.4469, "num_input_tokens_seen": 37986240, "step": 31305 }, { "epoch": 3.487025281211716, "grad_norm": 0.1439446657896042, "learning_rate": 4.115700278031234e-05, "loss": 0.4728, "num_input_tokens_seen": 37992096, "step": 31310 }, { "epoch": 3.4875821360953334, "grad_norm": 0.13243183493614197, "learning_rate": 4.115329420798462e-05, "loss": 0.4626, "num_input_tokens_seen": 37998304, "step": 31315 }, { "epoch": 3.4881389909789506, "grad_norm": 0.11945450305938721, "learning_rate": 4.114958502533367e-05, "loss": 0.4586, "num_input_tokens_seen": 38004512, "step": 31320 }, { "epoch": 3.4886958458625683, "grad_norm": 0.12037031352519989, "learning_rate": 4.114587523249964e-05, "loss": 0.4758, "num_input_tokens_seen": 38010528, "step": 31325 }, { "epoch": 3.4892527007461855, "grad_norm": 0.10159832239151001, "learning_rate": 4.114216482962271e-05, "loss": 0.4518, "num_input_tokens_seen": 38016832, "step": 31330 }, { "epoch": 3.4898095556298028, "grad_norm": 0.09590180218219757, "learning_rate": 4.113845381684306e-05, "loss": 0.4598, "num_input_tokens_seen": 38022592, "step": 31335 }, { "epoch": 3.4903664105134204, "grad_norm": 0.10282517969608307, "learning_rate": 4.11347421943009e-05, "loss": 0.4599, "num_input_tokens_seen": 38028736, "step": 31340 }, { "epoch": 3.4909232653970377, "grad_norm": 0.10789602249860764, "learning_rate": 4.113102996213648e-05, "loss": 0.4584, "num_input_tokens_seen": 38034880, "step": 31345 }, { "epoch": 3.491480120280655, "grad_norm": 0.10252523422241211, "learning_rate": 4.112731712049006e-05, "loss": 0.4547, "num_input_tokens_seen": 38040960, "step": 31350 }, { "epoch": 3.492036975164272, "grad_norm": 0.12896643579006195, "learning_rate": 4.112360366950191e-05, "loss": 0.4639, "num_input_tokens_seen": 38047072, "step": 31355 }, { "epoch": 3.4925938300478894, "grad_norm": 0.10812297463417053, "learning_rate": 4.111988960931234e-05, "loss": 0.4807, "num_input_tokens_seen": 38053152, "step": 31360 }, { "epoch": 3.493150684931507, "grad_norm": 0.13101151585578918, "learning_rate": 4.111617494006169e-05, "loss": 0.4684, "num_input_tokens_seen": 38059232, "step": 31365 }, { "epoch": 3.4937075398151243, "grad_norm": 0.11649975925683975, "learning_rate": 4.111245966189029e-05, "loss": 0.4662, "num_input_tokens_seen": 38065504, "step": 31370 }, { "epoch": 3.4942643946987415, "grad_norm": 0.10327766835689545, "learning_rate": 4.1108743774938544e-05, "loss": 0.4702, "num_input_tokens_seen": 38071584, "step": 31375 }, { "epoch": 3.4948212495823587, "grad_norm": 0.09231985360383987, "learning_rate": 4.110502727934683e-05, "loss": 0.4647, "num_input_tokens_seen": 38077632, "step": 31380 }, { "epoch": 3.495378104465976, "grad_norm": 0.10279986262321472, "learning_rate": 4.110131017525557e-05, "loss": 0.4607, "num_input_tokens_seen": 38083104, "step": 31385 }, { "epoch": 3.4959349593495936, "grad_norm": 0.10865271091461182, "learning_rate": 4.109759246280521e-05, "loss": 0.4717, "num_input_tokens_seen": 38089056, "step": 31390 }, { "epoch": 3.496491814233211, "grad_norm": 0.1302466243505478, "learning_rate": 4.109387414213623e-05, "loss": 0.4649, "num_input_tokens_seen": 38095072, "step": 31395 }, { "epoch": 3.497048669116828, "grad_norm": 0.08842871338129044, "learning_rate": 4.1090155213389106e-05, "loss": 0.4695, "num_input_tokens_seen": 38101184, "step": 31400 }, { "epoch": 3.4976055240004453, "grad_norm": 0.09648774564266205, "learning_rate": 4.108643567670435e-05, "loss": 0.4667, "num_input_tokens_seen": 38107200, "step": 31405 }, { "epoch": 3.498162378884063, "grad_norm": 0.09493555873632431, "learning_rate": 4.108271553222251e-05, "loss": 0.4692, "num_input_tokens_seen": 38113088, "step": 31410 }, { "epoch": 3.4987192337676802, "grad_norm": 0.13044309616088867, "learning_rate": 4.1078994780084135e-05, "loss": 0.4697, "num_input_tokens_seen": 38119328, "step": 31415 }, { "epoch": 3.4992760886512975, "grad_norm": 0.1115722507238388, "learning_rate": 4.107527342042981e-05, "loss": 0.4684, "num_input_tokens_seen": 38125408, "step": 31420 }, { "epoch": 3.4998329435349147, "grad_norm": 0.11643032729625702, "learning_rate": 4.107155145340013e-05, "loss": 0.4774, "num_input_tokens_seen": 38131520, "step": 31425 }, { "epoch": 3.5003897984185324, "grad_norm": 0.07565184682607651, "learning_rate": 4.1067828879135737e-05, "loss": 0.4541, "num_input_tokens_seen": 38137920, "step": 31430 }, { "epoch": 3.5003897984185324, "eval_loss": 0.464559406042099, "eval_runtime": 113.0538, "eval_samples_per_second": 35.302, "eval_steps_per_second": 8.828, "num_input_tokens_seen": 38137920, "step": 31430 }, { "epoch": 3.5009466533021496, "grad_norm": 0.1343914270401001, "learning_rate": 4.1064105697777284e-05, "loss": 0.4615, "num_input_tokens_seen": 38143968, "step": 31435 }, { "epoch": 3.501503508185767, "grad_norm": 0.10545539110898972, "learning_rate": 4.106038190946543e-05, "loss": 0.4602, "num_input_tokens_seen": 38149824, "step": 31440 }, { "epoch": 3.502060363069384, "grad_norm": 0.11578751355409622, "learning_rate": 4.105665751434089e-05, "loss": 0.4708, "num_input_tokens_seen": 38156096, "step": 31445 }, { "epoch": 3.5026172179530013, "grad_norm": 0.10722383856773376, "learning_rate": 4.105293251254436e-05, "loss": 0.4589, "num_input_tokens_seen": 38162400, "step": 31450 }, { "epoch": 3.503174072836619, "grad_norm": 0.12734529376029968, "learning_rate": 4.1049206904216605e-05, "loss": 0.4785, "num_input_tokens_seen": 38168672, "step": 31455 }, { "epoch": 3.503730927720236, "grad_norm": 0.15052656829357147, "learning_rate": 4.1045480689498376e-05, "loss": 0.4654, "num_input_tokens_seen": 38174784, "step": 31460 }, { "epoch": 3.5042877826038534, "grad_norm": 0.12152586877346039, "learning_rate": 4.104175386853046e-05, "loss": 0.4669, "num_input_tokens_seen": 38181056, "step": 31465 }, { "epoch": 3.5048446374874707, "grad_norm": 0.18920700252056122, "learning_rate": 4.103802644145369e-05, "loss": 0.4618, "num_input_tokens_seen": 38186944, "step": 31470 }, { "epoch": 3.505401492371088, "grad_norm": 0.12620678544044495, "learning_rate": 4.103429840840888e-05, "loss": 0.4589, "num_input_tokens_seen": 38193120, "step": 31475 }, { "epoch": 3.5059583472547056, "grad_norm": 0.12140152603387833, "learning_rate": 4.103056976953689e-05, "loss": 0.4636, "num_input_tokens_seen": 38199072, "step": 31480 }, { "epoch": 3.506515202138323, "grad_norm": 0.11646988987922668, "learning_rate": 4.10268405249786e-05, "loss": 0.4486, "num_input_tokens_seen": 38205312, "step": 31485 }, { "epoch": 3.50707205702194, "grad_norm": 0.11187088489532471, "learning_rate": 4.102311067487491e-05, "loss": 0.4529, "num_input_tokens_seen": 38211232, "step": 31490 }, { "epoch": 3.5076289119055573, "grad_norm": 0.0942215695977211, "learning_rate": 4.1019380219366755e-05, "loss": 0.4647, "num_input_tokens_seen": 38217344, "step": 31495 }, { "epoch": 3.5081857667891745, "grad_norm": 0.11555583775043488, "learning_rate": 4.101564915859508e-05, "loss": 0.4651, "num_input_tokens_seen": 38223616, "step": 31500 }, { "epoch": 3.508742621672792, "grad_norm": 0.13053424656391144, "learning_rate": 4.101191749270086e-05, "loss": 0.4645, "num_input_tokens_seen": 38229280, "step": 31505 }, { "epoch": 3.5092994765564094, "grad_norm": 0.10455714911222458, "learning_rate": 4.100818522182507e-05, "loss": 0.4594, "num_input_tokens_seen": 38235392, "step": 31510 }, { "epoch": 3.5098563314400266, "grad_norm": 0.1450938731431961, "learning_rate": 4.1004452346108754e-05, "loss": 0.4632, "num_input_tokens_seen": 38241472, "step": 31515 }, { "epoch": 3.5104131863236443, "grad_norm": 0.11662732064723969, "learning_rate": 4.100071886569293e-05, "loss": 0.4585, "num_input_tokens_seen": 38247392, "step": 31520 }, { "epoch": 3.5109700412072615, "grad_norm": 0.10923449695110321, "learning_rate": 4.099698478071868e-05, "loss": 0.4537, "num_input_tokens_seen": 38253440, "step": 31525 }, { "epoch": 3.5115268960908788, "grad_norm": 0.12443889677524567, "learning_rate": 4.0993250091327075e-05, "loss": 0.4593, "num_input_tokens_seen": 38259232, "step": 31530 }, { "epoch": 3.512083750974496, "grad_norm": 0.10994771122932434, "learning_rate": 4.098951479765923e-05, "loss": 0.4479, "num_input_tokens_seen": 38265056, "step": 31535 }, { "epoch": 3.512640605858113, "grad_norm": 0.10982275009155273, "learning_rate": 4.0985778899856276e-05, "loss": 0.466, "num_input_tokens_seen": 38271104, "step": 31540 }, { "epoch": 3.513197460741731, "grad_norm": 0.11980624496936798, "learning_rate": 4.098204239805936e-05, "loss": 0.4691, "num_input_tokens_seen": 38277248, "step": 31545 }, { "epoch": 3.513754315625348, "grad_norm": 0.11384817212820053, "learning_rate": 4.097830529240967e-05, "loss": 0.4619, "num_input_tokens_seen": 38283040, "step": 31550 }, { "epoch": 3.5143111705089654, "grad_norm": 0.12177524715662003, "learning_rate": 4.09745675830484e-05, "loss": 0.4537, "num_input_tokens_seen": 38289056, "step": 31555 }, { "epoch": 3.5148680253925826, "grad_norm": 0.08672595024108887, "learning_rate": 4.097082927011677e-05, "loss": 0.4685, "num_input_tokens_seen": 38295200, "step": 31560 }, { "epoch": 3.5154248802762, "grad_norm": 0.12086760252714157, "learning_rate": 4.096709035375604e-05, "loss": 0.4564, "num_input_tokens_seen": 38300672, "step": 31565 }, { "epoch": 3.5159817351598175, "grad_norm": 0.13206513226032257, "learning_rate": 4.096335083410746e-05, "loss": 0.4669, "num_input_tokens_seen": 38306432, "step": 31570 }, { "epoch": 3.5165385900434347, "grad_norm": 0.10743644833564758, "learning_rate": 4.095961071131234e-05, "loss": 0.4604, "num_input_tokens_seen": 38312544, "step": 31575 }, { "epoch": 3.517095444927052, "grad_norm": 0.1148776188492775, "learning_rate": 4.0955869985511966e-05, "loss": 0.4524, "num_input_tokens_seen": 38318592, "step": 31580 }, { "epoch": 3.517652299810669, "grad_norm": 0.11119060218334198, "learning_rate": 4.095212865684769e-05, "loss": 0.483, "num_input_tokens_seen": 38324736, "step": 31585 }, { "epoch": 3.5182091546942864, "grad_norm": 0.08621292561292648, "learning_rate": 4.094838672546089e-05, "loss": 0.4636, "num_input_tokens_seen": 38331200, "step": 31590 }, { "epoch": 3.518766009577904, "grad_norm": 0.09107755869626999, "learning_rate": 4.094464419149291e-05, "loss": 0.4739, "num_input_tokens_seen": 38337056, "step": 31595 }, { "epoch": 3.5193228644615213, "grad_norm": 0.0989958718419075, "learning_rate": 4.094090105508519e-05, "loss": 0.4598, "num_input_tokens_seen": 38343104, "step": 31600 }, { "epoch": 3.5198797193451385, "grad_norm": 0.0968940481543541, "learning_rate": 4.093715731637913e-05, "loss": 0.4683, "num_input_tokens_seen": 38349056, "step": 31605 }, { "epoch": 3.520436574228756, "grad_norm": 0.12669500708580017, "learning_rate": 4.093341297551621e-05, "loss": 0.4638, "num_input_tokens_seen": 38354688, "step": 31610 }, { "epoch": 3.5209934291123735, "grad_norm": 0.10858795046806335, "learning_rate": 4.0929668032637875e-05, "loss": 0.4607, "num_input_tokens_seen": 38360640, "step": 31615 }, { "epoch": 3.5215502839959907, "grad_norm": 0.10306941717863083, "learning_rate": 4.092592248788563e-05, "loss": 0.4567, "num_input_tokens_seen": 38366976, "step": 31620 }, { "epoch": 3.522107138879608, "grad_norm": 0.133620485663414, "learning_rate": 4.092217634140101e-05, "loss": 0.4691, "num_input_tokens_seen": 38372992, "step": 31625 }, { "epoch": 3.522663993763225, "grad_norm": 0.16836649179458618, "learning_rate": 4.091842959332553e-05, "loss": 0.4802, "num_input_tokens_seen": 38379296, "step": 31630 }, { "epoch": 3.523220848646843, "grad_norm": 0.09632210433483124, "learning_rate": 4.0914682243800775e-05, "loss": 0.4587, "num_input_tokens_seen": 38385568, "step": 31635 }, { "epoch": 3.52377770353046, "grad_norm": 0.16772697865962982, "learning_rate": 4.0910934292968315e-05, "loss": 0.4542, "num_input_tokens_seen": 38391616, "step": 31640 }, { "epoch": 3.5243345584140773, "grad_norm": 0.1101737767457962, "learning_rate": 4.090718574096978e-05, "loss": 0.4584, "num_input_tokens_seen": 38397536, "step": 31645 }, { "epoch": 3.5248914132976945, "grad_norm": 0.11537859588861465, "learning_rate": 4.0903436587946776e-05, "loss": 0.4662, "num_input_tokens_seen": 38403648, "step": 31650 }, { "epoch": 3.5254482681813117, "grad_norm": 0.09218744933605194, "learning_rate": 4.0899686834040986e-05, "loss": 0.4714, "num_input_tokens_seen": 38410080, "step": 31655 }, { "epoch": 3.5260051230649294, "grad_norm": 0.10320360213518143, "learning_rate": 4.0895936479394066e-05, "loss": 0.4644, "num_input_tokens_seen": 38416384, "step": 31660 }, { "epoch": 3.5265619779485466, "grad_norm": 0.15482313930988312, "learning_rate": 4.089218552414773e-05, "loss": 0.4632, "num_input_tokens_seen": 38422592, "step": 31665 }, { "epoch": 3.527118832832164, "grad_norm": 0.14141124486923218, "learning_rate": 4.088843396844369e-05, "loss": 0.4508, "num_input_tokens_seen": 38428544, "step": 31670 }, { "epoch": 3.527675687715781, "grad_norm": 0.12591031193733215, "learning_rate": 4.0884681812423696e-05, "loss": 0.4652, "num_input_tokens_seen": 38434656, "step": 31675 }, { "epoch": 3.5282325425993983, "grad_norm": 0.1656264066696167, "learning_rate": 4.088092905622953e-05, "loss": 0.4637, "num_input_tokens_seen": 38440640, "step": 31680 }, { "epoch": 3.528789397483016, "grad_norm": 0.08386862277984619, "learning_rate": 4.087717570000296e-05, "loss": 0.4557, "num_input_tokens_seen": 38447008, "step": 31685 }, { "epoch": 3.5293462523666332, "grad_norm": 0.14779449999332428, "learning_rate": 4.087342174388582e-05, "loss": 0.4625, "num_input_tokens_seen": 38453216, "step": 31690 }, { "epoch": 3.5299031072502505, "grad_norm": 0.10702605545520782, "learning_rate": 4.086966718801993e-05, "loss": 0.4649, "num_input_tokens_seen": 38459328, "step": 31695 }, { "epoch": 3.530459962133868, "grad_norm": 0.1001463308930397, "learning_rate": 4.0865912032547164e-05, "loss": 0.4641, "num_input_tokens_seen": 38465408, "step": 31700 }, { "epoch": 3.5310168170174854, "grad_norm": 0.13513003289699554, "learning_rate": 4.08621562776094e-05, "loss": 0.4645, "num_input_tokens_seen": 38471200, "step": 31705 }, { "epoch": 3.5315736719011026, "grad_norm": 0.10948613286018372, "learning_rate": 4.085839992334852e-05, "loss": 0.4767, "num_input_tokens_seen": 38477184, "step": 31710 }, { "epoch": 3.53213052678472, "grad_norm": 0.09833334386348724, "learning_rate": 4.085464296990649e-05, "loss": 0.4586, "num_input_tokens_seen": 38483040, "step": 31715 }, { "epoch": 3.532687381668337, "grad_norm": 0.07962869107723236, "learning_rate": 4.0850885417425234e-05, "loss": 0.4542, "num_input_tokens_seen": 38489088, "step": 31720 }, { "epoch": 3.5332442365519547, "grad_norm": 0.10279757529497147, "learning_rate": 4.0847127266046724e-05, "loss": 0.4682, "num_input_tokens_seen": 38495200, "step": 31725 }, { "epoch": 3.533801091435572, "grad_norm": 0.11052145808935165, "learning_rate": 4.0843368515912964e-05, "loss": 0.4772, "num_input_tokens_seen": 38501440, "step": 31730 }, { "epoch": 3.534357946319189, "grad_norm": 0.09045525640249252, "learning_rate": 4.083960916716597e-05, "loss": 0.4561, "num_input_tokens_seen": 38507296, "step": 31735 }, { "epoch": 3.5349148012028064, "grad_norm": 0.1056763082742691, "learning_rate": 4.0835849219947785e-05, "loss": 0.4693, "num_input_tokens_seen": 38513184, "step": 31740 }, { "epoch": 3.5354716560864237, "grad_norm": 0.098983995616436, "learning_rate": 4.083208867440046e-05, "loss": 0.4594, "num_input_tokens_seen": 38519392, "step": 31745 }, { "epoch": 3.5360285109700413, "grad_norm": 0.11168844252824783, "learning_rate": 4.08283275306661e-05, "loss": 0.4589, "num_input_tokens_seen": 38525472, "step": 31750 }, { "epoch": 3.5365853658536586, "grad_norm": 0.14323419332504272, "learning_rate": 4.082456578888679e-05, "loss": 0.4665, "num_input_tokens_seen": 38531392, "step": 31755 }, { "epoch": 3.537142220737276, "grad_norm": 0.09538467973470688, "learning_rate": 4.082080344920468e-05, "loss": 0.4579, "num_input_tokens_seen": 38537568, "step": 31760 }, { "epoch": 3.537699075620893, "grad_norm": 0.10627926886081696, "learning_rate": 4.081704051176191e-05, "loss": 0.4596, "num_input_tokens_seen": 38543744, "step": 31765 }, { "epoch": 3.5382559305045103, "grad_norm": 0.14385664463043213, "learning_rate": 4.0813276976700666e-05, "loss": 0.4609, "num_input_tokens_seen": 38550016, "step": 31770 }, { "epoch": 3.538812785388128, "grad_norm": 0.09354324638843536, "learning_rate": 4.0809512844163134e-05, "loss": 0.4688, "num_input_tokens_seen": 38556384, "step": 31775 }, { "epoch": 3.539369640271745, "grad_norm": 0.10911170393228531, "learning_rate": 4.080574811429155e-05, "loss": 0.459, "num_input_tokens_seen": 38562720, "step": 31780 }, { "epoch": 3.5399264951553624, "grad_norm": 0.08771666139364243, "learning_rate": 4.080198278722815e-05, "loss": 0.4641, "num_input_tokens_seen": 38568928, "step": 31785 }, { "epoch": 3.54048335003898, "grad_norm": 0.11313515901565552, "learning_rate": 4.07982168631152e-05, "loss": 0.4606, "num_input_tokens_seen": 38574880, "step": 31790 }, { "epoch": 3.5410402049225973, "grad_norm": 0.08287839591503143, "learning_rate": 4.079445034209498e-05, "loss": 0.4465, "num_input_tokens_seen": 38581120, "step": 31795 }, { "epoch": 3.5415970598062145, "grad_norm": 0.08156699687242508, "learning_rate": 4.0790683224309824e-05, "loss": 0.4643, "num_input_tokens_seen": 38586656, "step": 31800 }, { "epoch": 3.5421539146898318, "grad_norm": 0.09858154505491257, "learning_rate": 4.0786915509902045e-05, "loss": 0.455, "num_input_tokens_seen": 38592928, "step": 31805 }, { "epoch": 3.542710769573449, "grad_norm": 0.12574723362922668, "learning_rate": 4.0783147199014004e-05, "loss": 0.4683, "num_input_tokens_seen": 38598848, "step": 31810 }, { "epoch": 3.5432676244570667, "grad_norm": 0.0955347791314125, "learning_rate": 4.077937829178809e-05, "loss": 0.4658, "num_input_tokens_seen": 38604864, "step": 31815 }, { "epoch": 3.543824479340684, "grad_norm": 0.09893179684877396, "learning_rate": 4.077560878836669e-05, "loss": 0.4658, "num_input_tokens_seen": 38611072, "step": 31820 }, { "epoch": 3.544381334224301, "grad_norm": 0.09431880712509155, "learning_rate": 4.077183868889224e-05, "loss": 0.4536, "num_input_tokens_seen": 38617120, "step": 31825 }, { "epoch": 3.5449381891079184, "grad_norm": 0.10990861803293228, "learning_rate": 4.076806799350717e-05, "loss": 0.4764, "num_input_tokens_seen": 38622464, "step": 31830 }, { "epoch": 3.5454950439915356, "grad_norm": 0.12878131866455078, "learning_rate": 4.076429670235397e-05, "loss": 0.4666, "num_input_tokens_seen": 38628608, "step": 31835 }, { "epoch": 3.5460518988751533, "grad_norm": 0.11468588560819626, "learning_rate": 4.0760524815575115e-05, "loss": 0.4525, "num_input_tokens_seen": 38634688, "step": 31840 }, { "epoch": 3.5466087537587705, "grad_norm": 0.09348101913928986, "learning_rate": 4.0756752333313134e-05, "loss": 0.4534, "num_input_tokens_seen": 38641056, "step": 31845 }, { "epoch": 3.5471656086423877, "grad_norm": 0.11565399169921875, "learning_rate": 4.0752979255710556e-05, "loss": 0.4695, "num_input_tokens_seen": 38646880, "step": 31850 }, { "epoch": 3.5477224635260054, "grad_norm": 0.08260287344455719, "learning_rate": 4.074920558290993e-05, "loss": 0.4704, "num_input_tokens_seen": 38652640, "step": 31855 }, { "epoch": 3.548279318409622, "grad_norm": 0.07989349216222763, "learning_rate": 4.074543131505385e-05, "loss": 0.4584, "num_input_tokens_seen": 38658016, "step": 31860 }, { "epoch": 3.54883617329324, "grad_norm": 0.13155607879161835, "learning_rate": 4.0741656452284914e-05, "loss": 0.4615, "num_input_tokens_seen": 38664224, "step": 31865 }, { "epoch": 3.549393028176857, "grad_norm": 0.09635648876428604, "learning_rate": 4.073788099474575e-05, "loss": 0.4558, "num_input_tokens_seen": 38670272, "step": 31870 }, { "epoch": 3.5499498830604743, "grad_norm": 0.09248583763837814, "learning_rate": 4.073410494257901e-05, "loss": 0.4865, "num_input_tokens_seen": 38676480, "step": 31875 }, { "epoch": 3.550506737944092, "grad_norm": 0.1341143101453781, "learning_rate": 4.073032829592737e-05, "loss": 0.4529, "num_input_tokens_seen": 38682784, "step": 31880 }, { "epoch": 3.5510635928277092, "grad_norm": 0.1035875454545021, "learning_rate": 4.0726551054933505e-05, "loss": 0.4679, "num_input_tokens_seen": 38689120, "step": 31885 }, { "epoch": 3.5516204477113265, "grad_norm": 0.10993042588233948, "learning_rate": 4.072277321974014e-05, "loss": 0.4531, "num_input_tokens_seen": 38695488, "step": 31890 }, { "epoch": 3.5521773025949437, "grad_norm": 0.12716010212898254, "learning_rate": 4.0718994790490026e-05, "loss": 0.4582, "num_input_tokens_seen": 38701312, "step": 31895 }, { "epoch": 3.552734157478561, "grad_norm": 0.10231984406709671, "learning_rate": 4.071521576732592e-05, "loss": 0.4539, "num_input_tokens_seen": 38706752, "step": 31900 }, { "epoch": 3.5532910123621786, "grad_norm": 0.1583271622657776, "learning_rate": 4.0711436150390594e-05, "loss": 0.4516, "num_input_tokens_seen": 38712320, "step": 31905 }, { "epoch": 3.553847867245796, "grad_norm": 0.17168426513671875, "learning_rate": 4.070765593982686e-05, "loss": 0.4668, "num_input_tokens_seen": 38718624, "step": 31910 }, { "epoch": 3.554404722129413, "grad_norm": 0.12161999940872192, "learning_rate": 4.070387513577754e-05, "loss": 0.4647, "num_input_tokens_seen": 38724672, "step": 31915 }, { "epoch": 3.5549615770130303, "grad_norm": 0.08307990431785583, "learning_rate": 4.07000937383855e-05, "loss": 0.4598, "num_input_tokens_seen": 38730784, "step": 31920 }, { "epoch": 3.5555184318966475, "grad_norm": 0.10215036571025848, "learning_rate": 4.06963117477936e-05, "loss": 0.463, "num_input_tokens_seen": 38736768, "step": 31925 }, { "epoch": 3.556075286780265, "grad_norm": 0.11010897159576416, "learning_rate": 4.069252916414475e-05, "loss": 0.4658, "num_input_tokens_seen": 38742464, "step": 31930 }, { "epoch": 3.5566321416638824, "grad_norm": 0.14935912191867828, "learning_rate": 4.0688745987581855e-05, "loss": 0.4596, "num_input_tokens_seen": 38748160, "step": 31935 }, { "epoch": 3.5571889965474996, "grad_norm": 0.1302400678396225, "learning_rate": 4.0684962218247855e-05, "loss": 0.4724, "num_input_tokens_seen": 38754176, "step": 31940 }, { "epoch": 3.5577458514311173, "grad_norm": 0.16938291490077972, "learning_rate": 4.068117785628572e-05, "loss": 0.479, "num_input_tokens_seen": 38760512, "step": 31945 }, { "epoch": 3.558302706314734, "grad_norm": 0.10409301519393921, "learning_rate": 4.067739290183843e-05, "loss": 0.4623, "num_input_tokens_seen": 38766336, "step": 31950 }, { "epoch": 3.558859561198352, "grad_norm": 0.13412773609161377, "learning_rate": 4.067360735504901e-05, "loss": 0.4487, "num_input_tokens_seen": 38772736, "step": 31955 }, { "epoch": 3.559416416081969, "grad_norm": 0.10925814509391785, "learning_rate": 4.066982121606046e-05, "loss": 0.4721, "num_input_tokens_seen": 38779136, "step": 31960 }, { "epoch": 3.5599732709655862, "grad_norm": 0.08328963071107864, "learning_rate": 4.066603448501585e-05, "loss": 0.4684, "num_input_tokens_seen": 38785312, "step": 31965 }, { "epoch": 3.560530125849204, "grad_norm": 0.0875563994050026, "learning_rate": 4.0662247162058254e-05, "loss": 0.4561, "num_input_tokens_seen": 38791360, "step": 31970 }, { "epoch": 3.561086980732821, "grad_norm": 0.13285380601882935, "learning_rate": 4.0658459247330766e-05, "loss": 0.4504, "num_input_tokens_seen": 38797408, "step": 31975 }, { "epoch": 3.5616438356164384, "grad_norm": 0.11319142580032349, "learning_rate": 4.065467074097651e-05, "loss": 0.4657, "num_input_tokens_seen": 38803808, "step": 31980 }, { "epoch": 3.5622006905000556, "grad_norm": 0.11469917744398117, "learning_rate": 4.065088164313863e-05, "loss": 0.4481, "num_input_tokens_seen": 38810208, "step": 31985 }, { "epoch": 3.562757545383673, "grad_norm": 0.11098689585924149, "learning_rate": 4.064709195396028e-05, "loss": 0.4587, "num_input_tokens_seen": 38815808, "step": 31990 }, { "epoch": 3.5633144002672905, "grad_norm": 0.10971705615520477, "learning_rate": 4.0643301673584656e-05, "loss": 0.4614, "num_input_tokens_seen": 38821504, "step": 31995 }, { "epoch": 3.5638712551509077, "grad_norm": 0.084192655980587, "learning_rate": 4.063951080215496e-05, "loss": 0.4607, "num_input_tokens_seen": 38826528, "step": 32000 }, { "epoch": 3.564428110034525, "grad_norm": 0.10276409238576889, "learning_rate": 4.063571933981443e-05, "loss": 0.4562, "num_input_tokens_seen": 38832896, "step": 32005 }, { "epoch": 3.564984964918142, "grad_norm": 0.1258051097393036, "learning_rate": 4.0631927286706314e-05, "loss": 0.4629, "num_input_tokens_seen": 38838944, "step": 32010 }, { "epoch": 3.5655418198017594, "grad_norm": 0.1023208349943161, "learning_rate": 4.062813464297389e-05, "loss": 0.4616, "num_input_tokens_seen": 38845088, "step": 32015 }, { "epoch": 3.566098674685377, "grad_norm": 0.1256905347108841, "learning_rate": 4.062434140876046e-05, "loss": 0.4852, "num_input_tokens_seen": 38851392, "step": 32020 }, { "epoch": 3.5666555295689943, "grad_norm": 0.12126890569925308, "learning_rate": 4.0620547584209334e-05, "loss": 0.4886, "num_input_tokens_seen": 38857344, "step": 32025 }, { "epoch": 3.5672123844526116, "grad_norm": 0.10643043369054794, "learning_rate": 4.0616753169463875e-05, "loss": 0.4706, "num_input_tokens_seen": 38863200, "step": 32030 }, { "epoch": 3.5677692393362292, "grad_norm": 0.12689200043678284, "learning_rate": 4.061295816466742e-05, "loss": 0.4731, "num_input_tokens_seen": 38869312, "step": 32035 }, { "epoch": 3.568326094219846, "grad_norm": 0.14976897835731506, "learning_rate": 4.0609162569963375e-05, "loss": 0.471, "num_input_tokens_seen": 38874784, "step": 32040 }, { "epoch": 3.5688829491034637, "grad_norm": 0.08966154605150223, "learning_rate": 4.0605366385495146e-05, "loss": 0.4654, "num_input_tokens_seen": 38880800, "step": 32045 }, { "epoch": 3.569439803987081, "grad_norm": 0.14030760526657104, "learning_rate": 4.060156961140618e-05, "loss": 0.4636, "num_input_tokens_seen": 38886816, "step": 32050 }, { "epoch": 3.569996658870698, "grad_norm": 0.09831379354000092, "learning_rate": 4.059777224783989e-05, "loss": 0.4554, "num_input_tokens_seen": 38892832, "step": 32055 }, { "epoch": 3.570553513754316, "grad_norm": 0.13266466557979584, "learning_rate": 4.059397429493979e-05, "loss": 0.4551, "num_input_tokens_seen": 38898944, "step": 32060 }, { "epoch": 3.571110368637933, "grad_norm": 0.09661585837602615, "learning_rate": 4.0590175752849377e-05, "loss": 0.4546, "num_input_tokens_seen": 38904928, "step": 32065 }, { "epoch": 3.5716672235215503, "grad_norm": 0.10198913514614105, "learning_rate": 4.058637662171216e-05, "loss": 0.4486, "num_input_tokens_seen": 38910112, "step": 32070 }, { "epoch": 3.5722240784051675, "grad_norm": 0.12471473962068558, "learning_rate": 4.0582576901671676e-05, "loss": 0.4543, "num_input_tokens_seen": 38916224, "step": 32075 }, { "epoch": 3.5727809332887848, "grad_norm": 0.15212897956371307, "learning_rate": 4.057877659287151e-05, "loss": 0.4679, "num_input_tokens_seen": 38922144, "step": 32080 }, { "epoch": 3.5733377881724024, "grad_norm": 0.12317714095115662, "learning_rate": 4.057497569545524e-05, "loss": 0.4822, "num_input_tokens_seen": 38928480, "step": 32085 }, { "epoch": 3.5738946430560197, "grad_norm": 0.08907561749219894, "learning_rate": 4.0571174209566466e-05, "loss": 0.4578, "num_input_tokens_seen": 38934400, "step": 32090 }, { "epoch": 3.574451497939637, "grad_norm": 0.10102492570877075, "learning_rate": 4.0567372135348835e-05, "loss": 0.4584, "num_input_tokens_seen": 38940448, "step": 32095 }, { "epoch": 3.575008352823254, "grad_norm": 0.1042008250951767, "learning_rate": 4.0563569472946e-05, "loss": 0.4558, "num_input_tokens_seen": 38946624, "step": 32100 }, { "epoch": 3.5755652077068714, "grad_norm": 0.09069705754518509, "learning_rate": 4.055976622250163e-05, "loss": 0.4656, "num_input_tokens_seen": 38952128, "step": 32105 }, { "epoch": 3.576122062590489, "grad_norm": 0.09730765968561172, "learning_rate": 4.055596238415943e-05, "loss": 0.4549, "num_input_tokens_seen": 38958336, "step": 32110 }, { "epoch": 3.5766789174741063, "grad_norm": 0.11048101633787155, "learning_rate": 4.055215795806312e-05, "loss": 0.4666, "num_input_tokens_seen": 38964224, "step": 32115 }, { "epoch": 3.5772357723577235, "grad_norm": 0.09018416702747345, "learning_rate": 4.054835294435644e-05, "loss": 0.4712, "num_input_tokens_seen": 38970304, "step": 32120 }, { "epoch": 3.577792627241341, "grad_norm": 0.1346152126789093, "learning_rate": 4.054454734318316e-05, "loss": 0.459, "num_input_tokens_seen": 38976640, "step": 32125 }, { "epoch": 3.578349482124958, "grad_norm": 0.10892757028341293, "learning_rate": 4.054074115468707e-05, "loss": 0.4657, "num_input_tokens_seen": 38983072, "step": 32130 }, { "epoch": 3.5789063370085756, "grad_norm": 0.11052973568439484, "learning_rate": 4.053693437901197e-05, "loss": 0.4601, "num_input_tokens_seen": 38989184, "step": 32135 }, { "epoch": 3.579463191892193, "grad_norm": 0.12539242208003998, "learning_rate": 4.0533127016301705e-05, "loss": 0.4626, "num_input_tokens_seen": 38995232, "step": 32140 }, { "epoch": 3.58002004677581, "grad_norm": 0.11557850241661072, "learning_rate": 4.052931906670012e-05, "loss": 0.4733, "num_input_tokens_seen": 39001440, "step": 32145 }, { "epoch": 3.5805769016594278, "grad_norm": 0.1280750185251236, "learning_rate": 4.05255105303511e-05, "loss": 0.4659, "num_input_tokens_seen": 39007616, "step": 32150 }, { "epoch": 3.581133756543045, "grad_norm": 0.11398870497941971, "learning_rate": 4.0521701407398526e-05, "loss": 0.4584, "num_input_tokens_seen": 39013696, "step": 32155 }, { "epoch": 3.5816906114266622, "grad_norm": 0.14780715107917786, "learning_rate": 4.0517891697986335e-05, "loss": 0.4795, "num_input_tokens_seen": 39019744, "step": 32160 }, { "epoch": 3.5822474663102795, "grad_norm": 0.10910589247941971, "learning_rate": 4.0514081402258474e-05, "loss": 0.4598, "num_input_tokens_seen": 39025728, "step": 32165 }, { "epoch": 3.5828043211938967, "grad_norm": 0.11136419326066971, "learning_rate": 4.05102705203589e-05, "loss": 0.4561, "num_input_tokens_seen": 39031808, "step": 32170 }, { "epoch": 3.5833611760775144, "grad_norm": 0.09418455511331558, "learning_rate": 4.050645905243159e-05, "loss": 0.4656, "num_input_tokens_seen": 39037312, "step": 32175 }, { "epoch": 3.5839180309611316, "grad_norm": 0.09824369102716446, "learning_rate": 4.050264699862057e-05, "loss": 0.4669, "num_input_tokens_seen": 39043360, "step": 32180 }, { "epoch": 3.584474885844749, "grad_norm": 0.09941891580820084, "learning_rate": 4.049883435906987e-05, "loss": 0.4684, "num_input_tokens_seen": 39049248, "step": 32185 }, { "epoch": 3.585031740728366, "grad_norm": 0.09999863803386688, "learning_rate": 4.0495021133923534e-05, "loss": 0.4698, "num_input_tokens_seen": 39054976, "step": 32190 }, { "epoch": 3.5855885956119833, "grad_norm": 0.09363044053316116, "learning_rate": 4.0491207323325644e-05, "loss": 0.4668, "num_input_tokens_seen": 39060544, "step": 32195 }, { "epoch": 3.586145450495601, "grad_norm": 0.09203969687223434, "learning_rate": 4.04873929274203e-05, "loss": 0.4705, "num_input_tokens_seen": 39066720, "step": 32200 }, { "epoch": 3.586702305379218, "grad_norm": 0.11555693298578262, "learning_rate": 4.0483577946351616e-05, "loss": 0.4518, "num_input_tokens_seen": 39072672, "step": 32205 }, { "epoch": 3.5872591602628354, "grad_norm": 0.09510896354913712, "learning_rate": 4.047976238026373e-05, "loss": 0.4696, "num_input_tokens_seen": 39078976, "step": 32210 }, { "epoch": 3.587816015146453, "grad_norm": 0.11426053941249847, "learning_rate": 4.0475946229300834e-05, "loss": 0.4646, "num_input_tokens_seen": 39084896, "step": 32215 }, { "epoch": 3.5883728700300703, "grad_norm": 0.1444064974784851, "learning_rate": 4.047212949360708e-05, "loss": 0.4629, "num_input_tokens_seen": 39090592, "step": 32220 }, { "epoch": 3.5889297249136876, "grad_norm": 0.12210478633642197, "learning_rate": 4.046831217332669e-05, "loss": 0.4568, "num_input_tokens_seen": 39096800, "step": 32225 }, { "epoch": 3.589486579797305, "grad_norm": 0.11091490834951401, "learning_rate": 4.04644942686039e-05, "loss": 0.4645, "num_input_tokens_seen": 39102944, "step": 32230 }, { "epoch": 3.590043434680922, "grad_norm": 0.13164018094539642, "learning_rate": 4.046067577958296e-05, "loss": 0.4649, "num_input_tokens_seen": 39108928, "step": 32235 }, { "epoch": 3.5906002895645397, "grad_norm": 0.11768313497304916, "learning_rate": 4.045685670640814e-05, "loss": 0.4684, "num_input_tokens_seen": 39115072, "step": 32240 }, { "epoch": 3.591157144448157, "grad_norm": 0.14899671077728271, "learning_rate": 4.045303704922374e-05, "loss": 0.4772, "num_input_tokens_seen": 39120864, "step": 32245 }, { "epoch": 3.591713999331774, "grad_norm": 0.0988934189081192, "learning_rate": 4.044921680817409e-05, "loss": 0.483, "num_input_tokens_seen": 39126880, "step": 32250 }, { "epoch": 3.5922708542153914, "grad_norm": 0.08739260584115982, "learning_rate": 4.04453959834035e-05, "loss": 0.4745, "num_input_tokens_seen": 39132480, "step": 32255 }, { "epoch": 3.5928277090990086, "grad_norm": 0.11781943589448929, "learning_rate": 4.0441574575056365e-05, "loss": 0.464, "num_input_tokens_seen": 39138880, "step": 32260 }, { "epoch": 3.5933845639826263, "grad_norm": 0.10516569763422012, "learning_rate": 4.043775258327704e-05, "loss": 0.4736, "num_input_tokens_seen": 39145152, "step": 32265 }, { "epoch": 3.5939414188662435, "grad_norm": 0.13514523208141327, "learning_rate": 4.043393000820997e-05, "loss": 0.4711, "num_input_tokens_seen": 39151040, "step": 32270 }, { "epoch": 3.5944982737498608, "grad_norm": 0.10620354861021042, "learning_rate": 4.043010684999956e-05, "loss": 0.4621, "num_input_tokens_seen": 39157248, "step": 32275 }, { "epoch": 3.595055128633478, "grad_norm": 0.10218402743339539, "learning_rate": 4.0426283108790265e-05, "loss": 0.4782, "num_input_tokens_seen": 39163008, "step": 32280 }, { "epoch": 3.595611983517095, "grad_norm": 0.18424305319786072, "learning_rate": 4.042245878472655e-05, "loss": 0.4645, "num_input_tokens_seen": 39169120, "step": 32285 }, { "epoch": 3.596168838400713, "grad_norm": 0.10902494192123413, "learning_rate": 4.0418633877952926e-05, "loss": 0.4649, "num_input_tokens_seen": 39175104, "step": 32290 }, { "epoch": 3.59672569328433, "grad_norm": 0.09027329832315445, "learning_rate": 4.04148083886139e-05, "loss": 0.4679, "num_input_tokens_seen": 39181408, "step": 32295 }, { "epoch": 3.5972825481679473, "grad_norm": 0.14948469400405884, "learning_rate": 4.041098231685402e-05, "loss": 0.4702, "num_input_tokens_seen": 39186976, "step": 32300 }, { "epoch": 3.597839403051565, "grad_norm": 0.09892380982637405, "learning_rate": 4.040715566281783e-05, "loss": 0.4747, "num_input_tokens_seen": 39193216, "step": 32305 }, { "epoch": 3.5983962579351823, "grad_norm": 0.07710400968790054, "learning_rate": 4.0403328426649936e-05, "loss": 0.463, "num_input_tokens_seen": 39199552, "step": 32310 }, { "epoch": 3.5989531128187995, "grad_norm": 0.10535915195941925, "learning_rate": 4.039950060849492e-05, "loss": 0.4718, "num_input_tokens_seen": 39205632, "step": 32315 }, { "epoch": 3.5995099677024167, "grad_norm": 0.07891791313886642, "learning_rate": 4.0395672208497424e-05, "loss": 0.4671, "num_input_tokens_seen": 39212032, "step": 32320 }, { "epoch": 3.600066822586034, "grad_norm": 0.08381862938404083, "learning_rate": 4.0391843226802104e-05, "loss": 0.4583, "num_input_tokens_seen": 39218240, "step": 32325 }, { "epoch": 3.6006236774696516, "grad_norm": 0.09565852582454681, "learning_rate": 4.0388013663553606e-05, "loss": 0.4418, "num_input_tokens_seen": 39224480, "step": 32330 }, { "epoch": 3.601180532353269, "grad_norm": 0.09790237247943878, "learning_rate": 4.038418351889665e-05, "loss": 0.4604, "num_input_tokens_seen": 39230432, "step": 32335 }, { "epoch": 3.601737387236886, "grad_norm": 0.10244045406579971, "learning_rate": 4.038035279297594e-05, "loss": 0.4716, "num_input_tokens_seen": 39236704, "step": 32340 }, { "epoch": 3.6022942421205033, "grad_norm": 0.10588085651397705, "learning_rate": 4.03765214859362e-05, "loss": 0.4735, "num_input_tokens_seen": 39243040, "step": 32345 }, { "epoch": 3.6028510970041205, "grad_norm": 0.08243072032928467, "learning_rate": 4.0372689597922215e-05, "loss": 0.4652, "num_input_tokens_seen": 39248672, "step": 32350 }, { "epoch": 3.603407951887738, "grad_norm": 0.15764322876930237, "learning_rate": 4.036885712907875e-05, "loss": 0.4697, "num_input_tokens_seen": 39254784, "step": 32355 }, { "epoch": 3.6039648067713554, "grad_norm": 0.09678545594215393, "learning_rate": 4.0365024079550607e-05, "loss": 0.4669, "num_input_tokens_seen": 39261024, "step": 32360 }, { "epoch": 3.6045216616549727, "grad_norm": 0.09843827784061432, "learning_rate": 4.0361190449482616e-05, "loss": 0.4693, "num_input_tokens_seen": 39267136, "step": 32365 }, { "epoch": 3.60507851653859, "grad_norm": 0.08534432202577591, "learning_rate": 4.035735623901963e-05, "loss": 0.459, "num_input_tokens_seen": 39273216, "step": 32370 }, { "epoch": 3.605635371422207, "grad_norm": 0.1409662514925003, "learning_rate": 4.0353521448306495e-05, "loss": 0.4588, "num_input_tokens_seen": 39279168, "step": 32375 }, { "epoch": 3.606192226305825, "grad_norm": 0.0886220782995224, "learning_rate": 4.0349686077488125e-05, "loss": 0.4523, "num_input_tokens_seen": 39285440, "step": 32380 }, { "epoch": 3.606749081189442, "grad_norm": 0.08651135116815567, "learning_rate": 4.0345850126709426e-05, "loss": 0.4578, "num_input_tokens_seen": 39290752, "step": 32385 }, { "epoch": 3.6073059360730593, "grad_norm": 0.10277500003576279, "learning_rate": 4.034201359611534e-05, "loss": 0.4627, "num_input_tokens_seen": 39297120, "step": 32390 }, { "epoch": 3.607862790956677, "grad_norm": 0.09149839729070663, "learning_rate": 4.03381764858508e-05, "loss": 0.4653, "num_input_tokens_seen": 39303360, "step": 32395 }, { "epoch": 3.608419645840294, "grad_norm": 0.106252022087574, "learning_rate": 4.0334338796060797e-05, "loss": 0.4652, "num_input_tokens_seen": 39309664, "step": 32400 }, { "epoch": 3.6089765007239114, "grad_norm": 0.11540105193853378, "learning_rate": 4.033050052689034e-05, "loss": 0.4592, "num_input_tokens_seen": 39315648, "step": 32405 }, { "epoch": 3.6095333556075286, "grad_norm": 0.12079722434282303, "learning_rate": 4.0326661678484444e-05, "loss": 0.4626, "num_input_tokens_seen": 39321664, "step": 32410 }, { "epoch": 3.610090210491146, "grad_norm": 0.08874857425689697, "learning_rate": 4.0322822250988154e-05, "loss": 0.463, "num_input_tokens_seen": 39327488, "step": 32415 }, { "epoch": 3.6106470653747635, "grad_norm": 0.07583095878362656, "learning_rate": 4.031898224454653e-05, "loss": 0.468, "num_input_tokens_seen": 39333536, "step": 32420 }, { "epoch": 3.6112039202583808, "grad_norm": 0.09377218782901764, "learning_rate": 4.0315141659304664e-05, "loss": 0.4603, "num_input_tokens_seen": 39339488, "step": 32425 }, { "epoch": 3.611760775141998, "grad_norm": 0.09697645902633667, "learning_rate": 4.031130049540767e-05, "loss": 0.4576, "num_input_tokens_seen": 39345184, "step": 32430 }, { "epoch": 3.6123176300256152, "grad_norm": 0.0889938473701477, "learning_rate": 4.030745875300068e-05, "loss": 0.4649, "num_input_tokens_seen": 39351296, "step": 32435 }, { "epoch": 3.6128744849092325, "grad_norm": 0.08607127517461777, "learning_rate": 4.030361643222884e-05, "loss": 0.4663, "num_input_tokens_seen": 39357376, "step": 32440 }, { "epoch": 3.61343133979285, "grad_norm": 0.1203274130821228, "learning_rate": 4.029977353323733e-05, "loss": 0.4674, "num_input_tokens_seen": 39363168, "step": 32445 }, { "epoch": 3.6139881946764674, "grad_norm": 0.09841585159301758, "learning_rate": 4.029593005617134e-05, "loss": 0.4569, "num_input_tokens_seen": 39369408, "step": 32450 }, { "epoch": 3.6145450495600846, "grad_norm": 0.0961470678448677, "learning_rate": 4.02920860011761e-05, "loss": 0.4701, "num_input_tokens_seen": 39375616, "step": 32455 }, { "epoch": 3.615101904443702, "grad_norm": 0.12625131011009216, "learning_rate": 4.028824136839684e-05, "loss": 0.4599, "num_input_tokens_seen": 39381728, "step": 32460 }, { "epoch": 3.615658759327319, "grad_norm": 0.10102836042642593, "learning_rate": 4.028439615797883e-05, "loss": 0.4631, "num_input_tokens_seen": 39388224, "step": 32465 }, { "epoch": 3.6162156142109367, "grad_norm": 0.1306736320257187, "learning_rate": 4.028055037006735e-05, "loss": 0.4769, "num_input_tokens_seen": 39394208, "step": 32470 }, { "epoch": 3.616772469094554, "grad_norm": 0.10324578732252121, "learning_rate": 4.027670400480771e-05, "loss": 0.4642, "num_input_tokens_seen": 39399776, "step": 32475 }, { "epoch": 3.617329323978171, "grad_norm": 0.09450928866863251, "learning_rate": 4.027285706234524e-05, "loss": 0.4561, "num_input_tokens_seen": 39405792, "step": 32480 }, { "epoch": 3.617886178861789, "grad_norm": 0.12936578691005707, "learning_rate": 4.0269009542825274e-05, "loss": 0.463, "num_input_tokens_seen": 39411616, "step": 32485 }, { "epoch": 3.618443033745406, "grad_norm": 0.11439228802919388, "learning_rate": 4.026516144639321e-05, "loss": 0.4649, "num_input_tokens_seen": 39417792, "step": 32490 }, { "epoch": 3.6189998886290233, "grad_norm": 0.12185975164175034, "learning_rate": 4.0261312773194415e-05, "loss": 0.4552, "num_input_tokens_seen": 39423808, "step": 32495 }, { "epoch": 3.6195567435126406, "grad_norm": 0.1263507902622223, "learning_rate": 4.0257463523374324e-05, "loss": 0.4772, "num_input_tokens_seen": 39429792, "step": 32500 }, { "epoch": 3.620113598396258, "grad_norm": 0.11647643148899078, "learning_rate": 4.025361369707836e-05, "loss": 0.4677, "num_input_tokens_seen": 39435904, "step": 32505 }, { "epoch": 3.6206704532798755, "grad_norm": 0.11276685446500778, "learning_rate": 4.024976329445199e-05, "loss": 0.4665, "num_input_tokens_seen": 39442048, "step": 32510 }, { "epoch": 3.6212273081634927, "grad_norm": 0.10968570411205292, "learning_rate": 4.02459123156407e-05, "loss": 0.4659, "num_input_tokens_seen": 39447936, "step": 32515 }, { "epoch": 3.62178416304711, "grad_norm": 0.10916978120803833, "learning_rate": 4.024206076078998e-05, "loss": 0.473, "num_input_tokens_seen": 39454304, "step": 32520 }, { "epoch": 3.622341017930727, "grad_norm": 0.0982281044125557, "learning_rate": 4.023820863004535e-05, "loss": 0.4623, "num_input_tokens_seen": 39460640, "step": 32525 }, { "epoch": 3.6228978728143444, "grad_norm": 0.1078144758939743, "learning_rate": 4.023435592355238e-05, "loss": 0.468, "num_input_tokens_seen": 39466400, "step": 32530 }, { "epoch": 3.623454727697962, "grad_norm": 0.1384524554014206, "learning_rate": 4.0230502641456614e-05, "loss": 0.4723, "num_input_tokens_seen": 39472384, "step": 32535 }, { "epoch": 3.6240115825815793, "grad_norm": 0.11621739715337753, "learning_rate": 4.0226648783903646e-05, "loss": 0.4634, "num_input_tokens_seen": 39478368, "step": 32540 }, { "epoch": 3.6245684374651965, "grad_norm": 0.11794639378786087, "learning_rate": 4.02227943510391e-05, "loss": 0.4628, "num_input_tokens_seen": 39484288, "step": 32545 }, { "epoch": 3.6251252923488138, "grad_norm": 0.11064118891954422, "learning_rate": 4.021893934300859e-05, "loss": 0.469, "num_input_tokens_seen": 39490336, "step": 32550 }, { "epoch": 3.625682147232431, "grad_norm": 0.12889190018177032, "learning_rate": 4.0215083759957785e-05, "loss": 0.4611, "num_input_tokens_seen": 39496352, "step": 32555 }, { "epoch": 3.6262390021160487, "grad_norm": 0.08537538349628448, "learning_rate": 4.021122760203235e-05, "loss": 0.4652, "num_input_tokens_seen": 39502656, "step": 32560 }, { "epoch": 3.626795856999666, "grad_norm": 0.11190919578075409, "learning_rate": 4.0207370869378004e-05, "loss": 0.4717, "num_input_tokens_seen": 39508640, "step": 32565 }, { "epoch": 3.627352711883283, "grad_norm": 0.07860466092824936, "learning_rate": 4.020351356214044e-05, "loss": 0.4719, "num_input_tokens_seen": 39514816, "step": 32570 }, { "epoch": 3.627909566766901, "grad_norm": 0.10119416564702988, "learning_rate": 4.019965568046542e-05, "loss": 0.4632, "num_input_tokens_seen": 39520896, "step": 32575 }, { "epoch": 3.628466421650518, "grad_norm": 0.10219863802194595, "learning_rate": 4.019579722449869e-05, "loss": 0.4624, "num_input_tokens_seen": 39527008, "step": 32580 }, { "epoch": 3.6290232765341353, "grad_norm": 0.08840961009263992, "learning_rate": 4.0191938194386046e-05, "loss": 0.466, "num_input_tokens_seen": 39532992, "step": 32585 }, { "epoch": 3.6295801314177525, "grad_norm": 0.12719231843948364, "learning_rate": 4.0188078590273295e-05, "loss": 0.4771, "num_input_tokens_seen": 39539008, "step": 32590 }, { "epoch": 3.6301369863013697, "grad_norm": 0.0861244648694992, "learning_rate": 4.018421841230626e-05, "loss": 0.456, "num_input_tokens_seen": 39545472, "step": 32595 }, { "epoch": 3.6306938411849874, "grad_norm": 0.16461658477783203, "learning_rate": 4.01803576606308e-05, "loss": 0.4619, "num_input_tokens_seen": 39551104, "step": 32600 }, { "epoch": 3.6312506960686046, "grad_norm": 0.12030358612537384, "learning_rate": 4.0176496335392765e-05, "loss": 0.4652, "num_input_tokens_seen": 39557184, "step": 32605 }, { "epoch": 3.631807550952222, "grad_norm": 0.10151758790016174, "learning_rate": 4.017263443673808e-05, "loss": 0.4497, "num_input_tokens_seen": 39563040, "step": 32610 }, { "epoch": 3.632364405835839, "grad_norm": 0.12772034108638763, "learning_rate": 4.016877196481263e-05, "loss": 0.4649, "num_input_tokens_seen": 39568896, "step": 32615 }, { "epoch": 3.6329212607194563, "grad_norm": 0.08914337307214737, "learning_rate": 4.016490891976237e-05, "loss": 0.4493, "num_input_tokens_seen": 39574976, "step": 32620 }, { "epoch": 3.633478115603074, "grad_norm": 0.12450435012578964, "learning_rate": 4.016104530173325e-05, "loss": 0.4642, "num_input_tokens_seen": 39580736, "step": 32625 }, { "epoch": 3.634034970486691, "grad_norm": 0.10089842975139618, "learning_rate": 4.0157181110871257e-05, "loss": 0.4553, "num_input_tokens_seen": 39586528, "step": 32630 }, { "epoch": 3.6345918253703084, "grad_norm": 0.09013056755065918, "learning_rate": 4.015331634732238e-05, "loss": 0.4665, "num_input_tokens_seen": 39592480, "step": 32635 }, { "epoch": 3.6351486802539257, "grad_norm": 0.09220387786626816, "learning_rate": 4.014945101123266e-05, "loss": 0.4601, "num_input_tokens_seen": 39598560, "step": 32640 }, { "epoch": 3.635705535137543, "grad_norm": 0.1069265753030777, "learning_rate": 4.014558510274812e-05, "loss": 0.4622, "num_input_tokens_seen": 39604448, "step": 32645 }, { "epoch": 3.6362623900211606, "grad_norm": 0.11929779499769211, "learning_rate": 4.014171862201485e-05, "loss": 0.4521, "num_input_tokens_seen": 39610240, "step": 32650 }, { "epoch": 3.636819244904778, "grad_norm": 0.09941644966602325, "learning_rate": 4.013785156917892e-05, "loss": 0.4702, "num_input_tokens_seen": 39616128, "step": 32655 }, { "epoch": 3.637376099788395, "grad_norm": 0.08745899051427841, "learning_rate": 4.013398394438645e-05, "loss": 0.4661, "num_input_tokens_seen": 39621920, "step": 32660 }, { "epoch": 3.6379329546720127, "grad_norm": 0.09654644876718521, "learning_rate": 4.013011574778357e-05, "loss": 0.4599, "num_input_tokens_seen": 39627904, "step": 32665 }, { "epoch": 3.63848980955563, "grad_norm": 0.11528339236974716, "learning_rate": 4.012624697951642e-05, "loss": 0.4748, "num_input_tokens_seen": 39634080, "step": 32670 }, { "epoch": 3.639046664439247, "grad_norm": 0.10641489923000336, "learning_rate": 4.012237763973119e-05, "loss": 0.4632, "num_input_tokens_seen": 39640032, "step": 32675 }, { "epoch": 3.6396035193228644, "grad_norm": 0.10008689016103745, "learning_rate": 4.011850772857407e-05, "loss": 0.4592, "num_input_tokens_seen": 39645632, "step": 32680 }, { "epoch": 3.6401603742064816, "grad_norm": 0.12817442417144775, "learning_rate": 4.011463724619128e-05, "loss": 0.4785, "num_input_tokens_seen": 39651360, "step": 32685 }, { "epoch": 3.6407172290900993, "grad_norm": 0.11523154377937317, "learning_rate": 4.011076619272907e-05, "loss": 0.4692, "num_input_tokens_seen": 39657536, "step": 32690 }, { "epoch": 3.6412740839737165, "grad_norm": 0.16100284457206726, "learning_rate": 4.010689456833367e-05, "loss": 0.4691, "num_input_tokens_seen": 39663136, "step": 32695 }, { "epoch": 3.6418309388573338, "grad_norm": 0.16350296139717102, "learning_rate": 4.0103022373151385e-05, "loss": 0.4574, "num_input_tokens_seen": 39669408, "step": 32700 }, { "epoch": 3.642387793740951, "grad_norm": 0.10105031728744507, "learning_rate": 4.009914960732851e-05, "loss": 0.4617, "num_input_tokens_seen": 39675616, "step": 32705 }, { "epoch": 3.6429446486245682, "grad_norm": 0.08713559061288834, "learning_rate": 4.0095276271011386e-05, "loss": 0.4721, "num_input_tokens_seen": 39681888, "step": 32710 }, { "epoch": 3.643501503508186, "grad_norm": 0.09653347730636597, "learning_rate": 4.009140236434634e-05, "loss": 0.4669, "num_input_tokens_seen": 39687936, "step": 32715 }, { "epoch": 3.644058358391803, "grad_norm": 0.09832701832056046, "learning_rate": 4.008752788747976e-05, "loss": 0.4565, "num_input_tokens_seen": 39693952, "step": 32720 }, { "epoch": 3.6446152132754204, "grad_norm": 0.1105068102478981, "learning_rate": 4.008365284055802e-05, "loss": 0.4605, "num_input_tokens_seen": 39699808, "step": 32725 }, { "epoch": 3.6451720681590376, "grad_norm": 0.18553811311721802, "learning_rate": 4.007977722372754e-05, "loss": 0.4603, "num_input_tokens_seen": 39705888, "step": 32730 }, { "epoch": 3.645728923042655, "grad_norm": 0.14384345710277557, "learning_rate": 4.007590103713475e-05, "loss": 0.4608, "num_input_tokens_seen": 39711840, "step": 32735 }, { "epoch": 3.6462857779262725, "grad_norm": 0.11430972069501877, "learning_rate": 4.0072024280926105e-05, "loss": 0.4616, "num_input_tokens_seen": 39717888, "step": 32740 }, { "epoch": 3.6468426328098897, "grad_norm": 0.10149844735860825, "learning_rate": 4.006814695524809e-05, "loss": 0.4621, "num_input_tokens_seen": 39724096, "step": 32745 }, { "epoch": 3.647399487693507, "grad_norm": 0.09910617023706436, "learning_rate": 4.006426906024718e-05, "loss": 0.4685, "num_input_tokens_seen": 39729696, "step": 32750 }, { "epoch": 3.6479563425771246, "grad_norm": 0.13571462035179138, "learning_rate": 4.006039059606992e-05, "loss": 0.4569, "num_input_tokens_seen": 39735168, "step": 32755 }, { "epoch": 3.648513197460742, "grad_norm": 0.10169029980897903, "learning_rate": 4.005651156286283e-05, "loss": 0.4641, "num_input_tokens_seen": 39741312, "step": 32760 }, { "epoch": 3.649070052344359, "grad_norm": 0.0908966138958931, "learning_rate": 4.005263196077249e-05, "loss": 0.4605, "num_input_tokens_seen": 39747616, "step": 32765 }, { "epoch": 3.6496269072279763, "grad_norm": 0.09881222993135452, "learning_rate": 4.004875178994547e-05, "loss": 0.4602, "num_input_tokens_seen": 39753792, "step": 32770 }, { "epoch": 3.6501837621115936, "grad_norm": 0.11835943907499313, "learning_rate": 4.004487105052839e-05, "loss": 0.4692, "num_input_tokens_seen": 39759680, "step": 32775 }, { "epoch": 3.6507406169952112, "grad_norm": 0.10010717064142227, "learning_rate": 4.0040989742667865e-05, "loss": 0.4722, "num_input_tokens_seen": 39765856, "step": 32780 }, { "epoch": 3.6512974718788285, "grad_norm": 0.09340544790029526, "learning_rate": 4.003710786651054e-05, "loss": 0.4588, "num_input_tokens_seen": 39772096, "step": 32785 }, { "epoch": 3.6518543267624457, "grad_norm": 0.10868163406848907, "learning_rate": 4.00332254222031e-05, "loss": 0.4698, "num_input_tokens_seen": 39778464, "step": 32790 }, { "epoch": 3.652411181646063, "grad_norm": 0.11530158668756485, "learning_rate": 4.002934240989222e-05, "loss": 0.4622, "num_input_tokens_seen": 39784512, "step": 32795 }, { "epoch": 3.65296803652968, "grad_norm": 0.13291168212890625, "learning_rate": 4.0025458829724625e-05, "loss": 0.4791, "num_input_tokens_seen": 39790752, "step": 32800 }, { "epoch": 3.653524891413298, "grad_norm": 0.09703285247087479, "learning_rate": 4.002157468184704e-05, "loss": 0.459, "num_input_tokens_seen": 39797120, "step": 32805 }, { "epoch": 3.654081746296915, "grad_norm": 0.07043002545833588, "learning_rate": 4.001768996640623e-05, "loss": 0.4743, "num_input_tokens_seen": 39802816, "step": 32810 }, { "epoch": 3.6546386011805323, "grad_norm": 0.09502505511045456, "learning_rate": 4.001380468354896e-05, "loss": 0.4647, "num_input_tokens_seen": 39808704, "step": 32815 }, { "epoch": 3.6551954560641495, "grad_norm": 0.0829782634973526, "learning_rate": 4.0009918833422036e-05, "loss": 0.457, "num_input_tokens_seen": 39814432, "step": 32820 }, { "epoch": 3.6557523109477668, "grad_norm": 0.11091134697198868, "learning_rate": 4.000603241617228e-05, "loss": 0.4721, "num_input_tokens_seen": 39820800, "step": 32825 }, { "epoch": 3.6563091658313844, "grad_norm": 0.1089741513133049, "learning_rate": 4.000214543194652e-05, "loss": 0.462, "num_input_tokens_seen": 39826944, "step": 32830 }, { "epoch": 3.6568660207150017, "grad_norm": 0.1260818988084793, "learning_rate": 3.999825788089164e-05, "loss": 0.4618, "num_input_tokens_seen": 39833280, "step": 32835 }, { "epoch": 3.657422875598619, "grad_norm": 0.1475876420736313, "learning_rate": 3.999436976315451e-05, "loss": 0.4728, "num_input_tokens_seen": 39839488, "step": 32840 }, { "epoch": 3.6579797304822366, "grad_norm": 0.12586358189582825, "learning_rate": 3.999048107888205e-05, "loss": 0.4669, "num_input_tokens_seen": 39845536, "step": 32845 }, { "epoch": 3.658536585365854, "grad_norm": 0.1919434517621994, "learning_rate": 3.998659182822116e-05, "loss": 0.4581, "num_input_tokens_seen": 39851424, "step": 32850 }, { "epoch": 3.659093440249471, "grad_norm": 0.11399378627538681, "learning_rate": 3.9982702011318804e-05, "loss": 0.472, "num_input_tokens_seen": 39857536, "step": 32855 }, { "epoch": 3.6596502951330883, "grad_norm": 0.101821169257164, "learning_rate": 3.997881162832197e-05, "loss": 0.4623, "num_input_tokens_seen": 39863712, "step": 32860 }, { "epoch": 3.6602071500167055, "grad_norm": 0.1389399766921997, "learning_rate": 3.9974920679377606e-05, "loss": 0.4583, "num_input_tokens_seen": 39869664, "step": 32865 }, { "epoch": 3.660764004900323, "grad_norm": 0.12200471758842468, "learning_rate": 3.997102916463276e-05, "loss": 0.456, "num_input_tokens_seen": 39875744, "step": 32870 }, { "epoch": 3.6613208597839404, "grad_norm": 0.14501364529132843, "learning_rate": 3.996713708423446e-05, "loss": 0.4601, "num_input_tokens_seen": 39881984, "step": 32875 }, { "epoch": 3.6618777146675576, "grad_norm": 0.09132992476224899, "learning_rate": 3.996324443832975e-05, "loss": 0.4481, "num_input_tokens_seen": 39887776, "step": 32880 }, { "epoch": 3.662434569551175, "grad_norm": 0.1451777219772339, "learning_rate": 3.995935122706571e-05, "loss": 0.4504, "num_input_tokens_seen": 39893760, "step": 32885 }, { "epoch": 3.662991424434792, "grad_norm": 0.13283243775367737, "learning_rate": 3.995545745058945e-05, "loss": 0.4677, "num_input_tokens_seen": 39900032, "step": 32890 }, { "epoch": 3.6635482793184098, "grad_norm": 0.16069450974464417, "learning_rate": 3.9951563109048076e-05, "loss": 0.486, "num_input_tokens_seen": 39905536, "step": 32895 }, { "epoch": 3.664105134202027, "grad_norm": 0.13891901075839996, "learning_rate": 3.994766820258873e-05, "loss": 0.4791, "num_input_tokens_seen": 39911904, "step": 32900 }, { "epoch": 3.6646619890856442, "grad_norm": 0.12058543413877487, "learning_rate": 3.994377273135859e-05, "loss": 0.4565, "num_input_tokens_seen": 39918048, "step": 32905 }, { "epoch": 3.6652188439692615, "grad_norm": 0.1245083436369896, "learning_rate": 3.9939876695504805e-05, "loss": 0.4622, "num_input_tokens_seen": 39924256, "step": 32910 }, { "epoch": 3.6657756988528787, "grad_norm": 0.156279057264328, "learning_rate": 3.993598009517462e-05, "loss": 0.4721, "num_input_tokens_seen": 39930464, "step": 32915 }, { "epoch": 3.6663325537364964, "grad_norm": 0.1646660715341568, "learning_rate": 3.9932082930515235e-05, "loss": 0.4751, "num_input_tokens_seen": 39936832, "step": 32920 }, { "epoch": 3.6668894086201136, "grad_norm": 0.09812970459461212, "learning_rate": 3.9928185201673904e-05, "loss": 0.4654, "num_input_tokens_seen": 39942752, "step": 32925 }, { "epoch": 3.667446263503731, "grad_norm": 0.08712922781705856, "learning_rate": 3.99242869087979e-05, "loss": 0.4576, "num_input_tokens_seen": 39948832, "step": 32930 }, { "epoch": 3.6680031183873485, "grad_norm": 0.09364631026983261, "learning_rate": 3.9920388052034507e-05, "loss": 0.459, "num_input_tokens_seen": 39955296, "step": 32935 }, { "epoch": 3.6685599732709657, "grad_norm": 0.12535743415355682, "learning_rate": 3.991648863153103e-05, "loss": 0.4561, "num_input_tokens_seen": 39961344, "step": 32940 }, { "epoch": 3.669116828154583, "grad_norm": 0.08662979304790497, "learning_rate": 3.9912588647434826e-05, "loss": 0.463, "num_input_tokens_seen": 39967200, "step": 32945 }, { "epoch": 3.6696736830382, "grad_norm": 0.12261221557855606, "learning_rate": 3.9908688099893225e-05, "loss": 0.4614, "num_input_tokens_seen": 39973504, "step": 32950 }, { "epoch": 3.6702305379218174, "grad_norm": 0.08517342805862427, "learning_rate": 3.990478698905361e-05, "loss": 0.4645, "num_input_tokens_seen": 39979712, "step": 32955 }, { "epoch": 3.670787392805435, "grad_norm": 0.10293988138437271, "learning_rate": 3.990088531506338e-05, "loss": 0.4679, "num_input_tokens_seen": 39985856, "step": 32960 }, { "epoch": 3.6713442476890523, "grad_norm": 0.10736798495054245, "learning_rate": 3.989698307806995e-05, "loss": 0.4644, "num_input_tokens_seen": 39991712, "step": 32965 }, { "epoch": 3.6719011025726696, "grad_norm": 0.11860467493534088, "learning_rate": 3.989308027822076e-05, "loss": 0.466, "num_input_tokens_seen": 39997504, "step": 32970 }, { "epoch": 3.672457957456287, "grad_norm": 0.15088167786598206, "learning_rate": 3.988917691566327e-05, "loss": 0.4688, "num_input_tokens_seen": 40003392, "step": 32975 }, { "epoch": 3.673014812339904, "grad_norm": 0.10903800278902054, "learning_rate": 3.988527299054496e-05, "loss": 0.4476, "num_input_tokens_seen": 40009184, "step": 32980 }, { "epoch": 3.6735716672235217, "grad_norm": 0.10776175558567047, "learning_rate": 3.9881368503013334e-05, "loss": 0.4708, "num_input_tokens_seen": 40015360, "step": 32985 }, { "epoch": 3.674128522107139, "grad_norm": 0.1033979207277298, "learning_rate": 3.987746345321591e-05, "loss": 0.4685, "num_input_tokens_seen": 40021312, "step": 32990 }, { "epoch": 3.674685376990756, "grad_norm": 0.11410795897245407, "learning_rate": 3.9873557841300255e-05, "loss": 0.4722, "num_input_tokens_seen": 40027200, "step": 32995 }, { "epoch": 3.6752422318743734, "grad_norm": 0.14740857481956482, "learning_rate": 3.986965166741391e-05, "loss": 0.4747, "num_input_tokens_seen": 40033472, "step": 33000 }, { "epoch": 3.6757990867579906, "grad_norm": 0.0960584282875061, "learning_rate": 3.9865744931704475e-05, "loss": 0.4648, "num_input_tokens_seen": 40039392, "step": 33005 }, { "epoch": 3.6763559416416083, "grad_norm": 0.06885306537151337, "learning_rate": 3.9861837634319555e-05, "loss": 0.4595, "num_input_tokens_seen": 40045600, "step": 33010 }, { "epoch": 3.6769127965252255, "grad_norm": 0.09605726599693298, "learning_rate": 3.985792977540678e-05, "loss": 0.4701, "num_input_tokens_seen": 40051648, "step": 33015 }, { "epoch": 3.6774696514088427, "grad_norm": 0.0954262837767601, "learning_rate": 3.985402135511381e-05, "loss": 0.4674, "num_input_tokens_seen": 40057696, "step": 33020 }, { "epoch": 3.6780265062924604, "grad_norm": 0.12485143542289734, "learning_rate": 3.985011237358831e-05, "loss": 0.4507, "num_input_tokens_seen": 40063808, "step": 33025 }, { "epoch": 3.6785833611760776, "grad_norm": 0.09615699201822281, "learning_rate": 3.984620283097798e-05, "loss": 0.4534, "num_input_tokens_seen": 40069824, "step": 33030 }, { "epoch": 3.679140216059695, "grad_norm": 0.10736946761608124, "learning_rate": 3.9842292727430514e-05, "loss": 0.4809, "num_input_tokens_seen": 40075680, "step": 33035 }, { "epoch": 3.679697070943312, "grad_norm": 0.11020679771900177, "learning_rate": 3.9838382063093674e-05, "loss": 0.4626, "num_input_tokens_seen": 40081632, "step": 33040 }, { "epoch": 3.6802539258269293, "grad_norm": 0.10957039892673492, "learning_rate": 3.983447083811521e-05, "loss": 0.468, "num_input_tokens_seen": 40087808, "step": 33045 }, { "epoch": 3.680810780710547, "grad_norm": 0.13649402558803558, "learning_rate": 3.983055905264289e-05, "loss": 0.46, "num_input_tokens_seen": 40093696, "step": 33050 }, { "epoch": 3.6813676355941642, "grad_norm": 0.10923843830823898, "learning_rate": 3.9826646706824534e-05, "loss": 0.4682, "num_input_tokens_seen": 40099680, "step": 33055 }, { "epoch": 3.6819244904777815, "grad_norm": 0.08988689631223679, "learning_rate": 3.9822733800807945e-05, "loss": 0.4633, "num_input_tokens_seen": 40105824, "step": 33060 }, { "epoch": 3.6824813453613987, "grad_norm": 0.12220941483974457, "learning_rate": 3.981882033474097e-05, "loss": 0.459, "num_input_tokens_seen": 40112160, "step": 33065 }, { "epoch": 3.683038200245016, "grad_norm": 0.10809072852134705, "learning_rate": 3.981490630877147e-05, "loss": 0.4604, "num_input_tokens_seen": 40118400, "step": 33070 }, { "epoch": 3.6835950551286336, "grad_norm": 0.10371658951044083, "learning_rate": 3.981099172304734e-05, "loss": 0.4682, "num_input_tokens_seen": 40124384, "step": 33075 }, { "epoch": 3.684151910012251, "grad_norm": 0.09836117923259735, "learning_rate": 3.980707657771647e-05, "loss": 0.4594, "num_input_tokens_seen": 40130400, "step": 33080 }, { "epoch": 3.684708764895868, "grad_norm": 0.14156916737556458, "learning_rate": 3.98031608729268e-05, "loss": 0.4673, "num_input_tokens_seen": 40136768, "step": 33085 }, { "epoch": 3.6852656197794853, "grad_norm": 0.11440076678991318, "learning_rate": 3.979924460882628e-05, "loss": 0.4702, "num_input_tokens_seen": 40142464, "step": 33090 }, { "epoch": 3.6858224746631025, "grad_norm": 0.10409651696681976, "learning_rate": 3.979532778556287e-05, "loss": 0.4657, "num_input_tokens_seen": 40148736, "step": 33095 }, { "epoch": 3.68637932954672, "grad_norm": 0.13780885934829712, "learning_rate": 3.979141040328456e-05, "loss": 0.4693, "num_input_tokens_seen": 40154688, "step": 33100 }, { "epoch": 3.6869361844303374, "grad_norm": 0.0980035811662674, "learning_rate": 3.9787492462139356e-05, "loss": 0.4592, "num_input_tokens_seen": 40160192, "step": 33105 }, { "epoch": 3.6874930393139547, "grad_norm": 0.13428820669651031, "learning_rate": 3.978357396227531e-05, "loss": 0.4609, "num_input_tokens_seen": 40166144, "step": 33110 }, { "epoch": 3.6880498941975723, "grad_norm": 0.09049519151449203, "learning_rate": 3.977965490384046e-05, "loss": 0.4626, "num_input_tokens_seen": 40172032, "step": 33115 }, { "epoch": 3.6886067490811896, "grad_norm": 0.10571570694446564, "learning_rate": 3.9775735286982876e-05, "loss": 0.4783, "num_input_tokens_seen": 40178272, "step": 33120 }, { "epoch": 3.689163603964807, "grad_norm": 0.10777031630277634, "learning_rate": 3.9771815111850665e-05, "loss": 0.4791, "num_input_tokens_seen": 40184224, "step": 33125 }, { "epoch": 3.689720458848424, "grad_norm": 0.0985926017165184, "learning_rate": 3.976789437859195e-05, "loss": 0.4663, "num_input_tokens_seen": 40190432, "step": 33130 }, { "epoch": 3.6902773137320413, "grad_norm": 0.08047126233577728, "learning_rate": 3.9763973087354854e-05, "loss": 0.4624, "num_input_tokens_seen": 40196608, "step": 33135 }, { "epoch": 3.690834168615659, "grad_norm": 0.10060951113700867, "learning_rate": 3.976005123828754e-05, "loss": 0.4721, "num_input_tokens_seen": 40202752, "step": 33140 }, { "epoch": 3.691391023499276, "grad_norm": 0.10856673866510391, "learning_rate": 3.975612883153819e-05, "loss": 0.4565, "num_input_tokens_seen": 40208864, "step": 33145 }, { "epoch": 3.6919478783828934, "grad_norm": 0.11518938839435577, "learning_rate": 3.9752205867255e-05, "loss": 0.4528, "num_input_tokens_seen": 40214528, "step": 33150 }, { "epoch": 3.6925047332665106, "grad_norm": 0.13657152652740479, "learning_rate": 3.9748282345586194e-05, "loss": 0.4591, "num_input_tokens_seen": 40220608, "step": 33155 }, { "epoch": 3.693061588150128, "grad_norm": 0.10538482666015625, "learning_rate": 3.9744358266680026e-05, "loss": 0.455, "num_input_tokens_seen": 40226176, "step": 33160 }, { "epoch": 3.6936184430337455, "grad_norm": 0.08415709435939789, "learning_rate": 3.9740433630684754e-05, "loss": 0.458, "num_input_tokens_seen": 40231744, "step": 33165 }, { "epoch": 3.6941752979173628, "grad_norm": 0.10982072353363037, "learning_rate": 3.973650843774866e-05, "loss": 0.47, "num_input_tokens_seen": 40237984, "step": 33170 }, { "epoch": 3.69473215280098, "grad_norm": 0.0867312103509903, "learning_rate": 3.9732582688020045e-05, "loss": 0.476, "num_input_tokens_seen": 40243776, "step": 33175 }, { "epoch": 3.6952890076845977, "grad_norm": 0.11227844655513763, "learning_rate": 3.9728656381647246e-05, "loss": 0.4695, "num_input_tokens_seen": 40249856, "step": 33180 }, { "epoch": 3.6958458625682145, "grad_norm": 0.11501021683216095, "learning_rate": 3.972472951877861e-05, "loss": 0.4471, "num_input_tokens_seen": 40256064, "step": 33185 }, { "epoch": 3.696402717451832, "grad_norm": 0.06921644508838654, "learning_rate": 3.97208020995625e-05, "loss": 0.46, "num_input_tokens_seen": 40262624, "step": 33190 }, { "epoch": 3.6969595723354494, "grad_norm": 0.09744814783334732, "learning_rate": 3.971687412414732e-05, "loss": 0.4553, "num_input_tokens_seen": 40269024, "step": 33195 }, { "epoch": 3.6975164272190666, "grad_norm": 0.08795853704214096, "learning_rate": 3.971294559268147e-05, "loss": 0.4706, "num_input_tokens_seen": 40274976, "step": 33200 }, { "epoch": 3.6980732821026843, "grad_norm": 0.10451851040124893, "learning_rate": 3.970901650531339e-05, "loss": 0.4588, "num_input_tokens_seen": 40281280, "step": 33205 }, { "epoch": 3.6986301369863015, "grad_norm": 0.08330348879098892, "learning_rate": 3.970508686219152e-05, "loss": 0.4636, "num_input_tokens_seen": 40287296, "step": 33210 }, { "epoch": 3.6991869918699187, "grad_norm": 0.14151249825954437, "learning_rate": 3.970115666346435e-05, "loss": 0.464, "num_input_tokens_seen": 40293504, "step": 33215 }, { "epoch": 3.699743846753536, "grad_norm": 0.10186687111854553, "learning_rate": 3.969722590928037e-05, "loss": 0.4512, "num_input_tokens_seen": 40299136, "step": 33220 }, { "epoch": 3.700300701637153, "grad_norm": 0.09861290454864502, "learning_rate": 3.969329459978809e-05, "loss": 0.4606, "num_input_tokens_seen": 40304864, "step": 33225 }, { "epoch": 3.700857556520771, "grad_norm": 0.0985632836818695, "learning_rate": 3.9689362735136055e-05, "loss": 0.4554, "num_input_tokens_seen": 40310528, "step": 33230 }, { "epoch": 3.701414411404388, "grad_norm": 0.1295701414346695, "learning_rate": 3.968543031547281e-05, "loss": 0.4618, "num_input_tokens_seen": 40316736, "step": 33235 }, { "epoch": 3.7019712662880053, "grad_norm": 0.09522293508052826, "learning_rate": 3.9681497340946965e-05, "loss": 0.457, "num_input_tokens_seen": 40322912, "step": 33240 }, { "epoch": 3.7025281211716226, "grad_norm": 0.10393308103084564, "learning_rate": 3.96775638117071e-05, "loss": 0.4663, "num_input_tokens_seen": 40328960, "step": 33245 }, { "epoch": 3.70308497605524, "grad_norm": 0.11520272493362427, "learning_rate": 3.967362972790183e-05, "loss": 0.4514, "num_input_tokens_seen": 40335072, "step": 33250 }, { "epoch": 3.7036418309388575, "grad_norm": 0.09368608891963959, "learning_rate": 3.966969508967981e-05, "loss": 0.4692, "num_input_tokens_seen": 40341312, "step": 33255 }, { "epoch": 3.7041986858224747, "grad_norm": 0.11668093502521515, "learning_rate": 3.966575989718969e-05, "loss": 0.4675, "num_input_tokens_seen": 40347264, "step": 33260 }, { "epoch": 3.704755540706092, "grad_norm": 0.10590538382530212, "learning_rate": 3.966182415058017e-05, "loss": 0.4586, "num_input_tokens_seen": 40353216, "step": 33265 }, { "epoch": 3.7053123955897096, "grad_norm": 0.10303213447332382, "learning_rate": 3.9657887849999944e-05, "loss": 0.4767, "num_input_tokens_seen": 40359328, "step": 33270 }, { "epoch": 3.7058692504733264, "grad_norm": 0.11207076907157898, "learning_rate": 3.9653950995597735e-05, "loss": 0.4672, "num_input_tokens_seen": 40365280, "step": 33275 }, { "epoch": 3.706426105356944, "grad_norm": 0.18652233481407166, "learning_rate": 3.9650013587522303e-05, "loss": 0.4716, "num_input_tokens_seen": 40371552, "step": 33280 }, { "epoch": 3.7069829602405613, "grad_norm": 0.10625626146793365, "learning_rate": 3.9646075625922414e-05, "loss": 0.4851, "num_input_tokens_seen": 40377600, "step": 33285 }, { "epoch": 3.7075398151241785, "grad_norm": 0.1155381128191948, "learning_rate": 3.964213711094685e-05, "loss": 0.4475, "num_input_tokens_seen": 40383136, "step": 33290 }, { "epoch": 3.708096670007796, "grad_norm": 0.0920211672782898, "learning_rate": 3.9638198042744424e-05, "loss": 0.4719, "num_input_tokens_seen": 40389280, "step": 33295 }, { "epoch": 3.7086535248914134, "grad_norm": 0.14611509442329407, "learning_rate": 3.9634258421463956e-05, "loss": 0.4544, "num_input_tokens_seen": 40395520, "step": 33300 }, { "epoch": 3.7092103797750307, "grad_norm": 0.09724578261375427, "learning_rate": 3.963031824725432e-05, "loss": 0.4744, "num_input_tokens_seen": 40401792, "step": 33305 }, { "epoch": 3.709767234658648, "grad_norm": 0.11119087040424347, "learning_rate": 3.962637752026437e-05, "loss": 0.4698, "num_input_tokens_seen": 40407424, "step": 33310 }, { "epoch": 3.710324089542265, "grad_norm": 0.13623683154582977, "learning_rate": 3.9622436240642996e-05, "loss": 0.4571, "num_input_tokens_seen": 40413248, "step": 33315 }, { "epoch": 3.710880944425883, "grad_norm": 0.11060413718223572, "learning_rate": 3.961849440853913e-05, "loss": 0.4641, "num_input_tokens_seen": 40418976, "step": 33320 }, { "epoch": 3.7114377993095, "grad_norm": 0.11407549679279327, "learning_rate": 3.96145520241017e-05, "loss": 0.4585, "num_input_tokens_seen": 40424992, "step": 33325 }, { "epoch": 3.7119946541931172, "grad_norm": 0.089515820145607, "learning_rate": 3.961060908747965e-05, "loss": 0.4649, "num_input_tokens_seen": 40431328, "step": 33330 }, { "epoch": 3.7125515090767345, "grad_norm": 0.09962719678878784, "learning_rate": 3.960666559882198e-05, "loss": 0.4555, "num_input_tokens_seen": 40437888, "step": 33335 }, { "epoch": 3.7131083639603517, "grad_norm": 0.12357097119092941, "learning_rate": 3.9602721558277664e-05, "loss": 0.4525, "num_input_tokens_seen": 40444000, "step": 33340 }, { "epoch": 3.7136652188439694, "grad_norm": 0.11546643823385239, "learning_rate": 3.9598776965995735e-05, "loss": 0.4786, "num_input_tokens_seen": 40450240, "step": 33345 }, { "epoch": 3.7142220737275866, "grad_norm": 0.12482776492834091, "learning_rate": 3.959483182212523e-05, "loss": 0.4576, "num_input_tokens_seen": 40456416, "step": 33350 }, { "epoch": 3.714778928611204, "grad_norm": 0.10629003494977951, "learning_rate": 3.959088612681521e-05, "loss": 0.4672, "num_input_tokens_seen": 40462464, "step": 33355 }, { "epoch": 3.7153357834948215, "grad_norm": 0.10395543277263641, "learning_rate": 3.9586939880214743e-05, "loss": 0.4631, "num_input_tokens_seen": 40468512, "step": 33360 }, { "epoch": 3.7158926383784383, "grad_norm": 0.0892210379242897, "learning_rate": 3.958299308247295e-05, "loss": 0.4599, "num_input_tokens_seen": 40474720, "step": 33365 }, { "epoch": 3.716449493262056, "grad_norm": 0.1123717725276947, "learning_rate": 3.957904573373894e-05, "loss": 0.4653, "num_input_tokens_seen": 40480672, "step": 33370 }, { "epoch": 3.717006348145673, "grad_norm": 0.1776009052991867, "learning_rate": 3.957509783416186e-05, "loss": 0.4506, "num_input_tokens_seen": 40486560, "step": 33375 }, { "epoch": 3.7175632030292904, "grad_norm": 0.14140534400939941, "learning_rate": 3.957114938389088e-05, "loss": 0.481, "num_input_tokens_seen": 40492480, "step": 33380 }, { "epoch": 3.718120057912908, "grad_norm": 0.07447446137666702, "learning_rate": 3.956720038307518e-05, "loss": 0.4607, "num_input_tokens_seen": 40498560, "step": 33385 }, { "epoch": 3.7186769127965253, "grad_norm": 0.08641985058784485, "learning_rate": 3.956325083186396e-05, "loss": 0.4622, "num_input_tokens_seen": 40504480, "step": 33390 }, { "epoch": 3.7192337676801426, "grad_norm": 0.10241689532995224, "learning_rate": 3.955930073040647e-05, "loss": 0.4645, "num_input_tokens_seen": 40510528, "step": 33395 }, { "epoch": 3.71979062256376, "grad_norm": 0.18706375360488892, "learning_rate": 3.9555350078851924e-05, "loss": 0.4606, "num_input_tokens_seen": 40516448, "step": 33400 }, { "epoch": 3.720347477447377, "grad_norm": 0.14220325648784637, "learning_rate": 3.955139887734961e-05, "loss": 0.4582, "num_input_tokens_seen": 40522560, "step": 33405 }, { "epoch": 3.7209043323309947, "grad_norm": 0.1098204031586647, "learning_rate": 3.954744712604882e-05, "loss": 0.4642, "num_input_tokens_seen": 40528768, "step": 33410 }, { "epoch": 3.721461187214612, "grad_norm": 0.09171027690172195, "learning_rate": 3.954349482509886e-05, "loss": 0.4627, "num_input_tokens_seen": 40534400, "step": 33415 }, { "epoch": 3.722018042098229, "grad_norm": 0.11838731914758682, "learning_rate": 3.953954197464905e-05, "loss": 0.4579, "num_input_tokens_seen": 40540128, "step": 33420 }, { "epoch": 3.7225748969818464, "grad_norm": 0.09218788892030716, "learning_rate": 3.953558857484875e-05, "loss": 0.4647, "num_input_tokens_seen": 40546240, "step": 33425 }, { "epoch": 3.7231317518654636, "grad_norm": 0.09450148791074753, "learning_rate": 3.9531634625847334e-05, "loss": 0.4624, "num_input_tokens_seen": 40552192, "step": 33430 }, { "epoch": 3.7236886067490813, "grad_norm": 0.11120346933603287, "learning_rate": 3.9527680127794195e-05, "loss": 0.4674, "num_input_tokens_seen": 40557984, "step": 33435 }, { "epoch": 3.7242454616326985, "grad_norm": 0.12053567171096802, "learning_rate": 3.952372508083875e-05, "loss": 0.4554, "num_input_tokens_seen": 40563936, "step": 33440 }, { "epoch": 3.7248023165163158, "grad_norm": 0.12477108091115952, "learning_rate": 3.951976948513042e-05, "loss": 0.4568, "num_input_tokens_seen": 40569824, "step": 33445 }, { "epoch": 3.7253591713999334, "grad_norm": 0.11033479869365692, "learning_rate": 3.9515813340818676e-05, "loss": 0.4618, "num_input_tokens_seen": 40576256, "step": 33450 }, { "epoch": 3.7259160262835502, "grad_norm": 0.09827936440706253, "learning_rate": 3.951185664805297e-05, "loss": 0.4782, "num_input_tokens_seen": 40582496, "step": 33455 }, { "epoch": 3.726472881167168, "grad_norm": 0.10174877941608429, "learning_rate": 3.950789940698283e-05, "loss": 0.4681, "num_input_tokens_seen": 40588480, "step": 33460 }, { "epoch": 3.727029736050785, "grad_norm": 0.11141170561313629, "learning_rate": 3.950394161775775e-05, "loss": 0.4619, "num_input_tokens_seen": 40594784, "step": 33465 }, { "epoch": 3.7275865909344024, "grad_norm": 0.10350729525089264, "learning_rate": 3.9499983280527275e-05, "loss": 0.4683, "num_input_tokens_seen": 40601280, "step": 33470 }, { "epoch": 3.72814344581802, "grad_norm": 0.10430596768856049, "learning_rate": 3.9496024395440965e-05, "loss": 0.4622, "num_input_tokens_seen": 40607584, "step": 33475 }, { "epoch": 3.7287003007016373, "grad_norm": 0.13105261325836182, "learning_rate": 3.94920649626484e-05, "loss": 0.4662, "num_input_tokens_seen": 40613952, "step": 33480 }, { "epoch": 3.7292571555852545, "grad_norm": 0.1026138886809349, "learning_rate": 3.948810498229918e-05, "loss": 0.4595, "num_input_tokens_seen": 40620224, "step": 33485 }, { "epoch": 3.7298140104688717, "grad_norm": 0.10722358524799347, "learning_rate": 3.948414445454293e-05, "loss": 0.4717, "num_input_tokens_seen": 40626432, "step": 33490 }, { "epoch": 3.730370865352489, "grad_norm": 0.08917825669050217, "learning_rate": 3.9480183379529275e-05, "loss": 0.4535, "num_input_tokens_seen": 40632640, "step": 33495 }, { "epoch": 3.7309277202361066, "grad_norm": 0.1286008507013321, "learning_rate": 3.947622175740789e-05, "loss": 0.4515, "num_input_tokens_seen": 40638656, "step": 33500 }, { "epoch": 3.731484575119724, "grad_norm": 0.08434734493494034, "learning_rate": 3.947225958832845e-05, "loss": 0.4688, "num_input_tokens_seen": 40644960, "step": 33505 }, { "epoch": 3.732041430003341, "grad_norm": 0.12397324293851852, "learning_rate": 3.946829687244068e-05, "loss": 0.4729, "num_input_tokens_seen": 40651072, "step": 33510 }, { "epoch": 3.7325982848869583, "grad_norm": 0.10026143491268158, "learning_rate": 3.9464333609894274e-05, "loss": 0.445, "num_input_tokens_seen": 40657152, "step": 33515 }, { "epoch": 3.7331551397705756, "grad_norm": 0.1100558489561081, "learning_rate": 3.9460369800839004e-05, "loss": 0.4411, "num_input_tokens_seen": 40663360, "step": 33520 }, { "epoch": 3.7337119946541932, "grad_norm": 0.1666507124900818, "learning_rate": 3.945640544542461e-05, "loss": 0.4718, "num_input_tokens_seen": 40669280, "step": 33525 }, { "epoch": 3.7342688495378105, "grad_norm": 0.10723996162414551, "learning_rate": 3.9452440543800896e-05, "loss": 0.4604, "num_input_tokens_seen": 40675328, "step": 33530 }, { "epoch": 3.7348257044214277, "grad_norm": 0.12181615829467773, "learning_rate": 3.9448475096117665e-05, "loss": 0.4551, "num_input_tokens_seen": 40681472, "step": 33535 }, { "epoch": 3.7353825593050454, "grad_norm": 0.13724364340305328, "learning_rate": 3.944450910252474e-05, "loss": 0.48, "num_input_tokens_seen": 40686944, "step": 33540 }, { "epoch": 3.7359394141886626, "grad_norm": 0.10989729315042496, "learning_rate": 3.944054256317197e-05, "loss": 0.4532, "num_input_tokens_seen": 40693056, "step": 33545 }, { "epoch": 3.73649626907228, "grad_norm": 0.10339320451021194, "learning_rate": 3.943657547820923e-05, "loss": 0.471, "num_input_tokens_seen": 40699168, "step": 33550 }, { "epoch": 3.737053123955897, "grad_norm": 0.12644462287425995, "learning_rate": 3.94326078477864e-05, "loss": 0.4787, "num_input_tokens_seen": 40705216, "step": 33555 }, { "epoch": 3.7376099788395143, "grad_norm": 0.09540265798568726, "learning_rate": 3.942863967205339e-05, "loss": 0.4703, "num_input_tokens_seen": 40711136, "step": 33560 }, { "epoch": 3.738166833723132, "grad_norm": 0.09466398507356644, "learning_rate": 3.9424670951160144e-05, "loss": 0.4533, "num_input_tokens_seen": 40717344, "step": 33565 }, { "epoch": 3.738723688606749, "grad_norm": 0.14294762909412384, "learning_rate": 3.94207016852566e-05, "loss": 0.4565, "num_input_tokens_seen": 40723584, "step": 33570 }, { "epoch": 3.7392805434903664, "grad_norm": 0.13249042630195618, "learning_rate": 3.941673187449273e-05, "loss": 0.4598, "num_input_tokens_seen": 40729600, "step": 33575 }, { "epoch": 3.7398373983739837, "grad_norm": 0.13108286261558533, "learning_rate": 3.941276151901853e-05, "loss": 0.4659, "num_input_tokens_seen": 40735936, "step": 33580 }, { "epoch": 3.740394253257601, "grad_norm": 0.11015093326568604, "learning_rate": 3.940879061898402e-05, "loss": 0.4608, "num_input_tokens_seen": 40742016, "step": 33585 }, { "epoch": 3.7409511081412186, "grad_norm": 0.123079314827919, "learning_rate": 3.940481917453921e-05, "loss": 0.4585, "num_input_tokens_seen": 40748544, "step": 33590 }, { "epoch": 3.741507963024836, "grad_norm": 0.16288767755031586, "learning_rate": 3.9400847185834176e-05, "loss": 0.4628, "num_input_tokens_seen": 40754976, "step": 33595 }, { "epoch": 3.742064817908453, "grad_norm": 0.11781793087720871, "learning_rate": 3.9396874653018985e-05, "loss": 0.4754, "num_input_tokens_seen": 40760896, "step": 33600 }, { "epoch": 3.7426216727920703, "grad_norm": 0.11708875000476837, "learning_rate": 3.9392901576243734e-05, "loss": 0.4699, "num_input_tokens_seen": 40766848, "step": 33605 }, { "epoch": 3.7431785276756875, "grad_norm": 0.10634411871433258, "learning_rate": 3.938892795565853e-05, "loss": 0.46, "num_input_tokens_seen": 40772832, "step": 33610 }, { "epoch": 3.743735382559305, "grad_norm": 0.1029936894774437, "learning_rate": 3.9384953791413526e-05, "loss": 0.4625, "num_input_tokens_seen": 40778720, "step": 33615 }, { "epoch": 3.7442922374429224, "grad_norm": 0.13397222757339478, "learning_rate": 3.9380979083658866e-05, "loss": 0.4663, "num_input_tokens_seen": 40785024, "step": 33620 }, { "epoch": 3.7448490923265396, "grad_norm": 0.11065433919429779, "learning_rate": 3.937700383254473e-05, "loss": 0.4716, "num_input_tokens_seen": 40791104, "step": 33625 }, { "epoch": 3.7454059472101573, "grad_norm": 0.1542707234621048, "learning_rate": 3.937302803822132e-05, "loss": 0.4635, "num_input_tokens_seen": 40797120, "step": 33630 }, { "epoch": 3.7459628020937745, "grad_norm": 0.1055152416229248, "learning_rate": 3.936905170083884e-05, "loss": 0.4678, "num_input_tokens_seen": 40803328, "step": 33635 }, { "epoch": 3.7465196569773918, "grad_norm": 0.0845060795545578, "learning_rate": 3.9365074820547544e-05, "loss": 0.4505, "num_input_tokens_seen": 40809408, "step": 33640 }, { "epoch": 3.747076511861009, "grad_norm": 0.1653200089931488, "learning_rate": 3.936109739749769e-05, "loss": 0.4678, "num_input_tokens_seen": 40815616, "step": 33645 }, { "epoch": 3.747633366744626, "grad_norm": 0.09875398129224777, "learning_rate": 3.935711943183955e-05, "loss": 0.4763, "num_input_tokens_seen": 40821184, "step": 33650 }, { "epoch": 3.748190221628244, "grad_norm": 0.08809245377779007, "learning_rate": 3.935314092372343e-05, "loss": 0.4617, "num_input_tokens_seen": 40826656, "step": 33655 }, { "epoch": 3.748747076511861, "grad_norm": 0.12520617246627808, "learning_rate": 3.934916187329964e-05, "loss": 0.4626, "num_input_tokens_seen": 40832992, "step": 33660 }, { "epoch": 3.7493039313954784, "grad_norm": 0.12480958551168442, "learning_rate": 3.9345182280718545e-05, "loss": 0.4507, "num_input_tokens_seen": 40839200, "step": 33665 }, { "epoch": 3.7498607862790956, "grad_norm": 0.07551459968090057, "learning_rate": 3.9341202146130484e-05, "loss": 0.4713, "num_input_tokens_seen": 40845312, "step": 33670 }, { "epoch": 3.750417641162713, "grad_norm": 0.08207835257053375, "learning_rate": 3.9337221469685845e-05, "loss": 0.4659, "num_input_tokens_seen": 40850784, "step": 33675 }, { "epoch": 3.7509744960463305, "grad_norm": 0.10186073184013367, "learning_rate": 3.933324025153504e-05, "loss": 0.4629, "num_input_tokens_seen": 40856768, "step": 33680 }, { "epoch": 3.7515313509299477, "grad_norm": 0.10887549817562103, "learning_rate": 3.932925849182848e-05, "loss": 0.463, "num_input_tokens_seen": 40863072, "step": 33685 }, { "epoch": 3.752088205813565, "grad_norm": 0.10191363096237183, "learning_rate": 3.932527619071662e-05, "loss": 0.4566, "num_input_tokens_seen": 40869216, "step": 33690 }, { "epoch": 3.752645060697182, "grad_norm": 0.08415904641151428, "learning_rate": 3.932129334834991e-05, "loss": 0.4571, "num_input_tokens_seen": 40875040, "step": 33695 }, { "epoch": 3.7532019155807994, "grad_norm": 0.13785719871520996, "learning_rate": 3.931730996487885e-05, "loss": 0.4569, "num_input_tokens_seen": 40880640, "step": 33700 }, { "epoch": 3.753758770464417, "grad_norm": 0.1794702410697937, "learning_rate": 3.931332604045393e-05, "loss": 0.4568, "num_input_tokens_seen": 40887136, "step": 33705 }, { "epoch": 3.7543156253480343, "grad_norm": 0.13417279720306396, "learning_rate": 3.930934157522569e-05, "loss": 0.4675, "num_input_tokens_seen": 40892960, "step": 33710 }, { "epoch": 3.7548724802316515, "grad_norm": 0.10364723950624466, "learning_rate": 3.9305356569344665e-05, "loss": 0.4588, "num_input_tokens_seen": 40898720, "step": 33715 }, { "epoch": 3.755429335115269, "grad_norm": 0.07826809585094452, "learning_rate": 3.930137102296142e-05, "loss": 0.4644, "num_input_tokens_seen": 40904672, "step": 33720 }, { "epoch": 3.7559861899988864, "grad_norm": 0.10408046841621399, "learning_rate": 3.929738493622656e-05, "loss": 0.4671, "num_input_tokens_seen": 40911008, "step": 33725 }, { "epoch": 3.7565430448825037, "grad_norm": 0.12661516666412354, "learning_rate": 3.9293398309290666e-05, "loss": 0.4546, "num_input_tokens_seen": 40917376, "step": 33730 }, { "epoch": 3.757099899766121, "grad_norm": 0.12178245186805725, "learning_rate": 3.9289411142304375e-05, "loss": 0.4714, "num_input_tokens_seen": 40923520, "step": 33735 }, { "epoch": 3.757656754649738, "grad_norm": 0.1684918999671936, "learning_rate": 3.928542343541835e-05, "loss": 0.4652, "num_input_tokens_seen": 40929440, "step": 33740 }, { "epoch": 3.758213609533356, "grad_norm": 0.1745777428150177, "learning_rate": 3.928143518878324e-05, "loss": 0.462, "num_input_tokens_seen": 40935424, "step": 33745 }, { "epoch": 3.758770464416973, "grad_norm": 0.10271459072828293, "learning_rate": 3.9277446402549756e-05, "loss": 0.4717, "num_input_tokens_seen": 40941696, "step": 33750 }, { "epoch": 3.7593273193005903, "grad_norm": 0.10440979897975922, "learning_rate": 3.9273457076868584e-05, "loss": 0.4622, "num_input_tokens_seen": 40947776, "step": 33755 }, { "epoch": 3.7598841741842075, "grad_norm": 0.13298597931861877, "learning_rate": 3.926946721189045e-05, "loss": 0.467, "num_input_tokens_seen": 40954144, "step": 33760 }, { "epoch": 3.7604410290678247, "grad_norm": 0.07685402035713196, "learning_rate": 3.926547680776613e-05, "loss": 0.4611, "num_input_tokens_seen": 40960480, "step": 33765 }, { "epoch": 3.7609978839514424, "grad_norm": 0.11384179443120956, "learning_rate": 3.926148586464638e-05, "loss": 0.4582, "num_input_tokens_seen": 40966432, "step": 33770 }, { "epoch": 3.7615547388350596, "grad_norm": 0.15118086338043213, "learning_rate": 3.925749438268199e-05, "loss": 0.4642, "num_input_tokens_seen": 40972864, "step": 33775 }, { "epoch": 3.762111593718677, "grad_norm": 0.08358006179332733, "learning_rate": 3.9253502362023764e-05, "loss": 0.4548, "num_input_tokens_seen": 40979200, "step": 33780 }, { "epoch": 3.762668448602294, "grad_norm": 0.12813188135623932, "learning_rate": 3.9249509802822546e-05, "loss": 0.4695, "num_input_tokens_seen": 40985376, "step": 33785 }, { "epoch": 3.7632253034859113, "grad_norm": 0.18146800994873047, "learning_rate": 3.924551670522919e-05, "loss": 0.4645, "num_input_tokens_seen": 40991296, "step": 33790 }, { "epoch": 3.763782158369529, "grad_norm": 0.11871059238910675, "learning_rate": 3.9241523069394546e-05, "loss": 0.473, "num_input_tokens_seen": 40997504, "step": 33795 }, { "epoch": 3.7643390132531462, "grad_norm": 0.1349102407693863, "learning_rate": 3.923752889546953e-05, "loss": 0.4603, "num_input_tokens_seen": 41003616, "step": 33800 }, { "epoch": 3.7648958681367635, "grad_norm": 0.11175151914358139, "learning_rate": 3.9233534183605046e-05, "loss": 0.4749, "num_input_tokens_seen": 41008960, "step": 33805 }, { "epoch": 3.765452723020381, "grad_norm": 0.1495971381664276, "learning_rate": 3.9229538933952024e-05, "loss": 0.4714, "num_input_tokens_seen": 41015168, "step": 33810 }, { "epoch": 3.7660095779039984, "grad_norm": 0.11772947758436203, "learning_rate": 3.922554314666143e-05, "loss": 0.45, "num_input_tokens_seen": 41021568, "step": 33815 }, { "epoch": 3.7665664327876156, "grad_norm": 0.13386766612529755, "learning_rate": 3.922154682188422e-05, "loss": 0.4622, "num_input_tokens_seen": 41027936, "step": 33820 }, { "epoch": 3.767123287671233, "grad_norm": 0.12530027329921722, "learning_rate": 3.9217549959771395e-05, "loss": 0.4689, "num_input_tokens_seen": 41033856, "step": 33825 }, { "epoch": 3.76768014255485, "grad_norm": 0.12999258935451508, "learning_rate": 3.921355256047397e-05, "loss": 0.4659, "num_input_tokens_seen": 41039904, "step": 33830 }, { "epoch": 3.7682369974384677, "grad_norm": 0.1259377896785736, "learning_rate": 3.920955462414299e-05, "loss": 0.4543, "num_input_tokens_seen": 41046080, "step": 33835 }, { "epoch": 3.768793852322085, "grad_norm": 0.11754144728183746, "learning_rate": 3.9205556150929476e-05, "loss": 0.4577, "num_input_tokens_seen": 41052288, "step": 33840 }, { "epoch": 3.769350707205702, "grad_norm": 0.12417097389698029, "learning_rate": 3.9201557140984545e-05, "loss": 0.4624, "num_input_tokens_seen": 41058624, "step": 33845 }, { "epoch": 3.7699075620893194, "grad_norm": 0.10712695866823196, "learning_rate": 3.919755759445928e-05, "loss": 0.4748, "num_input_tokens_seen": 41064160, "step": 33850 }, { "epoch": 3.7704644169729367, "grad_norm": 0.11098593473434448, "learning_rate": 3.919355751150477e-05, "loss": 0.4758, "num_input_tokens_seen": 41070016, "step": 33855 }, { "epoch": 3.7710212718565543, "grad_norm": 0.10396921634674072, "learning_rate": 3.9189556892272185e-05, "loss": 0.4789, "num_input_tokens_seen": 41076320, "step": 33860 }, { "epoch": 3.7715781267401716, "grad_norm": 0.12074485421180725, "learning_rate": 3.918555573691267e-05, "loss": 0.4626, "num_input_tokens_seen": 41082304, "step": 33865 }, { "epoch": 3.772134981623789, "grad_norm": 0.11518154293298721, "learning_rate": 3.91815540455774e-05, "loss": 0.4752, "num_input_tokens_seen": 41088320, "step": 33870 }, { "epoch": 3.772691836507406, "grad_norm": 0.15705297887325287, "learning_rate": 3.917755181841756e-05, "loss": 0.4755, "num_input_tokens_seen": 41094432, "step": 33875 }, { "epoch": 3.7732486913910233, "grad_norm": 0.11973974853754044, "learning_rate": 3.917354905558438e-05, "loss": 0.4562, "num_input_tokens_seen": 41100736, "step": 33880 }, { "epoch": 3.773805546274641, "grad_norm": 0.10411804914474487, "learning_rate": 3.9169545757229106e-05, "loss": 0.4616, "num_input_tokens_seen": 41106944, "step": 33885 }, { "epoch": 3.774362401158258, "grad_norm": 0.1117476373910904, "learning_rate": 3.916554192350298e-05, "loss": 0.4746, "num_input_tokens_seen": 41112800, "step": 33890 }, { "epoch": 3.7749192560418754, "grad_norm": 0.10623755306005478, "learning_rate": 3.9161537554557284e-05, "loss": 0.4708, "num_input_tokens_seen": 41119040, "step": 33895 }, { "epoch": 3.775476110925493, "grad_norm": 0.09445314109325409, "learning_rate": 3.915753265054331e-05, "loss": 0.4506, "num_input_tokens_seen": 41125344, "step": 33900 }, { "epoch": 3.7760329658091103, "grad_norm": 0.13095472753047943, "learning_rate": 3.915352721161239e-05, "loss": 0.458, "num_input_tokens_seen": 41131520, "step": 33905 }, { "epoch": 3.7765898206927275, "grad_norm": 0.11211635172367096, "learning_rate": 3.9149521237915854e-05, "loss": 0.451, "num_input_tokens_seen": 41137696, "step": 33910 }, { "epoch": 3.7771466755763448, "grad_norm": 0.10845786333084106, "learning_rate": 3.914551472960506e-05, "loss": 0.4816, "num_input_tokens_seen": 41144064, "step": 33915 }, { "epoch": 3.777703530459962, "grad_norm": 0.09143447130918503, "learning_rate": 3.914150768683139e-05, "loss": 0.4682, "num_input_tokens_seen": 41150240, "step": 33920 }, { "epoch": 3.7782603853435797, "grad_norm": 0.12500876188278198, "learning_rate": 3.913750010974623e-05, "loss": 0.4613, "num_input_tokens_seen": 41155584, "step": 33925 }, { "epoch": 3.778817240227197, "grad_norm": 0.1266585886478424, "learning_rate": 3.913349199850103e-05, "loss": 0.4732, "num_input_tokens_seen": 41161664, "step": 33930 }, { "epoch": 3.779374095110814, "grad_norm": 0.08586657047271729, "learning_rate": 3.9129483353247195e-05, "loss": 0.4554, "num_input_tokens_seen": 41167904, "step": 33935 }, { "epoch": 3.7799309499944314, "grad_norm": 0.11637341231107712, "learning_rate": 3.9125474174136203e-05, "loss": 0.4707, "num_input_tokens_seen": 41173888, "step": 33940 }, { "epoch": 3.7804878048780486, "grad_norm": 0.18754321336746216, "learning_rate": 3.912146446131952e-05, "loss": 0.4648, "num_input_tokens_seen": 41179904, "step": 33945 }, { "epoch": 3.7810446597616663, "grad_norm": 0.14636710286140442, "learning_rate": 3.9117454214948665e-05, "loss": 0.4669, "num_input_tokens_seen": 41186400, "step": 33950 }, { "epoch": 3.7816015146452835, "grad_norm": 0.1006515696644783, "learning_rate": 3.911344343517515e-05, "loss": 0.4637, "num_input_tokens_seen": 41192448, "step": 33955 }, { "epoch": 3.7821583695289007, "grad_norm": 0.13893000781536102, "learning_rate": 3.9109432122150516e-05, "loss": 0.4647, "num_input_tokens_seen": 41198080, "step": 33960 }, { "epoch": 3.782715224412518, "grad_norm": 0.13888104259967804, "learning_rate": 3.910542027602631e-05, "loss": 0.4664, "num_input_tokens_seen": 41203840, "step": 33965 }, { "epoch": 3.783272079296135, "grad_norm": 0.10879026353359222, "learning_rate": 3.9101407896954125e-05, "loss": 0.4692, "num_input_tokens_seen": 41209888, "step": 33970 }, { "epoch": 3.783828934179753, "grad_norm": 0.09833880513906479, "learning_rate": 3.909739498508557e-05, "loss": 0.472, "num_input_tokens_seen": 41215808, "step": 33975 }, { "epoch": 3.78438578906337, "grad_norm": 0.11475487053394318, "learning_rate": 3.909338154057224e-05, "loss": 0.4812, "num_input_tokens_seen": 41221856, "step": 33980 }, { "epoch": 3.7849426439469873, "grad_norm": 0.08839789032936096, "learning_rate": 3.90893675635658e-05, "loss": 0.4516, "num_input_tokens_seen": 41227648, "step": 33985 }, { "epoch": 3.785499498830605, "grad_norm": 0.11511853337287903, "learning_rate": 3.9085353054217896e-05, "loss": 0.4685, "num_input_tokens_seen": 41233504, "step": 33990 }, { "epoch": 3.7860563537142222, "grad_norm": 0.136063352227211, "learning_rate": 3.908133801268022e-05, "loss": 0.471, "num_input_tokens_seen": 41239680, "step": 33995 }, { "epoch": 3.7866132085978395, "grad_norm": 0.11136060208082199, "learning_rate": 3.907732243910447e-05, "loss": 0.4684, "num_input_tokens_seen": 41245568, "step": 34000 }, { "epoch": 3.7871700634814567, "grad_norm": 0.09561469405889511, "learning_rate": 3.9073306333642355e-05, "loss": 0.4612, "num_input_tokens_seen": 41251264, "step": 34005 }, { "epoch": 3.787726918365074, "grad_norm": 0.10614977777004242, "learning_rate": 3.906928969644563e-05, "loss": 0.4462, "num_input_tokens_seen": 41257568, "step": 34010 }, { "epoch": 3.7882837732486916, "grad_norm": 0.1375333070755005, "learning_rate": 3.906527252766606e-05, "loss": 0.4697, "num_input_tokens_seen": 41263648, "step": 34015 }, { "epoch": 3.788840628132309, "grad_norm": 0.11325790733098984, "learning_rate": 3.906125482745542e-05, "loss": 0.4537, "num_input_tokens_seen": 41269792, "step": 34020 }, { "epoch": 3.789397483015926, "grad_norm": 0.10780840367078781, "learning_rate": 3.9057236595965505e-05, "loss": 0.4663, "num_input_tokens_seen": 41275904, "step": 34025 }, { "epoch": 3.7899543378995433, "grad_norm": 0.12772084772586823, "learning_rate": 3.905321783334814e-05, "loss": 0.4711, "num_input_tokens_seen": 41282144, "step": 34030 }, { "epoch": 3.7905111927831605, "grad_norm": 0.15594078600406647, "learning_rate": 3.9049198539755184e-05, "loss": 0.4574, "num_input_tokens_seen": 41288352, "step": 34035 }, { "epoch": 3.791068047666778, "grad_norm": 0.10872302204370499, "learning_rate": 3.904517871533847e-05, "loss": 0.4583, "num_input_tokens_seen": 41294432, "step": 34040 }, { "epoch": 3.7916249025503954, "grad_norm": 0.11147146672010422, "learning_rate": 3.9041158360249895e-05, "loss": 0.4672, "num_input_tokens_seen": 41300512, "step": 34045 }, { "epoch": 3.7921817574340126, "grad_norm": 0.1728898584842682, "learning_rate": 3.903713747464137e-05, "loss": 0.454, "num_input_tokens_seen": 41306720, "step": 34050 }, { "epoch": 3.79273861231763, "grad_norm": 0.12309367954730988, "learning_rate": 3.90331160586648e-05, "loss": 0.4706, "num_input_tokens_seen": 41312992, "step": 34055 }, { "epoch": 3.793295467201247, "grad_norm": 0.16316482424736023, "learning_rate": 3.902909411247213e-05, "loss": 0.4719, "num_input_tokens_seen": 41318880, "step": 34060 }, { "epoch": 3.793852322084865, "grad_norm": 0.09597089141607285, "learning_rate": 3.902507163621534e-05, "loss": 0.4705, "num_input_tokens_seen": 41324320, "step": 34065 }, { "epoch": 3.794409176968482, "grad_norm": 0.10780968517065048, "learning_rate": 3.9021048630046385e-05, "loss": 0.4578, "num_input_tokens_seen": 41330432, "step": 34070 }, { "epoch": 3.7949660318520992, "grad_norm": 0.11804629117250443, "learning_rate": 3.9017025094117284e-05, "loss": 0.4668, "num_input_tokens_seen": 41336576, "step": 34075 }, { "epoch": 3.795522886735717, "grad_norm": 0.11200462281703949, "learning_rate": 3.9013001028580054e-05, "loss": 0.453, "num_input_tokens_seen": 41342848, "step": 34080 }, { "epoch": 3.796079741619334, "grad_norm": 0.11402666568756104, "learning_rate": 3.900897643358674e-05, "loss": 0.4718, "num_input_tokens_seen": 41348768, "step": 34085 }, { "epoch": 3.7966365965029514, "grad_norm": 0.10429957509040833, "learning_rate": 3.90049513092894e-05, "loss": 0.4755, "num_input_tokens_seen": 41354528, "step": 34090 }, { "epoch": 3.7971934513865686, "grad_norm": 0.09319762140512466, "learning_rate": 3.900092565584012e-05, "loss": 0.4549, "num_input_tokens_seen": 41360736, "step": 34095 }, { "epoch": 3.797750306270186, "grad_norm": 0.11322570592164993, "learning_rate": 3.8996899473391e-05, "loss": 0.4702, "num_input_tokens_seen": 41366976, "step": 34100 }, { "epoch": 3.7983071611538035, "grad_norm": 0.1274120956659317, "learning_rate": 3.8992872762094165e-05, "loss": 0.4768, "num_input_tokens_seen": 41372928, "step": 34105 }, { "epoch": 3.7988640160374207, "grad_norm": 0.11080379784107208, "learning_rate": 3.8988845522101756e-05, "loss": 0.4624, "num_input_tokens_seen": 41378816, "step": 34110 }, { "epoch": 3.799420870921038, "grad_norm": 0.11139469593763351, "learning_rate": 3.898481775356593e-05, "loss": 0.4647, "num_input_tokens_seen": 41385408, "step": 34115 }, { "epoch": 3.799977725804655, "grad_norm": 0.10185275226831436, "learning_rate": 3.898078945663887e-05, "loss": 0.454, "num_input_tokens_seen": 41392032, "step": 34120 }, { "epoch": 3.8005345806882724, "grad_norm": 0.1129477396607399, "learning_rate": 3.8976760631472786e-05, "loss": 0.4587, "num_input_tokens_seen": 41397952, "step": 34125 }, { "epoch": 3.80109143557189, "grad_norm": 0.10854858160018921, "learning_rate": 3.897273127821989e-05, "loss": 0.4761, "num_input_tokens_seen": 41403392, "step": 34130 }, { "epoch": 3.8016482904555073, "grad_norm": 0.12462734431028366, "learning_rate": 3.896870139703244e-05, "loss": 0.475, "num_input_tokens_seen": 41409664, "step": 34135 }, { "epoch": 3.8022051453391246, "grad_norm": 0.09957718849182129, "learning_rate": 3.896467098806267e-05, "loss": 0.4568, "num_input_tokens_seen": 41414912, "step": 34140 }, { "epoch": 3.802762000222742, "grad_norm": 0.09338197857141495, "learning_rate": 3.8960640051462886e-05, "loss": 0.4667, "num_input_tokens_seen": 41420992, "step": 34145 }, { "epoch": 3.803318855106359, "grad_norm": 0.08658773452043533, "learning_rate": 3.895660858738538e-05, "loss": 0.4589, "num_input_tokens_seen": 41427040, "step": 34150 }, { "epoch": 3.8038757099899767, "grad_norm": 0.09880197793245316, "learning_rate": 3.895257659598248e-05, "loss": 0.473, "num_input_tokens_seen": 41431968, "step": 34155 }, { "epoch": 3.804432564873594, "grad_norm": 0.09091471880674362, "learning_rate": 3.894854407740652e-05, "loss": 0.4675, "num_input_tokens_seen": 41438048, "step": 34160 }, { "epoch": 3.804989419757211, "grad_norm": 0.10721496492624283, "learning_rate": 3.8944511031809865e-05, "loss": 0.4595, "num_input_tokens_seen": 41444192, "step": 34165 }, { "epoch": 3.805546274640829, "grad_norm": 0.09918869286775589, "learning_rate": 3.89404774593449e-05, "loss": 0.4762, "num_input_tokens_seen": 41449984, "step": 34170 }, { "epoch": 3.806103129524446, "grad_norm": 0.09584306180477142, "learning_rate": 3.893644336016401e-05, "loss": 0.4506, "num_input_tokens_seen": 41456512, "step": 34175 }, { "epoch": 3.8066599844080633, "grad_norm": 0.13603240251541138, "learning_rate": 3.893240873441964e-05, "loss": 0.4691, "num_input_tokens_seen": 41462176, "step": 34180 }, { "epoch": 3.8072168392916805, "grad_norm": 0.08907065540552139, "learning_rate": 3.8928373582264215e-05, "loss": 0.4647, "num_input_tokens_seen": 41468448, "step": 34185 }, { "epoch": 3.8077736941752978, "grad_norm": 0.12323721498250961, "learning_rate": 3.892433790385021e-05, "loss": 0.4629, "num_input_tokens_seen": 41474848, "step": 34190 }, { "epoch": 3.8083305490589154, "grad_norm": 0.08729144930839539, "learning_rate": 3.8920301699330076e-05, "loss": 0.4696, "num_input_tokens_seen": 41480768, "step": 34195 }, { "epoch": 3.8088874039425327, "grad_norm": 0.10953851044178009, "learning_rate": 3.891626496885634e-05, "loss": 0.4571, "num_input_tokens_seen": 41487072, "step": 34200 }, { "epoch": 3.80944425882615, "grad_norm": 0.09805894643068314, "learning_rate": 3.891222771258153e-05, "loss": 0.4535, "num_input_tokens_seen": 41493120, "step": 34205 }, { "epoch": 3.810001113709767, "grad_norm": 0.09118843823671341, "learning_rate": 3.890818993065816e-05, "loss": 0.4623, "num_input_tokens_seen": 41499520, "step": 34210 }, { "epoch": 3.8105579685933844, "grad_norm": 0.109297014772892, "learning_rate": 3.890415162323881e-05, "loss": 0.4616, "num_input_tokens_seen": 41506176, "step": 34215 }, { "epoch": 3.811114823477002, "grad_norm": 0.08671506494283676, "learning_rate": 3.890011279047605e-05, "loss": 0.456, "num_input_tokens_seen": 41512192, "step": 34220 }, { "epoch": 3.8116716783606193, "grad_norm": 0.11100200563669205, "learning_rate": 3.889607343252248e-05, "loss": 0.4528, "num_input_tokens_seen": 41518176, "step": 34225 }, { "epoch": 3.8122285332442365, "grad_norm": 0.103350929915905, "learning_rate": 3.889203354953074e-05, "loss": 0.4509, "num_input_tokens_seen": 41524160, "step": 34230 }, { "epoch": 3.8127853881278537, "grad_norm": 0.14783304929733276, "learning_rate": 3.888799314165343e-05, "loss": 0.4668, "num_input_tokens_seen": 41530336, "step": 34235 }, { "epoch": 3.813342243011471, "grad_norm": 0.08105645328760147, "learning_rate": 3.888395220904325e-05, "loss": 0.4661, "num_input_tokens_seen": 41536480, "step": 34240 }, { "epoch": 3.8138990978950886, "grad_norm": 0.10849327594041824, "learning_rate": 3.887991075185285e-05, "loss": 0.4606, "num_input_tokens_seen": 41542464, "step": 34245 }, { "epoch": 3.814455952778706, "grad_norm": 0.16023187339305878, "learning_rate": 3.887586877023496e-05, "loss": 0.4723, "num_input_tokens_seen": 41548992, "step": 34250 }, { "epoch": 3.815012807662323, "grad_norm": 0.13048847019672394, "learning_rate": 3.887182626434227e-05, "loss": 0.4604, "num_input_tokens_seen": 41555360, "step": 34255 }, { "epoch": 3.8155696625459408, "grad_norm": 0.0974530279636383, "learning_rate": 3.8867783234327526e-05, "loss": 0.4677, "num_input_tokens_seen": 41561824, "step": 34260 }, { "epoch": 3.816126517429558, "grad_norm": 0.09210918843746185, "learning_rate": 3.886373968034349e-05, "loss": 0.466, "num_input_tokens_seen": 41568192, "step": 34265 }, { "epoch": 3.8166833723131752, "grad_norm": 0.07968369871377945, "learning_rate": 3.885969560254295e-05, "loss": 0.4605, "num_input_tokens_seen": 41574176, "step": 34270 }, { "epoch": 3.8172402271967925, "grad_norm": 0.09273415803909302, "learning_rate": 3.885565100107868e-05, "loss": 0.4698, "num_input_tokens_seen": 41580192, "step": 34275 }, { "epoch": 3.8177970820804097, "grad_norm": 0.09934060275554657, "learning_rate": 3.885160587610352e-05, "loss": 0.4567, "num_input_tokens_seen": 41586304, "step": 34280 }, { "epoch": 3.8183539369640274, "grad_norm": 0.12593284249305725, "learning_rate": 3.8847560227770305e-05, "loss": 0.4707, "num_input_tokens_seen": 41592224, "step": 34285 }, { "epoch": 3.8189107918476446, "grad_norm": 0.0991821438074112, "learning_rate": 3.8843514056231875e-05, "loss": 0.4717, "num_input_tokens_seen": 41598336, "step": 34290 }, { "epoch": 3.819467646731262, "grad_norm": 0.13127310574054718, "learning_rate": 3.883946736164113e-05, "loss": 0.458, "num_input_tokens_seen": 41604480, "step": 34295 }, { "epoch": 3.820024501614879, "grad_norm": 0.0910361036658287, "learning_rate": 3.883542014415096e-05, "loss": 0.4507, "num_input_tokens_seen": 41610368, "step": 34300 }, { "epoch": 3.8205813564984963, "grad_norm": 0.13304410874843597, "learning_rate": 3.8831372403914274e-05, "loss": 0.4706, "num_input_tokens_seen": 41616352, "step": 34305 }, { "epoch": 3.821138211382114, "grad_norm": 0.1061197966337204, "learning_rate": 3.882732414108401e-05, "loss": 0.4538, "num_input_tokens_seen": 41622304, "step": 34310 }, { "epoch": 3.821695066265731, "grad_norm": 0.10229407250881195, "learning_rate": 3.882327535581314e-05, "loss": 0.4669, "num_input_tokens_seen": 41628384, "step": 34315 }, { "epoch": 3.8222519211493484, "grad_norm": 0.09252741932868958, "learning_rate": 3.881922604825462e-05, "loss": 0.462, "num_input_tokens_seen": 41634400, "step": 34320 }, { "epoch": 3.8228087760329656, "grad_norm": 0.09158894419670105, "learning_rate": 3.881517621856145e-05, "loss": 0.4701, "num_input_tokens_seen": 41640960, "step": 34325 }, { "epoch": 3.823365630916583, "grad_norm": 0.10502877831459045, "learning_rate": 3.8811125866886654e-05, "loss": 0.4696, "num_input_tokens_seen": 41647040, "step": 34330 }, { "epoch": 3.8239224858002006, "grad_norm": 0.11291535198688507, "learning_rate": 3.8807074993383265e-05, "loss": 0.485, "num_input_tokens_seen": 41652896, "step": 34335 }, { "epoch": 3.824479340683818, "grad_norm": 0.11408594995737076, "learning_rate": 3.880302359820434e-05, "loss": 0.4719, "num_input_tokens_seen": 41659040, "step": 34340 }, { "epoch": 3.825036195567435, "grad_norm": 0.07366008311510086, "learning_rate": 3.8798971681502936e-05, "loss": 0.4643, "num_input_tokens_seen": 41664960, "step": 34345 }, { "epoch": 3.8255930504510527, "grad_norm": 0.09701760858297348, "learning_rate": 3.879491924343217e-05, "loss": 0.4667, "num_input_tokens_seen": 41670976, "step": 34350 }, { "epoch": 3.82614990533467, "grad_norm": 0.0890149399638176, "learning_rate": 3.8790866284145146e-05, "loss": 0.4654, "num_input_tokens_seen": 41677280, "step": 34355 }, { "epoch": 3.826706760218287, "grad_norm": 0.08785679191350937, "learning_rate": 3.8786812803795e-05, "loss": 0.4534, "num_input_tokens_seen": 41683616, "step": 34360 }, { "epoch": 3.8272636151019044, "grad_norm": 0.12633854150772095, "learning_rate": 3.878275880253488e-05, "loss": 0.4652, "num_input_tokens_seen": 41689952, "step": 34365 }, { "epoch": 3.8278204699855216, "grad_norm": 0.11037483811378479, "learning_rate": 3.877870428051797e-05, "loss": 0.4663, "num_input_tokens_seen": 41696544, "step": 34370 }, { "epoch": 3.8283773248691393, "grad_norm": 0.1150163784623146, "learning_rate": 3.877464923789745e-05, "loss": 0.4549, "num_input_tokens_seen": 41702464, "step": 34375 }, { "epoch": 3.8289341797527565, "grad_norm": 0.08682141453027725, "learning_rate": 3.8770593674826547e-05, "loss": 0.4564, "num_input_tokens_seen": 41708800, "step": 34380 }, { "epoch": 3.8294910346363737, "grad_norm": 0.14744536578655243, "learning_rate": 3.876653759145847e-05, "loss": 0.4779, "num_input_tokens_seen": 41714784, "step": 34385 }, { "epoch": 3.830047889519991, "grad_norm": 0.08210261911153793, "learning_rate": 3.87624809879465e-05, "loss": 0.4572, "num_input_tokens_seen": 41720768, "step": 34390 }, { "epoch": 3.830604744403608, "grad_norm": 0.0867156833410263, "learning_rate": 3.875842386444389e-05, "loss": 0.4566, "num_input_tokens_seen": 41726560, "step": 34395 }, { "epoch": 3.831161599287226, "grad_norm": 0.10943593829870224, "learning_rate": 3.8754366221103934e-05, "loss": 0.4639, "num_input_tokens_seen": 41732384, "step": 34400 }, { "epoch": 3.831718454170843, "grad_norm": 0.10788046568632126, "learning_rate": 3.875030805807994e-05, "loss": 0.4683, "num_input_tokens_seen": 41738368, "step": 34405 }, { "epoch": 3.8322753090544603, "grad_norm": 0.10096628218889236, "learning_rate": 3.8746249375525246e-05, "loss": 0.4644, "num_input_tokens_seen": 41744288, "step": 34410 }, { "epoch": 3.8328321639380776, "grad_norm": 0.13441389799118042, "learning_rate": 3.87421901735932e-05, "loss": 0.4632, "num_input_tokens_seen": 41749952, "step": 34415 }, { "epoch": 3.833389018821695, "grad_norm": 0.12029937654733658, "learning_rate": 3.873813045243717e-05, "loss": 0.4477, "num_input_tokens_seen": 41756064, "step": 34420 }, { "epoch": 3.8339458737053125, "grad_norm": 0.09670980274677277, "learning_rate": 3.873407021221055e-05, "loss": 0.4674, "num_input_tokens_seen": 41762368, "step": 34425 }, { "epoch": 3.8345027285889297, "grad_norm": 0.09744395315647125, "learning_rate": 3.873000945306674e-05, "loss": 0.457, "num_input_tokens_seen": 41768576, "step": 34430 }, { "epoch": 3.835059583472547, "grad_norm": 0.07672962546348572, "learning_rate": 3.872594817515917e-05, "loss": 0.4659, "num_input_tokens_seen": 41774560, "step": 34435 }, { "epoch": 3.8356164383561646, "grad_norm": 0.10638201981782913, "learning_rate": 3.872188637864129e-05, "loss": 0.4758, "num_input_tokens_seen": 41780352, "step": 34440 }, { "epoch": 3.836173293239782, "grad_norm": 0.09632602334022522, "learning_rate": 3.871782406366658e-05, "loss": 0.4656, "num_input_tokens_seen": 41786752, "step": 34445 }, { "epoch": 3.836730148123399, "grad_norm": 0.13056284189224243, "learning_rate": 3.87137612303885e-05, "loss": 0.4582, "num_input_tokens_seen": 41792992, "step": 34450 }, { "epoch": 3.8372870030070163, "grad_norm": 0.10237565636634827, "learning_rate": 3.8709697878960585e-05, "loss": 0.4613, "num_input_tokens_seen": 41798592, "step": 34455 }, { "epoch": 3.8378438578906335, "grad_norm": 0.08883894979953766, "learning_rate": 3.870563400953634e-05, "loss": 0.4667, "num_input_tokens_seen": 41804672, "step": 34460 }, { "epoch": 3.838400712774251, "grad_norm": 0.09807217121124268, "learning_rate": 3.870156962226933e-05, "loss": 0.4596, "num_input_tokens_seen": 41810688, "step": 34465 }, { "epoch": 3.8389575676578684, "grad_norm": 0.09711482375860214, "learning_rate": 3.86975047173131e-05, "loss": 0.4666, "num_input_tokens_seen": 41816768, "step": 34470 }, { "epoch": 3.8395144225414857, "grad_norm": 0.14496055245399475, "learning_rate": 3.8693439294821254e-05, "loss": 0.4698, "num_input_tokens_seen": 41822880, "step": 34475 }, { "epoch": 3.840071277425103, "grad_norm": 0.11306474357843399, "learning_rate": 3.868937335494739e-05, "loss": 0.4715, "num_input_tokens_seen": 41828960, "step": 34480 }, { "epoch": 3.84062813230872, "grad_norm": 0.10378647595643997, "learning_rate": 3.868530689784513e-05, "loss": 0.4555, "num_input_tokens_seen": 41835136, "step": 34485 }, { "epoch": 3.841184987192338, "grad_norm": 0.11562294512987137, "learning_rate": 3.868123992366811e-05, "loss": 0.4722, "num_input_tokens_seen": 41840768, "step": 34490 }, { "epoch": 3.841741842075955, "grad_norm": 0.10527967661619186, "learning_rate": 3.867717243257001e-05, "loss": 0.4602, "num_input_tokens_seen": 41846976, "step": 34495 }, { "epoch": 3.8422986969595723, "grad_norm": 0.10100668668746948, "learning_rate": 3.867310442470451e-05, "loss": 0.4545, "num_input_tokens_seen": 41853280, "step": 34500 }, { "epoch": 3.8428555518431895, "grad_norm": 0.10291695594787598, "learning_rate": 3.866903590022529e-05, "loss": 0.4628, "num_input_tokens_seen": 41859264, "step": 34505 }, { "epoch": 3.8434124067268067, "grad_norm": 0.13623465597629547, "learning_rate": 3.86649668592861e-05, "loss": 0.4763, "num_input_tokens_seen": 41864992, "step": 34510 }, { "epoch": 3.8439692616104244, "grad_norm": 0.08875658363103867, "learning_rate": 3.866089730204068e-05, "loss": 0.4696, "num_input_tokens_seen": 41870560, "step": 34515 }, { "epoch": 3.8445261164940416, "grad_norm": 0.0591229684650898, "learning_rate": 3.865682722864277e-05, "loss": 0.4594, "num_input_tokens_seen": 41876672, "step": 34520 }, { "epoch": 3.845082971377659, "grad_norm": 0.10252810269594193, "learning_rate": 3.865275663924616e-05, "loss": 0.4614, "num_input_tokens_seen": 41883072, "step": 34525 }, { "epoch": 3.8456398262612765, "grad_norm": 0.09902482479810715, "learning_rate": 3.864868553400466e-05, "loss": 0.4717, "num_input_tokens_seen": 41888704, "step": 34530 }, { "epoch": 3.8461966811448938, "grad_norm": 0.10415913909673691, "learning_rate": 3.864461391307207e-05, "loss": 0.4687, "num_input_tokens_seen": 41895168, "step": 34535 }, { "epoch": 3.846753536028511, "grad_norm": 0.18791073560714722, "learning_rate": 3.864054177660225e-05, "loss": 0.4673, "num_input_tokens_seen": 41901568, "step": 34540 }, { "epoch": 3.8473103909121282, "grad_norm": 0.08871276676654816, "learning_rate": 3.8636469124749044e-05, "loss": 0.4401, "num_input_tokens_seen": 41907584, "step": 34545 }, { "epoch": 3.8478672457957455, "grad_norm": 0.10626587271690369, "learning_rate": 3.863239595766633e-05, "loss": 0.4677, "num_input_tokens_seen": 41913632, "step": 34550 }, { "epoch": 3.848424100679363, "grad_norm": 0.09799452871084213, "learning_rate": 3.862832227550802e-05, "loss": 0.4491, "num_input_tokens_seen": 41919872, "step": 34555 }, { "epoch": 3.8489809555629804, "grad_norm": 0.08538585901260376, "learning_rate": 3.8624248078428014e-05, "loss": 0.4753, "num_input_tokens_seen": 41926080, "step": 34560 }, { "epoch": 3.8495378104465976, "grad_norm": 0.10085771977901459, "learning_rate": 3.862017336658025e-05, "loss": 0.4623, "num_input_tokens_seen": 41931904, "step": 34565 }, { "epoch": 3.850094665330215, "grad_norm": 0.0839577466249466, "learning_rate": 3.86160981401187e-05, "loss": 0.4658, "num_input_tokens_seen": 41937792, "step": 34570 }, { "epoch": 3.850651520213832, "grad_norm": 0.12899050116539001, "learning_rate": 3.861202239919732e-05, "loss": 0.4533, "num_input_tokens_seen": 41944032, "step": 34575 }, { "epoch": 3.8512083750974497, "grad_norm": 0.11937831342220306, "learning_rate": 3.860794614397012e-05, "loss": 0.4616, "num_input_tokens_seen": 41949600, "step": 34580 }, { "epoch": 3.851765229981067, "grad_norm": 0.11203434318304062, "learning_rate": 3.86038693745911e-05, "loss": 0.4541, "num_input_tokens_seen": 41955776, "step": 34585 }, { "epoch": 3.852322084864684, "grad_norm": 0.12061756104230881, "learning_rate": 3.8599792091214295e-05, "loss": 0.4696, "num_input_tokens_seen": 41961856, "step": 34590 }, { "epoch": 3.852878939748302, "grad_norm": 0.11363813281059265, "learning_rate": 3.8595714293993764e-05, "loss": 0.477, "num_input_tokens_seen": 41967872, "step": 34595 }, { "epoch": 3.8534357946319187, "grad_norm": 0.1052810326218605, "learning_rate": 3.8591635983083576e-05, "loss": 0.468, "num_input_tokens_seen": 41974016, "step": 34600 }, { "epoch": 3.8539926495155363, "grad_norm": 0.11769969016313553, "learning_rate": 3.8587557158637825e-05, "loss": 0.4609, "num_input_tokens_seen": 41980064, "step": 34605 }, { "epoch": 3.8545495043991536, "grad_norm": 0.11135053634643555, "learning_rate": 3.858347782081063e-05, "loss": 0.4637, "num_input_tokens_seen": 41986432, "step": 34610 }, { "epoch": 3.855106359282771, "grad_norm": 0.1487632840871811, "learning_rate": 3.8579397969756106e-05, "loss": 0.4646, "num_input_tokens_seen": 41992704, "step": 34615 }, { "epoch": 3.8556632141663885, "grad_norm": 0.09983836114406586, "learning_rate": 3.8575317605628404e-05, "loss": 0.4804, "num_input_tokens_seen": 41998912, "step": 34620 }, { "epoch": 3.8562200690500057, "grad_norm": 0.1343790590763092, "learning_rate": 3.8571236728581704e-05, "loss": 0.4597, "num_input_tokens_seen": 42004832, "step": 34625 }, { "epoch": 3.856776923933623, "grad_norm": 0.1049799919128418, "learning_rate": 3.8567155338770186e-05, "loss": 0.4732, "num_input_tokens_seen": 42010976, "step": 34630 }, { "epoch": 3.85733377881724, "grad_norm": 0.1037222295999527, "learning_rate": 3.8563073436348064e-05, "loss": 0.4597, "num_input_tokens_seen": 42017504, "step": 34635 }, { "epoch": 3.8578906337008574, "grad_norm": 0.1560497283935547, "learning_rate": 3.855899102146956e-05, "loss": 0.4887, "num_input_tokens_seen": 42024064, "step": 34640 }, { "epoch": 3.858447488584475, "grad_norm": 0.11508995294570923, "learning_rate": 3.855490809428893e-05, "loss": 0.4597, "num_input_tokens_seen": 42030112, "step": 34645 }, { "epoch": 3.8590043434680923, "grad_norm": 0.12686868011951447, "learning_rate": 3.855082465496043e-05, "loss": 0.4648, "num_input_tokens_seen": 42036416, "step": 34650 }, { "epoch": 3.8595611983517095, "grad_norm": 0.1085076853632927, "learning_rate": 3.8546740703638344e-05, "loss": 0.4565, "num_input_tokens_seen": 42042400, "step": 34655 }, { "epoch": 3.8601180532353268, "grad_norm": 0.10561800748109818, "learning_rate": 3.854265624047699e-05, "loss": 0.4708, "num_input_tokens_seen": 42048672, "step": 34660 }, { "epoch": 3.860674908118944, "grad_norm": 0.10157246887683868, "learning_rate": 3.8538571265630675e-05, "loss": 0.4671, "num_input_tokens_seen": 42055040, "step": 34665 }, { "epoch": 3.8612317630025617, "grad_norm": 0.11322583258152008, "learning_rate": 3.853448577925375e-05, "loss": 0.4748, "num_input_tokens_seen": 42061088, "step": 34670 }, { "epoch": 3.861788617886179, "grad_norm": 0.12097644805908203, "learning_rate": 3.8530399781500593e-05, "loss": 0.4687, "num_input_tokens_seen": 42067200, "step": 34675 }, { "epoch": 3.862345472769796, "grad_norm": 0.10360563546419144, "learning_rate": 3.852631327252556e-05, "loss": 0.4747, "num_input_tokens_seen": 42073216, "step": 34680 }, { "epoch": 3.862902327653414, "grad_norm": 0.14003457129001617, "learning_rate": 3.8522226252483066e-05, "loss": 0.4664, "num_input_tokens_seen": 42079168, "step": 34685 }, { "epoch": 3.8634591825370306, "grad_norm": 0.09418793767690659, "learning_rate": 3.851813872152753e-05, "loss": 0.4691, "num_input_tokens_seen": 42085472, "step": 34690 }, { "epoch": 3.8640160374206483, "grad_norm": 0.09517430514097214, "learning_rate": 3.85140506798134e-05, "loss": 0.4567, "num_input_tokens_seen": 42091616, "step": 34695 }, { "epoch": 3.8645728923042655, "grad_norm": 0.1502991020679474, "learning_rate": 3.850996212749511e-05, "loss": 0.4562, "num_input_tokens_seen": 42097568, "step": 34700 }, { "epoch": 3.8651297471878827, "grad_norm": 0.08203088492155075, "learning_rate": 3.8505873064727174e-05, "loss": 0.4672, "num_input_tokens_seen": 42103904, "step": 34705 }, { "epoch": 3.8656866020715004, "grad_norm": 0.12391582876443863, "learning_rate": 3.850178349166406e-05, "loss": 0.4682, "num_input_tokens_seen": 42110016, "step": 34710 }, { "epoch": 3.8662434569551176, "grad_norm": 0.12011800706386566, "learning_rate": 3.84976934084603e-05, "loss": 0.458, "num_input_tokens_seen": 42116000, "step": 34715 }, { "epoch": 3.866800311838735, "grad_norm": 0.1036120057106018, "learning_rate": 3.849360281527044e-05, "loss": 0.4667, "num_input_tokens_seen": 42121664, "step": 34720 }, { "epoch": 3.867357166722352, "grad_norm": 0.11390482634305954, "learning_rate": 3.848951171224901e-05, "loss": 0.4654, "num_input_tokens_seen": 42127136, "step": 34725 }, { "epoch": 3.8679140216059693, "grad_norm": 0.13061782717704773, "learning_rate": 3.8485420099550594e-05, "loss": 0.4611, "num_input_tokens_seen": 42133408, "step": 34730 }, { "epoch": 3.868470876489587, "grad_norm": 0.08685877919197083, "learning_rate": 3.84813279773298e-05, "loss": 0.4567, "num_input_tokens_seen": 42139520, "step": 34735 }, { "epoch": 3.869027731373204, "grad_norm": 0.11424325406551361, "learning_rate": 3.8477235345741225e-05, "loss": 0.4594, "num_input_tokens_seen": 42145248, "step": 34740 }, { "epoch": 3.8695845862568214, "grad_norm": 0.10614123940467834, "learning_rate": 3.847314220493952e-05, "loss": 0.4676, "num_input_tokens_seen": 42151264, "step": 34745 }, { "epoch": 3.8701414411404387, "grad_norm": 0.09482286125421524, "learning_rate": 3.846904855507932e-05, "loss": 0.4611, "num_input_tokens_seen": 42157600, "step": 34750 }, { "epoch": 3.870698296024056, "grad_norm": 0.08852240443229675, "learning_rate": 3.84649543963153e-05, "loss": 0.4588, "num_input_tokens_seen": 42163776, "step": 34755 }, { "epoch": 3.8712551509076736, "grad_norm": 0.10444767773151398, "learning_rate": 3.846085972880215e-05, "loss": 0.4778, "num_input_tokens_seen": 42169792, "step": 34760 }, { "epoch": 3.871812005791291, "grad_norm": 0.16264154016971588, "learning_rate": 3.845676455269459e-05, "loss": 0.4633, "num_input_tokens_seen": 42175104, "step": 34765 }, { "epoch": 3.872368860674908, "grad_norm": 0.11088673770427704, "learning_rate": 3.845266886814733e-05, "loss": 0.451, "num_input_tokens_seen": 42181184, "step": 34770 }, { "epoch": 3.8729257155585257, "grad_norm": 0.11963669210672379, "learning_rate": 3.8448572675315144e-05, "loss": 0.4612, "num_input_tokens_seen": 42187264, "step": 34775 }, { "epoch": 3.8734825704421425, "grad_norm": 0.09825414419174194, "learning_rate": 3.844447597435277e-05, "loss": 0.454, "num_input_tokens_seen": 42193344, "step": 34780 }, { "epoch": 3.87403942532576, "grad_norm": 0.09653107821941376, "learning_rate": 3.844037876541502e-05, "loss": 0.468, "num_input_tokens_seen": 42199360, "step": 34785 }, { "epoch": 3.8745962802093774, "grad_norm": 0.10333535820245743, "learning_rate": 3.843628104865668e-05, "loss": 0.4825, "num_input_tokens_seen": 42205760, "step": 34790 }, { "epoch": 3.8751531350929946, "grad_norm": 0.10983970761299133, "learning_rate": 3.843218282423258e-05, "loss": 0.4579, "num_input_tokens_seen": 42211904, "step": 34795 }, { "epoch": 3.8757099899766123, "grad_norm": 0.11175411194562912, "learning_rate": 3.842808409229758e-05, "loss": 0.4663, "num_input_tokens_seen": 42217952, "step": 34800 }, { "epoch": 3.8762668448602295, "grad_norm": 0.1139230951666832, "learning_rate": 3.842398485300652e-05, "loss": 0.4585, "num_input_tokens_seen": 42224224, "step": 34805 }, { "epoch": 3.8768236997438468, "grad_norm": 0.11066878587007523, "learning_rate": 3.84198851065143e-05, "loss": 0.4586, "num_input_tokens_seen": 42230496, "step": 34810 }, { "epoch": 3.877380554627464, "grad_norm": 0.14729931950569153, "learning_rate": 3.8415784852975814e-05, "loss": 0.4727, "num_input_tokens_seen": 42236224, "step": 34815 }, { "epoch": 3.8779374095110812, "grad_norm": 0.1280653178691864, "learning_rate": 3.841168409254597e-05, "loss": 0.4605, "num_input_tokens_seen": 42242144, "step": 34820 }, { "epoch": 3.878494264394699, "grad_norm": 0.12145908921957016, "learning_rate": 3.8407582825379735e-05, "loss": 0.4798, "num_input_tokens_seen": 42248192, "step": 34825 }, { "epoch": 3.879051119278316, "grad_norm": 0.10717451572418213, "learning_rate": 3.840348105163205e-05, "loss": 0.4611, "num_input_tokens_seen": 42253728, "step": 34830 }, { "epoch": 3.8796079741619334, "grad_norm": 0.0850910022854805, "learning_rate": 3.839937877145789e-05, "loss": 0.4558, "num_input_tokens_seen": 42260064, "step": 34835 }, { "epoch": 3.8801648290455506, "grad_norm": 0.15029489994049072, "learning_rate": 3.839527598501228e-05, "loss": 0.4526, "num_input_tokens_seen": 42266624, "step": 34840 }, { "epoch": 3.880721683929168, "grad_norm": 0.11908377707004547, "learning_rate": 3.83911726924502e-05, "loss": 0.4613, "num_input_tokens_seen": 42272736, "step": 34845 }, { "epoch": 3.8812785388127855, "grad_norm": 0.12429489940404892, "learning_rate": 3.83870688939267e-05, "loss": 0.4642, "num_input_tokens_seen": 42278624, "step": 34850 }, { "epoch": 3.8818353936964027, "grad_norm": 0.08495055884122849, "learning_rate": 3.838296458959685e-05, "loss": 0.4445, "num_input_tokens_seen": 42284704, "step": 34855 }, { "epoch": 3.88239224858002, "grad_norm": 0.10229743272066116, "learning_rate": 3.837885977961569e-05, "loss": 0.462, "num_input_tokens_seen": 42290752, "step": 34860 }, { "epoch": 3.8829491034636376, "grad_norm": 0.12492891401052475, "learning_rate": 3.837475446413835e-05, "loss": 0.448, "num_input_tokens_seen": 42296704, "step": 34865 }, { "epoch": 3.883505958347255, "grad_norm": 0.107121042907238, "learning_rate": 3.837064864331992e-05, "loss": 0.4641, "num_input_tokens_seen": 42302848, "step": 34870 }, { "epoch": 3.884062813230872, "grad_norm": 0.10141590237617493, "learning_rate": 3.836654231731554e-05, "loss": 0.4574, "num_input_tokens_seen": 42309216, "step": 34875 }, { "epoch": 3.8846196681144893, "grad_norm": 0.11175332218408585, "learning_rate": 3.836243548628035e-05, "loss": 0.4719, "num_input_tokens_seen": 42315488, "step": 34880 }, { "epoch": 3.8851765229981066, "grad_norm": 0.09500859677791595, "learning_rate": 3.835832815036953e-05, "loss": 0.4763, "num_input_tokens_seen": 42321696, "step": 34885 }, { "epoch": 3.8857333778817242, "grad_norm": 0.10567492246627808, "learning_rate": 3.835422030973825e-05, "loss": 0.4602, "num_input_tokens_seen": 42328128, "step": 34890 }, { "epoch": 3.8862902327653415, "grad_norm": 0.11988668143749237, "learning_rate": 3.835011196454175e-05, "loss": 0.4609, "num_input_tokens_seen": 42334496, "step": 34895 }, { "epoch": 3.8868470876489587, "grad_norm": 0.10675762593746185, "learning_rate": 3.8346003114935225e-05, "loss": 0.4669, "num_input_tokens_seen": 42340608, "step": 34900 }, { "epoch": 3.887403942532576, "grad_norm": 0.11751411855220795, "learning_rate": 3.834189376107393e-05, "loss": 0.4586, "num_input_tokens_seen": 42346816, "step": 34905 }, { "epoch": 3.887960797416193, "grad_norm": 0.10259576141834259, "learning_rate": 3.8337783903113146e-05, "loss": 0.4749, "num_input_tokens_seen": 42352896, "step": 34910 }, { "epoch": 3.888517652299811, "grad_norm": 0.11341889947652817, "learning_rate": 3.833367354120814e-05, "loss": 0.4644, "num_input_tokens_seen": 42359264, "step": 34915 }, { "epoch": 3.889074507183428, "grad_norm": 0.09266052395105362, "learning_rate": 3.8329562675514206e-05, "loss": 0.4606, "num_input_tokens_seen": 42365440, "step": 34920 }, { "epoch": 3.8896313620670453, "grad_norm": 0.0963483601808548, "learning_rate": 3.832545130618668e-05, "loss": 0.4582, "num_input_tokens_seen": 42371552, "step": 34925 }, { "epoch": 3.8901882169506625, "grad_norm": 0.10360237210988998, "learning_rate": 3.832133943338091e-05, "loss": 0.4638, "num_input_tokens_seen": 42377600, "step": 34930 }, { "epoch": 3.8907450718342798, "grad_norm": 0.14600153267383575, "learning_rate": 3.8317227057252245e-05, "loss": 0.4614, "num_input_tokens_seen": 42383552, "step": 34935 }, { "epoch": 3.8913019267178974, "grad_norm": 0.10925349593162537, "learning_rate": 3.8313114177956055e-05, "loss": 0.46, "num_input_tokens_seen": 42389952, "step": 34940 }, { "epoch": 3.8918587816015147, "grad_norm": 0.13839316368103027, "learning_rate": 3.8309000795647757e-05, "loss": 0.4675, "num_input_tokens_seen": 42396032, "step": 34945 }, { "epoch": 3.892415636485132, "grad_norm": 0.10873498767614365, "learning_rate": 3.830488691048275e-05, "loss": 0.4644, "num_input_tokens_seen": 42401792, "step": 34950 }, { "epoch": 3.8929724913687496, "grad_norm": 0.1857999563217163, "learning_rate": 3.830077252261648e-05, "loss": 0.4591, "num_input_tokens_seen": 42407872, "step": 34955 }, { "epoch": 3.893529346252367, "grad_norm": 0.11879609525203705, "learning_rate": 3.8296657632204404e-05, "loss": 0.4504, "num_input_tokens_seen": 42413792, "step": 34960 }, { "epoch": 3.894086201135984, "grad_norm": 0.1120540052652359, "learning_rate": 3.829254223940199e-05, "loss": 0.4651, "num_input_tokens_seen": 42420224, "step": 34965 }, { "epoch": 3.8946430560196013, "grad_norm": 0.11587128043174744, "learning_rate": 3.8288426344364737e-05, "loss": 0.4598, "num_input_tokens_seen": 42426208, "step": 34970 }, { "epoch": 3.8951999109032185, "grad_norm": 0.09826726466417313, "learning_rate": 3.828430994724814e-05, "loss": 0.4669, "num_input_tokens_seen": 42432032, "step": 34975 }, { "epoch": 3.895756765786836, "grad_norm": 0.09292122721672058, "learning_rate": 3.828019304820775e-05, "loss": 0.4622, "num_input_tokens_seen": 42437824, "step": 34980 }, { "epoch": 3.8963136206704534, "grad_norm": 0.09169285744428635, "learning_rate": 3.8276075647399116e-05, "loss": 0.462, "num_input_tokens_seen": 42443904, "step": 34985 }, { "epoch": 3.8968704755540706, "grad_norm": 0.10352794826030731, "learning_rate": 3.827195774497778e-05, "loss": 0.4664, "num_input_tokens_seen": 42449728, "step": 34990 }, { "epoch": 3.897427330437688, "grad_norm": 0.18998920917510986, "learning_rate": 3.826783934109936e-05, "loss": 0.4756, "num_input_tokens_seen": 42455392, "step": 34995 }, { "epoch": 3.897984185321305, "grad_norm": 0.13039231300354004, "learning_rate": 3.826372043591945e-05, "loss": 0.449, "num_input_tokens_seen": 42461184, "step": 35000 }, { "epoch": 3.8985410402049228, "grad_norm": 0.1196548342704773, "learning_rate": 3.825960102959367e-05, "loss": 0.4686, "num_input_tokens_seen": 42467424, "step": 35005 }, { "epoch": 3.89909789508854, "grad_norm": 0.09677138179540634, "learning_rate": 3.8255481122277684e-05, "loss": 0.476, "num_input_tokens_seen": 42473184, "step": 35010 }, { "epoch": 3.899654749972157, "grad_norm": 0.10949823260307312, "learning_rate": 3.825136071412713e-05, "loss": 0.4674, "num_input_tokens_seen": 42478560, "step": 35015 }, { "epoch": 3.9002116048557744, "grad_norm": 0.1283024251461029, "learning_rate": 3.82472398052977e-05, "loss": 0.4662, "num_input_tokens_seen": 42484896, "step": 35020 }, { "epoch": 3.9007684597393917, "grad_norm": 0.10991307348012924, "learning_rate": 3.82431183959451e-05, "loss": 0.4618, "num_input_tokens_seen": 42491104, "step": 35025 }, { "epoch": 3.9013253146230094, "grad_norm": 0.07264377921819687, "learning_rate": 3.8238996486225054e-05, "loss": 0.4658, "num_input_tokens_seen": 42497440, "step": 35030 }, { "epoch": 3.9018821695066266, "grad_norm": 0.10154854506254196, "learning_rate": 3.823487407629329e-05, "loss": 0.4705, "num_input_tokens_seen": 42502816, "step": 35035 }, { "epoch": 3.902439024390244, "grad_norm": 0.11930561065673828, "learning_rate": 3.823075116630557e-05, "loss": 0.4554, "num_input_tokens_seen": 42508928, "step": 35040 }, { "epoch": 3.9029958792738615, "grad_norm": 0.07405462861061096, "learning_rate": 3.8226627756417666e-05, "loss": 0.4607, "num_input_tokens_seen": 42515040, "step": 35045 }, { "epoch": 3.9035527341574787, "grad_norm": 0.10923406481742859, "learning_rate": 3.822250384678539e-05, "loss": 0.4804, "num_input_tokens_seen": 42521280, "step": 35050 }, { "epoch": 3.904109589041096, "grad_norm": 0.13547267019748688, "learning_rate": 3.821837943756453e-05, "loss": 0.4621, "num_input_tokens_seen": 42527648, "step": 35055 }, { "epoch": 3.904666443924713, "grad_norm": 0.10154229402542114, "learning_rate": 3.8214254528910945e-05, "loss": 0.4694, "num_input_tokens_seen": 42533536, "step": 35060 }, { "epoch": 3.9052232988083304, "grad_norm": 0.10107248276472092, "learning_rate": 3.821012912098048e-05, "loss": 0.4591, "num_input_tokens_seen": 42539648, "step": 35065 }, { "epoch": 3.905780153691948, "grad_norm": 0.07327066361904144, "learning_rate": 3.8206003213929e-05, "loss": 0.4636, "num_input_tokens_seen": 42545952, "step": 35070 }, { "epoch": 3.9063370085755653, "grad_norm": 0.129864901304245, "learning_rate": 3.820187680791239e-05, "loss": 0.4609, "num_input_tokens_seen": 42552160, "step": 35075 }, { "epoch": 3.9068938634591825, "grad_norm": 0.07846274226903915, "learning_rate": 3.8197749903086575e-05, "loss": 0.4627, "num_input_tokens_seen": 42558048, "step": 35080 }, { "epoch": 3.9074507183428, "grad_norm": 0.12785185873508453, "learning_rate": 3.8193622499607476e-05, "loss": 0.4632, "num_input_tokens_seen": 42564096, "step": 35085 }, { "epoch": 3.908007573226417, "grad_norm": 0.10719316452741623, "learning_rate": 3.818949459763103e-05, "loss": 0.4535, "num_input_tokens_seen": 42570080, "step": 35090 }, { "epoch": 3.9085644281100347, "grad_norm": 0.1188373938202858, "learning_rate": 3.818536619731321e-05, "loss": 0.4615, "num_input_tokens_seen": 42576448, "step": 35095 }, { "epoch": 3.909121282993652, "grad_norm": 0.12176605314016342, "learning_rate": 3.818123729881001e-05, "loss": 0.4575, "num_input_tokens_seen": 42582528, "step": 35100 }, { "epoch": 3.909678137877269, "grad_norm": 0.11163937300443649, "learning_rate": 3.8177107902277415e-05, "loss": 0.4781, "num_input_tokens_seen": 42588704, "step": 35105 }, { "epoch": 3.9102349927608864, "grad_norm": 0.12617269158363342, "learning_rate": 3.817297800787145e-05, "loss": 0.4609, "num_input_tokens_seen": 42595200, "step": 35110 }, { "epoch": 3.9107918476445036, "grad_norm": 0.117600217461586, "learning_rate": 3.816884761574817e-05, "loss": 0.4655, "num_input_tokens_seen": 42600864, "step": 35115 }, { "epoch": 3.9113487025281213, "grad_norm": 0.08353761583566666, "learning_rate": 3.816471672606363e-05, "loss": 0.4561, "num_input_tokens_seen": 42607136, "step": 35120 }, { "epoch": 3.9119055574117385, "grad_norm": 0.14967679977416992, "learning_rate": 3.81605853389739e-05, "loss": 0.4681, "num_input_tokens_seen": 42612384, "step": 35125 }, { "epoch": 3.9124624122953557, "grad_norm": 0.0956474095582962, "learning_rate": 3.815645345463508e-05, "loss": 0.454, "num_input_tokens_seen": 42618272, "step": 35130 }, { "epoch": 3.9130192671789734, "grad_norm": 0.1247096136212349, "learning_rate": 3.8152321073203286e-05, "loss": 0.4737, "num_input_tokens_seen": 42624320, "step": 35135 }, { "epoch": 3.9135761220625906, "grad_norm": 0.11802012473344803, "learning_rate": 3.814818819483464e-05, "loss": 0.4635, "num_input_tokens_seen": 42630560, "step": 35140 }, { "epoch": 3.914132976946208, "grad_norm": 0.1395626813173294, "learning_rate": 3.8144054819685323e-05, "loss": 0.4609, "num_input_tokens_seen": 42636576, "step": 35145 }, { "epoch": 3.914689831829825, "grad_norm": 0.12184018641710281, "learning_rate": 3.81399209479115e-05, "loss": 0.4716, "num_input_tokens_seen": 42642528, "step": 35150 }, { "epoch": 3.9152466867134423, "grad_norm": 0.13125163316726685, "learning_rate": 3.813578657966934e-05, "loss": 0.4668, "num_input_tokens_seen": 42648544, "step": 35155 }, { "epoch": 3.91580354159706, "grad_norm": 0.0856267511844635, "learning_rate": 3.813165171511508e-05, "loss": 0.4567, "num_input_tokens_seen": 42654720, "step": 35160 }, { "epoch": 3.9163603964806772, "grad_norm": 0.11386290937662125, "learning_rate": 3.812751635440492e-05, "loss": 0.4694, "num_input_tokens_seen": 42660576, "step": 35165 }, { "epoch": 3.9169172513642945, "grad_norm": 0.08678612858057022, "learning_rate": 3.812338049769514e-05, "loss": 0.4572, "num_input_tokens_seen": 42667040, "step": 35170 }, { "epoch": 3.9174741062479117, "grad_norm": 0.0935610681772232, "learning_rate": 3.8119244145141975e-05, "loss": 0.4654, "num_input_tokens_seen": 42672960, "step": 35175 }, { "epoch": 3.918030961131529, "grad_norm": 0.07587241381406784, "learning_rate": 3.8115107296901734e-05, "loss": 0.465, "num_input_tokens_seen": 42678912, "step": 35180 }, { "epoch": 3.9185878160151466, "grad_norm": 0.09013509005308151, "learning_rate": 3.8110969953130705e-05, "loss": 0.4528, "num_input_tokens_seen": 42685088, "step": 35185 }, { "epoch": 3.919144670898764, "grad_norm": 0.0906883180141449, "learning_rate": 3.810683211398522e-05, "loss": 0.4705, "num_input_tokens_seen": 42690496, "step": 35190 }, { "epoch": 3.919701525782381, "grad_norm": 0.11685866117477417, "learning_rate": 3.810269377962161e-05, "loss": 0.4785, "num_input_tokens_seen": 42696096, "step": 35195 }, { "epoch": 3.9202583806659983, "grad_norm": 0.10097495466470718, "learning_rate": 3.809855495019623e-05, "loss": 0.4682, "num_input_tokens_seen": 42702272, "step": 35200 }, { "epoch": 3.9208152355496155, "grad_norm": 0.09962581098079681, "learning_rate": 3.809441562586548e-05, "loss": 0.4649, "num_input_tokens_seen": 42707552, "step": 35205 }, { "epoch": 3.921372090433233, "grad_norm": 0.11084356904029846, "learning_rate": 3.8090275806785745e-05, "loss": 0.463, "num_input_tokens_seen": 42713600, "step": 35210 }, { "epoch": 3.9219289453168504, "grad_norm": 0.09180755913257599, "learning_rate": 3.808613549311344e-05, "loss": 0.4626, "num_input_tokens_seen": 42719936, "step": 35215 }, { "epoch": 3.9224858002004677, "grad_norm": 0.15289701521396637, "learning_rate": 3.808199468500499e-05, "loss": 0.4622, "num_input_tokens_seen": 42725536, "step": 35220 }, { "epoch": 3.9230426550840853, "grad_norm": 0.10533657670021057, "learning_rate": 3.8077853382616866e-05, "loss": 0.4569, "num_input_tokens_seen": 42731712, "step": 35225 }, { "epoch": 3.9235995099677026, "grad_norm": 0.12030309438705444, "learning_rate": 3.807371158610552e-05, "loss": 0.4579, "num_input_tokens_seen": 42737696, "step": 35230 }, { "epoch": 3.92415636485132, "grad_norm": 0.09150531888008118, "learning_rate": 3.806956929562747e-05, "loss": 0.4562, "num_input_tokens_seen": 42743648, "step": 35235 }, { "epoch": 3.924713219734937, "grad_norm": 0.14123189449310303, "learning_rate": 3.8065426511339195e-05, "loss": 0.4557, "num_input_tokens_seen": 42749792, "step": 35240 }, { "epoch": 3.9252700746185543, "grad_norm": 0.14837513864040375, "learning_rate": 3.8061283233397246e-05, "loss": 0.4616, "num_input_tokens_seen": 42756096, "step": 35245 }, { "epoch": 3.925826929502172, "grad_norm": 0.09246717393398285, "learning_rate": 3.8057139461958154e-05, "loss": 0.4668, "num_input_tokens_seen": 42762016, "step": 35250 }, { "epoch": 3.926383784385789, "grad_norm": 0.1135789155960083, "learning_rate": 3.80529951971785e-05, "loss": 0.4611, "num_input_tokens_seen": 42768576, "step": 35255 }, { "epoch": 3.9269406392694064, "grad_norm": 0.1717405468225479, "learning_rate": 3.8048850439214844e-05, "loss": 0.4619, "num_input_tokens_seen": 42774496, "step": 35260 }, { "epoch": 3.9274974941530236, "grad_norm": 0.10251982510089874, "learning_rate": 3.80447051882238e-05, "loss": 0.4618, "num_input_tokens_seen": 42780864, "step": 35265 }, { "epoch": 3.928054349036641, "grad_norm": 0.1090819463133812, "learning_rate": 3.804055944436199e-05, "loss": 0.4704, "num_input_tokens_seen": 42786944, "step": 35270 }, { "epoch": 3.9286112039202585, "grad_norm": 0.12222786247730255, "learning_rate": 3.803641320778606e-05, "loss": 0.4673, "num_input_tokens_seen": 42793024, "step": 35275 }, { "epoch": 3.9291680588038758, "grad_norm": 0.11459943652153015, "learning_rate": 3.8032266478652666e-05, "loss": 0.4621, "num_input_tokens_seen": 42799104, "step": 35280 }, { "epoch": 3.929724913687493, "grad_norm": 0.11831521987915039, "learning_rate": 3.802811925711848e-05, "loss": 0.4608, "num_input_tokens_seen": 42805120, "step": 35285 }, { "epoch": 3.9302817685711102, "grad_norm": 0.0974915623664856, "learning_rate": 3.8023971543340195e-05, "loss": 0.4642, "num_input_tokens_seen": 42811584, "step": 35290 }, { "epoch": 3.9308386234547275, "grad_norm": 0.08048097044229507, "learning_rate": 3.8019823337474516e-05, "loss": 0.46, "num_input_tokens_seen": 42817536, "step": 35295 }, { "epoch": 3.931395478338345, "grad_norm": 0.1052866280078888, "learning_rate": 3.80156746396782e-05, "loss": 0.4648, "num_input_tokens_seen": 42823456, "step": 35300 }, { "epoch": 3.9319523332219624, "grad_norm": 0.1061905100941658, "learning_rate": 3.8011525450107984e-05, "loss": 0.4683, "num_input_tokens_seen": 42829312, "step": 35305 }, { "epoch": 3.9325091881055796, "grad_norm": 0.09972364455461502, "learning_rate": 3.8007375768920636e-05, "loss": 0.4618, "num_input_tokens_seen": 42835648, "step": 35310 }, { "epoch": 3.9330660429891973, "grad_norm": 0.1105247363448143, "learning_rate": 3.8003225596272954e-05, "loss": 0.462, "num_input_tokens_seen": 42841984, "step": 35315 }, { "epoch": 3.9336228978728145, "grad_norm": 0.06561163812875748, "learning_rate": 3.7999074932321734e-05, "loss": 0.4681, "num_input_tokens_seen": 42847968, "step": 35320 }, { "epoch": 3.9341797527564317, "grad_norm": 0.09776237607002258, "learning_rate": 3.799492377722379e-05, "loss": 0.4494, "num_input_tokens_seen": 42854240, "step": 35325 }, { "epoch": 3.934736607640049, "grad_norm": 0.10733088105916977, "learning_rate": 3.7990772131136e-05, "loss": 0.4577, "num_input_tokens_seen": 42860224, "step": 35330 }, { "epoch": 3.935293462523666, "grad_norm": 0.11797575652599335, "learning_rate": 3.798661999421521e-05, "loss": 0.4517, "num_input_tokens_seen": 42866912, "step": 35335 }, { "epoch": 3.935850317407284, "grad_norm": 0.09073125571012497, "learning_rate": 3.798246736661829e-05, "loss": 0.4506, "num_input_tokens_seen": 42873024, "step": 35340 }, { "epoch": 3.936407172290901, "grad_norm": 0.10390615463256836, "learning_rate": 3.7978314248502154e-05, "loss": 0.4674, "num_input_tokens_seen": 42879360, "step": 35345 }, { "epoch": 3.9369640271745183, "grad_norm": 0.09544658660888672, "learning_rate": 3.797416064002371e-05, "loss": 0.4605, "num_input_tokens_seen": 42885568, "step": 35350 }, { "epoch": 3.9375208820581356, "grad_norm": 0.09444261342287064, "learning_rate": 3.7970006541339895e-05, "loss": 0.4729, "num_input_tokens_seen": 42892000, "step": 35355 }, { "epoch": 3.938077736941753, "grad_norm": 0.1241246908903122, "learning_rate": 3.796585195260768e-05, "loss": 0.468, "num_input_tokens_seen": 42897984, "step": 35360 }, { "epoch": 3.9386345918253705, "grad_norm": 0.08906251192092896, "learning_rate": 3.796169687398402e-05, "loss": 0.4618, "num_input_tokens_seen": 42904320, "step": 35365 }, { "epoch": 3.9391914467089877, "grad_norm": 0.08857657760381699, "learning_rate": 3.7957541305625916e-05, "loss": 0.4591, "num_input_tokens_seen": 42910304, "step": 35370 }, { "epoch": 3.939748301592605, "grad_norm": 0.1325608193874359, "learning_rate": 3.795338524769038e-05, "loss": 0.461, "num_input_tokens_seen": 42916032, "step": 35375 }, { "epoch": 3.940305156476222, "grad_norm": 0.12871958315372467, "learning_rate": 3.794922870033443e-05, "loss": 0.4568, "num_input_tokens_seen": 42922144, "step": 35380 }, { "epoch": 3.9408620113598394, "grad_norm": 0.08213019371032715, "learning_rate": 3.7945071663715134e-05, "loss": 0.4694, "num_input_tokens_seen": 42928320, "step": 35385 }, { "epoch": 3.941418866243457, "grad_norm": 0.12693056464195251, "learning_rate": 3.7940914137989534e-05, "loss": 0.462, "num_input_tokens_seen": 42934272, "step": 35390 }, { "epoch": 3.9419757211270743, "grad_norm": 0.1254236251115799, "learning_rate": 3.7936756123314735e-05, "loss": 0.4739, "num_input_tokens_seen": 42940640, "step": 35395 }, { "epoch": 3.9425325760106915, "grad_norm": 0.09670404344797134, "learning_rate": 3.793259761984783e-05, "loss": 0.474, "num_input_tokens_seen": 42946976, "step": 35400 }, { "epoch": 3.943089430894309, "grad_norm": 0.10926084965467453, "learning_rate": 3.792843862774594e-05, "loss": 0.4654, "num_input_tokens_seen": 42953344, "step": 35405 }, { "epoch": 3.9436462857779264, "grad_norm": 0.09600722044706345, "learning_rate": 3.792427914716621e-05, "loss": 0.4625, "num_input_tokens_seen": 42959840, "step": 35410 }, { "epoch": 3.9442031406615436, "grad_norm": 0.09738823026418686, "learning_rate": 3.792011917826579e-05, "loss": 0.4655, "num_input_tokens_seen": 42966144, "step": 35415 }, { "epoch": 3.944759995545161, "grad_norm": 0.10518564283847809, "learning_rate": 3.7915958721201865e-05, "loss": 0.4546, "num_input_tokens_seen": 42972288, "step": 35420 }, { "epoch": 3.945316850428778, "grad_norm": 0.10173063725233078, "learning_rate": 3.791179777613163e-05, "loss": 0.4633, "num_input_tokens_seen": 42978496, "step": 35425 }, { "epoch": 3.945873705312396, "grad_norm": 0.0868324488401413, "learning_rate": 3.7907636343212297e-05, "loss": 0.46, "num_input_tokens_seen": 42984864, "step": 35430 }, { "epoch": 3.946430560196013, "grad_norm": 0.10289482027292252, "learning_rate": 3.7903474422601105e-05, "loss": 0.4665, "num_input_tokens_seen": 42990560, "step": 35435 }, { "epoch": 3.9469874150796302, "grad_norm": 0.1307263821363449, "learning_rate": 3.789931201445529e-05, "loss": 0.4799, "num_input_tokens_seen": 42996608, "step": 35440 }, { "epoch": 3.9475442699632475, "grad_norm": 0.12305986881256104, "learning_rate": 3.789514911893213e-05, "loss": 0.4704, "num_input_tokens_seen": 43002784, "step": 35445 }, { "epoch": 3.9481011248468647, "grad_norm": 0.1254635900259018, "learning_rate": 3.7890985736188914e-05, "loss": 0.4617, "num_input_tokens_seen": 43008992, "step": 35450 }, { "epoch": 3.9486579797304824, "grad_norm": 0.09124037623405457, "learning_rate": 3.788682186638294e-05, "loss": 0.4596, "num_input_tokens_seen": 43015040, "step": 35455 }, { "epoch": 3.9492148346140996, "grad_norm": 0.1503419727087021, "learning_rate": 3.788265750967155e-05, "loss": 0.4649, "num_input_tokens_seen": 43021376, "step": 35460 }, { "epoch": 3.949771689497717, "grad_norm": 0.14576008915901184, "learning_rate": 3.7878492666212065e-05, "loss": 0.4656, "num_input_tokens_seen": 43027424, "step": 35465 }, { "epoch": 3.950328544381334, "grad_norm": 0.12672577798366547, "learning_rate": 3.7874327336161855e-05, "loss": 0.4625, "num_input_tokens_seen": 43033792, "step": 35470 }, { "epoch": 3.9508853992649513, "grad_norm": 0.10219569504261017, "learning_rate": 3.78701615196783e-05, "loss": 0.4673, "num_input_tokens_seen": 43039392, "step": 35475 }, { "epoch": 3.951442254148569, "grad_norm": 0.09643560647964478, "learning_rate": 3.78659952169188e-05, "loss": 0.4569, "num_input_tokens_seen": 43045568, "step": 35480 }, { "epoch": 3.951999109032186, "grad_norm": 0.11642487347126007, "learning_rate": 3.786182842804078e-05, "loss": 0.4724, "num_input_tokens_seen": 43051104, "step": 35485 }, { "epoch": 3.9525559639158034, "grad_norm": 0.10108227282762527, "learning_rate": 3.785766115320165e-05, "loss": 0.4668, "num_input_tokens_seen": 43057280, "step": 35490 }, { "epoch": 3.953112818799421, "grad_norm": 0.1356627345085144, "learning_rate": 3.785349339255889e-05, "loss": 0.4539, "num_input_tokens_seen": 43063296, "step": 35495 }, { "epoch": 3.9536696736830383, "grad_norm": 0.11988057941198349, "learning_rate": 3.784932514626995e-05, "loss": 0.4729, "num_input_tokens_seen": 43069312, "step": 35500 }, { "epoch": 3.9542265285666556, "grad_norm": 0.11013121902942657, "learning_rate": 3.7845156414492335e-05, "loss": 0.4499, "num_input_tokens_seen": 43074304, "step": 35505 }, { "epoch": 3.954783383450273, "grad_norm": 0.16769956052303314, "learning_rate": 3.7840987197383536e-05, "loss": 0.4652, "num_input_tokens_seen": 43080576, "step": 35510 }, { "epoch": 3.95534023833389, "grad_norm": 0.08543547242879868, "learning_rate": 3.7836817495101095e-05, "loss": 0.4655, "num_input_tokens_seen": 43086144, "step": 35515 }, { "epoch": 3.9558970932175077, "grad_norm": 0.13491934537887573, "learning_rate": 3.783264730780255e-05, "loss": 0.4637, "num_input_tokens_seen": 43092320, "step": 35520 }, { "epoch": 3.956453948101125, "grad_norm": 0.0957464799284935, "learning_rate": 3.7828476635645473e-05, "loss": 0.4562, "num_input_tokens_seen": 43098176, "step": 35525 }, { "epoch": 3.957010802984742, "grad_norm": 0.13178551197052002, "learning_rate": 3.782430547878742e-05, "loss": 0.4546, "num_input_tokens_seen": 43103904, "step": 35530 }, { "epoch": 3.9575676578683594, "grad_norm": 0.08466608822345734, "learning_rate": 3.782013383738603e-05, "loss": 0.4615, "num_input_tokens_seen": 43109984, "step": 35535 }, { "epoch": 3.9581245127519766, "grad_norm": 0.13607051968574524, "learning_rate": 3.781596171159889e-05, "loss": 0.4682, "num_input_tokens_seen": 43115936, "step": 35540 }, { "epoch": 3.9586813676355943, "grad_norm": 0.0878419354557991, "learning_rate": 3.781178910158364e-05, "loss": 0.4576, "num_input_tokens_seen": 43122112, "step": 35545 }, { "epoch": 3.9592382225192115, "grad_norm": 0.11031284928321838, "learning_rate": 3.7807616007497947e-05, "loss": 0.4753, "num_input_tokens_seen": 43127744, "step": 35550 }, { "epoch": 3.9597950774028288, "grad_norm": 0.07401703298091888, "learning_rate": 3.780344242949948e-05, "loss": 0.4607, "num_input_tokens_seen": 43134016, "step": 35555 }, { "epoch": 3.960351932286446, "grad_norm": 0.08939238637685776, "learning_rate": 3.779926836774592e-05, "loss": 0.4679, "num_input_tokens_seen": 43140256, "step": 35560 }, { "epoch": 3.9609087871700632, "grad_norm": 0.08168686926364899, "learning_rate": 3.779509382239499e-05, "loss": 0.4758, "num_input_tokens_seen": 43146336, "step": 35565 }, { "epoch": 3.961465642053681, "grad_norm": 0.1490313559770584, "learning_rate": 3.779091879360441e-05, "loss": 0.4587, "num_input_tokens_seen": 43152512, "step": 35570 }, { "epoch": 3.962022496937298, "grad_norm": 0.11542653292417526, "learning_rate": 3.7786743281531925e-05, "loss": 0.4566, "num_input_tokens_seen": 43158272, "step": 35575 }, { "epoch": 3.9625793518209154, "grad_norm": 0.11669768393039703, "learning_rate": 3.77825672863353e-05, "loss": 0.4739, "num_input_tokens_seen": 43164512, "step": 35580 }, { "epoch": 3.963136206704533, "grad_norm": 0.07597719877958298, "learning_rate": 3.7778390808172326e-05, "loss": 0.4756, "num_input_tokens_seen": 43170336, "step": 35585 }, { "epoch": 3.9636930615881503, "grad_norm": 0.13568316400051117, "learning_rate": 3.777421384720079e-05, "loss": 0.4637, "num_input_tokens_seen": 43176704, "step": 35590 }, { "epoch": 3.9642499164717675, "grad_norm": 0.09585265070199966, "learning_rate": 3.777003640357852e-05, "loss": 0.4684, "num_input_tokens_seen": 43182304, "step": 35595 }, { "epoch": 3.9648067713553847, "grad_norm": 0.0995209589600563, "learning_rate": 3.7765858477463354e-05, "loss": 0.4654, "num_input_tokens_seen": 43188640, "step": 35600 }, { "epoch": 3.965363626239002, "grad_norm": 0.11628029495477676, "learning_rate": 3.776168006901315e-05, "loss": 0.47, "num_input_tokens_seen": 43195136, "step": 35605 }, { "epoch": 3.9659204811226196, "grad_norm": 0.07689862698316574, "learning_rate": 3.775750117838577e-05, "loss": 0.466, "num_input_tokens_seen": 43200992, "step": 35610 }, { "epoch": 3.966477336006237, "grad_norm": 0.10807999223470688, "learning_rate": 3.775332180573911e-05, "loss": 0.4703, "num_input_tokens_seen": 43207104, "step": 35615 }, { "epoch": 3.967034190889854, "grad_norm": 0.08301306515932083, "learning_rate": 3.7749141951231084e-05, "loss": 0.4566, "num_input_tokens_seen": 43213376, "step": 35620 }, { "epoch": 3.9675910457734713, "grad_norm": 0.10046228766441345, "learning_rate": 3.774496161501962e-05, "loss": 0.4563, "num_input_tokens_seen": 43219232, "step": 35625 }, { "epoch": 3.9681479006570886, "grad_norm": 0.0864405557513237, "learning_rate": 3.7740780797262655e-05, "loss": 0.4589, "num_input_tokens_seen": 43225504, "step": 35630 }, { "epoch": 3.9687047555407062, "grad_norm": 0.10857296735048294, "learning_rate": 3.7736599498118166e-05, "loss": 0.4667, "num_input_tokens_seen": 43231584, "step": 35635 }, { "epoch": 3.9692616104243235, "grad_norm": 0.07989611476659775, "learning_rate": 3.7732417717744126e-05, "loss": 0.4636, "num_input_tokens_seen": 43237568, "step": 35640 }, { "epoch": 3.9698184653079407, "grad_norm": 0.12694188952445984, "learning_rate": 3.772823545629856e-05, "loss": 0.4757, "num_input_tokens_seen": 43243936, "step": 35645 }, { "epoch": 3.970375320191558, "grad_norm": 0.09618092328310013, "learning_rate": 3.772405271393945e-05, "loss": 0.4695, "num_input_tokens_seen": 43249856, "step": 35650 }, { "epoch": 3.970932175075175, "grad_norm": 0.09039344638586044, "learning_rate": 3.771986949082486e-05, "loss": 0.4489, "num_input_tokens_seen": 43256000, "step": 35655 }, { "epoch": 3.971489029958793, "grad_norm": 0.10009272396564484, "learning_rate": 3.771568578711284e-05, "loss": 0.4531, "num_input_tokens_seen": 43261888, "step": 35660 }, { "epoch": 3.97204588484241, "grad_norm": 0.10369637608528137, "learning_rate": 3.771150160296145e-05, "loss": 0.4536, "num_input_tokens_seen": 43267872, "step": 35665 }, { "epoch": 3.9726027397260273, "grad_norm": 0.11422377824783325, "learning_rate": 3.7707316938528804e-05, "loss": 0.454, "num_input_tokens_seen": 43274144, "step": 35670 }, { "epoch": 3.973159594609645, "grad_norm": 0.09230412542819977, "learning_rate": 3.7703131793973e-05, "loss": 0.4664, "num_input_tokens_seen": 43280256, "step": 35675 }, { "epoch": 3.973716449493262, "grad_norm": 0.17574812471866608, "learning_rate": 3.7698946169452165e-05, "loss": 0.4772, "num_input_tokens_seen": 43286816, "step": 35680 }, { "epoch": 3.9742733043768794, "grad_norm": 0.12950997054576874, "learning_rate": 3.7694760065124454e-05, "loss": 0.4524, "num_input_tokens_seen": 43293056, "step": 35685 }, { "epoch": 3.9748301592604967, "grad_norm": 0.13577164709568024, "learning_rate": 3.769057348114802e-05, "loss": 0.4618, "num_input_tokens_seen": 43298752, "step": 35690 }, { "epoch": 3.975387014144114, "grad_norm": 0.1106361597776413, "learning_rate": 3.7686386417681054e-05, "loss": 0.4571, "num_input_tokens_seen": 43304640, "step": 35695 }, { "epoch": 3.9759438690277316, "grad_norm": 0.09747522324323654, "learning_rate": 3.768219887488175e-05, "loss": 0.4655, "num_input_tokens_seen": 43310496, "step": 35700 }, { "epoch": 3.976500723911349, "grad_norm": 0.13223421573638916, "learning_rate": 3.767801085290833e-05, "loss": 0.473, "num_input_tokens_seen": 43316704, "step": 35705 }, { "epoch": 3.977057578794966, "grad_norm": 0.08582507818937302, "learning_rate": 3.767382235191904e-05, "loss": 0.4587, "num_input_tokens_seen": 43322656, "step": 35710 }, { "epoch": 3.9776144336785833, "grad_norm": 0.08123794943094254, "learning_rate": 3.7669633372072124e-05, "loss": 0.4576, "num_input_tokens_seen": 43329088, "step": 35715 }, { "epoch": 3.9781712885622005, "grad_norm": 0.09469913691282272, "learning_rate": 3.7665443913525845e-05, "loss": 0.4579, "num_input_tokens_seen": 43335264, "step": 35720 }, { "epoch": 3.978728143445818, "grad_norm": 0.10978153347969055, "learning_rate": 3.766125397643852e-05, "loss": 0.4693, "num_input_tokens_seen": 43341280, "step": 35725 }, { "epoch": 3.9792849983294354, "grad_norm": 0.07770579308271408, "learning_rate": 3.765706356096843e-05, "loss": 0.458, "num_input_tokens_seen": 43347072, "step": 35730 }, { "epoch": 3.9798418532130526, "grad_norm": 0.08983194828033447, "learning_rate": 3.765287266727393e-05, "loss": 0.4734, "num_input_tokens_seen": 43353024, "step": 35735 }, { "epoch": 3.98039870809667, "grad_norm": 0.09629564732313156, "learning_rate": 3.7648681295513355e-05, "loss": 0.4552, "num_input_tokens_seen": 43359136, "step": 35740 }, { "epoch": 3.980955562980287, "grad_norm": 0.12405717372894287, "learning_rate": 3.764448944584505e-05, "loss": 0.456, "num_input_tokens_seen": 43365376, "step": 35745 }, { "epoch": 3.9815124178639048, "grad_norm": 0.14673975110054016, "learning_rate": 3.764029711842743e-05, "loss": 0.4777, "num_input_tokens_seen": 43371104, "step": 35750 }, { "epoch": 3.982069272747522, "grad_norm": 0.09272906929254532, "learning_rate": 3.7636104313418875e-05, "loss": 0.4581, "num_input_tokens_seen": 43377344, "step": 35755 }, { "epoch": 3.982626127631139, "grad_norm": 0.08484739065170288, "learning_rate": 3.7631911030977795e-05, "loss": 0.4796, "num_input_tokens_seen": 43383488, "step": 35760 }, { "epoch": 3.983182982514757, "grad_norm": 0.1254369169473648, "learning_rate": 3.762771727126264e-05, "loss": 0.4432, "num_input_tokens_seen": 43389664, "step": 35765 }, { "epoch": 3.983739837398374, "grad_norm": 0.07626020908355713, "learning_rate": 3.7623523034431855e-05, "loss": 0.4702, "num_input_tokens_seen": 43396000, "step": 35770 }, { "epoch": 3.9842966922819913, "grad_norm": 0.10313486307859421, "learning_rate": 3.761932832064392e-05, "loss": 0.4589, "num_input_tokens_seen": 43402304, "step": 35775 }, { "epoch": 3.9848535471656086, "grad_norm": 0.144108384847641, "learning_rate": 3.7615133130057324e-05, "loss": 0.4494, "num_input_tokens_seen": 43408512, "step": 35780 }, { "epoch": 3.985410402049226, "grad_norm": 0.09829989075660706, "learning_rate": 3.761093746283056e-05, "loss": 0.456, "num_input_tokens_seen": 43414464, "step": 35785 }, { "epoch": 3.9859672569328435, "grad_norm": 0.12625084817409515, "learning_rate": 3.760674131912218e-05, "loss": 0.4668, "num_input_tokens_seen": 43420704, "step": 35790 }, { "epoch": 3.9865241118164607, "grad_norm": 0.10408997535705566, "learning_rate": 3.76025446990907e-05, "loss": 0.4684, "num_input_tokens_seen": 43427008, "step": 35795 }, { "epoch": 3.987080966700078, "grad_norm": 0.0939985066652298, "learning_rate": 3.75983476028947e-05, "loss": 0.4531, "num_input_tokens_seen": 43432896, "step": 35800 }, { "epoch": 3.987637821583695, "grad_norm": 0.1244378313422203, "learning_rate": 3.759415003069276e-05, "loss": 0.4577, "num_input_tokens_seen": 43439104, "step": 35805 }, { "epoch": 3.9881946764673124, "grad_norm": 0.10410952568054199, "learning_rate": 3.758995198264346e-05, "loss": 0.4553, "num_input_tokens_seen": 43445280, "step": 35810 }, { "epoch": 3.98875153135093, "grad_norm": 0.10086297243833542, "learning_rate": 3.7585753458905436e-05, "loss": 0.4586, "num_input_tokens_seen": 43451488, "step": 35815 }, { "epoch": 3.9893083862345473, "grad_norm": 0.12685027718544006, "learning_rate": 3.75815544596373e-05, "loss": 0.4687, "num_input_tokens_seen": 43457440, "step": 35820 }, { "epoch": 3.9898652411181645, "grad_norm": 0.09128974378108978, "learning_rate": 3.757735498499774e-05, "loss": 0.4558, "num_input_tokens_seen": 43463584, "step": 35825 }, { "epoch": 3.9904220960017818, "grad_norm": 0.1271117925643921, "learning_rate": 3.757315503514539e-05, "loss": 0.4715, "num_input_tokens_seen": 43469600, "step": 35830 }, { "epoch": 3.990978950885399, "grad_norm": 0.10258506238460541, "learning_rate": 3.756895461023895e-05, "loss": 0.4755, "num_input_tokens_seen": 43475904, "step": 35835 }, { "epoch": 3.9915358057690167, "grad_norm": 0.10740512609481812, "learning_rate": 3.7564753710437115e-05, "loss": 0.4718, "num_input_tokens_seen": 43482016, "step": 35840 }, { "epoch": 3.992092660652634, "grad_norm": 0.12230661511421204, "learning_rate": 3.7560552335898636e-05, "loss": 0.46, "num_input_tokens_seen": 43488032, "step": 35845 }, { "epoch": 3.992649515536251, "grad_norm": 0.07506586611270905, "learning_rate": 3.755635048678223e-05, "loss": 0.4632, "num_input_tokens_seen": 43493952, "step": 35850 }, { "epoch": 3.993206370419869, "grad_norm": 0.116755910217762, "learning_rate": 3.755214816324666e-05, "loss": 0.4551, "num_input_tokens_seen": 43500064, "step": 35855 }, { "epoch": 3.993763225303486, "grad_norm": 0.09894448518753052, "learning_rate": 3.7547945365450715e-05, "loss": 0.4634, "num_input_tokens_seen": 43506368, "step": 35860 }, { "epoch": 3.9943200801871033, "grad_norm": 0.12613441050052643, "learning_rate": 3.754374209355318e-05, "loss": 0.4649, "num_input_tokens_seen": 43512608, "step": 35865 }, { "epoch": 3.9948769350707205, "grad_norm": 0.09069371223449707, "learning_rate": 3.7539538347712865e-05, "loss": 0.4577, "num_input_tokens_seen": 43518784, "step": 35870 }, { "epoch": 3.9954337899543377, "grad_norm": 0.0906403586268425, "learning_rate": 3.753533412808862e-05, "loss": 0.467, "num_input_tokens_seen": 43525120, "step": 35875 }, { "epoch": 3.9959906448379554, "grad_norm": 0.10717989504337311, "learning_rate": 3.753112943483926e-05, "loss": 0.4568, "num_input_tokens_seen": 43531136, "step": 35880 }, { "epoch": 3.9965474997215726, "grad_norm": 0.1009940430521965, "learning_rate": 3.7526924268123686e-05, "loss": 0.4555, "num_input_tokens_seen": 43537376, "step": 35885 }, { "epoch": 3.99710435460519, "grad_norm": 0.1490837037563324, "learning_rate": 3.752271862810076e-05, "loss": 0.4669, "num_input_tokens_seen": 43543648, "step": 35890 }, { "epoch": 3.997661209488807, "grad_norm": 0.12276583909988403, "learning_rate": 3.7518512514929404e-05, "loss": 0.4672, "num_input_tokens_seen": 43550080, "step": 35895 }, { "epoch": 3.9982180643724243, "grad_norm": 0.09016367048025131, "learning_rate": 3.7514305928768524e-05, "loss": 0.4593, "num_input_tokens_seen": 43556384, "step": 35900 }, { "epoch": 3.998774919256042, "grad_norm": 0.08494854718446732, "learning_rate": 3.751009886977706e-05, "loss": 0.4601, "num_input_tokens_seen": 43562496, "step": 35905 }, { "epoch": 3.9993317741396592, "grad_norm": 0.09179025143384933, "learning_rate": 3.7505891338113975e-05, "loss": 0.4659, "num_input_tokens_seen": 43567904, "step": 35910 }, { "epoch": 3.9998886290232765, "grad_norm": 0.07951304316520691, "learning_rate": 3.750168333393823e-05, "loss": 0.4673, "num_input_tokens_seen": 43574208, "step": 35915 }, { "epoch": 4.000445483906894, "grad_norm": 0.1252915859222412, "learning_rate": 3.749747485740882e-05, "loss": 0.4581, "num_input_tokens_seen": 43579472, "step": 35920 }, { "epoch": 4.000445483906894, "eval_loss": 0.4643072783946991, "eval_runtime": 113.0708, "eval_samples_per_second": 35.296, "eval_steps_per_second": 8.826, "num_input_tokens_seen": 43579472, "step": 35920 }, { "epoch": 4.001002338790511, "grad_norm": 0.06937497854232788, "learning_rate": 3.749326590868477e-05, "loss": 0.4611, "num_input_tokens_seen": 43585840, "step": 35925 }, { "epoch": 4.001559193674129, "grad_norm": 0.09793446213006973, "learning_rate": 3.74890564879251e-05, "loss": 0.4563, "num_input_tokens_seen": 43591920, "step": 35930 }, { "epoch": 4.002116048557746, "grad_norm": 0.10660343617200851, "learning_rate": 3.748484659528885e-05, "loss": 0.4596, "num_input_tokens_seen": 43598224, "step": 35935 }, { "epoch": 4.002672903441363, "grad_norm": 0.11717631667852402, "learning_rate": 3.748063623093508e-05, "loss": 0.4564, "num_input_tokens_seen": 43604400, "step": 35940 }, { "epoch": 4.003229758324981, "grad_norm": 0.08020148426294327, "learning_rate": 3.747642539502288e-05, "loss": 0.4593, "num_input_tokens_seen": 43610160, "step": 35945 }, { "epoch": 4.0037866132085975, "grad_norm": 0.11549731343984604, "learning_rate": 3.747221408771135e-05, "loss": 0.4541, "num_input_tokens_seen": 43615920, "step": 35950 }, { "epoch": 4.004343468092215, "grad_norm": 0.11503349989652634, "learning_rate": 3.74680023091596e-05, "loss": 0.4631, "num_input_tokens_seen": 43622096, "step": 35955 }, { "epoch": 4.004900322975833, "grad_norm": 0.10136133432388306, "learning_rate": 3.746379005952676e-05, "loss": 0.455, "num_input_tokens_seen": 43628080, "step": 35960 }, { "epoch": 4.00545717785945, "grad_norm": 0.10812864452600479, "learning_rate": 3.745957733897201e-05, "loss": 0.4534, "num_input_tokens_seen": 43634416, "step": 35965 }, { "epoch": 4.006014032743067, "grad_norm": 0.09562531858682632, "learning_rate": 3.745536414765448e-05, "loss": 0.4559, "num_input_tokens_seen": 43640144, "step": 35970 }, { "epoch": 4.006570887626684, "grad_norm": 0.11111101508140564, "learning_rate": 3.745115048573338e-05, "loss": 0.4686, "num_input_tokens_seen": 43646480, "step": 35975 }, { "epoch": 4.007127742510302, "grad_norm": 0.11558090150356293, "learning_rate": 3.7446936353367916e-05, "loss": 0.4654, "num_input_tokens_seen": 43652656, "step": 35980 }, { "epoch": 4.0076845973939195, "grad_norm": 0.14444796741008759, "learning_rate": 3.744272175071731e-05, "loss": 0.4629, "num_input_tokens_seen": 43659056, "step": 35985 }, { "epoch": 4.008241452277536, "grad_norm": 0.15698115527629852, "learning_rate": 3.74385066779408e-05, "loss": 0.4541, "num_input_tokens_seen": 43665360, "step": 35990 }, { "epoch": 4.008798307161154, "grad_norm": 0.0863720253109932, "learning_rate": 3.743429113519765e-05, "loss": 0.4611, "num_input_tokens_seen": 43671472, "step": 35995 }, { "epoch": 4.009355162044771, "grad_norm": 0.06721595674753189, "learning_rate": 3.743007512264713e-05, "loss": 0.4574, "num_input_tokens_seen": 43677456, "step": 36000 }, { "epoch": 4.009912016928388, "grad_norm": 0.0803740844130516, "learning_rate": 3.742585864044854e-05, "loss": 0.4549, "num_input_tokens_seen": 43683600, "step": 36005 }, { "epoch": 4.010468871812006, "grad_norm": 0.10149844735860825, "learning_rate": 3.742164168876118e-05, "loss": 0.451, "num_input_tokens_seen": 43689840, "step": 36010 }, { "epoch": 4.011025726695623, "grad_norm": 0.14075547456741333, "learning_rate": 3.741742426774441e-05, "loss": 0.4722, "num_input_tokens_seen": 43696144, "step": 36015 }, { "epoch": 4.0115825815792405, "grad_norm": 0.1067495122551918, "learning_rate": 3.741320637755754e-05, "loss": 0.4643, "num_input_tokens_seen": 43702480, "step": 36020 }, { "epoch": 4.012139436462858, "grad_norm": 0.1002400815486908, "learning_rate": 3.7408988018359955e-05, "loss": 0.4663, "num_input_tokens_seen": 43708720, "step": 36025 }, { "epoch": 4.012696291346475, "grad_norm": 0.13268835842609406, "learning_rate": 3.740476919031104e-05, "loss": 0.4626, "num_input_tokens_seen": 43714864, "step": 36030 }, { "epoch": 4.013253146230093, "grad_norm": 0.12398221343755722, "learning_rate": 3.7400549893570194e-05, "loss": 0.4567, "num_input_tokens_seen": 43720912, "step": 36035 }, { "epoch": 4.0138100011137094, "grad_norm": 0.10654827207326889, "learning_rate": 3.739633012829682e-05, "loss": 0.455, "num_input_tokens_seen": 43727056, "step": 36040 }, { "epoch": 4.014366855997327, "grad_norm": 0.08831699937582016, "learning_rate": 3.7392109894650384e-05, "loss": 0.4763, "num_input_tokens_seen": 43733296, "step": 36045 }, { "epoch": 4.014923710880945, "grad_norm": 0.1217576414346695, "learning_rate": 3.738788919279032e-05, "loss": 0.4569, "num_input_tokens_seen": 43738736, "step": 36050 }, { "epoch": 4.015480565764562, "grad_norm": 0.09716125577688217, "learning_rate": 3.73836680228761e-05, "loss": 0.4657, "num_input_tokens_seen": 43744560, "step": 36055 }, { "epoch": 4.016037420648179, "grad_norm": 0.1139155775308609, "learning_rate": 3.737944638506722e-05, "loss": 0.4744, "num_input_tokens_seen": 43750352, "step": 36060 }, { "epoch": 4.016594275531796, "grad_norm": 0.09706013649702072, "learning_rate": 3.737522427952318e-05, "loss": 0.4508, "num_input_tokens_seen": 43756368, "step": 36065 }, { "epoch": 4.017151130415414, "grad_norm": 0.07992339879274368, "learning_rate": 3.737100170640351e-05, "loss": 0.4748, "num_input_tokens_seen": 43762416, "step": 36070 }, { "epoch": 4.017707985299031, "grad_norm": 0.0980459675192833, "learning_rate": 3.7366778665867754e-05, "loss": 0.4548, "num_input_tokens_seen": 43768656, "step": 36075 }, { "epoch": 4.018264840182648, "grad_norm": 0.09872222691774368, "learning_rate": 3.736255515807546e-05, "loss": 0.4605, "num_input_tokens_seen": 43774832, "step": 36080 }, { "epoch": 4.018821695066266, "grad_norm": 0.10818622261285782, "learning_rate": 3.7358331183186226e-05, "loss": 0.47, "num_input_tokens_seen": 43780880, "step": 36085 }, { "epoch": 4.019378549949883, "grad_norm": 0.11424935609102249, "learning_rate": 3.735410674135963e-05, "loss": 0.4623, "num_input_tokens_seen": 43786608, "step": 36090 }, { "epoch": 4.0199354048335, "grad_norm": 0.10794126242399216, "learning_rate": 3.734988183275529e-05, "loss": 0.4737, "num_input_tokens_seen": 43793008, "step": 36095 }, { "epoch": 4.020492259717118, "grad_norm": 0.09105024486780167, "learning_rate": 3.734565645753284e-05, "loss": 0.4577, "num_input_tokens_seen": 43799312, "step": 36100 }, { "epoch": 4.021049114600735, "grad_norm": 0.12308032810688019, "learning_rate": 3.7341430615851916e-05, "loss": 0.4673, "num_input_tokens_seen": 43805328, "step": 36105 }, { "epoch": 4.0216059694843524, "grad_norm": 0.09210481494665146, "learning_rate": 3.73372043078722e-05, "loss": 0.46, "num_input_tokens_seen": 43811504, "step": 36110 }, { "epoch": 4.02216282436797, "grad_norm": 0.12206190824508667, "learning_rate": 3.7332977533753374e-05, "loss": 0.4537, "num_input_tokens_seen": 43817424, "step": 36115 }, { "epoch": 4.022719679251587, "grad_norm": 0.1363523304462433, "learning_rate": 3.7328750293655126e-05, "loss": 0.467, "num_input_tokens_seen": 43823568, "step": 36120 }, { "epoch": 4.023276534135205, "grad_norm": 0.15265794098377228, "learning_rate": 3.732452258773719e-05, "loss": 0.4551, "num_input_tokens_seen": 43829776, "step": 36125 }, { "epoch": 4.023833389018821, "grad_norm": 0.09150759130716324, "learning_rate": 3.732029441615929e-05, "loss": 0.4619, "num_input_tokens_seen": 43836048, "step": 36130 }, { "epoch": 4.024390243902439, "grad_norm": 0.140463724732399, "learning_rate": 3.731606577908118e-05, "loss": 0.4599, "num_input_tokens_seen": 43841744, "step": 36135 }, { "epoch": 4.024947098786057, "grad_norm": 0.12704892456531525, "learning_rate": 3.731183667666264e-05, "loss": 0.4656, "num_input_tokens_seen": 43846800, "step": 36140 }, { "epoch": 4.0255039536696735, "grad_norm": 0.0962647944688797, "learning_rate": 3.7307607109063455e-05, "loss": 0.4499, "num_input_tokens_seen": 43852720, "step": 36145 }, { "epoch": 4.026060808553291, "grad_norm": 0.09229335933923721, "learning_rate": 3.730337707644343e-05, "loss": 0.4646, "num_input_tokens_seen": 43858768, "step": 36150 }, { "epoch": 4.026617663436908, "grad_norm": 0.10241895914077759, "learning_rate": 3.72991465789624e-05, "loss": 0.4676, "num_input_tokens_seen": 43864976, "step": 36155 }, { "epoch": 4.027174518320526, "grad_norm": 0.13094404339790344, "learning_rate": 3.729491561678019e-05, "loss": 0.4588, "num_input_tokens_seen": 43870864, "step": 36160 }, { "epoch": 4.027731373204143, "grad_norm": 0.06803694367408752, "learning_rate": 3.7290684190056665e-05, "loss": 0.4575, "num_input_tokens_seen": 43876912, "step": 36165 }, { "epoch": 4.02828822808776, "grad_norm": 0.09406231343746185, "learning_rate": 3.7286452298951705e-05, "loss": 0.4689, "num_input_tokens_seen": 43882992, "step": 36170 }, { "epoch": 4.028845082971378, "grad_norm": 0.09487316012382507, "learning_rate": 3.728221994362521e-05, "loss": 0.4534, "num_input_tokens_seen": 43889200, "step": 36175 }, { "epoch": 4.029401937854995, "grad_norm": 0.09587007015943527, "learning_rate": 3.727798712423708e-05, "loss": 0.4682, "num_input_tokens_seen": 43895856, "step": 36180 }, { "epoch": 4.029958792738612, "grad_norm": 0.11386636644601822, "learning_rate": 3.727375384094725e-05, "loss": 0.4755, "num_input_tokens_seen": 43901520, "step": 36185 }, { "epoch": 4.03051564762223, "grad_norm": 0.10389489680528641, "learning_rate": 3.726952009391567e-05, "loss": 0.4682, "num_input_tokens_seen": 43907408, "step": 36190 }, { "epoch": 4.031072502505847, "grad_norm": 0.0990915596485138, "learning_rate": 3.72652858833023e-05, "loss": 0.4573, "num_input_tokens_seen": 43913680, "step": 36195 }, { "epoch": 4.031629357389464, "grad_norm": 0.0852992981672287, "learning_rate": 3.726105120926712e-05, "loss": 0.4599, "num_input_tokens_seen": 43919856, "step": 36200 }, { "epoch": 4.032186212273082, "grad_norm": 0.08717383444309235, "learning_rate": 3.725681607197013e-05, "loss": 0.4595, "num_input_tokens_seen": 43925360, "step": 36205 }, { "epoch": 4.032743067156699, "grad_norm": 0.12114264070987701, "learning_rate": 3.725258047157135e-05, "loss": 0.4447, "num_input_tokens_seen": 43931440, "step": 36210 }, { "epoch": 4.0332999220403165, "grad_norm": 0.0929950550198555, "learning_rate": 3.724834440823083e-05, "loss": 0.4627, "num_input_tokens_seen": 43937328, "step": 36215 }, { "epoch": 4.033856776923933, "grad_norm": 0.10095106065273285, "learning_rate": 3.724410788210859e-05, "loss": 0.4683, "num_input_tokens_seen": 43943472, "step": 36220 }, { "epoch": 4.034413631807551, "grad_norm": 0.11903716623783112, "learning_rate": 3.723987089336473e-05, "loss": 0.4632, "num_input_tokens_seen": 43949776, "step": 36225 }, { "epoch": 4.034970486691169, "grad_norm": 0.12126396596431732, "learning_rate": 3.7235633442159306e-05, "loss": 0.4632, "num_input_tokens_seen": 43955792, "step": 36230 }, { "epoch": 4.035527341574785, "grad_norm": 0.09880276024341583, "learning_rate": 3.723139552865245e-05, "loss": 0.4515, "num_input_tokens_seen": 43961680, "step": 36235 }, { "epoch": 4.036084196458403, "grad_norm": 0.10454064607620239, "learning_rate": 3.7227157153004275e-05, "loss": 0.4612, "num_input_tokens_seen": 43967536, "step": 36240 }, { "epoch": 4.03664105134202, "grad_norm": 0.1167868822813034, "learning_rate": 3.722291831537491e-05, "loss": 0.4507, "num_input_tokens_seen": 43973712, "step": 36245 }, { "epoch": 4.037197906225638, "grad_norm": 0.15516731142997742, "learning_rate": 3.721867901592453e-05, "loss": 0.4785, "num_input_tokens_seen": 43980016, "step": 36250 }, { "epoch": 4.037754761109255, "grad_norm": 0.11450015753507614, "learning_rate": 3.7214439254813295e-05, "loss": 0.4576, "num_input_tokens_seen": 43985520, "step": 36255 }, { "epoch": 4.038311615992872, "grad_norm": 0.11341753602027893, "learning_rate": 3.7210199032201396e-05, "loss": 0.4495, "num_input_tokens_seen": 43991696, "step": 36260 }, { "epoch": 4.03886847087649, "grad_norm": 0.0916970893740654, "learning_rate": 3.720595834824906e-05, "loss": 0.461, "num_input_tokens_seen": 43997680, "step": 36265 }, { "epoch": 4.0394253257601065, "grad_norm": 0.0965106412768364, "learning_rate": 3.72017172031165e-05, "loss": 0.4791, "num_input_tokens_seen": 44004048, "step": 36270 }, { "epoch": 4.039982180643724, "grad_norm": 0.09838437288999557, "learning_rate": 3.719747559696396e-05, "loss": 0.4736, "num_input_tokens_seen": 44009840, "step": 36275 }, { "epoch": 4.040539035527342, "grad_norm": 0.11265990138053894, "learning_rate": 3.71932335299517e-05, "loss": 0.468, "num_input_tokens_seen": 44015728, "step": 36280 }, { "epoch": 4.041095890410959, "grad_norm": 0.06364483386278152, "learning_rate": 3.7188991002240004e-05, "loss": 0.4739, "num_input_tokens_seen": 44020880, "step": 36285 }, { "epoch": 4.041652745294576, "grad_norm": 0.11862891912460327, "learning_rate": 3.7184748013989164e-05, "loss": 0.4605, "num_input_tokens_seen": 44026672, "step": 36290 }, { "epoch": 4.042209600178194, "grad_norm": 0.14018473029136658, "learning_rate": 3.71805045653595e-05, "loss": 0.4489, "num_input_tokens_seen": 44032624, "step": 36295 }, { "epoch": 4.042766455061811, "grad_norm": 0.10119197517633438, "learning_rate": 3.717626065651133e-05, "loss": 0.461, "num_input_tokens_seen": 44038768, "step": 36300 }, { "epoch": 4.043323309945428, "grad_norm": 0.13194051384925842, "learning_rate": 3.717201628760503e-05, "loss": 0.4574, "num_input_tokens_seen": 44044944, "step": 36305 }, { "epoch": 4.043880164829045, "grad_norm": 0.06736832857131958, "learning_rate": 3.716777145880094e-05, "loss": 0.4582, "num_input_tokens_seen": 44051056, "step": 36310 }, { "epoch": 4.044437019712663, "grad_norm": 0.11406629532575607, "learning_rate": 3.7163526170259445e-05, "loss": 0.4623, "num_input_tokens_seen": 44056880, "step": 36315 }, { "epoch": 4.044993874596281, "grad_norm": 0.1376303732395172, "learning_rate": 3.715928042214095e-05, "loss": 0.4676, "num_input_tokens_seen": 44063440, "step": 36320 }, { "epoch": 4.045550729479897, "grad_norm": 0.14699837565422058, "learning_rate": 3.715503421460588e-05, "loss": 0.4621, "num_input_tokens_seen": 44069648, "step": 36325 }, { "epoch": 4.046107584363515, "grad_norm": 0.13371308147907257, "learning_rate": 3.715078754781466e-05, "loss": 0.4689, "num_input_tokens_seen": 44075696, "step": 36330 }, { "epoch": 4.046664439247132, "grad_norm": 0.11799634993076324, "learning_rate": 3.714654042192775e-05, "loss": 0.455, "num_input_tokens_seen": 44081968, "step": 36335 }, { "epoch": 4.0472212941307495, "grad_norm": 0.15021154284477234, "learning_rate": 3.7142292837105616e-05, "loss": 0.4771, "num_input_tokens_seen": 44088176, "step": 36340 }, { "epoch": 4.047778149014367, "grad_norm": 0.13012021780014038, "learning_rate": 3.7138044793508755e-05, "loss": 0.4529, "num_input_tokens_seen": 44093936, "step": 36345 }, { "epoch": 4.048335003897984, "grad_norm": 0.08473571389913559, "learning_rate": 3.713379629129765e-05, "loss": 0.4619, "num_input_tokens_seen": 44100144, "step": 36350 }, { "epoch": 4.048891858781602, "grad_norm": 0.14084988832473755, "learning_rate": 3.712954733063284e-05, "loss": 0.4598, "num_input_tokens_seen": 44106192, "step": 36355 }, { "epoch": 4.049448713665218, "grad_norm": 0.16206441819667816, "learning_rate": 3.712529791167487e-05, "loss": 0.4607, "num_input_tokens_seen": 44112144, "step": 36360 }, { "epoch": 4.050005568548836, "grad_norm": 0.0940612182021141, "learning_rate": 3.7121048034584285e-05, "loss": 0.4735, "num_input_tokens_seen": 44118224, "step": 36365 }, { "epoch": 4.050562423432454, "grad_norm": 0.1306045651435852, "learning_rate": 3.711679769952165e-05, "loss": 0.4553, "num_input_tokens_seen": 44123920, "step": 36370 }, { "epoch": 4.0511192783160705, "grad_norm": 0.11862198263406754, "learning_rate": 3.711254690664758e-05, "loss": 0.4618, "num_input_tokens_seen": 44130128, "step": 36375 }, { "epoch": 4.051676133199688, "grad_norm": 0.08771985024213791, "learning_rate": 3.710829565612266e-05, "loss": 0.4501, "num_input_tokens_seen": 44136336, "step": 36380 }, { "epoch": 4.052232988083306, "grad_norm": 0.1297244131565094, "learning_rate": 3.710404394810754e-05, "loss": 0.4565, "num_input_tokens_seen": 44142512, "step": 36385 }, { "epoch": 4.052789842966923, "grad_norm": 0.07380961626768112, "learning_rate": 3.709979178276284e-05, "loss": 0.4703, "num_input_tokens_seen": 44148528, "step": 36390 }, { "epoch": 4.05334669785054, "grad_norm": 0.11505890637636185, "learning_rate": 3.709553916024924e-05, "loss": 0.4613, "num_input_tokens_seen": 44155024, "step": 36395 }, { "epoch": 4.053903552734157, "grad_norm": 0.1180453896522522, "learning_rate": 3.709128608072741e-05, "loss": 0.4482, "num_input_tokens_seen": 44161328, "step": 36400 }, { "epoch": 4.054460407617775, "grad_norm": 0.12426755577325821, "learning_rate": 3.708703254435803e-05, "loss": 0.4604, "num_input_tokens_seen": 44167632, "step": 36405 }, { "epoch": 4.0550172625013925, "grad_norm": 0.10394947230815887, "learning_rate": 3.708277855130183e-05, "loss": 0.4549, "num_input_tokens_seen": 44173904, "step": 36410 }, { "epoch": 4.055574117385009, "grad_norm": 0.09489349275827408, "learning_rate": 3.7078524101719536e-05, "loss": 0.4702, "num_input_tokens_seen": 44179984, "step": 36415 }, { "epoch": 4.056130972268627, "grad_norm": 0.09721432626247406, "learning_rate": 3.7074269195771906e-05, "loss": 0.4775, "num_input_tokens_seen": 44186096, "step": 36420 }, { "epoch": 4.056687827152244, "grad_norm": 0.11551833897829056, "learning_rate": 3.707001383361968e-05, "loss": 0.4603, "num_input_tokens_seen": 44191984, "step": 36425 }, { "epoch": 4.057244682035861, "grad_norm": 0.08416251093149185, "learning_rate": 3.706575801542365e-05, "loss": 0.464, "num_input_tokens_seen": 44197968, "step": 36430 }, { "epoch": 4.057801536919479, "grad_norm": 0.10819689929485321, "learning_rate": 3.706150174134462e-05, "loss": 0.4753, "num_input_tokens_seen": 44204080, "step": 36435 }, { "epoch": 4.058358391803096, "grad_norm": 0.1497109979391098, "learning_rate": 3.70572450115434e-05, "loss": 0.46, "num_input_tokens_seen": 44210192, "step": 36440 }, { "epoch": 4.0589152466867136, "grad_norm": 0.10918300598859787, "learning_rate": 3.7052987826180816e-05, "loss": 0.4594, "num_input_tokens_seen": 44216592, "step": 36445 }, { "epoch": 4.05947210157033, "grad_norm": 0.10523660480976105, "learning_rate": 3.704873018541774e-05, "loss": 0.4664, "num_input_tokens_seen": 44222864, "step": 36450 }, { "epoch": 4.060028956453948, "grad_norm": 0.11967029422521591, "learning_rate": 3.7044472089415025e-05, "loss": 0.4588, "num_input_tokens_seen": 44229296, "step": 36455 }, { "epoch": 4.060585811337566, "grad_norm": 0.10847590863704681, "learning_rate": 3.7040213538333545e-05, "loss": 0.4548, "num_input_tokens_seen": 44235376, "step": 36460 }, { "epoch": 4.0611426662211825, "grad_norm": 0.13140957057476044, "learning_rate": 3.7035954532334224e-05, "loss": 0.459, "num_input_tokens_seen": 44241680, "step": 36465 }, { "epoch": 4.0616995211048, "grad_norm": 0.09718485921621323, "learning_rate": 3.703169507157796e-05, "loss": 0.4633, "num_input_tokens_seen": 44247728, "step": 36470 }, { "epoch": 4.062256375988418, "grad_norm": 0.12296373397111893, "learning_rate": 3.70274351562257e-05, "loss": 0.4788, "num_input_tokens_seen": 44253936, "step": 36475 }, { "epoch": 4.062813230872035, "grad_norm": 0.09728050231933594, "learning_rate": 3.7023174786438396e-05, "loss": 0.4659, "num_input_tokens_seen": 44259824, "step": 36480 }, { "epoch": 4.063370085755652, "grad_norm": 0.11970940977334976, "learning_rate": 3.701891396237702e-05, "loss": 0.4605, "num_input_tokens_seen": 44265808, "step": 36485 }, { "epoch": 4.063926940639269, "grad_norm": 0.09692838042974472, "learning_rate": 3.7014652684202564e-05, "loss": 0.4742, "num_input_tokens_seen": 44271280, "step": 36490 }, { "epoch": 4.064483795522887, "grad_norm": 0.10043072700500488, "learning_rate": 3.7010390952076024e-05, "loss": 0.4708, "num_input_tokens_seen": 44276592, "step": 36495 }, { "epoch": 4.065040650406504, "grad_norm": 0.10097604244947433, "learning_rate": 3.700612876615842e-05, "loss": 0.4575, "num_input_tokens_seen": 44282800, "step": 36500 }, { "epoch": 4.065597505290121, "grad_norm": 0.1532386839389801, "learning_rate": 3.7001866126610795e-05, "loss": 0.4528, "num_input_tokens_seen": 44288784, "step": 36505 }, { "epoch": 4.066154360173739, "grad_norm": 0.132787823677063, "learning_rate": 3.699760303359421e-05, "loss": 0.4564, "num_input_tokens_seen": 44294288, "step": 36510 }, { "epoch": 4.066711215057356, "grad_norm": 0.15785743296146393, "learning_rate": 3.6993339487269734e-05, "loss": 0.4651, "num_input_tokens_seen": 44300656, "step": 36515 }, { "epoch": 4.067268069940973, "grad_norm": 0.10017917305231094, "learning_rate": 3.698907548779846e-05, "loss": 0.4685, "num_input_tokens_seen": 44306704, "step": 36520 }, { "epoch": 4.067824924824591, "grad_norm": 0.0708150565624237, "learning_rate": 3.698481103534149e-05, "loss": 0.4674, "num_input_tokens_seen": 44313104, "step": 36525 }, { "epoch": 4.068381779708208, "grad_norm": 0.09558596462011337, "learning_rate": 3.698054613005995e-05, "loss": 0.4567, "num_input_tokens_seen": 44318928, "step": 36530 }, { "epoch": 4.0689386345918255, "grad_norm": 0.11953490227460861, "learning_rate": 3.697628077211499e-05, "loss": 0.4667, "num_input_tokens_seen": 44324464, "step": 36535 }, { "epoch": 4.069495489475442, "grad_norm": 0.09145766496658325, "learning_rate": 3.697201496166775e-05, "loss": 0.456, "num_input_tokens_seen": 44330192, "step": 36540 }, { "epoch": 4.07005234435906, "grad_norm": 0.0814773365855217, "learning_rate": 3.696774869887943e-05, "loss": 0.4692, "num_input_tokens_seen": 44335248, "step": 36545 }, { "epoch": 4.070609199242678, "grad_norm": 0.08840407431125641, "learning_rate": 3.696348198391121e-05, "loss": 0.4536, "num_input_tokens_seen": 44341360, "step": 36550 }, { "epoch": 4.071166054126294, "grad_norm": 0.10275428742170334, "learning_rate": 3.6959214816924296e-05, "loss": 0.4516, "num_input_tokens_seen": 44347728, "step": 36555 }, { "epoch": 4.071722909009912, "grad_norm": 0.14110645651817322, "learning_rate": 3.695494719807993e-05, "loss": 0.4679, "num_input_tokens_seen": 44353584, "step": 36560 }, { "epoch": 4.07227976389353, "grad_norm": 0.11523621529340744, "learning_rate": 3.6950679127539344e-05, "loss": 0.4636, "num_input_tokens_seen": 44360016, "step": 36565 }, { "epoch": 4.0728366187771465, "grad_norm": 0.11364156007766724, "learning_rate": 3.69464106054638e-05, "loss": 0.4566, "num_input_tokens_seen": 44365648, "step": 36570 }, { "epoch": 4.073393473660764, "grad_norm": 0.1120029091835022, "learning_rate": 3.6942141632014574e-05, "loss": 0.455, "num_input_tokens_seen": 44371632, "step": 36575 }, { "epoch": 4.073950328544381, "grad_norm": 0.11644746363162994, "learning_rate": 3.6937872207352975e-05, "loss": 0.4574, "num_input_tokens_seen": 44377264, "step": 36580 }, { "epoch": 4.074507183427999, "grad_norm": 0.1884140968322754, "learning_rate": 3.693360233164031e-05, "loss": 0.4534, "num_input_tokens_seen": 44383312, "step": 36585 }, { "epoch": 4.075064038311616, "grad_norm": 0.09921436756849289, "learning_rate": 3.69293320050379e-05, "loss": 0.4744, "num_input_tokens_seen": 44389104, "step": 36590 }, { "epoch": 4.075620893195233, "grad_norm": 0.08914660662412643, "learning_rate": 3.69250612277071e-05, "loss": 0.4603, "num_input_tokens_seen": 44395120, "step": 36595 }, { "epoch": 4.076177748078851, "grad_norm": 0.07495571672916412, "learning_rate": 3.692078999980927e-05, "loss": 0.4681, "num_input_tokens_seen": 44401520, "step": 36600 }, { "epoch": 4.076734602962468, "grad_norm": 0.09521441161632538, "learning_rate": 3.691651832150579e-05, "loss": 0.4619, "num_input_tokens_seen": 44407856, "step": 36605 }, { "epoch": 4.077291457846085, "grad_norm": 0.10094977170228958, "learning_rate": 3.691224619295806e-05, "loss": 0.4557, "num_input_tokens_seen": 44413232, "step": 36610 }, { "epoch": 4.077848312729703, "grad_norm": 0.13930031657218933, "learning_rate": 3.69079736143275e-05, "loss": 0.4581, "num_input_tokens_seen": 44419088, "step": 36615 }, { "epoch": 4.07840516761332, "grad_norm": 0.11580008268356323, "learning_rate": 3.6903700585775525e-05, "loss": 0.4744, "num_input_tokens_seen": 44425232, "step": 36620 }, { "epoch": 4.078962022496937, "grad_norm": 0.1633131057024002, "learning_rate": 3.689942710746361e-05, "loss": 0.4635, "num_input_tokens_seen": 44431024, "step": 36625 }, { "epoch": 4.079518877380554, "grad_norm": 0.1510869413614273, "learning_rate": 3.689515317955319e-05, "loss": 0.4486, "num_input_tokens_seen": 44437392, "step": 36630 }, { "epoch": 4.080075732264172, "grad_norm": 0.11691971123218536, "learning_rate": 3.6890878802205764e-05, "loss": 0.4526, "num_input_tokens_seen": 44443760, "step": 36635 }, { "epoch": 4.0806325871477895, "grad_norm": 0.09982825070619583, "learning_rate": 3.6886603975582836e-05, "loss": 0.4643, "num_input_tokens_seen": 44449648, "step": 36640 }, { "epoch": 4.081189442031406, "grad_norm": 0.07812131941318512, "learning_rate": 3.688232869984592e-05, "loss": 0.4648, "num_input_tokens_seen": 44455664, "step": 36645 }, { "epoch": 4.081746296915024, "grad_norm": 0.13598161935806274, "learning_rate": 3.6878052975156544e-05, "loss": 0.4484, "num_input_tokens_seen": 44461520, "step": 36650 }, { "epoch": 4.082303151798642, "grad_norm": 0.10851085931062698, "learning_rate": 3.6873776801676264e-05, "loss": 0.4495, "num_input_tokens_seen": 44467760, "step": 36655 }, { "epoch": 4.0828600066822585, "grad_norm": 0.15115386247634888, "learning_rate": 3.686950017956664e-05, "loss": 0.4742, "num_input_tokens_seen": 44474128, "step": 36660 }, { "epoch": 4.083416861565876, "grad_norm": 0.16790033876895905, "learning_rate": 3.686522310898926e-05, "loss": 0.4692, "num_input_tokens_seen": 44480432, "step": 36665 }, { "epoch": 4.083973716449493, "grad_norm": 0.09092945605516434, "learning_rate": 3.686094559010573e-05, "loss": 0.4612, "num_input_tokens_seen": 44486608, "step": 36670 }, { "epoch": 4.084530571333111, "grad_norm": 0.11500872671604156, "learning_rate": 3.6856667623077665e-05, "loss": 0.4697, "num_input_tokens_seen": 44492816, "step": 36675 }, { "epoch": 4.085087426216728, "grad_norm": 0.11937600374221802, "learning_rate": 3.685238920806671e-05, "loss": 0.4565, "num_input_tokens_seen": 44498704, "step": 36680 }, { "epoch": 4.085644281100345, "grad_norm": 0.15660293400287628, "learning_rate": 3.68481103452345e-05, "loss": 0.4421, "num_input_tokens_seen": 44504880, "step": 36685 }, { "epoch": 4.086201135983963, "grad_norm": 0.16102160513401031, "learning_rate": 3.68438310347427e-05, "loss": 0.4547, "num_input_tokens_seen": 44510864, "step": 36690 }, { "epoch": 4.0867579908675795, "grad_norm": 0.10795280337333679, "learning_rate": 3.6839551276753024e-05, "loss": 0.465, "num_input_tokens_seen": 44516912, "step": 36695 }, { "epoch": 4.087314845751197, "grad_norm": 0.11883742362260818, "learning_rate": 3.6835271071427155e-05, "loss": 0.4657, "num_input_tokens_seen": 44522384, "step": 36700 }, { "epoch": 4.087871700634815, "grad_norm": 0.11023721098899841, "learning_rate": 3.683099041892682e-05, "loss": 0.4563, "num_input_tokens_seen": 44528784, "step": 36705 }, { "epoch": 4.088428555518432, "grad_norm": 0.10235805064439774, "learning_rate": 3.682670931941375e-05, "loss": 0.4541, "num_input_tokens_seen": 44534704, "step": 36710 }, { "epoch": 4.088985410402049, "grad_norm": 0.14100022614002228, "learning_rate": 3.68224277730497e-05, "loss": 0.4698, "num_input_tokens_seen": 44540816, "step": 36715 }, { "epoch": 4.089542265285667, "grad_norm": 0.1363554149866104, "learning_rate": 3.6818145779996445e-05, "loss": 0.467, "num_input_tokens_seen": 44546256, "step": 36720 }, { "epoch": 4.090099120169284, "grad_norm": 0.11631467938423157, "learning_rate": 3.681386334041576e-05, "loss": 0.4704, "num_input_tokens_seen": 44551984, "step": 36725 }, { "epoch": 4.0906559750529015, "grad_norm": 0.14222154021263123, "learning_rate": 3.680958045446947e-05, "loss": 0.4577, "num_input_tokens_seen": 44558160, "step": 36730 }, { "epoch": 4.091212829936518, "grad_norm": 0.1034819558262825, "learning_rate": 3.6805297122319374e-05, "loss": 0.4623, "num_input_tokens_seen": 44564496, "step": 36735 }, { "epoch": 4.091769684820136, "grad_norm": 0.10176973789930344, "learning_rate": 3.680101334412733e-05, "loss": 0.4652, "num_input_tokens_seen": 44570576, "step": 36740 }, { "epoch": 4.092326539703754, "grad_norm": 0.11812802404165268, "learning_rate": 3.679672912005518e-05, "loss": 0.4486, "num_input_tokens_seen": 44576592, "step": 36745 }, { "epoch": 4.09288339458737, "grad_norm": 0.1406761109828949, "learning_rate": 3.679244445026481e-05, "loss": 0.4602, "num_input_tokens_seen": 44582832, "step": 36750 }, { "epoch": 4.093440249470988, "grad_norm": 0.11483170837163925, "learning_rate": 3.678815933491808e-05, "loss": 0.4792, "num_input_tokens_seen": 44588592, "step": 36755 }, { "epoch": 4.093997104354605, "grad_norm": 0.08175478875637054, "learning_rate": 3.678387377417692e-05, "loss": 0.4579, "num_input_tokens_seen": 44595056, "step": 36760 }, { "epoch": 4.0945539592382225, "grad_norm": 0.11669570952653885, "learning_rate": 3.677958776820325e-05, "loss": 0.4558, "num_input_tokens_seen": 44601040, "step": 36765 }, { "epoch": 4.09511081412184, "grad_norm": 0.12566454708576202, "learning_rate": 3.6775301317159e-05, "loss": 0.4647, "num_input_tokens_seen": 44607024, "step": 36770 }, { "epoch": 4.095667669005457, "grad_norm": 0.10741235315799713, "learning_rate": 3.6771014421206136e-05, "loss": 0.4595, "num_input_tokens_seen": 44613296, "step": 36775 }, { "epoch": 4.096224523889075, "grad_norm": 0.08736719191074371, "learning_rate": 3.676672708050662e-05, "loss": 0.4608, "num_input_tokens_seen": 44619248, "step": 36780 }, { "epoch": 4.096781378772691, "grad_norm": 0.11576691269874573, "learning_rate": 3.676243929522244e-05, "loss": 0.4638, "num_input_tokens_seen": 44625552, "step": 36785 }, { "epoch": 4.097338233656309, "grad_norm": 0.1249382346868515, "learning_rate": 3.6758151065515616e-05, "loss": 0.4643, "num_input_tokens_seen": 44631376, "step": 36790 }, { "epoch": 4.097895088539927, "grad_norm": 0.10697147995233536, "learning_rate": 3.6753862391548166e-05, "loss": 0.4665, "num_input_tokens_seen": 44637680, "step": 36795 }, { "epoch": 4.098451943423544, "grad_norm": 0.11151737719774246, "learning_rate": 3.674957327348213e-05, "loss": 0.4652, "num_input_tokens_seen": 44643728, "step": 36800 }, { "epoch": 4.099008798307161, "grad_norm": 0.15570832788944244, "learning_rate": 3.674528371147956e-05, "loss": 0.4579, "num_input_tokens_seen": 44649488, "step": 36805 }, { "epoch": 4.099565653190778, "grad_norm": 0.10601744055747986, "learning_rate": 3.674099370570253e-05, "loss": 0.4693, "num_input_tokens_seen": 44655952, "step": 36810 }, { "epoch": 4.100122508074396, "grad_norm": 0.1242191419005394, "learning_rate": 3.673670325631313e-05, "loss": 0.4519, "num_input_tokens_seen": 44662096, "step": 36815 }, { "epoch": 4.100679362958013, "grad_norm": 0.07479459047317505, "learning_rate": 3.6732412363473466e-05, "loss": 0.4609, "num_input_tokens_seen": 44668080, "step": 36820 }, { "epoch": 4.10123621784163, "grad_norm": 0.11601120978593826, "learning_rate": 3.6728121027345675e-05, "loss": 0.4468, "num_input_tokens_seen": 44673808, "step": 36825 }, { "epoch": 4.101793072725248, "grad_norm": 0.08420132100582123, "learning_rate": 3.6723829248091886e-05, "loss": 0.4478, "num_input_tokens_seen": 44680048, "step": 36830 }, { "epoch": 4.1023499276088655, "grad_norm": 0.10786144435405731, "learning_rate": 3.671953702587425e-05, "loss": 0.4741, "num_input_tokens_seen": 44685680, "step": 36835 }, { "epoch": 4.102906782492482, "grad_norm": 0.12553682923316956, "learning_rate": 3.671524436085496e-05, "loss": 0.4504, "num_input_tokens_seen": 44692080, "step": 36840 }, { "epoch": 4.1034636373761, "grad_norm": 0.13451091945171356, "learning_rate": 3.6710951253196173e-05, "loss": 0.4825, "num_input_tokens_seen": 44697936, "step": 36845 }, { "epoch": 4.104020492259717, "grad_norm": 0.10820748656988144, "learning_rate": 3.670665770306013e-05, "loss": 0.4601, "num_input_tokens_seen": 44704048, "step": 36850 }, { "epoch": 4.104577347143334, "grad_norm": 0.10672090202569962, "learning_rate": 3.670236371060904e-05, "loss": 0.4588, "num_input_tokens_seen": 44710192, "step": 36855 }, { "epoch": 4.105134202026952, "grad_norm": 0.16447357833385468, "learning_rate": 3.669806927600516e-05, "loss": 0.457, "num_input_tokens_seen": 44716272, "step": 36860 }, { "epoch": 4.105691056910569, "grad_norm": 0.09416060894727707, "learning_rate": 3.669377439941072e-05, "loss": 0.4645, "num_input_tokens_seen": 44722480, "step": 36865 }, { "epoch": 4.106247911794187, "grad_norm": 0.12673619389533997, "learning_rate": 3.668947908098802e-05, "loss": 0.4541, "num_input_tokens_seen": 44728656, "step": 36870 }, { "epoch": 4.106804766677803, "grad_norm": 0.09310580044984818, "learning_rate": 3.668518332089933e-05, "loss": 0.4647, "num_input_tokens_seen": 44734704, "step": 36875 }, { "epoch": 4.107361621561421, "grad_norm": 0.1259438544511795, "learning_rate": 3.668088711930697e-05, "loss": 0.4379, "num_input_tokens_seen": 44740432, "step": 36880 }, { "epoch": 4.107918476445039, "grad_norm": 0.09499816596508026, "learning_rate": 3.667659047637326e-05, "loss": 0.4482, "num_input_tokens_seen": 44746736, "step": 36885 }, { "epoch": 4.1084753313286555, "grad_norm": 0.12147478759288788, "learning_rate": 3.667229339226055e-05, "loss": 0.4649, "num_input_tokens_seen": 44752752, "step": 36890 }, { "epoch": 4.109032186212273, "grad_norm": 0.09511220455169678, "learning_rate": 3.6667995867131184e-05, "loss": 0.4605, "num_input_tokens_seen": 44758896, "step": 36895 }, { "epoch": 4.109589041095891, "grad_norm": 0.10264620184898376, "learning_rate": 3.666369790114754e-05, "loss": 0.4715, "num_input_tokens_seen": 44764464, "step": 36900 }, { "epoch": 4.110145895979508, "grad_norm": 0.13129831850528717, "learning_rate": 3.665939949447201e-05, "loss": 0.4682, "num_input_tokens_seen": 44770736, "step": 36905 }, { "epoch": 4.110702750863125, "grad_norm": 0.09094908088445663, "learning_rate": 3.6655100647267004e-05, "loss": 0.4603, "num_input_tokens_seen": 44776368, "step": 36910 }, { "epoch": 4.111259605746742, "grad_norm": 0.13848905265331268, "learning_rate": 3.6650801359694954e-05, "loss": 0.4672, "num_input_tokens_seen": 44782512, "step": 36915 }, { "epoch": 4.11181646063036, "grad_norm": 0.12507809698581696, "learning_rate": 3.664650163191828e-05, "loss": 0.4534, "num_input_tokens_seen": 44788784, "step": 36920 }, { "epoch": 4.1123733155139774, "grad_norm": 0.09794057160615921, "learning_rate": 3.664220146409946e-05, "loss": 0.4565, "num_input_tokens_seen": 44794896, "step": 36925 }, { "epoch": 4.112930170397594, "grad_norm": 0.12114337831735611, "learning_rate": 3.663790085640095e-05, "loss": 0.4719, "num_input_tokens_seen": 44801264, "step": 36930 }, { "epoch": 4.113487025281212, "grad_norm": 0.10044599324464798, "learning_rate": 3.663359980898526e-05, "loss": 0.4702, "num_input_tokens_seen": 44807216, "step": 36935 }, { "epoch": 4.114043880164829, "grad_norm": 0.1319068819284439, "learning_rate": 3.662929832201488e-05, "loss": 0.458, "num_input_tokens_seen": 44813040, "step": 36940 }, { "epoch": 4.114600735048446, "grad_norm": 0.09306079894304276, "learning_rate": 3.662499639565235e-05, "loss": 0.4488, "num_input_tokens_seen": 44818896, "step": 36945 }, { "epoch": 4.115157589932064, "grad_norm": 0.12262418121099472, "learning_rate": 3.662069403006019e-05, "loss": 0.4517, "num_input_tokens_seen": 44825232, "step": 36950 }, { "epoch": 4.115714444815681, "grad_norm": 0.11214756220579147, "learning_rate": 3.661639122540098e-05, "loss": 0.4735, "num_input_tokens_seen": 44831440, "step": 36955 }, { "epoch": 4.1162712996992985, "grad_norm": 0.14371445775032043, "learning_rate": 3.661208798183728e-05, "loss": 0.4469, "num_input_tokens_seen": 44837712, "step": 36960 }, { "epoch": 4.116828154582915, "grad_norm": 0.14607833325862885, "learning_rate": 3.660778429953168e-05, "loss": 0.4632, "num_input_tokens_seen": 44843920, "step": 36965 }, { "epoch": 4.117385009466533, "grad_norm": 0.12967850267887115, "learning_rate": 3.66034801786468e-05, "loss": 0.4642, "num_input_tokens_seen": 44850352, "step": 36970 }, { "epoch": 4.117941864350151, "grad_norm": 0.12979009747505188, "learning_rate": 3.659917561934524e-05, "loss": 0.4558, "num_input_tokens_seen": 44856496, "step": 36975 }, { "epoch": 4.118498719233767, "grad_norm": 0.11755955219268799, "learning_rate": 3.659487062178966e-05, "loss": 0.4649, "num_input_tokens_seen": 44862768, "step": 36980 }, { "epoch": 4.119055574117385, "grad_norm": 0.13633891940116882, "learning_rate": 3.659056518614271e-05, "loss": 0.4693, "num_input_tokens_seen": 44868688, "step": 36985 }, { "epoch": 4.119612429001003, "grad_norm": 0.1030922383069992, "learning_rate": 3.6586259312567064e-05, "loss": 0.4533, "num_input_tokens_seen": 44874640, "step": 36990 }, { "epoch": 4.12016928388462, "grad_norm": 0.12257840484380722, "learning_rate": 3.6581953001225414e-05, "loss": 0.4685, "num_input_tokens_seen": 44880784, "step": 36995 }, { "epoch": 4.120726138768237, "grad_norm": 0.1447284072637558, "learning_rate": 3.6577646252280455e-05, "loss": 0.4693, "num_input_tokens_seen": 44887312, "step": 37000 }, { "epoch": 4.121282993651854, "grad_norm": 0.1351063996553421, "learning_rate": 3.6573339065894926e-05, "loss": 0.452, "num_input_tokens_seen": 44893744, "step": 37005 }, { "epoch": 4.121839848535472, "grad_norm": 0.15183593332767487, "learning_rate": 3.656903144223155e-05, "loss": 0.4554, "num_input_tokens_seen": 44900208, "step": 37010 }, { "epoch": 4.122396703419089, "grad_norm": 0.11883331835269928, "learning_rate": 3.6564723381453104e-05, "loss": 0.4553, "num_input_tokens_seen": 44906352, "step": 37015 }, { "epoch": 4.122953558302706, "grad_norm": 0.1327361911535263, "learning_rate": 3.656041488372234e-05, "loss": 0.462, "num_input_tokens_seen": 44912624, "step": 37020 }, { "epoch": 4.123510413186324, "grad_norm": 0.08563806861639023, "learning_rate": 3.6556105949202055e-05, "loss": 0.4661, "num_input_tokens_seen": 44918864, "step": 37025 }, { "epoch": 4.124067268069941, "grad_norm": 0.11673075705766678, "learning_rate": 3.6551796578055053e-05, "loss": 0.4581, "num_input_tokens_seen": 44924976, "step": 37030 }, { "epoch": 4.124624122953558, "grad_norm": 0.1275230050086975, "learning_rate": 3.6547486770444164e-05, "loss": 0.4741, "num_input_tokens_seen": 44931056, "step": 37035 }, { "epoch": 4.125180977837176, "grad_norm": 0.10417809337377548, "learning_rate": 3.654317652653221e-05, "loss": 0.467, "num_input_tokens_seen": 44937136, "step": 37040 }, { "epoch": 4.125737832720793, "grad_norm": 0.12742292881011963, "learning_rate": 3.6538865846482065e-05, "loss": 0.4638, "num_input_tokens_seen": 44943152, "step": 37045 }, { "epoch": 4.12629468760441, "grad_norm": 0.16019290685653687, "learning_rate": 3.653455473045658e-05, "loss": 0.4675, "num_input_tokens_seen": 44949264, "step": 37050 }, { "epoch": 4.126851542488027, "grad_norm": 0.1508679836988449, "learning_rate": 3.653024317861866e-05, "loss": 0.467, "num_input_tokens_seen": 44955600, "step": 37055 }, { "epoch": 4.127408397371645, "grad_norm": 0.16347943246364594, "learning_rate": 3.652593119113119e-05, "loss": 0.4664, "num_input_tokens_seen": 44961552, "step": 37060 }, { "epoch": 4.127965252255263, "grad_norm": 0.11149842292070389, "learning_rate": 3.652161876815711e-05, "loss": 0.457, "num_input_tokens_seen": 44967568, "step": 37065 }, { "epoch": 4.128522107138879, "grad_norm": 0.13746730983257294, "learning_rate": 3.651730590985936e-05, "loss": 0.4782, "num_input_tokens_seen": 44973904, "step": 37070 }, { "epoch": 4.129078962022497, "grad_norm": 0.14459258317947388, "learning_rate": 3.651299261640087e-05, "loss": 0.4537, "num_input_tokens_seen": 44979568, "step": 37075 }, { "epoch": 4.129635816906115, "grad_norm": 0.1007247194647789, "learning_rate": 3.650867888794463e-05, "loss": 0.4519, "num_input_tokens_seen": 44985680, "step": 37080 }, { "epoch": 4.1301926717897315, "grad_norm": 0.10946303606033325, "learning_rate": 3.650436472465362e-05, "loss": 0.4593, "num_input_tokens_seen": 44991760, "step": 37085 }, { "epoch": 4.130749526673349, "grad_norm": 0.11415143311023712, "learning_rate": 3.650005012669084e-05, "loss": 0.447, "num_input_tokens_seen": 44997712, "step": 37090 }, { "epoch": 4.131306381556966, "grad_norm": 0.10161954909563065, "learning_rate": 3.6495735094219316e-05, "loss": 0.4787, "num_input_tokens_seen": 45003472, "step": 37095 }, { "epoch": 4.131863236440584, "grad_norm": 0.1447151154279709, "learning_rate": 3.649141962740208e-05, "loss": 0.4777, "num_input_tokens_seen": 45009584, "step": 37100 }, { "epoch": 4.132420091324201, "grad_norm": 0.10321067273616791, "learning_rate": 3.648710372640218e-05, "loss": 0.4489, "num_input_tokens_seen": 45015536, "step": 37105 }, { "epoch": 4.132976946207818, "grad_norm": 0.14250540733337402, "learning_rate": 3.648278739138269e-05, "loss": 0.4614, "num_input_tokens_seen": 45021840, "step": 37110 }, { "epoch": 4.133533801091436, "grad_norm": 0.13679824769496918, "learning_rate": 3.64784706225067e-05, "loss": 0.4595, "num_input_tokens_seen": 45027952, "step": 37115 }, { "epoch": 4.1340906559750525, "grad_norm": 0.11135552078485489, "learning_rate": 3.647415341993731e-05, "loss": 0.4539, "num_input_tokens_seen": 45034096, "step": 37120 }, { "epoch": 4.13464751085867, "grad_norm": 0.11948440968990326, "learning_rate": 3.646983578383762e-05, "loss": 0.4476, "num_input_tokens_seen": 45040144, "step": 37125 }, { "epoch": 4.135204365742288, "grad_norm": 0.14159637689590454, "learning_rate": 3.646551771437078e-05, "loss": 0.4527, "num_input_tokens_seen": 45046352, "step": 37130 }, { "epoch": 4.135761220625905, "grad_norm": 0.1402195245027542, "learning_rate": 3.6461199211699934e-05, "loss": 0.4591, "num_input_tokens_seen": 45052432, "step": 37135 }, { "epoch": 4.136318075509522, "grad_norm": 0.11461219191551208, "learning_rate": 3.645688027598826e-05, "loss": 0.471, "num_input_tokens_seen": 45058736, "step": 37140 }, { "epoch": 4.136874930393139, "grad_norm": 0.10859175026416779, "learning_rate": 3.6452560907398933e-05, "loss": 0.4606, "num_input_tokens_seen": 45064880, "step": 37145 }, { "epoch": 4.137431785276757, "grad_norm": 0.16489914059638977, "learning_rate": 3.644824110609515e-05, "loss": 0.4546, "num_input_tokens_seen": 45070512, "step": 37150 }, { "epoch": 4.1379886401603745, "grad_norm": 0.10685378313064575, "learning_rate": 3.644392087224014e-05, "loss": 0.4578, "num_input_tokens_seen": 45076624, "step": 37155 }, { "epoch": 4.138545495043991, "grad_norm": 0.12819646298885345, "learning_rate": 3.6439600205997114e-05, "loss": 0.4583, "num_input_tokens_seen": 45081840, "step": 37160 }, { "epoch": 4.139102349927609, "grad_norm": 0.16352835297584534, "learning_rate": 3.643527910752934e-05, "loss": 0.4595, "num_input_tokens_seen": 45087664, "step": 37165 }, { "epoch": 4.139659204811227, "grad_norm": 0.12916089594364166, "learning_rate": 3.643095757700007e-05, "loss": 0.4513, "num_input_tokens_seen": 45093680, "step": 37170 }, { "epoch": 4.140216059694843, "grad_norm": 0.13504183292388916, "learning_rate": 3.642663561457259e-05, "loss": 0.4597, "num_input_tokens_seen": 45099408, "step": 37175 }, { "epoch": 4.140772914578461, "grad_norm": 0.13015130162239075, "learning_rate": 3.6422313220410205e-05, "loss": 0.4611, "num_input_tokens_seen": 45105616, "step": 37180 }, { "epoch": 4.141329769462078, "grad_norm": 0.11408402770757675, "learning_rate": 3.641799039467622e-05, "loss": 0.445, "num_input_tokens_seen": 45111664, "step": 37185 }, { "epoch": 4.1418866243456955, "grad_norm": 0.16991080343723297, "learning_rate": 3.6413667137533956e-05, "loss": 0.4778, "num_input_tokens_seen": 45117552, "step": 37190 }, { "epoch": 4.142443479229313, "grad_norm": 0.12249108403921127, "learning_rate": 3.6409343449146784e-05, "loss": 0.4537, "num_input_tokens_seen": 45123536, "step": 37195 }, { "epoch": 4.14300033411293, "grad_norm": 0.11571869999170303, "learning_rate": 3.640501932967805e-05, "loss": 0.4733, "num_input_tokens_seen": 45129872, "step": 37200 }, { "epoch": 4.143557188996548, "grad_norm": 0.12773245573043823, "learning_rate": 3.640069477929113e-05, "loss": 0.4429, "num_input_tokens_seen": 45135984, "step": 37205 }, { "epoch": 4.1441140438801645, "grad_norm": 0.1649814248085022, "learning_rate": 3.6396369798149434e-05, "loss": 0.4704, "num_input_tokens_seen": 45142384, "step": 37210 }, { "epoch": 4.144670898763782, "grad_norm": 0.09064634889364243, "learning_rate": 3.639204438641636e-05, "loss": 0.45, "num_input_tokens_seen": 45148368, "step": 37215 }, { "epoch": 4.1452277536474, "grad_norm": 0.12846285104751587, "learning_rate": 3.638771854425534e-05, "loss": 0.4547, "num_input_tokens_seen": 45154608, "step": 37220 }, { "epoch": 4.145784608531017, "grad_norm": 0.07970768958330154, "learning_rate": 3.6383392271829826e-05, "loss": 0.453, "num_input_tokens_seen": 45160528, "step": 37225 }, { "epoch": 4.146341463414634, "grad_norm": 0.11864202469587326, "learning_rate": 3.6379065569303275e-05, "loss": 0.4632, "num_input_tokens_seen": 45166768, "step": 37230 }, { "epoch": 4.146898318298251, "grad_norm": 0.12632673978805542, "learning_rate": 3.637473843683915e-05, "loss": 0.4587, "num_input_tokens_seen": 45173040, "step": 37235 }, { "epoch": 4.147455173181869, "grad_norm": 0.10199178010225296, "learning_rate": 3.637041087460097e-05, "loss": 0.4422, "num_input_tokens_seen": 45178352, "step": 37240 }, { "epoch": 4.148012028065486, "grad_norm": 0.1778450757265091, "learning_rate": 3.636608288275222e-05, "loss": 0.4709, "num_input_tokens_seen": 45184368, "step": 37245 }, { "epoch": 4.148568882949103, "grad_norm": 0.12040450423955917, "learning_rate": 3.636175446145642e-05, "loss": 0.465, "num_input_tokens_seen": 45190640, "step": 37250 }, { "epoch": 4.149125737832721, "grad_norm": 0.13079898059368134, "learning_rate": 3.6357425610877146e-05, "loss": 0.4579, "num_input_tokens_seen": 45196336, "step": 37255 }, { "epoch": 4.1496825927163385, "grad_norm": 0.13448598980903625, "learning_rate": 3.635309633117793e-05, "loss": 0.4642, "num_input_tokens_seen": 45202256, "step": 37260 }, { "epoch": 4.150239447599955, "grad_norm": 0.14316847920417786, "learning_rate": 3.634876662252235e-05, "loss": 0.4675, "num_input_tokens_seen": 45208240, "step": 37265 }, { "epoch": 4.150796302483573, "grad_norm": 0.12205557525157928, "learning_rate": 3.6344436485074e-05, "loss": 0.4588, "num_input_tokens_seen": 45214704, "step": 37270 }, { "epoch": 4.15135315736719, "grad_norm": 0.16818532347679138, "learning_rate": 3.634010591899649e-05, "loss": 0.4785, "num_input_tokens_seen": 45220592, "step": 37275 }, { "epoch": 4.1519100122508075, "grad_norm": 0.11651646345853806, "learning_rate": 3.6335774924453426e-05, "loss": 0.4682, "num_input_tokens_seen": 45226736, "step": 37280 }, { "epoch": 4.152466867134425, "grad_norm": 0.1535521298646927, "learning_rate": 3.633144350160846e-05, "loss": 0.4469, "num_input_tokens_seen": 45232592, "step": 37285 }, { "epoch": 4.153023722018042, "grad_norm": 0.11844746768474579, "learning_rate": 3.632711165062525e-05, "loss": 0.4772, "num_input_tokens_seen": 45238320, "step": 37290 }, { "epoch": 4.15358057690166, "grad_norm": 0.09553694725036621, "learning_rate": 3.632277937166746e-05, "loss": 0.4646, "num_input_tokens_seen": 45244272, "step": 37295 }, { "epoch": 4.154137431785276, "grad_norm": 0.10392816364765167, "learning_rate": 3.631844666489878e-05, "loss": 0.4573, "num_input_tokens_seen": 45250480, "step": 37300 }, { "epoch": 4.154694286668894, "grad_norm": 0.11347018927335739, "learning_rate": 3.631411353048291e-05, "loss": 0.477, "num_input_tokens_seen": 45256496, "step": 37305 }, { "epoch": 4.155251141552512, "grad_norm": 0.1539570391178131, "learning_rate": 3.6309779968583576e-05, "loss": 0.4633, "num_input_tokens_seen": 45262832, "step": 37310 }, { "epoch": 4.1558079964361285, "grad_norm": 0.1141834408044815, "learning_rate": 3.630544597936451e-05, "loss": 0.4721, "num_input_tokens_seen": 45269072, "step": 37315 }, { "epoch": 4.156364851319746, "grad_norm": 0.10117839276790619, "learning_rate": 3.630111156298947e-05, "loss": 0.4491, "num_input_tokens_seen": 45274992, "step": 37320 }, { "epoch": 4.156921706203363, "grad_norm": 0.13344910740852356, "learning_rate": 3.629677671962222e-05, "loss": 0.4578, "num_input_tokens_seen": 45281616, "step": 37325 }, { "epoch": 4.157478561086981, "grad_norm": 0.10180861502885818, "learning_rate": 3.629244144942653e-05, "loss": 0.4514, "num_input_tokens_seen": 45287600, "step": 37330 }, { "epoch": 4.158035415970598, "grad_norm": 0.11326880007982254, "learning_rate": 3.6288105752566225e-05, "loss": 0.4696, "num_input_tokens_seen": 45293776, "step": 37335 }, { "epoch": 4.158592270854215, "grad_norm": 0.09699950367212296, "learning_rate": 3.628376962920511e-05, "loss": 0.4749, "num_input_tokens_seen": 45299920, "step": 37340 }, { "epoch": 4.159149125737833, "grad_norm": 0.13029174506664276, "learning_rate": 3.6279433079507006e-05, "loss": 0.464, "num_input_tokens_seen": 45305872, "step": 37345 }, { "epoch": 4.1597059806214505, "grad_norm": 0.14087533950805664, "learning_rate": 3.6275096103635786e-05, "loss": 0.4428, "num_input_tokens_seen": 45312048, "step": 37350 }, { "epoch": 4.160262835505067, "grad_norm": 0.09618725627660751, "learning_rate": 3.6270758701755294e-05, "loss": 0.4547, "num_input_tokens_seen": 45318416, "step": 37355 }, { "epoch": 4.160819690388685, "grad_norm": 0.10718128085136414, "learning_rate": 3.626642087402943e-05, "loss": 0.4696, "num_input_tokens_seen": 45324432, "step": 37360 }, { "epoch": 4.161376545272302, "grad_norm": 0.09830797463655472, "learning_rate": 3.6262082620622074e-05, "loss": 0.4519, "num_input_tokens_seen": 45330768, "step": 37365 }, { "epoch": 4.161933400155919, "grad_norm": 0.15419761836528778, "learning_rate": 3.625774394169715e-05, "loss": 0.4508, "num_input_tokens_seen": 45337168, "step": 37370 }, { "epoch": 4.162490255039537, "grad_norm": 0.12168211489915848, "learning_rate": 3.625340483741857e-05, "loss": 0.4562, "num_input_tokens_seen": 45343696, "step": 37375 }, { "epoch": 4.163047109923154, "grad_norm": 0.12826548516750336, "learning_rate": 3.62490653079503e-05, "loss": 0.4796, "num_input_tokens_seen": 45349776, "step": 37380 }, { "epoch": 4.1636039648067715, "grad_norm": 0.18342222273349762, "learning_rate": 3.62447253534563e-05, "loss": 0.4716, "num_input_tokens_seen": 45355888, "step": 37385 }, { "epoch": 4.164160819690388, "grad_norm": 0.1123126670718193, "learning_rate": 3.6240384974100536e-05, "loss": 0.4659, "num_input_tokens_seen": 45362288, "step": 37390 }, { "epoch": 4.164717674574006, "grad_norm": 0.1181248128414154, "learning_rate": 3.6236044170047004e-05, "loss": 0.4563, "num_input_tokens_seen": 45368144, "step": 37395 }, { "epoch": 4.165274529457624, "grad_norm": 0.1145881712436676, "learning_rate": 3.623170294145971e-05, "loss": 0.4633, "num_input_tokens_seen": 45374128, "step": 37400 }, { "epoch": 4.1658313843412405, "grad_norm": 0.11252405494451523, "learning_rate": 3.622736128850269e-05, "loss": 0.452, "num_input_tokens_seen": 45380080, "step": 37405 }, { "epoch": 4.166388239224858, "grad_norm": 0.12897983193397522, "learning_rate": 3.622301921133998e-05, "loss": 0.4693, "num_input_tokens_seen": 45386448, "step": 37410 }, { "epoch": 4.166945094108475, "grad_norm": 0.12405321002006531, "learning_rate": 3.6218676710135645e-05, "loss": 0.4613, "num_input_tokens_seen": 45392784, "step": 37415 }, { "epoch": 4.167501948992093, "grad_norm": 0.13641981780529022, "learning_rate": 3.6214333785053744e-05, "loss": 0.4752, "num_input_tokens_seen": 45398992, "step": 37420 }, { "epoch": 4.16805880387571, "grad_norm": 0.10457427054643631, "learning_rate": 3.6209990436258377e-05, "loss": 0.4681, "num_input_tokens_seen": 45405104, "step": 37425 }, { "epoch": 4.168615658759327, "grad_norm": 0.08762006461620331, "learning_rate": 3.620564666391365e-05, "loss": 0.4561, "num_input_tokens_seen": 45411216, "step": 37430 }, { "epoch": 4.169172513642945, "grad_norm": 0.09719561040401459, "learning_rate": 3.6201302468183686e-05, "loss": 0.4593, "num_input_tokens_seen": 45416752, "step": 37435 }, { "epoch": 4.169729368526562, "grad_norm": 0.09482225775718689, "learning_rate": 3.6196957849232605e-05, "loss": 0.4699, "num_input_tokens_seen": 45422992, "step": 37440 }, { "epoch": 4.170286223410179, "grad_norm": 0.1086619570851326, "learning_rate": 3.619261280722458e-05, "loss": 0.4591, "num_input_tokens_seen": 45429072, "step": 37445 }, { "epoch": 4.170843078293797, "grad_norm": 0.10511650145053864, "learning_rate": 3.618826734232378e-05, "loss": 0.4703, "num_input_tokens_seen": 45435024, "step": 37450 }, { "epoch": 4.171399933177414, "grad_norm": 0.11153070628643036, "learning_rate": 3.618392145469438e-05, "loss": 0.471, "num_input_tokens_seen": 45440944, "step": 37455 }, { "epoch": 4.171956788061031, "grad_norm": 0.07341744750738144, "learning_rate": 3.617957514450059e-05, "loss": 0.4583, "num_input_tokens_seen": 45446800, "step": 37460 }, { "epoch": 4.172513642944649, "grad_norm": 0.08863741904497147, "learning_rate": 3.617522841190662e-05, "loss": 0.4656, "num_input_tokens_seen": 45452688, "step": 37465 }, { "epoch": 4.173070497828266, "grad_norm": 0.11035577952861786, "learning_rate": 3.6170881257076706e-05, "loss": 0.4773, "num_input_tokens_seen": 45458384, "step": 37470 }, { "epoch": 4.1736273527118835, "grad_norm": 0.08175648003816605, "learning_rate": 3.616653368017511e-05, "loss": 0.4524, "num_input_tokens_seen": 45464848, "step": 37475 }, { "epoch": 4.1741842075955, "grad_norm": 0.10640702396631241, "learning_rate": 3.616218568136608e-05, "loss": 0.4567, "num_input_tokens_seen": 45470992, "step": 37480 }, { "epoch": 4.174741062479118, "grad_norm": 0.18930889666080475, "learning_rate": 3.6157837260813895e-05, "loss": 0.4758, "num_input_tokens_seen": 45476976, "step": 37485 }, { "epoch": 4.175297917362736, "grad_norm": 0.07644249498844147, "learning_rate": 3.6153488418682876e-05, "loss": 0.463, "num_input_tokens_seen": 45483088, "step": 37490 }, { "epoch": 4.175854772246352, "grad_norm": 0.08932629972696304, "learning_rate": 3.614913915513731e-05, "loss": 0.459, "num_input_tokens_seen": 45488880, "step": 37495 }, { "epoch": 4.17641162712997, "grad_norm": 0.1455269604921341, "learning_rate": 3.614478947034154e-05, "loss": 0.4621, "num_input_tokens_seen": 45495024, "step": 37500 }, { "epoch": 4.176968482013587, "grad_norm": 0.11338307708501816, "learning_rate": 3.614043936445992e-05, "loss": 0.4574, "num_input_tokens_seen": 45501392, "step": 37505 }, { "epoch": 4.1775253368972045, "grad_norm": 0.10872064530849457, "learning_rate": 3.613608883765679e-05, "loss": 0.4631, "num_input_tokens_seen": 45507696, "step": 37510 }, { "epoch": 4.178082191780822, "grad_norm": 0.11408184468746185, "learning_rate": 3.6131737890096536e-05, "loss": 0.4632, "num_input_tokens_seen": 45513872, "step": 37515 }, { "epoch": 4.178639046664439, "grad_norm": 0.13768379390239716, "learning_rate": 3.612738652194355e-05, "loss": 0.4686, "num_input_tokens_seen": 45520144, "step": 37520 }, { "epoch": 4.179195901548057, "grad_norm": 0.14437226951122284, "learning_rate": 3.612303473336225e-05, "loss": 0.4786, "num_input_tokens_seen": 45526384, "step": 37525 }, { "epoch": 4.179752756431674, "grad_norm": 0.09053122997283936, "learning_rate": 3.6118682524517036e-05, "loss": 0.459, "num_input_tokens_seen": 45532336, "step": 37530 }, { "epoch": 4.180309611315291, "grad_norm": 0.11681879311800003, "learning_rate": 3.611432989557238e-05, "loss": 0.4628, "num_input_tokens_seen": 45538640, "step": 37535 }, { "epoch": 4.180866466198909, "grad_norm": 0.17199592292308807, "learning_rate": 3.610997684669272e-05, "loss": 0.4672, "num_input_tokens_seen": 45545008, "step": 37540 }, { "epoch": 4.181423321082526, "grad_norm": 0.11443927139043808, "learning_rate": 3.610562337804253e-05, "loss": 0.4626, "num_input_tokens_seen": 45551312, "step": 37545 }, { "epoch": 4.181980175966143, "grad_norm": 0.08476722240447998, "learning_rate": 3.610126948978629e-05, "loss": 0.4791, "num_input_tokens_seen": 45556880, "step": 37550 }, { "epoch": 4.182537030849761, "grad_norm": 0.1058126762509346, "learning_rate": 3.609691518208853e-05, "loss": 0.4663, "num_input_tokens_seen": 45562832, "step": 37555 }, { "epoch": 4.183093885733378, "grad_norm": 0.10244732350111008, "learning_rate": 3.609256045511374e-05, "loss": 0.4541, "num_input_tokens_seen": 45568816, "step": 37560 }, { "epoch": 4.183650740616995, "grad_norm": 0.13239681720733643, "learning_rate": 3.608820530902647e-05, "loss": 0.468, "num_input_tokens_seen": 45574928, "step": 37565 }, { "epoch": 4.184207595500612, "grad_norm": 0.1342964619398117, "learning_rate": 3.608384974399127e-05, "loss": 0.4506, "num_input_tokens_seen": 45580976, "step": 37570 }, { "epoch": 4.18476445038423, "grad_norm": 0.10197684168815613, "learning_rate": 3.6079493760172714e-05, "loss": 0.4421, "num_input_tokens_seen": 45586928, "step": 37575 }, { "epoch": 4.1853213052678475, "grad_norm": 0.09658129513263702, "learning_rate": 3.6075137357735364e-05, "loss": 0.4642, "num_input_tokens_seen": 45592720, "step": 37580 }, { "epoch": 4.185878160151464, "grad_norm": 0.08853587508201599, "learning_rate": 3.607078053684384e-05, "loss": 0.4564, "num_input_tokens_seen": 45598544, "step": 37585 }, { "epoch": 4.186435015035082, "grad_norm": 0.11425989866256714, "learning_rate": 3.6066423297662753e-05, "loss": 0.4602, "num_input_tokens_seen": 45604624, "step": 37590 }, { "epoch": 4.186991869918699, "grad_norm": 0.13431504368782043, "learning_rate": 3.6062065640356724e-05, "loss": 0.473, "num_input_tokens_seen": 45610256, "step": 37595 }, { "epoch": 4.187548724802316, "grad_norm": 0.10683225840330124, "learning_rate": 3.605770756509041e-05, "loss": 0.4566, "num_input_tokens_seen": 45616272, "step": 37600 }, { "epoch": 4.188105579685934, "grad_norm": 0.10607604682445526, "learning_rate": 3.6053349072028466e-05, "loss": 0.4871, "num_input_tokens_seen": 45622480, "step": 37605 }, { "epoch": 4.188662434569551, "grad_norm": 0.0992753729224205, "learning_rate": 3.604899016133557e-05, "loss": 0.471, "num_input_tokens_seen": 45628080, "step": 37610 }, { "epoch": 4.189219289453169, "grad_norm": 0.11344518512487411, "learning_rate": 3.6044630833176416e-05, "loss": 0.4644, "num_input_tokens_seen": 45634032, "step": 37615 }, { "epoch": 4.189776144336786, "grad_norm": 0.10926920920610428, "learning_rate": 3.604027108771572e-05, "loss": 0.4603, "num_input_tokens_seen": 45640336, "step": 37620 }, { "epoch": 4.190332999220403, "grad_norm": 0.13036710023880005, "learning_rate": 3.6035910925118196e-05, "loss": 0.4641, "num_input_tokens_seen": 45646128, "step": 37625 }, { "epoch": 4.190889854104021, "grad_norm": 0.10978101938962936, "learning_rate": 3.60315503455486e-05, "loss": 0.4596, "num_input_tokens_seen": 45652240, "step": 37630 }, { "epoch": 4.1914467089876375, "grad_norm": 0.0998074859380722, "learning_rate": 3.602718934917167e-05, "loss": 0.4509, "num_input_tokens_seen": 45658096, "step": 37635 }, { "epoch": 4.192003563871255, "grad_norm": 0.1197633147239685, "learning_rate": 3.6022827936152195e-05, "loss": 0.4587, "num_input_tokens_seen": 45663984, "step": 37640 }, { "epoch": 4.192560418754873, "grad_norm": 0.08247727900743484, "learning_rate": 3.6018466106654954e-05, "loss": 0.452, "num_input_tokens_seen": 45670096, "step": 37645 }, { "epoch": 4.19311727363849, "grad_norm": 0.1285611093044281, "learning_rate": 3.601410386084475e-05, "loss": 0.4452, "num_input_tokens_seen": 45676208, "step": 37650 }, { "epoch": 4.193674128522107, "grad_norm": 0.14895083010196686, "learning_rate": 3.600974119888641e-05, "loss": 0.4665, "num_input_tokens_seen": 45682224, "step": 37655 }, { "epoch": 4.194230983405724, "grad_norm": 0.08457057923078537, "learning_rate": 3.6005378120944764e-05, "loss": 0.4657, "num_input_tokens_seen": 45688400, "step": 37660 }, { "epoch": 4.194787838289342, "grad_norm": 0.11369629949331284, "learning_rate": 3.6001014627184666e-05, "loss": 0.4551, "num_input_tokens_seen": 45694672, "step": 37665 }, { "epoch": 4.195344693172959, "grad_norm": 0.1458921730518341, "learning_rate": 3.599665071777097e-05, "loss": 0.4712, "num_input_tokens_seen": 45700592, "step": 37670 }, { "epoch": 4.195901548056576, "grad_norm": 0.10627324134111404, "learning_rate": 3.5992286392868585e-05, "loss": 0.4825, "num_input_tokens_seen": 45706704, "step": 37675 }, { "epoch": 4.196458402940194, "grad_norm": 0.11991475522518158, "learning_rate": 3.598792165264239e-05, "loss": 0.4741, "num_input_tokens_seen": 45712336, "step": 37680 }, { "epoch": 4.197015257823811, "grad_norm": 0.13669902086257935, "learning_rate": 3.598355649725729e-05, "loss": 0.4527, "num_input_tokens_seen": 45718352, "step": 37685 }, { "epoch": 4.197572112707428, "grad_norm": 0.13887721300125122, "learning_rate": 3.597919092687824e-05, "loss": 0.4757, "num_input_tokens_seen": 45724752, "step": 37690 }, { "epoch": 4.198128967591046, "grad_norm": 0.09410006552934647, "learning_rate": 3.597482494167017e-05, "loss": 0.4522, "num_input_tokens_seen": 45730960, "step": 37695 }, { "epoch": 4.198685822474663, "grad_norm": 0.12573343515396118, "learning_rate": 3.5970458541798034e-05, "loss": 0.4752, "num_input_tokens_seen": 45737424, "step": 37700 }, { "epoch": 4.1992426773582805, "grad_norm": 0.09987622499465942, "learning_rate": 3.596609172742683e-05, "loss": 0.4538, "num_input_tokens_seen": 45743792, "step": 37705 }, { "epoch": 4.199799532241898, "grad_norm": 0.1481606662273407, "learning_rate": 3.596172449872153e-05, "loss": 0.4493, "num_input_tokens_seen": 45749648, "step": 37710 }, { "epoch": 4.200356387125515, "grad_norm": 0.1003805473446846, "learning_rate": 3.595735685584716e-05, "loss": 0.4828, "num_input_tokens_seen": 45755792, "step": 37715 }, { "epoch": 4.200913242009133, "grad_norm": 0.07248087227344513, "learning_rate": 3.595298879896872e-05, "loss": 0.4552, "num_input_tokens_seen": 45761872, "step": 37720 }, { "epoch": 4.201470096892749, "grad_norm": 0.1130312904715538, "learning_rate": 3.594862032825127e-05, "loss": 0.4609, "num_input_tokens_seen": 45767888, "step": 37725 }, { "epoch": 4.202026951776367, "grad_norm": 0.09815742075443268, "learning_rate": 3.5944251443859854e-05, "loss": 0.4436, "num_input_tokens_seen": 45773936, "step": 37730 }, { "epoch": 4.202583806659985, "grad_norm": 0.09595231711864471, "learning_rate": 3.593988214595955e-05, "loss": 0.4499, "num_input_tokens_seen": 45779664, "step": 37735 }, { "epoch": 4.2031406615436016, "grad_norm": 0.13070330023765564, "learning_rate": 3.5935512434715436e-05, "loss": 0.465, "num_input_tokens_seen": 45785968, "step": 37740 }, { "epoch": 4.203697516427219, "grad_norm": 0.09838523715734482, "learning_rate": 3.593114231029262e-05, "loss": 0.46, "num_input_tokens_seen": 45791952, "step": 37745 }, { "epoch": 4.204254371310836, "grad_norm": 0.14609037339687347, "learning_rate": 3.5926771772856216e-05, "loss": 0.4754, "num_input_tokens_seen": 45797840, "step": 37750 }, { "epoch": 4.204811226194454, "grad_norm": 0.12457268685102463, "learning_rate": 3.5922400822571354e-05, "loss": 0.4657, "num_input_tokens_seen": 45804016, "step": 37755 }, { "epoch": 4.205368081078071, "grad_norm": 0.0913102999329567, "learning_rate": 3.591802945960319e-05, "loss": 0.4786, "num_input_tokens_seen": 45810192, "step": 37760 }, { "epoch": 4.205924935961688, "grad_norm": 0.11757101118564606, "learning_rate": 3.5913657684116884e-05, "loss": 0.4535, "num_input_tokens_seen": 45815824, "step": 37765 }, { "epoch": 4.206481790845306, "grad_norm": 0.10980555415153503, "learning_rate": 3.5909285496277625e-05, "loss": 0.4687, "num_input_tokens_seen": 45822096, "step": 37770 }, { "epoch": 4.2070386457289235, "grad_norm": 0.1682724952697754, "learning_rate": 3.590491289625059e-05, "loss": 0.4555, "num_input_tokens_seen": 45828208, "step": 37775 }, { "epoch": 4.20759550061254, "grad_norm": 0.10115845501422882, "learning_rate": 3.5900539884200997e-05, "loss": 0.4642, "num_input_tokens_seen": 45833648, "step": 37780 }, { "epoch": 4.208152355496158, "grad_norm": 0.09555213153362274, "learning_rate": 3.589616646029408e-05, "loss": 0.4696, "num_input_tokens_seen": 45839824, "step": 37785 }, { "epoch": 4.208709210379775, "grad_norm": 0.12371177226305008, "learning_rate": 3.589179262469508e-05, "loss": 0.4743, "num_input_tokens_seen": 45846064, "step": 37790 }, { "epoch": 4.209266065263392, "grad_norm": 0.10973488539457321, "learning_rate": 3.588741837756924e-05, "loss": 0.4596, "num_input_tokens_seen": 45852208, "step": 37795 }, { "epoch": 4.20982292014701, "grad_norm": 0.0948595479130745, "learning_rate": 3.588304371908185e-05, "loss": 0.4589, "num_input_tokens_seen": 45858480, "step": 37800 }, { "epoch": 4.210379775030627, "grad_norm": 0.14892075955867767, "learning_rate": 3.587866864939819e-05, "loss": 0.4745, "num_input_tokens_seen": 45864752, "step": 37805 }, { "epoch": 4.2109366299142446, "grad_norm": 0.11169520020484924, "learning_rate": 3.5874293168683566e-05, "loss": 0.4692, "num_input_tokens_seen": 45870864, "step": 37810 }, { "epoch": 4.211493484797861, "grad_norm": 0.12874245643615723, "learning_rate": 3.586991727710329e-05, "loss": 0.4669, "num_input_tokens_seen": 45877104, "step": 37815 }, { "epoch": 4.212050339681479, "grad_norm": 0.18656812608242035, "learning_rate": 3.5865540974822714e-05, "loss": 0.4595, "num_input_tokens_seen": 45883216, "step": 37820 }, { "epoch": 4.212607194565097, "grad_norm": 0.1002669706940651, "learning_rate": 3.586116426200718e-05, "loss": 0.4656, "num_input_tokens_seen": 45889200, "step": 37825 }, { "epoch": 4.2131640494487135, "grad_norm": 0.12732374668121338, "learning_rate": 3.585678713882205e-05, "loss": 0.4605, "num_input_tokens_seen": 45895120, "step": 37830 }, { "epoch": 4.213720904332331, "grad_norm": 0.10175410658121109, "learning_rate": 3.585240960543271e-05, "loss": 0.4624, "num_input_tokens_seen": 45901328, "step": 37835 }, { "epoch": 4.214277759215948, "grad_norm": 0.11860841512680054, "learning_rate": 3.584803166200456e-05, "loss": 0.4485, "num_input_tokens_seen": 45907632, "step": 37840 }, { "epoch": 4.214834614099566, "grad_norm": 0.07676824927330017, "learning_rate": 3.584365330870301e-05, "loss": 0.4663, "num_input_tokens_seen": 45913584, "step": 37845 }, { "epoch": 4.215391468983183, "grad_norm": 0.12219882011413574, "learning_rate": 3.583927454569348e-05, "loss": 0.4561, "num_input_tokens_seen": 45920080, "step": 37850 }, { "epoch": 4.2159483238668, "grad_norm": 0.09902883321046829, "learning_rate": 3.583489537314143e-05, "loss": 0.4508, "num_input_tokens_seen": 45925904, "step": 37855 }, { "epoch": 4.216505178750418, "grad_norm": 0.11372298747301102, "learning_rate": 3.5830515791212314e-05, "loss": 0.456, "num_input_tokens_seen": 45932112, "step": 37860 }, { "epoch": 4.2170620336340345, "grad_norm": 0.1275227814912796, "learning_rate": 3.58261358000716e-05, "loss": 0.4621, "num_input_tokens_seen": 45937520, "step": 37865 }, { "epoch": 4.217618888517652, "grad_norm": 0.11795427650213242, "learning_rate": 3.582175539988477e-05, "loss": 0.4583, "num_input_tokens_seen": 45942992, "step": 37870 }, { "epoch": 4.21817574340127, "grad_norm": 0.12180261313915253, "learning_rate": 3.5817374590817355e-05, "loss": 0.4564, "num_input_tokens_seen": 45949040, "step": 37875 }, { "epoch": 4.218732598284887, "grad_norm": 0.1250259280204773, "learning_rate": 3.581299337303485e-05, "loss": 0.4596, "num_input_tokens_seen": 45954544, "step": 37880 }, { "epoch": 4.219289453168504, "grad_norm": 0.09478580206632614, "learning_rate": 3.5808611746702814e-05, "loss": 0.4644, "num_input_tokens_seen": 45960080, "step": 37885 }, { "epoch": 4.219846308052122, "grad_norm": 0.08510558307170868, "learning_rate": 3.580422971198679e-05, "loss": 0.4569, "num_input_tokens_seen": 45966128, "step": 37890 }, { "epoch": 4.220403162935739, "grad_norm": 0.1235741600394249, "learning_rate": 3.5799847269052336e-05, "loss": 0.4725, "num_input_tokens_seen": 45972560, "step": 37895 }, { "epoch": 4.2209600178193565, "grad_norm": 0.10305266082286835, "learning_rate": 3.5795464418065045e-05, "loss": 0.46, "num_input_tokens_seen": 45978704, "step": 37900 }, { "epoch": 4.221516872702973, "grad_norm": 0.11251770704984665, "learning_rate": 3.57910811591905e-05, "loss": 0.4772, "num_input_tokens_seen": 45984848, "step": 37905 }, { "epoch": 4.222073727586591, "grad_norm": 0.11594510078430176, "learning_rate": 3.578669749259435e-05, "loss": 0.4638, "num_input_tokens_seen": 45991024, "step": 37910 }, { "epoch": 4.222630582470209, "grad_norm": 0.0971764475107193, "learning_rate": 3.5782313418442184e-05, "loss": 0.4638, "num_input_tokens_seen": 45997136, "step": 37915 }, { "epoch": 4.223187437353825, "grad_norm": 0.11676015704870224, "learning_rate": 3.577792893689967e-05, "loss": 0.4505, "num_input_tokens_seen": 46003184, "step": 37920 }, { "epoch": 4.223744292237443, "grad_norm": 0.12395788729190826, "learning_rate": 3.5773544048132464e-05, "loss": 0.4564, "num_input_tokens_seen": 46009360, "step": 37925 }, { "epoch": 4.22430114712106, "grad_norm": 0.12864576280117035, "learning_rate": 3.576915875230623e-05, "loss": 0.4395, "num_input_tokens_seen": 46015216, "step": 37930 }, { "epoch": 4.2248580020046775, "grad_norm": 0.11548691987991333, "learning_rate": 3.576477304958667e-05, "loss": 0.4573, "num_input_tokens_seen": 46021328, "step": 37935 }, { "epoch": 4.225414856888295, "grad_norm": 0.10246815532445908, "learning_rate": 3.576038694013949e-05, "loss": 0.458, "num_input_tokens_seen": 46027856, "step": 37940 }, { "epoch": 4.225971711771912, "grad_norm": 0.11038651317358017, "learning_rate": 3.575600042413041e-05, "loss": 0.4566, "num_input_tokens_seen": 46033936, "step": 37945 }, { "epoch": 4.22652856665553, "grad_norm": 0.10984628647565842, "learning_rate": 3.5751613501725154e-05, "loss": 0.4675, "num_input_tokens_seen": 46040144, "step": 37950 }, { "epoch": 4.227085421539147, "grad_norm": 0.08238454163074493, "learning_rate": 3.5747226173089495e-05, "loss": 0.4643, "num_input_tokens_seen": 46046128, "step": 37955 }, { "epoch": 4.227642276422764, "grad_norm": 0.08949305862188339, "learning_rate": 3.574283843838919e-05, "loss": 0.4434, "num_input_tokens_seen": 46052144, "step": 37960 }, { "epoch": 4.228199131306382, "grad_norm": 0.11073743551969528, "learning_rate": 3.5738450297790025e-05, "loss": 0.4641, "num_input_tokens_seen": 46058448, "step": 37965 }, { "epoch": 4.228755986189999, "grad_norm": 0.10227883607149124, "learning_rate": 3.573406175145778e-05, "loss": 0.458, "num_input_tokens_seen": 46064400, "step": 37970 }, { "epoch": 4.229312841073616, "grad_norm": 0.1206527128815651, "learning_rate": 3.572967279955829e-05, "loss": 0.4764, "num_input_tokens_seen": 46070640, "step": 37975 }, { "epoch": 4.229869695957234, "grad_norm": 0.0921143889427185, "learning_rate": 3.572528344225738e-05, "loss": 0.4613, "num_input_tokens_seen": 46076784, "step": 37980 }, { "epoch": 4.230426550840851, "grad_norm": 0.10374253243207932, "learning_rate": 3.5720893679720894e-05, "loss": 0.4484, "num_input_tokens_seen": 46082960, "step": 37985 }, { "epoch": 4.230983405724468, "grad_norm": 0.09469763934612274, "learning_rate": 3.571650351211468e-05, "loss": 0.4739, "num_input_tokens_seen": 46088624, "step": 37990 }, { "epoch": 4.231540260608085, "grad_norm": 0.14091277122497559, "learning_rate": 3.571211293960462e-05, "loss": 0.4635, "num_input_tokens_seen": 46094640, "step": 37995 }, { "epoch": 4.232097115491703, "grad_norm": 0.099215067923069, "learning_rate": 3.5707721962356606e-05, "loss": 0.4623, "num_input_tokens_seen": 46100848, "step": 38000 }, { "epoch": 4.2326539703753205, "grad_norm": 0.11903107911348343, "learning_rate": 3.5703330580536544e-05, "loss": 0.4637, "num_input_tokens_seen": 46107056, "step": 38005 }, { "epoch": 4.233210825258937, "grad_norm": 0.0856594666838646, "learning_rate": 3.569893879431035e-05, "loss": 0.4792, "num_input_tokens_seen": 46113200, "step": 38010 }, { "epoch": 4.233767680142555, "grad_norm": 0.11490737646818161, "learning_rate": 3.569454660384396e-05, "loss": 0.467, "num_input_tokens_seen": 46119152, "step": 38015 }, { "epoch": 4.234324535026172, "grad_norm": 0.09188567847013474, "learning_rate": 3.569015400930334e-05, "loss": 0.4615, "num_input_tokens_seen": 46125520, "step": 38020 }, { "epoch": 4.2348813899097895, "grad_norm": 0.17035625874996185, "learning_rate": 3.568576101085443e-05, "loss": 0.4571, "num_input_tokens_seen": 46131408, "step": 38025 }, { "epoch": 4.235438244793407, "grad_norm": 0.09338309615850449, "learning_rate": 3.568136760866322e-05, "loss": 0.4507, "num_input_tokens_seen": 46137456, "step": 38030 }, { "epoch": 4.235995099677024, "grad_norm": 0.08249691873788834, "learning_rate": 3.567697380289573e-05, "loss": 0.4572, "num_input_tokens_seen": 46143504, "step": 38035 }, { "epoch": 4.236551954560642, "grad_norm": 0.09718282520771027, "learning_rate": 3.567257959371794e-05, "loss": 0.4728, "num_input_tokens_seen": 46149616, "step": 38040 }, { "epoch": 4.237108809444258, "grad_norm": 0.10054793208837509, "learning_rate": 3.56681849812959e-05, "loss": 0.4579, "num_input_tokens_seen": 46155280, "step": 38045 }, { "epoch": 4.237665664327876, "grad_norm": 0.0990545004606247, "learning_rate": 3.566378996579563e-05, "loss": 0.4629, "num_input_tokens_seen": 46161584, "step": 38050 }, { "epoch": 4.238222519211494, "grad_norm": 0.06889218091964722, "learning_rate": 3.565939454738322e-05, "loss": 0.4595, "num_input_tokens_seen": 46167376, "step": 38055 }, { "epoch": 4.2387793740951105, "grad_norm": 0.10888157039880753, "learning_rate": 3.565499872622471e-05, "loss": 0.4625, "num_input_tokens_seen": 46173232, "step": 38060 }, { "epoch": 4.239336228978728, "grad_norm": 0.13543444871902466, "learning_rate": 3.565060250248621e-05, "loss": 0.4641, "num_input_tokens_seen": 46178928, "step": 38065 }, { "epoch": 4.239893083862346, "grad_norm": 0.11171627789735794, "learning_rate": 3.564620587633381e-05, "loss": 0.4643, "num_input_tokens_seen": 46185232, "step": 38070 }, { "epoch": 4.240449938745963, "grad_norm": 0.10980983823537827, "learning_rate": 3.564180884793364e-05, "loss": 0.4651, "num_input_tokens_seen": 46191120, "step": 38075 }, { "epoch": 4.24100679362958, "grad_norm": 0.1487557590007782, "learning_rate": 3.563741141745183e-05, "loss": 0.4692, "num_input_tokens_seen": 46196816, "step": 38080 }, { "epoch": 4.241563648513197, "grad_norm": 0.10305500775575638, "learning_rate": 3.563301358505452e-05, "loss": 0.4459, "num_input_tokens_seen": 46202704, "step": 38085 }, { "epoch": 4.242120503396815, "grad_norm": 0.10364853590726852, "learning_rate": 3.562861535090788e-05, "loss": 0.4543, "num_input_tokens_seen": 46208880, "step": 38090 }, { "epoch": 4.2426773582804325, "grad_norm": 0.1234741285443306, "learning_rate": 3.5624216715178094e-05, "loss": 0.4679, "num_input_tokens_seen": 46214992, "step": 38095 }, { "epoch": 4.243234213164049, "grad_norm": 0.11879575252532959, "learning_rate": 3.5619817678031355e-05, "loss": 0.4693, "num_input_tokens_seen": 46221168, "step": 38100 }, { "epoch": 4.243791068047667, "grad_norm": 0.1434653103351593, "learning_rate": 3.5615418239633875e-05, "loss": 0.4532, "num_input_tokens_seen": 46227664, "step": 38105 }, { "epoch": 4.244347922931284, "grad_norm": 0.10486920922994614, "learning_rate": 3.561101840015187e-05, "loss": 0.4597, "num_input_tokens_seen": 46233680, "step": 38110 }, { "epoch": 4.244904777814901, "grad_norm": 0.10782996565103531, "learning_rate": 3.560661815975159e-05, "loss": 0.4653, "num_input_tokens_seen": 46240080, "step": 38115 }, { "epoch": 4.245461632698519, "grad_norm": 0.1306895762681961, "learning_rate": 3.560221751859927e-05, "loss": 0.4679, "num_input_tokens_seen": 46245520, "step": 38120 }, { "epoch": 4.246018487582136, "grad_norm": 0.10471881181001663, "learning_rate": 3.559781647686121e-05, "loss": 0.4658, "num_input_tokens_seen": 46251920, "step": 38125 }, { "epoch": 4.2465753424657535, "grad_norm": 0.09991458058357239, "learning_rate": 3.559341503470368e-05, "loss": 0.4515, "num_input_tokens_seen": 46257936, "step": 38130 }, { "epoch": 4.247132197349371, "grad_norm": 0.1281599998474121, "learning_rate": 3.558901319229298e-05, "loss": 0.4449, "num_input_tokens_seen": 46264112, "step": 38135 }, { "epoch": 4.247689052232988, "grad_norm": 0.20370018482208252, "learning_rate": 3.558461094979543e-05, "loss": 0.4769, "num_input_tokens_seen": 46270416, "step": 38140 }, { "epoch": 4.248245907116606, "grad_norm": 0.10233037173748016, "learning_rate": 3.558020830737736e-05, "loss": 0.4526, "num_input_tokens_seen": 46276592, "step": 38145 }, { "epoch": 4.248802762000222, "grad_norm": 0.11831780523061752, "learning_rate": 3.557580526520511e-05, "loss": 0.4639, "num_input_tokens_seen": 46282928, "step": 38150 }, { "epoch": 4.24935961688384, "grad_norm": 0.09688594192266464, "learning_rate": 3.557140182344504e-05, "loss": 0.4796, "num_input_tokens_seen": 46288944, "step": 38155 }, { "epoch": 4.249916471767458, "grad_norm": 0.1016717329621315, "learning_rate": 3.556699798226354e-05, "loss": 0.4657, "num_input_tokens_seen": 46295024, "step": 38160 }, { "epoch": 4.250473326651075, "grad_norm": 0.1407032310962677, "learning_rate": 3.556259374182699e-05, "loss": 0.4511, "num_input_tokens_seen": 46300880, "step": 38165 }, { "epoch": 4.251030181534692, "grad_norm": 0.11655653268098831, "learning_rate": 3.55581891023018e-05, "loss": 0.4451, "num_input_tokens_seen": 46306800, "step": 38170 }, { "epoch": 4.251587036418309, "grad_norm": 0.1372692584991455, "learning_rate": 3.5553784063854386e-05, "loss": 0.4756, "num_input_tokens_seen": 46312784, "step": 38175 }, { "epoch": 4.252143891301927, "grad_norm": 0.19124218821525574, "learning_rate": 3.554937862665119e-05, "loss": 0.4717, "num_input_tokens_seen": 46317904, "step": 38180 }, { "epoch": 4.252700746185544, "grad_norm": 0.09352059662342072, "learning_rate": 3.554497279085866e-05, "loss": 0.4668, "num_input_tokens_seen": 46323888, "step": 38185 }, { "epoch": 4.253257601069161, "grad_norm": 0.1079772412776947, "learning_rate": 3.554056655664327e-05, "loss": 0.4665, "num_input_tokens_seen": 46330192, "step": 38190 }, { "epoch": 4.253814455952779, "grad_norm": 0.16727229952812195, "learning_rate": 3.5536159924171494e-05, "loss": 0.4753, "num_input_tokens_seen": 46336368, "step": 38195 }, { "epoch": 4.254371310836396, "grad_norm": 0.14839549362659454, "learning_rate": 3.553175289360984e-05, "loss": 0.4663, "num_input_tokens_seen": 46342544, "step": 38200 }, { "epoch": 4.254928165720013, "grad_norm": 0.15738670527935028, "learning_rate": 3.55273454651248e-05, "loss": 0.4672, "num_input_tokens_seen": 46348752, "step": 38205 }, { "epoch": 4.255485020603631, "grad_norm": 0.12134949117898941, "learning_rate": 3.5522937638882914e-05, "loss": 0.4627, "num_input_tokens_seen": 46355120, "step": 38210 }, { "epoch": 4.256041875487248, "grad_norm": 0.15758869051933289, "learning_rate": 3.5518529415050726e-05, "loss": 0.4694, "num_input_tokens_seen": 46360816, "step": 38215 }, { "epoch": 4.2565987303708654, "grad_norm": 0.17558827996253967, "learning_rate": 3.551412079379478e-05, "loss": 0.4589, "num_input_tokens_seen": 46367088, "step": 38220 }, { "epoch": 4.257155585254482, "grad_norm": 0.14378593862056732, "learning_rate": 3.550971177528166e-05, "loss": 0.4601, "num_input_tokens_seen": 46373200, "step": 38225 }, { "epoch": 4.2577124401381, "grad_norm": 0.08778053522109985, "learning_rate": 3.550530235967796e-05, "loss": 0.464, "num_input_tokens_seen": 46379536, "step": 38230 }, { "epoch": 4.258269295021718, "grad_norm": 0.07948140799999237, "learning_rate": 3.550089254715025e-05, "loss": 0.4665, "num_input_tokens_seen": 46385424, "step": 38235 }, { "epoch": 4.258826149905334, "grad_norm": 0.11838868260383606, "learning_rate": 3.549648233786519e-05, "loss": 0.4689, "num_input_tokens_seen": 46391184, "step": 38240 }, { "epoch": 4.259383004788952, "grad_norm": 0.1164906695485115, "learning_rate": 3.549207173198937e-05, "loss": 0.4547, "num_input_tokens_seen": 46397456, "step": 38245 }, { "epoch": 4.25993985967257, "grad_norm": 0.11718811839818954, "learning_rate": 3.548766072968947e-05, "loss": 0.4666, "num_input_tokens_seen": 46403824, "step": 38250 }, { "epoch": 4.2604967145561865, "grad_norm": 0.10849753767251968, "learning_rate": 3.5483249331132135e-05, "loss": 0.4754, "num_input_tokens_seen": 46409840, "step": 38255 }, { "epoch": 4.261053569439804, "grad_norm": 0.13449423015117645, "learning_rate": 3.547883753648404e-05, "loss": 0.4575, "num_input_tokens_seen": 46415984, "step": 38260 }, { "epoch": 4.261610424323421, "grad_norm": 0.13107728958129883, "learning_rate": 3.5474425345911886e-05, "loss": 0.4724, "num_input_tokens_seen": 46422288, "step": 38265 }, { "epoch": 4.262167279207039, "grad_norm": 0.10797552019357681, "learning_rate": 3.547001275958238e-05, "loss": 0.4586, "num_input_tokens_seen": 46428688, "step": 38270 }, { "epoch": 4.262724134090656, "grad_norm": 0.09470802545547485, "learning_rate": 3.5465599777662235e-05, "loss": 0.4652, "num_input_tokens_seen": 46434960, "step": 38275 }, { "epoch": 4.263280988974273, "grad_norm": 0.10654830187559128, "learning_rate": 3.546118640031819e-05, "loss": 0.4696, "num_input_tokens_seen": 46440944, "step": 38280 }, { "epoch": 4.263837843857891, "grad_norm": 0.1068410724401474, "learning_rate": 3.5456772627717e-05, "loss": 0.4507, "num_input_tokens_seen": 46446832, "step": 38285 }, { "epoch": 4.264394698741508, "grad_norm": 0.0977698564529419, "learning_rate": 3.545235846002543e-05, "loss": 0.4666, "num_input_tokens_seen": 46452816, "step": 38290 }, { "epoch": 4.264951553625125, "grad_norm": 0.0866597592830658, "learning_rate": 3.544794389741026e-05, "loss": 0.4612, "num_input_tokens_seen": 46458608, "step": 38295 }, { "epoch": 4.265508408508743, "grad_norm": 0.09944159537553787, "learning_rate": 3.544352894003829e-05, "loss": 0.4666, "num_input_tokens_seen": 46464528, "step": 38300 }, { "epoch": 4.26606526339236, "grad_norm": 0.10517056286334991, "learning_rate": 3.543911358807633e-05, "loss": 0.4427, "num_input_tokens_seen": 46470640, "step": 38305 }, { "epoch": 4.266622118275977, "grad_norm": 0.09013120830059052, "learning_rate": 3.543469784169119e-05, "loss": 0.4548, "num_input_tokens_seen": 46476592, "step": 38310 }, { "epoch": 4.267178973159595, "grad_norm": 0.08575639873743057, "learning_rate": 3.5430281701049746e-05, "loss": 0.4454, "num_input_tokens_seen": 46482928, "step": 38315 }, { "epoch": 4.267735828043212, "grad_norm": 0.12464683502912521, "learning_rate": 3.542586516631882e-05, "loss": 0.4709, "num_input_tokens_seen": 46488848, "step": 38320 }, { "epoch": 4.2682926829268295, "grad_norm": 0.154830202460289, "learning_rate": 3.5421448237665306e-05, "loss": 0.4687, "num_input_tokens_seen": 46494800, "step": 38325 }, { "epoch": 4.268849537810446, "grad_norm": 0.12061244994401932, "learning_rate": 3.5417030915256076e-05, "loss": 0.468, "num_input_tokens_seen": 46500912, "step": 38330 }, { "epoch": 4.269406392694064, "grad_norm": 0.20034706592559814, "learning_rate": 3.541261319925804e-05, "loss": 0.4771, "num_input_tokens_seen": 46507216, "step": 38335 }, { "epoch": 4.269963247577682, "grad_norm": 0.12592355906963348, "learning_rate": 3.5408195089838094e-05, "loss": 0.4564, "num_input_tokens_seen": 46513552, "step": 38340 }, { "epoch": 4.270520102461298, "grad_norm": 0.12303503602743149, "learning_rate": 3.5403776587163194e-05, "loss": 0.4562, "num_input_tokens_seen": 46519248, "step": 38345 }, { "epoch": 4.271076957344916, "grad_norm": 0.10255619138479233, "learning_rate": 3.539935769140027e-05, "loss": 0.4645, "num_input_tokens_seen": 46525744, "step": 38350 }, { "epoch": 4.271633812228533, "grad_norm": 0.09953439235687256, "learning_rate": 3.539493840271629e-05, "loss": 0.4706, "num_input_tokens_seen": 46531696, "step": 38355 }, { "epoch": 4.272190667112151, "grad_norm": 0.12332089990377426, "learning_rate": 3.539051872127822e-05, "loss": 0.4686, "num_input_tokens_seen": 46537840, "step": 38360 }, { "epoch": 4.272747521995768, "grad_norm": 0.13055579364299774, "learning_rate": 3.538609864725306e-05, "loss": 0.4443, "num_input_tokens_seen": 46543856, "step": 38365 }, { "epoch": 4.273304376879385, "grad_norm": 0.14183171093463898, "learning_rate": 3.538167818080781e-05, "loss": 0.468, "num_input_tokens_seen": 46549648, "step": 38370 }, { "epoch": 4.273861231763003, "grad_norm": 0.08793112635612488, "learning_rate": 3.5377257322109486e-05, "loss": 0.4675, "num_input_tokens_seen": 46555632, "step": 38375 }, { "epoch": 4.2744180866466195, "grad_norm": 0.11254405975341797, "learning_rate": 3.537283607132513e-05, "loss": 0.4724, "num_input_tokens_seen": 46561648, "step": 38380 }, { "epoch": 4.274974941530237, "grad_norm": 0.06768883764743805, "learning_rate": 3.5368414428621775e-05, "loss": 0.4648, "num_input_tokens_seen": 46567888, "step": 38385 }, { "epoch": 4.275531796413855, "grad_norm": 0.11809058487415314, "learning_rate": 3.53639923941665e-05, "loss": 0.4639, "num_input_tokens_seen": 46574192, "step": 38390 }, { "epoch": 4.276088651297472, "grad_norm": 0.09824587404727936, "learning_rate": 3.5359569968126385e-05, "loss": 0.4602, "num_input_tokens_seen": 46580144, "step": 38395 }, { "epoch": 4.276645506181089, "grad_norm": 0.12812340259552002, "learning_rate": 3.535514715066852e-05, "loss": 0.4687, "num_input_tokens_seen": 46586320, "step": 38400 }, { "epoch": 4.277202361064706, "grad_norm": 0.10876025259494781, "learning_rate": 3.5350723941959994e-05, "loss": 0.4691, "num_input_tokens_seen": 46592336, "step": 38405 }, { "epoch": 4.277759215948324, "grad_norm": 0.07482418417930603, "learning_rate": 3.534630034216796e-05, "loss": 0.4551, "num_input_tokens_seen": 46598480, "step": 38410 }, { "epoch": 4.278316070831941, "grad_norm": 0.09993541985750198, "learning_rate": 3.534187635145954e-05, "loss": 0.4671, "num_input_tokens_seen": 46604656, "step": 38415 }, { "epoch": 4.278872925715558, "grad_norm": 0.09501446783542633, "learning_rate": 3.533745197000189e-05, "loss": 0.4608, "num_input_tokens_seen": 46610256, "step": 38420 }, { "epoch": 4.279429780599176, "grad_norm": 0.13400930166244507, "learning_rate": 3.533302719796217e-05, "loss": 0.4608, "num_input_tokens_seen": 46616336, "step": 38425 }, { "epoch": 4.279986635482794, "grad_norm": 0.18874379992485046, "learning_rate": 3.532860203550758e-05, "loss": 0.4611, "num_input_tokens_seen": 46622544, "step": 38430 }, { "epoch": 4.28054349036641, "grad_norm": 0.10387056320905685, "learning_rate": 3.5324176482805294e-05, "loss": 0.4514, "num_input_tokens_seen": 46628912, "step": 38435 }, { "epoch": 4.281100345250028, "grad_norm": 0.10469680279493332, "learning_rate": 3.5319750540022545e-05, "loss": 0.4508, "num_input_tokens_seen": 46635248, "step": 38440 }, { "epoch": 4.281657200133645, "grad_norm": 0.09623375535011292, "learning_rate": 3.531532420732654e-05, "loss": 0.4623, "num_input_tokens_seen": 46641552, "step": 38445 }, { "epoch": 4.2822140550172625, "grad_norm": 0.11861822009086609, "learning_rate": 3.531089748488454e-05, "loss": 0.4657, "num_input_tokens_seen": 46647856, "step": 38450 }, { "epoch": 4.28277090990088, "grad_norm": 0.14126582443714142, "learning_rate": 3.5306470372863784e-05, "loss": 0.4547, "num_input_tokens_seen": 46653648, "step": 38455 }, { "epoch": 4.283327764784497, "grad_norm": 0.1206759363412857, "learning_rate": 3.530204287143155e-05, "loss": 0.4755, "num_input_tokens_seen": 46659760, "step": 38460 }, { "epoch": 4.283884619668115, "grad_norm": 0.09554077684879303, "learning_rate": 3.529761498075512e-05, "loss": 0.4591, "num_input_tokens_seen": 46665712, "step": 38465 }, { "epoch": 4.284441474551731, "grad_norm": 0.10561827570199966, "learning_rate": 3.5293186701001795e-05, "loss": 0.4615, "num_input_tokens_seen": 46671792, "step": 38470 }, { "epoch": 4.284998329435349, "grad_norm": 0.12944713234901428, "learning_rate": 3.5288758032338896e-05, "loss": 0.4717, "num_input_tokens_seen": 46678192, "step": 38475 }, { "epoch": 4.285555184318967, "grad_norm": 0.10964827984571457, "learning_rate": 3.5284328974933743e-05, "loss": 0.4632, "num_input_tokens_seen": 46684112, "step": 38480 }, { "epoch": 4.2861120392025835, "grad_norm": 0.1338893622159958, "learning_rate": 3.527989952895368e-05, "loss": 0.4621, "num_input_tokens_seen": 46690480, "step": 38485 }, { "epoch": 4.286668894086201, "grad_norm": 0.14689214527606964, "learning_rate": 3.527546969456608e-05, "loss": 0.4697, "num_input_tokens_seen": 46696336, "step": 38490 }, { "epoch": 4.287225748969819, "grad_norm": 0.11785458773374557, "learning_rate": 3.527103947193829e-05, "loss": 0.4653, "num_input_tokens_seen": 46702160, "step": 38495 }, { "epoch": 4.287782603853436, "grad_norm": 0.09895860403776169, "learning_rate": 3.5266608861237724e-05, "loss": 0.4599, "num_input_tokens_seen": 46708208, "step": 38500 }, { "epoch": 4.288339458737053, "grad_norm": 0.09298823773860931, "learning_rate": 3.526217786263177e-05, "loss": 0.4584, "num_input_tokens_seen": 46714256, "step": 38505 }, { "epoch": 4.28889631362067, "grad_norm": 0.0799020379781723, "learning_rate": 3.525774647628786e-05, "loss": 0.4642, "num_input_tokens_seen": 46720176, "step": 38510 }, { "epoch": 4.289453168504288, "grad_norm": 0.10619724541902542, "learning_rate": 3.525331470237341e-05, "loss": 0.4761, "num_input_tokens_seen": 46726288, "step": 38515 }, { "epoch": 4.2900100233879055, "grad_norm": 0.11353829503059387, "learning_rate": 3.5248882541055866e-05, "loss": 0.4623, "num_input_tokens_seen": 46732528, "step": 38520 }, { "epoch": 4.290566878271522, "grad_norm": 0.10003110021352768, "learning_rate": 3.5244449992502696e-05, "loss": 0.4551, "num_input_tokens_seen": 46738544, "step": 38525 }, { "epoch": 4.29112373315514, "grad_norm": 0.09058929979801178, "learning_rate": 3.524001705688138e-05, "loss": 0.4681, "num_input_tokens_seen": 46744784, "step": 38530 }, { "epoch": 4.291680588038757, "grad_norm": 0.09320475161075592, "learning_rate": 3.5235583734359404e-05, "loss": 0.4721, "num_input_tokens_seen": 46750992, "step": 38535 }, { "epoch": 4.292237442922374, "grad_norm": 0.10166355967521667, "learning_rate": 3.523115002510427e-05, "loss": 0.4473, "num_input_tokens_seen": 46757008, "step": 38540 }, { "epoch": 4.292794297805992, "grad_norm": 0.08789964020252228, "learning_rate": 3.5226715929283506e-05, "loss": 0.4653, "num_input_tokens_seen": 46762896, "step": 38545 }, { "epoch": 4.293351152689609, "grad_norm": 0.09694761782884598, "learning_rate": 3.522228144706463e-05, "loss": 0.4589, "num_input_tokens_seen": 46769168, "step": 38550 }, { "epoch": 4.2939080075732265, "grad_norm": 0.12071090936660767, "learning_rate": 3.521784657861521e-05, "loss": 0.4605, "num_input_tokens_seen": 46775120, "step": 38555 }, { "epoch": 4.294464862456843, "grad_norm": 0.10963821411132812, "learning_rate": 3.52134113241028e-05, "loss": 0.4621, "num_input_tokens_seen": 46780784, "step": 38560 }, { "epoch": 4.295021717340461, "grad_norm": 0.10185252130031586, "learning_rate": 3.5208975683694976e-05, "loss": 0.4571, "num_input_tokens_seen": 46786768, "step": 38565 }, { "epoch": 4.295578572224079, "grad_norm": 0.10726860910654068, "learning_rate": 3.5204539657559346e-05, "loss": 0.4651, "num_input_tokens_seen": 46792944, "step": 38570 }, { "epoch": 4.2961354271076955, "grad_norm": 0.10278370976448059, "learning_rate": 3.52001032458635e-05, "loss": 0.4607, "num_input_tokens_seen": 46799472, "step": 38575 }, { "epoch": 4.296692281991313, "grad_norm": 0.0818522572517395, "learning_rate": 3.519566644877506e-05, "loss": 0.4646, "num_input_tokens_seen": 46805712, "step": 38580 }, { "epoch": 4.29724913687493, "grad_norm": 0.11840221285820007, "learning_rate": 3.519122926646167e-05, "loss": 0.4545, "num_input_tokens_seen": 46811056, "step": 38585 }, { "epoch": 4.297805991758548, "grad_norm": 0.11762148141860962, "learning_rate": 3.5186791699090974e-05, "loss": 0.4492, "num_input_tokens_seen": 46817008, "step": 38590 }, { "epoch": 4.298362846642165, "grad_norm": 0.1264435201883316, "learning_rate": 3.518235374683065e-05, "loss": 0.463, "num_input_tokens_seen": 46823024, "step": 38595 }, { "epoch": 4.298919701525782, "grad_norm": 0.08458880335092545, "learning_rate": 3.517791540984837e-05, "loss": 0.467, "num_input_tokens_seen": 46829168, "step": 38600 }, { "epoch": 4.2994765564094, "grad_norm": 0.1149739921092987, "learning_rate": 3.517347668831183e-05, "loss": 0.4578, "num_input_tokens_seen": 46835312, "step": 38605 }, { "epoch": 4.300033411293017, "grad_norm": 0.11399366706609726, "learning_rate": 3.516903758238874e-05, "loss": 0.4601, "num_input_tokens_seen": 46841520, "step": 38610 }, { "epoch": 4.300590266176634, "grad_norm": 0.1175106018781662, "learning_rate": 3.516459809224682e-05, "loss": 0.4599, "num_input_tokens_seen": 46847728, "step": 38615 }, { "epoch": 4.301147121060252, "grad_norm": 0.07960881292819977, "learning_rate": 3.516015821805381e-05, "loss": 0.4659, "num_input_tokens_seen": 46853488, "step": 38620 }, { "epoch": 4.301703975943869, "grad_norm": 0.07764527946710587, "learning_rate": 3.515571795997746e-05, "loss": 0.4564, "num_input_tokens_seen": 46859632, "step": 38625 }, { "epoch": 4.302260830827486, "grad_norm": 0.12234175950288773, "learning_rate": 3.5151277318185535e-05, "loss": 0.4587, "num_input_tokens_seen": 46865648, "step": 38630 }, { "epoch": 4.302817685711104, "grad_norm": 0.1207510456442833, "learning_rate": 3.514683629284583e-05, "loss": 0.4635, "num_input_tokens_seen": 46871536, "step": 38635 }, { "epoch": 4.303374540594721, "grad_norm": 0.1062634065747261, "learning_rate": 3.5142394884126134e-05, "loss": 0.462, "num_input_tokens_seen": 46877648, "step": 38640 }, { "epoch": 4.3039313954783385, "grad_norm": 0.08792579919099808, "learning_rate": 3.513795309219425e-05, "loss": 0.4677, "num_input_tokens_seen": 46883504, "step": 38645 }, { "epoch": 4.304488250361955, "grad_norm": 0.11010702699422836, "learning_rate": 3.5133510917218015e-05, "loss": 0.4747, "num_input_tokens_seen": 46889520, "step": 38650 }, { "epoch": 4.305045105245573, "grad_norm": 0.12217392027378082, "learning_rate": 3.5129068359365256e-05, "loss": 0.4656, "num_input_tokens_seen": 46895472, "step": 38655 }, { "epoch": 4.305601960129191, "grad_norm": 0.11379895359277725, "learning_rate": 3.512462541880384e-05, "loss": 0.4599, "num_input_tokens_seen": 46901776, "step": 38660 }, { "epoch": 4.306158815012807, "grad_norm": 0.13085317611694336, "learning_rate": 3.512018209570163e-05, "loss": 0.4501, "num_input_tokens_seen": 46907984, "step": 38665 }, { "epoch": 4.306715669896425, "grad_norm": 0.10378565639257431, "learning_rate": 3.5115738390226504e-05, "loss": 0.466, "num_input_tokens_seen": 46914000, "step": 38670 }, { "epoch": 4.307272524780043, "grad_norm": 0.1114632785320282, "learning_rate": 3.511129430254637e-05, "loss": 0.4585, "num_input_tokens_seen": 46920080, "step": 38675 }, { "epoch": 4.3078293796636595, "grad_norm": 0.0788814052939415, "learning_rate": 3.5106849832829135e-05, "loss": 0.4663, "num_input_tokens_seen": 46925840, "step": 38680 }, { "epoch": 4.308386234547277, "grad_norm": 0.07542141526937485, "learning_rate": 3.510240498124272e-05, "loss": 0.4768, "num_input_tokens_seen": 46932048, "step": 38685 }, { "epoch": 4.308943089430894, "grad_norm": 0.1113106831908226, "learning_rate": 3.509795974795507e-05, "loss": 0.4574, "num_input_tokens_seen": 46938224, "step": 38690 }, { "epoch": 4.309499944314512, "grad_norm": 0.1219758540391922, "learning_rate": 3.509351413313414e-05, "loss": 0.4526, "num_input_tokens_seen": 46944560, "step": 38695 }, { "epoch": 4.310056799198129, "grad_norm": 0.1331542581319809, "learning_rate": 3.50890681369479e-05, "loss": 0.4713, "num_input_tokens_seen": 46950416, "step": 38700 }, { "epoch": 4.310613654081746, "grad_norm": 0.1207195520401001, "learning_rate": 3.508462175956434e-05, "loss": 0.4615, "num_input_tokens_seen": 46956656, "step": 38705 }, { "epoch": 4.311170508965364, "grad_norm": 0.1093258410692215, "learning_rate": 3.508017500115144e-05, "loss": 0.4636, "num_input_tokens_seen": 46962672, "step": 38710 }, { "epoch": 4.311727363848981, "grad_norm": 0.10081600397825241, "learning_rate": 3.5075727861877234e-05, "loss": 0.4697, "num_input_tokens_seen": 46968208, "step": 38715 }, { "epoch": 4.312284218732598, "grad_norm": 0.09420966356992722, "learning_rate": 3.507128034190974e-05, "loss": 0.4542, "num_input_tokens_seen": 46973808, "step": 38720 }, { "epoch": 4.312841073616216, "grad_norm": 0.13534881174564362, "learning_rate": 3.5066832441417e-05, "loss": 0.4463, "num_input_tokens_seen": 46979984, "step": 38725 }, { "epoch": 4.313397928499833, "grad_norm": 0.0866909921169281, "learning_rate": 3.506238416056706e-05, "loss": 0.4502, "num_input_tokens_seen": 46986160, "step": 38730 }, { "epoch": 4.31395478338345, "grad_norm": 0.09119129180908203, "learning_rate": 3.505793549952801e-05, "loss": 0.4645, "num_input_tokens_seen": 46992176, "step": 38735 }, { "epoch": 4.314511638267067, "grad_norm": 0.13769568502902985, "learning_rate": 3.505348645846792e-05, "loss": 0.4513, "num_input_tokens_seen": 46998064, "step": 38740 }, { "epoch": 4.315068493150685, "grad_norm": 0.1391444355249405, "learning_rate": 3.50490370375549e-05, "loss": 0.4645, "num_input_tokens_seen": 47003984, "step": 38745 }, { "epoch": 4.3156253480343025, "grad_norm": 0.10746271163225174, "learning_rate": 3.504458723695705e-05, "loss": 0.455, "num_input_tokens_seen": 47010256, "step": 38750 }, { "epoch": 4.316182202917919, "grad_norm": 0.10776623338460922, "learning_rate": 3.50401370568425e-05, "loss": 0.4573, "num_input_tokens_seen": 47016560, "step": 38755 }, { "epoch": 4.316739057801537, "grad_norm": 0.08712595701217651, "learning_rate": 3.503568649737941e-05, "loss": 0.467, "num_input_tokens_seen": 47022064, "step": 38760 }, { "epoch": 4.317295912685154, "grad_norm": 0.10802395641803741, "learning_rate": 3.503123555873592e-05, "loss": 0.4594, "num_input_tokens_seen": 47028176, "step": 38765 }, { "epoch": 4.3178527675687715, "grad_norm": 0.09793244302272797, "learning_rate": 3.5026784241080204e-05, "loss": 0.464, "num_input_tokens_seen": 47034256, "step": 38770 }, { "epoch": 4.318409622452389, "grad_norm": 0.12826599180698395, "learning_rate": 3.502233254458043e-05, "loss": 0.472, "num_input_tokens_seen": 47040304, "step": 38775 }, { "epoch": 4.318966477336006, "grad_norm": 0.07375402003526688, "learning_rate": 3.501788046940483e-05, "loss": 0.4573, "num_input_tokens_seen": 47046256, "step": 38780 }, { "epoch": 4.319523332219624, "grad_norm": 0.08131390064954758, "learning_rate": 3.5013428015721605e-05, "loss": 0.4584, "num_input_tokens_seen": 47052656, "step": 38785 }, { "epoch": 4.320080187103241, "grad_norm": 0.0954313650727272, "learning_rate": 3.500897518369897e-05, "loss": 0.4721, "num_input_tokens_seen": 47058608, "step": 38790 }, { "epoch": 4.320637041986858, "grad_norm": 0.10527247190475464, "learning_rate": 3.500452197350519e-05, "loss": 0.4631, "num_input_tokens_seen": 47064784, "step": 38795 }, { "epoch": 4.321193896870476, "grad_norm": 0.13077668845653534, "learning_rate": 3.50000683853085e-05, "loss": 0.467, "num_input_tokens_seen": 47070992, "step": 38800 }, { "epoch": 4.3217507517540925, "grad_norm": 0.09484066069126129, "learning_rate": 3.499561441927718e-05, "loss": 0.4718, "num_input_tokens_seen": 47077104, "step": 38805 }, { "epoch": 4.32230760663771, "grad_norm": 0.15824657678604126, "learning_rate": 3.499116007557951e-05, "loss": 0.4608, "num_input_tokens_seen": 47083088, "step": 38810 }, { "epoch": 4.322864461521328, "grad_norm": 0.08436904102563858, "learning_rate": 3.49867053543838e-05, "loss": 0.4527, "num_input_tokens_seen": 47089232, "step": 38815 }, { "epoch": 4.323421316404945, "grad_norm": 0.10549990087747574, "learning_rate": 3.498225025585836e-05, "loss": 0.4774, "num_input_tokens_seen": 47095344, "step": 38820 }, { "epoch": 4.323978171288562, "grad_norm": 0.12255359441041946, "learning_rate": 3.497779478017151e-05, "loss": 0.4695, "num_input_tokens_seen": 47101168, "step": 38825 }, { "epoch": 4.32453502617218, "grad_norm": 0.09420745074748993, "learning_rate": 3.497333892749161e-05, "loss": 0.4652, "num_input_tokens_seen": 47107120, "step": 38830 }, { "epoch": 4.325091881055797, "grad_norm": 0.1303887963294983, "learning_rate": 3.496888269798699e-05, "loss": 0.4785, "num_input_tokens_seen": 47113392, "step": 38835 }, { "epoch": 4.3256487359394145, "grad_norm": 0.10338038951158524, "learning_rate": 3.496442609182603e-05, "loss": 0.4639, "num_input_tokens_seen": 47119760, "step": 38840 }, { "epoch": 4.326205590823031, "grad_norm": 0.11772636324167252, "learning_rate": 3.495996910917714e-05, "loss": 0.4643, "num_input_tokens_seen": 47126288, "step": 38845 }, { "epoch": 4.326762445706649, "grad_norm": 0.10662315785884857, "learning_rate": 3.495551175020868e-05, "loss": 0.4652, "num_input_tokens_seen": 47132016, "step": 38850 }, { "epoch": 4.327319300590267, "grad_norm": 0.10346256196498871, "learning_rate": 3.495105401508909e-05, "loss": 0.4596, "num_input_tokens_seen": 47137712, "step": 38855 }, { "epoch": 4.327876155473883, "grad_norm": 0.11986738443374634, "learning_rate": 3.4946595903986794e-05, "loss": 0.4568, "num_input_tokens_seen": 47143856, "step": 38860 }, { "epoch": 4.328433010357501, "grad_norm": 0.11978849768638611, "learning_rate": 3.494213741707023e-05, "loss": 0.4578, "num_input_tokens_seen": 47149936, "step": 38865 }, { "epoch": 4.328989865241118, "grad_norm": 0.09919589012861252, "learning_rate": 3.493767855450784e-05, "loss": 0.4543, "num_input_tokens_seen": 47155952, "step": 38870 }, { "epoch": 4.3295467201247355, "grad_norm": 0.12626104056835175, "learning_rate": 3.4933219316468116e-05, "loss": 0.4479, "num_input_tokens_seen": 47161968, "step": 38875 }, { "epoch": 4.330103575008353, "grad_norm": 0.1202734038233757, "learning_rate": 3.492875970311953e-05, "loss": 0.4769, "num_input_tokens_seen": 47167632, "step": 38880 }, { "epoch": 4.33066042989197, "grad_norm": 0.12403383105993271, "learning_rate": 3.4924299714630596e-05, "loss": 0.4579, "num_input_tokens_seen": 47173456, "step": 38885 }, { "epoch": 4.331217284775588, "grad_norm": 0.08767300844192505, "learning_rate": 3.491983935116981e-05, "loss": 0.4564, "num_input_tokens_seen": 47179760, "step": 38890 }, { "epoch": 4.331774139659204, "grad_norm": 0.12153904139995575, "learning_rate": 3.491537861290569e-05, "loss": 0.4686, "num_input_tokens_seen": 47185840, "step": 38895 }, { "epoch": 4.332330994542822, "grad_norm": 0.09205915033817291, "learning_rate": 3.4910917500006806e-05, "loss": 0.4473, "num_input_tokens_seen": 47192272, "step": 38900 }, { "epoch": 4.33288784942644, "grad_norm": 0.13820335268974304, "learning_rate": 3.490645601264169e-05, "loss": 0.4669, "num_input_tokens_seen": 47198160, "step": 38905 }, { "epoch": 4.333444704310057, "grad_norm": 0.11325006932020187, "learning_rate": 3.490199415097892e-05, "loss": 0.4712, "num_input_tokens_seen": 47203696, "step": 38910 }, { "epoch": 4.334001559193674, "grad_norm": 0.07155650854110718, "learning_rate": 3.489753191518709e-05, "loss": 0.4702, "num_input_tokens_seen": 47209776, "step": 38915 }, { "epoch": 4.334558414077291, "grad_norm": 0.08947048336267471, "learning_rate": 3.489306930543477e-05, "loss": 0.4459, "num_input_tokens_seen": 47215472, "step": 38920 }, { "epoch": 4.335115268960909, "grad_norm": 0.07446103543043137, "learning_rate": 3.488860632189059e-05, "loss": 0.4662, "num_input_tokens_seen": 47220816, "step": 38925 }, { "epoch": 4.335672123844526, "grad_norm": 0.09507425129413605, "learning_rate": 3.4884142964723176e-05, "loss": 0.4664, "num_input_tokens_seen": 47227152, "step": 38930 }, { "epoch": 4.336228978728143, "grad_norm": 0.1453275978565216, "learning_rate": 3.487967923410117e-05, "loss": 0.4567, "num_input_tokens_seen": 47233392, "step": 38935 }, { "epoch": 4.336785833611761, "grad_norm": 0.13972453773021698, "learning_rate": 3.4875215130193215e-05, "loss": 0.4659, "num_input_tokens_seen": 47239408, "step": 38940 }, { "epoch": 4.3373426884953785, "grad_norm": 0.1339278221130371, "learning_rate": 3.487075065316799e-05, "loss": 0.4544, "num_input_tokens_seen": 47245360, "step": 38945 }, { "epoch": 4.337899543378995, "grad_norm": 0.12163575738668442, "learning_rate": 3.4866285803194175e-05, "loss": 0.4722, "num_input_tokens_seen": 47251312, "step": 38950 }, { "epoch": 4.338456398262613, "grad_norm": 0.11324044317007065, "learning_rate": 3.486182058044046e-05, "loss": 0.4595, "num_input_tokens_seen": 47257296, "step": 38955 }, { "epoch": 4.33901325314623, "grad_norm": 0.12285152077674866, "learning_rate": 3.485735498507556e-05, "loss": 0.4622, "num_input_tokens_seen": 47263408, "step": 38960 }, { "epoch": 4.339570108029847, "grad_norm": 0.13704265654087067, "learning_rate": 3.48528890172682e-05, "loss": 0.4534, "num_input_tokens_seen": 47269552, "step": 38965 }, { "epoch": 4.340126962913465, "grad_norm": 0.08364001661539078, "learning_rate": 3.484842267718712e-05, "loss": 0.4755, "num_input_tokens_seen": 47275664, "step": 38970 }, { "epoch": 4.340683817797082, "grad_norm": 0.1056584045290947, "learning_rate": 3.4843955965001066e-05, "loss": 0.4582, "num_input_tokens_seen": 47281872, "step": 38975 }, { "epoch": 4.3412406726807, "grad_norm": 0.1292153298854828, "learning_rate": 3.483948888087881e-05, "loss": 0.4633, "num_input_tokens_seen": 47288016, "step": 38980 }, { "epoch": 4.341797527564316, "grad_norm": 0.08881405740976334, "learning_rate": 3.483502142498914e-05, "loss": 0.4727, "num_input_tokens_seen": 47293872, "step": 38985 }, { "epoch": 4.342354382447934, "grad_norm": 0.16979794204235077, "learning_rate": 3.483055359750084e-05, "loss": 0.4625, "num_input_tokens_seen": 47300144, "step": 38990 }, { "epoch": 4.342911237331552, "grad_norm": 0.11580196022987366, "learning_rate": 3.482608539858271e-05, "loss": 0.4659, "num_input_tokens_seen": 47305744, "step": 38995 }, { "epoch": 4.3434680922151685, "grad_norm": 0.11074697971343994, "learning_rate": 3.482161682840359e-05, "loss": 0.4596, "num_input_tokens_seen": 47311568, "step": 39000 }, { "epoch": 4.344024947098786, "grad_norm": 0.10298352688550949, "learning_rate": 3.481714788713232e-05, "loss": 0.463, "num_input_tokens_seen": 47317648, "step": 39005 }, { "epoch": 4.344581801982404, "grad_norm": 0.09879033267498016, "learning_rate": 3.481267857493773e-05, "loss": 0.4607, "num_input_tokens_seen": 47323728, "step": 39010 }, { "epoch": 4.345138656866021, "grad_norm": 0.1507268100976944, "learning_rate": 3.4808208891988704e-05, "loss": 0.4608, "num_input_tokens_seen": 47329936, "step": 39015 }, { "epoch": 4.345695511749638, "grad_norm": 0.09491521120071411, "learning_rate": 3.4803738838454114e-05, "loss": 0.4548, "num_input_tokens_seen": 47335920, "step": 39020 }, { "epoch": 4.346252366633255, "grad_norm": 0.12512020766735077, "learning_rate": 3.479926841450285e-05, "loss": 0.4686, "num_input_tokens_seen": 47342160, "step": 39025 }, { "epoch": 4.346809221516873, "grad_norm": 0.12129200249910355, "learning_rate": 3.479479762030382e-05, "loss": 0.4697, "num_input_tokens_seen": 47348464, "step": 39030 }, { "epoch": 4.34736607640049, "grad_norm": 0.1271914839744568, "learning_rate": 3.479032645602595e-05, "loss": 0.4564, "num_input_tokens_seen": 47354480, "step": 39035 }, { "epoch": 4.347922931284107, "grad_norm": 0.1043272390961647, "learning_rate": 3.478585492183818e-05, "loss": 0.471, "num_input_tokens_seen": 47360624, "step": 39040 }, { "epoch": 4.348479786167725, "grad_norm": 0.1020364984869957, "learning_rate": 3.478138301790944e-05, "loss": 0.4603, "num_input_tokens_seen": 47366608, "step": 39045 }, { "epoch": 4.349036641051342, "grad_norm": 0.08291351050138474, "learning_rate": 3.477691074440871e-05, "loss": 0.4716, "num_input_tokens_seen": 47372528, "step": 39050 }, { "epoch": 4.349593495934959, "grad_norm": 0.10608455538749695, "learning_rate": 3.4772438101504956e-05, "loss": 0.4576, "num_input_tokens_seen": 47378352, "step": 39055 }, { "epoch": 4.350150350818577, "grad_norm": 0.10982837527990341, "learning_rate": 3.4767965089367174e-05, "loss": 0.4592, "num_input_tokens_seen": 47384464, "step": 39060 }, { "epoch": 4.350707205702194, "grad_norm": 0.13797229528427124, "learning_rate": 3.4763491708164364e-05, "loss": 0.4601, "num_input_tokens_seen": 47390960, "step": 39065 }, { "epoch": 4.3512640605858115, "grad_norm": 0.10632777214050293, "learning_rate": 3.475901795806555e-05, "loss": 0.4742, "num_input_tokens_seen": 47397136, "step": 39070 }, { "epoch": 4.351820915469428, "grad_norm": 0.12734699249267578, "learning_rate": 3.4754543839239775e-05, "loss": 0.4554, "num_input_tokens_seen": 47403184, "step": 39075 }, { "epoch": 4.352377770353046, "grad_norm": 0.07361927628517151, "learning_rate": 3.475006935185606e-05, "loss": 0.4566, "num_input_tokens_seen": 47408976, "step": 39080 }, { "epoch": 4.352934625236664, "grad_norm": 0.11578819900751114, "learning_rate": 3.474559449608349e-05, "loss": 0.4687, "num_input_tokens_seen": 47414960, "step": 39085 }, { "epoch": 4.35349148012028, "grad_norm": 0.1339961141347885, "learning_rate": 3.4741119272091124e-05, "loss": 0.4657, "num_input_tokens_seen": 47421264, "step": 39090 }, { "epoch": 4.354048335003898, "grad_norm": 0.09933944791555405, "learning_rate": 3.473664368004805e-05, "loss": 0.4657, "num_input_tokens_seen": 47427344, "step": 39095 }, { "epoch": 4.354605189887515, "grad_norm": 0.10466708242893219, "learning_rate": 3.473216772012338e-05, "loss": 0.4554, "num_input_tokens_seen": 47433520, "step": 39100 }, { "epoch": 4.355162044771133, "grad_norm": 0.11962026357650757, "learning_rate": 3.4727691392486226e-05, "loss": 0.4626, "num_input_tokens_seen": 47439696, "step": 39105 }, { "epoch": 4.35571889965475, "grad_norm": 0.10272906720638275, "learning_rate": 3.4723214697305716e-05, "loss": 0.4521, "num_input_tokens_seen": 47445808, "step": 39110 }, { "epoch": 4.356275754538367, "grad_norm": 0.08491427451372147, "learning_rate": 3.471873763475099e-05, "loss": 0.4547, "num_input_tokens_seen": 47451568, "step": 39115 }, { "epoch": 4.356832609421985, "grad_norm": 0.07809588313102722, "learning_rate": 3.471426020499122e-05, "loss": 0.453, "num_input_tokens_seen": 47457552, "step": 39120 }, { "epoch": 4.357389464305602, "grad_norm": 0.10675626993179321, "learning_rate": 3.470978240819556e-05, "loss": 0.4599, "num_input_tokens_seen": 47463920, "step": 39125 }, { "epoch": 4.357946319189219, "grad_norm": 0.10433229058980942, "learning_rate": 3.470530424453321e-05, "loss": 0.4502, "num_input_tokens_seen": 47469872, "step": 39130 }, { "epoch": 4.358503174072837, "grad_norm": 0.12030449509620667, "learning_rate": 3.470082571417337e-05, "loss": 0.4614, "num_input_tokens_seen": 47475760, "step": 39135 }, { "epoch": 4.359060028956454, "grad_norm": 0.10158032178878784, "learning_rate": 3.469634681728523e-05, "loss": 0.4672, "num_input_tokens_seen": 47481552, "step": 39140 }, { "epoch": 4.359616883840071, "grad_norm": 0.10371646285057068, "learning_rate": 3.4691867554038046e-05, "loss": 0.4727, "num_input_tokens_seen": 47487664, "step": 39145 }, { "epoch": 4.360173738723689, "grad_norm": 0.09557007998228073, "learning_rate": 3.468738792460105e-05, "loss": 0.472, "num_input_tokens_seen": 47494032, "step": 39150 }, { "epoch": 4.360730593607306, "grad_norm": 0.11365237087011337, "learning_rate": 3.468290792914348e-05, "loss": 0.4615, "num_input_tokens_seen": 47499856, "step": 39155 }, { "epoch": 4.361287448490923, "grad_norm": 0.10388563573360443, "learning_rate": 3.467842756783463e-05, "loss": 0.4539, "num_input_tokens_seen": 47506064, "step": 39160 }, { "epoch": 4.36184430337454, "grad_norm": 0.09911908954381943, "learning_rate": 3.4673946840843765e-05, "loss": 0.4605, "num_input_tokens_seen": 47512144, "step": 39165 }, { "epoch": 4.362401158258158, "grad_norm": 0.11975204944610596, "learning_rate": 3.466946574834018e-05, "loss": 0.4684, "num_input_tokens_seen": 47518160, "step": 39170 }, { "epoch": 4.362958013141776, "grad_norm": 0.14099909365177155, "learning_rate": 3.46649842904932e-05, "loss": 0.4568, "num_input_tokens_seen": 47524624, "step": 39175 }, { "epoch": 4.363514868025392, "grad_norm": 0.10433506965637207, "learning_rate": 3.4660502467472134e-05, "loss": 0.4548, "num_input_tokens_seen": 47530672, "step": 39180 }, { "epoch": 4.36407172290901, "grad_norm": 0.12191221117973328, "learning_rate": 3.465602027944633e-05, "loss": 0.4572, "num_input_tokens_seen": 47536720, "step": 39185 }, { "epoch": 4.364628577792628, "grad_norm": 0.13068324327468872, "learning_rate": 3.465153772658514e-05, "loss": 0.4748, "num_input_tokens_seen": 47543056, "step": 39190 }, { "epoch": 4.3651854326762445, "grad_norm": 0.14562375843524933, "learning_rate": 3.464705480905792e-05, "loss": 0.4737, "num_input_tokens_seen": 47548912, "step": 39195 }, { "epoch": 4.365742287559862, "grad_norm": 0.10738669335842133, "learning_rate": 3.4642571527034064e-05, "loss": 0.4673, "num_input_tokens_seen": 47555280, "step": 39200 }, { "epoch": 4.366299142443479, "grad_norm": 0.09945204108953476, "learning_rate": 3.4638087880682945e-05, "loss": 0.4704, "num_input_tokens_seen": 47561264, "step": 39205 }, { "epoch": 4.366855997327097, "grad_norm": 0.07262612879276276, "learning_rate": 3.463360387017397e-05, "loss": 0.4717, "num_input_tokens_seen": 47567664, "step": 39210 }, { "epoch": 4.367412852210714, "grad_norm": 0.08716999739408493, "learning_rate": 3.462911949567658e-05, "loss": 0.4551, "num_input_tokens_seen": 47573872, "step": 39215 }, { "epoch": 4.367969707094331, "grad_norm": 0.08998484164476395, "learning_rate": 3.46246347573602e-05, "loss": 0.4627, "num_input_tokens_seen": 47579728, "step": 39220 }, { "epoch": 4.368526561977949, "grad_norm": 0.10897751897573471, "learning_rate": 3.4620149655394275e-05, "loss": 0.4742, "num_input_tokens_seen": 47586064, "step": 39225 }, { "epoch": 4.3690834168615655, "grad_norm": 0.07744718343019485, "learning_rate": 3.461566418994826e-05, "loss": 0.4569, "num_input_tokens_seen": 47592560, "step": 39230 }, { "epoch": 4.369640271745183, "grad_norm": 0.1118551567196846, "learning_rate": 3.4611178361191645e-05, "loss": 0.4669, "num_input_tokens_seen": 47598768, "step": 39235 }, { "epoch": 4.370197126628801, "grad_norm": 0.12646254897117615, "learning_rate": 3.46066921692939e-05, "loss": 0.4592, "num_input_tokens_seen": 47604784, "step": 39240 }, { "epoch": 4.370753981512418, "grad_norm": 0.09532348811626434, "learning_rate": 3.4602205614424546e-05, "loss": 0.4583, "num_input_tokens_seen": 47611184, "step": 39245 }, { "epoch": 4.371310836396035, "grad_norm": 0.08014033734798431, "learning_rate": 3.459771869675309e-05, "loss": 0.4594, "num_input_tokens_seen": 47617296, "step": 39250 }, { "epoch": 4.371867691279652, "grad_norm": 0.1147453784942627, "learning_rate": 3.4593231416449065e-05, "loss": 0.4663, "num_input_tokens_seen": 47623408, "step": 39255 }, { "epoch": 4.37242454616327, "grad_norm": 0.09262311458587646, "learning_rate": 3.458874377368201e-05, "loss": 0.4651, "num_input_tokens_seen": 47629872, "step": 39260 }, { "epoch": 4.3729814010468875, "grad_norm": 0.1058657094836235, "learning_rate": 3.458425576862149e-05, "loss": 0.463, "num_input_tokens_seen": 47636144, "step": 39265 }, { "epoch": 4.373538255930504, "grad_norm": 0.15338172018527985, "learning_rate": 3.457976740143708e-05, "loss": 0.4535, "num_input_tokens_seen": 47642128, "step": 39270 }, { "epoch": 4.374095110814122, "grad_norm": 0.12203836441040039, "learning_rate": 3.4575278672298345e-05, "loss": 0.4546, "num_input_tokens_seen": 47647248, "step": 39275 }, { "epoch": 4.374651965697739, "grad_norm": 0.1013726145029068, "learning_rate": 3.45707895813749e-05, "loss": 0.4732, "num_input_tokens_seen": 47653296, "step": 39280 }, { "epoch": 4.375208820581356, "grad_norm": 0.10111025720834732, "learning_rate": 3.456630012883636e-05, "loss": 0.4662, "num_input_tokens_seen": 47659408, "step": 39285 }, { "epoch": 4.375765675464974, "grad_norm": 0.09501592814922333, "learning_rate": 3.456181031485234e-05, "loss": 0.4598, "num_input_tokens_seen": 47665552, "step": 39290 }, { "epoch": 4.376322530348591, "grad_norm": 0.11038724333047867, "learning_rate": 3.455732013959248e-05, "loss": 0.4551, "num_input_tokens_seen": 47671728, "step": 39295 }, { "epoch": 4.3768793852322085, "grad_norm": 0.08725153654813766, "learning_rate": 3.4552829603226445e-05, "loss": 0.4697, "num_input_tokens_seen": 47677680, "step": 39300 }, { "epoch": 4.377436240115826, "grad_norm": 0.10905411839485168, "learning_rate": 3.454833870592389e-05, "loss": 0.4554, "num_input_tokens_seen": 47683888, "step": 39305 }, { "epoch": 4.377993094999443, "grad_norm": 0.1257062554359436, "learning_rate": 3.45438474478545e-05, "loss": 0.4663, "num_input_tokens_seen": 47690288, "step": 39310 }, { "epoch": 4.378549949883061, "grad_norm": 0.09972550719976425, "learning_rate": 3.453935582918797e-05, "loss": 0.4593, "num_input_tokens_seen": 47696144, "step": 39315 }, { "epoch": 4.3791068047666775, "grad_norm": 0.13003043830394745, "learning_rate": 3.453486385009401e-05, "loss": 0.458, "num_input_tokens_seen": 47701872, "step": 39320 }, { "epoch": 4.379663659650295, "grad_norm": 0.12278355658054352, "learning_rate": 3.453037151074233e-05, "loss": 0.4727, "num_input_tokens_seen": 47707504, "step": 39325 }, { "epoch": 4.380220514533913, "grad_norm": 0.1175297349691391, "learning_rate": 3.452587881130267e-05, "loss": 0.4657, "num_input_tokens_seen": 47713680, "step": 39330 }, { "epoch": 4.38077736941753, "grad_norm": 0.10638453811407089, "learning_rate": 3.452138575194479e-05, "loss": 0.4599, "num_input_tokens_seen": 47719120, "step": 39335 }, { "epoch": 4.381334224301147, "grad_norm": 0.10413161665201187, "learning_rate": 3.451689233283844e-05, "loss": 0.4536, "num_input_tokens_seen": 47725552, "step": 39340 }, { "epoch": 4.381891079184764, "grad_norm": 0.09425760805606842, "learning_rate": 3.45123985541534e-05, "loss": 0.4527, "num_input_tokens_seen": 47731792, "step": 39345 }, { "epoch": 4.382447934068382, "grad_norm": 0.11645089089870453, "learning_rate": 3.450790441605946e-05, "loss": 0.459, "num_input_tokens_seen": 47737904, "step": 39350 }, { "epoch": 4.383004788951999, "grad_norm": 0.1142689511179924, "learning_rate": 3.450340991872642e-05, "loss": 0.4542, "num_input_tokens_seen": 47743984, "step": 39355 }, { "epoch": 4.383561643835616, "grad_norm": 0.11739815026521683, "learning_rate": 3.44989150623241e-05, "loss": 0.4617, "num_input_tokens_seen": 47750000, "step": 39360 }, { "epoch": 4.384118498719234, "grad_norm": 0.1351032555103302, "learning_rate": 3.449441984702232e-05, "loss": 0.4427, "num_input_tokens_seen": 47755568, "step": 39365 }, { "epoch": 4.3846753536028515, "grad_norm": 0.11042672395706177, "learning_rate": 3.4489924272990935e-05, "loss": 0.4675, "num_input_tokens_seen": 47762032, "step": 39370 }, { "epoch": 4.385232208486468, "grad_norm": 0.12166757881641388, "learning_rate": 3.4485428340399804e-05, "loss": 0.4667, "num_input_tokens_seen": 47768144, "step": 39375 }, { "epoch": 4.385789063370086, "grad_norm": 0.11135870218276978, "learning_rate": 3.4480932049418794e-05, "loss": 0.4539, "num_input_tokens_seen": 47773648, "step": 39380 }, { "epoch": 4.386345918253703, "grad_norm": 0.10634849965572357, "learning_rate": 3.4476435400217785e-05, "loss": 0.4391, "num_input_tokens_seen": 47779024, "step": 39385 }, { "epoch": 4.3869027731373205, "grad_norm": 0.10583078861236572, "learning_rate": 3.447193839296668e-05, "loss": 0.4502, "num_input_tokens_seen": 47785456, "step": 39390 }, { "epoch": 4.387459628020938, "grad_norm": 0.148117795586586, "learning_rate": 3.446744102783539e-05, "loss": 0.4648, "num_input_tokens_seen": 47791792, "step": 39395 }, { "epoch": 4.388016482904555, "grad_norm": 0.1312742382287979, "learning_rate": 3.4462943304993826e-05, "loss": 0.4595, "num_input_tokens_seen": 47798160, "step": 39400 }, { "epoch": 4.388573337788173, "grad_norm": 0.13140985369682312, "learning_rate": 3.4458445224611947e-05, "loss": 0.4648, "num_input_tokens_seen": 47804176, "step": 39405 }, { "epoch": 4.389130192671789, "grad_norm": 0.1397012174129486, "learning_rate": 3.44539467868597e-05, "loss": 0.4584, "num_input_tokens_seen": 47810480, "step": 39410 }, { "epoch": 4.389687047555407, "grad_norm": 0.0899718776345253, "learning_rate": 3.444944799190704e-05, "loss": 0.4584, "num_input_tokens_seen": 47816624, "step": 39415 }, { "epoch": 4.390243902439025, "grad_norm": 0.10790695250034332, "learning_rate": 3.444494883992396e-05, "loss": 0.4753, "num_input_tokens_seen": 47822960, "step": 39420 }, { "epoch": 4.3908007573226415, "grad_norm": 0.09718675911426544, "learning_rate": 3.4440449331080436e-05, "loss": 0.4669, "num_input_tokens_seen": 47828368, "step": 39425 }, { "epoch": 4.391357612206259, "grad_norm": 0.10621684789657593, "learning_rate": 3.443594946554648e-05, "loss": 0.4655, "num_input_tokens_seen": 47834832, "step": 39430 }, { "epoch": 4.391914467089876, "grad_norm": 0.10419197380542755, "learning_rate": 3.443144924349213e-05, "loss": 0.4622, "num_input_tokens_seen": 47841040, "step": 39435 }, { "epoch": 4.392471321973494, "grad_norm": 0.11507820338010788, "learning_rate": 3.442694866508739e-05, "loss": 0.4615, "num_input_tokens_seen": 47847280, "step": 39440 }, { "epoch": 4.393028176857111, "grad_norm": 0.09347474575042725, "learning_rate": 3.442244773050232e-05, "loss": 0.4739, "num_input_tokens_seen": 47853392, "step": 39445 }, { "epoch": 4.393585031740728, "grad_norm": 0.09833838045597076, "learning_rate": 3.441794643990699e-05, "loss": 0.4598, "num_input_tokens_seen": 47859184, "step": 39450 }, { "epoch": 4.394141886624346, "grad_norm": 0.13078206777572632, "learning_rate": 3.441344479347145e-05, "loss": 0.4728, "num_input_tokens_seen": 47865552, "step": 39455 }, { "epoch": 4.394698741507963, "grad_norm": 0.11809190362691879, "learning_rate": 3.44089427913658e-05, "loss": 0.4661, "num_input_tokens_seen": 47871216, "step": 39460 }, { "epoch": 4.39525559639158, "grad_norm": 0.12222598493099213, "learning_rate": 3.4404440433760146e-05, "loss": 0.4568, "num_input_tokens_seen": 47877712, "step": 39465 }, { "epoch": 4.395812451275198, "grad_norm": 0.152409166097641, "learning_rate": 3.4399937720824595e-05, "loss": 0.4614, "num_input_tokens_seen": 47883600, "step": 39470 }, { "epoch": 4.396369306158815, "grad_norm": 0.1420438140630722, "learning_rate": 3.439543465272927e-05, "loss": 0.4688, "num_input_tokens_seen": 47889808, "step": 39475 }, { "epoch": 4.396926161042432, "grad_norm": 0.1155674085021019, "learning_rate": 3.439093122964431e-05, "loss": 0.4665, "num_input_tokens_seen": 47895728, "step": 39480 }, { "epoch": 4.39748301592605, "grad_norm": 0.09554912149906158, "learning_rate": 3.4386427451739876e-05, "loss": 0.4678, "num_input_tokens_seen": 47902032, "step": 39485 }, { "epoch": 4.398039870809667, "grad_norm": 0.129459410905838, "learning_rate": 3.438192331918614e-05, "loss": 0.4552, "num_input_tokens_seen": 47908112, "step": 39490 }, { "epoch": 4.3985967256932845, "grad_norm": 0.11982820183038712, "learning_rate": 3.4377418832153265e-05, "loss": 0.4756, "num_input_tokens_seen": 47914480, "step": 39495 }, { "epoch": 4.399153580576901, "grad_norm": 0.1337403804063797, "learning_rate": 3.4372913990811464e-05, "loss": 0.4679, "num_input_tokens_seen": 47920720, "step": 39500 }, { "epoch": 4.399710435460519, "grad_norm": 0.10238675773143768, "learning_rate": 3.436840879533093e-05, "loss": 0.4652, "num_input_tokens_seen": 47926608, "step": 39505 }, { "epoch": 4.400267290344137, "grad_norm": 0.11764199286699295, "learning_rate": 3.436390324588189e-05, "loss": 0.4617, "num_input_tokens_seen": 47932560, "step": 39510 }, { "epoch": 4.4008241452277534, "grad_norm": 0.06374695152044296, "learning_rate": 3.435939734263457e-05, "loss": 0.4644, "num_input_tokens_seen": 47938608, "step": 39515 }, { "epoch": 4.401381000111371, "grad_norm": 0.15087385475635529, "learning_rate": 3.4354891085759234e-05, "loss": 0.4665, "num_input_tokens_seen": 47944848, "step": 39520 }, { "epoch": 4.401937854994988, "grad_norm": 0.11940676718950272, "learning_rate": 3.4350384475426134e-05, "loss": 0.4668, "num_input_tokens_seen": 47951408, "step": 39525 }, { "epoch": 4.402494709878606, "grad_norm": 0.14447292685508728, "learning_rate": 3.434587751180553e-05, "loss": 0.4623, "num_input_tokens_seen": 47956688, "step": 39530 }, { "epoch": 4.403051564762223, "grad_norm": 0.09429456293582916, "learning_rate": 3.4341370195067745e-05, "loss": 0.4572, "num_input_tokens_seen": 47962832, "step": 39535 }, { "epoch": 4.40360841964584, "grad_norm": 0.10825726389884949, "learning_rate": 3.4336862525383044e-05, "loss": 0.4607, "num_input_tokens_seen": 47969072, "step": 39540 }, { "epoch": 4.404165274529458, "grad_norm": 0.13289086520671844, "learning_rate": 3.433235450292176e-05, "loss": 0.4774, "num_input_tokens_seen": 47975216, "step": 39545 }, { "epoch": 4.404722129413075, "grad_norm": 0.15050528943538666, "learning_rate": 3.4327846127854215e-05, "loss": 0.4625, "num_input_tokens_seen": 47981360, "step": 39550 }, { "epoch": 4.405278984296692, "grad_norm": 0.13067284226417542, "learning_rate": 3.4323337400350744e-05, "loss": 0.4473, "num_input_tokens_seen": 47987632, "step": 39555 }, { "epoch": 4.40583583918031, "grad_norm": 0.09789858758449554, "learning_rate": 3.431882832058172e-05, "loss": 0.453, "num_input_tokens_seen": 47993744, "step": 39560 }, { "epoch": 4.406392694063927, "grad_norm": 0.11171829700469971, "learning_rate": 3.431431888871749e-05, "loss": 0.4568, "num_input_tokens_seen": 47999888, "step": 39565 }, { "epoch": 4.406949548947544, "grad_norm": 0.1067291870713234, "learning_rate": 3.430980910492845e-05, "loss": 0.4541, "num_input_tokens_seen": 48006256, "step": 39570 }, { "epoch": 4.407506403831162, "grad_norm": 0.10727327316999435, "learning_rate": 3.430529896938498e-05, "loss": 0.4638, "num_input_tokens_seen": 48012208, "step": 39575 }, { "epoch": 4.408063258714779, "grad_norm": 0.11746273189783096, "learning_rate": 3.43007884822575e-05, "loss": 0.4541, "num_input_tokens_seen": 48018128, "step": 39580 }, { "epoch": 4.4086201135983965, "grad_norm": 0.07392029464244843, "learning_rate": 3.4296277643716426e-05, "loss": 0.479, "num_input_tokens_seen": 48023984, "step": 39585 }, { "epoch": 4.409176968482013, "grad_norm": 0.11310623586177826, "learning_rate": 3.429176645393218e-05, "loss": 0.4571, "num_input_tokens_seen": 48030288, "step": 39590 }, { "epoch": 4.409733823365631, "grad_norm": 0.09468091279268265, "learning_rate": 3.428725491307523e-05, "loss": 0.4514, "num_input_tokens_seen": 48036080, "step": 39595 }, { "epoch": 4.410290678249249, "grad_norm": 0.10368454456329346, "learning_rate": 3.428274302131603e-05, "loss": 0.4553, "num_input_tokens_seen": 48041648, "step": 39600 }, { "epoch": 4.410847533132865, "grad_norm": 0.07453621178865433, "learning_rate": 3.427823077882505e-05, "loss": 0.4693, "num_input_tokens_seen": 48047888, "step": 39605 }, { "epoch": 4.411404388016483, "grad_norm": 0.11177283525466919, "learning_rate": 3.4273718185772777e-05, "loss": 0.4658, "num_input_tokens_seen": 48053936, "step": 39610 }, { "epoch": 4.4119612429001, "grad_norm": 0.1049547865986824, "learning_rate": 3.4269205242329696e-05, "loss": 0.4703, "num_input_tokens_seen": 48060144, "step": 39615 }, { "epoch": 4.4125180977837175, "grad_norm": 0.08633121848106384, "learning_rate": 3.426469194866635e-05, "loss": 0.4752, "num_input_tokens_seen": 48065424, "step": 39620 }, { "epoch": 4.413074952667335, "grad_norm": 0.13942578434944153, "learning_rate": 3.4260178304953255e-05, "loss": 0.4608, "num_input_tokens_seen": 48071632, "step": 39625 }, { "epoch": 4.413631807550952, "grad_norm": 0.10418756306171417, "learning_rate": 3.425566431136095e-05, "loss": 0.4605, "num_input_tokens_seen": 48077680, "step": 39630 }, { "epoch": 4.41418866243457, "grad_norm": 0.10528850555419922, "learning_rate": 3.425114996805998e-05, "loss": 0.4611, "num_input_tokens_seen": 48083728, "step": 39635 }, { "epoch": 4.414745517318186, "grad_norm": 0.100639209151268, "learning_rate": 3.4246635275220914e-05, "loss": 0.4485, "num_input_tokens_seen": 48089904, "step": 39640 }, { "epoch": 4.415302372201804, "grad_norm": 0.13371077179908752, "learning_rate": 3.424212023301434e-05, "loss": 0.461, "num_input_tokens_seen": 48096240, "step": 39645 }, { "epoch": 4.415859227085422, "grad_norm": 0.10054780542850494, "learning_rate": 3.4237604841610835e-05, "loss": 0.4552, "num_input_tokens_seen": 48102384, "step": 39650 }, { "epoch": 4.416416081969039, "grad_norm": 0.11213558912277222, "learning_rate": 3.423308910118103e-05, "loss": 0.4627, "num_input_tokens_seen": 48108464, "step": 39655 }, { "epoch": 4.416972936852656, "grad_norm": 0.11645812541246414, "learning_rate": 3.4228573011895526e-05, "loss": 0.4652, "num_input_tokens_seen": 48114512, "step": 39660 }, { "epoch": 4.417529791736274, "grad_norm": 0.15897653996944427, "learning_rate": 3.422405657392496e-05, "loss": 0.4689, "num_input_tokens_seen": 48120624, "step": 39665 }, { "epoch": 4.418086646619891, "grad_norm": 0.0971703827381134, "learning_rate": 3.421953978743998e-05, "loss": 0.4608, "num_input_tokens_seen": 48126544, "step": 39670 }, { "epoch": 4.418643501503508, "grad_norm": 0.1081882193684578, "learning_rate": 3.421502265261123e-05, "loss": 0.459, "num_input_tokens_seen": 48132656, "step": 39675 }, { "epoch": 4.419200356387125, "grad_norm": 0.08263565599918365, "learning_rate": 3.42105051696094e-05, "loss": 0.4696, "num_input_tokens_seen": 48138864, "step": 39680 }, { "epoch": 4.419757211270743, "grad_norm": 0.09454458206892014, "learning_rate": 3.420598733860516e-05, "loss": 0.4537, "num_input_tokens_seen": 48144720, "step": 39685 }, { "epoch": 4.4203140661543605, "grad_norm": 0.11449572443962097, "learning_rate": 3.420146915976923e-05, "loss": 0.4531, "num_input_tokens_seen": 48150768, "step": 39690 }, { "epoch": 4.420870921037977, "grad_norm": 0.09420511871576309, "learning_rate": 3.41969506332723e-05, "loss": 0.4696, "num_input_tokens_seen": 48156784, "step": 39695 }, { "epoch": 4.421427775921595, "grad_norm": 0.0992787703871727, "learning_rate": 3.4192431759285104e-05, "loss": 0.4663, "num_input_tokens_seen": 48162992, "step": 39700 }, { "epoch": 4.421984630805212, "grad_norm": 0.11355049163103104, "learning_rate": 3.418791253797838e-05, "loss": 0.4628, "num_input_tokens_seen": 48168304, "step": 39705 }, { "epoch": 4.422541485688829, "grad_norm": 0.0818326473236084, "learning_rate": 3.4183392969522877e-05, "loss": 0.4598, "num_input_tokens_seen": 48174608, "step": 39710 }, { "epoch": 4.423098340572447, "grad_norm": 0.10561169683933258, "learning_rate": 3.417887305408935e-05, "loss": 0.4529, "num_input_tokens_seen": 48180688, "step": 39715 }, { "epoch": 4.423655195456064, "grad_norm": 0.12034223973751068, "learning_rate": 3.417435279184859e-05, "loss": 0.4523, "num_input_tokens_seen": 48186928, "step": 39720 }, { "epoch": 4.424212050339682, "grad_norm": 0.10210225731134415, "learning_rate": 3.416983218297138e-05, "loss": 0.4449, "num_input_tokens_seen": 48192720, "step": 39725 }, { "epoch": 4.424768905223299, "grad_norm": 0.1286081224679947, "learning_rate": 3.4165311227628524e-05, "loss": 0.4563, "num_input_tokens_seen": 48198736, "step": 39730 }, { "epoch": 4.425325760106916, "grad_norm": 0.12072443962097168, "learning_rate": 3.4160789925990834e-05, "loss": 0.4473, "num_input_tokens_seen": 48204752, "step": 39735 }, { "epoch": 4.425882614990534, "grad_norm": 0.14414359629154205, "learning_rate": 3.415626827822914e-05, "loss": 0.4773, "num_input_tokens_seen": 48210448, "step": 39740 }, { "epoch": 4.4264394698741505, "grad_norm": 0.14183464646339417, "learning_rate": 3.41517462845143e-05, "loss": 0.4637, "num_input_tokens_seen": 48216496, "step": 39745 }, { "epoch": 4.426996324757768, "grad_norm": 0.10940110683441162, "learning_rate": 3.4147223945017156e-05, "loss": 0.4589, "num_input_tokens_seen": 48222320, "step": 39750 }, { "epoch": 4.427553179641386, "grad_norm": 0.10819035768508911, "learning_rate": 3.4142701259908574e-05, "loss": 0.4529, "num_input_tokens_seen": 48228464, "step": 39755 }, { "epoch": 4.428110034525003, "grad_norm": 0.11706811934709549, "learning_rate": 3.413817822935944e-05, "loss": 0.457, "num_input_tokens_seen": 48234992, "step": 39760 }, { "epoch": 4.42866688940862, "grad_norm": 0.15840691328048706, "learning_rate": 3.413365485354064e-05, "loss": 0.4627, "num_input_tokens_seen": 48241072, "step": 39765 }, { "epoch": 4.429223744292237, "grad_norm": 0.1717124730348587, "learning_rate": 3.4129131132623096e-05, "loss": 0.4596, "num_input_tokens_seen": 48247184, "step": 39770 }, { "epoch": 4.429780599175855, "grad_norm": 0.100651815533638, "learning_rate": 3.412460706677772e-05, "loss": 0.4581, "num_input_tokens_seen": 48253488, "step": 39775 }, { "epoch": 4.430337454059472, "grad_norm": 0.16382241249084473, "learning_rate": 3.412008265617545e-05, "loss": 0.4788, "num_input_tokens_seen": 48259696, "step": 39780 }, { "epoch": 4.430894308943089, "grad_norm": 0.10219722986221313, "learning_rate": 3.411555790098723e-05, "loss": 0.4445, "num_input_tokens_seen": 48265680, "step": 39785 }, { "epoch": 4.431451163826707, "grad_norm": 0.12184526026248932, "learning_rate": 3.411103280138402e-05, "loss": 0.4597, "num_input_tokens_seen": 48271824, "step": 39790 }, { "epoch": 4.432008018710324, "grad_norm": 0.103051096200943, "learning_rate": 3.410650735753679e-05, "loss": 0.4591, "num_input_tokens_seen": 48278064, "step": 39795 }, { "epoch": 4.432564873593941, "grad_norm": 0.12567177414894104, "learning_rate": 3.410198156961653e-05, "loss": 0.459, "num_input_tokens_seen": 48284112, "step": 39800 }, { "epoch": 4.433121728477559, "grad_norm": 0.1653997153043747, "learning_rate": 3.409745543779424e-05, "loss": 0.4701, "num_input_tokens_seen": 48290096, "step": 39805 }, { "epoch": 4.433678583361176, "grad_norm": 0.10024341940879822, "learning_rate": 3.409292896224093e-05, "loss": 0.4539, "num_input_tokens_seen": 48296176, "step": 39810 }, { "epoch": 4.4342354382447935, "grad_norm": 0.08501791208982468, "learning_rate": 3.4088402143127625e-05, "loss": 0.4471, "num_input_tokens_seen": 48302160, "step": 39815 }, { "epoch": 4.43479229312841, "grad_norm": 0.15468938648700714, "learning_rate": 3.408387498062536e-05, "loss": 0.4697, "num_input_tokens_seen": 48307696, "step": 39820 }, { "epoch": 4.435349148012028, "grad_norm": 0.10994219779968262, "learning_rate": 3.407934747490519e-05, "loss": 0.4579, "num_input_tokens_seen": 48313424, "step": 39825 }, { "epoch": 4.435906002895646, "grad_norm": 0.12413191795349121, "learning_rate": 3.407481962613817e-05, "loss": 0.466, "num_input_tokens_seen": 48319536, "step": 39830 }, { "epoch": 4.436462857779262, "grad_norm": 0.1211981326341629, "learning_rate": 3.407029143449538e-05, "loss": 0.4712, "num_input_tokens_seen": 48325744, "step": 39835 }, { "epoch": 4.43701971266288, "grad_norm": 0.08562120050191879, "learning_rate": 3.406576290014792e-05, "loss": 0.4639, "num_input_tokens_seen": 48331856, "step": 39840 }, { "epoch": 4.437576567546498, "grad_norm": 0.10221989452838898, "learning_rate": 3.406123402326689e-05, "loss": 0.4666, "num_input_tokens_seen": 48337712, "step": 39845 }, { "epoch": 4.4381334224301145, "grad_norm": 0.19291037321090698, "learning_rate": 3.405670480402339e-05, "loss": 0.4683, "num_input_tokens_seen": 48344144, "step": 39850 }, { "epoch": 4.438690277313732, "grad_norm": 0.12158675491809845, "learning_rate": 3.4052175242588566e-05, "loss": 0.4689, "num_input_tokens_seen": 48349968, "step": 39855 }, { "epoch": 4.439247132197349, "grad_norm": 0.10421204566955566, "learning_rate": 3.4047645339133546e-05, "loss": 0.4723, "num_input_tokens_seen": 48356240, "step": 39860 }, { "epoch": 4.439803987080967, "grad_norm": 0.0987544059753418, "learning_rate": 3.404311509382949e-05, "loss": 0.4532, "num_input_tokens_seen": 48362224, "step": 39865 }, { "epoch": 4.440360841964584, "grad_norm": 0.15964104235172272, "learning_rate": 3.403858450684757e-05, "loss": 0.4616, "num_input_tokens_seen": 48368400, "step": 39870 }, { "epoch": 4.440917696848201, "grad_norm": 0.10238926112651825, "learning_rate": 3.4034053578358967e-05, "loss": 0.4652, "num_input_tokens_seen": 48374544, "step": 39875 }, { "epoch": 4.441474551731819, "grad_norm": 0.18892385065555573, "learning_rate": 3.402952230853486e-05, "loss": 0.4695, "num_input_tokens_seen": 48380560, "step": 39880 }, { "epoch": 4.442031406615436, "grad_norm": 0.08298437297344208, "learning_rate": 3.4024990697546465e-05, "loss": 0.4561, "num_input_tokens_seen": 48386576, "step": 39885 }, { "epoch": 4.442588261499053, "grad_norm": 0.10868270695209503, "learning_rate": 3.4020458745565e-05, "loss": 0.4778, "num_input_tokens_seen": 48392752, "step": 39890 }, { "epoch": 4.443145116382671, "grad_norm": 0.08415263891220093, "learning_rate": 3.40159264527617e-05, "loss": 0.4597, "num_input_tokens_seen": 48398928, "step": 39895 }, { "epoch": 4.443701971266288, "grad_norm": 0.08403833210468292, "learning_rate": 3.40113938193078e-05, "loss": 0.4656, "num_input_tokens_seen": 48404784, "step": 39900 }, { "epoch": 4.444258826149905, "grad_norm": 0.12326525151729584, "learning_rate": 3.4006860845374564e-05, "loss": 0.4622, "num_input_tokens_seen": 48411152, "step": 39905 }, { "epoch": 4.444815681033523, "grad_norm": 0.13420380651950836, "learning_rate": 3.400232753113327e-05, "loss": 0.4655, "num_input_tokens_seen": 48417648, "step": 39910 }, { "epoch": 4.44537253591714, "grad_norm": 0.12860652804374695, "learning_rate": 3.3997793876755194e-05, "loss": 0.4681, "num_input_tokens_seen": 48423856, "step": 39915 }, { "epoch": 4.4459293908007576, "grad_norm": 0.11195051670074463, "learning_rate": 3.3993259882411616e-05, "loss": 0.4692, "num_input_tokens_seen": 48429296, "step": 39920 }, { "epoch": 4.446486245684374, "grad_norm": 0.12537552416324615, "learning_rate": 3.398872554827386e-05, "loss": 0.4706, "num_input_tokens_seen": 48435248, "step": 39925 }, { "epoch": 4.447043100567992, "grad_norm": 0.08472682535648346, "learning_rate": 3.3984190874513256e-05, "loss": 0.4489, "num_input_tokens_seen": 48441488, "step": 39930 }, { "epoch": 4.44759995545161, "grad_norm": 0.11463979631662369, "learning_rate": 3.397965586130112e-05, "loss": 0.4499, "num_input_tokens_seen": 48447664, "step": 39935 }, { "epoch": 4.4481568103352265, "grad_norm": 0.1368405669927597, "learning_rate": 3.397512050880882e-05, "loss": 0.4684, "num_input_tokens_seen": 48453328, "step": 39940 }, { "epoch": 4.448713665218844, "grad_norm": 0.11733309179544449, "learning_rate": 3.3970584817207703e-05, "loss": 0.4604, "num_input_tokens_seen": 48459376, "step": 39945 }, { "epoch": 4.449270520102461, "grad_norm": 0.07656505703926086, "learning_rate": 3.396604878666914e-05, "loss": 0.4635, "num_input_tokens_seen": 48465264, "step": 39950 }, { "epoch": 4.449827374986079, "grad_norm": 0.10760073363780975, "learning_rate": 3.396151241736452e-05, "loss": 0.4543, "num_input_tokens_seen": 48471568, "step": 39955 }, { "epoch": 4.450384229869696, "grad_norm": 0.1141209751367569, "learning_rate": 3.395697570946525e-05, "loss": 0.4676, "num_input_tokens_seen": 48477648, "step": 39960 }, { "epoch": 4.450941084753313, "grad_norm": 0.12555132806301117, "learning_rate": 3.395243866314272e-05, "loss": 0.4716, "num_input_tokens_seen": 48483600, "step": 39965 }, { "epoch": 4.451497939636931, "grad_norm": 0.15677081048488617, "learning_rate": 3.394790127856837e-05, "loss": 0.4618, "num_input_tokens_seen": 48489552, "step": 39970 }, { "epoch": 4.4520547945205475, "grad_norm": 0.118509441614151, "learning_rate": 3.394336355591364e-05, "loss": 0.465, "num_input_tokens_seen": 48495696, "step": 39975 }, { "epoch": 4.452611649404165, "grad_norm": 0.11593375355005264, "learning_rate": 3.393882549534998e-05, "loss": 0.4642, "num_input_tokens_seen": 48501936, "step": 39980 }, { "epoch": 4.453168504287783, "grad_norm": 0.12706388533115387, "learning_rate": 3.3934287097048836e-05, "loss": 0.4681, "num_input_tokens_seen": 48507760, "step": 39985 }, { "epoch": 4.4537253591714, "grad_norm": 0.12407031655311584, "learning_rate": 3.39297483611817e-05, "loss": 0.4676, "num_input_tokens_seen": 48514000, "step": 39990 }, { "epoch": 4.454282214055017, "grad_norm": 0.10781815648078918, "learning_rate": 3.3925209287920054e-05, "loss": 0.4582, "num_input_tokens_seen": 48519984, "step": 39995 }, { "epoch": 4.454839068938634, "grad_norm": 0.0919768288731575, "learning_rate": 3.39206698774354e-05, "loss": 0.4675, "num_input_tokens_seen": 48525968, "step": 40000 }, { "epoch": 4.455395923822252, "grad_norm": 0.1303260773420334, "learning_rate": 3.3916130129899246e-05, "loss": 0.4647, "num_input_tokens_seen": 48532208, "step": 40005 }, { "epoch": 4.4559527787058695, "grad_norm": 0.12300363928079605, "learning_rate": 3.391159004548313e-05, "loss": 0.4736, "num_input_tokens_seen": 48538512, "step": 40010 }, { "epoch": 4.456509633589486, "grad_norm": 0.11476285010576248, "learning_rate": 3.3907049624358577e-05, "loss": 0.4562, "num_input_tokens_seen": 48544880, "step": 40015 }, { "epoch": 4.457066488473104, "grad_norm": 0.11079046875238419, "learning_rate": 3.390250886669715e-05, "loss": 0.4522, "num_input_tokens_seen": 48550832, "step": 40020 }, { "epoch": 4.457623343356722, "grad_norm": 0.10962465405464172, "learning_rate": 3.38979677726704e-05, "loss": 0.4672, "num_input_tokens_seen": 48556720, "step": 40025 }, { "epoch": 4.458180198240338, "grad_norm": 0.15172986686229706, "learning_rate": 3.389342634244992e-05, "loss": 0.4515, "num_input_tokens_seen": 48562704, "step": 40030 }, { "epoch": 4.458737053123956, "grad_norm": 0.10026133060455322, "learning_rate": 3.388888457620729e-05, "loss": 0.4726, "num_input_tokens_seen": 48569008, "step": 40035 }, { "epoch": 4.459293908007573, "grad_norm": 0.1569366157054901, "learning_rate": 3.3884342474114114e-05, "loss": 0.4645, "num_input_tokens_seen": 48574640, "step": 40040 }, { "epoch": 4.4598507628911905, "grad_norm": 0.09171800315380096, "learning_rate": 3.387980003634201e-05, "loss": 0.4664, "num_input_tokens_seen": 48580624, "step": 40045 }, { "epoch": 4.460407617774808, "grad_norm": 0.12541988492012024, "learning_rate": 3.38752572630626e-05, "loss": 0.4554, "num_input_tokens_seen": 48586160, "step": 40050 }, { "epoch": 4.460964472658425, "grad_norm": 0.10762401670217514, "learning_rate": 3.387071415444753e-05, "loss": 0.4566, "num_input_tokens_seen": 48592304, "step": 40055 }, { "epoch": 4.461521327542043, "grad_norm": 0.10025380551815033, "learning_rate": 3.386617071066846e-05, "loss": 0.471, "num_input_tokens_seen": 48598544, "step": 40060 }, { "epoch": 4.46207818242566, "grad_norm": 0.12419731169939041, "learning_rate": 3.386162693189704e-05, "loss": 0.4623, "num_input_tokens_seen": 48604464, "step": 40065 }, { "epoch": 4.462635037309277, "grad_norm": 0.14074641466140747, "learning_rate": 3.385708281830496e-05, "loss": 0.4487, "num_input_tokens_seen": 48610768, "step": 40070 }, { "epoch": 4.463191892192895, "grad_norm": 0.10325648635625839, "learning_rate": 3.38525383700639e-05, "loss": 0.4559, "num_input_tokens_seen": 48616720, "step": 40075 }, { "epoch": 4.463748747076512, "grad_norm": 0.13510966300964355, "learning_rate": 3.384799358734558e-05, "loss": 0.4685, "num_input_tokens_seen": 48622736, "step": 40080 }, { "epoch": 4.464305601960129, "grad_norm": 0.11893117427825928, "learning_rate": 3.38434484703217e-05, "loss": 0.4684, "num_input_tokens_seen": 48628784, "step": 40085 }, { "epoch": 4.464862456843747, "grad_norm": 0.18748106062412262, "learning_rate": 3.3838903019164e-05, "loss": 0.4588, "num_input_tokens_seen": 48634992, "step": 40090 }, { "epoch": 4.465419311727364, "grad_norm": 0.15069958567619324, "learning_rate": 3.3834357234044214e-05, "loss": 0.4564, "num_input_tokens_seen": 48641232, "step": 40095 }, { "epoch": 4.465976166610981, "grad_norm": 0.11443839967250824, "learning_rate": 3.382981111513411e-05, "loss": 0.463, "num_input_tokens_seen": 48647280, "step": 40100 }, { "epoch": 4.466533021494598, "grad_norm": 0.10909141600131989, "learning_rate": 3.382526466260544e-05, "loss": 0.4478, "num_input_tokens_seen": 48653232, "step": 40105 }, { "epoch": 4.467089876378216, "grad_norm": 0.10759974271059036, "learning_rate": 3.3820717876629985e-05, "loss": 0.453, "num_input_tokens_seen": 48659600, "step": 40110 }, { "epoch": 4.4676467312618335, "grad_norm": 0.1637365221977234, "learning_rate": 3.3816170757379546e-05, "loss": 0.4758, "num_input_tokens_seen": 48665584, "step": 40115 }, { "epoch": 4.46820358614545, "grad_norm": 0.09803483635187149, "learning_rate": 3.3811623305025925e-05, "loss": 0.4748, "num_input_tokens_seen": 48671280, "step": 40120 }, { "epoch": 4.468760441029068, "grad_norm": 0.09549587219953537, "learning_rate": 3.3807075519740936e-05, "loss": 0.4534, "num_input_tokens_seen": 48677424, "step": 40125 }, { "epoch": 4.469317295912685, "grad_norm": 0.1005052775144577, "learning_rate": 3.380252740169641e-05, "loss": 0.4578, "num_input_tokens_seen": 48683760, "step": 40130 }, { "epoch": 4.4698741507963025, "grad_norm": 0.11743184179067612, "learning_rate": 3.379797895106419e-05, "loss": 0.4777, "num_input_tokens_seen": 48689808, "step": 40135 }, { "epoch": 4.47043100567992, "grad_norm": 0.1266264021396637, "learning_rate": 3.379343016801612e-05, "loss": 0.4507, "num_input_tokens_seen": 48695440, "step": 40140 }, { "epoch": 4.470987860563537, "grad_norm": 0.10172529518604279, "learning_rate": 3.378888105272409e-05, "loss": 0.4583, "num_input_tokens_seen": 48701520, "step": 40145 }, { "epoch": 4.471544715447155, "grad_norm": 0.10057932883501053, "learning_rate": 3.3784331605359966e-05, "loss": 0.4521, "num_input_tokens_seen": 48707760, "step": 40150 }, { "epoch": 4.472101570330771, "grad_norm": 0.10339047759771347, "learning_rate": 3.377978182609565e-05, "loss": 0.4618, "num_input_tokens_seen": 48713616, "step": 40155 }, { "epoch": 4.472658425214389, "grad_norm": 0.09300554543733597, "learning_rate": 3.3775231715103025e-05, "loss": 0.4635, "num_input_tokens_seen": 48719984, "step": 40160 }, { "epoch": 4.473215280098007, "grad_norm": 0.07295439392328262, "learning_rate": 3.377068127255404e-05, "loss": 0.4547, "num_input_tokens_seen": 48725616, "step": 40165 }, { "epoch": 4.4737721349816235, "grad_norm": 0.08706655353307724, "learning_rate": 3.37661304986206e-05, "loss": 0.478, "num_input_tokens_seen": 48731504, "step": 40170 }, { "epoch": 4.474328989865241, "grad_norm": 0.10996631532907486, "learning_rate": 3.376157939347465e-05, "loss": 0.4592, "num_input_tokens_seen": 48737808, "step": 40175 }, { "epoch": 4.474885844748858, "grad_norm": 0.09168558567762375, "learning_rate": 3.3757027957288166e-05, "loss": 0.4825, "num_input_tokens_seen": 48743728, "step": 40180 }, { "epoch": 4.475442699632476, "grad_norm": 0.14302994310855865, "learning_rate": 3.3752476190233104e-05, "loss": 0.4813, "num_input_tokens_seen": 48749488, "step": 40185 }, { "epoch": 4.475999554516093, "grad_norm": 0.12435426563024521, "learning_rate": 3.374792409248144e-05, "loss": 0.4553, "num_input_tokens_seen": 48755504, "step": 40190 }, { "epoch": 4.47655640939971, "grad_norm": 0.10575543344020844, "learning_rate": 3.374337166420516e-05, "loss": 0.4495, "num_input_tokens_seen": 48761744, "step": 40195 }, { "epoch": 4.477113264283328, "grad_norm": 0.14895159006118774, "learning_rate": 3.3738818905576286e-05, "loss": 0.4775, "num_input_tokens_seen": 48767760, "step": 40200 }, { "epoch": 4.4776701191669455, "grad_norm": 0.10636553913354874, "learning_rate": 3.373426581676683e-05, "loss": 0.4566, "num_input_tokens_seen": 48773616, "step": 40205 }, { "epoch": 4.478226974050562, "grad_norm": 0.1021156832575798, "learning_rate": 3.372971239794882e-05, "loss": 0.4703, "num_input_tokens_seen": 48779728, "step": 40210 }, { "epoch": 4.47878382893418, "grad_norm": 0.087002694606781, "learning_rate": 3.37251586492943e-05, "loss": 0.4528, "num_input_tokens_seen": 48785712, "step": 40215 }, { "epoch": 4.479340683817797, "grad_norm": 0.11474082618951797, "learning_rate": 3.372060457097532e-05, "loss": 0.4503, "num_input_tokens_seen": 48791696, "step": 40220 }, { "epoch": 4.479897538701414, "grad_norm": 0.1769920140504837, "learning_rate": 3.371605016316397e-05, "loss": 0.4777, "num_input_tokens_seen": 48797968, "step": 40225 }, { "epoch": 4.480454393585032, "grad_norm": 0.14713019132614136, "learning_rate": 3.3711495426032294e-05, "loss": 0.4623, "num_input_tokens_seen": 48803856, "step": 40230 }, { "epoch": 4.481011248468649, "grad_norm": 0.0778379961848259, "learning_rate": 3.370694035975241e-05, "loss": 0.4718, "num_input_tokens_seen": 48809872, "step": 40235 }, { "epoch": 4.4815681033522665, "grad_norm": 0.09675629436969757, "learning_rate": 3.3702384964496414e-05, "loss": 0.454, "num_input_tokens_seen": 48815952, "step": 40240 }, { "epoch": 4.482124958235884, "grad_norm": 0.11370213329792023, "learning_rate": 3.369782924043643e-05, "loss": 0.454, "num_input_tokens_seen": 48822224, "step": 40245 }, { "epoch": 4.482681813119501, "grad_norm": 0.10226976871490479, "learning_rate": 3.369327318774458e-05, "loss": 0.4605, "num_input_tokens_seen": 48828656, "step": 40250 }, { "epoch": 4.483238668003119, "grad_norm": 0.12402713298797607, "learning_rate": 3.368871680659301e-05, "loss": 0.4673, "num_input_tokens_seen": 48834960, "step": 40255 }, { "epoch": 4.483795522886735, "grad_norm": 0.0687570795416832, "learning_rate": 3.368416009715388e-05, "loss": 0.4559, "num_input_tokens_seen": 48841360, "step": 40260 }, { "epoch": 4.484352377770353, "grad_norm": 0.10413862019777298, "learning_rate": 3.367960305959935e-05, "loss": 0.4628, "num_input_tokens_seen": 48847312, "step": 40265 }, { "epoch": 4.484909232653971, "grad_norm": 0.10971496254205704, "learning_rate": 3.36750456941016e-05, "loss": 0.443, "num_input_tokens_seen": 48853136, "step": 40270 }, { "epoch": 4.485466087537588, "grad_norm": 0.13487444818019867, "learning_rate": 3.3670488000832826e-05, "loss": 0.4787, "num_input_tokens_seen": 48858928, "step": 40275 }, { "epoch": 4.486022942421205, "grad_norm": 0.13480088114738464, "learning_rate": 3.366592997996522e-05, "loss": 0.46, "num_input_tokens_seen": 48865072, "step": 40280 }, { "epoch": 4.486579797304822, "grad_norm": 0.0979388952255249, "learning_rate": 3.3661371631671016e-05, "loss": 0.4694, "num_input_tokens_seen": 48871536, "step": 40285 }, { "epoch": 4.48713665218844, "grad_norm": 0.08540499955415726, "learning_rate": 3.3656812956122435e-05, "loss": 0.46, "num_input_tokens_seen": 48877936, "step": 40290 }, { "epoch": 4.487693507072057, "grad_norm": 0.10255516320466995, "learning_rate": 3.365225395349172e-05, "loss": 0.4705, "num_input_tokens_seen": 48884080, "step": 40295 }, { "epoch": 4.488250361955674, "grad_norm": 0.15951071679592133, "learning_rate": 3.364769462395111e-05, "loss": 0.4754, "num_input_tokens_seen": 48890448, "step": 40300 }, { "epoch": 4.488807216839292, "grad_norm": 0.0980219617486, "learning_rate": 3.36431349676729e-05, "loss": 0.4607, "num_input_tokens_seen": 48896624, "step": 40305 }, { "epoch": 4.489364071722909, "grad_norm": 0.10457172989845276, "learning_rate": 3.363857498482935e-05, "loss": 0.4516, "num_input_tokens_seen": 48902608, "step": 40310 }, { "epoch": 4.489920926606526, "grad_norm": 0.09173575788736343, "learning_rate": 3.363401467559275e-05, "loss": 0.4629, "num_input_tokens_seen": 48908720, "step": 40315 }, { "epoch": 4.490477781490144, "grad_norm": 0.12413327395915985, "learning_rate": 3.36294540401354e-05, "loss": 0.4704, "num_input_tokens_seen": 48914896, "step": 40320 }, { "epoch": 4.491034636373761, "grad_norm": 0.11233151704072952, "learning_rate": 3.362489307862963e-05, "loss": 0.4442, "num_input_tokens_seen": 48920784, "step": 40325 }, { "epoch": 4.491591491257378, "grad_norm": 0.11463086307048798, "learning_rate": 3.362033179124776e-05, "loss": 0.4543, "num_input_tokens_seen": 48926928, "step": 40330 }, { "epoch": 4.492148346140995, "grad_norm": 0.120476633310318, "learning_rate": 3.3615770178162134e-05, "loss": 0.4544, "num_input_tokens_seen": 48932944, "step": 40335 }, { "epoch": 4.492705201024613, "grad_norm": 0.10076753050088882, "learning_rate": 3.361120823954509e-05, "loss": 0.4626, "num_input_tokens_seen": 48939120, "step": 40340 }, { "epoch": 4.493262055908231, "grad_norm": 0.08457866311073303, "learning_rate": 3.3606645975569005e-05, "loss": 0.4569, "num_input_tokens_seen": 48944688, "step": 40345 }, { "epoch": 4.493818910791847, "grad_norm": 0.09946916997432709, "learning_rate": 3.360208338640626e-05, "loss": 0.4676, "num_input_tokens_seen": 48950608, "step": 40350 }, { "epoch": 4.494375765675465, "grad_norm": 0.10813706368207932, "learning_rate": 3.359752047222923e-05, "loss": 0.4453, "num_input_tokens_seen": 48956752, "step": 40355 }, { "epoch": 4.494932620559083, "grad_norm": 0.06198981776833534, "learning_rate": 3.3592957233210326e-05, "loss": 0.4623, "num_input_tokens_seen": 48962384, "step": 40360 }, { "epoch": 4.4954894754426995, "grad_norm": 0.09419529885053635, "learning_rate": 3.3588393669521966e-05, "loss": 0.4603, "num_input_tokens_seen": 48968624, "step": 40365 }, { "epoch": 4.496046330326317, "grad_norm": 0.12112991511821747, "learning_rate": 3.3583829781336565e-05, "loss": 0.4567, "num_input_tokens_seen": 48974224, "step": 40370 }, { "epoch": 4.496603185209934, "grad_norm": 0.10596670210361481, "learning_rate": 3.3579265568826565e-05, "loss": 0.4659, "num_input_tokens_seen": 48980592, "step": 40375 }, { "epoch": 4.497160040093552, "grad_norm": 0.09345981478691101, "learning_rate": 3.357470103216442e-05, "loss": 0.4695, "num_input_tokens_seen": 48986672, "step": 40380 }, { "epoch": 4.497716894977169, "grad_norm": 0.13208836317062378, "learning_rate": 3.35701361715226e-05, "loss": 0.4622, "num_input_tokens_seen": 48992464, "step": 40385 }, { "epoch": 4.498273749860786, "grad_norm": 0.11165782809257507, "learning_rate": 3.3565570987073556e-05, "loss": 0.457, "num_input_tokens_seen": 48998160, "step": 40390 }, { "epoch": 4.498830604744404, "grad_norm": 0.11001278460025787, "learning_rate": 3.35610054789898e-05, "loss": 0.4685, "num_input_tokens_seen": 49004304, "step": 40395 }, { "epoch": 4.499387459628021, "grad_norm": 0.1273721605539322, "learning_rate": 3.355643964744382e-05, "loss": 0.4708, "num_input_tokens_seen": 49010480, "step": 40400 }, { "epoch": 4.499944314511638, "grad_norm": 0.09222691506147385, "learning_rate": 3.355187349260813e-05, "loss": 0.4603, "num_input_tokens_seen": 49016752, "step": 40405 }, { "epoch": 4.500501169395256, "grad_norm": 0.09621814638376236, "learning_rate": 3.354730701465526e-05, "loss": 0.4528, "num_input_tokens_seen": 49022960, "step": 40410 }, { "epoch": 4.500501169395256, "eval_loss": 0.4643326997756958, "eval_runtime": 113.8047, "eval_samples_per_second": 35.069, "eval_steps_per_second": 8.769, "num_input_tokens_seen": 49022960, "step": 40410 }, { "epoch": 4.501058024278873, "grad_norm": 0.1280021369457245, "learning_rate": 3.3542740213757725e-05, "loss": 0.4662, "num_input_tokens_seen": 49029008, "step": 40415 }, { "epoch": 4.50161487916249, "grad_norm": 0.14134761691093445, "learning_rate": 3.3538173090088096e-05, "loss": 0.4637, "num_input_tokens_seen": 49035280, "step": 40420 }, { "epoch": 4.502171734046108, "grad_norm": 0.08907154947519302, "learning_rate": 3.353360564381892e-05, "loss": 0.462, "num_input_tokens_seen": 49040720, "step": 40425 }, { "epoch": 4.502728588929725, "grad_norm": 0.14795488119125366, "learning_rate": 3.352903787512279e-05, "loss": 0.4727, "num_input_tokens_seen": 49046832, "step": 40430 }, { "epoch": 4.5032854438133425, "grad_norm": 0.14073729515075684, "learning_rate": 3.352446978417226e-05, "loss": 0.456, "num_input_tokens_seen": 49053424, "step": 40435 }, { "epoch": 4.503842298696959, "grad_norm": 0.10052219033241272, "learning_rate": 3.3519901371139954e-05, "loss": 0.465, "num_input_tokens_seen": 49059504, "step": 40440 }, { "epoch": 4.504399153580577, "grad_norm": 0.15539653599262238, "learning_rate": 3.351533263619847e-05, "loss": 0.4777, "num_input_tokens_seen": 49065808, "step": 40445 }, { "epoch": 4.504956008464195, "grad_norm": 0.11514914780855179, "learning_rate": 3.351076357952043e-05, "loss": 0.455, "num_input_tokens_seen": 49072176, "step": 40450 }, { "epoch": 4.505512863347811, "grad_norm": 0.10278299450874329, "learning_rate": 3.3506194201278465e-05, "loss": 0.4674, "num_input_tokens_seen": 49078096, "step": 40455 }, { "epoch": 4.506069718231429, "grad_norm": 0.08353424817323685, "learning_rate": 3.3501624501645224e-05, "loss": 0.4686, "num_input_tokens_seen": 49083760, "step": 40460 }, { "epoch": 4.506626573115046, "grad_norm": 0.11373215913772583, "learning_rate": 3.3497054480793366e-05, "loss": 0.465, "num_input_tokens_seen": 49089616, "step": 40465 }, { "epoch": 4.507183427998664, "grad_norm": 0.14012648165225983, "learning_rate": 3.3492484138895556e-05, "loss": 0.4568, "num_input_tokens_seen": 49095344, "step": 40470 }, { "epoch": 4.507740282882281, "grad_norm": 0.12397348880767822, "learning_rate": 3.348791347612448e-05, "loss": 0.4702, "num_input_tokens_seen": 49101808, "step": 40475 }, { "epoch": 4.508297137765898, "grad_norm": 0.3106449544429779, "learning_rate": 3.348334249265284e-05, "loss": 0.4715, "num_input_tokens_seen": 49108176, "step": 40480 }, { "epoch": 4.508853992649516, "grad_norm": 0.08781084418296814, "learning_rate": 3.347877118865333e-05, "loss": 0.4581, "num_input_tokens_seen": 49113936, "step": 40485 }, { "epoch": 4.5094108475331325, "grad_norm": 0.09157314896583557, "learning_rate": 3.347419956429867e-05, "loss": 0.4585, "num_input_tokens_seen": 49120304, "step": 40490 }, { "epoch": 4.50996770241675, "grad_norm": 0.09310400485992432, "learning_rate": 3.3469627619761596e-05, "loss": 0.4524, "num_input_tokens_seen": 49126384, "step": 40495 }, { "epoch": 4.510524557300368, "grad_norm": 0.10983406752347946, "learning_rate": 3.346505535521485e-05, "loss": 0.4699, "num_input_tokens_seen": 49132080, "step": 40500 }, { "epoch": 4.511081412183985, "grad_norm": 0.08278696238994598, "learning_rate": 3.346048277083118e-05, "loss": 0.4548, "num_input_tokens_seen": 49138288, "step": 40505 }, { "epoch": 4.511638267067602, "grad_norm": 0.09483388066291809, "learning_rate": 3.345590986678336e-05, "loss": 0.4442, "num_input_tokens_seen": 49143888, "step": 40510 }, { "epoch": 4.512195121951219, "grad_norm": 0.10290930420160294, "learning_rate": 3.345133664324417e-05, "loss": 0.4548, "num_input_tokens_seen": 49150096, "step": 40515 }, { "epoch": 4.512751976834837, "grad_norm": 0.1368899792432785, "learning_rate": 3.344676310038639e-05, "loss": 0.4463, "num_input_tokens_seen": 49155376, "step": 40520 }, { "epoch": 4.513308831718454, "grad_norm": 0.12515097856521606, "learning_rate": 3.344218923838284e-05, "loss": 0.4502, "num_input_tokens_seen": 49161104, "step": 40525 }, { "epoch": 4.513865686602071, "grad_norm": 0.11231303215026855, "learning_rate": 3.343761505740633e-05, "loss": 0.4654, "num_input_tokens_seen": 49166832, "step": 40530 }, { "epoch": 4.514422541485689, "grad_norm": 0.1248365268111229, "learning_rate": 3.343304055762967e-05, "loss": 0.4395, "num_input_tokens_seen": 49173136, "step": 40535 }, { "epoch": 4.514979396369306, "grad_norm": 0.1219286248087883, "learning_rate": 3.342846573922572e-05, "loss": 0.4726, "num_input_tokens_seen": 49179280, "step": 40540 }, { "epoch": 4.515536251252923, "grad_norm": 0.1091647520661354, "learning_rate": 3.342389060236733e-05, "loss": 0.4645, "num_input_tokens_seen": 49185552, "step": 40545 }, { "epoch": 4.516093106136541, "grad_norm": 0.16905732452869415, "learning_rate": 3.3419315147227345e-05, "loss": 0.4445, "num_input_tokens_seen": 49191728, "step": 40550 }, { "epoch": 4.516649961020158, "grad_norm": 0.13419920206069946, "learning_rate": 3.341473937397866e-05, "loss": 0.4702, "num_input_tokens_seen": 49197456, "step": 40555 }, { "epoch": 4.5172068159037755, "grad_norm": 0.10433291643857956, "learning_rate": 3.341016328279415e-05, "loss": 0.4672, "num_input_tokens_seen": 49202864, "step": 40560 }, { "epoch": 4.517763670787393, "grad_norm": 0.11420758813619614, "learning_rate": 3.340558687384673e-05, "loss": 0.4788, "num_input_tokens_seen": 49209136, "step": 40565 }, { "epoch": 4.51832052567101, "grad_norm": 0.12270814180374146, "learning_rate": 3.340101014730929e-05, "loss": 0.4572, "num_input_tokens_seen": 49215408, "step": 40570 }, { "epoch": 4.518877380554628, "grad_norm": 0.13329508900642395, "learning_rate": 3.3396433103354765e-05, "loss": 0.4822, "num_input_tokens_seen": 49221616, "step": 40575 }, { "epoch": 4.519434235438244, "grad_norm": 0.08420093357563019, "learning_rate": 3.33918557421561e-05, "loss": 0.4496, "num_input_tokens_seen": 49227920, "step": 40580 }, { "epoch": 4.519991090321862, "grad_norm": 0.10059642046689987, "learning_rate": 3.338727806388622e-05, "loss": 0.4682, "num_input_tokens_seen": 49234096, "step": 40585 }, { "epoch": 4.52054794520548, "grad_norm": 0.12001170963048935, "learning_rate": 3.3382700068718104e-05, "loss": 0.4616, "num_input_tokens_seen": 49240176, "step": 40590 }, { "epoch": 4.5211048000890965, "grad_norm": 0.10564269870519638, "learning_rate": 3.3378121756824706e-05, "loss": 0.4459, "num_input_tokens_seen": 49246416, "step": 40595 }, { "epoch": 4.521661654972714, "grad_norm": 0.09342541545629501, "learning_rate": 3.337354312837903e-05, "loss": 0.4666, "num_input_tokens_seen": 49252656, "step": 40600 }, { "epoch": 4.522218509856332, "grad_norm": 0.11263683438301086, "learning_rate": 3.336896418355406e-05, "loss": 0.4744, "num_input_tokens_seen": 49258736, "step": 40605 }, { "epoch": 4.522775364739949, "grad_norm": 0.13444823026657104, "learning_rate": 3.33643849225228e-05, "loss": 0.4651, "num_input_tokens_seen": 49265232, "step": 40610 }, { "epoch": 4.523332219623566, "grad_norm": 0.1456424444913864, "learning_rate": 3.3359805345458267e-05, "loss": 0.4695, "num_input_tokens_seen": 49270512, "step": 40615 }, { "epoch": 4.523889074507183, "grad_norm": 0.1080336943268776, "learning_rate": 3.335522545253351e-05, "loss": 0.469, "num_input_tokens_seen": 49276784, "step": 40620 }, { "epoch": 4.524445929390801, "grad_norm": 0.09951061755418777, "learning_rate": 3.3350645243921543e-05, "loss": 0.4585, "num_input_tokens_seen": 49282736, "step": 40625 }, { "epoch": 4.5250027842744185, "grad_norm": 0.17945677042007446, "learning_rate": 3.334606471979546e-05, "loss": 0.462, "num_input_tokens_seen": 49288976, "step": 40630 }, { "epoch": 4.525559639158035, "grad_norm": 0.09066054224967957, "learning_rate": 3.334148388032829e-05, "loss": 0.471, "num_input_tokens_seen": 49294704, "step": 40635 }, { "epoch": 4.526116494041653, "grad_norm": 0.15677125751972198, "learning_rate": 3.333690272569312e-05, "loss": 0.4585, "num_input_tokens_seen": 49301072, "step": 40640 }, { "epoch": 4.52667334892527, "grad_norm": 0.09902125597000122, "learning_rate": 3.3332321256063064e-05, "loss": 0.4664, "num_input_tokens_seen": 49307024, "step": 40645 }, { "epoch": 4.527230203808887, "grad_norm": 0.10358012467622757, "learning_rate": 3.3327739471611204e-05, "loss": 0.4587, "num_input_tokens_seen": 49313040, "step": 40650 }, { "epoch": 4.527787058692505, "grad_norm": 0.1682821363210678, "learning_rate": 3.332315737251066e-05, "loss": 0.453, "num_input_tokens_seen": 49319344, "step": 40655 }, { "epoch": 4.528343913576122, "grad_norm": 0.1367620974779129, "learning_rate": 3.331857495893456e-05, "loss": 0.4757, "num_input_tokens_seen": 49325680, "step": 40660 }, { "epoch": 4.5289007684597395, "grad_norm": 0.09635923057794571, "learning_rate": 3.331399223105604e-05, "loss": 0.4572, "num_input_tokens_seen": 49331792, "step": 40665 }, { "epoch": 4.529457623343356, "grad_norm": 0.11905995011329651, "learning_rate": 3.330940918904824e-05, "loss": 0.4641, "num_input_tokens_seen": 49338160, "step": 40670 }, { "epoch": 4.530014478226974, "grad_norm": 0.0972619503736496, "learning_rate": 3.330482583308433e-05, "loss": 0.469, "num_input_tokens_seen": 49344272, "step": 40675 }, { "epoch": 4.530571333110592, "grad_norm": 0.14278313517570496, "learning_rate": 3.3300242163337495e-05, "loss": 0.4517, "num_input_tokens_seen": 49350096, "step": 40680 }, { "epoch": 4.5311281879942085, "grad_norm": 0.09803199768066406, "learning_rate": 3.3295658179980914e-05, "loss": 0.4776, "num_input_tokens_seen": 49355728, "step": 40685 }, { "epoch": 4.531685042877826, "grad_norm": 0.11377528309822083, "learning_rate": 3.329107388318778e-05, "loss": 0.4634, "num_input_tokens_seen": 49361808, "step": 40690 }, { "epoch": 4.532241897761443, "grad_norm": 0.1322145015001297, "learning_rate": 3.328648927313131e-05, "loss": 0.4612, "num_input_tokens_seen": 49367760, "step": 40695 }, { "epoch": 4.532798752645061, "grad_norm": 0.13468855619430542, "learning_rate": 3.3281904349984705e-05, "loss": 0.4538, "num_input_tokens_seen": 49373584, "step": 40700 }, { "epoch": 4.533355607528678, "grad_norm": 0.12619668245315552, "learning_rate": 3.327731911392122e-05, "loss": 0.4668, "num_input_tokens_seen": 49380080, "step": 40705 }, { "epoch": 4.533912462412295, "grad_norm": 0.09346887469291687, "learning_rate": 3.327273356511409e-05, "loss": 0.4746, "num_input_tokens_seen": 49385776, "step": 40710 }, { "epoch": 4.534469317295913, "grad_norm": 0.1350109726190567, "learning_rate": 3.3268147703736585e-05, "loss": 0.4632, "num_input_tokens_seen": 49392048, "step": 40715 }, { "epoch": 4.5350261721795295, "grad_norm": 0.10448935627937317, "learning_rate": 3.326356152996196e-05, "loss": 0.4556, "num_input_tokens_seen": 49398128, "step": 40720 }, { "epoch": 4.535583027063147, "grad_norm": 0.1882127821445465, "learning_rate": 3.325897504396349e-05, "loss": 0.4489, "num_input_tokens_seen": 49404432, "step": 40725 }, { "epoch": 4.536139881946765, "grad_norm": 0.09557167440652847, "learning_rate": 3.3254388245914486e-05, "loss": 0.4613, "num_input_tokens_seen": 49410544, "step": 40730 }, { "epoch": 4.536696736830382, "grad_norm": 0.14433355629444122, "learning_rate": 3.324980113598824e-05, "loss": 0.4696, "num_input_tokens_seen": 49416816, "step": 40735 }, { "epoch": 4.537253591713999, "grad_norm": 0.10768260806798935, "learning_rate": 3.3245213714358064e-05, "loss": 0.4755, "num_input_tokens_seen": 49422800, "step": 40740 }, { "epoch": 4.537810446597617, "grad_norm": 0.09454634040594101, "learning_rate": 3.324062598119729e-05, "loss": 0.4678, "num_input_tokens_seen": 49428624, "step": 40745 }, { "epoch": 4.538367301481234, "grad_norm": 0.0943189188838005, "learning_rate": 3.323603793667927e-05, "loss": 0.4662, "num_input_tokens_seen": 49434800, "step": 40750 }, { "epoch": 4.5389241563648515, "grad_norm": 0.09987188130617142, "learning_rate": 3.323144958097733e-05, "loss": 0.4558, "num_input_tokens_seen": 49440912, "step": 40755 }, { "epoch": 4.539481011248469, "grad_norm": 0.165458083152771, "learning_rate": 3.322686091426485e-05, "loss": 0.47, "num_input_tokens_seen": 49446992, "step": 40760 }, { "epoch": 4.540037866132086, "grad_norm": 0.09871889650821686, "learning_rate": 3.32222719367152e-05, "loss": 0.473, "num_input_tokens_seen": 49453104, "step": 40765 }, { "epoch": 4.540594721015704, "grad_norm": 0.08908320963382721, "learning_rate": 3.321768264850177e-05, "loss": 0.4704, "num_input_tokens_seen": 49459088, "step": 40770 }, { "epoch": 4.54115157589932, "grad_norm": 0.11148316413164139, "learning_rate": 3.321309304979795e-05, "loss": 0.4661, "num_input_tokens_seen": 49464752, "step": 40775 }, { "epoch": 4.541708430782938, "grad_norm": 0.12955541908740997, "learning_rate": 3.3208503140777154e-05, "loss": 0.4512, "num_input_tokens_seen": 49470960, "step": 40780 }, { "epoch": 4.542265285666556, "grad_norm": 0.10598959773778915, "learning_rate": 3.3203912921612804e-05, "loss": 0.4564, "num_input_tokens_seen": 49476816, "step": 40785 }, { "epoch": 4.5428221405501725, "grad_norm": 0.10964420437812805, "learning_rate": 3.319932239247834e-05, "loss": 0.4673, "num_input_tokens_seen": 49482800, "step": 40790 }, { "epoch": 4.54337899543379, "grad_norm": 0.08541034907102585, "learning_rate": 3.3194731553547185e-05, "loss": 0.4559, "num_input_tokens_seen": 49489392, "step": 40795 }, { "epoch": 4.543935850317407, "grad_norm": 0.08770455420017242, "learning_rate": 3.319014040499283e-05, "loss": 0.4638, "num_input_tokens_seen": 49495600, "step": 40800 }, { "epoch": 4.544492705201025, "grad_norm": 0.09851621836423874, "learning_rate": 3.318554894698871e-05, "loss": 0.4539, "num_input_tokens_seen": 49501168, "step": 40805 }, { "epoch": 4.545049560084642, "grad_norm": 0.11488388478755951, "learning_rate": 3.318095717970833e-05, "loss": 0.4698, "num_input_tokens_seen": 49507440, "step": 40810 }, { "epoch": 4.545606414968259, "grad_norm": 0.09499333053827286, "learning_rate": 3.3176365103325166e-05, "loss": 0.4653, "num_input_tokens_seen": 49513264, "step": 40815 }, { "epoch": 4.546163269851877, "grad_norm": 0.0715642049908638, "learning_rate": 3.3171772718012716e-05, "loss": 0.4606, "num_input_tokens_seen": 49519632, "step": 40820 }, { "epoch": 4.546720124735494, "grad_norm": 0.0947926938533783, "learning_rate": 3.316718002394451e-05, "loss": 0.4461, "num_input_tokens_seen": 49525424, "step": 40825 }, { "epoch": 4.547276979619111, "grad_norm": 0.0934138223528862, "learning_rate": 3.316258702129408e-05, "loss": 0.4529, "num_input_tokens_seen": 49531184, "step": 40830 }, { "epoch": 4.547833834502729, "grad_norm": 0.10660732537508011, "learning_rate": 3.315799371023495e-05, "loss": 0.4536, "num_input_tokens_seen": 49537232, "step": 40835 }, { "epoch": 4.548390689386346, "grad_norm": 0.08832568675279617, "learning_rate": 3.315340009094067e-05, "loss": 0.4662, "num_input_tokens_seen": 49543440, "step": 40840 }, { "epoch": 4.548947544269963, "grad_norm": 0.15259207785129547, "learning_rate": 3.31488061635848e-05, "loss": 0.4588, "num_input_tokens_seen": 49549424, "step": 40845 }, { "epoch": 4.54950439915358, "grad_norm": 0.107300765812397, "learning_rate": 3.314421192834093e-05, "loss": 0.4648, "num_input_tokens_seen": 49555280, "step": 40850 }, { "epoch": 4.550061254037198, "grad_norm": 0.08793140947818756, "learning_rate": 3.313961738538263e-05, "loss": 0.4506, "num_input_tokens_seen": 49561840, "step": 40855 }, { "epoch": 4.5506181089208155, "grad_norm": 0.10586518049240112, "learning_rate": 3.31350225348835e-05, "loss": 0.4638, "num_input_tokens_seen": 49567696, "step": 40860 }, { "epoch": 4.551174963804432, "grad_norm": 0.0868489071726799, "learning_rate": 3.313042737701714e-05, "loss": 0.4635, "num_input_tokens_seen": 49573872, "step": 40865 }, { "epoch": 4.55173181868805, "grad_norm": 0.11168522387742996, "learning_rate": 3.3125831911957195e-05, "loss": 0.4616, "num_input_tokens_seen": 49580240, "step": 40870 }, { "epoch": 4.552288673571667, "grad_norm": 0.09105650335550308, "learning_rate": 3.312123613987727e-05, "loss": 0.467, "num_input_tokens_seen": 49585904, "step": 40875 }, { "epoch": 4.5528455284552845, "grad_norm": 0.10728716850280762, "learning_rate": 3.3116640060951024e-05, "loss": 0.467, "num_input_tokens_seen": 49592144, "step": 40880 }, { "epoch": 4.553402383338902, "grad_norm": 0.13262753188610077, "learning_rate": 3.31120436753521e-05, "loss": 0.4666, "num_input_tokens_seen": 49598448, "step": 40885 }, { "epoch": 4.553959238222519, "grad_norm": 0.0968884602189064, "learning_rate": 3.310744698325416e-05, "loss": 0.4577, "num_input_tokens_seen": 49604432, "step": 40890 }, { "epoch": 4.554516093106137, "grad_norm": 0.09789537638425827, "learning_rate": 3.310284998483091e-05, "loss": 0.4681, "num_input_tokens_seen": 49610576, "step": 40895 }, { "epoch": 4.555072947989753, "grad_norm": 0.10616883635520935, "learning_rate": 3.309825268025601e-05, "loss": 0.4608, "num_input_tokens_seen": 49616272, "step": 40900 }, { "epoch": 4.555629802873371, "grad_norm": 0.11700098216533661, "learning_rate": 3.3093655069703176e-05, "loss": 0.4636, "num_input_tokens_seen": 49622640, "step": 40905 }, { "epoch": 4.556186657756989, "grad_norm": 0.12687426805496216, "learning_rate": 3.308905715334612e-05, "loss": 0.4728, "num_input_tokens_seen": 49628816, "step": 40910 }, { "epoch": 4.5567435126406055, "grad_norm": 0.09447161108255386, "learning_rate": 3.308445893135855e-05, "loss": 0.464, "num_input_tokens_seen": 49635248, "step": 40915 }, { "epoch": 4.557300367524223, "grad_norm": 0.10188566893339157, "learning_rate": 3.3079860403914226e-05, "loss": 0.4673, "num_input_tokens_seen": 49641456, "step": 40920 }, { "epoch": 4.557857222407841, "grad_norm": 0.09815314412117004, "learning_rate": 3.3075261571186874e-05, "loss": 0.4593, "num_input_tokens_seen": 49646928, "step": 40925 }, { "epoch": 4.558414077291458, "grad_norm": 0.0869402140378952, "learning_rate": 3.3070662433350266e-05, "loss": 0.4567, "num_input_tokens_seen": 49652912, "step": 40930 }, { "epoch": 4.558970932175075, "grad_norm": 0.1024627685546875, "learning_rate": 3.306606299057816e-05, "loss": 0.4622, "num_input_tokens_seen": 49659024, "step": 40935 }, { "epoch": 4.559527787058693, "grad_norm": 0.10710816830396652, "learning_rate": 3.306146324304436e-05, "loss": 0.4764, "num_input_tokens_seen": 49665232, "step": 40940 }, { "epoch": 4.56008464194231, "grad_norm": 0.13811038434505463, "learning_rate": 3.305686319092264e-05, "loss": 0.458, "num_input_tokens_seen": 49671376, "step": 40945 }, { "epoch": 4.5606414968259275, "grad_norm": 0.1119375079870224, "learning_rate": 3.30522628343868e-05, "loss": 0.4649, "num_input_tokens_seen": 49677040, "step": 40950 }, { "epoch": 4.561198351709544, "grad_norm": 0.10905952751636505, "learning_rate": 3.304766217361067e-05, "loss": 0.4714, "num_input_tokens_seen": 49682672, "step": 40955 }, { "epoch": 4.561755206593162, "grad_norm": 0.09477438032627106, "learning_rate": 3.304306120876807e-05, "loss": 0.4544, "num_input_tokens_seen": 49688752, "step": 40960 }, { "epoch": 4.56231206147678, "grad_norm": 0.09206748753786087, "learning_rate": 3.303845994003285e-05, "loss": 0.4546, "num_input_tokens_seen": 49694832, "step": 40965 }, { "epoch": 4.562868916360396, "grad_norm": 0.0931628867983818, "learning_rate": 3.303385836757885e-05, "loss": 0.4674, "num_input_tokens_seen": 49700688, "step": 40970 }, { "epoch": 4.563425771244014, "grad_norm": 0.10929029434919357, "learning_rate": 3.3029256491579933e-05, "loss": 0.4526, "num_input_tokens_seen": 49706736, "step": 40975 }, { "epoch": 4.563982626127631, "grad_norm": 0.09250841289758682, "learning_rate": 3.302465431220997e-05, "loss": 0.4578, "num_input_tokens_seen": 49713008, "step": 40980 }, { "epoch": 4.5645394810112485, "grad_norm": 0.11438935250043869, "learning_rate": 3.3020051829642865e-05, "loss": 0.4818, "num_input_tokens_seen": 49718512, "step": 40985 }, { "epoch": 4.565096335894866, "grad_norm": 0.12914904952049255, "learning_rate": 3.3015449044052486e-05, "loss": 0.4631, "num_input_tokens_seen": 49724016, "step": 40990 }, { "epoch": 4.565653190778483, "grad_norm": 0.10757438093423843, "learning_rate": 3.301084595561276e-05, "loss": 0.469, "num_input_tokens_seen": 49730192, "step": 40995 }, { "epoch": 4.566210045662101, "grad_norm": 0.09628751128911972, "learning_rate": 3.300624256449761e-05, "loss": 0.4583, "num_input_tokens_seen": 49736496, "step": 41000 }, { "epoch": 4.566766900545717, "grad_norm": 0.09102611243724823, "learning_rate": 3.3001638870880946e-05, "loss": 0.4574, "num_input_tokens_seen": 49742800, "step": 41005 }, { "epoch": 4.567323755429335, "grad_norm": 0.10454314947128296, "learning_rate": 3.2997034874936736e-05, "loss": 0.4611, "num_input_tokens_seen": 49749232, "step": 41010 }, { "epoch": 4.567880610312953, "grad_norm": 0.08853840082883835, "learning_rate": 3.2992430576838905e-05, "loss": 0.4541, "num_input_tokens_seen": 49754928, "step": 41015 }, { "epoch": 4.56843746519657, "grad_norm": 0.1111551970243454, "learning_rate": 3.298782597676144e-05, "loss": 0.467, "num_input_tokens_seen": 49761392, "step": 41020 }, { "epoch": 4.568994320080187, "grad_norm": 0.11602282524108887, "learning_rate": 3.298322107487832e-05, "loss": 0.4639, "num_input_tokens_seen": 49767056, "step": 41025 }, { "epoch": 4.569551174963804, "grad_norm": 0.07692454010248184, "learning_rate": 3.297861587136352e-05, "loss": 0.479, "num_input_tokens_seen": 49773040, "step": 41030 }, { "epoch": 4.570108029847422, "grad_norm": 0.1342965066432953, "learning_rate": 3.297401036639104e-05, "loss": 0.4588, "num_input_tokens_seen": 49779344, "step": 41035 }, { "epoch": 4.570664884731039, "grad_norm": 0.13905243575572968, "learning_rate": 3.296940456013489e-05, "loss": 0.4566, "num_input_tokens_seen": 49785360, "step": 41040 }, { "epoch": 4.571221739614656, "grad_norm": 0.11464338004589081, "learning_rate": 3.2964798452769096e-05, "loss": 0.4681, "num_input_tokens_seen": 49791568, "step": 41045 }, { "epoch": 4.571778594498274, "grad_norm": 0.10073385387659073, "learning_rate": 3.29601920444677e-05, "loss": 0.4556, "num_input_tokens_seen": 49797456, "step": 41050 }, { "epoch": 4.572335449381891, "grad_norm": 0.09147725254297256, "learning_rate": 3.2955585335404744e-05, "loss": 0.4514, "num_input_tokens_seen": 49803632, "step": 41055 }, { "epoch": 4.572892304265508, "grad_norm": 0.10091230273246765, "learning_rate": 3.2950978325754265e-05, "loss": 0.4667, "num_input_tokens_seen": 49809040, "step": 41060 }, { "epoch": 4.573449159149126, "grad_norm": 0.10842656344175339, "learning_rate": 3.294637101569036e-05, "loss": 0.4487, "num_input_tokens_seen": 49815056, "step": 41065 }, { "epoch": 4.574006014032743, "grad_norm": 0.0868581160902977, "learning_rate": 3.2941763405387084e-05, "loss": 0.4595, "num_input_tokens_seen": 49821296, "step": 41070 }, { "epoch": 4.57456286891636, "grad_norm": 0.14486950635910034, "learning_rate": 3.293715549501853e-05, "loss": 0.4683, "num_input_tokens_seen": 49827376, "step": 41075 }, { "epoch": 4.575119723799978, "grad_norm": 0.09857475012540817, "learning_rate": 3.2932547284758815e-05, "loss": 0.4634, "num_input_tokens_seen": 49833552, "step": 41080 }, { "epoch": 4.575676578683595, "grad_norm": 0.09051750600337982, "learning_rate": 3.2927938774782044e-05, "loss": 0.4702, "num_input_tokens_seen": 49839536, "step": 41085 }, { "epoch": 4.576233433567213, "grad_norm": 0.11249516159296036, "learning_rate": 3.292332996526233e-05, "loss": 0.4698, "num_input_tokens_seen": 49845520, "step": 41090 }, { "epoch": 4.576790288450829, "grad_norm": 0.08600159734487534, "learning_rate": 3.2918720856373825e-05, "loss": 0.4616, "num_input_tokens_seen": 49851632, "step": 41095 }, { "epoch": 4.577347143334447, "grad_norm": 0.12941524386405945, "learning_rate": 3.291411144829067e-05, "loss": 0.4617, "num_input_tokens_seen": 49857616, "step": 41100 }, { "epoch": 4.577903998218065, "grad_norm": 0.09113026410341263, "learning_rate": 3.290950174118702e-05, "loss": 0.4737, "num_input_tokens_seen": 49863856, "step": 41105 }, { "epoch": 4.5784608531016815, "grad_norm": 0.09937328100204468, "learning_rate": 3.2904891735237046e-05, "loss": 0.4719, "num_input_tokens_seen": 49870032, "step": 41110 }, { "epoch": 4.579017707985299, "grad_norm": 0.09180689603090286, "learning_rate": 3.2900281430614924e-05, "loss": 0.4671, "num_input_tokens_seen": 49875856, "step": 41115 }, { "epoch": 4.579574562868917, "grad_norm": 0.10179184377193451, "learning_rate": 3.289567082749486e-05, "loss": 0.4614, "num_input_tokens_seen": 49882160, "step": 41120 }, { "epoch": 4.580131417752534, "grad_norm": 0.10979174822568893, "learning_rate": 3.289105992605105e-05, "loss": 0.4561, "num_input_tokens_seen": 49888464, "step": 41125 }, { "epoch": 4.580688272636151, "grad_norm": 0.0928892195224762, "learning_rate": 3.288644872645771e-05, "loss": 0.4525, "num_input_tokens_seen": 49894992, "step": 41130 }, { "epoch": 4.581245127519768, "grad_norm": 0.08624677360057831, "learning_rate": 3.288183722888905e-05, "loss": 0.4648, "num_input_tokens_seen": 49900176, "step": 41135 }, { "epoch": 4.581801982403386, "grad_norm": 0.12510541081428528, "learning_rate": 3.287722543351933e-05, "loss": 0.4721, "num_input_tokens_seen": 49905712, "step": 41140 }, { "epoch": 4.582358837287003, "grad_norm": 0.07050587236881256, "learning_rate": 3.28726133405228e-05, "loss": 0.4644, "num_input_tokens_seen": 49911728, "step": 41145 }, { "epoch": 4.58291569217062, "grad_norm": 0.13031005859375, "learning_rate": 3.2868000950073697e-05, "loss": 0.4517, "num_input_tokens_seen": 49917936, "step": 41150 }, { "epoch": 4.583472547054238, "grad_norm": 0.09914414584636688, "learning_rate": 3.2863388262346305e-05, "loss": 0.4659, "num_input_tokens_seen": 49924240, "step": 41155 }, { "epoch": 4.584029401937855, "grad_norm": 0.1216830313205719, "learning_rate": 3.2858775277514905e-05, "loss": 0.4609, "num_input_tokens_seen": 49930416, "step": 41160 }, { "epoch": 4.584586256821472, "grad_norm": 0.09498687088489532, "learning_rate": 3.285416199575379e-05, "loss": 0.455, "num_input_tokens_seen": 49936240, "step": 41165 }, { "epoch": 4.58514311170509, "grad_norm": 0.09875626862049103, "learning_rate": 3.284954841723727e-05, "loss": 0.473, "num_input_tokens_seen": 49942384, "step": 41170 }, { "epoch": 4.585699966588707, "grad_norm": 0.090086929500103, "learning_rate": 3.284493454213965e-05, "loss": 0.4683, "num_input_tokens_seen": 49948560, "step": 41175 }, { "epoch": 4.5862568214723245, "grad_norm": 0.09628051519393921, "learning_rate": 3.284032037063527e-05, "loss": 0.4681, "num_input_tokens_seen": 49954928, "step": 41180 }, { "epoch": 4.586813676355941, "grad_norm": 0.0865512266755104, "learning_rate": 3.283570590289845e-05, "loss": 0.46, "num_input_tokens_seen": 49960368, "step": 41185 }, { "epoch": 4.587370531239559, "grad_norm": 0.1418224275112152, "learning_rate": 3.283109113910356e-05, "loss": 0.4604, "num_input_tokens_seen": 49966672, "step": 41190 }, { "epoch": 4.587927386123177, "grad_norm": 0.09517331421375275, "learning_rate": 3.282647607942495e-05, "loss": 0.4673, "num_input_tokens_seen": 49972848, "step": 41195 }, { "epoch": 4.588484241006793, "grad_norm": 0.12341314554214478, "learning_rate": 3.2821860724036976e-05, "loss": 0.4752, "num_input_tokens_seen": 49978768, "step": 41200 }, { "epoch": 4.589041095890411, "grad_norm": 0.13699443638324738, "learning_rate": 3.281724507311406e-05, "loss": 0.4684, "num_input_tokens_seen": 49984816, "step": 41205 }, { "epoch": 4.589597950774028, "grad_norm": 0.10203055292367935, "learning_rate": 3.281262912683056e-05, "loss": 0.4581, "num_input_tokens_seen": 49990576, "step": 41210 }, { "epoch": 4.5901548056576456, "grad_norm": 0.09590379148721695, "learning_rate": 3.28080128853609e-05, "loss": 0.4661, "num_input_tokens_seen": 49996656, "step": 41215 }, { "epoch": 4.590711660541263, "grad_norm": 0.10894034057855606, "learning_rate": 3.280339634887949e-05, "loss": 0.4834, "num_input_tokens_seen": 50002864, "step": 41220 }, { "epoch": 4.59126851542488, "grad_norm": 0.10832904279232025, "learning_rate": 3.279877951756076e-05, "loss": 0.4597, "num_input_tokens_seen": 50009040, "step": 41225 }, { "epoch": 4.591825370308498, "grad_norm": 0.10330704599618912, "learning_rate": 3.279416239157914e-05, "loss": 0.4678, "num_input_tokens_seen": 50015280, "step": 41230 }, { "epoch": 4.5923822251921145, "grad_norm": 0.08704590797424316, "learning_rate": 3.2789544971109093e-05, "loss": 0.466, "num_input_tokens_seen": 50021360, "step": 41235 }, { "epoch": 4.592939080075732, "grad_norm": 0.0724259465932846, "learning_rate": 3.278492725632507e-05, "loss": 0.4587, "num_input_tokens_seen": 50027344, "step": 41240 }, { "epoch": 4.59349593495935, "grad_norm": 0.11132574826478958, "learning_rate": 3.278030924740155e-05, "loss": 0.4608, "num_input_tokens_seen": 50032752, "step": 41245 }, { "epoch": 4.594052789842967, "grad_norm": 0.10310740023851395, "learning_rate": 3.277569094451302e-05, "loss": 0.456, "num_input_tokens_seen": 50038928, "step": 41250 }, { "epoch": 4.594609644726584, "grad_norm": 0.11129213869571686, "learning_rate": 3.277107234783396e-05, "loss": 0.4721, "num_input_tokens_seen": 50044912, "step": 41255 }, { "epoch": 4.595166499610202, "grad_norm": 0.10130689293146133, "learning_rate": 3.276645345753889e-05, "loss": 0.4671, "num_input_tokens_seen": 50050672, "step": 41260 }, { "epoch": 4.595723354493819, "grad_norm": 0.09714748710393906, "learning_rate": 3.276183427380231e-05, "loss": 0.4582, "num_input_tokens_seen": 50057040, "step": 41265 }, { "epoch": 4.596280209377436, "grad_norm": 0.1385837346315384, "learning_rate": 3.275721479679876e-05, "loss": 0.4627, "num_input_tokens_seen": 50062960, "step": 41270 }, { "epoch": 4.596837064261053, "grad_norm": 0.08688610047101974, "learning_rate": 3.275259502670278e-05, "loss": 0.4525, "num_input_tokens_seen": 50068848, "step": 41275 }, { "epoch": 4.597393919144671, "grad_norm": 0.10597439855337143, "learning_rate": 3.274797496368892e-05, "loss": 0.4687, "num_input_tokens_seen": 50075056, "step": 41280 }, { "epoch": 4.597950774028289, "grad_norm": 0.08935894072055817, "learning_rate": 3.274335460793173e-05, "loss": 0.4621, "num_input_tokens_seen": 50080208, "step": 41285 }, { "epoch": 4.598507628911905, "grad_norm": 0.08413548767566681, "learning_rate": 3.273873395960579e-05, "loss": 0.4803, "num_input_tokens_seen": 50086384, "step": 41290 }, { "epoch": 4.599064483795523, "grad_norm": 0.09672967344522476, "learning_rate": 3.273411301888567e-05, "loss": 0.451, "num_input_tokens_seen": 50092688, "step": 41295 }, { "epoch": 4.599621338679141, "grad_norm": 0.08252496272325516, "learning_rate": 3.272949178594599e-05, "loss": 0.4569, "num_input_tokens_seen": 50098608, "step": 41300 }, { "epoch": 4.6001781935627575, "grad_norm": 0.10195443779230118, "learning_rate": 3.272487026096133e-05, "loss": 0.4508, "num_input_tokens_seen": 50104240, "step": 41305 }, { "epoch": 4.600735048446375, "grad_norm": 0.08218622207641602, "learning_rate": 3.2720248444106324e-05, "loss": 0.4479, "num_input_tokens_seen": 50110160, "step": 41310 }, { "epoch": 4.601291903329992, "grad_norm": 0.11195550113916397, "learning_rate": 3.2715626335555585e-05, "loss": 0.4833, "num_input_tokens_seen": 50116144, "step": 41315 }, { "epoch": 4.60184875821361, "grad_norm": 0.09712086617946625, "learning_rate": 3.271100393548376e-05, "loss": 0.476, "num_input_tokens_seen": 50122288, "step": 41320 }, { "epoch": 4.602405613097227, "grad_norm": 0.10206416994333267, "learning_rate": 3.270638124406549e-05, "loss": 0.4748, "num_input_tokens_seen": 50128400, "step": 41325 }, { "epoch": 4.602962467980844, "grad_norm": 0.0829913392663002, "learning_rate": 3.270175826147544e-05, "loss": 0.4503, "num_input_tokens_seen": 50134544, "step": 41330 }, { "epoch": 4.603519322864462, "grad_norm": 0.08138199895620346, "learning_rate": 3.269713498788829e-05, "loss": 0.4603, "num_input_tokens_seen": 50140848, "step": 41335 }, { "epoch": 4.6040761777480785, "grad_norm": 0.10500670969486237, "learning_rate": 3.269251142347871e-05, "loss": 0.46, "num_input_tokens_seen": 50147056, "step": 41340 }, { "epoch": 4.604633032631696, "grad_norm": 0.08736928552389145, "learning_rate": 3.26878875684214e-05, "loss": 0.4673, "num_input_tokens_seen": 50153104, "step": 41345 }, { "epoch": 4.605189887515314, "grad_norm": 0.09373775869607925, "learning_rate": 3.2683263422891056e-05, "loss": 0.4677, "num_input_tokens_seen": 50159440, "step": 41350 }, { "epoch": 4.605746742398931, "grad_norm": 0.11196190118789673, "learning_rate": 3.26786389870624e-05, "loss": 0.472, "num_input_tokens_seen": 50165904, "step": 41355 }, { "epoch": 4.606303597282548, "grad_norm": 0.11118291318416595, "learning_rate": 3.2674014261110155e-05, "loss": 0.4667, "num_input_tokens_seen": 50172080, "step": 41360 }, { "epoch": 4.606860452166165, "grad_norm": 0.10776041448116302, "learning_rate": 3.266938924520906e-05, "loss": 0.4668, "num_input_tokens_seen": 50178352, "step": 41365 }, { "epoch": 4.607417307049783, "grad_norm": 0.11372620612382889, "learning_rate": 3.2664763939533856e-05, "loss": 0.462, "num_input_tokens_seen": 50183664, "step": 41370 }, { "epoch": 4.6079741619334005, "grad_norm": 0.08453810214996338, "learning_rate": 3.266013834425931e-05, "loss": 0.4698, "num_input_tokens_seen": 50189968, "step": 41375 }, { "epoch": 4.608531016817017, "grad_norm": 0.09968248754739761, "learning_rate": 3.265551245956019e-05, "loss": 0.4617, "num_input_tokens_seen": 50195216, "step": 41380 }, { "epoch": 4.609087871700635, "grad_norm": 0.11174977570772171, "learning_rate": 3.265088628561127e-05, "loss": 0.4561, "num_input_tokens_seen": 50201136, "step": 41385 }, { "epoch": 4.609644726584252, "grad_norm": 0.10091444104909897, "learning_rate": 3.2646259822587353e-05, "loss": 0.4618, "num_input_tokens_seen": 50207312, "step": 41390 }, { "epoch": 4.610201581467869, "grad_norm": 0.1100374162197113, "learning_rate": 3.264163307066324e-05, "loss": 0.468, "num_input_tokens_seen": 50213392, "step": 41395 }, { "epoch": 4.610758436351487, "grad_norm": 0.14325185120105743, "learning_rate": 3.263700603001374e-05, "loss": 0.4564, "num_input_tokens_seen": 50219472, "step": 41400 }, { "epoch": 4.611315291235104, "grad_norm": 0.13018707931041718, "learning_rate": 3.263237870081368e-05, "loss": 0.4473, "num_input_tokens_seen": 50225584, "step": 41405 }, { "epoch": 4.6118721461187215, "grad_norm": 0.10588313639163971, "learning_rate": 3.2627751083237886e-05, "loss": 0.4786, "num_input_tokens_seen": 50231536, "step": 41410 }, { "epoch": 4.612429001002338, "grad_norm": 0.10818550735712051, "learning_rate": 3.262312317746121e-05, "loss": 0.4601, "num_input_tokens_seen": 50237808, "step": 41415 }, { "epoch": 4.612985855885956, "grad_norm": 0.11352044343948364, "learning_rate": 3.261849498365852e-05, "loss": 0.4626, "num_input_tokens_seen": 50243536, "step": 41420 }, { "epoch": 4.613542710769574, "grad_norm": 0.07843633741140366, "learning_rate": 3.2613866502004664e-05, "loss": 0.462, "num_input_tokens_seen": 50249488, "step": 41425 }, { "epoch": 4.6140995656531905, "grad_norm": 0.09982365369796753, "learning_rate": 3.260923773267454e-05, "loss": 0.481, "num_input_tokens_seen": 50254800, "step": 41430 }, { "epoch": 4.614656420536808, "grad_norm": 0.1064324602484703, "learning_rate": 3.2604608675843026e-05, "loss": 0.4679, "num_input_tokens_seen": 50261104, "step": 41435 }, { "epoch": 4.615213275420426, "grad_norm": 0.09859701991081238, "learning_rate": 3.259997933168503e-05, "loss": 0.4587, "num_input_tokens_seen": 50267408, "step": 41440 }, { "epoch": 4.615770130304043, "grad_norm": 0.11768978834152222, "learning_rate": 3.259534970037546e-05, "loss": 0.4555, "num_input_tokens_seen": 50273712, "step": 41445 }, { "epoch": 4.61632698518766, "grad_norm": 0.11275717616081238, "learning_rate": 3.259071978208923e-05, "loss": 0.4781, "num_input_tokens_seen": 50280176, "step": 41450 }, { "epoch": 4.616883840071277, "grad_norm": 0.13014835119247437, "learning_rate": 3.258608957700129e-05, "loss": 0.4697, "num_input_tokens_seen": 50286416, "step": 41455 }, { "epoch": 4.617440694954895, "grad_norm": 0.12683576345443726, "learning_rate": 3.258145908528657e-05, "loss": 0.472, "num_input_tokens_seen": 50292304, "step": 41460 }, { "epoch": 4.617997549838512, "grad_norm": 0.09103816747665405, "learning_rate": 3.257682830712003e-05, "loss": 0.4679, "num_input_tokens_seen": 50298448, "step": 41465 }, { "epoch": 4.618554404722129, "grad_norm": 0.13772344589233398, "learning_rate": 3.257219724267664e-05, "loss": 0.4556, "num_input_tokens_seen": 50304816, "step": 41470 }, { "epoch": 4.619111259605747, "grad_norm": 0.09587927907705307, "learning_rate": 3.256756589213137e-05, "loss": 0.4627, "num_input_tokens_seen": 50310800, "step": 41475 }, { "epoch": 4.6196681144893645, "grad_norm": 0.13773761689662933, "learning_rate": 3.25629342556592e-05, "loss": 0.4685, "num_input_tokens_seen": 50317008, "step": 41480 }, { "epoch": 4.620224969372981, "grad_norm": 0.0857592299580574, "learning_rate": 3.2558302333435145e-05, "loss": 0.4657, "num_input_tokens_seen": 50323120, "step": 41485 }, { "epoch": 4.620781824256599, "grad_norm": 0.1163891926407814, "learning_rate": 3.255367012563421e-05, "loss": 0.4646, "num_input_tokens_seen": 50329040, "step": 41490 }, { "epoch": 4.621338679140216, "grad_norm": 0.08728333562612534, "learning_rate": 3.25490376324314e-05, "loss": 0.4475, "num_input_tokens_seen": 50334896, "step": 41495 }, { "epoch": 4.6218955340238335, "grad_norm": 0.12450147420167923, "learning_rate": 3.254440485400176e-05, "loss": 0.4647, "num_input_tokens_seen": 50340432, "step": 41500 }, { "epoch": 4.622452388907451, "grad_norm": 0.10305514931678772, "learning_rate": 3.2539771790520336e-05, "loss": 0.4719, "num_input_tokens_seen": 50346544, "step": 41505 }, { "epoch": 4.623009243791068, "grad_norm": 0.08586026728153229, "learning_rate": 3.2535138442162166e-05, "loss": 0.4617, "num_input_tokens_seen": 50352432, "step": 41510 }, { "epoch": 4.623566098674686, "grad_norm": 0.13317443430423737, "learning_rate": 3.253050480910231e-05, "loss": 0.465, "num_input_tokens_seen": 50358640, "step": 41515 }, { "epoch": 4.624122953558302, "grad_norm": 0.09655629843473434, "learning_rate": 3.252587089151586e-05, "loss": 0.4598, "num_input_tokens_seen": 50364688, "step": 41520 }, { "epoch": 4.62467980844192, "grad_norm": 0.09406909346580505, "learning_rate": 3.2521236689577886e-05, "loss": 0.4611, "num_input_tokens_seen": 50370736, "step": 41525 }, { "epoch": 4.625236663325538, "grad_norm": 0.09702315926551819, "learning_rate": 3.251660220346349e-05, "loss": 0.4627, "num_input_tokens_seen": 50376848, "step": 41530 }, { "epoch": 4.6257935182091545, "grad_norm": 0.12004689127206802, "learning_rate": 3.2511967433347774e-05, "loss": 0.4587, "num_input_tokens_seen": 50382384, "step": 41535 }, { "epoch": 4.626350373092772, "grad_norm": 0.11433082073926926, "learning_rate": 3.250733237940585e-05, "loss": 0.4608, "num_input_tokens_seen": 50388784, "step": 41540 }, { "epoch": 4.626907227976389, "grad_norm": 0.11048681288957596, "learning_rate": 3.2502697041812855e-05, "loss": 0.4656, "num_input_tokens_seen": 50395024, "step": 41545 }, { "epoch": 4.627464082860007, "grad_norm": 0.08338907361030579, "learning_rate": 3.249806142074392e-05, "loss": 0.471, "num_input_tokens_seen": 50400880, "step": 41550 }, { "epoch": 4.628020937743624, "grad_norm": 0.08991418033838272, "learning_rate": 3.24934255163742e-05, "loss": 0.4815, "num_input_tokens_seen": 50406992, "step": 41555 }, { "epoch": 4.628577792627241, "grad_norm": 0.08974412083625793, "learning_rate": 3.248878932887885e-05, "loss": 0.458, "num_input_tokens_seen": 50413168, "step": 41560 }, { "epoch": 4.629134647510859, "grad_norm": 0.10631610453128815, "learning_rate": 3.248415285843304e-05, "loss": 0.474, "num_input_tokens_seen": 50419024, "step": 41565 }, { "epoch": 4.629691502394476, "grad_norm": 0.09552083164453506, "learning_rate": 3.247951610521194e-05, "loss": 0.4584, "num_input_tokens_seen": 50425200, "step": 41570 }, { "epoch": 4.630248357278093, "grad_norm": 0.13010257482528687, "learning_rate": 3.247487906939076e-05, "loss": 0.4537, "num_input_tokens_seen": 50431216, "step": 41575 }, { "epoch": 4.630805212161711, "grad_norm": 0.105073481798172, "learning_rate": 3.2470241751144696e-05, "loss": 0.4628, "num_input_tokens_seen": 50437200, "step": 41580 }, { "epoch": 4.631362067045328, "grad_norm": 0.10040319710969925, "learning_rate": 3.246560415064896e-05, "loss": 0.4687, "num_input_tokens_seen": 50443216, "step": 41585 }, { "epoch": 4.631918921928945, "grad_norm": 0.07575607299804688, "learning_rate": 3.246096626807877e-05, "loss": 0.4707, "num_input_tokens_seen": 50448976, "step": 41590 }, { "epoch": 4.632475776812562, "grad_norm": 0.14068655669689178, "learning_rate": 3.245632810360937e-05, "loss": 0.4589, "num_input_tokens_seen": 50454768, "step": 41595 }, { "epoch": 4.63303263169618, "grad_norm": 0.0946081355214119, "learning_rate": 3.2451689657416e-05, "loss": 0.4655, "num_input_tokens_seen": 50460848, "step": 41600 }, { "epoch": 4.6335894865797975, "grad_norm": 0.12960878014564514, "learning_rate": 3.24470509296739e-05, "loss": 0.4568, "num_input_tokens_seen": 50467248, "step": 41605 }, { "epoch": 4.634146341463414, "grad_norm": 0.09865851700305939, "learning_rate": 3.2442411920558365e-05, "loss": 0.4476, "num_input_tokens_seen": 50473424, "step": 41610 }, { "epoch": 4.634703196347032, "grad_norm": 0.08121908456087112, "learning_rate": 3.243777263024465e-05, "loss": 0.4754, "num_input_tokens_seen": 50479632, "step": 41615 }, { "epoch": 4.63526005123065, "grad_norm": 0.07891533523797989, "learning_rate": 3.2433133058908055e-05, "loss": 0.4693, "num_input_tokens_seen": 50485680, "step": 41620 }, { "epoch": 4.6358169061142664, "grad_norm": 0.13026823103427887, "learning_rate": 3.242849320672387e-05, "loss": 0.4699, "num_input_tokens_seen": 50491504, "step": 41625 }, { "epoch": 4.636373760997884, "grad_norm": 0.09672736376523972, "learning_rate": 3.24238530738674e-05, "loss": 0.4688, "num_input_tokens_seen": 50496944, "step": 41630 }, { "epoch": 4.636930615881501, "grad_norm": 0.09697791934013367, "learning_rate": 3.2419212660513984e-05, "loss": 0.4754, "num_input_tokens_seen": 50503120, "step": 41635 }, { "epoch": 4.637487470765119, "grad_norm": 0.0912589281797409, "learning_rate": 3.241457196683893e-05, "loss": 0.4727, "num_input_tokens_seen": 50509360, "step": 41640 }, { "epoch": 4.638044325648736, "grad_norm": 0.08998849242925644, "learning_rate": 3.240993099301758e-05, "loss": 0.4728, "num_input_tokens_seen": 50515056, "step": 41645 }, { "epoch": 4.638601180532353, "grad_norm": 0.09840632975101471, "learning_rate": 3.24052897392253e-05, "loss": 0.4674, "num_input_tokens_seen": 50521136, "step": 41650 }, { "epoch": 4.639158035415971, "grad_norm": 0.09466637670993805, "learning_rate": 3.240064820563744e-05, "loss": 0.4702, "num_input_tokens_seen": 50527504, "step": 41655 }, { "epoch": 4.639714890299588, "grad_norm": 0.14149034023284912, "learning_rate": 3.239600639242937e-05, "loss": 0.4673, "num_input_tokens_seen": 50533776, "step": 41660 }, { "epoch": 4.640271745183205, "grad_norm": 0.09428998082876205, "learning_rate": 3.239136429977648e-05, "loss": 0.468, "num_input_tokens_seen": 50540144, "step": 41665 }, { "epoch": 4.640828600066823, "grad_norm": 0.09753118455410004, "learning_rate": 3.238672192785416e-05, "loss": 0.4619, "num_input_tokens_seen": 50546064, "step": 41670 }, { "epoch": 4.64138545495044, "grad_norm": 0.11872032284736633, "learning_rate": 3.238207927683781e-05, "loss": 0.4706, "num_input_tokens_seen": 50552400, "step": 41675 }, { "epoch": 4.641942309834057, "grad_norm": 0.10319548845291138, "learning_rate": 3.2377436346902856e-05, "loss": 0.4615, "num_input_tokens_seen": 50558320, "step": 41680 }, { "epoch": 4.642499164717675, "grad_norm": 0.1026182472705841, "learning_rate": 3.237279313822471e-05, "loss": 0.4686, "num_input_tokens_seen": 50564368, "step": 41685 }, { "epoch": 4.643056019601292, "grad_norm": 0.1042189672589302, "learning_rate": 3.236814965097881e-05, "loss": 0.4657, "num_input_tokens_seen": 50570192, "step": 41690 }, { "epoch": 4.6436128744849094, "grad_norm": 0.10236606746912003, "learning_rate": 3.2363505885340605e-05, "loss": 0.4613, "num_input_tokens_seen": 50576176, "step": 41695 }, { "epoch": 4.644169729368526, "grad_norm": 0.09169042110443115, "learning_rate": 3.235886184148555e-05, "loss": 0.4614, "num_input_tokens_seen": 50582576, "step": 41700 }, { "epoch": 4.644726584252144, "grad_norm": 0.10988010466098785, "learning_rate": 3.2354217519589115e-05, "loss": 0.4655, "num_input_tokens_seen": 50588272, "step": 41705 }, { "epoch": 4.645283439135762, "grad_norm": 0.11621442437171936, "learning_rate": 3.234957291982677e-05, "loss": 0.4617, "num_input_tokens_seen": 50593712, "step": 41710 }, { "epoch": 4.645840294019378, "grad_norm": 0.11835166066884995, "learning_rate": 3.2344928042374015e-05, "loss": 0.4598, "num_input_tokens_seen": 50599856, "step": 41715 }, { "epoch": 4.646397148902996, "grad_norm": 0.09466679394245148, "learning_rate": 3.234028288740633e-05, "loss": 0.4652, "num_input_tokens_seen": 50606064, "step": 41720 }, { "epoch": 4.646954003786613, "grad_norm": 0.12210559844970703, "learning_rate": 3.2335637455099246e-05, "loss": 0.4609, "num_input_tokens_seen": 50611824, "step": 41725 }, { "epoch": 4.6475108586702305, "grad_norm": 0.09766966849565506, "learning_rate": 3.233099174562826e-05, "loss": 0.4708, "num_input_tokens_seen": 50618192, "step": 41730 }, { "epoch": 4.648067713553848, "grad_norm": 0.1018369048833847, "learning_rate": 3.232634575916891e-05, "loss": 0.4681, "num_input_tokens_seen": 50624272, "step": 41735 }, { "epoch": 4.648624568437465, "grad_norm": 0.11432327330112457, "learning_rate": 3.232169949589675e-05, "loss": 0.4661, "num_input_tokens_seen": 50630512, "step": 41740 }, { "epoch": 4.649181423321083, "grad_norm": 0.07235885411500931, "learning_rate": 3.2317052955987315e-05, "loss": 0.4605, "num_input_tokens_seen": 50635792, "step": 41745 }, { "epoch": 4.649738278204699, "grad_norm": 0.11545837670564651, "learning_rate": 3.231240613961617e-05, "loss": 0.4667, "num_input_tokens_seen": 50641840, "step": 41750 }, { "epoch": 4.650295133088317, "grad_norm": 0.10248177498579025, "learning_rate": 3.2307759046958894e-05, "loss": 0.4649, "num_input_tokens_seen": 50647856, "step": 41755 }, { "epoch": 4.650851987971935, "grad_norm": 0.09528334438800812, "learning_rate": 3.2303111678191054e-05, "loss": 0.4742, "num_input_tokens_seen": 50654032, "step": 41760 }, { "epoch": 4.651408842855552, "grad_norm": 0.10894368588924408, "learning_rate": 3.229846403348825e-05, "loss": 0.4648, "num_input_tokens_seen": 50659952, "step": 41765 }, { "epoch": 4.651965697739169, "grad_norm": 0.12261519581079483, "learning_rate": 3.2293816113026094e-05, "loss": 0.4726, "num_input_tokens_seen": 50666032, "step": 41770 }, { "epoch": 4.652522552622786, "grad_norm": 0.08620376139879227, "learning_rate": 3.228916791698018e-05, "loss": 0.4658, "num_input_tokens_seen": 50672080, "step": 41775 }, { "epoch": 4.653079407506404, "grad_norm": 0.1043786108493805, "learning_rate": 3.228451944552615e-05, "loss": 0.4586, "num_input_tokens_seen": 50678128, "step": 41780 }, { "epoch": 4.653636262390021, "grad_norm": 0.08429615944623947, "learning_rate": 3.2279870698839634e-05, "loss": 0.4674, "num_input_tokens_seen": 50684048, "step": 41785 }, { "epoch": 4.654193117273638, "grad_norm": 0.12943831086158752, "learning_rate": 3.227522167709627e-05, "loss": 0.4552, "num_input_tokens_seen": 50690000, "step": 41790 }, { "epoch": 4.654749972157256, "grad_norm": 0.10618654638528824, "learning_rate": 3.227057238047171e-05, "loss": 0.477, "num_input_tokens_seen": 50696144, "step": 41795 }, { "epoch": 4.6553068270408735, "grad_norm": 0.12134188413619995, "learning_rate": 3.226592280914163e-05, "loss": 0.4599, "num_input_tokens_seen": 50702352, "step": 41800 }, { "epoch": 4.65586368192449, "grad_norm": 0.08243734389543533, "learning_rate": 3.2261272963281706e-05, "loss": 0.4532, "num_input_tokens_seen": 50708080, "step": 41805 }, { "epoch": 4.656420536808108, "grad_norm": 0.10593622177839279, "learning_rate": 3.225662284306762e-05, "loss": 0.4718, "num_input_tokens_seen": 50714384, "step": 41810 }, { "epoch": 4.656977391691725, "grad_norm": 0.09526446461677551, "learning_rate": 3.225197244867506e-05, "loss": 0.4595, "num_input_tokens_seen": 50720208, "step": 41815 }, { "epoch": 4.657534246575342, "grad_norm": 0.1053244024515152, "learning_rate": 3.224732178027974e-05, "loss": 0.4664, "num_input_tokens_seen": 50726448, "step": 41820 }, { "epoch": 4.65809110145896, "grad_norm": 0.0905497670173645, "learning_rate": 3.2242670838057386e-05, "loss": 0.4711, "num_input_tokens_seen": 50732304, "step": 41825 }, { "epoch": 4.658647956342577, "grad_norm": 0.1076459288597107, "learning_rate": 3.223801962218372e-05, "loss": 0.4602, "num_input_tokens_seen": 50738800, "step": 41830 }, { "epoch": 4.659204811226195, "grad_norm": 0.0979146733880043, "learning_rate": 3.223336813283447e-05, "loss": 0.4738, "num_input_tokens_seen": 50744944, "step": 41835 }, { "epoch": 4.659761666109812, "grad_norm": 0.10095171630382538, "learning_rate": 3.22287163701854e-05, "loss": 0.4726, "num_input_tokens_seen": 50751088, "step": 41840 }, { "epoch": 4.660318520993429, "grad_norm": 0.09766281396150589, "learning_rate": 3.222406433441225e-05, "loss": 0.454, "num_input_tokens_seen": 50757168, "step": 41845 }, { "epoch": 4.660875375877047, "grad_norm": 0.1278539001941681, "learning_rate": 3.2219412025690805e-05, "loss": 0.4588, "num_input_tokens_seen": 50763632, "step": 41850 }, { "epoch": 4.6614322307606635, "grad_norm": 0.08535677194595337, "learning_rate": 3.221475944419683e-05, "loss": 0.4719, "num_input_tokens_seen": 50769264, "step": 41855 }, { "epoch": 4.661989085644281, "grad_norm": 0.09954673051834106, "learning_rate": 3.2210106590106124e-05, "loss": 0.461, "num_input_tokens_seen": 50775344, "step": 41860 }, { "epoch": 4.662545940527899, "grad_norm": 0.09355168789625168, "learning_rate": 3.2205453463594495e-05, "loss": 0.4617, "num_input_tokens_seen": 50781904, "step": 41865 }, { "epoch": 4.663102795411516, "grad_norm": 0.1107148677110672, "learning_rate": 3.2200800064837735e-05, "loss": 0.464, "num_input_tokens_seen": 50788208, "step": 41870 }, { "epoch": 4.663659650295133, "grad_norm": 0.11145297437906265, "learning_rate": 3.2196146394011674e-05, "loss": 0.4686, "num_input_tokens_seen": 50794608, "step": 41875 }, { "epoch": 4.66421650517875, "grad_norm": 0.13967011868953705, "learning_rate": 3.2191492451292144e-05, "loss": 0.47, "num_input_tokens_seen": 50800560, "step": 41880 }, { "epoch": 4.664773360062368, "grad_norm": 0.11019615828990936, "learning_rate": 3.2186838236854975e-05, "loss": 0.4607, "num_input_tokens_seen": 50806576, "step": 41885 }, { "epoch": 4.665330214945985, "grad_norm": 0.08542187511920929, "learning_rate": 3.2182183750876024e-05, "loss": 0.4563, "num_input_tokens_seen": 50812784, "step": 41890 }, { "epoch": 4.665887069829602, "grad_norm": 0.0807328075170517, "learning_rate": 3.217752899353117e-05, "loss": 0.4678, "num_input_tokens_seen": 50818896, "step": 41895 }, { "epoch": 4.66644392471322, "grad_norm": 0.08960361778736115, "learning_rate": 3.2172873964996255e-05, "loss": 0.4709, "num_input_tokens_seen": 50824976, "step": 41900 }, { "epoch": 4.667000779596837, "grad_norm": 0.10999980568885803, "learning_rate": 3.216821866544719e-05, "loss": 0.4706, "num_input_tokens_seen": 50831024, "step": 41905 }, { "epoch": 4.667557634480454, "grad_norm": 0.0973396971821785, "learning_rate": 3.216356309505984e-05, "loss": 0.4595, "num_input_tokens_seen": 50837232, "step": 41910 }, { "epoch": 4.668114489364072, "grad_norm": 0.0910872295498848, "learning_rate": 3.215890725401012e-05, "loss": 0.4651, "num_input_tokens_seen": 50843376, "step": 41915 }, { "epoch": 4.668671344247689, "grad_norm": 0.10476355999708176, "learning_rate": 3.215425114247395e-05, "loss": 0.4628, "num_input_tokens_seen": 50849520, "step": 41920 }, { "epoch": 4.6692281991313065, "grad_norm": 0.08534055948257446, "learning_rate": 3.214959476062723e-05, "loss": 0.462, "num_input_tokens_seen": 50855664, "step": 41925 }, { "epoch": 4.669785054014923, "grad_norm": 0.1254539042711258, "learning_rate": 3.214493810864592e-05, "loss": 0.458, "num_input_tokens_seen": 50862256, "step": 41930 }, { "epoch": 4.670341908898541, "grad_norm": 0.07995908707380295, "learning_rate": 3.214028118670595e-05, "loss": 0.4614, "num_input_tokens_seen": 50868496, "step": 41935 }, { "epoch": 4.670898763782159, "grad_norm": 0.09899864345788956, "learning_rate": 3.213562399498328e-05, "loss": 0.4513, "num_input_tokens_seen": 50874512, "step": 41940 }, { "epoch": 4.671455618665775, "grad_norm": 0.10048352181911469, "learning_rate": 3.213096653365386e-05, "loss": 0.4716, "num_input_tokens_seen": 50880336, "step": 41945 }, { "epoch": 4.672012473549393, "grad_norm": 0.09622488170862198, "learning_rate": 3.212630880289367e-05, "loss": 0.4713, "num_input_tokens_seen": 50886352, "step": 41950 }, { "epoch": 4.67256932843301, "grad_norm": 0.10510526597499847, "learning_rate": 3.2121650802878696e-05, "loss": 0.4579, "num_input_tokens_seen": 50892080, "step": 41955 }, { "epoch": 4.6731261833166275, "grad_norm": 0.12974834442138672, "learning_rate": 3.211699253378494e-05, "loss": 0.4618, "num_input_tokens_seen": 50898128, "step": 41960 }, { "epoch": 4.673683038200245, "grad_norm": 0.11898034811019897, "learning_rate": 3.211233399578839e-05, "loss": 0.4679, "num_input_tokens_seen": 50904304, "step": 41965 }, { "epoch": 4.674239893083862, "grad_norm": 0.08351003378629684, "learning_rate": 3.210767518906508e-05, "loss": 0.473, "num_input_tokens_seen": 50910352, "step": 41970 }, { "epoch": 4.67479674796748, "grad_norm": 0.11819521337747574, "learning_rate": 3.2103016113791014e-05, "loss": 0.465, "num_input_tokens_seen": 50916368, "step": 41975 }, { "epoch": 4.675353602851097, "grad_norm": 0.10308286547660828, "learning_rate": 3.209835677014224e-05, "loss": 0.4614, "num_input_tokens_seen": 50921648, "step": 41980 }, { "epoch": 4.675910457734714, "grad_norm": 0.08258208632469177, "learning_rate": 3.20936971582948e-05, "loss": 0.4601, "num_input_tokens_seen": 50927344, "step": 41985 }, { "epoch": 4.676467312618332, "grad_norm": 0.12023302167654037, "learning_rate": 3.208903727842475e-05, "loss": 0.4563, "num_input_tokens_seen": 50933392, "step": 41990 }, { "epoch": 4.677024167501949, "grad_norm": 0.09242010116577148, "learning_rate": 3.2084377130708145e-05, "loss": 0.4524, "num_input_tokens_seen": 50939600, "step": 41995 }, { "epoch": 4.677581022385566, "grad_norm": 0.09081520885229111, "learning_rate": 3.207971671532108e-05, "loss": 0.4606, "num_input_tokens_seen": 50945744, "step": 42000 }, { "epoch": 4.678137877269184, "grad_norm": 0.13674993813037872, "learning_rate": 3.207505603243962e-05, "loss": 0.4623, "num_input_tokens_seen": 50952016, "step": 42005 }, { "epoch": 4.678694732152801, "grad_norm": 0.08082206547260284, "learning_rate": 3.2070395082239876e-05, "loss": 0.465, "num_input_tokens_seen": 50957936, "step": 42010 }, { "epoch": 4.679251587036418, "grad_norm": 0.09903185069561005, "learning_rate": 3.206573386489795e-05, "loss": 0.4489, "num_input_tokens_seen": 50963760, "step": 42015 }, { "epoch": 4.679808441920036, "grad_norm": 0.10464835911989212, "learning_rate": 3.206107238058995e-05, "loss": 0.4612, "num_input_tokens_seen": 50969936, "step": 42020 }, { "epoch": 4.680365296803653, "grad_norm": 0.0905175730586052, "learning_rate": 3.2056410629492004e-05, "loss": 0.4628, "num_input_tokens_seen": 50975824, "step": 42025 }, { "epoch": 4.6809221516872705, "grad_norm": 0.0980057343840599, "learning_rate": 3.205174861178026e-05, "loss": 0.448, "num_input_tokens_seen": 50981840, "step": 42030 }, { "epoch": 4.681479006570887, "grad_norm": 0.08785741031169891, "learning_rate": 3.204708632763085e-05, "loss": 0.4586, "num_input_tokens_seen": 50987920, "step": 42035 }, { "epoch": 4.682035861454505, "grad_norm": 0.12273848056793213, "learning_rate": 3.204242377721993e-05, "loss": 0.4701, "num_input_tokens_seen": 50994192, "step": 42040 }, { "epoch": 4.682592716338123, "grad_norm": 0.16283050179481506, "learning_rate": 3.203776096072368e-05, "loss": 0.4626, "num_input_tokens_seen": 51000528, "step": 42045 }, { "epoch": 4.6831495712217395, "grad_norm": 0.11096501350402832, "learning_rate": 3.203309787831826e-05, "loss": 0.4732, "num_input_tokens_seen": 51006352, "step": 42050 }, { "epoch": 4.683706426105357, "grad_norm": 0.08248799294233322, "learning_rate": 3.2028434530179876e-05, "loss": 0.4643, "num_input_tokens_seen": 51012720, "step": 42055 }, { "epoch": 4.684263280988974, "grad_norm": 0.10500682890415192, "learning_rate": 3.2023770916484703e-05, "loss": 0.4681, "num_input_tokens_seen": 51018256, "step": 42060 }, { "epoch": 4.684820135872592, "grad_norm": 0.12004060298204422, "learning_rate": 3.201910703740896e-05, "loss": 0.4618, "num_input_tokens_seen": 51023504, "step": 42065 }, { "epoch": 4.685376990756209, "grad_norm": 0.1587180495262146, "learning_rate": 3.201444289312885e-05, "loss": 0.4462, "num_input_tokens_seen": 51029776, "step": 42070 }, { "epoch": 4.685933845639826, "grad_norm": 0.10106176882982254, "learning_rate": 3.200977848382061e-05, "loss": 0.4449, "num_input_tokens_seen": 51035568, "step": 42075 }, { "epoch": 4.686490700523444, "grad_norm": 0.10182423144578934, "learning_rate": 3.200511380966048e-05, "loss": 0.4654, "num_input_tokens_seen": 51041584, "step": 42080 }, { "epoch": 4.6870475554070605, "grad_norm": 0.08422654867172241, "learning_rate": 3.20004488708247e-05, "loss": 0.4646, "num_input_tokens_seen": 51047696, "step": 42085 }, { "epoch": 4.687604410290678, "grad_norm": 0.10319488495588303, "learning_rate": 3.199578366748953e-05, "loss": 0.4656, "num_input_tokens_seen": 51053872, "step": 42090 }, { "epoch": 4.688161265174296, "grad_norm": 0.1154012531042099, "learning_rate": 3.1991118199831236e-05, "loss": 0.4585, "num_input_tokens_seen": 51060144, "step": 42095 }, { "epoch": 4.688718120057913, "grad_norm": 0.1138778030872345, "learning_rate": 3.198645246802609e-05, "loss": 0.4776, "num_input_tokens_seen": 51066064, "step": 42100 }, { "epoch": 4.68927497494153, "grad_norm": 0.11833630502223969, "learning_rate": 3.198178647225037e-05, "loss": 0.4744, "num_input_tokens_seen": 51072112, "step": 42105 }, { "epoch": 4.689831829825147, "grad_norm": 0.1524789184331894, "learning_rate": 3.1977120212680395e-05, "loss": 0.4569, "num_input_tokens_seen": 51078000, "step": 42110 }, { "epoch": 4.690388684708765, "grad_norm": 0.09795812517404556, "learning_rate": 3.197245368949245e-05, "loss": 0.4726, "num_input_tokens_seen": 51084336, "step": 42115 }, { "epoch": 4.6909455395923825, "grad_norm": 0.10617613792419434, "learning_rate": 3.196778690286287e-05, "loss": 0.4666, "num_input_tokens_seen": 51090576, "step": 42120 }, { "epoch": 4.691502394475999, "grad_norm": 0.09123548865318298, "learning_rate": 3.196311985296797e-05, "loss": 0.4613, "num_input_tokens_seen": 51096976, "step": 42125 }, { "epoch": 4.692059249359617, "grad_norm": 0.0966426432132721, "learning_rate": 3.1958452539984084e-05, "loss": 0.4787, "num_input_tokens_seen": 51102864, "step": 42130 }, { "epoch": 4.692616104243234, "grad_norm": 0.11565805971622467, "learning_rate": 3.195378496408756e-05, "loss": 0.4612, "num_input_tokens_seen": 51109104, "step": 42135 }, { "epoch": 4.693172959126851, "grad_norm": 0.12193971872329712, "learning_rate": 3.1949117125454757e-05, "loss": 0.4736, "num_input_tokens_seen": 51115216, "step": 42140 }, { "epoch": 4.693729814010469, "grad_norm": 0.08415383100509644, "learning_rate": 3.1944449024262045e-05, "loss": 0.4605, "num_input_tokens_seen": 51120880, "step": 42145 }, { "epoch": 4.694286668894086, "grad_norm": 0.08506447821855545, "learning_rate": 3.1939780660685794e-05, "loss": 0.4628, "num_input_tokens_seen": 51127088, "step": 42150 }, { "epoch": 4.6948435237777035, "grad_norm": 0.10895629972219467, "learning_rate": 3.1935112034902384e-05, "loss": 0.4662, "num_input_tokens_seen": 51133200, "step": 42155 }, { "epoch": 4.695400378661321, "grad_norm": 0.08439049124717712, "learning_rate": 3.193044314708822e-05, "loss": 0.4625, "num_input_tokens_seen": 51139376, "step": 42160 }, { "epoch": 4.695957233544938, "grad_norm": 0.12361451238393784, "learning_rate": 3.1925773997419694e-05, "loss": 0.4634, "num_input_tokens_seen": 51145296, "step": 42165 }, { "epoch": 4.696514088428556, "grad_norm": 0.12894979119300842, "learning_rate": 3.192110458607324e-05, "loss": 0.4601, "num_input_tokens_seen": 51151024, "step": 42170 }, { "epoch": 4.697070943312173, "grad_norm": 0.08347469568252563, "learning_rate": 3.1916434913225283e-05, "loss": 0.4581, "num_input_tokens_seen": 51157360, "step": 42175 }, { "epoch": 4.69762779819579, "grad_norm": 0.08913004398345947, "learning_rate": 3.1911764979052244e-05, "loss": 0.459, "num_input_tokens_seen": 51163408, "step": 42180 }, { "epoch": 4.698184653079408, "grad_norm": 0.11764663457870483, "learning_rate": 3.190709478373057e-05, "loss": 0.4653, "num_input_tokens_seen": 51169360, "step": 42185 }, { "epoch": 4.698741507963025, "grad_norm": 0.1500798761844635, "learning_rate": 3.1902424327436734e-05, "loss": 0.4677, "num_input_tokens_seen": 51175248, "step": 42190 }, { "epoch": 4.699298362846642, "grad_norm": 0.08387178182601929, "learning_rate": 3.1897753610347164e-05, "loss": 0.4502, "num_input_tokens_seen": 51180560, "step": 42195 }, { "epoch": 4.69985521773026, "grad_norm": 0.0951242446899414, "learning_rate": 3.189308263263837e-05, "loss": 0.4729, "num_input_tokens_seen": 51186736, "step": 42200 }, { "epoch": 4.700412072613877, "grad_norm": 0.09971597790718079, "learning_rate": 3.1888411394486826e-05, "loss": 0.466, "num_input_tokens_seen": 51192976, "step": 42205 }, { "epoch": 4.700968927497494, "grad_norm": 0.08041910827159882, "learning_rate": 3.188373989606903e-05, "loss": 0.4696, "num_input_tokens_seen": 51198928, "step": 42210 }, { "epoch": 4.701525782381111, "grad_norm": 0.108662448823452, "learning_rate": 3.187906813756147e-05, "loss": 0.455, "num_input_tokens_seen": 51204656, "step": 42215 }, { "epoch": 4.702082637264729, "grad_norm": 0.1120491772890091, "learning_rate": 3.187439611914067e-05, "loss": 0.465, "num_input_tokens_seen": 51210608, "step": 42220 }, { "epoch": 4.7026394921483465, "grad_norm": 0.1263083517551422, "learning_rate": 3.1869723840983154e-05, "loss": 0.4743, "num_input_tokens_seen": 51216720, "step": 42225 }, { "epoch": 4.703196347031963, "grad_norm": 0.092225082218647, "learning_rate": 3.186505130326546e-05, "loss": 0.4652, "num_input_tokens_seen": 51222672, "step": 42230 }, { "epoch": 4.703753201915581, "grad_norm": 0.10729224234819412, "learning_rate": 3.186037850616413e-05, "loss": 0.4663, "num_input_tokens_seen": 51228496, "step": 42235 }, { "epoch": 4.704310056799198, "grad_norm": 0.0846937820315361, "learning_rate": 3.1855705449855716e-05, "loss": 0.4606, "num_input_tokens_seen": 51234640, "step": 42240 }, { "epoch": 4.7048669116828155, "grad_norm": 0.09190446883440018, "learning_rate": 3.185103213451678e-05, "loss": 0.4598, "num_input_tokens_seen": 51240912, "step": 42245 }, { "epoch": 4.705423766566433, "grad_norm": 0.1361924707889557, "learning_rate": 3.1846358560323895e-05, "loss": 0.4615, "num_input_tokens_seen": 51247152, "step": 42250 }, { "epoch": 4.70598062145005, "grad_norm": 0.10072288662195206, "learning_rate": 3.184168472745363e-05, "loss": 0.4508, "num_input_tokens_seen": 51253136, "step": 42255 }, { "epoch": 4.706537476333668, "grad_norm": 0.13612014055252075, "learning_rate": 3.1837010636082605e-05, "loss": 0.4713, "num_input_tokens_seen": 51259248, "step": 42260 }, { "epoch": 4.707094331217284, "grad_norm": 0.10380611568689346, "learning_rate": 3.18323362863874e-05, "loss": 0.4612, "num_input_tokens_seen": 51265552, "step": 42265 }, { "epoch": 4.707651186100902, "grad_norm": 0.09530916810035706, "learning_rate": 3.182766167854464e-05, "loss": 0.466, "num_input_tokens_seen": 51271696, "step": 42270 }, { "epoch": 4.70820804098452, "grad_norm": 0.07838763296604156, "learning_rate": 3.1822986812730944e-05, "loss": 0.4552, "num_input_tokens_seen": 51277616, "step": 42275 }, { "epoch": 4.7087648958681365, "grad_norm": 0.09470583498477936, "learning_rate": 3.1818311689122936e-05, "loss": 0.4605, "num_input_tokens_seen": 51283696, "step": 42280 }, { "epoch": 4.709321750751754, "grad_norm": 0.10004963725805283, "learning_rate": 3.1813636307897264e-05, "loss": 0.4688, "num_input_tokens_seen": 51289904, "step": 42285 }, { "epoch": 4.709878605635371, "grad_norm": 0.10867341607809067, "learning_rate": 3.180896066923057e-05, "loss": 0.4731, "num_input_tokens_seen": 51296272, "step": 42290 }, { "epoch": 4.710435460518989, "grad_norm": 0.10738084465265274, "learning_rate": 3.180428477329953e-05, "loss": 0.4686, "num_input_tokens_seen": 51302512, "step": 42295 }, { "epoch": 4.710992315402606, "grad_norm": 0.08559655398130417, "learning_rate": 3.17996086202808e-05, "loss": 0.4541, "num_input_tokens_seen": 51308624, "step": 42300 }, { "epoch": 4.711549170286223, "grad_norm": 0.1184626966714859, "learning_rate": 3.179493221035107e-05, "loss": 0.4608, "num_input_tokens_seen": 51314640, "step": 42305 }, { "epoch": 4.712106025169841, "grad_norm": 0.09904316812753677, "learning_rate": 3.179025554368703e-05, "loss": 0.4638, "num_input_tokens_seen": 51320624, "step": 42310 }, { "epoch": 4.712662880053458, "grad_norm": 0.07723605632781982, "learning_rate": 3.1785578620465365e-05, "loss": 0.4598, "num_input_tokens_seen": 51326544, "step": 42315 }, { "epoch": 4.713219734937075, "grad_norm": 0.07090447843074799, "learning_rate": 3.1780901440862796e-05, "loss": 0.4683, "num_input_tokens_seen": 51333008, "step": 42320 }, { "epoch": 4.713776589820693, "grad_norm": 0.10042428225278854, "learning_rate": 3.177622400505604e-05, "loss": 0.4631, "num_input_tokens_seen": 51339152, "step": 42325 }, { "epoch": 4.71433344470431, "grad_norm": 0.09405981004238129, "learning_rate": 3.177154631322183e-05, "loss": 0.462, "num_input_tokens_seen": 51345360, "step": 42330 }, { "epoch": 4.714890299587927, "grad_norm": 0.07186049968004227, "learning_rate": 3.176686836553691e-05, "loss": 0.4595, "num_input_tokens_seen": 51351632, "step": 42335 }, { "epoch": 4.715447154471545, "grad_norm": 0.12440194934606552, "learning_rate": 3.1762190162178e-05, "loss": 0.4709, "num_input_tokens_seen": 51358352, "step": 42340 }, { "epoch": 4.716004009355162, "grad_norm": 0.08303361386060715, "learning_rate": 3.175751170332189e-05, "loss": 0.46, "num_input_tokens_seen": 51364336, "step": 42345 }, { "epoch": 4.7165608642387795, "grad_norm": 0.11757756769657135, "learning_rate": 3.175283298914532e-05, "loss": 0.4543, "num_input_tokens_seen": 51370160, "step": 42350 }, { "epoch": 4.717117719122397, "grad_norm": 0.09275078028440475, "learning_rate": 3.174815401982508e-05, "loss": 0.4665, "num_input_tokens_seen": 51375600, "step": 42355 }, { "epoch": 4.717674574006014, "grad_norm": 0.10711153596639633, "learning_rate": 3.174347479553796e-05, "loss": 0.4642, "num_input_tokens_seen": 51381584, "step": 42360 }, { "epoch": 4.718231428889632, "grad_norm": 0.10946810245513916, "learning_rate": 3.1738795316460747e-05, "loss": 0.4551, "num_input_tokens_seen": 51387120, "step": 42365 }, { "epoch": 4.718788283773248, "grad_norm": 0.08895283192396164, "learning_rate": 3.1734115582770254e-05, "loss": 0.4552, "num_input_tokens_seen": 51393360, "step": 42370 }, { "epoch": 4.719345138656866, "grad_norm": 0.08923645317554474, "learning_rate": 3.1729435594643296e-05, "loss": 0.4768, "num_input_tokens_seen": 51399440, "step": 42375 }, { "epoch": 4.719901993540484, "grad_norm": 0.10505715012550354, "learning_rate": 3.17247553522567e-05, "loss": 0.4697, "num_input_tokens_seen": 51405808, "step": 42380 }, { "epoch": 4.720458848424101, "grad_norm": 0.10141053795814514, "learning_rate": 3.172007485578729e-05, "loss": 0.4509, "num_input_tokens_seen": 51411152, "step": 42385 }, { "epoch": 4.721015703307718, "grad_norm": 0.11135673522949219, "learning_rate": 3.171539410541192e-05, "loss": 0.4755, "num_input_tokens_seen": 51417264, "step": 42390 }, { "epoch": 4.721572558191335, "grad_norm": 0.16400380432605743, "learning_rate": 3.1710713101307434e-05, "loss": 0.4494, "num_input_tokens_seen": 51423664, "step": 42395 }, { "epoch": 4.722129413074953, "grad_norm": 0.08957845717668533, "learning_rate": 3.1706031843650707e-05, "loss": 0.4619, "num_input_tokens_seen": 51429840, "step": 42400 }, { "epoch": 4.72268626795857, "grad_norm": 0.11034373939037323, "learning_rate": 3.17013503326186e-05, "loss": 0.4596, "num_input_tokens_seen": 51435760, "step": 42405 }, { "epoch": 4.723243122842187, "grad_norm": 0.10425397008657455, "learning_rate": 3.1696668568388005e-05, "loss": 0.4531, "num_input_tokens_seen": 51442064, "step": 42410 }, { "epoch": 4.723799977725805, "grad_norm": 0.10683689266443253, "learning_rate": 3.169198655113581e-05, "loss": 0.4536, "num_input_tokens_seen": 51448208, "step": 42415 }, { "epoch": 4.724356832609422, "grad_norm": 0.1117183268070221, "learning_rate": 3.168730428103892e-05, "loss": 0.4625, "num_input_tokens_seen": 51454480, "step": 42420 }, { "epoch": 4.724913687493039, "grad_norm": 0.1099490374326706, "learning_rate": 3.1682621758274246e-05, "loss": 0.4637, "num_input_tokens_seen": 51460752, "step": 42425 }, { "epoch": 4.725470542376657, "grad_norm": 0.13860981166362762, "learning_rate": 3.16779389830187e-05, "loss": 0.4708, "num_input_tokens_seen": 51466736, "step": 42430 }, { "epoch": 4.726027397260274, "grad_norm": 0.11332598328590393, "learning_rate": 3.1673255955449225e-05, "loss": 0.4519, "num_input_tokens_seen": 51472688, "step": 42435 }, { "epoch": 4.726584252143891, "grad_norm": 0.16200709342956543, "learning_rate": 3.1668572675742753e-05, "loss": 0.4703, "num_input_tokens_seen": 51478224, "step": 42440 }, { "epoch": 4.727141107027508, "grad_norm": 0.11945919692516327, "learning_rate": 3.166388914407623e-05, "loss": 0.4691, "num_input_tokens_seen": 51484816, "step": 42445 }, { "epoch": 4.727697961911126, "grad_norm": 0.09824326634407043, "learning_rate": 3.165920536062662e-05, "loss": 0.4537, "num_input_tokens_seen": 51491248, "step": 42450 }, { "epoch": 4.728254816794744, "grad_norm": 0.11468316614627838, "learning_rate": 3.1654521325570894e-05, "loss": 0.4597, "num_input_tokens_seen": 51497328, "step": 42455 }, { "epoch": 4.72881167167836, "grad_norm": 0.09107720851898193, "learning_rate": 3.1649837039086035e-05, "loss": 0.4569, "num_input_tokens_seen": 51503344, "step": 42460 }, { "epoch": 4.729368526561978, "grad_norm": 0.1449095457792282, "learning_rate": 3.164515250134901e-05, "loss": 0.4593, "num_input_tokens_seen": 51509264, "step": 42465 }, { "epoch": 4.729925381445595, "grad_norm": 0.1343490481376648, "learning_rate": 3.1640467712536834e-05, "loss": 0.4668, "num_input_tokens_seen": 51515568, "step": 42470 }, { "epoch": 4.7304822363292125, "grad_norm": 0.14807488024234772, "learning_rate": 3.1635782672826504e-05, "loss": 0.4716, "num_input_tokens_seen": 51521808, "step": 42475 }, { "epoch": 4.73103909121283, "grad_norm": 0.1100081354379654, "learning_rate": 3.1631097382395045e-05, "loss": 0.4625, "num_input_tokens_seen": 51527792, "step": 42480 }, { "epoch": 4.731595946096447, "grad_norm": 0.10407821089029312, "learning_rate": 3.1626411841419466e-05, "loss": 0.466, "num_input_tokens_seen": 51534000, "step": 42485 }, { "epoch": 4.732152800980065, "grad_norm": 0.08938020467758179, "learning_rate": 3.1621726050076825e-05, "loss": 0.4544, "num_input_tokens_seen": 51540240, "step": 42490 }, { "epoch": 4.732709655863682, "grad_norm": 0.1380210518836975, "learning_rate": 3.1617040008544145e-05, "loss": 0.4667, "num_input_tokens_seen": 51546384, "step": 42495 }, { "epoch": 4.733266510747299, "grad_norm": 0.09465167671442032, "learning_rate": 3.161235371699849e-05, "loss": 0.4719, "num_input_tokens_seen": 51552496, "step": 42500 }, { "epoch": 4.733823365630917, "grad_norm": 0.10948999971151352, "learning_rate": 3.1607667175616915e-05, "loss": 0.4465, "num_input_tokens_seen": 51559120, "step": 42505 }, { "epoch": 4.7343802205145336, "grad_norm": 0.09830260276794434, "learning_rate": 3.1602980384576506e-05, "loss": 0.473, "num_input_tokens_seen": 51565232, "step": 42510 }, { "epoch": 4.734937075398151, "grad_norm": 0.1202281042933464, "learning_rate": 3.159829334405434e-05, "loss": 0.4735, "num_input_tokens_seen": 51571504, "step": 42515 }, { "epoch": 4.735493930281769, "grad_norm": 0.11214804649353027, "learning_rate": 3.15936060542275e-05, "loss": 0.4676, "num_input_tokens_seen": 51577776, "step": 42520 }, { "epoch": 4.736050785165386, "grad_norm": 0.11534979939460754, "learning_rate": 3.15889185152731e-05, "loss": 0.4645, "num_input_tokens_seen": 51583760, "step": 42525 }, { "epoch": 4.736607640049003, "grad_norm": 0.11962813884019852, "learning_rate": 3.158423072736824e-05, "loss": 0.4684, "num_input_tokens_seen": 51589776, "step": 42530 }, { "epoch": 4.737164494932621, "grad_norm": 0.09832219779491425, "learning_rate": 3.1579542690690045e-05, "loss": 0.4663, "num_input_tokens_seen": 51595984, "step": 42535 }, { "epoch": 4.737721349816238, "grad_norm": 0.12110497057437897, "learning_rate": 3.157485440541563e-05, "loss": 0.4718, "num_input_tokens_seen": 51602480, "step": 42540 }, { "epoch": 4.7382782046998555, "grad_norm": 0.1037316545844078, "learning_rate": 3.1570165871722164e-05, "loss": 0.4674, "num_input_tokens_seen": 51608560, "step": 42545 }, { "epoch": 4.738835059583472, "grad_norm": 0.08652462065219879, "learning_rate": 3.1565477089786765e-05, "loss": 0.4522, "num_input_tokens_seen": 51614928, "step": 42550 }, { "epoch": 4.73939191446709, "grad_norm": 0.11353535205125809, "learning_rate": 3.1560788059786606e-05, "loss": 0.4625, "num_input_tokens_seen": 51620944, "step": 42555 }, { "epoch": 4.739948769350708, "grad_norm": 0.12633082270622253, "learning_rate": 3.155609878189885e-05, "loss": 0.4597, "num_input_tokens_seen": 51627024, "step": 42560 }, { "epoch": 4.740505624234324, "grad_norm": 0.09869560599327087, "learning_rate": 3.1551409256300666e-05, "loss": 0.4676, "num_input_tokens_seen": 51633200, "step": 42565 }, { "epoch": 4.741062479117942, "grad_norm": 0.11113832145929337, "learning_rate": 3.154671948316924e-05, "loss": 0.459, "num_input_tokens_seen": 51639408, "step": 42570 }, { "epoch": 4.741619334001559, "grad_norm": 0.10552573949098587, "learning_rate": 3.154202946268178e-05, "loss": 0.4623, "num_input_tokens_seen": 51645712, "step": 42575 }, { "epoch": 4.742176188885177, "grad_norm": 0.09287703037261963, "learning_rate": 3.1537339195015486e-05, "loss": 0.4656, "num_input_tokens_seen": 51651632, "step": 42580 }, { "epoch": 4.742733043768794, "grad_norm": 0.08490940183401108, "learning_rate": 3.153264868034757e-05, "loss": 0.4671, "num_input_tokens_seen": 51657616, "step": 42585 }, { "epoch": 4.743289898652411, "grad_norm": 0.117535300552845, "learning_rate": 3.1527957918855245e-05, "loss": 0.4737, "num_input_tokens_seen": 51663856, "step": 42590 }, { "epoch": 4.743846753536029, "grad_norm": 0.13660241663455963, "learning_rate": 3.152326691071574e-05, "loss": 0.4741, "num_input_tokens_seen": 51670224, "step": 42595 }, { "epoch": 4.7444036084196455, "grad_norm": 0.07423754036426544, "learning_rate": 3.151857565610632e-05, "loss": 0.4597, "num_input_tokens_seen": 51676368, "step": 42600 }, { "epoch": 4.744960463303263, "grad_norm": 0.10562536865472794, "learning_rate": 3.151388415520422e-05, "loss": 0.4573, "num_input_tokens_seen": 51682512, "step": 42605 }, { "epoch": 4.745517318186881, "grad_norm": 0.08803877234458923, "learning_rate": 3.15091924081867e-05, "loss": 0.4497, "num_input_tokens_seen": 51688464, "step": 42610 }, { "epoch": 4.746074173070498, "grad_norm": 0.0699651837348938, "learning_rate": 3.150450041523103e-05, "loss": 0.4473, "num_input_tokens_seen": 51694672, "step": 42615 }, { "epoch": 4.746631027954115, "grad_norm": 0.12426265329122543, "learning_rate": 3.149980817651449e-05, "loss": 0.4679, "num_input_tokens_seen": 51700816, "step": 42620 }, { "epoch": 4.747187882837732, "grad_norm": 0.0974157452583313, "learning_rate": 3.149511569221437e-05, "loss": 0.4481, "num_input_tokens_seen": 51707216, "step": 42625 }, { "epoch": 4.74774473772135, "grad_norm": 0.0989387184381485, "learning_rate": 3.149042296250796e-05, "loss": 0.4728, "num_input_tokens_seen": 51713008, "step": 42630 }, { "epoch": 4.748301592604967, "grad_norm": 0.1370716094970703, "learning_rate": 3.1485729987572574e-05, "loss": 0.4748, "num_input_tokens_seen": 51719088, "step": 42635 }, { "epoch": 4.748858447488584, "grad_norm": 0.13529756665229797, "learning_rate": 3.148103676758552e-05, "loss": 0.47, "num_input_tokens_seen": 51725552, "step": 42640 }, { "epoch": 4.749415302372202, "grad_norm": 0.09154333919286728, "learning_rate": 3.147634330272414e-05, "loss": 0.4613, "num_input_tokens_seen": 51731888, "step": 42645 }, { "epoch": 4.749972157255819, "grad_norm": 0.09104108810424805, "learning_rate": 3.147164959316575e-05, "loss": 0.4796, "num_input_tokens_seen": 51737840, "step": 42650 }, { "epoch": 4.750529012139436, "grad_norm": 0.12319494783878326, "learning_rate": 3.146695563908769e-05, "loss": 0.4708, "num_input_tokens_seen": 51743920, "step": 42655 }, { "epoch": 4.751085867023054, "grad_norm": 0.09384888410568237, "learning_rate": 3.146226144066733e-05, "loss": 0.4717, "num_input_tokens_seen": 51749552, "step": 42660 }, { "epoch": 4.751642721906671, "grad_norm": 0.12569354474544525, "learning_rate": 3.145756699808202e-05, "loss": 0.47, "num_input_tokens_seen": 51755440, "step": 42665 }, { "epoch": 4.7521995767902885, "grad_norm": 0.11194109171628952, "learning_rate": 3.1452872311509135e-05, "loss": 0.4667, "num_input_tokens_seen": 51761008, "step": 42670 }, { "epoch": 4.752756431673906, "grad_norm": 0.08105936646461487, "learning_rate": 3.144817738112605e-05, "loss": 0.4679, "num_input_tokens_seen": 51767152, "step": 42675 }, { "epoch": 4.753313286557523, "grad_norm": 0.11815769225358963, "learning_rate": 3.144348220711017e-05, "loss": 0.4676, "num_input_tokens_seen": 51773168, "step": 42680 }, { "epoch": 4.753870141441141, "grad_norm": 0.11296451836824417, "learning_rate": 3.143878678963888e-05, "loss": 0.4718, "num_input_tokens_seen": 51779184, "step": 42685 }, { "epoch": 4.754426996324757, "grad_norm": 0.14465051889419556, "learning_rate": 3.143409112888959e-05, "loss": 0.4782, "num_input_tokens_seen": 51785264, "step": 42690 }, { "epoch": 4.754983851208375, "grad_norm": 0.08954087644815445, "learning_rate": 3.142939522503971e-05, "loss": 0.4594, "num_input_tokens_seen": 51791152, "step": 42695 }, { "epoch": 4.755540706091993, "grad_norm": 0.11774792522192001, "learning_rate": 3.1424699078266686e-05, "loss": 0.4616, "num_input_tokens_seen": 51797360, "step": 42700 }, { "epoch": 4.7560975609756095, "grad_norm": 0.12379477173089981, "learning_rate": 3.1420002688747934e-05, "loss": 0.4694, "num_input_tokens_seen": 51803024, "step": 42705 }, { "epoch": 4.756654415859227, "grad_norm": 0.10764827579259872, "learning_rate": 3.141530605666091e-05, "loss": 0.4675, "num_input_tokens_seen": 51808816, "step": 42710 }, { "epoch": 4.757211270742845, "grad_norm": 0.08530180901288986, "learning_rate": 3.141060918218306e-05, "loss": 0.4656, "num_input_tokens_seen": 51815248, "step": 42715 }, { "epoch": 4.757768125626462, "grad_norm": 0.08751949667930603, "learning_rate": 3.140591206549186e-05, "loss": 0.4557, "num_input_tokens_seen": 51821328, "step": 42720 }, { "epoch": 4.758324980510079, "grad_norm": 0.08782093971967697, "learning_rate": 3.1401214706764764e-05, "loss": 0.462, "num_input_tokens_seen": 51827120, "step": 42725 }, { "epoch": 4.758881835393696, "grad_norm": 0.09017930179834366, "learning_rate": 3.139651710617927e-05, "loss": 0.4591, "num_input_tokens_seen": 51833424, "step": 42730 }, { "epoch": 4.759438690277314, "grad_norm": 0.09251036494970322, "learning_rate": 3.139181926391286e-05, "loss": 0.468, "num_input_tokens_seen": 51839824, "step": 42735 }, { "epoch": 4.7599955451609315, "grad_norm": 0.11826013773679733, "learning_rate": 3.138712118014303e-05, "loss": 0.4771, "num_input_tokens_seen": 51845840, "step": 42740 }, { "epoch": 4.760552400044548, "grad_norm": 0.08715210855007172, "learning_rate": 3.13824228550473e-05, "loss": 0.4676, "num_input_tokens_seen": 51851888, "step": 42745 }, { "epoch": 4.761109254928166, "grad_norm": 0.1717870533466339, "learning_rate": 3.1377724288803176e-05, "loss": 0.4578, "num_input_tokens_seen": 51858160, "step": 42750 }, { "epoch": 4.761666109811783, "grad_norm": 0.07800165563821793, "learning_rate": 3.137302548158819e-05, "loss": 0.455, "num_input_tokens_seen": 51864272, "step": 42755 }, { "epoch": 4.7622229646954, "grad_norm": 0.1030348688364029, "learning_rate": 3.136832643357988e-05, "loss": 0.451, "num_input_tokens_seen": 51870352, "step": 42760 }, { "epoch": 4.762779819579018, "grad_norm": 0.09318752586841583, "learning_rate": 3.136362714495579e-05, "loss": 0.466, "num_input_tokens_seen": 51876368, "step": 42765 }, { "epoch": 4.763336674462635, "grad_norm": 0.10350020229816437, "learning_rate": 3.135892761589347e-05, "loss": 0.4611, "num_input_tokens_seen": 51882480, "step": 42770 }, { "epoch": 4.7638935293462525, "grad_norm": 0.0737842544913292, "learning_rate": 3.135422784657049e-05, "loss": 0.4652, "num_input_tokens_seen": 51888624, "step": 42775 }, { "epoch": 4.764450384229869, "grad_norm": 0.08030124008655548, "learning_rate": 3.134952783716442e-05, "loss": 0.4583, "num_input_tokens_seen": 51894256, "step": 42780 }, { "epoch": 4.765007239113487, "grad_norm": 0.10689394921064377, "learning_rate": 3.134482758785283e-05, "loss": 0.4647, "num_input_tokens_seen": 51900272, "step": 42785 }, { "epoch": 4.765564093997105, "grad_norm": 0.08299996703863144, "learning_rate": 3.1340127098813334e-05, "loss": 0.4625, "num_input_tokens_seen": 51906288, "step": 42790 }, { "epoch": 4.7661209488807215, "grad_norm": 0.08779963105916977, "learning_rate": 3.1335426370223515e-05, "loss": 0.4574, "num_input_tokens_seen": 51912464, "step": 42795 }, { "epoch": 4.766677803764339, "grad_norm": 0.09323357790708542, "learning_rate": 3.133072540226098e-05, "loss": 0.4648, "num_input_tokens_seen": 51918448, "step": 42800 }, { "epoch": 4.767234658647956, "grad_norm": 0.10053646564483643, "learning_rate": 3.132602419510336e-05, "loss": 0.4709, "num_input_tokens_seen": 51924592, "step": 42805 }, { "epoch": 4.767791513531574, "grad_norm": 0.08888763934373856, "learning_rate": 3.1321322748928264e-05, "loss": 0.4618, "num_input_tokens_seen": 51930832, "step": 42810 }, { "epoch": 4.768348368415191, "grad_norm": 0.09370683878660202, "learning_rate": 3.1316621063913334e-05, "loss": 0.4734, "num_input_tokens_seen": 51937136, "step": 42815 }, { "epoch": 4.768905223298808, "grad_norm": 0.0905688926577568, "learning_rate": 3.1311919140236224e-05, "loss": 0.4745, "num_input_tokens_seen": 51943376, "step": 42820 }, { "epoch": 4.769462078182426, "grad_norm": 0.09372328966856003, "learning_rate": 3.1307216978074576e-05, "loss": 0.4695, "num_input_tokens_seen": 51949520, "step": 42825 }, { "epoch": 4.7700189330660425, "grad_norm": 0.09673436731100082, "learning_rate": 3.130251457760607e-05, "loss": 0.47, "num_input_tokens_seen": 51955760, "step": 42830 }, { "epoch": 4.77057578794966, "grad_norm": 0.07561103254556656, "learning_rate": 3.129781193900835e-05, "loss": 0.4595, "num_input_tokens_seen": 51962032, "step": 42835 }, { "epoch": 4.771132642833278, "grad_norm": 0.09607020020484924, "learning_rate": 3.1293109062459124e-05, "loss": 0.4741, "num_input_tokens_seen": 51968144, "step": 42840 }, { "epoch": 4.771689497716895, "grad_norm": 0.08080143481492996, "learning_rate": 3.128840594813607e-05, "loss": 0.4579, "num_input_tokens_seen": 51974416, "step": 42845 }, { "epoch": 4.772246352600512, "grad_norm": 0.08564577251672745, "learning_rate": 3.128370259621688e-05, "loss": 0.4683, "num_input_tokens_seen": 51980752, "step": 42850 }, { "epoch": 4.77280320748413, "grad_norm": 0.12130046635866165, "learning_rate": 3.127899900687927e-05, "loss": 0.4684, "num_input_tokens_seen": 51986704, "step": 42855 }, { "epoch": 4.773360062367747, "grad_norm": 0.08097526431083679, "learning_rate": 3.1274295180300964e-05, "loss": 0.4678, "num_input_tokens_seen": 51993264, "step": 42860 }, { "epoch": 4.7739169172513645, "grad_norm": 0.12645813822746277, "learning_rate": 3.126959111665968e-05, "loss": 0.454, "num_input_tokens_seen": 51999248, "step": 42865 }, { "epoch": 4.774473772134981, "grad_norm": 0.09585922211408615, "learning_rate": 3.1264886816133146e-05, "loss": 0.4758, "num_input_tokens_seen": 52005456, "step": 42870 }, { "epoch": 4.775030627018599, "grad_norm": 0.07464069128036499, "learning_rate": 3.126018227889911e-05, "loss": 0.4616, "num_input_tokens_seen": 52011248, "step": 42875 }, { "epoch": 4.775587481902217, "grad_norm": 0.0874638482928276, "learning_rate": 3.1255477505135325e-05, "loss": 0.4558, "num_input_tokens_seen": 52017296, "step": 42880 }, { "epoch": 4.776144336785833, "grad_norm": 0.12191474437713623, "learning_rate": 3.1250772495019545e-05, "loss": 0.4704, "num_input_tokens_seen": 52023632, "step": 42885 }, { "epoch": 4.776701191669451, "grad_norm": 0.09585336595773697, "learning_rate": 3.124606724872957e-05, "loss": 0.462, "num_input_tokens_seen": 52029776, "step": 42890 }, { "epoch": 4.777258046553069, "grad_norm": 0.09023847430944443, "learning_rate": 3.124136176644314e-05, "loss": 0.4737, "num_input_tokens_seen": 52036144, "step": 42895 }, { "epoch": 4.7778149014366855, "grad_norm": 0.07927601784467697, "learning_rate": 3.123665604833807e-05, "loss": 0.464, "num_input_tokens_seen": 52042128, "step": 42900 }, { "epoch": 4.778371756320303, "grad_norm": 0.09897775202989578, "learning_rate": 3.1231950094592145e-05, "loss": 0.4658, "num_input_tokens_seen": 52048112, "step": 42905 }, { "epoch": 4.77892861120392, "grad_norm": 0.08136790990829468, "learning_rate": 3.1227243905383166e-05, "loss": 0.4701, "num_input_tokens_seen": 52053968, "step": 42910 }, { "epoch": 4.779485466087538, "grad_norm": 0.0895877480506897, "learning_rate": 3.122253748088897e-05, "loss": 0.4542, "num_input_tokens_seen": 52059824, "step": 42915 }, { "epoch": 4.780042320971155, "grad_norm": 0.08325117081403732, "learning_rate": 3.121783082128737e-05, "loss": 0.4598, "num_input_tokens_seen": 52066064, "step": 42920 }, { "epoch": 4.780599175854772, "grad_norm": 0.12480349838733673, "learning_rate": 3.121312392675618e-05, "loss": 0.4518, "num_input_tokens_seen": 52072368, "step": 42925 }, { "epoch": 4.78115603073839, "grad_norm": 0.13563308119773865, "learning_rate": 3.120841679747327e-05, "loss": 0.4678, "num_input_tokens_seen": 52078512, "step": 42930 }, { "epoch": 4.781712885622007, "grad_norm": 0.08995544165372849, "learning_rate": 3.1203709433616466e-05, "loss": 0.4597, "num_input_tokens_seen": 52084560, "step": 42935 }, { "epoch": 4.782269740505624, "grad_norm": 0.07175177335739136, "learning_rate": 3.1199001835363645e-05, "loss": 0.4679, "num_input_tokens_seen": 52090640, "step": 42940 }, { "epoch": 4.782826595389242, "grad_norm": 0.10904250293970108, "learning_rate": 3.119429400289266e-05, "loss": 0.4603, "num_input_tokens_seen": 52096848, "step": 42945 }, { "epoch": 4.783383450272859, "grad_norm": 0.07851210981607437, "learning_rate": 3.118958593638141e-05, "loss": 0.462, "num_input_tokens_seen": 52103184, "step": 42950 }, { "epoch": 4.783940305156476, "grad_norm": 0.08168414235115051, "learning_rate": 3.1184877636007764e-05, "loss": 0.4591, "num_input_tokens_seen": 52109520, "step": 42955 }, { "epoch": 4.784497160040093, "grad_norm": 0.09595546126365662, "learning_rate": 3.1180169101949616e-05, "loss": 0.4514, "num_input_tokens_seen": 52115824, "step": 42960 }, { "epoch": 4.785054014923711, "grad_norm": 0.08620782941579819, "learning_rate": 3.117546033438489e-05, "loss": 0.4606, "num_input_tokens_seen": 52122064, "step": 42965 }, { "epoch": 4.7856108698073285, "grad_norm": 0.0680484026670456, "learning_rate": 3.117075133349146e-05, "loss": 0.4571, "num_input_tokens_seen": 52128144, "step": 42970 }, { "epoch": 4.786167724690945, "grad_norm": 0.10449156910181046, "learning_rate": 3.116604209944728e-05, "loss": 0.4582, "num_input_tokens_seen": 52133424, "step": 42975 }, { "epoch": 4.786724579574563, "grad_norm": 0.08879437297582626, "learning_rate": 3.116133263243027e-05, "loss": 0.4535, "num_input_tokens_seen": 52139568, "step": 42980 }, { "epoch": 4.78728143445818, "grad_norm": 0.10277480632066727, "learning_rate": 3.115662293261837e-05, "loss": 0.4556, "num_input_tokens_seen": 52145648, "step": 42985 }, { "epoch": 4.7878382893417974, "grad_norm": 0.09343449026346207, "learning_rate": 3.115191300018952e-05, "loss": 0.4642, "num_input_tokens_seen": 52151312, "step": 42990 }, { "epoch": 4.788395144225415, "grad_norm": 0.08753848820924759, "learning_rate": 3.114720283532169e-05, "loss": 0.4607, "num_input_tokens_seen": 52157616, "step": 42995 }, { "epoch": 4.788951999109032, "grad_norm": 0.0899341031908989, "learning_rate": 3.114249243819283e-05, "loss": 0.4527, "num_input_tokens_seen": 52163856, "step": 43000 }, { "epoch": 4.78950885399265, "grad_norm": 0.07282505929470062, "learning_rate": 3.113778180898093e-05, "loss": 0.4471, "num_input_tokens_seen": 52170032, "step": 43005 }, { "epoch": 4.790065708876266, "grad_norm": 0.09099908173084259, "learning_rate": 3.113307094786395e-05, "loss": 0.4621, "num_input_tokens_seen": 52176464, "step": 43010 }, { "epoch": 4.790622563759884, "grad_norm": 0.12760621309280396, "learning_rate": 3.1128359855019905e-05, "loss": 0.4574, "num_input_tokens_seen": 52182640, "step": 43015 }, { "epoch": 4.791179418643502, "grad_norm": 0.0894157662987709, "learning_rate": 3.1123648530626784e-05, "loss": 0.4668, "num_input_tokens_seen": 52188720, "step": 43020 }, { "epoch": 4.7917362735271185, "grad_norm": 0.10559308528900146, "learning_rate": 3.11189369748626e-05, "loss": 0.4429, "num_input_tokens_seen": 52194704, "step": 43025 }, { "epoch": 4.792293128410736, "grad_norm": 0.11533655971288681, "learning_rate": 3.111422518790536e-05, "loss": 0.4819, "num_input_tokens_seen": 52201168, "step": 43030 }, { "epoch": 4.792849983294354, "grad_norm": 0.09539595991373062, "learning_rate": 3.1109513169933105e-05, "loss": 0.4763, "num_input_tokens_seen": 52206640, "step": 43035 }, { "epoch": 4.793406838177971, "grad_norm": 0.08888272196054459, "learning_rate": 3.1104800921123855e-05, "loss": 0.4616, "num_input_tokens_seen": 52212560, "step": 43040 }, { "epoch": 4.793963693061588, "grad_norm": 0.08343503624200821, "learning_rate": 3.1100088441655675e-05, "loss": 0.4726, "num_input_tokens_seen": 52218544, "step": 43045 }, { "epoch": 4.794520547945205, "grad_norm": 0.10715688765048981, "learning_rate": 3.109537573170659e-05, "loss": 0.4556, "num_input_tokens_seen": 52225040, "step": 43050 }, { "epoch": 4.795077402828823, "grad_norm": 0.12438125163316727, "learning_rate": 3.109066279145469e-05, "loss": 0.4752, "num_input_tokens_seen": 52231312, "step": 43055 }, { "epoch": 4.7956342577124405, "grad_norm": 0.08442702144384384, "learning_rate": 3.1085949621078026e-05, "loss": 0.4535, "num_input_tokens_seen": 52237360, "step": 43060 }, { "epoch": 4.796191112596057, "grad_norm": 0.08632726967334747, "learning_rate": 3.108123622075468e-05, "loss": 0.4623, "num_input_tokens_seen": 52243472, "step": 43065 }, { "epoch": 4.796747967479675, "grad_norm": 0.08030321449041367, "learning_rate": 3.1076522590662745e-05, "loss": 0.4647, "num_input_tokens_seen": 52249872, "step": 43070 }, { "epoch": 4.797304822363293, "grad_norm": 0.11818545311689377, "learning_rate": 3.107180873098031e-05, "loss": 0.4554, "num_input_tokens_seen": 52255280, "step": 43075 }, { "epoch": 4.797861677246909, "grad_norm": 0.10661565512418747, "learning_rate": 3.106709464188549e-05, "loss": 0.4597, "num_input_tokens_seen": 52261616, "step": 43080 }, { "epoch": 4.798418532130527, "grad_norm": 0.1122668981552124, "learning_rate": 3.106238032355638e-05, "loss": 0.4709, "num_input_tokens_seen": 52267280, "step": 43085 }, { "epoch": 4.798975387014144, "grad_norm": 0.09599130600690842, "learning_rate": 3.1057665776171116e-05, "loss": 0.471, "num_input_tokens_seen": 52273488, "step": 43090 }, { "epoch": 4.7995322418977615, "grad_norm": 0.09304801374673843, "learning_rate": 3.105295099990783e-05, "loss": 0.4663, "num_input_tokens_seen": 52279760, "step": 43095 }, { "epoch": 4.800089096781379, "grad_norm": 0.10169791430234909, "learning_rate": 3.104823599494466e-05, "loss": 0.4526, "num_input_tokens_seen": 52285584, "step": 43100 }, { "epoch": 4.800645951664996, "grad_norm": 0.09418147057294846, "learning_rate": 3.1043520761459746e-05, "loss": 0.4569, "num_input_tokens_seen": 52292016, "step": 43105 }, { "epoch": 4.801202806548614, "grad_norm": 0.09560207277536392, "learning_rate": 3.1038805299631256e-05, "loss": 0.4554, "num_input_tokens_seen": 52298128, "step": 43110 }, { "epoch": 4.80175966143223, "grad_norm": 0.09249593317508698, "learning_rate": 3.1034089609637345e-05, "loss": 0.4516, "num_input_tokens_seen": 52304304, "step": 43115 }, { "epoch": 4.802316516315848, "grad_norm": 0.10094697773456573, "learning_rate": 3.1029373691656196e-05, "loss": 0.4603, "num_input_tokens_seen": 52310512, "step": 43120 }, { "epoch": 4.802873371199466, "grad_norm": 0.14328020811080933, "learning_rate": 3.102465754586599e-05, "loss": 0.4519, "num_input_tokens_seen": 52316528, "step": 43125 }, { "epoch": 4.803430226083083, "grad_norm": 0.09763367474079132, "learning_rate": 3.10199411724449e-05, "loss": 0.4568, "num_input_tokens_seen": 52322064, "step": 43130 }, { "epoch": 4.8039870809667, "grad_norm": 0.09087762981653214, "learning_rate": 3.101522457157115e-05, "loss": 0.4491, "num_input_tokens_seen": 52328272, "step": 43135 }, { "epoch": 4.804543935850317, "grad_norm": 0.09901674836874008, "learning_rate": 3.101050774342294e-05, "loss": 0.4691, "num_input_tokens_seen": 52334192, "step": 43140 }, { "epoch": 4.805100790733935, "grad_norm": 0.09538779407739639, "learning_rate": 3.1005790688178484e-05, "loss": 0.4646, "num_input_tokens_seen": 52340016, "step": 43145 }, { "epoch": 4.805657645617552, "grad_norm": 0.09880381077528, "learning_rate": 3.100107340601601e-05, "loss": 0.4761, "num_input_tokens_seen": 52346096, "step": 43150 }, { "epoch": 4.806214500501169, "grad_norm": 0.10659492760896683, "learning_rate": 3.099635589711375e-05, "loss": 0.467, "num_input_tokens_seen": 52352304, "step": 43155 }, { "epoch": 4.806771355384787, "grad_norm": 0.17588657140731812, "learning_rate": 3.099163816164994e-05, "loss": 0.456, "num_input_tokens_seen": 52358800, "step": 43160 }, { "epoch": 4.807328210268404, "grad_norm": 0.08203846961259842, "learning_rate": 3.0986920199802856e-05, "loss": 0.4577, "num_input_tokens_seen": 52365360, "step": 43165 }, { "epoch": 4.807885065152021, "grad_norm": 0.08884279429912567, "learning_rate": 3.0982202011750734e-05, "loss": 0.4686, "num_input_tokens_seen": 52371536, "step": 43170 }, { "epoch": 4.808441920035639, "grad_norm": 0.145547017455101, "learning_rate": 3.097748359767185e-05, "loss": 0.4562, "num_input_tokens_seen": 52377552, "step": 43175 }, { "epoch": 4.808998774919256, "grad_norm": 0.0992686003446579, "learning_rate": 3.0972764957744474e-05, "loss": 0.4595, "num_input_tokens_seen": 52383984, "step": 43180 }, { "epoch": 4.809555629802873, "grad_norm": 0.09083043038845062, "learning_rate": 3.09680460921469e-05, "loss": 0.463, "num_input_tokens_seen": 52389872, "step": 43185 }, { "epoch": 4.81011248468649, "grad_norm": 0.09815215319395065, "learning_rate": 3.0963327001057425e-05, "loss": 0.4528, "num_input_tokens_seen": 52396016, "step": 43190 }, { "epoch": 4.810669339570108, "grad_norm": 0.1322941780090332, "learning_rate": 3.095860768465434e-05, "loss": 0.4525, "num_input_tokens_seen": 52401616, "step": 43195 }, { "epoch": 4.811226194453726, "grad_norm": 0.09465096890926361, "learning_rate": 3.095388814311596e-05, "loss": 0.4515, "num_input_tokens_seen": 52407664, "step": 43200 }, { "epoch": 4.811783049337342, "grad_norm": 0.12874111533164978, "learning_rate": 3.0949168376620614e-05, "loss": 0.4466, "num_input_tokens_seen": 52413808, "step": 43205 }, { "epoch": 4.81233990422096, "grad_norm": 0.12178078293800354, "learning_rate": 3.0944448385346614e-05, "loss": 0.4794, "num_input_tokens_seen": 52419216, "step": 43210 }, { "epoch": 4.812896759104578, "grad_norm": 0.1061096116900444, "learning_rate": 3.093972816947231e-05, "loss": 0.457, "num_input_tokens_seen": 52425040, "step": 43215 }, { "epoch": 4.8134536139881945, "grad_norm": 0.08808497339487076, "learning_rate": 3.093500772917603e-05, "loss": 0.4588, "num_input_tokens_seen": 52430192, "step": 43220 }, { "epoch": 4.814010468871812, "grad_norm": 0.08650057762861252, "learning_rate": 3.0930287064636144e-05, "loss": 0.4685, "num_input_tokens_seen": 52436336, "step": 43225 }, { "epoch": 4.814567323755429, "grad_norm": 0.13424837589263916, "learning_rate": 3.0925566176031005e-05, "loss": 0.4698, "num_input_tokens_seen": 52442512, "step": 43230 }, { "epoch": 4.815124178639047, "grad_norm": 0.09483261406421661, "learning_rate": 3.0920845063539e-05, "loss": 0.468, "num_input_tokens_seen": 52448688, "step": 43235 }, { "epoch": 4.815681033522664, "grad_norm": 0.09659865498542786, "learning_rate": 3.091612372733848e-05, "loss": 0.4573, "num_input_tokens_seen": 52454608, "step": 43240 }, { "epoch": 4.816237888406281, "grad_norm": 0.0905228704214096, "learning_rate": 3.091140216760785e-05, "loss": 0.467, "num_input_tokens_seen": 52460624, "step": 43245 }, { "epoch": 4.816794743289899, "grad_norm": 0.09535089880228043, "learning_rate": 3.09066803845255e-05, "loss": 0.4505, "num_input_tokens_seen": 52466736, "step": 43250 }, { "epoch": 4.817351598173516, "grad_norm": 0.09136498719453812, "learning_rate": 3.090195837826984e-05, "loss": 0.47, "num_input_tokens_seen": 52472944, "step": 43255 }, { "epoch": 4.817908453057133, "grad_norm": 0.14495398104190826, "learning_rate": 3.0897236149019284e-05, "loss": 0.4661, "num_input_tokens_seen": 52479216, "step": 43260 }, { "epoch": 4.818465307940751, "grad_norm": 0.09546301513910294, "learning_rate": 3.089251369695224e-05, "loss": 0.4608, "num_input_tokens_seen": 52485392, "step": 43265 }, { "epoch": 4.819022162824368, "grad_norm": 0.09237243235111237, "learning_rate": 3.0887791022247146e-05, "loss": 0.4602, "num_input_tokens_seen": 52491632, "step": 43270 }, { "epoch": 4.819579017707985, "grad_norm": 0.08396591246128082, "learning_rate": 3.0883068125082434e-05, "loss": 0.465, "num_input_tokens_seen": 52497424, "step": 43275 }, { "epoch": 4.820135872591603, "grad_norm": 0.08593526482582092, "learning_rate": 3.087834500563656e-05, "loss": 0.4592, "num_input_tokens_seen": 52503152, "step": 43280 }, { "epoch": 4.82069272747522, "grad_norm": 0.10956154763698578, "learning_rate": 3.087362166408798e-05, "loss": 0.4634, "num_input_tokens_seen": 52509328, "step": 43285 }, { "epoch": 4.8212495823588375, "grad_norm": 0.09248191118240356, "learning_rate": 3.086889810061514e-05, "loss": 0.4671, "num_input_tokens_seen": 52515184, "step": 43290 }, { "epoch": 4.821806437242454, "grad_norm": 0.11964553594589233, "learning_rate": 3.086417431539653e-05, "loss": 0.4554, "num_input_tokens_seen": 52521232, "step": 43295 }, { "epoch": 4.822363292126072, "grad_norm": 0.09871517866849899, "learning_rate": 3.085945030861062e-05, "loss": 0.4815, "num_input_tokens_seen": 52527248, "step": 43300 }, { "epoch": 4.82292014700969, "grad_norm": 0.09681474417448044, "learning_rate": 3.0854726080435894e-05, "loss": 0.4605, "num_input_tokens_seen": 52533456, "step": 43305 }, { "epoch": 4.823477001893306, "grad_norm": 0.1004863753914833, "learning_rate": 3.085000163105086e-05, "loss": 0.4612, "num_input_tokens_seen": 52539920, "step": 43310 }, { "epoch": 4.824033856776924, "grad_norm": 0.07040350139141083, "learning_rate": 3.084527696063402e-05, "loss": 0.4585, "num_input_tokens_seen": 52546032, "step": 43315 }, { "epoch": 4.824590711660541, "grad_norm": 0.11022266000509262, "learning_rate": 3.084055206936387e-05, "loss": 0.4655, "num_input_tokens_seen": 52552144, "step": 43320 }, { "epoch": 4.8251475665441586, "grad_norm": 0.08106594532728195, "learning_rate": 3.0835826957418964e-05, "loss": 0.4693, "num_input_tokens_seen": 52558000, "step": 43325 }, { "epoch": 4.825704421427776, "grad_norm": 0.09309279173612595, "learning_rate": 3.0831101624977805e-05, "loss": 0.4555, "num_input_tokens_seen": 52563888, "step": 43330 }, { "epoch": 4.826261276311393, "grad_norm": 0.09114254266023636, "learning_rate": 3.0826376072218945e-05, "loss": 0.4609, "num_input_tokens_seen": 52569872, "step": 43335 }, { "epoch": 4.826818131195011, "grad_norm": 0.10746698826551437, "learning_rate": 3.082165029932091e-05, "loss": 0.4662, "num_input_tokens_seen": 52575952, "step": 43340 }, { "epoch": 4.8273749860786275, "grad_norm": 0.12224606424570084, "learning_rate": 3.0816924306462286e-05, "loss": 0.4655, "num_input_tokens_seen": 52582224, "step": 43345 }, { "epoch": 4.827931840962245, "grad_norm": 0.10381276160478592, "learning_rate": 3.0812198093821615e-05, "loss": 0.4663, "num_input_tokens_seen": 52588592, "step": 43350 }, { "epoch": 4.828488695845863, "grad_norm": 0.11923385411500931, "learning_rate": 3.0807471661577476e-05, "loss": 0.4618, "num_input_tokens_seen": 52594384, "step": 43355 }, { "epoch": 4.82904555072948, "grad_norm": 0.09589681774377823, "learning_rate": 3.080274500990845e-05, "loss": 0.4647, "num_input_tokens_seen": 52600496, "step": 43360 }, { "epoch": 4.829602405613097, "grad_norm": 0.11236678063869476, "learning_rate": 3.0798018138993114e-05, "loss": 0.4468, "num_input_tokens_seen": 52606320, "step": 43365 }, { "epoch": 4.830159260496714, "grad_norm": 0.1105259358882904, "learning_rate": 3.0793291049010076e-05, "loss": 0.4707, "num_input_tokens_seen": 52612560, "step": 43370 }, { "epoch": 4.830716115380332, "grad_norm": 0.10223899781703949, "learning_rate": 3.0788563740137936e-05, "loss": 0.4673, "num_input_tokens_seen": 52618704, "step": 43375 }, { "epoch": 4.831272970263949, "grad_norm": 0.08468177914619446, "learning_rate": 3.07838362125553e-05, "loss": 0.4646, "num_input_tokens_seen": 52624624, "step": 43380 }, { "epoch": 4.831829825147566, "grad_norm": 0.11467236280441284, "learning_rate": 3.07791084664408e-05, "loss": 0.4647, "num_input_tokens_seen": 52630800, "step": 43385 }, { "epoch": 4.832386680031184, "grad_norm": 0.08978156000375748, "learning_rate": 3.077438050197307e-05, "loss": 0.4472, "num_input_tokens_seen": 52636176, "step": 43390 }, { "epoch": 4.8329435349148016, "grad_norm": 0.0998641699552536, "learning_rate": 3.0769652319330737e-05, "loss": 0.4627, "num_input_tokens_seen": 52642224, "step": 43395 }, { "epoch": 4.833500389798418, "grad_norm": 0.12249132990837097, "learning_rate": 3.0764923918692453e-05, "loss": 0.4489, "num_input_tokens_seen": 52648304, "step": 43400 }, { "epoch": 4.834057244682036, "grad_norm": 0.1230308786034584, "learning_rate": 3.076019530023686e-05, "loss": 0.4476, "num_input_tokens_seen": 52654320, "step": 43405 }, { "epoch": 4.834614099565654, "grad_norm": 0.08705312758684158, "learning_rate": 3.0755466464142634e-05, "loss": 0.4564, "num_input_tokens_seen": 52660432, "step": 43410 }, { "epoch": 4.8351709544492705, "grad_norm": 0.09111019968986511, "learning_rate": 3.0750737410588445e-05, "loss": 0.4717, "num_input_tokens_seen": 52666480, "step": 43415 }, { "epoch": 4.835727809332888, "grad_norm": 0.08452489227056503, "learning_rate": 3.0746008139752964e-05, "loss": 0.4596, "num_input_tokens_seen": 52672528, "step": 43420 }, { "epoch": 4.836284664216505, "grad_norm": 0.09613543003797531, "learning_rate": 3.0741278651814875e-05, "loss": 0.4673, "num_input_tokens_seen": 52678576, "step": 43425 }, { "epoch": 4.836841519100123, "grad_norm": 0.08984038233757019, "learning_rate": 3.073654894695289e-05, "loss": 0.4792, "num_input_tokens_seen": 52684272, "step": 43430 }, { "epoch": 4.83739837398374, "grad_norm": 0.14159581065177917, "learning_rate": 3.0731819025345694e-05, "loss": 0.4731, "num_input_tokens_seen": 52690608, "step": 43435 }, { "epoch": 4.837955228867357, "grad_norm": 0.09763477742671967, "learning_rate": 3.072708888717201e-05, "loss": 0.4689, "num_input_tokens_seen": 52696656, "step": 43440 }, { "epoch": 4.838512083750975, "grad_norm": 0.10420385003089905, "learning_rate": 3.072235853261055e-05, "loss": 0.4724, "num_input_tokens_seen": 52703088, "step": 43445 }, { "epoch": 4.8390689386345915, "grad_norm": 0.08972498774528503, "learning_rate": 3.071762796184005e-05, "loss": 0.4675, "num_input_tokens_seen": 52708944, "step": 43450 }, { "epoch": 4.839625793518209, "grad_norm": 0.12069058418273926, "learning_rate": 3.0712897175039244e-05, "loss": 0.4713, "num_input_tokens_seen": 52715280, "step": 43455 }, { "epoch": 4.840182648401827, "grad_norm": 0.13376080989837646, "learning_rate": 3.070816617238688e-05, "loss": 0.4709, "num_input_tokens_seen": 52721392, "step": 43460 }, { "epoch": 4.840739503285444, "grad_norm": 0.08874848484992981, "learning_rate": 3.0703434954061694e-05, "loss": 0.4696, "num_input_tokens_seen": 52727248, "step": 43465 }, { "epoch": 4.841296358169061, "grad_norm": 0.14204192161560059, "learning_rate": 3.069870352024246e-05, "loss": 0.4601, "num_input_tokens_seen": 52733584, "step": 43470 }, { "epoch": 4.841853213052678, "grad_norm": 0.12605875730514526, "learning_rate": 3.069397187110795e-05, "loss": 0.4589, "num_input_tokens_seen": 52739504, "step": 43475 }, { "epoch": 4.842410067936296, "grad_norm": 0.08412812650203705, "learning_rate": 3.068924000683693e-05, "loss": 0.4757, "num_input_tokens_seen": 52745584, "step": 43480 }, { "epoch": 4.8429669228199135, "grad_norm": 0.12168938666582108, "learning_rate": 3.0684507927608196e-05, "loss": 0.4542, "num_input_tokens_seen": 52751696, "step": 43485 }, { "epoch": 4.84352377770353, "grad_norm": 0.09665588289499283, "learning_rate": 3.0679775633600535e-05, "loss": 0.4574, "num_input_tokens_seen": 52757904, "step": 43490 }, { "epoch": 4.844080632587148, "grad_norm": 0.11170592904090881, "learning_rate": 3.067504312499275e-05, "loss": 0.4713, "num_input_tokens_seen": 52763952, "step": 43495 }, { "epoch": 4.844637487470765, "grad_norm": 0.08141640573740005, "learning_rate": 3.0670310401963636e-05, "loss": 0.4737, "num_input_tokens_seen": 52770128, "step": 43500 }, { "epoch": 4.845194342354382, "grad_norm": 0.08065075427293777, "learning_rate": 3.0665577464692036e-05, "loss": 0.4607, "num_input_tokens_seen": 52776336, "step": 43505 }, { "epoch": 4.845751197238, "grad_norm": 0.09071710705757141, "learning_rate": 3.0660844313356766e-05, "loss": 0.4565, "num_input_tokens_seen": 52782480, "step": 43510 }, { "epoch": 4.846308052121617, "grad_norm": 0.1093803197145462, "learning_rate": 3.065611094813665e-05, "loss": 0.466, "num_input_tokens_seen": 52788528, "step": 43515 }, { "epoch": 4.8468649070052345, "grad_norm": 0.11002590507268906, "learning_rate": 3.0651377369210535e-05, "loss": 0.4549, "num_input_tokens_seen": 52794736, "step": 43520 }, { "epoch": 4.847421761888851, "grad_norm": 0.09830392152070999, "learning_rate": 3.064664357675728e-05, "loss": 0.4744, "num_input_tokens_seen": 52800496, "step": 43525 }, { "epoch": 4.847978616772469, "grad_norm": 0.10917149484157562, "learning_rate": 3.064190957095573e-05, "loss": 0.4558, "num_input_tokens_seen": 52806640, "step": 43530 }, { "epoch": 4.848535471656087, "grad_norm": 0.09729062020778656, "learning_rate": 3.0637175351984755e-05, "loss": 0.4541, "num_input_tokens_seen": 52812368, "step": 43535 }, { "epoch": 4.8490923265397035, "grad_norm": 0.11287972331047058, "learning_rate": 3.063244092002323e-05, "loss": 0.4691, "num_input_tokens_seen": 52818448, "step": 43540 }, { "epoch": 4.849649181423321, "grad_norm": 0.09450281411409378, "learning_rate": 3.062770627525004e-05, "loss": 0.4648, "num_input_tokens_seen": 52824624, "step": 43545 }, { "epoch": 4.850206036306938, "grad_norm": 0.13769379258155823, "learning_rate": 3.062297141784407e-05, "loss": 0.4681, "num_input_tokens_seen": 52830032, "step": 43550 }, { "epoch": 4.850762891190556, "grad_norm": 0.08243310451507568, "learning_rate": 3.061823634798422e-05, "loss": 0.4584, "num_input_tokens_seen": 52836048, "step": 43555 }, { "epoch": 4.851319746074173, "grad_norm": 0.0933431014418602, "learning_rate": 3.0613501065849395e-05, "loss": 0.4685, "num_input_tokens_seen": 52841968, "step": 43560 }, { "epoch": 4.85187660095779, "grad_norm": 0.08561840653419495, "learning_rate": 3.0608765571618506e-05, "loss": 0.4522, "num_input_tokens_seen": 52847984, "step": 43565 }, { "epoch": 4.852433455841408, "grad_norm": 0.10927919298410416, "learning_rate": 3.060402986547049e-05, "loss": 0.4605, "num_input_tokens_seen": 52853936, "step": 43570 }, { "epoch": 4.852990310725025, "grad_norm": 0.07229360193014145, "learning_rate": 3.059929394758426e-05, "loss": 0.462, "num_input_tokens_seen": 52859664, "step": 43575 }, { "epoch": 4.853547165608642, "grad_norm": 0.08860158175230026, "learning_rate": 3.059455781813877e-05, "loss": 0.4572, "num_input_tokens_seen": 52865968, "step": 43580 }, { "epoch": 4.85410402049226, "grad_norm": 0.08679399639368057, "learning_rate": 3.058982147731295e-05, "loss": 0.4659, "num_input_tokens_seen": 52872176, "step": 43585 }, { "epoch": 4.8546608753758775, "grad_norm": 0.1311688870191574, "learning_rate": 3.058508492528576e-05, "loss": 0.4588, "num_input_tokens_seen": 52878192, "step": 43590 }, { "epoch": 4.855217730259494, "grad_norm": 0.10681227594614029, "learning_rate": 3.058034816223616e-05, "loss": 0.4667, "num_input_tokens_seen": 52884560, "step": 43595 }, { "epoch": 4.855774585143112, "grad_norm": 0.10076881945133209, "learning_rate": 3.057561118834313e-05, "loss": 0.4687, "num_input_tokens_seen": 52890960, "step": 43600 }, { "epoch": 4.856331440026729, "grad_norm": 0.0763721615076065, "learning_rate": 3.0570874003785646e-05, "loss": 0.4695, "num_input_tokens_seen": 52896976, "step": 43605 }, { "epoch": 4.8568882949103465, "grad_norm": 0.08319588750600815, "learning_rate": 3.056613660874268e-05, "loss": 0.4513, "num_input_tokens_seen": 52903088, "step": 43610 }, { "epoch": 4.857445149793964, "grad_norm": 0.13021831214427948, "learning_rate": 3.056139900339324e-05, "loss": 0.4603, "num_input_tokens_seen": 52909360, "step": 43615 }, { "epoch": 4.858002004677581, "grad_norm": 0.13396185636520386, "learning_rate": 3.055666118791632e-05, "loss": 0.4603, "num_input_tokens_seen": 52915920, "step": 43620 }, { "epoch": 4.858558859561199, "grad_norm": 0.08414249122142792, "learning_rate": 3.0551923162490936e-05, "loss": 0.4584, "num_input_tokens_seen": 52921456, "step": 43625 }, { "epoch": 4.859115714444815, "grad_norm": 0.07611899077892303, "learning_rate": 3.05471849272961e-05, "loss": 0.4636, "num_input_tokens_seen": 52927472, "step": 43630 }, { "epoch": 4.859672569328433, "grad_norm": 0.08779393881559372, "learning_rate": 3.054244648251085e-05, "loss": 0.4511, "num_input_tokens_seen": 52933680, "step": 43635 }, { "epoch": 4.860229424212051, "grad_norm": 0.07782620936632156, "learning_rate": 3.05377078283142e-05, "loss": 0.4668, "num_input_tokens_seen": 52939888, "step": 43640 }, { "epoch": 4.8607862790956675, "grad_norm": 0.08572837710380554, "learning_rate": 3.05329689648852e-05, "loss": 0.4579, "num_input_tokens_seen": 52946000, "step": 43645 }, { "epoch": 4.861343133979285, "grad_norm": 0.1243622750043869, "learning_rate": 3.052822989240291e-05, "loss": 0.4581, "num_input_tokens_seen": 52952048, "step": 43650 }, { "epoch": 4.861899988862902, "grad_norm": 0.11274335533380508, "learning_rate": 3.052349061104637e-05, "loss": 0.4805, "num_input_tokens_seen": 52957520, "step": 43655 }, { "epoch": 4.86245684374652, "grad_norm": 0.1313686966896057, "learning_rate": 3.0518751120994654e-05, "loss": 0.4652, "num_input_tokens_seen": 52963760, "step": 43660 }, { "epoch": 4.863013698630137, "grad_norm": 0.08129974454641342, "learning_rate": 3.051401142242684e-05, "loss": 0.4546, "num_input_tokens_seen": 52969968, "step": 43665 }, { "epoch": 4.863570553513754, "grad_norm": 0.10420089215040207, "learning_rate": 3.0509271515522002e-05, "loss": 0.4717, "num_input_tokens_seen": 52975920, "step": 43670 }, { "epoch": 4.864127408397372, "grad_norm": 0.12485865503549576, "learning_rate": 3.0504531400459234e-05, "loss": 0.474, "num_input_tokens_seen": 52982128, "step": 43675 }, { "epoch": 4.864684263280989, "grad_norm": 0.11011090874671936, "learning_rate": 3.0499791077417627e-05, "loss": 0.4744, "num_input_tokens_seen": 52988368, "step": 43680 }, { "epoch": 4.865241118164606, "grad_norm": 0.09477201104164124, "learning_rate": 3.049505054657629e-05, "loss": 0.4694, "num_input_tokens_seen": 52994640, "step": 43685 }, { "epoch": 4.865797973048224, "grad_norm": 0.07213899493217468, "learning_rate": 3.0490309808114324e-05, "loss": 0.4572, "num_input_tokens_seen": 53000624, "step": 43690 }, { "epoch": 4.866354827931841, "grad_norm": 0.13547934591770172, "learning_rate": 3.0485568862210874e-05, "loss": 0.464, "num_input_tokens_seen": 53006832, "step": 43695 }, { "epoch": 4.866911682815458, "grad_norm": 0.1120787039399147, "learning_rate": 3.0480827709045044e-05, "loss": 0.4597, "num_input_tokens_seen": 53012784, "step": 43700 }, { "epoch": 4.867468537699075, "grad_norm": 0.10682155936956406, "learning_rate": 3.047608634879598e-05, "loss": 0.4607, "num_input_tokens_seen": 53018896, "step": 43705 }, { "epoch": 4.868025392582693, "grad_norm": 0.09658907353878021, "learning_rate": 3.0471344781642825e-05, "loss": 0.4578, "num_input_tokens_seen": 53025136, "step": 43710 }, { "epoch": 4.8685822474663105, "grad_norm": 0.12405655533075333, "learning_rate": 3.0466603007764733e-05, "loss": 0.4657, "num_input_tokens_seen": 53031088, "step": 43715 }, { "epoch": 4.869139102349927, "grad_norm": 0.13116468489170074, "learning_rate": 3.0461861027340855e-05, "loss": 0.4536, "num_input_tokens_seen": 53037360, "step": 43720 }, { "epoch": 4.869695957233545, "grad_norm": 0.07560808211565018, "learning_rate": 3.045711884055037e-05, "loss": 0.4555, "num_input_tokens_seen": 53043728, "step": 43725 }, { "epoch": 4.870252812117163, "grad_norm": 0.09041846543550491, "learning_rate": 3.0452376447572444e-05, "loss": 0.466, "num_input_tokens_seen": 53050032, "step": 43730 }, { "epoch": 4.870809667000779, "grad_norm": 0.08601421117782593, "learning_rate": 3.0447633848586267e-05, "loss": 0.452, "num_input_tokens_seen": 53055280, "step": 43735 }, { "epoch": 4.871366521884397, "grad_norm": 0.08982345461845398, "learning_rate": 3.0442891043771015e-05, "loss": 0.479, "num_input_tokens_seen": 53061232, "step": 43740 }, { "epoch": 4.871923376768014, "grad_norm": 0.09174367040395737, "learning_rate": 3.043814803330591e-05, "loss": 0.4677, "num_input_tokens_seen": 53067376, "step": 43745 }, { "epoch": 4.872480231651632, "grad_norm": 0.10073483735322952, "learning_rate": 3.043340481737013e-05, "loss": 0.4601, "num_input_tokens_seen": 53073264, "step": 43750 }, { "epoch": 4.873037086535249, "grad_norm": 0.10524007678031921, "learning_rate": 3.0428661396142916e-05, "loss": 0.4609, "num_input_tokens_seen": 53079504, "step": 43755 }, { "epoch": 4.873593941418866, "grad_norm": 0.09684369713068008, "learning_rate": 3.0423917769803473e-05, "loss": 0.4596, "num_input_tokens_seen": 53085520, "step": 43760 }, { "epoch": 4.874150796302484, "grad_norm": 0.09967764467000961, "learning_rate": 3.0419173938531036e-05, "loss": 0.4694, "num_input_tokens_seen": 53091472, "step": 43765 }, { "epoch": 4.874707651186101, "grad_norm": 0.10874076187610626, "learning_rate": 3.0414429902504844e-05, "loss": 0.4645, "num_input_tokens_seen": 53097712, "step": 43770 }, { "epoch": 4.875264506069718, "grad_norm": 0.09130164980888367, "learning_rate": 3.040968566190413e-05, "loss": 0.4613, "num_input_tokens_seen": 53103792, "step": 43775 }, { "epoch": 4.875821360953336, "grad_norm": 0.06607179343700409, "learning_rate": 3.0404941216908166e-05, "loss": 0.4569, "num_input_tokens_seen": 53110032, "step": 43780 }, { "epoch": 4.876378215836953, "grad_norm": 0.09114634990692139, "learning_rate": 3.0400196567696192e-05, "loss": 0.4702, "num_input_tokens_seen": 53116144, "step": 43785 }, { "epoch": 4.87693507072057, "grad_norm": 0.09304578602313995, "learning_rate": 3.0395451714447493e-05, "loss": 0.469, "num_input_tokens_seen": 53122448, "step": 43790 }, { "epoch": 4.877491925604188, "grad_norm": 0.11939998716115952, "learning_rate": 3.0390706657341335e-05, "loss": 0.4614, "num_input_tokens_seen": 53128720, "step": 43795 }, { "epoch": 4.878048780487805, "grad_norm": 0.08980350196361542, "learning_rate": 3.0385961396557004e-05, "loss": 0.4623, "num_input_tokens_seen": 53134512, "step": 43800 }, { "epoch": 4.8786056353714224, "grad_norm": 0.09271286427974701, "learning_rate": 3.0381215932273794e-05, "loss": 0.4745, "num_input_tokens_seen": 53140272, "step": 43805 }, { "epoch": 4.879162490255039, "grad_norm": 0.09879790246486664, "learning_rate": 3.0376470264670997e-05, "loss": 0.4674, "num_input_tokens_seen": 53146736, "step": 43810 }, { "epoch": 4.879719345138657, "grad_norm": 0.11586226522922516, "learning_rate": 3.037172439392792e-05, "loss": 0.4584, "num_input_tokens_seen": 53152624, "step": 43815 }, { "epoch": 4.880276200022275, "grad_norm": 0.10924364626407623, "learning_rate": 3.0366978320223895e-05, "loss": 0.456, "num_input_tokens_seen": 53158896, "step": 43820 }, { "epoch": 4.880833054905891, "grad_norm": 0.08833719789981842, "learning_rate": 3.036223204373822e-05, "loss": 0.4648, "num_input_tokens_seen": 53165488, "step": 43825 }, { "epoch": 4.881389909789509, "grad_norm": 0.11277089267969131, "learning_rate": 3.0357485564650236e-05, "loss": 0.45, "num_input_tokens_seen": 53171600, "step": 43830 }, { "epoch": 4.881946764673126, "grad_norm": 0.08878592401742935, "learning_rate": 3.035273888313928e-05, "loss": 0.4772, "num_input_tokens_seen": 53177872, "step": 43835 }, { "epoch": 4.8825036195567435, "grad_norm": 0.09796559810638428, "learning_rate": 3.034799199938469e-05, "loss": 0.4677, "num_input_tokens_seen": 53183472, "step": 43840 }, { "epoch": 4.883060474440361, "grad_norm": 0.10359036922454834, "learning_rate": 3.0343244913565828e-05, "loss": 0.4652, "num_input_tokens_seen": 53189392, "step": 43845 }, { "epoch": 4.883617329323978, "grad_norm": 0.11001721769571304, "learning_rate": 3.033849762586205e-05, "loss": 0.4697, "num_input_tokens_seen": 53195536, "step": 43850 }, { "epoch": 4.884174184207596, "grad_norm": 0.09924774616956711, "learning_rate": 3.0333750136452723e-05, "loss": 0.4622, "num_input_tokens_seen": 53201584, "step": 43855 }, { "epoch": 4.884731039091212, "grad_norm": 0.08929271250963211, "learning_rate": 3.032900244551722e-05, "loss": 0.462, "num_input_tokens_seen": 53207792, "step": 43860 }, { "epoch": 4.88528789397483, "grad_norm": 0.09332960098981857, "learning_rate": 3.0324254553234932e-05, "loss": 0.4551, "num_input_tokens_seen": 53213360, "step": 43865 }, { "epoch": 4.885844748858448, "grad_norm": 0.10467896610498428, "learning_rate": 3.0319506459785235e-05, "loss": 0.4654, "num_input_tokens_seen": 53219440, "step": 43870 }, { "epoch": 4.886401603742065, "grad_norm": 0.07868872582912445, "learning_rate": 3.0314758165347545e-05, "loss": 0.4628, "num_input_tokens_seen": 53225488, "step": 43875 }, { "epoch": 4.886958458625682, "grad_norm": 0.1191488727927208, "learning_rate": 3.0310009670101257e-05, "loss": 0.4596, "num_input_tokens_seen": 53231248, "step": 43880 }, { "epoch": 4.887515313509299, "grad_norm": 0.11927753686904907, "learning_rate": 3.030526097422579e-05, "loss": 0.4624, "num_input_tokens_seen": 53237200, "step": 43885 }, { "epoch": 4.888072168392917, "grad_norm": 0.11255120486021042, "learning_rate": 3.030051207790056e-05, "loss": 0.4599, "num_input_tokens_seen": 53243376, "step": 43890 }, { "epoch": 4.888629023276534, "grad_norm": 0.09349595010280609, "learning_rate": 3.0295762981304998e-05, "loss": 0.4664, "num_input_tokens_seen": 53249776, "step": 43895 }, { "epoch": 4.889185878160151, "grad_norm": 0.08689219504594803, "learning_rate": 3.029101368461854e-05, "loss": 0.4668, "num_input_tokens_seen": 53255728, "step": 43900 }, { "epoch": 4.889742733043769, "grad_norm": 0.06568407267332077, "learning_rate": 3.0286264188020623e-05, "loss": 0.4518, "num_input_tokens_seen": 53261584, "step": 43905 }, { "epoch": 4.8902995879273865, "grad_norm": 0.09959103167057037, "learning_rate": 3.028151449169071e-05, "loss": 0.4503, "num_input_tokens_seen": 53267536, "step": 43910 }, { "epoch": 4.890856442811003, "grad_norm": 0.13173814117908478, "learning_rate": 3.0276764595808253e-05, "loss": 0.4578, "num_input_tokens_seen": 53273616, "step": 43915 }, { "epoch": 4.891413297694621, "grad_norm": 0.10793394595384598, "learning_rate": 3.0272014500552727e-05, "loss": 0.4534, "num_input_tokens_seen": 53279312, "step": 43920 }, { "epoch": 4.891970152578238, "grad_norm": 0.07936877012252808, "learning_rate": 3.026726420610359e-05, "loss": 0.4798, "num_input_tokens_seen": 53285648, "step": 43925 }, { "epoch": 4.892527007461855, "grad_norm": 0.06826288253068924, "learning_rate": 3.0262513712640334e-05, "loss": 0.4531, "num_input_tokens_seen": 53291984, "step": 43930 }, { "epoch": 4.893083862345473, "grad_norm": 0.09095414727926254, "learning_rate": 3.025776302034244e-05, "loss": 0.4754, "num_input_tokens_seen": 53297904, "step": 43935 }, { "epoch": 4.89364071722909, "grad_norm": 0.08941951394081116, "learning_rate": 3.025301212938941e-05, "loss": 0.4734, "num_input_tokens_seen": 53304048, "step": 43940 }, { "epoch": 4.894197572112708, "grad_norm": 0.082646943628788, "learning_rate": 3.024826103996075e-05, "loss": 0.4515, "num_input_tokens_seen": 53310000, "step": 43945 }, { "epoch": 4.894754426996325, "grad_norm": 0.09379930049180984, "learning_rate": 3.0243509752235978e-05, "loss": 0.4591, "num_input_tokens_seen": 53316048, "step": 43950 }, { "epoch": 4.895311281879942, "grad_norm": 0.09279629588127136, "learning_rate": 3.0238758266394597e-05, "loss": 0.4644, "num_input_tokens_seen": 53321872, "step": 43955 }, { "epoch": 4.89586813676356, "grad_norm": 0.08404384553432465, "learning_rate": 3.0234006582616138e-05, "loss": 0.459, "num_input_tokens_seen": 53328080, "step": 43960 }, { "epoch": 4.8964249916471765, "grad_norm": 0.09006199985742569, "learning_rate": 3.022925470108014e-05, "loss": 0.4729, "num_input_tokens_seen": 53334288, "step": 43965 }, { "epoch": 4.896981846530794, "grad_norm": 0.10456862300634384, "learning_rate": 3.0224502621966133e-05, "loss": 0.469, "num_input_tokens_seen": 53340496, "step": 43970 }, { "epoch": 4.897538701414412, "grad_norm": 0.09281881898641586, "learning_rate": 3.0219750345453678e-05, "loss": 0.4643, "num_input_tokens_seen": 53346512, "step": 43975 }, { "epoch": 4.898095556298029, "grad_norm": 0.12032032757997513, "learning_rate": 3.0214997871722334e-05, "loss": 0.4663, "num_input_tokens_seen": 53352560, "step": 43980 }, { "epoch": 4.898652411181646, "grad_norm": 0.09149809926748276, "learning_rate": 3.021024520095166e-05, "loss": 0.4677, "num_input_tokens_seen": 53358000, "step": 43985 }, { "epoch": 4.899209266065263, "grad_norm": 0.10597382485866547, "learning_rate": 3.0205492333321216e-05, "loss": 0.4617, "num_input_tokens_seen": 53364400, "step": 43990 }, { "epoch": 4.899766120948881, "grad_norm": 0.08541123569011688, "learning_rate": 3.0200739269010597e-05, "loss": 0.4669, "num_input_tokens_seen": 53370416, "step": 43995 }, { "epoch": 4.900322975832498, "grad_norm": 0.07898768037557602, "learning_rate": 3.0195986008199372e-05, "loss": 0.4635, "num_input_tokens_seen": 53376528, "step": 44000 }, { "epoch": 4.900879830716115, "grad_norm": 0.09057601541280746, "learning_rate": 3.019123255106715e-05, "loss": 0.4528, "num_input_tokens_seen": 53381872, "step": 44005 }, { "epoch": 4.901436685599733, "grad_norm": 0.10219533741474152, "learning_rate": 3.018647889779353e-05, "loss": 0.456, "num_input_tokens_seen": 53388048, "step": 44010 }, { "epoch": 4.90199354048335, "grad_norm": 0.10999327898025513, "learning_rate": 3.0181725048558113e-05, "loss": 0.4604, "num_input_tokens_seen": 53394032, "step": 44015 }, { "epoch": 4.902550395366967, "grad_norm": 0.0929664894938469, "learning_rate": 3.0176971003540523e-05, "loss": 0.4576, "num_input_tokens_seen": 53400176, "step": 44020 }, { "epoch": 4.903107250250585, "grad_norm": 0.10451270639896393, "learning_rate": 3.017221676292037e-05, "loss": 0.4706, "num_input_tokens_seen": 53406192, "step": 44025 }, { "epoch": 4.903664105134202, "grad_norm": 0.09960595518350601, "learning_rate": 3.01674623268773e-05, "loss": 0.4644, "num_input_tokens_seen": 53412432, "step": 44030 }, { "epoch": 4.9042209600178195, "grad_norm": 0.1160905584692955, "learning_rate": 3.0162707695590935e-05, "loss": 0.4611, "num_input_tokens_seen": 53418224, "step": 44035 }, { "epoch": 4.904777814901436, "grad_norm": 0.09635759145021439, "learning_rate": 3.0157952869240935e-05, "loss": 0.4575, "num_input_tokens_seen": 53423952, "step": 44040 }, { "epoch": 4.905334669785054, "grad_norm": 0.10015615075826645, "learning_rate": 3.0153197848006947e-05, "loss": 0.4548, "num_input_tokens_seen": 53429968, "step": 44045 }, { "epoch": 4.905891524668672, "grad_norm": 0.1138085201382637, "learning_rate": 3.014844263206863e-05, "loss": 0.4606, "num_input_tokens_seen": 53435280, "step": 44050 }, { "epoch": 4.906448379552288, "grad_norm": 0.10292430222034454, "learning_rate": 3.0143687221605647e-05, "loss": 0.4603, "num_input_tokens_seen": 53441232, "step": 44055 }, { "epoch": 4.907005234435906, "grad_norm": 0.10312984883785248, "learning_rate": 3.013893161679767e-05, "loss": 0.4687, "num_input_tokens_seen": 53447216, "step": 44060 }, { "epoch": 4.907562089319523, "grad_norm": 0.09035596996545792, "learning_rate": 3.01341758178244e-05, "loss": 0.4653, "num_input_tokens_seen": 53453168, "step": 44065 }, { "epoch": 4.9081189442031405, "grad_norm": 0.08387797325849533, "learning_rate": 3.012941982486551e-05, "loss": 0.4717, "num_input_tokens_seen": 53459568, "step": 44070 }, { "epoch": 4.908675799086758, "grad_norm": 0.10880093276500702, "learning_rate": 3.0124663638100702e-05, "loss": 0.4617, "num_input_tokens_seen": 53465520, "step": 44075 }, { "epoch": 4.909232653970375, "grad_norm": 0.09630077332258224, "learning_rate": 3.0119907257709678e-05, "loss": 0.4648, "num_input_tokens_seen": 53471312, "step": 44080 }, { "epoch": 4.909789508853993, "grad_norm": 0.09797138720750809, "learning_rate": 3.0115150683872158e-05, "loss": 0.4582, "num_input_tokens_seen": 53476880, "step": 44085 }, { "epoch": 4.91034636373761, "grad_norm": 0.07267415523529053, "learning_rate": 3.0110393916767838e-05, "loss": 0.4599, "num_input_tokens_seen": 53482864, "step": 44090 }, { "epoch": 4.910903218621227, "grad_norm": 0.08854754269123077, "learning_rate": 3.0105636956576466e-05, "loss": 0.4608, "num_input_tokens_seen": 53488912, "step": 44095 }, { "epoch": 4.911460073504845, "grad_norm": 0.08642296493053436, "learning_rate": 3.0100879803477772e-05, "loss": 0.4557, "num_input_tokens_seen": 53495312, "step": 44100 }, { "epoch": 4.912016928388462, "grad_norm": 0.10192402452230453, "learning_rate": 3.0096122457651486e-05, "loss": 0.4566, "num_input_tokens_seen": 53501328, "step": 44105 }, { "epoch": 4.912573783272079, "grad_norm": 0.10095708072185516, "learning_rate": 3.009136491927737e-05, "loss": 0.4661, "num_input_tokens_seen": 53507504, "step": 44110 }, { "epoch": 4.913130638155697, "grad_norm": 0.11835438758134842, "learning_rate": 3.0086607188535165e-05, "loss": 0.4614, "num_input_tokens_seen": 53513456, "step": 44115 }, { "epoch": 4.913687493039314, "grad_norm": 0.09714631736278534, "learning_rate": 3.0081849265604638e-05, "loss": 0.4608, "num_input_tokens_seen": 53519696, "step": 44120 }, { "epoch": 4.914244347922931, "grad_norm": 0.09555523097515106, "learning_rate": 3.0077091150665564e-05, "loss": 0.4578, "num_input_tokens_seen": 53525712, "step": 44125 }, { "epoch": 4.914801202806549, "grad_norm": 0.13015365600585938, "learning_rate": 3.0072332843897717e-05, "loss": 0.4459, "num_input_tokens_seen": 53532144, "step": 44130 }, { "epoch": 4.915358057690166, "grad_norm": 0.10459600389003754, "learning_rate": 3.0067574345480875e-05, "loss": 0.467, "num_input_tokens_seen": 53538000, "step": 44135 }, { "epoch": 4.9159149125737835, "grad_norm": 0.10631051659584045, "learning_rate": 3.006281565559484e-05, "loss": 0.4575, "num_input_tokens_seen": 53544208, "step": 44140 }, { "epoch": 4.9164717674574, "grad_norm": 0.07846571505069733, "learning_rate": 3.00580567744194e-05, "loss": 0.4673, "num_input_tokens_seen": 53550640, "step": 44145 }, { "epoch": 4.917028622341018, "grad_norm": 0.09098032861948013, "learning_rate": 3.0053297702134376e-05, "loss": 0.4684, "num_input_tokens_seen": 53556720, "step": 44150 }, { "epoch": 4.917585477224636, "grad_norm": 0.10346616804599762, "learning_rate": 3.0048538438919554e-05, "loss": 0.4729, "num_input_tokens_seen": 53562544, "step": 44155 }, { "epoch": 4.9181423321082525, "grad_norm": 0.08408597111701965, "learning_rate": 3.0043778984954785e-05, "loss": 0.4619, "num_input_tokens_seen": 53568688, "step": 44160 }, { "epoch": 4.91869918699187, "grad_norm": 0.06792919337749481, "learning_rate": 3.003901934041988e-05, "loss": 0.4651, "num_input_tokens_seen": 53574864, "step": 44165 }, { "epoch": 4.919256041875487, "grad_norm": 0.11523937433958054, "learning_rate": 3.0034259505494666e-05, "loss": 0.4719, "num_input_tokens_seen": 53580944, "step": 44170 }, { "epoch": 4.919812896759105, "grad_norm": 0.0869188904762268, "learning_rate": 3.0029499480359006e-05, "loss": 0.4534, "num_input_tokens_seen": 53587120, "step": 44175 }, { "epoch": 4.920369751642722, "grad_norm": 0.08296375721693039, "learning_rate": 3.0024739265192735e-05, "loss": 0.4553, "num_input_tokens_seen": 53592816, "step": 44180 }, { "epoch": 4.920926606526339, "grad_norm": 0.09483093023300171, "learning_rate": 3.0019978860175707e-05, "loss": 0.4538, "num_input_tokens_seen": 53598928, "step": 44185 }, { "epoch": 4.921483461409957, "grad_norm": 0.09573551267385483, "learning_rate": 3.0015218265487793e-05, "loss": 0.4614, "num_input_tokens_seen": 53605392, "step": 44190 }, { "epoch": 4.9220403162935735, "grad_norm": 0.12257228791713715, "learning_rate": 3.0010457481308863e-05, "loss": 0.4494, "num_input_tokens_seen": 53611280, "step": 44195 }, { "epoch": 4.922597171177191, "grad_norm": 0.11250686645507812, "learning_rate": 3.0005696507818792e-05, "loss": 0.4635, "num_input_tokens_seen": 53617552, "step": 44200 }, { "epoch": 4.923154026060809, "grad_norm": 0.10253533720970154, "learning_rate": 3.0000935345197468e-05, "loss": 0.4654, "num_input_tokens_seen": 53623696, "step": 44205 }, { "epoch": 4.923710880944426, "grad_norm": 0.12081044912338257, "learning_rate": 2.9996173993624777e-05, "loss": 0.4532, "num_input_tokens_seen": 53629808, "step": 44210 }, { "epoch": 4.924267735828043, "grad_norm": 0.10138530284166336, "learning_rate": 2.999141245328062e-05, "loss": 0.4528, "num_input_tokens_seen": 53636144, "step": 44215 }, { "epoch": 4.92482459071166, "grad_norm": 0.12151447683572769, "learning_rate": 2.9986650724344906e-05, "loss": 0.4606, "num_input_tokens_seen": 53642064, "step": 44220 }, { "epoch": 4.925381445595278, "grad_norm": 0.08876951038837433, "learning_rate": 2.9981888806997548e-05, "loss": 0.4649, "num_input_tokens_seen": 53648400, "step": 44225 }, { "epoch": 4.9259383004788955, "grad_norm": 0.08747053146362305, "learning_rate": 2.9977126701418472e-05, "loss": 0.4698, "num_input_tokens_seen": 53654960, "step": 44230 }, { "epoch": 4.926495155362512, "grad_norm": 0.09753017127513885, "learning_rate": 2.9972364407787596e-05, "loss": 0.4562, "num_input_tokens_seen": 53661040, "step": 44235 }, { "epoch": 4.92705201024613, "grad_norm": 0.08566534519195557, "learning_rate": 2.996760192628485e-05, "loss": 0.4716, "num_input_tokens_seen": 53667088, "step": 44240 }, { "epoch": 4.927608865129747, "grad_norm": 0.10080790519714355, "learning_rate": 2.9962839257090187e-05, "loss": 0.4724, "num_input_tokens_seen": 53673360, "step": 44245 }, { "epoch": 4.928165720013364, "grad_norm": 0.113766610622406, "learning_rate": 2.9958076400383555e-05, "loss": 0.4641, "num_input_tokens_seen": 53679440, "step": 44250 }, { "epoch": 4.928722574896982, "grad_norm": 0.09030026942491531, "learning_rate": 2.995331335634491e-05, "loss": 0.4719, "num_input_tokens_seen": 53685680, "step": 44255 }, { "epoch": 4.929279429780599, "grad_norm": 0.13551627099514008, "learning_rate": 2.9948550125154207e-05, "loss": 0.4624, "num_input_tokens_seen": 53691824, "step": 44260 }, { "epoch": 4.9298362846642165, "grad_norm": 0.08365654945373535, "learning_rate": 2.994378670699143e-05, "loss": 0.4517, "num_input_tokens_seen": 53698256, "step": 44265 }, { "epoch": 4.930393139547834, "grad_norm": 0.10180572420358658, "learning_rate": 2.993902310203654e-05, "loss": 0.4562, "num_input_tokens_seen": 53704080, "step": 44270 }, { "epoch": 4.930949994431451, "grad_norm": 0.10910189151763916, "learning_rate": 2.9934259310469525e-05, "loss": 0.4635, "num_input_tokens_seen": 53709424, "step": 44275 }, { "epoch": 4.931506849315069, "grad_norm": 0.09568539261817932, "learning_rate": 2.9929495332470388e-05, "loss": 0.4668, "num_input_tokens_seen": 53715408, "step": 44280 }, { "epoch": 4.9320637041986854, "grad_norm": 0.10227279365062714, "learning_rate": 2.9924731168219124e-05, "loss": 0.4548, "num_input_tokens_seen": 53721488, "step": 44285 }, { "epoch": 4.932620559082303, "grad_norm": 0.09358704090118408, "learning_rate": 2.9919966817895727e-05, "loss": 0.4634, "num_input_tokens_seen": 53727824, "step": 44290 }, { "epoch": 4.933177413965921, "grad_norm": 0.08986405283212662, "learning_rate": 2.991520228168022e-05, "loss": 0.4767, "num_input_tokens_seen": 53733552, "step": 44295 }, { "epoch": 4.933734268849538, "grad_norm": 0.11770359426736832, "learning_rate": 2.9910437559752618e-05, "loss": 0.464, "num_input_tokens_seen": 53740048, "step": 44300 }, { "epoch": 4.934291123733155, "grad_norm": 0.12302760034799576, "learning_rate": 2.9905672652292955e-05, "loss": 0.4605, "num_input_tokens_seen": 53746320, "step": 44305 }, { "epoch": 4.934847978616773, "grad_norm": 0.09049680829048157, "learning_rate": 2.9900907559481246e-05, "loss": 0.4666, "num_input_tokens_seen": 53752144, "step": 44310 }, { "epoch": 4.93540483350039, "grad_norm": 0.1159079372882843, "learning_rate": 2.989614228149755e-05, "loss": 0.4698, "num_input_tokens_seen": 53758512, "step": 44315 }, { "epoch": 4.935961688384007, "grad_norm": 0.09309565275907516, "learning_rate": 2.989137681852191e-05, "loss": 0.4677, "num_input_tokens_seen": 53763504, "step": 44320 }, { "epoch": 4.936518543267624, "grad_norm": 0.08959271758794785, "learning_rate": 2.988661117073438e-05, "loss": 0.4653, "num_input_tokens_seen": 53769392, "step": 44325 }, { "epoch": 4.937075398151242, "grad_norm": 0.12218350917100906, "learning_rate": 2.9881845338315017e-05, "loss": 0.467, "num_input_tokens_seen": 53775696, "step": 44330 }, { "epoch": 4.9376322530348595, "grad_norm": 0.11278962343931198, "learning_rate": 2.987707932144389e-05, "loss": 0.4512, "num_input_tokens_seen": 53781968, "step": 44335 }, { "epoch": 4.938189107918476, "grad_norm": 0.08135528117418289, "learning_rate": 2.9872313120301075e-05, "loss": 0.4621, "num_input_tokens_seen": 53787888, "step": 44340 }, { "epoch": 4.938745962802094, "grad_norm": 0.07839881628751755, "learning_rate": 2.9867546735066665e-05, "loss": 0.4594, "num_input_tokens_seen": 53793744, "step": 44345 }, { "epoch": 4.939302817685711, "grad_norm": 0.08669614046812057, "learning_rate": 2.9862780165920734e-05, "loss": 0.4689, "num_input_tokens_seen": 53800144, "step": 44350 }, { "epoch": 4.9398596725693285, "grad_norm": 0.08220319449901581, "learning_rate": 2.985801341304339e-05, "loss": 0.4622, "num_input_tokens_seen": 53806224, "step": 44355 }, { "epoch": 4.940416527452946, "grad_norm": 0.11895821243524551, "learning_rate": 2.9853246476614726e-05, "loss": 0.4662, "num_input_tokens_seen": 53812368, "step": 44360 }, { "epoch": 4.940973382336563, "grad_norm": 0.10072203725576401, "learning_rate": 2.984847935681486e-05, "loss": 0.4662, "num_input_tokens_seen": 53818416, "step": 44365 }, { "epoch": 4.941530237220181, "grad_norm": 0.09488577395677567, "learning_rate": 2.9843712053823902e-05, "loss": 0.4458, "num_input_tokens_seen": 53824624, "step": 44370 }, { "epoch": 4.942087092103797, "grad_norm": 0.08608295768499374, "learning_rate": 2.9838944567821987e-05, "loss": 0.465, "num_input_tokens_seen": 53830672, "step": 44375 }, { "epoch": 4.942643946987415, "grad_norm": 0.14242219924926758, "learning_rate": 2.9834176898989234e-05, "loss": 0.4738, "num_input_tokens_seen": 53836912, "step": 44380 }, { "epoch": 4.943200801871033, "grad_norm": 0.09703254699707031, "learning_rate": 2.982940904750579e-05, "loss": 0.4553, "num_input_tokens_seen": 53842960, "step": 44385 }, { "epoch": 4.9437576567546495, "grad_norm": 0.11206017434597015, "learning_rate": 2.982464101355179e-05, "loss": 0.4649, "num_input_tokens_seen": 53849072, "step": 44390 }, { "epoch": 4.944314511638267, "grad_norm": 0.12007743865251541, "learning_rate": 2.9819872797307396e-05, "loss": 0.464, "num_input_tokens_seen": 53855376, "step": 44395 }, { "epoch": 4.944871366521884, "grad_norm": 0.11002648621797562, "learning_rate": 2.9815104398952765e-05, "loss": 0.4545, "num_input_tokens_seen": 53861584, "step": 44400 }, { "epoch": 4.945428221405502, "grad_norm": 0.10126951336860657, "learning_rate": 2.9810335818668044e-05, "loss": 0.4674, "num_input_tokens_seen": 53867888, "step": 44405 }, { "epoch": 4.945985076289119, "grad_norm": 0.11379831284284592, "learning_rate": 2.9805567056633434e-05, "loss": 0.4618, "num_input_tokens_seen": 53874064, "step": 44410 }, { "epoch": 4.946541931172736, "grad_norm": 0.11024339497089386, "learning_rate": 2.98007981130291e-05, "loss": 0.4692, "num_input_tokens_seen": 53880624, "step": 44415 }, { "epoch": 4.947098786056354, "grad_norm": 0.08207754790782928, "learning_rate": 2.9796028988035223e-05, "loss": 0.46, "num_input_tokens_seen": 53886704, "step": 44420 }, { "epoch": 4.947655640939971, "grad_norm": 0.08923857659101486, "learning_rate": 2.9791259681832007e-05, "loss": 0.4667, "num_input_tokens_seen": 53892880, "step": 44425 }, { "epoch": 4.948212495823588, "grad_norm": 0.07943820208311081, "learning_rate": 2.9786490194599643e-05, "loss": 0.4522, "num_input_tokens_seen": 53899088, "step": 44430 }, { "epoch": 4.948769350707206, "grad_norm": 0.11558406800031662, "learning_rate": 2.9781720526518338e-05, "loss": 0.4674, "num_input_tokens_seen": 53905072, "step": 44435 }, { "epoch": 4.949326205590823, "grad_norm": 0.09455787390470505, "learning_rate": 2.977695067776831e-05, "loss": 0.4728, "num_input_tokens_seen": 53911376, "step": 44440 }, { "epoch": 4.94988306047444, "grad_norm": 0.10417719930410385, "learning_rate": 2.9772180648529778e-05, "loss": 0.4519, "num_input_tokens_seen": 53917584, "step": 44445 }, { "epoch": 4.950439915358058, "grad_norm": 0.07834180444478989, "learning_rate": 2.9767410438982966e-05, "loss": 0.4458, "num_input_tokens_seen": 53923664, "step": 44450 }, { "epoch": 4.950996770241675, "grad_norm": 0.111297108232975, "learning_rate": 2.976264004930811e-05, "loss": 0.4698, "num_input_tokens_seen": 53929520, "step": 44455 }, { "epoch": 4.9515536251252925, "grad_norm": 0.1403367817401886, "learning_rate": 2.9757869479685448e-05, "loss": 0.4642, "num_input_tokens_seen": 53935920, "step": 44460 }, { "epoch": 4.952110480008909, "grad_norm": 0.14859658479690552, "learning_rate": 2.975309873029523e-05, "loss": 0.4585, "num_input_tokens_seen": 53941904, "step": 44465 }, { "epoch": 4.952667334892527, "grad_norm": 0.09297604113817215, "learning_rate": 2.974832780131771e-05, "loss": 0.4582, "num_input_tokens_seen": 53947696, "step": 44470 }, { "epoch": 4.953224189776145, "grad_norm": 0.08846582472324371, "learning_rate": 2.9743556692933155e-05, "loss": 0.4642, "num_input_tokens_seen": 53954000, "step": 44475 }, { "epoch": 4.953781044659761, "grad_norm": 0.1329496204853058, "learning_rate": 2.9738785405321823e-05, "loss": 0.4733, "num_input_tokens_seen": 53960144, "step": 44480 }, { "epoch": 4.954337899543379, "grad_norm": 0.10285625606775284, "learning_rate": 2.973401393866399e-05, "loss": 0.4645, "num_input_tokens_seen": 53966288, "step": 44485 }, { "epoch": 4.954894754426997, "grad_norm": 0.09830950945615768, "learning_rate": 2.972924229313994e-05, "loss": 0.4636, "num_input_tokens_seen": 53972688, "step": 44490 }, { "epoch": 4.955451609310614, "grad_norm": 0.09147349745035172, "learning_rate": 2.972447046892996e-05, "loss": 0.4721, "num_input_tokens_seen": 53979152, "step": 44495 }, { "epoch": 4.956008464194231, "grad_norm": 0.13154229521751404, "learning_rate": 2.9719698466214345e-05, "loss": 0.4615, "num_input_tokens_seen": 53985104, "step": 44500 }, { "epoch": 4.956565319077848, "grad_norm": 0.09562753885984421, "learning_rate": 2.9714926285173405e-05, "loss": 0.4708, "num_input_tokens_seen": 53991472, "step": 44505 }, { "epoch": 4.957122173961466, "grad_norm": 0.08422216773033142, "learning_rate": 2.9710153925987433e-05, "loss": 0.4661, "num_input_tokens_seen": 53997616, "step": 44510 }, { "epoch": 4.957679028845083, "grad_norm": 0.10395125299692154, "learning_rate": 2.970538138883675e-05, "loss": 0.4576, "num_input_tokens_seen": 54003824, "step": 44515 }, { "epoch": 4.9582358837287, "grad_norm": 0.1153416559100151, "learning_rate": 2.9700608673901686e-05, "loss": 0.4539, "num_input_tokens_seen": 54009968, "step": 44520 }, { "epoch": 4.958792738612318, "grad_norm": 0.17231978476047516, "learning_rate": 2.969583578136255e-05, "loss": 0.4546, "num_input_tokens_seen": 54016048, "step": 44525 }, { "epoch": 4.959349593495935, "grad_norm": 0.12733648717403412, "learning_rate": 2.9691062711399696e-05, "loss": 0.4665, "num_input_tokens_seen": 54022480, "step": 44530 }, { "epoch": 4.959906448379552, "grad_norm": 0.11910878866910934, "learning_rate": 2.9686289464193466e-05, "loss": 0.4625, "num_input_tokens_seen": 54028400, "step": 44535 }, { "epoch": 4.96046330326317, "grad_norm": 0.08723431080579758, "learning_rate": 2.9681516039924194e-05, "loss": 0.4578, "num_input_tokens_seen": 54034864, "step": 44540 }, { "epoch": 4.961020158146787, "grad_norm": 0.10518256574869156, "learning_rate": 2.9676742438772253e-05, "loss": 0.4546, "num_input_tokens_seen": 54041008, "step": 44545 }, { "epoch": 4.961577013030404, "grad_norm": 0.1106930673122406, "learning_rate": 2.9671968660917988e-05, "loss": 0.4677, "num_input_tokens_seen": 54046768, "step": 44550 }, { "epoch": 4.962133867914021, "grad_norm": 0.10723117738962173, "learning_rate": 2.966719470654178e-05, "loss": 0.4612, "num_input_tokens_seen": 54052528, "step": 44555 }, { "epoch": 4.962690722797639, "grad_norm": 0.11248353123664856, "learning_rate": 2.9662420575823986e-05, "loss": 0.459, "num_input_tokens_seen": 54058160, "step": 44560 }, { "epoch": 4.963247577681257, "grad_norm": 0.09621356427669525, "learning_rate": 2.965764626894501e-05, "loss": 0.4667, "num_input_tokens_seen": 54064528, "step": 44565 }, { "epoch": 4.963804432564873, "grad_norm": 0.11625077575445175, "learning_rate": 2.9652871786085233e-05, "loss": 0.4834, "num_input_tokens_seen": 54070384, "step": 44570 }, { "epoch": 4.964361287448491, "grad_norm": 0.0838974118232727, "learning_rate": 2.964809712742505e-05, "loss": 0.4733, "num_input_tokens_seen": 54076752, "step": 44575 }, { "epoch": 4.964918142332108, "grad_norm": 0.13665170967578888, "learning_rate": 2.964332229314486e-05, "loss": 0.4595, "num_input_tokens_seen": 54081456, "step": 44580 }, { "epoch": 4.9654749972157255, "grad_norm": 0.13184136152267456, "learning_rate": 2.963854728342507e-05, "loss": 0.4846, "num_input_tokens_seen": 54087344, "step": 44585 }, { "epoch": 4.966031852099343, "grad_norm": 0.10030379891395569, "learning_rate": 2.96337720984461e-05, "loss": 0.464, "num_input_tokens_seen": 54093552, "step": 44590 }, { "epoch": 4.96658870698296, "grad_norm": 0.07968151569366455, "learning_rate": 2.9628996738388378e-05, "loss": 0.4667, "num_input_tokens_seen": 54099760, "step": 44595 }, { "epoch": 4.967145561866578, "grad_norm": 0.09739430248737335, "learning_rate": 2.9624221203432317e-05, "loss": 0.4765, "num_input_tokens_seen": 54105840, "step": 44600 }, { "epoch": 4.967702416750194, "grad_norm": 0.08121868222951889, "learning_rate": 2.9619445493758364e-05, "loss": 0.4598, "num_input_tokens_seen": 54111856, "step": 44605 }, { "epoch": 4.968259271633812, "grad_norm": 0.11421090364456177, "learning_rate": 2.9614669609546953e-05, "loss": 0.4649, "num_input_tokens_seen": 54117968, "step": 44610 }, { "epoch": 4.96881612651743, "grad_norm": 0.11885014176368713, "learning_rate": 2.960989355097854e-05, "loss": 0.4648, "num_input_tokens_seen": 54124080, "step": 44615 }, { "epoch": 4.9693729814010466, "grad_norm": 0.07452992349863052, "learning_rate": 2.960511731823357e-05, "loss": 0.4658, "num_input_tokens_seen": 54130000, "step": 44620 }, { "epoch": 4.969929836284664, "grad_norm": 0.08547558635473251, "learning_rate": 2.9600340911492515e-05, "loss": 0.4598, "num_input_tokens_seen": 54136144, "step": 44625 }, { "epoch": 4.970486691168282, "grad_norm": 0.08791603147983551, "learning_rate": 2.9595564330935833e-05, "loss": 0.4515, "num_input_tokens_seen": 54142128, "step": 44630 }, { "epoch": 4.971043546051899, "grad_norm": 0.12353704869747162, "learning_rate": 2.9590787576744004e-05, "loss": 0.4582, "num_input_tokens_seen": 54148496, "step": 44635 }, { "epoch": 4.971600400935516, "grad_norm": 0.09221113473176956, "learning_rate": 2.9586010649097517e-05, "loss": 0.4691, "num_input_tokens_seen": 54154768, "step": 44640 }, { "epoch": 4.972157255819133, "grad_norm": 0.1034606546163559, "learning_rate": 2.958123354817685e-05, "loss": 0.4645, "num_input_tokens_seen": 54160304, "step": 44645 }, { "epoch": 4.972714110702751, "grad_norm": 0.11112132668495178, "learning_rate": 2.9576456274162488e-05, "loss": 0.4638, "num_input_tokens_seen": 54166608, "step": 44650 }, { "epoch": 4.9732709655863685, "grad_norm": 0.10854139924049377, "learning_rate": 2.957167882723495e-05, "loss": 0.4575, "num_input_tokens_seen": 54172816, "step": 44655 }, { "epoch": 4.973827820469985, "grad_norm": 0.08209347724914551, "learning_rate": 2.9566901207574733e-05, "loss": 0.4648, "num_input_tokens_seen": 54178608, "step": 44660 }, { "epoch": 4.974384675353603, "grad_norm": 0.11472422629594803, "learning_rate": 2.9562123415362354e-05, "loss": 0.458, "num_input_tokens_seen": 54184688, "step": 44665 }, { "epoch": 4.974941530237221, "grad_norm": 0.08563532680273056, "learning_rate": 2.955734545077833e-05, "loss": 0.4572, "num_input_tokens_seen": 54190768, "step": 44670 }, { "epoch": 4.975498385120837, "grad_norm": 0.11106402426958084, "learning_rate": 2.9552567314003186e-05, "loss": 0.462, "num_input_tokens_seen": 54196944, "step": 44675 }, { "epoch": 4.976055240004455, "grad_norm": 0.06909964233636856, "learning_rate": 2.9547789005217464e-05, "loss": 0.4575, "num_input_tokens_seen": 54202416, "step": 44680 }, { "epoch": 4.976612094888072, "grad_norm": 0.12092449516057968, "learning_rate": 2.95430105246017e-05, "loss": 0.4737, "num_input_tokens_seen": 54208496, "step": 44685 }, { "epoch": 4.9771689497716896, "grad_norm": 0.09925156086683273, "learning_rate": 2.9538231872336436e-05, "loss": 0.4605, "num_input_tokens_seen": 54214640, "step": 44690 }, { "epoch": 4.977725804655307, "grad_norm": 0.1367839276790619, "learning_rate": 2.9533453048602235e-05, "loss": 0.4564, "num_input_tokens_seen": 54220784, "step": 44695 }, { "epoch": 4.978282659538924, "grad_norm": 0.06942339986562729, "learning_rate": 2.9528674053579648e-05, "loss": 0.4704, "num_input_tokens_seen": 54226832, "step": 44700 }, { "epoch": 4.978839514422542, "grad_norm": 0.06724389642477036, "learning_rate": 2.9523894887449237e-05, "loss": 0.4587, "num_input_tokens_seen": 54233104, "step": 44705 }, { "epoch": 4.9793963693061585, "grad_norm": 0.10356280207633972, "learning_rate": 2.9519115550391578e-05, "loss": 0.4637, "num_input_tokens_seen": 54239216, "step": 44710 }, { "epoch": 4.979953224189776, "grad_norm": 0.08023888617753983, "learning_rate": 2.951433604258726e-05, "loss": 0.4644, "num_input_tokens_seen": 54245040, "step": 44715 }, { "epoch": 4.980510079073394, "grad_norm": 0.11644718796014786, "learning_rate": 2.9509556364216855e-05, "loss": 0.4589, "num_input_tokens_seen": 54251120, "step": 44720 }, { "epoch": 4.981066933957011, "grad_norm": 0.07991571724414825, "learning_rate": 2.9504776515460965e-05, "loss": 0.4571, "num_input_tokens_seen": 54257104, "step": 44725 }, { "epoch": 4.981623788840628, "grad_norm": 0.09156963229179382, "learning_rate": 2.9499996496500178e-05, "loss": 0.4671, "num_input_tokens_seen": 54263248, "step": 44730 }, { "epoch": 4.982180643724245, "grad_norm": 0.11999507248401642, "learning_rate": 2.94952163075151e-05, "loss": 0.4539, "num_input_tokens_seen": 54269104, "step": 44735 }, { "epoch": 4.982737498607863, "grad_norm": 0.09353317320346832, "learning_rate": 2.9490435948686344e-05, "loss": 0.4668, "num_input_tokens_seen": 54275440, "step": 44740 }, { "epoch": 4.98329435349148, "grad_norm": 0.0975470095872879, "learning_rate": 2.9485655420194524e-05, "loss": 0.468, "num_input_tokens_seen": 54281744, "step": 44745 }, { "epoch": 4.983851208375097, "grad_norm": 0.08419169485569, "learning_rate": 2.9480874722220274e-05, "loss": 0.4703, "num_input_tokens_seen": 54287952, "step": 44750 }, { "epoch": 4.984408063258715, "grad_norm": 0.07484105974435806, "learning_rate": 2.9476093854944216e-05, "loss": 0.4728, "num_input_tokens_seen": 54293840, "step": 44755 }, { "epoch": 4.984964918142332, "grad_norm": 0.07829780876636505, "learning_rate": 2.9471312818546985e-05, "loss": 0.4718, "num_input_tokens_seen": 54299728, "step": 44760 }, { "epoch": 4.985521773025949, "grad_norm": 0.07797682285308838, "learning_rate": 2.946653161320923e-05, "loss": 0.4557, "num_input_tokens_seen": 54305520, "step": 44765 }, { "epoch": 4.986078627909567, "grad_norm": 0.1080130860209465, "learning_rate": 2.9461750239111597e-05, "loss": 0.4629, "num_input_tokens_seen": 54311568, "step": 44770 }, { "epoch": 4.986635482793184, "grad_norm": 0.07609304040670395, "learning_rate": 2.945696869643474e-05, "loss": 0.4559, "num_input_tokens_seen": 54317552, "step": 44775 }, { "epoch": 4.9871923376768015, "grad_norm": 0.12109445780515671, "learning_rate": 2.945218698535932e-05, "loss": 0.4666, "num_input_tokens_seen": 54323792, "step": 44780 }, { "epoch": 4.987749192560418, "grad_norm": 0.09761925786733627, "learning_rate": 2.944740510606601e-05, "loss": 0.4577, "num_input_tokens_seen": 54329808, "step": 44785 }, { "epoch": 4.988306047444036, "grad_norm": 0.12247953563928604, "learning_rate": 2.9442623058735485e-05, "loss": 0.4779, "num_input_tokens_seen": 54335952, "step": 44790 }, { "epoch": 4.988862902327654, "grad_norm": 0.09437990933656693, "learning_rate": 2.9437840843548424e-05, "loss": 0.4616, "num_input_tokens_seen": 54342000, "step": 44795 }, { "epoch": 4.98941975721127, "grad_norm": 0.08983977884054184, "learning_rate": 2.9433058460685516e-05, "loss": 0.4543, "num_input_tokens_seen": 54347792, "step": 44800 }, { "epoch": 4.989976612094888, "grad_norm": 0.08537017554044724, "learning_rate": 2.9428275910327453e-05, "loss": 0.4553, "num_input_tokens_seen": 54353520, "step": 44805 }, { "epoch": 4.990533466978506, "grad_norm": 0.10354418307542801, "learning_rate": 2.9423493192654928e-05, "loss": 0.4554, "num_input_tokens_seen": 54359824, "step": 44810 }, { "epoch": 4.9910903218621225, "grad_norm": 0.14097847044467926, "learning_rate": 2.9418710307848657e-05, "loss": 0.4571, "num_input_tokens_seen": 54365776, "step": 44815 }, { "epoch": 4.99164717674574, "grad_norm": 0.12473467737436295, "learning_rate": 2.9413927256089353e-05, "loss": 0.4504, "num_input_tokens_seen": 54371792, "step": 44820 }, { "epoch": 4.992204031629358, "grad_norm": 0.0763528048992157, "learning_rate": 2.940914403755773e-05, "loss": 0.4685, "num_input_tokens_seen": 54377296, "step": 44825 }, { "epoch": 4.992760886512975, "grad_norm": 0.07167129963636398, "learning_rate": 2.9404360652434514e-05, "loss": 0.4767, "num_input_tokens_seen": 54383376, "step": 44830 }, { "epoch": 4.993317741396592, "grad_norm": 0.06449498236179352, "learning_rate": 2.9399577100900444e-05, "loss": 0.4708, "num_input_tokens_seen": 54389456, "step": 44835 }, { "epoch": 4.993874596280209, "grad_norm": 0.10782019793987274, "learning_rate": 2.9394793383136244e-05, "loss": 0.4683, "num_input_tokens_seen": 54395632, "step": 44840 }, { "epoch": 4.994431451163827, "grad_norm": 0.09335813671350479, "learning_rate": 2.9390009499322673e-05, "loss": 0.4595, "num_input_tokens_seen": 54401552, "step": 44845 }, { "epoch": 4.9949883060474445, "grad_norm": 0.10606344789266586, "learning_rate": 2.9385225449640474e-05, "loss": 0.4594, "num_input_tokens_seen": 54407792, "step": 44850 }, { "epoch": 4.995545160931061, "grad_norm": 0.079408660531044, "learning_rate": 2.93804412342704e-05, "loss": 0.4629, "num_input_tokens_seen": 54414032, "step": 44855 }, { "epoch": 4.996102015814679, "grad_norm": 0.08481308072805405, "learning_rate": 2.9375656853393223e-05, "loss": 0.4667, "num_input_tokens_seen": 54420240, "step": 44860 }, { "epoch": 4.996658870698296, "grad_norm": 0.10154251754283905, "learning_rate": 2.9370872307189702e-05, "loss": 0.464, "num_input_tokens_seen": 54426608, "step": 44865 }, { "epoch": 4.997215725581913, "grad_norm": 0.09903170168399811, "learning_rate": 2.9366087595840618e-05, "loss": 0.4644, "num_input_tokens_seen": 54432880, "step": 44870 }, { "epoch": 4.997772580465531, "grad_norm": 0.10837175697088242, "learning_rate": 2.9361302719526755e-05, "loss": 0.459, "num_input_tokens_seen": 54438992, "step": 44875 }, { "epoch": 4.998329435349148, "grad_norm": 0.10033831745386124, "learning_rate": 2.9356517678428904e-05, "loss": 0.4651, "num_input_tokens_seen": 54445168, "step": 44880 }, { "epoch": 4.9988862902327655, "grad_norm": 0.0886281356215477, "learning_rate": 2.935173247272785e-05, "loss": 0.4566, "num_input_tokens_seen": 54450992, "step": 44885 }, { "epoch": 4.999443145116382, "grad_norm": 0.10775350034236908, "learning_rate": 2.9346947102604398e-05, "loss": 0.4566, "num_input_tokens_seen": 54457104, "step": 44890 }, { "epoch": 5.0, "grad_norm": 0.26546868681907654, "learning_rate": 2.9342161568239356e-05, "loss": 0.4745, "num_input_tokens_seen": 54462384, "step": 44895 }, { "epoch": 5.000556854883618, "grad_norm": 0.11097326129674911, "learning_rate": 2.933737586981353e-05, "loss": 0.4502, "num_input_tokens_seen": 54468496, "step": 44900 }, { "epoch": 5.000556854883618, "eval_loss": 0.4639337360858917, "eval_runtime": 113.0678, "eval_samples_per_second": 35.297, "eval_steps_per_second": 8.827, "num_input_tokens_seen": 54468496, "step": 44900 }, { "epoch": 5.0011137097672345, "grad_norm": 0.10384614020586014, "learning_rate": 2.9332590007507745e-05, "loss": 0.4714, "num_input_tokens_seen": 54474608, "step": 44905 }, { "epoch": 5.001670564650852, "grad_norm": 0.0893130749464035, "learning_rate": 2.932780398150282e-05, "loss": 0.4431, "num_input_tokens_seen": 54480752, "step": 44910 }, { "epoch": 5.002227419534469, "grad_norm": 0.0924501121044159, "learning_rate": 2.93230177919796e-05, "loss": 0.4529, "num_input_tokens_seen": 54486896, "step": 44915 }, { "epoch": 5.002784274418087, "grad_norm": 0.07587375491857529, "learning_rate": 2.9318231439118913e-05, "loss": 0.4696, "num_input_tokens_seen": 54493104, "step": 44920 }, { "epoch": 5.003341129301704, "grad_norm": 0.09800572693347931, "learning_rate": 2.93134449231016e-05, "loss": 0.4576, "num_input_tokens_seen": 54499152, "step": 44925 }, { "epoch": 5.003897984185321, "grad_norm": 0.10412488132715225, "learning_rate": 2.9308658244108517e-05, "loss": 0.4675, "num_input_tokens_seen": 54505168, "step": 44930 }, { "epoch": 5.004454839068939, "grad_norm": 0.08476253598928452, "learning_rate": 2.9303871402320514e-05, "loss": 0.4546, "num_input_tokens_seen": 54511408, "step": 44935 }, { "epoch": 5.005011693952556, "grad_norm": 0.11729508638381958, "learning_rate": 2.9299084397918465e-05, "loss": 0.4558, "num_input_tokens_seen": 54517200, "step": 44940 }, { "epoch": 5.005568548836173, "grad_norm": 0.09223143011331558, "learning_rate": 2.9294297231083223e-05, "loss": 0.4556, "num_input_tokens_seen": 54523344, "step": 44945 }, { "epoch": 5.006125403719791, "grad_norm": 0.09458740800619125, "learning_rate": 2.9289509901995678e-05, "loss": 0.4586, "num_input_tokens_seen": 54529488, "step": 44950 }, { "epoch": 5.006682258603408, "grad_norm": 0.13047878444194794, "learning_rate": 2.9284722410836697e-05, "loss": 0.4607, "num_input_tokens_seen": 54535600, "step": 44955 }, { "epoch": 5.007239113487025, "grad_norm": 0.10133469104766846, "learning_rate": 2.9279934757787164e-05, "loss": 0.4652, "num_input_tokens_seen": 54541648, "step": 44960 }, { "epoch": 5.007795968370643, "grad_norm": 0.11130436509847641, "learning_rate": 2.9275146943027994e-05, "loss": 0.4548, "num_input_tokens_seen": 54547792, "step": 44965 }, { "epoch": 5.00835282325426, "grad_norm": 0.11894756555557251, "learning_rate": 2.9270358966740063e-05, "loss": 0.4473, "num_input_tokens_seen": 54553680, "step": 44970 }, { "epoch": 5.0089096781378775, "grad_norm": 0.11847381293773651, "learning_rate": 2.926557082910429e-05, "loss": 0.4644, "num_input_tokens_seen": 54559440, "step": 44975 }, { "epoch": 5.009466533021494, "grad_norm": 0.08982004225254059, "learning_rate": 2.926078253030158e-05, "loss": 0.4613, "num_input_tokens_seen": 54565520, "step": 44980 }, { "epoch": 5.010023387905112, "grad_norm": 0.0924738347530365, "learning_rate": 2.9255994070512853e-05, "loss": 0.454, "num_input_tokens_seen": 54571696, "step": 44985 }, { "epoch": 5.01058024278873, "grad_norm": 0.09663229435682297, "learning_rate": 2.9251205449919035e-05, "loss": 0.4584, "num_input_tokens_seen": 54577744, "step": 44990 }, { "epoch": 5.011137097672346, "grad_norm": 0.09271819144487381, "learning_rate": 2.9246416668701038e-05, "loss": 0.4597, "num_input_tokens_seen": 54584144, "step": 44995 }, { "epoch": 5.011693952555964, "grad_norm": 0.06761470437049866, "learning_rate": 2.9241627727039822e-05, "loss": 0.4495, "num_input_tokens_seen": 54589808, "step": 45000 }, { "epoch": 5.012250807439581, "grad_norm": 0.07747544348239899, "learning_rate": 2.9236838625116314e-05, "loss": 0.4679, "num_input_tokens_seen": 54596016, "step": 45005 }, { "epoch": 5.0128076623231985, "grad_norm": 0.12199078500270844, "learning_rate": 2.923204936311147e-05, "loss": 0.4695, "num_input_tokens_seen": 54602224, "step": 45010 }, { "epoch": 5.013364517206816, "grad_norm": 0.09390327334403992, "learning_rate": 2.922725994120623e-05, "loss": 0.4539, "num_input_tokens_seen": 54608144, "step": 45015 }, { "epoch": 5.013921372090433, "grad_norm": 0.12019745260477066, "learning_rate": 2.9222470359581568e-05, "loss": 0.454, "num_input_tokens_seen": 54614096, "step": 45020 }, { "epoch": 5.014478226974051, "grad_norm": 0.07475230097770691, "learning_rate": 2.921768061841844e-05, "loss": 0.4677, "num_input_tokens_seen": 54620304, "step": 45025 }, { "epoch": 5.015035081857668, "grad_norm": 0.07482220977544785, "learning_rate": 2.921289071789783e-05, "loss": 0.4639, "num_input_tokens_seen": 54626544, "step": 45030 }, { "epoch": 5.015591936741285, "grad_norm": 0.09836756438016891, "learning_rate": 2.92081006582007e-05, "loss": 0.4621, "num_input_tokens_seen": 54632656, "step": 45035 }, { "epoch": 5.016148791624903, "grad_norm": 0.10025565326213837, "learning_rate": 2.920331043950805e-05, "loss": 0.4837, "num_input_tokens_seen": 54638896, "step": 45040 }, { "epoch": 5.01670564650852, "grad_norm": 0.14629000425338745, "learning_rate": 2.9198520062000857e-05, "loss": 0.4576, "num_input_tokens_seen": 54645232, "step": 45045 }, { "epoch": 5.017262501392137, "grad_norm": 0.08383304625749588, "learning_rate": 2.9193729525860118e-05, "loss": 0.4576, "num_input_tokens_seen": 54650992, "step": 45050 }, { "epoch": 5.017819356275755, "grad_norm": 0.10487378388643265, "learning_rate": 2.9188938831266843e-05, "loss": 0.4658, "num_input_tokens_seen": 54657264, "step": 45055 }, { "epoch": 5.018376211159372, "grad_norm": 0.10517245531082153, "learning_rate": 2.918414797840203e-05, "loss": 0.4703, "num_input_tokens_seen": 54663312, "step": 45060 }, { "epoch": 5.018933066042989, "grad_norm": 0.09651871025562286, "learning_rate": 2.9179356967446704e-05, "loss": 0.4616, "num_input_tokens_seen": 54669264, "step": 45065 }, { "epoch": 5.019489920926606, "grad_norm": 0.09555751830339432, "learning_rate": 2.917456579858187e-05, "loss": 0.4753, "num_input_tokens_seen": 54675472, "step": 45070 }, { "epoch": 5.020046775810224, "grad_norm": 0.12089627981185913, "learning_rate": 2.916977447198857e-05, "loss": 0.4499, "num_input_tokens_seen": 54681200, "step": 45075 }, { "epoch": 5.0206036306938415, "grad_norm": 0.07974530756473541, "learning_rate": 2.916498298784782e-05, "loss": 0.4616, "num_input_tokens_seen": 54687440, "step": 45080 }, { "epoch": 5.021160485577458, "grad_norm": 0.07613670080900192, "learning_rate": 2.916019134634067e-05, "loss": 0.476, "num_input_tokens_seen": 54693776, "step": 45085 }, { "epoch": 5.021717340461076, "grad_norm": 0.0713738277554512, "learning_rate": 2.915539954764816e-05, "loss": 0.4599, "num_input_tokens_seen": 54699664, "step": 45090 }, { "epoch": 5.022274195344693, "grad_norm": 0.11521059274673462, "learning_rate": 2.9150607591951334e-05, "loss": 0.4478, "num_input_tokens_seen": 54706032, "step": 45095 }, { "epoch": 5.0228310502283104, "grad_norm": 0.10662630200386047, "learning_rate": 2.914581547943126e-05, "loss": 0.453, "num_input_tokens_seen": 54711792, "step": 45100 }, { "epoch": 5.023387905111928, "grad_norm": 0.09541147202253342, "learning_rate": 2.9141023210268987e-05, "loss": 0.4597, "num_input_tokens_seen": 54718096, "step": 45105 }, { "epoch": 5.023944759995545, "grad_norm": 0.09025654941797256, "learning_rate": 2.9136230784645585e-05, "loss": 0.4743, "num_input_tokens_seen": 54724240, "step": 45110 }, { "epoch": 5.024501614879163, "grad_norm": 0.12133973836898804, "learning_rate": 2.9131438202742124e-05, "loss": 0.4604, "num_input_tokens_seen": 54730448, "step": 45115 }, { "epoch": 5.02505846976278, "grad_norm": 0.07799792289733887, "learning_rate": 2.9126645464739693e-05, "loss": 0.4671, "num_input_tokens_seen": 54736272, "step": 45120 }, { "epoch": 5.025615324646397, "grad_norm": 0.0922897607088089, "learning_rate": 2.9121852570819374e-05, "loss": 0.4533, "num_input_tokens_seen": 54742704, "step": 45125 }, { "epoch": 5.026172179530015, "grad_norm": 0.06940267235040665, "learning_rate": 2.9117059521162253e-05, "loss": 0.4595, "num_input_tokens_seen": 54748112, "step": 45130 }, { "epoch": 5.0267290344136315, "grad_norm": 0.11537075787782669, "learning_rate": 2.911226631594943e-05, "loss": 0.4665, "num_input_tokens_seen": 54753424, "step": 45135 }, { "epoch": 5.027285889297249, "grad_norm": 0.15394750237464905, "learning_rate": 2.9107472955362012e-05, "loss": 0.47, "num_input_tokens_seen": 54759440, "step": 45140 }, { "epoch": 5.027842744180867, "grad_norm": 0.11164110898971558, "learning_rate": 2.9102679439581098e-05, "loss": 0.4572, "num_input_tokens_seen": 54765168, "step": 45145 }, { "epoch": 5.028399599064484, "grad_norm": 0.10313393175601959, "learning_rate": 2.90978857687878e-05, "loss": 0.4667, "num_input_tokens_seen": 54771440, "step": 45150 }, { "epoch": 5.028956453948101, "grad_norm": 0.08481844514608383, "learning_rate": 2.9093091943163258e-05, "loss": 0.4538, "num_input_tokens_seen": 54777776, "step": 45155 }, { "epoch": 5.029513308831718, "grad_norm": 0.13522671163082123, "learning_rate": 2.9088297962888578e-05, "loss": 0.4641, "num_input_tokens_seen": 54783856, "step": 45160 }, { "epoch": 5.030070163715336, "grad_norm": 0.09435246139764786, "learning_rate": 2.9083503828144904e-05, "loss": 0.4606, "num_input_tokens_seen": 54790160, "step": 45165 }, { "epoch": 5.0306270185989534, "grad_norm": 0.10382400453090668, "learning_rate": 2.907870953911337e-05, "loss": 0.4715, "num_input_tokens_seen": 54796144, "step": 45170 }, { "epoch": 5.03118387348257, "grad_norm": 0.11645054817199707, "learning_rate": 2.9073915095975112e-05, "loss": 0.464, "num_input_tokens_seen": 54802352, "step": 45175 }, { "epoch": 5.031740728366188, "grad_norm": 0.0930461660027504, "learning_rate": 2.9069120498911283e-05, "loss": 0.4681, "num_input_tokens_seen": 54808400, "step": 45180 }, { "epoch": 5.032297583249805, "grad_norm": 0.08535384386777878, "learning_rate": 2.9064325748103054e-05, "loss": 0.4693, "num_input_tokens_seen": 54814512, "step": 45185 }, { "epoch": 5.032854438133422, "grad_norm": 0.11197831481695175, "learning_rate": 2.905953084373157e-05, "loss": 0.4571, "num_input_tokens_seen": 54819888, "step": 45190 }, { "epoch": 5.03341129301704, "grad_norm": 0.09183672070503235, "learning_rate": 2.9054735785978e-05, "loss": 0.4615, "num_input_tokens_seen": 54826128, "step": 45195 }, { "epoch": 5.033968147900657, "grad_norm": 0.10449408739805222, "learning_rate": 2.9049940575023516e-05, "loss": 0.4687, "num_input_tokens_seen": 54832400, "step": 45200 }, { "epoch": 5.0345250027842745, "grad_norm": 0.13066616654396057, "learning_rate": 2.9045145211049295e-05, "loss": 0.4571, "num_input_tokens_seen": 54838480, "step": 45205 }, { "epoch": 5.035081857667892, "grad_norm": 0.08596739172935486, "learning_rate": 2.904034969423653e-05, "loss": 0.4573, "num_input_tokens_seen": 54844560, "step": 45210 }, { "epoch": 5.035638712551509, "grad_norm": 0.09282802045345306, "learning_rate": 2.9035554024766404e-05, "loss": 0.4615, "num_input_tokens_seen": 54850000, "step": 45215 }, { "epoch": 5.036195567435127, "grad_norm": 0.10148300975561142, "learning_rate": 2.9030758202820113e-05, "loss": 0.4626, "num_input_tokens_seen": 54855952, "step": 45220 }, { "epoch": 5.036752422318743, "grad_norm": 0.10277292132377625, "learning_rate": 2.9025962228578863e-05, "loss": 0.4576, "num_input_tokens_seen": 54862256, "step": 45225 }, { "epoch": 5.037309277202361, "grad_norm": 0.10569506138563156, "learning_rate": 2.9021166102223856e-05, "loss": 0.4704, "num_input_tokens_seen": 54868624, "step": 45230 }, { "epoch": 5.037866132085979, "grad_norm": 0.1113901361823082, "learning_rate": 2.901636982393631e-05, "loss": 0.457, "num_input_tokens_seen": 54874416, "step": 45235 }, { "epoch": 5.038422986969596, "grad_norm": 0.08606485277414322, "learning_rate": 2.901157339389744e-05, "loss": 0.4685, "num_input_tokens_seen": 54880464, "step": 45240 }, { "epoch": 5.038979841853213, "grad_norm": 0.18011219799518585, "learning_rate": 2.9006776812288468e-05, "loss": 0.4642, "num_input_tokens_seen": 54886736, "step": 45245 }, { "epoch": 5.03953669673683, "grad_norm": 0.10905037820339203, "learning_rate": 2.9001980079290632e-05, "loss": 0.4639, "num_input_tokens_seen": 54893008, "step": 45250 }, { "epoch": 5.040093551620448, "grad_norm": 0.09232105314731598, "learning_rate": 2.8997183195085163e-05, "loss": 0.4505, "num_input_tokens_seen": 54898800, "step": 45255 }, { "epoch": 5.040650406504065, "grad_norm": 0.11972219496965408, "learning_rate": 2.8992386159853307e-05, "loss": 0.4613, "num_input_tokens_seen": 54904496, "step": 45260 }, { "epoch": 5.041207261387682, "grad_norm": 0.13466498255729675, "learning_rate": 2.8987588973776304e-05, "loss": 0.4608, "num_input_tokens_seen": 54910768, "step": 45265 }, { "epoch": 5.0417641162713, "grad_norm": 0.08319678157567978, "learning_rate": 2.89827916370354e-05, "loss": 0.464, "num_input_tokens_seen": 54916656, "step": 45270 }, { "epoch": 5.042320971154917, "grad_norm": 0.09089242666959763, "learning_rate": 2.8977994149811878e-05, "loss": 0.4703, "num_input_tokens_seen": 54922576, "step": 45275 }, { "epoch": 5.042877826038534, "grad_norm": 0.09041012823581696, "learning_rate": 2.897319651228698e-05, "loss": 0.4457, "num_input_tokens_seen": 54928496, "step": 45280 }, { "epoch": 5.043434680922152, "grad_norm": 0.1024853065609932, "learning_rate": 2.8968398724642e-05, "loss": 0.4637, "num_input_tokens_seen": 54934832, "step": 45285 }, { "epoch": 5.043991535805769, "grad_norm": 0.10577623546123505, "learning_rate": 2.8963600787058188e-05, "loss": 0.4511, "num_input_tokens_seen": 54940912, "step": 45290 }, { "epoch": 5.044548390689386, "grad_norm": 0.09236374497413635, "learning_rate": 2.8958802699716835e-05, "loss": 0.4609, "num_input_tokens_seen": 54946640, "step": 45295 }, { "epoch": 5.045105245573004, "grad_norm": 0.10322248190641403, "learning_rate": 2.8954004462799228e-05, "loss": 0.4624, "num_input_tokens_seen": 54952784, "step": 45300 }, { "epoch": 5.045662100456621, "grad_norm": 0.14934276044368744, "learning_rate": 2.8949206076486662e-05, "loss": 0.4552, "num_input_tokens_seen": 54959152, "step": 45305 }, { "epoch": 5.046218955340239, "grad_norm": 0.11480703204870224, "learning_rate": 2.8944407540960437e-05, "loss": 0.4688, "num_input_tokens_seen": 54965200, "step": 45310 }, { "epoch": 5.046775810223855, "grad_norm": 0.10767026990652084, "learning_rate": 2.8939608856401855e-05, "loss": 0.4606, "num_input_tokens_seen": 54971184, "step": 45315 }, { "epoch": 5.047332665107473, "grad_norm": 0.11934087425470352, "learning_rate": 2.8934810022992232e-05, "loss": 0.4495, "num_input_tokens_seen": 54977424, "step": 45320 }, { "epoch": 5.047889519991091, "grad_norm": 0.10984103381633759, "learning_rate": 2.893001104091287e-05, "loss": 0.4678, "num_input_tokens_seen": 54983600, "step": 45325 }, { "epoch": 5.0484463748747075, "grad_norm": 0.08902686089277267, "learning_rate": 2.8925211910345095e-05, "loss": 0.4716, "num_input_tokens_seen": 54989872, "step": 45330 }, { "epoch": 5.049003229758325, "grad_norm": 0.1012110486626625, "learning_rate": 2.892041263147023e-05, "loss": 0.4539, "num_input_tokens_seen": 54995344, "step": 45335 }, { "epoch": 5.049560084641942, "grad_norm": 0.11519455164670944, "learning_rate": 2.891561320446962e-05, "loss": 0.4672, "num_input_tokens_seen": 55001456, "step": 45340 }, { "epoch": 5.05011693952556, "grad_norm": 0.0951894074678421, "learning_rate": 2.8910813629524597e-05, "loss": 0.4663, "num_input_tokens_seen": 55007600, "step": 45345 }, { "epoch": 5.050673794409177, "grad_norm": 0.10301675647497177, "learning_rate": 2.89060139068165e-05, "loss": 0.4595, "num_input_tokens_seen": 55014192, "step": 45350 }, { "epoch": 5.051230649292794, "grad_norm": 0.09106951206922531, "learning_rate": 2.8901214036526675e-05, "loss": 0.4699, "num_input_tokens_seen": 55019600, "step": 45355 }, { "epoch": 5.051787504176412, "grad_norm": 0.11599158495664597, "learning_rate": 2.8896414018836486e-05, "loss": 0.4544, "num_input_tokens_seen": 55025616, "step": 45360 }, { "epoch": 5.0523443590600285, "grad_norm": 0.11351144313812256, "learning_rate": 2.8891613853927286e-05, "loss": 0.4528, "num_input_tokens_seen": 55031792, "step": 45365 }, { "epoch": 5.052901213943646, "grad_norm": 0.09798599034547806, "learning_rate": 2.8886813541980447e-05, "loss": 0.4517, "num_input_tokens_seen": 55037904, "step": 45370 }, { "epoch": 5.053458068827264, "grad_norm": 0.1233380064368248, "learning_rate": 2.8882013083177336e-05, "loss": 0.4673, "num_input_tokens_seen": 55044048, "step": 45375 }, { "epoch": 5.054014923710881, "grad_norm": 0.09521432220935822, "learning_rate": 2.887721247769933e-05, "loss": 0.4749, "num_input_tokens_seen": 55049328, "step": 45380 }, { "epoch": 5.054571778594498, "grad_norm": 0.06729099899530411, "learning_rate": 2.887241172572781e-05, "loss": 0.4608, "num_input_tokens_seen": 55055440, "step": 45385 }, { "epoch": 5.055128633478116, "grad_norm": 0.08252246677875519, "learning_rate": 2.886761082744417e-05, "loss": 0.4624, "num_input_tokens_seen": 55061776, "step": 45390 }, { "epoch": 5.055685488361733, "grad_norm": 0.07982206344604492, "learning_rate": 2.886280978302979e-05, "loss": 0.4607, "num_input_tokens_seen": 55067952, "step": 45395 }, { "epoch": 5.0562423432453505, "grad_norm": 0.12245850265026093, "learning_rate": 2.885800859266608e-05, "loss": 0.4527, "num_input_tokens_seen": 55073936, "step": 45400 }, { "epoch": 5.056799198128967, "grad_norm": 0.08386722952127457, "learning_rate": 2.8853207256534444e-05, "loss": 0.461, "num_input_tokens_seen": 55079760, "step": 45405 }, { "epoch": 5.057356053012585, "grad_norm": 0.076194167137146, "learning_rate": 2.884840577481629e-05, "loss": 0.4675, "num_input_tokens_seen": 55085968, "step": 45410 }, { "epoch": 5.057912907896203, "grad_norm": 0.08197898417711258, "learning_rate": 2.8843604147693032e-05, "loss": 0.4643, "num_input_tokens_seen": 55092048, "step": 45415 }, { "epoch": 5.058469762779819, "grad_norm": 0.08753971010446548, "learning_rate": 2.883880237534609e-05, "loss": 0.4589, "num_input_tokens_seen": 55098192, "step": 45420 }, { "epoch": 5.059026617663437, "grad_norm": 0.09888003021478653, "learning_rate": 2.88340004579569e-05, "loss": 0.4638, "num_input_tokens_seen": 55104624, "step": 45425 }, { "epoch": 5.059583472547054, "grad_norm": 0.09568843245506287, "learning_rate": 2.8829198395706874e-05, "loss": 0.4544, "num_input_tokens_seen": 55110384, "step": 45430 }, { "epoch": 5.0601403274306715, "grad_norm": 0.08395966142416, "learning_rate": 2.882439618877747e-05, "loss": 0.4792, "num_input_tokens_seen": 55116464, "step": 45435 }, { "epoch": 5.060697182314289, "grad_norm": 0.08324603736400604, "learning_rate": 2.8819593837350116e-05, "loss": 0.4572, "num_input_tokens_seen": 55122832, "step": 45440 }, { "epoch": 5.061254037197906, "grad_norm": 0.09603562951087952, "learning_rate": 2.8814791341606273e-05, "loss": 0.4651, "num_input_tokens_seen": 55128752, "step": 45445 }, { "epoch": 5.061810892081524, "grad_norm": 0.12748730182647705, "learning_rate": 2.8809988701727387e-05, "loss": 0.4682, "num_input_tokens_seen": 55134704, "step": 45450 }, { "epoch": 5.0623677469651405, "grad_norm": 0.0912899449467659, "learning_rate": 2.8805185917894916e-05, "loss": 0.456, "num_input_tokens_seen": 55141008, "step": 45455 }, { "epoch": 5.062924601848758, "grad_norm": 0.0966728925704956, "learning_rate": 2.8800382990290326e-05, "loss": 0.4647, "num_input_tokens_seen": 55147120, "step": 45460 }, { "epoch": 5.063481456732376, "grad_norm": 0.11575954407453537, "learning_rate": 2.8795579919095094e-05, "loss": 0.4604, "num_input_tokens_seen": 55153008, "step": 45465 }, { "epoch": 5.064038311615993, "grad_norm": 0.1146862655878067, "learning_rate": 2.8790776704490685e-05, "loss": 0.4642, "num_input_tokens_seen": 55159184, "step": 45470 }, { "epoch": 5.06459516649961, "grad_norm": 0.053152184933423996, "learning_rate": 2.878597334665858e-05, "loss": 0.4566, "num_input_tokens_seen": 55165392, "step": 45475 }, { "epoch": 5.065152021383228, "grad_norm": 0.16046085953712463, "learning_rate": 2.8781169845780277e-05, "loss": 0.4648, "num_input_tokens_seen": 55171280, "step": 45480 }, { "epoch": 5.065708876266845, "grad_norm": 0.08475970476865768, "learning_rate": 2.877636620203725e-05, "loss": 0.4581, "num_input_tokens_seen": 55176752, "step": 45485 }, { "epoch": 5.066265731150462, "grad_norm": 0.10115983337163925, "learning_rate": 2.877156241561101e-05, "loss": 0.4625, "num_input_tokens_seen": 55182832, "step": 45490 }, { "epoch": 5.066822586034079, "grad_norm": 0.08476509898900986, "learning_rate": 2.8766758486683053e-05, "loss": 0.4557, "num_input_tokens_seen": 55189104, "step": 45495 }, { "epoch": 5.067379440917697, "grad_norm": 0.07684454321861267, "learning_rate": 2.87619544154349e-05, "loss": 0.4676, "num_input_tokens_seen": 55195216, "step": 45500 }, { "epoch": 5.0679362958013146, "grad_norm": 0.1235705092549324, "learning_rate": 2.8757150202048044e-05, "loss": 0.4565, "num_input_tokens_seen": 55201424, "step": 45505 }, { "epoch": 5.068493150684931, "grad_norm": 0.11095134913921356, "learning_rate": 2.8752345846704016e-05, "loss": 0.4518, "num_input_tokens_seen": 55207376, "step": 45510 }, { "epoch": 5.069050005568549, "grad_norm": 0.10423315316438675, "learning_rate": 2.8747541349584335e-05, "loss": 0.4554, "num_input_tokens_seen": 55213840, "step": 45515 }, { "epoch": 5.069606860452166, "grad_norm": 0.06830035895109177, "learning_rate": 2.8742736710870528e-05, "loss": 0.4624, "num_input_tokens_seen": 55220080, "step": 45520 }, { "epoch": 5.0701637153357835, "grad_norm": 0.1021336242556572, "learning_rate": 2.873793193074414e-05, "loss": 0.459, "num_input_tokens_seen": 55226320, "step": 45525 }, { "epoch": 5.070720570219401, "grad_norm": 0.10008018463850021, "learning_rate": 2.8733127009386706e-05, "loss": 0.4603, "num_input_tokens_seen": 55232304, "step": 45530 }, { "epoch": 5.071277425103018, "grad_norm": 0.09249847382307053, "learning_rate": 2.872832194697977e-05, "loss": 0.4582, "num_input_tokens_seen": 55238640, "step": 45535 }, { "epoch": 5.071834279986636, "grad_norm": 0.100133515894413, "learning_rate": 2.8723516743704883e-05, "loss": 0.46, "num_input_tokens_seen": 55244240, "step": 45540 }, { "epoch": 5.072391134870252, "grad_norm": 0.13263772428035736, "learning_rate": 2.8718711399743596e-05, "loss": 0.4552, "num_input_tokens_seen": 55250320, "step": 45545 }, { "epoch": 5.07294798975387, "grad_norm": 0.09576313197612762, "learning_rate": 2.8713905915277474e-05, "loss": 0.4622, "num_input_tokens_seen": 55256560, "step": 45550 }, { "epoch": 5.073504844637488, "grad_norm": 0.10752295702695847, "learning_rate": 2.870910029048809e-05, "loss": 0.4601, "num_input_tokens_seen": 55262960, "step": 45555 }, { "epoch": 5.0740616995211045, "grad_norm": 0.08287600427865982, "learning_rate": 2.8704294525557007e-05, "loss": 0.4614, "num_input_tokens_seen": 55268976, "step": 45560 }, { "epoch": 5.074618554404722, "grad_norm": 0.09348449856042862, "learning_rate": 2.869948862066581e-05, "loss": 0.4625, "num_input_tokens_seen": 55275152, "step": 45565 }, { "epoch": 5.07517540928834, "grad_norm": 0.09744232892990112, "learning_rate": 2.8694682575996074e-05, "loss": 0.4658, "num_input_tokens_seen": 55281680, "step": 45570 }, { "epoch": 5.075732264171957, "grad_norm": 0.10358142107725143, "learning_rate": 2.8689876391729393e-05, "loss": 0.461, "num_input_tokens_seen": 55287856, "step": 45575 }, { "epoch": 5.076289119055574, "grad_norm": 0.11014452576637268, "learning_rate": 2.868507006804736e-05, "loss": 0.4537, "num_input_tokens_seen": 55293936, "step": 45580 }, { "epoch": 5.076845973939191, "grad_norm": 0.10100051760673523, "learning_rate": 2.8680263605131558e-05, "loss": 0.4462, "num_input_tokens_seen": 55300560, "step": 45585 }, { "epoch": 5.077402828822809, "grad_norm": 0.11424586176872253, "learning_rate": 2.867545700316361e-05, "loss": 0.4724, "num_input_tokens_seen": 55306800, "step": 45590 }, { "epoch": 5.0779596837064265, "grad_norm": 0.0993146151304245, "learning_rate": 2.867065026232512e-05, "loss": 0.4517, "num_input_tokens_seen": 55313392, "step": 45595 }, { "epoch": 5.078516538590043, "grad_norm": 0.10258863121271133, "learning_rate": 2.8665843382797703e-05, "loss": 0.4577, "num_input_tokens_seen": 55319760, "step": 45600 }, { "epoch": 5.079073393473661, "grad_norm": 0.10746607184410095, "learning_rate": 2.8661036364762966e-05, "loss": 0.4768, "num_input_tokens_seen": 55326064, "step": 45605 }, { "epoch": 5.079630248357278, "grad_norm": 0.10818609595298767, "learning_rate": 2.865622920840255e-05, "loss": 0.4684, "num_input_tokens_seen": 55332496, "step": 45610 }, { "epoch": 5.080187103240895, "grad_norm": 0.07874627411365509, "learning_rate": 2.865142191389807e-05, "loss": 0.4565, "num_input_tokens_seen": 55338480, "step": 45615 }, { "epoch": 5.080743958124513, "grad_norm": 0.08670532703399658, "learning_rate": 2.864661448143118e-05, "loss": 0.4655, "num_input_tokens_seen": 55344400, "step": 45620 }, { "epoch": 5.08130081300813, "grad_norm": 0.12045591324567795, "learning_rate": 2.86418069111835e-05, "loss": 0.4524, "num_input_tokens_seen": 55350576, "step": 45625 }, { "epoch": 5.0818576678917475, "grad_norm": 0.09526046365499496, "learning_rate": 2.863699920333669e-05, "loss": 0.4544, "num_input_tokens_seen": 55356816, "step": 45630 }, { "epoch": 5.082414522775364, "grad_norm": 0.11260950565338135, "learning_rate": 2.863219135807239e-05, "loss": 0.4662, "num_input_tokens_seen": 55362640, "step": 45635 }, { "epoch": 5.082971377658982, "grad_norm": 0.09641916304826736, "learning_rate": 2.8627383375572257e-05, "loss": 0.463, "num_input_tokens_seen": 55368880, "step": 45640 }, { "epoch": 5.0835282325426, "grad_norm": 0.09463126957416534, "learning_rate": 2.8622575256017962e-05, "loss": 0.4703, "num_input_tokens_seen": 55374800, "step": 45645 }, { "epoch": 5.0840850874262165, "grad_norm": 0.10030842572450638, "learning_rate": 2.861776699959116e-05, "loss": 0.4709, "num_input_tokens_seen": 55380368, "step": 45650 }, { "epoch": 5.084641942309834, "grad_norm": 0.08461116999387741, "learning_rate": 2.8612958606473534e-05, "loss": 0.4641, "num_input_tokens_seen": 55386448, "step": 45655 }, { "epoch": 5.085198797193452, "grad_norm": 0.08875446766614914, "learning_rate": 2.860815007684675e-05, "loss": 0.4555, "num_input_tokens_seen": 55392464, "step": 45660 }, { "epoch": 5.085755652077069, "grad_norm": 0.09410343319177628, "learning_rate": 2.860334141089249e-05, "loss": 0.4601, "num_input_tokens_seen": 55398736, "step": 45665 }, { "epoch": 5.086312506960686, "grad_norm": 0.11580268293619156, "learning_rate": 2.8598532608792454e-05, "loss": 0.4666, "num_input_tokens_seen": 55404944, "step": 45670 }, { "epoch": 5.086869361844303, "grad_norm": 0.10810580104589462, "learning_rate": 2.8593723670728318e-05, "loss": 0.4643, "num_input_tokens_seen": 55411088, "step": 45675 }, { "epoch": 5.087426216727921, "grad_norm": 0.10745702683925629, "learning_rate": 2.858891459688178e-05, "loss": 0.4777, "num_input_tokens_seen": 55416752, "step": 45680 }, { "epoch": 5.087983071611538, "grad_norm": 0.11276644468307495, "learning_rate": 2.8584105387434557e-05, "loss": 0.4658, "num_input_tokens_seen": 55422992, "step": 45685 }, { "epoch": 5.088539926495155, "grad_norm": 0.16743837296962738, "learning_rate": 2.8579296042568337e-05, "loss": 0.4631, "num_input_tokens_seen": 55429328, "step": 45690 }, { "epoch": 5.089096781378773, "grad_norm": 0.0752783864736557, "learning_rate": 2.8574486562464847e-05, "loss": 0.4572, "num_input_tokens_seen": 55434832, "step": 45695 }, { "epoch": 5.08965363626239, "grad_norm": 0.08430886268615723, "learning_rate": 2.85696769473058e-05, "loss": 0.4604, "num_input_tokens_seen": 55440688, "step": 45700 }, { "epoch": 5.090210491146007, "grad_norm": 0.0861252173781395, "learning_rate": 2.8564867197272914e-05, "loss": 0.4567, "num_input_tokens_seen": 55446512, "step": 45705 }, { "epoch": 5.090767346029625, "grad_norm": 0.08265630900859833, "learning_rate": 2.8560057312547928e-05, "loss": 0.4569, "num_input_tokens_seen": 55452624, "step": 45710 }, { "epoch": 5.091324200913242, "grad_norm": 0.09678597003221512, "learning_rate": 2.855524729331256e-05, "loss": 0.4612, "num_input_tokens_seen": 55458384, "step": 45715 }, { "epoch": 5.0918810557968595, "grad_norm": 0.09151994436979294, "learning_rate": 2.8550437139748566e-05, "loss": 0.4694, "num_input_tokens_seen": 55464432, "step": 45720 }, { "epoch": 5.092437910680476, "grad_norm": 0.10071099549531937, "learning_rate": 2.8545626852037673e-05, "loss": 0.4717, "num_input_tokens_seen": 55470352, "step": 45725 }, { "epoch": 5.092994765564094, "grad_norm": 0.13385716080665588, "learning_rate": 2.8540816430361635e-05, "loss": 0.4552, "num_input_tokens_seen": 55476656, "step": 45730 }, { "epoch": 5.093551620447712, "grad_norm": 0.09917890280485153, "learning_rate": 2.8536005874902204e-05, "loss": 0.456, "num_input_tokens_seen": 55482896, "step": 45735 }, { "epoch": 5.094108475331328, "grad_norm": 0.10972701013088226, "learning_rate": 2.8531195185841138e-05, "loss": 0.476, "num_input_tokens_seen": 55488912, "step": 45740 }, { "epoch": 5.094665330214946, "grad_norm": 0.08450642973184586, "learning_rate": 2.8526384363360202e-05, "loss": 0.4707, "num_input_tokens_seen": 55494960, "step": 45745 }, { "epoch": 5.095222185098564, "grad_norm": 0.11129572987556458, "learning_rate": 2.8521573407641166e-05, "loss": 0.4612, "num_input_tokens_seen": 55501232, "step": 45750 }, { "epoch": 5.0957790399821805, "grad_norm": 0.09553862363100052, "learning_rate": 2.8516762318865804e-05, "loss": 0.4649, "num_input_tokens_seen": 55507312, "step": 45755 }, { "epoch": 5.096335894865798, "grad_norm": 0.11434872448444366, "learning_rate": 2.851195109721589e-05, "loss": 0.4666, "num_input_tokens_seen": 55513584, "step": 45760 }, { "epoch": 5.096892749749415, "grad_norm": 0.10034845769405365, "learning_rate": 2.8507139742873208e-05, "loss": 0.4647, "num_input_tokens_seen": 55519888, "step": 45765 }, { "epoch": 5.097449604633033, "grad_norm": 0.11641005426645279, "learning_rate": 2.8502328256019543e-05, "loss": 0.4631, "num_input_tokens_seen": 55525904, "step": 45770 }, { "epoch": 5.09800645951665, "grad_norm": 0.08334637433290482, "learning_rate": 2.8497516636836697e-05, "loss": 0.4579, "num_input_tokens_seen": 55531216, "step": 45775 }, { "epoch": 5.098563314400267, "grad_norm": 0.09583365172147751, "learning_rate": 2.8492704885506465e-05, "loss": 0.458, "num_input_tokens_seen": 55537264, "step": 45780 }, { "epoch": 5.099120169283885, "grad_norm": 0.09887047111988068, "learning_rate": 2.8487893002210654e-05, "loss": 0.4512, "num_input_tokens_seen": 55543152, "step": 45785 }, { "epoch": 5.099677024167502, "grad_norm": 0.11823001503944397, "learning_rate": 2.8483080987131062e-05, "loss": 0.4579, "num_input_tokens_seen": 55549456, "step": 45790 }, { "epoch": 5.100233879051119, "grad_norm": 0.19025173783302307, "learning_rate": 2.8478268840449506e-05, "loss": 0.4858, "num_input_tokens_seen": 55555504, "step": 45795 }, { "epoch": 5.100790733934737, "grad_norm": 0.12823174893856049, "learning_rate": 2.8473456562347807e-05, "loss": 0.4666, "num_input_tokens_seen": 55561008, "step": 45800 }, { "epoch": 5.101347588818354, "grad_norm": 0.09190602600574493, "learning_rate": 2.846864415300779e-05, "loss": 0.4616, "num_input_tokens_seen": 55567376, "step": 45805 }, { "epoch": 5.101904443701971, "grad_norm": 0.0800008475780487, "learning_rate": 2.846383161261128e-05, "loss": 0.4458, "num_input_tokens_seen": 55572944, "step": 45810 }, { "epoch": 5.102461298585588, "grad_norm": 0.10803430527448654, "learning_rate": 2.8459018941340114e-05, "loss": 0.4629, "num_input_tokens_seen": 55578992, "step": 45815 }, { "epoch": 5.103018153469206, "grad_norm": 0.13334882259368896, "learning_rate": 2.8454206139376126e-05, "loss": 0.4645, "num_input_tokens_seen": 55585040, "step": 45820 }, { "epoch": 5.1035750083528235, "grad_norm": 0.12166598439216614, "learning_rate": 2.844939320690116e-05, "loss": 0.4748, "num_input_tokens_seen": 55591184, "step": 45825 }, { "epoch": 5.10413186323644, "grad_norm": 0.10059182345867157, "learning_rate": 2.8444580144097066e-05, "loss": 0.4584, "num_input_tokens_seen": 55596976, "step": 45830 }, { "epoch": 5.104688718120058, "grad_norm": 0.11390654742717743, "learning_rate": 2.8439766951145692e-05, "loss": 0.4703, "num_input_tokens_seen": 55602448, "step": 45835 }, { "epoch": 5.105245573003676, "grad_norm": 0.08492012321949005, "learning_rate": 2.84349536282289e-05, "loss": 0.4627, "num_input_tokens_seen": 55608560, "step": 45840 }, { "epoch": 5.105802427887292, "grad_norm": 0.08587811887264252, "learning_rate": 2.843014017552855e-05, "loss": 0.4619, "num_input_tokens_seen": 55614832, "step": 45845 }, { "epoch": 5.10635928277091, "grad_norm": 0.09983351826667786, "learning_rate": 2.842532659322652e-05, "loss": 0.4586, "num_input_tokens_seen": 55620720, "step": 45850 }, { "epoch": 5.106916137654527, "grad_norm": 0.1161903440952301, "learning_rate": 2.8420512881504667e-05, "loss": 0.4578, "num_input_tokens_seen": 55626224, "step": 45855 }, { "epoch": 5.107472992538145, "grad_norm": 0.09914300590753555, "learning_rate": 2.8415699040544886e-05, "loss": 0.4598, "num_input_tokens_seen": 55632528, "step": 45860 }, { "epoch": 5.108029847421762, "grad_norm": 0.1473984569311142, "learning_rate": 2.8410885070529037e-05, "loss": 0.4612, "num_input_tokens_seen": 55638608, "step": 45865 }, { "epoch": 5.108586702305379, "grad_norm": 0.11021452397108078, "learning_rate": 2.840607097163902e-05, "loss": 0.4661, "num_input_tokens_seen": 55644912, "step": 45870 }, { "epoch": 5.109143557188997, "grad_norm": 0.09734540432691574, "learning_rate": 2.8401256744056738e-05, "loss": 0.463, "num_input_tokens_seen": 55651024, "step": 45875 }, { "epoch": 5.1097004120726135, "grad_norm": 0.095604807138443, "learning_rate": 2.8396442387964075e-05, "loss": 0.4598, "num_input_tokens_seen": 55657296, "step": 45880 }, { "epoch": 5.110257266956231, "grad_norm": 0.13336871564388275, "learning_rate": 2.839162790354292e-05, "loss": 0.471, "num_input_tokens_seen": 55663408, "step": 45885 }, { "epoch": 5.110814121839849, "grad_norm": 0.10719335824251175, "learning_rate": 2.83868132909752e-05, "loss": 0.4599, "num_input_tokens_seen": 55669488, "step": 45890 }, { "epoch": 5.111370976723466, "grad_norm": 0.07935703545808792, "learning_rate": 2.8381998550442824e-05, "loss": 0.4601, "num_input_tokens_seen": 55674960, "step": 45895 }, { "epoch": 5.111927831607083, "grad_norm": 0.07836730033159256, "learning_rate": 2.83771836821277e-05, "loss": 0.4506, "num_input_tokens_seen": 55681040, "step": 45900 }, { "epoch": 5.1124846864907, "grad_norm": 0.10495594143867493, "learning_rate": 2.8372368686211757e-05, "loss": 0.4583, "num_input_tokens_seen": 55686512, "step": 45905 }, { "epoch": 5.113041541374318, "grad_norm": 0.0992170125246048, "learning_rate": 2.8367553562876918e-05, "loss": 0.4713, "num_input_tokens_seen": 55692080, "step": 45910 }, { "epoch": 5.113598396257935, "grad_norm": 0.14456124603748322, "learning_rate": 2.8362738312305116e-05, "loss": 0.4607, "num_input_tokens_seen": 55698064, "step": 45915 }, { "epoch": 5.114155251141552, "grad_norm": 0.11389249563217163, "learning_rate": 2.8357922934678284e-05, "loss": 0.4604, "num_input_tokens_seen": 55704112, "step": 45920 }, { "epoch": 5.11471210602517, "grad_norm": 0.08491435647010803, "learning_rate": 2.835310743017835e-05, "loss": 0.4604, "num_input_tokens_seen": 55710032, "step": 45925 }, { "epoch": 5.115268960908788, "grad_norm": 0.09842230379581451, "learning_rate": 2.834829179898728e-05, "loss": 0.4666, "num_input_tokens_seen": 55716240, "step": 45930 }, { "epoch": 5.115825815792404, "grad_norm": 0.14077474176883698, "learning_rate": 2.834347604128702e-05, "loss": 0.463, "num_input_tokens_seen": 55722448, "step": 45935 }, { "epoch": 5.116382670676022, "grad_norm": 0.08589047938585281, "learning_rate": 2.8338660157259518e-05, "loss": 0.4664, "num_input_tokens_seen": 55728592, "step": 45940 }, { "epoch": 5.116939525559639, "grad_norm": 0.10184016823768616, "learning_rate": 2.8333844147086736e-05, "loss": 0.4655, "num_input_tokens_seen": 55734832, "step": 45945 }, { "epoch": 5.1174963804432565, "grad_norm": 0.1075231283903122, "learning_rate": 2.8329028010950638e-05, "loss": 0.4608, "num_input_tokens_seen": 55740080, "step": 45950 }, { "epoch": 5.118053235326874, "grad_norm": 0.08191222697496414, "learning_rate": 2.8324211749033193e-05, "loss": 0.4589, "num_input_tokens_seen": 55745616, "step": 45955 }, { "epoch": 5.118610090210491, "grad_norm": 0.07599031180143356, "learning_rate": 2.831939536151637e-05, "loss": 0.4624, "num_input_tokens_seen": 55751600, "step": 45960 }, { "epoch": 5.119166945094109, "grad_norm": 0.07331487536430359, "learning_rate": 2.831457884858216e-05, "loss": 0.4547, "num_input_tokens_seen": 55757808, "step": 45965 }, { "epoch": 5.119723799977725, "grad_norm": 0.10033686459064484, "learning_rate": 2.830976221041254e-05, "loss": 0.4527, "num_input_tokens_seen": 55763952, "step": 45970 }, { "epoch": 5.120280654861343, "grad_norm": 0.13984885811805725, "learning_rate": 2.830494544718949e-05, "loss": 0.4576, "num_input_tokens_seen": 55769808, "step": 45975 }, { "epoch": 5.120837509744961, "grad_norm": 0.10978998243808746, "learning_rate": 2.8300128559095017e-05, "loss": 0.4758, "num_input_tokens_seen": 55775664, "step": 45980 }, { "epoch": 5.1213943646285776, "grad_norm": 0.13721489906311035, "learning_rate": 2.829531154631111e-05, "loss": 0.4492, "num_input_tokens_seen": 55782160, "step": 45985 }, { "epoch": 5.121951219512195, "grad_norm": 0.16333454847335815, "learning_rate": 2.8290494409019762e-05, "loss": 0.471, "num_input_tokens_seen": 55788048, "step": 45990 }, { "epoch": 5.122508074395812, "grad_norm": 0.11548001319169998, "learning_rate": 2.8285677147403e-05, "loss": 0.4676, "num_input_tokens_seen": 55794416, "step": 45995 }, { "epoch": 5.12306492927943, "grad_norm": 0.11038834601640701, "learning_rate": 2.8280859761642824e-05, "loss": 0.4615, "num_input_tokens_seen": 55800368, "step": 46000 }, { "epoch": 5.123621784163047, "grad_norm": 0.09939894080162048, "learning_rate": 2.8276042251921253e-05, "loss": 0.4646, "num_input_tokens_seen": 55806704, "step": 46005 }, { "epoch": 5.124178639046664, "grad_norm": 0.1468995064496994, "learning_rate": 2.827122461842031e-05, "loss": 0.46, "num_input_tokens_seen": 55812144, "step": 46010 }, { "epoch": 5.124735493930282, "grad_norm": 0.10055471211671829, "learning_rate": 2.8266406861322014e-05, "loss": 0.4664, "num_input_tokens_seen": 55818576, "step": 46015 }, { "epoch": 5.1252923488138995, "grad_norm": 0.10511557757854462, "learning_rate": 2.8261588980808396e-05, "loss": 0.4642, "num_input_tokens_seen": 55824304, "step": 46020 }, { "epoch": 5.125849203697516, "grad_norm": 0.12384594976902008, "learning_rate": 2.8256770977061498e-05, "loss": 0.4634, "num_input_tokens_seen": 55830448, "step": 46025 }, { "epoch": 5.126406058581134, "grad_norm": 0.12127963453531265, "learning_rate": 2.8251952850263353e-05, "loss": 0.4703, "num_input_tokens_seen": 55836528, "step": 46030 }, { "epoch": 5.126962913464751, "grad_norm": 0.1372217833995819, "learning_rate": 2.8247134600596015e-05, "loss": 0.47, "num_input_tokens_seen": 55842576, "step": 46035 }, { "epoch": 5.127519768348368, "grad_norm": 0.10508977621793747, "learning_rate": 2.824231622824152e-05, "loss": 0.4642, "num_input_tokens_seen": 55848560, "step": 46040 }, { "epoch": 5.128076623231986, "grad_norm": 0.10604593902826309, "learning_rate": 2.823749773338193e-05, "loss": 0.4576, "num_input_tokens_seen": 55854864, "step": 46045 }, { "epoch": 5.128633478115603, "grad_norm": 0.08743145316839218, "learning_rate": 2.8232679116199294e-05, "loss": 0.4693, "num_input_tokens_seen": 55860848, "step": 46050 }, { "epoch": 5.129190332999221, "grad_norm": 0.08858108520507812, "learning_rate": 2.8227860376875687e-05, "loss": 0.4653, "num_input_tokens_seen": 55866768, "step": 46055 }, { "epoch": 5.129747187882837, "grad_norm": 0.08758552372455597, "learning_rate": 2.822304151559317e-05, "loss": 0.468, "num_input_tokens_seen": 55872816, "step": 46060 }, { "epoch": 5.130304042766455, "grad_norm": 0.16884399950504303, "learning_rate": 2.8218222532533816e-05, "loss": 0.4564, "num_input_tokens_seen": 55878960, "step": 46065 }, { "epoch": 5.130860897650073, "grad_norm": 0.1317296326160431, "learning_rate": 2.8213403427879704e-05, "loss": 0.4561, "num_input_tokens_seen": 55884880, "step": 46070 }, { "epoch": 5.1314177525336895, "grad_norm": 0.09647773951292038, "learning_rate": 2.820858420181291e-05, "loss": 0.4574, "num_input_tokens_seen": 55891344, "step": 46075 }, { "epoch": 5.131974607417307, "grad_norm": 0.09222885966300964, "learning_rate": 2.8203764854515525e-05, "loss": 0.4612, "num_input_tokens_seen": 55897488, "step": 46080 }, { "epoch": 5.132531462300925, "grad_norm": 0.11241281032562256, "learning_rate": 2.8198945386169634e-05, "loss": 0.4623, "num_input_tokens_seen": 55903504, "step": 46085 }, { "epoch": 5.133088317184542, "grad_norm": 0.11752957850694656, "learning_rate": 2.8194125796957332e-05, "loss": 0.4644, "num_input_tokens_seen": 55909648, "step": 46090 }, { "epoch": 5.133645172068159, "grad_norm": 0.12330619990825653, "learning_rate": 2.818930608706073e-05, "loss": 0.4588, "num_input_tokens_seen": 55915088, "step": 46095 }, { "epoch": 5.134202026951776, "grad_norm": 0.09138326346874237, "learning_rate": 2.8184486256661912e-05, "loss": 0.4511, "num_input_tokens_seen": 55921040, "step": 46100 }, { "epoch": 5.134758881835394, "grad_norm": 0.07900073379278183, "learning_rate": 2.817966630594301e-05, "loss": 0.4784, "num_input_tokens_seen": 55926576, "step": 46105 }, { "epoch": 5.135315736719011, "grad_norm": 0.09643127024173737, "learning_rate": 2.817484623508611e-05, "loss": 0.4639, "num_input_tokens_seen": 55932624, "step": 46110 }, { "epoch": 5.135872591602628, "grad_norm": 0.1220175251364708, "learning_rate": 2.8170026044273356e-05, "loss": 0.4536, "num_input_tokens_seen": 55938896, "step": 46115 }, { "epoch": 5.136429446486246, "grad_norm": 0.09134805202484131, "learning_rate": 2.816520573368686e-05, "loss": 0.4769, "num_input_tokens_seen": 55944880, "step": 46120 }, { "epoch": 5.136986301369863, "grad_norm": 0.10389076173305511, "learning_rate": 2.8160385303508745e-05, "loss": 0.4622, "num_input_tokens_seen": 55951184, "step": 46125 }, { "epoch": 5.13754315625348, "grad_norm": 0.10063163936138153, "learning_rate": 2.815556475392115e-05, "loss": 0.4694, "num_input_tokens_seen": 55957488, "step": 46130 }, { "epoch": 5.138100011137098, "grad_norm": 0.10478616505861282, "learning_rate": 2.8150744085106196e-05, "loss": 0.4528, "num_input_tokens_seen": 55963440, "step": 46135 }, { "epoch": 5.138656866020715, "grad_norm": 0.09282069653272629, "learning_rate": 2.8145923297246034e-05, "loss": 0.4555, "num_input_tokens_seen": 55969744, "step": 46140 }, { "epoch": 5.1392137209043325, "grad_norm": 0.08916140347719193, "learning_rate": 2.8141102390522815e-05, "loss": 0.4671, "num_input_tokens_seen": 55975344, "step": 46145 }, { "epoch": 5.139770575787949, "grad_norm": 0.14110994338989258, "learning_rate": 2.8136281365118683e-05, "loss": 0.4598, "num_input_tokens_seen": 55981168, "step": 46150 }, { "epoch": 5.140327430671567, "grad_norm": 0.13297612965106964, "learning_rate": 2.8131460221215787e-05, "loss": 0.4734, "num_input_tokens_seen": 55987440, "step": 46155 }, { "epoch": 5.140884285555185, "grad_norm": 0.10769666731357574, "learning_rate": 2.8126638958996286e-05, "loss": 0.4687, "num_input_tokens_seen": 55993552, "step": 46160 }, { "epoch": 5.141441140438801, "grad_norm": 0.06702783703804016, "learning_rate": 2.812181757864235e-05, "loss": 0.4596, "num_input_tokens_seen": 55999504, "step": 46165 }, { "epoch": 5.141997995322419, "grad_norm": 0.09732411801815033, "learning_rate": 2.811699608033614e-05, "loss": 0.4692, "num_input_tokens_seen": 56005200, "step": 46170 }, { "epoch": 5.142554850206036, "grad_norm": 0.14101846516132355, "learning_rate": 2.8112174464259823e-05, "loss": 0.4564, "num_input_tokens_seen": 56011184, "step": 46175 }, { "epoch": 5.1431117050896535, "grad_norm": 0.09226682782173157, "learning_rate": 2.8107352730595587e-05, "loss": 0.4704, "num_input_tokens_seen": 56017392, "step": 46180 }, { "epoch": 5.143668559973271, "grad_norm": 0.06844690442085266, "learning_rate": 2.8102530879525608e-05, "loss": 0.4694, "num_input_tokens_seen": 56022928, "step": 46185 }, { "epoch": 5.144225414856888, "grad_norm": 0.08953530341386795, "learning_rate": 2.8097708911232067e-05, "loss": 0.453, "num_input_tokens_seen": 56029104, "step": 46190 }, { "epoch": 5.144782269740506, "grad_norm": 0.0808403417468071, "learning_rate": 2.8092886825897163e-05, "loss": 0.4636, "num_input_tokens_seen": 56035568, "step": 46195 }, { "epoch": 5.145339124624123, "grad_norm": 0.11926788091659546, "learning_rate": 2.8088064623703076e-05, "loss": 0.4594, "num_input_tokens_seen": 56041872, "step": 46200 }, { "epoch": 5.14589597950774, "grad_norm": 0.1462344527244568, "learning_rate": 2.8083242304832007e-05, "loss": 0.459, "num_input_tokens_seen": 56047952, "step": 46205 }, { "epoch": 5.146452834391358, "grad_norm": 0.07327266782522202, "learning_rate": 2.8078419869466172e-05, "loss": 0.461, "num_input_tokens_seen": 56054096, "step": 46210 }, { "epoch": 5.147009689274975, "grad_norm": 0.10980772227048874, "learning_rate": 2.807359731778777e-05, "loss": 0.4633, "num_input_tokens_seen": 56060304, "step": 46215 }, { "epoch": 5.147566544158592, "grad_norm": 0.08530214428901672, "learning_rate": 2.8068774649979006e-05, "loss": 0.4743, "num_input_tokens_seen": 56066416, "step": 46220 }, { "epoch": 5.14812339904221, "grad_norm": 0.12341007590293884, "learning_rate": 2.8063951866222106e-05, "loss": 0.4607, "num_input_tokens_seen": 56072816, "step": 46225 }, { "epoch": 5.148680253925827, "grad_norm": 0.10653715580701828, "learning_rate": 2.8059128966699282e-05, "loss": 0.4598, "num_input_tokens_seen": 56078416, "step": 46230 }, { "epoch": 5.149237108809444, "grad_norm": 0.11660972237586975, "learning_rate": 2.805430595159276e-05, "loss": 0.4644, "num_input_tokens_seen": 56084496, "step": 46235 }, { "epoch": 5.149793963693061, "grad_norm": 0.09134726971387863, "learning_rate": 2.804948282108477e-05, "loss": 0.4676, "num_input_tokens_seen": 56090608, "step": 46240 }, { "epoch": 5.150350818576679, "grad_norm": 0.1025458574295044, "learning_rate": 2.8044659575357552e-05, "loss": 0.4733, "num_input_tokens_seen": 56096784, "step": 46245 }, { "epoch": 5.1509076734602965, "grad_norm": 0.10283131897449493, "learning_rate": 2.803983621459334e-05, "loss": 0.4719, "num_input_tokens_seen": 56102736, "step": 46250 }, { "epoch": 5.151464528343913, "grad_norm": 0.08733689039945602, "learning_rate": 2.803501273897437e-05, "loss": 0.4388, "num_input_tokens_seen": 56109008, "step": 46255 }, { "epoch": 5.152021383227531, "grad_norm": 0.09478586912155151, "learning_rate": 2.8030189148682896e-05, "loss": 0.4616, "num_input_tokens_seen": 56115312, "step": 46260 }, { "epoch": 5.152578238111149, "grad_norm": 0.13964678347110748, "learning_rate": 2.8025365443901164e-05, "loss": 0.4651, "num_input_tokens_seen": 56121552, "step": 46265 }, { "epoch": 5.1531350929947655, "grad_norm": 0.0858009085059166, "learning_rate": 2.802054162481142e-05, "loss": 0.4672, "num_input_tokens_seen": 56127600, "step": 46270 }, { "epoch": 5.153691947878383, "grad_norm": 0.10509639233350754, "learning_rate": 2.8015717691595944e-05, "loss": 0.4597, "num_input_tokens_seen": 56133520, "step": 46275 }, { "epoch": 5.154248802762, "grad_norm": 0.11393294483423233, "learning_rate": 2.8010893644436985e-05, "loss": 0.4585, "num_input_tokens_seen": 56139984, "step": 46280 }, { "epoch": 5.154805657645618, "grad_norm": 0.10708004236221313, "learning_rate": 2.800606948351683e-05, "loss": 0.4596, "num_input_tokens_seen": 56146128, "step": 46285 }, { "epoch": 5.155362512529235, "grad_norm": 0.09274999797344208, "learning_rate": 2.8001245209017725e-05, "loss": 0.4682, "num_input_tokens_seen": 56152400, "step": 46290 }, { "epoch": 5.155919367412852, "grad_norm": 0.0968705415725708, "learning_rate": 2.799642082112195e-05, "loss": 0.464, "num_input_tokens_seen": 56158832, "step": 46295 }, { "epoch": 5.15647622229647, "grad_norm": 0.08122655749320984, "learning_rate": 2.7991596320011803e-05, "loss": 0.4561, "num_input_tokens_seen": 56164848, "step": 46300 }, { "epoch": 5.1570330771800865, "grad_norm": 0.0914917066693306, "learning_rate": 2.798677170586956e-05, "loss": 0.4665, "num_input_tokens_seen": 56170800, "step": 46305 }, { "epoch": 5.157589932063704, "grad_norm": 0.08648694306612015, "learning_rate": 2.798194697887751e-05, "loss": 0.4614, "num_input_tokens_seen": 56176752, "step": 46310 }, { "epoch": 5.158146786947322, "grad_norm": 0.1029096469283104, "learning_rate": 2.7977122139217943e-05, "loss": 0.4585, "num_input_tokens_seen": 56183120, "step": 46315 }, { "epoch": 5.158703641830939, "grad_norm": 0.07924456894397736, "learning_rate": 2.7972297187073164e-05, "loss": 0.463, "num_input_tokens_seen": 56189040, "step": 46320 }, { "epoch": 5.159260496714556, "grad_norm": 0.08325179666280746, "learning_rate": 2.796747212262547e-05, "loss": 0.4662, "num_input_tokens_seen": 56195152, "step": 46325 }, { "epoch": 5.159817351598173, "grad_norm": 0.07819563895463943, "learning_rate": 2.796264694605717e-05, "loss": 0.4529, "num_input_tokens_seen": 56200464, "step": 46330 }, { "epoch": 5.160374206481791, "grad_norm": 0.0956803411245346, "learning_rate": 2.7957821657550575e-05, "loss": 0.4598, "num_input_tokens_seen": 56206352, "step": 46335 }, { "epoch": 5.1609310613654085, "grad_norm": 0.08700435608625412, "learning_rate": 2.7952996257288e-05, "loss": 0.462, "num_input_tokens_seen": 56212432, "step": 46340 }, { "epoch": 5.161487916249025, "grad_norm": 0.07948280870914459, "learning_rate": 2.794817074545176e-05, "loss": 0.4522, "num_input_tokens_seen": 56218576, "step": 46345 }, { "epoch": 5.162044771132643, "grad_norm": 0.11603154242038727, "learning_rate": 2.794334512222419e-05, "loss": 0.4544, "num_input_tokens_seen": 56224496, "step": 46350 }, { "epoch": 5.16260162601626, "grad_norm": 0.10583627969026566, "learning_rate": 2.7938519387787603e-05, "loss": 0.47, "num_input_tokens_seen": 56229968, "step": 46355 }, { "epoch": 5.163158480899877, "grad_norm": 0.09150596708059311, "learning_rate": 2.793369354232433e-05, "loss": 0.4572, "num_input_tokens_seen": 56235664, "step": 46360 }, { "epoch": 5.163715335783495, "grad_norm": 0.11194200813770294, "learning_rate": 2.792886758601672e-05, "loss": 0.4551, "num_input_tokens_seen": 56242000, "step": 46365 }, { "epoch": 5.164272190667112, "grad_norm": 0.19624187052249908, "learning_rate": 2.7924041519047107e-05, "loss": 0.4531, "num_input_tokens_seen": 56247888, "step": 46370 }, { "epoch": 5.1648290455507295, "grad_norm": 0.10743045806884766, "learning_rate": 2.7919215341597837e-05, "loss": 0.4647, "num_input_tokens_seen": 56253840, "step": 46375 }, { "epoch": 5.165385900434347, "grad_norm": 0.09510903805494308, "learning_rate": 2.7914389053851253e-05, "loss": 0.4648, "num_input_tokens_seen": 56259920, "step": 46380 }, { "epoch": 5.165942755317964, "grad_norm": 0.09046392142772675, "learning_rate": 2.790956265598971e-05, "loss": 0.4583, "num_input_tokens_seen": 56266128, "step": 46385 }, { "epoch": 5.166499610201582, "grad_norm": 0.09002336859703064, "learning_rate": 2.7904736148195566e-05, "loss": 0.4612, "num_input_tokens_seen": 56272144, "step": 46390 }, { "epoch": 5.1670564650851984, "grad_norm": 0.09129511564970016, "learning_rate": 2.789990953065118e-05, "loss": 0.4601, "num_input_tokens_seen": 56278128, "step": 46395 }, { "epoch": 5.167613319968816, "grad_norm": 0.11104581505060196, "learning_rate": 2.7895082803538924e-05, "loss": 0.4507, "num_input_tokens_seen": 56284176, "step": 46400 }, { "epoch": 5.168170174852434, "grad_norm": 0.1332702785730362, "learning_rate": 2.789025596704116e-05, "loss": 0.4581, "num_input_tokens_seen": 56290544, "step": 46405 }, { "epoch": 5.168727029736051, "grad_norm": 0.0830833911895752, "learning_rate": 2.7885429021340265e-05, "loss": 0.4561, "num_input_tokens_seen": 56296432, "step": 46410 }, { "epoch": 5.169283884619668, "grad_norm": 0.08411336690187454, "learning_rate": 2.7880601966618612e-05, "loss": 0.4728, "num_input_tokens_seen": 56302512, "step": 46415 }, { "epoch": 5.169840739503285, "grad_norm": 0.11140011250972748, "learning_rate": 2.7875774803058585e-05, "loss": 0.4529, "num_input_tokens_seen": 56308752, "step": 46420 }, { "epoch": 5.170397594386903, "grad_norm": 0.09327211230993271, "learning_rate": 2.7870947530842568e-05, "loss": 0.4612, "num_input_tokens_seen": 56314896, "step": 46425 }, { "epoch": 5.17095444927052, "grad_norm": 0.11674638837575912, "learning_rate": 2.7866120150152957e-05, "loss": 0.4695, "num_input_tokens_seen": 56320976, "step": 46430 }, { "epoch": 5.171511304154137, "grad_norm": 0.060746416449546814, "learning_rate": 2.7861292661172138e-05, "loss": 0.4548, "num_input_tokens_seen": 56326800, "step": 46435 }, { "epoch": 5.172068159037755, "grad_norm": 0.0857488214969635, "learning_rate": 2.7856465064082516e-05, "loss": 0.4616, "num_input_tokens_seen": 56333296, "step": 46440 }, { "epoch": 5.1726250139213725, "grad_norm": 0.08067429810762405, "learning_rate": 2.785163735906649e-05, "loss": 0.4623, "num_input_tokens_seen": 56339632, "step": 46445 }, { "epoch": 5.173181868804989, "grad_norm": 0.08723112940788269, "learning_rate": 2.784680954630647e-05, "loss": 0.4767, "num_input_tokens_seen": 56346064, "step": 46450 }, { "epoch": 5.173738723688607, "grad_norm": 0.09340903162956238, "learning_rate": 2.7841981625984853e-05, "loss": 0.4556, "num_input_tokens_seen": 56351728, "step": 46455 }, { "epoch": 5.174295578572224, "grad_norm": 0.09143611043691635, "learning_rate": 2.7837153598284067e-05, "loss": 0.4644, "num_input_tokens_seen": 56358256, "step": 46460 }, { "epoch": 5.1748524334558414, "grad_norm": 0.11201608926057816, "learning_rate": 2.7832325463386527e-05, "loss": 0.455, "num_input_tokens_seen": 56364464, "step": 46465 }, { "epoch": 5.175409288339459, "grad_norm": 0.08067651838064194, "learning_rate": 2.7827497221474653e-05, "loss": 0.4741, "num_input_tokens_seen": 56369840, "step": 46470 }, { "epoch": 5.175966143223076, "grad_norm": 0.10876703262329102, "learning_rate": 2.7822668872730873e-05, "loss": 0.4607, "num_input_tokens_seen": 56376304, "step": 46475 }, { "epoch": 5.176522998106694, "grad_norm": 0.0857057124376297, "learning_rate": 2.781784041733762e-05, "loss": 0.4672, "num_input_tokens_seen": 56382672, "step": 46480 }, { "epoch": 5.17707985299031, "grad_norm": 0.08757170289754868, "learning_rate": 2.7813011855477323e-05, "loss": 0.4523, "num_input_tokens_seen": 56389072, "step": 46485 }, { "epoch": 5.177636707873928, "grad_norm": 0.1025979220867157, "learning_rate": 2.7808183187332425e-05, "loss": 0.4638, "num_input_tokens_seen": 56394928, "step": 46490 }, { "epoch": 5.178193562757546, "grad_norm": 0.10918693244457245, "learning_rate": 2.7803354413085364e-05, "loss": 0.4759, "num_input_tokens_seen": 56400944, "step": 46495 }, { "epoch": 5.1787504176411625, "grad_norm": 0.13009147346019745, "learning_rate": 2.7798525532918595e-05, "loss": 0.4594, "num_input_tokens_seen": 56406864, "step": 46500 }, { "epoch": 5.17930727252478, "grad_norm": 0.1362917423248291, "learning_rate": 2.779369654701456e-05, "loss": 0.4626, "num_input_tokens_seen": 56412816, "step": 46505 }, { "epoch": 5.179864127408397, "grad_norm": 0.10429546236991882, "learning_rate": 2.7788867455555712e-05, "loss": 0.4628, "num_input_tokens_seen": 56418864, "step": 46510 }, { "epoch": 5.180420982292015, "grad_norm": 0.09737967699766159, "learning_rate": 2.7784038258724514e-05, "loss": 0.4542, "num_input_tokens_seen": 56424944, "step": 46515 }, { "epoch": 5.180977837175632, "grad_norm": 0.11041241884231567, "learning_rate": 2.7779208956703434e-05, "loss": 0.4499, "num_input_tokens_seen": 56431440, "step": 46520 }, { "epoch": 5.181534692059249, "grad_norm": 0.14553441107273102, "learning_rate": 2.7774379549674935e-05, "loss": 0.4634, "num_input_tokens_seen": 56437808, "step": 46525 }, { "epoch": 5.182091546942867, "grad_norm": 0.09596409648656845, "learning_rate": 2.776955003782148e-05, "loss": 0.4675, "num_input_tokens_seen": 56444112, "step": 46530 }, { "epoch": 5.182648401826484, "grad_norm": 0.10375413298606873, "learning_rate": 2.7764720421325552e-05, "loss": 0.4698, "num_input_tokens_seen": 56450256, "step": 46535 }, { "epoch": 5.183205256710101, "grad_norm": 0.11033105105161667, "learning_rate": 2.7759890700369627e-05, "loss": 0.4523, "num_input_tokens_seen": 56455696, "step": 46540 }, { "epoch": 5.183762111593719, "grad_norm": 0.12023387104272842, "learning_rate": 2.7755060875136184e-05, "loss": 0.4521, "num_input_tokens_seen": 56461808, "step": 46545 }, { "epoch": 5.184318966477336, "grad_norm": 0.1607215702533722, "learning_rate": 2.7750230945807715e-05, "loss": 0.4638, "num_input_tokens_seen": 56467280, "step": 46550 }, { "epoch": 5.184875821360953, "grad_norm": 0.09438413381576538, "learning_rate": 2.7745400912566704e-05, "loss": 0.4697, "num_input_tokens_seen": 56473232, "step": 46555 }, { "epoch": 5.185432676244571, "grad_norm": 0.13586203753948212, "learning_rate": 2.7740570775595653e-05, "loss": 0.4608, "num_input_tokens_seen": 56479184, "step": 46560 }, { "epoch": 5.185989531128188, "grad_norm": 0.1021716296672821, "learning_rate": 2.7735740535077054e-05, "loss": 0.4604, "num_input_tokens_seen": 56485264, "step": 46565 }, { "epoch": 5.1865463860118055, "grad_norm": 0.1189463660120964, "learning_rate": 2.7730910191193408e-05, "loss": 0.472, "num_input_tokens_seen": 56491344, "step": 46570 }, { "epoch": 5.187103240895422, "grad_norm": 0.0738549530506134, "learning_rate": 2.772607974412723e-05, "loss": 0.4615, "num_input_tokens_seen": 56496880, "step": 46575 }, { "epoch": 5.18766009577904, "grad_norm": 0.09560620784759521, "learning_rate": 2.772124919406101e-05, "loss": 0.4626, "num_input_tokens_seen": 56502992, "step": 46580 }, { "epoch": 5.188216950662658, "grad_norm": 0.11062651127576828, "learning_rate": 2.7716418541177285e-05, "loss": 0.4587, "num_input_tokens_seen": 56508944, "step": 46585 }, { "epoch": 5.188773805546274, "grad_norm": 0.07970462739467621, "learning_rate": 2.771158778565856e-05, "loss": 0.4558, "num_input_tokens_seen": 56514864, "step": 46590 }, { "epoch": 5.189330660429892, "grad_norm": 0.10535159707069397, "learning_rate": 2.770675692768736e-05, "loss": 0.4805, "num_input_tokens_seen": 56520624, "step": 46595 }, { "epoch": 5.189887515313509, "grad_norm": 0.09394989162683487, "learning_rate": 2.7701925967446214e-05, "loss": 0.4649, "num_input_tokens_seen": 56526768, "step": 46600 }, { "epoch": 5.190444370197127, "grad_norm": 0.09354787319898605, "learning_rate": 2.769709490511764e-05, "loss": 0.4632, "num_input_tokens_seen": 56532848, "step": 46605 }, { "epoch": 5.191001225080744, "grad_norm": 0.1099100112915039, "learning_rate": 2.7692263740884172e-05, "loss": 0.4497, "num_input_tokens_seen": 56538448, "step": 46610 }, { "epoch": 5.191558079964361, "grad_norm": 0.07461602985858917, "learning_rate": 2.768743247492836e-05, "loss": 0.4607, "num_input_tokens_seen": 56544144, "step": 46615 }, { "epoch": 5.192114934847979, "grad_norm": 0.1036420464515686, "learning_rate": 2.768260110743273e-05, "loss": 0.4575, "num_input_tokens_seen": 56549872, "step": 46620 }, { "epoch": 5.192671789731596, "grad_norm": 0.10633784532546997, "learning_rate": 2.7677769638579843e-05, "loss": 0.4627, "num_input_tokens_seen": 56555952, "step": 46625 }, { "epoch": 5.193228644615213, "grad_norm": 0.08693967759609222, "learning_rate": 2.7672938068552234e-05, "loss": 0.4606, "num_input_tokens_seen": 56562256, "step": 46630 }, { "epoch": 5.193785499498831, "grad_norm": 0.09339094161987305, "learning_rate": 2.7668106397532457e-05, "loss": 0.4383, "num_input_tokens_seen": 56568400, "step": 46635 }, { "epoch": 5.194342354382448, "grad_norm": 0.10713181644678116, "learning_rate": 2.766327462570306e-05, "loss": 0.46, "num_input_tokens_seen": 56574800, "step": 46640 }, { "epoch": 5.194899209266065, "grad_norm": 0.16057239472866058, "learning_rate": 2.7658442753246628e-05, "loss": 0.4553, "num_input_tokens_seen": 56580784, "step": 46645 }, { "epoch": 5.195456064149683, "grad_norm": 0.07613550126552582, "learning_rate": 2.7653610780345708e-05, "loss": 0.4583, "num_input_tokens_seen": 56587152, "step": 46650 }, { "epoch": 5.1960129190333, "grad_norm": 0.09697156399488449, "learning_rate": 2.7648778707182866e-05, "loss": 0.4672, "num_input_tokens_seen": 56593360, "step": 46655 }, { "epoch": 5.196569773916917, "grad_norm": 0.10649820417165756, "learning_rate": 2.7643946533940673e-05, "loss": 0.4661, "num_input_tokens_seen": 56599280, "step": 46660 }, { "epoch": 5.197126628800534, "grad_norm": 0.1196938008069992, "learning_rate": 2.763911426080171e-05, "loss": 0.462, "num_input_tokens_seen": 56605456, "step": 46665 }, { "epoch": 5.197683483684152, "grad_norm": 0.09525469690561295, "learning_rate": 2.7634281887948554e-05, "loss": 0.4541, "num_input_tokens_seen": 56611920, "step": 46670 }, { "epoch": 5.19824033856777, "grad_norm": 0.10698115080595016, "learning_rate": 2.762944941556378e-05, "loss": 0.4619, "num_input_tokens_seen": 56617872, "step": 46675 }, { "epoch": 5.198797193451386, "grad_norm": 0.11424288898706436, "learning_rate": 2.7624616843829987e-05, "loss": 0.464, "num_input_tokens_seen": 56624080, "step": 46680 }, { "epoch": 5.199354048335004, "grad_norm": 0.0812835842370987, "learning_rate": 2.7619784172929758e-05, "loss": 0.4534, "num_input_tokens_seen": 56630288, "step": 46685 }, { "epoch": 5.199910903218621, "grad_norm": 0.08517569303512573, "learning_rate": 2.7614951403045685e-05, "loss": 0.4762, "num_input_tokens_seen": 56636720, "step": 46690 }, { "epoch": 5.2004677581022385, "grad_norm": 0.1342923939228058, "learning_rate": 2.7610118534360375e-05, "loss": 0.451, "num_input_tokens_seen": 56642800, "step": 46695 }, { "epoch": 5.201024612985856, "grad_norm": 0.18690668046474457, "learning_rate": 2.7605285567056412e-05, "loss": 0.4773, "num_input_tokens_seen": 56649008, "step": 46700 }, { "epoch": 5.201581467869473, "grad_norm": 0.13518404960632324, "learning_rate": 2.7600452501316416e-05, "loss": 0.46, "num_input_tokens_seen": 56655088, "step": 46705 }, { "epoch": 5.202138322753091, "grad_norm": 0.10603281110525131, "learning_rate": 2.7595619337322993e-05, "loss": 0.4649, "num_input_tokens_seen": 56661040, "step": 46710 }, { "epoch": 5.202695177636708, "grad_norm": 0.08647598326206207, "learning_rate": 2.759078607525875e-05, "loss": 0.4581, "num_input_tokens_seen": 56666992, "step": 46715 }, { "epoch": 5.203252032520325, "grad_norm": 0.13296625018119812, "learning_rate": 2.7585952715306305e-05, "loss": 0.455, "num_input_tokens_seen": 56673168, "step": 46720 }, { "epoch": 5.203808887403943, "grad_norm": 0.09180767834186554, "learning_rate": 2.7581119257648284e-05, "loss": 0.4575, "num_input_tokens_seen": 56679120, "step": 46725 }, { "epoch": 5.2043657422875595, "grad_norm": 0.12203569710254669, "learning_rate": 2.7576285702467298e-05, "loss": 0.4672, "num_input_tokens_seen": 56685008, "step": 46730 }, { "epoch": 5.204922597171177, "grad_norm": 0.11393275111913681, "learning_rate": 2.757145204994599e-05, "loss": 0.4628, "num_input_tokens_seen": 56691152, "step": 46735 }, { "epoch": 5.205479452054795, "grad_norm": 0.09041882306337357, "learning_rate": 2.7566618300266983e-05, "loss": 0.4601, "num_input_tokens_seen": 56697392, "step": 46740 }, { "epoch": 5.206036306938412, "grad_norm": 0.11613117158412933, "learning_rate": 2.756178445361291e-05, "loss": 0.4421, "num_input_tokens_seen": 56703600, "step": 46745 }, { "epoch": 5.206593161822029, "grad_norm": 0.12223269045352936, "learning_rate": 2.7556950510166406e-05, "loss": 0.4654, "num_input_tokens_seen": 56709744, "step": 46750 }, { "epoch": 5.207150016705646, "grad_norm": 0.11626993119716644, "learning_rate": 2.7552116470110123e-05, "loss": 0.464, "num_input_tokens_seen": 56715920, "step": 46755 }, { "epoch": 5.207706871589264, "grad_norm": 0.0944686308503151, "learning_rate": 2.7547282333626702e-05, "loss": 0.4605, "num_input_tokens_seen": 56722064, "step": 46760 }, { "epoch": 5.2082637264728815, "grad_norm": 0.10578587651252747, "learning_rate": 2.7542448100898782e-05, "loss": 0.461, "num_input_tokens_seen": 56728272, "step": 46765 }, { "epoch": 5.208820581356498, "grad_norm": 0.10561583191156387, "learning_rate": 2.7537613772109032e-05, "loss": 0.4499, "num_input_tokens_seen": 56734640, "step": 46770 }, { "epoch": 5.209377436240116, "grad_norm": 0.12019248306751251, "learning_rate": 2.7532779347440102e-05, "loss": 0.4696, "num_input_tokens_seen": 56740816, "step": 46775 }, { "epoch": 5.209934291123733, "grad_norm": 0.10611122846603394, "learning_rate": 2.7527944827074654e-05, "loss": 0.4739, "num_input_tokens_seen": 56747120, "step": 46780 }, { "epoch": 5.21049114600735, "grad_norm": 0.08169185370206833, "learning_rate": 2.7523110211195348e-05, "loss": 0.4492, "num_input_tokens_seen": 56753424, "step": 46785 }, { "epoch": 5.211048000890968, "grad_norm": 0.09992863237857819, "learning_rate": 2.751827549998485e-05, "loss": 0.465, "num_input_tokens_seen": 56759664, "step": 46790 }, { "epoch": 5.211604855774585, "grad_norm": 0.08918149769306183, "learning_rate": 2.751344069362583e-05, "loss": 0.467, "num_input_tokens_seen": 56766096, "step": 46795 }, { "epoch": 5.2121617106582026, "grad_norm": 0.13071562349796295, "learning_rate": 2.7508605792300968e-05, "loss": 0.4556, "num_input_tokens_seen": 56772336, "step": 46800 }, { "epoch": 5.21271856554182, "grad_norm": 0.08491093665361404, "learning_rate": 2.750377079619294e-05, "loss": 0.4635, "num_input_tokens_seen": 56778544, "step": 46805 }, { "epoch": 5.213275420425437, "grad_norm": 0.11683131009340286, "learning_rate": 2.7498935705484436e-05, "loss": 0.4662, "num_input_tokens_seen": 56784688, "step": 46810 }, { "epoch": 5.213832275309055, "grad_norm": 0.1619545966386795, "learning_rate": 2.7494100520358125e-05, "loss": 0.4844, "num_input_tokens_seen": 56790992, "step": 46815 }, { "epoch": 5.2143891301926715, "grad_norm": 0.1140669733285904, "learning_rate": 2.74892652409967e-05, "loss": 0.4602, "num_input_tokens_seen": 56796848, "step": 46820 }, { "epoch": 5.214945985076289, "grad_norm": 0.146328404545784, "learning_rate": 2.7484429867582868e-05, "loss": 0.466, "num_input_tokens_seen": 56802512, "step": 46825 }, { "epoch": 5.215502839959907, "grad_norm": 0.11251378059387207, "learning_rate": 2.74795944002993e-05, "loss": 0.4696, "num_input_tokens_seen": 56808720, "step": 46830 }, { "epoch": 5.216059694843524, "grad_norm": 0.11816676706075668, "learning_rate": 2.7474758839328717e-05, "loss": 0.4606, "num_input_tokens_seen": 56813936, "step": 46835 }, { "epoch": 5.216616549727141, "grad_norm": 0.12164948880672455, "learning_rate": 2.746992318485382e-05, "loss": 0.4693, "num_input_tokens_seen": 56819952, "step": 46840 }, { "epoch": 5.217173404610758, "grad_norm": 0.14125177264213562, "learning_rate": 2.74650874370573e-05, "loss": 0.4688, "num_input_tokens_seen": 56826032, "step": 46845 }, { "epoch": 5.217730259494376, "grad_norm": 0.1325657069683075, "learning_rate": 2.7460251596121884e-05, "loss": 0.4533, "num_input_tokens_seen": 56832048, "step": 46850 }, { "epoch": 5.218287114377993, "grad_norm": 0.10651975870132446, "learning_rate": 2.7455415662230276e-05, "loss": 0.4597, "num_input_tokens_seen": 56837904, "step": 46855 }, { "epoch": 5.21884396926161, "grad_norm": 0.10154692083597183, "learning_rate": 2.745057963556519e-05, "loss": 0.4494, "num_input_tokens_seen": 56843664, "step": 46860 }, { "epoch": 5.219400824145228, "grad_norm": 0.12064879387617111, "learning_rate": 2.7445743516309358e-05, "loss": 0.4454, "num_input_tokens_seen": 56849904, "step": 46865 }, { "epoch": 5.219957679028845, "grad_norm": 0.12393414229154587, "learning_rate": 2.7440907304645495e-05, "loss": 0.4679, "num_input_tokens_seen": 56856208, "step": 46870 }, { "epoch": 5.220514533912462, "grad_norm": 0.0929240956902504, "learning_rate": 2.7436071000756337e-05, "loss": 0.4653, "num_input_tokens_seen": 56861488, "step": 46875 }, { "epoch": 5.22107138879608, "grad_norm": 0.1451733112335205, "learning_rate": 2.7431234604824607e-05, "loss": 0.4611, "num_input_tokens_seen": 56867632, "step": 46880 }, { "epoch": 5.221628243679697, "grad_norm": 0.1122061014175415, "learning_rate": 2.742639811703304e-05, "loss": 0.4379, "num_input_tokens_seen": 56873168, "step": 46885 }, { "epoch": 5.2221850985633145, "grad_norm": 0.11842355877161026, "learning_rate": 2.7421561537564376e-05, "loss": 0.4556, "num_input_tokens_seen": 56879024, "step": 46890 }, { "epoch": 5.222741953446932, "grad_norm": 0.1236833781003952, "learning_rate": 2.7416724866601364e-05, "loss": 0.4641, "num_input_tokens_seen": 56885104, "step": 46895 }, { "epoch": 5.223298808330549, "grad_norm": 0.11847802251577377, "learning_rate": 2.741188810432674e-05, "loss": 0.4789, "num_input_tokens_seen": 56891056, "step": 46900 }, { "epoch": 5.223855663214167, "grad_norm": 0.12345738708972931, "learning_rate": 2.7407051250923248e-05, "loss": 0.4666, "num_input_tokens_seen": 56897136, "step": 46905 }, { "epoch": 5.224412518097783, "grad_norm": 0.15621574223041534, "learning_rate": 2.7402214306573653e-05, "loss": 0.4624, "num_input_tokens_seen": 56903344, "step": 46910 }, { "epoch": 5.224969372981401, "grad_norm": 0.12181198596954346, "learning_rate": 2.7397377271460693e-05, "loss": 0.4488, "num_input_tokens_seen": 56909648, "step": 46915 }, { "epoch": 5.225526227865019, "grad_norm": 0.12023868411779404, "learning_rate": 2.739254014576715e-05, "loss": 0.4705, "num_input_tokens_seen": 56915664, "step": 46920 }, { "epoch": 5.2260830827486355, "grad_norm": 0.09787669032812119, "learning_rate": 2.7387702929675764e-05, "loss": 0.4631, "num_input_tokens_seen": 56922192, "step": 46925 }, { "epoch": 5.226639937632253, "grad_norm": 0.12312819808721542, "learning_rate": 2.7382865623369313e-05, "loss": 0.452, "num_input_tokens_seen": 56927792, "step": 46930 }, { "epoch": 5.22719679251587, "grad_norm": 0.1171860620379448, "learning_rate": 2.737802822703056e-05, "loss": 0.4469, "num_input_tokens_seen": 56934000, "step": 46935 }, { "epoch": 5.227753647399488, "grad_norm": 0.16956102848052979, "learning_rate": 2.737319074084228e-05, "loss": 0.4615, "num_input_tokens_seen": 56939952, "step": 46940 }, { "epoch": 5.228310502283105, "grad_norm": 0.0837654173374176, "learning_rate": 2.7368353164987255e-05, "loss": 0.4583, "num_input_tokens_seen": 56945808, "step": 46945 }, { "epoch": 5.228867357166722, "grad_norm": 0.08901314437389374, "learning_rate": 2.7363515499648246e-05, "loss": 0.4519, "num_input_tokens_seen": 56952208, "step": 46950 }, { "epoch": 5.22942421205034, "grad_norm": 0.11305496096611023, "learning_rate": 2.7358677745008056e-05, "loss": 0.4623, "num_input_tokens_seen": 56958320, "step": 46955 }, { "epoch": 5.229981066933957, "grad_norm": 0.1694880723953247, "learning_rate": 2.7353839901249455e-05, "loss": 0.4643, "num_input_tokens_seen": 56964464, "step": 46960 }, { "epoch": 5.230537921817574, "grad_norm": 0.13612312078475952, "learning_rate": 2.7349001968555248e-05, "loss": 0.4397, "num_input_tokens_seen": 56970320, "step": 46965 }, { "epoch": 5.231094776701192, "grad_norm": 0.1416763961315155, "learning_rate": 2.7344163947108213e-05, "loss": 0.4743, "num_input_tokens_seen": 56976976, "step": 46970 }, { "epoch": 5.231651631584809, "grad_norm": 0.1435374617576599, "learning_rate": 2.7339325837091155e-05, "loss": 0.4592, "num_input_tokens_seen": 56983248, "step": 46975 }, { "epoch": 5.232208486468426, "grad_norm": 0.11305884271860123, "learning_rate": 2.7334487638686862e-05, "loss": 0.4666, "num_input_tokens_seen": 56989264, "step": 46980 }, { "epoch": 5.232765341352044, "grad_norm": 0.10732106864452362, "learning_rate": 2.7329649352078156e-05, "loss": 0.4772, "num_input_tokens_seen": 56995472, "step": 46985 }, { "epoch": 5.233322196235661, "grad_norm": 0.10213949531316757, "learning_rate": 2.732481097744783e-05, "loss": 0.4584, "num_input_tokens_seen": 57001584, "step": 46990 }, { "epoch": 5.2338790511192785, "grad_norm": 0.10249438136816025, "learning_rate": 2.7319972514978687e-05, "loss": 0.4664, "num_input_tokens_seen": 57007472, "step": 46995 }, { "epoch": 5.234435906002895, "grad_norm": 0.15928655862808228, "learning_rate": 2.7315133964853555e-05, "loss": 0.4676, "num_input_tokens_seen": 57013808, "step": 47000 }, { "epoch": 5.234992760886513, "grad_norm": 0.06848497688770294, "learning_rate": 2.7310295327255243e-05, "loss": 0.4586, "num_input_tokens_seen": 57020176, "step": 47005 }, { "epoch": 5.235549615770131, "grad_norm": 0.10336033254861832, "learning_rate": 2.730545660236657e-05, "loss": 0.4527, "num_input_tokens_seen": 57026512, "step": 47010 }, { "epoch": 5.2361064706537475, "grad_norm": 0.09872577339410782, "learning_rate": 2.7300617790370347e-05, "loss": 0.4635, "num_input_tokens_seen": 57032656, "step": 47015 }, { "epoch": 5.236663325537365, "grad_norm": 0.12575168907642365, "learning_rate": 2.729577889144942e-05, "loss": 0.4632, "num_input_tokens_seen": 57038576, "step": 47020 }, { "epoch": 5.237220180420982, "grad_norm": 0.15286913514137268, "learning_rate": 2.729093990578661e-05, "loss": 0.4562, "num_input_tokens_seen": 57044560, "step": 47025 }, { "epoch": 5.2377770353046, "grad_norm": 0.1179380714893341, "learning_rate": 2.7286100833564747e-05, "loss": 0.4556, "num_input_tokens_seen": 57050672, "step": 47030 }, { "epoch": 5.238333890188217, "grad_norm": 0.10093434154987335, "learning_rate": 2.7281261674966673e-05, "loss": 0.4607, "num_input_tokens_seen": 57056848, "step": 47035 }, { "epoch": 5.238890745071834, "grad_norm": 0.07599657028913498, "learning_rate": 2.727642243017522e-05, "loss": 0.4643, "num_input_tokens_seen": 57062480, "step": 47040 }, { "epoch": 5.239447599955452, "grad_norm": 0.11177713423967361, "learning_rate": 2.7271583099373226e-05, "loss": 0.4645, "num_input_tokens_seen": 57068400, "step": 47045 }, { "epoch": 5.2400044548390685, "grad_norm": 0.10457637906074524, "learning_rate": 2.7266743682743547e-05, "loss": 0.4563, "num_input_tokens_seen": 57074512, "step": 47050 }, { "epoch": 5.240561309722686, "grad_norm": 0.09817422926425934, "learning_rate": 2.7261904180469028e-05, "loss": 0.4657, "num_input_tokens_seen": 57080688, "step": 47055 }, { "epoch": 5.241118164606304, "grad_norm": 0.10545636713504791, "learning_rate": 2.7257064592732523e-05, "loss": 0.4681, "num_input_tokens_seen": 57086768, "step": 47060 }, { "epoch": 5.241675019489921, "grad_norm": 0.09622084349393845, "learning_rate": 2.7252224919716884e-05, "loss": 0.4585, "num_input_tokens_seen": 57092624, "step": 47065 }, { "epoch": 5.242231874373538, "grad_norm": 0.08749130368232727, "learning_rate": 2.7247385161604967e-05, "loss": 0.4699, "num_input_tokens_seen": 57098704, "step": 47070 }, { "epoch": 5.242788729257156, "grad_norm": 0.1044941246509552, "learning_rate": 2.7242545318579637e-05, "loss": 0.4687, "num_input_tokens_seen": 57104720, "step": 47075 }, { "epoch": 5.243345584140773, "grad_norm": 0.1307043433189392, "learning_rate": 2.723770539082376e-05, "loss": 0.4412, "num_input_tokens_seen": 57110544, "step": 47080 }, { "epoch": 5.2439024390243905, "grad_norm": 0.12503156065940857, "learning_rate": 2.7232865378520202e-05, "loss": 0.4552, "num_input_tokens_seen": 57116880, "step": 47085 }, { "epoch": 5.244459293908007, "grad_norm": 0.10541912168264389, "learning_rate": 2.7228025281851832e-05, "loss": 0.4485, "num_input_tokens_seen": 57122992, "step": 47090 }, { "epoch": 5.245016148791625, "grad_norm": 0.10865966230630875, "learning_rate": 2.722318510100153e-05, "loss": 0.4581, "num_input_tokens_seen": 57129104, "step": 47095 }, { "epoch": 5.245573003675243, "grad_norm": 0.11983945965766907, "learning_rate": 2.7218344836152166e-05, "loss": 0.4497, "num_input_tokens_seen": 57134928, "step": 47100 }, { "epoch": 5.246129858558859, "grad_norm": 0.08659813553094864, "learning_rate": 2.721350448748663e-05, "loss": 0.464, "num_input_tokens_seen": 57140912, "step": 47105 }, { "epoch": 5.246686713442477, "grad_norm": 0.11205483227968216, "learning_rate": 2.72086640551878e-05, "loss": 0.4603, "num_input_tokens_seen": 57146768, "step": 47110 }, { "epoch": 5.247243568326094, "grad_norm": 0.10561881214380264, "learning_rate": 2.7203823539438565e-05, "loss": 0.4658, "num_input_tokens_seen": 57152880, "step": 47115 }, { "epoch": 5.2478004232097115, "grad_norm": 0.09962142258882523, "learning_rate": 2.7198982940421812e-05, "loss": 0.4639, "num_input_tokens_seen": 57158160, "step": 47120 }, { "epoch": 5.248357278093329, "grad_norm": 0.11036715656518936, "learning_rate": 2.7194142258320437e-05, "loss": 0.4624, "num_input_tokens_seen": 57164368, "step": 47125 }, { "epoch": 5.248914132976946, "grad_norm": 0.12488322705030441, "learning_rate": 2.718930149331734e-05, "loss": 0.4681, "num_input_tokens_seen": 57170544, "step": 47130 }, { "epoch": 5.249470987860564, "grad_norm": 0.0903121754527092, "learning_rate": 2.7184460645595405e-05, "loss": 0.4636, "num_input_tokens_seen": 57176432, "step": 47135 }, { "epoch": 5.250027842744181, "grad_norm": 0.10327259451150894, "learning_rate": 2.717961971533755e-05, "loss": 0.472, "num_input_tokens_seen": 57182800, "step": 47140 }, { "epoch": 5.250584697627798, "grad_norm": 0.10253455489873886, "learning_rate": 2.7174778702726684e-05, "loss": 0.4586, "num_input_tokens_seen": 57189072, "step": 47145 }, { "epoch": 5.251141552511416, "grad_norm": 0.09554900228977203, "learning_rate": 2.7169937607945704e-05, "loss": 0.4528, "num_input_tokens_seen": 57194608, "step": 47150 }, { "epoch": 5.251698407395033, "grad_norm": 0.11004354059696198, "learning_rate": 2.7165096431177535e-05, "loss": 0.4715, "num_input_tokens_seen": 57200816, "step": 47155 }, { "epoch": 5.25225526227865, "grad_norm": 0.13000118732452393, "learning_rate": 2.7160255172605076e-05, "loss": 0.4655, "num_input_tokens_seen": 57206832, "step": 47160 }, { "epoch": 5.252812117162268, "grad_norm": 0.1052330955862999, "learning_rate": 2.7155413832411254e-05, "loss": 0.4641, "num_input_tokens_seen": 57212912, "step": 47165 }, { "epoch": 5.253368972045885, "grad_norm": 0.08857564628124237, "learning_rate": 2.715057241077899e-05, "loss": 0.4765, "num_input_tokens_seen": 57218928, "step": 47170 }, { "epoch": 5.253925826929502, "grad_norm": 0.09435366094112396, "learning_rate": 2.7145730907891216e-05, "loss": 0.4495, "num_input_tokens_seen": 57224976, "step": 47175 }, { "epoch": 5.254482681813119, "grad_norm": 0.1280529797077179, "learning_rate": 2.7140889323930847e-05, "loss": 0.4554, "num_input_tokens_seen": 57231088, "step": 47180 }, { "epoch": 5.255039536696737, "grad_norm": 0.11001141369342804, "learning_rate": 2.7136047659080816e-05, "loss": 0.4705, "num_input_tokens_seen": 57237040, "step": 47185 }, { "epoch": 5.2555963915803545, "grad_norm": 0.09371168911457062, "learning_rate": 2.713120591352406e-05, "loss": 0.468, "num_input_tokens_seen": 57243376, "step": 47190 }, { "epoch": 5.256153246463971, "grad_norm": 0.10968839377164841, "learning_rate": 2.7126364087443517e-05, "loss": 0.4724, "num_input_tokens_seen": 57249584, "step": 47195 }, { "epoch": 5.256710101347589, "grad_norm": 0.08515971899032593, "learning_rate": 2.712152218102212e-05, "loss": 0.4673, "num_input_tokens_seen": 57255760, "step": 47200 }, { "epoch": 5.257266956231206, "grad_norm": 0.09749636799097061, "learning_rate": 2.7116680194442823e-05, "loss": 0.4616, "num_input_tokens_seen": 57261776, "step": 47205 }, { "epoch": 5.257823811114823, "grad_norm": 0.14202673733234406, "learning_rate": 2.7111838127888562e-05, "loss": 0.4653, "num_input_tokens_seen": 57268048, "step": 47210 }, { "epoch": 5.258380665998441, "grad_norm": 0.07992386817932129, "learning_rate": 2.7106995981542287e-05, "loss": 0.4614, "num_input_tokens_seen": 57274064, "step": 47215 }, { "epoch": 5.258937520882058, "grad_norm": 0.10376681387424469, "learning_rate": 2.7102153755586956e-05, "loss": 0.4577, "num_input_tokens_seen": 57280368, "step": 47220 }, { "epoch": 5.259494375765676, "grad_norm": 0.08159303665161133, "learning_rate": 2.7097311450205522e-05, "loss": 0.4531, "num_input_tokens_seen": 57286096, "step": 47225 }, { "epoch": 5.260051230649292, "grad_norm": 0.09671349078416824, "learning_rate": 2.7092469065580927e-05, "loss": 0.4505, "num_input_tokens_seen": 57291952, "step": 47230 }, { "epoch": 5.26060808553291, "grad_norm": 0.1039276197552681, "learning_rate": 2.7087626601896153e-05, "loss": 0.4617, "num_input_tokens_seen": 57298064, "step": 47235 }, { "epoch": 5.261164940416528, "grad_norm": 0.07572715729475021, "learning_rate": 2.7082784059334154e-05, "loss": 0.4491, "num_input_tokens_seen": 57304080, "step": 47240 }, { "epoch": 5.2617217953001445, "grad_norm": 0.07403711974620819, "learning_rate": 2.7077941438077904e-05, "loss": 0.468, "num_input_tokens_seen": 57310288, "step": 47245 }, { "epoch": 5.262278650183762, "grad_norm": 0.11165177822113037, "learning_rate": 2.7073098738310364e-05, "loss": 0.4756, "num_input_tokens_seen": 57316496, "step": 47250 }, { "epoch": 5.26283550506738, "grad_norm": 0.10883326828479767, "learning_rate": 2.7068255960214513e-05, "loss": 0.4727, "num_input_tokens_seen": 57322768, "step": 47255 }, { "epoch": 5.263392359950997, "grad_norm": 0.08681241422891617, "learning_rate": 2.7063413103973323e-05, "loss": 0.4717, "num_input_tokens_seen": 57328208, "step": 47260 }, { "epoch": 5.263949214834614, "grad_norm": 0.11378616094589233, "learning_rate": 2.7058570169769763e-05, "loss": 0.4631, "num_input_tokens_seen": 57333584, "step": 47265 }, { "epoch": 5.264506069718231, "grad_norm": 0.14309725165367126, "learning_rate": 2.7053727157786834e-05, "loss": 0.4787, "num_input_tokens_seen": 57339632, "step": 47270 }, { "epoch": 5.265062924601849, "grad_norm": 0.08361929655075073, "learning_rate": 2.7048884068207515e-05, "loss": 0.4498, "num_input_tokens_seen": 57345936, "step": 47275 }, { "epoch": 5.2656197794854664, "grad_norm": 0.10874655097723007, "learning_rate": 2.7044040901214785e-05, "loss": 0.4634, "num_input_tokens_seen": 57352304, "step": 47280 }, { "epoch": 5.266176634369083, "grad_norm": 0.08237720280885696, "learning_rate": 2.7039197656991638e-05, "loss": 0.4571, "num_input_tokens_seen": 57358416, "step": 47285 }, { "epoch": 5.266733489252701, "grad_norm": 0.11753794550895691, "learning_rate": 2.7034354335721072e-05, "loss": 0.4708, "num_input_tokens_seen": 57364624, "step": 47290 }, { "epoch": 5.267290344136318, "grad_norm": 0.09040969610214233, "learning_rate": 2.7029510937586072e-05, "loss": 0.4656, "num_input_tokens_seen": 57370864, "step": 47295 }, { "epoch": 5.267847199019935, "grad_norm": 0.11589086055755615, "learning_rate": 2.7024667462769653e-05, "loss": 0.4827, "num_input_tokens_seen": 57376592, "step": 47300 }, { "epoch": 5.268404053903553, "grad_norm": 0.09399969130754471, "learning_rate": 2.7019823911454812e-05, "loss": 0.4505, "num_input_tokens_seen": 57382256, "step": 47305 }, { "epoch": 5.26896090878717, "grad_norm": 0.07327553629875183, "learning_rate": 2.701498028382455e-05, "loss": 0.4606, "num_input_tokens_seen": 57388336, "step": 47310 }, { "epoch": 5.2695177636707875, "grad_norm": 0.10317452251911163, "learning_rate": 2.7010136580061873e-05, "loss": 0.4519, "num_input_tokens_seen": 57394512, "step": 47315 }, { "epoch": 5.270074618554405, "grad_norm": 0.09634343534708023, "learning_rate": 2.7005292800349785e-05, "loss": 0.4738, "num_input_tokens_seen": 57400432, "step": 47320 }, { "epoch": 5.270631473438022, "grad_norm": 0.08461600542068481, "learning_rate": 2.7000448944871322e-05, "loss": 0.4672, "num_input_tokens_seen": 57405968, "step": 47325 }, { "epoch": 5.27118832832164, "grad_norm": 0.09124543517827988, "learning_rate": 2.6995605013809476e-05, "loss": 0.4642, "num_input_tokens_seen": 57411952, "step": 47330 }, { "epoch": 5.271745183205256, "grad_norm": 0.08967110514640808, "learning_rate": 2.6990761007347288e-05, "loss": 0.456, "num_input_tokens_seen": 57417936, "step": 47335 }, { "epoch": 5.272302038088874, "grad_norm": 0.09528186917304993, "learning_rate": 2.6985916925667765e-05, "loss": 0.4579, "num_input_tokens_seen": 57424048, "step": 47340 }, { "epoch": 5.272858892972492, "grad_norm": 0.10713234543800354, "learning_rate": 2.6981072768953936e-05, "loss": 0.4591, "num_input_tokens_seen": 57429968, "step": 47345 }, { "epoch": 5.273415747856109, "grad_norm": 0.10490317642688751, "learning_rate": 2.6976228537388833e-05, "loss": 0.4595, "num_input_tokens_seen": 57435856, "step": 47350 }, { "epoch": 5.273972602739726, "grad_norm": 0.07580703496932983, "learning_rate": 2.697138423115547e-05, "loss": 0.459, "num_input_tokens_seen": 57442128, "step": 47355 }, { "epoch": 5.274529457623343, "grad_norm": 0.1372934728860855, "learning_rate": 2.6966539850436902e-05, "loss": 0.462, "num_input_tokens_seen": 57448464, "step": 47360 }, { "epoch": 5.275086312506961, "grad_norm": 0.09367933124303818, "learning_rate": 2.696169539541616e-05, "loss": 0.4605, "num_input_tokens_seen": 57454704, "step": 47365 }, { "epoch": 5.275643167390578, "grad_norm": 0.07881253957748413, "learning_rate": 2.6956850866276275e-05, "loss": 0.4568, "num_input_tokens_seen": 57461168, "step": 47370 }, { "epoch": 5.276200022274195, "grad_norm": 0.11323212832212448, "learning_rate": 2.695200626320029e-05, "loss": 0.4568, "num_input_tokens_seen": 57466224, "step": 47375 }, { "epoch": 5.276756877157813, "grad_norm": 0.08246434479951859, "learning_rate": 2.694716158637126e-05, "loss": 0.4518, "num_input_tokens_seen": 57472080, "step": 47380 }, { "epoch": 5.27731373204143, "grad_norm": 0.08966485410928726, "learning_rate": 2.6942316835972213e-05, "loss": 0.4612, "num_input_tokens_seen": 57478128, "step": 47385 }, { "epoch": 5.277870586925047, "grad_norm": 0.08341208845376968, "learning_rate": 2.6937472012186227e-05, "loss": 0.461, "num_input_tokens_seen": 57484080, "step": 47390 }, { "epoch": 5.278427441808665, "grad_norm": 0.09851662069559097, "learning_rate": 2.693262711519633e-05, "loss": 0.461, "num_input_tokens_seen": 57490160, "step": 47395 }, { "epoch": 5.278984296692282, "grad_norm": 0.08756014704704285, "learning_rate": 2.692778214518559e-05, "loss": 0.4703, "num_input_tokens_seen": 57496528, "step": 47400 }, { "epoch": 5.279541151575899, "grad_norm": 0.09959398210048676, "learning_rate": 2.6922937102337064e-05, "loss": 0.456, "num_input_tokens_seen": 57502928, "step": 47405 }, { "epoch": 5.280098006459516, "grad_norm": 0.10345970839262009, "learning_rate": 2.691809198683381e-05, "loss": 0.4616, "num_input_tokens_seen": 57509008, "step": 47410 }, { "epoch": 5.280654861343134, "grad_norm": 0.10133237391710281, "learning_rate": 2.6913246798858895e-05, "loss": 0.4785, "num_input_tokens_seen": 57515312, "step": 47415 }, { "epoch": 5.281211716226752, "grad_norm": 0.1308869570493698, "learning_rate": 2.6908401538595384e-05, "loss": 0.4682, "num_input_tokens_seen": 57521648, "step": 47420 }, { "epoch": 5.281768571110368, "grad_norm": 0.11066943407058716, "learning_rate": 2.6903556206226343e-05, "loss": 0.4595, "num_input_tokens_seen": 57527888, "step": 47425 }, { "epoch": 5.282325425993986, "grad_norm": 0.09239166229963303, "learning_rate": 2.6898710801934858e-05, "loss": 0.4693, "num_input_tokens_seen": 57534160, "step": 47430 }, { "epoch": 5.282882280877604, "grad_norm": 0.08839571475982666, "learning_rate": 2.6893865325903988e-05, "loss": 0.4648, "num_input_tokens_seen": 57540432, "step": 47435 }, { "epoch": 5.2834391357612205, "grad_norm": 0.09135700017213821, "learning_rate": 2.6889019778316815e-05, "loss": 0.4645, "num_input_tokens_seen": 57546736, "step": 47440 }, { "epoch": 5.283995990644838, "grad_norm": 0.11040712147951126, "learning_rate": 2.688417415935643e-05, "loss": 0.4479, "num_input_tokens_seen": 57552976, "step": 47445 }, { "epoch": 5.284552845528455, "grad_norm": 0.0875622034072876, "learning_rate": 2.6879328469205894e-05, "loss": 0.4588, "num_input_tokens_seen": 57558480, "step": 47450 }, { "epoch": 5.285109700412073, "grad_norm": 0.12202189117670059, "learning_rate": 2.687448270804831e-05, "loss": 0.4664, "num_input_tokens_seen": 57564496, "step": 47455 }, { "epoch": 5.28566655529569, "grad_norm": 0.11859485507011414, "learning_rate": 2.6869636876066766e-05, "loss": 0.4625, "num_input_tokens_seen": 57570640, "step": 47460 }, { "epoch": 5.286223410179307, "grad_norm": 0.1190878227353096, "learning_rate": 2.6864790973444348e-05, "loss": 0.4601, "num_input_tokens_seen": 57576912, "step": 47465 }, { "epoch": 5.286780265062925, "grad_norm": 0.11199404299259186, "learning_rate": 2.6859945000364156e-05, "loss": 0.4606, "num_input_tokens_seen": 57582768, "step": 47470 }, { "epoch": 5.2873371199465415, "grad_norm": 0.09872420132160187, "learning_rate": 2.6855098957009273e-05, "loss": 0.4692, "num_input_tokens_seen": 57588400, "step": 47475 }, { "epoch": 5.287893974830159, "grad_norm": 0.09292922914028168, "learning_rate": 2.6850252843562807e-05, "loss": 0.465, "num_input_tokens_seen": 57594576, "step": 47480 }, { "epoch": 5.288450829713777, "grad_norm": 0.11973707377910614, "learning_rate": 2.6845406660207862e-05, "loss": 0.4631, "num_input_tokens_seen": 57600752, "step": 47485 }, { "epoch": 5.289007684597394, "grad_norm": 0.09351203590631485, "learning_rate": 2.684056040712754e-05, "loss": 0.455, "num_input_tokens_seen": 57606768, "step": 47490 }, { "epoch": 5.289564539481011, "grad_norm": 0.07850794494152069, "learning_rate": 2.683571408450495e-05, "loss": 0.4545, "num_input_tokens_seen": 57612944, "step": 47495 }, { "epoch": 5.290121394364629, "grad_norm": 0.12303587049245834, "learning_rate": 2.683086769252319e-05, "loss": 0.4582, "num_input_tokens_seen": 57619344, "step": 47500 }, { "epoch": 5.290678249248246, "grad_norm": 0.09374392777681351, "learning_rate": 2.6826021231365394e-05, "loss": 0.4626, "num_input_tokens_seen": 57625264, "step": 47505 }, { "epoch": 5.2912351041318635, "grad_norm": 0.1109422817826271, "learning_rate": 2.682117470121466e-05, "loss": 0.4676, "num_input_tokens_seen": 57631664, "step": 47510 }, { "epoch": 5.29179195901548, "grad_norm": 0.12984837591648102, "learning_rate": 2.68163281022541e-05, "loss": 0.4687, "num_input_tokens_seen": 57637680, "step": 47515 }, { "epoch": 5.292348813899098, "grad_norm": 0.10894498974084854, "learning_rate": 2.6811481434666856e-05, "loss": 0.4672, "num_input_tokens_seen": 57643664, "step": 47520 }, { "epoch": 5.292905668782716, "grad_norm": 0.09711826592683792, "learning_rate": 2.6806634698636036e-05, "loss": 0.4559, "num_input_tokens_seen": 57649776, "step": 47525 }, { "epoch": 5.293462523666332, "grad_norm": 0.09753229469060898, "learning_rate": 2.6801787894344765e-05, "loss": 0.46, "num_input_tokens_seen": 57655664, "step": 47530 }, { "epoch": 5.29401937854995, "grad_norm": 0.09183397144079208, "learning_rate": 2.6796941021976168e-05, "loss": 0.4643, "num_input_tokens_seen": 57661616, "step": 47535 }, { "epoch": 5.294576233433567, "grad_norm": 0.1056809350848198, "learning_rate": 2.679209408171338e-05, "loss": 0.4717, "num_input_tokens_seen": 57667856, "step": 47540 }, { "epoch": 5.2951330883171845, "grad_norm": 0.11888129264116287, "learning_rate": 2.678724707373954e-05, "loss": 0.4568, "num_input_tokens_seen": 57674096, "step": 47545 }, { "epoch": 5.295689943200802, "grad_norm": 0.17204126715660095, "learning_rate": 2.6782399998237785e-05, "loss": 0.4561, "num_input_tokens_seen": 57680144, "step": 47550 }, { "epoch": 5.296246798084419, "grad_norm": 0.12305962294340134, "learning_rate": 2.6777552855391236e-05, "loss": 0.4653, "num_input_tokens_seen": 57685904, "step": 47555 }, { "epoch": 5.296803652968037, "grad_norm": 0.13074389100074768, "learning_rate": 2.6772705645383045e-05, "loss": 0.4611, "num_input_tokens_seen": 57692144, "step": 47560 }, { "epoch": 5.2973605078516535, "grad_norm": 0.0969364270567894, "learning_rate": 2.676785836839636e-05, "loss": 0.4756, "num_input_tokens_seen": 57697904, "step": 47565 }, { "epoch": 5.297917362735271, "grad_norm": 0.13155697286128998, "learning_rate": 2.676301102461431e-05, "loss": 0.465, "num_input_tokens_seen": 57704240, "step": 47570 }, { "epoch": 5.298474217618889, "grad_norm": 0.1151517704129219, "learning_rate": 2.675816361422006e-05, "loss": 0.4524, "num_input_tokens_seen": 57710352, "step": 47575 }, { "epoch": 5.299031072502506, "grad_norm": 0.16699261963367462, "learning_rate": 2.6753316137396754e-05, "loss": 0.4592, "num_input_tokens_seen": 57716656, "step": 47580 }, { "epoch": 5.299587927386123, "grad_norm": 0.1036430150270462, "learning_rate": 2.6748468594327548e-05, "loss": 0.4588, "num_input_tokens_seen": 57722864, "step": 47585 }, { "epoch": 5.30014478226974, "grad_norm": 0.08495877683162689, "learning_rate": 2.674362098519559e-05, "loss": 0.4714, "num_input_tokens_seen": 57728880, "step": 47590 }, { "epoch": 5.300701637153358, "grad_norm": 0.08947079628705978, "learning_rate": 2.6738773310184046e-05, "loss": 0.4676, "num_input_tokens_seen": 57735152, "step": 47595 }, { "epoch": 5.301258492036975, "grad_norm": 0.12510575354099274, "learning_rate": 2.6733925569476076e-05, "loss": 0.4769, "num_input_tokens_seen": 57741552, "step": 47600 }, { "epoch": 5.301815346920592, "grad_norm": 0.12590374052524567, "learning_rate": 2.6729077763254834e-05, "loss": 0.4606, "num_input_tokens_seen": 57747632, "step": 47605 }, { "epoch": 5.30237220180421, "grad_norm": 0.09034169465303421, "learning_rate": 2.67242298917035e-05, "loss": 0.4525, "num_input_tokens_seen": 57753680, "step": 47610 }, { "epoch": 5.3029290566878275, "grad_norm": 0.1173577532172203, "learning_rate": 2.6719381955005235e-05, "loss": 0.4621, "num_input_tokens_seen": 57759920, "step": 47615 }, { "epoch": 5.303485911571444, "grad_norm": 0.12063569575548172, "learning_rate": 2.6714533953343205e-05, "loss": 0.4617, "num_input_tokens_seen": 57765840, "step": 47620 }, { "epoch": 5.304042766455062, "grad_norm": 0.08933055400848389, "learning_rate": 2.6709685886900594e-05, "loss": 0.4694, "num_input_tokens_seen": 57771856, "step": 47625 }, { "epoch": 5.304599621338679, "grad_norm": 0.08490222692489624, "learning_rate": 2.6704837755860567e-05, "loss": 0.4717, "num_input_tokens_seen": 57777968, "step": 47630 }, { "epoch": 5.3051564762222965, "grad_norm": 0.13432548940181732, "learning_rate": 2.6699989560406303e-05, "loss": 0.4624, "num_input_tokens_seen": 57784272, "step": 47635 }, { "epoch": 5.305713331105914, "grad_norm": 0.0852622389793396, "learning_rate": 2.6695141300720993e-05, "loss": 0.4604, "num_input_tokens_seen": 57790352, "step": 47640 }, { "epoch": 5.306270185989531, "grad_norm": 0.10669541358947754, "learning_rate": 2.6690292976987807e-05, "loss": 0.4635, "num_input_tokens_seen": 57796400, "step": 47645 }, { "epoch": 5.306827040873149, "grad_norm": 0.10489830374717712, "learning_rate": 2.668544458938994e-05, "loss": 0.4542, "num_input_tokens_seen": 57802704, "step": 47650 }, { "epoch": 5.307383895756765, "grad_norm": 0.10418164730072021, "learning_rate": 2.6680596138110574e-05, "loss": 0.4563, "num_input_tokens_seen": 57809104, "step": 47655 }, { "epoch": 5.307940750640383, "grad_norm": 0.11898328363895416, "learning_rate": 2.6675747623332902e-05, "loss": 0.4664, "num_input_tokens_seen": 57815408, "step": 47660 }, { "epoch": 5.308497605524001, "grad_norm": 0.08411047607660294, "learning_rate": 2.6670899045240104e-05, "loss": 0.4691, "num_input_tokens_seen": 57821328, "step": 47665 }, { "epoch": 5.3090544604076175, "grad_norm": 0.08187080174684525, "learning_rate": 2.6666050404015395e-05, "loss": 0.467, "num_input_tokens_seen": 57827408, "step": 47670 }, { "epoch": 5.309611315291235, "grad_norm": 0.09975577145814896, "learning_rate": 2.666120169984196e-05, "loss": 0.4649, "num_input_tokens_seen": 57833648, "step": 47675 }, { "epoch": 5.310168170174853, "grad_norm": 0.09301174432039261, "learning_rate": 2.6656352932903006e-05, "loss": 0.4683, "num_input_tokens_seen": 57839856, "step": 47680 }, { "epoch": 5.31072502505847, "grad_norm": 0.12503431737422943, "learning_rate": 2.6651504103381725e-05, "loss": 0.4734, "num_input_tokens_seen": 57845776, "step": 47685 }, { "epoch": 5.311281879942087, "grad_norm": 0.1320926547050476, "learning_rate": 2.664665521146133e-05, "loss": 0.4605, "num_input_tokens_seen": 57852016, "step": 47690 }, { "epoch": 5.311838734825704, "grad_norm": 0.07996347546577454, "learning_rate": 2.664180625732502e-05, "loss": 0.452, "num_input_tokens_seen": 57858000, "step": 47695 }, { "epoch": 5.312395589709322, "grad_norm": 0.12559600174427032, "learning_rate": 2.6636957241156008e-05, "loss": 0.4683, "num_input_tokens_seen": 57864048, "step": 47700 }, { "epoch": 5.3129524445929395, "grad_norm": 0.10383616387844086, "learning_rate": 2.6632108163137503e-05, "loss": 0.4746, "num_input_tokens_seen": 57869936, "step": 47705 }, { "epoch": 5.313509299476556, "grad_norm": 0.10243935883045197, "learning_rate": 2.662725902345273e-05, "loss": 0.4659, "num_input_tokens_seen": 57875952, "step": 47710 }, { "epoch": 5.314066154360174, "grad_norm": 0.0889255478978157, "learning_rate": 2.662240982228489e-05, "loss": 0.4694, "num_input_tokens_seen": 57882064, "step": 47715 }, { "epoch": 5.314623009243791, "grad_norm": 0.12809208035469055, "learning_rate": 2.6617560559817212e-05, "loss": 0.4676, "num_input_tokens_seen": 57888368, "step": 47720 }, { "epoch": 5.315179864127408, "grad_norm": 0.10174660384654999, "learning_rate": 2.6612711236232912e-05, "loss": 0.4619, "num_input_tokens_seen": 57894192, "step": 47725 }, { "epoch": 5.315736719011026, "grad_norm": 0.1091160848736763, "learning_rate": 2.660786185171521e-05, "loss": 0.4586, "num_input_tokens_seen": 57900240, "step": 47730 }, { "epoch": 5.316293573894643, "grad_norm": 0.20989836752414703, "learning_rate": 2.6603012406447335e-05, "loss": 0.4756, "num_input_tokens_seen": 57906512, "step": 47735 }, { "epoch": 5.3168504287782605, "grad_norm": 0.09717607498168945, "learning_rate": 2.6598162900612516e-05, "loss": 0.465, "num_input_tokens_seen": 57912464, "step": 47740 }, { "epoch": 5.317407283661877, "grad_norm": 0.12633293867111206, "learning_rate": 2.659331333439398e-05, "loss": 0.4585, "num_input_tokens_seen": 57918800, "step": 47745 }, { "epoch": 5.317964138545495, "grad_norm": 0.13183824717998505, "learning_rate": 2.6588463707974966e-05, "loss": 0.4696, "num_input_tokens_seen": 57925008, "step": 47750 }, { "epoch": 5.318520993429113, "grad_norm": 0.10862663388252258, "learning_rate": 2.6583614021538688e-05, "loss": 0.4682, "num_input_tokens_seen": 57930992, "step": 47755 }, { "epoch": 5.3190778483127295, "grad_norm": 0.12434336543083191, "learning_rate": 2.657876427526841e-05, "loss": 0.478, "num_input_tokens_seen": 57937456, "step": 47760 }, { "epoch": 5.319634703196347, "grad_norm": 0.09140513837337494, "learning_rate": 2.6573914469347354e-05, "loss": 0.4553, "num_input_tokens_seen": 57943152, "step": 47765 }, { "epoch": 5.320191558079964, "grad_norm": 0.13023528456687927, "learning_rate": 2.6569064603958765e-05, "loss": 0.4485, "num_input_tokens_seen": 57949264, "step": 47770 }, { "epoch": 5.320748412963582, "grad_norm": 0.12319613993167877, "learning_rate": 2.6564214679285886e-05, "loss": 0.4706, "num_input_tokens_seen": 57955440, "step": 47775 }, { "epoch": 5.321305267847199, "grad_norm": 0.10413312911987305, "learning_rate": 2.6559364695511967e-05, "loss": 0.4584, "num_input_tokens_seen": 57961168, "step": 47780 }, { "epoch": 5.321862122730816, "grad_norm": 0.09604381024837494, "learning_rate": 2.6554514652820256e-05, "loss": 0.4777, "num_input_tokens_seen": 57967632, "step": 47785 }, { "epoch": 5.322418977614434, "grad_norm": 0.11283823102712631, "learning_rate": 2.654966455139399e-05, "loss": 0.4756, "num_input_tokens_seen": 57973744, "step": 47790 }, { "epoch": 5.322975832498051, "grad_norm": 0.07961660623550415, "learning_rate": 2.6544814391416434e-05, "loss": 0.4617, "num_input_tokens_seen": 57979984, "step": 47795 }, { "epoch": 5.323532687381668, "grad_norm": 0.11754488199949265, "learning_rate": 2.6539964173070842e-05, "loss": 0.473, "num_input_tokens_seen": 57986416, "step": 47800 }, { "epoch": 5.324089542265286, "grad_norm": 0.10008646547794342, "learning_rate": 2.653511389654047e-05, "loss": 0.4651, "num_input_tokens_seen": 57992688, "step": 47805 }, { "epoch": 5.324646397148903, "grad_norm": 0.08860748261213303, "learning_rate": 2.6530263562008574e-05, "loss": 0.4674, "num_input_tokens_seen": 57998960, "step": 47810 }, { "epoch": 5.32520325203252, "grad_norm": 0.12213143706321716, "learning_rate": 2.652541316965842e-05, "loss": 0.4652, "num_input_tokens_seen": 58005104, "step": 47815 }, { "epoch": 5.325760106916138, "grad_norm": 0.06028275191783905, "learning_rate": 2.6520562719673257e-05, "loss": 0.4554, "num_input_tokens_seen": 58010256, "step": 47820 }, { "epoch": 5.326316961799755, "grad_norm": 0.11552911251783371, "learning_rate": 2.651571221223637e-05, "loss": 0.4563, "num_input_tokens_seen": 58016272, "step": 47825 }, { "epoch": 5.3268738166833725, "grad_norm": 0.11311061680316925, "learning_rate": 2.6510861647531017e-05, "loss": 0.4555, "num_input_tokens_seen": 58022448, "step": 47830 }, { "epoch": 5.327430671566989, "grad_norm": 0.0911281555891037, "learning_rate": 2.6506011025740467e-05, "loss": 0.4741, "num_input_tokens_seen": 58028816, "step": 47835 }, { "epoch": 5.327987526450607, "grad_norm": 0.09783505648374557, "learning_rate": 2.6501160347048e-05, "loss": 0.4637, "num_input_tokens_seen": 58034800, "step": 47840 }, { "epoch": 5.328544381334225, "grad_norm": 0.1262115091085434, "learning_rate": 2.6496309611636884e-05, "loss": 0.4631, "num_input_tokens_seen": 58041008, "step": 47845 }, { "epoch": 5.329101236217841, "grad_norm": 0.0954502671957016, "learning_rate": 2.649145881969039e-05, "loss": 0.4675, "num_input_tokens_seen": 58046768, "step": 47850 }, { "epoch": 5.329658091101459, "grad_norm": 0.13001684844493866, "learning_rate": 2.64866079713918e-05, "loss": 0.4564, "num_input_tokens_seen": 58052848, "step": 47855 }, { "epoch": 5.330214945985077, "grad_norm": 0.1116398423910141, "learning_rate": 2.6481757066924402e-05, "loss": 0.4605, "num_input_tokens_seen": 58059216, "step": 47860 }, { "epoch": 5.3307718008686935, "grad_norm": 0.1261587142944336, "learning_rate": 2.6476906106471476e-05, "loss": 0.4671, "num_input_tokens_seen": 58065328, "step": 47865 }, { "epoch": 5.331328655752311, "grad_norm": 0.14291197061538696, "learning_rate": 2.64720550902163e-05, "loss": 0.465, "num_input_tokens_seen": 58071408, "step": 47870 }, { "epoch": 5.331885510635928, "grad_norm": 0.10066685080528259, "learning_rate": 2.6467204018342167e-05, "loss": 0.465, "num_input_tokens_seen": 58077520, "step": 47875 }, { "epoch": 5.332442365519546, "grad_norm": 0.09229277819395065, "learning_rate": 2.6462352891032366e-05, "loss": 0.4541, "num_input_tokens_seen": 58083760, "step": 47880 }, { "epoch": 5.332999220403163, "grad_norm": 0.09312158823013306, "learning_rate": 2.645750170847018e-05, "loss": 0.4639, "num_input_tokens_seen": 58089872, "step": 47885 }, { "epoch": 5.33355607528678, "grad_norm": 0.15452736616134644, "learning_rate": 2.6452650470838912e-05, "loss": 0.4571, "num_input_tokens_seen": 58095984, "step": 47890 }, { "epoch": 5.334112930170398, "grad_norm": 0.1648508757352829, "learning_rate": 2.6447799178321857e-05, "loss": 0.4664, "num_input_tokens_seen": 58101936, "step": 47895 }, { "epoch": 5.334669785054015, "grad_norm": 0.11039680987596512, "learning_rate": 2.644294783110231e-05, "loss": 0.466, "num_input_tokens_seen": 58108336, "step": 47900 }, { "epoch": 5.335226639937632, "grad_norm": 0.1170324757695198, "learning_rate": 2.643809642936357e-05, "loss": 0.457, "num_input_tokens_seen": 58114000, "step": 47905 }, { "epoch": 5.33578349482125, "grad_norm": 0.10381094366312027, "learning_rate": 2.6433244973288934e-05, "loss": 0.4549, "num_input_tokens_seen": 58120016, "step": 47910 }, { "epoch": 5.336340349704867, "grad_norm": 0.13032470643520355, "learning_rate": 2.642839346306171e-05, "loss": 0.4639, "num_input_tokens_seen": 58125488, "step": 47915 }, { "epoch": 5.336897204588484, "grad_norm": 0.09178750216960907, "learning_rate": 2.64235418988652e-05, "loss": 0.4608, "num_input_tokens_seen": 58131664, "step": 47920 }, { "epoch": 5.337454059472101, "grad_norm": 0.09377457946538925, "learning_rate": 2.641869028088272e-05, "loss": 0.4721, "num_input_tokens_seen": 58137904, "step": 47925 }, { "epoch": 5.338010914355719, "grad_norm": 0.13874301314353943, "learning_rate": 2.641383860929758e-05, "loss": 0.4638, "num_input_tokens_seen": 58144272, "step": 47930 }, { "epoch": 5.3385677692393365, "grad_norm": 0.10079489648342133, "learning_rate": 2.6408986884293085e-05, "loss": 0.4498, "num_input_tokens_seen": 58150416, "step": 47935 }, { "epoch": 5.339124624122953, "grad_norm": 0.10698487609624863, "learning_rate": 2.6404135106052546e-05, "loss": 0.4648, "num_input_tokens_seen": 58156400, "step": 47940 }, { "epoch": 5.339681479006571, "grad_norm": 0.09724591672420502, "learning_rate": 2.639928327475928e-05, "loss": 0.4661, "num_input_tokens_seen": 58162512, "step": 47945 }, { "epoch": 5.340238333890188, "grad_norm": 0.12003683298826218, "learning_rate": 2.6394431390596607e-05, "loss": 0.4645, "num_input_tokens_seen": 58168560, "step": 47950 }, { "epoch": 5.340795188773805, "grad_norm": 0.08849215507507324, "learning_rate": 2.6389579453747853e-05, "loss": 0.4438, "num_input_tokens_seen": 58174512, "step": 47955 }, { "epoch": 5.341352043657423, "grad_norm": 0.08260868489742279, "learning_rate": 2.6384727464396336e-05, "loss": 0.4628, "num_input_tokens_seen": 58180464, "step": 47960 }, { "epoch": 5.34190889854104, "grad_norm": 0.10358946025371552, "learning_rate": 2.6379875422725368e-05, "loss": 0.4442, "num_input_tokens_seen": 58186416, "step": 47965 }, { "epoch": 5.342465753424658, "grad_norm": 0.11707556247711182, "learning_rate": 2.6375023328918296e-05, "loss": 0.4628, "num_input_tokens_seen": 58192336, "step": 47970 }, { "epoch": 5.343022608308275, "grad_norm": 0.11883547902107239, "learning_rate": 2.6370171183158428e-05, "loss": 0.4653, "num_input_tokens_seen": 58198800, "step": 47975 }, { "epoch": 5.343579463191892, "grad_norm": 0.0977850928902626, "learning_rate": 2.6365318985629102e-05, "loss": 0.4675, "num_input_tokens_seen": 58204944, "step": 47980 }, { "epoch": 5.34413631807551, "grad_norm": 0.07109009474515915, "learning_rate": 2.6360466736513657e-05, "loss": 0.4673, "num_input_tokens_seen": 58211088, "step": 47985 }, { "epoch": 5.3446931729591265, "grad_norm": 0.08056649565696716, "learning_rate": 2.6355614435995414e-05, "loss": 0.4597, "num_input_tokens_seen": 58217392, "step": 47990 }, { "epoch": 5.345250027842744, "grad_norm": 0.0953420028090477, "learning_rate": 2.6350762084257713e-05, "loss": 0.4581, "num_input_tokens_seen": 58223920, "step": 47995 }, { "epoch": 5.345806882726362, "grad_norm": 0.07650967687368393, "learning_rate": 2.6345909681483887e-05, "loss": 0.4702, "num_input_tokens_seen": 58230000, "step": 48000 }, { "epoch": 5.346363737609979, "grad_norm": 0.09062561392784119, "learning_rate": 2.6341057227857284e-05, "loss": 0.4617, "num_input_tokens_seen": 58236208, "step": 48005 }, { "epoch": 5.346920592493596, "grad_norm": 0.0880778357386589, "learning_rate": 2.633620472356124e-05, "loss": 0.4603, "num_input_tokens_seen": 58242512, "step": 48010 }, { "epoch": 5.347477447377213, "grad_norm": 0.11747598648071289, "learning_rate": 2.6331352168779106e-05, "loss": 0.4531, "num_input_tokens_seen": 58248464, "step": 48015 }, { "epoch": 5.348034302260831, "grad_norm": 0.09286660701036453, "learning_rate": 2.632649956369422e-05, "loss": 0.4638, "num_input_tokens_seen": 58254736, "step": 48020 }, { "epoch": 5.348591157144448, "grad_norm": 0.07707294821739197, "learning_rate": 2.6321646908489922e-05, "loss": 0.458, "num_input_tokens_seen": 58260624, "step": 48025 }, { "epoch": 5.349148012028065, "grad_norm": 0.09540124237537384, "learning_rate": 2.631679420334957e-05, "loss": 0.4634, "num_input_tokens_seen": 58266960, "step": 48030 }, { "epoch": 5.349704866911683, "grad_norm": 0.11300642788410187, "learning_rate": 2.631194144845652e-05, "loss": 0.4461, "num_input_tokens_seen": 58273104, "step": 48035 }, { "epoch": 5.350261721795301, "grad_norm": 0.11815404891967773, "learning_rate": 2.6307088643994103e-05, "loss": 0.4514, "num_input_tokens_seen": 58279344, "step": 48040 }, { "epoch": 5.350818576678917, "grad_norm": 0.10887208580970764, "learning_rate": 2.6302235790145702e-05, "loss": 0.4752, "num_input_tokens_seen": 58285424, "step": 48045 }, { "epoch": 5.351375431562535, "grad_norm": 0.1127738207578659, "learning_rate": 2.6297382887094655e-05, "loss": 0.457, "num_input_tokens_seen": 58291664, "step": 48050 }, { "epoch": 5.351932286446152, "grad_norm": 0.09794174134731293, "learning_rate": 2.6292529935024328e-05, "loss": 0.4695, "num_input_tokens_seen": 58298096, "step": 48055 }, { "epoch": 5.3524891413297695, "grad_norm": 0.08612467348575592, "learning_rate": 2.628767693411808e-05, "loss": 0.4601, "num_input_tokens_seen": 58304400, "step": 48060 }, { "epoch": 5.353045996213387, "grad_norm": 0.09321386367082596, "learning_rate": 2.6282823884559266e-05, "loss": 0.4444, "num_input_tokens_seen": 58310160, "step": 48065 }, { "epoch": 5.353602851097004, "grad_norm": 0.14737799763679504, "learning_rate": 2.6277970786531247e-05, "loss": 0.4531, "num_input_tokens_seen": 58316368, "step": 48070 }, { "epoch": 5.354159705980622, "grad_norm": 0.13666322827339172, "learning_rate": 2.6273117640217403e-05, "loss": 0.4683, "num_input_tokens_seen": 58322352, "step": 48075 }, { "epoch": 5.354716560864238, "grad_norm": 0.09706846624612808, "learning_rate": 2.62682644458011e-05, "loss": 0.4562, "num_input_tokens_seen": 58328560, "step": 48080 }, { "epoch": 5.355273415747856, "grad_norm": 0.12613531947135925, "learning_rate": 2.62634112034657e-05, "loss": 0.4522, "num_input_tokens_seen": 58334928, "step": 48085 }, { "epoch": 5.355830270631474, "grad_norm": 0.11506988853216171, "learning_rate": 2.6258557913394572e-05, "loss": 0.4472, "num_input_tokens_seen": 58341136, "step": 48090 }, { "epoch": 5.3563871255150906, "grad_norm": 0.11212106049060822, "learning_rate": 2.62537045757711e-05, "loss": 0.4538, "num_input_tokens_seen": 58347472, "step": 48095 }, { "epoch": 5.356943980398708, "grad_norm": 0.09575561434030533, "learning_rate": 2.6248851190778645e-05, "loss": 0.4644, "num_input_tokens_seen": 58353552, "step": 48100 }, { "epoch": 5.357500835282325, "grad_norm": 0.11830894649028778, "learning_rate": 2.6243997758600587e-05, "loss": 0.4552, "num_input_tokens_seen": 58359120, "step": 48105 }, { "epoch": 5.358057690165943, "grad_norm": 0.13727746903896332, "learning_rate": 2.6239144279420307e-05, "loss": 0.4579, "num_input_tokens_seen": 58365200, "step": 48110 }, { "epoch": 5.35861454504956, "grad_norm": 0.08536078035831451, "learning_rate": 2.6234290753421193e-05, "loss": 0.4511, "num_input_tokens_seen": 58370928, "step": 48115 }, { "epoch": 5.359171399933177, "grad_norm": 0.10960592329502106, "learning_rate": 2.6229437180786614e-05, "loss": 0.4622, "num_input_tokens_seen": 58376528, "step": 48120 }, { "epoch": 5.359728254816795, "grad_norm": 0.11193260550498962, "learning_rate": 2.622458356169996e-05, "loss": 0.4499, "num_input_tokens_seen": 58381808, "step": 48125 }, { "epoch": 5.360285109700412, "grad_norm": 0.14177317917346954, "learning_rate": 2.6219729896344615e-05, "loss": 0.475, "num_input_tokens_seen": 58387792, "step": 48130 }, { "epoch": 5.360841964584029, "grad_norm": 0.11123041063547134, "learning_rate": 2.6214876184903965e-05, "loss": 0.4655, "num_input_tokens_seen": 58393808, "step": 48135 }, { "epoch": 5.361398819467647, "grad_norm": 0.11361536383628845, "learning_rate": 2.62100224275614e-05, "loss": 0.451, "num_input_tokens_seen": 58399888, "step": 48140 }, { "epoch": 5.361955674351264, "grad_norm": 0.15565648674964905, "learning_rate": 2.620516862450031e-05, "loss": 0.4581, "num_input_tokens_seen": 58406096, "step": 48145 }, { "epoch": 5.362512529234881, "grad_norm": 0.11760225892066956, "learning_rate": 2.620031477590409e-05, "loss": 0.4611, "num_input_tokens_seen": 58412304, "step": 48150 }, { "epoch": 5.363069384118499, "grad_norm": 0.10327129811048508, "learning_rate": 2.6195460881956123e-05, "loss": 0.4651, "num_input_tokens_seen": 58418352, "step": 48155 }, { "epoch": 5.363626239002116, "grad_norm": 0.12394197285175323, "learning_rate": 2.6190606942839813e-05, "loss": 0.4699, "num_input_tokens_seen": 58424464, "step": 48160 }, { "epoch": 5.3641830938857336, "grad_norm": 0.13131685554981232, "learning_rate": 2.6185752958738563e-05, "loss": 0.4703, "num_input_tokens_seen": 58430512, "step": 48165 }, { "epoch": 5.36473994876935, "grad_norm": 0.13005276024341583, "learning_rate": 2.6180898929835768e-05, "loss": 0.4628, "num_input_tokens_seen": 58436688, "step": 48170 }, { "epoch": 5.365296803652968, "grad_norm": 0.11702834069728851, "learning_rate": 2.6176044856314824e-05, "loss": 0.4643, "num_input_tokens_seen": 58442928, "step": 48175 }, { "epoch": 5.365853658536586, "grad_norm": 0.12056075781583786, "learning_rate": 2.6171190738359136e-05, "loss": 0.4753, "num_input_tokens_seen": 58449296, "step": 48180 }, { "epoch": 5.3664105134202025, "grad_norm": 0.10753385722637177, "learning_rate": 2.616633657615211e-05, "loss": 0.4592, "num_input_tokens_seen": 58455152, "step": 48185 }, { "epoch": 5.36696736830382, "grad_norm": 0.09719440340995789, "learning_rate": 2.616148236987715e-05, "loss": 0.454, "num_input_tokens_seen": 58461584, "step": 48190 }, { "epoch": 5.367524223187437, "grad_norm": 0.12201730161905289, "learning_rate": 2.6156628119717658e-05, "loss": 0.4711, "num_input_tokens_seen": 58467504, "step": 48195 }, { "epoch": 5.368081078071055, "grad_norm": 0.13448959589004517, "learning_rate": 2.6151773825857058e-05, "loss": 0.4651, "num_input_tokens_seen": 58473808, "step": 48200 }, { "epoch": 5.368637932954672, "grad_norm": 0.09952262789011002, "learning_rate": 2.6146919488478748e-05, "loss": 0.4628, "num_input_tokens_seen": 58479888, "step": 48205 }, { "epoch": 5.369194787838289, "grad_norm": 0.1722584217786789, "learning_rate": 2.6142065107766144e-05, "loss": 0.4489, "num_input_tokens_seen": 58486096, "step": 48210 }, { "epoch": 5.369751642721907, "grad_norm": 0.10400800406932831, "learning_rate": 2.6137210683902664e-05, "loss": 0.4598, "num_input_tokens_seen": 58491824, "step": 48215 }, { "epoch": 5.370308497605524, "grad_norm": 0.11398759484291077, "learning_rate": 2.6132356217071712e-05, "loss": 0.4651, "num_input_tokens_seen": 58497680, "step": 48220 }, { "epoch": 5.370865352489141, "grad_norm": 0.13744959235191345, "learning_rate": 2.6127501707456713e-05, "loss": 0.4614, "num_input_tokens_seen": 58503600, "step": 48225 }, { "epoch": 5.371422207372759, "grad_norm": 0.10947319120168686, "learning_rate": 2.6122647155241097e-05, "loss": 0.4603, "num_input_tokens_seen": 58509744, "step": 48230 }, { "epoch": 5.371979062256376, "grad_norm": 0.09165515750646591, "learning_rate": 2.611779256060827e-05, "loss": 0.4677, "num_input_tokens_seen": 58515440, "step": 48235 }, { "epoch": 5.372535917139993, "grad_norm": 0.11346405744552612, "learning_rate": 2.6112937923741654e-05, "loss": 0.4582, "num_input_tokens_seen": 58521360, "step": 48240 }, { "epoch": 5.373092772023611, "grad_norm": 0.07996658980846405, "learning_rate": 2.6108083244824677e-05, "loss": 0.4601, "num_input_tokens_seen": 58527504, "step": 48245 }, { "epoch": 5.373649626907228, "grad_norm": 0.0751214474439621, "learning_rate": 2.610322852404077e-05, "loss": 0.4604, "num_input_tokens_seen": 58533712, "step": 48250 }, { "epoch": 5.3742064817908455, "grad_norm": 0.12497652322053909, "learning_rate": 2.609837376157334e-05, "loss": 0.4561, "num_input_tokens_seen": 58539408, "step": 48255 }, { "epoch": 5.374763336674462, "grad_norm": 0.11441174149513245, "learning_rate": 2.6093518957605844e-05, "loss": 0.4521, "num_input_tokens_seen": 58545712, "step": 48260 }, { "epoch": 5.37532019155808, "grad_norm": 0.09451958537101746, "learning_rate": 2.6088664112321693e-05, "loss": 0.4576, "num_input_tokens_seen": 58551664, "step": 48265 }, { "epoch": 5.375877046441698, "grad_norm": 0.0990380123257637, "learning_rate": 2.608380922590432e-05, "loss": 0.4566, "num_input_tokens_seen": 58557616, "step": 48270 }, { "epoch": 5.376433901325314, "grad_norm": 0.0875951498746872, "learning_rate": 2.607895429853717e-05, "loss": 0.4526, "num_input_tokens_seen": 58563824, "step": 48275 }, { "epoch": 5.376990756208932, "grad_norm": 0.11770884692668915, "learning_rate": 2.6074099330403662e-05, "loss": 0.4611, "num_input_tokens_seen": 58570128, "step": 48280 }, { "epoch": 5.377547611092549, "grad_norm": 0.12509803473949432, "learning_rate": 2.6069244321687235e-05, "loss": 0.4474, "num_input_tokens_seen": 58576464, "step": 48285 }, { "epoch": 5.3781044659761665, "grad_norm": 0.15851907432079315, "learning_rate": 2.6064389272571334e-05, "loss": 0.4656, "num_input_tokens_seen": 58582640, "step": 48290 }, { "epoch": 5.378661320859784, "grad_norm": 0.1350555717945099, "learning_rate": 2.60595341832394e-05, "loss": 0.4694, "num_input_tokens_seen": 58588880, "step": 48295 }, { "epoch": 5.379218175743401, "grad_norm": 0.15224240720272064, "learning_rate": 2.6054679053874866e-05, "loss": 0.4547, "num_input_tokens_seen": 58595184, "step": 48300 }, { "epoch": 5.379775030627019, "grad_norm": 0.08704844117164612, "learning_rate": 2.604982388466118e-05, "loss": 0.4524, "num_input_tokens_seen": 58601264, "step": 48305 }, { "epoch": 5.3803318855106355, "grad_norm": 0.12671343982219696, "learning_rate": 2.604496867578178e-05, "loss": 0.4563, "num_input_tokens_seen": 58607408, "step": 48310 }, { "epoch": 5.380888740394253, "grad_norm": 0.09824255108833313, "learning_rate": 2.6040113427420117e-05, "loss": 0.4643, "num_input_tokens_seen": 58613584, "step": 48315 }, { "epoch": 5.381445595277871, "grad_norm": 0.1123233214020729, "learning_rate": 2.6035258139759626e-05, "loss": 0.4493, "num_input_tokens_seen": 58619728, "step": 48320 }, { "epoch": 5.382002450161488, "grad_norm": 0.06608468294143677, "learning_rate": 2.6030402812983774e-05, "loss": 0.4579, "num_input_tokens_seen": 58625456, "step": 48325 }, { "epoch": 5.382559305045105, "grad_norm": 0.10885436832904816, "learning_rate": 2.6025547447276e-05, "loss": 0.4559, "num_input_tokens_seen": 58631568, "step": 48330 }, { "epoch": 5.383116159928723, "grad_norm": 0.10100895166397095, "learning_rate": 2.602069204281976e-05, "loss": 0.4551, "num_input_tokens_seen": 58637744, "step": 48335 }, { "epoch": 5.38367301481234, "grad_norm": 0.1321534365415573, "learning_rate": 2.601583659979851e-05, "loss": 0.4604, "num_input_tokens_seen": 58643984, "step": 48340 }, { "epoch": 5.384229869695957, "grad_norm": 0.10551118105649948, "learning_rate": 2.6010981118395684e-05, "loss": 0.4535, "num_input_tokens_seen": 58650000, "step": 48345 }, { "epoch": 5.384786724579574, "grad_norm": 0.13650183379650116, "learning_rate": 2.6006125598794756e-05, "loss": 0.4519, "num_input_tokens_seen": 58656080, "step": 48350 }, { "epoch": 5.385343579463192, "grad_norm": 0.11411099880933762, "learning_rate": 2.6001270041179177e-05, "loss": 0.4645, "num_input_tokens_seen": 58661840, "step": 48355 }, { "epoch": 5.3859004343468095, "grad_norm": 0.11294616013765335, "learning_rate": 2.599641444573241e-05, "loss": 0.4546, "num_input_tokens_seen": 58668176, "step": 48360 }, { "epoch": 5.386457289230426, "grad_norm": 0.13478359580039978, "learning_rate": 2.5991558812637913e-05, "loss": 0.4563, "num_input_tokens_seen": 58674384, "step": 48365 }, { "epoch": 5.387014144114044, "grad_norm": 0.12073108553886414, "learning_rate": 2.5986703142079143e-05, "loss": 0.444, "num_input_tokens_seen": 58680688, "step": 48370 }, { "epoch": 5.387570998997662, "grad_norm": 0.1300765872001648, "learning_rate": 2.598184743423957e-05, "loss": 0.4604, "num_input_tokens_seen": 58686736, "step": 48375 }, { "epoch": 5.3881278538812785, "grad_norm": 0.09069384634494781, "learning_rate": 2.597699168930265e-05, "loss": 0.4639, "num_input_tokens_seen": 58692912, "step": 48380 }, { "epoch": 5.388684708764896, "grad_norm": 0.16038928925991058, "learning_rate": 2.5972135907451855e-05, "loss": 0.4739, "num_input_tokens_seen": 58698640, "step": 48385 }, { "epoch": 5.389241563648513, "grad_norm": 0.1014612466096878, "learning_rate": 2.596728008887065e-05, "loss": 0.4671, "num_input_tokens_seen": 58704464, "step": 48390 }, { "epoch": 5.389798418532131, "grad_norm": 0.10196547955274582, "learning_rate": 2.5962424233742506e-05, "loss": 0.4664, "num_input_tokens_seen": 58710544, "step": 48395 }, { "epoch": 5.390355273415748, "grad_norm": 0.11493920534849167, "learning_rate": 2.595756834225089e-05, "loss": 0.4555, "num_input_tokens_seen": 58716368, "step": 48400 }, { "epoch": 5.390912128299365, "grad_norm": 0.099045529961586, "learning_rate": 2.5952712414579272e-05, "loss": 0.4537, "num_input_tokens_seen": 58722640, "step": 48405 }, { "epoch": 5.391468983182983, "grad_norm": 0.09286566078662872, "learning_rate": 2.5947856450911117e-05, "loss": 0.4835, "num_input_tokens_seen": 58728496, "step": 48410 }, { "epoch": 5.3920258380665995, "grad_norm": 0.12374581396579742, "learning_rate": 2.594300045142992e-05, "loss": 0.4682, "num_input_tokens_seen": 58734832, "step": 48415 }, { "epoch": 5.392582692950217, "grad_norm": 0.08819160610437393, "learning_rate": 2.593814441631914e-05, "loss": 0.465, "num_input_tokens_seen": 58741136, "step": 48420 }, { "epoch": 5.393139547833835, "grad_norm": 0.13272719085216522, "learning_rate": 2.593328834576226e-05, "loss": 0.4623, "num_input_tokens_seen": 58747568, "step": 48425 }, { "epoch": 5.393696402717452, "grad_norm": 0.11906108260154724, "learning_rate": 2.592843223994275e-05, "loss": 0.4721, "num_input_tokens_seen": 58754032, "step": 48430 }, { "epoch": 5.394253257601069, "grad_norm": 0.140791118144989, "learning_rate": 2.59235760990441e-05, "loss": 0.4546, "num_input_tokens_seen": 58760208, "step": 48435 }, { "epoch": 5.394810112484686, "grad_norm": 0.10917487740516663, "learning_rate": 2.5918719923249783e-05, "loss": 0.4565, "num_input_tokens_seen": 58766448, "step": 48440 }, { "epoch": 5.395366967368304, "grad_norm": 0.12618958950042725, "learning_rate": 2.591386371274328e-05, "loss": 0.4735, "num_input_tokens_seen": 58772880, "step": 48445 }, { "epoch": 5.3959238222519215, "grad_norm": 0.09505420178174973, "learning_rate": 2.590900746770808e-05, "loss": 0.4421, "num_input_tokens_seen": 58779184, "step": 48450 }, { "epoch": 5.396480677135538, "grad_norm": 0.13313385844230652, "learning_rate": 2.5904151188327663e-05, "loss": 0.4612, "num_input_tokens_seen": 58785200, "step": 48455 }, { "epoch": 5.397037532019156, "grad_norm": 0.09254729002714157, "learning_rate": 2.5899294874785522e-05, "loss": 0.4601, "num_input_tokens_seen": 58791376, "step": 48460 }, { "epoch": 5.397594386902773, "grad_norm": 0.09761132299900055, "learning_rate": 2.5894438527265136e-05, "loss": 0.4589, "num_input_tokens_seen": 58797232, "step": 48465 }, { "epoch": 5.39815124178639, "grad_norm": 0.12905167043209076, "learning_rate": 2.588958214595e-05, "loss": 0.4671, "num_input_tokens_seen": 58803632, "step": 48470 }, { "epoch": 5.398708096670008, "grad_norm": 0.1159985214471817, "learning_rate": 2.5884725731023584e-05, "loss": 0.4734, "num_input_tokens_seen": 58809840, "step": 48475 }, { "epoch": 5.399264951553625, "grad_norm": 0.1275498867034912, "learning_rate": 2.587986928266941e-05, "loss": 0.4586, "num_input_tokens_seen": 58815472, "step": 48480 }, { "epoch": 5.3998218064372425, "grad_norm": 0.11196215450763702, "learning_rate": 2.5875012801070952e-05, "loss": 0.4611, "num_input_tokens_seen": 58821520, "step": 48485 }, { "epoch": 5.40037866132086, "grad_norm": 0.09554571658372879, "learning_rate": 2.5870156286411707e-05, "loss": 0.4666, "num_input_tokens_seen": 58827760, "step": 48490 }, { "epoch": 5.400935516204477, "grad_norm": 0.10476996004581451, "learning_rate": 2.5865299738875166e-05, "loss": 0.4536, "num_input_tokens_seen": 58834000, "step": 48495 }, { "epoch": 5.401492371088095, "grad_norm": 0.0891026109457016, "learning_rate": 2.5860443158644827e-05, "loss": 0.4561, "num_input_tokens_seen": 58840080, "step": 48500 }, { "epoch": 5.402049225971711, "grad_norm": 0.14158625900745392, "learning_rate": 2.5855586545904188e-05, "loss": 0.4586, "num_input_tokens_seen": 58846128, "step": 48505 }, { "epoch": 5.402606080855329, "grad_norm": 0.09090061485767365, "learning_rate": 2.5850729900836747e-05, "loss": 0.4779, "num_input_tokens_seen": 58852240, "step": 48510 }, { "epoch": 5.403162935738947, "grad_norm": 0.12225885689258575, "learning_rate": 2.584587322362601e-05, "loss": 0.4724, "num_input_tokens_seen": 58858128, "step": 48515 }, { "epoch": 5.403719790622564, "grad_norm": 0.09568873792886734, "learning_rate": 2.5841016514455468e-05, "loss": 0.4706, "num_input_tokens_seen": 58864176, "step": 48520 }, { "epoch": 5.404276645506181, "grad_norm": 0.0884656235575676, "learning_rate": 2.5836159773508627e-05, "loss": 0.4764, "num_input_tokens_seen": 58870288, "step": 48525 }, { "epoch": 5.404833500389798, "grad_norm": 0.16124171018600464, "learning_rate": 2.583130300096899e-05, "loss": 0.4671, "num_input_tokens_seen": 58876240, "step": 48530 }, { "epoch": 5.405390355273416, "grad_norm": 0.08777893334627151, "learning_rate": 2.582644619702006e-05, "loss": 0.4656, "num_input_tokens_seen": 58882288, "step": 48535 }, { "epoch": 5.405947210157033, "grad_norm": 0.10849497467279434, "learning_rate": 2.5821589361845343e-05, "loss": 0.4581, "num_input_tokens_seen": 58887984, "step": 48540 }, { "epoch": 5.40650406504065, "grad_norm": 0.10263551771640778, "learning_rate": 2.581673249562836e-05, "loss": 0.465, "num_input_tokens_seen": 58894032, "step": 48545 }, { "epoch": 5.407060919924268, "grad_norm": 0.09835878014564514, "learning_rate": 2.5811875598552594e-05, "loss": 0.4715, "num_input_tokens_seen": 58900336, "step": 48550 }, { "epoch": 5.4076177748078855, "grad_norm": 0.09263144433498383, "learning_rate": 2.580701867080157e-05, "loss": 0.4683, "num_input_tokens_seen": 58906544, "step": 48555 }, { "epoch": 5.408174629691502, "grad_norm": 0.10599047690629959, "learning_rate": 2.580216171255879e-05, "loss": 0.4606, "num_input_tokens_seen": 58912400, "step": 48560 }, { "epoch": 5.40873148457512, "grad_norm": 0.11215902119874954, "learning_rate": 2.579730472400777e-05, "loss": 0.4603, "num_input_tokens_seen": 58918736, "step": 48565 }, { "epoch": 5.409288339458737, "grad_norm": 0.10210559517145157, "learning_rate": 2.5792447705332022e-05, "loss": 0.4592, "num_input_tokens_seen": 58924912, "step": 48570 }, { "epoch": 5.4098451943423544, "grad_norm": 0.1188591942191124, "learning_rate": 2.5787590656715065e-05, "loss": 0.4677, "num_input_tokens_seen": 58931280, "step": 48575 }, { "epoch": 5.410402049225972, "grad_norm": 0.0837550163269043, "learning_rate": 2.5782733578340408e-05, "loss": 0.4602, "num_input_tokens_seen": 58937424, "step": 48580 }, { "epoch": 5.410958904109589, "grad_norm": 0.11511818319559097, "learning_rate": 2.5777876470391572e-05, "loss": 0.466, "num_input_tokens_seen": 58944080, "step": 48585 }, { "epoch": 5.411515758993207, "grad_norm": 0.09993679076433182, "learning_rate": 2.577301933305206e-05, "loss": 0.4571, "num_input_tokens_seen": 58950192, "step": 48590 }, { "epoch": 5.412072613876823, "grad_norm": 0.08661778271198273, "learning_rate": 2.5768162166505404e-05, "loss": 0.4784, "num_input_tokens_seen": 58956400, "step": 48595 }, { "epoch": 5.412629468760441, "grad_norm": 0.1115284264087677, "learning_rate": 2.576330497093512e-05, "loss": 0.4658, "num_input_tokens_seen": 58962608, "step": 48600 }, { "epoch": 5.413186323644059, "grad_norm": 0.12683838605880737, "learning_rate": 2.5758447746524732e-05, "loss": 0.4653, "num_input_tokens_seen": 58968336, "step": 48605 }, { "epoch": 5.4137431785276755, "grad_norm": 0.12370039522647858, "learning_rate": 2.5753590493457752e-05, "loss": 0.4641, "num_input_tokens_seen": 58974256, "step": 48610 }, { "epoch": 5.414300033411293, "grad_norm": 0.08822880685329437, "learning_rate": 2.5748733211917713e-05, "loss": 0.4516, "num_input_tokens_seen": 58980368, "step": 48615 }, { "epoch": 5.41485688829491, "grad_norm": 0.12472233921289444, "learning_rate": 2.574387590208813e-05, "loss": 0.4602, "num_input_tokens_seen": 58986576, "step": 48620 }, { "epoch": 5.415413743178528, "grad_norm": 0.1365405023097992, "learning_rate": 2.573901856415254e-05, "loss": 0.4791, "num_input_tokens_seen": 58992656, "step": 48625 }, { "epoch": 5.415970598062145, "grad_norm": 0.09512317180633545, "learning_rate": 2.5734161198294443e-05, "loss": 0.4669, "num_input_tokens_seen": 58998640, "step": 48630 }, { "epoch": 5.416527452945762, "grad_norm": 0.1703815907239914, "learning_rate": 2.5729303804697396e-05, "loss": 0.4555, "num_input_tokens_seen": 59004752, "step": 48635 }, { "epoch": 5.41708430782938, "grad_norm": 0.13958345353603363, "learning_rate": 2.5724446383544914e-05, "loss": 0.4672, "num_input_tokens_seen": 59011088, "step": 48640 }, { "epoch": 5.417641162712997, "grad_norm": 0.12354955822229385, "learning_rate": 2.571958893502052e-05, "loss": 0.4651, "num_input_tokens_seen": 59016880, "step": 48645 }, { "epoch": 5.418198017596614, "grad_norm": 0.08854396641254425, "learning_rate": 2.571473145930776e-05, "loss": 0.456, "num_input_tokens_seen": 59022960, "step": 48650 }, { "epoch": 5.418754872480232, "grad_norm": 0.14494335651397705, "learning_rate": 2.5709873956590146e-05, "loss": 0.4726, "num_input_tokens_seen": 59029264, "step": 48655 }, { "epoch": 5.419311727363849, "grad_norm": 0.08849731087684631, "learning_rate": 2.570501642705122e-05, "loss": 0.4528, "num_input_tokens_seen": 59035504, "step": 48660 }, { "epoch": 5.419868582247466, "grad_norm": 0.11713224649429321, "learning_rate": 2.5700158870874512e-05, "loss": 0.4653, "num_input_tokens_seen": 59041872, "step": 48665 }, { "epoch": 5.420425437131084, "grad_norm": 0.14630837738513947, "learning_rate": 2.5695301288243568e-05, "loss": 0.4764, "num_input_tokens_seen": 59048432, "step": 48670 }, { "epoch": 5.420982292014701, "grad_norm": 0.10511007159948349, "learning_rate": 2.569044367934191e-05, "loss": 0.4631, "num_input_tokens_seen": 59054640, "step": 48675 }, { "epoch": 5.4215391468983185, "grad_norm": 0.11346753686666489, "learning_rate": 2.5685586044353073e-05, "loss": 0.4602, "num_input_tokens_seen": 59060656, "step": 48680 }, { "epoch": 5.422096001781935, "grad_norm": 0.08743239939212799, "learning_rate": 2.56807283834606e-05, "loss": 0.4604, "num_input_tokens_seen": 59066608, "step": 48685 }, { "epoch": 5.422652856665553, "grad_norm": 0.0888189896941185, "learning_rate": 2.5675870696848026e-05, "loss": 0.4594, "num_input_tokens_seen": 59072688, "step": 48690 }, { "epoch": 5.423209711549171, "grad_norm": 0.11336866021156311, "learning_rate": 2.567101298469889e-05, "loss": 0.4653, "num_input_tokens_seen": 59078640, "step": 48695 }, { "epoch": 5.423766566432787, "grad_norm": 0.10416190326213837, "learning_rate": 2.5666155247196732e-05, "loss": 0.4499, "num_input_tokens_seen": 59084528, "step": 48700 }, { "epoch": 5.424323421316405, "grad_norm": 0.12645770609378815, "learning_rate": 2.56612974845251e-05, "loss": 0.4677, "num_input_tokens_seen": 59090608, "step": 48705 }, { "epoch": 5.424880276200022, "grad_norm": 0.10351309180259705, "learning_rate": 2.5656439696867525e-05, "loss": 0.4591, "num_input_tokens_seen": 59096688, "step": 48710 }, { "epoch": 5.42543713108364, "grad_norm": 0.09605444222688675, "learning_rate": 2.5651581884407556e-05, "loss": 0.4596, "num_input_tokens_seen": 59102768, "step": 48715 }, { "epoch": 5.425993985967257, "grad_norm": 0.13434822857379913, "learning_rate": 2.564672404732874e-05, "loss": 0.4673, "num_input_tokens_seen": 59108496, "step": 48720 }, { "epoch": 5.426550840850874, "grad_norm": 0.09864597767591476, "learning_rate": 2.5641866185814607e-05, "loss": 0.4643, "num_input_tokens_seen": 59114672, "step": 48725 }, { "epoch": 5.427107695734492, "grad_norm": 0.09514924883842468, "learning_rate": 2.5637008300048713e-05, "loss": 0.4637, "num_input_tokens_seen": 59120592, "step": 48730 }, { "epoch": 5.427664550618109, "grad_norm": 0.08597608655691147, "learning_rate": 2.5632150390214616e-05, "loss": 0.4477, "num_input_tokens_seen": 59126192, "step": 48735 }, { "epoch": 5.428221405501726, "grad_norm": 0.09519162774085999, "learning_rate": 2.5627292456495843e-05, "loss": 0.4573, "num_input_tokens_seen": 59132272, "step": 48740 }, { "epoch": 5.428778260385344, "grad_norm": 0.0904768630862236, "learning_rate": 2.5622434499075953e-05, "loss": 0.4624, "num_input_tokens_seen": 59138352, "step": 48745 }, { "epoch": 5.429335115268961, "grad_norm": 0.07169901579618454, "learning_rate": 2.561757651813849e-05, "loss": 0.4596, "num_input_tokens_seen": 59144560, "step": 48750 }, { "epoch": 5.429891970152578, "grad_norm": 0.10902386158704758, "learning_rate": 2.5612718513867014e-05, "loss": 0.4647, "num_input_tokens_seen": 59150608, "step": 48755 }, { "epoch": 5.430448825036196, "grad_norm": 0.07997607439756393, "learning_rate": 2.5607860486445062e-05, "loss": 0.4658, "num_input_tokens_seen": 59156848, "step": 48760 }, { "epoch": 5.431005679919813, "grad_norm": 0.11010440438985825, "learning_rate": 2.56030024360562e-05, "loss": 0.456, "num_input_tokens_seen": 59162768, "step": 48765 }, { "epoch": 5.43156253480343, "grad_norm": 0.144357368350029, "learning_rate": 2.559814436288397e-05, "loss": 0.4656, "num_input_tokens_seen": 59168336, "step": 48770 }, { "epoch": 5.432119389687047, "grad_norm": 0.09513594955205917, "learning_rate": 2.5593286267111927e-05, "loss": 0.4526, "num_input_tokens_seen": 59174448, "step": 48775 }, { "epoch": 5.432676244570665, "grad_norm": 0.08076237887144089, "learning_rate": 2.5588428148923626e-05, "loss": 0.4653, "num_input_tokens_seen": 59180592, "step": 48780 }, { "epoch": 5.433233099454283, "grad_norm": 0.11038108170032501, "learning_rate": 2.5583570008502623e-05, "loss": 0.4655, "num_input_tokens_seen": 59186800, "step": 48785 }, { "epoch": 5.433789954337899, "grad_norm": 0.07742512971162796, "learning_rate": 2.5578711846032476e-05, "loss": 0.4579, "num_input_tokens_seen": 59192720, "step": 48790 }, { "epoch": 5.434346809221517, "grad_norm": 0.09359046816825867, "learning_rate": 2.5573853661696745e-05, "loss": 0.4712, "num_input_tokens_seen": 59198928, "step": 48795 }, { "epoch": 5.434903664105134, "grad_norm": 0.10848070681095123, "learning_rate": 2.5568995455678984e-05, "loss": 0.4542, "num_input_tokens_seen": 59205072, "step": 48800 }, { "epoch": 5.4354605189887515, "grad_norm": 0.09696482867002487, "learning_rate": 2.556413722816275e-05, "loss": 0.4612, "num_input_tokens_seen": 59211056, "step": 48805 }, { "epoch": 5.436017373872369, "grad_norm": 0.13849098980426788, "learning_rate": 2.5559278979331604e-05, "loss": 0.4599, "num_input_tokens_seen": 59217360, "step": 48810 }, { "epoch": 5.436574228755986, "grad_norm": 0.12647467851638794, "learning_rate": 2.55544207093691e-05, "loss": 0.4583, "num_input_tokens_seen": 59223440, "step": 48815 }, { "epoch": 5.437131083639604, "grad_norm": 0.11274642497301102, "learning_rate": 2.5549562418458806e-05, "loss": 0.4696, "num_input_tokens_seen": 59229392, "step": 48820 }, { "epoch": 5.43768793852322, "grad_norm": 0.10129871219396591, "learning_rate": 2.554470410678429e-05, "loss": 0.4586, "num_input_tokens_seen": 59235664, "step": 48825 }, { "epoch": 5.438244793406838, "grad_norm": 0.10654327273368835, "learning_rate": 2.5539845774529104e-05, "loss": 0.4669, "num_input_tokens_seen": 59241872, "step": 48830 }, { "epoch": 5.438801648290456, "grad_norm": 0.08461713045835495, "learning_rate": 2.553498742187681e-05, "loss": 0.4527, "num_input_tokens_seen": 59247920, "step": 48835 }, { "epoch": 5.4393585031740725, "grad_norm": 0.10792884230613708, "learning_rate": 2.553012904901098e-05, "loss": 0.4477, "num_input_tokens_seen": 59253968, "step": 48840 }, { "epoch": 5.43991535805769, "grad_norm": 0.10402414202690125, "learning_rate": 2.5525270656115176e-05, "loss": 0.4574, "num_input_tokens_seen": 59259856, "step": 48845 }, { "epoch": 5.440472212941308, "grad_norm": 0.1364944577217102, "learning_rate": 2.5520412243372966e-05, "loss": 0.4419, "num_input_tokens_seen": 59265776, "step": 48850 }, { "epoch": 5.441029067824925, "grad_norm": 0.11708949506282806, "learning_rate": 2.5515553810967905e-05, "loss": 0.4614, "num_input_tokens_seen": 59272112, "step": 48855 }, { "epoch": 5.441585922708542, "grad_norm": 0.137893408536911, "learning_rate": 2.551069535908358e-05, "loss": 0.4591, "num_input_tokens_seen": 59278352, "step": 48860 }, { "epoch": 5.442142777592159, "grad_norm": 0.08817031979560852, "learning_rate": 2.5505836887903545e-05, "loss": 0.464, "num_input_tokens_seen": 59284528, "step": 48865 }, { "epoch": 5.442699632475777, "grad_norm": 0.1490042805671692, "learning_rate": 2.550097839761137e-05, "loss": 0.4685, "num_input_tokens_seen": 59290768, "step": 48870 }, { "epoch": 5.4432564873593945, "grad_norm": 0.10411763936281204, "learning_rate": 2.5496119888390625e-05, "loss": 0.4805, "num_input_tokens_seen": 59296880, "step": 48875 }, { "epoch": 5.443813342243011, "grad_norm": 0.12406757473945618, "learning_rate": 2.5491261360424873e-05, "loss": 0.4539, "num_input_tokens_seen": 59302832, "step": 48880 }, { "epoch": 5.444370197126629, "grad_norm": 0.0992060974240303, "learning_rate": 2.5486402813897702e-05, "loss": 0.4644, "num_input_tokens_seen": 59308464, "step": 48885 }, { "epoch": 5.444927052010246, "grad_norm": 0.13365478813648224, "learning_rate": 2.5481544248992674e-05, "loss": 0.4655, "num_input_tokens_seen": 59314416, "step": 48890 }, { "epoch": 5.445483906893863, "grad_norm": 0.11306758970022202, "learning_rate": 2.5476685665893358e-05, "loss": 0.454, "num_input_tokens_seen": 59320080, "step": 48895 }, { "epoch": 5.446040761777481, "grad_norm": 0.1361897736787796, "learning_rate": 2.5471827064783332e-05, "loss": 0.4605, "num_input_tokens_seen": 59326224, "step": 48900 }, { "epoch": 5.446597616661098, "grad_norm": 0.14031247794628143, "learning_rate": 2.5466968445846167e-05, "loss": 0.4526, "num_input_tokens_seen": 59332400, "step": 48905 }, { "epoch": 5.4471544715447155, "grad_norm": 0.14896436035633087, "learning_rate": 2.5462109809265435e-05, "loss": 0.4623, "num_input_tokens_seen": 59337904, "step": 48910 }, { "epoch": 5.447711326428333, "grad_norm": 0.08307375758886337, "learning_rate": 2.5457251155224714e-05, "loss": 0.4612, "num_input_tokens_seen": 59343984, "step": 48915 }, { "epoch": 5.44826818131195, "grad_norm": 0.13268448412418365, "learning_rate": 2.5452392483907577e-05, "loss": 0.4612, "num_input_tokens_seen": 59349776, "step": 48920 }, { "epoch": 5.448825036195568, "grad_norm": 0.0979233831167221, "learning_rate": 2.5447533795497606e-05, "loss": 0.4611, "num_input_tokens_seen": 59355216, "step": 48925 }, { "epoch": 5.4493818910791845, "grad_norm": 0.09218954294919968, "learning_rate": 2.5442675090178375e-05, "loss": 0.4634, "num_input_tokens_seen": 59361360, "step": 48930 }, { "epoch": 5.449938745962802, "grad_norm": 0.0996403843164444, "learning_rate": 2.543781636813346e-05, "loss": 0.4634, "num_input_tokens_seen": 59367760, "step": 48935 }, { "epoch": 5.45049560084642, "grad_norm": 0.11472922563552856, "learning_rate": 2.5432957629546444e-05, "loss": 0.4543, "num_input_tokens_seen": 59374256, "step": 48940 }, { "epoch": 5.451052455730037, "grad_norm": 0.12189260870218277, "learning_rate": 2.5428098874600887e-05, "loss": 0.4842, "num_input_tokens_seen": 59380176, "step": 48945 }, { "epoch": 5.451609310613654, "grad_norm": 0.08959787338972092, "learning_rate": 2.5423240103480396e-05, "loss": 0.4606, "num_input_tokens_seen": 59386288, "step": 48950 }, { "epoch": 5.452166165497271, "grad_norm": 0.08600963652133942, "learning_rate": 2.541838131636854e-05, "loss": 0.4705, "num_input_tokens_seen": 59392176, "step": 48955 }, { "epoch": 5.452723020380889, "grad_norm": 0.11791243404150009, "learning_rate": 2.541352251344889e-05, "loss": 0.4572, "num_input_tokens_seen": 59398480, "step": 48960 }, { "epoch": 5.453279875264506, "grad_norm": 0.1382572501897812, "learning_rate": 2.5408663694905032e-05, "loss": 0.461, "num_input_tokens_seen": 59404304, "step": 48965 }, { "epoch": 5.453836730148123, "grad_norm": 0.09141986817121506, "learning_rate": 2.540380486092055e-05, "loss": 0.4761, "num_input_tokens_seen": 59410288, "step": 48970 }, { "epoch": 5.454393585031741, "grad_norm": 0.09575629234313965, "learning_rate": 2.539894601167903e-05, "loss": 0.4563, "num_input_tokens_seen": 59416464, "step": 48975 }, { "epoch": 5.454950439915358, "grad_norm": 0.14977259933948517, "learning_rate": 2.5394087147364047e-05, "loss": 0.4552, "num_input_tokens_seen": 59422768, "step": 48980 }, { "epoch": 5.455507294798975, "grad_norm": 0.11682277172803879, "learning_rate": 2.5389228268159193e-05, "loss": 0.4681, "num_input_tokens_seen": 59429072, "step": 48985 }, { "epoch": 5.456064149682593, "grad_norm": 0.09049080312252045, "learning_rate": 2.5384369374248052e-05, "loss": 0.4658, "num_input_tokens_seen": 59435056, "step": 48990 }, { "epoch": 5.45662100456621, "grad_norm": 0.09692506492137909, "learning_rate": 2.53795104658142e-05, "loss": 0.4716, "num_input_tokens_seen": 59441200, "step": 48995 }, { "epoch": 5.4571778594498275, "grad_norm": 0.13957694172859192, "learning_rate": 2.5374651543041217e-05, "loss": 0.4688, "num_input_tokens_seen": 59447408, "step": 49000 }, { "epoch": 5.457734714333444, "grad_norm": 0.11110606789588928, "learning_rate": 2.5369792606112708e-05, "loss": 0.4674, "num_input_tokens_seen": 59453424, "step": 49005 }, { "epoch": 5.458291569217062, "grad_norm": 0.10603133589029312, "learning_rate": 2.536493365521225e-05, "loss": 0.4657, "num_input_tokens_seen": 59459632, "step": 49010 }, { "epoch": 5.45884842410068, "grad_norm": 0.10509932786226273, "learning_rate": 2.536007469052342e-05, "loss": 0.4634, "num_input_tokens_seen": 59465584, "step": 49015 }, { "epoch": 5.459405278984296, "grad_norm": 0.09411689639091492, "learning_rate": 2.5355215712229824e-05, "loss": 0.4547, "num_input_tokens_seen": 59471792, "step": 49020 }, { "epoch": 5.459962133867914, "grad_norm": 0.08837369829416275, "learning_rate": 2.535035672051504e-05, "loss": 0.4518, "num_input_tokens_seen": 59477680, "step": 49025 }, { "epoch": 5.460518988751532, "grad_norm": 0.09429186582565308, "learning_rate": 2.534549771556265e-05, "loss": 0.4592, "num_input_tokens_seen": 59483984, "step": 49030 }, { "epoch": 5.4610758436351485, "grad_norm": 0.06605219095945358, "learning_rate": 2.534063869755625e-05, "loss": 0.4598, "num_input_tokens_seen": 59489808, "step": 49035 }, { "epoch": 5.461632698518766, "grad_norm": 0.09146585315465927, "learning_rate": 2.533577966667943e-05, "loss": 0.456, "num_input_tokens_seen": 59495952, "step": 49040 }, { "epoch": 5.462189553402383, "grad_norm": 0.09418643265962601, "learning_rate": 2.533092062311578e-05, "loss": 0.4658, "num_input_tokens_seen": 59501808, "step": 49045 }, { "epoch": 5.462746408286001, "grad_norm": 0.10416745394468307, "learning_rate": 2.5326061567048888e-05, "loss": 0.4605, "num_input_tokens_seen": 59507952, "step": 49050 }, { "epoch": 5.463303263169618, "grad_norm": 0.10761496424674988, "learning_rate": 2.5321202498662343e-05, "loss": 0.4601, "num_input_tokens_seen": 59514096, "step": 49055 }, { "epoch": 5.463860118053235, "grad_norm": 0.12706075608730316, "learning_rate": 2.5316343418139743e-05, "loss": 0.4541, "num_input_tokens_seen": 59520208, "step": 49060 }, { "epoch": 5.464416972936853, "grad_norm": 0.08946516364812851, "learning_rate": 2.5311484325664666e-05, "loss": 0.4554, "num_input_tokens_seen": 59526256, "step": 49065 }, { "epoch": 5.46497382782047, "grad_norm": 0.11482218652963638, "learning_rate": 2.5306625221420715e-05, "loss": 0.4575, "num_input_tokens_seen": 59532304, "step": 49070 }, { "epoch": 5.465530682704087, "grad_norm": 0.09335248172283173, "learning_rate": 2.530176610559149e-05, "loss": 0.4611, "num_input_tokens_seen": 59538576, "step": 49075 }, { "epoch": 5.466087537587705, "grad_norm": 0.0883047878742218, "learning_rate": 2.529690697836057e-05, "loss": 0.4556, "num_input_tokens_seen": 59543920, "step": 49080 }, { "epoch": 5.466644392471322, "grad_norm": 0.12345143407583237, "learning_rate": 2.5292047839911548e-05, "loss": 0.4568, "num_input_tokens_seen": 59550096, "step": 49085 }, { "epoch": 5.467201247354939, "grad_norm": 0.09995273500680923, "learning_rate": 2.5287188690428025e-05, "loss": 0.4661, "num_input_tokens_seen": 59556176, "step": 49090 }, { "epoch": 5.467758102238557, "grad_norm": 0.1143350899219513, "learning_rate": 2.528232953009359e-05, "loss": 0.458, "num_input_tokens_seen": 59562160, "step": 49095 }, { "epoch": 5.468314957122174, "grad_norm": 0.08878709375858307, "learning_rate": 2.5277470359091843e-05, "loss": 0.4786, "num_input_tokens_seen": 59568144, "step": 49100 }, { "epoch": 5.4688718120057915, "grad_norm": 0.16055341064929962, "learning_rate": 2.5272611177606383e-05, "loss": 0.4606, "num_input_tokens_seen": 59574416, "step": 49105 }, { "epoch": 5.469428666889408, "grad_norm": 0.11117806285619736, "learning_rate": 2.526775198582079e-05, "loss": 0.4637, "num_input_tokens_seen": 59580592, "step": 49110 }, { "epoch": 5.469985521773026, "grad_norm": 0.1254817545413971, "learning_rate": 2.526289278391868e-05, "loss": 0.4676, "num_input_tokens_seen": 59586864, "step": 49115 }, { "epoch": 5.470542376656644, "grad_norm": 0.12003415077924728, "learning_rate": 2.5258033572083628e-05, "loss": 0.4454, "num_input_tokens_seen": 59593072, "step": 49120 }, { "epoch": 5.4710992315402605, "grad_norm": 0.11426646262407303, "learning_rate": 2.525317435049924e-05, "loss": 0.4721, "num_input_tokens_seen": 59598928, "step": 49125 }, { "epoch": 5.471656086423878, "grad_norm": 0.07237183302640915, "learning_rate": 2.524831511934912e-05, "loss": 0.4547, "num_input_tokens_seen": 59605296, "step": 49130 }, { "epoch": 5.472212941307495, "grad_norm": 0.10221756994724274, "learning_rate": 2.5243455878816852e-05, "loss": 0.4613, "num_input_tokens_seen": 59611408, "step": 49135 }, { "epoch": 5.472769796191113, "grad_norm": 0.11569199711084366, "learning_rate": 2.523859662908605e-05, "loss": 0.4558, "num_input_tokens_seen": 59617680, "step": 49140 }, { "epoch": 5.47332665107473, "grad_norm": 0.07870005071163177, "learning_rate": 2.52337373703403e-05, "loss": 0.4488, "num_input_tokens_seen": 59623984, "step": 49145 }, { "epoch": 5.473883505958347, "grad_norm": 0.10946512222290039, "learning_rate": 2.5228878102763197e-05, "loss": 0.4584, "num_input_tokens_seen": 59630192, "step": 49150 }, { "epoch": 5.474440360841965, "grad_norm": 0.12159980833530426, "learning_rate": 2.5224018826538355e-05, "loss": 0.4627, "num_input_tokens_seen": 59636240, "step": 49155 }, { "epoch": 5.4749972157255815, "grad_norm": 0.120729461312294, "learning_rate": 2.5219159541849357e-05, "loss": 0.4658, "num_input_tokens_seen": 59642256, "step": 49160 }, { "epoch": 5.475554070609199, "grad_norm": 0.10965780168771744, "learning_rate": 2.521430024887981e-05, "loss": 0.4651, "num_input_tokens_seen": 59647728, "step": 49165 }, { "epoch": 5.476110925492817, "grad_norm": 0.08307404816150665, "learning_rate": 2.5209440947813316e-05, "loss": 0.4754, "num_input_tokens_seen": 59653776, "step": 49170 }, { "epoch": 5.476667780376434, "grad_norm": 0.13255074620246887, "learning_rate": 2.520458163883347e-05, "loss": 0.4773, "num_input_tokens_seen": 59659472, "step": 49175 }, { "epoch": 5.477224635260051, "grad_norm": 0.12392551451921463, "learning_rate": 2.5199722322123874e-05, "loss": 0.4624, "num_input_tokens_seen": 59665712, "step": 49180 }, { "epoch": 5.477781490143668, "grad_norm": 0.15050958096981049, "learning_rate": 2.5194862997868124e-05, "loss": 0.4672, "num_input_tokens_seen": 59671824, "step": 49185 }, { "epoch": 5.478338345027286, "grad_norm": 0.09845774620771408, "learning_rate": 2.5190003666249827e-05, "loss": 0.4559, "num_input_tokens_seen": 59677776, "step": 49190 }, { "epoch": 5.4788951999109035, "grad_norm": 0.11100275814533234, "learning_rate": 2.5185144327452588e-05, "loss": 0.4625, "num_input_tokens_seen": 59683088, "step": 49195 }, { "epoch": 5.47945205479452, "grad_norm": 0.089144267141819, "learning_rate": 2.518028498166e-05, "loss": 0.4582, "num_input_tokens_seen": 59689168, "step": 49200 }, { "epoch": 5.480008909678138, "grad_norm": 0.09109639376401901, "learning_rate": 2.517542562905567e-05, "loss": 0.4489, "num_input_tokens_seen": 59695184, "step": 49205 }, { "epoch": 5.480565764561756, "grad_norm": 0.09787700325250626, "learning_rate": 2.51705662698232e-05, "loss": 0.4685, "num_input_tokens_seen": 59701008, "step": 49210 }, { "epoch": 5.481122619445372, "grad_norm": 0.12683691084384918, "learning_rate": 2.5165706904146182e-05, "loss": 0.4674, "num_input_tokens_seen": 59706736, "step": 49215 }, { "epoch": 5.48167947432899, "grad_norm": 0.09321750700473785, "learning_rate": 2.5160847532208226e-05, "loss": 0.4589, "num_input_tokens_seen": 59712528, "step": 49220 }, { "epoch": 5.482236329212607, "grad_norm": 0.1316147893667221, "learning_rate": 2.5155988154192937e-05, "loss": 0.4603, "num_input_tokens_seen": 59718640, "step": 49225 }, { "epoch": 5.4827931840962245, "grad_norm": 0.1683051884174347, "learning_rate": 2.5151128770283916e-05, "loss": 0.4875, "num_input_tokens_seen": 59724464, "step": 49230 }, { "epoch": 5.483350038979842, "grad_norm": 0.11112045496702194, "learning_rate": 2.514626938066477e-05, "loss": 0.4563, "num_input_tokens_seen": 59730480, "step": 49235 }, { "epoch": 5.483906893863459, "grad_norm": 0.10653470456600189, "learning_rate": 2.5141409985519094e-05, "loss": 0.4795, "num_input_tokens_seen": 59736752, "step": 49240 }, { "epoch": 5.484463748747077, "grad_norm": 0.08074483275413513, "learning_rate": 2.51365505850305e-05, "loss": 0.4706, "num_input_tokens_seen": 59743024, "step": 49245 }, { "epoch": 5.485020603630693, "grad_norm": 0.10597886890172958, "learning_rate": 2.5131691179382578e-05, "loss": 0.4653, "num_input_tokens_seen": 59748944, "step": 49250 }, { "epoch": 5.485577458514311, "grad_norm": 0.10529404133558273, "learning_rate": 2.5126831768758947e-05, "loss": 0.4685, "num_input_tokens_seen": 59755152, "step": 49255 }, { "epoch": 5.486134313397929, "grad_norm": 0.10308301448822021, "learning_rate": 2.5121972353343204e-05, "loss": 0.4657, "num_input_tokens_seen": 59761392, "step": 49260 }, { "epoch": 5.486691168281546, "grad_norm": 0.13893063366413116, "learning_rate": 2.511711293331896e-05, "loss": 0.4727, "num_input_tokens_seen": 59767632, "step": 49265 }, { "epoch": 5.487248023165163, "grad_norm": 0.11591528356075287, "learning_rate": 2.511225350886981e-05, "loss": 0.4571, "num_input_tokens_seen": 59773552, "step": 49270 }, { "epoch": 5.487804878048781, "grad_norm": 0.10825712978839874, "learning_rate": 2.510739408017936e-05, "loss": 0.4687, "num_input_tokens_seen": 59779440, "step": 49275 }, { "epoch": 5.488361732932398, "grad_norm": 0.08761186897754669, "learning_rate": 2.510253464743122e-05, "loss": 0.454, "num_input_tokens_seen": 59785680, "step": 49280 }, { "epoch": 5.488918587816015, "grad_norm": 0.08200488239526749, "learning_rate": 2.509767521080899e-05, "loss": 0.4571, "num_input_tokens_seen": 59791664, "step": 49285 }, { "epoch": 5.489475442699632, "grad_norm": 0.09184464067220688, "learning_rate": 2.509281577049628e-05, "loss": 0.4679, "num_input_tokens_seen": 59798000, "step": 49290 }, { "epoch": 5.49003229758325, "grad_norm": 0.098475880920887, "learning_rate": 2.508795632667669e-05, "loss": 0.4644, "num_input_tokens_seen": 59804176, "step": 49295 }, { "epoch": 5.4905891524668675, "grad_norm": 0.10302293300628662, "learning_rate": 2.5083096879533835e-05, "loss": 0.4578, "num_input_tokens_seen": 59809968, "step": 49300 }, { "epoch": 5.491146007350484, "grad_norm": 0.11158476769924164, "learning_rate": 2.5078237429251305e-05, "loss": 0.4583, "num_input_tokens_seen": 59815696, "step": 49305 }, { "epoch": 5.491702862234102, "grad_norm": 0.08233852684497833, "learning_rate": 2.5073377976012724e-05, "loss": 0.4733, "num_input_tokens_seen": 59821424, "step": 49310 }, { "epoch": 5.492259717117719, "grad_norm": 0.08678972721099854, "learning_rate": 2.5068518520001676e-05, "loss": 0.4572, "num_input_tokens_seen": 59827536, "step": 49315 }, { "epoch": 5.492816572001336, "grad_norm": 0.12366670370101929, "learning_rate": 2.5063659061401784e-05, "loss": 0.463, "num_input_tokens_seen": 59833456, "step": 49320 }, { "epoch": 5.493373426884954, "grad_norm": 0.10490250587463379, "learning_rate": 2.5058799600396654e-05, "loss": 0.4553, "num_input_tokens_seen": 59839216, "step": 49325 }, { "epoch": 5.493930281768571, "grad_norm": 0.0960923582315445, "learning_rate": 2.5053940137169878e-05, "loss": 0.4637, "num_input_tokens_seen": 59845360, "step": 49330 }, { "epoch": 5.494487136652189, "grad_norm": 0.11301831156015396, "learning_rate": 2.504908067190508e-05, "loss": 0.4707, "num_input_tokens_seen": 59851568, "step": 49335 }, { "epoch": 5.495043991535805, "grad_norm": 0.11062799394130707, "learning_rate": 2.5044221204785852e-05, "loss": 0.4603, "num_input_tokens_seen": 59857712, "step": 49340 }, { "epoch": 5.495600846419423, "grad_norm": 0.14644552767276764, "learning_rate": 2.5039361735995797e-05, "loss": 0.4767, "num_input_tokens_seen": 59863568, "step": 49345 }, { "epoch": 5.496157701303041, "grad_norm": 0.09804115444421768, "learning_rate": 2.503450226571854e-05, "loss": 0.4733, "num_input_tokens_seen": 59869296, "step": 49350 }, { "epoch": 5.4967145561866575, "grad_norm": 0.1385001242160797, "learning_rate": 2.5029642794137674e-05, "loss": 0.4631, "num_input_tokens_seen": 59875312, "step": 49355 }, { "epoch": 5.497271411070275, "grad_norm": 0.10275149345397949, "learning_rate": 2.502478332143681e-05, "loss": 0.4633, "num_input_tokens_seen": 59881008, "step": 49360 }, { "epoch": 5.497828265953892, "grad_norm": 0.13437187671661377, "learning_rate": 2.501992384779956e-05, "loss": 0.4616, "num_input_tokens_seen": 59887184, "step": 49365 }, { "epoch": 5.49838512083751, "grad_norm": 0.08888374269008636, "learning_rate": 2.5015064373409506e-05, "loss": 0.4533, "num_input_tokens_seen": 59892752, "step": 49370 }, { "epoch": 5.498941975721127, "grad_norm": 0.09794783592224121, "learning_rate": 2.5010204898450284e-05, "loss": 0.4822, "num_input_tokens_seen": 59898832, "step": 49375 }, { "epoch": 5.499498830604744, "grad_norm": 0.10332950949668884, "learning_rate": 2.5005345423105485e-05, "loss": 0.4559, "num_input_tokens_seen": 59905072, "step": 49380 }, { "epoch": 5.500055685488362, "grad_norm": 0.09487155824899673, "learning_rate": 2.500048594755872e-05, "loss": 0.4679, "num_input_tokens_seen": 59911024, "step": 49385 }, { "epoch": 5.500612540371979, "grad_norm": 0.11966866999864578, "learning_rate": 2.4995626471993588e-05, "loss": 0.4639, "num_input_tokens_seen": 59917136, "step": 49390 }, { "epoch": 5.500612540371979, "eval_loss": 0.4634726643562317, "eval_runtime": 113.1026, "eval_samples_per_second": 35.287, "eval_steps_per_second": 8.824, "num_input_tokens_seen": 59917136, "step": 49390 }, { "epoch": 5.501169395255596, "grad_norm": 0.10474321991205215, "learning_rate": 2.4990766996593706e-05, "loss": 0.463, "num_input_tokens_seen": 59923344, "step": 49395 }, { "epoch": 5.501726250139214, "grad_norm": 0.10949715226888657, "learning_rate": 2.498590752154268e-05, "loss": 0.4626, "num_input_tokens_seen": 59929104, "step": 49400 }, { "epoch": 5.502283105022831, "grad_norm": 0.10635608434677124, "learning_rate": 2.498104804702411e-05, "loss": 0.4654, "num_input_tokens_seen": 59935216, "step": 49405 }, { "epoch": 5.502839959906448, "grad_norm": 0.1265770047903061, "learning_rate": 2.4976188573221602e-05, "loss": 0.467, "num_input_tokens_seen": 59941168, "step": 49410 }, { "epoch": 5.503396814790066, "grad_norm": 0.09850479662418365, "learning_rate": 2.497132910031877e-05, "loss": 0.4602, "num_input_tokens_seen": 59947632, "step": 49415 }, { "epoch": 5.503953669673683, "grad_norm": 0.08707313239574432, "learning_rate": 2.4966469628499217e-05, "loss": 0.4635, "num_input_tokens_seen": 59953680, "step": 49420 }, { "epoch": 5.5045105245573005, "grad_norm": 0.12714888155460358, "learning_rate": 2.4961610157946548e-05, "loss": 0.4622, "num_input_tokens_seen": 59959536, "step": 49425 }, { "epoch": 5.505067379440918, "grad_norm": 0.09376182407140732, "learning_rate": 2.4956750688844372e-05, "loss": 0.4575, "num_input_tokens_seen": 59965488, "step": 49430 }, { "epoch": 5.505624234324535, "grad_norm": 0.10097356885671616, "learning_rate": 2.4951891221376292e-05, "loss": 0.4673, "num_input_tokens_seen": 59971536, "step": 49435 }, { "epoch": 5.506181089208153, "grad_norm": 0.12267863005399704, "learning_rate": 2.4947031755725918e-05, "loss": 0.4515, "num_input_tokens_seen": 59977616, "step": 49440 }, { "epoch": 5.506737944091769, "grad_norm": 0.09093409776687622, "learning_rate": 2.494217229207684e-05, "loss": 0.4567, "num_input_tokens_seen": 59984112, "step": 49445 }, { "epoch": 5.507294798975387, "grad_norm": 0.07167407870292664, "learning_rate": 2.4937312830612694e-05, "loss": 0.4703, "num_input_tokens_seen": 59990352, "step": 49450 }, { "epoch": 5.507851653859005, "grad_norm": 0.1087147518992424, "learning_rate": 2.4932453371517067e-05, "loss": 0.4741, "num_input_tokens_seen": 59996176, "step": 49455 }, { "epoch": 5.508408508742622, "grad_norm": 0.08260327577590942, "learning_rate": 2.4927593914973568e-05, "loss": 0.4699, "num_input_tokens_seen": 60002384, "step": 49460 }, { "epoch": 5.508965363626239, "grad_norm": 0.10351188480854034, "learning_rate": 2.4922734461165805e-05, "loss": 0.4643, "num_input_tokens_seen": 60008400, "step": 49465 }, { "epoch": 5.509522218509856, "grad_norm": 0.10887711495161057, "learning_rate": 2.4917875010277382e-05, "loss": 0.4508, "num_input_tokens_seen": 60014896, "step": 49470 }, { "epoch": 5.510079073393474, "grad_norm": 0.11828038841485977, "learning_rate": 2.4913015562491905e-05, "loss": 0.4681, "num_input_tokens_seen": 60021136, "step": 49475 }, { "epoch": 5.510635928277091, "grad_norm": 0.10988384485244751, "learning_rate": 2.4908156117992973e-05, "loss": 0.4743, "num_input_tokens_seen": 60027408, "step": 49480 }, { "epoch": 5.511192783160708, "grad_norm": 0.09100355952978134, "learning_rate": 2.49032966769642e-05, "loss": 0.4589, "num_input_tokens_seen": 60033424, "step": 49485 }, { "epoch": 5.511749638044326, "grad_norm": 0.09347133338451385, "learning_rate": 2.4898437239589188e-05, "loss": 0.4706, "num_input_tokens_seen": 60039568, "step": 49490 }, { "epoch": 5.512306492927943, "grad_norm": 0.11298640817403793, "learning_rate": 2.4893577806051537e-05, "loss": 0.4594, "num_input_tokens_seen": 60045872, "step": 49495 }, { "epoch": 5.51286334781156, "grad_norm": 0.11132588237524033, "learning_rate": 2.488871837653486e-05, "loss": 0.4569, "num_input_tokens_seen": 60051728, "step": 49500 }, { "epoch": 5.513420202695178, "grad_norm": 0.10684631764888763, "learning_rate": 2.4883858951222766e-05, "loss": 0.4619, "num_input_tokens_seen": 60057488, "step": 49505 }, { "epoch": 5.513977057578795, "grad_norm": 0.09477993100881577, "learning_rate": 2.487899953029885e-05, "loss": 0.4626, "num_input_tokens_seen": 60063536, "step": 49510 }, { "epoch": 5.514533912462412, "grad_norm": 0.10632367432117462, "learning_rate": 2.4874140113946717e-05, "loss": 0.4509, "num_input_tokens_seen": 60069904, "step": 49515 }, { "epoch": 5.515090767346029, "grad_norm": 0.09865552932024002, "learning_rate": 2.4869280702349977e-05, "loss": 0.4709, "num_input_tokens_seen": 60075888, "step": 49520 }, { "epoch": 5.515647622229647, "grad_norm": 0.10710375010967255, "learning_rate": 2.4864421295692225e-05, "loss": 0.4599, "num_input_tokens_seen": 60081648, "step": 49525 }, { "epoch": 5.516204477113265, "grad_norm": 0.11335653811693192, "learning_rate": 2.4859561894157075e-05, "loss": 0.4603, "num_input_tokens_seen": 60087440, "step": 49530 }, { "epoch": 5.516761331996881, "grad_norm": 0.12407927960157394, "learning_rate": 2.4854702497928128e-05, "loss": 0.4591, "num_input_tokens_seen": 60093232, "step": 49535 }, { "epoch": 5.517318186880499, "grad_norm": 0.08864597231149673, "learning_rate": 2.4849843107188985e-05, "loss": 0.4648, "num_input_tokens_seen": 60099248, "step": 49540 }, { "epoch": 5.517875041764116, "grad_norm": 0.08925943076610565, "learning_rate": 2.4844983722123252e-05, "loss": 0.4514, "num_input_tokens_seen": 60105360, "step": 49545 }, { "epoch": 5.5184318966477335, "grad_norm": 0.0829353779554367, "learning_rate": 2.4840124342914528e-05, "loss": 0.4773, "num_input_tokens_seen": 60111664, "step": 49550 }, { "epoch": 5.518988751531351, "grad_norm": 0.08868157118558884, "learning_rate": 2.4835264969746423e-05, "loss": 0.4624, "num_input_tokens_seen": 60117872, "step": 49555 }, { "epoch": 5.519545606414968, "grad_norm": 0.10029548406600952, "learning_rate": 2.4830405602802526e-05, "loss": 0.4624, "num_input_tokens_seen": 60124368, "step": 49560 }, { "epoch": 5.520102461298586, "grad_norm": 0.11292076855897903, "learning_rate": 2.4825546242266457e-05, "loss": 0.4754, "num_input_tokens_seen": 60130672, "step": 49565 }, { "epoch": 5.520659316182203, "grad_norm": 0.11468786746263504, "learning_rate": 2.4820686888321808e-05, "loss": 0.4676, "num_input_tokens_seen": 60136848, "step": 49570 }, { "epoch": 5.52121617106582, "grad_norm": 0.11665258556604385, "learning_rate": 2.4815827541152183e-05, "loss": 0.4663, "num_input_tokens_seen": 60143088, "step": 49575 }, { "epoch": 5.521773025949438, "grad_norm": 0.09846368432044983, "learning_rate": 2.4810968200941188e-05, "loss": 0.4604, "num_input_tokens_seen": 60149360, "step": 49580 }, { "epoch": 5.5223298808330545, "grad_norm": 0.0846971645951271, "learning_rate": 2.4806108867872416e-05, "loss": 0.4587, "num_input_tokens_seen": 60155376, "step": 49585 }, { "epoch": 5.522886735716672, "grad_norm": 0.06956394761800766, "learning_rate": 2.480124954212947e-05, "loss": 0.4551, "num_input_tokens_seen": 60161360, "step": 49590 }, { "epoch": 5.52344359060029, "grad_norm": 0.10584770888090134, "learning_rate": 2.4796390223895953e-05, "loss": 0.4559, "num_input_tokens_seen": 60167632, "step": 49595 }, { "epoch": 5.524000445483907, "grad_norm": 0.1211877092719078, "learning_rate": 2.4791530913355468e-05, "loss": 0.4605, "num_input_tokens_seen": 60173840, "step": 49600 }, { "epoch": 5.524557300367524, "grad_norm": 0.10423213243484497, "learning_rate": 2.4786671610691617e-05, "loss": 0.4606, "num_input_tokens_seen": 60180080, "step": 49605 }, { "epoch": 5.525114155251142, "grad_norm": 0.09802483767271042, "learning_rate": 2.4781812316087993e-05, "loss": 0.4515, "num_input_tokens_seen": 60186288, "step": 49610 }, { "epoch": 5.525671010134759, "grad_norm": 0.09566177427768707, "learning_rate": 2.47769530297282e-05, "loss": 0.4649, "num_input_tokens_seen": 60192400, "step": 49615 }, { "epoch": 5.5262278650183765, "grad_norm": 0.09242360293865204, "learning_rate": 2.4772093751795836e-05, "loss": 0.4597, "num_input_tokens_seen": 60198480, "step": 49620 }, { "epoch": 5.526784719901993, "grad_norm": 0.15254099667072296, "learning_rate": 2.4767234482474495e-05, "loss": 0.4646, "num_input_tokens_seen": 60204176, "step": 49625 }, { "epoch": 5.527341574785611, "grad_norm": 0.12798923254013062, "learning_rate": 2.4762375221947783e-05, "loss": 0.4484, "num_input_tokens_seen": 60209552, "step": 49630 }, { "epoch": 5.527898429669229, "grad_norm": 0.10319826006889343, "learning_rate": 2.47575159703993e-05, "loss": 0.4505, "num_input_tokens_seen": 60215824, "step": 49635 }, { "epoch": 5.528455284552845, "grad_norm": 0.09277927130460739, "learning_rate": 2.4752656728012645e-05, "loss": 0.4637, "num_input_tokens_seen": 60221712, "step": 49640 }, { "epoch": 5.529012139436463, "grad_norm": 0.08214476704597473, "learning_rate": 2.4747797494971406e-05, "loss": 0.4511, "num_input_tokens_seen": 60227920, "step": 49645 }, { "epoch": 5.52956899432008, "grad_norm": 0.10876549035310745, "learning_rate": 2.4742938271459186e-05, "loss": 0.4742, "num_input_tokens_seen": 60234384, "step": 49650 }, { "epoch": 5.5301258492036975, "grad_norm": 0.10823702812194824, "learning_rate": 2.4738079057659578e-05, "loss": 0.4745, "num_input_tokens_seen": 60240336, "step": 49655 }, { "epoch": 5.530682704087315, "grad_norm": 0.09452952444553375, "learning_rate": 2.4733219853756188e-05, "loss": 0.4619, "num_input_tokens_seen": 60246544, "step": 49660 }, { "epoch": 5.531239558970932, "grad_norm": 0.08485828340053558, "learning_rate": 2.4728360659932608e-05, "loss": 0.4517, "num_input_tokens_seen": 60252656, "step": 49665 }, { "epoch": 5.53179641385455, "grad_norm": 0.14068563282489777, "learning_rate": 2.4723501476372428e-05, "loss": 0.4528, "num_input_tokens_seen": 60258608, "step": 49670 }, { "epoch": 5.5323532687381665, "grad_norm": 0.09986817836761475, "learning_rate": 2.471864230325925e-05, "loss": 0.464, "num_input_tokens_seen": 60263952, "step": 49675 }, { "epoch": 5.532910123621784, "grad_norm": 0.11122716963291168, "learning_rate": 2.471378314077667e-05, "loss": 0.4506, "num_input_tokens_seen": 60270032, "step": 49680 }, { "epoch": 5.533466978505402, "grad_norm": 0.07834503799676895, "learning_rate": 2.4708923989108273e-05, "loss": 0.452, "num_input_tokens_seen": 60276240, "step": 49685 }, { "epoch": 5.534023833389019, "grad_norm": 0.09199253469705582, "learning_rate": 2.4704064848437668e-05, "loss": 0.4597, "num_input_tokens_seen": 60282288, "step": 49690 }, { "epoch": 5.534580688272636, "grad_norm": 0.13472405076026917, "learning_rate": 2.469920571894844e-05, "loss": 0.4786, "num_input_tokens_seen": 60288368, "step": 49695 }, { "epoch": 5.535137543156253, "grad_norm": 0.11431339383125305, "learning_rate": 2.4694346600824184e-05, "loss": 0.4543, "num_input_tokens_seen": 60294352, "step": 49700 }, { "epoch": 5.535694398039871, "grad_norm": 0.09752475470304489, "learning_rate": 2.468948749424849e-05, "loss": 0.4545, "num_input_tokens_seen": 60300496, "step": 49705 }, { "epoch": 5.536251252923488, "grad_norm": 0.11917632818222046, "learning_rate": 2.4684628399404956e-05, "loss": 0.4537, "num_input_tokens_seen": 60306672, "step": 49710 }, { "epoch": 5.536808107807105, "grad_norm": 0.10117090493440628, "learning_rate": 2.4679769316477173e-05, "loss": 0.46, "num_input_tokens_seen": 60312656, "step": 49715 }, { "epoch": 5.537364962690723, "grad_norm": 0.08237768709659576, "learning_rate": 2.4674910245648727e-05, "loss": 0.4666, "num_input_tokens_seen": 60318704, "step": 49720 }, { "epoch": 5.53792181757434, "grad_norm": 0.10929562151432037, "learning_rate": 2.4670051187103217e-05, "loss": 0.4595, "num_input_tokens_seen": 60324272, "step": 49725 }, { "epoch": 5.538478672457957, "grad_norm": 0.08812820911407471, "learning_rate": 2.4665192141024234e-05, "loss": 0.4616, "num_input_tokens_seen": 60330448, "step": 49730 }, { "epoch": 5.539035527341575, "grad_norm": 0.11382666230201721, "learning_rate": 2.4660333107595358e-05, "loss": 0.4657, "num_input_tokens_seen": 60336528, "step": 49735 }, { "epoch": 5.539592382225192, "grad_norm": 0.08910463750362396, "learning_rate": 2.465547408700019e-05, "loss": 0.4733, "num_input_tokens_seen": 60342576, "step": 49740 }, { "epoch": 5.5401492371088095, "grad_norm": 0.10109645128250122, "learning_rate": 2.4650615079422314e-05, "loss": 0.4634, "num_input_tokens_seen": 60348624, "step": 49745 }, { "epoch": 5.540706091992427, "grad_norm": 0.10237035155296326, "learning_rate": 2.4645756085045317e-05, "loss": 0.4684, "num_input_tokens_seen": 60354384, "step": 49750 }, { "epoch": 5.541262946876044, "grad_norm": 0.0853603333234787, "learning_rate": 2.464089710405279e-05, "loss": 0.4662, "num_input_tokens_seen": 60360560, "step": 49755 }, { "epoch": 5.541819801759662, "grad_norm": 0.15990418195724487, "learning_rate": 2.463603813662833e-05, "loss": 0.478, "num_input_tokens_seen": 60366192, "step": 49760 }, { "epoch": 5.542376656643278, "grad_norm": 0.09289753437042236, "learning_rate": 2.4631179182955506e-05, "loss": 0.4646, "num_input_tokens_seen": 60372336, "step": 49765 }, { "epoch": 5.542933511526896, "grad_norm": 0.0856994017958641, "learning_rate": 2.462632024321792e-05, "loss": 0.4553, "num_input_tokens_seen": 60378320, "step": 49770 }, { "epoch": 5.543490366410514, "grad_norm": 0.10039827972650528, "learning_rate": 2.4621461317599152e-05, "loss": 0.4589, "num_input_tokens_seen": 60384336, "step": 49775 }, { "epoch": 5.5440472212941305, "grad_norm": 0.12123836576938629, "learning_rate": 2.4616602406282788e-05, "loss": 0.4571, "num_input_tokens_seen": 60389584, "step": 49780 }, { "epoch": 5.544604076177748, "grad_norm": 0.15573233366012573, "learning_rate": 2.461174350945241e-05, "loss": 0.4669, "num_input_tokens_seen": 60395696, "step": 49785 }, { "epoch": 5.545160931061366, "grad_norm": 0.09639301151037216, "learning_rate": 2.460688462729161e-05, "loss": 0.4526, "num_input_tokens_seen": 60402032, "step": 49790 }, { "epoch": 5.545717785944983, "grad_norm": 0.1204182580113411, "learning_rate": 2.460202575998397e-05, "loss": 0.4699, "num_input_tokens_seen": 60407984, "step": 49795 }, { "epoch": 5.5462746408286, "grad_norm": 0.09861711412668228, "learning_rate": 2.459716690771307e-05, "loss": 0.4694, "num_input_tokens_seen": 60414032, "step": 49800 }, { "epoch": 5.546831495712217, "grad_norm": 0.10216470062732697, "learning_rate": 2.4592308070662494e-05, "loss": 0.4674, "num_input_tokens_seen": 60420080, "step": 49805 }, { "epoch": 5.547388350595835, "grad_norm": 0.11431198567152023, "learning_rate": 2.458744924901583e-05, "loss": 0.4659, "num_input_tokens_seen": 60425744, "step": 49810 }, { "epoch": 5.5479452054794525, "grad_norm": 0.1127471923828125, "learning_rate": 2.4582590442956646e-05, "loss": 0.4614, "num_input_tokens_seen": 60431984, "step": 49815 }, { "epoch": 5.548502060363069, "grad_norm": 0.10467951744794846, "learning_rate": 2.457773165266854e-05, "loss": 0.4569, "num_input_tokens_seen": 60437488, "step": 49820 }, { "epoch": 5.549058915246687, "grad_norm": 0.07017681747674942, "learning_rate": 2.4572872878335087e-05, "loss": 0.4471, "num_input_tokens_seen": 60444144, "step": 49825 }, { "epoch": 5.549615770130304, "grad_norm": 0.13160374760627747, "learning_rate": 2.456801412013986e-05, "loss": 0.481, "num_input_tokens_seen": 60450128, "step": 49830 }, { "epoch": 5.550172625013921, "grad_norm": 0.10100392252206802, "learning_rate": 2.456315537826645e-05, "loss": 0.4765, "num_input_tokens_seen": 60456208, "step": 49835 }, { "epoch": 5.550729479897539, "grad_norm": 0.09054547548294067, "learning_rate": 2.455829665289843e-05, "loss": 0.4617, "num_input_tokens_seen": 60461904, "step": 49840 }, { "epoch": 5.551286334781156, "grad_norm": 0.08650393784046173, "learning_rate": 2.455343794421937e-05, "loss": 0.4644, "num_input_tokens_seen": 60468208, "step": 49845 }, { "epoch": 5.5518431896647735, "grad_norm": 0.08502952754497528, "learning_rate": 2.454857925241286e-05, "loss": 0.4528, "num_input_tokens_seen": 60474064, "step": 49850 }, { "epoch": 5.55240004454839, "grad_norm": 0.11809764802455902, "learning_rate": 2.4543720577662475e-05, "loss": 0.4542, "num_input_tokens_seen": 60479792, "step": 49855 }, { "epoch": 5.552956899432008, "grad_norm": 0.08728861808776855, "learning_rate": 2.4538861920151785e-05, "loss": 0.4578, "num_input_tokens_seen": 60485968, "step": 49860 }, { "epoch": 5.553513754315626, "grad_norm": 0.10417424887418747, "learning_rate": 2.453400328006437e-05, "loss": 0.4631, "num_input_tokens_seen": 60492336, "step": 49865 }, { "epoch": 5.5540706091992424, "grad_norm": 0.08911941945552826, "learning_rate": 2.452914465758381e-05, "loss": 0.4532, "num_input_tokens_seen": 60498384, "step": 49870 }, { "epoch": 5.55462746408286, "grad_norm": 0.10331819206476212, "learning_rate": 2.452428605289367e-05, "loss": 0.4671, "num_input_tokens_seen": 60504528, "step": 49875 }, { "epoch": 5.555184318966477, "grad_norm": 0.10651706159114838, "learning_rate": 2.451942746617753e-05, "loss": 0.4628, "num_input_tokens_seen": 60510512, "step": 49880 }, { "epoch": 5.555741173850095, "grad_norm": 0.07761027663946152, "learning_rate": 2.4514568897618957e-05, "loss": 0.476, "num_input_tokens_seen": 60516656, "step": 49885 }, { "epoch": 5.556298028733712, "grad_norm": 0.15424300730228424, "learning_rate": 2.4509710347401528e-05, "loss": 0.4572, "num_input_tokens_seen": 60522864, "step": 49890 }, { "epoch": 5.556854883617329, "grad_norm": 0.10855501890182495, "learning_rate": 2.4504851815708814e-05, "loss": 0.4565, "num_input_tokens_seen": 60529232, "step": 49895 }, { "epoch": 5.557411738500947, "grad_norm": 0.11839725822210312, "learning_rate": 2.4499993302724378e-05, "loss": 0.4562, "num_input_tokens_seen": 60535696, "step": 49900 }, { "epoch": 5.5579685933845635, "grad_norm": 0.08307811617851257, "learning_rate": 2.4495134808631802e-05, "loss": 0.4512, "num_input_tokens_seen": 60541488, "step": 49905 }, { "epoch": 5.558525448268181, "grad_norm": 0.1428012102842331, "learning_rate": 2.4490276333614652e-05, "loss": 0.453, "num_input_tokens_seen": 60547664, "step": 49910 }, { "epoch": 5.559082303151799, "grad_norm": 0.08536772429943085, "learning_rate": 2.4485417877856497e-05, "loss": 0.461, "num_input_tokens_seen": 60553296, "step": 49915 }, { "epoch": 5.559639158035416, "grad_norm": 0.08598218858242035, "learning_rate": 2.4480559441540903e-05, "loss": 0.4714, "num_input_tokens_seen": 60559440, "step": 49920 }, { "epoch": 5.560196012919033, "grad_norm": 0.12848232686519623, "learning_rate": 2.4475701024851438e-05, "loss": 0.4635, "num_input_tokens_seen": 60565648, "step": 49925 }, { "epoch": 5.560752867802651, "grad_norm": 0.07426168024539948, "learning_rate": 2.447084262797166e-05, "loss": 0.4642, "num_input_tokens_seen": 60571824, "step": 49930 }, { "epoch": 5.561309722686268, "grad_norm": 0.10370755940675735, "learning_rate": 2.4465984251085152e-05, "loss": 0.4608, "num_input_tokens_seen": 60577840, "step": 49935 }, { "epoch": 5.5618665775698855, "grad_norm": 0.1287924349308014, "learning_rate": 2.446112589437547e-05, "loss": 0.4687, "num_input_tokens_seen": 60584176, "step": 49940 }, { "epoch": 5.562423432453502, "grad_norm": 0.1042470633983612, "learning_rate": 2.445626755802618e-05, "loss": 0.4583, "num_input_tokens_seen": 60590544, "step": 49945 }, { "epoch": 5.56298028733712, "grad_norm": 0.07751753181219101, "learning_rate": 2.445140924222084e-05, "loss": 0.4542, "num_input_tokens_seen": 60596432, "step": 49950 }, { "epoch": 5.563537142220738, "grad_norm": 0.09617380052804947, "learning_rate": 2.444655094714302e-05, "loss": 0.4592, "num_input_tokens_seen": 60602800, "step": 49955 }, { "epoch": 5.564093997104354, "grad_norm": 0.10475540906190872, "learning_rate": 2.444169267297627e-05, "loss": 0.4668, "num_input_tokens_seen": 60608720, "step": 49960 }, { "epoch": 5.564650851987972, "grad_norm": 0.10955943912267685, "learning_rate": 2.443683441990417e-05, "loss": 0.4661, "num_input_tokens_seen": 60614864, "step": 49965 }, { "epoch": 5.56520770687159, "grad_norm": 0.09528590738773346, "learning_rate": 2.443197618811027e-05, "loss": 0.452, "num_input_tokens_seen": 60621264, "step": 49970 }, { "epoch": 5.5657645617552065, "grad_norm": 0.15273068845272064, "learning_rate": 2.4427117977778126e-05, "loss": 0.4607, "num_input_tokens_seen": 60627568, "step": 49975 }, { "epoch": 5.566321416638824, "grad_norm": 0.1094450131058693, "learning_rate": 2.4422259789091302e-05, "loss": 0.4638, "num_input_tokens_seen": 60633744, "step": 49980 }, { "epoch": 5.566878271522441, "grad_norm": 0.13618317246437073, "learning_rate": 2.4417401622233358e-05, "loss": 0.463, "num_input_tokens_seen": 60639728, "step": 49985 }, { "epoch": 5.567435126406059, "grad_norm": 0.10015071928501129, "learning_rate": 2.4412543477387845e-05, "loss": 0.4648, "num_input_tokens_seen": 60646000, "step": 49990 }, { "epoch": 5.567991981289676, "grad_norm": 0.09657667577266693, "learning_rate": 2.4407685354738316e-05, "loss": 0.4658, "num_input_tokens_seen": 60652080, "step": 49995 }, { "epoch": 5.568548836173293, "grad_norm": 0.08212655037641525, "learning_rate": 2.440282725446834e-05, "loss": 0.4599, "num_input_tokens_seen": 60658320, "step": 50000 }, { "epoch": 5.569105691056911, "grad_norm": 0.10425079613924026, "learning_rate": 2.4397969176761462e-05, "loss": 0.4601, "num_input_tokens_seen": 60664368, "step": 50005 }, { "epoch": 5.569662545940528, "grad_norm": 0.09195413440465927, "learning_rate": 2.439311112180124e-05, "loss": 0.4647, "num_input_tokens_seen": 60670320, "step": 50010 }, { "epoch": 5.570219400824145, "grad_norm": 0.07388186454772949, "learning_rate": 2.438825308977122e-05, "loss": 0.4615, "num_input_tokens_seen": 60676784, "step": 50015 }, { "epoch": 5.570776255707763, "grad_norm": 0.08973973244428635, "learning_rate": 2.4383395080854964e-05, "loss": 0.4662, "num_input_tokens_seen": 60682064, "step": 50020 }, { "epoch": 5.57133311059138, "grad_norm": 0.10063521564006805, "learning_rate": 2.437853709523601e-05, "loss": 0.4555, "num_input_tokens_seen": 60688208, "step": 50025 }, { "epoch": 5.571889965474997, "grad_norm": 0.10728496313095093, "learning_rate": 2.4373679133097922e-05, "loss": 0.4589, "num_input_tokens_seen": 60694288, "step": 50030 }, { "epoch": 5.572446820358614, "grad_norm": 0.08610907196998596, "learning_rate": 2.4368821194624245e-05, "loss": 0.4616, "num_input_tokens_seen": 60700496, "step": 50035 }, { "epoch": 5.573003675242232, "grad_norm": 0.09145312011241913, "learning_rate": 2.4363963279998523e-05, "loss": 0.4627, "num_input_tokens_seen": 60706672, "step": 50040 }, { "epoch": 5.5735605301258495, "grad_norm": 0.13085219264030457, "learning_rate": 2.4359105389404308e-05, "loss": 0.4786, "num_input_tokens_seen": 60712720, "step": 50045 }, { "epoch": 5.574117385009466, "grad_norm": 0.08922387659549713, "learning_rate": 2.4354247523025145e-05, "loss": 0.4553, "num_input_tokens_seen": 60718704, "step": 50050 }, { "epoch": 5.574674239893084, "grad_norm": 0.09578664600849152, "learning_rate": 2.434938968104458e-05, "loss": 0.476, "num_input_tokens_seen": 60724496, "step": 50055 }, { "epoch": 5.575231094776701, "grad_norm": 0.07794628292322159, "learning_rate": 2.4344531863646152e-05, "loss": 0.463, "num_input_tokens_seen": 60730384, "step": 50060 }, { "epoch": 5.575787949660318, "grad_norm": 0.09744342416524887, "learning_rate": 2.433967407101342e-05, "loss": 0.4558, "num_input_tokens_seen": 60736304, "step": 50065 }, { "epoch": 5.576344804543936, "grad_norm": 0.09784108400344849, "learning_rate": 2.433481630332991e-05, "loss": 0.4694, "num_input_tokens_seen": 60741840, "step": 50070 }, { "epoch": 5.576901659427553, "grad_norm": 0.09789480268955231, "learning_rate": 2.432995856077918e-05, "loss": 0.4698, "num_input_tokens_seen": 60748048, "step": 50075 }, { "epoch": 5.577458514311171, "grad_norm": 0.11346511542797089, "learning_rate": 2.4325100843544756e-05, "loss": 0.4644, "num_input_tokens_seen": 60754288, "step": 50080 }, { "epoch": 5.578015369194787, "grad_norm": 0.08105385303497314, "learning_rate": 2.432024315181019e-05, "loss": 0.4614, "num_input_tokens_seen": 60760432, "step": 50085 }, { "epoch": 5.578572224078405, "grad_norm": 0.08351762592792511, "learning_rate": 2.431538548575901e-05, "loss": 0.4671, "num_input_tokens_seen": 60766192, "step": 50090 }, { "epoch": 5.579129078962023, "grad_norm": 0.08474917709827423, "learning_rate": 2.4310527845574764e-05, "loss": 0.4738, "num_input_tokens_seen": 60772176, "step": 50095 }, { "epoch": 5.5796859338456395, "grad_norm": 0.08669845759868622, "learning_rate": 2.4305670231440987e-05, "loss": 0.4587, "num_input_tokens_seen": 60777648, "step": 50100 }, { "epoch": 5.580242788729257, "grad_norm": 0.10398887097835541, "learning_rate": 2.430081264354121e-05, "loss": 0.4683, "num_input_tokens_seen": 60783248, "step": 50105 }, { "epoch": 5.580799643612875, "grad_norm": 0.08202002942562103, "learning_rate": 2.4295955082058976e-05, "loss": 0.4567, "num_input_tokens_seen": 60789552, "step": 50110 }, { "epoch": 5.581356498496492, "grad_norm": 0.08306208997964859, "learning_rate": 2.4291097547177812e-05, "loss": 0.466, "num_input_tokens_seen": 60795440, "step": 50115 }, { "epoch": 5.581913353380109, "grad_norm": 0.10947313904762268, "learning_rate": 2.4286240039081257e-05, "loss": 0.4593, "num_input_tokens_seen": 60801168, "step": 50120 }, { "epoch": 5.582470208263726, "grad_norm": 0.11925473064184189, "learning_rate": 2.4281382557952835e-05, "loss": 0.4548, "num_input_tokens_seen": 60807344, "step": 50125 }, { "epoch": 5.583027063147344, "grad_norm": 0.10918648540973663, "learning_rate": 2.4276525103976088e-05, "loss": 0.4573, "num_input_tokens_seen": 60813776, "step": 50130 }, { "epoch": 5.583583918030961, "grad_norm": 0.09291274845600128, "learning_rate": 2.427166767733454e-05, "loss": 0.4674, "num_input_tokens_seen": 60820144, "step": 50135 }, { "epoch": 5.584140772914578, "grad_norm": 0.08118928968906403, "learning_rate": 2.4266810278211722e-05, "loss": 0.4549, "num_input_tokens_seen": 60826128, "step": 50140 }, { "epoch": 5.584697627798196, "grad_norm": 0.09904664754867554, "learning_rate": 2.4261952906791164e-05, "loss": 0.4632, "num_input_tokens_seen": 60832304, "step": 50145 }, { "epoch": 5.585254482681814, "grad_norm": 0.07422683387994766, "learning_rate": 2.4257095563256386e-05, "loss": 0.4618, "num_input_tokens_seen": 60838352, "step": 50150 }, { "epoch": 5.58581133756543, "grad_norm": 0.09615334123373032, "learning_rate": 2.4252238247790915e-05, "loss": 0.4475, "num_input_tokens_seen": 60844400, "step": 50155 }, { "epoch": 5.586368192449048, "grad_norm": 0.09386740624904633, "learning_rate": 2.4247380960578286e-05, "loss": 0.4595, "num_input_tokens_seen": 60850320, "step": 50160 }, { "epoch": 5.586925047332665, "grad_norm": 0.08191393315792084, "learning_rate": 2.4242523701802013e-05, "loss": 0.4689, "num_input_tokens_seen": 60855568, "step": 50165 }, { "epoch": 5.5874819022162825, "grad_norm": 0.08705474436283112, "learning_rate": 2.4237666471645625e-05, "loss": 0.4565, "num_input_tokens_seen": 60861808, "step": 50170 }, { "epoch": 5.5880387570999, "grad_norm": 0.11540236324071884, "learning_rate": 2.423280927029264e-05, "loss": 0.4576, "num_input_tokens_seen": 60867248, "step": 50175 }, { "epoch": 5.588595611983517, "grad_norm": 0.09690357744693756, "learning_rate": 2.4227952097926578e-05, "loss": 0.474, "num_input_tokens_seen": 60873808, "step": 50180 }, { "epoch": 5.589152466867135, "grad_norm": 0.10716406255960464, "learning_rate": 2.4223094954730956e-05, "loss": 0.4622, "num_input_tokens_seen": 60879312, "step": 50185 }, { "epoch": 5.589709321750751, "grad_norm": 0.14982765913009644, "learning_rate": 2.42182378408893e-05, "loss": 0.4745, "num_input_tokens_seen": 60885264, "step": 50190 }, { "epoch": 5.590266176634369, "grad_norm": 0.10907487571239471, "learning_rate": 2.4213380756585125e-05, "loss": 0.4683, "num_input_tokens_seen": 60891024, "step": 50195 }, { "epoch": 5.590823031517987, "grad_norm": 0.10308098047971725, "learning_rate": 2.4208523702001947e-05, "loss": 0.4601, "num_input_tokens_seen": 60897072, "step": 50200 }, { "epoch": 5.5913798864016035, "grad_norm": 0.09367986023426056, "learning_rate": 2.4203666677323276e-05, "loss": 0.4656, "num_input_tokens_seen": 60903408, "step": 50205 }, { "epoch": 5.591936741285221, "grad_norm": 0.09121590107679367, "learning_rate": 2.4198809682732636e-05, "loss": 0.4541, "num_input_tokens_seen": 60909616, "step": 50210 }, { "epoch": 5.592493596168838, "grad_norm": 0.08112601935863495, "learning_rate": 2.419395271841353e-05, "loss": 0.4601, "num_input_tokens_seen": 60915504, "step": 50215 }, { "epoch": 5.593050451052456, "grad_norm": 0.0799090787768364, "learning_rate": 2.4189095784549466e-05, "loss": 0.464, "num_input_tokens_seen": 60921552, "step": 50220 }, { "epoch": 5.593607305936073, "grad_norm": 0.09455018490552902, "learning_rate": 2.418423888132397e-05, "loss": 0.47, "num_input_tokens_seen": 60927600, "step": 50225 }, { "epoch": 5.59416416081969, "grad_norm": 0.0975150391459465, "learning_rate": 2.4179382008920543e-05, "loss": 0.4616, "num_input_tokens_seen": 60933008, "step": 50230 }, { "epoch": 5.594721015703308, "grad_norm": 0.07990606874227524, "learning_rate": 2.4174525167522692e-05, "loss": 0.4691, "num_input_tokens_seen": 60938832, "step": 50235 }, { "epoch": 5.595277870586925, "grad_norm": 0.10430232435464859, "learning_rate": 2.4169668357313926e-05, "loss": 0.4525, "num_input_tokens_seen": 60945168, "step": 50240 }, { "epoch": 5.595834725470542, "grad_norm": 0.0963888019323349, "learning_rate": 2.4164811578477753e-05, "loss": 0.4618, "num_input_tokens_seen": 60950320, "step": 50245 }, { "epoch": 5.59639158035416, "grad_norm": 0.12725764513015747, "learning_rate": 2.4159954831197666e-05, "loss": 0.4472, "num_input_tokens_seen": 60956592, "step": 50250 }, { "epoch": 5.596948435237777, "grad_norm": 0.11368845403194427, "learning_rate": 2.4155098115657185e-05, "loss": 0.4621, "num_input_tokens_seen": 60962544, "step": 50255 }, { "epoch": 5.597505290121394, "grad_norm": 0.1082262247800827, "learning_rate": 2.4150241432039804e-05, "loss": 0.4685, "num_input_tokens_seen": 60968592, "step": 50260 }, { "epoch": 5.598062145005011, "grad_norm": 0.09042871743440628, "learning_rate": 2.4145384780529027e-05, "loss": 0.4662, "num_input_tokens_seen": 60974448, "step": 50265 }, { "epoch": 5.598618999888629, "grad_norm": 0.11365178972482681, "learning_rate": 2.414052816130835e-05, "loss": 0.4666, "num_input_tokens_seen": 60980912, "step": 50270 }, { "epoch": 5.5991758547722466, "grad_norm": 0.09925233572721481, "learning_rate": 2.4135671574561273e-05, "loss": 0.4593, "num_input_tokens_seen": 60987184, "step": 50275 }, { "epoch": 5.599732709655863, "grad_norm": 0.12464103102684021, "learning_rate": 2.4130815020471296e-05, "loss": 0.4694, "num_input_tokens_seen": 60993072, "step": 50280 }, { "epoch": 5.600289564539481, "grad_norm": 0.1307225227355957, "learning_rate": 2.4125958499221908e-05, "loss": 0.4563, "num_input_tokens_seen": 60999056, "step": 50285 }, { "epoch": 5.600846419423099, "grad_norm": 0.1180633008480072, "learning_rate": 2.4121102010996623e-05, "loss": 0.4558, "num_input_tokens_seen": 61004944, "step": 50290 }, { "epoch": 5.6014032743067155, "grad_norm": 0.10880677402019501, "learning_rate": 2.411624555597891e-05, "loss": 0.4664, "num_input_tokens_seen": 61011216, "step": 50295 }, { "epoch": 5.601960129190333, "grad_norm": 0.084886334836483, "learning_rate": 2.4111389134352264e-05, "loss": 0.474, "num_input_tokens_seen": 61017264, "step": 50300 }, { "epoch": 5.602516984073951, "grad_norm": 0.07478673756122589, "learning_rate": 2.4106532746300192e-05, "loss": 0.4524, "num_input_tokens_seen": 61023088, "step": 50305 }, { "epoch": 5.603073838957568, "grad_norm": 0.11769437044858932, "learning_rate": 2.4101676392006175e-05, "loss": 0.4649, "num_input_tokens_seen": 61029264, "step": 50310 }, { "epoch": 5.603630693841185, "grad_norm": 0.11057940125465393, "learning_rate": 2.4096820071653706e-05, "loss": 0.4613, "num_input_tokens_seen": 61035344, "step": 50315 }, { "epoch": 5.604187548724802, "grad_norm": 0.12059441208839417, "learning_rate": 2.4091963785426267e-05, "loss": 0.4569, "num_input_tokens_seen": 61041360, "step": 50320 }, { "epoch": 5.60474440360842, "grad_norm": 0.08267239481210709, "learning_rate": 2.4087107533507345e-05, "loss": 0.4658, "num_input_tokens_seen": 61047504, "step": 50325 }, { "epoch": 5.605301258492037, "grad_norm": 0.10004734247922897, "learning_rate": 2.4082251316080428e-05, "loss": 0.453, "num_input_tokens_seen": 61053744, "step": 50330 }, { "epoch": 5.605858113375654, "grad_norm": 0.10958895087242126, "learning_rate": 2.407739513332899e-05, "loss": 0.4625, "num_input_tokens_seen": 61059696, "step": 50335 }, { "epoch": 5.606414968259272, "grad_norm": 0.11329682171344757, "learning_rate": 2.4072538985436526e-05, "loss": 0.4734, "num_input_tokens_seen": 61065520, "step": 50340 }, { "epoch": 5.606971823142889, "grad_norm": 0.0824299231171608, "learning_rate": 2.4067682872586512e-05, "loss": 0.4617, "num_input_tokens_seen": 61071440, "step": 50345 }, { "epoch": 5.607528678026506, "grad_norm": 0.09315208345651627, "learning_rate": 2.4062826794962427e-05, "loss": 0.4673, "num_input_tokens_seen": 61077616, "step": 50350 }, { "epoch": 5.608085532910124, "grad_norm": 0.08822686970233917, "learning_rate": 2.405797075274775e-05, "loss": 0.4607, "num_input_tokens_seen": 61083728, "step": 50355 }, { "epoch": 5.608642387793741, "grad_norm": 0.11043232679367065, "learning_rate": 2.4053114746125956e-05, "loss": 0.4673, "num_input_tokens_seen": 61089616, "step": 50360 }, { "epoch": 5.6091992426773585, "grad_norm": 0.1174965351819992, "learning_rate": 2.4048258775280515e-05, "loss": 0.4655, "num_input_tokens_seen": 61095920, "step": 50365 }, { "epoch": 5.609756097560975, "grad_norm": 0.08354535698890686, "learning_rate": 2.4043402840394916e-05, "loss": 0.4539, "num_input_tokens_seen": 61101808, "step": 50370 }, { "epoch": 5.610312952444593, "grad_norm": 0.09749998897314072, "learning_rate": 2.403854694165262e-05, "loss": 0.456, "num_input_tokens_seen": 61108080, "step": 50375 }, { "epoch": 5.610869807328211, "grad_norm": 0.09775271266698837, "learning_rate": 2.4033691079237102e-05, "loss": 0.4718, "num_input_tokens_seen": 61114192, "step": 50380 }, { "epoch": 5.611426662211827, "grad_norm": 0.08415767550468445, "learning_rate": 2.4028835253331832e-05, "loss": 0.4632, "num_input_tokens_seen": 61120112, "step": 50385 }, { "epoch": 5.611983517095445, "grad_norm": 0.08038689196109772, "learning_rate": 2.402397946412028e-05, "loss": 0.4639, "num_input_tokens_seen": 61126288, "step": 50390 }, { "epoch": 5.612540371979062, "grad_norm": 0.10438532382249832, "learning_rate": 2.401912371178591e-05, "loss": 0.4675, "num_input_tokens_seen": 61132400, "step": 50395 }, { "epoch": 5.6130972268626795, "grad_norm": 0.12543657422065735, "learning_rate": 2.4014267996512187e-05, "loss": 0.4801, "num_input_tokens_seen": 61138416, "step": 50400 }, { "epoch": 5.613654081746297, "grad_norm": 0.08826454728841782, "learning_rate": 2.4009412318482576e-05, "loss": 0.4646, "num_input_tokens_seen": 61144560, "step": 50405 }, { "epoch": 5.614210936629914, "grad_norm": 0.09622680395841599, "learning_rate": 2.400455667788055e-05, "loss": 0.4586, "num_input_tokens_seen": 61150640, "step": 50410 }, { "epoch": 5.614767791513532, "grad_norm": 0.11784474551677704, "learning_rate": 2.3999701074889557e-05, "loss": 0.4602, "num_input_tokens_seen": 61156880, "step": 50415 }, { "epoch": 5.6153246463971485, "grad_norm": 0.0780281201004982, "learning_rate": 2.3994845509693066e-05, "loss": 0.4648, "num_input_tokens_seen": 61163056, "step": 50420 }, { "epoch": 5.615881501280766, "grad_norm": 0.09773795306682587, "learning_rate": 2.3989989982474534e-05, "loss": 0.4655, "num_input_tokens_seen": 61169104, "step": 50425 }, { "epoch": 5.616438356164384, "grad_norm": 0.09374545514583588, "learning_rate": 2.398513449341741e-05, "loss": 0.4695, "num_input_tokens_seen": 61175216, "step": 50430 }, { "epoch": 5.616995211048001, "grad_norm": 0.10722199082374573, "learning_rate": 2.3980279042705166e-05, "loss": 0.4595, "num_input_tokens_seen": 61181136, "step": 50435 }, { "epoch": 5.617552065931618, "grad_norm": 0.09246966987848282, "learning_rate": 2.3975423630521243e-05, "loss": 0.4558, "num_input_tokens_seen": 61187312, "step": 50440 }, { "epoch": 5.618108920815235, "grad_norm": 0.09195972979068756, "learning_rate": 2.39705682570491e-05, "loss": 0.4622, "num_input_tokens_seen": 61193232, "step": 50445 }, { "epoch": 5.618665775698853, "grad_norm": 0.1013936847448349, "learning_rate": 2.3965712922472184e-05, "loss": 0.4651, "num_input_tokens_seen": 61199664, "step": 50450 }, { "epoch": 5.61922263058247, "grad_norm": 0.11151884496212006, "learning_rate": 2.3960857626973952e-05, "loss": 0.466, "num_input_tokens_seen": 61205936, "step": 50455 }, { "epoch": 5.619779485466087, "grad_norm": 0.115828737616539, "learning_rate": 2.3956002370737845e-05, "loss": 0.4584, "num_input_tokens_seen": 61211888, "step": 50460 }, { "epoch": 5.620336340349705, "grad_norm": 0.08498276025056839, "learning_rate": 2.3951147153947317e-05, "loss": 0.4684, "num_input_tokens_seen": 61218192, "step": 50465 }, { "epoch": 5.6208931952333225, "grad_norm": 0.09677144885063171, "learning_rate": 2.394629197678581e-05, "loss": 0.4637, "num_input_tokens_seen": 61224592, "step": 50470 }, { "epoch": 5.621450050116939, "grad_norm": 0.09607777744531631, "learning_rate": 2.3941436839436772e-05, "loss": 0.4617, "num_input_tokens_seen": 61230736, "step": 50475 }, { "epoch": 5.622006905000557, "grad_norm": 0.11684853583574295, "learning_rate": 2.3936581742083643e-05, "loss": 0.4501, "num_input_tokens_seen": 61237200, "step": 50480 }, { "epoch": 5.622563759884175, "grad_norm": 0.08601123839616776, "learning_rate": 2.393172668490986e-05, "loss": 0.4537, "num_input_tokens_seen": 61243440, "step": 50485 }, { "epoch": 5.6231206147677915, "grad_norm": 0.10554807633161545, "learning_rate": 2.3926871668098867e-05, "loss": 0.4593, "num_input_tokens_seen": 61249488, "step": 50490 }, { "epoch": 5.623677469651409, "grad_norm": 0.0842399001121521, "learning_rate": 2.3922016691834098e-05, "loss": 0.4637, "num_input_tokens_seen": 61255664, "step": 50495 }, { "epoch": 5.624234324535026, "grad_norm": 0.08474801480770111, "learning_rate": 2.3917161756298997e-05, "loss": 0.466, "num_input_tokens_seen": 61261872, "step": 50500 }, { "epoch": 5.624791179418644, "grad_norm": 0.09257441014051437, "learning_rate": 2.3912306861676996e-05, "loss": 0.4655, "num_input_tokens_seen": 61268080, "step": 50505 }, { "epoch": 5.625348034302261, "grad_norm": 0.11791084706783295, "learning_rate": 2.3907452008151526e-05, "loss": 0.4572, "num_input_tokens_seen": 61274512, "step": 50510 }, { "epoch": 5.625904889185878, "grad_norm": 0.0981266126036644, "learning_rate": 2.3902597195906024e-05, "loss": 0.4623, "num_input_tokens_seen": 61280592, "step": 50515 }, { "epoch": 5.626461744069496, "grad_norm": 0.12087240815162659, "learning_rate": 2.3897742425123913e-05, "loss": 0.4631, "num_input_tokens_seen": 61286896, "step": 50520 }, { "epoch": 5.6270185989531125, "grad_norm": 0.11767970025539398, "learning_rate": 2.389288769598862e-05, "loss": 0.4565, "num_input_tokens_seen": 61292944, "step": 50525 }, { "epoch": 5.62757545383673, "grad_norm": 0.1484213024377823, "learning_rate": 2.3888033008683585e-05, "loss": 0.4595, "num_input_tokens_seen": 61298992, "step": 50530 }, { "epoch": 5.628132308720348, "grad_norm": 0.16578657925128937, "learning_rate": 2.3883178363392224e-05, "loss": 0.4642, "num_input_tokens_seen": 61304816, "step": 50535 }, { "epoch": 5.628689163603965, "grad_norm": 0.06805475801229477, "learning_rate": 2.3878323760297967e-05, "loss": 0.4672, "num_input_tokens_seen": 61311120, "step": 50540 }, { "epoch": 5.629246018487582, "grad_norm": 0.0790417268872261, "learning_rate": 2.387346919958423e-05, "loss": 0.4712, "num_input_tokens_seen": 61317296, "step": 50545 }, { "epoch": 5.629802873371199, "grad_norm": 0.09540107846260071, "learning_rate": 2.3868614681434437e-05, "loss": 0.4669, "num_input_tokens_seen": 61323024, "step": 50550 }, { "epoch": 5.630359728254817, "grad_norm": 0.09137334674596786, "learning_rate": 2.3863760206032007e-05, "loss": 0.4663, "num_input_tokens_seen": 61329040, "step": 50555 }, { "epoch": 5.6309165831384345, "grad_norm": 0.09503267705440521, "learning_rate": 2.3858905773560354e-05, "loss": 0.4577, "num_input_tokens_seen": 61335088, "step": 50560 }, { "epoch": 5.631473438022051, "grad_norm": 0.09473740309476852, "learning_rate": 2.38540513842029e-05, "loss": 0.4691, "num_input_tokens_seen": 61341360, "step": 50565 }, { "epoch": 5.632030292905669, "grad_norm": 0.07199052721261978, "learning_rate": 2.3849197038143063e-05, "loss": 0.4657, "num_input_tokens_seen": 61347472, "step": 50570 }, { "epoch": 5.632587147789286, "grad_norm": 0.1262764185667038, "learning_rate": 2.3844342735564244e-05, "loss": 0.4719, "num_input_tokens_seen": 61353712, "step": 50575 }, { "epoch": 5.633144002672903, "grad_norm": 0.10138893127441406, "learning_rate": 2.3839488476649863e-05, "loss": 0.4542, "num_input_tokens_seen": 61359344, "step": 50580 }, { "epoch": 5.633700857556521, "grad_norm": 0.0931181088089943, "learning_rate": 2.3834634261583324e-05, "loss": 0.4511, "num_input_tokens_seen": 61365488, "step": 50585 }, { "epoch": 5.634257712440138, "grad_norm": 0.10206352174282074, "learning_rate": 2.3829780090548034e-05, "loss": 0.4572, "num_input_tokens_seen": 61371536, "step": 50590 }, { "epoch": 5.6348145673237555, "grad_norm": 0.08624111115932465, "learning_rate": 2.3824925963727407e-05, "loss": 0.469, "num_input_tokens_seen": 61377680, "step": 50595 }, { "epoch": 5.635371422207372, "grad_norm": 0.0758683905005455, "learning_rate": 2.3820071881304842e-05, "loss": 0.4604, "num_input_tokens_seen": 61383824, "step": 50600 }, { "epoch": 5.63592827709099, "grad_norm": 0.11557680368423462, "learning_rate": 2.3815217843463745e-05, "loss": 0.4754, "num_input_tokens_seen": 61389936, "step": 50605 }, { "epoch": 5.636485131974608, "grad_norm": 0.0841619223356247, "learning_rate": 2.3810363850387513e-05, "loss": 0.4665, "num_input_tokens_seen": 61396400, "step": 50610 }, { "epoch": 5.637041986858224, "grad_norm": 0.20247498154640198, "learning_rate": 2.3805509902259545e-05, "loss": 0.4857, "num_input_tokens_seen": 61402544, "step": 50615 }, { "epoch": 5.637598841741842, "grad_norm": 0.0975823774933815, "learning_rate": 2.3800655999263244e-05, "loss": 0.4582, "num_input_tokens_seen": 61409072, "step": 50620 }, { "epoch": 5.63815569662546, "grad_norm": 0.09801643341779709, "learning_rate": 2.3795802141581996e-05, "loss": 0.4573, "num_input_tokens_seen": 61414800, "step": 50625 }, { "epoch": 5.638712551509077, "grad_norm": 0.11346345394849777, "learning_rate": 2.3790948329399208e-05, "loss": 0.4633, "num_input_tokens_seen": 61420816, "step": 50630 }, { "epoch": 5.639269406392694, "grad_norm": 0.11035211384296417, "learning_rate": 2.378609456289827e-05, "loss": 0.468, "num_input_tokens_seen": 61427120, "step": 50635 }, { "epoch": 5.639826261276311, "grad_norm": 0.08769168704748154, "learning_rate": 2.378124084226256e-05, "loss": 0.4718, "num_input_tokens_seen": 61433264, "step": 50640 }, { "epoch": 5.640383116159929, "grad_norm": 0.09208643436431885, "learning_rate": 2.3776387167675484e-05, "loss": 0.4613, "num_input_tokens_seen": 61439120, "step": 50645 }, { "epoch": 5.640939971043546, "grad_norm": 0.11195558309555054, "learning_rate": 2.3771533539320424e-05, "loss": 0.4618, "num_input_tokens_seen": 61445136, "step": 50650 }, { "epoch": 5.641496825927163, "grad_norm": 0.14467544853687286, "learning_rate": 2.3766679957380754e-05, "loss": 0.4577, "num_input_tokens_seen": 61451152, "step": 50655 }, { "epoch": 5.642053680810781, "grad_norm": 0.07733581960201263, "learning_rate": 2.3761826422039873e-05, "loss": 0.4594, "num_input_tokens_seen": 61457168, "step": 50660 }, { "epoch": 5.6426105356943985, "grad_norm": 0.07676387578248978, "learning_rate": 2.375697293348116e-05, "loss": 0.4737, "num_input_tokens_seen": 61463376, "step": 50665 }, { "epoch": 5.643167390578015, "grad_norm": 0.09997483342885971, "learning_rate": 2.3752119491887992e-05, "loss": 0.459, "num_input_tokens_seen": 61469136, "step": 50670 }, { "epoch": 5.643724245461633, "grad_norm": 0.09439574182033539, "learning_rate": 2.374726609744375e-05, "loss": 0.4605, "num_input_tokens_seen": 61475216, "step": 50675 }, { "epoch": 5.64428110034525, "grad_norm": 0.10450959950685501, "learning_rate": 2.3742412750331805e-05, "loss": 0.4519, "num_input_tokens_seen": 61481424, "step": 50680 }, { "epoch": 5.644837955228867, "grad_norm": 0.07411414384841919, "learning_rate": 2.3737559450735534e-05, "loss": 0.4555, "num_input_tokens_seen": 61487632, "step": 50685 }, { "epoch": 5.645394810112485, "grad_norm": 0.10747332125902176, "learning_rate": 2.3732706198838318e-05, "loss": 0.4636, "num_input_tokens_seen": 61493776, "step": 50690 }, { "epoch": 5.645951664996102, "grad_norm": 0.10989554971456528, "learning_rate": 2.3727852994823523e-05, "loss": 0.4524, "num_input_tokens_seen": 61499504, "step": 50695 }, { "epoch": 5.64650851987972, "grad_norm": 0.1646665781736374, "learning_rate": 2.372299983887452e-05, "loss": 0.469, "num_input_tokens_seen": 61505584, "step": 50700 }, { "epoch": 5.647065374763336, "grad_norm": 0.1012846976518631, "learning_rate": 2.3718146731174678e-05, "loss": 0.4616, "num_input_tokens_seen": 61511952, "step": 50705 }, { "epoch": 5.647622229646954, "grad_norm": 0.11443950235843658, "learning_rate": 2.3713293671907357e-05, "loss": 0.4566, "num_input_tokens_seen": 61517840, "step": 50710 }, { "epoch": 5.648179084530572, "grad_norm": 0.08314782381057739, "learning_rate": 2.3708440661255922e-05, "loss": 0.4584, "num_input_tokens_seen": 61523664, "step": 50715 }, { "epoch": 5.6487359394141885, "grad_norm": 0.08437094837427139, "learning_rate": 2.3703587699403743e-05, "loss": 0.4654, "num_input_tokens_seen": 61529552, "step": 50720 }, { "epoch": 5.649292794297806, "grad_norm": 0.08798721432685852, "learning_rate": 2.369873478653417e-05, "loss": 0.4655, "num_input_tokens_seen": 61535760, "step": 50725 }, { "epoch": 5.649849649181423, "grad_norm": 0.090064138174057, "learning_rate": 2.369388192283057e-05, "loss": 0.4722, "num_input_tokens_seen": 61541584, "step": 50730 }, { "epoch": 5.650406504065041, "grad_norm": 0.09498784691095352, "learning_rate": 2.3689029108476297e-05, "loss": 0.4568, "num_input_tokens_seen": 61547984, "step": 50735 }, { "epoch": 5.650963358948658, "grad_norm": 0.10279474407434464, "learning_rate": 2.3684176343654698e-05, "loss": 0.4696, "num_input_tokens_seen": 61554480, "step": 50740 }, { "epoch": 5.651520213832275, "grad_norm": 0.09714987128973007, "learning_rate": 2.367932362854914e-05, "loss": 0.4728, "num_input_tokens_seen": 61560624, "step": 50745 }, { "epoch": 5.652077068715893, "grad_norm": 0.08884543925523758, "learning_rate": 2.3674470963342968e-05, "loss": 0.4615, "num_input_tokens_seen": 61566192, "step": 50750 }, { "epoch": 5.65263392359951, "grad_norm": 0.11000777781009674, "learning_rate": 2.366961834821953e-05, "loss": 0.463, "num_input_tokens_seen": 61571696, "step": 50755 }, { "epoch": 5.653190778483127, "grad_norm": 0.11704905331134796, "learning_rate": 2.3664765783362174e-05, "loss": 0.4576, "num_input_tokens_seen": 61577776, "step": 50760 }, { "epoch": 5.653747633366745, "grad_norm": 0.08331708610057831, "learning_rate": 2.3659913268954244e-05, "loss": 0.463, "num_input_tokens_seen": 61583696, "step": 50765 }, { "epoch": 5.654304488250362, "grad_norm": 0.1007281169295311, "learning_rate": 2.365506080517908e-05, "loss": 0.4694, "num_input_tokens_seen": 61590192, "step": 50770 }, { "epoch": 5.654861343133979, "grad_norm": 0.17006956040859222, "learning_rate": 2.3650208392220037e-05, "loss": 0.4761, "num_input_tokens_seen": 61596304, "step": 50775 }, { "epoch": 5.655418198017596, "grad_norm": 0.08987333625555038, "learning_rate": 2.3645356030260447e-05, "loss": 0.4698, "num_input_tokens_seen": 61602384, "step": 50780 }, { "epoch": 5.655975052901214, "grad_norm": 0.09165334701538086, "learning_rate": 2.3640503719483646e-05, "loss": 0.4517, "num_input_tokens_seen": 61608112, "step": 50785 }, { "epoch": 5.6565319077848315, "grad_norm": 0.08789137005805969, "learning_rate": 2.3635651460072973e-05, "loss": 0.4628, "num_input_tokens_seen": 61614352, "step": 50790 }, { "epoch": 5.657088762668448, "grad_norm": 0.13134166598320007, "learning_rate": 2.363079925221176e-05, "loss": 0.4529, "num_input_tokens_seen": 61620624, "step": 50795 }, { "epoch": 5.657645617552066, "grad_norm": 0.09612490236759186, "learning_rate": 2.3625947096083327e-05, "loss": 0.4633, "num_input_tokens_seen": 61626704, "step": 50800 }, { "epoch": 5.658202472435684, "grad_norm": 0.08775265514850616, "learning_rate": 2.3621094991871028e-05, "loss": 0.4644, "num_input_tokens_seen": 61632752, "step": 50805 }, { "epoch": 5.6587593273193, "grad_norm": 0.09258972853422165, "learning_rate": 2.361624293975818e-05, "loss": 0.4504, "num_input_tokens_seen": 61638640, "step": 50810 }, { "epoch": 5.659316182202918, "grad_norm": 0.09514272958040237, "learning_rate": 2.3611390939928106e-05, "loss": 0.4665, "num_input_tokens_seen": 61644784, "step": 50815 }, { "epoch": 5.659873037086535, "grad_norm": 0.08610521256923676, "learning_rate": 2.360653899256413e-05, "loss": 0.4686, "num_input_tokens_seen": 61651088, "step": 50820 }, { "epoch": 5.660429891970153, "grad_norm": 0.12931062281131744, "learning_rate": 2.3601687097849577e-05, "loss": 0.4572, "num_input_tokens_seen": 61657264, "step": 50825 }, { "epoch": 5.66098674685377, "grad_norm": 0.08338862657546997, "learning_rate": 2.359683525596777e-05, "loss": 0.4551, "num_input_tokens_seen": 61663376, "step": 50830 }, { "epoch": 5.661543601737387, "grad_norm": 0.12338919937610626, "learning_rate": 2.3591983467102015e-05, "loss": 0.4623, "num_input_tokens_seen": 61669296, "step": 50835 }, { "epoch": 5.662100456621005, "grad_norm": 0.10489308834075928, "learning_rate": 2.3587131731435645e-05, "loss": 0.4687, "num_input_tokens_seen": 61675440, "step": 50840 }, { "epoch": 5.662657311504622, "grad_norm": 0.13166984915733337, "learning_rate": 2.3582280049151965e-05, "loss": 0.4557, "num_input_tokens_seen": 61681904, "step": 50845 }, { "epoch": 5.663214166388239, "grad_norm": 0.13135872781276703, "learning_rate": 2.3577428420434286e-05, "loss": 0.4639, "num_input_tokens_seen": 61688400, "step": 50850 }, { "epoch": 5.663771021271857, "grad_norm": 0.10628785192966461, "learning_rate": 2.3572576845465922e-05, "loss": 0.4597, "num_input_tokens_seen": 61694256, "step": 50855 }, { "epoch": 5.664327876155474, "grad_norm": 0.10912566632032394, "learning_rate": 2.356772532443018e-05, "loss": 0.4592, "num_input_tokens_seen": 61700528, "step": 50860 }, { "epoch": 5.664884731039091, "grad_norm": 0.11566033214330673, "learning_rate": 2.356287385751036e-05, "loss": 0.4622, "num_input_tokens_seen": 61706416, "step": 50865 }, { "epoch": 5.665441585922709, "grad_norm": 0.10381211340427399, "learning_rate": 2.3558022444889775e-05, "loss": 0.4696, "num_input_tokens_seen": 61712528, "step": 50870 }, { "epoch": 5.665998440806326, "grad_norm": 0.07736040651798248, "learning_rate": 2.3553171086751723e-05, "loss": 0.4558, "num_input_tokens_seen": 61718672, "step": 50875 }, { "epoch": 5.666555295689943, "grad_norm": 0.07889425754547119, "learning_rate": 2.3548319783279503e-05, "loss": 0.4586, "num_input_tokens_seen": 61724592, "step": 50880 }, { "epoch": 5.66711215057356, "grad_norm": 0.09324721246957779, "learning_rate": 2.3543468534656417e-05, "loss": 0.4551, "num_input_tokens_seen": 61730352, "step": 50885 }, { "epoch": 5.667669005457178, "grad_norm": 0.08474846929311752, "learning_rate": 2.3538617341065756e-05, "loss": 0.4619, "num_input_tokens_seen": 61736432, "step": 50890 }, { "epoch": 5.668225860340796, "grad_norm": 0.15945085883140564, "learning_rate": 2.3533766202690812e-05, "loss": 0.4499, "num_input_tokens_seen": 61742896, "step": 50895 }, { "epoch": 5.668782715224412, "grad_norm": 0.08526536822319031, "learning_rate": 2.352891511971488e-05, "loss": 0.4503, "num_input_tokens_seen": 61749392, "step": 50900 }, { "epoch": 5.66933957010803, "grad_norm": 0.07589325308799744, "learning_rate": 2.352406409232125e-05, "loss": 0.4584, "num_input_tokens_seen": 61754896, "step": 50905 }, { "epoch": 5.669896424991647, "grad_norm": 0.10864338278770447, "learning_rate": 2.351921312069321e-05, "loss": 0.4681, "num_input_tokens_seen": 61761168, "step": 50910 }, { "epoch": 5.6704532798752645, "grad_norm": 0.1146872341632843, "learning_rate": 2.3514362205014044e-05, "loss": 0.4566, "num_input_tokens_seen": 61767600, "step": 50915 }, { "epoch": 5.671010134758882, "grad_norm": 0.1040385514497757, "learning_rate": 2.3509511345467032e-05, "loss": 0.4665, "num_input_tokens_seen": 61773680, "step": 50920 }, { "epoch": 5.671566989642499, "grad_norm": 0.11770936846733093, "learning_rate": 2.350466054223546e-05, "loss": 0.4768, "num_input_tokens_seen": 61779664, "step": 50925 }, { "epoch": 5.672123844526117, "grad_norm": 0.10920567065477371, "learning_rate": 2.34998097955026e-05, "loss": 0.467, "num_input_tokens_seen": 61785968, "step": 50930 }, { "epoch": 5.672680699409733, "grad_norm": 0.09115205705165863, "learning_rate": 2.349495910545174e-05, "loss": 0.4645, "num_input_tokens_seen": 61792048, "step": 50935 }, { "epoch": 5.673237554293351, "grad_norm": 0.09965229779481888, "learning_rate": 2.3490108472266146e-05, "loss": 0.4646, "num_input_tokens_seen": 61798576, "step": 50940 }, { "epoch": 5.673794409176969, "grad_norm": 0.13301439583301544, "learning_rate": 2.3485257896129094e-05, "loss": 0.4545, "num_input_tokens_seen": 61804048, "step": 50945 }, { "epoch": 5.6743512640605855, "grad_norm": 0.09908325970172882, "learning_rate": 2.3480407377223852e-05, "loss": 0.4653, "num_input_tokens_seen": 61809712, "step": 50950 }, { "epoch": 5.674908118944203, "grad_norm": 0.07807770371437073, "learning_rate": 2.3475556915733688e-05, "loss": 0.4622, "num_input_tokens_seen": 61815568, "step": 50955 }, { "epoch": 5.67546497382782, "grad_norm": 0.10479778796434402, "learning_rate": 2.347070651184187e-05, "loss": 0.4564, "num_input_tokens_seen": 61821648, "step": 50960 }, { "epoch": 5.676021828711438, "grad_norm": 0.15254859626293182, "learning_rate": 2.3465856165731657e-05, "loss": 0.469, "num_input_tokens_seen": 61827920, "step": 50965 }, { "epoch": 5.676578683595055, "grad_norm": 0.11851202696561813, "learning_rate": 2.3461005877586318e-05, "loss": 0.4543, "num_input_tokens_seen": 61834320, "step": 50970 }, { "epoch": 5.677135538478672, "grad_norm": 0.11003047227859497, "learning_rate": 2.3456155647589107e-05, "loss": 0.4692, "num_input_tokens_seen": 61840624, "step": 50975 }, { "epoch": 5.67769239336229, "grad_norm": 0.10385137796401978, "learning_rate": 2.3451305475923287e-05, "loss": 0.455, "num_input_tokens_seen": 61846704, "step": 50980 }, { "epoch": 5.6782492482459075, "grad_norm": 0.10617620497941971, "learning_rate": 2.3446455362772106e-05, "loss": 0.4534, "num_input_tokens_seen": 61852432, "step": 50985 }, { "epoch": 5.678806103129524, "grad_norm": 0.10590188950300217, "learning_rate": 2.3441605308318824e-05, "loss": 0.4575, "num_input_tokens_seen": 61858512, "step": 50990 }, { "epoch": 5.679362958013142, "grad_norm": 0.08846848458051682, "learning_rate": 2.3436755312746678e-05, "loss": 0.4718, "num_input_tokens_seen": 61864400, "step": 50995 }, { "epoch": 5.679919812896759, "grad_norm": 0.1203937977552414, "learning_rate": 2.3431905376238932e-05, "loss": 0.468, "num_input_tokens_seen": 61870512, "step": 51000 }, { "epoch": 5.680476667780376, "grad_norm": 0.12865018844604492, "learning_rate": 2.3427055498978827e-05, "loss": 0.4571, "num_input_tokens_seen": 61876560, "step": 51005 }, { "epoch": 5.681033522663994, "grad_norm": 0.11107152700424194, "learning_rate": 2.342220568114961e-05, "loss": 0.4583, "num_input_tokens_seen": 61882352, "step": 51010 }, { "epoch": 5.681590377547611, "grad_norm": 0.09715837985277176, "learning_rate": 2.3417355922934513e-05, "loss": 0.4656, "num_input_tokens_seen": 61888464, "step": 51015 }, { "epoch": 5.6821472324312285, "grad_norm": 0.08104608207941055, "learning_rate": 2.3412506224516782e-05, "loss": 0.4603, "num_input_tokens_seen": 61894672, "step": 51020 }, { "epoch": 5.682704087314846, "grad_norm": 0.0957581102848053, "learning_rate": 2.3407656586079653e-05, "loss": 0.4533, "num_input_tokens_seen": 61900528, "step": 51025 }, { "epoch": 5.683260942198463, "grad_norm": 0.10036174952983856, "learning_rate": 2.340280700780636e-05, "loss": 0.4689, "num_input_tokens_seen": 61906704, "step": 51030 }, { "epoch": 5.683817797082081, "grad_norm": 0.11582271754741669, "learning_rate": 2.3397957489880137e-05, "loss": 0.4626, "num_input_tokens_seen": 61913104, "step": 51035 }, { "epoch": 5.6843746519656975, "grad_norm": 0.08973538875579834, "learning_rate": 2.3393108032484218e-05, "loss": 0.4608, "num_input_tokens_seen": 61919248, "step": 51040 }, { "epoch": 5.684931506849315, "grad_norm": 0.0888812392950058, "learning_rate": 2.3388258635801825e-05, "loss": 0.4707, "num_input_tokens_seen": 61925136, "step": 51045 }, { "epoch": 5.685488361732933, "grad_norm": 0.11512196063995361, "learning_rate": 2.3383409300016185e-05, "loss": 0.4601, "num_input_tokens_seen": 61931376, "step": 51050 }, { "epoch": 5.68604521661655, "grad_norm": 0.08712414652109146, "learning_rate": 2.337856002531052e-05, "loss": 0.4643, "num_input_tokens_seen": 61936816, "step": 51055 }, { "epoch": 5.686602071500167, "grad_norm": 0.1053600013256073, "learning_rate": 2.3373710811868056e-05, "loss": 0.464, "num_input_tokens_seen": 61942800, "step": 51060 }, { "epoch": 5.687158926383784, "grad_norm": 0.14465105533599854, "learning_rate": 2.336886165987201e-05, "loss": 0.4528, "num_input_tokens_seen": 61948976, "step": 51065 }, { "epoch": 5.687715781267402, "grad_norm": 0.11756831407546997, "learning_rate": 2.33640125695056e-05, "loss": 0.454, "num_input_tokens_seen": 61955408, "step": 51070 }, { "epoch": 5.688272636151019, "grad_norm": 0.14434801042079926, "learning_rate": 2.3359163540952032e-05, "loss": 0.453, "num_input_tokens_seen": 61961456, "step": 51075 }, { "epoch": 5.688829491034636, "grad_norm": 0.1319318413734436, "learning_rate": 2.3354314574394525e-05, "loss": 0.4618, "num_input_tokens_seen": 61967440, "step": 51080 }, { "epoch": 5.689386345918254, "grad_norm": 0.13780520856380463, "learning_rate": 2.334946567001629e-05, "loss": 0.4694, "num_input_tokens_seen": 61973904, "step": 51085 }, { "epoch": 5.689943200801871, "grad_norm": 0.09069101512432098, "learning_rate": 2.3344616828000527e-05, "loss": 0.4703, "num_input_tokens_seen": 61979600, "step": 51090 }, { "epoch": 5.690500055685488, "grad_norm": 0.08116760104894638, "learning_rate": 2.3339768048530447e-05, "loss": 0.4544, "num_input_tokens_seen": 61985520, "step": 51095 }, { "epoch": 5.691056910569106, "grad_norm": 0.08056122809648514, "learning_rate": 2.3334919331789254e-05, "loss": 0.4625, "num_input_tokens_seen": 61991536, "step": 51100 }, { "epoch": 5.691613765452723, "grad_norm": 0.10355876386165619, "learning_rate": 2.3330070677960143e-05, "loss": 0.4627, "num_input_tokens_seen": 61997648, "step": 51105 }, { "epoch": 5.6921706203363405, "grad_norm": 0.09714051336050034, "learning_rate": 2.3325222087226313e-05, "loss": 0.4552, "num_input_tokens_seen": 62004144, "step": 51110 }, { "epoch": 5.692727475219957, "grad_norm": 0.11967191100120544, "learning_rate": 2.3320373559770967e-05, "loss": 0.463, "num_input_tokens_seen": 62009872, "step": 51115 }, { "epoch": 5.693284330103575, "grad_norm": 0.09251118451356888, "learning_rate": 2.3315525095777284e-05, "loss": 0.4447, "num_input_tokens_seen": 62015984, "step": 51120 }, { "epoch": 5.693841184987193, "grad_norm": 0.08141409605741501, "learning_rate": 2.331067669542846e-05, "loss": 0.4557, "num_input_tokens_seen": 62022192, "step": 51125 }, { "epoch": 5.694398039870809, "grad_norm": 0.16051673889160156, "learning_rate": 2.3305828358907687e-05, "loss": 0.4706, "num_input_tokens_seen": 62028176, "step": 51130 }, { "epoch": 5.694954894754427, "grad_norm": 0.10507224500179291, "learning_rate": 2.3300980086398145e-05, "loss": 0.4574, "num_input_tokens_seen": 62034448, "step": 51135 }, { "epoch": 5.695511749638044, "grad_norm": 0.1202661395072937, "learning_rate": 2.3296131878083014e-05, "loss": 0.4776, "num_input_tokens_seen": 62040464, "step": 51140 }, { "epoch": 5.6960686045216615, "grad_norm": 0.10447244346141815, "learning_rate": 2.329128373414549e-05, "loss": 0.4524, "num_input_tokens_seen": 62046832, "step": 51145 }, { "epoch": 5.696625459405279, "grad_norm": 0.13898411393165588, "learning_rate": 2.328643565476874e-05, "loss": 0.4635, "num_input_tokens_seen": 62052944, "step": 51150 }, { "epoch": 5.697182314288896, "grad_norm": 0.07824879139661789, "learning_rate": 2.328158764013594e-05, "loss": 0.4523, "num_input_tokens_seen": 62058672, "step": 51155 }, { "epoch": 5.697739169172514, "grad_norm": 0.08409791439771652, "learning_rate": 2.3276739690430267e-05, "loss": 0.4633, "num_input_tokens_seen": 62065104, "step": 51160 }, { "epoch": 5.698296024056131, "grad_norm": 0.21237345039844513, "learning_rate": 2.3271891805834893e-05, "loss": 0.4638, "num_input_tokens_seen": 62070800, "step": 51165 }, { "epoch": 5.698852878939748, "grad_norm": 0.092449851334095, "learning_rate": 2.3267043986532986e-05, "loss": 0.4436, "num_input_tokens_seen": 62077008, "step": 51170 }, { "epoch": 5.699409733823366, "grad_norm": 0.08512666821479797, "learning_rate": 2.3262196232707704e-05, "loss": 0.4506, "num_input_tokens_seen": 62083152, "step": 51175 }, { "epoch": 5.699966588706983, "grad_norm": 0.07734289020299911, "learning_rate": 2.325734854454222e-05, "loss": 0.4687, "num_input_tokens_seen": 62088912, "step": 51180 }, { "epoch": 5.7005234435906, "grad_norm": 0.09289253503084183, "learning_rate": 2.3252500922219696e-05, "loss": 0.4549, "num_input_tokens_seen": 62094800, "step": 51185 }, { "epoch": 5.701080298474218, "grad_norm": 0.08154530823230743, "learning_rate": 2.3247653365923287e-05, "loss": 0.4686, "num_input_tokens_seen": 62100880, "step": 51190 }, { "epoch": 5.701637153357835, "grad_norm": 0.08877809345722198, "learning_rate": 2.3242805875836148e-05, "loss": 0.4623, "num_input_tokens_seen": 62106992, "step": 51195 }, { "epoch": 5.702194008241452, "grad_norm": 0.09946487098932266, "learning_rate": 2.3237958452141436e-05, "loss": 0.4497, "num_input_tokens_seen": 62113168, "step": 51200 }, { "epoch": 5.70275086312507, "grad_norm": 0.10195740312337875, "learning_rate": 2.3233111095022294e-05, "loss": 0.458, "num_input_tokens_seen": 62119440, "step": 51205 }, { "epoch": 5.703307718008687, "grad_norm": 0.10967613756656647, "learning_rate": 2.322826380466188e-05, "loss": 0.4502, "num_input_tokens_seen": 62125712, "step": 51210 }, { "epoch": 5.7038645728923045, "grad_norm": 0.09969694167375565, "learning_rate": 2.3223416581243342e-05, "loss": 0.447, "num_input_tokens_seen": 62131984, "step": 51215 }, { "epoch": 5.704421427775921, "grad_norm": 0.09476378560066223, "learning_rate": 2.3218569424949816e-05, "loss": 0.4602, "num_input_tokens_seen": 62137808, "step": 51220 }, { "epoch": 5.704978282659539, "grad_norm": 0.11152353882789612, "learning_rate": 2.3213722335964446e-05, "loss": 0.471, "num_input_tokens_seen": 62143760, "step": 51225 }, { "epoch": 5.705535137543157, "grad_norm": 0.10418090969324112, "learning_rate": 2.320887531447037e-05, "loss": 0.47, "num_input_tokens_seen": 62149744, "step": 51230 }, { "epoch": 5.7060919924267735, "grad_norm": 0.08723391592502594, "learning_rate": 2.3204028360650725e-05, "loss": 0.4541, "num_input_tokens_seen": 62155888, "step": 51235 }, { "epoch": 5.706648847310391, "grad_norm": 0.12776802480220795, "learning_rate": 2.319918147468864e-05, "loss": 0.4612, "num_input_tokens_seen": 62161360, "step": 51240 }, { "epoch": 5.707205702194008, "grad_norm": 0.10173092782497406, "learning_rate": 2.3194334656767252e-05, "loss": 0.4587, "num_input_tokens_seen": 62167440, "step": 51245 }, { "epoch": 5.707762557077626, "grad_norm": 0.11496949940919876, "learning_rate": 2.318948790706969e-05, "loss": 0.4631, "num_input_tokens_seen": 62173424, "step": 51250 }, { "epoch": 5.708319411961243, "grad_norm": 0.12974752485752106, "learning_rate": 2.318464122577908e-05, "loss": 0.4571, "num_input_tokens_seen": 62179376, "step": 51255 }, { "epoch": 5.70887626684486, "grad_norm": 0.10223540663719177, "learning_rate": 2.3179794613078535e-05, "loss": 0.4589, "num_input_tokens_seen": 62185424, "step": 51260 }, { "epoch": 5.709433121728478, "grad_norm": 0.0976029708981514, "learning_rate": 2.3174948069151186e-05, "loss": 0.4563, "num_input_tokens_seen": 62191376, "step": 51265 }, { "epoch": 5.7099899766120945, "grad_norm": 0.09397561848163605, "learning_rate": 2.3170101594180144e-05, "loss": 0.4677, "num_input_tokens_seen": 62197712, "step": 51270 }, { "epoch": 5.710546831495712, "grad_norm": 0.1432265043258667, "learning_rate": 2.316525518834853e-05, "loss": 0.4629, "num_input_tokens_seen": 62203536, "step": 51275 }, { "epoch": 5.71110368637933, "grad_norm": 0.1020406112074852, "learning_rate": 2.3160408851839457e-05, "loss": 0.4594, "num_input_tokens_seen": 62209680, "step": 51280 }, { "epoch": 5.711660541262947, "grad_norm": 0.09770788997411728, "learning_rate": 2.3155562584836035e-05, "loss": 0.4715, "num_input_tokens_seen": 62215568, "step": 51285 }, { "epoch": 5.712217396146564, "grad_norm": 0.12476099282503128, "learning_rate": 2.3150716387521364e-05, "loss": 0.4748, "num_input_tokens_seen": 62221648, "step": 51290 }, { "epoch": 5.712774251030181, "grad_norm": 0.08100415766239166, "learning_rate": 2.314587026007856e-05, "loss": 0.4551, "num_input_tokens_seen": 62227472, "step": 51295 }, { "epoch": 5.713331105913799, "grad_norm": 0.09267479181289673, "learning_rate": 2.3141024202690708e-05, "loss": 0.4674, "num_input_tokens_seen": 62233008, "step": 51300 }, { "epoch": 5.7138879607974165, "grad_norm": 0.10465298593044281, "learning_rate": 2.313617821554093e-05, "loss": 0.4539, "num_input_tokens_seen": 62239184, "step": 51305 }, { "epoch": 5.714444815681033, "grad_norm": 0.15737830102443695, "learning_rate": 2.313133229881231e-05, "loss": 0.4593, "num_input_tokens_seen": 62245136, "step": 51310 }, { "epoch": 5.715001670564651, "grad_norm": 0.13373342156410217, "learning_rate": 2.3126486452687944e-05, "loss": 0.4686, "num_input_tokens_seen": 62250928, "step": 51315 }, { "epoch": 5.715558525448268, "grad_norm": 0.1436118632555008, "learning_rate": 2.312164067735092e-05, "loss": 0.4568, "num_input_tokens_seen": 62257104, "step": 51320 }, { "epoch": 5.716115380331885, "grad_norm": 0.0848902091383934, "learning_rate": 2.3116794972984333e-05, "loss": 0.453, "num_input_tokens_seen": 62263216, "step": 51325 }, { "epoch": 5.716672235215503, "grad_norm": 0.10906451940536499, "learning_rate": 2.311194933977127e-05, "loss": 0.4635, "num_input_tokens_seen": 62269456, "step": 51330 }, { "epoch": 5.71722909009912, "grad_norm": 0.1253119707107544, "learning_rate": 2.3107103777894804e-05, "loss": 0.4523, "num_input_tokens_seen": 62275216, "step": 51335 }, { "epoch": 5.7177859449827375, "grad_norm": 0.10560663044452667, "learning_rate": 2.310225828753803e-05, "loss": 0.4609, "num_input_tokens_seen": 62281136, "step": 51340 }, { "epoch": 5.718342799866355, "grad_norm": 0.10206746309995651, "learning_rate": 2.3097412868884015e-05, "loss": 0.463, "num_input_tokens_seen": 62287280, "step": 51345 }, { "epoch": 5.718899654749972, "grad_norm": 0.13716435432434082, "learning_rate": 2.3092567522115845e-05, "loss": 0.4651, "num_input_tokens_seen": 62293264, "step": 51350 }, { "epoch": 5.71945650963359, "grad_norm": 0.10495474189519882, "learning_rate": 2.3087722247416582e-05, "loss": 0.4642, "num_input_tokens_seen": 62298736, "step": 51355 }, { "epoch": 5.720013364517206, "grad_norm": 0.08645512163639069, "learning_rate": 2.3082877044969304e-05, "loss": 0.4602, "num_input_tokens_seen": 62305136, "step": 51360 }, { "epoch": 5.720570219400824, "grad_norm": 0.07229064404964447, "learning_rate": 2.3078031914957068e-05, "loss": 0.4593, "num_input_tokens_seen": 62311248, "step": 51365 }, { "epoch": 5.721127074284442, "grad_norm": 0.09890114516019821, "learning_rate": 2.3073186857562952e-05, "loss": 0.4546, "num_input_tokens_seen": 62316944, "step": 51370 }, { "epoch": 5.721683929168059, "grad_norm": 0.09274362772703171, "learning_rate": 2.306834187297001e-05, "loss": 0.4627, "num_input_tokens_seen": 62323248, "step": 51375 }, { "epoch": 5.722240784051676, "grad_norm": 0.09169390052556992, "learning_rate": 2.3063496961361303e-05, "loss": 0.4532, "num_input_tokens_seen": 62329232, "step": 51380 }, { "epoch": 5.722797638935294, "grad_norm": 0.1361922025680542, "learning_rate": 2.3058652122919885e-05, "loss": 0.469, "num_input_tokens_seen": 62335504, "step": 51385 }, { "epoch": 5.723354493818911, "grad_norm": 0.08716665208339691, "learning_rate": 2.3053807357828813e-05, "loss": 0.4788, "num_input_tokens_seen": 62341712, "step": 51390 }, { "epoch": 5.723911348702528, "grad_norm": 0.09908999502658844, "learning_rate": 2.3048962666271138e-05, "loss": 0.4558, "num_input_tokens_seen": 62347888, "step": 51395 }, { "epoch": 5.724468203586145, "grad_norm": 0.10140147060155869, "learning_rate": 2.3044118048429896e-05, "loss": 0.4695, "num_input_tokens_seen": 62354256, "step": 51400 }, { "epoch": 5.725025058469763, "grad_norm": 0.11940590292215347, "learning_rate": 2.303927350448815e-05, "loss": 0.4675, "num_input_tokens_seen": 62360368, "step": 51405 }, { "epoch": 5.7255819133533805, "grad_norm": 0.10187274217605591, "learning_rate": 2.3034429034628933e-05, "loss": 0.4618, "num_input_tokens_seen": 62366096, "step": 51410 }, { "epoch": 5.726138768236997, "grad_norm": 0.1047239750623703, "learning_rate": 2.3029584639035286e-05, "loss": 0.4561, "num_input_tokens_seen": 62371600, "step": 51415 }, { "epoch": 5.726695623120615, "grad_norm": 0.11414197832345963, "learning_rate": 2.3024740317890248e-05, "loss": 0.4545, "num_input_tokens_seen": 62377584, "step": 51420 }, { "epoch": 5.727252478004232, "grad_norm": 0.08963906019926071, "learning_rate": 2.301989607137685e-05, "loss": 0.4657, "num_input_tokens_seen": 62383248, "step": 51425 }, { "epoch": 5.727809332887849, "grad_norm": 0.1125757023692131, "learning_rate": 2.3015051899678117e-05, "loss": 0.4658, "num_input_tokens_seen": 62389392, "step": 51430 }, { "epoch": 5.728366187771467, "grad_norm": 0.11397339403629303, "learning_rate": 2.301020780297709e-05, "loss": 0.4469, "num_input_tokens_seen": 62395632, "step": 51435 }, { "epoch": 5.728923042655084, "grad_norm": 0.10406753420829773, "learning_rate": 2.300536378145679e-05, "loss": 0.4631, "num_input_tokens_seen": 62401904, "step": 51440 }, { "epoch": 5.729479897538702, "grad_norm": 0.08897513896226883, "learning_rate": 2.300051983530024e-05, "loss": 0.4616, "num_input_tokens_seen": 62407984, "step": 51445 }, { "epoch": 5.730036752422318, "grad_norm": 0.09807299077510834, "learning_rate": 2.2995675964690454e-05, "loss": 0.46, "num_input_tokens_seen": 62414320, "step": 51450 }, { "epoch": 5.730593607305936, "grad_norm": 0.12983126938343048, "learning_rate": 2.2990832169810456e-05, "loss": 0.4485, "num_input_tokens_seen": 62420464, "step": 51455 }, { "epoch": 5.731150462189554, "grad_norm": 0.0987209752202034, "learning_rate": 2.2985988450843258e-05, "loss": 0.4588, "num_input_tokens_seen": 62426128, "step": 51460 }, { "epoch": 5.7317073170731705, "grad_norm": 0.14004990458488464, "learning_rate": 2.2981144807971863e-05, "loss": 0.4547, "num_input_tokens_seen": 62432112, "step": 51465 }, { "epoch": 5.732264171956788, "grad_norm": 0.09594011306762695, "learning_rate": 2.2976301241379296e-05, "loss": 0.475, "num_input_tokens_seen": 62438192, "step": 51470 }, { "epoch": 5.732821026840405, "grad_norm": 0.11970449984073639, "learning_rate": 2.2971457751248548e-05, "loss": 0.4655, "num_input_tokens_seen": 62444208, "step": 51475 }, { "epoch": 5.733377881724023, "grad_norm": 0.09334968775510788, "learning_rate": 2.296661433776263e-05, "loss": 0.4566, "num_input_tokens_seen": 62450192, "step": 51480 }, { "epoch": 5.73393473660764, "grad_norm": 0.1300126314163208, "learning_rate": 2.2961771001104536e-05, "loss": 0.4607, "num_input_tokens_seen": 62456240, "step": 51485 }, { "epoch": 5.734491591491257, "grad_norm": 0.12248746305704117, "learning_rate": 2.295692774145727e-05, "loss": 0.4542, "num_input_tokens_seen": 62462288, "step": 51490 }, { "epoch": 5.735048446374875, "grad_norm": 0.11504718661308289, "learning_rate": 2.2952084559003815e-05, "loss": 0.4711, "num_input_tokens_seen": 62468496, "step": 51495 }, { "epoch": 5.7356053012584916, "grad_norm": 0.10883541405200958, "learning_rate": 2.294724145392717e-05, "loss": 0.4605, "num_input_tokens_seen": 62474576, "step": 51500 }, { "epoch": 5.736162156142109, "grad_norm": 0.07872417569160461, "learning_rate": 2.2942398426410325e-05, "loss": 0.4566, "num_input_tokens_seen": 62480848, "step": 51505 }, { "epoch": 5.736719011025727, "grad_norm": 0.09536787867546082, "learning_rate": 2.293755547663626e-05, "loss": 0.4635, "num_input_tokens_seen": 62486960, "step": 51510 }, { "epoch": 5.737275865909344, "grad_norm": 0.13496725261211395, "learning_rate": 2.293271260478796e-05, "loss": 0.4623, "num_input_tokens_seen": 62493136, "step": 51515 }, { "epoch": 5.737832720792961, "grad_norm": 0.13043807446956635, "learning_rate": 2.29278698110484e-05, "loss": 0.4521, "num_input_tokens_seen": 62499344, "step": 51520 }, { "epoch": 5.738389575676579, "grad_norm": 0.10383327305316925, "learning_rate": 2.2923027095600564e-05, "loss": 0.4615, "num_input_tokens_seen": 62505136, "step": 51525 }, { "epoch": 5.738946430560196, "grad_norm": 0.08895239233970642, "learning_rate": 2.291818445862742e-05, "loss": 0.4569, "num_input_tokens_seen": 62510832, "step": 51530 }, { "epoch": 5.7395032854438135, "grad_norm": 0.12692829966545105, "learning_rate": 2.291334190031193e-05, "loss": 0.4608, "num_input_tokens_seen": 62517136, "step": 51535 }, { "epoch": 5.740060140327431, "grad_norm": 0.14167223870754242, "learning_rate": 2.2908499420837078e-05, "loss": 0.4659, "num_input_tokens_seen": 62523568, "step": 51540 }, { "epoch": 5.740616995211048, "grad_norm": 0.09641857445240021, "learning_rate": 2.290365702038581e-05, "loss": 0.4605, "num_input_tokens_seen": 62529776, "step": 51545 }, { "epoch": 5.741173850094666, "grad_norm": 0.09809136390686035, "learning_rate": 2.2898814699141106e-05, "loss": 0.4668, "num_input_tokens_seen": 62535920, "step": 51550 }, { "epoch": 5.741730704978282, "grad_norm": 0.11259760707616806, "learning_rate": 2.2893972457285917e-05, "loss": 0.46, "num_input_tokens_seen": 62542256, "step": 51555 }, { "epoch": 5.7422875598619, "grad_norm": 0.09336727857589722, "learning_rate": 2.2889130295003194e-05, "loss": 0.467, "num_input_tokens_seen": 62548272, "step": 51560 }, { "epoch": 5.742844414745518, "grad_norm": 0.10277634114027023, "learning_rate": 2.2884288212475895e-05, "loss": 0.4574, "num_input_tokens_seen": 62554352, "step": 51565 }, { "epoch": 5.7434012696291346, "grad_norm": 0.07631019502878189, "learning_rate": 2.2879446209886967e-05, "loss": 0.4695, "num_input_tokens_seen": 62560848, "step": 51570 }, { "epoch": 5.743958124512752, "grad_norm": 0.1061038225889206, "learning_rate": 2.287460428741936e-05, "loss": 0.4553, "num_input_tokens_seen": 62567056, "step": 51575 }, { "epoch": 5.744514979396369, "grad_norm": 0.08812704682350159, "learning_rate": 2.2869762445256003e-05, "loss": 0.452, "num_input_tokens_seen": 62573232, "step": 51580 }, { "epoch": 5.745071834279987, "grad_norm": 0.13325071334838867, "learning_rate": 2.286492068357986e-05, "loss": 0.4614, "num_input_tokens_seen": 62579600, "step": 51585 }, { "epoch": 5.745628689163604, "grad_norm": 0.1313590258359909, "learning_rate": 2.286007900257385e-05, "loss": 0.4521, "num_input_tokens_seen": 62585552, "step": 51590 }, { "epoch": 5.746185544047221, "grad_norm": 0.09638363122940063, "learning_rate": 2.2855237402420917e-05, "loss": 0.474, "num_input_tokens_seen": 62591504, "step": 51595 }, { "epoch": 5.746742398930839, "grad_norm": 0.11875174939632416, "learning_rate": 2.2850395883303986e-05, "loss": 0.4718, "num_input_tokens_seen": 62597904, "step": 51600 }, { "epoch": 5.747299253814456, "grad_norm": 0.09291065484285355, "learning_rate": 2.2845554445405988e-05, "loss": 0.444, "num_input_tokens_seen": 62603856, "step": 51605 }, { "epoch": 5.747856108698073, "grad_norm": 0.09253552556037903, "learning_rate": 2.2840713088909842e-05, "loss": 0.4525, "num_input_tokens_seen": 62608528, "step": 51610 }, { "epoch": 5.748412963581691, "grad_norm": 0.10964052379131317, "learning_rate": 2.2835871813998476e-05, "loss": 0.4742, "num_input_tokens_seen": 62614768, "step": 51615 }, { "epoch": 5.748969818465308, "grad_norm": 0.09838499128818512, "learning_rate": 2.2831030620854816e-05, "loss": 0.4759, "num_input_tokens_seen": 62621232, "step": 51620 }, { "epoch": 5.749526673348925, "grad_norm": 0.15695767104625702, "learning_rate": 2.2826189509661766e-05, "loss": 0.4831, "num_input_tokens_seen": 62626960, "step": 51625 }, { "epoch": 5.750083528232542, "grad_norm": 0.12463846802711487, "learning_rate": 2.282134848060224e-05, "loss": 0.455, "num_input_tokens_seen": 62633072, "step": 51630 }, { "epoch": 5.75064038311616, "grad_norm": 0.07940331101417542, "learning_rate": 2.2816507533859154e-05, "loss": 0.468, "num_input_tokens_seen": 62638608, "step": 51635 }, { "epoch": 5.751197237999778, "grad_norm": 0.11034014075994492, "learning_rate": 2.2811666669615406e-05, "loss": 0.4701, "num_input_tokens_seen": 62644720, "step": 51640 }, { "epoch": 5.751754092883394, "grad_norm": 0.1092986986041069, "learning_rate": 2.2806825888053908e-05, "loss": 0.4557, "num_input_tokens_seen": 62650800, "step": 51645 }, { "epoch": 5.752310947767012, "grad_norm": 0.13633576035499573, "learning_rate": 2.2801985189357555e-05, "loss": 0.4575, "num_input_tokens_seen": 62656816, "step": 51650 }, { "epoch": 5.752867802650629, "grad_norm": 0.10688735544681549, "learning_rate": 2.2797144573709248e-05, "loss": 0.4664, "num_input_tokens_seen": 62662896, "step": 51655 }, { "epoch": 5.7534246575342465, "grad_norm": 0.07616902887821198, "learning_rate": 2.2792304041291877e-05, "loss": 0.4707, "num_input_tokens_seen": 62669104, "step": 51660 }, { "epoch": 5.753981512417864, "grad_norm": 0.08847004175186157, "learning_rate": 2.2787463592288334e-05, "loss": 0.4598, "num_input_tokens_seen": 62675344, "step": 51665 }, { "epoch": 5.754538367301481, "grad_norm": 0.09490086883306503, "learning_rate": 2.2782623226881506e-05, "loss": 0.4738, "num_input_tokens_seen": 62681584, "step": 51670 }, { "epoch": 5.755095222185099, "grad_norm": 0.09878505021333694, "learning_rate": 2.2777782945254276e-05, "loss": 0.4694, "num_input_tokens_seen": 62687184, "step": 51675 }, { "epoch": 5.755652077068715, "grad_norm": 0.10904627293348312, "learning_rate": 2.277294274758953e-05, "loss": 0.4631, "num_input_tokens_seen": 62692080, "step": 51680 }, { "epoch": 5.756208931952333, "grad_norm": 0.1005857065320015, "learning_rate": 2.2768102634070147e-05, "loss": 0.451, "num_input_tokens_seen": 62698192, "step": 51685 }, { "epoch": 5.756765786835951, "grad_norm": 0.0795661062002182, "learning_rate": 2.2763262604878996e-05, "loss": 0.469, "num_input_tokens_seen": 62704336, "step": 51690 }, { "epoch": 5.7573226417195675, "grad_norm": 0.11245977133512497, "learning_rate": 2.2758422660198952e-05, "loss": 0.4573, "num_input_tokens_seen": 62710064, "step": 51695 }, { "epoch": 5.757879496603185, "grad_norm": 0.08365265280008316, "learning_rate": 2.2753582800212885e-05, "loss": 0.4651, "num_input_tokens_seen": 62716528, "step": 51700 }, { "epoch": 5.758436351486803, "grad_norm": 0.13221828639507294, "learning_rate": 2.2748743025103653e-05, "loss": 0.4664, "num_input_tokens_seen": 62722352, "step": 51705 }, { "epoch": 5.75899320637042, "grad_norm": 0.10826409608125687, "learning_rate": 2.2743903335054127e-05, "loss": 0.4571, "num_input_tokens_seen": 62728752, "step": 51710 }, { "epoch": 5.759550061254037, "grad_norm": 0.10542619228363037, "learning_rate": 2.2739063730247168e-05, "loss": 0.4686, "num_input_tokens_seen": 62734640, "step": 51715 }, { "epoch": 5.760106916137655, "grad_norm": 0.10964034497737885, "learning_rate": 2.2734224210865622e-05, "loss": 0.4614, "num_input_tokens_seen": 62740144, "step": 51720 }, { "epoch": 5.760663771021272, "grad_norm": 0.0998329445719719, "learning_rate": 2.272938477709235e-05, "loss": 0.4587, "num_input_tokens_seen": 62746224, "step": 51725 }, { "epoch": 5.7612206259048895, "grad_norm": 0.10752225667238235, "learning_rate": 2.2724545429110193e-05, "loss": 0.4631, "num_input_tokens_seen": 62752112, "step": 51730 }, { "epoch": 5.761777480788506, "grad_norm": 0.10969987511634827, "learning_rate": 2.2719706167102008e-05, "loss": 0.4699, "num_input_tokens_seen": 62758512, "step": 51735 }, { "epoch": 5.762334335672124, "grad_norm": 0.11535333096981049, "learning_rate": 2.2714866991250622e-05, "loss": 0.4656, "num_input_tokens_seen": 62764688, "step": 51740 }, { "epoch": 5.762891190555742, "grad_norm": 0.09352296590805054, "learning_rate": 2.271002790173889e-05, "loss": 0.4601, "num_input_tokens_seen": 62770160, "step": 51745 }, { "epoch": 5.763448045439358, "grad_norm": 0.08772549033164978, "learning_rate": 2.2705188898749643e-05, "loss": 0.4648, "num_input_tokens_seen": 62776368, "step": 51750 }, { "epoch": 5.764004900322976, "grad_norm": 0.09662167727947235, "learning_rate": 2.2700349982465714e-05, "loss": 0.4666, "num_input_tokens_seen": 62782224, "step": 51755 }, { "epoch": 5.764561755206593, "grad_norm": 0.13063059747219086, "learning_rate": 2.2695511153069933e-05, "loss": 0.4645, "num_input_tokens_seen": 62788368, "step": 51760 }, { "epoch": 5.7651186100902105, "grad_norm": 0.0952831581234932, "learning_rate": 2.2690672410745124e-05, "loss": 0.4679, "num_input_tokens_seen": 62794640, "step": 51765 }, { "epoch": 5.765675464973828, "grad_norm": 0.07743053883314133, "learning_rate": 2.2685833755674104e-05, "loss": 0.4667, "num_input_tokens_seen": 62800688, "step": 51770 }, { "epoch": 5.766232319857445, "grad_norm": 0.09861165285110474, "learning_rate": 2.268099518803971e-05, "loss": 0.465, "num_input_tokens_seen": 62806608, "step": 51775 }, { "epoch": 5.766789174741063, "grad_norm": 0.1016744002699852, "learning_rate": 2.267615670802475e-05, "loss": 0.4531, "num_input_tokens_seen": 62812560, "step": 51780 }, { "epoch": 5.7673460296246795, "grad_norm": 0.09629477560520172, "learning_rate": 2.2671318315812033e-05, "loss": 0.4547, "num_input_tokens_seen": 62818288, "step": 51785 }, { "epoch": 5.767902884508297, "grad_norm": 0.09699160605669022, "learning_rate": 2.2666480011584378e-05, "loss": 0.4602, "num_input_tokens_seen": 62824336, "step": 51790 }, { "epoch": 5.768459739391915, "grad_norm": 0.11021649837493896, "learning_rate": 2.266164179552458e-05, "loss": 0.4633, "num_input_tokens_seen": 62830512, "step": 51795 }, { "epoch": 5.769016594275532, "grad_norm": 0.10990870743989944, "learning_rate": 2.2656803667815447e-05, "loss": 0.465, "num_input_tokens_seen": 62836816, "step": 51800 }, { "epoch": 5.769573449159149, "grad_norm": 0.11117570102214813, "learning_rate": 2.2651965628639786e-05, "loss": 0.4633, "num_input_tokens_seen": 62843280, "step": 51805 }, { "epoch": 5.770130304042766, "grad_norm": 0.12193789333105087, "learning_rate": 2.2647127678180388e-05, "loss": 0.466, "num_input_tokens_seen": 62848144, "step": 51810 }, { "epoch": 5.770687158926384, "grad_norm": 0.10839871317148209, "learning_rate": 2.2642289816620048e-05, "loss": 0.4722, "num_input_tokens_seen": 62854224, "step": 51815 }, { "epoch": 5.771244013810001, "grad_norm": 0.08750761300325394, "learning_rate": 2.263745204414155e-05, "loss": 0.4664, "num_input_tokens_seen": 62860208, "step": 51820 }, { "epoch": 5.771800868693618, "grad_norm": 0.09863840043544769, "learning_rate": 2.2632614360927687e-05, "loss": 0.4567, "num_input_tokens_seen": 62866288, "step": 51825 }, { "epoch": 5.772357723577236, "grad_norm": 0.1401660144329071, "learning_rate": 2.2627776767161242e-05, "loss": 0.4555, "num_input_tokens_seen": 62872176, "step": 51830 }, { "epoch": 5.772914578460853, "grad_norm": 0.09144662320613861, "learning_rate": 2.2622939263024987e-05, "loss": 0.4604, "num_input_tokens_seen": 62878448, "step": 51835 }, { "epoch": 5.77347143334447, "grad_norm": 0.0939304456114769, "learning_rate": 2.2618101848701706e-05, "loss": 0.456, "num_input_tokens_seen": 62884080, "step": 51840 }, { "epoch": 5.774028288228088, "grad_norm": 0.07922147214412689, "learning_rate": 2.261326452437417e-05, "loss": 0.4653, "num_input_tokens_seen": 62889744, "step": 51845 }, { "epoch": 5.774585143111705, "grad_norm": 0.1194901391863823, "learning_rate": 2.2608427290225156e-05, "loss": 0.4626, "num_input_tokens_seen": 62895888, "step": 51850 }, { "epoch": 5.7751419979953225, "grad_norm": 0.11897125095129013, "learning_rate": 2.260359014643742e-05, "loss": 0.4645, "num_input_tokens_seen": 62901552, "step": 51855 }, { "epoch": 5.775698852878939, "grad_norm": 0.08595802634954453, "learning_rate": 2.2598753093193725e-05, "loss": 0.465, "num_input_tokens_seen": 62907760, "step": 51860 }, { "epoch": 5.776255707762557, "grad_norm": 0.10772515088319778, "learning_rate": 2.2593916130676832e-05, "loss": 0.4603, "num_input_tokens_seen": 62913904, "step": 51865 }, { "epoch": 5.776812562646175, "grad_norm": 0.13912171125411987, "learning_rate": 2.2589079259069502e-05, "loss": 0.4574, "num_input_tokens_seen": 62919856, "step": 51870 }, { "epoch": 5.777369417529791, "grad_norm": 0.10585150867700577, "learning_rate": 2.2584242478554484e-05, "loss": 0.4498, "num_input_tokens_seen": 62925808, "step": 51875 }, { "epoch": 5.777926272413409, "grad_norm": 0.07977932691574097, "learning_rate": 2.2579405789314523e-05, "loss": 0.4689, "num_input_tokens_seen": 62931760, "step": 51880 }, { "epoch": 5.778483127297027, "grad_norm": 0.07633758336305618, "learning_rate": 2.257456919153237e-05, "loss": 0.4486, "num_input_tokens_seen": 62938288, "step": 51885 }, { "epoch": 5.7790399821806435, "grad_norm": 0.10085985064506531, "learning_rate": 2.256973268539077e-05, "loss": 0.4698, "num_input_tokens_seen": 62944304, "step": 51890 }, { "epoch": 5.779596837064261, "grad_norm": 0.09594572335481644, "learning_rate": 2.2564896271072455e-05, "loss": 0.4695, "num_input_tokens_seen": 62950768, "step": 51895 }, { "epoch": 5.780153691947879, "grad_norm": 0.10699419677257538, "learning_rate": 2.2560059948760156e-05, "loss": 0.4679, "num_input_tokens_seen": 62957232, "step": 51900 }, { "epoch": 5.780710546831496, "grad_norm": 0.0961342602968216, "learning_rate": 2.2555223718636617e-05, "loss": 0.467, "num_input_tokens_seen": 62963376, "step": 51905 }, { "epoch": 5.781267401715113, "grad_norm": 0.09713703393936157, "learning_rate": 2.255038758088456e-05, "loss": 0.4479, "num_input_tokens_seen": 62969584, "step": 51910 }, { "epoch": 5.78182425659873, "grad_norm": 0.09563732892274857, "learning_rate": 2.254555153568671e-05, "loss": 0.4671, "num_input_tokens_seen": 62975920, "step": 51915 }, { "epoch": 5.782381111482348, "grad_norm": 0.147552490234375, "learning_rate": 2.2540715583225793e-05, "loss": 0.4481, "num_input_tokens_seen": 62982576, "step": 51920 }, { "epoch": 5.7829379663659655, "grad_norm": 0.07925815135240555, "learning_rate": 2.2535879723684518e-05, "loss": 0.4831, "num_input_tokens_seen": 62988624, "step": 51925 }, { "epoch": 5.783494821249582, "grad_norm": 0.11290911585092545, "learning_rate": 2.25310439572456e-05, "loss": 0.4539, "num_input_tokens_seen": 62994000, "step": 51930 }, { "epoch": 5.7840516761332, "grad_norm": 0.07945030927658081, "learning_rate": 2.252620828409177e-05, "loss": 0.465, "num_input_tokens_seen": 62999888, "step": 51935 }, { "epoch": 5.784608531016817, "grad_norm": 0.09621656686067581, "learning_rate": 2.252137270440571e-05, "loss": 0.4675, "num_input_tokens_seen": 63005936, "step": 51940 }, { "epoch": 5.785165385900434, "grad_norm": 0.11766945570707321, "learning_rate": 2.251653721837013e-05, "loss": 0.4598, "num_input_tokens_seen": 63011920, "step": 51945 }, { "epoch": 5.785722240784052, "grad_norm": 0.08802885562181473, "learning_rate": 2.2511701826167723e-05, "loss": 0.4619, "num_input_tokens_seen": 63018000, "step": 51950 }, { "epoch": 5.786279095667669, "grad_norm": 0.10417382419109344, "learning_rate": 2.2506866527981206e-05, "loss": 0.4593, "num_input_tokens_seen": 63024080, "step": 51955 }, { "epoch": 5.7868359505512865, "grad_norm": 0.07494717091321945, "learning_rate": 2.2502031323993257e-05, "loss": 0.4561, "num_input_tokens_seen": 63030320, "step": 51960 }, { "epoch": 5.787392805434903, "grad_norm": 0.13150450587272644, "learning_rate": 2.2497196214386574e-05, "loss": 0.4636, "num_input_tokens_seen": 63036464, "step": 51965 }, { "epoch": 5.787949660318521, "grad_norm": 0.1319034844636917, "learning_rate": 2.2492361199343836e-05, "loss": 0.4648, "num_input_tokens_seen": 63042544, "step": 51970 }, { "epoch": 5.788506515202139, "grad_norm": 0.09252624958753586, "learning_rate": 2.248752627904773e-05, "loss": 0.4512, "num_input_tokens_seen": 63048752, "step": 51975 }, { "epoch": 5.7890633700857554, "grad_norm": 0.0858168974518776, "learning_rate": 2.2482691453680926e-05, "loss": 0.46, "num_input_tokens_seen": 63053808, "step": 51980 }, { "epoch": 5.789620224969373, "grad_norm": 0.10393863171339035, "learning_rate": 2.2477856723426114e-05, "loss": 0.4714, "num_input_tokens_seen": 63060048, "step": 51985 }, { "epoch": 5.79017707985299, "grad_norm": 0.11208342015743256, "learning_rate": 2.2473022088465954e-05, "loss": 0.4589, "num_input_tokens_seen": 63066288, "step": 51990 }, { "epoch": 5.790733934736608, "grad_norm": 0.12930850684642792, "learning_rate": 2.2468187548983118e-05, "loss": 0.4697, "num_input_tokens_seen": 63072496, "step": 51995 }, { "epoch": 5.791290789620225, "grad_norm": 0.08162502944469452, "learning_rate": 2.2463353105160273e-05, "loss": 0.4556, "num_input_tokens_seen": 63078384, "step": 52000 }, { "epoch": 5.791847644503842, "grad_norm": 0.0700010433793068, "learning_rate": 2.2458518757180076e-05, "loss": 0.4574, "num_input_tokens_seen": 63084592, "step": 52005 }, { "epoch": 5.79240449938746, "grad_norm": 0.11166791617870331, "learning_rate": 2.245368450522518e-05, "loss": 0.4516, "num_input_tokens_seen": 63090480, "step": 52010 }, { "epoch": 5.7929613542710765, "grad_norm": 0.14037249982357025, "learning_rate": 2.2448850349478243e-05, "loss": 0.459, "num_input_tokens_seen": 63096112, "step": 52015 }, { "epoch": 5.793518209154694, "grad_norm": 0.09421362727880478, "learning_rate": 2.2444016290121918e-05, "loss": 0.4534, "num_input_tokens_seen": 63102224, "step": 52020 }, { "epoch": 5.794075064038312, "grad_norm": 0.09006842970848083, "learning_rate": 2.243918232733885e-05, "loss": 0.4629, "num_input_tokens_seen": 63108400, "step": 52025 }, { "epoch": 5.794631918921929, "grad_norm": 0.10028388351202011, "learning_rate": 2.2434348461311684e-05, "loss": 0.4571, "num_input_tokens_seen": 63114224, "step": 52030 }, { "epoch": 5.795188773805546, "grad_norm": 0.0928255021572113, "learning_rate": 2.242951469222305e-05, "loss": 0.466, "num_input_tokens_seen": 63120080, "step": 52035 }, { "epoch": 5.795745628689164, "grad_norm": 0.08537834137678146, "learning_rate": 2.242468102025559e-05, "loss": 0.4632, "num_input_tokens_seen": 63125968, "step": 52040 }, { "epoch": 5.796302483572781, "grad_norm": 0.1364588886499405, "learning_rate": 2.2419847445591928e-05, "loss": 0.463, "num_input_tokens_seen": 63131952, "step": 52045 }, { "epoch": 5.7968593384563984, "grad_norm": 0.09679359942674637, "learning_rate": 2.2415013968414703e-05, "loss": 0.4545, "num_input_tokens_seen": 63138160, "step": 52050 }, { "epoch": 5.797416193340015, "grad_norm": 0.07994002103805542, "learning_rate": 2.2410180588906534e-05, "loss": 0.4713, "num_input_tokens_seen": 63144336, "step": 52055 }, { "epoch": 5.797973048223633, "grad_norm": 0.0854840874671936, "learning_rate": 2.2405347307250043e-05, "loss": 0.4681, "num_input_tokens_seen": 63150640, "step": 52060 }, { "epoch": 5.798529903107251, "grad_norm": 0.10756546258926392, "learning_rate": 2.240051412362785e-05, "loss": 0.4591, "num_input_tokens_seen": 63156784, "step": 52065 }, { "epoch": 5.799086757990867, "grad_norm": 0.06961242854595184, "learning_rate": 2.2395681038222556e-05, "loss": 0.4688, "num_input_tokens_seen": 63162736, "step": 52070 }, { "epoch": 5.799643612874485, "grad_norm": 0.10823296755552292, "learning_rate": 2.2390848051216783e-05, "loss": 0.4562, "num_input_tokens_seen": 63168816, "step": 52075 }, { "epoch": 5.800200467758103, "grad_norm": 0.1433129757642746, "learning_rate": 2.2386015162793125e-05, "loss": 0.4753, "num_input_tokens_seen": 63174800, "step": 52080 }, { "epoch": 5.8007573226417195, "grad_norm": 0.10096275806427002, "learning_rate": 2.2381182373134198e-05, "loss": 0.4519, "num_input_tokens_seen": 63180688, "step": 52085 }, { "epoch": 5.801314177525337, "grad_norm": 0.09524291008710861, "learning_rate": 2.2376349682422592e-05, "loss": 0.451, "num_input_tokens_seen": 63185968, "step": 52090 }, { "epoch": 5.801871032408954, "grad_norm": 0.13528215885162354, "learning_rate": 2.2371517090840903e-05, "loss": 0.455, "num_input_tokens_seen": 63191984, "step": 52095 }, { "epoch": 5.802427887292572, "grad_norm": 0.09016334265470505, "learning_rate": 2.2366684598571723e-05, "loss": 0.4625, "num_input_tokens_seen": 63198192, "step": 52100 }, { "epoch": 5.802984742176189, "grad_norm": 0.15377162396907806, "learning_rate": 2.2361852205797638e-05, "loss": 0.4652, "num_input_tokens_seen": 63203888, "step": 52105 }, { "epoch": 5.803541597059806, "grad_norm": 0.10277709364891052, "learning_rate": 2.235701991270122e-05, "loss": 0.453, "num_input_tokens_seen": 63210000, "step": 52110 }, { "epoch": 5.804098451943424, "grad_norm": 0.11089518666267395, "learning_rate": 2.2352187719465073e-05, "loss": 0.4603, "num_input_tokens_seen": 63216048, "step": 52115 }, { "epoch": 5.804655306827041, "grad_norm": 0.10587160289287567, "learning_rate": 2.2347355626271756e-05, "loss": 0.4631, "num_input_tokens_seen": 63221872, "step": 52120 }, { "epoch": 5.805212161710658, "grad_norm": 0.11036518961191177, "learning_rate": 2.2342523633303842e-05, "loss": 0.4711, "num_input_tokens_seen": 63228016, "step": 52125 }, { "epoch": 5.805769016594276, "grad_norm": 0.08627214282751083, "learning_rate": 2.2337691740743906e-05, "loss": 0.4648, "num_input_tokens_seen": 63233968, "step": 52130 }, { "epoch": 5.806325871477893, "grad_norm": 0.10028114169836044, "learning_rate": 2.2332859948774505e-05, "loss": 0.4621, "num_input_tokens_seen": 63239856, "step": 52135 }, { "epoch": 5.80688272636151, "grad_norm": 0.06434468179941177, "learning_rate": 2.2328028257578196e-05, "loss": 0.4509, "num_input_tokens_seen": 63245872, "step": 52140 }, { "epoch": 5.807439581245127, "grad_norm": 0.1080702394247055, "learning_rate": 2.232319666733755e-05, "loss": 0.4613, "num_input_tokens_seen": 63252368, "step": 52145 }, { "epoch": 5.807996436128745, "grad_norm": 0.077133908867836, "learning_rate": 2.231836517823511e-05, "loss": 0.4622, "num_input_tokens_seen": 63258224, "step": 52150 }, { "epoch": 5.8085532910123625, "grad_norm": 0.09739410132169724, "learning_rate": 2.231353379045343e-05, "loss": 0.4515, "num_input_tokens_seen": 63264272, "step": 52155 }, { "epoch": 5.809110145895979, "grad_norm": 0.08153557032346725, "learning_rate": 2.2308702504175048e-05, "loss": 0.4638, "num_input_tokens_seen": 63270448, "step": 52160 }, { "epoch": 5.809667000779597, "grad_norm": 0.10112659633159637, "learning_rate": 2.230387131958251e-05, "loss": 0.4699, "num_input_tokens_seen": 63276496, "step": 52165 }, { "epoch": 5.810223855663214, "grad_norm": 0.13165901601314545, "learning_rate": 2.2299040236858358e-05, "loss": 0.4695, "num_input_tokens_seen": 63282480, "step": 52170 }, { "epoch": 5.810780710546831, "grad_norm": 0.1614445298910141, "learning_rate": 2.2294209256185117e-05, "loss": 0.4666, "num_input_tokens_seen": 63288368, "step": 52175 }, { "epoch": 5.811337565430449, "grad_norm": 0.12805569171905518, "learning_rate": 2.2289378377745324e-05, "loss": 0.4543, "num_input_tokens_seen": 63294416, "step": 52180 }, { "epoch": 5.811894420314066, "grad_norm": 0.13521379232406616, "learning_rate": 2.2284547601721504e-05, "loss": 0.4728, "num_input_tokens_seen": 63300176, "step": 52185 }, { "epoch": 5.812451275197684, "grad_norm": 0.08138877898454666, "learning_rate": 2.2279716928296173e-05, "loss": 0.4428, "num_input_tokens_seen": 63306032, "step": 52190 }, { "epoch": 5.8130081300813, "grad_norm": 0.10753183811903, "learning_rate": 2.227488635765186e-05, "loss": 0.4723, "num_input_tokens_seen": 63311984, "step": 52195 }, { "epoch": 5.813564984964918, "grad_norm": 0.10597041994333267, "learning_rate": 2.227005588997107e-05, "loss": 0.4676, "num_input_tokens_seen": 63318448, "step": 52200 }, { "epoch": 5.814121839848536, "grad_norm": 0.08304227143526077, "learning_rate": 2.2265225525436316e-05, "loss": 0.4673, "num_input_tokens_seen": 63324400, "step": 52205 }, { "epoch": 5.8146786947321525, "grad_norm": 0.11087986081838608, "learning_rate": 2.2260395264230106e-05, "loss": 0.4681, "num_input_tokens_seen": 63330736, "step": 52210 }, { "epoch": 5.81523554961577, "grad_norm": 0.11196605861186981, "learning_rate": 2.2255565106534944e-05, "loss": 0.4606, "num_input_tokens_seen": 63337200, "step": 52215 }, { "epoch": 5.815792404499388, "grad_norm": 0.1250603348016739, "learning_rate": 2.2250735052533328e-05, "loss": 0.461, "num_input_tokens_seen": 63343344, "step": 52220 }, { "epoch": 5.816349259383005, "grad_norm": 0.09381459653377533, "learning_rate": 2.224590510240775e-05, "loss": 0.4572, "num_input_tokens_seen": 63349392, "step": 52225 }, { "epoch": 5.816906114266622, "grad_norm": 0.15314026176929474, "learning_rate": 2.2241075256340706e-05, "loss": 0.4735, "num_input_tokens_seen": 63355024, "step": 52230 }, { "epoch": 5.817462969150239, "grad_norm": 0.07839306443929672, "learning_rate": 2.223624551451468e-05, "loss": 0.4762, "num_input_tokens_seen": 63361328, "step": 52235 }, { "epoch": 5.818019824033857, "grad_norm": 0.14448505640029907, "learning_rate": 2.223141587711215e-05, "loss": 0.4717, "num_input_tokens_seen": 63367344, "step": 52240 }, { "epoch": 5.818576678917474, "grad_norm": 0.09185630083084106, "learning_rate": 2.2226586344315603e-05, "loss": 0.4668, "num_input_tokens_seen": 63373264, "step": 52245 }, { "epoch": 5.819133533801091, "grad_norm": 0.0922614261507988, "learning_rate": 2.222175691630751e-05, "loss": 0.4569, "num_input_tokens_seen": 63379600, "step": 52250 }, { "epoch": 5.819690388684709, "grad_norm": 0.08389893174171448, "learning_rate": 2.221692759327035e-05, "loss": 0.4688, "num_input_tokens_seen": 63385008, "step": 52255 }, { "epoch": 5.820247243568327, "grad_norm": 0.08476435393095016, "learning_rate": 2.2212098375386582e-05, "loss": 0.4712, "num_input_tokens_seen": 63391024, "step": 52260 }, { "epoch": 5.820804098451943, "grad_norm": 0.07905139029026031, "learning_rate": 2.2207269262838672e-05, "loss": 0.4586, "num_input_tokens_seen": 63397264, "step": 52265 }, { "epoch": 5.821360953335561, "grad_norm": 0.11828002333641052, "learning_rate": 2.2202440255809075e-05, "loss": 0.4636, "num_input_tokens_seen": 63403344, "step": 52270 }, { "epoch": 5.821917808219178, "grad_norm": 0.09953952580690384, "learning_rate": 2.2197611354480252e-05, "loss": 0.4622, "num_input_tokens_seen": 63409584, "step": 52275 }, { "epoch": 5.8224746631027955, "grad_norm": 0.11883123964071274, "learning_rate": 2.2192782559034657e-05, "loss": 0.4527, "num_input_tokens_seen": 63415504, "step": 52280 }, { "epoch": 5.823031517986413, "grad_norm": 0.13821794092655182, "learning_rate": 2.218795386965473e-05, "loss": 0.4582, "num_input_tokens_seen": 63421776, "step": 52285 }, { "epoch": 5.82358837287003, "grad_norm": 0.09326634556055069, "learning_rate": 2.2183125286522917e-05, "loss": 0.4576, "num_input_tokens_seen": 63427920, "step": 52290 }, { "epoch": 5.824145227753648, "grad_norm": 0.10295601189136505, "learning_rate": 2.217829680982166e-05, "loss": 0.4702, "num_input_tokens_seen": 63434096, "step": 52295 }, { "epoch": 5.824702082637264, "grad_norm": 0.08637436479330063, "learning_rate": 2.217346843973339e-05, "loss": 0.4599, "num_input_tokens_seen": 63440272, "step": 52300 }, { "epoch": 5.825258937520882, "grad_norm": 0.09942704439163208, "learning_rate": 2.216864017644054e-05, "loss": 0.4614, "num_input_tokens_seen": 63446256, "step": 52305 }, { "epoch": 5.8258157924045, "grad_norm": 0.09103985130786896, "learning_rate": 2.216381202012554e-05, "loss": 0.4725, "num_input_tokens_seen": 63452272, "step": 52310 }, { "epoch": 5.8263726472881165, "grad_norm": 0.10422985255718231, "learning_rate": 2.215898397097081e-05, "loss": 0.4762, "num_input_tokens_seen": 63457616, "step": 52315 }, { "epoch": 5.826929502171734, "grad_norm": 0.08701298385858536, "learning_rate": 2.215415602915877e-05, "loss": 0.4596, "num_input_tokens_seen": 63463952, "step": 52320 }, { "epoch": 5.827486357055351, "grad_norm": 0.08576450496912003, "learning_rate": 2.2149328194871833e-05, "loss": 0.4739, "num_input_tokens_seen": 63470128, "step": 52325 }, { "epoch": 5.828043211938969, "grad_norm": 0.16674695909023285, "learning_rate": 2.2144500468292415e-05, "loss": 0.4646, "num_input_tokens_seen": 63476240, "step": 52330 }, { "epoch": 5.828600066822586, "grad_norm": 0.13368652760982513, "learning_rate": 2.2139672849602914e-05, "loss": 0.4655, "num_input_tokens_seen": 63482352, "step": 52335 }, { "epoch": 5.829156921706203, "grad_norm": 0.09928683936595917, "learning_rate": 2.2134845338985747e-05, "loss": 0.4623, "num_input_tokens_seen": 63488528, "step": 52340 }, { "epoch": 5.829713776589821, "grad_norm": 0.10670914500951767, "learning_rate": 2.21300179366233e-05, "loss": 0.4735, "num_input_tokens_seen": 63494352, "step": 52345 }, { "epoch": 5.830270631473438, "grad_norm": 0.10569950938224792, "learning_rate": 2.2125190642697987e-05, "loss": 0.4677, "num_input_tokens_seen": 63499216, "step": 52350 }, { "epoch": 5.830827486357055, "grad_norm": 0.10305166989564896, "learning_rate": 2.212036345739217e-05, "loss": 0.4583, "num_input_tokens_seen": 63505296, "step": 52355 }, { "epoch": 5.831384341240673, "grad_norm": 0.07595314830541611, "learning_rate": 2.2115536380888253e-05, "loss": 0.4508, "num_input_tokens_seen": 63511504, "step": 52360 }, { "epoch": 5.83194119612429, "grad_norm": 0.11140698194503784, "learning_rate": 2.2110709413368612e-05, "loss": 0.4498, "num_input_tokens_seen": 63518192, "step": 52365 }, { "epoch": 5.832498051007907, "grad_norm": 0.11934176087379456, "learning_rate": 2.2105882555015634e-05, "loss": 0.4548, "num_input_tokens_seen": 63524400, "step": 52370 }, { "epoch": 5.833054905891524, "grad_norm": 0.08986479789018631, "learning_rate": 2.2101055806011685e-05, "loss": 0.4562, "num_input_tokens_seen": 63530384, "step": 52375 }, { "epoch": 5.833611760775142, "grad_norm": 0.07573463022708893, "learning_rate": 2.2096229166539135e-05, "loss": 0.4512, "num_input_tokens_seen": 63536592, "step": 52380 }, { "epoch": 5.8341686156587595, "grad_norm": 0.09924950450658798, "learning_rate": 2.209140263678035e-05, "loss": 0.4734, "num_input_tokens_seen": 63542832, "step": 52385 }, { "epoch": 5.834725470542376, "grad_norm": 0.12758377194404602, "learning_rate": 2.20865762169177e-05, "loss": 0.469, "num_input_tokens_seen": 63548784, "step": 52390 }, { "epoch": 5.835282325425994, "grad_norm": 0.13773733377456665, "learning_rate": 2.208174990713354e-05, "loss": 0.45, "num_input_tokens_seen": 63555056, "step": 52395 }, { "epoch": 5.835839180309612, "grad_norm": 0.15990088880062103, "learning_rate": 2.2076923707610218e-05, "loss": 0.4774, "num_input_tokens_seen": 63561168, "step": 52400 }, { "epoch": 5.8363960351932285, "grad_norm": 0.1340760439634323, "learning_rate": 2.2072097618530086e-05, "loss": 0.4689, "num_input_tokens_seen": 63567408, "step": 52405 }, { "epoch": 5.836952890076846, "grad_norm": 0.07909898459911346, "learning_rate": 2.206727164007549e-05, "loss": 0.4599, "num_input_tokens_seen": 63573680, "step": 52410 }, { "epoch": 5.837509744960463, "grad_norm": 0.09313199669122696, "learning_rate": 2.206244577242876e-05, "loss": 0.4569, "num_input_tokens_seen": 63579824, "step": 52415 }, { "epoch": 5.838066599844081, "grad_norm": 0.1575319916009903, "learning_rate": 2.205762001577225e-05, "loss": 0.4623, "num_input_tokens_seen": 63586352, "step": 52420 }, { "epoch": 5.838623454727698, "grad_norm": 0.11997170746326447, "learning_rate": 2.2052794370288287e-05, "loss": 0.4657, "num_input_tokens_seen": 63592304, "step": 52425 }, { "epoch": 5.839180309611315, "grad_norm": 0.1381002962589264, "learning_rate": 2.2047968836159194e-05, "loss": 0.4673, "num_input_tokens_seen": 63598608, "step": 52430 }, { "epoch": 5.839737164494933, "grad_norm": 0.10592067986726761, "learning_rate": 2.2043143413567303e-05, "loss": 0.4646, "num_input_tokens_seen": 63604656, "step": 52435 }, { "epoch": 5.84029401937855, "grad_norm": 0.08126655220985413, "learning_rate": 2.2038318102694926e-05, "loss": 0.4579, "num_input_tokens_seen": 63610832, "step": 52440 }, { "epoch": 5.840850874262167, "grad_norm": 0.0919906497001648, "learning_rate": 2.2033492903724385e-05, "loss": 0.4587, "num_input_tokens_seen": 63617200, "step": 52445 }, { "epoch": 5.841407729145785, "grad_norm": 0.10436476022005081, "learning_rate": 2.2028667816837982e-05, "loss": 0.4649, "num_input_tokens_seen": 63623472, "step": 52450 }, { "epoch": 5.841964584029402, "grad_norm": 0.13444435596466064, "learning_rate": 2.2023842842218035e-05, "loss": 0.4709, "num_input_tokens_seen": 63629616, "step": 52455 }, { "epoch": 5.842521438913019, "grad_norm": 0.12145236879587173, "learning_rate": 2.2019017980046848e-05, "loss": 0.4539, "num_input_tokens_seen": 63635824, "step": 52460 }, { "epoch": 5.843078293796637, "grad_norm": 0.11428173631429672, "learning_rate": 2.201419323050671e-05, "loss": 0.4671, "num_input_tokens_seen": 63642032, "step": 52465 }, { "epoch": 5.843635148680254, "grad_norm": 0.07496658712625504, "learning_rate": 2.200936859377992e-05, "loss": 0.4626, "num_input_tokens_seen": 63647824, "step": 52470 }, { "epoch": 5.8441920035638715, "grad_norm": 0.11156926304101944, "learning_rate": 2.2004544070048767e-05, "loss": 0.45, "num_input_tokens_seen": 63653648, "step": 52475 }, { "epoch": 5.844748858447488, "grad_norm": 0.0985904335975647, "learning_rate": 2.1999719659495537e-05, "loss": 0.4685, "num_input_tokens_seen": 63659920, "step": 52480 }, { "epoch": 5.845305713331106, "grad_norm": 0.10498170554637909, "learning_rate": 2.199489536230252e-05, "loss": 0.4572, "num_input_tokens_seen": 63665776, "step": 52485 }, { "epoch": 5.845862568214724, "grad_norm": 0.08873828500509262, "learning_rate": 2.1990071178651983e-05, "loss": 0.4723, "num_input_tokens_seen": 63671856, "step": 52490 }, { "epoch": 5.84641942309834, "grad_norm": 0.05884821340441704, "learning_rate": 2.1985247108726202e-05, "loss": 0.4624, "num_input_tokens_seen": 63677872, "step": 52495 }, { "epoch": 5.846976277981958, "grad_norm": 0.08513811230659485, "learning_rate": 2.198042315270745e-05, "loss": 0.4619, "num_input_tokens_seen": 63683952, "step": 52500 }, { "epoch": 5.847533132865575, "grad_norm": 0.11064919829368591, "learning_rate": 2.1975599310777985e-05, "loss": 0.4547, "num_input_tokens_seen": 63690192, "step": 52505 }, { "epoch": 5.8480899877491925, "grad_norm": 0.10401599109172821, "learning_rate": 2.1970775583120074e-05, "loss": 0.4559, "num_input_tokens_seen": 63696176, "step": 52510 }, { "epoch": 5.84864684263281, "grad_norm": 0.08312100172042847, "learning_rate": 2.196595196991596e-05, "loss": 0.4557, "num_input_tokens_seen": 63702128, "step": 52515 }, { "epoch": 5.849203697516427, "grad_norm": 0.087467260658741, "learning_rate": 2.1961128471347908e-05, "loss": 0.4669, "num_input_tokens_seen": 63708336, "step": 52520 }, { "epoch": 5.849760552400045, "grad_norm": 0.13980679214000702, "learning_rate": 2.1956305087598164e-05, "loss": 0.4858, "num_input_tokens_seen": 63714384, "step": 52525 }, { "epoch": 5.8503174072836615, "grad_norm": 0.08352866768836975, "learning_rate": 2.195148181884897e-05, "loss": 0.4592, "num_input_tokens_seen": 63720464, "step": 52530 }, { "epoch": 5.850874262167279, "grad_norm": 0.07016468048095703, "learning_rate": 2.1946658665282556e-05, "loss": 0.4658, "num_input_tokens_seen": 63726672, "step": 52535 }, { "epoch": 5.851431117050897, "grad_norm": 0.09156903624534607, "learning_rate": 2.1941835627081167e-05, "loss": 0.4591, "num_input_tokens_seen": 63733040, "step": 52540 }, { "epoch": 5.851987971934514, "grad_norm": 0.08692914992570877, "learning_rate": 2.1937012704427022e-05, "loss": 0.4556, "num_input_tokens_seen": 63739312, "step": 52545 }, { "epoch": 5.852544826818131, "grad_norm": 0.11030041426420212, "learning_rate": 2.1932189897502355e-05, "loss": 0.4698, "num_input_tokens_seen": 63744944, "step": 52550 }, { "epoch": 5.853101681701748, "grad_norm": 0.13179130852222443, "learning_rate": 2.192736720648939e-05, "loss": 0.4663, "num_input_tokens_seen": 63751152, "step": 52555 }, { "epoch": 5.853658536585366, "grad_norm": 0.08728025108575821, "learning_rate": 2.1922544631570336e-05, "loss": 0.4578, "num_input_tokens_seen": 63756848, "step": 52560 }, { "epoch": 5.854215391468983, "grad_norm": 0.11834397166967392, "learning_rate": 2.191772217292741e-05, "loss": 0.4597, "num_input_tokens_seen": 63763056, "step": 52565 }, { "epoch": 5.8547722463526, "grad_norm": 0.09442546963691711, "learning_rate": 2.1912899830742815e-05, "loss": 0.459, "num_input_tokens_seen": 63769136, "step": 52570 }, { "epoch": 5.855329101236218, "grad_norm": 0.09826875478029251, "learning_rate": 2.190807760519876e-05, "loss": 0.4709, "num_input_tokens_seen": 63774736, "step": 52575 }, { "epoch": 5.8558859561198355, "grad_norm": 0.08362999558448792, "learning_rate": 2.1903255496477433e-05, "loss": 0.4652, "num_input_tokens_seen": 63780784, "step": 52580 }, { "epoch": 5.856442811003452, "grad_norm": 0.10073721408843994, "learning_rate": 2.1898433504761045e-05, "loss": 0.4686, "num_input_tokens_seen": 63786736, "step": 52585 }, { "epoch": 5.85699966588707, "grad_norm": 0.07715902477502823, "learning_rate": 2.189361163023178e-05, "loss": 0.4503, "num_input_tokens_seen": 63793040, "step": 52590 }, { "epoch": 5.857556520770687, "grad_norm": 0.11143138259649277, "learning_rate": 2.188878987307182e-05, "loss": 0.4538, "num_input_tokens_seen": 63799184, "step": 52595 }, { "epoch": 5.8581133756543045, "grad_norm": 0.09749055653810501, "learning_rate": 2.188396823346335e-05, "loss": 0.4656, "num_input_tokens_seen": 63805456, "step": 52600 }, { "epoch": 5.858670230537922, "grad_norm": 0.07627364993095398, "learning_rate": 2.1879146711588548e-05, "loss": 0.4632, "num_input_tokens_seen": 63811664, "step": 52605 }, { "epoch": 5.859227085421539, "grad_norm": 0.09693703800439835, "learning_rate": 2.187432530762958e-05, "loss": 0.4678, "num_input_tokens_seen": 63817712, "step": 52610 }, { "epoch": 5.859783940305157, "grad_norm": 0.09793002158403397, "learning_rate": 2.1869504021768617e-05, "loss": 0.4584, "num_input_tokens_seen": 63823792, "step": 52615 }, { "epoch": 5.860340795188774, "grad_norm": 0.08076067268848419, "learning_rate": 2.186468285418783e-05, "loss": 0.4615, "num_input_tokens_seen": 63829744, "step": 52620 }, { "epoch": 5.860897650072391, "grad_norm": 0.08579520136117935, "learning_rate": 2.185986180506937e-05, "loss": 0.464, "num_input_tokens_seen": 63835824, "step": 52625 }, { "epoch": 5.861454504956009, "grad_norm": 0.07387866079807281, "learning_rate": 2.1855040874595395e-05, "loss": 0.4626, "num_input_tokens_seen": 63841936, "step": 52630 }, { "epoch": 5.8620113598396255, "grad_norm": 0.10832900553941727, "learning_rate": 2.1850220062948054e-05, "loss": 0.4607, "num_input_tokens_seen": 63848208, "step": 52635 }, { "epoch": 5.862568214723243, "grad_norm": 0.15841270983219147, "learning_rate": 2.184539937030949e-05, "loss": 0.4488, "num_input_tokens_seen": 63854608, "step": 52640 }, { "epoch": 5.863125069606861, "grad_norm": 0.09994637966156006, "learning_rate": 2.184057879686185e-05, "loss": 0.4657, "num_input_tokens_seen": 63860528, "step": 52645 }, { "epoch": 5.863681924490478, "grad_norm": 0.10053794831037521, "learning_rate": 2.1835758342787265e-05, "loss": 0.4592, "num_input_tokens_seen": 63866992, "step": 52650 }, { "epoch": 5.864238779374095, "grad_norm": 0.08648160845041275, "learning_rate": 2.1830938008267877e-05, "loss": 0.4572, "num_input_tokens_seen": 63873072, "step": 52655 }, { "epoch": 5.864795634257712, "grad_norm": 0.0884818360209465, "learning_rate": 2.1826117793485802e-05, "loss": 0.4574, "num_input_tokens_seen": 63879056, "step": 52660 }, { "epoch": 5.86535248914133, "grad_norm": 0.09125169366598129, "learning_rate": 2.1821297698623165e-05, "loss": 0.4558, "num_input_tokens_seen": 63885328, "step": 52665 }, { "epoch": 5.8659093440249475, "grad_norm": 0.09137776494026184, "learning_rate": 2.1816477723862096e-05, "loss": 0.4591, "num_input_tokens_seen": 63891248, "step": 52670 }, { "epoch": 5.866466198908564, "grad_norm": 0.09509943425655365, "learning_rate": 2.181165786938469e-05, "loss": 0.4631, "num_input_tokens_seen": 63897360, "step": 52675 }, { "epoch": 5.867023053792182, "grad_norm": 0.09131839871406555, "learning_rate": 2.1806838135373073e-05, "loss": 0.4663, "num_input_tokens_seen": 63903632, "step": 52680 }, { "epoch": 5.867579908675799, "grad_norm": 0.0910092443227768, "learning_rate": 2.1802018522009344e-05, "loss": 0.4554, "num_input_tokens_seen": 63909552, "step": 52685 }, { "epoch": 5.868136763559416, "grad_norm": 0.0921812653541565, "learning_rate": 2.1797199029475607e-05, "loss": 0.4591, "num_input_tokens_seen": 63915600, "step": 52690 }, { "epoch": 5.868693618443034, "grad_norm": 0.12917640805244446, "learning_rate": 2.179237965795395e-05, "loss": 0.4631, "num_input_tokens_seen": 63921648, "step": 52695 }, { "epoch": 5.869250473326651, "grad_norm": 0.11303579062223434, "learning_rate": 2.1787560407626473e-05, "loss": 0.4731, "num_input_tokens_seen": 63927632, "step": 52700 }, { "epoch": 5.8698073282102685, "grad_norm": 0.08334025740623474, "learning_rate": 2.1782741278675252e-05, "loss": 0.465, "num_input_tokens_seen": 63933616, "step": 52705 }, { "epoch": 5.870364183093885, "grad_norm": 0.08830734342336655, "learning_rate": 2.1777922271282377e-05, "loss": 0.4644, "num_input_tokens_seen": 63939760, "step": 52710 }, { "epoch": 5.870921037977503, "grad_norm": 0.10243795812129974, "learning_rate": 2.1773103385629928e-05, "loss": 0.4517, "num_input_tokens_seen": 63945840, "step": 52715 }, { "epoch": 5.871477892861121, "grad_norm": 0.08575201779603958, "learning_rate": 2.1768284621899975e-05, "loss": 0.4608, "num_input_tokens_seen": 63951888, "step": 52720 }, { "epoch": 5.872034747744737, "grad_norm": 0.0970621407032013, "learning_rate": 2.1763465980274584e-05, "loss": 0.4622, "num_input_tokens_seen": 63957904, "step": 52725 }, { "epoch": 5.872591602628355, "grad_norm": 0.10089490562677383, "learning_rate": 2.1758647460935818e-05, "loss": 0.4659, "num_input_tokens_seen": 63964208, "step": 52730 }, { "epoch": 5.873148457511972, "grad_norm": 0.07926411181688309, "learning_rate": 2.175382906406574e-05, "loss": 0.4577, "num_input_tokens_seen": 63970608, "step": 52735 }, { "epoch": 5.87370531239559, "grad_norm": 0.08559233695268631, "learning_rate": 2.17490107898464e-05, "loss": 0.4711, "num_input_tokens_seen": 63976912, "step": 52740 }, { "epoch": 5.874262167279207, "grad_norm": 0.08637825399637222, "learning_rate": 2.1744192638459855e-05, "loss": 0.4666, "num_input_tokens_seen": 63982352, "step": 52745 }, { "epoch": 5.874819022162824, "grad_norm": 0.08746337890625, "learning_rate": 2.1739374610088145e-05, "loss": 0.4611, "num_input_tokens_seen": 63988496, "step": 52750 }, { "epoch": 5.875375877046442, "grad_norm": 0.08613752573728561, "learning_rate": 2.173455670491331e-05, "loss": 0.472, "num_input_tokens_seen": 63994608, "step": 52755 }, { "epoch": 5.875932731930059, "grad_norm": 0.10880964994430542, "learning_rate": 2.1729738923117397e-05, "loss": 0.4729, "num_input_tokens_seen": 64000848, "step": 52760 }, { "epoch": 5.876489586813676, "grad_norm": 0.13653381168842316, "learning_rate": 2.172492126488242e-05, "loss": 0.4751, "num_input_tokens_seen": 64007216, "step": 52765 }, { "epoch": 5.877046441697294, "grad_norm": 0.08150514215230942, "learning_rate": 2.1720103730390412e-05, "loss": 0.46, "num_input_tokens_seen": 64013200, "step": 52770 }, { "epoch": 5.877603296580911, "grad_norm": 0.09360533952713013, "learning_rate": 2.17152863198234e-05, "loss": 0.4553, "num_input_tokens_seen": 64019536, "step": 52775 }, { "epoch": 5.878160151464528, "grad_norm": 0.09102007001638412, "learning_rate": 2.171046903336339e-05, "loss": 0.4787, "num_input_tokens_seen": 64025680, "step": 52780 }, { "epoch": 5.878717006348146, "grad_norm": 0.08717084676027298, "learning_rate": 2.1705651871192408e-05, "loss": 0.456, "num_input_tokens_seen": 64032016, "step": 52785 }, { "epoch": 5.879273861231763, "grad_norm": 0.09803426265716553, "learning_rate": 2.1700834833492445e-05, "loss": 0.467, "num_input_tokens_seen": 64037968, "step": 52790 }, { "epoch": 5.87983071611538, "grad_norm": 0.08132132142782211, "learning_rate": 2.169601792044552e-05, "loss": 0.4672, "num_input_tokens_seen": 64044144, "step": 52795 }, { "epoch": 5.880387570998998, "grad_norm": 0.0921943336725235, "learning_rate": 2.169120113223363e-05, "loss": 0.46, "num_input_tokens_seen": 64050416, "step": 52800 }, { "epoch": 5.880944425882615, "grad_norm": 0.13851802051067352, "learning_rate": 2.1686384469038764e-05, "loss": 0.4533, "num_input_tokens_seen": 64056400, "step": 52805 }, { "epoch": 5.881501280766233, "grad_norm": 0.10503871738910675, "learning_rate": 2.168156793104291e-05, "loss": 0.4579, "num_input_tokens_seen": 64062768, "step": 52810 }, { "epoch": 5.882058135649849, "grad_norm": 0.1008540615439415, "learning_rate": 2.1676751518428055e-05, "loss": 0.4617, "num_input_tokens_seen": 64068880, "step": 52815 }, { "epoch": 5.882614990533467, "grad_norm": 0.0900086984038353, "learning_rate": 2.1671935231376175e-05, "loss": 0.4647, "num_input_tokens_seen": 64074800, "step": 52820 }, { "epoch": 5.883171845417085, "grad_norm": 0.1119578406214714, "learning_rate": 2.166711907006925e-05, "loss": 0.4597, "num_input_tokens_seen": 64081008, "step": 52825 }, { "epoch": 5.8837287003007015, "grad_norm": 0.07629290223121643, "learning_rate": 2.1662303034689248e-05, "loss": 0.4662, "num_input_tokens_seen": 64087184, "step": 52830 }, { "epoch": 5.884285555184319, "grad_norm": 0.08517121523618698, "learning_rate": 2.1657487125418134e-05, "loss": 0.4561, "num_input_tokens_seen": 64093328, "step": 52835 }, { "epoch": 5.884842410067936, "grad_norm": 0.11861948668956757, "learning_rate": 2.1652671342437864e-05, "loss": 0.4447, "num_input_tokens_seen": 64099728, "step": 52840 }, { "epoch": 5.885399264951554, "grad_norm": 0.10924739390611649, "learning_rate": 2.16478556859304e-05, "loss": 0.4536, "num_input_tokens_seen": 64105616, "step": 52845 }, { "epoch": 5.885956119835171, "grad_norm": 0.11559901386499405, "learning_rate": 2.1643040156077695e-05, "loss": 0.4611, "num_input_tokens_seen": 64111824, "step": 52850 }, { "epoch": 5.886512974718788, "grad_norm": 0.09864374250173569, "learning_rate": 2.163822475306168e-05, "loss": 0.461, "num_input_tokens_seen": 64118096, "step": 52855 }, { "epoch": 5.887069829602406, "grad_norm": 0.09393594413995743, "learning_rate": 2.1633409477064314e-05, "loss": 0.4729, "num_input_tokens_seen": 64123696, "step": 52860 }, { "epoch": 5.8876266844860226, "grad_norm": 0.09493473917245865, "learning_rate": 2.1628594328267526e-05, "loss": 0.4649, "num_input_tokens_seen": 64129360, "step": 52865 }, { "epoch": 5.88818353936964, "grad_norm": 0.11266081035137177, "learning_rate": 2.162377930685325e-05, "loss": 0.4645, "num_input_tokens_seen": 64135536, "step": 52870 }, { "epoch": 5.888740394253258, "grad_norm": 0.08096244931221008, "learning_rate": 2.161896441300341e-05, "loss": 0.4607, "num_input_tokens_seen": 64141424, "step": 52875 }, { "epoch": 5.889297249136875, "grad_norm": 0.08889125287532806, "learning_rate": 2.1614149646899934e-05, "loss": 0.4757, "num_input_tokens_seen": 64148016, "step": 52880 }, { "epoch": 5.889854104020492, "grad_norm": 0.1089676097035408, "learning_rate": 2.1609335008724723e-05, "loss": 0.4722, "num_input_tokens_seen": 64154416, "step": 52885 }, { "epoch": 5.890410958904109, "grad_norm": 0.08421696722507477, "learning_rate": 2.160452049865971e-05, "loss": 0.4583, "num_input_tokens_seen": 64160656, "step": 52890 }, { "epoch": 5.890967813787727, "grad_norm": 0.08988168835639954, "learning_rate": 2.1599706116886795e-05, "loss": 0.4651, "num_input_tokens_seen": 64166928, "step": 52895 }, { "epoch": 5.8915246686713445, "grad_norm": 0.09515494853258133, "learning_rate": 2.1594891863587875e-05, "loss": 0.459, "num_input_tokens_seen": 64172752, "step": 52900 }, { "epoch": 5.892081523554961, "grad_norm": 0.07603297382593155, "learning_rate": 2.1590077738944857e-05, "loss": 0.4815, "num_input_tokens_seen": 64178928, "step": 52905 }, { "epoch": 5.892638378438579, "grad_norm": 0.0951618030667305, "learning_rate": 2.158526374313963e-05, "loss": 0.4733, "num_input_tokens_seen": 64185200, "step": 52910 }, { "epoch": 5.893195233322196, "grad_norm": 0.09950949996709824, "learning_rate": 2.1580449876354082e-05, "loss": 0.4559, "num_input_tokens_seen": 64191440, "step": 52915 }, { "epoch": 5.893752088205813, "grad_norm": 0.10580509901046753, "learning_rate": 2.1575636138770088e-05, "loss": 0.4647, "num_input_tokens_seen": 64197488, "step": 52920 }, { "epoch": 5.894308943089431, "grad_norm": 0.09028859436511993, "learning_rate": 2.157082253056954e-05, "loss": 0.4701, "num_input_tokens_seen": 64203600, "step": 52925 }, { "epoch": 5.894865797973048, "grad_norm": 0.10152656584978104, "learning_rate": 2.156600905193431e-05, "loss": 0.4689, "num_input_tokens_seen": 64209616, "step": 52930 }, { "epoch": 5.895422652856666, "grad_norm": 0.08620350807905197, "learning_rate": 2.156119570304626e-05, "loss": 0.4616, "num_input_tokens_seen": 64214704, "step": 52935 }, { "epoch": 5.895979507740283, "grad_norm": 0.09955117106437683, "learning_rate": 2.1556382484087254e-05, "loss": 0.4657, "num_input_tokens_seen": 64220688, "step": 52940 }, { "epoch": 5.8965363626239, "grad_norm": 0.10550155490636826, "learning_rate": 2.1551569395239157e-05, "loss": 0.4705, "num_input_tokens_seen": 64226896, "step": 52945 }, { "epoch": 5.897093217507518, "grad_norm": 0.08105595409870148, "learning_rate": 2.1546756436683812e-05, "loss": 0.456, "num_input_tokens_seen": 64232944, "step": 52950 }, { "epoch": 5.897650072391135, "grad_norm": 0.12562933564186096, "learning_rate": 2.1541943608603082e-05, "loss": 0.4672, "num_input_tokens_seen": 64239024, "step": 52955 }, { "epoch": 5.898206927274752, "grad_norm": 0.14898556470870972, "learning_rate": 2.1537130911178803e-05, "loss": 0.4684, "num_input_tokens_seen": 64245264, "step": 52960 }, { "epoch": 5.89876378215837, "grad_norm": 0.1239793673157692, "learning_rate": 2.1532318344592816e-05, "loss": 0.4607, "num_input_tokens_seen": 64251216, "step": 52965 }, { "epoch": 5.899320637041987, "grad_norm": 0.07579401880502701, "learning_rate": 2.1527505909026954e-05, "loss": 0.4617, "num_input_tokens_seen": 64257264, "step": 52970 }, { "epoch": 5.899877491925604, "grad_norm": 0.10812919586896896, "learning_rate": 2.152269360466305e-05, "loss": 0.4652, "num_input_tokens_seen": 64263248, "step": 52975 }, { "epoch": 5.900434346809222, "grad_norm": 0.08538307994604111, "learning_rate": 2.1517881431682916e-05, "loss": 0.4511, "num_input_tokens_seen": 64269424, "step": 52980 }, { "epoch": 5.900991201692839, "grad_norm": 0.12359078228473663, "learning_rate": 2.1513069390268385e-05, "loss": 0.4624, "num_input_tokens_seen": 64275760, "step": 52985 }, { "epoch": 5.901548056576456, "grad_norm": 0.11346599459648132, "learning_rate": 2.1508257480601274e-05, "loss": 0.4687, "num_input_tokens_seen": 64281936, "step": 52990 }, { "epoch": 5.902104911460073, "grad_norm": 0.07435296475887299, "learning_rate": 2.1503445702863377e-05, "loss": 0.4657, "num_input_tokens_seen": 64287472, "step": 52995 }, { "epoch": 5.902661766343691, "grad_norm": 0.11558859795331955, "learning_rate": 2.149863405723651e-05, "loss": 0.4709, "num_input_tokens_seen": 64293712, "step": 53000 }, { "epoch": 5.903218621227309, "grad_norm": 0.13173668086528778, "learning_rate": 2.149382254390247e-05, "loss": 0.4649, "num_input_tokens_seen": 64299824, "step": 53005 }, { "epoch": 5.903775476110925, "grad_norm": 0.09484350681304932, "learning_rate": 2.148901116304305e-05, "loss": 0.4546, "num_input_tokens_seen": 64305904, "step": 53010 }, { "epoch": 5.904332330994543, "grad_norm": 0.09792781621217728, "learning_rate": 2.1484199914840036e-05, "loss": 0.4706, "num_input_tokens_seen": 64311824, "step": 53015 }, { "epoch": 5.90488918587816, "grad_norm": 0.06982967257499695, "learning_rate": 2.147938879947522e-05, "loss": 0.4717, "num_input_tokens_seen": 64317968, "step": 53020 }, { "epoch": 5.9054460407617775, "grad_norm": 0.11653181165456772, "learning_rate": 2.1474577817130377e-05, "loss": 0.4777, "num_input_tokens_seen": 64324560, "step": 53025 }, { "epoch": 5.906002895645395, "grad_norm": 0.11784844100475311, "learning_rate": 2.1469766967987283e-05, "loss": 0.4658, "num_input_tokens_seen": 64330928, "step": 53030 }, { "epoch": 5.906559750529012, "grad_norm": 0.07335475832223892, "learning_rate": 2.1464956252227704e-05, "loss": 0.4563, "num_input_tokens_seen": 64336144, "step": 53035 }, { "epoch": 5.90711660541263, "grad_norm": 0.12074104696512222, "learning_rate": 2.1460145670033403e-05, "loss": 0.4771, "num_input_tokens_seen": 64342512, "step": 53040 }, { "epoch": 5.907673460296246, "grad_norm": 0.08981930464506149, "learning_rate": 2.1455335221586143e-05, "loss": 0.4614, "num_input_tokens_seen": 64348592, "step": 53045 }, { "epoch": 5.908230315179864, "grad_norm": 0.08962144702672958, "learning_rate": 2.145052490706768e-05, "loss": 0.4485, "num_input_tokens_seen": 64354928, "step": 53050 }, { "epoch": 5.908787170063482, "grad_norm": 0.09693086892366409, "learning_rate": 2.1445714726659766e-05, "loss": 0.4566, "num_input_tokens_seen": 64361040, "step": 53055 }, { "epoch": 5.9093440249470985, "grad_norm": 0.0960191860795021, "learning_rate": 2.1440904680544137e-05, "loss": 0.4608, "num_input_tokens_seen": 64367184, "step": 53060 }, { "epoch": 5.909900879830716, "grad_norm": 0.0927109643816948, "learning_rate": 2.1436094768902533e-05, "loss": 0.4459, "num_input_tokens_seen": 64373648, "step": 53065 }, { "epoch": 5.910457734714333, "grad_norm": 0.12914617359638214, "learning_rate": 2.143128499191669e-05, "loss": 0.4589, "num_input_tokens_seen": 64379760, "step": 53070 }, { "epoch": 5.911014589597951, "grad_norm": 0.07522568106651306, "learning_rate": 2.1426475349768338e-05, "loss": 0.4701, "num_input_tokens_seen": 64386096, "step": 53075 }, { "epoch": 5.911571444481568, "grad_norm": 0.11390845477581024, "learning_rate": 2.1421665842639194e-05, "loss": 0.4653, "num_input_tokens_seen": 64391280, "step": 53080 }, { "epoch": 5.912128299365185, "grad_norm": 0.08793815225362778, "learning_rate": 2.141685647071099e-05, "loss": 0.4558, "num_input_tokens_seen": 64397424, "step": 53085 }, { "epoch": 5.912685154248803, "grad_norm": 0.11199567466974258, "learning_rate": 2.1412047234165426e-05, "loss": 0.4629, "num_input_tokens_seen": 64403536, "step": 53090 }, { "epoch": 5.91324200913242, "grad_norm": 0.0965290367603302, "learning_rate": 2.1407238133184216e-05, "loss": 0.4664, "num_input_tokens_seen": 64409520, "step": 53095 }, { "epoch": 5.913798864016037, "grad_norm": 0.1447611004114151, "learning_rate": 2.1402429167949066e-05, "loss": 0.4628, "num_input_tokens_seen": 64415280, "step": 53100 }, { "epoch": 5.914355718899655, "grad_norm": 0.11319327354431152, "learning_rate": 2.139762033864167e-05, "loss": 0.4479, "num_input_tokens_seen": 64421168, "step": 53105 }, { "epoch": 5.914912573783272, "grad_norm": 0.09202185273170471, "learning_rate": 2.1392811645443713e-05, "loss": 0.4696, "num_input_tokens_seen": 64427504, "step": 53110 }, { "epoch": 5.915469428666889, "grad_norm": 0.08884784579277039, "learning_rate": 2.1388003088536902e-05, "loss": 0.4559, "num_input_tokens_seen": 64433584, "step": 53115 }, { "epoch": 5.916026283550507, "grad_norm": 0.10898054391145706, "learning_rate": 2.1383194668102908e-05, "loss": 0.4545, "num_input_tokens_seen": 64439952, "step": 53120 }, { "epoch": 5.916583138434124, "grad_norm": 0.12954175472259521, "learning_rate": 2.137838638432341e-05, "loss": 0.4627, "num_input_tokens_seen": 64445840, "step": 53125 }, { "epoch": 5.9171399933177415, "grad_norm": 0.09724166244268417, "learning_rate": 2.1373578237380084e-05, "loss": 0.4582, "num_input_tokens_seen": 64451920, "step": 53130 }, { "epoch": 5.917696848201359, "grad_norm": 0.11056201905012131, "learning_rate": 2.1368770227454586e-05, "loss": 0.4542, "num_input_tokens_seen": 64458256, "step": 53135 }, { "epoch": 5.918253703084976, "grad_norm": 0.13124807178974152, "learning_rate": 2.1363962354728583e-05, "loss": 0.4637, "num_input_tokens_seen": 64464560, "step": 53140 }, { "epoch": 5.918810557968594, "grad_norm": 0.1465284675359726, "learning_rate": 2.1359154619383743e-05, "loss": 0.4629, "num_input_tokens_seen": 64470864, "step": 53145 }, { "epoch": 5.9193674128522105, "grad_norm": 0.08982870727777481, "learning_rate": 2.1354347021601707e-05, "loss": 0.4497, "num_input_tokens_seen": 64477072, "step": 53150 }, { "epoch": 5.919924267735828, "grad_norm": 0.13128389418125153, "learning_rate": 2.1349539561564124e-05, "loss": 0.4553, "num_input_tokens_seen": 64483120, "step": 53155 }, { "epoch": 5.920481122619446, "grad_norm": 0.07923660427331924, "learning_rate": 2.1344732239452636e-05, "loss": 0.453, "num_input_tokens_seen": 64488784, "step": 53160 }, { "epoch": 5.921037977503063, "grad_norm": 0.10151808708906174, "learning_rate": 2.1339925055448874e-05, "loss": 0.4573, "num_input_tokens_seen": 64494544, "step": 53165 }, { "epoch": 5.92159483238668, "grad_norm": 0.10379176586866379, "learning_rate": 2.133511800973448e-05, "loss": 0.4456, "num_input_tokens_seen": 64500624, "step": 53170 }, { "epoch": 5.922151687270297, "grad_norm": 0.08844436705112457, "learning_rate": 2.133031110249107e-05, "loss": 0.4582, "num_input_tokens_seen": 64506992, "step": 53175 }, { "epoch": 5.922708542153915, "grad_norm": 0.10709622502326965, "learning_rate": 2.1325504333900264e-05, "loss": 0.4484, "num_input_tokens_seen": 64513264, "step": 53180 }, { "epoch": 5.923265397037532, "grad_norm": 0.1104099377989769, "learning_rate": 2.132069770414368e-05, "loss": 0.4587, "num_input_tokens_seen": 64519408, "step": 53185 }, { "epoch": 5.923822251921149, "grad_norm": 0.08548511564731598, "learning_rate": 2.1315891213402926e-05, "loss": 0.4531, "num_input_tokens_seen": 64525680, "step": 53190 }, { "epoch": 5.924379106804767, "grad_norm": 0.09778957068920135, "learning_rate": 2.1311084861859608e-05, "loss": 0.4629, "num_input_tokens_seen": 64531888, "step": 53195 }, { "epoch": 5.924935961688384, "grad_norm": 0.09255924820899963, "learning_rate": 2.1306278649695324e-05, "loss": 0.4674, "num_input_tokens_seen": 64538192, "step": 53200 }, { "epoch": 5.925492816572001, "grad_norm": 0.0929657593369484, "learning_rate": 2.1301472577091674e-05, "loss": 0.4654, "num_input_tokens_seen": 64544144, "step": 53205 }, { "epoch": 5.926049671455619, "grad_norm": 0.10056865215301514, "learning_rate": 2.1296666644230243e-05, "loss": 0.4661, "num_input_tokens_seen": 64550448, "step": 53210 }, { "epoch": 5.926606526339236, "grad_norm": 0.08386360108852386, "learning_rate": 2.129186085129261e-05, "loss": 0.466, "num_input_tokens_seen": 64556496, "step": 53215 }, { "epoch": 5.9271633812228535, "grad_norm": 0.08227267861366272, "learning_rate": 2.1287055198460363e-05, "loss": 0.478, "num_input_tokens_seen": 64562640, "step": 53220 }, { "epoch": 5.92772023610647, "grad_norm": 0.09113412350416183, "learning_rate": 2.1282249685915058e-05, "loss": 0.4759, "num_input_tokens_seen": 64568912, "step": 53225 }, { "epoch": 5.928277090990088, "grad_norm": 0.11039340496063232, "learning_rate": 2.1277444313838285e-05, "loss": 0.4666, "num_input_tokens_seen": 64574288, "step": 53230 }, { "epoch": 5.928833945873706, "grad_norm": 0.08783934265375137, "learning_rate": 2.1272639082411593e-05, "loss": 0.4561, "num_input_tokens_seen": 64580464, "step": 53235 }, { "epoch": 5.929390800757322, "grad_norm": 0.08903302997350693, "learning_rate": 2.126783399181654e-05, "loss": 0.4624, "num_input_tokens_seen": 64586704, "step": 53240 }, { "epoch": 5.92994765564094, "grad_norm": 0.09159509837627411, "learning_rate": 2.126302904223468e-05, "loss": 0.4634, "num_input_tokens_seen": 64592592, "step": 53245 }, { "epoch": 5.930504510524557, "grad_norm": 0.09369396418333054, "learning_rate": 2.125822423384756e-05, "loss": 0.4645, "num_input_tokens_seen": 64598416, "step": 53250 }, { "epoch": 5.9310613654081745, "grad_norm": 0.10910958796739578, "learning_rate": 2.1253419566836712e-05, "loss": 0.4723, "num_input_tokens_seen": 64604592, "step": 53255 }, { "epoch": 5.931618220291792, "grad_norm": 0.07690665870904922, "learning_rate": 2.1248615041383685e-05, "loss": 0.4481, "num_input_tokens_seen": 64610128, "step": 53260 }, { "epoch": 5.932175075175409, "grad_norm": 0.08356224745512009, "learning_rate": 2.1243810657670006e-05, "loss": 0.4623, "num_input_tokens_seen": 64616144, "step": 53265 }, { "epoch": 5.932731930059027, "grad_norm": 0.07571988552808762, "learning_rate": 2.1239006415877193e-05, "loss": 0.4618, "num_input_tokens_seen": 64622480, "step": 53270 }, { "epoch": 5.933288784942644, "grad_norm": 0.10363762080669403, "learning_rate": 2.1234202316186774e-05, "loss": 0.4751, "num_input_tokens_seen": 64628688, "step": 53275 }, { "epoch": 5.933845639826261, "grad_norm": 0.08871560543775558, "learning_rate": 2.122939835878026e-05, "loss": 0.4637, "num_input_tokens_seen": 64635152, "step": 53280 }, { "epoch": 5.934402494709879, "grad_norm": 0.1020861566066742, "learning_rate": 2.122459454383916e-05, "loss": 0.4508, "num_input_tokens_seen": 64641360, "step": 53285 }, { "epoch": 5.934959349593496, "grad_norm": 0.13104161620140076, "learning_rate": 2.121979087154497e-05, "loss": 0.4612, "num_input_tokens_seen": 64647408, "step": 53290 }, { "epoch": 5.935516204477113, "grad_norm": 0.10453232377767563, "learning_rate": 2.12149873420792e-05, "loss": 0.4615, "num_input_tokens_seen": 64653776, "step": 53295 }, { "epoch": 5.936073059360731, "grad_norm": 0.1311284303665161, "learning_rate": 2.1210183955623336e-05, "loss": 0.4645, "num_input_tokens_seen": 64659440, "step": 53300 }, { "epoch": 5.936629914244348, "grad_norm": 0.07266329973936081, "learning_rate": 2.1205380712358873e-05, "loss": 0.4606, "num_input_tokens_seen": 64665776, "step": 53305 }, { "epoch": 5.937186769127965, "grad_norm": 0.11051347851753235, "learning_rate": 2.1200577612467283e-05, "loss": 0.4672, "num_input_tokens_seen": 64671856, "step": 53310 }, { "epoch": 5.937743624011583, "grad_norm": 0.062102995812892914, "learning_rate": 2.1195774656130047e-05, "loss": 0.4531, "num_input_tokens_seen": 64677488, "step": 53315 }, { "epoch": 5.9383004788952, "grad_norm": 0.08160301297903061, "learning_rate": 2.1190971843528628e-05, "loss": 0.473, "num_input_tokens_seen": 64683696, "step": 53320 }, { "epoch": 5.9388573337788175, "grad_norm": 0.1196993961930275, "learning_rate": 2.1186169174844513e-05, "loss": 0.4669, "num_input_tokens_seen": 64689680, "step": 53325 }, { "epoch": 5.939414188662434, "grad_norm": 0.15798772871494293, "learning_rate": 2.118136665025914e-05, "loss": 0.4811, "num_input_tokens_seen": 64695728, "step": 53330 }, { "epoch": 5.939971043546052, "grad_norm": 0.08895468711853027, "learning_rate": 2.1176564269953976e-05, "loss": 0.4554, "num_input_tokens_seen": 64701744, "step": 53335 }, { "epoch": 5.94052789842967, "grad_norm": 0.10508804023265839, "learning_rate": 2.1171762034110466e-05, "loss": 0.4601, "num_input_tokens_seen": 64708048, "step": 53340 }, { "epoch": 5.9410847533132864, "grad_norm": 0.10793016850948334, "learning_rate": 2.1166959942910054e-05, "loss": 0.4599, "num_input_tokens_seen": 64713904, "step": 53345 }, { "epoch": 5.941641608196904, "grad_norm": 0.09027223289012909, "learning_rate": 2.116215799653418e-05, "loss": 0.4506, "num_input_tokens_seen": 64719888, "step": 53350 }, { "epoch": 5.942198463080521, "grad_norm": 0.08853351324796677, "learning_rate": 2.115735619516427e-05, "loss": 0.4703, "num_input_tokens_seen": 64726192, "step": 53355 }, { "epoch": 5.942755317964139, "grad_norm": 0.08238188922405243, "learning_rate": 2.115255453898176e-05, "loss": 0.4547, "num_input_tokens_seen": 64731600, "step": 53360 }, { "epoch": 5.943312172847756, "grad_norm": 0.05639093369245529, "learning_rate": 2.1147753028168073e-05, "loss": 0.4746, "num_input_tokens_seen": 64737648, "step": 53365 }, { "epoch": 5.943869027731373, "grad_norm": 0.08053955435752869, "learning_rate": 2.1142951662904624e-05, "loss": 0.4674, "num_input_tokens_seen": 64743024, "step": 53370 }, { "epoch": 5.944425882614991, "grad_norm": 0.10394910722970963, "learning_rate": 2.1138150443372817e-05, "loss": 0.4714, "num_input_tokens_seen": 64749360, "step": 53375 }, { "epoch": 5.9449827374986075, "grad_norm": 0.09446781128644943, "learning_rate": 2.1133349369754063e-05, "loss": 0.4766, "num_input_tokens_seen": 64755376, "step": 53380 }, { "epoch": 5.945539592382225, "grad_norm": 0.06927456706762314, "learning_rate": 2.1128548442229756e-05, "loss": 0.4471, "num_input_tokens_seen": 64761200, "step": 53385 }, { "epoch": 5.946096447265843, "grad_norm": 0.09487037360668182, "learning_rate": 2.11237476609813e-05, "loss": 0.4659, "num_input_tokens_seen": 64767376, "step": 53390 }, { "epoch": 5.94665330214946, "grad_norm": 0.10161440074443817, "learning_rate": 2.1118947026190083e-05, "loss": 0.4719, "num_input_tokens_seen": 64773680, "step": 53395 }, { "epoch": 5.947210157033077, "grad_norm": 0.103919118642807, "learning_rate": 2.1114146538037484e-05, "loss": 0.4633, "num_input_tokens_seen": 64779888, "step": 53400 }, { "epoch": 5.947767011916694, "grad_norm": 0.12005053460597992, "learning_rate": 2.1109346196704876e-05, "loss": 0.4643, "num_input_tokens_seen": 64785744, "step": 53405 }, { "epoch": 5.948323866800312, "grad_norm": 0.1173558458685875, "learning_rate": 2.1104546002373644e-05, "loss": 0.4643, "num_input_tokens_seen": 64791760, "step": 53410 }, { "epoch": 5.9488807216839295, "grad_norm": 0.08897454291582108, "learning_rate": 2.1099745955225144e-05, "loss": 0.4567, "num_input_tokens_seen": 64797584, "step": 53415 }, { "epoch": 5.949437576567546, "grad_norm": 0.11254192143678665, "learning_rate": 2.1094946055440735e-05, "loss": 0.4729, "num_input_tokens_seen": 64803536, "step": 53420 }, { "epoch": 5.949994431451164, "grad_norm": 0.08818910270929337, "learning_rate": 2.1090146303201785e-05, "loss": 0.4707, "num_input_tokens_seen": 64809616, "step": 53425 }, { "epoch": 5.950551286334781, "grad_norm": 0.08911080658435822, "learning_rate": 2.1085346698689637e-05, "loss": 0.462, "num_input_tokens_seen": 64815856, "step": 53430 }, { "epoch": 5.951108141218398, "grad_norm": 0.11590006947517395, "learning_rate": 2.1080547242085636e-05, "loss": 0.4708, "num_input_tokens_seen": 64822192, "step": 53435 }, { "epoch": 5.951664996102016, "grad_norm": 0.0960933044552803, "learning_rate": 2.1075747933571123e-05, "loss": 0.4547, "num_input_tokens_seen": 64828432, "step": 53440 }, { "epoch": 5.952221850985633, "grad_norm": 0.09523701667785645, "learning_rate": 2.1070948773327426e-05, "loss": 0.4555, "num_input_tokens_seen": 64834512, "step": 53445 }, { "epoch": 5.9527787058692505, "grad_norm": 0.11779703199863434, "learning_rate": 2.106614976153587e-05, "loss": 0.4747, "num_input_tokens_seen": 64840656, "step": 53450 }, { "epoch": 5.953335560752868, "grad_norm": 0.13774403929710388, "learning_rate": 2.106135089837779e-05, "loss": 0.469, "num_input_tokens_seen": 64846576, "step": 53455 }, { "epoch": 5.953892415636485, "grad_norm": 0.08980558812618256, "learning_rate": 2.1056552184034492e-05, "loss": 0.4756, "num_input_tokens_seen": 64852592, "step": 53460 }, { "epoch": 5.954449270520103, "grad_norm": 0.08775535225868225, "learning_rate": 2.1051753618687293e-05, "loss": 0.4496, "num_input_tokens_seen": 64858224, "step": 53465 }, { "epoch": 5.955006125403719, "grad_norm": 0.10069849342107773, "learning_rate": 2.1046955202517494e-05, "loss": 0.4591, "num_input_tokens_seen": 64864240, "step": 53470 }, { "epoch": 5.955562980287337, "grad_norm": 0.08183977752923965, "learning_rate": 2.1042156935706393e-05, "loss": 0.4643, "num_input_tokens_seen": 64870320, "step": 53475 }, { "epoch": 5.956119835170955, "grad_norm": 0.11306622624397278, "learning_rate": 2.1037358818435287e-05, "loss": 0.4683, "num_input_tokens_seen": 64876272, "step": 53480 }, { "epoch": 5.956676690054572, "grad_norm": 0.10549814254045486, "learning_rate": 2.1032560850885463e-05, "loss": 0.466, "num_input_tokens_seen": 64881968, "step": 53485 }, { "epoch": 5.957233544938189, "grad_norm": 0.10587666928768158, "learning_rate": 2.102776303323821e-05, "loss": 0.4581, "num_input_tokens_seen": 64887984, "step": 53490 }, { "epoch": 5.957790399821807, "grad_norm": 0.13988667726516724, "learning_rate": 2.1022965365674795e-05, "loss": 0.4745, "num_input_tokens_seen": 64894320, "step": 53495 }, { "epoch": 5.958347254705424, "grad_norm": 0.10684716701507568, "learning_rate": 2.1018167848376495e-05, "loss": 0.4579, "num_input_tokens_seen": 64900080, "step": 53500 }, { "epoch": 5.958904109589041, "grad_norm": 0.12775714695453644, "learning_rate": 2.1013370481524575e-05, "loss": 0.4681, "num_input_tokens_seen": 64906160, "step": 53505 }, { "epoch": 5.959460964472658, "grad_norm": 0.07606495916843414, "learning_rate": 2.100857326530029e-05, "loss": 0.4583, "num_input_tokens_seen": 64912560, "step": 53510 }, { "epoch": 5.960017819356276, "grad_norm": 0.07980073988437653, "learning_rate": 2.1003776199884896e-05, "loss": 0.4535, "num_input_tokens_seen": 64918608, "step": 53515 }, { "epoch": 5.9605746742398935, "grad_norm": 0.09435654431581497, "learning_rate": 2.0998979285459654e-05, "loss": 0.4551, "num_input_tokens_seen": 64924656, "step": 53520 }, { "epoch": 5.96113152912351, "grad_norm": 0.10197114199399948, "learning_rate": 2.0994182522205787e-05, "loss": 0.4735, "num_input_tokens_seen": 64930864, "step": 53525 }, { "epoch": 5.961688384007128, "grad_norm": 0.0956830084323883, "learning_rate": 2.0989385910304548e-05, "loss": 0.4634, "num_input_tokens_seen": 64937040, "step": 53530 }, { "epoch": 5.962245238890745, "grad_norm": 0.12477134168148041, "learning_rate": 2.0984589449937158e-05, "loss": 0.4642, "num_input_tokens_seen": 64943152, "step": 53535 }, { "epoch": 5.962802093774362, "grad_norm": 0.0795888677239418, "learning_rate": 2.097979314128485e-05, "loss": 0.4628, "num_input_tokens_seen": 64949008, "step": 53540 }, { "epoch": 5.96335894865798, "grad_norm": 0.11377769708633423, "learning_rate": 2.097499698452883e-05, "loss": 0.477, "num_input_tokens_seen": 64955216, "step": 53545 }, { "epoch": 5.963915803541597, "grad_norm": 0.0797932893037796, "learning_rate": 2.0970200979850334e-05, "loss": 0.4675, "num_input_tokens_seen": 64961360, "step": 53550 }, { "epoch": 5.964472658425215, "grad_norm": 0.09700950235128403, "learning_rate": 2.096540512743056e-05, "loss": 0.4624, "num_input_tokens_seen": 64967792, "step": 53555 }, { "epoch": 5.965029513308831, "grad_norm": 0.09811679273843765, "learning_rate": 2.0960609427450706e-05, "loss": 0.4654, "num_input_tokens_seen": 64973904, "step": 53560 }, { "epoch": 5.965586368192449, "grad_norm": 0.10232725739479065, "learning_rate": 2.0955813880091975e-05, "loss": 0.4696, "num_input_tokens_seen": 64979952, "step": 53565 }, { "epoch": 5.966143223076067, "grad_norm": 0.07314610481262207, "learning_rate": 2.095101848553555e-05, "loss": 0.4507, "num_input_tokens_seen": 64986256, "step": 53570 }, { "epoch": 5.9667000779596835, "grad_norm": 0.08311726897954941, "learning_rate": 2.0946223243962625e-05, "loss": 0.4585, "num_input_tokens_seen": 64992272, "step": 53575 }, { "epoch": 5.967256932843301, "grad_norm": 0.1283940225839615, "learning_rate": 2.0941428155554388e-05, "loss": 0.4623, "num_input_tokens_seen": 64998288, "step": 53580 }, { "epoch": 5.967813787726918, "grad_norm": 0.11323897540569305, "learning_rate": 2.0936633220491993e-05, "loss": 0.4734, "num_input_tokens_seen": 65004400, "step": 53585 }, { "epoch": 5.968370642610536, "grad_norm": 0.0921909436583519, "learning_rate": 2.0931838438956613e-05, "loss": 0.4742, "num_input_tokens_seen": 65010384, "step": 53590 }, { "epoch": 5.968927497494153, "grad_norm": 0.11869718879461288, "learning_rate": 2.092704381112941e-05, "loss": 0.4616, "num_input_tokens_seen": 65016720, "step": 53595 }, { "epoch": 5.96948435237777, "grad_norm": 0.07935129851102829, "learning_rate": 2.0922249337191553e-05, "loss": 0.4676, "num_input_tokens_seen": 65023024, "step": 53600 }, { "epoch": 5.970041207261388, "grad_norm": 0.10883009433746338, "learning_rate": 2.0917455017324176e-05, "loss": 0.4709, "num_input_tokens_seen": 65028880, "step": 53605 }, { "epoch": 5.9705980621450045, "grad_norm": 0.10786586999893188, "learning_rate": 2.091266085170844e-05, "loss": 0.4487, "num_input_tokens_seen": 65035280, "step": 53610 }, { "epoch": 5.971154917028622, "grad_norm": 0.10428140312433243, "learning_rate": 2.090786684052547e-05, "loss": 0.4612, "num_input_tokens_seen": 65041232, "step": 53615 }, { "epoch": 5.97171177191224, "grad_norm": 0.12432186305522919, "learning_rate": 2.090307298395641e-05, "loss": 0.4642, "num_input_tokens_seen": 65047568, "step": 53620 }, { "epoch": 5.972268626795857, "grad_norm": 0.10458918660879135, "learning_rate": 2.0898279282182378e-05, "loss": 0.4722, "num_input_tokens_seen": 65053776, "step": 53625 }, { "epoch": 5.972825481679474, "grad_norm": 0.08252855390310287, "learning_rate": 2.0893485735384496e-05, "loss": 0.4616, "num_input_tokens_seen": 65059376, "step": 53630 }, { "epoch": 5.973382336563092, "grad_norm": 0.10886625200510025, "learning_rate": 2.0888692343743887e-05, "loss": 0.4694, "num_input_tokens_seen": 65065392, "step": 53635 }, { "epoch": 5.973939191446709, "grad_norm": 0.12795551121234894, "learning_rate": 2.0883899107441656e-05, "loss": 0.4584, "num_input_tokens_seen": 65071536, "step": 53640 }, { "epoch": 5.9744960463303265, "grad_norm": 0.09209170937538147, "learning_rate": 2.087910602665891e-05, "loss": 0.4606, "num_input_tokens_seen": 65077456, "step": 53645 }, { "epoch": 5.975052901213943, "grad_norm": 0.09718132764101028, "learning_rate": 2.0874313101576743e-05, "loss": 0.4564, "num_input_tokens_seen": 65083568, "step": 53650 }, { "epoch": 5.975609756097561, "grad_norm": 0.09233516454696655, "learning_rate": 2.0869520332376246e-05, "loss": 0.458, "num_input_tokens_seen": 65089776, "step": 53655 }, { "epoch": 5.976166610981179, "grad_norm": 0.1037711575627327, "learning_rate": 2.0864727719238506e-05, "loss": 0.4636, "num_input_tokens_seen": 65095760, "step": 53660 }, { "epoch": 5.976723465864795, "grad_norm": 0.11563819646835327, "learning_rate": 2.0859935262344605e-05, "loss": 0.4786, "num_input_tokens_seen": 65101072, "step": 53665 }, { "epoch": 5.977280320748413, "grad_norm": 0.08026371896266937, "learning_rate": 2.0855142961875624e-05, "loss": 0.4544, "num_input_tokens_seen": 65107216, "step": 53670 }, { "epoch": 5.977837175632031, "grad_norm": 0.08246046304702759, "learning_rate": 2.0850350818012622e-05, "loss": 0.4591, "num_input_tokens_seen": 65113328, "step": 53675 }, { "epoch": 5.9783940305156476, "grad_norm": 0.1063375473022461, "learning_rate": 2.0845558830936662e-05, "loss": 0.465, "num_input_tokens_seen": 65119824, "step": 53680 }, { "epoch": 5.978950885399265, "grad_norm": 0.11524064838886261, "learning_rate": 2.0840767000828807e-05, "loss": 0.4701, "num_input_tokens_seen": 65125520, "step": 53685 }, { "epoch": 5.979507740282882, "grad_norm": 0.0862174779176712, "learning_rate": 2.08359753278701e-05, "loss": 0.4613, "num_input_tokens_seen": 65131312, "step": 53690 }, { "epoch": 5.9800645951665, "grad_norm": 0.10087569057941437, "learning_rate": 2.0831183812241586e-05, "loss": 0.4673, "num_input_tokens_seen": 65137488, "step": 53695 }, { "epoch": 5.980621450050117, "grad_norm": 0.07619044929742813, "learning_rate": 2.082639245412431e-05, "loss": 0.464, "num_input_tokens_seen": 65143568, "step": 53700 }, { "epoch": 5.981178304933734, "grad_norm": 0.09176269173622131, "learning_rate": 2.0821601253699304e-05, "loss": 0.4585, "num_input_tokens_seen": 65149904, "step": 53705 }, { "epoch": 5.981735159817352, "grad_norm": 0.10981865227222443, "learning_rate": 2.0816810211147595e-05, "loss": 0.4535, "num_input_tokens_seen": 65156496, "step": 53710 }, { "epoch": 5.982292014700969, "grad_norm": 0.10694752633571625, "learning_rate": 2.08120193266502e-05, "loss": 0.4625, "num_input_tokens_seen": 65162576, "step": 53715 }, { "epoch": 5.982848869584586, "grad_norm": 0.1490480601787567, "learning_rate": 2.0807228600388133e-05, "loss": 0.4624, "num_input_tokens_seen": 65168784, "step": 53720 }, { "epoch": 5.983405724468204, "grad_norm": 0.08652445673942566, "learning_rate": 2.0802438032542406e-05, "loss": 0.4579, "num_input_tokens_seen": 65175088, "step": 53725 }, { "epoch": 5.983962579351821, "grad_norm": 0.08878818899393082, "learning_rate": 2.0797647623294027e-05, "loss": 0.4578, "num_input_tokens_seen": 65180880, "step": 53730 }, { "epoch": 5.984519434235438, "grad_norm": 0.11688462644815445, "learning_rate": 2.0792857372823982e-05, "loss": 0.4747, "num_input_tokens_seen": 65186800, "step": 53735 }, { "epoch": 5.985076289119055, "grad_norm": 0.08234548568725586, "learning_rate": 2.0788067281313274e-05, "loss": 0.4585, "num_input_tokens_seen": 65192208, "step": 53740 }, { "epoch": 5.985633144002673, "grad_norm": 0.08682400733232498, "learning_rate": 2.0783277348942878e-05, "loss": 0.4605, "num_input_tokens_seen": 65198544, "step": 53745 }, { "epoch": 5.9861899988862906, "grad_norm": 0.09335026144981384, "learning_rate": 2.077848757589378e-05, "loss": 0.4728, "num_input_tokens_seen": 65204592, "step": 53750 }, { "epoch": 5.986746853769907, "grad_norm": 0.08775648474693298, "learning_rate": 2.077369796234694e-05, "loss": 0.4658, "num_input_tokens_seen": 65210416, "step": 53755 }, { "epoch": 5.987303708653525, "grad_norm": 0.10303986072540283, "learning_rate": 2.0768908508483344e-05, "loss": 0.4608, "num_input_tokens_seen": 65216304, "step": 53760 }, { "epoch": 5.987860563537142, "grad_norm": 0.0959782525897026, "learning_rate": 2.0764119214483943e-05, "loss": 0.4655, "num_input_tokens_seen": 65222448, "step": 53765 }, { "epoch": 5.9884174184207595, "grad_norm": 0.08791051805019379, "learning_rate": 2.0759330080529694e-05, "loss": 0.4624, "num_input_tokens_seen": 65227920, "step": 53770 }, { "epoch": 5.988974273304377, "grad_norm": 0.09239523112773895, "learning_rate": 2.075454110680154e-05, "loss": 0.4617, "num_input_tokens_seen": 65234224, "step": 53775 }, { "epoch": 5.989531128187994, "grad_norm": 0.11258222162723541, "learning_rate": 2.074975229348043e-05, "loss": 0.4669, "num_input_tokens_seen": 65240208, "step": 53780 }, { "epoch": 5.990087983071612, "grad_norm": 0.10240671038627625, "learning_rate": 2.07449636407473e-05, "loss": 0.4665, "num_input_tokens_seen": 65246000, "step": 53785 }, { "epoch": 5.990644837955228, "grad_norm": 0.12083038687705994, "learning_rate": 2.0740175148783075e-05, "loss": 0.4646, "num_input_tokens_seen": 65252080, "step": 53790 }, { "epoch": 5.991201692838846, "grad_norm": 0.10668519139289856, "learning_rate": 2.073538681776869e-05, "loss": 0.471, "num_input_tokens_seen": 65258000, "step": 53795 }, { "epoch": 5.991758547722464, "grad_norm": 0.08280002325773239, "learning_rate": 2.0730598647885055e-05, "loss": 0.4673, "num_input_tokens_seen": 65264368, "step": 53800 }, { "epoch": 5.9923154026060805, "grad_norm": 0.13156872987747192, "learning_rate": 2.0725810639313087e-05, "loss": 0.4757, "num_input_tokens_seen": 65270416, "step": 53805 }, { "epoch": 5.992872257489698, "grad_norm": 0.11307717114686966, "learning_rate": 2.072102279223369e-05, "loss": 0.4532, "num_input_tokens_seen": 65276080, "step": 53810 }, { "epoch": 5.993429112373316, "grad_norm": 0.0869380533695221, "learning_rate": 2.0716235106827764e-05, "loss": 0.4599, "num_input_tokens_seen": 65282256, "step": 53815 }, { "epoch": 5.993985967256933, "grad_norm": 0.08660242706537247, "learning_rate": 2.07114475832762e-05, "loss": 0.4569, "num_input_tokens_seen": 65288496, "step": 53820 }, { "epoch": 5.99454282214055, "grad_norm": 0.09048464149236679, "learning_rate": 2.0706660221759893e-05, "loss": 0.4488, "num_input_tokens_seen": 65294800, "step": 53825 }, { "epoch": 5.995099677024167, "grad_norm": 0.08940439671278, "learning_rate": 2.070187302245972e-05, "loss": 0.4539, "num_input_tokens_seen": 65300848, "step": 53830 }, { "epoch": 5.995656531907785, "grad_norm": 0.08176513761281967, "learning_rate": 2.069708598555656e-05, "loss": 0.4704, "num_input_tokens_seen": 65306960, "step": 53835 }, { "epoch": 5.9962133867914025, "grad_norm": 0.05418793857097626, "learning_rate": 2.0692299111231285e-05, "loss": 0.462, "num_input_tokens_seen": 65312944, "step": 53840 }, { "epoch": 5.996770241675019, "grad_norm": 0.10211742669343948, "learning_rate": 2.068751239966475e-05, "loss": 0.4607, "num_input_tokens_seen": 65318704, "step": 53845 }, { "epoch": 5.997327096558637, "grad_norm": 0.09606809169054031, "learning_rate": 2.0682725851037817e-05, "loss": 0.4602, "num_input_tokens_seen": 65324368, "step": 53850 }, { "epoch": 5.997883951442255, "grad_norm": 0.14677129685878754, "learning_rate": 2.0677939465531328e-05, "loss": 0.4618, "num_input_tokens_seen": 65330544, "step": 53855 }, { "epoch": 5.998440806325871, "grad_norm": 0.09162964671850204, "learning_rate": 2.0673153243326147e-05, "loss": 0.4569, "num_input_tokens_seen": 65336496, "step": 53860 }, { "epoch": 5.998997661209489, "grad_norm": 0.08783715963363647, "learning_rate": 2.0668367184603102e-05, "loss": 0.4659, "num_input_tokens_seen": 65342384, "step": 53865 }, { "epoch": 5.999554516093106, "grad_norm": 0.12060566991567612, "learning_rate": 2.0663581289543025e-05, "loss": 0.4591, "num_input_tokens_seen": 65347824, "step": 53870 }, { "epoch": 6.0001113709767235, "grad_norm": 0.12206387519836426, "learning_rate": 2.0658795558326743e-05, "loss": 0.4666, "num_input_tokens_seen": 65353248, "step": 53875 }, { "epoch": 6.000668225860341, "grad_norm": 0.07287310063838959, "learning_rate": 2.065400999113508e-05, "loss": 0.4624, "num_input_tokens_seen": 65358976, "step": 53880 }, { "epoch": 6.000668225860341, "eval_loss": 0.4643497169017792, "eval_runtime": 113.1537, "eval_samples_per_second": 35.271, "eval_steps_per_second": 8.82, "num_input_tokens_seen": 65358976, "step": 53880 }, { "epoch": 6.001225080743958, "grad_norm": 0.09259433299303055, "learning_rate": 2.0649224588148834e-05, "loss": 0.4621, "num_input_tokens_seen": 65365024, "step": 53885 }, { "epoch": 6.001781935627576, "grad_norm": 0.11521313339471817, "learning_rate": 2.064443934954884e-05, "loss": 0.4702, "num_input_tokens_seen": 65370720, "step": 53890 }, { "epoch": 6.0023387905111925, "grad_norm": 0.09329560399055481, "learning_rate": 2.0639654275515883e-05, "loss": 0.4593, "num_input_tokens_seen": 65376960, "step": 53895 }, { "epoch": 6.00289564539481, "grad_norm": 0.10999895632266998, "learning_rate": 2.0634869366230758e-05, "loss": 0.4578, "num_input_tokens_seen": 65383040, "step": 53900 }, { "epoch": 6.003452500278428, "grad_norm": 0.08793909847736359, "learning_rate": 2.063008462187426e-05, "loss": 0.4642, "num_input_tokens_seen": 65389024, "step": 53905 }, { "epoch": 6.004009355162045, "grad_norm": 0.10951100289821625, "learning_rate": 2.0625300042627167e-05, "loss": 0.4681, "num_input_tokens_seen": 65394976, "step": 53910 }, { "epoch": 6.004566210045662, "grad_norm": 0.10749966651201248, "learning_rate": 2.062051562867026e-05, "loss": 0.46, "num_input_tokens_seen": 65400704, "step": 53915 }, { "epoch": 6.005123064929279, "grad_norm": 0.1039823442697525, "learning_rate": 2.06157313801843e-05, "loss": 0.4444, "num_input_tokens_seen": 65406944, "step": 53920 }, { "epoch": 6.005679919812897, "grad_norm": 0.0750274732708931, "learning_rate": 2.0610947297350064e-05, "loss": 0.4721, "num_input_tokens_seen": 65412928, "step": 53925 }, { "epoch": 6.006236774696514, "grad_norm": 0.09266164153814316, "learning_rate": 2.0606163380348305e-05, "loss": 0.4702, "num_input_tokens_seen": 65419200, "step": 53930 }, { "epoch": 6.006793629580131, "grad_norm": 0.11984285712242126, "learning_rate": 2.0601379629359773e-05, "loss": 0.4612, "num_input_tokens_seen": 65425344, "step": 53935 }, { "epoch": 6.007350484463749, "grad_norm": 0.11203990876674652, "learning_rate": 2.0596596044565217e-05, "loss": 0.4693, "num_input_tokens_seen": 65431328, "step": 53940 }, { "epoch": 6.007907339347366, "grad_norm": 0.14812198281288147, "learning_rate": 2.0591812626145374e-05, "loss": 0.459, "num_input_tokens_seen": 65436608, "step": 53945 }, { "epoch": 6.008464194230983, "grad_norm": 0.07496467977762222, "learning_rate": 2.058702937428097e-05, "loss": 0.4561, "num_input_tokens_seen": 65442816, "step": 53950 }, { "epoch": 6.009021049114601, "grad_norm": 0.09612222015857697, "learning_rate": 2.0582246289152742e-05, "loss": 0.465, "num_input_tokens_seen": 65448896, "step": 53955 }, { "epoch": 6.009577903998218, "grad_norm": 0.0817769393324852, "learning_rate": 2.057746337094141e-05, "loss": 0.4439, "num_input_tokens_seen": 65454784, "step": 53960 }, { "epoch": 6.0101347588818355, "grad_norm": 0.12529411911964417, "learning_rate": 2.0572680619827685e-05, "loss": 0.4675, "num_input_tokens_seen": 65460576, "step": 53965 }, { "epoch": 6.010691613765453, "grad_norm": 0.06996975839138031, "learning_rate": 2.056789803599227e-05, "loss": 0.4597, "num_input_tokens_seen": 65466656, "step": 53970 }, { "epoch": 6.01124846864907, "grad_norm": 0.09369030594825745, "learning_rate": 2.056311561961587e-05, "loss": 0.4695, "num_input_tokens_seen": 65472800, "step": 53975 }, { "epoch": 6.011805323532688, "grad_norm": 0.08209969103336334, "learning_rate": 2.055833337087918e-05, "loss": 0.4679, "num_input_tokens_seen": 65478560, "step": 53980 }, { "epoch": 6.012362178416304, "grad_norm": 0.09043397009372711, "learning_rate": 2.0553551289962893e-05, "loss": 0.4474, "num_input_tokens_seen": 65484096, "step": 53985 }, { "epoch": 6.012919033299922, "grad_norm": 0.09593334048986435, "learning_rate": 2.0548769377047698e-05, "loss": 0.4663, "num_input_tokens_seen": 65490048, "step": 53990 }, { "epoch": 6.01347588818354, "grad_norm": 0.09527649730443954, "learning_rate": 2.054398763231425e-05, "loss": 0.4526, "num_input_tokens_seen": 65496224, "step": 53995 }, { "epoch": 6.0140327430671565, "grad_norm": 0.11391784995794296, "learning_rate": 2.0539206055943224e-05, "loss": 0.4614, "num_input_tokens_seen": 65501792, "step": 54000 }, { "epoch": 6.014589597950774, "grad_norm": 0.09744954854249954, "learning_rate": 2.0534424648115293e-05, "loss": 0.4662, "num_input_tokens_seen": 65507968, "step": 54005 }, { "epoch": 6.015146452834391, "grad_norm": 0.09751801937818527, "learning_rate": 2.052964340901111e-05, "loss": 0.465, "num_input_tokens_seen": 65514272, "step": 54010 }, { "epoch": 6.015703307718009, "grad_norm": 0.09126961976289749, "learning_rate": 2.0524862338811326e-05, "loss": 0.4621, "num_input_tokens_seen": 65520384, "step": 54015 }, { "epoch": 6.016260162601626, "grad_norm": 0.09186393767595291, "learning_rate": 2.0520081437696583e-05, "loss": 0.4553, "num_input_tokens_seen": 65526304, "step": 54020 }, { "epoch": 6.016817017485243, "grad_norm": 0.07645701617002487, "learning_rate": 2.051530070584752e-05, "loss": 0.4582, "num_input_tokens_seen": 65532576, "step": 54025 }, { "epoch": 6.017373872368861, "grad_norm": 0.09140877425670624, "learning_rate": 2.0510520143444766e-05, "loss": 0.4631, "num_input_tokens_seen": 65538496, "step": 54030 }, { "epoch": 6.017930727252478, "grad_norm": 0.08259232342243195, "learning_rate": 2.0505739750668946e-05, "loss": 0.4634, "num_input_tokens_seen": 65544672, "step": 54035 }, { "epoch": 6.018487582136095, "grad_norm": 0.08648645132780075, "learning_rate": 2.0500959527700686e-05, "loss": 0.4659, "num_input_tokens_seen": 65550816, "step": 54040 }, { "epoch": 6.019044437019713, "grad_norm": 0.08281707763671875, "learning_rate": 2.0496179474720592e-05, "loss": 0.4643, "num_input_tokens_seen": 65556992, "step": 54045 }, { "epoch": 6.01960129190333, "grad_norm": 0.09375516325235367, "learning_rate": 2.0491399591909272e-05, "loss": 0.4548, "num_input_tokens_seen": 65562944, "step": 54050 }, { "epoch": 6.020158146786947, "grad_norm": 0.08860929310321808, "learning_rate": 2.0486619879447323e-05, "loss": 0.4582, "num_input_tokens_seen": 65568704, "step": 54055 }, { "epoch": 6.020715001670565, "grad_norm": 0.09686073660850525, "learning_rate": 2.048184033751534e-05, "loss": 0.4522, "num_input_tokens_seen": 65574304, "step": 54060 }, { "epoch": 6.021271856554182, "grad_norm": 0.0758780837059021, "learning_rate": 2.0477060966293897e-05, "loss": 0.4654, "num_input_tokens_seen": 65580768, "step": 54065 }, { "epoch": 6.0218287114377995, "grad_norm": 0.06586339324712753, "learning_rate": 2.0472281765963595e-05, "loss": 0.4586, "num_input_tokens_seen": 65586880, "step": 54070 }, { "epoch": 6.022385566321416, "grad_norm": 0.08945363014936447, "learning_rate": 2.0467502736705e-05, "loss": 0.4645, "num_input_tokens_seen": 65593184, "step": 54075 }, { "epoch": 6.022942421205034, "grad_norm": 0.09978388994932175, "learning_rate": 2.0462723878698674e-05, "loss": 0.4531, "num_input_tokens_seen": 65599360, "step": 54080 }, { "epoch": 6.023499276088652, "grad_norm": 0.1111343502998352, "learning_rate": 2.045794519212518e-05, "loss": 0.4529, "num_input_tokens_seen": 65605056, "step": 54085 }, { "epoch": 6.024056130972268, "grad_norm": 0.08046023547649384, "learning_rate": 2.0453166677165073e-05, "loss": 0.4621, "num_input_tokens_seen": 65610912, "step": 54090 }, { "epoch": 6.024612985855886, "grad_norm": 0.08320373296737671, "learning_rate": 2.0448388333998894e-05, "loss": 0.4743, "num_input_tokens_seen": 65617152, "step": 54095 }, { "epoch": 6.025169840739503, "grad_norm": 0.08452440798282623, "learning_rate": 2.0443610162807196e-05, "loss": 0.4593, "num_input_tokens_seen": 65623456, "step": 54100 }, { "epoch": 6.025726695623121, "grad_norm": 0.13162504136562347, "learning_rate": 2.043883216377051e-05, "loss": 0.477, "num_input_tokens_seen": 65629824, "step": 54105 }, { "epoch": 6.026283550506738, "grad_norm": 0.10372473299503326, "learning_rate": 2.0434054337069357e-05, "loss": 0.4603, "num_input_tokens_seen": 65636352, "step": 54110 }, { "epoch": 6.026840405390355, "grad_norm": 0.10443401336669922, "learning_rate": 2.042927668288427e-05, "loss": 0.4655, "num_input_tokens_seen": 65642400, "step": 54115 }, { "epoch": 6.027397260273973, "grad_norm": 0.10450286418199539, "learning_rate": 2.0424499201395752e-05, "loss": 0.459, "num_input_tokens_seen": 65648384, "step": 54120 }, { "epoch": 6.0279541151575895, "grad_norm": 0.09638654440641403, "learning_rate": 2.041972189278432e-05, "loss": 0.4763, "num_input_tokens_seen": 65654016, "step": 54125 }, { "epoch": 6.028510970041207, "grad_norm": 0.1095629408955574, "learning_rate": 2.0414944757230464e-05, "loss": 0.4674, "num_input_tokens_seen": 65660032, "step": 54130 }, { "epoch": 6.029067824924825, "grad_norm": 0.1371595561504364, "learning_rate": 2.0410167794914698e-05, "loss": 0.4701, "num_input_tokens_seen": 65665472, "step": 54135 }, { "epoch": 6.029624679808442, "grad_norm": 0.09290162473917007, "learning_rate": 2.04053910060175e-05, "loss": 0.4537, "num_input_tokens_seen": 65671104, "step": 54140 }, { "epoch": 6.030181534692059, "grad_norm": 0.09904374182224274, "learning_rate": 2.0400614390719352e-05, "loss": 0.4631, "num_input_tokens_seen": 65677408, "step": 54145 }, { "epoch": 6.030738389575677, "grad_norm": 0.07023509591817856, "learning_rate": 2.0395837949200733e-05, "loss": 0.467, "num_input_tokens_seen": 65683264, "step": 54150 }, { "epoch": 6.031295244459294, "grad_norm": 0.10363312810659409, "learning_rate": 2.0391061681642113e-05, "loss": 0.4539, "num_input_tokens_seen": 65689504, "step": 54155 }, { "epoch": 6.031852099342911, "grad_norm": 0.10034999996423721, "learning_rate": 2.0386285588223945e-05, "loss": 0.4648, "num_input_tokens_seen": 65695808, "step": 54160 }, { "epoch": 6.032408954226528, "grad_norm": 0.11009285598993301, "learning_rate": 2.03815096691267e-05, "loss": 0.4599, "num_input_tokens_seen": 65702176, "step": 54165 }, { "epoch": 6.032965809110146, "grad_norm": 0.10879557579755783, "learning_rate": 2.037673392453082e-05, "loss": 0.4613, "num_input_tokens_seen": 65707744, "step": 54170 }, { "epoch": 6.033522663993764, "grad_norm": 0.08940379321575165, "learning_rate": 2.0371958354616746e-05, "loss": 0.4623, "num_input_tokens_seen": 65714016, "step": 54175 }, { "epoch": 6.03407951887738, "grad_norm": 0.10127659142017365, "learning_rate": 2.036718295956492e-05, "loss": 0.4568, "num_input_tokens_seen": 65719712, "step": 54180 }, { "epoch": 6.034636373760998, "grad_norm": 0.06930019706487656, "learning_rate": 2.0362407739555768e-05, "loss": 0.4626, "num_input_tokens_seen": 65725984, "step": 54185 }, { "epoch": 6.035193228644615, "grad_norm": 0.08338526636362076, "learning_rate": 2.0357632694769712e-05, "loss": 0.4728, "num_input_tokens_seen": 65732416, "step": 54190 }, { "epoch": 6.0357500835282325, "grad_norm": 0.08959858864545822, "learning_rate": 2.0352857825387165e-05, "loss": 0.4564, "num_input_tokens_seen": 65738592, "step": 54195 }, { "epoch": 6.03630693841185, "grad_norm": 0.10975915193557739, "learning_rate": 2.0348083131588546e-05, "loss": 0.4573, "num_input_tokens_seen": 65744672, "step": 54200 }, { "epoch": 6.036863793295467, "grad_norm": 0.12080024927854538, "learning_rate": 2.034330861355426e-05, "loss": 0.4662, "num_input_tokens_seen": 65750624, "step": 54205 }, { "epoch": 6.037420648179085, "grad_norm": 0.08288851380348206, "learning_rate": 2.0338534271464686e-05, "loss": 0.4626, "num_input_tokens_seen": 65756736, "step": 54210 }, { "epoch": 6.037977503062701, "grad_norm": 0.08358237147331238, "learning_rate": 2.0333760105500236e-05, "loss": 0.455, "num_input_tokens_seen": 65762944, "step": 54215 }, { "epoch": 6.038534357946319, "grad_norm": 0.09654708206653595, "learning_rate": 2.032898611584128e-05, "loss": 0.4665, "num_input_tokens_seen": 65769216, "step": 54220 }, { "epoch": 6.039091212829937, "grad_norm": 0.08822517842054367, "learning_rate": 2.032421230266819e-05, "loss": 0.4698, "num_input_tokens_seen": 65775360, "step": 54225 }, { "epoch": 6.039648067713554, "grad_norm": 0.10098717361688614, "learning_rate": 2.031943866616135e-05, "loss": 0.4697, "num_input_tokens_seen": 65781792, "step": 54230 }, { "epoch": 6.040204922597171, "grad_norm": 0.08665476739406586, "learning_rate": 2.0314665206501118e-05, "loss": 0.4647, "num_input_tokens_seen": 65787968, "step": 54235 }, { "epoch": 6.040761777480789, "grad_norm": 0.10261625051498413, "learning_rate": 2.030989192386785e-05, "loss": 0.463, "num_input_tokens_seen": 65794144, "step": 54240 }, { "epoch": 6.041318632364406, "grad_norm": 0.08421634882688522, "learning_rate": 2.030511881844189e-05, "loss": 0.4743, "num_input_tokens_seen": 65800224, "step": 54245 }, { "epoch": 6.041875487248023, "grad_norm": 0.09011666476726532, "learning_rate": 2.0300345890403595e-05, "loss": 0.4642, "num_input_tokens_seen": 65806048, "step": 54250 }, { "epoch": 6.04243234213164, "grad_norm": 0.07933692634105682, "learning_rate": 2.0295573139933287e-05, "loss": 0.4684, "num_input_tokens_seen": 65812352, "step": 54255 }, { "epoch": 6.042989197015258, "grad_norm": 0.08444157242774963, "learning_rate": 2.02908005672113e-05, "loss": 0.4609, "num_input_tokens_seen": 65817888, "step": 54260 }, { "epoch": 6.0435460518988755, "grad_norm": 0.10259595513343811, "learning_rate": 2.0286028172417963e-05, "loss": 0.4705, "num_input_tokens_seen": 65824128, "step": 54265 }, { "epoch": 6.044102906782492, "grad_norm": 0.07914681732654572, "learning_rate": 2.0281255955733585e-05, "loss": 0.4503, "num_input_tokens_seen": 65829984, "step": 54270 }, { "epoch": 6.04465976166611, "grad_norm": 0.08783995360136032, "learning_rate": 2.0276483917338486e-05, "loss": 0.4476, "num_input_tokens_seen": 65835904, "step": 54275 }, { "epoch": 6.045216616549727, "grad_norm": 0.07844456285238266, "learning_rate": 2.0271712057412958e-05, "loss": 0.4617, "num_input_tokens_seen": 65842016, "step": 54280 }, { "epoch": 6.045773471433344, "grad_norm": 0.07933232188224792, "learning_rate": 2.0266940376137296e-05, "loss": 0.4686, "num_input_tokens_seen": 65848384, "step": 54285 }, { "epoch": 6.046330326316962, "grad_norm": 0.10439946502447128, "learning_rate": 2.0262168873691794e-05, "loss": 0.4716, "num_input_tokens_seen": 65854912, "step": 54290 }, { "epoch": 6.046887181200579, "grad_norm": 0.0878821387887001, "learning_rate": 2.025739755025674e-05, "loss": 0.4583, "num_input_tokens_seen": 65861120, "step": 54295 }, { "epoch": 6.047444036084197, "grad_norm": 0.08606583625078201, "learning_rate": 2.0252626406012407e-05, "loss": 0.4707, "num_input_tokens_seen": 65867264, "step": 54300 }, { "epoch": 6.048000890967813, "grad_norm": 0.0988360345363617, "learning_rate": 2.0247855441139062e-05, "loss": 0.4645, "num_input_tokens_seen": 65873248, "step": 54305 }, { "epoch": 6.048557745851431, "grad_norm": 0.10088098794221878, "learning_rate": 2.0243084655816963e-05, "loss": 0.4603, "num_input_tokens_seen": 65879104, "step": 54310 }, { "epoch": 6.049114600735049, "grad_norm": 0.0815482959151268, "learning_rate": 2.0238314050226374e-05, "loss": 0.466, "num_input_tokens_seen": 65885440, "step": 54315 }, { "epoch": 6.0496714556186655, "grad_norm": 0.10748127847909927, "learning_rate": 2.0233543624547532e-05, "loss": 0.4619, "num_input_tokens_seen": 65891904, "step": 54320 }, { "epoch": 6.050228310502283, "grad_norm": 0.09810326248407364, "learning_rate": 2.0228773378960695e-05, "loss": 0.4518, "num_input_tokens_seen": 65898048, "step": 54325 }, { "epoch": 6.050785165385901, "grad_norm": 0.07427090406417847, "learning_rate": 2.0224003313646088e-05, "loss": 0.4546, "num_input_tokens_seen": 65904192, "step": 54330 }, { "epoch": 6.051342020269518, "grad_norm": 0.09170979261398315, "learning_rate": 2.021923342878394e-05, "loss": 0.4607, "num_input_tokens_seen": 65910048, "step": 54335 }, { "epoch": 6.051898875153135, "grad_norm": 0.0958935096859932, "learning_rate": 2.0214463724554477e-05, "loss": 0.4646, "num_input_tokens_seen": 65916416, "step": 54340 }, { "epoch": 6.052455730036752, "grad_norm": 0.08962440490722656, "learning_rate": 2.0209694201137908e-05, "loss": 0.4573, "num_input_tokens_seen": 65922144, "step": 54345 }, { "epoch": 6.05301258492037, "grad_norm": 0.114597849547863, "learning_rate": 2.0204924858714442e-05, "loss": 0.4634, "num_input_tokens_seen": 65928480, "step": 54350 }, { "epoch": 6.053569439803987, "grad_norm": 0.10625771433115005, "learning_rate": 2.020015569746428e-05, "loss": 0.465, "num_input_tokens_seen": 65934272, "step": 54355 }, { "epoch": 6.054126294687604, "grad_norm": 0.09110787510871887, "learning_rate": 2.019538671756762e-05, "loss": 0.4519, "num_input_tokens_seen": 65940448, "step": 54360 }, { "epoch": 6.054683149571222, "grad_norm": 0.08136725425720215, "learning_rate": 2.0190617919204646e-05, "loss": 0.468, "num_input_tokens_seen": 65946400, "step": 54365 }, { "epoch": 6.055240004454839, "grad_norm": 0.10430997610092163, "learning_rate": 2.0185849302555544e-05, "loss": 0.472, "num_input_tokens_seen": 65952640, "step": 54370 }, { "epoch": 6.055796859338456, "grad_norm": 0.09590929746627808, "learning_rate": 2.0181080867800478e-05, "loss": 0.4584, "num_input_tokens_seen": 65958880, "step": 54375 }, { "epoch": 6.056353714222074, "grad_norm": 0.09749681502580643, "learning_rate": 2.017631261511962e-05, "loss": 0.4616, "num_input_tokens_seen": 65964864, "step": 54380 }, { "epoch": 6.056910569105691, "grad_norm": 0.10432031750679016, "learning_rate": 2.0171544544693128e-05, "loss": 0.4593, "num_input_tokens_seen": 65971200, "step": 54385 }, { "epoch": 6.0574674239893085, "grad_norm": 0.08746597170829773, "learning_rate": 2.016677665670116e-05, "loss": 0.462, "num_input_tokens_seen": 65977408, "step": 54390 }, { "epoch": 6.058024278872926, "grad_norm": 0.0848870724439621, "learning_rate": 2.016200895132386e-05, "loss": 0.4626, "num_input_tokens_seen": 65983904, "step": 54395 }, { "epoch": 6.058581133756543, "grad_norm": 0.10251105576753616, "learning_rate": 2.0157241428741362e-05, "loss": 0.4697, "num_input_tokens_seen": 65990208, "step": 54400 }, { "epoch": 6.059137988640161, "grad_norm": 0.08031564950942993, "learning_rate": 2.0152474089133812e-05, "loss": 0.4614, "num_input_tokens_seen": 65996352, "step": 54405 }, { "epoch": 6.059694843523777, "grad_norm": 0.13590647280216217, "learning_rate": 2.014770693268132e-05, "loss": 0.4643, "num_input_tokens_seen": 66002016, "step": 54410 }, { "epoch": 6.060251698407395, "grad_norm": 0.084414541721344, "learning_rate": 2.014293995956401e-05, "loss": 0.4642, "num_input_tokens_seen": 66008032, "step": 54415 }, { "epoch": 6.060808553291013, "grad_norm": 0.10053768008947372, "learning_rate": 2.013817316996199e-05, "loss": 0.4658, "num_input_tokens_seen": 66013984, "step": 54420 }, { "epoch": 6.0613654081746295, "grad_norm": 0.09533223509788513, "learning_rate": 2.0133406564055373e-05, "loss": 0.4568, "num_input_tokens_seen": 66020032, "step": 54425 }, { "epoch": 6.061922263058247, "grad_norm": 0.1195899024605751, "learning_rate": 2.012864014202425e-05, "loss": 0.4787, "num_input_tokens_seen": 66026400, "step": 54430 }, { "epoch": 6.062479117941864, "grad_norm": 0.08912337571382523, "learning_rate": 2.0123873904048705e-05, "loss": 0.464, "num_input_tokens_seen": 66032480, "step": 54435 }, { "epoch": 6.063035972825482, "grad_norm": 0.09284844994544983, "learning_rate": 2.0119107850308838e-05, "loss": 0.4556, "num_input_tokens_seen": 66038528, "step": 54440 }, { "epoch": 6.063592827709099, "grad_norm": 0.10038802027702332, "learning_rate": 2.011434198098472e-05, "loss": 0.4575, "num_input_tokens_seen": 66044640, "step": 54445 }, { "epoch": 6.064149682592716, "grad_norm": 0.09205795079469681, "learning_rate": 2.0109576296256416e-05, "loss": 0.4538, "num_input_tokens_seen": 66050592, "step": 54450 }, { "epoch": 6.064706537476334, "grad_norm": 0.07563552260398865, "learning_rate": 2.010481079630399e-05, "loss": 0.4619, "num_input_tokens_seen": 66056384, "step": 54455 }, { "epoch": 6.065263392359951, "grad_norm": 0.10066521167755127, "learning_rate": 2.0100045481307506e-05, "loss": 0.4622, "num_input_tokens_seen": 66062208, "step": 54460 }, { "epoch": 6.065820247243568, "grad_norm": 0.08611464500427246, "learning_rate": 2.0095280351447e-05, "loss": 0.4644, "num_input_tokens_seen": 66067744, "step": 54465 }, { "epoch": 6.066377102127186, "grad_norm": 0.10154745727777481, "learning_rate": 2.009051540690252e-05, "loss": 0.4758, "num_input_tokens_seen": 66073504, "step": 54470 }, { "epoch": 6.066933957010803, "grad_norm": 0.08532167226076126, "learning_rate": 2.0085750647854102e-05, "loss": 0.4629, "num_input_tokens_seen": 66079776, "step": 54475 }, { "epoch": 6.06749081189442, "grad_norm": 0.10135269165039062, "learning_rate": 2.0080986074481773e-05, "loss": 0.4632, "num_input_tokens_seen": 66085888, "step": 54480 }, { "epoch": 6.068047666778037, "grad_norm": 0.10022548586130142, "learning_rate": 2.0076221686965557e-05, "loss": 0.4593, "num_input_tokens_seen": 66092288, "step": 54485 }, { "epoch": 6.068604521661655, "grad_norm": 0.09348133951425552, "learning_rate": 2.0071457485485463e-05, "loss": 0.4581, "num_input_tokens_seen": 66098464, "step": 54490 }, { "epoch": 6.0691613765452725, "grad_norm": 0.08390861749649048, "learning_rate": 2.00666934702215e-05, "loss": 0.4569, "num_input_tokens_seen": 66104480, "step": 54495 }, { "epoch": 6.069718231428889, "grad_norm": 0.08674317598342896, "learning_rate": 2.006192964135366e-05, "loss": 0.4705, "num_input_tokens_seen": 66110720, "step": 54500 }, { "epoch": 6.070275086312507, "grad_norm": 0.0953768938779831, "learning_rate": 2.0057165999061954e-05, "loss": 0.4655, "num_input_tokens_seen": 66116960, "step": 54505 }, { "epoch": 6.070831941196125, "grad_norm": 0.09670094400644302, "learning_rate": 2.0052402543526348e-05, "loss": 0.465, "num_input_tokens_seen": 66123360, "step": 54510 }, { "epoch": 6.0713887960797415, "grad_norm": 0.09536929428577423, "learning_rate": 2.0047639274926838e-05, "loss": 0.4661, "num_input_tokens_seen": 66129440, "step": 54515 }, { "epoch": 6.071945650963359, "grad_norm": 0.06996823847293854, "learning_rate": 2.0042876193443384e-05, "loss": 0.4573, "num_input_tokens_seen": 66135328, "step": 54520 }, { "epoch": 6.072502505846976, "grad_norm": 0.10287284106016159, "learning_rate": 2.003811329925595e-05, "loss": 0.4503, "num_input_tokens_seen": 66141920, "step": 54525 }, { "epoch": 6.073059360730594, "grad_norm": 0.09419921040534973, "learning_rate": 2.0033350592544497e-05, "loss": 0.464, "num_input_tokens_seen": 66148256, "step": 54530 }, { "epoch": 6.073616215614211, "grad_norm": 0.10047335177659988, "learning_rate": 2.0028588073488975e-05, "loss": 0.4613, "num_input_tokens_seen": 66154688, "step": 54535 }, { "epoch": 6.074173070497828, "grad_norm": 0.07737234979867935, "learning_rate": 2.0023825742269325e-05, "loss": 0.4718, "num_input_tokens_seen": 66160672, "step": 54540 }, { "epoch": 6.074729925381446, "grad_norm": 0.09871520102024078, "learning_rate": 2.001906359906549e-05, "loss": 0.47, "num_input_tokens_seen": 66166080, "step": 54545 }, { "epoch": 6.0752867802650625, "grad_norm": 0.09684320539236069, "learning_rate": 2.0014301644057397e-05, "loss": 0.4513, "num_input_tokens_seen": 66172224, "step": 54550 }, { "epoch": 6.07584363514868, "grad_norm": 0.08022449910640717, "learning_rate": 2.000953987742496e-05, "loss": 0.4737, "num_input_tokens_seen": 66178240, "step": 54555 }, { "epoch": 6.076400490032298, "grad_norm": 0.09452236443758011, "learning_rate": 2.00047782993481e-05, "loss": 0.464, "num_input_tokens_seen": 66184384, "step": 54560 }, { "epoch": 6.076957344915915, "grad_norm": 0.08309360593557358, "learning_rate": 2.0000016910006722e-05, "loss": 0.4605, "num_input_tokens_seen": 66190656, "step": 54565 }, { "epoch": 6.077514199799532, "grad_norm": 0.097904272377491, "learning_rate": 1.999525570958073e-05, "loss": 0.463, "num_input_tokens_seen": 66196960, "step": 54570 }, { "epoch": 6.07807105468315, "grad_norm": 0.08742021769285202, "learning_rate": 1.999049469825002e-05, "loss": 0.4629, "num_input_tokens_seen": 66203168, "step": 54575 }, { "epoch": 6.078627909566767, "grad_norm": 0.08124706149101257, "learning_rate": 1.9985733876194466e-05, "loss": 0.4611, "num_input_tokens_seen": 66209664, "step": 54580 }, { "epoch": 6.0791847644503845, "grad_norm": 0.09159722179174423, "learning_rate": 1.9980973243593963e-05, "loss": 0.4553, "num_input_tokens_seen": 66215488, "step": 54585 }, { "epoch": 6.079741619334001, "grad_norm": 0.10488100349903107, "learning_rate": 1.9976212800628376e-05, "loss": 0.4545, "num_input_tokens_seen": 66221312, "step": 54590 }, { "epoch": 6.080298474217619, "grad_norm": 0.10637631267309189, "learning_rate": 1.997145254747756e-05, "loss": 0.4743, "num_input_tokens_seen": 66227584, "step": 54595 }, { "epoch": 6.080855329101237, "grad_norm": 0.11081255227327347, "learning_rate": 1.9966692484321386e-05, "loss": 0.4558, "num_input_tokens_seen": 66233888, "step": 54600 }, { "epoch": 6.081412183984853, "grad_norm": 0.08430548012256622, "learning_rate": 1.99619326113397e-05, "loss": 0.4687, "num_input_tokens_seen": 66239968, "step": 54605 }, { "epoch": 6.081969038868471, "grad_norm": 0.0889817401766777, "learning_rate": 1.9957172928712347e-05, "loss": 0.4569, "num_input_tokens_seen": 66245664, "step": 54610 }, { "epoch": 6.082525893752088, "grad_norm": 0.06724462658166885, "learning_rate": 1.995241343661916e-05, "loss": 0.4692, "num_input_tokens_seen": 66252032, "step": 54615 }, { "epoch": 6.0830827486357055, "grad_norm": 0.12084411084651947, "learning_rate": 1.9947654135239967e-05, "loss": 0.4563, "num_input_tokens_seen": 66258272, "step": 54620 }, { "epoch": 6.083639603519323, "grad_norm": 0.12139525264501572, "learning_rate": 1.99428950247546e-05, "loss": 0.4618, "num_input_tokens_seen": 66264320, "step": 54625 }, { "epoch": 6.08419645840294, "grad_norm": 0.09892895817756653, "learning_rate": 1.993813610534285e-05, "loss": 0.4614, "num_input_tokens_seen": 66270496, "step": 54630 }, { "epoch": 6.084753313286558, "grad_norm": 0.11291877180337906, "learning_rate": 1.993337737718455e-05, "loss": 0.4692, "num_input_tokens_seen": 66276640, "step": 54635 }, { "epoch": 6.0853101681701744, "grad_norm": 0.12163859605789185, "learning_rate": 1.992861884045949e-05, "loss": 0.4559, "num_input_tokens_seen": 66282784, "step": 54640 }, { "epoch": 6.085867023053792, "grad_norm": 0.08813684433698654, "learning_rate": 1.992386049534746e-05, "loss": 0.4656, "num_input_tokens_seen": 66289056, "step": 54645 }, { "epoch": 6.08642387793741, "grad_norm": 0.06319102644920349, "learning_rate": 1.991910234202825e-05, "loss": 0.46, "num_input_tokens_seen": 66294784, "step": 54650 }, { "epoch": 6.086980732821027, "grad_norm": 0.096035897731781, "learning_rate": 1.9914344380681635e-05, "loss": 0.4717, "num_input_tokens_seen": 66301024, "step": 54655 }, { "epoch": 6.087537587704644, "grad_norm": 0.05793267861008644, "learning_rate": 1.990958661148738e-05, "loss": 0.461, "num_input_tokens_seen": 66307104, "step": 54660 }, { "epoch": 6.088094442588262, "grad_norm": 0.09190843999385834, "learning_rate": 1.9904829034625262e-05, "loss": 0.4604, "num_input_tokens_seen": 66313376, "step": 54665 }, { "epoch": 6.088651297471879, "grad_norm": 0.11765643209218979, "learning_rate": 1.9900071650275034e-05, "loss": 0.4554, "num_input_tokens_seen": 66319264, "step": 54670 }, { "epoch": 6.089208152355496, "grad_norm": 0.0856609046459198, "learning_rate": 1.989531445861644e-05, "loss": 0.4544, "num_input_tokens_seen": 66324800, "step": 54675 }, { "epoch": 6.089765007239113, "grad_norm": 0.09943288564682007, "learning_rate": 1.9890557459829225e-05, "loss": 0.4613, "num_input_tokens_seen": 66330656, "step": 54680 }, { "epoch": 6.090321862122731, "grad_norm": 0.10588589310646057, "learning_rate": 1.988580065409312e-05, "loss": 0.4617, "num_input_tokens_seen": 66336352, "step": 54685 }, { "epoch": 6.0908787170063485, "grad_norm": 0.08384134620428085, "learning_rate": 1.9881044041587863e-05, "loss": 0.4731, "num_input_tokens_seen": 66342464, "step": 54690 }, { "epoch": 6.091435571889965, "grad_norm": 0.09990852326154709, "learning_rate": 1.9876287622493155e-05, "loss": 0.4445, "num_input_tokens_seen": 66348544, "step": 54695 }, { "epoch": 6.091992426773583, "grad_norm": 0.10437043756246567, "learning_rate": 1.987153139698873e-05, "loss": 0.4638, "num_input_tokens_seen": 66354624, "step": 54700 }, { "epoch": 6.0925492816572, "grad_norm": 0.08593108505010605, "learning_rate": 1.986677536525428e-05, "loss": 0.4546, "num_input_tokens_seen": 66360960, "step": 54705 }, { "epoch": 6.0931061365408175, "grad_norm": 0.13468295335769653, "learning_rate": 1.986201952746951e-05, "loss": 0.4708, "num_input_tokens_seen": 66367552, "step": 54710 }, { "epoch": 6.093662991424435, "grad_norm": 0.08827153593301773, "learning_rate": 1.9857263883814105e-05, "loss": 0.4578, "num_input_tokens_seen": 66373792, "step": 54715 }, { "epoch": 6.094219846308052, "grad_norm": 0.13587170839309692, "learning_rate": 1.9852508434467754e-05, "loss": 0.4669, "num_input_tokens_seen": 66379616, "step": 54720 }, { "epoch": 6.09477670119167, "grad_norm": 0.08563583344221115, "learning_rate": 1.9847753179610123e-05, "loss": 0.4674, "num_input_tokens_seen": 66385440, "step": 54725 }, { "epoch": 6.095333556075286, "grad_norm": 0.06764615327119827, "learning_rate": 1.9842998119420896e-05, "loss": 0.4651, "num_input_tokens_seen": 66391552, "step": 54730 }, { "epoch": 6.095890410958904, "grad_norm": 0.09124521911144257, "learning_rate": 1.9838243254079723e-05, "loss": 0.4582, "num_input_tokens_seen": 66397504, "step": 54735 }, { "epoch": 6.096447265842522, "grad_norm": 0.08363797515630722, "learning_rate": 1.9833488583766264e-05, "loss": 0.4609, "num_input_tokens_seen": 66403264, "step": 54740 }, { "epoch": 6.0970041207261385, "grad_norm": 0.11834540218114853, "learning_rate": 1.982873410866016e-05, "loss": 0.4594, "num_input_tokens_seen": 66409056, "step": 54745 }, { "epoch": 6.097560975609756, "grad_norm": 0.08279459178447723, "learning_rate": 1.9823979828941057e-05, "loss": 0.4531, "num_input_tokens_seen": 66415232, "step": 54750 }, { "epoch": 6.098117830493374, "grad_norm": 0.0998399406671524, "learning_rate": 1.981922574478858e-05, "loss": 0.4582, "num_input_tokens_seen": 66421696, "step": 54755 }, { "epoch": 6.098674685376991, "grad_norm": 0.10247746109962463, "learning_rate": 1.9814471856382353e-05, "loss": 0.4686, "num_input_tokens_seen": 66427936, "step": 54760 }, { "epoch": 6.099231540260608, "grad_norm": 0.10445541143417358, "learning_rate": 1.9809718163902004e-05, "loss": 0.4664, "num_input_tokens_seen": 66434112, "step": 54765 }, { "epoch": 6.099788395144225, "grad_norm": 0.09094471484422684, "learning_rate": 1.9804964667527136e-05, "loss": 0.4657, "num_input_tokens_seen": 66440032, "step": 54770 }, { "epoch": 6.100345250027843, "grad_norm": 0.11379540711641312, "learning_rate": 1.980021136743735e-05, "loss": 0.454, "num_input_tokens_seen": 66446336, "step": 54775 }, { "epoch": 6.1009021049114605, "grad_norm": 0.11096499115228653, "learning_rate": 1.9795458263812242e-05, "loss": 0.4511, "num_input_tokens_seen": 66452288, "step": 54780 }, { "epoch": 6.101458959795077, "grad_norm": 0.11322037875652313, "learning_rate": 1.97907053568314e-05, "loss": 0.4606, "num_input_tokens_seen": 66458304, "step": 54785 }, { "epoch": 6.102015814678695, "grad_norm": 0.12138038128614426, "learning_rate": 1.97859526466744e-05, "loss": 0.4645, "num_input_tokens_seen": 66464224, "step": 54790 }, { "epoch": 6.102572669562312, "grad_norm": 0.131311297416687, "learning_rate": 1.978120013352082e-05, "loss": 0.4634, "num_input_tokens_seen": 66469856, "step": 54795 }, { "epoch": 6.103129524445929, "grad_norm": 0.10544725507497787, "learning_rate": 1.9776447817550225e-05, "loss": 0.4593, "num_input_tokens_seen": 66476128, "step": 54800 }, { "epoch": 6.103686379329547, "grad_norm": 0.10408322513103485, "learning_rate": 1.977169569894217e-05, "loss": 0.4628, "num_input_tokens_seen": 66482432, "step": 54805 }, { "epoch": 6.104243234213164, "grad_norm": 0.12009290605783463, "learning_rate": 1.9766943777876207e-05, "loss": 0.4591, "num_input_tokens_seen": 66488192, "step": 54810 }, { "epoch": 6.1048000890967815, "grad_norm": 0.10518539696931839, "learning_rate": 1.9762192054531886e-05, "loss": 0.4676, "num_input_tokens_seen": 66494240, "step": 54815 }, { "epoch": 6.105356943980398, "grad_norm": 0.12283482402563095, "learning_rate": 1.9757440529088727e-05, "loss": 0.4677, "num_input_tokens_seen": 66500352, "step": 54820 }, { "epoch": 6.105913798864016, "grad_norm": 0.10165567696094513, "learning_rate": 1.9752689201726265e-05, "loss": 0.4534, "num_input_tokens_seen": 66506528, "step": 54825 }, { "epoch": 6.106470653747634, "grad_norm": 0.09898559749126434, "learning_rate": 1.9747938072624024e-05, "loss": 0.4538, "num_input_tokens_seen": 66512800, "step": 54830 }, { "epoch": 6.10702750863125, "grad_norm": 0.0847489982843399, "learning_rate": 1.974318714196151e-05, "loss": 0.4596, "num_input_tokens_seen": 66519072, "step": 54835 }, { "epoch": 6.107584363514868, "grad_norm": 0.13237744569778442, "learning_rate": 1.9738436409918226e-05, "loss": 0.4665, "num_input_tokens_seen": 66525568, "step": 54840 }, { "epoch": 6.108141218398486, "grad_norm": 0.1334354728460312, "learning_rate": 1.9733685876673686e-05, "loss": 0.4646, "num_input_tokens_seen": 66531456, "step": 54845 }, { "epoch": 6.108698073282103, "grad_norm": 0.10084500163793564, "learning_rate": 1.9728935542407366e-05, "loss": 0.4656, "num_input_tokens_seen": 66537440, "step": 54850 }, { "epoch": 6.10925492816572, "grad_norm": 0.10212648659944534, "learning_rate": 1.9724185407298756e-05, "loss": 0.473, "num_input_tokens_seen": 66543392, "step": 54855 }, { "epoch": 6.109811783049337, "grad_norm": 0.11004249006509781, "learning_rate": 1.9719435471527326e-05, "loss": 0.4622, "num_input_tokens_seen": 66549696, "step": 54860 }, { "epoch": 6.110368637932955, "grad_norm": 0.11866548657417297, "learning_rate": 1.971468573527255e-05, "loss": 0.4604, "num_input_tokens_seen": 66555936, "step": 54865 }, { "epoch": 6.110925492816572, "grad_norm": 0.11982620507478714, "learning_rate": 1.970993619871388e-05, "loss": 0.4635, "num_input_tokens_seen": 66561664, "step": 54870 }, { "epoch": 6.111482347700189, "grad_norm": 0.1009199246764183, "learning_rate": 1.970518686203077e-05, "loss": 0.4747, "num_input_tokens_seen": 66567904, "step": 54875 }, { "epoch": 6.112039202583807, "grad_norm": 0.16686959564685822, "learning_rate": 1.9700437725402675e-05, "loss": 0.4542, "num_input_tokens_seen": 66574304, "step": 54880 }, { "epoch": 6.112596057467424, "grad_norm": 0.13572047650814056, "learning_rate": 1.9695688789009024e-05, "loss": 0.4557, "num_input_tokens_seen": 66580544, "step": 54885 }, { "epoch": 6.113152912351041, "grad_norm": 0.09876050055027008, "learning_rate": 1.969094005302925e-05, "loss": 0.4484, "num_input_tokens_seen": 66586688, "step": 54890 }, { "epoch": 6.113709767234659, "grad_norm": 0.0864500105381012, "learning_rate": 1.9686191517642773e-05, "loss": 0.4715, "num_input_tokens_seen": 66592512, "step": 54895 }, { "epoch": 6.114266622118276, "grad_norm": 0.08469884842634201, "learning_rate": 1.9681443183029012e-05, "loss": 0.4596, "num_input_tokens_seen": 66598528, "step": 54900 }, { "epoch": 6.114823477001893, "grad_norm": 0.09398853033781052, "learning_rate": 1.9676695049367364e-05, "loss": 0.468, "num_input_tokens_seen": 66604768, "step": 54905 }, { "epoch": 6.11538033188551, "grad_norm": 0.13157930970191956, "learning_rate": 1.967194711683724e-05, "loss": 0.4574, "num_input_tokens_seen": 66611200, "step": 54910 }, { "epoch": 6.115937186769128, "grad_norm": 0.11105779558420181, "learning_rate": 1.9667199385618035e-05, "loss": 0.4656, "num_input_tokens_seen": 66617632, "step": 54915 }, { "epoch": 6.116494041652746, "grad_norm": 0.10217326879501343, "learning_rate": 1.9662451855889123e-05, "loss": 0.4475, "num_input_tokens_seen": 66623808, "step": 54920 }, { "epoch": 6.117050896536362, "grad_norm": 0.09438161551952362, "learning_rate": 1.9657704527829883e-05, "loss": 0.4661, "num_input_tokens_seen": 66629856, "step": 54925 }, { "epoch": 6.11760775141998, "grad_norm": 0.08368333429098129, "learning_rate": 1.965295740161969e-05, "loss": 0.4756, "num_input_tokens_seen": 66636032, "step": 54930 }, { "epoch": 6.118164606303598, "grad_norm": 0.07965843379497528, "learning_rate": 1.9648210477437897e-05, "loss": 0.4524, "num_input_tokens_seen": 66642240, "step": 54935 }, { "epoch": 6.1187214611872145, "grad_norm": 0.11967677623033524, "learning_rate": 1.9643463755463866e-05, "loss": 0.4578, "num_input_tokens_seen": 66648160, "step": 54940 }, { "epoch": 6.119278316070832, "grad_norm": 0.09551545232534409, "learning_rate": 1.963871723587694e-05, "loss": 0.4672, "num_input_tokens_seen": 66654336, "step": 54945 }, { "epoch": 6.119835170954449, "grad_norm": 0.09666698426008224, "learning_rate": 1.9633970918856455e-05, "loss": 0.4533, "num_input_tokens_seen": 66660448, "step": 54950 }, { "epoch": 6.120392025838067, "grad_norm": 0.12697453796863556, "learning_rate": 1.962922480458175e-05, "loss": 0.4595, "num_input_tokens_seen": 66666880, "step": 54955 }, { "epoch": 6.120948880721684, "grad_norm": 0.11747848987579346, "learning_rate": 1.962447889323214e-05, "loss": 0.4636, "num_input_tokens_seen": 66673056, "step": 54960 }, { "epoch": 6.121505735605301, "grad_norm": 0.09204434603452682, "learning_rate": 1.9619733184986945e-05, "loss": 0.4581, "num_input_tokens_seen": 66679072, "step": 54965 }, { "epoch": 6.122062590488919, "grad_norm": 0.0708773210644722, "learning_rate": 1.961498768002547e-05, "loss": 0.4628, "num_input_tokens_seen": 66685536, "step": 54970 }, { "epoch": 6.1226194453725356, "grad_norm": 0.1007438600063324, "learning_rate": 1.961024237852702e-05, "loss": 0.4678, "num_input_tokens_seen": 66691456, "step": 54975 }, { "epoch": 6.123176300256153, "grad_norm": 0.105033740401268, "learning_rate": 1.9605497280670885e-05, "loss": 0.4541, "num_input_tokens_seen": 66697760, "step": 54980 }, { "epoch": 6.123733155139771, "grad_norm": 0.09453000128269196, "learning_rate": 1.960075238663635e-05, "loss": 0.4605, "num_input_tokens_seen": 66703520, "step": 54985 }, { "epoch": 6.124290010023388, "grad_norm": 0.09411582350730896, "learning_rate": 1.9596007696602694e-05, "loss": 0.4651, "num_input_tokens_seen": 66709728, "step": 54990 }, { "epoch": 6.124846864907005, "grad_norm": 0.09593755006790161, "learning_rate": 1.9591263210749182e-05, "loss": 0.4667, "num_input_tokens_seen": 66715904, "step": 54995 }, { "epoch": 6.125403719790622, "grad_norm": 0.10238616168498993, "learning_rate": 1.9586518929255078e-05, "loss": 0.452, "num_input_tokens_seen": 66722048, "step": 55000 }, { "epoch": 6.12596057467424, "grad_norm": 0.10016652196645737, "learning_rate": 1.958177485229964e-05, "loss": 0.4526, "num_input_tokens_seen": 66728128, "step": 55005 }, { "epoch": 6.1265174295578575, "grad_norm": 0.09251503646373749, "learning_rate": 1.9577030980062112e-05, "loss": 0.4598, "num_input_tokens_seen": 66734432, "step": 55010 }, { "epoch": 6.127074284441474, "grad_norm": 0.12902501225471497, "learning_rate": 1.9572287312721735e-05, "loss": 0.46, "num_input_tokens_seen": 66740576, "step": 55015 }, { "epoch": 6.127631139325092, "grad_norm": 0.119289331138134, "learning_rate": 1.9567543850457735e-05, "loss": 0.4564, "num_input_tokens_seen": 66746144, "step": 55020 }, { "epoch": 6.12818799420871, "grad_norm": 0.0968349277973175, "learning_rate": 1.9562800593449337e-05, "loss": 0.4676, "num_input_tokens_seen": 66751840, "step": 55025 }, { "epoch": 6.128744849092326, "grad_norm": 0.09975540637969971, "learning_rate": 1.9558057541875756e-05, "loss": 0.456, "num_input_tokens_seen": 66758112, "step": 55030 }, { "epoch": 6.129301703975944, "grad_norm": 0.10901113599538803, "learning_rate": 1.9553314695916198e-05, "loss": 0.469, "num_input_tokens_seen": 66764192, "step": 55035 }, { "epoch": 6.129858558859561, "grad_norm": 0.10265374928712845, "learning_rate": 1.954857205574987e-05, "loss": 0.4796, "num_input_tokens_seen": 66770144, "step": 55040 }, { "epoch": 6.1304154137431786, "grad_norm": 0.10825178027153015, "learning_rate": 1.9543829621555958e-05, "loss": 0.4762, "num_input_tokens_seen": 66776064, "step": 55045 }, { "epoch": 6.130972268626796, "grad_norm": 0.08932726830244064, "learning_rate": 1.953908739351365e-05, "loss": 0.4691, "num_input_tokens_seen": 66782176, "step": 55050 }, { "epoch": 6.131529123510413, "grad_norm": 0.10218747705221176, "learning_rate": 1.953434537180212e-05, "loss": 0.457, "num_input_tokens_seen": 66788448, "step": 55055 }, { "epoch": 6.132085978394031, "grad_norm": 0.09396667033433914, "learning_rate": 1.952960355660054e-05, "loss": 0.4601, "num_input_tokens_seen": 66794528, "step": 55060 }, { "epoch": 6.1326428332776475, "grad_norm": 0.11693771928548813, "learning_rate": 1.9524861948088057e-05, "loss": 0.4589, "num_input_tokens_seen": 66800608, "step": 55065 }, { "epoch": 6.133199688161265, "grad_norm": 0.10875833034515381, "learning_rate": 1.9520120546443846e-05, "loss": 0.4497, "num_input_tokens_seen": 66806656, "step": 55070 }, { "epoch": 6.133756543044883, "grad_norm": 0.10460592806339264, "learning_rate": 1.951537935184704e-05, "loss": 0.4677, "num_input_tokens_seen": 66812192, "step": 55075 }, { "epoch": 6.1343133979285, "grad_norm": 0.09493226557970047, "learning_rate": 1.951063836447678e-05, "loss": 0.4653, "num_input_tokens_seen": 66818400, "step": 55080 }, { "epoch": 6.134870252812117, "grad_norm": 0.09258001297712326, "learning_rate": 1.9505897584512193e-05, "loss": 0.4687, "num_input_tokens_seen": 66823904, "step": 55085 }, { "epoch": 6.135427107695734, "grad_norm": 0.09556756913661957, "learning_rate": 1.95011570121324e-05, "loss": 0.4713, "num_input_tokens_seen": 66830176, "step": 55090 }, { "epoch": 6.135983962579352, "grad_norm": 0.12352074682712555, "learning_rate": 1.9496416647516514e-05, "loss": 0.4596, "num_input_tokens_seen": 66836448, "step": 55095 }, { "epoch": 6.136540817462969, "grad_norm": 0.08667077124118805, "learning_rate": 1.949167649084365e-05, "loss": 0.4546, "num_input_tokens_seen": 66842528, "step": 55100 }, { "epoch": 6.137097672346586, "grad_norm": 0.10998233407735825, "learning_rate": 1.94869365422929e-05, "loss": 0.4772, "num_input_tokens_seen": 66848800, "step": 55105 }, { "epoch": 6.137654527230204, "grad_norm": 0.15780287981033325, "learning_rate": 1.9482196802043355e-05, "loss": 0.4511, "num_input_tokens_seen": 66854912, "step": 55110 }, { "epoch": 6.138211382113822, "grad_norm": 0.11389908939599991, "learning_rate": 1.9477457270274095e-05, "loss": 0.474, "num_input_tokens_seen": 66860864, "step": 55115 }, { "epoch": 6.138768236997438, "grad_norm": 0.09959377348423004, "learning_rate": 1.94727179471642e-05, "loss": 0.4632, "num_input_tokens_seen": 66867136, "step": 55120 }, { "epoch": 6.139325091881056, "grad_norm": 0.11419571191072464, "learning_rate": 1.946797883289273e-05, "loss": 0.4526, "num_input_tokens_seen": 66873664, "step": 55125 }, { "epoch": 6.139881946764673, "grad_norm": 0.1301186978816986, "learning_rate": 1.9463239927638747e-05, "loss": 0.4608, "num_input_tokens_seen": 66879808, "step": 55130 }, { "epoch": 6.1404388016482905, "grad_norm": 0.08329272270202637, "learning_rate": 1.9458501231581304e-05, "loss": 0.4616, "num_input_tokens_seen": 66885984, "step": 55135 }, { "epoch": 6.140995656531908, "grad_norm": 0.10285801440477371, "learning_rate": 1.945376274489945e-05, "loss": 0.4537, "num_input_tokens_seen": 66891648, "step": 55140 }, { "epoch": 6.141552511415525, "grad_norm": 0.1081111878156662, "learning_rate": 1.944902446777221e-05, "loss": 0.4637, "num_input_tokens_seen": 66898080, "step": 55145 }, { "epoch": 6.142109366299143, "grad_norm": 0.09993743896484375, "learning_rate": 1.944428640037861e-05, "loss": 0.4665, "num_input_tokens_seen": 66904256, "step": 55150 }, { "epoch": 6.142666221182759, "grad_norm": 0.09354951232671738, "learning_rate": 1.9439548542897677e-05, "loss": 0.4622, "num_input_tokens_seen": 66909888, "step": 55155 }, { "epoch": 6.143223076066377, "grad_norm": 0.09367568045854568, "learning_rate": 1.943481089550842e-05, "loss": 0.4487, "num_input_tokens_seen": 66916128, "step": 55160 }, { "epoch": 6.143779930949995, "grad_norm": 0.09512346237897873, "learning_rate": 1.9430073458389838e-05, "loss": 0.4522, "num_input_tokens_seen": 66922784, "step": 55165 }, { "epoch": 6.1443367858336115, "grad_norm": 0.07738364487886429, "learning_rate": 1.9425336231720937e-05, "loss": 0.4676, "num_input_tokens_seen": 66929024, "step": 55170 }, { "epoch": 6.144893640717229, "grad_norm": 0.0884159728884697, "learning_rate": 1.9420599215680696e-05, "loss": 0.4592, "num_input_tokens_seen": 66935072, "step": 55175 }, { "epoch": 6.145450495600846, "grad_norm": 0.09590377658605576, "learning_rate": 1.9415862410448095e-05, "loss": 0.458, "num_input_tokens_seen": 66941312, "step": 55180 }, { "epoch": 6.146007350484464, "grad_norm": 0.09859935939311981, "learning_rate": 1.941112581620211e-05, "loss": 0.4581, "num_input_tokens_seen": 66947520, "step": 55185 }, { "epoch": 6.146564205368081, "grad_norm": 0.09912484884262085, "learning_rate": 1.94063894331217e-05, "loss": 0.4494, "num_input_tokens_seen": 66953216, "step": 55190 }, { "epoch": 6.147121060251698, "grad_norm": 0.10036212205886841, "learning_rate": 1.9401653261385815e-05, "loss": 0.4712, "num_input_tokens_seen": 66959520, "step": 55195 }, { "epoch": 6.147677915135316, "grad_norm": 0.08174110949039459, "learning_rate": 1.9396917301173424e-05, "loss": 0.4692, "num_input_tokens_seen": 66965728, "step": 55200 }, { "epoch": 6.1482347700189335, "grad_norm": 0.131956085562706, "learning_rate": 1.9392181552663447e-05, "loss": 0.4595, "num_input_tokens_seen": 66971776, "step": 55205 }, { "epoch": 6.14879162490255, "grad_norm": 0.072147898375988, "learning_rate": 1.938744601603482e-05, "loss": 0.461, "num_input_tokens_seen": 66977792, "step": 55210 }, { "epoch": 6.149348479786168, "grad_norm": 0.1587885171175003, "learning_rate": 1.9382710691466474e-05, "loss": 0.459, "num_input_tokens_seen": 66983264, "step": 55215 }, { "epoch": 6.149905334669785, "grad_norm": 0.11126670986413956, "learning_rate": 1.937797557913732e-05, "loss": 0.4634, "num_input_tokens_seen": 66989440, "step": 55220 }, { "epoch": 6.150462189553402, "grad_norm": 0.11497754603624344, "learning_rate": 1.9373240679226266e-05, "loss": 0.4707, "num_input_tokens_seen": 66995584, "step": 55225 }, { "epoch": 6.15101904443702, "grad_norm": 0.07276361435651779, "learning_rate": 1.936850599191221e-05, "loss": 0.4591, "num_input_tokens_seen": 67001536, "step": 55230 }, { "epoch": 6.151575899320637, "grad_norm": 0.14324644207954407, "learning_rate": 1.936377151737404e-05, "loss": 0.4597, "num_input_tokens_seen": 67007936, "step": 55235 }, { "epoch": 6.1521327542042545, "grad_norm": 0.15761923789978027, "learning_rate": 1.9359037255790648e-05, "loss": 0.4555, "num_input_tokens_seen": 67014560, "step": 55240 }, { "epoch": 6.152689609087871, "grad_norm": 0.09782654047012329, "learning_rate": 1.9354303207340903e-05, "loss": 0.4507, "num_input_tokens_seen": 67020768, "step": 55245 }, { "epoch": 6.153246463971489, "grad_norm": 0.13365161418914795, "learning_rate": 1.9349569372203675e-05, "loss": 0.4667, "num_input_tokens_seen": 67026784, "step": 55250 }, { "epoch": 6.153803318855107, "grad_norm": 0.10804157704114914, "learning_rate": 1.9344835750557826e-05, "loss": 0.4542, "num_input_tokens_seen": 67032992, "step": 55255 }, { "epoch": 6.1543601737387235, "grad_norm": 0.09303340315818787, "learning_rate": 1.9340102342582205e-05, "loss": 0.4605, "num_input_tokens_seen": 67039072, "step": 55260 }, { "epoch": 6.154917028622341, "grad_norm": 0.08453787863254547, "learning_rate": 1.9335369148455656e-05, "loss": 0.4616, "num_input_tokens_seen": 67045504, "step": 55265 }, { "epoch": 6.155473883505958, "grad_norm": 0.08540071547031403, "learning_rate": 1.9330636168357013e-05, "loss": 0.452, "num_input_tokens_seen": 67051360, "step": 55270 }, { "epoch": 6.156030738389576, "grad_norm": 0.1348802000284195, "learning_rate": 1.93259034024651e-05, "loss": 0.4666, "num_input_tokens_seen": 67057888, "step": 55275 }, { "epoch": 6.156587593273193, "grad_norm": 0.09718906879425049, "learning_rate": 1.9321170850958744e-05, "loss": 0.4666, "num_input_tokens_seen": 67064000, "step": 55280 }, { "epoch": 6.15714444815681, "grad_norm": 0.11090067774057388, "learning_rate": 1.931643851401675e-05, "loss": 0.4533, "num_input_tokens_seen": 67069760, "step": 55285 }, { "epoch": 6.157701303040428, "grad_norm": 0.07533518970012665, "learning_rate": 1.9311706391817926e-05, "loss": 0.4629, "num_input_tokens_seen": 67075264, "step": 55290 }, { "epoch": 6.158258157924045, "grad_norm": 0.10508905351161957, "learning_rate": 1.9306974484541063e-05, "loss": 0.4482, "num_input_tokens_seen": 67081280, "step": 55295 }, { "epoch": 6.158815012807662, "grad_norm": 0.1220129206776619, "learning_rate": 1.930224279236495e-05, "loss": 0.4635, "num_input_tokens_seen": 67087168, "step": 55300 }, { "epoch": 6.15937186769128, "grad_norm": 0.11157459765672684, "learning_rate": 1.929751131546836e-05, "loss": 0.4613, "num_input_tokens_seen": 67093696, "step": 55305 }, { "epoch": 6.159928722574897, "grad_norm": 0.09453689306974411, "learning_rate": 1.9292780054030065e-05, "loss": 0.4637, "num_input_tokens_seen": 67099680, "step": 55310 }, { "epoch": 6.160485577458514, "grad_norm": 0.11703285574913025, "learning_rate": 1.928804900822883e-05, "loss": 0.4545, "num_input_tokens_seen": 67105824, "step": 55315 }, { "epoch": 6.161042432342132, "grad_norm": 0.1531706154346466, "learning_rate": 1.928331817824341e-05, "loss": 0.4686, "num_input_tokens_seen": 67111648, "step": 55320 }, { "epoch": 6.161599287225749, "grad_norm": 0.07225966453552246, "learning_rate": 1.9278587564252553e-05, "loss": 0.4711, "num_input_tokens_seen": 67117792, "step": 55325 }, { "epoch": 6.1621561421093665, "grad_norm": 0.1472899168729782, "learning_rate": 1.927385716643499e-05, "loss": 0.4702, "num_input_tokens_seen": 67123552, "step": 55330 }, { "epoch": 6.162712996992983, "grad_norm": 0.11637471616268158, "learning_rate": 1.9269126984969453e-05, "loss": 0.4622, "num_input_tokens_seen": 67129664, "step": 55335 }, { "epoch": 6.163269851876601, "grad_norm": 0.0937499925494194, "learning_rate": 1.926439702003466e-05, "loss": 0.4613, "num_input_tokens_seen": 67135872, "step": 55340 }, { "epoch": 6.163826706760219, "grad_norm": 0.10056589543819427, "learning_rate": 1.9259667271809333e-05, "loss": 0.4629, "num_input_tokens_seen": 67142304, "step": 55345 }, { "epoch": 6.164383561643835, "grad_norm": 0.06416645646095276, "learning_rate": 1.9254937740472173e-05, "loss": 0.4681, "num_input_tokens_seen": 67148352, "step": 55350 }, { "epoch": 6.164940416527453, "grad_norm": 0.08432382345199585, "learning_rate": 1.9250208426201876e-05, "loss": 0.4566, "num_input_tokens_seen": 67154720, "step": 55355 }, { "epoch": 6.16549727141107, "grad_norm": 0.1617819368839264, "learning_rate": 1.924547932917713e-05, "loss": 0.4616, "num_input_tokens_seen": 67160928, "step": 55360 }, { "epoch": 6.1660541262946875, "grad_norm": 0.07892018556594849, "learning_rate": 1.924075044957662e-05, "loss": 0.4495, "num_input_tokens_seen": 67167392, "step": 55365 }, { "epoch": 6.166610981178305, "grad_norm": 0.08142056316137314, "learning_rate": 1.9236021787579008e-05, "loss": 0.463, "num_input_tokens_seen": 67173408, "step": 55370 }, { "epoch": 6.167167836061922, "grad_norm": 0.09248557686805725, "learning_rate": 1.923129334336296e-05, "loss": 0.4649, "num_input_tokens_seen": 67179584, "step": 55375 }, { "epoch": 6.16772469094554, "grad_norm": 0.11826593428850174, "learning_rate": 1.9226565117107144e-05, "loss": 0.4745, "num_input_tokens_seen": 67185952, "step": 55380 }, { "epoch": 6.168281545829157, "grad_norm": 0.08950507640838623, "learning_rate": 1.92218371089902e-05, "loss": 0.4683, "num_input_tokens_seen": 67192192, "step": 55385 }, { "epoch": 6.168838400712774, "grad_norm": 0.12558378279209137, "learning_rate": 1.9217109319190767e-05, "loss": 0.4648, "num_input_tokens_seen": 67198144, "step": 55390 }, { "epoch": 6.169395255596392, "grad_norm": 0.07552290707826614, "learning_rate": 1.9212381747887477e-05, "loss": 0.4544, "num_input_tokens_seen": 67204320, "step": 55395 }, { "epoch": 6.169952110480009, "grad_norm": 0.1624889373779297, "learning_rate": 1.920765439525895e-05, "loss": 0.4702, "num_input_tokens_seen": 67210432, "step": 55400 }, { "epoch": 6.170508965363626, "grad_norm": 0.1555858701467514, "learning_rate": 1.9202927261483797e-05, "loss": 0.459, "num_input_tokens_seen": 67216512, "step": 55405 }, { "epoch": 6.171065820247244, "grad_norm": 0.10296366363763809, "learning_rate": 1.9198200346740637e-05, "loss": 0.4791, "num_input_tokens_seen": 67222912, "step": 55410 }, { "epoch": 6.171622675130861, "grad_norm": 0.10845568776130676, "learning_rate": 1.919347365120806e-05, "loss": 0.4601, "num_input_tokens_seen": 67229312, "step": 55415 }, { "epoch": 6.172179530014478, "grad_norm": 0.09769632667303085, "learning_rate": 1.9188747175064654e-05, "loss": 0.4494, "num_input_tokens_seen": 67235552, "step": 55420 }, { "epoch": 6.172736384898095, "grad_norm": 0.09568330645561218, "learning_rate": 1.9184020918489005e-05, "loss": 0.4572, "num_input_tokens_seen": 67241216, "step": 55425 }, { "epoch": 6.173293239781713, "grad_norm": 0.09860822558403015, "learning_rate": 1.9179294881659683e-05, "loss": 0.4593, "num_input_tokens_seen": 67247232, "step": 55430 }, { "epoch": 6.1738500946653305, "grad_norm": 0.11495548486709595, "learning_rate": 1.9174569064755248e-05, "loss": 0.4593, "num_input_tokens_seen": 67253568, "step": 55435 }, { "epoch": 6.174406949548947, "grad_norm": 0.10470375418663025, "learning_rate": 1.9169843467954267e-05, "loss": 0.4599, "num_input_tokens_seen": 67259584, "step": 55440 }, { "epoch": 6.174963804432565, "grad_norm": 0.11120767891407013, "learning_rate": 1.9165118091435285e-05, "loss": 0.4715, "num_input_tokens_seen": 67265504, "step": 55445 }, { "epoch": 6.175520659316183, "grad_norm": 0.11586465686559677, "learning_rate": 1.9160392935376842e-05, "loss": 0.4711, "num_input_tokens_seen": 67270912, "step": 55450 }, { "epoch": 6.1760775141997994, "grad_norm": 0.11889156699180603, "learning_rate": 1.9155667999957465e-05, "loss": 0.4646, "num_input_tokens_seen": 67277088, "step": 55455 }, { "epoch": 6.176634369083417, "grad_norm": 0.12941420078277588, "learning_rate": 1.9150943285355684e-05, "loss": 0.4619, "num_input_tokens_seen": 67283424, "step": 55460 }, { "epoch": 6.177191223967034, "grad_norm": 0.1074230819940567, "learning_rate": 1.9146218791750003e-05, "loss": 0.4641, "num_input_tokens_seen": 67289568, "step": 55465 }, { "epoch": 6.177748078850652, "grad_norm": 0.11151840537786484, "learning_rate": 1.9141494519318936e-05, "loss": 0.4463, "num_input_tokens_seen": 67295488, "step": 55470 }, { "epoch": 6.178304933734269, "grad_norm": 0.08144119381904602, "learning_rate": 1.9136770468240983e-05, "loss": 0.4568, "num_input_tokens_seen": 67301696, "step": 55475 }, { "epoch": 6.178861788617886, "grad_norm": 0.13513480126857758, "learning_rate": 1.913204663869463e-05, "loss": 0.4679, "num_input_tokens_seen": 67307968, "step": 55480 }, { "epoch": 6.179418643501504, "grad_norm": 0.1024501621723175, "learning_rate": 1.9127323030858364e-05, "loss": 0.4678, "num_input_tokens_seen": 67314400, "step": 55485 }, { "epoch": 6.1799754983851205, "grad_norm": 0.13281969726085663, "learning_rate": 1.912259964491065e-05, "loss": 0.4634, "num_input_tokens_seen": 67320096, "step": 55490 }, { "epoch": 6.180532353268738, "grad_norm": 0.09394743293523788, "learning_rate": 1.9117876481029963e-05, "loss": 0.4524, "num_input_tokens_seen": 67326112, "step": 55495 }, { "epoch": 6.181089208152356, "grad_norm": 0.08817145973443985, "learning_rate": 1.911315353939474e-05, "loss": 0.4477, "num_input_tokens_seen": 67332416, "step": 55500 }, { "epoch": 6.181646063035973, "grad_norm": 0.09110114723443985, "learning_rate": 1.910843082018345e-05, "loss": 0.4492, "num_input_tokens_seen": 67338240, "step": 55505 }, { "epoch": 6.18220291791959, "grad_norm": 0.09811554849147797, "learning_rate": 1.9103708323574524e-05, "loss": 0.4617, "num_input_tokens_seen": 67344544, "step": 55510 }, { "epoch": 6.182759772803207, "grad_norm": 0.1292872279882431, "learning_rate": 1.9098986049746394e-05, "loss": 0.4592, "num_input_tokens_seen": 67350912, "step": 55515 }, { "epoch": 6.183316627686825, "grad_norm": 0.14183533191680908, "learning_rate": 1.909426399887748e-05, "loss": 0.458, "num_input_tokens_seen": 67357248, "step": 55520 }, { "epoch": 6.1838734825704424, "grad_norm": 0.13640519976615906, "learning_rate": 1.9089542171146198e-05, "loss": 0.4535, "num_input_tokens_seen": 67362624, "step": 55525 }, { "epoch": 6.184430337454059, "grad_norm": 0.09705542027950287, "learning_rate": 1.9084820566730953e-05, "loss": 0.4629, "num_input_tokens_seen": 67368640, "step": 55530 }, { "epoch": 6.184987192337677, "grad_norm": 0.1020490825176239, "learning_rate": 1.908009918581014e-05, "loss": 0.4612, "num_input_tokens_seen": 67374784, "step": 55535 }, { "epoch": 6.185544047221294, "grad_norm": 0.07678227871656418, "learning_rate": 1.9075378028562154e-05, "loss": 0.4617, "num_input_tokens_seen": 67380608, "step": 55540 }, { "epoch": 6.186100902104911, "grad_norm": 0.08753281831741333, "learning_rate": 1.9070657095165374e-05, "loss": 0.4449, "num_input_tokens_seen": 67386624, "step": 55545 }, { "epoch": 6.186657756988529, "grad_norm": 0.10933330655097961, "learning_rate": 1.9065936385798173e-05, "loss": 0.4624, "num_input_tokens_seen": 67392544, "step": 55550 }, { "epoch": 6.187214611872146, "grad_norm": 0.0855056494474411, "learning_rate": 1.9061215900638905e-05, "loss": 0.4351, "num_input_tokens_seen": 67397824, "step": 55555 }, { "epoch": 6.1877714667557635, "grad_norm": 0.12479773908853531, "learning_rate": 1.905649563986594e-05, "loss": 0.4688, "num_input_tokens_seen": 67403840, "step": 55560 }, { "epoch": 6.188328321639381, "grad_norm": 0.13983148336410522, "learning_rate": 1.9051775603657608e-05, "loss": 0.4588, "num_input_tokens_seen": 67409728, "step": 55565 }, { "epoch": 6.188885176522998, "grad_norm": 0.10431142151355743, "learning_rate": 1.904705579219226e-05, "loss": 0.4578, "num_input_tokens_seen": 67415200, "step": 55570 }, { "epoch": 6.189442031406616, "grad_norm": 0.10841041058301926, "learning_rate": 1.904233620564822e-05, "loss": 0.4617, "num_input_tokens_seen": 67421184, "step": 55575 }, { "epoch": 6.189998886290232, "grad_norm": 0.11834745854139328, "learning_rate": 1.9037616844203816e-05, "loss": 0.4683, "num_input_tokens_seen": 67427328, "step": 55580 }, { "epoch": 6.19055574117385, "grad_norm": 0.12045519053936005, "learning_rate": 1.9032897708037348e-05, "loss": 0.4603, "num_input_tokens_seen": 67433216, "step": 55585 }, { "epoch": 6.191112596057468, "grad_norm": 0.1077229455113411, "learning_rate": 1.902817879732713e-05, "loss": 0.4702, "num_input_tokens_seen": 67439296, "step": 55590 }, { "epoch": 6.191669450941085, "grad_norm": 0.10166741162538528, "learning_rate": 1.9023460112251458e-05, "loss": 0.4655, "num_input_tokens_seen": 67445408, "step": 55595 }, { "epoch": 6.192226305824702, "grad_norm": 0.14539532363414764, "learning_rate": 1.9018741652988606e-05, "loss": 0.4466, "num_input_tokens_seen": 67451680, "step": 55600 }, { "epoch": 6.192783160708319, "grad_norm": 0.10275722295045853, "learning_rate": 1.901402341971687e-05, "loss": 0.4512, "num_input_tokens_seen": 67456928, "step": 55605 }, { "epoch": 6.193340015591937, "grad_norm": 0.10947973281145096, "learning_rate": 1.900930541261451e-05, "loss": 0.4695, "num_input_tokens_seen": 67463072, "step": 55610 }, { "epoch": 6.193896870475554, "grad_norm": 0.12502342462539673, "learning_rate": 1.900458763185979e-05, "loss": 0.4629, "num_input_tokens_seen": 67469248, "step": 55615 }, { "epoch": 6.194453725359171, "grad_norm": 0.08014834672212601, "learning_rate": 1.8999870077630965e-05, "loss": 0.4572, "num_input_tokens_seen": 67474848, "step": 55620 }, { "epoch": 6.195010580242789, "grad_norm": 0.08359362185001373, "learning_rate": 1.8995152750106275e-05, "loss": 0.453, "num_input_tokens_seen": 67481152, "step": 55625 }, { "epoch": 6.1955674351264065, "grad_norm": 0.09653404355049133, "learning_rate": 1.899043564946395e-05, "loss": 0.4734, "num_input_tokens_seen": 67486848, "step": 55630 }, { "epoch": 6.196124290010023, "grad_norm": 0.1350659728050232, "learning_rate": 1.8985718775882247e-05, "loss": 0.4631, "num_input_tokens_seen": 67492576, "step": 55635 }, { "epoch": 6.196681144893641, "grad_norm": 0.14507588744163513, "learning_rate": 1.898100212953935e-05, "loss": 0.4705, "num_input_tokens_seen": 67498560, "step": 55640 }, { "epoch": 6.197237999777258, "grad_norm": 0.12291301041841507, "learning_rate": 1.897628571061348e-05, "loss": 0.4441, "num_input_tokens_seen": 67504192, "step": 55645 }, { "epoch": 6.197794854660875, "grad_norm": 0.14910593628883362, "learning_rate": 1.897156951928283e-05, "loss": 0.4714, "num_input_tokens_seen": 67510016, "step": 55650 }, { "epoch": 6.198351709544493, "grad_norm": 0.08954106271266937, "learning_rate": 1.8966853555725612e-05, "loss": 0.4686, "num_input_tokens_seen": 67515968, "step": 55655 }, { "epoch": 6.19890856442811, "grad_norm": 0.10397986322641373, "learning_rate": 1.896213782012e-05, "loss": 0.4504, "num_input_tokens_seen": 67521920, "step": 55660 }, { "epoch": 6.199465419311728, "grad_norm": 0.09438445419073105, "learning_rate": 1.8957422312644173e-05, "loss": 0.4681, "num_input_tokens_seen": 67527808, "step": 55665 }, { "epoch": 6.200022274195344, "grad_norm": 0.10956920683383942, "learning_rate": 1.8952707033476295e-05, "loss": 0.4544, "num_input_tokens_seen": 67533568, "step": 55670 }, { "epoch": 6.200579129078962, "grad_norm": 0.09825671464204788, "learning_rate": 1.894799198279452e-05, "loss": 0.469, "num_input_tokens_seen": 67539680, "step": 55675 }, { "epoch": 6.20113598396258, "grad_norm": 0.1364920735359192, "learning_rate": 1.8943277160777e-05, "loss": 0.4595, "num_input_tokens_seen": 67545824, "step": 55680 }, { "epoch": 6.2016928388461965, "grad_norm": 0.117408886551857, "learning_rate": 1.8938562567601885e-05, "loss": 0.4645, "num_input_tokens_seen": 67551936, "step": 55685 }, { "epoch": 6.202249693729814, "grad_norm": 0.11849214881658554, "learning_rate": 1.89338482034473e-05, "loss": 0.4704, "num_input_tokens_seen": 67557984, "step": 55690 }, { "epoch": 6.202806548613431, "grad_norm": 0.1122259721159935, "learning_rate": 1.8929134068491373e-05, "loss": 0.4692, "num_input_tokens_seen": 67563200, "step": 55695 }, { "epoch": 6.203363403497049, "grad_norm": 0.16837731003761292, "learning_rate": 1.892442016291221e-05, "loss": 0.4629, "num_input_tokens_seen": 67569312, "step": 55700 }, { "epoch": 6.203920258380666, "grad_norm": 0.11990135908126831, "learning_rate": 1.891970648688793e-05, "loss": 0.4457, "num_input_tokens_seen": 67575552, "step": 55705 }, { "epoch": 6.204477113264283, "grad_norm": 0.0973442941904068, "learning_rate": 1.8914993040596614e-05, "loss": 0.4667, "num_input_tokens_seen": 67581504, "step": 55710 }, { "epoch": 6.205033968147901, "grad_norm": 0.07531221210956573, "learning_rate": 1.891027982421637e-05, "loss": 0.4663, "num_input_tokens_seen": 67587712, "step": 55715 }, { "epoch": 6.2055908230315175, "grad_norm": 0.09817104786634445, "learning_rate": 1.8905566837925264e-05, "loss": 0.4717, "num_input_tokens_seen": 67593984, "step": 55720 }, { "epoch": 6.206147677915135, "grad_norm": 0.12787331640720367, "learning_rate": 1.8900854081901382e-05, "loss": 0.4639, "num_input_tokens_seen": 67600224, "step": 55725 }, { "epoch": 6.206704532798753, "grad_norm": 0.09665123373270035, "learning_rate": 1.8896141556322772e-05, "loss": 0.4654, "num_input_tokens_seen": 67606176, "step": 55730 }, { "epoch": 6.20726138768237, "grad_norm": 0.09100527316331863, "learning_rate": 1.8891429261367496e-05, "loss": 0.4515, "num_input_tokens_seen": 67612544, "step": 55735 }, { "epoch": 6.207818242565987, "grad_norm": 0.10419642925262451, "learning_rate": 1.88867171972136e-05, "loss": 0.4642, "num_input_tokens_seen": 67618848, "step": 55740 }, { "epoch": 6.208375097449605, "grad_norm": 0.11993914842605591, "learning_rate": 1.888200536403911e-05, "loss": 0.4516, "num_input_tokens_seen": 67624896, "step": 55745 }, { "epoch": 6.208931952333222, "grad_norm": 0.08392493426799774, "learning_rate": 1.887729376202207e-05, "loss": 0.4568, "num_input_tokens_seen": 67630912, "step": 55750 }, { "epoch": 6.2094888072168395, "grad_norm": 0.08836507797241211, "learning_rate": 1.8872582391340498e-05, "loss": 0.4734, "num_input_tokens_seen": 67636832, "step": 55755 }, { "epoch": 6.210045662100456, "grad_norm": 0.12072330713272095, "learning_rate": 1.8867871252172395e-05, "loss": 0.4616, "num_input_tokens_seen": 67643072, "step": 55760 }, { "epoch": 6.210602516984074, "grad_norm": 0.07945048809051514, "learning_rate": 1.886316034469577e-05, "loss": 0.4761, "num_input_tokens_seen": 67648960, "step": 55765 }, { "epoch": 6.211159371867692, "grad_norm": 0.09131676703691483, "learning_rate": 1.885844966908861e-05, "loss": 0.465, "num_input_tokens_seen": 67654752, "step": 55770 }, { "epoch": 6.211716226751308, "grad_norm": 0.12432585656642914, "learning_rate": 1.88537392255289e-05, "loss": 0.467, "num_input_tokens_seen": 67660864, "step": 55775 }, { "epoch": 6.212273081634926, "grad_norm": 0.10025250166654587, "learning_rate": 1.8849029014194623e-05, "loss": 0.4564, "num_input_tokens_seen": 67667040, "step": 55780 }, { "epoch": 6.212829936518543, "grad_norm": 0.10031788796186447, "learning_rate": 1.8844319035263744e-05, "loss": 0.4732, "num_input_tokens_seen": 67673376, "step": 55785 }, { "epoch": 6.2133867914021605, "grad_norm": 0.07919469475746155, "learning_rate": 1.8839609288914217e-05, "loss": 0.4551, "num_input_tokens_seen": 67679552, "step": 55790 }, { "epoch": 6.213943646285778, "grad_norm": 0.09725548326969147, "learning_rate": 1.883489977532399e-05, "loss": 0.4564, "num_input_tokens_seen": 67685664, "step": 55795 }, { "epoch": 6.214500501169395, "grad_norm": 0.10303891450166702, "learning_rate": 1.8830190494671013e-05, "loss": 0.4788, "num_input_tokens_seen": 67691744, "step": 55800 }, { "epoch": 6.215057356053013, "grad_norm": 0.11162935942411423, "learning_rate": 1.8825481447133205e-05, "loss": 0.4693, "num_input_tokens_seen": 67697568, "step": 55805 }, { "epoch": 6.21561421093663, "grad_norm": 0.07728296518325806, "learning_rate": 1.8820772632888494e-05, "loss": 0.4631, "num_input_tokens_seen": 67703840, "step": 55810 }, { "epoch": 6.216171065820247, "grad_norm": 0.09251092374324799, "learning_rate": 1.8816064052114802e-05, "loss": 0.4675, "num_input_tokens_seen": 67709728, "step": 55815 }, { "epoch": 6.216727920703865, "grad_norm": 0.10476240515708923, "learning_rate": 1.8811355704990023e-05, "loss": 0.4679, "num_input_tokens_seen": 67716224, "step": 55820 }, { "epoch": 6.217284775587482, "grad_norm": 0.10248862951993942, "learning_rate": 1.880664759169206e-05, "loss": 0.4572, "num_input_tokens_seen": 67722528, "step": 55825 }, { "epoch": 6.217841630471099, "grad_norm": 0.09258554875850677, "learning_rate": 1.88019397123988e-05, "loss": 0.4666, "num_input_tokens_seen": 67728800, "step": 55830 }, { "epoch": 6.218398485354717, "grad_norm": 0.09718095511198044, "learning_rate": 1.879723206728812e-05, "loss": 0.465, "num_input_tokens_seen": 67734976, "step": 55835 }, { "epoch": 6.218955340238334, "grad_norm": 0.11191633343696594, "learning_rate": 1.879252465653788e-05, "loss": 0.4587, "num_input_tokens_seen": 67740960, "step": 55840 }, { "epoch": 6.219512195121951, "grad_norm": 0.11994507163763046, "learning_rate": 1.878781748032596e-05, "loss": 0.4629, "num_input_tokens_seen": 67747296, "step": 55845 }, { "epoch": 6.220069050005568, "grad_norm": 0.10137064754962921, "learning_rate": 1.8783110538830205e-05, "loss": 0.4642, "num_input_tokens_seen": 67753632, "step": 55850 }, { "epoch": 6.220625904889186, "grad_norm": 0.11952143162488937, "learning_rate": 1.8778403832228452e-05, "loss": 0.4495, "num_input_tokens_seen": 67759040, "step": 55855 }, { "epoch": 6.2211827597728035, "grad_norm": 0.11674661189317703, "learning_rate": 1.8773697360698543e-05, "loss": 0.4639, "num_input_tokens_seen": 67764160, "step": 55860 }, { "epoch": 6.22173961465642, "grad_norm": 0.07149647176265717, "learning_rate": 1.8768991124418303e-05, "loss": 0.4638, "num_input_tokens_seen": 67769536, "step": 55865 }, { "epoch": 6.222296469540038, "grad_norm": 0.13378600776195526, "learning_rate": 1.8764285123565544e-05, "loss": 0.4528, "num_input_tokens_seen": 67775232, "step": 55870 }, { "epoch": 6.222853324423655, "grad_norm": 0.07840430736541748, "learning_rate": 1.8759579358318076e-05, "loss": 0.4602, "num_input_tokens_seen": 67781504, "step": 55875 }, { "epoch": 6.2234101793072725, "grad_norm": 0.12003042548894882, "learning_rate": 1.8754873828853698e-05, "loss": 0.4686, "num_input_tokens_seen": 67787264, "step": 55880 }, { "epoch": 6.22396703419089, "grad_norm": 0.08951357752084732, "learning_rate": 1.87501685353502e-05, "loss": 0.4608, "num_input_tokens_seen": 67793536, "step": 55885 }, { "epoch": 6.224523889074507, "grad_norm": 0.10292451828718185, "learning_rate": 1.874546347798537e-05, "loss": 0.4644, "num_input_tokens_seen": 67799872, "step": 55890 }, { "epoch": 6.225080743958125, "grad_norm": 0.06766969710588455, "learning_rate": 1.8740758656936965e-05, "loss": 0.4647, "num_input_tokens_seen": 67806080, "step": 55895 }, { "epoch": 6.225637598841741, "grad_norm": 0.07483939081430435, "learning_rate": 1.8736054072382763e-05, "loss": 0.4669, "num_input_tokens_seen": 67812096, "step": 55900 }, { "epoch": 6.226194453725359, "grad_norm": 0.11388690769672394, "learning_rate": 1.8731349724500503e-05, "loss": 0.461, "num_input_tokens_seen": 67818464, "step": 55905 }, { "epoch": 6.226751308608977, "grad_norm": 0.08228320628404617, "learning_rate": 1.8726645613467946e-05, "loss": 0.4704, "num_input_tokens_seen": 67824448, "step": 55910 }, { "epoch": 6.2273081634925935, "grad_norm": 0.0845257043838501, "learning_rate": 1.8721941739462822e-05, "loss": 0.4704, "num_input_tokens_seen": 67830848, "step": 55915 }, { "epoch": 6.227865018376211, "grad_norm": 0.09762448072433472, "learning_rate": 1.8717238102662858e-05, "loss": 0.45, "num_input_tokens_seen": 67837152, "step": 55920 }, { "epoch": 6.228421873259829, "grad_norm": 0.11003140360116959, "learning_rate": 1.871253470324577e-05, "loss": 0.4706, "num_input_tokens_seen": 67843328, "step": 55925 }, { "epoch": 6.228978728143446, "grad_norm": 0.10362811386585236, "learning_rate": 1.8707831541389272e-05, "loss": 0.4682, "num_input_tokens_seen": 67849408, "step": 55930 }, { "epoch": 6.229535583027063, "grad_norm": 0.09079403430223465, "learning_rate": 1.870312861727106e-05, "loss": 0.4635, "num_input_tokens_seen": 67855264, "step": 55935 }, { "epoch": 6.23009243791068, "grad_norm": 0.08410753309726715, "learning_rate": 1.869842593106883e-05, "loss": 0.4714, "num_input_tokens_seen": 67861344, "step": 55940 }, { "epoch": 6.230649292794298, "grad_norm": 0.10589247941970825, "learning_rate": 1.8693723482960263e-05, "loss": 0.4506, "num_input_tokens_seen": 67867424, "step": 55945 }, { "epoch": 6.2312061476779155, "grad_norm": 0.08672132343053818, "learning_rate": 1.868902127312303e-05, "loss": 0.4613, "num_input_tokens_seen": 67872960, "step": 55950 }, { "epoch": 6.231763002561532, "grad_norm": 0.108361154794693, "learning_rate": 1.8684319301734805e-05, "loss": 0.4577, "num_input_tokens_seen": 67879296, "step": 55955 }, { "epoch": 6.23231985744515, "grad_norm": 0.08740949630737305, "learning_rate": 1.867961756897323e-05, "loss": 0.4617, "num_input_tokens_seen": 67885504, "step": 55960 }, { "epoch": 6.232876712328767, "grad_norm": 0.08521483838558197, "learning_rate": 1.8674916075015962e-05, "loss": 0.4698, "num_input_tokens_seen": 67891680, "step": 55965 }, { "epoch": 6.233433567212384, "grad_norm": 0.09478016942739487, "learning_rate": 1.8670214820040627e-05, "loss": 0.464, "num_input_tokens_seen": 67898048, "step": 55970 }, { "epoch": 6.233990422096002, "grad_norm": 0.109669990837574, "learning_rate": 1.8665513804224866e-05, "loss": 0.4558, "num_input_tokens_seen": 67904096, "step": 55975 }, { "epoch": 6.234547276979619, "grad_norm": 0.07758037745952606, "learning_rate": 1.8660813027746294e-05, "loss": 0.4602, "num_input_tokens_seen": 67910176, "step": 55980 }, { "epoch": 6.2351041318632365, "grad_norm": 0.120169498026371, "learning_rate": 1.8656112490782516e-05, "loss": 0.4571, "num_input_tokens_seen": 67916352, "step": 55985 }, { "epoch": 6.235660986746854, "grad_norm": 0.1450619101524353, "learning_rate": 1.8651412193511143e-05, "loss": 0.461, "num_input_tokens_seen": 67922240, "step": 55990 }, { "epoch": 6.236217841630471, "grad_norm": 0.10508828610181808, "learning_rate": 1.8646712136109763e-05, "loss": 0.4534, "num_input_tokens_seen": 67928704, "step": 55995 }, { "epoch": 6.236774696514089, "grad_norm": 0.12152882665395737, "learning_rate": 1.8642012318755952e-05, "loss": 0.4635, "num_input_tokens_seen": 67935072, "step": 56000 }, { "epoch": 6.2373315513977055, "grad_norm": 0.12788501381874084, "learning_rate": 1.8637312741627298e-05, "loss": 0.4544, "num_input_tokens_seen": 67941056, "step": 56005 }, { "epoch": 6.237888406281323, "grad_norm": 0.0825016051530838, "learning_rate": 1.8632613404901357e-05, "loss": 0.4599, "num_input_tokens_seen": 67946656, "step": 56010 }, { "epoch": 6.238445261164941, "grad_norm": 0.12629379332065582, "learning_rate": 1.8627914308755683e-05, "loss": 0.4671, "num_input_tokens_seen": 67952736, "step": 56015 }, { "epoch": 6.239002116048558, "grad_norm": 0.06734352558851242, "learning_rate": 1.862321545336783e-05, "loss": 0.4696, "num_input_tokens_seen": 67958624, "step": 56020 }, { "epoch": 6.239558970932175, "grad_norm": 0.10700828582048416, "learning_rate": 1.8618516838915338e-05, "loss": 0.4603, "num_input_tokens_seen": 67964704, "step": 56025 }, { "epoch": 6.240115825815792, "grad_norm": 0.0844104066491127, "learning_rate": 1.8613818465575726e-05, "loss": 0.4515, "num_input_tokens_seen": 67971168, "step": 56030 }, { "epoch": 6.24067268069941, "grad_norm": 0.0959751307964325, "learning_rate": 1.860912033352651e-05, "loss": 0.4554, "num_input_tokens_seen": 67977120, "step": 56035 }, { "epoch": 6.241229535583027, "grad_norm": 0.11346042156219482, "learning_rate": 1.8604422442945215e-05, "loss": 0.4648, "num_input_tokens_seen": 67983328, "step": 56040 }, { "epoch": 6.241786390466644, "grad_norm": 0.10311321914196014, "learning_rate": 1.859972479400935e-05, "loss": 0.4565, "num_input_tokens_seen": 67989600, "step": 56045 }, { "epoch": 6.242343245350262, "grad_norm": 0.11363525688648224, "learning_rate": 1.8595027386896374e-05, "loss": 0.4612, "num_input_tokens_seen": 67995328, "step": 56050 }, { "epoch": 6.242900100233879, "grad_norm": 0.1264614313840866, "learning_rate": 1.8590330221783793e-05, "loss": 0.4644, "num_input_tokens_seen": 68001376, "step": 56055 }, { "epoch": 6.243456955117496, "grad_norm": 0.11992333084344864, "learning_rate": 1.8585633298849078e-05, "loss": 0.4579, "num_input_tokens_seen": 68007744, "step": 56060 }, { "epoch": 6.244013810001114, "grad_norm": 0.15667571127414703, "learning_rate": 1.8580936618269696e-05, "loss": 0.4756, "num_input_tokens_seen": 68013728, "step": 56065 }, { "epoch": 6.244570664884731, "grad_norm": 0.10733641684055328, "learning_rate": 1.8576240180223095e-05, "loss": 0.4641, "num_input_tokens_seen": 68019776, "step": 56070 }, { "epoch": 6.2451275197683485, "grad_norm": 0.08336629718542099, "learning_rate": 1.857154398488673e-05, "loss": 0.4628, "num_input_tokens_seen": 68026144, "step": 56075 }, { "epoch": 6.245684374651965, "grad_norm": 0.11060864478349686, "learning_rate": 1.856684803243803e-05, "loss": 0.4553, "num_input_tokens_seen": 68032544, "step": 56080 }, { "epoch": 6.246241229535583, "grad_norm": 0.142791286110878, "learning_rate": 1.8562152323054424e-05, "loss": 0.4792, "num_input_tokens_seen": 68038880, "step": 56085 }, { "epoch": 6.246798084419201, "grad_norm": 0.08741465210914612, "learning_rate": 1.8557456856913342e-05, "loss": 0.4477, "num_input_tokens_seen": 68044640, "step": 56090 }, { "epoch": 6.247354939302817, "grad_norm": 0.14413398504257202, "learning_rate": 1.8552761634192185e-05, "loss": 0.4696, "num_input_tokens_seen": 68050816, "step": 56095 }, { "epoch": 6.247911794186435, "grad_norm": 0.1097041666507721, "learning_rate": 1.8548066655068352e-05, "loss": 0.4788, "num_input_tokens_seen": 68057440, "step": 56100 }, { "epoch": 6.248468649070053, "grad_norm": 0.09146316349506378, "learning_rate": 1.8543371919719237e-05, "loss": 0.4621, "num_input_tokens_seen": 68063488, "step": 56105 }, { "epoch": 6.2490255039536695, "grad_norm": 0.13564641773700714, "learning_rate": 1.8538677428322223e-05, "loss": 0.4627, "num_input_tokens_seen": 68069696, "step": 56110 }, { "epoch": 6.249582358837287, "grad_norm": 0.0933772623538971, "learning_rate": 1.8533983181054677e-05, "loss": 0.461, "num_input_tokens_seen": 68075936, "step": 56115 }, { "epoch": 6.250139213720904, "grad_norm": 0.09240797162055969, "learning_rate": 1.8529289178093975e-05, "loss": 0.466, "num_input_tokens_seen": 68081792, "step": 56120 }, { "epoch": 6.250696068604522, "grad_norm": 0.07383482903242111, "learning_rate": 1.8524595419617464e-05, "loss": 0.4613, "num_input_tokens_seen": 68087808, "step": 56125 }, { "epoch": 6.251252923488139, "grad_norm": 0.13854509592056274, "learning_rate": 1.8519901905802487e-05, "loss": 0.4562, "num_input_tokens_seen": 68093632, "step": 56130 }, { "epoch": 6.251809778371756, "grad_norm": 0.09783175587654114, "learning_rate": 1.8515208636826384e-05, "loss": 0.4626, "num_input_tokens_seen": 68099776, "step": 56135 }, { "epoch": 6.252366633255374, "grad_norm": 0.10309597849845886, "learning_rate": 1.851051561286648e-05, "loss": 0.4599, "num_input_tokens_seen": 68105632, "step": 56140 }, { "epoch": 6.252923488138991, "grad_norm": 0.10766004770994186, "learning_rate": 1.850582283410009e-05, "loss": 0.4568, "num_input_tokens_seen": 68112128, "step": 56145 }, { "epoch": 6.253480343022608, "grad_norm": 0.10852601379156113, "learning_rate": 1.8501130300704526e-05, "loss": 0.4526, "num_input_tokens_seen": 68118336, "step": 56150 }, { "epoch": 6.254037197906226, "grad_norm": 0.11602827906608582, "learning_rate": 1.8496438012857088e-05, "loss": 0.4637, "num_input_tokens_seen": 68124416, "step": 56155 }, { "epoch": 6.254594052789843, "grad_norm": 0.13190317153930664, "learning_rate": 1.849174597073506e-05, "loss": 0.4743, "num_input_tokens_seen": 68130304, "step": 56160 }, { "epoch": 6.25515090767346, "grad_norm": 0.15951333940029144, "learning_rate": 1.848705417451573e-05, "loss": 0.4611, "num_input_tokens_seen": 68136512, "step": 56165 }, { "epoch": 6.255707762557078, "grad_norm": 0.1102117970585823, "learning_rate": 1.8482362624376365e-05, "loss": 0.4678, "num_input_tokens_seen": 68142400, "step": 56170 }, { "epoch": 6.256264617440695, "grad_norm": 0.0702480748295784, "learning_rate": 1.8477671320494226e-05, "loss": 0.4635, "num_input_tokens_seen": 68148416, "step": 56175 }, { "epoch": 6.2568214723243125, "grad_norm": 0.09603960812091827, "learning_rate": 1.847298026304656e-05, "loss": 0.4581, "num_input_tokens_seen": 68154304, "step": 56180 }, { "epoch": 6.257378327207929, "grad_norm": 0.11473336070775986, "learning_rate": 1.8468289452210623e-05, "loss": 0.4676, "num_input_tokens_seen": 68160448, "step": 56185 }, { "epoch": 6.257935182091547, "grad_norm": 0.09484167397022247, "learning_rate": 1.8463598888163642e-05, "loss": 0.4553, "num_input_tokens_seen": 68166592, "step": 56190 }, { "epoch": 6.258492036975165, "grad_norm": 0.09220676124095917, "learning_rate": 1.8458908571082846e-05, "loss": 0.4534, "num_input_tokens_seen": 68172704, "step": 56195 }, { "epoch": 6.259048891858781, "grad_norm": 0.11061958223581314, "learning_rate": 1.845421850114544e-05, "loss": 0.4699, "num_input_tokens_seen": 68178944, "step": 56200 }, { "epoch": 6.259605746742399, "grad_norm": 0.07716067880392075, "learning_rate": 1.8449528678528638e-05, "loss": 0.4571, "num_input_tokens_seen": 68185024, "step": 56205 }, { "epoch": 6.260162601626016, "grad_norm": 0.12670424580574036, "learning_rate": 1.8444839103409635e-05, "loss": 0.4642, "num_input_tokens_seen": 68191296, "step": 56210 }, { "epoch": 6.260719456509634, "grad_norm": 0.10730094462633133, "learning_rate": 1.8440149775965613e-05, "loss": 0.4687, "num_input_tokens_seen": 68197088, "step": 56215 }, { "epoch": 6.261276311393251, "grad_norm": 0.089483343064785, "learning_rate": 1.843546069637376e-05, "loss": 0.454, "num_input_tokens_seen": 68202976, "step": 56220 }, { "epoch": 6.261833166276868, "grad_norm": 0.08889538794755936, "learning_rate": 1.8430771864811238e-05, "loss": 0.4587, "num_input_tokens_seen": 68209216, "step": 56225 }, { "epoch": 6.262390021160486, "grad_norm": 0.09402041882276535, "learning_rate": 1.8426083281455204e-05, "loss": 0.46, "num_input_tokens_seen": 68215552, "step": 56230 }, { "epoch": 6.2629468760441025, "grad_norm": 0.06547394394874573, "learning_rate": 1.8421394946482818e-05, "loss": 0.4612, "num_input_tokens_seen": 68221600, "step": 56235 }, { "epoch": 6.26350373092772, "grad_norm": 0.10033291578292847, "learning_rate": 1.8416706860071206e-05, "loss": 0.469, "num_input_tokens_seen": 68227616, "step": 56240 }, { "epoch": 6.264060585811338, "grad_norm": 0.1261342465877533, "learning_rate": 1.8412019022397504e-05, "loss": 0.4713, "num_input_tokens_seen": 68233728, "step": 56245 }, { "epoch": 6.264617440694955, "grad_norm": 0.09932684153318405, "learning_rate": 1.840733143363884e-05, "loss": 0.4574, "num_input_tokens_seen": 68238912, "step": 56250 }, { "epoch": 6.265174295578572, "grad_norm": 0.10621221363544464, "learning_rate": 1.8402644093972325e-05, "loss": 0.4611, "num_input_tokens_seen": 68245184, "step": 56255 }, { "epoch": 6.265731150462189, "grad_norm": 0.08275888115167618, "learning_rate": 1.8397957003575056e-05, "loss": 0.4613, "num_input_tokens_seen": 68251040, "step": 56260 }, { "epoch": 6.266288005345807, "grad_norm": 0.1489553153514862, "learning_rate": 1.8393270162624126e-05, "loss": 0.4528, "num_input_tokens_seen": 68256896, "step": 56265 }, { "epoch": 6.266844860229424, "grad_norm": 0.1141846552491188, "learning_rate": 1.8388583571296624e-05, "loss": 0.4727, "num_input_tokens_seen": 68263360, "step": 56270 }, { "epoch": 6.267401715113041, "grad_norm": 0.08092931658029556, "learning_rate": 1.8383897229769617e-05, "loss": 0.4529, "num_input_tokens_seen": 68269280, "step": 56275 }, { "epoch": 6.267958569996659, "grad_norm": 0.12859728932380676, "learning_rate": 1.837921113822018e-05, "loss": 0.4562, "num_input_tokens_seen": 68275520, "step": 56280 }, { "epoch": 6.268515424880277, "grad_norm": 0.08748605102300644, "learning_rate": 1.8374525296825363e-05, "loss": 0.4608, "num_input_tokens_seen": 68281664, "step": 56285 }, { "epoch": 6.269072279763893, "grad_norm": 0.11602744460105896, "learning_rate": 1.8369839705762216e-05, "loss": 0.4542, "num_input_tokens_seen": 68287776, "step": 56290 }, { "epoch": 6.269629134647511, "grad_norm": 0.103162981569767, "learning_rate": 1.8365154365207765e-05, "loss": 0.4615, "num_input_tokens_seen": 68293856, "step": 56295 }, { "epoch": 6.270185989531128, "grad_norm": 0.07218179851770401, "learning_rate": 1.836046927533905e-05, "loss": 0.4692, "num_input_tokens_seen": 68299584, "step": 56300 }, { "epoch": 6.2707428444147455, "grad_norm": 0.08777353912591934, "learning_rate": 1.8355784436333086e-05, "loss": 0.4575, "num_input_tokens_seen": 68305760, "step": 56305 }, { "epoch": 6.271299699298363, "grad_norm": 0.1523640900850296, "learning_rate": 1.8351099848366863e-05, "loss": 0.4611, "num_input_tokens_seen": 68311456, "step": 56310 }, { "epoch": 6.27185655418198, "grad_norm": 0.12604255974292755, "learning_rate": 1.834641551161741e-05, "loss": 0.4678, "num_input_tokens_seen": 68317408, "step": 56315 }, { "epoch": 6.272413409065598, "grad_norm": 0.0955854207277298, "learning_rate": 1.83417314262617e-05, "loss": 0.4606, "num_input_tokens_seen": 68323744, "step": 56320 }, { "epoch": 6.272970263949214, "grad_norm": 0.11932647973299026, "learning_rate": 1.833704759247671e-05, "loss": 0.4472, "num_input_tokens_seen": 68329120, "step": 56325 }, { "epoch": 6.273527118832832, "grad_norm": 0.09044221043586731, "learning_rate": 1.833236401043942e-05, "loss": 0.4619, "num_input_tokens_seen": 68335488, "step": 56330 }, { "epoch": 6.27408397371645, "grad_norm": 0.1153041198849678, "learning_rate": 1.832768068032678e-05, "loss": 0.4602, "num_input_tokens_seen": 68341600, "step": 56335 }, { "epoch": 6.2746408286000666, "grad_norm": 0.10982610285282135, "learning_rate": 1.8322997602315746e-05, "loss": 0.4588, "num_input_tokens_seen": 68347840, "step": 56340 }, { "epoch": 6.275197683483684, "grad_norm": 0.10702386498451233, "learning_rate": 1.831831477658326e-05, "loss": 0.4708, "num_input_tokens_seen": 68353536, "step": 56345 }, { "epoch": 6.275754538367302, "grad_norm": 0.09678828716278076, "learning_rate": 1.831363220330626e-05, "loss": 0.4612, "num_input_tokens_seen": 68359680, "step": 56350 }, { "epoch": 6.276311393250919, "grad_norm": 0.11287640035152435, "learning_rate": 1.8308949882661658e-05, "loss": 0.4435, "num_input_tokens_seen": 68366048, "step": 56355 }, { "epoch": 6.276868248134536, "grad_norm": 0.08510196954011917, "learning_rate": 1.830426781482637e-05, "loss": 0.4511, "num_input_tokens_seen": 68372160, "step": 56360 }, { "epoch": 6.277425103018153, "grad_norm": 0.11105331033468246, "learning_rate": 1.8299585999977305e-05, "loss": 0.4712, "num_input_tokens_seen": 68378240, "step": 56365 }, { "epoch": 6.277981957901771, "grad_norm": 0.10694056004285812, "learning_rate": 1.8294904438291355e-05, "loss": 0.4719, "num_input_tokens_seen": 68384800, "step": 56370 }, { "epoch": 6.2785388127853885, "grad_norm": 0.09529366344213486, "learning_rate": 1.8290223129945394e-05, "loss": 0.4726, "num_input_tokens_seen": 68390752, "step": 56375 }, { "epoch": 6.279095667669005, "grad_norm": 0.11051572859287262, "learning_rate": 1.8285542075116316e-05, "loss": 0.4629, "num_input_tokens_seen": 68396704, "step": 56380 }, { "epoch": 6.279652522552623, "grad_norm": 0.08758744597434998, "learning_rate": 1.8280861273980972e-05, "loss": 0.4682, "num_input_tokens_seen": 68402816, "step": 56385 }, { "epoch": 6.28020937743624, "grad_norm": 0.062423225492239, "learning_rate": 1.8276180726716225e-05, "loss": 0.4583, "num_input_tokens_seen": 68408608, "step": 56390 }, { "epoch": 6.280766232319857, "grad_norm": 0.09570030868053436, "learning_rate": 1.8271500433498912e-05, "loss": 0.4468, "num_input_tokens_seen": 68414880, "step": 56395 }, { "epoch": 6.281323087203475, "grad_norm": 0.08507470041513443, "learning_rate": 1.826682039450588e-05, "loss": 0.4672, "num_input_tokens_seen": 68421120, "step": 56400 }, { "epoch": 6.281879942087092, "grad_norm": 0.09115945547819138, "learning_rate": 1.8262140609913943e-05, "loss": 0.4629, "num_input_tokens_seen": 68427424, "step": 56405 }, { "epoch": 6.28243679697071, "grad_norm": 0.0926220640540123, "learning_rate": 1.8257461079899936e-05, "loss": 0.4606, "num_input_tokens_seen": 68432832, "step": 56410 }, { "epoch": 6.282993651854326, "grad_norm": 0.11072709411382675, "learning_rate": 1.825278180464065e-05, "loss": 0.4592, "num_input_tokens_seen": 68438848, "step": 56415 }, { "epoch": 6.283550506737944, "grad_norm": 0.12039899080991745, "learning_rate": 1.8248102784312896e-05, "loss": 0.4727, "num_input_tokens_seen": 68445312, "step": 56420 }, { "epoch": 6.284107361621562, "grad_norm": 0.09128332883119583, "learning_rate": 1.8243424019093452e-05, "loss": 0.4681, "num_input_tokens_seen": 68451232, "step": 56425 }, { "epoch": 6.2846642165051785, "grad_norm": 0.08494114130735397, "learning_rate": 1.8238745509159106e-05, "loss": 0.455, "num_input_tokens_seen": 68457344, "step": 56430 }, { "epoch": 6.285221071388796, "grad_norm": 0.09801861643791199, "learning_rate": 1.8234067254686614e-05, "loss": 0.4552, "num_input_tokens_seen": 68463296, "step": 56435 }, { "epoch": 6.285777926272413, "grad_norm": 0.09618926048278809, "learning_rate": 1.8229389255852748e-05, "loss": 0.4542, "num_input_tokens_seen": 68469376, "step": 56440 }, { "epoch": 6.286334781156031, "grad_norm": 0.1438603550195694, "learning_rate": 1.8224711512834253e-05, "loss": 0.4566, "num_input_tokens_seen": 68474976, "step": 56445 }, { "epoch": 6.286891636039648, "grad_norm": 0.09421955794095993, "learning_rate": 1.822003402580787e-05, "loss": 0.4683, "num_input_tokens_seen": 68480832, "step": 56450 }, { "epoch": 6.287448490923265, "grad_norm": 0.08864030241966248, "learning_rate": 1.821535679495033e-05, "loss": 0.4534, "num_input_tokens_seen": 68486976, "step": 56455 }, { "epoch": 6.288005345806883, "grad_norm": 0.08858732134103775, "learning_rate": 1.821067982043836e-05, "loss": 0.4568, "num_input_tokens_seen": 68493280, "step": 56460 }, { "epoch": 6.2885622006905, "grad_norm": 0.0777413472533226, "learning_rate": 1.8206003102448655e-05, "loss": 0.4565, "num_input_tokens_seen": 68499616, "step": 56465 }, { "epoch": 6.289119055574117, "grad_norm": 0.11919687688350677, "learning_rate": 1.8201326641157926e-05, "loss": 0.4604, "num_input_tokens_seen": 68505728, "step": 56470 }, { "epoch": 6.289675910457735, "grad_norm": 0.1064639762043953, "learning_rate": 1.8196650436742864e-05, "loss": 0.4598, "num_input_tokens_seen": 68511616, "step": 56475 }, { "epoch": 6.290232765341352, "grad_norm": 0.12174464762210846, "learning_rate": 1.8191974489380153e-05, "loss": 0.463, "num_input_tokens_seen": 68517120, "step": 56480 }, { "epoch": 6.290789620224969, "grad_norm": 0.07366172224283218, "learning_rate": 1.818729879924646e-05, "loss": 0.4494, "num_input_tokens_seen": 68522816, "step": 56485 }, { "epoch": 6.291346475108587, "grad_norm": 0.1058579757809639, "learning_rate": 1.8182623366518448e-05, "loss": 0.4527, "num_input_tokens_seen": 68528800, "step": 56490 }, { "epoch": 6.291903329992204, "grad_norm": 0.10562137514352798, "learning_rate": 1.8177948191372775e-05, "loss": 0.4615, "num_input_tokens_seen": 68535072, "step": 56495 }, { "epoch": 6.2924601848758215, "grad_norm": 0.10897542536258698, "learning_rate": 1.8173273273986085e-05, "loss": 0.4627, "num_input_tokens_seen": 68540736, "step": 56500 }, { "epoch": 6.293017039759439, "grad_norm": 0.0809643343091011, "learning_rate": 1.8168598614535002e-05, "loss": 0.469, "num_input_tokens_seen": 68546848, "step": 56505 }, { "epoch": 6.293573894643056, "grad_norm": 0.09354867041110992, "learning_rate": 1.8163924213196166e-05, "loss": 0.4649, "num_input_tokens_seen": 68553024, "step": 56510 }, { "epoch": 6.294130749526674, "grad_norm": 0.10563871264457703, "learning_rate": 1.8159250070146174e-05, "loss": 0.4647, "num_input_tokens_seen": 68559296, "step": 56515 }, { "epoch": 6.29468760441029, "grad_norm": 0.09338808804750443, "learning_rate": 1.815457618556163e-05, "loss": 0.4576, "num_input_tokens_seen": 68564800, "step": 56520 }, { "epoch": 6.295244459293908, "grad_norm": 0.13750506937503815, "learning_rate": 1.8149902559619143e-05, "loss": 0.4637, "num_input_tokens_seen": 68570880, "step": 56525 }, { "epoch": 6.295801314177526, "grad_norm": 0.13535107672214508, "learning_rate": 1.8145229192495288e-05, "loss": 0.4644, "num_input_tokens_seen": 68576928, "step": 56530 }, { "epoch": 6.2963581690611425, "grad_norm": 0.12307317554950714, "learning_rate": 1.8140556084366643e-05, "loss": 0.4544, "num_input_tokens_seen": 68582592, "step": 56535 }, { "epoch": 6.29691502394476, "grad_norm": 0.11363580077886581, "learning_rate": 1.813588323540977e-05, "loss": 0.4627, "num_input_tokens_seen": 68588576, "step": 56540 }, { "epoch": 6.297471878828377, "grad_norm": 0.10652853548526764, "learning_rate": 1.8131210645801227e-05, "loss": 0.4725, "num_input_tokens_seen": 68594816, "step": 56545 }, { "epoch": 6.298028733711995, "grad_norm": 0.11133789271116257, "learning_rate": 1.812653831571755e-05, "loss": 0.4759, "num_input_tokens_seen": 68600512, "step": 56550 }, { "epoch": 6.298585588595612, "grad_norm": 0.10348834842443466, "learning_rate": 1.812186624533529e-05, "loss": 0.4653, "num_input_tokens_seen": 68606400, "step": 56555 }, { "epoch": 6.299142443479229, "grad_norm": 0.10594552755355835, "learning_rate": 1.8117194434830964e-05, "loss": 0.4687, "num_input_tokens_seen": 68612800, "step": 56560 }, { "epoch": 6.299699298362847, "grad_norm": 0.10722460597753525, "learning_rate": 1.811252288438109e-05, "loss": 0.4598, "num_input_tokens_seen": 68619008, "step": 56565 }, { "epoch": 6.300256153246464, "grad_norm": 0.12426500022411346, "learning_rate": 1.8107851594162173e-05, "loss": 0.4702, "num_input_tokens_seen": 68624960, "step": 56570 }, { "epoch": 6.300813008130081, "grad_norm": 0.06475210189819336, "learning_rate": 1.8103180564350712e-05, "loss": 0.46, "num_input_tokens_seen": 68630848, "step": 56575 }, { "epoch": 6.301369863013699, "grad_norm": 0.10408558696508408, "learning_rate": 1.8098509795123188e-05, "loss": 0.4493, "num_input_tokens_seen": 68637248, "step": 56580 }, { "epoch": 6.301926717897316, "grad_norm": 0.13500787317752838, "learning_rate": 1.8093839286656078e-05, "loss": 0.4544, "num_input_tokens_seen": 68643328, "step": 56585 }, { "epoch": 6.302483572780933, "grad_norm": 0.07804083824157715, "learning_rate": 1.8089169039125853e-05, "loss": 0.448, "num_input_tokens_seen": 68649504, "step": 56590 }, { "epoch": 6.30304042766455, "grad_norm": 0.11187610030174255, "learning_rate": 1.8084499052708973e-05, "loss": 0.4527, "num_input_tokens_seen": 68655904, "step": 56595 }, { "epoch": 6.303597282548168, "grad_norm": 0.09056063741445541, "learning_rate": 1.8079829327581876e-05, "loss": 0.4708, "num_input_tokens_seen": 68661952, "step": 56600 }, { "epoch": 6.3041541374317855, "grad_norm": 0.10494466871023178, "learning_rate": 1.807515986392101e-05, "loss": 0.4512, "num_input_tokens_seen": 68668160, "step": 56605 }, { "epoch": 6.304710992315402, "grad_norm": 0.08610603213310242, "learning_rate": 1.8070490661902792e-05, "loss": 0.457, "num_input_tokens_seen": 68674496, "step": 56610 }, { "epoch": 6.30526784719902, "grad_norm": 0.1298309564590454, "learning_rate": 1.8065821721703638e-05, "loss": 0.4552, "num_input_tokens_seen": 68680448, "step": 56615 }, { "epoch": 6.305824702082638, "grad_norm": 0.1072298064827919, "learning_rate": 1.8061153043499967e-05, "loss": 0.4696, "num_input_tokens_seen": 68686624, "step": 56620 }, { "epoch": 6.3063815569662545, "grad_norm": 0.08395116776227951, "learning_rate": 1.8056484627468172e-05, "loss": 0.4569, "num_input_tokens_seen": 68692576, "step": 56625 }, { "epoch": 6.306938411849872, "grad_norm": 0.10196610540151596, "learning_rate": 1.805181647378464e-05, "loss": 0.4481, "num_input_tokens_seen": 68698272, "step": 56630 }, { "epoch": 6.307495266733489, "grad_norm": 0.08834808319807053, "learning_rate": 1.8047148582625745e-05, "loss": 0.4624, "num_input_tokens_seen": 68704224, "step": 56635 }, { "epoch": 6.308052121617107, "grad_norm": 0.08521444350481033, "learning_rate": 1.804248095416786e-05, "loss": 0.4567, "num_input_tokens_seen": 68710336, "step": 56640 }, { "epoch": 6.308608976500724, "grad_norm": 0.10644997656345367, "learning_rate": 1.803781358858734e-05, "loss": 0.4546, "num_input_tokens_seen": 68716384, "step": 56645 }, { "epoch": 6.309165831384341, "grad_norm": 0.11226975917816162, "learning_rate": 1.803314648606053e-05, "loss": 0.4662, "num_input_tokens_seen": 68722368, "step": 56650 }, { "epoch": 6.309722686267959, "grad_norm": 0.16684779524803162, "learning_rate": 1.8028479646763775e-05, "loss": 0.4769, "num_input_tokens_seen": 68728256, "step": 56655 }, { "epoch": 6.3102795411515755, "grad_norm": 0.10939918458461761, "learning_rate": 1.8023813070873402e-05, "loss": 0.4691, "num_input_tokens_seen": 68734560, "step": 56660 }, { "epoch": 6.310836396035193, "grad_norm": 0.12578170001506805, "learning_rate": 1.8019146758565727e-05, "loss": 0.4572, "num_input_tokens_seen": 68740768, "step": 56665 }, { "epoch": 6.311393250918811, "grad_norm": 0.09558861702680588, "learning_rate": 1.801448071001706e-05, "loss": 0.467, "num_input_tokens_seen": 68747264, "step": 56670 }, { "epoch": 6.311950105802428, "grad_norm": 0.13353465497493744, "learning_rate": 1.8009814925403695e-05, "loss": 0.4654, "num_input_tokens_seen": 68753216, "step": 56675 }, { "epoch": 6.312506960686045, "grad_norm": 0.09104961901903152, "learning_rate": 1.800514940490192e-05, "loss": 0.4591, "num_input_tokens_seen": 68759424, "step": 56680 }, { "epoch": 6.313063815569663, "grad_norm": 0.1005600169301033, "learning_rate": 1.8000484148688022e-05, "loss": 0.4585, "num_input_tokens_seen": 68764416, "step": 56685 }, { "epoch": 6.31362067045328, "grad_norm": 0.09512568265199661, "learning_rate": 1.7995819156938264e-05, "loss": 0.4566, "num_input_tokens_seen": 68770784, "step": 56690 }, { "epoch": 6.3141775253368975, "grad_norm": 0.08262421190738678, "learning_rate": 1.7991154429828904e-05, "loss": 0.4707, "num_input_tokens_seen": 68776960, "step": 56695 }, { "epoch": 6.314734380220514, "grad_norm": 0.12827369570732117, "learning_rate": 1.7986489967536192e-05, "loss": 0.4566, "num_input_tokens_seen": 68783008, "step": 56700 }, { "epoch": 6.315291235104132, "grad_norm": 0.10586816817522049, "learning_rate": 1.7981825770236366e-05, "loss": 0.4562, "num_input_tokens_seen": 68788960, "step": 56705 }, { "epoch": 6.31584808998775, "grad_norm": 0.10328317433595657, "learning_rate": 1.7977161838105648e-05, "loss": 0.4657, "num_input_tokens_seen": 68795072, "step": 56710 }, { "epoch": 6.316404944871366, "grad_norm": 0.10269631445407867, "learning_rate": 1.7972498171320263e-05, "loss": 0.462, "num_input_tokens_seen": 68801152, "step": 56715 }, { "epoch": 6.316961799754984, "grad_norm": 0.08439266681671143, "learning_rate": 1.7967834770056416e-05, "loss": 0.4574, "num_input_tokens_seen": 68807136, "step": 56720 }, { "epoch": 6.317518654638601, "grad_norm": 0.08292268961668015, "learning_rate": 1.7963171634490312e-05, "loss": 0.4533, "num_input_tokens_seen": 68812672, "step": 56725 }, { "epoch": 6.3180755095222185, "grad_norm": 0.08812649548053741, "learning_rate": 1.7958508764798133e-05, "loss": 0.4598, "num_input_tokens_seen": 68818432, "step": 56730 }, { "epoch": 6.318632364405836, "grad_norm": 0.09452399611473083, "learning_rate": 1.7953846161156058e-05, "loss": 0.4556, "num_input_tokens_seen": 68824608, "step": 56735 }, { "epoch": 6.319189219289453, "grad_norm": 0.09897787123918533, "learning_rate": 1.7949183823740254e-05, "loss": 0.4634, "num_input_tokens_seen": 68830528, "step": 56740 }, { "epoch": 6.319746074173071, "grad_norm": 0.09166624397039413, "learning_rate": 1.794452175272688e-05, "loss": 0.4571, "num_input_tokens_seen": 68836352, "step": 56745 }, { "epoch": 6.3203029290566874, "grad_norm": 0.0983821302652359, "learning_rate": 1.7939859948292086e-05, "loss": 0.4483, "num_input_tokens_seen": 68842624, "step": 56750 }, { "epoch": 6.320859783940305, "grad_norm": 0.11270195990800858, "learning_rate": 1.793519841061201e-05, "loss": 0.4636, "num_input_tokens_seen": 68848992, "step": 56755 }, { "epoch": 6.321416638823923, "grad_norm": 0.07873315364122391, "learning_rate": 1.7930537139862773e-05, "loss": 0.4618, "num_input_tokens_seen": 68855104, "step": 56760 }, { "epoch": 6.32197349370754, "grad_norm": 0.17454533278942108, "learning_rate": 1.7925876136220504e-05, "loss": 0.4732, "num_input_tokens_seen": 68861120, "step": 56765 }, { "epoch": 6.322530348591157, "grad_norm": 0.09411346167325974, "learning_rate": 1.79212153998613e-05, "loss": 0.4551, "num_input_tokens_seen": 68867424, "step": 56770 }, { "epoch": 6.323087203474774, "grad_norm": 0.09550875425338745, "learning_rate": 1.7916554930961266e-05, "loss": 0.4584, "num_input_tokens_seen": 68873920, "step": 56775 }, { "epoch": 6.323644058358392, "grad_norm": 0.10261201113462448, "learning_rate": 1.791189472969649e-05, "loss": 0.4782, "num_input_tokens_seen": 68879424, "step": 56780 }, { "epoch": 6.324200913242009, "grad_norm": 0.08500313758850098, "learning_rate": 1.790723479624304e-05, "loss": 0.4635, "num_input_tokens_seen": 68885120, "step": 56785 }, { "epoch": 6.324757768125626, "grad_norm": 0.1292383074760437, "learning_rate": 1.790257513077699e-05, "loss": 0.4667, "num_input_tokens_seen": 68891264, "step": 56790 }, { "epoch": 6.325314623009244, "grad_norm": 0.10507199168205261, "learning_rate": 1.7897915733474402e-05, "loss": 0.4622, "num_input_tokens_seen": 68897152, "step": 56795 }, { "epoch": 6.3258714778928615, "grad_norm": 0.14303530752658844, "learning_rate": 1.7893256604511314e-05, "loss": 0.4621, "num_input_tokens_seen": 68903456, "step": 56800 }, { "epoch": 6.326428332776478, "grad_norm": 0.1613699048757553, "learning_rate": 1.788859774406377e-05, "loss": 0.4577, "num_input_tokens_seen": 68909600, "step": 56805 }, { "epoch": 6.326985187660096, "grad_norm": 0.11442548036575317, "learning_rate": 1.788393915230778e-05, "loss": 0.4627, "num_input_tokens_seen": 68915584, "step": 56810 }, { "epoch": 6.327542042543713, "grad_norm": 0.09855125099420547, "learning_rate": 1.787928082941938e-05, "loss": 0.4569, "num_input_tokens_seen": 68921920, "step": 56815 }, { "epoch": 6.3280988974273304, "grad_norm": 0.12075375765562057, "learning_rate": 1.7874622775574575e-05, "loss": 0.4644, "num_input_tokens_seen": 68928032, "step": 56820 }, { "epoch": 6.328655752310948, "grad_norm": 0.08845115453004837, "learning_rate": 1.7869964990949352e-05, "loss": 0.4442, "num_input_tokens_seen": 68933888, "step": 56825 }, { "epoch": 6.329212607194565, "grad_norm": 0.10312417894601822, "learning_rate": 1.7865307475719703e-05, "loss": 0.4651, "num_input_tokens_seen": 68940064, "step": 56830 }, { "epoch": 6.329769462078183, "grad_norm": 0.09682042896747589, "learning_rate": 1.7860650230061602e-05, "loss": 0.4659, "num_input_tokens_seen": 68945888, "step": 56835 }, { "epoch": 6.330326316961799, "grad_norm": 0.13076144456863403, "learning_rate": 1.785599325415101e-05, "loss": 0.4732, "num_input_tokens_seen": 68951840, "step": 56840 }, { "epoch": 6.330883171845417, "grad_norm": 0.1447155475616455, "learning_rate": 1.785133654816389e-05, "loss": 0.4645, "num_input_tokens_seen": 68958176, "step": 56845 }, { "epoch": 6.331440026729035, "grad_norm": 0.09415744245052338, "learning_rate": 1.784668011227618e-05, "loss": 0.4671, "num_input_tokens_seen": 68964480, "step": 56850 }, { "epoch": 6.3319968816126515, "grad_norm": 0.13536392152309418, "learning_rate": 1.7842023946663828e-05, "loss": 0.4583, "num_input_tokens_seen": 68970496, "step": 56855 }, { "epoch": 6.332553736496269, "grad_norm": 0.09254352003335953, "learning_rate": 1.7837368051502746e-05, "loss": 0.4726, "num_input_tokens_seen": 68976256, "step": 56860 }, { "epoch": 6.333110591379887, "grad_norm": 0.09510966390371323, "learning_rate": 1.7832712426968854e-05, "loss": 0.456, "num_input_tokens_seen": 68982464, "step": 56865 }, { "epoch": 6.333667446263504, "grad_norm": 0.08266212046146393, "learning_rate": 1.782805707323806e-05, "loss": 0.4667, "num_input_tokens_seen": 68988736, "step": 56870 }, { "epoch": 6.334224301147121, "grad_norm": 0.10565759986639023, "learning_rate": 1.7823401990486246e-05, "loss": 0.4628, "num_input_tokens_seen": 68994816, "step": 56875 }, { "epoch": 6.334781156030738, "grad_norm": 0.10489228367805481, "learning_rate": 1.7818747178889304e-05, "loss": 0.4594, "num_input_tokens_seen": 69000544, "step": 56880 }, { "epoch": 6.335338010914356, "grad_norm": 0.09078161418437958, "learning_rate": 1.781409263862311e-05, "loss": 0.4632, "num_input_tokens_seen": 69006656, "step": 56885 }, { "epoch": 6.3358948657979735, "grad_norm": 0.08790735900402069, "learning_rate": 1.780943836986352e-05, "loss": 0.47, "num_input_tokens_seen": 69012896, "step": 56890 }, { "epoch": 6.33645172068159, "grad_norm": 0.09010221809148788, "learning_rate": 1.780478437278639e-05, "loss": 0.4582, "num_input_tokens_seen": 69019072, "step": 56895 }, { "epoch": 6.337008575565208, "grad_norm": 0.1317562311887741, "learning_rate": 1.780013064756757e-05, "loss": 0.4615, "num_input_tokens_seen": 69024992, "step": 56900 }, { "epoch": 6.337565430448825, "grad_norm": 0.10220645368099213, "learning_rate": 1.7795477194382887e-05, "loss": 0.4647, "num_input_tokens_seen": 69031008, "step": 56905 }, { "epoch": 6.338122285332442, "grad_norm": 0.13252711296081543, "learning_rate": 1.779082401340816e-05, "loss": 0.4576, "num_input_tokens_seen": 69036992, "step": 56910 }, { "epoch": 6.33867914021606, "grad_norm": 0.11848348379135132, "learning_rate": 1.778617110481921e-05, "loss": 0.4681, "num_input_tokens_seen": 69043232, "step": 56915 }, { "epoch": 6.339235995099677, "grad_norm": 0.11193104833364487, "learning_rate": 1.778151846879183e-05, "loss": 0.4592, "num_input_tokens_seen": 69049344, "step": 56920 }, { "epoch": 6.3397928499832945, "grad_norm": 0.09675481915473938, "learning_rate": 1.777686610550181e-05, "loss": 0.4572, "num_input_tokens_seen": 69055168, "step": 56925 }, { "epoch": 6.340349704866911, "grad_norm": 0.14082464575767517, "learning_rate": 1.777221401512494e-05, "loss": 0.4615, "num_input_tokens_seen": 69061376, "step": 56930 }, { "epoch": 6.340906559750529, "grad_norm": 0.08604888617992401, "learning_rate": 1.776756219783699e-05, "loss": 0.4576, "num_input_tokens_seen": 69067520, "step": 56935 }, { "epoch": 6.341463414634147, "grad_norm": 0.07792697101831436, "learning_rate": 1.776291065381372e-05, "loss": 0.4574, "num_input_tokens_seen": 69073568, "step": 56940 }, { "epoch": 6.342020269517763, "grad_norm": 0.12091962993144989, "learning_rate": 1.7758259383230875e-05, "loss": 0.4513, "num_input_tokens_seen": 69080064, "step": 56945 }, { "epoch": 6.342577124401381, "grad_norm": 0.10754009336233139, "learning_rate": 1.7753608386264196e-05, "loss": 0.4366, "num_input_tokens_seen": 69085600, "step": 56950 }, { "epoch": 6.343133979284998, "grad_norm": 0.09917589277029037, "learning_rate": 1.774895766308941e-05, "loss": 0.4721, "num_input_tokens_seen": 69091808, "step": 56955 }, { "epoch": 6.343690834168616, "grad_norm": 0.09493527561426163, "learning_rate": 1.7744307213882244e-05, "loss": 0.4627, "num_input_tokens_seen": 69097984, "step": 56960 }, { "epoch": 6.344247689052233, "grad_norm": 0.12673956155776978, "learning_rate": 1.7739657038818407e-05, "loss": 0.4511, "num_input_tokens_seen": 69104128, "step": 56965 }, { "epoch": 6.34480454393585, "grad_norm": 0.09093877673149109, "learning_rate": 1.7735007138073595e-05, "loss": 0.4568, "num_input_tokens_seen": 69110304, "step": 56970 }, { "epoch": 6.345361398819468, "grad_norm": 0.11161910742521286, "learning_rate": 1.773035751182349e-05, "loss": 0.457, "num_input_tokens_seen": 69116096, "step": 56975 }, { "epoch": 6.345918253703085, "grad_norm": 0.07888548821210861, "learning_rate": 1.772570816024378e-05, "loss": 0.4567, "num_input_tokens_seen": 69122304, "step": 56980 }, { "epoch": 6.346475108586702, "grad_norm": 0.10474436730146408, "learning_rate": 1.7721059083510123e-05, "loss": 0.4631, "num_input_tokens_seen": 69128384, "step": 56985 }, { "epoch": 6.34703196347032, "grad_norm": 0.11904727667570114, "learning_rate": 1.7716410281798178e-05, "loss": 0.4571, "num_input_tokens_seen": 69133856, "step": 56990 }, { "epoch": 6.347588818353937, "grad_norm": 0.11377822607755661, "learning_rate": 1.77117617552836e-05, "loss": 0.4663, "num_input_tokens_seen": 69139680, "step": 56995 }, { "epoch": 6.348145673237554, "grad_norm": 0.10619548708200455, "learning_rate": 1.7707113504142017e-05, "loss": 0.4639, "num_input_tokens_seen": 69145216, "step": 57000 }, { "epoch": 6.348702528121172, "grad_norm": 0.1301933079957962, "learning_rate": 1.770246552854906e-05, "loss": 0.4625, "num_input_tokens_seen": 69151712, "step": 57005 }, { "epoch": 6.349259383004789, "grad_norm": 0.10812414437532425, "learning_rate": 1.7697817828680337e-05, "loss": 0.4793, "num_input_tokens_seen": 69157664, "step": 57010 }, { "epoch": 6.349816237888406, "grad_norm": 0.11713004857301712, "learning_rate": 1.769317040471146e-05, "loss": 0.4466, "num_input_tokens_seen": 69163552, "step": 57015 }, { "epoch": 6.350373092772023, "grad_norm": 0.08756110817193985, "learning_rate": 1.7688523256818016e-05, "loss": 0.4696, "num_input_tokens_seen": 69169184, "step": 57020 }, { "epoch": 6.350929947655641, "grad_norm": 0.12211665511131287, "learning_rate": 1.7683876385175598e-05, "loss": 0.4541, "num_input_tokens_seen": 69175200, "step": 57025 }, { "epoch": 6.351486802539259, "grad_norm": 0.08333408832550049, "learning_rate": 1.7679229789959777e-05, "loss": 0.4684, "num_input_tokens_seen": 69181376, "step": 57030 }, { "epoch": 6.352043657422875, "grad_norm": 0.10364113003015518, "learning_rate": 1.7674583471346115e-05, "loss": 0.4715, "num_input_tokens_seen": 69187296, "step": 57035 }, { "epoch": 6.352600512306493, "grad_norm": 0.07267658412456512, "learning_rate": 1.7669937429510163e-05, "loss": 0.4471, "num_input_tokens_seen": 69193344, "step": 57040 }, { "epoch": 6.353157367190111, "grad_norm": 0.09121514856815338, "learning_rate": 1.766529166462747e-05, "loss": 0.4615, "num_input_tokens_seen": 69199136, "step": 57045 }, { "epoch": 6.3537142220737275, "grad_norm": 0.07449079304933548, "learning_rate": 1.766064617687355e-05, "loss": 0.4654, "num_input_tokens_seen": 69205472, "step": 57050 }, { "epoch": 6.354271076957345, "grad_norm": 0.07769224047660828, "learning_rate": 1.7656000966423954e-05, "loss": 0.4601, "num_input_tokens_seen": 69211520, "step": 57055 }, { "epoch": 6.354827931840962, "grad_norm": 0.11890284717082977, "learning_rate": 1.7651356033454167e-05, "loss": 0.4623, "num_input_tokens_seen": 69217888, "step": 57060 }, { "epoch": 6.35538478672458, "grad_norm": 0.07455195486545563, "learning_rate": 1.76467113781397e-05, "loss": 0.4563, "num_input_tokens_seen": 69223904, "step": 57065 }, { "epoch": 6.355941641608197, "grad_norm": 0.09946233779191971, "learning_rate": 1.7642067000656045e-05, "loss": 0.4643, "num_input_tokens_seen": 69229920, "step": 57070 }, { "epoch": 6.356498496491814, "grad_norm": 0.09902910143136978, "learning_rate": 1.763742290117868e-05, "loss": 0.4614, "num_input_tokens_seen": 69236000, "step": 57075 }, { "epoch": 6.357055351375432, "grad_norm": 0.06770460307598114, "learning_rate": 1.7632779079883065e-05, "loss": 0.4606, "num_input_tokens_seen": 69241888, "step": 57080 }, { "epoch": 6.3576122062590485, "grad_norm": 0.10000240802764893, "learning_rate": 1.762813553694467e-05, "loss": 0.4502, "num_input_tokens_seen": 69248256, "step": 57085 }, { "epoch": 6.358169061142666, "grad_norm": 0.09816259890794754, "learning_rate": 1.7623492272538936e-05, "loss": 0.4563, "num_input_tokens_seen": 69254432, "step": 57090 }, { "epoch": 6.358725916026284, "grad_norm": 0.1191696897149086, "learning_rate": 1.7618849286841306e-05, "loss": 0.4649, "num_input_tokens_seen": 69260544, "step": 57095 }, { "epoch": 6.359282770909901, "grad_norm": 0.10631587356328964, "learning_rate": 1.7614206580027205e-05, "loss": 0.4535, "num_input_tokens_seen": 69266848, "step": 57100 }, { "epoch": 6.359839625793518, "grad_norm": 0.1335684061050415, "learning_rate": 1.7609564152272044e-05, "loss": 0.4674, "num_input_tokens_seen": 69273056, "step": 57105 }, { "epoch": 6.360396480677135, "grad_norm": 0.09108737111091614, "learning_rate": 1.760492200375124e-05, "loss": 0.4622, "num_input_tokens_seen": 69279168, "step": 57110 }, { "epoch": 6.360953335560753, "grad_norm": 0.1208215281367302, "learning_rate": 1.7600280134640173e-05, "loss": 0.4666, "num_input_tokens_seen": 69285152, "step": 57115 }, { "epoch": 6.3615101904443705, "grad_norm": 0.08765064924955368, "learning_rate": 1.759563854511424e-05, "loss": 0.4475, "num_input_tokens_seen": 69291488, "step": 57120 }, { "epoch": 6.362067045327987, "grad_norm": 0.11785659939050674, "learning_rate": 1.7590997235348812e-05, "loss": 0.4662, "num_input_tokens_seen": 69297472, "step": 57125 }, { "epoch": 6.362623900211605, "grad_norm": 0.12133604288101196, "learning_rate": 1.7586356205519254e-05, "loss": 0.4533, "num_input_tokens_seen": 69303072, "step": 57130 }, { "epoch": 6.363180755095222, "grad_norm": 0.0913882926106453, "learning_rate": 1.7581715455800913e-05, "loss": 0.4615, "num_input_tokens_seen": 69309088, "step": 57135 }, { "epoch": 6.363737609978839, "grad_norm": 0.08582212030887604, "learning_rate": 1.757707498636914e-05, "loss": 0.4665, "num_input_tokens_seen": 69314592, "step": 57140 }, { "epoch": 6.364294464862457, "grad_norm": 0.11946235597133636, "learning_rate": 1.7572434797399257e-05, "loss": 0.4597, "num_input_tokens_seen": 69320640, "step": 57145 }, { "epoch": 6.364851319746074, "grad_norm": 0.09209536015987396, "learning_rate": 1.7567794889066586e-05, "loss": 0.4431, "num_input_tokens_seen": 69326752, "step": 57150 }, { "epoch": 6.3654081746296916, "grad_norm": 0.11585525423288345, "learning_rate": 1.756315526154645e-05, "loss": 0.4462, "num_input_tokens_seen": 69333056, "step": 57155 }, { "epoch": 6.365965029513309, "grad_norm": 0.09500102698802948, "learning_rate": 1.7558515915014137e-05, "loss": 0.4625, "num_input_tokens_seen": 69338944, "step": 57160 }, { "epoch": 6.366521884396926, "grad_norm": 0.07884222269058228, "learning_rate": 1.7553876849644942e-05, "loss": 0.4707, "num_input_tokens_seen": 69345248, "step": 57165 }, { "epoch": 6.367078739280544, "grad_norm": 0.08537571132183075, "learning_rate": 1.7549238065614142e-05, "loss": 0.4665, "num_input_tokens_seen": 69351456, "step": 57170 }, { "epoch": 6.3676355941641605, "grad_norm": 0.09452622383832932, "learning_rate": 1.7544599563097002e-05, "loss": 0.4477, "num_input_tokens_seen": 69357248, "step": 57175 }, { "epoch": 6.368192449047778, "grad_norm": 0.09129969775676727, "learning_rate": 1.7539961342268783e-05, "loss": 0.4658, "num_input_tokens_seen": 69363168, "step": 57180 }, { "epoch": 6.368749303931396, "grad_norm": 0.07869859784841537, "learning_rate": 1.7535323403304734e-05, "loss": 0.4594, "num_input_tokens_seen": 69369344, "step": 57185 }, { "epoch": 6.369306158815013, "grad_norm": 0.0796724334359169, "learning_rate": 1.7530685746380087e-05, "loss": 0.467, "num_input_tokens_seen": 69375616, "step": 57190 }, { "epoch": 6.36986301369863, "grad_norm": 0.08608657866716385, "learning_rate": 1.752604837167007e-05, "loss": 0.4716, "num_input_tokens_seen": 69381728, "step": 57195 }, { "epoch": 6.370419868582247, "grad_norm": 0.08979886770248413, "learning_rate": 1.75214112793499e-05, "loss": 0.4581, "num_input_tokens_seen": 69387552, "step": 57200 }, { "epoch": 6.370976723465865, "grad_norm": 0.09502381086349487, "learning_rate": 1.751677446959477e-05, "loss": 0.4627, "num_input_tokens_seen": 69393696, "step": 57205 }, { "epoch": 6.371533578349482, "grad_norm": 0.09295960515737534, "learning_rate": 1.751213794257989e-05, "loss": 0.4637, "num_input_tokens_seen": 69399872, "step": 57210 }, { "epoch": 6.372090433233099, "grad_norm": 0.10725373774766922, "learning_rate": 1.7507501698480428e-05, "loss": 0.4615, "num_input_tokens_seen": 69406048, "step": 57215 }, { "epoch": 6.372647288116717, "grad_norm": 0.09849516302347183, "learning_rate": 1.7502865737471562e-05, "loss": 0.4595, "num_input_tokens_seen": 69411904, "step": 57220 }, { "epoch": 6.3732041430003346, "grad_norm": 0.11976908147335052, "learning_rate": 1.749823005972846e-05, "loss": 0.469, "num_input_tokens_seen": 69418144, "step": 57225 }, { "epoch": 6.373760997883951, "grad_norm": 0.1301608830690384, "learning_rate": 1.7493594665426266e-05, "loss": 0.4671, "num_input_tokens_seen": 69424000, "step": 57230 }, { "epoch": 6.374317852767569, "grad_norm": 0.0968356505036354, "learning_rate": 1.7488959554740118e-05, "loss": 0.4673, "num_input_tokens_seen": 69430176, "step": 57235 }, { "epoch": 6.374874707651186, "grad_norm": 0.09804612398147583, "learning_rate": 1.7484324727845153e-05, "loss": 0.4587, "num_input_tokens_seen": 69436096, "step": 57240 }, { "epoch": 6.3754315625348035, "grad_norm": 0.10204023122787476, "learning_rate": 1.7479690184916478e-05, "loss": 0.4704, "num_input_tokens_seen": 69442080, "step": 57245 }, { "epoch": 6.375988417418421, "grad_norm": 0.09826391190290451, "learning_rate": 1.7475055926129214e-05, "loss": 0.4611, "num_input_tokens_seen": 69448192, "step": 57250 }, { "epoch": 6.376545272302038, "grad_norm": 0.1018429771065712, "learning_rate": 1.747042195165845e-05, "loss": 0.4618, "num_input_tokens_seen": 69453760, "step": 57255 }, { "epoch": 6.377102127185656, "grad_norm": 0.10356582701206207, "learning_rate": 1.746578826167927e-05, "loss": 0.463, "num_input_tokens_seen": 69459776, "step": 57260 }, { "epoch": 6.377658982069272, "grad_norm": 0.11348867416381836, "learning_rate": 1.746115485636676e-05, "loss": 0.4637, "num_input_tokens_seen": 69466208, "step": 57265 }, { "epoch": 6.37821583695289, "grad_norm": 0.10901308059692383, "learning_rate": 1.7456521735895982e-05, "loss": 0.4604, "num_input_tokens_seen": 69472352, "step": 57270 }, { "epoch": 6.378772691836508, "grad_norm": 0.10712116211652756, "learning_rate": 1.7451888900441975e-05, "loss": 0.4609, "num_input_tokens_seen": 69478080, "step": 57275 }, { "epoch": 6.3793295467201245, "grad_norm": 0.0882701650261879, "learning_rate": 1.7447256350179813e-05, "loss": 0.4564, "num_input_tokens_seen": 69484000, "step": 57280 }, { "epoch": 6.379886401603742, "grad_norm": 0.09892264008522034, "learning_rate": 1.7442624085284502e-05, "loss": 0.4692, "num_input_tokens_seen": 69490112, "step": 57285 }, { "epoch": 6.380443256487359, "grad_norm": 0.0957852154970169, "learning_rate": 1.7437992105931068e-05, "loss": 0.453, "num_input_tokens_seen": 69496192, "step": 57290 }, { "epoch": 6.381000111370977, "grad_norm": 0.09216012805700302, "learning_rate": 1.7433360412294522e-05, "loss": 0.4635, "num_input_tokens_seen": 69502336, "step": 57295 }, { "epoch": 6.381556966254594, "grad_norm": 0.11863426119089127, "learning_rate": 1.7428729004549872e-05, "loss": 0.465, "num_input_tokens_seen": 69508512, "step": 57300 }, { "epoch": 6.382113821138211, "grad_norm": 0.11507481336593628, "learning_rate": 1.7424097882872107e-05, "loss": 0.456, "num_input_tokens_seen": 69514528, "step": 57305 }, { "epoch": 6.382670676021829, "grad_norm": 0.12180694937705994, "learning_rate": 1.74194670474362e-05, "loss": 0.4599, "num_input_tokens_seen": 69520800, "step": 57310 }, { "epoch": 6.383227530905446, "grad_norm": 0.08631504327058792, "learning_rate": 1.741483649841712e-05, "loss": 0.4612, "num_input_tokens_seen": 69527200, "step": 57315 }, { "epoch": 6.383784385789063, "grad_norm": 0.12113255262374878, "learning_rate": 1.7410206235989828e-05, "loss": 0.4548, "num_input_tokens_seen": 69533568, "step": 57320 }, { "epoch": 6.384341240672681, "grad_norm": 0.08877629786729813, "learning_rate": 1.7405576260329263e-05, "loss": 0.4516, "num_input_tokens_seen": 69539456, "step": 57325 }, { "epoch": 6.384898095556298, "grad_norm": 0.09445023536682129, "learning_rate": 1.740094657161036e-05, "loss": 0.4678, "num_input_tokens_seen": 69545248, "step": 57330 }, { "epoch": 6.385454950439915, "grad_norm": 0.10864967852830887, "learning_rate": 1.7396317170008055e-05, "loss": 0.4652, "num_input_tokens_seen": 69551360, "step": 57335 }, { "epoch": 6.386011805323533, "grad_norm": 0.08049943298101425, "learning_rate": 1.7391688055697257e-05, "loss": 0.466, "num_input_tokens_seen": 69557344, "step": 57340 }, { "epoch": 6.38656866020715, "grad_norm": 0.08174148201942444, "learning_rate": 1.7387059228852862e-05, "loss": 0.4616, "num_input_tokens_seen": 69563584, "step": 57345 }, { "epoch": 6.3871255150907675, "grad_norm": 0.08723731338977814, "learning_rate": 1.7382430689649767e-05, "loss": 0.4664, "num_input_tokens_seen": 69569088, "step": 57350 }, { "epoch": 6.387682369974384, "grad_norm": 0.08686867356300354, "learning_rate": 1.7377802438262853e-05, "loss": 0.4678, "num_input_tokens_seen": 69575072, "step": 57355 }, { "epoch": 6.388239224858002, "grad_norm": 0.11832884699106216, "learning_rate": 1.7373174474866983e-05, "loss": 0.4576, "num_input_tokens_seen": 69581408, "step": 57360 }, { "epoch": 6.38879607974162, "grad_norm": 0.10199880599975586, "learning_rate": 1.736854679963703e-05, "loss": 0.4574, "num_input_tokens_seen": 69586944, "step": 57365 }, { "epoch": 6.3893529346252365, "grad_norm": 0.09739097207784653, "learning_rate": 1.7363919412747832e-05, "loss": 0.4617, "num_input_tokens_seen": 69592928, "step": 57370 }, { "epoch": 6.389909789508854, "grad_norm": 0.09989887475967407, "learning_rate": 1.735929231437423e-05, "loss": 0.456, "num_input_tokens_seen": 69599072, "step": 57375 }, { "epoch": 6.390466644392471, "grad_norm": 0.08145791292190552, "learning_rate": 1.7354665504691052e-05, "loss": 0.4567, "num_input_tokens_seen": 69605216, "step": 57380 }, { "epoch": 6.391023499276089, "grad_norm": 0.09957008063793182, "learning_rate": 1.735003898387311e-05, "loss": 0.4662, "num_input_tokens_seen": 69611392, "step": 57385 }, { "epoch": 6.391580354159706, "grad_norm": 0.08913018554449081, "learning_rate": 1.734541275209521e-05, "loss": 0.4757, "num_input_tokens_seen": 69617760, "step": 57390 }, { "epoch": 6.392137209043323, "grad_norm": 0.08404158800840378, "learning_rate": 1.734078680953215e-05, "loss": 0.467, "num_input_tokens_seen": 69623712, "step": 57395 }, { "epoch": 6.392694063926941, "grad_norm": 0.13994525372982025, "learning_rate": 1.7336161156358706e-05, "loss": 0.4614, "num_input_tokens_seen": 69629952, "step": 57400 }, { "epoch": 6.393250918810558, "grad_norm": 0.0936124324798584, "learning_rate": 1.7331535792749656e-05, "loss": 0.4652, "num_input_tokens_seen": 69636192, "step": 57405 }, { "epoch": 6.393807773694175, "grad_norm": 0.0918331891298294, "learning_rate": 1.7326910718879758e-05, "loss": 0.4553, "num_input_tokens_seen": 69642176, "step": 57410 }, { "epoch": 6.394364628577793, "grad_norm": 0.1388392597436905, "learning_rate": 1.7322285934923767e-05, "loss": 0.4706, "num_input_tokens_seen": 69648672, "step": 57415 }, { "epoch": 6.39492148346141, "grad_norm": 0.10584836453199387, "learning_rate": 1.731766144105641e-05, "loss": 0.4544, "num_input_tokens_seen": 69654656, "step": 57420 }, { "epoch": 6.395478338345027, "grad_norm": 0.09998225420713425, "learning_rate": 1.7313037237452422e-05, "loss": 0.4707, "num_input_tokens_seen": 69660960, "step": 57425 }, { "epoch": 6.396035193228645, "grad_norm": 0.11621600389480591, "learning_rate": 1.7308413324286527e-05, "loss": 0.4611, "num_input_tokens_seen": 69666912, "step": 57430 }, { "epoch": 6.396592048112262, "grad_norm": 0.09820809215307236, "learning_rate": 1.7303789701733426e-05, "loss": 0.4512, "num_input_tokens_seen": 69672864, "step": 57435 }, { "epoch": 6.3971489029958795, "grad_norm": 0.1986546814441681, "learning_rate": 1.7299166369967812e-05, "loss": 0.4657, "num_input_tokens_seen": 69679296, "step": 57440 }, { "epoch": 6.397705757879496, "grad_norm": 0.10564986616373062, "learning_rate": 1.729454332916437e-05, "loss": 0.4623, "num_input_tokens_seen": 69685536, "step": 57445 }, { "epoch": 6.398262612763114, "grad_norm": 0.10532169044017792, "learning_rate": 1.7289920579497777e-05, "loss": 0.4583, "num_input_tokens_seen": 69691456, "step": 57450 }, { "epoch": 6.398819467646732, "grad_norm": 0.09954112023115158, "learning_rate": 1.7285298121142686e-05, "loss": 0.4566, "num_input_tokens_seen": 69697536, "step": 57455 }, { "epoch": 6.399376322530348, "grad_norm": 0.08692216128110886, "learning_rate": 1.728067595427376e-05, "loss": 0.4589, "num_input_tokens_seen": 69703616, "step": 57460 }, { "epoch": 6.399933177413966, "grad_norm": 0.10544721782207489, "learning_rate": 1.7276054079065633e-05, "loss": 0.4679, "num_input_tokens_seen": 69709888, "step": 57465 }, { "epoch": 6.400490032297583, "grad_norm": 0.09565623849630356, "learning_rate": 1.7271432495692937e-05, "loss": 0.4624, "num_input_tokens_seen": 69715808, "step": 57470 }, { "epoch": 6.4010468871812005, "grad_norm": 0.11108911782503128, "learning_rate": 1.7266811204330292e-05, "loss": 0.4541, "num_input_tokens_seen": 69721984, "step": 57475 }, { "epoch": 6.401603742064818, "grad_norm": 0.10952506959438324, "learning_rate": 1.7262190205152296e-05, "loss": 0.4596, "num_input_tokens_seen": 69727936, "step": 57480 }, { "epoch": 6.402160596948435, "grad_norm": 0.12368954718112946, "learning_rate": 1.7257569498333553e-05, "loss": 0.4567, "num_input_tokens_seen": 69733952, "step": 57485 }, { "epoch": 6.402717451832053, "grad_norm": 0.10088157653808594, "learning_rate": 1.7252949084048643e-05, "loss": 0.468, "num_input_tokens_seen": 69740224, "step": 57490 }, { "epoch": 6.403274306715669, "grad_norm": 0.09043484926223755, "learning_rate": 1.724832896247215e-05, "loss": 0.4606, "num_input_tokens_seen": 69746400, "step": 57495 }, { "epoch": 6.403831161599287, "grad_norm": 0.10566292703151703, "learning_rate": 1.7243709133778625e-05, "loss": 0.475, "num_input_tokens_seen": 69752512, "step": 57500 }, { "epoch": 6.404388016482905, "grad_norm": 0.10598086565732956, "learning_rate": 1.7239089598142628e-05, "loss": 0.4606, "num_input_tokens_seen": 69758784, "step": 57505 }, { "epoch": 6.404944871366522, "grad_norm": 0.10674809664487839, "learning_rate": 1.7234470355738697e-05, "loss": 0.4682, "num_input_tokens_seen": 69765120, "step": 57510 }, { "epoch": 6.405501726250139, "grad_norm": 0.09477289766073227, "learning_rate": 1.722985140674136e-05, "loss": 0.4641, "num_input_tokens_seen": 69771424, "step": 57515 }, { "epoch": 6.406058581133757, "grad_norm": 0.10504051297903061, "learning_rate": 1.7225232751325136e-05, "loss": 0.4698, "num_input_tokens_seen": 69775776, "step": 57520 }, { "epoch": 6.406615436017374, "grad_norm": 0.1985616385936737, "learning_rate": 1.7220614389664535e-05, "loss": 0.4573, "num_input_tokens_seen": 69782176, "step": 57525 }, { "epoch": 6.407172290900991, "grad_norm": 0.1005956381559372, "learning_rate": 1.7215996321934057e-05, "loss": 0.4566, "num_input_tokens_seen": 69788640, "step": 57530 }, { "epoch": 6.407729145784608, "grad_norm": 0.09951312839984894, "learning_rate": 1.721137854830818e-05, "loss": 0.4659, "num_input_tokens_seen": 69794272, "step": 57535 }, { "epoch": 6.408286000668226, "grad_norm": 0.12952399253845215, "learning_rate": 1.7206761068961383e-05, "loss": 0.4574, "num_input_tokens_seen": 69800416, "step": 57540 }, { "epoch": 6.4088428555518435, "grad_norm": 0.13637220859527588, "learning_rate": 1.7202143884068125e-05, "loss": 0.4446, "num_input_tokens_seen": 69806176, "step": 57545 }, { "epoch": 6.40939971043546, "grad_norm": 0.09648891538381577, "learning_rate": 1.7197526993802866e-05, "loss": 0.4722, "num_input_tokens_seen": 69812320, "step": 57550 }, { "epoch": 6.409956565319078, "grad_norm": 0.11265519261360168, "learning_rate": 1.719291039834003e-05, "loss": 0.4615, "num_input_tokens_seen": 69818272, "step": 57555 }, { "epoch": 6.410513420202695, "grad_norm": 0.1306714117527008, "learning_rate": 1.7188294097854067e-05, "loss": 0.4741, "num_input_tokens_seen": 69824576, "step": 57560 }, { "epoch": 6.411070275086312, "grad_norm": 0.0725114569067955, "learning_rate": 1.7183678092519385e-05, "loss": 0.4707, "num_input_tokens_seen": 69830752, "step": 57565 }, { "epoch": 6.41162712996993, "grad_norm": 0.12943552434444427, "learning_rate": 1.717906238251039e-05, "loss": 0.4617, "num_input_tokens_seen": 69836800, "step": 57570 }, { "epoch": 6.412183984853547, "grad_norm": 0.12518037855625153, "learning_rate": 1.717444696800149e-05, "loss": 0.4522, "num_input_tokens_seen": 69843456, "step": 57575 }, { "epoch": 6.412740839737165, "grad_norm": 0.1051369458436966, "learning_rate": 1.7169831849167058e-05, "loss": 0.4539, "num_input_tokens_seen": 69849696, "step": 57580 }, { "epoch": 6.413297694620782, "grad_norm": 0.11734820157289505, "learning_rate": 1.7165217026181463e-05, "loss": 0.4603, "num_input_tokens_seen": 69856128, "step": 57585 }, { "epoch": 6.413854549504399, "grad_norm": 0.09066414833068848, "learning_rate": 1.7160602499219086e-05, "loss": 0.4691, "num_input_tokens_seen": 69862112, "step": 57590 }, { "epoch": 6.414411404388017, "grad_norm": 0.08733861893415451, "learning_rate": 1.7155988268454265e-05, "loss": 0.454, "num_input_tokens_seen": 69868064, "step": 57595 }, { "epoch": 6.4149682592716335, "grad_norm": 0.15506617724895477, "learning_rate": 1.715137433406135e-05, "loss": 0.4689, "num_input_tokens_seen": 69873184, "step": 57600 }, { "epoch": 6.415525114155251, "grad_norm": 0.14782465994358063, "learning_rate": 1.7146760696214662e-05, "loss": 0.4512, "num_input_tokens_seen": 69879392, "step": 57605 }, { "epoch": 6.416081969038869, "grad_norm": 0.07151418924331665, "learning_rate": 1.7142147355088522e-05, "loss": 0.46, "num_input_tokens_seen": 69885600, "step": 57610 }, { "epoch": 6.416638823922486, "grad_norm": 0.11510755121707916, "learning_rate": 1.7137534310857232e-05, "loss": 0.465, "num_input_tokens_seen": 69891744, "step": 57615 }, { "epoch": 6.417195678806103, "grad_norm": 0.1423545926809311, "learning_rate": 1.7132921563695094e-05, "loss": 0.4464, "num_input_tokens_seen": 69897280, "step": 57620 }, { "epoch": 6.41775253368972, "grad_norm": 0.09684333205223083, "learning_rate": 1.7128309113776394e-05, "loss": 0.4665, "num_input_tokens_seen": 69902592, "step": 57625 }, { "epoch": 6.418309388573338, "grad_norm": 0.10493044555187225, "learning_rate": 1.7123696961275404e-05, "loss": 0.4478, "num_input_tokens_seen": 69908704, "step": 57630 }, { "epoch": 6.4188662434569554, "grad_norm": 0.09762775897979736, "learning_rate": 1.7119085106366378e-05, "loss": 0.4679, "num_input_tokens_seen": 69915040, "step": 57635 }, { "epoch": 6.419423098340572, "grad_norm": 0.11435110867023468, "learning_rate": 1.7114473549223576e-05, "loss": 0.4623, "num_input_tokens_seen": 69920864, "step": 57640 }, { "epoch": 6.41997995322419, "grad_norm": 0.12640686333179474, "learning_rate": 1.7109862290021235e-05, "loss": 0.4571, "num_input_tokens_seen": 69927200, "step": 57645 }, { "epoch": 6.420536808107807, "grad_norm": 0.09017965197563171, "learning_rate": 1.7105251328933576e-05, "loss": 0.4576, "num_input_tokens_seen": 69933568, "step": 57650 }, { "epoch": 6.421093662991424, "grad_norm": 0.12976683676242828, "learning_rate": 1.710064066613482e-05, "loss": 0.4373, "num_input_tokens_seen": 69939648, "step": 57655 }, { "epoch": 6.421650517875042, "grad_norm": 0.09385383129119873, "learning_rate": 1.7096030301799183e-05, "loss": 0.4696, "num_input_tokens_seen": 69945728, "step": 57660 }, { "epoch": 6.422207372758659, "grad_norm": 0.11464949697256088, "learning_rate": 1.7091420236100847e-05, "loss": 0.4613, "num_input_tokens_seen": 69951904, "step": 57665 }, { "epoch": 6.4227642276422765, "grad_norm": 0.10535024851560593, "learning_rate": 1.7086810469213997e-05, "loss": 0.4759, "num_input_tokens_seen": 69957760, "step": 57670 }, { "epoch": 6.423321082525893, "grad_norm": 0.18903711438179016, "learning_rate": 1.7082201001312813e-05, "loss": 0.4793, "num_input_tokens_seen": 69964128, "step": 57675 }, { "epoch": 6.423877937409511, "grad_norm": 0.0939013734459877, "learning_rate": 1.7077591832571438e-05, "loss": 0.4778, "num_input_tokens_seen": 69970016, "step": 57680 }, { "epoch": 6.424434792293129, "grad_norm": 0.08317320048809052, "learning_rate": 1.707298296316404e-05, "loss": 0.4615, "num_input_tokens_seen": 69976064, "step": 57685 }, { "epoch": 6.424991647176745, "grad_norm": 0.1414867341518402, "learning_rate": 1.706837439326476e-05, "loss": 0.4739, "num_input_tokens_seen": 69982272, "step": 57690 }, { "epoch": 6.425548502060363, "grad_norm": 0.12511859834194183, "learning_rate": 1.7063766123047698e-05, "loss": 0.4567, "num_input_tokens_seen": 69987936, "step": 57695 }, { "epoch": 6.426105356943981, "grad_norm": 0.0872928574681282, "learning_rate": 1.7059158152686985e-05, "loss": 0.473, "num_input_tokens_seen": 69993728, "step": 57700 }, { "epoch": 6.426662211827598, "grad_norm": 0.08766636997461319, "learning_rate": 1.705455048235673e-05, "loss": 0.4596, "num_input_tokens_seen": 69999872, "step": 57705 }, { "epoch": 6.427219066711215, "grad_norm": 0.09670226275920868, "learning_rate": 1.7049943112231014e-05, "loss": 0.4688, "num_input_tokens_seen": 70005856, "step": 57710 }, { "epoch": 6.427775921594832, "grad_norm": 0.08388238400220871, "learning_rate": 1.704533604248393e-05, "loss": 0.4707, "num_input_tokens_seen": 70012064, "step": 57715 }, { "epoch": 6.42833277647845, "grad_norm": 0.0956432968378067, "learning_rate": 1.704072927328954e-05, "loss": 0.4604, "num_input_tokens_seen": 70017952, "step": 57720 }, { "epoch": 6.428889631362067, "grad_norm": 0.12614695727825165, "learning_rate": 1.703612280482191e-05, "loss": 0.4589, "num_input_tokens_seen": 70024352, "step": 57725 }, { "epoch": 6.429446486245684, "grad_norm": 0.0943526104092598, "learning_rate": 1.7031516637255072e-05, "loss": 0.4647, "num_input_tokens_seen": 70030528, "step": 57730 }, { "epoch": 6.430003341129302, "grad_norm": 0.1141599789261818, "learning_rate": 1.7026910770763077e-05, "loss": 0.4715, "num_input_tokens_seen": 70036448, "step": 57735 }, { "epoch": 6.4305601960129195, "grad_norm": 0.12739010155200958, "learning_rate": 1.7022305205519946e-05, "loss": 0.4434, "num_input_tokens_seen": 70042208, "step": 57740 }, { "epoch": 6.431117050896536, "grad_norm": 0.1157025620341301, "learning_rate": 1.701769994169969e-05, "loss": 0.4754, "num_input_tokens_seen": 70048416, "step": 57745 }, { "epoch": 6.431673905780154, "grad_norm": 0.11137843877077103, "learning_rate": 1.7013094979476308e-05, "loss": 0.4587, "num_input_tokens_seen": 70054656, "step": 57750 }, { "epoch": 6.432230760663771, "grad_norm": 0.10750896483659744, "learning_rate": 1.7008490319023793e-05, "loss": 0.4712, "num_input_tokens_seen": 70060704, "step": 57755 }, { "epoch": 6.432787615547388, "grad_norm": 0.08643290400505066, "learning_rate": 1.7003885960516125e-05, "loss": 0.4653, "num_input_tokens_seen": 70067136, "step": 57760 }, { "epoch": 6.433344470431006, "grad_norm": 0.06732777506113052, "learning_rate": 1.6999281904127264e-05, "loss": 0.4744, "num_input_tokens_seen": 70073216, "step": 57765 }, { "epoch": 6.433901325314623, "grad_norm": 0.09086228162050247, "learning_rate": 1.699467815003118e-05, "loss": 0.4731, "num_input_tokens_seen": 70079072, "step": 57770 }, { "epoch": 6.434458180198241, "grad_norm": 0.09958858042955399, "learning_rate": 1.699007469840181e-05, "loss": 0.4605, "num_input_tokens_seen": 70085024, "step": 57775 }, { "epoch": 6.435015035081857, "grad_norm": 0.0896976888179779, "learning_rate": 1.6985471549413084e-05, "loss": 0.4552, "num_input_tokens_seen": 70091200, "step": 57780 }, { "epoch": 6.435571889965475, "grad_norm": 0.11100978404283524, "learning_rate": 1.6980868703238928e-05, "loss": 0.4703, "num_input_tokens_seen": 70097120, "step": 57785 }, { "epoch": 6.436128744849093, "grad_norm": 0.09184703230857849, "learning_rate": 1.697626616005325e-05, "loss": 0.4513, "num_input_tokens_seen": 70103296, "step": 57790 }, { "epoch": 6.4366855997327095, "grad_norm": 0.1126280128955841, "learning_rate": 1.6971663920029946e-05, "loss": 0.4672, "num_input_tokens_seen": 70109536, "step": 57795 }, { "epoch": 6.437242454616327, "grad_norm": 0.10504809767007828, "learning_rate": 1.6967061983342912e-05, "loss": 0.4692, "num_input_tokens_seen": 70115808, "step": 57800 }, { "epoch": 6.437799309499944, "grad_norm": 0.08961288630962372, "learning_rate": 1.6962460350166022e-05, "loss": 0.4653, "num_input_tokens_seen": 70122016, "step": 57805 }, { "epoch": 6.438356164383562, "grad_norm": 0.12802070379257202, "learning_rate": 1.6957859020673134e-05, "loss": 0.4754, "num_input_tokens_seen": 70128064, "step": 57810 }, { "epoch": 6.438913019267179, "grad_norm": 0.14223186671733856, "learning_rate": 1.6953257995038102e-05, "loss": 0.453, "num_input_tokens_seen": 70134144, "step": 57815 }, { "epoch": 6.439469874150796, "grad_norm": 0.11639339476823807, "learning_rate": 1.6948657273434776e-05, "loss": 0.4562, "num_input_tokens_seen": 70140288, "step": 57820 }, { "epoch": 6.440026729034414, "grad_norm": 0.09482844918966293, "learning_rate": 1.6944056856036978e-05, "loss": 0.4694, "num_input_tokens_seen": 70146432, "step": 57825 }, { "epoch": 6.4405835839180305, "grad_norm": 0.09890057891607285, "learning_rate": 1.693945674301852e-05, "loss": 0.4652, "num_input_tokens_seen": 70152512, "step": 57830 }, { "epoch": 6.441140438801648, "grad_norm": 0.0986965224146843, "learning_rate": 1.6934856934553225e-05, "loss": 0.4535, "num_input_tokens_seen": 70158528, "step": 57835 }, { "epoch": 6.441697293685266, "grad_norm": 0.08124221116304398, "learning_rate": 1.6930257430814884e-05, "loss": 0.4597, "num_input_tokens_seen": 70164736, "step": 57840 }, { "epoch": 6.442254148568883, "grad_norm": 0.06973480433225632, "learning_rate": 1.6925658231977274e-05, "loss": 0.4581, "num_input_tokens_seen": 70170208, "step": 57845 }, { "epoch": 6.4428110034525, "grad_norm": 0.1147170141339302, "learning_rate": 1.6921059338214175e-05, "loss": 0.4524, "num_input_tokens_seen": 70175936, "step": 57850 }, { "epoch": 6.443367858336117, "grad_norm": 0.10112729668617249, "learning_rate": 1.6916460749699338e-05, "loss": 0.4688, "num_input_tokens_seen": 70182016, "step": 57855 }, { "epoch": 6.443924713219735, "grad_norm": 0.09016422182321548, "learning_rate": 1.691186246660652e-05, "loss": 0.4703, "num_input_tokens_seen": 70188160, "step": 57860 }, { "epoch": 6.4444815681033525, "grad_norm": 0.10475821048021317, "learning_rate": 1.690726448910946e-05, "loss": 0.4568, "num_input_tokens_seen": 70194528, "step": 57865 }, { "epoch": 6.445038422986969, "grad_norm": 0.10411808639764786, "learning_rate": 1.690266681738188e-05, "loss": 0.459, "num_input_tokens_seen": 70200736, "step": 57870 }, { "epoch": 6.445595277870587, "grad_norm": 0.09183576703071594, "learning_rate": 1.6898069451597505e-05, "loss": 0.4567, "num_input_tokens_seen": 70207072, "step": 57875 }, { "epoch": 6.446152132754205, "grad_norm": 0.12763626873493195, "learning_rate": 1.6893472391930022e-05, "loss": 0.4736, "num_input_tokens_seen": 70212928, "step": 57880 }, { "epoch": 6.446708987637821, "grad_norm": 0.09739173203706741, "learning_rate": 1.688887563855313e-05, "loss": 0.4678, "num_input_tokens_seen": 70219360, "step": 57885 }, { "epoch": 6.447265842521439, "grad_norm": 0.09629242867231369, "learning_rate": 1.6884279191640513e-05, "loss": 0.4658, "num_input_tokens_seen": 70225472, "step": 57890 }, { "epoch": 6.447822697405056, "grad_norm": 0.10905380547046661, "learning_rate": 1.6879683051365837e-05, "loss": 0.4597, "num_input_tokens_seen": 70231392, "step": 57895 }, { "epoch": 6.4483795522886735, "grad_norm": 0.1219935268163681, "learning_rate": 1.6875087217902754e-05, "loss": 0.47, "num_input_tokens_seen": 70236544, "step": 57900 }, { "epoch": 6.448936407172291, "grad_norm": 0.10957623273134232, "learning_rate": 1.6870491691424917e-05, "loss": 0.4651, "num_input_tokens_seen": 70242912, "step": 57905 }, { "epoch": 6.449493262055908, "grad_norm": 0.09604834765195847, "learning_rate": 1.686589647210596e-05, "loss": 0.4569, "num_input_tokens_seen": 70248992, "step": 57910 }, { "epoch": 6.450050116939526, "grad_norm": 0.10597768425941467, "learning_rate": 1.6861301560119493e-05, "loss": 0.4483, "num_input_tokens_seen": 70254464, "step": 57915 }, { "epoch": 6.450606971823143, "grad_norm": 0.08075879514217377, "learning_rate": 1.685670695563914e-05, "loss": 0.4734, "num_input_tokens_seen": 70260608, "step": 57920 }, { "epoch": 6.45116382670676, "grad_norm": 0.127150297164917, "learning_rate": 1.6852112658838486e-05, "loss": 0.4667, "num_input_tokens_seen": 70267008, "step": 57925 }, { "epoch": 6.451720681590378, "grad_norm": 0.10472840815782547, "learning_rate": 1.6847518669891138e-05, "loss": 0.4692, "num_input_tokens_seen": 70272800, "step": 57930 }, { "epoch": 6.452277536473995, "grad_norm": 0.11620796471834183, "learning_rate": 1.6842924988970658e-05, "loss": 0.4574, "num_input_tokens_seen": 70279040, "step": 57935 }, { "epoch": 6.452834391357612, "grad_norm": 0.07241648435592651, "learning_rate": 1.6838331616250614e-05, "loss": 0.458, "num_input_tokens_seen": 70284992, "step": 57940 }, { "epoch": 6.45339124624123, "grad_norm": 0.14762909710407257, "learning_rate": 1.6833738551904552e-05, "loss": 0.4671, "num_input_tokens_seen": 70290528, "step": 57945 }, { "epoch": 6.453948101124847, "grad_norm": 0.08256250619888306, "learning_rate": 1.682914579610602e-05, "loss": 0.4597, "num_input_tokens_seen": 70296544, "step": 57950 }, { "epoch": 6.454504956008464, "grad_norm": 0.11955800652503967, "learning_rate": 1.682455334902854e-05, "loss": 0.4507, "num_input_tokens_seen": 70302272, "step": 57955 }, { "epoch": 6.455061810892081, "grad_norm": 0.11080732196569443, "learning_rate": 1.681996121084564e-05, "loss": 0.4517, "num_input_tokens_seen": 70308672, "step": 57960 }, { "epoch": 6.455618665775699, "grad_norm": 0.12006337940692902, "learning_rate": 1.6815369381730817e-05, "loss": 0.4612, "num_input_tokens_seen": 70314752, "step": 57965 }, { "epoch": 6.4561755206593165, "grad_norm": 0.11254396289587021, "learning_rate": 1.6810777861857568e-05, "loss": 0.468, "num_input_tokens_seen": 70320736, "step": 57970 }, { "epoch": 6.456732375542933, "grad_norm": 0.0814448669552803, "learning_rate": 1.6806186651399382e-05, "loss": 0.4603, "num_input_tokens_seen": 70326656, "step": 57975 }, { "epoch": 6.457289230426551, "grad_norm": 0.08994312584400177, "learning_rate": 1.6801595750529713e-05, "loss": 0.4689, "num_input_tokens_seen": 70332480, "step": 57980 }, { "epoch": 6.457846085310168, "grad_norm": 0.10392073541879654, "learning_rate": 1.6797005159422035e-05, "loss": 0.4821, "num_input_tokens_seen": 70338528, "step": 57985 }, { "epoch": 6.4584029401937855, "grad_norm": 0.13623712956905365, "learning_rate": 1.6792414878249784e-05, "loss": 0.4735, "num_input_tokens_seen": 70343744, "step": 57990 }, { "epoch": 6.458959795077403, "grad_norm": 0.08234873414039612, "learning_rate": 1.6787824907186406e-05, "loss": 0.4547, "num_input_tokens_seen": 70349856, "step": 57995 }, { "epoch": 6.45951664996102, "grad_norm": 0.08864172548055649, "learning_rate": 1.678323524640532e-05, "loss": 0.4571, "num_input_tokens_seen": 70355840, "step": 58000 }, { "epoch": 6.460073504844638, "grad_norm": 0.09804610908031464, "learning_rate": 1.6778645896079938e-05, "loss": 0.4616, "num_input_tokens_seen": 70362080, "step": 58005 }, { "epoch": 6.460630359728254, "grad_norm": 0.10313820093870163, "learning_rate": 1.677405685638366e-05, "loss": 0.4593, "num_input_tokens_seen": 70368352, "step": 58010 }, { "epoch": 6.461187214611872, "grad_norm": 0.1302800476551056, "learning_rate": 1.676946812748988e-05, "loss": 0.4567, "num_input_tokens_seen": 70374240, "step": 58015 }, { "epoch": 6.46174406949549, "grad_norm": 0.10746854543685913, "learning_rate": 1.6764879709571957e-05, "loss": 0.4641, "num_input_tokens_seen": 70379936, "step": 58020 }, { "epoch": 6.4623009243791065, "grad_norm": 0.12230434268712997, "learning_rate": 1.676029160280328e-05, "loss": 0.4585, "num_input_tokens_seen": 70386336, "step": 58025 }, { "epoch": 6.462857779262724, "grad_norm": 0.10141341388225555, "learning_rate": 1.6755703807357193e-05, "loss": 0.4585, "num_input_tokens_seen": 70392416, "step": 58030 }, { "epoch": 6.463414634146342, "grad_norm": 0.12240365147590637, "learning_rate": 1.6751116323407032e-05, "loss": 0.4461, "num_input_tokens_seen": 70397600, "step": 58035 }, { "epoch": 6.463971489029959, "grad_norm": 0.1261255145072937, "learning_rate": 1.6746529151126133e-05, "loss": 0.4646, "num_input_tokens_seen": 70403840, "step": 58040 }, { "epoch": 6.464528343913576, "grad_norm": 0.1250370740890503, "learning_rate": 1.674194229068781e-05, "loss": 0.46, "num_input_tokens_seen": 70410176, "step": 58045 }, { "epoch": 6.465085198797193, "grad_norm": 0.09624943137168884, "learning_rate": 1.6737355742265377e-05, "loss": 0.471, "num_input_tokens_seen": 70416416, "step": 58050 }, { "epoch": 6.465642053680811, "grad_norm": 0.1155964806675911, "learning_rate": 1.6732769506032115e-05, "loss": 0.4695, "num_input_tokens_seen": 70422624, "step": 58055 }, { "epoch": 6.4661989085644285, "grad_norm": 0.08785911649465561, "learning_rate": 1.6728183582161317e-05, "loss": 0.4639, "num_input_tokens_seen": 70428352, "step": 58060 }, { "epoch": 6.466755763448045, "grad_norm": 0.11579376459121704, "learning_rate": 1.6723597970826256e-05, "loss": 0.471, "num_input_tokens_seen": 70434336, "step": 58065 }, { "epoch": 6.467312618331663, "grad_norm": 0.099189892411232, "learning_rate": 1.6719012672200187e-05, "loss": 0.4612, "num_input_tokens_seen": 70440256, "step": 58070 }, { "epoch": 6.46786947321528, "grad_norm": 0.10794238746166229, "learning_rate": 1.6714427686456352e-05, "loss": 0.4749, "num_input_tokens_seen": 70446016, "step": 58075 }, { "epoch": 6.468426328098897, "grad_norm": 0.09422750771045685, "learning_rate": 1.6709843013767993e-05, "loss": 0.4651, "num_input_tokens_seen": 70452000, "step": 58080 }, { "epoch": 6.468983182982515, "grad_norm": 0.09911191463470459, "learning_rate": 1.6705258654308333e-05, "loss": 0.4627, "num_input_tokens_seen": 70458112, "step": 58085 }, { "epoch": 6.469540037866132, "grad_norm": 0.10354182124137878, "learning_rate": 1.670067460825058e-05, "loss": 0.4739, "num_input_tokens_seen": 70464384, "step": 58090 }, { "epoch": 6.4700968927497495, "grad_norm": 0.10972842574119568, "learning_rate": 1.669609087576794e-05, "loss": 0.4553, "num_input_tokens_seen": 70469952, "step": 58095 }, { "epoch": 6.470653747633367, "grad_norm": 0.09671080857515335, "learning_rate": 1.6691507457033605e-05, "loss": 0.459, "num_input_tokens_seen": 70476224, "step": 58100 }, { "epoch": 6.471210602516984, "grad_norm": 0.07865582406520844, "learning_rate": 1.6686924352220735e-05, "loss": 0.4714, "num_input_tokens_seen": 70482240, "step": 58105 }, { "epoch": 6.471767457400602, "grad_norm": 0.11834008246660233, "learning_rate": 1.6682341561502505e-05, "loss": 0.4624, "num_input_tokens_seen": 70488544, "step": 58110 }, { "epoch": 6.4723243122842185, "grad_norm": 0.09838404506444931, "learning_rate": 1.6677759085052065e-05, "loss": 0.464, "num_input_tokens_seen": 70494592, "step": 58115 }, { "epoch": 6.472881167167836, "grad_norm": 0.13485197722911835, "learning_rate": 1.667317692304256e-05, "loss": 0.4569, "num_input_tokens_seen": 70500992, "step": 58120 }, { "epoch": 6.473438022051454, "grad_norm": 0.08635533601045609, "learning_rate": 1.6668595075647115e-05, "loss": 0.4574, "num_input_tokens_seen": 70506720, "step": 58125 }, { "epoch": 6.473994876935071, "grad_norm": 0.11104028671979904, "learning_rate": 1.6664013543038844e-05, "loss": 0.4557, "num_input_tokens_seen": 70512800, "step": 58130 }, { "epoch": 6.474551731818688, "grad_norm": 0.10413618385791779, "learning_rate": 1.665943232539085e-05, "loss": 0.4692, "num_input_tokens_seen": 70518784, "step": 58135 }, { "epoch": 6.475108586702305, "grad_norm": 0.0926784798502922, "learning_rate": 1.665485142287624e-05, "loss": 0.4657, "num_input_tokens_seen": 70524704, "step": 58140 }, { "epoch": 6.475665441585923, "grad_norm": 0.11242537945508957, "learning_rate": 1.6650270835668086e-05, "loss": 0.4617, "num_input_tokens_seen": 70531136, "step": 58145 }, { "epoch": 6.47622229646954, "grad_norm": 0.09998159110546112, "learning_rate": 1.6645690563939457e-05, "loss": 0.4616, "num_input_tokens_seen": 70537376, "step": 58150 }, { "epoch": 6.476779151353157, "grad_norm": 0.12077644467353821, "learning_rate": 1.6641110607863413e-05, "loss": 0.4672, "num_input_tokens_seen": 70543840, "step": 58155 }, { "epoch": 6.477336006236775, "grad_norm": 0.12061911076307297, "learning_rate": 1.6636530967612994e-05, "loss": 0.46, "num_input_tokens_seen": 70549920, "step": 58160 }, { "epoch": 6.477892861120392, "grad_norm": 0.09329622238874435, "learning_rate": 1.663195164336124e-05, "loss": 0.4528, "num_input_tokens_seen": 70555968, "step": 58165 }, { "epoch": 6.478449716004009, "grad_norm": 0.10858651250600815, "learning_rate": 1.6627372635281164e-05, "loss": 0.4504, "num_input_tokens_seen": 70561504, "step": 58170 }, { "epoch": 6.479006570887627, "grad_norm": 0.12867236137390137, "learning_rate": 1.6622793943545782e-05, "loss": 0.4618, "num_input_tokens_seen": 70567520, "step": 58175 }, { "epoch": 6.479563425771244, "grad_norm": 0.10550080239772797, "learning_rate": 1.66182155683281e-05, "loss": 0.4706, "num_input_tokens_seen": 70573504, "step": 58180 }, { "epoch": 6.4801202806548615, "grad_norm": 0.11051864176988602, "learning_rate": 1.6613637509801088e-05, "loss": 0.4651, "num_input_tokens_seen": 70579648, "step": 58185 }, { "epoch": 6.480677135538478, "grad_norm": 0.08099876344203949, "learning_rate": 1.6609059768137728e-05, "loss": 0.4628, "num_input_tokens_seen": 70586048, "step": 58190 }, { "epoch": 6.481233990422096, "grad_norm": 0.09482141584157944, "learning_rate": 1.660448234351098e-05, "loss": 0.4534, "num_input_tokens_seen": 70592416, "step": 58195 }, { "epoch": 6.481790845305714, "grad_norm": 0.09819495677947998, "learning_rate": 1.659990523609379e-05, "loss": 0.4576, "num_input_tokens_seen": 70598688, "step": 58200 }, { "epoch": 6.48234770018933, "grad_norm": 0.09865835309028625, "learning_rate": 1.65953284460591e-05, "loss": 0.4653, "num_input_tokens_seen": 70604480, "step": 58205 }, { "epoch": 6.482904555072948, "grad_norm": 0.07582031935453415, "learning_rate": 1.659075197357984e-05, "loss": 0.4606, "num_input_tokens_seen": 70610656, "step": 58210 }, { "epoch": 6.483461409956566, "grad_norm": 0.10675104707479477, "learning_rate": 1.6586175818828917e-05, "loss": 0.454, "num_input_tokens_seen": 70616864, "step": 58215 }, { "epoch": 6.4840182648401825, "grad_norm": 0.09696515649557114, "learning_rate": 1.6581599981979236e-05, "loss": 0.4578, "num_input_tokens_seen": 70622784, "step": 58220 }, { "epoch": 6.4845751197238, "grad_norm": 0.1434658169746399, "learning_rate": 1.6577024463203687e-05, "loss": 0.4657, "num_input_tokens_seen": 70628864, "step": 58225 }, { "epoch": 6.485131974607417, "grad_norm": 0.13753411173820496, "learning_rate": 1.6572449262675137e-05, "loss": 0.4521, "num_input_tokens_seen": 70634880, "step": 58230 }, { "epoch": 6.485688829491035, "grad_norm": 0.0992186889052391, "learning_rate": 1.656787438056647e-05, "loss": 0.4679, "num_input_tokens_seen": 70640800, "step": 58235 }, { "epoch": 6.486245684374652, "grad_norm": 0.08504028618335724, "learning_rate": 1.656329981705053e-05, "loss": 0.4645, "num_input_tokens_seen": 70646912, "step": 58240 }, { "epoch": 6.486802539258269, "grad_norm": 0.14385800063610077, "learning_rate": 1.655872557230016e-05, "loss": 0.4536, "num_input_tokens_seen": 70652736, "step": 58245 }, { "epoch": 6.487359394141887, "grad_norm": 0.12552550435066223, "learning_rate": 1.655415164648819e-05, "loss": 0.4602, "num_input_tokens_seen": 70658848, "step": 58250 }, { "epoch": 6.487916249025504, "grad_norm": 0.07384735345840454, "learning_rate": 1.6549578039787436e-05, "loss": 0.4615, "num_input_tokens_seen": 70664960, "step": 58255 }, { "epoch": 6.488473103909121, "grad_norm": 0.07567884027957916, "learning_rate": 1.6545004752370706e-05, "loss": 0.4725, "num_input_tokens_seen": 70670656, "step": 58260 }, { "epoch": 6.489029958792739, "grad_norm": 0.12096909433603287, "learning_rate": 1.6540431784410785e-05, "loss": 0.4517, "num_input_tokens_seen": 70676640, "step": 58265 }, { "epoch": 6.489586813676356, "grad_norm": 0.08334999531507492, "learning_rate": 1.6535859136080468e-05, "loss": 0.4616, "num_input_tokens_seen": 70682528, "step": 58270 }, { "epoch": 6.490143668559973, "grad_norm": 0.09721400588750839, "learning_rate": 1.653128680755251e-05, "loss": 0.4612, "num_input_tokens_seen": 70688480, "step": 58275 }, { "epoch": 6.490700523443591, "grad_norm": 0.08731258660554886, "learning_rate": 1.6526714798999685e-05, "loss": 0.4561, "num_input_tokens_seen": 70694848, "step": 58280 }, { "epoch": 6.491257378327208, "grad_norm": 0.10204862058162689, "learning_rate": 1.6522143110594723e-05, "loss": 0.4651, "num_input_tokens_seen": 70701184, "step": 58285 }, { "epoch": 6.4918142332108255, "grad_norm": 0.08889353275299072, "learning_rate": 1.651757174251037e-05, "loss": 0.4459, "num_input_tokens_seen": 70707488, "step": 58290 }, { "epoch": 6.492371088094442, "grad_norm": 0.09149689972400665, "learning_rate": 1.6513000694919328e-05, "loss": 0.4654, "num_input_tokens_seen": 70713600, "step": 58295 }, { "epoch": 6.49292794297806, "grad_norm": 0.09430660307407379, "learning_rate": 1.6508429967994326e-05, "loss": 0.4692, "num_input_tokens_seen": 70719520, "step": 58300 }, { "epoch": 6.493484797861678, "grad_norm": 0.08878903836011887, "learning_rate": 1.650385956190805e-05, "loss": 0.4592, "num_input_tokens_seen": 70725728, "step": 58305 }, { "epoch": 6.494041652745294, "grad_norm": 0.1733938604593277, "learning_rate": 1.649928947683319e-05, "loss": 0.4702, "num_input_tokens_seen": 70732032, "step": 58310 }, { "epoch": 6.494598507628912, "grad_norm": 0.11555019021034241, "learning_rate": 1.6494719712942412e-05, "loss": 0.4539, "num_input_tokens_seen": 70738048, "step": 58315 }, { "epoch": 6.495155362512529, "grad_norm": 0.07833481580018997, "learning_rate": 1.6490150270408384e-05, "loss": 0.4644, "num_input_tokens_seen": 70744384, "step": 58320 }, { "epoch": 6.495712217396147, "grad_norm": 0.08537697046995163, "learning_rate": 1.6485581149403745e-05, "loss": 0.4673, "num_input_tokens_seen": 70750496, "step": 58325 }, { "epoch": 6.496269072279764, "grad_norm": 0.09041153639554977, "learning_rate": 1.6481012350101133e-05, "loss": 0.4602, "num_input_tokens_seen": 70756992, "step": 58330 }, { "epoch": 6.496825927163381, "grad_norm": 0.08655229955911636, "learning_rate": 1.647644387267318e-05, "loss": 0.4558, "num_input_tokens_seen": 70763296, "step": 58335 }, { "epoch": 6.497382782046999, "grad_norm": 0.10103584080934525, "learning_rate": 1.6471875717292496e-05, "loss": 0.464, "num_input_tokens_seen": 70769408, "step": 58340 }, { "epoch": 6.4979396369306155, "grad_norm": 0.14443078637123108, "learning_rate": 1.6467307884131672e-05, "loss": 0.4639, "num_input_tokens_seen": 70775616, "step": 58345 }, { "epoch": 6.498496491814233, "grad_norm": 0.09239578992128372, "learning_rate": 1.6462740373363304e-05, "loss": 0.4623, "num_input_tokens_seen": 70781984, "step": 58350 }, { "epoch": 6.499053346697851, "grad_norm": 0.08844365179538727, "learning_rate": 1.645817318515996e-05, "loss": 0.4505, "num_input_tokens_seen": 70787968, "step": 58355 }, { "epoch": 6.499610201581468, "grad_norm": 0.12388064712285995, "learning_rate": 1.6453606319694203e-05, "loss": 0.4574, "num_input_tokens_seen": 70794144, "step": 58360 }, { "epoch": 6.500167056465085, "grad_norm": 0.0900658592581749, "learning_rate": 1.6449039777138595e-05, "loss": 0.4621, "num_input_tokens_seen": 70799936, "step": 58365 }, { "epoch": 6.500723911348702, "grad_norm": 0.12202727049589157, "learning_rate": 1.6444473557665663e-05, "loss": 0.4692, "num_input_tokens_seen": 70806016, "step": 58370 }, { "epoch": 6.500723911348702, "eval_loss": 0.4642300307750702, "eval_runtime": 113.1003, "eval_samples_per_second": 35.287, "eval_steps_per_second": 8.824, "num_input_tokens_seen": 70806016, "step": 58370 }, { "epoch": 6.50128076623232, "grad_norm": 0.11468210816383362, "learning_rate": 1.643990766144794e-05, "loss": 0.4578, "num_input_tokens_seen": 70812000, "step": 58375 }, { "epoch": 6.501837621115937, "grad_norm": 0.10561112314462662, "learning_rate": 1.643534208865794e-05, "loss": 0.466, "num_input_tokens_seen": 70817984, "step": 58380 }, { "epoch": 6.502394475999554, "grad_norm": 0.08680227398872375, "learning_rate": 1.643077683946816e-05, "loss": 0.4472, "num_input_tokens_seen": 70823936, "step": 58385 }, { "epoch": 6.502951330883172, "grad_norm": 0.13431592285633087, "learning_rate": 1.6426211914051094e-05, "loss": 0.4665, "num_input_tokens_seen": 70830112, "step": 58390 }, { "epoch": 6.503508185766789, "grad_norm": 0.09483778476715088, "learning_rate": 1.642164731257921e-05, "loss": 0.4557, "num_input_tokens_seen": 70836384, "step": 58395 }, { "epoch": 6.504065040650406, "grad_norm": 0.13485658168792725, "learning_rate": 1.641708303522499e-05, "loss": 0.4639, "num_input_tokens_seen": 70842560, "step": 58400 }, { "epoch": 6.504621895534024, "grad_norm": 0.11670438945293427, "learning_rate": 1.6412519082160872e-05, "loss": 0.4495, "num_input_tokens_seen": 70848608, "step": 58405 }, { "epoch": 6.505178750417641, "grad_norm": 0.11196031421422958, "learning_rate": 1.640795545355931e-05, "loss": 0.4487, "num_input_tokens_seen": 70854784, "step": 58410 }, { "epoch": 6.5057356053012585, "grad_norm": 0.11598328500986099, "learning_rate": 1.640339214959272e-05, "loss": 0.4604, "num_input_tokens_seen": 70860704, "step": 58415 }, { "epoch": 6.506292460184876, "grad_norm": 0.09486403316259384, "learning_rate": 1.6398829170433528e-05, "loss": 0.4487, "num_input_tokens_seen": 70867232, "step": 58420 }, { "epoch": 6.506849315068493, "grad_norm": 0.09547359496355057, "learning_rate": 1.6394266516254127e-05, "loss": 0.4595, "num_input_tokens_seen": 70872832, "step": 58425 }, { "epoch": 6.507406169952111, "grad_norm": 0.10807850956916809, "learning_rate": 1.6389704187226924e-05, "loss": 0.4603, "num_input_tokens_seen": 70878784, "step": 58430 }, { "epoch": 6.507963024835728, "grad_norm": 0.10599837452173233, "learning_rate": 1.6385142183524282e-05, "loss": 0.4622, "num_input_tokens_seen": 70884640, "step": 58435 }, { "epoch": 6.508519879719345, "grad_norm": 0.14265035092830658, "learning_rate": 1.638058050531858e-05, "loss": 0.4487, "num_input_tokens_seen": 70891040, "step": 58440 }, { "epoch": 6.509076734602963, "grad_norm": 0.10233225673437119, "learning_rate": 1.6376019152782174e-05, "loss": 0.4698, "num_input_tokens_seen": 70897152, "step": 58445 }, { "epoch": 6.5096335894865796, "grad_norm": 0.11802265793085098, "learning_rate": 1.6371458126087392e-05, "loss": 0.4662, "num_input_tokens_seen": 70903232, "step": 58450 }, { "epoch": 6.510190444370197, "grad_norm": 0.10666626691818237, "learning_rate": 1.6366897425406573e-05, "loss": 0.458, "num_input_tokens_seen": 70909376, "step": 58455 }, { "epoch": 6.510747299253815, "grad_norm": 0.07979801297187805, "learning_rate": 1.6362337050912042e-05, "loss": 0.4704, "num_input_tokens_seen": 70915424, "step": 58460 }, { "epoch": 6.511304154137432, "grad_norm": 0.14468400180339813, "learning_rate": 1.6357777002776098e-05, "loss": 0.4498, "num_input_tokens_seen": 70921536, "step": 58465 }, { "epoch": 6.511861009021049, "grad_norm": 0.10777093470096588, "learning_rate": 1.6353217281171033e-05, "loss": 0.4525, "num_input_tokens_seen": 70926944, "step": 58470 }, { "epoch": 6.512417863904666, "grad_norm": 0.10108289867639542, "learning_rate": 1.6348657886269127e-05, "loss": 0.4593, "num_input_tokens_seen": 70933184, "step": 58475 }, { "epoch": 6.512974718788284, "grad_norm": 0.12952794134616852, "learning_rate": 1.6344098818242653e-05, "loss": 0.4524, "num_input_tokens_seen": 70939328, "step": 58480 }, { "epoch": 6.5135315736719015, "grad_norm": 0.0939798355102539, "learning_rate": 1.6339540077263866e-05, "loss": 0.4533, "num_input_tokens_seen": 70945472, "step": 58485 }, { "epoch": 6.514088428555518, "grad_norm": 0.08856747299432755, "learning_rate": 1.6334981663505e-05, "loss": 0.4561, "num_input_tokens_seen": 70951424, "step": 58490 }, { "epoch": 6.514645283439136, "grad_norm": 0.08550319820642471, "learning_rate": 1.6330423577138304e-05, "loss": 0.4687, "num_input_tokens_seen": 70957248, "step": 58495 }, { "epoch": 6.515202138322753, "grad_norm": 0.167785182595253, "learning_rate": 1.6325865818335983e-05, "loss": 0.4635, "num_input_tokens_seen": 70962400, "step": 58500 }, { "epoch": 6.51575899320637, "grad_norm": 0.10440482199192047, "learning_rate": 1.6321308387270258e-05, "loss": 0.4662, "num_input_tokens_seen": 70968672, "step": 58505 }, { "epoch": 6.516315848089988, "grad_norm": 0.0891835168004036, "learning_rate": 1.6316751284113304e-05, "loss": 0.474, "num_input_tokens_seen": 70974560, "step": 58510 }, { "epoch": 6.516872702973605, "grad_norm": 0.12219332903623581, "learning_rate": 1.6312194509037328e-05, "loss": 0.4634, "num_input_tokens_seen": 70980608, "step": 58515 }, { "epoch": 6.5174295578572226, "grad_norm": 0.10122084617614746, "learning_rate": 1.630763806221447e-05, "loss": 0.4641, "num_input_tokens_seen": 70986272, "step": 58520 }, { "epoch": 6.517986412740839, "grad_norm": 0.1173596978187561, "learning_rate": 1.6303081943816908e-05, "loss": 0.4545, "num_input_tokens_seen": 70992352, "step": 58525 }, { "epoch": 6.518543267624457, "grad_norm": 0.10418059676885605, "learning_rate": 1.629852615401678e-05, "loss": 0.4545, "num_input_tokens_seen": 70998784, "step": 58530 }, { "epoch": 6.519100122508075, "grad_norm": 0.09961972385644913, "learning_rate": 1.629397069298622e-05, "loss": 0.4697, "num_input_tokens_seen": 71004992, "step": 58535 }, { "epoch": 6.5196569773916915, "grad_norm": 0.18770615756511688, "learning_rate": 1.628941556089734e-05, "loss": 0.4689, "num_input_tokens_seen": 71011584, "step": 58540 }, { "epoch": 6.520213832275309, "grad_norm": 0.10048358887434006, "learning_rate": 1.628486075792226e-05, "loss": 0.4726, "num_input_tokens_seen": 71017728, "step": 58545 }, { "epoch": 6.520770687158926, "grad_norm": 0.09834016859531403, "learning_rate": 1.628030628423307e-05, "loss": 0.4477, "num_input_tokens_seen": 71023808, "step": 58550 }, { "epoch": 6.521327542042544, "grad_norm": 0.10593882948160172, "learning_rate": 1.6275752140001854e-05, "loss": 0.4757, "num_input_tokens_seen": 71029792, "step": 58555 }, { "epoch": 6.521884396926161, "grad_norm": 0.0928039401769638, "learning_rate": 1.627119832540068e-05, "loss": 0.4657, "num_input_tokens_seen": 71036032, "step": 58560 }, { "epoch": 6.522441251809778, "grad_norm": 0.08628244698047638, "learning_rate": 1.6266644840601604e-05, "loss": 0.4676, "num_input_tokens_seen": 71042176, "step": 58565 }, { "epoch": 6.522998106693396, "grad_norm": 0.14009694755077362, "learning_rate": 1.626209168577667e-05, "loss": 0.4596, "num_input_tokens_seen": 71048640, "step": 58570 }, { "epoch": 6.5235549615770125, "grad_norm": 0.10180805623531342, "learning_rate": 1.6257538861097914e-05, "loss": 0.4553, "num_input_tokens_seen": 71054784, "step": 58575 }, { "epoch": 6.52411181646063, "grad_norm": 0.1219504103064537, "learning_rate": 1.625298636673736e-05, "loss": 0.4547, "num_input_tokens_seen": 71060544, "step": 58580 }, { "epoch": 6.524668671344248, "grad_norm": 0.09275051206350327, "learning_rate": 1.6248434202867015e-05, "loss": 0.4539, "num_input_tokens_seen": 71066624, "step": 58585 }, { "epoch": 6.525225526227865, "grad_norm": 0.10684680193662643, "learning_rate": 1.624388236965887e-05, "loss": 0.4459, "num_input_tokens_seen": 71072608, "step": 58590 }, { "epoch": 6.525782381111482, "grad_norm": 0.09107106178998947, "learning_rate": 1.623933086728491e-05, "loss": 0.4538, "num_input_tokens_seen": 71079136, "step": 58595 }, { "epoch": 6.5263392359951, "grad_norm": 0.12112846225500107, "learning_rate": 1.6234779695917104e-05, "loss": 0.4548, "num_input_tokens_seen": 71084768, "step": 58600 }, { "epoch": 6.526896090878717, "grad_norm": 0.0842665284872055, "learning_rate": 1.6230228855727404e-05, "loss": 0.4669, "num_input_tokens_seen": 71091104, "step": 58605 }, { "epoch": 6.5274529457623345, "grad_norm": 0.10254431515932083, "learning_rate": 1.6225678346887767e-05, "loss": 0.4593, "num_input_tokens_seen": 71097024, "step": 58610 }, { "epoch": 6.528009800645952, "grad_norm": 0.11491740494966507, "learning_rate": 1.6221128169570124e-05, "loss": 0.4576, "num_input_tokens_seen": 71102848, "step": 58615 }, { "epoch": 6.528566655529569, "grad_norm": 0.10508580505847931, "learning_rate": 1.6216578323946392e-05, "loss": 0.4508, "num_input_tokens_seen": 71108768, "step": 58620 }, { "epoch": 6.529123510413187, "grad_norm": 0.11541049927473068, "learning_rate": 1.6212028810188478e-05, "loss": 0.4521, "num_input_tokens_seen": 71115168, "step": 58625 }, { "epoch": 6.529680365296803, "grad_norm": 0.10553682595491409, "learning_rate": 1.6207479628468275e-05, "loss": 0.4635, "num_input_tokens_seen": 71121504, "step": 58630 }, { "epoch": 6.530237220180421, "grad_norm": 0.11254218965768814, "learning_rate": 1.6202930778957664e-05, "loss": 0.4725, "num_input_tokens_seen": 71127552, "step": 58635 }, { "epoch": 6.530794075064039, "grad_norm": 0.13379628956317902, "learning_rate": 1.6198382261828528e-05, "loss": 0.4588, "num_input_tokens_seen": 71133600, "step": 58640 }, { "epoch": 6.5313509299476555, "grad_norm": 0.13152673840522766, "learning_rate": 1.6193834077252712e-05, "loss": 0.452, "num_input_tokens_seen": 71139744, "step": 58645 }, { "epoch": 6.531907784831273, "grad_norm": 0.11687672138214111, "learning_rate": 1.618928622540207e-05, "loss": 0.4686, "num_input_tokens_seen": 71146240, "step": 58650 }, { "epoch": 6.53246463971489, "grad_norm": 0.12031800299882889, "learning_rate": 1.6184738706448428e-05, "loss": 0.4685, "num_input_tokens_seen": 71152160, "step": 58655 }, { "epoch": 6.533021494598508, "grad_norm": 0.09716298431158066, "learning_rate": 1.6180191520563607e-05, "loss": 0.4593, "num_input_tokens_seen": 71158464, "step": 58660 }, { "epoch": 6.533578349482125, "grad_norm": 0.09244242310523987, "learning_rate": 1.6175644667919416e-05, "loss": 0.4549, "num_input_tokens_seen": 71164384, "step": 58665 }, { "epoch": 6.534135204365742, "grad_norm": 0.10446012020111084, "learning_rate": 1.617109814868764e-05, "loss": 0.4574, "num_input_tokens_seen": 71170624, "step": 58670 }, { "epoch": 6.53469205924936, "grad_norm": 0.11422799527645111, "learning_rate": 1.6166551963040077e-05, "loss": 0.4692, "num_input_tokens_seen": 71176928, "step": 58675 }, { "epoch": 6.535248914132977, "grad_norm": 0.09445058554410934, "learning_rate": 1.616200611114849e-05, "loss": 0.4655, "num_input_tokens_seen": 71183104, "step": 58680 }, { "epoch": 6.535805769016594, "grad_norm": 0.0872335284948349, "learning_rate": 1.6157460593184633e-05, "loss": 0.4638, "num_input_tokens_seen": 71189280, "step": 58685 }, { "epoch": 6.536362623900212, "grad_norm": 0.08801460266113281, "learning_rate": 1.6152915409320252e-05, "loss": 0.4682, "num_input_tokens_seen": 71195488, "step": 58690 }, { "epoch": 6.536919478783829, "grad_norm": 0.09614048153162003, "learning_rate": 1.6148370559727084e-05, "loss": 0.4575, "num_input_tokens_seen": 71201440, "step": 58695 }, { "epoch": 6.537476333667446, "grad_norm": 0.10312478244304657, "learning_rate": 1.614382604457683e-05, "loss": 0.4506, "num_input_tokens_seen": 71207712, "step": 58700 }, { "epoch": 6.538033188551063, "grad_norm": 0.09650200605392456, "learning_rate": 1.6139281864041218e-05, "loss": 0.4592, "num_input_tokens_seen": 71213984, "step": 58705 }, { "epoch": 6.538590043434681, "grad_norm": 0.12202377617359161, "learning_rate": 1.6134738018291934e-05, "loss": 0.4545, "num_input_tokens_seen": 71219968, "step": 58710 }, { "epoch": 6.5391468983182985, "grad_norm": 0.07090624421834946, "learning_rate": 1.6130194507500655e-05, "loss": 0.4654, "num_input_tokens_seen": 71226208, "step": 58715 }, { "epoch": 6.539703753201915, "grad_norm": 0.11593565344810486, "learning_rate": 1.6125651331839055e-05, "loss": 0.4532, "num_input_tokens_seen": 71232064, "step": 58720 }, { "epoch": 6.540260608085533, "grad_norm": 0.14612258970737457, "learning_rate": 1.612110849147878e-05, "loss": 0.4531, "num_input_tokens_seen": 71238080, "step": 58725 }, { "epoch": 6.54081746296915, "grad_norm": 0.15721601247787476, "learning_rate": 1.611656598659148e-05, "loss": 0.458, "num_input_tokens_seen": 71243680, "step": 58730 }, { "epoch": 6.5413743178527675, "grad_norm": 0.15162868797779083, "learning_rate": 1.6112023817348793e-05, "loss": 0.4597, "num_input_tokens_seen": 71249472, "step": 58735 }, { "epoch": 6.541931172736385, "grad_norm": 0.1417113095521927, "learning_rate": 1.6107481983922327e-05, "loss": 0.4676, "num_input_tokens_seen": 71255488, "step": 58740 }, { "epoch": 6.542488027620002, "grad_norm": 0.12129539251327515, "learning_rate": 1.610294048648369e-05, "loss": 0.4626, "num_input_tokens_seen": 71261536, "step": 58745 }, { "epoch": 6.54304488250362, "grad_norm": 0.09886231273412704, "learning_rate": 1.6098399325204472e-05, "loss": 0.4716, "num_input_tokens_seen": 71267584, "step": 58750 }, { "epoch": 6.543601737387237, "grad_norm": 0.11474016308784485, "learning_rate": 1.6093858500256253e-05, "loss": 0.4591, "num_input_tokens_seen": 71273600, "step": 58755 }, { "epoch": 6.544158592270854, "grad_norm": 0.11007720232009888, "learning_rate": 1.6089318011810602e-05, "loss": 0.4646, "num_input_tokens_seen": 71279584, "step": 58760 }, { "epoch": 6.544715447154472, "grad_norm": 0.08909046649932861, "learning_rate": 1.608477786003907e-05, "loss": 0.4708, "num_input_tokens_seen": 71284928, "step": 58765 }, { "epoch": 6.5452723020380885, "grad_norm": 0.09920709580183029, "learning_rate": 1.6080238045113198e-05, "loss": 0.4559, "num_input_tokens_seen": 71291168, "step": 58770 }, { "epoch": 6.545829156921706, "grad_norm": 0.115126833319664, "learning_rate": 1.6075698567204524e-05, "loss": 0.4675, "num_input_tokens_seen": 71297344, "step": 58775 }, { "epoch": 6.546386011805324, "grad_norm": 0.13175614178180695, "learning_rate": 1.6071159426484556e-05, "loss": 0.4604, "num_input_tokens_seen": 71303392, "step": 58780 }, { "epoch": 6.546942866688941, "grad_norm": 0.08976307511329651, "learning_rate": 1.60666206231248e-05, "loss": 0.4678, "num_input_tokens_seen": 71309792, "step": 58785 }, { "epoch": 6.547499721572558, "grad_norm": 0.12771190702915192, "learning_rate": 1.6062082157296743e-05, "loss": 0.4558, "num_input_tokens_seen": 71315520, "step": 58790 }, { "epoch": 6.548056576456176, "grad_norm": 0.09677921235561371, "learning_rate": 1.6057544029171863e-05, "loss": 0.4385, "num_input_tokens_seen": 71321792, "step": 58795 }, { "epoch": 6.548613431339793, "grad_norm": 0.10307496786117554, "learning_rate": 1.605300623892163e-05, "loss": 0.4729, "num_input_tokens_seen": 71328320, "step": 58800 }, { "epoch": 6.5491702862234105, "grad_norm": 0.08933497965335846, "learning_rate": 1.6048468786717493e-05, "loss": 0.4655, "num_input_tokens_seen": 71333888, "step": 58805 }, { "epoch": 6.549727141107027, "grad_norm": 0.13177546858787537, "learning_rate": 1.6043931672730892e-05, "loss": 0.4515, "num_input_tokens_seen": 71339616, "step": 58810 }, { "epoch": 6.550283995990645, "grad_norm": 0.11153733730316162, "learning_rate": 1.6039394897133253e-05, "loss": 0.469, "num_input_tokens_seen": 71345824, "step": 58815 }, { "epoch": 6.550840850874263, "grad_norm": 0.14727269113063812, "learning_rate": 1.603485846009599e-05, "loss": 0.4741, "num_input_tokens_seen": 71352064, "step": 58820 }, { "epoch": 6.551397705757879, "grad_norm": 0.10346392542123795, "learning_rate": 1.6030322361790508e-05, "loss": 0.459, "num_input_tokens_seen": 71357568, "step": 58825 }, { "epoch": 6.551954560641497, "grad_norm": 0.10274460166692734, "learning_rate": 1.6025786602388185e-05, "loss": 0.4798, "num_input_tokens_seen": 71363616, "step": 58830 }, { "epoch": 6.552511415525114, "grad_norm": 0.07479273527860641, "learning_rate": 1.602125118206041e-05, "loss": 0.4615, "num_input_tokens_seen": 71369760, "step": 58835 }, { "epoch": 6.5530682704087315, "grad_norm": 0.09962110966444016, "learning_rate": 1.6016716100978534e-05, "loss": 0.4644, "num_input_tokens_seen": 71375840, "step": 58840 }, { "epoch": 6.553625125292349, "grad_norm": 0.14735786616802216, "learning_rate": 1.6012181359313914e-05, "loss": 0.4614, "num_input_tokens_seen": 71381952, "step": 58845 }, { "epoch": 6.554181980175966, "grad_norm": 0.09253334999084473, "learning_rate": 1.600764695723789e-05, "loss": 0.46, "num_input_tokens_seen": 71388064, "step": 58850 }, { "epoch": 6.554738835059584, "grad_norm": 0.1460374891757965, "learning_rate": 1.600311289492178e-05, "loss": 0.4573, "num_input_tokens_seen": 71394304, "step": 58855 }, { "epoch": 6.5552956899432, "grad_norm": 0.1025400459766388, "learning_rate": 1.5998579172536887e-05, "loss": 0.463, "num_input_tokens_seen": 71400576, "step": 58860 }, { "epoch": 6.555852544826818, "grad_norm": 0.09712445735931396, "learning_rate": 1.599404579025453e-05, "loss": 0.4526, "num_input_tokens_seen": 71405984, "step": 58865 }, { "epoch": 6.556409399710436, "grad_norm": 0.11072196066379547, "learning_rate": 1.5989512748245978e-05, "loss": 0.4715, "num_input_tokens_seen": 71412384, "step": 58870 }, { "epoch": 6.556966254594053, "grad_norm": 0.14461442828178406, "learning_rate": 1.5984980046682512e-05, "loss": 0.4668, "num_input_tokens_seen": 71418496, "step": 58875 }, { "epoch": 6.55752310947767, "grad_norm": 0.17635415494441986, "learning_rate": 1.5980447685735393e-05, "loss": 0.4692, "num_input_tokens_seen": 71424928, "step": 58880 }, { "epoch": 6.558079964361287, "grad_norm": 0.11877962201833725, "learning_rate": 1.5975915665575862e-05, "loss": 0.4725, "num_input_tokens_seen": 71430880, "step": 58885 }, { "epoch": 6.558636819244905, "grad_norm": 0.1541859656572342, "learning_rate": 1.5971383986375157e-05, "loss": 0.4681, "num_input_tokens_seen": 71436992, "step": 58890 }, { "epoch": 6.559193674128522, "grad_norm": 0.10646471381187439, "learning_rate": 1.5966852648304493e-05, "loss": 0.452, "num_input_tokens_seen": 71442912, "step": 58895 }, { "epoch": 6.559750529012139, "grad_norm": 0.11580256372690201, "learning_rate": 1.596232165153509e-05, "loss": 0.4511, "num_input_tokens_seen": 71448992, "step": 58900 }, { "epoch": 6.560307383895757, "grad_norm": 0.0998549833893776, "learning_rate": 1.5957790996238137e-05, "loss": 0.4705, "num_input_tokens_seen": 71454656, "step": 58905 }, { "epoch": 6.560864238779374, "grad_norm": 0.09857825189828873, "learning_rate": 1.595326068258482e-05, "loss": 0.475, "num_input_tokens_seen": 71460768, "step": 58910 }, { "epoch": 6.561421093662991, "grad_norm": 0.15514682233333588, "learning_rate": 1.5948730710746302e-05, "loss": 0.4633, "num_input_tokens_seen": 71467040, "step": 58915 }, { "epoch": 6.561977948546609, "grad_norm": 0.10202353447675705, "learning_rate": 1.594420108089375e-05, "loss": 0.4592, "num_input_tokens_seen": 71472864, "step": 58920 }, { "epoch": 6.562534803430226, "grad_norm": 0.16499090194702148, "learning_rate": 1.59396717931983e-05, "loss": 0.4637, "num_input_tokens_seen": 71479424, "step": 58925 }, { "epoch": 6.5630916583138434, "grad_norm": 0.1164376363158226, "learning_rate": 1.5935142847831084e-05, "loss": 0.4604, "num_input_tokens_seen": 71485536, "step": 58930 }, { "epoch": 6.563648513197461, "grad_norm": 0.10971987247467041, "learning_rate": 1.5930614244963217e-05, "loss": 0.4617, "num_input_tokens_seen": 71491616, "step": 58935 }, { "epoch": 6.564205368081078, "grad_norm": 0.1269596517086029, "learning_rate": 1.5926085984765812e-05, "loss": 0.4547, "num_input_tokens_seen": 71497984, "step": 58940 }, { "epoch": 6.564762222964696, "grad_norm": 0.10767897218465805, "learning_rate": 1.5921558067409954e-05, "loss": 0.4652, "num_input_tokens_seen": 71503904, "step": 58945 }, { "epoch": 6.565319077848312, "grad_norm": 0.10012675076723099, "learning_rate": 1.5917030493066727e-05, "loss": 0.4638, "num_input_tokens_seen": 71510336, "step": 58950 }, { "epoch": 6.56587593273193, "grad_norm": 0.0779838114976883, "learning_rate": 1.5912503261907194e-05, "loss": 0.4562, "num_input_tokens_seen": 71516352, "step": 58955 }, { "epoch": 6.566432787615548, "grad_norm": 0.13121633231639862, "learning_rate": 1.5907976374102413e-05, "loss": 0.4615, "num_input_tokens_seen": 71522144, "step": 58960 }, { "epoch": 6.5669896424991645, "grad_norm": 0.08914781361818314, "learning_rate": 1.5903449829823418e-05, "loss": 0.4734, "num_input_tokens_seen": 71528256, "step": 58965 }, { "epoch": 6.567546497382782, "grad_norm": 0.10817460715770721, "learning_rate": 1.5898923629241248e-05, "loss": 0.4542, "num_input_tokens_seen": 71534304, "step": 58970 }, { "epoch": 6.5681033522664, "grad_norm": 0.10685189813375473, "learning_rate": 1.5894397772526896e-05, "loss": 0.4733, "num_input_tokens_seen": 71540448, "step": 58975 }, { "epoch": 6.568660207150017, "grad_norm": 0.09975449740886688, "learning_rate": 1.5889872259851384e-05, "loss": 0.449, "num_input_tokens_seen": 71546464, "step": 58980 }, { "epoch": 6.569217062033634, "grad_norm": 0.10730995237827301, "learning_rate": 1.5885347091385694e-05, "loss": 0.4591, "num_input_tokens_seen": 71552672, "step": 58985 }, { "epoch": 6.569773916917251, "grad_norm": 0.09446004778146744, "learning_rate": 1.58808222673008e-05, "loss": 0.4556, "num_input_tokens_seen": 71559008, "step": 58990 }, { "epoch": 6.570330771800869, "grad_norm": 0.1404815912246704, "learning_rate": 1.5876297787767662e-05, "loss": 0.4653, "num_input_tokens_seen": 71565312, "step": 58995 }, { "epoch": 6.5708876266844864, "grad_norm": 0.10400289297103882, "learning_rate": 1.5871773652957234e-05, "loss": 0.4586, "num_input_tokens_seen": 71571008, "step": 59000 }, { "epoch": 6.571444481568103, "grad_norm": 0.1399092972278595, "learning_rate": 1.5867249863040444e-05, "loss": 0.457, "num_input_tokens_seen": 71577280, "step": 59005 }, { "epoch": 6.572001336451721, "grad_norm": 0.12035302072763443, "learning_rate": 1.5862726418188227e-05, "loss": 0.469, "num_input_tokens_seen": 71583328, "step": 59010 }, { "epoch": 6.572558191335338, "grad_norm": 0.10482516139745712, "learning_rate": 1.5858203318571492e-05, "loss": 0.4595, "num_input_tokens_seen": 71589280, "step": 59015 }, { "epoch": 6.573115046218955, "grad_norm": 0.14515994489192963, "learning_rate": 1.5853680564361125e-05, "loss": 0.4718, "num_input_tokens_seen": 71595552, "step": 59020 }, { "epoch": 6.573671901102573, "grad_norm": 0.1060531809926033, "learning_rate": 1.5849158155728023e-05, "loss": 0.4574, "num_input_tokens_seen": 71601760, "step": 59025 }, { "epoch": 6.57422875598619, "grad_norm": 0.09992268681526184, "learning_rate": 1.5844636092843047e-05, "loss": 0.4598, "num_input_tokens_seen": 71607968, "step": 59030 }, { "epoch": 6.5747856108698075, "grad_norm": 0.13704176247119904, "learning_rate": 1.584011437587706e-05, "loss": 0.4582, "num_input_tokens_seen": 71614176, "step": 59035 }, { "epoch": 6.575342465753424, "grad_norm": 0.0875190794467926, "learning_rate": 1.5835593005000905e-05, "loss": 0.4491, "num_input_tokens_seen": 71620256, "step": 59040 }, { "epoch": 6.575899320637042, "grad_norm": 0.08209490776062012, "learning_rate": 1.5831071980385416e-05, "loss": 0.4591, "num_input_tokens_seen": 71626368, "step": 59045 }, { "epoch": 6.57645617552066, "grad_norm": 0.12436597794294357, "learning_rate": 1.5826551302201407e-05, "loss": 0.4517, "num_input_tokens_seen": 71632640, "step": 59050 }, { "epoch": 6.577013030404276, "grad_norm": 0.10776825249195099, "learning_rate": 1.5822030970619693e-05, "loss": 0.4503, "num_input_tokens_seen": 71638848, "step": 59055 }, { "epoch": 6.577569885287894, "grad_norm": 0.09409133344888687, "learning_rate": 1.5817510985811062e-05, "loss": 0.4573, "num_input_tokens_seen": 71644736, "step": 59060 }, { "epoch": 6.578126740171511, "grad_norm": 0.08817776292562485, "learning_rate": 1.5812991347946293e-05, "loss": 0.443, "num_input_tokens_seen": 71651232, "step": 59065 }, { "epoch": 6.578683595055129, "grad_norm": 0.09013407677412033, "learning_rate": 1.5808472057196143e-05, "loss": 0.456, "num_input_tokens_seen": 71657216, "step": 59070 }, { "epoch": 6.579240449938746, "grad_norm": 0.1098882257938385, "learning_rate": 1.5803953113731386e-05, "loss": 0.442, "num_input_tokens_seen": 71663360, "step": 59075 }, { "epoch": 6.579797304822363, "grad_norm": 0.10013526678085327, "learning_rate": 1.5799434517722746e-05, "loss": 0.4751, "num_input_tokens_seen": 71669568, "step": 59080 }, { "epoch": 6.580354159705981, "grad_norm": 0.0724390521645546, "learning_rate": 1.5794916269340953e-05, "loss": 0.4629, "num_input_tokens_seen": 71675520, "step": 59085 }, { "epoch": 6.5809110145895975, "grad_norm": 0.14053072035312653, "learning_rate": 1.5790398368756724e-05, "loss": 0.4667, "num_input_tokens_seen": 71681984, "step": 59090 }, { "epoch": 6.581467869473215, "grad_norm": 0.11312699317932129, "learning_rate": 1.5785880816140757e-05, "loss": 0.4611, "num_input_tokens_seen": 71688480, "step": 59095 }, { "epoch": 6.582024724356833, "grad_norm": 0.1052662804722786, "learning_rate": 1.5781363611663745e-05, "loss": 0.4613, "num_input_tokens_seen": 71694368, "step": 59100 }, { "epoch": 6.58258157924045, "grad_norm": 0.09718908369541168, "learning_rate": 1.577684675549635e-05, "loss": 0.4447, "num_input_tokens_seen": 71699840, "step": 59105 }, { "epoch": 6.583138434124067, "grad_norm": 0.11377622187137604, "learning_rate": 1.5772330247809247e-05, "loss": 0.4616, "num_input_tokens_seen": 71705920, "step": 59110 }, { "epoch": 6.583695289007685, "grad_norm": 0.10499110072851181, "learning_rate": 1.576781408877308e-05, "loss": 0.467, "num_input_tokens_seen": 71712416, "step": 59115 }, { "epoch": 6.584252143891302, "grad_norm": 0.10860739648342133, "learning_rate": 1.576329827855848e-05, "loss": 0.456, "num_input_tokens_seen": 71718720, "step": 59120 }, { "epoch": 6.584808998774919, "grad_norm": 0.12269004434347153, "learning_rate": 1.5758782817336073e-05, "loss": 0.4606, "num_input_tokens_seen": 71724576, "step": 59125 }, { "epoch": 6.585365853658536, "grad_norm": 0.10641322284936905, "learning_rate": 1.5754267705276467e-05, "loss": 0.4513, "num_input_tokens_seen": 71730912, "step": 59130 }, { "epoch": 6.585922708542154, "grad_norm": 0.09299459308385849, "learning_rate": 1.574975294255025e-05, "loss": 0.4602, "num_input_tokens_seen": 71737120, "step": 59135 }, { "epoch": 6.586479563425772, "grad_norm": 0.07987845689058304, "learning_rate": 1.5745238529328016e-05, "loss": 0.4556, "num_input_tokens_seen": 71743360, "step": 59140 }, { "epoch": 6.587036418309388, "grad_norm": 0.12812617421150208, "learning_rate": 1.5740724465780333e-05, "loss": 0.4724, "num_input_tokens_seen": 71749760, "step": 59145 }, { "epoch": 6.587593273193006, "grad_norm": 0.10063013434410095, "learning_rate": 1.5736210752077746e-05, "loss": 0.4653, "num_input_tokens_seen": 71755552, "step": 59150 }, { "epoch": 6.588150128076624, "grad_norm": 0.09342775493860245, "learning_rate": 1.573169738839081e-05, "loss": 0.4607, "num_input_tokens_seen": 71761792, "step": 59155 }, { "epoch": 6.5887069829602405, "grad_norm": 0.1362091451883316, "learning_rate": 1.572718437489004e-05, "loss": 0.4614, "num_input_tokens_seen": 71767680, "step": 59160 }, { "epoch": 6.589263837843858, "grad_norm": 0.11689259856939316, "learning_rate": 1.5722671711745968e-05, "loss": 0.4569, "num_input_tokens_seen": 71773504, "step": 59165 }, { "epoch": 6.589820692727475, "grad_norm": 0.1091156005859375, "learning_rate": 1.5718159399129083e-05, "loss": 0.4763, "num_input_tokens_seen": 71779808, "step": 59170 }, { "epoch": 6.590377547611093, "grad_norm": 0.1262976974248886, "learning_rate": 1.5713647437209886e-05, "loss": 0.4509, "num_input_tokens_seen": 71786016, "step": 59175 }, { "epoch": 6.59093440249471, "grad_norm": 0.10470005124807358, "learning_rate": 1.5709135826158848e-05, "loss": 0.4655, "num_input_tokens_seen": 71792096, "step": 59180 }, { "epoch": 6.591491257378327, "grad_norm": 0.12779386341571808, "learning_rate": 1.570462456614643e-05, "loss": 0.4654, "num_input_tokens_seen": 71798720, "step": 59185 }, { "epoch": 6.592048112261945, "grad_norm": 0.1034206673502922, "learning_rate": 1.5700113657343088e-05, "loss": 0.4596, "num_input_tokens_seen": 71804608, "step": 59190 }, { "epoch": 6.5926049671455615, "grad_norm": 0.1202850267291069, "learning_rate": 1.5695603099919253e-05, "loss": 0.472, "num_input_tokens_seen": 71811008, "step": 59195 }, { "epoch": 6.593161822029179, "grad_norm": 0.09081301838159561, "learning_rate": 1.5691092894045344e-05, "loss": 0.4537, "num_input_tokens_seen": 71816640, "step": 59200 }, { "epoch": 6.593718676912797, "grad_norm": 0.11429934948682785, "learning_rate": 1.5686583039891786e-05, "loss": 0.4742, "num_input_tokens_seen": 71822496, "step": 59205 }, { "epoch": 6.594275531796414, "grad_norm": 0.10124877840280533, "learning_rate": 1.5682073537628965e-05, "loss": 0.4773, "num_input_tokens_seen": 71828736, "step": 59210 }, { "epoch": 6.594832386680031, "grad_norm": 0.08983729034662247, "learning_rate": 1.5677564387427268e-05, "loss": 0.4773, "num_input_tokens_seen": 71834272, "step": 59215 }, { "epoch": 6.595389241563648, "grad_norm": 0.09339591860771179, "learning_rate": 1.5673055589457063e-05, "loss": 0.4655, "num_input_tokens_seen": 71840032, "step": 59220 }, { "epoch": 6.595946096447266, "grad_norm": 0.10143331438302994, "learning_rate": 1.5668547143888708e-05, "loss": 0.4696, "num_input_tokens_seen": 71846272, "step": 59225 }, { "epoch": 6.5965029513308835, "grad_norm": 0.16643266379833221, "learning_rate": 1.5664039050892536e-05, "loss": 0.4668, "num_input_tokens_seen": 71852192, "step": 59230 }, { "epoch": 6.5970598062145, "grad_norm": 0.1085272952914238, "learning_rate": 1.5659531310638897e-05, "loss": 0.4664, "num_input_tokens_seen": 71858400, "step": 59235 }, { "epoch": 6.597616661098118, "grad_norm": 0.11014695465564728, "learning_rate": 1.5655023923298094e-05, "loss": 0.4493, "num_input_tokens_seen": 71864448, "step": 59240 }, { "epoch": 6.598173515981735, "grad_norm": 0.10570394992828369, "learning_rate": 1.565051688904044e-05, "loss": 0.4599, "num_input_tokens_seen": 71870592, "step": 59245 }, { "epoch": 6.598730370865352, "grad_norm": 0.12483739852905273, "learning_rate": 1.5646010208036218e-05, "loss": 0.4531, "num_input_tokens_seen": 71876608, "step": 59250 }, { "epoch": 6.59928722574897, "grad_norm": 0.127315491437912, "learning_rate": 1.5641503880455706e-05, "loss": 0.463, "num_input_tokens_seen": 71882848, "step": 59255 }, { "epoch": 6.599844080632587, "grad_norm": 0.0955343022942543, "learning_rate": 1.563699790646917e-05, "loss": 0.4629, "num_input_tokens_seen": 71888832, "step": 59260 }, { "epoch": 6.6004009355162045, "grad_norm": 0.09330157190561295, "learning_rate": 1.563249228624685e-05, "loss": 0.4714, "num_input_tokens_seen": 71894816, "step": 59265 }, { "epoch": 6.600957790399821, "grad_norm": 0.1241583377122879, "learning_rate": 1.5627987019958998e-05, "loss": 0.4486, "num_input_tokens_seen": 71901024, "step": 59270 }, { "epoch": 6.601514645283439, "grad_norm": 0.11374947428703308, "learning_rate": 1.562348210777583e-05, "loss": 0.4616, "num_input_tokens_seen": 71907040, "step": 59275 }, { "epoch": 6.602071500167057, "grad_norm": 0.1082514300942421, "learning_rate": 1.561897754986756e-05, "loss": 0.4619, "num_input_tokens_seen": 71913440, "step": 59280 }, { "epoch": 6.6026283550506735, "grad_norm": 0.0993104800581932, "learning_rate": 1.5614473346404373e-05, "loss": 0.465, "num_input_tokens_seen": 71919776, "step": 59285 }, { "epoch": 6.603185209934291, "grad_norm": 0.09716803580522537, "learning_rate": 1.5609969497556467e-05, "loss": 0.4797, "num_input_tokens_seen": 71925600, "step": 59290 }, { "epoch": 6.603742064817909, "grad_norm": 0.08999352157115936, "learning_rate": 1.5605466003494e-05, "loss": 0.4557, "num_input_tokens_seen": 71931712, "step": 59295 }, { "epoch": 6.604298919701526, "grad_norm": 0.13607917726039886, "learning_rate": 1.5600962864387136e-05, "loss": 0.4504, "num_input_tokens_seen": 71937888, "step": 59300 }, { "epoch": 6.604855774585143, "grad_norm": 0.11720090359449387, "learning_rate": 1.5596460080406018e-05, "loss": 0.4713, "num_input_tokens_seen": 71944032, "step": 59305 }, { "epoch": 6.60541262946876, "grad_norm": 0.13954879343509674, "learning_rate": 1.559195765172077e-05, "loss": 0.4608, "num_input_tokens_seen": 71950176, "step": 59310 }, { "epoch": 6.605969484352378, "grad_norm": 0.14034883677959442, "learning_rate": 1.5587455578501514e-05, "loss": 0.4647, "num_input_tokens_seen": 71956352, "step": 59315 }, { "epoch": 6.606526339235995, "grad_norm": 0.10341623425483704, "learning_rate": 1.558295386091835e-05, "loss": 0.4494, "num_input_tokens_seen": 71962496, "step": 59320 }, { "epoch": 6.607083194119612, "grad_norm": 0.07476367801427841, "learning_rate": 1.5578452499141366e-05, "loss": 0.4486, "num_input_tokens_seen": 71968576, "step": 59325 }, { "epoch": 6.60764004900323, "grad_norm": 0.09539968520402908, "learning_rate": 1.5573951493340638e-05, "loss": 0.4556, "num_input_tokens_seen": 71974368, "step": 59330 }, { "epoch": 6.6081969038868476, "grad_norm": 0.1534246802330017, "learning_rate": 1.5569450843686243e-05, "loss": 0.4641, "num_input_tokens_seen": 71980544, "step": 59335 }, { "epoch": 6.608753758770464, "grad_norm": 0.09067406505346298, "learning_rate": 1.5564950550348206e-05, "loss": 0.4642, "num_input_tokens_seen": 71986784, "step": 59340 }, { "epoch": 6.609310613654082, "grad_norm": 0.10473120212554932, "learning_rate": 1.556045061349657e-05, "loss": 0.4564, "num_input_tokens_seen": 71992736, "step": 59345 }, { "epoch": 6.609867468537699, "grad_norm": 0.09011119604110718, "learning_rate": 1.5555951033301362e-05, "loss": 0.4573, "num_input_tokens_seen": 71998592, "step": 59350 }, { "epoch": 6.6104243234213165, "grad_norm": 0.1555911749601364, "learning_rate": 1.555145180993259e-05, "loss": 0.4638, "num_input_tokens_seen": 72004928, "step": 59355 }, { "epoch": 6.610981178304934, "grad_norm": 0.0993112176656723, "learning_rate": 1.554695294356025e-05, "loss": 0.4517, "num_input_tokens_seen": 72011136, "step": 59360 }, { "epoch": 6.611538033188551, "grad_norm": 0.09155325591564178, "learning_rate": 1.554245443435432e-05, "loss": 0.4694, "num_input_tokens_seen": 72017312, "step": 59365 }, { "epoch": 6.612094888072169, "grad_norm": 0.09601815789937973, "learning_rate": 1.553795628248477e-05, "loss": 0.4517, "num_input_tokens_seen": 72023648, "step": 59370 }, { "epoch": 6.612651742955785, "grad_norm": 0.09041308611631393, "learning_rate": 1.5533458488121555e-05, "loss": 0.4616, "num_input_tokens_seen": 72029440, "step": 59375 }, { "epoch": 6.613208597839403, "grad_norm": 0.08202818781137466, "learning_rate": 1.552896105143461e-05, "loss": 0.4604, "num_input_tokens_seen": 72035488, "step": 59380 }, { "epoch": 6.613765452723021, "grad_norm": 0.11587550491094589, "learning_rate": 1.552446397259387e-05, "loss": 0.4617, "num_input_tokens_seen": 72040544, "step": 59385 }, { "epoch": 6.6143223076066375, "grad_norm": 0.0801066905260086, "learning_rate": 1.551996725176925e-05, "loss": 0.4625, "num_input_tokens_seen": 72046496, "step": 59390 }, { "epoch": 6.614879162490255, "grad_norm": 0.09771766513586044, "learning_rate": 1.551547088913065e-05, "loss": 0.4603, "num_input_tokens_seen": 72052640, "step": 59395 }, { "epoch": 6.615436017373872, "grad_norm": 0.08811295032501221, "learning_rate": 1.551097488484795e-05, "loss": 0.4641, "num_input_tokens_seen": 72058304, "step": 59400 }, { "epoch": 6.61599287225749, "grad_norm": 0.1313551366329193, "learning_rate": 1.550647923909103e-05, "loss": 0.4598, "num_input_tokens_seen": 72064608, "step": 59405 }, { "epoch": 6.616549727141107, "grad_norm": 0.07081913948059082, "learning_rate": 1.550198395202974e-05, "loss": 0.4608, "num_input_tokens_seen": 72070816, "step": 59410 }, { "epoch": 6.617106582024724, "grad_norm": 0.1212209090590477, "learning_rate": 1.5497489023833944e-05, "loss": 0.4694, "num_input_tokens_seen": 72076928, "step": 59415 }, { "epoch": 6.617663436908342, "grad_norm": 0.10274873673915863, "learning_rate": 1.549299445467346e-05, "loss": 0.4585, "num_input_tokens_seen": 72082976, "step": 59420 }, { "epoch": 6.618220291791959, "grad_norm": 0.09197796881198883, "learning_rate": 1.5488500244718118e-05, "loss": 0.4696, "num_input_tokens_seen": 72089088, "step": 59425 }, { "epoch": 6.618777146675576, "grad_norm": 0.11072283983230591, "learning_rate": 1.548400639413771e-05, "loss": 0.4634, "num_input_tokens_seen": 72094528, "step": 59430 }, { "epoch": 6.619334001559194, "grad_norm": 0.113707534968853, "learning_rate": 1.5479512903102045e-05, "loss": 0.4823, "num_input_tokens_seen": 72100640, "step": 59435 }, { "epoch": 6.619890856442811, "grad_norm": 0.10401388257741928, "learning_rate": 1.5475019771780885e-05, "loss": 0.4636, "num_input_tokens_seen": 72106688, "step": 59440 }, { "epoch": 6.620447711326428, "grad_norm": 0.10023633390665054, "learning_rate": 1.5470527000344e-05, "loss": 0.4579, "num_input_tokens_seen": 72112768, "step": 59445 }, { "epoch": 6.621004566210045, "grad_norm": 0.13118083775043488, "learning_rate": 1.546603458896115e-05, "loss": 0.4597, "num_input_tokens_seen": 72118880, "step": 59450 }, { "epoch": 6.621561421093663, "grad_norm": 0.09519292414188385, "learning_rate": 1.5461542537802065e-05, "loss": 0.4713, "num_input_tokens_seen": 72125248, "step": 59455 }, { "epoch": 6.6221182759772805, "grad_norm": 0.11610012501478195, "learning_rate": 1.545705084703647e-05, "loss": 0.4824, "num_input_tokens_seen": 72131488, "step": 59460 }, { "epoch": 6.622675130860897, "grad_norm": 0.10760737210512161, "learning_rate": 1.545255951683408e-05, "loss": 0.4559, "num_input_tokens_seen": 72137600, "step": 59465 }, { "epoch": 6.623231985744515, "grad_norm": 0.11522699147462845, "learning_rate": 1.544806854736458e-05, "loss": 0.4526, "num_input_tokens_seen": 72143808, "step": 59470 }, { "epoch": 6.623788840628133, "grad_norm": 0.10983726382255554, "learning_rate": 1.544357793879766e-05, "loss": 0.4658, "num_input_tokens_seen": 72149856, "step": 59475 }, { "epoch": 6.6243456955117495, "grad_norm": 0.07015616446733475, "learning_rate": 1.5439087691302996e-05, "loss": 0.4586, "num_input_tokens_seen": 72156064, "step": 59480 }, { "epoch": 6.624902550395367, "grad_norm": 0.1126725822687149, "learning_rate": 1.543459780505024e-05, "loss": 0.468, "num_input_tokens_seen": 72162368, "step": 59485 }, { "epoch": 6.625459405278984, "grad_norm": 0.08571243286132812, "learning_rate": 1.543010828020903e-05, "loss": 0.4557, "num_input_tokens_seen": 72168320, "step": 59490 }, { "epoch": 6.626016260162602, "grad_norm": 0.12466178089380264, "learning_rate": 1.5425619116948998e-05, "loss": 0.4673, "num_input_tokens_seen": 72174336, "step": 59495 }, { "epoch": 6.626573115046219, "grad_norm": 0.09468808770179749, "learning_rate": 1.5421130315439758e-05, "loss": 0.4693, "num_input_tokens_seen": 72180352, "step": 59500 }, { "epoch": 6.627129969929836, "grad_norm": 0.11235105246305466, "learning_rate": 1.5416641875850912e-05, "loss": 0.4645, "num_input_tokens_seen": 72185920, "step": 59505 }, { "epoch": 6.627686824813454, "grad_norm": 0.08253908902406693, "learning_rate": 1.5412153798352045e-05, "loss": 0.4554, "num_input_tokens_seen": 72191872, "step": 59510 }, { "epoch": 6.628243679697071, "grad_norm": 0.09009255468845367, "learning_rate": 1.5407666083112735e-05, "loss": 0.4548, "num_input_tokens_seen": 72198368, "step": 59515 }, { "epoch": 6.628800534580688, "grad_norm": 0.09210987389087677, "learning_rate": 1.5403178730302542e-05, "loss": 0.4656, "num_input_tokens_seen": 72204288, "step": 59520 }, { "epoch": 6.629357389464306, "grad_norm": 0.08678264170885086, "learning_rate": 1.539869174009101e-05, "loss": 0.4564, "num_input_tokens_seen": 72210208, "step": 59525 }, { "epoch": 6.629914244347923, "grad_norm": 0.09259139001369476, "learning_rate": 1.5394205112647673e-05, "loss": 0.4536, "num_input_tokens_seen": 72216512, "step": 59530 }, { "epoch": 6.63047109923154, "grad_norm": 0.112477146089077, "learning_rate": 1.538971884814205e-05, "loss": 0.4525, "num_input_tokens_seen": 72222592, "step": 59535 }, { "epoch": 6.631027954115158, "grad_norm": 0.10292460769414902, "learning_rate": 1.5385232946743644e-05, "loss": 0.4608, "num_input_tokens_seen": 72228896, "step": 59540 }, { "epoch": 6.631584808998775, "grad_norm": 0.15712681412696838, "learning_rate": 1.538074740862195e-05, "loss": 0.462, "num_input_tokens_seen": 72235232, "step": 59545 }, { "epoch": 6.6321416638823925, "grad_norm": 0.08229321986436844, "learning_rate": 1.5376262233946448e-05, "loss": 0.4622, "num_input_tokens_seen": 72241216, "step": 59550 }, { "epoch": 6.632698518766009, "grad_norm": 0.11887221783399582, "learning_rate": 1.53717774228866e-05, "loss": 0.467, "num_input_tokens_seen": 72247648, "step": 59555 }, { "epoch": 6.633255373649627, "grad_norm": 0.12367914617061615, "learning_rate": 1.5367292975611857e-05, "loss": 0.4716, "num_input_tokens_seen": 72253888, "step": 59560 }, { "epoch": 6.633812228533245, "grad_norm": 0.09878481924533844, "learning_rate": 1.5362808892291652e-05, "loss": 0.4543, "num_input_tokens_seen": 72260288, "step": 59565 }, { "epoch": 6.634369083416861, "grad_norm": 0.10345005989074707, "learning_rate": 1.5358325173095404e-05, "loss": 0.4668, "num_input_tokens_seen": 72266240, "step": 59570 }, { "epoch": 6.634925938300479, "grad_norm": 0.07137173414230347, "learning_rate": 1.535384181819254e-05, "loss": 0.4561, "num_input_tokens_seen": 72272320, "step": 59575 }, { "epoch": 6.635482793184096, "grad_norm": 0.13935023546218872, "learning_rate": 1.5349358827752442e-05, "loss": 0.4572, "num_input_tokens_seen": 72278176, "step": 59580 }, { "epoch": 6.6360396480677135, "grad_norm": 0.14147451519966125, "learning_rate": 1.5344876201944497e-05, "loss": 0.468, "num_input_tokens_seen": 72284352, "step": 59585 }, { "epoch": 6.636596502951331, "grad_norm": 0.10275507718324661, "learning_rate": 1.534039394093807e-05, "loss": 0.4541, "num_input_tokens_seen": 72290592, "step": 59590 }, { "epoch": 6.637153357834948, "grad_norm": 0.13296306133270264, "learning_rate": 1.533591204490251e-05, "loss": 0.4612, "num_input_tokens_seen": 72296544, "step": 59595 }, { "epoch": 6.637710212718566, "grad_norm": 0.11443348973989487, "learning_rate": 1.5331430514007167e-05, "loss": 0.4651, "num_input_tokens_seen": 72302816, "step": 59600 }, { "epoch": 6.638267067602182, "grad_norm": 0.07634632289409637, "learning_rate": 1.532694934842136e-05, "loss": 0.4636, "num_input_tokens_seen": 72309120, "step": 59605 }, { "epoch": 6.6388239224858, "grad_norm": 0.11145394295454025, "learning_rate": 1.532246854831441e-05, "loss": 0.4615, "num_input_tokens_seen": 72315424, "step": 59610 }, { "epoch": 6.639380777369418, "grad_norm": 0.08393045514822006, "learning_rate": 1.531798811385561e-05, "loss": 0.4597, "num_input_tokens_seen": 72321440, "step": 59615 }, { "epoch": 6.639937632253035, "grad_norm": 0.10555438697338104, "learning_rate": 1.5313508045214246e-05, "loss": 0.4594, "num_input_tokens_seen": 72327552, "step": 59620 }, { "epoch": 6.640494487136652, "grad_norm": 0.1325417160987854, "learning_rate": 1.530902834255959e-05, "loss": 0.4572, "num_input_tokens_seen": 72333728, "step": 59625 }, { "epoch": 6.641051342020269, "grad_norm": 0.09079142659902573, "learning_rate": 1.5304549006060903e-05, "loss": 0.471, "num_input_tokens_seen": 72339744, "step": 59630 }, { "epoch": 6.641608196903887, "grad_norm": 0.09139781445264816, "learning_rate": 1.5300070035887414e-05, "loss": 0.4694, "num_input_tokens_seen": 72345824, "step": 59635 }, { "epoch": 6.642165051787504, "grad_norm": 0.08798582851886749, "learning_rate": 1.5295591432208372e-05, "loss": 0.4573, "num_input_tokens_seen": 72352064, "step": 59640 }, { "epoch": 6.642721906671121, "grad_norm": 0.08525606989860535, "learning_rate": 1.529111319519299e-05, "loss": 0.4645, "num_input_tokens_seen": 72358048, "step": 59645 }, { "epoch": 6.643278761554739, "grad_norm": 0.08983404189348221, "learning_rate": 1.5286635325010458e-05, "loss": 0.4602, "num_input_tokens_seen": 72364000, "step": 59650 }, { "epoch": 6.6438356164383565, "grad_norm": 0.11280222237110138, "learning_rate": 1.5282157821829973e-05, "loss": 0.4568, "num_input_tokens_seen": 72369696, "step": 59655 }, { "epoch": 6.644392471321973, "grad_norm": 0.09655497223138809, "learning_rate": 1.527768068582071e-05, "loss": 0.463, "num_input_tokens_seen": 72375808, "step": 59660 }, { "epoch": 6.644949326205591, "grad_norm": 0.08471959829330444, "learning_rate": 1.5273203917151825e-05, "loss": 0.4635, "num_input_tokens_seen": 72382016, "step": 59665 }, { "epoch": 6.645506181089208, "grad_norm": 0.08428169786930084, "learning_rate": 1.5268727515992463e-05, "loss": 0.4531, "num_input_tokens_seen": 72387552, "step": 59670 }, { "epoch": 6.646063035972825, "grad_norm": 0.07395225763320923, "learning_rate": 1.5264251482511768e-05, "loss": 0.4605, "num_input_tokens_seen": 72393472, "step": 59675 }, { "epoch": 6.646619890856443, "grad_norm": 0.10707798600196838, "learning_rate": 1.5259775816878853e-05, "loss": 0.4601, "num_input_tokens_seen": 72399552, "step": 59680 }, { "epoch": 6.64717674574006, "grad_norm": 0.10643458366394043, "learning_rate": 1.5255300519262816e-05, "loss": 0.4567, "num_input_tokens_seen": 72405920, "step": 59685 }, { "epoch": 6.647733600623678, "grad_norm": 0.11906980723142624, "learning_rate": 1.5250825589832756e-05, "loss": 0.4509, "num_input_tokens_seen": 72412128, "step": 59690 }, { "epoch": 6.648290455507295, "grad_norm": 0.08887401968240738, "learning_rate": 1.5246351028757749e-05, "loss": 0.4598, "num_input_tokens_seen": 72418080, "step": 59695 }, { "epoch": 6.648847310390912, "grad_norm": 0.12116537988185883, "learning_rate": 1.5241876836206853e-05, "loss": 0.4719, "num_input_tokens_seen": 72424160, "step": 59700 }, { "epoch": 6.64940416527453, "grad_norm": 0.09341967850923538, "learning_rate": 1.5237403012349121e-05, "loss": 0.4556, "num_input_tokens_seen": 72429920, "step": 59705 }, { "epoch": 6.6499610201581465, "grad_norm": 0.08739586174488068, "learning_rate": 1.5232929557353595e-05, "loss": 0.4703, "num_input_tokens_seen": 72436128, "step": 59710 }, { "epoch": 6.650517875041764, "grad_norm": 0.10101724416017532, "learning_rate": 1.5228456471389291e-05, "loss": 0.4508, "num_input_tokens_seen": 72442400, "step": 59715 }, { "epoch": 6.651074729925382, "grad_norm": 0.0914502963423729, "learning_rate": 1.5223983754625212e-05, "loss": 0.4515, "num_input_tokens_seen": 72448448, "step": 59720 }, { "epoch": 6.651631584808999, "grad_norm": 0.07922270148992538, "learning_rate": 1.5219511407230358e-05, "loss": 0.4621, "num_input_tokens_seen": 72454336, "step": 59725 }, { "epoch": 6.652188439692616, "grad_norm": 0.08545583486557007, "learning_rate": 1.5215039429373704e-05, "loss": 0.456, "num_input_tokens_seen": 72460192, "step": 59730 }, { "epoch": 6.652745294576233, "grad_norm": 0.10483667999505997, "learning_rate": 1.5210567821224214e-05, "loss": 0.4593, "num_input_tokens_seen": 72466080, "step": 59735 }, { "epoch": 6.653302149459851, "grad_norm": 0.09037694334983826, "learning_rate": 1.5206096582950848e-05, "loss": 0.4661, "num_input_tokens_seen": 72472224, "step": 59740 }, { "epoch": 6.653859004343468, "grad_norm": 0.08626957982778549, "learning_rate": 1.520162571472255e-05, "loss": 0.4439, "num_input_tokens_seen": 72477920, "step": 59745 }, { "epoch": 6.654415859227085, "grad_norm": 0.08835053443908691, "learning_rate": 1.519715521670822e-05, "loss": 0.473, "num_input_tokens_seen": 72483840, "step": 59750 }, { "epoch": 6.654972714110703, "grad_norm": 0.1351841241121292, "learning_rate": 1.519268508907678e-05, "loss": 0.4652, "num_input_tokens_seen": 72489984, "step": 59755 }, { "epoch": 6.65552956899432, "grad_norm": 0.102572001516819, "learning_rate": 1.518821533199713e-05, "loss": 0.4661, "num_input_tokens_seen": 72495872, "step": 59760 }, { "epoch": 6.656086423877937, "grad_norm": 0.09633779525756836, "learning_rate": 1.518374594563815e-05, "loss": 0.47, "num_input_tokens_seen": 72501824, "step": 59765 }, { "epoch": 6.656643278761555, "grad_norm": 0.1274559497833252, "learning_rate": 1.5179276930168704e-05, "loss": 0.4629, "num_input_tokens_seen": 72508096, "step": 59770 }, { "epoch": 6.657200133645172, "grad_norm": 0.10452824085950851, "learning_rate": 1.5174808285757647e-05, "loss": 0.4597, "num_input_tokens_seen": 72514080, "step": 59775 }, { "epoch": 6.6577569885287895, "grad_norm": 0.08023799955844879, "learning_rate": 1.517034001257382e-05, "loss": 0.4582, "num_input_tokens_seen": 72519904, "step": 59780 }, { "epoch": 6.658313843412406, "grad_norm": 0.11270030587911606, "learning_rate": 1.5165872110786045e-05, "loss": 0.4583, "num_input_tokens_seen": 72525600, "step": 59785 }, { "epoch": 6.658870698296024, "grad_norm": 0.10601255297660828, "learning_rate": 1.5161404580563137e-05, "loss": 0.4528, "num_input_tokens_seen": 72531808, "step": 59790 }, { "epoch": 6.659427553179642, "grad_norm": 0.11759261041879654, "learning_rate": 1.5156937422073897e-05, "loss": 0.4608, "num_input_tokens_seen": 72537920, "step": 59795 }, { "epoch": 6.659984408063258, "grad_norm": 0.08970615267753601, "learning_rate": 1.5152470635487105e-05, "loss": 0.4556, "num_input_tokens_seen": 72543936, "step": 59800 }, { "epoch": 6.660541262946876, "grad_norm": 0.08766066282987595, "learning_rate": 1.5148004220971527e-05, "loss": 0.4553, "num_input_tokens_seen": 72549440, "step": 59805 }, { "epoch": 6.661098117830493, "grad_norm": 0.10958388447761536, "learning_rate": 1.5143538178695924e-05, "loss": 0.4502, "num_input_tokens_seen": 72555488, "step": 59810 }, { "epoch": 6.661654972714111, "grad_norm": 0.11580699682235718, "learning_rate": 1.5139072508829028e-05, "loss": 0.4559, "num_input_tokens_seen": 72561504, "step": 59815 }, { "epoch": 6.662211827597728, "grad_norm": 0.10202791541814804, "learning_rate": 1.513460721153958e-05, "loss": 0.4527, "num_input_tokens_seen": 72567520, "step": 59820 }, { "epoch": 6.662768682481345, "grad_norm": 0.11557655781507492, "learning_rate": 1.513014228699629e-05, "loss": 0.4691, "num_input_tokens_seen": 72572896, "step": 59825 }, { "epoch": 6.663325537364963, "grad_norm": 0.09234447032213211, "learning_rate": 1.5125677735367849e-05, "loss": 0.4734, "num_input_tokens_seen": 72579136, "step": 59830 }, { "epoch": 6.66388239224858, "grad_norm": 0.09096483141183853, "learning_rate": 1.512121355682295e-05, "loss": 0.4613, "num_input_tokens_seen": 72585248, "step": 59835 }, { "epoch": 6.664439247132197, "grad_norm": 0.14351427555084229, "learning_rate": 1.511674975153026e-05, "loss": 0.4606, "num_input_tokens_seen": 72591552, "step": 59840 }, { "epoch": 6.664996102015815, "grad_norm": 0.0930408462882042, "learning_rate": 1.5112286319658431e-05, "loss": 0.4709, "num_input_tokens_seen": 72597504, "step": 59845 }, { "epoch": 6.6655529568994325, "grad_norm": 0.0887710377573967, "learning_rate": 1.510782326137612e-05, "loss": 0.4738, "num_input_tokens_seen": 72603488, "step": 59850 }, { "epoch": 6.666109811783049, "grad_norm": 0.10881336033344269, "learning_rate": 1.5103360576851943e-05, "loss": 0.457, "num_input_tokens_seen": 72609216, "step": 59855 }, { "epoch": 6.666666666666667, "grad_norm": 0.0991700142621994, "learning_rate": 1.5098898266254522e-05, "loss": 0.4724, "num_input_tokens_seen": 72614752, "step": 59860 }, { "epoch": 6.667223521550284, "grad_norm": 0.10072427988052368, "learning_rate": 1.5094436329752451e-05, "loss": 0.4619, "num_input_tokens_seen": 72620864, "step": 59865 }, { "epoch": 6.667780376433901, "grad_norm": 0.06631318479776382, "learning_rate": 1.508997476751432e-05, "loss": 0.4551, "num_input_tokens_seen": 72626944, "step": 59870 }, { "epoch": 6.668337231317519, "grad_norm": 0.08803743869066238, "learning_rate": 1.5085513579708704e-05, "loss": 0.4598, "num_input_tokens_seen": 72632960, "step": 59875 }, { "epoch": 6.668894086201136, "grad_norm": 0.09596794843673706, "learning_rate": 1.508105276650415e-05, "loss": 0.4627, "num_input_tokens_seen": 72638592, "step": 59880 }, { "epoch": 6.669450941084754, "grad_norm": 0.16633795201778412, "learning_rate": 1.5076592328069212e-05, "loss": 0.466, "num_input_tokens_seen": 72644128, "step": 59885 }, { "epoch": 6.67000779596837, "grad_norm": 0.10481413453817368, "learning_rate": 1.507213226457242e-05, "loss": 0.4696, "num_input_tokens_seen": 72650112, "step": 59890 }, { "epoch": 6.670564650851988, "grad_norm": 0.10629777610301971, "learning_rate": 1.5067672576182285e-05, "loss": 0.4777, "num_input_tokens_seen": 72656288, "step": 59895 }, { "epoch": 6.671121505735606, "grad_norm": 0.10965818911790848, "learning_rate": 1.506321326306731e-05, "loss": 0.4624, "num_input_tokens_seen": 72662496, "step": 59900 }, { "epoch": 6.6716783606192225, "grad_norm": 0.083075612783432, "learning_rate": 1.5058754325395985e-05, "loss": 0.4572, "num_input_tokens_seen": 72668928, "step": 59905 }, { "epoch": 6.67223521550284, "grad_norm": 0.08265887200832367, "learning_rate": 1.505429576333677e-05, "loss": 0.4677, "num_input_tokens_seen": 72675168, "step": 59910 }, { "epoch": 6.672792070386457, "grad_norm": 0.10127296298742294, "learning_rate": 1.504983757705814e-05, "loss": 0.4669, "num_input_tokens_seen": 72681280, "step": 59915 }, { "epoch": 6.673348925270075, "grad_norm": 0.11958396434783936, "learning_rate": 1.5045379766728534e-05, "loss": 0.4768, "num_input_tokens_seen": 72686848, "step": 59920 }, { "epoch": 6.673905780153692, "grad_norm": 0.11911527812480927, "learning_rate": 1.5040922332516382e-05, "loss": 0.4601, "num_input_tokens_seen": 72692864, "step": 59925 }, { "epoch": 6.674462635037309, "grad_norm": 0.09375909715890884, "learning_rate": 1.50364652745901e-05, "loss": 0.4575, "num_input_tokens_seen": 72698816, "step": 59930 }, { "epoch": 6.675019489920927, "grad_norm": 0.09150965511798859, "learning_rate": 1.503200859311809e-05, "loss": 0.462, "num_input_tokens_seen": 72704640, "step": 59935 }, { "epoch": 6.6755763448045435, "grad_norm": 0.09630994498729706, "learning_rate": 1.5027552288268738e-05, "loss": 0.4551, "num_input_tokens_seen": 72710560, "step": 59940 }, { "epoch": 6.676133199688161, "grad_norm": 0.1684056520462036, "learning_rate": 1.5023096360210415e-05, "loss": 0.4565, "num_input_tokens_seen": 72716704, "step": 59945 }, { "epoch": 6.676690054571779, "grad_norm": 0.08880573511123657, "learning_rate": 1.5018640809111489e-05, "loss": 0.4628, "num_input_tokens_seen": 72722944, "step": 59950 }, { "epoch": 6.677246909455396, "grad_norm": 0.08041997998952866, "learning_rate": 1.5014185635140298e-05, "loss": 0.4671, "num_input_tokens_seen": 72728288, "step": 59955 }, { "epoch": 6.677803764339013, "grad_norm": 0.13376091420650482, "learning_rate": 1.5009730838465175e-05, "loss": 0.4666, "num_input_tokens_seen": 72733984, "step": 59960 }, { "epoch": 6.67836061922263, "grad_norm": 0.13359656929969788, "learning_rate": 1.5005276419254439e-05, "loss": 0.4616, "num_input_tokens_seen": 72739648, "step": 59965 }, { "epoch": 6.678917474106248, "grad_norm": 0.08822212368249893, "learning_rate": 1.5000822377676388e-05, "loss": 0.4481, "num_input_tokens_seen": 72745184, "step": 59970 }, { "epoch": 6.6794743289898655, "grad_norm": 0.08403251320123672, "learning_rate": 1.4996368713899305e-05, "loss": 0.4711, "num_input_tokens_seen": 72751264, "step": 59975 }, { "epoch": 6.680031183873482, "grad_norm": 0.1144520565867424, "learning_rate": 1.499191542809148e-05, "loss": 0.4559, "num_input_tokens_seen": 72756832, "step": 59980 }, { "epoch": 6.6805880387571, "grad_norm": 0.11435490101575851, "learning_rate": 1.4987462520421159e-05, "loss": 0.4726, "num_input_tokens_seen": 72762784, "step": 59985 }, { "epoch": 6.681144893640717, "grad_norm": 0.09500028192996979, "learning_rate": 1.498300999105659e-05, "loss": 0.4516, "num_input_tokens_seen": 72768992, "step": 59990 }, { "epoch": 6.681701748524334, "grad_norm": 0.11690317839384079, "learning_rate": 1.4978557840166005e-05, "loss": 0.4528, "num_input_tokens_seen": 72775168, "step": 59995 }, { "epoch": 6.682258603407952, "grad_norm": 0.10941817611455917, "learning_rate": 1.497410606791762e-05, "loss": 0.4579, "num_input_tokens_seen": 72781184, "step": 60000 }, { "epoch": 6.682815458291569, "grad_norm": 0.13439062237739563, "learning_rate": 1.4969654674479638e-05, "loss": 0.4471, "num_input_tokens_seen": 72787136, "step": 60005 }, { "epoch": 6.6833723131751865, "grad_norm": 0.11174394190311432, "learning_rate": 1.4965203660020245e-05, "loss": 0.4575, "num_input_tokens_seen": 72793056, "step": 60010 }, { "epoch": 6.683929168058804, "grad_norm": 0.07483794540166855, "learning_rate": 1.4960753024707613e-05, "loss": 0.456, "num_input_tokens_seen": 72798880, "step": 60015 }, { "epoch": 6.684486022942421, "grad_norm": 0.10576987266540527, "learning_rate": 1.495630276870991e-05, "loss": 0.4585, "num_input_tokens_seen": 72804896, "step": 60020 }, { "epoch": 6.685042877826039, "grad_norm": 0.14106355607509613, "learning_rate": 1.4951852892195272e-05, "loss": 0.4577, "num_input_tokens_seen": 72810688, "step": 60025 }, { "epoch": 6.685599732709656, "grad_norm": 0.12389039993286133, "learning_rate": 1.4947403395331832e-05, "loss": 0.4721, "num_input_tokens_seen": 72817024, "step": 60030 }, { "epoch": 6.686156587593273, "grad_norm": 0.11007136851549149, "learning_rate": 1.494295427828771e-05, "loss": 0.4571, "num_input_tokens_seen": 72823296, "step": 60035 }, { "epoch": 6.686713442476891, "grad_norm": 0.10313943028450012, "learning_rate": 1.4938505541230998e-05, "loss": 0.4564, "num_input_tokens_seen": 72829216, "step": 60040 }, { "epoch": 6.687270297360508, "grad_norm": 0.15383663773536682, "learning_rate": 1.4934057184329798e-05, "loss": 0.4771, "num_input_tokens_seen": 72835424, "step": 60045 }, { "epoch": 6.687827152244125, "grad_norm": 0.11134816706180573, "learning_rate": 1.492960920775217e-05, "loss": 0.46, "num_input_tokens_seen": 72841536, "step": 60050 }, { "epoch": 6.688384007127743, "grad_norm": 0.15530669689178467, "learning_rate": 1.4925161611666183e-05, "loss": 0.4679, "num_input_tokens_seen": 72847904, "step": 60055 }, { "epoch": 6.68894086201136, "grad_norm": 0.11487819254398346, "learning_rate": 1.4920714396239876e-05, "loss": 0.4548, "num_input_tokens_seen": 72853824, "step": 60060 }, { "epoch": 6.689497716894977, "grad_norm": 0.12365347892045975, "learning_rate": 1.4916267561641279e-05, "loss": 0.4581, "num_input_tokens_seen": 72859776, "step": 60065 }, { "epoch": 6.690054571778594, "grad_norm": 0.10225009173154831, "learning_rate": 1.4911821108038401e-05, "loss": 0.4605, "num_input_tokens_seen": 72865984, "step": 60070 }, { "epoch": 6.690611426662212, "grad_norm": 0.0734567642211914, "learning_rate": 1.4907375035599259e-05, "loss": 0.466, "num_input_tokens_seen": 72871968, "step": 60075 }, { "epoch": 6.6911682815458295, "grad_norm": 0.08819475769996643, "learning_rate": 1.4902929344491829e-05, "loss": 0.4669, "num_input_tokens_seen": 72878048, "step": 60080 }, { "epoch": 6.691725136429446, "grad_norm": 0.10715585947036743, "learning_rate": 1.4898484034884089e-05, "loss": 0.4504, "num_input_tokens_seen": 72884128, "step": 60085 }, { "epoch": 6.692281991313064, "grad_norm": 0.11419211328029633, "learning_rate": 1.4894039106943991e-05, "loss": 0.4678, "num_input_tokens_seen": 72889920, "step": 60090 }, { "epoch": 6.692838846196681, "grad_norm": 0.09154988825321198, "learning_rate": 1.4889594560839484e-05, "loss": 0.4504, "num_input_tokens_seen": 72896096, "step": 60095 }, { "epoch": 6.6933957010802985, "grad_norm": 0.12786902487277985, "learning_rate": 1.4885150396738495e-05, "loss": 0.4558, "num_input_tokens_seen": 72902368, "step": 60100 }, { "epoch": 6.693952555963916, "grad_norm": 0.09652607142925262, "learning_rate": 1.4880706614808932e-05, "loss": 0.4603, "num_input_tokens_seen": 72908480, "step": 60105 }, { "epoch": 6.694509410847533, "grad_norm": 0.13120105862617493, "learning_rate": 1.4876263215218708e-05, "loss": 0.463, "num_input_tokens_seen": 72914304, "step": 60110 }, { "epoch": 6.695066265731151, "grad_norm": 0.10913349688053131, "learning_rate": 1.4871820198135703e-05, "loss": 0.4668, "num_input_tokens_seen": 72920512, "step": 60115 }, { "epoch": 6.695623120614767, "grad_norm": 0.1308669149875641, "learning_rate": 1.4867377563727789e-05, "loss": 0.4705, "num_input_tokens_seen": 72926880, "step": 60120 }, { "epoch": 6.696179975498385, "grad_norm": 0.09286624193191528, "learning_rate": 1.486293531216282e-05, "loss": 0.4636, "num_input_tokens_seen": 72933056, "step": 60125 }, { "epoch": 6.696736830382003, "grad_norm": 0.099715955555439, "learning_rate": 1.485849344360864e-05, "loss": 0.4675, "num_input_tokens_seen": 72939392, "step": 60130 }, { "epoch": 6.6972936852656195, "grad_norm": 0.09018509089946747, "learning_rate": 1.4854051958233079e-05, "loss": 0.4747, "num_input_tokens_seen": 72945536, "step": 60135 }, { "epoch": 6.697850540149237, "grad_norm": 0.17855028808116913, "learning_rate": 1.4849610856203946e-05, "loss": 0.4532, "num_input_tokens_seen": 72951872, "step": 60140 }, { "epoch": 6.698407395032854, "grad_norm": 0.11452510207891464, "learning_rate": 1.4845170137689049e-05, "loss": 0.4647, "num_input_tokens_seen": 72958144, "step": 60145 }, { "epoch": 6.698964249916472, "grad_norm": 0.08533701300621033, "learning_rate": 1.4840729802856162e-05, "loss": 0.4627, "num_input_tokens_seen": 72963968, "step": 60150 }, { "epoch": 6.699521104800089, "grad_norm": 0.11229466646909714, "learning_rate": 1.483628985187307e-05, "loss": 0.4624, "num_input_tokens_seen": 72969600, "step": 60155 }, { "epoch": 6.700077959683706, "grad_norm": 0.07864326983690262, "learning_rate": 1.4831850284907512e-05, "loss": 0.4665, "num_input_tokens_seen": 72975616, "step": 60160 }, { "epoch": 6.700634814567324, "grad_norm": 0.09919331967830658, "learning_rate": 1.4827411102127237e-05, "loss": 0.4638, "num_input_tokens_seen": 72981792, "step": 60165 }, { "epoch": 6.7011916694509415, "grad_norm": 0.09208492934703827, "learning_rate": 1.4822972303699967e-05, "loss": 0.4578, "num_input_tokens_seen": 72987072, "step": 60170 }, { "epoch": 6.701748524334558, "grad_norm": 0.11280877143144608, "learning_rate": 1.4818533889793418e-05, "loss": 0.458, "num_input_tokens_seen": 72993088, "step": 60175 }, { "epoch": 6.702305379218176, "grad_norm": 0.13599903881549835, "learning_rate": 1.4814095860575289e-05, "loss": 0.4538, "num_input_tokens_seen": 72999040, "step": 60180 }, { "epoch": 6.702862234101793, "grad_norm": 0.09188136458396912, "learning_rate": 1.4809658216213257e-05, "loss": 0.4691, "num_input_tokens_seen": 73005024, "step": 60185 }, { "epoch": 6.70341908898541, "grad_norm": 0.14565560221672058, "learning_rate": 1.4805220956874994e-05, "loss": 0.4557, "num_input_tokens_seen": 73010688, "step": 60190 }, { "epoch": 6.703975943869028, "grad_norm": 0.07206649333238602, "learning_rate": 1.4800784082728158e-05, "loss": 0.4671, "num_input_tokens_seen": 73016800, "step": 60195 }, { "epoch": 6.704532798752645, "grad_norm": 0.11592336744070053, "learning_rate": 1.4796347593940383e-05, "loss": 0.4732, "num_input_tokens_seen": 73022880, "step": 60200 }, { "epoch": 6.7050896536362625, "grad_norm": 0.11390113085508347, "learning_rate": 1.4791911490679295e-05, "loss": 0.4763, "num_input_tokens_seen": 73028736, "step": 60205 }, { "epoch": 6.70564650851988, "grad_norm": 0.16042689979076385, "learning_rate": 1.4787475773112502e-05, "loss": 0.4583, "num_input_tokens_seen": 73035040, "step": 60210 }, { "epoch": 6.706203363403497, "grad_norm": 0.10108782351016998, "learning_rate": 1.4783040441407605e-05, "loss": 0.4543, "num_input_tokens_seen": 73041376, "step": 60215 }, { "epoch": 6.706760218287115, "grad_norm": 0.08655032515525818, "learning_rate": 1.4778605495732178e-05, "loss": 0.4647, "num_input_tokens_seen": 73047136, "step": 60220 }, { "epoch": 6.7073170731707314, "grad_norm": 0.1092473566532135, "learning_rate": 1.4774170936253795e-05, "loss": 0.4491, "num_input_tokens_seen": 73053216, "step": 60225 }, { "epoch": 6.707873928054349, "grad_norm": 0.09935887157917023, "learning_rate": 1.4769736763140003e-05, "loss": 0.4648, "num_input_tokens_seen": 73059424, "step": 60230 }, { "epoch": 6.708430782937967, "grad_norm": 0.09243768453598022, "learning_rate": 1.4765302976558343e-05, "loss": 0.475, "num_input_tokens_seen": 73065216, "step": 60235 }, { "epoch": 6.708987637821584, "grad_norm": 0.13130266964435577, "learning_rate": 1.4760869576676334e-05, "loss": 0.4605, "num_input_tokens_seen": 73071136, "step": 60240 }, { "epoch": 6.709544492705201, "grad_norm": 0.0864761546254158, "learning_rate": 1.4756436563661485e-05, "loss": 0.4507, "num_input_tokens_seen": 73077184, "step": 60245 }, { "epoch": 6.710101347588818, "grad_norm": 0.10975034534931183, "learning_rate": 1.4752003937681284e-05, "loss": 0.4508, "num_input_tokens_seen": 73083488, "step": 60250 }, { "epoch": 6.710658202472436, "grad_norm": 0.1004970595240593, "learning_rate": 1.474757169890322e-05, "loss": 0.4756, "num_input_tokens_seen": 73089312, "step": 60255 }, { "epoch": 6.711215057356053, "grad_norm": 0.12680956721305847, "learning_rate": 1.4743139847494756e-05, "loss": 0.4617, "num_input_tokens_seen": 73095520, "step": 60260 }, { "epoch": 6.71177191223967, "grad_norm": 0.1135084256529808, "learning_rate": 1.4738708383623335e-05, "loss": 0.466, "num_input_tokens_seen": 73102016, "step": 60265 }, { "epoch": 6.712328767123288, "grad_norm": 0.1006380245089531, "learning_rate": 1.4734277307456396e-05, "loss": 0.4619, "num_input_tokens_seen": 73108000, "step": 60270 }, { "epoch": 6.712885622006905, "grad_norm": 0.15299318730831146, "learning_rate": 1.4729846619161359e-05, "loss": 0.4577, "num_input_tokens_seen": 73114176, "step": 60275 }, { "epoch": 6.713442476890522, "grad_norm": 0.09439162164926529, "learning_rate": 1.472541631890563e-05, "loss": 0.4591, "num_input_tokens_seen": 73120288, "step": 60280 }, { "epoch": 6.71399933177414, "grad_norm": 0.09429580718278885, "learning_rate": 1.4720986406856593e-05, "loss": 0.4543, "num_input_tokens_seen": 73126688, "step": 60285 }, { "epoch": 6.714556186657757, "grad_norm": 0.06834234297275543, "learning_rate": 1.4716556883181637e-05, "loss": 0.4615, "num_input_tokens_seen": 73132832, "step": 60290 }, { "epoch": 6.7151130415413745, "grad_norm": 0.09644879400730133, "learning_rate": 1.4712127748048116e-05, "loss": 0.465, "num_input_tokens_seen": 73138784, "step": 60295 }, { "epoch": 6.715669896424991, "grad_norm": 0.12732943892478943, "learning_rate": 1.4707699001623376e-05, "loss": 0.4666, "num_input_tokens_seen": 73144960, "step": 60300 }, { "epoch": 6.716226751308609, "grad_norm": 0.18522876501083374, "learning_rate": 1.470327064407475e-05, "loss": 0.4724, "num_input_tokens_seen": 73151328, "step": 60305 }, { "epoch": 6.716783606192227, "grad_norm": 0.14657089114189148, "learning_rate": 1.469884267556956e-05, "loss": 0.4584, "num_input_tokens_seen": 73157344, "step": 60310 }, { "epoch": 6.717340461075843, "grad_norm": 0.09327108412981033, "learning_rate": 1.4694415096275099e-05, "loss": 0.4593, "num_input_tokens_seen": 73163488, "step": 60315 }, { "epoch": 6.717897315959461, "grad_norm": 0.08529014140367508, "learning_rate": 1.4689987906358665e-05, "loss": 0.4663, "num_input_tokens_seen": 73169600, "step": 60320 }, { "epoch": 6.718454170843078, "grad_norm": 0.16193315386772156, "learning_rate": 1.468556110598753e-05, "loss": 0.4515, "num_input_tokens_seen": 73176096, "step": 60325 }, { "epoch": 6.7190110257266955, "grad_norm": 0.10562902688980103, "learning_rate": 1.4681134695328952e-05, "loss": 0.4633, "num_input_tokens_seen": 73181856, "step": 60330 }, { "epoch": 6.719567880610313, "grad_norm": 0.13950678706169128, "learning_rate": 1.4676708674550171e-05, "loss": 0.464, "num_input_tokens_seen": 73188032, "step": 60335 }, { "epoch": 6.72012473549393, "grad_norm": 0.07677099108695984, "learning_rate": 1.4672283043818419e-05, "loss": 0.4629, "num_input_tokens_seen": 73194144, "step": 60340 }, { "epoch": 6.720681590377548, "grad_norm": 0.09137002378702164, "learning_rate": 1.466785780330091e-05, "loss": 0.4668, "num_input_tokens_seen": 73200224, "step": 60345 }, { "epoch": 6.721238445261165, "grad_norm": 0.08401862531900406, "learning_rate": 1.466343295316484e-05, "loss": 0.4603, "num_input_tokens_seen": 73206144, "step": 60350 }, { "epoch": 6.721795300144782, "grad_norm": 0.10358969122171402, "learning_rate": 1.4659008493577403e-05, "loss": 0.4568, "num_input_tokens_seen": 73212032, "step": 60355 }, { "epoch": 6.7223521550284, "grad_norm": 0.1550324261188507, "learning_rate": 1.4654584424705767e-05, "loss": 0.4617, "num_input_tokens_seen": 73217920, "step": 60360 }, { "epoch": 6.722909009912017, "grad_norm": 0.11115793138742447, "learning_rate": 1.4650160746717085e-05, "loss": 0.4815, "num_input_tokens_seen": 73223904, "step": 60365 }, { "epoch": 6.723465864795634, "grad_norm": 0.10407984256744385, "learning_rate": 1.4645737459778492e-05, "loss": 0.4707, "num_input_tokens_seen": 73230016, "step": 60370 }, { "epoch": 6.724022719679252, "grad_norm": 0.11630207300186157, "learning_rate": 1.4641314564057124e-05, "loss": 0.466, "num_input_tokens_seen": 73235808, "step": 60375 }, { "epoch": 6.724579574562869, "grad_norm": 0.07408934831619263, "learning_rate": 1.463689205972008e-05, "loss": 0.4707, "num_input_tokens_seen": 73241696, "step": 60380 }, { "epoch": 6.725136429446486, "grad_norm": 0.09683050215244293, "learning_rate": 1.4632469946934474e-05, "loss": 0.4661, "num_input_tokens_seen": 73247808, "step": 60385 }, { "epoch": 6.725693284330104, "grad_norm": 0.1042870283126831, "learning_rate": 1.4628048225867374e-05, "loss": 0.4592, "num_input_tokens_seen": 73253952, "step": 60390 }, { "epoch": 6.726250139213721, "grad_norm": 0.10941222310066223, "learning_rate": 1.4623626896685852e-05, "loss": 0.4585, "num_input_tokens_seen": 73260224, "step": 60395 }, { "epoch": 6.7268069940973385, "grad_norm": 0.12430141866207123, "learning_rate": 1.4619205959556959e-05, "loss": 0.4594, "num_input_tokens_seen": 73266272, "step": 60400 }, { "epoch": 6.727363848980955, "grad_norm": 0.09972479939460754, "learning_rate": 1.461478541464773e-05, "loss": 0.4581, "num_input_tokens_seen": 73272512, "step": 60405 }, { "epoch": 6.727920703864573, "grad_norm": 0.10826625674962997, "learning_rate": 1.4610365262125183e-05, "loss": 0.4502, "num_input_tokens_seen": 73278656, "step": 60410 }, { "epoch": 6.728477558748191, "grad_norm": 0.10360641032457352, "learning_rate": 1.460594550215634e-05, "loss": 0.471, "num_input_tokens_seen": 73284672, "step": 60415 }, { "epoch": 6.729034413631807, "grad_norm": 0.0861947238445282, "learning_rate": 1.4601526134908183e-05, "loss": 0.4593, "num_input_tokens_seen": 73290848, "step": 60420 }, { "epoch": 6.729591268515425, "grad_norm": 0.13717517256736755, "learning_rate": 1.4597107160547694e-05, "loss": 0.4631, "num_input_tokens_seen": 73297056, "step": 60425 }, { "epoch": 6.730148123399042, "grad_norm": 0.10175886750221252, "learning_rate": 1.4592688579241825e-05, "loss": 0.4653, "num_input_tokens_seen": 73303136, "step": 60430 }, { "epoch": 6.73070497828266, "grad_norm": 0.09621193259954453, "learning_rate": 1.4588270391157549e-05, "loss": 0.4683, "num_input_tokens_seen": 73309280, "step": 60435 }, { "epoch": 6.731261833166277, "grad_norm": 0.10067980736494064, "learning_rate": 1.4583852596461772e-05, "loss": 0.4551, "num_input_tokens_seen": 73315616, "step": 60440 }, { "epoch": 6.731818688049894, "grad_norm": 0.10245377570390701, "learning_rate": 1.4579435195321434e-05, "loss": 0.4599, "num_input_tokens_seen": 73321696, "step": 60445 }, { "epoch": 6.732375542933512, "grad_norm": 0.08459997177124023, "learning_rate": 1.4575018187903417e-05, "loss": 0.4631, "num_input_tokens_seen": 73327488, "step": 60450 }, { "epoch": 6.7329323978171285, "grad_norm": 0.12723179161548615, "learning_rate": 1.4570601574374632e-05, "loss": 0.4572, "num_input_tokens_seen": 73333632, "step": 60455 }, { "epoch": 6.733489252700746, "grad_norm": 0.09553004056215286, "learning_rate": 1.4566185354901931e-05, "loss": 0.4629, "num_input_tokens_seen": 73339904, "step": 60460 }, { "epoch": 6.734046107584364, "grad_norm": 0.09701640158891678, "learning_rate": 1.4561769529652186e-05, "loss": 0.4644, "num_input_tokens_seen": 73346176, "step": 60465 }, { "epoch": 6.734602962467981, "grad_norm": 0.1546632945537567, "learning_rate": 1.4557354098792248e-05, "loss": 0.4595, "num_input_tokens_seen": 73352320, "step": 60470 }, { "epoch": 6.735159817351598, "grad_norm": 0.10219362378120422, "learning_rate": 1.455293906248893e-05, "loss": 0.4549, "num_input_tokens_seen": 73358336, "step": 60475 }, { "epoch": 6.735716672235215, "grad_norm": 0.09674075990915298, "learning_rate": 1.454852442090906e-05, "loss": 0.4646, "num_input_tokens_seen": 73364544, "step": 60480 }, { "epoch": 6.736273527118833, "grad_norm": 0.09463998675346375, "learning_rate": 1.4544110174219421e-05, "loss": 0.4638, "num_input_tokens_seen": 73370496, "step": 60485 }, { "epoch": 6.73683038200245, "grad_norm": 0.10013610124588013, "learning_rate": 1.4539696322586809e-05, "loss": 0.4556, "num_input_tokens_seen": 73376384, "step": 60490 }, { "epoch": 6.737387236886067, "grad_norm": 0.1249610036611557, "learning_rate": 1.4535282866178001e-05, "loss": 0.4581, "num_input_tokens_seen": 73382400, "step": 60495 }, { "epoch": 6.737944091769685, "grad_norm": 0.10913597047328949, "learning_rate": 1.4530869805159733e-05, "loss": 0.4672, "num_input_tokens_seen": 73388704, "step": 60500 }, { "epoch": 6.738500946653302, "grad_norm": 0.1088060513138771, "learning_rate": 1.4526457139698762e-05, "loss": 0.4611, "num_input_tokens_seen": 73394912, "step": 60505 }, { "epoch": 6.739057801536919, "grad_norm": 0.08579650521278381, "learning_rate": 1.4522044869961795e-05, "loss": 0.45, "num_input_tokens_seen": 73400960, "step": 60510 }, { "epoch": 6.739614656420537, "grad_norm": 0.10185687243938446, "learning_rate": 1.451763299611556e-05, "loss": 0.4803, "num_input_tokens_seen": 73407136, "step": 60515 }, { "epoch": 6.740171511304154, "grad_norm": 0.1060865968465805, "learning_rate": 1.4513221518326736e-05, "loss": 0.4603, "num_input_tokens_seen": 73412896, "step": 60520 }, { "epoch": 6.7407283661877715, "grad_norm": 0.15746286511421204, "learning_rate": 1.4508810436762007e-05, "loss": 0.4517, "num_input_tokens_seen": 73418720, "step": 60525 }, { "epoch": 6.741285221071389, "grad_norm": 0.1319829672574997, "learning_rate": 1.4504399751588052e-05, "loss": 0.4631, "num_input_tokens_seen": 73424544, "step": 60530 }, { "epoch": 6.741842075955006, "grad_norm": 0.07868485152721405, "learning_rate": 1.4499989462971497e-05, "loss": 0.4529, "num_input_tokens_seen": 73430400, "step": 60535 }, { "epoch": 6.742398930838624, "grad_norm": 0.11138658225536346, "learning_rate": 1.4495579571079001e-05, "loss": 0.4526, "num_input_tokens_seen": 73436320, "step": 60540 }, { "epoch": 6.74295578572224, "grad_norm": 0.07247963547706604, "learning_rate": 1.449117007607716e-05, "loss": 0.4601, "num_input_tokens_seen": 73442624, "step": 60545 }, { "epoch": 6.743512640605858, "grad_norm": 0.08775056153535843, "learning_rate": 1.4486760978132603e-05, "loss": 0.452, "num_input_tokens_seen": 73448672, "step": 60550 }, { "epoch": 6.744069495489476, "grad_norm": 0.07253383100032806, "learning_rate": 1.4482352277411899e-05, "loss": 0.4632, "num_input_tokens_seen": 73454880, "step": 60555 }, { "epoch": 6.7446263503730925, "grad_norm": 0.0707736387848854, "learning_rate": 1.4477943974081626e-05, "loss": 0.4643, "num_input_tokens_seen": 73460960, "step": 60560 }, { "epoch": 6.74518320525671, "grad_norm": 0.12580113112926483, "learning_rate": 1.4473536068308364e-05, "loss": 0.4647, "num_input_tokens_seen": 73467200, "step": 60565 }, { "epoch": 6.745740060140328, "grad_norm": 0.10385262966156006, "learning_rate": 1.4469128560258638e-05, "loss": 0.4577, "num_input_tokens_seen": 73473152, "step": 60570 }, { "epoch": 6.746296915023945, "grad_norm": 0.103598952293396, "learning_rate": 1.4464721450098978e-05, "loss": 0.4711, "num_input_tokens_seen": 73478720, "step": 60575 }, { "epoch": 6.746853769907562, "grad_norm": 0.1093609407544136, "learning_rate": 1.4460314737995912e-05, "loss": 0.4582, "num_input_tokens_seen": 73484992, "step": 60580 }, { "epoch": 6.747410624791179, "grad_norm": 0.11122103035449982, "learning_rate": 1.4455908424115919e-05, "loss": 0.4565, "num_input_tokens_seen": 73490912, "step": 60585 }, { "epoch": 6.747967479674797, "grad_norm": 0.07694301009178162, "learning_rate": 1.4451502508625508e-05, "loss": 0.462, "num_input_tokens_seen": 73497216, "step": 60590 }, { "epoch": 6.7485243345584145, "grad_norm": 0.1367677003145218, "learning_rate": 1.4447096991691125e-05, "loss": 0.4628, "num_input_tokens_seen": 73503328, "step": 60595 }, { "epoch": 6.749081189442031, "grad_norm": 0.08288644254207611, "learning_rate": 1.4442691873479247e-05, "loss": 0.4524, "num_input_tokens_seen": 73509216, "step": 60600 }, { "epoch": 6.749638044325649, "grad_norm": 0.11086706817150116, "learning_rate": 1.4438287154156293e-05, "loss": 0.4613, "num_input_tokens_seen": 73514592, "step": 60605 }, { "epoch": 6.750194899209266, "grad_norm": 0.12726955115795135, "learning_rate": 1.4433882833888698e-05, "loss": 0.473, "num_input_tokens_seen": 73520896, "step": 60610 }, { "epoch": 6.750751754092883, "grad_norm": 0.059341803193092346, "learning_rate": 1.442947891284288e-05, "loss": 0.4655, "num_input_tokens_seen": 73526976, "step": 60615 }, { "epoch": 6.751308608976501, "grad_norm": 0.10855893045663834, "learning_rate": 1.4425075391185211e-05, "loss": 0.4488, "num_input_tokens_seen": 73533280, "step": 60620 }, { "epoch": 6.751865463860118, "grad_norm": 0.13684135675430298, "learning_rate": 1.4420672269082097e-05, "loss": 0.4684, "num_input_tokens_seen": 73539488, "step": 60625 }, { "epoch": 6.7524223187437356, "grad_norm": 0.0864834412932396, "learning_rate": 1.4416269546699873e-05, "loss": 0.466, "num_input_tokens_seen": 73545216, "step": 60630 }, { "epoch": 6.752979173627352, "grad_norm": 0.10552404820919037, "learning_rate": 1.4411867224204916e-05, "loss": 0.4654, "num_input_tokens_seen": 73551328, "step": 60635 }, { "epoch": 6.75353602851097, "grad_norm": 0.10927605628967285, "learning_rate": 1.4407465301763534e-05, "loss": 0.4697, "num_input_tokens_seen": 73557344, "step": 60640 }, { "epoch": 6.754092883394588, "grad_norm": 0.0997730940580368, "learning_rate": 1.4403063779542061e-05, "loss": 0.4513, "num_input_tokens_seen": 73563392, "step": 60645 }, { "epoch": 6.7546497382782045, "grad_norm": 0.12380573898553848, "learning_rate": 1.4398662657706807e-05, "loss": 0.4614, "num_input_tokens_seen": 73569056, "step": 60650 }, { "epoch": 6.755206593161822, "grad_norm": 0.0925431028008461, "learning_rate": 1.4394261936424041e-05, "loss": 0.4649, "num_input_tokens_seen": 73575072, "step": 60655 }, { "epoch": 6.755763448045439, "grad_norm": 0.09718791395425797, "learning_rate": 1.4389861615860056e-05, "loss": 0.4714, "num_input_tokens_seen": 73581280, "step": 60660 }, { "epoch": 6.756320302929057, "grad_norm": 0.0985792949795723, "learning_rate": 1.4385461696181095e-05, "loss": 0.472, "num_input_tokens_seen": 73587584, "step": 60665 }, { "epoch": 6.756877157812674, "grad_norm": 0.09175055474042892, "learning_rate": 1.4381062177553403e-05, "loss": 0.472, "num_input_tokens_seen": 73593472, "step": 60670 }, { "epoch": 6.757434012696291, "grad_norm": 0.12229613959789276, "learning_rate": 1.4376663060143225e-05, "loss": 0.4593, "num_input_tokens_seen": 73599808, "step": 60675 }, { "epoch": 6.757990867579909, "grad_norm": 0.0712137520313263, "learning_rate": 1.4372264344116748e-05, "loss": 0.4625, "num_input_tokens_seen": 73606112, "step": 60680 }, { "epoch": 6.7585477224635255, "grad_norm": 0.09570417553186417, "learning_rate": 1.4367866029640192e-05, "loss": 0.465, "num_input_tokens_seen": 73612512, "step": 60685 }, { "epoch": 6.759104577347143, "grad_norm": 0.0874355211853981, "learning_rate": 1.4363468116879721e-05, "loss": 0.4629, "num_input_tokens_seen": 73618784, "step": 60690 }, { "epoch": 6.759661432230761, "grad_norm": 0.11284926533699036, "learning_rate": 1.4359070606001524e-05, "loss": 0.469, "num_input_tokens_seen": 73624736, "step": 60695 }, { "epoch": 6.760218287114378, "grad_norm": 0.11211909353733063, "learning_rate": 1.4354673497171726e-05, "loss": 0.4456, "num_input_tokens_seen": 73630912, "step": 60700 }, { "epoch": 6.760775141997995, "grad_norm": 0.1257680356502533, "learning_rate": 1.4350276790556476e-05, "loss": 0.4574, "num_input_tokens_seen": 73636960, "step": 60705 }, { "epoch": 6.761331996881613, "grad_norm": 0.09060271084308624, "learning_rate": 1.4345880486321911e-05, "loss": 0.4606, "num_input_tokens_seen": 73643040, "step": 60710 }, { "epoch": 6.76188885176523, "grad_norm": 0.11562850326299667, "learning_rate": 1.4341484584634115e-05, "loss": 0.4554, "num_input_tokens_seen": 73649216, "step": 60715 }, { "epoch": 6.7624457066488475, "grad_norm": 0.12701381742954254, "learning_rate": 1.4337089085659197e-05, "loss": 0.4692, "num_input_tokens_seen": 73654880, "step": 60720 }, { "epoch": 6.763002561532464, "grad_norm": 0.10646872222423553, "learning_rate": 1.4332693989563215e-05, "loss": 0.4739, "num_input_tokens_seen": 73660896, "step": 60725 }, { "epoch": 6.763559416416082, "grad_norm": 0.11941666901111603, "learning_rate": 1.4328299296512249e-05, "loss": 0.4664, "num_input_tokens_seen": 73666592, "step": 60730 }, { "epoch": 6.7641162712997, "grad_norm": 0.10587572306394577, "learning_rate": 1.4323905006672323e-05, "loss": 0.4649, "num_input_tokens_seen": 73672832, "step": 60735 }, { "epoch": 6.764673126183316, "grad_norm": 0.11784877628087997, "learning_rate": 1.431951112020948e-05, "loss": 0.4729, "num_input_tokens_seen": 73678816, "step": 60740 }, { "epoch": 6.765229981066934, "grad_norm": 0.09859409183263779, "learning_rate": 1.4315117637289744e-05, "loss": 0.4542, "num_input_tokens_seen": 73684672, "step": 60745 }, { "epoch": 6.765786835950552, "grad_norm": 0.11384277790784836, "learning_rate": 1.4310724558079096e-05, "loss": 0.4602, "num_input_tokens_seen": 73690944, "step": 60750 }, { "epoch": 6.7663436908341685, "grad_norm": 0.08035465329885483, "learning_rate": 1.430633188274354e-05, "loss": 0.4592, "num_input_tokens_seen": 73697152, "step": 60755 }, { "epoch": 6.766900545717786, "grad_norm": 0.08156789839267731, "learning_rate": 1.4301939611449022e-05, "loss": 0.4648, "num_input_tokens_seen": 73703200, "step": 60760 }, { "epoch": 6.767457400601403, "grad_norm": 0.11454825848340988, "learning_rate": 1.429754774436151e-05, "loss": 0.4615, "num_input_tokens_seen": 73709440, "step": 60765 }, { "epoch": 6.768014255485021, "grad_norm": 0.08714978396892548, "learning_rate": 1.4293156281646954e-05, "loss": 0.46, "num_input_tokens_seen": 73715520, "step": 60770 }, { "epoch": 6.768571110368638, "grad_norm": 0.13639548420906067, "learning_rate": 1.4288765223471256e-05, "loss": 0.457, "num_input_tokens_seen": 73721696, "step": 60775 }, { "epoch": 6.769127965252255, "grad_norm": 0.11127392202615738, "learning_rate": 1.4284374570000341e-05, "loss": 0.4666, "num_input_tokens_seen": 73727936, "step": 60780 }, { "epoch": 6.769684820135873, "grad_norm": 0.11201945692300797, "learning_rate": 1.4279984321400086e-05, "loss": 0.4476, "num_input_tokens_seen": 73734112, "step": 60785 }, { "epoch": 6.77024167501949, "grad_norm": 0.09399460256099701, "learning_rate": 1.4275594477836388e-05, "loss": 0.4614, "num_input_tokens_seen": 73740064, "step": 60790 }, { "epoch": 6.770798529903107, "grad_norm": 0.08386846631765366, "learning_rate": 1.4271205039475088e-05, "loss": 0.4601, "num_input_tokens_seen": 73746272, "step": 60795 }, { "epoch": 6.771355384786725, "grad_norm": 0.09932181984186172, "learning_rate": 1.4266816006482042e-05, "loss": 0.4697, "num_input_tokens_seen": 73752032, "step": 60800 }, { "epoch": 6.771912239670342, "grad_norm": 0.12533925473690033, "learning_rate": 1.4262427379023096e-05, "loss": 0.4691, "num_input_tokens_seen": 73758144, "step": 60805 }, { "epoch": 6.772469094553959, "grad_norm": 0.1547878384590149, "learning_rate": 1.4258039157264042e-05, "loss": 0.4652, "num_input_tokens_seen": 73763904, "step": 60810 }, { "epoch": 6.773025949437576, "grad_norm": 0.0964081659913063, "learning_rate": 1.4253651341370705e-05, "loss": 0.4688, "num_input_tokens_seen": 73769952, "step": 60815 }, { "epoch": 6.773582804321194, "grad_norm": 0.10313452780246735, "learning_rate": 1.4249263931508849e-05, "loss": 0.4661, "num_input_tokens_seen": 73776128, "step": 60820 }, { "epoch": 6.7741396592048115, "grad_norm": 0.12916232645511627, "learning_rate": 1.4244876927844262e-05, "loss": 0.4605, "num_input_tokens_seen": 73782336, "step": 60825 }, { "epoch": 6.774696514088428, "grad_norm": 0.13966691493988037, "learning_rate": 1.424049033054268e-05, "loss": 0.4717, "num_input_tokens_seen": 73788448, "step": 60830 }, { "epoch": 6.775253368972046, "grad_norm": 0.09722230583429337, "learning_rate": 1.4236104139769857e-05, "loss": 0.4526, "num_input_tokens_seen": 73794592, "step": 60835 }, { "epoch": 6.775810223855663, "grad_norm": 0.08960500359535217, "learning_rate": 1.423171835569152e-05, "loss": 0.4607, "num_input_tokens_seen": 73800704, "step": 60840 }, { "epoch": 6.7763670787392805, "grad_norm": 0.11780790239572525, "learning_rate": 1.4227332978473368e-05, "loss": 0.4532, "num_input_tokens_seen": 73806880, "step": 60845 }, { "epoch": 6.776923933622898, "grad_norm": 0.09268482029438019, "learning_rate": 1.4222948008281106e-05, "loss": 0.4619, "num_input_tokens_seen": 73812704, "step": 60850 }, { "epoch": 6.777480788506515, "grad_norm": 0.12339454889297485, "learning_rate": 1.4218563445280392e-05, "loss": 0.4609, "num_input_tokens_seen": 73818496, "step": 60855 }, { "epoch": 6.778037643390133, "grad_norm": 0.09801332652568817, "learning_rate": 1.4214179289636914e-05, "loss": 0.469, "num_input_tokens_seen": 73824032, "step": 60860 }, { "epoch": 6.778594498273749, "grad_norm": 0.08850251138210297, "learning_rate": 1.4209795541516302e-05, "loss": 0.4684, "num_input_tokens_seen": 73830208, "step": 60865 }, { "epoch": 6.779151353157367, "grad_norm": 0.09456199407577515, "learning_rate": 1.4205412201084188e-05, "loss": 0.4546, "num_input_tokens_seen": 73836448, "step": 60870 }, { "epoch": 6.779708208040985, "grad_norm": 0.10580138862133026, "learning_rate": 1.4201029268506205e-05, "loss": 0.4703, "num_input_tokens_seen": 73842592, "step": 60875 }, { "epoch": 6.7802650629246015, "grad_norm": 0.10656055063009262, "learning_rate": 1.4196646743947933e-05, "loss": 0.4622, "num_input_tokens_seen": 73848800, "step": 60880 }, { "epoch": 6.780821917808219, "grad_norm": 0.06708300113677979, "learning_rate": 1.419226462757498e-05, "loss": 0.4499, "num_input_tokens_seen": 73854624, "step": 60885 }, { "epoch": 6.781378772691837, "grad_norm": 0.08214335143566132, "learning_rate": 1.4187882919552892e-05, "loss": 0.4691, "num_input_tokens_seen": 73860512, "step": 60890 }, { "epoch": 6.781935627575454, "grad_norm": 0.0899149626493454, "learning_rate": 1.4183501620047238e-05, "loss": 0.466, "num_input_tokens_seen": 73866528, "step": 60895 }, { "epoch": 6.782492482459071, "grad_norm": 0.10875742882490158, "learning_rate": 1.4179120729223565e-05, "loss": 0.454, "num_input_tokens_seen": 73872800, "step": 60900 }, { "epoch": 6.783049337342688, "grad_norm": 0.1284153163433075, "learning_rate": 1.4174740247247378e-05, "loss": 0.4528, "num_input_tokens_seen": 73878880, "step": 60905 }, { "epoch": 6.783606192226306, "grad_norm": 0.12914584577083588, "learning_rate": 1.4170360174284206e-05, "loss": 0.4694, "num_input_tokens_seen": 73884544, "step": 60910 }, { "epoch": 6.7841630471099235, "grad_norm": 0.1122383400797844, "learning_rate": 1.4165980510499522e-05, "loss": 0.4623, "num_input_tokens_seen": 73890368, "step": 60915 }, { "epoch": 6.78471990199354, "grad_norm": 0.10401300340890884, "learning_rate": 1.4161601256058823e-05, "loss": 0.4603, "num_input_tokens_seen": 73896576, "step": 60920 }, { "epoch": 6.785276756877158, "grad_norm": 0.08887085318565369, "learning_rate": 1.4157222411127552e-05, "loss": 0.4622, "num_input_tokens_seen": 73902656, "step": 60925 }, { "epoch": 6.785833611760776, "grad_norm": 0.113311268389225, "learning_rate": 1.4152843975871167e-05, "loss": 0.4585, "num_input_tokens_seen": 73908992, "step": 60930 }, { "epoch": 6.786390466644392, "grad_norm": 0.09930825233459473, "learning_rate": 1.4148465950455109e-05, "loss": 0.4591, "num_input_tokens_seen": 73915232, "step": 60935 }, { "epoch": 6.78694732152801, "grad_norm": 0.12935034930706024, "learning_rate": 1.414408833504477e-05, "loss": 0.4664, "num_input_tokens_seen": 73921024, "step": 60940 }, { "epoch": 6.787504176411627, "grad_norm": 0.10543931275606155, "learning_rate": 1.4139711129805574e-05, "loss": 0.466, "num_input_tokens_seen": 73927104, "step": 60945 }, { "epoch": 6.7880610312952445, "grad_norm": 0.1179456114768982, "learning_rate": 1.4135334334902887e-05, "loss": 0.4554, "num_input_tokens_seen": 73932736, "step": 60950 }, { "epoch": 6.788617886178862, "grad_norm": 0.18215098977088928, "learning_rate": 1.4130957950502094e-05, "loss": 0.4628, "num_input_tokens_seen": 73939072, "step": 60955 }, { "epoch": 6.789174741062479, "grad_norm": 0.1579689234495163, "learning_rate": 1.4126581976768533e-05, "loss": 0.4656, "num_input_tokens_seen": 73945376, "step": 60960 }, { "epoch": 6.789731595946097, "grad_norm": 0.14990825951099396, "learning_rate": 1.4122206413867547e-05, "loss": 0.4719, "num_input_tokens_seen": 73951712, "step": 60965 }, { "epoch": 6.790288450829713, "grad_norm": 0.07459912449121475, "learning_rate": 1.4117831261964475e-05, "loss": 0.4573, "num_input_tokens_seen": 73957792, "step": 60970 }, { "epoch": 6.790845305713331, "grad_norm": 0.08891800791025162, "learning_rate": 1.41134565212246e-05, "loss": 0.4606, "num_input_tokens_seen": 73963968, "step": 60975 }, { "epoch": 6.791402160596949, "grad_norm": 0.15318499505519867, "learning_rate": 1.410908219181324e-05, "loss": 0.467, "num_input_tokens_seen": 73970048, "step": 60980 }, { "epoch": 6.791959015480566, "grad_norm": 0.08782734721899033, "learning_rate": 1.410470827389565e-05, "loss": 0.4578, "num_input_tokens_seen": 73976224, "step": 60985 }, { "epoch": 6.792515870364183, "grad_norm": 0.08362159878015518, "learning_rate": 1.410033476763709e-05, "loss": 0.4525, "num_input_tokens_seen": 73981984, "step": 60990 }, { "epoch": 6.7930727252478, "grad_norm": 0.09739388525485992, "learning_rate": 1.4095961673202823e-05, "loss": 0.4515, "num_input_tokens_seen": 73988032, "step": 60995 }, { "epoch": 6.793629580131418, "grad_norm": 0.11453450471162796, "learning_rate": 1.4091588990758056e-05, "loss": 0.4605, "num_input_tokens_seen": 73993856, "step": 61000 }, { "epoch": 6.794186435015035, "grad_norm": 0.08606167882680893, "learning_rate": 1.4087216720468027e-05, "loss": 0.474, "num_input_tokens_seen": 74000000, "step": 61005 }, { "epoch": 6.794743289898652, "grad_norm": 0.12180667370557785, "learning_rate": 1.408284486249791e-05, "loss": 0.4701, "num_input_tokens_seen": 74005760, "step": 61010 }, { "epoch": 6.79530014478227, "grad_norm": 0.0965188667178154, "learning_rate": 1.4078473417012899e-05, "loss": 0.4607, "num_input_tokens_seen": 74011680, "step": 61015 }, { "epoch": 6.795856999665887, "grad_norm": 0.12357567995786667, "learning_rate": 1.4074102384178173e-05, "loss": 0.4623, "num_input_tokens_seen": 74017888, "step": 61020 }, { "epoch": 6.796413854549504, "grad_norm": 0.1112113893032074, "learning_rate": 1.4069731764158866e-05, "loss": 0.4606, "num_input_tokens_seen": 74024256, "step": 61025 }, { "epoch": 6.796970709433122, "grad_norm": 0.10313459485769272, "learning_rate": 1.4065361557120126e-05, "loss": 0.4523, "num_input_tokens_seen": 74030464, "step": 61030 }, { "epoch": 6.797527564316739, "grad_norm": 0.0861789733171463, "learning_rate": 1.406099176322706e-05, "loss": 0.4655, "num_input_tokens_seen": 74036000, "step": 61035 }, { "epoch": 6.798084419200356, "grad_norm": 0.14490368962287903, "learning_rate": 1.4056622382644791e-05, "loss": 0.4513, "num_input_tokens_seen": 74042208, "step": 61040 }, { "epoch": 6.798641274083973, "grad_norm": 0.08494346588850021, "learning_rate": 1.4052253415538388e-05, "loss": 0.4604, "num_input_tokens_seen": 74047456, "step": 61045 }, { "epoch": 6.799198128967591, "grad_norm": 0.1010574996471405, "learning_rate": 1.4047884862072935e-05, "loss": 0.4553, "num_input_tokens_seen": 74053472, "step": 61050 }, { "epoch": 6.799754983851209, "grad_norm": 0.08900297433137894, "learning_rate": 1.40435167224135e-05, "loss": 0.461, "num_input_tokens_seen": 74059616, "step": 61055 }, { "epoch": 6.800311838734825, "grad_norm": 0.09997142106294632, "learning_rate": 1.4039148996725105e-05, "loss": 0.4654, "num_input_tokens_seen": 74065952, "step": 61060 }, { "epoch": 6.800868693618443, "grad_norm": 0.12948554754257202, "learning_rate": 1.4034781685172796e-05, "loss": 0.4601, "num_input_tokens_seen": 74071776, "step": 61065 }, { "epoch": 6.801425548502061, "grad_norm": 0.0984216257929802, "learning_rate": 1.4030414787921564e-05, "loss": 0.4613, "num_input_tokens_seen": 74077984, "step": 61070 }, { "epoch": 6.8019824033856775, "grad_norm": 0.08109171688556671, "learning_rate": 1.4026048305136417e-05, "loss": 0.4628, "num_input_tokens_seen": 74084160, "step": 61075 }, { "epoch": 6.802539258269295, "grad_norm": 0.09469068795442581, "learning_rate": 1.4021682236982342e-05, "loss": 0.4622, "num_input_tokens_seen": 74090464, "step": 61080 }, { "epoch": 6.803096113152913, "grad_norm": 0.08698159456253052, "learning_rate": 1.4017316583624285e-05, "loss": 0.4666, "num_input_tokens_seen": 74096544, "step": 61085 }, { "epoch": 6.80365296803653, "grad_norm": 0.10672274231910706, "learning_rate": 1.401295134522721e-05, "loss": 0.4624, "num_input_tokens_seen": 74102656, "step": 61090 }, { "epoch": 6.804209822920147, "grad_norm": 0.09060096740722656, "learning_rate": 1.4008586521956036e-05, "loss": 0.4616, "num_input_tokens_seen": 74108640, "step": 61095 }, { "epoch": 6.804766677803764, "grad_norm": 0.10101357847452164, "learning_rate": 1.4004222113975695e-05, "loss": 0.4529, "num_input_tokens_seen": 74114592, "step": 61100 }, { "epoch": 6.805323532687382, "grad_norm": 0.09117688238620758, "learning_rate": 1.399985812145107e-05, "loss": 0.4717, "num_input_tokens_seen": 74120768, "step": 61105 }, { "epoch": 6.8058803875709994, "grad_norm": 0.11466404050588608, "learning_rate": 1.3995494544547059e-05, "loss": 0.4529, "num_input_tokens_seen": 74126656, "step": 61110 }, { "epoch": 6.806437242454616, "grad_norm": 0.10516997426748276, "learning_rate": 1.3991131383428537e-05, "loss": 0.4579, "num_input_tokens_seen": 74132832, "step": 61115 }, { "epoch": 6.806994097338234, "grad_norm": 0.1279539167881012, "learning_rate": 1.3986768638260344e-05, "loss": 0.4708, "num_input_tokens_seen": 74139008, "step": 61120 }, { "epoch": 6.807550952221851, "grad_norm": 0.09544161707162857, "learning_rate": 1.3982406309207333e-05, "loss": 0.4656, "num_input_tokens_seen": 74144864, "step": 61125 }, { "epoch": 6.808107807105468, "grad_norm": 0.08603454381227493, "learning_rate": 1.3978044396434309e-05, "loss": 0.4638, "num_input_tokens_seen": 74150080, "step": 61130 }, { "epoch": 6.808664661989086, "grad_norm": 0.0897713452577591, "learning_rate": 1.39736829001061e-05, "loss": 0.4526, "num_input_tokens_seen": 74156256, "step": 61135 }, { "epoch": 6.809221516872703, "grad_norm": 0.08678455650806427, "learning_rate": 1.3969321820387477e-05, "loss": 0.4669, "num_input_tokens_seen": 74162400, "step": 61140 }, { "epoch": 6.8097783717563205, "grad_norm": 0.11979357898235321, "learning_rate": 1.3964961157443226e-05, "loss": 0.458, "num_input_tokens_seen": 74168480, "step": 61145 }, { "epoch": 6.810335226639937, "grad_norm": 0.09899863600730896, "learning_rate": 1.3960600911438115e-05, "loss": 0.4631, "num_input_tokens_seen": 74174464, "step": 61150 }, { "epoch": 6.810892081523555, "grad_norm": 0.10783441364765167, "learning_rate": 1.395624108253687e-05, "loss": 0.4573, "num_input_tokens_seen": 74180480, "step": 61155 }, { "epoch": 6.811448936407173, "grad_norm": 0.12350985407829285, "learning_rate": 1.3951881670904237e-05, "loss": 0.4662, "num_input_tokens_seen": 74186656, "step": 61160 }, { "epoch": 6.812005791290789, "grad_norm": 0.08922924101352692, "learning_rate": 1.3947522676704911e-05, "loss": 0.4663, "num_input_tokens_seen": 74192480, "step": 61165 }, { "epoch": 6.812562646174407, "grad_norm": 0.09435378015041351, "learning_rate": 1.3943164100103598e-05, "loss": 0.4495, "num_input_tokens_seen": 74198816, "step": 61170 }, { "epoch": 6.813119501058024, "grad_norm": 0.08378394693136215, "learning_rate": 1.3938805941264988e-05, "loss": 0.4606, "num_input_tokens_seen": 74204736, "step": 61175 }, { "epoch": 6.813676355941642, "grad_norm": 0.09415678679943085, "learning_rate": 1.3934448200353727e-05, "loss": 0.4589, "num_input_tokens_seen": 74210912, "step": 61180 }, { "epoch": 6.814233210825259, "grad_norm": 0.08512681722640991, "learning_rate": 1.3930090877534485e-05, "loss": 0.446, "num_input_tokens_seen": 74216992, "step": 61185 }, { "epoch": 6.814790065708876, "grad_norm": 0.13583923876285553, "learning_rate": 1.3925733972971872e-05, "loss": 0.472, "num_input_tokens_seen": 74223168, "step": 61190 }, { "epoch": 6.815346920592494, "grad_norm": 0.08192814141511917, "learning_rate": 1.392137748683053e-05, "loss": 0.4574, "num_input_tokens_seen": 74229440, "step": 61195 }, { "epoch": 6.8159037754761105, "grad_norm": 0.13498617708683014, "learning_rate": 1.3917021419275042e-05, "loss": 0.4649, "num_input_tokens_seen": 74235168, "step": 61200 }, { "epoch": 6.816460630359728, "grad_norm": 0.13114649057388306, "learning_rate": 1.391266577047e-05, "loss": 0.457, "num_input_tokens_seen": 74241376, "step": 61205 }, { "epoch": 6.817017485243346, "grad_norm": 0.13420367240905762, "learning_rate": 1.3908310540579989e-05, "loss": 0.4691, "num_input_tokens_seen": 74247808, "step": 61210 }, { "epoch": 6.817574340126963, "grad_norm": 0.10418165475130081, "learning_rate": 1.3903955729769536e-05, "loss": 0.4669, "num_input_tokens_seen": 74253760, "step": 61215 }, { "epoch": 6.81813119501058, "grad_norm": 0.0893084779381752, "learning_rate": 1.3899601338203208e-05, "loss": 0.4631, "num_input_tokens_seen": 74259840, "step": 61220 }, { "epoch": 6.818688049894197, "grad_norm": 0.10595329850912094, "learning_rate": 1.3895247366045506e-05, "loss": 0.4608, "num_input_tokens_seen": 74265856, "step": 61225 }, { "epoch": 6.819244904777815, "grad_norm": 0.09658104926347733, "learning_rate": 1.389089381346095e-05, "loss": 0.4637, "num_input_tokens_seen": 74272064, "step": 61230 }, { "epoch": 6.819801759661432, "grad_norm": 0.10613445192575455, "learning_rate": 1.3886540680614019e-05, "loss": 0.4647, "num_input_tokens_seen": 74278240, "step": 61235 }, { "epoch": 6.820358614545049, "grad_norm": 0.07959152013063431, "learning_rate": 1.3882187967669197e-05, "loss": 0.4656, "num_input_tokens_seen": 74284096, "step": 61240 }, { "epoch": 6.820915469428667, "grad_norm": 0.08455956727266312, "learning_rate": 1.387783567479095e-05, "loss": 0.4561, "num_input_tokens_seen": 74289984, "step": 61245 }, { "epoch": 6.821472324312285, "grad_norm": 0.0928008034825325, "learning_rate": 1.3873483802143708e-05, "loss": 0.4546, "num_input_tokens_seen": 74295616, "step": 61250 }, { "epoch": 6.822029179195901, "grad_norm": 0.11955472826957703, "learning_rate": 1.386913234989191e-05, "loss": 0.4652, "num_input_tokens_seen": 74301984, "step": 61255 }, { "epoch": 6.822586034079519, "grad_norm": 0.09236344695091248, "learning_rate": 1.3864781318199957e-05, "loss": 0.4732, "num_input_tokens_seen": 74308096, "step": 61260 }, { "epoch": 6.823142888963137, "grad_norm": 0.10095903277397156, "learning_rate": 1.3860430707232257e-05, "loss": 0.4566, "num_input_tokens_seen": 74314112, "step": 61265 }, { "epoch": 6.8236997438467535, "grad_norm": 0.09432666748762131, "learning_rate": 1.3856080517153175e-05, "loss": 0.4613, "num_input_tokens_seen": 74319936, "step": 61270 }, { "epoch": 6.824256598730371, "grad_norm": 0.14837577939033508, "learning_rate": 1.3851730748127084e-05, "loss": 0.4536, "num_input_tokens_seen": 74326176, "step": 61275 }, { "epoch": 6.824813453613988, "grad_norm": 0.09030099958181381, "learning_rate": 1.384738140031834e-05, "loss": 0.4624, "num_input_tokens_seen": 74332320, "step": 61280 }, { "epoch": 6.825370308497606, "grad_norm": 0.12078631669282913, "learning_rate": 1.384303247389126e-05, "loss": 0.4489, "num_input_tokens_seen": 74338592, "step": 61285 }, { "epoch": 6.825927163381223, "grad_norm": 0.11349311470985413, "learning_rate": 1.3838683969010174e-05, "loss": 0.4678, "num_input_tokens_seen": 74344736, "step": 61290 }, { "epoch": 6.82648401826484, "grad_norm": 0.0976734384894371, "learning_rate": 1.3834335885839367e-05, "loss": 0.4537, "num_input_tokens_seen": 74350752, "step": 61295 }, { "epoch": 6.827040873148458, "grad_norm": 0.10158183425664902, "learning_rate": 1.3829988224543134e-05, "loss": 0.4571, "num_input_tokens_seen": 74356672, "step": 61300 }, { "epoch": 6.8275977280320745, "grad_norm": 0.11428140103816986, "learning_rate": 1.3825640985285748e-05, "loss": 0.4768, "num_input_tokens_seen": 74362848, "step": 61305 }, { "epoch": 6.828154582915692, "grad_norm": 0.11418931186199188, "learning_rate": 1.3821294168231446e-05, "loss": 0.4571, "num_input_tokens_seen": 74369184, "step": 61310 }, { "epoch": 6.82871143779931, "grad_norm": 0.12148363888263702, "learning_rate": 1.3816947773544486e-05, "loss": 0.4582, "num_input_tokens_seen": 74375392, "step": 61315 }, { "epoch": 6.829268292682927, "grad_norm": 0.09429563581943512, "learning_rate": 1.3812601801389069e-05, "loss": 0.4628, "num_input_tokens_seen": 74381536, "step": 61320 }, { "epoch": 6.829825147566544, "grad_norm": 0.14583082497119904, "learning_rate": 1.3808256251929416e-05, "loss": 0.4447, "num_input_tokens_seen": 74387520, "step": 61325 }, { "epoch": 6.830382002450161, "grad_norm": 0.13355609774589539, "learning_rate": 1.3803911125329699e-05, "loss": 0.467, "num_input_tokens_seen": 74393472, "step": 61330 }, { "epoch": 6.830938857333779, "grad_norm": 0.08903082460165024, "learning_rate": 1.3799566421754096e-05, "loss": 0.4688, "num_input_tokens_seen": 74399776, "step": 61335 }, { "epoch": 6.8314957122173965, "grad_norm": 0.08471066504716873, "learning_rate": 1.3795222141366778e-05, "loss": 0.4531, "num_input_tokens_seen": 74405920, "step": 61340 }, { "epoch": 6.832052567101013, "grad_norm": 0.12955893576145172, "learning_rate": 1.3790878284331866e-05, "loss": 0.4635, "num_input_tokens_seen": 74411904, "step": 61345 }, { "epoch": 6.832609421984631, "grad_norm": 0.16188345849514008, "learning_rate": 1.3786534850813503e-05, "loss": 0.4548, "num_input_tokens_seen": 74417888, "step": 61350 }, { "epoch": 6.833166276868248, "grad_norm": 0.09422072768211365, "learning_rate": 1.3782191840975781e-05, "loss": 0.4708, "num_input_tokens_seen": 74424256, "step": 61355 }, { "epoch": 6.833723131751865, "grad_norm": 0.11647117882966995, "learning_rate": 1.3777849254982809e-05, "loss": 0.4587, "num_input_tokens_seen": 74429984, "step": 61360 }, { "epoch": 6.834279986635483, "grad_norm": 0.09739051014184952, "learning_rate": 1.3773507092998644e-05, "loss": 0.4609, "num_input_tokens_seen": 74435648, "step": 61365 }, { "epoch": 6.8348368415191, "grad_norm": 0.08124256134033203, "learning_rate": 1.3769165355187359e-05, "loss": 0.4697, "num_input_tokens_seen": 74441536, "step": 61370 }, { "epoch": 6.8353936964027175, "grad_norm": 0.07769101858139038, "learning_rate": 1.376482404171301e-05, "loss": 0.4611, "num_input_tokens_seen": 74447712, "step": 61375 }, { "epoch": 6.835950551286334, "grad_norm": 0.07552258670330048, "learning_rate": 1.37604831527396e-05, "loss": 0.4636, "num_input_tokens_seen": 74453856, "step": 61380 }, { "epoch": 6.836507406169952, "grad_norm": 0.08192585408687592, "learning_rate": 1.3756142688431168e-05, "loss": 0.4539, "num_input_tokens_seen": 74459904, "step": 61385 }, { "epoch": 6.83706426105357, "grad_norm": 0.08895042538642883, "learning_rate": 1.3751802648951695e-05, "loss": 0.4584, "num_input_tokens_seen": 74466048, "step": 61390 }, { "epoch": 6.8376211159371865, "grad_norm": 0.10938029736280441, "learning_rate": 1.3747463034465158e-05, "loss": 0.4573, "num_input_tokens_seen": 74472384, "step": 61395 }, { "epoch": 6.838177970820804, "grad_norm": 0.12010690569877625, "learning_rate": 1.3743123845135536e-05, "loss": 0.4549, "num_input_tokens_seen": 74478432, "step": 61400 }, { "epoch": 6.838734825704422, "grad_norm": 0.14446285367012024, "learning_rate": 1.373878508112676e-05, "loss": 0.4703, "num_input_tokens_seen": 74484480, "step": 61405 }, { "epoch": 6.839291680588039, "grad_norm": 0.10491406172513962, "learning_rate": 1.373444674260278e-05, "loss": 0.4637, "num_input_tokens_seen": 74490176, "step": 61410 }, { "epoch": 6.839848535471656, "grad_norm": 0.09216684848070145, "learning_rate": 1.3730108829727498e-05, "loss": 0.4514, "num_input_tokens_seen": 74496576, "step": 61415 }, { "epoch": 6.840405390355273, "grad_norm": 0.08543521910905838, "learning_rate": 1.3725771342664818e-05, "loss": 0.4636, "num_input_tokens_seen": 74502304, "step": 61420 }, { "epoch": 6.840962245238891, "grad_norm": 0.11679035425186157, "learning_rate": 1.3721434281578638e-05, "loss": 0.4498, "num_input_tokens_seen": 74508704, "step": 61425 }, { "epoch": 6.841519100122508, "grad_norm": 0.10379351675510406, "learning_rate": 1.3717097646632804e-05, "loss": 0.475, "num_input_tokens_seen": 74515072, "step": 61430 }, { "epoch": 6.842075955006125, "grad_norm": 0.18310166895389557, "learning_rate": 1.371276143799119e-05, "loss": 0.4512, "num_input_tokens_seen": 74521056, "step": 61435 }, { "epoch": 6.842632809889743, "grad_norm": 0.13101939857006073, "learning_rate": 1.3708425655817608e-05, "loss": 0.479, "num_input_tokens_seen": 74527104, "step": 61440 }, { "epoch": 6.8431896647733605, "grad_norm": 0.0886971652507782, "learning_rate": 1.3704090300275893e-05, "loss": 0.4556, "num_input_tokens_seen": 74533024, "step": 61445 }, { "epoch": 6.843746519656977, "grad_norm": 0.10846685618162155, "learning_rate": 1.3699755371529854e-05, "loss": 0.4573, "num_input_tokens_seen": 74539200, "step": 61450 }, { "epoch": 6.844303374540595, "grad_norm": 0.1056598499417305, "learning_rate": 1.3695420869743262e-05, "loss": 0.4706, "num_input_tokens_seen": 74545376, "step": 61455 }, { "epoch": 6.844860229424212, "grad_norm": 0.11680193245410919, "learning_rate": 1.3691086795079905e-05, "loss": 0.4665, "num_input_tokens_seen": 74551552, "step": 61460 }, { "epoch": 6.8454170843078295, "grad_norm": 0.1251169890165329, "learning_rate": 1.3686753147703524e-05, "loss": 0.4484, "num_input_tokens_seen": 74557920, "step": 61465 }, { "epoch": 6.845973939191447, "grad_norm": 0.16531352698802948, "learning_rate": 1.368241992777787e-05, "loss": 0.4572, "num_input_tokens_seen": 74564384, "step": 61470 }, { "epoch": 6.846530794075064, "grad_norm": 0.09462004154920578, "learning_rate": 1.367808713546665e-05, "loss": 0.463, "num_input_tokens_seen": 74570560, "step": 61475 }, { "epoch": 6.847087648958682, "grad_norm": 0.09596160799264908, "learning_rate": 1.3673754770933583e-05, "loss": 0.4688, "num_input_tokens_seen": 74576768, "step": 61480 }, { "epoch": 6.847644503842298, "grad_norm": 0.12298055738210678, "learning_rate": 1.366942283434237e-05, "loss": 0.4603, "num_input_tokens_seen": 74583136, "step": 61485 }, { "epoch": 6.848201358725916, "grad_norm": 0.1010669395327568, "learning_rate": 1.3665091325856657e-05, "loss": 0.4694, "num_input_tokens_seen": 74589440, "step": 61490 }, { "epoch": 6.848758213609534, "grad_norm": 0.11631124466657639, "learning_rate": 1.3660760245640131e-05, "loss": 0.4701, "num_input_tokens_seen": 74595744, "step": 61495 }, { "epoch": 6.8493150684931505, "grad_norm": 0.11884037405252457, "learning_rate": 1.3656429593856412e-05, "loss": 0.4587, "num_input_tokens_seen": 74601888, "step": 61500 }, { "epoch": 6.849871923376768, "grad_norm": 0.09552126377820969, "learning_rate": 1.3652099370669148e-05, "loss": 0.4552, "num_input_tokens_seen": 74607840, "step": 61505 }, { "epoch": 6.850428778260385, "grad_norm": 0.08829651027917862, "learning_rate": 1.364776957624192e-05, "loss": 0.4579, "num_input_tokens_seen": 74614112, "step": 61510 }, { "epoch": 6.850985633144003, "grad_norm": 0.0876598209142685, "learning_rate": 1.364344021073834e-05, "loss": 0.4665, "num_input_tokens_seen": 74620480, "step": 61515 }, { "epoch": 6.85154248802762, "grad_norm": 0.10029778629541397, "learning_rate": 1.3639111274321992e-05, "loss": 0.4622, "num_input_tokens_seen": 74626784, "step": 61520 }, { "epoch": 6.852099342911237, "grad_norm": 0.1087588369846344, "learning_rate": 1.3634782767156418e-05, "loss": 0.4738, "num_input_tokens_seen": 74633184, "step": 61525 }, { "epoch": 6.852656197794855, "grad_norm": 0.09076397866010666, "learning_rate": 1.3630454689405185e-05, "loss": 0.4578, "num_input_tokens_seen": 74639200, "step": 61530 }, { "epoch": 6.853213052678472, "grad_norm": 0.11248684674501419, "learning_rate": 1.3626127041231796e-05, "loss": 0.4544, "num_input_tokens_seen": 74645120, "step": 61535 }, { "epoch": 6.853769907562089, "grad_norm": 0.10857625305652618, "learning_rate": 1.3621799822799788e-05, "loss": 0.4608, "num_input_tokens_seen": 74650880, "step": 61540 }, { "epoch": 6.854326762445707, "grad_norm": 0.08941783010959625, "learning_rate": 1.3617473034272638e-05, "loss": 0.463, "num_input_tokens_seen": 74657408, "step": 61545 }, { "epoch": 6.854883617329324, "grad_norm": 0.09306192398071289, "learning_rate": 1.3613146675813832e-05, "loss": 0.4577, "num_input_tokens_seen": 74663424, "step": 61550 }, { "epoch": 6.855440472212941, "grad_norm": 0.09849811345338821, "learning_rate": 1.3608820747586843e-05, "loss": 0.4651, "num_input_tokens_seen": 74669696, "step": 61555 }, { "epoch": 6.855997327096558, "grad_norm": 0.1118093878030777, "learning_rate": 1.3604495249755106e-05, "loss": 0.4804, "num_input_tokens_seen": 74675968, "step": 61560 }, { "epoch": 6.856554181980176, "grad_norm": 0.11432943493127823, "learning_rate": 1.3600170182482065e-05, "loss": 0.467, "num_input_tokens_seen": 74682496, "step": 61565 }, { "epoch": 6.8571110368637935, "grad_norm": 0.11254577338695526, "learning_rate": 1.3595845545931116e-05, "loss": 0.4591, "num_input_tokens_seen": 74688800, "step": 61570 }, { "epoch": 6.85766789174741, "grad_norm": 0.10371730476617813, "learning_rate": 1.3591521340265667e-05, "loss": 0.4557, "num_input_tokens_seen": 74695008, "step": 61575 }, { "epoch": 6.858224746631028, "grad_norm": 0.0773141086101532, "learning_rate": 1.3587197565649118e-05, "loss": 0.4639, "num_input_tokens_seen": 74701056, "step": 61580 }, { "epoch": 6.858781601514646, "grad_norm": 0.09764096885919571, "learning_rate": 1.3582874222244803e-05, "loss": 0.4467, "num_input_tokens_seen": 74707040, "step": 61585 }, { "epoch": 6.8593384563982625, "grad_norm": 0.12570331990718842, "learning_rate": 1.35785513102161e-05, "loss": 0.4627, "num_input_tokens_seen": 74712928, "step": 61590 }, { "epoch": 6.85989531128188, "grad_norm": 0.11604420840740204, "learning_rate": 1.357422882972632e-05, "loss": 0.4674, "num_input_tokens_seen": 74719168, "step": 61595 }, { "epoch": 6.860452166165497, "grad_norm": 0.11722815781831741, "learning_rate": 1.3569906780938802e-05, "loss": 0.4683, "num_input_tokens_seen": 74725120, "step": 61600 }, { "epoch": 6.861009021049115, "grad_norm": 0.10809540003538132, "learning_rate": 1.3565585164016826e-05, "loss": 0.4587, "num_input_tokens_seen": 74731200, "step": 61605 }, { "epoch": 6.861565875932732, "grad_norm": 0.08685470372438431, "learning_rate": 1.3561263979123684e-05, "loss": 0.4595, "num_input_tokens_seen": 74737504, "step": 61610 }, { "epoch": 6.862122730816349, "grad_norm": 0.1022530123591423, "learning_rate": 1.3556943226422659e-05, "loss": 0.467, "num_input_tokens_seen": 74743520, "step": 61615 }, { "epoch": 6.862679585699967, "grad_norm": 0.10215963423252106, "learning_rate": 1.3552622906076976e-05, "loss": 0.4636, "num_input_tokens_seen": 74749952, "step": 61620 }, { "epoch": 6.863236440583584, "grad_norm": 0.10272814333438873, "learning_rate": 1.3548303018249898e-05, "loss": 0.4523, "num_input_tokens_seen": 74756064, "step": 61625 }, { "epoch": 6.863793295467201, "grad_norm": 0.11066509038209915, "learning_rate": 1.3543983563104618e-05, "loss": 0.4584, "num_input_tokens_seen": 74761696, "step": 61630 }, { "epoch": 6.864350150350819, "grad_norm": 0.114500992000103, "learning_rate": 1.3539664540804358e-05, "loss": 0.4588, "num_input_tokens_seen": 74767936, "step": 61635 }, { "epoch": 6.864907005234436, "grad_norm": 0.09315742552280426, "learning_rate": 1.3535345951512297e-05, "loss": 0.4645, "num_input_tokens_seen": 74774144, "step": 61640 }, { "epoch": 6.865463860118053, "grad_norm": 0.08481350541114807, "learning_rate": 1.3531027795391598e-05, "loss": 0.4558, "num_input_tokens_seen": 74780352, "step": 61645 }, { "epoch": 6.866020715001671, "grad_norm": 0.09292322397232056, "learning_rate": 1.3526710072605436e-05, "loss": 0.4683, "num_input_tokens_seen": 74786720, "step": 61650 }, { "epoch": 6.866577569885288, "grad_norm": 0.15148991346359253, "learning_rate": 1.3522392783316928e-05, "loss": 0.4722, "num_input_tokens_seen": 74792704, "step": 61655 }, { "epoch": 6.8671344247689055, "grad_norm": 0.07528948783874512, "learning_rate": 1.3518075927689212e-05, "loss": 0.4558, "num_input_tokens_seen": 74798336, "step": 61660 }, { "epoch": 6.867691279652522, "grad_norm": 0.08335811644792557, "learning_rate": 1.351375950588537e-05, "loss": 0.4655, "num_input_tokens_seen": 74804448, "step": 61665 }, { "epoch": 6.86824813453614, "grad_norm": 0.11252596974372864, "learning_rate": 1.3509443518068504e-05, "loss": 0.4731, "num_input_tokens_seen": 74810560, "step": 61670 }, { "epoch": 6.868804989419758, "grad_norm": 0.08381606638431549, "learning_rate": 1.3505127964401698e-05, "loss": 0.4714, "num_input_tokens_seen": 74816800, "step": 61675 }, { "epoch": 6.869361844303374, "grad_norm": 0.1257370561361313, "learning_rate": 1.3500812845047984e-05, "loss": 0.465, "num_input_tokens_seen": 74822976, "step": 61680 }, { "epoch": 6.869918699186992, "grad_norm": 0.11399871110916138, "learning_rate": 1.3496498160170418e-05, "loss": 0.4671, "num_input_tokens_seen": 74828928, "step": 61685 }, { "epoch": 6.870475554070609, "grad_norm": 0.11166859418153763, "learning_rate": 1.349218390993201e-05, "loss": 0.4532, "num_input_tokens_seen": 74834880, "step": 61690 }, { "epoch": 6.8710324089542265, "grad_norm": 0.09086642414331436, "learning_rate": 1.348787009449578e-05, "loss": 0.4522, "num_input_tokens_seen": 74840960, "step": 61695 }, { "epoch": 6.871589263837844, "grad_norm": 0.08490756154060364, "learning_rate": 1.3483556714024704e-05, "loss": 0.4571, "num_input_tokens_seen": 74847232, "step": 61700 }, { "epoch": 6.872146118721461, "grad_norm": 0.09722699970006943, "learning_rate": 1.347924376868176e-05, "loss": 0.4638, "num_input_tokens_seen": 74853568, "step": 61705 }, { "epoch": 6.872702973605079, "grad_norm": 0.09122593700885773, "learning_rate": 1.3474931258629913e-05, "loss": 0.4594, "num_input_tokens_seen": 74859008, "step": 61710 }, { "epoch": 6.873259828488695, "grad_norm": 0.09574159234762192, "learning_rate": 1.347061918403209e-05, "loss": 0.4428, "num_input_tokens_seen": 74865152, "step": 61715 }, { "epoch": 6.873816683372313, "grad_norm": 0.12262333929538727, "learning_rate": 1.3466307545051232e-05, "loss": 0.4686, "num_input_tokens_seen": 74871360, "step": 61720 }, { "epoch": 6.874373538255931, "grad_norm": 0.0940442755818367, "learning_rate": 1.3461996341850226e-05, "loss": 0.4729, "num_input_tokens_seen": 74877920, "step": 61725 }, { "epoch": 6.874930393139548, "grad_norm": 0.09046649932861328, "learning_rate": 1.3457685574591983e-05, "loss": 0.46, "num_input_tokens_seen": 74883712, "step": 61730 }, { "epoch": 6.875487248023165, "grad_norm": 0.10371307283639908, "learning_rate": 1.3453375243439358e-05, "loss": 0.4594, "num_input_tokens_seen": 74889344, "step": 61735 }, { "epoch": 6.876044102906782, "grad_norm": 0.09458740800619125, "learning_rate": 1.3449065348555224e-05, "loss": 0.4594, "num_input_tokens_seen": 74895456, "step": 61740 }, { "epoch": 6.8766009577904, "grad_norm": 0.1313420832157135, "learning_rate": 1.3444755890102424e-05, "loss": 0.4637, "num_input_tokens_seen": 74901632, "step": 61745 }, { "epoch": 6.877157812674017, "grad_norm": 0.11486504226922989, "learning_rate": 1.3440446868243764e-05, "loss": 0.4648, "num_input_tokens_seen": 74907808, "step": 61750 }, { "epoch": 6.877714667557634, "grad_norm": 0.09378489851951599, "learning_rate": 1.343613828314208e-05, "loss": 0.4572, "num_input_tokens_seen": 74913568, "step": 61755 }, { "epoch": 6.878271522441252, "grad_norm": 0.1261710375547409, "learning_rate": 1.3431830134960139e-05, "loss": 0.4634, "num_input_tokens_seen": 74919456, "step": 61760 }, { "epoch": 6.8788283773248695, "grad_norm": 0.11890450865030289, "learning_rate": 1.3427522423860737e-05, "loss": 0.4534, "num_input_tokens_seen": 74925632, "step": 61765 }, { "epoch": 6.879385232208486, "grad_norm": 0.08421630412340164, "learning_rate": 1.3423215150006613e-05, "loss": 0.4613, "num_input_tokens_seen": 74931904, "step": 61770 }, { "epoch": 6.879942087092104, "grad_norm": 0.09086011350154877, "learning_rate": 1.341890831356052e-05, "loss": 0.458, "num_input_tokens_seen": 74937856, "step": 61775 }, { "epoch": 6.880498941975721, "grad_norm": 0.09221014380455017, "learning_rate": 1.3414601914685196e-05, "loss": 0.4548, "num_input_tokens_seen": 74943872, "step": 61780 }, { "epoch": 6.881055796859338, "grad_norm": 0.0862305536866188, "learning_rate": 1.3410295953543328e-05, "loss": 0.4739, "num_input_tokens_seen": 74950080, "step": 61785 }, { "epoch": 6.881612651742956, "grad_norm": 0.11121097207069397, "learning_rate": 1.3405990430297629e-05, "loss": 0.4716, "num_input_tokens_seen": 74956416, "step": 61790 }, { "epoch": 6.882169506626573, "grad_norm": 0.10924660414457321, "learning_rate": 1.340168534511076e-05, "loss": 0.4614, "num_input_tokens_seen": 74962560, "step": 61795 }, { "epoch": 6.882726361510191, "grad_norm": 0.10265998542308807, "learning_rate": 1.3397380698145396e-05, "loss": 0.4586, "num_input_tokens_seen": 74968800, "step": 61800 }, { "epoch": 6.883283216393808, "grad_norm": 0.1444549262523651, "learning_rate": 1.3393076489564169e-05, "loss": 0.4638, "num_input_tokens_seen": 74974368, "step": 61805 }, { "epoch": 6.883840071277425, "grad_norm": 0.07241702079772949, "learning_rate": 1.3388772719529699e-05, "loss": 0.454, "num_input_tokens_seen": 74980352, "step": 61810 }, { "epoch": 6.884396926161043, "grad_norm": 0.1259198784828186, "learning_rate": 1.3384469388204618e-05, "loss": 0.4687, "num_input_tokens_seen": 74986720, "step": 61815 }, { "epoch": 6.8849537810446595, "grad_norm": 0.1325567215681076, "learning_rate": 1.3380166495751496e-05, "loss": 0.4682, "num_input_tokens_seen": 74991968, "step": 61820 }, { "epoch": 6.885510635928277, "grad_norm": 0.09135844558477402, "learning_rate": 1.337586404233292e-05, "loss": 0.4654, "num_input_tokens_seen": 74998048, "step": 61825 }, { "epoch": 6.886067490811895, "grad_norm": 0.1063433289527893, "learning_rate": 1.3371562028111462e-05, "loss": 0.4781, "num_input_tokens_seen": 75004192, "step": 61830 }, { "epoch": 6.886624345695512, "grad_norm": 0.10286497324705124, "learning_rate": 1.3367260453249646e-05, "loss": 0.4489, "num_input_tokens_seen": 75010400, "step": 61835 }, { "epoch": 6.887181200579129, "grad_norm": 0.11370661854743958, "learning_rate": 1.3362959317910024e-05, "loss": 0.4575, "num_input_tokens_seen": 75016096, "step": 61840 }, { "epoch": 6.887738055462746, "grad_norm": 0.0854925662279129, "learning_rate": 1.3358658622255077e-05, "loss": 0.4695, "num_input_tokens_seen": 75022048, "step": 61845 }, { "epoch": 6.888294910346364, "grad_norm": 0.11452233791351318, "learning_rate": 1.3354358366447311e-05, "loss": 0.4614, "num_input_tokens_seen": 75028416, "step": 61850 }, { "epoch": 6.888851765229981, "grad_norm": 0.09746166318655014, "learning_rate": 1.3350058550649219e-05, "loss": 0.4677, "num_input_tokens_seen": 75034848, "step": 61855 }, { "epoch": 6.889408620113598, "grad_norm": 0.07493620365858078, "learning_rate": 1.3345759175023243e-05, "loss": 0.4722, "num_input_tokens_seen": 75040928, "step": 61860 }, { "epoch": 6.889965474997216, "grad_norm": 0.11742636561393738, "learning_rate": 1.334146023973184e-05, "loss": 0.4497, "num_input_tokens_seen": 75046976, "step": 61865 }, { "epoch": 6.890522329880833, "grad_norm": 0.09950844943523407, "learning_rate": 1.3337161744937422e-05, "loss": 0.4516, "num_input_tokens_seen": 75052864, "step": 61870 }, { "epoch": 6.89107918476445, "grad_norm": 0.1130342185497284, "learning_rate": 1.3332863690802414e-05, "loss": 0.4701, "num_input_tokens_seen": 75059040, "step": 61875 }, { "epoch": 6.891636039648068, "grad_norm": 0.10124784708023071, "learning_rate": 1.3328566077489197e-05, "loss": 0.4632, "num_input_tokens_seen": 75065216, "step": 61880 }, { "epoch": 6.892192894531685, "grad_norm": 0.18215732276439667, "learning_rate": 1.332426890516016e-05, "loss": 0.4502, "num_input_tokens_seen": 75071360, "step": 61885 }, { "epoch": 6.8927497494153025, "grad_norm": 0.10806459933519363, "learning_rate": 1.3319972173977668e-05, "loss": 0.4634, "num_input_tokens_seen": 75077088, "step": 61890 }, { "epoch": 6.893306604298919, "grad_norm": 0.08857490122318268, "learning_rate": 1.3315675884104046e-05, "loss": 0.4654, "num_input_tokens_seen": 75082848, "step": 61895 }, { "epoch": 6.893863459182537, "grad_norm": 0.09872450679540634, "learning_rate": 1.3311380035701646e-05, "loss": 0.4558, "num_input_tokens_seen": 75089344, "step": 61900 }, { "epoch": 6.894420314066155, "grad_norm": 0.12367604672908783, "learning_rate": 1.3307084628932756e-05, "loss": 0.4549, "num_input_tokens_seen": 75095392, "step": 61905 }, { "epoch": 6.894977168949771, "grad_norm": 0.08459153771400452, "learning_rate": 1.3302789663959692e-05, "loss": 0.4511, "num_input_tokens_seen": 75101728, "step": 61910 }, { "epoch": 6.895534023833389, "grad_norm": 0.08697785437107086, "learning_rate": 1.3298495140944706e-05, "loss": 0.4667, "num_input_tokens_seen": 75107808, "step": 61915 }, { "epoch": 6.896090878717006, "grad_norm": 0.10793215036392212, "learning_rate": 1.3294201060050077e-05, "loss": 0.4576, "num_input_tokens_seen": 75113952, "step": 61920 }, { "epoch": 6.8966477336006236, "grad_norm": 0.0979977548122406, "learning_rate": 1.3289907421438053e-05, "loss": 0.4612, "num_input_tokens_seen": 75119584, "step": 61925 }, { "epoch": 6.897204588484241, "grad_norm": 0.11693772673606873, "learning_rate": 1.3285614225270842e-05, "loss": 0.4767, "num_input_tokens_seen": 75125504, "step": 61930 }, { "epoch": 6.897761443367858, "grad_norm": 0.07375471293926239, "learning_rate": 1.3281321471710678e-05, "loss": 0.4545, "num_input_tokens_seen": 75131648, "step": 61935 }, { "epoch": 6.898318298251476, "grad_norm": 0.08247717469930649, "learning_rate": 1.3277029160919732e-05, "loss": 0.4671, "num_input_tokens_seen": 75137664, "step": 61940 }, { "epoch": 6.898875153135093, "grad_norm": 0.08668522536754608, "learning_rate": 1.3272737293060195e-05, "loss": 0.4716, "num_input_tokens_seen": 75143872, "step": 61945 }, { "epoch": 6.89943200801871, "grad_norm": 0.09873655438423157, "learning_rate": 1.3268445868294233e-05, "loss": 0.4622, "num_input_tokens_seen": 75149600, "step": 61950 }, { "epoch": 6.899988862902328, "grad_norm": 0.08190866559743881, "learning_rate": 1.326415488678397e-05, "loss": 0.4579, "num_input_tokens_seen": 75156096, "step": 61955 }, { "epoch": 6.900545717785945, "grad_norm": 0.09479968994855881, "learning_rate": 1.3259864348691556e-05, "loss": 0.4737, "num_input_tokens_seen": 75161728, "step": 61960 }, { "epoch": 6.901102572669562, "grad_norm": 0.0886155366897583, "learning_rate": 1.3255574254179081e-05, "loss": 0.4674, "num_input_tokens_seen": 75167744, "step": 61965 }, { "epoch": 6.90165942755318, "grad_norm": 0.08282123506069183, "learning_rate": 1.325128460340866e-05, "loss": 0.4758, "num_input_tokens_seen": 75174304, "step": 61970 }, { "epoch": 6.902216282436797, "grad_norm": 0.14372451603412628, "learning_rate": 1.3246995396542342e-05, "loss": 0.4764, "num_input_tokens_seen": 75180160, "step": 61975 }, { "epoch": 6.902773137320414, "grad_norm": 0.08425798267126083, "learning_rate": 1.3242706633742202e-05, "loss": 0.4574, "num_input_tokens_seen": 75186272, "step": 61980 }, { "epoch": 6.903329992204032, "grad_norm": 0.10146202892065048, "learning_rate": 1.3238418315170298e-05, "loss": 0.459, "num_input_tokens_seen": 75192352, "step": 61985 }, { "epoch": 6.903886847087649, "grad_norm": 0.10944303125143051, "learning_rate": 1.3234130440988628e-05, "loss": 0.4666, "num_input_tokens_seen": 75198464, "step": 61990 }, { "epoch": 6.904443701971267, "grad_norm": 0.08312007039785385, "learning_rate": 1.3229843011359231e-05, "loss": 0.4623, "num_input_tokens_seen": 75204512, "step": 61995 }, { "epoch": 6.905000556854883, "grad_norm": 0.12062463164329529, "learning_rate": 1.322555602644407e-05, "loss": 0.4574, "num_input_tokens_seen": 75210880, "step": 62000 }, { "epoch": 6.905557411738501, "grad_norm": 0.10007544606924057, "learning_rate": 1.3221269486405146e-05, "loss": 0.4687, "num_input_tokens_seen": 75216928, "step": 62005 }, { "epoch": 6.906114266622119, "grad_norm": 0.107076495885849, "learning_rate": 1.3216983391404397e-05, "loss": 0.4569, "num_input_tokens_seen": 75223104, "step": 62010 }, { "epoch": 6.9066711215057355, "grad_norm": 0.09293232858181, "learning_rate": 1.3212697741603778e-05, "loss": 0.457, "num_input_tokens_seen": 75229216, "step": 62015 }, { "epoch": 6.907227976389353, "grad_norm": 0.12598565220832825, "learning_rate": 1.3208412537165222e-05, "loss": 0.4666, "num_input_tokens_seen": 75235264, "step": 62020 }, { "epoch": 6.90778483127297, "grad_norm": 0.07284070551395416, "learning_rate": 1.320412777825062e-05, "loss": 0.4556, "num_input_tokens_seen": 75241632, "step": 62025 }, { "epoch": 6.908341686156588, "grad_norm": 0.08209995180368423, "learning_rate": 1.3199843465021877e-05, "loss": 0.47, "num_input_tokens_seen": 75247872, "step": 62030 }, { "epoch": 6.908898541040205, "grad_norm": 0.1444978415966034, "learning_rate": 1.3195559597640856e-05, "loss": 0.4615, "num_input_tokens_seen": 75253824, "step": 62035 }, { "epoch": 6.909455395923822, "grad_norm": 0.10484608262777328, "learning_rate": 1.3191276176269435e-05, "loss": 0.4685, "num_input_tokens_seen": 75259904, "step": 62040 }, { "epoch": 6.91001225080744, "grad_norm": 0.07713449746370316, "learning_rate": 1.3186993201069432e-05, "loss": 0.4571, "num_input_tokens_seen": 75265984, "step": 62045 }, { "epoch": 6.9105691056910565, "grad_norm": 0.09519743174314499, "learning_rate": 1.3182710672202681e-05, "loss": 0.4629, "num_input_tokens_seen": 75272128, "step": 62050 }, { "epoch": 6.911125960574674, "grad_norm": 0.08562829345464706, "learning_rate": 1.3178428589830999e-05, "loss": 0.4653, "num_input_tokens_seen": 75278240, "step": 62055 }, { "epoch": 6.911682815458292, "grad_norm": 0.13071584701538086, "learning_rate": 1.3174146954116163e-05, "loss": 0.4583, "num_input_tokens_seen": 75284512, "step": 62060 }, { "epoch": 6.912239670341909, "grad_norm": 0.09627969563007355, "learning_rate": 1.3169865765219961e-05, "loss": 0.4793, "num_input_tokens_seen": 75290848, "step": 62065 }, { "epoch": 6.912796525225526, "grad_norm": 0.10905230045318604, "learning_rate": 1.3165585023304133e-05, "loss": 0.4627, "num_input_tokens_seen": 75296992, "step": 62070 }, { "epoch": 6.913353380109143, "grad_norm": 0.10517577081918716, "learning_rate": 1.3161304728530427e-05, "loss": 0.4605, "num_input_tokens_seen": 75302688, "step": 62075 }, { "epoch": 6.913910234992761, "grad_norm": 0.12298080325126648, "learning_rate": 1.3157024881060575e-05, "loss": 0.4633, "num_input_tokens_seen": 75308512, "step": 62080 }, { "epoch": 6.9144670898763785, "grad_norm": 0.10483583807945251, "learning_rate": 1.3152745481056272e-05, "loss": 0.4587, "num_input_tokens_seen": 75314400, "step": 62085 }, { "epoch": 6.915023944759995, "grad_norm": 0.09138435870409012, "learning_rate": 1.3148466528679216e-05, "loss": 0.4653, "num_input_tokens_seen": 75320448, "step": 62090 }, { "epoch": 6.915580799643613, "grad_norm": 0.111526258289814, "learning_rate": 1.3144188024091066e-05, "loss": 0.4665, "num_input_tokens_seen": 75326560, "step": 62095 }, { "epoch": 6.91613765452723, "grad_norm": 0.13128067553043365, "learning_rate": 1.3139909967453496e-05, "loss": 0.4538, "num_input_tokens_seen": 75333024, "step": 62100 }, { "epoch": 6.916694509410847, "grad_norm": 0.12493632733821869, "learning_rate": 1.3135632358928127e-05, "loss": 0.469, "num_input_tokens_seen": 75339456, "step": 62105 }, { "epoch": 6.917251364294465, "grad_norm": 0.08699770271778107, "learning_rate": 1.3131355198676587e-05, "loss": 0.4549, "num_input_tokens_seen": 75345664, "step": 62110 }, { "epoch": 6.917808219178082, "grad_norm": 0.09722595661878586, "learning_rate": 1.3127078486860494e-05, "loss": 0.4667, "num_input_tokens_seen": 75351936, "step": 62115 }, { "epoch": 6.9183650740616995, "grad_norm": 0.155199334025383, "learning_rate": 1.3122802223641412e-05, "loss": 0.4572, "num_input_tokens_seen": 75358016, "step": 62120 }, { "epoch": 6.918921928945317, "grad_norm": 0.07509629428386688, "learning_rate": 1.3118526409180937e-05, "loss": 0.463, "num_input_tokens_seen": 75364480, "step": 62125 }, { "epoch": 6.919478783828934, "grad_norm": 0.11682984232902527, "learning_rate": 1.3114251043640596e-05, "loss": 0.4737, "num_input_tokens_seen": 75370368, "step": 62130 }, { "epoch": 6.920035638712552, "grad_norm": 0.07705947756767273, "learning_rate": 1.3109976127181956e-05, "loss": 0.4451, "num_input_tokens_seen": 75376544, "step": 62135 }, { "epoch": 6.9205924935961685, "grad_norm": 0.1385565847158432, "learning_rate": 1.3105701659966507e-05, "loss": 0.457, "num_input_tokens_seen": 75382656, "step": 62140 }, { "epoch": 6.921149348479786, "grad_norm": 0.11478468775749207, "learning_rate": 1.3101427642155767e-05, "loss": 0.4513, "num_input_tokens_seen": 75388608, "step": 62145 }, { "epoch": 6.921706203363404, "grad_norm": 0.09451716393232346, "learning_rate": 1.3097154073911232e-05, "loss": 0.4529, "num_input_tokens_seen": 75394496, "step": 62150 }, { "epoch": 6.922263058247021, "grad_norm": 0.061157066375017166, "learning_rate": 1.3092880955394352e-05, "loss": 0.4758, "num_input_tokens_seen": 75400864, "step": 62155 }, { "epoch": 6.922819913130638, "grad_norm": 0.0949159786105156, "learning_rate": 1.3088608286766596e-05, "loss": 0.4639, "num_input_tokens_seen": 75407040, "step": 62160 }, { "epoch": 6.923376768014256, "grad_norm": 0.11962611228227615, "learning_rate": 1.3084336068189384e-05, "loss": 0.4524, "num_input_tokens_seen": 75413248, "step": 62165 }, { "epoch": 6.923933622897873, "grad_norm": 0.1243610605597496, "learning_rate": 1.3080064299824136e-05, "loss": 0.4637, "num_input_tokens_seen": 75419232, "step": 62170 }, { "epoch": 6.92449047778149, "grad_norm": 0.08005015552043915, "learning_rate": 1.3075792981832272e-05, "loss": 0.4586, "num_input_tokens_seen": 75425344, "step": 62175 }, { "epoch": 6.925047332665107, "grad_norm": 0.0707644447684288, "learning_rate": 1.307152211437515e-05, "loss": 0.4523, "num_input_tokens_seen": 75431136, "step": 62180 }, { "epoch": 6.925604187548725, "grad_norm": 0.10452914983034134, "learning_rate": 1.306725169761416e-05, "loss": 0.4697, "num_input_tokens_seen": 75437312, "step": 62185 }, { "epoch": 6.9261610424323425, "grad_norm": 0.10766301304101944, "learning_rate": 1.3062981731710633e-05, "loss": 0.4568, "num_input_tokens_seen": 75442944, "step": 62190 }, { "epoch": 6.926717897315959, "grad_norm": 0.09339093416929245, "learning_rate": 1.3058712216825918e-05, "loss": 0.4548, "num_input_tokens_seen": 75449120, "step": 62195 }, { "epoch": 6.927274752199577, "grad_norm": 0.09848302602767944, "learning_rate": 1.3054443153121313e-05, "loss": 0.4525, "num_input_tokens_seen": 75454976, "step": 62200 }, { "epoch": 6.927831607083194, "grad_norm": 0.10872253030538559, "learning_rate": 1.3050174540758129e-05, "loss": 0.4638, "num_input_tokens_seen": 75461184, "step": 62205 }, { "epoch": 6.9283884619668115, "grad_norm": 0.12125397473573685, "learning_rate": 1.3045906379897654e-05, "loss": 0.462, "num_input_tokens_seen": 75467232, "step": 62210 }, { "epoch": 6.928945316850429, "grad_norm": 0.08816320449113846, "learning_rate": 1.3041638670701145e-05, "loss": 0.4634, "num_input_tokens_seen": 75473440, "step": 62215 }, { "epoch": 6.929502171734046, "grad_norm": 0.08865271508693695, "learning_rate": 1.303737141332984e-05, "loss": 0.4573, "num_input_tokens_seen": 75479360, "step": 62220 }, { "epoch": 6.930059026617664, "grad_norm": 0.09774031490087509, "learning_rate": 1.303310460794499e-05, "loss": 0.4666, "num_input_tokens_seen": 75485632, "step": 62225 }, { "epoch": 6.93061588150128, "grad_norm": 0.07125185430049896, "learning_rate": 1.3028838254707786e-05, "loss": 0.4637, "num_input_tokens_seen": 75491232, "step": 62230 }, { "epoch": 6.931172736384898, "grad_norm": 0.10329614579677582, "learning_rate": 1.3024572353779445e-05, "loss": 0.4536, "num_input_tokens_seen": 75497248, "step": 62235 }, { "epoch": 6.931729591268516, "grad_norm": 0.09107029438018799, "learning_rate": 1.3020306905321131e-05, "loss": 0.4613, "num_input_tokens_seen": 75503520, "step": 62240 }, { "epoch": 6.9322864461521325, "grad_norm": 0.08703647553920746, "learning_rate": 1.301604190949402e-05, "loss": 0.4623, "num_input_tokens_seen": 75509664, "step": 62245 }, { "epoch": 6.93284330103575, "grad_norm": 0.09305990487337112, "learning_rate": 1.3011777366459243e-05, "loss": 0.4595, "num_input_tokens_seen": 75515456, "step": 62250 }, { "epoch": 6.933400155919367, "grad_norm": 0.08188246935606003, "learning_rate": 1.3007513276377936e-05, "loss": 0.4666, "num_input_tokens_seen": 75521216, "step": 62255 }, { "epoch": 6.933957010802985, "grad_norm": 0.09905003011226654, "learning_rate": 1.3003249639411213e-05, "loss": 0.4641, "num_input_tokens_seen": 75527040, "step": 62260 }, { "epoch": 6.934513865686602, "grad_norm": 0.13786964118480682, "learning_rate": 1.299898645572016e-05, "loss": 0.4592, "num_input_tokens_seen": 75533408, "step": 62265 }, { "epoch": 6.935070720570219, "grad_norm": 0.15899658203125, "learning_rate": 1.2994723725465868e-05, "loss": 0.4564, "num_input_tokens_seen": 75539776, "step": 62270 }, { "epoch": 6.935627575453837, "grad_norm": 0.1254863291978836, "learning_rate": 1.299046144880937e-05, "loss": 0.4766, "num_input_tokens_seen": 75545952, "step": 62275 }, { "epoch": 6.936184430337454, "grad_norm": 0.13401196897029877, "learning_rate": 1.2986199625911744e-05, "loss": 0.4515, "num_input_tokens_seen": 75552096, "step": 62280 }, { "epoch": 6.936741285221071, "grad_norm": 0.0932404100894928, "learning_rate": 1.298193825693398e-05, "loss": 0.456, "num_input_tokens_seen": 75558400, "step": 62285 }, { "epoch": 6.937298140104689, "grad_norm": 0.08436726033687592, "learning_rate": 1.2977677342037103e-05, "loss": 0.4552, "num_input_tokens_seen": 75564672, "step": 62290 }, { "epoch": 6.937854994988306, "grad_norm": 0.10296876728534698, "learning_rate": 1.2973416881382116e-05, "loss": 0.4683, "num_input_tokens_seen": 75570752, "step": 62295 }, { "epoch": 6.938411849871923, "grad_norm": 0.13901863992214203, "learning_rate": 1.2969156875129968e-05, "loss": 0.4545, "num_input_tokens_seen": 75577216, "step": 62300 }, { "epoch": 6.938968704755541, "grad_norm": 0.0956256315112114, "learning_rate": 1.2964897323441639e-05, "loss": 0.4578, "num_input_tokens_seen": 75583328, "step": 62305 }, { "epoch": 6.939525559639158, "grad_norm": 0.108220174908638, "learning_rate": 1.2960638226478048e-05, "loss": 0.4459, "num_input_tokens_seen": 75589440, "step": 62310 }, { "epoch": 6.9400824145227755, "grad_norm": 0.11039944738149643, "learning_rate": 1.2956379584400136e-05, "loss": 0.4579, "num_input_tokens_seen": 75595648, "step": 62315 }, { "epoch": 6.940639269406392, "grad_norm": 0.1024371087551117, "learning_rate": 1.2952121397368788e-05, "loss": 0.4725, "num_input_tokens_seen": 75601184, "step": 62320 }, { "epoch": 6.94119612429001, "grad_norm": 0.0967433974146843, "learning_rate": 1.2947863665544904e-05, "loss": 0.4592, "num_input_tokens_seen": 75607296, "step": 62325 }, { "epoch": 6.941752979173628, "grad_norm": 0.10371546447277069, "learning_rate": 1.2943606389089364e-05, "loss": 0.4655, "num_input_tokens_seen": 75613376, "step": 62330 }, { "epoch": 6.9423098340572444, "grad_norm": 0.0912143811583519, "learning_rate": 1.2939349568163e-05, "loss": 0.4728, "num_input_tokens_seen": 75619360, "step": 62335 }, { "epoch": 6.942866688940862, "grad_norm": 0.08983202278614044, "learning_rate": 1.2935093202926673e-05, "loss": 0.4666, "num_input_tokens_seen": 75625472, "step": 62340 }, { "epoch": 6.94342354382448, "grad_norm": 0.1185302883386612, "learning_rate": 1.2930837293541177e-05, "loss": 0.4646, "num_input_tokens_seen": 75631392, "step": 62345 }, { "epoch": 6.943980398708097, "grad_norm": 0.11606341600418091, "learning_rate": 1.2926581840167324e-05, "loss": 0.4519, "num_input_tokens_seen": 75637408, "step": 62350 }, { "epoch": 6.944537253591714, "grad_norm": 0.1661662757396698, "learning_rate": 1.292232684296591e-05, "loss": 0.4653, "num_input_tokens_seen": 75643136, "step": 62355 }, { "epoch": 6.945094108475331, "grad_norm": 0.09749950468540192, "learning_rate": 1.2918072302097683e-05, "loss": 0.4614, "num_input_tokens_seen": 75649024, "step": 62360 }, { "epoch": 6.945650963358949, "grad_norm": 0.11647076904773712, "learning_rate": 1.2913818217723415e-05, "loss": 0.4642, "num_input_tokens_seen": 75655264, "step": 62365 }, { "epoch": 6.946207818242566, "grad_norm": 0.1042773425579071, "learning_rate": 1.2909564590003814e-05, "loss": 0.4607, "num_input_tokens_seen": 75661312, "step": 62370 }, { "epoch": 6.946764673126183, "grad_norm": 0.12006551027297974, "learning_rate": 1.2905311419099619e-05, "loss": 0.4613, "num_input_tokens_seen": 75667616, "step": 62375 }, { "epoch": 6.947321528009801, "grad_norm": 0.09524770826101303, "learning_rate": 1.290105870517151e-05, "loss": 0.4569, "num_input_tokens_seen": 75673568, "step": 62380 }, { "epoch": 6.947878382893418, "grad_norm": 0.10376565158367157, "learning_rate": 1.2896806448380177e-05, "loss": 0.4581, "num_input_tokens_seen": 75679648, "step": 62385 }, { "epoch": 6.948435237777035, "grad_norm": 0.11419843882322311, "learning_rate": 1.2892554648886291e-05, "loss": 0.4593, "num_input_tokens_seen": 75685600, "step": 62390 }, { "epoch": 6.948992092660653, "grad_norm": 0.09400641918182373, "learning_rate": 1.288830330685048e-05, "loss": 0.4579, "num_input_tokens_seen": 75691744, "step": 62395 }, { "epoch": 6.94954894754427, "grad_norm": 0.1446644812822342, "learning_rate": 1.2884052422433392e-05, "loss": 0.468, "num_input_tokens_seen": 75697792, "step": 62400 }, { "epoch": 6.9501058024278874, "grad_norm": 0.1079343780875206, "learning_rate": 1.2879801995795627e-05, "loss": 0.4644, "num_input_tokens_seen": 75704000, "step": 62405 }, { "epoch": 6.950662657311504, "grad_norm": 0.09134174138307571, "learning_rate": 1.2875552027097786e-05, "loss": 0.4647, "num_input_tokens_seen": 75710048, "step": 62410 }, { "epoch": 6.951219512195122, "grad_norm": 0.1053832620382309, "learning_rate": 1.2871302516500439e-05, "loss": 0.464, "num_input_tokens_seen": 75716384, "step": 62415 }, { "epoch": 6.95177636707874, "grad_norm": 0.08719680458307266, "learning_rate": 1.2867053464164152e-05, "loss": 0.4582, "num_input_tokens_seen": 75722336, "step": 62420 }, { "epoch": 6.952333221962356, "grad_norm": 0.08940150588750839, "learning_rate": 1.2862804870249472e-05, "loss": 0.4567, "num_input_tokens_seen": 75728128, "step": 62425 }, { "epoch": 6.952890076845974, "grad_norm": 0.08344317972660065, "learning_rate": 1.2858556734916916e-05, "loss": 0.469, "num_input_tokens_seen": 75734400, "step": 62430 }, { "epoch": 6.953446931729591, "grad_norm": 0.10273539274930954, "learning_rate": 1.2854309058327002e-05, "loss": 0.4637, "num_input_tokens_seen": 75740416, "step": 62435 }, { "epoch": 6.9540037866132085, "grad_norm": 0.07602125406265259, "learning_rate": 1.2850061840640204e-05, "loss": 0.4678, "num_input_tokens_seen": 75746496, "step": 62440 }, { "epoch": 6.954560641496826, "grad_norm": 0.1007661297917366, "learning_rate": 1.284581508201701e-05, "loss": 0.4667, "num_input_tokens_seen": 75752512, "step": 62445 }, { "epoch": 6.955117496380443, "grad_norm": 0.12430509924888611, "learning_rate": 1.2841568782617877e-05, "loss": 0.4637, "num_input_tokens_seen": 75758592, "step": 62450 }, { "epoch": 6.955674351264061, "grad_norm": 0.08530653268098831, "learning_rate": 1.2837322942603231e-05, "loss": 0.4619, "num_input_tokens_seen": 75764672, "step": 62455 }, { "epoch": 6.956231206147677, "grad_norm": 0.08954598754644394, "learning_rate": 1.2833077562133512e-05, "loss": 0.4663, "num_input_tokens_seen": 75770720, "step": 62460 }, { "epoch": 6.956788061031295, "grad_norm": 0.0890931636095047, "learning_rate": 1.2828832641369105e-05, "loss": 0.4616, "num_input_tokens_seen": 75776544, "step": 62465 }, { "epoch": 6.957344915914913, "grad_norm": 0.1169014573097229, "learning_rate": 1.2824588180470415e-05, "loss": 0.4656, "num_input_tokens_seen": 75782528, "step": 62470 }, { "epoch": 6.95790177079853, "grad_norm": 0.13645724952220917, "learning_rate": 1.2820344179597787e-05, "loss": 0.4586, "num_input_tokens_seen": 75788800, "step": 62475 }, { "epoch": 6.958458625682147, "grad_norm": 0.07969401031732559, "learning_rate": 1.2816100638911593e-05, "loss": 0.4548, "num_input_tokens_seen": 75794944, "step": 62480 }, { "epoch": 6.959015480565765, "grad_norm": 0.0755118653178215, "learning_rate": 1.2811857558572168e-05, "loss": 0.4656, "num_input_tokens_seen": 75801184, "step": 62485 }, { "epoch": 6.959572335449382, "grad_norm": 0.09709826856851578, "learning_rate": 1.2807614938739814e-05, "loss": 0.4676, "num_input_tokens_seen": 75807232, "step": 62490 }, { "epoch": 6.960129190332999, "grad_norm": 0.08743941783905029, "learning_rate": 1.2803372779574852e-05, "loss": 0.4625, "num_input_tokens_seen": 75813184, "step": 62495 }, { "epoch": 6.960686045216617, "grad_norm": 0.1323986053466797, "learning_rate": 1.2799131081237541e-05, "loss": 0.462, "num_input_tokens_seen": 75819168, "step": 62500 }, { "epoch": 6.961242900100234, "grad_norm": 0.10030853748321533, "learning_rate": 1.279488984388817e-05, "loss": 0.448, "num_input_tokens_seen": 75825408, "step": 62505 }, { "epoch": 6.9617997549838515, "grad_norm": 0.11994524300098419, "learning_rate": 1.279064906768696e-05, "loss": 0.4522, "num_input_tokens_seen": 75831584, "step": 62510 }, { "epoch": 6.962356609867468, "grad_norm": 0.10980913043022156, "learning_rate": 1.2786408752794157e-05, "loss": 0.4525, "num_input_tokens_seen": 75837984, "step": 62515 }, { "epoch": 6.962913464751086, "grad_norm": 0.10168489813804626, "learning_rate": 1.2782168899369983e-05, "loss": 0.4617, "num_input_tokens_seen": 75844032, "step": 62520 }, { "epoch": 6.963470319634704, "grad_norm": 0.11729512363672256, "learning_rate": 1.277792950757461e-05, "loss": 0.4736, "num_input_tokens_seen": 75850080, "step": 62525 }, { "epoch": 6.96402717451832, "grad_norm": 0.09390701353549957, "learning_rate": 1.2773690577568237e-05, "loss": 0.4723, "num_input_tokens_seen": 75855776, "step": 62530 }, { "epoch": 6.964584029401938, "grad_norm": 0.1030358299612999, "learning_rate": 1.2769452109511007e-05, "loss": 0.4621, "num_input_tokens_seen": 75862048, "step": 62535 }, { "epoch": 6.965140884285555, "grad_norm": 0.16999521851539612, "learning_rate": 1.276521410356308e-05, "loss": 0.4579, "num_input_tokens_seen": 75868416, "step": 62540 }, { "epoch": 6.965697739169173, "grad_norm": 0.10094036906957626, "learning_rate": 1.2760976559884563e-05, "loss": 0.4575, "num_input_tokens_seen": 75874592, "step": 62545 }, { "epoch": 6.96625459405279, "grad_norm": 0.14977988600730896, "learning_rate": 1.2756739478635573e-05, "loss": 0.453, "num_input_tokens_seen": 75880640, "step": 62550 }, { "epoch": 6.966811448936407, "grad_norm": 0.10552815347909927, "learning_rate": 1.2752502859976211e-05, "loss": 0.4498, "num_input_tokens_seen": 75886784, "step": 62555 }, { "epoch": 6.967368303820025, "grad_norm": 0.1061495989561081, "learning_rate": 1.274826670406653e-05, "loss": 0.4673, "num_input_tokens_seen": 75892832, "step": 62560 }, { "epoch": 6.9679251587036415, "grad_norm": 0.06157047674059868, "learning_rate": 1.2744031011066604e-05, "loss": 0.4664, "num_input_tokens_seen": 75898752, "step": 62565 }, { "epoch": 6.968482013587259, "grad_norm": 0.09908170998096466, "learning_rate": 1.2739795781136455e-05, "loss": 0.4707, "num_input_tokens_seen": 75904928, "step": 62570 }, { "epoch": 6.969038868470877, "grad_norm": 0.08197983354330063, "learning_rate": 1.2735561014436112e-05, "loss": 0.4559, "num_input_tokens_seen": 75910912, "step": 62575 }, { "epoch": 6.969595723354494, "grad_norm": 0.1337059736251831, "learning_rate": 1.2731326711125585e-05, "loss": 0.4531, "num_input_tokens_seen": 75917312, "step": 62580 }, { "epoch": 6.970152578238111, "grad_norm": 0.09792832285165787, "learning_rate": 1.2727092871364843e-05, "loss": 0.4612, "num_input_tokens_seen": 75923232, "step": 62585 }, { "epoch": 6.970709433121728, "grad_norm": 0.09444960951805115, "learning_rate": 1.272285949531387e-05, "loss": 0.4536, "num_input_tokens_seen": 75929504, "step": 62590 }, { "epoch": 6.971266288005346, "grad_norm": 0.10103817284107208, "learning_rate": 1.2718626583132604e-05, "loss": 0.4531, "num_input_tokens_seen": 75935648, "step": 62595 }, { "epoch": 6.971823142888963, "grad_norm": 0.09430043399333954, "learning_rate": 1.2714394134980989e-05, "loss": 0.458, "num_input_tokens_seen": 75941824, "step": 62600 }, { "epoch": 6.97237999777258, "grad_norm": 0.09701401740312576, "learning_rate": 1.2710162151018929e-05, "loss": 0.4642, "num_input_tokens_seen": 75947776, "step": 62605 }, { "epoch": 6.972936852656198, "grad_norm": 0.11238059401512146, "learning_rate": 1.2705930631406323e-05, "loss": 0.4682, "num_input_tokens_seen": 75953760, "step": 62610 }, { "epoch": 6.973493707539815, "grad_norm": 0.09598836302757263, "learning_rate": 1.2701699576303067e-05, "loss": 0.4735, "num_input_tokens_seen": 75959680, "step": 62615 }, { "epoch": 6.974050562423432, "grad_norm": 0.12269134074449539, "learning_rate": 1.2697468985869004e-05, "loss": 0.4614, "num_input_tokens_seen": 75965696, "step": 62620 }, { "epoch": 6.97460741730705, "grad_norm": 0.1077781692147255, "learning_rate": 1.2693238860264001e-05, "loss": 0.4658, "num_input_tokens_seen": 75971840, "step": 62625 }, { "epoch": 6.975164272190667, "grad_norm": 0.13687381148338318, "learning_rate": 1.2689009199647866e-05, "loss": 0.4634, "num_input_tokens_seen": 75977920, "step": 62630 }, { "epoch": 6.9757211270742845, "grad_norm": 0.1425381898880005, "learning_rate": 1.2684780004180414e-05, "loss": 0.4363, "num_input_tokens_seen": 75984096, "step": 62635 }, { "epoch": 6.976277981957901, "grad_norm": 0.09427749365568161, "learning_rate": 1.2680551274021443e-05, "loss": 0.4681, "num_input_tokens_seen": 75990592, "step": 62640 }, { "epoch": 6.976834836841519, "grad_norm": 0.11997901648283005, "learning_rate": 1.2676323009330716e-05, "loss": 0.4672, "num_input_tokens_seen": 75996736, "step": 62645 }, { "epoch": 6.977391691725137, "grad_norm": 0.11737996339797974, "learning_rate": 1.267209521026801e-05, "loss": 0.4611, "num_input_tokens_seen": 76002816, "step": 62650 }, { "epoch": 6.977948546608753, "grad_norm": 0.09739001095294952, "learning_rate": 1.2667867876993046e-05, "loss": 0.4633, "num_input_tokens_seen": 76008800, "step": 62655 }, { "epoch": 6.978505401492371, "grad_norm": 0.09214765578508377, "learning_rate": 1.266364100966555e-05, "loss": 0.4683, "num_input_tokens_seen": 76015296, "step": 62660 }, { "epoch": 6.979062256375989, "grad_norm": 0.09880934655666351, "learning_rate": 1.2659414608445241e-05, "loss": 0.4483, "num_input_tokens_seen": 76021312, "step": 62665 }, { "epoch": 6.9796191112596055, "grad_norm": 0.12339429557323456, "learning_rate": 1.2655188673491786e-05, "loss": 0.4584, "num_input_tokens_seen": 76027456, "step": 62670 }, { "epoch": 6.980175966143223, "grad_norm": 0.11113327741622925, "learning_rate": 1.2650963204964872e-05, "loss": 0.4726, "num_input_tokens_seen": 76033536, "step": 62675 }, { "epoch": 6.980732821026841, "grad_norm": 0.09219716489315033, "learning_rate": 1.2646738203024136e-05, "loss": 0.4577, "num_input_tokens_seen": 76040096, "step": 62680 }, { "epoch": 6.981289675910458, "grad_norm": 0.14684800803661346, "learning_rate": 1.2642513667829226e-05, "loss": 0.4604, "num_input_tokens_seen": 76046304, "step": 62685 }, { "epoch": 6.981846530794075, "grad_norm": 0.10604720562696457, "learning_rate": 1.2638289599539743e-05, "loss": 0.4597, "num_input_tokens_seen": 76052160, "step": 62690 }, { "epoch": 6.982403385677692, "grad_norm": 0.08942870795726776, "learning_rate": 1.2634065998315292e-05, "loss": 0.4679, "num_input_tokens_seen": 76058176, "step": 62695 }, { "epoch": 6.98296024056131, "grad_norm": 0.10128659009933472, "learning_rate": 1.262984286431547e-05, "loss": 0.4596, "num_input_tokens_seen": 76064192, "step": 62700 }, { "epoch": 6.9835170954449275, "grad_norm": 0.08766794949769974, "learning_rate": 1.2625620197699812e-05, "loss": 0.4534, "num_input_tokens_seen": 76070464, "step": 62705 }, { "epoch": 6.984073950328544, "grad_norm": 0.10231729596853256, "learning_rate": 1.2621397998627893e-05, "loss": 0.4542, "num_input_tokens_seen": 76076544, "step": 62710 }, { "epoch": 6.984630805212162, "grad_norm": 0.10012307018041611, "learning_rate": 1.2617176267259211e-05, "loss": 0.4563, "num_input_tokens_seen": 76082656, "step": 62715 }, { "epoch": 6.985187660095779, "grad_norm": 0.08219713717699051, "learning_rate": 1.2612955003753296e-05, "loss": 0.4592, "num_input_tokens_seen": 76088672, "step": 62720 }, { "epoch": 6.985744514979396, "grad_norm": 0.10445734113454819, "learning_rate": 1.2608734208269648e-05, "loss": 0.4459, "num_input_tokens_seen": 76094528, "step": 62725 }, { "epoch": 6.986301369863014, "grad_norm": 0.11280715465545654, "learning_rate": 1.2604513880967721e-05, "loss": 0.4607, "num_input_tokens_seen": 76100576, "step": 62730 }, { "epoch": 6.986858224746631, "grad_norm": 0.11954326182603836, "learning_rate": 1.2600294022006992e-05, "loss": 0.4626, "num_input_tokens_seen": 76106848, "step": 62735 }, { "epoch": 6.9874150796302485, "grad_norm": 0.07426076382398605, "learning_rate": 1.2596074631546883e-05, "loss": 0.4688, "num_input_tokens_seen": 76113152, "step": 62740 }, { "epoch": 6.987971934513865, "grad_norm": 0.10105821490287781, "learning_rate": 1.259185570974683e-05, "loss": 0.4707, "num_input_tokens_seen": 76119424, "step": 62745 }, { "epoch": 6.988528789397483, "grad_norm": 0.167847141623497, "learning_rate": 1.2587637256766228e-05, "loss": 0.4633, "num_input_tokens_seen": 76125632, "step": 62750 }, { "epoch": 6.989085644281101, "grad_norm": 0.1082717627286911, "learning_rate": 1.2583419272764466e-05, "loss": 0.4783, "num_input_tokens_seen": 76132000, "step": 62755 }, { "epoch": 6.9896424991647175, "grad_norm": 0.0711665004491806, "learning_rate": 1.2579201757900925e-05, "loss": 0.4548, "num_input_tokens_seen": 76138016, "step": 62760 }, { "epoch": 6.990199354048335, "grad_norm": 0.15938887000083923, "learning_rate": 1.2574984712334936e-05, "loss": 0.4576, "num_input_tokens_seen": 76144512, "step": 62765 }, { "epoch": 6.990756208931952, "grad_norm": 0.11904467642307281, "learning_rate": 1.257076813622585e-05, "loss": 0.4639, "num_input_tokens_seen": 76150240, "step": 62770 }, { "epoch": 6.99131306381557, "grad_norm": 0.08542320132255554, "learning_rate": 1.2566552029732967e-05, "loss": 0.4499, "num_input_tokens_seen": 76156448, "step": 62775 }, { "epoch": 6.991869918699187, "grad_norm": 0.14282171428203583, "learning_rate": 1.2562336393015603e-05, "loss": 0.4533, "num_input_tokens_seen": 76162432, "step": 62780 }, { "epoch": 6.992426773582804, "grad_norm": 0.1054655984044075, "learning_rate": 1.2558121226233019e-05, "loss": 0.4705, "num_input_tokens_seen": 76168320, "step": 62785 }, { "epoch": 6.992983628466422, "grad_norm": 0.09697902202606201, "learning_rate": 1.2553906529544487e-05, "loss": 0.4644, "num_input_tokens_seen": 76173792, "step": 62790 }, { "epoch": 6.9935404833500385, "grad_norm": 0.08942801505327225, "learning_rate": 1.2549692303109262e-05, "loss": 0.4672, "num_input_tokens_seen": 76179840, "step": 62795 }, { "epoch": 6.994097338233656, "grad_norm": 0.11018025875091553, "learning_rate": 1.2545478547086551e-05, "loss": 0.4556, "num_input_tokens_seen": 76185792, "step": 62800 }, { "epoch": 6.994654193117274, "grad_norm": 0.09517841041088104, "learning_rate": 1.2541265261635581e-05, "loss": 0.4499, "num_input_tokens_seen": 76191840, "step": 62805 }, { "epoch": 6.995211048000891, "grad_norm": 0.09918297827243805, "learning_rate": 1.2537052446915526e-05, "loss": 0.4592, "num_input_tokens_seen": 76198016, "step": 62810 }, { "epoch": 6.995767902884508, "grad_norm": 0.09439436346292496, "learning_rate": 1.253284010308558e-05, "loss": 0.466, "num_input_tokens_seen": 76204288, "step": 62815 }, { "epoch": 6.996324757768126, "grad_norm": 0.10129523277282715, "learning_rate": 1.252862823030488e-05, "loss": 0.4679, "num_input_tokens_seen": 76210464, "step": 62820 }, { "epoch": 6.996881612651743, "grad_norm": 0.07799810916185379, "learning_rate": 1.2524416828732569e-05, "loss": 0.4513, "num_input_tokens_seen": 76216672, "step": 62825 }, { "epoch": 6.9974384675353605, "grad_norm": 0.08420680463314056, "learning_rate": 1.2520205898527781e-05, "loss": 0.4655, "num_input_tokens_seen": 76222816, "step": 62830 }, { "epoch": 6.997995322418977, "grad_norm": 0.09998102486133575, "learning_rate": 1.2515995439849598e-05, "loss": 0.4685, "num_input_tokens_seen": 76228640, "step": 62835 }, { "epoch": 6.998552177302595, "grad_norm": 0.08995790779590607, "learning_rate": 1.2511785452857122e-05, "loss": 0.4635, "num_input_tokens_seen": 76234976, "step": 62840 }, { "epoch": 6.999109032186213, "grad_norm": 0.12556904554367065, "learning_rate": 1.2507575937709403e-05, "loss": 0.4493, "num_input_tokens_seen": 76241408, "step": 62845 }, { "epoch": 6.999665887069829, "grad_norm": 0.055761635303497314, "learning_rate": 1.2503366894565499e-05, "loss": 0.4547, "num_input_tokens_seen": 76247680, "step": 62850 }, { "epoch": 7.000222741953447, "grad_norm": 0.10134099423885345, "learning_rate": 1.2499158323584454e-05, "loss": 0.465, "num_input_tokens_seen": 76253232, "step": 62855 }, { "epoch": 7.000779596837064, "grad_norm": 0.11201688647270203, "learning_rate": 1.2494950224925255e-05, "loss": 0.4549, "num_input_tokens_seen": 76259312, "step": 62860 }, { "epoch": 7.000779596837064, "eval_loss": 0.4643833637237549, "eval_runtime": 113.0259, "eval_samples_per_second": 35.31, "eval_steps_per_second": 8.83, "num_input_tokens_seen": 76259312, "step": 62860 }, { "epoch": 7.0013364517206815, "grad_norm": 0.07886173576116562, "learning_rate": 1.2490742598746922e-05, "loss": 0.4639, "num_input_tokens_seen": 76265328, "step": 62865 }, { "epoch": 7.001893306604299, "grad_norm": 0.12170913070440292, "learning_rate": 1.248653544520841e-05, "loss": 0.4547, "num_input_tokens_seen": 76271408, "step": 62870 }, { "epoch": 7.002450161487916, "grad_norm": 0.08379245549440384, "learning_rate": 1.2482328764468704e-05, "loss": 0.4668, "num_input_tokens_seen": 76277552, "step": 62875 }, { "epoch": 7.003007016371534, "grad_norm": 0.10348505526781082, "learning_rate": 1.247812255668672e-05, "loss": 0.4743, "num_input_tokens_seen": 76283632, "step": 62880 }, { "epoch": 7.003563871255151, "grad_norm": 0.10287214815616608, "learning_rate": 1.2473916822021396e-05, "loss": 0.4551, "num_input_tokens_seen": 76289872, "step": 62885 }, { "epoch": 7.004120726138768, "grad_norm": 0.12253980338573456, "learning_rate": 1.2469711560631642e-05, "loss": 0.4563, "num_input_tokens_seen": 76295888, "step": 62890 }, { "epoch": 7.004677581022386, "grad_norm": 0.08831649273633957, "learning_rate": 1.2465506772676334e-05, "loss": 0.4514, "num_input_tokens_seen": 76301648, "step": 62895 }, { "epoch": 7.005234435906003, "grad_norm": 0.15255852043628693, "learning_rate": 1.2461302458314358e-05, "loss": 0.4487, "num_input_tokens_seen": 76307760, "step": 62900 }, { "epoch": 7.00579129078962, "grad_norm": 0.09060227870941162, "learning_rate": 1.2457098617704546e-05, "loss": 0.4637, "num_input_tokens_seen": 76313808, "step": 62905 }, { "epoch": 7.006348145673238, "grad_norm": 0.10059540718793869, "learning_rate": 1.2452895251005756e-05, "loss": 0.457, "num_input_tokens_seen": 76319792, "step": 62910 }, { "epoch": 7.006905000556855, "grad_norm": 0.1194702759385109, "learning_rate": 1.2448692358376782e-05, "loss": 0.4699, "num_input_tokens_seen": 76326064, "step": 62915 }, { "epoch": 7.007461855440472, "grad_norm": 0.11851224303245544, "learning_rate": 1.2444489939976433e-05, "loss": 0.4577, "num_input_tokens_seen": 76331952, "step": 62920 }, { "epoch": 7.008018710324089, "grad_norm": 0.17084930837154388, "learning_rate": 1.2440287995963499e-05, "loss": 0.4533, "num_input_tokens_seen": 76337680, "step": 62925 }, { "epoch": 7.008575565207707, "grad_norm": 0.10147369652986526, "learning_rate": 1.2436086526496726e-05, "loss": 0.464, "num_input_tokens_seen": 76344016, "step": 62930 }, { "epoch": 7.0091324200913245, "grad_norm": 0.11369560658931732, "learning_rate": 1.2431885531734878e-05, "loss": 0.4494, "num_input_tokens_seen": 76350256, "step": 62935 }, { "epoch": 7.009689274974941, "grad_norm": 0.09737478941679001, "learning_rate": 1.2427685011836657e-05, "loss": 0.4637, "num_input_tokens_seen": 76356496, "step": 62940 }, { "epoch": 7.010246129858559, "grad_norm": 0.09558451175689697, "learning_rate": 1.2423484966960788e-05, "loss": 0.4584, "num_input_tokens_seen": 76362352, "step": 62945 }, { "epoch": 7.010802984742176, "grad_norm": 0.11266610771417618, "learning_rate": 1.2419285397265973e-05, "loss": 0.4539, "num_input_tokens_seen": 76368304, "step": 62950 }, { "epoch": 7.0113598396257935, "grad_norm": 0.1290552020072937, "learning_rate": 1.2415086302910862e-05, "loss": 0.4574, "num_input_tokens_seen": 76374320, "step": 62955 }, { "epoch": 7.011916694509411, "grad_norm": 0.09597428143024445, "learning_rate": 1.2410887684054126e-05, "loss": 0.455, "num_input_tokens_seen": 76380208, "step": 62960 }, { "epoch": 7.012473549393028, "grad_norm": 0.11476698517799377, "learning_rate": 1.240668954085439e-05, "loss": 0.4626, "num_input_tokens_seen": 76386192, "step": 62965 }, { "epoch": 7.013030404276646, "grad_norm": 0.08862622827291489, "learning_rate": 1.2402491873470293e-05, "loss": 0.4707, "num_input_tokens_seen": 76392560, "step": 62970 }, { "epoch": 7.013587259160263, "grad_norm": 0.10106322914361954, "learning_rate": 1.2398294682060412e-05, "loss": 0.4732, "num_input_tokens_seen": 76398864, "step": 62975 }, { "epoch": 7.01414411404388, "grad_norm": 0.10085751116275787, "learning_rate": 1.239409796678334e-05, "loss": 0.4732, "num_input_tokens_seen": 76405136, "step": 62980 }, { "epoch": 7.014700968927498, "grad_norm": 0.09761455655097961, "learning_rate": 1.2389901727797657e-05, "loss": 0.4667, "num_input_tokens_seen": 76411184, "step": 62985 }, { "epoch": 7.0152578238111145, "grad_norm": 0.10452652722597122, "learning_rate": 1.2385705965261887e-05, "loss": 0.4539, "num_input_tokens_seen": 76417072, "step": 62990 }, { "epoch": 7.015814678694732, "grad_norm": 0.11437740176916122, "learning_rate": 1.238151067933458e-05, "loss": 0.4614, "num_input_tokens_seen": 76422864, "step": 62995 }, { "epoch": 7.01637153357835, "grad_norm": 0.088633693754673, "learning_rate": 1.2377315870174228e-05, "loss": 0.446, "num_input_tokens_seen": 76428944, "step": 63000 }, { "epoch": 7.016928388461967, "grad_norm": 0.09810969233512878, "learning_rate": 1.2373121537939342e-05, "loss": 0.4673, "num_input_tokens_seen": 76435120, "step": 63005 }, { "epoch": 7.017485243345584, "grad_norm": 0.06362706422805786, "learning_rate": 1.2368927682788382e-05, "loss": 0.4595, "num_input_tokens_seen": 76441392, "step": 63010 }, { "epoch": 7.018042098229201, "grad_norm": 0.1019410640001297, "learning_rate": 1.2364734304879811e-05, "loss": 0.467, "num_input_tokens_seen": 76447152, "step": 63015 }, { "epoch": 7.018598953112819, "grad_norm": 0.10339415073394775, "learning_rate": 1.2360541404372077e-05, "loss": 0.4622, "num_input_tokens_seen": 76453200, "step": 63020 }, { "epoch": 7.0191558079964365, "grad_norm": 0.08723971992731094, "learning_rate": 1.2356348981423588e-05, "loss": 0.4604, "num_input_tokens_seen": 76459088, "step": 63025 }, { "epoch": 7.019712662880053, "grad_norm": 0.09357555210590363, "learning_rate": 1.235215703619276e-05, "loss": 0.4607, "num_input_tokens_seen": 76465392, "step": 63030 }, { "epoch": 7.020269517763671, "grad_norm": 0.1030716672539711, "learning_rate": 1.2347965568837975e-05, "loss": 0.4607, "num_input_tokens_seen": 76471344, "step": 63035 }, { "epoch": 7.020826372647288, "grad_norm": 0.11804778128862381, "learning_rate": 1.2343774579517583e-05, "loss": 0.4619, "num_input_tokens_seen": 76477456, "step": 63040 }, { "epoch": 7.021383227530905, "grad_norm": 0.081391341984272, "learning_rate": 1.2339584068389956e-05, "loss": 0.4573, "num_input_tokens_seen": 76483408, "step": 63045 }, { "epoch": 7.021940082414523, "grad_norm": 0.1320653110742569, "learning_rate": 1.2335394035613406e-05, "loss": 0.4559, "num_input_tokens_seen": 76489264, "step": 63050 }, { "epoch": 7.02249693729814, "grad_norm": 0.14032596349716187, "learning_rate": 1.2331204481346265e-05, "loss": 0.4541, "num_input_tokens_seen": 76495376, "step": 63055 }, { "epoch": 7.0230537921817575, "grad_norm": 0.10484003275632858, "learning_rate": 1.2327015405746808e-05, "loss": 0.4581, "num_input_tokens_seen": 76501488, "step": 63060 }, { "epoch": 7.023610647065375, "grad_norm": 0.08978428691625595, "learning_rate": 1.2322826808973324e-05, "loss": 0.4579, "num_input_tokens_seen": 76507792, "step": 63065 }, { "epoch": 7.024167501948992, "grad_norm": 0.0903576985001564, "learning_rate": 1.2318638691184075e-05, "loss": 0.4618, "num_input_tokens_seen": 76513616, "step": 63070 }, { "epoch": 7.02472435683261, "grad_norm": 0.08808393776416779, "learning_rate": 1.2314451052537288e-05, "loss": 0.4645, "num_input_tokens_seen": 76519120, "step": 63075 }, { "epoch": 7.025281211716226, "grad_norm": 0.0930781364440918, "learning_rate": 1.2310263893191201e-05, "loss": 0.4673, "num_input_tokens_seen": 76525296, "step": 63080 }, { "epoch": 7.025838066599844, "grad_norm": 0.08539561182260513, "learning_rate": 1.2306077213304004e-05, "loss": 0.4622, "num_input_tokens_seen": 76531632, "step": 63085 }, { "epoch": 7.026394921483462, "grad_norm": 0.09861545264720917, "learning_rate": 1.2301891013033898e-05, "loss": 0.4615, "num_input_tokens_seen": 76537584, "step": 63090 }, { "epoch": 7.026951776367079, "grad_norm": 0.10303837060928345, "learning_rate": 1.229770529253903e-05, "loss": 0.4584, "num_input_tokens_seen": 76543664, "step": 63095 }, { "epoch": 7.027508631250696, "grad_norm": 0.08705366402864456, "learning_rate": 1.2293520051977567e-05, "loss": 0.4655, "num_input_tokens_seen": 76549840, "step": 63100 }, { "epoch": 7.028065486134313, "grad_norm": 0.09402540326118469, "learning_rate": 1.2289335291507643e-05, "loss": 0.4659, "num_input_tokens_seen": 76555920, "step": 63105 }, { "epoch": 7.028622341017931, "grad_norm": 0.10399268567562103, "learning_rate": 1.2285151011287354e-05, "loss": 0.4559, "num_input_tokens_seen": 76561520, "step": 63110 }, { "epoch": 7.029179195901548, "grad_norm": 0.10630680620670319, "learning_rate": 1.2280967211474819e-05, "loss": 0.471, "num_input_tokens_seen": 76567536, "step": 63115 }, { "epoch": 7.029736050785165, "grad_norm": 0.06628779321908951, "learning_rate": 1.2276783892228094e-05, "loss": 0.4642, "num_input_tokens_seen": 76573776, "step": 63120 }, { "epoch": 7.030292905668783, "grad_norm": 0.1057434231042862, "learning_rate": 1.2272601053705246e-05, "loss": 0.4551, "num_input_tokens_seen": 76579280, "step": 63125 }, { "epoch": 7.0308497605524, "grad_norm": 0.13224533200263977, "learning_rate": 1.2268418696064327e-05, "loss": 0.4611, "num_input_tokens_seen": 76585520, "step": 63130 }, { "epoch": 7.031406615436017, "grad_norm": 0.13198179006576538, "learning_rate": 1.226423681946334e-05, "loss": 0.4732, "num_input_tokens_seen": 76591600, "step": 63135 }, { "epoch": 7.031963470319635, "grad_norm": 0.09349530935287476, "learning_rate": 1.2260055424060308e-05, "loss": 0.4622, "num_input_tokens_seen": 76597680, "step": 63140 }, { "epoch": 7.032520325203252, "grad_norm": 0.07389284670352936, "learning_rate": 1.2255874510013201e-05, "loss": 0.4636, "num_input_tokens_seen": 76603760, "step": 63145 }, { "epoch": 7.033077180086869, "grad_norm": 0.09641563892364502, "learning_rate": 1.2251694077480005e-05, "loss": 0.4637, "num_input_tokens_seen": 76609904, "step": 63150 }, { "epoch": 7.033634034970487, "grad_norm": 0.10011988878250122, "learning_rate": 1.224751412661865e-05, "loss": 0.4536, "num_input_tokens_seen": 76616304, "step": 63155 }, { "epoch": 7.034190889854104, "grad_norm": 0.13121962547302246, "learning_rate": 1.224333465758708e-05, "loss": 0.4604, "num_input_tokens_seen": 76622192, "step": 63160 }, { "epoch": 7.034747744737722, "grad_norm": 0.15485192835330963, "learning_rate": 1.2239155670543217e-05, "loss": 0.4638, "num_input_tokens_seen": 76628464, "step": 63165 }, { "epoch": 7.035304599621338, "grad_norm": 0.10328210890293121, "learning_rate": 1.2234977165644937e-05, "loss": 0.469, "num_input_tokens_seen": 76634640, "step": 63170 }, { "epoch": 7.035861454504956, "grad_norm": 0.09666433930397034, "learning_rate": 1.2230799143050137e-05, "loss": 0.4722, "num_input_tokens_seen": 76640528, "step": 63175 }, { "epoch": 7.036418309388574, "grad_norm": 0.08779445290565491, "learning_rate": 1.2226621602916658e-05, "loss": 0.4719, "num_input_tokens_seen": 76646800, "step": 63180 }, { "epoch": 7.0369751642721905, "grad_norm": 0.10050990432500839, "learning_rate": 1.2222444545402357e-05, "loss": 0.4613, "num_input_tokens_seen": 76652912, "step": 63185 }, { "epoch": 7.037532019155808, "grad_norm": 0.08816667646169662, "learning_rate": 1.2218267970665037e-05, "loss": 0.4527, "num_input_tokens_seen": 76659184, "step": 63190 }, { "epoch": 7.038088874039425, "grad_norm": 0.1492404043674469, "learning_rate": 1.2214091878862518e-05, "loss": 0.4527, "num_input_tokens_seen": 76665264, "step": 63195 }, { "epoch": 7.038645728923043, "grad_norm": 0.08898676931858063, "learning_rate": 1.220991627015259e-05, "loss": 0.4418, "num_input_tokens_seen": 76670960, "step": 63200 }, { "epoch": 7.03920258380666, "grad_norm": 0.10948783904314041, "learning_rate": 1.2205741144693003e-05, "loss": 0.4726, "num_input_tokens_seen": 76677168, "step": 63205 }, { "epoch": 7.039759438690277, "grad_norm": 0.09807416796684265, "learning_rate": 1.2201566502641528e-05, "loss": 0.4561, "num_input_tokens_seen": 76683504, "step": 63210 }, { "epoch": 7.040316293573895, "grad_norm": 0.12342599779367447, "learning_rate": 1.2197392344155876e-05, "loss": 0.4528, "num_input_tokens_seen": 76689488, "step": 63215 }, { "epoch": 7.0408731484575116, "grad_norm": 0.09862353652715683, "learning_rate": 1.2193218669393777e-05, "loss": 0.464, "num_input_tokens_seen": 76695600, "step": 63220 }, { "epoch": 7.041430003341129, "grad_norm": 0.07634125649929047, "learning_rate": 1.218904547851291e-05, "loss": 0.4583, "num_input_tokens_seen": 76701808, "step": 63225 }, { "epoch": 7.041986858224747, "grad_norm": 0.07820248603820801, "learning_rate": 1.2184872771670958e-05, "loss": 0.4478, "num_input_tokens_seen": 76708208, "step": 63230 }, { "epoch": 7.042543713108364, "grad_norm": 0.12751451134681702, "learning_rate": 1.2180700549025593e-05, "loss": 0.4493, "num_input_tokens_seen": 76714480, "step": 63235 }, { "epoch": 7.043100567991981, "grad_norm": 0.10994765907526016, "learning_rate": 1.2176528810734427e-05, "loss": 0.4635, "num_input_tokens_seen": 76720432, "step": 63240 }, { "epoch": 7.043657422875599, "grad_norm": 0.11051373183727264, "learning_rate": 1.2172357556955113e-05, "loss": 0.4419, "num_input_tokens_seen": 76726416, "step": 63245 }, { "epoch": 7.044214277759216, "grad_norm": 0.0964014008641243, "learning_rate": 1.2168186787845226e-05, "loss": 0.4645, "num_input_tokens_seen": 76732464, "step": 63250 }, { "epoch": 7.0447711326428335, "grad_norm": 0.09972156584262848, "learning_rate": 1.2164016503562367e-05, "loss": 0.4648, "num_input_tokens_seen": 76738320, "step": 63255 }, { "epoch": 7.04532798752645, "grad_norm": 0.11014986038208008, "learning_rate": 1.2159846704264105e-05, "loss": 0.4526, "num_input_tokens_seen": 76744208, "step": 63260 }, { "epoch": 7.045884842410068, "grad_norm": 0.11780068278312683, "learning_rate": 1.2155677390107975e-05, "loss": 0.4527, "num_input_tokens_seen": 76750352, "step": 63265 }, { "epoch": 7.046441697293686, "grad_norm": 0.0979543924331665, "learning_rate": 1.2151508561251526e-05, "loss": 0.4648, "num_input_tokens_seen": 76756176, "step": 63270 }, { "epoch": 7.046998552177302, "grad_norm": 0.126641184091568, "learning_rate": 1.2147340217852248e-05, "loss": 0.4494, "num_input_tokens_seen": 76762544, "step": 63275 }, { "epoch": 7.04755540706092, "grad_norm": 0.08562914282083511, "learning_rate": 1.2143172360067654e-05, "loss": 0.4685, "num_input_tokens_seen": 76768528, "step": 63280 }, { "epoch": 7.048112261944537, "grad_norm": 0.07785896956920624, "learning_rate": 1.21390049880552e-05, "loss": 0.4556, "num_input_tokens_seen": 76774512, "step": 63285 }, { "epoch": 7.048669116828155, "grad_norm": 0.11997655034065247, "learning_rate": 1.2134838101972353e-05, "loss": 0.4566, "num_input_tokens_seen": 76780528, "step": 63290 }, { "epoch": 7.049225971711772, "grad_norm": 0.10056914389133453, "learning_rate": 1.2130671701976562e-05, "loss": 0.4601, "num_input_tokens_seen": 76786736, "step": 63295 }, { "epoch": 7.049782826595389, "grad_norm": 0.09263204783201218, "learning_rate": 1.2126505788225224e-05, "loss": 0.459, "num_input_tokens_seen": 76792304, "step": 63300 }, { "epoch": 7.050339681479007, "grad_norm": 0.13308225572109222, "learning_rate": 1.2122340360875762e-05, "loss": 0.4719, "num_input_tokens_seen": 76798480, "step": 63305 }, { "epoch": 7.0508965363626235, "grad_norm": 0.08848781138658524, "learning_rate": 1.2118175420085543e-05, "loss": 0.4619, "num_input_tokens_seen": 76804528, "step": 63310 }, { "epoch": 7.051453391246241, "grad_norm": 0.09562533348798752, "learning_rate": 1.2114010966011945e-05, "loss": 0.4587, "num_input_tokens_seen": 76810608, "step": 63315 }, { "epoch": 7.052010246129859, "grad_norm": 0.09287899732589722, "learning_rate": 1.21098469988123e-05, "loss": 0.4616, "num_input_tokens_seen": 76816528, "step": 63320 }, { "epoch": 7.052567101013476, "grad_norm": 0.07445050776004791, "learning_rate": 1.2105683518643942e-05, "loss": 0.4528, "num_input_tokens_seen": 76822544, "step": 63325 }, { "epoch": 7.053123955897093, "grad_norm": 0.08316043764352798, "learning_rate": 1.2101520525664196e-05, "loss": 0.4632, "num_input_tokens_seen": 76828592, "step": 63330 }, { "epoch": 7.053680810780711, "grad_norm": 0.10229017585515976, "learning_rate": 1.2097358020030326e-05, "loss": 0.4793, "num_input_tokens_seen": 76834512, "step": 63335 }, { "epoch": 7.054237665664328, "grad_norm": 0.09389987587928772, "learning_rate": 1.209319600189963e-05, "loss": 0.4636, "num_input_tokens_seen": 76840464, "step": 63340 }, { "epoch": 7.054794520547945, "grad_norm": 0.11941719800233841, "learning_rate": 1.208903447142934e-05, "loss": 0.4638, "num_input_tokens_seen": 76846608, "step": 63345 }, { "epoch": 7.055351375431562, "grad_norm": 0.09575651586055756, "learning_rate": 1.2084873428776705e-05, "loss": 0.4511, "num_input_tokens_seen": 76852784, "step": 63350 }, { "epoch": 7.05590823031518, "grad_norm": 0.0791454166173935, "learning_rate": 1.2080712874098949e-05, "loss": 0.4636, "num_input_tokens_seen": 76859088, "step": 63355 }, { "epoch": 7.056465085198798, "grad_norm": 0.0938224345445633, "learning_rate": 1.207655280755325e-05, "loss": 0.4574, "num_input_tokens_seen": 76864752, "step": 63360 }, { "epoch": 7.057021940082414, "grad_norm": 0.09187735617160797, "learning_rate": 1.2072393229296817e-05, "loss": 0.4605, "num_input_tokens_seen": 76870480, "step": 63365 }, { "epoch": 7.057578794966032, "grad_norm": 0.12352971732616425, "learning_rate": 1.206823413948678e-05, "loss": 0.4641, "num_input_tokens_seen": 76876272, "step": 63370 }, { "epoch": 7.058135649849649, "grad_norm": 0.10718967020511627, "learning_rate": 1.2064075538280314e-05, "loss": 0.4595, "num_input_tokens_seen": 76881712, "step": 63375 }, { "epoch": 7.0586925047332665, "grad_norm": 0.14924286305904388, "learning_rate": 1.205991742583452e-05, "loss": 0.4703, "num_input_tokens_seen": 76887376, "step": 63380 }, { "epoch": 7.059249359616884, "grad_norm": 0.09327191859483719, "learning_rate": 1.205575980230651e-05, "loss": 0.4528, "num_input_tokens_seen": 76893392, "step": 63385 }, { "epoch": 7.059806214500501, "grad_norm": 0.08179177343845367, "learning_rate": 1.2051602667853387e-05, "loss": 0.4662, "num_input_tokens_seen": 76899536, "step": 63390 }, { "epoch": 7.060363069384119, "grad_norm": 0.11938072741031647, "learning_rate": 1.2047446022632202e-05, "loss": 0.469, "num_input_tokens_seen": 76905584, "step": 63395 }, { "epoch": 7.060919924267735, "grad_norm": 0.08404109627008438, "learning_rate": 1.2043289866800025e-05, "loss": 0.469, "num_input_tokens_seen": 76911920, "step": 63400 }, { "epoch": 7.061476779151353, "grad_norm": 0.09837311506271362, "learning_rate": 1.2039134200513869e-05, "loss": 0.4642, "num_input_tokens_seen": 76918160, "step": 63405 }, { "epoch": 7.062033634034971, "grad_norm": 0.096242755651474, "learning_rate": 1.2034979023930768e-05, "loss": 0.4779, "num_input_tokens_seen": 76924240, "step": 63410 }, { "epoch": 7.0625904889185875, "grad_norm": 0.13194558024406433, "learning_rate": 1.2030824337207696e-05, "loss": 0.4527, "num_input_tokens_seen": 76930128, "step": 63415 }, { "epoch": 7.063147343802205, "grad_norm": 0.10136997699737549, "learning_rate": 1.2026670140501642e-05, "loss": 0.456, "num_input_tokens_seen": 76936144, "step": 63420 }, { "epoch": 7.063704198685823, "grad_norm": 0.10058451443910599, "learning_rate": 1.2022516433969578e-05, "loss": 0.46, "num_input_tokens_seen": 76942128, "step": 63425 }, { "epoch": 7.06426105356944, "grad_norm": 0.16077697277069092, "learning_rate": 1.2018363217768419e-05, "loss": 0.458, "num_input_tokens_seen": 76948368, "step": 63430 }, { "epoch": 7.064817908453057, "grad_norm": 0.09758654236793518, "learning_rate": 1.2014210492055112e-05, "loss": 0.4823, "num_input_tokens_seen": 76954384, "step": 63435 }, { "epoch": 7.065374763336674, "grad_norm": 0.12189845740795135, "learning_rate": 1.2010058256986534e-05, "loss": 0.4612, "num_input_tokens_seen": 76960368, "step": 63440 }, { "epoch": 7.065931618220292, "grad_norm": 0.07767040282487869, "learning_rate": 1.2005906512719598e-05, "loss": 0.4643, "num_input_tokens_seen": 76966256, "step": 63445 }, { "epoch": 7.0664884731039095, "grad_norm": 0.0759098082780838, "learning_rate": 1.2001755259411151e-05, "loss": 0.4605, "num_input_tokens_seen": 76972592, "step": 63450 }, { "epoch": 7.067045327987526, "grad_norm": 0.13279898464679718, "learning_rate": 1.1997604497218038e-05, "loss": 0.4706, "num_input_tokens_seen": 76978448, "step": 63455 }, { "epoch": 7.067602182871144, "grad_norm": 0.07935818284749985, "learning_rate": 1.1993454226297104e-05, "loss": 0.4432, "num_input_tokens_seen": 76984528, "step": 63460 }, { "epoch": 7.068159037754761, "grad_norm": 0.09781394153833389, "learning_rate": 1.1989304446805141e-05, "loss": 0.4649, "num_input_tokens_seen": 76991056, "step": 63465 }, { "epoch": 7.068715892638378, "grad_norm": 0.08146613836288452, "learning_rate": 1.198515515889895e-05, "loss": 0.453, "num_input_tokens_seen": 76997552, "step": 63470 }, { "epoch": 7.069272747521996, "grad_norm": 0.14645910263061523, "learning_rate": 1.1981006362735316e-05, "loss": 0.4703, "num_input_tokens_seen": 77003856, "step": 63475 }, { "epoch": 7.069829602405613, "grad_norm": 0.07600656896829605, "learning_rate": 1.1976858058470975e-05, "loss": 0.459, "num_input_tokens_seen": 77009072, "step": 63480 }, { "epoch": 7.0703864572892305, "grad_norm": 0.1073489785194397, "learning_rate": 1.197271024626268e-05, "loss": 0.4676, "num_input_tokens_seen": 77015120, "step": 63485 }, { "epoch": 7.070943312172847, "grad_norm": 0.08391815423965454, "learning_rate": 1.196856292626713e-05, "loss": 0.4762, "num_input_tokens_seen": 77021488, "step": 63490 }, { "epoch": 7.071500167056465, "grad_norm": 0.13400036096572876, "learning_rate": 1.1964416098641043e-05, "loss": 0.4598, "num_input_tokens_seen": 77027632, "step": 63495 }, { "epoch": 7.072057021940083, "grad_norm": 0.13994066417217255, "learning_rate": 1.1960269763541079e-05, "loss": 0.4534, "num_input_tokens_seen": 77033520, "step": 63500 }, { "epoch": 7.0726138768236995, "grad_norm": 0.08678624778985977, "learning_rate": 1.1956123921123915e-05, "loss": 0.4574, "num_input_tokens_seen": 77039760, "step": 63505 }, { "epoch": 7.073170731707317, "grad_norm": 0.10334321856498718, "learning_rate": 1.19519785715462e-05, "loss": 0.4701, "num_input_tokens_seen": 77045680, "step": 63510 }, { "epoch": 7.073727586590935, "grad_norm": 0.08810169249773026, "learning_rate": 1.1947833714964535e-05, "loss": 0.4684, "num_input_tokens_seen": 77052176, "step": 63515 }, { "epoch": 7.074284441474552, "grad_norm": 0.08016480505466461, "learning_rate": 1.1943689351535553e-05, "loss": 0.4663, "num_input_tokens_seen": 77058160, "step": 63520 }, { "epoch": 7.074841296358169, "grad_norm": 0.10185764729976654, "learning_rate": 1.1939545481415817e-05, "loss": 0.459, "num_input_tokens_seen": 77064400, "step": 63525 }, { "epoch": 7.075398151241786, "grad_norm": 0.11437927186489105, "learning_rate": 1.1935402104761909e-05, "loss": 0.4732, "num_input_tokens_seen": 77070672, "step": 63530 }, { "epoch": 7.075955006125404, "grad_norm": 0.0820380300283432, "learning_rate": 1.1931259221730384e-05, "loss": 0.462, "num_input_tokens_seen": 77076688, "step": 63535 }, { "epoch": 7.076511861009021, "grad_norm": 0.08104304224252701, "learning_rate": 1.192711683247776e-05, "loss": 0.4572, "num_input_tokens_seen": 77082704, "step": 63540 }, { "epoch": 7.077068715892638, "grad_norm": 0.10034050047397614, "learning_rate": 1.1922974937160564e-05, "loss": 0.4631, "num_input_tokens_seen": 77089104, "step": 63545 }, { "epoch": 7.077625570776256, "grad_norm": 0.10747089982032776, "learning_rate": 1.1918833535935273e-05, "loss": 0.4701, "num_input_tokens_seen": 77094864, "step": 63550 }, { "epoch": 7.078182425659873, "grad_norm": 0.10430502146482468, "learning_rate": 1.1914692628958382e-05, "loss": 0.4514, "num_input_tokens_seen": 77101168, "step": 63555 }, { "epoch": 7.07873928054349, "grad_norm": 0.12574860453605652, "learning_rate": 1.1910552216386326e-05, "loss": 0.4547, "num_input_tokens_seen": 77107184, "step": 63560 }, { "epoch": 7.079296135427108, "grad_norm": 0.1557403951883316, "learning_rate": 1.1906412298375554e-05, "loss": 0.4496, "num_input_tokens_seen": 77113616, "step": 63565 }, { "epoch": 7.079852990310725, "grad_norm": 0.11567346751689911, "learning_rate": 1.1902272875082499e-05, "loss": 0.4541, "num_input_tokens_seen": 77119824, "step": 63570 }, { "epoch": 7.0804098451943425, "grad_norm": 0.11952172219753265, "learning_rate": 1.1898133946663537e-05, "loss": 0.4623, "num_input_tokens_seen": 77125488, "step": 63575 }, { "epoch": 7.080966700077959, "grad_norm": 0.11651977151632309, "learning_rate": 1.1893995513275069e-05, "loss": 0.4544, "num_input_tokens_seen": 77131216, "step": 63580 }, { "epoch": 7.081523554961577, "grad_norm": 0.09981272369623184, "learning_rate": 1.1889857575073443e-05, "loss": 0.4586, "num_input_tokens_seen": 77137392, "step": 63585 }, { "epoch": 7.082080409845195, "grad_norm": 0.15640634298324585, "learning_rate": 1.1885720132215023e-05, "loss": 0.4625, "num_input_tokens_seen": 77142960, "step": 63590 }, { "epoch": 7.082637264728811, "grad_norm": 0.12720920145511627, "learning_rate": 1.188158318485611e-05, "loss": 0.4679, "num_input_tokens_seen": 77149008, "step": 63595 }, { "epoch": 7.083194119612429, "grad_norm": 0.10059396922588348, "learning_rate": 1.1877446733153025e-05, "loss": 0.4599, "num_input_tokens_seen": 77154800, "step": 63600 }, { "epoch": 7.083750974496047, "grad_norm": 0.11611480265855789, "learning_rate": 1.1873310777262067e-05, "loss": 0.4514, "num_input_tokens_seen": 77161488, "step": 63605 }, { "epoch": 7.0843078293796635, "grad_norm": 0.09030497819185257, "learning_rate": 1.1869175317339484e-05, "loss": 0.4639, "num_input_tokens_seen": 77167632, "step": 63610 }, { "epoch": 7.084864684263281, "grad_norm": 0.1091119647026062, "learning_rate": 1.1865040353541545e-05, "loss": 0.4632, "num_input_tokens_seen": 77173616, "step": 63615 }, { "epoch": 7.085421539146898, "grad_norm": 0.13070757687091827, "learning_rate": 1.1860905886024468e-05, "loss": 0.4647, "num_input_tokens_seen": 77179824, "step": 63620 }, { "epoch": 7.085978394030516, "grad_norm": 0.10473110526800156, "learning_rate": 1.1856771914944473e-05, "loss": 0.4629, "num_input_tokens_seen": 77185776, "step": 63625 }, { "epoch": 7.086535248914133, "grad_norm": 0.09939336031675339, "learning_rate": 1.185263844045776e-05, "loss": 0.4699, "num_input_tokens_seen": 77191792, "step": 63630 }, { "epoch": 7.08709210379775, "grad_norm": 0.10835956037044525, "learning_rate": 1.1848505462720495e-05, "loss": 0.4539, "num_input_tokens_seen": 77197904, "step": 63635 }, { "epoch": 7.087648958681368, "grad_norm": 0.10433298349380493, "learning_rate": 1.1844372981888846e-05, "loss": 0.4596, "num_input_tokens_seen": 77203856, "step": 63640 }, { "epoch": 7.088205813564985, "grad_norm": 0.11614413559436798, "learning_rate": 1.1840240998118937e-05, "loss": 0.4676, "num_input_tokens_seen": 77209776, "step": 63645 }, { "epoch": 7.088762668448602, "grad_norm": 0.1279139220714569, "learning_rate": 1.1836109511566903e-05, "loss": 0.4708, "num_input_tokens_seen": 77216272, "step": 63650 }, { "epoch": 7.08931952333222, "grad_norm": 0.09528693556785583, "learning_rate": 1.1831978522388832e-05, "loss": 0.4585, "num_input_tokens_seen": 77222352, "step": 63655 }, { "epoch": 7.089876378215837, "grad_norm": 0.12320549041032791, "learning_rate": 1.1827848030740806e-05, "loss": 0.4512, "num_input_tokens_seen": 77228528, "step": 63660 }, { "epoch": 7.090433233099454, "grad_norm": 0.10318811237812042, "learning_rate": 1.1823718036778908e-05, "loss": 0.4607, "num_input_tokens_seen": 77234416, "step": 63665 }, { "epoch": 7.090990087983071, "grad_norm": 0.09489788860082626, "learning_rate": 1.1819588540659155e-05, "loss": 0.456, "num_input_tokens_seen": 77240112, "step": 63670 }, { "epoch": 7.091546942866689, "grad_norm": 0.08923163264989853, "learning_rate": 1.1815459542537596e-05, "loss": 0.4609, "num_input_tokens_seen": 77246512, "step": 63675 }, { "epoch": 7.0921037977503065, "grad_norm": 0.1006910651922226, "learning_rate": 1.181133104257022e-05, "loss": 0.4578, "num_input_tokens_seen": 77252336, "step": 63680 }, { "epoch": 7.092660652633923, "grad_norm": 0.10144759714603424, "learning_rate": 1.180720304091303e-05, "loss": 0.4576, "num_input_tokens_seen": 77258736, "step": 63685 }, { "epoch": 7.093217507517541, "grad_norm": 0.08766376972198486, "learning_rate": 1.1803075537721977e-05, "loss": 0.4763, "num_input_tokens_seen": 77264944, "step": 63690 }, { "epoch": 7.093774362401159, "grad_norm": 0.11288636177778244, "learning_rate": 1.1798948533153026e-05, "loss": 0.4587, "num_input_tokens_seen": 77270960, "step": 63695 }, { "epoch": 7.0943312172847754, "grad_norm": 0.10584433376789093, "learning_rate": 1.179482202736211e-05, "loss": 0.4659, "num_input_tokens_seen": 77277168, "step": 63700 }, { "epoch": 7.094888072168393, "grad_norm": 0.14555588364601135, "learning_rate": 1.1790696020505127e-05, "loss": 0.4686, "num_input_tokens_seen": 77282896, "step": 63705 }, { "epoch": 7.09544492705201, "grad_norm": 0.07757294923067093, "learning_rate": 1.178657051273799e-05, "loss": 0.4621, "num_input_tokens_seen": 77289296, "step": 63710 }, { "epoch": 7.096001781935628, "grad_norm": 0.08686695992946625, "learning_rate": 1.1782445504216552e-05, "loss": 0.4498, "num_input_tokens_seen": 77294832, "step": 63715 }, { "epoch": 7.096558636819245, "grad_norm": 0.14152668416500092, "learning_rate": 1.1778320995096692e-05, "loss": 0.4636, "num_input_tokens_seen": 77300784, "step": 63720 }, { "epoch": 7.097115491702862, "grad_norm": 0.10313332825899124, "learning_rate": 1.1774196985534227e-05, "loss": 0.4566, "num_input_tokens_seen": 77306640, "step": 63725 }, { "epoch": 7.09767234658648, "grad_norm": 0.07823216915130615, "learning_rate": 1.1770073475684984e-05, "loss": 0.4522, "num_input_tokens_seen": 77312816, "step": 63730 }, { "epoch": 7.0982292014700965, "grad_norm": 0.08905612677335739, "learning_rate": 1.176595046570477e-05, "loss": 0.4613, "num_input_tokens_seen": 77318928, "step": 63735 }, { "epoch": 7.098786056353714, "grad_norm": 0.11372676491737366, "learning_rate": 1.176182795574935e-05, "loss": 0.4607, "num_input_tokens_seen": 77323856, "step": 63740 }, { "epoch": 7.099342911237332, "grad_norm": 0.16498786211013794, "learning_rate": 1.1757705945974501e-05, "loss": 0.4695, "num_input_tokens_seen": 77330064, "step": 63745 }, { "epoch": 7.099899766120949, "grad_norm": 0.11549884080886841, "learning_rate": 1.1753584436535953e-05, "loss": 0.4566, "num_input_tokens_seen": 77336496, "step": 63750 }, { "epoch": 7.100456621004566, "grad_norm": 0.08767355233430862, "learning_rate": 1.174946342758943e-05, "loss": 0.4436, "num_input_tokens_seen": 77342416, "step": 63755 }, { "epoch": 7.101013475888184, "grad_norm": 0.10670214146375656, "learning_rate": 1.1745342919290655e-05, "loss": 0.4536, "num_input_tokens_seen": 77348272, "step": 63760 }, { "epoch": 7.101570330771801, "grad_norm": 0.09548980742692947, "learning_rate": 1.1741222911795291e-05, "loss": 0.4747, "num_input_tokens_seen": 77354032, "step": 63765 }, { "epoch": 7.1021271856554185, "grad_norm": 0.08390134572982788, "learning_rate": 1.1737103405259025e-05, "loss": 0.4565, "num_input_tokens_seen": 77360048, "step": 63770 }, { "epoch": 7.102684040539035, "grad_norm": 0.08508412539958954, "learning_rate": 1.1732984399837485e-05, "loss": 0.4486, "num_input_tokens_seen": 77365968, "step": 63775 }, { "epoch": 7.103240895422653, "grad_norm": 0.09195159375667572, "learning_rate": 1.1728865895686323e-05, "loss": 0.4599, "num_input_tokens_seen": 77372016, "step": 63780 }, { "epoch": 7.103797750306271, "grad_norm": 0.08461636304855347, "learning_rate": 1.1724747892961125e-05, "loss": 0.4544, "num_input_tokens_seen": 77378288, "step": 63785 }, { "epoch": 7.104354605189887, "grad_norm": 0.13143101334571838, "learning_rate": 1.1720630391817492e-05, "loss": 0.4609, "num_input_tokens_seen": 77384592, "step": 63790 }, { "epoch": 7.104911460073505, "grad_norm": 0.08885136246681213, "learning_rate": 1.171651339241101e-05, "loss": 0.4512, "num_input_tokens_seen": 77391152, "step": 63795 }, { "epoch": 7.105468314957122, "grad_norm": 0.17235365509986877, "learning_rate": 1.1712396894897213e-05, "loss": 0.463, "num_input_tokens_seen": 77397264, "step": 63800 }, { "epoch": 7.1060251698407395, "grad_norm": 0.10317033529281616, "learning_rate": 1.170828089943165e-05, "loss": 0.4707, "num_input_tokens_seen": 77403568, "step": 63805 }, { "epoch": 7.106582024724357, "grad_norm": 0.11129911988973618, "learning_rate": 1.1704165406169823e-05, "loss": 0.4634, "num_input_tokens_seen": 77409680, "step": 63810 }, { "epoch": 7.107138879607974, "grad_norm": 0.09193453937768936, "learning_rate": 1.1700050415267247e-05, "loss": 0.4581, "num_input_tokens_seen": 77415952, "step": 63815 }, { "epoch": 7.107695734491592, "grad_norm": 0.1135406345129013, "learning_rate": 1.1695935926879375e-05, "loss": 0.4632, "num_input_tokens_seen": 77421904, "step": 63820 }, { "epoch": 7.108252589375208, "grad_norm": 0.09565787017345428, "learning_rate": 1.1691821941161679e-05, "loss": 0.4522, "num_input_tokens_seen": 77428144, "step": 63825 }, { "epoch": 7.108809444258826, "grad_norm": 0.1179635152220726, "learning_rate": 1.1687708458269609e-05, "loss": 0.4642, "num_input_tokens_seen": 77434032, "step": 63830 }, { "epoch": 7.109366299142444, "grad_norm": 0.09536311775445938, "learning_rate": 1.1683595478358566e-05, "loss": 0.4568, "num_input_tokens_seen": 77440176, "step": 63835 }, { "epoch": 7.109923154026061, "grad_norm": 0.12576179206371307, "learning_rate": 1.167948300158397e-05, "loss": 0.4675, "num_input_tokens_seen": 77446096, "step": 63840 }, { "epoch": 7.110480008909678, "grad_norm": 0.1346486508846283, "learning_rate": 1.1675371028101181e-05, "loss": 0.4665, "num_input_tokens_seen": 77452368, "step": 63845 }, { "epoch": 7.111036863793295, "grad_norm": 0.10904621332883835, "learning_rate": 1.1671259558065578e-05, "loss": 0.4754, "num_input_tokens_seen": 77458384, "step": 63850 }, { "epoch": 7.111593718676913, "grad_norm": 0.08503366261720657, "learning_rate": 1.166714859163252e-05, "loss": 0.4577, "num_input_tokens_seen": 77464528, "step": 63855 }, { "epoch": 7.11215057356053, "grad_norm": 0.10831504315137863, "learning_rate": 1.1663038128957299e-05, "loss": 0.4598, "num_input_tokens_seen": 77470704, "step": 63860 }, { "epoch": 7.112707428444147, "grad_norm": 0.12707141041755676, "learning_rate": 1.1658928170195246e-05, "loss": 0.4574, "num_input_tokens_seen": 77476848, "step": 63865 }, { "epoch": 7.113264283327765, "grad_norm": 0.09581948071718216, "learning_rate": 1.165481871550163e-05, "loss": 0.4615, "num_input_tokens_seen": 77482832, "step": 63870 }, { "epoch": 7.1138211382113825, "grad_norm": 0.16982655227184296, "learning_rate": 1.1650709765031731e-05, "loss": 0.4729, "num_input_tokens_seen": 77488560, "step": 63875 }, { "epoch": 7.114377993094999, "grad_norm": 0.1183643490076065, "learning_rate": 1.1646601318940806e-05, "loss": 0.4726, "num_input_tokens_seen": 77494128, "step": 63880 }, { "epoch": 7.114934847978617, "grad_norm": 0.11373934894800186, "learning_rate": 1.1642493377384068e-05, "loss": 0.4598, "num_input_tokens_seen": 77500368, "step": 63885 }, { "epoch": 7.115491702862234, "grad_norm": 0.10307863354682922, "learning_rate": 1.1638385940516744e-05, "loss": 0.4609, "num_input_tokens_seen": 77506256, "step": 63890 }, { "epoch": 7.116048557745851, "grad_norm": 0.16055257618427277, "learning_rate": 1.163427900849401e-05, "loss": 0.4576, "num_input_tokens_seen": 77512624, "step": 63895 }, { "epoch": 7.116605412629469, "grad_norm": 0.11194074898958206, "learning_rate": 1.1630172581471046e-05, "loss": 0.4577, "num_input_tokens_seen": 77518832, "step": 63900 }, { "epoch": 7.117162267513086, "grad_norm": 0.09410092979669571, "learning_rate": 1.1626066659603022e-05, "loss": 0.4679, "num_input_tokens_seen": 77525040, "step": 63905 }, { "epoch": 7.117719122396704, "grad_norm": 0.10395926982164383, "learning_rate": 1.1621961243045043e-05, "loss": 0.4546, "num_input_tokens_seen": 77531344, "step": 63910 }, { "epoch": 7.11827597728032, "grad_norm": 0.11931313574314117, "learning_rate": 1.1617856331952253e-05, "loss": 0.4601, "num_input_tokens_seen": 77537584, "step": 63915 }, { "epoch": 7.118832832163938, "grad_norm": 0.10835643112659454, "learning_rate": 1.1613751926479726e-05, "loss": 0.4454, "num_input_tokens_seen": 77543312, "step": 63920 }, { "epoch": 7.119389687047556, "grad_norm": 0.12211915850639343, "learning_rate": 1.1609648026782558e-05, "loss": 0.4573, "num_input_tokens_seen": 77549616, "step": 63925 }, { "epoch": 7.1199465419311725, "grad_norm": 0.13857945799827576, "learning_rate": 1.1605544633015792e-05, "loss": 0.4531, "num_input_tokens_seen": 77555184, "step": 63930 }, { "epoch": 7.12050339681479, "grad_norm": 0.09696629643440247, "learning_rate": 1.1601441745334471e-05, "loss": 0.4639, "num_input_tokens_seen": 77561200, "step": 63935 }, { "epoch": 7.121060251698408, "grad_norm": 0.11289265006780624, "learning_rate": 1.1597339363893631e-05, "loss": 0.4566, "num_input_tokens_seen": 77567280, "step": 63940 }, { "epoch": 7.121617106582025, "grad_norm": 0.10971051454544067, "learning_rate": 1.1593237488848249e-05, "loss": 0.4572, "num_input_tokens_seen": 77573424, "step": 63945 }, { "epoch": 7.122173961465642, "grad_norm": 0.13197945058345795, "learning_rate": 1.158913612035333e-05, "loss": 0.4565, "num_input_tokens_seen": 77579600, "step": 63950 }, { "epoch": 7.122730816349259, "grad_norm": 0.09706494957208633, "learning_rate": 1.1585035258563818e-05, "loss": 0.4613, "num_input_tokens_seen": 77585776, "step": 63955 }, { "epoch": 7.123287671232877, "grad_norm": 0.11317824572324753, "learning_rate": 1.158093490363467e-05, "loss": 0.4593, "num_input_tokens_seen": 77591888, "step": 63960 }, { "epoch": 7.123844526116494, "grad_norm": 0.17047272622585297, "learning_rate": 1.1576835055720797e-05, "loss": 0.4705, "num_input_tokens_seen": 77598160, "step": 63965 }, { "epoch": 7.124401381000111, "grad_norm": 0.08735176175832748, "learning_rate": 1.1572735714977112e-05, "loss": 0.4691, "num_input_tokens_seen": 77604144, "step": 63970 }, { "epoch": 7.124958235883729, "grad_norm": 0.11234577000141144, "learning_rate": 1.1568636881558508e-05, "loss": 0.4646, "num_input_tokens_seen": 77610160, "step": 63975 }, { "epoch": 7.125515090767346, "grad_norm": 0.09943870455026627, "learning_rate": 1.1564538555619842e-05, "loss": 0.474, "num_input_tokens_seen": 77616112, "step": 63980 }, { "epoch": 7.126071945650963, "grad_norm": 0.21308660507202148, "learning_rate": 1.156044073731597e-05, "loss": 0.468, "num_input_tokens_seen": 77622480, "step": 63985 }, { "epoch": 7.126628800534581, "grad_norm": 0.1042466089129448, "learning_rate": 1.155634342680171e-05, "loss": 0.4571, "num_input_tokens_seen": 77628528, "step": 63990 }, { "epoch": 7.127185655418198, "grad_norm": 0.10901288688182831, "learning_rate": 1.1552246624231886e-05, "loss": 0.4616, "num_input_tokens_seen": 77634512, "step": 63995 }, { "epoch": 7.1277425103018155, "grad_norm": 0.15531469881534576, "learning_rate": 1.154815032976127e-05, "loss": 0.4599, "num_input_tokens_seen": 77640688, "step": 64000 }, { "epoch": 7.128299365185432, "grad_norm": 0.10774198919534683, "learning_rate": 1.1544054543544641e-05, "loss": 0.4743, "num_input_tokens_seen": 77647120, "step": 64005 }, { "epoch": 7.12885622006905, "grad_norm": 0.08249541372060776, "learning_rate": 1.1539959265736766e-05, "loss": 0.4658, "num_input_tokens_seen": 77652464, "step": 64010 }, { "epoch": 7.129413074952668, "grad_norm": 0.10985992848873138, "learning_rate": 1.1535864496492354e-05, "loss": 0.4795, "num_input_tokens_seen": 77658608, "step": 64015 }, { "epoch": 7.129969929836284, "grad_norm": 0.0977957621216774, "learning_rate": 1.1531770235966138e-05, "loss": 0.4513, "num_input_tokens_seen": 77664688, "step": 64020 }, { "epoch": 7.130526784719902, "grad_norm": 0.14073246717453003, "learning_rate": 1.1527676484312796e-05, "loss": 0.4495, "num_input_tokens_seen": 77671024, "step": 64025 }, { "epoch": 7.131083639603519, "grad_norm": 0.13245825469493866, "learning_rate": 1.1523583241687009e-05, "loss": 0.465, "num_input_tokens_seen": 77676816, "step": 64030 }, { "epoch": 7.1316404944871366, "grad_norm": 0.11289049685001373, "learning_rate": 1.1519490508243444e-05, "loss": 0.4592, "num_input_tokens_seen": 77683024, "step": 64035 }, { "epoch": 7.132197349370754, "grad_norm": 0.11122515052556992, "learning_rate": 1.1515398284136719e-05, "loss": 0.4685, "num_input_tokens_seen": 77689104, "step": 64040 }, { "epoch": 7.132754204254371, "grad_norm": 0.11164149641990662, "learning_rate": 1.1511306569521471e-05, "loss": 0.4649, "num_input_tokens_seen": 77695312, "step": 64045 }, { "epoch": 7.133311059137989, "grad_norm": 0.11793774366378784, "learning_rate": 1.1507215364552276e-05, "loss": 0.4535, "num_input_tokens_seen": 77701776, "step": 64050 }, { "epoch": 7.133867914021606, "grad_norm": 0.0869205966591835, "learning_rate": 1.1503124669383734e-05, "loss": 0.4571, "num_input_tokens_seen": 77707792, "step": 64055 }, { "epoch": 7.134424768905223, "grad_norm": 0.09927891939878464, "learning_rate": 1.1499034484170385e-05, "loss": 0.4454, "num_input_tokens_seen": 77713808, "step": 64060 }, { "epoch": 7.134981623788841, "grad_norm": 0.09785700589418411, "learning_rate": 1.1494944809066782e-05, "loss": 0.4656, "num_input_tokens_seen": 77719856, "step": 64065 }, { "epoch": 7.135538478672458, "grad_norm": 0.13015124201774597, "learning_rate": 1.149085564422745e-05, "loss": 0.4518, "num_input_tokens_seen": 77725968, "step": 64070 }, { "epoch": 7.136095333556075, "grad_norm": 0.09674721211194992, "learning_rate": 1.1486766989806875e-05, "loss": 0.4676, "num_input_tokens_seen": 77731440, "step": 64075 }, { "epoch": 7.136652188439693, "grad_norm": 0.1026933565735817, "learning_rate": 1.1482678845959557e-05, "loss": 0.462, "num_input_tokens_seen": 77737104, "step": 64080 }, { "epoch": 7.13720904332331, "grad_norm": 0.15233850479125977, "learning_rate": 1.1478591212839943e-05, "loss": 0.4624, "num_input_tokens_seen": 77743504, "step": 64085 }, { "epoch": 7.137765898206927, "grad_norm": 0.1345963478088379, "learning_rate": 1.1474504090602492e-05, "loss": 0.4647, "num_input_tokens_seen": 77749808, "step": 64090 }, { "epoch": 7.138322753090544, "grad_norm": 0.12141858041286469, "learning_rate": 1.1470417479401613e-05, "loss": 0.4525, "num_input_tokens_seen": 77755696, "step": 64095 }, { "epoch": 7.138879607974162, "grad_norm": 0.14470410346984863, "learning_rate": 1.146633137939172e-05, "loss": 0.4624, "num_input_tokens_seen": 77761776, "step": 64100 }, { "epoch": 7.1394364628577796, "grad_norm": 0.10924552381038666, "learning_rate": 1.1462245790727204e-05, "loss": 0.4408, "num_input_tokens_seen": 77767984, "step": 64105 }, { "epoch": 7.139993317741396, "grad_norm": 0.13677147030830383, "learning_rate": 1.1458160713562422e-05, "loss": 0.4555, "num_input_tokens_seen": 77774384, "step": 64110 }, { "epoch": 7.140550172625014, "grad_norm": 0.19791775941848755, "learning_rate": 1.1454076148051729e-05, "loss": 0.4474, "num_input_tokens_seen": 77779984, "step": 64115 }, { "epoch": 7.141107027508632, "grad_norm": 0.09726343303918839, "learning_rate": 1.1449992094349443e-05, "loss": 0.4582, "num_input_tokens_seen": 77785904, "step": 64120 }, { "epoch": 7.1416638823922485, "grad_norm": 0.12707138061523438, "learning_rate": 1.144590855260988e-05, "loss": 0.4543, "num_input_tokens_seen": 77791952, "step": 64125 }, { "epoch": 7.142220737275866, "grad_norm": 0.10039885342121124, "learning_rate": 1.1441825522987335e-05, "loss": 0.4635, "num_input_tokens_seen": 77798352, "step": 64130 }, { "epoch": 7.142777592159483, "grad_norm": 0.09286660701036453, "learning_rate": 1.1437743005636062e-05, "loss": 0.4629, "num_input_tokens_seen": 77804624, "step": 64135 }, { "epoch": 7.143334447043101, "grad_norm": 0.08674603700637817, "learning_rate": 1.143366100071033e-05, "loss": 0.4634, "num_input_tokens_seen": 77810832, "step": 64140 }, { "epoch": 7.143891301926718, "grad_norm": 0.11641811579465866, "learning_rate": 1.1429579508364352e-05, "loss": 0.4608, "num_input_tokens_seen": 77816752, "step": 64145 }, { "epoch": 7.144448156810335, "grad_norm": 0.1811373233795166, "learning_rate": 1.1425498528752357e-05, "loss": 0.4578, "num_input_tokens_seen": 77822960, "step": 64150 }, { "epoch": 7.145005011693953, "grad_norm": 0.12315455824136734, "learning_rate": 1.1421418062028522e-05, "loss": 0.4574, "num_input_tokens_seen": 77828976, "step": 64155 }, { "epoch": 7.1455618665775695, "grad_norm": 0.16623090207576752, "learning_rate": 1.1417338108347026e-05, "loss": 0.4759, "num_input_tokens_seen": 77835312, "step": 64160 }, { "epoch": 7.146118721461187, "grad_norm": 0.11914018541574478, "learning_rate": 1.1413258667862034e-05, "loss": 0.4547, "num_input_tokens_seen": 77841104, "step": 64165 }, { "epoch": 7.146675576344805, "grad_norm": 0.10539127886295319, "learning_rate": 1.140917974072766e-05, "loss": 0.4676, "num_input_tokens_seen": 77847280, "step": 64170 }, { "epoch": 7.147232431228422, "grad_norm": 0.10810881853103638, "learning_rate": 1.1405101327098039e-05, "loss": 0.4656, "num_input_tokens_seen": 77853104, "step": 64175 }, { "epoch": 7.147789286112039, "grad_norm": 0.10569406300783157, "learning_rate": 1.1401023427127247e-05, "loss": 0.4644, "num_input_tokens_seen": 77859248, "step": 64180 }, { "epoch": 7.148346140995656, "grad_norm": 0.12270904332399368, "learning_rate": 1.1396946040969381e-05, "loss": 0.4618, "num_input_tokens_seen": 77865296, "step": 64185 }, { "epoch": 7.148902995879274, "grad_norm": 0.144008070230484, "learning_rate": 1.1392869168778478e-05, "loss": 0.468, "num_input_tokens_seen": 77871280, "step": 64190 }, { "epoch": 7.1494598507628915, "grad_norm": 0.1515214890241623, "learning_rate": 1.1388792810708581e-05, "loss": 0.4568, "num_input_tokens_seen": 77876880, "step": 64195 }, { "epoch": 7.150016705646508, "grad_norm": 0.104927659034729, "learning_rate": 1.1384716966913722e-05, "loss": 0.4657, "num_input_tokens_seen": 77883120, "step": 64200 }, { "epoch": 7.150573560530126, "grad_norm": 0.09792117774486542, "learning_rate": 1.1380641637547879e-05, "loss": 0.4486, "num_input_tokens_seen": 77889200, "step": 64205 }, { "epoch": 7.151130415413743, "grad_norm": 0.09780502319335938, "learning_rate": 1.1376566822765048e-05, "loss": 0.4579, "num_input_tokens_seen": 77895536, "step": 64210 }, { "epoch": 7.15168727029736, "grad_norm": 0.10066898912191391, "learning_rate": 1.1372492522719171e-05, "loss": 0.4497, "num_input_tokens_seen": 77900880, "step": 64215 }, { "epoch": 7.152244125180978, "grad_norm": 0.0927584245800972, "learning_rate": 1.1368418737564212e-05, "loss": 0.4521, "num_input_tokens_seen": 77906768, "step": 64220 }, { "epoch": 7.152800980064595, "grad_norm": 0.08772584050893784, "learning_rate": 1.1364345467454065e-05, "loss": 0.453, "num_input_tokens_seen": 77912688, "step": 64225 }, { "epoch": 7.1533578349482125, "grad_norm": 0.10312933474779129, "learning_rate": 1.1360272712542644e-05, "loss": 0.4563, "num_input_tokens_seen": 77918896, "step": 64230 }, { "epoch": 7.15391468983183, "grad_norm": 0.08923716843128204, "learning_rate": 1.1356200472983841e-05, "loss": 0.4631, "num_input_tokens_seen": 77925296, "step": 64235 }, { "epoch": 7.154471544715447, "grad_norm": 0.10197249799966812, "learning_rate": 1.1352128748931498e-05, "loss": 0.4565, "num_input_tokens_seen": 77931248, "step": 64240 }, { "epoch": 7.155028399599065, "grad_norm": 0.08524207770824432, "learning_rate": 1.1348057540539476e-05, "loss": 0.4662, "num_input_tokens_seen": 77937136, "step": 64245 }, { "epoch": 7.1555852544826815, "grad_norm": 0.14387184381484985, "learning_rate": 1.134398684796158e-05, "loss": 0.4397, "num_input_tokens_seen": 77943600, "step": 64250 }, { "epoch": 7.156142109366299, "grad_norm": 0.15309442579746246, "learning_rate": 1.1339916671351624e-05, "loss": 0.4637, "num_input_tokens_seen": 77949744, "step": 64255 }, { "epoch": 7.156698964249917, "grad_norm": 0.10798250883817673, "learning_rate": 1.1335847010863404e-05, "loss": 0.4582, "num_input_tokens_seen": 77956080, "step": 64260 }, { "epoch": 7.157255819133534, "grad_norm": 0.11552193760871887, "learning_rate": 1.1331777866650662e-05, "loss": 0.4628, "num_input_tokens_seen": 77962576, "step": 64265 }, { "epoch": 7.157812674017151, "grad_norm": 0.1599973887205124, "learning_rate": 1.132770923886717e-05, "loss": 0.4594, "num_input_tokens_seen": 77968400, "step": 64270 }, { "epoch": 7.158369528900768, "grad_norm": 0.10348503291606903, "learning_rate": 1.1323641127666624e-05, "loss": 0.4583, "num_input_tokens_seen": 77974736, "step": 64275 }, { "epoch": 7.158926383784386, "grad_norm": 0.10452856123447418, "learning_rate": 1.1319573533202743e-05, "loss": 0.4699, "num_input_tokens_seen": 77980848, "step": 64280 }, { "epoch": 7.159483238668003, "grad_norm": 0.11111751198768616, "learning_rate": 1.1315506455629224e-05, "loss": 0.4682, "num_input_tokens_seen": 77986768, "step": 64285 }, { "epoch": 7.16004009355162, "grad_norm": 0.0904732495546341, "learning_rate": 1.1311439895099715e-05, "loss": 0.4528, "num_input_tokens_seen": 77992880, "step": 64290 }, { "epoch": 7.160596948435238, "grad_norm": 0.16675406694412231, "learning_rate": 1.1307373851767882e-05, "loss": 0.4651, "num_input_tokens_seen": 77999152, "step": 64295 }, { "epoch": 7.1611538033188555, "grad_norm": 0.11884023994207382, "learning_rate": 1.1303308325787338e-05, "loss": 0.462, "num_input_tokens_seen": 78005392, "step": 64300 }, { "epoch": 7.161710658202472, "grad_norm": 0.1502072662115097, "learning_rate": 1.1299243317311698e-05, "loss": 0.4714, "num_input_tokens_seen": 78011344, "step": 64305 }, { "epoch": 7.16226751308609, "grad_norm": 0.0898669958114624, "learning_rate": 1.129517882649456e-05, "loss": 0.4687, "num_input_tokens_seen": 78017328, "step": 64310 }, { "epoch": 7.162824367969707, "grad_norm": 0.09885627031326294, "learning_rate": 1.1291114853489479e-05, "loss": 0.4614, "num_input_tokens_seen": 78022832, "step": 64315 }, { "epoch": 7.1633812228533245, "grad_norm": 0.13941970467567444, "learning_rate": 1.1287051398450016e-05, "loss": 0.4559, "num_input_tokens_seen": 78028400, "step": 64320 }, { "epoch": 7.163938077736942, "grad_norm": 0.0755162164568901, "learning_rate": 1.128298846152969e-05, "loss": 0.4749, "num_input_tokens_seen": 78033904, "step": 64325 }, { "epoch": 7.164494932620559, "grad_norm": 0.07531701773405075, "learning_rate": 1.1278926042882026e-05, "loss": 0.4604, "num_input_tokens_seen": 78039408, "step": 64330 }, { "epoch": 7.165051787504177, "grad_norm": 0.11001746356487274, "learning_rate": 1.1274864142660502e-05, "loss": 0.4791, "num_input_tokens_seen": 78045776, "step": 64335 }, { "epoch": 7.165608642387793, "grad_norm": 0.09715519100427628, "learning_rate": 1.1270802761018592e-05, "loss": 0.4596, "num_input_tokens_seen": 78051920, "step": 64340 }, { "epoch": 7.166165497271411, "grad_norm": 0.11517440527677536, "learning_rate": 1.126674189810976e-05, "loss": 0.4554, "num_input_tokens_seen": 78057776, "step": 64345 }, { "epoch": 7.166722352155029, "grad_norm": 0.17543677985668182, "learning_rate": 1.126268155408742e-05, "loss": 0.4607, "num_input_tokens_seen": 78063952, "step": 64350 }, { "epoch": 7.1672792070386455, "grad_norm": 0.07386674731969833, "learning_rate": 1.1258621729105004e-05, "loss": 0.4532, "num_input_tokens_seen": 78070000, "step": 64355 }, { "epoch": 7.167836061922263, "grad_norm": 0.13468506932258606, "learning_rate": 1.1254562423315888e-05, "loss": 0.4516, "num_input_tokens_seen": 78076112, "step": 64360 }, { "epoch": 7.16839291680588, "grad_norm": 0.10442130267620087, "learning_rate": 1.1250503636873461e-05, "loss": 0.4543, "num_input_tokens_seen": 78081744, "step": 64365 }, { "epoch": 7.168949771689498, "grad_norm": 0.10779990255832672, "learning_rate": 1.1246445369931058e-05, "loss": 0.4729, "num_input_tokens_seen": 78087856, "step": 64370 }, { "epoch": 7.169506626573115, "grad_norm": 0.09838548302650452, "learning_rate": 1.1242387622642026e-05, "loss": 0.4604, "num_input_tokens_seen": 78093648, "step": 64375 }, { "epoch": 7.170063481456732, "grad_norm": 0.09256431460380554, "learning_rate": 1.1238330395159688e-05, "loss": 0.4608, "num_input_tokens_seen": 78099856, "step": 64380 }, { "epoch": 7.17062033634035, "grad_norm": 0.09641946107149124, "learning_rate": 1.123427368763732e-05, "loss": 0.4638, "num_input_tokens_seen": 78106192, "step": 64385 }, { "epoch": 7.1711771912239675, "grad_norm": 0.11613614112138748, "learning_rate": 1.1230217500228214e-05, "loss": 0.4526, "num_input_tokens_seen": 78112560, "step": 64390 }, { "epoch": 7.171734046107584, "grad_norm": 0.1317700892686844, "learning_rate": 1.122616183308561e-05, "loss": 0.4503, "num_input_tokens_seen": 78118416, "step": 64395 }, { "epoch": 7.172290900991202, "grad_norm": 0.16229957342147827, "learning_rate": 1.1222106686362752e-05, "loss": 0.4492, "num_input_tokens_seen": 78124560, "step": 64400 }, { "epoch": 7.172847755874819, "grad_norm": 0.12377270311117172, "learning_rate": 1.1218052060212866e-05, "loss": 0.4656, "num_input_tokens_seen": 78130640, "step": 64405 }, { "epoch": 7.173404610758436, "grad_norm": 0.1283283233642578, "learning_rate": 1.1213997954789133e-05, "loss": 0.4512, "num_input_tokens_seen": 78137136, "step": 64410 }, { "epoch": 7.173961465642054, "grad_norm": 0.10546530038118362, "learning_rate": 1.1209944370244743e-05, "loss": 0.4639, "num_input_tokens_seen": 78143312, "step": 64415 }, { "epoch": 7.174518320525671, "grad_norm": 0.12806475162506104, "learning_rate": 1.1205891306732839e-05, "loss": 0.4587, "num_input_tokens_seen": 78149392, "step": 64420 }, { "epoch": 7.1750751754092885, "grad_norm": 0.09837199747562408, "learning_rate": 1.1201838764406575e-05, "loss": 0.4639, "num_input_tokens_seen": 78155472, "step": 64425 }, { "epoch": 7.175632030292905, "grad_norm": 0.08524023741483688, "learning_rate": 1.119778674341905e-05, "loss": 0.4665, "num_input_tokens_seen": 78161520, "step": 64430 }, { "epoch": 7.176188885176523, "grad_norm": 0.11611361056566238, "learning_rate": 1.1193735243923376e-05, "loss": 0.4611, "num_input_tokens_seen": 78167760, "step": 64435 }, { "epoch": 7.176745740060141, "grad_norm": 0.12524694204330444, "learning_rate": 1.1189684266072639e-05, "loss": 0.4386, "num_input_tokens_seen": 78173872, "step": 64440 }, { "epoch": 7.177302594943757, "grad_norm": 0.10126225650310516, "learning_rate": 1.1185633810019878e-05, "loss": 0.4537, "num_input_tokens_seen": 78180240, "step": 64445 }, { "epoch": 7.177859449827375, "grad_norm": 0.09416556358337402, "learning_rate": 1.1181583875918148e-05, "loss": 0.4799, "num_input_tokens_seen": 78186224, "step": 64450 }, { "epoch": 7.178416304710992, "grad_norm": 0.11917994171380997, "learning_rate": 1.1177534463920456e-05, "loss": 0.4584, "num_input_tokens_seen": 78192144, "step": 64455 }, { "epoch": 7.17897315959461, "grad_norm": 0.113688163459301, "learning_rate": 1.1173485574179814e-05, "loss": 0.4735, "num_input_tokens_seen": 78197840, "step": 64460 }, { "epoch": 7.179530014478227, "grad_norm": 0.11756543815135956, "learning_rate": 1.1169437206849193e-05, "loss": 0.4616, "num_input_tokens_seen": 78204112, "step": 64465 }, { "epoch": 7.180086869361844, "grad_norm": 0.07282594591379166, "learning_rate": 1.1165389362081552e-05, "loss": 0.475, "num_input_tokens_seen": 78210160, "step": 64470 }, { "epoch": 7.180643724245462, "grad_norm": 0.07827618718147278, "learning_rate": 1.1161342040029848e-05, "loss": 0.4502, "num_input_tokens_seen": 78216080, "step": 64475 }, { "epoch": 7.181200579129079, "grad_norm": 0.16128364205360413, "learning_rate": 1.115729524084698e-05, "loss": 0.4718, "num_input_tokens_seen": 78222000, "step": 64480 }, { "epoch": 7.181757434012696, "grad_norm": 0.10880149900913239, "learning_rate": 1.1153248964685868e-05, "loss": 0.453, "num_input_tokens_seen": 78228080, "step": 64485 }, { "epoch": 7.182314288896314, "grad_norm": 0.11245792359113693, "learning_rate": 1.1149203211699371e-05, "loss": 0.4753, "num_input_tokens_seen": 78233744, "step": 64490 }, { "epoch": 7.182871143779931, "grad_norm": 0.11492852866649628, "learning_rate": 1.1145157982040377e-05, "loss": 0.4573, "num_input_tokens_seen": 78239792, "step": 64495 }, { "epoch": 7.183427998663548, "grad_norm": 0.09719771146774292, "learning_rate": 1.1141113275861705e-05, "loss": 0.4657, "num_input_tokens_seen": 78246192, "step": 64500 }, { "epoch": 7.183984853547166, "grad_norm": 0.10070644319057465, "learning_rate": 1.1137069093316187e-05, "loss": 0.4581, "num_input_tokens_seen": 78252208, "step": 64505 }, { "epoch": 7.184541708430783, "grad_norm": 0.09376481920480728, "learning_rate": 1.1133025434556632e-05, "loss": 0.458, "num_input_tokens_seen": 78258512, "step": 64510 }, { "epoch": 7.1850985633144, "grad_norm": 0.15679439902305603, "learning_rate": 1.1128982299735807e-05, "loss": 0.4585, "num_input_tokens_seen": 78264464, "step": 64515 }, { "epoch": 7.185655418198017, "grad_norm": 0.108845554292202, "learning_rate": 1.1124939689006492e-05, "loss": 0.4679, "num_input_tokens_seen": 78270448, "step": 64520 }, { "epoch": 7.186212273081635, "grad_norm": 0.09172799438238144, "learning_rate": 1.112089760252141e-05, "loss": 0.4687, "num_input_tokens_seen": 78276560, "step": 64525 }, { "epoch": 7.186769127965253, "grad_norm": 0.09308533370494843, "learning_rate": 1.1116856040433295e-05, "loss": 0.4676, "num_input_tokens_seen": 78282800, "step": 64530 }, { "epoch": 7.187325982848869, "grad_norm": 0.10882014036178589, "learning_rate": 1.1112815002894858e-05, "loss": 0.4716, "num_input_tokens_seen": 78289168, "step": 64535 }, { "epoch": 7.187882837732487, "grad_norm": 0.12915723025798798, "learning_rate": 1.1108774490058765e-05, "loss": 0.4677, "num_input_tokens_seen": 78295024, "step": 64540 }, { "epoch": 7.188439692616104, "grad_norm": 0.09524765610694885, "learning_rate": 1.1104734502077699e-05, "loss": 0.4654, "num_input_tokens_seen": 78301200, "step": 64545 }, { "epoch": 7.1889965474997215, "grad_norm": 0.09570565819740295, "learning_rate": 1.1100695039104284e-05, "loss": 0.4615, "num_input_tokens_seen": 78307344, "step": 64550 }, { "epoch": 7.189553402383339, "grad_norm": 0.10190506279468536, "learning_rate": 1.109665610129116e-05, "loss": 0.4548, "num_input_tokens_seen": 78313296, "step": 64555 }, { "epoch": 7.190110257266956, "grad_norm": 0.10227350145578384, "learning_rate": 1.1092617688790915e-05, "loss": 0.4707, "num_input_tokens_seen": 78319344, "step": 64560 }, { "epoch": 7.190667112150574, "grad_norm": 0.07507770508527756, "learning_rate": 1.1088579801756145e-05, "loss": 0.4586, "num_input_tokens_seen": 78325616, "step": 64565 }, { "epoch": 7.191223967034191, "grad_norm": 0.0985790491104126, "learning_rate": 1.1084542440339419e-05, "loss": 0.4547, "num_input_tokens_seen": 78331920, "step": 64570 }, { "epoch": 7.191780821917808, "grad_norm": 0.10025206208229065, "learning_rate": 1.1080505604693262e-05, "loss": 0.4695, "num_input_tokens_seen": 78338160, "step": 64575 }, { "epoch": 7.192337676801426, "grad_norm": 0.12346398830413818, "learning_rate": 1.1076469294970222e-05, "loss": 0.4515, "num_input_tokens_seen": 78344112, "step": 64580 }, { "epoch": 7.192894531685043, "grad_norm": 0.08256103098392487, "learning_rate": 1.1072433511322783e-05, "loss": 0.4622, "num_input_tokens_seen": 78350064, "step": 64585 }, { "epoch": 7.19345138656866, "grad_norm": 0.12819872796535492, "learning_rate": 1.1068398253903447e-05, "loss": 0.4469, "num_input_tokens_seen": 78356144, "step": 64590 }, { "epoch": 7.194008241452278, "grad_norm": 0.10831106454133987, "learning_rate": 1.1064363522864662e-05, "loss": 0.463, "num_input_tokens_seen": 78361840, "step": 64595 }, { "epoch": 7.194565096335895, "grad_norm": 0.09017132967710495, "learning_rate": 1.1060329318358884e-05, "loss": 0.4631, "num_input_tokens_seen": 78368336, "step": 64600 }, { "epoch": 7.195121951219512, "grad_norm": 0.13245545327663422, "learning_rate": 1.1056295640538545e-05, "loss": 0.4571, "num_input_tokens_seen": 78374576, "step": 64605 }, { "epoch": 7.195678806103129, "grad_norm": 0.09270878881216049, "learning_rate": 1.1052262489556031e-05, "loss": 0.4633, "num_input_tokens_seen": 78380144, "step": 64610 }, { "epoch": 7.196235660986747, "grad_norm": 0.13538041710853577, "learning_rate": 1.1048229865563748e-05, "loss": 0.4651, "num_input_tokens_seen": 78386416, "step": 64615 }, { "epoch": 7.1967925158703645, "grad_norm": 0.09988979250192642, "learning_rate": 1.104419776871404e-05, "loss": 0.4569, "num_input_tokens_seen": 78392496, "step": 64620 }, { "epoch": 7.197349370753981, "grad_norm": 0.09533794224262238, "learning_rate": 1.1040166199159266e-05, "loss": 0.4704, "num_input_tokens_seen": 78398416, "step": 64625 }, { "epoch": 7.197906225637599, "grad_norm": 0.15604451298713684, "learning_rate": 1.1036135157051758e-05, "loss": 0.4547, "num_input_tokens_seen": 78404752, "step": 64630 }, { "epoch": 7.198463080521216, "grad_norm": 0.13236118853092194, "learning_rate": 1.1032104642543806e-05, "loss": 0.4471, "num_input_tokens_seen": 78411120, "step": 64635 }, { "epoch": 7.199019935404833, "grad_norm": 0.0833975076675415, "learning_rate": 1.102807465578771e-05, "loss": 0.4668, "num_input_tokens_seen": 78417456, "step": 64640 }, { "epoch": 7.199576790288451, "grad_norm": 0.1384918987751007, "learning_rate": 1.102404519693572e-05, "loss": 0.4587, "num_input_tokens_seen": 78423792, "step": 64645 }, { "epoch": 7.200133645172068, "grad_norm": 0.09669453650712967, "learning_rate": 1.1020016266140098e-05, "loss": 0.4638, "num_input_tokens_seen": 78429776, "step": 64650 }, { "epoch": 7.200690500055686, "grad_norm": 0.11411715298891068, "learning_rate": 1.1015987863553056e-05, "loss": 0.4478, "num_input_tokens_seen": 78435952, "step": 64655 }, { "epoch": 7.201247354939303, "grad_norm": 0.12132947891950607, "learning_rate": 1.1011959989326807e-05, "loss": 0.4634, "num_input_tokens_seen": 78442096, "step": 64660 }, { "epoch": 7.20180420982292, "grad_norm": 0.09230955690145493, "learning_rate": 1.1007932643613542e-05, "loss": 0.4459, "num_input_tokens_seen": 78448176, "step": 64665 }, { "epoch": 7.202361064706538, "grad_norm": 0.10887228697538376, "learning_rate": 1.1003905826565414e-05, "loss": 0.4633, "num_input_tokens_seen": 78454160, "step": 64670 }, { "epoch": 7.2029179195901545, "grad_norm": 0.12814466655254364, "learning_rate": 1.0999879538334587e-05, "loss": 0.4819, "num_input_tokens_seen": 78460304, "step": 64675 }, { "epoch": 7.203474774473772, "grad_norm": 0.1085796132683754, "learning_rate": 1.0995853779073175e-05, "loss": 0.4487, "num_input_tokens_seen": 78466256, "step": 64680 }, { "epoch": 7.20403162935739, "grad_norm": 0.12306013703346252, "learning_rate": 1.0991828548933278e-05, "loss": 0.4821, "num_input_tokens_seen": 78472592, "step": 64685 }, { "epoch": 7.204588484241007, "grad_norm": 0.0872553363442421, "learning_rate": 1.0987803848067e-05, "loss": 0.4587, "num_input_tokens_seen": 78478032, "step": 64690 }, { "epoch": 7.205145339124624, "grad_norm": 0.11892495304346085, "learning_rate": 1.0983779676626385e-05, "loss": 0.4517, "num_input_tokens_seen": 78484016, "step": 64695 }, { "epoch": 7.205702194008241, "grad_norm": 0.10399017482995987, "learning_rate": 1.09797560347635e-05, "loss": 0.4635, "num_input_tokens_seen": 78490128, "step": 64700 }, { "epoch": 7.206259048891859, "grad_norm": 0.09598159044981003, "learning_rate": 1.0975732922630355e-05, "loss": 0.4679, "num_input_tokens_seen": 78496176, "step": 64705 }, { "epoch": 7.206815903775476, "grad_norm": 0.12829947471618652, "learning_rate": 1.0971710340378963e-05, "loss": 0.4533, "num_input_tokens_seen": 78502448, "step": 64710 }, { "epoch": 7.207372758659093, "grad_norm": 0.14921934902668, "learning_rate": 1.0967688288161317e-05, "loss": 0.4675, "num_input_tokens_seen": 78508592, "step": 64715 }, { "epoch": 7.207929613542711, "grad_norm": 0.14247263967990875, "learning_rate": 1.0963666766129368e-05, "loss": 0.4624, "num_input_tokens_seen": 78514544, "step": 64720 }, { "epoch": 7.208486468426328, "grad_norm": 0.16052857041358948, "learning_rate": 1.0959645774435082e-05, "loss": 0.4627, "num_input_tokens_seen": 78520816, "step": 64725 }, { "epoch": 7.209043323309945, "grad_norm": 0.08594010025262833, "learning_rate": 1.095562531323036e-05, "loss": 0.4709, "num_input_tokens_seen": 78527024, "step": 64730 }, { "epoch": 7.209600178193563, "grad_norm": 0.10034802556037903, "learning_rate": 1.0951605382667135e-05, "loss": 0.4683, "num_input_tokens_seen": 78533168, "step": 64735 }, { "epoch": 7.21015703307718, "grad_norm": 0.08060090243816376, "learning_rate": 1.0947585982897265e-05, "loss": 0.4658, "num_input_tokens_seen": 78539216, "step": 64740 }, { "epoch": 7.2107138879607975, "grad_norm": 0.12724992632865906, "learning_rate": 1.0943567114072634e-05, "loss": 0.4637, "num_input_tokens_seen": 78545264, "step": 64745 }, { "epoch": 7.211270742844415, "grad_norm": 0.13188941776752472, "learning_rate": 1.093954877634509e-05, "loss": 0.4646, "num_input_tokens_seen": 78551024, "step": 64750 }, { "epoch": 7.211827597728032, "grad_norm": 0.10442250967025757, "learning_rate": 1.0935530969866442e-05, "loss": 0.464, "num_input_tokens_seen": 78557200, "step": 64755 }, { "epoch": 7.21238445261165, "grad_norm": 0.11761865019798279, "learning_rate": 1.0931513694788517e-05, "loss": 0.4624, "num_input_tokens_seen": 78562992, "step": 64760 }, { "epoch": 7.212941307495266, "grad_norm": 0.12496979534626007, "learning_rate": 1.092749695126308e-05, "loss": 0.4692, "num_input_tokens_seen": 78569104, "step": 64765 }, { "epoch": 7.213498162378884, "grad_norm": 0.11204222589731216, "learning_rate": 1.0923480739441916e-05, "loss": 0.4685, "num_input_tokens_seen": 78575408, "step": 64770 }, { "epoch": 7.214055017262502, "grad_norm": 0.09831291437149048, "learning_rate": 1.0919465059476746e-05, "loss": 0.46, "num_input_tokens_seen": 78581584, "step": 64775 }, { "epoch": 7.2146118721461185, "grad_norm": 0.10649057477712631, "learning_rate": 1.0915449911519315e-05, "loss": 0.4605, "num_input_tokens_seen": 78587792, "step": 64780 }, { "epoch": 7.215168727029736, "grad_norm": 0.0997457355260849, "learning_rate": 1.0911435295721329e-05, "loss": 0.4646, "num_input_tokens_seen": 78593744, "step": 64785 }, { "epoch": 7.215725581913353, "grad_norm": 0.0756988599896431, "learning_rate": 1.0907421212234458e-05, "loss": 0.4507, "num_input_tokens_seen": 78599408, "step": 64790 }, { "epoch": 7.216282436796971, "grad_norm": 0.16846202313899994, "learning_rate": 1.0903407661210385e-05, "loss": 0.4733, "num_input_tokens_seen": 78605296, "step": 64795 }, { "epoch": 7.216839291680588, "grad_norm": 0.12093443423509598, "learning_rate": 1.0899394642800733e-05, "loss": 0.4571, "num_input_tokens_seen": 78611408, "step": 64800 }, { "epoch": 7.217396146564205, "grad_norm": 0.0856417566537857, "learning_rate": 1.0895382157157144e-05, "loss": 0.444, "num_input_tokens_seen": 78617648, "step": 64805 }, { "epoch": 7.217953001447823, "grad_norm": 0.15580826997756958, "learning_rate": 1.0891370204431222e-05, "loss": 0.4554, "num_input_tokens_seen": 78623760, "step": 64810 }, { "epoch": 7.21850985633144, "grad_norm": 0.09138116985559464, "learning_rate": 1.0887358784774544e-05, "loss": 0.4581, "num_input_tokens_seen": 78629712, "step": 64815 }, { "epoch": 7.219066711215057, "grad_norm": 0.13426660001277924, "learning_rate": 1.088334789833868e-05, "loss": 0.4659, "num_input_tokens_seen": 78635920, "step": 64820 }, { "epoch": 7.219623566098675, "grad_norm": 0.11682701110839844, "learning_rate": 1.0879337545275165e-05, "loss": 0.4622, "num_input_tokens_seen": 78642128, "step": 64825 }, { "epoch": 7.220180420982292, "grad_norm": 0.08572832494974136, "learning_rate": 1.0875327725735537e-05, "loss": 0.4666, "num_input_tokens_seen": 78648176, "step": 64830 }, { "epoch": 7.220737275865909, "grad_norm": 0.09724536538124084, "learning_rate": 1.0871318439871287e-05, "loss": 0.448, "num_input_tokens_seen": 78653520, "step": 64835 }, { "epoch": 7.221294130749527, "grad_norm": 0.09216025471687317, "learning_rate": 1.08673096878339e-05, "loss": 0.4667, "num_input_tokens_seen": 78659504, "step": 64840 }, { "epoch": 7.221850985633144, "grad_norm": 0.09330861270427704, "learning_rate": 1.0863301469774856e-05, "loss": 0.4646, "num_input_tokens_seen": 78665392, "step": 64845 }, { "epoch": 7.2224078405167615, "grad_norm": 0.1208537295460701, "learning_rate": 1.0859293785845573e-05, "loss": 0.4556, "num_input_tokens_seen": 78671312, "step": 64850 }, { "epoch": 7.222964695400378, "grad_norm": 0.10544097423553467, "learning_rate": 1.0855286636197498e-05, "loss": 0.4708, "num_input_tokens_seen": 78677744, "step": 64855 }, { "epoch": 7.223521550283996, "grad_norm": 0.12255686521530151, "learning_rate": 1.0851280020982013e-05, "loss": 0.4648, "num_input_tokens_seen": 78683600, "step": 64860 }, { "epoch": 7.224078405167614, "grad_norm": 0.11123087257146835, "learning_rate": 1.0847273940350521e-05, "loss": 0.4601, "num_input_tokens_seen": 78689776, "step": 64865 }, { "epoch": 7.2246352600512305, "grad_norm": 0.11563454568386078, "learning_rate": 1.0843268394454364e-05, "loss": 0.4647, "num_input_tokens_seen": 78695824, "step": 64870 }, { "epoch": 7.225192114934848, "grad_norm": 0.10292961448431015, "learning_rate": 1.0839263383444895e-05, "loss": 0.4708, "num_input_tokens_seen": 78702000, "step": 64875 }, { "epoch": 7.225748969818465, "grad_norm": 0.08796492218971252, "learning_rate": 1.0835258907473445e-05, "loss": 0.4676, "num_input_tokens_seen": 78708144, "step": 64880 }, { "epoch": 7.226305824702083, "grad_norm": 0.0851416289806366, "learning_rate": 1.0831254966691296e-05, "loss": 0.4625, "num_input_tokens_seen": 78714800, "step": 64885 }, { "epoch": 7.2268626795857, "grad_norm": 0.12247627973556519, "learning_rate": 1.0827251561249751e-05, "loss": 0.4783, "num_input_tokens_seen": 78720240, "step": 64890 }, { "epoch": 7.227419534469317, "grad_norm": 0.12731413543224335, "learning_rate": 1.0823248691300051e-05, "loss": 0.4618, "num_input_tokens_seen": 78726384, "step": 64895 }, { "epoch": 7.227976389352935, "grad_norm": 0.1055595800280571, "learning_rate": 1.0819246356993446e-05, "loss": 0.4627, "num_input_tokens_seen": 78732560, "step": 64900 }, { "epoch": 7.2285332442365515, "grad_norm": 0.1187712550163269, "learning_rate": 1.0815244558481168e-05, "loss": 0.4537, "num_input_tokens_seen": 78738672, "step": 64905 }, { "epoch": 7.229090099120169, "grad_norm": 0.18489067256450653, "learning_rate": 1.08112432959144e-05, "loss": 0.4614, "num_input_tokens_seen": 78744720, "step": 64910 }, { "epoch": 7.229646954003787, "grad_norm": 0.1457614153623581, "learning_rate": 1.080724256944434e-05, "loss": 0.4601, "num_input_tokens_seen": 78750864, "step": 64915 }, { "epoch": 7.230203808887404, "grad_norm": 0.14677266776561737, "learning_rate": 1.0803242379222127e-05, "loss": 0.4636, "num_input_tokens_seen": 78757168, "step": 64920 }, { "epoch": 7.230760663771021, "grad_norm": 0.1298529952764511, "learning_rate": 1.0799242725398923e-05, "loss": 0.4553, "num_input_tokens_seen": 78763120, "step": 64925 }, { "epoch": 7.231317518654639, "grad_norm": 0.09962522983551025, "learning_rate": 1.079524360812583e-05, "loss": 0.4571, "num_input_tokens_seen": 78769552, "step": 64930 }, { "epoch": 7.231874373538256, "grad_norm": 0.0892462208867073, "learning_rate": 1.079124502755395e-05, "loss": 0.4724, "num_input_tokens_seen": 78775536, "step": 64935 }, { "epoch": 7.2324312284218735, "grad_norm": 0.12240471690893173, "learning_rate": 1.0787246983834382e-05, "loss": 0.4555, "num_input_tokens_seen": 78781424, "step": 64940 }, { "epoch": 7.23298808330549, "grad_norm": 0.07713072001934052, "learning_rate": 1.0783249477118156e-05, "loss": 0.451, "num_input_tokens_seen": 78787632, "step": 64945 }, { "epoch": 7.233544938189108, "grad_norm": 0.11846698820590973, "learning_rate": 1.0779252507556337e-05, "loss": 0.4658, "num_input_tokens_seen": 78793584, "step": 64950 }, { "epoch": 7.234101793072726, "grad_norm": 0.09939346462488174, "learning_rate": 1.0775256075299919e-05, "loss": 0.4678, "num_input_tokens_seen": 78799344, "step": 64955 }, { "epoch": 7.234658647956342, "grad_norm": 0.16798032820224762, "learning_rate": 1.0771260180499923e-05, "loss": 0.4491, "num_input_tokens_seen": 78805552, "step": 64960 }, { "epoch": 7.23521550283996, "grad_norm": 0.13969916105270386, "learning_rate": 1.0767264823307308e-05, "loss": 0.4513, "num_input_tokens_seen": 78811376, "step": 64965 }, { "epoch": 7.235772357723577, "grad_norm": 0.12063000351190567, "learning_rate": 1.0763270003873035e-05, "loss": 0.4672, "num_input_tokens_seen": 78817680, "step": 64970 }, { "epoch": 7.2363292126071945, "grad_norm": 0.10284639894962311, "learning_rate": 1.0759275722348058e-05, "loss": 0.4575, "num_input_tokens_seen": 78823920, "step": 64975 }, { "epoch": 7.236886067490812, "grad_norm": 0.07898364216089249, "learning_rate": 1.075528197888327e-05, "loss": 0.4694, "num_input_tokens_seen": 78830032, "step": 64980 }, { "epoch": 7.237442922374429, "grad_norm": 0.14905031025409698, "learning_rate": 1.0751288773629586e-05, "loss": 0.4732, "num_input_tokens_seen": 78835728, "step": 64985 }, { "epoch": 7.237999777258047, "grad_norm": 0.11319408565759659, "learning_rate": 1.0747296106737866e-05, "loss": 0.4588, "num_input_tokens_seen": 78842000, "step": 64990 }, { "epoch": 7.238556632141664, "grad_norm": 0.0751957967877388, "learning_rate": 1.0743303978358984e-05, "loss": 0.4554, "num_input_tokens_seen": 78848208, "step": 64995 }, { "epoch": 7.239113487025281, "grad_norm": 0.05092629790306091, "learning_rate": 1.0739312388643758e-05, "loss": 0.4741, "num_input_tokens_seen": 78854128, "step": 65000 }, { "epoch": 7.239670341908899, "grad_norm": 0.09317337721586227, "learning_rate": 1.073532133774301e-05, "loss": 0.4571, "num_input_tokens_seen": 78860208, "step": 65005 }, { "epoch": 7.240227196792516, "grad_norm": 0.09305927157402039, "learning_rate": 1.0731330825807542e-05, "loss": 0.464, "num_input_tokens_seen": 78866096, "step": 65010 }, { "epoch": 7.240784051676133, "grad_norm": 0.08542877435684204, "learning_rate": 1.0727340852988113e-05, "loss": 0.4548, "num_input_tokens_seen": 78871984, "step": 65015 }, { "epoch": 7.241340906559751, "grad_norm": 0.07393995672464371, "learning_rate": 1.0723351419435495e-05, "loss": 0.461, "num_input_tokens_seen": 78877904, "step": 65020 }, { "epoch": 7.241897761443368, "grad_norm": 0.12175744771957397, "learning_rate": 1.0719362525300405e-05, "loss": 0.4587, "num_input_tokens_seen": 78884016, "step": 65025 }, { "epoch": 7.242454616326985, "grad_norm": 0.08673901110887527, "learning_rate": 1.0715374170733561e-05, "loss": 0.4559, "num_input_tokens_seen": 78890160, "step": 65030 }, { "epoch": 7.243011471210602, "grad_norm": 0.12620237469673157, "learning_rate": 1.0711386355885669e-05, "loss": 0.4607, "num_input_tokens_seen": 78896624, "step": 65035 }, { "epoch": 7.24356832609422, "grad_norm": 0.10265617817640305, "learning_rate": 1.070739908090738e-05, "loss": 0.4655, "num_input_tokens_seen": 78902352, "step": 65040 }, { "epoch": 7.2441251809778375, "grad_norm": 0.08173434436321259, "learning_rate": 1.0703412345949365e-05, "loss": 0.4596, "num_input_tokens_seen": 78908528, "step": 65045 }, { "epoch": 7.244682035861454, "grad_norm": 0.13502903282642365, "learning_rate": 1.0699426151162239e-05, "loss": 0.4715, "num_input_tokens_seen": 78914576, "step": 65050 }, { "epoch": 7.245238890745072, "grad_norm": 0.1280854195356369, "learning_rate": 1.0695440496696633e-05, "loss": 0.4612, "num_input_tokens_seen": 78920816, "step": 65055 }, { "epoch": 7.245795745628689, "grad_norm": 0.08982978016138077, "learning_rate": 1.0691455382703114e-05, "loss": 0.467, "num_input_tokens_seen": 78926704, "step": 65060 }, { "epoch": 7.2463526005123065, "grad_norm": 0.12628568708896637, "learning_rate": 1.0687470809332267e-05, "loss": 0.4722, "num_input_tokens_seen": 78932688, "step": 65065 }, { "epoch": 7.246909455395924, "grad_norm": 0.1339157372713089, "learning_rate": 1.0683486776734648e-05, "loss": 0.4569, "num_input_tokens_seen": 78938512, "step": 65070 }, { "epoch": 7.247466310279541, "grad_norm": 0.09633695334196091, "learning_rate": 1.0679503285060768e-05, "loss": 0.4624, "num_input_tokens_seen": 78944560, "step": 65075 }, { "epoch": 7.248023165163159, "grad_norm": 0.10236139595508575, "learning_rate": 1.0675520334461155e-05, "loss": 0.4589, "num_input_tokens_seen": 78950608, "step": 65080 }, { "epoch": 7.248580020046775, "grad_norm": 0.10848701000213623, "learning_rate": 1.067153792508628e-05, "loss": 0.4556, "num_input_tokens_seen": 78956944, "step": 65085 }, { "epoch": 7.249136874930393, "grad_norm": 0.12338407337665558, "learning_rate": 1.0667556057086631e-05, "loss": 0.4581, "num_input_tokens_seen": 78962736, "step": 65090 }, { "epoch": 7.249693729814011, "grad_norm": 0.08643841743469238, "learning_rate": 1.066357473061264e-05, "loss": 0.4707, "num_input_tokens_seen": 78969104, "step": 65095 }, { "epoch": 7.2502505846976275, "grad_norm": 0.08154663443565369, "learning_rate": 1.0659593945814736e-05, "loss": 0.4538, "num_input_tokens_seen": 78975472, "step": 65100 }, { "epoch": 7.250807439581245, "grad_norm": 0.09438242018222809, "learning_rate": 1.0655613702843334e-05, "loss": 0.4662, "num_input_tokens_seen": 78981488, "step": 65105 }, { "epoch": 7.251364294464863, "grad_norm": 0.12841157615184784, "learning_rate": 1.0651634001848806e-05, "loss": 0.4596, "num_input_tokens_seen": 78987376, "step": 65110 }, { "epoch": 7.25192114934848, "grad_norm": 0.11694812029600143, "learning_rate": 1.064765484298153e-05, "loss": 0.4626, "num_input_tokens_seen": 78993904, "step": 65115 }, { "epoch": 7.252478004232097, "grad_norm": 0.11593825370073318, "learning_rate": 1.0643676226391855e-05, "loss": 0.4535, "num_input_tokens_seen": 79000272, "step": 65120 }, { "epoch": 7.253034859115714, "grad_norm": 0.09761466830968857, "learning_rate": 1.063969815223009e-05, "loss": 0.4505, "num_input_tokens_seen": 79006288, "step": 65125 }, { "epoch": 7.253591713999332, "grad_norm": 0.10228589177131653, "learning_rate": 1.0635720620646558e-05, "loss": 0.4506, "num_input_tokens_seen": 79012720, "step": 65130 }, { "epoch": 7.2541485688829495, "grad_norm": 0.0943833440542221, "learning_rate": 1.0631743631791527e-05, "loss": 0.4584, "num_input_tokens_seen": 79018608, "step": 65135 }, { "epoch": 7.254705423766566, "grad_norm": 0.07483402639627457, "learning_rate": 1.0627767185815274e-05, "loss": 0.464, "num_input_tokens_seen": 79024720, "step": 65140 }, { "epoch": 7.255262278650184, "grad_norm": 0.08367397636175156, "learning_rate": 1.0623791282868025e-05, "loss": 0.4666, "num_input_tokens_seen": 79030960, "step": 65145 }, { "epoch": 7.255819133533801, "grad_norm": 0.08873563259840012, "learning_rate": 1.0619815923100013e-05, "loss": 0.4604, "num_input_tokens_seen": 79037008, "step": 65150 }, { "epoch": 7.256375988417418, "grad_norm": 0.0813496857881546, "learning_rate": 1.0615841106661448e-05, "loss": 0.4748, "num_input_tokens_seen": 79043120, "step": 65155 }, { "epoch": 7.256932843301036, "grad_norm": 0.12967875599861145, "learning_rate": 1.0611866833702491e-05, "loss": 0.4527, "num_input_tokens_seen": 79049200, "step": 65160 }, { "epoch": 7.257489698184653, "grad_norm": 0.13414739072322845, "learning_rate": 1.0607893104373326e-05, "loss": 0.4758, "num_input_tokens_seen": 79054864, "step": 65165 }, { "epoch": 7.2580465530682705, "grad_norm": 0.11933246999979019, "learning_rate": 1.0603919918824074e-05, "loss": 0.4646, "num_input_tokens_seen": 79060976, "step": 65170 }, { "epoch": 7.258603407951888, "grad_norm": 0.09726984053850174, "learning_rate": 1.0599947277204866e-05, "loss": 0.4521, "num_input_tokens_seen": 79067184, "step": 65175 }, { "epoch": 7.259160262835505, "grad_norm": 0.11504394561052322, "learning_rate": 1.059597517966579e-05, "loss": 0.4649, "num_input_tokens_seen": 79073328, "step": 65180 }, { "epoch": 7.259717117719123, "grad_norm": 0.08252301812171936, "learning_rate": 1.0592003626356934e-05, "loss": 0.4689, "num_input_tokens_seen": 79079536, "step": 65185 }, { "epoch": 7.260273972602739, "grad_norm": 0.1345914900302887, "learning_rate": 1.0588032617428362e-05, "loss": 0.458, "num_input_tokens_seen": 79085776, "step": 65190 }, { "epoch": 7.260830827486357, "grad_norm": 0.09211763739585876, "learning_rate": 1.0584062153030092e-05, "loss": 0.4623, "num_input_tokens_seen": 79091632, "step": 65195 }, { "epoch": 7.261387682369975, "grad_norm": 0.09851127117872238, "learning_rate": 1.0580092233312161e-05, "loss": 0.4685, "num_input_tokens_seen": 79097776, "step": 65200 }, { "epoch": 7.261944537253592, "grad_norm": 0.10227402299642563, "learning_rate": 1.0576122858424548e-05, "loss": 0.466, "num_input_tokens_seen": 79103248, "step": 65205 }, { "epoch": 7.262501392137209, "grad_norm": 0.1309746354818344, "learning_rate": 1.0572154028517239e-05, "loss": 0.4584, "num_input_tokens_seen": 79110000, "step": 65210 }, { "epoch": 7.263058247020826, "grad_norm": 0.12134332209825516, "learning_rate": 1.0568185743740194e-05, "loss": 0.4698, "num_input_tokens_seen": 79116144, "step": 65215 }, { "epoch": 7.263615101904444, "grad_norm": 0.1859976053237915, "learning_rate": 1.056421800424333e-05, "loss": 0.4731, "num_input_tokens_seen": 79122384, "step": 65220 }, { "epoch": 7.264171956788061, "grad_norm": 0.11311333626508713, "learning_rate": 1.0560250810176583e-05, "loss": 0.4654, "num_input_tokens_seen": 79128848, "step": 65225 }, { "epoch": 7.264728811671678, "grad_norm": 0.093834787607193, "learning_rate": 1.0556284161689828e-05, "loss": 0.462, "num_input_tokens_seen": 79135120, "step": 65230 }, { "epoch": 7.265285666555296, "grad_norm": 0.15248431265354156, "learning_rate": 1.055231805893295e-05, "loss": 0.4511, "num_input_tokens_seen": 79141072, "step": 65235 }, { "epoch": 7.265842521438913, "grad_norm": 0.1193196177482605, "learning_rate": 1.0548352502055787e-05, "loss": 0.4656, "num_input_tokens_seen": 79147216, "step": 65240 }, { "epoch": 7.26639937632253, "grad_norm": 0.1177511066198349, "learning_rate": 1.054438749120818e-05, "loss": 0.4621, "num_input_tokens_seen": 79153168, "step": 65245 }, { "epoch": 7.266956231206148, "grad_norm": 0.13336674869060516, "learning_rate": 1.0540423026539947e-05, "loss": 0.4701, "num_input_tokens_seen": 79159344, "step": 65250 }, { "epoch": 7.267513086089765, "grad_norm": 0.10210452973842621, "learning_rate": 1.053645910820086e-05, "loss": 0.452, "num_input_tokens_seen": 79165616, "step": 65255 }, { "epoch": 7.268069940973382, "grad_norm": 0.09646181762218475, "learning_rate": 1.053249573634071e-05, "loss": 0.4607, "num_input_tokens_seen": 79171952, "step": 65260 }, { "epoch": 7.268626795856999, "grad_norm": 0.0609530545771122, "learning_rate": 1.0528532911109226e-05, "loss": 0.4592, "num_input_tokens_seen": 79177488, "step": 65265 }, { "epoch": 7.269183650740617, "grad_norm": 0.08445071429014206, "learning_rate": 1.0524570632656153e-05, "loss": 0.458, "num_input_tokens_seen": 79183568, "step": 65270 }, { "epoch": 7.269740505624235, "grad_norm": 0.08497517555952072, "learning_rate": 1.0520608901131179e-05, "loss": 0.4485, "num_input_tokens_seen": 79189712, "step": 65275 }, { "epoch": 7.270297360507851, "grad_norm": 0.09297018498182297, "learning_rate": 1.0516647716684006e-05, "loss": 0.4647, "num_input_tokens_seen": 79195760, "step": 65280 }, { "epoch": 7.270854215391469, "grad_norm": 0.12235313653945923, "learning_rate": 1.05126870794643e-05, "loss": 0.4519, "num_input_tokens_seen": 79201424, "step": 65285 }, { "epoch": 7.271411070275087, "grad_norm": 0.1371467411518097, "learning_rate": 1.0508726989621697e-05, "loss": 0.4617, "num_input_tokens_seen": 79207216, "step": 65290 }, { "epoch": 7.2719679251587035, "grad_norm": 0.09866952896118164, "learning_rate": 1.0504767447305839e-05, "loss": 0.4586, "num_input_tokens_seen": 79213584, "step": 65295 }, { "epoch": 7.272524780042321, "grad_norm": 0.11528090387582779, "learning_rate": 1.0500808452666308e-05, "loss": 0.45, "num_input_tokens_seen": 79219632, "step": 65300 }, { "epoch": 7.273081634925938, "grad_norm": 0.1315445899963379, "learning_rate": 1.0496850005852699e-05, "loss": 0.4538, "num_input_tokens_seen": 79225488, "step": 65305 }, { "epoch": 7.273638489809556, "grad_norm": 0.08092791587114334, "learning_rate": 1.0492892107014585e-05, "loss": 0.4677, "num_input_tokens_seen": 79231888, "step": 65310 }, { "epoch": 7.274195344693173, "grad_norm": 0.12949232757091522, "learning_rate": 1.0488934756301485e-05, "loss": 0.4709, "num_input_tokens_seen": 79238160, "step": 65315 }, { "epoch": 7.27475219957679, "grad_norm": 0.07634012401103973, "learning_rate": 1.0484977953862942e-05, "loss": 0.4425, "num_input_tokens_seen": 79244304, "step": 65320 }, { "epoch": 7.275309054460408, "grad_norm": 0.1594361960887909, "learning_rate": 1.0481021699848443e-05, "loss": 0.4655, "num_input_tokens_seen": 79250672, "step": 65325 }, { "epoch": 7.2758659093440246, "grad_norm": 0.08011547476053238, "learning_rate": 1.0477065994407479e-05, "loss": 0.4522, "num_input_tokens_seen": 79256368, "step": 65330 }, { "epoch": 7.276422764227642, "grad_norm": 0.0934501513838768, "learning_rate": 1.0473110837689493e-05, "loss": 0.4476, "num_input_tokens_seen": 79262288, "step": 65335 }, { "epoch": 7.27697961911126, "grad_norm": 0.09620586782693863, "learning_rate": 1.0469156229843933e-05, "loss": 0.452, "num_input_tokens_seen": 79268432, "step": 65340 }, { "epoch": 7.277536473994877, "grad_norm": 0.07756946235895157, "learning_rate": 1.0465202171020228e-05, "loss": 0.4605, "num_input_tokens_seen": 79274448, "step": 65345 }, { "epoch": 7.278093328878494, "grad_norm": 0.08675553649663925, "learning_rate": 1.0461248661367754e-05, "loss": 0.4602, "num_input_tokens_seen": 79280784, "step": 65350 }, { "epoch": 7.278650183762112, "grad_norm": 0.22314144670963287, "learning_rate": 1.0457295701035904e-05, "loss": 0.4581, "num_input_tokens_seen": 79286576, "step": 65355 }, { "epoch": 7.279207038645729, "grad_norm": 0.0982399731874466, "learning_rate": 1.045334329017402e-05, "loss": 0.4615, "num_input_tokens_seen": 79292752, "step": 65360 }, { "epoch": 7.2797638935293465, "grad_norm": 0.15614666044712067, "learning_rate": 1.0449391428931454e-05, "loss": 0.464, "num_input_tokens_seen": 79298896, "step": 65365 }, { "epoch": 7.280320748412963, "grad_norm": 0.11722038686275482, "learning_rate": 1.0445440117457497e-05, "loss": 0.4621, "num_input_tokens_seen": 79305008, "step": 65370 }, { "epoch": 7.280877603296581, "grad_norm": 0.13161660730838776, "learning_rate": 1.0441489355901457e-05, "loss": 0.4628, "num_input_tokens_seen": 79311152, "step": 65375 }, { "epoch": 7.281434458180199, "grad_norm": 0.12905389070510864, "learning_rate": 1.0437539144412614e-05, "loss": 0.4628, "num_input_tokens_seen": 79316880, "step": 65380 }, { "epoch": 7.281991313063815, "grad_norm": 0.08110266923904419, "learning_rate": 1.04335894831402e-05, "loss": 0.4552, "num_input_tokens_seen": 79322864, "step": 65385 }, { "epoch": 7.282548167947433, "grad_norm": 0.1399221271276474, "learning_rate": 1.0429640372233466e-05, "loss": 0.4731, "num_input_tokens_seen": 79328976, "step": 65390 }, { "epoch": 7.28310502283105, "grad_norm": 0.08920589834451675, "learning_rate": 1.0425691811841598e-05, "loss": 0.4625, "num_input_tokens_seen": 79334960, "step": 65395 }, { "epoch": 7.2836618777146676, "grad_norm": 0.1113031730055809, "learning_rate": 1.0421743802113812e-05, "loss": 0.4543, "num_input_tokens_seen": 79340496, "step": 65400 }, { "epoch": 7.284218732598285, "grad_norm": 0.1360543668270111, "learning_rate": 1.0417796343199254e-05, "loss": 0.4672, "num_input_tokens_seen": 79346672, "step": 65405 }, { "epoch": 7.284775587481902, "grad_norm": 0.16512557864189148, "learning_rate": 1.0413849435247081e-05, "loss": 0.4622, "num_input_tokens_seen": 79353200, "step": 65410 }, { "epoch": 7.28533244236552, "grad_norm": 0.08478817343711853, "learning_rate": 1.0409903078406428e-05, "loss": 0.4574, "num_input_tokens_seen": 79359568, "step": 65415 }, { "epoch": 7.2858892972491365, "grad_norm": 0.10672484338283539, "learning_rate": 1.0405957272826386e-05, "loss": 0.4685, "num_input_tokens_seen": 79365264, "step": 65420 }, { "epoch": 7.286446152132754, "grad_norm": 0.1271347850561142, "learning_rate": 1.0402012018656054e-05, "loss": 0.4605, "num_input_tokens_seen": 79371216, "step": 65425 }, { "epoch": 7.287003007016372, "grad_norm": 0.09555046260356903, "learning_rate": 1.039806731604448e-05, "loss": 0.4533, "num_input_tokens_seen": 79377264, "step": 65430 }, { "epoch": 7.287559861899989, "grad_norm": 0.0867806151509285, "learning_rate": 1.0394123165140719e-05, "loss": 0.4491, "num_input_tokens_seen": 79383312, "step": 65435 }, { "epoch": 7.288116716783606, "grad_norm": 0.12395530194044113, "learning_rate": 1.03901795660938e-05, "loss": 0.4659, "num_input_tokens_seen": 79389648, "step": 65440 }, { "epoch": 7.288673571667223, "grad_norm": 0.1096695140004158, "learning_rate": 1.0386236519052709e-05, "loss": 0.472, "num_input_tokens_seen": 79395792, "step": 65445 }, { "epoch": 7.289230426550841, "grad_norm": 0.08992024511098862, "learning_rate": 1.0382294024166439e-05, "loss": 0.4582, "num_input_tokens_seen": 79401648, "step": 65450 }, { "epoch": 7.289787281434458, "grad_norm": 0.12106122821569443, "learning_rate": 1.037835208158394e-05, "loss": 0.4678, "num_input_tokens_seen": 79407024, "step": 65455 }, { "epoch": 7.290344136318075, "grad_norm": 0.12391791492700577, "learning_rate": 1.0374410691454165e-05, "loss": 0.45, "num_input_tokens_seen": 79412976, "step": 65460 }, { "epoch": 7.290900991201693, "grad_norm": 0.13355547189712524, "learning_rate": 1.0370469853926015e-05, "loss": 0.4478, "num_input_tokens_seen": 79418928, "step": 65465 }, { "epoch": 7.291457846085311, "grad_norm": 0.0639662891626358, "learning_rate": 1.0366529569148397e-05, "loss": 0.4605, "num_input_tokens_seen": 79425040, "step": 65470 }, { "epoch": 7.292014700968927, "grad_norm": 0.14157262444496155, "learning_rate": 1.0362589837270196e-05, "loss": 0.4662, "num_input_tokens_seen": 79430736, "step": 65475 }, { "epoch": 7.292571555852545, "grad_norm": 0.09657327830791473, "learning_rate": 1.035865065844025e-05, "loss": 0.4783, "num_input_tokens_seen": 79437040, "step": 65480 }, { "epoch": 7.293128410736162, "grad_norm": 0.12134347856044769, "learning_rate": 1.0354712032807413e-05, "loss": 0.4664, "num_input_tokens_seen": 79443024, "step": 65485 }, { "epoch": 7.2936852656197795, "grad_norm": 0.10458938777446747, "learning_rate": 1.0350773960520477e-05, "loss": 0.4623, "num_input_tokens_seen": 79448880, "step": 65490 }, { "epoch": 7.294242120503397, "grad_norm": 0.09591779857873917, "learning_rate": 1.0346836441728255e-05, "loss": 0.4705, "num_input_tokens_seen": 79454864, "step": 65495 }, { "epoch": 7.294798975387014, "grad_norm": 0.10835237801074982, "learning_rate": 1.0342899476579513e-05, "loss": 0.4762, "num_input_tokens_seen": 79460912, "step": 65500 }, { "epoch": 7.295355830270632, "grad_norm": 0.0875389501452446, "learning_rate": 1.033896306522299e-05, "loss": 0.462, "num_input_tokens_seen": 79467088, "step": 65505 }, { "epoch": 7.295912685154248, "grad_norm": 0.11529508978128433, "learning_rate": 1.0335027207807432e-05, "loss": 0.4524, "num_input_tokens_seen": 79473136, "step": 65510 }, { "epoch": 7.296469540037866, "grad_norm": 0.10702582448720932, "learning_rate": 1.0331091904481538e-05, "loss": 0.4577, "num_input_tokens_seen": 79479216, "step": 65515 }, { "epoch": 7.297026394921484, "grad_norm": 0.08669546246528625, "learning_rate": 1.0327157155393999e-05, "loss": 0.4661, "num_input_tokens_seen": 79484688, "step": 65520 }, { "epoch": 7.2975832498051005, "grad_norm": 0.1485612392425537, "learning_rate": 1.0323222960693491e-05, "loss": 0.4683, "num_input_tokens_seen": 79491120, "step": 65525 }, { "epoch": 7.298140104688718, "grad_norm": 0.10281631350517273, "learning_rate": 1.0319289320528647e-05, "loss": 0.4545, "num_input_tokens_seen": 79497296, "step": 65530 }, { "epoch": 7.298696959572336, "grad_norm": 0.09331705421209335, "learning_rate": 1.0315356235048107e-05, "loss": 0.4647, "num_input_tokens_seen": 79503600, "step": 65535 }, { "epoch": 7.299253814455953, "grad_norm": 0.10416774451732635, "learning_rate": 1.0311423704400458e-05, "loss": 0.4695, "num_input_tokens_seen": 79509200, "step": 65540 }, { "epoch": 7.29981066933957, "grad_norm": 0.16940197348594666, "learning_rate": 1.0307491728734306e-05, "loss": 0.4439, "num_input_tokens_seen": 79514864, "step": 65545 }, { "epoch": 7.300367524223187, "grad_norm": 0.1329432874917984, "learning_rate": 1.0303560308198187e-05, "loss": 0.4558, "num_input_tokens_seen": 79520880, "step": 65550 }, { "epoch": 7.300924379106805, "grad_norm": 0.09112176299095154, "learning_rate": 1.0299629442940658e-05, "loss": 0.4692, "num_input_tokens_seen": 79527056, "step": 65555 }, { "epoch": 7.3014812339904225, "grad_norm": 0.1118769571185112, "learning_rate": 1.0295699133110251e-05, "loss": 0.4555, "num_input_tokens_seen": 79533264, "step": 65560 }, { "epoch": 7.302038088874039, "grad_norm": 0.10452321916818619, "learning_rate": 1.029176937885544e-05, "loss": 0.4539, "num_input_tokens_seen": 79539504, "step": 65565 }, { "epoch": 7.302594943757657, "grad_norm": 0.12813422083854675, "learning_rate": 1.0287840180324728e-05, "loss": 0.4676, "num_input_tokens_seen": 79545456, "step": 65570 }, { "epoch": 7.303151798641274, "grad_norm": 0.13847385346889496, "learning_rate": 1.0283911537666552e-05, "loss": 0.4659, "num_input_tokens_seen": 79551664, "step": 65575 }, { "epoch": 7.303708653524891, "grad_norm": 0.11335128545761108, "learning_rate": 1.0279983451029357e-05, "loss": 0.4726, "num_input_tokens_seen": 79557648, "step": 65580 }, { "epoch": 7.304265508408509, "grad_norm": 0.12572532892227173, "learning_rate": 1.027605592056157e-05, "loss": 0.4727, "num_input_tokens_seen": 79563952, "step": 65585 }, { "epoch": 7.304822363292126, "grad_norm": 0.12630926072597504, "learning_rate": 1.0272128946411563e-05, "loss": 0.4498, "num_input_tokens_seen": 79570160, "step": 65590 }, { "epoch": 7.3053792181757435, "grad_norm": 0.10507145524024963, "learning_rate": 1.0268202528727735e-05, "loss": 0.456, "num_input_tokens_seen": 79576080, "step": 65595 }, { "epoch": 7.30593607305936, "grad_norm": 0.12926670908927917, "learning_rate": 1.0264276667658415e-05, "loss": 0.4647, "num_input_tokens_seen": 79582512, "step": 65600 }, { "epoch": 7.306492927942978, "grad_norm": 0.09551480412483215, "learning_rate": 1.0260351363351956e-05, "loss": 0.4605, "num_input_tokens_seen": 79588848, "step": 65605 }, { "epoch": 7.307049782826596, "grad_norm": 0.1320827603340149, "learning_rate": 1.0256426615956649e-05, "loss": 0.461, "num_input_tokens_seen": 79594864, "step": 65610 }, { "epoch": 7.3076066377102125, "grad_norm": 0.12569265067577362, "learning_rate": 1.025250242562079e-05, "loss": 0.4614, "num_input_tokens_seen": 79600560, "step": 65615 }, { "epoch": 7.30816349259383, "grad_norm": 0.12382758408784866, "learning_rate": 1.0248578792492664e-05, "loss": 0.4729, "num_input_tokens_seen": 79606544, "step": 65620 }, { "epoch": 7.308720347477447, "grad_norm": 0.08237715065479279, "learning_rate": 1.0244655716720492e-05, "loss": 0.4456, "num_input_tokens_seen": 79612656, "step": 65625 }, { "epoch": 7.309277202361065, "grad_norm": 0.1378844976425171, "learning_rate": 1.024073319845252e-05, "loss": 0.4743, "num_input_tokens_seen": 79618480, "step": 65630 }, { "epoch": 7.309834057244682, "grad_norm": 0.12450840324163437, "learning_rate": 1.0236811237836941e-05, "loss": 0.4541, "num_input_tokens_seen": 79624400, "step": 65635 }, { "epoch": 7.310390912128299, "grad_norm": 0.08542516827583313, "learning_rate": 1.0232889835021952e-05, "loss": 0.4657, "num_input_tokens_seen": 79630960, "step": 65640 }, { "epoch": 7.310947767011917, "grad_norm": 0.06322265416383743, "learning_rate": 1.0228968990155698e-05, "loss": 0.4605, "num_input_tokens_seen": 79637008, "step": 65645 }, { "epoch": 7.311504621895534, "grad_norm": 0.10662432760000229, "learning_rate": 1.0225048703386333e-05, "loss": 0.4649, "num_input_tokens_seen": 79643216, "step": 65650 }, { "epoch": 7.312061476779151, "grad_norm": 0.12266072630882263, "learning_rate": 1.0221128974861987e-05, "loss": 0.4644, "num_input_tokens_seen": 79649232, "step": 65655 }, { "epoch": 7.312618331662769, "grad_norm": 0.11709728837013245, "learning_rate": 1.021720980473074e-05, "loss": 0.4615, "num_input_tokens_seen": 79654736, "step": 65660 }, { "epoch": 7.313175186546386, "grad_norm": 0.1207897961139679, "learning_rate": 1.0213291193140686e-05, "loss": 0.4637, "num_input_tokens_seen": 79661040, "step": 65665 }, { "epoch": 7.313732041430003, "grad_norm": 0.11166097968816757, "learning_rate": 1.0209373140239873e-05, "loss": 0.4614, "num_input_tokens_seen": 79667216, "step": 65670 }, { "epoch": 7.314288896313621, "grad_norm": 0.15554511547088623, "learning_rate": 1.0205455646176349e-05, "loss": 0.4723, "num_input_tokens_seen": 79673776, "step": 65675 }, { "epoch": 7.314845751197238, "grad_norm": 0.12437210977077484, "learning_rate": 1.0201538711098111e-05, "loss": 0.4512, "num_input_tokens_seen": 79680080, "step": 65680 }, { "epoch": 7.3154026060808555, "grad_norm": 0.09821637719869614, "learning_rate": 1.0197622335153165e-05, "loss": 0.4507, "num_input_tokens_seen": 79685968, "step": 65685 }, { "epoch": 7.315959460964472, "grad_norm": 0.08978118747472763, "learning_rate": 1.0193706518489492e-05, "loss": 0.4574, "num_input_tokens_seen": 79692336, "step": 65690 }, { "epoch": 7.31651631584809, "grad_norm": 0.14519350230693817, "learning_rate": 1.018979126125503e-05, "loss": 0.477, "num_input_tokens_seen": 79698608, "step": 65695 }, { "epoch": 7.317073170731708, "grad_norm": 0.12992362678050995, "learning_rate": 1.0185876563597723e-05, "loss": 0.459, "num_input_tokens_seen": 79704752, "step": 65700 }, { "epoch": 7.317630025615324, "grad_norm": 0.1319754272699356, "learning_rate": 1.0181962425665465e-05, "loss": 0.4607, "num_input_tokens_seen": 79710832, "step": 65705 }, { "epoch": 7.318186880498942, "grad_norm": 0.08101829886436462, "learning_rate": 1.0178048847606153e-05, "loss": 0.4553, "num_input_tokens_seen": 79716912, "step": 65710 }, { "epoch": 7.31874373538256, "grad_norm": 0.08536921441555023, "learning_rate": 1.0174135829567664e-05, "loss": 0.4593, "num_input_tokens_seen": 79722768, "step": 65715 }, { "epoch": 7.3193005902661765, "grad_norm": 0.08855017274618149, "learning_rate": 1.0170223371697826e-05, "loss": 0.4614, "num_input_tokens_seen": 79728464, "step": 65720 }, { "epoch": 7.319857445149794, "grad_norm": 0.16589675843715668, "learning_rate": 1.0166311474144483e-05, "loss": 0.4656, "num_input_tokens_seen": 79734768, "step": 65725 }, { "epoch": 7.320414300033411, "grad_norm": 0.07756342738866806, "learning_rate": 1.0162400137055419e-05, "loss": 0.4602, "num_input_tokens_seen": 79740784, "step": 65730 }, { "epoch": 7.320971154917029, "grad_norm": 0.13365933299064636, "learning_rate": 1.0158489360578439e-05, "loss": 0.4797, "num_input_tokens_seen": 79746832, "step": 65735 }, { "epoch": 7.321528009800646, "grad_norm": 0.12263499945402145, "learning_rate": 1.015457914486128e-05, "loss": 0.4743, "num_input_tokens_seen": 79752816, "step": 65740 }, { "epoch": 7.322084864684263, "grad_norm": 0.12493396550416946, "learning_rate": 1.0150669490051699e-05, "loss": 0.4548, "num_input_tokens_seen": 79758640, "step": 65745 }, { "epoch": 7.322641719567881, "grad_norm": 0.19259297847747803, "learning_rate": 1.014676039629742e-05, "loss": 0.4643, "num_input_tokens_seen": 79764560, "step": 65750 }, { "epoch": 7.323198574451498, "grad_norm": 0.10981141775846481, "learning_rate": 1.0142851863746122e-05, "loss": 0.4612, "num_input_tokens_seen": 79770736, "step": 65755 }, { "epoch": 7.323755429335115, "grad_norm": 0.11478450149297714, "learning_rate": 1.0138943892545503e-05, "loss": 0.4593, "num_input_tokens_seen": 79776656, "step": 65760 }, { "epoch": 7.324312284218733, "grad_norm": 0.11117468774318695, "learning_rate": 1.0135036482843197e-05, "loss": 0.4593, "num_input_tokens_seen": 79782480, "step": 65765 }, { "epoch": 7.32486913910235, "grad_norm": 0.12251907587051392, "learning_rate": 1.0131129634786863e-05, "loss": 0.4544, "num_input_tokens_seen": 79788688, "step": 65770 }, { "epoch": 7.325425993985967, "grad_norm": 0.10942579060792923, "learning_rate": 1.0127223348524086e-05, "loss": 0.4564, "num_input_tokens_seen": 79794768, "step": 65775 }, { "epoch": 7.325982848869584, "grad_norm": 0.11741838604211807, "learning_rate": 1.012331762420248e-05, "loss": 0.4558, "num_input_tokens_seen": 79801104, "step": 65780 }, { "epoch": 7.326539703753202, "grad_norm": 0.09027095884084702, "learning_rate": 1.0119412461969612e-05, "loss": 0.4714, "num_input_tokens_seen": 79807056, "step": 65785 }, { "epoch": 7.3270965586368195, "grad_norm": 0.11087990552186966, "learning_rate": 1.0115507861973022e-05, "loss": 0.4687, "num_input_tokens_seen": 79812880, "step": 65790 }, { "epoch": 7.327653413520436, "grad_norm": 0.10051684081554413, "learning_rate": 1.0111603824360255e-05, "loss": 0.4538, "num_input_tokens_seen": 79818448, "step": 65795 }, { "epoch": 7.328210268404054, "grad_norm": 0.11986729502677917, "learning_rate": 1.0107700349278798e-05, "loss": 0.471, "num_input_tokens_seen": 79824560, "step": 65800 }, { "epoch": 7.328767123287671, "grad_norm": 0.11045293509960175, "learning_rate": 1.0103797436876145e-05, "loss": 0.448, "num_input_tokens_seen": 79830832, "step": 65805 }, { "epoch": 7.3293239781712884, "grad_norm": 0.1135123074054718, "learning_rate": 1.0099895087299772e-05, "loss": 0.4589, "num_input_tokens_seen": 79837008, "step": 65810 }, { "epoch": 7.329880833054906, "grad_norm": 0.13757704198360443, "learning_rate": 1.0095993300697105e-05, "loss": 0.4729, "num_input_tokens_seen": 79842800, "step": 65815 }, { "epoch": 7.330437687938523, "grad_norm": 0.0829179510474205, "learning_rate": 1.0092092077215579e-05, "loss": 0.4759, "num_input_tokens_seen": 79848912, "step": 65820 }, { "epoch": 7.330994542822141, "grad_norm": 0.14267081022262573, "learning_rate": 1.0088191417002582e-05, "loss": 0.4731, "num_input_tokens_seen": 79855152, "step": 65825 }, { "epoch": 7.331551397705758, "grad_norm": 0.11651395261287689, "learning_rate": 1.0084291320205508e-05, "loss": 0.4743, "num_input_tokens_seen": 79861296, "step": 65830 }, { "epoch": 7.332108252589375, "grad_norm": 0.09337493032217026, "learning_rate": 1.0080391786971699e-05, "loss": 0.4594, "num_input_tokens_seen": 79867696, "step": 65835 }, { "epoch": 7.332665107472993, "grad_norm": 0.08295874297618866, "learning_rate": 1.0076492817448501e-05, "loss": 0.4576, "num_input_tokens_seen": 79874192, "step": 65840 }, { "epoch": 7.3332219623566095, "grad_norm": 0.11601397395133972, "learning_rate": 1.007259441178324e-05, "loss": 0.4709, "num_input_tokens_seen": 79880336, "step": 65845 }, { "epoch": 7.333778817240227, "grad_norm": 0.13261736929416656, "learning_rate": 1.006869657012319e-05, "loss": 0.4593, "num_input_tokens_seen": 79886384, "step": 65850 }, { "epoch": 7.334335672123845, "grad_norm": 0.07716415822505951, "learning_rate": 1.006479929261564e-05, "loss": 0.463, "num_input_tokens_seen": 79892368, "step": 65855 }, { "epoch": 7.334892527007462, "grad_norm": 0.14119842648506165, "learning_rate": 1.0060902579407827e-05, "loss": 0.4537, "num_input_tokens_seen": 79898928, "step": 65860 }, { "epoch": 7.335449381891079, "grad_norm": 0.1334524005651474, "learning_rate": 1.0057006430646999e-05, "loss": 0.463, "num_input_tokens_seen": 79905136, "step": 65865 }, { "epoch": 7.336006236774696, "grad_norm": 0.12929362058639526, "learning_rate": 1.0053110846480342e-05, "loss": 0.48, "num_input_tokens_seen": 79911024, "step": 65870 }, { "epoch": 7.336563091658314, "grad_norm": 0.12000507116317749, "learning_rate": 1.0049215827055063e-05, "loss": 0.4551, "num_input_tokens_seen": 79917200, "step": 65875 }, { "epoch": 7.3371199465419314, "grad_norm": 0.09478902071714401, "learning_rate": 1.0045321372518328e-05, "loss": 0.4611, "num_input_tokens_seen": 79923312, "step": 65880 }, { "epoch": 7.337676801425548, "grad_norm": 0.10083851218223572, "learning_rate": 1.0041427483017268e-05, "loss": 0.4604, "num_input_tokens_seen": 79928016, "step": 65885 }, { "epoch": 7.338233656309166, "grad_norm": 0.1294410079717636, "learning_rate": 1.0037534158699024e-05, "loss": 0.455, "num_input_tokens_seen": 79934128, "step": 65890 }, { "epoch": 7.338790511192784, "grad_norm": 0.11964502930641174, "learning_rate": 1.0033641399710677e-05, "loss": 0.4578, "num_input_tokens_seen": 79940336, "step": 65895 }, { "epoch": 7.3393473660764, "grad_norm": 0.1492956429719925, "learning_rate": 1.0029749206199331e-05, "loss": 0.4661, "num_input_tokens_seen": 79946288, "step": 65900 }, { "epoch": 7.339904220960018, "grad_norm": 0.11545545607805252, "learning_rate": 1.0025857578312028e-05, "loss": 0.4485, "num_input_tokens_seen": 79952624, "step": 65905 }, { "epoch": 7.340461075843635, "grad_norm": 0.09240034222602844, "learning_rate": 1.0021966516195819e-05, "loss": 0.4622, "num_input_tokens_seen": 79958832, "step": 65910 }, { "epoch": 7.3410179307272525, "grad_norm": 0.09811046719551086, "learning_rate": 1.0018076019997715e-05, "loss": 0.4592, "num_input_tokens_seen": 79965008, "step": 65915 }, { "epoch": 7.34157478561087, "grad_norm": 0.06556407362222672, "learning_rate": 1.0014186089864702e-05, "loss": 0.4717, "num_input_tokens_seen": 79971312, "step": 65920 }, { "epoch": 7.342131640494487, "grad_norm": 0.13801515102386475, "learning_rate": 1.0010296725943764e-05, "loss": 0.4695, "num_input_tokens_seen": 79977584, "step": 65925 }, { "epoch": 7.342688495378105, "grad_norm": 0.12308858335018158, "learning_rate": 1.0006407928381861e-05, "loss": 0.4519, "num_input_tokens_seen": 79983888, "step": 65930 }, { "epoch": 7.343245350261721, "grad_norm": 0.09847443550825119, "learning_rate": 1.0002519697325908e-05, "loss": 0.4675, "num_input_tokens_seen": 79990160, "step": 65935 }, { "epoch": 7.343802205145339, "grad_norm": 0.08296418190002441, "learning_rate": 9.998632032922828e-06, "loss": 0.4725, "num_input_tokens_seen": 79996240, "step": 65940 }, { "epoch": 7.344359060028957, "grad_norm": 0.11026743799448013, "learning_rate": 9.994744935319498e-06, "loss": 0.4544, "num_input_tokens_seen": 80002576, "step": 65945 }, { "epoch": 7.344915914912574, "grad_norm": 0.11625958979129791, "learning_rate": 9.990858404662797e-06, "loss": 0.4712, "num_input_tokens_seen": 80008624, "step": 65950 }, { "epoch": 7.345472769796191, "grad_norm": 0.14389269053936005, "learning_rate": 9.986972441099559e-06, "loss": 0.45, "num_input_tokens_seen": 80014928, "step": 65955 }, { "epoch": 7.346029624679808, "grad_norm": 0.12496399134397507, "learning_rate": 9.983087044776612e-06, "loss": 0.465, "num_input_tokens_seen": 80021104, "step": 65960 }, { "epoch": 7.346586479563426, "grad_norm": 0.10440853983163834, "learning_rate": 9.97920221584077e-06, "loss": 0.4539, "num_input_tokens_seen": 80027216, "step": 65965 }, { "epoch": 7.347143334447043, "grad_norm": 0.18024957180023193, "learning_rate": 9.975317954438796e-06, "loss": 0.4566, "num_input_tokens_seen": 80033584, "step": 65970 }, { "epoch": 7.34770018933066, "grad_norm": 0.08540347218513489, "learning_rate": 9.971434260717467e-06, "loss": 0.4646, "num_input_tokens_seen": 80039952, "step": 65975 }, { "epoch": 7.348257044214278, "grad_norm": 0.12808936834335327, "learning_rate": 9.967551134823505e-06, "loss": 0.4513, "num_input_tokens_seen": 80046384, "step": 65980 }, { "epoch": 7.348813899097895, "grad_norm": 0.13109956681728363, "learning_rate": 9.963668576903634e-06, "loss": 0.4652, "num_input_tokens_seen": 80052688, "step": 65985 }, { "epoch": 7.349370753981512, "grad_norm": 0.10274187475442886, "learning_rate": 9.95978658710456e-06, "loss": 0.4686, "num_input_tokens_seen": 80058992, "step": 65990 }, { "epoch": 7.34992760886513, "grad_norm": 0.09987909346818924, "learning_rate": 9.955905165572938e-06, "loss": 0.4605, "num_input_tokens_seen": 80065072, "step": 65995 }, { "epoch": 7.350484463748747, "grad_norm": 0.14176538586616516, "learning_rate": 9.952024312455438e-06, "loss": 0.4531, "num_input_tokens_seen": 80071344, "step": 66000 }, { "epoch": 7.351041318632364, "grad_norm": 0.13176383078098297, "learning_rate": 9.948144027898676e-06, "loss": 0.4655, "num_input_tokens_seen": 80077328, "step": 66005 }, { "epoch": 7.351598173515982, "grad_norm": 0.11359129101037979, "learning_rate": 9.944264312049278e-06, "loss": 0.4668, "num_input_tokens_seen": 80083536, "step": 66010 }, { "epoch": 7.352155028399599, "grad_norm": 0.0950406864285469, "learning_rate": 9.940385165053812e-06, "loss": 0.4657, "num_input_tokens_seen": 80089552, "step": 66015 }, { "epoch": 7.352711883283217, "grad_norm": 0.07944012433290482, "learning_rate": 9.936506587058858e-06, "loss": 0.4589, "num_input_tokens_seen": 80094896, "step": 66020 }, { "epoch": 7.353268738166833, "grad_norm": 0.11720726639032364, "learning_rate": 9.932628578210965e-06, "loss": 0.4671, "num_input_tokens_seen": 80100944, "step": 66025 }, { "epoch": 7.353825593050451, "grad_norm": 0.09126662462949753, "learning_rate": 9.928751138656642e-06, "loss": 0.4521, "num_input_tokens_seen": 80106800, "step": 66030 }, { "epoch": 7.354382447934069, "grad_norm": 0.1584731489419937, "learning_rate": 9.924874268542405e-06, "loss": 0.4808, "num_input_tokens_seen": 80112912, "step": 66035 }, { "epoch": 7.3549393028176855, "grad_norm": 0.10974971204996109, "learning_rate": 9.920997968014725e-06, "loss": 0.4496, "num_input_tokens_seen": 80118896, "step": 66040 }, { "epoch": 7.355496157701303, "grad_norm": 0.09743252396583557, "learning_rate": 9.917122237220073e-06, "loss": 0.4598, "num_input_tokens_seen": 80124976, "step": 66045 }, { "epoch": 7.356053012584921, "grad_norm": 0.09636718779802322, "learning_rate": 9.913247076304865e-06, "loss": 0.469, "num_input_tokens_seen": 80130608, "step": 66050 }, { "epoch": 7.356609867468538, "grad_norm": 0.1396801620721817, "learning_rate": 9.909372485415533e-06, "loss": 0.4536, "num_input_tokens_seen": 80136752, "step": 66055 }, { "epoch": 7.357166722352155, "grad_norm": 0.08622869104146957, "learning_rate": 9.905498464698478e-06, "loss": 0.468, "num_input_tokens_seen": 80142800, "step": 66060 }, { "epoch": 7.357723577235772, "grad_norm": 0.08967071771621704, "learning_rate": 9.901625014300056e-06, "loss": 0.4573, "num_input_tokens_seen": 80148752, "step": 66065 }, { "epoch": 7.35828043211939, "grad_norm": 0.12697826325893402, "learning_rate": 9.897752134366633e-06, "loss": 0.4589, "num_input_tokens_seen": 80154672, "step": 66070 }, { "epoch": 7.358837287003007, "grad_norm": 0.13901452720165253, "learning_rate": 9.893879825044525e-06, "loss": 0.4562, "num_input_tokens_seen": 80160752, "step": 66075 }, { "epoch": 7.359394141886624, "grad_norm": 0.1102508157491684, "learning_rate": 9.890008086480047e-06, "loss": 0.476, "num_input_tokens_seen": 80167056, "step": 66080 }, { "epoch": 7.359950996770242, "grad_norm": 0.09960892051458359, "learning_rate": 9.886136918819491e-06, "loss": 0.4632, "num_input_tokens_seen": 80173040, "step": 66085 }, { "epoch": 7.360507851653859, "grad_norm": 0.08821861445903778, "learning_rate": 9.882266322209111e-06, "loss": 0.4585, "num_input_tokens_seen": 80178928, "step": 66090 }, { "epoch": 7.361064706537476, "grad_norm": 0.08088972419500351, "learning_rate": 9.878396296795165e-06, "loss": 0.4611, "num_input_tokens_seen": 80184976, "step": 66095 }, { "epoch": 7.361621561421094, "grad_norm": 0.08945140987634659, "learning_rate": 9.874526842723858e-06, "loss": 0.4671, "num_input_tokens_seen": 80191152, "step": 66100 }, { "epoch": 7.362178416304711, "grad_norm": 0.1212829202413559, "learning_rate": 9.870657960141408e-06, "loss": 0.4592, "num_input_tokens_seen": 80197392, "step": 66105 }, { "epoch": 7.3627352711883285, "grad_norm": 0.1474534273147583, "learning_rate": 9.866789649193974e-06, "loss": 0.4517, "num_input_tokens_seen": 80203536, "step": 66110 }, { "epoch": 7.363292126071945, "grad_norm": 0.08468743413686752, "learning_rate": 9.862921910027725e-06, "loss": 0.4649, "num_input_tokens_seen": 80209936, "step": 66115 }, { "epoch": 7.363848980955563, "grad_norm": 0.153525248169899, "learning_rate": 9.859054742788803e-06, "loss": 0.4606, "num_input_tokens_seen": 80216208, "step": 66120 }, { "epoch": 7.364405835839181, "grad_norm": 0.11798294633626938, "learning_rate": 9.855188147623307e-06, "loss": 0.4601, "num_input_tokens_seen": 80222064, "step": 66125 }, { "epoch": 7.364962690722797, "grad_norm": 0.11672618985176086, "learning_rate": 9.851322124677345e-06, "loss": 0.4681, "num_input_tokens_seen": 80228336, "step": 66130 }, { "epoch": 7.365519545606415, "grad_norm": 0.10447916388511658, "learning_rate": 9.847456674096971e-06, "loss": 0.4527, "num_input_tokens_seen": 80234320, "step": 66135 }, { "epoch": 7.366076400490032, "grad_norm": 0.11059590429067612, "learning_rate": 9.84359179602825e-06, "loss": 0.4665, "num_input_tokens_seen": 80239984, "step": 66140 }, { "epoch": 7.3666332553736495, "grad_norm": 0.09196348488330841, "learning_rate": 9.839727490617196e-06, "loss": 0.452, "num_input_tokens_seen": 80245648, "step": 66145 }, { "epoch": 7.367190110257267, "grad_norm": 0.1096845492720604, "learning_rate": 9.835863758009819e-06, "loss": 0.4467, "num_input_tokens_seen": 80251952, "step": 66150 }, { "epoch": 7.367746965140884, "grad_norm": 0.09049684554338455, "learning_rate": 9.832000598352114e-06, "loss": 0.4657, "num_input_tokens_seen": 80258064, "step": 66155 }, { "epoch": 7.368303820024502, "grad_norm": 0.10303298383951187, "learning_rate": 9.828138011790025e-06, "loss": 0.4522, "num_input_tokens_seen": 80264016, "step": 66160 }, { "epoch": 7.368860674908119, "grad_norm": 0.14415781199932098, "learning_rate": 9.824275998469514e-06, "loss": 0.4778, "num_input_tokens_seen": 80269968, "step": 66165 }, { "epoch": 7.369417529791736, "grad_norm": 0.08371423929929733, "learning_rate": 9.820414558536478e-06, "loss": 0.4526, "num_input_tokens_seen": 80276048, "step": 66170 }, { "epoch": 7.369974384675354, "grad_norm": 0.1517915427684784, "learning_rate": 9.816553692136835e-06, "loss": 0.4624, "num_input_tokens_seen": 80281904, "step": 66175 }, { "epoch": 7.370531239558971, "grad_norm": 0.09954065829515457, "learning_rate": 9.812693399416442e-06, "loss": 0.4564, "num_input_tokens_seen": 80288048, "step": 66180 }, { "epoch": 7.371088094442588, "grad_norm": 0.13146965205669403, "learning_rate": 9.808833680521163e-06, "loss": 0.4481, "num_input_tokens_seen": 80293232, "step": 66185 }, { "epoch": 7.371644949326206, "grad_norm": 0.11875776946544647, "learning_rate": 9.804974535596836e-06, "loss": 0.47, "num_input_tokens_seen": 80299472, "step": 66190 }, { "epoch": 7.372201804209823, "grad_norm": 0.0974162369966507, "learning_rate": 9.80111596478926e-06, "loss": 0.451, "num_input_tokens_seen": 80305264, "step": 66195 }, { "epoch": 7.37275865909344, "grad_norm": 0.0824441984295845, "learning_rate": 9.797257968244239e-06, "loss": 0.4664, "num_input_tokens_seen": 80311024, "step": 66200 }, { "epoch": 7.373315513977057, "grad_norm": 0.11358879506587982, "learning_rate": 9.79340054610752e-06, "loss": 0.4641, "num_input_tokens_seen": 80317040, "step": 66205 }, { "epoch": 7.373872368860675, "grad_norm": 0.10030639916658401, "learning_rate": 9.789543698524864e-06, "loss": 0.4688, "num_input_tokens_seen": 80322448, "step": 66210 }, { "epoch": 7.3744292237442925, "grad_norm": 0.08984155207872391, "learning_rate": 9.785687425641999e-06, "loss": 0.467, "num_input_tokens_seen": 80328656, "step": 66215 }, { "epoch": 7.374986078627909, "grad_norm": 0.09448761492967606, "learning_rate": 9.78183172760461e-06, "loss": 0.4599, "num_input_tokens_seen": 80334512, "step": 66220 }, { "epoch": 7.375542933511527, "grad_norm": 0.10094485431909561, "learning_rate": 9.777976604558395e-06, "loss": 0.4717, "num_input_tokens_seen": 80340464, "step": 66225 }, { "epoch": 7.376099788395145, "grad_norm": 0.10779445618391037, "learning_rate": 9.774122056648998e-06, "loss": 0.4606, "num_input_tokens_seen": 80346576, "step": 66230 }, { "epoch": 7.3766566432787615, "grad_norm": 0.09141691029071808, "learning_rate": 9.770268084022071e-06, "loss": 0.4691, "num_input_tokens_seen": 80352784, "step": 66235 }, { "epoch": 7.377213498162379, "grad_norm": 0.09650260210037231, "learning_rate": 9.766414686823216e-06, "loss": 0.456, "num_input_tokens_seen": 80359056, "step": 66240 }, { "epoch": 7.377770353045996, "grad_norm": 0.11223898828029633, "learning_rate": 9.76256186519803e-06, "loss": 0.4671, "num_input_tokens_seen": 80365296, "step": 66245 }, { "epoch": 7.378327207929614, "grad_norm": 0.12930281460285187, "learning_rate": 9.758709619292098e-06, "loss": 0.4531, "num_input_tokens_seen": 80371600, "step": 66250 }, { "epoch": 7.378884062813231, "grad_norm": 0.10183673352003098, "learning_rate": 9.75485794925095e-06, "loss": 0.4619, "num_input_tokens_seen": 80378096, "step": 66255 }, { "epoch": 7.379440917696848, "grad_norm": 0.09021098166704178, "learning_rate": 9.751006855220132e-06, "loss": 0.4487, "num_input_tokens_seen": 80383984, "step": 66260 }, { "epoch": 7.379997772580466, "grad_norm": 0.10954001545906067, "learning_rate": 9.747156337345134e-06, "loss": 0.4515, "num_input_tokens_seen": 80389904, "step": 66265 }, { "epoch": 7.3805546274640825, "grad_norm": 0.09808076173067093, "learning_rate": 9.743306395771459e-06, "loss": 0.4682, "num_input_tokens_seen": 80396112, "step": 66270 }, { "epoch": 7.3811114823477, "grad_norm": 0.0953783169388771, "learning_rate": 9.739457030644556e-06, "loss": 0.4634, "num_input_tokens_seen": 80402448, "step": 66275 }, { "epoch": 7.381668337231318, "grad_norm": 0.11306008696556091, "learning_rate": 9.735608242109867e-06, "loss": 0.4556, "num_input_tokens_seen": 80408592, "step": 66280 }, { "epoch": 7.382225192114935, "grad_norm": 0.1084555983543396, "learning_rate": 9.731760030312822e-06, "loss": 0.4578, "num_input_tokens_seen": 80414224, "step": 66285 }, { "epoch": 7.382782046998552, "grad_norm": 0.09761843830347061, "learning_rate": 9.727912395398808e-06, "loss": 0.4543, "num_input_tokens_seen": 80420240, "step": 66290 }, { "epoch": 7.383338901882169, "grad_norm": 0.1141524612903595, "learning_rate": 9.724065337513214e-06, "loss": 0.464, "num_input_tokens_seen": 80426000, "step": 66295 }, { "epoch": 7.383895756765787, "grad_norm": 0.15660814940929413, "learning_rate": 9.720218856801378e-06, "loss": 0.4645, "num_input_tokens_seen": 80432272, "step": 66300 }, { "epoch": 7.3844526116494045, "grad_norm": 0.14791074395179749, "learning_rate": 9.716372953408636e-06, "loss": 0.4681, "num_input_tokens_seen": 80438448, "step": 66305 }, { "epoch": 7.385009466533021, "grad_norm": 0.13330058753490448, "learning_rate": 9.712527627480314e-06, "loss": 0.4544, "num_input_tokens_seen": 80444720, "step": 66310 }, { "epoch": 7.385566321416639, "grad_norm": 0.13400760293006897, "learning_rate": 9.70868287916168e-06, "loss": 0.465, "num_input_tokens_seen": 80450256, "step": 66315 }, { "epoch": 7.386123176300256, "grad_norm": 0.09720246493816376, "learning_rate": 9.704838708598019e-06, "loss": 0.454, "num_input_tokens_seen": 80456400, "step": 66320 }, { "epoch": 7.386680031183873, "grad_norm": 0.11905211955308914, "learning_rate": 9.700995115934567e-06, "loss": 0.4566, "num_input_tokens_seen": 80462288, "step": 66325 }, { "epoch": 7.387236886067491, "grad_norm": 0.11316424608230591, "learning_rate": 9.69715210131654e-06, "loss": 0.4554, "num_input_tokens_seen": 80468336, "step": 66330 }, { "epoch": 7.387793740951108, "grad_norm": 0.11598577350378036, "learning_rate": 9.693309664889155e-06, "loss": 0.4705, "num_input_tokens_seen": 80474128, "step": 66335 }, { "epoch": 7.3883505958347255, "grad_norm": 0.10732477903366089, "learning_rate": 9.689467806797575e-06, "loss": 0.4675, "num_input_tokens_seen": 80480464, "step": 66340 }, { "epoch": 7.388907450718343, "grad_norm": 0.08898313343524933, "learning_rate": 9.685626527186974e-06, "loss": 0.4639, "num_input_tokens_seen": 80485968, "step": 66345 }, { "epoch": 7.38946430560196, "grad_norm": 0.0955284833908081, "learning_rate": 9.681785826202472e-06, "loss": 0.4559, "num_input_tokens_seen": 80491728, "step": 66350 }, { "epoch": 7.390021160485578, "grad_norm": 0.10903491824865341, "learning_rate": 9.677945703989192e-06, "loss": 0.4542, "num_input_tokens_seen": 80497840, "step": 66355 }, { "epoch": 7.3905780153691945, "grad_norm": 0.12099258601665497, "learning_rate": 9.674106160692234e-06, "loss": 0.4552, "num_input_tokens_seen": 80503664, "step": 66360 }, { "epoch": 7.391134870252812, "grad_norm": 0.11171028017997742, "learning_rate": 9.67026719645665e-06, "loss": 0.4598, "num_input_tokens_seen": 80509552, "step": 66365 }, { "epoch": 7.39169172513643, "grad_norm": 0.10943380743265152, "learning_rate": 9.666428811427505e-06, "loss": 0.4691, "num_input_tokens_seen": 80515504, "step": 66370 }, { "epoch": 7.392248580020047, "grad_norm": 0.10948565602302551, "learning_rate": 9.662591005749814e-06, "loss": 0.461, "num_input_tokens_seen": 80521616, "step": 66375 }, { "epoch": 7.392805434903664, "grad_norm": 0.10603974759578705, "learning_rate": 9.658753779568592e-06, "loss": 0.4706, "num_input_tokens_seen": 80527856, "step": 66380 }, { "epoch": 7.393362289787281, "grad_norm": 0.12220630794763565, "learning_rate": 9.654917133028807e-06, "loss": 0.4694, "num_input_tokens_seen": 80534032, "step": 66385 }, { "epoch": 7.393919144670899, "grad_norm": 0.1011165902018547, "learning_rate": 9.651081066275427e-06, "loss": 0.459, "num_input_tokens_seen": 80540240, "step": 66390 }, { "epoch": 7.394475999554516, "grad_norm": 0.10357392579317093, "learning_rate": 9.6472455794534e-06, "loss": 0.4553, "num_input_tokens_seen": 80546320, "step": 66395 }, { "epoch": 7.395032854438133, "grad_norm": 0.08906634896993637, "learning_rate": 9.643410672707631e-06, "loss": 0.4456, "num_input_tokens_seen": 80552432, "step": 66400 }, { "epoch": 7.395589709321751, "grad_norm": 0.09114929288625717, "learning_rate": 9.639576346183027e-06, "loss": 0.4504, "num_input_tokens_seen": 80558416, "step": 66405 }, { "epoch": 7.3961465642053685, "grad_norm": 0.11047592014074326, "learning_rate": 9.635742600024445e-06, "loss": 0.4492, "num_input_tokens_seen": 80564432, "step": 66410 }, { "epoch": 7.396703419088985, "grad_norm": 0.10182441025972366, "learning_rate": 9.631909434376752e-06, "loss": 0.471, "num_input_tokens_seen": 80570384, "step": 66415 }, { "epoch": 7.397260273972603, "grad_norm": 0.07785675674676895, "learning_rate": 9.62807684938476e-06, "loss": 0.4639, "num_input_tokens_seen": 80576464, "step": 66420 }, { "epoch": 7.39781712885622, "grad_norm": 0.09689699858427048, "learning_rate": 9.624244845193289e-06, "loss": 0.4567, "num_input_tokens_seen": 80582512, "step": 66425 }, { "epoch": 7.3983739837398375, "grad_norm": 0.12548445165157318, "learning_rate": 9.62041342194713e-06, "loss": 0.4577, "num_input_tokens_seen": 80588592, "step": 66430 }, { "epoch": 7.398930838623455, "grad_norm": 0.1215251237154007, "learning_rate": 9.616582579791028e-06, "loss": 0.4585, "num_input_tokens_seen": 80594896, "step": 66435 }, { "epoch": 7.399487693507072, "grad_norm": 0.11539682745933533, "learning_rate": 9.612752318869745e-06, "loss": 0.4595, "num_input_tokens_seen": 80600720, "step": 66440 }, { "epoch": 7.40004454839069, "grad_norm": 0.11223260313272476, "learning_rate": 9.60892263932798e-06, "loss": 0.4567, "num_input_tokens_seen": 80606992, "step": 66445 }, { "epoch": 7.400601403274306, "grad_norm": 0.11072063446044922, "learning_rate": 9.60509354131045e-06, "loss": 0.4624, "num_input_tokens_seen": 80613072, "step": 66450 }, { "epoch": 7.401158258157924, "grad_norm": 0.09020398557186127, "learning_rate": 9.601265024961814e-06, "loss": 0.4625, "num_input_tokens_seen": 80619280, "step": 66455 }, { "epoch": 7.401715113041542, "grad_norm": 0.08181124925613403, "learning_rate": 9.597437090426734e-06, "loss": 0.4785, "num_input_tokens_seen": 80625328, "step": 66460 }, { "epoch": 7.4022719679251585, "grad_norm": 0.0916639119386673, "learning_rate": 9.593609737849845e-06, "loss": 0.4669, "num_input_tokens_seen": 80631408, "step": 66465 }, { "epoch": 7.402828822808776, "grad_norm": 0.12665095925331116, "learning_rate": 9.58978296737575e-06, "loss": 0.4538, "num_input_tokens_seen": 80637520, "step": 66470 }, { "epoch": 7.403385677692393, "grad_norm": 0.11114206165075302, "learning_rate": 9.58595677914904e-06, "loss": 0.4615, "num_input_tokens_seen": 80643824, "step": 66475 }, { "epoch": 7.403942532576011, "grad_norm": 0.11493215709924698, "learning_rate": 9.582131173314277e-06, "loss": 0.4531, "num_input_tokens_seen": 80650000, "step": 66480 }, { "epoch": 7.404499387459628, "grad_norm": 0.10995103418827057, "learning_rate": 9.578306150016005e-06, "loss": 0.462, "num_input_tokens_seen": 80655792, "step": 66485 }, { "epoch": 7.405056242343245, "grad_norm": 0.10377878695726395, "learning_rate": 9.574481709398756e-06, "loss": 0.4569, "num_input_tokens_seen": 80661872, "step": 66490 }, { "epoch": 7.405613097226863, "grad_norm": 0.13609962165355682, "learning_rate": 9.570657851607015e-06, "loss": 0.4627, "num_input_tokens_seen": 80668080, "step": 66495 }, { "epoch": 7.40616995211048, "grad_norm": 0.10207504779100418, "learning_rate": 9.566834576785272e-06, "loss": 0.4534, "num_input_tokens_seen": 80674096, "step": 66500 }, { "epoch": 7.406726806994097, "grad_norm": 0.12567687034606934, "learning_rate": 9.563011885077969e-06, "loss": 0.4618, "num_input_tokens_seen": 80680016, "step": 66505 }, { "epoch": 7.407283661877715, "grad_norm": 0.08721478283405304, "learning_rate": 9.559189776629557e-06, "loss": 0.4767, "num_input_tokens_seen": 80686064, "step": 66510 }, { "epoch": 7.407840516761332, "grad_norm": 0.11228063702583313, "learning_rate": 9.55536825158443e-06, "loss": 0.4653, "num_input_tokens_seen": 80692240, "step": 66515 }, { "epoch": 7.408397371644949, "grad_norm": 0.11813241988420486, "learning_rate": 9.551547310086983e-06, "loss": 0.4696, "num_input_tokens_seen": 80698352, "step": 66520 }, { "epoch": 7.408954226528567, "grad_norm": 0.16052572429180145, "learning_rate": 9.547726952281593e-06, "loss": 0.4652, "num_input_tokens_seen": 80704688, "step": 66525 }, { "epoch": 7.409511081412184, "grad_norm": 0.09077899158000946, "learning_rate": 9.543907178312591e-06, "loss": 0.4437, "num_input_tokens_seen": 80710832, "step": 66530 }, { "epoch": 7.4100679362958015, "grad_norm": 0.11607135087251663, "learning_rate": 9.540087988324317e-06, "loss": 0.4568, "num_input_tokens_seen": 80717136, "step": 66535 }, { "epoch": 7.410624791179418, "grad_norm": 0.13269366323947906, "learning_rate": 9.536269382461052e-06, "loss": 0.4575, "num_input_tokens_seen": 80723440, "step": 66540 }, { "epoch": 7.411181646063036, "grad_norm": 0.0889301523566246, "learning_rate": 9.532451360867098e-06, "loss": 0.4602, "num_input_tokens_seen": 80729488, "step": 66545 }, { "epoch": 7.411738500946654, "grad_norm": 0.10086241364479065, "learning_rate": 9.528633923686685e-06, "loss": 0.4609, "num_input_tokens_seen": 80735984, "step": 66550 }, { "epoch": 7.41229535583027, "grad_norm": 0.1017627865076065, "learning_rate": 9.524817071064067e-06, "loss": 0.4573, "num_input_tokens_seen": 80742224, "step": 66555 }, { "epoch": 7.412852210713888, "grad_norm": 0.14231424033641815, "learning_rate": 9.521000803143462e-06, "loss": 0.474, "num_input_tokens_seen": 80748176, "step": 66560 }, { "epoch": 7.413409065597505, "grad_norm": 0.0844087302684784, "learning_rate": 9.517185120069041e-06, "loss": 0.4706, "num_input_tokens_seen": 80754320, "step": 66565 }, { "epoch": 7.413965920481123, "grad_norm": 0.14182204008102417, "learning_rate": 9.513370021984993e-06, "loss": 0.4561, "num_input_tokens_seen": 80760432, "step": 66570 }, { "epoch": 7.41452277536474, "grad_norm": 0.0838913768529892, "learning_rate": 9.509555509035446e-06, "loss": 0.4672, "num_input_tokens_seen": 80766640, "step": 66575 }, { "epoch": 7.415079630248357, "grad_norm": 0.14234983921051025, "learning_rate": 9.505741581364533e-06, "loss": 0.4574, "num_input_tokens_seen": 80772656, "step": 66580 }, { "epoch": 7.415636485131975, "grad_norm": 0.09585198760032654, "learning_rate": 9.501928239116364e-06, "loss": 0.4661, "num_input_tokens_seen": 80778736, "step": 66585 }, { "epoch": 7.416193340015592, "grad_norm": 0.10497575253248215, "learning_rate": 9.498115482435005e-06, "loss": 0.4798, "num_input_tokens_seen": 80784624, "step": 66590 }, { "epoch": 7.416750194899209, "grad_norm": 0.12181317061185837, "learning_rate": 9.494303311464531e-06, "loss": 0.4529, "num_input_tokens_seen": 80790672, "step": 66595 }, { "epoch": 7.417307049782827, "grad_norm": 0.1278942972421646, "learning_rate": 9.49049172634896e-06, "loss": 0.4526, "num_input_tokens_seen": 80796816, "step": 66600 }, { "epoch": 7.417863904666444, "grad_norm": 0.08267800509929657, "learning_rate": 9.48668072723232e-06, "loss": 0.4672, "num_input_tokens_seen": 80802608, "step": 66605 }, { "epoch": 7.418420759550061, "grad_norm": 0.11884121596813202, "learning_rate": 9.48287031425859e-06, "loss": 0.4617, "num_input_tokens_seen": 80808880, "step": 66610 }, { "epoch": 7.418977614433679, "grad_norm": 0.08968998491764069, "learning_rate": 9.479060487571747e-06, "loss": 0.4773, "num_input_tokens_seen": 80814864, "step": 66615 }, { "epoch": 7.419534469317296, "grad_norm": 0.10235185921192169, "learning_rate": 9.475251247315748e-06, "loss": 0.4542, "num_input_tokens_seen": 80821040, "step": 66620 }, { "epoch": 7.420091324200913, "grad_norm": 0.13569369912147522, "learning_rate": 9.471442593634497e-06, "loss": 0.4582, "num_input_tokens_seen": 80827472, "step": 66625 }, { "epoch": 7.42064817908453, "grad_norm": 0.12105346471071243, "learning_rate": 9.467634526671917e-06, "loss": 0.4629, "num_input_tokens_seen": 80833168, "step": 66630 }, { "epoch": 7.421205033968148, "grad_norm": 0.10546308010816574, "learning_rate": 9.463827046571873e-06, "loss": 0.4565, "num_input_tokens_seen": 80839408, "step": 66635 }, { "epoch": 7.421761888851766, "grad_norm": 0.116291344165802, "learning_rate": 9.46002015347824e-06, "loss": 0.4481, "num_input_tokens_seen": 80845680, "step": 66640 }, { "epoch": 7.422318743735382, "grad_norm": 0.1260080188512802, "learning_rate": 9.456213847534836e-06, "loss": 0.4705, "num_input_tokens_seen": 80851792, "step": 66645 }, { "epoch": 7.422875598619, "grad_norm": 0.12814931571483612, "learning_rate": 9.452408128885487e-06, "loss": 0.4629, "num_input_tokens_seen": 80858096, "step": 66650 }, { "epoch": 7.423432453502617, "grad_norm": 0.1414264291524887, "learning_rate": 9.448602997673995e-06, "loss": 0.4608, "num_input_tokens_seen": 80864304, "step": 66655 }, { "epoch": 7.4239893083862345, "grad_norm": 0.12900789082050323, "learning_rate": 9.444798454044104e-06, "loss": 0.4413, "num_input_tokens_seen": 80870544, "step": 66660 }, { "epoch": 7.424546163269852, "grad_norm": 0.09343798458576202, "learning_rate": 9.440994498139589e-06, "loss": 0.4714, "num_input_tokens_seen": 80876624, "step": 66665 }, { "epoch": 7.425103018153469, "grad_norm": 0.16869908571243286, "learning_rate": 9.437191130104153e-06, "loss": 0.4656, "num_input_tokens_seen": 80881840, "step": 66670 }, { "epoch": 7.425659873037087, "grad_norm": 0.1542157083749771, "learning_rate": 9.433388350081518e-06, "loss": 0.4611, "num_input_tokens_seen": 80888080, "step": 66675 }, { "epoch": 7.426216727920703, "grad_norm": 0.13231410086154938, "learning_rate": 9.42958615821535e-06, "loss": 0.4636, "num_input_tokens_seen": 80893808, "step": 66680 }, { "epoch": 7.426773582804321, "grad_norm": 0.11181019991636276, "learning_rate": 9.425784554649314e-06, "loss": 0.4587, "num_input_tokens_seen": 80899504, "step": 66685 }, { "epoch": 7.427330437687939, "grad_norm": 0.1349867731332779, "learning_rate": 9.421983539527054e-06, "loss": 0.4556, "num_input_tokens_seen": 80905936, "step": 66690 }, { "epoch": 7.4278872925715556, "grad_norm": 0.09778512269258499, "learning_rate": 9.418183112992173e-06, "loss": 0.4585, "num_input_tokens_seen": 80912240, "step": 66695 }, { "epoch": 7.428444147455173, "grad_norm": 0.13248826563358307, "learning_rate": 9.414383275188273e-06, "loss": 0.4662, "num_input_tokens_seen": 80917712, "step": 66700 }, { "epoch": 7.429001002338791, "grad_norm": 0.11449914425611496, "learning_rate": 9.410584026258915e-06, "loss": 0.4557, "num_input_tokens_seen": 80923888, "step": 66705 }, { "epoch": 7.429557857222408, "grad_norm": 0.1468876749277115, "learning_rate": 9.406785366347649e-06, "loss": 0.475, "num_input_tokens_seen": 80929872, "step": 66710 }, { "epoch": 7.430114712106025, "grad_norm": 0.10625898092985153, "learning_rate": 9.402987295598013e-06, "loss": 0.4645, "num_input_tokens_seen": 80935824, "step": 66715 }, { "epoch": 7.430671566989642, "grad_norm": 0.13706235587596893, "learning_rate": 9.39918981415349e-06, "loss": 0.4613, "num_input_tokens_seen": 80941840, "step": 66720 }, { "epoch": 7.43122842187326, "grad_norm": 0.12437457591295242, "learning_rate": 9.395392922157578e-06, "loss": 0.4613, "num_input_tokens_seen": 80947888, "step": 66725 }, { "epoch": 7.4317852767568775, "grad_norm": 0.10845735669136047, "learning_rate": 9.391596619753723e-06, "loss": 0.4556, "num_input_tokens_seen": 80953968, "step": 66730 }, { "epoch": 7.432342131640494, "grad_norm": 0.13333483040332794, "learning_rate": 9.387800907085376e-06, "loss": 0.46, "num_input_tokens_seen": 80960176, "step": 66735 }, { "epoch": 7.432898986524112, "grad_norm": 0.07738757133483887, "learning_rate": 9.38400578429594e-06, "loss": 0.457, "num_input_tokens_seen": 80966384, "step": 66740 }, { "epoch": 7.433455841407729, "grad_norm": 0.16253948211669922, "learning_rate": 9.380211251528802e-06, "loss": 0.4475, "num_input_tokens_seen": 80972624, "step": 66745 }, { "epoch": 7.434012696291346, "grad_norm": 0.1599796861410141, "learning_rate": 9.376417308927348e-06, "loss": 0.4649, "num_input_tokens_seen": 80978704, "step": 66750 }, { "epoch": 7.434569551174964, "grad_norm": 0.07965213805437088, "learning_rate": 9.372623956634908e-06, "loss": 0.4657, "num_input_tokens_seen": 80984816, "step": 66755 }, { "epoch": 7.435126406058581, "grad_norm": 0.12132950127124786, "learning_rate": 9.368831194794814e-06, "loss": 0.4615, "num_input_tokens_seen": 80991088, "step": 66760 }, { "epoch": 7.435683260942199, "grad_norm": 0.11735722422599792, "learning_rate": 9.365039023550378e-06, "loss": 0.463, "num_input_tokens_seen": 80997360, "step": 66765 }, { "epoch": 7.436240115825816, "grad_norm": 0.08128489553928375, "learning_rate": 9.361247443044865e-06, "loss": 0.4524, "num_input_tokens_seen": 81003888, "step": 66770 }, { "epoch": 7.436796970709433, "grad_norm": 0.09867940098047256, "learning_rate": 9.35745645342155e-06, "loss": 0.462, "num_input_tokens_seen": 81009872, "step": 66775 }, { "epoch": 7.437353825593051, "grad_norm": 0.12251816689968109, "learning_rate": 9.35366605482365e-06, "loss": 0.4616, "num_input_tokens_seen": 81015920, "step": 66780 }, { "epoch": 7.4379106804766675, "grad_norm": 0.09384665638208389, "learning_rate": 9.349876247394396e-06, "loss": 0.4632, "num_input_tokens_seen": 81022384, "step": 66785 }, { "epoch": 7.438467535360285, "grad_norm": 0.11832873523235321, "learning_rate": 9.346087031276962e-06, "loss": 0.4656, "num_input_tokens_seen": 81028624, "step": 66790 }, { "epoch": 7.439024390243903, "grad_norm": 0.13569900393486023, "learning_rate": 9.342298406614524e-06, "loss": 0.4571, "num_input_tokens_seen": 81034864, "step": 66795 }, { "epoch": 7.43958124512752, "grad_norm": 0.11310796439647675, "learning_rate": 9.338510373550242e-06, "loss": 0.4645, "num_input_tokens_seen": 81040912, "step": 66800 }, { "epoch": 7.440138100011137, "grad_norm": 0.14117370545864105, "learning_rate": 9.334722932227217e-06, "loss": 0.463, "num_input_tokens_seen": 81047056, "step": 66805 }, { "epoch": 7.440694954894754, "grad_norm": 0.12469443678855896, "learning_rate": 9.33093608278857e-06, "loss": 0.4487, "num_input_tokens_seen": 81052816, "step": 66810 }, { "epoch": 7.441251809778372, "grad_norm": 0.12333806604146957, "learning_rate": 9.327149825377362e-06, "loss": 0.4555, "num_input_tokens_seen": 81058512, "step": 66815 }, { "epoch": 7.441808664661989, "grad_norm": 0.09135742485523224, "learning_rate": 9.323364160136669e-06, "loss": 0.4479, "num_input_tokens_seen": 81065072, "step": 66820 }, { "epoch": 7.442365519545606, "grad_norm": 0.10766879469156265, "learning_rate": 9.31957908720951e-06, "loss": 0.4707, "num_input_tokens_seen": 81071120, "step": 66825 }, { "epoch": 7.442922374429224, "grad_norm": 0.12445056438446045, "learning_rate": 9.3157946067389e-06, "loss": 0.455, "num_input_tokens_seen": 81077648, "step": 66830 }, { "epoch": 7.443479229312841, "grad_norm": 0.14052750170230865, "learning_rate": 9.312010718867844e-06, "loss": 0.4696, "num_input_tokens_seen": 81083664, "step": 66835 }, { "epoch": 7.444036084196458, "grad_norm": 0.0877002626657486, "learning_rate": 9.308227423739288e-06, "loss": 0.4546, "num_input_tokens_seen": 81089584, "step": 66840 }, { "epoch": 7.444592939080076, "grad_norm": 0.1379445195198059, "learning_rate": 9.304444721496194e-06, "loss": 0.4695, "num_input_tokens_seen": 81095504, "step": 66845 }, { "epoch": 7.445149793963693, "grad_norm": 0.0931464359164238, "learning_rate": 9.300662612281469e-06, "loss": 0.4713, "num_input_tokens_seen": 81101872, "step": 66850 }, { "epoch": 7.4457066488473105, "grad_norm": 0.1291235089302063, "learning_rate": 9.296881096238022e-06, "loss": 0.4623, "num_input_tokens_seen": 81108368, "step": 66855 }, { "epoch": 7.446263503730927, "grad_norm": 0.07787532359361649, "learning_rate": 9.29310017350874e-06, "loss": 0.469, "num_input_tokens_seen": 81113808, "step": 66860 }, { "epoch": 7.446820358614545, "grad_norm": 0.12351249903440475, "learning_rate": 9.289319844236457e-06, "loss": 0.4494, "num_input_tokens_seen": 81119920, "step": 66865 }, { "epoch": 7.447377213498163, "grad_norm": 0.12283087521791458, "learning_rate": 9.285540108564029e-06, "loss": 0.4701, "num_input_tokens_seen": 81126000, "step": 66870 }, { "epoch": 7.447934068381779, "grad_norm": 0.09152859449386597, "learning_rate": 9.281760966634245e-06, "loss": 0.4659, "num_input_tokens_seen": 81131984, "step": 66875 }, { "epoch": 7.448490923265397, "grad_norm": 0.1502278745174408, "learning_rate": 9.277982418589914e-06, "loss": 0.4668, "num_input_tokens_seen": 81137968, "step": 66880 }, { "epoch": 7.449047778149015, "grad_norm": 0.11958103626966476, "learning_rate": 9.27420446457378e-06, "loss": 0.4668, "num_input_tokens_seen": 81143984, "step": 66885 }, { "epoch": 7.4496046330326315, "grad_norm": 0.12432660162448883, "learning_rate": 9.2704271047286e-06, "loss": 0.4601, "num_input_tokens_seen": 81150096, "step": 66890 }, { "epoch": 7.450161487916249, "grad_norm": 0.09564577043056488, "learning_rate": 9.266650339197097e-06, "loss": 0.4551, "num_input_tokens_seen": 81156240, "step": 66895 }, { "epoch": 7.450718342799866, "grad_norm": 0.13750362396240234, "learning_rate": 9.262874168121957e-06, "loss": 0.4642, "num_input_tokens_seen": 81162224, "step": 66900 }, { "epoch": 7.451275197683484, "grad_norm": 0.12137362360954285, "learning_rate": 9.259098591645873e-06, "loss": 0.4723, "num_input_tokens_seen": 81168368, "step": 66905 }, { "epoch": 7.451832052567101, "grad_norm": 0.1000746563076973, "learning_rate": 9.255323609911478e-06, "loss": 0.4574, "num_input_tokens_seen": 81173968, "step": 66910 }, { "epoch": 7.452388907450718, "grad_norm": 0.1619972139596939, "learning_rate": 9.251549223061422e-06, "loss": 0.4614, "num_input_tokens_seen": 81179984, "step": 66915 }, { "epoch": 7.452945762334336, "grad_norm": 0.11423100531101227, "learning_rate": 9.247775431238298e-06, "loss": 0.446, "num_input_tokens_seen": 81186128, "step": 66920 }, { "epoch": 7.453502617217953, "grad_norm": 0.09262894093990326, "learning_rate": 9.244002234584694e-06, "loss": 0.4663, "num_input_tokens_seen": 81192336, "step": 66925 }, { "epoch": 7.45405947210157, "grad_norm": 0.10305019468069077, "learning_rate": 9.240229633243191e-06, "loss": 0.4634, "num_input_tokens_seen": 81198480, "step": 66930 }, { "epoch": 7.454616326985188, "grad_norm": 0.0974046140909195, "learning_rate": 9.23645762735631e-06, "loss": 0.4654, "num_input_tokens_seen": 81204464, "step": 66935 }, { "epoch": 7.455173181868805, "grad_norm": 0.09518949687480927, "learning_rate": 9.232686217066583e-06, "loss": 0.4621, "num_input_tokens_seen": 81210640, "step": 66940 }, { "epoch": 7.455730036752422, "grad_norm": 0.07166256010532379, "learning_rate": 9.228915402516489e-06, "loss": 0.463, "num_input_tokens_seen": 81216688, "step": 66945 }, { "epoch": 7.45628689163604, "grad_norm": 0.11478316783905029, "learning_rate": 9.225145183848522e-06, "loss": 0.4557, "num_input_tokens_seen": 81222576, "step": 66950 }, { "epoch": 7.456843746519657, "grad_norm": 0.11685771495103836, "learning_rate": 9.221375561205114e-06, "loss": 0.4635, "num_input_tokens_seen": 81228944, "step": 66955 }, { "epoch": 7.4574006014032745, "grad_norm": 0.14319181442260742, "learning_rate": 9.217606534728704e-06, "loss": 0.4682, "num_input_tokens_seen": 81235120, "step": 66960 }, { "epoch": 7.457957456286891, "grad_norm": 0.10659133642911911, "learning_rate": 9.213838104561703e-06, "loss": 0.4727, "num_input_tokens_seen": 81241040, "step": 66965 }, { "epoch": 7.458514311170509, "grad_norm": 0.10592217743396759, "learning_rate": 9.210070270846482e-06, "loss": 0.4551, "num_input_tokens_seen": 81247312, "step": 66970 }, { "epoch": 7.459071166054127, "grad_norm": 0.16316105425357819, "learning_rate": 9.206303033725413e-06, "loss": 0.4548, "num_input_tokens_seen": 81253616, "step": 66975 }, { "epoch": 7.4596280209377435, "grad_norm": 0.12432672828435898, "learning_rate": 9.202536393340822e-06, "loss": 0.4644, "num_input_tokens_seen": 81259792, "step": 66980 }, { "epoch": 7.460184875821361, "grad_norm": 0.0895577073097229, "learning_rate": 9.198770349835031e-06, "loss": 0.4593, "num_input_tokens_seen": 81266192, "step": 66985 }, { "epoch": 7.460741730704978, "grad_norm": 0.0825730562210083, "learning_rate": 9.19500490335034e-06, "loss": 0.4434, "num_input_tokens_seen": 81272592, "step": 66990 }, { "epoch": 7.461298585588596, "grad_norm": 0.10752614587545395, "learning_rate": 9.191240054029007e-06, "loss": 0.4619, "num_input_tokens_seen": 81278512, "step": 66995 }, { "epoch": 7.461855440472213, "grad_norm": 0.10646242648363113, "learning_rate": 9.187475802013295e-06, "loss": 0.4606, "num_input_tokens_seen": 81284464, "step": 67000 }, { "epoch": 7.46241229535583, "grad_norm": 0.09444493055343628, "learning_rate": 9.183712147445415e-06, "loss": 0.4705, "num_input_tokens_seen": 81290352, "step": 67005 }, { "epoch": 7.462969150239448, "grad_norm": 0.10900531709194183, "learning_rate": 9.17994909046758e-06, "loss": 0.4614, "num_input_tokens_seen": 81295824, "step": 67010 }, { "epoch": 7.4635260051230645, "grad_norm": 0.07694883644580841, "learning_rate": 9.176186631221958e-06, "loss": 0.4601, "num_input_tokens_seen": 81302096, "step": 67015 }, { "epoch": 7.464082860006682, "grad_norm": 0.11788720637559891, "learning_rate": 9.172424769850718e-06, "loss": 0.443, "num_input_tokens_seen": 81307856, "step": 67020 }, { "epoch": 7.4646397148903, "grad_norm": 0.14072422683238983, "learning_rate": 9.168663506495998e-06, "loss": 0.448, "num_input_tokens_seen": 81314320, "step": 67025 }, { "epoch": 7.465196569773917, "grad_norm": 0.09903444349765778, "learning_rate": 9.164902841299896e-06, "loss": 0.4553, "num_input_tokens_seen": 81320272, "step": 67030 }, { "epoch": 7.465753424657534, "grad_norm": 0.08592002838850021, "learning_rate": 9.161142774404522e-06, "loss": 0.4676, "num_input_tokens_seen": 81325776, "step": 67035 }, { "epoch": 7.466310279541151, "grad_norm": 0.12717759609222412, "learning_rate": 9.157383305951922e-06, "loss": 0.4658, "num_input_tokens_seen": 81331088, "step": 67040 }, { "epoch": 7.466867134424769, "grad_norm": 0.12144418060779572, "learning_rate": 9.15362443608416e-06, "loss": 0.4705, "num_input_tokens_seen": 81337136, "step": 67045 }, { "epoch": 7.4674239893083865, "grad_norm": 0.09034766256809235, "learning_rate": 9.149866164943238e-06, "loss": 0.4671, "num_input_tokens_seen": 81343440, "step": 67050 }, { "epoch": 7.467980844192003, "grad_norm": 0.09632247686386108, "learning_rate": 9.146108492671168e-06, "loss": 0.4557, "num_input_tokens_seen": 81349328, "step": 67055 }, { "epoch": 7.468537699075621, "grad_norm": 0.1273302137851715, "learning_rate": 9.142351419409936e-06, "loss": 0.463, "num_input_tokens_seen": 81355216, "step": 67060 }, { "epoch": 7.469094553959239, "grad_norm": 0.14280599355697632, "learning_rate": 9.138594945301474e-06, "loss": 0.4463, "num_input_tokens_seen": 81361456, "step": 67065 }, { "epoch": 7.469651408842855, "grad_norm": 0.09649008512496948, "learning_rate": 9.134839070487735e-06, "loss": 0.4704, "num_input_tokens_seen": 81367696, "step": 67070 }, { "epoch": 7.470208263726473, "grad_norm": 0.12875650823116302, "learning_rate": 9.131083795110612e-06, "loss": 0.4692, "num_input_tokens_seen": 81373616, "step": 67075 }, { "epoch": 7.47076511861009, "grad_norm": 0.12091213464736938, "learning_rate": 9.127329119311995e-06, "loss": 0.4534, "num_input_tokens_seen": 81379952, "step": 67080 }, { "epoch": 7.4713219734937075, "grad_norm": 0.09573999047279358, "learning_rate": 9.12357504323376e-06, "loss": 0.4484, "num_input_tokens_seen": 81386096, "step": 67085 }, { "epoch": 7.471878828377325, "grad_norm": 0.12941506505012512, "learning_rate": 9.119821567017727e-06, "loss": 0.4566, "num_input_tokens_seen": 81392400, "step": 67090 }, { "epoch": 7.472435683260942, "grad_norm": 0.09909231960773468, "learning_rate": 9.116068690805738e-06, "loss": 0.4622, "num_input_tokens_seen": 81398768, "step": 67095 }, { "epoch": 7.47299253814456, "grad_norm": 0.08859948068857193, "learning_rate": 9.112316414739567e-06, "loss": 0.4514, "num_input_tokens_seen": 81405072, "step": 67100 }, { "epoch": 7.4735493930281764, "grad_norm": 0.15820550918579102, "learning_rate": 9.108564738961004e-06, "loss": 0.4571, "num_input_tokens_seen": 81411536, "step": 67105 }, { "epoch": 7.474106247911794, "grad_norm": 0.1485038846731186, "learning_rate": 9.104813663611786e-06, "loss": 0.4502, "num_input_tokens_seen": 81417872, "step": 67110 }, { "epoch": 7.474663102795412, "grad_norm": 0.09201192855834961, "learning_rate": 9.101063188833641e-06, "loss": 0.4508, "num_input_tokens_seen": 81423760, "step": 67115 }, { "epoch": 7.475219957679029, "grad_norm": 0.09875437617301941, "learning_rate": 9.097313314768294e-06, "loss": 0.4674, "num_input_tokens_seen": 81429712, "step": 67120 }, { "epoch": 7.475776812562646, "grad_norm": 0.1196470707654953, "learning_rate": 9.0935640415574e-06, "loss": 0.4607, "num_input_tokens_seen": 81436016, "step": 67125 }, { "epoch": 7.476333667446264, "grad_norm": 0.09172989428043365, "learning_rate": 9.089815369342642e-06, "loss": 0.4537, "num_input_tokens_seen": 81442320, "step": 67130 }, { "epoch": 7.476890522329881, "grad_norm": 0.07280683517456055, "learning_rate": 9.086067298265634e-06, "loss": 0.4498, "num_input_tokens_seen": 81448016, "step": 67135 }, { "epoch": 7.477447377213498, "grad_norm": 0.08030106127262115, "learning_rate": 9.082319828468014e-06, "loss": 0.4527, "num_input_tokens_seen": 81453904, "step": 67140 }, { "epoch": 7.478004232097115, "grad_norm": 0.12822817265987396, "learning_rate": 9.078572960091361e-06, "loss": 0.4764, "num_input_tokens_seen": 81460080, "step": 67145 }, { "epoch": 7.478561086980733, "grad_norm": 0.1660819947719574, "learning_rate": 9.074826693277236e-06, "loss": 0.4722, "num_input_tokens_seen": 81466480, "step": 67150 }, { "epoch": 7.4791179418643505, "grad_norm": 0.10967210680246353, "learning_rate": 9.071081028167203e-06, "loss": 0.4646, "num_input_tokens_seen": 81471952, "step": 67155 }, { "epoch": 7.479674796747967, "grad_norm": 0.10772532224655151, "learning_rate": 9.067335964902762e-06, "loss": 0.4625, "num_input_tokens_seen": 81478032, "step": 67160 }, { "epoch": 7.480231651631585, "grad_norm": 0.11598477512598038, "learning_rate": 9.063591503625432e-06, "loss": 0.4597, "num_input_tokens_seen": 81484400, "step": 67165 }, { "epoch": 7.480788506515202, "grad_norm": 0.09914267063140869, "learning_rate": 9.059847644476693e-06, "loss": 0.4547, "num_input_tokens_seen": 81490480, "step": 67170 }, { "epoch": 7.4813453613988194, "grad_norm": 0.0731860101222992, "learning_rate": 9.056104387597983e-06, "loss": 0.4724, "num_input_tokens_seen": 81496496, "step": 67175 }, { "epoch": 7.481902216282437, "grad_norm": 0.09523912519216537, "learning_rate": 9.052361733130755e-06, "loss": 0.4619, "num_input_tokens_seen": 81502512, "step": 67180 }, { "epoch": 7.482459071166054, "grad_norm": 0.10686280578374863, "learning_rate": 9.048619681216394e-06, "loss": 0.4538, "num_input_tokens_seen": 81508336, "step": 67185 }, { "epoch": 7.483015926049672, "grad_norm": 0.0823233425617218, "learning_rate": 9.044878231996312e-06, "loss": 0.4559, "num_input_tokens_seen": 81514096, "step": 67190 }, { "epoch": 7.483572780933288, "grad_norm": 0.09739412367343903, "learning_rate": 9.041137385611853e-06, "loss": 0.4597, "num_input_tokens_seen": 81520080, "step": 67195 }, { "epoch": 7.484129635816906, "grad_norm": 0.1510130763053894, "learning_rate": 9.037397142204365e-06, "loss": 0.4538, "num_input_tokens_seen": 81526480, "step": 67200 }, { "epoch": 7.484686490700524, "grad_norm": 0.12128914892673492, "learning_rate": 9.033657501915177e-06, "loss": 0.456, "num_input_tokens_seen": 81532464, "step": 67205 }, { "epoch": 7.4852433455841405, "grad_norm": 0.09031224250793457, "learning_rate": 9.029918464885567e-06, "loss": 0.4613, "num_input_tokens_seen": 81538544, "step": 67210 }, { "epoch": 7.485800200467758, "grad_norm": 0.12565429508686066, "learning_rate": 9.026180031256822e-06, "loss": 0.4608, "num_input_tokens_seen": 81544624, "step": 67215 }, { "epoch": 7.486357055351375, "grad_norm": 0.09465483576059341, "learning_rate": 9.022442201170181e-06, "loss": 0.4721, "num_input_tokens_seen": 81550576, "step": 67220 }, { "epoch": 7.486913910234993, "grad_norm": 0.0958055853843689, "learning_rate": 9.018704974766884e-06, "loss": 0.4768, "num_input_tokens_seen": 81556688, "step": 67225 }, { "epoch": 7.48747076511861, "grad_norm": 0.08503848314285278, "learning_rate": 9.014968352188119e-06, "loss": 0.4529, "num_input_tokens_seen": 81562224, "step": 67230 }, { "epoch": 7.488027620002227, "grad_norm": 0.1287691593170166, "learning_rate": 9.011232333575073e-06, "loss": 0.4695, "num_input_tokens_seen": 81568272, "step": 67235 }, { "epoch": 7.488584474885845, "grad_norm": 0.08501490950584412, "learning_rate": 9.007496919068919e-06, "loss": 0.4673, "num_input_tokens_seen": 81573552, "step": 67240 }, { "epoch": 7.4891413297694625, "grad_norm": 0.1325451135635376, "learning_rate": 9.003762108810775e-06, "loss": 0.4655, "num_input_tokens_seen": 81579696, "step": 67245 }, { "epoch": 7.489698184653079, "grad_norm": 0.15696237981319427, "learning_rate": 9.000027902941768e-06, "loss": 0.4596, "num_input_tokens_seen": 81585520, "step": 67250 }, { "epoch": 7.490255039536697, "grad_norm": 0.12916205823421478, "learning_rate": 8.996294301602973e-06, "loss": 0.4567, "num_input_tokens_seen": 81591664, "step": 67255 }, { "epoch": 7.490811894420314, "grad_norm": 0.0933072417974472, "learning_rate": 8.992561304935465e-06, "loss": 0.4644, "num_input_tokens_seen": 81597168, "step": 67260 }, { "epoch": 7.491368749303931, "grad_norm": 0.10374832153320312, "learning_rate": 8.9888289130803e-06, "loss": 0.4575, "num_input_tokens_seen": 81603440, "step": 67265 }, { "epoch": 7.491925604187549, "grad_norm": 0.10965412855148315, "learning_rate": 8.985097126178476e-06, "loss": 0.4559, "num_input_tokens_seen": 81609040, "step": 67270 }, { "epoch": 7.492482459071166, "grad_norm": 0.0922582596540451, "learning_rate": 8.981365944371017e-06, "loss": 0.459, "num_input_tokens_seen": 81615056, "step": 67275 }, { "epoch": 7.4930393139547835, "grad_norm": 0.10524747520685196, "learning_rate": 8.977635367798876e-06, "loss": 0.4705, "num_input_tokens_seen": 81621168, "step": 67280 }, { "epoch": 7.493596168838401, "grad_norm": 0.07994654029607773, "learning_rate": 8.973905396603028e-06, "loss": 0.4537, "num_input_tokens_seen": 81626992, "step": 67285 }, { "epoch": 7.494153023722018, "grad_norm": 0.11005207151174545, "learning_rate": 8.970176030924382e-06, "loss": 0.4567, "num_input_tokens_seen": 81633168, "step": 67290 }, { "epoch": 7.494709878605636, "grad_norm": 0.10406404733657837, "learning_rate": 8.966447270903855e-06, "loss": 0.4641, "num_input_tokens_seen": 81639504, "step": 67295 }, { "epoch": 7.495266733489252, "grad_norm": 0.09587117284536362, "learning_rate": 8.962719116682344e-06, "loss": 0.4565, "num_input_tokens_seen": 81645616, "step": 67300 }, { "epoch": 7.49582358837287, "grad_norm": 0.13723169267177582, "learning_rate": 8.958991568400687e-06, "loss": 0.4678, "num_input_tokens_seen": 81651568, "step": 67305 }, { "epoch": 7.496380443256488, "grad_norm": 0.11552082002162933, "learning_rate": 8.955264626199744e-06, "loss": 0.459, "num_input_tokens_seen": 81657840, "step": 67310 }, { "epoch": 7.496937298140105, "grad_norm": 0.11597940325737, "learning_rate": 8.951538290220313e-06, "loss": 0.4657, "num_input_tokens_seen": 81664016, "step": 67315 }, { "epoch": 7.497494153023722, "grad_norm": 0.11289995163679123, "learning_rate": 8.947812560603203e-06, "loss": 0.4576, "num_input_tokens_seen": 81670032, "step": 67320 }, { "epoch": 7.498051007907339, "grad_norm": 0.08503635227680206, "learning_rate": 8.944087437489169e-06, "loss": 0.455, "num_input_tokens_seen": 81676144, "step": 67325 }, { "epoch": 7.498607862790957, "grad_norm": 0.10286779701709747, "learning_rate": 8.940362921018966e-06, "loss": 0.4648, "num_input_tokens_seen": 81682320, "step": 67330 }, { "epoch": 7.499164717674574, "grad_norm": 0.11250415444374084, "learning_rate": 8.936639011333323e-06, "loss": 0.464, "num_input_tokens_seen": 81688432, "step": 67335 }, { "epoch": 7.499721572558191, "grad_norm": 0.1260482370853424, "learning_rate": 8.932915708572928e-06, "loss": 0.4604, "num_input_tokens_seen": 81693424, "step": 67340 }, { "epoch": 7.500278427441809, "grad_norm": 0.09186484664678574, "learning_rate": 8.929193012878479e-06, "loss": 0.4647, "num_input_tokens_seen": 81699600, "step": 67345 }, { "epoch": 7.500835282325426, "grad_norm": 0.08201577514410019, "learning_rate": 8.925470924390605e-06, "loss": 0.4698, "num_input_tokens_seen": 81705616, "step": 67350 }, { "epoch": 7.500835282325426, "eval_loss": 0.4640391767024994, "eval_runtime": 113.07, "eval_samples_per_second": 35.297, "eval_steps_per_second": 8.826, "num_input_tokens_seen": 81705616, "step": 67350 }, { "epoch": 7.501392137209043, "grad_norm": 0.12007363140583038, "learning_rate": 8.921749443249967e-06, "loss": 0.4609, "num_input_tokens_seen": 81711856, "step": 67355 }, { "epoch": 7.501948992092661, "grad_norm": 0.11513836681842804, "learning_rate": 8.918028569597148e-06, "loss": 0.4552, "num_input_tokens_seen": 81718096, "step": 67360 }, { "epoch": 7.502505846976278, "grad_norm": 0.0783965215086937, "learning_rate": 8.91430830357275e-06, "loss": 0.4588, "num_input_tokens_seen": 81724304, "step": 67365 }, { "epoch": 7.503062701859895, "grad_norm": 0.09355258196592331, "learning_rate": 8.91058864531734e-06, "loss": 0.4579, "num_input_tokens_seen": 81729776, "step": 67370 }, { "epoch": 7.503619556743512, "grad_norm": 0.1370815932750702, "learning_rate": 8.906869594971442e-06, "loss": 0.4684, "num_input_tokens_seen": 81735952, "step": 67375 }, { "epoch": 7.50417641162713, "grad_norm": 0.08346737176179886, "learning_rate": 8.903151152675596e-06, "loss": 0.4549, "num_input_tokens_seen": 81742032, "step": 67380 }, { "epoch": 7.504733266510748, "grad_norm": 0.10183155536651611, "learning_rate": 8.899433318570272e-06, "loss": 0.4684, "num_input_tokens_seen": 81748336, "step": 67385 }, { "epoch": 7.505290121394364, "grad_norm": 0.1781308948993683, "learning_rate": 8.895716092795955e-06, "loss": 0.4569, "num_input_tokens_seen": 81754736, "step": 67390 }, { "epoch": 7.505846976277982, "grad_norm": 0.12876665592193604, "learning_rate": 8.891999475493103e-06, "loss": 0.4526, "num_input_tokens_seen": 81760944, "step": 67395 }, { "epoch": 7.506403831161599, "grad_norm": 0.11973219364881516, "learning_rate": 8.888283466802122e-06, "loss": 0.461, "num_input_tokens_seen": 81766960, "step": 67400 }, { "epoch": 7.5069606860452165, "grad_norm": 0.09355638176202774, "learning_rate": 8.884568066863433e-06, "loss": 0.4581, "num_input_tokens_seen": 81772976, "step": 67405 }, { "epoch": 7.507517540928834, "grad_norm": 0.0885748341679573, "learning_rate": 8.880853275817397e-06, "loss": 0.4583, "num_input_tokens_seen": 81779152, "step": 67410 }, { "epoch": 7.508074395812451, "grad_norm": 0.18049460649490356, "learning_rate": 8.87713909380439e-06, "loss": 0.4776, "num_input_tokens_seen": 81785168, "step": 67415 }, { "epoch": 7.508631250696069, "grad_norm": 0.14007365703582764, "learning_rate": 8.873425520964728e-06, "loss": 0.4648, "num_input_tokens_seen": 81791088, "step": 67420 }, { "epoch": 7.509188105579686, "grad_norm": 0.1353149712085724, "learning_rate": 8.86971255743873e-06, "loss": 0.4573, "num_input_tokens_seen": 81797264, "step": 67425 }, { "epoch": 7.509744960463303, "grad_norm": 0.07472450286149979, "learning_rate": 8.866000203366693e-06, "loss": 0.4557, "num_input_tokens_seen": 81803120, "step": 67430 }, { "epoch": 7.510301815346921, "grad_norm": 0.11850900948047638, "learning_rate": 8.862288458888863e-06, "loss": 0.4808, "num_input_tokens_seen": 81808816, "step": 67435 }, { "epoch": 7.5108586702305375, "grad_norm": 0.07940956205129623, "learning_rate": 8.858577324145498e-06, "loss": 0.4699, "num_input_tokens_seen": 81814480, "step": 67440 }, { "epoch": 7.511415525114155, "grad_norm": 0.14170117676258087, "learning_rate": 8.8548667992768e-06, "loss": 0.4594, "num_input_tokens_seen": 81820848, "step": 67445 }, { "epoch": 7.511972379997773, "grad_norm": 0.10046631842851639, "learning_rate": 8.851156884422986e-06, "loss": 0.4621, "num_input_tokens_seen": 81827056, "step": 67450 }, { "epoch": 7.51252923488139, "grad_norm": 0.12889464199543, "learning_rate": 8.847447579724206e-06, "loss": 0.4621, "num_input_tokens_seen": 81833200, "step": 67455 }, { "epoch": 7.513086089765007, "grad_norm": 0.1087886169552803, "learning_rate": 8.84373888532062e-06, "loss": 0.4614, "num_input_tokens_seen": 81839376, "step": 67460 }, { "epoch": 7.513642944648625, "grad_norm": 0.11234750598669052, "learning_rate": 8.840030801352364e-06, "loss": 0.4593, "num_input_tokens_seen": 81845584, "step": 67465 }, { "epoch": 7.514199799532242, "grad_norm": 0.14236566424369812, "learning_rate": 8.836323327959519e-06, "loss": 0.4625, "num_input_tokens_seen": 81851664, "step": 67470 }, { "epoch": 7.5147566544158595, "grad_norm": 0.12943652272224426, "learning_rate": 8.832616465282189e-06, "loss": 0.4572, "num_input_tokens_seen": 81857840, "step": 67475 }, { "epoch": 7.515313509299476, "grad_norm": 0.08182284235954285, "learning_rate": 8.828910213460412e-06, "loss": 0.4579, "num_input_tokens_seen": 81863856, "step": 67480 }, { "epoch": 7.515870364183094, "grad_norm": 0.10875009000301361, "learning_rate": 8.82520457263423e-06, "loss": 0.47, "num_input_tokens_seen": 81869968, "step": 67485 }, { "epoch": 7.516427219066712, "grad_norm": 0.16200712323188782, "learning_rate": 8.821499542943661e-06, "loss": 0.4618, "num_input_tokens_seen": 81876112, "step": 67490 }, { "epoch": 7.516984073950328, "grad_norm": 0.09286055713891983, "learning_rate": 8.81779512452868e-06, "loss": 0.4612, "num_input_tokens_seen": 81882064, "step": 67495 }, { "epoch": 7.517540928833946, "grad_norm": 0.10297518223524094, "learning_rate": 8.814091317529263e-06, "loss": 0.4523, "num_input_tokens_seen": 81888368, "step": 67500 }, { "epoch": 7.518097783717563, "grad_norm": 0.09439542144536972, "learning_rate": 8.810388122085339e-06, "loss": 0.4608, "num_input_tokens_seen": 81894416, "step": 67505 }, { "epoch": 7.5186546386011806, "grad_norm": 0.09575016796588898, "learning_rate": 8.806685538336842e-06, "loss": 0.4661, "num_input_tokens_seen": 81900528, "step": 67510 }, { "epoch": 7.519211493484798, "grad_norm": 0.1302352100610733, "learning_rate": 8.80298356642365e-06, "loss": 0.4624, "num_input_tokens_seen": 81906544, "step": 67515 }, { "epoch": 7.519768348368415, "grad_norm": 0.10637605935335159, "learning_rate": 8.799282206485648e-06, "loss": 0.4648, "num_input_tokens_seen": 81912976, "step": 67520 }, { "epoch": 7.520325203252033, "grad_norm": 0.15905027091503143, "learning_rate": 8.795581458662688e-06, "loss": 0.4476, "num_input_tokens_seen": 81919152, "step": 67525 }, { "epoch": 7.5208820581356495, "grad_norm": 0.08646102249622345, "learning_rate": 8.791881323094583e-06, "loss": 0.4536, "num_input_tokens_seen": 81925264, "step": 67530 }, { "epoch": 7.521438913019267, "grad_norm": 0.08164593577384949, "learning_rate": 8.788181799921149e-06, "loss": 0.4615, "num_input_tokens_seen": 81931568, "step": 67535 }, { "epoch": 7.521995767902885, "grad_norm": 0.18080586194992065, "learning_rate": 8.784482889282152e-06, "loss": 0.4615, "num_input_tokens_seen": 81937776, "step": 67540 }, { "epoch": 7.522552622786502, "grad_norm": 0.12267722934484482, "learning_rate": 8.780784591317367e-06, "loss": 0.4523, "num_input_tokens_seen": 81943856, "step": 67545 }, { "epoch": 7.523109477670119, "grad_norm": 0.12970733642578125, "learning_rate": 8.777086906166509e-06, "loss": 0.4694, "num_input_tokens_seen": 81949968, "step": 67550 }, { "epoch": 7.523666332553736, "grad_norm": 0.1446777731180191, "learning_rate": 8.773389833969303e-06, "loss": 0.4668, "num_input_tokens_seen": 81956208, "step": 67555 }, { "epoch": 7.524223187437354, "grad_norm": 0.09536843001842499, "learning_rate": 8.769693374865431e-06, "loss": 0.4491, "num_input_tokens_seen": 81962576, "step": 67560 }, { "epoch": 7.524780042320971, "grad_norm": 0.1434255987405777, "learning_rate": 8.76599752899455e-06, "loss": 0.4631, "num_input_tokens_seen": 81968624, "step": 67565 }, { "epoch": 7.525336897204588, "grad_norm": 0.09228634834289551, "learning_rate": 8.762302296496302e-06, "loss": 0.4601, "num_input_tokens_seen": 81974864, "step": 67570 }, { "epoch": 7.525893752088206, "grad_norm": 0.09074539691209793, "learning_rate": 8.75860767751032e-06, "loss": 0.4634, "num_input_tokens_seen": 81981072, "step": 67575 }, { "epoch": 7.526450606971823, "grad_norm": 0.11305245012044907, "learning_rate": 8.754913672176179e-06, "loss": 0.4624, "num_input_tokens_seen": 81986640, "step": 67580 }, { "epoch": 7.52700746185544, "grad_norm": 0.09799827635288239, "learning_rate": 8.751220280633469e-06, "loss": 0.4602, "num_input_tokens_seen": 81992976, "step": 67585 }, { "epoch": 7.527564316739058, "grad_norm": 0.07295376062393188, "learning_rate": 8.747527503021718e-06, "loss": 0.4525, "num_input_tokens_seen": 81998960, "step": 67590 }, { "epoch": 7.528121171622675, "grad_norm": 0.08979998528957367, "learning_rate": 8.743835339480471e-06, "loss": 0.457, "num_input_tokens_seen": 82005072, "step": 67595 }, { "epoch": 7.5286780265062925, "grad_norm": 0.1371123045682907, "learning_rate": 8.740143790149214e-06, "loss": 0.4637, "num_input_tokens_seen": 82011472, "step": 67600 }, { "epoch": 7.52923488138991, "grad_norm": 0.16041985154151917, "learning_rate": 8.736452855167429e-06, "loss": 0.4603, "num_input_tokens_seen": 82016848, "step": 67605 }, { "epoch": 7.529791736273527, "grad_norm": 0.1537213772535324, "learning_rate": 8.732762534674586e-06, "loss": 0.4529, "num_input_tokens_seen": 82022320, "step": 67610 }, { "epoch": 7.530348591157145, "grad_norm": 0.09901490062475204, "learning_rate": 8.729072828810091e-06, "loss": 0.4629, "num_input_tokens_seen": 82028560, "step": 67615 }, { "epoch": 7.530905446040761, "grad_norm": 0.1426040679216385, "learning_rate": 8.725383737713382e-06, "loss": 0.4654, "num_input_tokens_seen": 82034032, "step": 67620 }, { "epoch": 7.531462300924379, "grad_norm": 0.12738776206970215, "learning_rate": 8.721695261523819e-06, "loss": 0.4673, "num_input_tokens_seen": 82040208, "step": 67625 }, { "epoch": 7.532019155807997, "grad_norm": 0.09403318911790848, "learning_rate": 8.718007400380784e-06, "loss": 0.4515, "num_input_tokens_seen": 82045968, "step": 67630 }, { "epoch": 7.5325760106916135, "grad_norm": 0.12224336713552475, "learning_rate": 8.714320154423597e-06, "loss": 0.4653, "num_input_tokens_seen": 82051984, "step": 67635 }, { "epoch": 7.533132865575231, "grad_norm": 0.14986836910247803, "learning_rate": 8.71063352379159e-06, "loss": 0.4572, "num_input_tokens_seen": 82058064, "step": 67640 }, { "epoch": 7.533689720458849, "grad_norm": 0.11192677170038223, "learning_rate": 8.706947508624052e-06, "loss": 0.4599, "num_input_tokens_seen": 82064240, "step": 67645 }, { "epoch": 7.534246575342466, "grad_norm": 0.0982896164059639, "learning_rate": 8.703262109060248e-06, "loss": 0.4658, "num_input_tokens_seen": 82070096, "step": 67650 }, { "epoch": 7.534803430226083, "grad_norm": 0.11650308221578598, "learning_rate": 8.699577325239433e-06, "loss": 0.4756, "num_input_tokens_seen": 82076336, "step": 67655 }, { "epoch": 7.5353602851097, "grad_norm": 0.10907381027936935, "learning_rate": 8.695893157300813e-06, "loss": 0.471, "num_input_tokens_seen": 82082608, "step": 67660 }, { "epoch": 7.535917139993318, "grad_norm": 0.08103310316801071, "learning_rate": 8.692209605383603e-06, "loss": 0.4565, "num_input_tokens_seen": 82088208, "step": 67665 }, { "epoch": 7.5364739948769355, "grad_norm": 0.17117899656295776, "learning_rate": 8.688526669626981e-06, "loss": 0.4741, "num_input_tokens_seen": 82094064, "step": 67670 }, { "epoch": 7.537030849760552, "grad_norm": 0.11155392229557037, "learning_rate": 8.684844350170087e-06, "loss": 0.4581, "num_input_tokens_seen": 82099952, "step": 67675 }, { "epoch": 7.53758770464417, "grad_norm": 0.07616396993398666, "learning_rate": 8.681162647152066e-06, "loss": 0.4561, "num_input_tokens_seen": 82106160, "step": 67680 }, { "epoch": 7.538144559527787, "grad_norm": 0.09660963714122772, "learning_rate": 8.677481560712005e-06, "loss": 0.4655, "num_input_tokens_seen": 82112144, "step": 67685 }, { "epoch": 7.538701414411404, "grad_norm": 0.10140891373157501, "learning_rate": 8.673801090989012e-06, "loss": 0.4644, "num_input_tokens_seen": 82117616, "step": 67690 }, { "epoch": 7.539258269295022, "grad_norm": 0.11755749583244324, "learning_rate": 8.670121238122123e-06, "loss": 0.4651, "num_input_tokens_seen": 82124016, "step": 67695 }, { "epoch": 7.539815124178639, "grad_norm": 0.129279226064682, "learning_rate": 8.666442002250386e-06, "loss": 0.4581, "num_input_tokens_seen": 82130192, "step": 67700 }, { "epoch": 7.5403719790622565, "grad_norm": 0.16888339817523956, "learning_rate": 8.66276338351282e-06, "loss": 0.4641, "num_input_tokens_seen": 82136208, "step": 67705 }, { "epoch": 7.540928833945873, "grad_norm": 0.09442972391843796, "learning_rate": 8.6590853820484e-06, "loss": 0.4545, "num_input_tokens_seen": 82142384, "step": 67710 }, { "epoch": 7.541485688829491, "grad_norm": 0.10349363833665848, "learning_rate": 8.655407997996112e-06, "loss": 0.4585, "num_input_tokens_seen": 82148176, "step": 67715 }, { "epoch": 7.542042543713109, "grad_norm": 0.09636832773685455, "learning_rate": 8.65173123149488e-06, "loss": 0.4607, "num_input_tokens_seen": 82154544, "step": 67720 }, { "epoch": 7.5425993985967255, "grad_norm": 0.10976272076368332, "learning_rate": 8.64805508268364e-06, "loss": 0.4578, "num_input_tokens_seen": 82160656, "step": 67725 }, { "epoch": 7.543156253480343, "grad_norm": 0.10784715414047241, "learning_rate": 8.644379551701274e-06, "loss": 0.4675, "num_input_tokens_seen": 82166640, "step": 67730 }, { "epoch": 7.54371310836396, "grad_norm": 0.09355685114860535, "learning_rate": 8.640704638686664e-06, "loss": 0.4524, "num_input_tokens_seen": 82172880, "step": 67735 }, { "epoch": 7.544269963247578, "grad_norm": 0.09282896667718887, "learning_rate": 8.637030343778665e-06, "loss": 0.456, "num_input_tokens_seen": 82179088, "step": 67740 }, { "epoch": 7.544826818131195, "grad_norm": 0.09599652141332626, "learning_rate": 8.633356667116087e-06, "loss": 0.4475, "num_input_tokens_seen": 82185296, "step": 67745 }, { "epoch": 7.545383673014812, "grad_norm": 0.09045808017253876, "learning_rate": 8.629683608837755e-06, "loss": 0.464, "num_input_tokens_seen": 82191504, "step": 67750 }, { "epoch": 7.54594052789843, "grad_norm": 0.09246771037578583, "learning_rate": 8.626011169082428e-06, "loss": 0.4528, "num_input_tokens_seen": 82197584, "step": 67755 }, { "epoch": 7.5464973827820465, "grad_norm": 0.08958136290311813, "learning_rate": 8.622339347988872e-06, "loss": 0.4602, "num_input_tokens_seen": 82203824, "step": 67760 }, { "epoch": 7.547054237665664, "grad_norm": 0.09588605910539627, "learning_rate": 8.61866814569583e-06, "loss": 0.4615, "num_input_tokens_seen": 82209808, "step": 67765 }, { "epoch": 7.547611092549282, "grad_norm": 0.0796341523528099, "learning_rate": 8.614997562341992e-06, "loss": 0.4589, "num_input_tokens_seen": 82216144, "step": 67770 }, { "epoch": 7.548167947432899, "grad_norm": 0.09980171918869019, "learning_rate": 8.611327598066066e-06, "loss": 0.4639, "num_input_tokens_seen": 82222064, "step": 67775 }, { "epoch": 7.548724802316516, "grad_norm": 0.136078342795372, "learning_rate": 8.607658253006689e-06, "loss": 0.4512, "num_input_tokens_seen": 82228208, "step": 67780 }, { "epoch": 7.549281657200134, "grad_norm": 0.09491469711065292, "learning_rate": 8.603989527302527e-06, "loss": 0.4611, "num_input_tokens_seen": 82234256, "step": 67785 }, { "epoch": 7.549838512083751, "grad_norm": 0.08601208031177521, "learning_rate": 8.600321421092178e-06, "loss": 0.4611, "num_input_tokens_seen": 82240240, "step": 67790 }, { "epoch": 7.5503953669673685, "grad_norm": 0.12289892882108688, "learning_rate": 8.596653934514237e-06, "loss": 0.4715, "num_input_tokens_seen": 82246256, "step": 67795 }, { "epoch": 7.550952221850985, "grad_norm": 0.1062423512339592, "learning_rate": 8.592987067707289e-06, "loss": 0.4596, "num_input_tokens_seen": 82252560, "step": 67800 }, { "epoch": 7.551509076734603, "grad_norm": 0.10261182487010956, "learning_rate": 8.58932082080986e-06, "loss": 0.4739, "num_input_tokens_seen": 82258832, "step": 67805 }, { "epoch": 7.552065931618221, "grad_norm": 0.09355198591947556, "learning_rate": 8.585655193960487e-06, "loss": 0.4525, "num_input_tokens_seen": 82264784, "step": 67810 }, { "epoch": 7.552622786501837, "grad_norm": 0.08699505031108856, "learning_rate": 8.581990187297656e-06, "loss": 0.4592, "num_input_tokens_seen": 82270224, "step": 67815 }, { "epoch": 7.553179641385455, "grad_norm": 0.12297182530164719, "learning_rate": 8.578325800959858e-06, "loss": 0.4663, "num_input_tokens_seen": 82276592, "step": 67820 }, { "epoch": 7.553736496269073, "grad_norm": 0.07490364462137222, "learning_rate": 8.574662035085528e-06, "loss": 0.4594, "num_input_tokens_seen": 82282800, "step": 67825 }, { "epoch": 7.5542933511526895, "grad_norm": 0.11767479032278061, "learning_rate": 8.570998889813106e-06, "loss": 0.4633, "num_input_tokens_seen": 82288176, "step": 67830 }, { "epoch": 7.554850206036307, "grad_norm": 0.11226364970207214, "learning_rate": 8.567336365281e-06, "loss": 0.4609, "num_input_tokens_seen": 82294320, "step": 67835 }, { "epoch": 7.555407060919924, "grad_norm": 0.10757667571306229, "learning_rate": 8.563674461627582e-06, "loss": 0.4573, "num_input_tokens_seen": 82300688, "step": 67840 }, { "epoch": 7.555963915803542, "grad_norm": 0.08555959165096283, "learning_rate": 8.560013178991225e-06, "loss": 0.4724, "num_input_tokens_seen": 82306992, "step": 67845 }, { "epoch": 7.556520770687159, "grad_norm": 0.09340614080429077, "learning_rate": 8.556352517510245e-06, "loss": 0.4676, "num_input_tokens_seen": 82313232, "step": 67850 }, { "epoch": 7.557077625570776, "grad_norm": 0.1267853081226349, "learning_rate": 8.552692477322968e-06, "loss": 0.4563, "num_input_tokens_seen": 82319216, "step": 67855 }, { "epoch": 7.557634480454394, "grad_norm": 0.13756917417049408, "learning_rate": 8.549033058567674e-06, "loss": 0.4482, "num_input_tokens_seen": 82325168, "step": 67860 }, { "epoch": 7.558191335338011, "grad_norm": 0.12956731021404266, "learning_rate": 8.545374261382628e-06, "loss": 0.472, "num_input_tokens_seen": 82331280, "step": 67865 }, { "epoch": 7.558748190221628, "grad_norm": 0.09492048621177673, "learning_rate": 8.541716085906082e-06, "loss": 0.4521, "num_input_tokens_seen": 82337232, "step": 67870 }, { "epoch": 7.559305045105246, "grad_norm": 0.09657656401395798, "learning_rate": 8.538058532276238e-06, "loss": 0.4631, "num_input_tokens_seen": 82342928, "step": 67875 }, { "epoch": 7.559861899988863, "grad_norm": 0.11388324946165085, "learning_rate": 8.534401600631305e-06, "loss": 0.4688, "num_input_tokens_seen": 82349040, "step": 67880 }, { "epoch": 7.56041875487248, "grad_norm": 0.0992613136768341, "learning_rate": 8.530745291109438e-06, "loss": 0.4628, "num_input_tokens_seen": 82355120, "step": 67885 }, { "epoch": 7.560975609756097, "grad_norm": 0.1269039511680603, "learning_rate": 8.527089603848792e-06, "loss": 0.4607, "num_input_tokens_seen": 82361360, "step": 67890 }, { "epoch": 7.561532464639715, "grad_norm": 0.09576410800218582, "learning_rate": 8.523434538987502e-06, "loss": 0.4678, "num_input_tokens_seen": 82367472, "step": 67895 }, { "epoch": 7.5620893195233325, "grad_norm": 0.12597554922103882, "learning_rate": 8.519780096663645e-06, "loss": 0.4708, "num_input_tokens_seen": 82373552, "step": 67900 }, { "epoch": 7.562646174406949, "grad_norm": 0.10159870237112045, "learning_rate": 8.516126277015318e-06, "loss": 0.4617, "num_input_tokens_seen": 82379472, "step": 67905 }, { "epoch": 7.563203029290567, "grad_norm": 0.13716207444667816, "learning_rate": 8.512473080180559e-06, "loss": 0.4588, "num_input_tokens_seen": 82385456, "step": 67910 }, { "epoch": 7.563759884174184, "grad_norm": 0.08700283616781235, "learning_rate": 8.50882050629741e-06, "loss": 0.4558, "num_input_tokens_seen": 82391344, "step": 67915 }, { "epoch": 7.564316739057801, "grad_norm": 0.09231340885162354, "learning_rate": 8.505168555503865e-06, "loss": 0.4662, "num_input_tokens_seen": 82397296, "step": 67920 }, { "epoch": 7.564873593941419, "grad_norm": 0.11599259078502655, "learning_rate": 8.501517227937911e-06, "loss": 0.459, "num_input_tokens_seen": 82403536, "step": 67925 }, { "epoch": 7.565430448825036, "grad_norm": 0.09039038419723511, "learning_rate": 8.497866523737516e-06, "loss": 0.4588, "num_input_tokens_seen": 82409776, "step": 67930 }, { "epoch": 7.565987303708654, "grad_norm": 0.10122984647750854, "learning_rate": 8.494216443040601e-06, "loss": 0.4659, "num_input_tokens_seen": 82416048, "step": 67935 }, { "epoch": 7.56654415859227, "grad_norm": 0.1021764874458313, "learning_rate": 8.49056698598509e-06, "loss": 0.4608, "num_input_tokens_seen": 82422032, "step": 67940 }, { "epoch": 7.567101013475888, "grad_norm": 0.13693682849407196, "learning_rate": 8.486918152708859e-06, "loss": 0.4476, "num_input_tokens_seen": 82427696, "step": 67945 }, { "epoch": 7.567657868359506, "grad_norm": 0.09118933975696564, "learning_rate": 8.483269943349786e-06, "loss": 0.4651, "num_input_tokens_seen": 82433968, "step": 67950 }, { "epoch": 7.5682147232431225, "grad_norm": 0.13800886273384094, "learning_rate": 8.479622358045697e-06, "loss": 0.4631, "num_input_tokens_seen": 82439760, "step": 67955 }, { "epoch": 7.56877157812674, "grad_norm": 0.1421201229095459, "learning_rate": 8.475975396934418e-06, "loss": 0.4505, "num_input_tokens_seen": 82445936, "step": 67960 }, { "epoch": 7.569328433010358, "grad_norm": 0.10832349956035614, "learning_rate": 8.472329060153759e-06, "loss": 0.4502, "num_input_tokens_seen": 82452048, "step": 67965 }, { "epoch": 7.569885287893975, "grad_norm": 0.09279855340719223, "learning_rate": 8.468683347841455e-06, "loss": 0.4789, "num_input_tokens_seen": 82457488, "step": 67970 }, { "epoch": 7.570442142777592, "grad_norm": 0.13143983483314514, "learning_rate": 8.465038260135275e-06, "loss": 0.4659, "num_input_tokens_seen": 82463536, "step": 67975 }, { "epoch": 7.57099899766121, "grad_norm": 0.15320709347724915, "learning_rate": 8.461393797172942e-06, "loss": 0.4752, "num_input_tokens_seen": 82469328, "step": 67980 }, { "epoch": 7.571555852544827, "grad_norm": 0.08867112547159195, "learning_rate": 8.457749959092148e-06, "loss": 0.4554, "num_input_tokens_seen": 82475248, "step": 67985 }, { "epoch": 7.5721127074284444, "grad_norm": 0.1471032053232193, "learning_rate": 8.45410674603058e-06, "loss": 0.47, "num_input_tokens_seen": 82480816, "step": 67990 }, { "epoch": 7.572669562312061, "grad_norm": 0.11871301382780075, "learning_rate": 8.450464158125874e-06, "loss": 0.4536, "num_input_tokens_seen": 82487152, "step": 67995 }, { "epoch": 7.573226417195679, "grad_norm": 0.12094607949256897, "learning_rate": 8.446822195515674e-06, "loss": 0.4583, "num_input_tokens_seen": 82493328, "step": 68000 }, { "epoch": 7.573783272079297, "grad_norm": 0.09843065589666367, "learning_rate": 8.443180858337574e-06, "loss": 0.4698, "num_input_tokens_seen": 82499440, "step": 68005 }, { "epoch": 7.574340126962913, "grad_norm": 0.12349661439657211, "learning_rate": 8.439540146729158e-06, "loss": 0.4575, "num_input_tokens_seen": 82504976, "step": 68010 }, { "epoch": 7.574896981846531, "grad_norm": 0.11190783232450485, "learning_rate": 8.435900060827994e-06, "loss": 0.4604, "num_input_tokens_seen": 82511088, "step": 68015 }, { "epoch": 7.575453836730148, "grad_norm": 0.10304959118366241, "learning_rate": 8.432260600771599e-06, "loss": 0.4468, "num_input_tokens_seen": 82517584, "step": 68020 }, { "epoch": 7.5760106916137655, "grad_norm": 0.12159227579832077, "learning_rate": 8.428621766697503e-06, "loss": 0.4579, "num_input_tokens_seen": 82523632, "step": 68025 }, { "epoch": 7.576567546497383, "grad_norm": 0.10836373269557953, "learning_rate": 8.424983558743175e-06, "loss": 0.4541, "num_input_tokens_seen": 82529744, "step": 68030 }, { "epoch": 7.577124401381, "grad_norm": 0.1481645107269287, "learning_rate": 8.421345977046082e-06, "loss": 0.4521, "num_input_tokens_seen": 82535696, "step": 68035 }, { "epoch": 7.577681256264618, "grad_norm": 0.095330610871315, "learning_rate": 8.417709021743678e-06, "loss": 0.4627, "num_input_tokens_seen": 82541808, "step": 68040 }, { "epoch": 7.578238111148234, "grad_norm": 0.09323868900537491, "learning_rate": 8.414072692973358e-06, "loss": 0.4556, "num_input_tokens_seen": 82547824, "step": 68045 }, { "epoch": 7.578794966031852, "grad_norm": 0.12638553977012634, "learning_rate": 8.41043699087253e-06, "loss": 0.4524, "num_input_tokens_seen": 82554160, "step": 68050 }, { "epoch": 7.57935182091547, "grad_norm": 0.08390920609235764, "learning_rate": 8.406801915578551e-06, "loss": 0.4535, "num_input_tokens_seen": 82560112, "step": 68055 }, { "epoch": 7.579908675799087, "grad_norm": 0.13403300940990448, "learning_rate": 8.403167467228779e-06, "loss": 0.4626, "num_input_tokens_seen": 82566128, "step": 68060 }, { "epoch": 7.580465530682704, "grad_norm": 0.2139350175857544, "learning_rate": 8.399533645960519e-06, "loss": 0.4634, "num_input_tokens_seen": 82572240, "step": 68065 }, { "epoch": 7.581022385566321, "grad_norm": 0.09734182059764862, "learning_rate": 8.395900451911077e-06, "loss": 0.4685, "num_input_tokens_seen": 82578480, "step": 68070 }, { "epoch": 7.581579240449939, "grad_norm": 0.09703461080789566, "learning_rate": 8.392267885217734e-06, "loss": 0.4624, "num_input_tokens_seen": 82584848, "step": 68075 }, { "epoch": 7.582136095333556, "grad_norm": 0.14220882952213287, "learning_rate": 8.388635946017723e-06, "loss": 0.4698, "num_input_tokens_seen": 82591472, "step": 68080 }, { "epoch": 7.582692950217173, "grad_norm": 0.10379882901906967, "learning_rate": 8.38500463444829e-06, "loss": 0.471, "num_input_tokens_seen": 82597456, "step": 68085 }, { "epoch": 7.583249805100791, "grad_norm": 0.0987028181552887, "learning_rate": 8.381373950646617e-06, "loss": 0.4635, "num_input_tokens_seen": 82603440, "step": 68090 }, { "epoch": 7.583806659984408, "grad_norm": 0.12872520089149475, "learning_rate": 8.377743894749903e-06, "loss": 0.464, "num_input_tokens_seen": 82609712, "step": 68095 }, { "epoch": 7.584363514868025, "grad_norm": 0.1367490589618683, "learning_rate": 8.374114466895284e-06, "loss": 0.4645, "num_input_tokens_seen": 82615344, "step": 68100 }, { "epoch": 7.584920369751643, "grad_norm": 0.09358318895101547, "learning_rate": 8.370485667219902e-06, "loss": 0.465, "num_input_tokens_seen": 82621648, "step": 68105 }, { "epoch": 7.58547722463526, "grad_norm": 0.10369350016117096, "learning_rate": 8.36685749586087e-06, "loss": 0.4544, "num_input_tokens_seen": 82627984, "step": 68110 }, { "epoch": 7.586034079518877, "grad_norm": 0.10453400015830994, "learning_rate": 8.36322995295526e-06, "loss": 0.469, "num_input_tokens_seen": 82633776, "step": 68115 }, { "epoch": 7.586590934402494, "grad_norm": 0.12082809954881668, "learning_rate": 8.359603038640142e-06, "loss": 0.4643, "num_input_tokens_seen": 82640016, "step": 68120 }, { "epoch": 7.587147789286112, "grad_norm": 0.09618651121854782, "learning_rate": 8.355976753052541e-06, "loss": 0.4499, "num_input_tokens_seen": 82645968, "step": 68125 }, { "epoch": 7.58770464416973, "grad_norm": 0.08808495849370956, "learning_rate": 8.352351096329485e-06, "loss": 0.4622, "num_input_tokens_seen": 82652144, "step": 68130 }, { "epoch": 7.588261499053346, "grad_norm": 0.08956816792488098, "learning_rate": 8.348726068607949e-06, "loss": 0.4594, "num_input_tokens_seen": 82658224, "step": 68135 }, { "epoch": 7.588818353936964, "grad_norm": 0.09875714778900146, "learning_rate": 8.3451016700249e-06, "loss": 0.4748, "num_input_tokens_seen": 82663792, "step": 68140 }, { "epoch": 7.589375208820582, "grad_norm": 0.2282959371805191, "learning_rate": 8.341477900717292e-06, "loss": 0.4739, "num_input_tokens_seen": 82670064, "step": 68145 }, { "epoch": 7.5899320637041985, "grad_norm": 0.10453619807958603, "learning_rate": 8.337854760822028e-06, "loss": 0.4574, "num_input_tokens_seen": 82676112, "step": 68150 }, { "epoch": 7.590488918587816, "grad_norm": 0.1258752942085266, "learning_rate": 8.334232250476013e-06, "loss": 0.4642, "num_input_tokens_seen": 82682000, "step": 68155 }, { "epoch": 7.591045773471434, "grad_norm": 0.09417109191417694, "learning_rate": 8.330610369816105e-06, "loss": 0.4692, "num_input_tokens_seen": 82688240, "step": 68160 }, { "epoch": 7.591602628355051, "grad_norm": 0.10849344730377197, "learning_rate": 8.326989118979155e-06, "loss": 0.464, "num_input_tokens_seen": 82694192, "step": 68165 }, { "epoch": 7.592159483238668, "grad_norm": 0.16166812181472778, "learning_rate": 8.323368498102e-06, "loss": 0.4555, "num_input_tokens_seen": 82700208, "step": 68170 }, { "epoch": 7.592716338122285, "grad_norm": 0.09121876955032349, "learning_rate": 8.319748507321413e-06, "loss": 0.4603, "num_input_tokens_seen": 82706480, "step": 68175 }, { "epoch": 7.593273193005903, "grad_norm": 0.09910252690315247, "learning_rate": 8.316129146774194e-06, "loss": 0.474, "num_input_tokens_seen": 82712336, "step": 68180 }, { "epoch": 7.59383004788952, "grad_norm": 0.07687408477067947, "learning_rate": 8.31251041659707e-06, "loss": 0.4556, "num_input_tokens_seen": 82718352, "step": 68185 }, { "epoch": 7.594386902773137, "grad_norm": 0.16830354928970337, "learning_rate": 8.308892316926789e-06, "loss": 0.4524, "num_input_tokens_seen": 82724208, "step": 68190 }, { "epoch": 7.594943757656755, "grad_norm": 0.11182069778442383, "learning_rate": 8.30527484790004e-06, "loss": 0.467, "num_input_tokens_seen": 82730512, "step": 68195 }, { "epoch": 7.595500612540372, "grad_norm": 0.08565250784158707, "learning_rate": 8.301658009653506e-06, "loss": 0.4591, "num_input_tokens_seen": 82736720, "step": 68200 }, { "epoch": 7.596057467423989, "grad_norm": 0.11390024423599243, "learning_rate": 8.298041802323853e-06, "loss": 0.4645, "num_input_tokens_seen": 82741744, "step": 68205 }, { "epoch": 7.596614322307607, "grad_norm": 0.09655600786209106, "learning_rate": 8.294426226047697e-06, "loss": 0.4702, "num_input_tokens_seen": 82747856, "step": 68210 }, { "epoch": 7.597171177191224, "grad_norm": 0.07219814509153366, "learning_rate": 8.290811280961664e-06, "loss": 0.4595, "num_input_tokens_seen": 82753744, "step": 68215 }, { "epoch": 7.5977280320748415, "grad_norm": 0.0890604704618454, "learning_rate": 8.287196967202318e-06, "loss": 0.4643, "num_input_tokens_seen": 82760016, "step": 68220 }, { "epoch": 7.598284886958458, "grad_norm": 0.09159302711486816, "learning_rate": 8.283583284906239e-06, "loss": 0.4649, "num_input_tokens_seen": 82765872, "step": 68225 }, { "epoch": 7.598841741842076, "grad_norm": 0.0957987830042839, "learning_rate": 8.279970234209941e-06, "loss": 0.468, "num_input_tokens_seen": 82771920, "step": 68230 }, { "epoch": 7.599398596725694, "grad_norm": 0.11877966672182083, "learning_rate": 8.276357815249955e-06, "loss": 0.4627, "num_input_tokens_seen": 82777712, "step": 68235 }, { "epoch": 7.59995545160931, "grad_norm": 0.09319209307432175, "learning_rate": 8.27274602816277e-06, "loss": 0.449, "num_input_tokens_seen": 82784016, "step": 68240 }, { "epoch": 7.600512306492928, "grad_norm": 0.14171618223190308, "learning_rate": 8.269134873084836e-06, "loss": 0.4528, "num_input_tokens_seen": 82790096, "step": 68245 }, { "epoch": 7.601069161376545, "grad_norm": 0.10920672863721848, "learning_rate": 8.265524350152612e-06, "loss": 0.464, "num_input_tokens_seen": 82796016, "step": 68250 }, { "epoch": 7.6016260162601625, "grad_norm": 0.10742572695016861, "learning_rate": 8.2619144595025e-06, "loss": 0.4642, "num_input_tokens_seen": 82802128, "step": 68255 }, { "epoch": 7.60218287114378, "grad_norm": 0.11558230966329575, "learning_rate": 8.258305201270899e-06, "loss": 0.4669, "num_input_tokens_seen": 82808432, "step": 68260 }, { "epoch": 7.602739726027397, "grad_norm": 0.1177578791975975, "learning_rate": 8.254696575594187e-06, "loss": 0.4497, "num_input_tokens_seen": 82814544, "step": 68265 }, { "epoch": 7.603296580911015, "grad_norm": 0.12659335136413574, "learning_rate": 8.25108858260869e-06, "loss": 0.4656, "num_input_tokens_seen": 82820560, "step": 68270 }, { "epoch": 7.6038534357946315, "grad_norm": 0.11499939113855362, "learning_rate": 8.247481222450754e-06, "loss": 0.463, "num_input_tokens_seen": 82826448, "step": 68275 }, { "epoch": 7.604410290678249, "grad_norm": 0.09055763483047485, "learning_rate": 8.24387449525665e-06, "loss": 0.4441, "num_input_tokens_seen": 82831760, "step": 68280 }, { "epoch": 7.604967145561867, "grad_norm": 0.08744758367538452, "learning_rate": 8.240268401162678e-06, "loss": 0.4682, "num_input_tokens_seen": 82837776, "step": 68285 }, { "epoch": 7.605524000445484, "grad_norm": 0.0917232409119606, "learning_rate": 8.236662940305064e-06, "loss": 0.4629, "num_input_tokens_seen": 82844144, "step": 68290 }, { "epoch": 7.606080855329101, "grad_norm": 0.1134430319070816, "learning_rate": 8.233058112820046e-06, "loss": 0.4599, "num_input_tokens_seen": 82849712, "step": 68295 }, { "epoch": 7.606637710212719, "grad_norm": 0.08161795139312744, "learning_rate": 8.229453918843836e-06, "loss": 0.4579, "num_input_tokens_seen": 82855792, "step": 68300 }, { "epoch": 7.607194565096336, "grad_norm": 0.10982609540224075, "learning_rate": 8.225850358512587e-06, "loss": 0.4571, "num_input_tokens_seen": 82861840, "step": 68305 }, { "epoch": 7.607751419979953, "grad_norm": 0.10218113660812378, "learning_rate": 8.222247431962479e-06, "loss": 0.4588, "num_input_tokens_seen": 82868112, "step": 68310 }, { "epoch": 7.60830827486357, "grad_norm": 0.13785290718078613, "learning_rate": 8.21864513932962e-06, "loss": 0.466, "num_input_tokens_seen": 82874384, "step": 68315 }, { "epoch": 7.608865129747188, "grad_norm": 0.08198092132806778, "learning_rate": 8.215043480750134e-06, "loss": 0.4598, "num_input_tokens_seen": 82880624, "step": 68320 }, { "epoch": 7.6094219846308055, "grad_norm": 0.13402293622493744, "learning_rate": 8.211442456360089e-06, "loss": 0.4588, "num_input_tokens_seen": 82886736, "step": 68325 }, { "epoch": 7.609978839514422, "grad_norm": 0.11285620182752609, "learning_rate": 8.207842066295549e-06, "loss": 0.4647, "num_input_tokens_seen": 82892784, "step": 68330 }, { "epoch": 7.61053569439804, "grad_norm": 0.09520767629146576, "learning_rate": 8.204242310692556e-06, "loss": 0.4611, "num_input_tokens_seen": 82898672, "step": 68335 }, { "epoch": 7.611092549281658, "grad_norm": 0.12568223476409912, "learning_rate": 8.200643189687104e-06, "loss": 0.4556, "num_input_tokens_seen": 82904784, "step": 68340 }, { "epoch": 7.6116494041652745, "grad_norm": 0.12562280893325806, "learning_rate": 8.197044703415199e-06, "loss": 0.4531, "num_input_tokens_seen": 82910960, "step": 68345 }, { "epoch": 7.612206259048892, "grad_norm": 0.0992889553308487, "learning_rate": 8.193446852012784e-06, "loss": 0.4539, "num_input_tokens_seen": 82916784, "step": 68350 }, { "epoch": 7.612763113932509, "grad_norm": 0.10334029793739319, "learning_rate": 8.189849635615815e-06, "loss": 0.4639, "num_input_tokens_seen": 82922896, "step": 68355 }, { "epoch": 7.613319968816127, "grad_norm": 0.1013694480061531, "learning_rate": 8.186253054360187e-06, "loss": 0.4525, "num_input_tokens_seen": 82929168, "step": 68360 }, { "epoch": 7.613876823699744, "grad_norm": 0.132298544049263, "learning_rate": 8.182657108381802e-06, "loss": 0.4693, "num_input_tokens_seen": 82935248, "step": 68365 }, { "epoch": 7.614433678583361, "grad_norm": 0.11062084883451462, "learning_rate": 8.179061797816534e-06, "loss": 0.4632, "num_input_tokens_seen": 82941360, "step": 68370 }, { "epoch": 7.614990533466979, "grad_norm": 0.11747022718191147, "learning_rate": 8.175467122800209e-06, "loss": 0.4555, "num_input_tokens_seen": 82947568, "step": 68375 }, { "epoch": 7.6155473883505955, "grad_norm": 0.12611141800880432, "learning_rate": 8.17187308346866e-06, "loss": 0.4684, "num_input_tokens_seen": 82953840, "step": 68380 }, { "epoch": 7.616104243234213, "grad_norm": 0.11464811116456985, "learning_rate": 8.168279679957674e-06, "loss": 0.4596, "num_input_tokens_seen": 82960240, "step": 68385 }, { "epoch": 7.616661098117831, "grad_norm": 0.08596988767385483, "learning_rate": 8.164686912403013e-06, "loss": 0.4532, "num_input_tokens_seen": 82966320, "step": 68390 }, { "epoch": 7.617217953001448, "grad_norm": 0.08424839377403259, "learning_rate": 8.161094780940443e-06, "loss": 0.4707, "num_input_tokens_seen": 82972336, "step": 68395 }, { "epoch": 7.617774807885065, "grad_norm": 0.08709360659122467, "learning_rate": 8.157503285705664e-06, "loss": 0.4615, "num_input_tokens_seen": 82978608, "step": 68400 }, { "epoch": 7.618331662768682, "grad_norm": 0.08541540056467056, "learning_rate": 8.153912426834396e-06, "loss": 0.459, "num_input_tokens_seen": 82984656, "step": 68405 }, { "epoch": 7.6188885176523, "grad_norm": 0.09053738415241241, "learning_rate": 8.150322204462294e-06, "loss": 0.4492, "num_input_tokens_seen": 82990544, "step": 68410 }, { "epoch": 7.6194453725359175, "grad_norm": 0.15571750700473785, "learning_rate": 8.146732618725013e-06, "loss": 0.4757, "num_input_tokens_seen": 82996848, "step": 68415 }, { "epoch": 7.620002227419534, "grad_norm": 0.09348160028457642, "learning_rate": 8.143143669758194e-06, "loss": 0.4558, "num_input_tokens_seen": 83002640, "step": 68420 }, { "epoch": 7.620559082303152, "grad_norm": 0.10897190868854523, "learning_rate": 8.139555357697417e-06, "loss": 0.4659, "num_input_tokens_seen": 83009008, "step": 68425 }, { "epoch": 7.621115937186769, "grad_norm": 0.1375303715467453, "learning_rate": 8.13596768267828e-06, "loss": 0.4631, "num_input_tokens_seen": 83014960, "step": 68430 }, { "epoch": 7.621672792070386, "grad_norm": 0.13238568603992462, "learning_rate": 8.13238064483632e-06, "loss": 0.464, "num_input_tokens_seen": 83021232, "step": 68435 }, { "epoch": 7.622229646954004, "grad_norm": 0.13746432960033417, "learning_rate": 8.128794244307075e-06, "loss": 0.4614, "num_input_tokens_seen": 83027120, "step": 68440 }, { "epoch": 7.622786501837621, "grad_norm": 0.09511778503656387, "learning_rate": 8.125208481226056e-06, "loss": 0.4522, "num_input_tokens_seen": 83032880, "step": 68445 }, { "epoch": 7.6233433567212385, "grad_norm": 0.09077522903680801, "learning_rate": 8.12162335572873e-06, "loss": 0.4604, "num_input_tokens_seen": 83038992, "step": 68450 }, { "epoch": 7.623900211604855, "grad_norm": 0.09232926368713379, "learning_rate": 8.118038867950573e-06, "loss": 0.4645, "num_input_tokens_seen": 83045296, "step": 68455 }, { "epoch": 7.624457066488473, "grad_norm": 0.09781133383512497, "learning_rate": 8.114455018027001e-06, "loss": 0.4555, "num_input_tokens_seen": 83051600, "step": 68460 }, { "epoch": 7.625013921372091, "grad_norm": 0.0894390121102333, "learning_rate": 8.110871806093437e-06, "loss": 0.4536, "num_input_tokens_seen": 83057840, "step": 68465 }, { "epoch": 7.6255707762557075, "grad_norm": 0.10254818201065063, "learning_rate": 8.107289232285253e-06, "loss": 0.4617, "num_input_tokens_seen": 83063600, "step": 68470 }, { "epoch": 7.626127631139325, "grad_norm": 0.11932298541069031, "learning_rate": 8.103707296737818e-06, "loss": 0.4585, "num_input_tokens_seen": 83069840, "step": 68475 }, { "epoch": 7.626684486022943, "grad_norm": 0.08749523758888245, "learning_rate": 8.100125999586472e-06, "loss": 0.4714, "num_input_tokens_seen": 83075696, "step": 68480 }, { "epoch": 7.62724134090656, "grad_norm": 0.13377664983272552, "learning_rate": 8.096545340966519e-06, "loss": 0.4407, "num_input_tokens_seen": 83081360, "step": 68485 }, { "epoch": 7.627798195790177, "grad_norm": 0.12132085859775543, "learning_rate": 8.09296532101326e-06, "loss": 0.4622, "num_input_tokens_seen": 83087472, "step": 68490 }, { "epoch": 7.628355050673794, "grad_norm": 0.07810838520526886, "learning_rate": 8.089385939861946e-06, "loss": 0.4605, "num_input_tokens_seen": 83093744, "step": 68495 }, { "epoch": 7.628911905557412, "grad_norm": 0.11396995931863785, "learning_rate": 8.085807197647832e-06, "loss": 0.4489, "num_input_tokens_seen": 83099920, "step": 68500 }, { "epoch": 7.629468760441029, "grad_norm": 0.08412541449069977, "learning_rate": 8.082229094506117e-06, "loss": 0.4643, "num_input_tokens_seen": 83105904, "step": 68505 }, { "epoch": 7.630025615324646, "grad_norm": 0.09763690084218979, "learning_rate": 8.078651630572e-06, "loss": 0.455, "num_input_tokens_seen": 83112240, "step": 68510 }, { "epoch": 7.630582470208264, "grad_norm": 0.09861315786838531, "learning_rate": 8.075074805980664e-06, "loss": 0.4397, "num_input_tokens_seen": 83118448, "step": 68515 }, { "epoch": 7.6311393250918815, "grad_norm": 0.10330396890640259, "learning_rate": 8.071498620867229e-06, "loss": 0.4588, "num_input_tokens_seen": 83124368, "step": 68520 }, { "epoch": 7.631696179975498, "grad_norm": 0.12875567376613617, "learning_rate": 8.067923075366838e-06, "loss": 0.475, "num_input_tokens_seen": 83129776, "step": 68525 }, { "epoch": 7.632253034859116, "grad_norm": 0.12982067465782166, "learning_rate": 8.064348169614563e-06, "loss": 0.4637, "num_input_tokens_seen": 83135920, "step": 68530 }, { "epoch": 7.632809889742733, "grad_norm": 0.08612529933452606, "learning_rate": 8.06077390374549e-06, "loss": 0.4601, "num_input_tokens_seen": 83141680, "step": 68535 }, { "epoch": 7.6333667446263505, "grad_norm": 0.09688276797533035, "learning_rate": 8.057200277894669e-06, "loss": 0.4651, "num_input_tokens_seen": 83148176, "step": 68540 }, { "epoch": 7.633923599509968, "grad_norm": 0.1117674708366394, "learning_rate": 8.053627292197111e-06, "loss": 0.4643, "num_input_tokens_seen": 83154640, "step": 68545 }, { "epoch": 7.634480454393585, "grad_norm": 0.10231132060289383, "learning_rate": 8.050054946787828e-06, "loss": 0.4613, "num_input_tokens_seen": 83160848, "step": 68550 }, { "epoch": 7.635037309277203, "grad_norm": 0.10525903105735779, "learning_rate": 8.04648324180178e-06, "loss": 0.4674, "num_input_tokens_seen": 83166928, "step": 68555 }, { "epoch": 7.635594164160819, "grad_norm": 0.11223505437374115, "learning_rate": 8.042912177373934e-06, "loss": 0.4592, "num_input_tokens_seen": 83172528, "step": 68560 }, { "epoch": 7.636151019044437, "grad_norm": 0.07927355170249939, "learning_rate": 8.039341753639198e-06, "loss": 0.4582, "num_input_tokens_seen": 83178256, "step": 68565 }, { "epoch": 7.636707873928055, "grad_norm": 0.10453476756811142, "learning_rate": 8.035771970732486e-06, "loss": 0.4594, "num_input_tokens_seen": 83184240, "step": 68570 }, { "epoch": 7.6372647288116715, "grad_norm": 0.11779282987117767, "learning_rate": 8.032202828788682e-06, "loss": 0.4691, "num_input_tokens_seen": 83190288, "step": 68575 }, { "epoch": 7.637821583695289, "grad_norm": 0.09903693199157715, "learning_rate": 8.028634327942622e-06, "loss": 0.4507, "num_input_tokens_seen": 83196496, "step": 68580 }, { "epoch": 7.638378438578906, "grad_norm": 0.0799255296587944, "learning_rate": 8.02506646832915e-06, "loss": 0.4522, "num_input_tokens_seen": 83202352, "step": 68585 }, { "epoch": 7.638935293462524, "grad_norm": 0.1084270253777504, "learning_rate": 8.021499250083062e-06, "loss": 0.472, "num_input_tokens_seen": 83208496, "step": 68590 }, { "epoch": 7.639492148346141, "grad_norm": 0.11323880404233932, "learning_rate": 8.017932673339145e-06, "loss": 0.4664, "num_input_tokens_seen": 83214544, "step": 68595 }, { "epoch": 7.640049003229758, "grad_norm": 0.10806509107351303, "learning_rate": 8.01436673823215e-06, "loss": 0.4508, "num_input_tokens_seen": 83220912, "step": 68600 }, { "epoch": 7.640605858113376, "grad_norm": 0.08936852961778641, "learning_rate": 8.010801444896812e-06, "loss": 0.4628, "num_input_tokens_seen": 83227216, "step": 68605 }, { "epoch": 7.641162712996993, "grad_norm": 0.11394347995519638, "learning_rate": 8.007236793467848e-06, "loss": 0.4624, "num_input_tokens_seen": 83233200, "step": 68610 }, { "epoch": 7.64171956788061, "grad_norm": 0.11980685591697693, "learning_rate": 8.003672784079925e-06, "loss": 0.4535, "num_input_tokens_seen": 83239376, "step": 68615 }, { "epoch": 7.642276422764228, "grad_norm": 0.07508515566587448, "learning_rate": 8.000109416867718e-06, "loss": 0.467, "num_input_tokens_seen": 83245680, "step": 68620 }, { "epoch": 7.642833277647845, "grad_norm": 0.1036565899848938, "learning_rate": 7.996546691965851e-06, "loss": 0.4576, "num_input_tokens_seen": 83251760, "step": 68625 }, { "epoch": 7.643390132531462, "grad_norm": 0.10226818919181824, "learning_rate": 7.992984609508946e-06, "loss": 0.4723, "num_input_tokens_seen": 83257840, "step": 68630 }, { "epoch": 7.643946987415079, "grad_norm": 0.10037139803171158, "learning_rate": 7.98942316963158e-06, "loss": 0.4548, "num_input_tokens_seen": 83263600, "step": 68635 }, { "epoch": 7.644503842298697, "grad_norm": 0.11886876821517944, "learning_rate": 7.985862372468317e-06, "loss": 0.4617, "num_input_tokens_seen": 83269968, "step": 68640 }, { "epoch": 7.6450606971823145, "grad_norm": 0.09193450957536697, "learning_rate": 7.982302218153707e-06, "loss": 0.4649, "num_input_tokens_seen": 83275472, "step": 68645 }, { "epoch": 7.645617552065931, "grad_norm": 0.08416967839002609, "learning_rate": 7.978742706822245e-06, "loss": 0.447, "num_input_tokens_seen": 83281424, "step": 68650 }, { "epoch": 7.646174406949549, "grad_norm": 0.09565631300210953, "learning_rate": 7.975183838608439e-06, "loss": 0.456, "num_input_tokens_seen": 83287536, "step": 68655 }, { "epoch": 7.646731261833167, "grad_norm": 0.0994948297739029, "learning_rate": 7.971625613646739e-06, "loss": 0.4653, "num_input_tokens_seen": 83293616, "step": 68660 }, { "epoch": 7.647288116716783, "grad_norm": 0.13752701878547668, "learning_rate": 7.968068032071593e-06, "loss": 0.4678, "num_input_tokens_seen": 83299376, "step": 68665 }, { "epoch": 7.647844971600401, "grad_norm": 0.10482841730117798, "learning_rate": 7.96451109401743e-06, "loss": 0.4682, "num_input_tokens_seen": 83305168, "step": 68670 }, { "epoch": 7.648401826484018, "grad_norm": 0.09952042996883392, "learning_rate": 7.960954799618618e-06, "loss": 0.4522, "num_input_tokens_seen": 83311440, "step": 68675 }, { "epoch": 7.648958681367636, "grad_norm": 0.08411355316638947, "learning_rate": 7.957399149009548e-06, "loss": 0.4608, "num_input_tokens_seen": 83317456, "step": 68680 }, { "epoch": 7.649515536251253, "grad_norm": 0.11673489212989807, "learning_rate": 7.953844142324546e-06, "loss": 0.4616, "num_input_tokens_seen": 83323728, "step": 68685 }, { "epoch": 7.65007239113487, "grad_norm": 0.09883306175470352, "learning_rate": 7.950289779697947e-06, "loss": 0.472, "num_input_tokens_seen": 83330032, "step": 68690 }, { "epoch": 7.650629246018488, "grad_norm": 0.09055711328983307, "learning_rate": 7.94673606126403e-06, "loss": 0.4677, "num_input_tokens_seen": 83336400, "step": 68695 }, { "epoch": 7.651186100902105, "grad_norm": 0.11638952791690826, "learning_rate": 7.943182987157075e-06, "loss": 0.4556, "num_input_tokens_seen": 83342544, "step": 68700 }, { "epoch": 7.651742955785722, "grad_norm": 0.13508982956409454, "learning_rate": 7.939630557511334e-06, "loss": 0.4567, "num_input_tokens_seen": 83348656, "step": 68705 }, { "epoch": 7.65229981066934, "grad_norm": 0.1191672533750534, "learning_rate": 7.93607877246102e-06, "loss": 0.4716, "num_input_tokens_seen": 83354032, "step": 68710 }, { "epoch": 7.652856665552957, "grad_norm": 0.07791031152009964, "learning_rate": 7.932527632140337e-06, "loss": 0.4433, "num_input_tokens_seen": 83360304, "step": 68715 }, { "epoch": 7.653413520436574, "grad_norm": 0.10277878493070602, "learning_rate": 7.928977136683452e-06, "loss": 0.4753, "num_input_tokens_seen": 83366640, "step": 68720 }, { "epoch": 7.653970375320192, "grad_norm": 0.11658003181219101, "learning_rate": 7.925427286224521e-06, "loss": 0.4488, "num_input_tokens_seen": 83372784, "step": 68725 }, { "epoch": 7.654527230203809, "grad_norm": 0.10127469897270203, "learning_rate": 7.92187808089766e-06, "loss": 0.4593, "num_input_tokens_seen": 83378768, "step": 68730 }, { "epoch": 7.655084085087426, "grad_norm": 0.11232659220695496, "learning_rate": 7.918329520836976e-06, "loss": 0.4687, "num_input_tokens_seen": 83384944, "step": 68735 }, { "epoch": 7.655640939971043, "grad_norm": 0.0926743596792221, "learning_rate": 7.91478160617655e-06, "loss": 0.4713, "num_input_tokens_seen": 83390800, "step": 68740 }, { "epoch": 7.656197794854661, "grad_norm": 0.1296982616186142, "learning_rate": 7.911234337050416e-06, "loss": 0.457, "num_input_tokens_seen": 83396720, "step": 68745 }, { "epoch": 7.656754649738279, "grad_norm": 0.11154751479625702, "learning_rate": 7.907687713592624e-06, "loss": 0.4548, "num_input_tokens_seen": 83402640, "step": 68750 }, { "epoch": 7.657311504621895, "grad_norm": 0.12233196198940277, "learning_rate": 7.904141735937156e-06, "loss": 0.4632, "num_input_tokens_seen": 83408656, "step": 68755 }, { "epoch": 7.657868359505513, "grad_norm": 0.09875163435935974, "learning_rate": 7.900596404218e-06, "loss": 0.4612, "num_input_tokens_seen": 83414192, "step": 68760 }, { "epoch": 7.65842521438913, "grad_norm": 0.08558665961027145, "learning_rate": 7.897051718569119e-06, "loss": 0.4551, "num_input_tokens_seen": 83420176, "step": 68765 }, { "epoch": 7.6589820692727475, "grad_norm": 0.13754574954509735, "learning_rate": 7.893507679124423e-06, "loss": 0.4642, "num_input_tokens_seen": 83426256, "step": 68770 }, { "epoch": 7.659538924156365, "grad_norm": 0.09852413833141327, "learning_rate": 7.889964286017836e-06, "loss": 0.4604, "num_input_tokens_seen": 83432432, "step": 68775 }, { "epoch": 7.660095779039982, "grad_norm": 0.1175759807229042, "learning_rate": 7.886421539383224e-06, "loss": 0.4571, "num_input_tokens_seen": 83438512, "step": 68780 }, { "epoch": 7.6606526339236, "grad_norm": 0.11526042968034744, "learning_rate": 7.882879439354454e-06, "loss": 0.4533, "num_input_tokens_seen": 83444688, "step": 68785 }, { "epoch": 7.661209488807216, "grad_norm": 0.08447612822055817, "learning_rate": 7.879337986065354e-06, "loss": 0.4562, "num_input_tokens_seen": 83450672, "step": 68790 }, { "epoch": 7.661766343690834, "grad_norm": 0.14768309891223907, "learning_rate": 7.875797179649723e-06, "loss": 0.4638, "num_input_tokens_seen": 83456880, "step": 68795 }, { "epoch": 7.662323198574452, "grad_norm": 0.11981386691331863, "learning_rate": 7.872257020241362e-06, "loss": 0.4628, "num_input_tokens_seen": 83463184, "step": 68800 }, { "epoch": 7.6628800534580686, "grad_norm": 0.12235541641712189, "learning_rate": 7.86871750797401e-06, "loss": 0.4579, "num_input_tokens_seen": 83468976, "step": 68805 }, { "epoch": 7.663436908341686, "grad_norm": 0.11160067468881607, "learning_rate": 7.865178642981411e-06, "loss": 0.4508, "num_input_tokens_seen": 83474736, "step": 68810 }, { "epoch": 7.663993763225303, "grad_norm": 0.11568211019039154, "learning_rate": 7.861640425397279e-06, "loss": 0.4695, "num_input_tokens_seen": 83480784, "step": 68815 }, { "epoch": 7.664550618108921, "grad_norm": 0.08776858448982239, "learning_rate": 7.85810285535529e-06, "loss": 0.4661, "num_input_tokens_seen": 83486960, "step": 68820 }, { "epoch": 7.665107472992538, "grad_norm": 0.0937381163239479, "learning_rate": 7.854565932989116e-06, "loss": 0.4523, "num_input_tokens_seen": 83493072, "step": 68825 }, { "epoch": 7.665664327876155, "grad_norm": 0.11900985240936279, "learning_rate": 7.851029658432377e-06, "loss": 0.4757, "num_input_tokens_seen": 83498416, "step": 68830 }, { "epoch": 7.666221182759773, "grad_norm": 0.11008628457784653, "learning_rate": 7.847494031818702e-06, "loss": 0.4611, "num_input_tokens_seen": 83504752, "step": 68835 }, { "epoch": 7.6667780376433905, "grad_norm": 0.11654278635978699, "learning_rate": 7.843959053281663e-06, "loss": 0.458, "num_input_tokens_seen": 83511024, "step": 68840 }, { "epoch": 7.667334892527007, "grad_norm": 0.0833439826965332, "learning_rate": 7.84042472295483e-06, "loss": 0.4742, "num_input_tokens_seen": 83517104, "step": 68845 }, { "epoch": 7.667891747410625, "grad_norm": 0.10562983900308609, "learning_rate": 7.836891040971753e-06, "loss": 0.4663, "num_input_tokens_seen": 83522960, "step": 68850 }, { "epoch": 7.668448602294242, "grad_norm": 0.1178942397236824, "learning_rate": 7.833358007465922e-06, "loss": 0.4601, "num_input_tokens_seen": 83529104, "step": 68855 }, { "epoch": 7.669005457177859, "grad_norm": 0.16326485574245453, "learning_rate": 7.82982562257085e-06, "loss": 0.4578, "num_input_tokens_seen": 83535280, "step": 68860 }, { "epoch": 7.669562312061477, "grad_norm": 0.11794301122426987, "learning_rate": 7.82629388641998e-06, "loss": 0.47, "num_input_tokens_seen": 83541232, "step": 68865 }, { "epoch": 7.670119166945094, "grad_norm": 0.13759441673755646, "learning_rate": 7.822762799146771e-06, "loss": 0.4734, "num_input_tokens_seen": 83547152, "step": 68870 }, { "epoch": 7.6706760218287116, "grad_norm": 0.09103401005268097, "learning_rate": 7.819232360884624e-06, "loss": 0.4569, "num_input_tokens_seen": 83553168, "step": 68875 }, { "epoch": 7.671232876712329, "grad_norm": 0.10064030438661575, "learning_rate": 7.815702571766934e-06, "loss": 0.4546, "num_input_tokens_seen": 83559312, "step": 68880 }, { "epoch": 7.671789731595946, "grad_norm": 0.08538320660591125, "learning_rate": 7.812173431927081e-06, "loss": 0.4492, "num_input_tokens_seen": 83565648, "step": 68885 }, { "epoch": 7.672346586479564, "grad_norm": 0.09931133687496185, "learning_rate": 7.808644941498389e-06, "loss": 0.4683, "num_input_tokens_seen": 83571472, "step": 68890 }, { "epoch": 7.6729034413631805, "grad_norm": 0.08299513161182404, "learning_rate": 7.80511710061419e-06, "loss": 0.4664, "num_input_tokens_seen": 83577584, "step": 68895 }, { "epoch": 7.673460296246798, "grad_norm": 0.09330549091100693, "learning_rate": 7.801589909407764e-06, "loss": 0.4485, "num_input_tokens_seen": 83583824, "step": 68900 }, { "epoch": 7.674017151130416, "grad_norm": 0.13733328878879547, "learning_rate": 7.798063368012393e-06, "loss": 0.4589, "num_input_tokens_seen": 83590032, "step": 68905 }, { "epoch": 7.674574006014033, "grad_norm": 0.11824782192707062, "learning_rate": 7.794537476561306e-06, "loss": 0.4547, "num_input_tokens_seen": 83596048, "step": 68910 }, { "epoch": 7.67513086089765, "grad_norm": 0.12496040016412735, "learning_rate": 7.791012235187734e-06, "loss": 0.4599, "num_input_tokens_seen": 83602000, "step": 68915 }, { "epoch": 7.675687715781267, "grad_norm": 0.08872952312231064, "learning_rate": 7.787487644024873e-06, "loss": 0.4674, "num_input_tokens_seen": 83608304, "step": 68920 }, { "epoch": 7.676244570664885, "grad_norm": 0.11200414597988129, "learning_rate": 7.783963703205883e-06, "loss": 0.4675, "num_input_tokens_seen": 83614480, "step": 68925 }, { "epoch": 7.676801425548502, "grad_norm": 0.11635523289442062, "learning_rate": 7.780440412863923e-06, "loss": 0.4701, "num_input_tokens_seen": 83621072, "step": 68930 }, { "epoch": 7.677358280432119, "grad_norm": 0.10288675129413605, "learning_rate": 7.7769177731321e-06, "loss": 0.4605, "num_input_tokens_seen": 83627408, "step": 68935 }, { "epoch": 7.677915135315737, "grad_norm": 0.10825467109680176, "learning_rate": 7.773395784143516e-06, "loss": 0.4685, "num_input_tokens_seen": 83633776, "step": 68940 }, { "epoch": 7.678471990199354, "grad_norm": 0.10156338661909103, "learning_rate": 7.769874446031254e-06, "loss": 0.4556, "num_input_tokens_seen": 83639600, "step": 68945 }, { "epoch": 7.679028845082971, "grad_norm": 0.15274357795715332, "learning_rate": 7.766353758928343e-06, "loss": 0.4543, "num_input_tokens_seen": 83645328, "step": 68950 }, { "epoch": 7.679585699966589, "grad_norm": 0.10173480957746506, "learning_rate": 7.762833722967825e-06, "loss": 0.4555, "num_input_tokens_seen": 83651632, "step": 68955 }, { "epoch": 7.680142554850206, "grad_norm": 0.11353942006826401, "learning_rate": 7.759314338282677e-06, "loss": 0.4575, "num_input_tokens_seen": 83657424, "step": 68960 }, { "epoch": 7.6806994097338235, "grad_norm": 0.11152877658605576, "learning_rate": 7.755795605005894e-06, "loss": 0.4418, "num_input_tokens_seen": 83663536, "step": 68965 }, { "epoch": 7.68125626461744, "grad_norm": 0.0939858928322792, "learning_rate": 7.752277523270408e-06, "loss": 0.4604, "num_input_tokens_seen": 83669712, "step": 68970 }, { "epoch": 7.681813119501058, "grad_norm": 0.10991241037845612, "learning_rate": 7.748760093209145e-06, "loss": 0.455, "num_input_tokens_seen": 83675952, "step": 68975 }, { "epoch": 7.682369974384676, "grad_norm": 0.10066048055887222, "learning_rate": 7.745243314955023e-06, "loss": 0.4527, "num_input_tokens_seen": 83682032, "step": 68980 }, { "epoch": 7.682926829268292, "grad_norm": 0.09017110615968704, "learning_rate": 7.741727188640893e-06, "loss": 0.4685, "num_input_tokens_seen": 83688080, "step": 68985 }, { "epoch": 7.68348368415191, "grad_norm": 0.07746493071317673, "learning_rate": 7.738211714399626e-06, "loss": 0.4604, "num_input_tokens_seen": 83693936, "step": 68990 }, { "epoch": 7.684040539035527, "grad_norm": 0.12651263177394867, "learning_rate": 7.734696892364033e-06, "loss": 0.4514, "num_input_tokens_seen": 83699920, "step": 68995 }, { "epoch": 7.6845973939191445, "grad_norm": 0.10245300829410553, "learning_rate": 7.731182722666927e-06, "loss": 0.4602, "num_input_tokens_seen": 83706064, "step": 69000 }, { "epoch": 7.685154248802762, "grad_norm": 0.09048685431480408, "learning_rate": 7.727669205441068e-06, "loss": 0.4623, "num_input_tokens_seen": 83712208, "step": 69005 }, { "epoch": 7.685711103686379, "grad_norm": 0.12034744769334793, "learning_rate": 7.724156340819222e-06, "loss": 0.4666, "num_input_tokens_seen": 83718512, "step": 69010 }, { "epoch": 7.686267958569997, "grad_norm": 0.18465165793895721, "learning_rate": 7.720644128934117e-06, "loss": 0.4709, "num_input_tokens_seen": 83724656, "step": 69015 }, { "epoch": 7.686824813453614, "grad_norm": 0.08071146160364151, "learning_rate": 7.717132569918445e-06, "loss": 0.4526, "num_input_tokens_seen": 83730768, "step": 69020 }, { "epoch": 7.687381668337231, "grad_norm": 0.12557482719421387, "learning_rate": 7.7136216639049e-06, "loss": 0.448, "num_input_tokens_seen": 83736720, "step": 69025 }, { "epoch": 7.687938523220849, "grad_norm": 0.1119031086564064, "learning_rate": 7.710111411026113e-06, "loss": 0.4516, "num_input_tokens_seen": 83742736, "step": 69030 }, { "epoch": 7.688495378104466, "grad_norm": 0.10695625841617584, "learning_rate": 7.706601811414728e-06, "loss": 0.4742, "num_input_tokens_seen": 83748592, "step": 69035 }, { "epoch": 7.689052232988083, "grad_norm": 0.11456184089183807, "learning_rate": 7.703092865203355e-06, "loss": 0.4627, "num_input_tokens_seen": 83754704, "step": 69040 }, { "epoch": 7.689609087871701, "grad_norm": 0.08144428580999374, "learning_rate": 7.699584572524554e-06, "loss": 0.4551, "num_input_tokens_seen": 83761168, "step": 69045 }, { "epoch": 7.690165942755318, "grad_norm": 0.08846194297075272, "learning_rate": 7.696076933510901e-06, "loss": 0.457, "num_input_tokens_seen": 83767632, "step": 69050 }, { "epoch": 7.690722797638935, "grad_norm": 0.09846778213977814, "learning_rate": 7.692569948294903e-06, "loss": 0.4455, "num_input_tokens_seen": 83773872, "step": 69055 }, { "epoch": 7.691279652522553, "grad_norm": 0.10335316509008408, "learning_rate": 7.689063617009088e-06, "loss": 0.4553, "num_input_tokens_seen": 83780144, "step": 69060 }, { "epoch": 7.69183650740617, "grad_norm": 0.101798877120018, "learning_rate": 7.685557939785915e-06, "loss": 0.4658, "num_input_tokens_seen": 83786160, "step": 69065 }, { "epoch": 7.6923933622897875, "grad_norm": 0.1257268190383911, "learning_rate": 7.682052916757848e-06, "loss": 0.4517, "num_input_tokens_seen": 83792144, "step": 69070 }, { "epoch": 7.692950217173404, "grad_norm": 0.10874645411968231, "learning_rate": 7.67854854805733e-06, "loss": 0.4533, "num_input_tokens_seen": 83798608, "step": 69075 }, { "epoch": 7.693507072057022, "grad_norm": 0.14015834033489227, "learning_rate": 7.67504483381675e-06, "loss": 0.4621, "num_input_tokens_seen": 83804784, "step": 69080 }, { "epoch": 7.69406392694064, "grad_norm": 0.08521327376365662, "learning_rate": 7.671541774168503e-06, "loss": 0.4653, "num_input_tokens_seen": 83810928, "step": 69085 }, { "epoch": 7.6946207818242565, "grad_norm": 0.14175626635551453, "learning_rate": 7.668039369244933e-06, "loss": 0.4544, "num_input_tokens_seen": 83817328, "step": 69090 }, { "epoch": 7.695177636707874, "grad_norm": 0.1173894926905632, "learning_rate": 7.664537619178386e-06, "loss": 0.4641, "num_input_tokens_seen": 83823920, "step": 69095 }, { "epoch": 7.695734491591491, "grad_norm": 0.09755043685436249, "learning_rate": 7.661036524101153e-06, "loss": 0.4631, "num_input_tokens_seen": 83830096, "step": 69100 }, { "epoch": 7.696291346475109, "grad_norm": 0.10623741894960403, "learning_rate": 7.657536084145528e-06, "loss": 0.4507, "num_input_tokens_seen": 83835696, "step": 69105 }, { "epoch": 7.696848201358726, "grad_norm": 0.09483882039785385, "learning_rate": 7.654036299443774e-06, "loss": 0.4614, "num_input_tokens_seen": 83841968, "step": 69110 }, { "epoch": 7.697405056242343, "grad_norm": 0.11917367577552795, "learning_rate": 7.650537170128106e-06, "loss": 0.4599, "num_input_tokens_seen": 83848176, "step": 69115 }, { "epoch": 7.697961911125961, "grad_norm": 0.11091802269220352, "learning_rate": 7.647038696330752e-06, "loss": 0.4548, "num_input_tokens_seen": 83854416, "step": 69120 }, { "epoch": 7.6985187660095775, "grad_norm": 0.10140027105808258, "learning_rate": 7.643540878183878e-06, "loss": 0.4693, "num_input_tokens_seen": 83860368, "step": 69125 }, { "epoch": 7.699075620893195, "grad_norm": 0.09285635501146317, "learning_rate": 7.640043715819662e-06, "loss": 0.462, "num_input_tokens_seen": 83866448, "step": 69130 }, { "epoch": 7.699632475776813, "grad_norm": 0.08593730628490448, "learning_rate": 7.636547209370214e-06, "loss": 0.455, "num_input_tokens_seen": 83872752, "step": 69135 }, { "epoch": 7.70018933066043, "grad_norm": 0.12684273719787598, "learning_rate": 7.633051358967661e-06, "loss": 0.4688, "num_input_tokens_seen": 83878640, "step": 69140 }, { "epoch": 7.700746185544047, "grad_norm": 0.08056364953517914, "learning_rate": 7.62955616474409e-06, "loss": 0.4589, "num_input_tokens_seen": 83884688, "step": 69145 }, { "epoch": 7.701303040427664, "grad_norm": 0.14352087676525116, "learning_rate": 7.626061626831543e-06, "loss": 0.4671, "num_input_tokens_seen": 83890864, "step": 69150 }, { "epoch": 7.701859895311282, "grad_norm": 0.13058963418006897, "learning_rate": 7.622567745362075e-06, "loss": 0.4559, "num_input_tokens_seen": 83896560, "step": 69155 }, { "epoch": 7.7024167501948995, "grad_norm": 0.09380808472633362, "learning_rate": 7.619074520467679e-06, "loss": 0.4567, "num_input_tokens_seen": 83902608, "step": 69160 }, { "epoch": 7.702973605078516, "grad_norm": 0.09041695296764374, "learning_rate": 7.615581952280346e-06, "loss": 0.4684, "num_input_tokens_seen": 83908880, "step": 69165 }, { "epoch": 7.703530459962134, "grad_norm": 0.143488347530365, "learning_rate": 7.612090040932046e-06, "loss": 0.4589, "num_input_tokens_seen": 83914768, "step": 69170 }, { "epoch": 7.704087314845751, "grad_norm": 0.12661752104759216, "learning_rate": 7.608598786554699e-06, "loss": 0.4603, "num_input_tokens_seen": 83920752, "step": 69175 }, { "epoch": 7.704644169729368, "grad_norm": 0.08906187117099762, "learning_rate": 7.60510818928023e-06, "loss": 0.4573, "num_input_tokens_seen": 83926832, "step": 69180 }, { "epoch": 7.705201024612986, "grad_norm": 0.10521198064088821, "learning_rate": 7.6016182492405106e-06, "loss": 0.4733, "num_input_tokens_seen": 83932720, "step": 69185 }, { "epoch": 7.705757879496603, "grad_norm": 0.08359815925359726, "learning_rate": 7.598128966567419e-06, "loss": 0.4607, "num_input_tokens_seen": 83938384, "step": 69190 }, { "epoch": 7.7063147343802205, "grad_norm": 0.10487395524978638, "learning_rate": 7.594640341392775e-06, "loss": 0.4648, "num_input_tokens_seen": 83944496, "step": 69195 }, { "epoch": 7.706871589263838, "grad_norm": 0.10170044004917145, "learning_rate": 7.591152373848404e-06, "loss": 0.4606, "num_input_tokens_seen": 83950832, "step": 69200 }, { "epoch": 7.707428444147455, "grad_norm": 0.11889654397964478, "learning_rate": 7.587665064066085e-06, "loss": 0.4607, "num_input_tokens_seen": 83957136, "step": 69205 }, { "epoch": 7.707985299031073, "grad_norm": 0.09419414401054382, "learning_rate": 7.584178412177573e-06, "loss": 0.4656, "num_input_tokens_seen": 83963216, "step": 69210 }, { "epoch": 7.70854215391469, "grad_norm": 0.11487182974815369, "learning_rate": 7.580692418314611e-06, "loss": 0.4674, "num_input_tokens_seen": 83968464, "step": 69215 }, { "epoch": 7.709099008798307, "grad_norm": 0.11715136468410492, "learning_rate": 7.577207082608923e-06, "loss": 0.4645, "num_input_tokens_seen": 83974928, "step": 69220 }, { "epoch": 7.709655863681925, "grad_norm": 0.12265272438526154, "learning_rate": 7.573722405192176e-06, "loss": 0.4504, "num_input_tokens_seen": 83981072, "step": 69225 }, { "epoch": 7.710212718565542, "grad_norm": 0.1341560333967209, "learning_rate": 7.570238386196049e-06, "loss": 0.4505, "num_input_tokens_seen": 83987120, "step": 69230 }, { "epoch": 7.710769573449159, "grad_norm": 0.07971352338790894, "learning_rate": 7.566755025752164e-06, "loss": 0.4599, "num_input_tokens_seen": 83992944, "step": 69235 }, { "epoch": 7.711326428332777, "grad_norm": 0.09196017682552338, "learning_rate": 7.5632723239921514e-06, "loss": 0.4544, "num_input_tokens_seen": 83999088, "step": 69240 }, { "epoch": 7.711883283216394, "grad_norm": 0.0988394170999527, "learning_rate": 7.559790281047583e-06, "loss": 0.4612, "num_input_tokens_seen": 84005200, "step": 69245 }, { "epoch": 7.712440138100011, "grad_norm": 0.07549092918634415, "learning_rate": 7.556308897050024e-06, "loss": 0.4482, "num_input_tokens_seen": 84011120, "step": 69250 }, { "epoch": 7.712996992983628, "grad_norm": 0.11587212979793549, "learning_rate": 7.552828172131024e-06, "loss": 0.4663, "num_input_tokens_seen": 84017392, "step": 69255 }, { "epoch": 7.713553847867246, "grad_norm": 0.1011536568403244, "learning_rate": 7.5493481064220804e-06, "loss": 0.4688, "num_input_tokens_seen": 84023120, "step": 69260 }, { "epoch": 7.7141107027508635, "grad_norm": 0.1469680368900299, "learning_rate": 7.5458687000546976e-06, "loss": 0.4783, "num_input_tokens_seen": 84029424, "step": 69265 }, { "epoch": 7.71466755763448, "grad_norm": 0.07574448734521866, "learning_rate": 7.5423899531603224e-06, "loss": 0.4547, "num_input_tokens_seen": 84034768, "step": 69270 }, { "epoch": 7.715224412518098, "grad_norm": 0.1064450815320015, "learning_rate": 7.538911865870407e-06, "loss": 0.4615, "num_input_tokens_seen": 84040784, "step": 69275 }, { "epoch": 7.715781267401715, "grad_norm": 0.10821224004030228, "learning_rate": 7.5354344383163495e-06, "loss": 0.4557, "num_input_tokens_seen": 84047312, "step": 69280 }, { "epoch": 7.7163381222853324, "grad_norm": 0.13021615147590637, "learning_rate": 7.53195767062955e-06, "loss": 0.4574, "num_input_tokens_seen": 84053488, "step": 69285 }, { "epoch": 7.71689497716895, "grad_norm": 0.09918440878391266, "learning_rate": 7.528481562941373e-06, "loss": 0.45, "num_input_tokens_seen": 84059696, "step": 69290 }, { "epoch": 7.717451832052567, "grad_norm": 0.13173694908618927, "learning_rate": 7.525006115383149e-06, "loss": 0.4406, "num_input_tokens_seen": 84066288, "step": 69295 }, { "epoch": 7.718008686936185, "grad_norm": 0.09863533079624176, "learning_rate": 7.521531328086198e-06, "loss": 0.4649, "num_input_tokens_seen": 84072400, "step": 69300 }, { "epoch": 7.718565541819801, "grad_norm": 0.1142587959766388, "learning_rate": 7.518057201181802e-06, "loss": 0.4498, "num_input_tokens_seen": 84078352, "step": 69305 }, { "epoch": 7.719122396703419, "grad_norm": 0.11472932249307632, "learning_rate": 7.514583734801237e-06, "loss": 0.4639, "num_input_tokens_seen": 84084496, "step": 69310 }, { "epoch": 7.719679251587037, "grad_norm": 0.0979672223329544, "learning_rate": 7.511110929075721e-06, "loss": 0.4692, "num_input_tokens_seen": 84090736, "step": 69315 }, { "epoch": 7.7202361064706535, "grad_norm": 0.09433023631572723, "learning_rate": 7.507638784136484e-06, "loss": 0.4554, "num_input_tokens_seen": 84096880, "step": 69320 }, { "epoch": 7.720792961354271, "grad_norm": 0.0907033160328865, "learning_rate": 7.5041673001147175e-06, "loss": 0.4706, "num_input_tokens_seen": 84103216, "step": 69325 }, { "epoch": 7.721349816237888, "grad_norm": 0.14880891144275665, "learning_rate": 7.5006964771415695e-06, "loss": 0.4801, "num_input_tokens_seen": 84109520, "step": 69330 }, { "epoch": 7.721906671121506, "grad_norm": 0.0959545373916626, "learning_rate": 7.4972263153481964e-06, "loss": 0.4545, "num_input_tokens_seen": 84115568, "step": 69335 }, { "epoch": 7.722463526005123, "grad_norm": 0.11766652762889862, "learning_rate": 7.493756814865696e-06, "loss": 0.4659, "num_input_tokens_seen": 84121904, "step": 69340 }, { "epoch": 7.72302038088874, "grad_norm": 0.10590308904647827, "learning_rate": 7.490287975825166e-06, "loss": 0.4475, "num_input_tokens_seen": 84127664, "step": 69345 }, { "epoch": 7.723577235772358, "grad_norm": 0.0914488434791565, "learning_rate": 7.486819798357675e-06, "loss": 0.4679, "num_input_tokens_seen": 84133904, "step": 69350 }, { "epoch": 7.724134090655975, "grad_norm": 0.09873275458812714, "learning_rate": 7.483352282594247e-06, "loss": 0.463, "num_input_tokens_seen": 84140176, "step": 69355 }, { "epoch": 7.724690945539592, "grad_norm": 0.08316285908222198, "learning_rate": 7.479885428665914e-06, "loss": 0.4637, "num_input_tokens_seen": 84146160, "step": 69360 }, { "epoch": 7.72524780042321, "grad_norm": 0.08400233834981918, "learning_rate": 7.476419236703646e-06, "loss": 0.4609, "num_input_tokens_seen": 84152304, "step": 69365 }, { "epoch": 7.725804655306827, "grad_norm": 0.09252931922674179, "learning_rate": 7.472953706838429e-06, "loss": 0.4655, "num_input_tokens_seen": 84158032, "step": 69370 }, { "epoch": 7.726361510190444, "grad_norm": 0.1072365790605545, "learning_rate": 7.469488839201177e-06, "loss": 0.4555, "num_input_tokens_seen": 84164432, "step": 69375 }, { "epoch": 7.726918365074062, "grad_norm": 0.09653240442276001, "learning_rate": 7.466024633922816e-06, "loss": 0.4609, "num_input_tokens_seen": 84170576, "step": 69380 }, { "epoch": 7.727475219957679, "grad_norm": 0.0674348697066307, "learning_rate": 7.462561091134243e-06, "loss": 0.4548, "num_input_tokens_seen": 84176528, "step": 69385 }, { "epoch": 7.7280320748412965, "grad_norm": 0.0950286015868187, "learning_rate": 7.459098210966306e-06, "loss": 0.4632, "num_input_tokens_seen": 84182864, "step": 69390 }, { "epoch": 7.728588929724914, "grad_norm": 0.08527956157922745, "learning_rate": 7.455635993549859e-06, "loss": 0.4561, "num_input_tokens_seen": 84189040, "step": 69395 }, { "epoch": 7.729145784608531, "grad_norm": 0.09700482338666916, "learning_rate": 7.4521744390156995e-06, "loss": 0.4742, "num_input_tokens_seen": 84195056, "step": 69400 }, { "epoch": 7.729702639492149, "grad_norm": 0.10173780471086502, "learning_rate": 7.448713547494635e-06, "loss": 0.4643, "num_input_tokens_seen": 84201008, "step": 69405 }, { "epoch": 7.730259494375765, "grad_norm": 0.13565732538700104, "learning_rate": 7.445253319117407e-06, "loss": 0.4595, "num_input_tokens_seen": 84207216, "step": 69410 }, { "epoch": 7.730816349259383, "grad_norm": 0.10584396123886108, "learning_rate": 7.441793754014767e-06, "loss": 0.47, "num_input_tokens_seen": 84213328, "step": 69415 }, { "epoch": 7.731373204143001, "grad_norm": 0.1341966688632965, "learning_rate": 7.438334852317436e-06, "loss": 0.4754, "num_input_tokens_seen": 84219440, "step": 69420 }, { "epoch": 7.731930059026618, "grad_norm": 0.13156160712242126, "learning_rate": 7.4348766141560834e-06, "loss": 0.4582, "num_input_tokens_seen": 84225552, "step": 69425 }, { "epoch": 7.732486913910235, "grad_norm": 0.09356974065303802, "learning_rate": 7.431419039661391e-06, "loss": 0.4672, "num_input_tokens_seen": 84231536, "step": 69430 }, { "epoch": 7.733043768793852, "grad_norm": 0.08375288546085358, "learning_rate": 7.427962128963981e-06, "loss": 0.454, "num_input_tokens_seen": 84237232, "step": 69435 }, { "epoch": 7.73360062367747, "grad_norm": 0.13282181322574615, "learning_rate": 7.424505882194474e-06, "loss": 0.454, "num_input_tokens_seen": 84243344, "step": 69440 }, { "epoch": 7.734157478561087, "grad_norm": 0.0987543910741806, "learning_rate": 7.421050299483467e-06, "loss": 0.4679, "num_input_tokens_seen": 84249328, "step": 69445 }, { "epoch": 7.734714333444704, "grad_norm": 0.12547463178634644, "learning_rate": 7.417595380961503e-06, "loss": 0.4591, "num_input_tokens_seen": 84255728, "step": 69450 }, { "epoch": 7.735271188328322, "grad_norm": 0.12085303664207458, "learning_rate": 7.414141126759144e-06, "loss": 0.464, "num_input_tokens_seen": 84261680, "step": 69455 }, { "epoch": 7.735828043211939, "grad_norm": 0.1250797063112259, "learning_rate": 7.4106875370068805e-06, "loss": 0.4637, "num_input_tokens_seen": 84267696, "step": 69460 }, { "epoch": 7.736384898095556, "grad_norm": 0.11841189861297607, "learning_rate": 7.407234611835215e-06, "loss": 0.4667, "num_input_tokens_seen": 84273552, "step": 69465 }, { "epoch": 7.736941752979174, "grad_norm": 0.09574401378631592, "learning_rate": 7.403782351374597e-06, "loss": 0.4574, "num_input_tokens_seen": 84279568, "step": 69470 }, { "epoch": 7.737498607862791, "grad_norm": 0.07758814096450806, "learning_rate": 7.400330755755475e-06, "loss": 0.4649, "num_input_tokens_seen": 84285936, "step": 69475 }, { "epoch": 7.738055462746408, "grad_norm": 0.10521886497735977, "learning_rate": 7.396879825108263e-06, "loss": 0.4649, "num_input_tokens_seen": 84292144, "step": 69480 }, { "epoch": 7.738612317630025, "grad_norm": 0.08449995517730713, "learning_rate": 7.393429559563337e-06, "loss": 0.453, "num_input_tokens_seen": 84298480, "step": 69485 }, { "epoch": 7.739169172513643, "grad_norm": 0.09397754818201065, "learning_rate": 7.38997995925107e-06, "loss": 0.4504, "num_input_tokens_seen": 84304816, "step": 69490 }, { "epoch": 7.739726027397261, "grad_norm": 0.11579165607690811, "learning_rate": 7.3865310243017904e-06, "loss": 0.458, "num_input_tokens_seen": 84310960, "step": 69495 }, { "epoch": 7.740282882280877, "grad_norm": 0.11421245336532593, "learning_rate": 7.383082754845819e-06, "loss": 0.4639, "num_input_tokens_seen": 84316912, "step": 69500 }, { "epoch": 7.740839737164495, "grad_norm": 0.0886024683713913, "learning_rate": 7.379635151013431e-06, "loss": 0.4688, "num_input_tokens_seen": 84322896, "step": 69505 }, { "epoch": 7.741396592048112, "grad_norm": 0.10277456045150757, "learning_rate": 7.376188212934892e-06, "loss": 0.433, "num_input_tokens_seen": 84329360, "step": 69510 }, { "epoch": 7.7419534469317295, "grad_norm": 0.09608414769172668, "learning_rate": 7.372741940740449e-06, "loss": 0.4561, "num_input_tokens_seen": 84335184, "step": 69515 }, { "epoch": 7.742510301815347, "grad_norm": 0.08538222312927246, "learning_rate": 7.369296334560299e-06, "loss": 0.4495, "num_input_tokens_seen": 84341360, "step": 69520 }, { "epoch": 7.743067156698964, "grad_norm": 0.12140652537345886, "learning_rate": 7.365851394524639e-06, "loss": 0.4479, "num_input_tokens_seen": 84347312, "step": 69525 }, { "epoch": 7.743624011582582, "grad_norm": 0.10540684312582016, "learning_rate": 7.36240712076362e-06, "loss": 0.4581, "num_input_tokens_seen": 84353200, "step": 69530 }, { "epoch": 7.744180866466198, "grad_norm": 0.12299259752035141, "learning_rate": 7.358963513407388e-06, "loss": 0.4637, "num_input_tokens_seen": 84359440, "step": 69535 }, { "epoch": 7.744737721349816, "grad_norm": 0.10337745398283005, "learning_rate": 7.3555205725860416e-06, "loss": 0.4552, "num_input_tokens_seen": 84365072, "step": 69540 }, { "epoch": 7.745294576233434, "grad_norm": 0.14059321582317352, "learning_rate": 7.352078298429671e-06, "loss": 0.449, "num_input_tokens_seen": 84371184, "step": 69545 }, { "epoch": 7.7458514311170505, "grad_norm": 0.10030677914619446, "learning_rate": 7.3486366910683476e-06, "loss": 0.4565, "num_input_tokens_seen": 84377296, "step": 69550 }, { "epoch": 7.746408286000668, "grad_norm": 0.10886475443840027, "learning_rate": 7.345195750632086e-06, "loss": 0.4476, "num_input_tokens_seen": 84383792, "step": 69555 }, { "epoch": 7.746965140884286, "grad_norm": 0.129648819565773, "learning_rate": 7.3417554772509165e-06, "loss": 0.4544, "num_input_tokens_seen": 84390064, "step": 69560 }, { "epoch": 7.747521995767903, "grad_norm": 0.15790016949176788, "learning_rate": 7.338315871054804e-06, "loss": 0.4601, "num_input_tokens_seen": 84396336, "step": 69565 }, { "epoch": 7.74807885065152, "grad_norm": 0.09793191403150558, "learning_rate": 7.33487693217372e-06, "loss": 0.4668, "num_input_tokens_seen": 84402384, "step": 69570 }, { "epoch": 7.748635705535138, "grad_norm": 0.09545358270406723, "learning_rate": 7.331438660737602e-06, "loss": 0.4586, "num_input_tokens_seen": 84408656, "step": 69575 }, { "epoch": 7.749192560418755, "grad_norm": 0.0881178081035614, "learning_rate": 7.328001056876347e-06, "loss": 0.4617, "num_input_tokens_seen": 84414896, "step": 69580 }, { "epoch": 7.7497494153023725, "grad_norm": 0.11242049187421799, "learning_rate": 7.324564120719851e-06, "loss": 0.4589, "num_input_tokens_seen": 84420944, "step": 69585 }, { "epoch": 7.750306270185989, "grad_norm": 0.094558946788311, "learning_rate": 7.3211278523979605e-06, "loss": 0.4568, "num_input_tokens_seen": 84427024, "step": 69590 }, { "epoch": 7.750863125069607, "grad_norm": 0.09873740375041962, "learning_rate": 7.31769225204052e-06, "loss": 0.4637, "num_input_tokens_seen": 84433264, "step": 69595 }, { "epoch": 7.751419979953225, "grad_norm": 0.1443057358264923, "learning_rate": 7.314257319777323e-06, "loss": 0.4715, "num_input_tokens_seen": 84439728, "step": 69600 }, { "epoch": 7.751976834836841, "grad_norm": 0.10216141492128372, "learning_rate": 7.3108230557381645e-06, "loss": 0.4638, "num_input_tokens_seen": 84445680, "step": 69605 }, { "epoch": 7.752533689720459, "grad_norm": 0.08658795058727264, "learning_rate": 7.307389460052813e-06, "loss": 0.4658, "num_input_tokens_seen": 84451792, "step": 69610 }, { "epoch": 7.753090544604076, "grad_norm": 0.09832663089036942, "learning_rate": 7.303956532850969e-06, "loss": 0.4581, "num_input_tokens_seen": 84458064, "step": 69615 }, { "epoch": 7.7536473994876935, "grad_norm": 0.09319101274013519, "learning_rate": 7.30052427426236e-06, "loss": 0.4614, "num_input_tokens_seen": 84464144, "step": 69620 }, { "epoch": 7.754204254371311, "grad_norm": 0.10135999321937561, "learning_rate": 7.29709268441667e-06, "loss": 0.461, "num_input_tokens_seen": 84470608, "step": 69625 }, { "epoch": 7.754761109254928, "grad_norm": 0.08629976958036423, "learning_rate": 7.293661763443546e-06, "loss": 0.4585, "num_input_tokens_seen": 84476944, "step": 69630 }, { "epoch": 7.755317964138546, "grad_norm": 0.09277728199958801, "learning_rate": 7.290231511472628e-06, "loss": 0.4636, "num_input_tokens_seen": 84483120, "step": 69635 }, { "epoch": 7.7558748190221625, "grad_norm": 0.09556525200605392, "learning_rate": 7.286801928633511e-06, "loss": 0.4606, "num_input_tokens_seen": 84489456, "step": 69640 }, { "epoch": 7.75643167390578, "grad_norm": 0.15796451270580292, "learning_rate": 7.283373015055789e-06, "loss": 0.4618, "num_input_tokens_seen": 84495632, "step": 69645 }, { "epoch": 7.756988528789398, "grad_norm": 0.10364113003015518, "learning_rate": 7.2799447708690015e-06, "loss": 0.4665, "num_input_tokens_seen": 84501584, "step": 69650 }, { "epoch": 7.757545383673015, "grad_norm": 0.13162006437778473, "learning_rate": 7.276517196202687e-06, "loss": 0.4592, "num_input_tokens_seen": 84507152, "step": 69655 }, { "epoch": 7.758102238556632, "grad_norm": 0.09724362939596176, "learning_rate": 7.27309029118636e-06, "loss": 0.4575, "num_input_tokens_seen": 84513392, "step": 69660 }, { "epoch": 7.758659093440249, "grad_norm": 0.0926128700375557, "learning_rate": 7.269664055949482e-06, "loss": 0.4545, "num_input_tokens_seen": 84518736, "step": 69665 }, { "epoch": 7.759215948323867, "grad_norm": 0.08931655436754227, "learning_rate": 7.2662384906215245e-06, "loss": 0.4526, "num_input_tokens_seen": 84525040, "step": 69670 }, { "epoch": 7.759772803207484, "grad_norm": 0.13249842822551727, "learning_rate": 7.262813595331897e-06, "loss": 0.4659, "num_input_tokens_seen": 84531632, "step": 69675 }, { "epoch": 7.760329658091101, "grad_norm": 0.12783072888851166, "learning_rate": 7.259389370210023e-06, "loss": 0.4516, "num_input_tokens_seen": 84536752, "step": 69680 }, { "epoch": 7.760886512974719, "grad_norm": 0.09016880393028259, "learning_rate": 7.255965815385263e-06, "loss": 0.4628, "num_input_tokens_seen": 84542800, "step": 69685 }, { "epoch": 7.761443367858336, "grad_norm": 0.10509435832500458, "learning_rate": 7.252542930986977e-06, "loss": 0.4589, "num_input_tokens_seen": 84548944, "step": 69690 }, { "epoch": 7.762000222741953, "grad_norm": 0.0834655836224556, "learning_rate": 7.249120717144503e-06, "loss": 0.4649, "num_input_tokens_seen": 84555088, "step": 69695 }, { "epoch": 7.762557077625571, "grad_norm": 0.11763759702444077, "learning_rate": 7.245699173987125e-06, "loss": 0.4664, "num_input_tokens_seen": 84561040, "step": 69700 }, { "epoch": 7.763113932509188, "grad_norm": 0.10825904458761215, "learning_rate": 7.242278301644137e-06, "loss": 0.4497, "num_input_tokens_seen": 84567088, "step": 69705 }, { "epoch": 7.7636707873928055, "grad_norm": 0.1011461466550827, "learning_rate": 7.238858100244775e-06, "loss": 0.4602, "num_input_tokens_seen": 84573040, "step": 69710 }, { "epoch": 7.764227642276423, "grad_norm": 0.10213962942361832, "learning_rate": 7.235438569918271e-06, "loss": 0.4634, "num_input_tokens_seen": 84579024, "step": 69715 }, { "epoch": 7.76478449716004, "grad_norm": 0.07588931918144226, "learning_rate": 7.232019710793836e-06, "loss": 0.4585, "num_input_tokens_seen": 84585136, "step": 69720 }, { "epoch": 7.765341352043658, "grad_norm": 0.11177965998649597, "learning_rate": 7.2286015230006304e-06, "loss": 0.456, "num_input_tokens_seen": 84591184, "step": 69725 }, { "epoch": 7.765898206927274, "grad_norm": 0.10855182260274887, "learning_rate": 7.225184006667815e-06, "loss": 0.4594, "num_input_tokens_seen": 84597200, "step": 69730 }, { "epoch": 7.766455061810892, "grad_norm": 0.10598008334636688, "learning_rate": 7.221767161924503e-06, "loss": 0.4597, "num_input_tokens_seen": 84603312, "step": 69735 }, { "epoch": 7.76701191669451, "grad_norm": 0.08445217460393906, "learning_rate": 7.218350988899811e-06, "loss": 0.4534, "num_input_tokens_seen": 84609392, "step": 69740 }, { "epoch": 7.7675687715781265, "grad_norm": 0.10628446936607361, "learning_rate": 7.214935487722793e-06, "loss": 0.4512, "num_input_tokens_seen": 84615792, "step": 69745 }, { "epoch": 7.768125626461744, "grad_norm": 0.09695136547088623, "learning_rate": 7.2115206585225055e-06, "loss": 0.4616, "num_input_tokens_seen": 84621904, "step": 69750 }, { "epoch": 7.768682481345362, "grad_norm": 0.1191006675362587, "learning_rate": 7.208106501427986e-06, "loss": 0.4652, "num_input_tokens_seen": 84627792, "step": 69755 }, { "epoch": 7.769239336228979, "grad_norm": 0.12533991038799286, "learning_rate": 7.2046930165682065e-06, "loss": 0.4611, "num_input_tokens_seen": 84634032, "step": 69760 }, { "epoch": 7.769796191112596, "grad_norm": 0.13815075159072876, "learning_rate": 7.2012802040721636e-06, "loss": 0.4606, "num_input_tokens_seen": 84640176, "step": 69765 }, { "epoch": 7.770353045996213, "grad_norm": 0.1608293503522873, "learning_rate": 7.197868064068785e-06, "loss": 0.4698, "num_input_tokens_seen": 84646224, "step": 69770 }, { "epoch": 7.770909900879831, "grad_norm": 0.11818375438451767, "learning_rate": 7.194456596687011e-06, "loss": 0.4461, "num_input_tokens_seen": 84652368, "step": 69775 }, { "epoch": 7.7714667557634485, "grad_norm": 0.09468214958906174, "learning_rate": 7.1910458020557155e-06, "loss": 0.4705, "num_input_tokens_seen": 84658768, "step": 69780 }, { "epoch": 7.772023610647065, "grad_norm": 0.11683332175016403, "learning_rate": 7.1876356803037836e-06, "loss": 0.4605, "num_input_tokens_seen": 84664944, "step": 69785 }, { "epoch": 7.772580465530683, "grad_norm": 0.10247131437063217, "learning_rate": 7.184226231560065e-06, "loss": 0.4831, "num_input_tokens_seen": 84670960, "step": 69790 }, { "epoch": 7.7731373204143, "grad_norm": 0.08006013929843903, "learning_rate": 7.180817455953365e-06, "loss": 0.4643, "num_input_tokens_seen": 84677008, "step": 69795 }, { "epoch": 7.773694175297917, "grad_norm": 0.124618761241436, "learning_rate": 7.177409353612497e-06, "loss": 0.4481, "num_input_tokens_seen": 84683280, "step": 69800 }, { "epoch": 7.774251030181535, "grad_norm": 0.10276618599891663, "learning_rate": 7.1740019246662075e-06, "loss": 0.452, "num_input_tokens_seen": 84689200, "step": 69805 }, { "epoch": 7.774807885065152, "grad_norm": 0.09380624443292618, "learning_rate": 7.170595169243258e-06, "loss": 0.4666, "num_input_tokens_seen": 84695248, "step": 69810 }, { "epoch": 7.7753647399487695, "grad_norm": 0.1491421014070511, "learning_rate": 7.167189087472356e-06, "loss": 0.4619, "num_input_tokens_seen": 84701520, "step": 69815 }, { "epoch": 7.775921594832386, "grad_norm": 0.13239699602127075, "learning_rate": 7.163783679482197e-06, "loss": 0.4736, "num_input_tokens_seen": 84707824, "step": 69820 }, { "epoch": 7.776478449716004, "grad_norm": 0.08736196160316467, "learning_rate": 7.160378945401455e-06, "loss": 0.4762, "num_input_tokens_seen": 84713456, "step": 69825 }, { "epoch": 7.777035304599622, "grad_norm": 0.09857688099145889, "learning_rate": 7.156974885358761e-06, "loss": 0.4668, "num_input_tokens_seen": 84719824, "step": 69830 }, { "epoch": 7.7775921594832385, "grad_norm": 0.091733917593956, "learning_rate": 7.153571499482745e-06, "loss": 0.4518, "num_input_tokens_seen": 84725904, "step": 69835 }, { "epoch": 7.778149014366856, "grad_norm": 0.11336037516593933, "learning_rate": 7.150168787901981e-06, "loss": 0.4581, "num_input_tokens_seen": 84731792, "step": 69840 }, { "epoch": 7.778705869250473, "grad_norm": 0.11400022357702255, "learning_rate": 7.146766750745043e-06, "loss": 0.4548, "num_input_tokens_seen": 84738000, "step": 69845 }, { "epoch": 7.779262724134091, "grad_norm": 0.09489109367132187, "learning_rate": 7.143365388140477e-06, "loss": 0.4551, "num_input_tokens_seen": 84744208, "step": 69850 }, { "epoch": 7.779819579017708, "grad_norm": 0.10798770934343338, "learning_rate": 7.139964700216783e-06, "loss": 0.4525, "num_input_tokens_seen": 84750192, "step": 69855 }, { "epoch": 7.780376433901325, "grad_norm": 0.14250445365905762, "learning_rate": 7.136564687102468e-06, "loss": 0.45, "num_input_tokens_seen": 84756400, "step": 69860 }, { "epoch": 7.780933288784943, "grad_norm": 0.09454100579023361, "learning_rate": 7.133165348925977e-06, "loss": 0.4569, "num_input_tokens_seen": 84762640, "step": 69865 }, { "epoch": 7.7814901436685595, "grad_norm": 0.15813159942626953, "learning_rate": 7.129766685815761e-06, "loss": 0.4644, "num_input_tokens_seen": 84768688, "step": 69870 }, { "epoch": 7.782046998552177, "grad_norm": 0.11714542657136917, "learning_rate": 7.1263686979002236e-06, "loss": 0.4602, "num_input_tokens_seen": 84774768, "step": 69875 }, { "epoch": 7.782603853435795, "grad_norm": 0.1086733341217041, "learning_rate": 7.122971385307753e-06, "loss": 0.4567, "num_input_tokens_seen": 84780976, "step": 69880 }, { "epoch": 7.783160708319412, "grad_norm": 0.09740511327981949, "learning_rate": 7.119574748166724e-06, "loss": 0.4526, "num_input_tokens_seen": 84787184, "step": 69885 }, { "epoch": 7.783717563203029, "grad_norm": 0.12435825169086456, "learning_rate": 7.116178786605451e-06, "loss": 0.4583, "num_input_tokens_seen": 84793264, "step": 69890 }, { "epoch": 7.784274418086647, "grad_norm": 0.08372705429792404, "learning_rate": 7.112783500752265e-06, "loss": 0.4645, "num_input_tokens_seen": 84799440, "step": 69895 }, { "epoch": 7.784831272970264, "grad_norm": 0.13056747615337372, "learning_rate": 7.10938889073543e-06, "loss": 0.4716, "num_input_tokens_seen": 84805520, "step": 69900 }, { "epoch": 7.7853881278538815, "grad_norm": 0.11394081264734268, "learning_rate": 7.105994956683226e-06, "loss": 0.4585, "num_input_tokens_seen": 84811472, "step": 69905 }, { "epoch": 7.785944982737498, "grad_norm": 0.11137603968381882, "learning_rate": 7.102601698723868e-06, "loss": 0.4468, "num_input_tokens_seen": 84817616, "step": 69910 }, { "epoch": 7.786501837621116, "grad_norm": 0.11975736916065216, "learning_rate": 7.0992091169855755e-06, "loss": 0.4599, "num_input_tokens_seen": 84823664, "step": 69915 }, { "epoch": 7.787058692504734, "grad_norm": 0.08894123882055283, "learning_rate": 7.095817211596534e-06, "loss": 0.4615, "num_input_tokens_seen": 84829968, "step": 69920 }, { "epoch": 7.78761554738835, "grad_norm": 0.15562357008457184, "learning_rate": 7.092425982684886e-06, "loss": 0.4661, "num_input_tokens_seen": 84835920, "step": 69925 }, { "epoch": 7.788172402271968, "grad_norm": 0.12580977380275726, "learning_rate": 7.089035430378782e-06, "loss": 0.4665, "num_input_tokens_seen": 84841776, "step": 69930 }, { "epoch": 7.788729257155586, "grad_norm": 0.1172821968793869, "learning_rate": 7.085645554806309e-06, "loss": 0.4656, "num_input_tokens_seen": 84847984, "step": 69935 }, { "epoch": 7.7892861120392025, "grad_norm": 0.10056848078966141, "learning_rate": 7.082256356095557e-06, "loss": 0.4426, "num_input_tokens_seen": 84854000, "step": 69940 }, { "epoch": 7.78984296692282, "grad_norm": 0.1270788013935089, "learning_rate": 7.078867834374586e-06, "loss": 0.4446, "num_input_tokens_seen": 84860112, "step": 69945 }, { "epoch": 7.790399821806437, "grad_norm": 0.0989220142364502, "learning_rate": 7.075479989771414e-06, "loss": 0.4596, "num_input_tokens_seen": 84866160, "step": 69950 }, { "epoch": 7.790956676690055, "grad_norm": 0.09592597186565399, "learning_rate": 7.072092822414056e-06, "loss": 0.4545, "num_input_tokens_seen": 84871760, "step": 69955 }, { "epoch": 7.791513531573672, "grad_norm": 0.10183202475309372, "learning_rate": 7.068706332430478e-06, "loss": 0.4558, "num_input_tokens_seen": 84877872, "step": 69960 }, { "epoch": 7.792070386457289, "grad_norm": 0.11342401057481766, "learning_rate": 7.0653205199486425e-06, "loss": 0.4548, "num_input_tokens_seen": 84883824, "step": 69965 }, { "epoch": 7.792627241340907, "grad_norm": 0.13506843149662018, "learning_rate": 7.061935385096469e-06, "loss": 0.4697, "num_input_tokens_seen": 84890032, "step": 69970 }, { "epoch": 7.793184096224524, "grad_norm": 0.12726357579231262, "learning_rate": 7.058550928001859e-06, "loss": 0.4674, "num_input_tokens_seen": 84895952, "step": 69975 }, { "epoch": 7.793740951108141, "grad_norm": 0.12087427824735641, "learning_rate": 7.055167148792699e-06, "loss": 0.4366, "num_input_tokens_seen": 84902128, "step": 69980 }, { "epoch": 7.794297805991759, "grad_norm": 0.1292586326599121, "learning_rate": 7.0517840475968264e-06, "loss": 0.4737, "num_input_tokens_seen": 84908080, "step": 69985 }, { "epoch": 7.794854660875376, "grad_norm": 0.10053233057260513, "learning_rate": 7.048401624542075e-06, "loss": 0.4546, "num_input_tokens_seen": 84913872, "step": 69990 }, { "epoch": 7.795411515758993, "grad_norm": 0.13302546739578247, "learning_rate": 7.045019879756232e-06, "loss": 0.4698, "num_input_tokens_seen": 84919792, "step": 69995 }, { "epoch": 7.79596837064261, "grad_norm": 0.16097427904605865, "learning_rate": 7.041638813367085e-06, "loss": 0.4525, "num_input_tokens_seen": 84926256, "step": 70000 }, { "epoch": 7.796525225526228, "grad_norm": 0.1149819940328598, "learning_rate": 7.038258425502367e-06, "loss": 0.4545, "num_input_tokens_seen": 84933104, "step": 70005 }, { "epoch": 7.7970820804098455, "grad_norm": 0.17661941051483154, "learning_rate": 7.0348787162898086e-06, "loss": 0.4597, "num_input_tokens_seen": 84939344, "step": 70010 }, { "epoch": 7.797638935293462, "grad_norm": 0.09301158785820007, "learning_rate": 7.03149968585711e-06, "loss": 0.4628, "num_input_tokens_seen": 84945456, "step": 70015 }, { "epoch": 7.79819579017708, "grad_norm": 0.14458607137203217, "learning_rate": 7.0281213343319355e-06, "loss": 0.4724, "num_input_tokens_seen": 84951120, "step": 70020 }, { "epoch": 7.798752645060697, "grad_norm": 0.13649903237819672, "learning_rate": 7.024743661841923e-06, "loss": 0.471, "num_input_tokens_seen": 84957296, "step": 70025 }, { "epoch": 7.799309499944314, "grad_norm": 0.09327471256256104, "learning_rate": 7.021366668514709e-06, "loss": 0.4714, "num_input_tokens_seen": 84962672, "step": 70030 }, { "epoch": 7.799866354827932, "grad_norm": 0.11722338199615479, "learning_rate": 7.017990354477869e-06, "loss": 0.469, "num_input_tokens_seen": 84968688, "step": 70035 }, { "epoch": 7.800423209711549, "grad_norm": 0.10674133896827698, "learning_rate": 7.0146147198589866e-06, "loss": 0.4595, "num_input_tokens_seen": 84974832, "step": 70040 }, { "epoch": 7.800980064595167, "grad_norm": 0.12971818447113037, "learning_rate": 7.011239764785591e-06, "loss": 0.451, "num_input_tokens_seen": 84981040, "step": 70045 }, { "epoch": 7.801536919478783, "grad_norm": 0.08725243806838989, "learning_rate": 7.007865489385209e-06, "loss": 0.4505, "num_input_tokens_seen": 84987248, "step": 70050 }, { "epoch": 7.802093774362401, "grad_norm": 0.11075977236032486, "learning_rate": 7.004491893785323e-06, "loss": 0.4595, "num_input_tokens_seen": 84993200, "step": 70055 }, { "epoch": 7.802650629246019, "grad_norm": 0.13106009364128113, "learning_rate": 7.001118978113402e-06, "loss": 0.4709, "num_input_tokens_seen": 84999344, "step": 70060 }, { "epoch": 7.8032074841296355, "grad_norm": 0.12425404787063599, "learning_rate": 6.997746742496894e-06, "loss": 0.4542, "num_input_tokens_seen": 85005488, "step": 70065 }, { "epoch": 7.803764339013253, "grad_norm": 0.09540046006441116, "learning_rate": 6.994375187063198e-06, "loss": 0.4594, "num_input_tokens_seen": 85011760, "step": 70070 }, { "epoch": 7.804321193896871, "grad_norm": 0.09996020048856735, "learning_rate": 6.991004311939717e-06, "loss": 0.4665, "num_input_tokens_seen": 85017840, "step": 70075 }, { "epoch": 7.804878048780488, "grad_norm": 0.0953017920255661, "learning_rate": 6.987634117253797e-06, "loss": 0.4707, "num_input_tokens_seen": 85023920, "step": 70080 }, { "epoch": 7.805434903664105, "grad_norm": 0.12816210091114044, "learning_rate": 6.9842646031327925e-06, "loss": 0.4426, "num_input_tokens_seen": 85030160, "step": 70085 }, { "epoch": 7.805991758547722, "grad_norm": 0.167461097240448, "learning_rate": 6.980895769703996e-06, "loss": 0.4666, "num_input_tokens_seen": 85036304, "step": 70090 }, { "epoch": 7.80654861343134, "grad_norm": 0.12992213666439056, "learning_rate": 6.977527617094704e-06, "loss": 0.465, "num_input_tokens_seen": 85042352, "step": 70095 }, { "epoch": 7.807105468314957, "grad_norm": 0.08788327127695084, "learning_rate": 6.974160145432182e-06, "loss": 0.4639, "num_input_tokens_seen": 85048688, "step": 70100 }, { "epoch": 7.807662323198574, "grad_norm": 0.1052655354142189, "learning_rate": 6.970793354843647e-06, "loss": 0.4734, "num_input_tokens_seen": 85054832, "step": 70105 }, { "epoch": 7.808219178082192, "grad_norm": 0.08223330229520798, "learning_rate": 6.967427245456326e-06, "loss": 0.4534, "num_input_tokens_seen": 85060848, "step": 70110 }, { "epoch": 7.80877603296581, "grad_norm": 0.11033575981855392, "learning_rate": 6.964061817397383e-06, "loss": 0.4609, "num_input_tokens_seen": 85066832, "step": 70115 }, { "epoch": 7.809332887849426, "grad_norm": 0.10354571789503098, "learning_rate": 6.960697070793984e-06, "loss": 0.4639, "num_input_tokens_seen": 85072784, "step": 70120 }, { "epoch": 7.809889742733044, "grad_norm": 0.11310669034719467, "learning_rate": 6.957333005773267e-06, "loss": 0.4714, "num_input_tokens_seen": 85078800, "step": 70125 }, { "epoch": 7.810446597616661, "grad_norm": 0.10658377408981323, "learning_rate": 6.953969622462322e-06, "loss": 0.4662, "num_input_tokens_seen": 85084528, "step": 70130 }, { "epoch": 7.8110034525002785, "grad_norm": 0.08564326167106628, "learning_rate": 6.950606920988245e-06, "loss": 0.46, "num_input_tokens_seen": 85090512, "step": 70135 }, { "epoch": 7.811560307383896, "grad_norm": 0.14329206943511963, "learning_rate": 6.94724490147807e-06, "loss": 0.4684, "num_input_tokens_seen": 85096432, "step": 70140 }, { "epoch": 7.812117162267513, "grad_norm": 0.142124205827713, "learning_rate": 6.943883564058845e-06, "loss": 0.4628, "num_input_tokens_seen": 85102800, "step": 70145 }, { "epoch": 7.812674017151131, "grad_norm": 0.17658257484436035, "learning_rate": 6.940522908857555e-06, "loss": 0.4573, "num_input_tokens_seen": 85109008, "step": 70150 }, { "epoch": 7.813230872034747, "grad_norm": 0.10681108385324478, "learning_rate": 6.937162936001185e-06, "loss": 0.474, "num_input_tokens_seen": 85115248, "step": 70155 }, { "epoch": 7.813787726918365, "grad_norm": 0.10135715454816818, "learning_rate": 6.933803645616691e-06, "loss": 0.4612, "num_input_tokens_seen": 85121456, "step": 70160 }, { "epoch": 7.814344581801983, "grad_norm": 0.12472929805517197, "learning_rate": 6.930445037830985e-06, "loss": 0.4585, "num_input_tokens_seen": 85127184, "step": 70165 }, { "epoch": 7.8149014366856, "grad_norm": 0.09329650551080704, "learning_rate": 6.927087112770978e-06, "loss": 0.4544, "num_input_tokens_seen": 85133456, "step": 70170 }, { "epoch": 7.815458291569217, "grad_norm": 0.10827676951885223, "learning_rate": 6.923729870563528e-06, "loss": 0.4589, "num_input_tokens_seen": 85139568, "step": 70175 }, { "epoch": 7.816015146452834, "grad_norm": 0.10334902256727219, "learning_rate": 6.920373311335504e-06, "loss": 0.4462, "num_input_tokens_seen": 85145872, "step": 70180 }, { "epoch": 7.816572001336452, "grad_norm": 0.0991809070110321, "learning_rate": 6.9170174352137046e-06, "loss": 0.4558, "num_input_tokens_seen": 85152016, "step": 70185 }, { "epoch": 7.817128856220069, "grad_norm": 0.12751057744026184, "learning_rate": 6.913662242324937e-06, "loss": 0.4665, "num_input_tokens_seen": 85157840, "step": 70190 }, { "epoch": 7.817685711103686, "grad_norm": 0.09189769625663757, "learning_rate": 6.910307732795976e-06, "loss": 0.4611, "num_input_tokens_seen": 85163984, "step": 70195 }, { "epoch": 7.818242565987304, "grad_norm": 0.11625941097736359, "learning_rate": 6.906953906753555e-06, "loss": 0.461, "num_input_tokens_seen": 85170000, "step": 70200 }, { "epoch": 7.818799420870921, "grad_norm": 0.11586344987154007, "learning_rate": 6.903600764324405e-06, "loss": 0.4602, "num_input_tokens_seen": 85175536, "step": 70205 }, { "epoch": 7.819356275754538, "grad_norm": 0.1264205276966095, "learning_rate": 6.900248305635204e-06, "loss": 0.4636, "num_input_tokens_seen": 85181712, "step": 70210 }, { "epoch": 7.819913130638156, "grad_norm": 0.10783202946186066, "learning_rate": 6.896896530812625e-06, "loss": 0.4685, "num_input_tokens_seen": 85187568, "step": 70215 }, { "epoch": 7.820469985521773, "grad_norm": 0.09899310767650604, "learning_rate": 6.893545439983315e-06, "loss": 0.46, "num_input_tokens_seen": 85193680, "step": 70220 }, { "epoch": 7.82102684040539, "grad_norm": 0.08892292529344559, "learning_rate": 6.890195033273874e-06, "loss": 0.4552, "num_input_tokens_seen": 85199504, "step": 70225 }, { "epoch": 7.821583695289007, "grad_norm": 0.11215554177761078, "learning_rate": 6.886845310810913e-06, "loss": 0.4613, "num_input_tokens_seen": 85205392, "step": 70230 }, { "epoch": 7.822140550172625, "grad_norm": 0.10548724979162216, "learning_rate": 6.883496272720971e-06, "loss": 0.4561, "num_input_tokens_seen": 85211664, "step": 70235 }, { "epoch": 7.822697405056243, "grad_norm": 0.17439474165439606, "learning_rate": 6.8801479191306085e-06, "loss": 0.4633, "num_input_tokens_seen": 85218192, "step": 70240 }, { "epoch": 7.823254259939859, "grad_norm": 0.12154538184404373, "learning_rate": 6.8768002501663156e-06, "loss": 0.4617, "num_input_tokens_seen": 85224240, "step": 70245 }, { "epoch": 7.823811114823477, "grad_norm": 0.07915029674768448, "learning_rate": 6.873453265954588e-06, "loss": 0.4725, "num_input_tokens_seen": 85230544, "step": 70250 }, { "epoch": 7.824367969707095, "grad_norm": 0.089190274477005, "learning_rate": 6.870106966621892e-06, "loss": 0.4706, "num_input_tokens_seen": 85236656, "step": 70255 }, { "epoch": 7.8249248245907115, "grad_norm": 0.10178407281637192, "learning_rate": 6.8667613522946485e-06, "loss": 0.4596, "num_input_tokens_seen": 85242736, "step": 70260 }, { "epoch": 7.825481679474329, "grad_norm": 0.13347944617271423, "learning_rate": 6.863416423099281e-06, "loss": 0.4681, "num_input_tokens_seen": 85247952, "step": 70265 }, { "epoch": 7.826038534357946, "grad_norm": 0.16339051723480225, "learning_rate": 6.860072179162152e-06, "loss": 0.4686, "num_input_tokens_seen": 85253808, "step": 70270 }, { "epoch": 7.826595389241564, "grad_norm": 0.07688687741756439, "learning_rate": 6.856728620609637e-06, "loss": 0.46, "num_input_tokens_seen": 85259792, "step": 70275 }, { "epoch": 7.827152244125181, "grad_norm": 0.13525807857513428, "learning_rate": 6.85338574756805e-06, "loss": 0.4552, "num_input_tokens_seen": 85265616, "step": 70280 }, { "epoch": 7.827709099008798, "grad_norm": 0.13596698641777039, "learning_rate": 6.8500435601637045e-06, "loss": 0.4652, "num_input_tokens_seen": 85271792, "step": 70285 }, { "epoch": 7.828265953892416, "grad_norm": 0.12132130563259125, "learning_rate": 6.846702058522886e-06, "loss": 0.4619, "num_input_tokens_seen": 85277680, "step": 70290 }, { "epoch": 7.828822808776033, "grad_norm": 0.12169245630502701, "learning_rate": 6.8433612427718285e-06, "loss": 0.4611, "num_input_tokens_seen": 85283824, "step": 70295 }, { "epoch": 7.82937966365965, "grad_norm": 0.112793929874897, "learning_rate": 6.840021113036776e-06, "loss": 0.4702, "num_input_tokens_seen": 85289968, "step": 70300 }, { "epoch": 7.829936518543268, "grad_norm": 0.09599891304969788, "learning_rate": 6.836681669443918e-06, "loss": 0.4605, "num_input_tokens_seen": 85295920, "step": 70305 }, { "epoch": 7.830493373426885, "grad_norm": 0.1381419152021408, "learning_rate": 6.833342912119439e-06, "loss": 0.4635, "num_input_tokens_seen": 85301360, "step": 70310 }, { "epoch": 7.831050228310502, "grad_norm": 0.10642135143280029, "learning_rate": 6.830004841189477e-06, "loss": 0.4534, "num_input_tokens_seen": 85307280, "step": 70315 }, { "epoch": 7.83160708319412, "grad_norm": 0.10927874594926834, "learning_rate": 6.826667456780159e-06, "loss": 0.4593, "num_input_tokens_seen": 85313520, "step": 70320 }, { "epoch": 7.832163938077737, "grad_norm": 0.12317930907011032, "learning_rate": 6.8233307590175936e-06, "loss": 0.4672, "num_input_tokens_seen": 85319696, "step": 70325 }, { "epoch": 7.8327207929613545, "grad_norm": 0.08642421662807465, "learning_rate": 6.81999474802783e-06, "loss": 0.4553, "num_input_tokens_seen": 85325968, "step": 70330 }, { "epoch": 7.833277647844971, "grad_norm": 0.10511350631713867, "learning_rate": 6.816659423936939e-06, "loss": 0.4721, "num_input_tokens_seen": 85332304, "step": 70335 }, { "epoch": 7.833834502728589, "grad_norm": 0.10563762485980988, "learning_rate": 6.813324786870915e-06, "loss": 0.4742, "num_input_tokens_seen": 85338320, "step": 70340 }, { "epoch": 7.834391357612207, "grad_norm": 0.11159951984882355, "learning_rate": 6.8099908369557626e-06, "loss": 0.4719, "num_input_tokens_seen": 85344336, "step": 70345 }, { "epoch": 7.834948212495823, "grad_norm": 0.10234089940786362, "learning_rate": 6.806657574317457e-06, "loss": 0.4609, "num_input_tokens_seen": 85350512, "step": 70350 }, { "epoch": 7.835505067379441, "grad_norm": 0.12818115949630737, "learning_rate": 6.803324999081925e-06, "loss": 0.4631, "num_input_tokens_seen": 85355632, "step": 70355 }, { "epoch": 7.836061922263058, "grad_norm": 0.1040167436003685, "learning_rate": 6.799993111375094e-06, "loss": 0.4598, "num_input_tokens_seen": 85361680, "step": 70360 }, { "epoch": 7.8366187771466755, "grad_norm": 0.13950631022453308, "learning_rate": 6.796661911322843e-06, "loss": 0.4617, "num_input_tokens_seen": 85367568, "step": 70365 }, { "epoch": 7.837175632030293, "grad_norm": 0.13566908240318298, "learning_rate": 6.793331399051048e-06, "loss": 0.474, "num_input_tokens_seen": 85373072, "step": 70370 }, { "epoch": 7.83773248691391, "grad_norm": 0.10652356594800949, "learning_rate": 6.790001574685531e-06, "loss": 0.4621, "num_input_tokens_seen": 85379152, "step": 70375 }, { "epoch": 7.838289341797528, "grad_norm": 0.09045866131782532, "learning_rate": 6.78667243835211e-06, "loss": 0.4544, "num_input_tokens_seen": 85385392, "step": 70380 }, { "epoch": 7.8388461966811445, "grad_norm": 0.14231404662132263, "learning_rate": 6.78334399017658e-06, "loss": 0.4757, "num_input_tokens_seen": 85391504, "step": 70385 }, { "epoch": 7.839403051564762, "grad_norm": 0.09575668722391129, "learning_rate": 6.780016230284686e-06, "loss": 0.4644, "num_input_tokens_seen": 85397616, "step": 70390 }, { "epoch": 7.83995990644838, "grad_norm": 0.08852306753396988, "learning_rate": 6.776689158802174e-06, "loss": 0.4474, "num_input_tokens_seen": 85403504, "step": 70395 }, { "epoch": 7.840516761331997, "grad_norm": 0.15257690846920013, "learning_rate": 6.7733627758547354e-06, "loss": 0.4673, "num_input_tokens_seen": 85409680, "step": 70400 }, { "epoch": 7.841073616215614, "grad_norm": 0.12016032636165619, "learning_rate": 6.770037081568073e-06, "loss": 0.4596, "num_input_tokens_seen": 85415088, "step": 70405 }, { "epoch": 7.841630471099231, "grad_norm": 0.10216270387172699, "learning_rate": 6.766712076067821e-06, "loss": 0.4677, "num_input_tokens_seen": 85421360, "step": 70410 }, { "epoch": 7.842187325982849, "grad_norm": 0.07594925165176392, "learning_rate": 6.763387759479617e-06, "loss": 0.4531, "num_input_tokens_seen": 85427472, "step": 70415 }, { "epoch": 7.842744180866466, "grad_norm": 0.08921404927968979, "learning_rate": 6.760064131929075e-06, "loss": 0.4664, "num_input_tokens_seen": 85433616, "step": 70420 }, { "epoch": 7.843301035750083, "grad_norm": 0.08321261405944824, "learning_rate": 6.7567411935417545e-06, "loss": 0.4635, "num_input_tokens_seen": 85439856, "step": 70425 }, { "epoch": 7.843857890633701, "grad_norm": 0.07813010364770889, "learning_rate": 6.753418944443221e-06, "loss": 0.4581, "num_input_tokens_seen": 85446320, "step": 70430 }, { "epoch": 7.8444147455173185, "grad_norm": 0.085670106112957, "learning_rate": 6.750097384758996e-06, "loss": 0.469, "num_input_tokens_seen": 85452400, "step": 70435 }, { "epoch": 7.844971600400935, "grad_norm": 0.11250781267881393, "learning_rate": 6.7467765146145695e-06, "loss": 0.4595, "num_input_tokens_seen": 85458544, "step": 70440 }, { "epoch": 7.845528455284553, "grad_norm": 0.09552574902772903, "learning_rate": 6.743456334135429e-06, "loss": 0.4585, "num_input_tokens_seen": 85464816, "step": 70445 }, { "epoch": 7.84608531016817, "grad_norm": 0.09874092042446136, "learning_rate": 6.740136843447004e-06, "loss": 0.4532, "num_input_tokens_seen": 85470992, "step": 70450 }, { "epoch": 7.8466421650517875, "grad_norm": 0.11367052048444748, "learning_rate": 6.736818042674733e-06, "loss": 0.4611, "num_input_tokens_seen": 85476944, "step": 70455 }, { "epoch": 7.847199019935405, "grad_norm": 0.08803946524858475, "learning_rate": 6.733499931943999e-06, "loss": 0.4444, "num_input_tokens_seen": 85483184, "step": 70460 }, { "epoch": 7.847755874819022, "grad_norm": 0.10235309600830078, "learning_rate": 6.730182511380176e-06, "loss": 0.4728, "num_input_tokens_seen": 85489360, "step": 70465 }, { "epoch": 7.84831272970264, "grad_norm": 0.10246092081069946, "learning_rate": 6.72686578110861e-06, "loss": 0.4481, "num_input_tokens_seen": 85495536, "step": 70470 }, { "epoch": 7.848869584586257, "grad_norm": 0.1233578771352768, "learning_rate": 6.723549741254609e-06, "loss": 0.4566, "num_input_tokens_seen": 85501808, "step": 70475 }, { "epoch": 7.849426439469874, "grad_norm": 0.19847434759140015, "learning_rate": 6.720234391943475e-06, "loss": 0.4634, "num_input_tokens_seen": 85508240, "step": 70480 }, { "epoch": 7.849983294353492, "grad_norm": 0.101289764046669, "learning_rate": 6.716919733300458e-06, "loss": 0.4445, "num_input_tokens_seen": 85514544, "step": 70485 }, { "epoch": 7.8505401492371085, "grad_norm": 0.10407204926013947, "learning_rate": 6.713605765450806e-06, "loss": 0.4633, "num_input_tokens_seen": 85520688, "step": 70490 }, { "epoch": 7.851097004120726, "grad_norm": 0.09080322831869125, "learning_rate": 6.7102924885197375e-06, "loss": 0.453, "num_input_tokens_seen": 85526416, "step": 70495 }, { "epoch": 7.851653859004344, "grad_norm": 0.15199077129364014, "learning_rate": 6.706979902632424e-06, "loss": 0.4612, "num_input_tokens_seen": 85532624, "step": 70500 }, { "epoch": 7.852210713887961, "grad_norm": 0.1123906597495079, "learning_rate": 6.703668007914038e-06, "loss": 0.4609, "num_input_tokens_seen": 85538672, "step": 70505 }, { "epoch": 7.852767568771578, "grad_norm": 0.11117260903120041, "learning_rate": 6.700356804489702e-06, "loss": 0.4629, "num_input_tokens_seen": 85543952, "step": 70510 }, { "epoch": 7.853324423655195, "grad_norm": 0.10919482260942459, "learning_rate": 6.697046292484538e-06, "loss": 0.4604, "num_input_tokens_seen": 85550320, "step": 70515 }, { "epoch": 7.853881278538813, "grad_norm": 0.12987928092479706, "learning_rate": 6.6937364720236095e-06, "loss": 0.4528, "num_input_tokens_seen": 85555600, "step": 70520 }, { "epoch": 7.8544381334224305, "grad_norm": 0.11293211579322815, "learning_rate": 6.690427343231984e-06, "loss": 0.4482, "num_input_tokens_seen": 85561840, "step": 70525 }, { "epoch": 7.854994988306047, "grad_norm": 0.15251004695892334, "learning_rate": 6.687118906234699e-06, "loss": 0.4725, "num_input_tokens_seen": 85567792, "step": 70530 }, { "epoch": 7.855551843189665, "grad_norm": 0.08239813148975372, "learning_rate": 6.683811161156739e-06, "loss": 0.4551, "num_input_tokens_seen": 85573808, "step": 70535 }, { "epoch": 7.856108698073282, "grad_norm": 0.10493867844343185, "learning_rate": 6.6805041081230975e-06, "loss": 0.4571, "num_input_tokens_seen": 85579888, "step": 70540 }, { "epoch": 7.856665552956899, "grad_norm": 0.09329114109277725, "learning_rate": 6.677197747258709e-06, "loss": 0.4527, "num_input_tokens_seen": 85586256, "step": 70545 }, { "epoch": 7.857222407840517, "grad_norm": 0.108253613114357, "learning_rate": 6.673892078688521e-06, "loss": 0.4653, "num_input_tokens_seen": 85592304, "step": 70550 }, { "epoch": 7.857779262724134, "grad_norm": 0.13135521113872528, "learning_rate": 6.670587102537407e-06, "loss": 0.4636, "num_input_tokens_seen": 85598320, "step": 70555 }, { "epoch": 7.8583361176077515, "grad_norm": 0.08575940877199173, "learning_rate": 6.667282818930254e-06, "loss": 0.4553, "num_input_tokens_seen": 85604336, "step": 70560 }, { "epoch": 7.858892972491368, "grad_norm": 0.10380516946315765, "learning_rate": 6.663979227991912e-06, "loss": 0.4676, "num_input_tokens_seen": 85610704, "step": 70565 }, { "epoch": 7.859449827374986, "grad_norm": 0.10176248848438263, "learning_rate": 6.6606763298471876e-06, "loss": 0.4598, "num_input_tokens_seen": 85616720, "step": 70570 }, { "epoch": 7.860006682258604, "grad_norm": 0.11484736204147339, "learning_rate": 6.6573741246208935e-06, "loss": 0.4692, "num_input_tokens_seen": 85622672, "step": 70575 }, { "epoch": 7.8605635371422204, "grad_norm": 0.1116047203540802, "learning_rate": 6.654072612437776e-06, "loss": 0.4589, "num_input_tokens_seen": 85628752, "step": 70580 }, { "epoch": 7.861120392025838, "grad_norm": 0.1126909852027893, "learning_rate": 6.650771793422597e-06, "loss": 0.4603, "num_input_tokens_seen": 85634960, "step": 70585 }, { "epoch": 7.861677246909455, "grad_norm": 0.15883266925811768, "learning_rate": 6.647471667700053e-06, "loss": 0.4639, "num_input_tokens_seen": 85641232, "step": 70590 }, { "epoch": 7.862234101793073, "grad_norm": 0.09586194902658463, "learning_rate": 6.644172235394844e-06, "loss": 0.4572, "num_input_tokens_seen": 85647184, "step": 70595 }, { "epoch": 7.86279095667669, "grad_norm": 0.17406538128852844, "learning_rate": 6.640873496631642e-06, "loss": 0.4529, "num_input_tokens_seen": 85653072, "step": 70600 }, { "epoch": 7.863347811560307, "grad_norm": 0.11026613414287567, "learning_rate": 6.637575451535064e-06, "loss": 0.469, "num_input_tokens_seen": 85658864, "step": 70605 }, { "epoch": 7.863904666443925, "grad_norm": 0.12512211501598358, "learning_rate": 6.634278100229738e-06, "loss": 0.4592, "num_input_tokens_seen": 85664816, "step": 70610 }, { "epoch": 7.864461521327542, "grad_norm": 0.10098543018102646, "learning_rate": 6.630981442840234e-06, "loss": 0.4541, "num_input_tokens_seen": 85671120, "step": 70615 }, { "epoch": 7.865018376211159, "grad_norm": 0.09615974873304367, "learning_rate": 6.6276854794911195e-06, "loss": 0.4805, "num_input_tokens_seen": 85677136, "step": 70620 }, { "epoch": 7.865575231094777, "grad_norm": 0.08462420105934143, "learning_rate": 6.62439021030693e-06, "loss": 0.4628, "num_input_tokens_seen": 85683504, "step": 70625 }, { "epoch": 7.8661320859783945, "grad_norm": 0.09719323366880417, "learning_rate": 6.621095635412158e-06, "loss": 0.4615, "num_input_tokens_seen": 85689840, "step": 70630 }, { "epoch": 7.866688940862011, "grad_norm": 0.11490466445684433, "learning_rate": 6.617801754931299e-06, "loss": 0.4583, "num_input_tokens_seen": 85695760, "step": 70635 }, { "epoch": 7.867245795745629, "grad_norm": 0.14834117889404297, "learning_rate": 6.614508568988792e-06, "loss": 0.469, "num_input_tokens_seen": 85701616, "step": 70640 }, { "epoch": 7.867802650629246, "grad_norm": 0.08786751329898834, "learning_rate": 6.611216077709076e-06, "loss": 0.4602, "num_input_tokens_seen": 85707696, "step": 70645 }, { "epoch": 7.8683595055128634, "grad_norm": 0.09138864278793335, "learning_rate": 6.607924281216541e-06, "loss": 0.4666, "num_input_tokens_seen": 85714000, "step": 70650 }, { "epoch": 7.868916360396481, "grad_norm": 0.09270687401294708, "learning_rate": 6.6046331796355665e-06, "loss": 0.4583, "num_input_tokens_seen": 85720272, "step": 70655 }, { "epoch": 7.869473215280098, "grad_norm": 0.2159249186515808, "learning_rate": 6.601342773090508e-06, "loss": 0.4747, "num_input_tokens_seen": 85726640, "step": 70660 }, { "epoch": 7.870030070163716, "grad_norm": 0.12645193934440613, "learning_rate": 6.598053061705673e-06, "loss": 0.4678, "num_input_tokens_seen": 85732176, "step": 70665 }, { "epoch": 7.870586925047332, "grad_norm": 0.1209506019949913, "learning_rate": 6.594764045605373e-06, "loss": 0.4471, "num_input_tokens_seen": 85738160, "step": 70670 }, { "epoch": 7.87114377993095, "grad_norm": 0.09449216723442078, "learning_rate": 6.591475724913865e-06, "loss": 0.4455, "num_input_tokens_seen": 85743760, "step": 70675 }, { "epoch": 7.871700634814568, "grad_norm": 0.09203114360570908, "learning_rate": 6.588188099755402e-06, "loss": 0.4564, "num_input_tokens_seen": 85749840, "step": 70680 }, { "epoch": 7.8722574896981845, "grad_norm": 0.13096490502357483, "learning_rate": 6.584901170254187e-06, "loss": 0.4624, "num_input_tokens_seen": 85756272, "step": 70685 }, { "epoch": 7.872814344581802, "grad_norm": 0.09893960505723953, "learning_rate": 6.581614936534422e-06, "loss": 0.4447, "num_input_tokens_seen": 85762576, "step": 70690 }, { "epoch": 7.873371199465419, "grad_norm": 0.09480351209640503, "learning_rate": 6.578329398720279e-06, "loss": 0.458, "num_input_tokens_seen": 85768528, "step": 70695 }, { "epoch": 7.873928054349037, "grad_norm": 0.10185565054416656, "learning_rate": 6.575044556935875e-06, "loss": 0.4598, "num_input_tokens_seen": 85774608, "step": 70700 }, { "epoch": 7.874484909232654, "grad_norm": 0.09801501035690308, "learning_rate": 6.571760411305342e-06, "loss": 0.4568, "num_input_tokens_seen": 85780880, "step": 70705 }, { "epoch": 7.875041764116271, "grad_norm": 0.10289006680250168, "learning_rate": 6.568476961952752e-06, "loss": 0.4658, "num_input_tokens_seen": 85787056, "step": 70710 }, { "epoch": 7.875598618999889, "grad_norm": 0.11431024223566055, "learning_rate": 6.565194209002165e-06, "loss": 0.4577, "num_input_tokens_seen": 85793424, "step": 70715 }, { "epoch": 7.876155473883506, "grad_norm": 0.1444949507713318, "learning_rate": 6.561912152577629e-06, "loss": 0.4636, "num_input_tokens_seen": 85799792, "step": 70720 }, { "epoch": 7.876712328767123, "grad_norm": 0.11630367487668991, "learning_rate": 6.5586307928031305e-06, "loss": 0.4724, "num_input_tokens_seen": 85805104, "step": 70725 }, { "epoch": 7.877269183650741, "grad_norm": 0.10067950934171677, "learning_rate": 6.555350129802665e-06, "loss": 0.462, "num_input_tokens_seen": 85811152, "step": 70730 }, { "epoch": 7.877826038534358, "grad_norm": 0.10674120485782623, "learning_rate": 6.552070163700175e-06, "loss": 0.461, "num_input_tokens_seen": 85816976, "step": 70735 }, { "epoch": 7.878382893417975, "grad_norm": 0.11001139134168625, "learning_rate": 6.5487908946196e-06, "loss": 0.4611, "num_input_tokens_seen": 85823216, "step": 70740 }, { "epoch": 7.878939748301592, "grad_norm": 0.12734727561473846, "learning_rate": 6.545512322684827e-06, "loss": 0.4616, "num_input_tokens_seen": 85829456, "step": 70745 }, { "epoch": 7.87949660318521, "grad_norm": 0.11358538269996643, "learning_rate": 6.54223444801974e-06, "loss": 0.4618, "num_input_tokens_seen": 85835504, "step": 70750 }, { "epoch": 7.8800534580688275, "grad_norm": 0.09969229996204376, "learning_rate": 6.538957270748194e-06, "loss": 0.4757, "num_input_tokens_seen": 85841584, "step": 70755 }, { "epoch": 7.880610312952444, "grad_norm": 0.09801610559225082, "learning_rate": 6.535680790993995e-06, "loss": 0.4671, "num_input_tokens_seen": 85847600, "step": 70760 }, { "epoch": 7.881167167836062, "grad_norm": 0.11485978215932846, "learning_rate": 6.5324050088809565e-06, "loss": 0.4526, "num_input_tokens_seen": 85853616, "step": 70765 }, { "epoch": 7.881724022719679, "grad_norm": 0.0897587463259697, "learning_rate": 6.529129924532828e-06, "loss": 0.4491, "num_input_tokens_seen": 85859728, "step": 70770 }, { "epoch": 7.882280877603296, "grad_norm": 0.11475773900747299, "learning_rate": 6.525855538073375e-06, "loss": 0.4572, "num_input_tokens_seen": 85865936, "step": 70775 }, { "epoch": 7.882837732486914, "grad_norm": 0.11942943930625916, "learning_rate": 6.522581849626294e-06, "loss": 0.4685, "num_input_tokens_seen": 85872112, "step": 70780 }, { "epoch": 7.883394587370531, "grad_norm": 0.1038607507944107, "learning_rate": 6.519308859315285e-06, "loss": 0.4556, "num_input_tokens_seen": 85878256, "step": 70785 }, { "epoch": 7.883951442254149, "grad_norm": 0.10656760632991791, "learning_rate": 6.5160365672640195e-06, "loss": 0.457, "num_input_tokens_seen": 85884368, "step": 70790 }, { "epoch": 7.884508297137766, "grad_norm": 0.132797509431839, "learning_rate": 6.512764973596119e-06, "loss": 0.4548, "num_input_tokens_seen": 85890000, "step": 70795 }, { "epoch": 7.885065152021383, "grad_norm": 0.14201012253761292, "learning_rate": 6.509494078435211e-06, "loss": 0.4574, "num_input_tokens_seen": 85896016, "step": 70800 }, { "epoch": 7.885622006905001, "grad_norm": 0.10339420288801193, "learning_rate": 6.506223881904866e-06, "loss": 0.4782, "num_input_tokens_seen": 85901744, "step": 70805 }, { "epoch": 7.886178861788618, "grad_norm": 0.1750374436378479, "learning_rate": 6.502954384128654e-06, "loss": 0.4516, "num_input_tokens_seen": 85908048, "step": 70810 }, { "epoch": 7.886735716672235, "grad_norm": 0.11340785771608353, "learning_rate": 6.499685585230095e-06, "loss": 0.4708, "num_input_tokens_seen": 85913968, "step": 70815 }, { "epoch": 7.887292571555853, "grad_norm": 0.175038680434227, "learning_rate": 6.4964174853327035e-06, "loss": 0.4585, "num_input_tokens_seen": 85920144, "step": 70820 }, { "epoch": 7.88784942643947, "grad_norm": 0.0878070667386055, "learning_rate": 6.493150084559962e-06, "loss": 0.4608, "num_input_tokens_seen": 85926128, "step": 70825 }, { "epoch": 7.888406281323087, "grad_norm": 0.15480802953243256, "learning_rate": 6.489883383035314e-06, "loss": 0.4746, "num_input_tokens_seen": 85932464, "step": 70830 }, { "epoch": 7.888963136206705, "grad_norm": 0.11754961311817169, "learning_rate": 6.486617380882196e-06, "loss": 0.4573, "num_input_tokens_seen": 85938672, "step": 70835 }, { "epoch": 7.889519991090322, "grad_norm": 0.12524272501468658, "learning_rate": 6.483352078223995e-06, "loss": 0.4636, "num_input_tokens_seen": 85944880, "step": 70840 }, { "epoch": 7.890076845973939, "grad_norm": 0.09523673355579376, "learning_rate": 6.480087475184099e-06, "loss": 0.4609, "num_input_tokens_seen": 85951120, "step": 70845 }, { "epoch": 7.890633700857556, "grad_norm": 0.13236044347286224, "learning_rate": 6.476823571885851e-06, "loss": 0.4681, "num_input_tokens_seen": 85957264, "step": 70850 }, { "epoch": 7.891190555741174, "grad_norm": 0.09280768036842346, "learning_rate": 6.473560368452558e-06, "loss": 0.4653, "num_input_tokens_seen": 85963408, "step": 70855 }, { "epoch": 7.891747410624792, "grad_norm": 0.10541941225528717, "learning_rate": 6.470297865007535e-06, "loss": 0.4462, "num_input_tokens_seen": 85969552, "step": 70860 }, { "epoch": 7.892304265508408, "grad_norm": 0.11381606757640839, "learning_rate": 6.467036061674031e-06, "loss": 0.4642, "num_input_tokens_seen": 85976016, "step": 70865 }, { "epoch": 7.892861120392026, "grad_norm": 0.11096589267253876, "learning_rate": 6.463774958575297e-06, "loss": 0.4645, "num_input_tokens_seen": 85981936, "step": 70870 }, { "epoch": 7.893417975275643, "grad_norm": 0.1367630660533905, "learning_rate": 6.460514555834554e-06, "loss": 0.4473, "num_input_tokens_seen": 85988112, "step": 70875 }, { "epoch": 7.8939748301592605, "grad_norm": 0.16237211227416992, "learning_rate": 6.457254853574976e-06, "loss": 0.4669, "num_input_tokens_seen": 85993712, "step": 70880 }, { "epoch": 7.894531685042878, "grad_norm": 0.11069860309362411, "learning_rate": 6.453995851919742e-06, "loss": 0.4645, "num_input_tokens_seen": 85999632, "step": 70885 }, { "epoch": 7.895088539926495, "grad_norm": 0.12329772114753723, "learning_rate": 6.4507375509919696e-06, "loss": 0.4606, "num_input_tokens_seen": 86005936, "step": 70890 }, { "epoch": 7.895645394810113, "grad_norm": 0.15298022329807281, "learning_rate": 6.447479950914778e-06, "loss": 0.4624, "num_input_tokens_seen": 86011728, "step": 70895 }, { "epoch": 7.896202249693729, "grad_norm": 0.10516306757926941, "learning_rate": 6.444223051811254e-06, "loss": 0.4566, "num_input_tokens_seen": 86017360, "step": 70900 }, { "epoch": 7.896759104577347, "grad_norm": 0.09833479672670364, "learning_rate": 6.440966853804442e-06, "loss": 0.4594, "num_input_tokens_seen": 86023024, "step": 70905 }, { "epoch": 7.897315959460965, "grad_norm": 0.17278702557086945, "learning_rate": 6.437711357017384e-06, "loss": 0.4594, "num_input_tokens_seen": 86029616, "step": 70910 }, { "epoch": 7.8978728143445815, "grad_norm": 0.13027088344097137, "learning_rate": 6.434456561573071e-06, "loss": 0.4656, "num_input_tokens_seen": 86036016, "step": 70915 }, { "epoch": 7.898429669228199, "grad_norm": 0.1482117772102356, "learning_rate": 6.431202467594491e-06, "loss": 0.4665, "num_input_tokens_seen": 86042192, "step": 70920 }, { "epoch": 7.898986524111816, "grad_norm": 0.09254731982946396, "learning_rate": 6.4279490752045815e-06, "loss": 0.4616, "num_input_tokens_seen": 86048240, "step": 70925 }, { "epoch": 7.899543378995434, "grad_norm": 0.10765108466148376, "learning_rate": 6.424696384526272e-06, "loss": 0.4559, "num_input_tokens_seen": 86054640, "step": 70930 }, { "epoch": 7.900100233879051, "grad_norm": 0.13604746758937836, "learning_rate": 6.421444395682469e-06, "loss": 0.4523, "num_input_tokens_seen": 86060720, "step": 70935 }, { "epoch": 7.900657088762668, "grad_norm": 0.09823638945817947, "learning_rate": 6.418193108796028e-06, "loss": 0.4648, "num_input_tokens_seen": 86066928, "step": 70940 }, { "epoch": 7.901213943646286, "grad_norm": 0.08800800889730453, "learning_rate": 6.414942523989806e-06, "loss": 0.4576, "num_input_tokens_seen": 86073328, "step": 70945 }, { "epoch": 7.9017707985299035, "grad_norm": 0.10190604627132416, "learning_rate": 6.411692641386608e-06, "loss": 0.4558, "num_input_tokens_seen": 86079536, "step": 70950 }, { "epoch": 7.90232765341352, "grad_norm": 0.12581990659236908, "learning_rate": 6.408443461109237e-06, "loss": 0.4707, "num_input_tokens_seen": 86085488, "step": 70955 }, { "epoch": 7.902884508297138, "grad_norm": 0.07576710730791092, "learning_rate": 6.4051949832804435e-06, "loss": 0.4648, "num_input_tokens_seen": 86091568, "step": 70960 }, { "epoch": 7.903441363180755, "grad_norm": 0.1574251353740692, "learning_rate": 6.401947208022974e-06, "loss": 0.458, "num_input_tokens_seen": 86097648, "step": 70965 }, { "epoch": 7.903998218064372, "grad_norm": 0.0925920382142067, "learning_rate": 6.398700135459548e-06, "loss": 0.475, "num_input_tokens_seen": 86104048, "step": 70970 }, { "epoch": 7.90455507294799, "grad_norm": 0.10200903564691544, "learning_rate": 6.395453765712833e-06, "loss": 0.4561, "num_input_tokens_seen": 86110128, "step": 70975 }, { "epoch": 7.905111927831607, "grad_norm": 0.09088966995477676, "learning_rate": 6.392208098905505e-06, "loss": 0.4649, "num_input_tokens_seen": 86116144, "step": 70980 }, { "epoch": 7.9056687827152246, "grad_norm": 0.07308174669742584, "learning_rate": 6.388963135160178e-06, "loss": 0.4646, "num_input_tokens_seen": 86122224, "step": 70985 }, { "epoch": 7.906225637598842, "grad_norm": 0.1252867877483368, "learning_rate": 6.385718874599467e-06, "loss": 0.4541, "num_input_tokens_seen": 86128272, "step": 70990 }, { "epoch": 7.906782492482459, "grad_norm": 0.09821681678295135, "learning_rate": 6.382475317345954e-06, "loss": 0.4574, "num_input_tokens_seen": 86134032, "step": 70995 }, { "epoch": 7.907339347366077, "grad_norm": 0.08823634684085846, "learning_rate": 6.379232463522181e-06, "loss": 0.4563, "num_input_tokens_seen": 86140240, "step": 71000 }, { "epoch": 7.9078962022496935, "grad_norm": 0.11266134679317474, "learning_rate": 6.375990313250687e-06, "loss": 0.4667, "num_input_tokens_seen": 86145680, "step": 71005 }, { "epoch": 7.908453057133311, "grad_norm": 0.08817575871944427, "learning_rate": 6.372748866653955e-06, "loss": 0.4542, "num_input_tokens_seen": 86151664, "step": 71010 }, { "epoch": 7.909009912016929, "grad_norm": 0.08887708187103271, "learning_rate": 6.36950812385447e-06, "loss": 0.4674, "num_input_tokens_seen": 86157872, "step": 71015 }, { "epoch": 7.909566766900546, "grad_norm": 0.10651344805955887, "learning_rate": 6.366268084974667e-06, "loss": 0.4504, "num_input_tokens_seen": 86164176, "step": 71020 }, { "epoch": 7.910123621784163, "grad_norm": 0.09327732771635056, "learning_rate": 6.363028750136968e-06, "loss": 0.4577, "num_input_tokens_seen": 86170320, "step": 71025 }, { "epoch": 7.91068047666778, "grad_norm": 0.0851098820567131, "learning_rate": 6.359790119463777e-06, "loss": 0.4498, "num_input_tokens_seen": 86176144, "step": 71030 }, { "epoch": 7.911237331551398, "grad_norm": 0.12327494472265244, "learning_rate": 6.356552193077445e-06, "loss": 0.4645, "num_input_tokens_seen": 86182480, "step": 71035 }, { "epoch": 7.911794186435015, "grad_norm": 0.08835966885089874, "learning_rate": 6.353314971100322e-06, "loss": 0.4627, "num_input_tokens_seen": 86188176, "step": 71040 }, { "epoch": 7.912351041318632, "grad_norm": 0.10946409404277802, "learning_rate": 6.350078453654709e-06, "loss": 0.4609, "num_input_tokens_seen": 86193968, "step": 71045 }, { "epoch": 7.91290789620225, "grad_norm": 0.12343693524599075, "learning_rate": 6.346842640862904e-06, "loss": 0.4544, "num_input_tokens_seen": 86199888, "step": 71050 }, { "epoch": 7.913464751085867, "grad_norm": 0.12149039655923843, "learning_rate": 6.343607532847157e-06, "loss": 0.476, "num_input_tokens_seen": 86206192, "step": 71055 }, { "epoch": 7.914021605969484, "grad_norm": 0.1139916479587555, "learning_rate": 6.340373129729702e-06, "loss": 0.4712, "num_input_tokens_seen": 86212272, "step": 71060 }, { "epoch": 7.914578460853102, "grad_norm": 0.08890288323163986, "learning_rate": 6.337139431632758e-06, "loss": 0.4623, "num_input_tokens_seen": 86218640, "step": 71065 }, { "epoch": 7.915135315736719, "grad_norm": 0.11885425448417664, "learning_rate": 6.3339064386784856e-06, "loss": 0.4613, "num_input_tokens_seen": 86224880, "step": 71070 }, { "epoch": 7.9156921706203365, "grad_norm": 0.09563116729259491, "learning_rate": 6.330674150989052e-06, "loss": 0.4564, "num_input_tokens_seen": 86231280, "step": 71075 }, { "epoch": 7.916249025503953, "grad_norm": 0.10362136363983154, "learning_rate": 6.327442568686573e-06, "loss": 0.4579, "num_input_tokens_seen": 86237488, "step": 71080 }, { "epoch": 7.916805880387571, "grad_norm": 0.09385323524475098, "learning_rate": 6.324211691893159e-06, "loss": 0.4614, "num_input_tokens_seen": 86243600, "step": 71085 }, { "epoch": 7.917362735271189, "grad_norm": 0.12886139750480652, "learning_rate": 6.32098152073087e-06, "loss": 0.4569, "num_input_tokens_seen": 86249872, "step": 71090 }, { "epoch": 7.917919590154805, "grad_norm": 0.1626012772321701, "learning_rate": 6.31775205532176e-06, "loss": 0.4592, "num_input_tokens_seen": 86255888, "step": 71095 }, { "epoch": 7.918476445038423, "grad_norm": 0.13585035502910614, "learning_rate": 6.314523295787852e-06, "loss": 0.4622, "num_input_tokens_seen": 86262000, "step": 71100 }, { "epoch": 7.91903329992204, "grad_norm": 0.11605328321456909, "learning_rate": 6.311295242251128e-06, "loss": 0.4672, "num_input_tokens_seen": 86267952, "step": 71105 }, { "epoch": 7.9195901548056575, "grad_norm": 0.1242157369852066, "learning_rate": 6.308067894833569e-06, "loss": 0.4718, "num_input_tokens_seen": 86274384, "step": 71110 }, { "epoch": 7.920147009689275, "grad_norm": 0.11825168132781982, "learning_rate": 6.304841253657098e-06, "loss": 0.4542, "num_input_tokens_seen": 86280336, "step": 71115 }, { "epoch": 7.920703864572892, "grad_norm": 0.13407018780708313, "learning_rate": 6.301615318843637e-06, "loss": 0.4557, "num_input_tokens_seen": 86285968, "step": 71120 }, { "epoch": 7.92126071945651, "grad_norm": 0.18483687937259674, "learning_rate": 6.29839009051508e-06, "loss": 0.4622, "num_input_tokens_seen": 86291376, "step": 71125 }, { "epoch": 7.921817574340127, "grad_norm": 0.11211816966533661, "learning_rate": 6.295165568793268e-06, "loss": 0.4711, "num_input_tokens_seen": 86297648, "step": 71130 }, { "epoch": 7.922374429223744, "grad_norm": 0.08723533898591995, "learning_rate": 6.291941753800051e-06, "loss": 0.4627, "num_input_tokens_seen": 86303248, "step": 71135 }, { "epoch": 7.922931284107362, "grad_norm": 0.1298542320728302, "learning_rate": 6.288718645657221e-06, "loss": 0.4543, "num_input_tokens_seen": 86308624, "step": 71140 }, { "epoch": 7.923488138990979, "grad_norm": 0.09153807163238525, "learning_rate": 6.285496244486569e-06, "loss": 0.4735, "num_input_tokens_seen": 86314736, "step": 71145 }, { "epoch": 7.924044993874596, "grad_norm": 0.0988333448767662, "learning_rate": 6.282274550409836e-06, "loss": 0.4744, "num_input_tokens_seen": 86321072, "step": 71150 }, { "epoch": 7.924601848758214, "grad_norm": 0.08728967607021332, "learning_rate": 6.279053563548756e-06, "loss": 0.4559, "num_input_tokens_seen": 86327280, "step": 71155 }, { "epoch": 7.925158703641831, "grad_norm": 0.11412617564201355, "learning_rate": 6.275833284025032e-06, "loss": 0.4596, "num_input_tokens_seen": 86333520, "step": 71160 }, { "epoch": 7.925715558525448, "grad_norm": 0.08694534003734589, "learning_rate": 6.272613711960326e-06, "loss": 0.4643, "num_input_tokens_seen": 86339696, "step": 71165 }, { "epoch": 7.926272413409066, "grad_norm": 0.09734304249286652, "learning_rate": 6.269394847476293e-06, "loss": 0.4556, "num_input_tokens_seen": 86345936, "step": 71170 }, { "epoch": 7.926829268292683, "grad_norm": 0.09196459501981735, "learning_rate": 6.266176690694542e-06, "loss": 0.483, "num_input_tokens_seen": 86352112, "step": 71175 }, { "epoch": 7.9273861231763005, "grad_norm": 0.1068711206316948, "learning_rate": 6.26295924173668e-06, "loss": 0.4651, "num_input_tokens_seen": 86358000, "step": 71180 }, { "epoch": 7.927942978059917, "grad_norm": 0.10532969236373901, "learning_rate": 6.2597425007242555e-06, "loss": 0.4675, "num_input_tokens_seen": 86363824, "step": 71185 }, { "epoch": 7.928499832943535, "grad_norm": 0.08993270993232727, "learning_rate": 6.256526467778814e-06, "loss": 0.4594, "num_input_tokens_seen": 86370160, "step": 71190 }, { "epoch": 7.929056687827153, "grad_norm": 0.09381630271673203, "learning_rate": 6.253311143021878e-06, "loss": 0.4503, "num_input_tokens_seen": 86376240, "step": 71195 }, { "epoch": 7.9296135427107695, "grad_norm": 0.09900354593992233, "learning_rate": 6.250096526574914e-06, "loss": 0.4652, "num_input_tokens_seen": 86382256, "step": 71200 }, { "epoch": 7.930170397594387, "grad_norm": 0.08837608247995377, "learning_rate": 6.246882618559399e-06, "loss": 0.4526, "num_input_tokens_seen": 86388560, "step": 71205 }, { "epoch": 7.930727252478004, "grad_norm": 0.0795731171965599, "learning_rate": 6.243669419096751e-06, "loss": 0.4608, "num_input_tokens_seen": 86394320, "step": 71210 }, { "epoch": 7.931284107361622, "grad_norm": 0.10900327563285828, "learning_rate": 6.240456928308378e-06, "loss": 0.4659, "num_input_tokens_seen": 86400784, "step": 71215 }, { "epoch": 7.931840962245239, "grad_norm": 0.08932872861623764, "learning_rate": 6.237245146315665e-06, "loss": 0.4622, "num_input_tokens_seen": 86406864, "step": 71220 }, { "epoch": 7.932397817128856, "grad_norm": 0.12741008400917053, "learning_rate": 6.234034073239955e-06, "loss": 0.4595, "num_input_tokens_seen": 86413168, "step": 71225 }, { "epoch": 7.932954672012474, "grad_norm": 0.11952526867389679, "learning_rate": 6.230823709202582e-06, "loss": 0.4614, "num_input_tokens_seen": 86419376, "step": 71230 }, { "epoch": 7.9335115268960905, "grad_norm": 0.0921427309513092, "learning_rate": 6.227614054324829e-06, "loss": 0.4525, "num_input_tokens_seen": 86425360, "step": 71235 }, { "epoch": 7.934068381779708, "grad_norm": 0.13231877982616425, "learning_rate": 6.2244051087279845e-06, "loss": 0.4509, "num_input_tokens_seen": 86431056, "step": 71240 }, { "epoch": 7.934625236663326, "grad_norm": 0.10608597099781036, "learning_rate": 6.2211968725332804e-06, "loss": 0.4547, "num_input_tokens_seen": 86436912, "step": 71245 }, { "epoch": 7.935182091546943, "grad_norm": 0.14145001769065857, "learning_rate": 6.217989345861935e-06, "loss": 0.466, "num_input_tokens_seen": 86442864, "step": 71250 }, { "epoch": 7.93573894643056, "grad_norm": 0.13289713859558105, "learning_rate": 6.214782528835155e-06, "loss": 0.4603, "num_input_tokens_seen": 86449168, "step": 71255 }, { "epoch": 7.936295801314177, "grad_norm": 0.11200267821550369, "learning_rate": 6.211576421574081e-06, "loss": 0.463, "num_input_tokens_seen": 86455568, "step": 71260 }, { "epoch": 7.936852656197795, "grad_norm": 0.0895506888628006, "learning_rate": 6.208371024199863e-06, "loss": 0.4512, "num_input_tokens_seen": 86461680, "step": 71265 }, { "epoch": 7.9374095110814125, "grad_norm": 0.10812865942716599, "learning_rate": 6.205166336833603e-06, "loss": 0.4652, "num_input_tokens_seen": 86467664, "step": 71270 }, { "epoch": 7.937966365965029, "grad_norm": 0.07751213759183884, "learning_rate": 6.201962359596391e-06, "loss": 0.4635, "num_input_tokens_seen": 86473872, "step": 71275 }, { "epoch": 7.938523220848647, "grad_norm": 0.1190270334482193, "learning_rate": 6.198759092609288e-06, "loss": 0.4514, "num_input_tokens_seen": 86479888, "step": 71280 }, { "epoch": 7.939080075732264, "grad_norm": 0.07798866927623749, "learning_rate": 6.195556535993313e-06, "loss": 0.4669, "num_input_tokens_seen": 86486128, "step": 71285 }, { "epoch": 7.939636930615881, "grad_norm": 0.15709222853183746, "learning_rate": 6.192354689869478e-06, "loss": 0.4573, "num_input_tokens_seen": 86492272, "step": 71290 }, { "epoch": 7.940193785499499, "grad_norm": 0.18309521675109863, "learning_rate": 6.1891535543587505e-06, "loss": 0.4692, "num_input_tokens_seen": 86498384, "step": 71295 }, { "epoch": 7.940750640383116, "grad_norm": 0.10485905408859253, "learning_rate": 6.185953129582084e-06, "loss": 0.4522, "num_input_tokens_seen": 86504464, "step": 71300 }, { "epoch": 7.9413074952667335, "grad_norm": 0.11305937170982361, "learning_rate": 6.182753415660406e-06, "loss": 0.4614, "num_input_tokens_seen": 86510384, "step": 71305 }, { "epoch": 7.941864350150351, "grad_norm": 0.11784198135137558, "learning_rate": 6.1795544127146e-06, "loss": 0.4596, "num_input_tokens_seen": 86516560, "step": 71310 }, { "epoch": 7.942421205033968, "grad_norm": 0.13653728365898132, "learning_rate": 6.176356120865548e-06, "loss": 0.4694, "num_input_tokens_seen": 86522768, "step": 71315 }, { "epoch": 7.942978059917586, "grad_norm": 0.138411283493042, "learning_rate": 6.1731585402340805e-06, "loss": 0.4637, "num_input_tokens_seen": 86529104, "step": 71320 }, { "epoch": 7.943534914801202, "grad_norm": 0.07958468794822693, "learning_rate": 6.169961670941021e-06, "loss": 0.4654, "num_input_tokens_seen": 86535088, "step": 71325 }, { "epoch": 7.94409176968482, "grad_norm": 0.11633823066949844, "learning_rate": 6.166765513107148e-06, "loss": 0.4598, "num_input_tokens_seen": 86540944, "step": 71330 }, { "epoch": 7.944648624568438, "grad_norm": 0.09340718388557434, "learning_rate": 6.1635700668532275e-06, "loss": 0.4467, "num_input_tokens_seen": 86547344, "step": 71335 }, { "epoch": 7.945205479452055, "grad_norm": 0.1509055495262146, "learning_rate": 6.160375332300003e-06, "loss": 0.4635, "num_input_tokens_seen": 86553360, "step": 71340 }, { "epoch": 7.945762334335672, "grad_norm": 0.09535925090312958, "learning_rate": 6.157181309568163e-06, "loss": 0.4644, "num_input_tokens_seen": 86559088, "step": 71345 }, { "epoch": 7.94631918921929, "grad_norm": 0.10678187757730484, "learning_rate": 6.153987998778407e-06, "loss": 0.4647, "num_input_tokens_seen": 86565040, "step": 71350 }, { "epoch": 7.946876044102907, "grad_norm": 0.10119953006505966, "learning_rate": 6.150795400051371e-06, "loss": 0.4484, "num_input_tokens_seen": 86571248, "step": 71355 }, { "epoch": 7.947432898986524, "grad_norm": 0.12804745137691498, "learning_rate": 6.1476035135076994e-06, "loss": 0.4654, "num_input_tokens_seen": 86577360, "step": 71360 }, { "epoch": 7.947989753870141, "grad_norm": 0.11691879481077194, "learning_rate": 6.1444123392679715e-06, "loss": 0.457, "num_input_tokens_seen": 86583088, "step": 71365 }, { "epoch": 7.948546608753759, "grad_norm": 0.16475477814674377, "learning_rate": 6.141221877452774e-06, "loss": 0.4605, "num_input_tokens_seen": 86589200, "step": 71370 }, { "epoch": 7.9491034636373765, "grad_norm": 0.08062050491571426, "learning_rate": 6.138032128182655e-06, "loss": 0.4695, "num_input_tokens_seen": 86595120, "step": 71375 }, { "epoch": 7.949660318520993, "grad_norm": 0.0948452576994896, "learning_rate": 6.134843091578124e-06, "loss": 0.4619, "num_input_tokens_seen": 86601264, "step": 71380 }, { "epoch": 7.950217173404611, "grad_norm": 0.09631354361772537, "learning_rate": 6.131654767759684e-06, "loss": 0.461, "num_input_tokens_seen": 86607376, "step": 71385 }, { "epoch": 7.950774028288228, "grad_norm": 0.10376657545566559, "learning_rate": 6.128467156847784e-06, "loss": 0.4546, "num_input_tokens_seen": 86613808, "step": 71390 }, { "epoch": 7.951330883171845, "grad_norm": 0.21165207028388977, "learning_rate": 6.125280258962873e-06, "loss": 0.4712, "num_input_tokens_seen": 86619728, "step": 71395 }, { "epoch": 7.951887738055463, "grad_norm": 0.10075999796390533, "learning_rate": 6.122094074225368e-06, "loss": 0.471, "num_input_tokens_seen": 86626288, "step": 71400 }, { "epoch": 7.95244459293908, "grad_norm": 0.08748860657215118, "learning_rate": 6.118908602755641e-06, "loss": 0.4606, "num_input_tokens_seen": 86632400, "step": 71405 }, { "epoch": 7.953001447822698, "grad_norm": 0.11055741459131241, "learning_rate": 6.11572384467406e-06, "loss": 0.4586, "num_input_tokens_seen": 86638544, "step": 71410 }, { "epoch": 7.953558302706314, "grad_norm": 0.10354162007570267, "learning_rate": 6.112539800100942e-06, "loss": 0.4579, "num_input_tokens_seen": 86644752, "step": 71415 }, { "epoch": 7.954115157589932, "grad_norm": 0.07348258793354034, "learning_rate": 6.109356469156604e-06, "loss": 0.4625, "num_input_tokens_seen": 86650544, "step": 71420 }, { "epoch": 7.95467201247355, "grad_norm": 0.10456843674182892, "learning_rate": 6.106173851961311e-06, "loss": 0.4639, "num_input_tokens_seen": 86656976, "step": 71425 }, { "epoch": 7.9552288673571665, "grad_norm": 0.08537305891513824, "learning_rate": 6.102991948635317e-06, "loss": 0.4665, "num_input_tokens_seen": 86663088, "step": 71430 }, { "epoch": 7.955785722240784, "grad_norm": 0.08842357248067856, "learning_rate": 6.099810759298855e-06, "loss": 0.4587, "num_input_tokens_seen": 86668976, "step": 71435 }, { "epoch": 7.956342577124401, "grad_norm": 0.084679014980793, "learning_rate": 6.096630284072102e-06, "loss": 0.465, "num_input_tokens_seen": 86674032, "step": 71440 }, { "epoch": 7.956899432008019, "grad_norm": 0.11450733244419098, "learning_rate": 6.093450523075245e-06, "loss": 0.4594, "num_input_tokens_seen": 86680176, "step": 71445 }, { "epoch": 7.957456286891636, "grad_norm": 0.14941133558750153, "learning_rate": 6.090271476428408e-06, "loss": 0.4656, "num_input_tokens_seen": 86686416, "step": 71450 }, { "epoch": 7.958013141775253, "grad_norm": 0.12988349795341492, "learning_rate": 6.087093144251721e-06, "loss": 0.4662, "num_input_tokens_seen": 86692272, "step": 71455 }, { "epoch": 7.958569996658871, "grad_norm": 0.09583442658185959, "learning_rate": 6.083915526665257e-06, "loss": 0.4657, "num_input_tokens_seen": 86698160, "step": 71460 }, { "epoch": 7.959126851542488, "grad_norm": 0.11113874614238739, "learning_rate": 6.080738623789084e-06, "loss": 0.4576, "num_input_tokens_seen": 86704208, "step": 71465 }, { "epoch": 7.959683706426105, "grad_norm": 0.09640286862850189, "learning_rate": 6.077562435743242e-06, "loss": 0.4636, "num_input_tokens_seen": 86709968, "step": 71470 }, { "epoch": 7.960240561309723, "grad_norm": 0.11952707916498184, "learning_rate": 6.074386962647724e-06, "loss": 0.4454, "num_input_tokens_seen": 86715344, "step": 71475 }, { "epoch": 7.96079741619334, "grad_norm": 0.0853564515709877, "learning_rate": 6.071212204622525e-06, "loss": 0.4576, "num_input_tokens_seen": 86721648, "step": 71480 }, { "epoch": 7.961354271076957, "grad_norm": 0.1504175066947937, "learning_rate": 6.068038161787581e-06, "loss": 0.4459, "num_input_tokens_seen": 86727728, "step": 71485 }, { "epoch": 7.961911125960575, "grad_norm": 0.07206716388463974, "learning_rate": 6.064864834262835e-06, "loss": 0.4583, "num_input_tokens_seen": 86733232, "step": 71490 }, { "epoch": 7.962467980844192, "grad_norm": 0.13567395508289337, "learning_rate": 6.061692222168166e-06, "loss": 0.4663, "num_input_tokens_seen": 86738608, "step": 71495 }, { "epoch": 7.9630248357278095, "grad_norm": 0.12696731090545654, "learning_rate": 6.0585203256234565e-06, "loss": 0.4611, "num_input_tokens_seen": 86744560, "step": 71500 }, { "epoch": 7.963581690611426, "grad_norm": 0.09852752089500427, "learning_rate": 6.055349144748559e-06, "loss": 0.4486, "num_input_tokens_seen": 86750608, "step": 71505 }, { "epoch": 7.964138545495044, "grad_norm": 0.10976962000131607, "learning_rate": 6.052178679663273e-06, "loss": 0.4646, "num_input_tokens_seen": 86756912, "step": 71510 }, { "epoch": 7.964695400378662, "grad_norm": 0.12491477280855179, "learning_rate": 6.049008930487404e-06, "loss": 0.4712, "num_input_tokens_seen": 86763120, "step": 71515 }, { "epoch": 7.965252255262278, "grad_norm": 0.11152290552854538, "learning_rate": 6.045839897340705e-06, "loss": 0.454, "num_input_tokens_seen": 86769296, "step": 71520 }, { "epoch": 7.965809110145896, "grad_norm": 0.10780106484889984, "learning_rate": 6.042671580342915e-06, "loss": 0.4645, "num_input_tokens_seen": 86775152, "step": 71525 }, { "epoch": 7.966365965029514, "grad_norm": 0.1281106024980545, "learning_rate": 6.03950397961375e-06, "loss": 0.452, "num_input_tokens_seen": 86781040, "step": 71530 }, { "epoch": 7.966922819913131, "grad_norm": 0.12523451447486877, "learning_rate": 6.0363370952728815e-06, "loss": 0.4735, "num_input_tokens_seen": 86787216, "step": 71535 }, { "epoch": 7.967479674796748, "grad_norm": 0.13034532964229584, "learning_rate": 6.033170927439977e-06, "loss": 0.4661, "num_input_tokens_seen": 86792944, "step": 71540 }, { "epoch": 7.968036529680365, "grad_norm": 0.1474430114030838, "learning_rate": 6.030005476234646e-06, "loss": 0.4541, "num_input_tokens_seen": 86798960, "step": 71545 }, { "epoch": 7.968593384563983, "grad_norm": 0.08193757385015488, "learning_rate": 6.02684074177651e-06, "loss": 0.4618, "num_input_tokens_seen": 86805232, "step": 71550 }, { "epoch": 7.9691502394476, "grad_norm": 0.12704890966415405, "learning_rate": 6.023676724185129e-06, "loss": 0.4636, "num_input_tokens_seen": 86811344, "step": 71555 }, { "epoch": 7.969707094331217, "grad_norm": 0.1237926334142685, "learning_rate": 6.0205134235800485e-06, "loss": 0.4621, "num_input_tokens_seen": 86816816, "step": 71560 }, { "epoch": 7.970263949214835, "grad_norm": 0.1516781449317932, "learning_rate": 6.017350840080804e-06, "loss": 0.467, "num_input_tokens_seen": 86822896, "step": 71565 }, { "epoch": 7.970820804098452, "grad_norm": 0.09035538882017136, "learning_rate": 6.01418897380687e-06, "loss": 0.4603, "num_input_tokens_seen": 86828752, "step": 71570 }, { "epoch": 7.971377658982069, "grad_norm": 0.1017000824213028, "learning_rate": 6.011027824877727e-06, "loss": 0.4534, "num_input_tokens_seen": 86834800, "step": 71575 }, { "epoch": 7.971934513865687, "grad_norm": 0.11122439056634903, "learning_rate": 6.0078673934128e-06, "loss": 0.4641, "num_input_tokens_seen": 86840880, "step": 71580 }, { "epoch": 7.972491368749304, "grad_norm": 0.10146365314722061, "learning_rate": 6.004707679531513e-06, "loss": 0.4693, "num_input_tokens_seen": 86846960, "step": 71585 }, { "epoch": 7.973048223632921, "grad_norm": 0.12171255052089691, "learning_rate": 6.001548683353234e-06, "loss": 0.4665, "num_input_tokens_seen": 86853168, "step": 71590 }, { "epoch": 7.973605078516538, "grad_norm": 0.11128010600805283, "learning_rate": 5.998390404997328e-06, "loss": 0.4578, "num_input_tokens_seen": 86859216, "step": 71595 }, { "epoch": 7.974161933400156, "grad_norm": 0.17286352813243866, "learning_rate": 5.995232844583137e-06, "loss": 0.4654, "num_input_tokens_seen": 86865104, "step": 71600 }, { "epoch": 7.974718788283774, "grad_norm": 0.08278580009937286, "learning_rate": 5.992076002229943e-06, "loss": 0.4501, "num_input_tokens_seen": 86871120, "step": 71605 }, { "epoch": 7.97527564316739, "grad_norm": 0.12446422129869461, "learning_rate": 5.9889198780570424e-06, "loss": 0.4569, "num_input_tokens_seen": 86877072, "step": 71610 }, { "epoch": 7.975832498051008, "grad_norm": 0.07931491732597351, "learning_rate": 5.985764472183664e-06, "loss": 0.4543, "num_input_tokens_seen": 86882608, "step": 71615 }, { "epoch": 7.976389352934625, "grad_norm": 0.09378953278064728, "learning_rate": 5.9826097847290345e-06, "loss": 0.463, "num_input_tokens_seen": 86888656, "step": 71620 }, { "epoch": 7.9769462078182425, "grad_norm": 0.09535438567399979, "learning_rate": 5.979455815812363e-06, "loss": 0.4646, "num_input_tokens_seen": 86894800, "step": 71625 }, { "epoch": 7.97750306270186, "grad_norm": 0.09464216232299805, "learning_rate": 5.976302565552796e-06, "loss": 0.4649, "num_input_tokens_seen": 86901104, "step": 71630 }, { "epoch": 7.978059917585477, "grad_norm": 0.08649344742298126, "learning_rate": 5.9731500340694885e-06, "loss": 0.4626, "num_input_tokens_seen": 86906736, "step": 71635 }, { "epoch": 7.978616772469095, "grad_norm": 0.08303163200616837, "learning_rate": 5.969998221481543e-06, "loss": 0.4618, "num_input_tokens_seen": 86912656, "step": 71640 }, { "epoch": 7.979173627352711, "grad_norm": 0.10469969362020493, "learning_rate": 5.966847127908054e-06, "loss": 0.46, "num_input_tokens_seen": 86918768, "step": 71645 }, { "epoch": 7.979730482236329, "grad_norm": 0.09830796718597412, "learning_rate": 5.96369675346807e-06, "loss": 0.4562, "num_input_tokens_seen": 86924976, "step": 71650 }, { "epoch": 7.980287337119947, "grad_norm": 0.0924212783575058, "learning_rate": 5.960547098280627e-06, "loss": 0.457, "num_input_tokens_seen": 86931376, "step": 71655 }, { "epoch": 7.9808441920035635, "grad_norm": 0.11027389764785767, "learning_rate": 5.957398162464736e-06, "loss": 0.4665, "num_input_tokens_seen": 86937136, "step": 71660 }, { "epoch": 7.981401046887181, "grad_norm": 0.09594562649726868, "learning_rate": 5.954249946139368e-06, "loss": 0.456, "num_input_tokens_seen": 86943088, "step": 71665 }, { "epoch": 7.981957901770799, "grad_norm": 0.14258870482444763, "learning_rate": 5.951102449423465e-06, "loss": 0.4595, "num_input_tokens_seen": 86949200, "step": 71670 }, { "epoch": 7.982514756654416, "grad_norm": 0.12713801860809326, "learning_rate": 5.947955672435965e-06, "loss": 0.4592, "num_input_tokens_seen": 86955376, "step": 71675 }, { "epoch": 7.983071611538033, "grad_norm": 0.1581612229347229, "learning_rate": 5.944809615295746e-06, "loss": 0.4571, "num_input_tokens_seen": 86961488, "step": 71680 }, { "epoch": 7.98362846642165, "grad_norm": 0.11129864305257797, "learning_rate": 5.941664278121692e-06, "loss": 0.4528, "num_input_tokens_seen": 86967344, "step": 71685 }, { "epoch": 7.984185321305268, "grad_norm": 0.11758916825056076, "learning_rate": 5.9385196610326294e-06, "loss": 0.4613, "num_input_tokens_seen": 86973456, "step": 71690 }, { "epoch": 7.9847421761888855, "grad_norm": 0.13274481892585754, "learning_rate": 5.9353757641473885e-06, "loss": 0.4534, "num_input_tokens_seen": 86979632, "step": 71695 }, { "epoch": 7.985299031072502, "grad_norm": 0.10725174099206924, "learning_rate": 5.932232587584738e-06, "loss": 0.4689, "num_input_tokens_seen": 86985520, "step": 71700 }, { "epoch": 7.98585588595612, "grad_norm": 0.12952087819576263, "learning_rate": 5.929090131463447e-06, "loss": 0.4709, "num_input_tokens_seen": 86991504, "step": 71705 }, { "epoch": 7.986412740839738, "grad_norm": 0.116857148706913, "learning_rate": 5.925948395902253e-06, "loss": 0.4531, "num_input_tokens_seen": 86997744, "step": 71710 }, { "epoch": 7.986969595723354, "grad_norm": 0.09362407773733139, "learning_rate": 5.922807381019848e-06, "loss": 0.4685, "num_input_tokens_seen": 87003920, "step": 71715 }, { "epoch": 7.987526450606972, "grad_norm": 0.10090281069278717, "learning_rate": 5.919667086934924e-06, "loss": 0.469, "num_input_tokens_seen": 87010128, "step": 71720 }, { "epoch": 7.988083305490589, "grad_norm": 0.10713396966457367, "learning_rate": 5.916527513766115e-06, "loss": 0.456, "num_input_tokens_seen": 87016432, "step": 71725 }, { "epoch": 7.9886401603742065, "grad_norm": 0.11723575741052628, "learning_rate": 5.9133886616320605e-06, "loss": 0.4597, "num_input_tokens_seen": 87022160, "step": 71730 }, { "epoch": 7.989197015257824, "grad_norm": 0.09149709343910217, "learning_rate": 5.91025053065134e-06, "loss": 0.4555, "num_input_tokens_seen": 87027888, "step": 71735 }, { "epoch": 7.989753870141441, "grad_norm": 0.10656259953975677, "learning_rate": 5.9071131209425316e-06, "loss": 0.4552, "num_input_tokens_seen": 87033744, "step": 71740 }, { "epoch": 7.990310725025059, "grad_norm": 0.12696705758571625, "learning_rate": 5.9039764326241834e-06, "loss": 0.4638, "num_input_tokens_seen": 87040016, "step": 71745 }, { "epoch": 7.9908675799086755, "grad_norm": 0.10999908298254013, "learning_rate": 5.900840465814794e-06, "loss": 0.46, "num_input_tokens_seen": 87045936, "step": 71750 }, { "epoch": 7.991424434792293, "grad_norm": 0.09846440702676773, "learning_rate": 5.897705220632865e-06, "loss": 0.4408, "num_input_tokens_seen": 87051824, "step": 71755 }, { "epoch": 7.991981289675911, "grad_norm": 0.11000499129295349, "learning_rate": 5.894570697196841e-06, "loss": 0.4745, "num_input_tokens_seen": 87057872, "step": 71760 }, { "epoch": 7.992538144559528, "grad_norm": 0.11337363719940186, "learning_rate": 5.891436895625171e-06, "loss": 0.4662, "num_input_tokens_seen": 87064048, "step": 71765 }, { "epoch": 7.993094999443145, "grad_norm": 0.10274051874876022, "learning_rate": 5.888303816036242e-06, "loss": 0.45, "num_input_tokens_seen": 87070000, "step": 71770 }, { "epoch": 7.993651854326762, "grad_norm": 0.11741235107183456, "learning_rate": 5.885171458548444e-06, "loss": 0.4702, "num_input_tokens_seen": 87076176, "step": 71775 }, { "epoch": 7.99420870921038, "grad_norm": 0.11422813683748245, "learning_rate": 5.882039823280131e-06, "loss": 0.4642, "num_input_tokens_seen": 87082160, "step": 71780 }, { "epoch": 7.994765564093997, "grad_norm": 0.14054852724075317, "learning_rate": 5.878908910349612e-06, "loss": 0.468, "num_input_tokens_seen": 87088336, "step": 71785 }, { "epoch": 7.995322418977614, "grad_norm": 0.16196949779987335, "learning_rate": 5.8757787198751995e-06, "loss": 0.4615, "num_input_tokens_seen": 87094160, "step": 71790 }, { "epoch": 7.995879273861232, "grad_norm": 0.13535146415233612, "learning_rate": 5.872649251975146e-06, "loss": 0.4752, "num_input_tokens_seen": 87100240, "step": 71795 }, { "epoch": 7.996436128744849, "grad_norm": 0.12410023808479309, "learning_rate": 5.869520506767701e-06, "loss": 0.462, "num_input_tokens_seen": 87106352, "step": 71800 }, { "epoch": 7.996992983628466, "grad_norm": 0.09880907833576202, "learning_rate": 5.866392484371083e-06, "loss": 0.4616, "num_input_tokens_seen": 87112304, "step": 71805 }, { "epoch": 7.997549838512084, "grad_norm": 0.10765587538480759, "learning_rate": 5.863265184903468e-06, "loss": 0.452, "num_input_tokens_seen": 87118128, "step": 71810 }, { "epoch": 7.998106693395701, "grad_norm": 0.1005958840250969, "learning_rate": 5.860138608483026e-06, "loss": 0.4485, "num_input_tokens_seen": 87123664, "step": 71815 }, { "epoch": 7.9986635482793185, "grad_norm": 0.1066787987947464, "learning_rate": 5.8570127552278804e-06, "loss": 0.4708, "num_input_tokens_seen": 87130032, "step": 71820 }, { "epoch": 7.999220403162935, "grad_norm": 0.09157530218362808, "learning_rate": 5.853887625256144e-06, "loss": 0.4534, "num_input_tokens_seen": 87136112, "step": 71825 }, { "epoch": 7.999777258046553, "grad_norm": 0.11970245093107224, "learning_rate": 5.850763218685884e-06, "loss": 0.4525, "num_input_tokens_seen": 87142128, "step": 71830 }, { "epoch": 8.00033411293017, "grad_norm": 0.0855785682797432, "learning_rate": 5.8476395356351575e-06, "loss": 0.4682, "num_input_tokens_seen": 87147696, "step": 71835 }, { "epoch": 8.000890967813788, "grad_norm": 0.09945225715637207, "learning_rate": 5.844516576221989e-06, "loss": 0.4546, "num_input_tokens_seen": 87153488, "step": 71840 }, { "epoch": 8.000890967813788, "eval_loss": 0.4640532433986664, "eval_runtime": 113.0873, "eval_samples_per_second": 35.291, "eval_steps_per_second": 8.825, "num_input_tokens_seen": 87153488, "step": 71840 }, { "epoch": 8.001447822697404, "grad_norm": 0.10200120508670807, "learning_rate": 5.841394340564369e-06, "loss": 0.4548, "num_input_tokens_seen": 87159600, "step": 71845 }, { "epoch": 8.002004677581022, "grad_norm": 0.11270488053560257, "learning_rate": 5.838272828780272e-06, "loss": 0.4511, "num_input_tokens_seen": 87165936, "step": 71850 }, { "epoch": 8.00256153246464, "grad_norm": 0.12886396050453186, "learning_rate": 5.835152040987626e-06, "loss": 0.4748, "num_input_tokens_seen": 87172144, "step": 71855 }, { "epoch": 8.003118387348257, "grad_norm": 0.09557247161865234, "learning_rate": 5.8320319773043615e-06, "loss": 0.4597, "num_input_tokens_seen": 87178512, "step": 71860 }, { "epoch": 8.003675242231875, "grad_norm": 0.10871630162000656, "learning_rate": 5.8289126378483494e-06, "loss": 0.4627, "num_input_tokens_seen": 87184848, "step": 71865 }, { "epoch": 8.004232097115493, "grad_norm": 0.11828108131885529, "learning_rate": 5.825794022737455e-06, "loss": 0.4704, "num_input_tokens_seen": 87190800, "step": 71870 }, { "epoch": 8.004788951999108, "grad_norm": 0.10902765393257141, "learning_rate": 5.822676132089513e-06, "loss": 0.457, "num_input_tokens_seen": 87196848, "step": 71875 }, { "epoch": 8.005345806882726, "grad_norm": 0.13393370807170868, "learning_rate": 5.819558966022317e-06, "loss": 0.4729, "num_input_tokens_seen": 87202800, "step": 71880 }, { "epoch": 8.005902661766344, "grad_norm": 0.12664538621902466, "learning_rate": 5.816442524653659e-06, "loss": 0.4757, "num_input_tokens_seen": 87209232, "step": 71885 }, { "epoch": 8.006459516649961, "grad_norm": 0.13036036491394043, "learning_rate": 5.813326808101272e-06, "loss": 0.4638, "num_input_tokens_seen": 87215344, "step": 71890 }, { "epoch": 8.00701637153358, "grad_norm": 0.07966804504394531, "learning_rate": 5.810211816482885e-06, "loss": 0.4576, "num_input_tokens_seen": 87221552, "step": 71895 }, { "epoch": 8.007573226417195, "grad_norm": 0.10345669835805893, "learning_rate": 5.807097549916199e-06, "loss": 0.4607, "num_input_tokens_seen": 87227568, "step": 71900 }, { "epoch": 8.008130081300813, "grad_norm": 0.11940764635801315, "learning_rate": 5.803984008518867e-06, "loss": 0.4599, "num_input_tokens_seen": 87233584, "step": 71905 }, { "epoch": 8.00868693618443, "grad_norm": 0.1330808401107788, "learning_rate": 5.800871192408541e-06, "loss": 0.4743, "num_input_tokens_seen": 87239696, "step": 71910 }, { "epoch": 8.009243791068048, "grad_norm": 0.10625776648521423, "learning_rate": 5.7977591017028234e-06, "loss": 0.4625, "num_input_tokens_seen": 87245872, "step": 71915 }, { "epoch": 8.009800645951666, "grad_norm": 0.12346432358026505, "learning_rate": 5.79464773651931e-06, "loss": 0.4549, "num_input_tokens_seen": 87251984, "step": 71920 }, { "epoch": 8.010357500835282, "grad_norm": 0.11511793732643127, "learning_rate": 5.791537096975544e-06, "loss": 0.4678, "num_input_tokens_seen": 87258320, "step": 71925 }, { "epoch": 8.0109143557189, "grad_norm": 0.08005928993225098, "learning_rate": 5.788427183189063e-06, "loss": 0.4565, "num_input_tokens_seen": 87264592, "step": 71930 }, { "epoch": 8.011471210602517, "grad_norm": 0.08931634575128555, "learning_rate": 5.785317995277378e-06, "loss": 0.4578, "num_input_tokens_seen": 87270544, "step": 71935 }, { "epoch": 8.012028065486135, "grad_norm": 0.08507954329252243, "learning_rate": 5.782209533357946e-06, "loss": 0.4662, "num_input_tokens_seen": 87276464, "step": 71940 }, { "epoch": 8.012584920369752, "grad_norm": 0.1297033280134201, "learning_rate": 5.7791017975482335e-06, "loss": 0.4633, "num_input_tokens_seen": 87282736, "step": 71945 }, { "epoch": 8.013141775253368, "grad_norm": 0.10562179982662201, "learning_rate": 5.775994787965644e-06, "loss": 0.4482, "num_input_tokens_seen": 87288752, "step": 71950 }, { "epoch": 8.013698630136986, "grad_norm": 0.08644898235797882, "learning_rate": 5.772888504727586e-06, "loss": 0.4447, "num_input_tokens_seen": 87294960, "step": 71955 }, { "epoch": 8.014255485020604, "grad_norm": 0.09745186567306519, "learning_rate": 5.769782947951408e-06, "loss": 0.4611, "num_input_tokens_seen": 87301200, "step": 71960 }, { "epoch": 8.014812339904221, "grad_norm": 0.09131118655204773, "learning_rate": 5.766678117754457e-06, "loss": 0.4563, "num_input_tokens_seen": 87307408, "step": 71965 }, { "epoch": 8.015369194787839, "grad_norm": 0.1034686341881752, "learning_rate": 5.76357401425405e-06, "loss": 0.4458, "num_input_tokens_seen": 87313360, "step": 71970 }, { "epoch": 8.015926049671455, "grad_norm": 0.11566503345966339, "learning_rate": 5.760470637567456e-06, "loss": 0.471, "num_input_tokens_seen": 87319536, "step": 71975 }, { "epoch": 8.016482904555073, "grad_norm": 0.09705248475074768, "learning_rate": 5.757367987811943e-06, "loss": 0.465, "num_input_tokens_seen": 87325584, "step": 71980 }, { "epoch": 8.01703975943869, "grad_norm": 0.10875128209590912, "learning_rate": 5.754266065104727e-06, "loss": 0.4701, "num_input_tokens_seen": 87331728, "step": 71985 }, { "epoch": 8.017596614322308, "grad_norm": 0.09440154582262039, "learning_rate": 5.751164869563022e-06, "loss": 0.4676, "num_input_tokens_seen": 87337872, "step": 71990 }, { "epoch": 8.018153469205926, "grad_norm": 0.09564290940761566, "learning_rate": 5.7480644013039874e-06, "loss": 0.4597, "num_input_tokens_seen": 87344080, "step": 71995 }, { "epoch": 8.018710324089541, "grad_norm": 0.11352664232254028, "learning_rate": 5.744964660444776e-06, "loss": 0.4666, "num_input_tokens_seen": 87350288, "step": 72000 }, { "epoch": 8.019267178973159, "grad_norm": 0.10952183604240417, "learning_rate": 5.741865647102513e-06, "loss": 0.4616, "num_input_tokens_seen": 87356144, "step": 72005 }, { "epoch": 8.019824033856777, "grad_norm": 0.1054796427488327, "learning_rate": 5.738767361394273e-06, "loss": 0.4532, "num_input_tokens_seen": 87362000, "step": 72010 }, { "epoch": 8.020380888740394, "grad_norm": 0.08604832738637924, "learning_rate": 5.735669803437136e-06, "loss": 0.4487, "num_input_tokens_seen": 87368368, "step": 72015 }, { "epoch": 8.020937743624012, "grad_norm": 0.10189173370599747, "learning_rate": 5.732572973348121e-06, "loss": 0.4567, "num_input_tokens_seen": 87374544, "step": 72020 }, { "epoch": 8.021494598507628, "grad_norm": 0.12281815707683563, "learning_rate": 5.729476871244247e-06, "loss": 0.4671, "num_input_tokens_seen": 87380592, "step": 72025 }, { "epoch": 8.022051453391246, "grad_norm": 0.14847978949546814, "learning_rate": 5.726381497242497e-06, "loss": 0.4662, "num_input_tokens_seen": 87386512, "step": 72030 }, { "epoch": 8.022608308274863, "grad_norm": 0.09611572325229645, "learning_rate": 5.723286851459813e-06, "loss": 0.4606, "num_input_tokens_seen": 87392784, "step": 72035 }, { "epoch": 8.023165163158481, "grad_norm": 0.10883957147598267, "learning_rate": 5.720192934013136e-06, "loss": 0.4613, "num_input_tokens_seen": 87398864, "step": 72040 }, { "epoch": 8.023722018042099, "grad_norm": 0.09270191192626953, "learning_rate": 5.717099745019344e-06, "loss": 0.4601, "num_input_tokens_seen": 87405008, "step": 72045 }, { "epoch": 8.024278872925716, "grad_norm": 0.13514049351215363, "learning_rate": 5.71400728459533e-06, "loss": 0.4738, "num_input_tokens_seen": 87411056, "step": 72050 }, { "epoch": 8.024835727809332, "grad_norm": 0.09009239822626114, "learning_rate": 5.710915552857915e-06, "loss": 0.4521, "num_input_tokens_seen": 87417264, "step": 72055 }, { "epoch": 8.02539258269295, "grad_norm": 0.09784945100545883, "learning_rate": 5.707824549923926e-06, "loss": 0.4555, "num_input_tokens_seen": 87423440, "step": 72060 }, { "epoch": 8.025949437576568, "grad_norm": 0.13032348453998566, "learning_rate": 5.7047342759101595e-06, "loss": 0.4623, "num_input_tokens_seen": 87429264, "step": 72065 }, { "epoch": 8.026506292460185, "grad_norm": 0.11139979958534241, "learning_rate": 5.701644730933358e-06, "loss": 0.4615, "num_input_tokens_seen": 87435088, "step": 72070 }, { "epoch": 8.027063147343803, "grad_norm": 0.10925334692001343, "learning_rate": 5.698555915110274e-06, "loss": 0.4552, "num_input_tokens_seen": 87441328, "step": 72075 }, { "epoch": 8.027620002227419, "grad_norm": 0.1814875453710556, "learning_rate": 5.695467828557596e-06, "loss": 0.4646, "num_input_tokens_seen": 87447728, "step": 72080 }, { "epoch": 8.028176857111037, "grad_norm": 0.1055208146572113, "learning_rate": 5.692380471392006e-06, "loss": 0.4661, "num_input_tokens_seen": 87453264, "step": 72085 }, { "epoch": 8.028733711994654, "grad_norm": 0.07871589064598083, "learning_rate": 5.689293843730159e-06, "loss": 0.4513, "num_input_tokens_seen": 87459280, "step": 72090 }, { "epoch": 8.029290566878272, "grad_norm": 0.10202629119157791, "learning_rate": 5.686207945688671e-06, "loss": 0.4554, "num_input_tokens_seen": 87464912, "step": 72095 }, { "epoch": 8.02984742176189, "grad_norm": 0.1012539267539978, "learning_rate": 5.683122777384145e-06, "loss": 0.4554, "num_input_tokens_seen": 87470448, "step": 72100 }, { "epoch": 8.030404276645505, "grad_norm": 0.1312170773744583, "learning_rate": 5.680038338933141e-06, "loss": 0.4587, "num_input_tokens_seen": 87476496, "step": 72105 }, { "epoch": 8.030961131529123, "grad_norm": 0.10101893544197083, "learning_rate": 5.6769546304521995e-06, "loss": 0.4591, "num_input_tokens_seen": 87482032, "step": 72110 }, { "epoch": 8.03151798641274, "grad_norm": 0.1078527644276619, "learning_rate": 5.673871652057844e-06, "loss": 0.465, "num_input_tokens_seen": 87488112, "step": 72115 }, { "epoch": 8.032074841296359, "grad_norm": 0.15258795022964478, "learning_rate": 5.670789403866544e-06, "loss": 0.4437, "num_input_tokens_seen": 87494576, "step": 72120 }, { "epoch": 8.032631696179976, "grad_norm": 0.11274397373199463, "learning_rate": 5.667707885994769e-06, "loss": 0.46, "num_input_tokens_seen": 87500880, "step": 72125 }, { "epoch": 8.033188551063592, "grad_norm": 0.08841925114393234, "learning_rate": 5.664627098558939e-06, "loss": 0.4638, "num_input_tokens_seen": 87507024, "step": 72130 }, { "epoch": 8.03374540594721, "grad_norm": 0.08952014893293381, "learning_rate": 5.661547041675466e-06, "loss": 0.4701, "num_input_tokens_seen": 87513040, "step": 72135 }, { "epoch": 8.034302260830827, "grad_norm": 0.14297950267791748, "learning_rate": 5.658467715460714e-06, "loss": 0.4492, "num_input_tokens_seen": 87519184, "step": 72140 }, { "epoch": 8.034859115714445, "grad_norm": 0.1095561683177948, "learning_rate": 5.655389120031032e-06, "loss": 0.453, "num_input_tokens_seen": 87525104, "step": 72145 }, { "epoch": 8.035415970598063, "grad_norm": 0.12619160115718842, "learning_rate": 5.652311255502751e-06, "loss": 0.4748, "num_input_tokens_seen": 87531312, "step": 72150 }, { "epoch": 8.035972825481679, "grad_norm": 0.13807158172130585, "learning_rate": 5.6492341219921446e-06, "loss": 0.456, "num_input_tokens_seen": 87537456, "step": 72155 }, { "epoch": 8.036529680365296, "grad_norm": 0.10643866658210754, "learning_rate": 5.646157719615494e-06, "loss": 0.4485, "num_input_tokens_seen": 87543760, "step": 72160 }, { "epoch": 8.037086535248914, "grad_norm": 0.1028599813580513, "learning_rate": 5.64308204848902e-06, "loss": 0.4743, "num_input_tokens_seen": 87549552, "step": 72165 }, { "epoch": 8.037643390132532, "grad_norm": 0.09831535816192627, "learning_rate": 5.640007108728937e-06, "loss": 0.4537, "num_input_tokens_seen": 87555728, "step": 72170 }, { "epoch": 8.03820024501615, "grad_norm": 0.09999731183052063, "learning_rate": 5.636932900451436e-06, "loss": 0.4584, "num_input_tokens_seen": 87561968, "step": 72175 }, { "epoch": 8.038757099899765, "grad_norm": 0.09958279132843018, "learning_rate": 5.633859423772655e-06, "loss": 0.4649, "num_input_tokens_seen": 87568144, "step": 72180 }, { "epoch": 8.039313954783383, "grad_norm": 0.08669369667768478, "learning_rate": 5.630786678808733e-06, "loss": 0.4651, "num_input_tokens_seen": 87573808, "step": 72185 }, { "epoch": 8.039870809667, "grad_norm": 0.09523683786392212, "learning_rate": 5.627714665675757e-06, "loss": 0.4497, "num_input_tokens_seen": 87579984, "step": 72190 }, { "epoch": 8.040427664550618, "grad_norm": 0.15639865398406982, "learning_rate": 5.624643384489808e-06, "loss": 0.4801, "num_input_tokens_seen": 87584976, "step": 72195 }, { "epoch": 8.040984519434236, "grad_norm": 0.0922478437423706, "learning_rate": 5.621572835366915e-06, "loss": 0.4734, "num_input_tokens_seen": 87591088, "step": 72200 }, { "epoch": 8.041541374317854, "grad_norm": 0.10096479207277298, "learning_rate": 5.618503018423105e-06, "loss": 0.4603, "num_input_tokens_seen": 87597552, "step": 72205 }, { "epoch": 8.04209822920147, "grad_norm": 0.1047019213438034, "learning_rate": 5.615433933774367e-06, "loss": 0.4663, "num_input_tokens_seen": 87603472, "step": 72210 }, { "epoch": 8.042655084085087, "grad_norm": 0.11836598068475723, "learning_rate": 5.6123655815366464e-06, "loss": 0.4601, "num_input_tokens_seen": 87609744, "step": 72215 }, { "epoch": 8.043211938968705, "grad_norm": 0.11578311026096344, "learning_rate": 5.6092979618258964e-06, "loss": 0.4599, "num_input_tokens_seen": 87616016, "step": 72220 }, { "epoch": 8.043768793852323, "grad_norm": 0.12461429834365845, "learning_rate": 5.606231074758e-06, "loss": 0.4662, "num_input_tokens_seen": 87621872, "step": 72225 }, { "epoch": 8.04432564873594, "grad_norm": 0.15217669308185577, "learning_rate": 5.60316492044885e-06, "loss": 0.4643, "num_input_tokens_seen": 87628048, "step": 72230 }, { "epoch": 8.044882503619556, "grad_norm": 0.11239529401063919, "learning_rate": 5.600099499014286e-06, "loss": 0.4614, "num_input_tokens_seen": 87634128, "step": 72235 }, { "epoch": 8.045439358503174, "grad_norm": 0.10130871832370758, "learning_rate": 5.597034810570132e-06, "loss": 0.4548, "num_input_tokens_seen": 87640176, "step": 72240 }, { "epoch": 8.045996213386791, "grad_norm": 0.093410424888134, "learning_rate": 5.593970855232186e-06, "loss": 0.4677, "num_input_tokens_seen": 87646256, "step": 72245 }, { "epoch": 8.04655306827041, "grad_norm": 0.10011355578899384, "learning_rate": 5.590907633116207e-06, "loss": 0.462, "num_input_tokens_seen": 87652592, "step": 72250 }, { "epoch": 8.047109923154027, "grad_norm": 0.09705712646245956, "learning_rate": 5.5878451443379445e-06, "loss": 0.4677, "num_input_tokens_seen": 87658960, "step": 72255 }, { "epoch": 8.047666778037643, "grad_norm": 0.15342877805233002, "learning_rate": 5.584783389013096e-06, "loss": 0.4548, "num_input_tokens_seen": 87665168, "step": 72260 }, { "epoch": 8.04822363292126, "grad_norm": 0.1345316618680954, "learning_rate": 5.581722367257353e-06, "loss": 0.4694, "num_input_tokens_seen": 87671312, "step": 72265 }, { "epoch": 8.048780487804878, "grad_norm": 0.10678902268409729, "learning_rate": 5.578662079186364e-06, "loss": 0.4635, "num_input_tokens_seen": 87677200, "step": 72270 }, { "epoch": 8.049337342688496, "grad_norm": 0.13608138263225555, "learning_rate": 5.57560252491576e-06, "loss": 0.4729, "num_input_tokens_seen": 87683280, "step": 72275 }, { "epoch": 8.049894197572113, "grad_norm": 0.0869680643081665, "learning_rate": 5.572543704561148e-06, "loss": 0.465, "num_input_tokens_seen": 87689584, "step": 72280 }, { "epoch": 8.05045105245573, "grad_norm": 0.1378154158592224, "learning_rate": 5.569485618238085e-06, "loss": 0.4527, "num_input_tokens_seen": 87695248, "step": 72285 }, { "epoch": 8.051007907339347, "grad_norm": 0.18502457439899445, "learning_rate": 5.566428266062132e-06, "loss": 0.4568, "num_input_tokens_seen": 87701584, "step": 72290 }, { "epoch": 8.051564762222965, "grad_norm": 0.0991167202591896, "learning_rate": 5.563371648148788e-06, "loss": 0.4496, "num_input_tokens_seen": 87707888, "step": 72295 }, { "epoch": 8.052121617106582, "grad_norm": 0.11225753277540207, "learning_rate": 5.5603157646135525e-06, "loss": 0.4517, "num_input_tokens_seen": 87713872, "step": 72300 }, { "epoch": 8.0526784719902, "grad_norm": 0.10807635635137558, "learning_rate": 5.557260615571891e-06, "loss": 0.4576, "num_input_tokens_seen": 87720080, "step": 72305 }, { "epoch": 8.053235326873816, "grad_norm": 0.10275321453809738, "learning_rate": 5.554206201139225e-06, "loss": 0.467, "num_input_tokens_seen": 87725808, "step": 72310 }, { "epoch": 8.053792181757434, "grad_norm": 0.11457815021276474, "learning_rate": 5.5511525214309716e-06, "loss": 0.4658, "num_input_tokens_seen": 87732144, "step": 72315 }, { "epoch": 8.054349036641051, "grad_norm": 0.11211422085762024, "learning_rate": 5.548099576562496e-06, "loss": 0.4582, "num_input_tokens_seen": 87738448, "step": 72320 }, { "epoch": 8.054905891524669, "grad_norm": 0.1574661135673523, "learning_rate": 5.545047366649164e-06, "loss": 0.4683, "num_input_tokens_seen": 87744496, "step": 72325 }, { "epoch": 8.055462746408287, "grad_norm": 0.11651844531297684, "learning_rate": 5.541995891806281e-06, "loss": 0.464, "num_input_tokens_seen": 87750288, "step": 72330 }, { "epoch": 8.056019601291903, "grad_norm": 0.1686074286699295, "learning_rate": 5.538945152149147e-06, "loss": 0.46, "num_input_tokens_seen": 87755856, "step": 72335 }, { "epoch": 8.05657645617552, "grad_norm": 0.08970028162002563, "learning_rate": 5.535895147793041e-06, "loss": 0.474, "num_input_tokens_seen": 87761872, "step": 72340 }, { "epoch": 8.057133311059138, "grad_norm": 0.07426641881465912, "learning_rate": 5.532845878853185e-06, "loss": 0.4597, "num_input_tokens_seen": 87767696, "step": 72345 }, { "epoch": 8.057690165942756, "grad_norm": 0.09415920823812485, "learning_rate": 5.529797345444804e-06, "loss": 0.4529, "num_input_tokens_seen": 87774000, "step": 72350 }, { "epoch": 8.058247020826373, "grad_norm": 0.10396702587604523, "learning_rate": 5.526749547683066e-06, "loss": 0.4637, "num_input_tokens_seen": 87780112, "step": 72355 }, { "epoch": 8.05880387570999, "grad_norm": 0.08555241674184799, "learning_rate": 5.523702485683144e-06, "loss": 0.4638, "num_input_tokens_seen": 87785904, "step": 72360 }, { "epoch": 8.059360730593607, "grad_norm": 0.10682302713394165, "learning_rate": 5.520656159560147e-06, "loss": 0.4692, "num_input_tokens_seen": 87791408, "step": 72365 }, { "epoch": 8.059917585477224, "grad_norm": 0.1266525834798813, "learning_rate": 5.517610569429188e-06, "loss": 0.4597, "num_input_tokens_seen": 87796912, "step": 72370 }, { "epoch": 8.060474440360842, "grad_norm": 0.10587087273597717, "learning_rate": 5.5145657154053425e-06, "loss": 0.4612, "num_input_tokens_seen": 87802928, "step": 72375 }, { "epoch": 8.06103129524446, "grad_norm": 0.09808515757322311, "learning_rate": 5.511521597603641e-06, "loss": 0.4688, "num_input_tokens_seen": 87808912, "step": 72380 }, { "epoch": 8.061588150128078, "grad_norm": 0.10348474234342575, "learning_rate": 5.5084782161391155e-06, "loss": 0.4522, "num_input_tokens_seen": 87815088, "step": 72385 }, { "epoch": 8.062145005011693, "grad_norm": 0.1156448945403099, "learning_rate": 5.505435571126738e-06, "loss": 0.4568, "num_input_tokens_seen": 87821296, "step": 72390 }, { "epoch": 8.062701859895311, "grad_norm": 0.1263173669576645, "learning_rate": 5.50239366268148e-06, "loss": 0.4483, "num_input_tokens_seen": 87827504, "step": 72395 }, { "epoch": 8.063258714778929, "grad_norm": 0.1464652568101883, "learning_rate": 5.499352490918277e-06, "loss": 0.4585, "num_input_tokens_seen": 87833872, "step": 72400 }, { "epoch": 8.063815569662546, "grad_norm": 0.14202405512332916, "learning_rate": 5.496312055952024e-06, "loss": 0.4507, "num_input_tokens_seen": 87840016, "step": 72405 }, { "epoch": 8.064372424546164, "grad_norm": 0.10679066181182861, "learning_rate": 5.493272357897611e-06, "loss": 0.4577, "num_input_tokens_seen": 87845968, "step": 72410 }, { "epoch": 8.06492927942978, "grad_norm": 0.10086079686880112, "learning_rate": 5.490233396869873e-06, "loss": 0.4634, "num_input_tokens_seen": 87851952, "step": 72415 }, { "epoch": 8.065486134313398, "grad_norm": 0.09728646278381348, "learning_rate": 5.487195172983647e-06, "loss": 0.4595, "num_input_tokens_seen": 87857872, "step": 72420 }, { "epoch": 8.066042989197015, "grad_norm": 0.08607687056064606, "learning_rate": 5.484157686353713e-06, "loss": 0.4517, "num_input_tokens_seen": 87864176, "step": 72425 }, { "epoch": 8.066599844080633, "grad_norm": 0.14932101964950562, "learning_rate": 5.481120937094844e-06, "loss": 0.4769, "num_input_tokens_seen": 87870480, "step": 72430 }, { "epoch": 8.06715669896425, "grad_norm": 0.11783580482006073, "learning_rate": 5.478084925321783e-06, "loss": 0.4551, "num_input_tokens_seen": 87876688, "step": 72435 }, { "epoch": 8.067713553847867, "grad_norm": 0.09188450127840042, "learning_rate": 5.475049651149228e-06, "loss": 0.4654, "num_input_tokens_seen": 87882672, "step": 72440 }, { "epoch": 8.068270408731484, "grad_norm": 0.12951667606830597, "learning_rate": 5.472015114691875e-06, "loss": 0.458, "num_input_tokens_seen": 87888816, "step": 72445 }, { "epoch": 8.068827263615102, "grad_norm": 0.1192893236875534, "learning_rate": 5.468981316064364e-06, "loss": 0.457, "num_input_tokens_seen": 87894704, "step": 72450 }, { "epoch": 8.06938411849872, "grad_norm": 0.1076824963092804, "learning_rate": 5.465948255381334e-06, "loss": 0.4554, "num_input_tokens_seen": 87900912, "step": 72455 }, { "epoch": 8.069940973382337, "grad_norm": 0.09905719012022018, "learning_rate": 5.462915932757376e-06, "loss": 0.4635, "num_input_tokens_seen": 87907152, "step": 72460 }, { "epoch": 8.070497828265953, "grad_norm": 0.1268370896577835, "learning_rate": 5.4598843483070626e-06, "loss": 0.469, "num_input_tokens_seen": 87913200, "step": 72465 }, { "epoch": 8.07105468314957, "grad_norm": 0.10242478549480438, "learning_rate": 5.456853502144943e-06, "loss": 0.4608, "num_input_tokens_seen": 87919376, "step": 72470 }, { "epoch": 8.071611538033189, "grad_norm": 0.1172817200422287, "learning_rate": 5.453823394385521e-06, "loss": 0.4629, "num_input_tokens_seen": 87925648, "step": 72475 }, { "epoch": 8.072168392916806, "grad_norm": 0.14057894051074982, "learning_rate": 5.450794025143296e-06, "loss": 0.4617, "num_input_tokens_seen": 87931792, "step": 72480 }, { "epoch": 8.072725247800424, "grad_norm": 0.1052088811993599, "learning_rate": 5.447765394532717e-06, "loss": 0.4654, "num_input_tokens_seen": 87937520, "step": 72485 }, { "epoch": 8.07328210268404, "grad_norm": 0.10884015262126923, "learning_rate": 5.444737502668223e-06, "loss": 0.4539, "num_input_tokens_seen": 87944016, "step": 72490 }, { "epoch": 8.073838957567657, "grad_norm": 0.1275942325592041, "learning_rate": 5.441710349664217e-06, "loss": 0.4645, "num_input_tokens_seen": 87949392, "step": 72495 }, { "epoch": 8.074395812451275, "grad_norm": 0.15350161492824554, "learning_rate": 5.438683935635064e-06, "loss": 0.4714, "num_input_tokens_seen": 87955568, "step": 72500 }, { "epoch": 8.074952667334893, "grad_norm": 0.1068403348326683, "learning_rate": 5.435658260695125e-06, "loss": 0.4673, "num_input_tokens_seen": 87961872, "step": 72505 }, { "epoch": 8.07550952221851, "grad_norm": 0.11740819364786148, "learning_rate": 5.432633324958708e-06, "loss": 0.4597, "num_input_tokens_seen": 87967632, "step": 72510 }, { "epoch": 8.076066377102126, "grad_norm": 0.11836943030357361, "learning_rate": 5.429609128540111e-06, "loss": 0.4683, "num_input_tokens_seen": 87973328, "step": 72515 }, { "epoch": 8.076623231985744, "grad_norm": 0.10944774746894836, "learning_rate": 5.426585671553602e-06, "loss": 0.4598, "num_input_tokens_seen": 87979376, "step": 72520 }, { "epoch": 8.077180086869362, "grad_norm": 0.09502223879098892, "learning_rate": 5.423562954113409e-06, "loss": 0.4657, "num_input_tokens_seen": 87985072, "step": 72525 }, { "epoch": 8.07773694175298, "grad_norm": 0.09311177581548691, "learning_rate": 5.420540976333749e-06, "loss": 0.457, "num_input_tokens_seen": 87991344, "step": 72530 }, { "epoch": 8.078293796636597, "grad_norm": 0.10896877944469452, "learning_rate": 5.417519738328788e-06, "loss": 0.4595, "num_input_tokens_seen": 87996976, "step": 72535 }, { "epoch": 8.078850651520213, "grad_norm": 0.12440074980258942, "learning_rate": 5.414499240212694e-06, "loss": 0.4731, "num_input_tokens_seen": 88003248, "step": 72540 }, { "epoch": 8.07940750640383, "grad_norm": 0.08363963663578033, "learning_rate": 5.411479482099579e-06, "loss": 0.4692, "num_input_tokens_seen": 88009392, "step": 72545 }, { "epoch": 8.079964361287448, "grad_norm": 0.10101880133152008, "learning_rate": 5.40846046410354e-06, "loss": 0.4656, "num_input_tokens_seen": 88015120, "step": 72550 }, { "epoch": 8.080521216171066, "grad_norm": 0.09385468065738678, "learning_rate": 5.405442186338658e-06, "loss": 0.4613, "num_input_tokens_seen": 88021136, "step": 72555 }, { "epoch": 8.081078071054684, "grad_norm": 0.10116223990917206, "learning_rate": 5.402424648918958e-06, "loss": 0.4585, "num_input_tokens_seen": 88027248, "step": 72560 }, { "epoch": 8.081634925938301, "grad_norm": 0.09801370650529861, "learning_rate": 5.399407851958463e-06, "loss": 0.4629, "num_input_tokens_seen": 88033392, "step": 72565 }, { "epoch": 8.082191780821917, "grad_norm": 0.09392695128917694, "learning_rate": 5.3963917955711455e-06, "loss": 0.4631, "num_input_tokens_seen": 88039472, "step": 72570 }, { "epoch": 8.082748635705535, "grad_norm": 0.1070738285779953, "learning_rate": 5.393376479870971e-06, "loss": 0.4571, "num_input_tokens_seen": 88045520, "step": 72575 }, { "epoch": 8.083305490589153, "grad_norm": 0.08540329337120056, "learning_rate": 5.390361904971872e-06, "loss": 0.4606, "num_input_tokens_seen": 88051632, "step": 72580 }, { "epoch": 8.08386234547277, "grad_norm": 0.09658887982368469, "learning_rate": 5.3873480709877355e-06, "loss": 0.4627, "num_input_tokens_seen": 88057392, "step": 72585 }, { "epoch": 8.084419200356388, "grad_norm": 0.1237884983420372, "learning_rate": 5.384334978032446e-06, "loss": 0.459, "num_input_tokens_seen": 88063472, "step": 72590 }, { "epoch": 8.084976055240004, "grad_norm": 0.09712404012680054, "learning_rate": 5.3813226262198366e-06, "loss": 0.453, "num_input_tokens_seen": 88069520, "step": 72595 }, { "epoch": 8.085532910123622, "grad_norm": 0.11161542683839798, "learning_rate": 5.378311015663736e-06, "loss": 0.4681, "num_input_tokens_seen": 88075792, "step": 72600 }, { "epoch": 8.08608976500724, "grad_norm": 0.09887455403804779, "learning_rate": 5.375300146477918e-06, "loss": 0.4594, "num_input_tokens_seen": 88082064, "step": 72605 }, { "epoch": 8.086646619890857, "grad_norm": 0.10366803407669067, "learning_rate": 5.372290018776155e-06, "loss": 0.4598, "num_input_tokens_seen": 88088144, "step": 72610 }, { "epoch": 8.087203474774475, "grad_norm": 0.11885496228933334, "learning_rate": 5.369280632672177e-06, "loss": 0.4593, "num_input_tokens_seen": 88094128, "step": 72615 }, { "epoch": 8.08776032965809, "grad_norm": 0.10896987468004227, "learning_rate": 5.366271988279684e-06, "loss": 0.4672, "num_input_tokens_seen": 88100400, "step": 72620 }, { "epoch": 8.088317184541708, "grad_norm": 0.09362827986478806, "learning_rate": 5.363264085712358e-06, "loss": 0.448, "num_input_tokens_seen": 88106416, "step": 72625 }, { "epoch": 8.088874039425326, "grad_norm": 0.12001007050275803, "learning_rate": 5.36025692508384e-06, "loss": 0.4553, "num_input_tokens_seen": 88112560, "step": 72630 }, { "epoch": 8.089430894308943, "grad_norm": 0.11091294884681702, "learning_rate": 5.3572505065077575e-06, "loss": 0.4589, "num_input_tokens_seen": 88118736, "step": 72635 }, { "epoch": 8.089987749192561, "grad_norm": 0.08381903916597366, "learning_rate": 5.3542448300976965e-06, "loss": 0.4576, "num_input_tokens_seen": 88124688, "step": 72640 }, { "epoch": 8.090544604076177, "grad_norm": 0.09990942478179932, "learning_rate": 5.351239895967219e-06, "loss": 0.4574, "num_input_tokens_seen": 88130640, "step": 72645 }, { "epoch": 8.091101458959795, "grad_norm": 0.08048322051763535, "learning_rate": 5.348235704229876e-06, "loss": 0.4551, "num_input_tokens_seen": 88136848, "step": 72650 }, { "epoch": 8.091658313843412, "grad_norm": 0.15427909791469574, "learning_rate": 5.345232254999158e-06, "loss": 0.4555, "num_input_tokens_seen": 88143152, "step": 72655 }, { "epoch": 8.09221516872703, "grad_norm": 0.09777168929576874, "learning_rate": 5.342229548388558e-06, "loss": 0.4659, "num_input_tokens_seen": 88149424, "step": 72660 }, { "epoch": 8.092772023610648, "grad_norm": 0.11028686910867691, "learning_rate": 5.339227584511516e-06, "loss": 0.4572, "num_input_tokens_seen": 88155408, "step": 72665 }, { "epoch": 8.093328878494264, "grad_norm": 0.11030447483062744, "learning_rate": 5.336226363481461e-06, "loss": 0.4663, "num_input_tokens_seen": 88161424, "step": 72670 }, { "epoch": 8.093885733377881, "grad_norm": 0.09062635153532028, "learning_rate": 5.333225885411797e-06, "loss": 0.4589, "num_input_tokens_seen": 88167664, "step": 72675 }, { "epoch": 8.094442588261499, "grad_norm": 0.12307114154100418, "learning_rate": 5.330226150415879e-06, "loss": 0.4708, "num_input_tokens_seen": 88174224, "step": 72680 }, { "epoch": 8.094999443145117, "grad_norm": 0.12008430808782578, "learning_rate": 5.327227158607057e-06, "loss": 0.4618, "num_input_tokens_seen": 88179888, "step": 72685 }, { "epoch": 8.095556298028734, "grad_norm": 0.0971788689494133, "learning_rate": 5.32422891009863e-06, "loss": 0.4584, "num_input_tokens_seen": 88185968, "step": 72690 }, { "epoch": 8.09611315291235, "grad_norm": 0.09693549573421478, "learning_rate": 5.321231405003896e-06, "loss": 0.4625, "num_input_tokens_seen": 88191888, "step": 72695 }, { "epoch": 8.096670007795968, "grad_norm": 0.11799846589565277, "learning_rate": 5.318234643436099e-06, "loss": 0.4548, "num_input_tokens_seen": 88197936, "step": 72700 }, { "epoch": 8.097226862679586, "grad_norm": 0.21006619930267334, "learning_rate": 5.315238625508467e-06, "loss": 0.4697, "num_input_tokens_seen": 88203920, "step": 72705 }, { "epoch": 8.097783717563203, "grad_norm": 0.11020131409168243, "learning_rate": 5.312243351334212e-06, "loss": 0.4687, "num_input_tokens_seen": 88210256, "step": 72710 }, { "epoch": 8.098340572446821, "grad_norm": 0.08539707958698273, "learning_rate": 5.309248821026488e-06, "loss": 0.4605, "num_input_tokens_seen": 88216624, "step": 72715 }, { "epoch": 8.098897427330437, "grad_norm": 0.12951017916202545, "learning_rate": 5.306255034698451e-06, "loss": 0.471, "num_input_tokens_seen": 88223152, "step": 72720 }, { "epoch": 8.099454282214055, "grad_norm": 0.09967965632677078, "learning_rate": 5.3032619924632075e-06, "loss": 0.4445, "num_input_tokens_seen": 88229456, "step": 72725 }, { "epoch": 8.100011137097672, "grad_norm": 0.1267959028482437, "learning_rate": 5.300269694433849e-06, "loss": 0.4554, "num_input_tokens_seen": 88235856, "step": 72730 }, { "epoch": 8.10056799198129, "grad_norm": 0.11735308170318604, "learning_rate": 5.2972781407234305e-06, "loss": 0.4541, "num_input_tokens_seen": 88242032, "step": 72735 }, { "epoch": 8.101124846864908, "grad_norm": 0.10251789540052414, "learning_rate": 5.2942873314449805e-06, "loss": 0.4578, "num_input_tokens_seen": 88248272, "step": 72740 }, { "epoch": 8.101681701748525, "grad_norm": 0.1131286546587944, "learning_rate": 5.291297266711515e-06, "loss": 0.4394, "num_input_tokens_seen": 88254736, "step": 72745 }, { "epoch": 8.102238556632141, "grad_norm": 0.10096751898527145, "learning_rate": 5.288307946635987e-06, "loss": 0.4562, "num_input_tokens_seen": 88260784, "step": 72750 }, { "epoch": 8.102795411515759, "grad_norm": 0.10015221685171127, "learning_rate": 5.285319371331365e-06, "loss": 0.4581, "num_input_tokens_seen": 88266704, "step": 72755 }, { "epoch": 8.103352266399376, "grad_norm": 0.07329142093658447, "learning_rate": 5.2823315409105484e-06, "loss": 0.4619, "num_input_tokens_seen": 88272656, "step": 72760 }, { "epoch": 8.103909121282994, "grad_norm": 0.1299271285533905, "learning_rate": 5.2793444554864404e-06, "loss": 0.4499, "num_input_tokens_seen": 88278640, "step": 72765 }, { "epoch": 8.104465976166612, "grad_norm": 0.0951990857720375, "learning_rate": 5.276358115171889e-06, "loss": 0.4502, "num_input_tokens_seen": 88284752, "step": 72770 }, { "epoch": 8.105022831050228, "grad_norm": 0.15591342747211456, "learning_rate": 5.273372520079739e-06, "loss": 0.4611, "num_input_tokens_seen": 88290672, "step": 72775 }, { "epoch": 8.105579685933845, "grad_norm": 0.12739109992980957, "learning_rate": 5.270387670322796e-06, "loss": 0.4642, "num_input_tokens_seen": 88296816, "step": 72780 }, { "epoch": 8.106136540817463, "grad_norm": 0.09632397443056107, "learning_rate": 5.267403566013829e-06, "loss": 0.444, "num_input_tokens_seen": 88302864, "step": 72785 }, { "epoch": 8.10669339570108, "grad_norm": 0.11238469928503036, "learning_rate": 5.264420207265597e-06, "loss": 0.4585, "num_input_tokens_seen": 88309072, "step": 72790 }, { "epoch": 8.107250250584698, "grad_norm": 0.08295059949159622, "learning_rate": 5.261437594190808e-06, "loss": 0.4592, "num_input_tokens_seen": 88315376, "step": 72795 }, { "epoch": 8.107807105468314, "grad_norm": 0.10195570439100266, "learning_rate": 5.2584557269021615e-06, "loss": 0.4546, "num_input_tokens_seen": 88321584, "step": 72800 }, { "epoch": 8.108363960351932, "grad_norm": 0.09340495616197586, "learning_rate": 5.255474605512331e-06, "loss": 0.452, "num_input_tokens_seen": 88327728, "step": 72805 }, { "epoch": 8.10892081523555, "grad_norm": 0.12606678903102875, "learning_rate": 5.252494230133936e-06, "loss": 0.4676, "num_input_tokens_seen": 88334032, "step": 72810 }, { "epoch": 8.109477670119167, "grad_norm": 0.12176299840211868, "learning_rate": 5.249514600879601e-06, "loss": 0.4675, "num_input_tokens_seen": 88340144, "step": 72815 }, { "epoch": 8.110034525002785, "grad_norm": 0.10380396991968155, "learning_rate": 5.246535717861892e-06, "loss": 0.4666, "num_input_tokens_seen": 88346480, "step": 72820 }, { "epoch": 8.1105913798864, "grad_norm": 0.1071566790342331, "learning_rate": 5.243557581193373e-06, "loss": 0.4657, "num_input_tokens_seen": 88352880, "step": 72825 }, { "epoch": 8.111148234770019, "grad_norm": 0.11683125048875809, "learning_rate": 5.240580190986552e-06, "loss": 0.4511, "num_input_tokens_seen": 88358928, "step": 72830 }, { "epoch": 8.111705089653636, "grad_norm": 0.1527884155511856, "learning_rate": 5.237603547353937e-06, "loss": 0.457, "num_input_tokens_seen": 88365104, "step": 72835 }, { "epoch": 8.112261944537254, "grad_norm": 0.09248267114162445, "learning_rate": 5.234627650407997e-06, "loss": 0.4489, "num_input_tokens_seen": 88371312, "step": 72840 }, { "epoch": 8.112818799420872, "grad_norm": 0.10124997049570084, "learning_rate": 5.231652500261158e-06, "loss": 0.4658, "num_input_tokens_seen": 88377264, "step": 72845 }, { "epoch": 8.113375654304487, "grad_norm": 0.1453540027141571, "learning_rate": 5.2286780970258464e-06, "loss": 0.4545, "num_input_tokens_seen": 88383184, "step": 72850 }, { "epoch": 8.113932509188105, "grad_norm": 0.11308228969573975, "learning_rate": 5.225704440814427e-06, "loss": 0.4572, "num_input_tokens_seen": 88389168, "step": 72855 }, { "epoch": 8.114489364071723, "grad_norm": 0.12016541510820389, "learning_rate": 5.222731531739272e-06, "loss": 0.4614, "num_input_tokens_seen": 88395440, "step": 72860 }, { "epoch": 8.11504621895534, "grad_norm": 0.08867628127336502, "learning_rate": 5.219759369912694e-06, "loss": 0.4569, "num_input_tokens_seen": 88401552, "step": 72865 }, { "epoch": 8.115603073838958, "grad_norm": 0.1704230010509491, "learning_rate": 5.216787955446995e-06, "loss": 0.4434, "num_input_tokens_seen": 88407504, "step": 72870 }, { "epoch": 8.116159928722574, "grad_norm": 0.1708638072013855, "learning_rate": 5.2138172884544535e-06, "loss": 0.4704, "num_input_tokens_seen": 88413584, "step": 72875 }, { "epoch": 8.116716783606192, "grad_norm": 0.09279607236385345, "learning_rate": 5.210847369047292e-06, "loss": 0.4583, "num_input_tokens_seen": 88419696, "step": 72880 }, { "epoch": 8.11727363848981, "grad_norm": 0.09710617363452911, "learning_rate": 5.207878197337745e-06, "loss": 0.4632, "num_input_tokens_seen": 88425968, "step": 72885 }, { "epoch": 8.117830493373427, "grad_norm": 0.13073591887950897, "learning_rate": 5.204909773437977e-06, "loss": 0.4671, "num_input_tokens_seen": 88431632, "step": 72890 }, { "epoch": 8.118387348257045, "grad_norm": 0.09827499091625214, "learning_rate": 5.201942097460158e-06, "loss": 0.4605, "num_input_tokens_seen": 88437296, "step": 72895 }, { "epoch": 8.11894420314066, "grad_norm": 0.09825492650270462, "learning_rate": 5.198975169516424e-06, "loss": 0.4612, "num_input_tokens_seen": 88442576, "step": 72900 }, { "epoch": 8.119501058024278, "grad_norm": 0.09530439972877502, "learning_rate": 5.196008989718848e-06, "loss": 0.4669, "num_input_tokens_seen": 88448944, "step": 72905 }, { "epoch": 8.120057912907896, "grad_norm": 0.1426970660686493, "learning_rate": 5.193043558179528e-06, "loss": 0.459, "num_input_tokens_seen": 88454896, "step": 72910 }, { "epoch": 8.120614767791514, "grad_norm": 0.0981813594698906, "learning_rate": 5.190078875010487e-06, "loss": 0.4716, "num_input_tokens_seen": 88460720, "step": 72915 }, { "epoch": 8.121171622675131, "grad_norm": 0.09440861642360687, "learning_rate": 5.187114940323751e-06, "loss": 0.4664, "num_input_tokens_seen": 88466544, "step": 72920 }, { "epoch": 8.121728477558749, "grad_norm": 0.10635272413492203, "learning_rate": 5.184151754231315e-06, "loss": 0.451, "num_input_tokens_seen": 88471952, "step": 72925 }, { "epoch": 8.122285332442365, "grad_norm": 0.09698346257209778, "learning_rate": 5.18118931684512e-06, "loss": 0.4674, "num_input_tokens_seen": 88477584, "step": 72930 }, { "epoch": 8.122842187325983, "grad_norm": 0.11589310318231583, "learning_rate": 5.17822762827711e-06, "loss": 0.4654, "num_input_tokens_seen": 88482960, "step": 72935 }, { "epoch": 8.1233990422096, "grad_norm": 0.17304939031600952, "learning_rate": 5.175266688639177e-06, "loss": 0.457, "num_input_tokens_seen": 88489328, "step": 72940 }, { "epoch": 8.123955897093218, "grad_norm": 0.11826307326555252, "learning_rate": 5.1723064980432005e-06, "loss": 0.4753, "num_input_tokens_seen": 88495472, "step": 72945 }, { "epoch": 8.124512751976836, "grad_norm": 0.07009346783161163, "learning_rate": 5.169347056601032e-06, "loss": 0.4578, "num_input_tokens_seen": 88501040, "step": 72950 }, { "epoch": 8.125069606860452, "grad_norm": 0.16394054889678955, "learning_rate": 5.166388364424476e-06, "loss": 0.474, "num_input_tokens_seen": 88506864, "step": 72955 }, { "epoch": 8.12562646174407, "grad_norm": 0.09574231505393982, "learning_rate": 5.1634304216253345e-06, "loss": 0.4689, "num_input_tokens_seen": 88512976, "step": 72960 }, { "epoch": 8.126183316627687, "grad_norm": 0.11690543591976166, "learning_rate": 5.1604732283153544e-06, "loss": 0.4602, "num_input_tokens_seen": 88519408, "step": 72965 }, { "epoch": 8.126740171511305, "grad_norm": 0.12083937972784042, "learning_rate": 5.157516784606278e-06, "loss": 0.4684, "num_input_tokens_seen": 88525808, "step": 72970 }, { "epoch": 8.127297026394922, "grad_norm": 0.17377126216888428, "learning_rate": 5.1545610906098026e-06, "loss": 0.4605, "num_input_tokens_seen": 88532304, "step": 72975 }, { "epoch": 8.127853881278538, "grad_norm": 0.12159989774227142, "learning_rate": 5.151606146437607e-06, "loss": 0.462, "num_input_tokens_seen": 88538512, "step": 72980 }, { "epoch": 8.128410736162156, "grad_norm": 0.09527073055505753, "learning_rate": 5.1486519522013445e-06, "loss": 0.46, "num_input_tokens_seen": 88544592, "step": 72985 }, { "epoch": 8.128967591045773, "grad_norm": 0.09671381860971451, "learning_rate": 5.14569850801262e-06, "loss": 0.4577, "num_input_tokens_seen": 88550864, "step": 72990 }, { "epoch": 8.129524445929391, "grad_norm": 0.11695405095815659, "learning_rate": 5.142745813983041e-06, "loss": 0.444, "num_input_tokens_seen": 88556432, "step": 72995 }, { "epoch": 8.130081300813009, "grad_norm": 0.11942074447870255, "learning_rate": 5.139793870224152e-06, "loss": 0.4609, "num_input_tokens_seen": 88562640, "step": 73000 }, { "epoch": 8.130638155696625, "grad_norm": 0.09984876215457916, "learning_rate": 5.136842676847503e-06, "loss": 0.4694, "num_input_tokens_seen": 88568880, "step": 73005 }, { "epoch": 8.131195010580242, "grad_norm": 0.07640916109085083, "learning_rate": 5.133892233964588e-06, "loss": 0.4537, "num_input_tokens_seen": 88574256, "step": 73010 }, { "epoch": 8.13175186546386, "grad_norm": 0.10540967434644699, "learning_rate": 5.130942541686884e-06, "loss": 0.4677, "num_input_tokens_seen": 88580336, "step": 73015 }, { "epoch": 8.132308720347478, "grad_norm": 0.11107748746871948, "learning_rate": 5.127993600125855e-06, "loss": 0.4578, "num_input_tokens_seen": 88586352, "step": 73020 }, { "epoch": 8.132865575231095, "grad_norm": 0.1006515771150589, "learning_rate": 5.125045409392904e-06, "loss": 0.4575, "num_input_tokens_seen": 88592784, "step": 73025 }, { "epoch": 8.133422430114711, "grad_norm": 0.07688438892364502, "learning_rate": 5.122097969599432e-06, "loss": 0.4695, "num_input_tokens_seen": 88598928, "step": 73030 }, { "epoch": 8.133979284998329, "grad_norm": 0.09154544770717621, "learning_rate": 5.119151280856799e-06, "loss": 0.4504, "num_input_tokens_seen": 88604528, "step": 73035 }, { "epoch": 8.134536139881947, "grad_norm": 0.11621193587779999, "learning_rate": 5.116205343276345e-06, "loss": 0.4624, "num_input_tokens_seen": 88610768, "step": 73040 }, { "epoch": 8.135092994765564, "grad_norm": 0.12653011083602905, "learning_rate": 5.113260156969371e-06, "loss": 0.4588, "num_input_tokens_seen": 88617008, "step": 73045 }, { "epoch": 8.135649849649182, "grad_norm": 0.13318997621536255, "learning_rate": 5.110315722047154e-06, "loss": 0.4518, "num_input_tokens_seen": 88623312, "step": 73050 }, { "epoch": 8.136206704532798, "grad_norm": 0.10381109267473221, "learning_rate": 5.107372038620958e-06, "loss": 0.4622, "num_input_tokens_seen": 88628944, "step": 73055 }, { "epoch": 8.136763559416416, "grad_norm": 0.07786455750465393, "learning_rate": 5.104429106801989e-06, "loss": 0.461, "num_input_tokens_seen": 88634992, "step": 73060 }, { "epoch": 8.137320414300033, "grad_norm": 0.09858079254627228, "learning_rate": 5.101486926701451e-06, "loss": 0.4663, "num_input_tokens_seen": 88641392, "step": 73065 }, { "epoch": 8.137877269183651, "grad_norm": 0.08551347255706787, "learning_rate": 5.098545498430501e-06, "loss": 0.4557, "num_input_tokens_seen": 88647504, "step": 73070 }, { "epoch": 8.138434124067269, "grad_norm": 0.11136499047279358, "learning_rate": 5.0956048221002776e-06, "loss": 0.4679, "num_input_tokens_seen": 88652976, "step": 73075 }, { "epoch": 8.138990978950885, "grad_norm": 0.10530383884906769, "learning_rate": 5.0926648978219e-06, "loss": 0.4595, "num_input_tokens_seen": 88659024, "step": 73080 }, { "epoch": 8.139547833834502, "grad_norm": 0.1391333043575287, "learning_rate": 5.089725725706429e-06, "loss": 0.4576, "num_input_tokens_seen": 88665648, "step": 73085 }, { "epoch": 8.14010468871812, "grad_norm": 0.09701287001371384, "learning_rate": 5.086787305864934e-06, "loss": 0.4501, "num_input_tokens_seen": 88672048, "step": 73090 }, { "epoch": 8.140661543601738, "grad_norm": 0.12282901257276535, "learning_rate": 5.0838496384084226e-06, "loss": 0.451, "num_input_tokens_seen": 88678032, "step": 73095 }, { "epoch": 8.141218398485355, "grad_norm": 0.07939902693033218, "learning_rate": 5.080912723447903e-06, "loss": 0.4614, "num_input_tokens_seen": 88684208, "step": 73100 }, { "epoch": 8.141775253368973, "grad_norm": 0.08316657692193985, "learning_rate": 5.077976561094328e-06, "loss": 0.4693, "num_input_tokens_seen": 88690480, "step": 73105 }, { "epoch": 8.142332108252589, "grad_norm": 0.14503827691078186, "learning_rate": 5.0750411514586425e-06, "loss": 0.4734, "num_input_tokens_seen": 88696144, "step": 73110 }, { "epoch": 8.142888963136206, "grad_norm": 0.14095743000507355, "learning_rate": 5.07210649465176e-06, "loss": 0.461, "num_input_tokens_seen": 88702320, "step": 73115 }, { "epoch": 8.143445818019824, "grad_norm": 0.07921110838651657, "learning_rate": 5.069172590784552e-06, "loss": 0.4493, "num_input_tokens_seen": 88708336, "step": 73120 }, { "epoch": 8.144002672903442, "grad_norm": 0.10336504876613617, "learning_rate": 5.0662394399678795e-06, "loss": 0.4538, "num_input_tokens_seen": 88714576, "step": 73125 }, { "epoch": 8.14455952778706, "grad_norm": 0.08289201557636261, "learning_rate": 5.063307042312554e-06, "loss": 0.4593, "num_input_tokens_seen": 88720624, "step": 73130 }, { "epoch": 8.145116382670675, "grad_norm": 0.10872671008110046, "learning_rate": 5.060375397929387e-06, "loss": 0.4553, "num_input_tokens_seen": 88726736, "step": 73135 }, { "epoch": 8.145673237554293, "grad_norm": 0.1163305714726448, "learning_rate": 5.057444506929132e-06, "loss": 0.459, "num_input_tokens_seen": 88732240, "step": 73140 }, { "epoch": 8.14623009243791, "grad_norm": 0.07717987895011902, "learning_rate": 5.054514369422531e-06, "loss": 0.4525, "num_input_tokens_seen": 88738224, "step": 73145 }, { "epoch": 8.146786947321528, "grad_norm": 0.10009124875068665, "learning_rate": 5.051584985520302e-06, "loss": 0.4552, "num_input_tokens_seen": 88744368, "step": 73150 }, { "epoch": 8.147343802205146, "grad_norm": 0.14059332013130188, "learning_rate": 5.048656355333114e-06, "loss": 0.4541, "num_input_tokens_seen": 88750096, "step": 73155 }, { "epoch": 8.147900657088762, "grad_norm": 0.08704045414924622, "learning_rate": 5.0457284789716315e-06, "loss": 0.4474, "num_input_tokens_seen": 88756240, "step": 73160 }, { "epoch": 8.14845751197238, "grad_norm": 0.13252288103103638, "learning_rate": 5.04280135654647e-06, "loss": 0.4631, "num_input_tokens_seen": 88762160, "step": 73165 }, { "epoch": 8.149014366855997, "grad_norm": 0.08345163613557816, "learning_rate": 5.039874988168225e-06, "loss": 0.4643, "num_input_tokens_seen": 88768144, "step": 73170 }, { "epoch": 8.149571221739615, "grad_norm": 0.08993522822856903, "learning_rate": 5.036949373947478e-06, "loss": 0.4624, "num_input_tokens_seen": 88774000, "step": 73175 }, { "epoch": 8.150128076623233, "grad_norm": 0.1066136509180069, "learning_rate": 5.03402451399475e-06, "loss": 0.4546, "num_input_tokens_seen": 88780336, "step": 73180 }, { "epoch": 8.150684931506849, "grad_norm": 0.11042755097150803, "learning_rate": 5.0311004084205694e-06, "loss": 0.4545, "num_input_tokens_seen": 88785968, "step": 73185 }, { "epoch": 8.151241786390466, "grad_norm": 0.11433892697095871, "learning_rate": 5.028177057335401e-06, "loss": 0.4602, "num_input_tokens_seen": 88792080, "step": 73190 }, { "epoch": 8.151798641274084, "grad_norm": 0.09550630301237106, "learning_rate": 5.02525446084971e-06, "loss": 0.4765, "num_input_tokens_seen": 88798064, "step": 73195 }, { "epoch": 8.152355496157702, "grad_norm": 0.08410750329494476, "learning_rate": 5.022332619073914e-06, "loss": 0.4613, "num_input_tokens_seen": 88803824, "step": 73200 }, { "epoch": 8.15291235104132, "grad_norm": 0.12236388027667999, "learning_rate": 5.019411532118412e-06, "loss": 0.4594, "num_input_tokens_seen": 88810128, "step": 73205 }, { "epoch": 8.153469205924935, "grad_norm": 0.12073012441396713, "learning_rate": 5.0164912000935815e-06, "loss": 0.4595, "num_input_tokens_seen": 88816112, "step": 73210 }, { "epoch": 8.154026060808553, "grad_norm": 0.08580806106328964, "learning_rate": 5.013571623109745e-06, "loss": 0.4526, "num_input_tokens_seen": 88822160, "step": 73215 }, { "epoch": 8.15458291569217, "grad_norm": 0.10539811849594116, "learning_rate": 5.01065280127723e-06, "loss": 0.4689, "num_input_tokens_seen": 88828336, "step": 73220 }, { "epoch": 8.155139770575788, "grad_norm": 0.09899366647005081, "learning_rate": 5.007734734706304e-06, "loss": 0.4597, "num_input_tokens_seen": 88834256, "step": 73225 }, { "epoch": 8.155696625459406, "grad_norm": 0.09893109649419785, "learning_rate": 5.004817423507236e-06, "loss": 0.4592, "num_input_tokens_seen": 88839792, "step": 73230 }, { "epoch": 8.156253480343022, "grad_norm": 0.1167093887925148, "learning_rate": 5.001900867790235e-06, "loss": 0.467, "num_input_tokens_seen": 88845904, "step": 73235 }, { "epoch": 8.15681033522664, "grad_norm": 0.11917557567358017, "learning_rate": 4.99898506766551e-06, "loss": 0.4473, "num_input_tokens_seen": 88852112, "step": 73240 }, { "epoch": 8.157367190110257, "grad_norm": 0.1075114980340004, "learning_rate": 4.996070023243227e-06, "loss": 0.4615, "num_input_tokens_seen": 88858416, "step": 73245 }, { "epoch": 8.157924044993875, "grad_norm": 0.09055836498737335, "learning_rate": 4.99315573463352e-06, "loss": 0.4662, "num_input_tokens_seen": 88864720, "step": 73250 }, { "epoch": 8.158480899877492, "grad_norm": 0.08692238479852676, "learning_rate": 4.990242201946513e-06, "loss": 0.4554, "num_input_tokens_seen": 88870800, "step": 73255 }, { "epoch": 8.159037754761108, "grad_norm": 0.09327948093414307, "learning_rate": 4.9873294252922696e-06, "loss": 0.4559, "num_input_tokens_seen": 88876816, "step": 73260 }, { "epoch": 8.159594609644726, "grad_norm": 0.13928668200969696, "learning_rate": 4.984417404780864e-06, "loss": 0.4519, "num_input_tokens_seen": 88882864, "step": 73265 }, { "epoch": 8.160151464528344, "grad_norm": 0.1288745105266571, "learning_rate": 4.9815061405223046e-06, "loss": 0.4688, "num_input_tokens_seen": 88888784, "step": 73270 }, { "epoch": 8.160708319411961, "grad_norm": 0.09616976231336594, "learning_rate": 4.9785956326265955e-06, "loss": 0.452, "num_input_tokens_seen": 88894736, "step": 73275 }, { "epoch": 8.161265174295579, "grad_norm": 0.13934673368930817, "learning_rate": 4.975685881203709e-06, "loss": 0.4677, "num_input_tokens_seen": 88900496, "step": 73280 }, { "epoch": 8.161822029179197, "grad_norm": 0.10323207825422287, "learning_rate": 4.972776886363578e-06, "loss": 0.4653, "num_input_tokens_seen": 88906704, "step": 73285 }, { "epoch": 8.162378884062813, "grad_norm": 0.07762351632118225, "learning_rate": 4.969868648216122e-06, "loss": 0.4657, "num_input_tokens_seen": 88913072, "step": 73290 }, { "epoch": 8.16293573894643, "grad_norm": 0.1144428700208664, "learning_rate": 4.9669611668712115e-06, "loss": 0.4623, "num_input_tokens_seen": 88918992, "step": 73295 }, { "epoch": 8.163492593830048, "grad_norm": 0.13156402111053467, "learning_rate": 4.9640544424387065e-06, "loss": 0.4649, "num_input_tokens_seen": 88924720, "step": 73300 }, { "epoch": 8.164049448713666, "grad_norm": 0.10749537497758865, "learning_rate": 4.96114847502844e-06, "loss": 0.4524, "num_input_tokens_seen": 88930960, "step": 73305 }, { "epoch": 8.164606303597283, "grad_norm": 0.09791523218154907, "learning_rate": 4.958243264750204e-06, "loss": 0.4499, "num_input_tokens_seen": 88937040, "step": 73310 }, { "epoch": 8.1651631584809, "grad_norm": 0.10801797360181808, "learning_rate": 4.955338811713761e-06, "loss": 0.4546, "num_input_tokens_seen": 88943120, "step": 73315 }, { "epoch": 8.165720013364517, "grad_norm": 0.11199105530977249, "learning_rate": 4.95243511602885e-06, "loss": 0.4584, "num_input_tokens_seen": 88948848, "step": 73320 }, { "epoch": 8.166276868248135, "grad_norm": 0.10492213815450668, "learning_rate": 4.949532177805183e-06, "loss": 0.4609, "num_input_tokens_seen": 88955024, "step": 73325 }, { "epoch": 8.166833723131752, "grad_norm": 0.12438974529504776, "learning_rate": 4.9466299971524546e-06, "loss": 0.46, "num_input_tokens_seen": 88961360, "step": 73330 }, { "epoch": 8.16739057801537, "grad_norm": 0.11458783596754074, "learning_rate": 4.943728574180301e-06, "loss": 0.4701, "num_input_tokens_seen": 88967376, "step": 73335 }, { "epoch": 8.167947432898986, "grad_norm": 0.14501230418682098, "learning_rate": 4.940827908998363e-06, "loss": 0.4672, "num_input_tokens_seen": 88973648, "step": 73340 }, { "epoch": 8.168504287782604, "grad_norm": 0.1360797882080078, "learning_rate": 4.9379280017162214e-06, "loss": 0.465, "num_input_tokens_seen": 88979600, "step": 73345 }, { "epoch": 8.169061142666221, "grad_norm": 0.07645316421985626, "learning_rate": 4.935028852443454e-06, "loss": 0.4699, "num_input_tokens_seen": 88985328, "step": 73350 }, { "epoch": 8.169617997549839, "grad_norm": 0.08675078302621841, "learning_rate": 4.932130461289602e-06, "loss": 0.46, "num_input_tokens_seen": 88991504, "step": 73355 }, { "epoch": 8.170174852433457, "grad_norm": 0.10343556851148605, "learning_rate": 4.929232828364167e-06, "loss": 0.4677, "num_input_tokens_seen": 88997040, "step": 73360 }, { "epoch": 8.170731707317072, "grad_norm": 0.06944498419761658, "learning_rate": 4.926335953776642e-06, "loss": 0.4671, "num_input_tokens_seen": 89003024, "step": 73365 }, { "epoch": 8.17128856220069, "grad_norm": 0.15396501123905182, "learning_rate": 4.923439837636468e-06, "loss": 0.4604, "num_input_tokens_seen": 89009552, "step": 73370 }, { "epoch": 8.171845417084308, "grad_norm": 0.08189830929040909, "learning_rate": 4.9205444800530795e-06, "loss": 0.461, "num_input_tokens_seen": 89015504, "step": 73375 }, { "epoch": 8.172402271967925, "grad_norm": 0.08490562438964844, "learning_rate": 4.917649881135863e-06, "loss": 0.4602, "num_input_tokens_seen": 89021680, "step": 73380 }, { "epoch": 8.172959126851543, "grad_norm": 0.12139176577329636, "learning_rate": 4.914756040994192e-06, "loss": 0.4654, "num_input_tokens_seen": 89027248, "step": 73385 }, { "epoch": 8.173515981735159, "grad_norm": 0.09676926583051682, "learning_rate": 4.9118629597374094e-06, "loss": 0.4611, "num_input_tokens_seen": 89033424, "step": 73390 }, { "epoch": 8.174072836618777, "grad_norm": 0.12012586742639542, "learning_rate": 4.908970637474814e-06, "loss": 0.4617, "num_input_tokens_seen": 89039568, "step": 73395 }, { "epoch": 8.174629691502394, "grad_norm": 0.09085578471422195, "learning_rate": 4.906079074315697e-06, "loss": 0.4445, "num_input_tokens_seen": 89045904, "step": 73400 }, { "epoch": 8.175186546386012, "grad_norm": 0.0808248296380043, "learning_rate": 4.903188270369305e-06, "loss": 0.4613, "num_input_tokens_seen": 89052080, "step": 73405 }, { "epoch": 8.17574340126963, "grad_norm": 0.1091272160410881, "learning_rate": 4.900298225744867e-06, "loss": 0.4645, "num_input_tokens_seen": 89058160, "step": 73410 }, { "epoch": 8.176300256153246, "grad_norm": 0.10725010186433792, "learning_rate": 4.897408940551568e-06, "loss": 0.4635, "num_input_tokens_seen": 89064400, "step": 73415 }, { "epoch": 8.176857111036863, "grad_norm": 0.1140279546380043, "learning_rate": 4.894520414898582e-06, "loss": 0.4493, "num_input_tokens_seen": 89070160, "step": 73420 }, { "epoch": 8.177413965920481, "grad_norm": 0.11955484002828598, "learning_rate": 4.89163264889505e-06, "loss": 0.4631, "num_input_tokens_seen": 89076144, "step": 73425 }, { "epoch": 8.177970820804099, "grad_norm": 0.10299022495746613, "learning_rate": 4.8887456426500746e-06, "loss": 0.4631, "num_input_tokens_seen": 89081968, "step": 73430 }, { "epoch": 8.178527675687716, "grad_norm": 0.0812901183962822, "learning_rate": 4.8858593962727435e-06, "loss": 0.4669, "num_input_tokens_seen": 89087504, "step": 73435 }, { "epoch": 8.179084530571334, "grad_norm": 0.08535248041152954, "learning_rate": 4.8829739098720964e-06, "loss": 0.4659, "num_input_tokens_seen": 89093616, "step": 73440 }, { "epoch": 8.17964138545495, "grad_norm": 0.12144453078508377, "learning_rate": 4.8800891835571684e-06, "loss": 0.4611, "num_input_tokens_seen": 89099664, "step": 73445 }, { "epoch": 8.180198240338568, "grad_norm": 0.1289394348859787, "learning_rate": 4.877205217436942e-06, "loss": 0.4532, "num_input_tokens_seen": 89105424, "step": 73450 }, { "epoch": 8.180755095222185, "grad_norm": 0.13573569059371948, "learning_rate": 4.874322011620391e-06, "loss": 0.4715, "num_input_tokens_seen": 89111632, "step": 73455 }, { "epoch": 8.181311950105803, "grad_norm": 0.130978062748909, "learning_rate": 4.871439566216457e-06, "loss": 0.4566, "num_input_tokens_seen": 89117776, "step": 73460 }, { "epoch": 8.18186880498942, "grad_norm": 0.10954932123422623, "learning_rate": 4.868557881334032e-06, "loss": 0.4542, "num_input_tokens_seen": 89123568, "step": 73465 }, { "epoch": 8.182425659873036, "grad_norm": 0.13223785161972046, "learning_rate": 4.865676957082013e-06, "loss": 0.4646, "num_input_tokens_seen": 89130128, "step": 73470 }, { "epoch": 8.182982514756654, "grad_norm": 0.1198173463344574, "learning_rate": 4.862796793569236e-06, "loss": 0.4476, "num_input_tokens_seen": 89136176, "step": 73475 }, { "epoch": 8.183539369640272, "grad_norm": 0.15093566477298737, "learning_rate": 4.859917390904528e-06, "loss": 0.4479, "num_input_tokens_seen": 89142128, "step": 73480 }, { "epoch": 8.18409622452389, "grad_norm": 0.09450642019510269, "learning_rate": 4.8570387491966905e-06, "loss": 0.4535, "num_input_tokens_seen": 89148208, "step": 73485 }, { "epoch": 8.184653079407507, "grad_norm": 0.10304556787014008, "learning_rate": 4.8541608685544745e-06, "loss": 0.4743, "num_input_tokens_seen": 89154192, "step": 73490 }, { "epoch": 8.185209934291123, "grad_norm": 0.0857703909277916, "learning_rate": 4.851283749086627e-06, "loss": 0.4483, "num_input_tokens_seen": 89160208, "step": 73495 }, { "epoch": 8.18576678917474, "grad_norm": 0.12927861511707306, "learning_rate": 4.848407390901841e-06, "loss": 0.4537, "num_input_tokens_seen": 89166160, "step": 73500 }, { "epoch": 8.186323644058358, "grad_norm": 0.08866923302412033, "learning_rate": 4.845531794108813e-06, "loss": 0.4669, "num_input_tokens_seen": 89172272, "step": 73505 }, { "epoch": 8.186880498941976, "grad_norm": 0.142713725566864, "learning_rate": 4.8426569588161715e-06, "loss": 0.4442, "num_input_tokens_seen": 89178064, "step": 73510 }, { "epoch": 8.187437353825594, "grad_norm": 0.10023153573274612, "learning_rate": 4.839782885132549e-06, "loss": 0.4583, "num_input_tokens_seen": 89184400, "step": 73515 }, { "epoch": 8.18799420870921, "grad_norm": 0.09429062157869339, "learning_rate": 4.836909573166543e-06, "loss": 0.4526, "num_input_tokens_seen": 89190448, "step": 73520 }, { "epoch": 8.188551063592827, "grad_norm": 0.1613158881664276, "learning_rate": 4.834037023026703e-06, "loss": 0.4659, "num_input_tokens_seen": 89196560, "step": 73525 }, { "epoch": 8.189107918476445, "grad_norm": 0.0798179879784584, "learning_rate": 4.831165234821575e-06, "loss": 0.4505, "num_input_tokens_seen": 89202704, "step": 73530 }, { "epoch": 8.189664773360063, "grad_norm": 0.10223336517810822, "learning_rate": 4.828294208659651e-06, "loss": 0.462, "num_input_tokens_seen": 89209008, "step": 73535 }, { "epoch": 8.19022162824368, "grad_norm": 0.14893625676631927, "learning_rate": 4.825423944649424e-06, "loss": 0.4667, "num_input_tokens_seen": 89214832, "step": 73540 }, { "epoch": 8.190778483127296, "grad_norm": 0.11678766459226608, "learning_rate": 4.8225544428993244e-06, "loss": 0.4811, "num_input_tokens_seen": 89221008, "step": 73545 }, { "epoch": 8.191335338010914, "grad_norm": 0.1023014709353447, "learning_rate": 4.819685703517779e-06, "loss": 0.4581, "num_input_tokens_seen": 89227152, "step": 73550 }, { "epoch": 8.191892192894532, "grad_norm": 0.11871665716171265, "learning_rate": 4.816817726613188e-06, "loss": 0.4646, "num_input_tokens_seen": 89233168, "step": 73555 }, { "epoch": 8.19244904777815, "grad_norm": 0.10098248720169067, "learning_rate": 4.813950512293894e-06, "loss": 0.4716, "num_input_tokens_seen": 89239504, "step": 73560 }, { "epoch": 8.193005902661767, "grad_norm": 0.08634566515684128, "learning_rate": 4.811084060668247e-06, "loss": 0.4664, "num_input_tokens_seen": 89245168, "step": 73565 }, { "epoch": 8.193562757545383, "grad_norm": 0.10135854035615921, "learning_rate": 4.8082183718445365e-06, "loss": 0.4482, "num_input_tokens_seen": 89251376, "step": 73570 }, { "epoch": 8.194119612429, "grad_norm": 0.08995011448860168, "learning_rate": 4.8053534459310436e-06, "loss": 0.4614, "num_input_tokens_seen": 89257456, "step": 73575 }, { "epoch": 8.194676467312618, "grad_norm": 0.11591631174087524, "learning_rate": 4.802489283036021e-06, "loss": 0.4609, "num_input_tokens_seen": 89263728, "step": 73580 }, { "epoch": 8.195233322196236, "grad_norm": 0.12915265560150146, "learning_rate": 4.799625883267672e-06, "loss": 0.4618, "num_input_tokens_seen": 89269584, "step": 73585 }, { "epoch": 8.195790177079854, "grad_norm": 0.10517793148756027, "learning_rate": 4.796763246734201e-06, "loss": 0.4526, "num_input_tokens_seen": 89275856, "step": 73590 }, { "epoch": 8.19634703196347, "grad_norm": 0.11844291538000107, "learning_rate": 4.793901373543752e-06, "loss": 0.4622, "num_input_tokens_seen": 89281936, "step": 73595 }, { "epoch": 8.196903886847087, "grad_norm": 0.0915137231349945, "learning_rate": 4.791040263804467e-06, "loss": 0.4607, "num_input_tokens_seen": 89288112, "step": 73600 }, { "epoch": 8.197460741730705, "grad_norm": 0.1311943233013153, "learning_rate": 4.788179917624439e-06, "loss": 0.4709, "num_input_tokens_seen": 89294640, "step": 73605 }, { "epoch": 8.198017596614322, "grad_norm": 0.09937011450529099, "learning_rate": 4.785320335111745e-06, "loss": 0.4603, "num_input_tokens_seen": 89300976, "step": 73610 }, { "epoch": 8.19857445149794, "grad_norm": 0.11479145288467407, "learning_rate": 4.782461516374437e-06, "loss": 0.4571, "num_input_tokens_seen": 89307120, "step": 73615 }, { "epoch": 8.199131306381556, "grad_norm": 0.0921950414776802, "learning_rate": 4.779603461520516e-06, "loss": 0.465, "num_input_tokens_seen": 89313296, "step": 73620 }, { "epoch": 8.199688161265174, "grad_norm": 0.09160640090703964, "learning_rate": 4.776746170657984e-06, "loss": 0.4484, "num_input_tokens_seen": 89319760, "step": 73625 }, { "epoch": 8.200245016148791, "grad_norm": 0.11390510201454163, "learning_rate": 4.77388964389478e-06, "loss": 0.4603, "num_input_tokens_seen": 89326032, "step": 73630 }, { "epoch": 8.200801871032409, "grad_norm": 0.09904573112726212, "learning_rate": 4.771033881338851e-06, "loss": 0.4648, "num_input_tokens_seen": 89331984, "step": 73635 }, { "epoch": 8.201358725916027, "grad_norm": 0.08963317424058914, "learning_rate": 4.768178883098085e-06, "loss": 0.4581, "num_input_tokens_seen": 89338064, "step": 73640 }, { "epoch": 8.201915580799644, "grad_norm": 0.08995340019464493, "learning_rate": 4.765324649280353e-06, "loss": 0.4722, "num_input_tokens_seen": 89344400, "step": 73645 }, { "epoch": 8.20247243568326, "grad_norm": 0.09963890165090561, "learning_rate": 4.762471179993511e-06, "loss": 0.4655, "num_input_tokens_seen": 89350672, "step": 73650 }, { "epoch": 8.203029290566878, "grad_norm": 0.1482824981212616, "learning_rate": 4.7596184753453535e-06, "loss": 0.4685, "num_input_tokens_seen": 89356912, "step": 73655 }, { "epoch": 8.203586145450496, "grad_norm": 0.11500176787376404, "learning_rate": 4.7567665354436794e-06, "loss": 0.4617, "num_input_tokens_seen": 89362576, "step": 73660 }, { "epoch": 8.204143000334113, "grad_norm": 0.09962356835603714, "learning_rate": 4.753915360396235e-06, "loss": 0.4691, "num_input_tokens_seen": 89368816, "step": 73665 }, { "epoch": 8.204699855217731, "grad_norm": 0.11571960151195526, "learning_rate": 4.751064950310746e-06, "loss": 0.4621, "num_input_tokens_seen": 89375120, "step": 73670 }, { "epoch": 8.205256710101347, "grad_norm": 0.0772579237818718, "learning_rate": 4.748215305294923e-06, "loss": 0.457, "num_input_tokens_seen": 89381296, "step": 73675 }, { "epoch": 8.205813564984965, "grad_norm": 0.10713432729244232, "learning_rate": 4.745366425456419e-06, "loss": 0.4668, "num_input_tokens_seen": 89387440, "step": 73680 }, { "epoch": 8.206370419868582, "grad_norm": 0.09033747762441635, "learning_rate": 4.7425183109028885e-06, "loss": 0.4542, "num_input_tokens_seen": 89393392, "step": 73685 }, { "epoch": 8.2069272747522, "grad_norm": 0.12826833128929138, "learning_rate": 4.7396709617419235e-06, "loss": 0.4528, "num_input_tokens_seen": 89399568, "step": 73690 }, { "epoch": 8.207484129635818, "grad_norm": 0.14314648509025574, "learning_rate": 4.736824378081128e-06, "loss": 0.4616, "num_input_tokens_seen": 89405488, "step": 73695 }, { "epoch": 8.208040984519434, "grad_norm": 0.12196376919746399, "learning_rate": 4.733978560028035e-06, "loss": 0.4638, "num_input_tokens_seen": 89411600, "step": 73700 }, { "epoch": 8.208597839403051, "grad_norm": 0.1739315539598465, "learning_rate": 4.73113350769018e-06, "loss": 0.4692, "num_input_tokens_seen": 89418224, "step": 73705 }, { "epoch": 8.209154694286669, "grad_norm": 0.11489006131887436, "learning_rate": 4.728289221175061e-06, "loss": 0.4619, "num_input_tokens_seen": 89424272, "step": 73710 }, { "epoch": 8.209711549170287, "grad_norm": 0.12610545754432678, "learning_rate": 4.725445700590131e-06, "loss": 0.4655, "num_input_tokens_seen": 89430416, "step": 73715 }, { "epoch": 8.210268404053904, "grad_norm": 0.11089236289262772, "learning_rate": 4.72260294604285e-06, "loss": 0.4577, "num_input_tokens_seen": 89436464, "step": 73720 }, { "epoch": 8.21082525893752, "grad_norm": 0.11656639724969864, "learning_rate": 4.719760957640598e-06, "loss": 0.4621, "num_input_tokens_seen": 89442416, "step": 73725 }, { "epoch": 8.211382113821138, "grad_norm": 0.1291866898536682, "learning_rate": 4.7169197354907715e-06, "loss": 0.467, "num_input_tokens_seen": 89448816, "step": 73730 }, { "epoch": 8.211938968704755, "grad_norm": 0.0951065644621849, "learning_rate": 4.71407927970072e-06, "loss": 0.4537, "num_input_tokens_seen": 89454928, "step": 73735 }, { "epoch": 8.212495823588373, "grad_norm": 0.10351348668336868, "learning_rate": 4.711239590377759e-06, "loss": 0.473, "num_input_tokens_seen": 89460944, "step": 73740 }, { "epoch": 8.21305267847199, "grad_norm": 0.13481290638446808, "learning_rate": 4.70840066762919e-06, "loss": 0.4638, "num_input_tokens_seen": 89467024, "step": 73745 }, { "epoch": 8.213609533355607, "grad_norm": 0.08649522811174393, "learning_rate": 4.705562511562264e-06, "loss": 0.4473, "num_input_tokens_seen": 89473296, "step": 73750 }, { "epoch": 8.214166388239224, "grad_norm": 0.10357899218797684, "learning_rate": 4.702725122284224e-06, "loss": 0.461, "num_input_tokens_seen": 89479504, "step": 73755 }, { "epoch": 8.214723243122842, "grad_norm": 0.11798102408647537, "learning_rate": 4.699888499902283e-06, "loss": 0.4528, "num_input_tokens_seen": 89485552, "step": 73760 }, { "epoch": 8.21528009800646, "grad_norm": 0.17795802652835846, "learning_rate": 4.6970526445236e-06, "loss": 0.4626, "num_input_tokens_seen": 89491888, "step": 73765 }, { "epoch": 8.215836952890077, "grad_norm": 0.09873607009649277, "learning_rate": 4.694217556255339e-06, "loss": 0.4576, "num_input_tokens_seen": 89497872, "step": 73770 }, { "epoch": 8.216393807773693, "grad_norm": 0.11290731281042099, "learning_rate": 4.6913832352046075e-06, "loss": 0.4572, "num_input_tokens_seen": 89503824, "step": 73775 }, { "epoch": 8.216950662657311, "grad_norm": 0.10463976860046387, "learning_rate": 4.688549681478504e-06, "loss": 0.4581, "num_input_tokens_seen": 89509488, "step": 73780 }, { "epoch": 8.217507517540929, "grad_norm": 0.10438380390405655, "learning_rate": 4.685716895184078e-06, "loss": 0.4722, "num_input_tokens_seen": 89515664, "step": 73785 }, { "epoch": 8.218064372424546, "grad_norm": 0.0762004628777504, "learning_rate": 4.68288487642837e-06, "loss": 0.455, "num_input_tokens_seen": 89521808, "step": 73790 }, { "epoch": 8.218621227308164, "grad_norm": 0.13264703750610352, "learning_rate": 4.680053625318387e-06, "loss": 0.4719, "num_input_tokens_seen": 89528080, "step": 73795 }, { "epoch": 8.219178082191782, "grad_norm": 0.0889596939086914, "learning_rate": 4.6772231419610875e-06, "loss": 0.4582, "num_input_tokens_seen": 89534288, "step": 73800 }, { "epoch": 8.219734937075398, "grad_norm": 0.10398568958044052, "learning_rate": 4.674393426463433e-06, "loss": 0.4791, "num_input_tokens_seen": 89540560, "step": 73805 }, { "epoch": 8.220291791959015, "grad_norm": 0.1173284500837326, "learning_rate": 4.671564478932328e-06, "loss": 0.4566, "num_input_tokens_seen": 89546672, "step": 73810 }, { "epoch": 8.220848646842633, "grad_norm": 0.10859620571136475, "learning_rate": 4.668736299474666e-06, "loss": 0.4713, "num_input_tokens_seen": 89552688, "step": 73815 }, { "epoch": 8.22140550172625, "grad_norm": 0.10617440938949585, "learning_rate": 4.665908888197296e-06, "loss": 0.4668, "num_input_tokens_seen": 89559056, "step": 73820 }, { "epoch": 8.221962356609868, "grad_norm": 0.11080915480852127, "learning_rate": 4.6630822452070514e-06, "loss": 0.458, "num_input_tokens_seen": 89565584, "step": 73825 }, { "epoch": 8.222519211493484, "grad_norm": 0.08283654600381851, "learning_rate": 4.660256370610738e-06, "loss": 0.446, "num_input_tokens_seen": 89571632, "step": 73830 }, { "epoch": 8.223076066377102, "grad_norm": 0.10236509889364243, "learning_rate": 4.657431264515113e-06, "loss": 0.4527, "num_input_tokens_seen": 89577808, "step": 73835 }, { "epoch": 8.22363292126072, "grad_norm": 0.08392573893070221, "learning_rate": 4.654606927026936e-06, "loss": 0.4602, "num_input_tokens_seen": 89583856, "step": 73840 }, { "epoch": 8.224189776144337, "grad_norm": 0.09118791669607162, "learning_rate": 4.6517833582529e-06, "loss": 0.465, "num_input_tokens_seen": 89589776, "step": 73845 }, { "epoch": 8.224746631027955, "grad_norm": 0.0845155194401741, "learning_rate": 4.648960558299701e-06, "loss": 0.464, "num_input_tokens_seen": 89596048, "step": 73850 }, { "epoch": 8.22530348591157, "grad_norm": 0.10362044721841812, "learning_rate": 4.646138527273994e-06, "loss": 0.4654, "num_input_tokens_seen": 89602288, "step": 73855 }, { "epoch": 8.225860340795188, "grad_norm": 0.11240285634994507, "learning_rate": 4.643317265282396e-06, "loss": 0.4665, "num_input_tokens_seen": 89608656, "step": 73860 }, { "epoch": 8.226417195678806, "grad_norm": 0.09713151305913925, "learning_rate": 4.640496772431513e-06, "loss": 0.4586, "num_input_tokens_seen": 89614672, "step": 73865 }, { "epoch": 8.226974050562424, "grad_norm": 0.0938548743724823, "learning_rate": 4.637677048827901e-06, "loss": 0.4608, "num_input_tokens_seen": 89620720, "step": 73870 }, { "epoch": 8.227530905446041, "grad_norm": 0.1030145138502121, "learning_rate": 4.634858094578109e-06, "loss": 0.4641, "num_input_tokens_seen": 89626512, "step": 73875 }, { "epoch": 8.228087760329657, "grad_norm": 0.11488289386034012, "learning_rate": 4.632039909788638e-06, "loss": 0.4631, "num_input_tokens_seen": 89632816, "step": 73880 }, { "epoch": 8.228644615213275, "grad_norm": 0.075349360704422, "learning_rate": 4.6292224945659715e-06, "loss": 0.4599, "num_input_tokens_seen": 89638992, "step": 73885 }, { "epoch": 8.229201470096893, "grad_norm": 0.1669316440820694, "learning_rate": 4.626405849016566e-06, "loss": 0.4806, "num_input_tokens_seen": 89644976, "step": 73890 }, { "epoch": 8.22975832498051, "grad_norm": 0.09305083006620407, "learning_rate": 4.6235899732468305e-06, "loss": 0.4585, "num_input_tokens_seen": 89651152, "step": 73895 }, { "epoch": 8.230315179864128, "grad_norm": 0.09336686879396439, "learning_rate": 4.620774867363173e-06, "loss": 0.4679, "num_input_tokens_seen": 89657104, "step": 73900 }, { "epoch": 8.230872034747744, "grad_norm": 0.09404506534337997, "learning_rate": 4.61796053147194e-06, "loss": 0.46, "num_input_tokens_seen": 89663472, "step": 73905 }, { "epoch": 8.231428889631362, "grad_norm": 0.13190360367298126, "learning_rate": 4.615146965679487e-06, "loss": 0.4641, "num_input_tokens_seen": 89669744, "step": 73910 }, { "epoch": 8.23198574451498, "grad_norm": 0.12589102983474731, "learning_rate": 4.612334170092098e-06, "loss": 0.4481, "num_input_tokens_seen": 89675888, "step": 73915 }, { "epoch": 8.232542599398597, "grad_norm": 0.12553000450134277, "learning_rate": 4.6095221448160616e-06, "loss": 0.4632, "num_input_tokens_seen": 89682032, "step": 73920 }, { "epoch": 8.233099454282215, "grad_norm": 0.09482096135616302, "learning_rate": 4.606710889957627e-06, "loss": 0.4673, "num_input_tokens_seen": 89687920, "step": 73925 }, { "epoch": 8.23365630916583, "grad_norm": 0.10938151925802231, "learning_rate": 4.603900405623004e-06, "loss": 0.4502, "num_input_tokens_seen": 89694128, "step": 73930 }, { "epoch": 8.234213164049448, "grad_norm": 0.08858329057693481, "learning_rate": 4.601090691918391e-06, "loss": 0.4663, "num_input_tokens_seen": 89700208, "step": 73935 }, { "epoch": 8.234770018933066, "grad_norm": 0.12255796790122986, "learning_rate": 4.59828174894994e-06, "loss": 0.4605, "num_input_tokens_seen": 89705968, "step": 73940 }, { "epoch": 8.235326873816684, "grad_norm": 0.12296643108129501, "learning_rate": 4.595473576823789e-06, "loss": 0.4574, "num_input_tokens_seen": 89711792, "step": 73945 }, { "epoch": 8.235883728700301, "grad_norm": 0.1287839710712433, "learning_rate": 4.592666175646029e-06, "loss": 0.4577, "num_input_tokens_seen": 89717104, "step": 73950 }, { "epoch": 8.236440583583917, "grad_norm": 0.0949145257472992, "learning_rate": 4.58985954552274e-06, "loss": 0.4671, "num_input_tokens_seen": 89723152, "step": 73955 }, { "epoch": 8.236997438467535, "grad_norm": 0.10145330429077148, "learning_rate": 4.587053686559972e-06, "loss": 0.454, "num_input_tokens_seen": 89729008, "step": 73960 }, { "epoch": 8.237554293351153, "grad_norm": 0.09877263009548187, "learning_rate": 4.584248598863725e-06, "loss": 0.4565, "num_input_tokens_seen": 89735152, "step": 73965 }, { "epoch": 8.23811114823477, "grad_norm": 0.0788571760058403, "learning_rate": 4.581444282539998e-06, "loss": 0.4598, "num_input_tokens_seen": 89740880, "step": 73970 }, { "epoch": 8.238668003118388, "grad_norm": 0.09790311008691788, "learning_rate": 4.578640737694737e-06, "loss": 0.4638, "num_input_tokens_seen": 89746992, "step": 73975 }, { "epoch": 8.239224858002006, "grad_norm": 0.10301369428634644, "learning_rate": 4.5758379644338684e-06, "loss": 0.4555, "num_input_tokens_seen": 89753072, "step": 73980 }, { "epoch": 8.239781712885621, "grad_norm": 0.10888594388961792, "learning_rate": 4.573035962863301e-06, "loss": 0.4638, "num_input_tokens_seen": 89759312, "step": 73985 }, { "epoch": 8.24033856776924, "grad_norm": 0.1090613305568695, "learning_rate": 4.570234733088891e-06, "loss": 0.4662, "num_input_tokens_seen": 89765456, "step": 73990 }, { "epoch": 8.240895422652857, "grad_norm": 0.09840360283851624, "learning_rate": 4.567434275216489e-06, "loss": 0.4765, "num_input_tokens_seen": 89771344, "step": 73995 }, { "epoch": 8.241452277536474, "grad_norm": 0.1375763863325119, "learning_rate": 4.564634589351893e-06, "loss": 0.4604, "num_input_tokens_seen": 89777360, "step": 74000 }, { "epoch": 8.242009132420092, "grad_norm": 0.16921402513980865, "learning_rate": 4.561835675600898e-06, "loss": 0.4755, "num_input_tokens_seen": 89783120, "step": 74005 }, { "epoch": 8.242565987303708, "grad_norm": 0.10184983909130096, "learning_rate": 4.559037534069241e-06, "loss": 0.4602, "num_input_tokens_seen": 89789136, "step": 74010 }, { "epoch": 8.243122842187326, "grad_norm": 0.22872626781463623, "learning_rate": 4.5562401648626515e-06, "loss": 0.4726, "num_input_tokens_seen": 89794960, "step": 74015 }, { "epoch": 8.243679697070943, "grad_norm": 0.11531416326761246, "learning_rate": 4.553443568086832e-06, "loss": 0.4522, "num_input_tokens_seen": 89801488, "step": 74020 }, { "epoch": 8.244236551954561, "grad_norm": 0.10371938347816467, "learning_rate": 4.55064774384743e-06, "loss": 0.4761, "num_input_tokens_seen": 89807152, "step": 74025 }, { "epoch": 8.244793406838179, "grad_norm": 0.16637329757213593, "learning_rate": 4.547852692250098e-06, "loss": 0.4624, "num_input_tokens_seen": 89813488, "step": 74030 }, { "epoch": 8.245350261721795, "grad_norm": 0.13721683621406555, "learning_rate": 4.545058413400427e-06, "loss": 0.467, "num_input_tokens_seen": 89819536, "step": 74035 }, { "epoch": 8.245907116605412, "grad_norm": 0.1112234815955162, "learning_rate": 4.542264907404004e-06, "loss": 0.4583, "num_input_tokens_seen": 89825712, "step": 74040 }, { "epoch": 8.24646397148903, "grad_norm": 0.10846862941980362, "learning_rate": 4.539472174366368e-06, "loss": 0.4622, "num_input_tokens_seen": 89831664, "step": 74045 }, { "epoch": 8.247020826372648, "grad_norm": 0.11428897827863693, "learning_rate": 4.536680214393043e-06, "loss": 0.4631, "num_input_tokens_seen": 89837936, "step": 74050 }, { "epoch": 8.247577681256265, "grad_norm": 0.11676350235939026, "learning_rate": 4.533889027589525e-06, "loss": 0.4577, "num_input_tokens_seen": 89843600, "step": 74055 }, { "epoch": 8.248134536139881, "grad_norm": 0.11137789487838745, "learning_rate": 4.531098614061257e-06, "loss": 0.4637, "num_input_tokens_seen": 89849712, "step": 74060 }, { "epoch": 8.248691391023499, "grad_norm": 0.09952510893344879, "learning_rate": 4.5283089739136866e-06, "loss": 0.4634, "num_input_tokens_seen": 89855760, "step": 74065 }, { "epoch": 8.249248245907117, "grad_norm": 0.09203039854764938, "learning_rate": 4.525520107252201e-06, "loss": 0.4609, "num_input_tokens_seen": 89861840, "step": 74070 }, { "epoch": 8.249805100790734, "grad_norm": 0.10450298339128494, "learning_rate": 4.522732014182182e-06, "loss": 0.4534, "num_input_tokens_seen": 89868400, "step": 74075 }, { "epoch": 8.250361955674352, "grad_norm": 0.12218458205461502, "learning_rate": 4.519944694808975e-06, "loss": 0.459, "num_input_tokens_seen": 89874544, "step": 74080 }, { "epoch": 8.250918810557968, "grad_norm": 0.11965735256671906, "learning_rate": 4.517158149237883e-06, "loss": 0.4661, "num_input_tokens_seen": 89880528, "step": 74085 }, { "epoch": 8.251475665441586, "grad_norm": 0.12128366529941559, "learning_rate": 4.514372377574202e-06, "loss": 0.4594, "num_input_tokens_seen": 89886128, "step": 74090 }, { "epoch": 8.252032520325203, "grad_norm": 0.09403284639120102, "learning_rate": 4.511587379923176e-06, "loss": 0.4639, "num_input_tokens_seen": 89892368, "step": 74095 }, { "epoch": 8.25258937520882, "grad_norm": 0.13795028626918793, "learning_rate": 4.5088031563900425e-06, "loss": 0.4705, "num_input_tokens_seen": 89898192, "step": 74100 }, { "epoch": 8.253146230092439, "grad_norm": 0.08812073618173599, "learning_rate": 4.506019707079989e-06, "loss": 0.4562, "num_input_tokens_seen": 89904208, "step": 74105 }, { "epoch": 8.253703084976054, "grad_norm": 0.0925627052783966, "learning_rate": 4.503237032098185e-06, "loss": 0.4667, "num_input_tokens_seen": 89910352, "step": 74110 }, { "epoch": 8.254259939859672, "grad_norm": 0.12861932814121246, "learning_rate": 4.500455131549777e-06, "loss": 0.4653, "num_input_tokens_seen": 89916272, "step": 74115 }, { "epoch": 8.25481679474329, "grad_norm": 0.13365483283996582, "learning_rate": 4.497674005539865e-06, "loss": 0.4627, "num_input_tokens_seen": 89922256, "step": 74120 }, { "epoch": 8.255373649626907, "grad_norm": 0.10546280443668365, "learning_rate": 4.494893654173535e-06, "loss": 0.4593, "num_input_tokens_seen": 89928368, "step": 74125 }, { "epoch": 8.255930504510525, "grad_norm": 0.1224953830242157, "learning_rate": 4.492114077555831e-06, "loss": 0.4619, "num_input_tokens_seen": 89934320, "step": 74130 }, { "epoch": 8.256487359394143, "grad_norm": 0.08478984236717224, "learning_rate": 4.4893352757917815e-06, "loss": 0.4629, "num_input_tokens_seen": 89940176, "step": 74135 }, { "epoch": 8.257044214277759, "grad_norm": 0.13063891232013702, "learning_rate": 4.486557248986379e-06, "loss": 0.4685, "num_input_tokens_seen": 89945456, "step": 74140 }, { "epoch": 8.257601069161376, "grad_norm": 0.12298556417226791, "learning_rate": 4.483779997244572e-06, "loss": 0.4687, "num_input_tokens_seen": 89951120, "step": 74145 }, { "epoch": 8.258157924044994, "grad_norm": 0.10224727541208267, "learning_rate": 4.481003520671312e-06, "loss": 0.4676, "num_input_tokens_seen": 89957136, "step": 74150 }, { "epoch": 8.258714778928612, "grad_norm": 0.09861721098423004, "learning_rate": 4.478227819371489e-06, "loss": 0.4664, "num_input_tokens_seen": 89963536, "step": 74155 }, { "epoch": 8.25927163381223, "grad_norm": 0.1207265853881836, "learning_rate": 4.475452893449983e-06, "loss": 0.4587, "num_input_tokens_seen": 89968944, "step": 74160 }, { "epoch": 8.259828488695845, "grad_norm": 0.12576396763324738, "learning_rate": 4.472678743011646e-06, "loss": 0.4609, "num_input_tokens_seen": 89975024, "step": 74165 }, { "epoch": 8.260385343579463, "grad_norm": 0.12332531064748764, "learning_rate": 4.469905368161287e-06, "loss": 0.466, "num_input_tokens_seen": 89981168, "step": 74170 }, { "epoch": 8.26094219846308, "grad_norm": 0.18058860301971436, "learning_rate": 4.467132769003699e-06, "loss": 0.458, "num_input_tokens_seen": 89987152, "step": 74175 }, { "epoch": 8.261499053346698, "grad_norm": 0.09728934615850449, "learning_rate": 4.464360945643628e-06, "loss": 0.4618, "num_input_tokens_seen": 89993264, "step": 74180 }, { "epoch": 8.262055908230316, "grad_norm": 0.12279266864061356, "learning_rate": 4.46158989818582e-06, "loss": 0.4595, "num_input_tokens_seen": 89998832, "step": 74185 }, { "epoch": 8.262612763113932, "grad_norm": 0.10394876450300217, "learning_rate": 4.4588196267349555e-06, "loss": 0.4519, "num_input_tokens_seen": 90004880, "step": 74190 }, { "epoch": 8.26316961799755, "grad_norm": 0.09105370193719864, "learning_rate": 4.456050131395714e-06, "loss": 0.4638, "num_input_tokens_seen": 90010608, "step": 74195 }, { "epoch": 8.263726472881167, "grad_norm": 0.11842416971921921, "learning_rate": 4.453281412272739e-06, "loss": 0.457, "num_input_tokens_seen": 90016656, "step": 74200 }, { "epoch": 8.264283327764785, "grad_norm": 0.11009465903043747, "learning_rate": 4.450513469470635e-06, "loss": 0.4677, "num_input_tokens_seen": 90022768, "step": 74205 }, { "epoch": 8.264840182648403, "grad_norm": 0.11536058783531189, "learning_rate": 4.447746303093989e-06, "loss": 0.46, "num_input_tokens_seen": 90028944, "step": 74210 }, { "epoch": 8.265397037532018, "grad_norm": 0.13699494302272797, "learning_rate": 4.444979913247346e-06, "loss": 0.4586, "num_input_tokens_seen": 90035120, "step": 74215 }, { "epoch": 8.265953892415636, "grad_norm": 0.12482915073633194, "learning_rate": 4.4422143000352385e-06, "loss": 0.4557, "num_input_tokens_seen": 90041616, "step": 74220 }, { "epoch": 8.266510747299254, "grad_norm": 0.09963523596525192, "learning_rate": 4.439449463562151e-06, "loss": 0.4545, "num_input_tokens_seen": 90047696, "step": 74225 }, { "epoch": 8.267067602182872, "grad_norm": 0.10849101096391678, "learning_rate": 4.436685403932553e-06, "loss": 0.4659, "num_input_tokens_seen": 90054032, "step": 74230 }, { "epoch": 8.26762445706649, "grad_norm": 0.10584564507007599, "learning_rate": 4.433922121250883e-06, "loss": 0.4543, "num_input_tokens_seen": 90060048, "step": 74235 }, { "epoch": 8.268181311950105, "grad_norm": 0.12178416550159454, "learning_rate": 4.431159615621536e-06, "loss": 0.4611, "num_input_tokens_seen": 90066288, "step": 74240 }, { "epoch": 8.268738166833723, "grad_norm": 0.09027009457349777, "learning_rate": 4.428397887148903e-06, "loss": 0.4688, "num_input_tokens_seen": 90072592, "step": 74245 }, { "epoch": 8.26929502171734, "grad_norm": 0.11792091280221939, "learning_rate": 4.425636935937313e-06, "loss": 0.4537, "num_input_tokens_seen": 90078704, "step": 74250 }, { "epoch": 8.269851876600958, "grad_norm": 0.09410814940929413, "learning_rate": 4.4228767620910965e-06, "loss": 0.4639, "num_input_tokens_seen": 90084496, "step": 74255 }, { "epoch": 8.270408731484576, "grad_norm": 0.08568254113197327, "learning_rate": 4.4201173657145464e-06, "loss": 0.465, "num_input_tokens_seen": 90090448, "step": 74260 }, { "epoch": 8.270965586368192, "grad_norm": 0.10573337972164154, "learning_rate": 4.417358746911904e-06, "loss": 0.4572, "num_input_tokens_seen": 90096592, "step": 74265 }, { "epoch": 8.27152244125181, "grad_norm": 0.12871508300304413, "learning_rate": 4.4146009057874165e-06, "loss": 0.4625, "num_input_tokens_seen": 90102800, "step": 74270 }, { "epoch": 8.272079296135427, "grad_norm": 0.10977426171302795, "learning_rate": 4.411843842445268e-06, "loss": 0.4583, "num_input_tokens_seen": 90108944, "step": 74275 }, { "epoch": 8.272636151019045, "grad_norm": 0.15195097029209137, "learning_rate": 4.409087556989644e-06, "loss": 0.4625, "num_input_tokens_seen": 90115408, "step": 74280 }, { "epoch": 8.273193005902662, "grad_norm": 0.08898896723985672, "learning_rate": 4.406332049524673e-06, "loss": 0.4671, "num_input_tokens_seen": 90121680, "step": 74285 }, { "epoch": 8.273749860786278, "grad_norm": 0.09188757091760635, "learning_rate": 4.403577320154473e-06, "loss": 0.4609, "num_input_tokens_seen": 90127536, "step": 74290 }, { "epoch": 8.274306715669896, "grad_norm": 0.12705069780349731, "learning_rate": 4.400823368983131e-06, "loss": 0.4624, "num_input_tokens_seen": 90133584, "step": 74295 }, { "epoch": 8.274863570553514, "grad_norm": 0.10416179895401001, "learning_rate": 4.398070196114693e-06, "loss": 0.468, "num_input_tokens_seen": 90139664, "step": 74300 }, { "epoch": 8.275420425437131, "grad_norm": 0.18136632442474365, "learning_rate": 4.395317801653187e-06, "loss": 0.4579, "num_input_tokens_seen": 90145424, "step": 74305 }, { "epoch": 8.275977280320749, "grad_norm": 0.1261829286813736, "learning_rate": 4.392566185702601e-06, "loss": 0.4692, "num_input_tokens_seen": 90151376, "step": 74310 }, { "epoch": 8.276534135204365, "grad_norm": 0.11268503963947296, "learning_rate": 4.38981534836691e-06, "loss": 0.4668, "num_input_tokens_seen": 90157392, "step": 74315 }, { "epoch": 8.277090990087983, "grad_norm": 0.11165701597929001, "learning_rate": 4.387065289750039e-06, "loss": 0.4494, "num_input_tokens_seen": 90163536, "step": 74320 }, { "epoch": 8.2776478449716, "grad_norm": 0.0954500362277031, "learning_rate": 4.384316009955899e-06, "loss": 0.4738, "num_input_tokens_seen": 90169648, "step": 74325 }, { "epoch": 8.278204699855218, "grad_norm": 0.10340069234371185, "learning_rate": 4.381567509088372e-06, "loss": 0.4567, "num_input_tokens_seen": 90175920, "step": 74330 }, { "epoch": 8.278761554738836, "grad_norm": 0.14142782986164093, "learning_rate": 4.378819787251293e-06, "loss": 0.4542, "num_input_tokens_seen": 90182224, "step": 74335 }, { "epoch": 8.279318409622453, "grad_norm": 0.0921056941151619, "learning_rate": 4.37607284454849e-06, "loss": 0.4658, "num_input_tokens_seen": 90188272, "step": 74340 }, { "epoch": 8.27987526450607, "grad_norm": 0.1245819553732872, "learning_rate": 4.373326681083745e-06, "loss": 0.4633, "num_input_tokens_seen": 90194192, "step": 74345 }, { "epoch": 8.280432119389687, "grad_norm": 0.0999658852815628, "learning_rate": 4.370581296960816e-06, "loss": 0.4603, "num_input_tokens_seen": 90200240, "step": 74350 }, { "epoch": 8.280988974273304, "grad_norm": 0.18581978976726532, "learning_rate": 4.367836692283445e-06, "loss": 0.4632, "num_input_tokens_seen": 90206192, "step": 74355 }, { "epoch": 8.281545829156922, "grad_norm": 0.08853079378604889, "learning_rate": 4.365092867155316e-06, "loss": 0.4453, "num_input_tokens_seen": 90212528, "step": 74360 }, { "epoch": 8.28210268404054, "grad_norm": 0.13679900765419006, "learning_rate": 4.3623498216801115e-06, "loss": 0.4565, "num_input_tokens_seen": 90218640, "step": 74365 }, { "epoch": 8.282659538924156, "grad_norm": 0.11835285276174545, "learning_rate": 4.359607555961462e-06, "loss": 0.4652, "num_input_tokens_seen": 90224688, "step": 74370 }, { "epoch": 8.283216393807773, "grad_norm": 0.10541754961013794, "learning_rate": 4.356866070102989e-06, "loss": 0.4618, "num_input_tokens_seen": 90230992, "step": 74375 }, { "epoch": 8.283773248691391, "grad_norm": 0.10748680680990219, "learning_rate": 4.354125364208267e-06, "loss": 0.4605, "num_input_tokens_seen": 90237488, "step": 74380 }, { "epoch": 8.284330103575009, "grad_norm": 0.09171035140752792, "learning_rate": 4.3513854383808475e-06, "loss": 0.4618, "num_input_tokens_seen": 90243824, "step": 74385 }, { "epoch": 8.284886958458626, "grad_norm": 0.07471150904893875, "learning_rate": 4.348646292724267e-06, "loss": 0.4626, "num_input_tokens_seen": 90249744, "step": 74390 }, { "epoch": 8.285443813342242, "grad_norm": 0.09167870134115219, "learning_rate": 4.345907927342002e-06, "loss": 0.4578, "num_input_tokens_seen": 90255824, "step": 74395 }, { "epoch": 8.28600066822586, "grad_norm": 0.09848453849554062, "learning_rate": 4.343170342337533e-06, "loss": 0.4689, "num_input_tokens_seen": 90261808, "step": 74400 }, { "epoch": 8.286557523109478, "grad_norm": 0.09328632056713104, "learning_rate": 4.3404335378142816e-06, "loss": 0.4499, "num_input_tokens_seen": 90268080, "step": 74405 }, { "epoch": 8.287114377993095, "grad_norm": 0.10675819963216782, "learning_rate": 4.337697513875661e-06, "loss": 0.4596, "num_input_tokens_seen": 90274064, "step": 74410 }, { "epoch": 8.287671232876713, "grad_norm": 0.09812522679567337, "learning_rate": 4.33496227062504e-06, "loss": 0.4537, "num_input_tokens_seen": 90280272, "step": 74415 }, { "epoch": 8.288228087760329, "grad_norm": 0.12913073599338531, "learning_rate": 4.332227808165768e-06, "loss": 0.4544, "num_input_tokens_seen": 90286480, "step": 74420 }, { "epoch": 8.288784942643947, "grad_norm": 0.10980571806430817, "learning_rate": 4.329494126601166e-06, "loss": 0.4633, "num_input_tokens_seen": 90292592, "step": 74425 }, { "epoch": 8.289341797527564, "grad_norm": 0.11372625827789307, "learning_rate": 4.3267612260345144e-06, "loss": 0.4611, "num_input_tokens_seen": 90298704, "step": 74430 }, { "epoch": 8.289898652411182, "grad_norm": 0.16516979038715363, "learning_rate": 4.32402910656908e-06, "loss": 0.4539, "num_input_tokens_seen": 90304624, "step": 74435 }, { "epoch": 8.2904555072948, "grad_norm": 0.12635502219200134, "learning_rate": 4.32129776830808e-06, "loss": 0.4715, "num_input_tokens_seen": 90310640, "step": 74440 }, { "epoch": 8.291012362178416, "grad_norm": 0.09086014330387115, "learning_rate": 4.318567211354724e-06, "loss": 0.4654, "num_input_tokens_seen": 90316528, "step": 74445 }, { "epoch": 8.291569217062033, "grad_norm": 0.1437026411294937, "learning_rate": 4.315837435812167e-06, "loss": 0.4519, "num_input_tokens_seen": 90322448, "step": 74450 }, { "epoch": 8.29212607194565, "grad_norm": 0.12707506120204926, "learning_rate": 4.313108441783561e-06, "loss": 0.4584, "num_input_tokens_seen": 90328912, "step": 74455 }, { "epoch": 8.292682926829269, "grad_norm": 0.13200539350509644, "learning_rate": 4.310380229372018e-06, "loss": 0.4682, "num_input_tokens_seen": 90334992, "step": 74460 }, { "epoch": 8.293239781712886, "grad_norm": 0.12811307609081268, "learning_rate": 4.307652798680606e-06, "loss": 0.4512, "num_input_tokens_seen": 90340848, "step": 74465 }, { "epoch": 8.293796636596502, "grad_norm": 0.08866705000400543, "learning_rate": 4.304926149812388e-06, "loss": 0.4581, "num_input_tokens_seen": 90346960, "step": 74470 }, { "epoch": 8.29435349148012, "grad_norm": 0.10675179958343506, "learning_rate": 4.302200282870378e-06, "loss": 0.4767, "num_input_tokens_seen": 90352912, "step": 74475 }, { "epoch": 8.294910346363737, "grad_norm": 0.11032426357269287, "learning_rate": 4.299475197957567e-06, "loss": 0.461, "num_input_tokens_seen": 90359024, "step": 74480 }, { "epoch": 8.295467201247355, "grad_norm": 0.1167178824543953, "learning_rate": 4.2967508951769295e-06, "loss": 0.4654, "num_input_tokens_seen": 90365008, "step": 74485 }, { "epoch": 8.296024056130973, "grad_norm": 0.09766118228435516, "learning_rate": 4.294027374631385e-06, "loss": 0.4534, "num_input_tokens_seen": 90370832, "step": 74490 }, { "epoch": 8.29658091101459, "grad_norm": 0.09896144270896912, "learning_rate": 4.291304636423845e-06, "loss": 0.4673, "num_input_tokens_seen": 90376880, "step": 74495 }, { "epoch": 8.297137765898206, "grad_norm": 0.16336683928966522, "learning_rate": 4.2885826806571745e-06, "loss": 0.4575, "num_input_tokens_seen": 90382256, "step": 74500 }, { "epoch": 8.297694620781824, "grad_norm": 0.09576867520809174, "learning_rate": 4.28586150743423e-06, "loss": 0.4616, "num_input_tokens_seen": 90388400, "step": 74505 }, { "epoch": 8.298251475665442, "grad_norm": 0.10144471377134323, "learning_rate": 4.283141116857814e-06, "loss": 0.4734, "num_input_tokens_seen": 90394544, "step": 74510 }, { "epoch": 8.29880833054906, "grad_norm": 0.11677821725606918, "learning_rate": 4.280421509030719e-06, "loss": 0.4784, "num_input_tokens_seen": 90400720, "step": 74515 }, { "epoch": 8.299365185432677, "grad_norm": 0.09401202946901321, "learning_rate": 4.2777026840557014e-06, "loss": 0.4581, "num_input_tokens_seen": 90406288, "step": 74520 }, { "epoch": 8.299922040316293, "grad_norm": 0.08373981714248657, "learning_rate": 4.2749846420354785e-06, "loss": 0.4563, "num_input_tokens_seen": 90412432, "step": 74525 }, { "epoch": 8.30047889519991, "grad_norm": 0.1333368420600891, "learning_rate": 4.272267383072759e-06, "loss": 0.4513, "num_input_tokens_seen": 90418672, "step": 74530 }, { "epoch": 8.301035750083528, "grad_norm": 0.13507020473480225, "learning_rate": 4.269550907270198e-06, "loss": 0.4551, "num_input_tokens_seen": 90424560, "step": 74535 }, { "epoch": 8.301592604967146, "grad_norm": 0.10779145359992981, "learning_rate": 4.266835214730441e-06, "loss": 0.4622, "num_input_tokens_seen": 90430672, "step": 74540 }, { "epoch": 8.302149459850764, "grad_norm": 0.0963798239827156, "learning_rate": 4.264120305556094e-06, "loss": 0.4588, "num_input_tokens_seen": 90437008, "step": 74545 }, { "epoch": 8.30270631473438, "grad_norm": 0.11031781136989594, "learning_rate": 4.261406179849728e-06, "loss": 0.4565, "num_input_tokens_seen": 90443248, "step": 74550 }, { "epoch": 8.303263169617997, "grad_norm": 0.09577909111976624, "learning_rate": 4.258692837713901e-06, "loss": 0.4617, "num_input_tokens_seen": 90449296, "step": 74555 }, { "epoch": 8.303820024501615, "grad_norm": 0.10324797034263611, "learning_rate": 4.255980279251123e-06, "loss": 0.4495, "num_input_tokens_seen": 90455664, "step": 74560 }, { "epoch": 8.304376879385233, "grad_norm": 0.11926215887069702, "learning_rate": 4.253268504563887e-06, "loss": 0.4619, "num_input_tokens_seen": 90462032, "step": 74565 }, { "epoch": 8.30493373426885, "grad_norm": 0.10740471631288528, "learning_rate": 4.250557513754658e-06, "loss": 0.4654, "num_input_tokens_seen": 90467984, "step": 74570 }, { "epoch": 8.305490589152466, "grad_norm": 0.10857534408569336, "learning_rate": 4.247847306925856e-06, "loss": 0.4561, "num_input_tokens_seen": 90474128, "step": 74575 }, { "epoch": 8.306047444036084, "grad_norm": 0.08746722340583801, "learning_rate": 4.245137884179892e-06, "loss": 0.4589, "num_input_tokens_seen": 90480048, "step": 74580 }, { "epoch": 8.306604298919702, "grad_norm": 0.10694988816976547, "learning_rate": 4.2424292456191255e-06, "loss": 0.4651, "num_input_tokens_seen": 90486448, "step": 74585 }, { "epoch": 8.30716115380332, "grad_norm": 0.14928673207759857, "learning_rate": 4.2397213913459075e-06, "loss": 0.4734, "num_input_tokens_seen": 90492592, "step": 74590 }, { "epoch": 8.307718008686937, "grad_norm": 0.1233736053109169, "learning_rate": 4.23701432146254e-06, "loss": 0.4423, "num_input_tokens_seen": 90498608, "step": 74595 }, { "epoch": 8.308274863570553, "grad_norm": 0.08643507957458496, "learning_rate": 4.23430803607131e-06, "loss": 0.4545, "num_input_tokens_seen": 90504624, "step": 74600 }, { "epoch": 8.30883171845417, "grad_norm": 0.11613847315311432, "learning_rate": 4.231602535274476e-06, "loss": 0.468, "num_input_tokens_seen": 90510512, "step": 74605 }, { "epoch": 8.309388573337788, "grad_norm": 0.11932998150587082, "learning_rate": 4.228897819174246e-06, "loss": 0.4528, "num_input_tokens_seen": 90516848, "step": 74610 }, { "epoch": 8.309945428221406, "grad_norm": 0.1065932884812355, "learning_rate": 4.226193887872828e-06, "loss": 0.4515, "num_input_tokens_seen": 90522544, "step": 74615 }, { "epoch": 8.310502283105023, "grad_norm": 0.08542371541261673, "learning_rate": 4.223490741472372e-06, "loss": 0.4633, "num_input_tokens_seen": 90527984, "step": 74620 }, { "epoch": 8.31105913798864, "grad_norm": 0.0977078527212143, "learning_rate": 4.220788380075017e-06, "loss": 0.4656, "num_input_tokens_seen": 90534096, "step": 74625 }, { "epoch": 8.311615992872257, "grad_norm": 0.09943188726902008, "learning_rate": 4.218086803782873e-06, "loss": 0.4628, "num_input_tokens_seen": 90540016, "step": 74630 }, { "epoch": 8.312172847755875, "grad_norm": 0.09037735313177109, "learning_rate": 4.215386012698e-06, "loss": 0.46, "num_input_tokens_seen": 90545968, "step": 74635 }, { "epoch": 8.312729702639492, "grad_norm": 0.11391972750425339, "learning_rate": 4.212686006922462e-06, "loss": 0.4628, "num_input_tokens_seen": 90552016, "step": 74640 }, { "epoch": 8.31328655752311, "grad_norm": 0.10550307482481003, "learning_rate": 4.209986786558254e-06, "loss": 0.4718, "num_input_tokens_seen": 90558160, "step": 74645 }, { "epoch": 8.313843412406726, "grad_norm": 0.13180874288082123, "learning_rate": 4.207288351707375e-06, "loss": 0.4597, "num_input_tokens_seen": 90564048, "step": 74650 }, { "epoch": 8.314400267290344, "grad_norm": 0.10237827152013779, "learning_rate": 4.204590702471769e-06, "loss": 0.4642, "num_input_tokens_seen": 90569872, "step": 74655 }, { "epoch": 8.314957122173961, "grad_norm": 0.1374368667602539, "learning_rate": 4.201893838953372e-06, "loss": 0.459, "num_input_tokens_seen": 90576144, "step": 74660 }, { "epoch": 8.315513977057579, "grad_norm": 0.1085982471704483, "learning_rate": 4.199197761254081e-06, "loss": 0.4578, "num_input_tokens_seen": 90582320, "step": 74665 }, { "epoch": 8.316070831941197, "grad_norm": 0.13172851502895355, "learning_rate": 4.196502469475752e-06, "loss": 0.4531, "num_input_tokens_seen": 90588432, "step": 74670 }, { "epoch": 8.316627686824813, "grad_norm": 0.08496009558439255, "learning_rate": 4.1938079637202336e-06, "loss": 0.4614, "num_input_tokens_seen": 90594512, "step": 74675 }, { "epoch": 8.31718454170843, "grad_norm": 0.11002306640148163, "learning_rate": 4.19111424408932e-06, "loss": 0.4649, "num_input_tokens_seen": 90600560, "step": 74680 }, { "epoch": 8.317741396592048, "grad_norm": 0.11984322220087051, "learning_rate": 4.188421310684803e-06, "loss": 0.4664, "num_input_tokens_seen": 90606736, "step": 74685 }, { "epoch": 8.318298251475666, "grad_norm": 0.11004915833473206, "learning_rate": 4.1857291636084174e-06, "loss": 0.4607, "num_input_tokens_seen": 90612688, "step": 74690 }, { "epoch": 8.318855106359283, "grad_norm": 0.13223430514335632, "learning_rate": 4.1830378029618865e-06, "loss": 0.4626, "num_input_tokens_seen": 90618576, "step": 74695 }, { "epoch": 8.319411961242901, "grad_norm": 0.1109774187207222, "learning_rate": 4.180347228846904e-06, "loss": 0.4695, "num_input_tokens_seen": 90624656, "step": 74700 }, { "epoch": 8.319968816126517, "grad_norm": 0.08224080502986908, "learning_rate": 4.177657441365115e-06, "loss": 0.4681, "num_input_tokens_seen": 90630736, "step": 74705 }, { "epoch": 8.320525671010135, "grad_norm": 0.10429010540246964, "learning_rate": 4.174968440618165e-06, "loss": 0.4539, "num_input_tokens_seen": 90636624, "step": 74710 }, { "epoch": 8.321082525893752, "grad_norm": 0.07747876644134521, "learning_rate": 4.172280226707639e-06, "loss": 0.4735, "num_input_tokens_seen": 90642928, "step": 74715 }, { "epoch": 8.32163938077737, "grad_norm": 0.14429502189159393, "learning_rate": 4.169592799735117e-06, "loss": 0.4669, "num_input_tokens_seen": 90648912, "step": 74720 }, { "epoch": 8.322196235660988, "grad_norm": 0.09956557303667068, "learning_rate": 4.166906159802126e-06, "loss": 0.4659, "num_input_tokens_seen": 90655344, "step": 74725 }, { "epoch": 8.322753090544603, "grad_norm": 0.10115179419517517, "learning_rate": 4.164220307010183e-06, "loss": 0.466, "num_input_tokens_seen": 90661520, "step": 74730 }, { "epoch": 8.323309945428221, "grad_norm": 0.15493997931480408, "learning_rate": 4.161535241460776e-06, "loss": 0.4594, "num_input_tokens_seen": 90667824, "step": 74735 }, { "epoch": 8.323866800311839, "grad_norm": 0.10836924612522125, "learning_rate": 4.1588509632553425e-06, "loss": 0.4727, "num_input_tokens_seen": 90674000, "step": 74740 }, { "epoch": 8.324423655195456, "grad_norm": 0.21038612723350525, "learning_rate": 4.156167472495312e-06, "loss": 0.4634, "num_input_tokens_seen": 90680016, "step": 74745 }, { "epoch": 8.324980510079074, "grad_norm": 0.11105737835168839, "learning_rate": 4.153484769282068e-06, "loss": 0.4627, "num_input_tokens_seen": 90686032, "step": 74750 }, { "epoch": 8.32553736496269, "grad_norm": 0.14712801575660706, "learning_rate": 4.150802853716973e-06, "loss": 0.4679, "num_input_tokens_seen": 90692432, "step": 74755 }, { "epoch": 8.326094219846308, "grad_norm": 0.08367462456226349, "learning_rate": 4.148121725901366e-06, "loss": 0.4651, "num_input_tokens_seen": 90698288, "step": 74760 }, { "epoch": 8.326651074729925, "grad_norm": 0.08975452184677124, "learning_rate": 4.145441385936541e-06, "loss": 0.4587, "num_input_tokens_seen": 90704336, "step": 74765 }, { "epoch": 8.327207929613543, "grad_norm": 0.1159050315618515, "learning_rate": 4.142761833923775e-06, "loss": 0.4506, "num_input_tokens_seen": 90710192, "step": 74770 }, { "epoch": 8.32776478449716, "grad_norm": 0.12168995290994644, "learning_rate": 4.140083069964304e-06, "loss": 0.4638, "num_input_tokens_seen": 90716432, "step": 74775 }, { "epoch": 8.328321639380777, "grad_norm": 0.10181578993797302, "learning_rate": 4.137405094159347e-06, "loss": 0.4544, "num_input_tokens_seen": 90722480, "step": 74780 }, { "epoch": 8.328878494264394, "grad_norm": 0.16836203634738922, "learning_rate": 4.134727906610078e-06, "loss": 0.4586, "num_input_tokens_seen": 90728464, "step": 74785 }, { "epoch": 8.329435349148012, "grad_norm": 0.1103556677699089, "learning_rate": 4.132051507417656e-06, "loss": 0.4531, "num_input_tokens_seen": 90734768, "step": 74790 }, { "epoch": 8.32999220403163, "grad_norm": 0.0982094332575798, "learning_rate": 4.1293758966832045e-06, "loss": 0.4673, "num_input_tokens_seen": 90740560, "step": 74795 }, { "epoch": 8.330549058915247, "grad_norm": 0.1506635546684265, "learning_rate": 4.126701074507813e-06, "loss": 0.4491, "num_input_tokens_seen": 90746448, "step": 74800 }, { "epoch": 8.331105913798863, "grad_norm": 0.11209087073802948, "learning_rate": 4.124027040992551e-06, "loss": 0.4546, "num_input_tokens_seen": 90752816, "step": 74805 }, { "epoch": 8.331662768682481, "grad_norm": 0.11902108043432236, "learning_rate": 4.121353796238442e-06, "loss": 0.456, "num_input_tokens_seen": 90758800, "step": 74810 }, { "epoch": 8.332219623566099, "grad_norm": 0.08212553709745407, "learning_rate": 4.118681340346503e-06, "loss": 0.4723, "num_input_tokens_seen": 90764880, "step": 74815 }, { "epoch": 8.332776478449716, "grad_norm": 0.09778188914060593, "learning_rate": 4.116009673417692e-06, "loss": 0.4504, "num_input_tokens_seen": 90770896, "step": 74820 }, { "epoch": 8.333333333333334, "grad_norm": 0.20036756992340088, "learning_rate": 4.113338795552962e-06, "loss": 0.4671, "num_input_tokens_seen": 90776624, "step": 74825 }, { "epoch": 8.33389018821695, "grad_norm": 0.1513499617576599, "learning_rate": 4.110668706853235e-06, "loss": 0.4648, "num_input_tokens_seen": 90782800, "step": 74830 }, { "epoch": 8.334447043100567, "grad_norm": 0.12220481038093567, "learning_rate": 4.10799940741938e-06, "loss": 0.4592, "num_input_tokens_seen": 90788560, "step": 74835 }, { "epoch": 8.335003897984185, "grad_norm": 0.12906095385551453, "learning_rate": 4.105330897352263e-06, "loss": 0.4624, "num_input_tokens_seen": 90794384, "step": 74840 }, { "epoch": 8.335560752867803, "grad_norm": 0.15435215830802917, "learning_rate": 4.102663176752702e-06, "loss": 0.4687, "num_input_tokens_seen": 90800176, "step": 74845 }, { "epoch": 8.33611760775142, "grad_norm": 0.10419220477342606, "learning_rate": 4.099996245721494e-06, "loss": 0.4622, "num_input_tokens_seen": 90806320, "step": 74850 }, { "epoch": 8.336674462635038, "grad_norm": 0.08826208114624023, "learning_rate": 4.097330104359412e-06, "loss": 0.4604, "num_input_tokens_seen": 90812848, "step": 74855 }, { "epoch": 8.337231317518654, "grad_norm": 0.11592747271060944, "learning_rate": 4.094664752767177e-06, "loss": 0.4577, "num_input_tokens_seen": 90819056, "step": 74860 }, { "epoch": 8.337788172402272, "grad_norm": 0.09728872030973434, "learning_rate": 4.092000191045509e-06, "loss": 0.4593, "num_input_tokens_seen": 90825264, "step": 74865 }, { "epoch": 8.33834502728589, "grad_norm": 0.13497234880924225, "learning_rate": 4.08933641929507e-06, "loss": 0.4603, "num_input_tokens_seen": 90831408, "step": 74870 }, { "epoch": 8.338901882169507, "grad_norm": 0.12089067697525024, "learning_rate": 4.08667343761652e-06, "loss": 0.4517, "num_input_tokens_seen": 90837200, "step": 74875 }, { "epoch": 8.339458737053125, "grad_norm": 0.10715953260660172, "learning_rate": 4.084011246110459e-06, "loss": 0.4715, "num_input_tokens_seen": 90843280, "step": 74880 }, { "epoch": 8.34001559193674, "grad_norm": 0.10949330031871796, "learning_rate": 4.081349844877483e-06, "loss": 0.466, "num_input_tokens_seen": 90849264, "step": 74885 }, { "epoch": 8.340572446820358, "grad_norm": 0.1045573279261589, "learning_rate": 4.078689234018155e-06, "loss": 0.4558, "num_input_tokens_seen": 90855440, "step": 74890 }, { "epoch": 8.341129301703976, "grad_norm": 0.12246561050415039, "learning_rate": 4.076029413632984e-06, "loss": 0.4587, "num_input_tokens_seen": 90861200, "step": 74895 }, { "epoch": 8.341686156587594, "grad_norm": 0.11432471126317978, "learning_rate": 4.073370383822484e-06, "loss": 0.4532, "num_input_tokens_seen": 90867120, "step": 74900 }, { "epoch": 8.342243011471211, "grad_norm": 0.12987075746059418, "learning_rate": 4.070712144687108e-06, "loss": 0.4524, "num_input_tokens_seen": 90873072, "step": 74905 }, { "epoch": 8.342799866354827, "grad_norm": 0.1413818597793579, "learning_rate": 4.068054696327303e-06, "loss": 0.4747, "num_input_tokens_seen": 90878384, "step": 74910 }, { "epoch": 8.343356721238445, "grad_norm": 0.1601853221654892, "learning_rate": 4.065398038843465e-06, "loss": 0.4616, "num_input_tokens_seen": 90884592, "step": 74915 }, { "epoch": 8.343913576122063, "grad_norm": 0.08599386364221573, "learning_rate": 4.062742172335979e-06, "loss": 0.446, "num_input_tokens_seen": 90890864, "step": 74920 }, { "epoch": 8.34447043100568, "grad_norm": 0.12425683438777924, "learning_rate": 4.060087096905196e-06, "loss": 0.4604, "num_input_tokens_seen": 90897232, "step": 74925 }, { "epoch": 8.345027285889298, "grad_norm": 0.08788175880908966, "learning_rate": 4.057432812651421e-06, "loss": 0.4651, "num_input_tokens_seen": 90903312, "step": 74930 }, { "epoch": 8.345584140772914, "grad_norm": 0.181979700922966, "learning_rate": 4.054779319674954e-06, "loss": 0.4493, "num_input_tokens_seen": 90909488, "step": 74935 }, { "epoch": 8.346140995656532, "grad_norm": 0.12791478633880615, "learning_rate": 4.052126618076041e-06, "loss": 0.4534, "num_input_tokens_seen": 90915632, "step": 74940 }, { "epoch": 8.34669785054015, "grad_norm": 0.12706133723258972, "learning_rate": 4.04947470795492e-06, "loss": 0.463, "num_input_tokens_seen": 90921648, "step": 74945 }, { "epoch": 8.347254705423767, "grad_norm": 0.1343032568693161, "learning_rate": 4.04682358941178e-06, "loss": 0.4549, "num_input_tokens_seen": 90927728, "step": 74950 }, { "epoch": 8.347811560307385, "grad_norm": 0.145324245095253, "learning_rate": 4.044173262546796e-06, "loss": 0.4893, "num_input_tokens_seen": 90933808, "step": 74955 }, { "epoch": 8.348368415191, "grad_norm": 0.10550530254840851, "learning_rate": 4.041523727460106e-06, "loss": 0.462, "num_input_tokens_seen": 90940112, "step": 74960 }, { "epoch": 8.348925270074618, "grad_norm": 0.09487491846084595, "learning_rate": 4.038874984251806e-06, "loss": 0.4462, "num_input_tokens_seen": 90946448, "step": 74965 }, { "epoch": 8.349482124958236, "grad_norm": 0.07643213868141174, "learning_rate": 4.036227033021983e-06, "loss": 0.4648, "num_input_tokens_seen": 90952432, "step": 74970 }, { "epoch": 8.350038979841854, "grad_norm": 0.0943002849817276, "learning_rate": 4.033579873870688e-06, "loss": 0.4563, "num_input_tokens_seen": 90958544, "step": 74975 }, { "epoch": 8.350595834725471, "grad_norm": 0.12144336104393005, "learning_rate": 4.030933506897933e-06, "loss": 0.4559, "num_input_tokens_seen": 90964880, "step": 74980 }, { "epoch": 8.351152689609087, "grad_norm": 0.09469079226255417, "learning_rate": 4.028287932203711e-06, "loss": 0.4652, "num_input_tokens_seen": 90971088, "step": 74985 }, { "epoch": 8.351709544492705, "grad_norm": 0.1610366702079773, "learning_rate": 4.025643149887976e-06, "loss": 0.4699, "num_input_tokens_seen": 90976560, "step": 74990 }, { "epoch": 8.352266399376322, "grad_norm": 0.09213779866695404, "learning_rate": 4.022999160050661e-06, "loss": 0.4579, "num_input_tokens_seen": 90982896, "step": 74995 }, { "epoch": 8.35282325425994, "grad_norm": 0.1621280461549759, "learning_rate": 4.020355962791658e-06, "loss": 0.4698, "num_input_tokens_seen": 90989008, "step": 75000 }, { "epoch": 8.353380109143558, "grad_norm": 0.09087125957012177, "learning_rate": 4.017713558210839e-06, "loss": 0.4691, "num_input_tokens_seen": 90994512, "step": 75005 }, { "epoch": 8.353936964027174, "grad_norm": 0.09673764556646347, "learning_rate": 4.01507194640805e-06, "loss": 0.4573, "num_input_tokens_seen": 91000528, "step": 75010 }, { "epoch": 8.354493818910791, "grad_norm": 0.11985751241445541, "learning_rate": 4.012431127483085e-06, "loss": 0.4615, "num_input_tokens_seen": 91006480, "step": 75015 }, { "epoch": 8.355050673794409, "grad_norm": 0.11952794343233109, "learning_rate": 4.009791101535734e-06, "loss": 0.4542, "num_input_tokens_seen": 91012752, "step": 75020 }, { "epoch": 8.355607528678027, "grad_norm": 0.12984362244606018, "learning_rate": 4.007151868665737e-06, "loss": 0.452, "num_input_tokens_seen": 91018128, "step": 75025 }, { "epoch": 8.356164383561644, "grad_norm": 0.10744594782590866, "learning_rate": 4.004513428972819e-06, "loss": 0.4734, "num_input_tokens_seen": 91023760, "step": 75030 }, { "epoch": 8.35672123844526, "grad_norm": 0.15128064155578613, "learning_rate": 4.001875782556672e-06, "loss": 0.4581, "num_input_tokens_seen": 91029936, "step": 75035 }, { "epoch": 8.357278093328878, "grad_norm": 0.16790466010570526, "learning_rate": 3.999238929516943e-06, "loss": 0.4628, "num_input_tokens_seen": 91036016, "step": 75040 }, { "epoch": 8.357834948212496, "grad_norm": 0.09886133670806885, "learning_rate": 3.9966028699532756e-06, "loss": 0.4468, "num_input_tokens_seen": 91041552, "step": 75045 }, { "epoch": 8.358391803096113, "grad_norm": 0.09463351964950562, "learning_rate": 3.993967603965251e-06, "loss": 0.465, "num_input_tokens_seen": 91047440, "step": 75050 }, { "epoch": 8.358948657979731, "grad_norm": 0.09536536782979965, "learning_rate": 3.991333131652455e-06, "loss": 0.4566, "num_input_tokens_seen": 91053552, "step": 75055 }, { "epoch": 8.359505512863349, "grad_norm": 0.1271318644285202, "learning_rate": 3.988699453114414e-06, "loss": 0.4574, "num_input_tokens_seen": 91059728, "step": 75060 }, { "epoch": 8.360062367746965, "grad_norm": 0.12949681282043457, "learning_rate": 3.9860665684506415e-06, "loss": 0.4759, "num_input_tokens_seen": 91066128, "step": 75065 }, { "epoch": 8.360619222630582, "grad_norm": 0.1168326586484909, "learning_rate": 3.983434477760622e-06, "loss": 0.4605, "num_input_tokens_seen": 91072464, "step": 75070 }, { "epoch": 8.3611760775142, "grad_norm": 0.10774030536413193, "learning_rate": 3.980803181143794e-06, "loss": 0.4635, "num_input_tokens_seen": 91078864, "step": 75075 }, { "epoch": 8.361732932397818, "grad_norm": 0.11624055355787277, "learning_rate": 3.9781726786995845e-06, "loss": 0.4588, "num_input_tokens_seen": 91084880, "step": 75080 }, { "epoch": 8.362289787281435, "grad_norm": 0.12328018993139267, "learning_rate": 3.975542970527374e-06, "loss": 0.4667, "num_input_tokens_seen": 91091216, "step": 75085 }, { "epoch": 8.362846642165051, "grad_norm": 0.10263486206531525, "learning_rate": 3.972914056726531e-06, "loss": 0.4561, "num_input_tokens_seen": 91097232, "step": 75090 }, { "epoch": 8.363403497048669, "grad_norm": 0.11927017569541931, "learning_rate": 3.970285937396376e-06, "loss": 0.4659, "num_input_tokens_seen": 91103216, "step": 75095 }, { "epoch": 8.363960351932286, "grad_norm": 0.09701497107744217, "learning_rate": 3.96765861263621e-06, "loss": 0.4701, "num_input_tokens_seen": 91109200, "step": 75100 }, { "epoch": 8.364517206815904, "grad_norm": 0.13279271125793457, "learning_rate": 3.965032082545312e-06, "loss": 0.467, "num_input_tokens_seen": 91115408, "step": 75105 }, { "epoch": 8.365074061699522, "grad_norm": 0.08500175178050995, "learning_rate": 3.9624063472229024e-06, "loss": 0.4602, "num_input_tokens_seen": 91121296, "step": 75110 }, { "epoch": 8.365630916583138, "grad_norm": 0.11827494204044342, "learning_rate": 3.959781406768207e-06, "loss": 0.451, "num_input_tokens_seen": 91127344, "step": 75115 }, { "epoch": 8.366187771466755, "grad_norm": 0.10699331760406494, "learning_rate": 3.957157261280389e-06, "loss": 0.454, "num_input_tokens_seen": 91133328, "step": 75120 }, { "epoch": 8.366744626350373, "grad_norm": 0.09162532538175583, "learning_rate": 3.954533910858605e-06, "loss": 0.4653, "num_input_tokens_seen": 91139472, "step": 75125 }, { "epoch": 8.36730148123399, "grad_norm": 0.0935320109128952, "learning_rate": 3.951911355601981e-06, "loss": 0.4541, "num_input_tokens_seen": 91145328, "step": 75130 }, { "epoch": 8.367858336117608, "grad_norm": 0.10114741325378418, "learning_rate": 3.94928959560959e-06, "loss": 0.4598, "num_input_tokens_seen": 91151664, "step": 75135 }, { "epoch": 8.368415191001224, "grad_norm": 0.11240652948617935, "learning_rate": 3.946668630980505e-06, "loss": 0.4589, "num_input_tokens_seen": 91157520, "step": 75140 }, { "epoch": 8.368972045884842, "grad_norm": 0.09073007851839066, "learning_rate": 3.944048461813743e-06, "loss": 0.4598, "num_input_tokens_seen": 91163888, "step": 75145 }, { "epoch": 8.36952890076846, "grad_norm": 0.09641195088624954, "learning_rate": 3.941429088208312e-06, "loss": 0.4577, "num_input_tokens_seen": 91170064, "step": 75150 }, { "epoch": 8.370085755652077, "grad_norm": 0.10436499863862991, "learning_rate": 3.938810510263169e-06, "loss": 0.4603, "num_input_tokens_seen": 91176304, "step": 75155 }, { "epoch": 8.370642610535695, "grad_norm": 0.13324666023254395, "learning_rate": 3.936192728077262e-06, "loss": 0.4516, "num_input_tokens_seen": 91181872, "step": 75160 }, { "epoch": 8.371199465419311, "grad_norm": 0.10140040516853333, "learning_rate": 3.933575741749499e-06, "loss": 0.466, "num_input_tokens_seen": 91187888, "step": 75165 }, { "epoch": 8.371756320302929, "grad_norm": 0.13606713712215424, "learning_rate": 3.930959551378749e-06, "loss": 0.4613, "num_input_tokens_seen": 91194032, "step": 75170 }, { "epoch": 8.372313175186546, "grad_norm": 0.11538910865783691, "learning_rate": 3.928344157063873e-06, "loss": 0.4607, "num_input_tokens_seen": 91200112, "step": 75175 }, { "epoch": 8.372870030070164, "grad_norm": 0.1306500881910324, "learning_rate": 3.925729558903676e-06, "loss": 0.4718, "num_input_tokens_seen": 91206128, "step": 75180 }, { "epoch": 8.373426884953782, "grad_norm": 0.10951603949069977, "learning_rate": 3.923115756996959e-06, "loss": 0.4714, "num_input_tokens_seen": 91212240, "step": 75185 }, { "epoch": 8.373983739837398, "grad_norm": 0.0989406630396843, "learning_rate": 3.9205027514424675e-06, "loss": 0.4542, "num_input_tokens_seen": 91218768, "step": 75190 }, { "epoch": 8.374540594721015, "grad_norm": 0.13296359777450562, "learning_rate": 3.917890542338931e-06, "loss": 0.4543, "num_input_tokens_seen": 91224784, "step": 75195 }, { "epoch": 8.375097449604633, "grad_norm": 0.08786237984895706, "learning_rate": 3.91527912978506e-06, "loss": 0.4689, "num_input_tokens_seen": 91230672, "step": 75200 }, { "epoch": 8.37565430448825, "grad_norm": 0.11626077443361282, "learning_rate": 3.912668513879506e-06, "loss": 0.4592, "num_input_tokens_seen": 91236816, "step": 75205 }, { "epoch": 8.376211159371868, "grad_norm": 0.08660595864057541, "learning_rate": 3.910058694720919e-06, "loss": 0.4593, "num_input_tokens_seen": 91242768, "step": 75210 }, { "epoch": 8.376768014255486, "grad_norm": 0.1358480155467987, "learning_rate": 3.907449672407895e-06, "loss": 0.4474, "num_input_tokens_seen": 91248304, "step": 75215 }, { "epoch": 8.377324869139102, "grad_norm": 0.12655283510684967, "learning_rate": 3.904841447039023e-06, "loss": 0.4538, "num_input_tokens_seen": 91254320, "step": 75220 }, { "epoch": 8.37788172402272, "grad_norm": 0.11280778050422668, "learning_rate": 3.902234018712836e-06, "loss": 0.4544, "num_input_tokens_seen": 91260592, "step": 75225 }, { "epoch": 8.378438578906337, "grad_norm": 0.12693828344345093, "learning_rate": 3.89962738752786e-06, "loss": 0.4611, "num_input_tokens_seen": 91266640, "step": 75230 }, { "epoch": 8.378995433789955, "grad_norm": 0.10847204923629761, "learning_rate": 3.897021553582589e-06, "loss": 0.459, "num_input_tokens_seen": 91272464, "step": 75235 }, { "epoch": 8.379552288673572, "grad_norm": 0.11747261136770248, "learning_rate": 3.894416516975463e-06, "loss": 0.4666, "num_input_tokens_seen": 91278640, "step": 75240 }, { "epoch": 8.380109143557188, "grad_norm": 0.09364496916532516, "learning_rate": 3.891812277804924e-06, "loss": 0.4475, "num_input_tokens_seen": 91285008, "step": 75245 }, { "epoch": 8.380665998440806, "grad_norm": 0.12684288620948792, "learning_rate": 3.889208836169356e-06, "loss": 0.4718, "num_input_tokens_seen": 91290960, "step": 75250 }, { "epoch": 8.381222853324424, "grad_norm": 0.1291457712650299, "learning_rate": 3.886606192167128e-06, "loss": 0.4545, "num_input_tokens_seen": 91296464, "step": 75255 }, { "epoch": 8.381779708208041, "grad_norm": 0.12944461405277252, "learning_rate": 3.884004345896588e-06, "loss": 0.4636, "num_input_tokens_seen": 91302192, "step": 75260 }, { "epoch": 8.382336563091659, "grad_norm": 0.09758598357439041, "learning_rate": 3.881403297456027e-06, "loss": 0.457, "num_input_tokens_seen": 91308432, "step": 75265 }, { "epoch": 8.382893417975275, "grad_norm": 0.10606471449136734, "learning_rate": 3.878803046943732e-06, "loss": 0.4696, "num_input_tokens_seen": 91314608, "step": 75270 }, { "epoch": 8.383450272858893, "grad_norm": 0.10498622804880142, "learning_rate": 3.876203594457942e-06, "loss": 0.4572, "num_input_tokens_seen": 91320880, "step": 75275 }, { "epoch": 8.38400712774251, "grad_norm": 0.10203667730093002, "learning_rate": 3.873604940096878e-06, "loss": 0.4725, "num_input_tokens_seen": 91327216, "step": 75280 }, { "epoch": 8.384563982626128, "grad_norm": 0.09824158996343613, "learning_rate": 3.871007083958716e-06, "loss": 0.4508, "num_input_tokens_seen": 91333360, "step": 75285 }, { "epoch": 8.385120837509746, "grad_norm": 0.08325894176959991, "learning_rate": 3.868410026141617e-06, "loss": 0.4562, "num_input_tokens_seen": 91339440, "step": 75290 }, { "epoch": 8.385677692393362, "grad_norm": 0.1056060642004013, "learning_rate": 3.865813766743714e-06, "loss": 0.4687, "num_input_tokens_seen": 91345488, "step": 75295 }, { "epoch": 8.38623454727698, "grad_norm": 0.11287829279899597, "learning_rate": 3.863218305863086e-06, "loss": 0.4578, "num_input_tokens_seen": 91350928, "step": 75300 }, { "epoch": 8.386791402160597, "grad_norm": 0.10457415133714676, "learning_rate": 3.860623643597816e-06, "loss": 0.4581, "num_input_tokens_seen": 91357232, "step": 75305 }, { "epoch": 8.387348257044215, "grad_norm": 0.11664004623889923, "learning_rate": 3.858029780045921e-06, "loss": 0.4553, "num_input_tokens_seen": 91363248, "step": 75310 }, { "epoch": 8.387905111927832, "grad_norm": 0.10535333305597305, "learning_rate": 3.855436715305421e-06, "loss": 0.4558, "num_input_tokens_seen": 91369264, "step": 75315 }, { "epoch": 8.388461966811448, "grad_norm": 0.1775277853012085, "learning_rate": 3.852844449474277e-06, "loss": 0.4625, "num_input_tokens_seen": 91375216, "step": 75320 }, { "epoch": 8.389018821695066, "grad_norm": 0.12103159725666046, "learning_rate": 3.85025298265044e-06, "loss": 0.4544, "num_input_tokens_seen": 91381456, "step": 75325 }, { "epoch": 8.389575676578684, "grad_norm": 0.16218885779380798, "learning_rate": 3.847662314931827e-06, "loss": 0.4576, "num_input_tokens_seen": 91387536, "step": 75330 }, { "epoch": 8.390132531462301, "grad_norm": 0.10434766113758087, "learning_rate": 3.845072446416312e-06, "loss": 0.4716, "num_input_tokens_seen": 91393552, "step": 75335 }, { "epoch": 8.390689386345919, "grad_norm": 0.09539957344532013, "learning_rate": 3.842483377201761e-06, "loss": 0.4535, "num_input_tokens_seen": 91399440, "step": 75340 }, { "epoch": 8.391246241229535, "grad_norm": 0.1715606451034546, "learning_rate": 3.839895107385985e-06, "loss": 0.4647, "num_input_tokens_seen": 91405936, "step": 75345 }, { "epoch": 8.391803096113152, "grad_norm": 0.11230403929948807, "learning_rate": 3.837307637066781e-06, "loss": 0.4562, "num_input_tokens_seen": 91411952, "step": 75350 }, { "epoch": 8.39235995099677, "grad_norm": 0.10074792802333832, "learning_rate": 3.83472096634192e-06, "loss": 0.4521, "num_input_tokens_seen": 91417840, "step": 75355 }, { "epoch": 8.392916805880388, "grad_norm": 0.0920519307255745, "learning_rate": 3.832135095309125e-06, "loss": 0.4451, "num_input_tokens_seen": 91423824, "step": 75360 }, { "epoch": 8.393473660764005, "grad_norm": 0.11478044837713242, "learning_rate": 3.8295500240661115e-06, "loss": 0.4645, "num_input_tokens_seen": 91430000, "step": 75365 }, { "epoch": 8.394030515647621, "grad_norm": 0.13850758969783783, "learning_rate": 3.826965752710529e-06, "loss": 0.4567, "num_input_tokens_seen": 91436368, "step": 75370 }, { "epoch": 8.394587370531239, "grad_norm": 0.0956064760684967, "learning_rate": 3.824382281340036e-06, "loss": 0.4627, "num_input_tokens_seen": 91441392, "step": 75375 }, { "epoch": 8.395144225414857, "grad_norm": 0.13593371212482452, "learning_rate": 3.8217996100522455e-06, "loss": 0.4489, "num_input_tokens_seen": 91447280, "step": 75380 }, { "epoch": 8.395701080298474, "grad_norm": 0.10473421216011047, "learning_rate": 3.819217738944728e-06, "loss": 0.4562, "num_input_tokens_seen": 91453584, "step": 75385 }, { "epoch": 8.396257935182092, "grad_norm": 0.1330091655254364, "learning_rate": 3.816636668115048e-06, "loss": 0.4626, "num_input_tokens_seen": 91459728, "step": 75390 }, { "epoch": 8.396814790065708, "grad_norm": 0.1387723684310913, "learning_rate": 3.8140563976607117e-06, "loss": 0.4555, "num_input_tokens_seen": 91466064, "step": 75395 }, { "epoch": 8.397371644949326, "grad_norm": 0.09739254415035248, "learning_rate": 3.8114769276792278e-06, "loss": 0.4635, "num_input_tokens_seen": 91472176, "step": 75400 }, { "epoch": 8.397928499832943, "grad_norm": 0.1055283322930336, "learning_rate": 3.80889825826804e-06, "loss": 0.4571, "num_input_tokens_seen": 91478096, "step": 75405 }, { "epoch": 8.398485354716561, "grad_norm": 0.12635929882526398, "learning_rate": 3.806320389524587e-06, "loss": 0.4606, "num_input_tokens_seen": 91484336, "step": 75410 }, { "epoch": 8.399042209600179, "grad_norm": 0.10064674913883209, "learning_rate": 3.803743321546274e-06, "loss": 0.4606, "num_input_tokens_seen": 91490352, "step": 75415 }, { "epoch": 8.399599064483796, "grad_norm": 0.08580098301172256, "learning_rate": 3.8011670544304573e-06, "loss": 0.4672, "num_input_tokens_seen": 91496656, "step": 75420 }, { "epoch": 8.400155919367412, "grad_norm": 0.09659022837877274, "learning_rate": 3.7985915882744915e-06, "loss": 0.4615, "num_input_tokens_seen": 91502672, "step": 75425 }, { "epoch": 8.40071277425103, "grad_norm": 0.11913567036390305, "learning_rate": 3.7960169231756745e-06, "loss": 0.4553, "num_input_tokens_seen": 91508464, "step": 75430 }, { "epoch": 8.401269629134648, "grad_norm": 0.10700616985559464, "learning_rate": 3.7934430592312887e-06, "loss": 0.4705, "num_input_tokens_seen": 91514352, "step": 75435 }, { "epoch": 8.401826484018265, "grad_norm": 0.11381841450929642, "learning_rate": 3.7908699965385876e-06, "loss": 0.4574, "num_input_tokens_seen": 91520688, "step": 75440 }, { "epoch": 8.402383338901883, "grad_norm": 0.12088799476623535, "learning_rate": 3.7882977351947817e-06, "loss": 0.4591, "num_input_tokens_seen": 91526672, "step": 75445 }, { "epoch": 8.402940193785499, "grad_norm": 0.11159008741378784, "learning_rate": 3.785726275297072e-06, "loss": 0.461, "num_input_tokens_seen": 91532944, "step": 75450 }, { "epoch": 8.403497048669117, "grad_norm": 0.0988490879535675, "learning_rate": 3.783155616942599e-06, "loss": 0.4621, "num_input_tokens_seen": 91539024, "step": 75455 }, { "epoch": 8.404053903552734, "grad_norm": 0.10555540770292282, "learning_rate": 3.7805857602285056e-06, "loss": 0.466, "num_input_tokens_seen": 91545136, "step": 75460 }, { "epoch": 8.404610758436352, "grad_norm": 0.08382125943899155, "learning_rate": 3.778016705251877e-06, "loss": 0.4561, "num_input_tokens_seen": 91551120, "step": 75465 }, { "epoch": 8.40516761331997, "grad_norm": 0.1261356920003891, "learning_rate": 3.775448452109787e-06, "loss": 0.4628, "num_input_tokens_seen": 91557008, "step": 75470 }, { "epoch": 8.405724468203585, "grad_norm": 0.1022048145532608, "learning_rate": 3.7728810008992784e-06, "loss": 0.4563, "num_input_tokens_seen": 91562704, "step": 75475 }, { "epoch": 8.406281323087203, "grad_norm": 0.10856503993272781, "learning_rate": 3.770314351717347e-06, "loss": 0.4567, "num_input_tokens_seen": 91569008, "step": 75480 }, { "epoch": 8.40683817797082, "grad_norm": 0.11007648706436157, "learning_rate": 3.7677485046609755e-06, "loss": 0.4583, "num_input_tokens_seen": 91575056, "step": 75485 }, { "epoch": 8.407395032854438, "grad_norm": 0.11328783631324768, "learning_rate": 3.765183459827104e-06, "loss": 0.4804, "num_input_tokens_seen": 91581168, "step": 75490 }, { "epoch": 8.407951887738056, "grad_norm": 0.12325754016637802, "learning_rate": 3.762619217312657e-06, "loss": 0.454, "num_input_tokens_seen": 91587472, "step": 75495 }, { "epoch": 8.408508742621672, "grad_norm": 0.12484835088253021, "learning_rate": 3.760055777214508e-06, "loss": 0.4563, "num_input_tokens_seen": 91593776, "step": 75500 }, { "epoch": 8.40906559750529, "grad_norm": 0.1087537631392479, "learning_rate": 3.7574931396295198e-06, "loss": 0.4614, "num_input_tokens_seen": 91599760, "step": 75505 }, { "epoch": 8.409622452388907, "grad_norm": 0.08521390706300735, "learning_rate": 3.7549313046545188e-06, "loss": 0.4684, "num_input_tokens_seen": 91605872, "step": 75510 }, { "epoch": 8.410179307272525, "grad_norm": 0.09856090694665909, "learning_rate": 3.7523702723862936e-06, "loss": 0.4753, "num_input_tokens_seen": 91611952, "step": 75515 }, { "epoch": 8.410736162156143, "grad_norm": 0.1191096231341362, "learning_rate": 3.7498100429216145e-06, "loss": 0.4682, "num_input_tokens_seen": 91618256, "step": 75520 }, { "epoch": 8.411293017039759, "grad_norm": 0.11982984095811844, "learning_rate": 3.747250616357209e-06, "loss": 0.4557, "num_input_tokens_seen": 91624720, "step": 75525 }, { "epoch": 8.411849871923376, "grad_norm": 0.08226844668388367, "learning_rate": 3.7446919927897783e-06, "loss": 0.4746, "num_input_tokens_seen": 91630928, "step": 75530 }, { "epoch": 8.412406726806994, "grad_norm": 0.1363641619682312, "learning_rate": 3.7421341723160085e-06, "loss": 0.4691, "num_input_tokens_seen": 91637168, "step": 75535 }, { "epoch": 8.412963581690612, "grad_norm": 0.12902192771434784, "learning_rate": 3.739577155032528e-06, "loss": 0.4568, "num_input_tokens_seen": 91643056, "step": 75540 }, { "epoch": 8.41352043657423, "grad_norm": 0.12709537148475647, "learning_rate": 3.7370209410359625e-06, "loss": 0.4529, "num_input_tokens_seen": 91649200, "step": 75545 }, { "epoch": 8.414077291457847, "grad_norm": 0.11585324257612228, "learning_rate": 3.7344655304228788e-06, "loss": 0.4627, "num_input_tokens_seen": 91655376, "step": 75550 }, { "epoch": 8.414634146341463, "grad_norm": 0.10575181990861893, "learning_rate": 3.7319109232898413e-06, "loss": 0.4581, "num_input_tokens_seen": 91661520, "step": 75555 }, { "epoch": 8.41519100122508, "grad_norm": 0.1088642030954361, "learning_rate": 3.7293571197333592e-06, "loss": 0.459, "num_input_tokens_seen": 91667568, "step": 75560 }, { "epoch": 8.415747856108698, "grad_norm": 0.12732093036174774, "learning_rate": 3.726804119849933e-06, "loss": 0.4658, "num_input_tokens_seen": 91673712, "step": 75565 }, { "epoch": 8.416304710992316, "grad_norm": 0.13930901885032654, "learning_rate": 3.7242519237360245e-06, "loss": 0.4583, "num_input_tokens_seen": 91679856, "step": 75570 }, { "epoch": 8.416861565875934, "grad_norm": 0.12726850807666779, "learning_rate": 3.7217005314880564e-06, "loss": 0.4664, "num_input_tokens_seen": 91685392, "step": 75575 }, { "epoch": 8.41741842075955, "grad_norm": 0.1439538598060608, "learning_rate": 3.719149943202435e-06, "loss": 0.4599, "num_input_tokens_seen": 91691568, "step": 75580 }, { "epoch": 8.417975275643167, "grad_norm": 0.09319978207349777, "learning_rate": 3.7166001589755224e-06, "loss": 0.4522, "num_input_tokens_seen": 91697328, "step": 75585 }, { "epoch": 8.418532130526785, "grad_norm": 0.10986176878213882, "learning_rate": 3.714051178903666e-06, "loss": 0.4664, "num_input_tokens_seen": 91703312, "step": 75590 }, { "epoch": 8.419088985410403, "grad_norm": 0.09400904923677444, "learning_rate": 3.7115030030831667e-06, "loss": 0.457, "num_input_tokens_seen": 91709392, "step": 75595 }, { "epoch": 8.41964584029402, "grad_norm": 0.10704122483730316, "learning_rate": 3.708955631610303e-06, "loss": 0.4681, "num_input_tokens_seen": 91715632, "step": 75600 }, { "epoch": 8.420202695177636, "grad_norm": 0.10025864839553833, "learning_rate": 3.7064090645813344e-06, "loss": 0.4605, "num_input_tokens_seen": 91721840, "step": 75605 }, { "epoch": 8.420759550061254, "grad_norm": 0.12065153568983078, "learning_rate": 3.703863302092464e-06, "loss": 0.4579, "num_input_tokens_seen": 91727664, "step": 75610 }, { "epoch": 8.421316404944871, "grad_norm": 0.13866187632083893, "learning_rate": 3.7013183442398904e-06, "loss": 0.4567, "num_input_tokens_seen": 91733264, "step": 75615 }, { "epoch": 8.421873259828489, "grad_norm": 0.14784736931324005, "learning_rate": 3.6987741911197583e-06, "loss": 0.4599, "num_input_tokens_seen": 91739536, "step": 75620 }, { "epoch": 8.422430114712107, "grad_norm": 0.10118915885686874, "learning_rate": 3.6962308428281993e-06, "loss": 0.4729, "num_input_tokens_seen": 91745456, "step": 75625 }, { "epoch": 8.422986969595723, "grad_norm": 0.1038021668791771, "learning_rate": 3.6936882994613203e-06, "loss": 0.4649, "num_input_tokens_seen": 91751568, "step": 75630 }, { "epoch": 8.42354382447934, "grad_norm": 0.12004825472831726, "learning_rate": 3.6911465611151663e-06, "loss": 0.4634, "num_input_tokens_seen": 91757712, "step": 75635 }, { "epoch": 8.424100679362958, "grad_norm": 0.12629817426204681, "learning_rate": 3.688605627885791e-06, "loss": 0.455, "num_input_tokens_seen": 91763952, "step": 75640 }, { "epoch": 8.424657534246576, "grad_norm": 0.14269466698169708, "learning_rate": 3.6860654998691846e-06, "loss": 0.4651, "num_input_tokens_seen": 91770160, "step": 75645 }, { "epoch": 8.425214389130193, "grad_norm": 0.0953710526227951, "learning_rate": 3.683526177161334e-06, "loss": 0.4629, "num_input_tokens_seen": 91776176, "step": 75650 }, { "epoch": 8.42577124401381, "grad_norm": 0.09766826778650284, "learning_rate": 3.6809876598581704e-06, "loss": 0.4529, "num_input_tokens_seen": 91782192, "step": 75655 }, { "epoch": 8.426328098897427, "grad_norm": 0.10215818881988525, "learning_rate": 3.678449948055612e-06, "loss": 0.4613, "num_input_tokens_seen": 91788560, "step": 75660 }, { "epoch": 8.426884953781045, "grad_norm": 0.08734828233718872, "learning_rate": 3.675913041849549e-06, "loss": 0.4762, "num_input_tokens_seen": 91794736, "step": 75665 }, { "epoch": 8.427441808664662, "grad_norm": 0.14181599020957947, "learning_rate": 3.673376941335824e-06, "loss": 0.4623, "num_input_tokens_seen": 91800880, "step": 75670 }, { "epoch": 8.42799866354828, "grad_norm": 0.08568783849477768, "learning_rate": 3.6708416466102654e-06, "loss": 0.4659, "num_input_tokens_seen": 91806352, "step": 75675 }, { "epoch": 8.428555518431896, "grad_norm": 0.10502563416957855, "learning_rate": 3.668307157768658e-06, "loss": 0.461, "num_input_tokens_seen": 91812336, "step": 75680 }, { "epoch": 8.429112373315514, "grad_norm": 0.09691103547811508, "learning_rate": 3.6657734749067723e-06, "loss": 0.4569, "num_input_tokens_seen": 91818192, "step": 75685 }, { "epoch": 8.429669228199131, "grad_norm": 0.0992780551314354, "learning_rate": 3.6632405981203266e-06, "loss": 0.4771, "num_input_tokens_seen": 91823856, "step": 75690 }, { "epoch": 8.430226083082749, "grad_norm": 0.09999186545610428, "learning_rate": 3.6607085275050303e-06, "loss": 0.4799, "num_input_tokens_seen": 91829872, "step": 75695 }, { "epoch": 8.430782937966367, "grad_norm": 0.12429013848304749, "learning_rate": 3.6581772631565535e-06, "loss": 0.4694, "num_input_tokens_seen": 91835984, "step": 75700 }, { "epoch": 8.431339792849982, "grad_norm": 0.11561053991317749, "learning_rate": 3.6556468051705315e-06, "loss": 0.4543, "num_input_tokens_seen": 91841392, "step": 75705 }, { "epoch": 8.4318966477336, "grad_norm": 0.07949510216712952, "learning_rate": 3.6531171536425766e-06, "loss": 0.4562, "num_input_tokens_seen": 91847184, "step": 75710 }, { "epoch": 8.432453502617218, "grad_norm": 0.12858673930168152, "learning_rate": 3.6505883086682626e-06, "loss": 0.4774, "num_input_tokens_seen": 91853264, "step": 75715 }, { "epoch": 8.433010357500835, "grad_norm": 0.1013130396604538, "learning_rate": 3.6480602703431427e-06, "loss": 0.4564, "num_input_tokens_seen": 91858448, "step": 75720 }, { "epoch": 8.433567212384453, "grad_norm": 0.09451030194759369, "learning_rate": 3.645533038762727e-06, "loss": 0.4778, "num_input_tokens_seen": 91864848, "step": 75725 }, { "epoch": 8.434124067268069, "grad_norm": 0.09942173957824707, "learning_rate": 3.6430066140225067e-06, "loss": 0.4518, "num_input_tokens_seen": 91871056, "step": 75730 }, { "epoch": 8.434680922151687, "grad_norm": 0.09193957597017288, "learning_rate": 3.640480996217943e-06, "loss": 0.4672, "num_input_tokens_seen": 91877168, "step": 75735 }, { "epoch": 8.435237777035304, "grad_norm": 0.0866888239979744, "learning_rate": 3.637956185444452e-06, "loss": 0.4613, "num_input_tokens_seen": 91883088, "step": 75740 }, { "epoch": 8.435794631918922, "grad_norm": 0.12249726057052612, "learning_rate": 3.6354321817974435e-06, "loss": 0.4602, "num_input_tokens_seen": 91889008, "step": 75745 }, { "epoch": 8.43635148680254, "grad_norm": 0.10293542593717575, "learning_rate": 3.6329089853722632e-06, "loss": 0.4544, "num_input_tokens_seen": 91895088, "step": 75750 }, { "epoch": 8.436908341686157, "grad_norm": 0.10321228206157684, "learning_rate": 3.630386596264257e-06, "loss": 0.4432, "num_input_tokens_seen": 91901104, "step": 75755 }, { "epoch": 8.437465196569773, "grad_norm": 0.08596663922071457, "learning_rate": 3.6278650145687347e-06, "loss": 0.4581, "num_input_tokens_seen": 91907120, "step": 75760 }, { "epoch": 8.438022051453391, "grad_norm": 0.12297812849283218, "learning_rate": 3.625344240380954e-06, "loss": 0.4711, "num_input_tokens_seen": 91913264, "step": 75765 }, { "epoch": 8.438578906337009, "grad_norm": 0.11775368452072144, "learning_rate": 3.6228242737961743e-06, "loss": 0.4564, "num_input_tokens_seen": 91919216, "step": 75770 }, { "epoch": 8.439135761220626, "grad_norm": 0.14022986590862274, "learning_rate": 3.6203051149095973e-06, "loss": 0.4563, "num_input_tokens_seen": 91925328, "step": 75775 }, { "epoch": 8.439692616104244, "grad_norm": 0.10229062288999557, "learning_rate": 3.6177867638164054e-06, "loss": 0.4663, "num_input_tokens_seen": 91930928, "step": 75780 }, { "epoch": 8.44024947098786, "grad_norm": 0.10122685134410858, "learning_rate": 3.6152692206117557e-06, "loss": 0.4703, "num_input_tokens_seen": 91937008, "step": 75785 }, { "epoch": 8.440806325871478, "grad_norm": 0.12761008739471436, "learning_rate": 3.6127524853907607e-06, "loss": 0.4591, "num_input_tokens_seen": 91943248, "step": 75790 }, { "epoch": 8.441363180755095, "grad_norm": 0.0935501679778099, "learning_rate": 3.6102365582485225e-06, "loss": 0.4599, "num_input_tokens_seen": 91949072, "step": 75795 }, { "epoch": 8.441920035638713, "grad_norm": 0.1484040766954422, "learning_rate": 3.6077214392800873e-06, "loss": 0.4567, "num_input_tokens_seen": 91955184, "step": 75800 }, { "epoch": 8.44247689052233, "grad_norm": 0.10313325375318527, "learning_rate": 3.60520712858049e-06, "loss": 0.4618, "num_input_tokens_seen": 91961104, "step": 75805 }, { "epoch": 8.443033745405947, "grad_norm": 0.13100987672805786, "learning_rate": 3.602693626244735e-06, "loss": 0.4658, "num_input_tokens_seen": 91967376, "step": 75810 }, { "epoch": 8.443590600289564, "grad_norm": 0.10307450592517853, "learning_rate": 3.600180932367783e-06, "loss": 0.4793, "num_input_tokens_seen": 91973456, "step": 75815 }, { "epoch": 8.444147455173182, "grad_norm": 0.10282537341117859, "learning_rate": 3.59766904704458e-06, "loss": 0.4606, "num_input_tokens_seen": 91979504, "step": 75820 }, { "epoch": 8.4447043100568, "grad_norm": 0.10319533199071884, "learning_rate": 3.5951579703700193e-06, "loss": 0.4574, "num_input_tokens_seen": 91985840, "step": 75825 }, { "epoch": 8.445261164940417, "grad_norm": 0.08820848166942596, "learning_rate": 3.592647702438995e-06, "loss": 0.4587, "num_input_tokens_seen": 91992208, "step": 75830 }, { "epoch": 8.445818019824033, "grad_norm": 0.10233018547296524, "learning_rate": 3.590138243346336e-06, "loss": 0.4622, "num_input_tokens_seen": 91998352, "step": 75835 }, { "epoch": 8.44637487470765, "grad_norm": 0.11807971447706223, "learning_rate": 3.587629593186864e-06, "loss": 0.4749, "num_input_tokens_seen": 92004528, "step": 75840 }, { "epoch": 8.446931729591268, "grad_norm": 0.09470709413290024, "learning_rate": 3.585121752055373e-06, "loss": 0.4614, "num_input_tokens_seen": 92009776, "step": 75845 }, { "epoch": 8.447488584474886, "grad_norm": 0.15979301929473877, "learning_rate": 3.5826147200466036e-06, "loss": 0.462, "num_input_tokens_seen": 92015824, "step": 75850 }, { "epoch": 8.448045439358504, "grad_norm": 0.11546080559492111, "learning_rate": 3.580108497255291e-06, "loss": 0.46, "num_input_tokens_seen": 92022000, "step": 75855 }, { "epoch": 8.44860229424212, "grad_norm": 0.08365736901760101, "learning_rate": 3.5776030837761152e-06, "loss": 0.463, "num_input_tokens_seen": 92028208, "step": 75860 }, { "epoch": 8.449159149125737, "grad_norm": 0.11967849731445312, "learning_rate": 3.575098479703756e-06, "loss": 0.4563, "num_input_tokens_seen": 92034480, "step": 75865 }, { "epoch": 8.449716004009355, "grad_norm": 0.12949040532112122, "learning_rate": 3.5725946851328266e-06, "loss": 0.4742, "num_input_tokens_seen": 92040496, "step": 75870 }, { "epoch": 8.450272858892973, "grad_norm": 0.08515138924121857, "learning_rate": 3.5700917001579376e-06, "loss": 0.4586, "num_input_tokens_seen": 92046704, "step": 75875 }, { "epoch": 8.45082971377659, "grad_norm": 0.07716716080904007, "learning_rate": 3.5675895248736684e-06, "loss": 0.461, "num_input_tokens_seen": 92052848, "step": 75880 }, { "epoch": 8.451386568660206, "grad_norm": 0.1477992832660675, "learning_rate": 3.565088159374541e-06, "loss": 0.4675, "num_input_tokens_seen": 92059024, "step": 75885 }, { "epoch": 8.451943423543824, "grad_norm": 0.1079457625746727, "learning_rate": 3.5625876037550825e-06, "loss": 0.4639, "num_input_tokens_seen": 92064944, "step": 75890 }, { "epoch": 8.452500278427442, "grad_norm": 0.1267271488904953, "learning_rate": 3.560087858109759e-06, "loss": 0.4622, "num_input_tokens_seen": 92071152, "step": 75895 }, { "epoch": 8.45305713331106, "grad_norm": 0.12838348746299744, "learning_rate": 3.5575889225330284e-06, "loss": 0.4564, "num_input_tokens_seen": 92077392, "step": 75900 }, { "epoch": 8.453613988194677, "grad_norm": 0.12712512910366058, "learning_rate": 3.5550907971192983e-06, "loss": 0.4531, "num_input_tokens_seen": 92083824, "step": 75905 }, { "epoch": 8.454170843078295, "grad_norm": 0.11225111782550812, "learning_rate": 3.5525934819629596e-06, "loss": 0.4538, "num_input_tokens_seen": 92089872, "step": 75910 }, { "epoch": 8.45472769796191, "grad_norm": 0.10201715677976608, "learning_rate": 3.550096977158379e-06, "loss": 0.4643, "num_input_tokens_seen": 92096176, "step": 75915 }, { "epoch": 8.455284552845528, "grad_norm": 0.13458839058876038, "learning_rate": 3.547601282799867e-06, "loss": 0.4664, "num_input_tokens_seen": 92102224, "step": 75920 }, { "epoch": 8.455841407729146, "grad_norm": 0.08460313081741333, "learning_rate": 3.545106398981732e-06, "loss": 0.4431, "num_input_tokens_seen": 92108208, "step": 75925 }, { "epoch": 8.456398262612764, "grad_norm": 0.1042143702507019, "learning_rate": 3.542612325798228e-06, "loss": 0.463, "num_input_tokens_seen": 92114256, "step": 75930 }, { "epoch": 8.456955117496381, "grad_norm": 0.09426011145114899, "learning_rate": 3.5401190633435943e-06, "loss": 0.4685, "num_input_tokens_seen": 92120624, "step": 75935 }, { "epoch": 8.457511972379997, "grad_norm": 0.11469276249408722, "learning_rate": 3.537626611712036e-06, "loss": 0.4588, "num_input_tokens_seen": 92126448, "step": 75940 }, { "epoch": 8.458068827263615, "grad_norm": 0.14690954983234406, "learning_rate": 3.535134970997722e-06, "loss": 0.4505, "num_input_tokens_seen": 92132464, "step": 75945 }, { "epoch": 8.458625682147233, "grad_norm": 0.11507456749677658, "learning_rate": 3.5326441412948024e-06, "loss": 0.4599, "num_input_tokens_seen": 92138640, "step": 75950 }, { "epoch": 8.45918253703085, "grad_norm": 0.13904398679733276, "learning_rate": 3.530154122697374e-06, "loss": 0.4553, "num_input_tokens_seen": 92144464, "step": 75955 }, { "epoch": 8.459739391914468, "grad_norm": 0.13099399209022522, "learning_rate": 3.5276649152995333e-06, "loss": 0.468, "num_input_tokens_seen": 92150736, "step": 75960 }, { "epoch": 8.460296246798084, "grad_norm": 0.10335070639848709, "learning_rate": 3.5251765191953196e-06, "loss": 0.4513, "num_input_tokens_seen": 92157008, "step": 75965 }, { "epoch": 8.460853101681701, "grad_norm": 0.15887585282325745, "learning_rate": 3.5226889344787545e-06, "loss": 0.4555, "num_input_tokens_seen": 92163248, "step": 75970 }, { "epoch": 8.46140995656532, "grad_norm": 0.09540645033121109, "learning_rate": 3.5202021612438353e-06, "loss": 0.4569, "num_input_tokens_seen": 92169040, "step": 75975 }, { "epoch": 8.461966811448937, "grad_norm": 0.09547065198421478, "learning_rate": 3.517716199584506e-06, "loss": 0.464, "num_input_tokens_seen": 92175472, "step": 75980 }, { "epoch": 8.462523666332554, "grad_norm": 0.09896598756313324, "learning_rate": 3.5152310495947113e-06, "loss": 0.444, "num_input_tokens_seen": 92181616, "step": 75985 }, { "epoch": 8.46308052121617, "grad_norm": 0.12778779864311218, "learning_rate": 3.512746711368331e-06, "loss": 0.4653, "num_input_tokens_seen": 92187760, "step": 75990 }, { "epoch": 8.463637376099788, "grad_norm": 0.11207265406847, "learning_rate": 3.5102631849992464e-06, "loss": 0.4561, "num_input_tokens_seen": 92193904, "step": 75995 }, { "epoch": 8.464194230983406, "grad_norm": 0.11785311996936798, "learning_rate": 3.507780470581279e-06, "loss": 0.4513, "num_input_tokens_seen": 92199888, "step": 76000 }, { "epoch": 8.464751085867023, "grad_norm": 0.11282473057508469, "learning_rate": 3.50529856820824e-06, "loss": 0.4567, "num_input_tokens_seen": 92206128, "step": 76005 }, { "epoch": 8.465307940750641, "grad_norm": 0.11014008522033691, "learning_rate": 3.5028174779739104e-06, "loss": 0.4662, "num_input_tokens_seen": 92212336, "step": 76010 }, { "epoch": 8.465864795634257, "grad_norm": 0.08255532383918762, "learning_rate": 3.500337199972023e-06, "loss": 0.451, "num_input_tokens_seen": 92218160, "step": 76015 }, { "epoch": 8.466421650517875, "grad_norm": 0.12046676129102707, "learning_rate": 3.4978577342963004e-06, "loss": 0.4705, "num_input_tokens_seen": 92224336, "step": 76020 }, { "epoch": 8.466978505401492, "grad_norm": 0.09020652621984482, "learning_rate": 3.4953790810404118e-06, "loss": 0.4649, "num_input_tokens_seen": 92230192, "step": 76025 }, { "epoch": 8.46753536028511, "grad_norm": 0.11009445041418076, "learning_rate": 3.4929012402980187e-06, "loss": 0.4503, "num_input_tokens_seen": 92235952, "step": 76030 }, { "epoch": 8.468092215168728, "grad_norm": 0.1174549013376236, "learning_rate": 3.490424212162746e-06, "loss": 0.4508, "num_input_tokens_seen": 92242096, "step": 76035 }, { "epoch": 8.468649070052344, "grad_norm": 0.1178177148103714, "learning_rate": 3.4879479967281694e-06, "loss": 0.4676, "num_input_tokens_seen": 92248144, "step": 76040 }, { "epoch": 8.469205924935961, "grad_norm": 0.10559187084436417, "learning_rate": 3.485472594087863e-06, "loss": 0.4697, "num_input_tokens_seen": 92254288, "step": 76045 }, { "epoch": 8.469762779819579, "grad_norm": 0.10113278776407242, "learning_rate": 3.4829980043353445e-06, "loss": 0.4718, "num_input_tokens_seen": 92260720, "step": 76050 }, { "epoch": 8.470319634703197, "grad_norm": 0.11178115755319595, "learning_rate": 3.480524227564119e-06, "loss": 0.4687, "num_input_tokens_seen": 92267024, "step": 76055 }, { "epoch": 8.470876489586814, "grad_norm": 0.10601263493299484, "learning_rate": 3.478051263867646e-06, "loss": 0.4769, "num_input_tokens_seen": 92273456, "step": 76060 }, { "epoch": 8.47143334447043, "grad_norm": 0.09983208030462265, "learning_rate": 3.4755791133393694e-06, "loss": 0.4474, "num_input_tokens_seen": 92279184, "step": 76065 }, { "epoch": 8.471990199354048, "grad_norm": 0.08789770305156708, "learning_rate": 3.4731077760726954e-06, "loss": 0.462, "num_input_tokens_seen": 92285328, "step": 76070 }, { "epoch": 8.472547054237666, "grad_norm": 0.13570645451545715, "learning_rate": 3.4706372521609908e-06, "loss": 0.4624, "num_input_tokens_seen": 92291856, "step": 76075 }, { "epoch": 8.473103909121283, "grad_norm": 0.11847315728664398, "learning_rate": 3.468167541697609e-06, "loss": 0.4619, "num_input_tokens_seen": 92297680, "step": 76080 }, { "epoch": 8.4736607640049, "grad_norm": 0.08808982372283936, "learning_rate": 3.4656986447758554e-06, "loss": 0.4535, "num_input_tokens_seen": 92303856, "step": 76085 }, { "epoch": 8.474217618888517, "grad_norm": 0.13005319237709045, "learning_rate": 3.4632305614890226e-06, "loss": 0.461, "num_input_tokens_seen": 92309328, "step": 76090 }, { "epoch": 8.474774473772134, "grad_norm": 0.08894564211368561, "learning_rate": 3.4607632919303525e-06, "loss": 0.4684, "num_input_tokens_seen": 92315472, "step": 76095 }, { "epoch": 8.475331328655752, "grad_norm": 0.10810106992721558, "learning_rate": 3.45829683619307e-06, "loss": 0.4645, "num_input_tokens_seen": 92321456, "step": 76100 }, { "epoch": 8.47588818353937, "grad_norm": 0.11684960871934891, "learning_rate": 3.4558311943703735e-06, "loss": 0.4599, "num_input_tokens_seen": 92327408, "step": 76105 }, { "epoch": 8.476445038422987, "grad_norm": 0.10920478403568268, "learning_rate": 3.453366366555408e-06, "loss": 0.4532, "num_input_tokens_seen": 92333904, "step": 76110 }, { "epoch": 8.477001893306605, "grad_norm": 0.10820678621530533, "learning_rate": 3.4509023528413175e-06, "loss": 0.4614, "num_input_tokens_seen": 92340112, "step": 76115 }, { "epoch": 8.477558748190221, "grad_norm": 0.12362036854028702, "learning_rate": 3.4484391533211897e-06, "loss": 0.4546, "num_input_tokens_seen": 92346160, "step": 76120 }, { "epoch": 8.478115603073839, "grad_norm": 0.09891141206026077, "learning_rate": 3.4459767680880996e-06, "loss": 0.4659, "num_input_tokens_seen": 92352272, "step": 76125 }, { "epoch": 8.478672457957456, "grad_norm": 0.10419408977031708, "learning_rate": 3.443515197235078e-06, "loss": 0.463, "num_input_tokens_seen": 92358320, "step": 76130 }, { "epoch": 8.479229312841074, "grad_norm": 0.07980488985776901, "learning_rate": 3.441054440855129e-06, "loss": 0.4558, "num_input_tokens_seen": 92364016, "step": 76135 }, { "epoch": 8.479786167724692, "grad_norm": 0.09797840565443039, "learning_rate": 3.438594499041242e-06, "loss": 0.4682, "num_input_tokens_seen": 92370032, "step": 76140 }, { "epoch": 8.480343022608308, "grad_norm": 0.1379263997077942, "learning_rate": 3.4361353718863426e-06, "loss": 0.4586, "num_input_tokens_seen": 92376208, "step": 76145 }, { "epoch": 8.480899877491925, "grad_norm": 0.10948370397090912, "learning_rate": 3.4336770594833616e-06, "loss": 0.4531, "num_input_tokens_seen": 92382160, "step": 76150 }, { "epoch": 8.481456732375543, "grad_norm": 0.09626000374555588, "learning_rate": 3.431219561925167e-06, "loss": 0.4624, "num_input_tokens_seen": 92388112, "step": 76155 }, { "epoch": 8.48201358725916, "grad_norm": 0.12082159519195557, "learning_rate": 3.42876287930462e-06, "loss": 0.4545, "num_input_tokens_seen": 92394256, "step": 76160 }, { "epoch": 8.482570442142778, "grad_norm": 0.1005498617887497, "learning_rate": 3.426307011714544e-06, "loss": 0.4656, "num_input_tokens_seen": 92400560, "step": 76165 }, { "epoch": 8.483127297026394, "grad_norm": 0.09179142862558365, "learning_rate": 3.4238519592477176e-06, "loss": 0.4678, "num_input_tokens_seen": 92406608, "step": 76170 }, { "epoch": 8.483684151910012, "grad_norm": 0.09981299936771393, "learning_rate": 3.4213977219969163e-06, "loss": 0.4643, "num_input_tokens_seen": 92413040, "step": 76175 }, { "epoch": 8.48424100679363, "grad_norm": 0.10692556947469711, "learning_rate": 3.418944300054855e-06, "loss": 0.4538, "num_input_tokens_seen": 92419280, "step": 76180 }, { "epoch": 8.484797861677247, "grad_norm": 0.1837548166513443, "learning_rate": 3.4164916935142456e-06, "loss": 0.4619, "num_input_tokens_seen": 92425648, "step": 76185 }, { "epoch": 8.485354716560865, "grad_norm": 0.14295372366905212, "learning_rate": 3.4140399024677443e-06, "loss": 0.4497, "num_input_tokens_seen": 92431984, "step": 76190 }, { "epoch": 8.48591157144448, "grad_norm": 0.09982835501432419, "learning_rate": 3.4115889270079886e-06, "loss": 0.4554, "num_input_tokens_seen": 92438224, "step": 76195 }, { "epoch": 8.486468426328099, "grad_norm": 0.1065654531121254, "learning_rate": 3.4091387672275904e-06, "loss": 0.4608, "num_input_tokens_seen": 92444240, "step": 76200 }, { "epoch": 8.487025281211716, "grad_norm": 0.1469668596982956, "learning_rate": 3.4066894232191145e-06, "loss": 0.4654, "num_input_tokens_seen": 92450608, "step": 76205 }, { "epoch": 8.487582136095334, "grad_norm": 0.09840672463178635, "learning_rate": 3.404240895075114e-06, "loss": 0.4664, "num_input_tokens_seen": 92456560, "step": 76210 }, { "epoch": 8.488138990978952, "grad_norm": 0.20864884555339813, "learning_rate": 3.4017931828881024e-06, "loss": 0.4639, "num_input_tokens_seen": 92462736, "step": 76215 }, { "epoch": 8.488695845862567, "grad_norm": 0.09039304405450821, "learning_rate": 3.399346286750557e-06, "loss": 0.4613, "num_input_tokens_seen": 92469072, "step": 76220 }, { "epoch": 8.489252700746185, "grad_norm": 0.08263629674911499, "learning_rate": 3.3969002067549327e-06, "loss": 0.4638, "num_input_tokens_seen": 92474960, "step": 76225 }, { "epoch": 8.489809555629803, "grad_norm": 0.14413803815841675, "learning_rate": 3.394454942993647e-06, "loss": 0.4709, "num_input_tokens_seen": 92481072, "step": 76230 }, { "epoch": 8.49036641051342, "grad_norm": 0.12331537157297134, "learning_rate": 3.3920104955590976e-06, "loss": 0.464, "num_input_tokens_seen": 92486992, "step": 76235 }, { "epoch": 8.490923265397038, "grad_norm": 0.12177860736846924, "learning_rate": 3.3895668645436306e-06, "loss": 0.4681, "num_input_tokens_seen": 92493168, "step": 76240 }, { "epoch": 8.491480120280654, "grad_norm": 0.08941967785358429, "learning_rate": 3.3871240500395805e-06, "loss": 0.4621, "num_input_tokens_seen": 92499120, "step": 76245 }, { "epoch": 8.492036975164272, "grad_norm": 0.11606693267822266, "learning_rate": 3.384682052139254e-06, "loss": 0.4627, "num_input_tokens_seen": 92505232, "step": 76250 }, { "epoch": 8.49259383004789, "grad_norm": 0.1210830882191658, "learning_rate": 3.3822408709349023e-06, "loss": 0.4457, "num_input_tokens_seen": 92511536, "step": 76255 }, { "epoch": 8.493150684931507, "grad_norm": 0.09915725141763687, "learning_rate": 3.379800506518774e-06, "loss": 0.4651, "num_input_tokens_seen": 92516912, "step": 76260 }, { "epoch": 8.493707539815125, "grad_norm": 0.12146606296300888, "learning_rate": 3.3773609589830615e-06, "loss": 0.4516, "num_input_tokens_seen": 92522896, "step": 76265 }, { "epoch": 8.494264394698742, "grad_norm": 0.09586182981729507, "learning_rate": 3.3749222284199494e-06, "loss": 0.4595, "num_input_tokens_seen": 92528816, "step": 76270 }, { "epoch": 8.494821249582358, "grad_norm": 0.11874350905418396, "learning_rate": 3.372484314921573e-06, "loss": 0.455, "num_input_tokens_seen": 92534800, "step": 76275 }, { "epoch": 8.495378104465976, "grad_norm": 0.11115393787622452, "learning_rate": 3.3700472185800497e-06, "loss": 0.4626, "num_input_tokens_seen": 92541040, "step": 76280 }, { "epoch": 8.495934959349594, "grad_norm": 0.11029364168643951, "learning_rate": 3.3676109394874617e-06, "loss": 0.4685, "num_input_tokens_seen": 92547088, "step": 76285 }, { "epoch": 8.496491814233211, "grad_norm": 0.09770959615707397, "learning_rate": 3.3651754777358517e-06, "loss": 0.4432, "num_input_tokens_seen": 92552944, "step": 76290 }, { "epoch": 8.497048669116829, "grad_norm": 0.1078195571899414, "learning_rate": 3.3627408334172495e-06, "loss": 0.4581, "num_input_tokens_seen": 92558992, "step": 76295 }, { "epoch": 8.497605524000445, "grad_norm": 0.10919395089149475, "learning_rate": 3.3603070066236336e-06, "loss": 0.4646, "num_input_tokens_seen": 92565232, "step": 76300 }, { "epoch": 8.498162378884063, "grad_norm": 0.10465555638074875, "learning_rate": 3.357873997446967e-06, "loss": 0.4681, "num_input_tokens_seen": 92571408, "step": 76305 }, { "epoch": 8.49871923376768, "grad_norm": 0.08862896263599396, "learning_rate": 3.355441805979184e-06, "loss": 0.4654, "num_input_tokens_seen": 92577520, "step": 76310 }, { "epoch": 8.499276088651298, "grad_norm": 0.10198518633842468, "learning_rate": 3.3530104323121647e-06, "loss": 0.4454, "num_input_tokens_seen": 92583504, "step": 76315 }, { "epoch": 8.499832943534916, "grad_norm": 0.11185357719659805, "learning_rate": 3.3505798765377876e-06, "loss": 0.4678, "num_input_tokens_seen": 92589680, "step": 76320 }, { "epoch": 8.500389798418531, "grad_norm": 0.12488065659999847, "learning_rate": 3.34815013874788e-06, "loss": 0.4574, "num_input_tokens_seen": 92595952, "step": 76325 }, { "epoch": 8.50094665330215, "grad_norm": 0.20786887407302856, "learning_rate": 3.345721219034248e-06, "loss": 0.4462, "num_input_tokens_seen": 92602480, "step": 76330 }, { "epoch": 8.50094665330215, "eval_loss": 0.4634992778301239, "eval_runtime": 113.0481, "eval_samples_per_second": 35.304, "eval_steps_per_second": 8.828, "num_input_tokens_seen": 92602480, "step": 76330 }, { "epoch": 8.501503508185767, "grad_norm": 0.11167586594820023, "learning_rate": 3.3432931174886607e-06, "loss": 0.4653, "num_input_tokens_seen": 92608560, "step": 76335 }, { "epoch": 8.502060363069385, "grad_norm": 0.07881639152765274, "learning_rate": 3.340865834202858e-06, "loss": 0.4619, "num_input_tokens_seen": 92613840, "step": 76340 }, { "epoch": 8.502617217953002, "grad_norm": 0.13065221905708313, "learning_rate": 3.3384393692685616e-06, "loss": 0.4454, "num_input_tokens_seen": 92619984, "step": 76345 }, { "epoch": 8.503174072836618, "grad_norm": 0.0784086361527443, "learning_rate": 3.33601372277744e-06, "loss": 0.4636, "num_input_tokens_seen": 92626288, "step": 76350 }, { "epoch": 8.503730927720236, "grad_norm": 0.10286488384008408, "learning_rate": 3.333588894821149e-06, "loss": 0.4501, "num_input_tokens_seen": 92632560, "step": 76355 }, { "epoch": 8.504287782603853, "grad_norm": 0.08986833691596985, "learning_rate": 3.3311648854912973e-06, "loss": 0.4615, "num_input_tokens_seen": 92638704, "step": 76360 }, { "epoch": 8.504844637487471, "grad_norm": 0.10382771492004395, "learning_rate": 3.3287416948794835e-06, "loss": 0.4513, "num_input_tokens_seen": 92644656, "step": 76365 }, { "epoch": 8.505401492371089, "grad_norm": 0.08835423737764359, "learning_rate": 3.3263193230772538e-06, "loss": 0.4647, "num_input_tokens_seen": 92650768, "step": 76370 }, { "epoch": 8.505958347254705, "grad_norm": 0.07700040191411972, "learning_rate": 3.323897770176135e-06, "loss": 0.4594, "num_input_tokens_seen": 92657072, "step": 76375 }, { "epoch": 8.506515202138322, "grad_norm": 0.1181364506483078, "learning_rate": 3.3214770362676285e-06, "loss": 0.4716, "num_input_tokens_seen": 92662896, "step": 76380 }, { "epoch": 8.50707205702194, "grad_norm": 0.08670547604560852, "learning_rate": 3.3190571214431866e-06, "loss": 0.4616, "num_input_tokens_seen": 92669136, "step": 76385 }, { "epoch": 8.507628911905558, "grad_norm": 0.11653126031160355, "learning_rate": 3.3166380257942496e-06, "loss": 0.4542, "num_input_tokens_seen": 92675216, "step": 76390 }, { "epoch": 8.508185766789175, "grad_norm": 0.088577501475811, "learning_rate": 3.3142197494122144e-06, "loss": 0.4565, "num_input_tokens_seen": 92680880, "step": 76395 }, { "epoch": 8.508742621672791, "grad_norm": 0.09823653101921082, "learning_rate": 3.311802292388455e-06, "loss": 0.4529, "num_input_tokens_seen": 92686992, "step": 76400 }, { "epoch": 8.509299476556409, "grad_norm": 0.13968850672245026, "learning_rate": 3.309385654814304e-06, "loss": 0.4708, "num_input_tokens_seen": 92692880, "step": 76405 }, { "epoch": 8.509856331440027, "grad_norm": 0.10649989545345306, "learning_rate": 3.3069698367810738e-06, "loss": 0.4679, "num_input_tokens_seen": 92698928, "step": 76410 }, { "epoch": 8.510413186323644, "grad_norm": 0.10938365012407303, "learning_rate": 3.304554838380047e-06, "loss": 0.4547, "num_input_tokens_seen": 92705072, "step": 76415 }, { "epoch": 8.510970041207262, "grad_norm": 0.11047220230102539, "learning_rate": 3.30214065970246e-06, "loss": 0.4624, "num_input_tokens_seen": 92711280, "step": 76420 }, { "epoch": 8.511526896090878, "grad_norm": 0.14411166310310364, "learning_rate": 3.299727300839539e-06, "loss": 0.4686, "num_input_tokens_seen": 92717328, "step": 76425 }, { "epoch": 8.512083750974496, "grad_norm": 0.10332783311605453, "learning_rate": 3.2973147618824557e-06, "loss": 0.4664, "num_input_tokens_seen": 92723440, "step": 76430 }, { "epoch": 8.512640605858113, "grad_norm": 0.1264384239912033, "learning_rate": 3.2949030429223704e-06, "loss": 0.4696, "num_input_tokens_seen": 92729296, "step": 76435 }, { "epoch": 8.513197460741731, "grad_norm": 0.11699835956096649, "learning_rate": 3.292492144050413e-06, "loss": 0.4665, "num_input_tokens_seen": 92735440, "step": 76440 }, { "epoch": 8.513754315625349, "grad_norm": 0.11390286684036255, "learning_rate": 3.2900820653576606e-06, "loss": 0.4527, "num_input_tokens_seen": 92741264, "step": 76445 }, { "epoch": 8.514311170508964, "grad_norm": 0.1148906871676445, "learning_rate": 3.2876728069351855e-06, "loss": 0.4644, "num_input_tokens_seen": 92747600, "step": 76450 }, { "epoch": 8.514868025392582, "grad_norm": 0.09295342862606049, "learning_rate": 3.285264368874008e-06, "loss": 0.4633, "num_input_tokens_seen": 92753584, "step": 76455 }, { "epoch": 8.5154248802762, "grad_norm": 0.10179319977760315, "learning_rate": 3.282856751265137e-06, "loss": 0.4746, "num_input_tokens_seen": 92759472, "step": 76460 }, { "epoch": 8.515981735159817, "grad_norm": 0.12181670218706131, "learning_rate": 3.280449954199527e-06, "loss": 0.4608, "num_input_tokens_seen": 92765200, "step": 76465 }, { "epoch": 8.516538590043435, "grad_norm": 0.08378402143716812, "learning_rate": 3.278043977768122e-06, "loss": 0.4604, "num_input_tokens_seen": 92771408, "step": 76470 }, { "epoch": 8.517095444927053, "grad_norm": 0.12260833382606506, "learning_rate": 3.275638822061833e-06, "loss": 0.4627, "num_input_tokens_seen": 92777200, "step": 76475 }, { "epoch": 8.517652299810669, "grad_norm": 0.09974024444818497, "learning_rate": 3.2732344871715196e-06, "loss": 0.4613, "num_input_tokens_seen": 92782704, "step": 76480 }, { "epoch": 8.518209154694286, "grad_norm": 0.11264690011739731, "learning_rate": 3.270830973188044e-06, "loss": 0.4546, "num_input_tokens_seen": 92788944, "step": 76485 }, { "epoch": 8.518766009577904, "grad_norm": 0.08796227723360062, "learning_rate": 3.2684282802022016e-06, "loss": 0.4638, "num_input_tokens_seen": 92795152, "step": 76490 }, { "epoch": 8.519322864461522, "grad_norm": 0.08756960928440094, "learning_rate": 3.266026408304787e-06, "loss": 0.4533, "num_input_tokens_seen": 92801424, "step": 76495 }, { "epoch": 8.51987971934514, "grad_norm": 0.09780537337064743, "learning_rate": 3.263625357586536e-06, "loss": 0.4678, "num_input_tokens_seen": 92807184, "step": 76500 }, { "epoch": 8.520436574228755, "grad_norm": 0.12784947454929352, "learning_rate": 3.261225128138182e-06, "loss": 0.4592, "num_input_tokens_seen": 92813296, "step": 76505 }, { "epoch": 8.520993429112373, "grad_norm": 0.09943709522485733, "learning_rate": 3.2588257200504096e-06, "loss": 0.4557, "num_input_tokens_seen": 92818992, "step": 76510 }, { "epoch": 8.52155028399599, "grad_norm": 0.10137031972408295, "learning_rate": 3.2564271334138695e-06, "loss": 0.4434, "num_input_tokens_seen": 92825392, "step": 76515 }, { "epoch": 8.522107138879608, "grad_norm": 0.12373503297567368, "learning_rate": 3.2540293683192e-06, "loss": 0.4742, "num_input_tokens_seen": 92831344, "step": 76520 }, { "epoch": 8.522663993763226, "grad_norm": 0.12512721121311188, "learning_rate": 3.251632424856982e-06, "loss": 0.4584, "num_input_tokens_seen": 92837328, "step": 76525 }, { "epoch": 8.523220848646842, "grad_norm": 0.11526721715927124, "learning_rate": 3.249236303117789e-06, "loss": 0.4558, "num_input_tokens_seen": 92843248, "step": 76530 }, { "epoch": 8.52377770353046, "grad_norm": 0.13893280923366547, "learning_rate": 3.2468410031921555e-06, "loss": 0.472, "num_input_tokens_seen": 92849456, "step": 76535 }, { "epoch": 8.524334558414077, "grad_norm": 0.12477158010005951, "learning_rate": 3.2444465251705775e-06, "loss": 0.4672, "num_input_tokens_seen": 92855440, "step": 76540 }, { "epoch": 8.524891413297695, "grad_norm": 0.0922958254814148, "learning_rate": 3.242052869143533e-06, "loss": 0.4775, "num_input_tokens_seen": 92861456, "step": 76545 }, { "epoch": 8.525448268181313, "grad_norm": 0.06394843757152557, "learning_rate": 3.23966003520145e-06, "loss": 0.4745, "num_input_tokens_seen": 92867760, "step": 76550 }, { "epoch": 8.526005123064929, "grad_norm": 0.09443697333335876, "learning_rate": 3.237268023434753e-06, "loss": 0.4601, "num_input_tokens_seen": 92873968, "step": 76555 }, { "epoch": 8.526561977948546, "grad_norm": 0.12327374517917633, "learning_rate": 3.234876833933809e-06, "loss": 0.4555, "num_input_tokens_seen": 92879696, "step": 76560 }, { "epoch": 8.527118832832164, "grad_norm": 0.10539673268795013, "learning_rate": 3.2324864667889644e-06, "loss": 0.4569, "num_input_tokens_seen": 92885712, "step": 76565 }, { "epoch": 8.527675687715782, "grad_norm": 0.09921546280384064, "learning_rate": 3.2300969220905447e-06, "loss": 0.4513, "num_input_tokens_seen": 92891408, "step": 76570 }, { "epoch": 8.5282325425994, "grad_norm": 0.09677568823099136, "learning_rate": 3.227708199928825e-06, "loss": 0.4628, "num_input_tokens_seen": 92897200, "step": 76575 }, { "epoch": 8.528789397483015, "grad_norm": 0.13172657787799835, "learning_rate": 3.225320300394066e-06, "loss": 0.4618, "num_input_tokens_seen": 92903280, "step": 76580 }, { "epoch": 8.529346252366633, "grad_norm": 0.13148292899131775, "learning_rate": 3.2229332235764796e-06, "loss": 0.4579, "num_input_tokens_seen": 92909552, "step": 76585 }, { "epoch": 8.52990310725025, "grad_norm": 0.1125614270567894, "learning_rate": 3.220546969566271e-06, "loss": 0.4562, "num_input_tokens_seen": 92915760, "step": 76590 }, { "epoch": 8.530459962133868, "grad_norm": 0.10567010939121246, "learning_rate": 3.21816153845359e-06, "loss": 0.4546, "num_input_tokens_seen": 92921648, "step": 76595 }, { "epoch": 8.531016817017486, "grad_norm": 0.14722132682800293, "learning_rate": 3.2157769303285736e-06, "loss": 0.4734, "num_input_tokens_seen": 92927536, "step": 76600 }, { "epoch": 8.531573671901103, "grad_norm": 0.09392335265874863, "learning_rate": 3.2133931452813153e-06, "loss": 0.4722, "num_input_tokens_seen": 92933648, "step": 76605 }, { "epoch": 8.53213052678472, "grad_norm": 0.0815153494477272, "learning_rate": 3.211010183401875e-06, "loss": 0.4645, "num_input_tokens_seen": 92939696, "step": 76610 }, { "epoch": 8.532687381668337, "grad_norm": 0.08985725790262222, "learning_rate": 3.2086280447802983e-06, "loss": 0.4585, "num_input_tokens_seen": 92945872, "step": 76615 }, { "epoch": 8.533244236551955, "grad_norm": 0.18396621942520142, "learning_rate": 3.206246729506593e-06, "loss": 0.4551, "num_input_tokens_seen": 92952112, "step": 76620 }, { "epoch": 8.533801091435572, "grad_norm": 0.13302293419837952, "learning_rate": 3.2038662376707245e-06, "loss": 0.4616, "num_input_tokens_seen": 92957456, "step": 76625 }, { "epoch": 8.53435794631919, "grad_norm": 0.08869506418704987, "learning_rate": 3.201486569362641e-06, "loss": 0.4716, "num_input_tokens_seen": 92963568, "step": 76630 }, { "epoch": 8.534914801202806, "grad_norm": 0.06241616606712341, "learning_rate": 3.199107724672248e-06, "loss": 0.4589, "num_input_tokens_seen": 92969456, "step": 76635 }, { "epoch": 8.535471656086424, "grad_norm": 0.13668765127658844, "learning_rate": 3.196729703689436e-06, "loss": 0.4607, "num_input_tokens_seen": 92975472, "step": 76640 }, { "epoch": 8.536028510970041, "grad_norm": 0.14234374463558197, "learning_rate": 3.194352506504039e-06, "loss": 0.4578, "num_input_tokens_seen": 92981712, "step": 76645 }, { "epoch": 8.536585365853659, "grad_norm": 0.16671457886695862, "learning_rate": 3.1919761332058873e-06, "loss": 0.4506, "num_input_tokens_seen": 92987696, "step": 76650 }, { "epoch": 8.537142220737277, "grad_norm": 0.13418370485305786, "learning_rate": 3.1896005838847666e-06, "loss": 0.4562, "num_input_tokens_seen": 92993104, "step": 76655 }, { "epoch": 8.537699075620893, "grad_norm": 0.09572940319776535, "learning_rate": 3.1872258586304273e-06, "loss": 0.4534, "num_input_tokens_seen": 92999312, "step": 76660 }, { "epoch": 8.53825593050451, "grad_norm": 0.11458314955234528, "learning_rate": 3.184851957532603e-06, "loss": 0.4645, "num_input_tokens_seen": 93005328, "step": 76665 }, { "epoch": 8.538812785388128, "grad_norm": 0.13668720424175262, "learning_rate": 3.182478880680978e-06, "loss": 0.4674, "num_input_tokens_seen": 93011376, "step": 76670 }, { "epoch": 8.539369640271746, "grad_norm": 0.13122344017028809, "learning_rate": 3.180106628165222e-06, "loss": 0.4599, "num_input_tokens_seen": 93017712, "step": 76675 }, { "epoch": 8.539926495155363, "grad_norm": 0.10423966497182846, "learning_rate": 3.1777352000749576e-06, "loss": 0.4566, "num_input_tokens_seen": 93023728, "step": 76680 }, { "epoch": 8.54048335003898, "grad_norm": 0.10923110693693161, "learning_rate": 3.1753645964997884e-06, "loss": 0.4614, "num_input_tokens_seen": 93029584, "step": 76685 }, { "epoch": 8.541040204922597, "grad_norm": 0.1181526631116867, "learning_rate": 3.1729948175292928e-06, "loss": 0.4342, "num_input_tokens_seen": 93036144, "step": 76690 }, { "epoch": 8.541597059806215, "grad_norm": 0.12159988284111023, "learning_rate": 3.170625863252996e-06, "loss": 0.4605, "num_input_tokens_seen": 93042064, "step": 76695 }, { "epoch": 8.542153914689832, "grad_norm": 0.10208968073129654, "learning_rate": 3.168257733760413e-06, "loss": 0.4609, "num_input_tokens_seen": 93048176, "step": 76700 }, { "epoch": 8.54271076957345, "grad_norm": 0.15992560982704163, "learning_rate": 3.1658904291410136e-06, "loss": 0.4652, "num_input_tokens_seen": 93054480, "step": 76705 }, { "epoch": 8.543267624457066, "grad_norm": 0.13752681016921997, "learning_rate": 3.1635239494842426e-06, "loss": 0.4557, "num_input_tokens_seen": 93060272, "step": 76710 }, { "epoch": 8.543824479340683, "grad_norm": 0.09813536703586578, "learning_rate": 3.161158294879521e-06, "loss": 0.4585, "num_input_tokens_seen": 93066256, "step": 76715 }, { "epoch": 8.544381334224301, "grad_norm": 0.09243205934762955, "learning_rate": 3.158793465416221e-06, "loss": 0.4703, "num_input_tokens_seen": 93072464, "step": 76720 }, { "epoch": 8.544938189107919, "grad_norm": 0.14721886813640594, "learning_rate": 3.156429461183702e-06, "loss": 0.4682, "num_input_tokens_seen": 93078768, "step": 76725 }, { "epoch": 8.545495043991536, "grad_norm": 0.12620864808559418, "learning_rate": 3.154066282271273e-06, "loss": 0.466, "num_input_tokens_seen": 93084912, "step": 76730 }, { "epoch": 8.546051898875152, "grad_norm": 0.13462519645690918, "learning_rate": 3.151703928768235e-06, "loss": 0.4565, "num_input_tokens_seen": 93090928, "step": 76735 }, { "epoch": 8.54660875375877, "grad_norm": 0.1273854523897171, "learning_rate": 3.149342400763833e-06, "loss": 0.4568, "num_input_tokens_seen": 93097232, "step": 76740 }, { "epoch": 8.547165608642388, "grad_norm": 0.16675984859466553, "learning_rate": 3.1469816983472983e-06, "loss": 0.4485, "num_input_tokens_seen": 93103632, "step": 76745 }, { "epoch": 8.547722463526005, "grad_norm": 0.12451519072055817, "learning_rate": 3.1446218216078323e-06, "loss": 0.4592, "num_input_tokens_seen": 93109840, "step": 76750 }, { "epoch": 8.548279318409623, "grad_norm": 0.1437477022409439, "learning_rate": 3.142262770634588e-06, "loss": 0.4584, "num_input_tokens_seen": 93115408, "step": 76755 }, { "epoch": 8.548836173293239, "grad_norm": 0.08648794889450073, "learning_rate": 3.139904545516706e-06, "loss": 0.4613, "num_input_tokens_seen": 93121360, "step": 76760 }, { "epoch": 8.549393028176857, "grad_norm": 0.08594007790088654, "learning_rate": 3.137547146343281e-06, "loss": 0.4496, "num_input_tokens_seen": 93127504, "step": 76765 }, { "epoch": 8.549949883060474, "grad_norm": 0.1518792062997818, "learning_rate": 3.135190573203392e-06, "loss": 0.4578, "num_input_tokens_seen": 93133552, "step": 76770 }, { "epoch": 8.550506737944092, "grad_norm": 0.11171165853738785, "learning_rate": 3.132834826186068e-06, "loss": 0.4495, "num_input_tokens_seen": 93139568, "step": 76775 }, { "epoch": 8.55106359282771, "grad_norm": 0.13952195644378662, "learning_rate": 3.130479905380318e-06, "loss": 0.4566, "num_input_tokens_seen": 93145808, "step": 76780 }, { "epoch": 8.551620447711326, "grad_norm": 0.11929098516702652, "learning_rate": 3.1281258108751294e-06, "loss": 0.4587, "num_input_tokens_seen": 93152112, "step": 76785 }, { "epoch": 8.552177302594943, "grad_norm": 0.1028217151761055, "learning_rate": 3.1257725427594336e-06, "loss": 0.4764, "num_input_tokens_seen": 93158288, "step": 76790 }, { "epoch": 8.552734157478561, "grad_norm": 0.15150855481624603, "learning_rate": 3.1234201011221544e-06, "loss": 0.4654, "num_input_tokens_seen": 93164272, "step": 76795 }, { "epoch": 8.553291012362179, "grad_norm": 0.10576862841844559, "learning_rate": 3.1210684860521674e-06, "loss": 0.4568, "num_input_tokens_seen": 93170320, "step": 76800 }, { "epoch": 8.553847867245796, "grad_norm": 0.1062980517745018, "learning_rate": 3.1187176976383266e-06, "loss": 0.4504, "num_input_tokens_seen": 93176304, "step": 76805 }, { "epoch": 8.554404722129412, "grad_norm": 0.08589867502450943, "learning_rate": 3.1163677359694583e-06, "loss": 0.4592, "num_input_tokens_seen": 93182384, "step": 76810 }, { "epoch": 8.55496157701303, "grad_norm": 0.14342670142650604, "learning_rate": 3.1140186011343413e-06, "loss": 0.4558, "num_input_tokens_seen": 93188400, "step": 76815 }, { "epoch": 8.555518431896648, "grad_norm": 0.11155738681554794, "learning_rate": 3.111670293221744e-06, "loss": 0.4608, "num_input_tokens_seen": 93194448, "step": 76820 }, { "epoch": 8.556075286780265, "grad_norm": 0.11556791514158249, "learning_rate": 3.109322812320381e-06, "loss": 0.4678, "num_input_tokens_seen": 93200464, "step": 76825 }, { "epoch": 8.556632141663883, "grad_norm": 0.08733910322189331, "learning_rate": 3.106976158518962e-06, "loss": 0.46, "num_input_tokens_seen": 93206832, "step": 76830 }, { "epoch": 8.5571889965475, "grad_norm": 0.1420017033815384, "learning_rate": 3.104630331906136e-06, "loss": 0.4503, "num_input_tokens_seen": 93212752, "step": 76835 }, { "epoch": 8.557745851431116, "grad_norm": 0.09366244822740555, "learning_rate": 3.1022853325705454e-06, "loss": 0.4699, "num_input_tokens_seen": 93219056, "step": 76840 }, { "epoch": 8.558302706314734, "grad_norm": 0.10898873209953308, "learning_rate": 3.0999411606007945e-06, "loss": 0.4627, "num_input_tokens_seen": 93225296, "step": 76845 }, { "epoch": 8.558859561198352, "grad_norm": 0.10126093775033951, "learning_rate": 3.097597816085443e-06, "loss": 0.4552, "num_input_tokens_seen": 93231376, "step": 76850 }, { "epoch": 8.55941641608197, "grad_norm": 0.09114402532577515, "learning_rate": 3.0952552991130395e-06, "loss": 0.4588, "num_input_tokens_seen": 93237328, "step": 76855 }, { "epoch": 8.559973270965587, "grad_norm": 0.10977169126272202, "learning_rate": 3.092913609772083e-06, "loss": 0.453, "num_input_tokens_seen": 93243536, "step": 76860 }, { "epoch": 8.560530125849203, "grad_norm": 0.1084647998213768, "learning_rate": 3.090572748151063e-06, "loss": 0.4588, "num_input_tokens_seen": 93249392, "step": 76865 }, { "epoch": 8.56108698073282, "grad_norm": 0.10028766095638275, "learning_rate": 3.08823271433841e-06, "loss": 0.4536, "num_input_tokens_seen": 93254768, "step": 76870 }, { "epoch": 8.561643835616438, "grad_norm": 0.08708929270505905, "learning_rate": 3.085893508422544e-06, "loss": 0.4518, "num_input_tokens_seen": 93260944, "step": 76875 }, { "epoch": 8.562200690500056, "grad_norm": 0.10626287013292313, "learning_rate": 3.0835551304918526e-06, "loss": 0.4556, "num_input_tokens_seen": 93266832, "step": 76880 }, { "epoch": 8.562757545383674, "grad_norm": 0.09374064952135086, "learning_rate": 3.0812175806346793e-06, "loss": 0.4612, "num_input_tokens_seen": 93272464, "step": 76885 }, { "epoch": 8.56331440026729, "grad_norm": 0.0977487862110138, "learning_rate": 3.0788808589393537e-06, "loss": 0.4715, "num_input_tokens_seen": 93278768, "step": 76890 }, { "epoch": 8.563871255150907, "grad_norm": 0.16334369778633118, "learning_rate": 3.076544965494155e-06, "loss": 0.457, "num_input_tokens_seen": 93284592, "step": 76895 }, { "epoch": 8.564428110034525, "grad_norm": 0.0903158113360405, "learning_rate": 3.0742099003873482e-06, "loss": 0.4625, "num_input_tokens_seen": 93290896, "step": 76900 }, { "epoch": 8.564984964918143, "grad_norm": 0.08977383375167847, "learning_rate": 3.0718756637071494e-06, "loss": 0.4733, "num_input_tokens_seen": 93296624, "step": 76905 }, { "epoch": 8.56554181980176, "grad_norm": 0.10494095087051392, "learning_rate": 3.0695422555417602e-06, "loss": 0.4606, "num_input_tokens_seen": 93302640, "step": 76910 }, { "epoch": 8.566098674685376, "grad_norm": 0.11463550478219986, "learning_rate": 3.067209675979352e-06, "loss": 0.4594, "num_input_tokens_seen": 93308400, "step": 76915 }, { "epoch": 8.566655529568994, "grad_norm": 0.10716495662927628, "learning_rate": 3.0648779251080424e-06, "loss": 0.4652, "num_input_tokens_seen": 93314448, "step": 76920 }, { "epoch": 8.567212384452612, "grad_norm": 0.09326210618019104, "learning_rate": 3.062547003015945e-06, "loss": 0.4443, "num_input_tokens_seen": 93320816, "step": 76925 }, { "epoch": 8.56776923933623, "grad_norm": 0.1292419284582138, "learning_rate": 3.0602169097911192e-06, "loss": 0.4564, "num_input_tokens_seen": 93326832, "step": 76930 }, { "epoch": 8.568326094219847, "grad_norm": 0.0969720333814621, "learning_rate": 3.0578876455216066e-06, "loss": 0.4515, "num_input_tokens_seen": 93331856, "step": 76935 }, { "epoch": 8.568882949103463, "grad_norm": 0.08588940650224686, "learning_rate": 3.0555592102954223e-06, "loss": 0.4711, "num_input_tokens_seen": 93338064, "step": 76940 }, { "epoch": 8.56943980398708, "grad_norm": 0.13761137425899506, "learning_rate": 3.0532316042005293e-06, "loss": 0.4449, "num_input_tokens_seen": 93343952, "step": 76945 }, { "epoch": 8.569996658870698, "grad_norm": 0.09330520778894424, "learning_rate": 3.050904827324885e-06, "loss": 0.461, "num_input_tokens_seen": 93349808, "step": 76950 }, { "epoch": 8.570553513754316, "grad_norm": 0.09863994270563126, "learning_rate": 3.048578879756389e-06, "loss": 0.4684, "num_input_tokens_seen": 93356208, "step": 76955 }, { "epoch": 8.571110368637934, "grad_norm": 0.0891728550195694, "learning_rate": 3.0462537615829347e-06, "loss": 0.4659, "num_input_tokens_seen": 93362448, "step": 76960 }, { "epoch": 8.571667223521551, "grad_norm": 0.11072386056184769, "learning_rate": 3.0439294728923623e-06, "loss": 0.4643, "num_input_tokens_seen": 93368112, "step": 76965 }, { "epoch": 8.572224078405167, "grad_norm": 0.10172048211097717, "learning_rate": 3.0416060137724965e-06, "loss": 0.4505, "num_input_tokens_seen": 93374288, "step": 76970 }, { "epoch": 8.572780933288785, "grad_norm": 0.12194043397903442, "learning_rate": 3.0392833843111314e-06, "loss": 0.4567, "num_input_tokens_seen": 93380368, "step": 76975 }, { "epoch": 8.573337788172402, "grad_norm": 0.08663853257894516, "learning_rate": 3.03696158459601e-06, "loss": 0.4687, "num_input_tokens_seen": 93386864, "step": 76980 }, { "epoch": 8.57389464305602, "grad_norm": 0.11418326199054718, "learning_rate": 3.0346406147148677e-06, "loss": 0.4504, "num_input_tokens_seen": 93392976, "step": 76985 }, { "epoch": 8.574451497939638, "grad_norm": 0.12069165706634521, "learning_rate": 3.03232047475539e-06, "loss": 0.4581, "num_input_tokens_seen": 93399472, "step": 76990 }, { "epoch": 8.575008352823254, "grad_norm": 0.13763324916362762, "learning_rate": 3.030001164805249e-06, "loss": 0.4503, "num_input_tokens_seen": 93404880, "step": 76995 }, { "epoch": 8.575565207706871, "grad_norm": 0.09761680662631989, "learning_rate": 3.0276826849520656e-06, "loss": 0.4595, "num_input_tokens_seen": 93410672, "step": 77000 }, { "epoch": 8.576122062590489, "grad_norm": 0.15407422184944153, "learning_rate": 3.025365035283445e-06, "loss": 0.4697, "num_input_tokens_seen": 93416784, "step": 77005 }, { "epoch": 8.576678917474107, "grad_norm": 0.08630480617284775, "learning_rate": 3.0230482158869644e-06, "loss": 0.4687, "num_input_tokens_seen": 93422704, "step": 77010 }, { "epoch": 8.577235772357724, "grad_norm": 0.12294294685125351, "learning_rate": 3.02073222685014e-06, "loss": 0.4609, "num_input_tokens_seen": 93428624, "step": 77015 }, { "epoch": 8.57779262724134, "grad_norm": 0.10808707773685455, "learning_rate": 3.0184170682604875e-06, "loss": 0.4543, "num_input_tokens_seen": 93434544, "step": 77020 }, { "epoch": 8.578349482124958, "grad_norm": 0.1317308247089386, "learning_rate": 3.016102740205487e-06, "loss": 0.465, "num_input_tokens_seen": 93440464, "step": 77025 }, { "epoch": 8.578906337008576, "grad_norm": 0.09208385646343231, "learning_rate": 3.0137892427725715e-06, "loss": 0.4627, "num_input_tokens_seen": 93446352, "step": 77030 }, { "epoch": 8.579463191892193, "grad_norm": 0.11603005975484848, "learning_rate": 3.01147657604916e-06, "loss": 0.4608, "num_input_tokens_seen": 93452720, "step": 77035 }, { "epoch": 8.580020046775811, "grad_norm": 0.1586608737707138, "learning_rate": 3.0091647401226274e-06, "loss": 0.4549, "num_input_tokens_seen": 93458544, "step": 77040 }, { "epoch": 8.580576901659427, "grad_norm": 0.10162895917892456, "learning_rate": 3.0068537350803254e-06, "loss": 0.4675, "num_input_tokens_seen": 93465008, "step": 77045 }, { "epoch": 8.581133756543045, "grad_norm": 0.12062782049179077, "learning_rate": 3.004543561009568e-06, "loss": 0.4634, "num_input_tokens_seen": 93470864, "step": 77050 }, { "epoch": 8.581690611426662, "grad_norm": 0.10367757827043533, "learning_rate": 3.0022342179976402e-06, "loss": 0.4556, "num_input_tokens_seen": 93476912, "step": 77055 }, { "epoch": 8.58224746631028, "grad_norm": 0.10369595140218735, "learning_rate": 2.9999257061318036e-06, "loss": 0.4649, "num_input_tokens_seen": 93483440, "step": 77060 }, { "epoch": 8.582804321193898, "grad_norm": 0.08412890136241913, "learning_rate": 2.997618025499274e-06, "loss": 0.4631, "num_input_tokens_seen": 93489584, "step": 77065 }, { "epoch": 8.583361176077513, "grad_norm": 0.08042141795158386, "learning_rate": 2.9953111761872487e-06, "loss": 0.4585, "num_input_tokens_seen": 93495824, "step": 77070 }, { "epoch": 8.583918030961131, "grad_norm": 0.1182694286108017, "learning_rate": 2.99300515828288e-06, "loss": 0.4713, "num_input_tokens_seen": 93502128, "step": 77075 }, { "epoch": 8.584474885844749, "grad_norm": 0.09692481905221939, "learning_rate": 2.9906999718733007e-06, "loss": 0.4553, "num_input_tokens_seen": 93508208, "step": 77080 }, { "epoch": 8.585031740728367, "grad_norm": 0.1268005073070526, "learning_rate": 2.9883956170456167e-06, "loss": 0.4536, "num_input_tokens_seen": 93514288, "step": 77085 }, { "epoch": 8.585588595611984, "grad_norm": 0.12259657680988312, "learning_rate": 2.9860920938868775e-06, "loss": 0.459, "num_input_tokens_seen": 93520528, "step": 77090 }, { "epoch": 8.5861454504956, "grad_norm": 0.10633698105812073, "learning_rate": 2.9837894024841327e-06, "loss": 0.4531, "num_input_tokens_seen": 93526384, "step": 77095 }, { "epoch": 8.586702305379218, "grad_norm": 0.13194845616817474, "learning_rate": 2.9814875429243743e-06, "loss": 0.4542, "num_input_tokens_seen": 93532688, "step": 77100 }, { "epoch": 8.587259160262835, "grad_norm": 0.07473623752593994, "learning_rate": 2.9791865152945825e-06, "loss": 0.4547, "num_input_tokens_seen": 93538960, "step": 77105 }, { "epoch": 8.587816015146453, "grad_norm": 0.12136093527078629, "learning_rate": 2.9768863196816877e-06, "loss": 0.4516, "num_input_tokens_seen": 93545232, "step": 77110 }, { "epoch": 8.58837287003007, "grad_norm": 0.08452071994543076, "learning_rate": 2.9745869561726037e-06, "loss": 0.4615, "num_input_tokens_seen": 93551344, "step": 77115 }, { "epoch": 8.588929724913687, "grad_norm": 0.11893689632415771, "learning_rate": 2.9722884248542165e-06, "loss": 0.4691, "num_input_tokens_seen": 93557616, "step": 77120 }, { "epoch": 8.589486579797304, "grad_norm": 0.08603442460298538, "learning_rate": 2.969990725813354e-06, "loss": 0.4681, "num_input_tokens_seen": 93563696, "step": 77125 }, { "epoch": 8.590043434680922, "grad_norm": 0.1418527215719223, "learning_rate": 2.96769385913685e-06, "loss": 0.4668, "num_input_tokens_seen": 93569904, "step": 77130 }, { "epoch": 8.59060028956454, "grad_norm": 0.12063416093587875, "learning_rate": 2.96539782491147e-06, "loss": 0.4646, "num_input_tokens_seen": 93576112, "step": 77135 }, { "epoch": 8.591157144448157, "grad_norm": 0.1142091304063797, "learning_rate": 2.9631026232239817e-06, "loss": 0.4591, "num_input_tokens_seen": 93582160, "step": 77140 }, { "epoch": 8.591713999331773, "grad_norm": 0.12544257938861847, "learning_rate": 2.9608082541610906e-06, "loss": 0.4676, "num_input_tokens_seen": 93587984, "step": 77145 }, { "epoch": 8.592270854215391, "grad_norm": 0.09852149337530136, "learning_rate": 2.958514717809491e-06, "loss": 0.4601, "num_input_tokens_seen": 93593872, "step": 77150 }, { "epoch": 8.592827709099009, "grad_norm": 0.11620424687862396, "learning_rate": 2.956222014255844e-06, "loss": 0.4497, "num_input_tokens_seen": 93599824, "step": 77155 }, { "epoch": 8.593384563982626, "grad_norm": 0.09873653948307037, "learning_rate": 2.9539301435867693e-06, "loss": 0.4512, "num_input_tokens_seen": 93606224, "step": 77160 }, { "epoch": 8.593941418866244, "grad_norm": 0.10714909434318542, "learning_rate": 2.9516391058888706e-06, "loss": 0.4702, "num_input_tokens_seen": 93612432, "step": 77165 }, { "epoch": 8.59449827374986, "grad_norm": 0.14675959944725037, "learning_rate": 2.949348901248697e-06, "loss": 0.4602, "num_input_tokens_seen": 93618384, "step": 77170 }, { "epoch": 8.595055128633478, "grad_norm": 0.09686693549156189, "learning_rate": 2.947059529752791e-06, "loss": 0.4632, "num_input_tokens_seen": 93624720, "step": 77175 }, { "epoch": 8.595611983517095, "grad_norm": 0.0995999127626419, "learning_rate": 2.944770991487644e-06, "loss": 0.4528, "num_input_tokens_seen": 93630800, "step": 77180 }, { "epoch": 8.596168838400713, "grad_norm": 0.09726429730653763, "learning_rate": 2.9424832865397268e-06, "loss": 0.4584, "num_input_tokens_seen": 93636976, "step": 77185 }, { "epoch": 8.59672569328433, "grad_norm": 0.11788161098957062, "learning_rate": 2.9401964149954858e-06, "loss": 0.4523, "num_input_tokens_seen": 93643408, "step": 77190 }, { "epoch": 8.597282548167948, "grad_norm": 0.11222357302904129, "learning_rate": 2.937910376941311e-06, "loss": 0.4462, "num_input_tokens_seen": 93649808, "step": 77195 }, { "epoch": 8.597839403051564, "grad_norm": 0.10245831310749054, "learning_rate": 2.935625172463588e-06, "loss": 0.4662, "num_input_tokens_seen": 93656048, "step": 77200 }, { "epoch": 8.598396257935182, "grad_norm": 0.11729340255260468, "learning_rate": 2.9333408016486515e-06, "loss": 0.4539, "num_input_tokens_seen": 93662320, "step": 77205 }, { "epoch": 8.5989531128188, "grad_norm": 0.11705232411623001, "learning_rate": 2.9310572645828147e-06, "loss": 0.455, "num_input_tokens_seen": 93668240, "step": 77210 }, { "epoch": 8.599509967702417, "grad_norm": 0.1056809201836586, "learning_rate": 2.9287745613523655e-06, "loss": 0.4595, "num_input_tokens_seen": 93674480, "step": 77215 }, { "epoch": 8.600066822586035, "grad_norm": 0.1052728146314621, "learning_rate": 2.9264926920435367e-06, "loss": 0.4565, "num_input_tokens_seen": 93680592, "step": 77220 }, { "epoch": 8.60062367746965, "grad_norm": 0.10195520520210266, "learning_rate": 2.9242116567425563e-06, "loss": 0.466, "num_input_tokens_seen": 93686896, "step": 77225 }, { "epoch": 8.601180532353268, "grad_norm": 0.12598562240600586, "learning_rate": 2.9219314555356007e-06, "loss": 0.4581, "num_input_tokens_seen": 93693168, "step": 77230 }, { "epoch": 8.601737387236886, "grad_norm": 0.09585545212030411, "learning_rate": 2.9196520885088314e-06, "loss": 0.4651, "num_input_tokens_seen": 93699440, "step": 77235 }, { "epoch": 8.602294242120504, "grad_norm": 0.11013872176408768, "learning_rate": 2.917373555748362e-06, "loss": 0.4607, "num_input_tokens_seen": 93705744, "step": 77240 }, { "epoch": 8.602851097004121, "grad_norm": 0.19951416552066803, "learning_rate": 2.9150958573402887e-06, "loss": 0.4559, "num_input_tokens_seen": 93711504, "step": 77245 }, { "epoch": 8.603407951887737, "grad_norm": 0.13077469170093536, "learning_rate": 2.9128189933706697e-06, "loss": 0.4577, "num_input_tokens_seen": 93717744, "step": 77250 }, { "epoch": 8.603964806771355, "grad_norm": 0.12431029975414276, "learning_rate": 2.9105429639255305e-06, "loss": 0.461, "num_input_tokens_seen": 93724112, "step": 77255 }, { "epoch": 8.604521661654973, "grad_norm": 0.08800452947616577, "learning_rate": 2.9082677690908693e-06, "loss": 0.4612, "num_input_tokens_seen": 93730320, "step": 77260 }, { "epoch": 8.60507851653859, "grad_norm": 0.09086696058511734, "learning_rate": 2.9059934089526423e-06, "loss": 0.4605, "num_input_tokens_seen": 93736592, "step": 77265 }, { "epoch": 8.605635371422208, "grad_norm": 0.07150887697935104, "learning_rate": 2.903719883596795e-06, "loss": 0.4683, "num_input_tokens_seen": 93742832, "step": 77270 }, { "epoch": 8.606192226305824, "grad_norm": 0.11861705034971237, "learning_rate": 2.9014471931092163e-06, "loss": 0.4581, "num_input_tokens_seen": 93749136, "step": 77275 }, { "epoch": 8.606749081189442, "grad_norm": 0.12129571288824081, "learning_rate": 2.8991753375757795e-06, "loss": 0.447, "num_input_tokens_seen": 93755472, "step": 77280 }, { "epoch": 8.60730593607306, "grad_norm": 0.10022240877151489, "learning_rate": 2.89690431708233e-06, "loss": 0.4625, "num_input_tokens_seen": 93761168, "step": 77285 }, { "epoch": 8.607862790956677, "grad_norm": 0.13041986525058746, "learning_rate": 2.894634131714666e-06, "loss": 0.453, "num_input_tokens_seen": 93767088, "step": 77290 }, { "epoch": 8.608419645840295, "grad_norm": 0.09591680765151978, "learning_rate": 2.8923647815585654e-06, "loss": 0.459, "num_input_tokens_seen": 93773680, "step": 77295 }, { "epoch": 8.60897650072391, "grad_norm": 0.08350066840648651, "learning_rate": 2.890096266699768e-06, "loss": 0.4569, "num_input_tokens_seen": 93779728, "step": 77300 }, { "epoch": 8.609533355607528, "grad_norm": 0.10121259093284607, "learning_rate": 2.887828587223987e-06, "loss": 0.4572, "num_input_tokens_seen": 93785520, "step": 77305 }, { "epoch": 8.610090210491146, "grad_norm": 0.09456092864274979, "learning_rate": 2.885561743216911e-06, "loss": 0.461, "num_input_tokens_seen": 93791056, "step": 77310 }, { "epoch": 8.610647065374764, "grad_norm": 0.13109977543354034, "learning_rate": 2.883295734764174e-06, "loss": 0.462, "num_input_tokens_seen": 93797136, "step": 77315 }, { "epoch": 8.611203920258381, "grad_norm": 0.11863568425178528, "learning_rate": 2.8810305619514063e-06, "loss": 0.4709, "num_input_tokens_seen": 93802992, "step": 77320 }, { "epoch": 8.611760775141999, "grad_norm": 0.09410390257835388, "learning_rate": 2.878766224864182e-06, "loss": 0.4716, "num_input_tokens_seen": 93808784, "step": 77325 }, { "epoch": 8.612317630025615, "grad_norm": 0.10881958901882172, "learning_rate": 2.8765027235880676e-06, "loss": 0.4642, "num_input_tokens_seen": 93814672, "step": 77330 }, { "epoch": 8.612874484909232, "grad_norm": 0.11149358004331589, "learning_rate": 2.8742400582085744e-06, "loss": 0.4674, "num_input_tokens_seen": 93820656, "step": 77335 }, { "epoch": 8.61343133979285, "grad_norm": 0.08339957147836685, "learning_rate": 2.871978228811195e-06, "loss": 0.4504, "num_input_tokens_seen": 93826384, "step": 77340 }, { "epoch": 8.613988194676468, "grad_norm": 0.18979741632938385, "learning_rate": 2.869717235481395e-06, "loss": 0.4554, "num_input_tokens_seen": 93832496, "step": 77345 }, { "epoch": 8.614545049560085, "grad_norm": 0.09693891555070877, "learning_rate": 2.8674570783045943e-06, "loss": 0.4631, "num_input_tokens_seen": 93838608, "step": 77350 }, { "epoch": 8.615101904443701, "grad_norm": 0.11401247978210449, "learning_rate": 2.8651977573661977e-06, "loss": 0.4643, "num_input_tokens_seen": 93844880, "step": 77355 }, { "epoch": 8.615658759327319, "grad_norm": 0.17204803228378296, "learning_rate": 2.8629392727515608e-06, "loss": 0.4511, "num_input_tokens_seen": 93851120, "step": 77360 }, { "epoch": 8.616215614210937, "grad_norm": 0.12337905168533325, "learning_rate": 2.860681624546022e-06, "loss": 0.4733, "num_input_tokens_seen": 93857200, "step": 77365 }, { "epoch": 8.616772469094554, "grad_norm": 0.10769181698560715, "learning_rate": 2.8584248128348757e-06, "loss": 0.4644, "num_input_tokens_seen": 93863440, "step": 77370 }, { "epoch": 8.617329323978172, "grad_norm": 0.12478892505168915, "learning_rate": 2.8561688377033963e-06, "loss": 0.4495, "num_input_tokens_seen": 93869584, "step": 77375 }, { "epoch": 8.617886178861788, "grad_norm": 0.12070834636688232, "learning_rate": 2.8539136992368293e-06, "loss": 0.4673, "num_input_tokens_seen": 93875312, "step": 77380 }, { "epoch": 8.618443033745406, "grad_norm": 0.13137179613113403, "learning_rate": 2.8516593975203647e-06, "loss": 0.4614, "num_input_tokens_seen": 93881296, "step": 77385 }, { "epoch": 8.618999888629023, "grad_norm": 0.09302989393472672, "learning_rate": 2.849405932639193e-06, "loss": 0.4569, "num_input_tokens_seen": 93887344, "step": 77390 }, { "epoch": 8.619556743512641, "grad_norm": 0.17646606266498566, "learning_rate": 2.8471533046784434e-06, "loss": 0.4689, "num_input_tokens_seen": 93893520, "step": 77395 }, { "epoch": 8.620113598396259, "grad_norm": 0.09405835717916489, "learning_rate": 2.8449015137232416e-06, "loss": 0.4542, "num_input_tokens_seen": 93899664, "step": 77400 }, { "epoch": 8.620670453279875, "grad_norm": 0.09687892347574234, "learning_rate": 2.842650559858653e-06, "loss": 0.461, "num_input_tokens_seen": 93905520, "step": 77405 }, { "epoch": 8.621227308163492, "grad_norm": 0.14582815766334534, "learning_rate": 2.8404004431697356e-06, "loss": 0.463, "num_input_tokens_seen": 93911536, "step": 77410 }, { "epoch": 8.62178416304711, "grad_norm": 0.09768856316804886, "learning_rate": 2.8381511637415064e-06, "loss": 0.4645, "num_input_tokens_seen": 93917392, "step": 77415 }, { "epoch": 8.622341017930728, "grad_norm": 0.10034620761871338, "learning_rate": 2.835902721658948e-06, "loss": 0.4582, "num_input_tokens_seen": 93923504, "step": 77420 }, { "epoch": 8.622897872814345, "grad_norm": 0.09917905181646347, "learning_rate": 2.833655117007006e-06, "loss": 0.4703, "num_input_tokens_seen": 93929584, "step": 77425 }, { "epoch": 8.623454727697961, "grad_norm": 0.13370075821876526, "learning_rate": 2.831408349870618e-06, "loss": 0.4472, "num_input_tokens_seen": 93935792, "step": 77430 }, { "epoch": 8.624011582581579, "grad_norm": 0.11033692210912704, "learning_rate": 2.8291624203346576e-06, "loss": 0.4602, "num_input_tokens_seen": 93941808, "step": 77435 }, { "epoch": 8.624568437465197, "grad_norm": 0.12915688753128052, "learning_rate": 2.826917328483997e-06, "loss": 0.4684, "num_input_tokens_seen": 93948336, "step": 77440 }, { "epoch": 8.625125292348814, "grad_norm": 0.0880519449710846, "learning_rate": 2.82467307440345e-06, "loss": 0.4573, "num_input_tokens_seen": 93954672, "step": 77445 }, { "epoch": 8.625682147232432, "grad_norm": 0.11736299097537994, "learning_rate": 2.822429658177825e-06, "loss": 0.4707, "num_input_tokens_seen": 93960560, "step": 77450 }, { "epoch": 8.626239002116048, "grad_norm": 0.1296231895685196, "learning_rate": 2.8201870798918757e-06, "loss": 0.4625, "num_input_tokens_seen": 93966384, "step": 77455 }, { "epoch": 8.626795856999665, "grad_norm": 0.09508480131626129, "learning_rate": 2.8179453396303344e-06, "loss": 0.4606, "num_input_tokens_seen": 93971696, "step": 77460 }, { "epoch": 8.627352711883283, "grad_norm": 0.1305992156267166, "learning_rate": 2.815704437477909e-06, "loss": 0.451, "num_input_tokens_seen": 93977936, "step": 77465 }, { "epoch": 8.6279095667669, "grad_norm": 0.14947009086608887, "learning_rate": 2.813464373519259e-06, "loss": 0.4555, "num_input_tokens_seen": 93984368, "step": 77470 }, { "epoch": 8.628466421650518, "grad_norm": 0.1157887801527977, "learning_rate": 2.8112251478390307e-06, "loss": 0.4495, "num_input_tokens_seen": 93990352, "step": 77475 }, { "epoch": 8.629023276534134, "grad_norm": 0.0986698791384697, "learning_rate": 2.80898676052182e-06, "loss": 0.4592, "num_input_tokens_seen": 93996464, "step": 77480 }, { "epoch": 8.629580131417752, "grad_norm": 0.10910826176404953, "learning_rate": 2.8067492116522015e-06, "loss": 0.4638, "num_input_tokens_seen": 94002416, "step": 77485 }, { "epoch": 8.63013698630137, "grad_norm": 0.12226651608943939, "learning_rate": 2.8045125013147265e-06, "loss": 0.4726, "num_input_tokens_seen": 94008656, "step": 77490 }, { "epoch": 8.630693841184987, "grad_norm": 0.10711756348609924, "learning_rate": 2.8022766295938917e-06, "loss": 0.4583, "num_input_tokens_seen": 94014800, "step": 77495 }, { "epoch": 8.631250696068605, "grad_norm": 0.1005338653922081, "learning_rate": 2.8000415965741875e-06, "loss": 0.4642, "num_input_tokens_seen": 94020016, "step": 77500 }, { "epoch": 8.631807550952221, "grad_norm": 0.09808379411697388, "learning_rate": 2.7978074023400495e-06, "loss": 0.462, "num_input_tokens_seen": 94026384, "step": 77505 }, { "epoch": 8.632364405835839, "grad_norm": 0.10455575585365295, "learning_rate": 2.7955740469759044e-06, "loss": 0.4638, "num_input_tokens_seen": 94032656, "step": 77510 }, { "epoch": 8.632921260719456, "grad_norm": 0.08816369622945786, "learning_rate": 2.793341530566124e-06, "loss": 0.4542, "num_input_tokens_seen": 94038928, "step": 77515 }, { "epoch": 8.633478115603074, "grad_norm": 0.16986271739006042, "learning_rate": 2.791109853195065e-06, "loss": 0.4481, "num_input_tokens_seen": 94045168, "step": 77520 }, { "epoch": 8.634034970486692, "grad_norm": 0.09140351414680481, "learning_rate": 2.788879014947052e-06, "loss": 0.4716, "num_input_tokens_seen": 94051344, "step": 77525 }, { "epoch": 8.634591825370308, "grad_norm": 0.09173405170440674, "learning_rate": 2.786649015906365e-06, "loss": 0.4588, "num_input_tokens_seen": 94057296, "step": 77530 }, { "epoch": 8.635148680253925, "grad_norm": 0.09796597808599472, "learning_rate": 2.7844198561572695e-06, "loss": 0.4596, "num_input_tokens_seen": 94063248, "step": 77535 }, { "epoch": 8.635705535137543, "grad_norm": 0.111935093998909, "learning_rate": 2.782191535783979e-06, "loss": 0.4665, "num_input_tokens_seen": 94069040, "step": 77540 }, { "epoch": 8.63626239002116, "grad_norm": 0.08621595054864883, "learning_rate": 2.7799640548706985e-06, "loss": 0.4622, "num_input_tokens_seen": 94075056, "step": 77545 }, { "epoch": 8.636819244904778, "grad_norm": 0.1085372120141983, "learning_rate": 2.777737413501577e-06, "loss": 0.4619, "num_input_tokens_seen": 94081232, "step": 77550 }, { "epoch": 8.637376099788396, "grad_norm": 0.1248934268951416, "learning_rate": 2.775511611760753e-06, "loss": 0.4569, "num_input_tokens_seen": 94087536, "step": 77555 }, { "epoch": 8.637932954672012, "grad_norm": 0.12672273814678192, "learning_rate": 2.7732866497323264e-06, "loss": 0.464, "num_input_tokens_seen": 94093488, "step": 77560 }, { "epoch": 8.63848980955563, "grad_norm": 0.11329073458909988, "learning_rate": 2.771062527500354e-06, "loss": 0.4618, "num_input_tokens_seen": 94099664, "step": 77565 }, { "epoch": 8.639046664439247, "grad_norm": 0.09277109056711197, "learning_rate": 2.7688392451488796e-06, "loss": 0.4657, "num_input_tokens_seen": 94105616, "step": 77570 }, { "epoch": 8.639603519322865, "grad_norm": 0.1582212597131729, "learning_rate": 2.766616802761898e-06, "loss": 0.4479, "num_input_tokens_seen": 94111856, "step": 77575 }, { "epoch": 8.640160374206483, "grad_norm": 0.09883667528629303, "learning_rate": 2.7643952004233836e-06, "loss": 0.4607, "num_input_tokens_seen": 94118032, "step": 77580 }, { "epoch": 8.640717229090098, "grad_norm": 0.09803397208452225, "learning_rate": 2.762174438217283e-06, "loss": 0.4481, "num_input_tokens_seen": 94123984, "step": 77585 }, { "epoch": 8.641274083973716, "grad_norm": 0.08503104746341705, "learning_rate": 2.7599545162274893e-06, "loss": 0.4548, "num_input_tokens_seen": 94130480, "step": 77590 }, { "epoch": 8.641830938857334, "grad_norm": 0.08754906058311462, "learning_rate": 2.757735434537889e-06, "loss": 0.4548, "num_input_tokens_seen": 94136848, "step": 77595 }, { "epoch": 8.642387793740951, "grad_norm": 0.0971655547618866, "learning_rate": 2.755517193232321e-06, "loss": 0.4372, "num_input_tokens_seen": 94142576, "step": 77600 }, { "epoch": 8.64294464862457, "grad_norm": 0.14990240335464478, "learning_rate": 2.7532997923946035e-06, "loss": 0.4661, "num_input_tokens_seen": 94149200, "step": 77605 }, { "epoch": 8.643501503508185, "grad_norm": 0.14982615411281586, "learning_rate": 2.751083232108509e-06, "loss": 0.4685, "num_input_tokens_seen": 94155376, "step": 77610 }, { "epoch": 8.644058358391803, "grad_norm": 0.14194707572460175, "learning_rate": 2.7488675124577865e-06, "loss": 0.4588, "num_input_tokens_seen": 94161616, "step": 77615 }, { "epoch": 8.64461521327542, "grad_norm": 0.16081079840660095, "learning_rate": 2.7466526335261645e-06, "loss": 0.4499, "num_input_tokens_seen": 94167152, "step": 77620 }, { "epoch": 8.645172068159038, "grad_norm": 0.10582205653190613, "learning_rate": 2.7444385953973168e-06, "loss": 0.4626, "num_input_tokens_seen": 94173360, "step": 77625 }, { "epoch": 8.645728923042656, "grad_norm": 0.09714190661907196, "learning_rate": 2.7422253981549018e-06, "loss": 0.4537, "num_input_tokens_seen": 94179344, "step": 77630 }, { "epoch": 8.646285777926272, "grad_norm": 0.13781508803367615, "learning_rate": 2.740013041882536e-06, "loss": 0.4562, "num_input_tokens_seen": 94185680, "step": 77635 }, { "epoch": 8.64684263280989, "grad_norm": 0.10118356347084045, "learning_rate": 2.737801526663819e-06, "loss": 0.4566, "num_input_tokens_seen": 94192240, "step": 77640 }, { "epoch": 8.647399487693507, "grad_norm": 0.11103552579879761, "learning_rate": 2.7355908525822983e-06, "loss": 0.4576, "num_input_tokens_seen": 94198320, "step": 77645 }, { "epoch": 8.647956342577125, "grad_norm": 0.1141572818160057, "learning_rate": 2.7333810197215037e-06, "loss": 0.4604, "num_input_tokens_seen": 94204432, "step": 77650 }, { "epoch": 8.648513197460742, "grad_norm": 0.15774783492088318, "learning_rate": 2.7311720281649385e-06, "loss": 0.4547, "num_input_tokens_seen": 94210096, "step": 77655 }, { "epoch": 8.64907005234436, "grad_norm": 0.127130389213562, "learning_rate": 2.728963877996052e-06, "loss": 0.4738, "num_input_tokens_seen": 94216048, "step": 77660 }, { "epoch": 8.649626907227976, "grad_norm": 0.10163179785013199, "learning_rate": 2.7267565692982833e-06, "loss": 0.4739, "num_input_tokens_seen": 94222224, "step": 77665 }, { "epoch": 8.650183762111594, "grad_norm": 0.17352408170700073, "learning_rate": 2.7245501021550267e-06, "loss": 0.4611, "num_input_tokens_seen": 94228720, "step": 77670 }, { "epoch": 8.650740616995211, "grad_norm": 0.10903601348400116, "learning_rate": 2.7223444766496575e-06, "loss": 0.4478, "num_input_tokens_seen": 94234800, "step": 77675 }, { "epoch": 8.651297471878829, "grad_norm": 0.09085942804813385, "learning_rate": 2.7201396928655033e-06, "loss": 0.4644, "num_input_tokens_seen": 94240720, "step": 77680 }, { "epoch": 8.651854326762447, "grad_norm": 0.10887959599494934, "learning_rate": 2.7179357508858666e-06, "loss": 0.4523, "num_input_tokens_seen": 94246896, "step": 77685 }, { "epoch": 8.652411181646062, "grad_norm": 0.10818159580230713, "learning_rate": 2.7157326507940313e-06, "loss": 0.4547, "num_input_tokens_seen": 94252976, "step": 77690 }, { "epoch": 8.65296803652968, "grad_norm": 0.10232627391815186, "learning_rate": 2.713530392673225e-06, "loss": 0.4604, "num_input_tokens_seen": 94259280, "step": 77695 }, { "epoch": 8.653524891413298, "grad_norm": 0.08780410140752792, "learning_rate": 2.711328976606664e-06, "loss": 0.4652, "num_input_tokens_seen": 94265296, "step": 77700 }, { "epoch": 8.654081746296916, "grad_norm": 0.13180279731750488, "learning_rate": 2.7091284026775187e-06, "loss": 0.4534, "num_input_tokens_seen": 94271216, "step": 77705 }, { "epoch": 8.654638601180533, "grad_norm": 0.13452045619487762, "learning_rate": 2.706928670968936e-06, "loss": 0.4541, "num_input_tokens_seen": 94277264, "step": 77710 }, { "epoch": 8.655195456064149, "grad_norm": 0.10389897972345352, "learning_rate": 2.704729781564036e-06, "loss": 0.4606, "num_input_tokens_seen": 94283408, "step": 77715 }, { "epoch": 8.655752310947767, "grad_norm": 0.08425761014223099, "learning_rate": 2.702531734545888e-06, "loss": 0.4633, "num_input_tokens_seen": 94289520, "step": 77720 }, { "epoch": 8.656309165831384, "grad_norm": 0.10907133668661118, "learning_rate": 2.7003345299975503e-06, "loss": 0.4678, "num_input_tokens_seen": 94295664, "step": 77725 }, { "epoch": 8.656866020715002, "grad_norm": 0.09981636703014374, "learning_rate": 2.6981381680020346e-06, "loss": 0.4686, "num_input_tokens_seen": 94301744, "step": 77730 }, { "epoch": 8.65742287559862, "grad_norm": 0.12519507110118866, "learning_rate": 2.69594264864233e-06, "loss": 0.4622, "num_input_tokens_seen": 94307824, "step": 77735 }, { "epoch": 8.657979730482236, "grad_norm": 0.131129190325737, "learning_rate": 2.693747972001387e-06, "loss": 0.4627, "num_input_tokens_seen": 94314128, "step": 77740 }, { "epoch": 8.658536585365853, "grad_norm": 0.11494097113609314, "learning_rate": 2.6915541381621277e-06, "loss": 0.4624, "num_input_tokens_seen": 94320080, "step": 77745 }, { "epoch": 8.659093440249471, "grad_norm": 0.09534449130296707, "learning_rate": 2.6893611472074505e-06, "loss": 0.4645, "num_input_tokens_seen": 94326448, "step": 77750 }, { "epoch": 8.659650295133089, "grad_norm": 0.09284456074237823, "learning_rate": 2.687168999220202e-06, "loss": 0.4454, "num_input_tokens_seen": 94332816, "step": 77755 }, { "epoch": 8.660207150016706, "grad_norm": 0.10751192271709442, "learning_rate": 2.6849776942832193e-06, "loss": 0.4602, "num_input_tokens_seen": 94338576, "step": 77760 }, { "epoch": 8.660764004900322, "grad_norm": 0.1209782063961029, "learning_rate": 2.6827872324792837e-06, "loss": 0.4616, "num_input_tokens_seen": 94344784, "step": 77765 }, { "epoch": 8.66132085978394, "grad_norm": 0.10827060788869858, "learning_rate": 2.6805976138911726e-06, "loss": 0.4666, "num_input_tokens_seen": 94350608, "step": 77770 }, { "epoch": 8.661877714667558, "grad_norm": 0.10434728115797043, "learning_rate": 2.6784088386016036e-06, "loss": 0.475, "num_input_tokens_seen": 94356368, "step": 77775 }, { "epoch": 8.662434569551175, "grad_norm": 0.13042567670345306, "learning_rate": 2.676220906693283e-06, "loss": 0.4677, "num_input_tokens_seen": 94362288, "step": 77780 }, { "epoch": 8.662991424434793, "grad_norm": 0.10565266758203506, "learning_rate": 2.6740338182488805e-06, "loss": 0.4689, "num_input_tokens_seen": 94368304, "step": 77785 }, { "epoch": 8.663548279318409, "grad_norm": 0.10502800345420837, "learning_rate": 2.671847573351022e-06, "loss": 0.4599, "num_input_tokens_seen": 94374416, "step": 77790 }, { "epoch": 8.664105134202027, "grad_norm": 0.15145531296730042, "learning_rate": 2.669662172082324e-06, "loss": 0.4656, "num_input_tokens_seen": 94380752, "step": 77795 }, { "epoch": 8.664661989085644, "grad_norm": 0.1107354685664177, "learning_rate": 2.667477614525343e-06, "loss": 0.4616, "num_input_tokens_seen": 94386800, "step": 77800 }, { "epoch": 8.665218843969262, "grad_norm": 0.1451173722743988, "learning_rate": 2.665293900762625e-06, "loss": 0.4919, "num_input_tokens_seen": 94392976, "step": 77805 }, { "epoch": 8.66577569885288, "grad_norm": 0.11113562434911728, "learning_rate": 2.663111030876686e-06, "loss": 0.4539, "num_input_tokens_seen": 94399344, "step": 77810 }, { "epoch": 8.666332553736495, "grad_norm": 0.11464952677488327, "learning_rate": 2.660929004949986e-06, "loss": 0.4488, "num_input_tokens_seen": 94405296, "step": 77815 }, { "epoch": 8.666889408620113, "grad_norm": 0.15948691964149475, "learning_rate": 2.658747823064986e-06, "loss": 0.4642, "num_input_tokens_seen": 94411760, "step": 77820 }, { "epoch": 8.66744626350373, "grad_norm": 0.14815327525138855, "learning_rate": 2.6565674853040817e-06, "loss": 0.4618, "num_input_tokens_seen": 94417808, "step": 77825 }, { "epoch": 8.668003118387348, "grad_norm": 0.13384151458740234, "learning_rate": 2.6543879917496658e-06, "loss": 0.4582, "num_input_tokens_seen": 94424432, "step": 77830 }, { "epoch": 8.668559973270966, "grad_norm": 0.08968106657266617, "learning_rate": 2.6522093424840827e-06, "loss": 0.4553, "num_input_tokens_seen": 94430352, "step": 77835 }, { "epoch": 8.669116828154582, "grad_norm": 0.092009536921978, "learning_rate": 2.6500315375896425e-06, "loss": 0.4715, "num_input_tokens_seen": 94436304, "step": 77840 }, { "epoch": 8.6696736830382, "grad_norm": 0.13852547109127045, "learning_rate": 2.6478545771486394e-06, "loss": 0.4646, "num_input_tokens_seen": 94442448, "step": 77845 }, { "epoch": 8.670230537921817, "grad_norm": 0.1542920470237732, "learning_rate": 2.645678461243317e-06, "loss": 0.4563, "num_input_tokens_seen": 94448784, "step": 77850 }, { "epoch": 8.670787392805435, "grad_norm": 0.09736865758895874, "learning_rate": 2.6435031899559036e-06, "loss": 0.4526, "num_input_tokens_seen": 94454960, "step": 77855 }, { "epoch": 8.671344247689053, "grad_norm": 0.1270139217376709, "learning_rate": 2.6413287633685807e-06, "loss": 0.4686, "num_input_tokens_seen": 94461040, "step": 77860 }, { "epoch": 8.671901102572669, "grad_norm": 0.10999618470668793, "learning_rate": 2.6391551815635103e-06, "loss": 0.4627, "num_input_tokens_seen": 94466864, "step": 77865 }, { "epoch": 8.672457957456286, "grad_norm": 0.09514224529266357, "learning_rate": 2.636982444622821e-06, "loss": 0.4647, "num_input_tokens_seen": 94473200, "step": 77870 }, { "epoch": 8.673014812339904, "grad_norm": 0.11010927706956863, "learning_rate": 2.6348105526285945e-06, "loss": 0.459, "num_input_tokens_seen": 94479696, "step": 77875 }, { "epoch": 8.673571667223522, "grad_norm": 0.12484607100486755, "learning_rate": 2.632639505662901e-06, "loss": 0.4539, "num_input_tokens_seen": 94485296, "step": 77880 }, { "epoch": 8.67412852210714, "grad_norm": 0.10129625350236893, "learning_rate": 2.6304693038077643e-06, "loss": 0.4652, "num_input_tokens_seen": 94491696, "step": 77885 }, { "epoch": 8.674685376990757, "grad_norm": 0.11450210213661194, "learning_rate": 2.628299947145185e-06, "loss": 0.4621, "num_input_tokens_seen": 94497776, "step": 77890 }, { "epoch": 8.675242231874373, "grad_norm": 0.1030554324388504, "learning_rate": 2.6261314357571313e-06, "loss": 0.4733, "num_input_tokens_seen": 94503760, "step": 77895 }, { "epoch": 8.67579908675799, "grad_norm": 0.13164295256137848, "learning_rate": 2.6239637697255265e-06, "loss": 0.463, "num_input_tokens_seen": 94510224, "step": 77900 }, { "epoch": 8.676355941641608, "grad_norm": 0.10943905264139175, "learning_rate": 2.6217969491322823e-06, "loss": 0.4713, "num_input_tokens_seen": 94516432, "step": 77905 }, { "epoch": 8.676912796525226, "grad_norm": 0.09069371223449707, "learning_rate": 2.6196309740592616e-06, "loss": 0.4502, "num_input_tokens_seen": 94522512, "step": 77910 }, { "epoch": 8.677469651408844, "grad_norm": 0.09909742325544357, "learning_rate": 2.6174658445883073e-06, "loss": 0.4617, "num_input_tokens_seen": 94528624, "step": 77915 }, { "epoch": 8.67802650629246, "grad_norm": 0.1040000319480896, "learning_rate": 2.6153015608012173e-06, "loss": 0.46, "num_input_tokens_seen": 94534704, "step": 77920 }, { "epoch": 8.678583361176077, "grad_norm": 0.14737044274806976, "learning_rate": 2.613138122779771e-06, "loss": 0.4555, "num_input_tokens_seen": 94540720, "step": 77925 }, { "epoch": 8.679140216059695, "grad_norm": 0.13581635057926178, "learning_rate": 2.610975530605711e-06, "loss": 0.4596, "num_input_tokens_seen": 94546928, "step": 77930 }, { "epoch": 8.679697070943313, "grad_norm": 0.09181851893663406, "learning_rate": 2.608813784360739e-06, "loss": 0.4587, "num_input_tokens_seen": 94553264, "step": 77935 }, { "epoch": 8.68025392582693, "grad_norm": 0.10301995277404785, "learning_rate": 2.606652884126545e-06, "loss": 0.4573, "num_input_tokens_seen": 94559632, "step": 77940 }, { "epoch": 8.680810780710546, "grad_norm": 0.12111905962228775, "learning_rate": 2.6044928299847643e-06, "loss": 0.4522, "num_input_tokens_seen": 94566256, "step": 77945 }, { "epoch": 8.681367635594164, "grad_norm": 0.10469361394643784, "learning_rate": 2.6023336220170167e-06, "loss": 0.4481, "num_input_tokens_seen": 94572560, "step": 77950 }, { "epoch": 8.681924490477781, "grad_norm": 0.09784244000911713, "learning_rate": 2.6001752603048795e-06, "loss": 0.4616, "num_input_tokens_seen": 94578480, "step": 77955 }, { "epoch": 8.6824813453614, "grad_norm": 0.1018659770488739, "learning_rate": 2.5980177449299036e-06, "loss": 0.4578, "num_input_tokens_seen": 94584400, "step": 77960 }, { "epoch": 8.683038200245017, "grad_norm": 0.0922347903251648, "learning_rate": 2.595861075973613e-06, "loss": 0.4625, "num_input_tokens_seen": 94590224, "step": 77965 }, { "epoch": 8.683595055128633, "grad_norm": 0.08328047394752502, "learning_rate": 2.593705253517484e-06, "loss": 0.4604, "num_input_tokens_seen": 94596400, "step": 77970 }, { "epoch": 8.68415191001225, "grad_norm": 0.10629314929246902, "learning_rate": 2.591550277642979e-06, "loss": 0.4667, "num_input_tokens_seen": 94602704, "step": 77975 }, { "epoch": 8.684708764895868, "grad_norm": 0.1201617494225502, "learning_rate": 2.589396148431511e-06, "loss": 0.4664, "num_input_tokens_seen": 94608784, "step": 77980 }, { "epoch": 8.685265619779486, "grad_norm": 0.12486832588911057, "learning_rate": 2.587242865964476e-06, "loss": 0.4553, "num_input_tokens_seen": 94614928, "step": 77985 }, { "epoch": 8.685822474663103, "grad_norm": 0.11422515660524368, "learning_rate": 2.5850904303232335e-06, "loss": 0.4514, "num_input_tokens_seen": 94620784, "step": 77990 }, { "epoch": 8.68637932954672, "grad_norm": 0.13555486500263214, "learning_rate": 2.5829388415891026e-06, "loss": 0.4587, "num_input_tokens_seen": 94626576, "step": 77995 }, { "epoch": 8.686936184430337, "grad_norm": 0.11156144738197327, "learning_rate": 2.580788099843387e-06, "loss": 0.475, "num_input_tokens_seen": 94632656, "step": 78000 }, { "epoch": 8.687493039313955, "grad_norm": 0.10605458915233612, "learning_rate": 2.5786382051673387e-06, "loss": 0.4615, "num_input_tokens_seen": 94638320, "step": 78005 }, { "epoch": 8.688049894197572, "grad_norm": 0.09488312900066376, "learning_rate": 2.576489157642195e-06, "loss": 0.4598, "num_input_tokens_seen": 94644496, "step": 78010 }, { "epoch": 8.68860674908119, "grad_norm": 0.10044287145137787, "learning_rate": 2.5743409573491443e-06, "loss": 0.4669, "num_input_tokens_seen": 94650768, "step": 78015 }, { "epoch": 8.689163603964808, "grad_norm": 0.11407814919948578, "learning_rate": 2.572193604369361e-06, "loss": 0.4648, "num_input_tokens_seen": 94656880, "step": 78020 }, { "epoch": 8.689720458848424, "grad_norm": 0.11726781725883484, "learning_rate": 2.5700470987839814e-06, "loss": 0.4655, "num_input_tokens_seen": 94662960, "step": 78025 }, { "epoch": 8.690277313732041, "grad_norm": 0.16118039190769196, "learning_rate": 2.5679014406740973e-06, "loss": 0.4635, "num_input_tokens_seen": 94669008, "step": 78030 }, { "epoch": 8.690834168615659, "grad_norm": 0.1009252592921257, "learning_rate": 2.565756630120791e-06, "loss": 0.464, "num_input_tokens_seen": 94674768, "step": 78035 }, { "epoch": 8.691391023499277, "grad_norm": 0.09791538864374161, "learning_rate": 2.563612667205087e-06, "loss": 0.4588, "num_input_tokens_seen": 94680400, "step": 78040 }, { "epoch": 8.691947878382894, "grad_norm": 0.12190352380275726, "learning_rate": 2.5614695520080003e-06, "loss": 0.4757, "num_input_tokens_seen": 94686480, "step": 78045 }, { "epoch": 8.69250473326651, "grad_norm": 0.08439770340919495, "learning_rate": 2.5593272846104995e-06, "loss": 0.4596, "num_input_tokens_seen": 94692560, "step": 78050 }, { "epoch": 8.693061588150128, "grad_norm": 0.09604959189891815, "learning_rate": 2.5571858650935282e-06, "loss": 0.4571, "num_input_tokens_seen": 94698864, "step": 78055 }, { "epoch": 8.693618443033746, "grad_norm": 0.09425163269042969, "learning_rate": 2.5550452935379997e-06, "loss": 0.4681, "num_input_tokens_seen": 94704816, "step": 78060 }, { "epoch": 8.694175297917363, "grad_norm": 0.10492438822984695, "learning_rate": 2.5529055700247857e-06, "loss": 0.4541, "num_input_tokens_seen": 94710960, "step": 78065 }, { "epoch": 8.69473215280098, "grad_norm": 0.13426540791988373, "learning_rate": 2.5507666946347376e-06, "loss": 0.4751, "num_input_tokens_seen": 94716880, "step": 78070 }, { "epoch": 8.695289007684597, "grad_norm": 0.08754542469978333, "learning_rate": 2.54862866744866e-06, "loss": 0.4647, "num_input_tokens_seen": 94723120, "step": 78075 }, { "epoch": 8.695845862568214, "grad_norm": 0.09227743744850159, "learning_rate": 2.5464914885473477e-06, "loss": 0.4654, "num_input_tokens_seen": 94729104, "step": 78080 }, { "epoch": 8.696402717451832, "grad_norm": 0.09062465280294418, "learning_rate": 2.544355158011538e-06, "loss": 0.4598, "num_input_tokens_seen": 94735760, "step": 78085 }, { "epoch": 8.69695957233545, "grad_norm": 0.12497834116220474, "learning_rate": 2.5422196759219504e-06, "loss": 0.4553, "num_input_tokens_seen": 94741712, "step": 78090 }, { "epoch": 8.697516427219067, "grad_norm": 0.08949447423219681, "learning_rate": 2.5400850423592802e-06, "loss": 0.4526, "num_input_tokens_seen": 94747536, "step": 78095 }, { "epoch": 8.698073282102683, "grad_norm": 0.09168397635221481, "learning_rate": 2.537951257404167e-06, "loss": 0.4537, "num_input_tokens_seen": 94753456, "step": 78100 }, { "epoch": 8.698630136986301, "grad_norm": 0.11195901781320572, "learning_rate": 2.5358183211372454e-06, "loss": 0.4518, "num_input_tokens_seen": 94759984, "step": 78105 }, { "epoch": 8.699186991869919, "grad_norm": 0.16590622067451477, "learning_rate": 2.5336862336390905e-06, "loss": 0.4649, "num_input_tokens_seen": 94766064, "step": 78110 }, { "epoch": 8.699743846753536, "grad_norm": 0.09048032015562057, "learning_rate": 2.531554994990268e-06, "loss": 0.4561, "num_input_tokens_seen": 94771920, "step": 78115 }, { "epoch": 8.700300701637154, "grad_norm": 0.15175685286521912, "learning_rate": 2.529424605271305e-06, "loss": 0.4621, "num_input_tokens_seen": 94778032, "step": 78120 }, { "epoch": 8.70085755652077, "grad_norm": 0.08840557187795639, "learning_rate": 2.5272950645626876e-06, "loss": 0.4563, "num_input_tokens_seen": 94784016, "step": 78125 }, { "epoch": 8.701414411404388, "grad_norm": 0.1416279673576355, "learning_rate": 2.5251663729448843e-06, "loss": 0.4623, "num_input_tokens_seen": 94790096, "step": 78130 }, { "epoch": 8.701971266288005, "grad_norm": 0.09774693101644516, "learning_rate": 2.5230385304983146e-06, "loss": 0.4531, "num_input_tokens_seen": 94796432, "step": 78135 }, { "epoch": 8.702528121171623, "grad_norm": 0.11497395485639572, "learning_rate": 2.5209115373033857e-06, "loss": 0.4541, "num_input_tokens_seen": 94802192, "step": 78140 }, { "epoch": 8.70308497605524, "grad_norm": 0.1319838911294937, "learning_rate": 2.518785393440451e-06, "loss": 0.4435, "num_input_tokens_seen": 94808176, "step": 78145 }, { "epoch": 8.703641830938857, "grad_norm": 0.1267556995153427, "learning_rate": 2.516660098989848e-06, "loss": 0.4556, "num_input_tokens_seen": 94814384, "step": 78150 }, { "epoch": 8.704198685822474, "grad_norm": 0.10686221718788147, "learning_rate": 2.5145356540318825e-06, "loss": 0.4521, "num_input_tokens_seen": 94820272, "step": 78155 }, { "epoch": 8.704755540706092, "grad_norm": 0.10447472333908081, "learning_rate": 2.5124120586468153e-06, "loss": 0.4729, "num_input_tokens_seen": 94826512, "step": 78160 }, { "epoch": 8.70531239558971, "grad_norm": 0.12370629608631134, "learning_rate": 2.51028931291489e-06, "loss": 0.4654, "num_input_tokens_seen": 94832272, "step": 78165 }, { "epoch": 8.705869250473327, "grad_norm": 0.09328662604093552, "learning_rate": 2.5081674169163015e-06, "loss": 0.4766, "num_input_tokens_seen": 94838736, "step": 78170 }, { "epoch": 8.706426105356943, "grad_norm": 0.09075096249580383, "learning_rate": 2.5060463707312297e-06, "loss": 0.4622, "num_input_tokens_seen": 94844272, "step": 78175 }, { "epoch": 8.70698296024056, "grad_norm": 0.284624308347702, "learning_rate": 2.503926174439808e-06, "loss": 0.4618, "num_input_tokens_seen": 94850256, "step": 78180 }, { "epoch": 8.707539815124179, "grad_norm": 0.09825524687767029, "learning_rate": 2.501806828122147e-06, "loss": 0.4613, "num_input_tokens_seen": 94856336, "step": 78185 }, { "epoch": 8.708096670007796, "grad_norm": 0.11828815191984177, "learning_rate": 2.49968833185833e-06, "loss": 0.451, "num_input_tokens_seen": 94862128, "step": 78190 }, { "epoch": 8.708653524891414, "grad_norm": 0.13140814006328583, "learning_rate": 2.4975706857283872e-06, "loss": 0.4666, "num_input_tokens_seen": 94868304, "step": 78195 }, { "epoch": 8.70921037977503, "grad_norm": 0.12115667760372162, "learning_rate": 2.495453889812341e-06, "loss": 0.4604, "num_input_tokens_seen": 94873936, "step": 78200 }, { "epoch": 8.709767234658647, "grad_norm": 0.07954083383083344, "learning_rate": 2.4933379441901605e-06, "loss": 0.4609, "num_input_tokens_seen": 94880080, "step": 78205 }, { "epoch": 8.710324089542265, "grad_norm": 0.09882838279008865, "learning_rate": 2.491222848941799e-06, "loss": 0.4662, "num_input_tokens_seen": 94886256, "step": 78210 }, { "epoch": 8.710880944425883, "grad_norm": 0.16849659383296967, "learning_rate": 2.489108604147178e-06, "loss": 0.4596, "num_input_tokens_seen": 94892432, "step": 78215 }, { "epoch": 8.7114377993095, "grad_norm": 0.09757629781961441, "learning_rate": 2.486995209886167e-06, "loss": 0.4704, "num_input_tokens_seen": 94898352, "step": 78220 }, { "epoch": 8.711994654193116, "grad_norm": 0.12184512615203857, "learning_rate": 2.4848826662386275e-06, "loss": 0.4585, "num_input_tokens_seen": 94904496, "step": 78225 }, { "epoch": 8.712551509076734, "grad_norm": 0.09172716736793518, "learning_rate": 2.482770973284371e-06, "loss": 0.4679, "num_input_tokens_seen": 94910640, "step": 78230 }, { "epoch": 8.713108363960352, "grad_norm": 0.1086716502904892, "learning_rate": 2.4806601311031908e-06, "loss": 0.4634, "num_input_tokens_seen": 94916880, "step": 78235 }, { "epoch": 8.71366521884397, "grad_norm": 0.11886030435562134, "learning_rate": 2.478550139774835e-06, "loss": 0.464, "num_input_tokens_seen": 94922864, "step": 78240 }, { "epoch": 8.714222073727587, "grad_norm": 0.09426870197057724, "learning_rate": 2.4764409993790315e-06, "loss": 0.4641, "num_input_tokens_seen": 94928400, "step": 78245 }, { "epoch": 8.714778928611205, "grad_norm": 0.12207186967134476, "learning_rate": 2.4743327099954667e-06, "loss": 0.4617, "num_input_tokens_seen": 94934960, "step": 78250 }, { "epoch": 8.71533578349482, "grad_norm": 0.09199750423431396, "learning_rate": 2.472225271703796e-06, "loss": 0.4654, "num_input_tokens_seen": 94940880, "step": 78255 }, { "epoch": 8.715892638378438, "grad_norm": 0.09292338788509369, "learning_rate": 2.470118684583647e-06, "loss": 0.4764, "num_input_tokens_seen": 94946448, "step": 78260 }, { "epoch": 8.716449493262056, "grad_norm": 0.12299545854330063, "learning_rate": 2.468012948714621e-06, "loss": 0.4624, "num_input_tokens_seen": 94952240, "step": 78265 }, { "epoch": 8.717006348145674, "grad_norm": 0.0928075760602951, "learning_rate": 2.465908064176267e-06, "loss": 0.4616, "num_input_tokens_seen": 94958384, "step": 78270 }, { "epoch": 8.717563203029291, "grad_norm": 0.10942064225673676, "learning_rate": 2.4638040310481254e-06, "loss": 0.4589, "num_input_tokens_seen": 94964624, "step": 78275 }, { "epoch": 8.718120057912907, "grad_norm": 0.12813681364059448, "learning_rate": 2.4617008494096816e-06, "loss": 0.4593, "num_input_tokens_seen": 94970576, "step": 78280 }, { "epoch": 8.718676912796525, "grad_norm": 0.13476289808750153, "learning_rate": 2.459598519340414e-06, "loss": 0.4639, "num_input_tokens_seen": 94976656, "step": 78285 }, { "epoch": 8.719233767680143, "grad_norm": 0.08766241371631622, "learning_rate": 2.4574970409197457e-06, "loss": 0.4668, "num_input_tokens_seen": 94982800, "step": 78290 }, { "epoch": 8.71979062256376, "grad_norm": 0.079053595662117, "learning_rate": 2.455396414227076e-06, "loss": 0.4632, "num_input_tokens_seen": 94989264, "step": 78295 }, { "epoch": 8.720347477447378, "grad_norm": 0.11674793809652328, "learning_rate": 2.4532966393417846e-06, "loss": 0.4737, "num_input_tokens_seen": 94995280, "step": 78300 }, { "epoch": 8.720904332330994, "grad_norm": 0.10207448154687881, "learning_rate": 2.4511977163431984e-06, "loss": 0.4657, "num_input_tokens_seen": 95001296, "step": 78305 }, { "epoch": 8.721461187214611, "grad_norm": 0.127939373254776, "learning_rate": 2.4490996453106234e-06, "loss": 0.4513, "num_input_tokens_seen": 95007376, "step": 78310 }, { "epoch": 8.72201804209823, "grad_norm": 0.0998634621500969, "learning_rate": 2.4470024263233304e-06, "loss": 0.4564, "num_input_tokens_seen": 95013424, "step": 78315 }, { "epoch": 8.722574896981847, "grad_norm": 0.11929197609424591, "learning_rate": 2.4449060594605637e-06, "loss": 0.4653, "num_input_tokens_seen": 95019664, "step": 78320 }, { "epoch": 8.723131751865465, "grad_norm": 0.10494110733270645, "learning_rate": 2.4428105448015243e-06, "loss": 0.4494, "num_input_tokens_seen": 95025872, "step": 78325 }, { "epoch": 8.72368860674908, "grad_norm": 0.11601968854665756, "learning_rate": 2.440715882425387e-06, "loss": 0.4537, "num_input_tokens_seen": 95032080, "step": 78330 }, { "epoch": 8.724245461632698, "grad_norm": 0.17640309035778046, "learning_rate": 2.4386220724113063e-06, "loss": 0.4659, "num_input_tokens_seen": 95037872, "step": 78335 }, { "epoch": 8.724802316516316, "grad_norm": 0.11012829840183258, "learning_rate": 2.436529114838379e-06, "loss": 0.4606, "num_input_tokens_seen": 95043536, "step": 78340 }, { "epoch": 8.725359171399933, "grad_norm": 0.10870914161205292, "learning_rate": 2.434437009785698e-06, "loss": 0.4519, "num_input_tokens_seen": 95049648, "step": 78345 }, { "epoch": 8.725916026283551, "grad_norm": 0.11312255263328552, "learning_rate": 2.432345757332294e-06, "loss": 0.4751, "num_input_tokens_seen": 95055568, "step": 78350 }, { "epoch": 8.726472881167167, "grad_norm": 0.09121464192867279, "learning_rate": 2.430255357557193e-06, "loss": 0.4607, "num_input_tokens_seen": 95061744, "step": 78355 }, { "epoch": 8.727029736050785, "grad_norm": 0.10260722041130066, "learning_rate": 2.428165810539368e-06, "loss": 0.455, "num_input_tokens_seen": 95067632, "step": 78360 }, { "epoch": 8.727586590934402, "grad_norm": 0.09511714428663254, "learning_rate": 2.4260771163577752e-06, "loss": 0.4675, "num_input_tokens_seen": 95073840, "step": 78365 }, { "epoch": 8.72814344581802, "grad_norm": 0.09456725418567657, "learning_rate": 2.4239892750913345e-06, "loss": 0.4648, "num_input_tokens_seen": 95079888, "step": 78370 }, { "epoch": 8.728700300701638, "grad_norm": 0.1127861812710762, "learning_rate": 2.421902286818922e-06, "loss": 0.458, "num_input_tokens_seen": 95085968, "step": 78375 }, { "epoch": 8.729257155585255, "grad_norm": 0.1231573298573494, "learning_rate": 2.419816151619403e-06, "loss": 0.4615, "num_input_tokens_seen": 95091568, "step": 78380 }, { "epoch": 8.729814010468871, "grad_norm": 0.09756424278020859, "learning_rate": 2.4177308695715857e-06, "loss": 0.4583, "num_input_tokens_seen": 95097136, "step": 78385 }, { "epoch": 8.730370865352489, "grad_norm": 0.09466378390789032, "learning_rate": 2.4156464407542656e-06, "loss": 0.4687, "num_input_tokens_seen": 95103248, "step": 78390 }, { "epoch": 8.730927720236107, "grad_norm": 0.10998553782701492, "learning_rate": 2.4135628652462023e-06, "loss": 0.4597, "num_input_tokens_seen": 95109584, "step": 78395 }, { "epoch": 8.731484575119724, "grad_norm": 0.10255637019872665, "learning_rate": 2.4114801431261103e-06, "loss": 0.4598, "num_input_tokens_seen": 95114768, "step": 78400 }, { "epoch": 8.732041430003342, "grad_norm": 0.07638897746801376, "learning_rate": 2.4093982744726906e-06, "loss": 0.4587, "num_input_tokens_seen": 95121136, "step": 78405 }, { "epoch": 8.732598284886958, "grad_norm": 0.14420650899410248, "learning_rate": 2.407317259364597e-06, "loss": 0.457, "num_input_tokens_seen": 95127600, "step": 78410 }, { "epoch": 8.733155139770576, "grad_norm": 0.07485488802194595, "learning_rate": 2.405237097880461e-06, "loss": 0.4633, "num_input_tokens_seen": 95133936, "step": 78415 }, { "epoch": 8.733711994654193, "grad_norm": 0.12193380296230316, "learning_rate": 2.4031577900988726e-06, "loss": 0.465, "num_input_tokens_seen": 95140112, "step": 78420 }, { "epoch": 8.734268849537811, "grad_norm": 0.09795711189508438, "learning_rate": 2.4010793360983996e-06, "loss": 0.4634, "num_input_tokens_seen": 95146288, "step": 78425 }, { "epoch": 8.734825704421429, "grad_norm": 0.1368473321199417, "learning_rate": 2.3990017359575736e-06, "loss": 0.4638, "num_input_tokens_seen": 95152112, "step": 78430 }, { "epoch": 8.735382559305044, "grad_norm": 0.08417969197034836, "learning_rate": 2.396924989754887e-06, "loss": 0.4495, "num_input_tokens_seen": 95158032, "step": 78435 }, { "epoch": 8.735939414188662, "grad_norm": 0.14311257004737854, "learning_rate": 2.394849097568813e-06, "loss": 0.4635, "num_input_tokens_seen": 95164016, "step": 78440 }, { "epoch": 8.73649626907228, "grad_norm": 0.10781165212392807, "learning_rate": 2.3927740594777785e-06, "loss": 0.462, "num_input_tokens_seen": 95170128, "step": 78445 }, { "epoch": 8.737053123955898, "grad_norm": 0.08771169185638428, "learning_rate": 2.3906998755601928e-06, "loss": 0.4581, "num_input_tokens_seen": 95176208, "step": 78450 }, { "epoch": 8.737609978839515, "grad_norm": 0.10859241336584091, "learning_rate": 2.388626545894415e-06, "loss": 0.4531, "num_input_tokens_seen": 95182352, "step": 78455 }, { "epoch": 8.738166833723131, "grad_norm": 0.15328535437583923, "learning_rate": 2.3865540705587887e-06, "loss": 0.4674, "num_input_tokens_seen": 95188272, "step": 78460 }, { "epoch": 8.738723688606749, "grad_norm": 0.09226340800523758, "learning_rate": 2.3844824496316252e-06, "loss": 0.4532, "num_input_tokens_seen": 95194320, "step": 78465 }, { "epoch": 8.739280543490366, "grad_norm": 0.1221807524561882, "learning_rate": 2.3824116831911825e-06, "loss": 0.4742, "num_input_tokens_seen": 95200624, "step": 78470 }, { "epoch": 8.739837398373984, "grad_norm": 0.14474250376224518, "learning_rate": 2.380341771315711e-06, "loss": 0.4557, "num_input_tokens_seen": 95206992, "step": 78475 }, { "epoch": 8.740394253257602, "grad_norm": 0.12905128300189972, "learning_rate": 2.3782727140834125e-06, "loss": 0.4515, "num_input_tokens_seen": 95213072, "step": 78480 }, { "epoch": 8.740951108141218, "grad_norm": 0.10807424783706665, "learning_rate": 2.376204511572466e-06, "loss": 0.4529, "num_input_tokens_seen": 95219248, "step": 78485 }, { "epoch": 8.741507963024835, "grad_norm": 0.10391826182603836, "learning_rate": 2.3741371638610205e-06, "loss": 0.4572, "num_input_tokens_seen": 95225136, "step": 78490 }, { "epoch": 8.742064817908453, "grad_norm": 0.08088001608848572, "learning_rate": 2.3720706710271768e-06, "loss": 0.4587, "num_input_tokens_seen": 95230768, "step": 78495 }, { "epoch": 8.74262167279207, "grad_norm": 0.11421288549900055, "learning_rate": 2.37000503314902e-06, "loss": 0.4528, "num_input_tokens_seen": 95236720, "step": 78500 }, { "epoch": 8.743178527675688, "grad_norm": 0.16177032887935638, "learning_rate": 2.36794025030459e-06, "loss": 0.4567, "num_input_tokens_seen": 95242832, "step": 78505 }, { "epoch": 8.743735382559304, "grad_norm": 0.11944200098514557, "learning_rate": 2.365876322571911e-06, "loss": 0.4555, "num_input_tokens_seen": 95249456, "step": 78510 }, { "epoch": 8.744292237442922, "grad_norm": 0.12600836157798767, "learning_rate": 2.363813250028954e-06, "loss": 0.4568, "num_input_tokens_seen": 95255504, "step": 78515 }, { "epoch": 8.74484909232654, "grad_norm": 0.1436765193939209, "learning_rate": 2.3617510327536757e-06, "loss": 0.4691, "num_input_tokens_seen": 95261136, "step": 78520 }, { "epoch": 8.745405947210157, "grad_norm": 0.15718989074230194, "learning_rate": 2.359689670823992e-06, "loss": 0.4653, "num_input_tokens_seen": 95267376, "step": 78525 }, { "epoch": 8.745962802093775, "grad_norm": 0.08224520087242126, "learning_rate": 2.3576291643177843e-06, "loss": 0.4596, "num_input_tokens_seen": 95273840, "step": 78530 }, { "epoch": 8.74651965697739, "grad_norm": 0.11197832226753235, "learning_rate": 2.3555695133129137e-06, "loss": 0.47, "num_input_tokens_seen": 95279760, "step": 78535 }, { "epoch": 8.747076511861009, "grad_norm": 0.18670758605003357, "learning_rate": 2.353510717887189e-06, "loss": 0.4577, "num_input_tokens_seen": 95286192, "step": 78540 }, { "epoch": 8.747633366744626, "grad_norm": 0.1359318047761917, "learning_rate": 2.351452778118407e-06, "loss": 0.4637, "num_input_tokens_seen": 95292240, "step": 78545 }, { "epoch": 8.748190221628244, "grad_norm": 0.09087339043617249, "learning_rate": 2.3493956940843164e-06, "loss": 0.4432, "num_input_tokens_seen": 95298288, "step": 78550 }, { "epoch": 8.748747076511862, "grad_norm": 0.10323118418455124, "learning_rate": 2.347339465862644e-06, "loss": 0.465, "num_input_tokens_seen": 95304112, "step": 78555 }, { "epoch": 8.749303931395477, "grad_norm": 0.1016940101981163, "learning_rate": 2.345284093531086e-06, "loss": 0.4474, "num_input_tokens_seen": 95309904, "step": 78560 }, { "epoch": 8.749860786279095, "grad_norm": 0.07641725987195969, "learning_rate": 2.3432295771672885e-06, "loss": 0.4655, "num_input_tokens_seen": 95316272, "step": 78565 }, { "epoch": 8.750417641162713, "grad_norm": 0.09044504910707474, "learning_rate": 2.341175916848892e-06, "loss": 0.4641, "num_input_tokens_seen": 95322384, "step": 78570 }, { "epoch": 8.75097449604633, "grad_norm": 0.15211090445518494, "learning_rate": 2.3391231126534768e-06, "loss": 0.4632, "num_input_tokens_seen": 95328464, "step": 78575 }, { "epoch": 8.751531350929948, "grad_norm": 0.12425006926059723, "learning_rate": 2.3370711646586164e-06, "loss": 0.4772, "num_input_tokens_seen": 95334800, "step": 78580 }, { "epoch": 8.752088205813564, "grad_norm": 0.13929322361946106, "learning_rate": 2.335020072941829e-06, "loss": 0.4654, "num_input_tokens_seen": 95340624, "step": 78585 }, { "epoch": 8.752645060697182, "grad_norm": 0.1064973697066307, "learning_rate": 2.3329698375806175e-06, "loss": 0.455, "num_input_tokens_seen": 95346736, "step": 78590 }, { "epoch": 8.7532019155808, "grad_norm": 0.11695971339941025, "learning_rate": 2.33092045865245e-06, "loss": 0.4657, "num_input_tokens_seen": 95352144, "step": 78595 }, { "epoch": 8.753758770464417, "grad_norm": 0.15504662692546844, "learning_rate": 2.3288719362347472e-06, "loss": 0.4604, "num_input_tokens_seen": 95358512, "step": 78600 }, { "epoch": 8.754315625348035, "grad_norm": 0.17589938640594482, "learning_rate": 2.3268242704049233e-06, "loss": 0.4505, "num_input_tokens_seen": 95364752, "step": 78605 }, { "epoch": 8.754872480231652, "grad_norm": 0.09235168248414993, "learning_rate": 2.324777461240332e-06, "loss": 0.4555, "num_input_tokens_seen": 95370832, "step": 78610 }, { "epoch": 8.755429335115268, "grad_norm": 0.10226382315158844, "learning_rate": 2.3227315088183153e-06, "loss": 0.4695, "num_input_tokens_seen": 95376912, "step": 78615 }, { "epoch": 8.755986189998886, "grad_norm": 0.10638800263404846, "learning_rate": 2.3206864132161804e-06, "loss": 0.4735, "num_input_tokens_seen": 95383376, "step": 78620 }, { "epoch": 8.756543044882504, "grad_norm": 0.1063905656337738, "learning_rate": 2.3186421745111874e-06, "loss": 0.4631, "num_input_tokens_seen": 95389424, "step": 78625 }, { "epoch": 8.757099899766121, "grad_norm": 0.0936877653002739, "learning_rate": 2.316598792780583e-06, "loss": 0.4587, "num_input_tokens_seen": 95395536, "step": 78630 }, { "epoch": 8.757656754649739, "grad_norm": 0.09136076271533966, "learning_rate": 2.3145562681015643e-06, "loss": 0.4668, "num_input_tokens_seen": 95401392, "step": 78635 }, { "epoch": 8.758213609533355, "grad_norm": 0.14779132604599, "learning_rate": 2.3125146005513133e-06, "loss": 0.4615, "num_input_tokens_seen": 95407408, "step": 78640 }, { "epoch": 8.758770464416973, "grad_norm": 0.09766719490289688, "learning_rate": 2.310473790206963e-06, "loss": 0.4507, "num_input_tokens_seen": 95413520, "step": 78645 }, { "epoch": 8.75932731930059, "grad_norm": 0.08112584054470062, "learning_rate": 2.3084338371456236e-06, "loss": 0.4594, "num_input_tokens_seen": 95419728, "step": 78650 }, { "epoch": 8.759884174184208, "grad_norm": 0.10704174637794495, "learning_rate": 2.3063947414443814e-06, "loss": 0.4675, "num_input_tokens_seen": 95425904, "step": 78655 }, { "epoch": 8.760441029067826, "grad_norm": 0.1305956393480301, "learning_rate": 2.3043565031802632e-06, "loss": 0.4539, "num_input_tokens_seen": 95432016, "step": 78660 }, { "epoch": 8.760997883951442, "grad_norm": 0.13704389333724976, "learning_rate": 2.302319122430285e-06, "loss": 0.4619, "num_input_tokens_seen": 95438064, "step": 78665 }, { "epoch": 8.76155473883506, "grad_norm": 0.10915592312812805, "learning_rate": 2.3002825992714363e-06, "loss": 0.471, "num_input_tokens_seen": 95444208, "step": 78670 }, { "epoch": 8.762111593718677, "grad_norm": 0.1837420016527176, "learning_rate": 2.298246933780651e-06, "loss": 0.4702, "num_input_tokens_seen": 95450736, "step": 78675 }, { "epoch": 8.762668448602295, "grad_norm": 0.13621363043785095, "learning_rate": 2.2962121260348523e-06, "loss": 0.4525, "num_input_tokens_seen": 95456912, "step": 78680 }, { "epoch": 8.763225303485912, "grad_norm": 0.1044883280992508, "learning_rate": 2.2941781761109114e-06, "loss": 0.4666, "num_input_tokens_seen": 95462768, "step": 78685 }, { "epoch": 8.763782158369528, "grad_norm": 0.09594382345676422, "learning_rate": 2.2921450840856893e-06, "loss": 0.4573, "num_input_tokens_seen": 95468848, "step": 78690 }, { "epoch": 8.764339013253146, "grad_norm": 0.0875532329082489, "learning_rate": 2.2901128500359944e-06, "loss": 0.4522, "num_input_tokens_seen": 95474992, "step": 78695 }, { "epoch": 8.764895868136763, "grad_norm": 0.10174009948968887, "learning_rate": 2.2880814740386087e-06, "loss": 0.4678, "num_input_tokens_seen": 95481232, "step": 78700 }, { "epoch": 8.765452723020381, "grad_norm": 0.1173274889588356, "learning_rate": 2.2860509561702964e-06, "loss": 0.4699, "num_input_tokens_seen": 95487312, "step": 78705 }, { "epoch": 8.766009577903999, "grad_norm": 0.1260693073272705, "learning_rate": 2.2840212965077656e-06, "loss": 0.4586, "num_input_tokens_seen": 95493456, "step": 78710 }, { "epoch": 8.766566432787615, "grad_norm": 0.096210777759552, "learning_rate": 2.2819924951277105e-06, "loss": 0.4623, "num_input_tokens_seen": 95499504, "step": 78715 }, { "epoch": 8.767123287671232, "grad_norm": 0.1553569734096527, "learning_rate": 2.279964552106775e-06, "loss": 0.4628, "num_input_tokens_seen": 95505776, "step": 78720 }, { "epoch": 8.76768014255485, "grad_norm": 0.12332756072282791, "learning_rate": 2.277937467521596e-06, "loss": 0.4682, "num_input_tokens_seen": 95511568, "step": 78725 }, { "epoch": 8.768236997438468, "grad_norm": 0.12704551219940186, "learning_rate": 2.2759112414487527e-06, "loss": 0.4562, "num_input_tokens_seen": 95517936, "step": 78730 }, { "epoch": 8.768793852322085, "grad_norm": 0.10154365748167038, "learning_rate": 2.273885873964804e-06, "loss": 0.4597, "num_input_tokens_seen": 95523984, "step": 78735 }, { "epoch": 8.769350707205703, "grad_norm": 0.11405475437641144, "learning_rate": 2.27186136514628e-06, "loss": 0.4593, "num_input_tokens_seen": 95530032, "step": 78740 }, { "epoch": 8.769907562089319, "grad_norm": 0.1580527275800705, "learning_rate": 2.269837715069667e-06, "loss": 0.4751, "num_input_tokens_seen": 95536336, "step": 78745 }, { "epoch": 8.770464416972937, "grad_norm": 0.09785483032464981, "learning_rate": 2.2678149238114315e-06, "loss": 0.4566, "num_input_tokens_seen": 95542384, "step": 78750 }, { "epoch": 8.771021271856554, "grad_norm": 0.10471654683351517, "learning_rate": 2.2657929914479903e-06, "loss": 0.4582, "num_input_tokens_seen": 95548176, "step": 78755 }, { "epoch": 8.771578126740172, "grad_norm": 0.09213428944349289, "learning_rate": 2.2637719180557486e-06, "loss": 0.4536, "num_input_tokens_seen": 95554096, "step": 78760 }, { "epoch": 8.77213498162379, "grad_norm": 0.11460497975349426, "learning_rate": 2.261751703711068e-06, "loss": 0.462, "num_input_tokens_seen": 95560176, "step": 78765 }, { "epoch": 8.772691836507406, "grad_norm": 0.08930500596761703, "learning_rate": 2.2597323484902734e-06, "loss": 0.4579, "num_input_tokens_seen": 95566256, "step": 78770 }, { "epoch": 8.773248691391023, "grad_norm": 0.12573878467082977, "learning_rate": 2.2577138524696673e-06, "loss": 0.4453, "num_input_tokens_seen": 95572336, "step": 78775 }, { "epoch": 8.773805546274641, "grad_norm": 0.0900510624051094, "learning_rate": 2.2556962157255114e-06, "loss": 0.481, "num_input_tokens_seen": 95578672, "step": 78780 }, { "epoch": 8.774362401158259, "grad_norm": 0.12832704186439514, "learning_rate": 2.2536794383340417e-06, "loss": 0.4685, "num_input_tokens_seen": 95584688, "step": 78785 }, { "epoch": 8.774919256041876, "grad_norm": 0.10601649433374405, "learning_rate": 2.2516635203714555e-06, "loss": 0.4523, "num_input_tokens_seen": 95591056, "step": 78790 }, { "epoch": 8.775476110925492, "grad_norm": 0.08437845855951309, "learning_rate": 2.2496484619139197e-06, "loss": 0.4477, "num_input_tokens_seen": 95597008, "step": 78795 }, { "epoch": 8.77603296580911, "grad_norm": 0.09296252578496933, "learning_rate": 2.2476342630375792e-06, "loss": 0.4484, "num_input_tokens_seen": 95602896, "step": 78800 }, { "epoch": 8.776589820692728, "grad_norm": 0.13100914657115936, "learning_rate": 2.2456209238185226e-06, "loss": 0.4587, "num_input_tokens_seen": 95609104, "step": 78805 }, { "epoch": 8.777146675576345, "grad_norm": 0.12101034075021744, "learning_rate": 2.243608444332834e-06, "loss": 0.4472, "num_input_tokens_seen": 95615184, "step": 78810 }, { "epoch": 8.777703530459963, "grad_norm": 0.11828291416168213, "learning_rate": 2.241596824656539e-06, "loss": 0.4709, "num_input_tokens_seen": 95621712, "step": 78815 }, { "epoch": 8.778260385343579, "grad_norm": 0.10567869246006012, "learning_rate": 2.2395860648656537e-06, "loss": 0.4614, "num_input_tokens_seen": 95627792, "step": 78820 }, { "epoch": 8.778817240227196, "grad_norm": 0.12707440555095673, "learning_rate": 2.2375761650361427e-06, "loss": 0.4531, "num_input_tokens_seen": 95633648, "step": 78825 }, { "epoch": 8.779374095110814, "grad_norm": 0.11732454597949982, "learning_rate": 2.235567125243948e-06, "loss": 0.4695, "num_input_tokens_seen": 95639408, "step": 78830 }, { "epoch": 8.779930949994432, "grad_norm": 0.1329185962677002, "learning_rate": 2.2335589455649842e-06, "loss": 0.4564, "num_input_tokens_seen": 95646032, "step": 78835 }, { "epoch": 8.78048780487805, "grad_norm": 0.10807152837514877, "learning_rate": 2.231551626075121e-06, "loss": 0.4567, "num_input_tokens_seen": 95652144, "step": 78840 }, { "epoch": 8.781044659761665, "grad_norm": 0.17890922725200653, "learning_rate": 2.2295451668502027e-06, "loss": 0.4499, "num_input_tokens_seen": 95658320, "step": 78845 }, { "epoch": 8.781601514645283, "grad_norm": 0.06662021577358246, "learning_rate": 2.227539567966036e-06, "loss": 0.462, "num_input_tokens_seen": 95664464, "step": 78850 }, { "epoch": 8.7821583695289, "grad_norm": 0.11396534740924835, "learning_rate": 2.2255348294984075e-06, "loss": 0.4553, "num_input_tokens_seen": 95670512, "step": 78855 }, { "epoch": 8.782715224412518, "grad_norm": 0.12305890023708344, "learning_rate": 2.2235309515230536e-06, "loss": 0.4571, "num_input_tokens_seen": 95676592, "step": 78860 }, { "epoch": 8.783272079296136, "grad_norm": 0.10390837490558624, "learning_rate": 2.2215279341156913e-06, "loss": 0.4501, "num_input_tokens_seen": 95682704, "step": 78865 }, { "epoch": 8.783828934179752, "grad_norm": 0.08799715340137482, "learning_rate": 2.219525777352005e-06, "loss": 0.4628, "num_input_tokens_seen": 95688944, "step": 78870 }, { "epoch": 8.78438578906337, "grad_norm": 0.11482252180576324, "learning_rate": 2.217524481307634e-06, "loss": 0.4604, "num_input_tokens_seen": 95695216, "step": 78875 }, { "epoch": 8.784942643946987, "grad_norm": 0.11282277852296829, "learning_rate": 2.215524046058201e-06, "loss": 0.4431, "num_input_tokens_seen": 95701712, "step": 78880 }, { "epoch": 8.785499498830605, "grad_norm": 0.13435760140419006, "learning_rate": 2.213524471679282e-06, "loss": 0.4532, "num_input_tokens_seen": 95707632, "step": 78885 }, { "epoch": 8.786056353714223, "grad_norm": 0.21989552676677704, "learning_rate": 2.2115257582464328e-06, "loss": 0.4706, "num_input_tokens_seen": 95713904, "step": 78890 }, { "epoch": 8.786613208597839, "grad_norm": 0.12204210460186005, "learning_rate": 2.2095279058351737e-06, "loss": 0.4697, "num_input_tokens_seen": 95720112, "step": 78895 }, { "epoch": 8.787170063481456, "grad_norm": 0.0853935107588768, "learning_rate": 2.2075309145209805e-06, "loss": 0.4667, "num_input_tokens_seen": 95726352, "step": 78900 }, { "epoch": 8.787726918365074, "grad_norm": 0.1040707379579544, "learning_rate": 2.2055347843793177e-06, "loss": 0.4636, "num_input_tokens_seen": 95732240, "step": 78905 }, { "epoch": 8.788283773248692, "grad_norm": 0.09942491352558136, "learning_rate": 2.203539515485592e-06, "loss": 0.4592, "num_input_tokens_seen": 95738352, "step": 78910 }, { "epoch": 8.78884062813231, "grad_norm": 0.13196241855621338, "learning_rate": 2.2015451079152067e-06, "loss": 0.452, "num_input_tokens_seen": 95744496, "step": 78915 }, { "epoch": 8.789397483015925, "grad_norm": 0.11683697998523712, "learning_rate": 2.1995515617435014e-06, "loss": 0.4479, "num_input_tokens_seen": 95750704, "step": 78920 }, { "epoch": 8.789954337899543, "grad_norm": 0.12675800919532776, "learning_rate": 2.197558877045805e-06, "loss": 0.4596, "num_input_tokens_seen": 95756944, "step": 78925 }, { "epoch": 8.79051119278316, "grad_norm": 0.11557821929454803, "learning_rate": 2.195567053897413e-06, "loss": 0.4889, "num_input_tokens_seen": 95762896, "step": 78930 }, { "epoch": 8.791068047666778, "grad_norm": 0.09358013421297073, "learning_rate": 2.1935760923735756e-06, "loss": 0.4564, "num_input_tokens_seen": 95768560, "step": 78935 }, { "epoch": 8.791624902550396, "grad_norm": 0.09021871536970139, "learning_rate": 2.191585992549522e-06, "loss": 0.4579, "num_input_tokens_seen": 95774928, "step": 78940 }, { "epoch": 8.792181757434012, "grad_norm": 0.15897129476070404, "learning_rate": 2.1895967545004394e-06, "loss": 0.458, "num_input_tokens_seen": 95781424, "step": 78945 }, { "epoch": 8.79273861231763, "grad_norm": 0.12954680621623993, "learning_rate": 2.1876083783014977e-06, "loss": 0.4596, "num_input_tokens_seen": 95787152, "step": 78950 }, { "epoch": 8.793295467201247, "grad_norm": 0.13383017480373383, "learning_rate": 2.1856208640278095e-06, "loss": 0.4628, "num_input_tokens_seen": 95793104, "step": 78955 }, { "epoch": 8.793852322084865, "grad_norm": 0.1155475601553917, "learning_rate": 2.1836342117544788e-06, "loss": 0.467, "num_input_tokens_seen": 95798992, "step": 78960 }, { "epoch": 8.794409176968482, "grad_norm": 0.09584950655698776, "learning_rate": 2.1816484215565703e-06, "loss": 0.4509, "num_input_tokens_seen": 95805008, "step": 78965 }, { "epoch": 8.7949660318521, "grad_norm": 0.08787830919027328, "learning_rate": 2.179663493509104e-06, "loss": 0.4683, "num_input_tokens_seen": 95811440, "step": 78970 }, { "epoch": 8.795522886735716, "grad_norm": 0.14217166602611542, "learning_rate": 2.1776794276870906e-06, "loss": 0.4584, "num_input_tokens_seen": 95817360, "step": 78975 }, { "epoch": 8.796079741619334, "grad_norm": 0.0750168040394783, "learning_rate": 2.1756962241654773e-06, "loss": 0.4578, "num_input_tokens_seen": 95822800, "step": 78980 }, { "epoch": 8.796636596502951, "grad_norm": 0.09811762720346451, "learning_rate": 2.173713883019207e-06, "loss": 0.4528, "num_input_tokens_seen": 95828880, "step": 78985 }, { "epoch": 8.797193451386569, "grad_norm": 0.13274650275707245, "learning_rate": 2.1717324043231817e-06, "loss": 0.4598, "num_input_tokens_seen": 95834992, "step": 78990 }, { "epoch": 8.797750306270187, "grad_norm": 0.10929373651742935, "learning_rate": 2.169751788152255e-06, "loss": 0.4618, "num_input_tokens_seen": 95840976, "step": 78995 }, { "epoch": 8.798307161153803, "grad_norm": 0.09119562059640884, "learning_rate": 2.167772034581278e-06, "loss": 0.4555, "num_input_tokens_seen": 95847184, "step": 79000 }, { "epoch": 8.79886401603742, "grad_norm": 0.09455040097236633, "learning_rate": 2.1657931436850352e-06, "loss": 0.4581, "num_input_tokens_seen": 95853456, "step": 79005 }, { "epoch": 8.799420870921038, "grad_norm": 0.10954605042934418, "learning_rate": 2.163815115538309e-06, "loss": 0.4538, "num_input_tokens_seen": 95859664, "step": 79010 }, { "epoch": 8.799977725804656, "grad_norm": 0.11007585376501083, "learning_rate": 2.1618379502158253e-06, "loss": 0.465, "num_input_tokens_seen": 95865872, "step": 79015 }, { "epoch": 8.800534580688273, "grad_norm": 0.11076510697603226, "learning_rate": 2.159861647792291e-06, "loss": 0.4674, "num_input_tokens_seen": 95872080, "step": 79020 }, { "epoch": 8.80109143557189, "grad_norm": 0.10913246124982834, "learning_rate": 2.1578862083423856e-06, "loss": 0.476, "num_input_tokens_seen": 95877616, "step": 79025 }, { "epoch": 8.801648290455507, "grad_norm": 0.10294192284345627, "learning_rate": 2.1559116319407325e-06, "loss": 0.4766, "num_input_tokens_seen": 95883312, "step": 79030 }, { "epoch": 8.802205145339125, "grad_norm": 0.11987944692373276, "learning_rate": 2.1539379186619524e-06, "loss": 0.4693, "num_input_tokens_seen": 95889552, "step": 79035 }, { "epoch": 8.802762000222742, "grad_norm": 0.08767068386077881, "learning_rate": 2.1519650685806053e-06, "loss": 0.4691, "num_input_tokens_seen": 95895728, "step": 79040 }, { "epoch": 8.80331885510636, "grad_norm": 0.09883327782154083, "learning_rate": 2.1499930817712455e-06, "loss": 0.4603, "num_input_tokens_seen": 95902064, "step": 79045 }, { "epoch": 8.803875709989976, "grad_norm": 0.13906370103359222, "learning_rate": 2.148021958308366e-06, "loss": 0.4637, "num_input_tokens_seen": 95907888, "step": 79050 }, { "epoch": 8.804432564873593, "grad_norm": 0.10621996968984604, "learning_rate": 2.146051698266452e-06, "loss": 0.4669, "num_input_tokens_seen": 95914096, "step": 79055 }, { "epoch": 8.804989419757211, "grad_norm": 0.128740131855011, "learning_rate": 2.1440823017199462e-06, "loss": 0.4424, "num_input_tokens_seen": 95920368, "step": 79060 }, { "epoch": 8.805546274640829, "grad_norm": 0.0942496657371521, "learning_rate": 2.1421137687432563e-06, "loss": 0.4582, "num_input_tokens_seen": 95925776, "step": 79065 }, { "epoch": 8.806103129524447, "grad_norm": 0.10483812540769577, "learning_rate": 2.1401460994107586e-06, "loss": 0.4619, "num_input_tokens_seen": 95932336, "step": 79070 }, { "epoch": 8.806659984408064, "grad_norm": 0.07533347606658936, "learning_rate": 2.138179293796802e-06, "loss": 0.4628, "num_input_tokens_seen": 95938672, "step": 79075 }, { "epoch": 8.80721683929168, "grad_norm": 0.13200940191745758, "learning_rate": 2.136213351975691e-06, "loss": 0.4583, "num_input_tokens_seen": 95944528, "step": 79080 }, { "epoch": 8.807773694175298, "grad_norm": 0.11863573640584946, "learning_rate": 2.1342482740217135e-06, "loss": 0.4508, "num_input_tokens_seen": 95950640, "step": 79085 }, { "epoch": 8.808330549058915, "grad_norm": 0.10447081178426743, "learning_rate": 2.13228406000911e-06, "loss": 0.4548, "num_input_tokens_seen": 95956816, "step": 79090 }, { "epoch": 8.808887403942533, "grad_norm": 0.17489752173423767, "learning_rate": 2.1303207100121042e-06, "loss": 0.471, "num_input_tokens_seen": 95962896, "step": 79095 }, { "epoch": 8.80944425882615, "grad_norm": 0.11590185016393661, "learning_rate": 2.128358224104865e-06, "loss": 0.4627, "num_input_tokens_seen": 95969008, "step": 79100 }, { "epoch": 8.810001113709767, "grad_norm": 0.10902789235115051, "learning_rate": 2.1263966023615465e-06, "loss": 0.4619, "num_input_tokens_seen": 95974704, "step": 79105 }, { "epoch": 8.810557968593384, "grad_norm": 0.09114402532577515, "learning_rate": 2.1244358448562736e-06, "loss": 0.4602, "num_input_tokens_seen": 95980752, "step": 79110 }, { "epoch": 8.811114823477002, "grad_norm": 0.13249826431274414, "learning_rate": 2.1224759516631164e-06, "loss": 0.4629, "num_input_tokens_seen": 95986640, "step": 79115 }, { "epoch": 8.81167167836062, "grad_norm": 0.11242780834436417, "learning_rate": 2.120516922856139e-06, "loss": 0.4692, "num_input_tokens_seen": 95992752, "step": 79120 }, { "epoch": 8.812228533244237, "grad_norm": 0.13898056745529175, "learning_rate": 2.118558758509345e-06, "loss": 0.4518, "num_input_tokens_seen": 95998896, "step": 79125 }, { "epoch": 8.812785388127853, "grad_norm": 0.11825308203697205, "learning_rate": 2.1166014586967348e-06, "loss": 0.4521, "num_input_tokens_seen": 96005264, "step": 79130 }, { "epoch": 8.813342243011471, "grad_norm": 0.10140542685985565, "learning_rate": 2.114645023492251e-06, "loss": 0.4622, "num_input_tokens_seen": 96011472, "step": 79135 }, { "epoch": 8.813899097895089, "grad_norm": 0.10399671643972397, "learning_rate": 2.1126894529698184e-06, "loss": 0.456, "num_input_tokens_seen": 96017072, "step": 79140 }, { "epoch": 8.814455952778706, "grad_norm": 0.1318366527557373, "learning_rate": 2.110734747203327e-06, "loss": 0.4627, "num_input_tokens_seen": 96023088, "step": 79145 }, { "epoch": 8.815012807662324, "grad_norm": 0.1121983751654625, "learning_rate": 2.1087809062666274e-06, "loss": 0.4511, "num_input_tokens_seen": 96029264, "step": 79150 }, { "epoch": 8.81556966254594, "grad_norm": 0.11973512172698975, "learning_rate": 2.106827930233546e-06, "loss": 0.4699, "num_input_tokens_seen": 96034992, "step": 79155 }, { "epoch": 8.816126517429558, "grad_norm": 0.11097149550914764, "learning_rate": 2.104875819177865e-06, "loss": 0.4612, "num_input_tokens_seen": 96041392, "step": 79160 }, { "epoch": 8.816683372313175, "grad_norm": 0.12509208917617798, "learning_rate": 2.102924573173348e-06, "loss": 0.4664, "num_input_tokens_seen": 96047632, "step": 79165 }, { "epoch": 8.817240227196793, "grad_norm": 0.13409632444381714, "learning_rate": 2.1009741922937225e-06, "loss": 0.4544, "num_input_tokens_seen": 96053936, "step": 79170 }, { "epoch": 8.81779708208041, "grad_norm": 0.14352737367153168, "learning_rate": 2.099024676612671e-06, "loss": 0.4426, "num_input_tokens_seen": 96059920, "step": 79175 }, { "epoch": 8.818353936964026, "grad_norm": 0.08798424154520035, "learning_rate": 2.097076026203862e-06, "loss": 0.4514, "num_input_tokens_seen": 96066064, "step": 79180 }, { "epoch": 8.818910791847644, "grad_norm": 0.14339442551136017, "learning_rate": 2.095128241140909e-06, "loss": 0.4485, "num_input_tokens_seen": 96072272, "step": 79185 }, { "epoch": 8.819467646731262, "grad_norm": 0.12914501130580902, "learning_rate": 2.09318132149742e-06, "loss": 0.4558, "num_input_tokens_seen": 96077872, "step": 79190 }, { "epoch": 8.82002450161488, "grad_norm": 0.09405400604009628, "learning_rate": 2.091235267346947e-06, "loss": 0.4576, "num_input_tokens_seen": 96084176, "step": 79195 }, { "epoch": 8.820581356498497, "grad_norm": 0.07895521819591522, "learning_rate": 2.0892900787630177e-06, "loss": 0.4684, "num_input_tokens_seen": 96090256, "step": 79200 }, { "epoch": 8.821138211382113, "grad_norm": 0.10360348224639893, "learning_rate": 2.0873457558191338e-06, "loss": 0.4624, "num_input_tokens_seen": 96095920, "step": 79205 }, { "epoch": 8.82169506626573, "grad_norm": 0.14613322913646698, "learning_rate": 2.085402298588751e-06, "loss": 0.454, "num_input_tokens_seen": 96101808, "step": 79210 }, { "epoch": 8.822251921149348, "grad_norm": 0.090727798640728, "learning_rate": 2.0834597071453078e-06, "loss": 0.4478, "num_input_tokens_seen": 96107728, "step": 79215 }, { "epoch": 8.822808776032966, "grad_norm": 0.10922018438577652, "learning_rate": 2.081517981562192e-06, "loss": 0.4606, "num_input_tokens_seen": 96113840, "step": 79220 }, { "epoch": 8.823365630916584, "grad_norm": 0.11682762950658798, "learning_rate": 2.079577121912779e-06, "loss": 0.4598, "num_input_tokens_seen": 96120048, "step": 79225 }, { "epoch": 8.8239224858002, "grad_norm": 0.08661806583404541, "learning_rate": 2.0776371282703876e-06, "loss": 0.4644, "num_input_tokens_seen": 96126032, "step": 79230 }, { "epoch": 8.824479340683817, "grad_norm": 0.12365979701280594, "learning_rate": 2.075698000708323e-06, "loss": 0.4744, "num_input_tokens_seen": 96132304, "step": 79235 }, { "epoch": 8.825036195567435, "grad_norm": 0.10051534324884415, "learning_rate": 2.0737597392998574e-06, "loss": 0.452, "num_input_tokens_seen": 96138192, "step": 79240 }, { "epoch": 8.825593050451053, "grad_norm": 0.11953511834144592, "learning_rate": 2.0718223441182182e-06, "loss": 0.4611, "num_input_tokens_seen": 96143888, "step": 79245 }, { "epoch": 8.82614990533467, "grad_norm": 0.12883584201335907, "learning_rate": 2.069885815236608e-06, "loss": 0.4604, "num_input_tokens_seen": 96150160, "step": 79250 }, { "epoch": 8.826706760218286, "grad_norm": 0.11650899797677994, "learning_rate": 2.067950152728193e-06, "loss": 0.4541, "num_input_tokens_seen": 96156080, "step": 79255 }, { "epoch": 8.827263615101904, "grad_norm": 0.1203785166144371, "learning_rate": 2.0660153566661093e-06, "loss": 0.4657, "num_input_tokens_seen": 96162192, "step": 79260 }, { "epoch": 8.827820469985522, "grad_norm": 0.10703529417514801, "learning_rate": 2.064081427123468e-06, "loss": 0.4604, "num_input_tokens_seen": 96168272, "step": 79265 }, { "epoch": 8.82837732486914, "grad_norm": 0.0834464356303215, "learning_rate": 2.0621483641733248e-06, "loss": 0.4673, "num_input_tokens_seen": 96174160, "step": 79270 }, { "epoch": 8.828934179752757, "grad_norm": 0.10805077850818634, "learning_rate": 2.0602161678887286e-06, "loss": 0.4512, "num_input_tokens_seen": 96179984, "step": 79275 }, { "epoch": 8.829491034636373, "grad_norm": 0.10710220783948898, "learning_rate": 2.058284838342678e-06, "loss": 0.4519, "num_input_tokens_seen": 96186224, "step": 79280 }, { "epoch": 8.83004788951999, "grad_norm": 0.0889122262597084, "learning_rate": 2.0563543756081494e-06, "loss": 0.4597, "num_input_tokens_seen": 96192080, "step": 79285 }, { "epoch": 8.830604744403608, "grad_norm": 0.1082867980003357, "learning_rate": 2.054424779758074e-06, "loss": 0.4674, "num_input_tokens_seen": 96198096, "step": 79290 }, { "epoch": 8.831161599287226, "grad_norm": 0.08032403141260147, "learning_rate": 2.052496050865363e-06, "loss": 0.4634, "num_input_tokens_seen": 96204144, "step": 79295 }, { "epoch": 8.831718454170844, "grad_norm": 0.09134878218173981, "learning_rate": 2.0505681890028964e-06, "loss": 0.4633, "num_input_tokens_seen": 96210064, "step": 79300 }, { "epoch": 8.832275309054461, "grad_norm": 0.1353449523448944, "learning_rate": 2.048641194243503e-06, "loss": 0.4611, "num_input_tokens_seen": 96216592, "step": 79305 }, { "epoch": 8.832832163938077, "grad_norm": 0.11735790222883224, "learning_rate": 2.0467150666600018e-06, "loss": 0.4621, "num_input_tokens_seen": 96222672, "step": 79310 }, { "epoch": 8.833389018821695, "grad_norm": 0.1059173047542572, "learning_rate": 2.0447898063251564e-06, "loss": 0.461, "num_input_tokens_seen": 96229040, "step": 79315 }, { "epoch": 8.833945873705312, "grad_norm": 0.11711188405752182, "learning_rate": 2.04286541331172e-06, "loss": 0.4521, "num_input_tokens_seen": 96235472, "step": 79320 }, { "epoch": 8.83450272858893, "grad_norm": 0.10916395485401154, "learning_rate": 2.0409418876923963e-06, "loss": 0.4541, "num_input_tokens_seen": 96241584, "step": 79325 }, { "epoch": 8.835059583472548, "grad_norm": 0.11222051829099655, "learning_rate": 2.0390192295398624e-06, "loss": 0.4611, "num_input_tokens_seen": 96247824, "step": 79330 }, { "epoch": 8.835616438356164, "grad_norm": 0.10469533503055573, "learning_rate": 2.037097438926769e-06, "loss": 0.4672, "num_input_tokens_seen": 96253680, "step": 79335 }, { "epoch": 8.836173293239781, "grad_norm": 0.08796992152929306, "learning_rate": 2.0351765159257186e-06, "loss": 0.4565, "num_input_tokens_seen": 96259920, "step": 79340 }, { "epoch": 8.836730148123399, "grad_norm": 0.13747820258140564, "learning_rate": 2.033256460609298e-06, "loss": 0.4617, "num_input_tokens_seen": 96266096, "step": 79345 }, { "epoch": 8.837287003007017, "grad_norm": 0.11176736652851105, "learning_rate": 2.0313372730500435e-06, "loss": 0.4668, "num_input_tokens_seen": 96272208, "step": 79350 }, { "epoch": 8.837843857890634, "grad_norm": 0.10691826790571213, "learning_rate": 2.029418953320478e-06, "loss": 0.4646, "num_input_tokens_seen": 96278544, "step": 79355 }, { "epoch": 8.83840071277425, "grad_norm": 0.1251349151134491, "learning_rate": 2.027501501493073e-06, "loss": 0.4686, "num_input_tokens_seen": 96284912, "step": 79360 }, { "epoch": 8.838957567657868, "grad_norm": 0.1405966430902481, "learning_rate": 2.02558491764028e-06, "loss": 0.4589, "num_input_tokens_seen": 96291056, "step": 79365 }, { "epoch": 8.839514422541486, "grad_norm": 0.1314469873905182, "learning_rate": 2.0236692018345186e-06, "loss": 0.4611, "num_input_tokens_seen": 96297168, "step": 79370 }, { "epoch": 8.840071277425103, "grad_norm": 0.10451221466064453, "learning_rate": 2.0217543541481644e-06, "loss": 0.4599, "num_input_tokens_seen": 96303024, "step": 79375 }, { "epoch": 8.840628132308721, "grad_norm": 0.12425689399242401, "learning_rate": 2.01984037465357e-06, "loss": 0.455, "num_input_tokens_seen": 96308560, "step": 79380 }, { "epoch": 8.841184987192337, "grad_norm": 0.08582282811403275, "learning_rate": 2.0179272634230443e-06, "loss": 0.4703, "num_input_tokens_seen": 96314640, "step": 79385 }, { "epoch": 8.841741842075955, "grad_norm": 0.11200256645679474, "learning_rate": 2.0160150205288773e-06, "loss": 0.4664, "num_input_tokens_seen": 96320752, "step": 79390 }, { "epoch": 8.842298696959572, "grad_norm": 0.11736270040273666, "learning_rate": 2.014103646043322e-06, "loss": 0.4685, "num_input_tokens_seen": 96326544, "step": 79395 }, { "epoch": 8.84285555184319, "grad_norm": 0.12997306883335114, "learning_rate": 2.01219314003859e-06, "loss": 0.4452, "num_input_tokens_seen": 96332688, "step": 79400 }, { "epoch": 8.843412406726808, "grad_norm": 0.10503329336643219, "learning_rate": 2.0102835025868737e-06, "loss": 0.4594, "num_input_tokens_seen": 96339024, "step": 79405 }, { "epoch": 8.843969261610424, "grad_norm": 0.09747731685638428, "learning_rate": 2.0083747337603152e-06, "loss": 0.4629, "num_input_tokens_seen": 96344976, "step": 79410 }, { "epoch": 8.844526116494041, "grad_norm": 0.15814723074436188, "learning_rate": 2.006466833631046e-06, "loss": 0.4567, "num_input_tokens_seen": 96351536, "step": 79415 }, { "epoch": 8.845082971377659, "grad_norm": 0.11501748114824295, "learning_rate": 2.0045598022711415e-06, "loss": 0.4436, "num_input_tokens_seen": 96357584, "step": 79420 }, { "epoch": 8.845639826261277, "grad_norm": 0.10694915801286697, "learning_rate": 2.0026536397526578e-06, "loss": 0.4601, "num_input_tokens_seen": 96363984, "step": 79425 }, { "epoch": 8.846196681144894, "grad_norm": 0.1236492320895195, "learning_rate": 2.000748346147624e-06, "loss": 0.4662, "num_input_tokens_seen": 96369648, "step": 79430 }, { "epoch": 8.846753536028512, "grad_norm": 0.13547170162200928, "learning_rate": 1.998843921528018e-06, "loss": 0.457, "num_input_tokens_seen": 96375824, "step": 79435 }, { "epoch": 8.847310390912128, "grad_norm": 0.1159413605928421, "learning_rate": 1.996940365965805e-06, "loss": 0.446, "num_input_tokens_seen": 96381616, "step": 79440 }, { "epoch": 8.847867245795745, "grad_norm": 0.11358100175857544, "learning_rate": 1.9950376795328963e-06, "loss": 0.4537, "num_input_tokens_seen": 96387728, "step": 79445 }, { "epoch": 8.848424100679363, "grad_norm": 0.0868912860751152, "learning_rate": 1.9931358623011904e-06, "loss": 0.4532, "num_input_tokens_seen": 96394032, "step": 79450 }, { "epoch": 8.84898095556298, "grad_norm": 0.1362202763557434, "learning_rate": 1.991234914342538e-06, "loss": 0.4746, "num_input_tokens_seen": 96399664, "step": 79455 }, { "epoch": 8.849537810446598, "grad_norm": 0.10290110856294632, "learning_rate": 1.989334835728765e-06, "loss": 0.459, "num_input_tokens_seen": 96405648, "step": 79460 }, { "epoch": 8.850094665330214, "grad_norm": 0.09288263320922852, "learning_rate": 1.987435626531667e-06, "loss": 0.4633, "num_input_tokens_seen": 96411696, "step": 79465 }, { "epoch": 8.850651520213832, "grad_norm": 0.12982326745986938, "learning_rate": 1.9855372868229944e-06, "loss": 0.4616, "num_input_tokens_seen": 96418128, "step": 79470 }, { "epoch": 8.85120837509745, "grad_norm": 0.10148432850837708, "learning_rate": 1.9836398166744817e-06, "loss": 0.4528, "num_input_tokens_seen": 96424592, "step": 79475 }, { "epoch": 8.851765229981067, "grad_norm": 0.0952238142490387, "learning_rate": 1.9817432161578163e-06, "loss": 0.4677, "num_input_tokens_seen": 96430576, "step": 79480 }, { "epoch": 8.852322084864685, "grad_norm": 0.09397272765636444, "learning_rate": 1.979847485344652e-06, "loss": 0.4717, "num_input_tokens_seen": 96436848, "step": 79485 }, { "epoch": 8.852878939748301, "grad_norm": 0.18723367154598236, "learning_rate": 1.977952624306628e-06, "loss": 0.4559, "num_input_tokens_seen": 96442992, "step": 79490 }, { "epoch": 8.853435794631919, "grad_norm": 0.1251479983329773, "learning_rate": 1.9760586331153267e-06, "loss": 0.4548, "num_input_tokens_seen": 96449008, "step": 79495 }, { "epoch": 8.853992649515536, "grad_norm": 0.10710127651691437, "learning_rate": 1.9741655118423184e-06, "loss": 0.4729, "num_input_tokens_seen": 96454928, "step": 79500 }, { "epoch": 8.854549504399154, "grad_norm": 0.12394794821739197, "learning_rate": 1.972273260559121e-06, "loss": 0.4533, "num_input_tokens_seen": 96460624, "step": 79505 }, { "epoch": 8.855106359282772, "grad_norm": 0.12323309481143951, "learning_rate": 1.970381879337238e-06, "loss": 0.4645, "num_input_tokens_seen": 96466032, "step": 79510 }, { "epoch": 8.855663214166388, "grad_norm": 0.10124950855970383, "learning_rate": 1.9684913682481323e-06, "loss": 0.4587, "num_input_tokens_seen": 96472208, "step": 79515 }, { "epoch": 8.856220069050005, "grad_norm": 0.12240409851074219, "learning_rate": 1.9666017273632263e-06, "loss": 0.4607, "num_input_tokens_seen": 96478192, "step": 79520 }, { "epoch": 8.856776923933623, "grad_norm": 0.1273595690727234, "learning_rate": 1.9647129567539254e-06, "loss": 0.4665, "num_input_tokens_seen": 96483952, "step": 79525 }, { "epoch": 8.85733377881724, "grad_norm": 0.11137689650058746, "learning_rate": 1.9628250564915878e-06, "loss": 0.466, "num_input_tokens_seen": 96490224, "step": 79530 }, { "epoch": 8.857890633700858, "grad_norm": 0.09355048835277557, "learning_rate": 1.9609380266475434e-06, "loss": 0.4594, "num_input_tokens_seen": 96496176, "step": 79535 }, { "epoch": 8.858447488584474, "grad_norm": 0.10086213052272797, "learning_rate": 1.9590518672930963e-06, "loss": 0.4649, "num_input_tokens_seen": 96502160, "step": 79540 }, { "epoch": 8.859004343468092, "grad_norm": 0.0862283930182457, "learning_rate": 1.9571665784995057e-06, "loss": 0.4626, "num_input_tokens_seen": 96508656, "step": 79545 }, { "epoch": 8.85956119835171, "grad_norm": 0.09962808340787888, "learning_rate": 1.9552821603380096e-06, "loss": 0.4712, "num_input_tokens_seen": 96514768, "step": 79550 }, { "epoch": 8.860118053235327, "grad_norm": 0.12035904824733734, "learning_rate": 1.9533986128797977e-06, "loss": 0.4675, "num_input_tokens_seen": 96521072, "step": 79555 }, { "epoch": 8.860674908118945, "grad_norm": 0.11314428597688675, "learning_rate": 1.9515159361960492e-06, "loss": 0.4637, "num_input_tokens_seen": 96527088, "step": 79560 }, { "epoch": 8.86123176300256, "grad_norm": 0.1449347287416458, "learning_rate": 1.9496341303578857e-06, "loss": 0.4617, "num_input_tokens_seen": 96533360, "step": 79565 }, { "epoch": 8.861788617886178, "grad_norm": 0.10655660927295685, "learning_rate": 1.9477531954364138e-06, "loss": 0.4702, "num_input_tokens_seen": 96539408, "step": 79570 }, { "epoch": 8.862345472769796, "grad_norm": 0.09324371814727783, "learning_rate": 1.9458731315027068e-06, "loss": 0.4515, "num_input_tokens_seen": 96545392, "step": 79575 }, { "epoch": 8.862902327653414, "grad_norm": 0.14217455685138702, "learning_rate": 1.9439939386277863e-06, "loss": 0.4648, "num_input_tokens_seen": 96551280, "step": 79580 }, { "epoch": 8.863459182537031, "grad_norm": 0.10332384705543518, "learning_rate": 1.942115616882667e-06, "loss": 0.4611, "num_input_tokens_seen": 96557392, "step": 79585 }, { "epoch": 8.864016037420647, "grad_norm": 0.08654849231243134, "learning_rate": 1.940238166338307e-06, "loss": 0.4488, "num_input_tokens_seen": 96563632, "step": 79590 }, { "epoch": 8.864572892304265, "grad_norm": 0.099108025431633, "learning_rate": 1.938361587065654e-06, "loss": 0.4693, "num_input_tokens_seen": 96569840, "step": 79595 }, { "epoch": 8.865129747187883, "grad_norm": 0.11708880215883255, "learning_rate": 1.9364858791355995e-06, "loss": 0.4595, "num_input_tokens_seen": 96575920, "step": 79600 }, { "epoch": 8.8656866020715, "grad_norm": 0.12139251083135605, "learning_rate": 1.9346110426190194e-06, "loss": 0.4635, "num_input_tokens_seen": 96582384, "step": 79605 }, { "epoch": 8.866243456955118, "grad_norm": 0.12864452600479126, "learning_rate": 1.932737077586755e-06, "loss": 0.4681, "num_input_tokens_seen": 96588560, "step": 79610 }, { "epoch": 8.866800311838734, "grad_norm": 0.10529083758592606, "learning_rate": 1.9308639841096016e-06, "loss": 0.4492, "num_input_tokens_seen": 96594672, "step": 79615 }, { "epoch": 8.867357166722352, "grad_norm": 0.11175422370433807, "learning_rate": 1.928991762258339e-06, "loss": 0.4555, "num_input_tokens_seen": 96600784, "step": 79620 }, { "epoch": 8.86791402160597, "grad_norm": 0.11051283031702042, "learning_rate": 1.9271204121037e-06, "loss": 0.4688, "num_input_tokens_seen": 96606672, "step": 79625 }, { "epoch": 8.868470876489587, "grad_norm": 0.1408480703830719, "learning_rate": 1.925249933716397e-06, "loss": 0.4546, "num_input_tokens_seen": 96612720, "step": 79630 }, { "epoch": 8.869027731373205, "grad_norm": 0.11595281958580017, "learning_rate": 1.923380327167093e-06, "loss": 0.4692, "num_input_tokens_seen": 96618352, "step": 79635 }, { "epoch": 8.86958458625682, "grad_norm": 0.10805075615644455, "learning_rate": 1.9215115925264317e-06, "loss": 0.4673, "num_input_tokens_seen": 96624656, "step": 79640 }, { "epoch": 8.870141441140438, "grad_norm": 0.10489193350076675, "learning_rate": 1.919643729865028e-06, "loss": 0.4434, "num_input_tokens_seen": 96630448, "step": 79645 }, { "epoch": 8.870698296024056, "grad_norm": 0.11161455512046814, "learning_rate": 1.9177767392534434e-06, "loss": 0.4573, "num_input_tokens_seen": 96636688, "step": 79650 }, { "epoch": 8.871255150907674, "grad_norm": 0.12712644040584564, "learning_rate": 1.9159106207622265e-06, "loss": 0.4533, "num_input_tokens_seen": 96643152, "step": 79655 }, { "epoch": 8.871812005791291, "grad_norm": 0.09057661145925522, "learning_rate": 1.9140453744618814e-06, "loss": 0.469, "num_input_tokens_seen": 96649328, "step": 79660 }, { "epoch": 8.872368860674909, "grad_norm": 0.1498440057039261, "learning_rate": 1.912181000422883e-06, "loss": 0.4649, "num_input_tokens_seen": 96655728, "step": 79665 }, { "epoch": 8.872925715558525, "grad_norm": 0.09387167543172836, "learning_rate": 1.91031749871568e-06, "loss": 0.4582, "num_input_tokens_seen": 96661968, "step": 79670 }, { "epoch": 8.873482570442143, "grad_norm": 0.08229870349168777, "learning_rate": 1.9084548694106726e-06, "loss": 0.4544, "num_input_tokens_seen": 96668432, "step": 79675 }, { "epoch": 8.87403942532576, "grad_norm": 0.11386144161224365, "learning_rate": 1.9065931125782422e-06, "loss": 0.45, "num_input_tokens_seen": 96674768, "step": 79680 }, { "epoch": 8.874596280209378, "grad_norm": 0.09762129187583923, "learning_rate": 1.9047322282887304e-06, "loss": 0.4638, "num_input_tokens_seen": 96680944, "step": 79685 }, { "epoch": 8.875153135092996, "grad_norm": 0.08858678489923477, "learning_rate": 1.9028722166124474e-06, "loss": 0.4597, "num_input_tokens_seen": 96686416, "step": 79690 }, { "epoch": 8.875709989976611, "grad_norm": 0.11000906676054001, "learning_rate": 1.9010130776196704e-06, "loss": 0.4573, "num_input_tokens_seen": 96692528, "step": 79695 }, { "epoch": 8.876266844860229, "grad_norm": 0.10888537764549255, "learning_rate": 1.8991548113806408e-06, "loss": 0.4653, "num_input_tokens_seen": 96698736, "step": 79700 }, { "epoch": 8.876823699743847, "grad_norm": 0.07852397859096527, "learning_rate": 1.8972974179655768e-06, "loss": 0.4515, "num_input_tokens_seen": 96704528, "step": 79705 }, { "epoch": 8.877380554627464, "grad_norm": 0.10678604990243912, "learning_rate": 1.8954408974446503e-06, "loss": 0.4625, "num_input_tokens_seen": 96710544, "step": 79710 }, { "epoch": 8.877937409511082, "grad_norm": 0.10982014983892441, "learning_rate": 1.8935852498880108e-06, "loss": 0.4551, "num_input_tokens_seen": 96716848, "step": 79715 }, { "epoch": 8.878494264394698, "grad_norm": 0.10805575549602509, "learning_rate": 1.8917304753657661e-06, "loss": 0.4597, "num_input_tokens_seen": 96722928, "step": 79720 }, { "epoch": 8.879051119278316, "grad_norm": 0.11456257849931717, "learning_rate": 1.8898765739480018e-06, "loss": 0.4702, "num_input_tokens_seen": 96729104, "step": 79725 }, { "epoch": 8.879607974161933, "grad_norm": 0.0884501188993454, "learning_rate": 1.888023545704759e-06, "loss": 0.4655, "num_input_tokens_seen": 96735280, "step": 79730 }, { "epoch": 8.880164829045551, "grad_norm": 0.08814812451601028, "learning_rate": 1.8861713907060486e-06, "loss": 0.4701, "num_input_tokens_seen": 96741296, "step": 79735 }, { "epoch": 8.880721683929169, "grad_norm": 0.11672721058130264, "learning_rate": 1.8843201090218616e-06, "loss": 0.4621, "num_input_tokens_seen": 96746960, "step": 79740 }, { "epoch": 8.881278538812785, "grad_norm": 0.1088385134935379, "learning_rate": 1.8824697007221337e-06, "loss": 0.4651, "num_input_tokens_seen": 96752592, "step": 79745 }, { "epoch": 8.881835393696402, "grad_norm": 0.14319941401481628, "learning_rate": 1.880620165876787e-06, "loss": 0.4627, "num_input_tokens_seen": 96758608, "step": 79750 }, { "epoch": 8.88239224858002, "grad_norm": 0.10315743088722229, "learning_rate": 1.8787715045556986e-06, "loss": 0.4532, "num_input_tokens_seen": 96764784, "step": 79755 }, { "epoch": 8.882949103463638, "grad_norm": 0.17639444768428802, "learning_rate": 1.8769237168287184e-06, "loss": 0.4572, "num_input_tokens_seen": 96770608, "step": 79760 }, { "epoch": 8.883505958347255, "grad_norm": 0.1127658262848854, "learning_rate": 1.875076802765663e-06, "loss": 0.4654, "num_input_tokens_seen": 96776880, "step": 79765 }, { "epoch": 8.884062813230871, "grad_norm": 0.11895322054624557, "learning_rate": 1.873230762436312e-06, "loss": 0.4547, "num_input_tokens_seen": 96782736, "step": 79770 }, { "epoch": 8.884619668114489, "grad_norm": 0.08718810975551605, "learning_rate": 1.8713855959104187e-06, "loss": 0.462, "num_input_tokens_seen": 96788912, "step": 79775 }, { "epoch": 8.885176522998107, "grad_norm": 0.12929897010326385, "learning_rate": 1.8695413032576935e-06, "loss": 0.4431, "num_input_tokens_seen": 96794800, "step": 79780 }, { "epoch": 8.885733377881724, "grad_norm": 0.11436454951763153, "learning_rate": 1.8676978845478254e-06, "loss": 0.4673, "num_input_tokens_seen": 96801040, "step": 79785 }, { "epoch": 8.886290232765342, "grad_norm": 0.10706722736358643, "learning_rate": 1.8658553398504614e-06, "loss": 0.4616, "num_input_tokens_seen": 96807344, "step": 79790 }, { "epoch": 8.88684708764896, "grad_norm": 0.10510078817605972, "learning_rate": 1.8640136692352179e-06, "loss": 0.4654, "num_input_tokens_seen": 96813616, "step": 79795 }, { "epoch": 8.887403942532575, "grad_norm": 0.1276365965604782, "learning_rate": 1.8621728727716864e-06, "loss": 0.461, "num_input_tokens_seen": 96819632, "step": 79800 }, { "epoch": 8.887960797416193, "grad_norm": 0.10135567933320999, "learning_rate": 1.8603329505294059e-06, "loss": 0.4488, "num_input_tokens_seen": 96825872, "step": 79805 }, { "epoch": 8.88851765229981, "grad_norm": 0.12222398072481155, "learning_rate": 1.8584939025779068e-06, "loss": 0.4589, "num_input_tokens_seen": 96832144, "step": 79810 }, { "epoch": 8.889074507183429, "grad_norm": 0.11205663532018661, "learning_rate": 1.8566557289866643e-06, "loss": 0.4505, "num_input_tokens_seen": 96838352, "step": 79815 }, { "epoch": 8.889631362067046, "grad_norm": 0.11604990065097809, "learning_rate": 1.854818429825139e-06, "loss": 0.4727, "num_input_tokens_seen": 96844720, "step": 79820 }, { "epoch": 8.890188216950662, "grad_norm": 0.10474731028079987, "learning_rate": 1.8529820051627427e-06, "loss": 0.4591, "num_input_tokens_seen": 96850896, "step": 79825 }, { "epoch": 8.89074507183428, "grad_norm": 0.1015242338180542, "learning_rate": 1.851146455068864e-06, "loss": 0.4603, "num_input_tokens_seen": 96857168, "step": 79830 }, { "epoch": 8.891301926717897, "grad_norm": 0.11694180965423584, "learning_rate": 1.8493117796128585e-06, "loss": 0.4617, "num_input_tokens_seen": 96863504, "step": 79835 }, { "epoch": 8.891858781601515, "grad_norm": 0.1367374062538147, "learning_rate": 1.847477978864043e-06, "loss": 0.4556, "num_input_tokens_seen": 96869712, "step": 79840 }, { "epoch": 8.892415636485133, "grad_norm": 0.0895114541053772, "learning_rate": 1.8456450528917062e-06, "loss": 0.4638, "num_input_tokens_seen": 96875568, "step": 79845 }, { "epoch": 8.892972491368749, "grad_norm": 0.11625712364912033, "learning_rate": 1.8438130017650989e-06, "loss": 0.4555, "num_input_tokens_seen": 96881616, "step": 79850 }, { "epoch": 8.893529346252366, "grad_norm": 0.10138794034719467, "learning_rate": 1.8419818255534487e-06, "loss": 0.4659, "num_input_tokens_seen": 96887024, "step": 79855 }, { "epoch": 8.894086201135984, "grad_norm": 0.13598300516605377, "learning_rate": 1.8401515243259337e-06, "loss": 0.4619, "num_input_tokens_seen": 96893072, "step": 79860 }, { "epoch": 8.894643056019602, "grad_norm": 0.13872411847114563, "learning_rate": 1.8383220981517124e-06, "loss": 0.4522, "num_input_tokens_seen": 96899024, "step": 79865 }, { "epoch": 8.89519991090322, "grad_norm": 0.1505826711654663, "learning_rate": 1.836493547099913e-06, "loss": 0.4646, "num_input_tokens_seen": 96904720, "step": 79870 }, { "epoch": 8.895756765786835, "grad_norm": 0.09994462877511978, "learning_rate": 1.834665871239613e-06, "loss": 0.4582, "num_input_tokens_seen": 96910832, "step": 79875 }, { "epoch": 8.896313620670453, "grad_norm": 0.1263798028230667, "learning_rate": 1.8328390706398801e-06, "loss": 0.4718, "num_input_tokens_seen": 96917072, "step": 79880 }, { "epoch": 8.89687047555407, "grad_norm": 0.09724033623933792, "learning_rate": 1.8310131453697282e-06, "loss": 0.4694, "num_input_tokens_seen": 96923024, "step": 79885 }, { "epoch": 8.897427330437688, "grad_norm": 0.10185187309980392, "learning_rate": 1.8291880954981438e-06, "loss": 0.4561, "num_input_tokens_seen": 96929232, "step": 79890 }, { "epoch": 8.897984185321306, "grad_norm": 0.11900284886360168, "learning_rate": 1.827363921094094e-06, "loss": 0.4638, "num_input_tokens_seen": 96934864, "step": 79895 }, { "epoch": 8.898541040204922, "grad_norm": 0.08440462499856949, "learning_rate": 1.8255406222264876e-06, "loss": 0.4569, "num_input_tokens_seen": 96940976, "step": 79900 }, { "epoch": 8.89909789508854, "grad_norm": 0.10503283888101578, "learning_rate": 1.8237181989642305e-06, "loss": 0.4439, "num_input_tokens_seen": 96946928, "step": 79905 }, { "epoch": 8.899654749972157, "grad_norm": 0.11860787123441696, "learning_rate": 1.821896651376165e-06, "loss": 0.4527, "num_input_tokens_seen": 96953008, "step": 79910 }, { "epoch": 8.900211604855775, "grad_norm": 0.08101309090852737, "learning_rate": 1.8200759795311245e-06, "loss": 0.4504, "num_input_tokens_seen": 96959248, "step": 79915 }, { "epoch": 8.900768459739393, "grad_norm": 0.17058192193508148, "learning_rate": 1.8182561834978989e-06, "loss": 0.4645, "num_input_tokens_seen": 96965296, "step": 79920 }, { "epoch": 8.901325314623008, "grad_norm": 0.10336355865001678, "learning_rate": 1.8164372633452414e-06, "loss": 0.4606, "num_input_tokens_seen": 96971472, "step": 79925 }, { "epoch": 8.901882169506626, "grad_norm": 0.10955752432346344, "learning_rate": 1.8146192191418832e-06, "loss": 0.4521, "num_input_tokens_seen": 96977456, "step": 79930 }, { "epoch": 8.902439024390244, "grad_norm": 0.09217923134565353, "learning_rate": 1.812802050956508e-06, "loss": 0.4622, "num_input_tokens_seen": 96983504, "step": 79935 }, { "epoch": 8.902995879273861, "grad_norm": 0.10370532423257828, "learning_rate": 1.8109857588577806e-06, "loss": 0.4559, "num_input_tokens_seen": 96989456, "step": 79940 }, { "epoch": 8.90355273415748, "grad_norm": 0.10518806427717209, "learning_rate": 1.8091703429143265e-06, "loss": 0.4555, "num_input_tokens_seen": 96995440, "step": 79945 }, { "epoch": 8.904109589041095, "grad_norm": 0.12552975118160248, "learning_rate": 1.8073558031947296e-06, "loss": 0.4547, "num_input_tokens_seen": 97000944, "step": 79950 }, { "epoch": 8.904666443924713, "grad_norm": 0.06970099359750748, "learning_rate": 1.8055421397675603e-06, "loss": 0.4498, "num_input_tokens_seen": 97007088, "step": 79955 }, { "epoch": 8.90522329880833, "grad_norm": 0.10793574899435043, "learning_rate": 1.803729352701336e-06, "loss": 0.4678, "num_input_tokens_seen": 97013232, "step": 79960 }, { "epoch": 8.905780153691948, "grad_norm": 0.1257839947938919, "learning_rate": 1.8019174420645573e-06, "loss": 0.458, "num_input_tokens_seen": 97019600, "step": 79965 }, { "epoch": 8.906337008575566, "grad_norm": 0.12957949936389923, "learning_rate": 1.8001064079256725e-06, "loss": 0.4518, "num_input_tokens_seen": 97025424, "step": 79970 }, { "epoch": 8.906893863459182, "grad_norm": 0.14594364166259766, "learning_rate": 1.798296250353118e-06, "loss": 0.4665, "num_input_tokens_seen": 97031408, "step": 79975 }, { "epoch": 8.9074507183428, "grad_norm": 0.1253218948841095, "learning_rate": 1.7964869694152868e-06, "loss": 0.4624, "num_input_tokens_seen": 97037264, "step": 79980 }, { "epoch": 8.908007573226417, "grad_norm": 0.1067781075835228, "learning_rate": 1.7946785651805353e-06, "loss": 0.4668, "num_input_tokens_seen": 97043568, "step": 79985 }, { "epoch": 8.908564428110035, "grad_norm": 0.09870798140764236, "learning_rate": 1.7928710377171948e-06, "loss": 0.4503, "num_input_tokens_seen": 97049296, "step": 79990 }, { "epoch": 8.909121282993652, "grad_norm": 0.11214065551757812, "learning_rate": 1.791064387093555e-06, "loss": 0.457, "num_input_tokens_seen": 97055536, "step": 79995 }, { "epoch": 8.909678137877268, "grad_norm": 0.12406055629253387, "learning_rate": 1.7892586133778837e-06, "loss": 0.4633, "num_input_tokens_seen": 97061424, "step": 80000 }, { "epoch": 8.910234992760886, "grad_norm": 0.09155422449111938, "learning_rate": 1.7874537166383986e-06, "loss": 0.4771, "num_input_tokens_seen": 97067440, "step": 80005 }, { "epoch": 8.910791847644504, "grad_norm": 0.11027250438928604, "learning_rate": 1.7856496969433007e-06, "loss": 0.4738, "num_input_tokens_seen": 97073104, "step": 80010 }, { "epoch": 8.911348702528121, "grad_norm": 0.06588870286941528, "learning_rate": 1.7838465543607573e-06, "loss": 0.471, "num_input_tokens_seen": 97079440, "step": 80015 }, { "epoch": 8.911905557411739, "grad_norm": 0.1052577942609787, "learning_rate": 1.782044288958884e-06, "loss": 0.4594, "num_input_tokens_seen": 97085392, "step": 80020 }, { "epoch": 8.912462412295357, "grad_norm": 0.10566404461860657, "learning_rate": 1.7802429008057897e-06, "loss": 0.4676, "num_input_tokens_seen": 97091696, "step": 80025 }, { "epoch": 8.913019267178973, "grad_norm": 0.09474212676286697, "learning_rate": 1.778442389969523e-06, "loss": 0.46, "num_input_tokens_seen": 97097776, "step": 80030 }, { "epoch": 8.91357612206259, "grad_norm": 0.09215158969163895, "learning_rate": 1.7766427565181266e-06, "loss": 0.4639, "num_input_tokens_seen": 97103472, "step": 80035 }, { "epoch": 8.914132976946208, "grad_norm": 0.09544280916452408, "learning_rate": 1.774844000519585e-06, "loss": 0.4662, "num_input_tokens_seen": 97109392, "step": 80040 }, { "epoch": 8.914689831829826, "grad_norm": 0.13135948777198792, "learning_rate": 1.773046122041866e-06, "loss": 0.456, "num_input_tokens_seen": 97115568, "step": 80045 }, { "epoch": 8.915246686713443, "grad_norm": 0.13072538375854492, "learning_rate": 1.7712491211529014e-06, "loss": 0.476, "num_input_tokens_seen": 97121776, "step": 80050 }, { "epoch": 8.91580354159706, "grad_norm": 0.09539107978343964, "learning_rate": 1.7694529979205842e-06, "loss": 0.4572, "num_input_tokens_seen": 97127856, "step": 80055 }, { "epoch": 8.916360396480677, "grad_norm": 0.14153270423412323, "learning_rate": 1.7676577524127796e-06, "loss": 0.4538, "num_input_tokens_seen": 97134128, "step": 80060 }, { "epoch": 8.916917251364294, "grad_norm": 0.11160727590322495, "learning_rate": 1.7658633846973138e-06, "loss": 0.4609, "num_input_tokens_seen": 97140112, "step": 80065 }, { "epoch": 8.917474106247912, "grad_norm": 0.08612414449453354, "learning_rate": 1.764069894841988e-06, "loss": 0.46, "num_input_tokens_seen": 97145552, "step": 80070 }, { "epoch": 8.91803096113153, "grad_norm": 0.10911652445793152, "learning_rate": 1.7622772829145677e-06, "loss": 0.4595, "num_input_tokens_seen": 97152048, "step": 80075 }, { "epoch": 8.918587816015146, "grad_norm": 0.11886461824178696, "learning_rate": 1.7604855489827765e-06, "loss": 0.4341, "num_input_tokens_seen": 97158352, "step": 80080 }, { "epoch": 8.919144670898763, "grad_norm": 0.13637202978134155, "learning_rate": 1.7586946931143184e-06, "loss": 0.4623, "num_input_tokens_seen": 97164336, "step": 80085 }, { "epoch": 8.919701525782381, "grad_norm": 0.12666507065296173, "learning_rate": 1.7569047153768531e-06, "loss": 0.4635, "num_input_tokens_seen": 97170416, "step": 80090 }, { "epoch": 8.920258380665999, "grad_norm": 0.10282278060913086, "learning_rate": 1.7551156158380183e-06, "loss": 0.4744, "num_input_tokens_seen": 97176688, "step": 80095 }, { "epoch": 8.920815235549616, "grad_norm": 0.14521458745002747, "learning_rate": 1.7533273945654015e-06, "loss": 0.4421, "num_input_tokens_seen": 97182512, "step": 80100 }, { "epoch": 8.921372090433232, "grad_norm": 0.1019255518913269, "learning_rate": 1.7515400516265767e-06, "loss": 0.4592, "num_input_tokens_seen": 97187920, "step": 80105 }, { "epoch": 8.92192894531685, "grad_norm": 0.13435502350330353, "learning_rate": 1.749753587089073e-06, "loss": 0.4589, "num_input_tokens_seen": 97193776, "step": 80110 }, { "epoch": 8.922485800200468, "grad_norm": 0.13132530450820923, "learning_rate": 1.7479680010203835e-06, "loss": 0.4652, "num_input_tokens_seen": 97199792, "step": 80115 }, { "epoch": 8.923042655084085, "grad_norm": 0.10818028450012207, "learning_rate": 1.7461832934879852e-06, "loss": 0.4695, "num_input_tokens_seen": 97205904, "step": 80120 }, { "epoch": 8.923599509967703, "grad_norm": 0.09116099029779434, "learning_rate": 1.744399464559296e-06, "loss": 0.463, "num_input_tokens_seen": 97211728, "step": 80125 }, { "epoch": 8.92415636485132, "grad_norm": 0.06370390206575394, "learning_rate": 1.7426165143017259e-06, "loss": 0.4483, "num_input_tokens_seen": 97217488, "step": 80130 }, { "epoch": 8.924713219734937, "grad_norm": 0.13875706493854523, "learning_rate": 1.7408344427826296e-06, "loss": 0.442, "num_input_tokens_seen": 97223568, "step": 80135 }, { "epoch": 8.925270074618554, "grad_norm": 0.10389146953821182, "learning_rate": 1.7390532500693475e-06, "loss": 0.4552, "num_input_tokens_seen": 97229872, "step": 80140 }, { "epoch": 8.925826929502172, "grad_norm": 0.0707458108663559, "learning_rate": 1.7372729362291812e-06, "loss": 0.454, "num_input_tokens_seen": 97235920, "step": 80145 }, { "epoch": 8.92638378438579, "grad_norm": 0.11983189731836319, "learning_rate": 1.7354935013293882e-06, "loss": 0.4659, "num_input_tokens_seen": 97242064, "step": 80150 }, { "epoch": 8.926940639269407, "grad_norm": 0.19536328315734863, "learning_rate": 1.733714945437212e-06, "loss": 0.4516, "num_input_tokens_seen": 97248336, "step": 80155 }, { "epoch": 8.927497494153023, "grad_norm": 0.19277745485305786, "learning_rate": 1.7319372686198377e-06, "loss": 0.4597, "num_input_tokens_seen": 97254352, "step": 80160 }, { "epoch": 8.92805434903664, "grad_norm": 0.14323057234287262, "learning_rate": 1.7301604709444419e-06, "loss": 0.454, "num_input_tokens_seen": 97259600, "step": 80165 }, { "epoch": 8.928611203920259, "grad_norm": 0.1273941695690155, "learning_rate": 1.72838455247816e-06, "loss": 0.4651, "num_input_tokens_seen": 97265200, "step": 80170 }, { "epoch": 8.929168058803876, "grad_norm": 0.10560593754053116, "learning_rate": 1.72660951328808e-06, "loss": 0.472, "num_input_tokens_seen": 97271376, "step": 80175 }, { "epoch": 8.929724913687494, "grad_norm": 0.09050217270851135, "learning_rate": 1.7248353534412815e-06, "loss": 0.4585, "num_input_tokens_seen": 97277232, "step": 80180 }, { "epoch": 8.93028176857111, "grad_norm": 0.08405488729476929, "learning_rate": 1.7230620730047885e-06, "loss": 0.4668, "num_input_tokens_seen": 97283568, "step": 80185 }, { "epoch": 8.930838623454727, "grad_norm": 0.08164116740226746, "learning_rate": 1.721289672045609e-06, "loss": 0.4771, "num_input_tokens_seen": 97289872, "step": 80190 }, { "epoch": 8.931395478338345, "grad_norm": 0.08917570114135742, "learning_rate": 1.7195181506307028e-06, "loss": 0.462, "num_input_tokens_seen": 97296080, "step": 80195 }, { "epoch": 8.931952333221963, "grad_norm": 0.10321241617202759, "learning_rate": 1.7177475088270055e-06, "loss": 0.4532, "num_input_tokens_seen": 97301904, "step": 80200 }, { "epoch": 8.93250918810558, "grad_norm": 0.1238447055220604, "learning_rate": 1.715977746701425e-06, "loss": 0.4682, "num_input_tokens_seen": 97308112, "step": 80205 }, { "epoch": 8.933066042989196, "grad_norm": 0.14460571110248566, "learning_rate": 1.7142088643208154e-06, "loss": 0.4526, "num_input_tokens_seen": 97313712, "step": 80210 }, { "epoch": 8.933622897872814, "grad_norm": 0.08905182778835297, "learning_rate": 1.7124408617520238e-06, "loss": 0.4659, "num_input_tokens_seen": 97318800, "step": 80215 }, { "epoch": 8.934179752756432, "grad_norm": 0.11233317852020264, "learning_rate": 1.710673739061841e-06, "loss": 0.4636, "num_input_tokens_seen": 97324240, "step": 80220 }, { "epoch": 8.93473660764005, "grad_norm": 0.12345453351736069, "learning_rate": 1.7089074963170414e-06, "loss": 0.4451, "num_input_tokens_seen": 97330224, "step": 80225 }, { "epoch": 8.935293462523667, "grad_norm": 0.09337244182825089, "learning_rate": 1.707142133584355e-06, "loss": 0.4563, "num_input_tokens_seen": 97336016, "step": 80230 }, { "epoch": 8.935850317407283, "grad_norm": 0.13605931401252747, "learning_rate": 1.7053776509304842e-06, "loss": 0.439, "num_input_tokens_seen": 97342160, "step": 80235 }, { "epoch": 8.9364071722909, "grad_norm": 0.09975872188806534, "learning_rate": 1.7036140484220975e-06, "loss": 0.4632, "num_input_tokens_seen": 97347344, "step": 80240 }, { "epoch": 8.936964027174518, "grad_norm": 0.11730373650789261, "learning_rate": 1.7018513261258278e-06, "loss": 0.4534, "num_input_tokens_seen": 97353520, "step": 80245 }, { "epoch": 8.937520882058136, "grad_norm": 0.1771911382675171, "learning_rate": 1.7000894841082805e-06, "loss": 0.4495, "num_input_tokens_seen": 97359440, "step": 80250 }, { "epoch": 8.938077736941754, "grad_norm": 0.11707477271556854, "learning_rate": 1.6983285224360185e-06, "loss": 0.4592, "num_input_tokens_seen": 97365520, "step": 80255 }, { "epoch": 8.93863459182537, "grad_norm": 0.12405375391244888, "learning_rate": 1.6965684411755806e-06, "loss": 0.4701, "num_input_tokens_seen": 97371504, "step": 80260 }, { "epoch": 8.939191446708987, "grad_norm": 0.12430723011493683, "learning_rate": 1.6948092403934634e-06, "loss": 0.466, "num_input_tokens_seen": 97377744, "step": 80265 }, { "epoch": 8.939748301592605, "grad_norm": 0.13110898435115814, "learning_rate": 1.693050920156139e-06, "loss": 0.4546, "num_input_tokens_seen": 97383952, "step": 80270 }, { "epoch": 8.940305156476223, "grad_norm": 0.114177405834198, "learning_rate": 1.6912934805300456e-06, "loss": 0.466, "num_input_tokens_seen": 97389744, "step": 80275 }, { "epoch": 8.94086201135984, "grad_norm": 0.09728581458330154, "learning_rate": 1.6895369215815776e-06, "loss": 0.4702, "num_input_tokens_seen": 97395984, "step": 80280 }, { "epoch": 8.941418866243456, "grad_norm": 0.15986186265945435, "learning_rate": 1.6877812433771094e-06, "loss": 0.4631, "num_input_tokens_seen": 97402480, "step": 80285 }, { "epoch": 8.941975721127074, "grad_norm": 0.11138644814491272, "learning_rate": 1.686026445982969e-06, "loss": 0.4568, "num_input_tokens_seen": 97408752, "step": 80290 }, { "epoch": 8.942532576010692, "grad_norm": 0.11812549829483032, "learning_rate": 1.6842725294654638e-06, "loss": 0.4568, "num_input_tokens_seen": 97414288, "step": 80295 }, { "epoch": 8.94308943089431, "grad_norm": 0.14820928871631622, "learning_rate": 1.6825194938908689e-06, "loss": 0.4536, "num_input_tokens_seen": 97420144, "step": 80300 }, { "epoch": 8.943646285777927, "grad_norm": 0.10349331051111221, "learning_rate": 1.6807673393254037e-06, "loss": 0.4606, "num_input_tokens_seen": 97426704, "step": 80305 }, { "epoch": 8.944203140661543, "grad_norm": 0.09421990066766739, "learning_rate": 1.6790160658352815e-06, "loss": 0.4595, "num_input_tokens_seen": 97432752, "step": 80310 }, { "epoch": 8.94475999554516, "grad_norm": 0.11543964594602585, "learning_rate": 1.6772656734866665e-06, "loss": 0.4615, "num_input_tokens_seen": 97438640, "step": 80315 }, { "epoch": 8.945316850428778, "grad_norm": 0.11437638849020004, "learning_rate": 1.6755161623456943e-06, "loss": 0.4557, "num_input_tokens_seen": 97444752, "step": 80320 }, { "epoch": 8.945873705312396, "grad_norm": 0.09254535287618637, "learning_rate": 1.6737675324784736e-06, "loss": 0.4679, "num_input_tokens_seen": 97450768, "step": 80325 }, { "epoch": 8.946430560196013, "grad_norm": 0.10755884647369385, "learning_rate": 1.672019783951062e-06, "loss": 0.4545, "num_input_tokens_seen": 97456144, "step": 80330 }, { "epoch": 8.94698741507963, "grad_norm": 0.10893948376178741, "learning_rate": 1.6702729168295073e-06, "loss": 0.4595, "num_input_tokens_seen": 97462320, "step": 80335 }, { "epoch": 8.947544269963247, "grad_norm": 0.11791346222162247, "learning_rate": 1.6685269311798007e-06, "loss": 0.4499, "num_input_tokens_seen": 97468624, "step": 80340 }, { "epoch": 8.948101124846865, "grad_norm": 0.13426166772842407, "learning_rate": 1.6667818270679175e-06, "loss": 0.4559, "num_input_tokens_seen": 97474896, "step": 80345 }, { "epoch": 8.948657979730482, "grad_norm": 0.11628539860248566, "learning_rate": 1.6650376045597937e-06, "loss": 0.4631, "num_input_tokens_seen": 97481008, "step": 80350 }, { "epoch": 8.9492148346141, "grad_norm": 0.10693298280239105, "learning_rate": 1.6632942637213268e-06, "loss": 0.4609, "num_input_tokens_seen": 97487088, "step": 80355 }, { "epoch": 8.949771689497716, "grad_norm": 0.13378943502902985, "learning_rate": 1.6615518046183914e-06, "loss": 0.4684, "num_input_tokens_seen": 97493424, "step": 80360 }, { "epoch": 8.950328544381334, "grad_norm": 0.1183662936091423, "learning_rate": 1.6598102273168186e-06, "loss": 0.4536, "num_input_tokens_seen": 97499952, "step": 80365 }, { "epoch": 8.950885399264951, "grad_norm": 0.1379324346780777, "learning_rate": 1.6580695318824169e-06, "loss": 0.4541, "num_input_tokens_seen": 97505776, "step": 80370 }, { "epoch": 8.951442254148569, "grad_norm": 0.11886513978242874, "learning_rate": 1.6563297183809445e-06, "loss": 0.4543, "num_input_tokens_seen": 97511856, "step": 80375 }, { "epoch": 8.951999109032187, "grad_norm": 0.10930398106575012, "learning_rate": 1.6545907868781462e-06, "loss": 0.4572, "num_input_tokens_seen": 97518160, "step": 80380 }, { "epoch": 8.952555963915804, "grad_norm": 0.13711535930633545, "learning_rate": 1.6528527374397251e-06, "loss": 0.4688, "num_input_tokens_seen": 97523952, "step": 80385 }, { "epoch": 8.95311281879942, "grad_norm": 0.12941256165504456, "learning_rate": 1.6511155701313425e-06, "loss": 0.4638, "num_input_tokens_seen": 97529744, "step": 80390 }, { "epoch": 8.953669673683038, "grad_norm": 0.07031834870576859, "learning_rate": 1.649379285018643e-06, "loss": 0.4565, "num_input_tokens_seen": 97535888, "step": 80395 }, { "epoch": 8.954226528566656, "grad_norm": 0.09855673462152481, "learning_rate": 1.6476438821672213e-06, "loss": 0.4598, "num_input_tokens_seen": 97542096, "step": 80400 }, { "epoch": 8.954783383450273, "grad_norm": 0.10715724527835846, "learning_rate": 1.6459093616426557e-06, "loss": 0.4562, "num_input_tokens_seen": 97548336, "step": 80405 }, { "epoch": 8.955340238333891, "grad_norm": 0.11925642192363739, "learning_rate": 1.6441757235104716e-06, "loss": 0.4643, "num_input_tokens_seen": 97554832, "step": 80410 }, { "epoch": 8.955897093217507, "grad_norm": 0.16522374749183655, "learning_rate": 1.6424429678361747e-06, "loss": 0.4614, "num_input_tokens_seen": 97561040, "step": 80415 }, { "epoch": 8.956453948101124, "grad_norm": 0.10711672902107239, "learning_rate": 1.6407110946852378e-06, "loss": 0.4737, "num_input_tokens_seen": 97567088, "step": 80420 }, { "epoch": 8.957010802984742, "grad_norm": 0.13108690083026886, "learning_rate": 1.6389801041230946e-06, "loss": 0.4646, "num_input_tokens_seen": 97573392, "step": 80425 }, { "epoch": 8.95756765786836, "grad_norm": 0.10927987843751907, "learning_rate": 1.6372499962151483e-06, "loss": 0.4514, "num_input_tokens_seen": 97579664, "step": 80430 }, { "epoch": 8.958124512751978, "grad_norm": 0.10706662386655807, "learning_rate": 1.6355207710267634e-06, "loss": 0.4536, "num_input_tokens_seen": 97585776, "step": 80435 }, { "epoch": 8.958681367635593, "grad_norm": 0.08633512258529663, "learning_rate": 1.6337924286232792e-06, "loss": 0.4791, "num_input_tokens_seen": 97591824, "step": 80440 }, { "epoch": 8.959238222519211, "grad_norm": 0.11585269868373871, "learning_rate": 1.6320649690700019e-06, "loss": 0.4606, "num_input_tokens_seen": 97597712, "step": 80445 }, { "epoch": 8.959795077402829, "grad_norm": 0.10908720642328262, "learning_rate": 1.6303383924321903e-06, "loss": 0.4595, "num_input_tokens_seen": 97603632, "step": 80450 }, { "epoch": 8.960351932286446, "grad_norm": 0.09195375442504883, "learning_rate": 1.6286126987750922e-06, "loss": 0.4599, "num_input_tokens_seen": 97610096, "step": 80455 }, { "epoch": 8.960908787170064, "grad_norm": 0.117519311606884, "learning_rate": 1.6268878881639e-06, "loss": 0.4569, "num_input_tokens_seen": 97616016, "step": 80460 }, { "epoch": 8.96146564205368, "grad_norm": 0.09847989678382874, "learning_rate": 1.6251639606637893e-06, "loss": 0.4535, "num_input_tokens_seen": 97622160, "step": 80465 }, { "epoch": 8.962022496937298, "grad_norm": 0.08878618478775024, "learning_rate": 1.6234409163398885e-06, "loss": 0.4704, "num_input_tokens_seen": 97627888, "step": 80470 }, { "epoch": 8.962579351820915, "grad_norm": 0.12945806980133057, "learning_rate": 1.6217187552573039e-06, "loss": 0.4538, "num_input_tokens_seen": 97633232, "step": 80475 }, { "epoch": 8.963136206704533, "grad_norm": 0.093738853931427, "learning_rate": 1.6199974774811056e-06, "loss": 0.4657, "num_input_tokens_seen": 97639664, "step": 80480 }, { "epoch": 8.96369306158815, "grad_norm": 0.16046805679798126, "learning_rate": 1.6182770830763278e-06, "loss": 0.4603, "num_input_tokens_seen": 97646128, "step": 80485 }, { "epoch": 8.964249916471768, "grad_norm": 0.11601773649454117, "learning_rate": 1.616557572107974e-06, "loss": 0.4629, "num_input_tokens_seen": 97652400, "step": 80490 }, { "epoch": 8.964806771355384, "grad_norm": 0.10813361406326294, "learning_rate": 1.6148389446410062e-06, "loss": 0.4675, "num_input_tokens_seen": 97658288, "step": 80495 }, { "epoch": 8.965363626239002, "grad_norm": 0.090740866959095, "learning_rate": 1.6131212007403696e-06, "loss": 0.4704, "num_input_tokens_seen": 97664400, "step": 80500 }, { "epoch": 8.96592048112262, "grad_norm": 0.09835488349199295, "learning_rate": 1.6114043404709567e-06, "loss": 0.4543, "num_input_tokens_seen": 97670384, "step": 80505 }, { "epoch": 8.966477336006237, "grad_norm": 0.09872255474328995, "learning_rate": 1.6096883638976407e-06, "loss": 0.4573, "num_input_tokens_seen": 97676048, "step": 80510 }, { "epoch": 8.967034190889855, "grad_norm": 0.10817694664001465, "learning_rate": 1.6079732710852586e-06, "loss": 0.4665, "num_input_tokens_seen": 97681904, "step": 80515 }, { "epoch": 8.96759104577347, "grad_norm": 0.09829461574554443, "learning_rate": 1.6062590620986085e-06, "loss": 0.4678, "num_input_tokens_seen": 97688240, "step": 80520 }, { "epoch": 8.968147900657089, "grad_norm": 0.0848679468035698, "learning_rate": 1.604545737002461e-06, "loss": 0.4641, "num_input_tokens_seen": 97694064, "step": 80525 }, { "epoch": 8.968704755540706, "grad_norm": 0.09692163020372391, "learning_rate": 1.60283329586155e-06, "loss": 0.4647, "num_input_tokens_seen": 97700272, "step": 80530 }, { "epoch": 8.969261610424324, "grad_norm": 0.1008070558309555, "learning_rate": 1.6011217387405769e-06, "loss": 0.4539, "num_input_tokens_seen": 97706448, "step": 80535 }, { "epoch": 8.969818465307942, "grad_norm": 0.11666824668645859, "learning_rate": 1.5994110657042066e-06, "loss": 0.4585, "num_input_tokens_seen": 97712464, "step": 80540 }, { "epoch": 8.970375320191557, "grad_norm": 0.12232812494039536, "learning_rate": 1.597701276817079e-06, "loss": 0.4587, "num_input_tokens_seen": 97718672, "step": 80545 }, { "epoch": 8.970932175075175, "grad_norm": 0.14464721083641052, "learning_rate": 1.5959923721437952e-06, "loss": 0.4634, "num_input_tokens_seen": 97724784, "step": 80550 }, { "epoch": 8.971489029958793, "grad_norm": 0.07541081309318542, "learning_rate": 1.5942843517489203e-06, "loss": 0.4664, "num_input_tokens_seen": 97731024, "step": 80555 }, { "epoch": 8.97204588484241, "grad_norm": 0.0799647867679596, "learning_rate": 1.5925772156969915e-06, "loss": 0.466, "num_input_tokens_seen": 97737296, "step": 80560 }, { "epoch": 8.972602739726028, "grad_norm": 0.0900709331035614, "learning_rate": 1.5908709640525072e-06, "loss": 0.4683, "num_input_tokens_seen": 97743120, "step": 80565 }, { "epoch": 8.973159594609644, "grad_norm": 0.08894538879394531, "learning_rate": 1.5891655968799324e-06, "loss": 0.4692, "num_input_tokens_seen": 97749072, "step": 80570 }, { "epoch": 8.973716449493262, "grad_norm": 0.07618251442909241, "learning_rate": 1.58746111424371e-06, "loss": 0.4581, "num_input_tokens_seen": 97755184, "step": 80575 }, { "epoch": 8.97427330437688, "grad_norm": 0.12681832909584045, "learning_rate": 1.5857575162082333e-06, "loss": 0.452, "num_input_tokens_seen": 97761200, "step": 80580 }, { "epoch": 8.974830159260497, "grad_norm": 0.10894690454006195, "learning_rate": 1.5840548028378754e-06, "loss": 0.464, "num_input_tokens_seen": 97766544, "step": 80585 }, { "epoch": 8.975387014144115, "grad_norm": 0.12464500218629837, "learning_rate": 1.5823529741969629e-06, "loss": 0.4523, "num_input_tokens_seen": 97773072, "step": 80590 }, { "epoch": 8.97594386902773, "grad_norm": 0.14504539966583252, "learning_rate": 1.5806520303498051e-06, "loss": 0.4722, "num_input_tokens_seen": 97779440, "step": 80595 }, { "epoch": 8.976500723911348, "grad_norm": 0.12363535910844803, "learning_rate": 1.5789519713606592e-06, "loss": 0.4641, "num_input_tokens_seen": 97785552, "step": 80600 }, { "epoch": 8.977057578794966, "grad_norm": 0.12616844475269318, "learning_rate": 1.5772527972937655e-06, "loss": 0.4544, "num_input_tokens_seen": 97791600, "step": 80605 }, { "epoch": 8.977614433678584, "grad_norm": 0.10447356849908829, "learning_rate": 1.5755545082133284e-06, "loss": 0.4611, "num_input_tokens_seen": 97797712, "step": 80610 }, { "epoch": 8.978171288562201, "grad_norm": 0.11735482513904572, "learning_rate": 1.5738571041835043e-06, "loss": 0.4583, "num_input_tokens_seen": 97803920, "step": 80615 }, { "epoch": 8.978728143445817, "grad_norm": 0.08072120696306229, "learning_rate": 1.572160585268434e-06, "loss": 0.4616, "num_input_tokens_seen": 97809808, "step": 80620 }, { "epoch": 8.979284998329435, "grad_norm": 0.17418281733989716, "learning_rate": 1.5704649515322133e-06, "loss": 0.455, "num_input_tokens_seen": 97816240, "step": 80625 }, { "epoch": 8.979841853213053, "grad_norm": 0.0887288898229599, "learning_rate": 1.5687702030389135e-06, "loss": 0.4546, "num_input_tokens_seen": 97822448, "step": 80630 }, { "epoch": 8.98039870809667, "grad_norm": 0.12512657046318054, "learning_rate": 1.567076339852558e-06, "loss": 0.46, "num_input_tokens_seen": 97828496, "step": 80635 }, { "epoch": 8.980955562980288, "grad_norm": 0.12867657840251923, "learning_rate": 1.5653833620371567e-06, "loss": 0.4647, "num_input_tokens_seen": 97834352, "step": 80640 }, { "epoch": 8.981512417863904, "grad_norm": 0.09349916875362396, "learning_rate": 1.5636912696566724e-06, "loss": 0.4653, "num_input_tokens_seen": 97840400, "step": 80645 }, { "epoch": 8.982069272747522, "grad_norm": 0.11903421580791473, "learning_rate": 1.5620000627750348e-06, "loss": 0.4678, "num_input_tokens_seen": 97846640, "step": 80650 }, { "epoch": 8.98262612763114, "grad_norm": 0.10299549996852875, "learning_rate": 1.5603097414561507e-06, "loss": 0.4622, "num_input_tokens_seen": 97852688, "step": 80655 }, { "epoch": 8.983182982514757, "grad_norm": 0.10115934908390045, "learning_rate": 1.558620305763875e-06, "loss": 0.4558, "num_input_tokens_seen": 97859056, "step": 80660 }, { "epoch": 8.983739837398375, "grad_norm": 0.1396542489528656, "learning_rate": 1.5569317557620478e-06, "loss": 0.4596, "num_input_tokens_seen": 97865520, "step": 80665 }, { "epoch": 8.98429669228199, "grad_norm": 0.12705348432064056, "learning_rate": 1.5552440915144684e-06, "loss": 0.4613, "num_input_tokens_seen": 97871248, "step": 80670 }, { "epoch": 8.984853547165608, "grad_norm": 0.11073284596204758, "learning_rate": 1.5535573130848967e-06, "loss": 0.4642, "num_input_tokens_seen": 97877392, "step": 80675 }, { "epoch": 8.985410402049226, "grad_norm": 0.08235901594161987, "learning_rate": 1.551871420537071e-06, "loss": 0.4673, "num_input_tokens_seen": 97882992, "step": 80680 }, { "epoch": 8.985967256932843, "grad_norm": 0.12974369525909424, "learning_rate": 1.5501864139346817e-06, "loss": 0.4547, "num_input_tokens_seen": 97888944, "step": 80685 }, { "epoch": 8.986524111816461, "grad_norm": 0.1079757958650589, "learning_rate": 1.5485022933414028e-06, "loss": 0.4601, "num_input_tokens_seen": 97894992, "step": 80690 }, { "epoch": 8.987080966700077, "grad_norm": 0.1328430324792862, "learning_rate": 1.5468190588208559e-06, "loss": 0.4604, "num_input_tokens_seen": 97901136, "step": 80695 }, { "epoch": 8.987637821583695, "grad_norm": 0.08622109144926071, "learning_rate": 1.5451367104366455e-06, "loss": 0.4495, "num_input_tokens_seen": 97907504, "step": 80700 }, { "epoch": 8.988194676467312, "grad_norm": 0.09438914060592651, "learning_rate": 1.54345524825234e-06, "loss": 0.4721, "num_input_tokens_seen": 97913744, "step": 80705 }, { "epoch": 8.98875153135093, "grad_norm": 0.09945444762706757, "learning_rate": 1.5417746723314641e-06, "loss": 0.4675, "num_input_tokens_seen": 97920016, "step": 80710 }, { "epoch": 8.989308386234548, "grad_norm": 0.09044823050498962, "learning_rate": 1.5400949827375138e-06, "loss": 0.4517, "num_input_tokens_seen": 97925936, "step": 80715 }, { "epoch": 8.989865241118165, "grad_norm": 0.0982455462217331, "learning_rate": 1.538416179533958e-06, "loss": 0.4591, "num_input_tokens_seen": 97931728, "step": 80720 }, { "epoch": 8.990422096001781, "grad_norm": 0.07635918259620667, "learning_rate": 1.5367382627842237e-06, "loss": 0.4712, "num_input_tokens_seen": 97938000, "step": 80725 }, { "epoch": 8.990978950885399, "grad_norm": 0.1436464786529541, "learning_rate": 1.53506123255171e-06, "loss": 0.4743, "num_input_tokens_seen": 97943504, "step": 80730 }, { "epoch": 8.991535805769017, "grad_norm": 0.11033475399017334, "learning_rate": 1.5333850888997804e-06, "loss": 0.451, "num_input_tokens_seen": 97949744, "step": 80735 }, { "epoch": 8.992092660652634, "grad_norm": 0.09793468564748764, "learning_rate": 1.5317098318917644e-06, "loss": 0.4672, "num_input_tokens_seen": 97956016, "step": 80740 }, { "epoch": 8.992649515536252, "grad_norm": 0.11234480142593384, "learning_rate": 1.530035461590959e-06, "loss": 0.4543, "num_input_tokens_seen": 97962128, "step": 80745 }, { "epoch": 8.993206370419868, "grad_norm": 0.10772974044084549, "learning_rate": 1.5283619780606245e-06, "loss": 0.452, "num_input_tokens_seen": 97967952, "step": 80750 }, { "epoch": 8.993763225303486, "grad_norm": 0.11069772392511368, "learning_rate": 1.5266893813639966e-06, "loss": 0.4575, "num_input_tokens_seen": 97974000, "step": 80755 }, { "epoch": 8.994320080187103, "grad_norm": 0.10074315220117569, "learning_rate": 1.525017671564266e-06, "loss": 0.4557, "num_input_tokens_seen": 97979792, "step": 80760 }, { "epoch": 8.994876935070721, "grad_norm": 0.0920887291431427, "learning_rate": 1.5233468487245994e-06, "loss": 0.455, "num_input_tokens_seen": 97985968, "step": 80765 }, { "epoch": 8.995433789954339, "grad_norm": 0.0998227447271347, "learning_rate": 1.5216769129081183e-06, "loss": 0.4583, "num_input_tokens_seen": 97991984, "step": 80770 }, { "epoch": 8.995990644837955, "grad_norm": 0.11400220543146133, "learning_rate": 1.5200078641779303e-06, "loss": 0.4632, "num_input_tokens_seen": 97998096, "step": 80775 }, { "epoch": 8.996547499721572, "grad_norm": 0.14516660571098328, "learning_rate": 1.5183397025970853e-06, "loss": 0.4676, "num_input_tokens_seen": 98004112, "step": 80780 }, { "epoch": 8.99710435460519, "grad_norm": 0.09629381448030472, "learning_rate": 1.516672428228616e-06, "loss": 0.4658, "num_input_tokens_seen": 98010160, "step": 80785 }, { "epoch": 8.997661209488808, "grad_norm": 0.08942724764347076, "learning_rate": 1.5150060411355222e-06, "loss": 0.4701, "num_input_tokens_seen": 98015856, "step": 80790 }, { "epoch": 8.998218064372425, "grad_norm": 0.10168693214654922, "learning_rate": 1.5133405413807589e-06, "loss": 0.4438, "num_input_tokens_seen": 98022096, "step": 80795 }, { "epoch": 8.998774919256041, "grad_norm": 0.11133445054292679, "learning_rate": 1.5116759290272619e-06, "loss": 0.466, "num_input_tokens_seen": 98028176, "step": 80800 }, { "epoch": 8.999331774139659, "grad_norm": 0.08699413388967514, "learning_rate": 1.5100122041379145e-06, "loss": 0.4595, "num_input_tokens_seen": 98033872, "step": 80805 }, { "epoch": 8.999888629023276, "grad_norm": 0.08909951895475388, "learning_rate": 1.5083493667755855e-06, "loss": 0.4566, "num_input_tokens_seen": 98040112, "step": 80810 }, { "epoch": 9.000445483906894, "grad_norm": 0.08069190382957458, "learning_rate": 1.5066874170030998e-06, "loss": 0.4528, "num_input_tokens_seen": 98045488, "step": 80815 }, { "epoch": 9.001002338790512, "grad_norm": 0.10486897826194763, "learning_rate": 1.5050263548832488e-06, "loss": 0.4564, "num_input_tokens_seen": 98051504, "step": 80820 }, { "epoch": 9.001002338790512, "eval_loss": 0.4640403687953949, "eval_runtime": 113.0124, "eval_samples_per_second": 35.315, "eval_steps_per_second": 8.831, "num_input_tokens_seen": 98051504, "step": 80820 }, { "epoch": 9.001559193674128, "grad_norm": 0.1314145028591156, "learning_rate": 1.503366180478802e-06, "loss": 0.4789, "num_input_tokens_seen": 98057264, "step": 80825 }, { "epoch": 9.002116048557745, "grad_norm": 0.09552595019340515, "learning_rate": 1.5017068938524725e-06, "loss": 0.4576, "num_input_tokens_seen": 98063056, "step": 80830 }, { "epoch": 9.002672903441363, "grad_norm": 0.12542222440242767, "learning_rate": 1.5000484950669663e-06, "loss": 0.4607, "num_input_tokens_seen": 98068432, "step": 80835 }, { "epoch": 9.00322975832498, "grad_norm": 0.09355409443378448, "learning_rate": 1.4983909841849332e-06, "loss": 0.4705, "num_input_tokens_seen": 98074640, "step": 80840 }, { "epoch": 9.003786613208598, "grad_norm": 0.09101102501153946, "learning_rate": 1.4967343612690033e-06, "loss": 0.4673, "num_input_tokens_seen": 98080560, "step": 80845 }, { "epoch": 9.004343468092214, "grad_norm": 0.14040178060531616, "learning_rate": 1.495078626381774e-06, "loss": 0.4593, "num_input_tokens_seen": 98086096, "step": 80850 }, { "epoch": 9.004900322975832, "grad_norm": 0.1196482703089714, "learning_rate": 1.4934237795857952e-06, "loss": 0.4745, "num_input_tokens_seen": 98092272, "step": 80855 }, { "epoch": 9.00545717785945, "grad_norm": 0.11285113543272018, "learning_rate": 1.4917698209435977e-06, "loss": 0.4574, "num_input_tokens_seen": 98098480, "step": 80860 }, { "epoch": 9.006014032743067, "grad_norm": 0.13533806800842285, "learning_rate": 1.4901167505176728e-06, "loss": 0.4531, "num_input_tokens_seen": 98104560, "step": 80865 }, { "epoch": 9.006570887626685, "grad_norm": 0.11226309835910797, "learning_rate": 1.488464568370479e-06, "loss": 0.4681, "num_input_tokens_seen": 98110768, "step": 80870 }, { "epoch": 9.007127742510303, "grad_norm": 0.10440865904092789, "learning_rate": 1.4868132745644358e-06, "loss": 0.4558, "num_input_tokens_seen": 98116880, "step": 80875 }, { "epoch": 9.007684597393919, "grad_norm": 0.08792094141244888, "learning_rate": 1.4851628691619407e-06, "loss": 0.4669, "num_input_tokens_seen": 98122992, "step": 80880 }, { "epoch": 9.008241452277536, "grad_norm": 0.12668843567371368, "learning_rate": 1.4835133522253519e-06, "loss": 0.4565, "num_input_tokens_seen": 98129168, "step": 80885 }, { "epoch": 9.008798307161154, "grad_norm": 0.1022009328007698, "learning_rate": 1.481864723816989e-06, "loss": 0.4643, "num_input_tokens_seen": 98135472, "step": 80890 }, { "epoch": 9.009355162044772, "grad_norm": 0.1715681105852127, "learning_rate": 1.480216983999147e-06, "loss": 0.4756, "num_input_tokens_seen": 98141904, "step": 80895 }, { "epoch": 9.00991201692839, "grad_norm": 0.1143193393945694, "learning_rate": 1.4785701328340757e-06, "loss": 0.4554, "num_input_tokens_seen": 98147536, "step": 80900 }, { "epoch": 9.010468871812005, "grad_norm": 0.09657935798168182, "learning_rate": 1.4769241703840058e-06, "loss": 0.4667, "num_input_tokens_seen": 98153680, "step": 80905 }, { "epoch": 9.011025726695623, "grad_norm": 0.1006210446357727, "learning_rate": 1.475279096711124e-06, "loss": 0.4686, "num_input_tokens_seen": 98159824, "step": 80910 }, { "epoch": 9.01158258157924, "grad_norm": 0.10937316715717316, "learning_rate": 1.4736349118775834e-06, "loss": 0.4615, "num_input_tokens_seen": 98165936, "step": 80915 }, { "epoch": 9.012139436462858, "grad_norm": 0.10290569812059402, "learning_rate": 1.4719916159455143e-06, "loss": 0.4613, "num_input_tokens_seen": 98171984, "step": 80920 }, { "epoch": 9.012696291346476, "grad_norm": 0.1111767590045929, "learning_rate": 1.4703492089769982e-06, "loss": 0.4688, "num_input_tokens_seen": 98178384, "step": 80925 }, { "epoch": 9.013253146230092, "grad_norm": 0.12848415970802307, "learning_rate": 1.4687076910340963e-06, "loss": 0.4453, "num_input_tokens_seen": 98184656, "step": 80930 }, { "epoch": 9.01381000111371, "grad_norm": 0.1324591487646103, "learning_rate": 1.467067062178823e-06, "loss": 0.4622, "num_input_tokens_seen": 98190672, "step": 80935 }, { "epoch": 9.014366855997327, "grad_norm": 0.09805604815483093, "learning_rate": 1.465427322473173e-06, "loss": 0.4703, "num_input_tokens_seen": 98196688, "step": 80940 }, { "epoch": 9.014923710880945, "grad_norm": 0.12085143476724625, "learning_rate": 1.4637884719791024e-06, "loss": 0.4612, "num_input_tokens_seen": 98202256, "step": 80945 }, { "epoch": 9.015480565764562, "grad_norm": 0.1316642463207245, "learning_rate": 1.4621505107585254e-06, "loss": 0.456, "num_input_tokens_seen": 98208080, "step": 80950 }, { "epoch": 9.016037420648178, "grad_norm": 0.11920596659183502, "learning_rate": 1.4605134388733345e-06, "loss": 0.4718, "num_input_tokens_seen": 98214192, "step": 80955 }, { "epoch": 9.016594275531796, "grad_norm": 0.13483481109142303, "learning_rate": 1.4588772563853797e-06, "loss": 0.4584, "num_input_tokens_seen": 98220528, "step": 80960 }, { "epoch": 9.017151130415414, "grad_norm": 0.10494925826787949, "learning_rate": 1.457241963356487e-06, "loss": 0.4645, "num_input_tokens_seen": 98226448, "step": 80965 }, { "epoch": 9.017707985299031, "grad_norm": 0.08117523044347763, "learning_rate": 1.4556075598484349e-06, "loss": 0.4545, "num_input_tokens_seen": 98232656, "step": 80970 }, { "epoch": 9.018264840182649, "grad_norm": 0.1143639013171196, "learning_rate": 1.4539740459229817e-06, "loss": 0.4524, "num_input_tokens_seen": 98238640, "step": 80975 }, { "epoch": 9.018821695066265, "grad_norm": 0.09285513311624527, "learning_rate": 1.4523414216418507e-06, "loss": 0.4614, "num_input_tokens_seen": 98245136, "step": 80980 }, { "epoch": 9.019378549949883, "grad_norm": 0.11831226199865341, "learning_rate": 1.4507096870667174e-06, "loss": 0.4735, "num_input_tokens_seen": 98251568, "step": 80985 }, { "epoch": 9.0199354048335, "grad_norm": 0.12217001616954803, "learning_rate": 1.449078842259244e-06, "loss": 0.4667, "num_input_tokens_seen": 98257680, "step": 80990 }, { "epoch": 9.020492259717118, "grad_norm": 0.09549432247877121, "learning_rate": 1.4474488872810416e-06, "loss": 0.4553, "num_input_tokens_seen": 98263760, "step": 80995 }, { "epoch": 9.021049114600736, "grad_norm": 0.12735025584697723, "learning_rate": 1.4458198221937002e-06, "loss": 0.4531, "num_input_tokens_seen": 98269712, "step": 81000 }, { "epoch": 9.021605969484352, "grad_norm": 0.07908844947814941, "learning_rate": 1.444191647058768e-06, "loss": 0.4607, "num_input_tokens_seen": 98275920, "step": 81005 }, { "epoch": 9.02216282436797, "grad_norm": 0.07793068885803223, "learning_rate": 1.4425643619377592e-06, "loss": 0.4624, "num_input_tokens_seen": 98281456, "step": 81010 }, { "epoch": 9.022719679251587, "grad_norm": 0.09943784773349762, "learning_rate": 1.4409379668921696e-06, "loss": 0.4642, "num_input_tokens_seen": 98287504, "step": 81015 }, { "epoch": 9.023276534135205, "grad_norm": 0.09174850583076477, "learning_rate": 1.4393124619834386e-06, "loss": 0.4694, "num_input_tokens_seen": 98293744, "step": 81020 }, { "epoch": 9.023833389018822, "grad_norm": 0.10815813392400742, "learning_rate": 1.4376878472729893e-06, "loss": 0.4792, "num_input_tokens_seen": 98299632, "step": 81025 }, { "epoch": 9.024390243902438, "grad_norm": 0.09411376714706421, "learning_rate": 1.4360641228221977e-06, "loss": 0.4548, "num_input_tokens_seen": 98305776, "step": 81030 }, { "epoch": 9.024947098786056, "grad_norm": 0.10753334313631058, "learning_rate": 1.434441288692423e-06, "loss": 0.4593, "num_input_tokens_seen": 98311856, "step": 81035 }, { "epoch": 9.025503953669674, "grad_norm": 0.12649069726467133, "learning_rate": 1.4328193449449717e-06, "loss": 0.4651, "num_input_tokens_seen": 98318192, "step": 81040 }, { "epoch": 9.026060808553291, "grad_norm": 0.08582008630037308, "learning_rate": 1.4311982916411309e-06, "loss": 0.4699, "num_input_tokens_seen": 98324368, "step": 81045 }, { "epoch": 9.026617663436909, "grad_norm": 0.11613858491182327, "learning_rate": 1.4295781288421516e-06, "loss": 0.4667, "num_input_tokens_seen": 98330576, "step": 81050 }, { "epoch": 9.027174518320527, "grad_norm": 0.10485448688268661, "learning_rate": 1.4279588566092432e-06, "loss": 0.4601, "num_input_tokens_seen": 98336624, "step": 81055 }, { "epoch": 9.027731373204142, "grad_norm": 0.1157645508646965, "learning_rate": 1.426340475003593e-06, "loss": 0.4615, "num_input_tokens_seen": 98341904, "step": 81060 }, { "epoch": 9.02828822808776, "grad_norm": 0.1259610801935196, "learning_rate": 1.4247229840863406e-06, "loss": 0.4631, "num_input_tokens_seen": 98348176, "step": 81065 }, { "epoch": 9.028845082971378, "grad_norm": 0.09980461001396179, "learning_rate": 1.4231063839186042e-06, "loss": 0.4591, "num_input_tokens_seen": 98353840, "step": 81070 }, { "epoch": 9.029401937854995, "grad_norm": 0.10777652263641357, "learning_rate": 1.4214906745614682e-06, "loss": 0.4633, "num_input_tokens_seen": 98359888, "step": 81075 }, { "epoch": 9.029958792738613, "grad_norm": 0.10480957478284836, "learning_rate": 1.4198758560759723e-06, "loss": 0.4541, "num_input_tokens_seen": 98365904, "step": 81080 }, { "epoch": 9.030515647622229, "grad_norm": 0.09086784720420837, "learning_rate": 1.4182619285231375e-06, "loss": 0.4617, "num_input_tokens_seen": 98371984, "step": 81085 }, { "epoch": 9.031072502505847, "grad_norm": 0.10995430499315262, "learning_rate": 1.4166488919639315e-06, "loss": 0.4579, "num_input_tokens_seen": 98378032, "step": 81090 }, { "epoch": 9.031629357389464, "grad_norm": 0.09738726168870926, "learning_rate": 1.4150367464593139e-06, "loss": 0.4546, "num_input_tokens_seen": 98384432, "step": 81095 }, { "epoch": 9.032186212273082, "grad_norm": 0.11025210469961166, "learning_rate": 1.4134254920701862e-06, "loss": 0.4517, "num_input_tokens_seen": 98390608, "step": 81100 }, { "epoch": 9.0327430671567, "grad_norm": 0.13694718480110168, "learning_rate": 1.4118151288574271e-06, "loss": 0.4664, "num_input_tokens_seen": 98397200, "step": 81105 }, { "epoch": 9.033299922040316, "grad_norm": 0.11034621298313141, "learning_rate": 1.4102056568818912e-06, "loss": 0.4549, "num_input_tokens_seen": 98403216, "step": 81110 }, { "epoch": 9.033856776923933, "grad_norm": 0.09578949213027954, "learning_rate": 1.4085970762043765e-06, "loss": 0.4616, "num_input_tokens_seen": 98409392, "step": 81115 }, { "epoch": 9.034413631807551, "grad_norm": 0.15457142889499664, "learning_rate": 1.4069893868856738e-06, "loss": 0.4599, "num_input_tokens_seen": 98415504, "step": 81120 }, { "epoch": 9.034970486691169, "grad_norm": 0.11024435609579086, "learning_rate": 1.4053825889865173e-06, "loss": 0.4532, "num_input_tokens_seen": 98421648, "step": 81125 }, { "epoch": 9.035527341574786, "grad_norm": 0.09384855628013611, "learning_rate": 1.4037766825676147e-06, "loss": 0.4507, "num_input_tokens_seen": 98427792, "step": 81130 }, { "epoch": 9.036084196458402, "grad_norm": 0.11141667515039444, "learning_rate": 1.4021716676896529e-06, "loss": 0.4776, "num_input_tokens_seen": 98433616, "step": 81135 }, { "epoch": 9.03664105134202, "grad_norm": 0.11175600439310074, "learning_rate": 1.4005675444132644e-06, "loss": 0.4578, "num_input_tokens_seen": 98439760, "step": 81140 }, { "epoch": 9.037197906225638, "grad_norm": 0.09341765940189362, "learning_rate": 1.3989643127990642e-06, "loss": 0.4694, "num_input_tokens_seen": 98445872, "step": 81145 }, { "epoch": 9.037754761109255, "grad_norm": 0.10849853605031967, "learning_rate": 1.3973619729076209e-06, "loss": 0.4535, "num_input_tokens_seen": 98452176, "step": 81150 }, { "epoch": 9.038311615992873, "grad_norm": 0.09391060471534729, "learning_rate": 1.3957605247994833e-06, "loss": 0.4485, "num_input_tokens_seen": 98457488, "step": 81155 }, { "epoch": 9.038868470876489, "grad_norm": 0.17251262068748474, "learning_rate": 1.394159968535161e-06, "loss": 0.4618, "num_input_tokens_seen": 98463856, "step": 81160 }, { "epoch": 9.039425325760106, "grad_norm": 0.09851285815238953, "learning_rate": 1.3925603041751167e-06, "loss": 0.4534, "num_input_tokens_seen": 98469744, "step": 81165 }, { "epoch": 9.039982180643724, "grad_norm": 0.14486804604530334, "learning_rate": 1.3909615317798025e-06, "loss": 0.4642, "num_input_tokens_seen": 98475664, "step": 81170 }, { "epoch": 9.040539035527342, "grad_norm": 0.12179895490407944, "learning_rate": 1.3893636514096198e-06, "loss": 0.4691, "num_input_tokens_seen": 98481712, "step": 81175 }, { "epoch": 9.04109589041096, "grad_norm": 0.10084798187017441, "learning_rate": 1.3877666631249426e-06, "loss": 0.4541, "num_input_tokens_seen": 98488048, "step": 81180 }, { "epoch": 9.041652745294575, "grad_norm": 0.09601391106843948, "learning_rate": 1.3861705669861087e-06, "loss": 0.4649, "num_input_tokens_seen": 98494032, "step": 81185 }, { "epoch": 9.042209600178193, "grad_norm": 0.14941038191318512, "learning_rate": 1.3845753630534225e-06, "loss": 0.463, "num_input_tokens_seen": 98500112, "step": 81190 }, { "epoch": 9.04276645506181, "grad_norm": 0.0935399979352951, "learning_rate": 1.3829810513871638e-06, "loss": 0.4558, "num_input_tokens_seen": 98506128, "step": 81195 }, { "epoch": 9.043323309945428, "grad_norm": 0.09704235941171646, "learning_rate": 1.3813876320475622e-06, "loss": 0.4574, "num_input_tokens_seen": 98512432, "step": 81200 }, { "epoch": 9.043880164829046, "grad_norm": 0.15022233128547668, "learning_rate": 1.3797951050948277e-06, "loss": 0.451, "num_input_tokens_seen": 98518384, "step": 81205 }, { "epoch": 9.044437019712662, "grad_norm": 0.09487689286470413, "learning_rate": 1.3782034705891261e-06, "loss": 0.4657, "num_input_tokens_seen": 98524624, "step": 81210 }, { "epoch": 9.04499387459628, "grad_norm": 0.11379940062761307, "learning_rate": 1.3766127285905955e-06, "loss": 0.4516, "num_input_tokens_seen": 98530736, "step": 81215 }, { "epoch": 9.045550729479897, "grad_norm": 0.11983868479728699, "learning_rate": 1.3750228791593462e-06, "loss": 0.4544, "num_input_tokens_seen": 98536912, "step": 81220 }, { "epoch": 9.046107584363515, "grad_norm": 0.11708473414182663, "learning_rate": 1.3734339223554382e-06, "loss": 0.4768, "num_input_tokens_seen": 98543120, "step": 81225 }, { "epoch": 9.046664439247133, "grad_norm": 0.10840798914432526, "learning_rate": 1.3718458582389154e-06, "loss": 0.4562, "num_input_tokens_seen": 98549424, "step": 81230 }, { "epoch": 9.04722129413075, "grad_norm": 0.08301278948783875, "learning_rate": 1.3702586868697714e-06, "loss": 0.4483, "num_input_tokens_seen": 98555632, "step": 81235 }, { "epoch": 9.047778149014366, "grad_norm": 0.10471724718809128, "learning_rate": 1.368672408307986e-06, "loss": 0.4637, "num_input_tokens_seen": 98561072, "step": 81240 }, { "epoch": 9.048335003897984, "grad_norm": 0.11822595447301865, "learning_rate": 1.3670870226134807e-06, "loss": 0.4642, "num_input_tokens_seen": 98567024, "step": 81245 }, { "epoch": 9.048891858781602, "grad_norm": 0.10056764632463455, "learning_rate": 1.365502529846166e-06, "loss": 0.4685, "num_input_tokens_seen": 98572752, "step": 81250 }, { "epoch": 9.04944871366522, "grad_norm": 0.1115383505821228, "learning_rate": 1.3639189300659077e-06, "loss": 0.4751, "num_input_tokens_seen": 98578960, "step": 81255 }, { "epoch": 9.050005568548837, "grad_norm": 0.10418378561735153, "learning_rate": 1.3623362233325331e-06, "loss": 0.46, "num_input_tokens_seen": 98584976, "step": 81260 }, { "epoch": 9.050562423432453, "grad_norm": 0.11667676270008087, "learning_rate": 1.3607544097058527e-06, "loss": 0.458, "num_input_tokens_seen": 98591184, "step": 81265 }, { "epoch": 9.05111927831607, "grad_norm": 0.1011957898736, "learning_rate": 1.3591734892456215e-06, "loss": 0.4623, "num_input_tokens_seen": 98597104, "step": 81270 }, { "epoch": 9.051676133199688, "grad_norm": 0.10175985842943192, "learning_rate": 1.3575934620115804e-06, "loss": 0.4468, "num_input_tokens_seen": 98602928, "step": 81275 }, { "epoch": 9.052232988083306, "grad_norm": 0.12908019125461578, "learning_rate": 1.3560143280634209e-06, "loss": 0.4626, "num_input_tokens_seen": 98609200, "step": 81280 }, { "epoch": 9.052789842966924, "grad_norm": 0.11004234105348587, "learning_rate": 1.3544360874608114e-06, "loss": 0.4586, "num_input_tokens_seen": 98615536, "step": 81285 }, { "epoch": 9.05334669785054, "grad_norm": 0.16526634991168976, "learning_rate": 1.3528587402633851e-06, "loss": 0.4389, "num_input_tokens_seen": 98622064, "step": 81290 }, { "epoch": 9.053903552734157, "grad_norm": 0.1460312157869339, "learning_rate": 1.3512822865307335e-06, "loss": 0.4572, "num_input_tokens_seen": 98628016, "step": 81295 }, { "epoch": 9.054460407617775, "grad_norm": 0.14214761555194855, "learning_rate": 1.349706726322425e-06, "loss": 0.4457, "num_input_tokens_seen": 98634576, "step": 81300 }, { "epoch": 9.055017262501392, "grad_norm": 0.12707647681236267, "learning_rate": 1.3481320596979873e-06, "loss": 0.459, "num_input_tokens_seen": 98640816, "step": 81305 }, { "epoch": 9.05557411738501, "grad_norm": 0.09956211596727371, "learning_rate": 1.3465582867169175e-06, "loss": 0.46, "num_input_tokens_seen": 98646768, "step": 81310 }, { "epoch": 9.056130972268626, "grad_norm": 0.11324603855609894, "learning_rate": 1.344985407438673e-06, "loss": 0.4535, "num_input_tokens_seen": 98652848, "step": 81315 }, { "epoch": 9.056687827152244, "grad_norm": 0.09413345903158188, "learning_rate": 1.3434134219226874e-06, "loss": 0.4598, "num_input_tokens_seen": 98658352, "step": 81320 }, { "epoch": 9.057244682035861, "grad_norm": 0.1091512069106102, "learning_rate": 1.3418423302283572e-06, "loss": 0.4589, "num_input_tokens_seen": 98664496, "step": 81325 }, { "epoch": 9.057801536919479, "grad_norm": 0.09806036949157715, "learning_rate": 1.3402721324150352e-06, "loss": 0.467, "num_input_tokens_seen": 98670576, "step": 81330 }, { "epoch": 9.058358391803097, "grad_norm": 0.08821478486061096, "learning_rate": 1.3387028285420599e-06, "loss": 0.4684, "num_input_tokens_seen": 98676560, "step": 81335 }, { "epoch": 9.058915246686713, "grad_norm": 0.09919309616088867, "learning_rate": 1.3371344186687118e-06, "loss": 0.4594, "num_input_tokens_seen": 98682608, "step": 81340 }, { "epoch": 9.05947210157033, "grad_norm": 0.0959322452545166, "learning_rate": 1.3355669028542573e-06, "loss": 0.4592, "num_input_tokens_seen": 98688176, "step": 81345 }, { "epoch": 9.060028956453948, "grad_norm": 0.11296232044696808, "learning_rate": 1.3340002811579267e-06, "loss": 0.4505, "num_input_tokens_seen": 98694384, "step": 81350 }, { "epoch": 9.060585811337566, "grad_norm": 0.12856894731521606, "learning_rate": 1.3324345536389005e-06, "loss": 0.4748, "num_input_tokens_seen": 98700624, "step": 81355 }, { "epoch": 9.061142666221183, "grad_norm": 0.06883130967617035, "learning_rate": 1.3308697203563509e-06, "loss": 0.4558, "num_input_tokens_seen": 98706096, "step": 81360 }, { "epoch": 9.0616995211048, "grad_norm": 0.10203638672828674, "learning_rate": 1.3293057813693888e-06, "loss": 0.4658, "num_input_tokens_seen": 98712240, "step": 81365 }, { "epoch": 9.062256375988417, "grad_norm": 0.13148283958435059, "learning_rate": 1.3277427367371141e-06, "loss": 0.4533, "num_input_tokens_seen": 98718576, "step": 81370 }, { "epoch": 9.062813230872035, "grad_norm": 0.1232125386595726, "learning_rate": 1.3261805865185745e-06, "loss": 0.471, "num_input_tokens_seen": 98724816, "step": 81375 }, { "epoch": 9.063370085755652, "grad_norm": 0.11625101417303085, "learning_rate": 1.3246193307728028e-06, "loss": 0.4508, "num_input_tokens_seen": 98730992, "step": 81380 }, { "epoch": 9.06392694063927, "grad_norm": 0.10897745192050934, "learning_rate": 1.3230589695587853e-06, "loss": 0.4578, "num_input_tokens_seen": 98737232, "step": 81385 }, { "epoch": 9.064483795522886, "grad_norm": 0.12463143467903137, "learning_rate": 1.321499502935475e-06, "loss": 0.4503, "num_input_tokens_seen": 98743056, "step": 81390 }, { "epoch": 9.065040650406504, "grad_norm": 0.13292540609836578, "learning_rate": 1.3199409309617967e-06, "loss": 0.4576, "num_input_tokens_seen": 98749296, "step": 81395 }, { "epoch": 9.065597505290121, "grad_norm": 0.12044966220855713, "learning_rate": 1.3183832536966339e-06, "loss": 0.4579, "num_input_tokens_seen": 98755344, "step": 81400 }, { "epoch": 9.066154360173739, "grad_norm": 0.15972484648227692, "learning_rate": 1.3168264711988455e-06, "loss": 0.4634, "num_input_tokens_seen": 98761680, "step": 81405 }, { "epoch": 9.066711215057357, "grad_norm": 0.10509783774614334, "learning_rate": 1.3152705835272478e-06, "loss": 0.4691, "num_input_tokens_seen": 98767856, "step": 81410 }, { "epoch": 9.067268069940974, "grad_norm": 0.11997900158166885, "learning_rate": 1.3137155907406274e-06, "loss": 0.4635, "num_input_tokens_seen": 98774128, "step": 81415 }, { "epoch": 9.06782492482459, "grad_norm": 0.08388704061508179, "learning_rate": 1.3121614928977427e-06, "loss": 0.4686, "num_input_tokens_seen": 98779920, "step": 81420 }, { "epoch": 9.068381779708208, "grad_norm": 0.07009097188711166, "learning_rate": 1.3106082900573052e-06, "loss": 0.4574, "num_input_tokens_seen": 98785968, "step": 81425 }, { "epoch": 9.068938634591825, "grad_norm": 0.12990866601467133, "learning_rate": 1.3090559822780041e-06, "loss": 0.4628, "num_input_tokens_seen": 98792240, "step": 81430 }, { "epoch": 9.069495489475443, "grad_norm": 0.11168753355741501, "learning_rate": 1.3075045696184869e-06, "loss": 0.4741, "num_input_tokens_seen": 98797968, "step": 81435 }, { "epoch": 9.07005234435906, "grad_norm": 0.10707053542137146, "learning_rate": 1.3059540521373759e-06, "loss": 0.4598, "num_input_tokens_seen": 98804112, "step": 81440 }, { "epoch": 9.070609199242677, "grad_norm": 0.09177900850772858, "learning_rate": 1.3044044298932524e-06, "loss": 0.4493, "num_input_tokens_seen": 98810192, "step": 81445 }, { "epoch": 9.071166054126294, "grad_norm": 0.09554404765367508, "learning_rate": 1.3028557029446635e-06, "loss": 0.4539, "num_input_tokens_seen": 98815792, "step": 81450 }, { "epoch": 9.071722909009912, "grad_norm": 0.10119741410017014, "learning_rate": 1.3013078713501325e-06, "loss": 0.4642, "num_input_tokens_seen": 98822096, "step": 81455 }, { "epoch": 9.07227976389353, "grad_norm": 0.12091676145792007, "learning_rate": 1.299760935168129e-06, "loss": 0.4748, "num_input_tokens_seen": 98827984, "step": 81460 }, { "epoch": 9.072836618777147, "grad_norm": 0.09310982376337051, "learning_rate": 1.2982148944571144e-06, "loss": 0.4696, "num_input_tokens_seen": 98834224, "step": 81465 }, { "epoch": 9.073393473660763, "grad_norm": 0.09949833899736404, "learning_rate": 1.2966697492754948e-06, "loss": 0.4558, "num_input_tokens_seen": 98840368, "step": 81470 }, { "epoch": 9.073950328544381, "grad_norm": 0.10478764772415161, "learning_rate": 1.2951254996816514e-06, "loss": 0.4546, "num_input_tokens_seen": 98846736, "step": 81475 }, { "epoch": 9.074507183427999, "grad_norm": 0.10450445860624313, "learning_rate": 1.2935821457339374e-06, "loss": 0.4657, "num_input_tokens_seen": 98852752, "step": 81480 }, { "epoch": 9.075064038311616, "grad_norm": 0.12389197200536728, "learning_rate": 1.2920396874906565e-06, "loss": 0.464, "num_input_tokens_seen": 98858992, "step": 81485 }, { "epoch": 9.075620893195234, "grad_norm": 0.11284945160150528, "learning_rate": 1.290498125010095e-06, "loss": 0.4557, "num_input_tokens_seen": 98864976, "step": 81490 }, { "epoch": 9.07617774807885, "grad_norm": 0.12496630102396011, "learning_rate": 1.2889574583504926e-06, "loss": 0.4456, "num_input_tokens_seen": 98870864, "step": 81495 }, { "epoch": 9.076734602962468, "grad_norm": 0.11157695204019547, "learning_rate": 1.2874176875700666e-06, "loss": 0.4647, "num_input_tokens_seen": 98877008, "step": 81500 }, { "epoch": 9.077291457846085, "grad_norm": 0.1051691323518753, "learning_rate": 1.2858788127269873e-06, "loss": 0.4544, "num_input_tokens_seen": 98883024, "step": 81505 }, { "epoch": 9.077848312729703, "grad_norm": 0.1351349800825119, "learning_rate": 1.2843408338794022e-06, "loss": 0.4546, "num_input_tokens_seen": 98889552, "step": 81510 }, { "epoch": 9.07840516761332, "grad_norm": 0.1515284776687622, "learning_rate": 1.2828037510854235e-06, "loss": 0.4495, "num_input_tokens_seen": 98895472, "step": 81515 }, { "epoch": 9.078962022496937, "grad_norm": 0.1469333916902542, "learning_rate": 1.2812675644031214e-06, "loss": 0.4663, "num_input_tokens_seen": 98901936, "step": 81520 }, { "epoch": 9.079518877380554, "grad_norm": 0.14794029295444489, "learning_rate": 1.2797322738905466e-06, "loss": 0.4633, "num_input_tokens_seen": 98907376, "step": 81525 }, { "epoch": 9.080075732264172, "grad_norm": 0.12860603630542755, "learning_rate": 1.2781978796056998e-06, "loss": 0.4564, "num_input_tokens_seen": 98913424, "step": 81530 }, { "epoch": 9.08063258714779, "grad_norm": 0.1055237203836441, "learning_rate": 1.2766643816065544e-06, "loss": 0.4624, "num_input_tokens_seen": 98919536, "step": 81535 }, { "epoch": 9.081189442031407, "grad_norm": 0.054210275411605835, "learning_rate": 1.2751317799510582e-06, "loss": 0.463, "num_input_tokens_seen": 98925712, "step": 81540 }, { "epoch": 9.081746296915023, "grad_norm": 0.13491737842559814, "learning_rate": 1.2736000746971067e-06, "loss": 0.4647, "num_input_tokens_seen": 98931920, "step": 81545 }, { "epoch": 9.08230315179864, "grad_norm": 0.13436344265937805, "learning_rate": 1.2720692659025867e-06, "loss": 0.4441, "num_input_tokens_seen": 98937072, "step": 81550 }, { "epoch": 9.082860006682258, "grad_norm": 0.1658277064561844, "learning_rate": 1.2705393536253247e-06, "loss": 0.47, "num_input_tokens_seen": 98943280, "step": 81555 }, { "epoch": 9.083416861565876, "grad_norm": 0.09836988896131516, "learning_rate": 1.2690103379231295e-06, "loss": 0.4556, "num_input_tokens_seen": 98949392, "step": 81560 }, { "epoch": 9.083973716449494, "grad_norm": 0.11027530580759048, "learning_rate": 1.26748221885378e-06, "loss": 0.4489, "num_input_tokens_seen": 98955312, "step": 81565 }, { "epoch": 9.084530571333111, "grad_norm": 0.07456003129482269, "learning_rate": 1.2659549964750024e-06, "loss": 0.4651, "num_input_tokens_seen": 98961520, "step": 81570 }, { "epoch": 9.085087426216727, "grad_norm": 0.1002747192978859, "learning_rate": 1.2644286708445058e-06, "loss": 0.4655, "num_input_tokens_seen": 98967792, "step": 81575 }, { "epoch": 9.085644281100345, "grad_norm": 0.0883556678891182, "learning_rate": 1.2629032420199554e-06, "loss": 0.4744, "num_input_tokens_seen": 98973904, "step": 81580 }, { "epoch": 9.086201135983963, "grad_norm": 0.11780854314565659, "learning_rate": 1.2613787100589941e-06, "loss": 0.4635, "num_input_tokens_seen": 98979824, "step": 81585 }, { "epoch": 9.08675799086758, "grad_norm": 0.10935180634260178, "learning_rate": 1.2598550750192173e-06, "loss": 0.4534, "num_input_tokens_seen": 98985904, "step": 81590 }, { "epoch": 9.087314845751198, "grad_norm": 0.1453317552804947, "learning_rate": 1.2583323369581928e-06, "loss": 0.4574, "num_input_tokens_seen": 98992048, "step": 81595 }, { "epoch": 9.087871700634814, "grad_norm": 0.10153529047966003, "learning_rate": 1.2568104959334608e-06, "loss": 0.4622, "num_input_tokens_seen": 98998128, "step": 81600 }, { "epoch": 9.088428555518432, "grad_norm": 0.08405497670173645, "learning_rate": 1.2552895520025115e-06, "loss": 0.4573, "num_input_tokens_seen": 99003984, "step": 81605 }, { "epoch": 9.08898541040205, "grad_norm": 0.11157527565956116, "learning_rate": 1.2537695052228239e-06, "loss": 0.4553, "num_input_tokens_seen": 99010000, "step": 81610 }, { "epoch": 9.089542265285667, "grad_norm": 0.1478670835494995, "learning_rate": 1.2522503556518156e-06, "loss": 0.4713, "num_input_tokens_seen": 99016304, "step": 81615 }, { "epoch": 9.090099120169285, "grad_norm": 0.08319955319166183, "learning_rate": 1.250732103346894e-06, "loss": 0.4575, "num_input_tokens_seen": 99022384, "step": 81620 }, { "epoch": 9.0906559750529, "grad_norm": 0.13863448798656464, "learning_rate": 1.2492147483654265e-06, "loss": 0.4661, "num_input_tokens_seen": 99028528, "step": 81625 }, { "epoch": 9.091212829936518, "grad_norm": 0.10111305862665176, "learning_rate": 1.2476982907647345e-06, "loss": 0.4531, "num_input_tokens_seen": 99034320, "step": 81630 }, { "epoch": 9.091769684820136, "grad_norm": 0.18921904265880585, "learning_rate": 1.2461827306021217e-06, "loss": 0.4598, "num_input_tokens_seen": 99040656, "step": 81635 }, { "epoch": 9.092326539703754, "grad_norm": 0.09777004271745682, "learning_rate": 1.2446680679348455e-06, "loss": 0.4494, "num_input_tokens_seen": 99046480, "step": 81640 }, { "epoch": 9.092883394587371, "grad_norm": 0.08860115706920624, "learning_rate": 1.2431543028201403e-06, "loss": 0.4567, "num_input_tokens_seen": 99052496, "step": 81645 }, { "epoch": 9.093440249470987, "grad_norm": 0.10031313449144363, "learning_rate": 1.2416414353151968e-06, "loss": 0.4494, "num_input_tokens_seen": 99058512, "step": 81650 }, { "epoch": 9.093997104354605, "grad_norm": 0.15435855090618134, "learning_rate": 1.2401294654771744e-06, "loss": 0.4673, "num_input_tokens_seen": 99064720, "step": 81655 }, { "epoch": 9.094553959238223, "grad_norm": 0.09912281483411789, "learning_rate": 1.2386183933632084e-06, "loss": 0.4629, "num_input_tokens_seen": 99070736, "step": 81660 }, { "epoch": 9.09511081412184, "grad_norm": 0.13197548687458038, "learning_rate": 1.2371082190303806e-06, "loss": 0.4616, "num_input_tokens_seen": 99076752, "step": 81665 }, { "epoch": 9.095667669005458, "grad_norm": 0.10008938610553741, "learning_rate": 1.2355989425357622e-06, "loss": 0.4642, "num_input_tokens_seen": 99082608, "step": 81670 }, { "epoch": 9.096224523889074, "grad_norm": 0.11492646485567093, "learning_rate": 1.2340905639363686e-06, "loss": 0.4664, "num_input_tokens_seen": 99087984, "step": 81675 }, { "epoch": 9.096781378772691, "grad_norm": 0.1306503415107727, "learning_rate": 1.2325830832891989e-06, "loss": 0.4633, "num_input_tokens_seen": 99093808, "step": 81680 }, { "epoch": 9.09733823365631, "grad_norm": 0.17679135501384735, "learning_rate": 1.2310765006512015e-06, "loss": 0.4527, "num_input_tokens_seen": 99100048, "step": 81685 }, { "epoch": 9.097895088539927, "grad_norm": 0.10165517777204514, "learning_rate": 1.2295708160793063e-06, "loss": 0.4572, "num_input_tokens_seen": 99105392, "step": 81690 }, { "epoch": 9.098451943423544, "grad_norm": 0.13297061622142792, "learning_rate": 1.2280660296304064e-06, "loss": 0.4543, "num_input_tokens_seen": 99111312, "step": 81695 }, { "epoch": 9.09900879830716, "grad_norm": 0.12533745169639587, "learning_rate": 1.226562141361348e-06, "loss": 0.4565, "num_input_tokens_seen": 99117584, "step": 81700 }, { "epoch": 9.099565653190778, "grad_norm": 0.09438854455947876, "learning_rate": 1.225059151328961e-06, "loss": 0.4511, "num_input_tokens_seen": 99123376, "step": 81705 }, { "epoch": 9.100122508074396, "grad_norm": 0.11280950903892517, "learning_rate": 1.2235570595900247e-06, "loss": 0.4611, "num_input_tokens_seen": 99129520, "step": 81710 }, { "epoch": 9.100679362958013, "grad_norm": 0.0897984579205513, "learning_rate": 1.2220558662013022e-06, "loss": 0.4578, "num_input_tokens_seen": 99135664, "step": 81715 }, { "epoch": 9.101236217841631, "grad_norm": 0.10287996381521225, "learning_rate": 1.220555571219509e-06, "loss": 0.4635, "num_input_tokens_seen": 99141584, "step": 81720 }, { "epoch": 9.101793072725247, "grad_norm": 0.1269591599702835, "learning_rate": 1.21905617470133e-06, "loss": 0.4612, "num_input_tokens_seen": 99147568, "step": 81725 }, { "epoch": 9.102349927608865, "grad_norm": 0.19269157946109772, "learning_rate": 1.2175576767034208e-06, "loss": 0.4658, "num_input_tokens_seen": 99153872, "step": 81730 }, { "epoch": 9.102906782492482, "grad_norm": 0.10368510335683823, "learning_rate": 1.2160600772823938e-06, "loss": 0.4639, "num_input_tokens_seen": 99160144, "step": 81735 }, { "epoch": 9.1034636373761, "grad_norm": 0.10834166407585144, "learning_rate": 1.21456337649484e-06, "loss": 0.4522, "num_input_tokens_seen": 99166000, "step": 81740 }, { "epoch": 9.104020492259718, "grad_norm": 0.12489611655473709, "learning_rate": 1.213067574397303e-06, "loss": 0.4571, "num_input_tokens_seen": 99172144, "step": 81745 }, { "epoch": 9.104577347143334, "grad_norm": 0.1320183426141739, "learning_rate": 1.2115726710463016e-06, "loss": 0.4602, "num_input_tokens_seen": 99177712, "step": 81750 }, { "epoch": 9.105134202026951, "grad_norm": 0.13801807165145874, "learning_rate": 1.210078666498321e-06, "loss": 0.4725, "num_input_tokens_seen": 99183760, "step": 81755 }, { "epoch": 9.105691056910569, "grad_norm": 0.06657116860151291, "learning_rate": 1.208585560809805e-06, "loss": 0.4547, "num_input_tokens_seen": 99189616, "step": 81760 }, { "epoch": 9.106247911794187, "grad_norm": 0.11045002192258835, "learning_rate": 1.2070933540371726e-06, "loss": 0.4526, "num_input_tokens_seen": 99195920, "step": 81765 }, { "epoch": 9.106804766677804, "grad_norm": 0.10298550128936768, "learning_rate": 1.2056020462367978e-06, "loss": 0.4671, "num_input_tokens_seen": 99202192, "step": 81770 }, { "epoch": 9.107361621561422, "grad_norm": 0.11449204385280609, "learning_rate": 1.2041116374650357e-06, "loss": 0.4608, "num_input_tokens_seen": 99207760, "step": 81775 }, { "epoch": 9.107918476445038, "grad_norm": 0.1112813651561737, "learning_rate": 1.2026221277781885e-06, "loss": 0.4633, "num_input_tokens_seen": 99213072, "step": 81780 }, { "epoch": 9.108475331328655, "grad_norm": 0.13712026178836823, "learning_rate": 1.2011335172325389e-06, "loss": 0.4684, "num_input_tokens_seen": 99219344, "step": 81785 }, { "epoch": 9.109032186212273, "grad_norm": 0.09954576939344406, "learning_rate": 1.199645805884339e-06, "loss": 0.458, "num_input_tokens_seen": 99225776, "step": 81790 }, { "epoch": 9.10958904109589, "grad_norm": 0.0917348563671112, "learning_rate": 1.1981589937897858e-06, "loss": 0.4595, "num_input_tokens_seen": 99231728, "step": 81795 }, { "epoch": 9.110145895979509, "grad_norm": 0.14464299380779266, "learning_rate": 1.1966730810050676e-06, "loss": 0.4613, "num_input_tokens_seen": 99237808, "step": 81800 }, { "epoch": 9.110702750863124, "grad_norm": 0.15094490349292755, "learning_rate": 1.1951880675863169e-06, "loss": 0.4669, "num_input_tokens_seen": 99244144, "step": 81805 }, { "epoch": 9.111259605746742, "grad_norm": 0.09984026849269867, "learning_rate": 1.1937039535896532e-06, "loss": 0.46, "num_input_tokens_seen": 99250288, "step": 81810 }, { "epoch": 9.11181646063036, "grad_norm": 0.08855362981557846, "learning_rate": 1.19222073907114e-06, "loss": 0.4548, "num_input_tokens_seen": 99256784, "step": 81815 }, { "epoch": 9.112373315513977, "grad_norm": 0.10871028155088425, "learning_rate": 1.1907384240868236e-06, "loss": 0.467, "num_input_tokens_seen": 99262928, "step": 81820 }, { "epoch": 9.112930170397595, "grad_norm": 0.11493698507547379, "learning_rate": 1.1892570086927123e-06, "loss": 0.4612, "num_input_tokens_seen": 99268304, "step": 81825 }, { "epoch": 9.113487025281211, "grad_norm": 0.11606822907924652, "learning_rate": 1.1877764929447754e-06, "loss": 0.47, "num_input_tokens_seen": 99274512, "step": 81830 }, { "epoch": 9.114043880164829, "grad_norm": 0.1364820897579193, "learning_rate": 1.186296876898954e-06, "loss": 0.4602, "num_input_tokens_seen": 99280528, "step": 81835 }, { "epoch": 9.114600735048446, "grad_norm": 0.11452393978834152, "learning_rate": 1.1848181606111481e-06, "loss": 0.4684, "num_input_tokens_seen": 99286576, "step": 81840 }, { "epoch": 9.115157589932064, "grad_norm": 0.09681881219148636, "learning_rate": 1.183340344137232e-06, "loss": 0.4615, "num_input_tokens_seen": 99292496, "step": 81845 }, { "epoch": 9.115714444815682, "grad_norm": 0.12005122005939484, "learning_rate": 1.1818634275330449e-06, "loss": 0.4492, "num_input_tokens_seen": 99298608, "step": 81850 }, { "epoch": 9.116271299699298, "grad_norm": 0.09762689471244812, "learning_rate": 1.1803874108543833e-06, "loss": 0.4764, "num_input_tokens_seen": 99304656, "step": 81855 }, { "epoch": 9.116828154582915, "grad_norm": 0.09867509454488754, "learning_rate": 1.1789122941570253e-06, "loss": 0.449, "num_input_tokens_seen": 99310832, "step": 81860 }, { "epoch": 9.117385009466533, "grad_norm": 0.11620365083217621, "learning_rate": 1.1774380774966926e-06, "loss": 0.4626, "num_input_tokens_seen": 99316784, "step": 81865 }, { "epoch": 9.11794186435015, "grad_norm": 0.15000304579734802, "learning_rate": 1.1759647609290964e-06, "loss": 0.453, "num_input_tokens_seen": 99323152, "step": 81870 }, { "epoch": 9.118498719233768, "grad_norm": 0.08244209736585617, "learning_rate": 1.1744923445098949e-06, "loss": 0.4572, "num_input_tokens_seen": 99329392, "step": 81875 }, { "epoch": 9.119055574117384, "grad_norm": 0.10660834610462189, "learning_rate": 1.173020828294727e-06, "loss": 0.4695, "num_input_tokens_seen": 99335536, "step": 81880 }, { "epoch": 9.119612429001002, "grad_norm": 0.08183600008487701, "learning_rate": 1.1715502123391924e-06, "loss": 0.4658, "num_input_tokens_seen": 99341552, "step": 81885 }, { "epoch": 9.12016928388462, "grad_norm": 0.12294046580791473, "learning_rate": 1.1700804966988498e-06, "loss": 0.4629, "num_input_tokens_seen": 99347728, "step": 81890 }, { "epoch": 9.120726138768237, "grad_norm": 0.09218549728393555, "learning_rate": 1.1686116814292353e-06, "loss": 0.4593, "num_input_tokens_seen": 99354032, "step": 81895 }, { "epoch": 9.121282993651855, "grad_norm": 0.09965874254703522, "learning_rate": 1.1671437665858403e-06, "loss": 0.4598, "num_input_tokens_seen": 99360368, "step": 81900 }, { "epoch": 9.12183984853547, "grad_norm": 0.11178566515445709, "learning_rate": 1.1656767522241318e-06, "loss": 0.4587, "num_input_tokens_seen": 99366608, "step": 81905 }, { "epoch": 9.122396703419088, "grad_norm": 0.12433759868144989, "learning_rate": 1.164210638399535e-06, "loss": 0.4563, "num_input_tokens_seen": 99372848, "step": 81910 }, { "epoch": 9.122953558302706, "grad_norm": 0.13151535391807556, "learning_rate": 1.162745425167444e-06, "loss": 0.465, "num_input_tokens_seen": 99378992, "step": 81915 }, { "epoch": 9.123510413186324, "grad_norm": 0.09570246189832687, "learning_rate": 1.1612811125832263e-06, "loss": 0.4639, "num_input_tokens_seen": 99385008, "step": 81920 }, { "epoch": 9.124067268069942, "grad_norm": 0.10133381187915802, "learning_rate": 1.1598177007021983e-06, "loss": 0.4629, "num_input_tokens_seen": 99390992, "step": 81925 }, { "epoch": 9.12462412295356, "grad_norm": 0.14136219024658203, "learning_rate": 1.158355189579663e-06, "loss": 0.4512, "num_input_tokens_seen": 99396816, "step": 81930 }, { "epoch": 9.125180977837175, "grad_norm": 0.1342603713274002, "learning_rate": 1.156893579270868e-06, "loss": 0.4621, "num_input_tokens_seen": 99402960, "step": 81935 }, { "epoch": 9.125737832720793, "grad_norm": 0.1313808709383011, "learning_rate": 1.1554328698310413e-06, "loss": 0.4667, "num_input_tokens_seen": 99408944, "step": 81940 }, { "epoch": 9.12629468760441, "grad_norm": 0.1531306952238083, "learning_rate": 1.1539730613153803e-06, "loss": 0.4492, "num_input_tokens_seen": 99415184, "step": 81945 }, { "epoch": 9.126851542488028, "grad_norm": 0.07841434329748154, "learning_rate": 1.152514153779033e-06, "loss": 0.4696, "num_input_tokens_seen": 99421520, "step": 81950 }, { "epoch": 9.127408397371646, "grad_norm": 0.10809151828289032, "learning_rate": 1.1510561472771241e-06, "loss": 0.4531, "num_input_tokens_seen": 99427696, "step": 81955 }, { "epoch": 9.127965252255262, "grad_norm": 0.12257978320121765, "learning_rate": 1.1495990418647378e-06, "loss": 0.4509, "num_input_tokens_seen": 99433712, "step": 81960 }, { "epoch": 9.12852210713888, "grad_norm": 0.10049120336771011, "learning_rate": 1.1481428375969356e-06, "loss": 0.4636, "num_input_tokens_seen": 99440208, "step": 81965 }, { "epoch": 9.129078962022497, "grad_norm": 0.09500160068273544, "learning_rate": 1.1466875345287343e-06, "loss": 0.4444, "num_input_tokens_seen": 99446192, "step": 81970 }, { "epoch": 9.129635816906115, "grad_norm": 0.15606428682804108, "learning_rate": 1.1452331327151177e-06, "loss": 0.4675, "num_input_tokens_seen": 99452624, "step": 81975 }, { "epoch": 9.130192671789732, "grad_norm": 0.12324178218841553, "learning_rate": 1.1437796322110423e-06, "loss": 0.46, "num_input_tokens_seen": 99458480, "step": 81980 }, { "epoch": 9.130749526673348, "grad_norm": 0.09442568570375443, "learning_rate": 1.1423270330714192e-06, "loss": 0.4595, "num_input_tokens_seen": 99464496, "step": 81985 }, { "epoch": 9.131306381556966, "grad_norm": 0.08303719758987427, "learning_rate": 1.1408753353511408e-06, "loss": 0.4569, "num_input_tokens_seen": 99470480, "step": 81990 }, { "epoch": 9.131863236440584, "grad_norm": 0.20966817438602448, "learning_rate": 1.1394245391050495e-06, "loss": 0.4741, "num_input_tokens_seen": 99476688, "step": 81995 }, { "epoch": 9.132420091324201, "grad_norm": 0.10840197652578354, "learning_rate": 1.1379746443879623e-06, "loss": 0.4736, "num_input_tokens_seen": 99482576, "step": 82000 }, { "epoch": 9.132976946207819, "grad_norm": 0.11472545564174652, "learning_rate": 1.136525651254669e-06, "loss": 0.4633, "num_input_tokens_seen": 99488208, "step": 82005 }, { "epoch": 9.133533801091435, "grad_norm": 0.11124484241008759, "learning_rate": 1.1350775597599062e-06, "loss": 0.4436, "num_input_tokens_seen": 99494192, "step": 82010 }, { "epoch": 9.134090655975053, "grad_norm": 0.10886666923761368, "learning_rate": 1.1336303699583938e-06, "loss": 0.4521, "num_input_tokens_seen": 99500048, "step": 82015 }, { "epoch": 9.13464751085867, "grad_norm": 0.08973818272352219, "learning_rate": 1.132184081904808e-06, "loss": 0.4427, "num_input_tokens_seen": 99506320, "step": 82020 }, { "epoch": 9.135204365742288, "grad_norm": 0.09891205281019211, "learning_rate": 1.1307386956537963e-06, "loss": 0.4528, "num_input_tokens_seen": 99512240, "step": 82025 }, { "epoch": 9.135761220625906, "grad_norm": 0.11598576605319977, "learning_rate": 1.1292942112599707e-06, "loss": 0.4521, "num_input_tokens_seen": 99518672, "step": 82030 }, { "epoch": 9.136318075509521, "grad_norm": 0.1088249534368515, "learning_rate": 1.1278506287779073e-06, "loss": 0.4637, "num_input_tokens_seen": 99524848, "step": 82035 }, { "epoch": 9.13687493039314, "grad_norm": 0.1147817000746727, "learning_rate": 1.1264079482621482e-06, "loss": 0.4528, "num_input_tokens_seen": 99531024, "step": 82040 }, { "epoch": 9.137431785276757, "grad_norm": 0.11412695795297623, "learning_rate": 1.1249661697672027e-06, "loss": 0.465, "num_input_tokens_seen": 99537296, "step": 82045 }, { "epoch": 9.137988640160374, "grad_norm": 0.14781679213047028, "learning_rate": 1.1235252933475493e-06, "loss": 0.4689, "num_input_tokens_seen": 99543152, "step": 82050 }, { "epoch": 9.138545495043992, "grad_norm": 0.13559089601039886, "learning_rate": 1.1220853190576224e-06, "loss": 0.4674, "num_input_tokens_seen": 99549392, "step": 82055 }, { "epoch": 9.139102349927608, "grad_norm": 0.09699700772762299, "learning_rate": 1.1206462469518342e-06, "loss": 0.4559, "num_input_tokens_seen": 99555408, "step": 82060 }, { "epoch": 9.139659204811226, "grad_norm": 0.09497159719467163, "learning_rate": 1.1192080770845603e-06, "loss": 0.4611, "num_input_tokens_seen": 99561808, "step": 82065 }, { "epoch": 9.140216059694843, "grad_norm": 0.18399493396282196, "learning_rate": 1.1177708095101296e-06, "loss": 0.4642, "num_input_tokens_seen": 99568496, "step": 82070 }, { "epoch": 9.140772914578461, "grad_norm": 0.11550754308700562, "learning_rate": 1.1163344442828544e-06, "loss": 0.4569, "num_input_tokens_seen": 99574448, "step": 82075 }, { "epoch": 9.141329769462079, "grad_norm": 0.11843886971473694, "learning_rate": 1.1148989814570021e-06, "loss": 0.4553, "num_input_tokens_seen": 99580272, "step": 82080 }, { "epoch": 9.141886624345695, "grad_norm": 0.10923973470926285, "learning_rate": 1.11346442108681e-06, "loss": 0.4573, "num_input_tokens_seen": 99586576, "step": 82085 }, { "epoch": 9.142443479229312, "grad_norm": 0.13445234298706055, "learning_rate": 1.1120307632264792e-06, "loss": 0.473, "num_input_tokens_seen": 99592656, "step": 82090 }, { "epoch": 9.14300033411293, "grad_norm": 0.12855219841003418, "learning_rate": 1.1105980079301803e-06, "loss": 0.4663, "num_input_tokens_seen": 99598704, "step": 82095 }, { "epoch": 9.143557188996548, "grad_norm": 0.12106327712535858, "learning_rate": 1.1091661552520449e-06, "loss": 0.468, "num_input_tokens_seen": 99604784, "step": 82100 }, { "epoch": 9.144114043880165, "grad_norm": 0.12607385218143463, "learning_rate": 1.1077352052461742e-06, "loss": 0.4538, "num_input_tokens_seen": 99610672, "step": 82105 }, { "epoch": 9.144670898763783, "grad_norm": 0.11670560389757156, "learning_rate": 1.1063051579666362e-06, "loss": 0.451, "num_input_tokens_seen": 99616848, "step": 82110 }, { "epoch": 9.145227753647399, "grad_norm": 0.13099145889282227, "learning_rate": 1.1048760134674569e-06, "loss": 0.4636, "num_input_tokens_seen": 99622608, "step": 82115 }, { "epoch": 9.145784608531017, "grad_norm": 0.15699262917041779, "learning_rate": 1.1034477718026404e-06, "loss": 0.454, "num_input_tokens_seen": 99628784, "step": 82120 }, { "epoch": 9.146341463414634, "grad_norm": 0.10117041319608688, "learning_rate": 1.1020204330261463e-06, "loss": 0.447, "num_input_tokens_seen": 99634960, "step": 82125 }, { "epoch": 9.146898318298252, "grad_norm": 0.14073221385478973, "learning_rate": 1.1005939971919038e-06, "loss": 0.4634, "num_input_tokens_seen": 99641008, "step": 82130 }, { "epoch": 9.14745517318187, "grad_norm": 0.15595988929271698, "learning_rate": 1.099168464353814e-06, "loss": 0.4563, "num_input_tokens_seen": 99646768, "step": 82135 }, { "epoch": 9.148012028065486, "grad_norm": 0.13093595206737518, "learning_rate": 1.0977438345657287e-06, "loss": 0.4656, "num_input_tokens_seen": 99652944, "step": 82140 }, { "epoch": 9.148568882949103, "grad_norm": 0.0983337014913559, "learning_rate": 1.0963201078814822e-06, "loss": 0.4607, "num_input_tokens_seen": 99658864, "step": 82145 }, { "epoch": 9.14912573783272, "grad_norm": 0.09212411195039749, "learning_rate": 1.0948972843548649e-06, "loss": 0.4606, "num_input_tokens_seen": 99664912, "step": 82150 }, { "epoch": 9.149682592716339, "grad_norm": 0.10969854891300201, "learning_rate": 1.093475364039634e-06, "loss": 0.464, "num_input_tokens_seen": 99671248, "step": 82155 }, { "epoch": 9.150239447599956, "grad_norm": 0.14932788908481598, "learning_rate": 1.0920543469895184e-06, "loss": 0.4608, "num_input_tokens_seen": 99677456, "step": 82160 }, { "epoch": 9.150796302483572, "grad_norm": 0.11930186301469803, "learning_rate": 1.0906342332582031e-06, "loss": 0.4643, "num_input_tokens_seen": 99683472, "step": 82165 }, { "epoch": 9.15135315736719, "grad_norm": 0.10875334590673447, "learning_rate": 1.0892150228993537e-06, "loss": 0.4509, "num_input_tokens_seen": 99689776, "step": 82170 }, { "epoch": 9.151910012250807, "grad_norm": 0.14313119649887085, "learning_rate": 1.0877967159665826e-06, "loss": 0.4585, "num_input_tokens_seen": 99695984, "step": 82175 }, { "epoch": 9.152466867134425, "grad_norm": 0.10748250037431717, "learning_rate": 1.086379312513483e-06, "loss": 0.4603, "num_input_tokens_seen": 99702256, "step": 82180 }, { "epoch": 9.153023722018043, "grad_norm": 0.09988482296466827, "learning_rate": 1.0849628125936068e-06, "loss": 0.4573, "num_input_tokens_seen": 99708464, "step": 82185 }, { "epoch": 9.153580576901659, "grad_norm": 0.09725479781627655, "learning_rate": 1.0835472162604748e-06, "loss": 0.4683, "num_input_tokens_seen": 99714672, "step": 82190 }, { "epoch": 9.154137431785276, "grad_norm": 0.1165229082107544, "learning_rate": 1.082132523567575e-06, "loss": 0.4576, "num_input_tokens_seen": 99720944, "step": 82195 }, { "epoch": 9.154694286668894, "grad_norm": 0.12100100517272949, "learning_rate": 1.0807187345683535e-06, "loss": 0.4531, "num_input_tokens_seen": 99726800, "step": 82200 }, { "epoch": 9.155251141552512, "grad_norm": 0.12496981024742126, "learning_rate": 1.0793058493162367e-06, "loss": 0.4574, "num_input_tokens_seen": 99732912, "step": 82205 }, { "epoch": 9.15580799643613, "grad_norm": 0.09989066421985626, "learning_rate": 1.077893867864599e-06, "loss": 0.4643, "num_input_tokens_seen": 99739152, "step": 82210 }, { "epoch": 9.156364851319745, "grad_norm": 0.12535253167152405, "learning_rate": 1.0764827902667947e-06, "loss": 0.4557, "num_input_tokens_seen": 99745008, "step": 82215 }, { "epoch": 9.156921706203363, "grad_norm": 0.11904555559158325, "learning_rate": 1.0750726165761339e-06, "loss": 0.4555, "num_input_tokens_seen": 99751024, "step": 82220 }, { "epoch": 9.15747856108698, "grad_norm": 0.09875766932964325, "learning_rate": 1.0736633468459017e-06, "loss": 0.4422, "num_input_tokens_seen": 99757104, "step": 82225 }, { "epoch": 9.158035415970598, "grad_norm": 0.10081617534160614, "learning_rate": 1.0722549811293476e-06, "loss": 0.4469, "num_input_tokens_seen": 99763472, "step": 82230 }, { "epoch": 9.158592270854216, "grad_norm": 0.10698159784078598, "learning_rate": 1.0708475194796757e-06, "loss": 0.4476, "num_input_tokens_seen": 99769168, "step": 82235 }, { "epoch": 9.159149125737832, "grad_norm": 0.11775124073028564, "learning_rate": 1.0694409619500744e-06, "loss": 0.4578, "num_input_tokens_seen": 99775536, "step": 82240 }, { "epoch": 9.15970598062145, "grad_norm": 0.17388415336608887, "learning_rate": 1.068035308593679e-06, "loss": 0.4557, "num_input_tokens_seen": 99781648, "step": 82245 }, { "epoch": 9.160262835505067, "grad_norm": 0.10476352274417877, "learning_rate": 1.066630559463605e-06, "loss": 0.4638, "num_input_tokens_seen": 99787216, "step": 82250 }, { "epoch": 9.160819690388685, "grad_norm": 0.1296699047088623, "learning_rate": 1.065226714612927e-06, "loss": 0.4603, "num_input_tokens_seen": 99793584, "step": 82255 }, { "epoch": 9.161376545272303, "grad_norm": 0.10778789967298508, "learning_rate": 1.0638237740946855e-06, "loss": 0.4517, "num_input_tokens_seen": 99799440, "step": 82260 }, { "epoch": 9.161933400155919, "grad_norm": 0.13107149302959442, "learning_rate": 1.0624217379618912e-06, "loss": 0.4625, "num_input_tokens_seen": 99805392, "step": 82265 }, { "epoch": 9.162490255039536, "grad_norm": 0.19315139949321747, "learning_rate": 1.0610206062675126e-06, "loss": 0.4574, "num_input_tokens_seen": 99811920, "step": 82270 }, { "epoch": 9.163047109923154, "grad_norm": 0.12444207817316055, "learning_rate": 1.0596203790644938e-06, "loss": 0.4564, "num_input_tokens_seen": 99817744, "step": 82275 }, { "epoch": 9.163603964806772, "grad_norm": 0.09974531829357147, "learning_rate": 1.0582210564057337e-06, "loss": 0.4648, "num_input_tokens_seen": 99823568, "step": 82280 }, { "epoch": 9.16416081969039, "grad_norm": 0.1393812596797943, "learning_rate": 1.0568226383441098e-06, "loss": 0.4557, "num_input_tokens_seen": 99830064, "step": 82285 }, { "epoch": 9.164717674574007, "grad_norm": 0.11235225945711136, "learning_rate": 1.0554251249324577e-06, "loss": 0.4563, "num_input_tokens_seen": 99835952, "step": 82290 }, { "epoch": 9.165274529457623, "grad_norm": 0.15728402137756348, "learning_rate": 1.0540285162235763e-06, "loss": 0.4766, "num_input_tokens_seen": 99842224, "step": 82295 }, { "epoch": 9.16583138434124, "grad_norm": 0.08155082911252975, "learning_rate": 1.0526328122702407e-06, "loss": 0.4586, "num_input_tokens_seen": 99848112, "step": 82300 }, { "epoch": 9.166388239224858, "grad_norm": 0.13248369097709656, "learning_rate": 1.0512380131251749e-06, "loss": 0.4612, "num_input_tokens_seen": 99854320, "step": 82305 }, { "epoch": 9.166945094108476, "grad_norm": 0.11310797929763794, "learning_rate": 1.0498441188410896e-06, "loss": 0.4664, "num_input_tokens_seen": 99860304, "step": 82310 }, { "epoch": 9.167501948992093, "grad_norm": 0.1028953567147255, "learning_rate": 1.048451129470643e-06, "loss": 0.4659, "num_input_tokens_seen": 99866288, "step": 82315 }, { "epoch": 9.16805880387571, "grad_norm": 0.10574577748775482, "learning_rate": 1.0470590450664674e-06, "loss": 0.4515, "num_input_tokens_seen": 99872048, "step": 82320 }, { "epoch": 9.168615658759327, "grad_norm": 0.11661441624164581, "learning_rate": 1.045667865681166e-06, "loss": 0.4714, "num_input_tokens_seen": 99878192, "step": 82325 }, { "epoch": 9.169172513642945, "grad_norm": 0.13369162380695343, "learning_rate": 1.044277591367293e-06, "loss": 0.4581, "num_input_tokens_seen": 99884112, "step": 82330 }, { "epoch": 9.169729368526562, "grad_norm": 0.07987023890018463, "learning_rate": 1.0428882221773878e-06, "loss": 0.4649, "num_input_tokens_seen": 99890224, "step": 82335 }, { "epoch": 9.17028622341018, "grad_norm": 0.15579891204833984, "learning_rate": 1.0414997581639357e-06, "loss": 0.4547, "num_input_tokens_seen": 99896368, "step": 82340 }, { "epoch": 9.170843078293796, "grad_norm": 0.13984595239162445, "learning_rate": 1.0401121993794033e-06, "loss": 0.4592, "num_input_tokens_seen": 99902448, "step": 82345 }, { "epoch": 9.171399933177414, "grad_norm": 0.1580323576927185, "learning_rate": 1.0387255458762153e-06, "loss": 0.4614, "num_input_tokens_seen": 99908496, "step": 82350 }, { "epoch": 9.171956788061031, "grad_norm": 0.10668499767780304, "learning_rate": 1.0373397977067656e-06, "loss": 0.4695, "num_input_tokens_seen": 99914576, "step": 82355 }, { "epoch": 9.172513642944649, "grad_norm": 0.10120890289545059, "learning_rate": 1.0359549549234099e-06, "loss": 0.4501, "num_input_tokens_seen": 99920560, "step": 82360 }, { "epoch": 9.173070497828267, "grad_norm": 0.12061934918165207, "learning_rate": 1.0345710175784702e-06, "loss": 0.4573, "num_input_tokens_seen": 99926576, "step": 82365 }, { "epoch": 9.173627352711883, "grad_norm": 0.16508358716964722, "learning_rate": 1.0331879857242405e-06, "loss": 0.4517, "num_input_tokens_seen": 99932208, "step": 82370 }, { "epoch": 9.1741842075955, "grad_norm": 0.10257627069950104, "learning_rate": 1.0318058594129737e-06, "loss": 0.4659, "num_input_tokens_seen": 99938000, "step": 82375 }, { "epoch": 9.174741062479118, "grad_norm": 0.09578298777341843, "learning_rate": 1.030424638696889e-06, "loss": 0.463, "num_input_tokens_seen": 99944176, "step": 82380 }, { "epoch": 9.175297917362736, "grad_norm": 0.08665072172880173, "learning_rate": 1.029044323628181e-06, "loss": 0.4491, "num_input_tokens_seen": 99949744, "step": 82385 }, { "epoch": 9.175854772246353, "grad_norm": 0.10959489643573761, "learning_rate": 1.0276649142589912e-06, "loss": 0.4664, "num_input_tokens_seen": 99955792, "step": 82390 }, { "epoch": 9.17641162712997, "grad_norm": 0.16498246788978577, "learning_rate": 1.0262864106414444e-06, "loss": 0.4528, "num_input_tokens_seen": 99962032, "step": 82395 }, { "epoch": 9.176968482013587, "grad_norm": 0.08620470017194748, "learning_rate": 1.0249088128276297e-06, "loss": 0.475, "num_input_tokens_seen": 99968016, "step": 82400 }, { "epoch": 9.177525336897205, "grad_norm": 0.10686270147562027, "learning_rate": 1.0235321208695887e-06, "loss": 0.4692, "num_input_tokens_seen": 99974000, "step": 82405 }, { "epoch": 9.178082191780822, "grad_norm": 0.13448356091976166, "learning_rate": 1.0221563348193408e-06, "loss": 0.4667, "num_input_tokens_seen": 99979856, "step": 82410 }, { "epoch": 9.17863904666444, "grad_norm": 0.10487832129001617, "learning_rate": 1.0207814547288669e-06, "loss": 0.4532, "num_input_tokens_seen": 99985840, "step": 82415 }, { "epoch": 9.179195901548056, "grad_norm": 0.1072593703866005, "learning_rate": 1.019407480650117e-06, "loss": 0.4679, "num_input_tokens_seen": 99991792, "step": 82420 }, { "epoch": 9.179752756431673, "grad_norm": 0.12356635183095932, "learning_rate": 1.0180344126349994e-06, "loss": 0.4564, "num_input_tokens_seen": 99997584, "step": 82425 }, { "epoch": 9.180309611315291, "grad_norm": 0.09984913468360901, "learning_rate": 1.016662250735395e-06, "loss": 0.4509, "num_input_tokens_seen": 100003824, "step": 82430 }, { "epoch": 9.180866466198909, "grad_norm": 0.10681284219026566, "learning_rate": 1.0152909950031515e-06, "loss": 0.4621, "num_input_tokens_seen": 100009936, "step": 82435 }, { "epoch": 9.181423321082526, "grad_norm": 0.11403612047433853, "learning_rate": 1.0139206454900767e-06, "loss": 0.4597, "num_input_tokens_seen": 100015888, "step": 82440 }, { "epoch": 9.181980175966142, "grad_norm": 0.1272204965353012, "learning_rate": 1.0125512022479467e-06, "loss": 0.46, "num_input_tokens_seen": 100022352, "step": 82445 }, { "epoch": 9.18253703084976, "grad_norm": 0.12425761669874191, "learning_rate": 1.0111826653285027e-06, "loss": 0.4556, "num_input_tokens_seen": 100028464, "step": 82450 }, { "epoch": 9.183093885733378, "grad_norm": 0.1023993045091629, "learning_rate": 1.0098150347834567e-06, "loss": 0.4608, "num_input_tokens_seen": 100034512, "step": 82455 }, { "epoch": 9.183650740616995, "grad_norm": 0.11357992142438889, "learning_rate": 1.0084483106644754e-06, "loss": 0.4609, "num_input_tokens_seen": 100040400, "step": 82460 }, { "epoch": 9.184207595500613, "grad_norm": 0.13959679007530212, "learning_rate": 1.0070824930232036e-06, "loss": 0.4583, "num_input_tokens_seen": 100046288, "step": 82465 }, { "epoch": 9.18476445038423, "grad_norm": 0.18054108321666718, "learning_rate": 1.0057175819112447e-06, "loss": 0.4574, "num_input_tokens_seen": 100051568, "step": 82470 }, { "epoch": 9.185321305267847, "grad_norm": 0.13383430242538452, "learning_rate": 1.0043535773801655e-06, "loss": 0.4602, "num_input_tokens_seen": 100057520, "step": 82475 }, { "epoch": 9.185878160151464, "grad_norm": 0.12485270202159882, "learning_rate": 1.002990479481511e-06, "loss": 0.4607, "num_input_tokens_seen": 100063568, "step": 82480 }, { "epoch": 9.186435015035082, "grad_norm": 0.08818597346544266, "learning_rate": 1.001628288266776e-06, "loss": 0.4608, "num_input_tokens_seen": 100069392, "step": 82485 }, { "epoch": 9.1869918699187, "grad_norm": 0.13370706140995026, "learning_rate": 1.0002670037874307e-06, "loss": 0.4565, "num_input_tokens_seen": 100075280, "step": 82490 }, { "epoch": 9.187548724802317, "grad_norm": 0.09749643504619598, "learning_rate": 9.989066260949088e-07, "loss": 0.4554, "num_input_tokens_seen": 100081904, "step": 82495 }, { "epoch": 9.188105579685933, "grad_norm": 0.11378534138202667, "learning_rate": 9.97547155240608e-07, "loss": 0.4669, "num_input_tokens_seen": 100087984, "step": 82500 }, { "epoch": 9.188662434569551, "grad_norm": 0.24062848091125488, "learning_rate": 9.961885912759012e-07, "loss": 0.4693, "num_input_tokens_seen": 100093776, "step": 82505 }, { "epoch": 9.189219289453169, "grad_norm": 0.09451507031917572, "learning_rate": 9.948309342521083e-07, "loss": 0.4693, "num_input_tokens_seen": 100099760, "step": 82510 }, { "epoch": 9.189776144336786, "grad_norm": 0.12597960233688354, "learning_rate": 9.934741842205331e-07, "loss": 0.4641, "num_input_tokens_seen": 100105712, "step": 82515 }, { "epoch": 9.190332999220404, "grad_norm": 0.11272221058607101, "learning_rate": 9.921183412324342e-07, "loss": 0.4599, "num_input_tokens_seen": 100111792, "step": 82520 }, { "epoch": 9.19088985410402, "grad_norm": 0.11445487290620804, "learning_rate": 9.907634053390402e-07, "loss": 0.4536, "num_input_tokens_seen": 100118096, "step": 82525 }, { "epoch": 9.191446708987637, "grad_norm": 0.1261391043663025, "learning_rate": 9.89409376591549e-07, "loss": 0.4622, "num_input_tokens_seen": 100124144, "step": 82530 }, { "epoch": 9.192003563871255, "grad_norm": 0.1349683403968811, "learning_rate": 9.88056255041117e-07, "loss": 0.4623, "num_input_tokens_seen": 100129968, "step": 82535 }, { "epoch": 9.192560418754873, "grad_norm": 0.14871597290039062, "learning_rate": 9.8670404073887e-07, "loss": 0.4554, "num_input_tokens_seen": 100136528, "step": 82540 }, { "epoch": 9.19311727363849, "grad_norm": 0.11543719470500946, "learning_rate": 9.853527337358974e-07, "loss": 0.4613, "num_input_tokens_seen": 100142640, "step": 82545 }, { "epoch": 9.193674128522106, "grad_norm": 0.1372729241847992, "learning_rate": 9.84002334083259e-07, "loss": 0.4447, "num_input_tokens_seen": 100148848, "step": 82550 }, { "epoch": 9.194230983405724, "grad_norm": 0.16413860023021698, "learning_rate": 9.826528418319742e-07, "loss": 0.4594, "num_input_tokens_seen": 100154928, "step": 82555 }, { "epoch": 9.194787838289342, "grad_norm": 0.14149224758148193, "learning_rate": 9.813042570330306e-07, "loss": 0.4722, "num_input_tokens_seen": 100160752, "step": 82560 }, { "epoch": 9.19534469317296, "grad_norm": 0.12627895176410675, "learning_rate": 9.799565797373872e-07, "loss": 0.4622, "num_input_tokens_seen": 100166928, "step": 82565 }, { "epoch": 9.195901548056577, "grad_norm": 0.1574714332818985, "learning_rate": 9.78609809995959e-07, "loss": 0.4813, "num_input_tokens_seen": 100173296, "step": 82570 }, { "epoch": 9.196458402940193, "grad_norm": 0.10533185303211212, "learning_rate": 9.77263947859633e-07, "loss": 0.4516, "num_input_tokens_seen": 100179344, "step": 82575 }, { "epoch": 9.19701525782381, "grad_norm": 0.10019484907388687, "learning_rate": 9.759189933792573e-07, "loss": 0.4704, "num_input_tokens_seen": 100185360, "step": 82580 }, { "epoch": 9.197572112707428, "grad_norm": 0.0956181213259697, "learning_rate": 9.745749466056554e-07, "loss": 0.465, "num_input_tokens_seen": 100190672, "step": 82585 }, { "epoch": 9.198128967591046, "grad_norm": 0.11015114188194275, "learning_rate": 9.732318075896003e-07, "loss": 0.4596, "num_input_tokens_seen": 100195760, "step": 82590 }, { "epoch": 9.198685822474664, "grad_norm": 0.1285921186208725, "learning_rate": 9.718895763818459e-07, "loss": 0.469, "num_input_tokens_seen": 100202032, "step": 82595 }, { "epoch": 9.19924267735828, "grad_norm": 0.13275635242462158, "learning_rate": 9.7054825303311e-07, "loss": 0.4644, "num_input_tokens_seen": 100208144, "step": 82600 }, { "epoch": 9.199799532241897, "grad_norm": 0.08746078610420227, "learning_rate": 9.692078375940605e-07, "loss": 0.482, "num_input_tokens_seen": 100214288, "step": 82605 }, { "epoch": 9.200356387125515, "grad_norm": 0.10940922796726227, "learning_rate": 9.678683301153568e-07, "loss": 0.4656, "num_input_tokens_seen": 100220720, "step": 82610 }, { "epoch": 9.200913242009133, "grad_norm": 0.12016954272985458, "learning_rate": 9.665297306475946e-07, "loss": 0.4571, "num_input_tokens_seen": 100226960, "step": 82615 }, { "epoch": 9.20147009689275, "grad_norm": 0.09872303158044815, "learning_rate": 9.65192039241361e-07, "loss": 0.459, "num_input_tokens_seen": 100233136, "step": 82620 }, { "epoch": 9.202026951776368, "grad_norm": 0.13666865229606628, "learning_rate": 9.638552559471992e-07, "loss": 0.4579, "num_input_tokens_seen": 100239600, "step": 82625 }, { "epoch": 9.202583806659984, "grad_norm": 0.13081827759742737, "learning_rate": 9.625193808156075e-07, "loss": 0.4648, "num_input_tokens_seen": 100245904, "step": 82630 }, { "epoch": 9.203140661543602, "grad_norm": 0.09602917730808258, "learning_rate": 9.611844138970705e-07, "loss": 0.4641, "num_input_tokens_seen": 100252144, "step": 82635 }, { "epoch": 9.20369751642722, "grad_norm": 0.10304536670446396, "learning_rate": 9.5985035524202e-07, "loss": 0.4695, "num_input_tokens_seen": 100258608, "step": 82640 }, { "epoch": 9.204254371310837, "grad_norm": 0.08915025740861893, "learning_rate": 9.58517204900866e-07, "loss": 0.4712, "num_input_tokens_seen": 100264624, "step": 82645 }, { "epoch": 9.204811226194455, "grad_norm": 0.10749901086091995, "learning_rate": 9.571849629239738e-07, "loss": 0.4689, "num_input_tokens_seen": 100270672, "step": 82650 }, { "epoch": 9.20536808107807, "grad_norm": 0.12997381389141083, "learning_rate": 9.558536293616831e-07, "loss": 0.4576, "num_input_tokens_seen": 100276560, "step": 82655 }, { "epoch": 9.205924935961688, "grad_norm": 0.09795977920293808, "learning_rate": 9.54523204264296e-07, "loss": 0.4578, "num_input_tokens_seen": 100282864, "step": 82660 }, { "epoch": 9.206481790845306, "grad_norm": 0.12362906336784363, "learning_rate": 9.531936876820802e-07, "loss": 0.4612, "num_input_tokens_seen": 100289200, "step": 82665 }, { "epoch": 9.207038645728923, "grad_norm": 0.11958868056535721, "learning_rate": 9.518650796652706e-07, "loss": 0.4484, "num_input_tokens_seen": 100295568, "step": 82670 }, { "epoch": 9.207595500612541, "grad_norm": 0.17889752984046936, "learning_rate": 9.505373802640605e-07, "loss": 0.4526, "num_input_tokens_seen": 100302032, "step": 82675 }, { "epoch": 9.208152355496157, "grad_norm": 0.09788282215595245, "learning_rate": 9.492105895286207e-07, "loss": 0.4663, "num_input_tokens_seen": 100308208, "step": 82680 }, { "epoch": 9.208709210379775, "grad_norm": 0.10233446210622787, "learning_rate": 9.47884707509078e-07, "loss": 0.4546, "num_input_tokens_seen": 100314096, "step": 82685 }, { "epoch": 9.209266065263392, "grad_norm": 0.12173542380332947, "learning_rate": 9.465597342555283e-07, "loss": 0.4558, "num_input_tokens_seen": 100320368, "step": 82690 }, { "epoch": 9.20982292014701, "grad_norm": 0.11981765180826187, "learning_rate": 9.452356698180398e-07, "loss": 0.4537, "num_input_tokens_seen": 100326352, "step": 82695 }, { "epoch": 9.210379775030628, "grad_norm": 0.1430869847536087, "learning_rate": 9.43912514246631e-07, "loss": 0.4582, "num_input_tokens_seen": 100332432, "step": 82700 }, { "epoch": 9.210936629914244, "grad_norm": 0.12790042161941528, "learning_rate": 9.425902675913006e-07, "loss": 0.4567, "num_input_tokens_seen": 100338704, "step": 82705 }, { "epoch": 9.211493484797861, "grad_norm": 0.10461200028657913, "learning_rate": 9.412689299020033e-07, "loss": 0.4578, "num_input_tokens_seen": 100344656, "step": 82710 }, { "epoch": 9.212050339681479, "grad_norm": 0.09285156428813934, "learning_rate": 9.399485012286713e-07, "loss": 0.455, "num_input_tokens_seen": 100350864, "step": 82715 }, { "epoch": 9.212607194565097, "grad_norm": 0.10902819782495499, "learning_rate": 9.386289816211841e-07, "loss": 0.4677, "num_input_tokens_seen": 100356784, "step": 82720 }, { "epoch": 9.213164049448714, "grad_norm": 0.12016580253839493, "learning_rate": 9.373103711294018e-07, "loss": 0.4632, "num_input_tokens_seen": 100362928, "step": 82725 }, { "epoch": 9.21372090433233, "grad_norm": 0.08987969160079956, "learning_rate": 9.359926698031512e-07, "loss": 0.4509, "num_input_tokens_seen": 100368912, "step": 82730 }, { "epoch": 9.214277759215948, "grad_norm": 0.12089140713214874, "learning_rate": 9.346758776922093e-07, "loss": 0.4617, "num_input_tokens_seen": 100375056, "step": 82735 }, { "epoch": 9.214834614099566, "grad_norm": 0.1383190155029297, "learning_rate": 9.33359994846339e-07, "loss": 0.4605, "num_input_tokens_seen": 100381200, "step": 82740 }, { "epoch": 9.215391468983183, "grad_norm": 0.11976925283670425, "learning_rate": 9.320450213152476e-07, "loss": 0.4748, "num_input_tokens_seen": 100387376, "step": 82745 }, { "epoch": 9.215948323866801, "grad_norm": 0.1346651315689087, "learning_rate": 9.307309571486289e-07, "loss": 0.4628, "num_input_tokens_seen": 100392912, "step": 82750 }, { "epoch": 9.216505178750417, "grad_norm": 0.0971110612154007, "learning_rate": 9.294178023961292e-07, "loss": 0.4699, "num_input_tokens_seen": 100398864, "step": 82755 }, { "epoch": 9.217062033634035, "grad_norm": 0.09475810080766678, "learning_rate": 9.281055571073588e-07, "loss": 0.4483, "num_input_tokens_seen": 100404464, "step": 82760 }, { "epoch": 9.217618888517652, "grad_norm": 0.10455728322267532, "learning_rate": 9.267942213319087e-07, "loss": 0.468, "num_input_tokens_seen": 100410960, "step": 82765 }, { "epoch": 9.21817574340127, "grad_norm": 0.11575314402580261, "learning_rate": 9.254837951193141e-07, "loss": 0.4637, "num_input_tokens_seen": 100416944, "step": 82770 }, { "epoch": 9.218732598284888, "grad_norm": 0.08201851695775986, "learning_rate": 9.241742785190938e-07, "loss": 0.456, "num_input_tokens_seen": 100423312, "step": 82775 }, { "epoch": 9.219289453168503, "grad_norm": 0.1123991534113884, "learning_rate": 9.228656715807249e-07, "loss": 0.4537, "num_input_tokens_seen": 100429136, "step": 82780 }, { "epoch": 9.219846308052121, "grad_norm": 0.10931510478258133, "learning_rate": 9.215579743536484e-07, "loss": 0.4579, "num_input_tokens_seen": 100435344, "step": 82785 }, { "epoch": 9.220403162935739, "grad_norm": 0.10173647105693817, "learning_rate": 9.202511868872777e-07, "loss": 0.4524, "num_input_tokens_seen": 100441040, "step": 82790 }, { "epoch": 9.220960017819356, "grad_norm": 0.11431394517421722, "learning_rate": 9.189453092309785e-07, "loss": 0.4628, "num_input_tokens_seen": 100447184, "step": 82795 }, { "epoch": 9.221516872702974, "grad_norm": 0.09907562285661697, "learning_rate": 9.176403414341006e-07, "loss": 0.4595, "num_input_tokens_seen": 100453488, "step": 82800 }, { "epoch": 9.22207372758659, "grad_norm": 0.09412115067243576, "learning_rate": 9.163362835459488e-07, "loss": 0.4722, "num_input_tokens_seen": 100459376, "step": 82805 }, { "epoch": 9.222630582470208, "grad_norm": 0.0850074514746666, "learning_rate": 9.150331356157865e-07, "loss": 0.4727, "num_input_tokens_seen": 100465360, "step": 82810 }, { "epoch": 9.223187437353825, "grad_norm": 0.09401821345090866, "learning_rate": 9.137308976928632e-07, "loss": 0.4638, "num_input_tokens_seen": 100471440, "step": 82815 }, { "epoch": 9.223744292237443, "grad_norm": 0.11053664237260818, "learning_rate": 9.124295698263702e-07, "loss": 0.4634, "num_input_tokens_seen": 100477104, "step": 82820 }, { "epoch": 9.22430114712106, "grad_norm": 0.13767974078655243, "learning_rate": 9.111291520654819e-07, "loss": 0.4675, "num_input_tokens_seen": 100483408, "step": 82825 }, { "epoch": 9.224858002004678, "grad_norm": 0.08952652662992477, "learning_rate": 9.098296444593285e-07, "loss": 0.4546, "num_input_tokens_seen": 100489520, "step": 82830 }, { "epoch": 9.225414856888294, "grad_norm": 0.09958449751138687, "learning_rate": 9.085310470570124e-07, "loss": 0.4477, "num_input_tokens_seen": 100495216, "step": 82835 }, { "epoch": 9.225971711771912, "grad_norm": 0.11122038215398788, "learning_rate": 9.072333599076028e-07, "loss": 0.4686, "num_input_tokens_seen": 100501104, "step": 82840 }, { "epoch": 9.22652856665553, "grad_norm": 0.08927959948778152, "learning_rate": 9.059365830601213e-07, "loss": 0.4552, "num_input_tokens_seen": 100507280, "step": 82845 }, { "epoch": 9.227085421539147, "grad_norm": 0.09024699777364731, "learning_rate": 9.046407165635706e-07, "loss": 0.4674, "num_input_tokens_seen": 100513200, "step": 82850 }, { "epoch": 9.227642276422765, "grad_norm": 0.11736173182725906, "learning_rate": 9.033457604669115e-07, "loss": 0.4732, "num_input_tokens_seen": 100519056, "step": 82855 }, { "epoch": 9.228199131306381, "grad_norm": 0.11332985013723373, "learning_rate": 9.020517148190716e-07, "loss": 0.457, "num_input_tokens_seen": 100525008, "step": 82860 }, { "epoch": 9.228755986189999, "grad_norm": 0.09467795491218567, "learning_rate": 9.007585796689394e-07, "loss": 0.4606, "num_input_tokens_seen": 100531216, "step": 82865 }, { "epoch": 9.229312841073616, "grad_norm": 0.11939982324838638, "learning_rate": 8.994663550653815e-07, "loss": 0.4684, "num_input_tokens_seen": 100537136, "step": 82870 }, { "epoch": 9.229869695957234, "grad_norm": 0.10876162350177765, "learning_rate": 8.981750410572199e-07, "loss": 0.4678, "num_input_tokens_seen": 100542768, "step": 82875 }, { "epoch": 9.230426550840852, "grad_norm": 0.1282006949186325, "learning_rate": 8.968846376932377e-07, "loss": 0.4613, "num_input_tokens_seen": 100549104, "step": 82880 }, { "epoch": 9.230983405724468, "grad_norm": 0.09211447834968567, "learning_rate": 8.955951450222017e-07, "loss": 0.4666, "num_input_tokens_seen": 100554480, "step": 82885 }, { "epoch": 9.231540260608085, "grad_norm": 0.10852772742509842, "learning_rate": 8.943065630928254e-07, "loss": 0.4614, "num_input_tokens_seen": 100560496, "step": 82890 }, { "epoch": 9.232097115491703, "grad_norm": 0.12097122520208359, "learning_rate": 8.930188919537952e-07, "loss": 0.4576, "num_input_tokens_seen": 100566736, "step": 82895 }, { "epoch": 9.23265397037532, "grad_norm": 0.1079537495970726, "learning_rate": 8.917321316537719e-07, "loss": 0.4455, "num_input_tokens_seen": 100572976, "step": 82900 }, { "epoch": 9.233210825258938, "grad_norm": 0.16050705313682556, "learning_rate": 8.904462822413611e-07, "loss": 0.4543, "num_input_tokens_seen": 100579024, "step": 82905 }, { "epoch": 9.233767680142554, "grad_norm": 0.11580623686313629, "learning_rate": 8.891613437651574e-07, "loss": 0.4447, "num_input_tokens_seen": 100585136, "step": 82910 }, { "epoch": 9.234324535026172, "grad_norm": 0.12185481935739517, "learning_rate": 8.878773162737025e-07, "loss": 0.4609, "num_input_tokens_seen": 100591024, "step": 82915 }, { "epoch": 9.23488138990979, "grad_norm": 0.1021997258067131, "learning_rate": 8.865941998155158e-07, "loss": 0.4546, "num_input_tokens_seen": 100597392, "step": 82920 }, { "epoch": 9.235438244793407, "grad_norm": 0.15457917749881744, "learning_rate": 8.853119944390726e-07, "loss": 0.4636, "num_input_tokens_seen": 100603376, "step": 82925 }, { "epoch": 9.235995099677025, "grad_norm": 0.13383717834949493, "learning_rate": 8.840307001928227e-07, "loss": 0.473, "num_input_tokens_seen": 100609744, "step": 82930 }, { "epoch": 9.23655195456064, "grad_norm": 0.09613163769245148, "learning_rate": 8.827503171251777e-07, "loss": 0.4586, "num_input_tokens_seen": 100616112, "step": 82935 }, { "epoch": 9.237108809444258, "grad_norm": 0.08642761409282684, "learning_rate": 8.8147084528451e-07, "loss": 0.4731, "num_input_tokens_seen": 100622320, "step": 82940 }, { "epoch": 9.237665664327876, "grad_norm": 0.12385574728250504, "learning_rate": 8.801922847191696e-07, "loss": 0.4557, "num_input_tokens_seen": 100628240, "step": 82945 }, { "epoch": 9.238222519211494, "grad_norm": 0.11472326517105103, "learning_rate": 8.789146354774569e-07, "loss": 0.4553, "num_input_tokens_seen": 100633680, "step": 82950 }, { "epoch": 9.238779374095111, "grad_norm": 0.12090251594781876, "learning_rate": 8.776378976076527e-07, "loss": 0.4688, "num_input_tokens_seen": 100639856, "step": 82955 }, { "epoch": 9.239336228978727, "grad_norm": 0.11134164035320282, "learning_rate": 8.763620711579906e-07, "loss": 0.4434, "num_input_tokens_seen": 100646000, "step": 82960 }, { "epoch": 9.239893083862345, "grad_norm": 0.08046501874923706, "learning_rate": 8.750871561766766e-07, "loss": 0.4571, "num_input_tokens_seen": 100652432, "step": 82965 }, { "epoch": 9.240449938745963, "grad_norm": 0.09707046300172806, "learning_rate": 8.738131527118831e-07, "loss": 0.4507, "num_input_tokens_seen": 100658384, "step": 82970 }, { "epoch": 9.24100679362958, "grad_norm": 0.0911962166428566, "learning_rate": 8.72540060811744e-07, "loss": 0.459, "num_input_tokens_seen": 100664432, "step": 82975 }, { "epoch": 9.241563648513198, "grad_norm": 0.11150035262107849, "learning_rate": 8.712678805243624e-07, "loss": 0.4583, "num_input_tokens_seen": 100670576, "step": 82980 }, { "epoch": 9.242120503396816, "grad_norm": 0.12461278587579727, "learning_rate": 8.699966118978025e-07, "loss": 0.4521, "num_input_tokens_seen": 100676528, "step": 82985 }, { "epoch": 9.242677358280432, "grad_norm": 0.09868433326482773, "learning_rate": 8.687262549801039e-07, "loss": 0.4623, "num_input_tokens_seen": 100682736, "step": 82990 }, { "epoch": 9.24323421316405, "grad_norm": 0.1072009801864624, "learning_rate": 8.67456809819256e-07, "loss": 0.4665, "num_input_tokens_seen": 100688304, "step": 82995 }, { "epoch": 9.243791068047667, "grad_norm": 0.10535453259944916, "learning_rate": 8.661882764632257e-07, "loss": 0.4673, "num_input_tokens_seen": 100694416, "step": 83000 }, { "epoch": 9.244347922931285, "grad_norm": 0.16281656920909882, "learning_rate": 8.649206549599443e-07, "loss": 0.461, "num_input_tokens_seen": 100700432, "step": 83005 }, { "epoch": 9.244904777814902, "grad_norm": 0.09009489417076111, "learning_rate": 8.636539453573039e-07, "loss": 0.4631, "num_input_tokens_seen": 100706576, "step": 83010 }, { "epoch": 9.245461632698518, "grad_norm": 0.14534662663936615, "learning_rate": 8.623881477031693e-07, "loss": 0.4566, "num_input_tokens_seen": 100712304, "step": 83015 }, { "epoch": 9.246018487582136, "grad_norm": 0.17606639862060547, "learning_rate": 8.611232620453602e-07, "loss": 0.4496, "num_input_tokens_seen": 100718576, "step": 83020 }, { "epoch": 9.246575342465754, "grad_norm": 0.07823006063699722, "learning_rate": 8.59859288431672e-07, "loss": 0.4643, "num_input_tokens_seen": 100724336, "step": 83025 }, { "epoch": 9.247132197349371, "grad_norm": 0.12272071093320847, "learning_rate": 8.585962269098608e-07, "loss": 0.4616, "num_input_tokens_seen": 100730608, "step": 83030 }, { "epoch": 9.247689052232989, "grad_norm": 0.1028573289513588, "learning_rate": 8.573340775276495e-07, "loss": 0.4569, "num_input_tokens_seen": 100736752, "step": 83035 }, { "epoch": 9.248245907116605, "grad_norm": 0.11750319600105286, "learning_rate": 8.560728403327279e-07, "loss": 0.4495, "num_input_tokens_seen": 100742352, "step": 83040 }, { "epoch": 9.248802762000222, "grad_norm": 0.09534755349159241, "learning_rate": 8.548125153727438e-07, "loss": 0.4696, "num_input_tokens_seen": 100748016, "step": 83045 }, { "epoch": 9.24935961688384, "grad_norm": 0.10045182704925537, "learning_rate": 8.535531026953231e-07, "loss": 0.4549, "num_input_tokens_seen": 100753936, "step": 83050 }, { "epoch": 9.249916471767458, "grad_norm": 0.10191436111927032, "learning_rate": 8.522946023480416e-07, "loss": 0.4721, "num_input_tokens_seen": 100759600, "step": 83055 }, { "epoch": 9.250473326651075, "grad_norm": 0.15649282932281494, "learning_rate": 8.510370143784586e-07, "loss": 0.4604, "num_input_tokens_seen": 100765424, "step": 83060 }, { "epoch": 9.251030181534691, "grad_norm": 0.10183557868003845, "learning_rate": 8.497803388340886e-07, "loss": 0.4639, "num_input_tokens_seen": 100771792, "step": 83065 }, { "epoch": 9.251587036418309, "grad_norm": 0.1277988851070404, "learning_rate": 8.485245757624077e-07, "loss": 0.4595, "num_input_tokens_seen": 100778032, "step": 83070 }, { "epoch": 9.252143891301927, "grad_norm": 0.12593644857406616, "learning_rate": 8.472697252108669e-07, "loss": 0.4521, "num_input_tokens_seen": 100783824, "step": 83075 }, { "epoch": 9.252700746185544, "grad_norm": 0.12408419698476791, "learning_rate": 8.460157872268754e-07, "loss": 0.4548, "num_input_tokens_seen": 100790320, "step": 83080 }, { "epoch": 9.253257601069162, "grad_norm": 0.118329256772995, "learning_rate": 8.447627618578174e-07, "loss": 0.4626, "num_input_tokens_seen": 100796432, "step": 83085 }, { "epoch": 9.253814455952778, "grad_norm": 0.10158950835466385, "learning_rate": 8.435106491510247e-07, "loss": 0.4516, "num_input_tokens_seen": 100802192, "step": 83090 }, { "epoch": 9.254371310836396, "grad_norm": 0.1529131978750229, "learning_rate": 8.42259449153815e-07, "loss": 0.4685, "num_input_tokens_seen": 100808496, "step": 83095 }, { "epoch": 9.254928165720013, "grad_norm": 0.10652995109558105, "learning_rate": 8.410091619134641e-07, "loss": 0.4583, "num_input_tokens_seen": 100813808, "step": 83100 }, { "epoch": 9.255485020603631, "grad_norm": 0.10342909395694733, "learning_rate": 8.397597874772067e-07, "loss": 0.4579, "num_input_tokens_seen": 100819728, "step": 83105 }, { "epoch": 9.256041875487249, "grad_norm": 0.11895714700222015, "learning_rate": 8.385113258922495e-07, "loss": 0.4455, "num_input_tokens_seen": 100825648, "step": 83110 }, { "epoch": 9.256598730370865, "grad_norm": 0.13820137083530426, "learning_rate": 8.372637772057628e-07, "loss": 0.459, "num_input_tokens_seen": 100831760, "step": 83115 }, { "epoch": 9.257155585254482, "grad_norm": 0.08887704461812973, "learning_rate": 8.360171414648815e-07, "loss": 0.4704, "num_input_tokens_seen": 100838128, "step": 83120 }, { "epoch": 9.2577124401381, "grad_norm": 0.11705861240625381, "learning_rate": 8.347714187167149e-07, "loss": 0.4678, "num_input_tokens_seen": 100844112, "step": 83125 }, { "epoch": 9.258269295021718, "grad_norm": 0.2087986320257187, "learning_rate": 8.335266090083227e-07, "loss": 0.4583, "num_input_tokens_seen": 100850288, "step": 83130 }, { "epoch": 9.258826149905335, "grad_norm": 0.1178126409649849, "learning_rate": 8.322827123867421e-07, "loss": 0.453, "num_input_tokens_seen": 100856048, "step": 83135 }, { "epoch": 9.259383004788951, "grad_norm": 0.08845699578523636, "learning_rate": 8.310397288989691e-07, "loss": 0.4683, "num_input_tokens_seen": 100862160, "step": 83140 }, { "epoch": 9.259939859672569, "grad_norm": 0.08774880319833755, "learning_rate": 8.297976585919686e-07, "loss": 0.46, "num_input_tokens_seen": 100868464, "step": 83145 }, { "epoch": 9.260496714556187, "grad_norm": 0.11942578107118607, "learning_rate": 8.2855650151267e-07, "loss": 0.4572, "num_input_tokens_seen": 100874384, "step": 83150 }, { "epoch": 9.261053569439804, "grad_norm": 0.10403475165367126, "learning_rate": 8.273162577079662e-07, "loss": 0.4588, "num_input_tokens_seen": 100880592, "step": 83155 }, { "epoch": 9.261610424323422, "grad_norm": 0.1437927484512329, "learning_rate": 8.260769272247198e-07, "loss": 0.4651, "num_input_tokens_seen": 100886672, "step": 83160 }, { "epoch": 9.262167279207038, "grad_norm": 0.11484406888484955, "learning_rate": 8.248385101097572e-07, "loss": 0.4649, "num_input_tokens_seen": 100892688, "step": 83165 }, { "epoch": 9.262724134090655, "grad_norm": 0.0972190573811531, "learning_rate": 8.236010064098687e-07, "loss": 0.4604, "num_input_tokens_seen": 100898992, "step": 83170 }, { "epoch": 9.263280988974273, "grad_norm": 0.09037211537361145, "learning_rate": 8.223644161718141e-07, "loss": 0.451, "num_input_tokens_seen": 100904976, "step": 83175 }, { "epoch": 9.26383784385789, "grad_norm": 0.07671894878149033, "learning_rate": 8.21128739442309e-07, "loss": 0.4558, "num_input_tokens_seen": 100910896, "step": 83180 }, { "epoch": 9.264394698741508, "grad_norm": 0.13942797482013702, "learning_rate": 8.198939762680463e-07, "loss": 0.4601, "num_input_tokens_seen": 100917264, "step": 83185 }, { "epoch": 9.264951553625126, "grad_norm": 0.13838933408260345, "learning_rate": 8.186601266956778e-07, "loss": 0.4471, "num_input_tokens_seen": 100922800, "step": 83190 }, { "epoch": 9.265508408508742, "grad_norm": 0.0882141962647438, "learning_rate": 8.174271907718245e-07, "loss": 0.4618, "num_input_tokens_seen": 100928880, "step": 83195 }, { "epoch": 9.26606526339236, "grad_norm": 0.09030202776193619, "learning_rate": 8.161951685430658e-07, "loss": 0.4627, "num_input_tokens_seen": 100935248, "step": 83200 }, { "epoch": 9.266622118275977, "grad_norm": 0.09612264484167099, "learning_rate": 8.149640600559533e-07, "loss": 0.4685, "num_input_tokens_seen": 100941168, "step": 83205 }, { "epoch": 9.267178973159595, "grad_norm": 0.15158697962760925, "learning_rate": 8.137338653570081e-07, "loss": 0.4685, "num_input_tokens_seen": 100947248, "step": 83210 }, { "epoch": 9.267735828043213, "grad_norm": 0.09435370564460754, "learning_rate": 8.125045844927016e-07, "loss": 0.457, "num_input_tokens_seen": 100953520, "step": 83215 }, { "epoch": 9.268292682926829, "grad_norm": 0.09455940127372742, "learning_rate": 8.112762175094879e-07, "loss": 0.4561, "num_input_tokens_seen": 100959408, "step": 83220 }, { "epoch": 9.268849537810446, "grad_norm": 0.08830554783344269, "learning_rate": 8.100487644537747e-07, "loss": 0.4605, "num_input_tokens_seen": 100965456, "step": 83225 }, { "epoch": 9.269406392694064, "grad_norm": 0.15781500935554504, "learning_rate": 8.088222253719386e-07, "loss": 0.4481, "num_input_tokens_seen": 100970224, "step": 83230 }, { "epoch": 9.269963247577682, "grad_norm": 0.10068745166063309, "learning_rate": 8.075966003103231e-07, "loss": 0.4555, "num_input_tokens_seen": 100976592, "step": 83235 }, { "epoch": 9.2705201024613, "grad_norm": 0.10044803470373154, "learning_rate": 8.063718893152356e-07, "loss": 0.4681, "num_input_tokens_seen": 100982352, "step": 83240 }, { "epoch": 9.271076957344915, "grad_norm": 0.1316845566034317, "learning_rate": 8.051480924329529e-07, "loss": 0.4561, "num_input_tokens_seen": 100988144, "step": 83245 }, { "epoch": 9.271633812228533, "grad_norm": 0.10666331648826599, "learning_rate": 8.039252097097105e-07, "loss": 0.467, "num_input_tokens_seen": 100993808, "step": 83250 }, { "epoch": 9.27219066711215, "grad_norm": 0.07185465842485428, "learning_rate": 8.027032411917157e-07, "loss": 0.451, "num_input_tokens_seen": 100999472, "step": 83255 }, { "epoch": 9.272747521995768, "grad_norm": 0.10930604487657547, "learning_rate": 8.014821869251315e-07, "loss": 0.469, "num_input_tokens_seen": 101005776, "step": 83260 }, { "epoch": 9.273304376879386, "grad_norm": 0.1219385415315628, "learning_rate": 8.002620469561045e-07, "loss": 0.4438, "num_input_tokens_seen": 101011984, "step": 83265 }, { "epoch": 9.273861231763002, "grad_norm": 0.18796293437480927, "learning_rate": 7.990428213307227e-07, "loss": 0.4616, "num_input_tokens_seen": 101017936, "step": 83270 }, { "epoch": 9.27441808664662, "grad_norm": 0.07459849119186401, "learning_rate": 7.978245100950632e-07, "loss": 0.4624, "num_input_tokens_seen": 101024016, "step": 83275 }, { "epoch": 9.274974941530237, "grad_norm": 0.11555394530296326, "learning_rate": 7.966071132951531e-07, "loss": 0.4719, "num_input_tokens_seen": 101030064, "step": 83280 }, { "epoch": 9.275531796413855, "grad_norm": 0.17577116191387177, "learning_rate": 7.953906309769887e-07, "loss": 0.4682, "num_input_tokens_seen": 101036528, "step": 83285 }, { "epoch": 9.276088651297473, "grad_norm": 0.09889233112335205, "learning_rate": 7.941750631865336e-07, "loss": 0.46, "num_input_tokens_seen": 101042288, "step": 83290 }, { "epoch": 9.276645506181088, "grad_norm": 0.11887340247631073, "learning_rate": 7.929604099697174e-07, "loss": 0.4643, "num_input_tokens_seen": 101048400, "step": 83295 }, { "epoch": 9.277202361064706, "grad_norm": 0.11711391806602478, "learning_rate": 7.917466713724286e-07, "loss": 0.4689, "num_input_tokens_seen": 101054320, "step": 83300 }, { "epoch": 9.277759215948324, "grad_norm": 0.11936534196138382, "learning_rate": 7.905338474405333e-07, "loss": 0.4643, "num_input_tokens_seen": 101060560, "step": 83305 }, { "epoch": 9.278316070831941, "grad_norm": 0.11195144057273865, "learning_rate": 7.893219382198502e-07, "loss": 0.4625, "num_input_tokens_seen": 101066192, "step": 83310 }, { "epoch": 9.278872925715559, "grad_norm": 0.13327661156654358, "learning_rate": 7.881109437561762e-07, "loss": 0.4546, "num_input_tokens_seen": 101071792, "step": 83315 }, { "epoch": 9.279429780599175, "grad_norm": 0.1021135151386261, "learning_rate": 7.869008640952552e-07, "loss": 0.4639, "num_input_tokens_seen": 101077904, "step": 83320 }, { "epoch": 9.279986635482793, "grad_norm": 0.09917745739221573, "learning_rate": 7.856916992828173e-07, "loss": 0.4575, "num_input_tokens_seen": 101083952, "step": 83325 }, { "epoch": 9.28054349036641, "grad_norm": 0.1528884917497635, "learning_rate": 7.844834493645454e-07, "loss": 0.454, "num_input_tokens_seen": 101089968, "step": 83330 }, { "epoch": 9.281100345250028, "grad_norm": 0.10995738953351974, "learning_rate": 7.83276114386089e-07, "loss": 0.4519, "num_input_tokens_seen": 101095984, "step": 83335 }, { "epoch": 9.281657200133646, "grad_norm": 0.0925619825720787, "learning_rate": 7.820696943930699e-07, "loss": 0.4733, "num_input_tokens_seen": 101102096, "step": 83340 }, { "epoch": 9.282214055017263, "grad_norm": 0.08987019211053848, "learning_rate": 7.808641894310659e-07, "loss": 0.4563, "num_input_tokens_seen": 101108176, "step": 83345 }, { "epoch": 9.28277090990088, "grad_norm": 0.11525662988424301, "learning_rate": 7.796595995456318e-07, "loss": 0.4653, "num_input_tokens_seen": 101113776, "step": 83350 }, { "epoch": 9.283327764784497, "grad_norm": 0.10507645457983017, "learning_rate": 7.784559247822703e-07, "loss": 0.4637, "num_input_tokens_seen": 101119856, "step": 83355 }, { "epoch": 9.283884619668115, "grad_norm": 0.15140029788017273, "learning_rate": 7.772531651864673e-07, "loss": 0.4551, "num_input_tokens_seen": 101126224, "step": 83360 }, { "epoch": 9.284441474551732, "grad_norm": 0.1224638819694519, "learning_rate": 7.760513208036669e-07, "loss": 0.4724, "num_input_tokens_seen": 101132304, "step": 83365 }, { "epoch": 9.28499832943535, "grad_norm": 0.09769783169031143, "learning_rate": 7.748503916792743e-07, "loss": 0.4609, "num_input_tokens_seen": 101138672, "step": 83370 }, { "epoch": 9.285555184318966, "grad_norm": 0.12146534770727158, "learning_rate": 7.736503778586701e-07, "loss": 0.4567, "num_input_tokens_seen": 101144912, "step": 83375 }, { "epoch": 9.286112039202584, "grad_norm": 0.1287495642900467, "learning_rate": 7.724512793871874e-07, "loss": 0.4755, "num_input_tokens_seen": 101150672, "step": 83380 }, { "epoch": 9.286668894086201, "grad_norm": 0.10972993820905685, "learning_rate": 7.712530963101427e-07, "loss": 0.4563, "num_input_tokens_seen": 101156976, "step": 83385 }, { "epoch": 9.287225748969819, "grad_norm": 0.10483726859092712, "learning_rate": 7.700558286727944e-07, "loss": 0.4578, "num_input_tokens_seen": 101162960, "step": 83390 }, { "epoch": 9.287782603853437, "grad_norm": 0.0815260037779808, "learning_rate": 7.688594765203893e-07, "loss": 0.4825, "num_input_tokens_seen": 101168560, "step": 83395 }, { "epoch": 9.288339458737052, "grad_norm": 0.11107385158538818, "learning_rate": 7.67664039898125e-07, "loss": 0.4756, "num_input_tokens_seen": 101174608, "step": 83400 }, { "epoch": 9.28889631362067, "grad_norm": 0.12191854417324066, "learning_rate": 7.664695188511705e-07, "loss": 0.467, "num_input_tokens_seen": 101180784, "step": 83405 }, { "epoch": 9.289453168504288, "grad_norm": 0.1032543033361435, "learning_rate": 7.652759134246568e-07, "loss": 0.4567, "num_input_tokens_seen": 101187024, "step": 83410 }, { "epoch": 9.290010023387905, "grad_norm": 0.09495308995246887, "learning_rate": 7.640832236636836e-07, "loss": 0.4488, "num_input_tokens_seen": 101193104, "step": 83415 }, { "epoch": 9.290566878271523, "grad_norm": 0.10112682729959488, "learning_rate": 7.628914496133149e-07, "loss": 0.4699, "num_input_tokens_seen": 101199152, "step": 83420 }, { "epoch": 9.291123733155139, "grad_norm": 0.07450678944587708, "learning_rate": 7.617005913185759e-07, "loss": 0.4629, "num_input_tokens_seen": 101205008, "step": 83425 }, { "epoch": 9.291680588038757, "grad_norm": 0.11802805960178375, "learning_rate": 7.60510648824464e-07, "loss": 0.4718, "num_input_tokens_seen": 101211248, "step": 83430 }, { "epoch": 9.292237442922374, "grad_norm": 0.10283784568309784, "learning_rate": 7.593216221759431e-07, "loss": 0.4672, "num_input_tokens_seen": 101217392, "step": 83435 }, { "epoch": 9.292794297805992, "grad_norm": 0.106919065117836, "learning_rate": 7.5813351141793e-07, "loss": 0.456, "num_input_tokens_seen": 101223440, "step": 83440 }, { "epoch": 9.29335115268961, "grad_norm": 0.11983369290828705, "learning_rate": 7.569463165953223e-07, "loss": 0.463, "num_input_tokens_seen": 101229296, "step": 83445 }, { "epoch": 9.293908007573226, "grad_norm": 0.10274446755647659, "learning_rate": 7.557600377529728e-07, "loss": 0.4552, "num_input_tokens_seen": 101235312, "step": 83450 }, { "epoch": 9.294464862456843, "grad_norm": 0.12596367299556732, "learning_rate": 7.545746749357041e-07, "loss": 0.4504, "num_input_tokens_seen": 101241584, "step": 83455 }, { "epoch": 9.295021717340461, "grad_norm": 0.0989588052034378, "learning_rate": 7.533902281882998e-07, "loss": 0.4629, "num_input_tokens_seen": 101247344, "step": 83460 }, { "epoch": 9.295578572224079, "grad_norm": 0.09833517670631409, "learning_rate": 7.522066975555159e-07, "loss": 0.4647, "num_input_tokens_seen": 101253488, "step": 83465 }, { "epoch": 9.296135427107696, "grad_norm": 0.12553711235523224, "learning_rate": 7.51024083082072e-07, "loss": 0.4425, "num_input_tokens_seen": 101259600, "step": 83470 }, { "epoch": 9.296692281991312, "grad_norm": 0.10687029361724854, "learning_rate": 7.498423848126463e-07, "loss": 0.4562, "num_input_tokens_seen": 101265744, "step": 83475 }, { "epoch": 9.29724913687493, "grad_norm": 0.10622452199459076, "learning_rate": 7.486616027918892e-07, "loss": 0.4591, "num_input_tokens_seen": 101271664, "step": 83480 }, { "epoch": 9.297805991758548, "grad_norm": 0.13786832988262177, "learning_rate": 7.474817370644122e-07, "loss": 0.4676, "num_input_tokens_seen": 101277680, "step": 83485 }, { "epoch": 9.298362846642165, "grad_norm": 0.10916271060705185, "learning_rate": 7.463027876747963e-07, "loss": 0.4528, "num_input_tokens_seen": 101283952, "step": 83490 }, { "epoch": 9.298919701525783, "grad_norm": 0.1468666046857834, "learning_rate": 7.451247546675866e-07, "loss": 0.4747, "num_input_tokens_seen": 101289872, "step": 83495 }, { "epoch": 9.299476556409399, "grad_norm": 0.13198770582675934, "learning_rate": 7.439476380872917e-07, "loss": 0.4571, "num_input_tokens_seen": 101295952, "step": 83500 }, { "epoch": 9.300033411293017, "grad_norm": 0.09604118764400482, "learning_rate": 7.427714379783874e-07, "loss": 0.4688, "num_input_tokens_seen": 101302224, "step": 83505 }, { "epoch": 9.300590266176634, "grad_norm": 0.09440281242132187, "learning_rate": 7.415961543853155e-07, "loss": 0.466, "num_input_tokens_seen": 101308496, "step": 83510 }, { "epoch": 9.301147121060252, "grad_norm": 0.10239912569522858, "learning_rate": 7.404217873524799e-07, "loss": 0.4717, "num_input_tokens_seen": 101314512, "step": 83515 }, { "epoch": 9.30170397594387, "grad_norm": 0.16009670495986938, "learning_rate": 7.392483369242531e-07, "loss": 0.4615, "num_input_tokens_seen": 101320432, "step": 83520 }, { "epoch": 9.302260830827485, "grad_norm": 0.1624077558517456, "learning_rate": 7.380758031449691e-07, "loss": 0.4613, "num_input_tokens_seen": 101326224, "step": 83525 }, { "epoch": 9.302817685711103, "grad_norm": 0.10695750266313553, "learning_rate": 7.369041860589371e-07, "loss": 0.4719, "num_input_tokens_seen": 101332304, "step": 83530 }, { "epoch": 9.30337454059472, "grad_norm": 0.13431650400161743, "learning_rate": 7.357334857104159e-07, "loss": 0.4675, "num_input_tokens_seen": 101338256, "step": 83535 }, { "epoch": 9.303931395478338, "grad_norm": 0.10830232501029968, "learning_rate": 7.345637021436452e-07, "loss": 0.4582, "num_input_tokens_seen": 101344144, "step": 83540 }, { "epoch": 9.304488250361956, "grad_norm": 0.12815697491168976, "learning_rate": 7.333948354028175e-07, "loss": 0.477, "num_input_tokens_seen": 101350320, "step": 83545 }, { "epoch": 9.305045105245574, "grad_norm": 0.1088390126824379, "learning_rate": 7.322268855321001e-07, "loss": 0.4612, "num_input_tokens_seen": 101356752, "step": 83550 }, { "epoch": 9.30560196012919, "grad_norm": 0.11662567406892776, "learning_rate": 7.310598525756218e-07, "loss": 0.4659, "num_input_tokens_seen": 101362992, "step": 83555 }, { "epoch": 9.306158815012807, "grad_norm": 0.12931959331035614, "learning_rate": 7.298937365774722e-07, "loss": 0.467, "num_input_tokens_seen": 101369072, "step": 83560 }, { "epoch": 9.306715669896425, "grad_norm": 0.14076991379261017, "learning_rate": 7.287285375817188e-07, "loss": 0.4552, "num_input_tokens_seen": 101375024, "step": 83565 }, { "epoch": 9.307272524780043, "grad_norm": 0.10214418172836304, "learning_rate": 7.27564255632382e-07, "loss": 0.4506, "num_input_tokens_seen": 101380432, "step": 83570 }, { "epoch": 9.30782937966366, "grad_norm": 0.08265972137451172, "learning_rate": 7.264008907734515e-07, "loss": 0.4587, "num_input_tokens_seen": 101386320, "step": 83575 }, { "epoch": 9.308386234547276, "grad_norm": 0.1474006325006485, "learning_rate": 7.25238443048884e-07, "loss": 0.4555, "num_input_tokens_seen": 101392496, "step": 83580 }, { "epoch": 9.308943089430894, "grad_norm": 0.09567897021770477, "learning_rate": 7.240769125026025e-07, "loss": 0.4661, "num_input_tokens_seen": 101398576, "step": 83585 }, { "epoch": 9.309499944314512, "grad_norm": 0.12434723973274231, "learning_rate": 7.229162991784888e-07, "loss": 0.4752, "num_input_tokens_seen": 101405072, "step": 83590 }, { "epoch": 9.31005679919813, "grad_norm": 0.11924629658460617, "learning_rate": 7.217566031203965e-07, "loss": 0.4705, "num_input_tokens_seen": 101411184, "step": 83595 }, { "epoch": 9.310613654081747, "grad_norm": 0.11718256026506424, "learning_rate": 7.205978243721462e-07, "loss": 0.4664, "num_input_tokens_seen": 101417392, "step": 83600 }, { "epoch": 9.311170508965363, "grad_norm": 0.14264224469661713, "learning_rate": 7.194399629775139e-07, "loss": 0.4686, "num_input_tokens_seen": 101423376, "step": 83605 }, { "epoch": 9.31172736384898, "grad_norm": 0.12099230289459229, "learning_rate": 7.182830189802509e-07, "loss": 0.4659, "num_input_tokens_seen": 101429328, "step": 83610 }, { "epoch": 9.312284218732598, "grad_norm": 0.12196717411279678, "learning_rate": 7.17126992424072e-07, "loss": 0.4557, "num_input_tokens_seen": 101435504, "step": 83615 }, { "epoch": 9.312841073616216, "grad_norm": 0.1677166372537613, "learning_rate": 7.159718833526536e-07, "loss": 0.4533, "num_input_tokens_seen": 101441200, "step": 83620 }, { "epoch": 9.313397928499834, "grad_norm": 0.08461591601371765, "learning_rate": 7.148176918096383e-07, "loss": 0.4699, "num_input_tokens_seen": 101447152, "step": 83625 }, { "epoch": 9.31395478338345, "grad_norm": 0.11287528276443481, "learning_rate": 7.136644178386359e-07, "loss": 0.4556, "num_input_tokens_seen": 101453584, "step": 83630 }, { "epoch": 9.314511638267067, "grad_norm": 0.08816711604595184, "learning_rate": 7.125120614832225e-07, "loss": 0.4521, "num_input_tokens_seen": 101459568, "step": 83635 }, { "epoch": 9.315068493150685, "grad_norm": 0.16702769696712494, "learning_rate": 7.113606227869357e-07, "loss": 0.4563, "num_input_tokens_seen": 101465744, "step": 83640 }, { "epoch": 9.315625348034303, "grad_norm": 0.07481423765420914, "learning_rate": 7.102101017932794e-07, "loss": 0.4628, "num_input_tokens_seen": 101471856, "step": 83645 }, { "epoch": 9.31618220291792, "grad_norm": 0.09361350536346436, "learning_rate": 7.0906049854573e-07, "loss": 0.4657, "num_input_tokens_seen": 101477936, "step": 83650 }, { "epoch": 9.316739057801536, "grad_norm": 0.09267599880695343, "learning_rate": 7.079118130877166e-07, "loss": 0.4467, "num_input_tokens_seen": 101484272, "step": 83655 }, { "epoch": 9.317295912685154, "grad_norm": 0.08741007000207901, "learning_rate": 7.067640454626434e-07, "loss": 0.4598, "num_input_tokens_seen": 101490256, "step": 83660 }, { "epoch": 9.317852767568771, "grad_norm": 0.12591594457626343, "learning_rate": 7.056171957138757e-07, "loss": 0.4658, "num_input_tokens_seen": 101496752, "step": 83665 }, { "epoch": 9.31840962245239, "grad_norm": 0.16000106930732727, "learning_rate": 7.044712638847428e-07, "loss": 0.4621, "num_input_tokens_seen": 101502704, "step": 83670 }, { "epoch": 9.318966477336007, "grad_norm": 0.10458994656801224, "learning_rate": 7.033262500185489e-07, "loss": 0.4607, "num_input_tokens_seen": 101508720, "step": 83675 }, { "epoch": 9.319523332219624, "grad_norm": 0.10476993769407272, "learning_rate": 7.02182154158551e-07, "loss": 0.4651, "num_input_tokens_seen": 101514576, "step": 83680 }, { "epoch": 9.32008018710324, "grad_norm": 0.11287456750869751, "learning_rate": 7.010389763479786e-07, "loss": 0.4711, "num_input_tokens_seen": 101520560, "step": 83685 }, { "epoch": 9.320637041986858, "grad_norm": 0.14288711547851562, "learning_rate": 6.998967166300191e-07, "loss": 0.4763, "num_input_tokens_seen": 101526320, "step": 83690 }, { "epoch": 9.321193896870476, "grad_norm": 0.1001843735575676, "learning_rate": 6.987553750478382e-07, "loss": 0.4619, "num_input_tokens_seen": 101532336, "step": 83695 }, { "epoch": 9.321750751754093, "grad_norm": 0.09761564433574677, "learning_rate": 6.976149516445541e-07, "loss": 0.4597, "num_input_tokens_seen": 101538320, "step": 83700 }, { "epoch": 9.322307606637711, "grad_norm": 0.16511599719524384, "learning_rate": 6.9647544646326e-07, "loss": 0.457, "num_input_tokens_seen": 101544560, "step": 83705 }, { "epoch": 9.322864461521327, "grad_norm": 0.11276314407587051, "learning_rate": 6.953368595470078e-07, "loss": 0.4713, "num_input_tokens_seen": 101550800, "step": 83710 }, { "epoch": 9.323421316404945, "grad_norm": 0.10884939134120941, "learning_rate": 6.941991909388157e-07, "loss": 0.4659, "num_input_tokens_seen": 101556912, "step": 83715 }, { "epoch": 9.323978171288562, "grad_norm": 0.10913010686635971, "learning_rate": 6.930624406816743e-07, "loss": 0.4552, "num_input_tokens_seen": 101563248, "step": 83720 }, { "epoch": 9.32453502617218, "grad_norm": 0.09020031988620758, "learning_rate": 6.919266088185244e-07, "loss": 0.4652, "num_input_tokens_seen": 101569392, "step": 83725 }, { "epoch": 9.325091881055798, "grad_norm": 0.10961920022964478, "learning_rate": 6.907916953922899e-07, "loss": 0.4515, "num_input_tokens_seen": 101575856, "step": 83730 }, { "epoch": 9.325648735939414, "grad_norm": 0.1466640830039978, "learning_rate": 6.896577004458449e-07, "loss": 0.4627, "num_input_tokens_seen": 101581200, "step": 83735 }, { "epoch": 9.326205590823031, "grad_norm": 0.14067962765693665, "learning_rate": 6.885246240220383e-07, "loss": 0.4679, "num_input_tokens_seen": 101587312, "step": 83740 }, { "epoch": 9.326762445706649, "grad_norm": 0.09082039445638657, "learning_rate": 6.873924661636833e-07, "loss": 0.4556, "num_input_tokens_seen": 101593392, "step": 83745 }, { "epoch": 9.327319300590267, "grad_norm": 0.12075646221637726, "learning_rate": 6.862612269135538e-07, "loss": 0.4614, "num_input_tokens_seen": 101599024, "step": 83750 }, { "epoch": 9.327876155473884, "grad_norm": 0.11722725629806519, "learning_rate": 6.851309063143934e-07, "loss": 0.4662, "num_input_tokens_seen": 101604656, "step": 83755 }, { "epoch": 9.3284330103575, "grad_norm": 0.1451149433851242, "learning_rate": 6.840015044089043e-07, "loss": 0.4553, "num_input_tokens_seen": 101611024, "step": 83760 }, { "epoch": 9.328989865241118, "grad_norm": 0.11945933848619461, "learning_rate": 6.828730212397688e-07, "loss": 0.4679, "num_input_tokens_seen": 101617168, "step": 83765 }, { "epoch": 9.329546720124736, "grad_norm": 0.07997823506593704, "learning_rate": 6.81745456849614e-07, "loss": 0.4511, "num_input_tokens_seen": 101623312, "step": 83770 }, { "epoch": 9.330103575008353, "grad_norm": 0.1466207504272461, "learning_rate": 6.806188112810475e-07, "loss": 0.4552, "num_input_tokens_seen": 101629392, "step": 83775 }, { "epoch": 9.33066042989197, "grad_norm": 0.12392605096101761, "learning_rate": 6.794930845766407e-07, "loss": 0.4714, "num_input_tokens_seen": 101635888, "step": 83780 }, { "epoch": 9.331217284775587, "grad_norm": 0.12568403780460358, "learning_rate": 6.783682767789206e-07, "loss": 0.454, "num_input_tokens_seen": 101642128, "step": 83785 }, { "epoch": 9.331774139659204, "grad_norm": 0.09917779266834259, "learning_rate": 6.772443879303925e-07, "loss": 0.4642, "num_input_tokens_seen": 101648048, "step": 83790 }, { "epoch": 9.332330994542822, "grad_norm": 0.09356504678726196, "learning_rate": 6.761214180735137e-07, "loss": 0.4497, "num_input_tokens_seen": 101654576, "step": 83795 }, { "epoch": 9.33288784942644, "grad_norm": 0.08954907208681107, "learning_rate": 6.749993672507199e-07, "loss": 0.4622, "num_input_tokens_seen": 101660336, "step": 83800 }, { "epoch": 9.333444704310057, "grad_norm": 0.10876181721687317, "learning_rate": 6.738782355044049e-07, "loss": 0.4522, "num_input_tokens_seen": 101666352, "step": 83805 }, { "epoch": 9.334001559193673, "grad_norm": 0.11394074559211731, "learning_rate": 6.727580228769237e-07, "loss": 0.4679, "num_input_tokens_seen": 101671664, "step": 83810 }, { "epoch": 9.334558414077291, "grad_norm": 0.105408675968647, "learning_rate": 6.716387294106091e-07, "loss": 0.4694, "num_input_tokens_seen": 101678032, "step": 83815 }, { "epoch": 9.335115268960909, "grad_norm": 0.14413726329803467, "learning_rate": 6.705203551477441e-07, "loss": 0.4672, "num_input_tokens_seen": 101684048, "step": 83820 }, { "epoch": 9.335672123844526, "grad_norm": 0.09401663392782211, "learning_rate": 6.69402900130589e-07, "loss": 0.4641, "num_input_tokens_seen": 101689456, "step": 83825 }, { "epoch": 9.336228978728144, "grad_norm": 0.16622449457645416, "learning_rate": 6.682863644013632e-07, "loss": 0.4611, "num_input_tokens_seen": 101695696, "step": 83830 }, { "epoch": 9.33678583361176, "grad_norm": 0.09270351380109787, "learning_rate": 6.671707480022521e-07, "loss": 0.4596, "num_input_tokens_seen": 101701776, "step": 83835 }, { "epoch": 9.337342688495378, "grad_norm": 0.09470907598733902, "learning_rate": 6.66056050975411e-07, "loss": 0.4575, "num_input_tokens_seen": 101708240, "step": 83840 }, { "epoch": 9.337899543378995, "grad_norm": 0.10785046219825745, "learning_rate": 6.649422733629507e-07, "loss": 0.4497, "num_input_tokens_seen": 101714320, "step": 83845 }, { "epoch": 9.338456398262613, "grad_norm": 0.08805184066295624, "learning_rate": 6.638294152069597e-07, "loss": 0.4474, "num_input_tokens_seen": 101720528, "step": 83850 }, { "epoch": 9.33901325314623, "grad_norm": 0.13422085344791412, "learning_rate": 6.62717476549482e-07, "loss": 0.462, "num_input_tokens_seen": 101726576, "step": 83855 }, { "epoch": 9.339570108029847, "grad_norm": 0.10604182630777359, "learning_rate": 6.616064574325315e-07, "loss": 0.4622, "num_input_tokens_seen": 101732400, "step": 83860 }, { "epoch": 9.340126962913464, "grad_norm": 0.06636146456003189, "learning_rate": 6.604963578980828e-07, "loss": 0.4675, "num_input_tokens_seen": 101738672, "step": 83865 }, { "epoch": 9.340683817797082, "grad_norm": 0.16822071373462677, "learning_rate": 6.593871779880828e-07, "loss": 0.4537, "num_input_tokens_seen": 101745232, "step": 83870 }, { "epoch": 9.3412406726807, "grad_norm": 0.1543956845998764, "learning_rate": 6.582789177444399e-07, "loss": 0.4746, "num_input_tokens_seen": 101751216, "step": 83875 }, { "epoch": 9.341797527564317, "grad_norm": 0.12085440009832382, "learning_rate": 6.571715772090231e-07, "loss": 0.4633, "num_input_tokens_seen": 101757200, "step": 83880 }, { "epoch": 9.342354382447935, "grad_norm": 0.09363316744565964, "learning_rate": 6.560651564236797e-07, "loss": 0.4753, "num_input_tokens_seen": 101763312, "step": 83885 }, { "epoch": 9.34291123733155, "grad_norm": 0.07955952733755112, "learning_rate": 6.54959655430204e-07, "loss": 0.4625, "num_input_tokens_seen": 101769456, "step": 83890 }, { "epoch": 9.343468092215168, "grad_norm": 0.10990896075963974, "learning_rate": 6.538550742703709e-07, "loss": 0.4688, "num_input_tokens_seen": 101775568, "step": 83895 }, { "epoch": 9.344024947098786, "grad_norm": 0.12895502150058746, "learning_rate": 6.527514129859136e-07, "loss": 0.4664, "num_input_tokens_seen": 101782000, "step": 83900 }, { "epoch": 9.344581801982404, "grad_norm": 0.10606498271226883, "learning_rate": 6.516486716185349e-07, "loss": 0.4552, "num_input_tokens_seen": 101788272, "step": 83905 }, { "epoch": 9.345138656866022, "grad_norm": 0.09937051683664322, "learning_rate": 6.50546850209896e-07, "loss": 0.4564, "num_input_tokens_seen": 101794256, "step": 83910 }, { "epoch": 9.345695511749637, "grad_norm": 0.09096822142601013, "learning_rate": 6.494459488016274e-07, "loss": 0.4521, "num_input_tokens_seen": 101800464, "step": 83915 }, { "epoch": 9.346252366633255, "grad_norm": 0.15795882046222687, "learning_rate": 6.483459674353293e-07, "loss": 0.4702, "num_input_tokens_seen": 101806160, "step": 83920 }, { "epoch": 9.346809221516873, "grad_norm": 0.12190104275941849, "learning_rate": 6.472469061525571e-07, "loss": 0.4564, "num_input_tokens_seen": 101811760, "step": 83925 }, { "epoch": 9.34736607640049, "grad_norm": 0.10138869285583496, "learning_rate": 6.461487649948389e-07, "loss": 0.4531, "num_input_tokens_seen": 101818000, "step": 83930 }, { "epoch": 9.347922931284108, "grad_norm": 0.12634243071079254, "learning_rate": 6.450515440036692e-07, "loss": 0.4475, "num_input_tokens_seen": 101824368, "step": 83935 }, { "epoch": 9.348479786167724, "grad_norm": 0.1344846934080124, "learning_rate": 6.439552432204982e-07, "loss": 0.4572, "num_input_tokens_seen": 101830416, "step": 83940 }, { "epoch": 9.349036641051342, "grad_norm": 0.14323613047599792, "learning_rate": 6.42859862686751e-07, "loss": 0.484, "num_input_tokens_seen": 101836560, "step": 83945 }, { "epoch": 9.34959349593496, "grad_norm": 0.10964921861886978, "learning_rate": 6.417654024438141e-07, "loss": 0.4455, "num_input_tokens_seen": 101842800, "step": 83950 }, { "epoch": 9.350150350818577, "grad_norm": 0.10153457522392273, "learning_rate": 6.406718625330432e-07, "loss": 0.4582, "num_input_tokens_seen": 101848720, "step": 83955 }, { "epoch": 9.350707205702195, "grad_norm": 0.134987935423851, "learning_rate": 6.395792429957498e-07, "loss": 0.4557, "num_input_tokens_seen": 101854704, "step": 83960 }, { "epoch": 9.35126406058581, "grad_norm": 0.10288310796022415, "learning_rate": 6.384875438732202e-07, "loss": 0.4525, "num_input_tokens_seen": 101860784, "step": 83965 }, { "epoch": 9.351820915469428, "grad_norm": 0.12565377354621887, "learning_rate": 6.37396765206702e-07, "loss": 0.4613, "num_input_tokens_seen": 101866928, "step": 83970 }, { "epoch": 9.352377770353046, "grad_norm": 0.09057927876710892, "learning_rate": 6.363069070374067e-07, "loss": 0.4469, "num_input_tokens_seen": 101873264, "step": 83975 }, { "epoch": 9.352934625236664, "grad_norm": 0.09809330105781555, "learning_rate": 6.352179694065152e-07, "loss": 0.4505, "num_input_tokens_seen": 101879056, "step": 83980 }, { "epoch": 9.353491480120281, "grad_norm": 0.10019315779209137, "learning_rate": 6.34129952355167e-07, "loss": 0.4509, "num_input_tokens_seen": 101884464, "step": 83985 }, { "epoch": 9.354048335003897, "grad_norm": 0.10020942986011505, "learning_rate": 6.33042855924476e-07, "loss": 0.471, "num_input_tokens_seen": 101890512, "step": 83990 }, { "epoch": 9.354605189887515, "grad_norm": 0.09906770288944244, "learning_rate": 6.319566801555126e-07, "loss": 0.4686, "num_input_tokens_seen": 101895824, "step": 83995 }, { "epoch": 9.355162044771133, "grad_norm": 0.1303814947605133, "learning_rate": 6.308714250893188e-07, "loss": 0.4563, "num_input_tokens_seen": 101901680, "step": 84000 }, { "epoch": 9.35571889965475, "grad_norm": 0.1209971085190773, "learning_rate": 6.297870907668979e-07, "loss": 0.451, "num_input_tokens_seen": 101907728, "step": 84005 }, { "epoch": 9.356275754538368, "grad_norm": 0.15381234884262085, "learning_rate": 6.287036772292143e-07, "loss": 0.4638, "num_input_tokens_seen": 101913808, "step": 84010 }, { "epoch": 9.356832609421984, "grad_norm": 0.10235797613859177, "learning_rate": 6.276211845172103e-07, "loss": 0.4611, "num_input_tokens_seen": 101920016, "step": 84015 }, { "epoch": 9.357389464305601, "grad_norm": 0.10175757855176926, "learning_rate": 6.265396126717837e-07, "loss": 0.469, "num_input_tokens_seen": 101925008, "step": 84020 }, { "epoch": 9.35794631918922, "grad_norm": 0.12456146627664566, "learning_rate": 6.254589617337964e-07, "loss": 0.4542, "num_input_tokens_seen": 101931152, "step": 84025 }, { "epoch": 9.358503174072837, "grad_norm": 0.10037519037723541, "learning_rate": 6.243792317440849e-07, "loss": 0.4603, "num_input_tokens_seen": 101937392, "step": 84030 }, { "epoch": 9.359060028956455, "grad_norm": 0.10009487718343735, "learning_rate": 6.233004227434391e-07, "loss": 0.454, "num_input_tokens_seen": 101943152, "step": 84035 }, { "epoch": 9.359616883840072, "grad_norm": 0.1230866014957428, "learning_rate": 6.222225347726235e-07, "loss": 0.4625, "num_input_tokens_seen": 101948656, "step": 84040 }, { "epoch": 9.360173738723688, "grad_norm": 0.1514417827129364, "learning_rate": 6.21145567872361e-07, "loss": 0.4702, "num_input_tokens_seen": 101954768, "step": 84045 }, { "epoch": 9.360730593607306, "grad_norm": 0.09558183699846268, "learning_rate": 6.200695220833469e-07, "loss": 0.4566, "num_input_tokens_seen": 101960592, "step": 84050 }, { "epoch": 9.361287448490923, "grad_norm": 0.12167390435934067, "learning_rate": 6.189943974462348e-07, "loss": 0.4591, "num_input_tokens_seen": 101966800, "step": 84055 }, { "epoch": 9.361844303374541, "grad_norm": 0.10220957547426224, "learning_rate": 6.179201940016477e-07, "loss": 0.4667, "num_input_tokens_seen": 101972752, "step": 84060 }, { "epoch": 9.362401158258159, "grad_norm": 0.12178948521614075, "learning_rate": 6.168469117901727e-07, "loss": 0.4734, "num_input_tokens_seen": 101978832, "step": 84065 }, { "epoch": 9.362958013141775, "grad_norm": 0.12323907017707825, "learning_rate": 6.157745508523577e-07, "loss": 0.461, "num_input_tokens_seen": 101985168, "step": 84070 }, { "epoch": 9.363514868025392, "grad_norm": 0.06315150856971741, "learning_rate": 6.147031112287261e-07, "loss": 0.4645, "num_input_tokens_seen": 101991504, "step": 84075 }, { "epoch": 9.36407172290901, "grad_norm": 0.09751906991004944, "learning_rate": 6.136325929597564e-07, "loss": 0.4571, "num_input_tokens_seen": 101997424, "step": 84080 }, { "epoch": 9.364628577792628, "grad_norm": 0.11291444301605225, "learning_rate": 6.12562996085897e-07, "loss": 0.4723, "num_input_tokens_seen": 102003440, "step": 84085 }, { "epoch": 9.365185432676245, "grad_norm": 0.08852256834506989, "learning_rate": 6.114943206475626e-07, "loss": 0.4656, "num_input_tokens_seen": 102009616, "step": 84090 }, { "epoch": 9.365742287559861, "grad_norm": 0.09585757553577423, "learning_rate": 6.104265666851294e-07, "loss": 0.4621, "num_input_tokens_seen": 102015600, "step": 84095 }, { "epoch": 9.366299142443479, "grad_norm": 0.15645906329154968, "learning_rate": 6.093597342389401e-07, "loss": 0.4631, "num_input_tokens_seen": 102021680, "step": 84100 }, { "epoch": 9.366855997327097, "grad_norm": 0.10833686590194702, "learning_rate": 6.082938233493041e-07, "loss": 0.4743, "num_input_tokens_seen": 102027504, "step": 84105 }, { "epoch": 9.367412852210714, "grad_norm": 0.11030790954828262, "learning_rate": 6.072288340564919e-07, "loss": 0.4724, "num_input_tokens_seen": 102033520, "step": 84110 }, { "epoch": 9.367969707094332, "grad_norm": 0.17386791110038757, "learning_rate": 6.061647664007492e-07, "loss": 0.4625, "num_input_tokens_seen": 102039792, "step": 84115 }, { "epoch": 9.368526561977948, "grad_norm": 0.10840518027544022, "learning_rate": 6.051016204222714e-07, "loss": 0.4552, "num_input_tokens_seen": 102045904, "step": 84120 }, { "epoch": 9.369083416861566, "grad_norm": 0.0861390233039856, "learning_rate": 6.040393961612351e-07, "loss": 0.4612, "num_input_tokens_seen": 102052048, "step": 84125 }, { "epoch": 9.369640271745183, "grad_norm": 0.0963587686419487, "learning_rate": 6.029780936577689e-07, "loss": 0.4652, "num_input_tokens_seen": 102057936, "step": 84130 }, { "epoch": 9.3701971266288, "grad_norm": 0.13125266134738922, "learning_rate": 6.019177129519743e-07, "loss": 0.4585, "num_input_tokens_seen": 102064016, "step": 84135 }, { "epoch": 9.370753981512419, "grad_norm": 0.10412217676639557, "learning_rate": 6.008582540839164e-07, "loss": 0.456, "num_input_tokens_seen": 102070448, "step": 84140 }, { "epoch": 9.371310836396034, "grad_norm": 0.09511153399944305, "learning_rate": 5.997997170936242e-07, "loss": 0.4702, "num_input_tokens_seen": 102076656, "step": 84145 }, { "epoch": 9.371867691279652, "grad_norm": 0.10242114961147308, "learning_rate": 5.987421020210937e-07, "loss": 0.4647, "num_input_tokens_seen": 102082736, "step": 84150 }, { "epoch": 9.37242454616327, "grad_norm": 0.14056426286697388, "learning_rate": 5.976854089062844e-07, "loss": 0.4669, "num_input_tokens_seen": 102089040, "step": 84155 }, { "epoch": 9.372981401046887, "grad_norm": 0.10377606004476547, "learning_rate": 5.966296377891229e-07, "loss": 0.4701, "num_input_tokens_seen": 102095152, "step": 84160 }, { "epoch": 9.373538255930505, "grad_norm": 0.1017770767211914, "learning_rate": 5.955747887094937e-07, "loss": 0.4557, "num_input_tokens_seen": 102101104, "step": 84165 }, { "epoch": 9.374095110814121, "grad_norm": 0.09444397687911987, "learning_rate": 5.945208617072623e-07, "loss": 0.4555, "num_input_tokens_seen": 102107120, "step": 84170 }, { "epoch": 9.374651965697739, "grad_norm": 0.1687549650669098, "learning_rate": 5.934678568222385e-07, "loss": 0.4607, "num_input_tokens_seen": 102113232, "step": 84175 }, { "epoch": 9.375208820581356, "grad_norm": 0.09628604352474213, "learning_rate": 5.924157740942155e-07, "loss": 0.4631, "num_input_tokens_seen": 102119056, "step": 84180 }, { "epoch": 9.375765675464974, "grad_norm": 0.10000695288181305, "learning_rate": 5.913646135629447e-07, "loss": 0.4625, "num_input_tokens_seen": 102125008, "step": 84185 }, { "epoch": 9.376322530348592, "grad_norm": 0.1325923204421997, "learning_rate": 5.903143752681389e-07, "loss": 0.453, "num_input_tokens_seen": 102131216, "step": 84190 }, { "epoch": 9.376879385232208, "grad_norm": 0.08718181401491165, "learning_rate": 5.892650592494803e-07, "loss": 0.4576, "num_input_tokens_seen": 102137456, "step": 84195 }, { "epoch": 9.377436240115825, "grad_norm": 0.10435495525598526, "learning_rate": 5.882166655466149e-07, "loss": 0.4441, "num_input_tokens_seen": 102143088, "step": 84200 }, { "epoch": 9.377993094999443, "grad_norm": 0.1411951333284378, "learning_rate": 5.871691941991553e-07, "loss": 0.4492, "num_input_tokens_seen": 102149008, "step": 84205 }, { "epoch": 9.37854994988306, "grad_norm": 0.12337720394134521, "learning_rate": 5.861226452466812e-07, "loss": 0.4647, "num_input_tokens_seen": 102155088, "step": 84210 }, { "epoch": 9.379106804766678, "grad_norm": 0.11001790314912796, "learning_rate": 5.850770187287303e-07, "loss": 0.4654, "num_input_tokens_seen": 102161264, "step": 84215 }, { "epoch": 9.379663659650294, "grad_norm": 0.11367454379796982, "learning_rate": 5.840323146848098e-07, "loss": 0.453, "num_input_tokens_seen": 102167504, "step": 84220 }, { "epoch": 9.380220514533912, "grad_norm": 0.12603026628494263, "learning_rate": 5.829885331543939e-07, "loss": 0.4668, "num_input_tokens_seen": 102173808, "step": 84225 }, { "epoch": 9.38077736941753, "grad_norm": 0.09307029843330383, "learning_rate": 5.819456741769203e-07, "loss": 0.4679, "num_input_tokens_seen": 102179952, "step": 84230 }, { "epoch": 9.381334224301147, "grad_norm": 0.09632587432861328, "learning_rate": 5.809037377917909e-07, "loss": 0.4631, "num_input_tokens_seen": 102185296, "step": 84235 }, { "epoch": 9.381891079184765, "grad_norm": 0.09863582998514175, "learning_rate": 5.798627240383686e-07, "loss": 0.4689, "num_input_tokens_seen": 102191568, "step": 84240 }, { "epoch": 9.382447934068383, "grad_norm": 0.11938399076461792, "learning_rate": 5.788226329559971e-07, "loss": 0.4705, "num_input_tokens_seen": 102197648, "step": 84245 }, { "epoch": 9.383004788951999, "grad_norm": 0.1223454549908638, "learning_rate": 5.777834645839641e-07, "loss": 0.4745, "num_input_tokens_seen": 102203696, "step": 84250 }, { "epoch": 9.383561643835616, "grad_norm": 0.13175514340400696, "learning_rate": 5.767452189615385e-07, "loss": 0.4652, "num_input_tokens_seen": 102209904, "step": 84255 }, { "epoch": 9.384118498719234, "grad_norm": 0.11015603691339493, "learning_rate": 5.757078961279444e-07, "loss": 0.4346, "num_input_tokens_seen": 102216240, "step": 84260 }, { "epoch": 9.384675353602852, "grad_norm": 0.0797431543469429, "learning_rate": 5.746714961223809e-07, "loss": 0.4523, "num_input_tokens_seen": 102222384, "step": 84265 }, { "epoch": 9.38523220848647, "grad_norm": 0.11645805090665817, "learning_rate": 5.736360189840001e-07, "loss": 0.4573, "num_input_tokens_seen": 102228496, "step": 84270 }, { "epoch": 9.385789063370085, "grad_norm": 0.09503808617591858, "learning_rate": 5.72601464751929e-07, "loss": 0.4608, "num_input_tokens_seen": 102234576, "step": 84275 }, { "epoch": 9.386345918253703, "grad_norm": 0.08971236646175385, "learning_rate": 5.715678334652585e-07, "loss": 0.4612, "num_input_tokens_seen": 102240624, "step": 84280 }, { "epoch": 9.38690277313732, "grad_norm": 0.10100741684436798, "learning_rate": 5.705351251630381e-07, "loss": 0.4623, "num_input_tokens_seen": 102246928, "step": 84285 }, { "epoch": 9.387459628020938, "grad_norm": 0.1001986414194107, "learning_rate": 5.695033398842892e-07, "loss": 0.4638, "num_input_tokens_seen": 102253232, "step": 84290 }, { "epoch": 9.388016482904556, "grad_norm": 0.1541537046432495, "learning_rate": 5.684724776679945e-07, "loss": 0.4523, "num_input_tokens_seen": 102259344, "step": 84295 }, { "epoch": 9.388573337788172, "grad_norm": 0.11763138324022293, "learning_rate": 5.674425385531035e-07, "loss": 0.4584, "num_input_tokens_seen": 102265552, "step": 84300 }, { "epoch": 9.38913019267179, "grad_norm": 0.1011776477098465, "learning_rate": 5.66413522578535e-07, "loss": 0.462, "num_input_tokens_seen": 102271728, "step": 84305 }, { "epoch": 9.389687047555407, "grad_norm": 0.1036783754825592, "learning_rate": 5.653854297831606e-07, "loss": 0.4561, "num_input_tokens_seen": 102277744, "step": 84310 }, { "epoch": 9.390243902439025, "grad_norm": 0.09948289394378662, "learning_rate": 5.643582602058329e-07, "loss": 0.4765, "num_input_tokens_seen": 102284112, "step": 84315 }, { "epoch": 9.390800757322642, "grad_norm": 0.10853519290685654, "learning_rate": 5.633320138853537e-07, "loss": 0.4604, "num_input_tokens_seen": 102290320, "step": 84320 }, { "epoch": 9.391357612206258, "grad_norm": 0.0977669209241867, "learning_rate": 5.623066908605063e-07, "loss": 0.4693, "num_input_tokens_seen": 102296112, "step": 84325 }, { "epoch": 9.391914467089876, "grad_norm": 0.1525658369064331, "learning_rate": 5.61282291170026e-07, "loss": 0.4444, "num_input_tokens_seen": 102302384, "step": 84330 }, { "epoch": 9.392471321973494, "grad_norm": 0.08967578411102295, "learning_rate": 5.602588148526156e-07, "loss": 0.4568, "num_input_tokens_seen": 102308432, "step": 84335 }, { "epoch": 9.393028176857111, "grad_norm": 0.16604912281036377, "learning_rate": 5.592362619469521e-07, "loss": 0.4558, "num_input_tokens_seen": 102314480, "step": 84340 }, { "epoch": 9.393585031740729, "grad_norm": 0.08143272995948792, "learning_rate": 5.582146324916632e-07, "loss": 0.4575, "num_input_tokens_seen": 102320304, "step": 84345 }, { "epoch": 9.394141886624345, "grad_norm": 0.08155360072851181, "learning_rate": 5.571939265253568e-07, "loss": 0.4564, "num_input_tokens_seen": 102326384, "step": 84350 }, { "epoch": 9.394698741507963, "grad_norm": 0.08913957327604294, "learning_rate": 5.561741440865909e-07, "loss": 0.4662, "num_input_tokens_seen": 102332464, "step": 84355 }, { "epoch": 9.39525559639158, "grad_norm": 0.10630644857883453, "learning_rate": 5.551552852139042e-07, "loss": 0.4556, "num_input_tokens_seen": 102338832, "step": 84360 }, { "epoch": 9.395812451275198, "grad_norm": 0.12698835134506226, "learning_rate": 5.541373499457825e-07, "loss": 0.4554, "num_input_tokens_seen": 102345200, "step": 84365 }, { "epoch": 9.396369306158816, "grad_norm": 0.13665315508842468, "learning_rate": 5.531203383206951e-07, "loss": 0.4515, "num_input_tokens_seen": 102351504, "step": 84370 }, { "epoch": 9.396926161042432, "grad_norm": 0.1007741168141365, "learning_rate": 5.521042503770668e-07, "loss": 0.4618, "num_input_tokens_seen": 102357712, "step": 84375 }, { "epoch": 9.39748301592605, "grad_norm": 0.15459373593330383, "learning_rate": 5.510890861532864e-07, "loss": 0.4538, "num_input_tokens_seen": 102363600, "step": 84380 }, { "epoch": 9.398039870809667, "grad_norm": 0.09097521752119064, "learning_rate": 5.500748456877092e-07, "loss": 0.4556, "num_input_tokens_seen": 102370096, "step": 84385 }, { "epoch": 9.398596725693285, "grad_norm": 0.15151013433933258, "learning_rate": 5.490615290186602e-07, "loss": 0.4578, "num_input_tokens_seen": 102376432, "step": 84390 }, { "epoch": 9.399153580576902, "grad_norm": 0.09981352090835571, "learning_rate": 5.480491361844197e-07, "loss": 0.4587, "num_input_tokens_seen": 102382480, "step": 84395 }, { "epoch": 9.39971043546052, "grad_norm": 0.12420172989368439, "learning_rate": 5.470376672232463e-07, "loss": 0.4577, "num_input_tokens_seen": 102388528, "step": 84400 }, { "epoch": 9.400267290344136, "grad_norm": 0.1199505552649498, "learning_rate": 5.46027122173351e-07, "loss": 0.4698, "num_input_tokens_seen": 102394288, "step": 84405 }, { "epoch": 9.400824145227753, "grad_norm": 0.0924162045121193, "learning_rate": 5.450175010729225e-07, "loss": 0.4591, "num_input_tokens_seen": 102400400, "step": 84410 }, { "epoch": 9.401381000111371, "grad_norm": 0.13078555464744568, "learning_rate": 5.440088039600999e-07, "loss": 0.4602, "num_input_tokens_seen": 102406640, "step": 84415 }, { "epoch": 9.401937854994989, "grad_norm": 0.10480428487062454, "learning_rate": 5.430010308729944e-07, "loss": 0.4607, "num_input_tokens_seen": 102412432, "step": 84420 }, { "epoch": 9.402494709878606, "grad_norm": 0.17629151046276093, "learning_rate": 5.419941818496921e-07, "loss": 0.4758, "num_input_tokens_seen": 102418512, "step": 84425 }, { "epoch": 9.403051564762222, "grad_norm": 0.10856766253709793, "learning_rate": 5.409882569282238e-07, "loss": 0.4664, "num_input_tokens_seen": 102424624, "step": 84430 }, { "epoch": 9.40360841964584, "grad_norm": 0.11630715429782867, "learning_rate": 5.399832561466062e-07, "loss": 0.4557, "num_input_tokens_seen": 102430608, "step": 84435 }, { "epoch": 9.404165274529458, "grad_norm": 0.11328181624412537, "learning_rate": 5.389791795428034e-07, "loss": 0.4543, "num_input_tokens_seen": 102436560, "step": 84440 }, { "epoch": 9.404722129413075, "grad_norm": 0.09373024106025696, "learning_rate": 5.379760271547574e-07, "loss": 0.4549, "num_input_tokens_seen": 102442768, "step": 84445 }, { "epoch": 9.405278984296693, "grad_norm": 0.11607377231121063, "learning_rate": 5.369737990203711e-07, "loss": 0.4524, "num_input_tokens_seen": 102448848, "step": 84450 }, { "epoch": 9.405835839180309, "grad_norm": 0.10458867996931076, "learning_rate": 5.359724951775086e-07, "loss": 0.4537, "num_input_tokens_seen": 102455184, "step": 84455 }, { "epoch": 9.406392694063927, "grad_norm": 0.11507979035377502, "learning_rate": 5.349721156640037e-07, "loss": 0.4585, "num_input_tokens_seen": 102461360, "step": 84460 }, { "epoch": 9.406949548947544, "grad_norm": 0.12561270594596863, "learning_rate": 5.339726605176565e-07, "loss": 0.4649, "num_input_tokens_seen": 102467792, "step": 84465 }, { "epoch": 9.407506403831162, "grad_norm": 0.09894292056560516, "learning_rate": 5.329741297762258e-07, "loss": 0.4626, "num_input_tokens_seen": 102473968, "step": 84470 }, { "epoch": 9.40806325871478, "grad_norm": 0.0797310322523117, "learning_rate": 5.319765234774399e-07, "loss": 0.4615, "num_input_tokens_seen": 102479888, "step": 84475 }, { "epoch": 9.408620113598396, "grad_norm": 0.11261019110679626, "learning_rate": 5.309798416589934e-07, "loss": 0.4658, "num_input_tokens_seen": 102485712, "step": 84480 }, { "epoch": 9.409176968482013, "grad_norm": 0.09924861788749695, "learning_rate": 5.299840843585452e-07, "loss": 0.4504, "num_input_tokens_seen": 102491280, "step": 84485 }, { "epoch": 9.409733823365631, "grad_norm": 0.10672615468502045, "learning_rate": 5.289892516137152e-07, "loss": 0.4648, "num_input_tokens_seen": 102497456, "step": 84490 }, { "epoch": 9.410290678249249, "grad_norm": 0.09967317432165146, "learning_rate": 5.279953434620927e-07, "loss": 0.4673, "num_input_tokens_seen": 102503760, "step": 84495 }, { "epoch": 9.410847533132866, "grad_norm": 0.10296553373336792, "learning_rate": 5.270023599412283e-07, "loss": 0.4652, "num_input_tokens_seen": 102510064, "step": 84500 }, { "epoch": 9.411404388016482, "grad_norm": 0.12444619834423065, "learning_rate": 5.260103010886447e-07, "loss": 0.4594, "num_input_tokens_seen": 102516368, "step": 84505 }, { "epoch": 9.4119612429001, "grad_norm": 0.1384679079055786, "learning_rate": 5.25019166941823e-07, "loss": 0.4594, "num_input_tokens_seen": 102522576, "step": 84510 }, { "epoch": 9.412518097783718, "grad_norm": 0.0879027470946312, "learning_rate": 5.240289575382084e-07, "loss": 0.4751, "num_input_tokens_seen": 102528848, "step": 84515 }, { "epoch": 9.413074952667335, "grad_norm": 0.11432218551635742, "learning_rate": 5.230396729152209e-07, "loss": 0.4687, "num_input_tokens_seen": 102534416, "step": 84520 }, { "epoch": 9.413631807550953, "grad_norm": 0.1567649394273758, "learning_rate": 5.220513131102306e-07, "loss": 0.4809, "num_input_tokens_seen": 102540848, "step": 84525 }, { "epoch": 9.414188662434569, "grad_norm": 0.1199713721871376, "learning_rate": 5.210638781605881e-07, "loss": 0.4596, "num_input_tokens_seen": 102546992, "step": 84530 }, { "epoch": 9.414745517318186, "grad_norm": 0.1336876004934311, "learning_rate": 5.20077368103597e-07, "loss": 0.4623, "num_input_tokens_seen": 102553008, "step": 84535 }, { "epoch": 9.415302372201804, "grad_norm": 0.11140038818120956, "learning_rate": 5.190917829765357e-07, "loss": 0.4545, "num_input_tokens_seen": 102559312, "step": 84540 }, { "epoch": 9.415859227085422, "grad_norm": 0.12942878901958466, "learning_rate": 5.181071228166356e-07, "loss": 0.4651, "num_input_tokens_seen": 102564816, "step": 84545 }, { "epoch": 9.41641608196904, "grad_norm": 0.12434866279363632, "learning_rate": 5.17123387661106e-07, "loss": 0.4492, "num_input_tokens_seen": 102570928, "step": 84550 }, { "epoch": 9.416972936852655, "grad_norm": 0.14962293207645416, "learning_rate": 5.161405775471167e-07, "loss": 0.4675, "num_input_tokens_seen": 102577456, "step": 84555 }, { "epoch": 9.417529791736273, "grad_norm": 0.13083310425281525, "learning_rate": 5.151586925117941e-07, "loss": 0.4646, "num_input_tokens_seen": 102583568, "step": 84560 }, { "epoch": 9.41808664661989, "grad_norm": 0.10864745080471039, "learning_rate": 5.14177732592247e-07, "loss": 0.4583, "num_input_tokens_seen": 102590160, "step": 84565 }, { "epoch": 9.418643501503508, "grad_norm": 0.11135660111904144, "learning_rate": 5.131976978255292e-07, "loss": 0.4558, "num_input_tokens_seen": 102596304, "step": 84570 }, { "epoch": 9.419200356387126, "grad_norm": 0.14129404723644257, "learning_rate": 5.122185882486751e-07, "loss": 0.4636, "num_input_tokens_seen": 102602544, "step": 84575 }, { "epoch": 9.419757211270742, "grad_norm": 0.10771609097719193, "learning_rate": 5.112404038986801e-07, "loss": 0.4567, "num_input_tokens_seen": 102608944, "step": 84580 }, { "epoch": 9.42031406615436, "grad_norm": 0.11967912316322327, "learning_rate": 5.102631448124979e-07, "loss": 0.4584, "num_input_tokens_seen": 102614832, "step": 84585 }, { "epoch": 9.420870921037977, "grad_norm": 0.14321114122867584, "learning_rate": 5.092868110270571e-07, "loss": 0.4548, "num_input_tokens_seen": 102620976, "step": 84590 }, { "epoch": 9.421427775921595, "grad_norm": 0.09512155503034592, "learning_rate": 5.083114025792423e-07, "loss": 0.4698, "num_input_tokens_seen": 102626896, "step": 84595 }, { "epoch": 9.421984630805213, "grad_norm": 0.07843267172574997, "learning_rate": 5.073369195059158e-07, "loss": 0.458, "num_input_tokens_seen": 102633168, "step": 84600 }, { "epoch": 9.42254148568883, "grad_norm": 0.10377176851034164, "learning_rate": 5.063633618438868e-07, "loss": 0.4538, "num_input_tokens_seen": 102638896, "step": 84605 }, { "epoch": 9.423098340572446, "grad_norm": 0.1112411618232727, "learning_rate": 5.053907296299426e-07, "loss": 0.4685, "num_input_tokens_seen": 102645040, "step": 84610 }, { "epoch": 9.423655195456064, "grad_norm": 0.11638741195201874, "learning_rate": 5.044190229008372e-07, "loss": 0.4705, "num_input_tokens_seen": 102650960, "step": 84615 }, { "epoch": 9.424212050339682, "grad_norm": 0.09527991712093353, "learning_rate": 5.034482416932773e-07, "loss": 0.4591, "num_input_tokens_seen": 102656848, "step": 84620 }, { "epoch": 9.4247689052233, "grad_norm": 0.08890044689178467, "learning_rate": 5.024783860439475e-07, "loss": 0.4569, "num_input_tokens_seen": 102663280, "step": 84625 }, { "epoch": 9.425325760106917, "grad_norm": 0.09608463197946548, "learning_rate": 5.015094559894906e-07, "loss": 0.4613, "num_input_tokens_seen": 102669104, "step": 84630 }, { "epoch": 9.425882614990533, "grad_norm": 0.11293818056583405, "learning_rate": 5.005414515665163e-07, "loss": 0.4492, "num_input_tokens_seen": 102674288, "step": 84635 }, { "epoch": 9.42643946987415, "grad_norm": 0.13005170226097107, "learning_rate": 4.995743728115981e-07, "loss": 0.458, "num_input_tokens_seen": 102680400, "step": 84640 }, { "epoch": 9.426996324757768, "grad_norm": 0.1674908548593521, "learning_rate": 4.986082197612734e-07, "loss": 0.4663, "num_input_tokens_seen": 102686320, "step": 84645 }, { "epoch": 9.427553179641386, "grad_norm": 0.11725318431854248, "learning_rate": 4.976429924520521e-07, "loss": 0.4576, "num_input_tokens_seen": 102692560, "step": 84650 }, { "epoch": 9.428110034525004, "grad_norm": 0.14119017124176025, "learning_rate": 4.966786909203991e-07, "loss": 0.4472, "num_input_tokens_seen": 102698800, "step": 84655 }, { "epoch": 9.42866688940862, "grad_norm": 0.14006644487380981, "learning_rate": 4.957153152027493e-07, "loss": 0.4612, "num_input_tokens_seen": 102704720, "step": 84660 }, { "epoch": 9.429223744292237, "grad_norm": 0.13484254479408264, "learning_rate": 4.947528653355016e-07, "loss": 0.4633, "num_input_tokens_seen": 102710864, "step": 84665 }, { "epoch": 9.429780599175855, "grad_norm": 0.08289249986410141, "learning_rate": 4.937913413550266e-07, "loss": 0.4677, "num_input_tokens_seen": 102717136, "step": 84670 }, { "epoch": 9.430337454059472, "grad_norm": 0.12973371148109436, "learning_rate": 4.928307432976426e-07, "loss": 0.4588, "num_input_tokens_seen": 102722672, "step": 84675 }, { "epoch": 9.43089430894309, "grad_norm": 0.09771944582462311, "learning_rate": 4.918710711996511e-07, "loss": 0.4595, "num_input_tokens_seen": 102728944, "step": 84680 }, { "epoch": 9.431451163826706, "grad_norm": 0.08406153321266174, "learning_rate": 4.909123250973146e-07, "loss": 0.4686, "num_input_tokens_seen": 102735152, "step": 84685 }, { "epoch": 9.432008018710324, "grad_norm": 0.09269095957279205, "learning_rate": 4.899545050268489e-07, "loss": 0.4639, "num_input_tokens_seen": 102741808, "step": 84690 }, { "epoch": 9.432564873593941, "grad_norm": 0.09990620613098145, "learning_rate": 4.889976110244526e-07, "loss": 0.4573, "num_input_tokens_seen": 102747888, "step": 84695 }, { "epoch": 9.433121728477559, "grad_norm": 0.14081205427646637, "learning_rate": 4.880416431262746e-07, "loss": 0.4533, "num_input_tokens_seen": 102753840, "step": 84700 }, { "epoch": 9.433678583361177, "grad_norm": 0.17682166397571564, "learning_rate": 4.870866013684333e-07, "loss": 0.4581, "num_input_tokens_seen": 102760240, "step": 84705 }, { "epoch": 9.434235438244793, "grad_norm": 0.21859751641750336, "learning_rate": 4.861324857870192e-07, "loss": 0.4561, "num_input_tokens_seen": 102766448, "step": 84710 }, { "epoch": 9.43479229312841, "grad_norm": 0.15728114545345306, "learning_rate": 4.851792964180757e-07, "loss": 0.4556, "num_input_tokens_seen": 102772336, "step": 84715 }, { "epoch": 9.435349148012028, "grad_norm": 0.09635265171527863, "learning_rate": 4.842270332976212e-07, "loss": 0.4606, "num_input_tokens_seen": 102778640, "step": 84720 }, { "epoch": 9.435906002895646, "grad_norm": 0.09295628219842911, "learning_rate": 4.832756964616326e-07, "loss": 0.4568, "num_input_tokens_seen": 102784560, "step": 84725 }, { "epoch": 9.436462857779263, "grad_norm": 0.13463106751441956, "learning_rate": 4.823252859460587e-07, "loss": 0.4577, "num_input_tokens_seen": 102790544, "step": 84730 }, { "epoch": 9.43701971266288, "grad_norm": 0.08247502893209457, "learning_rate": 4.813758017868042e-07, "loss": 0.4549, "num_input_tokens_seen": 102796112, "step": 84735 }, { "epoch": 9.437576567546497, "grad_norm": 0.10529064387083054, "learning_rate": 4.804272440197461e-07, "loss": 0.4577, "num_input_tokens_seen": 102802256, "step": 84740 }, { "epoch": 9.438133422430115, "grad_norm": 0.13674040138721466, "learning_rate": 4.79479612680725e-07, "loss": 0.4633, "num_input_tokens_seen": 102808176, "step": 84745 }, { "epoch": 9.438690277313732, "grad_norm": 0.11182542890310287, "learning_rate": 4.785329078055401e-07, "loss": 0.4575, "num_input_tokens_seen": 102814416, "step": 84750 }, { "epoch": 9.43924713219735, "grad_norm": 0.12700410187244415, "learning_rate": 4.775871294299683e-07, "loss": 0.4625, "num_input_tokens_seen": 102820784, "step": 84755 }, { "epoch": 9.439803987080968, "grad_norm": 0.10294771194458008, "learning_rate": 4.7664227758973945e-07, "loss": 0.4679, "num_input_tokens_seen": 102826768, "step": 84760 }, { "epoch": 9.440360841964583, "grad_norm": 0.13759422302246094, "learning_rate": 4.756983523205555e-07, "loss": 0.4614, "num_input_tokens_seen": 102832400, "step": 84765 }, { "epoch": 9.440917696848201, "grad_norm": 0.16767209768295288, "learning_rate": 4.7475535365807676e-07, "loss": 0.4507, "num_input_tokens_seen": 102838416, "step": 84770 }, { "epoch": 9.441474551731819, "grad_norm": 0.16111670434474945, "learning_rate": 4.738132816379387e-07, "loss": 0.465, "num_input_tokens_seen": 102844368, "step": 84775 }, { "epoch": 9.442031406615436, "grad_norm": 0.1577925980091095, "learning_rate": 4.7287213629573223e-07, "loss": 0.451, "num_input_tokens_seen": 102850512, "step": 84780 }, { "epoch": 9.442588261499054, "grad_norm": 0.12730026245117188, "learning_rate": 4.7193191766701784e-07, "loss": 0.4551, "num_input_tokens_seen": 102856464, "step": 84785 }, { "epoch": 9.44314511638267, "grad_norm": 0.13748939335346222, "learning_rate": 4.7099262578731983e-07, "loss": 0.4855, "num_input_tokens_seen": 102861808, "step": 84790 }, { "epoch": 9.443701971266288, "grad_norm": 0.09269573539495468, "learning_rate": 4.7005426069212375e-07, "loss": 0.4615, "num_input_tokens_seen": 102868016, "step": 84795 }, { "epoch": 9.444258826149905, "grad_norm": 0.10932739078998566, "learning_rate": 4.6911682241689016e-07, "loss": 0.4627, "num_input_tokens_seen": 102874096, "step": 84800 }, { "epoch": 9.444815681033523, "grad_norm": 0.09987117350101471, "learning_rate": 4.6818031099703784e-07, "loss": 0.4479, "num_input_tokens_seen": 102880304, "step": 84805 }, { "epoch": 9.44537253591714, "grad_norm": 0.12131305038928986, "learning_rate": 4.672447264679497e-07, "loss": 0.4605, "num_input_tokens_seen": 102886320, "step": 84810 }, { "epoch": 9.445929390800757, "grad_norm": 0.09926582127809525, "learning_rate": 4.663100688649724e-07, "loss": 0.446, "num_input_tokens_seen": 102892368, "step": 84815 }, { "epoch": 9.446486245684374, "grad_norm": 0.09374123066663742, "learning_rate": 4.6537633822342496e-07, "loss": 0.4571, "num_input_tokens_seen": 102898672, "step": 84820 }, { "epoch": 9.447043100567992, "grad_norm": 0.09134531766176224, "learning_rate": 4.644435345785847e-07, "loss": 0.4549, "num_input_tokens_seen": 102904656, "step": 84825 }, { "epoch": 9.44759995545161, "grad_norm": 0.11559577286243439, "learning_rate": 4.635116579656956e-07, "loss": 0.4573, "num_input_tokens_seen": 102910832, "step": 84830 }, { "epoch": 9.448156810335227, "grad_norm": 0.15366065502166748, "learning_rate": 4.6258070841996294e-07, "loss": 0.4561, "num_input_tokens_seen": 102916976, "step": 84835 }, { "epoch": 9.448713665218843, "grad_norm": 0.15762634575366974, "learning_rate": 4.616506859765696e-07, "loss": 0.4515, "num_input_tokens_seen": 102923184, "step": 84840 }, { "epoch": 9.449270520102461, "grad_norm": 0.0894271656870842, "learning_rate": 4.6072159067064856e-07, "loss": 0.4616, "num_input_tokens_seen": 102929072, "step": 84845 }, { "epoch": 9.449827374986079, "grad_norm": 0.10537334531545639, "learning_rate": 4.597934225373052e-07, "loss": 0.4581, "num_input_tokens_seen": 102935056, "step": 84850 }, { "epoch": 9.450384229869696, "grad_norm": 0.1234244778752327, "learning_rate": 4.588661816116113e-07, "loss": 0.4567, "num_input_tokens_seen": 102941328, "step": 84855 }, { "epoch": 9.450941084753314, "grad_norm": 0.1247929260134697, "learning_rate": 4.5793986792859733e-07, "loss": 0.4569, "num_input_tokens_seen": 102947600, "step": 84860 }, { "epoch": 9.45149793963693, "grad_norm": 0.14080286026000977, "learning_rate": 4.5701448152326576e-07, "loss": 0.4559, "num_input_tokens_seen": 102953904, "step": 84865 }, { "epoch": 9.452054794520548, "grad_norm": 0.09157738834619522, "learning_rate": 4.5609002243057753e-07, "loss": 0.4612, "num_input_tokens_seen": 102960048, "step": 84870 }, { "epoch": 9.452611649404165, "grad_norm": 0.10286486148834229, "learning_rate": 4.5516649068546305e-07, "loss": 0.4564, "num_input_tokens_seen": 102966032, "step": 84875 }, { "epoch": 9.453168504287783, "grad_norm": 0.13672173023223877, "learning_rate": 4.542438863228166e-07, "loss": 0.4696, "num_input_tokens_seen": 102971536, "step": 84880 }, { "epoch": 9.4537253591714, "grad_norm": 0.1269891858100891, "learning_rate": 4.533222093774964e-07, "loss": 0.4685, "num_input_tokens_seen": 102977648, "step": 84885 }, { "epoch": 9.454282214055016, "grad_norm": 0.12950032949447632, "learning_rate": 4.524014598843274e-07, "loss": 0.4574, "num_input_tokens_seen": 102983696, "step": 84890 }, { "epoch": 9.454839068938634, "grad_norm": 0.11434568464756012, "learning_rate": 4.514816378780984e-07, "loss": 0.4528, "num_input_tokens_seen": 102989680, "step": 84895 }, { "epoch": 9.455395923822252, "grad_norm": 0.11268424987792969, "learning_rate": 4.505627433935622e-07, "loss": 0.4649, "num_input_tokens_seen": 102995696, "step": 84900 }, { "epoch": 9.45595277870587, "grad_norm": 0.13330991566181183, "learning_rate": 4.496447764654382e-07, "loss": 0.4724, "num_input_tokens_seen": 103001744, "step": 84905 }, { "epoch": 9.456509633589487, "grad_norm": 0.10424143821001053, "learning_rate": 4.487277371284099e-07, "loss": 0.4595, "num_input_tokens_seen": 103007760, "step": 84910 }, { "epoch": 9.457066488473103, "grad_norm": 0.08482015132904053, "learning_rate": 4.478116254171244e-07, "loss": 0.4691, "num_input_tokens_seen": 103013840, "step": 84915 }, { "epoch": 9.45762334335672, "grad_norm": 0.14563727378845215, "learning_rate": 4.4689644136619857e-07, "loss": 0.4526, "num_input_tokens_seen": 103020048, "step": 84920 }, { "epoch": 9.458180198240338, "grad_norm": 0.11145000159740448, "learning_rate": 4.4598218501021025e-07, "loss": 0.4641, "num_input_tokens_seen": 103026096, "step": 84925 }, { "epoch": 9.458737053123956, "grad_norm": 0.11205266416072845, "learning_rate": 4.4506885638370135e-07, "loss": 0.4575, "num_input_tokens_seen": 103031760, "step": 84930 }, { "epoch": 9.459293908007574, "grad_norm": 0.13355818390846252, "learning_rate": 4.44156455521183e-07, "loss": 0.454, "num_input_tokens_seen": 103037968, "step": 84935 }, { "epoch": 9.45985076289119, "grad_norm": 0.1384573131799698, "learning_rate": 4.43244982457125e-07, "loss": 0.4498, "num_input_tokens_seen": 103044336, "step": 84940 }, { "epoch": 9.460407617774807, "grad_norm": 0.10689613968133926, "learning_rate": 4.4233443722596914e-07, "loss": 0.4599, "num_input_tokens_seen": 103050800, "step": 84945 }, { "epoch": 9.460964472658425, "grad_norm": 0.10157638788223267, "learning_rate": 4.4142481986211294e-07, "loss": 0.4556, "num_input_tokens_seen": 103056944, "step": 84950 }, { "epoch": 9.461521327542043, "grad_norm": 0.12973643839359283, "learning_rate": 4.4051613039993166e-07, "loss": 0.453, "num_input_tokens_seen": 103062800, "step": 84955 }, { "epoch": 9.46207818242566, "grad_norm": 0.11251365393400192, "learning_rate": 4.396083688737562e-07, "loss": 0.4594, "num_input_tokens_seen": 103068272, "step": 84960 }, { "epoch": 9.462635037309278, "grad_norm": 0.1982259899377823, "learning_rate": 4.3870153531788415e-07, "loss": 0.468, "num_input_tokens_seen": 103074224, "step": 84965 }, { "epoch": 9.463191892192894, "grad_norm": 0.10813289880752563, "learning_rate": 4.3779562976657974e-07, "loss": 0.4555, "num_input_tokens_seen": 103080432, "step": 84970 }, { "epoch": 9.463748747076512, "grad_norm": 0.1325208693742752, "learning_rate": 4.3689065225406555e-07, "loss": 0.4476, "num_input_tokens_seen": 103086768, "step": 84975 }, { "epoch": 9.46430560196013, "grad_norm": 0.09108393639326096, "learning_rate": 4.3598660281454207e-07, "loss": 0.4575, "num_input_tokens_seen": 103093008, "step": 84980 }, { "epoch": 9.464862456843747, "grad_norm": 0.10311713814735413, "learning_rate": 4.350834814821653e-07, "loss": 0.45, "num_input_tokens_seen": 103099088, "step": 84985 }, { "epoch": 9.465419311727365, "grad_norm": 0.09814628958702087, "learning_rate": 4.3418128829105233e-07, "loss": 0.4635, "num_input_tokens_seen": 103105136, "step": 84990 }, { "epoch": 9.46597616661098, "grad_norm": 0.12212391942739487, "learning_rate": 4.332800232753009e-07, "loss": 0.4523, "num_input_tokens_seen": 103110800, "step": 84995 }, { "epoch": 9.466533021494598, "grad_norm": 0.10147756338119507, "learning_rate": 4.323796864689533e-07, "loss": 0.463, "num_input_tokens_seen": 103117040, "step": 85000 }, { "epoch": 9.467089876378216, "grad_norm": 0.09924203157424927, "learning_rate": 4.314802779060351e-07, "loss": 0.4528, "num_input_tokens_seen": 103123440, "step": 85005 }, { "epoch": 9.467646731261834, "grad_norm": 0.11654756963253021, "learning_rate": 4.305817976205245e-07, "loss": 0.4571, "num_input_tokens_seen": 103129552, "step": 85010 }, { "epoch": 9.468203586145451, "grad_norm": 0.13772565126419067, "learning_rate": 4.296842456463668e-07, "loss": 0.4563, "num_input_tokens_seen": 103135824, "step": 85015 }, { "epoch": 9.468760441029067, "grad_norm": 0.12100593000650406, "learning_rate": 4.287876220174819e-07, "loss": 0.4791, "num_input_tokens_seen": 103142032, "step": 85020 }, { "epoch": 9.469317295912685, "grad_norm": 0.10181982070207596, "learning_rate": 4.2789192676774283e-07, "loss": 0.4658, "num_input_tokens_seen": 103148272, "step": 85025 }, { "epoch": 9.469874150796302, "grad_norm": 0.14087271690368652, "learning_rate": 4.269971599309919e-07, "loss": 0.4646, "num_input_tokens_seen": 103154480, "step": 85030 }, { "epoch": 9.47043100567992, "grad_norm": 0.11690706759691238, "learning_rate": 4.261033215410354e-07, "loss": 0.4681, "num_input_tokens_seen": 103159600, "step": 85035 }, { "epoch": 9.470987860563538, "grad_norm": 0.08562283217906952, "learning_rate": 4.2521041163164633e-07, "loss": 0.4558, "num_input_tokens_seen": 103165520, "step": 85040 }, { "epoch": 9.471544715447154, "grad_norm": 0.09651867300271988, "learning_rate": 4.243184302365616e-07, "loss": 0.4627, "num_input_tokens_seen": 103171056, "step": 85045 }, { "epoch": 9.472101570330771, "grad_norm": 0.14323867857456207, "learning_rate": 4.23427377389482e-07, "loss": 0.4567, "num_input_tokens_seen": 103177072, "step": 85050 }, { "epoch": 9.472658425214389, "grad_norm": 0.09847591072320938, "learning_rate": 4.2253725312408065e-07, "loss": 0.4528, "num_input_tokens_seen": 103182960, "step": 85055 }, { "epoch": 9.473215280098007, "grad_norm": 0.1119527742266655, "learning_rate": 4.2164805747397784e-07, "loss": 0.4615, "num_input_tokens_seen": 103189200, "step": 85060 }, { "epoch": 9.473772134981624, "grad_norm": 0.10291464626789093, "learning_rate": 4.2075979047278e-07, "loss": 0.4556, "num_input_tokens_seen": 103195088, "step": 85065 }, { "epoch": 9.47432898986524, "grad_norm": 0.08836393058300018, "learning_rate": 4.1987245215404646e-07, "loss": 0.4645, "num_input_tokens_seen": 103201104, "step": 85070 }, { "epoch": 9.474885844748858, "grad_norm": 0.1193305253982544, "learning_rate": 4.189860425512976e-07, "loss": 0.4708, "num_input_tokens_seen": 103206896, "step": 85075 }, { "epoch": 9.475442699632476, "grad_norm": 0.1311807781457901, "learning_rate": 4.1810056169803433e-07, "loss": 0.4637, "num_input_tokens_seen": 103212528, "step": 85080 }, { "epoch": 9.475999554516093, "grad_norm": 0.11067009717226028, "learning_rate": 4.172160096277078e-07, "loss": 0.465, "num_input_tokens_seen": 103218896, "step": 85085 }, { "epoch": 9.476556409399711, "grad_norm": 0.11524717509746552, "learning_rate": 4.163323863737384e-07, "loss": 0.4441, "num_input_tokens_seen": 103225040, "step": 85090 }, { "epoch": 9.477113264283329, "grad_norm": 0.08657946437597275, "learning_rate": 4.154496919695161e-07, "loss": 0.4622, "num_input_tokens_seen": 103230960, "step": 85095 }, { "epoch": 9.477670119166945, "grad_norm": 0.08210091292858124, "learning_rate": 4.145679264483865e-07, "loss": 0.4502, "num_input_tokens_seen": 103237104, "step": 85100 }, { "epoch": 9.478226974050562, "grad_norm": 0.15403637290000916, "learning_rate": 4.136870898436701e-07, "loss": 0.4654, "num_input_tokens_seen": 103243248, "step": 85105 }, { "epoch": 9.47878382893418, "grad_norm": 0.10486186295747757, "learning_rate": 4.128071821886459e-07, "loss": 0.4585, "num_input_tokens_seen": 103249424, "step": 85110 }, { "epoch": 9.479340683817798, "grad_norm": 0.13246890902519226, "learning_rate": 4.119282035165595e-07, "loss": 0.4546, "num_input_tokens_seen": 103255920, "step": 85115 }, { "epoch": 9.479897538701415, "grad_norm": 0.10523612052202225, "learning_rate": 4.1105015386062316e-07, "loss": 0.4605, "num_input_tokens_seen": 103262064, "step": 85120 }, { "epoch": 9.480454393585031, "grad_norm": 0.11656967550516129, "learning_rate": 4.101730332540105e-07, "loss": 0.4602, "num_input_tokens_seen": 103268112, "step": 85125 }, { "epoch": 9.481011248468649, "grad_norm": 0.11223477125167847, "learning_rate": 4.0929684172986147e-07, "loss": 0.457, "num_input_tokens_seen": 103274256, "step": 85130 }, { "epoch": 9.481568103352267, "grad_norm": 0.09195060282945633, "learning_rate": 4.084215793212859e-07, "loss": 0.4626, "num_input_tokens_seen": 103280400, "step": 85135 }, { "epoch": 9.482124958235884, "grad_norm": 0.14411374926567078, "learning_rate": 4.075472460613461e-07, "loss": 0.4612, "num_input_tokens_seen": 103286832, "step": 85140 }, { "epoch": 9.482681813119502, "grad_norm": 0.1243782490491867, "learning_rate": 4.0667384198308236e-07, "loss": 0.4647, "num_input_tokens_seen": 103292912, "step": 85145 }, { "epoch": 9.483238668003118, "grad_norm": 0.1112324595451355, "learning_rate": 4.05801367119496e-07, "loss": 0.4536, "num_input_tokens_seen": 103299280, "step": 85150 }, { "epoch": 9.483795522886735, "grad_norm": 0.14761464297771454, "learning_rate": 4.0492982150354965e-07, "loss": 0.4676, "num_input_tokens_seen": 103305200, "step": 85155 }, { "epoch": 9.484352377770353, "grad_norm": 0.07711374014616013, "learning_rate": 4.0405920516817517e-07, "loss": 0.4523, "num_input_tokens_seen": 103311216, "step": 85160 }, { "epoch": 9.48490923265397, "grad_norm": 0.11535681039094925, "learning_rate": 4.0318951814626305e-07, "loss": 0.4581, "num_input_tokens_seen": 103316688, "step": 85165 }, { "epoch": 9.485466087537588, "grad_norm": 0.16404835879802704, "learning_rate": 4.023207604706758e-07, "loss": 0.4635, "num_input_tokens_seen": 103322736, "step": 85170 }, { "epoch": 9.486022942421204, "grad_norm": 0.09260197728872299, "learning_rate": 4.014529321742372e-07, "loss": 0.4652, "num_input_tokens_seen": 103328944, "step": 85175 }, { "epoch": 9.486579797304822, "grad_norm": 0.16322040557861328, "learning_rate": 4.005860332897349e-07, "loss": 0.4512, "num_input_tokens_seen": 103335056, "step": 85180 }, { "epoch": 9.48713665218844, "grad_norm": 0.11515319347381592, "learning_rate": 3.997200638499288e-07, "loss": 0.4708, "num_input_tokens_seen": 103341424, "step": 85185 }, { "epoch": 9.487693507072057, "grad_norm": 0.11160923540592194, "learning_rate": 3.988550238875316e-07, "loss": 0.455, "num_input_tokens_seen": 103347408, "step": 85190 }, { "epoch": 9.488250361955675, "grad_norm": 0.09758654981851578, "learning_rate": 3.979909134352311e-07, "loss": 0.4511, "num_input_tokens_seen": 103353520, "step": 85195 }, { "epoch": 9.488807216839291, "grad_norm": 0.1582440733909607, "learning_rate": 3.9712773252567335e-07, "loss": 0.4617, "num_input_tokens_seen": 103359408, "step": 85200 }, { "epoch": 9.489364071722909, "grad_norm": 0.10787739604711533, "learning_rate": 3.96265481191474e-07, "loss": 0.464, "num_input_tokens_seen": 103365712, "step": 85205 }, { "epoch": 9.489920926606526, "grad_norm": 0.1645956188440323, "learning_rate": 3.9540415946521247e-07, "loss": 0.4794, "num_input_tokens_seen": 103372048, "step": 85210 }, { "epoch": 9.490477781490144, "grad_norm": 0.14717638492584229, "learning_rate": 3.945437673794322e-07, "loss": 0.4453, "num_input_tokens_seen": 103377872, "step": 85215 }, { "epoch": 9.491034636373762, "grad_norm": 0.11225443333387375, "learning_rate": 3.9368430496663777e-07, "loss": 0.4523, "num_input_tokens_seen": 103383920, "step": 85220 }, { "epoch": 9.491591491257378, "grad_norm": 0.12013418227434158, "learning_rate": 3.928257722593059e-07, "loss": 0.4713, "num_input_tokens_seen": 103389552, "step": 85225 }, { "epoch": 9.492148346140995, "grad_norm": 0.11002030223608017, "learning_rate": 3.919681692898747e-07, "loss": 0.4567, "num_input_tokens_seen": 103395536, "step": 85230 }, { "epoch": 9.492705201024613, "grad_norm": 0.14773602783679962, "learning_rate": 3.911114960907486e-07, "loss": 0.4445, "num_input_tokens_seen": 103401520, "step": 85235 }, { "epoch": 9.49326205590823, "grad_norm": 0.16709893941879272, "learning_rate": 3.902557526942879e-07, "loss": 0.4562, "num_input_tokens_seen": 103407824, "step": 85240 }, { "epoch": 9.493818910791848, "grad_norm": 0.14401578903198242, "learning_rate": 3.8940093913283626e-07, "loss": 0.4499, "num_input_tokens_seen": 103413808, "step": 85245 }, { "epoch": 9.494375765675464, "grad_norm": 0.13929618895053864, "learning_rate": 3.885470554386816e-07, "loss": 0.47, "num_input_tokens_seen": 103420272, "step": 85250 }, { "epoch": 9.494932620559082, "grad_norm": 0.1092049777507782, "learning_rate": 3.8769410164408994e-07, "loss": 0.4709, "num_input_tokens_seen": 103426224, "step": 85255 }, { "epoch": 9.4954894754427, "grad_norm": 0.09320451319217682, "learning_rate": 3.868420777812909e-07, "loss": 0.4679, "num_input_tokens_seen": 103432304, "step": 85260 }, { "epoch": 9.496046330326317, "grad_norm": 0.10224539041519165, "learning_rate": 3.8599098388247545e-07, "loss": 0.447, "num_input_tokens_seen": 103438384, "step": 85265 }, { "epoch": 9.496603185209935, "grad_norm": 0.09782494604587555, "learning_rate": 3.851408199798012e-07, "loss": 0.46, "num_input_tokens_seen": 103444208, "step": 85270 }, { "epoch": 9.49716004009355, "grad_norm": 0.11237774044275284, "learning_rate": 3.842915861053842e-07, "loss": 0.4573, "num_input_tokens_seen": 103450288, "step": 85275 }, { "epoch": 9.497716894977168, "grad_norm": 0.10044944286346436, "learning_rate": 3.834432822913209e-07, "loss": 0.4594, "num_input_tokens_seen": 103455888, "step": 85280 }, { "epoch": 9.498273749860786, "grad_norm": 0.13608510792255402, "learning_rate": 3.825959085696551e-07, "loss": 0.4634, "num_input_tokens_seen": 103462000, "step": 85285 }, { "epoch": 9.498830604744404, "grad_norm": 0.12678898870944977, "learning_rate": 3.8174946497240574e-07, "loss": 0.4601, "num_input_tokens_seen": 103467856, "step": 85290 }, { "epoch": 9.499387459628021, "grad_norm": 0.09671835601329803, "learning_rate": 3.8090395153155824e-07, "loss": 0.4713, "num_input_tokens_seen": 103473968, "step": 85295 }, { "epoch": 9.49994431451164, "grad_norm": 0.10546787828207016, "learning_rate": 3.80059368279051e-07, "loss": 0.4655, "num_input_tokens_seen": 103479984, "step": 85300 }, { "epoch": 9.500501169395255, "grad_norm": 0.14708063006401062, "learning_rate": 3.792157152468029e-07, "loss": 0.4609, "num_input_tokens_seen": 103486000, "step": 85305 }, { "epoch": 9.501058024278873, "grad_norm": 0.13722562789916992, "learning_rate": 3.783729924666829e-07, "loss": 0.466, "num_input_tokens_seen": 103491728, "step": 85310 }, { "epoch": 9.501058024278873, "eval_loss": 0.46408841013908386, "eval_runtime": 112.9441, "eval_samples_per_second": 35.336, "eval_steps_per_second": 8.836, "num_input_tokens_seen": 103491728, "step": 85310 }, { "epoch": 9.50161487916249, "grad_norm": 0.12664161622524261, "learning_rate": 3.7753119997054054e-07, "loss": 0.4636, "num_input_tokens_seen": 103497232, "step": 85315 }, { "epoch": 9.502171734046108, "grad_norm": 0.07731840759515762, "learning_rate": 3.766903377901698e-07, "loss": 0.4589, "num_input_tokens_seen": 103503152, "step": 85320 }, { "epoch": 9.502728588929726, "grad_norm": 0.10180363059043884, "learning_rate": 3.758504059573509e-07, "loss": 0.4676, "num_input_tokens_seen": 103509264, "step": 85325 }, { "epoch": 9.503285443813342, "grad_norm": 0.08930745720863342, "learning_rate": 3.750114045038139e-07, "loss": 0.4662, "num_input_tokens_seen": 103515376, "step": 85330 }, { "epoch": 9.50384229869696, "grad_norm": 0.11165835708379745, "learning_rate": 3.7417333346126126e-07, "loss": 0.448, "num_input_tokens_seen": 103521488, "step": 85335 }, { "epoch": 9.504399153580577, "grad_norm": 0.08871928602457047, "learning_rate": 3.7333619286135933e-07, "loss": 0.4623, "num_input_tokens_seen": 103527728, "step": 85340 }, { "epoch": 9.504956008464195, "grad_norm": 0.12003428488969803, "learning_rate": 3.7249998273573284e-07, "loss": 0.448, "num_input_tokens_seen": 103533648, "step": 85345 }, { "epoch": 9.505512863347812, "grad_norm": 0.11014409363269806, "learning_rate": 3.7166470311598146e-07, "loss": 0.4578, "num_input_tokens_seen": 103539952, "step": 85350 }, { "epoch": 9.506069718231428, "grad_norm": 0.13257268071174622, "learning_rate": 3.7083035403366327e-07, "loss": 0.473, "num_input_tokens_seen": 103546224, "step": 85355 }, { "epoch": 9.506626573115046, "grad_norm": 0.0874682292342186, "learning_rate": 3.6999693552030037e-07, "loss": 0.454, "num_input_tokens_seen": 103552560, "step": 85360 }, { "epoch": 9.507183427998664, "grad_norm": 0.13418126106262207, "learning_rate": 3.691644476073869e-07, "loss": 0.4622, "num_input_tokens_seen": 103558288, "step": 85365 }, { "epoch": 9.507740282882281, "grad_norm": 0.10657991468906403, "learning_rate": 3.6833289032637277e-07, "loss": 0.4726, "num_input_tokens_seen": 103564368, "step": 85370 }, { "epoch": 9.508297137765899, "grad_norm": 0.10441049933433533, "learning_rate": 3.6750226370867724e-07, "loss": 0.4591, "num_input_tokens_seen": 103570512, "step": 85375 }, { "epoch": 9.508853992649515, "grad_norm": 0.11624657362699509, "learning_rate": 3.666725677856836e-07, "loss": 0.4589, "num_input_tokens_seen": 103576592, "step": 85380 }, { "epoch": 9.509410847533132, "grad_norm": 0.08496025949716568, "learning_rate": 3.658438025887445e-07, "loss": 0.4649, "num_input_tokens_seen": 103582736, "step": 85385 }, { "epoch": 9.50996770241675, "grad_norm": 0.12977184355258942, "learning_rate": 3.6501596814917103e-07, "loss": 0.4653, "num_input_tokens_seen": 103588752, "step": 85390 }, { "epoch": 9.510524557300368, "grad_norm": 0.07685327529907227, "learning_rate": 3.641890644982382e-07, "loss": 0.4579, "num_input_tokens_seen": 103594736, "step": 85395 }, { "epoch": 9.511081412183986, "grad_norm": 0.11870019137859344, "learning_rate": 3.6336309166719317e-07, "loss": 0.4821, "num_input_tokens_seen": 103600880, "step": 85400 }, { "epoch": 9.511638267067601, "grad_norm": 0.10744448006153107, "learning_rate": 3.625380496872416e-07, "loss": 0.4605, "num_input_tokens_seen": 103607312, "step": 85405 }, { "epoch": 9.512195121951219, "grad_norm": 0.09863830357789993, "learning_rate": 3.6171393858956126e-07, "loss": 0.4587, "num_input_tokens_seen": 103613392, "step": 85410 }, { "epoch": 9.512751976834837, "grad_norm": 0.13649001717567444, "learning_rate": 3.6089075840528006e-07, "loss": 0.4548, "num_input_tokens_seen": 103619536, "step": 85415 }, { "epoch": 9.513308831718454, "grad_norm": 0.09237660467624664, "learning_rate": 3.600685091655093e-07, "loss": 0.4609, "num_input_tokens_seen": 103625872, "step": 85420 }, { "epoch": 9.513865686602072, "grad_norm": 0.08751852065324783, "learning_rate": 3.5924719090131285e-07, "loss": 0.4694, "num_input_tokens_seen": 103632016, "step": 85425 }, { "epoch": 9.514422541485688, "grad_norm": 0.09828968346118927, "learning_rate": 3.5842680364371885e-07, "loss": 0.4509, "num_input_tokens_seen": 103638256, "step": 85430 }, { "epoch": 9.514979396369306, "grad_norm": 0.10468330979347229, "learning_rate": 3.5760734742373294e-07, "loss": 0.45, "num_input_tokens_seen": 103644240, "step": 85435 }, { "epoch": 9.515536251252923, "grad_norm": 0.07981161773204803, "learning_rate": 3.5678882227230814e-07, "loss": 0.4543, "num_input_tokens_seen": 103650448, "step": 85440 }, { "epoch": 9.516093106136541, "grad_norm": 0.11075355112552643, "learning_rate": 3.559712282203781e-07, "loss": 0.4599, "num_input_tokens_seen": 103656336, "step": 85445 }, { "epoch": 9.516649961020159, "grad_norm": 0.14237616956233978, "learning_rate": 3.551545652988292e-07, "loss": 0.461, "num_input_tokens_seen": 103662448, "step": 85450 }, { "epoch": 9.517206815903776, "grad_norm": 0.11667200177907944, "learning_rate": 3.543388335385173e-07, "loss": 0.4642, "num_input_tokens_seen": 103668528, "step": 85455 }, { "epoch": 9.517763670787392, "grad_norm": 0.12604674696922302, "learning_rate": 3.535240329702677e-07, "loss": 0.4729, "num_input_tokens_seen": 103674800, "step": 85460 }, { "epoch": 9.51832052567101, "grad_norm": 0.09455209225416183, "learning_rate": 3.5271016362486144e-07, "loss": 0.4618, "num_input_tokens_seen": 103681168, "step": 85465 }, { "epoch": 9.518877380554628, "grad_norm": 0.0882008895277977, "learning_rate": 3.5189722553305436e-07, "loss": 0.4561, "num_input_tokens_seen": 103687440, "step": 85470 }, { "epoch": 9.519434235438245, "grad_norm": 0.12704238295555115, "learning_rate": 3.510852187255553e-07, "loss": 0.4672, "num_input_tokens_seen": 103693360, "step": 85475 }, { "epoch": 9.519991090321863, "grad_norm": 0.0989239439368248, "learning_rate": 3.5027414323304806e-07, "loss": 0.4554, "num_input_tokens_seen": 103699376, "step": 85480 }, { "epoch": 9.520547945205479, "grad_norm": 0.14836116135120392, "learning_rate": 3.494639990861803e-07, "loss": 0.4572, "num_input_tokens_seen": 103704752, "step": 85485 }, { "epoch": 9.521104800089097, "grad_norm": 0.11054564267396927, "learning_rate": 3.4865478631555814e-07, "loss": 0.4568, "num_input_tokens_seen": 103711024, "step": 85490 }, { "epoch": 9.521661654972714, "grad_norm": 0.11096073687076569, "learning_rate": 3.478465049517571e-07, "loss": 0.4514, "num_input_tokens_seen": 103717168, "step": 85495 }, { "epoch": 9.522218509856332, "grad_norm": 0.13105317950248718, "learning_rate": 3.4703915502531657e-07, "loss": 0.4652, "num_input_tokens_seen": 103723312, "step": 85500 }, { "epoch": 9.52277536473995, "grad_norm": 0.08938899636268616, "learning_rate": 3.4623273656674283e-07, "loss": 0.4634, "num_input_tokens_seen": 103729552, "step": 85505 }, { "epoch": 9.523332219623565, "grad_norm": 0.09669037163257599, "learning_rate": 3.454272496065003e-07, "loss": 0.4624, "num_input_tokens_seen": 103735568, "step": 85510 }, { "epoch": 9.523889074507183, "grad_norm": 0.09967681765556335, "learning_rate": 3.4462269417502856e-07, "loss": 0.4764, "num_input_tokens_seen": 103741520, "step": 85515 }, { "epoch": 9.5244459293908, "grad_norm": 0.13654378056526184, "learning_rate": 3.438190703027228e-07, "loss": 0.4629, "num_input_tokens_seen": 103747888, "step": 85520 }, { "epoch": 9.525002784274418, "grad_norm": 0.1363389790058136, "learning_rate": 3.4301637801994466e-07, "loss": 0.4693, "num_input_tokens_seen": 103753648, "step": 85525 }, { "epoch": 9.525559639158036, "grad_norm": 0.10338031500577927, "learning_rate": 3.422146173570284e-07, "loss": 0.467, "num_input_tokens_seen": 103759856, "step": 85530 }, { "epoch": 9.526116494041652, "grad_norm": 0.1455029547214508, "learning_rate": 3.4141378834426083e-07, "loss": 0.4614, "num_input_tokens_seen": 103766064, "step": 85535 }, { "epoch": 9.52667334892527, "grad_norm": 0.13003148138523102, "learning_rate": 3.406138910119039e-07, "loss": 0.4475, "num_input_tokens_seen": 103771568, "step": 85540 }, { "epoch": 9.527230203808887, "grad_norm": 0.12523329257965088, "learning_rate": 3.398149253901778e-07, "loss": 0.4453, "num_input_tokens_seen": 103777584, "step": 85545 }, { "epoch": 9.527787058692505, "grad_norm": 0.10681625455617905, "learning_rate": 3.390168915092723e-07, "loss": 0.4625, "num_input_tokens_seen": 103783920, "step": 85550 }, { "epoch": 9.528343913576123, "grad_norm": 0.12777847051620483, "learning_rate": 3.382197893993411e-07, "loss": 0.4577, "num_input_tokens_seen": 103790032, "step": 85555 }, { "epoch": 9.528900768459739, "grad_norm": 0.1691678911447525, "learning_rate": 3.374236190904961e-07, "loss": 0.4569, "num_input_tokens_seen": 103796144, "step": 85560 }, { "epoch": 9.529457623343356, "grad_norm": 0.09884761273860931, "learning_rate": 3.3662838061282174e-07, "loss": 0.4687, "num_input_tokens_seen": 103802288, "step": 85565 }, { "epoch": 9.530014478226974, "grad_norm": 0.14050672948360443, "learning_rate": 3.3583407399636614e-07, "loss": 0.4669, "num_input_tokens_seen": 103808240, "step": 85570 }, { "epoch": 9.530571333110592, "grad_norm": 0.19780395925045013, "learning_rate": 3.350406992711358e-07, "loss": 0.4711, "num_input_tokens_seen": 103814352, "step": 85575 }, { "epoch": 9.53112818799421, "grad_norm": 0.12991653382778168, "learning_rate": 3.342482564671151e-07, "loss": 0.4544, "num_input_tokens_seen": 103820528, "step": 85580 }, { "epoch": 9.531685042877825, "grad_norm": 0.09319686889648438, "learning_rate": 3.3345674561423846e-07, "loss": 0.464, "num_input_tokens_seen": 103826672, "step": 85585 }, { "epoch": 9.532241897761443, "grad_norm": 0.1294519305229187, "learning_rate": 3.3266616674241526e-07, "loss": 0.4622, "num_input_tokens_seen": 103832912, "step": 85590 }, { "epoch": 9.53279875264506, "grad_norm": 0.12117555737495422, "learning_rate": 3.3187651988151323e-07, "loss": 0.4627, "num_input_tokens_seen": 103838928, "step": 85595 }, { "epoch": 9.533355607528678, "grad_norm": 0.14317746460437775, "learning_rate": 3.310878050613697e-07, "loss": 0.4695, "num_input_tokens_seen": 103844816, "step": 85600 }, { "epoch": 9.533912462412296, "grad_norm": 0.11739719659090042, "learning_rate": 3.3030002231178305e-07, "loss": 0.4702, "num_input_tokens_seen": 103850928, "step": 85605 }, { "epoch": 9.534469317295912, "grad_norm": 0.07570546865463257, "learning_rate": 3.2951317166252107e-07, "loss": 0.4647, "num_input_tokens_seen": 103856912, "step": 85610 }, { "epoch": 9.53502617217953, "grad_norm": 0.10189828276634216, "learning_rate": 3.287272531433128e-07, "loss": 0.4498, "num_input_tokens_seen": 103862768, "step": 85615 }, { "epoch": 9.535583027063147, "grad_norm": 0.11095374822616577, "learning_rate": 3.2794226678384834e-07, "loss": 0.4568, "num_input_tokens_seen": 103869008, "step": 85620 }, { "epoch": 9.536139881946765, "grad_norm": 0.12424483150243759, "learning_rate": 3.271582126137929e-07, "loss": 0.4614, "num_input_tokens_seen": 103875024, "step": 85625 }, { "epoch": 9.536696736830383, "grad_norm": 0.10719799995422363, "learning_rate": 3.263750906627672e-07, "loss": 0.4755, "num_input_tokens_seen": 103881328, "step": 85630 }, { "epoch": 9.537253591713998, "grad_norm": 0.11715815216302872, "learning_rate": 3.255929009603642e-07, "loss": 0.4624, "num_input_tokens_seen": 103887504, "step": 85635 }, { "epoch": 9.537810446597616, "grad_norm": 0.10473217070102692, "learning_rate": 3.2481164353613256e-07, "loss": 0.4503, "num_input_tokens_seen": 103893872, "step": 85640 }, { "epoch": 9.538367301481234, "grad_norm": 0.09837079793214798, "learning_rate": 3.2403131841959036e-07, "loss": 0.4564, "num_input_tokens_seen": 103899888, "step": 85645 }, { "epoch": 9.538924156364851, "grad_norm": 0.11286589503288269, "learning_rate": 3.2325192564022777e-07, "loss": 0.4625, "num_input_tokens_seen": 103905616, "step": 85650 }, { "epoch": 9.53948101124847, "grad_norm": 0.1098444014787674, "learning_rate": 3.224734652274825e-07, "loss": 0.4521, "num_input_tokens_seen": 103911888, "step": 85655 }, { "epoch": 9.540037866132085, "grad_norm": 0.13665996491909027, "learning_rate": 3.216959372107753e-07, "loss": 0.4569, "num_input_tokens_seen": 103918000, "step": 85660 }, { "epoch": 9.540594721015703, "grad_norm": 0.10195349901914597, "learning_rate": 3.2091934161948277e-07, "loss": 0.4561, "num_input_tokens_seen": 103923952, "step": 85665 }, { "epoch": 9.54115157589932, "grad_norm": 0.09163150936365128, "learning_rate": 3.2014367848294257e-07, "loss": 0.4558, "num_input_tokens_seen": 103929872, "step": 85670 }, { "epoch": 9.541708430782938, "grad_norm": 0.09460246562957764, "learning_rate": 3.193689478304646e-07, "loss": 0.4563, "num_input_tokens_seen": 103936176, "step": 85675 }, { "epoch": 9.542265285666556, "grad_norm": 0.10001705586910248, "learning_rate": 3.1859514969132255e-07, "loss": 0.4617, "num_input_tokens_seen": 103942704, "step": 85680 }, { "epoch": 9.542822140550173, "grad_norm": 0.09115134924650192, "learning_rate": 3.1782228409475143e-07, "loss": 0.4589, "num_input_tokens_seen": 103948976, "step": 85685 }, { "epoch": 9.54337899543379, "grad_norm": 0.1424388438463211, "learning_rate": 3.170503510699502e-07, "loss": 0.4513, "num_input_tokens_seen": 103955184, "step": 85690 }, { "epoch": 9.543935850317407, "grad_norm": 0.11054760962724686, "learning_rate": 3.1627935064608703e-07, "loss": 0.4561, "num_input_tokens_seen": 103961616, "step": 85695 }, { "epoch": 9.544492705201025, "grad_norm": 0.11593036353588104, "learning_rate": 3.155092828522971e-07, "loss": 0.4543, "num_input_tokens_seen": 103967088, "step": 85700 }, { "epoch": 9.545049560084642, "grad_norm": 0.17513185739517212, "learning_rate": 3.147401477176681e-07, "loss": 0.4496, "num_input_tokens_seen": 103972944, "step": 85705 }, { "epoch": 9.54560641496826, "grad_norm": 0.12133374065160751, "learning_rate": 3.139719452712658e-07, "loss": 0.4486, "num_input_tokens_seen": 103978064, "step": 85710 }, { "epoch": 9.546163269851876, "grad_norm": 0.09633761644363403, "learning_rate": 3.132046755421114e-07, "loss": 0.4498, "num_input_tokens_seen": 103984016, "step": 85715 }, { "epoch": 9.546720124735494, "grad_norm": 0.11266707628965378, "learning_rate": 3.1243833855919836e-07, "loss": 0.4661, "num_input_tokens_seen": 103990000, "step": 85720 }, { "epoch": 9.547276979619111, "grad_norm": 0.09843283146619797, "learning_rate": 3.116729343514785e-07, "loss": 0.4558, "num_input_tokens_seen": 103995920, "step": 85725 }, { "epoch": 9.547833834502729, "grad_norm": 0.10744691640138626, "learning_rate": 3.109084629478759e-07, "loss": 0.4601, "num_input_tokens_seen": 104002128, "step": 85730 }, { "epoch": 9.548390689386347, "grad_norm": 0.08271806687116623, "learning_rate": 3.101449243772703e-07, "loss": 0.4628, "num_input_tokens_seen": 104008080, "step": 85735 }, { "epoch": 9.548947544269963, "grad_norm": 0.14142973721027374, "learning_rate": 3.093823186685108e-07, "loss": 0.4613, "num_input_tokens_seen": 104014576, "step": 85740 }, { "epoch": 9.54950439915358, "grad_norm": 0.09174802899360657, "learning_rate": 3.086206458504159e-07, "loss": 0.4535, "num_input_tokens_seen": 104020528, "step": 85745 }, { "epoch": 9.550061254037198, "grad_norm": 0.10230614989995956, "learning_rate": 3.078599059517545e-07, "loss": 0.4498, "num_input_tokens_seen": 104026544, "step": 85750 }, { "epoch": 9.550618108920816, "grad_norm": 0.1314247101545334, "learning_rate": 3.0710009900127847e-07, "loss": 0.4748, "num_input_tokens_seen": 104032752, "step": 85755 }, { "epoch": 9.551174963804433, "grad_norm": 0.16045081615447998, "learning_rate": 3.0634122502769536e-07, "loss": 0.4538, "num_input_tokens_seen": 104038992, "step": 85760 }, { "epoch": 9.551731818688049, "grad_norm": 0.13445758819580078, "learning_rate": 3.0558328405967396e-07, "loss": 0.4495, "num_input_tokens_seen": 104045488, "step": 85765 }, { "epoch": 9.552288673571667, "grad_norm": 0.11291205137968063, "learning_rate": 3.048262761258525e-07, "loss": 0.4681, "num_input_tokens_seen": 104051568, "step": 85770 }, { "epoch": 9.552845528455284, "grad_norm": 0.1528312861919403, "learning_rate": 3.0407020125483297e-07, "loss": 0.4737, "num_input_tokens_seen": 104057616, "step": 85775 }, { "epoch": 9.553402383338902, "grad_norm": 0.13328157365322113, "learning_rate": 3.0331505947518714e-07, "loss": 0.4628, "num_input_tokens_seen": 104063248, "step": 85780 }, { "epoch": 9.55395923822252, "grad_norm": 0.09724628180265427, "learning_rate": 3.025608508154393e-07, "loss": 0.4671, "num_input_tokens_seen": 104069488, "step": 85785 }, { "epoch": 9.554516093106137, "grad_norm": 0.10782435536384583, "learning_rate": 3.0180757530408887e-07, "loss": 0.4593, "num_input_tokens_seen": 104075792, "step": 85790 }, { "epoch": 9.555072947989753, "grad_norm": 0.10825908184051514, "learning_rate": 3.0105523296959924e-07, "loss": 0.4657, "num_input_tokens_seen": 104081648, "step": 85795 }, { "epoch": 9.555629802873371, "grad_norm": 0.10448888689279556, "learning_rate": 3.003038238403949e-07, "loss": 0.4713, "num_input_tokens_seen": 104087632, "step": 85800 }, { "epoch": 9.556186657756989, "grad_norm": 0.10389205813407898, "learning_rate": 2.99553347944867e-07, "loss": 0.4513, "num_input_tokens_seen": 104093936, "step": 85805 }, { "epoch": 9.556743512640606, "grad_norm": 0.15579727292060852, "learning_rate": 2.9880380531136787e-07, "loss": 0.4618, "num_input_tokens_seen": 104100304, "step": 85810 }, { "epoch": 9.557300367524224, "grad_norm": 0.09985460340976715, "learning_rate": 2.980551959682221e-07, "loss": 0.4587, "num_input_tokens_seen": 104106416, "step": 85815 }, { "epoch": 9.55785722240784, "grad_norm": 0.1020813137292862, "learning_rate": 2.9730751994370975e-07, "loss": 0.4623, "num_input_tokens_seen": 104112304, "step": 85820 }, { "epoch": 9.558414077291458, "grad_norm": 0.09881195425987244, "learning_rate": 2.9656077726608335e-07, "loss": 0.466, "num_input_tokens_seen": 104118320, "step": 85825 }, { "epoch": 9.558970932175075, "grad_norm": 0.1188153550028801, "learning_rate": 2.958149679635563e-07, "loss": 0.4574, "num_input_tokens_seen": 104124496, "step": 85830 }, { "epoch": 9.559527787058693, "grad_norm": 0.1135096624493599, "learning_rate": 2.9507009206430904e-07, "loss": 0.4599, "num_input_tokens_seen": 104130416, "step": 85835 }, { "epoch": 9.56008464194231, "grad_norm": 0.10264555364847183, "learning_rate": 2.9432614959648555e-07, "loss": 0.4564, "num_input_tokens_seen": 104136400, "step": 85840 }, { "epoch": 9.560641496825927, "grad_norm": 0.11339546740055084, "learning_rate": 2.9358314058819394e-07, "loss": 0.4571, "num_input_tokens_seen": 104142608, "step": 85845 }, { "epoch": 9.561198351709544, "grad_norm": 0.11082930862903595, "learning_rate": 2.928410650675034e-07, "loss": 0.4701, "num_input_tokens_seen": 104148304, "step": 85850 }, { "epoch": 9.561755206593162, "grad_norm": 0.12591394782066345, "learning_rate": 2.9209992306245826e-07, "loss": 0.4571, "num_input_tokens_seen": 104154384, "step": 85855 }, { "epoch": 9.56231206147678, "grad_norm": 0.13420096039772034, "learning_rate": 2.913597146010583e-07, "loss": 0.4709, "num_input_tokens_seen": 104160720, "step": 85860 }, { "epoch": 9.562868916360397, "grad_norm": 0.11194022744894028, "learning_rate": 2.9062043971126997e-07, "loss": 0.4574, "num_input_tokens_seen": 104166928, "step": 85865 }, { "epoch": 9.563425771244013, "grad_norm": 0.10217583924531937, "learning_rate": 2.8988209842102655e-07, "loss": 0.4533, "num_input_tokens_seen": 104173072, "step": 85870 }, { "epoch": 9.56398262612763, "grad_norm": 0.1100354716181755, "learning_rate": 2.8914469075822517e-07, "loss": 0.4663, "num_input_tokens_seen": 104178992, "step": 85875 }, { "epoch": 9.564539481011249, "grad_norm": 0.10929016768932343, "learning_rate": 2.884082167507268e-07, "loss": 0.4793, "num_input_tokens_seen": 104185008, "step": 85880 }, { "epoch": 9.565096335894866, "grad_norm": 0.14031779766082764, "learning_rate": 2.8767267642635933e-07, "loss": 0.4697, "num_input_tokens_seen": 104191184, "step": 85885 }, { "epoch": 9.565653190778484, "grad_norm": 0.09676772356033325, "learning_rate": 2.869380698129143e-07, "loss": 0.4646, "num_input_tokens_seen": 104197360, "step": 85890 }, { "epoch": 9.5662100456621, "grad_norm": 0.07220598310232162, "learning_rate": 2.8620439693814174e-07, "loss": 0.4572, "num_input_tokens_seen": 104202992, "step": 85895 }, { "epoch": 9.566766900545717, "grad_norm": 0.09920968115329742, "learning_rate": 2.8547165782976947e-07, "loss": 0.4662, "num_input_tokens_seen": 104208976, "step": 85900 }, { "epoch": 9.567323755429335, "grad_norm": 0.08940184861421585, "learning_rate": 2.847398525154782e-07, "loss": 0.4624, "num_input_tokens_seen": 104214960, "step": 85905 }, { "epoch": 9.567880610312953, "grad_norm": 0.2167072743177414, "learning_rate": 2.8400898102292073e-07, "loss": 0.4604, "num_input_tokens_seen": 104221136, "step": 85910 }, { "epoch": 9.56843746519657, "grad_norm": 0.10917948931455612, "learning_rate": 2.8327904337970555e-07, "loss": 0.4578, "num_input_tokens_seen": 104227344, "step": 85915 }, { "epoch": 9.568994320080186, "grad_norm": 0.14200359582901, "learning_rate": 2.8255003961342177e-07, "loss": 0.47, "num_input_tokens_seen": 104233520, "step": 85920 }, { "epoch": 9.569551174963804, "grad_norm": 0.1512027233839035, "learning_rate": 2.818219697516056e-07, "loss": 0.4534, "num_input_tokens_seen": 104239824, "step": 85925 }, { "epoch": 9.570108029847422, "grad_norm": 0.09691981971263885, "learning_rate": 2.8109483382177114e-07, "loss": 0.4639, "num_input_tokens_seen": 104246352, "step": 85930 }, { "epoch": 9.57066488473104, "grad_norm": 0.12576483190059662, "learning_rate": 2.8036863185138815e-07, "loss": 0.4663, "num_input_tokens_seen": 104252720, "step": 85935 }, { "epoch": 9.571221739614657, "grad_norm": 0.10506527870893478, "learning_rate": 2.796433638678958e-07, "loss": 0.4489, "num_input_tokens_seen": 104258896, "step": 85940 }, { "epoch": 9.571778594498273, "grad_norm": 0.15227754414081573, "learning_rate": 2.789190298986999e-07, "loss": 0.4689, "num_input_tokens_seen": 104265072, "step": 85945 }, { "epoch": 9.57233544938189, "grad_norm": 0.09582103788852692, "learning_rate": 2.781956299711619e-07, "loss": 0.4608, "num_input_tokens_seen": 104271088, "step": 85950 }, { "epoch": 9.572892304265508, "grad_norm": 0.11467151343822479, "learning_rate": 2.7747316411261827e-07, "loss": 0.4612, "num_input_tokens_seen": 104276656, "step": 85955 }, { "epoch": 9.573449159149126, "grad_norm": 0.10081612318754196, "learning_rate": 2.767516323503694e-07, "loss": 0.4716, "num_input_tokens_seen": 104282800, "step": 85960 }, { "epoch": 9.574006014032744, "grad_norm": 0.10179995000362396, "learning_rate": 2.7603103471166846e-07, "loss": 0.4574, "num_input_tokens_seen": 104288656, "step": 85965 }, { "epoch": 9.57456286891636, "grad_norm": 0.12324347347021103, "learning_rate": 2.7531137122375206e-07, "loss": 0.4692, "num_input_tokens_seen": 104294704, "step": 85970 }, { "epoch": 9.575119723799977, "grad_norm": 0.10125403106212616, "learning_rate": 2.745926419138012e-07, "loss": 0.4602, "num_input_tokens_seen": 104301168, "step": 85975 }, { "epoch": 9.575676578683595, "grad_norm": 0.11398173123598099, "learning_rate": 2.738748468089802e-07, "loss": 0.4519, "num_input_tokens_seen": 104307376, "step": 85980 }, { "epoch": 9.576233433567213, "grad_norm": 0.20134030282497406, "learning_rate": 2.731579859364064e-07, "loss": 0.4614, "num_input_tokens_seen": 104313904, "step": 85985 }, { "epoch": 9.57679028845083, "grad_norm": 0.09805576503276825, "learning_rate": 2.724420593231636e-07, "loss": 0.4668, "num_input_tokens_seen": 104320016, "step": 85990 }, { "epoch": 9.577347143334446, "grad_norm": 0.12923823297023773, "learning_rate": 2.7172706699630234e-07, "loss": 0.4668, "num_input_tokens_seen": 104326224, "step": 85995 }, { "epoch": 9.577903998218064, "grad_norm": 0.11211787909269333, "learning_rate": 2.7101300898284e-07, "loss": 0.4569, "num_input_tokens_seen": 104332656, "step": 86000 }, { "epoch": 9.578460853101681, "grad_norm": 0.10544900596141815, "learning_rate": 2.702998853097549e-07, "loss": 0.4647, "num_input_tokens_seen": 104338768, "step": 86005 }, { "epoch": 9.5790177079853, "grad_norm": 0.06705857068300247, "learning_rate": 2.695876960039867e-07, "loss": 0.4636, "num_input_tokens_seen": 104344400, "step": 86010 }, { "epoch": 9.579574562868917, "grad_norm": 0.12210135906934738, "learning_rate": 2.688764410924499e-07, "loss": 0.457, "num_input_tokens_seen": 104350384, "step": 86015 }, { "epoch": 9.580131417752535, "grad_norm": 0.101657435297966, "learning_rate": 2.681661206020175e-07, "loss": 0.4763, "num_input_tokens_seen": 104355728, "step": 86020 }, { "epoch": 9.58068827263615, "grad_norm": 0.09257350116968155, "learning_rate": 2.674567345595236e-07, "loss": 0.4601, "num_input_tokens_seen": 104362096, "step": 86025 }, { "epoch": 9.581245127519768, "grad_norm": 0.08907628059387207, "learning_rate": 2.6674828299177446e-07, "loss": 0.4505, "num_input_tokens_seen": 104367984, "step": 86030 }, { "epoch": 9.581801982403386, "grad_norm": 0.12124623358249664, "learning_rate": 2.6604076592553764e-07, "loss": 0.4651, "num_input_tokens_seen": 104374096, "step": 86035 }, { "epoch": 9.582358837287003, "grad_norm": 0.10020957887172699, "learning_rate": 2.6533418338754445e-07, "loss": 0.4513, "num_input_tokens_seen": 104380304, "step": 86040 }, { "epoch": 9.582915692170621, "grad_norm": 0.09117043763399124, "learning_rate": 2.646285354044903e-07, "loss": 0.4583, "num_input_tokens_seen": 104386288, "step": 86045 }, { "epoch": 9.583472547054237, "grad_norm": 0.07563722133636475, "learning_rate": 2.6392382200303713e-07, "loss": 0.4584, "num_input_tokens_seen": 104392560, "step": 86050 }, { "epoch": 9.584029401937855, "grad_norm": 0.07182992994785309, "learning_rate": 2.632200432098192e-07, "loss": 0.4672, "num_input_tokens_seen": 104398960, "step": 86055 }, { "epoch": 9.584586256821472, "grad_norm": 0.1303006410598755, "learning_rate": 2.6251719905141527e-07, "loss": 0.4602, "num_input_tokens_seen": 104405072, "step": 86060 }, { "epoch": 9.58514311170509, "grad_norm": 0.10663840919733047, "learning_rate": 2.618152895543874e-07, "loss": 0.4473, "num_input_tokens_seen": 104411376, "step": 86065 }, { "epoch": 9.585699966588708, "grad_norm": 0.11452221125364304, "learning_rate": 2.6111431474525603e-07, "loss": 0.4698, "num_input_tokens_seen": 104417712, "step": 86070 }, { "epoch": 9.586256821472324, "grad_norm": 0.08459604531526566, "learning_rate": 2.604142746505056e-07, "loss": 0.4524, "num_input_tokens_seen": 104423920, "step": 86075 }, { "epoch": 9.586813676355941, "grad_norm": 0.10144316405057907, "learning_rate": 2.597151692965871e-07, "loss": 0.4524, "num_input_tokens_seen": 104429680, "step": 86080 }, { "epoch": 9.587370531239559, "grad_norm": 0.1155371442437172, "learning_rate": 2.5901699870991003e-07, "loss": 0.4645, "num_input_tokens_seen": 104435920, "step": 86085 }, { "epoch": 9.587927386123177, "grad_norm": 0.12108065187931061, "learning_rate": 2.5831976291686153e-07, "loss": 0.4562, "num_input_tokens_seen": 104441936, "step": 86090 }, { "epoch": 9.588484241006794, "grad_norm": 0.09852451086044312, "learning_rate": 2.576234619437762e-07, "loss": 0.4524, "num_input_tokens_seen": 104447312, "step": 86095 }, { "epoch": 9.58904109589041, "grad_norm": 0.0919283777475357, "learning_rate": 2.56928095816969e-07, "loss": 0.4667, "num_input_tokens_seen": 104453424, "step": 86100 }, { "epoch": 9.589597950774028, "grad_norm": 0.16978487372398376, "learning_rate": 2.562336645627134e-07, "loss": 0.4507, "num_input_tokens_seen": 104460144, "step": 86105 }, { "epoch": 9.590154805657646, "grad_norm": 0.1275787502527237, "learning_rate": 2.555401682072439e-07, "loss": 0.465, "num_input_tokens_seen": 104466128, "step": 86110 }, { "epoch": 9.590711660541263, "grad_norm": 0.09255961328744888, "learning_rate": 2.5484760677676466e-07, "loss": 0.4469, "num_input_tokens_seen": 104471600, "step": 86115 }, { "epoch": 9.591268515424881, "grad_norm": 0.12891234457492828, "learning_rate": 2.541559802974436e-07, "loss": 0.4592, "num_input_tokens_seen": 104477552, "step": 86120 }, { "epoch": 9.591825370308497, "grad_norm": 0.12592299282550812, "learning_rate": 2.5346528879540974e-07, "loss": 0.446, "num_input_tokens_seen": 104483760, "step": 86125 }, { "epoch": 9.592382225192114, "grad_norm": 0.10716815292835236, "learning_rate": 2.5277553229676176e-07, "loss": 0.4597, "num_input_tokens_seen": 104490192, "step": 86130 }, { "epoch": 9.592939080075732, "grad_norm": 0.10271681845188141, "learning_rate": 2.520867108275621e-07, "loss": 0.4557, "num_input_tokens_seen": 104495600, "step": 86135 }, { "epoch": 9.59349593495935, "grad_norm": 0.11672937870025635, "learning_rate": 2.513988244138343e-07, "loss": 0.455, "num_input_tokens_seen": 104501328, "step": 86140 }, { "epoch": 9.594052789842967, "grad_norm": 0.11387775093317032, "learning_rate": 2.5071187308156887e-07, "loss": 0.465, "num_input_tokens_seen": 104507312, "step": 86145 }, { "epoch": 9.594609644726585, "grad_norm": 0.13527290523052216, "learning_rate": 2.5002585685672266e-07, "loss": 0.4679, "num_input_tokens_seen": 104513008, "step": 86150 }, { "epoch": 9.595166499610201, "grad_norm": 0.13939835131168365, "learning_rate": 2.4934077576521384e-07, "loss": 0.457, "num_input_tokens_seen": 104518800, "step": 86155 }, { "epoch": 9.595723354493819, "grad_norm": 0.09512209892272949, "learning_rate": 2.4865662983292737e-07, "loss": 0.4593, "num_input_tokens_seen": 104524656, "step": 86160 }, { "epoch": 9.596280209377436, "grad_norm": 0.1325543224811554, "learning_rate": 2.479734190857147e-07, "loss": 0.459, "num_input_tokens_seen": 104530736, "step": 86165 }, { "epoch": 9.596837064261054, "grad_norm": 0.1435776650905609, "learning_rate": 2.472911435493858e-07, "loss": 0.4732, "num_input_tokens_seen": 104536848, "step": 86170 }, { "epoch": 9.597393919144672, "grad_norm": 0.08944854885339737, "learning_rate": 2.4660980324972547e-07, "loss": 0.4478, "num_input_tokens_seen": 104543024, "step": 86175 }, { "epoch": 9.597950774028288, "grad_norm": 0.10283137112855911, "learning_rate": 2.4592939821246886e-07, "loss": 0.456, "num_input_tokens_seen": 104549232, "step": 86180 }, { "epoch": 9.598507628911905, "grad_norm": 0.1058303564786911, "learning_rate": 2.4524992846333137e-07, "loss": 0.4489, "num_input_tokens_seen": 104555536, "step": 86185 }, { "epoch": 9.599064483795523, "grad_norm": 0.10124655812978745, "learning_rate": 2.4457139402797866e-07, "loss": 0.4665, "num_input_tokens_seen": 104561520, "step": 86190 }, { "epoch": 9.59962133867914, "grad_norm": 0.11617008596658707, "learning_rate": 2.4389379493205133e-07, "loss": 0.4569, "num_input_tokens_seen": 104567632, "step": 86195 }, { "epoch": 9.600178193562758, "grad_norm": 0.13150183856487274, "learning_rate": 2.4321713120115386e-07, "loss": 0.4579, "num_input_tokens_seen": 104573552, "step": 86200 }, { "epoch": 9.600735048446374, "grad_norm": 0.09749216586351395, "learning_rate": 2.425414028608491e-07, "loss": 0.4603, "num_input_tokens_seen": 104579760, "step": 86205 }, { "epoch": 9.601291903329992, "grad_norm": 0.15366138517856598, "learning_rate": 2.4186660993666954e-07, "loss": 0.4618, "num_input_tokens_seen": 104586000, "step": 86210 }, { "epoch": 9.60184875821361, "grad_norm": 0.09656006842851639, "learning_rate": 2.4119275245411134e-07, "loss": 0.4545, "num_input_tokens_seen": 104592048, "step": 86215 }, { "epoch": 9.602405613097227, "grad_norm": 0.10520755499601364, "learning_rate": 2.405198304386347e-07, "loss": 0.467, "num_input_tokens_seen": 104598128, "step": 86220 }, { "epoch": 9.602962467980845, "grad_norm": 0.11707563698291779, "learning_rate": 2.398478439156637e-07, "loss": 0.457, "num_input_tokens_seen": 104604368, "step": 86225 }, { "epoch": 9.60351932286446, "grad_norm": 0.13084453344345093, "learning_rate": 2.391767929105865e-07, "loss": 0.4716, "num_input_tokens_seen": 104610544, "step": 86230 }, { "epoch": 9.604076177748079, "grad_norm": 0.13785095512866974, "learning_rate": 2.385066774487632e-07, "loss": 0.4663, "num_input_tokens_seen": 104616944, "step": 86235 }, { "epoch": 9.604633032631696, "grad_norm": 0.102995865046978, "learning_rate": 2.3783749755550977e-07, "loss": 0.4744, "num_input_tokens_seen": 104623088, "step": 86240 }, { "epoch": 9.605189887515314, "grad_norm": 0.08829799294471741, "learning_rate": 2.3716925325610874e-07, "loss": 0.46, "num_input_tokens_seen": 104629136, "step": 86245 }, { "epoch": 9.605746742398932, "grad_norm": 0.12177809327840805, "learning_rate": 2.365019445758093e-07, "loss": 0.4675, "num_input_tokens_seen": 104635216, "step": 86250 }, { "epoch": 9.606303597282547, "grad_norm": 0.10138046741485596, "learning_rate": 2.358355715398247e-07, "loss": 0.4836, "num_input_tokens_seen": 104641264, "step": 86255 }, { "epoch": 9.606860452166165, "grad_norm": 0.12996284663677216, "learning_rate": 2.3517013417333477e-07, "loss": 0.4559, "num_input_tokens_seen": 104646576, "step": 86260 }, { "epoch": 9.607417307049783, "grad_norm": 0.1790664941072464, "learning_rate": 2.3450563250147771e-07, "loss": 0.4469, "num_input_tokens_seen": 104652848, "step": 86265 }, { "epoch": 9.6079741619334, "grad_norm": 0.08889307826757431, "learning_rate": 2.3384206654936403e-07, "loss": 0.4639, "num_input_tokens_seen": 104658960, "step": 86270 }, { "epoch": 9.608531016817018, "grad_norm": 0.10561257600784302, "learning_rate": 2.331794363420653e-07, "loss": 0.459, "num_input_tokens_seen": 104665296, "step": 86275 }, { "epoch": 9.609087871700634, "grad_norm": 0.09519525617361069, "learning_rate": 2.3251774190461706e-07, "loss": 0.4613, "num_input_tokens_seen": 104671504, "step": 86280 }, { "epoch": 9.609644726584252, "grad_norm": 0.11709097027778625, "learning_rate": 2.3185698326201598e-07, "loss": 0.4675, "num_input_tokens_seen": 104677488, "step": 86285 }, { "epoch": 9.61020158146787, "grad_norm": 0.09965652227401733, "learning_rate": 2.3119716043923378e-07, "loss": 0.4648, "num_input_tokens_seen": 104683472, "step": 86290 }, { "epoch": 9.610758436351487, "grad_norm": 0.12348408252000809, "learning_rate": 2.3053827346119772e-07, "loss": 0.4616, "num_input_tokens_seen": 104689616, "step": 86295 }, { "epoch": 9.611315291235105, "grad_norm": 0.07806862145662308, "learning_rate": 2.2988032235280454e-07, "loss": 0.4556, "num_input_tokens_seen": 104695408, "step": 86300 }, { "epoch": 9.61187214611872, "grad_norm": 0.09921599179506302, "learning_rate": 2.2922330713891215e-07, "loss": 0.4648, "num_input_tokens_seen": 104701264, "step": 86305 }, { "epoch": 9.612429001002338, "grad_norm": 0.12074172496795654, "learning_rate": 2.2856722784434514e-07, "loss": 0.4713, "num_input_tokens_seen": 104707344, "step": 86310 }, { "epoch": 9.612985855885956, "grad_norm": 0.10895900428295135, "learning_rate": 2.2791208449389202e-07, "loss": 0.4601, "num_input_tokens_seen": 104712848, "step": 86315 }, { "epoch": 9.613542710769574, "grad_norm": 0.09501168876886368, "learning_rate": 2.2725787711230517e-07, "loss": 0.4554, "num_input_tokens_seen": 104718704, "step": 86320 }, { "epoch": 9.614099565653191, "grad_norm": 0.08575250953435898, "learning_rate": 2.2660460572430376e-07, "loss": 0.467, "num_input_tokens_seen": 104725008, "step": 86325 }, { "epoch": 9.614656420536807, "grad_norm": 0.11387024074792862, "learning_rate": 2.2595227035457356e-07, "loss": 0.4637, "num_input_tokens_seen": 104731344, "step": 86330 }, { "epoch": 9.615213275420425, "grad_norm": 0.100169837474823, "learning_rate": 2.25300871027756e-07, "loss": 0.4667, "num_input_tokens_seen": 104737360, "step": 86335 }, { "epoch": 9.615770130304043, "grad_norm": 0.10051531344652176, "learning_rate": 2.2465040776846748e-07, "loss": 0.4686, "num_input_tokens_seen": 104743376, "step": 86340 }, { "epoch": 9.61632698518766, "grad_norm": 0.08445057272911072, "learning_rate": 2.2400088060128276e-07, "loss": 0.4705, "num_input_tokens_seen": 104749328, "step": 86345 }, { "epoch": 9.616883840071278, "grad_norm": 0.10992182046175003, "learning_rate": 2.2335228955074338e-07, "loss": 0.4593, "num_input_tokens_seen": 104755184, "step": 86350 }, { "epoch": 9.617440694954894, "grad_norm": 0.11729180812835693, "learning_rate": 2.2270463464135472e-07, "loss": 0.4685, "num_input_tokens_seen": 104761392, "step": 86355 }, { "epoch": 9.617997549838512, "grad_norm": 0.10646133869886398, "learning_rate": 2.2205791589758885e-07, "loss": 0.4522, "num_input_tokens_seen": 104767024, "step": 86360 }, { "epoch": 9.61855440472213, "grad_norm": 0.09171483665704727, "learning_rate": 2.21412133343879e-07, "loss": 0.4671, "num_input_tokens_seen": 104773040, "step": 86365 }, { "epoch": 9.619111259605747, "grad_norm": 0.16205330193042755, "learning_rate": 2.2076728700462513e-07, "loss": 0.4504, "num_input_tokens_seen": 104778736, "step": 86370 }, { "epoch": 9.619668114489365, "grad_norm": 0.11044009029865265, "learning_rate": 2.2012337690419383e-07, "loss": 0.4447, "num_input_tokens_seen": 104784848, "step": 86375 }, { "epoch": 9.620224969372982, "grad_norm": 0.12771765887737274, "learning_rate": 2.1948040306691008e-07, "loss": 0.4602, "num_input_tokens_seen": 104790928, "step": 86380 }, { "epoch": 9.620781824256598, "grad_norm": 0.1306832730770111, "learning_rate": 2.1883836551707115e-07, "loss": 0.4591, "num_input_tokens_seen": 104797264, "step": 86385 }, { "epoch": 9.621338679140216, "grad_norm": 0.1284860074520111, "learning_rate": 2.1819726427893262e-07, "loss": 0.4651, "num_input_tokens_seen": 104803600, "step": 86390 }, { "epoch": 9.621895534023833, "grad_norm": 0.1307290941476822, "learning_rate": 2.1755709937671953e-07, "loss": 0.4661, "num_input_tokens_seen": 104809744, "step": 86395 }, { "epoch": 9.622452388907451, "grad_norm": 0.1108284592628479, "learning_rate": 2.1691787083462089e-07, "loss": 0.4614, "num_input_tokens_seen": 104816016, "step": 86400 }, { "epoch": 9.623009243791069, "grad_norm": 0.09138419479131699, "learning_rate": 2.162795786767813e-07, "loss": 0.4613, "num_input_tokens_seen": 104821840, "step": 86405 }, { "epoch": 9.623566098674685, "grad_norm": 0.09691130369901657, "learning_rate": 2.1564222292732583e-07, "loss": 0.4643, "num_input_tokens_seen": 104828016, "step": 86410 }, { "epoch": 9.624122953558302, "grad_norm": 0.12169022113084793, "learning_rate": 2.1500580361033251e-07, "loss": 0.4523, "num_input_tokens_seen": 104833680, "step": 86415 }, { "epoch": 9.62467980844192, "grad_norm": 0.14326728880405426, "learning_rate": 2.1437032074984598e-07, "loss": 0.4551, "num_input_tokens_seen": 104839760, "step": 86420 }, { "epoch": 9.625236663325538, "grad_norm": 0.11522754281759262, "learning_rate": 2.1373577436988034e-07, "loss": 0.4713, "num_input_tokens_seen": 104845872, "step": 86425 }, { "epoch": 9.625793518209155, "grad_norm": 0.13805942237377167, "learning_rate": 2.1310216449440533e-07, "loss": 0.4703, "num_input_tokens_seen": 104851984, "step": 86430 }, { "epoch": 9.626350373092771, "grad_norm": 0.09664060920476913, "learning_rate": 2.1246949114736846e-07, "loss": 0.4526, "num_input_tokens_seen": 104858288, "step": 86435 }, { "epoch": 9.626907227976389, "grad_norm": 0.09551597386598587, "learning_rate": 2.1183775435266728e-07, "loss": 0.456, "num_input_tokens_seen": 104864528, "step": 86440 }, { "epoch": 9.627464082860007, "grad_norm": 0.139996737241745, "learning_rate": 2.1120695413417157e-07, "loss": 0.4631, "num_input_tokens_seen": 104870576, "step": 86445 }, { "epoch": 9.628020937743624, "grad_norm": 0.12283621728420258, "learning_rate": 2.1057709051571784e-07, "loss": 0.4531, "num_input_tokens_seen": 104876368, "step": 86450 }, { "epoch": 9.628577792627242, "grad_norm": 0.12499155104160309, "learning_rate": 2.099481635211037e-07, "loss": 0.4661, "num_input_tokens_seen": 104881776, "step": 86455 }, { "epoch": 9.629134647510858, "grad_norm": 0.0987924262881279, "learning_rate": 2.0932017317409348e-07, "loss": 0.4631, "num_input_tokens_seen": 104887888, "step": 86460 }, { "epoch": 9.629691502394476, "grad_norm": 0.10273099690675735, "learning_rate": 2.0869311949841265e-07, "loss": 0.4754, "num_input_tokens_seen": 104893872, "step": 86465 }, { "epoch": 9.630248357278093, "grad_norm": 0.07208071649074554, "learning_rate": 2.0806700251775057e-07, "loss": 0.4583, "num_input_tokens_seen": 104899888, "step": 86470 }, { "epoch": 9.630805212161711, "grad_norm": 0.12664592266082764, "learning_rate": 2.074418222557689e-07, "loss": 0.4606, "num_input_tokens_seen": 104906032, "step": 86475 }, { "epoch": 9.631362067045329, "grad_norm": 0.08487015217542648, "learning_rate": 2.0681757873608486e-07, "loss": 0.4552, "num_input_tokens_seen": 104912016, "step": 86480 }, { "epoch": 9.631918921928944, "grad_norm": 0.08167300373315811, "learning_rate": 2.061942719822879e-07, "loss": 0.4643, "num_input_tokens_seen": 104918224, "step": 86485 }, { "epoch": 9.632475776812562, "grad_norm": 0.1588016003370285, "learning_rate": 2.0557190201792586e-07, "loss": 0.458, "num_input_tokens_seen": 104923728, "step": 86490 }, { "epoch": 9.63303263169618, "grad_norm": 0.10694170743227005, "learning_rate": 2.0495046886651602e-07, "loss": 0.4597, "num_input_tokens_seen": 104929648, "step": 86495 }, { "epoch": 9.633589486579798, "grad_norm": 0.10141006857156754, "learning_rate": 2.0432997255153686e-07, "loss": 0.454, "num_input_tokens_seen": 104935728, "step": 86500 }, { "epoch": 9.634146341463415, "grad_norm": 0.11948857456445694, "learning_rate": 2.0371041309643346e-07, "loss": 0.4577, "num_input_tokens_seen": 104942032, "step": 86505 }, { "epoch": 9.634703196347033, "grad_norm": 0.1112348884344101, "learning_rate": 2.0309179052461214e-07, "loss": 0.4586, "num_input_tokens_seen": 104948240, "step": 86510 }, { "epoch": 9.635260051230649, "grad_norm": 0.1370307058095932, "learning_rate": 2.024741048594514e-07, "loss": 0.4513, "num_input_tokens_seen": 104954352, "step": 86515 }, { "epoch": 9.635816906114266, "grad_norm": 0.13832952082157135, "learning_rate": 2.0185735612428536e-07, "loss": 0.4575, "num_input_tokens_seen": 104960432, "step": 86520 }, { "epoch": 9.636373760997884, "grad_norm": 0.11781440675258636, "learning_rate": 2.012415443424176e-07, "loss": 0.4526, "num_input_tokens_seen": 104966160, "step": 86525 }, { "epoch": 9.636930615881502, "grad_norm": 0.09450380504131317, "learning_rate": 2.0062666953711561e-07, "loss": 0.4504, "num_input_tokens_seen": 104972208, "step": 86530 }, { "epoch": 9.63748747076512, "grad_norm": 0.11279299110174179, "learning_rate": 2.0001273173161085e-07, "loss": 0.453, "num_input_tokens_seen": 104978128, "step": 86535 }, { "epoch": 9.638044325648735, "grad_norm": 0.14407742023468018, "learning_rate": 1.9939973094910137e-07, "loss": 0.4661, "num_input_tokens_seen": 104984432, "step": 86540 }, { "epoch": 9.638601180532353, "grad_norm": 0.10332709550857544, "learning_rate": 1.9878766721274922e-07, "loss": 0.4584, "num_input_tokens_seen": 104990928, "step": 86545 }, { "epoch": 9.63915803541597, "grad_norm": 0.1319654881954193, "learning_rate": 1.981765405456776e-07, "loss": 0.473, "num_input_tokens_seen": 104997072, "step": 86550 }, { "epoch": 9.639714890299588, "grad_norm": 0.122860848903656, "learning_rate": 1.9756635097097632e-07, "loss": 0.4493, "num_input_tokens_seen": 105002704, "step": 86555 }, { "epoch": 9.640271745183206, "grad_norm": 0.12809458374977112, "learning_rate": 1.9695709851170197e-07, "loss": 0.4615, "num_input_tokens_seen": 105008816, "step": 86560 }, { "epoch": 9.640828600066822, "grad_norm": 0.07905838638544083, "learning_rate": 1.9634878319087224e-07, "loss": 0.4617, "num_input_tokens_seen": 105015312, "step": 86565 }, { "epoch": 9.64138545495044, "grad_norm": 0.1281581073999405, "learning_rate": 1.9574140503147709e-07, "loss": 0.4562, "num_input_tokens_seen": 105021392, "step": 86570 }, { "epoch": 9.641942309834057, "grad_norm": 0.08642873167991638, "learning_rate": 1.9513496405645647e-07, "loss": 0.4578, "num_input_tokens_seen": 105027440, "step": 86575 }, { "epoch": 9.642499164717675, "grad_norm": 0.14109720289707184, "learning_rate": 1.94529460288731e-07, "loss": 0.4623, "num_input_tokens_seen": 105033840, "step": 86580 }, { "epoch": 9.643056019601293, "grad_norm": 0.10356923192739487, "learning_rate": 1.9392489375117396e-07, "loss": 0.4677, "num_input_tokens_seen": 105039984, "step": 86585 }, { "epoch": 9.643612874484909, "grad_norm": 0.14905579388141632, "learning_rate": 1.9332126446663103e-07, "loss": 0.4523, "num_input_tokens_seen": 105046480, "step": 86590 }, { "epoch": 9.644169729368526, "grad_norm": 0.12345454841852188, "learning_rate": 1.9271857245790337e-07, "loss": 0.4586, "num_input_tokens_seen": 105052400, "step": 86595 }, { "epoch": 9.644726584252144, "grad_norm": 0.20005516707897186, "learning_rate": 1.9211681774777003e-07, "loss": 0.4689, "num_input_tokens_seen": 105058448, "step": 86600 }, { "epoch": 9.645283439135762, "grad_norm": 0.16648399829864502, "learning_rate": 1.9151600035896555e-07, "loss": 0.4532, "num_input_tokens_seen": 105064624, "step": 86605 }, { "epoch": 9.64584029401938, "grad_norm": 0.08869937062263489, "learning_rate": 1.9091612031418572e-07, "loss": 0.4686, "num_input_tokens_seen": 105070864, "step": 86610 }, { "epoch": 9.646397148902995, "grad_norm": 0.17442700266838074, "learning_rate": 1.9031717763610123e-07, "loss": 0.4585, "num_input_tokens_seen": 105076912, "step": 86615 }, { "epoch": 9.646954003786613, "grad_norm": 0.14260874688625336, "learning_rate": 1.8971917234734126e-07, "loss": 0.4539, "num_input_tokens_seen": 105083280, "step": 86620 }, { "epoch": 9.64751085867023, "grad_norm": 0.14316977560520172, "learning_rate": 1.8912210447049882e-07, "loss": 0.4823, "num_input_tokens_seen": 105088784, "step": 86625 }, { "epoch": 9.648067713553848, "grad_norm": 0.13335733115673065, "learning_rate": 1.8852597402813366e-07, "loss": 0.468, "num_input_tokens_seen": 105095536, "step": 86630 }, { "epoch": 9.648624568437466, "grad_norm": 0.18258842825889587, "learning_rate": 1.8793078104276663e-07, "loss": 0.4593, "num_input_tokens_seen": 105101968, "step": 86635 }, { "epoch": 9.649181423321082, "grad_norm": 0.1108185276389122, "learning_rate": 1.8733652553689364e-07, "loss": 0.4695, "num_input_tokens_seen": 105108336, "step": 86640 }, { "epoch": 9.6497382782047, "grad_norm": 0.08347567915916443, "learning_rate": 1.867432075329606e-07, "loss": 0.4547, "num_input_tokens_seen": 105113936, "step": 86645 }, { "epoch": 9.650295133088317, "grad_norm": 0.15297333896160126, "learning_rate": 1.8615082705338573e-07, "loss": 0.4651, "num_input_tokens_seen": 105119888, "step": 86650 }, { "epoch": 9.650851987971935, "grad_norm": 0.10043442994356155, "learning_rate": 1.8555938412055385e-07, "loss": 0.4591, "num_input_tokens_seen": 105126064, "step": 86655 }, { "epoch": 9.651408842855552, "grad_norm": 0.1634015142917633, "learning_rate": 1.8496887875681102e-07, "loss": 0.4588, "num_input_tokens_seen": 105132336, "step": 86660 }, { "epoch": 9.651965697739168, "grad_norm": 0.11334718763828278, "learning_rate": 1.8437931098446714e-07, "loss": 0.4646, "num_input_tokens_seen": 105138544, "step": 86665 }, { "epoch": 9.652522552622786, "grad_norm": 0.11873108148574829, "learning_rate": 1.837906808257961e-07, "loss": 0.4539, "num_input_tokens_seen": 105144528, "step": 86670 }, { "epoch": 9.653079407506404, "grad_norm": 0.1299019604921341, "learning_rate": 1.8320298830304394e-07, "loss": 0.4627, "num_input_tokens_seen": 105150704, "step": 86675 }, { "epoch": 9.653636262390021, "grad_norm": 0.13453790545463562, "learning_rate": 1.826162334384096e-07, "loss": 0.4609, "num_input_tokens_seen": 105156752, "step": 86680 }, { "epoch": 9.654193117273639, "grad_norm": 0.14780555665493011, "learning_rate": 1.820304162540698e-07, "loss": 0.4706, "num_input_tokens_seen": 105163088, "step": 86685 }, { "epoch": 9.654749972157255, "grad_norm": 0.14130547642707825, "learning_rate": 1.814455367721485e-07, "loss": 0.4561, "num_input_tokens_seen": 105169136, "step": 86690 }, { "epoch": 9.655306827040873, "grad_norm": 0.11578118056058884, "learning_rate": 1.8086159501475297e-07, "loss": 0.4547, "num_input_tokens_seen": 105174480, "step": 86695 }, { "epoch": 9.65586368192449, "grad_norm": 0.13010075688362122, "learning_rate": 1.8027859100394063e-07, "loss": 0.4663, "num_input_tokens_seen": 105180912, "step": 86700 }, { "epoch": 9.656420536808108, "grad_norm": 0.12773855030536652, "learning_rate": 1.7969652476174381e-07, "loss": 0.4757, "num_input_tokens_seen": 105186928, "step": 86705 }, { "epoch": 9.656977391691726, "grad_norm": 0.10181588679552078, "learning_rate": 1.7911539631015327e-07, "loss": 0.4548, "num_input_tokens_seen": 105193104, "step": 86710 }, { "epoch": 9.657534246575342, "grad_norm": 0.07131107896566391, "learning_rate": 1.785352056711237e-07, "loss": 0.4559, "num_input_tokens_seen": 105199344, "step": 86715 }, { "epoch": 9.65809110145896, "grad_norm": 0.10117550939321518, "learning_rate": 1.7795595286657918e-07, "loss": 0.4526, "num_input_tokens_seen": 105205424, "step": 86720 }, { "epoch": 9.658647956342577, "grad_norm": 0.07418613880872726, "learning_rate": 1.77377637918405e-07, "loss": 0.459, "num_input_tokens_seen": 105211920, "step": 86725 }, { "epoch": 9.659204811226195, "grad_norm": 0.10434666275978088, "learning_rate": 1.7680026084845036e-07, "loss": 0.4622, "num_input_tokens_seen": 105217840, "step": 86730 }, { "epoch": 9.659761666109812, "grad_norm": 0.09029869735240936, "learning_rate": 1.7622382167853392e-07, "loss": 0.4776, "num_input_tokens_seen": 105224208, "step": 86735 }, { "epoch": 9.66031852099343, "grad_norm": 0.10803445428609848, "learning_rate": 1.7564832043043266e-07, "loss": 0.4563, "num_input_tokens_seen": 105230416, "step": 86740 }, { "epoch": 9.660875375877046, "grad_norm": 0.11440546810626984, "learning_rate": 1.7507375712589037e-07, "loss": 0.4609, "num_input_tokens_seen": 105236272, "step": 86745 }, { "epoch": 9.661432230760663, "grad_norm": 0.11442341655492783, "learning_rate": 1.745001317866174e-07, "loss": 0.453, "num_input_tokens_seen": 105242416, "step": 86750 }, { "epoch": 9.661989085644281, "grad_norm": 0.08890542387962341, "learning_rate": 1.739274444342881e-07, "loss": 0.4517, "num_input_tokens_seen": 105248720, "step": 86755 }, { "epoch": 9.662545940527899, "grad_norm": 0.10075290501117706, "learning_rate": 1.7335569509053796e-07, "loss": 0.4553, "num_input_tokens_seen": 105254640, "step": 86760 }, { "epoch": 9.663102795411517, "grad_norm": 0.11296319216489792, "learning_rate": 1.727848837769691e-07, "loss": 0.461, "num_input_tokens_seen": 105260848, "step": 86765 }, { "epoch": 9.663659650295132, "grad_norm": 0.08882396668195724, "learning_rate": 1.722150105151532e-07, "loss": 0.4699, "num_input_tokens_seen": 105267152, "step": 86770 }, { "epoch": 9.66421650517875, "grad_norm": 0.1273309737443924, "learning_rate": 1.716460753266147e-07, "loss": 0.4656, "num_input_tokens_seen": 105273552, "step": 86775 }, { "epoch": 9.664773360062368, "grad_norm": 0.12852130830287933, "learning_rate": 1.710780782328586e-07, "loss": 0.4699, "num_input_tokens_seen": 105279440, "step": 86780 }, { "epoch": 9.665330214945985, "grad_norm": 0.11051283776760101, "learning_rate": 1.7051101925533718e-07, "loss": 0.4765, "num_input_tokens_seen": 105285648, "step": 86785 }, { "epoch": 9.665887069829603, "grad_norm": 0.130393385887146, "learning_rate": 1.699448984154778e-07, "loss": 0.4765, "num_input_tokens_seen": 105291824, "step": 86790 }, { "epoch": 9.666443924713219, "grad_norm": 0.15310591459274292, "learning_rate": 1.6937971573467715e-07, "loss": 0.4541, "num_input_tokens_seen": 105298224, "step": 86795 }, { "epoch": 9.667000779596837, "grad_norm": 0.1290864497423172, "learning_rate": 1.688154712342821e-07, "loss": 0.4603, "num_input_tokens_seen": 105303312, "step": 86800 }, { "epoch": 9.667557634480454, "grad_norm": 0.1675671637058258, "learning_rate": 1.6825216493561446e-07, "loss": 0.455, "num_input_tokens_seen": 105309264, "step": 86805 }, { "epoch": 9.668114489364072, "grad_norm": 0.17992186546325684, "learning_rate": 1.6768979685995445e-07, "loss": 0.4569, "num_input_tokens_seen": 105315280, "step": 86810 }, { "epoch": 9.66867134424769, "grad_norm": 0.11979738622903824, "learning_rate": 1.6712836702855729e-07, "loss": 0.4522, "num_input_tokens_seen": 105321776, "step": 86815 }, { "epoch": 9.669228199131306, "grad_norm": 0.1456374078989029, "learning_rate": 1.665678754626282e-07, "loss": 0.4627, "num_input_tokens_seen": 105327728, "step": 86820 }, { "epoch": 9.669785054014923, "grad_norm": 0.11457028985023499, "learning_rate": 1.660083221833475e-07, "loss": 0.453, "num_input_tokens_seen": 105334128, "step": 86825 }, { "epoch": 9.670341908898541, "grad_norm": 0.09007847309112549, "learning_rate": 1.6544970721185936e-07, "loss": 0.4523, "num_input_tokens_seen": 105340368, "step": 86830 }, { "epoch": 9.670898763782159, "grad_norm": 0.11172374337911606, "learning_rate": 1.648920305692636e-07, "loss": 0.4743, "num_input_tokens_seen": 105346416, "step": 86835 }, { "epoch": 9.671455618665776, "grad_norm": 0.14954493939876556, "learning_rate": 1.643352922766378e-07, "loss": 0.4535, "num_input_tokens_seen": 105352432, "step": 86840 }, { "epoch": 9.672012473549392, "grad_norm": 0.12716613709926605, "learning_rate": 1.637794923550151e-07, "loss": 0.4603, "num_input_tokens_seen": 105358800, "step": 86845 }, { "epoch": 9.67256932843301, "grad_norm": 0.09444640576839447, "learning_rate": 1.6322463082539262e-07, "loss": 0.4642, "num_input_tokens_seen": 105364816, "step": 86850 }, { "epoch": 9.673126183316628, "grad_norm": 0.1540774405002594, "learning_rate": 1.6267070770873971e-07, "loss": 0.4747, "num_input_tokens_seen": 105371280, "step": 86855 }, { "epoch": 9.673683038200245, "grad_norm": 0.14235223829746246, "learning_rate": 1.621177230259785e-07, "loss": 0.4677, "num_input_tokens_seen": 105377456, "step": 86860 }, { "epoch": 9.674239893083863, "grad_norm": 0.11504732072353363, "learning_rate": 1.6156567679800893e-07, "loss": 0.4469, "num_input_tokens_seen": 105383824, "step": 86865 }, { "epoch": 9.67479674796748, "grad_norm": 0.1485658884048462, "learning_rate": 1.6101456904568656e-07, "loss": 0.4693, "num_input_tokens_seen": 105389680, "step": 86870 }, { "epoch": 9.675353602851096, "grad_norm": 0.1424107402563095, "learning_rate": 1.604643997898364e-07, "loss": 0.4559, "num_input_tokens_seen": 105396080, "step": 86875 }, { "epoch": 9.675910457734714, "grad_norm": 0.1308627724647522, "learning_rate": 1.5991516905124183e-07, "loss": 0.4637, "num_input_tokens_seen": 105402128, "step": 86880 }, { "epoch": 9.676467312618332, "grad_norm": 0.12344394624233246, "learning_rate": 1.5936687685065565e-07, "loss": 0.4558, "num_input_tokens_seen": 105408560, "step": 86885 }, { "epoch": 9.67702416750195, "grad_norm": 0.15596409142017365, "learning_rate": 1.5881952320879469e-07, "loss": 0.446, "num_input_tokens_seen": 105415024, "step": 86890 }, { "epoch": 9.677581022385567, "grad_norm": 0.09020870923995972, "learning_rate": 1.5827310814633955e-07, "loss": 0.464, "num_input_tokens_seen": 105421168, "step": 86895 }, { "epoch": 9.678137877269183, "grad_norm": 0.07871520519256592, "learning_rate": 1.577276316839349e-07, "loss": 0.4506, "num_input_tokens_seen": 105427376, "step": 86900 }, { "epoch": 9.6786947321528, "grad_norm": 0.09439229220151901, "learning_rate": 1.5718309384219197e-07, "loss": 0.4721, "num_input_tokens_seen": 105433392, "step": 86905 }, { "epoch": 9.679251587036418, "grad_norm": 0.11935946345329285, "learning_rate": 1.5663949464168325e-07, "loss": 0.4681, "num_input_tokens_seen": 105439344, "step": 86910 }, { "epoch": 9.679808441920036, "grad_norm": 0.12260151654481888, "learning_rate": 1.560968341029506e-07, "loss": 0.465, "num_input_tokens_seen": 105445520, "step": 86915 }, { "epoch": 9.680365296803654, "grad_norm": 0.12144285440444946, "learning_rate": 1.5555511224649432e-07, "loss": 0.4646, "num_input_tokens_seen": 105451472, "step": 86920 }, { "epoch": 9.68092215168727, "grad_norm": 0.19534288346767426, "learning_rate": 1.550143290927869e-07, "loss": 0.4577, "num_input_tokens_seen": 105457584, "step": 86925 }, { "epoch": 9.681479006570887, "grad_norm": 0.08236874639987946, "learning_rate": 1.5447448466225368e-07, "loss": 0.4659, "num_input_tokens_seen": 105463664, "step": 86930 }, { "epoch": 9.682035861454505, "grad_norm": 0.13170941174030304, "learning_rate": 1.5393557897529776e-07, "loss": 0.4657, "num_input_tokens_seen": 105469872, "step": 86935 }, { "epoch": 9.682592716338123, "grad_norm": 0.09845327585935593, "learning_rate": 1.5339761205228065e-07, "loss": 0.4577, "num_input_tokens_seen": 105476080, "step": 86940 }, { "epoch": 9.68314957122174, "grad_norm": 0.1172047033905983, "learning_rate": 1.528605839135222e-07, "loss": 0.4658, "num_input_tokens_seen": 105481968, "step": 86945 }, { "epoch": 9.683706426105356, "grad_norm": 0.10172927379608154, "learning_rate": 1.5232449457932286e-07, "loss": 0.4531, "num_input_tokens_seen": 105488144, "step": 86950 }, { "epoch": 9.684263280988974, "grad_norm": 0.14016684889793396, "learning_rate": 1.517893440699275e-07, "loss": 0.468, "num_input_tokens_seen": 105493680, "step": 86955 }, { "epoch": 9.684820135872592, "grad_norm": 0.10859613120555878, "learning_rate": 1.5125513240556445e-07, "loss": 0.4573, "num_input_tokens_seen": 105499696, "step": 86960 }, { "epoch": 9.68537699075621, "grad_norm": 0.135580912232399, "learning_rate": 1.5072185960641194e-07, "loss": 0.4661, "num_input_tokens_seen": 105505712, "step": 86965 }, { "epoch": 9.685933845639827, "grad_norm": 0.13195858895778656, "learning_rate": 1.5018952569262058e-07, "loss": 0.4423, "num_input_tokens_seen": 105512048, "step": 86970 }, { "epoch": 9.686490700523443, "grad_norm": 0.08179415762424469, "learning_rate": 1.4965813068430479e-07, "loss": 0.4573, "num_input_tokens_seen": 105518096, "step": 86975 }, { "epoch": 9.68704755540706, "grad_norm": 0.10882316529750824, "learning_rate": 1.4912767460154297e-07, "loss": 0.4569, "num_input_tokens_seen": 105524144, "step": 86980 }, { "epoch": 9.687604410290678, "grad_norm": 0.12567351758480072, "learning_rate": 1.4859815746437467e-07, "loss": 0.4627, "num_input_tokens_seen": 105530448, "step": 86985 }, { "epoch": 9.688161265174296, "grad_norm": 0.11741632968187332, "learning_rate": 1.4806957929280884e-07, "loss": 0.4652, "num_input_tokens_seen": 105536368, "step": 86990 }, { "epoch": 9.688718120057914, "grad_norm": 0.10207019746303558, "learning_rate": 1.4754194010681565e-07, "loss": 0.4488, "num_input_tokens_seen": 105542416, "step": 86995 }, { "epoch": 9.68927497494153, "grad_norm": 0.12959881126880646, "learning_rate": 1.4701523992633192e-07, "loss": 0.4578, "num_input_tokens_seen": 105548464, "step": 87000 }, { "epoch": 9.689831829825147, "grad_norm": 0.10620080679655075, "learning_rate": 1.464894787712584e-07, "loss": 0.4515, "num_input_tokens_seen": 105554544, "step": 87005 }, { "epoch": 9.690388684708765, "grad_norm": 0.10513003915548325, "learning_rate": 1.4596465666145975e-07, "loss": 0.4599, "num_input_tokens_seen": 105560464, "step": 87010 }, { "epoch": 9.690945539592382, "grad_norm": 0.14359989762306213, "learning_rate": 1.4544077361676178e-07, "loss": 0.4574, "num_input_tokens_seen": 105566608, "step": 87015 }, { "epoch": 9.691502394476, "grad_norm": 0.13140152394771576, "learning_rate": 1.449178296569653e-07, "loss": 0.4672, "num_input_tokens_seen": 105572336, "step": 87020 }, { "epoch": 9.692059249359616, "grad_norm": 0.12366272509098053, "learning_rate": 1.44395824801824e-07, "loss": 0.462, "num_input_tokens_seen": 105578480, "step": 87025 }, { "epoch": 9.692616104243234, "grad_norm": 0.10420503467321396, "learning_rate": 1.4387475907106096e-07, "loss": 0.4634, "num_input_tokens_seen": 105584496, "step": 87030 }, { "epoch": 9.693172959126851, "grad_norm": 0.11487070471048355, "learning_rate": 1.43354632484366e-07, "loss": 0.4564, "num_input_tokens_seen": 105590256, "step": 87035 }, { "epoch": 9.693729814010469, "grad_norm": 0.0955580398440361, "learning_rate": 1.4283544506139e-07, "loss": 0.4677, "num_input_tokens_seen": 105596272, "step": 87040 }, { "epoch": 9.694286668894087, "grad_norm": 0.11086307466030121, "learning_rate": 1.4231719682175072e-07, "loss": 0.4535, "num_input_tokens_seen": 105602416, "step": 87045 }, { "epoch": 9.694843523777703, "grad_norm": 0.10280624777078629, "learning_rate": 1.4179988778502685e-07, "loss": 0.4588, "num_input_tokens_seen": 105608112, "step": 87050 }, { "epoch": 9.69540037866132, "grad_norm": 0.1319594383239746, "learning_rate": 1.412835179707639e-07, "loss": 0.4582, "num_input_tokens_seen": 105614224, "step": 87055 }, { "epoch": 9.695957233544938, "grad_norm": 0.1450973004102707, "learning_rate": 1.4076808739847403e-07, "loss": 0.4697, "num_input_tokens_seen": 105620208, "step": 87060 }, { "epoch": 9.696514088428556, "grad_norm": 0.14529410004615784, "learning_rate": 1.4025359608763334e-07, "loss": 0.4635, "num_input_tokens_seen": 105626352, "step": 87065 }, { "epoch": 9.697070943312173, "grad_norm": 0.12043917179107666, "learning_rate": 1.3974004405767628e-07, "loss": 0.4545, "num_input_tokens_seen": 105631984, "step": 87070 }, { "epoch": 9.69762779819579, "grad_norm": 0.1406434178352356, "learning_rate": 1.3922743132800952e-07, "loss": 0.4556, "num_input_tokens_seen": 105638192, "step": 87075 }, { "epoch": 9.698184653079407, "grad_norm": 0.13752682507038116, "learning_rate": 1.3871575791800372e-07, "loss": 0.4561, "num_input_tokens_seen": 105644176, "step": 87080 }, { "epoch": 9.698741507963025, "grad_norm": 0.18298783898353577, "learning_rate": 1.3820502384698508e-07, "loss": 0.464, "num_input_tokens_seen": 105650448, "step": 87085 }, { "epoch": 9.699298362846642, "grad_norm": 0.12203812599182129, "learning_rate": 1.376952291342548e-07, "loss": 0.4596, "num_input_tokens_seen": 105656016, "step": 87090 }, { "epoch": 9.69985521773026, "grad_norm": 0.10903103649616241, "learning_rate": 1.371863737990725e-07, "loss": 0.4447, "num_input_tokens_seen": 105662160, "step": 87095 }, { "epoch": 9.700412072613878, "grad_norm": 0.1075412780046463, "learning_rate": 1.3667845786066723e-07, "loss": 0.4603, "num_input_tokens_seen": 105668272, "step": 87100 }, { "epoch": 9.700968927497494, "grad_norm": 0.10360028594732285, "learning_rate": 1.361714813382292e-07, "loss": 0.4605, "num_input_tokens_seen": 105674160, "step": 87105 }, { "epoch": 9.701525782381111, "grad_norm": 0.09183783829212189, "learning_rate": 1.3566544425091253e-07, "loss": 0.466, "num_input_tokens_seen": 105679984, "step": 87110 }, { "epoch": 9.702082637264729, "grad_norm": 0.09071708470582962, "learning_rate": 1.3516034661783527e-07, "loss": 0.459, "num_input_tokens_seen": 105686320, "step": 87115 }, { "epoch": 9.702639492148347, "grad_norm": 0.14061930775642395, "learning_rate": 1.3465618845808493e-07, "loss": 0.4733, "num_input_tokens_seen": 105692592, "step": 87120 }, { "epoch": 9.703196347031964, "grad_norm": 0.10624715685844421, "learning_rate": 1.3415296979071012e-07, "loss": 0.4385, "num_input_tokens_seen": 105698928, "step": 87125 }, { "epoch": 9.70375320191558, "grad_norm": 0.15028469264507294, "learning_rate": 1.336506906347207e-07, "loss": 0.4672, "num_input_tokens_seen": 105704176, "step": 87130 }, { "epoch": 9.704310056799198, "grad_norm": 0.08915798366069794, "learning_rate": 1.3314935100909586e-07, "loss": 0.4639, "num_input_tokens_seen": 105710512, "step": 87135 }, { "epoch": 9.704866911682815, "grad_norm": 0.1367732137441635, "learning_rate": 1.3264895093277885e-07, "loss": 0.4676, "num_input_tokens_seen": 105716784, "step": 87140 }, { "epoch": 9.705423766566433, "grad_norm": 0.0771702229976654, "learning_rate": 1.3214949042467395e-07, "loss": 0.464, "num_input_tokens_seen": 105722544, "step": 87145 }, { "epoch": 9.70598062145005, "grad_norm": 0.10715470463037491, "learning_rate": 1.3165096950365774e-07, "loss": 0.4509, "num_input_tokens_seen": 105728752, "step": 87150 }, { "epoch": 9.706537476333667, "grad_norm": 0.08461153507232666, "learning_rate": 1.311533881885596e-07, "loss": 0.4641, "num_input_tokens_seen": 105734640, "step": 87155 }, { "epoch": 9.707094331217284, "grad_norm": 0.09590578079223633, "learning_rate": 1.3065674649818395e-07, "loss": 0.4612, "num_input_tokens_seen": 105740752, "step": 87160 }, { "epoch": 9.707651186100902, "grad_norm": 0.10350990295410156, "learning_rate": 1.3016104445129351e-07, "loss": 0.472, "num_input_tokens_seen": 105746992, "step": 87165 }, { "epoch": 9.70820804098452, "grad_norm": 0.10617511719465256, "learning_rate": 1.296662820666178e-07, "loss": 0.4626, "num_input_tokens_seen": 105752560, "step": 87170 }, { "epoch": 9.708764895868137, "grad_norm": 0.1447012722492218, "learning_rate": 1.2917245936285294e-07, "loss": 0.4485, "num_input_tokens_seen": 105758800, "step": 87175 }, { "epoch": 9.709321750751753, "grad_norm": 0.12793779373168945, "learning_rate": 1.2867957635865345e-07, "loss": 0.4577, "num_input_tokens_seen": 105764688, "step": 87180 }, { "epoch": 9.709878605635371, "grad_norm": 0.11293777078390121, "learning_rate": 1.2818763307264337e-07, "loss": 0.462, "num_input_tokens_seen": 105770896, "step": 87185 }, { "epoch": 9.710435460518989, "grad_norm": 0.138711616396904, "learning_rate": 1.2769662952341055e-07, "loss": 0.4497, "num_input_tokens_seen": 105777072, "step": 87190 }, { "epoch": 9.710992315402606, "grad_norm": 0.09738855063915253, "learning_rate": 1.2720656572950685e-07, "loss": 0.4589, "num_input_tokens_seen": 105783440, "step": 87195 }, { "epoch": 9.711549170286224, "grad_norm": 0.11301721632480621, "learning_rate": 1.26717441709448e-07, "loss": 0.4534, "num_input_tokens_seen": 105789200, "step": 87200 }, { "epoch": 9.712106025169842, "grad_norm": 0.09147264808416367, "learning_rate": 1.2622925748171366e-07, "loss": 0.4635, "num_input_tokens_seen": 105795248, "step": 87205 }, { "epoch": 9.712662880053458, "grad_norm": 0.0839930921792984, "learning_rate": 1.257420130647502e-07, "loss": 0.4663, "num_input_tokens_seen": 105801104, "step": 87210 }, { "epoch": 9.713219734937075, "grad_norm": 0.10282839834690094, "learning_rate": 1.2525570847696787e-07, "loss": 0.4545, "num_input_tokens_seen": 105807120, "step": 87215 }, { "epoch": 9.713776589820693, "grad_norm": 0.09795814007520676, "learning_rate": 1.2477034373673814e-07, "loss": 0.4584, "num_input_tokens_seen": 105813104, "step": 87220 }, { "epoch": 9.71433344470431, "grad_norm": 0.17318449914455414, "learning_rate": 1.2428591886240183e-07, "loss": 0.4591, "num_input_tokens_seen": 105819280, "step": 87225 }, { "epoch": 9.714890299587928, "grad_norm": 0.14748018980026245, "learning_rate": 1.2380243387226097e-07, "loss": 0.4709, "num_input_tokens_seen": 105825296, "step": 87230 }, { "epoch": 9.715447154471544, "grad_norm": 0.09873494505882263, "learning_rate": 1.2331988878458712e-07, "loss": 0.471, "num_input_tokens_seen": 105831120, "step": 87235 }, { "epoch": 9.716004009355162, "grad_norm": 0.12000656872987747, "learning_rate": 1.2283828361760452e-07, "loss": 0.4637, "num_input_tokens_seen": 105837136, "step": 87240 }, { "epoch": 9.71656086423878, "grad_norm": 0.10985289514064789, "learning_rate": 1.2235761838951809e-07, "loss": 0.467, "num_input_tokens_seen": 105843152, "step": 87245 }, { "epoch": 9.717117719122397, "grad_norm": 0.12147693336009979, "learning_rate": 1.2187789311848274e-07, "loss": 0.4781, "num_input_tokens_seen": 105849136, "step": 87250 }, { "epoch": 9.717674574006015, "grad_norm": 0.09576667100191116, "learning_rate": 1.2139910782262842e-07, "loss": 0.4597, "num_input_tokens_seen": 105855472, "step": 87255 }, { "epoch": 9.71823142888963, "grad_norm": 0.12167155742645264, "learning_rate": 1.2092126252004342e-07, "loss": 0.466, "num_input_tokens_seen": 105860976, "step": 87260 }, { "epoch": 9.718788283773248, "grad_norm": 0.10301671177148819, "learning_rate": 1.2044435722878001e-07, "loss": 0.4763, "num_input_tokens_seen": 105867280, "step": 87265 }, { "epoch": 9.719345138656866, "grad_norm": 0.11699049919843674, "learning_rate": 1.1996839196685983e-07, "loss": 0.4594, "num_input_tokens_seen": 105872752, "step": 87270 }, { "epoch": 9.719901993540484, "grad_norm": 0.10347721725702286, "learning_rate": 1.1949336675226574e-07, "loss": 0.4527, "num_input_tokens_seen": 105878704, "step": 87275 }, { "epoch": 9.720458848424101, "grad_norm": 0.13563013076782227, "learning_rate": 1.1901928160294729e-07, "loss": 0.4633, "num_input_tokens_seen": 105885008, "step": 87280 }, { "epoch": 9.721015703307717, "grad_norm": 0.12580814957618713, "learning_rate": 1.1854613653681512e-07, "loss": 0.4705, "num_input_tokens_seen": 105890512, "step": 87285 }, { "epoch": 9.721572558191335, "grad_norm": 0.09583701938390732, "learning_rate": 1.1807393157174384e-07, "loss": 0.4668, "num_input_tokens_seen": 105896688, "step": 87290 }, { "epoch": 9.722129413074953, "grad_norm": 0.1554556041955948, "learning_rate": 1.1760266672558028e-07, "loss": 0.4592, "num_input_tokens_seen": 105902512, "step": 87295 }, { "epoch": 9.72268626795857, "grad_norm": 0.10907519608736038, "learning_rate": 1.1713234201612689e-07, "loss": 0.4497, "num_input_tokens_seen": 105908496, "step": 87300 }, { "epoch": 9.723243122842188, "grad_norm": 0.2006024867296219, "learning_rate": 1.1666295746115275e-07, "loss": 0.4626, "num_input_tokens_seen": 105914576, "step": 87305 }, { "epoch": 9.723799977725804, "grad_norm": 0.13170039653778076, "learning_rate": 1.1619451307839646e-07, "loss": 0.471, "num_input_tokens_seen": 105921072, "step": 87310 }, { "epoch": 9.724356832609422, "grad_norm": 0.1553492248058319, "learning_rate": 1.1572700888555499e-07, "loss": 0.4557, "num_input_tokens_seen": 105927120, "step": 87315 }, { "epoch": 9.72491368749304, "grad_norm": 0.12186852097511292, "learning_rate": 1.1526044490029475e-07, "loss": 0.4485, "num_input_tokens_seen": 105933200, "step": 87320 }, { "epoch": 9.725470542376657, "grad_norm": 0.10808232426643372, "learning_rate": 1.1479482114023777e-07, "loss": 0.4548, "num_input_tokens_seen": 105939248, "step": 87325 }, { "epoch": 9.726027397260275, "grad_norm": 0.17228278517723083, "learning_rate": 1.1433013762298384e-07, "loss": 0.4652, "num_input_tokens_seen": 105945296, "step": 87330 }, { "epoch": 9.72658425214389, "grad_norm": 0.1365436315536499, "learning_rate": 1.1386639436608836e-07, "loss": 0.4713, "num_input_tokens_seen": 105951216, "step": 87335 }, { "epoch": 9.727141107027508, "grad_norm": 0.10836121439933777, "learning_rate": 1.1340359138707068e-07, "loss": 0.4508, "num_input_tokens_seen": 105957392, "step": 87340 }, { "epoch": 9.727697961911126, "grad_norm": 0.12756961584091187, "learning_rate": 1.1294172870341679e-07, "loss": 0.449, "num_input_tokens_seen": 105963504, "step": 87345 }, { "epoch": 9.728254816794744, "grad_norm": 0.10824481397867203, "learning_rate": 1.124808063325794e-07, "loss": 0.4597, "num_input_tokens_seen": 105969296, "step": 87350 }, { "epoch": 9.728811671678361, "grad_norm": 0.09285304695367813, "learning_rate": 1.1202082429197513e-07, "loss": 0.4635, "num_input_tokens_seen": 105975472, "step": 87355 }, { "epoch": 9.729368526561977, "grad_norm": 0.11569336801767349, "learning_rate": 1.1156178259898175e-07, "loss": 0.4615, "num_input_tokens_seen": 105981520, "step": 87360 }, { "epoch": 9.729925381445595, "grad_norm": 0.11289923638105392, "learning_rate": 1.1110368127094095e-07, "loss": 0.4701, "num_input_tokens_seen": 105987088, "step": 87365 }, { "epoch": 9.730482236329212, "grad_norm": 0.10133538395166397, "learning_rate": 1.1064652032516387e-07, "loss": 0.4651, "num_input_tokens_seen": 105992944, "step": 87370 }, { "epoch": 9.73103909121283, "grad_norm": 0.11589323729276657, "learning_rate": 1.1019029977892281e-07, "loss": 0.4712, "num_input_tokens_seen": 105998832, "step": 87375 }, { "epoch": 9.731595946096448, "grad_norm": 0.09369021654129028, "learning_rate": 1.0973501964945675e-07, "loss": 0.4641, "num_input_tokens_seen": 106004976, "step": 87380 }, { "epoch": 9.732152800980064, "grad_norm": 0.13361084461212158, "learning_rate": 1.0928067995396585e-07, "loss": 0.4634, "num_input_tokens_seen": 106010768, "step": 87385 }, { "epoch": 9.732709655863681, "grad_norm": 0.11222603917121887, "learning_rate": 1.088272807096169e-07, "loss": 0.4655, "num_input_tokens_seen": 106017040, "step": 87390 }, { "epoch": 9.733266510747299, "grad_norm": 0.11269568651914597, "learning_rate": 1.0837482193354065e-07, "loss": 0.4496, "num_input_tokens_seen": 106022576, "step": 87395 }, { "epoch": 9.733823365630917, "grad_norm": 0.14175157248973846, "learning_rate": 1.0792330364283454e-07, "loss": 0.4602, "num_input_tokens_seen": 106028368, "step": 87400 }, { "epoch": 9.734380220514534, "grad_norm": 0.119621142745018, "learning_rate": 1.0747272585455437e-07, "loss": 0.4588, "num_input_tokens_seen": 106034672, "step": 87405 }, { "epoch": 9.73493707539815, "grad_norm": 0.09820657968521118, "learning_rate": 1.0702308858572819e-07, "loss": 0.4641, "num_input_tokens_seen": 106040816, "step": 87410 }, { "epoch": 9.735493930281768, "grad_norm": 0.17010660469532013, "learning_rate": 1.0657439185334239e-07, "loss": 0.4601, "num_input_tokens_seen": 106047152, "step": 87415 }, { "epoch": 9.736050785165386, "grad_norm": 0.13455219566822052, "learning_rate": 1.0612663567435289e-07, "loss": 0.4627, "num_input_tokens_seen": 106053232, "step": 87420 }, { "epoch": 9.736607640049003, "grad_norm": 0.1007005050778389, "learning_rate": 1.0567982006567389e-07, "loss": 0.4673, "num_input_tokens_seen": 106059376, "step": 87425 }, { "epoch": 9.737164494932621, "grad_norm": 0.15100990235805511, "learning_rate": 1.0523394504418915e-07, "loss": 0.4701, "num_input_tokens_seen": 106065584, "step": 87430 }, { "epoch": 9.737721349816239, "grad_norm": 0.16748668253421783, "learning_rate": 1.0478901062674351e-07, "loss": 0.4583, "num_input_tokens_seen": 106071088, "step": 87435 }, { "epoch": 9.738278204699855, "grad_norm": 0.10404985398054123, "learning_rate": 1.043450168301513e-07, "loss": 0.4673, "num_input_tokens_seen": 106077392, "step": 87440 }, { "epoch": 9.738835059583472, "grad_norm": 0.09951795637607574, "learning_rate": 1.0390196367118521e-07, "loss": 0.4689, "num_input_tokens_seen": 106083344, "step": 87445 }, { "epoch": 9.73939191446709, "grad_norm": 0.12921884655952454, "learning_rate": 1.034598511665874e-07, "loss": 0.468, "num_input_tokens_seen": 106089328, "step": 87450 }, { "epoch": 9.739948769350708, "grad_norm": 0.14786697924137115, "learning_rate": 1.030186793330612e-07, "loss": 0.4481, "num_input_tokens_seen": 106094928, "step": 87455 }, { "epoch": 9.740505624234325, "grad_norm": 0.09504539519548416, "learning_rate": 1.0257844818727658e-07, "loss": 0.4539, "num_input_tokens_seen": 106101040, "step": 87460 }, { "epoch": 9.741062479117941, "grad_norm": 0.12129417061805725, "learning_rate": 1.021391577458647e-07, "loss": 0.4399, "num_input_tokens_seen": 106107632, "step": 87465 }, { "epoch": 9.741619334001559, "grad_norm": 0.1129673421382904, "learning_rate": 1.0170080802542336e-07, "loss": 0.4601, "num_input_tokens_seen": 106113328, "step": 87470 }, { "epoch": 9.742176188885177, "grad_norm": 0.09300617128610611, "learning_rate": 1.0126339904251992e-07, "loss": 0.4551, "num_input_tokens_seen": 106119344, "step": 87475 }, { "epoch": 9.742733043768794, "grad_norm": 0.09090938419103622, "learning_rate": 1.0082693081367445e-07, "loss": 0.462, "num_input_tokens_seen": 106125584, "step": 87480 }, { "epoch": 9.743289898652412, "grad_norm": 0.11253348737955093, "learning_rate": 1.003914033553821e-07, "loss": 0.4531, "num_input_tokens_seen": 106131568, "step": 87485 }, { "epoch": 9.743846753536028, "grad_norm": 0.1144389882683754, "learning_rate": 9.995681668409362e-08, "loss": 0.4723, "num_input_tokens_seen": 106137168, "step": 87490 }, { "epoch": 9.744403608419645, "grad_norm": 0.14848178625106812, "learning_rate": 9.952317081623752e-08, "loss": 0.4571, "num_input_tokens_seen": 106143440, "step": 87495 }, { "epoch": 9.744960463303263, "grad_norm": 0.11921308189630508, "learning_rate": 9.909046576818959e-08, "loss": 0.4702, "num_input_tokens_seen": 106149840, "step": 87500 }, { "epoch": 9.74551731818688, "grad_norm": 0.09574699401855469, "learning_rate": 9.865870155630618e-08, "loss": 0.4723, "num_input_tokens_seen": 106155344, "step": 87505 }, { "epoch": 9.746074173070499, "grad_norm": 0.15257646143436432, "learning_rate": 9.822787819689371e-08, "loss": 0.4529, "num_input_tokens_seen": 106161392, "step": 87510 }, { "epoch": 9.746631027954114, "grad_norm": 0.1183728352189064, "learning_rate": 9.779799570623637e-08, "loss": 0.4614, "num_input_tokens_seen": 106167376, "step": 87515 }, { "epoch": 9.747187882837732, "grad_norm": 0.09551632404327393, "learning_rate": 9.736905410057395e-08, "loss": 0.4595, "num_input_tokens_seen": 106173520, "step": 87520 }, { "epoch": 9.74774473772135, "grad_norm": 0.17233999073505402, "learning_rate": 9.694105339611015e-08, "loss": 0.4676, "num_input_tokens_seen": 106179280, "step": 87525 }, { "epoch": 9.748301592604967, "grad_norm": 0.1153452917933464, "learning_rate": 9.651399360902091e-08, "loss": 0.4654, "num_input_tokens_seen": 106185264, "step": 87530 }, { "epoch": 9.748858447488585, "grad_norm": 0.09684420377016068, "learning_rate": 9.608787475544056e-08, "loss": 0.4573, "num_input_tokens_seen": 106191568, "step": 87535 }, { "epoch": 9.749415302372201, "grad_norm": 0.1071825698018074, "learning_rate": 9.566269685147011e-08, "loss": 0.4536, "num_input_tokens_seen": 106197616, "step": 87540 }, { "epoch": 9.749972157255819, "grad_norm": 0.10987594723701477, "learning_rate": 9.523845991317171e-08, "loss": 0.4597, "num_input_tokens_seen": 106204016, "step": 87545 }, { "epoch": 9.750529012139436, "grad_norm": 0.08646990358829498, "learning_rate": 9.481516395657419e-08, "loss": 0.4541, "num_input_tokens_seen": 106209808, "step": 87550 }, { "epoch": 9.751085867023054, "grad_norm": 0.11178383231163025, "learning_rate": 9.439280899767311e-08, "loss": 0.4622, "num_input_tokens_seen": 106215824, "step": 87555 }, { "epoch": 9.751642721906672, "grad_norm": 0.11256668716669083, "learning_rate": 9.397139505242514e-08, "loss": 0.4555, "num_input_tokens_seen": 106221936, "step": 87560 }, { "epoch": 9.75219957679029, "grad_norm": 0.1148395910859108, "learning_rate": 9.355092213675365e-08, "loss": 0.4597, "num_input_tokens_seen": 106227472, "step": 87565 }, { "epoch": 9.752756431673905, "grad_norm": 0.09922380745410919, "learning_rate": 9.313139026654594e-08, "loss": 0.4471, "num_input_tokens_seen": 106233680, "step": 87570 }, { "epoch": 9.753313286557523, "grad_norm": 0.1022755578160286, "learning_rate": 9.271279945765044e-08, "loss": 0.4645, "num_input_tokens_seen": 106239600, "step": 87575 }, { "epoch": 9.75387014144114, "grad_norm": 0.102652907371521, "learning_rate": 9.229514972588782e-08, "loss": 0.4679, "num_input_tokens_seen": 106245648, "step": 87580 }, { "epoch": 9.754426996324758, "grad_norm": 0.10294878482818604, "learning_rate": 9.187844108703159e-08, "loss": 0.4592, "num_input_tokens_seen": 106251952, "step": 87585 }, { "epoch": 9.754983851208376, "grad_norm": 0.12804530560970306, "learning_rate": 9.146267355683302e-08, "loss": 0.4612, "num_input_tokens_seen": 106258032, "step": 87590 }, { "epoch": 9.755540706091992, "grad_norm": 0.1769614815711975, "learning_rate": 9.104784715099623e-08, "loss": 0.4672, "num_input_tokens_seen": 106264304, "step": 87595 }, { "epoch": 9.75609756097561, "grad_norm": 0.07846365123987198, "learning_rate": 9.063396188519479e-08, "loss": 0.4572, "num_input_tokens_seen": 106270352, "step": 87600 }, { "epoch": 9.756654415859227, "grad_norm": 0.08859315514564514, "learning_rate": 9.022101777506897e-08, "loss": 0.4621, "num_input_tokens_seen": 106276624, "step": 87605 }, { "epoch": 9.757211270742845, "grad_norm": 0.08714078366756439, "learning_rate": 8.980901483622295e-08, "loss": 0.4598, "num_input_tokens_seen": 106282544, "step": 87610 }, { "epoch": 9.757768125626463, "grad_norm": 0.12736153602600098, "learning_rate": 8.939795308421927e-08, "loss": 0.4549, "num_input_tokens_seen": 106288336, "step": 87615 }, { "epoch": 9.758324980510078, "grad_norm": 0.10950736701488495, "learning_rate": 8.898783253458997e-08, "loss": 0.4653, "num_input_tokens_seen": 106294608, "step": 87620 }, { "epoch": 9.758881835393696, "grad_norm": 0.10248580574989319, "learning_rate": 8.857865320283377e-08, "loss": 0.4643, "num_input_tokens_seen": 106300464, "step": 87625 }, { "epoch": 9.759438690277314, "grad_norm": 0.1054241731762886, "learning_rate": 8.817041510440771e-08, "loss": 0.4604, "num_input_tokens_seen": 106306192, "step": 87630 }, { "epoch": 9.759995545160931, "grad_norm": 0.10214052349328995, "learning_rate": 8.77631182547356e-08, "loss": 0.4579, "num_input_tokens_seen": 106312432, "step": 87635 }, { "epoch": 9.76055240004455, "grad_norm": 0.09543746709823608, "learning_rate": 8.735676266921066e-08, "loss": 0.4595, "num_input_tokens_seen": 106318672, "step": 87640 }, { "epoch": 9.761109254928165, "grad_norm": 0.14755108952522278, "learning_rate": 8.695134836318175e-08, "loss": 0.4515, "num_input_tokens_seen": 106324848, "step": 87645 }, { "epoch": 9.761666109811783, "grad_norm": 0.08610659092664719, "learning_rate": 8.654687535196993e-08, "loss": 0.4657, "num_input_tokens_seen": 106330864, "step": 87650 }, { "epoch": 9.7622229646954, "grad_norm": 0.10626109689474106, "learning_rate": 8.614334365085463e-08, "loss": 0.4493, "num_input_tokens_seen": 106336976, "step": 87655 }, { "epoch": 9.762779819579018, "grad_norm": 0.1460200995206833, "learning_rate": 8.574075327508479e-08, "loss": 0.4566, "num_input_tokens_seen": 106342960, "step": 87660 }, { "epoch": 9.763336674462636, "grad_norm": 0.10459520667791367, "learning_rate": 8.533910423987323e-08, "loss": 0.4755, "num_input_tokens_seen": 106349008, "step": 87665 }, { "epoch": 9.763893529346252, "grad_norm": 0.10412237048149109, "learning_rate": 8.493839656039116e-08, "loss": 0.4523, "num_input_tokens_seen": 106355216, "step": 87670 }, { "epoch": 9.76445038422987, "grad_norm": 0.09316161274909973, "learning_rate": 8.453863025177921e-08, "loss": 0.4487, "num_input_tokens_seen": 106361360, "step": 87675 }, { "epoch": 9.765007239113487, "grad_norm": 0.11465444415807724, "learning_rate": 8.413980532914478e-08, "loss": 0.4638, "num_input_tokens_seen": 106367536, "step": 87680 }, { "epoch": 9.765564093997105, "grad_norm": 0.13742713630199432, "learning_rate": 8.374192180755636e-08, "loss": 0.4613, "num_input_tokens_seen": 106374032, "step": 87685 }, { "epoch": 9.766120948880722, "grad_norm": 0.13409899175167084, "learning_rate": 8.334497970204636e-08, "loss": 0.4677, "num_input_tokens_seen": 106379600, "step": 87690 }, { "epoch": 9.766677803764338, "grad_norm": 0.11626860499382019, "learning_rate": 8.294897902761112e-08, "loss": 0.4565, "num_input_tokens_seen": 106385552, "step": 87695 }, { "epoch": 9.767234658647956, "grad_norm": 0.11029668152332306, "learning_rate": 8.255391979921645e-08, "loss": 0.4626, "num_input_tokens_seen": 106391056, "step": 87700 }, { "epoch": 9.767791513531574, "grad_norm": 0.12469591945409775, "learning_rate": 8.215980203178375e-08, "loss": 0.4492, "num_input_tokens_seen": 106397456, "step": 87705 }, { "epoch": 9.768348368415191, "grad_norm": 0.09168003499507904, "learning_rate": 8.176662574020666e-08, "loss": 0.4608, "num_input_tokens_seen": 106403600, "step": 87710 }, { "epoch": 9.768905223298809, "grad_norm": 0.12379951030015945, "learning_rate": 8.137439093934274e-08, "loss": 0.459, "num_input_tokens_seen": 106409648, "step": 87715 }, { "epoch": 9.769462078182425, "grad_norm": 0.1349041610956192, "learning_rate": 8.098309764400791e-08, "loss": 0.453, "num_input_tokens_seen": 106415728, "step": 87720 }, { "epoch": 9.770018933066043, "grad_norm": 0.11603856086730957, "learning_rate": 8.059274586899034e-08, "loss": 0.4515, "num_input_tokens_seen": 106421584, "step": 87725 }, { "epoch": 9.77057578794966, "grad_norm": 0.1098138839006424, "learning_rate": 8.02033356290366e-08, "loss": 0.45, "num_input_tokens_seen": 106427088, "step": 87730 }, { "epoch": 9.771132642833278, "grad_norm": 0.17825546860694885, "learning_rate": 7.981486693885986e-08, "loss": 0.4784, "num_input_tokens_seen": 106433040, "step": 87735 }, { "epoch": 9.771689497716896, "grad_norm": 0.09343782067298889, "learning_rate": 7.942733981313732e-08, "loss": 0.4551, "num_input_tokens_seen": 106439088, "step": 87740 }, { "epoch": 9.772246352600511, "grad_norm": 0.1173504963517189, "learning_rate": 7.904075426651281e-08, "loss": 0.4694, "num_input_tokens_seen": 106445136, "step": 87745 }, { "epoch": 9.77280320748413, "grad_norm": 0.11867687106132507, "learning_rate": 7.865511031359129e-08, "loss": 0.4549, "num_input_tokens_seen": 106451408, "step": 87750 }, { "epoch": 9.773360062367747, "grad_norm": 0.11419006437063217, "learning_rate": 7.827040796894447e-08, "loss": 0.4616, "num_input_tokens_seen": 106457520, "step": 87755 }, { "epoch": 9.773916917251364, "grad_norm": 0.1299571841955185, "learning_rate": 7.788664724710793e-08, "loss": 0.4723, "num_input_tokens_seen": 106463824, "step": 87760 }, { "epoch": 9.774473772134982, "grad_norm": 0.16707652807235718, "learning_rate": 7.750382816258117e-08, "loss": 0.4629, "num_input_tokens_seen": 106469616, "step": 87765 }, { "epoch": 9.775030627018598, "grad_norm": 0.12262609601020813, "learning_rate": 7.712195072982487e-08, "loss": 0.4505, "num_input_tokens_seen": 106475664, "step": 87770 }, { "epoch": 9.775587481902216, "grad_norm": 0.10627790540456772, "learning_rate": 7.674101496327191e-08, "loss": 0.4645, "num_input_tokens_seen": 106481616, "step": 87775 }, { "epoch": 9.776144336785833, "grad_norm": 0.13858669996261597, "learning_rate": 7.636102087731633e-08, "loss": 0.4679, "num_input_tokens_seen": 106487632, "step": 87780 }, { "epoch": 9.776701191669451, "grad_norm": 0.11844000220298767, "learning_rate": 7.598196848631056e-08, "loss": 0.454, "num_input_tokens_seen": 106493712, "step": 87785 }, { "epoch": 9.777258046553069, "grad_norm": 0.10151462256908417, "learning_rate": 7.560385780457924e-08, "loss": 0.465, "num_input_tokens_seen": 106500048, "step": 87790 }, { "epoch": 9.777814901436686, "grad_norm": 0.07253357768058777, "learning_rate": 7.522668884640815e-08, "loss": 0.4461, "num_input_tokens_seen": 106506512, "step": 87795 }, { "epoch": 9.778371756320302, "grad_norm": 0.14681459963321686, "learning_rate": 7.485046162604981e-08, "loss": 0.4661, "num_input_tokens_seen": 106512976, "step": 87800 }, { "epoch": 9.77892861120392, "grad_norm": 0.1084362268447876, "learning_rate": 7.447517615771505e-08, "loss": 0.4646, "num_input_tokens_seen": 106519216, "step": 87805 }, { "epoch": 9.779485466087538, "grad_norm": 0.08831364661455154, "learning_rate": 7.410083245558696e-08, "loss": 0.4503, "num_input_tokens_seen": 106525456, "step": 87810 }, { "epoch": 9.780042320971155, "grad_norm": 0.11341902613639832, "learning_rate": 7.372743053380982e-08, "loss": 0.4613, "num_input_tokens_seen": 106531888, "step": 87815 }, { "epoch": 9.780599175854773, "grad_norm": 0.13014180958271027, "learning_rate": 7.335497040648898e-08, "loss": 0.4564, "num_input_tokens_seen": 106537712, "step": 87820 }, { "epoch": 9.781156030738389, "grad_norm": 0.09704751521348953, "learning_rate": 7.29834520876993e-08, "loss": 0.4536, "num_input_tokens_seen": 106543728, "step": 87825 }, { "epoch": 9.781712885622007, "grad_norm": 0.1007046103477478, "learning_rate": 7.261287559147678e-08, "loss": 0.4651, "num_input_tokens_seen": 106549840, "step": 87830 }, { "epoch": 9.782269740505624, "grad_norm": 0.09269160777330399, "learning_rate": 7.224324093182411e-08, "loss": 0.4666, "num_input_tokens_seen": 106555664, "step": 87835 }, { "epoch": 9.782826595389242, "grad_norm": 0.16493010520935059, "learning_rate": 7.187454812270511e-08, "loss": 0.4642, "num_input_tokens_seen": 106561872, "step": 87840 }, { "epoch": 9.78338345027286, "grad_norm": 0.08898948132991791, "learning_rate": 7.150679717805309e-08, "loss": 0.4571, "num_input_tokens_seen": 106567472, "step": 87845 }, { "epoch": 9.783940305156476, "grad_norm": 0.1331735998392105, "learning_rate": 7.113998811176247e-08, "loss": 0.4448, "num_input_tokens_seen": 106574032, "step": 87850 }, { "epoch": 9.784497160040093, "grad_norm": 0.0986967459321022, "learning_rate": 7.077412093769165e-08, "loss": 0.4584, "num_input_tokens_seen": 106580624, "step": 87855 }, { "epoch": 9.78505401492371, "grad_norm": 0.09869024157524109, "learning_rate": 7.040919566966287e-08, "loss": 0.4537, "num_input_tokens_seen": 106586576, "step": 87860 }, { "epoch": 9.785610869807329, "grad_norm": 0.10740390419960022, "learning_rate": 7.00452123214651e-08, "loss": 0.4616, "num_input_tokens_seen": 106592496, "step": 87865 }, { "epoch": 9.786167724690946, "grad_norm": 0.13314947485923767, "learning_rate": 6.968217090685126e-08, "loss": 0.4594, "num_input_tokens_seen": 106598544, "step": 87870 }, { "epoch": 9.786724579574562, "grad_norm": 0.11357951164245605, "learning_rate": 6.932007143953811e-08, "loss": 0.4606, "num_input_tokens_seen": 106604624, "step": 87875 }, { "epoch": 9.78728143445818, "grad_norm": 0.1005779281258583, "learning_rate": 6.89589139332092e-08, "loss": 0.4601, "num_input_tokens_seen": 106610416, "step": 87880 }, { "epoch": 9.787838289341797, "grad_norm": 0.12379991263151169, "learning_rate": 6.859869840150634e-08, "loss": 0.4547, "num_input_tokens_seen": 106617040, "step": 87885 }, { "epoch": 9.788395144225415, "grad_norm": 0.09920386224985123, "learning_rate": 6.823942485804092e-08, "loss": 0.4584, "num_input_tokens_seen": 106623152, "step": 87890 }, { "epoch": 9.788951999109033, "grad_norm": 0.15036387741565704, "learning_rate": 6.788109331638814e-08, "loss": 0.4686, "num_input_tokens_seen": 106629360, "step": 87895 }, { "epoch": 9.789508853992649, "grad_norm": 0.11944770067930222, "learning_rate": 6.752370379008721e-08, "loss": 0.4512, "num_input_tokens_seen": 106634928, "step": 87900 }, { "epoch": 9.790065708876266, "grad_norm": 0.1533404439687729, "learning_rate": 6.716725629263842e-08, "loss": 0.4716, "num_input_tokens_seen": 106641264, "step": 87905 }, { "epoch": 9.790622563759884, "grad_norm": 0.13181191682815552, "learning_rate": 6.681175083751434e-08, "loss": 0.4586, "num_input_tokens_seen": 106647440, "step": 87910 }, { "epoch": 9.791179418643502, "grad_norm": 0.13111767172813416, "learning_rate": 6.64571874381431e-08, "loss": 0.4567, "num_input_tokens_seen": 106653520, "step": 87915 }, { "epoch": 9.79173627352712, "grad_norm": 0.11342926323413849, "learning_rate": 6.610356610792512e-08, "loss": 0.4509, "num_input_tokens_seen": 106659504, "step": 87920 }, { "epoch": 9.792293128410737, "grad_norm": 0.0886160284280777, "learning_rate": 6.575088686021635e-08, "loss": 0.4742, "num_input_tokens_seen": 106665776, "step": 87925 }, { "epoch": 9.792849983294353, "grad_norm": 0.07868729531764984, "learning_rate": 6.539914970834504e-08, "loss": 0.4651, "num_input_tokens_seen": 106671952, "step": 87930 }, { "epoch": 9.79340683817797, "grad_norm": 0.12415660917758942, "learning_rate": 6.504835466560332e-08, "loss": 0.4579, "num_input_tokens_seen": 106678000, "step": 87935 }, { "epoch": 9.793963693061588, "grad_norm": 0.10707780718803406, "learning_rate": 6.469850174524172e-08, "loss": 0.4528, "num_input_tokens_seen": 106683888, "step": 87940 }, { "epoch": 9.794520547945206, "grad_norm": 0.08748946338891983, "learning_rate": 6.434959096047743e-08, "loss": 0.4515, "num_input_tokens_seen": 106689712, "step": 87945 }, { "epoch": 9.795077402828824, "grad_norm": 0.11300988495349884, "learning_rate": 6.400162232449713e-08, "loss": 0.4583, "num_input_tokens_seen": 106696016, "step": 87950 }, { "epoch": 9.79563425771244, "grad_norm": 0.1114615648984909, "learning_rate": 6.365459585044864e-08, "loss": 0.4577, "num_input_tokens_seen": 106702192, "step": 87955 }, { "epoch": 9.796191112596057, "grad_norm": 0.14662092924118042, "learning_rate": 6.330851155144091e-08, "loss": 0.4537, "num_input_tokens_seen": 106708400, "step": 87960 }, { "epoch": 9.796747967479675, "grad_norm": 0.11427291482686996, "learning_rate": 6.296336944054959e-08, "loss": 0.4765, "num_input_tokens_seen": 106714480, "step": 87965 }, { "epoch": 9.797304822363293, "grad_norm": 0.14112693071365356, "learning_rate": 6.261916953081704e-08, "loss": 0.4554, "num_input_tokens_seen": 106720720, "step": 87970 }, { "epoch": 9.79786167724691, "grad_norm": 0.10483849048614502, "learning_rate": 6.22759118352495e-08, "loss": 0.4526, "num_input_tokens_seen": 106726864, "step": 87975 }, { "epoch": 9.798418532130526, "grad_norm": 0.10133741050958633, "learning_rate": 6.193359636681439e-08, "loss": 0.4576, "num_input_tokens_seen": 106733040, "step": 87980 }, { "epoch": 9.798975387014144, "grad_norm": 0.11567335575819016, "learning_rate": 6.159222313844582e-08, "loss": 0.4525, "num_input_tokens_seen": 106739184, "step": 87985 }, { "epoch": 9.799532241897762, "grad_norm": 0.11590193957090378, "learning_rate": 6.125179216304178e-08, "loss": 0.4588, "num_input_tokens_seen": 106745392, "step": 87990 }, { "epoch": 9.80008909678138, "grad_norm": 0.17096328735351562, "learning_rate": 6.091230345346699e-08, "loss": 0.4589, "num_input_tokens_seen": 106751792, "step": 87995 }, { "epoch": 9.800645951664997, "grad_norm": 0.07984530180692673, "learning_rate": 6.057375702254453e-08, "loss": 0.4629, "num_input_tokens_seen": 106757968, "step": 88000 }, { "epoch": 9.801202806548613, "grad_norm": 0.09195612370967865, "learning_rate": 6.023615288306694e-08, "loss": 0.4584, "num_input_tokens_seen": 106764176, "step": 88005 }, { "epoch": 9.80175966143223, "grad_norm": 0.10168144106864929, "learning_rate": 5.989949104779347e-08, "loss": 0.4638, "num_input_tokens_seen": 106770384, "step": 88010 }, { "epoch": 9.802316516315848, "grad_norm": 0.10237516462802887, "learning_rate": 5.95637715294417e-08, "loss": 0.4597, "num_input_tokens_seen": 106776240, "step": 88015 }, { "epoch": 9.802873371199466, "grad_norm": 0.12934967875480652, "learning_rate": 5.922899434069595e-08, "loss": 0.462, "num_input_tokens_seen": 106782128, "step": 88020 }, { "epoch": 9.803430226083083, "grad_norm": 0.1015520766377449, "learning_rate": 5.889515949420443e-08, "loss": 0.4679, "num_input_tokens_seen": 106788208, "step": 88025 }, { "epoch": 9.8039870809667, "grad_norm": 0.12912487983703613, "learning_rate": 5.8562267002579276e-08, "loss": 0.4595, "num_input_tokens_seen": 106794416, "step": 88030 }, { "epoch": 9.804543935850317, "grad_norm": 0.10690586268901825, "learning_rate": 5.823031687840486e-08, "loss": 0.4529, "num_input_tokens_seen": 106800656, "step": 88035 }, { "epoch": 9.805100790733935, "grad_norm": 0.12042040377855301, "learning_rate": 5.78993091342156e-08, "loss": 0.4655, "num_input_tokens_seen": 106806224, "step": 88040 }, { "epoch": 9.805657645617552, "grad_norm": 0.09455237537622452, "learning_rate": 5.7569243782523704e-08, "loss": 0.4568, "num_input_tokens_seen": 106811984, "step": 88045 }, { "epoch": 9.80621450050117, "grad_norm": 0.10714918375015259, "learning_rate": 5.7240120835796996e-08, "loss": 0.4573, "num_input_tokens_seen": 106818160, "step": 88050 }, { "epoch": 9.806771355384786, "grad_norm": 0.13371910154819489, "learning_rate": 5.6911940306469955e-08, "loss": 0.4549, "num_input_tokens_seen": 106824208, "step": 88055 }, { "epoch": 9.807328210268404, "grad_norm": 0.09170326590538025, "learning_rate": 5.658470220694656e-08, "loss": 0.4516, "num_input_tokens_seen": 106830064, "step": 88060 }, { "epoch": 9.807885065152021, "grad_norm": 0.1283237487077713, "learning_rate": 5.625840654958636e-08, "loss": 0.4536, "num_input_tokens_seen": 106836304, "step": 88065 }, { "epoch": 9.808441920035639, "grad_norm": 0.12853875756263733, "learning_rate": 5.593305334671839e-08, "loss": 0.4614, "num_input_tokens_seen": 106842224, "step": 88070 }, { "epoch": 9.808998774919257, "grad_norm": 0.11413083225488663, "learning_rate": 5.5608642610638364e-08, "loss": 0.463, "num_input_tokens_seen": 106848240, "step": 88075 }, { "epoch": 9.809555629802873, "grad_norm": 0.10108209401369095, "learning_rate": 5.528517435360314e-08, "loss": 0.462, "num_input_tokens_seen": 106854416, "step": 88080 }, { "epoch": 9.81011248468649, "grad_norm": 0.09482257813215256, "learning_rate": 5.496264858783351e-08, "loss": 0.4594, "num_input_tokens_seen": 106860528, "step": 88085 }, { "epoch": 9.810669339570108, "grad_norm": 0.15411780774593353, "learning_rate": 5.464106532551416e-08, "loss": 0.4553, "num_input_tokens_seen": 106866608, "step": 88090 }, { "epoch": 9.811226194453726, "grad_norm": 0.08498306572437286, "learning_rate": 5.4320424578796495e-08, "loss": 0.4599, "num_input_tokens_seen": 106872912, "step": 88095 }, { "epoch": 9.811783049337343, "grad_norm": 0.14220258593559265, "learning_rate": 5.4000726359795807e-08, "loss": 0.4532, "num_input_tokens_seen": 106878512, "step": 88100 }, { "epoch": 9.81233990422096, "grad_norm": 0.11808228492736816, "learning_rate": 5.368197068059133e-08, "loss": 0.4648, "num_input_tokens_seen": 106884432, "step": 88105 }, { "epoch": 9.812896759104577, "grad_norm": 0.13357438147068024, "learning_rate": 5.336415755322621e-08, "loss": 0.4631, "num_input_tokens_seen": 106889904, "step": 88110 }, { "epoch": 9.813453613988194, "grad_norm": 0.08056803047657013, "learning_rate": 5.304728698970751e-08, "loss": 0.4562, "num_input_tokens_seen": 106896112, "step": 88115 }, { "epoch": 9.814010468871812, "grad_norm": 0.1221696212887764, "learning_rate": 5.273135900201176e-08, "loss": 0.4695, "num_input_tokens_seen": 106901904, "step": 88120 }, { "epoch": 9.81456732375543, "grad_norm": 0.1973295509815216, "learning_rate": 5.24163736020683e-08, "loss": 0.4772, "num_input_tokens_seen": 106907920, "step": 88125 }, { "epoch": 9.815124178639046, "grad_norm": 0.11480284482240677, "learning_rate": 5.210233080178706e-08, "loss": 0.4678, "num_input_tokens_seen": 106914064, "step": 88130 }, { "epoch": 9.815681033522663, "grad_norm": 0.1443646252155304, "learning_rate": 5.178923061302521e-08, "loss": 0.464, "num_input_tokens_seen": 106919920, "step": 88135 }, { "epoch": 9.816237888406281, "grad_norm": 0.11137569695711136, "learning_rate": 5.147707304762051e-08, "loss": 0.4599, "num_input_tokens_seen": 106926224, "step": 88140 }, { "epoch": 9.816794743289899, "grad_norm": 0.12006723880767822, "learning_rate": 5.116585811736074e-08, "loss": 0.4534, "num_input_tokens_seen": 106932400, "step": 88145 }, { "epoch": 9.817351598173516, "grad_norm": 0.09337425231933594, "learning_rate": 5.0855585834008736e-08, "loss": 0.4631, "num_input_tokens_seen": 106938576, "step": 88150 }, { "epoch": 9.817908453057134, "grad_norm": 0.14348462224006653, "learning_rate": 5.054625620928566e-08, "loss": 0.4562, "num_input_tokens_seen": 106944752, "step": 88155 }, { "epoch": 9.81846530794075, "grad_norm": 0.12599517405033112, "learning_rate": 5.023786925487939e-08, "loss": 0.4517, "num_input_tokens_seen": 106950800, "step": 88160 }, { "epoch": 9.819022162824368, "grad_norm": 0.12470787018537521, "learning_rate": 4.993042498244171e-08, "loss": 0.4481, "num_input_tokens_seen": 106956944, "step": 88165 }, { "epoch": 9.819579017707985, "grad_norm": 0.11710242182016373, "learning_rate": 4.962392340358834e-08, "loss": 0.4517, "num_input_tokens_seen": 106963120, "step": 88170 }, { "epoch": 9.820135872591603, "grad_norm": 0.16566109657287598, "learning_rate": 4.931836452990168e-08, "loss": 0.4638, "num_input_tokens_seen": 106969552, "step": 88175 }, { "epoch": 9.82069272747522, "grad_norm": 0.11311765015125275, "learning_rate": 4.901374837292527e-08, "loss": 0.4491, "num_input_tokens_seen": 106975696, "step": 88180 }, { "epoch": 9.821249582358837, "grad_norm": 0.11257898062467575, "learning_rate": 4.871007494416935e-08, "loss": 0.4701, "num_input_tokens_seen": 106982000, "step": 88185 }, { "epoch": 9.821806437242454, "grad_norm": 0.10281096398830414, "learning_rate": 4.840734425510807e-08, "loss": 0.4538, "num_input_tokens_seen": 106988240, "step": 88190 }, { "epoch": 9.822363292126072, "grad_norm": 0.11704590171575546, "learning_rate": 4.8105556317176726e-08, "loss": 0.4615, "num_input_tokens_seen": 106994288, "step": 88195 }, { "epoch": 9.82292014700969, "grad_norm": 0.1073710173368454, "learning_rate": 4.780471114178009e-08, "loss": 0.4569, "num_input_tokens_seen": 107000656, "step": 88200 }, { "epoch": 9.823477001893307, "grad_norm": 0.10073472559452057, "learning_rate": 4.750480874028407e-08, "loss": 0.4651, "num_input_tokens_seen": 107006576, "step": 88205 }, { "epoch": 9.824033856776923, "grad_norm": 0.11951320618391037, "learning_rate": 4.720584912402126e-08, "loss": 0.4578, "num_input_tokens_seen": 107013008, "step": 88210 }, { "epoch": 9.82459071166054, "grad_norm": 0.12032420933246613, "learning_rate": 4.6907832304288194e-08, "loss": 0.4638, "num_input_tokens_seen": 107018736, "step": 88215 }, { "epoch": 9.825147566544159, "grad_norm": 0.1104813814163208, "learning_rate": 4.661075829233974e-08, "loss": 0.4494, "num_input_tokens_seen": 107024752, "step": 88220 }, { "epoch": 9.825704421427776, "grad_norm": 0.08363997936248779, "learning_rate": 4.631462709940859e-08, "loss": 0.4533, "num_input_tokens_seen": 107030672, "step": 88225 }, { "epoch": 9.826261276311394, "grad_norm": 0.08379470556974411, "learning_rate": 4.601943873667469e-08, "loss": 0.4496, "num_input_tokens_seen": 107035568, "step": 88230 }, { "epoch": 9.82681813119501, "grad_norm": 0.10196519643068314, "learning_rate": 4.5725193215298556e-08, "loss": 0.4409, "num_input_tokens_seen": 107041520, "step": 88235 }, { "epoch": 9.827374986078627, "grad_norm": 0.11623387783765793, "learning_rate": 4.543189054639074e-08, "loss": 0.4661, "num_input_tokens_seen": 107047696, "step": 88240 }, { "epoch": 9.827931840962245, "grad_norm": 0.12294936180114746, "learning_rate": 4.513953074103961e-08, "loss": 0.474, "num_input_tokens_seen": 107053296, "step": 88245 }, { "epoch": 9.828488695845863, "grad_norm": 0.12574590742588043, "learning_rate": 4.484811381028908e-08, "loss": 0.4592, "num_input_tokens_seen": 107059216, "step": 88250 }, { "epoch": 9.82904555072948, "grad_norm": 0.11693552136421204, "learning_rate": 4.455763976514982e-08, "loss": 0.4562, "num_input_tokens_seen": 107065424, "step": 88255 }, { "epoch": 9.829602405613098, "grad_norm": 0.10291285067796707, "learning_rate": 4.426810861659358e-08, "loss": 0.4581, "num_input_tokens_seen": 107071504, "step": 88260 }, { "epoch": 9.830159260496714, "grad_norm": 0.11664964258670807, "learning_rate": 4.397952037556441e-08, "loss": 0.4576, "num_input_tokens_seen": 107077584, "step": 88265 }, { "epoch": 9.830716115380332, "grad_norm": 0.10463211685419083, "learning_rate": 4.369187505296468e-08, "loss": 0.4484, "num_input_tokens_seen": 107083408, "step": 88270 }, { "epoch": 9.83127297026395, "grad_norm": 0.1429624855518341, "learning_rate": 4.340517265966348e-08, "loss": 0.4557, "num_input_tokens_seen": 107089424, "step": 88275 }, { "epoch": 9.831829825147567, "grad_norm": 0.11196105182170868, "learning_rate": 4.311941320649104e-08, "loss": 0.4511, "num_input_tokens_seen": 107095952, "step": 88280 }, { "epoch": 9.832386680031185, "grad_norm": 0.09851324558258057, "learning_rate": 4.283459670424428e-08, "loss": 0.4578, "num_input_tokens_seen": 107102064, "step": 88285 }, { "epoch": 9.8329435349148, "grad_norm": 0.11377191543579102, "learning_rate": 4.2550723163686804e-08, "loss": 0.4597, "num_input_tokens_seen": 107107792, "step": 88290 }, { "epoch": 9.833500389798418, "grad_norm": 0.10098785907030106, "learning_rate": 4.2267792595543364e-08, "loss": 0.4549, "num_input_tokens_seen": 107113904, "step": 88295 }, { "epoch": 9.834057244682036, "grad_norm": 0.13883358240127563, "learning_rate": 4.1985805010505416e-08, "loss": 0.4346, "num_input_tokens_seen": 107119856, "step": 88300 }, { "epoch": 9.834614099565654, "grad_norm": 0.10270701348781586, "learning_rate": 4.1704760419222776e-08, "loss": 0.4677, "num_input_tokens_seen": 107126320, "step": 88305 }, { "epoch": 9.835170954449271, "grad_norm": 0.07349734753370285, "learning_rate": 4.1424658832317496e-08, "loss": 0.4388, "num_input_tokens_seen": 107132752, "step": 88310 }, { "epoch": 9.835727809332887, "grad_norm": 0.08729279041290283, "learning_rate": 4.114550026037278e-08, "loss": 0.4569, "num_input_tokens_seen": 107138928, "step": 88315 }, { "epoch": 9.836284664216505, "grad_norm": 0.1185692846775055, "learning_rate": 4.086728471393575e-08, "loss": 0.4591, "num_input_tokens_seen": 107145136, "step": 88320 }, { "epoch": 9.836841519100123, "grad_norm": 0.12590783834457397, "learning_rate": 4.059001220351744e-08, "loss": 0.4419, "num_input_tokens_seen": 107151440, "step": 88325 }, { "epoch": 9.83739837398374, "grad_norm": 0.09502728283405304, "learning_rate": 4.031368273959557e-08, "loss": 0.4508, "num_input_tokens_seen": 107157680, "step": 88330 }, { "epoch": 9.837955228867358, "grad_norm": 0.13226261734962463, "learning_rate": 4.0038296332609026e-08, "loss": 0.4553, "num_input_tokens_seen": 107163632, "step": 88335 }, { "epoch": 9.838512083750974, "grad_norm": 0.10983823984861374, "learning_rate": 3.9763852992966146e-08, "loss": 0.461, "num_input_tokens_seen": 107169776, "step": 88340 }, { "epoch": 9.839068938634592, "grad_norm": 0.08747246861457825, "learning_rate": 3.9490352731030857e-08, "loss": 0.4536, "num_input_tokens_seen": 107175312, "step": 88345 }, { "epoch": 9.83962579351821, "grad_norm": 0.12158072739839554, "learning_rate": 3.9217795557142113e-08, "loss": 0.4669, "num_input_tokens_seen": 107181488, "step": 88350 }, { "epoch": 9.840182648401827, "grad_norm": 0.135980024933815, "learning_rate": 3.894618148159446e-08, "loss": 0.4624, "num_input_tokens_seen": 107187632, "step": 88355 }, { "epoch": 9.840739503285445, "grad_norm": 0.11384527385234833, "learning_rate": 3.86755105146519e-08, "loss": 0.4575, "num_input_tokens_seen": 107194064, "step": 88360 }, { "epoch": 9.84129635816906, "grad_norm": 0.11888478696346283, "learning_rate": 3.840578266654238e-08, "loss": 0.4514, "num_input_tokens_seen": 107200176, "step": 88365 }, { "epoch": 9.841853213052678, "grad_norm": 0.12248854339122772, "learning_rate": 3.813699794745496e-08, "loss": 0.4609, "num_input_tokens_seen": 107206256, "step": 88370 }, { "epoch": 9.842410067936296, "grad_norm": 0.08346813917160034, "learning_rate": 3.78691563675454e-08, "loss": 0.4606, "num_input_tokens_seen": 107212688, "step": 88375 }, { "epoch": 9.842966922819913, "grad_norm": 0.09835335612297058, "learning_rate": 3.760225793693617e-08, "loss": 0.4636, "num_input_tokens_seen": 107218512, "step": 88380 }, { "epoch": 9.843523777703531, "grad_norm": 0.12646178901195526, "learning_rate": 3.733630266570809e-08, "loss": 0.4581, "num_input_tokens_seen": 107224784, "step": 88385 }, { "epoch": 9.844080632587147, "grad_norm": 0.10686422884464264, "learning_rate": 3.707129056391145e-08, "loss": 0.4676, "num_input_tokens_seen": 107230768, "step": 88390 }, { "epoch": 9.844637487470765, "grad_norm": 0.11334218084812164, "learning_rate": 3.68072216415577e-08, "loss": 0.4615, "num_input_tokens_seen": 107236944, "step": 88395 }, { "epoch": 9.845194342354382, "grad_norm": 0.10170871019363403, "learning_rate": 3.654409590862773e-08, "loss": 0.4571, "num_input_tokens_seen": 107242768, "step": 88400 }, { "epoch": 9.845751197238, "grad_norm": 0.09105038642883301, "learning_rate": 3.628191337505804e-08, "loss": 0.4514, "num_input_tokens_seen": 107249104, "step": 88405 }, { "epoch": 9.846308052121618, "grad_norm": 0.15791164338588715, "learning_rate": 3.602067405076015e-08, "loss": 0.4593, "num_input_tokens_seen": 107255024, "step": 88410 }, { "epoch": 9.846864907005234, "grad_norm": 0.11334585398435593, "learning_rate": 3.576037794560394e-08, "loss": 0.4567, "num_input_tokens_seen": 107260240, "step": 88415 }, { "epoch": 9.847421761888851, "grad_norm": 0.08556334674358368, "learning_rate": 3.550102506942044e-08, "loss": 0.4763, "num_input_tokens_seen": 107266512, "step": 88420 }, { "epoch": 9.847978616772469, "grad_norm": 0.11680162698030472, "learning_rate": 3.524261543201013e-08, "loss": 0.4615, "num_input_tokens_seen": 107272816, "step": 88425 }, { "epoch": 9.848535471656087, "grad_norm": 0.09462633728981018, "learning_rate": 3.498514904313743e-08, "loss": 0.4594, "num_input_tokens_seen": 107278608, "step": 88430 }, { "epoch": 9.849092326539704, "grad_norm": 0.10645632445812225, "learning_rate": 3.472862591253068e-08, "loss": 0.4695, "num_input_tokens_seen": 107284624, "step": 88435 }, { "epoch": 9.84964918142332, "grad_norm": 0.13966743648052216, "learning_rate": 3.447304604988211e-08, "loss": 0.4621, "num_input_tokens_seen": 107290928, "step": 88440 }, { "epoch": 9.850206036306938, "grad_norm": 0.11791576445102692, "learning_rate": 3.421840946484789e-08, "loss": 0.4453, "num_input_tokens_seen": 107296976, "step": 88445 }, { "epoch": 9.850762891190556, "grad_norm": 0.11305777728557587, "learning_rate": 3.396471616704811e-08, "loss": 0.4654, "num_input_tokens_seen": 107303024, "step": 88450 }, { "epoch": 9.851319746074173, "grad_norm": 0.0800311490893364, "learning_rate": 3.371196616606953e-08, "loss": 0.4742, "num_input_tokens_seen": 107309200, "step": 88455 }, { "epoch": 9.851876600957791, "grad_norm": 0.0869104415178299, "learning_rate": 3.3460159471460085e-08, "loss": 0.4618, "num_input_tokens_seen": 107315056, "step": 88460 }, { "epoch": 9.852433455841407, "grad_norm": 0.08294360339641571, "learning_rate": 3.320929609273715e-08, "loss": 0.4577, "num_input_tokens_seen": 107321232, "step": 88465 }, { "epoch": 9.852990310725025, "grad_norm": 0.14999781548976898, "learning_rate": 3.2959376039373716e-08, "loss": 0.4573, "num_input_tokens_seen": 107327920, "step": 88470 }, { "epoch": 9.853547165608642, "grad_norm": 0.19891542196273804, "learning_rate": 3.271039932082054e-08, "loss": 0.4614, "num_input_tokens_seen": 107333648, "step": 88475 }, { "epoch": 9.85410402049226, "grad_norm": 0.08192861080169678, "learning_rate": 3.246236594647567e-08, "loss": 0.4673, "num_input_tokens_seen": 107338864, "step": 88480 }, { "epoch": 9.854660875375878, "grad_norm": 0.1422799676656723, "learning_rate": 3.2215275925717715e-08, "loss": 0.481, "num_input_tokens_seen": 107344496, "step": 88485 }, { "epoch": 9.855217730259493, "grad_norm": 0.12097963690757751, "learning_rate": 3.19691292678781e-08, "loss": 0.4688, "num_input_tokens_seen": 107350576, "step": 88490 }, { "epoch": 9.855774585143111, "grad_norm": 0.09320656210184097, "learning_rate": 3.1723925982260486e-08, "loss": 0.4552, "num_input_tokens_seen": 107356208, "step": 88495 }, { "epoch": 9.856331440026729, "grad_norm": 0.09385788440704346, "learning_rate": 3.147966607812691e-08, "loss": 0.4497, "num_input_tokens_seen": 107361904, "step": 88500 }, { "epoch": 9.856888294910346, "grad_norm": 0.12025509029626846, "learning_rate": 3.123634956470611e-08, "loss": 0.4621, "num_input_tokens_seen": 107367984, "step": 88505 }, { "epoch": 9.857445149793964, "grad_norm": 0.12198621034622192, "learning_rate": 3.0993976451193505e-08, "loss": 0.4624, "num_input_tokens_seen": 107374128, "step": 88510 }, { "epoch": 9.858002004677582, "grad_norm": 0.1379392445087433, "learning_rate": 3.075254674674566e-08, "loss": 0.4666, "num_input_tokens_seen": 107379856, "step": 88515 }, { "epoch": 9.858558859561198, "grad_norm": 0.12674467265605927, "learning_rate": 3.0512060460485824e-08, "loss": 0.4378, "num_input_tokens_seen": 107385904, "step": 88520 }, { "epoch": 9.859115714444815, "grad_norm": 0.10368403047323227, "learning_rate": 3.027251760149563e-08, "loss": 0.4589, "num_input_tokens_seen": 107392048, "step": 88525 }, { "epoch": 9.859672569328433, "grad_norm": 0.15490224957466125, "learning_rate": 3.003391817883172e-08, "loss": 0.4465, "num_input_tokens_seen": 107397616, "step": 88530 }, { "epoch": 9.86022942421205, "grad_norm": 0.08921156078577042, "learning_rate": 2.979626220150633e-08, "loss": 0.4634, "num_input_tokens_seen": 107403824, "step": 88535 }, { "epoch": 9.860786279095668, "grad_norm": 0.1205563023686409, "learning_rate": 2.9559549678498388e-08, "loss": 0.4591, "num_input_tokens_seen": 107409840, "step": 88540 }, { "epoch": 9.861343133979284, "grad_norm": 0.15250085294246674, "learning_rate": 2.9323780618753516e-08, "loss": 0.4537, "num_input_tokens_seen": 107415952, "step": 88545 }, { "epoch": 9.861899988862902, "grad_norm": 0.11522657424211502, "learning_rate": 2.90889550311757e-08, "loss": 0.4679, "num_input_tokens_seen": 107422320, "step": 88550 }, { "epoch": 9.86245684374652, "grad_norm": 0.11883611232042313, "learning_rate": 2.8855072924643955e-08, "loss": 0.4556, "num_input_tokens_seen": 107428240, "step": 88555 }, { "epoch": 9.863013698630137, "grad_norm": 0.11483117192983627, "learning_rate": 2.86221343079901e-08, "loss": 0.4613, "num_input_tokens_seen": 107433648, "step": 88560 }, { "epoch": 9.863570553513755, "grad_norm": 0.10930635780096054, "learning_rate": 2.839013919001543e-08, "loss": 0.4532, "num_input_tokens_seen": 107439728, "step": 88565 }, { "epoch": 9.864127408397371, "grad_norm": 0.10394281148910522, "learning_rate": 2.815908757948793e-08, "loss": 0.456, "num_input_tokens_seen": 107445872, "step": 88570 }, { "epoch": 9.864684263280989, "grad_norm": 0.10263840854167938, "learning_rate": 2.792897948513673e-08, "loss": 0.4607, "num_input_tokens_seen": 107451600, "step": 88575 }, { "epoch": 9.865241118164606, "grad_norm": 0.09240356832742691, "learning_rate": 2.7699814915654875e-08, "loss": 0.4679, "num_input_tokens_seen": 107457648, "step": 88580 }, { "epoch": 9.865797973048224, "grad_norm": 0.11529164761304855, "learning_rate": 2.7471593879702107e-08, "loss": 0.4652, "num_input_tokens_seen": 107463824, "step": 88585 }, { "epoch": 9.866354827931842, "grad_norm": 0.1011548787355423, "learning_rate": 2.7244316385899304e-08, "loss": 0.4549, "num_input_tokens_seen": 107469904, "step": 88590 }, { "epoch": 9.866911682815457, "grad_norm": 0.10841234028339386, "learning_rate": 2.701798244283682e-08, "loss": 0.4647, "num_input_tokens_seen": 107475888, "step": 88595 }, { "epoch": 9.867468537699075, "grad_norm": 0.13594844937324524, "learning_rate": 2.6792592059066145e-08, "loss": 0.4574, "num_input_tokens_seen": 107482032, "step": 88600 }, { "epoch": 9.868025392582693, "grad_norm": 0.11669432371854782, "learning_rate": 2.6568145243099918e-08, "loss": 0.4499, "num_input_tokens_seen": 107488016, "step": 88605 }, { "epoch": 9.86858224746631, "grad_norm": 0.09317833930253983, "learning_rate": 2.634464200342024e-08, "loss": 0.4586, "num_input_tokens_seen": 107493840, "step": 88610 }, { "epoch": 9.869139102349928, "grad_norm": 0.0976518988609314, "learning_rate": 2.612208234847313e-08, "loss": 0.4624, "num_input_tokens_seen": 107499952, "step": 88615 }, { "epoch": 9.869695957233546, "grad_norm": 0.1532791554927826, "learning_rate": 2.5900466286665758e-08, "loss": 0.4668, "num_input_tokens_seen": 107506128, "step": 88620 }, { "epoch": 9.870252812117162, "grad_norm": 0.12739847600460052, "learning_rate": 2.567979382637198e-08, "loss": 0.4653, "num_input_tokens_seen": 107512400, "step": 88625 }, { "epoch": 9.87080966700078, "grad_norm": 0.1184864342212677, "learning_rate": 2.546006497593234e-08, "loss": 0.4646, "num_input_tokens_seen": 107518544, "step": 88630 }, { "epoch": 9.871366521884397, "grad_norm": 0.11855459213256836, "learning_rate": 2.5241279743642988e-08, "loss": 0.4577, "num_input_tokens_seen": 107524528, "step": 88635 }, { "epoch": 9.871923376768015, "grad_norm": 0.09808643162250519, "learning_rate": 2.502343813777508e-08, "loss": 0.4543, "num_input_tokens_seen": 107530480, "step": 88640 }, { "epoch": 9.872480231651632, "grad_norm": 0.10247400403022766, "learning_rate": 2.4806540166558144e-08, "loss": 0.4647, "num_input_tokens_seen": 107536560, "step": 88645 }, { "epoch": 9.873037086535248, "grad_norm": 0.08266176283359528, "learning_rate": 2.459058583818563e-08, "loss": 0.4382, "num_input_tokens_seen": 107542320, "step": 88650 }, { "epoch": 9.873593941418866, "grad_norm": 0.12013322114944458, "learning_rate": 2.437557516082045e-08, "loss": 0.4692, "num_input_tokens_seen": 107548336, "step": 88655 }, { "epoch": 9.874150796302484, "grad_norm": 0.19332978129386902, "learning_rate": 2.416150814258389e-08, "loss": 0.4616, "num_input_tokens_seen": 107554448, "step": 88660 }, { "epoch": 9.874707651186101, "grad_norm": 0.11409226804971695, "learning_rate": 2.3948384791563916e-08, "loss": 0.4672, "num_input_tokens_seen": 107560432, "step": 88665 }, { "epoch": 9.875264506069719, "grad_norm": 0.14539483189582825, "learning_rate": 2.3736205115812427e-08, "loss": 0.4682, "num_input_tokens_seen": 107566768, "step": 88670 }, { "epoch": 9.875821360953335, "grad_norm": 0.11316125094890594, "learning_rate": 2.352496912335078e-08, "loss": 0.454, "num_input_tokens_seen": 107573296, "step": 88675 }, { "epoch": 9.876378215836953, "grad_norm": 0.16951315104961395, "learning_rate": 2.3314676822153158e-08, "loss": 0.4681, "num_input_tokens_seen": 107579504, "step": 88680 }, { "epoch": 9.87693507072057, "grad_norm": 0.10514306277036667, "learning_rate": 2.310532822017153e-08, "loss": 0.4544, "num_input_tokens_seen": 107585584, "step": 88685 }, { "epoch": 9.877491925604188, "grad_norm": 0.08966363221406937, "learning_rate": 2.289692332531068e-08, "loss": 0.4547, "num_input_tokens_seen": 107591472, "step": 88690 }, { "epoch": 9.878048780487806, "grad_norm": 0.1367761343717575, "learning_rate": 2.2689462145450424e-08, "loss": 0.4667, "num_input_tokens_seen": 107597616, "step": 88695 }, { "epoch": 9.878605635371422, "grad_norm": 0.0999002680182457, "learning_rate": 2.248294468842338e-08, "loss": 0.4622, "num_input_tokens_seen": 107604048, "step": 88700 }, { "epoch": 9.87916249025504, "grad_norm": 0.13508392870426178, "learning_rate": 2.2277370962034416e-08, "loss": 0.4573, "num_input_tokens_seen": 107610096, "step": 88705 }, { "epoch": 9.879719345138657, "grad_norm": 0.12985223531723022, "learning_rate": 2.207274097405232e-08, "loss": 0.454, "num_input_tokens_seen": 107616336, "step": 88710 }, { "epoch": 9.880276200022275, "grad_norm": 0.11872931569814682, "learning_rate": 2.1869054732209792e-08, "loss": 0.4537, "num_input_tokens_seen": 107622416, "step": 88715 }, { "epoch": 9.880833054905892, "grad_norm": 0.14509674906730652, "learning_rate": 2.1666312244197905e-08, "loss": 0.4599, "num_input_tokens_seen": 107628624, "step": 88720 }, { "epoch": 9.881389909789508, "grad_norm": 0.08765432238578796, "learning_rate": 2.146451351768275e-08, "loss": 0.4711, "num_input_tokens_seen": 107634608, "step": 88725 }, { "epoch": 9.881946764673126, "grad_norm": 0.11942141503095627, "learning_rate": 2.126365856028323e-08, "loss": 0.4595, "num_input_tokens_seen": 107640464, "step": 88730 }, { "epoch": 9.882503619556744, "grad_norm": 0.08146383613348007, "learning_rate": 2.106374737959049e-08, "loss": 0.4535, "num_input_tokens_seen": 107646928, "step": 88735 }, { "epoch": 9.883060474440361, "grad_norm": 0.11946853995323181, "learning_rate": 2.086477998315961e-08, "loss": 0.4653, "num_input_tokens_seen": 107652816, "step": 88740 }, { "epoch": 9.883617329323979, "grad_norm": 0.11729221791028976, "learning_rate": 2.066675637850679e-08, "loss": 0.4476, "num_input_tokens_seen": 107658992, "step": 88745 }, { "epoch": 9.884174184207595, "grad_norm": 0.08013647049665451, "learning_rate": 2.0469676573114936e-08, "loss": 0.4585, "num_input_tokens_seen": 107665296, "step": 88750 }, { "epoch": 9.884731039091212, "grad_norm": 0.12446510791778564, "learning_rate": 2.0273540574428094e-08, "loss": 0.444, "num_input_tokens_seen": 107671856, "step": 88755 }, { "epoch": 9.88528789397483, "grad_norm": 0.1259097009897232, "learning_rate": 2.007834838985978e-08, "loss": 0.4563, "num_input_tokens_seen": 107677968, "step": 88760 }, { "epoch": 9.885844748858448, "grad_norm": 0.13861216604709625, "learning_rate": 1.988410002678187e-08, "loss": 0.4566, "num_input_tokens_seen": 107683696, "step": 88765 }, { "epoch": 9.886401603742065, "grad_norm": 0.12540902197360992, "learning_rate": 1.9690795492538493e-08, "loss": 0.4645, "num_input_tokens_seen": 107689680, "step": 88770 }, { "epoch": 9.886958458625681, "grad_norm": 0.10791999101638794, "learning_rate": 1.949843479442659e-08, "loss": 0.4731, "num_input_tokens_seen": 107695952, "step": 88775 }, { "epoch": 9.887515313509299, "grad_norm": 0.1009560078382492, "learning_rate": 1.9307017939720895e-08, "loss": 0.4638, "num_input_tokens_seen": 107702160, "step": 88780 }, { "epoch": 9.888072168392917, "grad_norm": 0.12186558544635773, "learning_rate": 1.911654493564896e-08, "loss": 0.4399, "num_input_tokens_seen": 107708336, "step": 88785 }, { "epoch": 9.888629023276534, "grad_norm": 0.10482240468263626, "learning_rate": 1.892701578940781e-08, "loss": 0.46, "num_input_tokens_seen": 107714032, "step": 88790 }, { "epoch": 9.889185878160152, "grad_norm": 0.08577672392129898, "learning_rate": 1.8738430508161155e-08, "loss": 0.4665, "num_input_tokens_seen": 107719728, "step": 88795 }, { "epoch": 9.889742733043768, "grad_norm": 0.09342560917139053, "learning_rate": 1.855078909903385e-08, "loss": 0.4647, "num_input_tokens_seen": 107726032, "step": 88800 }, { "epoch": 9.890299587927386, "grad_norm": 0.14635322988033295, "learning_rate": 1.8364091569114672e-08, "loss": 0.4529, "num_input_tokens_seen": 107731888, "step": 88805 }, { "epoch": 9.890856442811003, "grad_norm": 0.10127624869346619, "learning_rate": 1.8178337925456314e-08, "loss": 0.4577, "num_input_tokens_seen": 107738160, "step": 88810 }, { "epoch": 9.891413297694621, "grad_norm": 0.10981027036905289, "learning_rate": 1.7993528175078157e-08, "loss": 0.4567, "num_input_tokens_seen": 107744304, "step": 88815 }, { "epoch": 9.891970152578239, "grad_norm": 0.13889822363853455, "learning_rate": 1.7809662324963505e-08, "loss": 0.4516, "num_input_tokens_seen": 107750448, "step": 88820 }, { "epoch": 9.892527007461855, "grad_norm": 0.11678502708673477, "learning_rate": 1.762674038205958e-08, "loss": 0.4524, "num_input_tokens_seen": 107756144, "step": 88825 }, { "epoch": 9.893083862345472, "grad_norm": 0.10575742274522781, "learning_rate": 1.7444762353277522e-08, "loss": 0.4623, "num_input_tokens_seen": 107762256, "step": 88830 }, { "epoch": 9.89364071722909, "grad_norm": 0.098967544734478, "learning_rate": 1.726372824549516e-08, "loss": 0.4601, "num_input_tokens_seen": 107768176, "step": 88835 }, { "epoch": 9.894197572112708, "grad_norm": 0.09596779942512512, "learning_rate": 1.7083638065545914e-08, "loss": 0.461, "num_input_tokens_seen": 107774224, "step": 88840 }, { "epoch": 9.894754426996325, "grad_norm": 0.1072373315691948, "learning_rate": 1.690449182024101e-08, "loss": 0.4633, "num_input_tokens_seen": 107780400, "step": 88845 }, { "epoch": 9.895311281879943, "grad_norm": 0.10459576547145844, "learning_rate": 1.6726289516347248e-08, "loss": 0.4582, "num_input_tokens_seen": 107786512, "step": 88850 }, { "epoch": 9.895868136763559, "grad_norm": 0.11855962127447128, "learning_rate": 1.6549031160595364e-08, "loss": 0.4566, "num_input_tokens_seen": 107792624, "step": 88855 }, { "epoch": 9.896424991647176, "grad_norm": 0.12978506088256836, "learning_rate": 1.6372716759685546e-08, "loss": 0.4559, "num_input_tokens_seen": 107798736, "step": 88860 }, { "epoch": 9.896981846530794, "grad_norm": 0.1182347759604454, "learning_rate": 1.619734632027914e-08, "loss": 0.4593, "num_input_tokens_seen": 107804848, "step": 88865 }, { "epoch": 9.897538701414412, "grad_norm": 0.128945991396904, "learning_rate": 1.6022919848998618e-08, "loss": 0.4602, "num_input_tokens_seen": 107810672, "step": 88870 }, { "epoch": 9.89809555629803, "grad_norm": 0.08736097067594528, "learning_rate": 1.5849437352438712e-08, "loss": 0.4567, "num_input_tokens_seen": 107816528, "step": 88875 }, { "epoch": 9.898652411181645, "grad_norm": 0.1257215291261673, "learning_rate": 1.5676898837155285e-08, "loss": 0.4687, "num_input_tokens_seen": 107822480, "step": 88880 }, { "epoch": 9.899209266065263, "grad_norm": 0.13308313488960266, "learning_rate": 1.5505304309662573e-08, "loss": 0.4673, "num_input_tokens_seen": 107828720, "step": 88885 }, { "epoch": 9.89976612094888, "grad_norm": 0.16327029466629028, "learning_rate": 1.533465377644705e-08, "loss": 0.4493, "num_input_tokens_seen": 107834896, "step": 88890 }, { "epoch": 9.900322975832498, "grad_norm": 0.1181190013885498, "learning_rate": 1.5164947243953566e-08, "loss": 0.4503, "num_input_tokens_seen": 107840752, "step": 88895 }, { "epoch": 9.900879830716116, "grad_norm": 0.12286645919084549, "learning_rate": 1.4996184718599204e-08, "loss": 0.4738, "num_input_tokens_seen": 107846768, "step": 88900 }, { "epoch": 9.901436685599732, "grad_norm": 0.11564013361930847, "learning_rate": 1.4828366206753875e-08, "loss": 0.4596, "num_input_tokens_seen": 107852848, "step": 88905 }, { "epoch": 9.90199354048335, "grad_norm": 0.11420424282550812, "learning_rate": 1.4661491714762498e-08, "loss": 0.455, "num_input_tokens_seen": 107859056, "step": 88910 }, { "epoch": 9.902550395366967, "grad_norm": 0.11227264255285263, "learning_rate": 1.4495561248931145e-08, "loss": 0.4593, "num_input_tokens_seen": 107865104, "step": 88915 }, { "epoch": 9.903107250250585, "grad_norm": 0.09046076983213425, "learning_rate": 1.433057481552702e-08, "loss": 0.4633, "num_input_tokens_seen": 107870896, "step": 88920 }, { "epoch": 9.903664105134203, "grad_norm": 0.09323319047689438, "learning_rate": 1.4166532420784028e-08, "loss": 0.4675, "num_input_tokens_seen": 107877104, "step": 88925 }, { "epoch": 9.904220960017819, "grad_norm": 0.0904374048113823, "learning_rate": 1.4003434070902766e-08, "loss": 0.4626, "num_input_tokens_seen": 107882640, "step": 88930 }, { "epoch": 9.904777814901436, "grad_norm": 0.09007230401039124, "learning_rate": 1.3841279772039417e-08, "loss": 0.4564, "num_input_tokens_seen": 107888688, "step": 88935 }, { "epoch": 9.905334669785054, "grad_norm": 0.09596376866102219, "learning_rate": 1.3680069530327966e-08, "loss": 0.4665, "num_input_tokens_seen": 107894864, "step": 88940 }, { "epoch": 9.905891524668672, "grad_norm": 0.117661252617836, "learning_rate": 1.3519803351852434e-08, "loss": 0.4736, "num_input_tokens_seen": 107900912, "step": 88945 }, { "epoch": 9.90644837955229, "grad_norm": 0.1484503298997879, "learning_rate": 1.3360481242674638e-08, "loss": 0.4648, "num_input_tokens_seen": 107907120, "step": 88950 }, { "epoch": 9.907005234435905, "grad_norm": 0.12397059798240662, "learning_rate": 1.3202103208809213e-08, "loss": 0.4568, "num_input_tokens_seen": 107913200, "step": 88955 }, { "epoch": 9.907562089319523, "grad_norm": 0.08819670230150223, "learning_rate": 1.3044669256245812e-08, "loss": 0.464, "num_input_tokens_seen": 107919088, "step": 88960 }, { "epoch": 9.90811894420314, "grad_norm": 0.10409349203109741, "learning_rate": 1.2888179390926903e-08, "loss": 0.4652, "num_input_tokens_seen": 107925168, "step": 88965 }, { "epoch": 9.908675799086758, "grad_norm": 0.10143726319074631, "learning_rate": 1.2732633618767197e-08, "loss": 0.4493, "num_input_tokens_seen": 107931440, "step": 88970 }, { "epoch": 9.909232653970376, "grad_norm": 0.11386708170175552, "learning_rate": 1.257803194564533e-08, "loss": 0.4634, "num_input_tokens_seen": 107936496, "step": 88975 }, { "epoch": 9.909789508853994, "grad_norm": 0.10404583066701889, "learning_rate": 1.2424374377403847e-08, "loss": 0.4611, "num_input_tokens_seen": 107942640, "step": 88980 }, { "epoch": 9.91034636373761, "grad_norm": 0.10177421569824219, "learning_rate": 1.2271660919843664e-08, "loss": 0.4579, "num_input_tokens_seen": 107948912, "step": 88985 }, { "epoch": 9.910903218621227, "grad_norm": 0.09056518226861954, "learning_rate": 1.2119891578735166e-08, "loss": 0.4636, "num_input_tokens_seen": 107955056, "step": 88990 }, { "epoch": 9.911460073504845, "grad_norm": 0.11234752088785172, "learning_rate": 1.1969066359818204e-08, "loss": 0.4624, "num_input_tokens_seen": 107960464, "step": 88995 }, { "epoch": 9.912016928388462, "grad_norm": 0.0884019210934639, "learning_rate": 1.1819185268788225e-08, "loss": 0.4698, "num_input_tokens_seen": 107966736, "step": 89000 }, { "epoch": 9.91257378327208, "grad_norm": 0.10871337354183197, "learning_rate": 1.1670248311304589e-08, "loss": 0.4686, "num_input_tokens_seen": 107972752, "step": 89005 }, { "epoch": 9.913130638155696, "grad_norm": 0.11770880967378616, "learning_rate": 1.1522255493001677e-08, "loss": 0.4577, "num_input_tokens_seen": 107979088, "step": 89010 }, { "epoch": 9.913687493039314, "grad_norm": 0.10273239016532898, "learning_rate": 1.1375206819463913e-08, "loss": 0.4536, "num_input_tokens_seen": 107985200, "step": 89015 }, { "epoch": 9.914244347922931, "grad_norm": 0.11607690900564194, "learning_rate": 1.1229102296253512e-08, "loss": 0.4635, "num_input_tokens_seen": 107991216, "step": 89020 }, { "epoch": 9.914801202806549, "grad_norm": 0.10698869824409485, "learning_rate": 1.1083941928888287e-08, "loss": 0.4637, "num_input_tokens_seen": 107997104, "step": 89025 }, { "epoch": 9.915358057690167, "grad_norm": 0.1233111172914505, "learning_rate": 1.0939725722849959e-08, "loss": 0.4577, "num_input_tokens_seen": 108003056, "step": 89030 }, { "epoch": 9.915914912573783, "grad_norm": 0.12452246248722076, "learning_rate": 1.07964536835925e-08, "loss": 0.4629, "num_input_tokens_seen": 108009264, "step": 89035 }, { "epoch": 9.9164717674574, "grad_norm": 0.1626574993133545, "learning_rate": 1.0654125816525473e-08, "loss": 0.4562, "num_input_tokens_seen": 108015504, "step": 89040 }, { "epoch": 9.917028622341018, "grad_norm": 0.12178532779216766, "learning_rate": 1.0512742127027908e-08, "loss": 0.4702, "num_input_tokens_seen": 108021488, "step": 89045 }, { "epoch": 9.917585477224636, "grad_norm": 0.09815513342618942, "learning_rate": 1.0372302620442754e-08, "loss": 0.4576, "num_input_tokens_seen": 108027696, "step": 89050 }, { "epoch": 9.918142332108253, "grad_norm": 0.0945163369178772, "learning_rate": 1.0232807302074099e-08, "loss": 0.4578, "num_input_tokens_seen": 108033872, "step": 89055 }, { "epoch": 9.91869918699187, "grad_norm": 0.15065909922122955, "learning_rate": 1.0094256177195504e-08, "loss": 0.4556, "num_input_tokens_seen": 108039728, "step": 89060 }, { "epoch": 9.919256041875487, "grad_norm": 0.08904151618480682, "learning_rate": 9.956649251038897e-09, "loss": 0.4641, "num_input_tokens_seen": 108045168, "step": 89065 }, { "epoch": 9.919812896759105, "grad_norm": 0.1004749983549118, "learning_rate": 9.819986528802894e-09, "loss": 0.4524, "num_input_tokens_seen": 108051248, "step": 89070 }, { "epoch": 9.920369751642722, "grad_norm": 0.0948164314031601, "learning_rate": 9.684268015655585e-09, "loss": 0.4551, "num_input_tokens_seen": 108057136, "step": 89075 }, { "epoch": 9.92092660652634, "grad_norm": 0.12450253963470459, "learning_rate": 9.549493716720647e-09, "loss": 0.4475, "num_input_tokens_seen": 108062864, "step": 89080 }, { "epoch": 9.921483461409956, "grad_norm": 0.10272054374217987, "learning_rate": 9.415663637091232e-09, "loss": 0.4637, "num_input_tokens_seen": 108069200, "step": 89085 }, { "epoch": 9.922040316293574, "grad_norm": 0.13182510435581207, "learning_rate": 9.282777781824403e-09, "loss": 0.4449, "num_input_tokens_seen": 108075600, "step": 89090 }, { "epoch": 9.922597171177191, "grad_norm": 0.07190896570682526, "learning_rate": 9.150836155941146e-09, "loss": 0.457, "num_input_tokens_seen": 108081776, "step": 89095 }, { "epoch": 9.923154026060809, "grad_norm": 0.08419892191886902, "learning_rate": 9.019838764429133e-09, "loss": 0.464, "num_input_tokens_seen": 108087952, "step": 89100 }, { "epoch": 9.923710880944427, "grad_norm": 0.11697308719158173, "learning_rate": 8.889785612231639e-09, "loss": 0.4671, "num_input_tokens_seen": 108093840, "step": 89105 }, { "epoch": 9.924267735828042, "grad_norm": 0.09286569058895111, "learning_rate": 8.760676704266946e-09, "loss": 0.4555, "num_input_tokens_seen": 108099728, "step": 89110 }, { "epoch": 9.92482459071166, "grad_norm": 0.08916304260492325, "learning_rate": 8.632512045411711e-09, "loss": 0.4564, "num_input_tokens_seen": 108106128, "step": 89115 }, { "epoch": 9.925381445595278, "grad_norm": 0.12118116021156311, "learning_rate": 8.505291640509282e-09, "loss": 0.4603, "num_input_tokens_seen": 108112080, "step": 89120 }, { "epoch": 9.925938300478895, "grad_norm": 0.07755202054977417, "learning_rate": 8.379015494366927e-09, "loss": 0.4723, "num_input_tokens_seen": 108118256, "step": 89125 }, { "epoch": 9.926495155362513, "grad_norm": 0.14580287039279938, "learning_rate": 8.253683611755825e-09, "loss": 0.4689, "num_input_tokens_seen": 108124752, "step": 89130 }, { "epoch": 9.927052010246129, "grad_norm": 0.0985972210764885, "learning_rate": 8.129295997408304e-09, "loss": 0.4521, "num_input_tokens_seen": 108131056, "step": 89135 }, { "epoch": 9.927608865129747, "grad_norm": 0.1373796910047531, "learning_rate": 8.005852656026159e-09, "loss": 0.4618, "num_input_tokens_seen": 108136816, "step": 89140 }, { "epoch": 9.928165720013364, "grad_norm": 0.09840263426303864, "learning_rate": 7.8833535922751e-09, "loss": 0.4631, "num_input_tokens_seen": 108143024, "step": 89145 }, { "epoch": 9.928722574896982, "grad_norm": 0.11632303148508072, "learning_rate": 7.761798810781984e-09, "loss": 0.4549, "num_input_tokens_seen": 108149392, "step": 89150 }, { "epoch": 9.9292794297806, "grad_norm": 0.11185140162706375, "learning_rate": 7.641188316140358e-09, "loss": 0.4675, "num_input_tokens_seen": 108155728, "step": 89155 }, { "epoch": 9.929836284664216, "grad_norm": 0.18076935410499573, "learning_rate": 7.52152211290491e-09, "loss": 0.4595, "num_input_tokens_seen": 108161648, "step": 89160 }, { "epoch": 9.930393139547833, "grad_norm": 0.16261142492294312, "learning_rate": 7.402800205599803e-09, "loss": 0.4644, "num_input_tokens_seen": 108168016, "step": 89165 }, { "epoch": 9.930949994431451, "grad_norm": 0.15653277933597565, "learning_rate": 7.2850225987075586e-09, "loss": 0.4705, "num_input_tokens_seen": 108174416, "step": 89170 }, { "epoch": 9.931506849315069, "grad_norm": 0.10082535445690155, "learning_rate": 7.168189296682948e-09, "loss": 0.4396, "num_input_tokens_seen": 108180720, "step": 89175 }, { "epoch": 9.932063704198686, "grad_norm": 0.09819089621305466, "learning_rate": 7.052300303936332e-09, "loss": 0.4562, "num_input_tokens_seen": 108186800, "step": 89180 }, { "epoch": 9.932620559082302, "grad_norm": 0.14076170325279236, "learning_rate": 6.9373556248475415e-09, "loss": 0.4682, "num_input_tokens_seen": 108192688, "step": 89185 }, { "epoch": 9.93317741396592, "grad_norm": 0.08193650096654892, "learning_rate": 6.823355263760322e-09, "loss": 0.4553, "num_input_tokens_seen": 108198960, "step": 89190 }, { "epoch": 9.933734268849538, "grad_norm": 0.12868750095367432, "learning_rate": 6.710299224979566e-09, "loss": 0.4675, "num_input_tokens_seen": 108205232, "step": 89195 }, { "epoch": 9.934291123733155, "grad_norm": 0.10042770951986313, "learning_rate": 6.5981875127824054e-09, "loss": 0.4538, "num_input_tokens_seen": 108211408, "step": 89200 }, { "epoch": 9.934847978616773, "grad_norm": 0.10373411327600479, "learning_rate": 6.487020131396015e-09, "loss": 0.4638, "num_input_tokens_seen": 108217680, "step": 89205 }, { "epoch": 9.93540483350039, "grad_norm": 0.10008221119642258, "learning_rate": 6.376797085028141e-09, "loss": 0.4628, "num_input_tokens_seen": 108223760, "step": 89210 }, { "epoch": 9.935961688384007, "grad_norm": 0.09823935478925705, "learning_rate": 6.267518377842118e-09, "loss": 0.4583, "num_input_tokens_seen": 108230032, "step": 89215 }, { "epoch": 9.936518543267624, "grad_norm": 0.13079378008842468, "learning_rate": 6.1591840139652025e-09, "loss": 0.4604, "num_input_tokens_seen": 108235920, "step": 89220 }, { "epoch": 9.937075398151242, "grad_norm": 0.12510061264038086, "learning_rate": 6.051793997488564e-09, "loss": 0.4633, "num_input_tokens_seen": 108242352, "step": 89225 }, { "epoch": 9.93763225303486, "grad_norm": 0.1507100760936737, "learning_rate": 5.94534833247562e-09, "loss": 0.4599, "num_input_tokens_seen": 108248496, "step": 89230 }, { "epoch": 9.938189107918477, "grad_norm": 0.08613859862089157, "learning_rate": 5.839847022942602e-09, "loss": 0.4608, "num_input_tokens_seen": 108254800, "step": 89235 }, { "epoch": 9.938745962802093, "grad_norm": 0.11324544996023178, "learning_rate": 5.7352900728779856e-09, "loss": 0.4619, "num_input_tokens_seen": 108260528, "step": 89240 }, { "epoch": 9.93930281768571, "grad_norm": 0.09340850263834, "learning_rate": 5.63167748623139e-09, "loss": 0.474, "num_input_tokens_seen": 108266736, "step": 89245 }, { "epoch": 9.939859672569328, "grad_norm": 0.09657448530197144, "learning_rate": 5.5290092669219025e-09, "loss": 0.4639, "num_input_tokens_seen": 108272336, "step": 89250 }, { "epoch": 9.940416527452946, "grad_norm": 0.09057297557592392, "learning_rate": 5.4272854188214265e-09, "loss": 0.4635, "num_input_tokens_seen": 108278576, "step": 89255 }, { "epoch": 9.940973382336564, "grad_norm": 0.19223792850971222, "learning_rate": 5.326505945779658e-09, "loss": 0.4534, "num_input_tokens_seen": 108284976, "step": 89260 }, { "epoch": 9.94153023722018, "grad_norm": 0.09935781359672546, "learning_rate": 5.226670851599113e-09, "loss": 0.4621, "num_input_tokens_seen": 108291216, "step": 89265 }, { "epoch": 9.942087092103797, "grad_norm": 0.11524885892868042, "learning_rate": 5.127780140057325e-09, "loss": 0.452, "num_input_tokens_seen": 108297328, "step": 89270 }, { "epoch": 9.942643946987415, "grad_norm": 0.07808130979537964, "learning_rate": 5.029833814887419e-09, "loss": 0.4686, "num_input_tokens_seen": 108303536, "step": 89275 }, { "epoch": 9.943200801871033, "grad_norm": 0.1245456114411354, "learning_rate": 4.932831879789213e-09, "loss": 0.4521, "num_input_tokens_seen": 108309712, "step": 89280 }, { "epoch": 9.94375765675465, "grad_norm": 0.11168219149112701, "learning_rate": 4.836774338429218e-09, "loss": 0.4536, "num_input_tokens_seen": 108315568, "step": 89285 }, { "epoch": 9.944314511638266, "grad_norm": 0.1025550588965416, "learning_rate": 4.741661194437863e-09, "loss": 0.4685, "num_input_tokens_seen": 108321744, "step": 89290 }, { "epoch": 9.944871366521884, "grad_norm": 0.12661421298980713, "learning_rate": 4.647492451409497e-09, "loss": 0.4615, "num_input_tokens_seen": 108327632, "step": 89295 }, { "epoch": 9.945428221405502, "grad_norm": 0.12717877328395844, "learning_rate": 4.554268112896831e-09, "loss": 0.456, "num_input_tokens_seen": 108334032, "step": 89300 }, { "epoch": 9.94598507628912, "grad_norm": 0.0979216992855072, "learning_rate": 4.461988182427601e-09, "loss": 0.4526, "num_input_tokens_seen": 108340208, "step": 89305 }, { "epoch": 9.946541931172737, "grad_norm": 0.1324424296617508, "learning_rate": 4.370652663487906e-09, "loss": 0.4603, "num_input_tokens_seen": 108346384, "step": 89310 }, { "epoch": 9.947098786056353, "grad_norm": 0.12379071861505508, "learning_rate": 4.28026155952499e-09, "loss": 0.4789, "num_input_tokens_seen": 108352560, "step": 89315 }, { "epoch": 9.94765564093997, "grad_norm": 0.16563472151756287, "learning_rate": 4.1908148739583375e-09, "loss": 0.4608, "num_input_tokens_seen": 108359056, "step": 89320 }, { "epoch": 9.948212495823588, "grad_norm": 0.08305522054433823, "learning_rate": 4.102312610165804e-09, "loss": 0.4573, "num_input_tokens_seen": 108365264, "step": 89325 }, { "epoch": 9.948769350707206, "grad_norm": 0.11157649755477905, "learning_rate": 4.01475477148916e-09, "loss": 0.4632, "num_input_tokens_seen": 108371344, "step": 89330 }, { "epoch": 9.949326205590824, "grad_norm": 0.13299183547496796, "learning_rate": 3.928141361239646e-09, "loss": 0.44, "num_input_tokens_seen": 108377296, "step": 89335 }, { "epoch": 9.949883060474441, "grad_norm": 0.08827235549688339, "learning_rate": 3.842472382689643e-09, "loss": 0.4584, "num_input_tokens_seen": 108383280, "step": 89340 }, { "epoch": 9.950439915358057, "grad_norm": 0.09897653013467789, "learning_rate": 3.757747839075454e-09, "loss": 0.4589, "num_input_tokens_seen": 108389264, "step": 89345 }, { "epoch": 9.950996770241675, "grad_norm": 0.10092124342918396, "learning_rate": 3.673967733594519e-09, "loss": 0.4601, "num_input_tokens_seen": 108395248, "step": 89350 }, { "epoch": 9.951553625125293, "grad_norm": 0.11826402693986893, "learning_rate": 3.5911320694193007e-09, "loss": 0.4658, "num_input_tokens_seen": 108401200, "step": 89355 }, { "epoch": 9.95211048000891, "grad_norm": 0.09730926156044006, "learning_rate": 3.5092408496750773e-09, "loss": 0.4584, "num_input_tokens_seen": 108407344, "step": 89360 }, { "epoch": 9.952667334892528, "grad_norm": 0.12695099413394928, "learning_rate": 3.4282940774565953e-09, "loss": 0.4582, "num_input_tokens_seen": 108413680, "step": 89365 }, { "epoch": 9.953224189776144, "grad_norm": 0.10323130339384079, "learning_rate": 3.348291755822519e-09, "loss": 0.4633, "num_input_tokens_seen": 108419760, "step": 89370 }, { "epoch": 9.953781044659761, "grad_norm": 0.09601142257452011, "learning_rate": 3.269233887795431e-09, "loss": 0.4442, "num_input_tokens_seen": 108425808, "step": 89375 }, { "epoch": 9.954337899543379, "grad_norm": 0.117954783141613, "learning_rate": 3.1911204763646063e-09, "loss": 0.459, "num_input_tokens_seen": 108431632, "step": 89380 }, { "epoch": 9.954894754426997, "grad_norm": 0.15485766530036926, "learning_rate": 3.113951524477687e-09, "loss": 0.4781, "num_input_tokens_seen": 108437808, "step": 89385 }, { "epoch": 9.955451609310614, "grad_norm": 0.10180610418319702, "learning_rate": 3.037727035051785e-09, "loss": 0.4724, "num_input_tokens_seen": 108444016, "step": 89390 }, { "epoch": 9.95600846419423, "grad_norm": 0.08464282751083374, "learning_rate": 2.9624470109679282e-09, "loss": 0.4603, "num_input_tokens_seen": 108450096, "step": 89395 }, { "epoch": 9.956565319077848, "grad_norm": 0.10022054612636566, "learning_rate": 2.888111455071063e-09, "loss": 0.4643, "num_input_tokens_seen": 108455472, "step": 89400 }, { "epoch": 9.957122173961466, "grad_norm": 0.1244768500328064, "learning_rate": 2.8147203701672784e-09, "loss": 0.4633, "num_input_tokens_seen": 108461712, "step": 89405 }, { "epoch": 9.957679028845083, "grad_norm": 0.10300721228122711, "learning_rate": 2.742273759029357e-09, "loss": 0.4594, "num_input_tokens_seen": 108467792, "step": 89410 }, { "epoch": 9.958235883728701, "grad_norm": 0.09314712136983871, "learning_rate": 2.6707716243967727e-09, "loss": 0.4687, "num_input_tokens_seen": 108473808, "step": 89415 }, { "epoch": 9.958792738612317, "grad_norm": 0.12060855329036713, "learning_rate": 2.6002139689729198e-09, "loss": 0.4615, "num_input_tokens_seen": 108479760, "step": 89420 }, { "epoch": 9.959349593495935, "grad_norm": 0.11288363486528397, "learning_rate": 2.5306007954167822e-09, "loss": 0.4655, "num_input_tokens_seen": 108485872, "step": 89425 }, { "epoch": 9.959906448379552, "grad_norm": 0.12275931984186172, "learning_rate": 2.461932106365139e-09, "loss": 0.4619, "num_input_tokens_seen": 108491984, "step": 89430 }, { "epoch": 9.96046330326317, "grad_norm": 0.10694552958011627, "learning_rate": 2.3942079044103615e-09, "loss": 0.4585, "num_input_tokens_seen": 108498096, "step": 89435 }, { "epoch": 9.961020158146788, "grad_norm": 0.1256035566329956, "learning_rate": 2.327428192111514e-09, "loss": 0.4694, "num_input_tokens_seen": 108504176, "step": 89440 }, { "epoch": 9.961577013030404, "grad_norm": 0.12137678265571594, "learning_rate": 2.2615929719915776e-09, "loss": 0.4615, "num_input_tokens_seen": 108510288, "step": 89445 }, { "epoch": 9.962133867914021, "grad_norm": 0.13268372416496277, "learning_rate": 2.196702246537452e-09, "loss": 0.4619, "num_input_tokens_seen": 108516464, "step": 89450 }, { "epoch": 9.962690722797639, "grad_norm": 0.10685417056083679, "learning_rate": 2.132756018199955e-09, "loss": 0.4689, "num_input_tokens_seen": 108522800, "step": 89455 }, { "epoch": 9.963247577681257, "grad_norm": 0.09294797480106354, "learning_rate": 2.0697542893993727e-09, "loss": 0.4622, "num_input_tokens_seen": 108528944, "step": 89460 }, { "epoch": 9.963804432564874, "grad_norm": 0.10665635019540787, "learning_rate": 2.007697062511582e-09, "loss": 0.4656, "num_input_tokens_seen": 108534640, "step": 89465 }, { "epoch": 9.96436128744849, "grad_norm": 0.08426600694656372, "learning_rate": 1.9465843398819295e-09, "loss": 0.465, "num_input_tokens_seen": 108540624, "step": 89470 }, { "epoch": 9.964918142332108, "grad_norm": 0.1052444651722908, "learning_rate": 1.8864161238224544e-09, "loss": 0.4553, "num_input_tokens_seen": 108547248, "step": 89475 }, { "epoch": 9.965474997215725, "grad_norm": 0.15599705278873444, "learning_rate": 1.8271924166035625e-09, "loss": 0.4612, "num_input_tokens_seen": 108553488, "step": 89480 }, { "epoch": 9.966031852099343, "grad_norm": 0.10222277045249939, "learning_rate": 1.7689132204623538e-09, "loss": 0.453, "num_input_tokens_seen": 108559760, "step": 89485 }, { "epoch": 9.96658870698296, "grad_norm": 0.11619455367326736, "learning_rate": 1.7115785376053962e-09, "loss": 0.4675, "num_input_tokens_seen": 108565744, "step": 89490 }, { "epoch": 9.967145561866577, "grad_norm": 0.11419608443975449, "learning_rate": 1.655188370194849e-09, "loss": 0.4686, "num_input_tokens_seen": 108572080, "step": 89495 }, { "epoch": 9.967702416750194, "grad_norm": 0.12431519478559494, "learning_rate": 1.5997427203595649e-09, "loss": 0.4633, "num_input_tokens_seen": 108577968, "step": 89500 }, { "epoch": 9.968259271633812, "grad_norm": 0.09443891793489456, "learning_rate": 1.5452415902006411e-09, "loss": 0.4725, "num_input_tokens_seen": 108583856, "step": 89505 }, { "epoch": 9.96881612651743, "grad_norm": 0.11818234622478485, "learning_rate": 1.4916849817747659e-09, "loss": 0.4554, "num_input_tokens_seen": 108589744, "step": 89510 }, { "epoch": 9.969372981401047, "grad_norm": 0.1321096420288086, "learning_rate": 1.4390728971025447e-09, "loss": 0.4623, "num_input_tokens_seen": 108595856, "step": 89515 }, { "epoch": 9.969929836284663, "grad_norm": 0.09121234714984894, "learning_rate": 1.3874053381740526e-09, "loss": 0.4703, "num_input_tokens_seen": 108601808, "step": 89520 }, { "epoch": 9.970486691168281, "grad_norm": 0.11549649387598038, "learning_rate": 1.3366823069405066e-09, "loss": 0.4621, "num_input_tokens_seen": 108607920, "step": 89525 }, { "epoch": 9.971043546051899, "grad_norm": 0.10164019465446472, "learning_rate": 1.2869038053225924e-09, "loss": 0.4543, "num_input_tokens_seen": 108614000, "step": 89530 }, { "epoch": 9.971600400935516, "grad_norm": 0.10578157007694244, "learning_rate": 1.2380698351938113e-09, "loss": 0.4546, "num_input_tokens_seen": 108619984, "step": 89535 }, { "epoch": 9.972157255819134, "grad_norm": 0.12072551995515823, "learning_rate": 1.1901803984054605e-09, "loss": 0.4676, "num_input_tokens_seen": 108626352, "step": 89540 }, { "epoch": 9.97271411070275, "grad_norm": 0.11415085941553116, "learning_rate": 1.1432354967644277e-09, "loss": 0.4583, "num_input_tokens_seen": 108632464, "step": 89545 }, { "epoch": 9.973270965586368, "grad_norm": 0.10730789601802826, "learning_rate": 1.0972351320442942e-09, "loss": 0.4486, "num_input_tokens_seen": 108638544, "step": 89550 }, { "epoch": 9.973827820469985, "grad_norm": 0.17110101878643036, "learning_rate": 1.0521793059853347e-09, "loss": 0.4593, "num_input_tokens_seen": 108644816, "step": 89555 }, { "epoch": 9.974384675353603, "grad_norm": 0.13416285812854767, "learning_rate": 1.0080680202861903e-09, "loss": 0.4608, "num_input_tokens_seen": 108650896, "step": 89560 }, { "epoch": 9.97494153023722, "grad_norm": 0.10846327245235443, "learning_rate": 9.649012766177467e-10, "loss": 0.4729, "num_input_tokens_seen": 108656944, "step": 89565 }, { "epoch": 9.975498385120838, "grad_norm": 0.09606007486581802, "learning_rate": 9.226790766064808e-10, "loss": 0.4586, "num_input_tokens_seen": 108663088, "step": 89570 }, { "epoch": 9.976055240004454, "grad_norm": 0.1255459189414978, "learning_rate": 8.81401421848338e-10, "loss": 0.4563, "num_input_tokens_seen": 108669456, "step": 89575 }, { "epoch": 9.976612094888072, "grad_norm": 0.08962364494800568, "learning_rate": 8.410683139087328e-10, "loss": 0.4595, "num_input_tokens_seen": 108675632, "step": 89580 }, { "epoch": 9.97716894977169, "grad_norm": 0.11248406767845154, "learning_rate": 8.016797543031196e-10, "loss": 0.4724, "num_input_tokens_seen": 108681776, "step": 89585 }, { "epoch": 9.977725804655307, "grad_norm": 0.0928574651479721, "learning_rate": 7.63235744527524e-10, "loss": 0.4525, "num_input_tokens_seen": 108688016, "step": 89590 }, { "epoch": 9.978282659538925, "grad_norm": 0.12380961328744888, "learning_rate": 7.257362860280115e-10, "loss": 0.4657, "num_input_tokens_seen": 108694384, "step": 89595 }, { "epoch": 9.97883951442254, "grad_norm": 0.10471273213624954, "learning_rate": 6.891813802256674e-10, "loss": 0.4527, "num_input_tokens_seen": 108700624, "step": 89600 }, { "epoch": 9.979396369306158, "grad_norm": 0.08097172528505325, "learning_rate": 6.535710284999441e-10, "loss": 0.45, "num_input_tokens_seen": 108706768, "step": 89605 }, { "epoch": 9.979953224189776, "grad_norm": 0.1284007579088211, "learning_rate": 6.189052321969868e-10, "loss": 0.4595, "num_input_tokens_seen": 108712784, "step": 89610 }, { "epoch": 9.980510079073394, "grad_norm": 0.10019106417894363, "learning_rate": 5.851839926268588e-10, "loss": 0.4601, "num_input_tokens_seen": 108718448, "step": 89615 }, { "epoch": 9.981066933957011, "grad_norm": 0.08105280995368958, "learning_rate": 5.524073110607652e-10, "loss": 0.4491, "num_input_tokens_seen": 108724688, "step": 89620 }, { "epoch": 9.981623788840627, "grad_norm": 0.10754148662090302, "learning_rate": 5.205751887421561e-10, "loss": 0.4579, "num_input_tokens_seen": 108730928, "step": 89625 }, { "epoch": 9.982180643724245, "grad_norm": 0.13674134016036987, "learning_rate": 4.896876268672967e-10, "loss": 0.4474, "num_input_tokens_seen": 108737424, "step": 89630 }, { "epoch": 9.982737498607863, "grad_norm": 0.1175629049539566, "learning_rate": 4.5974462661024787e-10, "loss": 0.466, "num_input_tokens_seen": 108743408, "step": 89635 }, { "epoch": 9.98329435349148, "grad_norm": 0.16728270053863525, "learning_rate": 4.307461890978859e-10, "loss": 0.4627, "num_input_tokens_seen": 108749552, "step": 89640 }, { "epoch": 9.983851208375098, "grad_norm": 0.08689147979021072, "learning_rate": 4.0269231542655606e-10, "loss": 0.4647, "num_input_tokens_seen": 108755760, "step": 89645 }, { "epoch": 9.984408063258714, "grad_norm": 0.10440342873334885, "learning_rate": 3.7558300665652136e-10, "loss": 0.4657, "num_input_tokens_seen": 108761968, "step": 89650 }, { "epoch": 9.984964918142332, "grad_norm": 0.09384243935346603, "learning_rate": 3.4941826381473806e-10, "loss": 0.4583, "num_input_tokens_seen": 108768144, "step": 89655 }, { "epoch": 9.98552177302595, "grad_norm": 0.10587408393621445, "learning_rate": 3.2419808788375363e-10, "loss": 0.474, "num_input_tokens_seen": 108774352, "step": 89660 }, { "epoch": 9.986078627909567, "grad_norm": 0.11511848866939545, "learning_rate": 2.999224798211353e-10, "loss": 0.4426, "num_input_tokens_seen": 108780272, "step": 89665 }, { "epoch": 9.986635482793185, "grad_norm": 0.11744485050439835, "learning_rate": 2.765914405455927e-10, "loss": 0.4631, "num_input_tokens_seen": 108786576, "step": 89670 }, { "epoch": 9.987192337676802, "grad_norm": 0.12419044971466064, "learning_rate": 2.542049709314265e-10, "loss": 0.4732, "num_input_tokens_seen": 108792752, "step": 89675 }, { "epoch": 9.987749192560418, "grad_norm": 0.11795279383659363, "learning_rate": 2.3276307183350831e-10, "loss": 0.4682, "num_input_tokens_seen": 108798832, "step": 89680 }, { "epoch": 9.988306047444036, "grad_norm": 0.1306319683790207, "learning_rate": 2.1226574405397437e-10, "loss": 0.4538, "num_input_tokens_seen": 108805200, "step": 89685 }, { "epoch": 9.988862902327654, "grad_norm": 0.09822555631399155, "learning_rate": 1.9271298837275632e-10, "loss": 0.468, "num_input_tokens_seen": 108810960, "step": 89690 }, { "epoch": 9.989419757211271, "grad_norm": 0.15325644612312317, "learning_rate": 1.741048055253769e-10, "loss": 0.4597, "num_input_tokens_seen": 108817040, "step": 89695 }, { "epoch": 9.989976612094889, "grad_norm": 0.10134247690439224, "learning_rate": 1.5644119621682774e-10, "loss": 0.4685, "num_input_tokens_seen": 108822608, "step": 89700 }, { "epoch": 9.990533466978505, "grad_norm": 0.08419308066368103, "learning_rate": 1.3972216111324266e-10, "loss": 0.4591, "num_input_tokens_seen": 108828912, "step": 89705 }, { "epoch": 9.991090321862123, "grad_norm": 0.09186803549528122, "learning_rate": 1.2394770084744877e-10, "loss": 0.476, "num_input_tokens_seen": 108835024, "step": 89710 }, { "epoch": 9.99164717674574, "grad_norm": 0.10662825405597687, "learning_rate": 1.0911781601619097e-10, "loss": 0.4745, "num_input_tokens_seen": 108841040, "step": 89715 }, { "epoch": 9.992204031629358, "grad_norm": 0.10840100049972534, "learning_rate": 9.523250717735632e-11, "loss": 0.4655, "num_input_tokens_seen": 108847280, "step": 89720 }, { "epoch": 9.992760886512976, "grad_norm": 0.16122961044311523, "learning_rate": 8.22917748555252e-11, "loss": 0.4561, "num_input_tokens_seen": 108853264, "step": 89725 }, { "epoch": 9.993317741396591, "grad_norm": 0.12221598625183105, "learning_rate": 7.029561954197129e-11, "loss": 0.4625, "num_input_tokens_seen": 108859536, "step": 89730 }, { "epoch": 9.99387459628021, "grad_norm": 0.10414834320545197, "learning_rate": 5.924404168911046e-11, "loss": 0.4643, "num_input_tokens_seen": 108865680, "step": 89735 }, { "epoch": 9.994431451163827, "grad_norm": 0.10446713864803314, "learning_rate": 4.9137041716051937e-11, "loss": 0.4604, "num_input_tokens_seen": 108871696, "step": 89740 }, { "epoch": 9.994988306047444, "grad_norm": 0.10702849924564362, "learning_rate": 3.9974620000271525e-11, "loss": 0.4624, "num_input_tokens_seen": 108877872, "step": 89745 }, { "epoch": 9.995545160931062, "grad_norm": 0.12336177378892899, "learning_rate": 3.175677689148948e-11, "loss": 0.4484, "num_input_tokens_seen": 108883632, "step": 89750 }, { "epoch": 9.996102015814678, "grad_norm": 0.09711413830518723, "learning_rate": 2.44835126977927e-11, "loss": 0.4574, "num_input_tokens_seen": 108889776, "step": 89755 }, { "epoch": 9.996658870698296, "grad_norm": 0.08569693565368652, "learning_rate": 1.8154827696736932e-11, "loss": 0.4609, "num_input_tokens_seen": 108895760, "step": 89760 }, { "epoch": 9.997215725581913, "grad_norm": 0.11914149671792984, "learning_rate": 1.277072212702013e-11, "loss": 0.4617, "num_input_tokens_seen": 108901776, "step": 89765 }, { "epoch": 9.997772580465531, "grad_norm": 0.10579286515712738, "learning_rate": 8.331196191257994e-12, "loss": 0.4514, "num_input_tokens_seen": 108907728, "step": 89770 }, { "epoch": 9.998329435349149, "grad_norm": 0.10732223093509674, "learning_rate": 4.836250055983982e-12, "loss": 0.4576, "num_input_tokens_seen": 108913840, "step": 89775 }, { "epoch": 9.998886290232765, "grad_norm": 0.09419000148773193, "learning_rate": 2.285883854424853e-12, "loss": 0.4598, "num_input_tokens_seen": 108919824, "step": 89780 }, { "epoch": 9.999443145116382, "grad_norm": 0.12580066919326782, "learning_rate": 6.800976837251227e-13, "loss": 0.4595, "num_input_tokens_seen": 108925200, "step": 89785 }, { "epoch": 10.0, "grad_norm": 0.16037555038928986, "learning_rate": 1.889160217150021e-14, "loss": 0.466, "num_input_tokens_seen": 108930064, "step": 89790 }, { "epoch": 10.0, "num_input_tokens_seen": 108930064, "step": 89790, "total_flos": 4.905071549016834e+18, "train_loss": 0.6625794311902188, "train_runtime": 25298.2139, "train_samples_per_second": 14.196, "train_steps_per_second": 3.549 } ], "logging_steps": 5, "max_steps": 89790, "num_input_tokens_seen": 108930064, "num_train_epochs": 10, "save_steps": 4490, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.905071549016834e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }