{ "best_global_step": 26856, "best_metric": 0.6477048397064209, "best_model_checkpoint": "saves/p-tuning/llama-3-8b-instruct/train_math_qa_1754652175/checkpoint-26856", "epoch": 10.0, "eval_steps": 3357, "global_step": 67140, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007447125409591898, "grad_norm": 384.2735290527344, "learning_rate": 2.978850163836759e-08, "loss": 7.196, "num_input_tokens_seen": 2976, "step": 5 }, { "epoch": 0.0014894250819183796, "grad_norm": 309.330810546875, "learning_rate": 6.702412868632709e-08, "loss": 7.2587, "num_input_tokens_seen": 5920, "step": 10 }, { "epoch": 0.002234137622877569, "grad_norm": 369.9922790527344, "learning_rate": 1.0425975573428657e-07, "loss": 6.546, "num_input_tokens_seen": 8832, "step": 15 }, { "epoch": 0.002978850163836759, "grad_norm": 386.3720703125, "learning_rate": 1.4149538278224606e-07, "loss": 6.6276, "num_input_tokens_seen": 11648, "step": 20 }, { "epoch": 0.0037235627047959487, "grad_norm": 316.9941711425781, "learning_rate": 1.7873100983020555e-07, "loss": 6.5739, "num_input_tokens_seen": 14368, "step": 25 }, { "epoch": 0.004468275245755138, "grad_norm": 260.60894775390625, "learning_rate": 2.1596663687816505e-07, "loss": 6.138, "num_input_tokens_seen": 17280, "step": 30 }, { "epoch": 0.005212987786714328, "grad_norm": 133.60693359375, "learning_rate": 2.532022639261245e-07, "loss": 5.3035, "num_input_tokens_seen": 20160, "step": 35 }, { "epoch": 0.005957700327673518, "grad_norm": 236.73626708984375, "learning_rate": 2.90437890974084e-07, "loss": 5.5269, "num_input_tokens_seen": 23104, "step": 40 }, { "epoch": 0.006702412868632708, "grad_norm": 204.2643280029297, "learning_rate": 3.276735180220435e-07, "loss": 4.9813, "num_input_tokens_seen": 26048, "step": 45 }, { "epoch": 0.0074471254095918975, "grad_norm": 264.57879638671875, "learning_rate": 3.64909145070003e-07, "loss": 4.6177, "num_input_tokens_seen": 28928, "step": 50 }, { "epoch": 0.008191837950551088, "grad_norm": 165.04800415039062, "learning_rate": 4.021447721179625e-07, "loss": 4.1926, "num_input_tokens_seen": 32256, "step": 55 }, { "epoch": 0.008936550491510277, "grad_norm": 68.80213165283203, "learning_rate": 4.3938039916592203e-07, "loss": 4.0465, "num_input_tokens_seen": 35232, "step": 60 }, { "epoch": 0.009681263032469467, "grad_norm": 98.37369537353516, "learning_rate": 4.7661602621388146e-07, "loss": 3.996, "num_input_tokens_seen": 38112, "step": 65 }, { "epoch": 0.010425975573428656, "grad_norm": 49.42142868041992, "learning_rate": 5.13851653261841e-07, "loss": 3.7153, "num_input_tokens_seen": 41120, "step": 70 }, { "epoch": 0.011170688114387846, "grad_norm": 46.37179183959961, "learning_rate": 5.510872803098004e-07, "loss": 3.5405, "num_input_tokens_seen": 44064, "step": 75 }, { "epoch": 0.011915400655347037, "grad_norm": 41.566741943359375, "learning_rate": 5.8832290735776e-07, "loss": 3.2765, "num_input_tokens_seen": 46592, "step": 80 }, { "epoch": 0.012660113196306225, "grad_norm": 35.53861999511719, "learning_rate": 6.255585344057195e-07, "loss": 3.2172, "num_input_tokens_seen": 49152, "step": 85 }, { "epoch": 0.013404825737265416, "grad_norm": 37.274925231933594, "learning_rate": 6.627941614536789e-07, "loss": 3.1597, "num_input_tokens_seen": 51776, "step": 90 }, { "epoch": 0.014149538278224605, "grad_norm": 92.0075912475586, "learning_rate": 7.000297885016385e-07, "loss": 2.7179, "num_input_tokens_seen": 54624, "step": 95 }, { "epoch": 0.014894250819183795, "grad_norm": 35.84732437133789, "learning_rate": 7.372654155495979e-07, "loss": 2.8843, "num_input_tokens_seen": 57312, "step": 100 }, { "epoch": 0.015638963360142984, "grad_norm": 65.053955078125, "learning_rate": 7.745010425975574e-07, "loss": 2.8324, "num_input_tokens_seen": 60064, "step": 105 }, { "epoch": 0.016383675901102176, "grad_norm": 47.91868209838867, "learning_rate": 8.11736669645517e-07, "loss": 2.6194, "num_input_tokens_seen": 63424, "step": 110 }, { "epoch": 0.017128388442061365, "grad_norm": 72.74422454833984, "learning_rate": 8.489722966934764e-07, "loss": 2.3547, "num_input_tokens_seen": 66080, "step": 115 }, { "epoch": 0.017873100983020553, "grad_norm": 41.6552734375, "learning_rate": 8.862079237414358e-07, "loss": 2.3412, "num_input_tokens_seen": 69440, "step": 120 }, { "epoch": 0.018617813523979745, "grad_norm": 52.65083312988281, "learning_rate": 9.234435507893953e-07, "loss": 2.0561, "num_input_tokens_seen": 72320, "step": 125 }, { "epoch": 0.019362526064938934, "grad_norm": 73.25337982177734, "learning_rate": 9.606791778373549e-07, "loss": 1.8618, "num_input_tokens_seen": 75168, "step": 130 }, { "epoch": 0.020107238605898123, "grad_norm": 109.52399444580078, "learning_rate": 9.979148048853143e-07, "loss": 1.8099, "num_input_tokens_seen": 78176, "step": 135 }, { "epoch": 0.02085195114685731, "grad_norm": 114.76470184326172, "learning_rate": 1.035150431933274e-06, "loss": 1.5512, "num_input_tokens_seen": 81248, "step": 140 }, { "epoch": 0.021596663687816504, "grad_norm": 47.98325729370117, "learning_rate": 1.0723860589812334e-06, "loss": 1.5942, "num_input_tokens_seen": 84416, "step": 145 }, { "epoch": 0.022341376228775692, "grad_norm": 54.5960693359375, "learning_rate": 1.1096216860291928e-06, "loss": 1.4061, "num_input_tokens_seen": 87584, "step": 150 }, { "epoch": 0.02308608876973488, "grad_norm": 50.606163024902344, "learning_rate": 1.1468573130771522e-06, "loss": 1.1484, "num_input_tokens_seen": 90368, "step": 155 }, { "epoch": 0.023830801310694073, "grad_norm": 129.24461364746094, "learning_rate": 1.1840929401251119e-06, "loss": 1.3229, "num_input_tokens_seen": 93152, "step": 160 }, { "epoch": 0.024575513851653262, "grad_norm": 72.86314392089844, "learning_rate": 1.2213285671730713e-06, "loss": 1.0603, "num_input_tokens_seen": 96384, "step": 165 }, { "epoch": 0.02532022639261245, "grad_norm": 74.48533630371094, "learning_rate": 1.2585641942210307e-06, "loss": 1.1298, "num_input_tokens_seen": 99136, "step": 170 }, { "epoch": 0.026064938933571643, "grad_norm": 134.98365783691406, "learning_rate": 1.2957998212689904e-06, "loss": 1.1053, "num_input_tokens_seen": 101824, "step": 175 }, { "epoch": 0.02680965147453083, "grad_norm": 104.92533874511719, "learning_rate": 1.3330354483169498e-06, "loss": 1.0142, "num_input_tokens_seen": 104832, "step": 180 }, { "epoch": 0.02755436401549002, "grad_norm": 70.6898193359375, "learning_rate": 1.3702710753649092e-06, "loss": 1.0161, "num_input_tokens_seen": 107680, "step": 185 }, { "epoch": 0.02829907655644921, "grad_norm": 50.801856994628906, "learning_rate": 1.4075067024128687e-06, "loss": 0.9298, "num_input_tokens_seen": 110560, "step": 190 }, { "epoch": 0.0290437890974084, "grad_norm": 27.82895278930664, "learning_rate": 1.4447423294608283e-06, "loss": 0.9707, "num_input_tokens_seen": 113696, "step": 195 }, { "epoch": 0.02978850163836759, "grad_norm": 47.52273178100586, "learning_rate": 1.4819779565087877e-06, "loss": 1.0271, "num_input_tokens_seen": 116480, "step": 200 }, { "epoch": 0.03053321417932678, "grad_norm": 42.0241584777832, "learning_rate": 1.5192135835567472e-06, "loss": 0.976, "num_input_tokens_seen": 119712, "step": 205 }, { "epoch": 0.03127792672028597, "grad_norm": 47.810264587402344, "learning_rate": 1.5564492106047066e-06, "loss": 0.9068, "num_input_tokens_seen": 122944, "step": 210 }, { "epoch": 0.032022639261245156, "grad_norm": 57.232276916503906, "learning_rate": 1.593684837652666e-06, "loss": 0.7781, "num_input_tokens_seen": 125984, "step": 215 }, { "epoch": 0.03276735180220435, "grad_norm": 71.16899871826172, "learning_rate": 1.6309204647006257e-06, "loss": 1.0523, "num_input_tokens_seen": 129216, "step": 220 }, { "epoch": 0.03351206434316354, "grad_norm": 115.05960083007812, "learning_rate": 1.668156091748585e-06, "loss": 0.9772, "num_input_tokens_seen": 131808, "step": 225 }, { "epoch": 0.03425677688412273, "grad_norm": 36.22675704956055, "learning_rate": 1.7053917187965447e-06, "loss": 0.8965, "num_input_tokens_seen": 135040, "step": 230 }, { "epoch": 0.03500148942508192, "grad_norm": 40.20038604736328, "learning_rate": 1.7426273458445042e-06, "loss": 0.9434, "num_input_tokens_seen": 137792, "step": 235 }, { "epoch": 0.035746201966041107, "grad_norm": 58.371585845947266, "learning_rate": 1.7798629728924636e-06, "loss": 0.9646, "num_input_tokens_seen": 140800, "step": 240 }, { "epoch": 0.036490914507000295, "grad_norm": 79.57978057861328, "learning_rate": 1.817098599940423e-06, "loss": 0.9094, "num_input_tokens_seen": 143616, "step": 245 }, { "epoch": 0.03723562704795949, "grad_norm": 52.2946891784668, "learning_rate": 1.8543342269883825e-06, "loss": 0.9301, "num_input_tokens_seen": 146592, "step": 250 }, { "epoch": 0.03798033958891868, "grad_norm": 143.73617553710938, "learning_rate": 1.8915698540363419e-06, "loss": 1.0944, "num_input_tokens_seen": 149376, "step": 255 }, { "epoch": 0.03872505212987787, "grad_norm": 103.24267578125, "learning_rate": 1.9288054810843013e-06, "loss": 0.9266, "num_input_tokens_seen": 152256, "step": 260 }, { "epoch": 0.03946976467083706, "grad_norm": 55.70581817626953, "learning_rate": 1.966041108132261e-06, "loss": 0.942, "num_input_tokens_seen": 155168, "step": 265 }, { "epoch": 0.040214477211796246, "grad_norm": 58.627933502197266, "learning_rate": 2.0032767351802206e-06, "loss": 0.9838, "num_input_tokens_seen": 158016, "step": 270 }, { "epoch": 0.040959189752755434, "grad_norm": 79.55396270751953, "learning_rate": 2.04051236222818e-06, "loss": 1.055, "num_input_tokens_seen": 160992, "step": 275 }, { "epoch": 0.04170390229371462, "grad_norm": 42.807884216308594, "learning_rate": 2.0777479892761395e-06, "loss": 0.7821, "num_input_tokens_seen": 163616, "step": 280 }, { "epoch": 0.04244861483467382, "grad_norm": 52.488224029541016, "learning_rate": 2.114983616324099e-06, "loss": 0.8794, "num_input_tokens_seen": 166496, "step": 285 }, { "epoch": 0.04319332737563301, "grad_norm": 66.39044189453125, "learning_rate": 2.1522192433720583e-06, "loss": 0.8459, "num_input_tokens_seen": 169248, "step": 290 }, { "epoch": 0.043938039916592196, "grad_norm": 46.977115631103516, "learning_rate": 2.1894548704200177e-06, "loss": 0.9928, "num_input_tokens_seen": 172160, "step": 295 }, { "epoch": 0.044682752457551385, "grad_norm": 99.55119323730469, "learning_rate": 2.2266904974679776e-06, "loss": 1.0563, "num_input_tokens_seen": 174880, "step": 300 }, { "epoch": 0.045427464998510574, "grad_norm": 60.31602096557617, "learning_rate": 2.263926124515937e-06, "loss": 1.009, "num_input_tokens_seen": 177536, "step": 305 }, { "epoch": 0.04617217753946976, "grad_norm": 91.87252807617188, "learning_rate": 2.3011617515638965e-06, "loss": 1.0043, "num_input_tokens_seen": 180384, "step": 310 }, { "epoch": 0.04691689008042895, "grad_norm": 43.13937759399414, "learning_rate": 2.338397378611856e-06, "loss": 0.8316, "num_input_tokens_seen": 183104, "step": 315 }, { "epoch": 0.04766160262138815, "grad_norm": 38.80275344848633, "learning_rate": 2.3756330056598153e-06, "loss": 0.8994, "num_input_tokens_seen": 185792, "step": 320 }, { "epoch": 0.048406315162347335, "grad_norm": 40.838294982910156, "learning_rate": 2.4128686327077747e-06, "loss": 0.8567, "num_input_tokens_seen": 188640, "step": 325 }, { "epoch": 0.049151027703306524, "grad_norm": 31.556795120239258, "learning_rate": 2.4501042597557346e-06, "loss": 0.9353, "num_input_tokens_seen": 191328, "step": 330 }, { "epoch": 0.04989574024426571, "grad_norm": 68.81101989746094, "learning_rate": 2.487339886803694e-06, "loss": 0.8509, "num_input_tokens_seen": 194464, "step": 335 }, { "epoch": 0.0506404527852249, "grad_norm": 71.41007995605469, "learning_rate": 2.5245755138516535e-06, "loss": 0.9665, "num_input_tokens_seen": 196960, "step": 340 }, { "epoch": 0.05138516532618409, "grad_norm": 23.073495864868164, "learning_rate": 2.561811140899613e-06, "loss": 0.8715, "num_input_tokens_seen": 199840, "step": 345 }, { "epoch": 0.052129877867143286, "grad_norm": 43.93472671508789, "learning_rate": 2.5990467679475723e-06, "loss": 0.8209, "num_input_tokens_seen": 202560, "step": 350 }, { "epoch": 0.052874590408102475, "grad_norm": 28.19585609436035, "learning_rate": 2.6362823949955317e-06, "loss": 0.8802, "num_input_tokens_seen": 205600, "step": 355 }, { "epoch": 0.05361930294906166, "grad_norm": 46.12526321411133, "learning_rate": 2.673518022043491e-06, "loss": 0.8617, "num_input_tokens_seen": 208672, "step": 360 }, { "epoch": 0.05436401549002085, "grad_norm": 27.331645965576172, "learning_rate": 2.710753649091451e-06, "loss": 0.8325, "num_input_tokens_seen": 211520, "step": 365 }, { "epoch": 0.05510872803098004, "grad_norm": 34.99951171875, "learning_rate": 2.7479892761394105e-06, "loss": 1.0399, "num_input_tokens_seen": 214496, "step": 370 }, { "epoch": 0.05585344057193923, "grad_norm": 31.88015365600586, "learning_rate": 2.78522490318737e-06, "loss": 0.8416, "num_input_tokens_seen": 217216, "step": 375 }, { "epoch": 0.05659815311289842, "grad_norm": 49.76087188720703, "learning_rate": 2.8224605302353293e-06, "loss": 0.931, "num_input_tokens_seen": 220000, "step": 380 }, { "epoch": 0.057342865653857614, "grad_norm": 34.86581802368164, "learning_rate": 2.8596961572832887e-06, "loss": 1.0547, "num_input_tokens_seen": 222816, "step": 385 }, { "epoch": 0.0580875781948168, "grad_norm": 25.449901580810547, "learning_rate": 2.896931784331248e-06, "loss": 0.841, "num_input_tokens_seen": 225664, "step": 390 }, { "epoch": 0.05883229073577599, "grad_norm": 58.84994125366211, "learning_rate": 2.9341674113792076e-06, "loss": 0.8392, "num_input_tokens_seen": 228800, "step": 395 }, { "epoch": 0.05957700327673518, "grad_norm": 27.022018432617188, "learning_rate": 2.9714030384271675e-06, "loss": 0.8746, "num_input_tokens_seen": 232064, "step": 400 }, { "epoch": 0.06032171581769437, "grad_norm": 31.626529693603516, "learning_rate": 3.008638665475127e-06, "loss": 0.7465, "num_input_tokens_seen": 234880, "step": 405 }, { "epoch": 0.06106642835865356, "grad_norm": 45.08598327636719, "learning_rate": 3.0458742925230863e-06, "loss": 0.9453, "num_input_tokens_seen": 237856, "step": 410 }, { "epoch": 0.06181114089961275, "grad_norm": 56.03879928588867, "learning_rate": 3.0831099195710457e-06, "loss": 0.9598, "num_input_tokens_seen": 240608, "step": 415 }, { "epoch": 0.06255585344057193, "grad_norm": 37.092041015625, "learning_rate": 3.120345546619005e-06, "loss": 0.8648, "num_input_tokens_seen": 243168, "step": 420 }, { "epoch": 0.06330056598153112, "grad_norm": 60.024803161621094, "learning_rate": 3.1575811736669646e-06, "loss": 0.8792, "num_input_tokens_seen": 246304, "step": 425 }, { "epoch": 0.06404527852249031, "grad_norm": 77.80085754394531, "learning_rate": 3.194816800714924e-06, "loss": 1.0397, "num_input_tokens_seen": 249024, "step": 430 }, { "epoch": 0.06478999106344951, "grad_norm": 147.06959533691406, "learning_rate": 3.2320524277628835e-06, "loss": 0.941, "num_input_tokens_seen": 252224, "step": 435 }, { "epoch": 0.0655347036044087, "grad_norm": 19.60956382751465, "learning_rate": 3.269288054810843e-06, "loss": 0.983, "num_input_tokens_seen": 254848, "step": 440 }, { "epoch": 0.06627941614536789, "grad_norm": 32.935848236083984, "learning_rate": 3.3065236818588023e-06, "loss": 0.9496, "num_input_tokens_seen": 257760, "step": 445 }, { "epoch": 0.06702412868632708, "grad_norm": 49.024070739746094, "learning_rate": 3.3437593089067626e-06, "loss": 0.8912, "num_input_tokens_seen": 260384, "step": 450 }, { "epoch": 0.06776884122728627, "grad_norm": 21.89828872680664, "learning_rate": 3.380994935954722e-06, "loss": 0.8947, "num_input_tokens_seen": 263360, "step": 455 }, { "epoch": 0.06851355376824546, "grad_norm": 21.32175064086914, "learning_rate": 3.4182305630026814e-06, "loss": 0.8604, "num_input_tokens_seen": 266112, "step": 460 }, { "epoch": 0.06925826630920465, "grad_norm": 32.043914794921875, "learning_rate": 3.455466190050641e-06, "loss": 0.8826, "num_input_tokens_seen": 268672, "step": 465 }, { "epoch": 0.07000297885016384, "grad_norm": 40.362491607666016, "learning_rate": 3.4927018170986003e-06, "loss": 0.8912, "num_input_tokens_seen": 271328, "step": 470 }, { "epoch": 0.07074769139112302, "grad_norm": 79.70515441894531, "learning_rate": 3.5299374441465597e-06, "loss": 0.8963, "num_input_tokens_seen": 274144, "step": 475 }, { "epoch": 0.07149240393208221, "grad_norm": 31.8050594329834, "learning_rate": 3.567173071194519e-06, "loss": 0.806, "num_input_tokens_seen": 277024, "step": 480 }, { "epoch": 0.0722371164730414, "grad_norm": 28.050004959106445, "learning_rate": 3.6044086982424786e-06, "loss": 0.9033, "num_input_tokens_seen": 279840, "step": 485 }, { "epoch": 0.07298182901400059, "grad_norm": 17.745281219482422, "learning_rate": 3.641644325290438e-06, "loss": 0.8872, "num_input_tokens_seen": 283200, "step": 490 }, { "epoch": 0.07372654155495978, "grad_norm": 71.8849105834961, "learning_rate": 3.6788799523383975e-06, "loss": 0.8645, "num_input_tokens_seen": 286016, "step": 495 }, { "epoch": 0.07447125409591898, "grad_norm": 29.865060806274414, "learning_rate": 3.716115579386357e-06, "loss": 0.8283, "num_input_tokens_seen": 288864, "step": 500 }, { "epoch": 0.07521596663687817, "grad_norm": 82.75289154052734, "learning_rate": 3.7533512064343163e-06, "loss": 1.0209, "num_input_tokens_seen": 291904, "step": 505 }, { "epoch": 0.07596067917783736, "grad_norm": 508.5314636230469, "learning_rate": 3.7905868334822757e-06, "loss": 1.235, "num_input_tokens_seen": 294688, "step": 510 }, { "epoch": 0.07670539171879655, "grad_norm": 87.50004577636719, "learning_rate": 3.827822460530236e-06, "loss": 0.9427, "num_input_tokens_seen": 297792, "step": 515 }, { "epoch": 0.07745010425975574, "grad_norm": 33.57464599609375, "learning_rate": 3.8650580875781954e-06, "loss": 0.886, "num_input_tokens_seen": 300736, "step": 520 }, { "epoch": 0.07819481680071493, "grad_norm": 22.13481903076172, "learning_rate": 3.9022937146261545e-06, "loss": 0.8543, "num_input_tokens_seen": 303488, "step": 525 }, { "epoch": 0.07893952934167411, "grad_norm": 19.5278377532959, "learning_rate": 3.939529341674114e-06, "loss": 0.7119, "num_input_tokens_seen": 306624, "step": 530 }, { "epoch": 0.0796842418826333, "grad_norm": 23.021560668945312, "learning_rate": 3.976764968722073e-06, "loss": 0.8965, "num_input_tokens_seen": 309792, "step": 535 }, { "epoch": 0.08042895442359249, "grad_norm": 38.47995376586914, "learning_rate": 4.014000595770033e-06, "loss": 0.9347, "num_input_tokens_seen": 312576, "step": 540 }, { "epoch": 0.08117366696455168, "grad_norm": 22.319759368896484, "learning_rate": 4.051236222817992e-06, "loss": 0.9361, "num_input_tokens_seen": 315360, "step": 545 }, { "epoch": 0.08191837950551087, "grad_norm": 207.0150604248047, "learning_rate": 4.088471849865952e-06, "loss": 1.0087, "num_input_tokens_seen": 318016, "step": 550 }, { "epoch": 0.08266309204647006, "grad_norm": 19.11591911315918, "learning_rate": 4.125707476913911e-06, "loss": 0.8295, "num_input_tokens_seen": 320768, "step": 555 }, { "epoch": 0.08340780458742925, "grad_norm": 21.130691528320312, "learning_rate": 4.162943103961871e-06, "loss": 0.8625, "num_input_tokens_seen": 323680, "step": 560 }, { "epoch": 0.08415251712838845, "grad_norm": 122.96989440917969, "learning_rate": 4.20017873100983e-06, "loss": 0.9651, "num_input_tokens_seen": 326624, "step": 565 }, { "epoch": 0.08489722966934764, "grad_norm": 24.587867736816406, "learning_rate": 4.23741435805779e-06, "loss": 0.9424, "num_input_tokens_seen": 329216, "step": 570 }, { "epoch": 0.08564194221030683, "grad_norm": 34.31299591064453, "learning_rate": 4.274649985105749e-06, "loss": 0.879, "num_input_tokens_seen": 331904, "step": 575 }, { "epoch": 0.08638665475126601, "grad_norm": 23.151567459106445, "learning_rate": 4.3118856121537094e-06, "loss": 0.855, "num_input_tokens_seen": 334752, "step": 580 }, { "epoch": 0.0871313672922252, "grad_norm": 27.117351531982422, "learning_rate": 4.3491212392016685e-06, "loss": 0.818, "num_input_tokens_seen": 337984, "step": 585 }, { "epoch": 0.08787607983318439, "grad_norm": 29.169565200805664, "learning_rate": 4.386356866249628e-06, "loss": 0.9237, "num_input_tokens_seen": 340800, "step": 590 }, { "epoch": 0.08862079237414358, "grad_norm": 19.36422348022461, "learning_rate": 4.423592493297587e-06, "loss": 0.7448, "num_input_tokens_seen": 343424, "step": 595 }, { "epoch": 0.08936550491510277, "grad_norm": 25.914188385009766, "learning_rate": 4.460828120345547e-06, "loss": 1.0281, "num_input_tokens_seen": 346720, "step": 600 }, { "epoch": 0.09011021745606196, "grad_norm": 24.891721725463867, "learning_rate": 4.498063747393506e-06, "loss": 0.9625, "num_input_tokens_seen": 349728, "step": 605 }, { "epoch": 0.09085492999702115, "grad_norm": 32.27973175048828, "learning_rate": 4.535299374441466e-06, "loss": 0.8841, "num_input_tokens_seen": 352352, "step": 610 }, { "epoch": 0.09159964253798034, "grad_norm": 14.92967700958252, "learning_rate": 4.572535001489425e-06, "loss": 0.7964, "num_input_tokens_seen": 355168, "step": 615 }, { "epoch": 0.09234435507893952, "grad_norm": 39.11081314086914, "learning_rate": 4.609770628537385e-06, "loss": 0.8582, "num_input_tokens_seen": 357920, "step": 620 }, { "epoch": 0.09308906761989871, "grad_norm": 12.691473007202148, "learning_rate": 4.647006255585344e-06, "loss": 0.9372, "num_input_tokens_seen": 360704, "step": 625 }, { "epoch": 0.0938337801608579, "grad_norm": 13.83273696899414, "learning_rate": 4.684241882633304e-06, "loss": 0.8964, "num_input_tokens_seen": 363872, "step": 630 }, { "epoch": 0.0945784927018171, "grad_norm": 10.634119033813477, "learning_rate": 4.721477509681263e-06, "loss": 0.8797, "num_input_tokens_seen": 366976, "step": 635 }, { "epoch": 0.0953232052427763, "grad_norm": 16.53303337097168, "learning_rate": 4.758713136729223e-06, "loss": 0.8006, "num_input_tokens_seen": 369824, "step": 640 }, { "epoch": 0.09606791778373548, "grad_norm": 25.52064323425293, "learning_rate": 4.7959487637771824e-06, "loss": 0.8516, "num_input_tokens_seen": 372832, "step": 645 }, { "epoch": 0.09681263032469467, "grad_norm": 82.76335144042969, "learning_rate": 4.833184390825142e-06, "loss": 0.9315, "num_input_tokens_seen": 375872, "step": 650 }, { "epoch": 0.09755734286565386, "grad_norm": 24.21207618713379, "learning_rate": 4.870420017873101e-06, "loss": 0.8474, "num_input_tokens_seen": 378752, "step": 655 }, { "epoch": 0.09830205540661305, "grad_norm": 65.38968658447266, "learning_rate": 4.907655644921061e-06, "loss": 0.8887, "num_input_tokens_seen": 381632, "step": 660 }, { "epoch": 0.09904676794757224, "grad_norm": 19.700666427612305, "learning_rate": 4.94489127196902e-06, "loss": 0.8347, "num_input_tokens_seen": 384832, "step": 665 }, { "epoch": 0.09979148048853143, "grad_norm": 30.88331413269043, "learning_rate": 4.98212689901698e-06, "loss": 0.7682, "num_input_tokens_seen": 388192, "step": 670 }, { "epoch": 0.10053619302949061, "grad_norm": 34.961570739746094, "learning_rate": 5.019362526064939e-06, "loss": 0.9034, "num_input_tokens_seen": 391072, "step": 675 }, { "epoch": 0.1012809055704498, "grad_norm": 26.899402618408203, "learning_rate": 5.056598153112899e-06, "loss": 1.1481, "num_input_tokens_seen": 393792, "step": 680 }, { "epoch": 0.10202561811140899, "grad_norm": 22.513469696044922, "learning_rate": 5.093833780160858e-06, "loss": 0.9595, "num_input_tokens_seen": 396800, "step": 685 }, { "epoch": 0.10277033065236818, "grad_norm": 14.955849647521973, "learning_rate": 5.131069407208818e-06, "loss": 0.8885, "num_input_tokens_seen": 399648, "step": 690 }, { "epoch": 0.10351504319332737, "grad_norm": 34.269142150878906, "learning_rate": 5.168305034256777e-06, "loss": 0.7986, "num_input_tokens_seen": 402528, "step": 695 }, { "epoch": 0.10425975573428657, "grad_norm": 14.514220237731934, "learning_rate": 5.205540661304737e-06, "loss": 0.8624, "num_input_tokens_seen": 405696, "step": 700 }, { "epoch": 0.10500446827524576, "grad_norm": 20.435548782348633, "learning_rate": 5.242776288352696e-06, "loss": 0.857, "num_input_tokens_seen": 408864, "step": 705 }, { "epoch": 0.10574918081620495, "grad_norm": 19.15341567993164, "learning_rate": 5.2800119154006555e-06, "loss": 0.8417, "num_input_tokens_seen": 411616, "step": 710 }, { "epoch": 0.10649389335716414, "grad_norm": 38.98746109008789, "learning_rate": 5.317247542448615e-06, "loss": 0.8854, "num_input_tokens_seen": 414528, "step": 715 }, { "epoch": 0.10723860589812333, "grad_norm": 39.29813766479492, "learning_rate": 5.354483169496575e-06, "loss": 0.8461, "num_input_tokens_seen": 417440, "step": 720 }, { "epoch": 0.10798331843908252, "grad_norm": 13.457878112792969, "learning_rate": 5.391718796544534e-06, "loss": 0.7645, "num_input_tokens_seen": 420288, "step": 725 }, { "epoch": 0.1087280309800417, "grad_norm": 25.66771697998047, "learning_rate": 5.428954423592494e-06, "loss": 0.9127, "num_input_tokens_seen": 423168, "step": 730 }, { "epoch": 0.10947274352100089, "grad_norm": 11.338489532470703, "learning_rate": 5.466190050640453e-06, "loss": 0.8004, "num_input_tokens_seen": 426176, "step": 735 }, { "epoch": 0.11021745606196008, "grad_norm": 11.74666976928711, "learning_rate": 5.503425677688413e-06, "loss": 0.8383, "num_input_tokens_seen": 429344, "step": 740 }, { "epoch": 0.11096216860291927, "grad_norm": 33.5170783996582, "learning_rate": 5.540661304736372e-06, "loss": 0.8357, "num_input_tokens_seen": 432032, "step": 745 }, { "epoch": 0.11170688114387846, "grad_norm": 80.95526123046875, "learning_rate": 5.577896931784332e-06, "loss": 0.9268, "num_input_tokens_seen": 434784, "step": 750 }, { "epoch": 0.11245159368483765, "grad_norm": 19.06873321533203, "learning_rate": 5.615132558832291e-06, "loss": 0.8091, "num_input_tokens_seen": 437952, "step": 755 }, { "epoch": 0.11319630622579684, "grad_norm": 19.61686134338379, "learning_rate": 5.652368185880251e-06, "loss": 0.8245, "num_input_tokens_seen": 440672, "step": 760 }, { "epoch": 0.11394101876675604, "grad_norm": 28.665714263916016, "learning_rate": 5.68960381292821e-06, "loss": 0.9108, "num_input_tokens_seen": 443680, "step": 765 }, { "epoch": 0.11468573130771523, "grad_norm": 15.7631196975708, "learning_rate": 5.7268394399761695e-06, "loss": 0.849, "num_input_tokens_seen": 446400, "step": 770 }, { "epoch": 0.11543044384867442, "grad_norm": 25.1241397857666, "learning_rate": 5.7640750670241285e-06, "loss": 0.8606, "num_input_tokens_seen": 449184, "step": 775 }, { "epoch": 0.1161751563896336, "grad_norm": 74.70035552978516, "learning_rate": 5.801310694072089e-06, "loss": 0.8904, "num_input_tokens_seen": 452032, "step": 780 }, { "epoch": 0.1169198689305928, "grad_norm": 28.079103469848633, "learning_rate": 5.838546321120048e-06, "loss": 0.9192, "num_input_tokens_seen": 455072, "step": 785 }, { "epoch": 0.11766458147155198, "grad_norm": 39.37375259399414, "learning_rate": 5.875781948168008e-06, "loss": 0.8831, "num_input_tokens_seen": 457760, "step": 790 }, { "epoch": 0.11840929401251117, "grad_norm": 20.784372329711914, "learning_rate": 5.913017575215967e-06, "loss": 0.8503, "num_input_tokens_seen": 460608, "step": 795 }, { "epoch": 0.11915400655347036, "grad_norm": 13.502420425415039, "learning_rate": 5.950253202263927e-06, "loss": 0.7846, "num_input_tokens_seen": 463456, "step": 800 }, { "epoch": 0.11989871909442955, "grad_norm": 14.02131462097168, "learning_rate": 5.987488829311886e-06, "loss": 0.8532, "num_input_tokens_seen": 466336, "step": 805 }, { "epoch": 0.12064343163538874, "grad_norm": 22.632625579833984, "learning_rate": 6.024724456359846e-06, "loss": 0.8915, "num_input_tokens_seen": 469280, "step": 810 }, { "epoch": 0.12138814417634793, "grad_norm": 8.645807266235352, "learning_rate": 6.061960083407805e-06, "loss": 0.8188, "num_input_tokens_seen": 472096, "step": 815 }, { "epoch": 0.12213285671730711, "grad_norm": 38.09095764160156, "learning_rate": 6.099195710455765e-06, "loss": 0.8545, "num_input_tokens_seen": 474944, "step": 820 }, { "epoch": 0.1228775692582663, "grad_norm": 10.994053840637207, "learning_rate": 6.136431337503724e-06, "loss": 0.8171, "num_input_tokens_seen": 477696, "step": 825 }, { "epoch": 0.1236222817992255, "grad_norm": 30.468585968017578, "learning_rate": 6.1736669645516834e-06, "loss": 0.8097, "num_input_tokens_seen": 480672, "step": 830 }, { "epoch": 0.1243669943401847, "grad_norm": 6.249985218048096, "learning_rate": 6.2109025915996425e-06, "loss": 0.8718, "num_input_tokens_seen": 483712, "step": 835 }, { "epoch": 0.12511170688114387, "grad_norm": 12.650419235229492, "learning_rate": 6.248138218647602e-06, "loss": 0.8227, "num_input_tokens_seen": 486528, "step": 840 }, { "epoch": 0.12585641942210307, "grad_norm": 29.767961502075195, "learning_rate": 6.285373845695562e-06, "loss": 0.7903, "num_input_tokens_seen": 489184, "step": 845 }, { "epoch": 0.12660113196306225, "grad_norm": 30.84075164794922, "learning_rate": 6.322609472743521e-06, "loss": 0.8724, "num_input_tokens_seen": 492096, "step": 850 }, { "epoch": 0.12734584450402145, "grad_norm": 13.216416358947754, "learning_rate": 6.359845099791481e-06, "loss": 0.937, "num_input_tokens_seen": 495168, "step": 855 }, { "epoch": 0.12809055704498062, "grad_norm": 10.031280517578125, "learning_rate": 6.39708072683944e-06, "loss": 0.8293, "num_input_tokens_seen": 498112, "step": 860 }, { "epoch": 0.12883526958593983, "grad_norm": 23.172744750976562, "learning_rate": 6.4343163538874e-06, "loss": 0.8206, "num_input_tokens_seen": 501216, "step": 865 }, { "epoch": 0.12957998212689903, "grad_norm": 23.427671432495117, "learning_rate": 6.471551980935359e-06, "loss": 0.7863, "num_input_tokens_seen": 504000, "step": 870 }, { "epoch": 0.1303246946678582, "grad_norm": 8.583948135375977, "learning_rate": 6.508787607983319e-06, "loss": 0.8806, "num_input_tokens_seen": 506912, "step": 875 }, { "epoch": 0.1310694072088174, "grad_norm": 9.231948852539062, "learning_rate": 6.546023235031279e-06, "loss": 0.8242, "num_input_tokens_seen": 509696, "step": 880 }, { "epoch": 0.13181411974977658, "grad_norm": 14.37458324432373, "learning_rate": 6.583258862079238e-06, "loss": 0.8784, "num_input_tokens_seen": 512544, "step": 885 }, { "epoch": 0.13255883229073578, "grad_norm": 16.442590713500977, "learning_rate": 6.6204944891271974e-06, "loss": 1.013, "num_input_tokens_seen": 515392, "step": 890 }, { "epoch": 0.13330354483169496, "grad_norm": 8.911401748657227, "learning_rate": 6.6577301161751565e-06, "loss": 0.8238, "num_input_tokens_seen": 518560, "step": 895 }, { "epoch": 0.13404825737265416, "grad_norm": 10.364609718322754, "learning_rate": 6.694965743223116e-06, "loss": 0.8272, "num_input_tokens_seen": 521440, "step": 900 }, { "epoch": 0.13479296991361334, "grad_norm": 8.257158279418945, "learning_rate": 6.732201370271075e-06, "loss": 0.8323, "num_input_tokens_seen": 524352, "step": 905 }, { "epoch": 0.13553768245457254, "grad_norm": 751.1448974609375, "learning_rate": 6.769436997319035e-06, "loss": 1.0111, "num_input_tokens_seen": 527808, "step": 910 }, { "epoch": 0.1362823949955317, "grad_norm": 26.618240356445312, "learning_rate": 6.806672624366994e-06, "loss": 1.2627, "num_input_tokens_seen": 530688, "step": 915 }, { "epoch": 0.13702710753649092, "grad_norm": 14.821549415588379, "learning_rate": 6.843908251414954e-06, "loss": 0.9499, "num_input_tokens_seen": 533408, "step": 920 }, { "epoch": 0.1377718200774501, "grad_norm": 11.543157577514648, "learning_rate": 6.881143878462913e-06, "loss": 0.8445, "num_input_tokens_seen": 536480, "step": 925 }, { "epoch": 0.1385165326184093, "grad_norm": 74.36726379394531, "learning_rate": 6.918379505510873e-06, "loss": 0.8811, "num_input_tokens_seen": 539072, "step": 930 }, { "epoch": 0.1392612451593685, "grad_norm": 44.18201446533203, "learning_rate": 6.955615132558832e-06, "loss": 1.0344, "num_input_tokens_seen": 542208, "step": 935 }, { "epoch": 0.14000595770032767, "grad_norm": 30.792160034179688, "learning_rate": 6.992850759606792e-06, "loss": 0.9861, "num_input_tokens_seen": 545152, "step": 940 }, { "epoch": 0.14075067024128687, "grad_norm": 54.5593147277832, "learning_rate": 7.0300863866547524e-06, "loss": 0.941, "num_input_tokens_seen": 547776, "step": 945 }, { "epoch": 0.14149538278224605, "grad_norm": 8.817277908325195, "learning_rate": 7.067322013702711e-06, "loss": 0.8363, "num_input_tokens_seen": 550656, "step": 950 }, { "epoch": 0.14224009532320525, "grad_norm": 14.601434707641602, "learning_rate": 7.104557640750671e-06, "loss": 0.858, "num_input_tokens_seen": 553376, "step": 955 }, { "epoch": 0.14298480786416443, "grad_norm": 24.415319442749023, "learning_rate": 7.14179326779863e-06, "loss": 0.8051, "num_input_tokens_seen": 556512, "step": 960 }, { "epoch": 0.14372952040512363, "grad_norm": 46.13371658325195, "learning_rate": 7.17902889484659e-06, "loss": 0.9369, "num_input_tokens_seen": 559520, "step": 965 }, { "epoch": 0.1444742329460828, "grad_norm": 17.084928512573242, "learning_rate": 7.216264521894549e-06, "loss": 0.8294, "num_input_tokens_seen": 562464, "step": 970 }, { "epoch": 0.145218945487042, "grad_norm": 15.833344459533691, "learning_rate": 7.253500148942509e-06, "loss": 0.8835, "num_input_tokens_seen": 564960, "step": 975 }, { "epoch": 0.14596365802800118, "grad_norm": 19.60695457458496, "learning_rate": 7.290735775990468e-06, "loss": 0.8255, "num_input_tokens_seen": 568032, "step": 980 }, { "epoch": 0.14670837056896038, "grad_norm": 13.242178916931152, "learning_rate": 7.327971403038428e-06, "loss": 0.8602, "num_input_tokens_seen": 570912, "step": 985 }, { "epoch": 0.14745308310991956, "grad_norm": 11.453554153442383, "learning_rate": 7.365207030086387e-06, "loss": 0.8323, "num_input_tokens_seen": 573696, "step": 990 }, { "epoch": 0.14819779565087876, "grad_norm": 9.236621856689453, "learning_rate": 7.402442657134347e-06, "loss": 0.8355, "num_input_tokens_seen": 576480, "step": 995 }, { "epoch": 0.14894250819183796, "grad_norm": 12.985387802124023, "learning_rate": 7.439678284182306e-06, "loss": 0.8182, "num_input_tokens_seen": 579424, "step": 1000 }, { "epoch": 0.14968722073279714, "grad_norm": 7.120563507080078, "learning_rate": 7.476913911230266e-06, "loss": 0.7963, "num_input_tokens_seen": 582368, "step": 1005 }, { "epoch": 0.15043193327375634, "grad_norm": 6.242813587188721, "learning_rate": 7.5141495382782254e-06, "loss": 0.8412, "num_input_tokens_seen": 585408, "step": 1010 }, { "epoch": 0.15117664581471552, "grad_norm": 6.287008762359619, "learning_rate": 7.5513851653261844e-06, "loss": 0.8297, "num_input_tokens_seen": 588288, "step": 1015 }, { "epoch": 0.15192135835567472, "grad_norm": 23.63795280456543, "learning_rate": 7.588620792374144e-06, "loss": 0.8484, "num_input_tokens_seen": 591200, "step": 1020 }, { "epoch": 0.1526660708966339, "grad_norm": 8.347622871398926, "learning_rate": 7.625856419422103e-06, "loss": 0.8566, "num_input_tokens_seen": 594048, "step": 1025 }, { "epoch": 0.1534107834375931, "grad_norm": 9.375354766845703, "learning_rate": 7.663092046470063e-06, "loss": 0.8359, "num_input_tokens_seen": 597024, "step": 1030 }, { "epoch": 0.15415549597855227, "grad_norm": 10.211373329162598, "learning_rate": 7.700327673518021e-06, "loss": 0.8339, "num_input_tokens_seen": 599840, "step": 1035 }, { "epoch": 0.15490020851951147, "grad_norm": 8.95789623260498, "learning_rate": 7.737563300565983e-06, "loss": 0.8462, "num_input_tokens_seen": 602656, "step": 1040 }, { "epoch": 0.15564492106047065, "grad_norm": 13.476353645324707, "learning_rate": 7.774798927613941e-06, "loss": 0.8547, "num_input_tokens_seen": 605760, "step": 1045 }, { "epoch": 0.15638963360142985, "grad_norm": 5.785837650299072, "learning_rate": 7.812034554661901e-06, "loss": 0.7948, "num_input_tokens_seen": 608448, "step": 1050 }, { "epoch": 0.15713434614238903, "grad_norm": 5.210088729858398, "learning_rate": 7.84927018170986e-06, "loss": 0.8331, "num_input_tokens_seen": 611296, "step": 1055 }, { "epoch": 0.15787905868334823, "grad_norm": 7.350139617919922, "learning_rate": 7.88650580875782e-06, "loss": 0.8671, "num_input_tokens_seen": 614272, "step": 1060 }, { "epoch": 0.15862377122430743, "grad_norm": 10.293543815612793, "learning_rate": 7.923741435805779e-06, "loss": 0.8171, "num_input_tokens_seen": 617504, "step": 1065 }, { "epoch": 0.1593684837652666, "grad_norm": 13.99299430847168, "learning_rate": 7.960977062853739e-06, "loss": 0.8654, "num_input_tokens_seen": 620384, "step": 1070 }, { "epoch": 0.1601131963062258, "grad_norm": 15.094467163085938, "learning_rate": 7.998212689901698e-06, "loss": 0.8811, "num_input_tokens_seen": 623360, "step": 1075 }, { "epoch": 0.16085790884718498, "grad_norm": 10.523377418518066, "learning_rate": 8.035448316949658e-06, "loss": 0.7593, "num_input_tokens_seen": 626336, "step": 1080 }, { "epoch": 0.16160262138814419, "grad_norm": 13.940240859985352, "learning_rate": 8.072683943997618e-06, "loss": 0.9517, "num_input_tokens_seen": 629472, "step": 1085 }, { "epoch": 0.16234733392910336, "grad_norm": 20.761747360229492, "learning_rate": 8.109919571045576e-06, "loss": 0.8841, "num_input_tokens_seen": 632608, "step": 1090 }, { "epoch": 0.16309204647006256, "grad_norm": 9.639875411987305, "learning_rate": 8.147155198093536e-06, "loss": 0.8225, "num_input_tokens_seen": 635680, "step": 1095 }, { "epoch": 0.16383675901102174, "grad_norm": 6.969905853271484, "learning_rate": 8.184390825141496e-06, "loss": 0.8263, "num_input_tokens_seen": 638304, "step": 1100 }, { "epoch": 0.16458147155198094, "grad_norm": 6.77081298828125, "learning_rate": 8.221626452189456e-06, "loss": 0.809, "num_input_tokens_seen": 641312, "step": 1105 }, { "epoch": 0.16532618409294011, "grad_norm": 7.48370885848999, "learning_rate": 8.258862079237414e-06, "loss": 0.8291, "num_input_tokens_seen": 644160, "step": 1110 }, { "epoch": 0.16607089663389932, "grad_norm": 15.969592094421387, "learning_rate": 8.296097706285374e-06, "loss": 0.7995, "num_input_tokens_seen": 646912, "step": 1115 }, { "epoch": 0.1668156091748585, "grad_norm": 10.414977073669434, "learning_rate": 8.333333333333334e-06, "loss": 0.8182, "num_input_tokens_seen": 649888, "step": 1120 }, { "epoch": 0.1675603217158177, "grad_norm": 10.143717765808105, "learning_rate": 8.370568960381294e-06, "loss": 0.8387, "num_input_tokens_seen": 652672, "step": 1125 }, { "epoch": 0.1683050342567769, "grad_norm": 12.673398971557617, "learning_rate": 8.407804587429252e-06, "loss": 0.8526, "num_input_tokens_seen": 655072, "step": 1130 }, { "epoch": 0.16904974679773607, "grad_norm": 15.56026840209961, "learning_rate": 8.445040214477212e-06, "loss": 0.8242, "num_input_tokens_seen": 658112, "step": 1135 }, { "epoch": 0.16979445933869527, "grad_norm": 17.517906188964844, "learning_rate": 8.482275841525171e-06, "loss": 0.8324, "num_input_tokens_seen": 661024, "step": 1140 }, { "epoch": 0.17053917187965445, "grad_norm": 11.26296615600586, "learning_rate": 8.519511468573131e-06, "loss": 0.8499, "num_input_tokens_seen": 663936, "step": 1145 }, { "epoch": 0.17128388442061365, "grad_norm": 13.390634536743164, "learning_rate": 8.556747095621091e-06, "loss": 0.7932, "num_input_tokens_seen": 666784, "step": 1150 }, { "epoch": 0.17202859696157283, "grad_norm": 6.791441440582275, "learning_rate": 8.59398272266905e-06, "loss": 0.8933, "num_input_tokens_seen": 669984, "step": 1155 }, { "epoch": 0.17277330950253203, "grad_norm": 11.929642677307129, "learning_rate": 8.631218349717011e-06, "loss": 0.8481, "num_input_tokens_seen": 672672, "step": 1160 }, { "epoch": 0.1735180220434912, "grad_norm": 7.827012062072754, "learning_rate": 8.668453976764969e-06, "loss": 0.8239, "num_input_tokens_seen": 675584, "step": 1165 }, { "epoch": 0.1742627345844504, "grad_norm": 39.06742858886719, "learning_rate": 8.705689603812929e-06, "loss": 0.8391, "num_input_tokens_seen": 678336, "step": 1170 }, { "epoch": 0.17500744712540958, "grad_norm": 4.7362189292907715, "learning_rate": 8.742925230860887e-06, "loss": 0.8445, "num_input_tokens_seen": 681120, "step": 1175 }, { "epoch": 0.17575215966636878, "grad_norm": 3.5900444984436035, "learning_rate": 8.780160857908849e-06, "loss": 0.8338, "num_input_tokens_seen": 683936, "step": 1180 }, { "epoch": 0.17649687220732796, "grad_norm": 8.818977355957031, "learning_rate": 8.817396484956807e-06, "loss": 0.8039, "num_input_tokens_seen": 686816, "step": 1185 }, { "epoch": 0.17724158474828716, "grad_norm": 31.45404052734375, "learning_rate": 8.854632112004767e-06, "loss": 0.8206, "num_input_tokens_seen": 689664, "step": 1190 }, { "epoch": 0.17798629728924636, "grad_norm": 10.940343856811523, "learning_rate": 8.891867739052725e-06, "loss": 0.8445, "num_input_tokens_seen": 692320, "step": 1195 }, { "epoch": 0.17873100983020554, "grad_norm": 7.255443096160889, "learning_rate": 8.929103366100686e-06, "loss": 0.7734, "num_input_tokens_seen": 695136, "step": 1200 }, { "epoch": 0.17947572237116474, "grad_norm": 10.905896186828613, "learning_rate": 8.966338993148644e-06, "loss": 0.8254, "num_input_tokens_seen": 698048, "step": 1205 }, { "epoch": 0.18022043491212392, "grad_norm": 5.4978532791137695, "learning_rate": 9.003574620196604e-06, "loss": 0.7267, "num_input_tokens_seen": 700864, "step": 1210 }, { "epoch": 0.18096514745308312, "grad_norm": 15.597390174865723, "learning_rate": 9.040810247244564e-06, "loss": 0.8714, "num_input_tokens_seen": 703872, "step": 1215 }, { "epoch": 0.1817098599940423, "grad_norm": 8.866775512695312, "learning_rate": 9.078045874292524e-06, "loss": 0.8595, "num_input_tokens_seen": 706496, "step": 1220 }, { "epoch": 0.1824545725350015, "grad_norm": 14.571745872497559, "learning_rate": 9.115281501340484e-06, "loss": 0.8288, "num_input_tokens_seen": 709440, "step": 1225 }, { "epoch": 0.18319928507596067, "grad_norm": 4.615654945373535, "learning_rate": 9.152517128388442e-06, "loss": 0.7957, "num_input_tokens_seen": 712128, "step": 1230 }, { "epoch": 0.18394399761691987, "grad_norm": 6.769476890563965, "learning_rate": 9.189752755436402e-06, "loss": 0.8156, "num_input_tokens_seen": 714912, "step": 1235 }, { "epoch": 0.18468871015787905, "grad_norm": 14.210611343383789, "learning_rate": 9.226988382484362e-06, "loss": 0.8507, "num_input_tokens_seen": 717568, "step": 1240 }, { "epoch": 0.18543342269883825, "grad_norm": 13.890815734863281, "learning_rate": 9.264224009532322e-06, "loss": 0.8844, "num_input_tokens_seen": 720704, "step": 1245 }, { "epoch": 0.18617813523979743, "grad_norm": 2.9272093772888184, "learning_rate": 9.30145963658028e-06, "loss": 0.837, "num_input_tokens_seen": 723680, "step": 1250 }, { "epoch": 0.18692284778075663, "grad_norm": 15.769379615783691, "learning_rate": 9.33869526362824e-06, "loss": 0.8201, "num_input_tokens_seen": 726592, "step": 1255 }, { "epoch": 0.1876675603217158, "grad_norm": 11.882561683654785, "learning_rate": 9.3759308906762e-06, "loss": 0.8252, "num_input_tokens_seen": 729536, "step": 1260 }, { "epoch": 0.188412272862675, "grad_norm": 28.711750030517578, "learning_rate": 9.41316651772416e-06, "loss": 0.8171, "num_input_tokens_seen": 732544, "step": 1265 }, { "epoch": 0.1891569854036342, "grad_norm": 9.784748077392578, "learning_rate": 9.450402144772117e-06, "loss": 0.7818, "num_input_tokens_seen": 735712, "step": 1270 }, { "epoch": 0.18990169794459338, "grad_norm": 11.379605293273926, "learning_rate": 9.487637771820077e-06, "loss": 0.8475, "num_input_tokens_seen": 738304, "step": 1275 }, { "epoch": 0.1906464104855526, "grad_norm": 5.3094563484191895, "learning_rate": 9.524873398868039e-06, "loss": 0.8415, "num_input_tokens_seen": 741248, "step": 1280 }, { "epoch": 0.19139112302651176, "grad_norm": 6.325932025909424, "learning_rate": 9.562109025915997e-06, "loss": 0.847, "num_input_tokens_seen": 744192, "step": 1285 }, { "epoch": 0.19213583556747096, "grad_norm": 3.6792712211608887, "learning_rate": 9.599344652963957e-06, "loss": 0.804, "num_input_tokens_seen": 747104, "step": 1290 }, { "epoch": 0.19288054810843014, "grad_norm": 7.36221981048584, "learning_rate": 9.636580280011915e-06, "loss": 0.819, "num_input_tokens_seen": 750048, "step": 1295 }, { "epoch": 0.19362526064938934, "grad_norm": 5.857320785522461, "learning_rate": 9.673815907059877e-06, "loss": 0.7809, "num_input_tokens_seen": 752992, "step": 1300 }, { "epoch": 0.19436997319034852, "grad_norm": 12.34255313873291, "learning_rate": 9.711051534107835e-06, "loss": 0.7859, "num_input_tokens_seen": 755904, "step": 1305 }, { "epoch": 0.19511468573130772, "grad_norm": 7.136833190917969, "learning_rate": 9.748287161155795e-06, "loss": 0.8247, "num_input_tokens_seen": 758752, "step": 1310 }, { "epoch": 0.1958593982722669, "grad_norm": 16.518665313720703, "learning_rate": 9.785522788203753e-06, "loss": 0.8407, "num_input_tokens_seen": 761344, "step": 1315 }, { "epoch": 0.1966041108132261, "grad_norm": 4.0886406898498535, "learning_rate": 9.822758415251714e-06, "loss": 0.8161, "num_input_tokens_seen": 764160, "step": 1320 }, { "epoch": 0.19734882335418527, "grad_norm": 10.631810188293457, "learning_rate": 9.859994042299672e-06, "loss": 0.7901, "num_input_tokens_seen": 767104, "step": 1325 }, { "epoch": 0.19809353589514447, "grad_norm": 8.168771743774414, "learning_rate": 9.897229669347632e-06, "loss": 0.8739, "num_input_tokens_seen": 769824, "step": 1330 }, { "epoch": 0.19883824843610368, "grad_norm": 4.287918567657471, "learning_rate": 9.93446529639559e-06, "loss": 0.8477, "num_input_tokens_seen": 772544, "step": 1335 }, { "epoch": 0.19958296097706285, "grad_norm": 10.740772247314453, "learning_rate": 9.971700923443552e-06, "loss": 0.8241, "num_input_tokens_seen": 775360, "step": 1340 }, { "epoch": 0.20032767351802205, "grad_norm": 13.882862091064453, "learning_rate": 1.0008936550491512e-05, "loss": 0.8642, "num_input_tokens_seen": 778144, "step": 1345 }, { "epoch": 0.20107238605898123, "grad_norm": 3.8296985626220703, "learning_rate": 1.004617217753947e-05, "loss": 0.8101, "num_input_tokens_seen": 780864, "step": 1350 }, { "epoch": 0.20181709859994043, "grad_norm": 6.8708600997924805, "learning_rate": 1.008340780458743e-05, "loss": 0.7915, "num_input_tokens_seen": 783776, "step": 1355 }, { "epoch": 0.2025618111408996, "grad_norm": 10.100756645202637, "learning_rate": 1.012064343163539e-05, "loss": 0.8395, "num_input_tokens_seen": 786752, "step": 1360 }, { "epoch": 0.2033065236818588, "grad_norm": 8.268052101135254, "learning_rate": 1.015787905868335e-05, "loss": 0.7894, "num_input_tokens_seen": 789536, "step": 1365 }, { "epoch": 0.20405123622281798, "grad_norm": 48.284358978271484, "learning_rate": 1.0195114685731308e-05, "loss": 0.9203, "num_input_tokens_seen": 792160, "step": 1370 }, { "epoch": 0.20479594876377719, "grad_norm": 10.02619457244873, "learning_rate": 1.0232350312779268e-05, "loss": 0.8656, "num_input_tokens_seen": 794912, "step": 1375 }, { "epoch": 0.20554066130473636, "grad_norm": 14.421324729919434, "learning_rate": 1.0269585939827227e-05, "loss": 0.8095, "num_input_tokens_seen": 797824, "step": 1380 }, { "epoch": 0.20628537384569556, "grad_norm": 6.978302001953125, "learning_rate": 1.0306821566875187e-05, "loss": 0.8024, "num_input_tokens_seen": 801184, "step": 1385 }, { "epoch": 0.20703008638665474, "grad_norm": 8.05732250213623, "learning_rate": 1.0344057193923145e-05, "loss": 0.8168, "num_input_tokens_seen": 804032, "step": 1390 }, { "epoch": 0.20777479892761394, "grad_norm": 28.84726905822754, "learning_rate": 1.0381292820971105e-05, "loss": 0.8979, "num_input_tokens_seen": 806912, "step": 1395 }, { "epoch": 0.20851951146857314, "grad_norm": 4.786845684051514, "learning_rate": 1.0418528448019065e-05, "loss": 0.9699, "num_input_tokens_seen": 809472, "step": 1400 }, { "epoch": 0.20926422400953232, "grad_norm": 3.1929619312286377, "learning_rate": 1.0455764075067025e-05, "loss": 0.7895, "num_input_tokens_seen": 812064, "step": 1405 }, { "epoch": 0.21000893655049152, "grad_norm": 4.967606067657471, "learning_rate": 1.0492999702114985e-05, "loss": 0.7895, "num_input_tokens_seen": 815232, "step": 1410 }, { "epoch": 0.2107536490914507, "grad_norm": 3.6984550952911377, "learning_rate": 1.0530235329162943e-05, "loss": 0.7866, "num_input_tokens_seen": 818080, "step": 1415 }, { "epoch": 0.2114983616324099, "grad_norm": 4.338478088378906, "learning_rate": 1.0567470956210903e-05, "loss": 0.8694, "num_input_tokens_seen": 820832, "step": 1420 }, { "epoch": 0.21224307417336907, "grad_norm": 6.013980388641357, "learning_rate": 1.0604706583258863e-05, "loss": 0.7811, "num_input_tokens_seen": 823584, "step": 1425 }, { "epoch": 0.21298778671432828, "grad_norm": 4.338184356689453, "learning_rate": 1.0641942210306823e-05, "loss": 0.828, "num_input_tokens_seen": 826240, "step": 1430 }, { "epoch": 0.21373249925528745, "grad_norm": 1.9172528982162476, "learning_rate": 1.067917783735478e-05, "loss": 0.7984, "num_input_tokens_seen": 828928, "step": 1435 }, { "epoch": 0.21447721179624665, "grad_norm": 3.50146222114563, "learning_rate": 1.071641346440274e-05, "loss": 0.8251, "num_input_tokens_seen": 832128, "step": 1440 }, { "epoch": 0.21522192433720583, "grad_norm": 3.6127490997314453, "learning_rate": 1.07536490914507e-05, "loss": 0.8323, "num_input_tokens_seen": 835616, "step": 1445 }, { "epoch": 0.21596663687816503, "grad_norm": 8.852832794189453, "learning_rate": 1.079088471849866e-05, "loss": 0.8334, "num_input_tokens_seen": 838528, "step": 1450 }, { "epoch": 0.2167113494191242, "grad_norm": 6.142236709594727, "learning_rate": 1.0828120345546618e-05, "loss": 0.7766, "num_input_tokens_seen": 841440, "step": 1455 }, { "epoch": 0.2174560619600834, "grad_norm": 7.936775207519531, "learning_rate": 1.086535597259458e-05, "loss": 0.9724, "num_input_tokens_seen": 844416, "step": 1460 }, { "epoch": 0.2182007745010426, "grad_norm": 6.109663009643555, "learning_rate": 1.0902591599642538e-05, "loss": 0.8614, "num_input_tokens_seen": 847168, "step": 1465 }, { "epoch": 0.21894548704200179, "grad_norm": 5.165510177612305, "learning_rate": 1.0939827226690498e-05, "loss": 0.8406, "num_input_tokens_seen": 850176, "step": 1470 }, { "epoch": 0.219690199582961, "grad_norm": 6.2149128913879395, "learning_rate": 1.0977062853738458e-05, "loss": 0.8328, "num_input_tokens_seen": 852992, "step": 1475 }, { "epoch": 0.22043491212392016, "grad_norm": 6.082942962646484, "learning_rate": 1.1014298480786418e-05, "loss": 0.8023, "num_input_tokens_seen": 855968, "step": 1480 }, { "epoch": 0.22117962466487937, "grad_norm": 13.632050514221191, "learning_rate": 1.1051534107834378e-05, "loss": 1.3279, "num_input_tokens_seen": 859136, "step": 1485 }, { "epoch": 0.22192433720583854, "grad_norm": 7.279483795166016, "learning_rate": 1.1088769734882336e-05, "loss": 0.8657, "num_input_tokens_seen": 862144, "step": 1490 }, { "epoch": 0.22266904974679774, "grad_norm": 8.087252616882324, "learning_rate": 1.1126005361930296e-05, "loss": 0.8138, "num_input_tokens_seen": 865440, "step": 1495 }, { "epoch": 0.22341376228775692, "grad_norm": 16.233604431152344, "learning_rate": 1.1163240988978255e-05, "loss": 0.8344, "num_input_tokens_seen": 868640, "step": 1500 }, { "epoch": 0.22415847482871612, "grad_norm": 2.3225831985473633, "learning_rate": 1.1200476616026215e-05, "loss": 0.8112, "num_input_tokens_seen": 871520, "step": 1505 }, { "epoch": 0.2249031873696753, "grad_norm": 7.501922607421875, "learning_rate": 1.1237712243074173e-05, "loss": 0.8025, "num_input_tokens_seen": 874400, "step": 1510 }, { "epoch": 0.2256478999106345, "grad_norm": 12.876870155334473, "learning_rate": 1.1274947870122133e-05, "loss": 0.8041, "num_input_tokens_seen": 877600, "step": 1515 }, { "epoch": 0.22639261245159367, "grad_norm": 16.847900390625, "learning_rate": 1.1312183497170093e-05, "loss": 0.8462, "num_input_tokens_seen": 880544, "step": 1520 }, { "epoch": 0.22713732499255287, "grad_norm": 47.519371032714844, "learning_rate": 1.1349419124218053e-05, "loss": 0.8201, "num_input_tokens_seen": 883488, "step": 1525 }, { "epoch": 0.22788203753351208, "grad_norm": 4.410821437835693, "learning_rate": 1.1386654751266011e-05, "loss": 0.8389, "num_input_tokens_seen": 886208, "step": 1530 }, { "epoch": 0.22862675007447125, "grad_norm": 16.140535354614258, "learning_rate": 1.1423890378313971e-05, "loss": 0.9066, "num_input_tokens_seen": 889088, "step": 1535 }, { "epoch": 0.22937146261543045, "grad_norm": 7.377968788146973, "learning_rate": 1.1461126005361931e-05, "loss": 0.7988, "num_input_tokens_seen": 891776, "step": 1540 }, { "epoch": 0.23011617515638963, "grad_norm": 7.944906234741211, "learning_rate": 1.149836163240989e-05, "loss": 0.8126, "num_input_tokens_seen": 894944, "step": 1545 }, { "epoch": 0.23086088769734883, "grad_norm": 43.40406799316406, "learning_rate": 1.153559725945785e-05, "loss": 0.8083, "num_input_tokens_seen": 897824, "step": 1550 }, { "epoch": 0.231605600238308, "grad_norm": 7.562222480773926, "learning_rate": 1.1572832886505809e-05, "loss": 0.8044, "num_input_tokens_seen": 900640, "step": 1555 }, { "epoch": 0.2323503127792672, "grad_norm": 16.88384437561035, "learning_rate": 1.1610068513553769e-05, "loss": 0.8402, "num_input_tokens_seen": 903424, "step": 1560 }, { "epoch": 0.23309502532022638, "grad_norm": 4.79119873046875, "learning_rate": 1.1647304140601728e-05, "loss": 0.8316, "num_input_tokens_seen": 906048, "step": 1565 }, { "epoch": 0.2338397378611856, "grad_norm": 14.813480377197266, "learning_rate": 1.1684539767649688e-05, "loss": 0.8511, "num_input_tokens_seen": 908896, "step": 1570 }, { "epoch": 0.23458445040214476, "grad_norm": 4.361270427703857, "learning_rate": 1.1721775394697646e-05, "loss": 0.7754, "num_input_tokens_seen": 911776, "step": 1575 }, { "epoch": 0.23532916294310396, "grad_norm": 5.822728633880615, "learning_rate": 1.1759011021745606e-05, "loss": 0.8293, "num_input_tokens_seen": 915040, "step": 1580 }, { "epoch": 0.23607387548406314, "grad_norm": 7.399103164672852, "learning_rate": 1.1796246648793566e-05, "loss": 0.8851, "num_input_tokens_seen": 917952, "step": 1585 }, { "epoch": 0.23681858802502234, "grad_norm": 11.657243728637695, "learning_rate": 1.1833482275841526e-05, "loss": 0.8785, "num_input_tokens_seen": 920640, "step": 1590 }, { "epoch": 0.23756330056598154, "grad_norm": 9.193657875061035, "learning_rate": 1.1870717902889484e-05, "loss": 0.7781, "num_input_tokens_seen": 923680, "step": 1595 }, { "epoch": 0.23830801310694072, "grad_norm": 103.34953308105469, "learning_rate": 1.1907953529937444e-05, "loss": 0.8563, "num_input_tokens_seen": 926560, "step": 1600 }, { "epoch": 0.23905272564789992, "grad_norm": 4.873467445373535, "learning_rate": 1.1945189156985404e-05, "loss": 0.8235, "num_input_tokens_seen": 929376, "step": 1605 }, { "epoch": 0.2397974381888591, "grad_norm": 2.6564605236053467, "learning_rate": 1.1982424784033364e-05, "loss": 0.8091, "num_input_tokens_seen": 932160, "step": 1610 }, { "epoch": 0.2405421507298183, "grad_norm": 2.0706937313079834, "learning_rate": 1.2019660411081324e-05, "loss": 0.774, "num_input_tokens_seen": 935040, "step": 1615 }, { "epoch": 0.24128686327077747, "grad_norm": 5.6461639404296875, "learning_rate": 1.2056896038129282e-05, "loss": 0.7916, "num_input_tokens_seen": 937824, "step": 1620 }, { "epoch": 0.24203157581173668, "grad_norm": 7.123254299163818, "learning_rate": 1.2094131665177243e-05, "loss": 0.8146, "num_input_tokens_seen": 940832, "step": 1625 }, { "epoch": 0.24277628835269585, "grad_norm": 3.9927189350128174, "learning_rate": 1.2131367292225201e-05, "loss": 0.8462, "num_input_tokens_seen": 943520, "step": 1630 }, { "epoch": 0.24352100089365505, "grad_norm": 14.635393142700195, "learning_rate": 1.2168602919273161e-05, "loss": 0.8411, "num_input_tokens_seen": 946400, "step": 1635 }, { "epoch": 0.24426571343461423, "grad_norm": 4.681243896484375, "learning_rate": 1.220583854632112e-05, "loss": 0.8387, "num_input_tokens_seen": 949504, "step": 1640 }, { "epoch": 0.24501042597557343, "grad_norm": 8.084169387817383, "learning_rate": 1.2243074173369081e-05, "loss": 0.8387, "num_input_tokens_seen": 953184, "step": 1645 }, { "epoch": 0.2457551385165326, "grad_norm": 2.903254747390747, "learning_rate": 1.228030980041704e-05, "loss": 0.8252, "num_input_tokens_seen": 956256, "step": 1650 }, { "epoch": 0.2464998510574918, "grad_norm": 5.311208248138428, "learning_rate": 1.2317545427464999e-05, "loss": 0.8003, "num_input_tokens_seen": 959168, "step": 1655 }, { "epoch": 0.247244563598451, "grad_norm": 10.042405128479004, "learning_rate": 1.2354781054512959e-05, "loss": 0.8891, "num_input_tokens_seen": 962016, "step": 1660 }, { "epoch": 0.2479892761394102, "grad_norm": 7.725742340087891, "learning_rate": 1.2392016681560919e-05, "loss": 0.873, "num_input_tokens_seen": 964896, "step": 1665 }, { "epoch": 0.2487339886803694, "grad_norm": 3.2951366901397705, "learning_rate": 1.2429252308608877e-05, "loss": 0.8549, "num_input_tokens_seen": 967712, "step": 1670 }, { "epoch": 0.24947870122132856, "grad_norm": 5.6588287353515625, "learning_rate": 1.2466487935656837e-05, "loss": 0.8274, "num_input_tokens_seen": 970752, "step": 1675 }, { "epoch": 0.25022341376228774, "grad_norm": 4.682791709899902, "learning_rate": 1.2503723562704797e-05, "loss": 0.8368, "num_input_tokens_seen": 974080, "step": 1680 }, { "epoch": 0.25096812630324694, "grad_norm": 1.4754153490066528, "learning_rate": 1.2540959189752758e-05, "loss": 0.832, "num_input_tokens_seen": 976960, "step": 1685 }, { "epoch": 0.25171283884420614, "grad_norm": 5.157984733581543, "learning_rate": 1.2578194816800715e-05, "loss": 0.8012, "num_input_tokens_seen": 980192, "step": 1690 }, { "epoch": 0.25245755138516535, "grad_norm": 2.7619616985321045, "learning_rate": 1.2615430443848674e-05, "loss": 0.7924, "num_input_tokens_seen": 982944, "step": 1695 }, { "epoch": 0.2532022639261245, "grad_norm": 13.691899299621582, "learning_rate": 1.2652666070896634e-05, "loss": 0.8289, "num_input_tokens_seen": 985696, "step": 1700 }, { "epoch": 0.2539469764670837, "grad_norm": 4.276371002197266, "learning_rate": 1.2689901697944596e-05, "loss": 0.8432, "num_input_tokens_seen": 988480, "step": 1705 }, { "epoch": 0.2546916890080429, "grad_norm": 3.161595582962036, "learning_rate": 1.2727137324992552e-05, "loss": 0.8255, "num_input_tokens_seen": 991392, "step": 1710 }, { "epoch": 0.2554364015490021, "grad_norm": 2.269805431365967, "learning_rate": 1.2764372952040512e-05, "loss": 0.8194, "num_input_tokens_seen": 994048, "step": 1715 }, { "epoch": 0.25618111408996125, "grad_norm": 2.1867432594299316, "learning_rate": 1.2801608579088472e-05, "loss": 0.7976, "num_input_tokens_seen": 996576, "step": 1720 }, { "epoch": 0.25692582663092045, "grad_norm": 1.293324589729309, "learning_rate": 1.2838844206136434e-05, "loss": 0.8129, "num_input_tokens_seen": 999616, "step": 1725 }, { "epoch": 0.25767053917187965, "grad_norm": 4.680129051208496, "learning_rate": 1.287607983318439e-05, "loss": 0.7758, "num_input_tokens_seen": 1002720, "step": 1730 }, { "epoch": 0.25841525171283886, "grad_norm": 8.91256332397461, "learning_rate": 1.291331546023235e-05, "loss": 0.8752, "num_input_tokens_seen": 1005440, "step": 1735 }, { "epoch": 0.25915996425379806, "grad_norm": 2.908906936645508, "learning_rate": 1.295055108728031e-05, "loss": 0.7863, "num_input_tokens_seen": 1007936, "step": 1740 }, { "epoch": 0.2599046767947572, "grad_norm": 4.324284076690674, "learning_rate": 1.2987786714328271e-05, "loss": 0.8869, "num_input_tokens_seen": 1010848, "step": 1745 }, { "epoch": 0.2606493893357164, "grad_norm": 9.431934356689453, "learning_rate": 1.3025022341376231e-05, "loss": 0.8411, "num_input_tokens_seen": 1013664, "step": 1750 }, { "epoch": 0.2613941018766756, "grad_norm": 2.029202699661255, "learning_rate": 1.3062257968424188e-05, "loss": 0.8157, "num_input_tokens_seen": 1016800, "step": 1755 }, { "epoch": 0.2621388144176348, "grad_norm": 251.45843505859375, "learning_rate": 1.3099493595472147e-05, "loss": 0.8327, "num_input_tokens_seen": 1019648, "step": 1760 }, { "epoch": 0.26288352695859396, "grad_norm": 3.9137954711914062, "learning_rate": 1.3136729222520109e-05, "loss": 0.804, "num_input_tokens_seen": 1022432, "step": 1765 }, { "epoch": 0.26362823949955316, "grad_norm": 6.214075088500977, "learning_rate": 1.3173964849568069e-05, "loss": 0.8143, "num_input_tokens_seen": 1025472, "step": 1770 }, { "epoch": 0.26437295204051237, "grad_norm": 3.1182167530059814, "learning_rate": 1.3211200476616025e-05, "loss": 0.8252, "num_input_tokens_seen": 1028288, "step": 1775 }, { "epoch": 0.26511766458147157, "grad_norm": 2.7606699466705322, "learning_rate": 1.3248436103663985e-05, "loss": 0.8058, "num_input_tokens_seen": 1031040, "step": 1780 }, { "epoch": 0.2658623771224307, "grad_norm": 1.9846419095993042, "learning_rate": 1.3285671730711947e-05, "loss": 0.852, "num_input_tokens_seen": 1033792, "step": 1785 }, { "epoch": 0.2666070896633899, "grad_norm": 1.7261768579483032, "learning_rate": 1.3322907357759907e-05, "loss": 0.8148, "num_input_tokens_seen": 1036448, "step": 1790 }, { "epoch": 0.2673518022043491, "grad_norm": 3.676835775375366, "learning_rate": 1.3360142984807863e-05, "loss": 0.7996, "num_input_tokens_seen": 1039168, "step": 1795 }, { "epoch": 0.2680965147453083, "grad_norm": 2.639930248260498, "learning_rate": 1.3397378611855823e-05, "loss": 0.7911, "num_input_tokens_seen": 1041952, "step": 1800 }, { "epoch": 0.2688412272862675, "grad_norm": 9.912698745727539, "learning_rate": 1.3434614238903784e-05, "loss": 0.8098, "num_input_tokens_seen": 1044608, "step": 1805 }, { "epoch": 0.2695859398272267, "grad_norm": 6.992570877075195, "learning_rate": 1.3471849865951744e-05, "loss": 0.9276, "num_input_tokens_seen": 1047552, "step": 1810 }, { "epoch": 0.2703306523681859, "grad_norm": 6.5341925621032715, "learning_rate": 1.3509085492999704e-05, "loss": 0.8009, "num_input_tokens_seen": 1051200, "step": 1815 }, { "epoch": 0.2710753649091451, "grad_norm": 8.470333099365234, "learning_rate": 1.354632112004766e-05, "loss": 0.8269, "num_input_tokens_seen": 1053888, "step": 1820 }, { "epoch": 0.2718200774501043, "grad_norm": 36.03535461425781, "learning_rate": 1.3583556747095622e-05, "loss": 0.7616, "num_input_tokens_seen": 1056544, "step": 1825 }, { "epoch": 0.2725647899910634, "grad_norm": 9.479708671569824, "learning_rate": 1.3620792374143582e-05, "loss": 0.9282, "num_input_tokens_seen": 1059296, "step": 1830 }, { "epoch": 0.27330950253202263, "grad_norm": 6.1661529541015625, "learning_rate": 1.3658028001191542e-05, "loss": 0.8151, "num_input_tokens_seen": 1062176, "step": 1835 }, { "epoch": 0.27405421507298183, "grad_norm": 6.226592540740967, "learning_rate": 1.3695263628239498e-05, "loss": 0.8991, "num_input_tokens_seen": 1064992, "step": 1840 }, { "epoch": 0.27479892761394104, "grad_norm": 22.00455093383789, "learning_rate": 1.373249925528746e-05, "loss": 0.895, "num_input_tokens_seen": 1067872, "step": 1845 }, { "epoch": 0.2755436401549002, "grad_norm": 3.9863662719726562, "learning_rate": 1.376973488233542e-05, "loss": 0.8558, "num_input_tokens_seen": 1070816, "step": 1850 }, { "epoch": 0.2762883526958594, "grad_norm": 2.6860718727111816, "learning_rate": 1.380697050938338e-05, "loss": 0.8051, "num_input_tokens_seen": 1073600, "step": 1855 }, { "epoch": 0.2770330652368186, "grad_norm": 16.902891159057617, "learning_rate": 1.3844206136431338e-05, "loss": 0.8314, "num_input_tokens_seen": 1076512, "step": 1860 }, { "epoch": 0.2777777777777778, "grad_norm": 6.198364734649658, "learning_rate": 1.3881441763479298e-05, "loss": 0.8524, "num_input_tokens_seen": 1079264, "step": 1865 }, { "epoch": 0.278522490318737, "grad_norm": 5.072604179382324, "learning_rate": 1.3918677390527257e-05, "loss": 0.9392, "num_input_tokens_seen": 1082144, "step": 1870 }, { "epoch": 0.27926720285969614, "grad_norm": 2.11053204536438, "learning_rate": 1.3955913017575217e-05, "loss": 0.8168, "num_input_tokens_seen": 1084960, "step": 1875 }, { "epoch": 0.28001191540065534, "grad_norm": 2.696829080581665, "learning_rate": 1.3993148644623177e-05, "loss": 0.8084, "num_input_tokens_seen": 1087936, "step": 1880 }, { "epoch": 0.28075662794161454, "grad_norm": 34.85164260864258, "learning_rate": 1.4030384271671135e-05, "loss": 0.8305, "num_input_tokens_seen": 1091008, "step": 1885 }, { "epoch": 0.28150134048257375, "grad_norm": 7.774751663208008, "learning_rate": 1.4067619898719095e-05, "loss": 0.9858, "num_input_tokens_seen": 1093664, "step": 1890 }, { "epoch": 0.2822460530235329, "grad_norm": 2.242542266845703, "learning_rate": 1.4104855525767055e-05, "loss": 0.8249, "num_input_tokens_seen": 1096640, "step": 1895 }, { "epoch": 0.2829907655644921, "grad_norm": 6.058631420135498, "learning_rate": 1.4142091152815015e-05, "loss": 0.754, "num_input_tokens_seen": 1099200, "step": 1900 }, { "epoch": 0.2837354781054513, "grad_norm": 11.762587547302246, "learning_rate": 1.4179326779862973e-05, "loss": 0.9017, "num_input_tokens_seen": 1102368, "step": 1905 }, { "epoch": 0.2844801906464105, "grad_norm": 9.60482406616211, "learning_rate": 1.4216562406910933e-05, "loss": 0.8283, "num_input_tokens_seen": 1105056, "step": 1910 }, { "epoch": 0.28522490318736965, "grad_norm": 6.51470422744751, "learning_rate": 1.4253798033958893e-05, "loss": 0.7989, "num_input_tokens_seen": 1108032, "step": 1915 }, { "epoch": 0.28596961572832885, "grad_norm": 3.802070140838623, "learning_rate": 1.4291033661006853e-05, "loss": 0.8267, "num_input_tokens_seen": 1110880, "step": 1920 }, { "epoch": 0.28671432826928805, "grad_norm": 14.088200569152832, "learning_rate": 1.432826928805481e-05, "loss": 0.8895, "num_input_tokens_seen": 1113568, "step": 1925 }, { "epoch": 0.28745904081024726, "grad_norm": 7.681187629699707, "learning_rate": 1.436550491510277e-05, "loss": 0.816, "num_input_tokens_seen": 1116416, "step": 1930 }, { "epoch": 0.28820375335120646, "grad_norm": 2.688476324081421, "learning_rate": 1.440274054215073e-05, "loss": 0.8171, "num_input_tokens_seen": 1119232, "step": 1935 }, { "epoch": 0.2889484658921656, "grad_norm": 2.2835748195648193, "learning_rate": 1.443997616919869e-05, "loss": 0.7989, "num_input_tokens_seen": 1122144, "step": 1940 }, { "epoch": 0.2896931784331248, "grad_norm": 5.42056941986084, "learning_rate": 1.447721179624665e-05, "loss": 0.8008, "num_input_tokens_seen": 1125056, "step": 1945 }, { "epoch": 0.290437890974084, "grad_norm": 7.219368934631348, "learning_rate": 1.4514447423294608e-05, "loss": 0.8327, "num_input_tokens_seen": 1127744, "step": 1950 }, { "epoch": 0.2911826035150432, "grad_norm": 2.977308988571167, "learning_rate": 1.4551683050342568e-05, "loss": 0.8072, "num_input_tokens_seen": 1130432, "step": 1955 }, { "epoch": 0.29192731605600236, "grad_norm": 10.371152877807617, "learning_rate": 1.4588918677390528e-05, "loss": 0.7675, "num_input_tokens_seen": 1133184, "step": 1960 }, { "epoch": 0.29267202859696156, "grad_norm": 4.969689846038818, "learning_rate": 1.4626154304438488e-05, "loss": 0.8188, "num_input_tokens_seen": 1136256, "step": 1965 }, { "epoch": 0.29341674113792077, "grad_norm": 7.793428421020508, "learning_rate": 1.4663389931486446e-05, "loss": 0.8094, "num_input_tokens_seen": 1139104, "step": 1970 }, { "epoch": 0.29416145367887997, "grad_norm": 3.584695339202881, "learning_rate": 1.4700625558534406e-05, "loss": 0.762, "num_input_tokens_seen": 1142176, "step": 1975 }, { "epoch": 0.2949061662198391, "grad_norm": 4.197446823120117, "learning_rate": 1.4737861185582366e-05, "loss": 0.8528, "num_input_tokens_seen": 1144896, "step": 1980 }, { "epoch": 0.2956508787607983, "grad_norm": 2.470466136932373, "learning_rate": 1.4775096812630326e-05, "loss": 0.7739, "num_input_tokens_seen": 1148192, "step": 1985 }, { "epoch": 0.2963955913017575, "grad_norm": 3.354036569595337, "learning_rate": 1.4812332439678284e-05, "loss": 0.8239, "num_input_tokens_seen": 1150848, "step": 1990 }, { "epoch": 0.2971403038427167, "grad_norm": 2.5815162658691406, "learning_rate": 1.4849568066726244e-05, "loss": 0.8435, "num_input_tokens_seen": 1153792, "step": 1995 }, { "epoch": 0.2978850163836759, "grad_norm": 2.437992811203003, "learning_rate": 1.4886803693774203e-05, "loss": 0.8383, "num_input_tokens_seen": 1156544, "step": 2000 }, { "epoch": 0.2986297289246351, "grad_norm": 90.21554565429688, "learning_rate": 1.4924039320822163e-05, "loss": 0.8626, "num_input_tokens_seen": 1159360, "step": 2005 }, { "epoch": 0.2993744414655943, "grad_norm": 6.253443717956543, "learning_rate": 1.4961274947870125e-05, "loss": 0.8969, "num_input_tokens_seen": 1162240, "step": 2010 }, { "epoch": 0.3001191540065535, "grad_norm": 3.803964853286743, "learning_rate": 1.4998510574918081e-05, "loss": 0.8089, "num_input_tokens_seen": 1165056, "step": 2015 }, { "epoch": 0.3008638665475127, "grad_norm": 2.102247953414917, "learning_rate": 1.5035746201966041e-05, "loss": 0.8131, "num_input_tokens_seen": 1167616, "step": 2020 }, { "epoch": 0.30160857908847183, "grad_norm": 0.9743460416793823, "learning_rate": 1.5072981829014001e-05, "loss": 0.8279, "num_input_tokens_seen": 1170432, "step": 2025 }, { "epoch": 0.30235329162943103, "grad_norm": 1.8784035444259644, "learning_rate": 1.5110217456061963e-05, "loss": 0.7964, "num_input_tokens_seen": 1173216, "step": 2030 }, { "epoch": 0.30309800417039023, "grad_norm": 1.8545551300048828, "learning_rate": 1.5147453083109919e-05, "loss": 0.7741, "num_input_tokens_seen": 1176192, "step": 2035 }, { "epoch": 0.30384271671134944, "grad_norm": 5.242198467254639, "learning_rate": 1.5184688710157879e-05, "loss": 0.8406, "num_input_tokens_seen": 1179296, "step": 2040 }, { "epoch": 0.3045874292523086, "grad_norm": 7.693238735198975, "learning_rate": 1.5221924337205839e-05, "loss": 0.8564, "num_input_tokens_seen": 1182240, "step": 2045 }, { "epoch": 0.3053321417932678, "grad_norm": 3.072157859802246, "learning_rate": 1.52591599642538e-05, "loss": 0.8391, "num_input_tokens_seen": 1185088, "step": 2050 }, { "epoch": 0.306076854334227, "grad_norm": 2.829437017440796, "learning_rate": 1.529639559130176e-05, "loss": 0.8133, "num_input_tokens_seen": 1188128, "step": 2055 }, { "epoch": 0.3068215668751862, "grad_norm": 1.9193408489227295, "learning_rate": 1.5333631218349718e-05, "loss": 0.8007, "num_input_tokens_seen": 1190944, "step": 2060 }, { "epoch": 0.3075662794161454, "grad_norm": 4.668319225311279, "learning_rate": 1.5370866845397678e-05, "loss": 0.8381, "num_input_tokens_seen": 1193920, "step": 2065 }, { "epoch": 0.30831099195710454, "grad_norm": 3.6518442630767822, "learning_rate": 1.5408102472445638e-05, "loss": 0.8008, "num_input_tokens_seen": 1196640, "step": 2070 }, { "epoch": 0.30905570449806374, "grad_norm": 6.705214023590088, "learning_rate": 1.5445338099493598e-05, "loss": 0.859, "num_input_tokens_seen": 1199616, "step": 2075 }, { "epoch": 0.30980041703902295, "grad_norm": 2.4788217544555664, "learning_rate": 1.5482573726541554e-05, "loss": 0.8235, "num_input_tokens_seen": 1202368, "step": 2080 }, { "epoch": 0.31054512957998215, "grad_norm": 2.400958776473999, "learning_rate": 1.5519809353589514e-05, "loss": 0.8023, "num_input_tokens_seen": 1205312, "step": 2085 }, { "epoch": 0.3112898421209413, "grad_norm": 1.5861018896102905, "learning_rate": 1.5557044980637474e-05, "loss": 0.8231, "num_input_tokens_seen": 1208608, "step": 2090 }, { "epoch": 0.3120345546619005, "grad_norm": 1.6722157001495361, "learning_rate": 1.5594280607685434e-05, "loss": 0.8268, "num_input_tokens_seen": 1211520, "step": 2095 }, { "epoch": 0.3127792672028597, "grad_norm": 2.1336889266967773, "learning_rate": 1.5631516234733394e-05, "loss": 0.8107, "num_input_tokens_seen": 1214592, "step": 2100 }, { "epoch": 0.3135239797438189, "grad_norm": 2.381228446960449, "learning_rate": 1.5668751861781354e-05, "loss": 0.8254, "num_input_tokens_seen": 1217440, "step": 2105 }, { "epoch": 0.31426869228477805, "grad_norm": 1.8158631324768066, "learning_rate": 1.5705987488829313e-05, "loss": 0.8137, "num_input_tokens_seen": 1220416, "step": 2110 }, { "epoch": 0.31501340482573725, "grad_norm": 2.6328861713409424, "learning_rate": 1.5743223115877273e-05, "loss": 0.8132, "num_input_tokens_seen": 1222912, "step": 2115 }, { "epoch": 0.31575811736669646, "grad_norm": 1.7784690856933594, "learning_rate": 1.578045874292523e-05, "loss": 0.812, "num_input_tokens_seen": 1225952, "step": 2120 }, { "epoch": 0.31650282990765566, "grad_norm": 2.3429627418518066, "learning_rate": 1.581769436997319e-05, "loss": 0.8083, "num_input_tokens_seen": 1228736, "step": 2125 }, { "epoch": 0.31724754244861486, "grad_norm": 3.8448774814605713, "learning_rate": 1.585492999702115e-05, "loss": 0.8249, "num_input_tokens_seen": 1231584, "step": 2130 }, { "epoch": 0.317992254989574, "grad_norm": 2.312971353530884, "learning_rate": 1.589216562406911e-05, "loss": 0.7956, "num_input_tokens_seen": 1234432, "step": 2135 }, { "epoch": 0.3187369675305332, "grad_norm": 2.9016594886779785, "learning_rate": 1.5929401251117073e-05, "loss": 0.8012, "num_input_tokens_seen": 1237376, "step": 2140 }, { "epoch": 0.3194816800714924, "grad_norm": 2.1819002628326416, "learning_rate": 1.596663687816503e-05, "loss": 0.8011, "num_input_tokens_seen": 1240352, "step": 2145 }, { "epoch": 0.3202263926124516, "grad_norm": 2.4278769493103027, "learning_rate": 1.600387250521299e-05, "loss": 0.8401, "num_input_tokens_seen": 1242944, "step": 2150 }, { "epoch": 0.32097110515341076, "grad_norm": 6.500767707824707, "learning_rate": 1.604110813226095e-05, "loss": 0.8325, "num_input_tokens_seen": 1245760, "step": 2155 }, { "epoch": 0.32171581769436997, "grad_norm": 4.919790744781494, "learning_rate": 1.607834375930891e-05, "loss": 1.1421, "num_input_tokens_seen": 1248352, "step": 2160 }, { "epoch": 0.32246053023532917, "grad_norm": 1.155141830444336, "learning_rate": 1.6115579386356865e-05, "loss": 0.8353, "num_input_tokens_seen": 1251072, "step": 2165 }, { "epoch": 0.32320524277628837, "grad_norm": 2.0088489055633545, "learning_rate": 1.6152815013404825e-05, "loss": 0.8223, "num_input_tokens_seen": 1253792, "step": 2170 }, { "epoch": 0.3239499553172475, "grad_norm": 473.7802734375, "learning_rate": 1.6190050640452785e-05, "loss": 1.113, "num_input_tokens_seen": 1256512, "step": 2175 }, { "epoch": 0.3246946678582067, "grad_norm": 3.2314672470092773, "learning_rate": 1.6227286267500748e-05, "loss": 0.8199, "num_input_tokens_seen": 1259200, "step": 2180 }, { "epoch": 0.3254393803991659, "grad_norm": 1.2193031311035156, "learning_rate": 1.6264521894548704e-05, "loss": 0.8207, "num_input_tokens_seen": 1262208, "step": 2185 }, { "epoch": 0.3261840929401251, "grad_norm": 13.811347961425781, "learning_rate": 1.6301757521596664e-05, "loss": 0.8023, "num_input_tokens_seen": 1264960, "step": 2190 }, { "epoch": 0.32692880548108433, "grad_norm": 4.399806499481201, "learning_rate": 1.6338993148644624e-05, "loss": 0.7491, "num_input_tokens_seen": 1268000, "step": 2195 }, { "epoch": 0.3276735180220435, "grad_norm": 8.513660430908203, "learning_rate": 1.6376228775692584e-05, "loss": 0.8252, "num_input_tokens_seen": 1271296, "step": 2200 }, { "epoch": 0.3284182305630027, "grad_norm": 12.480905532836914, "learning_rate": 1.6413464402740544e-05, "loss": 0.821, "num_input_tokens_seen": 1274368, "step": 2205 }, { "epoch": 0.3291629431039619, "grad_norm": 3.237816572189331, "learning_rate": 1.64507000297885e-05, "loss": 0.8159, "num_input_tokens_seen": 1277216, "step": 2210 }, { "epoch": 0.3299076556449211, "grad_norm": 3.169919729232788, "learning_rate": 1.648793565683646e-05, "loss": 0.8163, "num_input_tokens_seen": 1280064, "step": 2215 }, { "epoch": 0.33065236818588023, "grad_norm": 4.0535888671875, "learning_rate": 1.6525171283884423e-05, "loss": 0.7614, "num_input_tokens_seen": 1282880, "step": 2220 }, { "epoch": 0.33139708072683943, "grad_norm": 2.21697998046875, "learning_rate": 1.6562406910932383e-05, "loss": 0.8568, "num_input_tokens_seen": 1285856, "step": 2225 }, { "epoch": 0.33214179326779864, "grad_norm": 3.651275396347046, "learning_rate": 1.659964253798034e-05, "loss": 0.7924, "num_input_tokens_seen": 1288576, "step": 2230 }, { "epoch": 0.33288650580875784, "grad_norm": 1.6730022430419922, "learning_rate": 1.66368781650283e-05, "loss": 0.8141, "num_input_tokens_seen": 1291360, "step": 2235 }, { "epoch": 0.333631218349717, "grad_norm": 2.1633825302124023, "learning_rate": 1.667411379207626e-05, "loss": 0.8206, "num_input_tokens_seen": 1294368, "step": 2240 }, { "epoch": 0.3343759308906762, "grad_norm": 1.4093570709228516, "learning_rate": 1.671134941912422e-05, "loss": 0.8092, "num_input_tokens_seen": 1297152, "step": 2245 }, { "epoch": 0.3351206434316354, "grad_norm": 1.4969439506530762, "learning_rate": 1.6748585046172176e-05, "loss": 0.8098, "num_input_tokens_seen": 1300640, "step": 2250 }, { "epoch": 0.3358653559725946, "grad_norm": 5.622081279754639, "learning_rate": 1.6785820673220136e-05, "loss": 0.8064, "num_input_tokens_seen": 1303488, "step": 2255 }, { "epoch": 0.3366100685135538, "grad_norm": 31.18144989013672, "learning_rate": 1.68230563002681e-05, "loss": 1.2177, "num_input_tokens_seen": 1306528, "step": 2260 }, { "epoch": 0.33735478105451294, "grad_norm": 22.975893020629883, "learning_rate": 1.686029192731606e-05, "loss": 1.0531, "num_input_tokens_seen": 1309792, "step": 2265 }, { "epoch": 0.33809949359547214, "grad_norm": 3.9560203552246094, "learning_rate": 1.689752755436402e-05, "loss": 0.835, "num_input_tokens_seen": 1312448, "step": 2270 }, { "epoch": 0.33884420613643135, "grad_norm": 1.9337329864501953, "learning_rate": 1.6934763181411975e-05, "loss": 0.8343, "num_input_tokens_seen": 1315136, "step": 2275 }, { "epoch": 0.33958891867739055, "grad_norm": 3.591508626937866, "learning_rate": 1.6971998808459935e-05, "loss": 0.8007, "num_input_tokens_seen": 1318144, "step": 2280 }, { "epoch": 0.3403336312183497, "grad_norm": 3.124187469482422, "learning_rate": 1.7009234435507895e-05, "loss": 0.8031, "num_input_tokens_seen": 1320992, "step": 2285 }, { "epoch": 0.3410783437593089, "grad_norm": 2.3677167892456055, "learning_rate": 1.7046470062555855e-05, "loss": 0.8873, "num_input_tokens_seen": 1323808, "step": 2290 }, { "epoch": 0.3418230563002681, "grad_norm": 2.5578813552856445, "learning_rate": 1.7083705689603814e-05, "loss": 0.8413, "num_input_tokens_seen": 1326400, "step": 2295 }, { "epoch": 0.3425677688412273, "grad_norm": 1.0194759368896484, "learning_rate": 1.7120941316651774e-05, "loss": 0.8264, "num_input_tokens_seen": 1329184, "step": 2300 }, { "epoch": 0.34331248138218645, "grad_norm": 4.109829425811768, "learning_rate": 1.7158176943699734e-05, "loss": 0.794, "num_input_tokens_seen": 1332224, "step": 2305 }, { "epoch": 0.34405719392314565, "grad_norm": 1.6620687246322632, "learning_rate": 1.7195412570747694e-05, "loss": 0.7891, "num_input_tokens_seen": 1335168, "step": 2310 }, { "epoch": 0.34480190646410486, "grad_norm": 2.395420551300049, "learning_rate": 1.723264819779565e-05, "loss": 0.8767, "num_input_tokens_seen": 1338016, "step": 2315 }, { "epoch": 0.34554661900506406, "grad_norm": 1.562321424484253, "learning_rate": 1.726988382484361e-05, "loss": 0.8321, "num_input_tokens_seen": 1340640, "step": 2320 }, { "epoch": 0.34629133154602326, "grad_norm": 2.8055503368377686, "learning_rate": 1.730711945189157e-05, "loss": 0.8011, "num_input_tokens_seen": 1343648, "step": 2325 }, { "epoch": 0.3470360440869824, "grad_norm": 2.5438122749328613, "learning_rate": 1.734435507893953e-05, "loss": 0.7867, "num_input_tokens_seen": 1346464, "step": 2330 }, { "epoch": 0.3477807566279416, "grad_norm": 1.4446773529052734, "learning_rate": 1.738159070598749e-05, "loss": 0.7928, "num_input_tokens_seen": 1349408, "step": 2335 }, { "epoch": 0.3485254691689008, "grad_norm": 1.4493564367294312, "learning_rate": 1.741882633303545e-05, "loss": 0.8558, "num_input_tokens_seen": 1352288, "step": 2340 }, { "epoch": 0.34927018170986, "grad_norm": 1.6671485900878906, "learning_rate": 1.745606196008341e-05, "loss": 0.7812, "num_input_tokens_seen": 1354944, "step": 2345 }, { "epoch": 0.35001489425081916, "grad_norm": 1.7189973592758179, "learning_rate": 1.749329758713137e-05, "loss": 0.8045, "num_input_tokens_seen": 1358048, "step": 2350 }, { "epoch": 0.35075960679177837, "grad_norm": 1.8768606185913086, "learning_rate": 1.753053321417933e-05, "loss": 0.8376, "num_input_tokens_seen": 1360704, "step": 2355 }, { "epoch": 0.35150431933273757, "grad_norm": 1.7909499406814575, "learning_rate": 1.7567768841227286e-05, "loss": 0.8094, "num_input_tokens_seen": 1363232, "step": 2360 }, { "epoch": 0.35224903187369677, "grad_norm": 0.8687551021575928, "learning_rate": 1.7605004468275246e-05, "loss": 0.7936, "num_input_tokens_seen": 1366272, "step": 2365 }, { "epoch": 0.3529937444146559, "grad_norm": 0.8988397121429443, "learning_rate": 1.7642240095323205e-05, "loss": 0.8107, "num_input_tokens_seen": 1369024, "step": 2370 }, { "epoch": 0.3537384569556151, "grad_norm": 1.4567058086395264, "learning_rate": 1.7679475722371165e-05, "loss": 0.7999, "num_input_tokens_seen": 1371936, "step": 2375 }, { "epoch": 0.3544831694965743, "grad_norm": 1.0040347576141357, "learning_rate": 1.7716711349419125e-05, "loss": 0.8048, "num_input_tokens_seen": 1374976, "step": 2380 }, { "epoch": 0.3552278820375335, "grad_norm": 1.0285699367523193, "learning_rate": 1.7753946976467085e-05, "loss": 0.811, "num_input_tokens_seen": 1378208, "step": 2385 }, { "epoch": 0.35597259457849273, "grad_norm": 1.366978645324707, "learning_rate": 1.7791182603515045e-05, "loss": 0.8175, "num_input_tokens_seen": 1381056, "step": 2390 }, { "epoch": 0.3567173071194519, "grad_norm": 3.0814733505249023, "learning_rate": 1.7828418230563005e-05, "loss": 0.8096, "num_input_tokens_seen": 1384384, "step": 2395 }, { "epoch": 0.3574620196604111, "grad_norm": 2.2884931564331055, "learning_rate": 1.786565385761096e-05, "loss": 0.865, "num_input_tokens_seen": 1387200, "step": 2400 }, { "epoch": 0.3582067322013703, "grad_norm": 0.9345988631248474, "learning_rate": 1.790288948465892e-05, "loss": 0.8289, "num_input_tokens_seen": 1390272, "step": 2405 }, { "epoch": 0.3589514447423295, "grad_norm": 0.9899935126304626, "learning_rate": 1.794012511170688e-05, "loss": 0.8108, "num_input_tokens_seen": 1393408, "step": 2410 }, { "epoch": 0.35969615728328863, "grad_norm": 5.176207065582275, "learning_rate": 1.797736073875484e-05, "loss": 0.8332, "num_input_tokens_seen": 1396352, "step": 2415 }, { "epoch": 0.36044086982424783, "grad_norm": 1.8706789016723633, "learning_rate": 1.80145963658028e-05, "loss": 0.8087, "num_input_tokens_seen": 1399328, "step": 2420 }, { "epoch": 0.36118558236520704, "grad_norm": 4.991009712219238, "learning_rate": 1.805183199285076e-05, "loss": 0.8944, "num_input_tokens_seen": 1402080, "step": 2425 }, { "epoch": 0.36193029490616624, "grad_norm": 0.8945123553276062, "learning_rate": 1.808906761989872e-05, "loss": 0.7992, "num_input_tokens_seen": 1404576, "step": 2430 }, { "epoch": 0.3626750074471254, "grad_norm": 1.233527660369873, "learning_rate": 1.812630324694668e-05, "loss": 0.8077, "num_input_tokens_seen": 1407552, "step": 2435 }, { "epoch": 0.3634197199880846, "grad_norm": 1.4016766548156738, "learning_rate": 1.816353887399464e-05, "loss": 0.7669, "num_input_tokens_seen": 1410272, "step": 2440 }, { "epoch": 0.3641644325290438, "grad_norm": 1.9389982223510742, "learning_rate": 1.8200774501042596e-05, "loss": 0.8473, "num_input_tokens_seen": 1412992, "step": 2445 }, { "epoch": 0.364909145070003, "grad_norm": 2.3160760402679443, "learning_rate": 1.8238010128090556e-05, "loss": 0.8161, "num_input_tokens_seen": 1415840, "step": 2450 }, { "epoch": 0.3656538576109622, "grad_norm": 1.5115609169006348, "learning_rate": 1.8275245755138516e-05, "loss": 0.7825, "num_input_tokens_seen": 1418560, "step": 2455 }, { "epoch": 0.36639857015192134, "grad_norm": 2.126699686050415, "learning_rate": 1.8312481382186476e-05, "loss": 0.7928, "num_input_tokens_seen": 1421664, "step": 2460 }, { "epoch": 0.36714328269288055, "grad_norm": 1.1667190790176392, "learning_rate": 1.8349717009234436e-05, "loss": 0.8028, "num_input_tokens_seen": 1424512, "step": 2465 }, { "epoch": 0.36788799523383975, "grad_norm": 0.5192550420761108, "learning_rate": 1.8386952636282396e-05, "loss": 0.7821, "num_input_tokens_seen": 1427392, "step": 2470 }, { "epoch": 0.36863270777479895, "grad_norm": 1.2107346057891846, "learning_rate": 1.8424188263330356e-05, "loss": 0.7836, "num_input_tokens_seen": 1430048, "step": 2475 }, { "epoch": 0.3693774203157581, "grad_norm": 2.5753321647644043, "learning_rate": 1.8461423890378315e-05, "loss": 0.8589, "num_input_tokens_seen": 1432704, "step": 2480 }, { "epoch": 0.3701221328567173, "grad_norm": 1.7614479064941406, "learning_rate": 1.8498659517426275e-05, "loss": 0.8221, "num_input_tokens_seen": 1435488, "step": 2485 }, { "epoch": 0.3708668453976765, "grad_norm": 2.892448902130127, "learning_rate": 1.8535895144474232e-05, "loss": 0.8163, "num_input_tokens_seen": 1438176, "step": 2490 }, { "epoch": 0.3716115579386357, "grad_norm": 4.197646141052246, "learning_rate": 1.857313077152219e-05, "loss": 0.8527, "num_input_tokens_seen": 1440800, "step": 2495 }, { "epoch": 0.37235627047959485, "grad_norm": 3.645112991333008, "learning_rate": 1.8610366398570155e-05, "loss": 0.8082, "num_input_tokens_seen": 1443808, "step": 2500 }, { "epoch": 0.37310098302055406, "grad_norm": 1.837409496307373, "learning_rate": 1.8647602025618115e-05, "loss": 0.802, "num_input_tokens_seen": 1446752, "step": 2505 }, { "epoch": 0.37384569556151326, "grad_norm": 1.869370460510254, "learning_rate": 1.868483765266607e-05, "loss": 0.791, "num_input_tokens_seen": 1449792, "step": 2510 }, { "epoch": 0.37459040810247246, "grad_norm": 2.1571922302246094, "learning_rate": 1.872207327971403e-05, "loss": 0.79, "num_input_tokens_seen": 1452672, "step": 2515 }, { "epoch": 0.3753351206434316, "grad_norm": 2.417330741882324, "learning_rate": 1.875930890676199e-05, "loss": 0.8412, "num_input_tokens_seen": 1455456, "step": 2520 }, { "epoch": 0.3760798331843908, "grad_norm": 1.3379908800125122, "learning_rate": 1.879654453380995e-05, "loss": 0.8189, "num_input_tokens_seen": 1458208, "step": 2525 }, { "epoch": 0.37682454572535, "grad_norm": 1.6434826850891113, "learning_rate": 1.8833780160857907e-05, "loss": 0.8245, "num_input_tokens_seen": 1461024, "step": 2530 }, { "epoch": 0.3775692582663092, "grad_norm": 1.4293904304504395, "learning_rate": 1.8871015787905867e-05, "loss": 0.8051, "num_input_tokens_seen": 1463776, "step": 2535 }, { "epoch": 0.3783139708072684, "grad_norm": 1.6323003768920898, "learning_rate": 1.890825141495383e-05, "loss": 0.8204, "num_input_tokens_seen": 1466688, "step": 2540 }, { "epoch": 0.37905868334822757, "grad_norm": 0.6638275980949402, "learning_rate": 1.894548704200179e-05, "loss": 0.7153, "num_input_tokens_seen": 1469536, "step": 2545 }, { "epoch": 0.37980339588918677, "grad_norm": 37.49087142944336, "learning_rate": 1.898272266904975e-05, "loss": 0.9849, "num_input_tokens_seen": 1472096, "step": 2550 }, { "epoch": 0.38054810843014597, "grad_norm": 8.203479766845703, "learning_rate": 1.9019958296097706e-05, "loss": 1.1735, "num_input_tokens_seen": 1475008, "step": 2555 }, { "epoch": 0.3812928209711052, "grad_norm": 3.891990900039673, "learning_rate": 1.9057193923145666e-05, "loss": 0.9002, "num_input_tokens_seen": 1477664, "step": 2560 }, { "epoch": 0.3820375335120643, "grad_norm": 2.668431520462036, "learning_rate": 1.9094429550193626e-05, "loss": 0.8423, "num_input_tokens_seen": 1480576, "step": 2565 }, { "epoch": 0.3827822460530235, "grad_norm": 1.9000219106674194, "learning_rate": 1.9131665177241586e-05, "loss": 0.8087, "num_input_tokens_seen": 1483488, "step": 2570 }, { "epoch": 0.3835269585939827, "grad_norm": 2.4046201705932617, "learning_rate": 1.9168900804289542e-05, "loss": 0.8138, "num_input_tokens_seen": 1486368, "step": 2575 }, { "epoch": 0.38427167113494193, "grad_norm": 3.6994259357452393, "learning_rate": 1.9206136431337506e-05, "loss": 0.818, "num_input_tokens_seen": 1489216, "step": 2580 }, { "epoch": 0.3850163836759011, "grad_norm": 2.5666849613189697, "learning_rate": 1.9243372058385466e-05, "loss": 0.8128, "num_input_tokens_seen": 1492032, "step": 2585 }, { "epoch": 0.3857610962168603, "grad_norm": 1.5156935453414917, "learning_rate": 1.9280607685433425e-05, "loss": 0.7771, "num_input_tokens_seen": 1494784, "step": 2590 }, { "epoch": 0.3865058087578195, "grad_norm": 3.919987916946411, "learning_rate": 1.9317843312481382e-05, "loss": 0.7263, "num_input_tokens_seen": 1497856, "step": 2595 }, { "epoch": 0.3872505212987787, "grad_norm": 36.33073425292969, "learning_rate": 1.9355078939529342e-05, "loss": 1.0673, "num_input_tokens_seen": 1500512, "step": 2600 }, { "epoch": 0.3879952338397379, "grad_norm": 1.798476219177246, "learning_rate": 1.93923145665773e-05, "loss": 1.0956, "num_input_tokens_seen": 1503328, "step": 2605 }, { "epoch": 0.38873994638069703, "grad_norm": 1137.079833984375, "learning_rate": 1.942955019362526e-05, "loss": 1.0844, "num_input_tokens_seen": 1506016, "step": 2610 }, { "epoch": 0.38948465892165623, "grad_norm": 5.255368709564209, "learning_rate": 1.946678582067322e-05, "loss": 1.2134, "num_input_tokens_seen": 1508736, "step": 2615 }, { "epoch": 0.39022937146261544, "grad_norm": 4.201308727264404, "learning_rate": 1.950402144772118e-05, "loss": 0.8445, "num_input_tokens_seen": 1512896, "step": 2620 }, { "epoch": 0.39097408400357464, "grad_norm": 6.520557880401611, "learning_rate": 1.954125707476914e-05, "loss": 0.8305, "num_input_tokens_seen": 1515712, "step": 2625 }, { "epoch": 0.3917187965445338, "grad_norm": 3.6090195178985596, "learning_rate": 1.95784927018171e-05, "loss": 0.8257, "num_input_tokens_seen": 1518560, "step": 2630 }, { "epoch": 0.392463509085493, "grad_norm": 2.5871782302856445, "learning_rate": 1.961572832886506e-05, "loss": 0.8286, "num_input_tokens_seen": 1521536, "step": 2635 }, { "epoch": 0.3932082216264522, "grad_norm": 2.503244161605835, "learning_rate": 1.9652963955913017e-05, "loss": 0.824, "num_input_tokens_seen": 1524544, "step": 2640 }, { "epoch": 0.3939529341674114, "grad_norm": 4.539648532867432, "learning_rate": 1.9690199582960977e-05, "loss": 0.8159, "num_input_tokens_seen": 1527744, "step": 2645 }, { "epoch": 0.39469764670837054, "grad_norm": 1.3290269374847412, "learning_rate": 1.9727435210008937e-05, "loss": 0.8089, "num_input_tokens_seen": 1530592, "step": 2650 }, { "epoch": 0.39544235924932974, "grad_norm": 1.1495120525360107, "learning_rate": 1.9764670837056897e-05, "loss": 0.8022, "num_input_tokens_seen": 1533568, "step": 2655 }, { "epoch": 0.39618707179028895, "grad_norm": 2.5600719451904297, "learning_rate": 1.9801906464104857e-05, "loss": 0.8653, "num_input_tokens_seen": 1536512, "step": 2660 }, { "epoch": 0.39693178433124815, "grad_norm": 2.398336887359619, "learning_rate": 1.9839142091152816e-05, "loss": 0.8423, "num_input_tokens_seen": 1539520, "step": 2665 }, { "epoch": 0.39767649687220735, "grad_norm": 1.80372953414917, "learning_rate": 1.9876377718200776e-05, "loss": 0.8154, "num_input_tokens_seen": 1542464, "step": 2670 }, { "epoch": 0.3984212094131665, "grad_norm": 3.8710429668426514, "learning_rate": 1.9913613345248736e-05, "loss": 0.7289, "num_input_tokens_seen": 1545344, "step": 2675 }, { "epoch": 0.3991659219541257, "grad_norm": 8.45529556274414, "learning_rate": 1.9950848972296696e-05, "loss": 0.7757, "num_input_tokens_seen": 1548256, "step": 2680 }, { "epoch": 0.3999106344950849, "grad_norm": 40.4161262512207, "learning_rate": 1.9988084599344652e-05, "loss": 0.8067, "num_input_tokens_seen": 1551328, "step": 2685 }, { "epoch": 0.4006553470360441, "grad_norm": 3.711968183517456, "learning_rate": 2.0025320226392612e-05, "loss": 0.9948, "num_input_tokens_seen": 1554048, "step": 2690 }, { "epoch": 0.40140005957700325, "grad_norm": 1.456801176071167, "learning_rate": 2.0062555853440572e-05, "loss": 0.8306, "num_input_tokens_seen": 1556704, "step": 2695 }, { "epoch": 0.40214477211796246, "grad_norm": 1.293532371520996, "learning_rate": 2.0099791480488532e-05, "loss": 0.8198, "num_input_tokens_seen": 1559680, "step": 2700 }, { "epoch": 0.40288948465892166, "grad_norm": 1.209952473640442, "learning_rate": 2.0137027107536492e-05, "loss": 0.8041, "num_input_tokens_seen": 1562464, "step": 2705 }, { "epoch": 0.40363419719988086, "grad_norm": 1.0808161497116089, "learning_rate": 2.0174262734584452e-05, "loss": 0.8012, "num_input_tokens_seen": 1565376, "step": 2710 }, { "epoch": 0.40437890974084, "grad_norm": 1.743409276008606, "learning_rate": 2.021149836163241e-05, "loss": 0.7979, "num_input_tokens_seen": 1568256, "step": 2715 }, { "epoch": 0.4051236222817992, "grad_norm": 1.2049555778503418, "learning_rate": 2.024873398868037e-05, "loss": 0.8219, "num_input_tokens_seen": 1571424, "step": 2720 }, { "epoch": 0.4058683348227584, "grad_norm": 0.7290701270103455, "learning_rate": 2.0285969615728328e-05, "loss": 0.7847, "num_input_tokens_seen": 1574112, "step": 2725 }, { "epoch": 0.4066130473637176, "grad_norm": 1.0319030284881592, "learning_rate": 2.0323205242776288e-05, "loss": 0.8211, "num_input_tokens_seen": 1577024, "step": 2730 }, { "epoch": 0.4073577599046768, "grad_norm": 1.1697108745574951, "learning_rate": 2.0360440869824248e-05, "loss": 0.7901, "num_input_tokens_seen": 1579808, "step": 2735 }, { "epoch": 0.40810247244563597, "grad_norm": 8.25582504272461, "learning_rate": 2.0397676496872207e-05, "loss": 0.83, "num_input_tokens_seen": 1582752, "step": 2740 }, { "epoch": 0.40884718498659517, "grad_norm": 1.4069303274154663, "learning_rate": 2.043491212392017e-05, "loss": 0.7893, "num_input_tokens_seen": 1585728, "step": 2745 }, { "epoch": 0.40959189752755437, "grad_norm": 1.5592173337936401, "learning_rate": 2.0472147750968127e-05, "loss": 0.8043, "num_input_tokens_seen": 1588480, "step": 2750 }, { "epoch": 0.4103366100685136, "grad_norm": 3.929586172103882, "learning_rate": 2.0509383378016087e-05, "loss": 0.8264, "num_input_tokens_seen": 1591680, "step": 2755 }, { "epoch": 0.4110813226094727, "grad_norm": 1.3817267417907715, "learning_rate": 2.0546619005064047e-05, "loss": 0.8289, "num_input_tokens_seen": 1594496, "step": 2760 }, { "epoch": 0.4118260351504319, "grad_norm": 0.8363814353942871, "learning_rate": 2.0583854632112007e-05, "loss": 0.8149, "num_input_tokens_seen": 1597408, "step": 2765 }, { "epoch": 0.4125707476913911, "grad_norm": 1.1080933809280396, "learning_rate": 2.0621090259159963e-05, "loss": 0.818, "num_input_tokens_seen": 1600288, "step": 2770 }, { "epoch": 0.41331546023235033, "grad_norm": 0.887637734413147, "learning_rate": 2.0658325886207923e-05, "loss": 0.8214, "num_input_tokens_seen": 1603200, "step": 2775 }, { "epoch": 0.4140601727733095, "grad_norm": 1.0829250812530518, "learning_rate": 2.0695561513255883e-05, "loss": 0.7981, "num_input_tokens_seen": 1606144, "step": 2780 }, { "epoch": 0.4148048853142687, "grad_norm": 0.8321808576583862, "learning_rate": 2.0732797140303846e-05, "loss": 0.7929, "num_input_tokens_seen": 1609152, "step": 2785 }, { "epoch": 0.4155495978552279, "grad_norm": 1.1290134191513062, "learning_rate": 2.0770032767351803e-05, "loss": 0.7985, "num_input_tokens_seen": 1612192, "step": 2790 }, { "epoch": 0.4162943103961871, "grad_norm": 0.8477641344070435, "learning_rate": 2.0807268394399762e-05, "loss": 0.8087, "num_input_tokens_seen": 1614944, "step": 2795 }, { "epoch": 0.4170390229371463, "grad_norm": 0.7543423771858215, "learning_rate": 2.0844504021447722e-05, "loss": 0.7982, "num_input_tokens_seen": 1617856, "step": 2800 }, { "epoch": 0.41778373547810543, "grad_norm": 0.7167761921882629, "learning_rate": 2.0881739648495682e-05, "loss": 0.7835, "num_input_tokens_seen": 1620928, "step": 2805 }, { "epoch": 0.41852844801906464, "grad_norm": 1.1755329370498657, "learning_rate": 2.0918975275543642e-05, "loss": 0.7978, "num_input_tokens_seen": 1623744, "step": 2810 }, { "epoch": 0.41927316056002384, "grad_norm": 3.9656918048858643, "learning_rate": 2.09562109025916e-05, "loss": 0.8482, "num_input_tokens_seen": 1626720, "step": 2815 }, { "epoch": 0.42001787310098304, "grad_norm": 2.353313684463501, "learning_rate": 2.099344652963956e-05, "loss": 0.8709, "num_input_tokens_seen": 1629696, "step": 2820 }, { "epoch": 0.4207625856419422, "grad_norm": 0.4721486270427704, "learning_rate": 2.103068215668752e-05, "loss": 0.843, "num_input_tokens_seen": 1632416, "step": 2825 }, { "epoch": 0.4215072981829014, "grad_norm": 1.1747314929962158, "learning_rate": 2.106791778373548e-05, "loss": 0.7854, "num_input_tokens_seen": 1635168, "step": 2830 }, { "epoch": 0.4222520107238606, "grad_norm": 1.0785295963287354, "learning_rate": 2.1105153410783438e-05, "loss": 0.7903, "num_input_tokens_seen": 1637760, "step": 2835 }, { "epoch": 0.4229967232648198, "grad_norm": 3.101531505584717, "learning_rate": 2.1142389037831398e-05, "loss": 0.8039, "num_input_tokens_seen": 1640480, "step": 2840 }, { "epoch": 0.42374143580577894, "grad_norm": 9.180511474609375, "learning_rate": 2.1179624664879358e-05, "loss": 0.8519, "num_input_tokens_seen": 1643232, "step": 2845 }, { "epoch": 0.42448614834673815, "grad_norm": 1.1666626930236816, "learning_rate": 2.1216860291927317e-05, "loss": 0.7222, "num_input_tokens_seen": 1646208, "step": 2850 }, { "epoch": 0.42523086088769735, "grad_norm": 5.1717848777771, "learning_rate": 2.1254095918975274e-05, "loss": 0.978, "num_input_tokens_seen": 1648864, "step": 2855 }, { "epoch": 0.42597557342865655, "grad_norm": 2.1943843364715576, "learning_rate": 2.1291331546023234e-05, "loss": 0.8888, "num_input_tokens_seen": 1651648, "step": 2860 }, { "epoch": 0.42672028596961575, "grad_norm": 3.067439317703247, "learning_rate": 2.1328567173071197e-05, "loss": 0.8909, "num_input_tokens_seen": 1654272, "step": 2865 }, { "epoch": 0.4274649985105749, "grad_norm": 1.3539708852767944, "learning_rate": 2.1365802800119157e-05, "loss": 0.8107, "num_input_tokens_seen": 1657280, "step": 2870 }, { "epoch": 0.4282097110515341, "grad_norm": 0.6984896659851074, "learning_rate": 2.1403038427167117e-05, "loss": 0.8155, "num_input_tokens_seen": 1660000, "step": 2875 }, { "epoch": 0.4289544235924933, "grad_norm": 1.260080099105835, "learning_rate": 2.1440274054215073e-05, "loss": 0.772, "num_input_tokens_seen": 1662880, "step": 2880 }, { "epoch": 0.4296991361334525, "grad_norm": 1.5060421228408813, "learning_rate": 2.1477509681263033e-05, "loss": 0.7775, "num_input_tokens_seen": 1665760, "step": 2885 }, { "epoch": 0.43044384867441166, "grad_norm": 1.3499257564544678, "learning_rate": 2.1514745308310993e-05, "loss": 0.939, "num_input_tokens_seen": 1668640, "step": 2890 }, { "epoch": 0.43118856121537086, "grad_norm": 3.993314743041992, "learning_rate": 2.1551980935358953e-05, "loss": 0.887, "num_input_tokens_seen": 1671552, "step": 2895 }, { "epoch": 0.43193327375633006, "grad_norm": 4.034473419189453, "learning_rate": 2.1589216562406913e-05, "loss": 1.0482, "num_input_tokens_seen": 1674432, "step": 2900 }, { "epoch": 0.43267798629728926, "grad_norm": 0.8494142293930054, "learning_rate": 2.1626452189454872e-05, "loss": 0.8411, "num_input_tokens_seen": 1677248, "step": 2905 }, { "epoch": 0.4334226988382484, "grad_norm": 2.288848876953125, "learning_rate": 2.1663687816502832e-05, "loss": 0.8101, "num_input_tokens_seen": 1680064, "step": 2910 }, { "epoch": 0.4341674113792076, "grad_norm": 1.7564408779144287, "learning_rate": 2.1700923443550792e-05, "loss": 0.819, "num_input_tokens_seen": 1682848, "step": 2915 }, { "epoch": 0.4349121239201668, "grad_norm": 1.3966113328933716, "learning_rate": 2.173815907059875e-05, "loss": 0.8216, "num_input_tokens_seen": 1685792, "step": 2920 }, { "epoch": 0.435656836461126, "grad_norm": 2.385643482208252, "learning_rate": 2.177539469764671e-05, "loss": 0.8134, "num_input_tokens_seen": 1688448, "step": 2925 }, { "epoch": 0.4364015490020852, "grad_norm": 0.9234651923179626, "learning_rate": 2.1812630324694668e-05, "loss": 0.8206, "num_input_tokens_seen": 1691328, "step": 2930 }, { "epoch": 0.43714626154304437, "grad_norm": 1.0055547952651978, "learning_rate": 2.1849865951742628e-05, "loss": 0.8053, "num_input_tokens_seen": 1694336, "step": 2935 }, { "epoch": 0.43789097408400357, "grad_norm": 1.422537922859192, "learning_rate": 2.1887101578790588e-05, "loss": 0.8074, "num_input_tokens_seen": 1697056, "step": 2940 }, { "epoch": 0.4386356866249628, "grad_norm": 0.5176904201507568, "learning_rate": 2.1924337205838548e-05, "loss": 0.8099, "num_input_tokens_seen": 1699808, "step": 2945 }, { "epoch": 0.439380399165922, "grad_norm": 0.7341372966766357, "learning_rate": 2.1961572832886508e-05, "loss": 0.7736, "num_input_tokens_seen": 1702848, "step": 2950 }, { "epoch": 0.4401251117068811, "grad_norm": 1.900921106338501, "learning_rate": 2.1998808459934468e-05, "loss": 0.7975, "num_input_tokens_seen": 1705696, "step": 2955 }, { "epoch": 0.4408698242478403, "grad_norm": 1.4996097087860107, "learning_rate": 2.2036044086982427e-05, "loss": 0.7947, "num_input_tokens_seen": 1708576, "step": 2960 }, { "epoch": 0.4416145367887995, "grad_norm": 1.7133007049560547, "learning_rate": 2.2073279714030384e-05, "loss": 0.7549, "num_input_tokens_seen": 1711328, "step": 2965 }, { "epoch": 0.44235924932975873, "grad_norm": 0.924674928188324, "learning_rate": 2.2110515341078344e-05, "loss": 0.8116, "num_input_tokens_seen": 1714368, "step": 2970 }, { "epoch": 0.4431039618707179, "grad_norm": 0.8358421921730042, "learning_rate": 2.2147750968126304e-05, "loss": 0.8477, "num_input_tokens_seen": 1717248, "step": 2975 }, { "epoch": 0.4438486744116771, "grad_norm": 1.405434250831604, "learning_rate": 2.2184986595174263e-05, "loss": 0.8349, "num_input_tokens_seen": 1720384, "step": 2980 }, { "epoch": 0.4445933869526363, "grad_norm": 3.537498712539673, "learning_rate": 2.2222222222222223e-05, "loss": 0.802, "num_input_tokens_seen": 1723360, "step": 2985 }, { "epoch": 0.4453380994935955, "grad_norm": 3.2070469856262207, "learning_rate": 2.2259457849270183e-05, "loss": 0.8091, "num_input_tokens_seen": 1725920, "step": 2990 }, { "epoch": 0.4460828120345547, "grad_norm": 2.2599048614501953, "learning_rate": 2.2296693476318143e-05, "loss": 0.7513, "num_input_tokens_seen": 1728672, "step": 2995 }, { "epoch": 0.44682752457551383, "grad_norm": 1.5555074214935303, "learning_rate": 2.2333929103366103e-05, "loss": 0.825, "num_input_tokens_seen": 1731488, "step": 3000 }, { "epoch": 0.44757223711647304, "grad_norm": 5.729109764099121, "learning_rate": 2.2371164730414063e-05, "loss": 0.851, "num_input_tokens_seen": 1734496, "step": 3005 }, { "epoch": 0.44831694965743224, "grad_norm": 1.02006995677948, "learning_rate": 2.240840035746202e-05, "loss": 0.8375, "num_input_tokens_seen": 1737408, "step": 3010 }, { "epoch": 0.44906166219839144, "grad_norm": 1.9916712045669556, "learning_rate": 2.244563598450998e-05, "loss": 0.7872, "num_input_tokens_seen": 1740096, "step": 3015 }, { "epoch": 0.4498063747393506, "grad_norm": 1.2711135149002075, "learning_rate": 2.248287161155794e-05, "loss": 0.7825, "num_input_tokens_seen": 1742688, "step": 3020 }, { "epoch": 0.4505510872803098, "grad_norm": 0.8126233220100403, "learning_rate": 2.25201072386059e-05, "loss": 0.8149, "num_input_tokens_seen": 1745760, "step": 3025 }, { "epoch": 0.451295799821269, "grad_norm": 1.260972023010254, "learning_rate": 2.255734286565386e-05, "loss": 0.8057, "num_input_tokens_seen": 1748768, "step": 3030 }, { "epoch": 0.4520405123622282, "grad_norm": 0.6607245206832886, "learning_rate": 2.259457849270182e-05, "loss": 0.8026, "num_input_tokens_seen": 1751680, "step": 3035 }, { "epoch": 0.45278522490318734, "grad_norm": 0.7763521671295166, "learning_rate": 2.2631814119749778e-05, "loss": 0.8169, "num_input_tokens_seen": 1754496, "step": 3040 }, { "epoch": 0.45352993744414655, "grad_norm": 0.8367595076560974, "learning_rate": 2.2669049746797738e-05, "loss": 0.8415, "num_input_tokens_seen": 1757024, "step": 3045 }, { "epoch": 0.45427464998510575, "grad_norm": 0.8404898047447205, "learning_rate": 2.2706285373845695e-05, "loss": 0.7941, "num_input_tokens_seen": 1759936, "step": 3050 }, { "epoch": 0.45501936252606495, "grad_norm": 0.5925204753875732, "learning_rate": 2.2743521000893654e-05, "loss": 0.7864, "num_input_tokens_seen": 1762880, "step": 3055 }, { "epoch": 0.45576407506702415, "grad_norm": 1.0745426416397095, "learning_rate": 2.2780756627941614e-05, "loss": 0.7976, "num_input_tokens_seen": 1765952, "step": 3060 }, { "epoch": 0.4565087876079833, "grad_norm": 0.646531879901886, "learning_rate": 2.2817992254989574e-05, "loss": 0.7689, "num_input_tokens_seen": 1768704, "step": 3065 }, { "epoch": 0.4572535001489425, "grad_norm": 2.0539329051971436, "learning_rate": 2.2855227882037537e-05, "loss": 0.7909, "num_input_tokens_seen": 1771488, "step": 3070 }, { "epoch": 0.4579982126899017, "grad_norm": 1.7731152772903442, "learning_rate": 2.2892463509085494e-05, "loss": 0.8151, "num_input_tokens_seen": 1774624, "step": 3075 }, { "epoch": 0.4587429252308609, "grad_norm": 0.9254619479179382, "learning_rate": 2.2929699136133454e-05, "loss": 0.8575, "num_input_tokens_seen": 1777728, "step": 3080 }, { "epoch": 0.45948763777182006, "grad_norm": 0.6413570642471313, "learning_rate": 2.2966934763181414e-05, "loss": 0.8042, "num_input_tokens_seen": 1780640, "step": 3085 }, { "epoch": 0.46023235031277926, "grad_norm": 1.3682522773742676, "learning_rate": 2.3004170390229373e-05, "loss": 0.8787, "num_input_tokens_seen": 1783520, "step": 3090 }, { "epoch": 0.46097706285373846, "grad_norm": 1.6055330038070679, "learning_rate": 2.304140601727733e-05, "loss": 0.785, "num_input_tokens_seen": 1786240, "step": 3095 }, { "epoch": 0.46172177539469766, "grad_norm": 0.6974729299545288, "learning_rate": 2.307864164432529e-05, "loss": 0.8045, "num_input_tokens_seen": 1789184, "step": 3100 }, { "epoch": 0.4624664879356568, "grad_norm": 1.4046342372894287, "learning_rate": 2.3115877271373253e-05, "loss": 0.8292, "num_input_tokens_seen": 1791968, "step": 3105 }, { "epoch": 0.463211200476616, "grad_norm": 1.7531770467758179, "learning_rate": 2.3153112898421213e-05, "loss": 0.8058, "num_input_tokens_seen": 1794816, "step": 3110 }, { "epoch": 0.4639559130175752, "grad_norm": 0.724487841129303, "learning_rate": 2.319034852546917e-05, "loss": 0.8127, "num_input_tokens_seen": 1797696, "step": 3115 }, { "epoch": 0.4647006255585344, "grad_norm": 1.9965214729309082, "learning_rate": 2.322758415251713e-05, "loss": 0.8214, "num_input_tokens_seen": 1800640, "step": 3120 }, { "epoch": 0.4654453380994936, "grad_norm": 1.0130919218063354, "learning_rate": 2.326481977956509e-05, "loss": 0.8343, "num_input_tokens_seen": 1803552, "step": 3125 }, { "epoch": 0.46619005064045277, "grad_norm": 0.7519381046295166, "learning_rate": 2.330205540661305e-05, "loss": 0.8078, "num_input_tokens_seen": 1806624, "step": 3130 }, { "epoch": 0.46693476318141197, "grad_norm": 1.4775272607803345, "learning_rate": 2.333929103366101e-05, "loss": 0.7926, "num_input_tokens_seen": 1809760, "step": 3135 }, { "epoch": 0.4676794757223712, "grad_norm": 0.7338501214981079, "learning_rate": 2.3376526660708965e-05, "loss": 0.8006, "num_input_tokens_seen": 1812544, "step": 3140 }, { "epoch": 0.4684241882633304, "grad_norm": 0.43374118208885193, "learning_rate": 2.341376228775693e-05, "loss": 0.8171, "num_input_tokens_seen": 1815072, "step": 3145 }, { "epoch": 0.4691689008042895, "grad_norm": 0.7475950121879578, "learning_rate": 2.3450997914804888e-05, "loss": 0.8089, "num_input_tokens_seen": 1817760, "step": 3150 }, { "epoch": 0.4699136133452487, "grad_norm": 0.4766894280910492, "learning_rate": 2.3488233541852848e-05, "loss": 0.8073, "num_input_tokens_seen": 1820512, "step": 3155 }, { "epoch": 0.47065832588620793, "grad_norm": 0.9107518196105957, "learning_rate": 2.3525469168900805e-05, "loss": 0.81, "num_input_tokens_seen": 1823808, "step": 3160 }, { "epoch": 0.47140303842716713, "grad_norm": 0.7107895612716675, "learning_rate": 2.3562704795948764e-05, "loss": 0.7948, "num_input_tokens_seen": 1826656, "step": 3165 }, { "epoch": 0.4721477509681263, "grad_norm": 0.8242627382278442, "learning_rate": 2.3599940422996724e-05, "loss": 0.8306, "num_input_tokens_seen": 1830304, "step": 3170 }, { "epoch": 0.4728924635090855, "grad_norm": 0.6828604936599731, "learning_rate": 2.3637176050044684e-05, "loss": 0.8011, "num_input_tokens_seen": 1833408, "step": 3175 }, { "epoch": 0.4736371760500447, "grad_norm": 0.9584705829620361, "learning_rate": 2.367441167709264e-05, "loss": 0.7885, "num_input_tokens_seen": 1836096, "step": 3180 }, { "epoch": 0.4743818885910039, "grad_norm": 2.111027479171753, "learning_rate": 2.3711647304140604e-05, "loss": 0.8015, "num_input_tokens_seen": 1838912, "step": 3185 }, { "epoch": 0.4751266011319631, "grad_norm": 0.9123789668083191, "learning_rate": 2.3748882931188564e-05, "loss": 0.7958, "num_input_tokens_seen": 1841824, "step": 3190 }, { "epoch": 0.47587131367292224, "grad_norm": 0.5912151336669922, "learning_rate": 2.3786118558236524e-05, "loss": 0.8026, "num_input_tokens_seen": 1844800, "step": 3195 }, { "epoch": 0.47661602621388144, "grad_norm": 0.4830930531024933, "learning_rate": 2.3823354185284483e-05, "loss": 0.8187, "num_input_tokens_seen": 1847648, "step": 3200 }, { "epoch": 0.47736073875484064, "grad_norm": 0.6262291669845581, "learning_rate": 2.386058981233244e-05, "loss": 0.7782, "num_input_tokens_seen": 1850464, "step": 3205 }, { "epoch": 0.47810545129579984, "grad_norm": 0.6437273621559143, "learning_rate": 2.38978254393804e-05, "loss": 0.7974, "num_input_tokens_seen": 1853248, "step": 3210 }, { "epoch": 0.478850163836759, "grad_norm": 0.7197263836860657, "learning_rate": 2.393506106642836e-05, "loss": 0.792, "num_input_tokens_seen": 1856000, "step": 3215 }, { "epoch": 0.4795948763777182, "grad_norm": 0.3974834382534027, "learning_rate": 2.397229669347632e-05, "loss": 0.8005, "num_input_tokens_seen": 1858944, "step": 3220 }, { "epoch": 0.4803395889186774, "grad_norm": 0.5937800407409668, "learning_rate": 2.400953232052428e-05, "loss": 0.7765, "num_input_tokens_seen": 1862080, "step": 3225 }, { "epoch": 0.4810843014596366, "grad_norm": 1.515473484992981, "learning_rate": 2.404676794757224e-05, "loss": 0.7977, "num_input_tokens_seen": 1865088, "step": 3230 }, { "epoch": 0.48182901400059575, "grad_norm": 0.7834762334823608, "learning_rate": 2.40840035746202e-05, "loss": 0.798, "num_input_tokens_seen": 1867552, "step": 3235 }, { "epoch": 0.48257372654155495, "grad_norm": 0.9680752158164978, "learning_rate": 2.412123920166816e-05, "loss": 0.8523, "num_input_tokens_seen": 1870784, "step": 3240 }, { "epoch": 0.48331843908251415, "grad_norm": 0.9383596777915955, "learning_rate": 2.4158474828716115e-05, "loss": 0.8352, "num_input_tokens_seen": 1873920, "step": 3245 }, { "epoch": 0.48406315162347335, "grad_norm": 1.6336300373077393, "learning_rate": 2.4195710455764075e-05, "loss": 0.7789, "num_input_tokens_seen": 1877056, "step": 3250 }, { "epoch": 0.48480786416443256, "grad_norm": 0.8599020838737488, "learning_rate": 2.4232946082812035e-05, "loss": 0.817, "num_input_tokens_seen": 1880160, "step": 3255 }, { "epoch": 0.4855525767053917, "grad_norm": 2.908698081970215, "learning_rate": 2.4270181709859995e-05, "loss": 0.824, "num_input_tokens_seen": 1882912, "step": 3260 }, { "epoch": 0.4862972892463509, "grad_norm": 0.8791532516479492, "learning_rate": 2.4307417336907955e-05, "loss": 0.8139, "num_input_tokens_seen": 1885536, "step": 3265 }, { "epoch": 0.4870420017873101, "grad_norm": 2.3260505199432373, "learning_rate": 2.4344652963955915e-05, "loss": 0.8198, "num_input_tokens_seen": 1888416, "step": 3270 }, { "epoch": 0.4877867143282693, "grad_norm": 0.6522477865219116, "learning_rate": 2.4381888591003874e-05, "loss": 0.8168, "num_input_tokens_seen": 1891456, "step": 3275 }, { "epoch": 0.48853142686922846, "grad_norm": 0.720706045627594, "learning_rate": 2.4419124218051834e-05, "loss": 0.8166, "num_input_tokens_seen": 1894240, "step": 3280 }, { "epoch": 0.48927613941018766, "grad_norm": 0.428335577249527, "learning_rate": 2.4456359845099794e-05, "loss": 0.8086, "num_input_tokens_seen": 1897088, "step": 3285 }, { "epoch": 0.49002085195114686, "grad_norm": 0.5128321647644043, "learning_rate": 2.449359547214775e-05, "loss": 0.8148, "num_input_tokens_seen": 1900192, "step": 3290 }, { "epoch": 0.49076556449210607, "grad_norm": 0.7001945972442627, "learning_rate": 2.453083109919571e-05, "loss": 0.8207, "num_input_tokens_seen": 1903136, "step": 3295 }, { "epoch": 0.4915102770330652, "grad_norm": 1.051875114440918, "learning_rate": 2.456806672624367e-05, "loss": 0.7766, "num_input_tokens_seen": 1906016, "step": 3300 }, { "epoch": 0.4922549895740244, "grad_norm": 1.4402780532836914, "learning_rate": 2.460530235329163e-05, "loss": 0.8123, "num_input_tokens_seen": 1909184, "step": 3305 }, { "epoch": 0.4929997021149836, "grad_norm": 2.7685739994049072, "learning_rate": 2.464253798033959e-05, "loss": 0.8203, "num_input_tokens_seen": 1911872, "step": 3310 }, { "epoch": 0.4937444146559428, "grad_norm": 1.7518600225448608, "learning_rate": 2.467977360738755e-05, "loss": 1.0299, "num_input_tokens_seen": 1914848, "step": 3315 }, { "epoch": 0.494489127196902, "grad_norm": 0.5798244476318359, "learning_rate": 2.471700923443551e-05, "loss": 0.7773, "num_input_tokens_seen": 1917920, "step": 3320 }, { "epoch": 0.49523383973786117, "grad_norm": 0.7692534923553467, "learning_rate": 2.475424486148347e-05, "loss": 0.84, "num_input_tokens_seen": 1920704, "step": 3325 }, { "epoch": 0.4959785522788204, "grad_norm": 1.104968786239624, "learning_rate": 2.4791480488531426e-05, "loss": 0.8743, "num_input_tokens_seen": 1923264, "step": 3330 }, { "epoch": 0.4967232648197796, "grad_norm": 1.8776150941848755, "learning_rate": 2.4828716115579386e-05, "loss": 3.2495, "num_input_tokens_seen": 1926048, "step": 3335 }, { "epoch": 0.4974679773607388, "grad_norm": 1.6479696035385132, "learning_rate": 2.4865951742627346e-05, "loss": 0.8086, "num_input_tokens_seen": 1928896, "step": 3340 }, { "epoch": 0.4982126899016979, "grad_norm": 0.9467015862464905, "learning_rate": 2.4903187369675306e-05, "loss": 0.8048, "num_input_tokens_seen": 1931776, "step": 3345 }, { "epoch": 0.4989574024426571, "grad_norm": 0.6339288353919983, "learning_rate": 2.494042299672327e-05, "loss": 0.7965, "num_input_tokens_seen": 1934624, "step": 3350 }, { "epoch": 0.49970211498361633, "grad_norm": 0.7279565930366516, "learning_rate": 2.4977658623771225e-05, "loss": 0.8057, "num_input_tokens_seen": 1937600, "step": 3355 }, { "epoch": 0.5, "eval_loss": 0.8055608868598938, "eval_runtime": 74.2058, "eval_samples_per_second": 40.212, "eval_steps_per_second": 10.053, "num_input_tokens_seen": 1938656, "step": 3357 }, { "epoch": 0.5004468275245755, "grad_norm": 0.6829037070274353, "learning_rate": 2.501489425081919e-05, "loss": 0.825, "num_input_tokens_seen": 1940192, "step": 3360 }, { "epoch": 0.5011915400655347, "grad_norm": 0.6149413585662842, "learning_rate": 2.505212987786714e-05, "loss": 0.7827, "num_input_tokens_seen": 1942912, "step": 3365 }, { "epoch": 0.5019362526064939, "grad_norm": 0.7121087312698364, "learning_rate": 2.50893655049151e-05, "loss": 0.8375, "num_input_tokens_seen": 1945504, "step": 3370 }, { "epoch": 0.5026809651474531, "grad_norm": 0.8855634331703186, "learning_rate": 2.512660113196306e-05, "loss": 0.8303, "num_input_tokens_seen": 1948512, "step": 3375 }, { "epoch": 0.5034256776884123, "grad_norm": 0.5236551761627197, "learning_rate": 2.516383675901102e-05, "loss": 0.8212, "num_input_tokens_seen": 1951392, "step": 3380 }, { "epoch": 0.5041703902293715, "grad_norm": 0.5840007066726685, "learning_rate": 2.520107238605898e-05, "loss": 0.7828, "num_input_tokens_seen": 1954272, "step": 3385 }, { "epoch": 0.5049151027703307, "grad_norm": 0.5424073338508606, "learning_rate": 2.5238308013106944e-05, "loss": 0.8129, "num_input_tokens_seen": 1957024, "step": 3390 }, { "epoch": 0.5056598153112899, "grad_norm": 0.8163644075393677, "learning_rate": 2.5275543640154904e-05, "loss": 0.8098, "num_input_tokens_seen": 1959744, "step": 3395 }, { "epoch": 0.506404527852249, "grad_norm": 0.6095085740089417, "learning_rate": 2.5312779267202864e-05, "loss": 0.8029, "num_input_tokens_seen": 1962688, "step": 3400 }, { "epoch": 0.5071492403932082, "grad_norm": 1.0964441299438477, "learning_rate": 2.5350014894250824e-05, "loss": 0.7834, "num_input_tokens_seen": 1965440, "step": 3405 }, { "epoch": 0.5078939529341674, "grad_norm": 1.452397108078003, "learning_rate": 2.5387250521298777e-05, "loss": 0.8122, "num_input_tokens_seen": 1968192, "step": 3410 }, { "epoch": 0.5086386654751266, "grad_norm": 2.3686156272888184, "learning_rate": 2.5424486148346737e-05, "loss": 0.8081, "num_input_tokens_seen": 1970912, "step": 3415 }, { "epoch": 0.5093833780160858, "grad_norm": 1.7529577016830444, "learning_rate": 2.5461721775394697e-05, "loss": 0.809, "num_input_tokens_seen": 1973984, "step": 3420 }, { "epoch": 0.510128090557045, "grad_norm": 17.87316131591797, "learning_rate": 2.5498957402442656e-05, "loss": 0.7974, "num_input_tokens_seen": 1976800, "step": 3425 }, { "epoch": 0.5108728030980042, "grad_norm": 4.985137939453125, "learning_rate": 2.553619302949062e-05, "loss": 0.8135, "num_input_tokens_seen": 1979776, "step": 3430 }, { "epoch": 0.5116175156389634, "grad_norm": 2.722606897354126, "learning_rate": 2.557342865653858e-05, "loss": 0.8615, "num_input_tokens_seen": 1982656, "step": 3435 }, { "epoch": 0.5123622281799225, "grad_norm": 1.9182782173156738, "learning_rate": 2.561066428358654e-05, "loss": 0.8901, "num_input_tokens_seen": 1985312, "step": 3440 }, { "epoch": 0.5131069407208817, "grad_norm": 2.498220205307007, "learning_rate": 2.56478999106345e-05, "loss": 0.7952, "num_input_tokens_seen": 1988384, "step": 3445 }, { "epoch": 0.5138516532618409, "grad_norm": 1.47568941116333, "learning_rate": 2.5685135537682452e-05, "loss": 0.8241, "num_input_tokens_seen": 1991296, "step": 3450 }, { "epoch": 0.5145963658028001, "grad_norm": 0.9834294319152832, "learning_rate": 2.5722371164730412e-05, "loss": 0.8016, "num_input_tokens_seen": 1993952, "step": 3455 }, { "epoch": 0.5153410783437593, "grad_norm": 0.705043375492096, "learning_rate": 2.5759606791778372e-05, "loss": 0.8226, "num_input_tokens_seen": 1997024, "step": 3460 }, { "epoch": 0.5160857908847185, "grad_norm": 0.7496423125267029, "learning_rate": 2.5796842418826332e-05, "loss": 0.8071, "num_input_tokens_seen": 1999776, "step": 3465 }, { "epoch": 0.5168305034256777, "grad_norm": 1.1889876127243042, "learning_rate": 2.5834078045874295e-05, "loss": 0.8023, "num_input_tokens_seen": 2002752, "step": 3470 }, { "epoch": 0.5175752159666369, "grad_norm": 0.6280030608177185, "learning_rate": 2.5871313672922255e-05, "loss": 0.817, "num_input_tokens_seen": 2005408, "step": 3475 }, { "epoch": 0.5183199285075961, "grad_norm": 0.6394180655479431, "learning_rate": 2.5908549299970215e-05, "loss": 0.8045, "num_input_tokens_seen": 2008320, "step": 3480 }, { "epoch": 0.5190646410485552, "grad_norm": 0.5036172270774841, "learning_rate": 2.5945784927018175e-05, "loss": 0.8066, "num_input_tokens_seen": 2011040, "step": 3485 }, { "epoch": 0.5198093535895144, "grad_norm": 0.6486870646476746, "learning_rate": 2.5983020554066135e-05, "loss": 0.7889, "num_input_tokens_seen": 2013888, "step": 3490 }, { "epoch": 0.5205540661304736, "grad_norm": 1.1932215690612793, "learning_rate": 2.6020256181114088e-05, "loss": 0.8211, "num_input_tokens_seen": 2017024, "step": 3495 }, { "epoch": 0.5212987786714328, "grad_norm": 0.7003517150878906, "learning_rate": 2.6057491808162047e-05, "loss": 0.829, "num_input_tokens_seen": 2019968, "step": 3500 }, { "epoch": 0.522043491212392, "grad_norm": 0.3790411651134491, "learning_rate": 2.609472743521001e-05, "loss": 0.803, "num_input_tokens_seen": 2023104, "step": 3505 }, { "epoch": 0.5227882037533512, "grad_norm": 0.3680810034275055, "learning_rate": 2.613196306225797e-05, "loss": 0.8072, "num_input_tokens_seen": 2026016, "step": 3510 }, { "epoch": 0.5235329162943104, "grad_norm": 0.4701647460460663, "learning_rate": 2.616919868930593e-05, "loss": 0.7893, "num_input_tokens_seen": 2028960, "step": 3515 }, { "epoch": 0.5242776288352696, "grad_norm": 0.7313083410263062, "learning_rate": 2.620643431635389e-05, "loss": 0.8002, "num_input_tokens_seen": 2032096, "step": 3520 }, { "epoch": 0.5250223413762288, "grad_norm": 0.49132803082466125, "learning_rate": 2.624366994340185e-05, "loss": 0.8123, "num_input_tokens_seen": 2034656, "step": 3525 }, { "epoch": 0.5257670539171879, "grad_norm": 1.1785504817962646, "learning_rate": 2.628090557044981e-05, "loss": 0.8248, "num_input_tokens_seen": 2037760, "step": 3530 }, { "epoch": 0.5265117664581471, "grad_norm": 0.6095307469367981, "learning_rate": 2.631814119749777e-05, "loss": 0.8127, "num_input_tokens_seen": 2041248, "step": 3535 }, { "epoch": 0.5272564789991063, "grad_norm": 1102.8228759765625, "learning_rate": 2.6355376824545723e-05, "loss": 2.403, "num_input_tokens_seen": 2044384, "step": 3540 }, { "epoch": 0.5280011915400655, "grad_norm": 0.6132003664970398, "learning_rate": 2.6392612451593686e-05, "loss": 1.2059, "num_input_tokens_seen": 2046944, "step": 3545 }, { "epoch": 0.5287459040810247, "grad_norm": 0.7468771934509277, "learning_rate": 2.6429848078641646e-05, "loss": 0.809, "num_input_tokens_seen": 2049792, "step": 3550 }, { "epoch": 0.5294906166219839, "grad_norm": 0.9423494935035706, "learning_rate": 2.6467083705689606e-05, "loss": 0.8107, "num_input_tokens_seen": 2052736, "step": 3555 }, { "epoch": 0.5302353291629431, "grad_norm": 1.0618258714675903, "learning_rate": 2.6504319332737566e-05, "loss": 0.7782, "num_input_tokens_seen": 2055616, "step": 3560 }, { "epoch": 0.5309800417039023, "grad_norm": 1.9907327890396118, "learning_rate": 2.6541554959785526e-05, "loss": 0.7957, "num_input_tokens_seen": 2058464, "step": 3565 }, { "epoch": 0.5317247542448614, "grad_norm": 1.3019931316375732, "learning_rate": 2.6578790586833485e-05, "loss": 0.789, "num_input_tokens_seen": 2061152, "step": 3570 }, { "epoch": 0.5324694667858206, "grad_norm": 1.8231931924819946, "learning_rate": 2.6616026213881445e-05, "loss": 0.7702, "num_input_tokens_seen": 2063936, "step": 3575 }, { "epoch": 0.5332141793267798, "grad_norm": 1.935977816581726, "learning_rate": 2.66532618409294e-05, "loss": 0.9007, "num_input_tokens_seen": 2066688, "step": 3580 }, { "epoch": 0.533958891867739, "grad_norm": 1.4381985664367676, "learning_rate": 2.669049746797736e-05, "loss": 0.8206, "num_input_tokens_seen": 2069664, "step": 3585 }, { "epoch": 0.5347036044086982, "grad_norm": 1.7674274444580078, "learning_rate": 2.672773309502532e-05, "loss": 0.7621, "num_input_tokens_seen": 2072672, "step": 3590 }, { "epoch": 0.5354483169496574, "grad_norm": 3.8544130325317383, "learning_rate": 2.676496872207328e-05, "loss": 0.8719, "num_input_tokens_seen": 2075776, "step": 3595 }, { "epoch": 0.5361930294906166, "grad_norm": 5.246114730834961, "learning_rate": 2.680220434912124e-05, "loss": 0.7907, "num_input_tokens_seen": 2078656, "step": 3600 }, { "epoch": 0.5369377420315758, "grad_norm": 17.62108039855957, "learning_rate": 2.68394399761692e-05, "loss": 0.8467, "num_input_tokens_seen": 2081824, "step": 3605 }, { "epoch": 0.537682454572535, "grad_norm": 7.954970359802246, "learning_rate": 2.687667560321716e-05, "loss": 0.8335, "num_input_tokens_seen": 2084672, "step": 3610 }, { "epoch": 0.5384271671134941, "grad_norm": 3.0334420204162598, "learning_rate": 2.691391123026512e-05, "loss": 0.8306, "num_input_tokens_seen": 2087296, "step": 3615 }, { "epoch": 0.5391718796544533, "grad_norm": 4.862981796264648, "learning_rate": 2.695114685731308e-05, "loss": 0.798, "num_input_tokens_seen": 2090272, "step": 3620 }, { "epoch": 0.5399165921954125, "grad_norm": 4.689752578735352, "learning_rate": 2.6988382484361037e-05, "loss": 0.7542, "num_input_tokens_seen": 2093152, "step": 3625 }, { "epoch": 0.5406613047363718, "grad_norm": 1.8441591262817383, "learning_rate": 2.7025618111408997e-05, "loss": 0.7307, "num_input_tokens_seen": 2096160, "step": 3630 }, { "epoch": 0.541406017277331, "grad_norm": 14.563486099243164, "learning_rate": 2.7062853738456957e-05, "loss": 0.9703, "num_input_tokens_seen": 2098784, "step": 3635 }, { "epoch": 0.5421507298182902, "grad_norm": 5.292904376983643, "learning_rate": 2.7100089365504917e-05, "loss": 0.7715, "num_input_tokens_seen": 2101440, "step": 3640 }, { "epoch": 0.5428954423592494, "grad_norm": 3.330639123916626, "learning_rate": 2.7137324992552876e-05, "loss": 0.7876, "num_input_tokens_seen": 2104448, "step": 3645 }, { "epoch": 0.5436401549002086, "grad_norm": 3.650770425796509, "learning_rate": 2.7174560619600836e-05, "loss": 0.8206, "num_input_tokens_seen": 2107232, "step": 3650 }, { "epoch": 0.5443848674411678, "grad_norm": 4.483633518218994, "learning_rate": 2.7211796246648796e-05, "loss": 0.7931, "num_input_tokens_seen": 2110336, "step": 3655 }, { "epoch": 0.5451295799821269, "grad_norm": 2.717928647994995, "learning_rate": 2.7249031873696756e-05, "loss": 0.8454, "num_input_tokens_seen": 2113472, "step": 3660 }, { "epoch": 0.5458742925230861, "grad_norm": 5.653355598449707, "learning_rate": 2.7286267500744716e-05, "loss": 0.7812, "num_input_tokens_seen": 2117216, "step": 3665 }, { "epoch": 0.5466190050640453, "grad_norm": 3.4093315601348877, "learning_rate": 2.7323503127792672e-05, "loss": 0.7273, "num_input_tokens_seen": 2120128, "step": 3670 }, { "epoch": 0.5473637176050045, "grad_norm": 10.064014434814453, "learning_rate": 2.7360738754840632e-05, "loss": 0.7772, "num_input_tokens_seen": 2123328, "step": 3675 }, { "epoch": 0.5481084301459637, "grad_norm": 5.327926158905029, "learning_rate": 2.7397974381888592e-05, "loss": 0.6639, "num_input_tokens_seen": 2126528, "step": 3680 }, { "epoch": 0.5488531426869229, "grad_norm": 12.3433198928833, "learning_rate": 2.7435210008936552e-05, "loss": 0.9772, "num_input_tokens_seen": 2129248, "step": 3685 }, { "epoch": 0.5495978552278821, "grad_norm": 4.564536094665527, "learning_rate": 2.7472445635984512e-05, "loss": 0.7476, "num_input_tokens_seen": 2132192, "step": 3690 }, { "epoch": 0.5503425677688413, "grad_norm": 3.693009376525879, "learning_rate": 2.750968126303247e-05, "loss": 0.7049, "num_input_tokens_seen": 2135104, "step": 3695 }, { "epoch": 0.5510872803098004, "grad_norm": 2.3152852058410645, "learning_rate": 2.754691689008043e-05, "loss": 0.7679, "num_input_tokens_seen": 2137824, "step": 3700 }, { "epoch": 0.5518319928507596, "grad_norm": 4.621946334838867, "learning_rate": 2.758415251712839e-05, "loss": 0.7013, "num_input_tokens_seen": 2140608, "step": 3705 }, { "epoch": 0.5525767053917188, "grad_norm": 2.7283501625061035, "learning_rate": 2.7621388144176348e-05, "loss": 0.8018, "num_input_tokens_seen": 2143392, "step": 3710 }, { "epoch": 0.553321417932678, "grad_norm": 3.445309638977051, "learning_rate": 2.7658623771224308e-05, "loss": 0.732, "num_input_tokens_seen": 2146432, "step": 3715 }, { "epoch": 0.5540661304736372, "grad_norm": 3.705692768096924, "learning_rate": 2.7695859398272267e-05, "loss": 0.7096, "num_input_tokens_seen": 2149248, "step": 3720 }, { "epoch": 0.5548108430145964, "grad_norm": 3.512842893600464, "learning_rate": 2.7733095025320227e-05, "loss": 0.8367, "num_input_tokens_seen": 2152352, "step": 3725 }, { "epoch": 0.5555555555555556, "grad_norm": 2.748889923095703, "learning_rate": 2.7770330652368187e-05, "loss": 0.7197, "num_input_tokens_seen": 2155264, "step": 3730 }, { "epoch": 0.5563002680965148, "grad_norm": 2.079660654067993, "learning_rate": 2.7807566279416147e-05, "loss": 0.8037, "num_input_tokens_seen": 2158336, "step": 3735 }, { "epoch": 0.557044980637474, "grad_norm": 2.470364570617676, "learning_rate": 2.7844801906464107e-05, "loss": 0.7423, "num_input_tokens_seen": 2161344, "step": 3740 }, { "epoch": 0.5577896931784331, "grad_norm": 9.370372772216797, "learning_rate": 2.7882037533512067e-05, "loss": 0.5675, "num_input_tokens_seen": 2164032, "step": 3745 }, { "epoch": 0.5585344057193923, "grad_norm": 3.267796516418457, "learning_rate": 2.7919273160560027e-05, "loss": 0.8706, "num_input_tokens_seen": 2166912, "step": 3750 }, { "epoch": 0.5592791182603515, "grad_norm": 4.548680305480957, "learning_rate": 2.7956508787607983e-05, "loss": 0.6318, "num_input_tokens_seen": 2170016, "step": 3755 }, { "epoch": 0.5600238308013107, "grad_norm": 10.702861785888672, "learning_rate": 2.7993744414655943e-05, "loss": 0.8313, "num_input_tokens_seen": 2172448, "step": 3760 }, { "epoch": 0.5607685433422699, "grad_norm": 2.1611168384552, "learning_rate": 2.8030980041703903e-05, "loss": 0.8923, "num_input_tokens_seen": 2175776, "step": 3765 }, { "epoch": 0.5615132558832291, "grad_norm": 3.3338277339935303, "learning_rate": 2.8068215668751863e-05, "loss": 0.8054, "num_input_tokens_seen": 2178560, "step": 3770 }, { "epoch": 0.5622579684241883, "grad_norm": 1.6993722915649414, "learning_rate": 2.8105451295799822e-05, "loss": 0.7923, "num_input_tokens_seen": 2181600, "step": 3775 }, { "epoch": 0.5630026809651475, "grad_norm": 2.088170051574707, "learning_rate": 2.8142686922847782e-05, "loss": 0.6438, "num_input_tokens_seen": 2184448, "step": 3780 }, { "epoch": 0.5637473935061067, "grad_norm": 3.43410062789917, "learning_rate": 2.8179922549895742e-05, "loss": 0.6635, "num_input_tokens_seen": 2187424, "step": 3785 }, { "epoch": 0.5644921060470658, "grad_norm": 2.8947925567626953, "learning_rate": 2.8217158176943702e-05, "loss": 0.6552, "num_input_tokens_seen": 2190080, "step": 3790 }, { "epoch": 0.565236818588025, "grad_norm": 4.391176700592041, "learning_rate": 2.825439380399166e-05, "loss": 0.7144, "num_input_tokens_seen": 2192800, "step": 3795 }, { "epoch": 0.5659815311289842, "grad_norm": 4.678360462188721, "learning_rate": 2.8291629431039618e-05, "loss": 0.668, "num_input_tokens_seen": 2195840, "step": 3800 }, { "epoch": 0.5667262436699434, "grad_norm": 9.265329360961914, "learning_rate": 2.8328865058087578e-05, "loss": 0.6981, "num_input_tokens_seen": 2199424, "step": 3805 }, { "epoch": 0.5674709562109026, "grad_norm": 7.722078800201416, "learning_rate": 2.8366100685135538e-05, "loss": 0.8038, "num_input_tokens_seen": 2202208, "step": 3810 }, { "epoch": 0.5682156687518618, "grad_norm": 4.276768207550049, "learning_rate": 2.8403336312183498e-05, "loss": 0.7356, "num_input_tokens_seen": 2205152, "step": 3815 }, { "epoch": 0.568960381292821, "grad_norm": 3.9779789447784424, "learning_rate": 2.8440571939231458e-05, "loss": 0.7182, "num_input_tokens_seen": 2208224, "step": 3820 }, { "epoch": 0.5697050938337802, "grad_norm": 1.9445778131484985, "learning_rate": 2.8477807566279418e-05, "loss": 0.6459, "num_input_tokens_seen": 2211296, "step": 3825 }, { "epoch": 0.5704498063747393, "grad_norm": 4.435259819030762, "learning_rate": 2.8515043193327377e-05, "loss": 0.6414, "num_input_tokens_seen": 2214176, "step": 3830 }, { "epoch": 0.5711945189156985, "grad_norm": 3.012793779373169, "learning_rate": 2.8552278820375337e-05, "loss": 0.6803, "num_input_tokens_seen": 2216512, "step": 3835 }, { "epoch": 0.5719392314566577, "grad_norm": 3.1995692253112793, "learning_rate": 2.8589514447423294e-05, "loss": 0.627, "num_input_tokens_seen": 2219392, "step": 3840 }, { "epoch": 0.5726839439976169, "grad_norm": 11.25918197631836, "learning_rate": 2.8626750074471254e-05, "loss": 0.8507, "num_input_tokens_seen": 2222368, "step": 3845 }, { "epoch": 0.5734286565385761, "grad_norm": 5.550534248352051, "learning_rate": 2.8663985701519213e-05, "loss": 0.6506, "num_input_tokens_seen": 2225472, "step": 3850 }, { "epoch": 0.5741733690795353, "grad_norm": 5.586114883422852, "learning_rate": 2.8701221328567173e-05, "loss": 0.8723, "num_input_tokens_seen": 2228416, "step": 3855 }, { "epoch": 0.5749180816204945, "grad_norm": 3.730851888656616, "learning_rate": 2.8738456955615133e-05, "loss": 0.7261, "num_input_tokens_seen": 2231104, "step": 3860 }, { "epoch": 0.5756627941614537, "grad_norm": 3.2351760864257812, "learning_rate": 2.8775692582663093e-05, "loss": 0.7642, "num_input_tokens_seen": 2233760, "step": 3865 }, { "epoch": 0.5764075067024129, "grad_norm": 2.9330697059631348, "learning_rate": 2.8812928209711053e-05, "loss": 0.7142, "num_input_tokens_seen": 2236608, "step": 3870 }, { "epoch": 0.577152219243372, "grad_norm": 2.9399430751800537, "learning_rate": 2.8850163836759013e-05, "loss": 0.7366, "num_input_tokens_seen": 2239424, "step": 3875 }, { "epoch": 0.5778969317843312, "grad_norm": 42.076805114746094, "learning_rate": 2.8887399463806976e-05, "loss": 0.8853, "num_input_tokens_seen": 2242272, "step": 3880 }, { "epoch": 0.5786416443252904, "grad_norm": 3.364065647125244, "learning_rate": 2.892463509085493e-05, "loss": 0.9069, "num_input_tokens_seen": 2245216, "step": 3885 }, { "epoch": 0.5793863568662496, "grad_norm": 2.1975982189178467, "learning_rate": 2.896187071790289e-05, "loss": 0.5945, "num_input_tokens_seen": 2248352, "step": 3890 }, { "epoch": 0.5801310694072088, "grad_norm": 9.541617393493652, "learning_rate": 2.899910634495085e-05, "loss": 0.7678, "num_input_tokens_seen": 2251008, "step": 3895 }, { "epoch": 0.580875781948168, "grad_norm": 3.6504762172698975, "learning_rate": 2.903634197199881e-05, "loss": 0.6608, "num_input_tokens_seen": 2254048, "step": 3900 }, { "epoch": 0.5816204944891272, "grad_norm": 3.582092046737671, "learning_rate": 2.907357759904677e-05, "loss": 0.7443, "num_input_tokens_seen": 2257088, "step": 3905 }, { "epoch": 0.5823652070300864, "grad_norm": 4.858114719390869, "learning_rate": 2.9110813226094728e-05, "loss": 0.6792, "num_input_tokens_seen": 2260000, "step": 3910 }, { "epoch": 0.5831099195710456, "grad_norm": 5.280096530914307, "learning_rate": 2.914804885314269e-05, "loss": 0.7799, "num_input_tokens_seen": 2262688, "step": 3915 }, { "epoch": 0.5838546321120047, "grad_norm": 4.036449909210205, "learning_rate": 2.918528448019065e-05, "loss": 0.8205, "num_input_tokens_seen": 2265536, "step": 3920 }, { "epoch": 0.5845993446529639, "grad_norm": 2.4297714233398438, "learning_rate": 2.9222520107238604e-05, "loss": 0.6774, "num_input_tokens_seen": 2268512, "step": 3925 }, { "epoch": 0.5853440571939231, "grad_norm": 2.241314649581909, "learning_rate": 2.9259755734286564e-05, "loss": 0.6587, "num_input_tokens_seen": 2271712, "step": 3930 }, { "epoch": 0.5860887697348823, "grad_norm": 3.732232093811035, "learning_rate": 2.9296991361334524e-05, "loss": 0.8712, "num_input_tokens_seen": 2274560, "step": 3935 }, { "epoch": 0.5868334822758415, "grad_norm": 2.80552339553833, "learning_rate": 2.9334226988382484e-05, "loss": 0.6763, "num_input_tokens_seen": 2277376, "step": 3940 }, { "epoch": 0.5875781948168007, "grad_norm": 4.427992343902588, "learning_rate": 2.9371462615430444e-05, "loss": 0.6969, "num_input_tokens_seen": 2279840, "step": 3945 }, { "epoch": 0.5883229073577599, "grad_norm": 1.7563631534576416, "learning_rate": 2.9408698242478404e-05, "loss": 0.8504, "num_input_tokens_seen": 2282560, "step": 3950 }, { "epoch": 0.5890676198987191, "grad_norm": 2.0398201942443848, "learning_rate": 2.9445933869526367e-05, "loss": 0.6501, "num_input_tokens_seen": 2285536, "step": 3955 }, { "epoch": 0.5898123324396782, "grad_norm": 2.9628310203552246, "learning_rate": 2.9483169496574327e-05, "loss": 0.651, "num_input_tokens_seen": 2288288, "step": 3960 }, { "epoch": 0.5905570449806374, "grad_norm": 2.4575581550598145, "learning_rate": 2.9520405123622287e-05, "loss": 0.7228, "num_input_tokens_seen": 2291040, "step": 3965 }, { "epoch": 0.5913017575215966, "grad_norm": 1.5682021379470825, "learning_rate": 2.955764075067024e-05, "loss": 0.6424, "num_input_tokens_seen": 2293792, "step": 3970 }, { "epoch": 0.5920464700625558, "grad_norm": 3.535144090652466, "learning_rate": 2.95948763777182e-05, "loss": 0.7633, "num_input_tokens_seen": 2296800, "step": 3975 }, { "epoch": 0.592791182603515, "grad_norm": 4.087905406951904, "learning_rate": 2.963211200476616e-05, "loss": 0.7857, "num_input_tokens_seen": 2299744, "step": 3980 }, { "epoch": 0.5935358951444742, "grad_norm": 2.0449836254119873, "learning_rate": 2.966934763181412e-05, "loss": 0.6181, "num_input_tokens_seen": 2302784, "step": 3985 }, { "epoch": 0.5942806076854334, "grad_norm": 2.099586248397827, "learning_rate": 2.970658325886208e-05, "loss": 0.6188, "num_input_tokens_seen": 2305632, "step": 3990 }, { "epoch": 0.5950253202263927, "grad_norm": 1.6655757427215576, "learning_rate": 2.9743818885910042e-05, "loss": 0.8434, "num_input_tokens_seen": 2308576, "step": 3995 }, { "epoch": 0.5957700327673519, "grad_norm": 1.8582634925842285, "learning_rate": 2.9781054512958002e-05, "loss": 0.7794, "num_input_tokens_seen": 2311552, "step": 4000 }, { "epoch": 0.596514745308311, "grad_norm": 2.279175043106079, "learning_rate": 2.9818290140005962e-05, "loss": 0.7391, "num_input_tokens_seen": 2314400, "step": 4005 }, { "epoch": 0.5972594578492701, "grad_norm": 4.546947002410889, "learning_rate": 2.9855525767053922e-05, "loss": 0.9084, "num_input_tokens_seen": 2317184, "step": 4010 }, { "epoch": 0.5980041703902294, "grad_norm": 3.431638717651367, "learning_rate": 2.9892761394101875e-05, "loss": 0.7568, "num_input_tokens_seen": 2320160, "step": 4015 }, { "epoch": 0.5987488829311886, "grad_norm": 4.404646396636963, "learning_rate": 2.9929997021149835e-05, "loss": 0.8545, "num_input_tokens_seen": 2322976, "step": 4020 }, { "epoch": 0.5994935954721478, "grad_norm": 2.637603998184204, "learning_rate": 2.9967232648197795e-05, "loss": 0.6787, "num_input_tokens_seen": 2325888, "step": 4025 }, { "epoch": 0.600238308013107, "grad_norm": 2.5041818618774414, "learning_rate": 3.0004468275245755e-05, "loss": 0.7274, "num_input_tokens_seen": 2328672, "step": 4030 }, { "epoch": 0.6009830205540662, "grad_norm": 3.375093460083008, "learning_rate": 3.0041703902293718e-05, "loss": 0.7174, "num_input_tokens_seen": 2331680, "step": 4035 }, { "epoch": 0.6017277330950254, "grad_norm": 1.798266053199768, "learning_rate": 3.0078939529341678e-05, "loss": 0.7083, "num_input_tokens_seen": 2334272, "step": 4040 }, { "epoch": 0.6024724456359845, "grad_norm": 3.2403717041015625, "learning_rate": 3.0116175156389638e-05, "loss": 0.7861, "num_input_tokens_seen": 2336928, "step": 4045 }, { "epoch": 0.6032171581769437, "grad_norm": 8.244916915893555, "learning_rate": 3.0153410783437597e-05, "loss": 0.5679, "num_input_tokens_seen": 2340000, "step": 4050 }, { "epoch": 0.6039618707179029, "grad_norm": 3.4395084381103516, "learning_rate": 3.019064641048555e-05, "loss": 0.6866, "num_input_tokens_seen": 2342784, "step": 4055 }, { "epoch": 0.6047065832588621, "grad_norm": 3.2450461387634277, "learning_rate": 3.022788203753351e-05, "loss": 0.8483, "num_input_tokens_seen": 2345504, "step": 4060 }, { "epoch": 0.6054512957998213, "grad_norm": 2.555310010910034, "learning_rate": 3.026511766458147e-05, "loss": 0.5815, "num_input_tokens_seen": 2348448, "step": 4065 }, { "epoch": 0.6061960083407805, "grad_norm": 4.083056449890137, "learning_rate": 3.030235329162943e-05, "loss": 0.7463, "num_input_tokens_seen": 2351264, "step": 4070 }, { "epoch": 0.6069407208817397, "grad_norm": 2.2047104835510254, "learning_rate": 3.0339588918677393e-05, "loss": 0.7816, "num_input_tokens_seen": 2353920, "step": 4075 }, { "epoch": 0.6076854334226989, "grad_norm": 4.009152889251709, "learning_rate": 3.0376824545725353e-05, "loss": 0.5994, "num_input_tokens_seen": 2356608, "step": 4080 }, { "epoch": 0.6084301459636581, "grad_norm": 1.664683222770691, "learning_rate": 3.0414060172773313e-05, "loss": 0.6605, "num_input_tokens_seen": 2359392, "step": 4085 }, { "epoch": 0.6091748585046172, "grad_norm": 6.187608242034912, "learning_rate": 3.0451295799821273e-05, "loss": 0.7971, "num_input_tokens_seen": 2362528, "step": 4090 }, { "epoch": 0.6099195710455764, "grad_norm": 1.853285551071167, "learning_rate": 3.0488531426869233e-05, "loss": 0.6758, "num_input_tokens_seen": 2365280, "step": 4095 }, { "epoch": 0.6106642835865356, "grad_norm": 3.59432315826416, "learning_rate": 3.0525767053917186e-05, "loss": 0.6738, "num_input_tokens_seen": 2368096, "step": 4100 }, { "epoch": 0.6114089961274948, "grad_norm": 2.4481608867645264, "learning_rate": 3.056300268096515e-05, "loss": 0.6434, "num_input_tokens_seen": 2370976, "step": 4105 }, { "epoch": 0.612153708668454, "grad_norm": 2.620129108428955, "learning_rate": 3.0600238308013105e-05, "loss": 0.8909, "num_input_tokens_seen": 2373888, "step": 4110 }, { "epoch": 0.6128984212094132, "grad_norm": 1.9012068510055542, "learning_rate": 3.063747393506107e-05, "loss": 0.7431, "num_input_tokens_seen": 2376704, "step": 4115 }, { "epoch": 0.6136431337503724, "grad_norm": 1.7542129755020142, "learning_rate": 3.0674709562109025e-05, "loss": 0.6988, "num_input_tokens_seen": 2379456, "step": 4120 }, { "epoch": 0.6143878462913316, "grad_norm": 1.6651134490966797, "learning_rate": 3.071194518915699e-05, "loss": 0.7473, "num_input_tokens_seen": 2382624, "step": 4125 }, { "epoch": 0.6151325588322908, "grad_norm": 1.599422574043274, "learning_rate": 3.0749180816204945e-05, "loss": 0.7691, "num_input_tokens_seen": 2385472, "step": 4130 }, { "epoch": 0.6158772713732499, "grad_norm": 2.28928542137146, "learning_rate": 3.078641644325291e-05, "loss": 0.821, "num_input_tokens_seen": 2388384, "step": 4135 }, { "epoch": 0.6166219839142091, "grad_norm": 1.8265132904052734, "learning_rate": 3.082365207030087e-05, "loss": 0.6293, "num_input_tokens_seen": 2391136, "step": 4140 }, { "epoch": 0.6173666964551683, "grad_norm": 1.7874183654785156, "learning_rate": 3.086088769734882e-05, "loss": 0.7287, "num_input_tokens_seen": 2394144, "step": 4145 }, { "epoch": 0.6181114089961275, "grad_norm": 1.7044917345046997, "learning_rate": 3.0898123324396784e-05, "loss": 0.6347, "num_input_tokens_seen": 2396992, "step": 4150 }, { "epoch": 0.6188561215370867, "grad_norm": 2.099324941635132, "learning_rate": 3.093535895144474e-05, "loss": 0.6367, "num_input_tokens_seen": 2399616, "step": 4155 }, { "epoch": 0.6196008340780459, "grad_norm": 2.3093228340148926, "learning_rate": 3.0972594578492704e-05, "loss": 0.9083, "num_input_tokens_seen": 2402272, "step": 4160 }, { "epoch": 0.6203455466190051, "grad_norm": 3.0899124145507812, "learning_rate": 3.100983020554066e-05, "loss": 0.7163, "num_input_tokens_seen": 2405120, "step": 4165 }, { "epoch": 0.6210902591599643, "grad_norm": 1.859356164932251, "learning_rate": 3.1047065832588624e-05, "loss": 0.6848, "num_input_tokens_seen": 2408096, "step": 4170 }, { "epoch": 0.6218349717009234, "grad_norm": 1.3009189367294312, "learning_rate": 3.108430145963658e-05, "loss": 0.6581, "num_input_tokens_seen": 2411072, "step": 4175 }, { "epoch": 0.6225796842418826, "grad_norm": 1.702720284461975, "learning_rate": 3.1121537086684543e-05, "loss": 0.6703, "num_input_tokens_seen": 2414080, "step": 4180 }, { "epoch": 0.6233243967828418, "grad_norm": 2.022334098815918, "learning_rate": 3.11587727137325e-05, "loss": 0.702, "num_input_tokens_seen": 2417280, "step": 4185 }, { "epoch": 0.624069109323801, "grad_norm": 3.176231861114502, "learning_rate": 3.1196008340780456e-05, "loss": 0.7883, "num_input_tokens_seen": 2420128, "step": 4190 }, { "epoch": 0.6248138218647602, "grad_norm": 2.75407075881958, "learning_rate": 3.123324396782842e-05, "loss": 0.8329, "num_input_tokens_seen": 2423136, "step": 4195 }, { "epoch": 0.6255585344057194, "grad_norm": 2.0207669734954834, "learning_rate": 3.1270479594876376e-05, "loss": 0.777, "num_input_tokens_seen": 2426144, "step": 4200 }, { "epoch": 0.6263032469466786, "grad_norm": 1.554221749305725, "learning_rate": 3.130771522192434e-05, "loss": 0.6705, "num_input_tokens_seen": 2429280, "step": 4205 }, { "epoch": 0.6270479594876378, "grad_norm": 2.4996259212493896, "learning_rate": 3.1344950848972296e-05, "loss": 0.7299, "num_input_tokens_seen": 2432000, "step": 4210 }, { "epoch": 0.627792672028597, "grad_norm": 2.4334511756896973, "learning_rate": 3.138218647602026e-05, "loss": 0.6771, "num_input_tokens_seen": 2435136, "step": 4215 }, { "epoch": 0.6285373845695561, "grad_norm": 1.8744232654571533, "learning_rate": 3.141942210306822e-05, "loss": 0.7533, "num_input_tokens_seen": 2437952, "step": 4220 }, { "epoch": 0.6292820971105153, "grad_norm": 4.057453155517578, "learning_rate": 3.145665773011618e-05, "loss": 0.8902, "num_input_tokens_seen": 2440928, "step": 4225 }, { "epoch": 0.6300268096514745, "grad_norm": 2.4414029121398926, "learning_rate": 3.1493893357164135e-05, "loss": 0.7494, "num_input_tokens_seen": 2443808, "step": 4230 }, { "epoch": 0.6307715221924337, "grad_norm": 1.3917046785354614, "learning_rate": 3.153112898421209e-05, "loss": 0.5257, "num_input_tokens_seen": 2446400, "step": 4235 }, { "epoch": 0.6315162347333929, "grad_norm": 1.6120595932006836, "learning_rate": 3.1568364611260055e-05, "loss": 0.7604, "num_input_tokens_seen": 2449376, "step": 4240 }, { "epoch": 0.6322609472743521, "grad_norm": 1.7174900770187378, "learning_rate": 3.160560023830801e-05, "loss": 0.6765, "num_input_tokens_seen": 2452256, "step": 4245 }, { "epoch": 0.6330056598153113, "grad_norm": 2.668485641479492, "learning_rate": 3.1642835865355975e-05, "loss": 0.7117, "num_input_tokens_seen": 2455040, "step": 4250 }, { "epoch": 0.6337503723562705, "grad_norm": 0.9071937203407288, "learning_rate": 3.168007149240393e-05, "loss": 0.9111, "num_input_tokens_seen": 2457952, "step": 4255 }, { "epoch": 0.6344950848972297, "grad_norm": 1.6352500915527344, "learning_rate": 3.1717307119451894e-05, "loss": 0.6411, "num_input_tokens_seen": 2461216, "step": 4260 }, { "epoch": 0.6352397974381888, "grad_norm": 2.6887853145599365, "learning_rate": 3.175454274649986e-05, "loss": 0.7378, "num_input_tokens_seen": 2464544, "step": 4265 }, { "epoch": 0.635984509979148, "grad_norm": 1.6805769205093384, "learning_rate": 3.1791778373547814e-05, "loss": 0.6937, "num_input_tokens_seen": 2467392, "step": 4270 }, { "epoch": 0.6367292225201072, "grad_norm": 2.5920727252960205, "learning_rate": 3.182901400059577e-05, "loss": 0.6715, "num_input_tokens_seen": 2470112, "step": 4275 }, { "epoch": 0.6374739350610664, "grad_norm": 3.0600218772888184, "learning_rate": 3.186624962764373e-05, "loss": 0.7028, "num_input_tokens_seen": 2472992, "step": 4280 }, { "epoch": 0.6382186476020256, "grad_norm": 2.091649293899536, "learning_rate": 3.190348525469169e-05, "loss": 0.539, "num_input_tokens_seen": 2475808, "step": 4285 }, { "epoch": 0.6389633601429848, "grad_norm": 2.157633066177368, "learning_rate": 3.1940720881739647e-05, "loss": 0.7001, "num_input_tokens_seen": 2478688, "step": 4290 }, { "epoch": 0.639708072683944, "grad_norm": 3.150380849838257, "learning_rate": 3.197795650878761e-05, "loss": 0.7709, "num_input_tokens_seen": 2481344, "step": 4295 }, { "epoch": 0.6404527852249032, "grad_norm": 5.718735218048096, "learning_rate": 3.201519213583557e-05, "loss": 0.7577, "num_input_tokens_seen": 2484064, "step": 4300 }, { "epoch": 0.6411974977658623, "grad_norm": 2.2362420558929443, "learning_rate": 3.205242776288353e-05, "loss": 0.7709, "num_input_tokens_seen": 2486784, "step": 4305 }, { "epoch": 0.6419422103068215, "grad_norm": 3.069357395172119, "learning_rate": 3.208966338993149e-05, "loss": 0.5875, "num_input_tokens_seen": 2489568, "step": 4310 }, { "epoch": 0.6426869228477807, "grad_norm": 1.6602222919464111, "learning_rate": 3.212689901697944e-05, "loss": 0.8013, "num_input_tokens_seen": 2492512, "step": 4315 }, { "epoch": 0.6434316353887399, "grad_norm": 2.9729158878326416, "learning_rate": 3.2164134644027406e-05, "loss": 0.6744, "num_input_tokens_seen": 2495296, "step": 4320 }, { "epoch": 0.6441763479296991, "grad_norm": 2.6362102031707764, "learning_rate": 3.220137027107536e-05, "loss": 0.7392, "num_input_tokens_seen": 2498208, "step": 4325 }, { "epoch": 0.6449210604706583, "grad_norm": 1.6668591499328613, "learning_rate": 3.2238605898123325e-05, "loss": 0.7204, "num_input_tokens_seen": 2500800, "step": 4330 }, { "epoch": 0.6456657730116175, "grad_norm": 2.01719331741333, "learning_rate": 3.227584152517129e-05, "loss": 0.715, "num_input_tokens_seen": 2503488, "step": 4335 }, { "epoch": 0.6464104855525767, "grad_norm": 1.8339567184448242, "learning_rate": 3.2313077152219245e-05, "loss": 0.7227, "num_input_tokens_seen": 2506272, "step": 4340 }, { "epoch": 0.6471551980935359, "grad_norm": 2.3753726482391357, "learning_rate": 3.235031277926721e-05, "loss": 0.5788, "num_input_tokens_seen": 2508992, "step": 4345 }, { "epoch": 0.647899910634495, "grad_norm": 3.4508471488952637, "learning_rate": 3.2387548406315165e-05, "loss": 0.7894, "num_input_tokens_seen": 2511488, "step": 4350 }, { "epoch": 0.6486446231754542, "grad_norm": 4.6409592628479, "learning_rate": 3.242478403336313e-05, "loss": 0.6593, "num_input_tokens_seen": 2514432, "step": 4355 }, { "epoch": 0.6493893357164134, "grad_norm": 2.4849321842193604, "learning_rate": 3.246201966041108e-05, "loss": 0.7786, "num_input_tokens_seen": 2517664, "step": 4360 }, { "epoch": 0.6501340482573726, "grad_norm": 5.699634552001953, "learning_rate": 3.249925528745904e-05, "loss": 0.7205, "num_input_tokens_seen": 2520608, "step": 4365 }, { "epoch": 0.6508787607983318, "grad_norm": 2.872251033782959, "learning_rate": 3.2536490914507e-05, "loss": 0.7379, "num_input_tokens_seen": 2523520, "step": 4370 }, { "epoch": 0.651623473339291, "grad_norm": 2.866150140762329, "learning_rate": 3.257372654155496e-05, "loss": 0.7312, "num_input_tokens_seen": 2526592, "step": 4375 }, { "epoch": 0.6523681858802503, "grad_norm": 2.686760663986206, "learning_rate": 3.2610962168602924e-05, "loss": 0.7325, "num_input_tokens_seen": 2529728, "step": 4380 }, { "epoch": 0.6531128984212095, "grad_norm": 12.188611030578613, "learning_rate": 3.264819779565088e-05, "loss": 0.7858, "num_input_tokens_seen": 2532544, "step": 4385 }, { "epoch": 0.6538576109621687, "grad_norm": 1.941419243812561, "learning_rate": 3.2685433422698844e-05, "loss": 0.7766, "num_input_tokens_seen": 2535328, "step": 4390 }, { "epoch": 0.6546023235031277, "grad_norm": 7.1204047203063965, "learning_rate": 3.27226690497468e-05, "loss": 0.7692, "num_input_tokens_seen": 2538240, "step": 4395 }, { "epoch": 0.655347036044087, "grad_norm": 5.304942607879639, "learning_rate": 3.275990467679476e-05, "loss": 0.7203, "num_input_tokens_seen": 2540960, "step": 4400 }, { "epoch": 0.6560917485850462, "grad_norm": 6.07877779006958, "learning_rate": 3.279714030384271e-05, "loss": 0.7368, "num_input_tokens_seen": 2543776, "step": 4405 }, { "epoch": 0.6568364611260054, "grad_norm": 3.051316499710083, "learning_rate": 3.2834375930890676e-05, "loss": 0.7714, "num_input_tokens_seen": 2546624, "step": 4410 }, { "epoch": 0.6575811736669646, "grad_norm": 2.00909161567688, "learning_rate": 3.287161155793864e-05, "loss": 0.685, "num_input_tokens_seen": 2549408, "step": 4415 }, { "epoch": 0.6583258862079238, "grad_norm": 2.005913734436035, "learning_rate": 3.2908847184986596e-05, "loss": 0.691, "num_input_tokens_seen": 2552384, "step": 4420 }, { "epoch": 0.659070598748883, "grad_norm": 2.854975938796997, "learning_rate": 3.294608281203456e-05, "loss": 0.8017, "num_input_tokens_seen": 2555200, "step": 4425 }, { "epoch": 0.6598153112898422, "grad_norm": 1.068353533744812, "learning_rate": 3.2983318439082516e-05, "loss": 0.7271, "num_input_tokens_seen": 2558176, "step": 4430 }, { "epoch": 0.6605600238308013, "grad_norm": 1.8336362838745117, "learning_rate": 3.302055406613048e-05, "loss": 0.7585, "num_input_tokens_seen": 2561248, "step": 4435 }, { "epoch": 0.6613047363717605, "grad_norm": 1.4723763465881348, "learning_rate": 3.3057789693178435e-05, "loss": 0.7144, "num_input_tokens_seen": 2564384, "step": 4440 }, { "epoch": 0.6620494489127197, "grad_norm": 2.048783540725708, "learning_rate": 3.309502532022639e-05, "loss": 0.6774, "num_input_tokens_seen": 2567008, "step": 4445 }, { "epoch": 0.6627941614536789, "grad_norm": 1.9923168420791626, "learning_rate": 3.313226094727435e-05, "loss": 0.8689, "num_input_tokens_seen": 2569952, "step": 4450 }, { "epoch": 0.6635388739946381, "grad_norm": 3.098649740219116, "learning_rate": 3.316949657432231e-05, "loss": 0.706, "num_input_tokens_seen": 2572832, "step": 4455 }, { "epoch": 0.6642835865355973, "grad_norm": 1.8793524503707886, "learning_rate": 3.3206732201370275e-05, "loss": 0.6171, "num_input_tokens_seen": 2575584, "step": 4460 }, { "epoch": 0.6650282990765565, "grad_norm": 2.2193846702575684, "learning_rate": 3.324396782841823e-05, "loss": 0.7488, "num_input_tokens_seen": 2578656, "step": 4465 }, { "epoch": 0.6657730116175157, "grad_norm": 3.8080508708953857, "learning_rate": 3.3281203455466195e-05, "loss": 0.6083, "num_input_tokens_seen": 2581472, "step": 4470 }, { "epoch": 0.6665177241584749, "grad_norm": 5.7078752517700195, "learning_rate": 3.331843908251415e-05, "loss": 0.7682, "num_input_tokens_seen": 2584384, "step": 4475 }, { "epoch": 0.667262436699434, "grad_norm": 3.006718635559082, "learning_rate": 3.3355674709562114e-05, "loss": 0.6685, "num_input_tokens_seen": 2587136, "step": 4480 }, { "epoch": 0.6680071492403932, "grad_norm": 8.78566837310791, "learning_rate": 3.339291033661007e-05, "loss": 0.8553, "num_input_tokens_seen": 2590176, "step": 4485 }, { "epoch": 0.6687518617813524, "grad_norm": 3.612929344177246, "learning_rate": 3.343014596365803e-05, "loss": 0.7288, "num_input_tokens_seen": 2592896, "step": 4490 }, { "epoch": 0.6694965743223116, "grad_norm": 2.964153289794922, "learning_rate": 3.346738159070599e-05, "loss": 0.6581, "num_input_tokens_seen": 2595616, "step": 4495 }, { "epoch": 0.6702412868632708, "grad_norm": 4.209015369415283, "learning_rate": 3.350461721775395e-05, "loss": 0.8386, "num_input_tokens_seen": 2598528, "step": 4500 }, { "epoch": 0.67098599940423, "grad_norm": 1.5127872228622437, "learning_rate": 3.354185284480191e-05, "loss": 0.8396, "num_input_tokens_seen": 2601440, "step": 4505 }, { "epoch": 0.6717307119451892, "grad_norm": 1.1308081150054932, "learning_rate": 3.3579088471849867e-05, "loss": 0.7021, "num_input_tokens_seen": 2604480, "step": 4510 }, { "epoch": 0.6724754244861484, "grad_norm": 2.305739402770996, "learning_rate": 3.361632409889783e-05, "loss": 0.7315, "num_input_tokens_seen": 2607456, "step": 4515 }, { "epoch": 0.6732201370271076, "grad_norm": 1.5438790321350098, "learning_rate": 3.3653559725945786e-05, "loss": 0.7012, "num_input_tokens_seen": 2610304, "step": 4520 }, { "epoch": 0.6739648495680667, "grad_norm": 1.1950773000717163, "learning_rate": 3.369079535299375e-05, "loss": 0.6915, "num_input_tokens_seen": 2612960, "step": 4525 }, { "epoch": 0.6747095621090259, "grad_norm": 1.4812208414077759, "learning_rate": 3.3728030980041706e-05, "loss": 0.6026, "num_input_tokens_seen": 2615424, "step": 4530 }, { "epoch": 0.6754542746499851, "grad_norm": 1.6473627090454102, "learning_rate": 3.376526660708966e-05, "loss": 0.7894, "num_input_tokens_seen": 2618496, "step": 4535 }, { "epoch": 0.6761989871909443, "grad_norm": 1.9238778352737427, "learning_rate": 3.3802502234137626e-05, "loss": 0.7653, "num_input_tokens_seen": 2621280, "step": 4540 }, { "epoch": 0.6769436997319035, "grad_norm": 1.33930504322052, "learning_rate": 3.383973786118558e-05, "loss": 0.7211, "num_input_tokens_seen": 2624000, "step": 4545 }, { "epoch": 0.6776884122728627, "grad_norm": 1.5027267932891846, "learning_rate": 3.3876973488233545e-05, "loss": 0.6447, "num_input_tokens_seen": 2626976, "step": 4550 }, { "epoch": 0.6784331248138219, "grad_norm": 1.8359384536743164, "learning_rate": 3.39142091152815e-05, "loss": 0.6497, "num_input_tokens_seen": 2629824, "step": 4555 }, { "epoch": 0.6791778373547811, "grad_norm": 2.556976556777954, "learning_rate": 3.3951444742329465e-05, "loss": 0.7676, "num_input_tokens_seen": 2632608, "step": 4560 }, { "epoch": 0.6799225498957402, "grad_norm": 1.8122296333312988, "learning_rate": 3.398868036937742e-05, "loss": 0.6547, "num_input_tokens_seen": 2635616, "step": 4565 }, { "epoch": 0.6806672624366994, "grad_norm": 2.1858298778533936, "learning_rate": 3.4025915996425385e-05, "loss": 0.742, "num_input_tokens_seen": 2638848, "step": 4570 }, { "epoch": 0.6814119749776586, "grad_norm": 1.3080962896347046, "learning_rate": 3.406315162347334e-05, "loss": 0.7296, "num_input_tokens_seen": 2641664, "step": 4575 }, { "epoch": 0.6821566875186178, "grad_norm": 1.7829110622406006, "learning_rate": 3.41003872505213e-05, "loss": 0.8011, "num_input_tokens_seen": 2644864, "step": 4580 }, { "epoch": 0.682901400059577, "grad_norm": 1.8766591548919678, "learning_rate": 3.413762287756926e-05, "loss": 0.8507, "num_input_tokens_seen": 2647840, "step": 4585 }, { "epoch": 0.6836461126005362, "grad_norm": 1.659217119216919, "learning_rate": 3.417485850461722e-05, "loss": 0.7155, "num_input_tokens_seen": 2650848, "step": 4590 }, { "epoch": 0.6843908251414954, "grad_norm": 1.2556889057159424, "learning_rate": 3.421209413166518e-05, "loss": 0.716, "num_input_tokens_seen": 2653536, "step": 4595 }, { "epoch": 0.6851355376824546, "grad_norm": 1.1422916650772095, "learning_rate": 3.424932975871314e-05, "loss": 0.6634, "num_input_tokens_seen": 2656576, "step": 4600 }, { "epoch": 0.6858802502234138, "grad_norm": 1.4572055339813232, "learning_rate": 3.42865653857611e-05, "loss": 0.7074, "num_input_tokens_seen": 2659776, "step": 4605 }, { "epoch": 0.6866249627643729, "grad_norm": 3.249134063720703, "learning_rate": 3.432380101280906e-05, "loss": 0.7506, "num_input_tokens_seen": 2662720, "step": 4610 }, { "epoch": 0.6873696753053321, "grad_norm": 1.7143617868423462, "learning_rate": 3.436103663985702e-05, "loss": 0.6584, "num_input_tokens_seen": 2665440, "step": 4615 }, { "epoch": 0.6881143878462913, "grad_norm": 1.9431885480880737, "learning_rate": 3.4398272266904977e-05, "loss": 0.7512, "num_input_tokens_seen": 2668224, "step": 4620 }, { "epoch": 0.6888591003872505, "grad_norm": 1.9278955459594727, "learning_rate": 3.443550789395293e-05, "loss": 0.6636, "num_input_tokens_seen": 2671040, "step": 4625 }, { "epoch": 0.6896038129282097, "grad_norm": 4.776057243347168, "learning_rate": 3.4472743521000896e-05, "loss": 0.8123, "num_input_tokens_seen": 2673792, "step": 4630 }, { "epoch": 0.6903485254691689, "grad_norm": 1.8112986087799072, "learning_rate": 3.450997914804885e-05, "loss": 0.741, "num_input_tokens_seen": 2676672, "step": 4635 }, { "epoch": 0.6910932380101281, "grad_norm": 8.301994323730469, "learning_rate": 3.4547214775096816e-05, "loss": 0.5918, "num_input_tokens_seen": 2679424, "step": 4640 }, { "epoch": 0.6918379505510873, "grad_norm": 13.495983123779297, "learning_rate": 3.458445040214477e-05, "loss": 0.8578, "num_input_tokens_seen": 2682240, "step": 4645 }, { "epoch": 0.6925826630920465, "grad_norm": 3.4022719860076904, "learning_rate": 3.4621686029192736e-05, "loss": 0.8188, "num_input_tokens_seen": 2685376, "step": 4650 }, { "epoch": 0.6933273756330056, "grad_norm": 2.5506439208984375, "learning_rate": 3.465892165624069e-05, "loss": 0.6689, "num_input_tokens_seen": 2687968, "step": 4655 }, { "epoch": 0.6940720881739648, "grad_norm": 1.3855009078979492, "learning_rate": 3.4696157283288655e-05, "loss": 0.7532, "num_input_tokens_seen": 2690624, "step": 4660 }, { "epoch": 0.694816800714924, "grad_norm": 1.4539896249771118, "learning_rate": 3.473339291033661e-05, "loss": 0.7169, "num_input_tokens_seen": 2693664, "step": 4665 }, { "epoch": 0.6955615132558832, "grad_norm": 1.955851435661316, "learning_rate": 3.477062853738457e-05, "loss": 0.7857, "num_input_tokens_seen": 2696576, "step": 4670 }, { "epoch": 0.6963062257968424, "grad_norm": 2.5104830265045166, "learning_rate": 3.480786416443253e-05, "loss": 0.7027, "num_input_tokens_seen": 2699488, "step": 4675 }, { "epoch": 0.6970509383378016, "grad_norm": 4.52328634262085, "learning_rate": 3.484509979148049e-05, "loss": 0.7812, "num_input_tokens_seen": 2702336, "step": 4680 }, { "epoch": 0.6977956508787608, "grad_norm": 3.062587022781372, "learning_rate": 3.488233541852845e-05, "loss": 0.7525, "num_input_tokens_seen": 2705152, "step": 4685 }, { "epoch": 0.69854036341972, "grad_norm": 4.6319050788879395, "learning_rate": 3.491957104557641e-05, "loss": 0.7198, "num_input_tokens_seen": 2708096, "step": 4690 }, { "epoch": 0.6992850759606791, "grad_norm": 3.343735933303833, "learning_rate": 3.495680667262437e-05, "loss": 0.6827, "num_input_tokens_seen": 2710944, "step": 4695 }, { "epoch": 0.7000297885016383, "grad_norm": 1.7878892421722412, "learning_rate": 3.499404229967233e-05, "loss": 0.7331, "num_input_tokens_seen": 2713664, "step": 4700 }, { "epoch": 0.7007745010425975, "grad_norm": 0.9050880670547485, "learning_rate": 3.5031277926720284e-05, "loss": 0.6231, "num_input_tokens_seen": 2716896, "step": 4705 }, { "epoch": 0.7015192135835567, "grad_norm": 2.092177152633667, "learning_rate": 3.506851355376825e-05, "loss": 0.5415, "num_input_tokens_seen": 2719584, "step": 4710 }, { "epoch": 0.7022639261245159, "grad_norm": 1.8758089542388916, "learning_rate": 3.5105749180816204e-05, "loss": 0.7384, "num_input_tokens_seen": 2722304, "step": 4715 }, { "epoch": 0.7030086386654751, "grad_norm": 1.2446789741516113, "learning_rate": 3.514298480786417e-05, "loss": 0.7865, "num_input_tokens_seen": 2724992, "step": 4720 }, { "epoch": 0.7037533512064343, "grad_norm": 2.3420915603637695, "learning_rate": 3.518022043491212e-05, "loss": 0.8204, "num_input_tokens_seen": 2727648, "step": 4725 }, { "epoch": 0.7044980637473935, "grad_norm": 0.9115088582038879, "learning_rate": 3.5217456061960087e-05, "loss": 0.7808, "num_input_tokens_seen": 2730560, "step": 4730 }, { "epoch": 0.7052427762883527, "grad_norm": 0.9919354319572449, "learning_rate": 3.525469168900804e-05, "loss": 0.7229, "num_input_tokens_seen": 2733440, "step": 4735 }, { "epoch": 0.7059874888293118, "grad_norm": 1.3919031620025635, "learning_rate": 3.5291927316056006e-05, "loss": 0.7768, "num_input_tokens_seen": 2736320, "step": 4740 }, { "epoch": 0.706732201370271, "grad_norm": 1.1742911338806152, "learning_rate": 3.532916294310397e-05, "loss": 0.801, "num_input_tokens_seen": 2739136, "step": 4745 }, { "epoch": 0.7074769139112302, "grad_norm": 1.9819329977035522, "learning_rate": 3.536639857015192e-05, "loss": 0.7386, "num_input_tokens_seen": 2742176, "step": 4750 }, { "epoch": 0.7082216264521894, "grad_norm": 2.0946247577667236, "learning_rate": 3.540363419719988e-05, "loss": 0.6704, "num_input_tokens_seen": 2745056, "step": 4755 }, { "epoch": 0.7089663389931486, "grad_norm": 0.9994359016418457, "learning_rate": 3.544086982424784e-05, "loss": 0.7491, "num_input_tokens_seen": 2748128, "step": 4760 }, { "epoch": 0.7097110515341079, "grad_norm": 2.0525288581848145, "learning_rate": 3.54781054512958e-05, "loss": 0.7492, "num_input_tokens_seen": 2751008, "step": 4765 }, { "epoch": 0.710455764075067, "grad_norm": 2.123462200164795, "learning_rate": 3.551534107834376e-05, "loss": 0.5936, "num_input_tokens_seen": 2754016, "step": 4770 }, { "epoch": 0.7112004766160263, "grad_norm": 1.7093950510025024, "learning_rate": 3.555257670539172e-05, "loss": 0.6392, "num_input_tokens_seen": 2757248, "step": 4775 }, { "epoch": 0.7119451891569855, "grad_norm": 2.111032724380493, "learning_rate": 3.558981233243968e-05, "loss": 0.7411, "num_input_tokens_seen": 2760128, "step": 4780 }, { "epoch": 0.7126899016979446, "grad_norm": 2.285285711288452, "learning_rate": 3.562704795948764e-05, "loss": 0.8021, "num_input_tokens_seen": 2763136, "step": 4785 }, { "epoch": 0.7134346142389038, "grad_norm": 2.2804458141326904, "learning_rate": 3.56642835865356e-05, "loss": 0.733, "num_input_tokens_seen": 2765856, "step": 4790 }, { "epoch": 0.714179326779863, "grad_norm": 2.7717485427856445, "learning_rate": 3.5701519213583554e-05, "loss": 0.5814, "num_input_tokens_seen": 2768864, "step": 4795 }, { "epoch": 0.7149240393208222, "grad_norm": 1.3271077871322632, "learning_rate": 3.573875484063152e-05, "loss": 0.5802, "num_input_tokens_seen": 2771520, "step": 4800 }, { "epoch": 0.7156687518617814, "grad_norm": 1.6865609884262085, "learning_rate": 3.5775990467679474e-05, "loss": 0.6339, "num_input_tokens_seen": 2774560, "step": 4805 }, { "epoch": 0.7164134644027406, "grad_norm": 2.0933666229248047, "learning_rate": 3.581322609472744e-05, "loss": 0.7092, "num_input_tokens_seen": 2777312, "step": 4810 }, { "epoch": 0.7171581769436998, "grad_norm": 1.8708161115646362, "learning_rate": 3.5850461721775394e-05, "loss": 0.6708, "num_input_tokens_seen": 2780128, "step": 4815 }, { "epoch": 0.717902889484659, "grad_norm": 0.9172950983047485, "learning_rate": 3.588769734882336e-05, "loss": 0.697, "num_input_tokens_seen": 2783200, "step": 4820 }, { "epoch": 0.7186476020256181, "grad_norm": 2.178344964981079, "learning_rate": 3.592493297587132e-05, "loss": 0.6567, "num_input_tokens_seen": 2786048, "step": 4825 }, { "epoch": 0.7193923145665773, "grad_norm": 2.639676094055176, "learning_rate": 3.596216860291928e-05, "loss": 0.788, "num_input_tokens_seen": 2788864, "step": 4830 }, { "epoch": 0.7201370271075365, "grad_norm": 3.585726261138916, "learning_rate": 3.599940422996723e-05, "loss": 0.8028, "num_input_tokens_seen": 2791552, "step": 4835 }, { "epoch": 0.7208817396484957, "grad_norm": 1.958382248878479, "learning_rate": 3.603663985701519e-05, "loss": 0.6403, "num_input_tokens_seen": 2794528, "step": 4840 }, { "epoch": 0.7216264521894549, "grad_norm": 1.5913594961166382, "learning_rate": 3.607387548406315e-05, "loss": 0.697, "num_input_tokens_seen": 2797280, "step": 4845 }, { "epoch": 0.7223711647304141, "grad_norm": 1.5996863842010498, "learning_rate": 3.611111111111111e-05, "loss": 0.7504, "num_input_tokens_seen": 2799872, "step": 4850 }, { "epoch": 0.7231158772713733, "grad_norm": 2.9146175384521484, "learning_rate": 3.614834673815907e-05, "loss": 0.7559, "num_input_tokens_seen": 2802560, "step": 4855 }, { "epoch": 0.7238605898123325, "grad_norm": 2.1953251361846924, "learning_rate": 3.618558236520703e-05, "loss": 0.7494, "num_input_tokens_seen": 2805536, "step": 4860 }, { "epoch": 0.7246053023532917, "grad_norm": 1.1150836944580078, "learning_rate": 3.622281799225499e-05, "loss": 0.8183, "num_input_tokens_seen": 2808576, "step": 4865 }, { "epoch": 0.7253500148942508, "grad_norm": 3.815092086791992, "learning_rate": 3.6260053619302956e-05, "loss": 0.6468, "num_input_tokens_seen": 2811680, "step": 4870 }, { "epoch": 0.72609472743521, "grad_norm": 1.7576926946640015, "learning_rate": 3.629728924635091e-05, "loss": 0.6152, "num_input_tokens_seen": 2814528, "step": 4875 }, { "epoch": 0.7268394399761692, "grad_norm": 2.372847080230713, "learning_rate": 3.633452487339887e-05, "loss": 0.7942, "num_input_tokens_seen": 2817056, "step": 4880 }, { "epoch": 0.7275841525171284, "grad_norm": 1.9237953424453735, "learning_rate": 3.6371760500446825e-05, "loss": 0.6934, "num_input_tokens_seen": 2820032, "step": 4885 }, { "epoch": 0.7283288650580876, "grad_norm": 1.7690684795379639, "learning_rate": 3.640899612749479e-05, "loss": 0.8595, "num_input_tokens_seen": 2822752, "step": 4890 }, { "epoch": 0.7290735775990468, "grad_norm": 1.5262857675552368, "learning_rate": 3.6446231754542745e-05, "loss": 0.5606, "num_input_tokens_seen": 2825568, "step": 4895 }, { "epoch": 0.729818290140006, "grad_norm": 1.5617831945419312, "learning_rate": 3.648346738159071e-05, "loss": 0.5269, "num_input_tokens_seen": 2828576, "step": 4900 }, { "epoch": 0.7305630026809652, "grad_norm": 5.11824369430542, "learning_rate": 3.652070300863867e-05, "loss": 0.7011, "num_input_tokens_seen": 2831360, "step": 4905 }, { "epoch": 0.7313077152219244, "grad_norm": 5.708032131195068, "learning_rate": 3.655793863568663e-05, "loss": 1.099, "num_input_tokens_seen": 2833952, "step": 4910 }, { "epoch": 0.7320524277628835, "grad_norm": 7.0457282066345215, "learning_rate": 3.659517426273459e-05, "loss": 0.7857, "num_input_tokens_seen": 2837280, "step": 4915 }, { "epoch": 0.7327971403038427, "grad_norm": 2.6591923236846924, "learning_rate": 3.663240988978254e-05, "loss": 0.7273, "num_input_tokens_seen": 2840000, "step": 4920 }, { "epoch": 0.7335418528448019, "grad_norm": 2.988408327102661, "learning_rate": 3.6669645516830504e-05, "loss": 0.7457, "num_input_tokens_seen": 2842912, "step": 4925 }, { "epoch": 0.7342865653857611, "grad_norm": 2.425013303756714, "learning_rate": 3.670688114387846e-05, "loss": 0.8084, "num_input_tokens_seen": 2845952, "step": 4930 }, { "epoch": 0.7350312779267203, "grad_norm": 1.598290205001831, "learning_rate": 3.6744116770926424e-05, "loss": 0.7172, "num_input_tokens_seen": 2848736, "step": 4935 }, { "epoch": 0.7357759904676795, "grad_norm": 3.039388418197632, "learning_rate": 3.678135239797439e-05, "loss": 0.614, "num_input_tokens_seen": 2851616, "step": 4940 }, { "epoch": 0.7365207030086387, "grad_norm": 3.6768651008605957, "learning_rate": 3.681858802502234e-05, "loss": 0.8591, "num_input_tokens_seen": 2854368, "step": 4945 }, { "epoch": 0.7372654155495979, "grad_norm": 3.0328404903411865, "learning_rate": 3.6855823652070307e-05, "loss": 0.789, "num_input_tokens_seen": 2857312, "step": 4950 }, { "epoch": 0.738010128090557, "grad_norm": 2.5435564517974854, "learning_rate": 3.689305927911826e-05, "loss": 0.6065, "num_input_tokens_seen": 2860320, "step": 4955 }, { "epoch": 0.7387548406315162, "grad_norm": 2.244680166244507, "learning_rate": 3.6930294906166226e-05, "loss": 0.6358, "num_input_tokens_seen": 2863072, "step": 4960 }, { "epoch": 0.7394995531724754, "grad_norm": 1.6963924169540405, "learning_rate": 3.6967530533214176e-05, "loss": 0.7943, "num_input_tokens_seen": 2866112, "step": 4965 }, { "epoch": 0.7402442657134346, "grad_norm": 5.962052345275879, "learning_rate": 3.700476616026214e-05, "loss": 0.8172, "num_input_tokens_seen": 2868960, "step": 4970 }, { "epoch": 0.7409889782543938, "grad_norm": 2.5059778690338135, "learning_rate": 3.7042001787310096e-05, "loss": 0.8055, "num_input_tokens_seen": 2871680, "step": 4975 }, { "epoch": 0.741733690795353, "grad_norm": 1.8836216926574707, "learning_rate": 3.707923741435806e-05, "loss": 0.7117, "num_input_tokens_seen": 2874496, "step": 4980 }, { "epoch": 0.7424784033363122, "grad_norm": 2.679142475128174, "learning_rate": 3.711647304140602e-05, "loss": 0.8237, "num_input_tokens_seen": 2877536, "step": 4985 }, { "epoch": 0.7432231158772714, "grad_norm": 1.187999963760376, "learning_rate": 3.715370866845398e-05, "loss": 0.8033, "num_input_tokens_seen": 2880480, "step": 4990 }, { "epoch": 0.7439678284182306, "grad_norm": 1.408771276473999, "learning_rate": 3.719094429550194e-05, "loss": 0.6877, "num_input_tokens_seen": 2883296, "step": 4995 }, { "epoch": 0.7447125409591897, "grad_norm": 3.6272659301757812, "learning_rate": 3.72281799225499e-05, "loss": 0.7637, "num_input_tokens_seen": 2885824, "step": 5000 }, { "epoch": 0.7454572535001489, "grad_norm": 2.3738913536071777, "learning_rate": 3.726541554959786e-05, "loss": 0.8025, "num_input_tokens_seen": 2888448, "step": 5005 }, { "epoch": 0.7462019660411081, "grad_norm": 4.551836013793945, "learning_rate": 3.730265117664581e-05, "loss": 0.8559, "num_input_tokens_seen": 2891104, "step": 5010 }, { "epoch": 0.7469466785820673, "grad_norm": 3.0948243141174316, "learning_rate": 3.7339886803693774e-05, "loss": 0.7513, "num_input_tokens_seen": 2893888, "step": 5015 }, { "epoch": 0.7476913911230265, "grad_norm": 2.1033239364624023, "learning_rate": 3.737712243074174e-05, "loss": 0.661, "num_input_tokens_seen": 2896192, "step": 5020 }, { "epoch": 0.7484361036639857, "grad_norm": 2.912191152572632, "learning_rate": 3.7414358057789694e-05, "loss": 0.707, "num_input_tokens_seen": 2899008, "step": 5025 }, { "epoch": 0.7491808162049449, "grad_norm": 1.9227722883224487, "learning_rate": 3.745159368483766e-05, "loss": 0.7126, "num_input_tokens_seen": 2901920, "step": 5030 }, { "epoch": 0.7499255287459041, "grad_norm": 3.3876073360443115, "learning_rate": 3.7488829311885614e-05, "loss": 0.7446, "num_input_tokens_seen": 2905184, "step": 5035 }, { "epoch": 0.7506702412868632, "grad_norm": 2.9232277870178223, "learning_rate": 3.752606493893358e-05, "loss": 0.6707, "num_input_tokens_seen": 2908000, "step": 5040 }, { "epoch": 0.7514149538278224, "grad_norm": 4.412450313568115, "learning_rate": 3.7563300565981534e-05, "loss": 0.6768, "num_input_tokens_seen": 2910720, "step": 5045 }, { "epoch": 0.7521596663687816, "grad_norm": 2.2973456382751465, "learning_rate": 3.760053619302949e-05, "loss": 0.793, "num_input_tokens_seen": 2913728, "step": 5050 }, { "epoch": 0.7529043789097408, "grad_norm": 2.979801654815674, "learning_rate": 3.7637771820077446e-05, "loss": 0.7528, "num_input_tokens_seen": 2916352, "step": 5055 }, { "epoch": 0.7536490914507, "grad_norm": 3.1152215003967285, "learning_rate": 3.767500744712541e-05, "loss": 0.7118, "num_input_tokens_seen": 2919168, "step": 5060 }, { "epoch": 0.7543938039916592, "grad_norm": 2.890239715576172, "learning_rate": 3.771224307417337e-05, "loss": 0.6968, "num_input_tokens_seen": 2921696, "step": 5065 }, { "epoch": 0.7551385165326184, "grad_norm": 2.4004409313201904, "learning_rate": 3.774947870122133e-05, "loss": 0.7294, "num_input_tokens_seen": 2924544, "step": 5070 }, { "epoch": 0.7558832290735776, "grad_norm": 5.609859943389893, "learning_rate": 3.778671432826929e-05, "loss": 1.0311, "num_input_tokens_seen": 2927520, "step": 5075 }, { "epoch": 0.7566279416145368, "grad_norm": 2.282534599304199, "learning_rate": 3.782394995531725e-05, "loss": 0.6987, "num_input_tokens_seen": 2930560, "step": 5080 }, { "epoch": 0.7573726541554959, "grad_norm": 1.3410085439682007, "learning_rate": 3.786118558236521e-05, "loss": 0.7144, "num_input_tokens_seen": 2933088, "step": 5085 }, { "epoch": 0.7581173666964551, "grad_norm": 1.1934287548065186, "learning_rate": 3.789842120941317e-05, "loss": 0.6084, "num_input_tokens_seen": 2935808, "step": 5090 }, { "epoch": 0.7588620792374143, "grad_norm": 1.3125377893447876, "learning_rate": 3.7935656836461125e-05, "loss": 0.7404, "num_input_tokens_seen": 2938624, "step": 5095 }, { "epoch": 0.7596067917783735, "grad_norm": 1.6135601997375488, "learning_rate": 3.797289246350909e-05, "loss": 0.6206, "num_input_tokens_seen": 2941600, "step": 5100 }, { "epoch": 0.7603515043193327, "grad_norm": 2.732614040374756, "learning_rate": 3.8010128090557045e-05, "loss": 0.7126, "num_input_tokens_seen": 2944576, "step": 5105 }, { "epoch": 0.7610962168602919, "grad_norm": 1.7766684293746948, "learning_rate": 3.804736371760501e-05, "loss": 0.664, "num_input_tokens_seen": 2947424, "step": 5110 }, { "epoch": 0.7618409294012511, "grad_norm": 1.4261608123779297, "learning_rate": 3.8084599344652965e-05, "loss": 0.7014, "num_input_tokens_seen": 2950336, "step": 5115 }, { "epoch": 0.7625856419422103, "grad_norm": 2.5191490650177, "learning_rate": 3.812183497170093e-05, "loss": 0.7266, "num_input_tokens_seen": 2952992, "step": 5120 }, { "epoch": 0.7633303544831695, "grad_norm": 1.4916404485702515, "learning_rate": 3.8159070598748884e-05, "loss": 0.7142, "num_input_tokens_seen": 2955776, "step": 5125 }, { "epoch": 0.7640750670241286, "grad_norm": 1.2162843942642212, "learning_rate": 3.819630622579685e-05, "loss": 0.6775, "num_input_tokens_seen": 2958592, "step": 5130 }, { "epoch": 0.7648197795650878, "grad_norm": 1.460410714149475, "learning_rate": 3.8233541852844804e-05, "loss": 0.5605, "num_input_tokens_seen": 2961408, "step": 5135 }, { "epoch": 0.765564492106047, "grad_norm": 2.300898790359497, "learning_rate": 3.827077747989276e-05, "loss": 0.7916, "num_input_tokens_seen": 2964064, "step": 5140 }, { "epoch": 0.7663092046470062, "grad_norm": 1.2321462631225586, "learning_rate": 3.8308013106940724e-05, "loss": 0.6055, "num_input_tokens_seen": 2966560, "step": 5145 }, { "epoch": 0.7670539171879655, "grad_norm": 1.6886541843414307, "learning_rate": 3.834524873398868e-05, "loss": 0.8349, "num_input_tokens_seen": 2969344, "step": 5150 }, { "epoch": 0.7677986297289247, "grad_norm": 1.3712233304977417, "learning_rate": 3.8382484361036644e-05, "loss": 0.7006, "num_input_tokens_seen": 2972224, "step": 5155 }, { "epoch": 0.7685433422698839, "grad_norm": 5.939786434173584, "learning_rate": 3.84197199880846e-05, "loss": 0.659, "num_input_tokens_seen": 2975296, "step": 5160 }, { "epoch": 0.7692880548108431, "grad_norm": 3.524984359741211, "learning_rate": 3.845695561513256e-05, "loss": 0.807, "num_input_tokens_seen": 2978144, "step": 5165 }, { "epoch": 0.7700327673518021, "grad_norm": 3.1466989517211914, "learning_rate": 3.849419124218052e-05, "loss": 0.6607, "num_input_tokens_seen": 2980864, "step": 5170 }, { "epoch": 0.7707774798927614, "grad_norm": 3.2815134525299072, "learning_rate": 3.853142686922848e-05, "loss": 0.7456, "num_input_tokens_seen": 2983488, "step": 5175 }, { "epoch": 0.7715221924337206, "grad_norm": 1.4109716415405273, "learning_rate": 3.856866249627644e-05, "loss": 0.7176, "num_input_tokens_seen": 2986304, "step": 5180 }, { "epoch": 0.7722669049746798, "grad_norm": 3.3360354900360107, "learning_rate": 3.8605898123324396e-05, "loss": 0.7121, "num_input_tokens_seen": 2989120, "step": 5185 }, { "epoch": 0.773011617515639, "grad_norm": 2.1233315467834473, "learning_rate": 3.864313375037236e-05, "loss": 0.7255, "num_input_tokens_seen": 2991904, "step": 5190 }, { "epoch": 0.7737563300565982, "grad_norm": 2.4436726570129395, "learning_rate": 3.8680369377420316e-05, "loss": 0.7763, "num_input_tokens_seen": 2994624, "step": 5195 }, { "epoch": 0.7745010425975574, "grad_norm": 2.740490674972534, "learning_rate": 3.871760500446828e-05, "loss": 0.7826, "num_input_tokens_seen": 2997568, "step": 5200 }, { "epoch": 0.7752457551385166, "grad_norm": 2.677419900894165, "learning_rate": 3.8754840631516235e-05, "loss": 0.9045, "num_input_tokens_seen": 3000448, "step": 5205 }, { "epoch": 0.7759904676794758, "grad_norm": 4.4590535163879395, "learning_rate": 3.87920762585642e-05, "loss": 0.8687, "num_input_tokens_seen": 3003392, "step": 5210 }, { "epoch": 0.7767351802204349, "grad_norm": 1.869218349456787, "learning_rate": 3.8829311885612155e-05, "loss": 0.7714, "num_input_tokens_seen": 3006080, "step": 5215 }, { "epoch": 0.7774798927613941, "grad_norm": 4.673871040344238, "learning_rate": 3.886654751266012e-05, "loss": 0.7147, "num_input_tokens_seen": 3008992, "step": 5220 }, { "epoch": 0.7782246053023533, "grad_norm": 1.0203698873519897, "learning_rate": 3.8903783139708075e-05, "loss": 0.6045, "num_input_tokens_seen": 3011552, "step": 5225 }, { "epoch": 0.7789693178433125, "grad_norm": 2.5409111976623535, "learning_rate": 3.894101876675603e-05, "loss": 0.8235, "num_input_tokens_seen": 3014400, "step": 5230 }, { "epoch": 0.7797140303842717, "grad_norm": 2.593968152999878, "learning_rate": 3.8978254393803994e-05, "loss": 0.6894, "num_input_tokens_seen": 3017280, "step": 5235 }, { "epoch": 0.7804587429252309, "grad_norm": 2.2706942558288574, "learning_rate": 3.901549002085195e-05, "loss": 0.7822, "num_input_tokens_seen": 3020064, "step": 5240 }, { "epoch": 0.7812034554661901, "grad_norm": 1.221846342086792, "learning_rate": 3.9052725647899914e-05, "loss": 0.7685, "num_input_tokens_seen": 3023072, "step": 5245 }, { "epoch": 0.7819481680071493, "grad_norm": 1.3702516555786133, "learning_rate": 3.908996127494787e-05, "loss": 0.686, "num_input_tokens_seen": 3025856, "step": 5250 }, { "epoch": 0.7826928805481085, "grad_norm": 1.3464059829711914, "learning_rate": 3.9127196901995834e-05, "loss": 0.7592, "num_input_tokens_seen": 3029152, "step": 5255 }, { "epoch": 0.7834375930890676, "grad_norm": 1.014600157737732, "learning_rate": 3.916443252904379e-05, "loss": 0.7286, "num_input_tokens_seen": 3031904, "step": 5260 }, { "epoch": 0.7841823056300268, "grad_norm": 1.8539440631866455, "learning_rate": 3.9201668156091754e-05, "loss": 0.6242, "num_input_tokens_seen": 3034848, "step": 5265 }, { "epoch": 0.784927018170986, "grad_norm": 1.959233283996582, "learning_rate": 3.923890378313971e-05, "loss": 0.7729, "num_input_tokens_seen": 3037888, "step": 5270 }, { "epoch": 0.7856717307119452, "grad_norm": 1.0249656438827515, "learning_rate": 3.9276139410187666e-05, "loss": 0.6597, "num_input_tokens_seen": 3040896, "step": 5275 }, { "epoch": 0.7864164432529044, "grad_norm": 1.0269204378128052, "learning_rate": 3.931337503723563e-05, "loss": 0.8254, "num_input_tokens_seen": 3043840, "step": 5280 }, { "epoch": 0.7871611557938636, "grad_norm": 1.0526494979858398, "learning_rate": 3.9350610664283586e-05, "loss": 0.7491, "num_input_tokens_seen": 3046624, "step": 5285 }, { "epoch": 0.7879058683348228, "grad_norm": 2.0178701877593994, "learning_rate": 3.938784629133155e-05, "loss": 0.6797, "num_input_tokens_seen": 3049472, "step": 5290 }, { "epoch": 0.788650580875782, "grad_norm": 1.5029106140136719, "learning_rate": 3.9425081918379506e-05, "loss": 0.7185, "num_input_tokens_seen": 3052416, "step": 5295 }, { "epoch": 0.7893952934167411, "grad_norm": 1.5852742195129395, "learning_rate": 3.946231754542747e-05, "loss": 0.6726, "num_input_tokens_seen": 3055232, "step": 5300 }, { "epoch": 0.7901400059577003, "grad_norm": 2.428781747817993, "learning_rate": 3.9499553172475426e-05, "loss": 0.7165, "num_input_tokens_seen": 3058272, "step": 5305 }, { "epoch": 0.7908847184986595, "grad_norm": 1.568027138710022, "learning_rate": 3.953678879952338e-05, "loss": 0.6962, "num_input_tokens_seen": 3061280, "step": 5310 }, { "epoch": 0.7916294310396187, "grad_norm": 2.3054635524749756, "learning_rate": 3.9574024426571345e-05, "loss": 0.7928, "num_input_tokens_seen": 3064320, "step": 5315 }, { "epoch": 0.7923741435805779, "grad_norm": 1.9421080350875854, "learning_rate": 3.96112600536193e-05, "loss": 0.727, "num_input_tokens_seen": 3066912, "step": 5320 }, { "epoch": 0.7931188561215371, "grad_norm": 1.110082745552063, "learning_rate": 3.9648495680667265e-05, "loss": 0.7111, "num_input_tokens_seen": 3069856, "step": 5325 }, { "epoch": 0.7938635686624963, "grad_norm": 2.3336355686187744, "learning_rate": 3.968573130771522e-05, "loss": 0.6898, "num_input_tokens_seen": 3072672, "step": 5330 }, { "epoch": 0.7946082812034555, "grad_norm": 2.2256951332092285, "learning_rate": 3.9722966934763185e-05, "loss": 0.699, "num_input_tokens_seen": 3075776, "step": 5335 }, { "epoch": 0.7953529937444147, "grad_norm": 2.7462234497070312, "learning_rate": 3.976020256181114e-05, "loss": 0.6921, "num_input_tokens_seen": 3078624, "step": 5340 }, { "epoch": 0.7960977062853738, "grad_norm": 1.4529905319213867, "learning_rate": 3.9797438188859104e-05, "loss": 0.6888, "num_input_tokens_seen": 3081728, "step": 5345 }, { "epoch": 0.796842418826333, "grad_norm": 2.071822166442871, "learning_rate": 3.983467381590707e-05, "loss": 0.5783, "num_input_tokens_seen": 3084512, "step": 5350 }, { "epoch": 0.7975871313672922, "grad_norm": 1.6687521934509277, "learning_rate": 3.987190944295502e-05, "loss": 0.8793, "num_input_tokens_seen": 3087488, "step": 5355 }, { "epoch": 0.7983318439082514, "grad_norm": 1.5943177938461304, "learning_rate": 3.990914507000298e-05, "loss": 0.7538, "num_input_tokens_seen": 3089984, "step": 5360 }, { "epoch": 0.7990765564492106, "grad_norm": 1.4727413654327393, "learning_rate": 3.994638069705094e-05, "loss": 0.6577, "num_input_tokens_seen": 3092896, "step": 5365 }, { "epoch": 0.7998212689901698, "grad_norm": 2.042203426361084, "learning_rate": 3.99836163240989e-05, "loss": 0.7877, "num_input_tokens_seen": 3095872, "step": 5370 }, { "epoch": 0.800565981531129, "grad_norm": 1.3126146793365479, "learning_rate": 4.002085195114686e-05, "loss": 0.7956, "num_input_tokens_seen": 3098624, "step": 5375 }, { "epoch": 0.8013106940720882, "grad_norm": 1.436708927154541, "learning_rate": 4.005808757819482e-05, "loss": 0.7553, "num_input_tokens_seen": 3101760, "step": 5380 }, { "epoch": 0.8020554066130474, "grad_norm": 1.098280906677246, "learning_rate": 4.0095323205242776e-05, "loss": 0.6778, "num_input_tokens_seen": 3104800, "step": 5385 }, { "epoch": 0.8028001191540065, "grad_norm": 1.0234507322311401, "learning_rate": 4.013255883229074e-05, "loss": 0.5562, "num_input_tokens_seen": 3107712, "step": 5390 }, { "epoch": 0.8035448316949657, "grad_norm": 1.4103509187698364, "learning_rate": 4.01697944593387e-05, "loss": 0.8035, "num_input_tokens_seen": 3110720, "step": 5395 }, { "epoch": 0.8042895442359249, "grad_norm": 0.6743038296699524, "learning_rate": 4.020703008638665e-05, "loss": 0.6448, "num_input_tokens_seen": 3113792, "step": 5400 }, { "epoch": 0.8050342567768841, "grad_norm": 1.9024951457977295, "learning_rate": 4.0244265713434616e-05, "loss": 0.6972, "num_input_tokens_seen": 3116736, "step": 5405 }, { "epoch": 0.8057789693178433, "grad_norm": 1.8933308124542236, "learning_rate": 4.028150134048257e-05, "loss": 0.7024, "num_input_tokens_seen": 3119584, "step": 5410 }, { "epoch": 0.8065236818588025, "grad_norm": 1.8912575244903564, "learning_rate": 4.0318736967530536e-05, "loss": 0.6762, "num_input_tokens_seen": 3122432, "step": 5415 }, { "epoch": 0.8072683943997617, "grad_norm": 1.4096952676773071, "learning_rate": 4.035597259457849e-05, "loss": 0.8177, "num_input_tokens_seen": 3125184, "step": 5420 }, { "epoch": 0.8080131069407209, "grad_norm": 0.9297963380813599, "learning_rate": 4.0393208221626455e-05, "loss": 0.698, "num_input_tokens_seen": 3128000, "step": 5425 }, { "epoch": 0.80875781948168, "grad_norm": 3.1314175128936768, "learning_rate": 4.043044384867442e-05, "loss": 0.8117, "num_input_tokens_seen": 3130944, "step": 5430 }, { "epoch": 0.8095025320226392, "grad_norm": 1.6854839324951172, "learning_rate": 4.0467679475722375e-05, "loss": 0.6658, "num_input_tokens_seen": 3133984, "step": 5435 }, { "epoch": 0.8102472445635984, "grad_norm": 4.2986907958984375, "learning_rate": 4.050491510277033e-05, "loss": 0.6867, "num_input_tokens_seen": 3137120, "step": 5440 }, { "epoch": 0.8109919571045576, "grad_norm": 0.9573460221290588, "learning_rate": 4.054215072981829e-05, "loss": 0.7955, "num_input_tokens_seen": 3139840, "step": 5445 }, { "epoch": 0.8117366696455168, "grad_norm": 1.5288408994674683, "learning_rate": 4.057938635686625e-05, "loss": 0.6886, "num_input_tokens_seen": 3142592, "step": 5450 }, { "epoch": 0.812481382186476, "grad_norm": 1.3964800834655762, "learning_rate": 4.061662198391421e-05, "loss": 0.6599, "num_input_tokens_seen": 3145248, "step": 5455 }, { "epoch": 0.8132260947274352, "grad_norm": 2.3832576274871826, "learning_rate": 4.065385761096217e-05, "loss": 0.6783, "num_input_tokens_seen": 3148096, "step": 5460 }, { "epoch": 0.8139708072683944, "grad_norm": 3.7928199768066406, "learning_rate": 4.069109323801013e-05, "loss": 0.7515, "num_input_tokens_seen": 3151008, "step": 5465 }, { "epoch": 0.8147155198093536, "grad_norm": 2.2802891731262207, "learning_rate": 4.072832886505809e-05, "loss": 0.6232, "num_input_tokens_seen": 3153920, "step": 5470 }, { "epoch": 0.8154602323503127, "grad_norm": 26.80980682373047, "learning_rate": 4.0765564492106054e-05, "loss": 0.9276, "num_input_tokens_seen": 3156448, "step": 5475 }, { "epoch": 0.8162049448912719, "grad_norm": 4.075571537017822, "learning_rate": 4.080280011915401e-05, "loss": 0.8717, "num_input_tokens_seen": 3159552, "step": 5480 }, { "epoch": 0.8169496574322311, "grad_norm": 17.07771873474121, "learning_rate": 4.084003574620197e-05, "loss": 0.6846, "num_input_tokens_seen": 3162240, "step": 5485 }, { "epoch": 0.8176943699731903, "grad_norm": 1.7927441596984863, "learning_rate": 4.087727137324992e-05, "loss": 0.6737, "num_input_tokens_seen": 3164608, "step": 5490 }, { "epoch": 0.8184390825141495, "grad_norm": 3.0498721599578857, "learning_rate": 4.0914507000297886e-05, "loss": 0.6261, "num_input_tokens_seen": 3167520, "step": 5495 }, { "epoch": 0.8191837950551087, "grad_norm": 8.648760795593262, "learning_rate": 4.095174262734584e-05, "loss": 0.6099, "num_input_tokens_seen": 3170400, "step": 5500 }, { "epoch": 0.819928507596068, "grad_norm": 2.946638584136963, "learning_rate": 4.0988978254393806e-05, "loss": 0.7205, "num_input_tokens_seen": 3173376, "step": 5505 }, { "epoch": 0.8206732201370271, "grad_norm": 2.2722508907318115, "learning_rate": 4.102621388144177e-05, "loss": 0.6595, "num_input_tokens_seen": 3176256, "step": 5510 }, { "epoch": 0.8214179326779864, "grad_norm": 1.4216722249984741, "learning_rate": 4.1063449508489726e-05, "loss": 0.7819, "num_input_tokens_seen": 3179136, "step": 5515 }, { "epoch": 0.8221626452189454, "grad_norm": 2.580216407775879, "learning_rate": 4.110068513553769e-05, "loss": 0.6203, "num_input_tokens_seen": 3181920, "step": 5520 }, { "epoch": 0.8229073577599046, "grad_norm": 1.1978583335876465, "learning_rate": 4.1137920762585646e-05, "loss": 0.719, "num_input_tokens_seen": 3184736, "step": 5525 }, { "epoch": 0.8236520703008638, "grad_norm": 1.183002233505249, "learning_rate": 4.11751563896336e-05, "loss": 0.6831, "num_input_tokens_seen": 3187456, "step": 5530 }, { "epoch": 0.824396782841823, "grad_norm": 2.652085065841675, "learning_rate": 4.121239201668156e-05, "loss": 0.6919, "num_input_tokens_seen": 3190240, "step": 5535 }, { "epoch": 0.8251414953827823, "grad_norm": 1.6319754123687744, "learning_rate": 4.124962764372952e-05, "loss": 0.7122, "num_input_tokens_seen": 3192864, "step": 5540 }, { "epoch": 0.8258862079237415, "grad_norm": 2.259537696838379, "learning_rate": 4.1286863270777485e-05, "loss": 0.6447, "num_input_tokens_seen": 3195392, "step": 5545 }, { "epoch": 0.8266309204647007, "grad_norm": 2.2628045082092285, "learning_rate": 4.132409889782544e-05, "loss": 0.6736, "num_input_tokens_seen": 3198176, "step": 5550 }, { "epoch": 0.8273756330056599, "grad_norm": 2.346973180770874, "learning_rate": 4.1361334524873405e-05, "loss": 0.7417, "num_input_tokens_seen": 3201152, "step": 5555 }, { "epoch": 0.828120345546619, "grad_norm": 4.076552391052246, "learning_rate": 4.139857015192136e-05, "loss": 0.5735, "num_input_tokens_seen": 3204320, "step": 5560 }, { "epoch": 0.8288650580875782, "grad_norm": 1.979832410812378, "learning_rate": 4.1435805778969324e-05, "loss": 0.626, "num_input_tokens_seen": 3207008, "step": 5565 }, { "epoch": 0.8296097706285374, "grad_norm": 3.5518105030059814, "learning_rate": 4.1473041406017274e-05, "loss": 0.7157, "num_input_tokens_seen": 3209760, "step": 5570 }, { "epoch": 0.8303544831694966, "grad_norm": 5.718450546264648, "learning_rate": 4.151027703306524e-05, "loss": 0.9369, "num_input_tokens_seen": 3212864, "step": 5575 }, { "epoch": 0.8310991957104558, "grad_norm": 1.7463008165359497, "learning_rate": 4.1547512660113194e-05, "loss": 0.7527, "num_input_tokens_seen": 3215744, "step": 5580 }, { "epoch": 0.831843908251415, "grad_norm": 1.8095825910568237, "learning_rate": 4.158474828716116e-05, "loss": 0.7979, "num_input_tokens_seen": 3218400, "step": 5585 }, { "epoch": 0.8325886207923742, "grad_norm": 0.9372115135192871, "learning_rate": 4.162198391420912e-05, "loss": 0.5457, "num_input_tokens_seen": 3220960, "step": 5590 }, { "epoch": 0.8333333333333334, "grad_norm": 1.3030345439910889, "learning_rate": 4.165921954125708e-05, "loss": 0.7779, "num_input_tokens_seen": 3223872, "step": 5595 }, { "epoch": 0.8340780458742926, "grad_norm": 1.8388357162475586, "learning_rate": 4.169645516830504e-05, "loss": 0.6108, "num_input_tokens_seen": 3226880, "step": 5600 }, { "epoch": 0.8348227584152517, "grad_norm": 1.3588480949401855, "learning_rate": 4.1733690795352996e-05, "loss": 0.86, "num_input_tokens_seen": 3229920, "step": 5605 }, { "epoch": 0.8355674709562109, "grad_norm": 1.8661248683929443, "learning_rate": 4.177092642240096e-05, "loss": 0.7648, "num_input_tokens_seen": 3232768, "step": 5610 }, { "epoch": 0.8363121834971701, "grad_norm": 0.7852189540863037, "learning_rate": 4.180816204944891e-05, "loss": 0.7214, "num_input_tokens_seen": 3235808, "step": 5615 }, { "epoch": 0.8370568960381293, "grad_norm": 1.0950438976287842, "learning_rate": 4.184539767649687e-05, "loss": 0.6522, "num_input_tokens_seen": 3238400, "step": 5620 }, { "epoch": 0.8378016085790885, "grad_norm": 1.5634610652923584, "learning_rate": 4.1882633303544836e-05, "loss": 0.5898, "num_input_tokens_seen": 3241216, "step": 5625 }, { "epoch": 0.8385463211200477, "grad_norm": 1.6854090690612793, "learning_rate": 4.191986893059279e-05, "loss": 0.7482, "num_input_tokens_seen": 3244320, "step": 5630 }, { "epoch": 0.8392910336610069, "grad_norm": 1.0793957710266113, "learning_rate": 4.1957104557640756e-05, "loss": 0.656, "num_input_tokens_seen": 3247200, "step": 5635 }, { "epoch": 0.8400357462019661, "grad_norm": 2.6190366744995117, "learning_rate": 4.199434018468871e-05, "loss": 0.689, "num_input_tokens_seen": 3250144, "step": 5640 }, { "epoch": 0.8407804587429253, "grad_norm": 1.6474783420562744, "learning_rate": 4.2031575811736675e-05, "loss": 0.6606, "num_input_tokens_seen": 3252928, "step": 5645 }, { "epoch": 0.8415251712838844, "grad_norm": 1.7979484796524048, "learning_rate": 4.206881143878463e-05, "loss": 0.6333, "num_input_tokens_seen": 3256160, "step": 5650 }, { "epoch": 0.8422698838248436, "grad_norm": 2.686537981033325, "learning_rate": 4.2106047065832595e-05, "loss": 0.823, "num_input_tokens_seen": 3259008, "step": 5655 }, { "epoch": 0.8430145963658028, "grad_norm": 2.639890670776367, "learning_rate": 4.2143282692880545e-05, "loss": 0.8863, "num_input_tokens_seen": 3261824, "step": 5660 }, { "epoch": 0.843759308906762, "grad_norm": 1.1723734140396118, "learning_rate": 4.218051831992851e-05, "loss": 0.6926, "num_input_tokens_seen": 3264992, "step": 5665 }, { "epoch": 0.8445040214477212, "grad_norm": 1.190632700920105, "learning_rate": 4.221775394697647e-05, "loss": 0.7855, "num_input_tokens_seen": 3268032, "step": 5670 }, { "epoch": 0.8452487339886804, "grad_norm": 0.992759644985199, "learning_rate": 4.225498957402443e-05, "loss": 0.627, "num_input_tokens_seen": 3271008, "step": 5675 }, { "epoch": 0.8459934465296396, "grad_norm": 1.5118460655212402, "learning_rate": 4.229222520107239e-05, "loss": 0.7459, "num_input_tokens_seen": 3274080, "step": 5680 }, { "epoch": 0.8467381590705988, "grad_norm": 2.6074981689453125, "learning_rate": 4.232946082812035e-05, "loss": 0.7522, "num_input_tokens_seen": 3276768, "step": 5685 }, { "epoch": 0.8474828716115579, "grad_norm": 9.575233459472656, "learning_rate": 4.236669645516831e-05, "loss": 0.7951, "num_input_tokens_seen": 3279552, "step": 5690 }, { "epoch": 0.8482275841525171, "grad_norm": 17.993330001831055, "learning_rate": 4.240393208221627e-05, "loss": 3.6469, "num_input_tokens_seen": 3282112, "step": 5695 }, { "epoch": 0.8489722966934763, "grad_norm": 1.2858740091323853, "learning_rate": 4.2441167709264223e-05, "loss": 0.6203, "num_input_tokens_seen": 3284832, "step": 5700 }, { "epoch": 0.8497170092344355, "grad_norm": 1.1245176792144775, "learning_rate": 4.247840333631219e-05, "loss": 0.8438, "num_input_tokens_seen": 3287680, "step": 5705 }, { "epoch": 0.8504617217753947, "grad_norm": 1.3452610969543457, "learning_rate": 4.251563896336014e-05, "loss": 0.7666, "num_input_tokens_seen": 3290752, "step": 5710 }, { "epoch": 0.8512064343163539, "grad_norm": 0.6115123629570007, "learning_rate": 4.2552874590408106e-05, "loss": 0.6858, "num_input_tokens_seen": 3293440, "step": 5715 }, { "epoch": 0.8519511468573131, "grad_norm": 1.6474902629852295, "learning_rate": 4.259011021745606e-05, "loss": 0.7002, "num_input_tokens_seen": 3296320, "step": 5720 }, { "epoch": 0.8526958593982723, "grad_norm": 2.251832962036133, "learning_rate": 4.2627345844504026e-05, "loss": 0.7687, "num_input_tokens_seen": 3299296, "step": 5725 }, { "epoch": 0.8534405719392315, "grad_norm": 2.4318010807037354, "learning_rate": 4.266458147155198e-05, "loss": 0.6234, "num_input_tokens_seen": 3302176, "step": 5730 }, { "epoch": 0.8541852844801906, "grad_norm": 1.959989309310913, "learning_rate": 4.2701817098599946e-05, "loss": 0.6438, "num_input_tokens_seen": 3304992, "step": 5735 }, { "epoch": 0.8549299970211498, "grad_norm": 2.2648165225982666, "learning_rate": 4.27390527256479e-05, "loss": 0.7043, "num_input_tokens_seen": 3307968, "step": 5740 }, { "epoch": 0.855674709562109, "grad_norm": 1.1876964569091797, "learning_rate": 4.277628835269586e-05, "loss": 0.6086, "num_input_tokens_seen": 3310944, "step": 5745 }, { "epoch": 0.8564194221030682, "grad_norm": 1.7156856060028076, "learning_rate": 4.281352397974382e-05, "loss": 0.6075, "num_input_tokens_seen": 3313856, "step": 5750 }, { "epoch": 0.8571641346440274, "grad_norm": 1.7642022371292114, "learning_rate": 4.285075960679178e-05, "loss": 0.7799, "num_input_tokens_seen": 3316640, "step": 5755 }, { "epoch": 0.8579088471849866, "grad_norm": 1.7194143533706665, "learning_rate": 4.288799523383974e-05, "loss": 0.6047, "num_input_tokens_seen": 3319520, "step": 5760 }, { "epoch": 0.8586535597259458, "grad_norm": 1.3013538122177124, "learning_rate": 4.29252308608877e-05, "loss": 0.8377, "num_input_tokens_seen": 3322144, "step": 5765 }, { "epoch": 0.859398272266905, "grad_norm": 0.9094334840774536, "learning_rate": 4.296246648793566e-05, "loss": 0.7336, "num_input_tokens_seen": 3324864, "step": 5770 }, { "epoch": 0.8601429848078642, "grad_norm": 1.3226782083511353, "learning_rate": 4.299970211498362e-05, "loss": 0.7268, "num_input_tokens_seen": 3327712, "step": 5775 }, { "epoch": 0.8608876973488233, "grad_norm": 1.1109001636505127, "learning_rate": 4.303693774203158e-05, "loss": 0.6996, "num_input_tokens_seen": 3330592, "step": 5780 }, { "epoch": 0.8616324098897825, "grad_norm": 1.8699331283569336, "learning_rate": 4.307417336907954e-05, "loss": 0.7177, "num_input_tokens_seen": 3333632, "step": 5785 }, { "epoch": 0.8623771224307417, "grad_norm": 0.9704592823982239, "learning_rate": 4.3111408996127494e-05, "loss": 0.656, "num_input_tokens_seen": 3336384, "step": 5790 }, { "epoch": 0.8631218349717009, "grad_norm": 1.2856489419937134, "learning_rate": 4.314864462317546e-05, "loss": 0.6437, "num_input_tokens_seen": 3339424, "step": 5795 }, { "epoch": 0.8638665475126601, "grad_norm": 1.1774461269378662, "learning_rate": 4.3185880250223414e-05, "loss": 0.5996, "num_input_tokens_seen": 3342176, "step": 5800 }, { "epoch": 0.8646112600536193, "grad_norm": 1.220462679862976, "learning_rate": 4.322311587727138e-05, "loss": 0.5872, "num_input_tokens_seen": 3345152, "step": 5805 }, { "epoch": 0.8653559725945785, "grad_norm": 1.7190707921981812, "learning_rate": 4.3260351504319333e-05, "loss": 0.6196, "num_input_tokens_seen": 3348032, "step": 5810 }, { "epoch": 0.8661006851355377, "grad_norm": 1.9708614349365234, "learning_rate": 4.32975871313673e-05, "loss": 0.7121, "num_input_tokens_seen": 3351008, "step": 5815 }, { "epoch": 0.8668453976764968, "grad_norm": 1.2221516370773315, "learning_rate": 4.333482275841525e-05, "loss": 0.6463, "num_input_tokens_seen": 3354176, "step": 5820 }, { "epoch": 0.867590110217456, "grad_norm": 1.7324516773223877, "learning_rate": 4.3372058385463216e-05, "loss": 0.6423, "num_input_tokens_seen": 3357248, "step": 5825 }, { "epoch": 0.8683348227584152, "grad_norm": 1.3478530645370483, "learning_rate": 4.340929401251117e-05, "loss": 0.6608, "num_input_tokens_seen": 3360160, "step": 5830 }, { "epoch": 0.8690795352993744, "grad_norm": 1.4215116500854492, "learning_rate": 4.344652963955913e-05, "loss": 0.5775, "num_input_tokens_seen": 3362944, "step": 5835 }, { "epoch": 0.8698242478403336, "grad_norm": 2.204474925994873, "learning_rate": 4.348376526660709e-05, "loss": 0.6896, "num_input_tokens_seen": 3365984, "step": 5840 }, { "epoch": 0.8705689603812928, "grad_norm": 1.3876864910125732, "learning_rate": 4.352100089365505e-05, "loss": 0.7076, "num_input_tokens_seen": 3368864, "step": 5845 }, { "epoch": 0.871313672922252, "grad_norm": 2.601170539855957, "learning_rate": 4.355823652070301e-05, "loss": 0.6879, "num_input_tokens_seen": 3371776, "step": 5850 }, { "epoch": 0.8720583854632112, "grad_norm": 1.634116291999817, "learning_rate": 4.359547214775097e-05, "loss": 0.6639, "num_input_tokens_seen": 3374816, "step": 5855 }, { "epoch": 0.8728030980041704, "grad_norm": 1.2746880054473877, "learning_rate": 4.363270777479893e-05, "loss": 0.5651, "num_input_tokens_seen": 3377664, "step": 5860 }, { "epoch": 0.8735478105451295, "grad_norm": 1.4143850803375244, "learning_rate": 4.366994340184689e-05, "loss": 0.7148, "num_input_tokens_seen": 3380288, "step": 5865 }, { "epoch": 0.8742925230860887, "grad_norm": 1.6504086256027222, "learning_rate": 4.370717902889485e-05, "loss": 0.6903, "num_input_tokens_seen": 3383328, "step": 5870 }, { "epoch": 0.8750372356270479, "grad_norm": 1.2227611541748047, "learning_rate": 4.374441465594281e-05, "loss": 0.6803, "num_input_tokens_seen": 3386240, "step": 5875 }, { "epoch": 0.8757819481680071, "grad_norm": 1.994765043258667, "learning_rate": 4.3781650282990765e-05, "loss": 0.6721, "num_input_tokens_seen": 3388960, "step": 5880 }, { "epoch": 0.8765266607089663, "grad_norm": 1.9926999807357788, "learning_rate": 4.381888591003873e-05, "loss": 0.6035, "num_input_tokens_seen": 3391744, "step": 5885 }, { "epoch": 0.8772713732499255, "grad_norm": 2.805525779724121, "learning_rate": 4.3856121537086684e-05, "loss": 0.7935, "num_input_tokens_seen": 3394400, "step": 5890 }, { "epoch": 0.8780160857908847, "grad_norm": 3.8856427669525146, "learning_rate": 4.389335716413465e-05, "loss": 0.8589, "num_input_tokens_seen": 3397280, "step": 5895 }, { "epoch": 0.878760798331844, "grad_norm": 1.7613286972045898, "learning_rate": 4.3930592791182604e-05, "loss": 0.7975, "num_input_tokens_seen": 3400160, "step": 5900 }, { "epoch": 0.8795055108728032, "grad_norm": 3.4574437141418457, "learning_rate": 4.396782841823057e-05, "loss": 0.7266, "num_input_tokens_seen": 3403392, "step": 5905 }, { "epoch": 0.8802502234137622, "grad_norm": 1.7394258975982666, "learning_rate": 4.4005064045278524e-05, "loss": 0.7834, "num_input_tokens_seen": 3406432, "step": 5910 }, { "epoch": 0.8809949359547214, "grad_norm": 2.934694290161133, "learning_rate": 4.404229967232648e-05, "loss": 0.6892, "num_input_tokens_seen": 3409248, "step": 5915 }, { "epoch": 0.8817396484956807, "grad_norm": 1.6685417890548706, "learning_rate": 4.4079535299374443e-05, "loss": 0.6784, "num_input_tokens_seen": 3412064, "step": 5920 }, { "epoch": 0.8824843610366399, "grad_norm": 1.2867449522018433, "learning_rate": 4.41167709264224e-05, "loss": 0.6719, "num_input_tokens_seen": 3414816, "step": 5925 }, { "epoch": 0.883229073577599, "grad_norm": 2.4873440265655518, "learning_rate": 4.415400655347036e-05, "loss": 0.7784, "num_input_tokens_seen": 3417664, "step": 5930 }, { "epoch": 0.8839737861185583, "grad_norm": 2.1592867374420166, "learning_rate": 4.419124218051832e-05, "loss": 0.861, "num_input_tokens_seen": 3420448, "step": 5935 }, { "epoch": 0.8847184986595175, "grad_norm": 1.1070141792297363, "learning_rate": 4.422847780756628e-05, "loss": 0.6302, "num_input_tokens_seen": 3423136, "step": 5940 }, { "epoch": 0.8854632112004767, "grad_norm": 1.3486016988754272, "learning_rate": 4.426571343461424e-05, "loss": 0.594, "num_input_tokens_seen": 3425792, "step": 5945 }, { "epoch": 0.8862079237414358, "grad_norm": 2.1130905151367188, "learning_rate": 4.43029490616622e-05, "loss": 0.6339, "num_input_tokens_seen": 3428544, "step": 5950 }, { "epoch": 0.886952636282395, "grad_norm": 1.4469268321990967, "learning_rate": 4.4340184688710166e-05, "loss": 0.6014, "num_input_tokens_seen": 3431200, "step": 5955 }, { "epoch": 0.8876973488233542, "grad_norm": 2.672917366027832, "learning_rate": 4.4377420315758115e-05, "loss": 0.7535, "num_input_tokens_seen": 3433760, "step": 5960 }, { "epoch": 0.8884420613643134, "grad_norm": 1.8850529193878174, "learning_rate": 4.441465594280608e-05, "loss": 0.6332, "num_input_tokens_seen": 3436896, "step": 5965 }, { "epoch": 0.8891867739052726, "grad_norm": 2.572049379348755, "learning_rate": 4.4451891569854035e-05, "loss": 0.6805, "num_input_tokens_seen": 3440128, "step": 5970 }, { "epoch": 0.8899314864462318, "grad_norm": 1.135679841041565, "learning_rate": 4.4489127196902e-05, "loss": 0.7024, "num_input_tokens_seen": 3443072, "step": 5975 }, { "epoch": 0.890676198987191, "grad_norm": 2.3129937648773193, "learning_rate": 4.4526362823949955e-05, "loss": 0.7194, "num_input_tokens_seen": 3445888, "step": 5980 }, { "epoch": 0.8914209115281502, "grad_norm": 3.351290225982666, "learning_rate": 4.456359845099792e-05, "loss": 0.7793, "num_input_tokens_seen": 3448672, "step": 5985 }, { "epoch": 0.8921656240691094, "grad_norm": 3.054631233215332, "learning_rate": 4.4600834078045875e-05, "loss": 0.9145, "num_input_tokens_seen": 3451648, "step": 5990 }, { "epoch": 0.8929103366100685, "grad_norm": 1.9969148635864258, "learning_rate": 4.463806970509384e-05, "loss": 0.5987, "num_input_tokens_seen": 3454592, "step": 5995 }, { "epoch": 0.8936550491510277, "grad_norm": 1.73708975315094, "learning_rate": 4.46753053321418e-05, "loss": 0.6858, "num_input_tokens_seen": 3457120, "step": 6000 }, { "epoch": 0.8943997616919869, "grad_norm": 1.880145788192749, "learning_rate": 4.471254095918975e-05, "loss": 0.7994, "num_input_tokens_seen": 3460608, "step": 6005 }, { "epoch": 0.8951444742329461, "grad_norm": 1.8645814657211304, "learning_rate": 4.4749776586237714e-05, "loss": 0.909, "num_input_tokens_seen": 3463744, "step": 6010 }, { "epoch": 0.8958891867739053, "grad_norm": 1.0532774925231934, "learning_rate": 4.478701221328567e-05, "loss": 0.7573, "num_input_tokens_seen": 3466560, "step": 6015 }, { "epoch": 0.8966338993148645, "grad_norm": 1.3367536067962646, "learning_rate": 4.4824247840333634e-05, "loss": 0.8163, "num_input_tokens_seen": 3469376, "step": 6020 }, { "epoch": 0.8973786118558237, "grad_norm": 2.545109510421753, "learning_rate": 4.486148346738159e-05, "loss": 0.6355, "num_input_tokens_seen": 3472288, "step": 6025 }, { "epoch": 0.8981233243967829, "grad_norm": 1.1897587776184082, "learning_rate": 4.4898719094429553e-05, "loss": 0.8251, "num_input_tokens_seen": 3474944, "step": 6030 }, { "epoch": 0.898868036937742, "grad_norm": 1.3077776432037354, "learning_rate": 4.493595472147752e-05, "loss": 0.8086, "num_input_tokens_seen": 3477664, "step": 6035 }, { "epoch": 0.8996127494787012, "grad_norm": 1.2257810831069946, "learning_rate": 4.497319034852547e-05, "loss": 0.7095, "num_input_tokens_seen": 3480416, "step": 6040 }, { "epoch": 0.9003574620196604, "grad_norm": 1.3050593137741089, "learning_rate": 4.501042597557343e-05, "loss": 0.7951, "num_input_tokens_seen": 3483136, "step": 6045 }, { "epoch": 0.9011021745606196, "grad_norm": 1.699332356452942, "learning_rate": 4.5047661602621386e-05, "loss": 0.7529, "num_input_tokens_seen": 3485728, "step": 6050 }, { "epoch": 0.9018468871015788, "grad_norm": 0.8460990190505981, "learning_rate": 4.508489722966935e-05, "loss": 0.7826, "num_input_tokens_seen": 3488544, "step": 6055 }, { "epoch": 0.902591599642538, "grad_norm": 1.5694496631622314, "learning_rate": 4.5122132856717306e-05, "loss": 0.6237, "num_input_tokens_seen": 3491744, "step": 6060 }, { "epoch": 0.9033363121834972, "grad_norm": 3.3182597160339355, "learning_rate": 4.515936848376527e-05, "loss": 0.7433, "num_input_tokens_seen": 3494592, "step": 6065 }, { "epoch": 0.9040810247244564, "grad_norm": 1.061108112335205, "learning_rate": 4.5196604110813225e-05, "loss": 0.5769, "num_input_tokens_seen": 3497344, "step": 6070 }, { "epoch": 0.9048257372654156, "grad_norm": 1.5654751062393188, "learning_rate": 4.523383973786119e-05, "loss": 0.7076, "num_input_tokens_seen": 3500224, "step": 6075 }, { "epoch": 0.9055704498063747, "grad_norm": 1.0435574054718018, "learning_rate": 4.527107536490915e-05, "loss": 0.6652, "num_input_tokens_seen": 3503040, "step": 6080 }, { "epoch": 0.9063151623473339, "grad_norm": 1.4842212200164795, "learning_rate": 4.530831099195711e-05, "loss": 0.6812, "num_input_tokens_seen": 3506016, "step": 6085 }, { "epoch": 0.9070598748882931, "grad_norm": 1.3163560628890991, "learning_rate": 4.5345546619005065e-05, "loss": 0.7995, "num_input_tokens_seen": 3509088, "step": 6090 }, { "epoch": 0.9078045874292523, "grad_norm": 2.059461832046509, "learning_rate": 4.538278224605302e-05, "loss": 0.634, "num_input_tokens_seen": 3511904, "step": 6095 }, { "epoch": 0.9085492999702115, "grad_norm": 1.8868941068649292, "learning_rate": 4.5420017873100985e-05, "loss": 0.6532, "num_input_tokens_seen": 3514624, "step": 6100 }, { "epoch": 0.9092940125111707, "grad_norm": 1.513865351676941, "learning_rate": 4.545725350014894e-05, "loss": 0.5081, "num_input_tokens_seen": 3517408, "step": 6105 }, { "epoch": 0.9100387250521299, "grad_norm": 2.27158784866333, "learning_rate": 4.5494489127196904e-05, "loss": 0.9195, "num_input_tokens_seen": 3520576, "step": 6110 }, { "epoch": 0.9107834375930891, "grad_norm": 2.5981733798980713, "learning_rate": 4.553172475424487e-05, "loss": 0.944, "num_input_tokens_seen": 3523424, "step": 6115 }, { "epoch": 0.9115281501340483, "grad_norm": 1.418182611465454, "learning_rate": 4.5568960381292824e-05, "loss": 0.5694, "num_input_tokens_seen": 3526400, "step": 6120 }, { "epoch": 0.9122728626750074, "grad_norm": 1.4550539255142212, "learning_rate": 4.560619600834079e-05, "loss": 0.7714, "num_input_tokens_seen": 3529952, "step": 6125 }, { "epoch": 0.9130175752159666, "grad_norm": 0.9864698648452759, "learning_rate": 4.5643431635388744e-05, "loss": 0.7384, "num_input_tokens_seen": 3532992, "step": 6130 }, { "epoch": 0.9137622877569258, "grad_norm": 0.9002124071121216, "learning_rate": 4.56806672624367e-05, "loss": 0.7224, "num_input_tokens_seen": 3536096, "step": 6135 }, { "epoch": 0.914507000297885, "grad_norm": 1.5280771255493164, "learning_rate": 4.5717902889484657e-05, "loss": 0.738, "num_input_tokens_seen": 3539456, "step": 6140 }, { "epoch": 0.9152517128388442, "grad_norm": 1.3381869792938232, "learning_rate": 4.575513851653262e-05, "loss": 0.8247, "num_input_tokens_seen": 3542464, "step": 6145 }, { "epoch": 0.9159964253798034, "grad_norm": 2.0355377197265625, "learning_rate": 4.579237414358058e-05, "loss": 0.7361, "num_input_tokens_seen": 3545632, "step": 6150 }, { "epoch": 0.9167411379207626, "grad_norm": 1.4778019189834595, "learning_rate": 4.582960977062854e-05, "loss": 0.7177, "num_input_tokens_seen": 3548640, "step": 6155 }, { "epoch": 0.9174858504617218, "grad_norm": 2.6201820373535156, "learning_rate": 4.58668453976765e-05, "loss": 0.7537, "num_input_tokens_seen": 3551424, "step": 6160 }, { "epoch": 0.9182305630026809, "grad_norm": 0.8566105961799622, "learning_rate": 4.590408102472446e-05, "loss": 0.802, "num_input_tokens_seen": 3554080, "step": 6165 }, { "epoch": 0.9189752755436401, "grad_norm": 1.1940969228744507, "learning_rate": 4.594131665177242e-05, "loss": 0.7169, "num_input_tokens_seen": 3556960, "step": 6170 }, { "epoch": 0.9197199880845993, "grad_norm": 5.583638668060303, "learning_rate": 4.597855227882037e-05, "loss": 0.594, "num_input_tokens_seen": 3560064, "step": 6175 }, { "epoch": 0.9204647006255585, "grad_norm": 1.7193028926849365, "learning_rate": 4.6015787905868335e-05, "loss": 0.5869, "num_input_tokens_seen": 3563040, "step": 6180 }, { "epoch": 0.9212094131665177, "grad_norm": 1.5062016248703003, "learning_rate": 4.605302353291629e-05, "loss": 0.6253, "num_input_tokens_seen": 3566272, "step": 6185 }, { "epoch": 0.9219541257074769, "grad_norm": 1.3865610361099243, "learning_rate": 4.6090259159964255e-05, "loss": 0.7159, "num_input_tokens_seen": 3569120, "step": 6190 }, { "epoch": 0.9226988382484361, "grad_norm": 1.1020057201385498, "learning_rate": 4.612749478701222e-05, "loss": 0.6174, "num_input_tokens_seen": 3572064, "step": 6195 }, { "epoch": 0.9234435507893953, "grad_norm": 1.2767574787139893, "learning_rate": 4.6164730414060175e-05, "loss": 0.8325, "num_input_tokens_seen": 3574944, "step": 6200 }, { "epoch": 0.9241882633303545, "grad_norm": 2.0125038623809814, "learning_rate": 4.620196604110814e-05, "loss": 0.6088, "num_input_tokens_seen": 3578176, "step": 6205 }, { "epoch": 0.9249329758713136, "grad_norm": 2.5765867233276367, "learning_rate": 4.6239201668156095e-05, "loss": 0.7398, "num_input_tokens_seen": 3581280, "step": 6210 }, { "epoch": 0.9256776884122728, "grad_norm": 2.2282698154449463, "learning_rate": 4.627643729520406e-05, "loss": 0.6807, "num_input_tokens_seen": 3584064, "step": 6215 }, { "epoch": 0.926422400953232, "grad_norm": 1.2445673942565918, "learning_rate": 4.631367292225201e-05, "loss": 0.6213, "num_input_tokens_seen": 3587200, "step": 6220 }, { "epoch": 0.9271671134941912, "grad_norm": 1.8735707998275757, "learning_rate": 4.635090854929997e-05, "loss": 0.7124, "num_input_tokens_seen": 3590208, "step": 6225 }, { "epoch": 0.9279118260351504, "grad_norm": 1.1564197540283203, "learning_rate": 4.6388144176347934e-05, "loss": 0.7639, "num_input_tokens_seen": 3593152, "step": 6230 }, { "epoch": 0.9286565385761096, "grad_norm": 5.226668357849121, "learning_rate": 4.642537980339589e-05, "loss": 0.695, "num_input_tokens_seen": 3596160, "step": 6235 }, { "epoch": 0.9294012511170688, "grad_norm": 0.9363234043121338, "learning_rate": 4.6462615430443854e-05, "loss": 0.8558, "num_input_tokens_seen": 3599104, "step": 6240 }, { "epoch": 0.930145963658028, "grad_norm": 1.7464126348495483, "learning_rate": 4.649985105749181e-05, "loss": 0.8374, "num_input_tokens_seen": 3602144, "step": 6245 }, { "epoch": 0.9308906761989872, "grad_norm": 1.1570017337799072, "learning_rate": 4.653708668453977e-05, "loss": 0.652, "num_input_tokens_seen": 3605280, "step": 6250 }, { "epoch": 0.9316353887399463, "grad_norm": 1.1268596649169922, "learning_rate": 4.657432231158773e-05, "loss": 0.796, "num_input_tokens_seen": 3608128, "step": 6255 }, { "epoch": 0.9323801012809055, "grad_norm": 1.2357450723648071, "learning_rate": 4.661155793863569e-05, "loss": 0.6225, "num_input_tokens_seen": 3610624, "step": 6260 }, { "epoch": 0.9331248138218647, "grad_norm": 1.02189302444458, "learning_rate": 4.664879356568364e-05, "loss": 0.6926, "num_input_tokens_seen": 3613504, "step": 6265 }, { "epoch": 0.9338695263628239, "grad_norm": 1.2072548866271973, "learning_rate": 4.6686029192731606e-05, "loss": 0.6537, "num_input_tokens_seen": 3616320, "step": 6270 }, { "epoch": 0.9346142389037831, "grad_norm": 2.4208145141601562, "learning_rate": 4.672326481977957e-05, "loss": 0.6086, "num_input_tokens_seen": 3619104, "step": 6275 }, { "epoch": 0.9353589514447423, "grad_norm": 1.5471879243850708, "learning_rate": 4.6760500446827526e-05, "loss": 0.6367, "num_input_tokens_seen": 3621792, "step": 6280 }, { "epoch": 0.9361036639857016, "grad_norm": 1.322142243385315, "learning_rate": 4.679773607387549e-05, "loss": 0.6187, "num_input_tokens_seen": 3624544, "step": 6285 }, { "epoch": 0.9368483765266608, "grad_norm": 8.122840881347656, "learning_rate": 4.6834971700923445e-05, "loss": 0.4308, "num_input_tokens_seen": 3627424, "step": 6290 }, { "epoch": 0.9375930890676198, "grad_norm": 2.369279623031616, "learning_rate": 4.687220732797141e-05, "loss": 0.826, "num_input_tokens_seen": 3630304, "step": 6295 }, { "epoch": 0.938337801608579, "grad_norm": 4.669913291931152, "learning_rate": 4.6909442955019365e-05, "loss": 0.5927, "num_input_tokens_seen": 3633216, "step": 6300 }, { "epoch": 0.9390825141495382, "grad_norm": 3.5569634437561035, "learning_rate": 4.694667858206732e-05, "loss": 0.7599, "num_input_tokens_seen": 3636000, "step": 6305 }, { "epoch": 0.9398272266904975, "grad_norm": 2.052786111831665, "learning_rate": 4.6983914209115285e-05, "loss": 0.7165, "num_input_tokens_seen": 3638848, "step": 6310 }, { "epoch": 0.9405719392314567, "grad_norm": 2.9814939498901367, "learning_rate": 4.702114983616324e-05, "loss": 0.7906, "num_input_tokens_seen": 3642144, "step": 6315 }, { "epoch": 0.9413166517724159, "grad_norm": 2.7578253746032715, "learning_rate": 4.7058385463211205e-05, "loss": 0.8511, "num_input_tokens_seen": 3644832, "step": 6320 }, { "epoch": 0.9420613643133751, "grad_norm": 4.289158344268799, "learning_rate": 4.709562109025916e-05, "loss": 0.6849, "num_input_tokens_seen": 3647680, "step": 6325 }, { "epoch": 0.9428060768543343, "grad_norm": 3.1088173389434814, "learning_rate": 4.7132856717307124e-05, "loss": 0.8673, "num_input_tokens_seen": 3650528, "step": 6330 }, { "epoch": 0.9435507893952935, "grad_norm": 1.8075484037399292, "learning_rate": 4.717009234435508e-05, "loss": 0.6656, "num_input_tokens_seen": 3653344, "step": 6335 }, { "epoch": 0.9442955019362526, "grad_norm": 2.1303865909576416, "learning_rate": 4.7207327971403044e-05, "loss": 0.6565, "num_input_tokens_seen": 3656416, "step": 6340 }, { "epoch": 0.9450402144772118, "grad_norm": 2.734114408493042, "learning_rate": 4.7244563598451e-05, "loss": 0.5511, "num_input_tokens_seen": 3659200, "step": 6345 }, { "epoch": 0.945784927018171, "grad_norm": 1.8249139785766602, "learning_rate": 4.728179922549896e-05, "loss": 0.6816, "num_input_tokens_seen": 3662144, "step": 6350 }, { "epoch": 0.9465296395591302, "grad_norm": 1.5859326124191284, "learning_rate": 4.731903485254692e-05, "loss": 0.7802, "num_input_tokens_seen": 3664768, "step": 6355 }, { "epoch": 0.9472743521000894, "grad_norm": 1.1222848892211914, "learning_rate": 4.7356270479594877e-05, "loss": 0.6482, "num_input_tokens_seen": 3667296, "step": 6360 }, { "epoch": 0.9480190646410486, "grad_norm": 1.377021074295044, "learning_rate": 4.739350610664284e-05, "loss": 0.6367, "num_input_tokens_seen": 3670112, "step": 6365 }, { "epoch": 0.9487637771820078, "grad_norm": 1.408178448677063, "learning_rate": 4.7430741733690796e-05, "loss": 0.6076, "num_input_tokens_seen": 3672800, "step": 6370 }, { "epoch": 0.949508489722967, "grad_norm": 1.4828418493270874, "learning_rate": 4.746797736073876e-05, "loss": 0.8135, "num_input_tokens_seen": 3675968, "step": 6375 }, { "epoch": 0.9502532022639262, "grad_norm": 2.1141538619995117, "learning_rate": 4.7505212987786716e-05, "loss": 0.6186, "num_input_tokens_seen": 3678880, "step": 6380 }, { "epoch": 0.9509979148048853, "grad_norm": 2.0814454555511475, "learning_rate": 4.754244861483468e-05, "loss": 0.6198, "num_input_tokens_seen": 3681952, "step": 6385 }, { "epoch": 0.9517426273458445, "grad_norm": 1.7448289394378662, "learning_rate": 4.7579684241882636e-05, "loss": 0.8326, "num_input_tokens_seen": 3684736, "step": 6390 }, { "epoch": 0.9524873398868037, "grad_norm": 2.092665195465088, "learning_rate": 4.761691986893059e-05, "loss": 0.7751, "num_input_tokens_seen": 3687744, "step": 6395 }, { "epoch": 0.9532320524277629, "grad_norm": 1.482282042503357, "learning_rate": 4.7654155495978555e-05, "loss": 0.7113, "num_input_tokens_seen": 3690848, "step": 6400 }, { "epoch": 0.9539767649687221, "grad_norm": 1.5427112579345703, "learning_rate": 4.769139112302651e-05, "loss": 0.6984, "num_input_tokens_seen": 3693856, "step": 6405 }, { "epoch": 0.9547214775096813, "grad_norm": 2.479591131210327, "learning_rate": 4.7728626750074475e-05, "loss": 0.7647, "num_input_tokens_seen": 3696608, "step": 6410 }, { "epoch": 0.9554661900506405, "grad_norm": 0.7046604156494141, "learning_rate": 4.776586237712243e-05, "loss": 0.649, "num_input_tokens_seen": 3699552, "step": 6415 }, { "epoch": 0.9562109025915997, "grad_norm": 2.0572597980499268, "learning_rate": 4.7803098004170395e-05, "loss": 0.7236, "num_input_tokens_seen": 3702176, "step": 6420 }, { "epoch": 0.9569556151325588, "grad_norm": 3.0188446044921875, "learning_rate": 4.784033363121835e-05, "loss": 0.8815, "num_input_tokens_seen": 3704960, "step": 6425 }, { "epoch": 0.957700327673518, "grad_norm": 2.0634727478027344, "learning_rate": 4.7877569258266315e-05, "loss": 0.7041, "num_input_tokens_seen": 3707584, "step": 6430 }, { "epoch": 0.9584450402144772, "grad_norm": 0.7357906103134155, "learning_rate": 4.791480488531427e-05, "loss": 0.6745, "num_input_tokens_seen": 3710240, "step": 6435 }, { "epoch": 0.9591897527554364, "grad_norm": 3.065124034881592, "learning_rate": 4.795204051236223e-05, "loss": 0.7879, "num_input_tokens_seen": 3712960, "step": 6440 }, { "epoch": 0.9599344652963956, "grad_norm": 1.8084406852722168, "learning_rate": 4.798927613941019e-05, "loss": 0.7153, "num_input_tokens_seen": 3715552, "step": 6445 }, { "epoch": 0.9606791778373548, "grad_norm": 0.8329436182975769, "learning_rate": 4.802651176645815e-05, "loss": 0.7101, "num_input_tokens_seen": 3718304, "step": 6450 }, { "epoch": 0.961423890378314, "grad_norm": 1.0962982177734375, "learning_rate": 4.806374739350611e-05, "loss": 0.7705, "num_input_tokens_seen": 3721280, "step": 6455 }, { "epoch": 0.9621686029192732, "grad_norm": 0.9897662997245789, "learning_rate": 4.810098302055407e-05, "loss": 0.5208, "num_input_tokens_seen": 3724480, "step": 6460 }, { "epoch": 0.9629133154602324, "grad_norm": 1.265408992767334, "learning_rate": 4.813821864760203e-05, "loss": 0.7368, "num_input_tokens_seen": 3727232, "step": 6465 }, { "epoch": 0.9636580280011915, "grad_norm": 2.2597994804382324, "learning_rate": 4.8175454274649987e-05, "loss": 0.7566, "num_input_tokens_seen": 3729952, "step": 6470 }, { "epoch": 0.9644027405421507, "grad_norm": 1.5472029447555542, "learning_rate": 4.821268990169795e-05, "loss": 0.6934, "num_input_tokens_seen": 3732800, "step": 6475 }, { "epoch": 0.9651474530831099, "grad_norm": 1.9998568296432495, "learning_rate": 4.8249925528745906e-05, "loss": 0.5474, "num_input_tokens_seen": 3735488, "step": 6480 }, { "epoch": 0.9658921656240691, "grad_norm": 2.614466905593872, "learning_rate": 4.828716115579386e-05, "loss": 0.8892, "num_input_tokens_seen": 3738304, "step": 6485 }, { "epoch": 0.9666368781650283, "grad_norm": 1.438109278678894, "learning_rate": 4.8324396782841826e-05, "loss": 0.8015, "num_input_tokens_seen": 3741024, "step": 6490 }, { "epoch": 0.9673815907059875, "grad_norm": 0.7915830016136169, "learning_rate": 4.836163240988978e-05, "loss": 0.8155, "num_input_tokens_seen": 3744992, "step": 6495 }, { "epoch": 0.9681263032469467, "grad_norm": 1.5674382448196411, "learning_rate": 4.8398868036937746e-05, "loss": 0.8119, "num_input_tokens_seen": 3747840, "step": 6500 }, { "epoch": 0.9688710157879059, "grad_norm": 1.6735224723815918, "learning_rate": 4.84361036639857e-05, "loss": 0.7158, "num_input_tokens_seen": 3750752, "step": 6505 }, { "epoch": 0.9696157283288651, "grad_norm": 2.8512613773345947, "learning_rate": 4.8473339291033665e-05, "loss": 0.7128, "num_input_tokens_seen": 3753856, "step": 6510 }, { "epoch": 0.9703604408698242, "grad_norm": 1.1373740434646606, "learning_rate": 4.851057491808162e-05, "loss": 0.6702, "num_input_tokens_seen": 3756704, "step": 6515 }, { "epoch": 0.9711051534107834, "grad_norm": 0.7647833824157715, "learning_rate": 4.8547810545129585e-05, "loss": 0.6817, "num_input_tokens_seen": 3759488, "step": 6520 }, { "epoch": 0.9718498659517426, "grad_norm": 1.015260934829712, "learning_rate": 4.858504617217754e-05, "loss": 0.669, "num_input_tokens_seen": 3762272, "step": 6525 }, { "epoch": 0.9725945784927018, "grad_norm": 0.9356815218925476, "learning_rate": 4.86222817992255e-05, "loss": 0.68, "num_input_tokens_seen": 3764928, "step": 6530 }, { "epoch": 0.973339291033661, "grad_norm": 1.2923381328582764, "learning_rate": 4.865951742627346e-05, "loss": 0.6004, "num_input_tokens_seen": 3768192, "step": 6535 }, { "epoch": 0.9740840035746202, "grad_norm": 0.9145727157592773, "learning_rate": 4.869675305332142e-05, "loss": 0.7504, "num_input_tokens_seen": 3771136, "step": 6540 }, { "epoch": 0.9748287161155794, "grad_norm": 1.3451570272445679, "learning_rate": 4.873398868036938e-05, "loss": 0.7014, "num_input_tokens_seen": 3774112, "step": 6545 }, { "epoch": 0.9755734286565386, "grad_norm": 1.169419527053833, "learning_rate": 4.877122430741734e-05, "loss": 0.7329, "num_input_tokens_seen": 3777024, "step": 6550 }, { "epoch": 0.9763181411974977, "grad_norm": 1.088539958000183, "learning_rate": 4.88084599344653e-05, "loss": 0.6694, "num_input_tokens_seen": 3779968, "step": 6555 }, { "epoch": 0.9770628537384569, "grad_norm": 1.1474233865737915, "learning_rate": 4.8845695561513264e-05, "loss": 0.7573, "num_input_tokens_seen": 3782752, "step": 6560 }, { "epoch": 0.9778075662794161, "grad_norm": 0.9008138179779053, "learning_rate": 4.8882931188561214e-05, "loss": 0.8197, "num_input_tokens_seen": 3785664, "step": 6565 }, { "epoch": 0.9785522788203753, "grad_norm": 0.9435573816299438, "learning_rate": 4.892016681560918e-05, "loss": 0.6671, "num_input_tokens_seen": 3788224, "step": 6570 }, { "epoch": 0.9792969913613345, "grad_norm": 1.189315915107727, "learning_rate": 4.895740244265713e-05, "loss": 0.7132, "num_input_tokens_seen": 3791136, "step": 6575 }, { "epoch": 0.9800417039022937, "grad_norm": 2.9683825969696045, "learning_rate": 4.8994638069705097e-05, "loss": 0.8186, "num_input_tokens_seen": 3794240, "step": 6580 }, { "epoch": 0.9807864164432529, "grad_norm": 1.044039249420166, "learning_rate": 4.903187369675305e-05, "loss": 0.6636, "num_input_tokens_seen": 3797088, "step": 6585 }, { "epoch": 0.9815311289842121, "grad_norm": 0.952539324760437, "learning_rate": 4.9069109323801016e-05, "loss": 0.7817, "num_input_tokens_seen": 3800096, "step": 6590 }, { "epoch": 0.9822758415251713, "grad_norm": 1.062758445739746, "learning_rate": 4.910634495084897e-05, "loss": 0.6886, "num_input_tokens_seen": 3803072, "step": 6595 }, { "epoch": 0.9830205540661304, "grad_norm": 1.1374080181121826, "learning_rate": 4.9143580577896936e-05, "loss": 0.6457, "num_input_tokens_seen": 3806048, "step": 6600 }, { "epoch": 0.9837652666070896, "grad_norm": 1.7712180614471436, "learning_rate": 4.91808162049449e-05, "loss": 0.7152, "num_input_tokens_seen": 3808768, "step": 6605 }, { "epoch": 0.9845099791480488, "grad_norm": 3.0445456504821777, "learning_rate": 4.921805183199285e-05, "loss": 0.8576, "num_input_tokens_seen": 3811872, "step": 6610 }, { "epoch": 0.985254691689008, "grad_norm": 0.9462882876396179, "learning_rate": 4.925528745904081e-05, "loss": 0.7033, "num_input_tokens_seen": 3814944, "step": 6615 }, { "epoch": 0.9859994042299672, "grad_norm": 2.1698429584503174, "learning_rate": 4.929252308608877e-05, "loss": 0.7749, "num_input_tokens_seen": 3817728, "step": 6620 }, { "epoch": 0.9867441167709264, "grad_norm": 1.9462124109268188, "learning_rate": 4.932975871313673e-05, "loss": 0.6327, "num_input_tokens_seen": 3820512, "step": 6625 }, { "epoch": 0.9874888293118856, "grad_norm": 1.3727277517318726, "learning_rate": 4.936699434018469e-05, "loss": 0.7178, "num_input_tokens_seen": 3823712, "step": 6630 }, { "epoch": 0.9882335418528448, "grad_norm": 2.1001698970794678, "learning_rate": 4.940422996723265e-05, "loss": 0.673, "num_input_tokens_seen": 3826336, "step": 6635 }, { "epoch": 0.988978254393804, "grad_norm": 1.2182015180587769, "learning_rate": 4.9441465594280615e-05, "loss": 0.5929, "num_input_tokens_seen": 3828992, "step": 6640 }, { "epoch": 0.9897229669347631, "grad_norm": 1.2373969554901123, "learning_rate": 4.947870122132857e-05, "loss": 0.6191, "num_input_tokens_seen": 3831712, "step": 6645 }, { "epoch": 0.9904676794757223, "grad_norm": 1.8781315088272095, "learning_rate": 4.9515936848376534e-05, "loss": 0.6532, "num_input_tokens_seen": 3834304, "step": 6650 }, { "epoch": 0.9912123920166815, "grad_norm": 1.909938097000122, "learning_rate": 4.9553172475424484e-05, "loss": 0.7552, "num_input_tokens_seen": 3837056, "step": 6655 }, { "epoch": 0.9919571045576407, "grad_norm": 0.9379574656486511, "learning_rate": 4.959040810247245e-05, "loss": 0.6421, "num_input_tokens_seen": 3840000, "step": 6660 }, { "epoch": 0.9927018170986, "grad_norm": 0.9457732439041138, "learning_rate": 4.9627643729520404e-05, "loss": 0.6366, "num_input_tokens_seen": 3842848, "step": 6665 }, { "epoch": 0.9934465296395592, "grad_norm": 1.9879038333892822, "learning_rate": 4.966487935656837e-05, "loss": 0.6876, "num_input_tokens_seen": 3845600, "step": 6670 }, { "epoch": 0.9941912421805184, "grad_norm": 1.4550751447677612, "learning_rate": 4.9702114983616324e-05, "loss": 0.7663, "num_input_tokens_seen": 3848704, "step": 6675 }, { "epoch": 0.9949359547214776, "grad_norm": 1.19004487991333, "learning_rate": 4.973935061066429e-05, "loss": 0.7558, "num_input_tokens_seen": 3851680, "step": 6680 }, { "epoch": 0.9956806672624366, "grad_norm": 1.3395380973815918, "learning_rate": 4.977658623771225e-05, "loss": 0.7926, "num_input_tokens_seen": 3854400, "step": 6685 }, { "epoch": 0.9964253798033958, "grad_norm": 1.2376600503921509, "learning_rate": 4.9813821864760207e-05, "loss": 0.7559, "num_input_tokens_seen": 3857632, "step": 6690 }, { "epoch": 0.997170092344355, "grad_norm": 1.1231333017349243, "learning_rate": 4.985105749180816e-05, "loss": 0.6751, "num_input_tokens_seen": 3860192, "step": 6695 }, { "epoch": 0.9979148048853143, "grad_norm": 1.1880549192428589, "learning_rate": 4.988829311885612e-05, "loss": 0.6704, "num_input_tokens_seen": 3862944, "step": 6700 }, { "epoch": 0.9986595174262735, "grad_norm": 1.49574613571167, "learning_rate": 4.992552874590408e-05, "loss": 0.7123, "num_input_tokens_seen": 3865856, "step": 6705 }, { "epoch": 0.9994042299672327, "grad_norm": 1.3050968647003174, "learning_rate": 4.996276437295204e-05, "loss": 0.7028, "num_input_tokens_seen": 3868704, "step": 6710 }, { "epoch": 1.0, "eval_loss": 0.6813997030258179, "eval_runtime": 74.2549, "eval_samples_per_second": 40.186, "eval_steps_per_second": 10.046, "num_input_tokens_seen": 3870688, "step": 6714 }, { "epoch": 1.0001489425081918, "grad_norm": 1.076629400253296, "learning_rate": 5e-05, "loss": 0.5664, "num_input_tokens_seen": 3871200, "step": 6715 }, { "epoch": 1.000893655049151, "grad_norm": 0.9666463732719421, "learning_rate": 4.999999915530083e-05, "loss": 0.7183, "num_input_tokens_seen": 3873984, "step": 6720 }, { "epoch": 1.0016383675901102, "grad_norm": 1.0705294609069824, "learning_rate": 4.999999662120335e-05, "loss": 0.7906, "num_input_tokens_seen": 3876960, "step": 6725 }, { "epoch": 1.0023830801310694, "grad_norm": 1.0305466651916504, "learning_rate": 4.999999239770774e-05, "loss": 0.6571, "num_input_tokens_seen": 3879968, "step": 6730 }, { "epoch": 1.0031277926720286, "grad_norm": 0.9102991223335266, "learning_rate": 4.999998648481429e-05, "loss": 0.7259, "num_input_tokens_seen": 3882816, "step": 6735 }, { "epoch": 1.0038725052129878, "grad_norm": 2.4098448753356934, "learning_rate": 4.999997888252339e-05, "loss": 0.6322, "num_input_tokens_seen": 3885760, "step": 6740 }, { "epoch": 1.004617217753947, "grad_norm": 1.490437388420105, "learning_rate": 4.999996959083556e-05, "loss": 0.645, "num_input_tokens_seen": 3888576, "step": 6745 }, { "epoch": 1.0053619302949062, "grad_norm": 1.2014782428741455, "learning_rate": 4.999995860975143e-05, "loss": 0.6339, "num_input_tokens_seen": 3891360, "step": 6750 }, { "epoch": 1.0061066428358654, "grad_norm": 0.9719455242156982, "learning_rate": 4.9999945939271744e-05, "loss": 0.7399, "num_input_tokens_seen": 3894208, "step": 6755 }, { "epoch": 1.0068513553768246, "grad_norm": 1.6959482431411743, "learning_rate": 4.999993157939735e-05, "loss": 0.5235, "num_input_tokens_seen": 3896928, "step": 6760 }, { "epoch": 1.0075960679177838, "grad_norm": 1.832478642463684, "learning_rate": 4.999991553012923e-05, "loss": 0.6901, "num_input_tokens_seen": 3899968, "step": 6765 }, { "epoch": 1.008340780458743, "grad_norm": 2.62776780128479, "learning_rate": 4.999989779146845e-05, "loss": 0.6731, "num_input_tokens_seen": 3902688, "step": 6770 }, { "epoch": 1.0090854929997022, "grad_norm": 1.0101810693740845, "learning_rate": 4.999987836341622e-05, "loss": 0.694, "num_input_tokens_seen": 3905504, "step": 6775 }, { "epoch": 1.0098302055406614, "grad_norm": 1.842849612236023, "learning_rate": 4.999985724597386e-05, "loss": 0.7266, "num_input_tokens_seen": 3908480, "step": 6780 }, { "epoch": 1.0105749180816206, "grad_norm": 1.1331440210342407, "learning_rate": 4.9999834439142776e-05, "loss": 0.8368, "num_input_tokens_seen": 3911168, "step": 6785 }, { "epoch": 1.0113196306225798, "grad_norm": 1.604770302772522, "learning_rate": 4.999980994292454e-05, "loss": 0.5436, "num_input_tokens_seen": 3913888, "step": 6790 }, { "epoch": 1.0120643431635388, "grad_norm": 0.7779842019081116, "learning_rate": 4.9999783757320776e-05, "loss": 0.7364, "num_input_tokens_seen": 3916768, "step": 6795 }, { "epoch": 1.012809055704498, "grad_norm": 1.095350980758667, "learning_rate": 4.9999755882333275e-05, "loss": 0.645, "num_input_tokens_seen": 3919552, "step": 6800 }, { "epoch": 1.0135537682454572, "grad_norm": 1.2136719226837158, "learning_rate": 4.999972631796391e-05, "loss": 0.8634, "num_input_tokens_seen": 3922368, "step": 6805 }, { "epoch": 1.0142984807864164, "grad_norm": 2.1374361515045166, "learning_rate": 4.999969506421468e-05, "loss": 0.6727, "num_input_tokens_seen": 3925216, "step": 6810 }, { "epoch": 1.0150431933273756, "grad_norm": 1.3782625198364258, "learning_rate": 4.99996621210877e-05, "loss": 0.8229, "num_input_tokens_seen": 3928416, "step": 6815 }, { "epoch": 1.0157879058683348, "grad_norm": 1.7197846174240112, "learning_rate": 4.99996274885852e-05, "loss": 0.6535, "num_input_tokens_seen": 3931584, "step": 6820 }, { "epoch": 1.016532618409294, "grad_norm": 0.8116658329963684, "learning_rate": 4.999959116670951e-05, "loss": 0.7883, "num_input_tokens_seen": 3934464, "step": 6825 }, { "epoch": 1.0172773309502532, "grad_norm": 0.7359989881515503, "learning_rate": 4.999955315546309e-05, "loss": 0.8135, "num_input_tokens_seen": 3937440, "step": 6830 }, { "epoch": 1.0180220434912124, "grad_norm": 0.826438307762146, "learning_rate": 4.999951345484851e-05, "loss": 0.7331, "num_input_tokens_seen": 3940224, "step": 6835 }, { "epoch": 1.0187667560321716, "grad_norm": 0.8563236594200134, "learning_rate": 4.999947206486846e-05, "loss": 0.6693, "num_input_tokens_seen": 3943040, "step": 6840 }, { "epoch": 1.0195114685731308, "grad_norm": 1.154351830482483, "learning_rate": 4.999942898552571e-05, "loss": 0.7292, "num_input_tokens_seen": 3945792, "step": 6845 }, { "epoch": 1.02025618111409, "grad_norm": 0.893743097782135, "learning_rate": 4.99993842168232e-05, "loss": 0.726, "num_input_tokens_seen": 3948320, "step": 6850 }, { "epoch": 1.0210008936550492, "grad_norm": 0.9476691484451294, "learning_rate": 4.999933775876395e-05, "loss": 0.7534, "num_input_tokens_seen": 3951232, "step": 6855 }, { "epoch": 1.0217456061960084, "grad_norm": 0.7170157432556152, "learning_rate": 4.999928961135109e-05, "loss": 0.6669, "num_input_tokens_seen": 3954080, "step": 6860 }, { "epoch": 1.0224903187369676, "grad_norm": 0.6434280872344971, "learning_rate": 4.9999239774587867e-05, "loss": 0.588, "num_input_tokens_seen": 3956768, "step": 6865 }, { "epoch": 1.0232350312779268, "grad_norm": 2.361682653427124, "learning_rate": 4.999918824847767e-05, "loss": 0.6329, "num_input_tokens_seen": 3959712, "step": 6870 }, { "epoch": 1.023979743818886, "grad_norm": 1.4081072807312012, "learning_rate": 4.999913503302397e-05, "loss": 0.6292, "num_input_tokens_seen": 3962336, "step": 6875 }, { "epoch": 1.024724456359845, "grad_norm": 1.8277236223220825, "learning_rate": 4.9999080128230365e-05, "loss": 0.7023, "num_input_tokens_seen": 3965280, "step": 6880 }, { "epoch": 1.0254691689008042, "grad_norm": 2.3271145820617676, "learning_rate": 4.9999023534100565e-05, "loss": 0.6253, "num_input_tokens_seen": 3968512, "step": 6885 }, { "epoch": 1.0262138814417634, "grad_norm": 2.469573974609375, "learning_rate": 4.999896525063839e-05, "loss": 0.702, "num_input_tokens_seen": 3971488, "step": 6890 }, { "epoch": 1.0269585939827226, "grad_norm": 1.452378273010254, "learning_rate": 4.999890527784777e-05, "loss": 0.74, "num_input_tokens_seen": 3974560, "step": 6895 }, { "epoch": 1.0277033065236818, "grad_norm": 2.704505205154419, "learning_rate": 4.999884361573279e-05, "loss": 0.6991, "num_input_tokens_seen": 3977600, "step": 6900 }, { "epoch": 1.028448019064641, "grad_norm": 1.3575319051742554, "learning_rate": 4.999878026429758e-05, "loss": 0.7316, "num_input_tokens_seen": 3980864, "step": 6905 }, { "epoch": 1.0291927316056002, "grad_norm": 1.802601933479309, "learning_rate": 4.999871522354645e-05, "loss": 0.6599, "num_input_tokens_seen": 3983936, "step": 6910 }, { "epoch": 1.0299374441465594, "grad_norm": 1.474386215209961, "learning_rate": 4.999864849348378e-05, "loss": 0.6345, "num_input_tokens_seen": 3986784, "step": 6915 }, { "epoch": 1.0306821566875186, "grad_norm": 1.2227493524551392, "learning_rate": 4.999858007411408e-05, "loss": 0.6934, "num_input_tokens_seen": 3989408, "step": 6920 }, { "epoch": 1.0314268692284778, "grad_norm": 1.224466323852539, "learning_rate": 4.999850996544197e-05, "loss": 0.6049, "num_input_tokens_seen": 3992384, "step": 6925 }, { "epoch": 1.032171581769437, "grad_norm": 1.094625473022461, "learning_rate": 4.999843816747219e-05, "loss": 0.6975, "num_input_tokens_seen": 3994816, "step": 6930 }, { "epoch": 1.0329162943103962, "grad_norm": 1.6904220581054688, "learning_rate": 4.9998364680209605e-05, "loss": 0.7426, "num_input_tokens_seen": 3997472, "step": 6935 }, { "epoch": 1.0336610068513554, "grad_norm": 1.4385563135147095, "learning_rate": 4.999828950365917e-05, "loss": 0.5974, "num_input_tokens_seen": 4000352, "step": 6940 }, { "epoch": 1.0344057193923146, "grad_norm": 2.5269594192504883, "learning_rate": 4.999821263782597e-05, "loss": 0.7766, "num_input_tokens_seen": 4003168, "step": 6945 }, { "epoch": 1.0351504319332738, "grad_norm": 1.146329402923584, "learning_rate": 4.9998134082715184e-05, "loss": 0.5156, "num_input_tokens_seen": 4006016, "step": 6950 }, { "epoch": 1.035895144474233, "grad_norm": 0.7546488046646118, "learning_rate": 4.999805383833214e-05, "loss": 0.5767, "num_input_tokens_seen": 4008992, "step": 6955 }, { "epoch": 1.0366398570151922, "grad_norm": 1.3180924654006958, "learning_rate": 4.999797190468225e-05, "loss": 0.7234, "num_input_tokens_seen": 4011552, "step": 6960 }, { "epoch": 1.0373845695561514, "grad_norm": 1.403376579284668, "learning_rate": 4.999788828177105e-05, "loss": 0.5727, "num_input_tokens_seen": 4014304, "step": 6965 }, { "epoch": 1.0381292820971104, "grad_norm": 1.005878210067749, "learning_rate": 4.9997802969604195e-05, "loss": 0.6356, "num_input_tokens_seen": 4017376, "step": 6970 }, { "epoch": 1.0388739946380696, "grad_norm": 1.4233181476593018, "learning_rate": 4.9997715968187456e-05, "loss": 0.7648, "num_input_tokens_seen": 4020032, "step": 6975 }, { "epoch": 1.0396187071790288, "grad_norm": 1.2121825218200684, "learning_rate": 4.999762727752669e-05, "loss": 0.6479, "num_input_tokens_seen": 4022912, "step": 6980 }, { "epoch": 1.040363419719988, "grad_norm": 1.3301725387573242, "learning_rate": 4.9997536897627915e-05, "loss": 0.6954, "num_input_tokens_seen": 4025632, "step": 6985 }, { "epoch": 1.0411081322609472, "grad_norm": 1.031022071838379, "learning_rate": 4.999744482849723e-05, "loss": 0.6209, "num_input_tokens_seen": 4028448, "step": 6990 }, { "epoch": 1.0418528448019064, "grad_norm": 0.84135901927948, "learning_rate": 4.9997351070140856e-05, "loss": 0.722, "num_input_tokens_seen": 4031424, "step": 6995 }, { "epoch": 1.0425975573428656, "grad_norm": 3.5074069499969482, "learning_rate": 4.999725562256513e-05, "loss": 0.7127, "num_input_tokens_seen": 4034400, "step": 7000 }, { "epoch": 1.0433422698838248, "grad_norm": 1.3280678987503052, "learning_rate": 4.9997158485776493e-05, "loss": 0.6643, "num_input_tokens_seen": 4037440, "step": 7005 }, { "epoch": 1.044086982424784, "grad_norm": 2.8933920860290527, "learning_rate": 4.9997059659781526e-05, "loss": 0.6476, "num_input_tokens_seen": 4040096, "step": 7010 }, { "epoch": 1.0448316949657432, "grad_norm": 1.7645386457443237, "learning_rate": 4.9996959144586895e-05, "loss": 0.7499, "num_input_tokens_seen": 4042944, "step": 7015 }, { "epoch": 1.0455764075067024, "grad_norm": 2.8831560611724854, "learning_rate": 4.999685694019939e-05, "loss": 0.7339, "num_input_tokens_seen": 4045568, "step": 7020 }, { "epoch": 1.0463211200476616, "grad_norm": 1.158742904663086, "learning_rate": 4.9996753046625925e-05, "loss": 0.6403, "num_input_tokens_seen": 4048288, "step": 7025 }, { "epoch": 1.0470658325886208, "grad_norm": 1.284308671951294, "learning_rate": 4.9996647463873525e-05, "loss": 0.5997, "num_input_tokens_seen": 4051168, "step": 7030 }, { "epoch": 1.04781054512958, "grad_norm": 1.951992154121399, "learning_rate": 4.999654019194931e-05, "loss": 0.6126, "num_input_tokens_seen": 4053728, "step": 7035 }, { "epoch": 1.0485552576705393, "grad_norm": 1.2340713739395142, "learning_rate": 4.9996431230860544e-05, "loss": 0.7349, "num_input_tokens_seen": 4056768, "step": 7040 }, { "epoch": 1.0492999702114985, "grad_norm": 1.3048056364059448, "learning_rate": 4.999632058061457e-05, "loss": 0.7035, "num_input_tokens_seen": 4059648, "step": 7045 }, { "epoch": 1.0500446827524577, "grad_norm": 0.9836651682853699, "learning_rate": 4.999620824121889e-05, "loss": 0.5791, "num_input_tokens_seen": 4062208, "step": 7050 }, { "epoch": 1.0507893952934166, "grad_norm": 2.2954261302948, "learning_rate": 4.999609421268109e-05, "loss": 0.7654, "num_input_tokens_seen": 4064960, "step": 7055 }, { "epoch": 1.0515341078343758, "grad_norm": 1.0670150518417358, "learning_rate": 4.999597849500886e-05, "loss": 0.6789, "num_input_tokens_seen": 4067680, "step": 7060 }, { "epoch": 1.052278820375335, "grad_norm": 1.3560718297958374, "learning_rate": 4.999586108821003e-05, "loss": 0.6513, "num_input_tokens_seen": 4070464, "step": 7065 }, { "epoch": 1.0530235329162942, "grad_norm": 2.157275438308716, "learning_rate": 4.999574199229254e-05, "loss": 0.6379, "num_input_tokens_seen": 4073408, "step": 7070 }, { "epoch": 1.0537682454572534, "grad_norm": 1.5255953073501587, "learning_rate": 4.9995621207264426e-05, "loss": 0.5667, "num_input_tokens_seen": 4076160, "step": 7075 }, { "epoch": 1.0545129579982127, "grad_norm": 2.400993824005127, "learning_rate": 4.9995498733133864e-05, "loss": 0.6824, "num_input_tokens_seen": 4079168, "step": 7080 }, { "epoch": 1.0552576705391719, "grad_norm": 3.313823699951172, "learning_rate": 4.9995374569909105e-05, "loss": 0.9827, "num_input_tokens_seen": 4082208, "step": 7085 }, { "epoch": 1.056002383080131, "grad_norm": 4.167532920837402, "learning_rate": 4.999524871759857e-05, "loss": 0.6517, "num_input_tokens_seen": 4085024, "step": 7090 }, { "epoch": 1.0567470956210903, "grad_norm": 1.2008633613586426, "learning_rate": 4.999512117621075e-05, "loss": 0.6977, "num_input_tokens_seen": 4087968, "step": 7095 }, { "epoch": 1.0574918081620495, "grad_norm": 1.322943091392517, "learning_rate": 4.999499194575426e-05, "loss": 0.7291, "num_input_tokens_seen": 4091008, "step": 7100 }, { "epoch": 1.0582365207030087, "grad_norm": 1.381914496421814, "learning_rate": 4.9994861026237826e-05, "loss": 0.6332, "num_input_tokens_seen": 4094080, "step": 7105 }, { "epoch": 1.0589812332439679, "grad_norm": 1.0335845947265625, "learning_rate": 4.999472841767032e-05, "loss": 0.7255, "num_input_tokens_seen": 4096672, "step": 7110 }, { "epoch": 1.059725945784927, "grad_norm": 1.2619844675064087, "learning_rate": 4.999459412006069e-05, "loss": 0.6537, "num_input_tokens_seen": 4099424, "step": 7115 }, { "epoch": 1.0604706583258863, "grad_norm": 2.0195460319519043, "learning_rate": 4.9994458133418e-05, "loss": 0.7368, "num_input_tokens_seen": 4102752, "step": 7120 }, { "epoch": 1.0612153708668455, "grad_norm": 1.8959187269210815, "learning_rate": 4.9994320457751456e-05, "loss": 0.6784, "num_input_tokens_seen": 4105792, "step": 7125 }, { "epoch": 1.0619600834078047, "grad_norm": 1.1386758089065552, "learning_rate": 4.9994181093070345e-05, "loss": 0.7269, "num_input_tokens_seen": 4109152, "step": 7130 }, { "epoch": 1.0627047959487639, "grad_norm": 0.7635043263435364, "learning_rate": 4.9994040039384104e-05, "loss": 0.7818, "num_input_tokens_seen": 4112192, "step": 7135 }, { "epoch": 1.063449508489723, "grad_norm": 1.040499210357666, "learning_rate": 4.999389729670226e-05, "loss": 0.7749, "num_input_tokens_seen": 4115072, "step": 7140 }, { "epoch": 1.064194221030682, "grad_norm": 1.0246917009353638, "learning_rate": 4.999375286503445e-05, "loss": 0.6945, "num_input_tokens_seen": 4117760, "step": 7145 }, { "epoch": 1.0649389335716413, "grad_norm": 2.613391637802124, "learning_rate": 4.999360674439043e-05, "loss": 0.7165, "num_input_tokens_seen": 4120704, "step": 7150 }, { "epoch": 1.0656836461126005, "grad_norm": 0.8495751619338989, "learning_rate": 4.999345893478009e-05, "loss": 0.5669, "num_input_tokens_seen": 4123712, "step": 7155 }, { "epoch": 1.0664283586535597, "grad_norm": 1.3172184228897095, "learning_rate": 4.9993309436213415e-05, "loss": 0.7727, "num_input_tokens_seen": 4126528, "step": 7160 }, { "epoch": 1.0671730711945189, "grad_norm": 1.1016802787780762, "learning_rate": 4.99931582487005e-05, "loss": 0.5589, "num_input_tokens_seen": 4129312, "step": 7165 }, { "epoch": 1.067917783735478, "grad_norm": 3.551872968673706, "learning_rate": 4.999300537225157e-05, "loss": 0.6282, "num_input_tokens_seen": 4132192, "step": 7170 }, { "epoch": 1.0686624962764373, "grad_norm": 1.3114081621170044, "learning_rate": 4.999285080687694e-05, "loss": 0.8233, "num_input_tokens_seen": 4134848, "step": 7175 }, { "epoch": 1.0694072088173965, "grad_norm": 1.2435072660446167, "learning_rate": 4.999269455258707e-05, "loss": 0.7485, "num_input_tokens_seen": 4137952, "step": 7180 }, { "epoch": 1.0701519213583557, "grad_norm": 1.3819094896316528, "learning_rate": 4.999253660939251e-05, "loss": 0.8786, "num_input_tokens_seen": 4140896, "step": 7185 }, { "epoch": 1.0708966338993149, "grad_norm": 1.6593550443649292, "learning_rate": 4.999237697730396e-05, "loss": 0.7519, "num_input_tokens_seen": 4143584, "step": 7190 }, { "epoch": 1.071641346440274, "grad_norm": 1.3832836151123047, "learning_rate": 4.9992215656332166e-05, "loss": 0.7128, "num_input_tokens_seen": 4146496, "step": 7195 }, { "epoch": 1.0723860589812333, "grad_norm": 1.9856237173080444, "learning_rate": 4.999205264648805e-05, "loss": 0.7291, "num_input_tokens_seen": 4149728, "step": 7200 }, { "epoch": 1.0731307715221925, "grad_norm": 1.1434614658355713, "learning_rate": 4.999188794778263e-05, "loss": 0.6187, "num_input_tokens_seen": 4152480, "step": 7205 }, { "epoch": 1.0738754840631517, "grad_norm": 1.6440849304199219, "learning_rate": 4.999172156022703e-05, "loss": 0.6553, "num_input_tokens_seen": 4155680, "step": 7210 }, { "epoch": 1.074620196604111, "grad_norm": 1.234533667564392, "learning_rate": 4.9991553483832506e-05, "loss": 0.6488, "num_input_tokens_seen": 4158272, "step": 7215 }, { "epoch": 1.07536490914507, "grad_norm": 0.9430580735206604, "learning_rate": 4.9991383718610397e-05, "loss": 0.818, "num_input_tokens_seen": 4160960, "step": 7220 }, { "epoch": 1.076109621686029, "grad_norm": 0.82563716173172, "learning_rate": 4.999121226457219e-05, "loss": 0.5967, "num_input_tokens_seen": 4164032, "step": 7225 }, { "epoch": 1.0768543342269883, "grad_norm": 1.3264394998550415, "learning_rate": 4.999103912172945e-05, "loss": 0.6856, "num_input_tokens_seen": 4167040, "step": 7230 }, { "epoch": 1.0775990467679475, "grad_norm": 1.4591648578643799, "learning_rate": 4.999086429009391e-05, "loss": 0.6438, "num_input_tokens_seen": 4169888, "step": 7235 }, { "epoch": 1.0783437593089067, "grad_norm": 0.8520366549491882, "learning_rate": 4.999068776967736e-05, "loss": 0.7801, "num_input_tokens_seen": 4172768, "step": 7240 }, { "epoch": 1.079088471849866, "grad_norm": 1.0649734735488892, "learning_rate": 4.999050956049173e-05, "loss": 0.6393, "num_input_tokens_seen": 4175552, "step": 7245 }, { "epoch": 1.079833184390825, "grad_norm": 1.1295082569122314, "learning_rate": 4.999032966254907e-05, "loss": 0.7062, "num_input_tokens_seen": 4178528, "step": 7250 }, { "epoch": 1.0805778969317843, "grad_norm": 0.9422203898429871, "learning_rate": 4.999014807586154e-05, "loss": 0.6904, "num_input_tokens_seen": 4181376, "step": 7255 }, { "epoch": 1.0813226094727435, "grad_norm": 2.1819138526916504, "learning_rate": 4.99899648004414e-05, "loss": 0.7419, "num_input_tokens_seen": 4184160, "step": 7260 }, { "epoch": 1.0820673220137027, "grad_norm": 1.1022865772247314, "learning_rate": 4.998977983630104e-05, "loss": 0.6562, "num_input_tokens_seen": 4186784, "step": 7265 }, { "epoch": 1.082812034554662, "grad_norm": 1.013512134552002, "learning_rate": 4.9989593183452965e-05, "loss": 0.6393, "num_input_tokens_seen": 4189472, "step": 7270 }, { "epoch": 1.083556747095621, "grad_norm": 0.8898680210113525, "learning_rate": 4.9989404841909784e-05, "loss": 0.5326, "num_input_tokens_seen": 4192768, "step": 7275 }, { "epoch": 1.0843014596365803, "grad_norm": 0.755944550037384, "learning_rate": 4.998921481168421e-05, "loss": 0.6924, "num_input_tokens_seen": 4195488, "step": 7280 }, { "epoch": 1.0850461721775395, "grad_norm": 1.3861461877822876, "learning_rate": 4.9989023092789113e-05, "loss": 0.5958, "num_input_tokens_seen": 4198528, "step": 7285 }, { "epoch": 1.0857908847184987, "grad_norm": 1.810125470161438, "learning_rate": 4.998882968523743e-05, "loss": 0.789, "num_input_tokens_seen": 4201312, "step": 7290 }, { "epoch": 1.086535597259458, "grad_norm": 0.9661622047424316, "learning_rate": 4.9988634589042227e-05, "loss": 0.647, "num_input_tokens_seen": 4204032, "step": 7295 }, { "epoch": 1.0872803098004171, "grad_norm": 1.1034924983978271, "learning_rate": 4.9988437804216704e-05, "loss": 0.6916, "num_input_tokens_seen": 4207200, "step": 7300 }, { "epoch": 1.0880250223413763, "grad_norm": 1.263254165649414, "learning_rate": 4.998823933077414e-05, "loss": 0.7015, "num_input_tokens_seen": 4210592, "step": 7305 }, { "epoch": 1.0887697348823355, "grad_norm": 1.4634764194488525, "learning_rate": 4.998803916872797e-05, "loss": 0.9674, "num_input_tokens_seen": 4213888, "step": 7310 }, { "epoch": 1.0895144474232945, "grad_norm": 0.8800743222236633, "learning_rate": 4.99878373180917e-05, "loss": 0.6457, "num_input_tokens_seen": 4216544, "step": 7315 }, { "epoch": 1.0902591599642537, "grad_norm": 1.5586854219436646, "learning_rate": 4.9987633778878975e-05, "loss": 0.8053, "num_input_tokens_seen": 4219328, "step": 7320 }, { "epoch": 1.091003872505213, "grad_norm": 2.3543949127197266, "learning_rate": 4.9987428551103554e-05, "loss": 0.7166, "num_input_tokens_seen": 4222144, "step": 7325 }, { "epoch": 1.0917485850461721, "grad_norm": 1.532441258430481, "learning_rate": 4.9987221634779303e-05, "loss": 0.6681, "num_input_tokens_seen": 4225056, "step": 7330 }, { "epoch": 1.0924932975871313, "grad_norm": 1.2623316049575806, "learning_rate": 4.99870130299202e-05, "loss": 0.6376, "num_input_tokens_seen": 4228160, "step": 7335 }, { "epoch": 1.0932380101280905, "grad_norm": 1.5588784217834473, "learning_rate": 4.998680273654035e-05, "loss": 0.6247, "num_input_tokens_seen": 4230976, "step": 7340 }, { "epoch": 1.0939827226690497, "grad_norm": 2.320584774017334, "learning_rate": 4.998659075465396e-05, "loss": 0.7456, "num_input_tokens_seen": 4233792, "step": 7345 }, { "epoch": 1.094727435210009, "grad_norm": 2.07173228263855, "learning_rate": 4.998637708427536e-05, "loss": 0.6958, "num_input_tokens_seen": 4236992, "step": 7350 }, { "epoch": 1.0954721477509681, "grad_norm": 1.6019560098648071, "learning_rate": 4.998616172541898e-05, "loss": 0.6617, "num_input_tokens_seen": 4239712, "step": 7355 }, { "epoch": 1.0962168602919273, "grad_norm": 1.6713706254959106, "learning_rate": 4.9985944678099374e-05, "loss": 0.828, "num_input_tokens_seen": 4242464, "step": 7360 }, { "epoch": 1.0969615728328865, "grad_norm": 1.7593486309051514, "learning_rate": 4.998572594233121e-05, "loss": 0.659, "num_input_tokens_seen": 4245408, "step": 7365 }, { "epoch": 1.0977062853738457, "grad_norm": 1.3024927377700806, "learning_rate": 4.998550551812927e-05, "loss": 0.6657, "num_input_tokens_seen": 4248416, "step": 7370 }, { "epoch": 1.098450997914805, "grad_norm": 1.2056193351745605, "learning_rate": 4.998528340550846e-05, "loss": 0.7718, "num_input_tokens_seen": 4251584, "step": 7375 }, { "epoch": 1.0991957104557641, "grad_norm": 0.9182434678077698, "learning_rate": 4.998505960448377e-05, "loss": 0.5802, "num_input_tokens_seen": 4254528, "step": 7380 }, { "epoch": 1.0999404229967233, "grad_norm": 1.379281759262085, "learning_rate": 4.998483411507034e-05, "loss": 0.5921, "num_input_tokens_seen": 4257376, "step": 7385 }, { "epoch": 1.1006851355376825, "grad_norm": 0.9754471182823181, "learning_rate": 4.9984606937283405e-05, "loss": 0.5513, "num_input_tokens_seen": 4260288, "step": 7390 }, { "epoch": 1.1014298480786415, "grad_norm": 1.4409916400909424, "learning_rate": 4.9984378071138315e-05, "loss": 0.65, "num_input_tokens_seen": 4263200, "step": 7395 }, { "epoch": 1.1021745606196007, "grad_norm": 1.0546388626098633, "learning_rate": 4.998414751665053e-05, "loss": 0.6079, "num_input_tokens_seen": 4266048, "step": 7400 }, { "epoch": 1.10291927316056, "grad_norm": 1.5950164794921875, "learning_rate": 4.998391527383563e-05, "loss": 0.7482, "num_input_tokens_seen": 4269088, "step": 7405 }, { "epoch": 1.1036639857015191, "grad_norm": 1.8171484470367432, "learning_rate": 4.9983681342709316e-05, "loss": 0.6269, "num_input_tokens_seen": 4272192, "step": 7410 }, { "epoch": 1.1044086982424783, "grad_norm": 1.924712896347046, "learning_rate": 4.998344572328739e-05, "loss": 0.6271, "num_input_tokens_seen": 4275200, "step": 7415 }, { "epoch": 1.1051534107834375, "grad_norm": 1.9591466188430786, "learning_rate": 4.998320841558578e-05, "loss": 0.682, "num_input_tokens_seen": 4278368, "step": 7420 }, { "epoch": 1.1058981233243967, "grad_norm": 0.7776514887809753, "learning_rate": 4.9982969419620516e-05, "loss": 0.6737, "num_input_tokens_seen": 4281152, "step": 7425 }, { "epoch": 1.106642835865356, "grad_norm": 1.310334324836731, "learning_rate": 4.9982728735407756e-05, "loss": 0.7484, "num_input_tokens_seen": 4283840, "step": 7430 }, { "epoch": 1.1073875484063151, "grad_norm": 2.355752944946289, "learning_rate": 4.998248636296377e-05, "loss": 0.6423, "num_input_tokens_seen": 4286432, "step": 7435 }, { "epoch": 1.1081322609472744, "grad_norm": 2.1401889324188232, "learning_rate": 4.998224230230491e-05, "loss": 0.6897, "num_input_tokens_seen": 4289088, "step": 7440 }, { "epoch": 1.1088769734882336, "grad_norm": 1.6981282234191895, "learning_rate": 4.9981996553447695e-05, "loss": 0.8966, "num_input_tokens_seen": 4291712, "step": 7445 }, { "epoch": 1.1096216860291928, "grad_norm": 1.8105189800262451, "learning_rate": 4.998174911640872e-05, "loss": 0.7089, "num_input_tokens_seen": 4294432, "step": 7450 }, { "epoch": 1.110366398570152, "grad_norm": 1.1903752088546753, "learning_rate": 4.9981499991204704e-05, "loss": 0.6191, "num_input_tokens_seen": 4297184, "step": 7455 }, { "epoch": 1.1111111111111112, "grad_norm": 1.6870880126953125, "learning_rate": 4.998124917785249e-05, "loss": 0.7084, "num_input_tokens_seen": 4299904, "step": 7460 }, { "epoch": 1.1118558236520704, "grad_norm": 1.2266751527786255, "learning_rate": 4.9980996676369026e-05, "loss": 0.5055, "num_input_tokens_seen": 4303008, "step": 7465 }, { "epoch": 1.1126005361930296, "grad_norm": 1.0184317827224731, "learning_rate": 4.998074248677137e-05, "loss": 0.6281, "num_input_tokens_seen": 4305888, "step": 7470 }, { "epoch": 1.1133452487339888, "grad_norm": 3.2047922611236572, "learning_rate": 4.9980486609076695e-05, "loss": 0.8312, "num_input_tokens_seen": 4309024, "step": 7475 }, { "epoch": 1.114089961274948, "grad_norm": 1.3173623085021973, "learning_rate": 4.998022904330231e-05, "loss": 0.7738, "num_input_tokens_seen": 4311936, "step": 7480 }, { "epoch": 1.1148346738159072, "grad_norm": 1.5572893619537354, "learning_rate": 4.9979969789465594e-05, "loss": 0.7303, "num_input_tokens_seen": 4314784, "step": 7485 }, { "epoch": 1.1155793863568662, "grad_norm": 0.7133344411849976, "learning_rate": 4.9979708847584095e-05, "loss": 0.6653, "num_input_tokens_seen": 4317792, "step": 7490 }, { "epoch": 1.1163240988978254, "grad_norm": 2.880979061126709, "learning_rate": 4.9979446217675416e-05, "loss": 0.7906, "num_input_tokens_seen": 4320768, "step": 7495 }, { "epoch": 1.1170688114387846, "grad_norm": 1.4763953685760498, "learning_rate": 4.997918189975733e-05, "loss": 0.6502, "num_input_tokens_seen": 4323456, "step": 7500 }, { "epoch": 1.1178135239797438, "grad_norm": 3.522080183029175, "learning_rate": 4.99789158938477e-05, "loss": 0.7933, "num_input_tokens_seen": 4326432, "step": 7505 }, { "epoch": 1.118558236520703, "grad_norm": 1.636521816253662, "learning_rate": 4.9978648199964476e-05, "loss": 0.6241, "num_input_tokens_seen": 4329504, "step": 7510 }, { "epoch": 1.1193029490616622, "grad_norm": 1.6877681016921997, "learning_rate": 4.997837881812577e-05, "loss": 0.6677, "num_input_tokens_seen": 4332576, "step": 7515 }, { "epoch": 1.1200476616026214, "grad_norm": 2.356365442276001, "learning_rate": 4.997810774834977e-05, "loss": 0.7303, "num_input_tokens_seen": 4335520, "step": 7520 }, { "epoch": 1.1207923741435806, "grad_norm": 1.609880805015564, "learning_rate": 4.9977834990654804e-05, "loss": 0.5664, "num_input_tokens_seen": 4338400, "step": 7525 }, { "epoch": 1.1215370866845398, "grad_norm": 1.4093480110168457, "learning_rate": 4.997756054505931e-05, "loss": 0.7836, "num_input_tokens_seen": 4341280, "step": 7530 }, { "epoch": 1.122281799225499, "grad_norm": 4.275190830230713, "learning_rate": 4.9977284411581816e-05, "loss": 0.7353, "num_input_tokens_seen": 4344320, "step": 7535 }, { "epoch": 1.1230265117664582, "grad_norm": 1.358870267868042, "learning_rate": 4.997700659024099e-05, "loss": 0.6085, "num_input_tokens_seen": 4347040, "step": 7540 }, { "epoch": 1.1237712243074174, "grad_norm": 3.067258596420288, "learning_rate": 4.997672708105562e-05, "loss": 0.7186, "num_input_tokens_seen": 4350016, "step": 7545 }, { "epoch": 1.1245159368483766, "grad_norm": 1.1926063299179077, "learning_rate": 4.9976445884044575e-05, "loss": 0.6283, "num_input_tokens_seen": 4352768, "step": 7550 }, { "epoch": 1.1252606493893358, "grad_norm": 1.7132282257080078, "learning_rate": 4.9976162999226865e-05, "loss": 0.6841, "num_input_tokens_seen": 4355520, "step": 7555 }, { "epoch": 1.126005361930295, "grad_norm": 1.4857561588287354, "learning_rate": 4.9975878426621605e-05, "loss": 0.8478, "num_input_tokens_seen": 4358272, "step": 7560 }, { "epoch": 1.1267500744712542, "grad_norm": 0.9262312650680542, "learning_rate": 4.9975592166248025e-05, "loss": 0.5541, "num_input_tokens_seen": 4361280, "step": 7565 }, { "epoch": 1.1274947870122132, "grad_norm": 1.0372477769851685, "learning_rate": 4.997530421812547e-05, "loss": 0.8079, "num_input_tokens_seen": 4364416, "step": 7570 }, { "epoch": 1.1282394995531724, "grad_norm": 0.8328186273574829, "learning_rate": 4.997501458227339e-05, "loss": 0.6712, "num_input_tokens_seen": 4367360, "step": 7575 }, { "epoch": 1.1289842120941316, "grad_norm": 1.1809139251708984, "learning_rate": 4.997472325871138e-05, "loss": 0.713, "num_input_tokens_seen": 4370304, "step": 7580 }, { "epoch": 1.1297289246350908, "grad_norm": 1.5647248029708862, "learning_rate": 4.9974430247459106e-05, "loss": 0.7347, "num_input_tokens_seen": 4373152, "step": 7585 }, { "epoch": 1.13047363717605, "grad_norm": 1.350925326347351, "learning_rate": 4.997413554853637e-05, "loss": 0.661, "num_input_tokens_seen": 4376096, "step": 7590 }, { "epoch": 1.1312183497170092, "grad_norm": 0.9575570225715637, "learning_rate": 4.99738391619631e-05, "loss": 0.8127, "num_input_tokens_seen": 4379040, "step": 7595 }, { "epoch": 1.1319630622579684, "grad_norm": 1.3114748001098633, "learning_rate": 4.997354108775931e-05, "loss": 0.7021, "num_input_tokens_seen": 4381664, "step": 7600 }, { "epoch": 1.1327077747989276, "grad_norm": 0.9229914546012878, "learning_rate": 4.997324132594515e-05, "loss": 0.6444, "num_input_tokens_seen": 4384288, "step": 7605 }, { "epoch": 1.1334524873398868, "grad_norm": 1.2337396144866943, "learning_rate": 4.997293987654087e-05, "loss": 0.5598, "num_input_tokens_seen": 4387264, "step": 7610 }, { "epoch": 1.134197199880846, "grad_norm": 1.1966354846954346, "learning_rate": 4.997263673956685e-05, "loss": 0.7161, "num_input_tokens_seen": 4389952, "step": 7615 }, { "epoch": 1.1349419124218052, "grad_norm": 1.4573100805282593, "learning_rate": 4.9972331915043575e-05, "loss": 0.7313, "num_input_tokens_seen": 4393088, "step": 7620 }, { "epoch": 1.1356866249627644, "grad_norm": 1.664757490158081, "learning_rate": 4.997202540299163e-05, "loss": 0.7032, "num_input_tokens_seen": 4395968, "step": 7625 }, { "epoch": 1.1364313375037236, "grad_norm": 1.270688772201538, "learning_rate": 4.997171720343175e-05, "loss": 0.6122, "num_input_tokens_seen": 4398912, "step": 7630 }, { "epoch": 1.1371760500446828, "grad_norm": 1.1730735301971436, "learning_rate": 4.9971407316384736e-05, "loss": 0.6998, "num_input_tokens_seen": 4401824, "step": 7635 }, { "epoch": 1.137920762585642, "grad_norm": 1.283179759979248, "learning_rate": 4.997109574187154e-05, "loss": 0.7254, "num_input_tokens_seen": 4404704, "step": 7640 }, { "epoch": 1.1386654751266012, "grad_norm": 0.7150104641914368, "learning_rate": 4.997078247991323e-05, "loss": 0.5737, "num_input_tokens_seen": 4407936, "step": 7645 }, { "epoch": 1.1394101876675604, "grad_norm": 1.4902349710464478, "learning_rate": 4.9970467530530964e-05, "loss": 0.6746, "num_input_tokens_seen": 4410784, "step": 7650 }, { "epoch": 1.1401549002085196, "grad_norm": 0.9201971888542175, "learning_rate": 4.9970150893746016e-05, "loss": 0.7812, "num_input_tokens_seen": 4413440, "step": 7655 }, { "epoch": 1.1408996127494788, "grad_norm": 1.750571846961975, "learning_rate": 4.99698325695798e-05, "loss": 0.5848, "num_input_tokens_seen": 4416000, "step": 7660 }, { "epoch": 1.1416443252904378, "grad_norm": 0.7817227840423584, "learning_rate": 4.996951255805381e-05, "loss": 0.5429, "num_input_tokens_seen": 4418816, "step": 7665 }, { "epoch": 1.142389037831397, "grad_norm": 0.8183354139328003, "learning_rate": 4.996919085918969e-05, "loss": 0.72, "num_input_tokens_seen": 4421792, "step": 7670 }, { "epoch": 1.1431337503723562, "grad_norm": 1.6944129467010498, "learning_rate": 4.996886747300916e-05, "loss": 0.5768, "num_input_tokens_seen": 4424640, "step": 7675 }, { "epoch": 1.1438784629133154, "grad_norm": 1.9894640445709229, "learning_rate": 4.996854239953409e-05, "loss": 0.7032, "num_input_tokens_seen": 4427392, "step": 7680 }, { "epoch": 1.1446231754542746, "grad_norm": 1.417657494544983, "learning_rate": 4.996821563878643e-05, "loss": 0.7669, "num_input_tokens_seen": 4430400, "step": 7685 }, { "epoch": 1.1453678879952338, "grad_norm": 1.4507737159729004, "learning_rate": 4.9967887190788274e-05, "loss": 0.5206, "num_input_tokens_seen": 4432992, "step": 7690 }, { "epoch": 1.146112600536193, "grad_norm": 1.3298759460449219, "learning_rate": 4.996755705556182e-05, "loss": 0.5633, "num_input_tokens_seen": 4436032, "step": 7695 }, { "epoch": 1.1468573130771522, "grad_norm": 1.7319647073745728, "learning_rate": 4.9967225233129366e-05, "loss": 0.7104, "num_input_tokens_seen": 4438944, "step": 7700 }, { "epoch": 1.1476020256181114, "grad_norm": 1.635777235031128, "learning_rate": 4.9966891723513344e-05, "loss": 0.9416, "num_input_tokens_seen": 4441632, "step": 7705 }, { "epoch": 1.1483467381590706, "grad_norm": 2.597432851791382, "learning_rate": 4.996655652673628e-05, "loss": 0.7606, "num_input_tokens_seen": 4444832, "step": 7710 }, { "epoch": 1.1490914507000298, "grad_norm": 1.408261775970459, "learning_rate": 4.9966219642820834e-05, "loss": 0.7466, "num_input_tokens_seen": 4447616, "step": 7715 }, { "epoch": 1.149836163240989, "grad_norm": 0.8250597715377808, "learning_rate": 4.996588107178977e-05, "loss": 0.5822, "num_input_tokens_seen": 4450496, "step": 7720 }, { "epoch": 1.1505808757819482, "grad_norm": 0.8035179972648621, "learning_rate": 4.996554081366597e-05, "loss": 0.6089, "num_input_tokens_seen": 4453312, "step": 7725 }, { "epoch": 1.1513255883229074, "grad_norm": 0.6133168935775757, "learning_rate": 4.996519886847243e-05, "loss": 0.6662, "num_input_tokens_seen": 4456608, "step": 7730 }, { "epoch": 1.1520703008638666, "grad_norm": 1.2318812608718872, "learning_rate": 4.996485523623224e-05, "loss": 0.7929, "num_input_tokens_seen": 4459744, "step": 7735 }, { "epoch": 1.1528150134048256, "grad_norm": 1.2192926406860352, "learning_rate": 4.996450991696864e-05, "loss": 0.6735, "num_input_tokens_seen": 4462624, "step": 7740 }, { "epoch": 1.1535597259457848, "grad_norm": 1.0610597133636475, "learning_rate": 4.996416291070495e-05, "loss": 0.6969, "num_input_tokens_seen": 4465856, "step": 7745 }, { "epoch": 1.154304438486744, "grad_norm": 0.9452621936798096, "learning_rate": 4.996381421746464e-05, "loss": 0.7272, "num_input_tokens_seen": 4468704, "step": 7750 }, { "epoch": 1.1550491510277032, "grad_norm": 0.6849831342697144, "learning_rate": 4.9963463837271254e-05, "loss": 0.6737, "num_input_tokens_seen": 4471776, "step": 7755 }, { "epoch": 1.1557938635686624, "grad_norm": 2.225487232208252, "learning_rate": 4.996311177014847e-05, "loss": 0.7775, "num_input_tokens_seen": 4474848, "step": 7760 }, { "epoch": 1.1565385761096216, "grad_norm": 0.6411910057067871, "learning_rate": 4.9962758016120095e-05, "loss": 0.7184, "num_input_tokens_seen": 4477760, "step": 7765 }, { "epoch": 1.1572832886505808, "grad_norm": 0.9453091025352478, "learning_rate": 4.996240257521002e-05, "loss": 0.6397, "num_input_tokens_seen": 4480800, "step": 7770 }, { "epoch": 1.15802800119154, "grad_norm": 0.8973898887634277, "learning_rate": 4.996204544744227e-05, "loss": 0.5986, "num_input_tokens_seen": 4483936, "step": 7775 }, { "epoch": 1.1587727137324992, "grad_norm": 1.6004115343093872, "learning_rate": 4.9961686632840976e-05, "loss": 0.5943, "num_input_tokens_seen": 4486752, "step": 7780 }, { "epoch": 1.1595174262734584, "grad_norm": 1.9924520254135132, "learning_rate": 4.9961326131430386e-05, "loss": 0.67, "num_input_tokens_seen": 4489472, "step": 7785 }, { "epoch": 1.1602621388144176, "grad_norm": 3.1038200855255127, "learning_rate": 4.996096394323486e-05, "loss": 0.6746, "num_input_tokens_seen": 4492256, "step": 7790 }, { "epoch": 1.1610068513553768, "grad_norm": 0.864232063293457, "learning_rate": 4.9960600068278876e-05, "loss": 0.6156, "num_input_tokens_seen": 4495072, "step": 7795 }, { "epoch": 1.161751563896336, "grad_norm": 1.0873827934265137, "learning_rate": 4.9960234506587024e-05, "loss": 0.6053, "num_input_tokens_seen": 4498656, "step": 7800 }, { "epoch": 1.1624962764372953, "grad_norm": 1.1881885528564453, "learning_rate": 4.9959867258184e-05, "loss": 0.6681, "num_input_tokens_seen": 4501280, "step": 7805 }, { "epoch": 1.1632409889782545, "grad_norm": 1.3538084030151367, "learning_rate": 4.9959498323094636e-05, "loss": 0.8067, "num_input_tokens_seen": 4504064, "step": 7810 }, { "epoch": 1.1639857015192137, "grad_norm": 1.2443383932113647, "learning_rate": 4.9959127701343844e-05, "loss": 0.6176, "num_input_tokens_seen": 4506816, "step": 7815 }, { "epoch": 1.1647304140601729, "grad_norm": 1.1109079122543335, "learning_rate": 4.995875539295668e-05, "loss": 0.7613, "num_input_tokens_seen": 4509280, "step": 7820 }, { "epoch": 1.165475126601132, "grad_norm": 1.8507691621780396, "learning_rate": 4.9958381397958305e-05, "loss": 0.8877, "num_input_tokens_seen": 4512192, "step": 7825 }, { "epoch": 1.1662198391420913, "grad_norm": 1.2081986665725708, "learning_rate": 4.995800571637399e-05, "loss": 0.5771, "num_input_tokens_seen": 4515008, "step": 7830 }, { "epoch": 1.1669645516830505, "grad_norm": 0.6359639763832092, "learning_rate": 4.995762834822911e-05, "loss": 0.6195, "num_input_tokens_seen": 4517856, "step": 7835 }, { "epoch": 1.1677092642240094, "grad_norm": 1.3227441310882568, "learning_rate": 4.995724929354918e-05, "loss": 0.6874, "num_input_tokens_seen": 4521024, "step": 7840 }, { "epoch": 1.1684539767649686, "grad_norm": 0.8905327916145325, "learning_rate": 4.9956868552359816e-05, "loss": 0.641, "num_input_tokens_seen": 4523840, "step": 7845 }, { "epoch": 1.1691986893059279, "grad_norm": 1.361651062965393, "learning_rate": 4.995648612468674e-05, "loss": 0.7251, "num_input_tokens_seen": 4526528, "step": 7850 }, { "epoch": 1.169943401846887, "grad_norm": 1.8792214393615723, "learning_rate": 4.9956102010555806e-05, "loss": 0.6547, "num_input_tokens_seen": 4529312, "step": 7855 }, { "epoch": 1.1706881143878463, "grad_norm": 2.2047150135040283, "learning_rate": 4.9955716209992956e-05, "loss": 0.631, "num_input_tokens_seen": 4531840, "step": 7860 }, { "epoch": 1.1714328269288055, "grad_norm": 1.169533133506775, "learning_rate": 4.9955328723024263e-05, "loss": 0.6552, "num_input_tokens_seen": 4534752, "step": 7865 }, { "epoch": 1.1721775394697647, "grad_norm": 1.1045254468917847, "learning_rate": 4.995493954967592e-05, "loss": 0.62, "num_input_tokens_seen": 4537408, "step": 7870 }, { "epoch": 1.1729222520107239, "grad_norm": 1.241463541984558, "learning_rate": 4.995454868997421e-05, "loss": 0.715, "num_input_tokens_seen": 4540736, "step": 7875 }, { "epoch": 1.173666964551683, "grad_norm": 1.1892755031585693, "learning_rate": 4.9954156143945575e-05, "loss": 0.5697, "num_input_tokens_seen": 4543584, "step": 7880 }, { "epoch": 1.1744116770926423, "grad_norm": 1.627062201499939, "learning_rate": 4.9953761911616515e-05, "loss": 0.6582, "num_input_tokens_seen": 4546656, "step": 7885 }, { "epoch": 1.1751563896336015, "grad_norm": 3.279991388320923, "learning_rate": 4.995336599301368e-05, "loss": 0.7498, "num_input_tokens_seen": 4549376, "step": 7890 }, { "epoch": 1.1759011021745607, "grad_norm": 1.3485115766525269, "learning_rate": 4.9952968388163826e-05, "loss": 0.7204, "num_input_tokens_seen": 4552128, "step": 7895 }, { "epoch": 1.1766458147155199, "grad_norm": 1.442249059677124, "learning_rate": 4.995256909709382e-05, "loss": 0.8326, "num_input_tokens_seen": 4554720, "step": 7900 }, { "epoch": 1.177390527256479, "grad_norm": 1.0833771228790283, "learning_rate": 4.9952168119830644e-05, "loss": 0.5168, "num_input_tokens_seen": 4557472, "step": 7905 }, { "epoch": 1.1781352397974383, "grad_norm": 1.5387766361236572, "learning_rate": 4.995176545640139e-05, "loss": 0.6928, "num_input_tokens_seen": 4560576, "step": 7910 }, { "epoch": 1.1788799523383973, "grad_norm": 1.8395310640335083, "learning_rate": 4.995136110683328e-05, "loss": 0.6617, "num_input_tokens_seen": 4563552, "step": 7915 }, { "epoch": 1.1796246648793565, "grad_norm": 0.7145835757255554, "learning_rate": 4.995095507115363e-05, "loss": 0.6873, "num_input_tokens_seen": 4566208, "step": 7920 }, { "epoch": 1.1803693774203157, "grad_norm": 1.3973793983459473, "learning_rate": 4.9950547349389873e-05, "loss": 0.5974, "num_input_tokens_seen": 4569248, "step": 7925 }, { "epoch": 1.1811140899612749, "grad_norm": 1.0631299018859863, "learning_rate": 4.995013794156957e-05, "loss": 0.5979, "num_input_tokens_seen": 4571904, "step": 7930 }, { "epoch": 1.181858802502234, "grad_norm": 1.0968530178070068, "learning_rate": 4.994972684772039e-05, "loss": 0.7313, "num_input_tokens_seen": 4574688, "step": 7935 }, { "epoch": 1.1826035150431933, "grad_norm": 1.4558501243591309, "learning_rate": 4.9949314067870105e-05, "loss": 0.8024, "num_input_tokens_seen": 4577504, "step": 7940 }, { "epoch": 1.1833482275841525, "grad_norm": 1.271411657333374, "learning_rate": 4.9948899602046614e-05, "loss": 0.6902, "num_input_tokens_seen": 4580384, "step": 7945 }, { "epoch": 1.1840929401251117, "grad_norm": 1.2470299005508423, "learning_rate": 4.9948483450277915e-05, "loss": 0.7244, "num_input_tokens_seen": 4583456, "step": 7950 }, { "epoch": 1.1848376526660709, "grad_norm": 1.2438230514526367, "learning_rate": 4.9948065612592145e-05, "loss": 0.6745, "num_input_tokens_seen": 4586336, "step": 7955 }, { "epoch": 1.18558236520703, "grad_norm": 1.8915328979492188, "learning_rate": 4.9947646089017534e-05, "loss": 0.936, "num_input_tokens_seen": 4588832, "step": 7960 }, { "epoch": 1.1863270777479893, "grad_norm": 3.0072762966156006, "learning_rate": 4.994722487958242e-05, "loss": 0.5611, "num_input_tokens_seen": 4591552, "step": 7965 }, { "epoch": 1.1870717902889485, "grad_norm": 1.0177456140518188, "learning_rate": 4.994680198431528e-05, "loss": 0.7316, "num_input_tokens_seen": 4594752, "step": 7970 }, { "epoch": 1.1878165028299077, "grad_norm": 1.0598591566085815, "learning_rate": 4.9946377403244695e-05, "loss": 0.7346, "num_input_tokens_seen": 4597792, "step": 7975 }, { "epoch": 1.188561215370867, "grad_norm": 1.0790170431137085, "learning_rate": 4.994595113639935e-05, "loss": 0.7438, "num_input_tokens_seen": 4600544, "step": 7980 }, { "epoch": 1.189305927911826, "grad_norm": 1.9745314121246338, "learning_rate": 4.994552318380804e-05, "loss": 0.5136, "num_input_tokens_seen": 4603392, "step": 7985 }, { "epoch": 1.1900506404527853, "grad_norm": 1.2679874897003174, "learning_rate": 4.9945093545499706e-05, "loss": 0.8104, "num_input_tokens_seen": 4606112, "step": 7990 }, { "epoch": 1.1907953529937445, "grad_norm": 1.5688802003860474, "learning_rate": 4.9944662221503364e-05, "loss": 0.6473, "num_input_tokens_seen": 4609120, "step": 7995 }, { "epoch": 1.1915400655347037, "grad_norm": 1.4871381521224976, "learning_rate": 4.9944229211848166e-05, "loss": 0.6599, "num_input_tokens_seen": 4611904, "step": 8000 }, { "epoch": 1.192284778075663, "grad_norm": 1.0477097034454346, "learning_rate": 4.9943794516563366e-05, "loss": 0.6746, "num_input_tokens_seen": 4614528, "step": 8005 }, { "epoch": 1.193029490616622, "grad_norm": 1.2111258506774902, "learning_rate": 4.9943358135678366e-05, "loss": 0.7814, "num_input_tokens_seen": 4617408, "step": 8010 }, { "epoch": 1.193774203157581, "grad_norm": 1.2958201169967651, "learning_rate": 4.994292006922262e-05, "loss": 0.7049, "num_input_tokens_seen": 4620256, "step": 8015 }, { "epoch": 1.1945189156985403, "grad_norm": 2.1360106468200684, "learning_rate": 4.994248031722575e-05, "loss": 0.6793, "num_input_tokens_seen": 4623136, "step": 8020 }, { "epoch": 1.1952636282394995, "grad_norm": 2.6352968215942383, "learning_rate": 4.994203887971747e-05, "loss": 0.6318, "num_input_tokens_seen": 4625792, "step": 8025 }, { "epoch": 1.1960083407804587, "grad_norm": 1.8812346458435059, "learning_rate": 4.994159575672761e-05, "loss": 0.6568, "num_input_tokens_seen": 4628544, "step": 8030 }, { "epoch": 1.196753053321418, "grad_norm": 3.3640573024749756, "learning_rate": 4.9941150948286106e-05, "loss": 0.8163, "num_input_tokens_seen": 4631232, "step": 8035 }, { "epoch": 1.197497765862377, "grad_norm": 1.015568733215332, "learning_rate": 4.994070445442304e-05, "loss": 0.6225, "num_input_tokens_seen": 4634144, "step": 8040 }, { "epoch": 1.1982424784033363, "grad_norm": 2.60910964012146, "learning_rate": 4.994025627516856e-05, "loss": 0.6658, "num_input_tokens_seen": 4637088, "step": 8045 }, { "epoch": 1.1989871909442955, "grad_norm": 1.2312543392181396, "learning_rate": 4.9939806410552955e-05, "loss": 0.7998, "num_input_tokens_seen": 4639840, "step": 8050 }, { "epoch": 1.1997319034852547, "grad_norm": 1.7018935680389404, "learning_rate": 4.9939354860606636e-05, "loss": 0.7152, "num_input_tokens_seen": 4642400, "step": 8055 }, { "epoch": 1.200476616026214, "grad_norm": 1.1862599849700928, "learning_rate": 4.9938901625360115e-05, "loss": 0.7196, "num_input_tokens_seen": 4644960, "step": 8060 }, { "epoch": 1.2012213285671731, "grad_norm": 1.180837631225586, "learning_rate": 4.993844670484401e-05, "loss": 0.6883, "num_input_tokens_seen": 4647680, "step": 8065 }, { "epoch": 1.2019660411081323, "grad_norm": 0.7542335987091064, "learning_rate": 4.993799009908907e-05, "loss": 0.7572, "num_input_tokens_seen": 4650496, "step": 8070 }, { "epoch": 1.2027107536490915, "grad_norm": 1.1562423706054688, "learning_rate": 4.9937531808126155e-05, "loss": 0.6345, "num_input_tokens_seen": 4653312, "step": 8075 }, { "epoch": 1.2034554661900507, "grad_norm": 0.7442963719367981, "learning_rate": 4.993707183198623e-05, "loss": 0.7949, "num_input_tokens_seen": 4656064, "step": 8080 }, { "epoch": 1.2042001787310097, "grad_norm": 1.1001272201538086, "learning_rate": 4.993661017070037e-05, "loss": 0.6608, "num_input_tokens_seen": 4658720, "step": 8085 }, { "epoch": 1.204944891271969, "grad_norm": 1.0677133798599243, "learning_rate": 4.993614682429978e-05, "loss": 0.6137, "num_input_tokens_seen": 4662176, "step": 8090 }, { "epoch": 1.2056896038129281, "grad_norm": 0.6646461486816406, "learning_rate": 4.993568179281577e-05, "loss": 0.7453, "num_input_tokens_seen": 4665152, "step": 8095 }, { "epoch": 1.2064343163538873, "grad_norm": 0.9919014573097229, "learning_rate": 4.9935215076279766e-05, "loss": 0.8035, "num_input_tokens_seen": 4668096, "step": 8100 }, { "epoch": 1.2071790288948465, "grad_norm": 1.5257155895233154, "learning_rate": 4.993474667472331e-05, "loss": 0.7766, "num_input_tokens_seen": 4671040, "step": 8105 }, { "epoch": 1.2079237414358057, "grad_norm": 0.9700760245323181, "learning_rate": 4.9934276588178054e-05, "loss": 0.8234, "num_input_tokens_seen": 4673952, "step": 8110 }, { "epoch": 1.208668453976765, "grad_norm": 0.7045810222625732, "learning_rate": 4.993380481667576e-05, "loss": 0.7272, "num_input_tokens_seen": 4676928, "step": 8115 }, { "epoch": 1.2094131665177241, "grad_norm": 0.6668925285339355, "learning_rate": 4.9933331360248306e-05, "loss": 0.6567, "num_input_tokens_seen": 4679776, "step": 8120 }, { "epoch": 1.2101578790586833, "grad_norm": 1.0280002355575562, "learning_rate": 4.993285621892769e-05, "loss": 0.7257, "num_input_tokens_seen": 4682624, "step": 8125 }, { "epoch": 1.2109025915996425, "grad_norm": 0.6120803356170654, "learning_rate": 4.993237939274602e-05, "loss": 0.7148, "num_input_tokens_seen": 4685280, "step": 8130 }, { "epoch": 1.2116473041406017, "grad_norm": 1.460066795349121, "learning_rate": 4.9931900881735517e-05, "loss": 0.705, "num_input_tokens_seen": 4688192, "step": 8135 }, { "epoch": 1.212392016681561, "grad_norm": 0.6777877807617188, "learning_rate": 4.993142068592852e-05, "loss": 0.7289, "num_input_tokens_seen": 4691008, "step": 8140 }, { "epoch": 1.2131367292225201, "grad_norm": 0.8673498034477234, "learning_rate": 4.993093880535748e-05, "loss": 0.6772, "num_input_tokens_seen": 4694080, "step": 8145 }, { "epoch": 1.2138814417634793, "grad_norm": 0.642236590385437, "learning_rate": 4.993045524005496e-05, "loss": 0.6436, "num_input_tokens_seen": 4696896, "step": 8150 }, { "epoch": 1.2146261543044385, "grad_norm": 1.7151212692260742, "learning_rate": 4.992996999005363e-05, "loss": 0.6643, "num_input_tokens_seen": 4699712, "step": 8155 }, { "epoch": 1.2153708668453977, "grad_norm": 1.363842487335205, "learning_rate": 4.992948305538628e-05, "loss": 0.6304, "num_input_tokens_seen": 4702528, "step": 8160 }, { "epoch": 1.216115579386357, "grad_norm": 1.2692666053771973, "learning_rate": 4.992899443608583e-05, "loss": 0.6933, "num_input_tokens_seen": 4705024, "step": 8165 }, { "epoch": 1.2168602919273162, "grad_norm": 0.7874976992607117, "learning_rate": 4.9928504132185284e-05, "loss": 0.5345, "num_input_tokens_seen": 4707776, "step": 8170 }, { "epoch": 1.2176050044682754, "grad_norm": 1.2391194105148315, "learning_rate": 4.992801214371778e-05, "loss": 0.6827, "num_input_tokens_seen": 4710496, "step": 8175 }, { "epoch": 1.2183497170092346, "grad_norm": 1.1835522651672363, "learning_rate": 4.992751847071657e-05, "loss": 0.7029, "num_input_tokens_seen": 4713632, "step": 8180 }, { "epoch": 1.2190944295501935, "grad_norm": 1.0063791275024414, "learning_rate": 4.992702311321501e-05, "loss": 0.6977, "num_input_tokens_seen": 4716512, "step": 8185 }, { "epoch": 1.2198391420911527, "grad_norm": 0.9224059581756592, "learning_rate": 4.992652607124658e-05, "loss": 0.6311, "num_input_tokens_seen": 4719392, "step": 8190 }, { "epoch": 1.220583854632112, "grad_norm": 0.9064497947692871, "learning_rate": 4.992602734484485e-05, "loss": 0.5925, "num_input_tokens_seen": 4722304, "step": 8195 }, { "epoch": 1.2213285671730711, "grad_norm": 1.4194856882095337, "learning_rate": 4.992552693404354e-05, "loss": 0.8112, "num_input_tokens_seen": 4724960, "step": 8200 }, { "epoch": 1.2220732797140303, "grad_norm": 1.0008546113967896, "learning_rate": 4.992502483887645e-05, "loss": 0.5891, "num_input_tokens_seen": 4727808, "step": 8205 }, { "epoch": 1.2228179922549895, "grad_norm": 1.493455410003662, "learning_rate": 4.9924521059377535e-05, "loss": 0.7517, "num_input_tokens_seen": 4730944, "step": 8210 }, { "epoch": 1.2235627047959488, "grad_norm": 2.736210823059082, "learning_rate": 4.992401559558081e-05, "loss": 0.7452, "num_input_tokens_seen": 4733664, "step": 8215 }, { "epoch": 1.224307417336908, "grad_norm": 1.2336652278900146, "learning_rate": 4.992350844752045e-05, "loss": 0.6302, "num_input_tokens_seen": 4736608, "step": 8220 }, { "epoch": 1.2250521298778672, "grad_norm": 0.8587881326675415, "learning_rate": 4.9922999615230726e-05, "loss": 0.6868, "num_input_tokens_seen": 4739904, "step": 8225 }, { "epoch": 1.2257968424188264, "grad_norm": 0.7474749684333801, "learning_rate": 4.992248909874601e-05, "loss": 0.5808, "num_input_tokens_seen": 4742784, "step": 8230 }, { "epoch": 1.2265415549597856, "grad_norm": 0.9152353405952454, "learning_rate": 4.992197689810081e-05, "loss": 0.7104, "num_input_tokens_seen": 4745696, "step": 8235 }, { "epoch": 1.2272862675007448, "grad_norm": 0.8711287975311279, "learning_rate": 4.9921463013329736e-05, "loss": 0.5105, "num_input_tokens_seen": 4748800, "step": 8240 }, { "epoch": 1.228030980041704, "grad_norm": 1.4506791830062866, "learning_rate": 4.9920947444467515e-05, "loss": 0.6515, "num_input_tokens_seen": 4751712, "step": 8245 }, { "epoch": 1.2287756925826632, "grad_norm": 1.3323259353637695, "learning_rate": 4.9920430191548986e-05, "loss": 0.7422, "num_input_tokens_seen": 4754464, "step": 8250 }, { "epoch": 1.2295204051236224, "grad_norm": 0.9851282835006714, "learning_rate": 4.9919911254609105e-05, "loss": 0.5327, "num_input_tokens_seen": 4757280, "step": 8255 }, { "epoch": 1.2302651176645814, "grad_norm": 1.0368682146072388, "learning_rate": 4.991939063368294e-05, "loss": 0.7482, "num_input_tokens_seen": 4760128, "step": 8260 }, { "epoch": 1.2310098302055406, "grad_norm": 1.6150044202804565, "learning_rate": 4.991886832880567e-05, "loss": 0.6635, "num_input_tokens_seen": 4762976, "step": 8265 }, { "epoch": 1.2317545427464998, "grad_norm": 1.217738151550293, "learning_rate": 4.9918344340012584e-05, "loss": 0.8059, "num_input_tokens_seen": 4765920, "step": 8270 }, { "epoch": 1.232499255287459, "grad_norm": 0.9788432717323303, "learning_rate": 4.99178186673391e-05, "loss": 0.7374, "num_input_tokens_seen": 4768736, "step": 8275 }, { "epoch": 1.2332439678284182, "grad_norm": 1.1007435321807861, "learning_rate": 4.9917291310820745e-05, "loss": 0.745, "num_input_tokens_seen": 4771488, "step": 8280 }, { "epoch": 1.2339886803693774, "grad_norm": 1.0649898052215576, "learning_rate": 4.9916762270493154e-05, "loss": 0.6341, "num_input_tokens_seen": 4774368, "step": 8285 }, { "epoch": 1.2347333929103366, "grad_norm": 2.583531379699707, "learning_rate": 4.991623154639207e-05, "loss": 0.7154, "num_input_tokens_seen": 4777216, "step": 8290 }, { "epoch": 1.2354781054512958, "grad_norm": 0.989834189414978, "learning_rate": 4.991569913855335e-05, "loss": 0.545, "num_input_tokens_seen": 4780192, "step": 8295 }, { "epoch": 1.236222817992255, "grad_norm": 1.5933024883270264, "learning_rate": 4.991516504701299e-05, "loss": 0.8353, "num_input_tokens_seen": 4783232, "step": 8300 }, { "epoch": 1.2369675305332142, "grad_norm": 0.8819688558578491, "learning_rate": 4.991462927180707e-05, "loss": 0.7263, "num_input_tokens_seen": 4786208, "step": 8305 }, { "epoch": 1.2377122430741734, "grad_norm": 1.0597440004348755, "learning_rate": 4.991409181297181e-05, "loss": 0.7736, "num_input_tokens_seen": 4789024, "step": 8310 }, { "epoch": 1.2384569556151326, "grad_norm": 1.2670644521713257, "learning_rate": 4.991355267054351e-05, "loss": 0.7485, "num_input_tokens_seen": 4791808, "step": 8315 }, { "epoch": 1.2392016681560918, "grad_norm": 1.232243537902832, "learning_rate": 4.991301184455861e-05, "loss": 0.6938, "num_input_tokens_seen": 4794592, "step": 8320 }, { "epoch": 1.239946380697051, "grad_norm": 1.1231441497802734, "learning_rate": 4.9912469335053656e-05, "loss": 0.5908, "num_input_tokens_seen": 4797664, "step": 8325 }, { "epoch": 1.2406910932380102, "grad_norm": 0.9343456029891968, "learning_rate": 4.991192514206532e-05, "loss": 0.584, "num_input_tokens_seen": 4800352, "step": 8330 }, { "epoch": 1.2414358057789694, "grad_norm": 1.0146328210830688, "learning_rate": 4.991137926563036e-05, "loss": 0.7618, "num_input_tokens_seen": 4803040, "step": 8335 }, { "epoch": 1.2421805183199286, "grad_norm": 1.0615962743759155, "learning_rate": 4.991083170578568e-05, "loss": 0.5937, "num_input_tokens_seen": 4806048, "step": 8340 }, { "epoch": 1.2429252308608878, "grad_norm": 1.0215167999267578, "learning_rate": 4.991028246256826e-05, "loss": 0.8666, "num_input_tokens_seen": 4808864, "step": 8345 }, { "epoch": 1.243669943401847, "grad_norm": 1.0456781387329102, "learning_rate": 4.9909731536015235e-05, "loss": 0.8009, "num_input_tokens_seen": 4811680, "step": 8350 }, { "epoch": 1.244414655942806, "grad_norm": 0.7574688196182251, "learning_rate": 4.9909178926163835e-05, "loss": 0.7615, "num_input_tokens_seen": 4814880, "step": 8355 }, { "epoch": 1.2451593684837652, "grad_norm": 1.2905150651931763, "learning_rate": 4.9908624633051395e-05, "loss": 0.7006, "num_input_tokens_seen": 4818048, "step": 8360 }, { "epoch": 1.2459040810247244, "grad_norm": 0.9651674032211304, "learning_rate": 4.990806865671537e-05, "loss": 0.8166, "num_input_tokens_seen": 4821248, "step": 8365 }, { "epoch": 1.2466487935656836, "grad_norm": 1.2755202054977417, "learning_rate": 4.990751099719333e-05, "loss": 0.7201, "num_input_tokens_seen": 4824160, "step": 8370 }, { "epoch": 1.2473935061066428, "grad_norm": 1.0449572801589966, "learning_rate": 4.990695165452297e-05, "loss": 0.7263, "num_input_tokens_seen": 4826848, "step": 8375 }, { "epoch": 1.248138218647602, "grad_norm": 0.7770271301269531, "learning_rate": 4.990639062874208e-05, "loss": 0.7286, "num_input_tokens_seen": 4829696, "step": 8380 }, { "epoch": 1.2488829311885612, "grad_norm": 1.0739312171936035, "learning_rate": 4.990582791988857e-05, "loss": 0.6597, "num_input_tokens_seen": 4832512, "step": 8385 }, { "epoch": 1.2496276437295204, "grad_norm": 1.1552420854568481, "learning_rate": 4.990526352800047e-05, "loss": 0.6411, "num_input_tokens_seen": 4835488, "step": 8390 }, { "epoch": 1.2503723562704796, "grad_norm": 0.7214853167533875, "learning_rate": 4.990469745311592e-05, "loss": 0.6128, "num_input_tokens_seen": 4838304, "step": 8395 }, { "epoch": 1.2511170688114388, "grad_norm": 1.3719290494918823, "learning_rate": 4.990412969527317e-05, "loss": 0.7143, "num_input_tokens_seen": 4840832, "step": 8400 }, { "epoch": 1.251861781352398, "grad_norm": 1.2565406560897827, "learning_rate": 4.99035602545106e-05, "loss": 0.654, "num_input_tokens_seen": 4843680, "step": 8405 }, { "epoch": 1.2526064938933572, "grad_norm": 0.4676021635532379, "learning_rate": 4.990298913086666e-05, "loss": 0.6503, "num_input_tokens_seen": 4846720, "step": 8410 }, { "epoch": 1.2533512064343164, "grad_norm": 0.9094549417495728, "learning_rate": 4.990241632437997e-05, "loss": 0.7957, "num_input_tokens_seen": 4849728, "step": 8415 }, { "epoch": 1.2540959189752756, "grad_norm": 1.027688980102539, "learning_rate": 4.990184183508923e-05, "loss": 0.6374, "num_input_tokens_seen": 4852800, "step": 8420 }, { "epoch": 1.2548406315162346, "grad_norm": 0.9134235978126526, "learning_rate": 4.990126566303326e-05, "loss": 0.7463, "num_input_tokens_seen": 4855776, "step": 8425 }, { "epoch": 1.2555853440571938, "grad_norm": 3.354957342147827, "learning_rate": 4.9900687808251e-05, "loss": 0.8353, "num_input_tokens_seen": 4858592, "step": 8430 }, { "epoch": 1.256330056598153, "grad_norm": 0.6114146709442139, "learning_rate": 4.99001082707815e-05, "loss": 0.6182, "num_input_tokens_seen": 4861408, "step": 8435 }, { "epoch": 1.2570747691391122, "grad_norm": 1.6729063987731934, "learning_rate": 4.989952705066392e-05, "loss": 0.6538, "num_input_tokens_seen": 4864224, "step": 8440 }, { "epoch": 1.2578194816800714, "grad_norm": 1.2438982725143433, "learning_rate": 4.9898944147937534e-05, "loss": 0.731, "num_input_tokens_seen": 4866944, "step": 8445 }, { "epoch": 1.2585641942210306, "grad_norm": 1.2707123756408691, "learning_rate": 4.989835956264173e-05, "loss": 0.7131, "num_input_tokens_seen": 4870016, "step": 8450 }, { "epoch": 1.2593089067619898, "grad_norm": 6.382165908813477, "learning_rate": 4.989777329481602e-05, "loss": 0.6564, "num_input_tokens_seen": 4872864, "step": 8455 }, { "epoch": 1.260053619302949, "grad_norm": 0.9078903198242188, "learning_rate": 4.989718534450002e-05, "loss": 0.727, "num_input_tokens_seen": 4875584, "step": 8460 }, { "epoch": 1.2607983318439082, "grad_norm": 1.233866810798645, "learning_rate": 4.989659571173345e-05, "loss": 0.6023, "num_input_tokens_seen": 4878400, "step": 8465 }, { "epoch": 1.2615430443848674, "grad_norm": 1.1201735734939575, "learning_rate": 4.9896004396556176e-05, "loss": 0.7928, "num_input_tokens_seen": 4881440, "step": 8470 }, { "epoch": 1.2622877569258266, "grad_norm": 2.589067220687866, "learning_rate": 4.989541139900814e-05, "loss": 0.7058, "num_input_tokens_seen": 4884608, "step": 8475 }, { "epoch": 1.2630324694667858, "grad_norm": 1.0226459503173828, "learning_rate": 4.989481671912941e-05, "loss": 0.7891, "num_input_tokens_seen": 4887520, "step": 8480 }, { "epoch": 1.263777182007745, "grad_norm": 0.9911127090454102, "learning_rate": 4.989422035696019e-05, "loss": 0.8271, "num_input_tokens_seen": 4890432, "step": 8485 }, { "epoch": 1.2645218945487042, "grad_norm": 1.5841118097305298, "learning_rate": 4.9893622312540764e-05, "loss": 0.7741, "num_input_tokens_seen": 4893056, "step": 8490 }, { "epoch": 1.2652666070896634, "grad_norm": 0.9701797962188721, "learning_rate": 4.989302258591157e-05, "loss": 0.6642, "num_input_tokens_seen": 4896096, "step": 8495 }, { "epoch": 1.2660113196306226, "grad_norm": 0.9079208374023438, "learning_rate": 4.98924211771131e-05, "loss": 0.7058, "num_input_tokens_seen": 4899232, "step": 8500 }, { "epoch": 1.2667560321715818, "grad_norm": 0.971709132194519, "learning_rate": 4.9891818086186014e-05, "loss": 0.6607, "num_input_tokens_seen": 4901920, "step": 8505 }, { "epoch": 1.267500744712541, "grad_norm": 1.0511687994003296, "learning_rate": 4.989121331317107e-05, "loss": 0.6705, "num_input_tokens_seen": 4904800, "step": 8510 }, { "epoch": 1.2682454572535002, "grad_norm": 1.4180951118469238, "learning_rate": 4.9890606858109126e-05, "loss": 0.7029, "num_input_tokens_seen": 4907456, "step": 8515 }, { "epoch": 1.2689901697944594, "grad_norm": 0.8871855139732361, "learning_rate": 4.9889998721041173e-05, "loss": 0.7356, "num_input_tokens_seen": 4910464, "step": 8520 }, { "epoch": 1.2697348823354186, "grad_norm": 0.945299506187439, "learning_rate": 4.98893889020083e-05, "loss": 0.8206, "num_input_tokens_seen": 4913600, "step": 8525 }, { "epoch": 1.2704795948763778, "grad_norm": 1.0271629095077515, "learning_rate": 4.988877740105171e-05, "loss": 0.6256, "num_input_tokens_seen": 4916544, "step": 8530 }, { "epoch": 1.2712243074173368, "grad_norm": 1.1437132358551025, "learning_rate": 4.9888164218212746e-05, "loss": 0.775, "num_input_tokens_seen": 4919232, "step": 8535 }, { "epoch": 1.271969019958296, "grad_norm": 0.5450839996337891, "learning_rate": 4.988754935353282e-05, "loss": 0.8356, "num_input_tokens_seen": 4921984, "step": 8540 }, { "epoch": 1.2727137324992552, "grad_norm": 1.4285237789154053, "learning_rate": 4.988693280705351e-05, "loss": 0.7431, "num_input_tokens_seen": 4924896, "step": 8545 }, { "epoch": 1.2734584450402144, "grad_norm": 1.3213670253753662, "learning_rate": 4.988631457881645e-05, "loss": 0.7586, "num_input_tokens_seen": 4927616, "step": 8550 }, { "epoch": 1.2742031575811736, "grad_norm": 0.8218306303024292, "learning_rate": 4.9885694668863435e-05, "loss": 0.7242, "num_input_tokens_seen": 4930624, "step": 8555 }, { "epoch": 1.2749478701221328, "grad_norm": 0.7558887004852295, "learning_rate": 4.9885073077236354e-05, "loss": 0.6751, "num_input_tokens_seen": 4933376, "step": 8560 }, { "epoch": 1.275692582663092, "grad_norm": 0.979186475276947, "learning_rate": 4.988444980397721e-05, "loss": 0.6268, "num_input_tokens_seen": 4936256, "step": 8565 }, { "epoch": 1.2764372952040512, "grad_norm": 1.6632730960845947, "learning_rate": 4.9883824849128125e-05, "loss": 0.7802, "num_input_tokens_seen": 4938944, "step": 8570 }, { "epoch": 1.2771820077450105, "grad_norm": 1.0077458620071411, "learning_rate": 4.988319821273132e-05, "loss": 0.7752, "num_input_tokens_seen": 4942528, "step": 8575 }, { "epoch": 1.2779267202859697, "grad_norm": 0.9626943469047546, "learning_rate": 4.9882569894829144e-05, "loss": 0.6974, "num_input_tokens_seen": 4945280, "step": 8580 }, { "epoch": 1.2786714328269289, "grad_norm": 0.9476107954978943, "learning_rate": 4.988193989546407e-05, "loss": 0.619, "num_input_tokens_seen": 4948352, "step": 8585 }, { "epoch": 1.279416145367888, "grad_norm": 0.7747037410736084, "learning_rate": 4.988130821467866e-05, "loss": 0.6169, "num_input_tokens_seen": 4951296, "step": 8590 }, { "epoch": 1.2801608579088473, "grad_norm": 1.2516112327575684, "learning_rate": 4.988067485251559e-05, "loss": 0.6534, "num_input_tokens_seen": 4953984, "step": 8595 }, { "epoch": 1.2809055704498062, "grad_norm": 0.9260878562927246, "learning_rate": 4.988003980901768e-05, "loss": 0.6562, "num_input_tokens_seen": 4956736, "step": 8600 }, { "epoch": 1.2816502829907654, "grad_norm": 2.076516628265381, "learning_rate": 4.987940308422783e-05, "loss": 0.7591, "num_input_tokens_seen": 4959712, "step": 8605 }, { "epoch": 1.2823949955317246, "grad_norm": 1.6932721138000488, "learning_rate": 4.9878764678189075e-05, "loss": 0.6416, "num_input_tokens_seen": 4962816, "step": 8610 }, { "epoch": 1.2831397080726838, "grad_norm": 0.65496426820755, "learning_rate": 4.9878124590944555e-05, "loss": 0.7089, "num_input_tokens_seen": 4965952, "step": 8615 }, { "epoch": 1.283884420613643, "grad_norm": 1.3726717233657837, "learning_rate": 4.9877482822537516e-05, "loss": 0.7044, "num_input_tokens_seen": 4968832, "step": 8620 }, { "epoch": 1.2846291331546023, "grad_norm": 0.8767754435539246, "learning_rate": 4.987683937301133e-05, "loss": 0.6913, "num_input_tokens_seen": 4971744, "step": 8625 }, { "epoch": 1.2853738456955615, "grad_norm": 1.2630794048309326, "learning_rate": 4.987619424240949e-05, "loss": 0.6356, "num_input_tokens_seen": 4974368, "step": 8630 }, { "epoch": 1.2861185582365207, "grad_norm": 2.5998573303222656, "learning_rate": 4.9875547430775575e-05, "loss": 0.7154, "num_input_tokens_seen": 4977440, "step": 8635 }, { "epoch": 1.2868632707774799, "grad_norm": 2.7876124382019043, "learning_rate": 4.98748989381533e-05, "loss": 0.6835, "num_input_tokens_seen": 4980256, "step": 8640 }, { "epoch": 1.287607983318439, "grad_norm": 3.350156307220459, "learning_rate": 4.98742487645865e-05, "loss": 0.6912, "num_input_tokens_seen": 4983072, "step": 8645 }, { "epoch": 1.2883526958593983, "grad_norm": 6.370509147644043, "learning_rate": 4.987359691011909e-05, "loss": 0.79, "num_input_tokens_seen": 4985856, "step": 8650 }, { "epoch": 1.2890974084003575, "grad_norm": 1.3851971626281738, "learning_rate": 4.987294337479513e-05, "loss": 0.7182, "num_input_tokens_seen": 4988896, "step": 8655 }, { "epoch": 1.2898421209413167, "grad_norm": 4.344192028045654, "learning_rate": 4.987228815865879e-05, "loss": 0.8771, "num_input_tokens_seen": 4991744, "step": 8660 }, { "epoch": 1.2905868334822759, "grad_norm": 3.2613890171051025, "learning_rate": 4.987163126175434e-05, "loss": 0.5924, "num_input_tokens_seen": 4994624, "step": 8665 }, { "epoch": 1.291331546023235, "grad_norm": 0.9971240162849426, "learning_rate": 4.987097268412616e-05, "loss": 0.5901, "num_input_tokens_seen": 4997600, "step": 8670 }, { "epoch": 1.2920762585641943, "grad_norm": 2.3856308460235596, "learning_rate": 4.987031242581877e-05, "loss": 0.6172, "num_input_tokens_seen": 5000544, "step": 8675 }, { "epoch": 1.2928209711051535, "grad_norm": 1.8580094575881958, "learning_rate": 4.9869650486876786e-05, "loss": 0.7066, "num_input_tokens_seen": 5003616, "step": 8680 }, { "epoch": 1.2935656836461127, "grad_norm": 2.3103907108306885, "learning_rate": 4.986898686734493e-05, "loss": 0.7014, "num_input_tokens_seen": 5006400, "step": 8685 }, { "epoch": 1.2943103961870719, "grad_norm": 1.2504633665084839, "learning_rate": 4.9868321567268043e-05, "loss": 0.7692, "num_input_tokens_seen": 5008832, "step": 8690 }, { "epoch": 1.295055108728031, "grad_norm": 1.7105233669281006, "learning_rate": 4.98676545866911e-05, "loss": 0.5875, "num_input_tokens_seen": 5011776, "step": 8695 }, { "epoch": 1.2957998212689903, "grad_norm": 1.0044453144073486, "learning_rate": 4.986698592565917e-05, "loss": 0.7002, "num_input_tokens_seen": 5014560, "step": 8700 }, { "epoch": 1.2965445338099495, "grad_norm": 1.9893440008163452, "learning_rate": 4.986631558421742e-05, "loss": 0.6595, "num_input_tokens_seen": 5016992, "step": 8705 }, { "epoch": 1.2972892463509085, "grad_norm": 2.468554735183716, "learning_rate": 4.986564356241117e-05, "loss": 0.5709, "num_input_tokens_seen": 5019904, "step": 8710 }, { "epoch": 1.2980339588918677, "grad_norm": 8.722036361694336, "learning_rate": 4.986496986028583e-05, "loss": 0.6577, "num_input_tokens_seen": 5022752, "step": 8715 }, { "epoch": 1.2987786714328269, "grad_norm": 1.4280885457992554, "learning_rate": 4.986429447788691e-05, "loss": 0.6701, "num_input_tokens_seen": 5025408, "step": 8720 }, { "epoch": 1.299523383973786, "grad_norm": 0.7460565567016602, "learning_rate": 4.986361741526006e-05, "loss": 0.8648, "num_input_tokens_seen": 5028576, "step": 8725 }, { "epoch": 1.3002680965147453, "grad_norm": 0.6122110486030579, "learning_rate": 4.9862938672451045e-05, "loss": 0.4937, "num_input_tokens_seen": 5031328, "step": 8730 }, { "epoch": 1.3010128090557045, "grad_norm": 0.9963822960853577, "learning_rate": 4.986225824950571e-05, "loss": 0.6386, "num_input_tokens_seen": 5034400, "step": 8735 }, { "epoch": 1.3017575215966637, "grad_norm": 0.9709736108779907, "learning_rate": 4.986157614647005e-05, "loss": 0.7332, "num_input_tokens_seen": 5037344, "step": 8740 }, { "epoch": 1.302502234137623, "grad_norm": 1.4283114671707153, "learning_rate": 4.9860892363390145e-05, "loss": 0.7231, "num_input_tokens_seen": 5040128, "step": 8745 }, { "epoch": 1.303246946678582, "grad_norm": 1.4277595281600952, "learning_rate": 4.986020690031221e-05, "loss": 0.6675, "num_input_tokens_seen": 5042944, "step": 8750 }, { "epoch": 1.3039916592195413, "grad_norm": 0.911441445350647, "learning_rate": 4.985951975728258e-05, "loss": 0.6615, "num_input_tokens_seen": 5045664, "step": 8755 }, { "epoch": 1.3047363717605005, "grad_norm": 1.2787940502166748, "learning_rate": 4.9858830934347665e-05, "loss": 0.6177, "num_input_tokens_seen": 5048416, "step": 8760 }, { "epoch": 1.3054810843014597, "grad_norm": 1.6025077104568481, "learning_rate": 4.9858140431554036e-05, "loss": 0.7493, "num_input_tokens_seen": 5051264, "step": 8765 }, { "epoch": 1.306225796842419, "grad_norm": 1.621620535850525, "learning_rate": 4.9857448248948336e-05, "loss": 0.7067, "num_input_tokens_seen": 5054048, "step": 8770 }, { "epoch": 1.3069705093833779, "grad_norm": 0.8212594389915466, "learning_rate": 4.985675438657734e-05, "loss": 0.5688, "num_input_tokens_seen": 5056544, "step": 8775 }, { "epoch": 1.307715221924337, "grad_norm": 1.7410216331481934, "learning_rate": 4.985605884448795e-05, "loss": 0.8608, "num_input_tokens_seen": 5059296, "step": 8780 }, { "epoch": 1.3084599344652963, "grad_norm": 0.9622370600700378, "learning_rate": 4.985536162272716e-05, "loss": 0.7852, "num_input_tokens_seen": 5061952, "step": 8785 }, { "epoch": 1.3092046470062555, "grad_norm": 0.7502431273460388, "learning_rate": 4.9854662721342086e-05, "loss": 0.798, "num_input_tokens_seen": 5064800, "step": 8790 }, { "epoch": 1.3099493595472147, "grad_norm": 1.4991270303726196, "learning_rate": 4.985396214037995e-05, "loss": 0.7268, "num_input_tokens_seen": 5067584, "step": 8795 }, { "epoch": 1.310694072088174, "grad_norm": 0.7671829462051392, "learning_rate": 4.9853259879888116e-05, "loss": 0.826, "num_input_tokens_seen": 5070592, "step": 8800 }, { "epoch": 1.311438784629133, "grad_norm": 0.9347057342529297, "learning_rate": 4.9852555939914014e-05, "loss": 0.607, "num_input_tokens_seen": 5073504, "step": 8805 }, { "epoch": 1.3121834971700923, "grad_norm": 1.361596941947937, "learning_rate": 4.9851850320505225e-05, "loss": 0.6302, "num_input_tokens_seen": 5076384, "step": 8810 }, { "epoch": 1.3129282097110515, "grad_norm": 1.1347576379776, "learning_rate": 4.985114302170943e-05, "loss": 0.6518, "num_input_tokens_seen": 5079360, "step": 8815 }, { "epoch": 1.3136729222520107, "grad_norm": 0.6543341875076294, "learning_rate": 4.985043404357444e-05, "loss": 0.6874, "num_input_tokens_seen": 5081952, "step": 8820 }, { "epoch": 1.31441763479297, "grad_norm": 1.22663414478302, "learning_rate": 4.984972338614814e-05, "loss": 0.7217, "num_input_tokens_seen": 5085024, "step": 8825 }, { "epoch": 1.3151623473339291, "grad_norm": 0.6284042000770569, "learning_rate": 4.984901104947857e-05, "loss": 0.5524, "num_input_tokens_seen": 5087584, "step": 8830 }, { "epoch": 1.3159070598748883, "grad_norm": 1.74043869972229, "learning_rate": 4.984829703361386e-05, "loss": 0.8177, "num_input_tokens_seen": 5090464, "step": 8835 }, { "epoch": 1.3166517724158475, "grad_norm": 0.743027925491333, "learning_rate": 4.984758133860227e-05, "loss": 0.6612, "num_input_tokens_seen": 5093376, "step": 8840 }, { "epoch": 1.3173964849568067, "grad_norm": 0.8356252312660217, "learning_rate": 4.984686396449214e-05, "loss": 0.7426, "num_input_tokens_seen": 5096416, "step": 8845 }, { "epoch": 1.318141197497766, "grad_norm": 1.3360406160354614, "learning_rate": 4.984614491133197e-05, "loss": 0.6632, "num_input_tokens_seen": 5099328, "step": 8850 }, { "epoch": 1.3188859100387251, "grad_norm": 0.9050124287605286, "learning_rate": 4.984542417917035e-05, "loss": 0.8381, "num_input_tokens_seen": 5102112, "step": 8855 }, { "epoch": 1.3196306225796843, "grad_norm": 0.9302007555961609, "learning_rate": 4.984470176805598e-05, "loss": 0.6782, "num_input_tokens_seen": 5104896, "step": 8860 }, { "epoch": 1.3203753351206435, "grad_norm": 1.3203847408294678, "learning_rate": 4.9843977678037666e-05, "loss": 0.6383, "num_input_tokens_seen": 5107648, "step": 8865 }, { "epoch": 1.3211200476616027, "grad_norm": 0.781263530254364, "learning_rate": 4.984325190916435e-05, "loss": 0.7785, "num_input_tokens_seen": 5110432, "step": 8870 }, { "epoch": 1.321864760202562, "grad_norm": 0.8165376782417297, "learning_rate": 4.984252446148508e-05, "loss": 0.7187, "num_input_tokens_seen": 5113472, "step": 8875 }, { "epoch": 1.322609472743521, "grad_norm": 0.6773647665977478, "learning_rate": 4.9841795335049006e-05, "loss": 0.7459, "num_input_tokens_seen": 5116416, "step": 8880 }, { "epoch": 1.3233541852844801, "grad_norm": 1.5762271881103516, "learning_rate": 4.98410645299054e-05, "loss": 0.7614, "num_input_tokens_seen": 5119168, "step": 8885 }, { "epoch": 1.3240988978254393, "grad_norm": 1.2326478958129883, "learning_rate": 4.9840332046103656e-05, "loss": 0.7092, "num_input_tokens_seen": 5121696, "step": 8890 }, { "epoch": 1.3248436103663985, "grad_norm": 0.7765193581581116, "learning_rate": 4.9839597883693267e-05, "loss": 0.7965, "num_input_tokens_seen": 5124704, "step": 8895 }, { "epoch": 1.3255883229073577, "grad_norm": 1.1824281215667725, "learning_rate": 4.983886204272383e-05, "loss": 0.6582, "num_input_tokens_seen": 5127520, "step": 8900 }, { "epoch": 1.326333035448317, "grad_norm": 1.1670681238174438, "learning_rate": 4.98381245232451e-05, "loss": 0.6785, "num_input_tokens_seen": 5130176, "step": 8905 }, { "epoch": 1.3270777479892761, "grad_norm": 0.8804794549942017, "learning_rate": 4.98373853253069e-05, "loss": 0.6274, "num_input_tokens_seen": 5133152, "step": 8910 }, { "epoch": 1.3278224605302353, "grad_norm": 1.3552497625350952, "learning_rate": 4.983664444895917e-05, "loss": 0.7799, "num_input_tokens_seen": 5136064, "step": 8915 }, { "epoch": 1.3285671730711945, "grad_norm": 1.1167142391204834, "learning_rate": 4.983590189425198e-05, "loss": 0.8272, "num_input_tokens_seen": 5138848, "step": 8920 }, { "epoch": 1.3293118856121537, "grad_norm": 12.865018844604492, "learning_rate": 4.9835157661235534e-05, "loss": 0.7957, "num_input_tokens_seen": 5142080, "step": 8925 }, { "epoch": 1.330056598153113, "grad_norm": 1.0123050212860107, "learning_rate": 4.98344117499601e-05, "loss": 0.5847, "num_input_tokens_seen": 5145152, "step": 8930 }, { "epoch": 1.3308013106940721, "grad_norm": 1.0563799142837524, "learning_rate": 4.983366416047608e-05, "loss": 0.6403, "num_input_tokens_seen": 5147936, "step": 8935 }, { "epoch": 1.3315460232350314, "grad_norm": 1.2028467655181885, "learning_rate": 4.983291489283401e-05, "loss": 0.7914, "num_input_tokens_seen": 5150944, "step": 8940 }, { "epoch": 1.3322907357759903, "grad_norm": 1.9508130550384521, "learning_rate": 4.983216394708451e-05, "loss": 0.8337, "num_input_tokens_seen": 5153920, "step": 8945 }, { "epoch": 1.3330354483169495, "grad_norm": 0.9790789484977722, "learning_rate": 4.983141132327833e-05, "loss": 0.642, "num_input_tokens_seen": 5156864, "step": 8950 }, { "epoch": 1.3337801608579087, "grad_norm": 1.5231512784957886, "learning_rate": 4.983065702146634e-05, "loss": 0.5876, "num_input_tokens_seen": 5159360, "step": 8955 }, { "epoch": 1.334524873398868, "grad_norm": 1.2070090770721436, "learning_rate": 4.98299010416995e-05, "loss": 0.7457, "num_input_tokens_seen": 5161952, "step": 8960 }, { "epoch": 1.3352695859398271, "grad_norm": 0.9096051454544067, "learning_rate": 4.982914338402889e-05, "loss": 0.5862, "num_input_tokens_seen": 5164896, "step": 8965 }, { "epoch": 1.3360142984807863, "grad_norm": 1.6828405857086182, "learning_rate": 4.982838404850573e-05, "loss": 0.5828, "num_input_tokens_seen": 5167680, "step": 8970 }, { "epoch": 1.3367590110217455, "grad_norm": 1.3892756700515747, "learning_rate": 4.982762303518131e-05, "loss": 0.7037, "num_input_tokens_seen": 5170944, "step": 8975 }, { "epoch": 1.3375037235627047, "grad_norm": 1.5741242170333862, "learning_rate": 4.982686034410707e-05, "loss": 0.5852, "num_input_tokens_seen": 5173824, "step": 8980 }, { "epoch": 1.338248436103664, "grad_norm": 0.6471987962722778, "learning_rate": 4.982609597533455e-05, "loss": 0.5374, "num_input_tokens_seen": 5176640, "step": 8985 }, { "epoch": 1.3389931486446232, "grad_norm": 0.6359412670135498, "learning_rate": 4.98253299289154e-05, "loss": 0.6367, "num_input_tokens_seen": 5179424, "step": 8990 }, { "epoch": 1.3397378611855824, "grad_norm": 1.2655792236328125, "learning_rate": 4.982456220490138e-05, "loss": 0.7761, "num_input_tokens_seen": 5182592, "step": 8995 }, { "epoch": 1.3404825737265416, "grad_norm": 1.6813716888427734, "learning_rate": 4.982379280334438e-05, "loss": 0.5559, "num_input_tokens_seen": 5185376, "step": 9000 }, { "epoch": 1.3412272862675008, "grad_norm": 1.4465391635894775, "learning_rate": 4.982302172429638e-05, "loss": 0.8362, "num_input_tokens_seen": 5188288, "step": 9005 }, { "epoch": 1.34197199880846, "grad_norm": 1.927267074584961, "learning_rate": 4.98222489678095e-05, "loss": 0.7225, "num_input_tokens_seen": 5191264, "step": 9010 }, { "epoch": 1.3427167113494192, "grad_norm": 1.4623709917068481, "learning_rate": 4.9821474533935966e-05, "loss": 0.6768, "num_input_tokens_seen": 5195488, "step": 9015 }, { "epoch": 1.3434614238903784, "grad_norm": 1.2006715536117554, "learning_rate": 4.982069842272809e-05, "loss": 0.6289, "num_input_tokens_seen": 5198112, "step": 9020 }, { "epoch": 1.3442061364313376, "grad_norm": 2.6248297691345215, "learning_rate": 4.9819920634238323e-05, "loss": 0.6998, "num_input_tokens_seen": 5201024, "step": 9025 }, { "epoch": 1.3449508489722968, "grad_norm": 1.4844814538955688, "learning_rate": 4.981914116851924e-05, "loss": 0.7839, "num_input_tokens_seen": 5203968, "step": 9030 }, { "epoch": 1.345695561513256, "grad_norm": 1.1218430995941162, "learning_rate": 4.98183600256235e-05, "loss": 0.6389, "num_input_tokens_seen": 5206976, "step": 9035 }, { "epoch": 1.3464402740542152, "grad_norm": 1.5494272708892822, "learning_rate": 4.981757720560389e-05, "loss": 0.7372, "num_input_tokens_seen": 5210176, "step": 9040 }, { "epoch": 1.3471849865951744, "grad_norm": 0.8403793573379517, "learning_rate": 4.981679270851332e-05, "loss": 0.5689, "num_input_tokens_seen": 5213024, "step": 9045 }, { "epoch": 1.3479296991361336, "grad_norm": 0.8805217742919922, "learning_rate": 4.981600653440479e-05, "loss": 0.665, "num_input_tokens_seen": 5216064, "step": 9050 }, { "epoch": 1.3486744116770926, "grad_norm": 0.9290894865989685, "learning_rate": 4.981521868333144e-05, "loss": 0.5412, "num_input_tokens_seen": 5219200, "step": 9055 }, { "epoch": 1.3494191242180518, "grad_norm": 0.9447292685508728, "learning_rate": 4.98144291553465e-05, "loss": 0.6615, "num_input_tokens_seen": 5222336, "step": 9060 }, { "epoch": 1.350163836759011, "grad_norm": 2.44282603263855, "learning_rate": 4.981363795050332e-05, "loss": 0.6751, "num_input_tokens_seen": 5225248, "step": 9065 }, { "epoch": 1.3509085492999702, "grad_norm": 1.3253241777420044, "learning_rate": 4.9812845068855384e-05, "loss": 0.7443, "num_input_tokens_seen": 5228320, "step": 9070 }, { "epoch": 1.3516532618409294, "grad_norm": 1.8281441926956177, "learning_rate": 4.9812050510456254e-05, "loss": 0.642, "num_input_tokens_seen": 5231008, "step": 9075 }, { "epoch": 1.3523979743818886, "grad_norm": 1.8098589181900024, "learning_rate": 4.9811254275359626e-05, "loss": 0.6619, "num_input_tokens_seen": 5233792, "step": 9080 }, { "epoch": 1.3531426869228478, "grad_norm": 1.1009896993637085, "learning_rate": 4.9810456363619304e-05, "loss": 0.4988, "num_input_tokens_seen": 5236768, "step": 9085 }, { "epoch": 1.353887399463807, "grad_norm": 1.3328218460083008, "learning_rate": 4.980965677528923e-05, "loss": 0.6792, "num_input_tokens_seen": 5239936, "step": 9090 }, { "epoch": 1.3546321120047662, "grad_norm": 2.0135481357574463, "learning_rate": 4.98088555104234e-05, "loss": 0.6897, "num_input_tokens_seen": 5242624, "step": 9095 }, { "epoch": 1.3553768245457254, "grad_norm": 1.1494919061660767, "learning_rate": 4.9808052569076e-05, "loss": 0.4956, "num_input_tokens_seen": 5245344, "step": 9100 }, { "epoch": 1.3561215370866846, "grad_norm": 0.9144576787948608, "learning_rate": 4.9807247951301255e-05, "loss": 0.6061, "num_input_tokens_seen": 5248352, "step": 9105 }, { "epoch": 1.3568662496276438, "grad_norm": 4.010806560516357, "learning_rate": 4.9806441657153555e-05, "loss": 0.7867, "num_input_tokens_seen": 5251328, "step": 9110 }, { "epoch": 1.357610962168603, "grad_norm": 1.1315163373947144, "learning_rate": 4.9805633686687394e-05, "loss": 0.8196, "num_input_tokens_seen": 5253952, "step": 9115 }, { "epoch": 1.358355674709562, "grad_norm": 1.5283788442611694, "learning_rate": 4.980482403995734e-05, "loss": 0.6438, "num_input_tokens_seen": 5256736, "step": 9120 }, { "epoch": 1.3591003872505212, "grad_norm": 2.363111734390259, "learning_rate": 4.9804012717018146e-05, "loss": 0.677, "num_input_tokens_seen": 5259808, "step": 9125 }, { "epoch": 1.3598450997914804, "grad_norm": 1.6125421524047852, "learning_rate": 4.980319971792461e-05, "loss": 0.6404, "num_input_tokens_seen": 5263104, "step": 9130 }, { "epoch": 1.3605898123324396, "grad_norm": 3.9142158031463623, "learning_rate": 4.980238504273168e-05, "loss": 0.6927, "num_input_tokens_seen": 5265696, "step": 9135 }, { "epoch": 1.3613345248733988, "grad_norm": 2.0589659214019775, "learning_rate": 4.98015686914944e-05, "loss": 0.5273, "num_input_tokens_seen": 5268416, "step": 9140 }, { "epoch": 1.362079237414358, "grad_norm": 1.7058212757110596, "learning_rate": 4.980075066426796e-05, "loss": 0.6316, "num_input_tokens_seen": 5271744, "step": 9145 }, { "epoch": 1.3628239499553172, "grad_norm": 3.194183588027954, "learning_rate": 4.979993096110762e-05, "loss": 0.5981, "num_input_tokens_seen": 5274784, "step": 9150 }, { "epoch": 1.3635686624962764, "grad_norm": 1.411023736000061, "learning_rate": 4.979910958206876e-05, "loss": 0.8802, "num_input_tokens_seen": 5277472, "step": 9155 }, { "epoch": 1.3643133750372356, "grad_norm": 1.1597498655319214, "learning_rate": 4.9798286527206915e-05, "loss": 0.685, "num_input_tokens_seen": 5280160, "step": 9160 }, { "epoch": 1.3650580875781948, "grad_norm": 1.500890851020813, "learning_rate": 4.979746179657768e-05, "loss": 0.6391, "num_input_tokens_seen": 5283200, "step": 9165 }, { "epoch": 1.365802800119154, "grad_norm": 1.1927191019058228, "learning_rate": 4.97966353902368e-05, "loss": 0.7704, "num_input_tokens_seen": 5286432, "step": 9170 }, { "epoch": 1.3665475126601132, "grad_norm": 1.42489492893219, "learning_rate": 4.9795807308240115e-05, "loss": 0.54, "num_input_tokens_seen": 5289248, "step": 9175 }, { "epoch": 1.3672922252010724, "grad_norm": 2.155628204345703, "learning_rate": 4.979497755064359e-05, "loss": 0.6939, "num_input_tokens_seen": 5292064, "step": 9180 }, { "epoch": 1.3680369377420316, "grad_norm": 1.4815311431884766, "learning_rate": 4.979414611750329e-05, "loss": 0.7518, "num_input_tokens_seen": 5294848, "step": 9185 }, { "epoch": 1.3687816502829908, "grad_norm": 1.2884994745254517, "learning_rate": 4.97933130088754e-05, "loss": 0.6088, "num_input_tokens_seen": 5297472, "step": 9190 }, { "epoch": 1.36952636282395, "grad_norm": 0.8290762901306152, "learning_rate": 4.9792478224816206e-05, "loss": 0.5954, "num_input_tokens_seen": 5300320, "step": 9195 }, { "epoch": 1.3702710753649092, "grad_norm": 1.1199090480804443, "learning_rate": 4.979164176538215e-05, "loss": 0.5515, "num_input_tokens_seen": 5303104, "step": 9200 }, { "epoch": 1.3710157879058684, "grad_norm": 1.0452909469604492, "learning_rate": 4.979080363062974e-05, "loss": 0.6281, "num_input_tokens_seen": 5306048, "step": 9205 }, { "epoch": 1.3717605004468276, "grad_norm": 2.1007916927337646, "learning_rate": 4.978996382061559e-05, "loss": 0.732, "num_input_tokens_seen": 5308992, "step": 9210 }, { "epoch": 1.3725052129877868, "grad_norm": 0.9219796657562256, "learning_rate": 4.978912233539649e-05, "loss": 0.5414, "num_input_tokens_seen": 5311872, "step": 9215 }, { "epoch": 1.373249925528746, "grad_norm": 0.8576198816299438, "learning_rate": 4.978827917502929e-05, "loss": 0.8511, "num_input_tokens_seen": 5314592, "step": 9220 }, { "epoch": 1.3739946380697052, "grad_norm": 1.0559790134429932, "learning_rate": 4.978743433957096e-05, "loss": 0.5915, "num_input_tokens_seen": 5317536, "step": 9225 }, { "epoch": 1.3747393506106642, "grad_norm": 1.7667076587677002, "learning_rate": 4.97865878290786e-05, "loss": 0.7073, "num_input_tokens_seen": 5320640, "step": 9230 }, { "epoch": 1.3754840631516234, "grad_norm": 1.4691989421844482, "learning_rate": 4.9785739643609406e-05, "loss": 0.6559, "num_input_tokens_seen": 5323552, "step": 9235 }, { "epoch": 1.3762287756925826, "grad_norm": 1.4845837354660034, "learning_rate": 4.97848897832207e-05, "loss": 0.7855, "num_input_tokens_seen": 5326400, "step": 9240 }, { "epoch": 1.3769734882335418, "grad_norm": 1.68909752368927, "learning_rate": 4.978403824796991e-05, "loss": 0.6012, "num_input_tokens_seen": 5329376, "step": 9245 }, { "epoch": 1.377718200774501, "grad_norm": 0.8225605487823486, "learning_rate": 4.978318503791458e-05, "loss": 0.6072, "num_input_tokens_seen": 5332448, "step": 9250 }, { "epoch": 1.3784629133154602, "grad_norm": 0.6534482836723328, "learning_rate": 4.978233015311236e-05, "loss": 0.6097, "num_input_tokens_seen": 5335136, "step": 9255 }, { "epoch": 1.3792076258564194, "grad_norm": 1.9228575229644775, "learning_rate": 4.978147359362103e-05, "loss": 0.8387, "num_input_tokens_seen": 5338112, "step": 9260 }, { "epoch": 1.3799523383973786, "grad_norm": 0.9475133419036865, "learning_rate": 4.978061535949847e-05, "loss": 0.7128, "num_input_tokens_seen": 5341056, "step": 9265 }, { "epoch": 1.3806970509383378, "grad_norm": 0.9882397055625916, "learning_rate": 4.9779755450802675e-05, "loss": 0.6279, "num_input_tokens_seen": 5344128, "step": 9270 }, { "epoch": 1.381441763479297, "grad_norm": 2.2760238647460938, "learning_rate": 4.977889386759176e-05, "loss": 0.6708, "num_input_tokens_seen": 5346848, "step": 9275 }, { "epoch": 1.3821864760202562, "grad_norm": 0.8376291990280151, "learning_rate": 4.977803060992393e-05, "loss": 0.7328, "num_input_tokens_seen": 5349856, "step": 9280 }, { "epoch": 1.3829311885612154, "grad_norm": 1.016616940498352, "learning_rate": 4.977716567785754e-05, "loss": 0.7497, "num_input_tokens_seen": 5352544, "step": 9285 }, { "epoch": 1.3836759011021746, "grad_norm": 1.200747013092041, "learning_rate": 4.977629907145102e-05, "loss": 0.5897, "num_input_tokens_seen": 5355392, "step": 9290 }, { "epoch": 1.3844206136431336, "grad_norm": 0.7588776350021362, "learning_rate": 4.977543079076295e-05, "loss": 0.681, "num_input_tokens_seen": 5358112, "step": 9295 }, { "epoch": 1.3851653261840928, "grad_norm": 0.9064440131187439, "learning_rate": 4.977456083585199e-05, "loss": 0.6802, "num_input_tokens_seen": 5360960, "step": 9300 }, { "epoch": 1.385910038725052, "grad_norm": 1.220766305923462, "learning_rate": 4.977368920677694e-05, "loss": 0.7854, "num_input_tokens_seen": 5363616, "step": 9305 }, { "epoch": 1.3866547512660112, "grad_norm": 0.7039099931716919, "learning_rate": 4.97728159035967e-05, "loss": 0.7354, "num_input_tokens_seen": 5366240, "step": 9310 }, { "epoch": 1.3873994638069704, "grad_norm": 1.4036144018173218, "learning_rate": 4.9771940926370274e-05, "loss": 0.7132, "num_input_tokens_seen": 5368768, "step": 9315 }, { "epoch": 1.3881441763479296, "grad_norm": 0.9146828651428223, "learning_rate": 4.97710642751568e-05, "loss": 0.5941, "num_input_tokens_seen": 5371584, "step": 9320 }, { "epoch": 1.3888888888888888, "grad_norm": 1.0064626932144165, "learning_rate": 4.977018595001551e-05, "loss": 0.6108, "num_input_tokens_seen": 5374400, "step": 9325 }, { "epoch": 1.389633601429848, "grad_norm": 1.0523526668548584, "learning_rate": 4.9769305951005766e-05, "loss": 0.762, "num_input_tokens_seen": 5377280, "step": 9330 }, { "epoch": 1.3903783139708072, "grad_norm": 0.80204176902771, "learning_rate": 4.976842427818702e-05, "loss": 0.572, "num_input_tokens_seen": 5380032, "step": 9335 }, { "epoch": 1.3911230265117664, "grad_norm": 1.2292094230651855, "learning_rate": 4.9767540931618874e-05, "loss": 0.6848, "num_input_tokens_seen": 5383200, "step": 9340 }, { "epoch": 1.3918677390527256, "grad_norm": 1.0139235258102417, "learning_rate": 4.9766655911361e-05, "loss": 0.6648, "num_input_tokens_seen": 5386336, "step": 9345 }, { "epoch": 1.3926124515936849, "grad_norm": 1.5678938627243042, "learning_rate": 4.976576921747322e-05, "loss": 0.8842, "num_input_tokens_seen": 5389120, "step": 9350 }, { "epoch": 1.393357164134644, "grad_norm": 1.0324270725250244, "learning_rate": 4.976488085001545e-05, "loss": 0.6631, "num_input_tokens_seen": 5392064, "step": 9355 }, { "epoch": 1.3941018766756033, "grad_norm": 1.6330360174179077, "learning_rate": 4.976399080904771e-05, "loss": 0.8132, "num_input_tokens_seen": 5394880, "step": 9360 }, { "epoch": 1.3948465892165625, "grad_norm": 1.4432820081710815, "learning_rate": 4.9763099094630164e-05, "loss": 0.4974, "num_input_tokens_seen": 5397696, "step": 9365 }, { "epoch": 1.3955913017575217, "grad_norm": 1.3771682977676392, "learning_rate": 4.976220570682305e-05, "loss": 0.6916, "num_input_tokens_seen": 5400480, "step": 9370 }, { "epoch": 1.3963360142984809, "grad_norm": 0.7465277910232544, "learning_rate": 4.976131064568675e-05, "loss": 0.779, "num_input_tokens_seen": 5403488, "step": 9375 }, { "epoch": 1.39708072683944, "grad_norm": 0.9309527277946472, "learning_rate": 4.976041391128175e-05, "loss": 0.6821, "num_input_tokens_seen": 5406240, "step": 9380 }, { "epoch": 1.3978254393803993, "grad_norm": 0.8754438757896423, "learning_rate": 4.975951550366866e-05, "loss": 0.5835, "num_input_tokens_seen": 5409216, "step": 9385 }, { "epoch": 1.3985701519213585, "grad_norm": 0.9516108632087708, "learning_rate": 4.9758615422908164e-05, "loss": 0.9074, "num_input_tokens_seen": 5412064, "step": 9390 }, { "epoch": 1.3993148644623177, "grad_norm": 1.0563198328018188, "learning_rate": 4.97577136690611e-05, "loss": 0.7526, "num_input_tokens_seen": 5415104, "step": 9395 }, { "epoch": 1.4000595770032767, "grad_norm": 1.0796481370925903, "learning_rate": 4.975681024218841e-05, "loss": 0.6935, "num_input_tokens_seen": 5418048, "step": 9400 }, { "epoch": 1.4008042895442359, "grad_norm": 0.6580166816711426, "learning_rate": 4.9755905142351133e-05, "loss": 0.6601, "num_input_tokens_seen": 5421024, "step": 9405 }, { "epoch": 1.401549002085195, "grad_norm": 1.7394282817840576, "learning_rate": 4.975499836961044e-05, "loss": 0.6796, "num_input_tokens_seen": 5423616, "step": 9410 }, { "epoch": 1.4022937146261543, "grad_norm": 0.730995774269104, "learning_rate": 4.97540899240276e-05, "loss": 0.6394, "num_input_tokens_seen": 5426368, "step": 9415 }, { "epoch": 1.4030384271671135, "grad_norm": 0.8865911960601807, "learning_rate": 4.9753179805664e-05, "loss": 0.6186, "num_input_tokens_seen": 5429120, "step": 9420 }, { "epoch": 1.4037831397080727, "grad_norm": 1.5579465627670288, "learning_rate": 4.975226801458116e-05, "loss": 0.8234, "num_input_tokens_seen": 5431936, "step": 9425 }, { "epoch": 1.4045278522490319, "grad_norm": 0.9690688252449036, "learning_rate": 4.975135455084067e-05, "loss": 0.7001, "num_input_tokens_seen": 5434816, "step": 9430 }, { "epoch": 1.405272564789991, "grad_norm": 1.329081416130066, "learning_rate": 4.975043941450428e-05, "loss": 0.6618, "num_input_tokens_seen": 5437632, "step": 9435 }, { "epoch": 1.4060172773309503, "grad_norm": 0.6323249936103821, "learning_rate": 4.9749522605633825e-05, "loss": 0.5752, "num_input_tokens_seen": 5440352, "step": 9440 }, { "epoch": 1.4067619898719095, "grad_norm": 1.174436330795288, "learning_rate": 4.9748604124291254e-05, "loss": 0.6957, "num_input_tokens_seen": 5443424, "step": 9445 }, { "epoch": 1.4075067024128687, "grad_norm": 0.6529755592346191, "learning_rate": 4.974768397053863e-05, "loss": 0.5697, "num_input_tokens_seen": 5446272, "step": 9450 }, { "epoch": 1.4082514149538279, "grad_norm": 1.375038504600525, "learning_rate": 4.9746762144438144e-05, "loss": 0.7152, "num_input_tokens_seen": 5449344, "step": 9455 }, { "epoch": 1.408996127494787, "grad_norm": 1.3006583452224731, "learning_rate": 4.974583864605209e-05, "loss": 0.6539, "num_input_tokens_seen": 5452544, "step": 9460 }, { "epoch": 1.409740840035746, "grad_norm": 2.8015968799591064, "learning_rate": 4.974491347544287e-05, "loss": 0.8048, "num_input_tokens_seen": 5455200, "step": 9465 }, { "epoch": 1.4104855525767053, "grad_norm": 1.3786498308181763, "learning_rate": 4.974398663267299e-05, "loss": 0.5867, "num_input_tokens_seen": 5457760, "step": 9470 }, { "epoch": 1.4112302651176645, "grad_norm": 1.781711459159851, "learning_rate": 4.9743058117805105e-05, "loss": 0.6897, "num_input_tokens_seen": 5460576, "step": 9475 }, { "epoch": 1.4119749776586237, "grad_norm": 0.8953357338905334, "learning_rate": 4.974212793090195e-05, "loss": 0.6229, "num_input_tokens_seen": 5463456, "step": 9480 }, { "epoch": 1.4127196901995829, "grad_norm": 0.653526782989502, "learning_rate": 4.974119607202638e-05, "loss": 0.6425, "num_input_tokens_seen": 5466208, "step": 9485 }, { "epoch": 1.413464402740542, "grad_norm": 1.3747907876968384, "learning_rate": 4.974026254124138e-05, "loss": 0.5896, "num_input_tokens_seen": 5469184, "step": 9490 }, { "epoch": 1.4142091152815013, "grad_norm": 1.2823429107666016, "learning_rate": 4.973932733861001e-05, "loss": 0.7589, "num_input_tokens_seen": 5472096, "step": 9495 }, { "epoch": 1.4149538278224605, "grad_norm": 0.9500164985656738, "learning_rate": 4.9738390464195486e-05, "loss": 0.6185, "num_input_tokens_seen": 5474752, "step": 9500 }, { "epoch": 1.4156985403634197, "grad_norm": 0.9509435892105103, "learning_rate": 4.973745191806112e-05, "loss": 0.6476, "num_input_tokens_seen": 5477792, "step": 9505 }, { "epoch": 1.416443252904379, "grad_norm": 1.6859676837921143, "learning_rate": 4.9736511700270324e-05, "loss": 0.7286, "num_input_tokens_seen": 5480864, "step": 9510 }, { "epoch": 1.417187965445338, "grad_norm": 1.2255347967147827, "learning_rate": 4.973556981088664e-05, "loss": 0.7164, "num_input_tokens_seen": 5483584, "step": 9515 }, { "epoch": 1.4179326779862973, "grad_norm": 1.0796661376953125, "learning_rate": 4.9734626249973715e-05, "loss": 0.5586, "num_input_tokens_seen": 5486368, "step": 9520 }, { "epoch": 1.4186773905272565, "grad_norm": 1.580030918121338, "learning_rate": 4.973368101759531e-05, "loss": 0.819, "num_input_tokens_seen": 5489344, "step": 9525 }, { "epoch": 1.4194221030682157, "grad_norm": 1.6327985525131226, "learning_rate": 4.97327341138153e-05, "loss": 0.7683, "num_input_tokens_seen": 5492416, "step": 9530 }, { "epoch": 1.420166815609175, "grad_norm": 1.9015389680862427, "learning_rate": 4.973178553869767e-05, "loss": 0.8204, "num_input_tokens_seen": 5495520, "step": 9535 }, { "epoch": 1.420911528150134, "grad_norm": 2.093702554702759, "learning_rate": 4.973083529230654e-05, "loss": 0.6083, "num_input_tokens_seen": 5498272, "step": 9540 }, { "epoch": 1.4216562406910933, "grad_norm": 0.8313114047050476, "learning_rate": 4.97298833747061e-05, "loss": 0.664, "num_input_tokens_seen": 5501344, "step": 9545 }, { "epoch": 1.4224009532320525, "grad_norm": 0.8036438822746277, "learning_rate": 4.972892978596069e-05, "loss": 0.6473, "num_input_tokens_seen": 5504352, "step": 9550 }, { "epoch": 1.4231456657730117, "grad_norm": 0.9436957240104675, "learning_rate": 4.972797452613474e-05, "loss": 0.5693, "num_input_tokens_seen": 5507232, "step": 9555 }, { "epoch": 1.423890378313971, "grad_norm": 1.6565200090408325, "learning_rate": 4.972701759529281e-05, "loss": 0.8109, "num_input_tokens_seen": 5510208, "step": 9560 }, { "epoch": 1.4246350908549301, "grad_norm": 1.595812439918518, "learning_rate": 4.972605899349957e-05, "loss": 0.5829, "num_input_tokens_seen": 5512896, "step": 9565 }, { "epoch": 1.4253798033958893, "grad_norm": 1.1068230867385864, "learning_rate": 4.9725098720819784e-05, "loss": 0.6657, "num_input_tokens_seen": 5515808, "step": 9570 }, { "epoch": 1.4261245159368483, "grad_norm": 1.8732551336288452, "learning_rate": 4.9724136777318354e-05, "loss": 0.6357, "num_input_tokens_seen": 5518752, "step": 9575 }, { "epoch": 1.4268692284778075, "grad_norm": 1.6303699016571045, "learning_rate": 4.972317316306028e-05, "loss": 0.63, "num_input_tokens_seen": 5521632, "step": 9580 }, { "epoch": 1.4276139410187667, "grad_norm": 0.9607333540916443, "learning_rate": 4.972220787811068e-05, "loss": 0.6404, "num_input_tokens_seen": 5524704, "step": 9585 }, { "epoch": 1.428358653559726, "grad_norm": 0.9298643469810486, "learning_rate": 4.972124092253479e-05, "loss": 0.6806, "num_input_tokens_seen": 5527584, "step": 9590 }, { "epoch": 1.4291033661006851, "grad_norm": 1.3380502462387085, "learning_rate": 4.9720272296397946e-05, "loss": 0.8443, "num_input_tokens_seen": 5530240, "step": 9595 }, { "epoch": 1.4298480786416443, "grad_norm": 0.8512818217277527, "learning_rate": 4.9719301999765605e-05, "loss": 0.7161, "num_input_tokens_seen": 5532992, "step": 9600 }, { "epoch": 1.4305927911826035, "grad_norm": 1.1338386535644531, "learning_rate": 4.971833003270333e-05, "loss": 0.6329, "num_input_tokens_seen": 5535712, "step": 9605 }, { "epoch": 1.4313375037235627, "grad_norm": 1.1760960817337036, "learning_rate": 4.9717356395276814e-05, "loss": 0.7237, "num_input_tokens_seen": 5539168, "step": 9610 }, { "epoch": 1.432082216264522, "grad_norm": 0.6818943023681641, "learning_rate": 4.971638108755186e-05, "loss": 0.6502, "num_input_tokens_seen": 5542112, "step": 9615 }, { "epoch": 1.4328269288054811, "grad_norm": 1.0829118490219116, "learning_rate": 4.9715404109594347e-05, "loss": 0.6917, "num_input_tokens_seen": 5545248, "step": 9620 }, { "epoch": 1.4335716413464403, "grad_norm": 1.6821094751358032, "learning_rate": 4.971442546147031e-05, "loss": 0.6438, "num_input_tokens_seen": 5548128, "step": 9625 }, { "epoch": 1.4343163538873995, "grad_norm": 2.1712982654571533, "learning_rate": 4.9713445143245876e-05, "loss": 0.634, "num_input_tokens_seen": 5550720, "step": 9630 }, { "epoch": 1.4350610664283587, "grad_norm": 1.2631653547286987, "learning_rate": 4.9712463154987305e-05, "loss": 0.8276, "num_input_tokens_seen": 5553600, "step": 9635 }, { "epoch": 1.4358057789693177, "grad_norm": 1.2045902013778687, "learning_rate": 4.9711479496760947e-05, "loss": 0.6343, "num_input_tokens_seen": 5556768, "step": 9640 }, { "epoch": 1.436550491510277, "grad_norm": 0.9078525900840759, "learning_rate": 4.971049416863327e-05, "loss": 0.6709, "num_input_tokens_seen": 5559648, "step": 9645 }, { "epoch": 1.4372952040512361, "grad_norm": 1.1356217861175537, "learning_rate": 4.9709507170670866e-05, "loss": 0.5009, "num_input_tokens_seen": 5562368, "step": 9650 }, { "epoch": 1.4380399165921953, "grad_norm": 1.4645185470581055, "learning_rate": 4.970851850294043e-05, "loss": 0.6582, "num_input_tokens_seen": 5565184, "step": 9655 }, { "epoch": 1.4387846291331545, "grad_norm": 1.1548352241516113, "learning_rate": 4.970752816550877e-05, "loss": 0.499, "num_input_tokens_seen": 5568352, "step": 9660 }, { "epoch": 1.4395293416741137, "grad_norm": 0.7735981941223145, "learning_rate": 4.970653615844281e-05, "loss": 0.7235, "num_input_tokens_seen": 5571136, "step": 9665 }, { "epoch": 1.440274054215073, "grad_norm": 0.8634705543518066, "learning_rate": 4.970554248180959e-05, "loss": 0.74, "num_input_tokens_seen": 5573952, "step": 9670 }, { "epoch": 1.4410187667560321, "grad_norm": 1.981824278831482, "learning_rate": 4.970454713567625e-05, "loss": 0.7041, "num_input_tokens_seen": 5577056, "step": 9675 }, { "epoch": 1.4417634792969913, "grad_norm": 0.8475579619407654, "learning_rate": 4.970355012011005e-05, "loss": 0.7301, "num_input_tokens_seen": 5579520, "step": 9680 }, { "epoch": 1.4425081918379505, "grad_norm": 2.0230541229248047, "learning_rate": 4.970255143517838e-05, "loss": 0.5669, "num_input_tokens_seen": 5582496, "step": 9685 }, { "epoch": 1.4432529043789097, "grad_norm": 0.7289209961891174, "learning_rate": 4.9701551080948714e-05, "loss": 0.6093, "num_input_tokens_seen": 5585280, "step": 9690 }, { "epoch": 1.443997616919869, "grad_norm": 0.6043176054954529, "learning_rate": 4.970054905748865e-05, "loss": 0.6805, "num_input_tokens_seen": 5587872, "step": 9695 }, { "epoch": 1.4447423294608281, "grad_norm": 0.9683037996292114, "learning_rate": 4.969954536486592e-05, "loss": 0.5886, "num_input_tokens_seen": 5590784, "step": 9700 }, { "epoch": 1.4454870420017873, "grad_norm": 0.9044749736785889, "learning_rate": 4.969854000314833e-05, "loss": 0.7456, "num_input_tokens_seen": 5593408, "step": 9705 }, { "epoch": 1.4462317545427466, "grad_norm": 2.5937201976776123, "learning_rate": 4.9697532972403816e-05, "loss": 0.6217, "num_input_tokens_seen": 5596448, "step": 9710 }, { "epoch": 1.4469764670837058, "grad_norm": 1.3738254308700562, "learning_rate": 4.969652427270044e-05, "loss": 0.8271, "num_input_tokens_seen": 5599232, "step": 9715 }, { "epoch": 1.447721179624665, "grad_norm": 1.0979183912277222, "learning_rate": 4.969551390410636e-05, "loss": 0.5927, "num_input_tokens_seen": 5601984, "step": 9720 }, { "epoch": 1.4484658921656242, "grad_norm": 1.5651592016220093, "learning_rate": 4.969450186668986e-05, "loss": 0.8584, "num_input_tokens_seen": 5604896, "step": 9725 }, { "epoch": 1.4492106047065834, "grad_norm": 0.9826981425285339, "learning_rate": 4.969348816051932e-05, "loss": 0.7003, "num_input_tokens_seen": 5608096, "step": 9730 }, { "epoch": 1.4499553172475426, "grad_norm": 0.9686066508293152, "learning_rate": 4.9692472785663244e-05, "loss": 0.6461, "num_input_tokens_seen": 5610912, "step": 9735 }, { "epoch": 1.4507000297885018, "grad_norm": 1.258296251296997, "learning_rate": 4.9691455742190266e-05, "loss": 0.646, "num_input_tokens_seen": 5614080, "step": 9740 }, { "epoch": 1.4514447423294607, "grad_norm": 0.7666683197021484, "learning_rate": 4.969043703016908e-05, "loss": 0.6848, "num_input_tokens_seen": 5616704, "step": 9745 }, { "epoch": 1.45218945487042, "grad_norm": 0.9600444436073303, "learning_rate": 4.9689416649668554e-05, "loss": 0.6535, "num_input_tokens_seen": 5619648, "step": 9750 }, { "epoch": 1.4529341674113792, "grad_norm": 1.1003750562667847, "learning_rate": 4.9688394600757624e-05, "loss": 0.596, "num_input_tokens_seen": 5622432, "step": 9755 }, { "epoch": 1.4536788799523384, "grad_norm": 1.3016079664230347, "learning_rate": 4.968737088350536e-05, "loss": 0.638, "num_input_tokens_seen": 5624992, "step": 9760 }, { "epoch": 1.4544235924932976, "grad_norm": 1.1794687509536743, "learning_rate": 4.9686345497980945e-05, "loss": 0.5836, "num_input_tokens_seen": 5627872, "step": 9765 }, { "epoch": 1.4551683050342568, "grad_norm": 1.4574589729309082, "learning_rate": 4.968531844425367e-05, "loss": 0.604, "num_input_tokens_seen": 5630688, "step": 9770 }, { "epoch": 1.455913017575216, "grad_norm": 1.2631604671478271, "learning_rate": 4.968428972239294e-05, "loss": 0.8111, "num_input_tokens_seen": 5633568, "step": 9775 }, { "epoch": 1.4566577301161752, "grad_norm": 1.3102413415908813, "learning_rate": 4.9683259332468265e-05, "loss": 0.7386, "num_input_tokens_seen": 5636288, "step": 9780 }, { "epoch": 1.4574024426571344, "grad_norm": 0.8177528381347656, "learning_rate": 4.968222727454929e-05, "loss": 0.6128, "num_input_tokens_seen": 5639200, "step": 9785 }, { "epoch": 1.4581471551980936, "grad_norm": 1.1837501525878906, "learning_rate": 4.9681193548705736e-05, "loss": 0.5823, "num_input_tokens_seen": 5642176, "step": 9790 }, { "epoch": 1.4588918677390528, "grad_norm": 0.8193578124046326, "learning_rate": 4.9680158155007474e-05, "loss": 0.6598, "num_input_tokens_seen": 5645472, "step": 9795 }, { "epoch": 1.459636580280012, "grad_norm": 0.8560436964035034, "learning_rate": 4.967912109352446e-05, "loss": 0.6758, "num_input_tokens_seen": 5648384, "step": 9800 }, { "epoch": 1.4603812928209712, "grad_norm": 1.0317981243133545, "learning_rate": 4.9678082364326786e-05, "loss": 0.6704, "num_input_tokens_seen": 5651296, "step": 9805 }, { "epoch": 1.4611260053619302, "grad_norm": 2.1424355506896973, "learning_rate": 4.9677041967484635e-05, "loss": 0.7082, "num_input_tokens_seen": 5654080, "step": 9810 }, { "epoch": 1.4618707179028894, "grad_norm": 2.1422581672668457, "learning_rate": 4.967599990306832e-05, "loss": 0.6893, "num_input_tokens_seen": 5656608, "step": 9815 }, { "epoch": 1.4626154304438486, "grad_norm": 1.354752779006958, "learning_rate": 4.967495617114826e-05, "loss": 0.7881, "num_input_tokens_seen": 5659328, "step": 9820 }, { "epoch": 1.4633601429848078, "grad_norm": 1.741196632385254, "learning_rate": 4.9673910771794974e-05, "loss": 0.6365, "num_input_tokens_seen": 5662240, "step": 9825 }, { "epoch": 1.464104855525767, "grad_norm": 0.7651104927062988, "learning_rate": 4.967286370507912e-05, "loss": 0.6961, "num_input_tokens_seen": 5665376, "step": 9830 }, { "epoch": 1.4648495680667262, "grad_norm": 1.775238037109375, "learning_rate": 4.967181497107145e-05, "loss": 0.6834, "num_input_tokens_seen": 5668480, "step": 9835 }, { "epoch": 1.4655942806076854, "grad_norm": 2.175840139389038, "learning_rate": 4.967076456984283e-05, "loss": 0.7333, "num_input_tokens_seen": 5671296, "step": 9840 }, { "epoch": 1.4663389931486446, "grad_norm": 1.1347384452819824, "learning_rate": 4.966971250146425e-05, "loss": 0.7823, "num_input_tokens_seen": 5674048, "step": 9845 }, { "epoch": 1.4670837056896038, "grad_norm": 1.3448774814605713, "learning_rate": 4.966865876600679e-05, "loss": 0.5135, "num_input_tokens_seen": 5676544, "step": 9850 }, { "epoch": 1.467828418230563, "grad_norm": 1.7203943729400635, "learning_rate": 4.9667603363541676e-05, "loss": 0.6441, "num_input_tokens_seen": 5679296, "step": 9855 }, { "epoch": 1.4685731307715222, "grad_norm": 2.043351888656616, "learning_rate": 4.9666546294140216e-05, "loss": 0.5325, "num_input_tokens_seen": 5682016, "step": 9860 }, { "epoch": 1.4693178433124814, "grad_norm": 0.7522251009941101, "learning_rate": 4.9665487557873834e-05, "loss": 0.6886, "num_input_tokens_seen": 5685376, "step": 9865 }, { "epoch": 1.4700625558534406, "grad_norm": 1.7108060121536255, "learning_rate": 4.9664427154814094e-05, "loss": 0.6655, "num_input_tokens_seen": 5688256, "step": 9870 }, { "epoch": 1.4708072683943998, "grad_norm": 1.4111967086791992, "learning_rate": 4.966336508503265e-05, "loss": 0.8039, "num_input_tokens_seen": 5691392, "step": 9875 }, { "epoch": 1.471551980935359, "grad_norm": 1.2702853679656982, "learning_rate": 4.966230134860126e-05, "loss": 0.8597, "num_input_tokens_seen": 5694176, "step": 9880 }, { "epoch": 1.4722966934763182, "grad_norm": 0.9403135776519775, "learning_rate": 4.966123594559182e-05, "loss": 0.7226, "num_input_tokens_seen": 5697088, "step": 9885 }, { "epoch": 1.4730414060172774, "grad_norm": 0.6735885739326477, "learning_rate": 4.966016887607631e-05, "loss": 0.7238, "num_input_tokens_seen": 5699776, "step": 9890 }, { "epoch": 1.4737861185582366, "grad_norm": 0.8241398334503174, "learning_rate": 4.9659100140126856e-05, "loss": 0.7223, "num_input_tokens_seen": 5702912, "step": 9895 }, { "epoch": 1.4745308310991958, "grad_norm": 1.053758144378662, "learning_rate": 4.965802973781567e-05, "loss": 0.7983, "num_input_tokens_seen": 5705664, "step": 9900 }, { "epoch": 1.475275543640155, "grad_norm": 1.0820289850234985, "learning_rate": 4.965695766921509e-05, "loss": 0.5791, "num_input_tokens_seen": 5708512, "step": 9905 }, { "epoch": 1.4760202561811142, "grad_norm": 0.7959849834442139, "learning_rate": 4.965588393439755e-05, "loss": 0.7057, "num_input_tokens_seen": 5711360, "step": 9910 }, { "epoch": 1.4767649687220734, "grad_norm": 0.9901636838912964, "learning_rate": 4.965480853343563e-05, "loss": 0.8039, "num_input_tokens_seen": 5713792, "step": 9915 }, { "epoch": 1.4775096812630324, "grad_norm": 0.8850715756416321, "learning_rate": 4.9653731466401975e-05, "loss": 0.6789, "num_input_tokens_seen": 5716256, "step": 9920 }, { "epoch": 1.4782543938039916, "grad_norm": 1.474879264831543, "learning_rate": 4.965265273336939e-05, "loss": 0.6133, "num_input_tokens_seen": 5718912, "step": 9925 }, { "epoch": 1.4789991063449508, "grad_norm": 0.9282975196838379, "learning_rate": 4.9651572334410757e-05, "loss": 0.6971, "num_input_tokens_seen": 5721632, "step": 9930 }, { "epoch": 1.47974381888591, "grad_norm": 0.9533237218856812, "learning_rate": 4.9650490269599096e-05, "loss": 0.7198, "num_input_tokens_seen": 5724544, "step": 9935 }, { "epoch": 1.4804885314268692, "grad_norm": 0.614286482334137, "learning_rate": 4.964940653900753e-05, "loss": 0.6158, "num_input_tokens_seen": 5727328, "step": 9940 }, { "epoch": 1.4812332439678284, "grad_norm": 0.7448946237564087, "learning_rate": 4.964832114270928e-05, "loss": 0.7004, "num_input_tokens_seen": 5730144, "step": 9945 }, { "epoch": 1.4819779565087876, "grad_norm": 1.937149167060852, "learning_rate": 4.96472340807777e-05, "loss": 0.6387, "num_input_tokens_seen": 5732864, "step": 9950 }, { "epoch": 1.4827226690497468, "grad_norm": 1.0240806341171265, "learning_rate": 4.964614535328626e-05, "loss": 0.8926, "num_input_tokens_seen": 5735904, "step": 9955 }, { "epoch": 1.483467381590706, "grad_norm": 1.002423644065857, "learning_rate": 4.9645054960308504e-05, "loss": 0.6896, "num_input_tokens_seen": 5738752, "step": 9960 }, { "epoch": 1.4842120941316652, "grad_norm": 0.679865300655365, "learning_rate": 4.964396290191814e-05, "loss": 0.6388, "num_input_tokens_seen": 5741504, "step": 9965 }, { "epoch": 1.4849568066726244, "grad_norm": 1.0352140665054321, "learning_rate": 4.964286917818895e-05, "loss": 0.8229, "num_input_tokens_seen": 5743968, "step": 9970 }, { "epoch": 1.4857015192135836, "grad_norm": 0.841863214969635, "learning_rate": 4.964177378919487e-05, "loss": 0.628, "num_input_tokens_seen": 5746912, "step": 9975 }, { "epoch": 1.4864462317545428, "grad_norm": 0.42442163825035095, "learning_rate": 4.9640676735009894e-05, "loss": 0.691, "num_input_tokens_seen": 5749536, "step": 9980 }, { "epoch": 1.4871909442955018, "grad_norm": 0.945928156375885, "learning_rate": 4.963957801570816e-05, "loss": 0.5998, "num_input_tokens_seen": 5752096, "step": 9985 }, { "epoch": 1.487935656836461, "grad_norm": 0.9536320567131042, "learning_rate": 4.963847763136393e-05, "loss": 0.676, "num_input_tokens_seen": 5755040, "step": 9990 }, { "epoch": 1.4886803693774202, "grad_norm": 0.8532750010490417, "learning_rate": 4.9637375582051556e-05, "loss": 0.5711, "num_input_tokens_seen": 5758016, "step": 9995 }, { "epoch": 1.4894250819183794, "grad_norm": 0.9009770750999451, "learning_rate": 4.96362718678455e-05, "loss": 0.5853, "num_input_tokens_seen": 5760832, "step": 10000 }, { "epoch": 1.4901697944593386, "grad_norm": 1.0227937698364258, "learning_rate": 4.9635166488820365e-05, "loss": 0.5085, "num_input_tokens_seen": 5763584, "step": 10005 }, { "epoch": 1.4909145070002978, "grad_norm": 0.916570246219635, "learning_rate": 4.963405944505083e-05, "loss": 0.6364, "num_input_tokens_seen": 5766400, "step": 10010 }, { "epoch": 1.491659219541257, "grad_norm": 0.5902239084243774, "learning_rate": 4.9632950736611713e-05, "loss": 0.6549, "num_input_tokens_seen": 5769664, "step": 10015 }, { "epoch": 1.4924039320822162, "grad_norm": 1.0578612089157104, "learning_rate": 4.963184036357793e-05, "loss": 0.6802, "num_input_tokens_seen": 5772288, "step": 10020 }, { "epoch": 1.4931486446231754, "grad_norm": 1.0344115495681763, "learning_rate": 4.9630728326024535e-05, "loss": 0.6325, "num_input_tokens_seen": 5775232, "step": 10025 }, { "epoch": 1.4938933571641346, "grad_norm": 1.2565646171569824, "learning_rate": 4.962961462402666e-05, "loss": 0.7314, "num_input_tokens_seen": 5777856, "step": 10030 }, { "epoch": 1.4946380697050938, "grad_norm": 1.7266974449157715, "learning_rate": 4.9628499257659553e-05, "loss": 0.6791, "num_input_tokens_seen": 5780960, "step": 10035 }, { "epoch": 1.495382782246053, "grad_norm": 1.425477385520935, "learning_rate": 4.9627382226998605e-05, "loss": 0.6553, "num_input_tokens_seen": 5783936, "step": 10040 }, { "epoch": 1.4961274947870122, "grad_norm": 0.6642030477523804, "learning_rate": 4.9626263532119286e-05, "loss": 0.5511, "num_input_tokens_seen": 5786720, "step": 10045 }, { "epoch": 1.4968722073279714, "grad_norm": 0.7684998512268066, "learning_rate": 4.962514317309721e-05, "loss": 0.7447, "num_input_tokens_seen": 5789472, "step": 10050 }, { "epoch": 1.4976169198689306, "grad_norm": 1.4281460046768188, "learning_rate": 4.962402115000808e-05, "loss": 0.6587, "num_input_tokens_seen": 5792192, "step": 10055 }, { "epoch": 1.4983616324098898, "grad_norm": 1.3028876781463623, "learning_rate": 4.962289746292771e-05, "loss": 0.6537, "num_input_tokens_seen": 5795264, "step": 10060 }, { "epoch": 1.499106344950849, "grad_norm": 2.497006893157959, "learning_rate": 4.962177211193203e-05, "loss": 0.6843, "num_input_tokens_seen": 5798112, "step": 10065 }, { "epoch": 1.4998510574918082, "grad_norm": 0.7173958420753479, "learning_rate": 4.962064509709711e-05, "loss": 0.6583, "num_input_tokens_seen": 5801088, "step": 10070 }, { "epoch": 1.5, "eval_loss": 0.6765435338020325, "eval_runtime": 74.2748, "eval_samples_per_second": 40.175, "eval_steps_per_second": 10.044, "num_input_tokens_seen": 5801696, "step": 10071 }, { "epoch": 1.5005957700327675, "grad_norm": 1.1787168979644775, "learning_rate": 4.961951641849909e-05, "loss": 0.6595, "num_input_tokens_seen": 5804000, "step": 10075 }, { "epoch": 1.5013404825737267, "grad_norm": 1.1250622272491455, "learning_rate": 4.961838607621424e-05, "loss": 0.7563, "num_input_tokens_seen": 5807488, "step": 10080 }, { "epoch": 1.5020851951146859, "grad_norm": 0.873467743396759, "learning_rate": 4.961725407031896e-05, "loss": 0.7024, "num_input_tokens_seen": 5810016, "step": 10085 }, { "epoch": 1.502829907655645, "grad_norm": 2.605067253112793, "learning_rate": 4.961612040088973e-05, "loss": 0.6738, "num_input_tokens_seen": 5812800, "step": 10090 }, { "epoch": 1.5035746201966043, "grad_norm": 1.2105247974395752, "learning_rate": 4.9614985068003163e-05, "loss": 0.6019, "num_input_tokens_seen": 5815552, "step": 10095 }, { "epoch": 1.5043193327375635, "grad_norm": 1.094332218170166, "learning_rate": 4.9613848071735987e-05, "loss": 0.7591, "num_input_tokens_seen": 5818656, "step": 10100 }, { "epoch": 1.5050640452785224, "grad_norm": 0.9489068984985352, "learning_rate": 4.9612709412165024e-05, "loss": 0.73, "num_input_tokens_seen": 5821600, "step": 10105 }, { "epoch": 1.5058087578194816, "grad_norm": 3.0600218772888184, "learning_rate": 4.961156908936724e-05, "loss": 0.9076, "num_input_tokens_seen": 5824448, "step": 10110 }, { "epoch": 1.5065534703604408, "grad_norm": 0.7804037928581238, "learning_rate": 4.961042710341967e-05, "loss": 0.6961, "num_input_tokens_seen": 5827328, "step": 10115 }, { "epoch": 1.5072981829014, "grad_norm": 1.4941035509109497, "learning_rate": 4.96092834543995e-05, "loss": 0.6576, "num_input_tokens_seen": 5830304, "step": 10120 }, { "epoch": 1.5080428954423593, "grad_norm": 1.8406599760055542, "learning_rate": 4.9608138142384e-05, "loss": 0.7196, "num_input_tokens_seen": 5833120, "step": 10125 }, { "epoch": 1.5087876079833185, "grad_norm": 0.8451074361801147, "learning_rate": 4.9606991167450584e-05, "loss": 0.5851, "num_input_tokens_seen": 5835840, "step": 10130 }, { "epoch": 1.5095323205242777, "grad_norm": 1.9162664413452148, "learning_rate": 4.9605842529676746e-05, "loss": 0.8727, "num_input_tokens_seen": 5838752, "step": 10135 }, { "epoch": 1.5102770330652369, "grad_norm": 1.7784172296524048, "learning_rate": 4.9604692229140106e-05, "loss": 0.7526, "num_input_tokens_seen": 5841792, "step": 10140 }, { "epoch": 1.5110217456061958, "grad_norm": 0.7669056057929993, "learning_rate": 4.96035402659184e-05, "loss": 0.8503, "num_input_tokens_seen": 5844672, "step": 10145 }, { "epoch": 1.511766458147155, "grad_norm": 1.4126263856887817, "learning_rate": 4.960238664008948e-05, "loss": 0.6876, "num_input_tokens_seen": 5847552, "step": 10150 }, { "epoch": 1.5125111706881142, "grad_norm": 0.8077000975608826, "learning_rate": 4.960123135173129e-05, "loss": 0.6927, "num_input_tokens_seen": 5850688, "step": 10155 }, { "epoch": 1.5132558832290735, "grad_norm": 0.9482710361480713, "learning_rate": 4.960007440092191e-05, "loss": 0.6265, "num_input_tokens_seen": 5853536, "step": 10160 }, { "epoch": 1.5140005957700327, "grad_norm": 0.527138352394104, "learning_rate": 4.959891578773953e-05, "loss": 0.6975, "num_input_tokens_seen": 5856480, "step": 10165 }, { "epoch": 1.5147453083109919, "grad_norm": 0.7724690437316895, "learning_rate": 4.959775551226242e-05, "loss": 0.7044, "num_input_tokens_seen": 5859264, "step": 10170 }, { "epoch": 1.515490020851951, "grad_norm": 0.8229654431343079, "learning_rate": 4.9596593574569e-05, "loss": 0.6806, "num_input_tokens_seen": 5862112, "step": 10175 }, { "epoch": 1.5162347333929103, "grad_norm": 1.0302250385284424, "learning_rate": 4.9595429974737796e-05, "loss": 0.585, "num_input_tokens_seen": 5865216, "step": 10180 }, { "epoch": 1.5169794459338695, "grad_norm": 0.8441370129585266, "learning_rate": 4.959426471284742e-05, "loss": 0.6015, "num_input_tokens_seen": 5867872, "step": 10185 }, { "epoch": 1.5177241584748287, "grad_norm": 0.9916788339614868, "learning_rate": 4.959309778897664e-05, "loss": 0.6906, "num_input_tokens_seen": 5870880, "step": 10190 }, { "epoch": 1.5184688710157879, "grad_norm": 0.7746250033378601, "learning_rate": 4.959192920320429e-05, "loss": 0.6517, "num_input_tokens_seen": 5873920, "step": 10195 }, { "epoch": 1.519213583556747, "grad_norm": 1.0601667165756226, "learning_rate": 4.959075895560935e-05, "loss": 0.6196, "num_input_tokens_seen": 5876928, "step": 10200 }, { "epoch": 1.5199582960977063, "grad_norm": 1.9179502725601196, "learning_rate": 4.9589587046270904e-05, "loss": 0.6644, "num_input_tokens_seen": 5879936, "step": 10205 }, { "epoch": 1.5207030086386655, "grad_norm": 1.4663223028182983, "learning_rate": 4.958841347526814e-05, "loss": 0.7835, "num_input_tokens_seen": 5883072, "step": 10210 }, { "epoch": 1.5214477211796247, "grad_norm": 2.225281238555908, "learning_rate": 4.9587238242680356e-05, "loss": 0.6223, "num_input_tokens_seen": 5885984, "step": 10215 }, { "epoch": 1.5221924337205839, "grad_norm": 1.2421507835388184, "learning_rate": 4.958606134858697e-05, "loss": 0.7387, "num_input_tokens_seen": 5888704, "step": 10220 }, { "epoch": 1.522937146261543, "grad_norm": 0.9470223188400269, "learning_rate": 4.9584882793067534e-05, "loss": 0.7155, "num_input_tokens_seen": 5891488, "step": 10225 }, { "epoch": 1.5236818588025023, "grad_norm": 1.1884530782699585, "learning_rate": 4.958370257620166e-05, "loss": 0.691, "num_input_tokens_seen": 5894176, "step": 10230 }, { "epoch": 1.5244265713434615, "grad_norm": 0.9095231890678406, "learning_rate": 4.958252069806912e-05, "loss": 0.6278, "num_input_tokens_seen": 5897088, "step": 10235 }, { "epoch": 1.5251712838844207, "grad_norm": 1.6953657865524292, "learning_rate": 4.9581337158749784e-05, "loss": 0.6982, "num_input_tokens_seen": 5899840, "step": 10240 }, { "epoch": 1.52591599642538, "grad_norm": 1.5142232179641724, "learning_rate": 4.958015195832362e-05, "loss": 0.7561, "num_input_tokens_seen": 5902816, "step": 10245 }, { "epoch": 1.526660708966339, "grad_norm": 1.6199980974197388, "learning_rate": 4.957896509687072e-05, "loss": 0.6683, "num_input_tokens_seen": 5905792, "step": 10250 }, { "epoch": 1.5274054215072983, "grad_norm": 1.363046646118164, "learning_rate": 4.957777657447128e-05, "loss": 0.817, "num_input_tokens_seen": 5909152, "step": 10255 }, { "epoch": 1.5281501340482575, "grad_norm": 0.8327904343605042, "learning_rate": 4.957658639120564e-05, "loss": 0.6928, "num_input_tokens_seen": 5912352, "step": 10260 }, { "epoch": 1.5288948465892167, "grad_norm": 0.5876739621162415, "learning_rate": 4.957539454715421e-05, "loss": 0.747, "num_input_tokens_seen": 5915264, "step": 10265 }, { "epoch": 1.529639559130176, "grad_norm": 1.00592839717865, "learning_rate": 4.957420104239753e-05, "loss": 0.7773, "num_input_tokens_seen": 5918144, "step": 10270 }, { "epoch": 1.5303842716711349, "grad_norm": 0.57032710313797, "learning_rate": 4.9573005877016255e-05, "loss": 0.6781, "num_input_tokens_seen": 5920896, "step": 10275 }, { "epoch": 1.531128984212094, "grad_norm": 1.0255755186080933, "learning_rate": 4.957180905109115e-05, "loss": 0.8415, "num_input_tokens_seen": 5923904, "step": 10280 }, { "epoch": 1.5318736967530533, "grad_norm": 1.0765128135681152, "learning_rate": 4.9570610564703086e-05, "loss": 0.7079, "num_input_tokens_seen": 5926688, "step": 10285 }, { "epoch": 1.5326184092940125, "grad_norm": 0.9191635251045227, "learning_rate": 4.956941041793306e-05, "loss": 0.6106, "num_input_tokens_seen": 5929536, "step": 10290 }, { "epoch": 1.5333631218349717, "grad_norm": 0.9685823321342468, "learning_rate": 4.956820861086217e-05, "loss": 0.7348, "num_input_tokens_seen": 5932352, "step": 10295 }, { "epoch": 1.534107834375931, "grad_norm": 0.9681239128112793, "learning_rate": 4.956700514357163e-05, "loss": 0.6873, "num_input_tokens_seen": 5935136, "step": 10300 }, { "epoch": 1.53485254691689, "grad_norm": 0.6462163925170898, "learning_rate": 4.956580001614277e-05, "loss": 0.7385, "num_input_tokens_seen": 5938048, "step": 10305 }, { "epoch": 1.5355972594578493, "grad_norm": 0.9019610285758972, "learning_rate": 4.9564593228657016e-05, "loss": 0.6977, "num_input_tokens_seen": 5940768, "step": 10310 }, { "epoch": 1.5363419719988085, "grad_norm": 0.6278368234634399, "learning_rate": 4.956338478119592e-05, "loss": 0.7001, "num_input_tokens_seen": 5943552, "step": 10315 }, { "epoch": 1.5370866845397675, "grad_norm": 0.8632965683937073, "learning_rate": 4.956217467384116e-05, "loss": 0.6975, "num_input_tokens_seen": 5946624, "step": 10320 }, { "epoch": 1.5378313970807267, "grad_norm": 1.5009195804595947, "learning_rate": 4.9560962906674493e-05, "loss": 0.6937, "num_input_tokens_seen": 5949440, "step": 10325 }, { "epoch": 1.538576109621686, "grad_norm": 0.8584983348846436, "learning_rate": 4.9559749479777805e-05, "loss": 0.6189, "num_input_tokens_seen": 5952576, "step": 10330 }, { "epoch": 1.539320822162645, "grad_norm": 1.3297570943832397, "learning_rate": 4.9558534393233104e-05, "loss": 0.7837, "num_input_tokens_seen": 5955680, "step": 10335 }, { "epoch": 1.5400655347036043, "grad_norm": 0.9982560276985168, "learning_rate": 4.955731764712249e-05, "loss": 0.6618, "num_input_tokens_seen": 5958368, "step": 10340 }, { "epoch": 1.5408102472445635, "grad_norm": 0.9240432381629944, "learning_rate": 4.9556099241528194e-05, "loss": 0.7658, "num_input_tokens_seen": 5961120, "step": 10345 }, { "epoch": 1.5415549597855227, "grad_norm": 0.6667784452438354, "learning_rate": 4.955487917653256e-05, "loss": 0.6942, "num_input_tokens_seen": 5964288, "step": 10350 }, { "epoch": 1.542299672326482, "grad_norm": 0.7517579197883606, "learning_rate": 4.955365745221802e-05, "loss": 0.6513, "num_input_tokens_seen": 5966944, "step": 10355 }, { "epoch": 1.543044384867441, "grad_norm": 0.6939538717269897, "learning_rate": 4.955243406866713e-05, "loss": 0.7285, "num_input_tokens_seen": 5969792, "step": 10360 }, { "epoch": 1.5437890974084003, "grad_norm": 0.8828434348106384, "learning_rate": 4.9551209025962575e-05, "loss": 0.7643, "num_input_tokens_seen": 5972800, "step": 10365 }, { "epoch": 1.5445338099493595, "grad_norm": 1.051435112953186, "learning_rate": 4.9549982324187125e-05, "loss": 0.6897, "num_input_tokens_seen": 5975616, "step": 10370 }, { "epoch": 1.5452785224903187, "grad_norm": 1.9487611055374146, "learning_rate": 4.954875396342369e-05, "loss": 0.7474, "num_input_tokens_seen": 5978368, "step": 10375 }, { "epoch": 1.546023235031278, "grad_norm": 0.5117963552474976, "learning_rate": 4.954752394375527e-05, "loss": 0.6899, "num_input_tokens_seen": 5981152, "step": 10380 }, { "epoch": 1.5467679475722371, "grad_norm": 1.5760103464126587, "learning_rate": 4.9546292265264985e-05, "loss": 0.6051, "num_input_tokens_seen": 5984224, "step": 10385 }, { "epoch": 1.5475126601131963, "grad_norm": 0.696965754032135, "learning_rate": 4.9545058928036056e-05, "loss": 0.6561, "num_input_tokens_seen": 5987072, "step": 10390 }, { "epoch": 1.5482573726541555, "grad_norm": 0.7171198129653931, "learning_rate": 4.9543823932151845e-05, "loss": 0.7573, "num_input_tokens_seen": 5990048, "step": 10395 }, { "epoch": 1.5490020851951147, "grad_norm": 0.8593280911445618, "learning_rate": 4.954258727769581e-05, "loss": 0.6049, "num_input_tokens_seen": 5993024, "step": 10400 }, { "epoch": 1.549746797736074, "grad_norm": 0.9640240669250488, "learning_rate": 4.9541348964751497e-05, "loss": 0.6434, "num_input_tokens_seen": 5995968, "step": 10405 }, { "epoch": 1.5504915102770331, "grad_norm": 0.9419775605201721, "learning_rate": 4.95401089934026e-05, "loss": 0.6383, "num_input_tokens_seen": 5998848, "step": 10410 }, { "epoch": 1.5512362228179923, "grad_norm": 0.7557100653648376, "learning_rate": 4.953886736373291e-05, "loss": 0.7208, "num_input_tokens_seen": 6001760, "step": 10415 }, { "epoch": 1.5519809353589515, "grad_norm": 0.7323458790779114, "learning_rate": 4.953762407582634e-05, "loss": 0.6701, "num_input_tokens_seen": 6004768, "step": 10420 }, { "epoch": 1.5527256478999107, "grad_norm": 0.8256774544715881, "learning_rate": 4.953637912976688e-05, "loss": 0.6194, "num_input_tokens_seen": 6007552, "step": 10425 }, { "epoch": 1.55347036044087, "grad_norm": 5.2353057861328125, "learning_rate": 4.9535132525638696e-05, "loss": 0.6124, "num_input_tokens_seen": 6010464, "step": 10430 }, { "epoch": 1.5542150729818291, "grad_norm": 0.6390699744224548, "learning_rate": 4.9533884263526e-05, "loss": 0.6681, "num_input_tokens_seen": 6013792, "step": 10435 }, { "epoch": 1.5549597855227884, "grad_norm": 0.5760030150413513, "learning_rate": 4.953263434351315e-05, "loss": 0.7649, "num_input_tokens_seen": 6016480, "step": 10440 }, { "epoch": 1.5557044980637476, "grad_norm": 0.9838626384735107, "learning_rate": 4.953138276568462e-05, "loss": 0.6979, "num_input_tokens_seen": 6019072, "step": 10445 }, { "epoch": 1.5564492106047065, "grad_norm": 0.8751863837242126, "learning_rate": 4.953012953012498e-05, "loss": 0.6581, "num_input_tokens_seen": 6022048, "step": 10450 }, { "epoch": 1.5571939231456657, "grad_norm": 0.8030690550804138, "learning_rate": 4.952887463691891e-05, "loss": 0.792, "num_input_tokens_seen": 6024896, "step": 10455 }, { "epoch": 1.557938635686625, "grad_norm": 1.3007588386535645, "learning_rate": 4.9527618086151226e-05, "loss": 0.7771, "num_input_tokens_seen": 6027744, "step": 10460 }, { "epoch": 1.5586833482275841, "grad_norm": 0.9108227491378784, "learning_rate": 4.952635987790683e-05, "loss": 0.7575, "num_input_tokens_seen": 6030464, "step": 10465 }, { "epoch": 1.5594280607685433, "grad_norm": 1.0189634561538696, "learning_rate": 4.9525100012270754e-05, "loss": 0.7447, "num_input_tokens_seen": 6033344, "step": 10470 }, { "epoch": 1.5601727733095025, "grad_norm": 1.3437294960021973, "learning_rate": 4.9523838489328134e-05, "loss": 0.6665, "num_input_tokens_seen": 6036256, "step": 10475 }, { "epoch": 1.5609174858504618, "grad_norm": 0.7870588898658752, "learning_rate": 4.952257530916421e-05, "loss": 0.7118, "num_input_tokens_seen": 6039008, "step": 10480 }, { "epoch": 1.561662198391421, "grad_norm": 0.6264339089393616, "learning_rate": 4.9521310471864346e-05, "loss": 0.6457, "num_input_tokens_seen": 6041792, "step": 10485 }, { "epoch": 1.5624069109323802, "grad_norm": 1.2835787534713745, "learning_rate": 4.952004397751402e-05, "loss": 0.6789, "num_input_tokens_seen": 6044768, "step": 10490 }, { "epoch": 1.5631516234733391, "grad_norm": 0.9642863869667053, "learning_rate": 4.951877582619881e-05, "loss": 0.5982, "num_input_tokens_seen": 6047808, "step": 10495 }, { "epoch": 1.5638963360142983, "grad_norm": 0.6375377178192139, "learning_rate": 4.951750601800442e-05, "loss": 0.653, "num_input_tokens_seen": 6050720, "step": 10500 }, { "epoch": 1.5646410485552575, "grad_norm": 0.5685775876045227, "learning_rate": 4.9516234553016656e-05, "loss": 0.684, "num_input_tokens_seen": 6053760, "step": 10505 }, { "epoch": 1.5653857610962167, "grad_norm": 1.3122893571853638, "learning_rate": 4.951496143132143e-05, "loss": 0.7162, "num_input_tokens_seen": 6056704, "step": 10510 }, { "epoch": 1.566130473637176, "grad_norm": 2.231240749359131, "learning_rate": 4.9513686653004785e-05, "loss": 0.5892, "num_input_tokens_seen": 6059424, "step": 10515 }, { "epoch": 1.5668751861781351, "grad_norm": 0.9216921329498291, "learning_rate": 4.951241021815286e-05, "loss": 0.5595, "num_input_tokens_seen": 6062144, "step": 10520 }, { "epoch": 1.5676198987190944, "grad_norm": 0.952233612537384, "learning_rate": 4.9511132126851914e-05, "loss": 0.7531, "num_input_tokens_seen": 6065248, "step": 10525 }, { "epoch": 1.5683646112600536, "grad_norm": 2.0774571895599365, "learning_rate": 4.950985237918831e-05, "loss": 0.7217, "num_input_tokens_seen": 6068128, "step": 10530 }, { "epoch": 1.5691093238010128, "grad_norm": 1.4220844507217407, "learning_rate": 4.950857097524854e-05, "loss": 0.7606, "num_input_tokens_seen": 6070848, "step": 10535 }, { "epoch": 1.569854036341972, "grad_norm": 0.782995343208313, "learning_rate": 4.950728791511918e-05, "loss": 0.6112, "num_input_tokens_seen": 6073952, "step": 10540 }, { "epoch": 1.5705987488829312, "grad_norm": 1.0669291019439697, "learning_rate": 4.950600319888695e-05, "loss": 0.7524, "num_input_tokens_seen": 6076960, "step": 10545 }, { "epoch": 1.5713434614238904, "grad_norm": 2.1446564197540283, "learning_rate": 4.9504716826638655e-05, "loss": 0.6428, "num_input_tokens_seen": 6079712, "step": 10550 }, { "epoch": 1.5720881739648496, "grad_norm": 0.8282887935638428, "learning_rate": 4.9503428798461226e-05, "loss": 0.6413, "num_input_tokens_seen": 6082272, "step": 10555 }, { "epoch": 1.5728328865058088, "grad_norm": 0.9903135895729065, "learning_rate": 4.95021391144417e-05, "loss": 0.6213, "num_input_tokens_seen": 6084960, "step": 10560 }, { "epoch": 1.573577599046768, "grad_norm": 0.6869747638702393, "learning_rate": 4.950084777466724e-05, "loss": 0.6316, "num_input_tokens_seen": 6087808, "step": 10565 }, { "epoch": 1.5743223115877272, "grad_norm": 1.4703730344772339, "learning_rate": 4.949955477922509e-05, "loss": 0.6608, "num_input_tokens_seen": 6090496, "step": 10570 }, { "epoch": 1.5750670241286864, "grad_norm": 0.976381778717041, "learning_rate": 4.9498260128202635e-05, "loss": 0.7144, "num_input_tokens_seen": 6093600, "step": 10575 }, { "epoch": 1.5758117366696456, "grad_norm": 0.9612259864807129, "learning_rate": 4.949696382168737e-05, "loss": 0.5463, "num_input_tokens_seen": 6096352, "step": 10580 }, { "epoch": 1.5765564492106048, "grad_norm": 0.985258162021637, "learning_rate": 4.949566585976688e-05, "loss": 0.542, "num_input_tokens_seen": 6098976, "step": 10585 }, { "epoch": 1.577301161751564, "grad_norm": 1.3838709592819214, "learning_rate": 4.949436624252889e-05, "loss": 0.5642, "num_input_tokens_seen": 6101600, "step": 10590 }, { "epoch": 1.5780458742925232, "grad_norm": 1.610919713973999, "learning_rate": 4.949306497006121e-05, "loss": 0.7082, "num_input_tokens_seen": 6104256, "step": 10595 }, { "epoch": 1.5787905868334824, "grad_norm": 1.2349953651428223, "learning_rate": 4.949176204245178e-05, "loss": 0.6332, "num_input_tokens_seen": 6107040, "step": 10600 }, { "epoch": 1.5795352993744416, "grad_norm": 1.0680181980133057, "learning_rate": 4.949045745978866e-05, "loss": 0.7149, "num_input_tokens_seen": 6109984, "step": 10605 }, { "epoch": 1.5802800119154008, "grad_norm": 1.4140129089355469, "learning_rate": 4.9489151222159984e-05, "loss": 0.7562, "num_input_tokens_seen": 6112960, "step": 10610 }, { "epoch": 1.58102472445636, "grad_norm": 0.9674448370933533, "learning_rate": 4.948784332965404e-05, "loss": 0.6919, "num_input_tokens_seen": 6115840, "step": 10615 }, { "epoch": 1.5817694369973192, "grad_norm": 1.2170205116271973, "learning_rate": 4.94865337823592e-05, "loss": 0.7302, "num_input_tokens_seen": 6118848, "step": 10620 }, { "epoch": 1.5825141495382782, "grad_norm": 1.125898838043213, "learning_rate": 4.948522258036397e-05, "loss": 0.6539, "num_input_tokens_seen": 6121760, "step": 10625 }, { "epoch": 1.5832588620792374, "grad_norm": 0.9513798952102661, "learning_rate": 4.948390972375694e-05, "loss": 0.7273, "num_input_tokens_seen": 6124672, "step": 10630 }, { "epoch": 1.5840035746201966, "grad_norm": 1.1624324321746826, "learning_rate": 4.948259521262684e-05, "loss": 0.6432, "num_input_tokens_seen": 6127392, "step": 10635 }, { "epoch": 1.5847482871611558, "grad_norm": 0.7535877823829651, "learning_rate": 4.948127904706249e-05, "loss": 0.6957, "num_input_tokens_seen": 6130368, "step": 10640 }, { "epoch": 1.585492999702115, "grad_norm": 1.2506860494613647, "learning_rate": 4.947996122715283e-05, "loss": 0.742, "num_input_tokens_seen": 6133216, "step": 10645 }, { "epoch": 1.5862377122430742, "grad_norm": 1.0951467752456665, "learning_rate": 4.947864175298693e-05, "loss": 0.6615, "num_input_tokens_seen": 6136224, "step": 10650 }, { "epoch": 1.5869824247840334, "grad_norm": 1.4480899572372437, "learning_rate": 4.9477320624653937e-05, "loss": 0.6838, "num_input_tokens_seen": 6139168, "step": 10655 }, { "epoch": 1.5877271373249926, "grad_norm": 0.9683999419212341, "learning_rate": 4.9475997842243136e-05, "loss": 0.6796, "num_input_tokens_seen": 6142144, "step": 10660 }, { "epoch": 1.5884718498659516, "grad_norm": 1.5141276121139526, "learning_rate": 4.947467340584391e-05, "loss": 0.7408, "num_input_tokens_seen": 6145056, "step": 10665 }, { "epoch": 1.5892165624069108, "grad_norm": 1.5665727853775024, "learning_rate": 4.947334731554577e-05, "loss": 0.6772, "num_input_tokens_seen": 6147776, "step": 10670 }, { "epoch": 1.58996127494787, "grad_norm": 0.9736891388893127, "learning_rate": 4.947201957143831e-05, "loss": 0.7521, "num_input_tokens_seen": 6150752, "step": 10675 }, { "epoch": 1.5907059874888292, "grad_norm": 1.7193493843078613, "learning_rate": 4.947069017361127e-05, "loss": 0.6865, "num_input_tokens_seen": 6153504, "step": 10680 }, { "epoch": 1.5914507000297884, "grad_norm": 0.9779208302497864, "learning_rate": 4.9469359122154476e-05, "loss": 0.6728, "num_input_tokens_seen": 6156576, "step": 10685 }, { "epoch": 1.5921954125707476, "grad_norm": 1.5021343231201172, "learning_rate": 4.946802641715788e-05, "loss": 0.667, "num_input_tokens_seen": 6159232, "step": 10690 }, { "epoch": 1.5929401251117068, "grad_norm": 1.4556845426559448, "learning_rate": 4.9466692058711536e-05, "loss": 0.9304, "num_input_tokens_seen": 6162400, "step": 10695 }, { "epoch": 1.593684837652666, "grad_norm": 0.926120400428772, "learning_rate": 4.946535604690562e-05, "loss": 0.6787, "num_input_tokens_seen": 6165536, "step": 10700 }, { "epoch": 1.5944295501936252, "grad_norm": 1.191696047782898, "learning_rate": 4.94640183818304e-05, "loss": 0.667, "num_input_tokens_seen": 6168352, "step": 10705 }, { "epoch": 1.5951742627345844, "grad_norm": 1.078097939491272, "learning_rate": 4.9462679063576286e-05, "loss": 0.69, "num_input_tokens_seen": 6171232, "step": 10710 }, { "epoch": 1.5959189752755436, "grad_norm": 0.9656943082809448, "learning_rate": 4.946133809223379e-05, "loss": 0.7345, "num_input_tokens_seen": 6174176, "step": 10715 }, { "epoch": 1.5966636878165028, "grad_norm": 1.5418715476989746, "learning_rate": 4.945999546789351e-05, "loss": 0.751, "num_input_tokens_seen": 6177728, "step": 10720 }, { "epoch": 1.597408400357462, "grad_norm": 0.9027251601219177, "learning_rate": 4.9458651190646185e-05, "loss": 0.7303, "num_input_tokens_seen": 6180576, "step": 10725 }, { "epoch": 1.5981531128984212, "grad_norm": 0.9574724435806274, "learning_rate": 4.945730526058265e-05, "loss": 0.8036, "num_input_tokens_seen": 6183776, "step": 10730 }, { "epoch": 1.5988978254393804, "grad_norm": 1.0592659711837769, "learning_rate": 4.9455957677793865e-05, "loss": 0.674, "num_input_tokens_seen": 6186368, "step": 10735 }, { "epoch": 1.5996425379803396, "grad_norm": 0.7083404660224915, "learning_rate": 4.94546084423709e-05, "loss": 0.4324, "num_input_tokens_seen": 6189120, "step": 10740 }, { "epoch": 1.6003872505212988, "grad_norm": 0.894458532333374, "learning_rate": 4.945325755440491e-05, "loss": 0.5535, "num_input_tokens_seen": 6191872, "step": 10745 }, { "epoch": 1.601131963062258, "grad_norm": 0.6948161125183105, "learning_rate": 4.945190501398719e-05, "loss": 0.6776, "num_input_tokens_seen": 6194816, "step": 10750 }, { "epoch": 1.6018766756032172, "grad_norm": 1.4776982069015503, "learning_rate": 4.945055082120915e-05, "loss": 0.7219, "num_input_tokens_seen": 6197664, "step": 10755 }, { "epoch": 1.6026213881441764, "grad_norm": 1.051393985748291, "learning_rate": 4.94491949761623e-05, "loss": 0.7269, "num_input_tokens_seen": 6200640, "step": 10760 }, { "epoch": 1.6033661006851356, "grad_norm": 0.7037354111671448, "learning_rate": 4.944783747893825e-05, "loss": 0.5965, "num_input_tokens_seen": 6203552, "step": 10765 }, { "epoch": 1.6041108132260948, "grad_norm": 0.8482094407081604, "learning_rate": 4.944647832962874e-05, "loss": 0.5349, "num_input_tokens_seen": 6206336, "step": 10770 }, { "epoch": 1.604855525767054, "grad_norm": 1.5196117162704468, "learning_rate": 4.9445117528325625e-05, "loss": 0.6599, "num_input_tokens_seen": 6209216, "step": 10775 }, { "epoch": 1.6056002383080132, "grad_norm": 0.7936420440673828, "learning_rate": 4.9443755075120844e-05, "loss": 0.6831, "num_input_tokens_seen": 6212192, "step": 10780 }, { "epoch": 1.6063449508489724, "grad_norm": 1.2072471380233765, "learning_rate": 4.944239097010648e-05, "loss": 0.6065, "num_input_tokens_seen": 6215424, "step": 10785 }, { "epoch": 1.6070896633899316, "grad_norm": 0.882750391960144, "learning_rate": 4.9441025213374706e-05, "loss": 0.6292, "num_input_tokens_seen": 6218528, "step": 10790 }, { "epoch": 1.6078343759308906, "grad_norm": 3.200303792953491, "learning_rate": 4.9439657805017825e-05, "loss": 0.766, "num_input_tokens_seen": 6221760, "step": 10795 }, { "epoch": 1.6085790884718498, "grad_norm": 0.8636793494224548, "learning_rate": 4.9438288745128234e-05, "loss": 0.5393, "num_input_tokens_seen": 6224320, "step": 10800 }, { "epoch": 1.609323801012809, "grad_norm": 0.9990919828414917, "learning_rate": 4.943691803379844e-05, "loss": 0.5379, "num_input_tokens_seen": 6227296, "step": 10805 }, { "epoch": 1.6100685135537682, "grad_norm": 0.877135694026947, "learning_rate": 4.9435545671121085e-05, "loss": 0.5795, "num_input_tokens_seen": 6230048, "step": 10810 }, { "epoch": 1.6108132260947274, "grad_norm": 2.466899871826172, "learning_rate": 4.94341716571889e-05, "loss": 0.689, "num_input_tokens_seen": 6233120, "step": 10815 }, { "epoch": 1.6115579386356866, "grad_norm": 2.6183254718780518, "learning_rate": 4.943279599209474e-05, "loss": 0.776, "num_input_tokens_seen": 6235936, "step": 10820 }, { "epoch": 1.6123026511766458, "grad_norm": 1.2455377578735352, "learning_rate": 4.943141867593155e-05, "loss": 0.7111, "num_input_tokens_seen": 6238848, "step": 10825 }, { "epoch": 1.613047363717605, "grad_norm": 1.0247071981430054, "learning_rate": 4.943003970879243e-05, "loss": 0.7269, "num_input_tokens_seen": 6241504, "step": 10830 }, { "epoch": 1.6137920762585642, "grad_norm": 2.407170057296753, "learning_rate": 4.942865909077055e-05, "loss": 0.804, "num_input_tokens_seen": 6244160, "step": 10835 }, { "epoch": 1.6145367887995232, "grad_norm": 1.54050874710083, "learning_rate": 4.942727682195921e-05, "loss": 0.579, "num_input_tokens_seen": 6247072, "step": 10840 }, { "epoch": 1.6152815013404824, "grad_norm": 2.543431282043457, "learning_rate": 4.942589290245181e-05, "loss": 0.723, "num_input_tokens_seen": 6249792, "step": 10845 }, { "epoch": 1.6160262138814416, "grad_norm": 3.079552412033081, "learning_rate": 4.9424507332341874e-05, "loss": 0.7545, "num_input_tokens_seen": 6252736, "step": 10850 }, { "epoch": 1.6167709264224008, "grad_norm": 3.0296669006347656, "learning_rate": 4.942312011172304e-05, "loss": 0.7344, "num_input_tokens_seen": 6255744, "step": 10855 }, { "epoch": 1.61751563896336, "grad_norm": 1.2586768865585327, "learning_rate": 4.942173124068905e-05, "loss": 0.6013, "num_input_tokens_seen": 6258688, "step": 10860 }, { "epoch": 1.6182603515043192, "grad_norm": 1.0393657684326172, "learning_rate": 4.9420340719333746e-05, "loss": 0.7495, "num_input_tokens_seen": 6261440, "step": 10865 }, { "epoch": 1.6190050640452784, "grad_norm": 1.2398639917373657, "learning_rate": 4.941894854775111e-05, "loss": 0.6796, "num_input_tokens_seen": 6264416, "step": 10870 }, { "epoch": 1.6197497765862376, "grad_norm": 1.1400197744369507, "learning_rate": 4.9417554726035206e-05, "loss": 0.7426, "num_input_tokens_seen": 6267360, "step": 10875 }, { "epoch": 1.6204944891271968, "grad_norm": 1.4780590534210205, "learning_rate": 4.941615925428024e-05, "loss": 0.7274, "num_input_tokens_seen": 6270304, "step": 10880 }, { "epoch": 1.621239201668156, "grad_norm": 1.036772608757019, "learning_rate": 4.9414762132580486e-05, "loss": 0.5259, "num_input_tokens_seen": 6273216, "step": 10885 }, { "epoch": 1.6219839142091153, "grad_norm": 1.3242950439453125, "learning_rate": 4.9413363361030374e-05, "loss": 0.689, "num_input_tokens_seen": 6275936, "step": 10890 }, { "epoch": 1.6227286267500745, "grad_norm": 1.272715449333191, "learning_rate": 4.941196293972442e-05, "loss": 0.6746, "num_input_tokens_seen": 6278720, "step": 10895 }, { "epoch": 1.6234733392910337, "grad_norm": 2.061689853668213, "learning_rate": 4.941056086875727e-05, "loss": 0.73, "num_input_tokens_seen": 6281568, "step": 10900 }, { "epoch": 1.6242180518319929, "grad_norm": 0.725940465927124, "learning_rate": 4.940915714822366e-05, "loss": 0.7956, "num_input_tokens_seen": 6284608, "step": 10905 }, { "epoch": 1.624962764372952, "grad_norm": 0.9105008840560913, "learning_rate": 4.940775177821845e-05, "loss": 0.6479, "num_input_tokens_seen": 6287264, "step": 10910 }, { "epoch": 1.6257074769139113, "grad_norm": 1.484431266784668, "learning_rate": 4.9406344758836606e-05, "loss": 0.7896, "num_input_tokens_seen": 6290208, "step": 10915 }, { "epoch": 1.6264521894548705, "grad_norm": 1.0729434490203857, "learning_rate": 4.9404936090173214e-05, "loss": 0.6528, "num_input_tokens_seen": 6293280, "step": 10920 }, { "epoch": 1.6271969019958297, "grad_norm": 0.8070443272590637, "learning_rate": 4.9403525772323466e-05, "loss": 0.8767, "num_input_tokens_seen": 6296000, "step": 10925 }, { "epoch": 1.6279416145367889, "grad_norm": 1.0778249502182007, "learning_rate": 4.9402113805382665e-05, "loss": 0.7496, "num_input_tokens_seen": 6298976, "step": 10930 }, { "epoch": 1.628686327077748, "grad_norm": 1.0365856885910034, "learning_rate": 4.9400700189446226e-05, "loss": 0.6777, "num_input_tokens_seen": 6302304, "step": 10935 }, { "epoch": 1.6294310396187073, "grad_norm": 0.7512404322624207, "learning_rate": 4.939928492460967e-05, "loss": 0.7205, "num_input_tokens_seen": 6305312, "step": 10940 }, { "epoch": 1.6301757521596665, "grad_norm": 0.8190939426422119, "learning_rate": 4.939786801096864e-05, "loss": 0.7092, "num_input_tokens_seen": 6308128, "step": 10945 }, { "epoch": 1.6309204647006257, "grad_norm": 0.9923102259635925, "learning_rate": 4.9396449448618886e-05, "loss": 0.7504, "num_input_tokens_seen": 6310656, "step": 10950 }, { "epoch": 1.6316651772415849, "grad_norm": 0.7030231952667236, "learning_rate": 4.9395029237656266e-05, "loss": 0.7131, "num_input_tokens_seen": 6313312, "step": 10955 }, { "epoch": 1.632409889782544, "grad_norm": 0.9954115748405457, "learning_rate": 4.939360737817675e-05, "loss": 0.7079, "num_input_tokens_seen": 6316288, "step": 10960 }, { "epoch": 1.6331546023235033, "grad_norm": 1.0691169500350952, "learning_rate": 4.939218387027643e-05, "loss": 0.7036, "num_input_tokens_seen": 6318976, "step": 10965 }, { "epoch": 1.6338993148644623, "grad_norm": 0.8778389096260071, "learning_rate": 4.939075871405149e-05, "loss": 0.5316, "num_input_tokens_seen": 6321888, "step": 10970 }, { "epoch": 1.6346440274054215, "grad_norm": 0.8346767425537109, "learning_rate": 4.9389331909598246e-05, "loss": 0.6885, "num_input_tokens_seen": 6324800, "step": 10975 }, { "epoch": 1.6353887399463807, "grad_norm": 1.0116124153137207, "learning_rate": 4.938790345701311e-05, "loss": 0.5896, "num_input_tokens_seen": 6327424, "step": 10980 }, { "epoch": 1.6361334524873399, "grad_norm": 0.9714064598083496, "learning_rate": 4.9386473356392614e-05, "loss": 0.6751, "num_input_tokens_seen": 6330016, "step": 10985 }, { "epoch": 1.636878165028299, "grad_norm": 0.9164465069770813, "learning_rate": 4.93850416078334e-05, "loss": 0.6446, "num_input_tokens_seen": 6332736, "step": 10990 }, { "epoch": 1.6376228775692583, "grad_norm": 0.8834813833236694, "learning_rate": 4.938360821143221e-05, "loss": 0.6724, "num_input_tokens_seen": 6335680, "step": 10995 }, { "epoch": 1.6383675901102175, "grad_norm": 0.9841081500053406, "learning_rate": 4.938217316728592e-05, "loss": 0.761, "num_input_tokens_seen": 6338400, "step": 11000 }, { "epoch": 1.6391123026511767, "grad_norm": 0.7347317337989807, "learning_rate": 4.9380736475491484e-05, "loss": 0.6041, "num_input_tokens_seen": 6340928, "step": 11005 }, { "epoch": 1.6398570151921357, "grad_norm": 1.0360910892486572, "learning_rate": 4.9379298136146016e-05, "loss": 0.6664, "num_input_tokens_seen": 6343520, "step": 11010 }, { "epoch": 1.6406017277330949, "grad_norm": 1.9672931432724, "learning_rate": 4.9377858149346686e-05, "loss": 0.7335, "num_input_tokens_seen": 6346304, "step": 11015 }, { "epoch": 1.641346440274054, "grad_norm": 0.97346031665802, "learning_rate": 4.937641651519083e-05, "loss": 0.6745, "num_input_tokens_seen": 6348992, "step": 11020 }, { "epoch": 1.6420911528150133, "grad_norm": 1.0318195819854736, "learning_rate": 4.937497323377584e-05, "loss": 0.6553, "num_input_tokens_seen": 6351776, "step": 11025 }, { "epoch": 1.6428358653559725, "grad_norm": 0.9975034594535828, "learning_rate": 4.9373528305199273e-05, "loss": 0.7963, "num_input_tokens_seen": 6354368, "step": 11030 }, { "epoch": 1.6435805778969317, "grad_norm": 0.8488661646842957, "learning_rate": 4.937208172955876e-05, "loss": 0.5684, "num_input_tokens_seen": 6357440, "step": 11035 }, { "epoch": 1.6443252904378909, "grad_norm": 0.8346595764160156, "learning_rate": 4.937063350695204e-05, "loss": 0.6542, "num_input_tokens_seen": 6359968, "step": 11040 }, { "epoch": 1.64507000297885, "grad_norm": 1.003662109375, "learning_rate": 4.9369183637477e-05, "loss": 0.6907, "num_input_tokens_seen": 6362848, "step": 11045 }, { "epoch": 1.6458147155198093, "grad_norm": 0.7044907808303833, "learning_rate": 4.93677321212316e-05, "loss": 0.7326, "num_input_tokens_seen": 6365792, "step": 11050 }, { "epoch": 1.6465594280607685, "grad_norm": 1.3532499074935913, "learning_rate": 4.936627895831394e-05, "loss": 0.8775, "num_input_tokens_seen": 6368480, "step": 11055 }, { "epoch": 1.6473041406017277, "grad_norm": 0.8755179047584534, "learning_rate": 4.936482414882222e-05, "loss": 0.6412, "num_input_tokens_seen": 6371712, "step": 11060 }, { "epoch": 1.648048853142687, "grad_norm": 1.7891980409622192, "learning_rate": 4.9363367692854735e-05, "loss": 0.8859, "num_input_tokens_seen": 6374688, "step": 11065 }, { "epoch": 1.648793565683646, "grad_norm": 0.9056479334831238, "learning_rate": 4.9361909590509924e-05, "loss": 0.6578, "num_input_tokens_seen": 6377504, "step": 11070 }, { "epoch": 1.6495382782246053, "grad_norm": 1.1662876605987549, "learning_rate": 4.9360449841886304e-05, "loss": 0.7646, "num_input_tokens_seen": 6380160, "step": 11075 }, { "epoch": 1.6502829907655645, "grad_norm": 0.9055593013763428, "learning_rate": 4.935898844708253e-05, "loss": 0.5537, "num_input_tokens_seen": 6383200, "step": 11080 }, { "epoch": 1.6510277033065237, "grad_norm": 0.6524332761764526, "learning_rate": 4.9357525406197345e-05, "loss": 0.7101, "num_input_tokens_seen": 6385952, "step": 11085 }, { "epoch": 1.651772415847483, "grad_norm": 1.0638238191604614, "learning_rate": 4.9356060719329636e-05, "loss": 0.5985, "num_input_tokens_seen": 6388736, "step": 11090 }, { "epoch": 1.6525171283884421, "grad_norm": 1.699432611465454, "learning_rate": 4.935459438657836e-05, "loss": 0.7357, "num_input_tokens_seen": 6391456, "step": 11095 }, { "epoch": 1.6532618409294013, "grad_norm": 0.8599189519882202, "learning_rate": 4.9353126408042616e-05, "loss": 0.5544, "num_input_tokens_seen": 6394336, "step": 11100 }, { "epoch": 1.6540065534703605, "grad_norm": 0.6805292367935181, "learning_rate": 4.9351656783821606e-05, "loss": 0.8429, "num_input_tokens_seen": 6397120, "step": 11105 }, { "epoch": 1.6547512660113197, "grad_norm": 0.8251600861549377, "learning_rate": 4.935018551401463e-05, "loss": 0.6191, "num_input_tokens_seen": 6400000, "step": 11110 }, { "epoch": 1.655495978552279, "grad_norm": 0.8847874999046326, "learning_rate": 4.934871259872112e-05, "loss": 0.6624, "num_input_tokens_seen": 6403168, "step": 11115 }, { "epoch": 1.6562406910932381, "grad_norm": 0.9511733055114746, "learning_rate": 4.9347238038040614e-05, "loss": 0.7868, "num_input_tokens_seen": 6405952, "step": 11120 }, { "epoch": 1.6569854036341973, "grad_norm": 0.5598331093788147, "learning_rate": 4.934576183207274e-05, "loss": 0.7024, "num_input_tokens_seen": 6408992, "step": 11125 }, { "epoch": 1.6577301161751565, "grad_norm": 0.8171051740646362, "learning_rate": 4.9344283980917273e-05, "loss": 0.7536, "num_input_tokens_seen": 6411680, "step": 11130 }, { "epoch": 1.6584748287161157, "grad_norm": 0.563878059387207, "learning_rate": 4.9342804484674064e-05, "loss": 0.5893, "num_input_tokens_seen": 6414624, "step": 11135 }, { "epoch": 1.6592195412570747, "grad_norm": 1.0925695896148682, "learning_rate": 4.93413233434431e-05, "loss": 0.593, "num_input_tokens_seen": 6417504, "step": 11140 }, { "epoch": 1.659964253798034, "grad_norm": 0.9318862557411194, "learning_rate": 4.933984055732447e-05, "loss": 0.7078, "num_input_tokens_seen": 6420192, "step": 11145 }, { "epoch": 1.6607089663389931, "grad_norm": 0.6930258870124817, "learning_rate": 4.9338356126418375e-05, "loss": 0.6106, "num_input_tokens_seen": 6423136, "step": 11150 }, { "epoch": 1.6614536788799523, "grad_norm": 0.8002507090568542, "learning_rate": 4.9336870050825124e-05, "loss": 0.6813, "num_input_tokens_seen": 6425920, "step": 11155 }, { "epoch": 1.6621983914209115, "grad_norm": 0.795850932598114, "learning_rate": 4.933538233064514e-05, "loss": 0.834, "num_input_tokens_seen": 6428896, "step": 11160 }, { "epoch": 1.6629431039618707, "grad_norm": 0.7205244302749634, "learning_rate": 4.9333892965978955e-05, "loss": 0.609, "num_input_tokens_seen": 6432032, "step": 11165 }, { "epoch": 1.66368781650283, "grad_norm": 0.9422351717948914, "learning_rate": 4.9332401956927224e-05, "loss": 0.4773, "num_input_tokens_seen": 6435008, "step": 11170 }, { "epoch": 1.6644325290437891, "grad_norm": 1.3386948108673096, "learning_rate": 4.93309093035907e-05, "loss": 0.6344, "num_input_tokens_seen": 6438176, "step": 11175 }, { "epoch": 1.6651772415847483, "grad_norm": 0.7872302532196045, "learning_rate": 4.932941500607025e-05, "loss": 0.5735, "num_input_tokens_seen": 6441088, "step": 11180 }, { "epoch": 1.6659219541257073, "grad_norm": 0.6713034510612488, "learning_rate": 4.9327919064466835e-05, "loss": 0.6907, "num_input_tokens_seen": 6444448, "step": 11185 }, { "epoch": 1.6666666666666665, "grad_norm": 1.8029392957687378, "learning_rate": 4.932642147888157e-05, "loss": 0.9732, "num_input_tokens_seen": 6447136, "step": 11190 }, { "epoch": 1.6674113792076257, "grad_norm": 1.4022345542907715, "learning_rate": 4.932492224941565e-05, "loss": 0.7544, "num_input_tokens_seen": 6449696, "step": 11195 }, { "epoch": 1.668156091748585, "grad_norm": 1.1596629619598389, "learning_rate": 4.932342137617037e-05, "loss": 0.6389, "num_input_tokens_seen": 6452736, "step": 11200 }, { "epoch": 1.6689008042895441, "grad_norm": 0.869145929813385, "learning_rate": 4.932191885924717e-05, "loss": 0.5284, "num_input_tokens_seen": 6455712, "step": 11205 }, { "epoch": 1.6696455168305033, "grad_norm": 0.9432793855667114, "learning_rate": 4.9320414698747586e-05, "loss": 0.6714, "num_input_tokens_seen": 6458464, "step": 11210 }, { "epoch": 1.6703902293714625, "grad_norm": 1.8046865463256836, "learning_rate": 4.931890889477325e-05, "loss": 0.606, "num_input_tokens_seen": 6461696, "step": 11215 }, { "epoch": 1.6711349419124217, "grad_norm": 0.737251877784729, "learning_rate": 4.931740144742593e-05, "loss": 0.6093, "num_input_tokens_seen": 6464896, "step": 11220 }, { "epoch": 1.671879654453381, "grad_norm": 0.9387555718421936, "learning_rate": 4.931589235680748e-05, "loss": 0.7897, "num_input_tokens_seen": 6467904, "step": 11225 }, { "epoch": 1.6726243669943401, "grad_norm": 0.8276250958442688, "learning_rate": 4.931438162301989e-05, "loss": 0.6223, "num_input_tokens_seen": 6470688, "step": 11230 }, { "epoch": 1.6733690795352993, "grad_norm": 1.4936686754226685, "learning_rate": 4.931286924616524e-05, "loss": 0.5521, "num_input_tokens_seen": 6473536, "step": 11235 }, { "epoch": 1.6741137920762585, "grad_norm": 0.9871876835823059, "learning_rate": 4.931135522634574e-05, "loss": 0.4608, "num_input_tokens_seen": 6476352, "step": 11240 }, { "epoch": 1.6748585046172177, "grad_norm": 1.7856336832046509, "learning_rate": 4.930983956366369e-05, "loss": 0.7015, "num_input_tokens_seen": 6479008, "step": 11245 }, { "epoch": 1.675603217158177, "grad_norm": 2.2819712162017822, "learning_rate": 4.930832225822153e-05, "loss": 0.5678, "num_input_tokens_seen": 6482016, "step": 11250 }, { "epoch": 1.6763479296991362, "grad_norm": 1.8778166770935059, "learning_rate": 4.9306803310121776e-05, "loss": 0.6148, "num_input_tokens_seen": 6484672, "step": 11255 }, { "epoch": 1.6770926422400954, "grad_norm": 1.1015875339508057, "learning_rate": 4.9305282719467076e-05, "loss": 0.7585, "num_input_tokens_seen": 6487584, "step": 11260 }, { "epoch": 1.6778373547810546, "grad_norm": 2.6847095489501953, "learning_rate": 4.93037604863602e-05, "loss": 0.792, "num_input_tokens_seen": 6490400, "step": 11265 }, { "epoch": 1.6785820673220138, "grad_norm": 2.044064521789551, "learning_rate": 4.930223661090398e-05, "loss": 0.7321, "num_input_tokens_seen": 6493440, "step": 11270 }, { "epoch": 1.679326779862973, "grad_norm": 0.95045405626297, "learning_rate": 4.930071109320144e-05, "loss": 0.7353, "num_input_tokens_seen": 6496096, "step": 11275 }, { "epoch": 1.6800714924039322, "grad_norm": 1.1293562650680542, "learning_rate": 4.929918393335563e-05, "loss": 0.6476, "num_input_tokens_seen": 6498880, "step": 11280 }, { "epoch": 1.6808162049448914, "grad_norm": 0.7051618695259094, "learning_rate": 4.9297655131469763e-05, "loss": 0.6687, "num_input_tokens_seen": 6501440, "step": 11285 }, { "epoch": 1.6815609174858506, "grad_norm": 0.7888309955596924, "learning_rate": 4.929612468764715e-05, "loss": 0.71, "num_input_tokens_seen": 6504736, "step": 11290 }, { "epoch": 1.6823056300268098, "grad_norm": 0.8750180006027222, "learning_rate": 4.929459260199122e-05, "loss": 0.6503, "num_input_tokens_seen": 6507424, "step": 11295 }, { "epoch": 1.683050342567769, "grad_norm": 0.7648637294769287, "learning_rate": 4.9293058874605485e-05, "loss": 0.6046, "num_input_tokens_seen": 6510560, "step": 11300 }, { "epoch": 1.6837950551087282, "grad_norm": 0.6949045062065125, "learning_rate": 4.9291523505593604e-05, "loss": 0.6738, "num_input_tokens_seen": 6513696, "step": 11305 }, { "epoch": 1.6845397676496874, "grad_norm": 1.277854084968567, "learning_rate": 4.928998649505933e-05, "loss": 0.7647, "num_input_tokens_seen": 6516800, "step": 11310 }, { "epoch": 1.6852844801906464, "grad_norm": 1.2956734895706177, "learning_rate": 4.9288447843106525e-05, "loss": 0.5716, "num_input_tokens_seen": 6519840, "step": 11315 }, { "epoch": 1.6860291927316056, "grad_norm": 1.2507619857788086, "learning_rate": 4.9286907549839156e-05, "loss": 0.5939, "num_input_tokens_seen": 6522816, "step": 11320 }, { "epoch": 1.6867739052725648, "grad_norm": 1.7684390544891357, "learning_rate": 4.928536561536132e-05, "loss": 0.624, "num_input_tokens_seen": 6525664, "step": 11325 }, { "epoch": 1.687518617813524, "grad_norm": 0.8712915778160095, "learning_rate": 4.928382203977722e-05, "loss": 0.7794, "num_input_tokens_seen": 6529088, "step": 11330 }, { "epoch": 1.6882633303544832, "grad_norm": 0.8831801414489746, "learning_rate": 4.9282276823191154e-05, "loss": 0.6622, "num_input_tokens_seen": 6531840, "step": 11335 }, { "epoch": 1.6890080428954424, "grad_norm": 0.7329356074333191, "learning_rate": 4.9280729965707545e-05, "loss": 0.7172, "num_input_tokens_seen": 6534688, "step": 11340 }, { "epoch": 1.6897527554364016, "grad_norm": 1.65667724609375, "learning_rate": 4.9279181467430926e-05, "loss": 0.7606, "num_input_tokens_seen": 6537664, "step": 11345 }, { "epoch": 1.6904974679773608, "grad_norm": 1.5763143301010132, "learning_rate": 4.927763132846593e-05, "loss": 0.7438, "num_input_tokens_seen": 6540544, "step": 11350 }, { "epoch": 1.69124218051832, "grad_norm": 1.0632047653198242, "learning_rate": 4.927607954891732e-05, "loss": 0.7779, "num_input_tokens_seen": 6543648, "step": 11355 }, { "epoch": 1.691986893059279, "grad_norm": 1.314992904663086, "learning_rate": 4.927452612888994e-05, "loss": 0.6872, "num_input_tokens_seen": 6546496, "step": 11360 }, { "epoch": 1.6927316056002382, "grad_norm": 0.6408660411834717, "learning_rate": 4.9272971068488795e-05, "loss": 0.6472, "num_input_tokens_seen": 6549568, "step": 11365 }, { "epoch": 1.6934763181411974, "grad_norm": 0.7290710210800171, "learning_rate": 4.9271414367818944e-05, "loss": 0.5791, "num_input_tokens_seen": 6552672, "step": 11370 }, { "epoch": 1.6942210306821566, "grad_norm": 1.2152706384658813, "learning_rate": 4.926985602698559e-05, "loss": 0.6944, "num_input_tokens_seen": 6555424, "step": 11375 }, { "epoch": 1.6949657432231158, "grad_norm": 0.6699880361557007, "learning_rate": 4.926829604609404e-05, "loss": 0.5204, "num_input_tokens_seen": 6558304, "step": 11380 }, { "epoch": 1.695710455764075, "grad_norm": 1.2888686656951904, "learning_rate": 4.926673442524971e-05, "loss": 0.6462, "num_input_tokens_seen": 6561120, "step": 11385 }, { "epoch": 1.6964551683050342, "grad_norm": 0.9394259452819824, "learning_rate": 4.926517116455813e-05, "loss": 0.6723, "num_input_tokens_seen": 6563968, "step": 11390 }, { "epoch": 1.6971998808459934, "grad_norm": 0.5628145933151245, "learning_rate": 4.926360626412494e-05, "loss": 0.6043, "num_input_tokens_seen": 6566656, "step": 11395 }, { "epoch": 1.6979445933869526, "grad_norm": 0.7265164256095886, "learning_rate": 4.926203972405588e-05, "loss": 0.672, "num_input_tokens_seen": 6569696, "step": 11400 }, { "epoch": 1.6986893059279118, "grad_norm": 0.5118221640586853, "learning_rate": 4.926047154445683e-05, "loss": 0.6371, "num_input_tokens_seen": 6572608, "step": 11405 }, { "epoch": 1.699434018468871, "grad_norm": 0.9617552757263184, "learning_rate": 4.925890172543374e-05, "loss": 0.6398, "num_input_tokens_seen": 6576096, "step": 11410 }, { "epoch": 1.7001787310098302, "grad_norm": 0.8619706630706787, "learning_rate": 4.92573302670927e-05, "loss": 0.7107, "num_input_tokens_seen": 6578752, "step": 11415 }, { "epoch": 1.7009234435507894, "grad_norm": 1.0228488445281982, "learning_rate": 4.9255757169539905e-05, "loss": 0.6043, "num_input_tokens_seen": 6581376, "step": 11420 }, { "epoch": 1.7016681560917486, "grad_norm": 1.821260929107666, "learning_rate": 4.9254182432881654e-05, "loss": 0.5682, "num_input_tokens_seen": 6584352, "step": 11425 }, { "epoch": 1.7024128686327078, "grad_norm": 1.0227197408676147, "learning_rate": 4.9252606057224373e-05, "loss": 0.5853, "num_input_tokens_seen": 6587168, "step": 11430 }, { "epoch": 1.703157581173667, "grad_norm": 1.6385211944580078, "learning_rate": 4.9251028042674573e-05, "loss": 0.7753, "num_input_tokens_seen": 6590144, "step": 11435 }, { "epoch": 1.7039022937146262, "grad_norm": 1.3726032972335815, "learning_rate": 4.9249448389338905e-05, "loss": 0.7015, "num_input_tokens_seen": 6593024, "step": 11440 }, { "epoch": 1.7046470062555854, "grad_norm": 1.2095263004302979, "learning_rate": 4.9247867097324095e-05, "loss": 0.806, "num_input_tokens_seen": 6596032, "step": 11445 }, { "epoch": 1.7053917187965446, "grad_norm": 1.3371227979660034, "learning_rate": 4.924628416673701e-05, "loss": 0.6495, "num_input_tokens_seen": 6598752, "step": 11450 }, { "epoch": 1.7061364313375038, "grad_norm": 1.075860857963562, "learning_rate": 4.9244699597684625e-05, "loss": 0.6839, "num_input_tokens_seen": 6602528, "step": 11455 }, { "epoch": 1.706881143878463, "grad_norm": 0.8978545665740967, "learning_rate": 4.924311339027401e-05, "loss": 0.5302, "num_input_tokens_seen": 6605472, "step": 11460 }, { "epoch": 1.7076258564194222, "grad_norm": 0.7107033133506775, "learning_rate": 4.924152554461236e-05, "loss": 0.6707, "num_input_tokens_seen": 6608512, "step": 11465 }, { "epoch": 1.7083705689603814, "grad_norm": 0.7266281843185425, "learning_rate": 4.9239936060806965e-05, "loss": 0.6439, "num_input_tokens_seen": 6611392, "step": 11470 }, { "epoch": 1.7091152815013406, "grad_norm": 0.7957136034965515, "learning_rate": 4.9238344938965254e-05, "loss": 0.8797, "num_input_tokens_seen": 6614624, "step": 11475 }, { "epoch": 1.7098599940422998, "grad_norm": 0.7751196622848511, "learning_rate": 4.923675217919473e-05, "loss": 0.6451, "num_input_tokens_seen": 6617536, "step": 11480 }, { "epoch": 1.710604706583259, "grad_norm": 0.7317087054252625, "learning_rate": 4.923515778160304e-05, "loss": 0.6641, "num_input_tokens_seen": 6620448, "step": 11485 }, { "epoch": 1.711349419124218, "grad_norm": 0.4909137189388275, "learning_rate": 4.9233561746297917e-05, "loss": 0.5876, "num_input_tokens_seen": 6623264, "step": 11490 }, { "epoch": 1.7120941316651772, "grad_norm": 0.6119643449783325, "learning_rate": 4.923196407338721e-05, "loss": 0.6484, "num_input_tokens_seen": 6626368, "step": 11495 }, { "epoch": 1.7128388442061364, "grad_norm": 0.718164324760437, "learning_rate": 4.923036476297891e-05, "loss": 0.766, "num_input_tokens_seen": 6629408, "step": 11500 }, { "epoch": 1.7135835567470956, "grad_norm": 1.0558805465698242, "learning_rate": 4.922876381518106e-05, "loss": 0.7042, "num_input_tokens_seen": 6632256, "step": 11505 }, { "epoch": 1.7143282692880548, "grad_norm": 0.9095181822776794, "learning_rate": 4.922716123010186e-05, "loss": 0.6305, "num_input_tokens_seen": 6635232, "step": 11510 }, { "epoch": 1.715072981829014, "grad_norm": 1.6107710599899292, "learning_rate": 4.92255570078496e-05, "loss": 0.6327, "num_input_tokens_seen": 6638016, "step": 11515 }, { "epoch": 1.7158176943699732, "grad_norm": 1.1861472129821777, "learning_rate": 4.92239511485327e-05, "loss": 0.6636, "num_input_tokens_seen": 6641024, "step": 11520 }, { "epoch": 1.7165624069109324, "grad_norm": 0.665441632270813, "learning_rate": 4.922234365225966e-05, "loss": 0.8138, "num_input_tokens_seen": 6643840, "step": 11525 }, { "epoch": 1.7173071194518914, "grad_norm": 1.889039158821106, "learning_rate": 4.922073451913912e-05, "loss": 0.8605, "num_input_tokens_seen": 6646720, "step": 11530 }, { "epoch": 1.7180518319928506, "grad_norm": 0.963973879814148, "learning_rate": 4.9219123749279816e-05, "loss": 0.6931, "num_input_tokens_seen": 6649760, "step": 11535 }, { "epoch": 1.7187965445338098, "grad_norm": 0.8916479349136353, "learning_rate": 4.92175113427906e-05, "loss": 0.5301, "num_input_tokens_seen": 6652448, "step": 11540 }, { "epoch": 1.719541257074769, "grad_norm": 1.3059402704238892, "learning_rate": 4.9215897299780426e-05, "loss": 0.662, "num_input_tokens_seen": 6655328, "step": 11545 }, { "epoch": 1.7202859696157282, "grad_norm": 1.6176316738128662, "learning_rate": 4.9214281620358374e-05, "loss": 0.6598, "num_input_tokens_seen": 6658368, "step": 11550 }, { "epoch": 1.7210306821566874, "grad_norm": 0.8653119802474976, "learning_rate": 4.92126643046336e-05, "loss": 0.8595, "num_input_tokens_seen": 6661344, "step": 11555 }, { "epoch": 1.7217753946976466, "grad_norm": 1.1625268459320068, "learning_rate": 4.921104535271543e-05, "loss": 0.7019, "num_input_tokens_seen": 6664320, "step": 11560 }, { "epoch": 1.7225201072386058, "grad_norm": 1.6717867851257324, "learning_rate": 4.9209424764713246e-05, "loss": 0.56, "num_input_tokens_seen": 6667296, "step": 11565 }, { "epoch": 1.723264819779565, "grad_norm": 1.5139095783233643, "learning_rate": 4.920780254073656e-05, "loss": 0.7653, "num_input_tokens_seen": 6669952, "step": 11570 }, { "epoch": 1.7240095323205242, "grad_norm": 0.8489640355110168, "learning_rate": 4.920617868089501e-05, "loss": 0.5581, "num_input_tokens_seen": 6673056, "step": 11575 }, { "epoch": 1.7247542448614834, "grad_norm": 1.8174240589141846, "learning_rate": 4.9204553185298315e-05, "loss": 0.6423, "num_input_tokens_seen": 6675840, "step": 11580 }, { "epoch": 1.7254989574024426, "grad_norm": 1.1809793710708618, "learning_rate": 4.920292605405632e-05, "loss": 0.7316, "num_input_tokens_seen": 6678464, "step": 11585 }, { "epoch": 1.7262436699434018, "grad_norm": 1.0019177198410034, "learning_rate": 4.9201297287278994e-05, "loss": 0.6642, "num_input_tokens_seen": 6681120, "step": 11590 }, { "epoch": 1.726988382484361, "grad_norm": 0.6855232119560242, "learning_rate": 4.919966688507638e-05, "loss": 0.4941, "num_input_tokens_seen": 6683872, "step": 11595 }, { "epoch": 1.7277330950253202, "grad_norm": 1.664534568786621, "learning_rate": 4.919803484755867e-05, "loss": 0.8621, "num_input_tokens_seen": 6686912, "step": 11600 }, { "epoch": 1.7284778075662794, "grad_norm": 0.3527231514453888, "learning_rate": 4.919640117483616e-05, "loss": 0.5383, "num_input_tokens_seen": 6689792, "step": 11605 }, { "epoch": 1.7292225201072386, "grad_norm": 0.8452638983726501, "learning_rate": 4.9194765867019214e-05, "loss": 0.6574, "num_input_tokens_seen": 6692448, "step": 11610 }, { "epoch": 1.7299672326481979, "grad_norm": 0.9507084488868713, "learning_rate": 4.919312892421837e-05, "loss": 0.4967, "num_input_tokens_seen": 6695264, "step": 11615 }, { "epoch": 1.730711945189157, "grad_norm": 0.8556393980979919, "learning_rate": 4.9191490346544236e-05, "loss": 0.5784, "num_input_tokens_seen": 6698112, "step": 11620 }, { "epoch": 1.7314566577301163, "grad_norm": 0.742897629737854, "learning_rate": 4.918985013410754e-05, "loss": 0.52, "num_input_tokens_seen": 6700864, "step": 11625 }, { "epoch": 1.7322013702710755, "grad_norm": 1.0567892789840698, "learning_rate": 4.918820828701912e-05, "loss": 0.6771, "num_input_tokens_seen": 6703808, "step": 11630 }, { "epoch": 1.7329460828120347, "grad_norm": 1.7507718801498413, "learning_rate": 4.9186564805389923e-05, "loss": 0.6564, "num_input_tokens_seen": 6707072, "step": 11635 }, { "epoch": 1.7336907953529939, "grad_norm": 0.9621801972389221, "learning_rate": 4.918491968933101e-05, "loss": 0.5228, "num_input_tokens_seen": 6709728, "step": 11640 }, { "epoch": 1.734435507893953, "grad_norm": 2.0127010345458984, "learning_rate": 4.918327293895356e-05, "loss": 0.9388, "num_input_tokens_seen": 6712480, "step": 11645 }, { "epoch": 1.7351802204349123, "grad_norm": 1.446887493133545, "learning_rate": 4.918162455436884e-05, "loss": 0.5951, "num_input_tokens_seen": 6715328, "step": 11650 }, { "epoch": 1.7359249329758715, "grad_norm": 1.467726469039917, "learning_rate": 4.9179974535688256e-05, "loss": 0.684, "num_input_tokens_seen": 6718176, "step": 11655 }, { "epoch": 1.7366696455168305, "grad_norm": 5.575995922088623, "learning_rate": 4.91783228830233e-05, "loss": 0.8016, "num_input_tokens_seen": 6721408, "step": 11660 }, { "epoch": 1.7374143580577897, "grad_norm": 2.493352174758911, "learning_rate": 4.9176669596485584e-05, "loss": 0.7214, "num_input_tokens_seen": 6724096, "step": 11665 }, { "epoch": 1.7381590705987489, "grad_norm": 1.3695319890975952, "learning_rate": 4.917501467618682e-05, "loss": 0.6435, "num_input_tokens_seen": 6726944, "step": 11670 }, { "epoch": 1.738903783139708, "grad_norm": 0.4584622383117676, "learning_rate": 4.917335812223887e-05, "loss": 0.5605, "num_input_tokens_seen": 6729888, "step": 11675 }, { "epoch": 1.7396484956806673, "grad_norm": 1.5564584732055664, "learning_rate": 4.917169993475366e-05, "loss": 0.7331, "num_input_tokens_seen": 6732544, "step": 11680 }, { "epoch": 1.7403932082216265, "grad_norm": 1.4664301872253418, "learning_rate": 4.917004011384323e-05, "loss": 0.5593, "num_input_tokens_seen": 6735488, "step": 11685 }, { "epoch": 1.7411379207625857, "grad_norm": 1.9379379749298096, "learning_rate": 4.916837865961976e-05, "loss": 0.7188, "num_input_tokens_seen": 6738304, "step": 11690 }, { "epoch": 1.7418826333035449, "grad_norm": 2.3971824645996094, "learning_rate": 4.916671557219553e-05, "loss": 0.8134, "num_input_tokens_seen": 6741280, "step": 11695 }, { "epoch": 1.742627345844504, "grad_norm": 0.8312482237815857, "learning_rate": 4.916505085168291e-05, "loss": 0.7116, "num_input_tokens_seen": 6744000, "step": 11700 }, { "epoch": 1.743372058385463, "grad_norm": 1.256395697593689, "learning_rate": 4.91633844981944e-05, "loss": 0.5316, "num_input_tokens_seen": 6746720, "step": 11705 }, { "epoch": 1.7441167709264223, "grad_norm": 0.7803140878677368, "learning_rate": 4.9161716511842614e-05, "loss": 0.7808, "num_input_tokens_seen": 6749856, "step": 11710 }, { "epoch": 1.7448614834673815, "grad_norm": 0.7696263194084167, "learning_rate": 4.916004689274026e-05, "loss": 0.6816, "num_input_tokens_seen": 6752640, "step": 11715 }, { "epoch": 1.7456061960083407, "grad_norm": 0.9510766267776489, "learning_rate": 4.915837564100016e-05, "loss": 0.6847, "num_input_tokens_seen": 6755520, "step": 11720 }, { "epoch": 1.7463509085492999, "grad_norm": 0.9329589605331421, "learning_rate": 4.915670275673525e-05, "loss": 0.5277, "num_input_tokens_seen": 6758368, "step": 11725 }, { "epoch": 1.747095621090259, "grad_norm": 0.9462190270423889, "learning_rate": 4.915502824005859e-05, "loss": 0.7394, "num_input_tokens_seen": 6761408, "step": 11730 }, { "epoch": 1.7478403336312183, "grad_norm": 1.246030330657959, "learning_rate": 4.915335209108333e-05, "loss": 0.6275, "num_input_tokens_seen": 6764640, "step": 11735 }, { "epoch": 1.7485850461721775, "grad_norm": 0.5131479501724243, "learning_rate": 4.9151674309922736e-05, "loss": 0.5597, "num_input_tokens_seen": 6767456, "step": 11740 }, { "epoch": 1.7493297587131367, "grad_norm": 1.0014358758926392, "learning_rate": 4.914999489669018e-05, "loss": 0.7335, "num_input_tokens_seen": 6770336, "step": 11745 }, { "epoch": 1.7500744712540959, "grad_norm": 0.9471925497055054, "learning_rate": 4.9148313851499156e-05, "loss": 0.6716, "num_input_tokens_seen": 6772992, "step": 11750 }, { "epoch": 1.750819183795055, "grad_norm": 0.7522146701812744, "learning_rate": 4.914663117446327e-05, "loss": 0.548, "num_input_tokens_seen": 6775744, "step": 11755 }, { "epoch": 1.7515638963360143, "grad_norm": 3.3539068698883057, "learning_rate": 4.9144946865696204e-05, "loss": 0.6231, "num_input_tokens_seen": 6778688, "step": 11760 }, { "epoch": 1.7523086088769735, "grad_norm": 1.3422561883926392, "learning_rate": 4.9143260925311814e-05, "loss": 0.7204, "num_input_tokens_seen": 6781568, "step": 11765 }, { "epoch": 1.7530533214179327, "grad_norm": 1.8052756786346436, "learning_rate": 4.9141573353424e-05, "loss": 0.7337, "num_input_tokens_seen": 6784960, "step": 11770 }, { "epoch": 1.7537980339588919, "grad_norm": 1.635176181793213, "learning_rate": 4.913988415014681e-05, "loss": 0.7208, "num_input_tokens_seen": 6788032, "step": 11775 }, { "epoch": 1.754542746499851, "grad_norm": 1.457543134689331, "learning_rate": 4.9138193315594404e-05, "loss": 0.6424, "num_input_tokens_seen": 6790880, "step": 11780 }, { "epoch": 1.7552874590408103, "grad_norm": 0.6635274291038513, "learning_rate": 4.913650084988103e-05, "loss": 0.8032, "num_input_tokens_seen": 6793696, "step": 11785 }, { "epoch": 1.7560321715817695, "grad_norm": 1.0785284042358398, "learning_rate": 4.9134806753121055e-05, "loss": 0.6472, "num_input_tokens_seen": 6796352, "step": 11790 }, { "epoch": 1.7567768841227287, "grad_norm": 0.9598947167396545, "learning_rate": 4.913311102542897e-05, "loss": 0.7312, "num_input_tokens_seen": 6799168, "step": 11795 }, { "epoch": 1.757521596663688, "grad_norm": 1.080239176750183, "learning_rate": 4.913141366691936e-05, "loss": 0.6638, "num_input_tokens_seen": 6802432, "step": 11800 }, { "epoch": 1.758266309204647, "grad_norm": 0.6634899973869324, "learning_rate": 4.912971467770692e-05, "loss": 0.5651, "num_input_tokens_seen": 6805504, "step": 11805 }, { "epoch": 1.7590110217456063, "grad_norm": 0.5532525181770325, "learning_rate": 4.912801405790647e-05, "loss": 0.6669, "num_input_tokens_seen": 6808448, "step": 11810 }, { "epoch": 1.7597557342865655, "grad_norm": 1.1030148267745972, "learning_rate": 4.9126311807632926e-05, "loss": 0.5244, "num_input_tokens_seen": 6811424, "step": 11815 }, { "epoch": 1.7605004468275247, "grad_norm": 0.6631883978843689, "learning_rate": 4.912460792700132e-05, "loss": 0.7023, "num_input_tokens_seen": 6813984, "step": 11820 }, { "epoch": 1.761245159368484, "grad_norm": 0.7936077117919922, "learning_rate": 4.912290241612679e-05, "loss": 0.4803, "num_input_tokens_seen": 6816928, "step": 11825 }, { "epoch": 1.7619898719094431, "grad_norm": 1.3923863172531128, "learning_rate": 4.91211952751246e-05, "loss": 0.7325, "num_input_tokens_seen": 6819776, "step": 11830 }, { "epoch": 1.762734584450402, "grad_norm": 1.5208946466445923, "learning_rate": 4.9119486504110105e-05, "loss": 0.7534, "num_input_tokens_seen": 6822496, "step": 11835 }, { "epoch": 1.7634792969913613, "grad_norm": 0.9169139862060547, "learning_rate": 4.911777610319877e-05, "loss": 0.6892, "num_input_tokens_seen": 6825472, "step": 11840 }, { "epoch": 1.7642240095323205, "grad_norm": 1.3054503202438354, "learning_rate": 4.911606407250617e-05, "loss": 0.7433, "num_input_tokens_seen": 6828352, "step": 11845 }, { "epoch": 1.7649687220732797, "grad_norm": 0.9632514715194702, "learning_rate": 4.9114350412148026e-05, "loss": 0.8106, "num_input_tokens_seen": 6831104, "step": 11850 }, { "epoch": 1.765713434614239, "grad_norm": 0.8509837985038757, "learning_rate": 4.911263512224011e-05, "loss": 0.777, "num_input_tokens_seen": 6834336, "step": 11855 }, { "epoch": 1.766458147155198, "grad_norm": 0.9489719867706299, "learning_rate": 4.911091820289836e-05, "loss": 0.8059, "num_input_tokens_seen": 6836992, "step": 11860 }, { "epoch": 1.7672028596961573, "grad_norm": 1.5370351076126099, "learning_rate": 4.910919965423878e-05, "loss": 0.6248, "num_input_tokens_seen": 6839744, "step": 11865 }, { "epoch": 1.7679475722371165, "grad_norm": 0.9626398682594299, "learning_rate": 4.91074794763775e-05, "loss": 0.6786, "num_input_tokens_seen": 6842528, "step": 11870 }, { "epoch": 1.7686922847780755, "grad_norm": 0.807601273059845, "learning_rate": 4.910575766943079e-05, "loss": 0.7714, "num_input_tokens_seen": 6845600, "step": 11875 }, { "epoch": 1.7694369973190347, "grad_norm": 2.3484041690826416, "learning_rate": 4.9104034233514965e-05, "loss": 0.7949, "num_input_tokens_seen": 6848384, "step": 11880 }, { "epoch": 1.770181709859994, "grad_norm": 1.0800280570983887, "learning_rate": 4.910230916874651e-05, "loss": 0.6234, "num_input_tokens_seen": 6851072, "step": 11885 }, { "epoch": 1.770926422400953, "grad_norm": 1.2686783075332642, "learning_rate": 4.9100582475242004e-05, "loss": 0.71, "num_input_tokens_seen": 6853984, "step": 11890 }, { "epoch": 1.7716711349419123, "grad_norm": 1.6026194095611572, "learning_rate": 4.909885415311811e-05, "loss": 0.7902, "num_input_tokens_seen": 6856416, "step": 11895 }, { "epoch": 1.7724158474828715, "grad_norm": 0.9156650304794312, "learning_rate": 4.9097124202491636e-05, "loss": 0.7531, "num_input_tokens_seen": 6859456, "step": 11900 }, { "epoch": 1.7731605600238307, "grad_norm": 0.8099337816238403, "learning_rate": 4.9095392623479474e-05, "loss": 0.6041, "num_input_tokens_seen": 6862208, "step": 11905 }, { "epoch": 1.77390527256479, "grad_norm": 1.5788501501083374, "learning_rate": 4.909365941619866e-05, "loss": 0.7679, "num_input_tokens_seen": 6865152, "step": 11910 }, { "epoch": 1.7746499851057491, "grad_norm": 0.8033789396286011, "learning_rate": 4.909192458076628e-05, "loss": 0.6975, "num_input_tokens_seen": 6867776, "step": 11915 }, { "epoch": 1.7753946976467083, "grad_norm": 0.637656569480896, "learning_rate": 4.9090188117299596e-05, "loss": 0.7454, "num_input_tokens_seen": 6870848, "step": 11920 }, { "epoch": 1.7761394101876675, "grad_norm": 0.5858885049819946, "learning_rate": 4.908845002591594e-05, "loss": 0.6988, "num_input_tokens_seen": 6873696, "step": 11925 }, { "epoch": 1.7768841227286267, "grad_norm": 0.9185991287231445, "learning_rate": 4.9086710306732775e-05, "loss": 0.8362, "num_input_tokens_seen": 6876928, "step": 11930 }, { "epoch": 1.777628835269586, "grad_norm": 1.3242144584655762, "learning_rate": 4.908496895986765e-05, "loss": 0.676, "num_input_tokens_seen": 6879584, "step": 11935 }, { "epoch": 1.7783735478105451, "grad_norm": 0.7082212567329407, "learning_rate": 4.908322598543825e-05, "loss": 0.6991, "num_input_tokens_seen": 6882784, "step": 11940 }, { "epoch": 1.7791182603515043, "grad_norm": 1.1278250217437744, "learning_rate": 4.908148138356235e-05, "loss": 0.7365, "num_input_tokens_seen": 6885632, "step": 11945 }, { "epoch": 1.7798629728924635, "grad_norm": 7.921813011169434, "learning_rate": 4.907973515435784e-05, "loss": 0.7588, "num_input_tokens_seen": 6888736, "step": 11950 }, { "epoch": 1.7806076854334227, "grad_norm": 65.15744018554688, "learning_rate": 4.907798729794274e-05, "loss": 0.7245, "num_input_tokens_seen": 6891840, "step": 11955 }, { "epoch": 1.781352397974382, "grad_norm": 0.8470786213874817, "learning_rate": 4.907623781443515e-05, "loss": 0.6941, "num_input_tokens_seen": 6894592, "step": 11960 }, { "epoch": 1.7820971105153411, "grad_norm": 1.4381603002548218, "learning_rate": 4.9074486703953295e-05, "loss": 0.5143, "num_input_tokens_seen": 6897376, "step": 11965 }, { "epoch": 1.7828418230563003, "grad_norm": 2.038166046142578, "learning_rate": 4.9072733966615506e-05, "loss": 0.716, "num_input_tokens_seen": 6900096, "step": 11970 }, { "epoch": 1.7835865355972595, "grad_norm": 1.3279118537902832, "learning_rate": 4.907097960254023e-05, "loss": 0.7479, "num_input_tokens_seen": 6903008, "step": 11975 }, { "epoch": 1.7843312481382188, "grad_norm": 0.820280134677887, "learning_rate": 4.9069223611846014e-05, "loss": 0.6826, "num_input_tokens_seen": 6906144, "step": 11980 }, { "epoch": 1.785075960679178, "grad_norm": 0.9758726954460144, "learning_rate": 4.906746599465153e-05, "loss": 0.5686, "num_input_tokens_seen": 6909152, "step": 11985 }, { "epoch": 1.7858206732201372, "grad_norm": 0.730968177318573, "learning_rate": 4.906570675107555e-05, "loss": 0.6774, "num_input_tokens_seen": 6912320, "step": 11990 }, { "epoch": 1.7865653857610964, "grad_norm": 0.783460795879364, "learning_rate": 4.906394588123694e-05, "loss": 0.6049, "num_input_tokens_seen": 6915232, "step": 11995 }, { "epoch": 1.7873100983020556, "grad_norm": 1.2499421834945679, "learning_rate": 4.9062183385254714e-05, "loss": 0.6405, "num_input_tokens_seen": 6918048, "step": 12000 }, { "epoch": 1.7880548108430145, "grad_norm": 1.1639939546585083, "learning_rate": 4.9060419263247954e-05, "loss": 0.6925, "num_input_tokens_seen": 6920832, "step": 12005 }, { "epoch": 1.7887995233839737, "grad_norm": 1.5468683242797852, "learning_rate": 4.905865351533589e-05, "loss": 0.7075, "num_input_tokens_seen": 6923712, "step": 12010 }, { "epoch": 1.789544235924933, "grad_norm": 0.6959022283554077, "learning_rate": 4.905688614163784e-05, "loss": 0.559, "num_input_tokens_seen": 6926432, "step": 12015 }, { "epoch": 1.7902889484658921, "grad_norm": 1.456417441368103, "learning_rate": 4.905511714227322e-05, "loss": 0.7022, "num_input_tokens_seen": 6929440, "step": 12020 }, { "epoch": 1.7910336610068514, "grad_norm": 1.874125361442566, "learning_rate": 4.905334651736159e-05, "loss": 0.8476, "num_input_tokens_seen": 6932608, "step": 12025 }, { "epoch": 1.7917783735478106, "grad_norm": 0.7346754670143127, "learning_rate": 4.90515742670226e-05, "loss": 0.7652, "num_input_tokens_seen": 6935616, "step": 12030 }, { "epoch": 1.7925230860887698, "grad_norm": 0.6395867466926575, "learning_rate": 4.904980039137601e-05, "loss": 0.5321, "num_input_tokens_seen": 6938400, "step": 12035 }, { "epoch": 1.793267798629729, "grad_norm": 0.855855405330658, "learning_rate": 4.904802489054168e-05, "loss": 0.548, "num_input_tokens_seen": 6941344, "step": 12040 }, { "epoch": 1.7940125111706882, "grad_norm": 1.02219820022583, "learning_rate": 4.9046247764639606e-05, "loss": 0.6968, "num_input_tokens_seen": 6944032, "step": 12045 }, { "epoch": 1.7947572237116471, "grad_norm": 0.7965183258056641, "learning_rate": 4.9044469013789876e-05, "loss": 0.6976, "num_input_tokens_seen": 6947040, "step": 12050 }, { "epoch": 1.7955019362526063, "grad_norm": 0.9670293927192688, "learning_rate": 4.904268863811268e-05, "loss": 0.5981, "num_input_tokens_seen": 6949824, "step": 12055 }, { "epoch": 1.7962466487935655, "grad_norm": 1.0476549863815308, "learning_rate": 4.9040906637728344e-05, "loss": 0.7357, "num_input_tokens_seen": 6952768, "step": 12060 }, { "epoch": 1.7969913613345248, "grad_norm": 0.7944815158843994, "learning_rate": 4.903912301275728e-05, "loss": 0.6687, "num_input_tokens_seen": 6955616, "step": 12065 }, { "epoch": 1.797736073875484, "grad_norm": 0.9340440034866333, "learning_rate": 4.903733776332001e-05, "loss": 0.736, "num_input_tokens_seen": 6958528, "step": 12070 }, { "epoch": 1.7984807864164432, "grad_norm": 0.8438078761100769, "learning_rate": 4.903555088953719e-05, "loss": 0.5241, "num_input_tokens_seen": 6961472, "step": 12075 }, { "epoch": 1.7992254989574024, "grad_norm": 0.5128437280654907, "learning_rate": 4.9033762391529556e-05, "loss": 0.6973, "num_input_tokens_seen": 6964512, "step": 12080 }, { "epoch": 1.7999702114983616, "grad_norm": 1.0567933320999146, "learning_rate": 4.903197226941798e-05, "loss": 0.7785, "num_input_tokens_seen": 6967456, "step": 12085 }, { "epoch": 1.8007149240393208, "grad_norm": 0.6306166648864746, "learning_rate": 4.9030180523323425e-05, "loss": 0.8358, "num_input_tokens_seen": 6970272, "step": 12090 }, { "epoch": 1.80145963658028, "grad_norm": 0.8007144927978516, "learning_rate": 4.902838715336697e-05, "loss": 0.7042, "num_input_tokens_seen": 6973120, "step": 12095 }, { "epoch": 1.8022043491212392, "grad_norm": 0.9354895353317261, "learning_rate": 4.90265921596698e-05, "loss": 0.6241, "num_input_tokens_seen": 6975904, "step": 12100 }, { "epoch": 1.8029490616621984, "grad_norm": 0.7247110605239868, "learning_rate": 4.9024795542353216e-05, "loss": 0.6314, "num_input_tokens_seen": 6979072, "step": 12105 }, { "epoch": 1.8036937742031576, "grad_norm": 1.0390243530273438, "learning_rate": 4.902299730153863e-05, "loss": 0.6929, "num_input_tokens_seen": 6982112, "step": 12110 }, { "epoch": 1.8044384867441168, "grad_norm": 0.7935792207717896, "learning_rate": 4.9021197437347555e-05, "loss": 0.757, "num_input_tokens_seen": 6984960, "step": 12115 }, { "epoch": 1.805183199285076, "grad_norm": 0.9840261340141296, "learning_rate": 4.901939594990162e-05, "loss": 0.574, "num_input_tokens_seen": 6987808, "step": 12120 }, { "epoch": 1.8059279118260352, "grad_norm": 1.8469281196594238, "learning_rate": 4.901759283932257e-05, "loss": 0.6889, "num_input_tokens_seen": 6990496, "step": 12125 }, { "epoch": 1.8066726243669944, "grad_norm": 0.8314380645751953, "learning_rate": 4.9015788105732236e-05, "loss": 0.7502, "num_input_tokens_seen": 6993120, "step": 12130 }, { "epoch": 1.8074173369079536, "grad_norm": 1.08096444606781, "learning_rate": 4.9013981749252585e-05, "loss": 0.6717, "num_input_tokens_seen": 6995840, "step": 12135 }, { "epoch": 1.8081620494489128, "grad_norm": 0.9827772974967957, "learning_rate": 4.901217377000568e-05, "loss": 0.6701, "num_input_tokens_seen": 6998496, "step": 12140 }, { "epoch": 1.808906761989872, "grad_norm": 0.9525927305221558, "learning_rate": 4.90103641681137e-05, "loss": 0.5838, "num_input_tokens_seen": 7001376, "step": 12145 }, { "epoch": 1.8096514745308312, "grad_norm": 1.0750781297683716, "learning_rate": 4.900855294369893e-05, "loss": 0.906, "num_input_tokens_seen": 7004544, "step": 12150 }, { "epoch": 1.8103961870717904, "grad_norm": 0.8316721320152283, "learning_rate": 4.900674009688376e-05, "loss": 0.6362, "num_input_tokens_seen": 7007360, "step": 12155 }, { "epoch": 1.8111408996127496, "grad_norm": 0.8387295007705688, "learning_rate": 4.90049256277907e-05, "loss": 0.6373, "num_input_tokens_seen": 7010176, "step": 12160 }, { "epoch": 1.8118856121537088, "grad_norm": 1.3186537027359009, "learning_rate": 4.900310953654236e-05, "loss": 0.7333, "num_input_tokens_seen": 7013280, "step": 12165 }, { "epoch": 1.812630324694668, "grad_norm": 0.991982102394104, "learning_rate": 4.900129182326147e-05, "loss": 0.658, "num_input_tokens_seen": 7016032, "step": 12170 }, { "epoch": 1.8133750372356272, "grad_norm": 0.9542835354804993, "learning_rate": 4.899947248807086e-05, "loss": 0.5744, "num_input_tokens_seen": 7020128, "step": 12175 }, { "epoch": 1.8141197497765862, "grad_norm": 0.785233736038208, "learning_rate": 4.899765153109348e-05, "loss": 0.7962, "num_input_tokens_seen": 7023232, "step": 12180 }, { "epoch": 1.8148644623175454, "grad_norm": 0.5944175124168396, "learning_rate": 4.899582895245237e-05, "loss": 0.576, "num_input_tokens_seen": 7026048, "step": 12185 }, { "epoch": 1.8156091748585046, "grad_norm": 0.7538096904754639, "learning_rate": 4.89940047522707e-05, "loss": 0.7852, "num_input_tokens_seen": 7028768, "step": 12190 }, { "epoch": 1.8163538873994638, "grad_norm": 1.2138906717300415, "learning_rate": 4.899217893067174e-05, "loss": 0.6928, "num_input_tokens_seen": 7031712, "step": 12195 }, { "epoch": 1.817098599940423, "grad_norm": 0.6510927677154541, "learning_rate": 4.8990351487778875e-05, "loss": 0.7576, "num_input_tokens_seen": 7034720, "step": 12200 }, { "epoch": 1.8178433124813822, "grad_norm": 0.7567635178565979, "learning_rate": 4.89885224237156e-05, "loss": 0.6512, "num_input_tokens_seen": 7037536, "step": 12205 }, { "epoch": 1.8185880250223414, "grad_norm": 0.7902228832244873, "learning_rate": 4.89866917386055e-05, "loss": 0.6506, "num_input_tokens_seen": 7040480, "step": 12210 }, { "epoch": 1.8193327375633006, "grad_norm": 1.3750224113464355, "learning_rate": 4.89848594325723e-05, "loss": 0.6857, "num_input_tokens_seen": 7043552, "step": 12215 }, { "epoch": 1.8200774501042598, "grad_norm": 0.9141448736190796, "learning_rate": 4.898302550573981e-05, "loss": 0.7308, "num_input_tokens_seen": 7046368, "step": 12220 }, { "epoch": 1.8208221626452188, "grad_norm": 0.933928906917572, "learning_rate": 4.898118995823197e-05, "loss": 0.6789, "num_input_tokens_seen": 7049472, "step": 12225 }, { "epoch": 1.821566875186178, "grad_norm": 1.7984744310379028, "learning_rate": 4.8979352790172814e-05, "loss": 0.6391, "num_input_tokens_seen": 7052000, "step": 12230 }, { "epoch": 1.8223115877271372, "grad_norm": 0.8753399848937988, "learning_rate": 4.8977514001686485e-05, "loss": 0.7458, "num_input_tokens_seen": 7054880, "step": 12235 }, { "epoch": 1.8230563002680964, "grad_norm": 1.023695945739746, "learning_rate": 4.8975673592897244e-05, "loss": 0.6727, "num_input_tokens_seen": 7057824, "step": 12240 }, { "epoch": 1.8238010128090556, "grad_norm": 0.8770976662635803, "learning_rate": 4.897383156392947e-05, "loss": 0.5984, "num_input_tokens_seen": 7060608, "step": 12245 }, { "epoch": 1.8245457253500148, "grad_norm": 0.9507896304130554, "learning_rate": 4.897198791490762e-05, "loss": 0.6634, "num_input_tokens_seen": 7063872, "step": 12250 }, { "epoch": 1.825290437890974, "grad_norm": 0.7673934698104858, "learning_rate": 4.897014264595629e-05, "loss": 0.7343, "num_input_tokens_seen": 7066560, "step": 12255 }, { "epoch": 1.8260351504319332, "grad_norm": 0.6957747340202332, "learning_rate": 4.896829575720018e-05, "loss": 0.614, "num_input_tokens_seen": 7069408, "step": 12260 }, { "epoch": 1.8267798629728924, "grad_norm": 0.7886258959770203, "learning_rate": 4.8966447248764084e-05, "loss": 0.7009, "num_input_tokens_seen": 7072192, "step": 12265 }, { "epoch": 1.8275245755138516, "grad_norm": 0.6069473624229431, "learning_rate": 4.8964597120772926e-05, "loss": 0.6721, "num_input_tokens_seen": 7075200, "step": 12270 }, { "epoch": 1.8282692880548108, "grad_norm": 0.7995117902755737, "learning_rate": 4.8962745373351734e-05, "loss": 0.6328, "num_input_tokens_seen": 7077952, "step": 12275 }, { "epoch": 1.82901400059577, "grad_norm": 0.8794435262680054, "learning_rate": 4.8960892006625626e-05, "loss": 0.6663, "num_input_tokens_seen": 7080736, "step": 12280 }, { "epoch": 1.8297587131367292, "grad_norm": 0.5437980890274048, "learning_rate": 4.8959037020719854e-05, "loss": 0.6256, "num_input_tokens_seen": 7083680, "step": 12285 }, { "epoch": 1.8305034256776884, "grad_norm": 1.1379505395889282, "learning_rate": 4.895718041575978e-05, "loss": 0.7744, "num_input_tokens_seen": 7086464, "step": 12290 }, { "epoch": 1.8312481382186476, "grad_norm": 1.3698631525039673, "learning_rate": 4.895532219187085e-05, "loss": 0.6355, "num_input_tokens_seen": 7089568, "step": 12295 }, { "epoch": 1.8319928507596068, "grad_norm": 0.8803806304931641, "learning_rate": 4.895346234917865e-05, "loss": 0.7666, "num_input_tokens_seen": 7092640, "step": 12300 }, { "epoch": 1.832737563300566, "grad_norm": 1.0385475158691406, "learning_rate": 4.8951600887808836e-05, "loss": 0.5524, "num_input_tokens_seen": 7095392, "step": 12305 }, { "epoch": 1.8334822758415252, "grad_norm": 0.7473330497741699, "learning_rate": 4.894973780788722e-05, "loss": 0.6247, "num_input_tokens_seen": 7098304, "step": 12310 }, { "epoch": 1.8342269883824844, "grad_norm": 1.171537160873413, "learning_rate": 4.89478731095397e-05, "loss": 0.7287, "num_input_tokens_seen": 7101056, "step": 12315 }, { "epoch": 1.8349717009234436, "grad_norm": 0.799190104007721, "learning_rate": 4.894600679289228e-05, "loss": 0.6707, "num_input_tokens_seen": 7104064, "step": 12320 }, { "epoch": 1.8357164134644028, "grad_norm": 0.8750429153442383, "learning_rate": 4.8944138858071076e-05, "loss": 0.5293, "num_input_tokens_seen": 7107104, "step": 12325 }, { "epoch": 1.836461126005362, "grad_norm": 1.1509618759155273, "learning_rate": 4.894226930520232e-05, "loss": 0.7067, "num_input_tokens_seen": 7110080, "step": 12330 }, { "epoch": 1.8372058385463212, "grad_norm": 1.4221186637878418, "learning_rate": 4.894039813441235e-05, "loss": 0.495, "num_input_tokens_seen": 7112992, "step": 12335 }, { "epoch": 1.8379505510872804, "grad_norm": 1.232489824295044, "learning_rate": 4.89385253458276e-05, "loss": 0.8325, "num_input_tokens_seen": 7115968, "step": 12340 }, { "epoch": 1.8386952636282397, "grad_norm": 0.775852382183075, "learning_rate": 4.8936650939574636e-05, "loss": 0.6274, "num_input_tokens_seen": 7119040, "step": 12345 }, { "epoch": 1.8394399761691989, "grad_norm": 0.9934135675430298, "learning_rate": 4.893477491578013e-05, "loss": 0.6665, "num_input_tokens_seen": 7122016, "step": 12350 }, { "epoch": 1.8401846887101578, "grad_norm": 1.140684723854065, "learning_rate": 4.893289727457083e-05, "loss": 0.6335, "num_input_tokens_seen": 7124832, "step": 12355 }, { "epoch": 1.840929401251117, "grad_norm": 0.8687389492988586, "learning_rate": 4.893101801607365e-05, "loss": 0.6112, "num_input_tokens_seen": 7127808, "step": 12360 }, { "epoch": 1.8416741137920762, "grad_norm": 1.4677799940109253, "learning_rate": 4.892913714041556e-05, "loss": 0.6994, "num_input_tokens_seen": 7130880, "step": 12365 }, { "epoch": 1.8424188263330354, "grad_norm": 1.0618069171905518, "learning_rate": 4.892725464772368e-05, "loss": 0.6891, "num_input_tokens_seen": 7133856, "step": 12370 }, { "epoch": 1.8431635388739946, "grad_norm": 0.786217451095581, "learning_rate": 4.8925370538125204e-05, "loss": 0.6619, "num_input_tokens_seen": 7136768, "step": 12375 }, { "epoch": 1.8439082514149538, "grad_norm": 0.7768910527229309, "learning_rate": 4.892348481174747e-05, "loss": 0.6816, "num_input_tokens_seen": 7139680, "step": 12380 }, { "epoch": 1.844652963955913, "grad_norm": 0.7442879676818848, "learning_rate": 4.8921597468717887e-05, "loss": 0.6635, "num_input_tokens_seen": 7142560, "step": 12385 }, { "epoch": 1.8453976764968723, "grad_norm": 0.4910910725593567, "learning_rate": 4.891970850916401e-05, "loss": 0.5454, "num_input_tokens_seen": 7145248, "step": 12390 }, { "epoch": 1.8461423890378312, "grad_norm": 0.752382755279541, "learning_rate": 4.891781793321348e-05, "loss": 0.7346, "num_input_tokens_seen": 7148064, "step": 12395 }, { "epoch": 1.8468871015787904, "grad_norm": 0.6780164837837219, "learning_rate": 4.8915925740994064e-05, "loss": 0.6612, "num_input_tokens_seen": 7151136, "step": 12400 }, { "epoch": 1.8476318141197496, "grad_norm": 0.5228497982025146, "learning_rate": 4.8914031932633613e-05, "loss": 0.6098, "num_input_tokens_seen": 7154016, "step": 12405 }, { "epoch": 1.8483765266607088, "grad_norm": 0.4482893645763397, "learning_rate": 4.891213650826012e-05, "loss": 0.5452, "num_input_tokens_seen": 7156960, "step": 12410 }, { "epoch": 1.849121239201668, "grad_norm": 0.8488771915435791, "learning_rate": 4.891023946800165e-05, "loss": 0.6975, "num_input_tokens_seen": 7159872, "step": 12415 }, { "epoch": 1.8498659517426272, "grad_norm": 0.9801154732704163, "learning_rate": 4.890834081198642e-05, "loss": 0.5253, "num_input_tokens_seen": 7162880, "step": 12420 }, { "epoch": 1.8506106642835864, "grad_norm": 0.6290724873542786, "learning_rate": 4.890644054034271e-05, "loss": 0.7161, "num_input_tokens_seen": 7165824, "step": 12425 }, { "epoch": 1.8513553768245457, "grad_norm": 2.007856607437134, "learning_rate": 4.890453865319896e-05, "loss": 0.6534, "num_input_tokens_seen": 7168544, "step": 12430 }, { "epoch": 1.8521000893655049, "grad_norm": 1.6369421482086182, "learning_rate": 4.890263515068367e-05, "loss": 0.706, "num_input_tokens_seen": 7171200, "step": 12435 }, { "epoch": 1.852844801906464, "grad_norm": 0.6665361523628235, "learning_rate": 4.890073003292547e-05, "loss": 0.6648, "num_input_tokens_seen": 7173952, "step": 12440 }, { "epoch": 1.8535895144474233, "grad_norm": 1.0400820970535278, "learning_rate": 4.8898823300053124e-05, "loss": 0.7896, "num_input_tokens_seen": 7176704, "step": 12445 }, { "epoch": 1.8543342269883825, "grad_norm": 0.8753165602684021, "learning_rate": 4.889691495219545e-05, "loss": 0.6459, "num_input_tokens_seen": 7179552, "step": 12450 }, { "epoch": 1.8550789395293417, "grad_norm": 0.609514057636261, "learning_rate": 4.889500498948143e-05, "loss": 0.6855, "num_input_tokens_seen": 7182240, "step": 12455 }, { "epoch": 1.8558236520703009, "grad_norm": 0.7937207818031311, "learning_rate": 4.8893093412040114e-05, "loss": 0.7496, "num_input_tokens_seen": 7184960, "step": 12460 }, { "epoch": 1.85656836461126, "grad_norm": 0.8125370740890503, "learning_rate": 4.8891180220000696e-05, "loss": 0.6549, "num_input_tokens_seen": 7187808, "step": 12465 }, { "epoch": 1.8573130771522193, "grad_norm": 0.8617000579833984, "learning_rate": 4.8889265413492446e-05, "loss": 0.6359, "num_input_tokens_seen": 7190592, "step": 12470 }, { "epoch": 1.8580577896931785, "grad_norm": 1.675839900970459, "learning_rate": 4.888734899264477e-05, "loss": 0.7245, "num_input_tokens_seen": 7193408, "step": 12475 }, { "epoch": 1.8588025022341377, "grad_norm": 0.6880490779876709, "learning_rate": 4.888543095758717e-05, "loss": 0.9183, "num_input_tokens_seen": 7196384, "step": 12480 }, { "epoch": 1.8595472147750969, "grad_norm": 0.9549381732940674, "learning_rate": 4.888351130844926e-05, "loss": 0.6707, "num_input_tokens_seen": 7199168, "step": 12485 }, { "epoch": 1.860291927316056, "grad_norm": 0.8970692157745361, "learning_rate": 4.8881590045360744e-05, "loss": 0.7659, "num_input_tokens_seen": 7202208, "step": 12490 }, { "epoch": 1.8610366398570153, "grad_norm": 0.5502912998199463, "learning_rate": 4.8879667168451484e-05, "loss": 0.665, "num_input_tokens_seen": 7204864, "step": 12495 }, { "epoch": 1.8617813523979745, "grad_norm": 2.285069227218628, "learning_rate": 4.88777426778514e-05, "loss": 0.5994, "num_input_tokens_seen": 7207616, "step": 12500 }, { "epoch": 1.8625260649389337, "grad_norm": 1.1576145887374878, "learning_rate": 4.8875816573690544e-05, "loss": 0.6626, "num_input_tokens_seen": 7210496, "step": 12505 }, { "epoch": 1.863270777479893, "grad_norm": 1.9255776405334473, "learning_rate": 4.887388885609907e-05, "loss": 0.6587, "num_input_tokens_seen": 7213280, "step": 12510 }, { "epoch": 1.864015490020852, "grad_norm": 1.1964778900146484, "learning_rate": 4.887195952520726e-05, "loss": 0.752, "num_input_tokens_seen": 7216256, "step": 12515 }, { "epoch": 1.8647602025618113, "grad_norm": 1.084972858428955, "learning_rate": 4.887002858114548e-05, "loss": 0.6577, "num_input_tokens_seen": 7219040, "step": 12520 }, { "epoch": 1.8655049151027703, "grad_norm": 3.190876007080078, "learning_rate": 4.886809602404422e-05, "loss": 0.7554, "num_input_tokens_seen": 7221824, "step": 12525 }, { "epoch": 1.8662496276437295, "grad_norm": 0.8730413317680359, "learning_rate": 4.8866161854034064e-05, "loss": 0.7264, "num_input_tokens_seen": 7224832, "step": 12530 }, { "epoch": 1.8669943401846887, "grad_norm": 4.077584743499756, "learning_rate": 4.886422607124572e-05, "loss": 0.684, "num_input_tokens_seen": 7227840, "step": 12535 }, { "epoch": 1.8677390527256479, "grad_norm": 1.7161980867385864, "learning_rate": 4.886228867581002e-05, "loss": 0.6792, "num_input_tokens_seen": 7231040, "step": 12540 }, { "epoch": 1.868483765266607, "grad_norm": 1.045745611190796, "learning_rate": 4.886034966785785e-05, "loss": 0.9234, "num_input_tokens_seen": 7233888, "step": 12545 }, { "epoch": 1.8692284778075663, "grad_norm": 1.3986718654632568, "learning_rate": 4.8858409047520274e-05, "loss": 0.8704, "num_input_tokens_seen": 7236480, "step": 12550 }, { "epoch": 1.8699731903485255, "grad_norm": 0.8098355531692505, "learning_rate": 4.88564668149284e-05, "loss": 0.6307, "num_input_tokens_seen": 7239456, "step": 12555 }, { "epoch": 1.8707179028894847, "grad_norm": 0.6909469962120056, "learning_rate": 4.88545229702135e-05, "loss": 0.578, "num_input_tokens_seen": 7242848, "step": 12560 }, { "epoch": 1.871462615430444, "grad_norm": 0.7150005102157593, "learning_rate": 4.8852577513506925e-05, "loss": 0.609, "num_input_tokens_seen": 7245664, "step": 12565 }, { "epoch": 1.8722073279714029, "grad_norm": 0.7299801707267761, "learning_rate": 4.885063044494014e-05, "loss": 0.4935, "num_input_tokens_seen": 7248224, "step": 12570 }, { "epoch": 1.872952040512362, "grad_norm": 1.157906174659729, "learning_rate": 4.884868176464471e-05, "loss": 0.6424, "num_input_tokens_seen": 7251008, "step": 12575 }, { "epoch": 1.8736967530533213, "grad_norm": 0.7991150617599487, "learning_rate": 4.8846731472752336e-05, "loss": 0.6687, "num_input_tokens_seen": 7253984, "step": 12580 }, { "epoch": 1.8744414655942805, "grad_norm": 1.0444709062576294, "learning_rate": 4.8844779569394805e-05, "loss": 0.7556, "num_input_tokens_seen": 7256736, "step": 12585 }, { "epoch": 1.8751861781352397, "grad_norm": 0.9052796959877014, "learning_rate": 4.884282605470401e-05, "loss": 0.648, "num_input_tokens_seen": 7259424, "step": 12590 }, { "epoch": 1.875930890676199, "grad_norm": 0.845235288143158, "learning_rate": 4.8840870928811966e-05, "loss": 0.7722, "num_input_tokens_seen": 7262240, "step": 12595 }, { "epoch": 1.876675603217158, "grad_norm": 1.04128098487854, "learning_rate": 4.8838914191850804e-05, "loss": 0.5703, "num_input_tokens_seen": 7264736, "step": 12600 }, { "epoch": 1.8774203157581173, "grad_norm": 1.483066201210022, "learning_rate": 4.883695584395274e-05, "loss": 0.627, "num_input_tokens_seen": 7267488, "step": 12605 }, { "epoch": 1.8781650282990765, "grad_norm": 0.8912935256958008, "learning_rate": 4.883499588525011e-05, "loss": 0.5572, "num_input_tokens_seen": 7270240, "step": 12610 }, { "epoch": 1.8789097408400357, "grad_norm": 1.2824121713638306, "learning_rate": 4.883303431587536e-05, "loss": 0.698, "num_input_tokens_seen": 7273216, "step": 12615 }, { "epoch": 1.879654453380995, "grad_norm": 0.786119818687439, "learning_rate": 4.883107113596106e-05, "loss": 0.6784, "num_input_tokens_seen": 7275808, "step": 12620 }, { "epoch": 1.880399165921954, "grad_norm": 0.6426825523376465, "learning_rate": 4.882910634563985e-05, "loss": 0.718, "num_input_tokens_seen": 7278688, "step": 12625 }, { "epoch": 1.8811438784629133, "grad_norm": 0.7964416146278381, "learning_rate": 4.882713994504453e-05, "loss": 0.7048, "num_input_tokens_seen": 7281984, "step": 12630 }, { "epoch": 1.8818885910038725, "grad_norm": 1.212577223777771, "learning_rate": 4.882517193430796e-05, "loss": 0.601, "num_input_tokens_seen": 7284992, "step": 12635 }, { "epoch": 1.8826333035448317, "grad_norm": 0.6863723397254944, "learning_rate": 4.882320231356313e-05, "loss": 0.6365, "num_input_tokens_seen": 7287872, "step": 12640 }, { "epoch": 1.883378016085791, "grad_norm": 3.060812473297119, "learning_rate": 4.882123108294316e-05, "loss": 0.7457, "num_input_tokens_seen": 7290688, "step": 12645 }, { "epoch": 1.8841227286267501, "grad_norm": 1.0327399969100952, "learning_rate": 4.881925824258123e-05, "loss": 0.5593, "num_input_tokens_seen": 7293504, "step": 12650 }, { "epoch": 1.8848674411677093, "grad_norm": 1.432578444480896, "learning_rate": 4.881728379261068e-05, "loss": 0.5778, "num_input_tokens_seen": 7296384, "step": 12655 }, { "epoch": 1.8856121537086685, "grad_norm": 1.085883617401123, "learning_rate": 4.881530773316492e-05, "loss": 0.685, "num_input_tokens_seen": 7299520, "step": 12660 }, { "epoch": 1.8863568662496277, "grad_norm": 0.9388927221298218, "learning_rate": 4.881333006437749e-05, "loss": 0.6946, "num_input_tokens_seen": 7302240, "step": 12665 }, { "epoch": 1.887101578790587, "grad_norm": 1.1621413230895996, "learning_rate": 4.881135078638203e-05, "loss": 0.6581, "num_input_tokens_seen": 7305312, "step": 12670 }, { "epoch": 1.8878462913315461, "grad_norm": 1.4955976009368896, "learning_rate": 4.88093698993123e-05, "loss": 0.7379, "num_input_tokens_seen": 7308192, "step": 12675 }, { "epoch": 1.8885910038725053, "grad_norm": 1.5987688302993774, "learning_rate": 4.880738740330215e-05, "loss": 0.5132, "num_input_tokens_seen": 7310944, "step": 12680 }, { "epoch": 1.8893357164134645, "grad_norm": 1.3044761419296265, "learning_rate": 4.8805403298485554e-05, "loss": 0.7966, "num_input_tokens_seen": 7313856, "step": 12685 }, { "epoch": 1.8900804289544237, "grad_norm": 1.0736885070800781, "learning_rate": 4.8803417584996584e-05, "loss": 0.7889, "num_input_tokens_seen": 7316608, "step": 12690 }, { "epoch": 1.890825141495383, "grad_norm": 1.1238775253295898, "learning_rate": 4.880143026296944e-05, "loss": 0.6655, "num_input_tokens_seen": 7319648, "step": 12695 }, { "epoch": 1.891569854036342, "grad_norm": 1.070114254951477, "learning_rate": 4.87994413325384e-05, "loss": 0.7201, "num_input_tokens_seen": 7322368, "step": 12700 }, { "epoch": 1.8923145665773011, "grad_norm": 1.0520869493484497, "learning_rate": 4.879745079383789e-05, "loss": 0.7234, "num_input_tokens_seen": 7325376, "step": 12705 }, { "epoch": 1.8930592791182603, "grad_norm": 1.0111852884292603, "learning_rate": 4.879545864700239e-05, "loss": 0.7197, "num_input_tokens_seen": 7328096, "step": 12710 }, { "epoch": 1.8938039916592195, "grad_norm": 0.8509277701377869, "learning_rate": 4.879346489216655e-05, "loss": 0.5726, "num_input_tokens_seen": 7330816, "step": 12715 }, { "epoch": 1.8945487042001787, "grad_norm": 0.9387149214744568, "learning_rate": 4.8791469529465087e-05, "loss": 0.6222, "num_input_tokens_seen": 7333504, "step": 12720 }, { "epoch": 1.895293416741138, "grad_norm": 1.5520048141479492, "learning_rate": 4.878947255903284e-05, "loss": 0.7494, "num_input_tokens_seen": 7336480, "step": 12725 }, { "epoch": 1.8960381292820971, "grad_norm": 0.689396321773529, "learning_rate": 4.878747398100477e-05, "loss": 0.7391, "num_input_tokens_seen": 7339648, "step": 12730 }, { "epoch": 1.8967828418230563, "grad_norm": 0.6117677092552185, "learning_rate": 4.878547379551591e-05, "loss": 0.6228, "num_input_tokens_seen": 7342592, "step": 12735 }, { "epoch": 1.8975275543640155, "grad_norm": 1.113608956336975, "learning_rate": 4.8783472002701434e-05, "loss": 0.6298, "num_input_tokens_seen": 7345536, "step": 12740 }, { "epoch": 1.8982722669049745, "grad_norm": 0.8582338094711304, "learning_rate": 4.8781468602696623e-05, "loss": 0.6215, "num_input_tokens_seen": 7348160, "step": 12745 }, { "epoch": 1.8990169794459337, "grad_norm": 1.1743525266647339, "learning_rate": 4.8779463595636857e-05, "loss": 0.6134, "num_input_tokens_seen": 7351136, "step": 12750 }, { "epoch": 1.899761691986893, "grad_norm": 0.7262619137763977, "learning_rate": 4.877745698165761e-05, "loss": 0.7883, "num_input_tokens_seen": 7354272, "step": 12755 }, { "epoch": 1.9005064045278521, "grad_norm": 1.308918833732605, "learning_rate": 4.87754487608945e-05, "loss": 0.856, "num_input_tokens_seen": 7357056, "step": 12760 }, { "epoch": 1.9012511170688113, "grad_norm": 0.8512782454490662, "learning_rate": 4.8773438933483224e-05, "loss": 0.7011, "num_input_tokens_seen": 7359904, "step": 12765 }, { "epoch": 1.9019958296097705, "grad_norm": 0.8941280245780945, "learning_rate": 4.8771427499559594e-05, "loss": 0.8142, "num_input_tokens_seen": 7362720, "step": 12770 }, { "epoch": 1.9027405421507297, "grad_norm": 0.9544683694839478, "learning_rate": 4.8769414459259556e-05, "loss": 0.7328, "num_input_tokens_seen": 7365696, "step": 12775 }, { "epoch": 1.903485254691689, "grad_norm": 0.6544963717460632, "learning_rate": 4.8767399812719115e-05, "loss": 0.5328, "num_input_tokens_seen": 7368544, "step": 12780 }, { "epoch": 1.9042299672326481, "grad_norm": 1.8506444692611694, "learning_rate": 4.876538356007443e-05, "loss": 0.727, "num_input_tokens_seen": 7371520, "step": 12785 }, { "epoch": 1.9049746797736073, "grad_norm": 0.6842294931411743, "learning_rate": 4.876336570146175e-05, "loss": 0.6126, "num_input_tokens_seen": 7374304, "step": 12790 }, { "epoch": 1.9057193923145666, "grad_norm": 0.8828718066215515, "learning_rate": 4.876134623701743e-05, "loss": 0.8151, "num_input_tokens_seen": 7377280, "step": 12795 }, { "epoch": 1.9064641048555258, "grad_norm": 0.5897439122200012, "learning_rate": 4.875932516687793e-05, "loss": 0.5762, "num_input_tokens_seen": 7380192, "step": 12800 }, { "epoch": 1.907208817396485, "grad_norm": 0.8834773302078247, "learning_rate": 4.8757302491179844e-05, "loss": 0.7251, "num_input_tokens_seen": 7382976, "step": 12805 }, { "epoch": 1.9079535299374442, "grad_norm": 0.9112144708633423, "learning_rate": 4.8755278210059845e-05, "loss": 0.5357, "num_input_tokens_seen": 7385696, "step": 12810 }, { "epoch": 1.9086982424784034, "grad_norm": 0.8962788581848145, "learning_rate": 4.8753252323654726e-05, "loss": 0.556, "num_input_tokens_seen": 7388640, "step": 12815 }, { "epoch": 1.9094429550193626, "grad_norm": 1.0963886976242065, "learning_rate": 4.8751224832101383e-05, "loss": 0.7846, "num_input_tokens_seen": 7391488, "step": 12820 }, { "epoch": 1.9101876675603218, "grad_norm": 0.9835218787193298, "learning_rate": 4.874919573553683e-05, "loss": 0.6705, "num_input_tokens_seen": 7394080, "step": 12825 }, { "epoch": 1.910932380101281, "grad_norm": 1.1637005805969238, "learning_rate": 4.8747165034098196e-05, "loss": 0.7748, "num_input_tokens_seen": 7396672, "step": 12830 }, { "epoch": 1.9116770926422402, "grad_norm": 1.6735522747039795, "learning_rate": 4.8745132727922696e-05, "loss": 0.7142, "num_input_tokens_seen": 7399488, "step": 12835 }, { "epoch": 1.9124218051831994, "grad_norm": 1.5475356578826904, "learning_rate": 4.874309881714766e-05, "loss": 0.6368, "num_input_tokens_seen": 7402496, "step": 12840 }, { "epoch": 1.9131665177241586, "grad_norm": 1.0097029209136963, "learning_rate": 4.874106330191055e-05, "loss": 0.7365, "num_input_tokens_seen": 7405216, "step": 12845 }, { "epoch": 1.9139112302651178, "grad_norm": 1.923002004623413, "learning_rate": 4.8739026182348894e-05, "loss": 0.7696, "num_input_tokens_seen": 7408288, "step": 12850 }, { "epoch": 1.914655942806077, "grad_norm": 1.1473151445388794, "learning_rate": 4.873698745860037e-05, "loss": 0.5189, "num_input_tokens_seen": 7411456, "step": 12855 }, { "epoch": 1.9154006553470362, "grad_norm": 1.3169468641281128, "learning_rate": 4.873494713080274e-05, "loss": 0.5725, "num_input_tokens_seen": 7414336, "step": 12860 }, { "epoch": 1.9161453678879954, "grad_norm": 0.7909854054450989, "learning_rate": 4.8732905199093884e-05, "loss": 0.6353, "num_input_tokens_seen": 7417152, "step": 12865 }, { "epoch": 1.9168900804289544, "grad_norm": 1.2116013765335083, "learning_rate": 4.873086166361178e-05, "loss": 0.5725, "num_input_tokens_seen": 7420096, "step": 12870 }, { "epoch": 1.9176347929699136, "grad_norm": 1.4624396562576294, "learning_rate": 4.872881652449453e-05, "loss": 0.6873, "num_input_tokens_seen": 7422976, "step": 12875 }, { "epoch": 1.9183795055108728, "grad_norm": 1.340835452079773, "learning_rate": 4.872676978188033e-05, "loss": 0.7682, "num_input_tokens_seen": 7426112, "step": 12880 }, { "epoch": 1.919124218051832, "grad_norm": 0.837962806224823, "learning_rate": 4.8724721435907504e-05, "loss": 0.5855, "num_input_tokens_seen": 7428960, "step": 12885 }, { "epoch": 1.9198689305927912, "grad_norm": 1.380199670791626, "learning_rate": 4.8722671486714457e-05, "loss": 0.6922, "num_input_tokens_seen": 7431616, "step": 12890 }, { "epoch": 1.9206136431337504, "grad_norm": 0.8269267678260803, "learning_rate": 4.8720619934439715e-05, "loss": 0.6286, "num_input_tokens_seen": 7434304, "step": 12895 }, { "epoch": 1.9213583556747096, "grad_norm": 0.6641804575920105, "learning_rate": 4.871856677922193e-05, "loss": 0.5641, "num_input_tokens_seen": 7437120, "step": 12900 }, { "epoch": 1.9221030682156688, "grad_norm": 1.183556318283081, "learning_rate": 4.8716512021199825e-05, "loss": 0.8156, "num_input_tokens_seen": 7440064, "step": 12905 }, { "epoch": 1.922847780756628, "grad_norm": 1.0545523166656494, "learning_rate": 4.871445566051226e-05, "loss": 0.5724, "num_input_tokens_seen": 7442880, "step": 12910 }, { "epoch": 1.923592493297587, "grad_norm": 1.1341582536697388, "learning_rate": 4.8712397697298207e-05, "loss": 0.5805, "num_input_tokens_seen": 7445792, "step": 12915 }, { "epoch": 1.9243372058385462, "grad_norm": 0.9171569347381592, "learning_rate": 4.871033813169672e-05, "loss": 0.6375, "num_input_tokens_seen": 7448960, "step": 12920 }, { "epoch": 1.9250819183795054, "grad_norm": 1.2777658700942993, "learning_rate": 4.870827696384698e-05, "loss": 0.7613, "num_input_tokens_seen": 7451840, "step": 12925 }, { "epoch": 1.9258266309204646, "grad_norm": 1.0542054176330566, "learning_rate": 4.870621419388828e-05, "loss": 0.7634, "num_input_tokens_seen": 7454560, "step": 12930 }, { "epoch": 1.9265713434614238, "grad_norm": 1.6280077695846558, "learning_rate": 4.870414982196e-05, "loss": 0.9322, "num_input_tokens_seen": 7457344, "step": 12935 }, { "epoch": 1.927316056002383, "grad_norm": 1.7141305208206177, "learning_rate": 4.870208384820165e-05, "loss": 0.8243, "num_input_tokens_seen": 7460128, "step": 12940 }, { "epoch": 1.9280607685433422, "grad_norm": 1.454372525215149, "learning_rate": 4.8700016272752844e-05, "loss": 0.5692, "num_input_tokens_seen": 7463072, "step": 12945 }, { "epoch": 1.9288054810843014, "grad_norm": 1.7563071250915527, "learning_rate": 4.869794709575329e-05, "loss": 0.8275, "num_input_tokens_seen": 7466048, "step": 12950 }, { "epoch": 1.9295501936252606, "grad_norm": 1.4959068298339844, "learning_rate": 4.869587631734282e-05, "loss": 0.7504, "num_input_tokens_seen": 7469152, "step": 12955 }, { "epoch": 1.9302949061662198, "grad_norm": 1.0424010753631592, "learning_rate": 4.869380393766137e-05, "loss": 0.5435, "num_input_tokens_seen": 7471776, "step": 12960 }, { "epoch": 1.931039618707179, "grad_norm": 1.3296383619308472, "learning_rate": 4.8691729956848986e-05, "loss": 0.5739, "num_input_tokens_seen": 7474656, "step": 12965 }, { "epoch": 1.9317843312481382, "grad_norm": 1.864561915397644, "learning_rate": 4.868965437504581e-05, "loss": 0.6463, "num_input_tokens_seen": 7477760, "step": 12970 }, { "epoch": 1.9325290437890974, "grad_norm": 2.238969087600708, "learning_rate": 4.868757719239211e-05, "loss": 0.7572, "num_input_tokens_seen": 7480416, "step": 12975 }, { "epoch": 1.9332737563300566, "grad_norm": 1.080896258354187, "learning_rate": 4.8685498409028254e-05, "loss": 0.7319, "num_input_tokens_seen": 7483168, "step": 12980 }, { "epoch": 1.9340184688710158, "grad_norm": 0.9019543528556824, "learning_rate": 4.8683418025094704e-05, "loss": 0.7626, "num_input_tokens_seen": 7486144, "step": 12985 }, { "epoch": 1.934763181411975, "grad_norm": 1.1279723644256592, "learning_rate": 4.8681336040732055e-05, "loss": 0.7374, "num_input_tokens_seen": 7489024, "step": 12990 }, { "epoch": 1.9355078939529342, "grad_norm": 0.9001776576042175, "learning_rate": 4.8679252456081e-05, "loss": 0.6741, "num_input_tokens_seen": 7491648, "step": 12995 }, { "epoch": 1.9362526064938934, "grad_norm": 0.9395052790641785, "learning_rate": 4.8677167271282344e-05, "loss": 0.6416, "num_input_tokens_seen": 7494560, "step": 13000 }, { "epoch": 1.9369973190348526, "grad_norm": 0.7293421030044556, "learning_rate": 4.867508048647698e-05, "loss": 0.6761, "num_input_tokens_seen": 7497312, "step": 13005 }, { "epoch": 1.9377420315758118, "grad_norm": 1.4041630029678345, "learning_rate": 4.867299210180593e-05, "loss": 0.6891, "num_input_tokens_seen": 7500352, "step": 13010 }, { "epoch": 1.938486744116771, "grad_norm": 0.8357860445976257, "learning_rate": 4.867090211741033e-05, "loss": 0.7064, "num_input_tokens_seen": 7503552, "step": 13015 }, { "epoch": 1.9392314566577302, "grad_norm": 0.8293265700340271, "learning_rate": 4.86688105334314e-05, "loss": 0.7506, "num_input_tokens_seen": 7506528, "step": 13020 }, { "epoch": 1.9399761691986894, "grad_norm": 1.1378076076507568, "learning_rate": 4.866671735001048e-05, "loss": 0.541, "num_input_tokens_seen": 7509120, "step": 13025 }, { "epoch": 1.9407208817396486, "grad_norm": 1.2119495868682861, "learning_rate": 4.866462256728902e-05, "loss": 0.5798, "num_input_tokens_seen": 7512224, "step": 13030 }, { "epoch": 1.9414655942806078, "grad_norm": 1.8030266761779785, "learning_rate": 4.8662526185408595e-05, "loss": 0.7413, "num_input_tokens_seen": 7514880, "step": 13035 }, { "epoch": 1.942210306821567, "grad_norm": 1.0131571292877197, "learning_rate": 4.866042820451084e-05, "loss": 0.7985, "num_input_tokens_seen": 7518080, "step": 13040 }, { "epoch": 1.942955019362526, "grad_norm": 1.0365028381347656, "learning_rate": 4.865832862473756e-05, "loss": 0.6066, "num_input_tokens_seen": 7520992, "step": 13045 }, { "epoch": 1.9436997319034852, "grad_norm": 1.083527684211731, "learning_rate": 4.865622744623061e-05, "loss": 0.7592, "num_input_tokens_seen": 7523520, "step": 13050 }, { "epoch": 1.9444444444444444, "grad_norm": 1.0508257150650024, "learning_rate": 4.8654124669131984e-05, "loss": 0.6844, "num_input_tokens_seen": 7526112, "step": 13055 }, { "epoch": 1.9451891569854036, "grad_norm": 1.0757795572280884, "learning_rate": 4.865202029358379e-05, "loss": 0.5784, "num_input_tokens_seen": 7529120, "step": 13060 }, { "epoch": 1.9459338695263628, "grad_norm": 1.4290140867233276, "learning_rate": 4.864991431972822e-05, "loss": 0.5941, "num_input_tokens_seen": 7531904, "step": 13065 }, { "epoch": 1.946678582067322, "grad_norm": 1.1581029891967773, "learning_rate": 4.86478067477076e-05, "loss": 0.7548, "num_input_tokens_seen": 7534656, "step": 13070 }, { "epoch": 1.9474232946082812, "grad_norm": 1.0479730367660522, "learning_rate": 4.8645697577664347e-05, "loss": 0.5488, "num_input_tokens_seen": 7537088, "step": 13075 }, { "epoch": 1.9481680071492404, "grad_norm": 0.8042849898338318, "learning_rate": 4.8643586809740985e-05, "loss": 0.7888, "num_input_tokens_seen": 7539712, "step": 13080 }, { "epoch": 1.9489127196901996, "grad_norm": 0.9519088268280029, "learning_rate": 4.864147444408015e-05, "loss": 0.6333, "num_input_tokens_seen": 7542528, "step": 13085 }, { "epoch": 1.9496574322311586, "grad_norm": 2.010732412338257, "learning_rate": 4.86393604808246e-05, "loss": 0.7536, "num_input_tokens_seen": 7545440, "step": 13090 }, { "epoch": 1.9504021447721178, "grad_norm": 1.1185941696166992, "learning_rate": 4.8637244920117175e-05, "loss": 0.5311, "num_input_tokens_seen": 7548192, "step": 13095 }, { "epoch": 1.951146857313077, "grad_norm": 2.556950092315674, "learning_rate": 4.863512776210084e-05, "loss": 0.7892, "num_input_tokens_seen": 7550880, "step": 13100 }, { "epoch": 1.9518915698540362, "grad_norm": 1.4510605335235596, "learning_rate": 4.8633009006918665e-05, "loss": 0.6539, "num_input_tokens_seen": 7553888, "step": 13105 }, { "epoch": 1.9526362823949954, "grad_norm": 1.0996153354644775, "learning_rate": 4.863088865471382e-05, "loss": 0.5117, "num_input_tokens_seen": 7556896, "step": 13110 }, { "epoch": 1.9533809949359546, "grad_norm": 1.3696151971817017, "learning_rate": 4.8628766705629604e-05, "loss": 0.699, "num_input_tokens_seen": 7559648, "step": 13115 }, { "epoch": 1.9541257074769138, "grad_norm": 0.9275810122489929, "learning_rate": 4.862664315980939e-05, "loss": 0.6128, "num_input_tokens_seen": 7562560, "step": 13120 }, { "epoch": 1.954870420017873, "grad_norm": 1.4577604532241821, "learning_rate": 4.8624518017396706e-05, "loss": 0.7685, "num_input_tokens_seen": 7566080, "step": 13125 }, { "epoch": 1.9556151325588322, "grad_norm": 1.789207935333252, "learning_rate": 4.862239127853514e-05, "loss": 0.8422, "num_input_tokens_seen": 7568800, "step": 13130 }, { "epoch": 1.9563598450997914, "grad_norm": 0.7445714473724365, "learning_rate": 4.8620262943368405e-05, "loss": 0.6193, "num_input_tokens_seen": 7571776, "step": 13135 }, { "epoch": 1.9571045576407506, "grad_norm": 1.017209768295288, "learning_rate": 4.861813301204034e-05, "loss": 0.6243, "num_input_tokens_seen": 7574432, "step": 13140 }, { "epoch": 1.9578492701817098, "grad_norm": 1.4385144710540771, "learning_rate": 4.861600148469487e-05, "loss": 0.5555, "num_input_tokens_seen": 7577408, "step": 13145 }, { "epoch": 1.958593982722669, "grad_norm": 0.8662280440330505, "learning_rate": 4.861386836147603e-05, "loss": 0.7203, "num_input_tokens_seen": 7580256, "step": 13150 }, { "epoch": 1.9593386952636282, "grad_norm": 1.4879481792449951, "learning_rate": 4.861173364252798e-05, "loss": 0.5946, "num_input_tokens_seen": 7583200, "step": 13155 }, { "epoch": 1.9600834078045875, "grad_norm": 1.2941569089889526, "learning_rate": 4.860959732799497e-05, "loss": 0.7359, "num_input_tokens_seen": 7585792, "step": 13160 }, { "epoch": 1.9608281203455467, "grad_norm": 1.2527408599853516, "learning_rate": 4.860745941802136e-05, "loss": 0.6763, "num_input_tokens_seen": 7588416, "step": 13165 }, { "epoch": 1.9615728328865059, "grad_norm": 0.9903166890144348, "learning_rate": 4.860531991275162e-05, "loss": 0.4456, "num_input_tokens_seen": 7591392, "step": 13170 }, { "epoch": 1.962317545427465, "grad_norm": 1.116410732269287, "learning_rate": 4.860317881233033e-05, "loss": 0.6438, "num_input_tokens_seen": 7594496, "step": 13175 }, { "epoch": 1.9630622579684243, "grad_norm": 1.0278080701828003, "learning_rate": 4.8601036116902184e-05, "loss": 0.7361, "num_input_tokens_seen": 7597184, "step": 13180 }, { "epoch": 1.9638069705093835, "grad_norm": 1.1363037824630737, "learning_rate": 4.8598891826611974e-05, "loss": 0.5812, "num_input_tokens_seen": 7600096, "step": 13185 }, { "epoch": 1.9645516830503427, "grad_norm": 1.4359519481658936, "learning_rate": 4.85967459416046e-05, "loss": 0.6925, "num_input_tokens_seen": 7602976, "step": 13190 }, { "epoch": 1.9652963955913019, "grad_norm": 0.7593019604682922, "learning_rate": 4.859459846202507e-05, "loss": 0.5757, "num_input_tokens_seen": 7605600, "step": 13195 }, { "epoch": 1.966041108132261, "grad_norm": 0.7615728974342346, "learning_rate": 4.859244938801851e-05, "loss": 0.6069, "num_input_tokens_seen": 7608448, "step": 13200 }, { "epoch": 1.9667858206732203, "grad_norm": 1.4065446853637695, "learning_rate": 4.859029871973013e-05, "loss": 0.6389, "num_input_tokens_seen": 7611168, "step": 13205 }, { "epoch": 1.9675305332141795, "grad_norm": 1.1279083490371704, "learning_rate": 4.8588146457305284e-05, "loss": 0.6381, "num_input_tokens_seen": 7614144, "step": 13210 }, { "epoch": 1.9682752457551387, "grad_norm": 1.2425460815429688, "learning_rate": 4.85859926008894e-05, "loss": 0.6786, "num_input_tokens_seen": 7616896, "step": 13215 }, { "epoch": 1.9690199582960977, "grad_norm": 0.8471969366073608, "learning_rate": 4.858383715062803e-05, "loss": 0.687, "num_input_tokens_seen": 7619904, "step": 13220 }, { "epoch": 1.9697646708370569, "grad_norm": 0.9446980357170105, "learning_rate": 4.8581680106666827e-05, "loss": 0.6532, "num_input_tokens_seen": 7622720, "step": 13225 }, { "epoch": 1.970509383378016, "grad_norm": 0.8200129270553589, "learning_rate": 4.8579521469151555e-05, "loss": 0.837, "num_input_tokens_seen": 7625792, "step": 13230 }, { "epoch": 1.9712540959189753, "grad_norm": 0.9003373384475708, "learning_rate": 4.85773612382281e-05, "loss": 0.7435, "num_input_tokens_seen": 7628896, "step": 13235 }, { "epoch": 1.9719988084599345, "grad_norm": 1.3781650066375732, "learning_rate": 4.857519941404242e-05, "loss": 0.7677, "num_input_tokens_seen": 7631776, "step": 13240 }, { "epoch": 1.9727435210008937, "grad_norm": 0.5214788913726807, "learning_rate": 4.8573035996740626e-05, "loss": 0.6554, "num_input_tokens_seen": 7634688, "step": 13245 }, { "epoch": 1.9734882335418529, "grad_norm": 0.6980243921279907, "learning_rate": 4.8570870986468886e-05, "loss": 0.6617, "num_input_tokens_seen": 7637696, "step": 13250 }, { "epoch": 1.974232946082812, "grad_norm": 1.1284592151641846, "learning_rate": 4.856870438337353e-05, "loss": 0.6718, "num_input_tokens_seen": 7640384, "step": 13255 }, { "epoch": 1.974977658623771, "grad_norm": 0.7495482563972473, "learning_rate": 4.856653618760094e-05, "loss": 0.73, "num_input_tokens_seen": 7643360, "step": 13260 }, { "epoch": 1.9757223711647303, "grad_norm": 0.6695653796195984, "learning_rate": 4.8564366399297666e-05, "loss": 0.6434, "num_input_tokens_seen": 7646208, "step": 13265 }, { "epoch": 1.9764670837056895, "grad_norm": 0.5368191003799438, "learning_rate": 4.856219501861031e-05, "loss": 0.655, "num_input_tokens_seen": 7649024, "step": 13270 }, { "epoch": 1.9772117962466487, "grad_norm": 1.1438515186309814, "learning_rate": 4.8560022045685606e-05, "loss": 0.5658, "num_input_tokens_seen": 7652160, "step": 13275 }, { "epoch": 1.9779565087876079, "grad_norm": 0.8816258907318115, "learning_rate": 4.85578474806704e-05, "loss": 0.7768, "num_input_tokens_seen": 7655040, "step": 13280 }, { "epoch": 1.978701221328567, "grad_norm": 2.0167670249938965, "learning_rate": 4.8555671323711646e-05, "loss": 0.6906, "num_input_tokens_seen": 7657792, "step": 13285 }, { "epoch": 1.9794459338695263, "grad_norm": 0.6264031529426575, "learning_rate": 4.85534935749564e-05, "loss": 0.5647, "num_input_tokens_seen": 7660960, "step": 13290 }, { "epoch": 1.9801906464104855, "grad_norm": 1.046905517578125, "learning_rate": 4.8551314234551814e-05, "loss": 0.6887, "num_input_tokens_seen": 7664000, "step": 13295 }, { "epoch": 1.9809353589514447, "grad_norm": 0.8985147476196289, "learning_rate": 4.854913330264516e-05, "loss": 0.549, "num_input_tokens_seen": 7666912, "step": 13300 }, { "epoch": 1.9816800714924039, "grad_norm": 0.8988319039344788, "learning_rate": 4.8546950779383825e-05, "loss": 0.7194, "num_input_tokens_seen": 7669568, "step": 13305 }, { "epoch": 1.982424784033363, "grad_norm": 0.6600080728530884, "learning_rate": 4.854476666491529e-05, "loss": 0.7399, "num_input_tokens_seen": 7672608, "step": 13310 }, { "epoch": 1.9831694965743223, "grad_norm": 2.170163154602051, "learning_rate": 4.854258095938715e-05, "loss": 0.6878, "num_input_tokens_seen": 7675232, "step": 13315 }, { "epoch": 1.9839142091152815, "grad_norm": 1.1788870096206665, "learning_rate": 4.854039366294711e-05, "loss": 0.7111, "num_input_tokens_seen": 7678112, "step": 13320 }, { "epoch": 1.9846589216562407, "grad_norm": 0.8177742958068848, "learning_rate": 4.853820477574297e-05, "loss": 0.742, "num_input_tokens_seen": 7681152, "step": 13325 }, { "epoch": 1.9854036341972, "grad_norm": 1.04276442527771, "learning_rate": 4.853601429792265e-05, "loss": 0.593, "num_input_tokens_seen": 7683808, "step": 13330 }, { "epoch": 1.986148346738159, "grad_norm": 0.6978304982185364, "learning_rate": 4.853382222963418e-05, "loss": 0.6469, "num_input_tokens_seen": 7686880, "step": 13335 }, { "epoch": 1.9868930592791183, "grad_norm": 0.7808868885040283, "learning_rate": 4.853162857102568e-05, "loss": 0.5515, "num_input_tokens_seen": 7689568, "step": 13340 }, { "epoch": 1.9876377718200775, "grad_norm": 1.574529767036438, "learning_rate": 4.85294333222454e-05, "loss": 0.7373, "num_input_tokens_seen": 7692544, "step": 13345 }, { "epoch": 1.9883824843610367, "grad_norm": 0.9048187136650085, "learning_rate": 4.852723648344167e-05, "loss": 0.5704, "num_input_tokens_seen": 7695392, "step": 13350 }, { "epoch": 1.989127196901996, "grad_norm": 1.5613572597503662, "learning_rate": 4.852503805476296e-05, "loss": 0.6886, "num_input_tokens_seen": 7698336, "step": 13355 }, { "epoch": 1.9898719094429551, "grad_norm": 0.5695370435714722, "learning_rate": 4.852283803635782e-05, "loss": 0.6807, "num_input_tokens_seen": 7701216, "step": 13360 }, { "epoch": 1.9906166219839143, "grad_norm": 1.0237950086593628, "learning_rate": 4.852063642837493e-05, "loss": 0.6729, "num_input_tokens_seen": 7704384, "step": 13365 }, { "epoch": 1.9913613345248735, "grad_norm": 1.367244005203247, "learning_rate": 4.851843323096305e-05, "loss": 0.6027, "num_input_tokens_seen": 7707168, "step": 13370 }, { "epoch": 1.9921060470658327, "grad_norm": 0.84867924451828, "learning_rate": 4.851622844427107e-05, "loss": 0.6437, "num_input_tokens_seen": 7709888, "step": 13375 }, { "epoch": 1.992850759606792, "grad_norm": 1.0608686208724976, "learning_rate": 4.851402206844799e-05, "loss": 0.7616, "num_input_tokens_seen": 7712768, "step": 13380 }, { "epoch": 1.9935954721477511, "grad_norm": 0.8800463676452637, "learning_rate": 4.8511814103642894e-05, "loss": 0.8084, "num_input_tokens_seen": 7715680, "step": 13385 }, { "epoch": 1.99434018468871, "grad_norm": 2.0973081588745117, "learning_rate": 4.850960455000499e-05, "loss": 0.6609, "num_input_tokens_seen": 7718496, "step": 13390 }, { "epoch": 1.9950848972296693, "grad_norm": 0.8760191798210144, "learning_rate": 4.85073934076836e-05, "loss": 0.7482, "num_input_tokens_seen": 7721792, "step": 13395 }, { "epoch": 1.9958296097706285, "grad_norm": 0.7684197425842285, "learning_rate": 4.8505180676828144e-05, "loss": 0.6239, "num_input_tokens_seen": 7724800, "step": 13400 }, { "epoch": 1.9965743223115877, "grad_norm": 0.7069693207740784, "learning_rate": 4.850296635758813e-05, "loss": 0.5615, "num_input_tokens_seen": 7728096, "step": 13405 }, { "epoch": 1.997319034852547, "grad_norm": 1.050283432006836, "learning_rate": 4.850075045011321e-05, "loss": 0.6605, "num_input_tokens_seen": 7731072, "step": 13410 }, { "epoch": 1.9980637473935061, "grad_norm": 0.9694399237632751, "learning_rate": 4.8498532954553125e-05, "loss": 0.679, "num_input_tokens_seen": 7733952, "step": 13415 }, { "epoch": 1.9988084599344653, "grad_norm": 3.943843126296997, "learning_rate": 4.8496313871057716e-05, "loss": 0.7003, "num_input_tokens_seen": 7737152, "step": 13420 }, { "epoch": 1.9995531724754245, "grad_norm": 0.8842535018920898, "learning_rate": 4.8494093199776944e-05, "loss": 0.7187, "num_input_tokens_seen": 7740128, "step": 13425 }, { "epoch": 2.0, "eval_loss": 0.6616544127464294, "eval_runtime": 74.2051, "eval_samples_per_second": 40.213, "eval_steps_per_second": 10.053, "num_input_tokens_seen": 7741288, "step": 13428 }, { "epoch": 2.0002978850163835, "grad_norm": 0.577306866645813, "learning_rate": 4.849187094086088e-05, "loss": 0.669, "num_input_tokens_seen": 7742568, "step": 13430 }, { "epoch": 2.0010425975573427, "grad_norm": 1.166129469871521, "learning_rate": 4.848964709445969e-05, "loss": 0.7021, "num_input_tokens_seen": 7745416, "step": 13435 }, { "epoch": 2.001787310098302, "grad_norm": 0.7449812293052673, "learning_rate": 4.848742166072364e-05, "loss": 0.6305, "num_input_tokens_seen": 7748264, "step": 13440 }, { "epoch": 2.002532022639261, "grad_norm": 0.7695329189300537, "learning_rate": 4.8485194639803136e-05, "loss": 0.7045, "num_input_tokens_seen": 7751112, "step": 13445 }, { "epoch": 2.0032767351802203, "grad_norm": 1.1290587186813354, "learning_rate": 4.848296603184866e-05, "loss": 0.6557, "num_input_tokens_seen": 7754024, "step": 13450 }, { "epoch": 2.0040214477211795, "grad_norm": 1.0616337060928345, "learning_rate": 4.848073583701081e-05, "loss": 0.5899, "num_input_tokens_seen": 7757064, "step": 13455 }, { "epoch": 2.0047661602621387, "grad_norm": 0.6212621927261353, "learning_rate": 4.847850405544031e-05, "loss": 0.5772, "num_input_tokens_seen": 7759816, "step": 13460 }, { "epoch": 2.005510872803098, "grad_norm": 1.1464359760284424, "learning_rate": 4.847627068728795e-05, "loss": 0.6716, "num_input_tokens_seen": 7763016, "step": 13465 }, { "epoch": 2.006255585344057, "grad_norm": 0.89290452003479, "learning_rate": 4.847403573270467e-05, "loss": 0.6789, "num_input_tokens_seen": 7766024, "step": 13470 }, { "epoch": 2.0070002978850163, "grad_norm": 1.3285422325134277, "learning_rate": 4.847179919184149e-05, "loss": 0.7531, "num_input_tokens_seen": 7768712, "step": 13475 }, { "epoch": 2.0077450104259755, "grad_norm": 3.063594102859497, "learning_rate": 4.8469561064849555e-05, "loss": 0.9731, "num_input_tokens_seen": 7771304, "step": 13480 }, { "epoch": 2.0084897229669347, "grad_norm": 0.7158981561660767, "learning_rate": 4.84673213518801e-05, "loss": 0.7966, "num_input_tokens_seen": 7774024, "step": 13485 }, { "epoch": 2.009234435507894, "grad_norm": 0.7939945459365845, "learning_rate": 4.846508005308448e-05, "loss": 0.6238, "num_input_tokens_seen": 7776936, "step": 13490 }, { "epoch": 2.009979148048853, "grad_norm": 0.9883748292922974, "learning_rate": 4.846283716861415e-05, "loss": 0.7302, "num_input_tokens_seen": 7780040, "step": 13495 }, { "epoch": 2.0107238605898123, "grad_norm": 0.8441833257675171, "learning_rate": 4.8460592698620686e-05, "loss": 0.5736, "num_input_tokens_seen": 7782792, "step": 13500 }, { "epoch": 2.0114685731307715, "grad_norm": 0.7637821435928345, "learning_rate": 4.845834664325574e-05, "loss": 0.5442, "num_input_tokens_seen": 7785800, "step": 13505 }, { "epoch": 2.0122132856717307, "grad_norm": 0.7706959843635559, "learning_rate": 4.8456099002671104e-05, "loss": 0.7986, "num_input_tokens_seen": 7788392, "step": 13510 }, { "epoch": 2.01295799821269, "grad_norm": 0.7313231229782104, "learning_rate": 4.8453849777018675e-05, "loss": 0.6657, "num_input_tokens_seen": 7791432, "step": 13515 }, { "epoch": 2.013702710753649, "grad_norm": 0.6926540732383728, "learning_rate": 4.845159896645042e-05, "loss": 0.5902, "num_input_tokens_seen": 7794376, "step": 13520 }, { "epoch": 2.0144474232946084, "grad_norm": 0.8749009966850281, "learning_rate": 4.844934657111846e-05, "loss": 0.6935, "num_input_tokens_seen": 7797256, "step": 13525 }, { "epoch": 2.0151921358355676, "grad_norm": 2.2407162189483643, "learning_rate": 4.8447092591175e-05, "loss": 0.6023, "num_input_tokens_seen": 7799880, "step": 13530 }, { "epoch": 2.0159368483765268, "grad_norm": 1.484926462173462, "learning_rate": 4.844483702677235e-05, "loss": 0.6085, "num_input_tokens_seen": 7802536, "step": 13535 }, { "epoch": 2.016681560917486, "grad_norm": 1.045885682106018, "learning_rate": 4.8442579878062934e-05, "loss": 0.7277, "num_input_tokens_seen": 7805288, "step": 13540 }, { "epoch": 2.017426273458445, "grad_norm": 1.2096017599105835, "learning_rate": 4.844032114519928e-05, "loss": 0.7463, "num_input_tokens_seen": 7808200, "step": 13545 }, { "epoch": 2.0181709859994044, "grad_norm": 0.808951735496521, "learning_rate": 4.8438060828334014e-05, "loss": 0.6885, "num_input_tokens_seen": 7811144, "step": 13550 }, { "epoch": 2.0189156985403636, "grad_norm": 0.9714669585227966, "learning_rate": 4.84357989276199e-05, "loss": 0.7304, "num_input_tokens_seen": 7814280, "step": 13555 }, { "epoch": 2.0196604110813228, "grad_norm": 2.619675636291504, "learning_rate": 4.843353544320978e-05, "loss": 0.5297, "num_input_tokens_seen": 7817096, "step": 13560 }, { "epoch": 2.020405123622282, "grad_norm": 1.0919946432113647, "learning_rate": 4.84312703752566e-05, "loss": 0.6521, "num_input_tokens_seen": 7820008, "step": 13565 }, { "epoch": 2.021149836163241, "grad_norm": 1.6033943891525269, "learning_rate": 4.842900372391344e-05, "loss": 0.5853, "num_input_tokens_seen": 7822824, "step": 13570 }, { "epoch": 2.0218945487042004, "grad_norm": 0.8207486867904663, "learning_rate": 4.842673548933345e-05, "loss": 0.6365, "num_input_tokens_seen": 7825480, "step": 13575 }, { "epoch": 2.0226392612451596, "grad_norm": 1.6533262729644775, "learning_rate": 4.8424465671669935e-05, "loss": 0.744, "num_input_tokens_seen": 7828200, "step": 13580 }, { "epoch": 2.0233839737861183, "grad_norm": 1.0753110647201538, "learning_rate": 4.842219427107627e-05, "loss": 0.7299, "num_input_tokens_seen": 7831144, "step": 13585 }, { "epoch": 2.0241286863270775, "grad_norm": 1.1870884895324707, "learning_rate": 4.841992128770594e-05, "loss": 0.5679, "num_input_tokens_seen": 7833672, "step": 13590 }, { "epoch": 2.0248733988680367, "grad_norm": 0.9876298308372498, "learning_rate": 4.841764672171254e-05, "loss": 0.6039, "num_input_tokens_seen": 7836328, "step": 13595 }, { "epoch": 2.025618111408996, "grad_norm": 1.1970105171203613, "learning_rate": 4.841537057324979e-05, "loss": 0.5435, "num_input_tokens_seen": 7839080, "step": 13600 }, { "epoch": 2.026362823949955, "grad_norm": 1.0993438959121704, "learning_rate": 4.8413092842471496e-05, "loss": 0.5322, "num_input_tokens_seen": 7841768, "step": 13605 }, { "epoch": 2.0271075364909144, "grad_norm": 0.8499389290809631, "learning_rate": 4.841081352953158e-05, "loss": 0.649, "num_input_tokens_seen": 7844872, "step": 13610 }, { "epoch": 2.0278522490318736, "grad_norm": 0.8637983202934265, "learning_rate": 4.8408532634584063e-05, "loss": 0.5403, "num_input_tokens_seen": 7847624, "step": 13615 }, { "epoch": 2.0285969615728328, "grad_norm": 0.8681768774986267, "learning_rate": 4.840625015778308e-05, "loss": 0.7163, "num_input_tokens_seen": 7850600, "step": 13620 }, { "epoch": 2.029341674113792, "grad_norm": 0.8720305562019348, "learning_rate": 4.8403966099282886e-05, "loss": 0.702, "num_input_tokens_seen": 7853608, "step": 13625 }, { "epoch": 2.030086386654751, "grad_norm": 0.8581023216247559, "learning_rate": 4.840168045923781e-05, "loss": 0.7827, "num_input_tokens_seen": 7856872, "step": 13630 }, { "epoch": 2.0308310991957104, "grad_norm": 1.0469838380813599, "learning_rate": 4.8399393237802315e-05, "loss": 0.6031, "num_input_tokens_seen": 7860104, "step": 13635 }, { "epoch": 2.0315758117366696, "grad_norm": 1.3706692457199097, "learning_rate": 4.839710443513096e-05, "loss": 0.6036, "num_input_tokens_seen": 7863144, "step": 13640 }, { "epoch": 2.0323205242776288, "grad_norm": 0.8374891877174377, "learning_rate": 4.8394814051378414e-05, "loss": 0.616, "num_input_tokens_seen": 7866120, "step": 13645 }, { "epoch": 2.033065236818588, "grad_norm": 0.829896867275238, "learning_rate": 4.839252208669944e-05, "loss": 0.6903, "num_input_tokens_seen": 7869160, "step": 13650 }, { "epoch": 2.033809949359547, "grad_norm": 1.3012596368789673, "learning_rate": 4.839022854124894e-05, "loss": 0.5739, "num_input_tokens_seen": 7872136, "step": 13655 }, { "epoch": 2.0345546619005064, "grad_norm": 1.0816447734832764, "learning_rate": 4.838793341518189e-05, "loss": 0.4866, "num_input_tokens_seen": 7874888, "step": 13660 }, { "epoch": 2.0352993744414656, "grad_norm": 6.381442070007324, "learning_rate": 4.838563670865339e-05, "loss": 0.7034, "num_input_tokens_seen": 7877896, "step": 13665 }, { "epoch": 2.036044086982425, "grad_norm": 1.5318814516067505, "learning_rate": 4.838333842181864e-05, "loss": 0.7636, "num_input_tokens_seen": 7880648, "step": 13670 }, { "epoch": 2.036788799523384, "grad_norm": 0.9699036478996277, "learning_rate": 4.838103855483295e-05, "loss": 0.6321, "num_input_tokens_seen": 7883432, "step": 13675 }, { "epoch": 2.037533512064343, "grad_norm": 1.476456642150879, "learning_rate": 4.8378737107851736e-05, "loss": 0.5437, "num_input_tokens_seen": 7886280, "step": 13680 }, { "epoch": 2.0382782246053024, "grad_norm": 1.3352421522140503, "learning_rate": 4.837643408103051e-05, "loss": 0.6726, "num_input_tokens_seen": 7889128, "step": 13685 }, { "epoch": 2.0390229371462616, "grad_norm": 0.9837419390678406, "learning_rate": 4.837412947452492e-05, "loss": 0.5919, "num_input_tokens_seen": 7892200, "step": 13690 }, { "epoch": 2.039767649687221, "grad_norm": 0.9756308197975159, "learning_rate": 4.8371823288490694e-05, "loss": 0.6373, "num_input_tokens_seen": 7895112, "step": 13695 }, { "epoch": 2.04051236222818, "grad_norm": 1.1021418571472168, "learning_rate": 4.8369515523083664e-05, "loss": 0.6507, "num_input_tokens_seen": 7898056, "step": 13700 }, { "epoch": 2.041257074769139, "grad_norm": 1.1258974075317383, "learning_rate": 4.83672061784598e-05, "loss": 0.8578, "num_input_tokens_seen": 7900840, "step": 13705 }, { "epoch": 2.0420017873100984, "grad_norm": 0.948131263256073, "learning_rate": 4.836489525477513e-05, "loss": 0.671, "num_input_tokens_seen": 7903720, "step": 13710 }, { "epoch": 2.0427464998510576, "grad_norm": 0.9544811844825745, "learning_rate": 4.8362582752185844e-05, "loss": 0.7579, "num_input_tokens_seen": 7906984, "step": 13715 }, { "epoch": 2.043491212392017, "grad_norm": 1.6517795324325562, "learning_rate": 4.836026867084821e-05, "loss": 0.634, "num_input_tokens_seen": 7909928, "step": 13720 }, { "epoch": 2.044235924932976, "grad_norm": 1.7863339185714722, "learning_rate": 4.8357953010918585e-05, "loss": 0.7197, "num_input_tokens_seen": 7913064, "step": 13725 }, { "epoch": 2.044980637473935, "grad_norm": 1.8359090089797974, "learning_rate": 4.835563577255346e-05, "loss": 0.7411, "num_input_tokens_seen": 7915784, "step": 13730 }, { "epoch": 2.0457253500148944, "grad_norm": 1.3896764516830444, "learning_rate": 4.835331695590943e-05, "loss": 0.7609, "num_input_tokens_seen": 7918664, "step": 13735 }, { "epoch": 2.0464700625558536, "grad_norm": 1.4001537561416626, "learning_rate": 4.835099656114319e-05, "loss": 0.6546, "num_input_tokens_seen": 7921704, "step": 13740 }, { "epoch": 2.047214775096813, "grad_norm": 1.6283190250396729, "learning_rate": 4.834867458841154e-05, "loss": 0.5647, "num_input_tokens_seen": 7924136, "step": 13745 }, { "epoch": 2.047959487637772, "grad_norm": 1.2791849374771118, "learning_rate": 4.8346351037871386e-05, "loss": 0.686, "num_input_tokens_seen": 7927080, "step": 13750 }, { "epoch": 2.0487042001787312, "grad_norm": 3.011441707611084, "learning_rate": 4.8344025909679746e-05, "loss": 0.8306, "num_input_tokens_seen": 7930120, "step": 13755 }, { "epoch": 2.04944891271969, "grad_norm": 2.862130880355835, "learning_rate": 4.834169920399375e-05, "loss": 0.6857, "num_input_tokens_seen": 7932936, "step": 13760 }, { "epoch": 2.050193625260649, "grad_norm": 1.824846625328064, "learning_rate": 4.8339370920970614e-05, "loss": 0.6353, "num_input_tokens_seen": 7935720, "step": 13765 }, { "epoch": 2.0509383378016084, "grad_norm": 1.0018795728683472, "learning_rate": 4.8337041060767696e-05, "loss": 0.6805, "num_input_tokens_seen": 7938760, "step": 13770 }, { "epoch": 2.0516830503425676, "grad_norm": 1.1727620363235474, "learning_rate": 4.833470962354242e-05, "loss": 0.6031, "num_input_tokens_seen": 7941416, "step": 13775 }, { "epoch": 2.052427762883527, "grad_norm": 1.0772554874420166, "learning_rate": 4.8332376609452334e-05, "loss": 0.4957, "num_input_tokens_seen": 7945608, "step": 13780 }, { "epoch": 2.053172475424486, "grad_norm": 2.0754928588867188, "learning_rate": 4.83300420186551e-05, "loss": 0.809, "num_input_tokens_seen": 7948360, "step": 13785 }, { "epoch": 2.053917187965445, "grad_norm": 0.7920007109642029, "learning_rate": 4.832770585130849e-05, "loss": 0.6273, "num_input_tokens_seen": 7951240, "step": 13790 }, { "epoch": 2.0546619005064044, "grad_norm": 1.0977636575698853, "learning_rate": 4.8325368107570354e-05, "loss": 0.6207, "num_input_tokens_seen": 7954088, "step": 13795 }, { "epoch": 2.0554066130473636, "grad_norm": 1.3031587600708008, "learning_rate": 4.8323028787598666e-05, "loss": 0.6808, "num_input_tokens_seen": 7956968, "step": 13800 }, { "epoch": 2.056151325588323, "grad_norm": 1.2448338270187378, "learning_rate": 4.832068789155153e-05, "loss": 0.5172, "num_input_tokens_seen": 7959816, "step": 13805 }, { "epoch": 2.056896038129282, "grad_norm": 1.7081693410873413, "learning_rate": 4.831834541958712e-05, "loss": 0.5372, "num_input_tokens_seen": 7962792, "step": 13810 }, { "epoch": 2.057640750670241, "grad_norm": 1.1014649868011475, "learning_rate": 4.8316001371863726e-05, "loss": 0.4529, "num_input_tokens_seen": 7965608, "step": 13815 }, { "epoch": 2.0583854632112004, "grad_norm": 1.5557897090911865, "learning_rate": 4.831365574853977e-05, "loss": 0.7717, "num_input_tokens_seen": 7968424, "step": 13820 }, { "epoch": 2.0591301757521596, "grad_norm": 1.4473875761032104, "learning_rate": 4.831130854977373e-05, "loss": 0.5897, "num_input_tokens_seen": 7971496, "step": 13825 }, { "epoch": 2.059874888293119, "grad_norm": 1.9690330028533936, "learning_rate": 4.830895977572424e-05, "loss": 0.7273, "num_input_tokens_seen": 7974152, "step": 13830 }, { "epoch": 2.060619600834078, "grad_norm": 1.7620415687561035, "learning_rate": 4.830660942655001e-05, "loss": 0.6449, "num_input_tokens_seen": 7977352, "step": 13835 }, { "epoch": 2.0613643133750372, "grad_norm": 1.8362255096435547, "learning_rate": 4.8304257502409875e-05, "loss": 0.7719, "num_input_tokens_seen": 7979976, "step": 13840 }, { "epoch": 2.0621090259159964, "grad_norm": 1.689949631690979, "learning_rate": 4.830190400346277e-05, "loss": 0.5858, "num_input_tokens_seen": 7982920, "step": 13845 }, { "epoch": 2.0628537384569556, "grad_norm": 0.7373266220092773, "learning_rate": 4.829954892986773e-05, "loss": 0.7791, "num_input_tokens_seen": 7985544, "step": 13850 }, { "epoch": 2.063598450997915, "grad_norm": 1.1411583423614502, "learning_rate": 4.829719228178391e-05, "loss": 0.74, "num_input_tokens_seen": 7988488, "step": 13855 }, { "epoch": 2.064343163538874, "grad_norm": 1.1441256999969482, "learning_rate": 4.829483405937054e-05, "loss": 0.6432, "num_input_tokens_seen": 7991240, "step": 13860 }, { "epoch": 2.0650878760798332, "grad_norm": 0.9466385841369629, "learning_rate": 4.8292474262787e-05, "loss": 0.4415, "num_input_tokens_seen": 7994280, "step": 13865 }, { "epoch": 2.0658325886207924, "grad_norm": 1.2005963325500488, "learning_rate": 4.829011289219276e-05, "loss": 0.7079, "num_input_tokens_seen": 7997192, "step": 13870 }, { "epoch": 2.0665773011617516, "grad_norm": 1.8997694253921509, "learning_rate": 4.828774994774737e-05, "loss": 0.5728, "num_input_tokens_seen": 8000072, "step": 13875 }, { "epoch": 2.067322013702711, "grad_norm": 0.9921427369117737, "learning_rate": 4.828538542961052e-05, "loss": 0.7005, "num_input_tokens_seen": 8003048, "step": 13880 }, { "epoch": 2.06806672624367, "grad_norm": 1.0828204154968262, "learning_rate": 4.8283019337942e-05, "loss": 0.6041, "num_input_tokens_seen": 8005768, "step": 13885 }, { "epoch": 2.0688114387846293, "grad_norm": 0.9162933230400085, "learning_rate": 4.828065167290169e-05, "loss": 0.7176, "num_input_tokens_seen": 8008648, "step": 13890 }, { "epoch": 2.0695561513255885, "grad_norm": 0.6392986178398132, "learning_rate": 4.827828243464959e-05, "loss": 0.5816, "num_input_tokens_seen": 8011432, "step": 13895 }, { "epoch": 2.0703008638665477, "grad_norm": 0.9292396306991577, "learning_rate": 4.8275911623345816e-05, "loss": 0.7717, "num_input_tokens_seen": 8014696, "step": 13900 }, { "epoch": 2.071045576407507, "grad_norm": 0.5777992010116577, "learning_rate": 4.8273539239150555e-05, "loss": 0.5138, "num_input_tokens_seen": 8017576, "step": 13905 }, { "epoch": 2.071790288948466, "grad_norm": 0.6055952906608582, "learning_rate": 4.827116528222414e-05, "loss": 0.6736, "num_input_tokens_seen": 8020808, "step": 13910 }, { "epoch": 2.0725350014894253, "grad_norm": 1.19501793384552, "learning_rate": 4.8268789752726993e-05, "loss": 0.6095, "num_input_tokens_seen": 8023528, "step": 13915 }, { "epoch": 2.0732797140303845, "grad_norm": 0.9612341523170471, "learning_rate": 4.826641265081964e-05, "loss": 0.5528, "num_input_tokens_seen": 8026632, "step": 13920 }, { "epoch": 2.0740244265713437, "grad_norm": 0.6638344526290894, "learning_rate": 4.82640339766627e-05, "loss": 0.6113, "num_input_tokens_seen": 8029352, "step": 13925 }, { "epoch": 2.074769139112303, "grad_norm": 1.0898011922836304, "learning_rate": 4.8261653730416945e-05, "loss": 0.6441, "num_input_tokens_seen": 8032424, "step": 13930 }, { "epoch": 2.0755138516532616, "grad_norm": 1.1356024742126465, "learning_rate": 4.8259271912243196e-05, "loss": 0.747, "num_input_tokens_seen": 8035080, "step": 13935 }, { "epoch": 2.076258564194221, "grad_norm": 2.2029051780700684, "learning_rate": 4.8256888522302426e-05, "loss": 0.6894, "num_input_tokens_seen": 8038120, "step": 13940 }, { "epoch": 2.07700327673518, "grad_norm": 0.9918341636657715, "learning_rate": 4.825450356075568e-05, "loss": 0.7425, "num_input_tokens_seen": 8041160, "step": 13945 }, { "epoch": 2.0777479892761392, "grad_norm": 0.45526832342147827, "learning_rate": 4.825211702776412e-05, "loss": 0.5183, "num_input_tokens_seen": 8044104, "step": 13950 }, { "epoch": 2.0784927018170984, "grad_norm": 0.7267532348632812, "learning_rate": 4.824972892348904e-05, "loss": 0.5504, "num_input_tokens_seen": 8046728, "step": 13955 }, { "epoch": 2.0792374143580576, "grad_norm": 0.5507504343986511, "learning_rate": 4.8247339248091805e-05, "loss": 0.6852, "num_input_tokens_seen": 8049704, "step": 13960 }, { "epoch": 2.079982126899017, "grad_norm": 1.23472261428833, "learning_rate": 4.824494800173389e-05, "loss": 0.7166, "num_input_tokens_seen": 8052584, "step": 13965 }, { "epoch": 2.080726839439976, "grad_norm": 0.900697648525238, "learning_rate": 4.824255518457691e-05, "loss": 0.7116, "num_input_tokens_seen": 8055304, "step": 13970 }, { "epoch": 2.0814715519809353, "grad_norm": 0.9777503609657288, "learning_rate": 4.824016079678254e-05, "loss": 0.695, "num_input_tokens_seen": 8058120, "step": 13975 }, { "epoch": 2.0822162645218945, "grad_norm": 1.3005486726760864, "learning_rate": 4.823776483851259e-05, "loss": 0.6675, "num_input_tokens_seen": 8061224, "step": 13980 }, { "epoch": 2.0829609770628537, "grad_norm": 1.1349705457687378, "learning_rate": 4.8235367309928975e-05, "loss": 0.6969, "num_input_tokens_seen": 8064136, "step": 13985 }, { "epoch": 2.083705689603813, "grad_norm": 1.0882725715637207, "learning_rate": 4.82329682111937e-05, "loss": 0.7218, "num_input_tokens_seen": 8066792, "step": 13990 }, { "epoch": 2.084450402144772, "grad_norm": 0.5396671891212463, "learning_rate": 4.82305675424689e-05, "loss": 0.6222, "num_input_tokens_seen": 8070024, "step": 13995 }, { "epoch": 2.0851951146857313, "grad_norm": 1.1499531269073486, "learning_rate": 4.822816530391678e-05, "loss": 0.7028, "num_input_tokens_seen": 8072680, "step": 14000 }, { "epoch": 2.0859398272266905, "grad_norm": 0.9489894509315491, "learning_rate": 4.82257614956997e-05, "loss": 0.6089, "num_input_tokens_seen": 8075848, "step": 14005 }, { "epoch": 2.0866845397676497, "grad_norm": 1.154251217842102, "learning_rate": 4.8223356117980085e-05, "loss": 0.7352, "num_input_tokens_seen": 8079048, "step": 14010 }, { "epoch": 2.087429252308609, "grad_norm": 0.6907241940498352, "learning_rate": 4.822094917092048e-05, "loss": 0.6431, "num_input_tokens_seen": 8081832, "step": 14015 }, { "epoch": 2.088173964849568, "grad_norm": 1.0083929300308228, "learning_rate": 4.8218540654683544e-05, "loss": 0.837, "num_input_tokens_seen": 8084584, "step": 14020 }, { "epoch": 2.0889186773905273, "grad_norm": 1.335655927658081, "learning_rate": 4.821613056943203e-05, "loss": 0.7519, "num_input_tokens_seen": 8087464, "step": 14025 }, { "epoch": 2.0896633899314865, "grad_norm": 0.6268320679664612, "learning_rate": 4.821371891532879e-05, "loss": 0.6432, "num_input_tokens_seen": 8090376, "step": 14030 }, { "epoch": 2.0904081024724457, "grad_norm": 1.007781982421875, "learning_rate": 4.821130569253682e-05, "loss": 0.6422, "num_input_tokens_seen": 8093192, "step": 14035 }, { "epoch": 2.091152815013405, "grad_norm": 0.9302863478660583, "learning_rate": 4.8208890901219174e-05, "loss": 0.7596, "num_input_tokens_seen": 8096296, "step": 14040 }, { "epoch": 2.091897527554364, "grad_norm": 0.9403323531150818, "learning_rate": 4.820647454153905e-05, "loss": 0.7203, "num_input_tokens_seen": 8099144, "step": 14045 }, { "epoch": 2.0926422400953233, "grad_norm": 0.8657109141349792, "learning_rate": 4.820405661365972e-05, "loss": 0.6328, "num_input_tokens_seen": 8101960, "step": 14050 }, { "epoch": 2.0933869526362825, "grad_norm": 0.7495026588439941, "learning_rate": 4.8201637117744584e-05, "loss": 0.5378, "num_input_tokens_seen": 8104712, "step": 14055 }, { "epoch": 2.0941316651772417, "grad_norm": 2.008450746536255, "learning_rate": 4.819921605395714e-05, "loss": 0.5961, "num_input_tokens_seen": 8107368, "step": 14060 }, { "epoch": 2.094876377718201, "grad_norm": 0.6596180200576782, "learning_rate": 4.819679342246101e-05, "loss": 0.7377, "num_input_tokens_seen": 8110440, "step": 14065 }, { "epoch": 2.09562109025916, "grad_norm": 1.5330452919006348, "learning_rate": 4.819436922341988e-05, "loss": 0.7102, "num_input_tokens_seen": 8113448, "step": 14070 }, { "epoch": 2.0963658028001193, "grad_norm": 1.1948939561843872, "learning_rate": 4.819194345699758e-05, "loss": 0.4925, "num_input_tokens_seen": 8116104, "step": 14075 }, { "epoch": 2.0971105153410785, "grad_norm": 1.1014889478683472, "learning_rate": 4.818951612335803e-05, "loss": 0.6815, "num_input_tokens_seen": 8118920, "step": 14080 }, { "epoch": 2.0978552278820377, "grad_norm": 1.3169231414794922, "learning_rate": 4.8187087222665266e-05, "loss": 0.5829, "num_input_tokens_seen": 8121768, "step": 14085 }, { "epoch": 2.098599940422997, "grad_norm": 1.424428939819336, "learning_rate": 4.818465675508342e-05, "loss": 0.6726, "num_input_tokens_seen": 8124968, "step": 14090 }, { "epoch": 2.099344652963956, "grad_norm": 0.826285719871521, "learning_rate": 4.818222472077674e-05, "loss": 0.7304, "num_input_tokens_seen": 8127592, "step": 14095 }, { "epoch": 2.1000893655049153, "grad_norm": 1.3659745454788208, "learning_rate": 4.817979111990955e-05, "loss": 0.7225, "num_input_tokens_seen": 8130152, "step": 14100 }, { "epoch": 2.1008340780458745, "grad_norm": 0.8014612793922424, "learning_rate": 4.817735595264633e-05, "loss": 0.7979, "num_input_tokens_seen": 8132968, "step": 14105 }, { "epoch": 2.1015787905868333, "grad_norm": 2.1636695861816406, "learning_rate": 4.817491921915162e-05, "loss": 0.6171, "num_input_tokens_seen": 8135944, "step": 14110 }, { "epoch": 2.1023235031277925, "grad_norm": 0.7371875643730164, "learning_rate": 4.817248091959009e-05, "loss": 0.6959, "num_input_tokens_seen": 8138952, "step": 14115 }, { "epoch": 2.1030682156687517, "grad_norm": 4.273553848266602, "learning_rate": 4.817004105412652e-05, "loss": 0.7163, "num_input_tokens_seen": 8141800, "step": 14120 }, { "epoch": 2.103812928209711, "grad_norm": 3.4351484775543213, "learning_rate": 4.8167599622925776e-05, "loss": 0.7559, "num_input_tokens_seen": 8144552, "step": 14125 }, { "epoch": 2.10455764075067, "grad_norm": 1.686647891998291, "learning_rate": 4.816515662615284e-05, "loss": 0.6786, "num_input_tokens_seen": 8147336, "step": 14130 }, { "epoch": 2.1053023532916293, "grad_norm": 0.7953262329101562, "learning_rate": 4.8162712063972805e-05, "loss": 0.5532, "num_input_tokens_seen": 8150152, "step": 14135 }, { "epoch": 2.1060470658325885, "grad_norm": 1.1281346082687378, "learning_rate": 4.816026593655085e-05, "loss": 0.7331, "num_input_tokens_seen": 8153128, "step": 14140 }, { "epoch": 2.1067917783735477, "grad_norm": 1.450784683227539, "learning_rate": 4.81578182440523e-05, "loss": 0.6278, "num_input_tokens_seen": 8155848, "step": 14145 }, { "epoch": 2.107536490914507, "grad_norm": 0.6789346933364868, "learning_rate": 4.815536898664254e-05, "loss": 0.6466, "num_input_tokens_seen": 8158632, "step": 14150 }, { "epoch": 2.108281203455466, "grad_norm": 0.6021842956542969, "learning_rate": 4.815291816448709e-05, "loss": 0.5196, "num_input_tokens_seen": 8161480, "step": 14155 }, { "epoch": 2.1090259159964253, "grad_norm": 1.1166397333145142, "learning_rate": 4.815046577775156e-05, "loss": 0.6745, "num_input_tokens_seen": 8164424, "step": 14160 }, { "epoch": 2.1097706285373845, "grad_norm": 1.056289553642273, "learning_rate": 4.8148011826601676e-05, "loss": 0.7423, "num_input_tokens_seen": 8167464, "step": 14165 }, { "epoch": 2.1105153410783437, "grad_norm": 1.2285290956497192, "learning_rate": 4.814555631120327e-05, "loss": 0.7339, "num_input_tokens_seen": 8170472, "step": 14170 }, { "epoch": 2.111260053619303, "grad_norm": 0.7621760368347168, "learning_rate": 4.814309923172227e-05, "loss": 0.7571, "num_input_tokens_seen": 8173480, "step": 14175 }, { "epoch": 2.112004766160262, "grad_norm": 0.9024293422698975, "learning_rate": 4.8140640588324705e-05, "loss": 0.6249, "num_input_tokens_seen": 8176232, "step": 14180 }, { "epoch": 2.1127494787012213, "grad_norm": 1.148458480834961, "learning_rate": 4.8138180381176744e-05, "loss": 0.7146, "num_input_tokens_seen": 8179144, "step": 14185 }, { "epoch": 2.1134941912421805, "grad_norm": 1.6658176183700562, "learning_rate": 4.813571861044463e-05, "loss": 0.6909, "num_input_tokens_seen": 8182056, "step": 14190 }, { "epoch": 2.1142389037831397, "grad_norm": 0.7119945287704468, "learning_rate": 4.81332552762947e-05, "loss": 0.6871, "num_input_tokens_seen": 8185128, "step": 14195 }, { "epoch": 2.114983616324099, "grad_norm": 1.4934006929397583, "learning_rate": 4.813079037889344e-05, "loss": 0.5476, "num_input_tokens_seen": 8188168, "step": 14200 }, { "epoch": 2.115728328865058, "grad_norm": 0.9330924153327942, "learning_rate": 4.812832391840741e-05, "loss": 0.648, "num_input_tokens_seen": 8191112, "step": 14205 }, { "epoch": 2.1164730414060173, "grad_norm": 1.281327486038208, "learning_rate": 4.812585589500328e-05, "loss": 0.7565, "num_input_tokens_seen": 8193960, "step": 14210 }, { "epoch": 2.1172177539469765, "grad_norm": 1.5456856489181519, "learning_rate": 4.812338630884783e-05, "loss": 0.6508, "num_input_tokens_seen": 8196904, "step": 14215 }, { "epoch": 2.1179624664879357, "grad_norm": 1.6974605321884155, "learning_rate": 4.812091516010795e-05, "loss": 0.7306, "num_input_tokens_seen": 8199656, "step": 14220 }, { "epoch": 2.118707179028895, "grad_norm": 1.1147724390029907, "learning_rate": 4.811844244895063e-05, "loss": 0.6663, "num_input_tokens_seen": 8202568, "step": 14225 }, { "epoch": 2.119451891569854, "grad_norm": 1.2342045307159424, "learning_rate": 4.811596817554296e-05, "loss": 0.695, "num_input_tokens_seen": 8205416, "step": 14230 }, { "epoch": 2.1201966041108133, "grad_norm": 2.110403537750244, "learning_rate": 4.8113492340052135e-05, "loss": 0.6991, "num_input_tokens_seen": 8208168, "step": 14235 }, { "epoch": 2.1209413166517725, "grad_norm": 1.15031898021698, "learning_rate": 4.8111014942645476e-05, "loss": 0.6463, "num_input_tokens_seen": 8210984, "step": 14240 }, { "epoch": 2.1216860291927317, "grad_norm": 1.3621399402618408, "learning_rate": 4.8108535983490386e-05, "loss": 0.6822, "num_input_tokens_seen": 8213992, "step": 14245 }, { "epoch": 2.122430741733691, "grad_norm": 2.318284511566162, "learning_rate": 4.8106055462754394e-05, "loss": 0.681, "num_input_tokens_seen": 8217096, "step": 14250 }, { "epoch": 2.12317545427465, "grad_norm": 1.285021185874939, "learning_rate": 4.810357338060512e-05, "loss": 0.621, "num_input_tokens_seen": 8220040, "step": 14255 }, { "epoch": 2.1239201668156094, "grad_norm": 1.7618800401687622, "learning_rate": 4.810108973721028e-05, "loss": 0.616, "num_input_tokens_seen": 8222984, "step": 14260 }, { "epoch": 2.1246648793565686, "grad_norm": 1.0684669017791748, "learning_rate": 4.809860453273772e-05, "loss": 0.6007, "num_input_tokens_seen": 8225864, "step": 14265 }, { "epoch": 2.1254095918975278, "grad_norm": 2.363452672958374, "learning_rate": 4.809611776735538e-05, "loss": 0.6976, "num_input_tokens_seen": 8228936, "step": 14270 }, { "epoch": 2.1261543044384865, "grad_norm": 0.8309570550918579, "learning_rate": 4.809362944123129e-05, "loss": 0.831, "num_input_tokens_seen": 8231592, "step": 14275 }, { "epoch": 2.126899016979446, "grad_norm": 1.4471514225006104, "learning_rate": 4.809113955453363e-05, "loss": 0.8254, "num_input_tokens_seen": 8234312, "step": 14280 }, { "epoch": 2.127643729520405, "grad_norm": 1.1123864650726318, "learning_rate": 4.8088648107430636e-05, "loss": 0.6256, "num_input_tokens_seen": 8237096, "step": 14285 }, { "epoch": 2.128388442061364, "grad_norm": 1.123098373413086, "learning_rate": 4.8086155100090676e-05, "loss": 0.6849, "num_input_tokens_seen": 8239848, "step": 14290 }, { "epoch": 2.1291331546023233, "grad_norm": 1.1578186750411987, "learning_rate": 4.8083660532682214e-05, "loss": 0.6017, "num_input_tokens_seen": 8242664, "step": 14295 }, { "epoch": 2.1298778671432825, "grad_norm": 0.9638151526451111, "learning_rate": 4.8081164405373825e-05, "loss": 0.6126, "num_input_tokens_seen": 8245704, "step": 14300 }, { "epoch": 2.1306225796842417, "grad_norm": 1.1208864450454712, "learning_rate": 4.807866671833418e-05, "loss": 0.6868, "num_input_tokens_seen": 8248424, "step": 14305 }, { "epoch": 2.131367292225201, "grad_norm": 0.9997280836105347, "learning_rate": 4.807616747173208e-05, "loss": 0.6748, "num_input_tokens_seen": 8251400, "step": 14310 }, { "epoch": 2.13211200476616, "grad_norm": 1.4610453844070435, "learning_rate": 4.8073666665736394e-05, "loss": 0.7786, "num_input_tokens_seen": 8254472, "step": 14315 }, { "epoch": 2.1328567173071193, "grad_norm": 1.5118685960769653, "learning_rate": 4.807116430051614e-05, "loss": 0.7551, "num_input_tokens_seen": 8257256, "step": 14320 }, { "epoch": 2.1336014298480785, "grad_norm": 1.3753126859664917, "learning_rate": 4.806866037624039e-05, "loss": 0.7543, "num_input_tokens_seen": 8260392, "step": 14325 }, { "epoch": 2.1343461423890377, "grad_norm": 0.7243727445602417, "learning_rate": 4.806615489307836e-05, "loss": 0.6907, "num_input_tokens_seen": 8263208, "step": 14330 }, { "epoch": 2.135090854929997, "grad_norm": 1.2091799974441528, "learning_rate": 4.806364785119937e-05, "loss": 0.6786, "num_input_tokens_seen": 8265960, "step": 14335 }, { "epoch": 2.135835567470956, "grad_norm": 0.7535638809204102, "learning_rate": 4.8061139250772825e-05, "loss": 0.6021, "num_input_tokens_seen": 8268872, "step": 14340 }, { "epoch": 2.1365802800119154, "grad_norm": 1.1466376781463623, "learning_rate": 4.805862909196825e-05, "loss": 0.5387, "num_input_tokens_seen": 8271624, "step": 14345 }, { "epoch": 2.1373249925528746, "grad_norm": 0.9268626570701599, "learning_rate": 4.805611737495527e-05, "loss": 0.6795, "num_input_tokens_seen": 8274536, "step": 14350 }, { "epoch": 2.1380697050938338, "grad_norm": 1.2372946739196777, "learning_rate": 4.8053604099903614e-05, "loss": 0.7663, "num_input_tokens_seen": 8277544, "step": 14355 }, { "epoch": 2.138814417634793, "grad_norm": 1.7889256477355957, "learning_rate": 4.8051089266983126e-05, "loss": 0.6736, "num_input_tokens_seen": 8280712, "step": 14360 }, { "epoch": 2.139559130175752, "grad_norm": 1.1808537244796753, "learning_rate": 4.804857287636375e-05, "loss": 0.6301, "num_input_tokens_seen": 8283752, "step": 14365 }, { "epoch": 2.1403038427167114, "grad_norm": 1.7844452857971191, "learning_rate": 4.804605492821552e-05, "loss": 0.771, "num_input_tokens_seen": 8286536, "step": 14370 }, { "epoch": 2.1410485552576706, "grad_norm": 0.6942896246910095, "learning_rate": 4.80435354227086e-05, "loss": 0.6524, "num_input_tokens_seen": 8289512, "step": 14375 }, { "epoch": 2.1417932677986298, "grad_norm": 1.558775544166565, "learning_rate": 4.8041014360013236e-05, "loss": 0.7673, "num_input_tokens_seen": 8292360, "step": 14380 }, { "epoch": 2.142537980339589, "grad_norm": 0.7853931188583374, "learning_rate": 4.803849174029981e-05, "loss": 0.5483, "num_input_tokens_seen": 8295048, "step": 14385 }, { "epoch": 2.143282692880548, "grad_norm": 0.8971487283706665, "learning_rate": 4.803596756373877e-05, "loss": 0.7286, "num_input_tokens_seen": 8298088, "step": 14390 }, { "epoch": 2.1440274054215074, "grad_norm": 2.15787672996521, "learning_rate": 4.8033441830500706e-05, "loss": 0.6504, "num_input_tokens_seen": 8301128, "step": 14395 }, { "epoch": 2.1447721179624666, "grad_norm": 1.0680687427520752, "learning_rate": 4.803091454075629e-05, "loss": 0.6306, "num_input_tokens_seen": 8304072, "step": 14400 }, { "epoch": 2.145516830503426, "grad_norm": 1.164637565612793, "learning_rate": 4.8028385694676306e-05, "loss": 0.703, "num_input_tokens_seen": 8307016, "step": 14405 }, { "epoch": 2.146261543044385, "grad_norm": 0.9768379926681519, "learning_rate": 4.802585529243164e-05, "loss": 0.5142, "num_input_tokens_seen": 8309416, "step": 14410 }, { "epoch": 2.147006255585344, "grad_norm": 1.374172329902649, "learning_rate": 4.80233233341933e-05, "loss": 0.7509, "num_input_tokens_seen": 8312552, "step": 14415 }, { "epoch": 2.1477509681263034, "grad_norm": 2.0213463306427, "learning_rate": 4.802078982013236e-05, "loss": 0.5542, "num_input_tokens_seen": 8315176, "step": 14420 }, { "epoch": 2.1484956806672626, "grad_norm": 1.1406371593475342, "learning_rate": 4.801825475042005e-05, "loss": 0.6721, "num_input_tokens_seen": 8318056, "step": 14425 }, { "epoch": 2.149240393208222, "grad_norm": 0.6739243268966675, "learning_rate": 4.801571812522767e-05, "loss": 0.6372, "num_input_tokens_seen": 8320840, "step": 14430 }, { "epoch": 2.149985105749181, "grad_norm": 1.2090325355529785, "learning_rate": 4.801317994472663e-05, "loss": 0.5022, "num_input_tokens_seen": 8323688, "step": 14435 }, { "epoch": 2.15072981829014, "grad_norm": 0.600775420665741, "learning_rate": 4.801064020908845e-05, "loss": 0.5966, "num_input_tokens_seen": 8326664, "step": 14440 }, { "epoch": 2.1514745308310994, "grad_norm": 0.8333802819252014, "learning_rate": 4.800809891848477e-05, "loss": 0.492, "num_input_tokens_seen": 8329384, "step": 14445 }, { "epoch": 2.152219243372058, "grad_norm": 1.5613073110580444, "learning_rate": 4.80055560730873e-05, "loss": 0.6373, "num_input_tokens_seen": 8332296, "step": 14450 }, { "epoch": 2.1529639559130174, "grad_norm": 0.8411529064178467, "learning_rate": 4.800301167306789e-05, "loss": 0.6405, "num_input_tokens_seen": 8335272, "step": 14455 }, { "epoch": 2.1537086684539766, "grad_norm": 1.7156507968902588, "learning_rate": 4.800046571859847e-05, "loss": 0.6507, "num_input_tokens_seen": 8338344, "step": 14460 }, { "epoch": 2.1544533809949358, "grad_norm": 0.9770905375480652, "learning_rate": 4.79979182098511e-05, "loss": 0.5883, "num_input_tokens_seen": 8341192, "step": 14465 }, { "epoch": 2.155198093535895, "grad_norm": 3.394399642944336, "learning_rate": 4.7995369146997906e-05, "loss": 0.798, "num_input_tokens_seen": 8344200, "step": 14470 }, { "epoch": 2.155942806076854, "grad_norm": 1.8815598487854004, "learning_rate": 4.7992818530211164e-05, "loss": 0.7771, "num_input_tokens_seen": 8347176, "step": 14475 }, { "epoch": 2.1566875186178134, "grad_norm": 2.9667370319366455, "learning_rate": 4.799026635966323e-05, "loss": 0.7311, "num_input_tokens_seen": 8350120, "step": 14480 }, { "epoch": 2.1574322311587726, "grad_norm": 1.6556514501571655, "learning_rate": 4.798771263552656e-05, "loss": 0.5666, "num_input_tokens_seen": 8353032, "step": 14485 }, { "epoch": 2.158176943699732, "grad_norm": 1.1286336183547974, "learning_rate": 4.798515735797374e-05, "loss": 0.7009, "num_input_tokens_seen": 8356776, "step": 14490 }, { "epoch": 2.158921656240691, "grad_norm": 1.3882949352264404, "learning_rate": 4.7982600527177427e-05, "loss": 0.5773, "num_input_tokens_seen": 8359848, "step": 14495 }, { "epoch": 2.15966636878165, "grad_norm": 1.192413091659546, "learning_rate": 4.798004214331042e-05, "loss": 0.5745, "num_input_tokens_seen": 8362536, "step": 14500 }, { "epoch": 2.1604110813226094, "grad_norm": 2.774712085723877, "learning_rate": 4.7977482206545586e-05, "loss": 0.5126, "num_input_tokens_seen": 8365160, "step": 14505 }, { "epoch": 2.1611557938635686, "grad_norm": 2.0210788249969482, "learning_rate": 4.797492071705593e-05, "loss": 0.7992, "num_input_tokens_seen": 8368168, "step": 14510 }, { "epoch": 2.161900506404528, "grad_norm": 1.4100924730300903, "learning_rate": 4.7972357675014546e-05, "loss": 0.576, "num_input_tokens_seen": 8370888, "step": 14515 }, { "epoch": 2.162645218945487, "grad_norm": 1.7211333513259888, "learning_rate": 4.796979308059462e-05, "loss": 0.742, "num_input_tokens_seen": 8373480, "step": 14520 }, { "epoch": 2.163389931486446, "grad_norm": 1.0279370546340942, "learning_rate": 4.796722693396947e-05, "loss": 0.6841, "num_input_tokens_seen": 8376552, "step": 14525 }, { "epoch": 2.1641346440274054, "grad_norm": 1.0422370433807373, "learning_rate": 4.79646592353125e-05, "loss": 0.6513, "num_input_tokens_seen": 8379464, "step": 14530 }, { "epoch": 2.1648793565683646, "grad_norm": 1.2407671213150024, "learning_rate": 4.7962089984797235e-05, "loss": 0.5741, "num_input_tokens_seen": 8382440, "step": 14535 }, { "epoch": 2.165624069109324, "grad_norm": 1.3754445314407349, "learning_rate": 4.795951918259727e-05, "loss": 0.5065, "num_input_tokens_seen": 8385160, "step": 14540 }, { "epoch": 2.166368781650283, "grad_norm": 0.914713978767395, "learning_rate": 4.795694682888635e-05, "loss": 0.8411, "num_input_tokens_seen": 8387848, "step": 14545 }, { "epoch": 2.167113494191242, "grad_norm": 1.987422227859497, "learning_rate": 4.795437292383831e-05, "loss": 0.7387, "num_input_tokens_seen": 8390600, "step": 14550 }, { "epoch": 2.1678582067322014, "grad_norm": 1.4441310167312622, "learning_rate": 4.7951797467627065e-05, "loss": 0.5314, "num_input_tokens_seen": 8393480, "step": 14555 }, { "epoch": 2.1686029192731606, "grad_norm": 0.6069968342781067, "learning_rate": 4.7949220460426666e-05, "loss": 0.6399, "num_input_tokens_seen": 8396200, "step": 14560 }, { "epoch": 2.16934763181412, "grad_norm": 0.6494024395942688, "learning_rate": 4.794664190241125e-05, "loss": 0.5939, "num_input_tokens_seen": 8399208, "step": 14565 }, { "epoch": 2.170092344355079, "grad_norm": 0.6702576875686646, "learning_rate": 4.794406179375507e-05, "loss": 0.5199, "num_input_tokens_seen": 8401800, "step": 14570 }, { "epoch": 2.1708370568960382, "grad_norm": 0.7798964977264404, "learning_rate": 4.794148013463248e-05, "loss": 0.5517, "num_input_tokens_seen": 8404712, "step": 14575 }, { "epoch": 2.1715817694369974, "grad_norm": 1.0677257776260376, "learning_rate": 4.793889692521792e-05, "loss": 0.7342, "num_input_tokens_seen": 8407432, "step": 14580 }, { "epoch": 2.1723264819779566, "grad_norm": 0.7350539565086365, "learning_rate": 4.793631216568599e-05, "loss": 0.6637, "num_input_tokens_seen": 8410312, "step": 14585 }, { "epoch": 2.173071194518916, "grad_norm": 1.01504385471344, "learning_rate": 4.793372585621133e-05, "loss": 0.712, "num_input_tokens_seen": 8413320, "step": 14590 }, { "epoch": 2.173815907059875, "grad_norm": 1.1162375211715698, "learning_rate": 4.793113799696871e-05, "loss": 0.5606, "num_input_tokens_seen": 8416168, "step": 14595 }, { "epoch": 2.1745606196008342, "grad_norm": 0.9454033374786377, "learning_rate": 4.792854858813303e-05, "loss": 0.7773, "num_input_tokens_seen": 8419240, "step": 14600 }, { "epoch": 2.1753053321417934, "grad_norm": 2.057481288909912, "learning_rate": 4.792595762987924e-05, "loss": 0.7389, "num_input_tokens_seen": 8422440, "step": 14605 }, { "epoch": 2.1760500446827526, "grad_norm": 1.2291315793991089, "learning_rate": 4.792336512238246e-05, "loss": 0.9258, "num_input_tokens_seen": 8425032, "step": 14610 }, { "epoch": 2.176794757223712, "grad_norm": 0.7090242505073547, "learning_rate": 4.7920771065817846e-05, "loss": 0.7243, "num_input_tokens_seen": 8427816, "step": 14615 }, { "epoch": 2.177539469764671, "grad_norm": 0.6529079675674438, "learning_rate": 4.791817546036072e-05, "loss": 0.7244, "num_input_tokens_seen": 8430632, "step": 14620 }, { "epoch": 2.17828418230563, "grad_norm": 0.7602981925010681, "learning_rate": 4.7915578306186485e-05, "loss": 0.7683, "num_input_tokens_seen": 8433384, "step": 14625 }, { "epoch": 2.179028894846589, "grad_norm": 0.8300358057022095, "learning_rate": 4.791297960347063e-05, "loss": 0.6438, "num_input_tokens_seen": 8435976, "step": 14630 }, { "epoch": 2.179773607387548, "grad_norm": 0.6722771525382996, "learning_rate": 4.791037935238877e-05, "loss": 0.704, "num_input_tokens_seen": 8438920, "step": 14635 }, { "epoch": 2.1805183199285074, "grad_norm": 0.7810254693031311, "learning_rate": 4.790777755311662e-05, "loss": 0.721, "num_input_tokens_seen": 8441640, "step": 14640 }, { "epoch": 2.1812630324694666, "grad_norm": 0.9131264686584473, "learning_rate": 4.790517420583e-05, "loss": 0.7425, "num_input_tokens_seen": 8444488, "step": 14645 }, { "epoch": 2.182007745010426, "grad_norm": 0.8783792853355408, "learning_rate": 4.790256931070483e-05, "loss": 0.6379, "num_input_tokens_seen": 8447240, "step": 14650 }, { "epoch": 2.182752457551385, "grad_norm": 1.544395923614502, "learning_rate": 4.789996286791715e-05, "loss": 0.5947, "num_input_tokens_seen": 8450088, "step": 14655 }, { "epoch": 2.1834971700923442, "grad_norm": 0.8120569586753845, "learning_rate": 4.789735487764307e-05, "loss": 0.7735, "num_input_tokens_seen": 8452776, "step": 14660 }, { "epoch": 2.1842418826333034, "grad_norm": 1.9282870292663574, "learning_rate": 4.789474534005885e-05, "loss": 0.6484, "num_input_tokens_seen": 8455656, "step": 14665 }, { "epoch": 2.1849865951742626, "grad_norm": 0.8149595260620117, "learning_rate": 4.789213425534082e-05, "loss": 0.5069, "num_input_tokens_seen": 8458312, "step": 14670 }, { "epoch": 2.185731307715222, "grad_norm": 1.9937970638275146, "learning_rate": 4.788952162366543e-05, "loss": 0.7101, "num_input_tokens_seen": 8461672, "step": 14675 }, { "epoch": 2.186476020256181, "grad_norm": 1.1726597547531128, "learning_rate": 4.7886907445209234e-05, "loss": 0.6543, "num_input_tokens_seen": 8464584, "step": 14680 }, { "epoch": 2.1872207327971402, "grad_norm": 2.0322494506835938, "learning_rate": 4.7884291720148876e-05, "loss": 0.7726, "num_input_tokens_seen": 8467336, "step": 14685 }, { "epoch": 2.1879654453380994, "grad_norm": 1.4439187049865723, "learning_rate": 4.7881674448661136e-05, "loss": 0.6818, "num_input_tokens_seen": 8470280, "step": 14690 }, { "epoch": 2.1887101578790586, "grad_norm": 1.4479200839996338, "learning_rate": 4.7879055630922856e-05, "loss": 0.639, "num_input_tokens_seen": 8473384, "step": 14695 }, { "epoch": 2.189454870420018, "grad_norm": 2.7627832889556885, "learning_rate": 4.7876435267111024e-05, "loss": 0.8402, "num_input_tokens_seen": 8476424, "step": 14700 }, { "epoch": 2.190199582960977, "grad_norm": 2.153670310974121, "learning_rate": 4.7873813357402704e-05, "loss": 0.6352, "num_input_tokens_seen": 8479176, "step": 14705 }, { "epoch": 2.1909442955019363, "grad_norm": 1.6493003368377686, "learning_rate": 4.7871189901975075e-05, "loss": 0.6683, "num_input_tokens_seen": 8482184, "step": 14710 }, { "epoch": 2.1916890080428955, "grad_norm": 1.308125615119934, "learning_rate": 4.786856490100542e-05, "loss": 0.5051, "num_input_tokens_seen": 8485256, "step": 14715 }, { "epoch": 2.1924337205838547, "grad_norm": 1.4278446435928345, "learning_rate": 4.786593835467112e-05, "loss": 0.5552, "num_input_tokens_seen": 8488264, "step": 14720 }, { "epoch": 2.193178433124814, "grad_norm": 1.039494514465332, "learning_rate": 4.786331026314968e-05, "loss": 0.5125, "num_input_tokens_seen": 8491144, "step": 14725 }, { "epoch": 2.193923145665773, "grad_norm": 1.4223742485046387, "learning_rate": 4.7860680626618684e-05, "loss": 0.7842, "num_input_tokens_seen": 8493960, "step": 14730 }, { "epoch": 2.1946678582067323, "grad_norm": 1.9333752393722534, "learning_rate": 4.7858049445255834e-05, "loss": 0.6259, "num_input_tokens_seen": 8496840, "step": 14735 }, { "epoch": 2.1954125707476915, "grad_norm": 1.0455896854400635, "learning_rate": 4.7855416719238945e-05, "loss": 0.715, "num_input_tokens_seen": 8499592, "step": 14740 }, { "epoch": 2.1961572832886507, "grad_norm": 0.8147960305213928, "learning_rate": 4.78527824487459e-05, "loss": 0.621, "num_input_tokens_seen": 8502632, "step": 14745 }, { "epoch": 2.19690199582961, "grad_norm": 1.3093816041946411, "learning_rate": 4.785014663395475e-05, "loss": 0.6383, "num_input_tokens_seen": 8505800, "step": 14750 }, { "epoch": 2.197646708370569, "grad_norm": 1.45451819896698, "learning_rate": 4.784750927504358e-05, "loss": 0.5961, "num_input_tokens_seen": 8508552, "step": 14755 }, { "epoch": 2.1983914209115283, "grad_norm": 0.8309901356697083, "learning_rate": 4.784487037219063e-05, "loss": 0.6374, "num_input_tokens_seen": 8511464, "step": 14760 }, { "epoch": 2.1991361334524875, "grad_norm": 0.8010584115982056, "learning_rate": 4.784222992557422e-05, "loss": 0.6996, "num_input_tokens_seen": 8513992, "step": 14765 }, { "epoch": 2.1998808459934467, "grad_norm": 1.2073472738265991, "learning_rate": 4.783958793537278e-05, "loss": 0.7027, "num_input_tokens_seen": 8516680, "step": 14770 }, { "epoch": 2.200625558534406, "grad_norm": 0.8106566667556763, "learning_rate": 4.783694440176485e-05, "loss": 0.5339, "num_input_tokens_seen": 8519624, "step": 14775 }, { "epoch": 2.201370271075365, "grad_norm": 1.1003165245056152, "learning_rate": 4.7834299324929056e-05, "loss": 0.7701, "num_input_tokens_seen": 8522536, "step": 14780 }, { "epoch": 2.2021149836163243, "grad_norm": 1.1134306192398071, "learning_rate": 4.7831652705044164e-05, "loss": 0.7502, "num_input_tokens_seen": 8525512, "step": 14785 }, { "epoch": 2.202859696157283, "grad_norm": 0.8154904842376709, "learning_rate": 4.7829004542289e-05, "loss": 0.5278, "num_input_tokens_seen": 8528392, "step": 14790 }, { "epoch": 2.2036044086982427, "grad_norm": 0.9334527850151062, "learning_rate": 4.7826354836842525e-05, "loss": 0.751, "num_input_tokens_seen": 8531432, "step": 14795 }, { "epoch": 2.2043491212392015, "grad_norm": 1.3413368463516235, "learning_rate": 4.7823703588883796e-05, "loss": 0.7227, "num_input_tokens_seen": 8534440, "step": 14800 }, { "epoch": 2.2050938337801607, "grad_norm": 1.0817409753799438, "learning_rate": 4.782105079859198e-05, "loss": 0.679, "num_input_tokens_seen": 8537384, "step": 14805 }, { "epoch": 2.20583854632112, "grad_norm": 1.1680961847305298, "learning_rate": 4.7818396466146326e-05, "loss": 0.6437, "num_input_tokens_seen": 8540200, "step": 14810 }, { "epoch": 2.206583258862079, "grad_norm": 1.2268165349960327, "learning_rate": 4.781574059172621e-05, "loss": 0.6691, "num_input_tokens_seen": 8542888, "step": 14815 }, { "epoch": 2.2073279714030383, "grad_norm": 0.936296284198761, "learning_rate": 4.781308317551112e-05, "loss": 0.5414, "num_input_tokens_seen": 8545768, "step": 14820 }, { "epoch": 2.2080726839439975, "grad_norm": 1.6666988134384155, "learning_rate": 4.781042421768061e-05, "loss": 0.5539, "num_input_tokens_seen": 8548552, "step": 14825 }, { "epoch": 2.2088173964849567, "grad_norm": 1.2523479461669922, "learning_rate": 4.7807763718414374e-05, "loss": 0.6698, "num_input_tokens_seen": 8551432, "step": 14830 }, { "epoch": 2.209562109025916, "grad_norm": 0.9703276753425598, "learning_rate": 4.7805101677892194e-05, "loss": 0.6687, "num_input_tokens_seen": 8554408, "step": 14835 }, { "epoch": 2.210306821566875, "grad_norm": 0.7565433382987976, "learning_rate": 4.7802438096293964e-05, "loss": 0.6831, "num_input_tokens_seen": 8557128, "step": 14840 }, { "epoch": 2.2110515341078343, "grad_norm": 1.4965516328811646, "learning_rate": 4.7799772973799674e-05, "loss": 0.6204, "num_input_tokens_seen": 8560040, "step": 14845 }, { "epoch": 2.2117962466487935, "grad_norm": 1.4017829895019531, "learning_rate": 4.7797106310589424e-05, "loss": 0.621, "num_input_tokens_seen": 8562824, "step": 14850 }, { "epoch": 2.2125409591897527, "grad_norm": 0.9441686272621155, "learning_rate": 4.779443810684341e-05, "loss": 0.8546, "num_input_tokens_seen": 8565960, "step": 14855 }, { "epoch": 2.213285671730712, "grad_norm": 0.6383371949195862, "learning_rate": 4.779176836274195e-05, "loss": 0.7142, "num_input_tokens_seen": 8568808, "step": 14860 }, { "epoch": 2.214030384271671, "grad_norm": 0.8367955684661865, "learning_rate": 4.778909707846545e-05, "loss": 0.6979, "num_input_tokens_seen": 8571496, "step": 14865 }, { "epoch": 2.2147750968126303, "grad_norm": 1.0272661447525024, "learning_rate": 4.778642425419442e-05, "loss": 0.6798, "num_input_tokens_seen": 8574504, "step": 14870 }, { "epoch": 2.2155198093535895, "grad_norm": 0.7776187062263489, "learning_rate": 4.778374989010949e-05, "loss": 0.7507, "num_input_tokens_seen": 8577256, "step": 14875 }, { "epoch": 2.2162645218945487, "grad_norm": 3.3350439071655273, "learning_rate": 4.778107398639136e-05, "loss": 0.7099, "num_input_tokens_seen": 8580200, "step": 14880 }, { "epoch": 2.217009234435508, "grad_norm": 0.9218176603317261, "learning_rate": 4.777839654322088e-05, "loss": 0.6948, "num_input_tokens_seen": 8583304, "step": 14885 }, { "epoch": 2.217753946976467, "grad_norm": 1.107130527496338, "learning_rate": 4.777571756077897e-05, "loss": 0.7786, "num_input_tokens_seen": 8586248, "step": 14890 }, { "epoch": 2.2184986595174263, "grad_norm": 1.0701147317886353, "learning_rate": 4.777303703924667e-05, "loss": 0.6687, "num_input_tokens_seen": 8588968, "step": 14895 }, { "epoch": 2.2192433720583855, "grad_norm": 2.462531566619873, "learning_rate": 4.777035497880511e-05, "loss": 0.6461, "num_input_tokens_seen": 8592008, "step": 14900 }, { "epoch": 2.2199880845993447, "grad_norm": 1.2412165403366089, "learning_rate": 4.776767137963554e-05, "loss": 0.6697, "num_input_tokens_seen": 8595144, "step": 14905 }, { "epoch": 2.220732797140304, "grad_norm": 0.958032488822937, "learning_rate": 4.776498624191931e-05, "loss": 0.6659, "num_input_tokens_seen": 8598408, "step": 14910 }, { "epoch": 2.221477509681263, "grad_norm": 0.9604125022888184, "learning_rate": 4.7762299565837855e-05, "loss": 0.7664, "num_input_tokens_seen": 8601352, "step": 14915 }, { "epoch": 2.2222222222222223, "grad_norm": 0.9082801938056946, "learning_rate": 4.775961135157275e-05, "loss": 0.7017, "num_input_tokens_seen": 8604456, "step": 14920 }, { "epoch": 2.2229669347631815, "grad_norm": 0.8294523358345032, "learning_rate": 4.775692159930564e-05, "loss": 0.6124, "num_input_tokens_seen": 8607560, "step": 14925 }, { "epoch": 2.2237116473041407, "grad_norm": 0.9440820217132568, "learning_rate": 4.775423030921828e-05, "loss": 0.6304, "num_input_tokens_seen": 8610440, "step": 14930 }, { "epoch": 2.2244563598451, "grad_norm": 0.9632822275161743, "learning_rate": 4.7751537481492565e-05, "loss": 0.7242, "num_input_tokens_seen": 8613384, "step": 14935 }, { "epoch": 2.225201072386059, "grad_norm": 0.2743629217147827, "learning_rate": 4.7748843116310434e-05, "loss": 0.5488, "num_input_tokens_seen": 8616296, "step": 14940 }, { "epoch": 2.2259457849270183, "grad_norm": 2.134751319885254, "learning_rate": 4.774614721385399e-05, "loss": 0.6717, "num_input_tokens_seen": 8619368, "step": 14945 }, { "epoch": 2.2266904974679775, "grad_norm": 2.442761182785034, "learning_rate": 4.7743449774305386e-05, "loss": 0.5777, "num_input_tokens_seen": 8622248, "step": 14950 }, { "epoch": 2.2274352100089367, "grad_norm": 0.8274829387664795, "learning_rate": 4.774075079784692e-05, "loss": 0.7055, "num_input_tokens_seen": 8625128, "step": 14955 }, { "epoch": 2.228179922549896, "grad_norm": 0.897434413433075, "learning_rate": 4.7738050284660966e-05, "loss": 0.566, "num_input_tokens_seen": 8628168, "step": 14960 }, { "epoch": 2.2289246350908547, "grad_norm": 2.0110039710998535, "learning_rate": 4.7735348234930024e-05, "loss": 0.6546, "num_input_tokens_seen": 8631208, "step": 14965 }, { "epoch": 2.2296693476318143, "grad_norm": 0.9949216246604919, "learning_rate": 4.773264464883669e-05, "loss": 0.6518, "num_input_tokens_seen": 8634152, "step": 14970 }, { "epoch": 2.230414060172773, "grad_norm": 0.7155570983886719, "learning_rate": 4.772993952656364e-05, "loss": 0.6133, "num_input_tokens_seen": 8637384, "step": 14975 }, { "epoch": 2.2311587727137323, "grad_norm": 1.1392178535461426, "learning_rate": 4.7727232868293705e-05, "loss": 0.7801, "num_input_tokens_seen": 8640424, "step": 14980 }, { "epoch": 2.2319034852546915, "grad_norm": 1.2056313753128052, "learning_rate": 4.7724524674209765e-05, "loss": 0.6701, "num_input_tokens_seen": 8643240, "step": 14985 }, { "epoch": 2.2326481977956507, "grad_norm": 0.9440069794654846, "learning_rate": 4.7721814944494834e-05, "loss": 0.5551, "num_input_tokens_seen": 8646024, "step": 14990 }, { "epoch": 2.23339291033661, "grad_norm": 1.4208042621612549, "learning_rate": 4.771910367933204e-05, "loss": 0.7924, "num_input_tokens_seen": 8648840, "step": 14995 }, { "epoch": 2.234137622877569, "grad_norm": 2.0324738025665283, "learning_rate": 4.771639087890459e-05, "loss": 0.8663, "num_input_tokens_seen": 8651688, "step": 15000 }, { "epoch": 2.2348823354185283, "grad_norm": 1.2232327461242676, "learning_rate": 4.771367654339579e-05, "loss": 0.6963, "num_input_tokens_seen": 8654344, "step": 15005 }, { "epoch": 2.2356270479594875, "grad_norm": 1.1244481801986694, "learning_rate": 4.771096067298909e-05, "loss": 0.574, "num_input_tokens_seen": 8657128, "step": 15010 }, { "epoch": 2.2363717605004467, "grad_norm": 1.029381275177002, "learning_rate": 4.7708243267868e-05, "loss": 0.6121, "num_input_tokens_seen": 8659912, "step": 15015 }, { "epoch": 2.237116473041406, "grad_norm": 1.173445701599121, "learning_rate": 4.770552432821615e-05, "loss": 0.8117, "num_input_tokens_seen": 8663016, "step": 15020 }, { "epoch": 2.237861185582365, "grad_norm": 1.2154386043548584, "learning_rate": 4.770280385421728e-05, "loss": 0.6482, "num_input_tokens_seen": 8666280, "step": 15025 }, { "epoch": 2.2386058981233243, "grad_norm": 1.0169461965560913, "learning_rate": 4.7700081846055236e-05, "loss": 0.6719, "num_input_tokens_seen": 8669224, "step": 15030 }, { "epoch": 2.2393506106642835, "grad_norm": 0.6982775330543518, "learning_rate": 4.7697358303913944e-05, "loss": 0.6822, "num_input_tokens_seen": 8671912, "step": 15035 }, { "epoch": 2.2400953232052427, "grad_norm": 0.7657568454742432, "learning_rate": 4.769463322797746e-05, "loss": 0.7778, "num_input_tokens_seen": 8674984, "step": 15040 }, { "epoch": 2.240840035746202, "grad_norm": 1.0545594692230225, "learning_rate": 4.7691906618429935e-05, "loss": 0.7074, "num_input_tokens_seen": 8677896, "step": 15045 }, { "epoch": 2.241584748287161, "grad_norm": 1.2573732137680054, "learning_rate": 4.768917847545562e-05, "loss": 0.5987, "num_input_tokens_seen": 8680872, "step": 15050 }, { "epoch": 2.2423294608281203, "grad_norm": 1.2411993741989136, "learning_rate": 4.768644879923887e-05, "loss": 0.7142, "num_input_tokens_seen": 8683560, "step": 15055 }, { "epoch": 2.2430741733690795, "grad_norm": 0.7997516393661499, "learning_rate": 4.768371758996415e-05, "loss": 0.7553, "num_input_tokens_seen": 8686408, "step": 15060 }, { "epoch": 2.2438188859100388, "grad_norm": 0.6599094867706299, "learning_rate": 4.7680984847816015e-05, "loss": 0.6491, "num_input_tokens_seen": 8689320, "step": 15065 }, { "epoch": 2.244563598450998, "grad_norm": 0.5699360370635986, "learning_rate": 4.767825057297914e-05, "loss": 0.7574, "num_input_tokens_seen": 8692584, "step": 15070 }, { "epoch": 2.245308310991957, "grad_norm": 0.6631327867507935, "learning_rate": 4.767551476563829e-05, "loss": 0.6686, "num_input_tokens_seen": 8695432, "step": 15075 }, { "epoch": 2.2460530235329164, "grad_norm": 1.301285982131958, "learning_rate": 4.767277742597835e-05, "loss": 0.6816, "num_input_tokens_seen": 8698152, "step": 15080 }, { "epoch": 2.2467977360738756, "grad_norm": 1.0713597536087036, "learning_rate": 4.7670038554184296e-05, "loss": 0.6998, "num_input_tokens_seen": 8700840, "step": 15085 }, { "epoch": 2.2475424486148348, "grad_norm": 1.0154657363891602, "learning_rate": 4.7667298150441194e-05, "loss": 0.7293, "num_input_tokens_seen": 8703912, "step": 15090 }, { "epoch": 2.248287161155794, "grad_norm": 0.8341382145881653, "learning_rate": 4.7664556214934255e-05, "loss": 0.6448, "num_input_tokens_seen": 8706888, "step": 15095 }, { "epoch": 2.249031873696753, "grad_norm": 0.9433320760726929, "learning_rate": 4.766181274784874e-05, "loss": 0.6386, "num_input_tokens_seen": 8709800, "step": 15100 }, { "epoch": 2.2497765862377124, "grad_norm": 0.8901689648628235, "learning_rate": 4.765906774937007e-05, "loss": 0.6544, "num_input_tokens_seen": 8712616, "step": 15105 }, { "epoch": 2.2505212987786716, "grad_norm": 0.9844955205917358, "learning_rate": 4.765632121968371e-05, "loss": 0.8153, "num_input_tokens_seen": 8715464, "step": 15110 }, { "epoch": 2.2512660113196308, "grad_norm": 0.9627484679222107, "learning_rate": 4.765357315897529e-05, "loss": 0.7552, "num_input_tokens_seen": 8718888, "step": 15115 }, { "epoch": 2.25201072386059, "grad_norm": 0.6184778213500977, "learning_rate": 4.765082356743049e-05, "loss": 0.6915, "num_input_tokens_seen": 8721512, "step": 15120 }, { "epoch": 2.252755436401549, "grad_norm": 1.0286784172058105, "learning_rate": 4.7648072445235136e-05, "loss": 0.6775, "num_input_tokens_seen": 8724584, "step": 15125 }, { "epoch": 2.2535001489425084, "grad_norm": 0.8819419741630554, "learning_rate": 4.764531979257511e-05, "loss": 0.6681, "num_input_tokens_seen": 8727880, "step": 15130 }, { "epoch": 2.2542448614834676, "grad_norm": 0.6740337014198303, "learning_rate": 4.764256560963646e-05, "loss": 0.711, "num_input_tokens_seen": 8731016, "step": 15135 }, { "epoch": 2.2549895740244263, "grad_norm": 1.5437061786651611, "learning_rate": 4.7639809896605275e-05, "loss": 0.6889, "num_input_tokens_seen": 8733896, "step": 15140 }, { "epoch": 2.255734286565386, "grad_norm": 0.8240264654159546, "learning_rate": 4.763705265366779e-05, "loss": 0.6489, "num_input_tokens_seen": 8736776, "step": 15145 }, { "epoch": 2.2564789991063448, "grad_norm": 0.8101851940155029, "learning_rate": 4.763429388101031e-05, "loss": 0.5727, "num_input_tokens_seen": 8739816, "step": 15150 }, { "epoch": 2.257223711647304, "grad_norm": 0.47439393401145935, "learning_rate": 4.763153357881928e-05, "loss": 0.5255, "num_input_tokens_seen": 8742984, "step": 15155 }, { "epoch": 2.257968424188263, "grad_norm": 1.0435504913330078, "learning_rate": 4.7628771747281226e-05, "loss": 0.6091, "num_input_tokens_seen": 8746088, "step": 15160 }, { "epoch": 2.2587131367292224, "grad_norm": 2.1382429599761963, "learning_rate": 4.762600838658278e-05, "loss": 0.6688, "num_input_tokens_seen": 8748968, "step": 15165 }, { "epoch": 2.2594578492701816, "grad_norm": 0.9768578410148621, "learning_rate": 4.762324349691067e-05, "loss": 0.6101, "num_input_tokens_seen": 8751848, "step": 15170 }, { "epoch": 2.2602025618111408, "grad_norm": 0.9666561484336853, "learning_rate": 4.762047707845175e-05, "loss": 0.7256, "num_input_tokens_seen": 8754536, "step": 15175 }, { "epoch": 2.2609472743521, "grad_norm": 1.3472601175308228, "learning_rate": 4.761770913139296e-05, "loss": 0.6371, "num_input_tokens_seen": 8757256, "step": 15180 }, { "epoch": 2.261691986893059, "grad_norm": 1.8347671031951904, "learning_rate": 4.761493965592134e-05, "loss": 0.7007, "num_input_tokens_seen": 8760104, "step": 15185 }, { "epoch": 2.2624366994340184, "grad_norm": 0.7910349369049072, "learning_rate": 4.761216865222404e-05, "loss": 0.485, "num_input_tokens_seen": 8763112, "step": 15190 }, { "epoch": 2.2631814119749776, "grad_norm": 1.121041178703308, "learning_rate": 4.7609396120488324e-05, "loss": 0.6758, "num_input_tokens_seen": 8766248, "step": 15195 }, { "epoch": 2.2639261245159368, "grad_norm": 1.7602123022079468, "learning_rate": 4.760662206090154e-05, "loss": 0.7831, "num_input_tokens_seen": 8768936, "step": 15200 }, { "epoch": 2.264670837056896, "grad_norm": 1.2437868118286133, "learning_rate": 4.7603846473651147e-05, "loss": 0.7172, "num_input_tokens_seen": 8771656, "step": 15205 }, { "epoch": 2.265415549597855, "grad_norm": 1.58563232421875, "learning_rate": 4.760106935892471e-05, "loss": 0.6847, "num_input_tokens_seen": 8774472, "step": 15210 }, { "epoch": 2.2661602621388144, "grad_norm": 0.9857180714607239, "learning_rate": 4.7598290716909897e-05, "loss": 0.7723, "num_input_tokens_seen": 8777224, "step": 15215 }, { "epoch": 2.2669049746797736, "grad_norm": 1.0847431421279907, "learning_rate": 4.7595510547794465e-05, "loss": 0.7763, "num_input_tokens_seen": 8780296, "step": 15220 }, { "epoch": 2.267649687220733, "grad_norm": 1.077338695526123, "learning_rate": 4.759272885176631e-05, "loss": 0.6973, "num_input_tokens_seen": 8783240, "step": 15225 }, { "epoch": 2.268394399761692, "grad_norm": 0.7707856893539429, "learning_rate": 4.758994562901339e-05, "loss": 0.5805, "num_input_tokens_seen": 8786216, "step": 15230 }, { "epoch": 2.269139112302651, "grad_norm": 1.04115629196167, "learning_rate": 4.7587160879723785e-05, "loss": 0.7277, "num_input_tokens_seen": 8789192, "step": 15235 }, { "epoch": 2.2698838248436104, "grad_norm": 1.948724389076233, "learning_rate": 4.7584374604085684e-05, "loss": 0.7926, "num_input_tokens_seen": 8792040, "step": 15240 }, { "epoch": 2.2706285373845696, "grad_norm": 0.7512755393981934, "learning_rate": 4.758158680228737e-05, "loss": 0.7377, "num_input_tokens_seen": 8794664, "step": 15245 }, { "epoch": 2.271373249925529, "grad_norm": 0.719473123550415, "learning_rate": 4.757879747451722e-05, "loss": 0.6357, "num_input_tokens_seen": 8797704, "step": 15250 }, { "epoch": 2.272117962466488, "grad_norm": 1.5565848350524902, "learning_rate": 4.757600662096375e-05, "loss": 0.6729, "num_input_tokens_seen": 8800872, "step": 15255 }, { "epoch": 2.272862675007447, "grad_norm": 1.0559790134429932, "learning_rate": 4.757321424181553e-05, "loss": 0.6757, "num_input_tokens_seen": 8803688, "step": 15260 }, { "epoch": 2.2736073875484064, "grad_norm": 1.2130916118621826, "learning_rate": 4.7570420337261275e-05, "loss": 0.7185, "num_input_tokens_seen": 8806504, "step": 15265 }, { "epoch": 2.2743521000893656, "grad_norm": 1.2038506269454956, "learning_rate": 4.756762490748977e-05, "loss": 0.6931, "num_input_tokens_seen": 8809544, "step": 15270 }, { "epoch": 2.275096812630325, "grad_norm": 0.9710779190063477, "learning_rate": 4.7564827952689936e-05, "loss": 0.5786, "num_input_tokens_seen": 8812520, "step": 15275 }, { "epoch": 2.275841525171284, "grad_norm": 1.4605106115341187, "learning_rate": 4.756202947305076e-05, "loss": 0.5897, "num_input_tokens_seen": 8815304, "step": 15280 }, { "epoch": 2.276586237712243, "grad_norm": 1.3475686311721802, "learning_rate": 4.755922946876137e-05, "loss": 0.5542, "num_input_tokens_seen": 8818280, "step": 15285 }, { "epoch": 2.2773309502532024, "grad_norm": 1.161680817604065, "learning_rate": 4.755642794001097e-05, "loss": 0.7092, "num_input_tokens_seen": 8820968, "step": 15290 }, { "epoch": 2.2780756627941616, "grad_norm": 1.216556429862976, "learning_rate": 4.755362488698888e-05, "loss": 0.7046, "num_input_tokens_seen": 8823976, "step": 15295 }, { "epoch": 2.278820375335121, "grad_norm": 1.8431107997894287, "learning_rate": 4.755082030988452e-05, "loss": 0.7374, "num_input_tokens_seen": 8826664, "step": 15300 }, { "epoch": 2.2795650878760796, "grad_norm": 0.9249246120452881, "learning_rate": 4.7548014208887396e-05, "loss": 0.6783, "num_input_tokens_seen": 8829928, "step": 15305 }, { "epoch": 2.2803098004170392, "grad_norm": 0.6040843725204468, "learning_rate": 4.7545206584187155e-05, "loss": 0.5632, "num_input_tokens_seen": 8832712, "step": 15310 }, { "epoch": 2.281054512957998, "grad_norm": 0.5845203399658203, "learning_rate": 4.7542397435973515e-05, "loss": 0.7296, "num_input_tokens_seen": 8836200, "step": 15315 }, { "epoch": 2.2817992254989576, "grad_norm": 1.4185367822647095, "learning_rate": 4.75395867644363e-05, "loss": 0.6344, "num_input_tokens_seen": 8839080, "step": 15320 }, { "epoch": 2.2825439380399164, "grad_norm": 0.7281438112258911, "learning_rate": 4.753677456976546e-05, "loss": 0.7775, "num_input_tokens_seen": 8841960, "step": 15325 }, { "epoch": 2.2832886505808756, "grad_norm": 0.7570234537124634, "learning_rate": 4.753396085215102e-05, "loss": 0.7516, "num_input_tokens_seen": 8844584, "step": 15330 }, { "epoch": 2.284033363121835, "grad_norm": 0.6868074536323547, "learning_rate": 4.753114561178311e-05, "loss": 0.5422, "num_input_tokens_seen": 8847400, "step": 15335 }, { "epoch": 2.284778075662794, "grad_norm": 0.8419814109802246, "learning_rate": 4.7528328848852e-05, "loss": 0.7901, "num_input_tokens_seen": 8850120, "step": 15340 }, { "epoch": 2.285522788203753, "grad_norm": 0.9564064741134644, "learning_rate": 4.752551056354801e-05, "loss": 0.7316, "num_input_tokens_seen": 8852936, "step": 15345 }, { "epoch": 2.2862675007447124, "grad_norm": 0.5903672575950623, "learning_rate": 4.7522690756061606e-05, "loss": 0.6637, "num_input_tokens_seen": 8856104, "step": 15350 }, { "epoch": 2.2870122132856716, "grad_norm": 0.9427710771560669, "learning_rate": 4.751986942658332e-05, "loss": 0.7048, "num_input_tokens_seen": 8859368, "step": 15355 }, { "epoch": 2.287756925826631, "grad_norm": 1.1561106443405151, "learning_rate": 4.751704657530383e-05, "loss": 0.6923, "num_input_tokens_seen": 8862024, "step": 15360 }, { "epoch": 2.28850163836759, "grad_norm": 1.0685054063796997, "learning_rate": 4.751422220241387e-05, "loss": 0.6569, "num_input_tokens_seen": 8864872, "step": 15365 }, { "epoch": 2.289246350908549, "grad_norm": 0.7210111618041992, "learning_rate": 4.7511396308104314e-05, "loss": 0.6611, "num_input_tokens_seen": 8867752, "step": 15370 }, { "epoch": 2.2899910634495084, "grad_norm": 1.702476978302002, "learning_rate": 4.750856889256613e-05, "loss": 0.676, "num_input_tokens_seen": 8870440, "step": 15375 }, { "epoch": 2.2907357759904676, "grad_norm": 0.6859048008918762, "learning_rate": 4.750573995599036e-05, "loss": 0.6477, "num_input_tokens_seen": 8873160, "step": 15380 }, { "epoch": 2.291480488531427, "grad_norm": 0.7472535371780396, "learning_rate": 4.7502909498568194e-05, "loss": 0.7901, "num_input_tokens_seen": 8876040, "step": 15385 }, { "epoch": 2.292225201072386, "grad_norm": 0.7019774913787842, "learning_rate": 4.7500077520490884e-05, "loss": 0.5789, "num_input_tokens_seen": 8878728, "step": 15390 }, { "epoch": 2.2929699136133452, "grad_norm": 0.7051624059677124, "learning_rate": 4.749724402194982e-05, "loss": 0.586, "num_input_tokens_seen": 8881544, "step": 15395 }, { "epoch": 2.2937146261543044, "grad_norm": 0.977005660533905, "learning_rate": 4.749440900313648e-05, "loss": 0.5501, "num_input_tokens_seen": 8884104, "step": 15400 }, { "epoch": 2.2944593386952636, "grad_norm": 0.8228792548179626, "learning_rate": 4.7491572464242415e-05, "loss": 0.5865, "num_input_tokens_seen": 8886888, "step": 15405 }, { "epoch": 2.295204051236223, "grad_norm": 1.1712998151779175, "learning_rate": 4.748873440545935e-05, "loss": 0.7181, "num_input_tokens_seen": 8889928, "step": 15410 }, { "epoch": 2.295948763777182, "grad_norm": 1.3373775482177734, "learning_rate": 4.7485894826979025e-05, "loss": 0.6507, "num_input_tokens_seen": 8893096, "step": 15415 }, { "epoch": 2.2966934763181412, "grad_norm": 1.0625957250595093, "learning_rate": 4.748305372899337e-05, "loss": 0.7366, "num_input_tokens_seen": 8895624, "step": 15420 }, { "epoch": 2.2974381888591004, "grad_norm": 1.233183741569519, "learning_rate": 4.7480211111694335e-05, "loss": 0.5477, "num_input_tokens_seen": 8898408, "step": 15425 }, { "epoch": 2.2981829014000597, "grad_norm": 0.9136865139007568, "learning_rate": 4.747736697527404e-05, "loss": 0.6682, "num_input_tokens_seen": 8901352, "step": 15430 }, { "epoch": 2.298927613941019, "grad_norm": 0.7935696840286255, "learning_rate": 4.747452131992467e-05, "loss": 0.6878, "num_input_tokens_seen": 8904008, "step": 15435 }, { "epoch": 2.299672326481978, "grad_norm": 1.315329909324646, "learning_rate": 4.747167414583852e-05, "loss": 0.5352, "num_input_tokens_seen": 8906920, "step": 15440 }, { "epoch": 2.3004170390229373, "grad_norm": 1.0721145868301392, "learning_rate": 4.7468825453208e-05, "loss": 0.7449, "num_input_tokens_seen": 8909832, "step": 15445 }, { "epoch": 2.3011617515638965, "grad_norm": 1.3240540027618408, "learning_rate": 4.74659752422256e-05, "loss": 0.705, "num_input_tokens_seen": 8912680, "step": 15450 }, { "epoch": 2.3019064641048557, "grad_norm": 0.7669630646705627, "learning_rate": 4.746312351308394e-05, "loss": 0.6524, "num_input_tokens_seen": 8915272, "step": 15455 }, { "epoch": 2.302651176645815, "grad_norm": 0.7402104735374451, "learning_rate": 4.746027026597572e-05, "loss": 0.6612, "num_input_tokens_seen": 8918248, "step": 15460 }, { "epoch": 2.303395889186774, "grad_norm": 1.245484471321106, "learning_rate": 4.7457415501093746e-05, "loss": 0.6718, "num_input_tokens_seen": 8921064, "step": 15465 }, { "epoch": 2.3041406017277333, "grad_norm": 1.029656171798706, "learning_rate": 4.745455921863095e-05, "loss": 0.6836, "num_input_tokens_seen": 8924136, "step": 15470 }, { "epoch": 2.3048853142686925, "grad_norm": 1.2275011539459229, "learning_rate": 4.7451701418780334e-05, "loss": 0.7934, "num_input_tokens_seen": 8927080, "step": 15475 }, { "epoch": 2.3056300268096512, "grad_norm": 1.8267422914505005, "learning_rate": 4.744884210173501e-05, "loss": 0.6868, "num_input_tokens_seen": 8929768, "step": 15480 }, { "epoch": 2.306374739350611, "grad_norm": 1.1284538507461548, "learning_rate": 4.744598126768821e-05, "loss": 0.6709, "num_input_tokens_seen": 8933224, "step": 15485 }, { "epoch": 2.3071194518915696, "grad_norm": 1.0149352550506592, "learning_rate": 4.744311891683325e-05, "loss": 0.6034, "num_input_tokens_seen": 8935656, "step": 15490 }, { "epoch": 2.3078641644325293, "grad_norm": 0.8990690112113953, "learning_rate": 4.7440255049363566e-05, "loss": 0.5988, "num_input_tokens_seen": 8938440, "step": 15495 }, { "epoch": 2.308608876973488, "grad_norm": 1.5668971538543701, "learning_rate": 4.7437389665472686e-05, "loss": 0.536, "num_input_tokens_seen": 8941288, "step": 15500 }, { "epoch": 2.3093535895144472, "grad_norm": 1.1987216472625732, "learning_rate": 4.7434522765354226e-05, "loss": 0.6909, "num_input_tokens_seen": 8944200, "step": 15505 }, { "epoch": 2.3100983020554064, "grad_norm": 1.186746597290039, "learning_rate": 4.743165434920194e-05, "loss": 0.4623, "num_input_tokens_seen": 8947016, "step": 15510 }, { "epoch": 2.3108430145963657, "grad_norm": 0.9811614751815796, "learning_rate": 4.742878441720965e-05, "loss": 0.7148, "num_input_tokens_seen": 8949896, "step": 15515 }, { "epoch": 2.311587727137325, "grad_norm": 0.9979636073112488, "learning_rate": 4.7425912969571295e-05, "loss": 0.7236, "num_input_tokens_seen": 8952872, "step": 15520 }, { "epoch": 2.312332439678284, "grad_norm": 0.9468061923980713, "learning_rate": 4.742304000648092e-05, "loss": 0.5895, "num_input_tokens_seen": 8955656, "step": 15525 }, { "epoch": 2.3130771522192433, "grad_norm": 1.5892466306686401, "learning_rate": 4.742016552813267e-05, "loss": 0.5843, "num_input_tokens_seen": 8958440, "step": 15530 }, { "epoch": 2.3138218647602025, "grad_norm": 1.163601040840149, "learning_rate": 4.7417289534720774e-05, "loss": 0.6039, "num_input_tokens_seen": 8961192, "step": 15535 }, { "epoch": 2.3145665773011617, "grad_norm": 1.1297203302383423, "learning_rate": 4.74144120264396e-05, "loss": 0.6144, "num_input_tokens_seen": 8964232, "step": 15540 }, { "epoch": 2.315311289842121, "grad_norm": 0.9938522577285767, "learning_rate": 4.74115330034836e-05, "loss": 0.6628, "num_input_tokens_seen": 8967016, "step": 15545 }, { "epoch": 2.31605600238308, "grad_norm": 1.1799410581588745, "learning_rate": 4.7408652466047313e-05, "loss": 0.497, "num_input_tokens_seen": 8969896, "step": 15550 }, { "epoch": 2.3168007149240393, "grad_norm": 0.686096727848053, "learning_rate": 4.7405770414325404e-05, "loss": 0.6314, "num_input_tokens_seen": 8973160, "step": 15555 }, { "epoch": 2.3175454274649985, "grad_norm": 0.708064615726471, "learning_rate": 4.740288684851262e-05, "loss": 0.716, "num_input_tokens_seen": 8975976, "step": 15560 }, { "epoch": 2.3182901400059577, "grad_norm": 1.0264055728912354, "learning_rate": 4.7400001768803826e-05, "loss": 0.5295, "num_input_tokens_seen": 8978920, "step": 15565 }, { "epoch": 2.319034852546917, "grad_norm": 0.9794843196868896, "learning_rate": 4.739711517539398e-05, "loss": 0.6388, "num_input_tokens_seen": 8981736, "step": 15570 }, { "epoch": 2.319779565087876, "grad_norm": 1.1815780401229858, "learning_rate": 4.7394227068478157e-05, "loss": 0.4647, "num_input_tokens_seen": 8984488, "step": 15575 }, { "epoch": 2.3205242776288353, "grad_norm": 0.6960328817367554, "learning_rate": 4.739133744825152e-05, "loss": 0.7889, "num_input_tokens_seen": 8987336, "step": 15580 }, { "epoch": 2.3212689901697945, "grad_norm": 1.0847803354263306, "learning_rate": 4.738844631490933e-05, "loss": 0.6856, "num_input_tokens_seen": 8990440, "step": 15585 }, { "epoch": 2.3220137027107537, "grad_norm": 1.2683160305023193, "learning_rate": 4.738555366864696e-05, "loss": 0.7022, "num_input_tokens_seen": 8993160, "step": 15590 }, { "epoch": 2.322758415251713, "grad_norm": 1.2213690280914307, "learning_rate": 4.738265950965989e-05, "loss": 0.7325, "num_input_tokens_seen": 8995976, "step": 15595 }, { "epoch": 2.323503127792672, "grad_norm": 1.2392820119857788, "learning_rate": 4.73797638381437e-05, "loss": 0.5174, "num_input_tokens_seen": 8998632, "step": 15600 }, { "epoch": 2.3242478403336313, "grad_norm": 0.6911126375198364, "learning_rate": 4.7376866654294047e-05, "loss": 0.6998, "num_input_tokens_seen": 9001384, "step": 15605 }, { "epoch": 2.3249925528745905, "grad_norm": 0.8323575854301453, "learning_rate": 4.7373967958306724e-05, "loss": 0.5867, "num_input_tokens_seen": 9004040, "step": 15610 }, { "epoch": 2.3257372654155497, "grad_norm": 1.035563349723816, "learning_rate": 4.737106775037762e-05, "loss": 0.6021, "num_input_tokens_seen": 9006984, "step": 15615 }, { "epoch": 2.326481977956509, "grad_norm": 1.4975645542144775, "learning_rate": 4.736816603070271e-05, "loss": 0.5933, "num_input_tokens_seen": 9009928, "step": 15620 }, { "epoch": 2.327226690497468, "grad_norm": 1.4708541631698608, "learning_rate": 4.736526279947807e-05, "loss": 0.6788, "num_input_tokens_seen": 9012840, "step": 15625 }, { "epoch": 2.3279714030384273, "grad_norm": 1.243595004081726, "learning_rate": 4.736235805689992e-05, "loss": 0.6332, "num_input_tokens_seen": 9015400, "step": 15630 }, { "epoch": 2.3287161155793865, "grad_norm": 1.1455540657043457, "learning_rate": 4.735945180316451e-05, "loss": 0.6234, "num_input_tokens_seen": 9018280, "step": 15635 }, { "epoch": 2.3294608281203457, "grad_norm": 1.8020414113998413, "learning_rate": 4.7356544038468266e-05, "loss": 0.6512, "num_input_tokens_seen": 9021096, "step": 15640 }, { "epoch": 2.330205540661305, "grad_norm": 1.0114586353302002, "learning_rate": 4.735363476300767e-05, "loss": 0.7422, "num_input_tokens_seen": 9023976, "step": 15645 }, { "epoch": 2.330950253202264, "grad_norm": 2.0393917560577393, "learning_rate": 4.735072397697932e-05, "loss": 0.7901, "num_input_tokens_seen": 9026920, "step": 15650 }, { "epoch": 2.331694965743223, "grad_norm": 1.1719695329666138, "learning_rate": 4.734781168057991e-05, "loss": 0.5116, "num_input_tokens_seen": 9029576, "step": 15655 }, { "epoch": 2.3324396782841825, "grad_norm": 1.5841166973114014, "learning_rate": 4.734489787400626e-05, "loss": 0.7308, "num_input_tokens_seen": 9032296, "step": 15660 }, { "epoch": 2.3331843908251413, "grad_norm": 0.9419170022010803, "learning_rate": 4.7341982557455245e-05, "loss": 0.5322, "num_input_tokens_seen": 9035272, "step": 15665 }, { "epoch": 2.333929103366101, "grad_norm": 1.0535675287246704, "learning_rate": 4.733906573112389e-05, "loss": 0.6946, "num_input_tokens_seen": 9038120, "step": 15670 }, { "epoch": 2.3346738159070597, "grad_norm": 0.9698615670204163, "learning_rate": 4.7336147395209294e-05, "loss": 0.6174, "num_input_tokens_seen": 9040840, "step": 15675 }, { "epoch": 2.335418528448019, "grad_norm": 1.634063720703125, "learning_rate": 4.733322754990867e-05, "loss": 0.7495, "num_input_tokens_seen": 9043848, "step": 15680 }, { "epoch": 2.336163240988978, "grad_norm": 1.6508277654647827, "learning_rate": 4.733030619541934e-05, "loss": 0.7057, "num_input_tokens_seen": 9046696, "step": 15685 }, { "epoch": 2.3369079535299373, "grad_norm": 1.1574984788894653, "learning_rate": 4.732738333193869e-05, "loss": 0.6782, "num_input_tokens_seen": 9049928, "step": 15690 }, { "epoch": 2.3376526660708965, "grad_norm": 1.1222034692764282, "learning_rate": 4.7324458959664256e-05, "loss": 0.4937, "num_input_tokens_seen": 9052872, "step": 15695 }, { "epoch": 2.3383973786118557, "grad_norm": 0.7318822145462036, "learning_rate": 4.7321533078793655e-05, "loss": 0.6863, "num_input_tokens_seen": 9055624, "step": 15700 }, { "epoch": 2.339142091152815, "grad_norm": 0.8525209426879883, "learning_rate": 4.73186056895246e-05, "loss": 0.6427, "num_input_tokens_seen": 9058280, "step": 15705 }, { "epoch": 2.339886803693774, "grad_norm": 1.3948373794555664, "learning_rate": 4.731567679205491e-05, "loss": 0.6829, "num_input_tokens_seen": 9061000, "step": 15710 }, { "epoch": 2.3406315162347333, "grad_norm": 1.4215760231018066, "learning_rate": 4.731274638658251e-05, "loss": 0.637, "num_input_tokens_seen": 9064296, "step": 15715 }, { "epoch": 2.3413762287756925, "grad_norm": 1.0395033359527588, "learning_rate": 4.7309814473305427e-05, "loss": 0.6205, "num_input_tokens_seen": 9066856, "step": 15720 }, { "epoch": 2.3421209413166517, "grad_norm": 1.122012972831726, "learning_rate": 4.730688105242179e-05, "loss": 0.4434, "num_input_tokens_seen": 9069800, "step": 15725 }, { "epoch": 2.342865653857611, "grad_norm": 1.19230055809021, "learning_rate": 4.7303946124129824e-05, "loss": 0.6757, "num_input_tokens_seen": 9072744, "step": 15730 }, { "epoch": 2.34361036639857, "grad_norm": 1.3534458875656128, "learning_rate": 4.730100968862786e-05, "loss": 0.4556, "num_input_tokens_seen": 9075528, "step": 15735 }, { "epoch": 2.3443550789395293, "grad_norm": 1.1660561561584473, "learning_rate": 4.7298071746114326e-05, "loss": 0.6437, "num_input_tokens_seen": 9078248, "step": 15740 }, { "epoch": 2.3450997914804885, "grad_norm": 1.8476674556732178, "learning_rate": 4.729513229678776e-05, "loss": 0.738, "num_input_tokens_seen": 9081096, "step": 15745 }, { "epoch": 2.3458445040214477, "grad_norm": 1.1093279123306274, "learning_rate": 4.7292191340846806e-05, "loss": 0.7975, "num_input_tokens_seen": 9083688, "step": 15750 }, { "epoch": 2.346589216562407, "grad_norm": 1.1015843152999878, "learning_rate": 4.728924887849019e-05, "loss": 0.5962, "num_input_tokens_seen": 9086312, "step": 15755 }, { "epoch": 2.347333929103366, "grad_norm": 1.2890247106552124, "learning_rate": 4.728630490991676e-05, "loss": 0.7043, "num_input_tokens_seen": 9089032, "step": 15760 }, { "epoch": 2.3480786416443253, "grad_norm": 1.4933370351791382, "learning_rate": 4.728335943532545e-05, "loss": 0.5669, "num_input_tokens_seen": 9091816, "step": 15765 }, { "epoch": 2.3488233541852845, "grad_norm": 1.6078243255615234, "learning_rate": 4.7280412454915316e-05, "loss": 0.5852, "num_input_tokens_seen": 9094920, "step": 15770 }, { "epoch": 2.3495680667262437, "grad_norm": 1.2493454217910767, "learning_rate": 4.727746396888548e-05, "loss": 0.6816, "num_input_tokens_seen": 9097960, "step": 15775 }, { "epoch": 2.350312779267203, "grad_norm": 1.6912161111831665, "learning_rate": 4.7274513977435206e-05, "loss": 0.5656, "num_input_tokens_seen": 9100872, "step": 15780 }, { "epoch": 2.351057491808162, "grad_norm": 1.5480633974075317, "learning_rate": 4.7271562480763845e-05, "loss": 0.7515, "num_input_tokens_seen": 9103720, "step": 15785 }, { "epoch": 2.3518022043491214, "grad_norm": 1.0010725259780884, "learning_rate": 4.726860947907084e-05, "loss": 0.6664, "num_input_tokens_seen": 9106632, "step": 15790 }, { "epoch": 2.3525469168900806, "grad_norm": 1.691950798034668, "learning_rate": 4.726565497255575e-05, "loss": 0.7674, "num_input_tokens_seen": 9109800, "step": 15795 }, { "epoch": 2.3532916294310398, "grad_norm": 0.8125618100166321, "learning_rate": 4.7262698961418206e-05, "loss": 0.627, "num_input_tokens_seen": 9112456, "step": 15800 }, { "epoch": 2.354036341971999, "grad_norm": 2.046505928039551, "learning_rate": 4.7259741445857994e-05, "loss": 0.6859, "num_input_tokens_seen": 9115528, "step": 15805 }, { "epoch": 2.354781054512958, "grad_norm": 1.0004562139511108, "learning_rate": 4.7256782426074956e-05, "loss": 0.6104, "num_input_tokens_seen": 9118344, "step": 15810 }, { "epoch": 2.3555257670539174, "grad_norm": 0.8747166395187378, "learning_rate": 4.725382190226904e-05, "loss": 0.6918, "num_input_tokens_seen": 9121224, "step": 15815 }, { "epoch": 2.3562704795948766, "grad_norm": 1.0593647956848145, "learning_rate": 4.725085987464032e-05, "loss": 0.7255, "num_input_tokens_seen": 9123912, "step": 15820 }, { "epoch": 2.3570151921358358, "grad_norm": 0.7698802351951599, "learning_rate": 4.724789634338897e-05, "loss": 0.593, "num_input_tokens_seen": 9126600, "step": 15825 }, { "epoch": 2.3577599046767945, "grad_norm": 0.8783403635025024, "learning_rate": 4.7244931308715215e-05, "loss": 0.6888, "num_input_tokens_seen": 9129384, "step": 15830 }, { "epoch": 2.358504617217754, "grad_norm": 0.9448275566101074, "learning_rate": 4.724196477081946e-05, "loss": 0.6373, "num_input_tokens_seen": 9132264, "step": 15835 }, { "epoch": 2.359249329758713, "grad_norm": 1.2387769222259521, "learning_rate": 4.723899672990215e-05, "loss": 0.7751, "num_input_tokens_seen": 9134856, "step": 15840 }, { "epoch": 2.359994042299672, "grad_norm": 1.1129366159439087, "learning_rate": 4.7236027186163856e-05, "loss": 0.7213, "num_input_tokens_seen": 9137768, "step": 15845 }, { "epoch": 2.3607387548406313, "grad_norm": 0.8047318458557129, "learning_rate": 4.7233056139805254e-05, "loss": 0.5763, "num_input_tokens_seen": 9140456, "step": 15850 }, { "epoch": 2.3614834673815905, "grad_norm": 0.6956170201301575, "learning_rate": 4.7230083591027106e-05, "loss": 0.6519, "num_input_tokens_seen": 9143496, "step": 15855 }, { "epoch": 2.3622281799225497, "grad_norm": 1.074511170387268, "learning_rate": 4.72271095400303e-05, "loss": 0.6502, "num_input_tokens_seen": 9146312, "step": 15860 }, { "epoch": 2.362972892463509, "grad_norm": 0.828300952911377, "learning_rate": 4.722413398701579e-05, "loss": 0.6142, "num_input_tokens_seen": 9149128, "step": 15865 }, { "epoch": 2.363717605004468, "grad_norm": 1.0093196630477905, "learning_rate": 4.722115693218467e-05, "loss": 0.5728, "num_input_tokens_seen": 9151752, "step": 15870 }, { "epoch": 2.3644623175454274, "grad_norm": 0.9800338745117188, "learning_rate": 4.72181783757381e-05, "loss": 0.4645, "num_input_tokens_seen": 9154408, "step": 15875 }, { "epoch": 2.3652070300863866, "grad_norm": 1.9065626859664917, "learning_rate": 4.721519831787737e-05, "loss": 0.7561, "num_input_tokens_seen": 9157736, "step": 15880 }, { "epoch": 2.3659517426273458, "grad_norm": 0.5910836458206177, "learning_rate": 4.721221675880386e-05, "loss": 0.5301, "num_input_tokens_seen": 9160552, "step": 15885 }, { "epoch": 2.366696455168305, "grad_norm": 0.6897431015968323, "learning_rate": 4.7209233698719056e-05, "loss": 0.79, "num_input_tokens_seen": 9163272, "step": 15890 }, { "epoch": 2.367441167709264, "grad_norm": 1.2730631828308105, "learning_rate": 4.7206249137824535e-05, "loss": 0.5385, "num_input_tokens_seen": 9166120, "step": 15895 }, { "epoch": 2.3681858802502234, "grad_norm": 1.1844255924224854, "learning_rate": 4.7203263076321966e-05, "loss": 0.5647, "num_input_tokens_seen": 9168968, "step": 15900 }, { "epoch": 2.3689305927911826, "grad_norm": 0.9556328058242798, "learning_rate": 4.720027551441316e-05, "loss": 0.7143, "num_input_tokens_seen": 9171816, "step": 15905 }, { "epoch": 2.3696753053321418, "grad_norm": 1.0012930631637573, "learning_rate": 4.719728645229999e-05, "loss": 0.7564, "num_input_tokens_seen": 9174728, "step": 15910 }, { "epoch": 2.370420017873101, "grad_norm": 0.9493711590766907, "learning_rate": 4.719429589018446e-05, "loss": 0.6057, "num_input_tokens_seen": 9177544, "step": 15915 }, { "epoch": 2.37116473041406, "grad_norm": 1.9888567924499512, "learning_rate": 4.719130382826864e-05, "loss": 0.7375, "num_input_tokens_seen": 9180232, "step": 15920 }, { "epoch": 2.3719094429550194, "grad_norm": 1.3955883979797363, "learning_rate": 4.718831026675473e-05, "loss": 0.6009, "num_input_tokens_seen": 9183048, "step": 15925 }, { "epoch": 2.3726541554959786, "grad_norm": 1.0632845163345337, "learning_rate": 4.718531520584503e-05, "loss": 0.5839, "num_input_tokens_seen": 9185800, "step": 15930 }, { "epoch": 2.373398868036938, "grad_norm": 0.7193822860717773, "learning_rate": 4.718231864574193e-05, "loss": 0.5769, "num_input_tokens_seen": 9188904, "step": 15935 }, { "epoch": 2.374143580577897, "grad_norm": 1.0065841674804688, "learning_rate": 4.717932058664791e-05, "loss": 0.5684, "num_input_tokens_seen": 9191784, "step": 15940 }, { "epoch": 2.374888293118856, "grad_norm": 1.0261908769607544, "learning_rate": 4.717632102876559e-05, "loss": 0.7242, "num_input_tokens_seen": 9194728, "step": 15945 }, { "epoch": 2.3756330056598154, "grad_norm": 0.9182767271995544, "learning_rate": 4.717331997229765e-05, "loss": 0.6656, "num_input_tokens_seen": 9197416, "step": 15950 }, { "epoch": 2.3763777182007746, "grad_norm": 1.250828504562378, "learning_rate": 4.71703174174469e-05, "loss": 0.6691, "num_input_tokens_seen": 9200648, "step": 15955 }, { "epoch": 2.377122430741734, "grad_norm": 0.8706116676330566, "learning_rate": 4.7167313364416234e-05, "loss": 0.7578, "num_input_tokens_seen": 9203464, "step": 15960 }, { "epoch": 2.377867143282693, "grad_norm": 0.9438499212265015, "learning_rate": 4.716430781340866e-05, "loss": 0.6145, "num_input_tokens_seen": 9206216, "step": 15965 }, { "epoch": 2.378611855823652, "grad_norm": 0.7984596490859985, "learning_rate": 4.716130076462728e-05, "loss": 0.6325, "num_input_tokens_seen": 9208840, "step": 15970 }, { "epoch": 2.3793565683646114, "grad_norm": 0.9168867468833923, "learning_rate": 4.715829221827529e-05, "loss": 0.6818, "num_input_tokens_seen": 9212008, "step": 15975 }, { "epoch": 2.3801012809055706, "grad_norm": 0.9592291116714478, "learning_rate": 4.7155282174556004e-05, "loss": 0.5968, "num_input_tokens_seen": 9214920, "step": 15980 }, { "epoch": 2.38084599344653, "grad_norm": 1.4927104711532593, "learning_rate": 4.7152270633672826e-05, "loss": 0.6254, "num_input_tokens_seen": 9217768, "step": 15985 }, { "epoch": 2.381590705987489, "grad_norm": 1.4281623363494873, "learning_rate": 4.7149257595829255e-05, "loss": 0.7284, "num_input_tokens_seen": 9220488, "step": 15990 }, { "epoch": 2.382335418528448, "grad_norm": 1.0439566373825073, "learning_rate": 4.714624306122892e-05, "loss": 0.6976, "num_input_tokens_seen": 9223304, "step": 15995 }, { "epoch": 2.3830801310694074, "grad_norm": 1.074482798576355, "learning_rate": 4.714322703007551e-05, "loss": 0.6649, "num_input_tokens_seen": 9226280, "step": 16000 }, { "epoch": 2.383824843610366, "grad_norm": 1.4294673204421997, "learning_rate": 4.7140209502572855e-05, "loss": 0.6898, "num_input_tokens_seen": 9229096, "step": 16005 }, { "epoch": 2.384569556151326, "grad_norm": 0.9881172776222229, "learning_rate": 4.713719047892484e-05, "loss": 0.5268, "num_input_tokens_seen": 9232008, "step": 16010 }, { "epoch": 2.3853142686922846, "grad_norm": 1.6113522052764893, "learning_rate": 4.713416995933551e-05, "loss": 0.659, "num_input_tokens_seen": 9235080, "step": 16015 }, { "epoch": 2.386058981233244, "grad_norm": 1.2555012702941895, "learning_rate": 4.7131147944008965e-05, "loss": 0.6796, "num_input_tokens_seen": 9238056, "step": 16020 }, { "epoch": 2.386803693774203, "grad_norm": 1.744829773902893, "learning_rate": 4.712812443314941e-05, "loss": 0.7453, "num_input_tokens_seen": 9241192, "step": 16025 }, { "epoch": 2.387548406315162, "grad_norm": 0.7507405877113342, "learning_rate": 4.7125099426961185e-05, "loss": 0.7899, "num_input_tokens_seen": 9243848, "step": 16030 }, { "epoch": 2.3882931188561214, "grad_norm": 2.0750277042388916, "learning_rate": 4.712207292564869e-05, "loss": 0.6738, "num_input_tokens_seen": 9246728, "step": 16035 }, { "epoch": 2.3890378313970806, "grad_norm": 0.651394248008728, "learning_rate": 4.7119044929416443e-05, "loss": 0.5364, "num_input_tokens_seen": 9249672, "step": 16040 }, { "epoch": 2.38978254393804, "grad_norm": 0.8397552967071533, "learning_rate": 4.7116015438469074e-05, "loss": 0.7619, "num_input_tokens_seen": 9252456, "step": 16045 }, { "epoch": 2.390527256478999, "grad_norm": 0.8706160187721252, "learning_rate": 4.7112984453011297e-05, "loss": 0.7557, "num_input_tokens_seen": 9255432, "step": 16050 }, { "epoch": 2.391271969019958, "grad_norm": 5.2579264640808105, "learning_rate": 4.7109951973247945e-05, "loss": 0.802, "num_input_tokens_seen": 9258344, "step": 16055 }, { "epoch": 2.3920166815609174, "grad_norm": 1.4653103351593018, "learning_rate": 4.7106917999383926e-05, "loss": 0.6247, "num_input_tokens_seen": 9261416, "step": 16060 }, { "epoch": 2.3927613941018766, "grad_norm": 3.8014137744903564, "learning_rate": 4.710388253162426e-05, "loss": 0.6963, "num_input_tokens_seen": 9264264, "step": 16065 }, { "epoch": 2.393506106642836, "grad_norm": 0.9586682319641113, "learning_rate": 4.710084557017409e-05, "loss": 0.7161, "num_input_tokens_seen": 9267400, "step": 16070 }, { "epoch": 2.394250819183795, "grad_norm": 1.637892246246338, "learning_rate": 4.709780711523862e-05, "loss": 0.7723, "num_input_tokens_seen": 9270184, "step": 16075 }, { "epoch": 2.394995531724754, "grad_norm": 29.15627670288086, "learning_rate": 4.70947671670232e-05, "loss": 0.6041, "num_input_tokens_seen": 9272776, "step": 16080 }, { "epoch": 2.3957402442657134, "grad_norm": 1.95724618434906, "learning_rate": 4.709172572573325e-05, "loss": 0.637, "num_input_tokens_seen": 9275816, "step": 16085 }, { "epoch": 2.3964849568066726, "grad_norm": 2.2996435165405273, "learning_rate": 4.708868279157428e-05, "loss": 0.7169, "num_input_tokens_seen": 9278696, "step": 16090 }, { "epoch": 2.397229669347632, "grad_norm": 0.5731326937675476, "learning_rate": 4.7085638364751936e-05, "loss": 0.6547, "num_input_tokens_seen": 9281672, "step": 16095 }, { "epoch": 2.397974381888591, "grad_norm": 0.8912832140922546, "learning_rate": 4.7082592445471954e-05, "loss": 0.6404, "num_input_tokens_seen": 9284552, "step": 16100 }, { "epoch": 2.3987190944295502, "grad_norm": 2.273705005645752, "learning_rate": 4.7079545033940155e-05, "loss": 0.5996, "num_input_tokens_seen": 9287368, "step": 16105 }, { "epoch": 2.3994638069705094, "grad_norm": 3.3731765747070312, "learning_rate": 4.7076496130362455e-05, "loss": 0.4573, "num_input_tokens_seen": 9290376, "step": 16110 }, { "epoch": 2.4002085195114686, "grad_norm": 1.6255664825439453, "learning_rate": 4.707344573494492e-05, "loss": 0.6271, "num_input_tokens_seen": 9292968, "step": 16115 }, { "epoch": 2.400953232052428, "grad_norm": 13.540799140930176, "learning_rate": 4.7070393847893665e-05, "loss": 0.5716, "num_input_tokens_seen": 9295944, "step": 16120 }, { "epoch": 2.401697944593387, "grad_norm": 6.653060436248779, "learning_rate": 4.706734046941492e-05, "loss": 0.5557, "num_input_tokens_seen": 9298568, "step": 16125 }, { "epoch": 2.4024426571343462, "grad_norm": 1.2299411296844482, "learning_rate": 4.706428559971502e-05, "loss": 0.6254, "num_input_tokens_seen": 9301576, "step": 16130 }, { "epoch": 2.4031873696753054, "grad_norm": 0.9433853030204773, "learning_rate": 4.706122923900042e-05, "loss": 0.7572, "num_input_tokens_seen": 9304648, "step": 16135 }, { "epoch": 2.4039320822162646, "grad_norm": 1.0269793272018433, "learning_rate": 4.705817138747763e-05, "loss": 0.6123, "num_input_tokens_seen": 9307400, "step": 16140 }, { "epoch": 2.404676794757224, "grad_norm": 0.6453197598457336, "learning_rate": 4.7055112045353304e-05, "loss": 0.7844, "num_input_tokens_seen": 9310120, "step": 16145 }, { "epoch": 2.405421507298183, "grad_norm": 2.255919933319092, "learning_rate": 4.705205121283418e-05, "loss": 0.7551, "num_input_tokens_seen": 9312872, "step": 16150 }, { "epoch": 2.4061662198391423, "grad_norm": 1.5344798564910889, "learning_rate": 4.704898889012709e-05, "loss": 0.7554, "num_input_tokens_seen": 9315752, "step": 16155 }, { "epoch": 2.4069109323801015, "grad_norm": 1.528842568397522, "learning_rate": 4.704592507743897e-05, "loss": 0.6319, "num_input_tokens_seen": 9318696, "step": 16160 }, { "epoch": 2.4076556449210607, "grad_norm": 0.6505941152572632, "learning_rate": 4.704285977497687e-05, "loss": 0.5974, "num_input_tokens_seen": 9321288, "step": 16165 }, { "epoch": 2.4084003574620194, "grad_norm": 0.9260154366493225, "learning_rate": 4.703979298294793e-05, "loss": 0.6822, "num_input_tokens_seen": 9323848, "step": 16170 }, { "epoch": 2.409145070002979, "grad_norm": 0.6820340156555176, "learning_rate": 4.703672470155938e-05, "loss": 0.594, "num_input_tokens_seen": 9326856, "step": 16175 }, { "epoch": 2.409889782543938, "grad_norm": 0.8720170855522156, "learning_rate": 4.703365493101857e-05, "loss": 0.5766, "num_input_tokens_seen": 9329832, "step": 16180 }, { "epoch": 2.4106344950848975, "grad_norm": 1.071922779083252, "learning_rate": 4.703058367153295e-05, "loss": 0.7202, "num_input_tokens_seen": 9332616, "step": 16185 }, { "epoch": 2.4113792076258562, "grad_norm": 1.002766489982605, "learning_rate": 4.702751092331005e-05, "loss": 0.8387, "num_input_tokens_seen": 9335848, "step": 16190 }, { "epoch": 2.4121239201668154, "grad_norm": 1.374539852142334, "learning_rate": 4.7024436686557516e-05, "loss": 0.5721, "num_input_tokens_seen": 9339176, "step": 16195 }, { "epoch": 2.4128686327077746, "grad_norm": 0.5799520015716553, "learning_rate": 4.70213609614831e-05, "loss": 0.6606, "num_input_tokens_seen": 9342088, "step": 16200 }, { "epoch": 2.413613345248734, "grad_norm": 1.7357103824615479, "learning_rate": 4.701828374829464e-05, "loss": 0.5962, "num_input_tokens_seen": 9344904, "step": 16205 }, { "epoch": 2.414358057789693, "grad_norm": 1.912761926651001, "learning_rate": 4.701520504720008e-05, "loss": 0.6763, "num_input_tokens_seen": 9347592, "step": 16210 }, { "epoch": 2.4151027703306522, "grad_norm": 1.1180039644241333, "learning_rate": 4.701212485840748e-05, "loss": 0.7252, "num_input_tokens_seen": 9350504, "step": 16215 }, { "epoch": 2.4158474828716114, "grad_norm": 0.7585243582725525, "learning_rate": 4.7009043182124966e-05, "loss": 0.8201, "num_input_tokens_seen": 9353608, "step": 16220 }, { "epoch": 2.4165921954125706, "grad_norm": 1.0935379266738892, "learning_rate": 4.700596001856081e-05, "loss": 0.5818, "num_input_tokens_seen": 9356712, "step": 16225 }, { "epoch": 2.41733690795353, "grad_norm": 0.7482426762580872, "learning_rate": 4.700287536792334e-05, "loss": 0.527, "num_input_tokens_seen": 9359560, "step": 16230 }, { "epoch": 2.418081620494489, "grad_norm": 0.9244526028633118, "learning_rate": 4.6999789230421e-05, "loss": 0.5363, "num_input_tokens_seen": 9362376, "step": 16235 }, { "epoch": 2.4188263330354483, "grad_norm": 0.7083542943000793, "learning_rate": 4.699670160626236e-05, "loss": 0.6115, "num_input_tokens_seen": 9365224, "step": 16240 }, { "epoch": 2.4195710455764075, "grad_norm": 2.0320186614990234, "learning_rate": 4.699361249565605e-05, "loss": 0.7919, "num_input_tokens_seen": 9368168, "step": 16245 }, { "epoch": 2.4203157581173667, "grad_norm": 1.2436350584030151, "learning_rate": 4.699052189881083e-05, "loss": 0.7095, "num_input_tokens_seen": 9371176, "step": 16250 }, { "epoch": 2.421060470658326, "grad_norm": 3.6652352809906006, "learning_rate": 4.698742981593555e-05, "loss": 0.8311, "num_input_tokens_seen": 9374216, "step": 16255 }, { "epoch": 2.421805183199285, "grad_norm": 1.1134288311004639, "learning_rate": 4.6984336247239155e-05, "loss": 0.6825, "num_input_tokens_seen": 9377192, "step": 16260 }, { "epoch": 2.4225498957402443, "grad_norm": 0.4648077189922333, "learning_rate": 4.69812411929307e-05, "loss": 0.5187, "num_input_tokens_seen": 9380200, "step": 16265 }, { "epoch": 2.4232946082812035, "grad_norm": 0.851477861404419, "learning_rate": 4.697814465321934e-05, "loss": 0.7287, "num_input_tokens_seen": 9382984, "step": 16270 }, { "epoch": 2.4240393208221627, "grad_norm": 1.0756253004074097, "learning_rate": 4.6975046628314304e-05, "loss": 0.7034, "num_input_tokens_seen": 9385512, "step": 16275 }, { "epoch": 2.424784033363122, "grad_norm": 0.5947380661964417, "learning_rate": 4.6971947118424976e-05, "loss": 0.5689, "num_input_tokens_seen": 9388520, "step": 16280 }, { "epoch": 2.425528745904081, "grad_norm": 1.1999059915542603, "learning_rate": 4.6968846123760786e-05, "loss": 0.6719, "num_input_tokens_seen": 9391304, "step": 16285 }, { "epoch": 2.4262734584450403, "grad_norm": 0.9700248837471008, "learning_rate": 4.696574364453129e-05, "loss": 0.6549, "num_input_tokens_seen": 9393928, "step": 16290 }, { "epoch": 2.4270181709859995, "grad_norm": 1.2028820514678955, "learning_rate": 4.6962639680946154e-05, "loss": 0.5851, "num_input_tokens_seen": 9396680, "step": 16295 }, { "epoch": 2.4277628835269587, "grad_norm": 1.2359157800674438, "learning_rate": 4.6959534233215116e-05, "loss": 0.7873, "num_input_tokens_seen": 9399752, "step": 16300 }, { "epoch": 2.428507596067918, "grad_norm": 1.1660407781600952, "learning_rate": 4.695642730154804e-05, "loss": 0.7077, "num_input_tokens_seen": 9402536, "step": 16305 }, { "epoch": 2.429252308608877, "grad_norm": 1.2311809062957764, "learning_rate": 4.695331888615487e-05, "loss": 0.5828, "num_input_tokens_seen": 9405416, "step": 16310 }, { "epoch": 2.4299970211498363, "grad_norm": 1.386870265007019, "learning_rate": 4.695020898724567e-05, "loss": 0.8286, "num_input_tokens_seen": 9408360, "step": 16315 }, { "epoch": 2.4307417336907955, "grad_norm": 1.227386474609375, "learning_rate": 4.694709760503059e-05, "loss": 0.657, "num_input_tokens_seen": 9411432, "step": 16320 }, { "epoch": 2.4314864462317547, "grad_norm": 0.9808677434921265, "learning_rate": 4.694398473971988e-05, "loss": 0.7228, "num_input_tokens_seen": 9414312, "step": 16325 }, { "epoch": 2.432231158772714, "grad_norm": 1.2791831493377686, "learning_rate": 4.6940870391523905e-05, "loss": 0.8601, "num_input_tokens_seen": 9416968, "step": 16330 }, { "epoch": 2.432975871313673, "grad_norm": 0.818233847618103, "learning_rate": 4.693775456065311e-05, "loss": 0.7202, "num_input_tokens_seen": 9419688, "step": 16335 }, { "epoch": 2.4337205838546323, "grad_norm": 1.0562702417373657, "learning_rate": 4.693463724731805e-05, "loss": 0.6757, "num_input_tokens_seen": 9422760, "step": 16340 }, { "epoch": 2.434465296395591, "grad_norm": 1.474922776222229, "learning_rate": 4.693151845172939e-05, "loss": 0.6968, "num_input_tokens_seen": 9425512, "step": 16345 }, { "epoch": 2.4352100089365507, "grad_norm": 0.8619655966758728, "learning_rate": 4.692839817409788e-05, "loss": 0.6037, "num_input_tokens_seen": 9428520, "step": 16350 }, { "epoch": 2.4359547214775095, "grad_norm": 0.5854871869087219, "learning_rate": 4.6925276414634375e-05, "loss": 0.8361, "num_input_tokens_seen": 9431432, "step": 16355 }, { "epoch": 2.436699434018469, "grad_norm": 0.5557618737220764, "learning_rate": 4.6922153173549835e-05, "loss": 0.5653, "num_input_tokens_seen": 9434312, "step": 16360 }, { "epoch": 2.437444146559428, "grad_norm": 1.0646394491195679, "learning_rate": 4.691902845105531e-05, "loss": 0.7201, "num_input_tokens_seen": 9437224, "step": 16365 }, { "epoch": 2.438188859100387, "grad_norm": 0.8714174628257751, "learning_rate": 4.6915902247361954e-05, "loss": 0.624, "num_input_tokens_seen": 9439976, "step": 16370 }, { "epoch": 2.4389335716413463, "grad_norm": 0.960195779800415, "learning_rate": 4.691277456268103e-05, "loss": 0.7064, "num_input_tokens_seen": 9443208, "step": 16375 }, { "epoch": 2.4396782841823055, "grad_norm": 1.6275975704193115, "learning_rate": 4.690964539722389e-05, "loss": 0.5561, "num_input_tokens_seen": 9445864, "step": 16380 }, { "epoch": 2.4404229967232647, "grad_norm": 0.850094735622406, "learning_rate": 4.6906514751202005e-05, "loss": 0.5567, "num_input_tokens_seen": 9448936, "step": 16385 }, { "epoch": 2.441167709264224, "grad_norm": 1.0507402420043945, "learning_rate": 4.690338262482691e-05, "loss": 0.5729, "num_input_tokens_seen": 9451976, "step": 16390 }, { "epoch": 2.441912421805183, "grad_norm": 0.9124107956886292, "learning_rate": 4.690024901831026e-05, "loss": 0.6121, "num_input_tokens_seen": 9454824, "step": 16395 }, { "epoch": 2.4426571343461423, "grad_norm": 1.2857698202133179, "learning_rate": 4.6897113931863837e-05, "loss": 0.7436, "num_input_tokens_seen": 9457704, "step": 16400 }, { "epoch": 2.4434018468871015, "grad_norm": 0.7743210792541504, "learning_rate": 4.6893977365699474e-05, "loss": 0.6005, "num_input_tokens_seen": 9460776, "step": 16405 }, { "epoch": 2.4441465594280607, "grad_norm": 0.6162790060043335, "learning_rate": 4.6890839320029134e-05, "loss": 0.6852, "num_input_tokens_seen": 9463848, "step": 16410 }, { "epoch": 2.44489127196902, "grad_norm": 1.6961556673049927, "learning_rate": 4.688769979506488e-05, "loss": 0.6184, "num_input_tokens_seen": 9466568, "step": 16415 }, { "epoch": 2.445635984509979, "grad_norm": 0.835956335067749, "learning_rate": 4.688455879101885e-05, "loss": 0.5755, "num_input_tokens_seen": 9469800, "step": 16420 }, { "epoch": 2.4463806970509383, "grad_norm": 1.1197679042816162, "learning_rate": 4.688141630810333e-05, "loss": 0.6412, "num_input_tokens_seen": 9472712, "step": 16425 }, { "epoch": 2.4471254095918975, "grad_norm": 0.7500296235084534, "learning_rate": 4.687827234653065e-05, "loss": 0.6077, "num_input_tokens_seen": 9475656, "step": 16430 }, { "epoch": 2.4478701221328567, "grad_norm": 1.4306607246398926, "learning_rate": 4.687512690651328e-05, "loss": 0.5175, "num_input_tokens_seen": 9478312, "step": 16435 }, { "epoch": 2.448614834673816, "grad_norm": 1.0915154218673706, "learning_rate": 4.687197998826376e-05, "loss": 0.5815, "num_input_tokens_seen": 9481352, "step": 16440 }, { "epoch": 2.449359547214775, "grad_norm": 1.4974031448364258, "learning_rate": 4.686883159199477e-05, "loss": 0.6835, "num_input_tokens_seen": 9484072, "step": 16445 }, { "epoch": 2.4501042597557343, "grad_norm": 0.9293982982635498, "learning_rate": 4.6865681717919047e-05, "loss": 0.6968, "num_input_tokens_seen": 9487080, "step": 16450 }, { "epoch": 2.4508489722966935, "grad_norm": 1.8467190265655518, "learning_rate": 4.686253036624946e-05, "loss": 0.7178, "num_input_tokens_seen": 9489928, "step": 16455 }, { "epoch": 2.4515936848376527, "grad_norm": 0.9678923487663269, "learning_rate": 4.6859377537198945e-05, "loss": 0.7972, "num_input_tokens_seen": 9493128, "step": 16460 }, { "epoch": 2.452338397378612, "grad_norm": 1.5371133089065552, "learning_rate": 4.6856223230980576e-05, "loss": 0.7981, "num_input_tokens_seen": 9496072, "step": 16465 }, { "epoch": 2.453083109919571, "grad_norm": 0.9188042879104614, "learning_rate": 4.6853067447807505e-05, "loss": 0.666, "num_input_tokens_seen": 9498984, "step": 16470 }, { "epoch": 2.4538278224605303, "grad_norm": 0.7265926003456116, "learning_rate": 4.684991018789298e-05, "loss": 0.6136, "num_input_tokens_seen": 9501864, "step": 16475 }, { "epoch": 2.4545725350014895, "grad_norm": 1.4775174856185913, "learning_rate": 4.6846751451450366e-05, "loss": 0.7893, "num_input_tokens_seen": 9504840, "step": 16480 }, { "epoch": 2.4553172475424487, "grad_norm": 1.2875471115112305, "learning_rate": 4.684359123869311e-05, "loss": 0.6144, "num_input_tokens_seen": 9507496, "step": 16485 }, { "epoch": 2.456061960083408, "grad_norm": 1.5886858701705933, "learning_rate": 4.684042954983476e-05, "loss": 0.5385, "num_input_tokens_seen": 9510376, "step": 16490 }, { "epoch": 2.456806672624367, "grad_norm": 0.7729721069335938, "learning_rate": 4.683726638508899e-05, "loss": 0.4549, "num_input_tokens_seen": 9513160, "step": 16495 }, { "epoch": 2.4575513851653263, "grad_norm": 0.8005237579345703, "learning_rate": 4.6834101744669526e-05, "loss": 0.6707, "num_input_tokens_seen": 9516168, "step": 16500 }, { "epoch": 2.4582960977062855, "grad_norm": 1.818528175354004, "learning_rate": 4.683093562879024e-05, "loss": 0.7755, "num_input_tokens_seen": 9519048, "step": 16505 }, { "epoch": 2.4590408102472447, "grad_norm": 1.0656580924987793, "learning_rate": 4.682776803766509e-05, "loss": 0.6573, "num_input_tokens_seen": 9521704, "step": 16510 }, { "epoch": 2.459785522788204, "grad_norm": 1.056905746459961, "learning_rate": 4.682459897150812e-05, "loss": 0.5424, "num_input_tokens_seen": 9524392, "step": 16515 }, { "epoch": 2.4605302353291627, "grad_norm": 1.6523991823196411, "learning_rate": 4.682142843053348e-05, "loss": 0.7356, "num_input_tokens_seen": 9527720, "step": 16520 }, { "epoch": 2.4612749478701224, "grad_norm": 0.7285161018371582, "learning_rate": 4.681825641495543e-05, "loss": 0.6087, "num_input_tokens_seen": 9530568, "step": 16525 }, { "epoch": 2.462019660411081, "grad_norm": 0.7238612771034241, "learning_rate": 4.681508292498832e-05, "loss": 0.8224, "num_input_tokens_seen": 9533512, "step": 16530 }, { "epoch": 2.4627643729520408, "grad_norm": 0.7377212047576904, "learning_rate": 4.681190796084659e-05, "loss": 0.5636, "num_input_tokens_seen": 9536424, "step": 16535 }, { "epoch": 2.4635090854929995, "grad_norm": 1.5707931518554688, "learning_rate": 4.680873152274481e-05, "loss": 0.7502, "num_input_tokens_seen": 9539176, "step": 16540 }, { "epoch": 2.4642537980339587, "grad_norm": 0.8099005818367004, "learning_rate": 4.680555361089762e-05, "loss": 0.6192, "num_input_tokens_seen": 9541864, "step": 16545 }, { "epoch": 2.464998510574918, "grad_norm": 0.9315763711929321, "learning_rate": 4.680237422551977e-05, "loss": 0.6704, "num_input_tokens_seen": 9545064, "step": 16550 }, { "epoch": 2.465743223115877, "grad_norm": 0.9406781196594238, "learning_rate": 4.679919336682611e-05, "loss": 0.8125, "num_input_tokens_seen": 9548008, "step": 16555 }, { "epoch": 2.4664879356568363, "grad_norm": 0.7636608481407166, "learning_rate": 4.6796011035031596e-05, "loss": 0.6403, "num_input_tokens_seen": 9550632, "step": 16560 }, { "epoch": 2.4672326481977955, "grad_norm": 0.6425802707672119, "learning_rate": 4.6792827230351265e-05, "loss": 0.606, "num_input_tokens_seen": 9553576, "step": 16565 }, { "epoch": 2.4679773607387547, "grad_norm": 0.9619606733322144, "learning_rate": 4.678964195300028e-05, "loss": 0.6075, "num_input_tokens_seen": 9556168, "step": 16570 }, { "epoch": 2.468722073279714, "grad_norm": 1.0502041578292847, "learning_rate": 4.678645520319388e-05, "loss": 0.7335, "num_input_tokens_seen": 9558888, "step": 16575 }, { "epoch": 2.469466785820673, "grad_norm": 0.847040057182312, "learning_rate": 4.678326698114741e-05, "loss": 0.6615, "num_input_tokens_seen": 9561768, "step": 16580 }, { "epoch": 2.4702114983616323, "grad_norm": 1.535902976989746, "learning_rate": 4.678007728707633e-05, "loss": 0.6961, "num_input_tokens_seen": 9564776, "step": 16585 }, { "epoch": 2.4709562109025915, "grad_norm": 0.9802411198616028, "learning_rate": 4.6776886121196175e-05, "loss": 0.5725, "num_input_tokens_seen": 9567880, "step": 16590 }, { "epoch": 2.4717009234435507, "grad_norm": 1.3397409915924072, "learning_rate": 4.677369348372259e-05, "loss": 0.6527, "num_input_tokens_seen": 9570984, "step": 16595 }, { "epoch": 2.47244563598451, "grad_norm": 0.7708902359008789, "learning_rate": 4.677049937487134e-05, "loss": 0.6457, "num_input_tokens_seen": 9573992, "step": 16600 }, { "epoch": 2.473190348525469, "grad_norm": 0.7801169157028198, "learning_rate": 4.6767303794858235e-05, "loss": 0.5802, "num_input_tokens_seen": 9577096, "step": 16605 }, { "epoch": 2.4739350610664284, "grad_norm": 1.3244308233261108, "learning_rate": 4.676410674389925e-05, "loss": 0.6986, "num_input_tokens_seen": 9580168, "step": 16610 }, { "epoch": 2.4746797736073876, "grad_norm": 1.1339763402938843, "learning_rate": 4.676090822221042e-05, "loss": 0.6681, "num_input_tokens_seen": 9582888, "step": 16615 }, { "epoch": 2.4754244861483468, "grad_norm": 0.8426891565322876, "learning_rate": 4.6757708230007877e-05, "loss": 0.6751, "num_input_tokens_seen": 9585832, "step": 16620 }, { "epoch": 2.476169198689306, "grad_norm": 1.199317216873169, "learning_rate": 4.6754506767507874e-05, "loss": 0.839, "num_input_tokens_seen": 9588680, "step": 16625 }, { "epoch": 2.476913911230265, "grad_norm": 1.937049388885498, "learning_rate": 4.6751303834926755e-05, "loss": 0.5225, "num_input_tokens_seen": 9591304, "step": 16630 }, { "epoch": 2.4776586237712244, "grad_norm": 1.7380720376968384, "learning_rate": 4.674809943248095e-05, "loss": 0.7992, "num_input_tokens_seen": 9594184, "step": 16635 }, { "epoch": 2.4784033363121836, "grad_norm": 2.830357074737549, "learning_rate": 4.674489356038702e-05, "loss": 0.633, "num_input_tokens_seen": 9597000, "step": 16640 }, { "epoch": 2.4791480488531428, "grad_norm": 0.6197934746742249, "learning_rate": 4.674168621886158e-05, "loss": 0.6515, "num_input_tokens_seen": 9600040, "step": 16645 }, { "epoch": 2.479892761394102, "grad_norm": 1.7588247060775757, "learning_rate": 4.673847740812138e-05, "loss": 0.463, "num_input_tokens_seen": 9602984, "step": 16650 }, { "epoch": 2.480637473935061, "grad_norm": 2.372818946838379, "learning_rate": 4.673526712838326e-05, "loss": 0.7894, "num_input_tokens_seen": 9605672, "step": 16655 }, { "epoch": 2.4813821864760204, "grad_norm": 1.2259224653244019, "learning_rate": 4.673205537986416e-05, "loss": 0.7094, "num_input_tokens_seen": 9608232, "step": 16660 }, { "epoch": 2.4821268990169796, "grad_norm": 1.252041220664978, "learning_rate": 4.672884216278112e-05, "loss": 0.6324, "num_input_tokens_seen": 9610984, "step": 16665 }, { "epoch": 2.482871611557939, "grad_norm": 1.2203903198242188, "learning_rate": 4.672562747735126e-05, "loss": 0.6497, "num_input_tokens_seen": 9614120, "step": 16670 }, { "epoch": 2.483616324098898, "grad_norm": 1.139268159866333, "learning_rate": 4.6722411323791824e-05, "loss": 0.6773, "num_input_tokens_seen": 9617448, "step": 16675 }, { "epoch": 2.484361036639857, "grad_norm": 1.185815453529358, "learning_rate": 4.671919370232015e-05, "loss": 0.7795, "num_input_tokens_seen": 9620520, "step": 16680 }, { "epoch": 2.4851057491808164, "grad_norm": 1.1003203392028809, "learning_rate": 4.671597461315367e-05, "loss": 0.5803, "num_input_tokens_seen": 9623272, "step": 16685 }, { "epoch": 2.4858504617217756, "grad_norm": 1.0655966997146606, "learning_rate": 4.6712754056509924e-05, "loss": 0.6913, "num_input_tokens_seen": 9626088, "step": 16690 }, { "epoch": 2.4865951742627344, "grad_norm": 0.8296844363212585, "learning_rate": 4.670953203260653e-05, "loss": 0.6541, "num_input_tokens_seen": 9629288, "step": 16695 }, { "epoch": 2.487339886803694, "grad_norm": 1.0470678806304932, "learning_rate": 4.6706308541661224e-05, "loss": 0.6182, "num_input_tokens_seen": 9631816, "step": 16700 }, { "epoch": 2.4880845993446528, "grad_norm": 1.18248450756073, "learning_rate": 4.670308358389184e-05, "loss": 0.671, "num_input_tokens_seen": 9634504, "step": 16705 }, { "epoch": 2.488829311885612, "grad_norm": 1.1683779954910278, "learning_rate": 4.66998571595163e-05, "loss": 0.9124, "num_input_tokens_seen": 9637448, "step": 16710 }, { "epoch": 2.489574024426571, "grad_norm": 2.3721981048583984, "learning_rate": 4.6696629268752647e-05, "loss": 0.6805, "num_input_tokens_seen": 9640072, "step": 16715 }, { "epoch": 2.4903187369675304, "grad_norm": 1.3935909271240234, "learning_rate": 4.6693399911818994e-05, "loss": 0.7029, "num_input_tokens_seen": 9642856, "step": 16720 }, { "epoch": 2.4910634495084896, "grad_norm": 1.3496382236480713, "learning_rate": 4.669016908893358e-05, "loss": 0.6714, "num_input_tokens_seen": 9645640, "step": 16725 }, { "epoch": 2.4918081620494488, "grad_norm": 1.149826169013977, "learning_rate": 4.668693680031472e-05, "loss": 0.6877, "num_input_tokens_seen": 9648264, "step": 16730 }, { "epoch": 2.492552874590408, "grad_norm": 0.798646092414856, "learning_rate": 4.668370304618084e-05, "loss": 0.6459, "num_input_tokens_seen": 9650728, "step": 16735 }, { "epoch": 2.493297587131367, "grad_norm": 1.6894350051879883, "learning_rate": 4.668046782675048e-05, "loss": 0.7051, "num_input_tokens_seen": 9653320, "step": 16740 }, { "epoch": 2.4940422996723264, "grad_norm": 1.2668107748031616, "learning_rate": 4.667723114224224e-05, "loss": 0.6144, "num_input_tokens_seen": 9655848, "step": 16745 }, { "epoch": 2.4947870122132856, "grad_norm": 0.9722650051116943, "learning_rate": 4.6673992992874855e-05, "loss": 0.5815, "num_input_tokens_seen": 9658696, "step": 16750 }, { "epoch": 2.495531724754245, "grad_norm": 1.6855189800262451, "learning_rate": 4.667075337886714e-05, "loss": 0.8167, "num_input_tokens_seen": 9661960, "step": 16755 }, { "epoch": 2.496276437295204, "grad_norm": 1.633151888847351, "learning_rate": 4.6667512300438025e-05, "loss": 0.579, "num_input_tokens_seen": 9664744, "step": 16760 }, { "epoch": 2.497021149836163, "grad_norm": 0.716266930103302, "learning_rate": 4.6664269757806525e-05, "loss": 0.6213, "num_input_tokens_seen": 9667528, "step": 16765 }, { "epoch": 2.4977658623771224, "grad_norm": 1.0123729705810547, "learning_rate": 4.6661025751191746e-05, "loss": 0.6033, "num_input_tokens_seen": 9670280, "step": 16770 }, { "epoch": 2.4985105749180816, "grad_norm": 1.4250528812408447, "learning_rate": 4.665778028081292e-05, "loss": 0.7276, "num_input_tokens_seen": 9673128, "step": 16775 }, { "epoch": 2.499255287459041, "grad_norm": 1.681127905845642, "learning_rate": 4.6654533346889356e-05, "loss": 0.7191, "num_input_tokens_seen": 9675976, "step": 16780 }, { "epoch": 2.5, "grad_norm": 1.3184659481048584, "learning_rate": 4.665128494964047e-05, "loss": 0.6316, "num_input_tokens_seen": 9678632, "step": 16785 }, { "epoch": 2.5, "eval_loss": 0.656985342502594, "eval_runtime": 74.27, "eval_samples_per_second": 40.178, "eval_steps_per_second": 10.044, "num_input_tokens_seen": 9678632, "step": 16785 }, { "epoch": 2.500744712540959, "grad_norm": 2.577300786972046, "learning_rate": 4.664803508928577e-05, "loss": 0.6932, "num_input_tokens_seen": 9681448, "step": 16790 }, { "epoch": 2.5014894250819184, "grad_norm": 1.115355372428894, "learning_rate": 4.664478376604488e-05, "loss": 0.5639, "num_input_tokens_seen": 9684456, "step": 16795 }, { "epoch": 2.5022341376228776, "grad_norm": 2.023526906967163, "learning_rate": 4.6641530980137506e-05, "loss": 0.7801, "num_input_tokens_seen": 9687496, "step": 16800 }, { "epoch": 2.502978850163837, "grad_norm": 0.9381527900695801, "learning_rate": 4.663827673178345e-05, "loss": 0.5105, "num_input_tokens_seen": 9690184, "step": 16805 }, { "epoch": 2.503723562704796, "grad_norm": 1.2265448570251465, "learning_rate": 4.6635021021202624e-05, "loss": 0.6822, "num_input_tokens_seen": 9692808, "step": 16810 }, { "epoch": 2.504468275245755, "grad_norm": 1.0776582956314087, "learning_rate": 4.6631763848615044e-05, "loss": 0.7127, "num_input_tokens_seen": 9695528, "step": 16815 }, { "epoch": 2.5052129877867144, "grad_norm": 0.8526057600975037, "learning_rate": 4.662850521424081e-05, "loss": 0.534, "num_input_tokens_seen": 9698440, "step": 16820 }, { "epoch": 2.5059577003276736, "grad_norm": 1.0797500610351562, "learning_rate": 4.662524511830013e-05, "loss": 0.752, "num_input_tokens_seen": 9701096, "step": 16825 }, { "epoch": 2.506702412868633, "grad_norm": 0.7502297163009644, "learning_rate": 4.662198356101331e-05, "loss": 0.8149, "num_input_tokens_seen": 9704008, "step": 16830 }, { "epoch": 2.507447125409592, "grad_norm": 0.7088015675544739, "learning_rate": 4.6618720542600744e-05, "loss": 0.5418, "num_input_tokens_seen": 9706824, "step": 16835 }, { "epoch": 2.5081918379505512, "grad_norm": 1.5870908498764038, "learning_rate": 4.6615456063282944e-05, "loss": 0.7212, "num_input_tokens_seen": 9709576, "step": 16840 }, { "epoch": 2.5089365504915104, "grad_norm": 0.8962220549583435, "learning_rate": 4.66121901232805e-05, "loss": 0.5801, "num_input_tokens_seen": 9712264, "step": 16845 }, { "epoch": 2.509681263032469, "grad_norm": 1.3428492546081543, "learning_rate": 4.6608922722814116e-05, "loss": 0.6135, "num_input_tokens_seen": 9715112, "step": 16850 }, { "epoch": 2.510425975573429, "grad_norm": 1.3265986442565918, "learning_rate": 4.6605653862104596e-05, "loss": 0.6004, "num_input_tokens_seen": 9717832, "step": 16855 }, { "epoch": 2.5111706881143876, "grad_norm": 1.183518886566162, "learning_rate": 4.660238354137283e-05, "loss": 0.5133, "num_input_tokens_seen": 9720712, "step": 16860 }, { "epoch": 2.5119154006553472, "grad_norm": 2.6347312927246094, "learning_rate": 4.6599111760839805e-05, "loss": 0.5087, "num_input_tokens_seen": 9723784, "step": 16865 }, { "epoch": 2.512660113196306, "grad_norm": 0.8847635388374329, "learning_rate": 4.659583852072663e-05, "loss": 0.5628, "num_input_tokens_seen": 9726824, "step": 16870 }, { "epoch": 2.5134048257372656, "grad_norm": 1.8283787965774536, "learning_rate": 4.6592563821254486e-05, "loss": 0.5327, "num_input_tokens_seen": 9729800, "step": 16875 }, { "epoch": 2.5141495382782244, "grad_norm": 0.7860766053199768, "learning_rate": 4.658928766264467e-05, "loss": 0.6174, "num_input_tokens_seen": 9732680, "step": 16880 }, { "epoch": 2.514894250819184, "grad_norm": 2.8521037101745605, "learning_rate": 4.658601004511856e-05, "loss": 0.8685, "num_input_tokens_seen": 9735688, "step": 16885 }, { "epoch": 2.515638963360143, "grad_norm": 2.5085415840148926, "learning_rate": 4.658273096889768e-05, "loss": 0.7824, "num_input_tokens_seen": 9738728, "step": 16890 }, { "epoch": 2.516383675901102, "grad_norm": 1.6114165782928467, "learning_rate": 4.657945043420356e-05, "loss": 0.7291, "num_input_tokens_seen": 9741512, "step": 16895 }, { "epoch": 2.517128388442061, "grad_norm": 1.1642276048660278, "learning_rate": 4.657616844125794e-05, "loss": 0.6844, "num_input_tokens_seen": 9744200, "step": 16900 }, { "epoch": 2.5178731009830204, "grad_norm": 1.6360313892364502, "learning_rate": 4.657288499028256e-05, "loss": 0.8852, "num_input_tokens_seen": 9747432, "step": 16905 }, { "epoch": 2.5186178135239796, "grad_norm": 0.6760614514350891, "learning_rate": 4.656960008149933e-05, "loss": 0.6885, "num_input_tokens_seen": 9750536, "step": 16910 }, { "epoch": 2.519362526064939, "grad_norm": 1.2247828245162964, "learning_rate": 4.656631371513022e-05, "loss": 0.625, "num_input_tokens_seen": 9753320, "step": 16915 }, { "epoch": 2.520107238605898, "grad_norm": 1.2687331438064575, "learning_rate": 4.656302589139732e-05, "loss": 0.5025, "num_input_tokens_seen": 9756040, "step": 16920 }, { "epoch": 2.5208519511468572, "grad_norm": 1.2829270362854004, "learning_rate": 4.655973661052279e-05, "loss": 0.575, "num_input_tokens_seen": 9759112, "step": 16925 }, { "epoch": 2.5215966636878164, "grad_norm": 0.6612535119056702, "learning_rate": 4.655644587272891e-05, "loss": 0.6473, "num_input_tokens_seen": 9762088, "step": 16930 }, { "epoch": 2.5223413762287756, "grad_norm": 0.8975142240524292, "learning_rate": 4.655315367823806e-05, "loss": 0.6486, "num_input_tokens_seen": 9764776, "step": 16935 }, { "epoch": 2.523086088769735, "grad_norm": 1.304813027381897, "learning_rate": 4.654986002727273e-05, "loss": 0.6083, "num_input_tokens_seen": 9767656, "step": 16940 }, { "epoch": 2.523830801310694, "grad_norm": 0.8441824913024902, "learning_rate": 4.6546564920055455e-05, "loss": 0.7136, "num_input_tokens_seen": 9770376, "step": 16945 }, { "epoch": 2.5245755138516532, "grad_norm": 1.259756088256836, "learning_rate": 4.654326835680894e-05, "loss": 0.5883, "num_input_tokens_seen": 9773320, "step": 16950 }, { "epoch": 2.5253202263926124, "grad_norm": 1.0793602466583252, "learning_rate": 4.6539970337755936e-05, "loss": 0.7595, "num_input_tokens_seen": 9776264, "step": 16955 }, { "epoch": 2.5260649389335716, "grad_norm": 0.9234493374824524, "learning_rate": 4.6536670863119305e-05, "loss": 0.8088, "num_input_tokens_seen": 9779304, "step": 16960 }, { "epoch": 2.526809651474531, "grad_norm": 0.9983651638031006, "learning_rate": 4.6533369933122014e-05, "loss": 0.7186, "num_input_tokens_seen": 9782408, "step": 16965 }, { "epoch": 2.52755436401549, "grad_norm": 1.2983614206314087, "learning_rate": 4.6530067547987145e-05, "loss": 0.679, "num_input_tokens_seen": 9785384, "step": 16970 }, { "epoch": 2.5282990765564493, "grad_norm": 1.7198162078857422, "learning_rate": 4.652676370793784e-05, "loss": 0.627, "num_input_tokens_seen": 9788264, "step": 16975 }, { "epoch": 2.5290437890974085, "grad_norm": 0.9689930081367493, "learning_rate": 4.6523458413197364e-05, "loss": 0.6123, "num_input_tokens_seen": 9791048, "step": 16980 }, { "epoch": 2.5297885016383677, "grad_norm": 0.8496894240379333, "learning_rate": 4.6520151663989075e-05, "loss": 0.7568, "num_input_tokens_seen": 9793704, "step": 16985 }, { "epoch": 2.530533214179327, "grad_norm": 0.7898241281509399, "learning_rate": 4.6516843460536434e-05, "loss": 0.8096, "num_input_tokens_seen": 9796712, "step": 16990 }, { "epoch": 2.531277926720286, "grad_norm": 0.801120936870575, "learning_rate": 4.651353380306299e-05, "loss": 0.6458, "num_input_tokens_seen": 9799432, "step": 16995 }, { "epoch": 2.5320226392612453, "grad_norm": 1.1028907299041748, "learning_rate": 4.65102226917924e-05, "loss": 0.6308, "num_input_tokens_seen": 9802600, "step": 17000 }, { "epoch": 2.5327673518022045, "grad_norm": 1.1489380598068237, "learning_rate": 4.650691012694842e-05, "loss": 0.7014, "num_input_tokens_seen": 9805736, "step": 17005 }, { "epoch": 2.5335120643431637, "grad_norm": 0.7391936182975769, "learning_rate": 4.650359610875489e-05, "loss": 0.6256, "num_input_tokens_seen": 9808392, "step": 17010 }, { "epoch": 2.534256776884123, "grad_norm": 1.1570175886154175, "learning_rate": 4.650028063743577e-05, "loss": 0.4834, "num_input_tokens_seen": 9811464, "step": 17015 }, { "epoch": 2.535001489425082, "grad_norm": 0.6670169830322266, "learning_rate": 4.649696371321509e-05, "loss": 0.58, "num_input_tokens_seen": 9814120, "step": 17020 }, { "epoch": 2.535746201966041, "grad_norm": 0.6680337190628052, "learning_rate": 4.6493645336317e-05, "loss": 0.5879, "num_input_tokens_seen": 9816936, "step": 17025 }, { "epoch": 2.5364909145070005, "grad_norm": 1.6081466674804688, "learning_rate": 4.6490325506965746e-05, "loss": 0.5954, "num_input_tokens_seen": 9819720, "step": 17030 }, { "epoch": 2.5372356270479592, "grad_norm": 1.0947613716125488, "learning_rate": 4.648700422538567e-05, "loss": 0.5559, "num_input_tokens_seen": 9822472, "step": 17035 }, { "epoch": 2.537980339588919, "grad_norm": 0.7176008820533752, "learning_rate": 4.648368149180121e-05, "loss": 0.6838, "num_input_tokens_seen": 9825352, "step": 17040 }, { "epoch": 2.5387250521298776, "grad_norm": 1.9267067909240723, "learning_rate": 4.64803573064369e-05, "loss": 0.7494, "num_input_tokens_seen": 9828296, "step": 17045 }, { "epoch": 2.5394697646708373, "grad_norm": 1.3819319009780884, "learning_rate": 4.647703166951738e-05, "loss": 0.6537, "num_input_tokens_seen": 9831048, "step": 17050 }, { "epoch": 2.540214477211796, "grad_norm": 1.051235556602478, "learning_rate": 4.6473704581267374e-05, "loss": 0.6628, "num_input_tokens_seen": 9833896, "step": 17055 }, { "epoch": 2.5409591897527557, "grad_norm": 0.6305236220359802, "learning_rate": 4.6470376041911715e-05, "loss": 0.537, "num_input_tokens_seen": 9836904, "step": 17060 }, { "epoch": 2.5417039022937145, "grad_norm": 0.6285597681999207, "learning_rate": 4.646704605167534e-05, "loss": 0.6545, "num_input_tokens_seen": 9839816, "step": 17065 }, { "epoch": 2.5424486148346737, "grad_norm": 1.6510030031204224, "learning_rate": 4.646371461078327e-05, "loss": 0.7567, "num_input_tokens_seen": 9842792, "step": 17070 }, { "epoch": 2.543193327375633, "grad_norm": 1.2426669597625732, "learning_rate": 4.646038171946063e-05, "loss": 0.6938, "num_input_tokens_seen": 9845416, "step": 17075 }, { "epoch": 2.543938039916592, "grad_norm": 1.6727880239486694, "learning_rate": 4.645704737793265e-05, "loss": 0.7519, "num_input_tokens_seen": 9848584, "step": 17080 }, { "epoch": 2.5446827524575513, "grad_norm": 0.688815176486969, "learning_rate": 4.645371158642464e-05, "loss": 0.7972, "num_input_tokens_seen": 9851816, "step": 17085 }, { "epoch": 2.5454274649985105, "grad_norm": 0.7420894503593445, "learning_rate": 4.645037434516204e-05, "loss": 0.6555, "num_input_tokens_seen": 9854728, "step": 17090 }, { "epoch": 2.5461721775394697, "grad_norm": 0.878265380859375, "learning_rate": 4.644703565437033e-05, "loss": 0.7732, "num_input_tokens_seen": 9857576, "step": 17095 }, { "epoch": 2.546916890080429, "grad_norm": 1.0861430168151855, "learning_rate": 4.644369551427516e-05, "loss": 0.7215, "num_input_tokens_seen": 9860296, "step": 17100 }, { "epoch": 2.547661602621388, "grad_norm": 2.122781753540039, "learning_rate": 4.6440353925102234e-05, "loss": 0.743, "num_input_tokens_seen": 9863176, "step": 17105 }, { "epoch": 2.5484063151623473, "grad_norm": 1.3779058456420898, "learning_rate": 4.643701088707736e-05, "loss": 0.7104, "num_input_tokens_seen": 9866152, "step": 17110 }, { "epoch": 2.5491510277033065, "grad_norm": 0.6962802410125732, "learning_rate": 4.643366640042643e-05, "loss": 0.7089, "num_input_tokens_seen": 9868904, "step": 17115 }, { "epoch": 2.5498957402442657, "grad_norm": 1.0251959562301636, "learning_rate": 4.643032046537549e-05, "loss": 0.6282, "num_input_tokens_seen": 9871560, "step": 17120 }, { "epoch": 2.550640452785225, "grad_norm": 0.8843142986297607, "learning_rate": 4.642697308215061e-05, "loss": 0.7688, "num_input_tokens_seen": 9874536, "step": 17125 }, { "epoch": 2.551385165326184, "grad_norm": 1.254599928855896, "learning_rate": 4.6423624250978e-05, "loss": 0.6773, "num_input_tokens_seen": 9877448, "step": 17130 }, { "epoch": 2.5521298778671433, "grad_norm": 0.9457390308380127, "learning_rate": 4.6420273972083985e-05, "loss": 0.574, "num_input_tokens_seen": 9880296, "step": 17135 }, { "epoch": 2.5528745904081025, "grad_norm": 0.8625083565711975, "learning_rate": 4.641692224569493e-05, "loss": 0.657, "num_input_tokens_seen": 9882920, "step": 17140 }, { "epoch": 2.5536193029490617, "grad_norm": 1.1599129438400269, "learning_rate": 4.641356907203734e-05, "loss": 0.6002, "num_input_tokens_seen": 9885736, "step": 17145 }, { "epoch": 2.554364015490021, "grad_norm": 0.7538849711418152, "learning_rate": 4.6410214451337816e-05, "loss": 0.5913, "num_input_tokens_seen": 9888488, "step": 17150 }, { "epoch": 2.55510872803098, "grad_norm": 0.9046347737312317, "learning_rate": 4.6406858383823056e-05, "loss": 0.6907, "num_input_tokens_seen": 9891368, "step": 17155 }, { "epoch": 2.5558534405719393, "grad_norm": 1.015952467918396, "learning_rate": 4.640350086971983e-05, "loss": 0.674, "num_input_tokens_seen": 9894280, "step": 17160 }, { "epoch": 2.5565981531128985, "grad_norm": 1.6267318725585938, "learning_rate": 4.640014190925505e-05, "loss": 0.7007, "num_input_tokens_seen": 9897032, "step": 17165 }, { "epoch": 2.5573428656538577, "grad_norm": 2.031067371368408, "learning_rate": 4.639678150265567e-05, "loss": 0.6935, "num_input_tokens_seen": 9900392, "step": 17170 }, { "epoch": 2.558087578194817, "grad_norm": 0.9433987140655518, "learning_rate": 4.639341965014879e-05, "loss": 0.6511, "num_input_tokens_seen": 9903432, "step": 17175 }, { "epoch": 2.558832290735776, "grad_norm": 2.314305305480957, "learning_rate": 4.63900563519616e-05, "loss": 0.6134, "num_input_tokens_seen": 9906280, "step": 17180 }, { "epoch": 2.5595770032767353, "grad_norm": 1.1242735385894775, "learning_rate": 4.638669160832136e-05, "loss": 0.5683, "num_input_tokens_seen": 9909000, "step": 17185 }, { "epoch": 2.5603217158176945, "grad_norm": 0.9148476123809814, "learning_rate": 4.638332541945546e-05, "loss": 0.6657, "num_input_tokens_seen": 9911880, "step": 17190 }, { "epoch": 2.5610664283586537, "grad_norm": 1.4518446922302246, "learning_rate": 4.6379957785591355e-05, "loss": 0.6564, "num_input_tokens_seen": 9914856, "step": 17195 }, { "epoch": 2.5618111408996125, "grad_norm": 0.685905933380127, "learning_rate": 4.6376588706956635e-05, "loss": 0.5661, "num_input_tokens_seen": 9917480, "step": 17200 }, { "epoch": 2.562555853440572, "grad_norm": 0.7680550217628479, "learning_rate": 4.637321818377896e-05, "loss": 0.5941, "num_input_tokens_seen": 9920584, "step": 17205 }, { "epoch": 2.563300565981531, "grad_norm": 1.218505620956421, "learning_rate": 4.636984621628609e-05, "loss": 0.6037, "num_input_tokens_seen": 9923560, "step": 17210 }, { "epoch": 2.5640452785224905, "grad_norm": 2.2774665355682373, "learning_rate": 4.6366472804705905e-05, "loss": 0.6485, "num_input_tokens_seen": 9926472, "step": 17215 }, { "epoch": 2.5647899910634493, "grad_norm": 0.7097740769386292, "learning_rate": 4.636309794926636e-05, "loss": 0.6005, "num_input_tokens_seen": 9929320, "step": 17220 }, { "epoch": 2.565534703604409, "grad_norm": 0.9930720925331116, "learning_rate": 4.635972165019551e-05, "loss": 0.6491, "num_input_tokens_seen": 9932040, "step": 17225 }, { "epoch": 2.5662794161453677, "grad_norm": 0.8644751906394958, "learning_rate": 4.635634390772151e-05, "loss": 0.6882, "num_input_tokens_seen": 9934696, "step": 17230 }, { "epoch": 2.5670241286863273, "grad_norm": 0.803747296333313, "learning_rate": 4.635296472207262e-05, "loss": 0.5676, "num_input_tokens_seen": 9937416, "step": 17235 }, { "epoch": 2.567768841227286, "grad_norm": 0.8535019159317017, "learning_rate": 4.6349584093477184e-05, "loss": 0.7157, "num_input_tokens_seen": 9940232, "step": 17240 }, { "epoch": 2.5685135537682453, "grad_norm": 0.47864726185798645, "learning_rate": 4.634620202216366e-05, "loss": 0.5812, "num_input_tokens_seen": 9942824, "step": 17245 }, { "epoch": 2.5692582663092045, "grad_norm": 0.9559189677238464, "learning_rate": 4.6342818508360595e-05, "loss": 0.6532, "num_input_tokens_seen": 9945736, "step": 17250 }, { "epoch": 2.5700029788501637, "grad_norm": 1.1316946744918823, "learning_rate": 4.633943355229662e-05, "loss": 0.5925, "num_input_tokens_seen": 9948616, "step": 17255 }, { "epoch": 2.570747691391123, "grad_norm": 1.5987235307693481, "learning_rate": 4.633604715420049e-05, "loss": 0.7771, "num_input_tokens_seen": 9951752, "step": 17260 }, { "epoch": 2.571492403932082, "grad_norm": 0.8355172872543335, "learning_rate": 4.6332659314301034e-05, "loss": 0.6253, "num_input_tokens_seen": 9954728, "step": 17265 }, { "epoch": 2.5722371164730413, "grad_norm": 0.8847121596336365, "learning_rate": 4.63292700328272e-05, "loss": 0.6438, "num_input_tokens_seen": 9957896, "step": 17270 }, { "epoch": 2.5729818290140005, "grad_norm": 1.1217674016952515, "learning_rate": 4.632587931000801e-05, "loss": 0.8543, "num_input_tokens_seen": 9961064, "step": 17275 }, { "epoch": 2.5737265415549597, "grad_norm": 1.234226942062378, "learning_rate": 4.6322487146072614e-05, "loss": 0.6037, "num_input_tokens_seen": 9964104, "step": 17280 }, { "epoch": 2.574471254095919, "grad_norm": 1.358954906463623, "learning_rate": 4.6319093541250214e-05, "loss": 0.6527, "num_input_tokens_seen": 9966984, "step": 17285 }, { "epoch": 2.575215966636878, "grad_norm": 1.1558184623718262, "learning_rate": 4.6315698495770155e-05, "loss": 0.6787, "num_input_tokens_seen": 9970056, "step": 17290 }, { "epoch": 2.5759606791778373, "grad_norm": 0.8357774615287781, "learning_rate": 4.6312302009861855e-05, "loss": 0.7442, "num_input_tokens_seen": 9973096, "step": 17295 }, { "epoch": 2.5767053917187965, "grad_norm": 0.9156178832054138, "learning_rate": 4.630890408375483e-05, "loss": 0.5681, "num_input_tokens_seen": 9976040, "step": 17300 }, { "epoch": 2.5774501042597557, "grad_norm": 3.1286325454711914, "learning_rate": 4.630550471767871e-05, "loss": 0.8508, "num_input_tokens_seen": 9979240, "step": 17305 }, { "epoch": 2.578194816800715, "grad_norm": 7.301930904388428, "learning_rate": 4.6302103911863196e-05, "loss": 0.6897, "num_input_tokens_seen": 9982152, "step": 17310 }, { "epoch": 2.578939529341674, "grad_norm": 0.8986625671386719, "learning_rate": 4.6298701666538114e-05, "loss": 0.6098, "num_input_tokens_seen": 9985000, "step": 17315 }, { "epoch": 2.5796842418826333, "grad_norm": 1.0002899169921875, "learning_rate": 4.629529798193336e-05, "loss": 0.7139, "num_input_tokens_seen": 9987880, "step": 17320 }, { "epoch": 2.5804289544235925, "grad_norm": 1.8443840742111206, "learning_rate": 4.629189285827895e-05, "loss": 0.6591, "num_input_tokens_seen": 9990888, "step": 17325 }, { "epoch": 2.5811736669645517, "grad_norm": 1.7091351747512817, "learning_rate": 4.6288486295805e-05, "loss": 0.6302, "num_input_tokens_seen": 9993544, "step": 17330 }, { "epoch": 2.581918379505511, "grad_norm": 0.7655350565910339, "learning_rate": 4.628507829474168e-05, "loss": 0.6037, "num_input_tokens_seen": 9996584, "step": 17335 }, { "epoch": 2.58266309204647, "grad_norm": 0.7351882457733154, "learning_rate": 4.628166885531932e-05, "loss": 0.582, "num_input_tokens_seen": 9999336, "step": 17340 }, { "epoch": 2.5834078045874294, "grad_norm": 1.2695139646530151, "learning_rate": 4.6278257977768305e-05, "loss": 0.5724, "num_input_tokens_seen": 10002184, "step": 17345 }, { "epoch": 2.5841525171283886, "grad_norm": 1.1398561000823975, "learning_rate": 4.627484566231912e-05, "loss": 0.5368, "num_input_tokens_seen": 10005096, "step": 17350 }, { "epoch": 2.5848972296693478, "grad_norm": 0.890269935131073, "learning_rate": 4.627143190920237e-05, "loss": 0.5559, "num_input_tokens_seen": 10008104, "step": 17355 }, { "epoch": 2.585641942210307, "grad_norm": 0.7956547141075134, "learning_rate": 4.626801671864872e-05, "loss": 0.6635, "num_input_tokens_seen": 10010792, "step": 17360 }, { "epoch": 2.586386654751266, "grad_norm": 1.614635944366455, "learning_rate": 4.6264600090888984e-05, "loss": 0.7799, "num_input_tokens_seen": 10013768, "step": 17365 }, { "epoch": 2.5871313672922254, "grad_norm": 1.4298055171966553, "learning_rate": 4.626118202615403e-05, "loss": 0.7563, "num_input_tokens_seen": 10016488, "step": 17370 }, { "epoch": 2.587876079833184, "grad_norm": 1.0441758632659912, "learning_rate": 4.6257762524674826e-05, "loss": 0.8006, "num_input_tokens_seen": 10019432, "step": 17375 }, { "epoch": 2.5886207923741438, "grad_norm": 0.8491013050079346, "learning_rate": 4.625434158668246e-05, "loss": 0.6808, "num_input_tokens_seen": 10022216, "step": 17380 }, { "epoch": 2.5893655049151025, "grad_norm": 1.5094441175460815, "learning_rate": 4.625091921240811e-05, "loss": 0.8279, "num_input_tokens_seen": 10025096, "step": 17385 }, { "epoch": 2.590110217456062, "grad_norm": 0.972902774810791, "learning_rate": 4.624749540208304e-05, "loss": 0.5909, "num_input_tokens_seen": 10028136, "step": 17390 }, { "epoch": 2.590854929997021, "grad_norm": 1.168124794960022, "learning_rate": 4.6244070155938614e-05, "loss": 0.7309, "num_input_tokens_seen": 10030888, "step": 17395 }, { "epoch": 2.5915996425379806, "grad_norm": 1.045903205871582, "learning_rate": 4.624064347420629e-05, "loss": 0.7829, "num_input_tokens_seen": 10033576, "step": 17400 }, { "epoch": 2.5923443550789393, "grad_norm": 0.7639508247375488, "learning_rate": 4.623721535711765e-05, "loss": 0.6834, "num_input_tokens_seen": 10036520, "step": 17405 }, { "epoch": 2.593089067619899, "grad_norm": 0.836357593536377, "learning_rate": 4.623378580490434e-05, "loss": 0.8289, "num_input_tokens_seen": 10039368, "step": 17410 }, { "epoch": 2.5938337801608577, "grad_norm": 1.0537277460098267, "learning_rate": 4.6230354817798104e-05, "loss": 0.6588, "num_input_tokens_seen": 10042216, "step": 17415 }, { "epoch": 2.594578492701817, "grad_norm": 0.7006939053535461, "learning_rate": 4.622692239603082e-05, "loss": 0.5983, "num_input_tokens_seen": 10045352, "step": 17420 }, { "epoch": 2.595323205242776, "grad_norm": 1.3727478981018066, "learning_rate": 4.6223488539834415e-05, "loss": 0.7545, "num_input_tokens_seen": 10048296, "step": 17425 }, { "epoch": 2.5960679177837354, "grad_norm": 0.8935794234275818, "learning_rate": 4.622005324944095e-05, "loss": 0.7119, "num_input_tokens_seen": 10051560, "step": 17430 }, { "epoch": 2.5968126303246946, "grad_norm": 2.2972397804260254, "learning_rate": 4.621661652508255e-05, "loss": 0.7179, "num_input_tokens_seen": 10054504, "step": 17435 }, { "epoch": 2.5975573428656538, "grad_norm": 0.8148157000541687, "learning_rate": 4.621317836699147e-05, "loss": 0.7993, "num_input_tokens_seen": 10057800, "step": 17440 }, { "epoch": 2.598302055406613, "grad_norm": 0.6998679041862488, "learning_rate": 4.6209738775400045e-05, "loss": 0.7161, "num_input_tokens_seen": 10060616, "step": 17445 }, { "epoch": 2.599046767947572, "grad_norm": 0.6558582782745361, "learning_rate": 4.6206297750540706e-05, "loss": 0.6857, "num_input_tokens_seen": 10063464, "step": 17450 }, { "epoch": 2.5997914804885314, "grad_norm": 0.6803036332130432, "learning_rate": 4.620285529264599e-05, "loss": 0.5574, "num_input_tokens_seen": 10066184, "step": 17455 }, { "epoch": 2.6005361930294906, "grad_norm": 0.7702559232711792, "learning_rate": 4.619941140194851e-05, "loss": 0.7752, "num_input_tokens_seen": 10069096, "step": 17460 }, { "epoch": 2.6012809055704498, "grad_norm": 0.8763477206230164, "learning_rate": 4.6195966078680995e-05, "loss": 0.6389, "num_input_tokens_seen": 10071976, "step": 17465 }, { "epoch": 2.602025618111409, "grad_norm": 0.624248743057251, "learning_rate": 4.619251932307627e-05, "loss": 0.5883, "num_input_tokens_seen": 10074536, "step": 17470 }, { "epoch": 2.602770330652368, "grad_norm": 0.7204362154006958, "learning_rate": 4.618907113536726e-05, "loss": 0.7063, "num_input_tokens_seen": 10077320, "step": 17475 }, { "epoch": 2.6035150431933274, "grad_norm": 0.710834264755249, "learning_rate": 4.618562151578696e-05, "loss": 0.6477, "num_input_tokens_seen": 10080136, "step": 17480 }, { "epoch": 2.6042597557342866, "grad_norm": 0.5495180487632751, "learning_rate": 4.61821704645685e-05, "loss": 0.7011, "num_input_tokens_seen": 10083208, "step": 17485 }, { "epoch": 2.605004468275246, "grad_norm": 0.6973209977149963, "learning_rate": 4.6178717981945074e-05, "loss": 0.5879, "num_input_tokens_seen": 10086152, "step": 17490 }, { "epoch": 2.605749180816205, "grad_norm": 0.8468279242515564, "learning_rate": 4.617526406815e-05, "loss": 0.5943, "num_input_tokens_seen": 10089096, "step": 17495 }, { "epoch": 2.606493893357164, "grad_norm": 2.3217170238494873, "learning_rate": 4.617180872341667e-05, "loss": 0.6304, "num_input_tokens_seen": 10091912, "step": 17500 }, { "epoch": 2.6072386058981234, "grad_norm": 1.6980738639831543, "learning_rate": 4.616835194797858e-05, "loss": 0.7192, "num_input_tokens_seen": 10094856, "step": 17505 }, { "epoch": 2.6079833184390826, "grad_norm": 1.906952142715454, "learning_rate": 4.616489374206934e-05, "loss": 0.7367, "num_input_tokens_seen": 10097672, "step": 17510 }, { "epoch": 2.608728030980042, "grad_norm": 2.6099908351898193, "learning_rate": 4.6161434105922616e-05, "loss": 0.6974, "num_input_tokens_seen": 10100328, "step": 17515 }, { "epoch": 2.609472743521001, "grad_norm": 0.9187663793563843, "learning_rate": 4.615797303977223e-05, "loss": 0.5563, "num_input_tokens_seen": 10103016, "step": 17520 }, { "epoch": 2.61021745606196, "grad_norm": 1.201844334602356, "learning_rate": 4.615451054385204e-05, "loss": 0.751, "num_input_tokens_seen": 10105960, "step": 17525 }, { "epoch": 2.6109621686029194, "grad_norm": 1.3111306428909302, "learning_rate": 4.615104661839603e-05, "loss": 0.6572, "num_input_tokens_seen": 10108712, "step": 17530 }, { "epoch": 2.6117068811438786, "grad_norm": 1.8910102844238281, "learning_rate": 4.6147581263638286e-05, "loss": 0.5526, "num_input_tokens_seen": 10111528, "step": 17535 }, { "epoch": 2.612451593684838, "grad_norm": 2.0843584537506104, "learning_rate": 4.614411447981298e-05, "loss": 0.6077, "num_input_tokens_seen": 10114312, "step": 17540 }, { "epoch": 2.613196306225797, "grad_norm": 0.7970141768455505, "learning_rate": 4.6140646267154384e-05, "loss": 0.6754, "num_input_tokens_seen": 10117224, "step": 17545 }, { "epoch": 2.6139410187667558, "grad_norm": 0.8679254055023193, "learning_rate": 4.613717662589687e-05, "loss": 0.5067, "num_input_tokens_seen": 10119912, "step": 17550 }, { "epoch": 2.6146857313077154, "grad_norm": 1.0380144119262695, "learning_rate": 4.613370555627489e-05, "loss": 0.7656, "num_input_tokens_seen": 10123016, "step": 17555 }, { "epoch": 2.615430443848674, "grad_norm": 0.8485002517700195, "learning_rate": 4.6130233058523015e-05, "loss": 0.6377, "num_input_tokens_seen": 10125800, "step": 17560 }, { "epoch": 2.616175156389634, "grad_norm": 0.928002655506134, "learning_rate": 4.6126759132875896e-05, "loss": 0.6473, "num_input_tokens_seen": 10128392, "step": 17565 }, { "epoch": 2.6169198689305926, "grad_norm": 0.6075366735458374, "learning_rate": 4.612328377956829e-05, "loss": 0.6268, "num_input_tokens_seen": 10130856, "step": 17570 }, { "epoch": 2.6176645814715522, "grad_norm": 0.6785105466842651, "learning_rate": 4.6119806998835056e-05, "loss": 0.6064, "num_input_tokens_seen": 10133512, "step": 17575 }, { "epoch": 2.618409294012511, "grad_norm": 0.5520092844963074, "learning_rate": 4.611632879091112e-05, "loss": 0.7625, "num_input_tokens_seen": 10136392, "step": 17580 }, { "epoch": 2.6191540065534706, "grad_norm": 0.7826099991798401, "learning_rate": 4.6112849156031544e-05, "loss": 0.5343, "num_input_tokens_seen": 10138984, "step": 17585 }, { "epoch": 2.6198987190944294, "grad_norm": 0.9255626797676086, "learning_rate": 4.610936809443146e-05, "loss": 0.7283, "num_input_tokens_seen": 10141800, "step": 17590 }, { "epoch": 2.6206434316353886, "grad_norm": 1.3822036981582642, "learning_rate": 4.610588560634611e-05, "loss": 0.6381, "num_input_tokens_seen": 10144360, "step": 17595 }, { "epoch": 2.621388144176348, "grad_norm": 0.8748260736465454, "learning_rate": 4.610240169201081e-05, "loss": 0.7025, "num_input_tokens_seen": 10147080, "step": 17600 }, { "epoch": 2.622132856717307, "grad_norm": 0.7122254967689514, "learning_rate": 4.6098916351661006e-05, "loss": 0.7131, "num_input_tokens_seen": 10150120, "step": 17605 }, { "epoch": 2.622877569258266, "grad_norm": 1.402978539466858, "learning_rate": 4.609542958553221e-05, "loss": 0.7084, "num_input_tokens_seen": 10152744, "step": 17610 }, { "epoch": 2.6236222817992254, "grad_norm": 0.6278206706047058, "learning_rate": 4.609194139386006e-05, "loss": 0.7597, "num_input_tokens_seen": 10155464, "step": 17615 }, { "epoch": 2.6243669943401846, "grad_norm": 0.7342384457588196, "learning_rate": 4.608845177688026e-05, "loss": 0.4726, "num_input_tokens_seen": 10158248, "step": 17620 }, { "epoch": 2.625111706881144, "grad_norm": 1.521909475326538, "learning_rate": 4.608496073482863e-05, "loss": 0.6825, "num_input_tokens_seen": 10161000, "step": 17625 }, { "epoch": 2.625856419422103, "grad_norm": 0.8606204390525818, "learning_rate": 4.608146826794107e-05, "loss": 0.5672, "num_input_tokens_seen": 10163912, "step": 17630 }, { "epoch": 2.626601131963062, "grad_norm": 1.1950618028640747, "learning_rate": 4.607797437645361e-05, "loss": 0.5408, "num_input_tokens_seen": 10166632, "step": 17635 }, { "epoch": 2.6273458445040214, "grad_norm": 1.2080743312835693, "learning_rate": 4.607447906060233e-05, "loss": 0.6417, "num_input_tokens_seen": 10170024, "step": 17640 }, { "epoch": 2.6280905570449806, "grad_norm": 2.0510778427124023, "learning_rate": 4.607098232062344e-05, "loss": 0.51, "num_input_tokens_seen": 10172808, "step": 17645 }, { "epoch": 2.62883526958594, "grad_norm": 1.6818636655807495, "learning_rate": 4.6067484156753234e-05, "loss": 0.6444, "num_input_tokens_seen": 10176072, "step": 17650 }, { "epoch": 2.629579982126899, "grad_norm": 1.2058907747268677, "learning_rate": 4.6063984569228103e-05, "loss": 0.7282, "num_input_tokens_seen": 10178920, "step": 17655 }, { "epoch": 2.6303246946678582, "grad_norm": 1.9636293649673462, "learning_rate": 4.606048355828453e-05, "loss": 0.8127, "num_input_tokens_seen": 10182408, "step": 17660 }, { "epoch": 2.6310694072088174, "grad_norm": 1.4757473468780518, "learning_rate": 4.6056981124159104e-05, "loss": 0.6402, "num_input_tokens_seen": 10185256, "step": 17665 }, { "epoch": 2.6318141197497766, "grad_norm": 1.1304919719696045, "learning_rate": 4.605347726708851e-05, "loss": 0.7727, "num_input_tokens_seen": 10188232, "step": 17670 }, { "epoch": 2.632558832290736, "grad_norm": 1.3313149213790894, "learning_rate": 4.604997198730951e-05, "loss": 0.7643, "num_input_tokens_seen": 10191272, "step": 17675 }, { "epoch": 2.633303544831695, "grad_norm": 2.005343437194824, "learning_rate": 4.6046465285058996e-05, "loss": 0.9372, "num_input_tokens_seen": 10194088, "step": 17680 }, { "epoch": 2.6340482573726542, "grad_norm": 0.9858928918838501, "learning_rate": 4.604295716057393e-05, "loss": 0.6386, "num_input_tokens_seen": 10196840, "step": 17685 }, { "epoch": 2.6347929699136134, "grad_norm": 0.6256149411201477, "learning_rate": 4.6039447614091365e-05, "loss": 0.6039, "num_input_tokens_seen": 10199720, "step": 17690 }, { "epoch": 2.6355376824545726, "grad_norm": 1.4991979598999023, "learning_rate": 4.6035936645848476e-05, "loss": 0.6248, "num_input_tokens_seen": 10202760, "step": 17695 }, { "epoch": 2.636282394995532, "grad_norm": 0.888678252696991, "learning_rate": 4.6032424256082504e-05, "loss": 0.7281, "num_input_tokens_seen": 10205896, "step": 17700 }, { "epoch": 2.637027107536491, "grad_norm": 2.037886619567871, "learning_rate": 4.602891044503083e-05, "loss": 0.7972, "num_input_tokens_seen": 10208808, "step": 17705 }, { "epoch": 2.6377718200774503, "grad_norm": 1.0679364204406738, "learning_rate": 4.6025395212930864e-05, "loss": 0.6337, "num_input_tokens_seen": 10211944, "step": 17710 }, { "epoch": 2.6385165326184095, "grad_norm": 1.8087382316589355, "learning_rate": 4.602187856002019e-05, "loss": 0.5812, "num_input_tokens_seen": 10214728, "step": 17715 }, { "epoch": 2.6392612451593687, "grad_norm": 1.172960638999939, "learning_rate": 4.601836048653642e-05, "loss": 0.5814, "num_input_tokens_seen": 10217640, "step": 17720 }, { "epoch": 2.6400059577003274, "grad_norm": 0.9585970044136047, "learning_rate": 4.601484099271731e-05, "loss": 0.6303, "num_input_tokens_seen": 10220456, "step": 17725 }, { "epoch": 2.640750670241287, "grad_norm": 1.084147334098816, "learning_rate": 4.601132007880068e-05, "loss": 0.7494, "num_input_tokens_seen": 10223656, "step": 17730 }, { "epoch": 2.641495382782246, "grad_norm": 0.7104952335357666, "learning_rate": 4.600779774502447e-05, "loss": 0.5154, "num_input_tokens_seen": 10226600, "step": 17735 }, { "epoch": 2.6422400953232055, "grad_norm": 1.7533314228057861, "learning_rate": 4.60042739916267e-05, "loss": 0.6777, "num_input_tokens_seen": 10229768, "step": 17740 }, { "epoch": 2.6429848078641642, "grad_norm": 0.8624418377876282, "learning_rate": 4.600074881884549e-05, "loss": 0.5444, "num_input_tokens_seen": 10232648, "step": 17745 }, { "epoch": 2.643729520405124, "grad_norm": 1.8339757919311523, "learning_rate": 4.599722222691906e-05, "loss": 0.6638, "num_input_tokens_seen": 10235400, "step": 17750 }, { "epoch": 2.6444742329460826, "grad_norm": 1.5860893726348877, "learning_rate": 4.599369421608571e-05, "loss": 0.6295, "num_input_tokens_seen": 10238312, "step": 17755 }, { "epoch": 2.645218945487042, "grad_norm": 1.2023364305496216, "learning_rate": 4.5990164786583865e-05, "loss": 0.672, "num_input_tokens_seen": 10241160, "step": 17760 }, { "epoch": 2.645963658028001, "grad_norm": 1.711143970489502, "learning_rate": 4.598663393865203e-05, "loss": 0.6818, "num_input_tokens_seen": 10244040, "step": 17765 }, { "epoch": 2.6467083705689602, "grad_norm": 1.2012455463409424, "learning_rate": 4.598310167252879e-05, "loss": 0.5792, "num_input_tokens_seen": 10246792, "step": 17770 }, { "epoch": 2.6474530831099194, "grad_norm": 0.8143861889839172, "learning_rate": 4.5979567988452856e-05, "loss": 0.6851, "num_input_tokens_seen": 10249864, "step": 17775 }, { "epoch": 2.6481977956508786, "grad_norm": 2.148181200027466, "learning_rate": 4.597603288666301e-05, "loss": 0.5169, "num_input_tokens_seen": 10252936, "step": 17780 }, { "epoch": 2.648942508191838, "grad_norm": 1.0368720293045044, "learning_rate": 4.597249636739815e-05, "loss": 0.4991, "num_input_tokens_seen": 10255720, "step": 17785 }, { "epoch": 2.649687220732797, "grad_norm": 1.235485315322876, "learning_rate": 4.5968958430897246e-05, "loss": 0.76, "num_input_tokens_seen": 10258952, "step": 17790 }, { "epoch": 2.6504319332737563, "grad_norm": 1.8894479274749756, "learning_rate": 4.596541907739939e-05, "loss": 0.7238, "num_input_tokens_seen": 10261864, "step": 17795 }, { "epoch": 2.6511766458147155, "grad_norm": 2.56860089302063, "learning_rate": 4.5961878307143746e-05, "loss": 0.8975, "num_input_tokens_seen": 10264808, "step": 17800 }, { "epoch": 2.6519213583556747, "grad_norm": 1.4024120569229126, "learning_rate": 4.595833612036959e-05, "loss": 0.6006, "num_input_tokens_seen": 10267912, "step": 17805 }, { "epoch": 2.652666070896634, "grad_norm": 1.551410436630249, "learning_rate": 4.59547925173163e-05, "loss": 0.5429, "num_input_tokens_seen": 10270888, "step": 17810 }, { "epoch": 2.653410783437593, "grad_norm": 1.4714657068252563, "learning_rate": 4.595124749822332e-05, "loss": 0.5225, "num_input_tokens_seen": 10273736, "step": 17815 }, { "epoch": 2.6541554959785523, "grad_norm": 0.9021795988082886, "learning_rate": 4.594770106333022e-05, "loss": 0.6855, "num_input_tokens_seen": 10276648, "step": 17820 }, { "epoch": 2.6549002085195115, "grad_norm": 1.8373937606811523, "learning_rate": 4.594415321287664e-05, "loss": 0.6313, "num_input_tokens_seen": 10279912, "step": 17825 }, { "epoch": 2.6556449210604707, "grad_norm": 1.8695003986358643, "learning_rate": 4.594060394710235e-05, "loss": 0.8666, "num_input_tokens_seen": 10282856, "step": 17830 }, { "epoch": 2.65638963360143, "grad_norm": 0.919909656047821, "learning_rate": 4.593705326624718e-05, "loss": 0.5743, "num_input_tokens_seen": 10285640, "step": 17835 }, { "epoch": 2.657134346142389, "grad_norm": 1.9909083843231201, "learning_rate": 4.593350117055107e-05, "loss": 0.7771, "num_input_tokens_seen": 10288552, "step": 17840 }, { "epoch": 2.6578790586833483, "grad_norm": 1.28480064868927, "learning_rate": 4.592994766025407e-05, "loss": 0.7768, "num_input_tokens_seen": 10291272, "step": 17845 }, { "epoch": 2.6586237712243075, "grad_norm": 1.5129882097244263, "learning_rate": 4.592639273559629e-05, "loss": 0.7371, "num_input_tokens_seen": 10294312, "step": 17850 }, { "epoch": 2.6593684837652667, "grad_norm": 1.1946824789047241, "learning_rate": 4.5922836396817973e-05, "loss": 0.7335, "num_input_tokens_seen": 10297576, "step": 17855 }, { "epoch": 2.660113196306226, "grad_norm": 1.1394041776657104, "learning_rate": 4.591927864415944e-05, "loss": 0.5141, "num_input_tokens_seen": 10300104, "step": 17860 }, { "epoch": 2.660857908847185, "grad_norm": 1.0223634243011475, "learning_rate": 4.591571947786111e-05, "loss": 0.6959, "num_input_tokens_seen": 10303080, "step": 17865 }, { "epoch": 2.6616026213881443, "grad_norm": 1.253076195716858, "learning_rate": 4.591215889816349e-05, "loss": 0.7229, "num_input_tokens_seen": 10305832, "step": 17870 }, { "epoch": 2.6623473339291035, "grad_norm": 0.8861821293830872, "learning_rate": 4.59085969053072e-05, "loss": 0.7403, "num_input_tokens_seen": 10308360, "step": 17875 }, { "epoch": 2.6630920464700627, "grad_norm": 1.4445204734802246, "learning_rate": 4.5905033499532936e-05, "loss": 0.6843, "num_input_tokens_seen": 10311304, "step": 17880 }, { "epoch": 2.663836759011022, "grad_norm": 1.330215573310852, "learning_rate": 4.590146868108151e-05, "loss": 0.7637, "num_input_tokens_seen": 10313928, "step": 17885 }, { "epoch": 2.6645814715519807, "grad_norm": 0.7382978200912476, "learning_rate": 4.589790245019379e-05, "loss": 0.71, "num_input_tokens_seen": 10317064, "step": 17890 }, { "epoch": 2.6653261840929403, "grad_norm": 0.6290088295936584, "learning_rate": 4.5894334807110806e-05, "loss": 0.6136, "num_input_tokens_seen": 10319816, "step": 17895 }, { "epoch": 2.666070896633899, "grad_norm": 0.772881031036377, "learning_rate": 4.589076575207362e-05, "loss": 0.6334, "num_input_tokens_seen": 10322472, "step": 17900 }, { "epoch": 2.6668156091748587, "grad_norm": 1.1789411306381226, "learning_rate": 4.588719528532342e-05, "loss": 0.6077, "num_input_tokens_seen": 10325352, "step": 17905 }, { "epoch": 2.6675603217158175, "grad_norm": 1.6392557621002197, "learning_rate": 4.5883623407101475e-05, "loss": 0.6298, "num_input_tokens_seen": 10328584, "step": 17910 }, { "epoch": 2.668305034256777, "grad_norm": 0.9315050840377808, "learning_rate": 4.5880050117649174e-05, "loss": 0.5797, "num_input_tokens_seen": 10331304, "step": 17915 }, { "epoch": 2.669049746797736, "grad_norm": 1.0204863548278809, "learning_rate": 4.5876475417207974e-05, "loss": 0.6436, "num_input_tokens_seen": 10334248, "step": 17920 }, { "epoch": 2.6697944593386955, "grad_norm": 1.0963839292526245, "learning_rate": 4.5872899306019454e-05, "loss": 0.6589, "num_input_tokens_seen": 10336968, "step": 17925 }, { "epoch": 2.6705391718796543, "grad_norm": 1.0849418640136719, "learning_rate": 4.586932178432525e-05, "loss": 0.6202, "num_input_tokens_seen": 10339560, "step": 17930 }, { "epoch": 2.6712838844206135, "grad_norm": 0.8079972267150879, "learning_rate": 4.586574285236714e-05, "loss": 0.6133, "num_input_tokens_seen": 10342504, "step": 17935 }, { "epoch": 2.6720285969615727, "grad_norm": 0.8834078311920166, "learning_rate": 4.586216251038695e-05, "loss": 0.6373, "num_input_tokens_seen": 10345320, "step": 17940 }, { "epoch": 2.672773309502532, "grad_norm": 1.195315957069397, "learning_rate": 4.585858075862665e-05, "loss": 0.619, "num_input_tokens_seen": 10348200, "step": 17945 }, { "epoch": 2.673518022043491, "grad_norm": 0.9352277517318726, "learning_rate": 4.585499759732825e-05, "loss": 0.4838, "num_input_tokens_seen": 10351304, "step": 17950 }, { "epoch": 2.6742627345844503, "grad_norm": 1.580387830734253, "learning_rate": 4.585141302673392e-05, "loss": 0.6174, "num_input_tokens_seen": 10354024, "step": 17955 }, { "epoch": 2.6750074471254095, "grad_norm": 1.4455585479736328, "learning_rate": 4.584782704708587e-05, "loss": 0.7483, "num_input_tokens_seen": 10357448, "step": 17960 }, { "epoch": 2.6757521596663687, "grad_norm": 1.2677122354507446, "learning_rate": 4.584423965862642e-05, "loss": 0.695, "num_input_tokens_seen": 10360424, "step": 17965 }, { "epoch": 2.676496872207328, "grad_norm": 1.0588257312774658, "learning_rate": 4.5840650861598e-05, "loss": 0.4903, "num_input_tokens_seen": 10363464, "step": 17970 }, { "epoch": 2.677241584748287, "grad_norm": 0.8080737590789795, "learning_rate": 4.583706065624314e-05, "loss": 0.6309, "num_input_tokens_seen": 10366536, "step": 17975 }, { "epoch": 2.6779862972892463, "grad_norm": 1.3200032711029053, "learning_rate": 4.583346904280442e-05, "loss": 0.6964, "num_input_tokens_seen": 10369416, "step": 17980 }, { "epoch": 2.6787310098302055, "grad_norm": 0.7357079386711121, "learning_rate": 4.582987602152458e-05, "loss": 0.8247, "num_input_tokens_seen": 10372360, "step": 17985 }, { "epoch": 2.6794757223711647, "grad_norm": 1.241005301475525, "learning_rate": 4.58262815926464e-05, "loss": 0.6419, "num_input_tokens_seen": 10375112, "step": 17990 }, { "epoch": 2.680220434912124, "grad_norm": 0.8635796904563904, "learning_rate": 4.5822685756412785e-05, "loss": 0.596, "num_input_tokens_seen": 10378088, "step": 17995 }, { "epoch": 2.680965147453083, "grad_norm": 2.196776866912842, "learning_rate": 4.5819088513066725e-05, "loss": 0.5851, "num_input_tokens_seen": 10380904, "step": 18000 }, { "epoch": 2.6817098599940423, "grad_norm": 1.4635764360427856, "learning_rate": 4.581548986285131e-05, "loss": 0.6223, "num_input_tokens_seen": 10383624, "step": 18005 }, { "epoch": 2.6824545725350015, "grad_norm": 0.9932729601860046, "learning_rate": 4.5811889806009716e-05, "loss": 0.5877, "num_input_tokens_seen": 10386536, "step": 18010 }, { "epoch": 2.6831992850759607, "grad_norm": 1.1872259378433228, "learning_rate": 4.580828834278523e-05, "loss": 0.638, "num_input_tokens_seen": 10389448, "step": 18015 }, { "epoch": 2.68394399761692, "grad_norm": 1.299406886100769, "learning_rate": 4.580468547342121e-05, "loss": 0.6337, "num_input_tokens_seen": 10392104, "step": 18020 }, { "epoch": 2.684688710157879, "grad_norm": 1.5088413953781128, "learning_rate": 4.5801081198161134e-05, "loss": 0.768, "num_input_tokens_seen": 10394920, "step": 18025 }, { "epoch": 2.6854334226988383, "grad_norm": 0.8728747963905334, "learning_rate": 4.5797475517248565e-05, "loss": 0.6816, "num_input_tokens_seen": 10397448, "step": 18030 }, { "epoch": 2.6861781352397975, "grad_norm": 1.3318520784378052, "learning_rate": 4.579386843092715e-05, "loss": 0.808, "num_input_tokens_seen": 10399880, "step": 18035 }, { "epoch": 2.6869228477807567, "grad_norm": 0.9841132164001465, "learning_rate": 4.579025993944065e-05, "loss": 0.5774, "num_input_tokens_seen": 10402568, "step": 18040 }, { "epoch": 2.687667560321716, "grad_norm": 1.537597417831421, "learning_rate": 4.578665004303292e-05, "loss": 0.735, "num_input_tokens_seen": 10405384, "step": 18045 }, { "epoch": 2.688412272862675, "grad_norm": 1.9170995950698853, "learning_rate": 4.578303874194789e-05, "loss": 0.7912, "num_input_tokens_seen": 10408264, "step": 18050 }, { "epoch": 2.6891569854036343, "grad_norm": 1.0673327445983887, "learning_rate": 4.577942603642959e-05, "loss": 0.7603, "num_input_tokens_seen": 10411112, "step": 18055 }, { "epoch": 2.6899016979445936, "grad_norm": 0.9123520255088806, "learning_rate": 4.5775811926722166e-05, "loss": 0.4269, "num_input_tokens_seen": 10413800, "step": 18060 }, { "epoch": 2.6906464104855523, "grad_norm": 1.0061742067337036, "learning_rate": 4.577219641306984e-05, "loss": 0.7137, "num_input_tokens_seen": 10416456, "step": 18065 }, { "epoch": 2.691391123026512, "grad_norm": 1.1499165296554565, "learning_rate": 4.5768579495716935e-05, "loss": 0.631, "num_input_tokens_seen": 10419432, "step": 18070 }, { "epoch": 2.6921358355674707, "grad_norm": 0.7260044813156128, "learning_rate": 4.5764961174907865e-05, "loss": 0.6162, "num_input_tokens_seen": 10422568, "step": 18075 }, { "epoch": 2.6928805481084304, "grad_norm": 0.8818506598472595, "learning_rate": 4.576134145088715e-05, "loss": 0.6326, "num_input_tokens_seen": 10425192, "step": 18080 }, { "epoch": 2.693625260649389, "grad_norm": 1.6618000268936157, "learning_rate": 4.575772032389938e-05, "loss": 0.6738, "num_input_tokens_seen": 10428040, "step": 18085 }, { "epoch": 2.6943699731903488, "grad_norm": 1.8390178680419922, "learning_rate": 4.575409779418927e-05, "loss": 0.7877, "num_input_tokens_seen": 10430824, "step": 18090 }, { "epoch": 2.6951146857313075, "grad_norm": 1.140748381614685, "learning_rate": 4.5750473862001606e-05, "loss": 0.6212, "num_input_tokens_seen": 10433608, "step": 18095 }, { "epoch": 2.695859398272267, "grad_norm": 0.9379092454910278, "learning_rate": 4.5746848527581287e-05, "loss": 0.6936, "num_input_tokens_seen": 10436392, "step": 18100 }, { "epoch": 2.696604110813226, "grad_norm": 1.1542596817016602, "learning_rate": 4.5743221791173296e-05, "loss": 0.7742, "num_input_tokens_seen": 10439080, "step": 18105 }, { "epoch": 2.697348823354185, "grad_norm": 1.205085039138794, "learning_rate": 4.573959365302272e-05, "loss": 0.7664, "num_input_tokens_seen": 10442024, "step": 18110 }, { "epoch": 2.6980935358951443, "grad_norm": 1.4991092681884766, "learning_rate": 4.5735964113374715e-05, "loss": 0.6361, "num_input_tokens_seen": 10444744, "step": 18115 }, { "epoch": 2.6988382484361035, "grad_norm": 1.2677385807037354, "learning_rate": 4.573233317247456e-05, "loss": 0.5901, "num_input_tokens_seen": 10447624, "step": 18120 }, { "epoch": 2.6995829609770627, "grad_norm": 1.5395936965942383, "learning_rate": 4.572870083056763e-05, "loss": 0.4826, "num_input_tokens_seen": 10450536, "step": 18125 }, { "epoch": 2.700327673518022, "grad_norm": 0.6998277306556702, "learning_rate": 4.5725067087899364e-05, "loss": 0.5965, "num_input_tokens_seen": 10453352, "step": 18130 }, { "epoch": 2.701072386058981, "grad_norm": 0.712665855884552, "learning_rate": 4.572143194471533e-05, "loss": 0.6833, "num_input_tokens_seen": 10456648, "step": 18135 }, { "epoch": 2.7018170985999403, "grad_norm": 0.8907747268676758, "learning_rate": 4.5717795401261175e-05, "loss": 0.8512, "num_input_tokens_seen": 10459656, "step": 18140 }, { "epoch": 2.7025618111408996, "grad_norm": 0.8697301149368286, "learning_rate": 4.571415745778264e-05, "loss": 0.5679, "num_input_tokens_seen": 10462696, "step": 18145 }, { "epoch": 2.7033065236818588, "grad_norm": 1.0259202718734741, "learning_rate": 4.571051811452556e-05, "loss": 0.6157, "num_input_tokens_seen": 10465736, "step": 18150 }, { "epoch": 2.704051236222818, "grad_norm": 1.1417330503463745, "learning_rate": 4.570687737173587e-05, "loss": 0.7309, "num_input_tokens_seen": 10468552, "step": 18155 }, { "epoch": 2.704795948763777, "grad_norm": 1.0476539134979248, "learning_rate": 4.570323522965959e-05, "loss": 0.5498, "num_input_tokens_seen": 10471432, "step": 18160 }, { "epoch": 2.7055406613047364, "grad_norm": 0.677117645740509, "learning_rate": 4.569959168854285e-05, "loss": 0.5233, "num_input_tokens_seen": 10473864, "step": 18165 }, { "epoch": 2.7062853738456956, "grad_norm": 0.8209995627403259, "learning_rate": 4.5695946748631866e-05, "loss": 0.5126, "num_input_tokens_seen": 10476456, "step": 18170 }, { "epoch": 2.7070300863866548, "grad_norm": 0.9924325346946716, "learning_rate": 4.5692300410172936e-05, "loss": 0.4702, "num_input_tokens_seen": 10479336, "step": 18175 }, { "epoch": 2.707774798927614, "grad_norm": 0.8005791306495667, "learning_rate": 4.568865267341248e-05, "loss": 0.6021, "num_input_tokens_seen": 10481960, "step": 18180 }, { "epoch": 2.708519511468573, "grad_norm": 1.570448875427246, "learning_rate": 4.5685003538596994e-05, "loss": 0.692, "num_input_tokens_seen": 10484744, "step": 18185 }, { "epoch": 2.7092642240095324, "grad_norm": 1.668637752532959, "learning_rate": 4.568135300597306e-05, "loss": 0.5122, "num_input_tokens_seen": 10487144, "step": 18190 }, { "epoch": 2.7100089365504916, "grad_norm": 1.1182847023010254, "learning_rate": 4.567770107578737e-05, "loss": 0.6979, "num_input_tokens_seen": 10490120, "step": 18195 }, { "epoch": 2.710753649091451, "grad_norm": 1.0012887716293335, "learning_rate": 4.567404774828672e-05, "loss": 0.5286, "num_input_tokens_seen": 10493192, "step": 18200 }, { "epoch": 2.71149836163241, "grad_norm": 1.1911317110061646, "learning_rate": 4.567039302371797e-05, "loss": 0.6255, "num_input_tokens_seen": 10496264, "step": 18205 }, { "epoch": 2.712243074173369, "grad_norm": 0.7716649174690247, "learning_rate": 4.566673690232811e-05, "loss": 0.5212, "num_input_tokens_seen": 10499112, "step": 18210 }, { "epoch": 2.7129877867143284, "grad_norm": 2.086754083633423, "learning_rate": 4.566307938436419e-05, "loss": 0.6594, "num_input_tokens_seen": 10501896, "step": 18215 }, { "epoch": 2.7137324992552876, "grad_norm": 1.3179008960723877, "learning_rate": 4.565942047007337e-05, "loss": 0.6853, "num_input_tokens_seen": 10504968, "step": 18220 }, { "epoch": 2.714477211796247, "grad_norm": 0.8003659248352051, "learning_rate": 4.5655760159702914e-05, "loss": 0.5729, "num_input_tokens_seen": 10508136, "step": 18225 }, { "epoch": 2.715221924337206, "grad_norm": 1.8580509424209595, "learning_rate": 4.565209845350017e-05, "loss": 0.6835, "num_input_tokens_seen": 10511176, "step": 18230 }, { "epoch": 2.715966636878165, "grad_norm": 1.0727384090423584, "learning_rate": 4.564843535171257e-05, "loss": 0.6219, "num_input_tokens_seen": 10513896, "step": 18235 }, { "epoch": 2.716711349419124, "grad_norm": 0.43230968713760376, "learning_rate": 4.5644770854587666e-05, "loss": 0.5876, "num_input_tokens_seen": 10516808, "step": 18240 }, { "epoch": 2.7174560619600836, "grad_norm": 1.145702838897705, "learning_rate": 4.564110496237308e-05, "loss": 0.606, "num_input_tokens_seen": 10519464, "step": 18245 }, { "epoch": 2.7182007745010424, "grad_norm": 2.392150640487671, "learning_rate": 4.563743767531654e-05, "loss": 0.4443, "num_input_tokens_seen": 10522152, "step": 18250 }, { "epoch": 2.718945487042002, "grad_norm": 1.192266583442688, "learning_rate": 4.563376899366587e-05, "loss": 0.6722, "num_input_tokens_seen": 10525064, "step": 18255 }, { "epoch": 2.7196901995829608, "grad_norm": 1.3816289901733398, "learning_rate": 4.5630098917668986e-05, "loss": 0.577, "num_input_tokens_seen": 10527752, "step": 18260 }, { "epoch": 2.7204349121239204, "grad_norm": 2.2043557167053223, "learning_rate": 4.5626427447573884e-05, "loss": 0.6831, "num_input_tokens_seen": 10530568, "step": 18265 }, { "epoch": 2.721179624664879, "grad_norm": 0.9925238490104675, "learning_rate": 4.562275458362868e-05, "loss": 0.6754, "num_input_tokens_seen": 10533448, "step": 18270 }, { "epoch": 2.721924337205839, "grad_norm": 1.0212788581848145, "learning_rate": 4.561908032608157e-05, "loss": 0.6142, "num_input_tokens_seen": 10536264, "step": 18275 }, { "epoch": 2.7226690497467976, "grad_norm": 2.1510796546936035, "learning_rate": 4.561540467518084e-05, "loss": 0.6424, "num_input_tokens_seen": 10539688, "step": 18280 }, { "epoch": 2.723413762287757, "grad_norm": 2.15842866897583, "learning_rate": 4.561172763117488e-05, "loss": 0.6275, "num_input_tokens_seen": 10542568, "step": 18285 }, { "epoch": 2.724158474828716, "grad_norm": 1.6503764390945435, "learning_rate": 4.5608049194312165e-05, "loss": 0.7245, "num_input_tokens_seen": 10545416, "step": 18290 }, { "epoch": 2.724903187369675, "grad_norm": 2.336625099182129, "learning_rate": 4.560436936484127e-05, "loss": 0.7191, "num_input_tokens_seen": 10548488, "step": 18295 }, { "epoch": 2.7256478999106344, "grad_norm": 1.2162472009658813, "learning_rate": 4.5600688143010874e-05, "loss": 0.8177, "num_input_tokens_seen": 10551144, "step": 18300 }, { "epoch": 2.7263926124515936, "grad_norm": 1.5580689907073975, "learning_rate": 4.559700552906972e-05, "loss": 0.8314, "num_input_tokens_seen": 10554152, "step": 18305 }, { "epoch": 2.727137324992553, "grad_norm": 0.7368050217628479, "learning_rate": 4.559332152326667e-05, "loss": 0.7545, "num_input_tokens_seen": 10556936, "step": 18310 }, { "epoch": 2.727882037533512, "grad_norm": 1.5658595561981201, "learning_rate": 4.558963612585069e-05, "loss": 0.6951, "num_input_tokens_seen": 10559688, "step": 18315 }, { "epoch": 2.728626750074471, "grad_norm": 1.7516411542892456, "learning_rate": 4.558594933707081e-05, "loss": 0.607, "num_input_tokens_seen": 10562952, "step": 18320 }, { "epoch": 2.7293714626154304, "grad_norm": 1.2122972011566162, "learning_rate": 4.5582261157176164e-05, "loss": 0.6481, "num_input_tokens_seen": 10565672, "step": 18325 }, { "epoch": 2.7301161751563896, "grad_norm": 0.9043563604354858, "learning_rate": 4.5578571586416e-05, "loss": 0.6526, "num_input_tokens_seen": 10568712, "step": 18330 }, { "epoch": 2.730860887697349, "grad_norm": 0.8734514713287354, "learning_rate": 4.557488062503962e-05, "loss": 0.7393, "num_input_tokens_seen": 10571368, "step": 18335 }, { "epoch": 2.731605600238308, "grad_norm": 1.3718986511230469, "learning_rate": 4.557118827329647e-05, "loss": 0.671, "num_input_tokens_seen": 10574088, "step": 18340 }, { "epoch": 2.732350312779267, "grad_norm": 1.703028917312622, "learning_rate": 4.556749453143605e-05, "loss": 0.8387, "num_input_tokens_seen": 10576872, "step": 18345 }, { "epoch": 2.7330950253202264, "grad_norm": 1.0657219886779785, "learning_rate": 4.556379939970797e-05, "loss": 0.5984, "num_input_tokens_seen": 10579752, "step": 18350 }, { "epoch": 2.7338397378611856, "grad_norm": 1.1232634782791138, "learning_rate": 4.5560102878361935e-05, "loss": 0.7034, "num_input_tokens_seen": 10582408, "step": 18355 }, { "epoch": 2.734584450402145, "grad_norm": 0.8222681283950806, "learning_rate": 4.5556404967647736e-05, "loss": 0.5796, "num_input_tokens_seen": 10585096, "step": 18360 }, { "epoch": 2.735329162943104, "grad_norm": 0.8458502292633057, "learning_rate": 4.5552705667815265e-05, "loss": 0.7078, "num_input_tokens_seen": 10587688, "step": 18365 }, { "epoch": 2.7360738754840632, "grad_norm": 1.1815879344940186, "learning_rate": 4.5549004979114506e-05, "loss": 0.6946, "num_input_tokens_seen": 10590760, "step": 18370 }, { "epoch": 2.7368185880250224, "grad_norm": 1.7445259094238281, "learning_rate": 4.5545302901795536e-05, "loss": 0.6982, "num_input_tokens_seen": 10593544, "step": 18375 }, { "epoch": 2.7375633005659816, "grad_norm": 1.2334197759628296, "learning_rate": 4.5541599436108535e-05, "loss": 0.7057, "num_input_tokens_seen": 10596552, "step": 18380 }, { "epoch": 2.738308013106941, "grad_norm": 1.2806442975997925, "learning_rate": 4.553789458230375e-05, "loss": 0.7987, "num_input_tokens_seen": 10599432, "step": 18385 }, { "epoch": 2.7390527256479, "grad_norm": 0.5596358776092529, "learning_rate": 4.553418834063156e-05, "loss": 0.5382, "num_input_tokens_seen": 10602600, "step": 18390 }, { "epoch": 2.7397974381888592, "grad_norm": 1.0505132675170898, "learning_rate": 4.55304807113424e-05, "loss": 0.5489, "num_input_tokens_seen": 10605448, "step": 18395 }, { "epoch": 2.7405421507298184, "grad_norm": 1.6275537014007568, "learning_rate": 4.5526771694686835e-05, "loss": 0.7049, "num_input_tokens_seen": 10608296, "step": 18400 }, { "epoch": 2.7412868632707776, "grad_norm": 1.4413715600967407, "learning_rate": 4.552306129091548e-05, "loss": 0.7826, "num_input_tokens_seen": 10611272, "step": 18405 }, { "epoch": 2.742031575811737, "grad_norm": 0.8979130387306213, "learning_rate": 4.551934950027909e-05, "loss": 0.6403, "num_input_tokens_seen": 10614440, "step": 18410 }, { "epoch": 2.7427762883526956, "grad_norm": 0.6973287463188171, "learning_rate": 4.551563632302849e-05, "loss": 0.564, "num_input_tokens_seen": 10617192, "step": 18415 }, { "epoch": 2.7435210008936552, "grad_norm": 0.8186408877372742, "learning_rate": 4.551192175941459e-05, "loss": 0.6385, "num_input_tokens_seen": 10619880, "step": 18420 }, { "epoch": 2.744265713434614, "grad_norm": 1.4950436353683472, "learning_rate": 4.550820580968842e-05, "loss": 0.5747, "num_input_tokens_seen": 10622536, "step": 18425 }, { "epoch": 2.7450104259755737, "grad_norm": 1.4905002117156982, "learning_rate": 4.550448847410108e-05, "loss": 0.7289, "num_input_tokens_seen": 10625864, "step": 18430 }, { "epoch": 2.7457551385165324, "grad_norm": 1.5372815132141113, "learning_rate": 4.550076975290377e-05, "loss": 0.62, "num_input_tokens_seen": 10628776, "step": 18435 }, { "epoch": 2.746499851057492, "grad_norm": 1.0975409746170044, "learning_rate": 4.549704964634779e-05, "loss": 0.6689, "num_input_tokens_seen": 10631496, "step": 18440 }, { "epoch": 2.747244563598451, "grad_norm": 2.066654920578003, "learning_rate": 4.549332815468453e-05, "loss": 0.6943, "num_input_tokens_seen": 10634600, "step": 18445 }, { "epoch": 2.7479892761394105, "grad_norm": 0.8676364421844482, "learning_rate": 4.5489605278165484e-05, "loss": 0.8144, "num_input_tokens_seen": 10637320, "step": 18450 }, { "epoch": 2.7487339886803692, "grad_norm": 1.2192822694778442, "learning_rate": 4.548588101704221e-05, "loss": 0.539, "num_input_tokens_seen": 10640008, "step": 18455 }, { "epoch": 2.7494787012213284, "grad_norm": 1.2528681755065918, "learning_rate": 4.5482155371566384e-05, "loss": 0.6281, "num_input_tokens_seen": 10642856, "step": 18460 }, { "epoch": 2.7502234137622876, "grad_norm": 1.8528085947036743, "learning_rate": 4.5478428341989774e-05, "loss": 0.7667, "num_input_tokens_seen": 10646024, "step": 18465 }, { "epoch": 2.750968126303247, "grad_norm": 1.3389804363250732, "learning_rate": 4.547469992856424e-05, "loss": 0.5883, "num_input_tokens_seen": 10649064, "step": 18470 }, { "epoch": 2.751712838844206, "grad_norm": 1.515562891960144, "learning_rate": 4.5470970131541727e-05, "loss": 0.8485, "num_input_tokens_seen": 10651784, "step": 18475 }, { "epoch": 2.7524575513851652, "grad_norm": 1.2006876468658447, "learning_rate": 4.546723895117428e-05, "loss": 0.5814, "num_input_tokens_seen": 10654920, "step": 18480 }, { "epoch": 2.7532022639261244, "grad_norm": 0.990997314453125, "learning_rate": 4.546350638771404e-05, "loss": 0.5936, "num_input_tokens_seen": 10657928, "step": 18485 }, { "epoch": 2.7539469764670836, "grad_norm": 0.9058343768119812, "learning_rate": 4.5459772441413234e-05, "loss": 0.5842, "num_input_tokens_seen": 10660808, "step": 18490 }, { "epoch": 2.754691689008043, "grad_norm": 0.7099069952964783, "learning_rate": 4.5456037112524195e-05, "loss": 0.7254, "num_input_tokens_seen": 10663592, "step": 18495 }, { "epoch": 2.755436401549002, "grad_norm": 0.9836394190788269, "learning_rate": 4.545230040129933e-05, "loss": 0.7564, "num_input_tokens_seen": 10666408, "step": 18500 }, { "epoch": 2.7561811140899612, "grad_norm": 0.8796202540397644, "learning_rate": 4.544856230799116e-05, "loss": 0.565, "num_input_tokens_seen": 10669096, "step": 18505 }, { "epoch": 2.7569258266309205, "grad_norm": 1.102678894996643, "learning_rate": 4.544482283285228e-05, "loss": 0.5877, "num_input_tokens_seen": 10672008, "step": 18510 }, { "epoch": 2.7576705391718797, "grad_norm": 1.3031731843948364, "learning_rate": 4.54410819761354e-05, "loss": 0.6538, "num_input_tokens_seen": 10674920, "step": 18515 }, { "epoch": 2.758415251712839, "grad_norm": 1.321853756904602, "learning_rate": 4.5437339738093315e-05, "loss": 0.7748, "num_input_tokens_seen": 10677736, "step": 18520 }, { "epoch": 2.759159964253798, "grad_norm": 0.7880738973617554, "learning_rate": 4.54335961189789e-05, "loss": 0.6841, "num_input_tokens_seen": 10680488, "step": 18525 }, { "epoch": 2.7599046767947573, "grad_norm": 0.6815162897109985, "learning_rate": 4.542985111904513e-05, "loss": 0.5243, "num_input_tokens_seen": 10683528, "step": 18530 }, { "epoch": 2.7606493893357165, "grad_norm": 0.7958792448043823, "learning_rate": 4.542610473854508e-05, "loss": 0.7317, "num_input_tokens_seen": 10686568, "step": 18535 }, { "epoch": 2.7613941018766757, "grad_norm": 0.9695650935173035, "learning_rate": 4.542235697773193e-05, "loss": 0.5815, "num_input_tokens_seen": 10689320, "step": 18540 }, { "epoch": 2.762138814417635, "grad_norm": 0.8283395171165466, "learning_rate": 4.5418607836858914e-05, "loss": 0.692, "num_input_tokens_seen": 10692136, "step": 18545 }, { "epoch": 2.762883526958594, "grad_norm": 1.1854217052459717, "learning_rate": 4.5414857316179415e-05, "loss": 0.7532, "num_input_tokens_seen": 10695176, "step": 18550 }, { "epoch": 2.7636282394995533, "grad_norm": 0.9366123676300049, "learning_rate": 4.541110541594684e-05, "loss": 0.8353, "num_input_tokens_seen": 10697928, "step": 18555 }, { "epoch": 2.7643729520405125, "grad_norm": 0.8871609568595886, "learning_rate": 4.540735213641476e-05, "loss": 0.6269, "num_input_tokens_seen": 10700552, "step": 18560 }, { "epoch": 2.7651176645814717, "grad_norm": 0.7787874937057495, "learning_rate": 4.540359747783679e-05, "loss": 0.7805, "num_input_tokens_seen": 10703336, "step": 18565 }, { "epoch": 2.765862377122431, "grad_norm": 1.5965523719787598, "learning_rate": 4.539984144046665e-05, "loss": 0.8187, "num_input_tokens_seen": 10706024, "step": 18570 }, { "epoch": 2.76660708966339, "grad_norm": 0.7049300670623779, "learning_rate": 4.539608402455817e-05, "loss": 0.743, "num_input_tokens_seen": 10709032, "step": 18575 }, { "epoch": 2.7673518022043493, "grad_norm": 0.9915157556533813, "learning_rate": 4.5392325230365264e-05, "loss": 0.5605, "num_input_tokens_seen": 10711976, "step": 18580 }, { "epoch": 2.7680965147453085, "grad_norm": 0.8718972206115723, "learning_rate": 4.538856505814191e-05, "loss": 0.5845, "num_input_tokens_seen": 10714760, "step": 18585 }, { "epoch": 2.7688412272862672, "grad_norm": 1.5842677354812622, "learning_rate": 4.5384803508142235e-05, "loss": 0.8418, "num_input_tokens_seen": 10717704, "step": 18590 }, { "epoch": 2.769585939827227, "grad_norm": 0.6954016089439392, "learning_rate": 4.538104058062042e-05, "loss": 0.6809, "num_input_tokens_seen": 10720552, "step": 18595 }, { "epoch": 2.7703306523681857, "grad_norm": 1.1972360610961914, "learning_rate": 4.537727627583074e-05, "loss": 0.6268, "num_input_tokens_seen": 10723752, "step": 18600 }, { "epoch": 2.7710753649091453, "grad_norm": 1.1986995935440063, "learning_rate": 4.5373510594027576e-05, "loss": 0.8203, "num_input_tokens_seen": 10726440, "step": 18605 }, { "epoch": 2.771820077450104, "grad_norm": 1.5547088384628296, "learning_rate": 4.53697435354654e-05, "loss": 0.6686, "num_input_tokens_seen": 10728904, "step": 18610 }, { "epoch": 2.7725647899910637, "grad_norm": 1.8296113014221191, "learning_rate": 4.536597510039878e-05, "loss": 0.6822, "num_input_tokens_seen": 10731560, "step": 18615 }, { "epoch": 2.7733095025320225, "grad_norm": 0.6948930025100708, "learning_rate": 4.536220528908236e-05, "loss": 0.6602, "num_input_tokens_seen": 10734312, "step": 18620 }, { "epoch": 2.7740542150729817, "grad_norm": 1.1237857341766357, "learning_rate": 4.535843410177089e-05, "loss": 0.7095, "num_input_tokens_seen": 10737288, "step": 18625 }, { "epoch": 2.774798927613941, "grad_norm": 0.9704634547233582, "learning_rate": 4.5354661538719224e-05, "loss": 0.6952, "num_input_tokens_seen": 10740200, "step": 18630 }, { "epoch": 2.7755436401549, "grad_norm": 1.5004241466522217, "learning_rate": 4.5350887600182275e-05, "loss": 0.6384, "num_input_tokens_seen": 10742984, "step": 18635 }, { "epoch": 2.7762883526958593, "grad_norm": 2.715277910232544, "learning_rate": 4.534711228641509e-05, "loss": 0.811, "num_input_tokens_seen": 10745704, "step": 18640 }, { "epoch": 2.7770330652368185, "grad_norm": 1.1021485328674316, "learning_rate": 4.5343335597672776e-05, "loss": 0.6571, "num_input_tokens_seen": 10748264, "step": 18645 }, { "epoch": 2.7777777777777777, "grad_norm": 0.8487563133239746, "learning_rate": 4.5339557534210565e-05, "loss": 0.6152, "num_input_tokens_seen": 10751272, "step": 18650 }, { "epoch": 2.778522490318737, "grad_norm": 1.2636895179748535, "learning_rate": 4.533577809628374e-05, "loss": 0.5738, "num_input_tokens_seen": 10754280, "step": 18655 }, { "epoch": 2.779267202859696, "grad_norm": 0.9722156524658203, "learning_rate": 4.533199728414771e-05, "loss": 0.6842, "num_input_tokens_seen": 10757384, "step": 18660 }, { "epoch": 2.7800119154006553, "grad_norm": 0.9879375696182251, "learning_rate": 4.532821509805797e-05, "loss": 0.6379, "num_input_tokens_seen": 10760296, "step": 18665 }, { "epoch": 2.7807566279416145, "grad_norm": 1.5422245264053345, "learning_rate": 4.53244315382701e-05, "loss": 0.6259, "num_input_tokens_seen": 10763112, "step": 18670 }, { "epoch": 2.7815013404825737, "grad_norm": 1.791274905204773, "learning_rate": 4.532064660503978e-05, "loss": 0.6603, "num_input_tokens_seen": 10765928, "step": 18675 }, { "epoch": 2.782246053023533, "grad_norm": 1.7358959913253784, "learning_rate": 4.531686029862279e-05, "loss": 0.5477, "num_input_tokens_seen": 10768680, "step": 18680 }, { "epoch": 2.782990765564492, "grad_norm": 2.886357307434082, "learning_rate": 4.531307261927497e-05, "loss": 0.86, "num_input_tokens_seen": 10771720, "step": 18685 }, { "epoch": 2.7837354781054513, "grad_norm": 2.0836784839630127, "learning_rate": 4.530928356725229e-05, "loss": 0.5978, "num_input_tokens_seen": 10774216, "step": 18690 }, { "epoch": 2.7844801906464105, "grad_norm": 1.4530243873596191, "learning_rate": 4.530549314281081e-05, "loss": 0.7811, "num_input_tokens_seen": 10777000, "step": 18695 }, { "epoch": 2.7852249031873697, "grad_norm": 1.3082013130187988, "learning_rate": 4.530170134620665e-05, "loss": 0.6484, "num_input_tokens_seen": 10779656, "step": 18700 }, { "epoch": 2.785969615728329, "grad_norm": 0.9899587035179138, "learning_rate": 4.529790817769606e-05, "loss": 0.6449, "num_input_tokens_seen": 10782504, "step": 18705 }, { "epoch": 2.786714328269288, "grad_norm": 0.7956939339637756, "learning_rate": 4.529411363753535e-05, "loss": 0.6865, "num_input_tokens_seen": 10785320, "step": 18710 }, { "epoch": 2.7874590408102473, "grad_norm": 0.7514225244522095, "learning_rate": 4.5290317725980964e-05, "loss": 0.6723, "num_input_tokens_seen": 10788328, "step": 18715 }, { "epoch": 2.7882037533512065, "grad_norm": 0.8458616733551025, "learning_rate": 4.5286520443289396e-05, "loss": 0.5373, "num_input_tokens_seen": 10790888, "step": 18720 }, { "epoch": 2.7889484658921657, "grad_norm": 1.4565645456314087, "learning_rate": 4.528272178971725e-05, "loss": 0.7521, "num_input_tokens_seen": 10793960, "step": 18725 }, { "epoch": 2.789693178433125, "grad_norm": 1.2590614557266235, "learning_rate": 4.5278921765521234e-05, "loss": 0.7141, "num_input_tokens_seen": 10796872, "step": 18730 }, { "epoch": 2.790437890974084, "grad_norm": 0.9770524501800537, "learning_rate": 4.5275120370958124e-05, "loss": 0.6621, "num_input_tokens_seen": 10799880, "step": 18735 }, { "epoch": 2.7911826035150433, "grad_norm": 1.4180238246917725, "learning_rate": 4.5271317606284826e-05, "loss": 0.7092, "num_input_tokens_seen": 10802568, "step": 18740 }, { "epoch": 2.7919273160560025, "grad_norm": 0.73712557554245, "learning_rate": 4.5267513471758295e-05, "loss": 0.5316, "num_input_tokens_seen": 10805480, "step": 18745 }, { "epoch": 2.7926720285969617, "grad_norm": 1.9616748094558716, "learning_rate": 4.5263707967635596e-05, "loss": 0.8538, "num_input_tokens_seen": 10808680, "step": 18750 }, { "epoch": 2.7934167411379205, "grad_norm": 1.3075850009918213, "learning_rate": 4.5259901094173915e-05, "loss": 0.7165, "num_input_tokens_seen": 10811720, "step": 18755 }, { "epoch": 2.79416145367888, "grad_norm": 0.678617000579834, "learning_rate": 4.525609285163048e-05, "loss": 0.6674, "num_input_tokens_seen": 10814568, "step": 18760 }, { "epoch": 2.794906166219839, "grad_norm": 2.1180992126464844, "learning_rate": 4.525228324026265e-05, "loss": 0.7939, "num_input_tokens_seen": 10817448, "step": 18765 }, { "epoch": 2.7956508787607985, "grad_norm": 1.2418444156646729, "learning_rate": 4.5248472260327854e-05, "loss": 0.6553, "num_input_tokens_seen": 10820584, "step": 18770 }, { "epoch": 2.7963955913017573, "grad_norm": 1.3837361335754395, "learning_rate": 4.5244659912083626e-05, "loss": 0.7658, "num_input_tokens_seen": 10823400, "step": 18775 }, { "epoch": 2.797140303842717, "grad_norm": 2.1184189319610596, "learning_rate": 4.524084619578759e-05, "loss": 0.6688, "num_input_tokens_seen": 10826024, "step": 18780 }, { "epoch": 2.7978850163836757, "grad_norm": 2.7271571159362793, "learning_rate": 4.523703111169746e-05, "loss": 0.6143, "num_input_tokens_seen": 10829000, "step": 18785 }, { "epoch": 2.7986297289246354, "grad_norm": 0.8013197779655457, "learning_rate": 4.5233214660071055e-05, "loss": 0.7525, "num_input_tokens_seen": 10831912, "step": 18790 }, { "epoch": 2.799374441465594, "grad_norm": 2.8047149181365967, "learning_rate": 4.522939684116626e-05, "loss": 0.6673, "num_input_tokens_seen": 10835048, "step": 18795 }, { "epoch": 2.8001191540065533, "grad_norm": 1.589631199836731, "learning_rate": 4.522557765524107e-05, "loss": 0.7122, "num_input_tokens_seen": 10837992, "step": 18800 }, { "epoch": 2.8008638665475125, "grad_norm": 0.8081527948379517, "learning_rate": 4.5221757102553576e-05, "loss": 0.4997, "num_input_tokens_seen": 10840808, "step": 18805 }, { "epoch": 2.8016085790884717, "grad_norm": 0.6910500526428223, "learning_rate": 4.521793518336195e-05, "loss": 0.7277, "num_input_tokens_seen": 10843720, "step": 18810 }, { "epoch": 2.802353291629431, "grad_norm": 1.0437477827072144, "learning_rate": 4.521411189792447e-05, "loss": 0.6048, "num_input_tokens_seen": 10846920, "step": 18815 }, { "epoch": 2.80309800417039, "grad_norm": 0.3984326124191284, "learning_rate": 4.521028724649949e-05, "loss": 0.5384, "num_input_tokens_seen": 10849832, "step": 18820 }, { "epoch": 2.8038427167113493, "grad_norm": 0.8305650949478149, "learning_rate": 4.520646122934547e-05, "loss": 0.6709, "num_input_tokens_seen": 10852584, "step": 18825 }, { "epoch": 2.8045874292523085, "grad_norm": 0.9246472716331482, "learning_rate": 4.5202633846720944e-05, "loss": 0.6786, "num_input_tokens_seen": 10855624, "step": 18830 }, { "epoch": 2.8053321417932677, "grad_norm": 0.85539710521698, "learning_rate": 4.519880509888457e-05, "loss": 0.7021, "num_input_tokens_seen": 10858632, "step": 18835 }, { "epoch": 2.806076854334227, "grad_norm": 0.7186819314956665, "learning_rate": 4.519497498609506e-05, "loss": 0.5694, "num_input_tokens_seen": 10861544, "step": 18840 }, { "epoch": 2.806821566875186, "grad_norm": 0.7761086225509644, "learning_rate": 4.519114350861125e-05, "loss": 0.6537, "num_input_tokens_seen": 10864392, "step": 18845 }, { "epoch": 2.8075662794161453, "grad_norm": 1.2814406156539917, "learning_rate": 4.5187310666692065e-05, "loss": 0.7102, "num_input_tokens_seen": 10867176, "step": 18850 }, { "epoch": 2.8083109919571045, "grad_norm": 1.0919736623764038, "learning_rate": 4.5183476460596486e-05, "loss": 0.6376, "num_input_tokens_seen": 10869992, "step": 18855 }, { "epoch": 2.8090557044980637, "grad_norm": 0.7998677492141724, "learning_rate": 4.5179640890583634e-05, "loss": 0.6495, "num_input_tokens_seen": 10873096, "step": 18860 }, { "epoch": 2.809800417039023, "grad_norm": 0.9913063645362854, "learning_rate": 4.51758039569127e-05, "loss": 0.6543, "num_input_tokens_seen": 10875720, "step": 18865 }, { "epoch": 2.810545129579982, "grad_norm": 0.8754851222038269, "learning_rate": 4.517196565984296e-05, "loss": 0.796, "num_input_tokens_seen": 10878728, "step": 18870 }, { "epoch": 2.8112898421209414, "grad_norm": 1.3659915924072266, "learning_rate": 4.5168125999633794e-05, "loss": 0.6736, "num_input_tokens_seen": 10881480, "step": 18875 }, { "epoch": 2.8120345546619006, "grad_norm": 0.937677264213562, "learning_rate": 4.5164284976544664e-05, "loss": 0.7421, "num_input_tokens_seen": 10883976, "step": 18880 }, { "epoch": 2.8127792672028598, "grad_norm": 1.919651746749878, "learning_rate": 4.516044259083514e-05, "loss": 0.6129, "num_input_tokens_seen": 10886728, "step": 18885 }, { "epoch": 2.813523979743819, "grad_norm": 0.902476966381073, "learning_rate": 4.515659884276487e-05, "loss": 0.4869, "num_input_tokens_seen": 10889608, "step": 18890 }, { "epoch": 2.814268692284778, "grad_norm": 1.503339171409607, "learning_rate": 4.515275373259361e-05, "loss": 0.6479, "num_input_tokens_seen": 10892328, "step": 18895 }, { "epoch": 2.8150134048257374, "grad_norm": 0.9568871855735779, "learning_rate": 4.5148907260581185e-05, "loss": 0.6145, "num_input_tokens_seen": 10895304, "step": 18900 }, { "epoch": 2.8157581173666966, "grad_norm": 0.9083324074745178, "learning_rate": 4.5145059426987523e-05, "loss": 0.536, "num_input_tokens_seen": 10898184, "step": 18905 }, { "epoch": 2.8165028299076558, "grad_norm": 0.946006715297699, "learning_rate": 4.514121023207265e-05, "loss": 0.9269, "num_input_tokens_seen": 10900968, "step": 18910 }, { "epoch": 2.817247542448615, "grad_norm": 0.717819094657898, "learning_rate": 4.513735967609668e-05, "loss": 0.5173, "num_input_tokens_seen": 10903624, "step": 18915 }, { "epoch": 2.817992254989574, "grad_norm": 0.7488511204719543, "learning_rate": 4.5133507759319816e-05, "loss": 0.7203, "num_input_tokens_seen": 10906696, "step": 18920 }, { "epoch": 2.8187369675305334, "grad_norm": 1.1428886651992798, "learning_rate": 4.512965448200235e-05, "loss": 0.6988, "num_input_tokens_seen": 10909320, "step": 18925 }, { "epoch": 2.819481680071492, "grad_norm": 1.265830159187317, "learning_rate": 4.5125799844404683e-05, "loss": 0.6732, "num_input_tokens_seen": 10912008, "step": 18930 }, { "epoch": 2.820226392612452, "grad_norm": 1.1679317951202393, "learning_rate": 4.512194384678728e-05, "loss": 0.6576, "num_input_tokens_seen": 10914920, "step": 18935 }, { "epoch": 2.8209711051534105, "grad_norm": 1.4247835874557495, "learning_rate": 4.511808648941073e-05, "loss": 0.6642, "num_input_tokens_seen": 10917640, "step": 18940 }, { "epoch": 2.82171581769437, "grad_norm": 1.5677608251571655, "learning_rate": 4.511422777253568e-05, "loss": 0.7181, "num_input_tokens_seen": 10920520, "step": 18945 }, { "epoch": 2.822460530235329, "grad_norm": 1.1254079341888428, "learning_rate": 4.51103676964229e-05, "loss": 0.6838, "num_input_tokens_seen": 10923400, "step": 18950 }, { "epoch": 2.8232052427762886, "grad_norm": 1.6325379610061646, "learning_rate": 4.5106506261333234e-05, "loss": 0.6749, "num_input_tokens_seen": 10926312, "step": 18955 }, { "epoch": 2.8239499553172474, "grad_norm": 0.846470057964325, "learning_rate": 4.5102643467527616e-05, "loss": 0.7443, "num_input_tokens_seen": 10929096, "step": 18960 }, { "epoch": 2.824694667858207, "grad_norm": 2.0294699668884277, "learning_rate": 4.509877931526709e-05, "loss": 0.6173, "num_input_tokens_seen": 10931656, "step": 18965 }, { "epoch": 2.8254393803991658, "grad_norm": 1.3161500692367554, "learning_rate": 4.5094913804812776e-05, "loss": 0.6472, "num_input_tokens_seen": 10934408, "step": 18970 }, { "epoch": 2.826184092940125, "grad_norm": 1.0835225582122803, "learning_rate": 4.509104693642588e-05, "loss": 0.672, "num_input_tokens_seen": 10937160, "step": 18975 }, { "epoch": 2.826928805481084, "grad_norm": 0.9608485102653503, "learning_rate": 4.508717871036772e-05, "loss": 0.6692, "num_input_tokens_seen": 10940136, "step": 18980 }, { "epoch": 2.8276735180220434, "grad_norm": 1.098824381828308, "learning_rate": 4.508330912689969e-05, "loss": 0.6898, "num_input_tokens_seen": 10943080, "step": 18985 }, { "epoch": 2.8284182305630026, "grad_norm": 1.1206988096237183, "learning_rate": 4.5079438186283285e-05, "loss": 0.6789, "num_input_tokens_seen": 10945640, "step": 18990 }, { "epoch": 2.8291629431039618, "grad_norm": 0.9463983178138733, "learning_rate": 4.507556588878009e-05, "loss": 0.6553, "num_input_tokens_seen": 10948456, "step": 18995 }, { "epoch": 2.829907655644921, "grad_norm": 0.9641045928001404, "learning_rate": 4.5071692234651764e-05, "loss": 0.6335, "num_input_tokens_seen": 10951336, "step": 19000 }, { "epoch": 2.83065236818588, "grad_norm": 0.8256789445877075, "learning_rate": 4.506781722416008e-05, "loss": 0.5832, "num_input_tokens_seen": 10954312, "step": 19005 }, { "epoch": 2.8313970807268394, "grad_norm": 1.3578628301620483, "learning_rate": 4.5063940857566896e-05, "loss": 0.6795, "num_input_tokens_seen": 10957192, "step": 19010 }, { "epoch": 2.8321417932677986, "grad_norm": 1.0332742929458618, "learning_rate": 4.506006313513418e-05, "loss": 0.7242, "num_input_tokens_seen": 10959720, "step": 19015 }, { "epoch": 2.832886505808758, "grad_norm": 0.778188943862915, "learning_rate": 4.505618405712394e-05, "loss": 0.6329, "num_input_tokens_seen": 10962696, "step": 19020 }, { "epoch": 2.833631218349717, "grad_norm": 1.3870962858200073, "learning_rate": 4.505230362379833e-05, "loss": 0.7335, "num_input_tokens_seen": 10965672, "step": 19025 }, { "epoch": 2.834375930890676, "grad_norm": 1.329255223274231, "learning_rate": 4.504842183541956e-05, "loss": 0.6864, "num_input_tokens_seen": 10968616, "step": 19030 }, { "epoch": 2.8351206434316354, "grad_norm": 1.2731817960739136, "learning_rate": 4.5044538692249964e-05, "loss": 0.5469, "num_input_tokens_seen": 10971592, "step": 19035 }, { "epoch": 2.8358653559725946, "grad_norm": 0.9049384593963623, "learning_rate": 4.504065419455193e-05, "loss": 0.806, "num_input_tokens_seen": 10974696, "step": 19040 }, { "epoch": 2.836610068513554, "grad_norm": 0.9921809434890747, "learning_rate": 4.503676834258798e-05, "loss": 0.5983, "num_input_tokens_seen": 10977384, "step": 19045 }, { "epoch": 2.837354781054513, "grad_norm": 0.8068268299102783, "learning_rate": 4.503288113662068e-05, "loss": 0.6415, "num_input_tokens_seen": 10980264, "step": 19050 }, { "epoch": 2.838099493595472, "grad_norm": 0.8630329370498657, "learning_rate": 4.5028992576912714e-05, "loss": 0.6579, "num_input_tokens_seen": 10983112, "step": 19055 }, { "epoch": 2.8388442061364314, "grad_norm": 2.636171340942383, "learning_rate": 4.502510266372687e-05, "loss": 0.7824, "num_input_tokens_seen": 10985960, "step": 19060 }, { "epoch": 2.8395889186773906, "grad_norm": 0.6501364707946777, "learning_rate": 4.5021211397326e-05, "loss": 0.5099, "num_input_tokens_seen": 10988968, "step": 19065 }, { "epoch": 2.84033363121835, "grad_norm": 1.2563012838363647, "learning_rate": 4.501731877797306e-05, "loss": 0.7511, "num_input_tokens_seen": 10991880, "step": 19070 }, { "epoch": 2.841078343759309, "grad_norm": 0.8487236499786377, "learning_rate": 4.5013424805931104e-05, "loss": 0.7486, "num_input_tokens_seen": 10994632, "step": 19075 }, { "epoch": 2.841823056300268, "grad_norm": 1.498524785041809, "learning_rate": 4.5009529481463274e-05, "loss": 0.8078, "num_input_tokens_seen": 10997384, "step": 19080 }, { "epoch": 2.8425677688412274, "grad_norm": 0.5615072846412659, "learning_rate": 4.5005632804832786e-05, "loss": 0.639, "num_input_tokens_seen": 11000168, "step": 19085 }, { "epoch": 2.8433124813821866, "grad_norm": 0.9439075589179993, "learning_rate": 4.500173477630298e-05, "loss": 0.66, "num_input_tokens_seen": 11003144, "step": 19090 }, { "epoch": 2.844057193923146, "grad_norm": 1.1519643068313599, "learning_rate": 4.499783539613726e-05, "loss": 0.8041, "num_input_tokens_seen": 11006376, "step": 19095 }, { "epoch": 2.844801906464105, "grad_norm": 0.801814615726471, "learning_rate": 4.4993934664599116e-05, "loss": 0.6417, "num_input_tokens_seen": 11009288, "step": 19100 }, { "epoch": 2.845546619005064, "grad_norm": 0.8437778949737549, "learning_rate": 4.4990032581952166e-05, "loss": 0.7184, "num_input_tokens_seen": 11012232, "step": 19105 }, { "epoch": 2.8462913315460234, "grad_norm": 0.7369676828384399, "learning_rate": 4.498612914846008e-05, "loss": 0.5898, "num_input_tokens_seen": 11015144, "step": 19110 }, { "epoch": 2.847036044086982, "grad_norm": 0.7533058524131775, "learning_rate": 4.498222436438665e-05, "loss": 0.6563, "num_input_tokens_seen": 11018152, "step": 19115 }, { "epoch": 2.847780756627942, "grad_norm": 0.9359293580055237, "learning_rate": 4.497831822999574e-05, "loss": 0.8028, "num_input_tokens_seen": 11021096, "step": 19120 }, { "epoch": 2.8485254691689006, "grad_norm": 1.7911792993545532, "learning_rate": 4.497441074555131e-05, "loss": 0.8063, "num_input_tokens_seen": 11023944, "step": 19125 }, { "epoch": 2.8492701817098602, "grad_norm": 1.1656582355499268, "learning_rate": 4.497050191131741e-05, "loss": 0.6133, "num_input_tokens_seen": 11026696, "step": 19130 }, { "epoch": 2.850014894250819, "grad_norm": 0.9456230401992798, "learning_rate": 4.4966591727558184e-05, "loss": 0.6381, "num_input_tokens_seen": 11029864, "step": 19135 }, { "epoch": 2.8507596067917786, "grad_norm": 1.0954644680023193, "learning_rate": 4.496268019453787e-05, "loss": 0.8244, "num_input_tokens_seen": 11033256, "step": 19140 }, { "epoch": 2.8515043193327374, "grad_norm": 0.6785944104194641, "learning_rate": 4.495876731252079e-05, "loss": 0.6974, "num_input_tokens_seen": 11036392, "step": 19145 }, { "epoch": 2.8522490318736966, "grad_norm": 0.9165390133857727, "learning_rate": 4.495485308177136e-05, "loss": 0.6813, "num_input_tokens_seen": 11039176, "step": 19150 }, { "epoch": 2.852993744414656, "grad_norm": 0.9495022892951965, "learning_rate": 4.495093750255409e-05, "loss": 0.6211, "num_input_tokens_seen": 11042280, "step": 19155 }, { "epoch": 2.853738456955615, "grad_norm": 0.6724903583526611, "learning_rate": 4.494702057513358e-05, "loss": 0.6929, "num_input_tokens_seen": 11045000, "step": 19160 }, { "epoch": 2.854483169496574, "grad_norm": 0.8645871877670288, "learning_rate": 4.4943102299774513e-05, "loss": 0.6047, "num_input_tokens_seen": 11048200, "step": 19165 }, { "epoch": 2.8552278820375334, "grad_norm": 0.5191901922225952, "learning_rate": 4.493918267674168e-05, "loss": 0.637, "num_input_tokens_seen": 11051016, "step": 19170 }, { "epoch": 2.8559725945784926, "grad_norm": 1.111014485359192, "learning_rate": 4.4935261706299944e-05, "loss": 0.7688, "num_input_tokens_seen": 11053928, "step": 19175 }, { "epoch": 2.856717307119452, "grad_norm": 0.695220947265625, "learning_rate": 4.4931339388714276e-05, "loss": 0.5514, "num_input_tokens_seen": 11056744, "step": 19180 }, { "epoch": 2.857462019660411, "grad_norm": 0.9217128753662109, "learning_rate": 4.4927415724249735e-05, "loss": 0.6926, "num_input_tokens_seen": 11059368, "step": 19185 }, { "epoch": 2.8582067322013702, "grad_norm": 1.4104197025299072, "learning_rate": 4.492349071317145e-05, "loss": 0.6936, "num_input_tokens_seen": 11062088, "step": 19190 }, { "epoch": 2.8589514447423294, "grad_norm": 1.0789356231689453, "learning_rate": 4.491956435574466e-05, "loss": 0.6003, "num_input_tokens_seen": 11064936, "step": 19195 }, { "epoch": 2.8596961572832886, "grad_norm": 0.8540059924125671, "learning_rate": 4.491563665223471e-05, "loss": 0.7297, "num_input_tokens_seen": 11067624, "step": 19200 }, { "epoch": 2.860440869824248, "grad_norm": 0.730152428150177, "learning_rate": 4.491170760290699e-05, "loss": 0.8199, "num_input_tokens_seen": 11070600, "step": 19205 }, { "epoch": 2.861185582365207, "grad_norm": 1.0605615377426147, "learning_rate": 4.4907777208027044e-05, "loss": 0.7355, "num_input_tokens_seen": 11073608, "step": 19210 }, { "epoch": 2.8619302949061662, "grad_norm": 0.5617567896842957, "learning_rate": 4.490384546786044e-05, "loss": 0.6853, "num_input_tokens_seen": 11076328, "step": 19215 }, { "epoch": 2.8626750074471254, "grad_norm": 0.995877206325531, "learning_rate": 4.489991238267289e-05, "loss": 0.6445, "num_input_tokens_seen": 11078984, "step": 19220 }, { "epoch": 2.8634197199880846, "grad_norm": 1.0233025550842285, "learning_rate": 4.489597795273016e-05, "loss": 0.6441, "num_input_tokens_seen": 11081800, "step": 19225 }, { "epoch": 2.864164432529044, "grad_norm": 0.9685698747634888, "learning_rate": 4.4892042178298136e-05, "loss": 0.6584, "num_input_tokens_seen": 11084936, "step": 19230 }, { "epoch": 2.864909145070003, "grad_norm": 1.7963522672653198, "learning_rate": 4.488810505964278e-05, "loss": 0.8239, "num_input_tokens_seen": 11087848, "step": 19235 }, { "epoch": 2.8656538576109623, "grad_norm": 1.0518503189086914, "learning_rate": 4.488416659703014e-05, "loss": 0.7525, "num_input_tokens_seen": 11090632, "step": 19240 }, { "epoch": 2.8663985701519215, "grad_norm": 0.7891435623168945, "learning_rate": 4.4880226790726366e-05, "loss": 0.6153, "num_input_tokens_seen": 11093640, "step": 19245 }, { "epoch": 2.8671432826928807, "grad_norm": 0.9274939298629761, "learning_rate": 4.4876285640997694e-05, "loss": 0.5943, "num_input_tokens_seen": 11096168, "step": 19250 }, { "epoch": 2.86788799523384, "grad_norm": 1.2720884084701538, "learning_rate": 4.487234314811044e-05, "loss": 0.6046, "num_input_tokens_seen": 11098920, "step": 19255 }, { "epoch": 2.868632707774799, "grad_norm": 1.2940598726272583, "learning_rate": 4.486839931233104e-05, "loss": 0.6619, "num_input_tokens_seen": 11101896, "step": 19260 }, { "epoch": 2.8693774203157583, "grad_norm": 1.1039156913757324, "learning_rate": 4.486445413392599e-05, "loss": 0.6768, "num_input_tokens_seen": 11104936, "step": 19265 }, { "epoch": 2.8701221328567175, "grad_norm": 1.2445629835128784, "learning_rate": 4.48605076131619e-05, "loss": 0.559, "num_input_tokens_seen": 11107816, "step": 19270 }, { "epoch": 2.8708668453976767, "grad_norm": 0.863200306892395, "learning_rate": 4.485655975030545e-05, "loss": 0.6214, "num_input_tokens_seen": 11110440, "step": 19275 }, { "epoch": 2.8716115579386354, "grad_norm": 0.7985267639160156, "learning_rate": 4.485261054562342e-05, "loss": 0.7715, "num_input_tokens_seen": 11113288, "step": 19280 }, { "epoch": 2.872356270479595, "grad_norm": 0.5401064157485962, "learning_rate": 4.484865999938268e-05, "loss": 0.635, "num_input_tokens_seen": 11116008, "step": 19285 }, { "epoch": 2.873100983020554, "grad_norm": 1.3990590572357178, "learning_rate": 4.4844708111850195e-05, "loss": 0.6744, "num_input_tokens_seen": 11119016, "step": 19290 }, { "epoch": 2.8738456955615135, "grad_norm": 1.8194681406021118, "learning_rate": 4.4840754883293025e-05, "loss": 0.7233, "num_input_tokens_seen": 11121768, "step": 19295 }, { "epoch": 2.8745904081024722, "grad_norm": 0.985732913017273, "learning_rate": 4.483680031397831e-05, "loss": 0.5869, "num_input_tokens_seen": 11124552, "step": 19300 }, { "epoch": 2.875335120643432, "grad_norm": 0.570322573184967, "learning_rate": 4.4832844404173266e-05, "loss": 0.6719, "num_input_tokens_seen": 11127464, "step": 19305 }, { "epoch": 2.8760798331843906, "grad_norm": 0.8656805753707886, "learning_rate": 4.482888715414525e-05, "loss": 0.7438, "num_input_tokens_seen": 11130472, "step": 19310 }, { "epoch": 2.8768245457253503, "grad_norm": 0.7990824580192566, "learning_rate": 4.482492856416165e-05, "loss": 0.6187, "num_input_tokens_seen": 11133064, "step": 19315 }, { "epoch": 2.877569258266309, "grad_norm": 1.3514440059661865, "learning_rate": 4.482096863448998e-05, "loss": 0.6081, "num_input_tokens_seen": 11136136, "step": 19320 }, { "epoch": 2.8783139708072683, "grad_norm": 0.8668344020843506, "learning_rate": 4.481700736539784e-05, "loss": 0.6194, "num_input_tokens_seen": 11138984, "step": 19325 }, { "epoch": 2.8790586833482275, "grad_norm": 0.8734899759292603, "learning_rate": 4.48130447571529e-05, "loss": 0.6274, "num_input_tokens_seen": 11141992, "step": 19330 }, { "epoch": 2.8798033958891867, "grad_norm": 0.8317489624023438, "learning_rate": 4.480908081002296e-05, "loss": 0.4918, "num_input_tokens_seen": 11144872, "step": 19335 }, { "epoch": 2.880548108430146, "grad_norm": 1.101016640663147, "learning_rate": 4.480511552427587e-05, "loss": 0.7153, "num_input_tokens_seen": 11147848, "step": 19340 }, { "epoch": 2.881292820971105, "grad_norm": 1.033787488937378, "learning_rate": 4.48011489001796e-05, "loss": 0.6557, "num_input_tokens_seen": 11150536, "step": 19345 }, { "epoch": 2.8820375335120643, "grad_norm": 0.6213229298591614, "learning_rate": 4.479718093800219e-05, "loss": 0.5512, "num_input_tokens_seen": 11153448, "step": 19350 }, { "epoch": 2.8827822460530235, "grad_norm": 0.8235899806022644, "learning_rate": 4.4793211638011786e-05, "loss": 0.6157, "num_input_tokens_seen": 11156040, "step": 19355 }, { "epoch": 2.8835269585939827, "grad_norm": 0.8122440576553345, "learning_rate": 4.478924100047661e-05, "loss": 0.5591, "num_input_tokens_seen": 11159112, "step": 19360 }, { "epoch": 2.884271671134942, "grad_norm": 1.2378733158111572, "learning_rate": 4.478526902566498e-05, "loss": 0.8111, "num_input_tokens_seen": 11162088, "step": 19365 }, { "epoch": 2.885016383675901, "grad_norm": 1.6480932235717773, "learning_rate": 4.4781295713845314e-05, "loss": 0.5491, "num_input_tokens_seen": 11164936, "step": 19370 }, { "epoch": 2.8857610962168603, "grad_norm": 0.9511135816574097, "learning_rate": 4.477732106528611e-05, "loss": 0.6925, "num_input_tokens_seen": 11167784, "step": 19375 }, { "epoch": 2.8865058087578195, "grad_norm": 1.12169349193573, "learning_rate": 4.4773345080255955e-05, "loss": 0.7392, "num_input_tokens_seen": 11171144, "step": 19380 }, { "epoch": 2.8872505212987787, "grad_norm": 1.3594086170196533, "learning_rate": 4.4769367759023536e-05, "loss": 0.6984, "num_input_tokens_seen": 11174312, "step": 19385 }, { "epoch": 2.887995233839738, "grad_norm": 1.2868703603744507, "learning_rate": 4.4765389101857616e-05, "loss": 0.6656, "num_input_tokens_seen": 11177704, "step": 19390 }, { "epoch": 2.888739946380697, "grad_norm": 2.1069657802581787, "learning_rate": 4.4761409109027065e-05, "loss": 0.6187, "num_input_tokens_seen": 11180808, "step": 19395 }, { "epoch": 2.8894846589216563, "grad_norm": 1.0312294960021973, "learning_rate": 4.4757427780800825e-05, "loss": 0.7377, "num_input_tokens_seen": 11183656, "step": 19400 }, { "epoch": 2.8902293714626155, "grad_norm": 1.0307855606079102, "learning_rate": 4.475344511744794e-05, "loss": 0.6785, "num_input_tokens_seen": 11186568, "step": 19405 }, { "epoch": 2.8909740840035747, "grad_norm": 1.3524271249771118, "learning_rate": 4.4749461119237555e-05, "loss": 0.682, "num_input_tokens_seen": 11189224, "step": 19410 }, { "epoch": 2.891718796544534, "grad_norm": 1.342576265335083, "learning_rate": 4.4745475786438886e-05, "loss": 0.7662, "num_input_tokens_seen": 11192072, "step": 19415 }, { "epoch": 2.892463509085493, "grad_norm": 1.7528700828552246, "learning_rate": 4.4741489119321235e-05, "loss": 0.689, "num_input_tokens_seen": 11195304, "step": 19420 }, { "epoch": 2.8932082216264523, "grad_norm": 0.7833573222160339, "learning_rate": 4.4737501118154014e-05, "loss": 0.6073, "num_input_tokens_seen": 11198248, "step": 19425 }, { "epoch": 2.8939529341674115, "grad_norm": 1.6990092992782593, "learning_rate": 4.473351178320671e-05, "loss": 0.7378, "num_input_tokens_seen": 11200872, "step": 19430 }, { "epoch": 2.8946976467083707, "grad_norm": 1.4060053825378418, "learning_rate": 4.472952111474892e-05, "loss": 0.6804, "num_input_tokens_seen": 11203688, "step": 19435 }, { "epoch": 2.89544235924933, "grad_norm": 0.8303520083427429, "learning_rate": 4.47255291130503e-05, "loss": 0.6558, "num_input_tokens_seen": 11206728, "step": 19440 }, { "epoch": 2.896187071790289, "grad_norm": 0.7828250527381897, "learning_rate": 4.472153577838062e-05, "loss": 0.6568, "num_input_tokens_seen": 11209544, "step": 19445 }, { "epoch": 2.8969317843312483, "grad_norm": 1.3726160526275635, "learning_rate": 4.471754111100974e-05, "loss": 0.8088, "num_input_tokens_seen": 11212328, "step": 19450 }, { "epoch": 2.897676496872207, "grad_norm": 1.286176085472107, "learning_rate": 4.471354511120759e-05, "loss": 0.7508, "num_input_tokens_seen": 11215016, "step": 19455 }, { "epoch": 2.8984212094131667, "grad_norm": 1.1178025007247925, "learning_rate": 4.470954777924421e-05, "loss": 0.7926, "num_input_tokens_seen": 11217736, "step": 19460 }, { "epoch": 2.8991659219541255, "grad_norm": 1.8196732997894287, "learning_rate": 4.4705549115389735e-05, "loss": 0.5963, "num_input_tokens_seen": 11220616, "step": 19465 }, { "epoch": 2.899910634495085, "grad_norm": 0.8988816738128662, "learning_rate": 4.470154911991435e-05, "loss": 0.623, "num_input_tokens_seen": 11223336, "step": 19470 }, { "epoch": 2.900655347036044, "grad_norm": 0.5783331990242004, "learning_rate": 4.469754779308839e-05, "loss": 0.6382, "num_input_tokens_seen": 11226376, "step": 19475 }, { "epoch": 2.9014000595770035, "grad_norm": 0.8808916807174683, "learning_rate": 4.4693545135182235e-05, "loss": 0.7202, "num_input_tokens_seen": 11229000, "step": 19480 }, { "epoch": 2.9021447721179623, "grad_norm": 0.711961567401886, "learning_rate": 4.468954114646637e-05, "loss": 0.7024, "num_input_tokens_seen": 11232200, "step": 19485 }, { "epoch": 2.9028894846589215, "grad_norm": 0.9781434535980225, "learning_rate": 4.468553582721135e-05, "loss": 0.6159, "num_input_tokens_seen": 11235048, "step": 19490 }, { "epoch": 2.9036341971998807, "grad_norm": 0.9340882301330566, "learning_rate": 4.4681529177687876e-05, "loss": 0.6754, "num_input_tokens_seen": 11237768, "step": 19495 }, { "epoch": 2.90437890974084, "grad_norm": 0.7785186171531677, "learning_rate": 4.467752119816667e-05, "loss": 0.6632, "num_input_tokens_seen": 11240680, "step": 19500 }, { "epoch": 2.905123622281799, "grad_norm": 0.6014583706855774, "learning_rate": 4.467351188891858e-05, "loss": 0.5289, "num_input_tokens_seen": 11243688, "step": 19505 }, { "epoch": 2.9058683348227583, "grad_norm": 0.9716966152191162, "learning_rate": 4.466950125021455e-05, "loss": 0.4815, "num_input_tokens_seen": 11246408, "step": 19510 }, { "epoch": 2.9066130473637175, "grad_norm": 1.0739752054214478, "learning_rate": 4.466548928232559e-05, "loss": 0.7512, "num_input_tokens_seen": 11249512, "step": 19515 }, { "epoch": 2.9073577599046767, "grad_norm": 1.1802695989608765, "learning_rate": 4.4661475985522825e-05, "loss": 0.657, "num_input_tokens_seen": 11252392, "step": 19520 }, { "epoch": 2.908102472445636, "grad_norm": 0.9864798188209534, "learning_rate": 4.4657461360077444e-05, "loss": 0.5859, "num_input_tokens_seen": 11255240, "step": 19525 }, { "epoch": 2.908847184986595, "grad_norm": 0.7003116011619568, "learning_rate": 4.4653445406260744e-05, "loss": 0.5207, "num_input_tokens_seen": 11257928, "step": 19530 }, { "epoch": 2.9095918975275543, "grad_norm": 1.132622480392456, "learning_rate": 4.4649428124344114e-05, "loss": 0.7392, "num_input_tokens_seen": 11260744, "step": 19535 }, { "epoch": 2.9103366100685135, "grad_norm": 1.3301739692687988, "learning_rate": 4.464540951459902e-05, "loss": 0.6362, "num_input_tokens_seen": 11263560, "step": 19540 }, { "epoch": 2.9110813226094727, "grad_norm": 0.6171097755432129, "learning_rate": 4.464138957729702e-05, "loss": 0.5321, "num_input_tokens_seen": 11266120, "step": 19545 }, { "epoch": 2.911826035150432, "grad_norm": 0.9812262654304504, "learning_rate": 4.463736831270977e-05, "loss": 0.6835, "num_input_tokens_seen": 11269032, "step": 19550 }, { "epoch": 2.912570747691391, "grad_norm": 1.2956955432891846, "learning_rate": 4.463334572110901e-05, "loss": 0.7322, "num_input_tokens_seen": 11271976, "step": 19555 }, { "epoch": 2.9133154602323503, "grad_norm": 1.6829237937927246, "learning_rate": 4.462932180276657e-05, "loss": 0.6879, "num_input_tokens_seen": 11275112, "step": 19560 }, { "epoch": 2.9140601727733095, "grad_norm": 1.014389157295227, "learning_rate": 4.462529655795437e-05, "loss": 0.5367, "num_input_tokens_seen": 11277864, "step": 19565 }, { "epoch": 2.9148048853142687, "grad_norm": 1.472580909729004, "learning_rate": 4.462126998694442e-05, "loss": 0.6477, "num_input_tokens_seen": 11280648, "step": 19570 }, { "epoch": 2.915549597855228, "grad_norm": 2.09177565574646, "learning_rate": 4.4617242090008816e-05, "loss": 0.8943, "num_input_tokens_seen": 11283304, "step": 19575 }, { "epoch": 2.916294310396187, "grad_norm": 2.1766767501831055, "learning_rate": 4.461321286741975e-05, "loss": 0.7892, "num_input_tokens_seen": 11286120, "step": 19580 }, { "epoch": 2.9170390229371463, "grad_norm": 0.8149007558822632, "learning_rate": 4.46091823194495e-05, "loss": 0.676, "num_input_tokens_seen": 11288904, "step": 19585 }, { "epoch": 2.9177837354781055, "grad_norm": 1.4352003335952759, "learning_rate": 4.460515044637043e-05, "loss": 0.7358, "num_input_tokens_seen": 11291688, "step": 19590 }, { "epoch": 2.9185284480190647, "grad_norm": 1.2974517345428467, "learning_rate": 4.460111724845501e-05, "loss": 0.6913, "num_input_tokens_seen": 11294888, "step": 19595 }, { "epoch": 2.919273160560024, "grad_norm": 0.852523922920227, "learning_rate": 4.4597082725975775e-05, "loss": 0.6813, "num_input_tokens_seen": 11297704, "step": 19600 }, { "epoch": 2.920017873100983, "grad_norm": 0.9647505283355713, "learning_rate": 4.459304687920536e-05, "loss": 0.6343, "num_input_tokens_seen": 11300680, "step": 19605 }, { "epoch": 2.9207625856419424, "grad_norm": 1.4565393924713135, "learning_rate": 4.458900970841651e-05, "loss": 0.5573, "num_input_tokens_seen": 11303592, "step": 19610 }, { "epoch": 2.9215072981829016, "grad_norm": 1.0709329843521118, "learning_rate": 4.4584971213882014e-05, "loss": 0.5805, "num_input_tokens_seen": 11306216, "step": 19615 }, { "epoch": 2.9222520107238603, "grad_norm": 1.2324154376983643, "learning_rate": 4.458093139587479e-05, "loss": 0.6256, "num_input_tokens_seen": 11309064, "step": 19620 }, { "epoch": 2.92299672326482, "grad_norm": 0.8050496578216553, "learning_rate": 4.4576890254667844e-05, "loss": 0.7903, "num_input_tokens_seen": 11311848, "step": 19625 }, { "epoch": 2.9237414358057787, "grad_norm": 1.9044406414031982, "learning_rate": 4.457284779053423e-05, "loss": 0.6465, "num_input_tokens_seen": 11314792, "step": 19630 }, { "epoch": 2.9244861483467384, "grad_norm": 1.8197321891784668, "learning_rate": 4.4568804003747155e-05, "loss": 0.7324, "num_input_tokens_seen": 11317416, "step": 19635 }, { "epoch": 2.925230860887697, "grad_norm": 1.1374614238739014, "learning_rate": 4.4564758894579863e-05, "loss": 0.6521, "num_input_tokens_seen": 11320232, "step": 19640 }, { "epoch": 2.9259755734286568, "grad_norm": 1.05219304561615, "learning_rate": 4.456071246330571e-05, "loss": 0.577, "num_input_tokens_seen": 11323304, "step": 19645 }, { "epoch": 2.9267202859696155, "grad_norm": 1.3115907907485962, "learning_rate": 4.455666471019814e-05, "loss": 0.6795, "num_input_tokens_seen": 11326216, "step": 19650 }, { "epoch": 2.927464998510575, "grad_norm": 1.0531960725784302, "learning_rate": 4.455261563553067e-05, "loss": 0.6471, "num_input_tokens_seen": 11329288, "step": 19655 }, { "epoch": 2.928209711051534, "grad_norm": 1.261824607849121, "learning_rate": 4.454856523957694e-05, "loss": 0.7028, "num_input_tokens_seen": 11332136, "step": 19660 }, { "epoch": 2.928954423592493, "grad_norm": 1.5612989664077759, "learning_rate": 4.4544513522610644e-05, "loss": 0.7237, "num_input_tokens_seen": 11334952, "step": 19665 }, { "epoch": 2.9296991361334523, "grad_norm": 0.9767444729804993, "learning_rate": 4.454046048490559e-05, "loss": 0.5897, "num_input_tokens_seen": 11337736, "step": 19670 }, { "epoch": 2.9304438486744115, "grad_norm": 2.5189766883850098, "learning_rate": 4.4536406126735664e-05, "loss": 0.7176, "num_input_tokens_seen": 11340552, "step": 19675 }, { "epoch": 2.9311885612153707, "grad_norm": 0.8675082921981812, "learning_rate": 4.4532350448374835e-05, "loss": 0.6776, "num_input_tokens_seen": 11343464, "step": 19680 }, { "epoch": 2.93193327375633, "grad_norm": 0.7341532111167908, "learning_rate": 4.452829345009718e-05, "loss": 0.7148, "num_input_tokens_seen": 11346312, "step": 19685 }, { "epoch": 2.932677986297289, "grad_norm": 1.4438517093658447, "learning_rate": 4.452423513217685e-05, "loss": 0.6864, "num_input_tokens_seen": 11349192, "step": 19690 }, { "epoch": 2.9334226988382484, "grad_norm": 1.680467963218689, "learning_rate": 4.4520175494888086e-05, "loss": 0.6214, "num_input_tokens_seen": 11352008, "step": 19695 }, { "epoch": 2.9341674113792076, "grad_norm": 1.0716239213943481, "learning_rate": 4.4516114538505225e-05, "loss": 0.6006, "num_input_tokens_seen": 11354984, "step": 19700 }, { "epoch": 2.9349121239201668, "grad_norm": 1.0687389373779297, "learning_rate": 4.45120522633027e-05, "loss": 0.6244, "num_input_tokens_seen": 11357864, "step": 19705 }, { "epoch": 2.935656836461126, "grad_norm": 0.7080582976341248, "learning_rate": 4.4507988669555e-05, "loss": 0.8144, "num_input_tokens_seen": 11360552, "step": 19710 }, { "epoch": 2.936401549002085, "grad_norm": 1.101163625717163, "learning_rate": 4.450392375753675e-05, "loss": 0.6137, "num_input_tokens_seen": 11363208, "step": 19715 }, { "epoch": 2.9371462615430444, "grad_norm": 0.7443325519561768, "learning_rate": 4.449985752752261e-05, "loss": 0.8258, "num_input_tokens_seen": 11366152, "step": 19720 }, { "epoch": 2.9378909740840036, "grad_norm": 0.9569767713546753, "learning_rate": 4.44957899797874e-05, "loss": 0.734, "num_input_tokens_seen": 11369000, "step": 19725 }, { "epoch": 2.9386356866249628, "grad_norm": 0.854658842086792, "learning_rate": 4.449172111460597e-05, "loss": 0.69, "num_input_tokens_seen": 11371784, "step": 19730 }, { "epoch": 2.939380399165922, "grad_norm": 0.8964347243309021, "learning_rate": 4.448765093225326e-05, "loss": 0.7051, "num_input_tokens_seen": 11374856, "step": 19735 }, { "epoch": 2.940125111706881, "grad_norm": 0.9795399308204651, "learning_rate": 4.448357943300434e-05, "loss": 0.6556, "num_input_tokens_seen": 11377800, "step": 19740 }, { "epoch": 2.9408698242478404, "grad_norm": 1.129285454750061, "learning_rate": 4.4479506617134324e-05, "loss": 0.594, "num_input_tokens_seen": 11380360, "step": 19745 }, { "epoch": 2.9416145367887996, "grad_norm": 0.7827918529510498, "learning_rate": 4.447543248491846e-05, "loss": 0.6536, "num_input_tokens_seen": 11383624, "step": 19750 }, { "epoch": 2.942359249329759, "grad_norm": 0.8583479523658752, "learning_rate": 4.447135703663205e-05, "loss": 0.7118, "num_input_tokens_seen": 11386568, "step": 19755 }, { "epoch": 2.943103961870718, "grad_norm": 0.6757004857063293, "learning_rate": 4.4467280272550495e-05, "loss": 0.6514, "num_input_tokens_seen": 11389608, "step": 19760 }, { "epoch": 2.943848674411677, "grad_norm": 1.1547036170959473, "learning_rate": 4.4463202192949284e-05, "loss": 0.6279, "num_input_tokens_seen": 11393768, "step": 19765 }, { "epoch": 2.9445933869526364, "grad_norm": 0.8918572068214417, "learning_rate": 4.4459122798104004e-05, "loss": 0.6903, "num_input_tokens_seen": 11396840, "step": 19770 }, { "epoch": 2.9453380994935956, "grad_norm": 0.7272635698318481, "learning_rate": 4.445504208829032e-05, "loss": 0.7113, "num_input_tokens_seen": 11399848, "step": 19775 }, { "epoch": 2.946082812034555, "grad_norm": 0.9547656178474426, "learning_rate": 4.445096006378399e-05, "loss": 0.7173, "num_input_tokens_seen": 11403016, "step": 19780 }, { "epoch": 2.946827524575514, "grad_norm": 0.861368715763092, "learning_rate": 4.4446876724860856e-05, "loss": 0.7427, "num_input_tokens_seen": 11405960, "step": 19785 }, { "epoch": 2.947572237116473, "grad_norm": 0.9489191174507141, "learning_rate": 4.444279207179687e-05, "loss": 0.6333, "num_input_tokens_seen": 11408776, "step": 19790 }, { "epoch": 2.948316949657432, "grad_norm": 1.0838369131088257, "learning_rate": 4.443870610486803e-05, "loss": 0.5618, "num_input_tokens_seen": 11411784, "step": 19795 }, { "epoch": 2.9490616621983916, "grad_norm": 0.9453417658805847, "learning_rate": 4.4434618824350475e-05, "loss": 0.7268, "num_input_tokens_seen": 11414248, "step": 19800 }, { "epoch": 2.9498063747393504, "grad_norm": 0.940290629863739, "learning_rate": 4.4430530230520386e-05, "loss": 0.5883, "num_input_tokens_seen": 11417032, "step": 19805 }, { "epoch": 2.95055108728031, "grad_norm": 1.1103920936584473, "learning_rate": 4.442644032365407e-05, "loss": 0.8058, "num_input_tokens_seen": 11419784, "step": 19810 }, { "epoch": 2.9512957998212688, "grad_norm": 0.6441141963005066, "learning_rate": 4.4422349104027895e-05, "loss": 0.6455, "num_input_tokens_seen": 11422792, "step": 19815 }, { "epoch": 2.9520405123622284, "grad_norm": 1.1318038702011108, "learning_rate": 4.4418256571918334e-05, "loss": 0.549, "num_input_tokens_seen": 11425736, "step": 19820 }, { "epoch": 2.952785224903187, "grad_norm": 0.5151248574256897, "learning_rate": 4.441416272760194e-05, "loss": 0.6279, "num_input_tokens_seen": 11428360, "step": 19825 }, { "epoch": 2.953529937444147, "grad_norm": 0.8258612751960754, "learning_rate": 4.441006757135536e-05, "loss": 0.6964, "num_input_tokens_seen": 11431176, "step": 19830 }, { "epoch": 2.9542746499851056, "grad_norm": 0.8336408734321594, "learning_rate": 4.440597110345533e-05, "loss": 0.6761, "num_input_tokens_seen": 11434184, "step": 19835 }, { "epoch": 2.955019362526065, "grad_norm": 0.6480546593666077, "learning_rate": 4.4401873324178684e-05, "loss": 0.5807, "num_input_tokens_seen": 11437224, "step": 19840 }, { "epoch": 2.955764075067024, "grad_norm": 1.0748807191848755, "learning_rate": 4.439777423380231e-05, "loss": 0.6306, "num_input_tokens_seen": 11439816, "step": 19845 }, { "epoch": 2.956508787607983, "grad_norm": 1.47990882396698, "learning_rate": 4.439367383260322e-05, "loss": 0.6448, "num_input_tokens_seen": 11442536, "step": 19850 }, { "epoch": 2.9572535001489424, "grad_norm": 0.8697648644447327, "learning_rate": 4.4389572120858506e-05, "loss": 0.6099, "num_input_tokens_seen": 11445800, "step": 19855 }, { "epoch": 2.9579982126899016, "grad_norm": 2.222196578979492, "learning_rate": 4.4385469098845335e-05, "loss": 0.6671, "num_input_tokens_seen": 11448424, "step": 19860 }, { "epoch": 2.958742925230861, "grad_norm": 0.8020687103271484, "learning_rate": 4.438136476684098e-05, "loss": 0.4793, "num_input_tokens_seen": 11451240, "step": 19865 }, { "epoch": 2.95948763777182, "grad_norm": 1.3522955179214478, "learning_rate": 4.4377259125122786e-05, "loss": 0.7508, "num_input_tokens_seen": 11454184, "step": 19870 }, { "epoch": 2.960232350312779, "grad_norm": 1.581499695777893, "learning_rate": 4.4373152173968214e-05, "loss": 0.7261, "num_input_tokens_seen": 11456712, "step": 19875 }, { "epoch": 2.9609770628537384, "grad_norm": 0.872229814529419, "learning_rate": 4.436904391365477e-05, "loss": 0.5706, "num_input_tokens_seen": 11459656, "step": 19880 }, { "epoch": 2.9617217753946976, "grad_norm": 0.9500236511230469, "learning_rate": 4.43649343444601e-05, "loss": 0.6007, "num_input_tokens_seen": 11462440, "step": 19885 }, { "epoch": 2.962466487935657, "grad_norm": 0.6357467174530029, "learning_rate": 4.436082346666189e-05, "loss": 0.6708, "num_input_tokens_seen": 11465288, "step": 19890 }, { "epoch": 2.963211200476616, "grad_norm": 0.9773467183113098, "learning_rate": 4.4356711280537954e-05, "loss": 0.6956, "num_input_tokens_seen": 11468392, "step": 19895 }, { "epoch": 2.963955913017575, "grad_norm": 0.8387681841850281, "learning_rate": 4.435259778636617e-05, "loss": 0.8374, "num_input_tokens_seen": 11471272, "step": 19900 }, { "epoch": 2.9647006255585344, "grad_norm": 0.7873245477676392, "learning_rate": 4.43484829844245e-05, "loss": 0.5705, "num_input_tokens_seen": 11474216, "step": 19905 }, { "epoch": 2.9654453380994936, "grad_norm": 0.5363818407058716, "learning_rate": 4.434436687499102e-05, "loss": 0.5447, "num_input_tokens_seen": 11476776, "step": 19910 }, { "epoch": 2.966190050640453, "grad_norm": 1.157434105873108, "learning_rate": 4.434024945834387e-05, "loss": 0.6428, "num_input_tokens_seen": 11479592, "step": 19915 }, { "epoch": 2.966934763181412, "grad_norm": 1.0344102382659912, "learning_rate": 4.43361307347613e-05, "loss": 0.5973, "num_input_tokens_seen": 11482632, "step": 19920 }, { "epoch": 2.9676794757223712, "grad_norm": 0.824231743812561, "learning_rate": 4.433201070452163e-05, "loss": 0.6443, "num_input_tokens_seen": 11485736, "step": 19925 }, { "epoch": 2.9684241882633304, "grad_norm": 2.974184036254883, "learning_rate": 4.432788936790327e-05, "loss": 0.6972, "num_input_tokens_seen": 11488456, "step": 19930 }, { "epoch": 2.9691689008042896, "grad_norm": 1.3390662670135498, "learning_rate": 4.432376672518473e-05, "loss": 0.7438, "num_input_tokens_seen": 11491528, "step": 19935 }, { "epoch": 2.969913613345249, "grad_norm": 0.7671281695365906, "learning_rate": 4.43196427766446e-05, "loss": 0.7622, "num_input_tokens_seen": 11494408, "step": 19940 }, { "epoch": 2.970658325886208, "grad_norm": 1.1421302556991577, "learning_rate": 4.431551752256155e-05, "loss": 0.5537, "num_input_tokens_seen": 11497128, "step": 19945 }, { "epoch": 2.9714030384271672, "grad_norm": 0.834967851638794, "learning_rate": 4.4311390963214375e-05, "loss": 0.5347, "num_input_tokens_seen": 11499912, "step": 19950 }, { "epoch": 2.9721477509681264, "grad_norm": 2.3568079471588135, "learning_rate": 4.43072630988819e-05, "loss": 0.7375, "num_input_tokens_seen": 11502696, "step": 19955 }, { "epoch": 2.9728924635090856, "grad_norm": 1.0044969320297241, "learning_rate": 4.4303133929843086e-05, "loss": 0.7807, "num_input_tokens_seen": 11505512, "step": 19960 }, { "epoch": 2.973637176050045, "grad_norm": 0.8474981188774109, "learning_rate": 4.4299003456376966e-05, "loss": 0.6424, "num_input_tokens_seen": 11508616, "step": 19965 }, { "epoch": 2.9743818885910036, "grad_norm": 2.1079459190368652, "learning_rate": 4.429487167876265e-05, "loss": 0.8237, "num_input_tokens_seen": 11511368, "step": 19970 }, { "epoch": 2.9751266011319633, "grad_norm": 1.4064271450042725, "learning_rate": 4.429073859727936e-05, "loss": 0.5745, "num_input_tokens_seen": 11514504, "step": 19975 }, { "epoch": 2.975871313672922, "grad_norm": 0.6113202571868896, "learning_rate": 4.428660421220638e-05, "loss": 0.6397, "num_input_tokens_seen": 11517320, "step": 19980 }, { "epoch": 2.9766160262138817, "grad_norm": 0.8941999077796936, "learning_rate": 4.42824685238231e-05, "loss": 0.6882, "num_input_tokens_seen": 11520296, "step": 19985 }, { "epoch": 2.9773607387548404, "grad_norm": 2.11491060256958, "learning_rate": 4.4278331532409e-05, "loss": 0.6924, "num_input_tokens_seen": 11523208, "step": 19990 }, { "epoch": 2.9781054512958, "grad_norm": 0.9052600860595703, "learning_rate": 4.427419323824363e-05, "loss": 0.6134, "num_input_tokens_seen": 11526280, "step": 19995 }, { "epoch": 2.978850163836759, "grad_norm": 1.4119646549224854, "learning_rate": 4.427005364160665e-05, "loss": 0.5648, "num_input_tokens_seen": 11528840, "step": 20000 }, { "epoch": 2.9795948763777185, "grad_norm": 2.032731056213379, "learning_rate": 4.426591274277778e-05, "loss": 0.6555, "num_input_tokens_seen": 11531624, "step": 20005 }, { "epoch": 2.9803395889186772, "grad_norm": 2.4514899253845215, "learning_rate": 4.426177054203686e-05, "loss": 0.7797, "num_input_tokens_seen": 11534504, "step": 20010 }, { "epoch": 2.9810843014596364, "grad_norm": 0.8476503491401672, "learning_rate": 4.425762703966381e-05, "loss": 0.747, "num_input_tokens_seen": 11537448, "step": 20015 }, { "epoch": 2.9818290140005956, "grad_norm": 0.714999794960022, "learning_rate": 4.425348223593861e-05, "loss": 0.5733, "num_input_tokens_seen": 11540360, "step": 20020 }, { "epoch": 2.982573726541555, "grad_norm": 1.4080783128738403, "learning_rate": 4.424933613114136e-05, "loss": 0.5518, "num_input_tokens_seen": 11543304, "step": 20025 }, { "epoch": 2.983318439082514, "grad_norm": 0.5012694001197815, "learning_rate": 4.424518872555224e-05, "loss": 0.4888, "num_input_tokens_seen": 11546376, "step": 20030 }, { "epoch": 2.9840631516234732, "grad_norm": 0.7225359678268433, "learning_rate": 4.424104001945151e-05, "loss": 0.664, "num_input_tokens_seen": 11549224, "step": 20035 }, { "epoch": 2.9848078641644324, "grad_norm": 1.533147931098938, "learning_rate": 4.4236890013119527e-05, "loss": 0.5323, "num_input_tokens_seen": 11552008, "step": 20040 }, { "epoch": 2.9855525767053916, "grad_norm": 0.6081900000572205, "learning_rate": 4.423273870683672e-05, "loss": 0.6559, "num_input_tokens_seen": 11554792, "step": 20045 }, { "epoch": 2.986297289246351, "grad_norm": 0.9882493019104004, "learning_rate": 4.422858610088364e-05, "loss": 0.7565, "num_input_tokens_seen": 11557736, "step": 20050 }, { "epoch": 2.98704200178731, "grad_norm": 1.287325382232666, "learning_rate": 4.422443219554088e-05, "loss": 0.5579, "num_input_tokens_seen": 11560392, "step": 20055 }, { "epoch": 2.9877867143282693, "grad_norm": 1.1420562267303467, "learning_rate": 4.422027699108915e-05, "loss": 0.7114, "num_input_tokens_seen": 11563720, "step": 20060 }, { "epoch": 2.9885314268692285, "grad_norm": 1.4084084033966064, "learning_rate": 4.421612048780925e-05, "loss": 0.5326, "num_input_tokens_seen": 11566696, "step": 20065 }, { "epoch": 2.9892761394101877, "grad_norm": 1.1947686672210693, "learning_rate": 4.421196268598205e-05, "loss": 0.7154, "num_input_tokens_seen": 11569448, "step": 20070 }, { "epoch": 2.990020851951147, "grad_norm": 0.7213554382324219, "learning_rate": 4.4207803585888524e-05, "loss": 0.5672, "num_input_tokens_seen": 11572328, "step": 20075 }, { "epoch": 2.990765564492106, "grad_norm": 2.399663209915161, "learning_rate": 4.420364318780973e-05, "loss": 0.6141, "num_input_tokens_seen": 11575304, "step": 20080 }, { "epoch": 2.9915102770330653, "grad_norm": 1.0512267351150513, "learning_rate": 4.419948149202679e-05, "loss": 0.5715, "num_input_tokens_seen": 11577960, "step": 20085 }, { "epoch": 2.9922549895740245, "grad_norm": 1.3730988502502441, "learning_rate": 4.419531849882097e-05, "loss": 0.7696, "num_input_tokens_seen": 11580680, "step": 20090 }, { "epoch": 2.9929997021149837, "grad_norm": 0.7916767001152039, "learning_rate": 4.419115420847356e-05, "loss": 0.5549, "num_input_tokens_seen": 11583464, "step": 20095 }, { "epoch": 2.993744414655943, "grad_norm": 1.0769779682159424, "learning_rate": 4.418698862126597e-05, "loss": 0.5329, "num_input_tokens_seen": 11586376, "step": 20100 }, { "epoch": 2.994489127196902, "grad_norm": 0.7463703155517578, "learning_rate": 4.418282173747971e-05, "loss": 0.5955, "num_input_tokens_seen": 11589480, "step": 20105 }, { "epoch": 2.9952338397378613, "grad_norm": 2.2664387226104736, "learning_rate": 4.4178653557396335e-05, "loss": 0.7076, "num_input_tokens_seen": 11592840, "step": 20110 }, { "epoch": 2.9959785522788205, "grad_norm": 1.607558250427246, "learning_rate": 4.417448408129753e-05, "loss": 0.9499, "num_input_tokens_seen": 11595688, "step": 20115 }, { "epoch": 2.9967232648197797, "grad_norm": 1.2125054597854614, "learning_rate": 4.417031330946505e-05, "loss": 0.7651, "num_input_tokens_seen": 11598920, "step": 20120 }, { "epoch": 2.997467977360739, "grad_norm": 1.3486353158950806, "learning_rate": 4.4166141242180736e-05, "loss": 0.568, "num_input_tokens_seen": 11601672, "step": 20125 }, { "epoch": 2.998212689901698, "grad_norm": 1.200173258781433, "learning_rate": 4.4161967879726526e-05, "loss": 0.6955, "num_input_tokens_seen": 11604680, "step": 20130 }, { "epoch": 2.9989574024426573, "grad_norm": 0.9654106497764587, "learning_rate": 4.415779322238443e-05, "loss": 0.6529, "num_input_tokens_seen": 11607688, "step": 20135 }, { "epoch": 2.9997021149836165, "grad_norm": 1.3807905912399292, "learning_rate": 4.4153617270436556e-05, "loss": 0.6193, "num_input_tokens_seen": 11610440, "step": 20140 }, { "epoch": 3.0, "eval_loss": 0.6564539074897766, "eval_runtime": 74.2285, "eval_samples_per_second": 40.2, "eval_steps_per_second": 10.05, "num_input_tokens_seen": 11611120, "step": 20142 }, { "epoch": 3.0004468275245757, "grad_norm": 1.3039417266845703, "learning_rate": 4.414944002416511e-05, "loss": 0.5165, "num_input_tokens_seen": 11612848, "step": 20145 }, { "epoch": 3.001191540065535, "grad_norm": 1.7377715110778809, "learning_rate": 4.414526148385235e-05, "loss": 0.6088, "num_input_tokens_seen": 11615664, "step": 20150 }, { "epoch": 3.001936252606494, "grad_norm": 1.3570027351379395, "learning_rate": 4.414108164978067e-05, "loss": 0.7137, "num_input_tokens_seen": 11618480, "step": 20155 }, { "epoch": 3.002680965147453, "grad_norm": 1.147508144378662, "learning_rate": 4.4136900522232506e-05, "loss": 0.7797, "num_input_tokens_seen": 11621168, "step": 20160 }, { "epoch": 3.003425677688412, "grad_norm": 1.0190997123718262, "learning_rate": 4.413271810149041e-05, "loss": 0.7357, "num_input_tokens_seen": 11624240, "step": 20165 }, { "epoch": 3.0041703902293713, "grad_norm": 0.9780120849609375, "learning_rate": 4.412853438783701e-05, "loss": 0.5393, "num_input_tokens_seen": 11627312, "step": 20170 }, { "epoch": 3.0049151027703305, "grad_norm": 1.4137723445892334, "learning_rate": 4.412434938155503e-05, "loss": 0.6779, "num_input_tokens_seen": 11630256, "step": 20175 }, { "epoch": 3.0056598153112897, "grad_norm": 0.9632065296173096, "learning_rate": 4.4120163082927274e-05, "loss": 0.7829, "num_input_tokens_seen": 11632880, "step": 20180 }, { "epoch": 3.006404527852249, "grad_norm": 1.0142675638198853, "learning_rate": 4.411597549223663e-05, "loss": 0.7058, "num_input_tokens_seen": 11635696, "step": 20185 }, { "epoch": 3.007149240393208, "grad_norm": 1.0940823554992676, "learning_rate": 4.411178660976609e-05, "loss": 0.6478, "num_input_tokens_seen": 11638576, "step": 20190 }, { "epoch": 3.0078939529341673, "grad_norm": 0.9445207118988037, "learning_rate": 4.410759643579871e-05, "loss": 0.6786, "num_input_tokens_seen": 11641840, "step": 20195 }, { "epoch": 3.0086386654751265, "grad_norm": 1.5485864877700806, "learning_rate": 4.410340497061764e-05, "loss": 0.7621, "num_input_tokens_seen": 11644432, "step": 20200 }, { "epoch": 3.0093833780160857, "grad_norm": 0.8977357745170593, "learning_rate": 4.4099212214506146e-05, "loss": 0.5672, "num_input_tokens_seen": 11647248, "step": 20205 }, { "epoch": 3.010128090557045, "grad_norm": 1.4065141677856445, "learning_rate": 4.4095018167747536e-05, "loss": 0.5889, "num_input_tokens_seen": 11650256, "step": 20210 }, { "epoch": 3.010872803098004, "grad_norm": 2.409637689590454, "learning_rate": 4.4090822830625236e-05, "loss": 0.5863, "num_input_tokens_seen": 11653104, "step": 20215 }, { "epoch": 3.0116175156389633, "grad_norm": 1.7707887887954712, "learning_rate": 4.408662620342274e-05, "loss": 0.6282, "num_input_tokens_seen": 11655952, "step": 20220 }, { "epoch": 3.0123622281799225, "grad_norm": 1.3282880783081055, "learning_rate": 4.408242828642365e-05, "loss": 0.3887, "num_input_tokens_seen": 11658960, "step": 20225 }, { "epoch": 3.0131069407208817, "grad_norm": 1.1847671270370483, "learning_rate": 4.4078229079911636e-05, "loss": 0.7563, "num_input_tokens_seen": 11661840, "step": 20230 }, { "epoch": 3.013851653261841, "grad_norm": 1.2125645875930786, "learning_rate": 4.407402858417047e-05, "loss": 0.6348, "num_input_tokens_seen": 11664848, "step": 20235 }, { "epoch": 3.0145963658028, "grad_norm": 0.8620962500572205, "learning_rate": 4.4069826799484e-05, "loss": 0.6483, "num_input_tokens_seen": 11667632, "step": 20240 }, { "epoch": 3.0153410783437593, "grad_norm": 1.6853605508804321, "learning_rate": 4.406562372613617e-05, "loss": 0.5859, "num_input_tokens_seen": 11670544, "step": 20245 }, { "epoch": 3.0160857908847185, "grad_norm": 1.4277254343032837, "learning_rate": 4.406141936441099e-05, "loss": 0.6997, "num_input_tokens_seen": 11673392, "step": 20250 }, { "epoch": 3.0168305034256777, "grad_norm": 1.3333079814910889, "learning_rate": 4.40572137145926e-05, "loss": 0.6209, "num_input_tokens_seen": 11676464, "step": 20255 }, { "epoch": 3.017575215966637, "grad_norm": 0.6349104642868042, "learning_rate": 4.405300677696519e-05, "loss": 0.5279, "num_input_tokens_seen": 11679248, "step": 20260 }, { "epoch": 3.018319928507596, "grad_norm": 1.1864371299743652, "learning_rate": 4.4048798551813056e-05, "loss": 0.6477, "num_input_tokens_seen": 11682032, "step": 20265 }, { "epoch": 3.0190646410485553, "grad_norm": 0.7799556851387024, "learning_rate": 4.4044589039420546e-05, "loss": 0.443, "num_input_tokens_seen": 11684752, "step": 20270 }, { "epoch": 3.0198093535895145, "grad_norm": 0.9712468385696411, "learning_rate": 4.404037824007214e-05, "loss": 0.5204, "num_input_tokens_seen": 11687536, "step": 20275 }, { "epoch": 3.0205540661304737, "grad_norm": 0.8432955145835876, "learning_rate": 4.4036166154052387e-05, "loss": 0.754, "num_input_tokens_seen": 11690736, "step": 20280 }, { "epoch": 3.021298778671433, "grad_norm": 2.0312750339508057, "learning_rate": 4.4031952781645924e-05, "loss": 0.7208, "num_input_tokens_seen": 11693904, "step": 20285 }, { "epoch": 3.022043491212392, "grad_norm": 1.2410047054290771, "learning_rate": 4.4027738123137465e-05, "loss": 0.453, "num_input_tokens_seen": 11696912, "step": 20290 }, { "epoch": 3.0227882037533513, "grad_norm": 1.430924415588379, "learning_rate": 4.402352217881183e-05, "loss": 0.5804, "num_input_tokens_seen": 11699920, "step": 20295 }, { "epoch": 3.0235329162943105, "grad_norm": 0.8165422677993774, "learning_rate": 4.4019304948953906e-05, "loss": 0.6071, "num_input_tokens_seen": 11702960, "step": 20300 }, { "epoch": 3.0242776288352697, "grad_norm": 0.9088325500488281, "learning_rate": 4.401508643384868e-05, "loss": 0.4365, "num_input_tokens_seen": 11706000, "step": 20305 }, { "epoch": 3.025022341376229, "grad_norm": 0.9742756485939026, "learning_rate": 4.4010866633781225e-05, "loss": 0.4829, "num_input_tokens_seen": 11708880, "step": 20310 }, { "epoch": 3.025767053917188, "grad_norm": 1.1222703456878662, "learning_rate": 4.4006645549036697e-05, "loss": 0.5543, "num_input_tokens_seen": 11712560, "step": 20315 }, { "epoch": 3.0265117664581473, "grad_norm": 1.7633718252182007, "learning_rate": 4.400242317990033e-05, "loss": 0.6742, "num_input_tokens_seen": 11715856, "step": 20320 }, { "epoch": 3.0272564789991065, "grad_norm": 1.1206133365631104, "learning_rate": 4.399819952665747e-05, "loss": 0.6104, "num_input_tokens_seen": 11718832, "step": 20325 }, { "epoch": 3.0280011915400658, "grad_norm": 1.9414616823196411, "learning_rate": 4.399397458959353e-05, "loss": 0.7787, "num_input_tokens_seen": 11721456, "step": 20330 }, { "epoch": 3.0287459040810245, "grad_norm": 1.1670032739639282, "learning_rate": 4.398974836899401e-05, "loss": 0.5266, "num_input_tokens_seen": 11724144, "step": 20335 }, { "epoch": 3.0294906166219837, "grad_norm": 1.7362322807312012, "learning_rate": 4.398552086514449e-05, "loss": 0.8965, "num_input_tokens_seen": 11726928, "step": 20340 }, { "epoch": 3.030235329162943, "grad_norm": 0.8066930770874023, "learning_rate": 4.398129207833067e-05, "loss": 0.5926, "num_input_tokens_seen": 11729648, "step": 20345 }, { "epoch": 3.030980041703902, "grad_norm": 1.3696202039718628, "learning_rate": 4.3977062008838307e-05, "loss": 0.6682, "num_input_tokens_seen": 11732336, "step": 20350 }, { "epoch": 3.0317247542448613, "grad_norm": 1.2549760341644287, "learning_rate": 4.397283065695325e-05, "loss": 0.6109, "num_input_tokens_seen": 11735280, "step": 20355 }, { "epoch": 3.0324694667858205, "grad_norm": 0.5802838206291199, "learning_rate": 4.396859802296142e-05, "loss": 0.6259, "num_input_tokens_seen": 11738192, "step": 20360 }, { "epoch": 3.0332141793267797, "grad_norm": 1.1126846075057983, "learning_rate": 4.396436410714887e-05, "loss": 0.9053, "num_input_tokens_seen": 11741232, "step": 20365 }, { "epoch": 3.033958891867739, "grad_norm": 1.6231086254119873, "learning_rate": 4.396012890980169e-05, "loss": 0.8447, "num_input_tokens_seen": 11744592, "step": 20370 }, { "epoch": 3.034703604408698, "grad_norm": 1.069079041481018, "learning_rate": 4.3955892431206085e-05, "loss": 0.7441, "num_input_tokens_seen": 11747344, "step": 20375 }, { "epoch": 3.0354483169496573, "grad_norm": 0.9677658081054688, "learning_rate": 4.395165467164834e-05, "loss": 0.5991, "num_input_tokens_seen": 11750064, "step": 20380 }, { "epoch": 3.0361930294906165, "grad_norm": 0.793770432472229, "learning_rate": 4.394741563141482e-05, "loss": 0.6762, "num_input_tokens_seen": 11753104, "step": 20385 }, { "epoch": 3.0369377420315757, "grad_norm": 1.004982590675354, "learning_rate": 4.3943175310791995e-05, "loss": 0.5063, "num_input_tokens_seen": 11755792, "step": 20390 }, { "epoch": 3.037682454572535, "grad_norm": 0.8477210998535156, "learning_rate": 4.3938933710066396e-05, "loss": 0.7055, "num_input_tokens_seen": 11758768, "step": 20395 }, { "epoch": 3.038427167113494, "grad_norm": 1.6562331914901733, "learning_rate": 4.393469082952466e-05, "loss": 0.6931, "num_input_tokens_seen": 11761648, "step": 20400 }, { "epoch": 3.0391718796544533, "grad_norm": 0.9389520287513733, "learning_rate": 4.3930446669453494e-05, "loss": 0.6137, "num_input_tokens_seen": 11764656, "step": 20405 }, { "epoch": 3.0399165921954125, "grad_norm": 1.0238152742385864, "learning_rate": 4.392620123013971e-05, "loss": 0.5095, "num_input_tokens_seen": 11767440, "step": 20410 }, { "epoch": 3.0406613047363718, "grad_norm": 1.602419137954712, "learning_rate": 4.3921954511870194e-05, "loss": 0.7222, "num_input_tokens_seen": 11770704, "step": 20415 }, { "epoch": 3.041406017277331, "grad_norm": 1.741227626800537, "learning_rate": 4.3917706514931926e-05, "loss": 0.6066, "num_input_tokens_seen": 11773552, "step": 20420 }, { "epoch": 3.04215072981829, "grad_norm": 0.6617518067359924, "learning_rate": 4.391345723961197e-05, "loss": 0.5749, "num_input_tokens_seen": 11776400, "step": 20425 }, { "epoch": 3.0428954423592494, "grad_norm": 0.7903846502304077, "learning_rate": 4.3909206686197456e-05, "loss": 0.6029, "num_input_tokens_seen": 11779280, "step": 20430 }, { "epoch": 3.0436401549002086, "grad_norm": 1.2828716039657593, "learning_rate": 4.3904954854975644e-05, "loss": 0.6881, "num_input_tokens_seen": 11782416, "step": 20435 }, { "epoch": 3.0443848674411678, "grad_norm": 1.4525107145309448, "learning_rate": 4.390070174623384e-05, "loss": 0.6163, "num_input_tokens_seen": 11785296, "step": 20440 }, { "epoch": 3.045129579982127, "grad_norm": 1.0578874349594116, "learning_rate": 4.389644736025946e-05, "loss": 0.6047, "num_input_tokens_seen": 11788016, "step": 20445 }, { "epoch": 3.045874292523086, "grad_norm": 0.990913987159729, "learning_rate": 4.389219169734e-05, "loss": 0.6519, "num_input_tokens_seen": 11790832, "step": 20450 }, { "epoch": 3.0466190050640454, "grad_norm": 0.9659717082977295, "learning_rate": 4.388793475776303e-05, "loss": 0.5167, "num_input_tokens_seen": 11793776, "step": 20455 }, { "epoch": 3.0473637176050046, "grad_norm": 1.3023194074630737, "learning_rate": 4.388367654181622e-05, "loss": 0.5263, "num_input_tokens_seen": 11796720, "step": 20460 }, { "epoch": 3.0481084301459638, "grad_norm": 1.2819910049438477, "learning_rate": 4.387941704978733e-05, "loss": 0.6554, "num_input_tokens_seen": 11800528, "step": 20465 }, { "epoch": 3.048853142686923, "grad_norm": 1.7256678342819214, "learning_rate": 4.3875156281964186e-05, "loss": 0.7112, "num_input_tokens_seen": 11803184, "step": 20470 }, { "epoch": 3.049597855227882, "grad_norm": 1.5671446323394775, "learning_rate": 4.3870894238634725e-05, "loss": 0.5613, "num_input_tokens_seen": 11805680, "step": 20475 }, { "epoch": 3.0503425677688414, "grad_norm": 2.500732898712158, "learning_rate": 4.386663092008696e-05, "loss": 0.6749, "num_input_tokens_seen": 11808400, "step": 20480 }, { "epoch": 3.0510872803098006, "grad_norm": 0.972378134727478, "learning_rate": 4.3862366326608975e-05, "loss": 0.6949, "num_input_tokens_seen": 11811376, "step": 20485 }, { "epoch": 3.05183199285076, "grad_norm": 1.3489900827407837, "learning_rate": 4.385810045848896e-05, "loss": 0.7168, "num_input_tokens_seen": 11814096, "step": 20490 }, { "epoch": 3.052576705391719, "grad_norm": 1.6693706512451172, "learning_rate": 4.38538333160152e-05, "loss": 0.7053, "num_input_tokens_seen": 11816848, "step": 20495 }, { "epoch": 3.053321417932678, "grad_norm": 1.163303256034851, "learning_rate": 4.3849564899476026e-05, "loss": 0.6177, "num_input_tokens_seen": 11819568, "step": 20500 }, { "epoch": 3.054066130473637, "grad_norm": 1.7403374910354614, "learning_rate": 4.38452952091599e-05, "loss": 0.6202, "num_input_tokens_seen": 11822768, "step": 20505 }, { "epoch": 3.054810843014596, "grad_norm": 2.928449869155884, "learning_rate": 4.3841024245355346e-05, "loss": 0.576, "num_input_tokens_seen": 11825808, "step": 20510 }, { "epoch": 3.0555555555555554, "grad_norm": 0.9910268783569336, "learning_rate": 4.383675200835097e-05, "loss": 0.6069, "num_input_tokens_seen": 11828720, "step": 20515 }, { "epoch": 3.0563002680965146, "grad_norm": 1.5331876277923584, "learning_rate": 4.383247849843548e-05, "loss": 0.5711, "num_input_tokens_seen": 11831568, "step": 20520 }, { "epoch": 3.0570449806374738, "grad_norm": 2.020986318588257, "learning_rate": 4.382820371589766e-05, "loss": 0.6874, "num_input_tokens_seen": 11834192, "step": 20525 }, { "epoch": 3.057789693178433, "grad_norm": 0.9638583660125732, "learning_rate": 4.382392766102638e-05, "loss": 0.5172, "num_input_tokens_seen": 11837040, "step": 20530 }, { "epoch": 3.058534405719392, "grad_norm": 1.956520676612854, "learning_rate": 4.381965033411061e-05, "loss": 0.6526, "num_input_tokens_seen": 11839888, "step": 20535 }, { "epoch": 3.0592791182603514, "grad_norm": 1.266344666481018, "learning_rate": 4.381537173543937e-05, "loss": 0.5817, "num_input_tokens_seen": 11843024, "step": 20540 }, { "epoch": 3.0600238308013106, "grad_norm": 1.9535610675811768, "learning_rate": 4.381109186530182e-05, "loss": 0.8025, "num_input_tokens_seen": 11845744, "step": 20545 }, { "epoch": 3.0607685433422698, "grad_norm": 1.7122567892074585, "learning_rate": 4.380681072398716e-05, "loss": 0.6478, "num_input_tokens_seen": 11848432, "step": 20550 }, { "epoch": 3.061513255883229, "grad_norm": 1.1850639581680298, "learning_rate": 4.3802528311784686e-05, "loss": 0.5359, "num_input_tokens_seen": 11851728, "step": 20555 }, { "epoch": 3.062257968424188, "grad_norm": 1.0075418949127197, "learning_rate": 4.37982446289838e-05, "loss": 0.5075, "num_input_tokens_seen": 11854544, "step": 20560 }, { "epoch": 3.0630026809651474, "grad_norm": 1.8703317642211914, "learning_rate": 4.379395967587398e-05, "loss": 0.6533, "num_input_tokens_seen": 11857840, "step": 20565 }, { "epoch": 3.0637473935061066, "grad_norm": 1.5213062763214111, "learning_rate": 4.378967345274476e-05, "loss": 0.7568, "num_input_tokens_seen": 11860784, "step": 20570 }, { "epoch": 3.064492106047066, "grad_norm": 1.1368298530578613, "learning_rate": 4.3785385959885805e-05, "loss": 0.616, "num_input_tokens_seen": 11863824, "step": 20575 }, { "epoch": 3.065236818588025, "grad_norm": 1.2737919092178345, "learning_rate": 4.3781097197586845e-05, "loss": 0.7681, "num_input_tokens_seen": 11866352, "step": 20580 }, { "epoch": 3.065981531128984, "grad_norm": 0.7105170488357544, "learning_rate": 4.377680716613769e-05, "loss": 0.5722, "num_input_tokens_seen": 11869200, "step": 20585 }, { "epoch": 3.0667262436699434, "grad_norm": 1.1034133434295654, "learning_rate": 4.377251586582826e-05, "loss": 0.5786, "num_input_tokens_seen": 11871984, "step": 20590 }, { "epoch": 3.0674709562109026, "grad_norm": 2.5026180744171143, "learning_rate": 4.3768223296948516e-05, "loss": 0.7502, "num_input_tokens_seen": 11874928, "step": 20595 }, { "epoch": 3.068215668751862, "grad_norm": 1.0887094736099243, "learning_rate": 4.3763929459788554e-05, "loss": 0.573, "num_input_tokens_seen": 11877680, "step": 20600 }, { "epoch": 3.068960381292821, "grad_norm": 1.3071317672729492, "learning_rate": 4.375963435463853e-05, "loss": 0.6874, "num_input_tokens_seen": 11880336, "step": 20605 }, { "epoch": 3.06970509383378, "grad_norm": 0.8697724938392639, "learning_rate": 4.375533798178869e-05, "loss": 0.6723, "num_input_tokens_seen": 11883152, "step": 20610 }, { "epoch": 3.0704498063747394, "grad_norm": 1.3632540702819824, "learning_rate": 4.375104034152936e-05, "loss": 0.5061, "num_input_tokens_seen": 11886032, "step": 20615 }, { "epoch": 3.0711945189156986, "grad_norm": 0.9786157608032227, "learning_rate": 4.374674143415096e-05, "loss": 0.5146, "num_input_tokens_seen": 11889072, "step": 20620 }, { "epoch": 3.071939231456658, "grad_norm": 0.6990915536880493, "learning_rate": 4.374244125994399e-05, "loss": 0.6195, "num_input_tokens_seen": 11892336, "step": 20625 }, { "epoch": 3.072683943997617, "grad_norm": 0.9883466362953186, "learning_rate": 4.3738139819199045e-05, "loss": 0.7263, "num_input_tokens_seen": 11895408, "step": 20630 }, { "epoch": 3.073428656538576, "grad_norm": 1.670971393585205, "learning_rate": 4.3733837112206786e-05, "loss": 0.6292, "num_input_tokens_seen": 11898448, "step": 20635 }, { "epoch": 3.0741733690795354, "grad_norm": 1.4683066606521606, "learning_rate": 4.372953313925798e-05, "loss": 0.7557, "num_input_tokens_seen": 11901424, "step": 20640 }, { "epoch": 3.0749180816204946, "grad_norm": 1.397623062133789, "learning_rate": 4.3725227900643485e-05, "loss": 0.6249, "num_input_tokens_seen": 11904496, "step": 20645 }, { "epoch": 3.075662794161454, "grad_norm": 1.6152691841125488, "learning_rate": 4.372092139665422e-05, "loss": 0.528, "num_input_tokens_seen": 11907152, "step": 20650 }, { "epoch": 3.076407506702413, "grad_norm": 1.2187362909317017, "learning_rate": 4.3716613627581195e-05, "loss": 0.7451, "num_input_tokens_seen": 11909776, "step": 20655 }, { "epoch": 3.0771522192433722, "grad_norm": 1.2924555540084839, "learning_rate": 4.3712304593715516e-05, "loss": 0.6245, "num_input_tokens_seen": 11912624, "step": 20660 }, { "epoch": 3.0778969317843314, "grad_norm": 1.0371196269989014, "learning_rate": 4.3707994295348374e-05, "loss": 0.7704, "num_input_tokens_seen": 11915504, "step": 20665 }, { "epoch": 3.0786416443252906, "grad_norm": 0.9634787440299988, "learning_rate": 4.370368273277103e-05, "loss": 0.6113, "num_input_tokens_seen": 11918736, "step": 20670 }, { "epoch": 3.07938635686625, "grad_norm": 3.358231782913208, "learning_rate": 4.3699369906274864e-05, "loss": 0.7572, "num_input_tokens_seen": 11921776, "step": 20675 }, { "epoch": 3.0801310694072086, "grad_norm": 2.8286750316619873, "learning_rate": 4.3695055816151296e-05, "loss": 0.5653, "num_input_tokens_seen": 11924784, "step": 20680 }, { "epoch": 3.080875781948168, "grad_norm": 2.110117197036743, "learning_rate": 4.369074046269187e-05, "loss": 0.8132, "num_input_tokens_seen": 11927728, "step": 20685 }, { "epoch": 3.081620494489127, "grad_norm": 2.0828936100006104, "learning_rate": 4.3686423846188196e-05, "loss": 0.7135, "num_input_tokens_seen": 11930768, "step": 20690 }, { "epoch": 3.082365207030086, "grad_norm": 0.7702128887176514, "learning_rate": 4.368210596693197e-05, "loss": 0.6018, "num_input_tokens_seen": 11933744, "step": 20695 }, { "epoch": 3.0831099195710454, "grad_norm": 1.1625841856002808, "learning_rate": 4.367778682521498e-05, "loss": 0.7081, "num_input_tokens_seen": 11936560, "step": 20700 }, { "epoch": 3.0838546321120046, "grad_norm": 1.1924117803573608, "learning_rate": 4.367346642132909e-05, "loss": 0.6529, "num_input_tokens_seen": 11939760, "step": 20705 }, { "epoch": 3.084599344652964, "grad_norm": 1.2028262615203857, "learning_rate": 4.366914475556626e-05, "loss": 0.6038, "num_input_tokens_seen": 11942608, "step": 20710 }, { "epoch": 3.085344057193923, "grad_norm": 1.2850428819656372, "learning_rate": 4.3664821828218536e-05, "loss": 0.5943, "num_input_tokens_seen": 11945360, "step": 20715 }, { "epoch": 3.086088769734882, "grad_norm": 1.0968436002731323, "learning_rate": 4.3660497639578036e-05, "loss": 0.6459, "num_input_tokens_seen": 11948016, "step": 20720 }, { "epoch": 3.0868334822758414, "grad_norm": 1.306130290031433, "learning_rate": 4.3656172189936975e-05, "loss": 0.674, "num_input_tokens_seen": 11951056, "step": 20725 }, { "epoch": 3.0875781948168006, "grad_norm": 1.414265751838684, "learning_rate": 4.3651845479587647e-05, "loss": 0.8094, "num_input_tokens_seen": 11954000, "step": 20730 }, { "epoch": 3.08832290735776, "grad_norm": 1.3087339401245117, "learning_rate": 4.3647517508822434e-05, "loss": 0.605, "num_input_tokens_seen": 11956592, "step": 20735 }, { "epoch": 3.089067619898719, "grad_norm": 0.7222585082054138, "learning_rate": 4.36431882779338e-05, "loss": 0.6449, "num_input_tokens_seen": 11959376, "step": 20740 }, { "epoch": 3.0898123324396782, "grad_norm": 0.9666150212287903, "learning_rate": 4.3638857787214304e-05, "loss": 0.6588, "num_input_tokens_seen": 11962032, "step": 20745 }, { "epoch": 3.0905570449806374, "grad_norm": 1.0729550123214722, "learning_rate": 4.363452603695658e-05, "loss": 0.6372, "num_input_tokens_seen": 11965072, "step": 20750 }, { "epoch": 3.0913017575215966, "grad_norm": 1.1517516374588013, "learning_rate": 4.363019302745334e-05, "loss": 0.6245, "num_input_tokens_seen": 11968208, "step": 20755 }, { "epoch": 3.092046470062556, "grad_norm": 1.4041903018951416, "learning_rate": 4.362585875899741e-05, "loss": 0.7172, "num_input_tokens_seen": 11971088, "step": 20760 }, { "epoch": 3.092791182603515, "grad_norm": 0.6383258104324341, "learning_rate": 4.3621523231881665e-05, "loss": 0.6604, "num_input_tokens_seen": 11974128, "step": 20765 }, { "epoch": 3.0935358951444742, "grad_norm": 1.2651945352554321, "learning_rate": 4.36171864463991e-05, "loss": 0.6354, "num_input_tokens_seen": 11976976, "step": 20770 }, { "epoch": 3.0942806076854334, "grad_norm": 1.1003013849258423, "learning_rate": 4.361284840284275e-05, "loss": 0.5988, "num_input_tokens_seen": 11979728, "step": 20775 }, { "epoch": 3.0950253202263927, "grad_norm": 4.864522933959961, "learning_rate": 4.36085091015058e-05, "loss": 0.8698, "num_input_tokens_seen": 11982672, "step": 20780 }, { "epoch": 3.095770032767352, "grad_norm": 20.93076515197754, "learning_rate": 4.3604168542681444e-05, "loss": 0.5585, "num_input_tokens_seen": 11985360, "step": 20785 }, { "epoch": 3.096514745308311, "grad_norm": 0.8908252716064453, "learning_rate": 4.3599826726663026e-05, "loss": 0.6136, "num_input_tokens_seen": 11988208, "step": 20790 }, { "epoch": 3.0972594578492703, "grad_norm": 1.6750701665878296, "learning_rate": 4.359548365374394e-05, "loss": 0.7443, "num_input_tokens_seen": 11991120, "step": 20795 }, { "epoch": 3.0980041703902295, "grad_norm": 1.395621418952942, "learning_rate": 4.3591139324217666e-05, "loss": 0.6702, "num_input_tokens_seen": 11993904, "step": 20800 }, { "epoch": 3.0987488829311887, "grad_norm": 6.590043544769287, "learning_rate": 4.3586793738377785e-05, "loss": 0.7111, "num_input_tokens_seen": 11996848, "step": 20805 }, { "epoch": 3.099493595472148, "grad_norm": 2.265972137451172, "learning_rate": 4.358244689651795e-05, "loss": 0.7827, "num_input_tokens_seen": 11999632, "step": 20810 }, { "epoch": 3.100238308013107, "grad_norm": 1.3330477476119995, "learning_rate": 4.357809879893191e-05, "loss": 0.7555, "num_input_tokens_seen": 12002224, "step": 20815 }, { "epoch": 3.1009830205540663, "grad_norm": 1.5557851791381836, "learning_rate": 4.357374944591348e-05, "loss": 0.5925, "num_input_tokens_seen": 12005072, "step": 20820 }, { "epoch": 3.1017277330950255, "grad_norm": 1.3798774480819702, "learning_rate": 4.3569398837756586e-05, "loss": 0.7306, "num_input_tokens_seen": 12007728, "step": 20825 }, { "epoch": 3.1024724456359847, "grad_norm": 1.4250155687332153, "learning_rate": 4.356504697475521e-05, "loss": 0.7715, "num_input_tokens_seen": 12010448, "step": 20830 }, { "epoch": 3.103217158176944, "grad_norm": 1.830581545829773, "learning_rate": 4.356069385720344e-05, "loss": 0.647, "num_input_tokens_seen": 12013392, "step": 20835 }, { "epoch": 3.103961870717903, "grad_norm": 2.073513984680176, "learning_rate": 4.3556339485395444e-05, "loss": 0.5705, "num_input_tokens_seen": 12016208, "step": 20840 }, { "epoch": 3.1047065832588623, "grad_norm": 3.4358558654785156, "learning_rate": 4.355198385962547e-05, "loss": 0.7651, "num_input_tokens_seen": 12019152, "step": 20845 }, { "epoch": 3.1054512957998215, "grad_norm": 2.453838586807251, "learning_rate": 4.354762698018785e-05, "loss": 0.676, "num_input_tokens_seen": 12021936, "step": 20850 }, { "epoch": 3.1061960083407802, "grad_norm": 0.7052576541900635, "learning_rate": 4.3543268847377005e-05, "loss": 0.6553, "num_input_tokens_seen": 12024944, "step": 20855 }, { "epoch": 3.1069407208817394, "grad_norm": 1.007954478263855, "learning_rate": 4.353890946148745e-05, "loss": 0.5602, "num_input_tokens_seen": 12027696, "step": 20860 }, { "epoch": 3.1076854334226987, "grad_norm": 1.0096155405044556, "learning_rate": 4.353454882281377e-05, "loss": 0.6932, "num_input_tokens_seen": 12030832, "step": 20865 }, { "epoch": 3.108430145963658, "grad_norm": 0.7967997193336487, "learning_rate": 4.353018693165063e-05, "loss": 0.585, "num_input_tokens_seen": 12033552, "step": 20870 }, { "epoch": 3.109174858504617, "grad_norm": 1.3221211433410645, "learning_rate": 4.35258237882928e-05, "loss": 0.626, "num_input_tokens_seen": 12036080, "step": 20875 }, { "epoch": 3.1099195710455763, "grad_norm": 1.6050971746444702, "learning_rate": 4.352145939303511e-05, "loss": 0.7564, "num_input_tokens_seen": 12038960, "step": 20880 }, { "epoch": 3.1106642835865355, "grad_norm": 1.477422833442688, "learning_rate": 4.35170937461725e-05, "loss": 0.7341, "num_input_tokens_seen": 12042032, "step": 20885 }, { "epoch": 3.1114089961274947, "grad_norm": 1.122848391532898, "learning_rate": 4.3512726847999987e-05, "loss": 0.3906, "num_input_tokens_seen": 12044688, "step": 20890 }, { "epoch": 3.112153708668454, "grad_norm": 1.7943434715270996, "learning_rate": 4.3508358698812654e-05, "loss": 0.5772, "num_input_tokens_seen": 12047344, "step": 20895 }, { "epoch": 3.112898421209413, "grad_norm": 1.37044358253479, "learning_rate": 4.350398929890569e-05, "loss": 0.6542, "num_input_tokens_seen": 12050256, "step": 20900 }, { "epoch": 3.1136431337503723, "grad_norm": 1.0956594944000244, "learning_rate": 4.349961864857436e-05, "loss": 0.7147, "num_input_tokens_seen": 12053040, "step": 20905 }, { "epoch": 3.1143878462913315, "grad_norm": 1.1989277601242065, "learning_rate": 4.349524674811403e-05, "loss": 0.6841, "num_input_tokens_seen": 12055920, "step": 20910 }, { "epoch": 3.1151325588322907, "grad_norm": 1.313804030418396, "learning_rate": 4.3490873597820106e-05, "loss": 0.5664, "num_input_tokens_seen": 12059216, "step": 20915 }, { "epoch": 3.11587727137325, "grad_norm": 1.284010410308838, "learning_rate": 4.3486499197988126e-05, "loss": 0.6631, "num_input_tokens_seen": 12062160, "step": 20920 }, { "epoch": 3.116621983914209, "grad_norm": 1.6427130699157715, "learning_rate": 4.348212354891369e-05, "loss": 0.6796, "num_input_tokens_seen": 12065040, "step": 20925 }, { "epoch": 3.1173666964551683, "grad_norm": 2.422670602798462, "learning_rate": 4.347774665089248e-05, "loss": 0.4925, "num_input_tokens_seen": 12068176, "step": 20930 }, { "epoch": 3.1181114089961275, "grad_norm": 0.7629973292350769, "learning_rate": 4.347336850422029e-05, "loss": 0.589, "num_input_tokens_seen": 12070928, "step": 20935 }, { "epoch": 3.1188561215370867, "grad_norm": 1.6408288478851318, "learning_rate": 4.346898910919296e-05, "loss": 0.6078, "num_input_tokens_seen": 12073776, "step": 20940 }, { "epoch": 3.119600834078046, "grad_norm": 1.3003411293029785, "learning_rate": 4.346460846610643e-05, "loss": 0.728, "num_input_tokens_seen": 12076624, "step": 20945 }, { "epoch": 3.120345546619005, "grad_norm": 0.8018155694007874, "learning_rate": 4.346022657525673e-05, "loss": 0.641, "num_input_tokens_seen": 12079728, "step": 20950 }, { "epoch": 3.1210902591599643, "grad_norm": 1.8775231838226318, "learning_rate": 4.345584343693998e-05, "loss": 0.5199, "num_input_tokens_seen": 12082672, "step": 20955 }, { "epoch": 3.1218349717009235, "grad_norm": 1.882649540901184, "learning_rate": 4.345145905145237e-05, "loss": 0.6488, "num_input_tokens_seen": 12085680, "step": 20960 }, { "epoch": 3.1225796842418827, "grad_norm": 1.6451970338821411, "learning_rate": 4.344707341909017e-05, "loss": 0.6839, "num_input_tokens_seen": 12088432, "step": 20965 }, { "epoch": 3.123324396782842, "grad_norm": 1.8646959066390991, "learning_rate": 4.3442686540149744e-05, "loss": 0.76, "num_input_tokens_seen": 12091600, "step": 20970 }, { "epoch": 3.124069109323801, "grad_norm": 1.3473176956176758, "learning_rate": 4.343829841492755e-05, "loss": 0.7111, "num_input_tokens_seen": 12094416, "step": 20975 }, { "epoch": 3.1248138218647603, "grad_norm": 0.861678957939148, "learning_rate": 4.343390904372011e-05, "loss": 0.5199, "num_input_tokens_seen": 12097488, "step": 20980 }, { "epoch": 3.1255585344057195, "grad_norm": 1.627305269241333, "learning_rate": 4.3429518426824047e-05, "loss": 0.5717, "num_input_tokens_seen": 12100560, "step": 20985 }, { "epoch": 3.1263032469466787, "grad_norm": 1.029530644416809, "learning_rate": 4.342512656453606e-05, "loss": 0.4965, "num_input_tokens_seen": 12103248, "step": 20990 }, { "epoch": 3.127047959487638, "grad_norm": 1.2189440727233887, "learning_rate": 4.342073345715292e-05, "loss": 0.545, "num_input_tokens_seen": 12105776, "step": 20995 }, { "epoch": 3.127792672028597, "grad_norm": 1.4493956565856934, "learning_rate": 4.341633910497151e-05, "loss": 0.8327, "num_input_tokens_seen": 12108656, "step": 21000 }, { "epoch": 3.1285373845695563, "grad_norm": 1.829923391342163, "learning_rate": 4.3411943508288786e-05, "loss": 0.8018, "num_input_tokens_seen": 12111536, "step": 21005 }, { "epoch": 3.1292820971105155, "grad_norm": 1.3827649354934692, "learning_rate": 4.3407546667401776e-05, "loss": 0.6177, "num_input_tokens_seen": 12114704, "step": 21010 }, { "epoch": 3.1300268096514747, "grad_norm": 1.3748714923858643, "learning_rate": 4.34031485826076e-05, "loss": 0.6403, "num_input_tokens_seen": 12117488, "step": 21015 }, { "epoch": 3.1307715221924335, "grad_norm": 2.874917507171631, "learning_rate": 4.339874925420347e-05, "loss": 0.7171, "num_input_tokens_seen": 12120304, "step": 21020 }, { "epoch": 3.131516234733393, "grad_norm": 0.7972993850708008, "learning_rate": 4.339434868248665e-05, "loss": 0.5373, "num_input_tokens_seen": 12123280, "step": 21025 }, { "epoch": 3.132260947274352, "grad_norm": 1.9409890174865723, "learning_rate": 4.3389946867754546e-05, "loss": 0.6247, "num_input_tokens_seen": 12125968, "step": 21030 }, { "epoch": 3.133005659815311, "grad_norm": 1.0335946083068848, "learning_rate": 4.338554381030459e-05, "loss": 0.6108, "num_input_tokens_seen": 12128784, "step": 21035 }, { "epoch": 3.1337503723562703, "grad_norm": 1.0830950736999512, "learning_rate": 4.338113951043436e-05, "loss": 0.6924, "num_input_tokens_seen": 12131632, "step": 21040 }, { "epoch": 3.1344950848972295, "grad_norm": 2.0451924800872803, "learning_rate": 4.337673396844143e-05, "loss": 0.7183, "num_input_tokens_seen": 12134512, "step": 21045 }, { "epoch": 3.1352397974381887, "grad_norm": 2.4356000423431396, "learning_rate": 4.337232718462354e-05, "loss": 0.7288, "num_input_tokens_seen": 12137744, "step": 21050 }, { "epoch": 3.135984509979148, "grad_norm": 0.6497121453285217, "learning_rate": 4.336791915927847e-05, "loss": 0.5002, "num_input_tokens_seen": 12140528, "step": 21055 }, { "epoch": 3.136729222520107, "grad_norm": 1.78911554813385, "learning_rate": 4.3363509892704114e-05, "loss": 0.7549, "num_input_tokens_seen": 12143376, "step": 21060 }, { "epoch": 3.1374739350610663, "grad_norm": 1.1177924871444702, "learning_rate": 4.335909938519841e-05, "loss": 0.7128, "num_input_tokens_seen": 12146064, "step": 21065 }, { "epoch": 3.1382186476020255, "grad_norm": 0.9955691695213318, "learning_rate": 4.3354687637059414e-05, "loss": 0.6231, "num_input_tokens_seen": 12149328, "step": 21070 }, { "epoch": 3.1389633601429847, "grad_norm": 1.3933008909225464, "learning_rate": 4.335027464858526e-05, "loss": 0.7109, "num_input_tokens_seen": 12152048, "step": 21075 }, { "epoch": 3.139708072683944, "grad_norm": 1.1569801568984985, "learning_rate": 4.334586042007414e-05, "loss": 0.4886, "num_input_tokens_seen": 12154864, "step": 21080 }, { "epoch": 3.140452785224903, "grad_norm": 1.0628833770751953, "learning_rate": 4.3341444951824365e-05, "loss": 0.6294, "num_input_tokens_seen": 12158032, "step": 21085 }, { "epoch": 3.1411974977658623, "grad_norm": 1.2622156143188477, "learning_rate": 4.3337028244134315e-05, "loss": 0.8125, "num_input_tokens_seen": 12160656, "step": 21090 }, { "epoch": 3.1419422103068215, "grad_norm": 1.1616226434707642, "learning_rate": 4.3332610297302445e-05, "loss": 0.6005, "num_input_tokens_seen": 12163376, "step": 21095 }, { "epoch": 3.1426869228477807, "grad_norm": 1.0122138261795044, "learning_rate": 4.3328191111627306e-05, "loss": 0.602, "num_input_tokens_seen": 12166192, "step": 21100 }, { "epoch": 3.14343163538874, "grad_norm": 1.3993529081344604, "learning_rate": 4.332377068740753e-05, "loss": 0.5194, "num_input_tokens_seen": 12169360, "step": 21105 }, { "epoch": 3.144176347929699, "grad_norm": 0.9711344838142395, "learning_rate": 4.331934902494184e-05, "loss": 0.6859, "num_input_tokens_seen": 12171984, "step": 21110 }, { "epoch": 3.1449210604706583, "grad_norm": 1.4967985153198242, "learning_rate": 4.331492612452901e-05, "loss": 0.5591, "num_input_tokens_seen": 12174576, "step": 21115 }, { "epoch": 3.1456657730116175, "grad_norm": 1.6964625120162964, "learning_rate": 4.331050198646794e-05, "loss": 0.6655, "num_input_tokens_seen": 12177296, "step": 21120 }, { "epoch": 3.1464104855525767, "grad_norm": 1.560182809829712, "learning_rate": 4.330607661105759e-05, "loss": 0.6941, "num_input_tokens_seen": 12180304, "step": 21125 }, { "epoch": 3.147155198093536, "grad_norm": 1.5809991359710693, "learning_rate": 4.330164999859702e-05, "loss": 0.6464, "num_input_tokens_seen": 12183408, "step": 21130 }, { "epoch": 3.147899910634495, "grad_norm": 2.0473058223724365, "learning_rate": 4.3297222149385336e-05, "loss": 0.7609, "num_input_tokens_seen": 12186192, "step": 21135 }, { "epoch": 3.1486446231754543, "grad_norm": 1.0597035884857178, "learning_rate": 4.329279306372178e-05, "loss": 0.7504, "num_input_tokens_seen": 12189136, "step": 21140 }, { "epoch": 3.1493893357164136, "grad_norm": 2.410961627960205, "learning_rate": 4.3288362741905635e-05, "loss": 0.6324, "num_input_tokens_seen": 12192176, "step": 21145 }, { "epoch": 3.1501340482573728, "grad_norm": 1.1492892503738403, "learning_rate": 4.32839311842363e-05, "loss": 0.5897, "num_input_tokens_seen": 12195056, "step": 21150 }, { "epoch": 3.150878760798332, "grad_norm": 1.5854136943817139, "learning_rate": 4.327949839101323e-05, "loss": 0.399, "num_input_tokens_seen": 12197712, "step": 21155 }, { "epoch": 3.151623473339291, "grad_norm": 1.1415783166885376, "learning_rate": 4.3275064362535966e-05, "loss": 0.6521, "num_input_tokens_seen": 12200944, "step": 21160 }, { "epoch": 3.1523681858802504, "grad_norm": 1.2762597799301147, "learning_rate": 4.327062909910417e-05, "loss": 0.5447, "num_input_tokens_seen": 12203472, "step": 21165 }, { "epoch": 3.1531128984212096, "grad_norm": 1.6893194913864136, "learning_rate": 4.326619260101753e-05, "loss": 0.5308, "num_input_tokens_seen": 12206256, "step": 21170 }, { "epoch": 3.1538576109621688, "grad_norm": 1.0404329299926758, "learning_rate": 4.326175486857587e-05, "loss": 0.692, "num_input_tokens_seen": 12209200, "step": 21175 }, { "epoch": 3.154602323503128, "grad_norm": 1.6190162897109985, "learning_rate": 4.3257315902079055e-05, "loss": 0.8347, "num_input_tokens_seen": 12211888, "step": 21180 }, { "epoch": 3.155347036044087, "grad_norm": 1.386455774307251, "learning_rate": 4.3252875701827064e-05, "loss": 0.6439, "num_input_tokens_seen": 12214800, "step": 21185 }, { "epoch": 3.1560917485850464, "grad_norm": 1.4088013172149658, "learning_rate": 4.324843426811994e-05, "loss": 0.7016, "num_input_tokens_seen": 12218032, "step": 21190 }, { "epoch": 3.156836461126005, "grad_norm": 1.5884006023406982, "learning_rate": 4.324399160125782e-05, "loss": 0.5255, "num_input_tokens_seen": 12220816, "step": 21195 }, { "epoch": 3.157581173666965, "grad_norm": 1.05341637134552, "learning_rate": 4.323954770154093e-05, "loss": 0.6889, "num_input_tokens_seen": 12223824, "step": 21200 }, { "epoch": 3.1583258862079235, "grad_norm": 1.6590805053710938, "learning_rate": 4.323510256926956e-05, "loss": 0.567, "num_input_tokens_seen": 12226928, "step": 21205 }, { "epoch": 3.1590705987488827, "grad_norm": 1.4928401708602905, "learning_rate": 4.323065620474409e-05, "loss": 0.4709, "num_input_tokens_seen": 12229808, "step": 21210 }, { "epoch": 3.159815311289842, "grad_norm": 1.7286185026168823, "learning_rate": 4.3226208608265e-05, "loss": 0.4851, "num_input_tokens_seen": 12232400, "step": 21215 }, { "epoch": 3.160560023830801, "grad_norm": 1.960869550704956, "learning_rate": 4.322175978013283e-05, "loss": 0.6456, "num_input_tokens_seen": 12235344, "step": 21220 }, { "epoch": 3.1613047363717603, "grad_norm": 2.768962860107422, "learning_rate": 4.321730972064823e-05, "loss": 0.6154, "num_input_tokens_seen": 12238096, "step": 21225 }, { "epoch": 3.1620494489127196, "grad_norm": 1.5732203722000122, "learning_rate": 4.32128584301119e-05, "loss": 0.7059, "num_input_tokens_seen": 12241168, "step": 21230 }, { "epoch": 3.1627941614536788, "grad_norm": 1.2561267614364624, "learning_rate": 4.320840590882464e-05, "loss": 0.384, "num_input_tokens_seen": 12244112, "step": 21235 }, { "epoch": 3.163538873994638, "grad_norm": 1.4041130542755127, "learning_rate": 4.320395215708734e-05, "loss": 0.8253, "num_input_tokens_seen": 12247440, "step": 21240 }, { "epoch": 3.164283586535597, "grad_norm": 1.7633131742477417, "learning_rate": 4.319949717520096e-05, "loss": 0.6139, "num_input_tokens_seen": 12250256, "step": 21245 }, { "epoch": 3.1650282990765564, "grad_norm": 1.9879800081253052, "learning_rate": 4.319504096346657e-05, "loss": 0.69, "num_input_tokens_seen": 12252944, "step": 21250 }, { "epoch": 3.1657730116175156, "grad_norm": 5.045083522796631, "learning_rate": 4.319058352218528e-05, "loss": 0.7443, "num_input_tokens_seen": 12255696, "step": 21255 }, { "epoch": 3.1665177241584748, "grad_norm": 1.213842511177063, "learning_rate": 4.3186124851658305e-05, "loss": 0.6291, "num_input_tokens_seen": 12258544, "step": 21260 }, { "epoch": 3.167262436699434, "grad_norm": 1.170491337776184, "learning_rate": 4.318166495218696e-05, "loss": 0.7026, "num_input_tokens_seen": 12261808, "step": 21265 }, { "epoch": 3.168007149240393, "grad_norm": 2.1400351524353027, "learning_rate": 4.317720382407262e-05, "loss": 0.7023, "num_input_tokens_seen": 12264592, "step": 21270 }, { "epoch": 3.1687518617813524, "grad_norm": 1.525220513343811, "learning_rate": 4.317274146761674e-05, "loss": 0.5906, "num_input_tokens_seen": 12267440, "step": 21275 }, { "epoch": 3.1694965743223116, "grad_norm": 2.04892897605896, "learning_rate": 4.316827788312089e-05, "loss": 0.7074, "num_input_tokens_seen": 12270416, "step": 21280 }, { "epoch": 3.170241286863271, "grad_norm": 1.8542609214782715, "learning_rate": 4.316381307088668e-05, "loss": 0.5547, "num_input_tokens_seen": 12273424, "step": 21285 }, { "epoch": 3.17098599940423, "grad_norm": 1.6041064262390137, "learning_rate": 4.315934703121583e-05, "loss": 0.6956, "num_input_tokens_seen": 12276080, "step": 21290 }, { "epoch": 3.171730711945189, "grad_norm": 0.9326282739639282, "learning_rate": 4.315487976441014e-05, "loss": 0.5839, "num_input_tokens_seen": 12279152, "step": 21295 }, { "epoch": 3.1724754244861484, "grad_norm": 0.8702778220176697, "learning_rate": 4.3150411270771486e-05, "loss": 0.5938, "num_input_tokens_seen": 12282352, "step": 21300 }, { "epoch": 3.1732201370271076, "grad_norm": 1.3484591245651245, "learning_rate": 4.3145941550601836e-05, "loss": 0.7018, "num_input_tokens_seen": 12285040, "step": 21305 }, { "epoch": 3.173964849568067, "grad_norm": 0.8152409195899963, "learning_rate": 4.314147060420323e-05, "loss": 0.6305, "num_input_tokens_seen": 12288016, "step": 21310 }, { "epoch": 3.174709562109026, "grad_norm": 1.106789469718933, "learning_rate": 4.31369984318778e-05, "loss": 0.7711, "num_input_tokens_seen": 12290896, "step": 21315 }, { "epoch": 3.175454274649985, "grad_norm": 1.6592092514038086, "learning_rate": 4.313252503392775e-05, "loss": 0.6977, "num_input_tokens_seen": 12293904, "step": 21320 }, { "epoch": 3.1761989871909444, "grad_norm": 1.1904398202896118, "learning_rate": 4.3128050410655384e-05, "loss": 0.6321, "num_input_tokens_seen": 12296784, "step": 21325 }, { "epoch": 3.1769436997319036, "grad_norm": 1.1792336702346802, "learning_rate": 4.312357456236308e-05, "loss": 0.5445, "num_input_tokens_seen": 12300112, "step": 21330 }, { "epoch": 3.177688412272863, "grad_norm": 3.098309278488159, "learning_rate": 4.3119097489353285e-05, "loss": 0.6752, "num_input_tokens_seen": 12303120, "step": 21335 }, { "epoch": 3.178433124813822, "grad_norm": 1.3684823513031006, "learning_rate": 4.311461919192855e-05, "loss": 0.6823, "num_input_tokens_seen": 12306064, "step": 21340 }, { "epoch": 3.179177837354781, "grad_norm": 0.7802741527557373, "learning_rate": 4.31101396703915e-05, "loss": 0.6722, "num_input_tokens_seen": 12308912, "step": 21345 }, { "epoch": 3.1799225498957404, "grad_norm": 1.4148848056793213, "learning_rate": 4.310565892504484e-05, "loss": 0.5198, "num_input_tokens_seen": 12311792, "step": 21350 }, { "epoch": 3.1806672624366996, "grad_norm": 0.9710959196090698, "learning_rate": 4.3101176956191365e-05, "loss": 0.7182, "num_input_tokens_seen": 12314544, "step": 21355 }, { "epoch": 3.181411974977659, "grad_norm": 1.1509891748428345, "learning_rate": 4.309669376413394e-05, "loss": 0.5677, "num_input_tokens_seen": 12317456, "step": 21360 }, { "epoch": 3.182156687518618, "grad_norm": 0.7539880871772766, "learning_rate": 4.309220934917553e-05, "loss": 0.5628, "num_input_tokens_seen": 12320400, "step": 21365 }, { "epoch": 3.182901400059577, "grad_norm": 0.9725791811943054, "learning_rate": 4.3087723711619166e-05, "loss": 0.7105, "num_input_tokens_seen": 12323120, "step": 21370 }, { "epoch": 3.1836461126005364, "grad_norm": 0.9964293241500854, "learning_rate": 4.3083236851767976e-05, "loss": 0.6346, "num_input_tokens_seen": 12325936, "step": 21375 }, { "epoch": 3.184390825141495, "grad_norm": 0.9374880790710449, "learning_rate": 4.307874876992516e-05, "loss": 0.6293, "num_input_tokens_seen": 12328816, "step": 21380 }, { "epoch": 3.1851355376824544, "grad_norm": 0.9872552156448364, "learning_rate": 4.307425946639401e-05, "loss": 0.7608, "num_input_tokens_seen": 12331856, "step": 21385 }, { "epoch": 3.1858802502234136, "grad_norm": 1.1211451292037964, "learning_rate": 4.3069768941477885e-05, "loss": 0.683, "num_input_tokens_seen": 12334864, "step": 21390 }, { "epoch": 3.186624962764373, "grad_norm": 0.529511570930481, "learning_rate": 4.3065277195480235e-05, "loss": 0.5193, "num_input_tokens_seen": 12337328, "step": 21395 }, { "epoch": 3.187369675305332, "grad_norm": 1.2761896848678589, "learning_rate": 4.306078422870461e-05, "loss": 0.7364, "num_input_tokens_seen": 12340400, "step": 21400 }, { "epoch": 3.188114387846291, "grad_norm": 0.8979005217552185, "learning_rate": 4.3056290041454615e-05, "loss": 0.7411, "num_input_tokens_seen": 12343280, "step": 21405 }, { "epoch": 3.1888591003872504, "grad_norm": 0.9873735904693604, "learning_rate": 4.3051794634033946e-05, "loss": 0.5558, "num_input_tokens_seen": 12345936, "step": 21410 }, { "epoch": 3.1896038129282096, "grad_norm": 0.8252891302108765, "learning_rate": 4.304729800674639e-05, "loss": 0.686, "num_input_tokens_seen": 12348880, "step": 21415 }, { "epoch": 3.190348525469169, "grad_norm": 0.9130313396453857, "learning_rate": 4.304280015989581e-05, "loss": 0.5175, "num_input_tokens_seen": 12351984, "step": 21420 }, { "epoch": 3.191093238010128, "grad_norm": 1.24777352809906, "learning_rate": 4.303830109378616e-05, "loss": 0.6182, "num_input_tokens_seen": 12354896, "step": 21425 }, { "epoch": 3.191837950551087, "grad_norm": 1.3081443309783936, "learning_rate": 4.303380080872145e-05, "loss": 0.4923, "num_input_tokens_seen": 12357680, "step": 21430 }, { "epoch": 3.1925826630920464, "grad_norm": 1.3355917930603027, "learning_rate": 4.302929930500581e-05, "loss": 0.6038, "num_input_tokens_seen": 12360592, "step": 21435 }, { "epoch": 3.1933273756330056, "grad_norm": 1.0447651147842407, "learning_rate": 4.302479658294341e-05, "loss": 0.6848, "num_input_tokens_seen": 12363344, "step": 21440 }, { "epoch": 3.194072088173965, "grad_norm": 0.8453773260116577, "learning_rate": 4.3020292642838556e-05, "loss": 0.668, "num_input_tokens_seen": 12366352, "step": 21445 }, { "epoch": 3.194816800714924, "grad_norm": 1.0136538743972778, "learning_rate": 4.301578748499558e-05, "loss": 0.4355, "num_input_tokens_seen": 12369200, "step": 21450 }, { "epoch": 3.1955615132558832, "grad_norm": 1.110967993736267, "learning_rate": 4.301128110971895e-05, "loss": 0.6735, "num_input_tokens_seen": 12372208, "step": 21455 }, { "epoch": 3.1963062257968424, "grad_norm": 1.3979533910751343, "learning_rate": 4.300677351731315e-05, "loss": 0.5654, "num_input_tokens_seen": 12375152, "step": 21460 }, { "epoch": 3.1970509383378016, "grad_norm": 0.9324796795845032, "learning_rate": 4.300226470808282e-05, "loss": 0.8125, "num_input_tokens_seen": 12378096, "step": 21465 }, { "epoch": 3.197795650878761, "grad_norm": 1.622589111328125, "learning_rate": 4.2997754682332626e-05, "loss": 0.5632, "num_input_tokens_seen": 12381200, "step": 21470 }, { "epoch": 3.19854036341972, "grad_norm": 1.834316372871399, "learning_rate": 4.2993243440367345e-05, "loss": 0.5234, "num_input_tokens_seen": 12383952, "step": 21475 }, { "epoch": 3.1992850759606792, "grad_norm": 1.386076807975769, "learning_rate": 4.2988730982491824e-05, "loss": 0.5282, "num_input_tokens_seen": 12386672, "step": 21480 }, { "epoch": 3.2000297885016384, "grad_norm": 1.2320435047149658, "learning_rate": 4.2984217309011e-05, "loss": 0.678, "num_input_tokens_seen": 12389520, "step": 21485 }, { "epoch": 3.2007745010425976, "grad_norm": 2.0387604236602783, "learning_rate": 4.2979702420229894e-05, "loss": 0.6022, "num_input_tokens_seen": 12392560, "step": 21490 }, { "epoch": 3.201519213583557, "grad_norm": 1.888448715209961, "learning_rate": 4.29751863164536e-05, "loss": 0.5673, "num_input_tokens_seen": 12395184, "step": 21495 }, { "epoch": 3.202263926124516, "grad_norm": 0.981699526309967, "learning_rate": 4.2970668997987294e-05, "loss": 0.6188, "num_input_tokens_seen": 12398032, "step": 21500 }, { "epoch": 3.2030086386654752, "grad_norm": 1.6194641590118408, "learning_rate": 4.296615046513624e-05, "loss": 0.6591, "num_input_tokens_seen": 12401168, "step": 21505 }, { "epoch": 3.2037533512064345, "grad_norm": 1.8386707305908203, "learning_rate": 4.296163071820578e-05, "loss": 0.5939, "num_input_tokens_seen": 12404144, "step": 21510 }, { "epoch": 3.2044980637473937, "grad_norm": 1.2662664651870728, "learning_rate": 4.295710975750135e-05, "loss": 0.721, "num_input_tokens_seen": 12407216, "step": 21515 }, { "epoch": 3.205242776288353, "grad_norm": 0.774467408657074, "learning_rate": 4.295258758332845e-05, "loss": 0.8653, "num_input_tokens_seen": 12409904, "step": 21520 }, { "epoch": 3.205987488829312, "grad_norm": 1.4022831916809082, "learning_rate": 4.294806419599267e-05, "loss": 0.5928, "num_input_tokens_seen": 12413008, "step": 21525 }, { "epoch": 3.2067322013702713, "grad_norm": 1.7771546840667725, "learning_rate": 4.2943539595799675e-05, "loss": 0.7647, "num_input_tokens_seen": 12415888, "step": 21530 }, { "epoch": 3.2074769139112305, "grad_norm": 1.4056330919265747, "learning_rate": 4.293901378305523e-05, "loss": 0.6305, "num_input_tokens_seen": 12418800, "step": 21535 }, { "epoch": 3.2082216264521897, "grad_norm": 1.2742817401885986, "learning_rate": 4.2934486758065176e-05, "loss": 0.6732, "num_input_tokens_seen": 12421680, "step": 21540 }, { "epoch": 3.2089663389931484, "grad_norm": 1.6501377820968628, "learning_rate": 4.292995852113542e-05, "loss": 0.7111, "num_input_tokens_seen": 12424816, "step": 21545 }, { "epoch": 3.2097110515341076, "grad_norm": 1.166407585144043, "learning_rate": 4.292542907257196e-05, "loss": 0.4836, "num_input_tokens_seen": 12427472, "step": 21550 }, { "epoch": 3.210455764075067, "grad_norm": 0.4238300025463104, "learning_rate": 4.292089841268089e-05, "loss": 0.6532, "num_input_tokens_seen": 12430480, "step": 21555 }, { "epoch": 3.211200476616026, "grad_norm": 0.8380494713783264, "learning_rate": 4.291636654176836e-05, "loss": 0.6472, "num_input_tokens_seen": 12433360, "step": 21560 }, { "epoch": 3.2119451891569852, "grad_norm": 1.2659564018249512, "learning_rate": 4.291183346014063e-05, "loss": 0.5084, "num_input_tokens_seen": 12436336, "step": 21565 }, { "epoch": 3.2126899016979444, "grad_norm": 1.2891699075698853, "learning_rate": 4.290729916810401e-05, "loss": 0.6348, "num_input_tokens_seen": 12439184, "step": 21570 }, { "epoch": 3.2134346142389036, "grad_norm": 1.7189288139343262, "learning_rate": 4.290276366596492e-05, "loss": 0.6525, "num_input_tokens_seen": 12442000, "step": 21575 }, { "epoch": 3.214179326779863, "grad_norm": 2.9727158546447754, "learning_rate": 4.2898226954029844e-05, "loss": 0.7474, "num_input_tokens_seen": 12444720, "step": 21580 }, { "epoch": 3.214924039320822, "grad_norm": 1.8004987239837646, "learning_rate": 4.289368903260536e-05, "loss": 0.5047, "num_input_tokens_seen": 12447440, "step": 21585 }, { "epoch": 3.2156687518617812, "grad_norm": 1.012952208518982, "learning_rate": 4.288914990199814e-05, "loss": 0.7448, "num_input_tokens_seen": 12450256, "step": 21590 }, { "epoch": 3.2164134644027405, "grad_norm": 2.615187168121338, "learning_rate": 4.288460956251489e-05, "loss": 0.663, "num_input_tokens_seen": 12453136, "step": 21595 }, { "epoch": 3.2171581769436997, "grad_norm": 1.3193109035491943, "learning_rate": 4.288006801446243e-05, "loss": 0.6654, "num_input_tokens_seen": 12456272, "step": 21600 }, { "epoch": 3.217902889484659, "grad_norm": 1.881730079650879, "learning_rate": 4.287552525814768e-05, "loss": 0.7383, "num_input_tokens_seen": 12458992, "step": 21605 }, { "epoch": 3.218647602025618, "grad_norm": 1.857779860496521, "learning_rate": 4.2870981293877605e-05, "loss": 0.7141, "num_input_tokens_seen": 12461936, "step": 21610 }, { "epoch": 3.2193923145665773, "grad_norm": 1.5929551124572754, "learning_rate": 4.286643612195927e-05, "loss": 0.5553, "num_input_tokens_seen": 12464656, "step": 21615 }, { "epoch": 3.2201370271075365, "grad_norm": 1.1302835941314697, "learning_rate": 4.286188974269983e-05, "loss": 0.4035, "num_input_tokens_seen": 12467600, "step": 21620 }, { "epoch": 3.2208817396484957, "grad_norm": 1.5675394535064697, "learning_rate": 4.28573421564065e-05, "loss": 0.784, "num_input_tokens_seen": 12470512, "step": 21625 }, { "epoch": 3.221626452189455, "grad_norm": 1.4589675664901733, "learning_rate": 4.2852793363386585e-05, "loss": 0.5835, "num_input_tokens_seen": 12473328, "step": 21630 }, { "epoch": 3.222371164730414, "grad_norm": 1.25044584274292, "learning_rate": 4.2848243363947484e-05, "loss": 0.6938, "num_input_tokens_seen": 12476016, "step": 21635 }, { "epoch": 3.2231158772713733, "grad_norm": 2.1838696002960205, "learning_rate": 4.2843692158396655e-05, "loss": 0.656, "num_input_tokens_seen": 12479152, "step": 21640 }, { "epoch": 3.2238605898123325, "grad_norm": 1.2413501739501953, "learning_rate": 4.283913974704166e-05, "loss": 0.6826, "num_input_tokens_seen": 12481840, "step": 21645 }, { "epoch": 3.2246053023532917, "grad_norm": 1.0814729928970337, "learning_rate": 4.283458613019013e-05, "loss": 0.6625, "num_input_tokens_seen": 12484880, "step": 21650 }, { "epoch": 3.225350014894251, "grad_norm": 3.313594102859497, "learning_rate": 4.283003130814978e-05, "loss": 0.6579, "num_input_tokens_seen": 12487536, "step": 21655 }, { "epoch": 3.22609472743521, "grad_norm": 1.4252634048461914, "learning_rate": 4.2825475281228406e-05, "loss": 0.6197, "num_input_tokens_seen": 12490320, "step": 21660 }, { "epoch": 3.2268394399761693, "grad_norm": 1.9300274848937988, "learning_rate": 4.282091804973388e-05, "loss": 0.7221, "num_input_tokens_seen": 12493200, "step": 21665 }, { "epoch": 3.2275841525171285, "grad_norm": 1.1170145273208618, "learning_rate": 4.2816359613974176e-05, "loss": 0.5876, "num_input_tokens_seen": 12495664, "step": 21670 }, { "epoch": 3.2283288650580877, "grad_norm": 0.9283151030540466, "learning_rate": 4.281179997425732e-05, "loss": 0.5871, "num_input_tokens_seen": 12498864, "step": 21675 }, { "epoch": 3.229073577599047, "grad_norm": 1.1147674322128296, "learning_rate": 4.280723913089144e-05, "loss": 0.5802, "num_input_tokens_seen": 12501648, "step": 21680 }, { "epoch": 3.229818290140006, "grad_norm": 0.8114027976989746, "learning_rate": 4.280267708418474e-05, "loss": 0.533, "num_input_tokens_seen": 12504464, "step": 21685 }, { "epoch": 3.2305630026809653, "grad_norm": 1.0070343017578125, "learning_rate": 4.279811383444551e-05, "loss": 0.6751, "num_input_tokens_seen": 12507664, "step": 21690 }, { "epoch": 3.2313077152219245, "grad_norm": 0.9865366220474243, "learning_rate": 4.2793549381982095e-05, "loss": 0.5549, "num_input_tokens_seen": 12510320, "step": 21695 }, { "epoch": 3.2320524277628837, "grad_norm": 0.864531397819519, "learning_rate": 4.278898372710296e-05, "loss": 0.5537, "num_input_tokens_seen": 12513136, "step": 21700 }, { "epoch": 3.232797140303843, "grad_norm": 0.8938201069831848, "learning_rate": 4.2784416870116635e-05, "loss": 0.6049, "num_input_tokens_seen": 12515664, "step": 21705 }, { "epoch": 3.233541852844802, "grad_norm": 0.786736249923706, "learning_rate": 4.2779848811331726e-05, "loss": 0.67, "num_input_tokens_seen": 12518512, "step": 21710 }, { "epoch": 3.2342865653857613, "grad_norm": 3.1689157485961914, "learning_rate": 4.2775279551056914e-05, "loss": 0.6946, "num_input_tokens_seen": 12521232, "step": 21715 }, { "epoch": 3.23503127792672, "grad_norm": 2.607081174850464, "learning_rate": 4.277070908960098e-05, "loss": 0.746, "num_input_tokens_seen": 12523888, "step": 21720 }, { "epoch": 3.2357759904676793, "grad_norm": 1.1330265998840332, "learning_rate": 4.276613742727278e-05, "loss": 0.7889, "num_input_tokens_seen": 12526832, "step": 21725 }, { "epoch": 3.2365207030086385, "grad_norm": 1.0116674900054932, "learning_rate": 4.276156456438124e-05, "loss": 0.5858, "num_input_tokens_seen": 12529744, "step": 21730 }, { "epoch": 3.2372654155495977, "grad_norm": 1.7921466827392578, "learning_rate": 4.275699050123538e-05, "loss": 0.6806, "num_input_tokens_seen": 12532720, "step": 21735 }, { "epoch": 3.238010128090557, "grad_norm": 1.6640499830245972, "learning_rate": 4.27524152381443e-05, "loss": 0.6459, "num_input_tokens_seen": 12535504, "step": 21740 }, { "epoch": 3.238754840631516, "grad_norm": 1.6121917963027954, "learning_rate": 4.2747838775417174e-05, "loss": 0.6182, "num_input_tokens_seen": 12538768, "step": 21745 }, { "epoch": 3.2394995531724753, "grad_norm": 1.449965000152588, "learning_rate": 4.2743261113363266e-05, "loss": 0.8129, "num_input_tokens_seen": 12541616, "step": 21750 }, { "epoch": 3.2402442657134345, "grad_norm": 1.4640761613845825, "learning_rate": 4.27386822522919e-05, "loss": 0.6863, "num_input_tokens_seen": 12544400, "step": 21755 }, { "epoch": 3.2409889782543937, "grad_norm": 1.7244300842285156, "learning_rate": 4.273410219251252e-05, "loss": 0.6453, "num_input_tokens_seen": 12547280, "step": 21760 }, { "epoch": 3.241733690795353, "grad_norm": 1.8742989301681519, "learning_rate": 4.27295209343346e-05, "loss": 0.6469, "num_input_tokens_seen": 12550224, "step": 21765 }, { "epoch": 3.242478403336312, "grad_norm": 1.0864099264144897, "learning_rate": 4.2724938478067746e-05, "loss": 0.7998, "num_input_tokens_seen": 12553072, "step": 21770 }, { "epoch": 3.2432231158772713, "grad_norm": 0.9029388427734375, "learning_rate": 4.2720354824021616e-05, "loss": 0.6333, "num_input_tokens_seen": 12556016, "step": 21775 }, { "epoch": 3.2439678284182305, "grad_norm": 1.6289095878601074, "learning_rate": 4.271576997250595e-05, "loss": 0.5724, "num_input_tokens_seen": 12558704, "step": 21780 }, { "epoch": 3.2447125409591897, "grad_norm": 1.5137298107147217, "learning_rate": 4.271118392383058e-05, "loss": 0.688, "num_input_tokens_seen": 12561808, "step": 21785 }, { "epoch": 3.245457253500149, "grad_norm": 1.2190966606140137, "learning_rate": 4.2706596678305405e-05, "loss": 0.6131, "num_input_tokens_seen": 12564912, "step": 21790 }, { "epoch": 3.246201966041108, "grad_norm": 1.387291431427002, "learning_rate": 4.2702008236240424e-05, "loss": 0.5159, "num_input_tokens_seen": 12567760, "step": 21795 }, { "epoch": 3.2469466785820673, "grad_norm": 2.328357219696045, "learning_rate": 4.269741859794568e-05, "loss": 0.626, "num_input_tokens_seen": 12570512, "step": 21800 }, { "epoch": 3.2476913911230265, "grad_norm": 0.9249587655067444, "learning_rate": 4.2692827763731356e-05, "loss": 0.7033, "num_input_tokens_seen": 12573360, "step": 21805 }, { "epoch": 3.2484361036639857, "grad_norm": 3.435774564743042, "learning_rate": 4.268823573390766e-05, "loss": 0.6001, "num_input_tokens_seen": 12575984, "step": 21810 }, { "epoch": 3.249180816204945, "grad_norm": 2.441234827041626, "learning_rate": 4.26836425087849e-05, "loss": 0.7178, "num_input_tokens_seen": 12578800, "step": 21815 }, { "epoch": 3.249925528745904, "grad_norm": 0.8160472512245178, "learning_rate": 4.267904808867349e-05, "loss": 0.5227, "num_input_tokens_seen": 12581424, "step": 21820 }, { "epoch": 3.2506702412868633, "grad_norm": 2.081315517425537, "learning_rate": 4.267445247388389e-05, "loss": 0.6564, "num_input_tokens_seen": 12584464, "step": 21825 }, { "epoch": 3.2514149538278225, "grad_norm": 1.2593494653701782, "learning_rate": 4.2669855664726635e-05, "loss": 0.6334, "num_input_tokens_seen": 12587472, "step": 21830 }, { "epoch": 3.2521596663687817, "grad_norm": 1.9671791791915894, "learning_rate": 4.266525766151238e-05, "loss": 0.6821, "num_input_tokens_seen": 12590288, "step": 21835 }, { "epoch": 3.252904378909741, "grad_norm": 1.523995041847229, "learning_rate": 4.266065846455184e-05, "loss": 0.7353, "num_input_tokens_seen": 12593296, "step": 21840 }, { "epoch": 3.2536490914507, "grad_norm": 1.2284572124481201, "learning_rate": 4.26560580741558e-05, "loss": 0.6484, "num_input_tokens_seen": 12596144, "step": 21845 }, { "epoch": 3.2543938039916593, "grad_norm": 0.8893466591835022, "learning_rate": 4.2651456490635144e-05, "loss": 0.53, "num_input_tokens_seen": 12599152, "step": 21850 }, { "epoch": 3.2551385165326185, "grad_norm": 1.8101896047592163, "learning_rate": 4.2646853714300816e-05, "loss": 0.6845, "num_input_tokens_seen": 12601904, "step": 21855 }, { "epoch": 3.2558832290735777, "grad_norm": 1.6007698774337769, "learning_rate": 4.264224974546387e-05, "loss": 0.5723, "num_input_tokens_seen": 12604880, "step": 21860 }, { "epoch": 3.256627941614537, "grad_norm": 0.9493764638900757, "learning_rate": 4.263764458443541e-05, "loss": 0.7245, "num_input_tokens_seen": 12607920, "step": 21865 }, { "epoch": 3.257372654155496, "grad_norm": 1.1971635818481445, "learning_rate": 4.263303823152663e-05, "loss": 0.6624, "num_input_tokens_seen": 12610960, "step": 21870 }, { "epoch": 3.2581173666964554, "grad_norm": 1.1487616300582886, "learning_rate": 4.262843068704883e-05, "loss": 0.7967, "num_input_tokens_seen": 12613840, "step": 21875 }, { "epoch": 3.2588620792374146, "grad_norm": 0.6876872181892395, "learning_rate": 4.262382195131335e-05, "loss": 0.6968, "num_input_tokens_seen": 12616688, "step": 21880 }, { "epoch": 3.2596067917783733, "grad_norm": 1.5887329578399658, "learning_rate": 4.2619212024631636e-05, "loss": 0.6778, "num_input_tokens_seen": 12619312, "step": 21885 }, { "epoch": 3.260351504319333, "grad_norm": 0.8640909194946289, "learning_rate": 4.261460090731521e-05, "loss": 0.6229, "num_input_tokens_seen": 12622192, "step": 21890 }, { "epoch": 3.2610962168602917, "grad_norm": 1.0535049438476562, "learning_rate": 4.2609988599675665e-05, "loss": 0.7222, "num_input_tokens_seen": 12624912, "step": 21895 }, { "epoch": 3.2618409294012514, "grad_norm": 0.9733535051345825, "learning_rate": 4.2605375102024694e-05, "loss": 0.7248, "num_input_tokens_seen": 12628592, "step": 21900 }, { "epoch": 3.26258564194221, "grad_norm": 1.7220561504364014, "learning_rate": 4.2600760414674044e-05, "loss": 0.6045, "num_input_tokens_seen": 12631632, "step": 21905 }, { "epoch": 3.2633303544831693, "grad_norm": 1.1892656087875366, "learning_rate": 4.259614453793557e-05, "loss": 0.8092, "num_input_tokens_seen": 12634672, "step": 21910 }, { "epoch": 3.2640750670241285, "grad_norm": 0.8908698558807373, "learning_rate": 4.25915274721212e-05, "loss": 0.6815, "num_input_tokens_seen": 12637520, "step": 21915 }, { "epoch": 3.2648197795650877, "grad_norm": 1.8932058811187744, "learning_rate": 4.258690921754291e-05, "loss": 0.6061, "num_input_tokens_seen": 12640144, "step": 21920 }, { "epoch": 3.265564492106047, "grad_norm": 0.8033760786056519, "learning_rate": 4.25822897745128e-05, "loss": 0.6677, "num_input_tokens_seen": 12643088, "step": 21925 }, { "epoch": 3.266309204647006, "grad_norm": 0.9976306557655334, "learning_rate": 4.257766914334303e-05, "loss": 0.5959, "num_input_tokens_seen": 12645904, "step": 21930 }, { "epoch": 3.2670539171879653, "grad_norm": 1.331088900566101, "learning_rate": 4.257304732434585e-05, "loss": 0.6818, "num_input_tokens_seen": 12648784, "step": 21935 }, { "epoch": 3.2677986297289245, "grad_norm": 1.5594565868377686, "learning_rate": 4.256842431783358e-05, "loss": 0.6076, "num_input_tokens_seen": 12651632, "step": 21940 }, { "epoch": 3.2685433422698837, "grad_norm": 1.3575375080108643, "learning_rate": 4.256380012411862e-05, "loss": 0.5244, "num_input_tokens_seen": 12654544, "step": 21945 }, { "epoch": 3.269288054810843, "grad_norm": 1.0281788110733032, "learning_rate": 4.255917474351345e-05, "loss": 0.7272, "num_input_tokens_seen": 12657392, "step": 21950 }, { "epoch": 3.270032767351802, "grad_norm": 1.488627314567566, "learning_rate": 4.2554548176330655e-05, "loss": 0.7261, "num_input_tokens_seen": 12660400, "step": 21955 }, { "epoch": 3.2707774798927614, "grad_norm": 1.2578569650650024, "learning_rate": 4.254992042288286e-05, "loss": 0.5965, "num_input_tokens_seen": 12663440, "step": 21960 }, { "epoch": 3.2715221924337206, "grad_norm": 1.0200426578521729, "learning_rate": 4.254529148348279e-05, "loss": 0.6538, "num_input_tokens_seen": 12666416, "step": 21965 }, { "epoch": 3.2722669049746798, "grad_norm": 1.2003055810928345, "learning_rate": 4.254066135844326e-05, "loss": 0.5986, "num_input_tokens_seen": 12669168, "step": 21970 }, { "epoch": 3.273011617515639, "grad_norm": 1.1728419065475464, "learning_rate": 4.253603004807715e-05, "loss": 0.6021, "num_input_tokens_seen": 12672368, "step": 21975 }, { "epoch": 3.273756330056598, "grad_norm": 0.7292938828468323, "learning_rate": 4.253139755269743e-05, "loss": 0.6341, "num_input_tokens_seen": 12674896, "step": 21980 }, { "epoch": 3.2745010425975574, "grad_norm": 0.9951279759407043, "learning_rate": 4.2526763872617137e-05, "loss": 0.5273, "num_input_tokens_seen": 12677968, "step": 21985 }, { "epoch": 3.2752457551385166, "grad_norm": 1.4569257497787476, "learning_rate": 4.2522129008149395e-05, "loss": 0.6128, "num_input_tokens_seen": 12680816, "step": 21990 }, { "epoch": 3.2759904676794758, "grad_norm": 1.038009524345398, "learning_rate": 4.2517492959607426e-05, "loss": 0.5147, "num_input_tokens_seen": 12683664, "step": 21995 }, { "epoch": 3.276735180220435, "grad_norm": 0.7699519991874695, "learning_rate": 4.251285572730449e-05, "loss": 0.5548, "num_input_tokens_seen": 12686608, "step": 22000 }, { "epoch": 3.277479892761394, "grad_norm": 1.4698214530944824, "learning_rate": 4.250821731155398e-05, "loss": 0.6129, "num_input_tokens_seen": 12689392, "step": 22005 }, { "epoch": 3.2782246053023534, "grad_norm": 1.9161134958267212, "learning_rate": 4.250357771266932e-05, "loss": 0.7539, "num_input_tokens_seen": 12692240, "step": 22010 }, { "epoch": 3.2789693178433126, "grad_norm": 1.1435047388076782, "learning_rate": 4.249893693096404e-05, "loss": 0.5279, "num_input_tokens_seen": 12695344, "step": 22015 }, { "epoch": 3.279714030384272, "grad_norm": 1.3072901964187622, "learning_rate": 4.249429496675175e-05, "loss": 0.7504, "num_input_tokens_seen": 12697968, "step": 22020 }, { "epoch": 3.280458742925231, "grad_norm": 1.5290510654449463, "learning_rate": 4.248965182034613e-05, "loss": 0.6132, "num_input_tokens_seen": 12701200, "step": 22025 }, { "epoch": 3.28120345546619, "grad_norm": 1.7738032341003418, "learning_rate": 4.248500749206096e-05, "loss": 0.5974, "num_input_tokens_seen": 12704016, "step": 22030 }, { "epoch": 3.2819481680071494, "grad_norm": 3.1725525856018066, "learning_rate": 4.248036198221006e-05, "loss": 0.7065, "num_input_tokens_seen": 12706640, "step": 22035 }, { "epoch": 3.2826928805481086, "grad_norm": 1.7240632772445679, "learning_rate": 4.2475715291107374e-05, "loss": 0.6839, "num_input_tokens_seen": 12709392, "step": 22040 }, { "epoch": 3.283437593089068, "grad_norm": 1.1530530452728271, "learning_rate": 4.24710674190669e-05, "loss": 0.5803, "num_input_tokens_seen": 12711952, "step": 22045 }, { "epoch": 3.284182305630027, "grad_norm": 1.1236320734024048, "learning_rate": 4.2466418366402715e-05, "loss": 0.5041, "num_input_tokens_seen": 12714896, "step": 22050 }, { "epoch": 3.284927018170986, "grad_norm": 2.9348886013031006, "learning_rate": 4.2461768133428993e-05, "loss": 0.6409, "num_input_tokens_seen": 12717680, "step": 22055 }, { "epoch": 3.285671730711945, "grad_norm": 1.8661680221557617, "learning_rate": 4.2457116720459975e-05, "loss": 0.7414, "num_input_tokens_seen": 12720656, "step": 22060 }, { "epoch": 3.2864164432529046, "grad_norm": 2.016990900039673, "learning_rate": 4.245246412780999e-05, "loss": 0.6332, "num_input_tokens_seen": 12723728, "step": 22065 }, { "epoch": 3.2871611557938634, "grad_norm": 2.1253113746643066, "learning_rate": 4.244781035579343e-05, "loss": 0.4434, "num_input_tokens_seen": 12726256, "step": 22070 }, { "epoch": 3.2879058683348226, "grad_norm": 1.8149635791778564, "learning_rate": 4.244315540472478e-05, "loss": 0.8371, "num_input_tokens_seen": 12728912, "step": 22075 }, { "epoch": 3.2886505808757818, "grad_norm": 1.1612414121627808, "learning_rate": 4.243849927491861e-05, "loss": 0.5718, "num_input_tokens_seen": 12731888, "step": 22080 }, { "epoch": 3.289395293416741, "grad_norm": 2.6209843158721924, "learning_rate": 4.2433841966689564e-05, "loss": 0.6594, "num_input_tokens_seen": 12734608, "step": 22085 }, { "epoch": 3.2901400059577, "grad_norm": 1.4164597988128662, "learning_rate": 4.2429183480352354e-05, "loss": 0.7304, "num_input_tokens_seen": 12737424, "step": 22090 }, { "epoch": 3.2908847184986594, "grad_norm": 1.486530065536499, "learning_rate": 4.242452381622179e-05, "loss": 0.8623, "num_input_tokens_seen": 12740464, "step": 22095 }, { "epoch": 3.2916294310396186, "grad_norm": 1.9932211637496948, "learning_rate": 4.2419862974612744e-05, "loss": 0.6853, "num_input_tokens_seen": 12743408, "step": 22100 }, { "epoch": 3.292374143580578, "grad_norm": 1.8946481943130493, "learning_rate": 4.2415200955840184e-05, "loss": 0.6273, "num_input_tokens_seen": 12746000, "step": 22105 }, { "epoch": 3.293118856121537, "grad_norm": 1.7261079549789429, "learning_rate": 4.241053776021915e-05, "loss": 0.6821, "num_input_tokens_seen": 12749040, "step": 22110 }, { "epoch": 3.293863568662496, "grad_norm": 1.3980381488800049, "learning_rate": 4.240587338806476e-05, "loss": 0.7408, "num_input_tokens_seen": 12751920, "step": 22115 }, { "epoch": 3.2946082812034554, "grad_norm": 1.168823003768921, "learning_rate": 4.2401207839692217e-05, "loss": 0.6326, "num_input_tokens_seen": 12754704, "step": 22120 }, { "epoch": 3.2953529937444146, "grad_norm": 1.618553876876831, "learning_rate": 4.239654111541679e-05, "loss": 0.7631, "num_input_tokens_seen": 12757872, "step": 22125 }, { "epoch": 3.296097706285374, "grad_norm": 1.2768847942352295, "learning_rate": 4.239187321555384e-05, "loss": 0.7011, "num_input_tokens_seen": 12760976, "step": 22130 }, { "epoch": 3.296842418826333, "grad_norm": 1.5719107389450073, "learning_rate": 4.2387204140418815e-05, "loss": 0.742, "num_input_tokens_seen": 12763952, "step": 22135 }, { "epoch": 3.297587131367292, "grad_norm": 1.2134422063827515, "learning_rate": 4.238253389032723e-05, "loss": 0.6405, "num_input_tokens_seen": 12767056, "step": 22140 }, { "epoch": 3.2983318439082514, "grad_norm": 0.9847086668014526, "learning_rate": 4.237786246559467e-05, "loss": 0.8291, "num_input_tokens_seen": 12769744, "step": 22145 }, { "epoch": 3.2990765564492106, "grad_norm": 0.9829313158988953, "learning_rate": 4.2373189866536815e-05, "loss": 0.6791, "num_input_tokens_seen": 12772656, "step": 22150 }, { "epoch": 3.29982126899017, "grad_norm": 1.1250077486038208, "learning_rate": 4.236851609346943e-05, "loss": 0.6777, "num_input_tokens_seen": 12775472, "step": 22155 }, { "epoch": 3.300565981531129, "grad_norm": 1.0286288261413574, "learning_rate": 4.236384114670834e-05, "loss": 0.6009, "num_input_tokens_seen": 12778352, "step": 22160 }, { "epoch": 3.301310694072088, "grad_norm": 0.5959700345993042, "learning_rate": 4.2359165026569455e-05, "loss": 0.6497, "num_input_tokens_seen": 12781200, "step": 22165 }, { "epoch": 3.3020554066130474, "grad_norm": 1.1531089544296265, "learning_rate": 4.235448773336878e-05, "loss": 0.6207, "num_input_tokens_seen": 12783888, "step": 22170 }, { "epoch": 3.3028001191540066, "grad_norm": 1.7404357194900513, "learning_rate": 4.234980926742239e-05, "loss": 0.7777, "num_input_tokens_seen": 12786672, "step": 22175 }, { "epoch": 3.303544831694966, "grad_norm": 1.0442626476287842, "learning_rate": 4.2345129629046425e-05, "loss": 0.5983, "num_input_tokens_seen": 12789488, "step": 22180 }, { "epoch": 3.304289544235925, "grad_norm": 1.0274863243103027, "learning_rate": 4.234044881855711e-05, "loss": 0.6495, "num_input_tokens_seen": 12792240, "step": 22185 }, { "epoch": 3.3050342567768842, "grad_norm": 0.7426157593727112, "learning_rate": 4.233576683627078e-05, "loss": 0.6539, "num_input_tokens_seen": 12794864, "step": 22190 }, { "epoch": 3.3057789693178434, "grad_norm": 1.166630506515503, "learning_rate": 4.23310836825038e-05, "loss": 0.666, "num_input_tokens_seen": 12797712, "step": 22195 }, { "epoch": 3.3065236818588026, "grad_norm": 0.9360235333442688, "learning_rate": 4.2326399357572654e-05, "loss": 0.7044, "num_input_tokens_seen": 12800784, "step": 22200 }, { "epoch": 3.307268394399762, "grad_norm": 0.9441736340522766, "learning_rate": 4.232171386179388e-05, "loss": 0.4743, "num_input_tokens_seen": 12803664, "step": 22205 }, { "epoch": 3.308013106940721, "grad_norm": 0.8424004316329956, "learning_rate": 4.231702719548411e-05, "loss": 0.81, "num_input_tokens_seen": 12806192, "step": 22210 }, { "epoch": 3.3087578194816802, "grad_norm": 0.9149263501167297, "learning_rate": 4.231233935896004e-05, "loss": 0.5593, "num_input_tokens_seen": 12809488, "step": 22215 }, { "epoch": 3.3095025320226394, "grad_norm": 1.5824531316757202, "learning_rate": 4.2307650352538465e-05, "loss": 0.5861, "num_input_tokens_seen": 12812176, "step": 22220 }, { "epoch": 3.310247244563598, "grad_norm": 1.567407488822937, "learning_rate": 4.230296017653625e-05, "loss": 0.6427, "num_input_tokens_seen": 12815024, "step": 22225 }, { "epoch": 3.310991957104558, "grad_norm": 1.524962306022644, "learning_rate": 4.2298268831270335e-05, "loss": 0.6526, "num_input_tokens_seen": 12817776, "step": 22230 }, { "epoch": 3.3117366696455166, "grad_norm": 1.1996279954910278, "learning_rate": 4.229357631705774e-05, "loss": 0.6368, "num_input_tokens_seen": 12820400, "step": 22235 }, { "epoch": 3.3124813821864763, "grad_norm": 1.1105117797851562, "learning_rate": 4.228888263421557e-05, "loss": 0.7007, "num_input_tokens_seen": 12823472, "step": 22240 }, { "epoch": 3.313226094727435, "grad_norm": 1.146803379058838, "learning_rate": 4.2284187783061e-05, "loss": 0.6123, "num_input_tokens_seen": 12826032, "step": 22245 }, { "epoch": 3.313970807268394, "grad_norm": 0.9893986582756042, "learning_rate": 4.22794917639113e-05, "loss": 0.6842, "num_input_tokens_seen": 12828848, "step": 22250 }, { "epoch": 3.3147155198093534, "grad_norm": 1.106261134147644, "learning_rate": 4.227479457708379e-05, "loss": 0.5993, "num_input_tokens_seen": 12831728, "step": 22255 }, { "epoch": 3.3154602323503126, "grad_norm": 1.4602127075195312, "learning_rate": 4.22700962228959e-05, "loss": 0.7398, "num_input_tokens_seen": 12834576, "step": 22260 }, { "epoch": 3.316204944891272, "grad_norm": 1.3941069841384888, "learning_rate": 4.2265396701665125e-05, "loss": 0.7411, "num_input_tokens_seen": 12837232, "step": 22265 }, { "epoch": 3.316949657432231, "grad_norm": 1.5654278993606567, "learning_rate": 4.226069601370904e-05, "loss": 0.7225, "num_input_tokens_seen": 12840016, "step": 22270 }, { "epoch": 3.3176943699731902, "grad_norm": 1.0331279039382935, "learning_rate": 4.225599415934529e-05, "loss": 0.7367, "num_input_tokens_seen": 12843024, "step": 22275 }, { "epoch": 3.3184390825141494, "grad_norm": 0.8716992139816284, "learning_rate": 4.225129113889161e-05, "loss": 0.5611, "num_input_tokens_seen": 12845872, "step": 22280 }, { "epoch": 3.3191837950551086, "grad_norm": 0.9425951242446899, "learning_rate": 4.224658695266582e-05, "loss": 0.4406, "num_input_tokens_seen": 12848720, "step": 22285 }, { "epoch": 3.319928507596068, "grad_norm": 0.8159434795379639, "learning_rate": 4.22418816009858e-05, "loss": 0.5375, "num_input_tokens_seen": 12851600, "step": 22290 }, { "epoch": 3.320673220137027, "grad_norm": 2.1007730960845947, "learning_rate": 4.223717508416952e-05, "loss": 0.5227, "num_input_tokens_seen": 12854224, "step": 22295 }, { "epoch": 3.3214179326779862, "grad_norm": 1.430998682975769, "learning_rate": 4.2232467402535036e-05, "loss": 0.639, "num_input_tokens_seen": 12857232, "step": 22300 }, { "epoch": 3.3221626452189454, "grad_norm": 1.318630337715149, "learning_rate": 4.222775855640047e-05, "loss": 0.7363, "num_input_tokens_seen": 12860016, "step": 22305 }, { "epoch": 3.3229073577599046, "grad_norm": 1.3716437816619873, "learning_rate": 4.222304854608401e-05, "loss": 0.746, "num_input_tokens_seen": 12863120, "step": 22310 }, { "epoch": 3.323652070300864, "grad_norm": 0.9531961679458618, "learning_rate": 4.221833737190396e-05, "loss": 0.6181, "num_input_tokens_seen": 12865808, "step": 22315 }, { "epoch": 3.324396782841823, "grad_norm": 1.3665533065795898, "learning_rate": 4.2213625034178674e-05, "loss": 0.5263, "num_input_tokens_seen": 12868848, "step": 22320 }, { "epoch": 3.3251414953827823, "grad_norm": 3.9212706089019775, "learning_rate": 4.220891153322659e-05, "loss": 0.6764, "num_input_tokens_seen": 12871312, "step": 22325 }, { "epoch": 3.3258862079237415, "grad_norm": 2.433349609375, "learning_rate": 4.220419686936623e-05, "loss": 0.7982, "num_input_tokens_seen": 12874384, "step": 22330 }, { "epoch": 3.3266309204647007, "grad_norm": 1.5728119611740112, "learning_rate": 4.21994810429162e-05, "loss": 0.7454, "num_input_tokens_seen": 12876976, "step": 22335 }, { "epoch": 3.32737563300566, "grad_norm": 1.4600533246994019, "learning_rate": 4.2194764054195166e-05, "loss": 0.6179, "num_input_tokens_seen": 12879600, "step": 22340 }, { "epoch": 3.328120345546619, "grad_norm": 5.087888717651367, "learning_rate": 4.219004590352189e-05, "loss": 0.7527, "num_input_tokens_seen": 12882672, "step": 22345 }, { "epoch": 3.3288650580875783, "grad_norm": 1.4329848289489746, "learning_rate": 4.2185326591215196e-05, "loss": 0.5696, "num_input_tokens_seen": 12885392, "step": 22350 }, { "epoch": 3.3296097706285375, "grad_norm": 2.626375913619995, "learning_rate": 4.2180606117594e-05, "loss": 0.6491, "num_input_tokens_seen": 12888240, "step": 22355 }, { "epoch": 3.3303544831694967, "grad_norm": 1.5816162824630737, "learning_rate": 4.21758844829773e-05, "loss": 0.6986, "num_input_tokens_seen": 12890896, "step": 22360 }, { "epoch": 3.331099195710456, "grad_norm": 1.100115180015564, "learning_rate": 4.2171161687684156e-05, "loss": 0.5726, "num_input_tokens_seen": 12893840, "step": 22365 }, { "epoch": 3.331843908251415, "grad_norm": 2.5042386054992676, "learning_rate": 4.216643773203372e-05, "loss": 0.6002, "num_input_tokens_seen": 12896688, "step": 22370 }, { "epoch": 3.3325886207923743, "grad_norm": 1.2398570775985718, "learning_rate": 4.216171261634521e-05, "loss": 0.8097, "num_input_tokens_seen": 12899824, "step": 22375 }, { "epoch": 3.3333333333333335, "grad_norm": 3.7750296592712402, "learning_rate": 4.215698634093794e-05, "loss": 0.7782, "num_input_tokens_seen": 12902864, "step": 22380 }, { "epoch": 3.3340780458742927, "grad_norm": 1.737261176109314, "learning_rate": 4.2152258906131295e-05, "loss": 0.5549, "num_input_tokens_seen": 12905616, "step": 22385 }, { "epoch": 3.334822758415252, "grad_norm": 1.7990741729736328, "learning_rate": 4.214753031224472e-05, "loss": 0.6217, "num_input_tokens_seen": 12908816, "step": 22390 }, { "epoch": 3.335567470956211, "grad_norm": 3.327878475189209, "learning_rate": 4.2142800559597764e-05, "loss": 0.6124, "num_input_tokens_seen": 12911536, "step": 22395 }, { "epoch": 3.33631218349717, "grad_norm": 3.775167226791382, "learning_rate": 4.2138069648510045e-05, "loss": 0.8452, "num_input_tokens_seen": 12914384, "step": 22400 }, { "epoch": 3.3370568960381295, "grad_norm": 2.885486364364624, "learning_rate": 4.2133337579301255e-05, "loss": 0.6655, "num_input_tokens_seen": 12917136, "step": 22405 }, { "epoch": 3.3378016085790883, "grad_norm": 1.2646516561508179, "learning_rate": 4.212860435229117e-05, "loss": 0.7251, "num_input_tokens_seen": 12919824, "step": 22410 }, { "epoch": 3.338546321120048, "grad_norm": 1.2741490602493286, "learning_rate": 4.212386996779965e-05, "loss": 0.7456, "num_input_tokens_seen": 12922576, "step": 22415 }, { "epoch": 3.3392910336610067, "grad_norm": 0.9123364090919495, "learning_rate": 4.2119134426146614e-05, "loss": 0.4686, "num_input_tokens_seen": 12925648, "step": 22420 }, { "epoch": 3.340035746201966, "grad_norm": 1.0659855604171753, "learning_rate": 4.211439772765208e-05, "loss": 0.8079, "num_input_tokens_seen": 12928272, "step": 22425 }, { "epoch": 3.340780458742925, "grad_norm": 1.4340064525604248, "learning_rate": 4.210965987263612e-05, "loss": 0.5926, "num_input_tokens_seen": 12930960, "step": 22430 }, { "epoch": 3.3415251712838843, "grad_norm": 1.1283445358276367, "learning_rate": 4.2104920861418906e-05, "loss": 0.695, "num_input_tokens_seen": 12933808, "step": 22435 }, { "epoch": 3.3422698838248435, "grad_norm": 0.9555203914642334, "learning_rate": 4.21001806943207e-05, "loss": 0.7016, "num_input_tokens_seen": 12936816, "step": 22440 }, { "epoch": 3.3430145963658027, "grad_norm": 1.0642635822296143, "learning_rate": 4.209543937166179e-05, "loss": 0.5603, "num_input_tokens_seen": 12939728, "step": 22445 }, { "epoch": 3.343759308906762, "grad_norm": 1.1456241607666016, "learning_rate": 4.2090696893762605e-05, "loss": 0.5831, "num_input_tokens_seen": 12942544, "step": 22450 }, { "epoch": 3.344504021447721, "grad_norm": 2.445157051086426, "learning_rate": 4.20859532609436e-05, "loss": 0.5953, "num_input_tokens_seen": 12945328, "step": 22455 }, { "epoch": 3.3452487339886803, "grad_norm": 1.1582458019256592, "learning_rate": 4.208120847352535e-05, "loss": 0.5826, "num_input_tokens_seen": 12947824, "step": 22460 }, { "epoch": 3.3459934465296395, "grad_norm": 0.9261757731437683, "learning_rate": 4.207646253182847e-05, "loss": 0.6923, "num_input_tokens_seen": 12951120, "step": 22465 }, { "epoch": 3.3467381590705987, "grad_norm": 1.043142318725586, "learning_rate": 4.207171543617369e-05, "loss": 0.7501, "num_input_tokens_seen": 12953904, "step": 22470 }, { "epoch": 3.347482871611558, "grad_norm": 3.3780040740966797, "learning_rate": 4.206696718688178e-05, "loss": 0.6259, "num_input_tokens_seen": 12956752, "step": 22475 }, { "epoch": 3.348227584152517, "grad_norm": 0.9905321598052979, "learning_rate": 4.206221778427362e-05, "loss": 0.5878, "num_input_tokens_seen": 12959920, "step": 22480 }, { "epoch": 3.3489722966934763, "grad_norm": 1.1234968900680542, "learning_rate": 4.205746722867014e-05, "loss": 0.588, "num_input_tokens_seen": 12962704, "step": 22485 }, { "epoch": 3.3497170092344355, "grad_norm": 0.9825847744941711, "learning_rate": 4.2052715520392397e-05, "loss": 0.7254, "num_input_tokens_seen": 12965712, "step": 22490 }, { "epoch": 3.3504617217753947, "grad_norm": 1.1535015106201172, "learning_rate": 4.2047962659761454e-05, "loss": 0.5348, "num_input_tokens_seen": 12968592, "step": 22495 }, { "epoch": 3.351206434316354, "grad_norm": 0.6768991351127625, "learning_rate": 4.204320864709852e-05, "loss": 0.4877, "num_input_tokens_seen": 12971664, "step": 22500 }, { "epoch": 3.351951146857313, "grad_norm": 1.6017738580703735, "learning_rate": 4.203845348272483e-05, "loss": 0.6584, "num_input_tokens_seen": 12974448, "step": 22505 }, { "epoch": 3.3526958593982723, "grad_norm": 3.912118911743164, "learning_rate": 4.2033697166961716e-05, "loss": 0.7273, "num_input_tokens_seen": 12977424, "step": 22510 }, { "epoch": 3.3534405719392315, "grad_norm": 1.4516093730926514, "learning_rate": 4.202893970013062e-05, "loss": 0.7408, "num_input_tokens_seen": 12980272, "step": 22515 }, { "epoch": 3.3541852844801907, "grad_norm": 1.4033867120742798, "learning_rate": 4.202418108255301e-05, "loss": 0.6392, "num_input_tokens_seen": 12983184, "step": 22520 }, { "epoch": 3.35492999702115, "grad_norm": 1.124624490737915, "learning_rate": 4.201942131455045e-05, "loss": 0.5959, "num_input_tokens_seen": 12986096, "step": 22525 }, { "epoch": 3.355674709562109, "grad_norm": 1.6441731452941895, "learning_rate": 4.2014660396444596e-05, "loss": 0.6437, "num_input_tokens_seen": 12989072, "step": 22530 }, { "epoch": 3.3564194221030683, "grad_norm": 1.5018253326416016, "learning_rate": 4.200989832855717e-05, "loss": 0.758, "num_input_tokens_seen": 12991984, "step": 22535 }, { "epoch": 3.3571641346440275, "grad_norm": 0.8510207533836365, "learning_rate": 4.2005135111209976e-05, "loss": 0.5508, "num_input_tokens_seen": 12994704, "step": 22540 }, { "epoch": 3.3579088471849867, "grad_norm": 1.330384612083435, "learning_rate": 4.200037074472488e-05, "loss": 0.5946, "num_input_tokens_seen": 12997616, "step": 22545 }, { "epoch": 3.358653559725946, "grad_norm": 0.7745478749275208, "learning_rate": 4.1995605229423856e-05, "loss": 0.7242, "num_input_tokens_seen": 13000336, "step": 22550 }, { "epoch": 3.359398272266905, "grad_norm": 1.1750203371047974, "learning_rate": 4.199083856562893e-05, "loss": 0.8466, "num_input_tokens_seen": 13003024, "step": 22555 }, { "epoch": 3.3601429848078643, "grad_norm": 1.7274701595306396, "learning_rate": 4.198607075366221e-05, "loss": 0.5911, "num_input_tokens_seen": 13005968, "step": 22560 }, { "epoch": 3.3608876973488235, "grad_norm": 1.837429404258728, "learning_rate": 4.198130179384589e-05, "loss": 0.5896, "num_input_tokens_seen": 13008656, "step": 22565 }, { "epoch": 3.3616324098897827, "grad_norm": 1.9593353271484375, "learning_rate": 4.197653168650223e-05, "loss": 0.5126, "num_input_tokens_seen": 13011696, "step": 22570 }, { "epoch": 3.3623771224307415, "grad_norm": 1.3598510026931763, "learning_rate": 4.197176043195359e-05, "loss": 0.6833, "num_input_tokens_seen": 13014448, "step": 22575 }, { "epoch": 3.363121834971701, "grad_norm": 1.3359490633010864, "learning_rate": 4.196698803052237e-05, "loss": 0.6151, "num_input_tokens_seen": 13017552, "step": 22580 }, { "epoch": 3.36386654751266, "grad_norm": 1.0978094339370728, "learning_rate": 4.196221448253109e-05, "loss": 0.7761, "num_input_tokens_seen": 13020720, "step": 22585 }, { "epoch": 3.3646112600536195, "grad_norm": 1.1584120988845825, "learning_rate": 4.1957439788302325e-05, "loss": 0.6243, "num_input_tokens_seen": 13023344, "step": 22590 }, { "epoch": 3.3653559725945783, "grad_norm": 1.4071258306503296, "learning_rate": 4.195266394815871e-05, "loss": 0.5113, "num_input_tokens_seen": 13026672, "step": 22595 }, { "epoch": 3.3661006851355375, "grad_norm": 1.706950306892395, "learning_rate": 4.1947886962423e-05, "loss": 0.6205, "num_input_tokens_seen": 13029744, "step": 22600 }, { "epoch": 3.3668453976764967, "grad_norm": 1.2514419555664062, "learning_rate": 4.1943108831417987e-05, "loss": 0.7441, "num_input_tokens_seen": 13032720, "step": 22605 }, { "epoch": 3.367590110217456, "grad_norm": 2.922191619873047, "learning_rate": 4.193832955546657e-05, "loss": 0.7565, "num_input_tokens_seen": 13036080, "step": 22610 }, { "epoch": 3.368334822758415, "grad_norm": 2.625656843185425, "learning_rate": 4.1933549134891706e-05, "loss": 0.7449, "num_input_tokens_seen": 13038832, "step": 22615 }, { "epoch": 3.3690795352993743, "grad_norm": 3.6277129650115967, "learning_rate": 4.192876757001643e-05, "loss": 0.7092, "num_input_tokens_seen": 13041616, "step": 22620 }, { "epoch": 3.3698242478403335, "grad_norm": 1.836835265159607, "learning_rate": 4.1923984861163886e-05, "loss": 0.6917, "num_input_tokens_seen": 13044592, "step": 22625 }, { "epoch": 3.3705689603812927, "grad_norm": 1.8836297988891602, "learning_rate": 4.191920100865724e-05, "loss": 0.666, "num_input_tokens_seen": 13047248, "step": 22630 }, { "epoch": 3.371313672922252, "grad_norm": 1.457126498222351, "learning_rate": 4.191441601281978e-05, "loss": 0.6605, "num_input_tokens_seen": 13050096, "step": 22635 }, { "epoch": 3.372058385463211, "grad_norm": 1.2993370294570923, "learning_rate": 4.1909629873974865e-05, "loss": 0.5666, "num_input_tokens_seen": 13053040, "step": 22640 }, { "epoch": 3.3728030980041703, "grad_norm": 1.4015352725982666, "learning_rate": 4.1904842592445906e-05, "loss": 0.5649, "num_input_tokens_seen": 13055920, "step": 22645 }, { "epoch": 3.3735478105451295, "grad_norm": 0.9169224500656128, "learning_rate": 4.190005416855641e-05, "loss": 0.6113, "num_input_tokens_seen": 13058672, "step": 22650 }, { "epoch": 3.3742925230860887, "grad_norm": 1.1118478775024414, "learning_rate": 4.1895264602629966e-05, "loss": 0.6249, "num_input_tokens_seen": 13061776, "step": 22655 }, { "epoch": 3.375037235627048, "grad_norm": 1.137930989265442, "learning_rate": 4.189047389499023e-05, "loss": 0.6699, "num_input_tokens_seen": 13065040, "step": 22660 }, { "epoch": 3.375781948168007, "grad_norm": 1.4764292240142822, "learning_rate": 4.1885682045960945e-05, "loss": 0.6497, "num_input_tokens_seen": 13067984, "step": 22665 }, { "epoch": 3.3765266607089663, "grad_norm": 0.835065484046936, "learning_rate": 4.188088905586591e-05, "loss": 0.8425, "num_input_tokens_seen": 13070800, "step": 22670 }, { "epoch": 3.3772713732499255, "grad_norm": 1.487853765487671, "learning_rate": 4.1876094925029036e-05, "loss": 0.6988, "num_input_tokens_seen": 13073712, "step": 22675 }, { "epoch": 3.3780160857908847, "grad_norm": 0.9184942841529846, "learning_rate": 4.187129965377427e-05, "loss": 0.7065, "num_input_tokens_seen": 13076400, "step": 22680 }, { "epoch": 3.378760798331844, "grad_norm": 0.9582832455635071, "learning_rate": 4.186650324242568e-05, "loss": 0.671, "num_input_tokens_seen": 13079536, "step": 22685 }, { "epoch": 3.379505510872803, "grad_norm": 1.995331048965454, "learning_rate": 4.186170569130737e-05, "loss": 0.6252, "num_input_tokens_seen": 13082416, "step": 22690 }, { "epoch": 3.3802502234137624, "grad_norm": 1.361344337463379, "learning_rate": 4.185690700074354e-05, "loss": 0.7014, "num_input_tokens_seen": 13085008, "step": 22695 }, { "epoch": 3.3809949359547216, "grad_norm": 0.7997075915336609, "learning_rate": 4.185210717105848e-05, "loss": 0.6796, "num_input_tokens_seen": 13087824, "step": 22700 }, { "epoch": 3.3817396484956808, "grad_norm": 1.7272177934646606, "learning_rate": 4.184730620257652e-05, "loss": 0.8224, "num_input_tokens_seen": 13090416, "step": 22705 }, { "epoch": 3.38248436103664, "grad_norm": 0.9003263115882874, "learning_rate": 4.18425040956221e-05, "loss": 0.6051, "num_input_tokens_seen": 13093968, "step": 22710 }, { "epoch": 3.383229073577599, "grad_norm": 1.063405990600586, "learning_rate": 4.183770085051974e-05, "loss": 0.7402, "num_input_tokens_seen": 13096752, "step": 22715 }, { "epoch": 3.3839737861185584, "grad_norm": 0.7059954404830933, "learning_rate": 4.183289646759402e-05, "loss": 0.908, "num_input_tokens_seen": 13099536, "step": 22720 }, { "epoch": 3.3847184986595176, "grad_norm": 0.7257516384124756, "learning_rate": 4.182809094716958e-05, "loss": 0.6313, "num_input_tokens_seen": 13102128, "step": 22725 }, { "epoch": 3.3854632112004768, "grad_norm": 1.0442359447479248, "learning_rate": 4.182328428957118e-05, "loss": 0.7439, "num_input_tokens_seen": 13105168, "step": 22730 }, { "epoch": 3.386207923741436, "grad_norm": 0.6194037199020386, "learning_rate": 4.181847649512362e-05, "loss": 0.6428, "num_input_tokens_seen": 13108272, "step": 22735 }, { "epoch": 3.386952636282395, "grad_norm": 1.032373070716858, "learning_rate": 4.181366756415181e-05, "loss": 0.6803, "num_input_tokens_seen": 13111184, "step": 22740 }, { "epoch": 3.3876973488233544, "grad_norm": 0.7433532476425171, "learning_rate": 4.18088574969807e-05, "loss": 0.6126, "num_input_tokens_seen": 13114000, "step": 22745 }, { "epoch": 3.388442061364313, "grad_norm": 1.368732213973999, "learning_rate": 4.1804046293935334e-05, "loss": 0.73, "num_input_tokens_seen": 13116944, "step": 22750 }, { "epoch": 3.389186773905273, "grad_norm": 2.0376832485198975, "learning_rate": 4.179923395534084e-05, "loss": 0.7854, "num_input_tokens_seen": 13119856, "step": 22755 }, { "epoch": 3.3899314864462315, "grad_norm": 0.971314013004303, "learning_rate": 4.1794420481522424e-05, "loss": 0.6932, "num_input_tokens_seen": 13123056, "step": 22760 }, { "epoch": 3.390676198987191, "grad_norm": 1.0543066263198853, "learning_rate": 4.178960587280535e-05, "loss": 0.742, "num_input_tokens_seen": 13126032, "step": 22765 }, { "epoch": 3.39142091152815, "grad_norm": 0.8563473224639893, "learning_rate": 4.178479012951497e-05, "loss": 0.6575, "num_input_tokens_seen": 13129200, "step": 22770 }, { "epoch": 3.392165624069109, "grad_norm": 1.0931860208511353, "learning_rate": 4.177997325197671e-05, "loss": 0.535, "num_input_tokens_seen": 13131920, "step": 22775 }, { "epoch": 3.3929103366100684, "grad_norm": 1.1318570375442505, "learning_rate": 4.177515524051609e-05, "loss": 0.6147, "num_input_tokens_seen": 13134704, "step": 22780 }, { "epoch": 3.3936550491510276, "grad_norm": 0.785899817943573, "learning_rate": 4.1770336095458676e-05, "loss": 0.6416, "num_input_tokens_seen": 13137424, "step": 22785 }, { "epoch": 3.3943997616919868, "grad_norm": 0.8326783180236816, "learning_rate": 4.176551581713013e-05, "loss": 0.5585, "num_input_tokens_seen": 13140336, "step": 22790 }, { "epoch": 3.395144474232946, "grad_norm": 0.7795089483261108, "learning_rate": 4.1760694405856194e-05, "loss": 0.5779, "num_input_tokens_seen": 13142960, "step": 22795 }, { "epoch": 3.395889186773905, "grad_norm": 0.6917797923088074, "learning_rate": 4.1755871861962674e-05, "loss": 0.5537, "num_input_tokens_seen": 13145872, "step": 22800 }, { "epoch": 3.3966338993148644, "grad_norm": 1.0164899826049805, "learning_rate": 4.175104818577545e-05, "loss": 0.6526, "num_input_tokens_seen": 13148880, "step": 22805 }, { "epoch": 3.3973786118558236, "grad_norm": 0.8011967539787292, "learning_rate": 4.174622337762051e-05, "loss": 0.6498, "num_input_tokens_seen": 13151856, "step": 22810 }, { "epoch": 3.3981233243967828, "grad_norm": 0.922678530216217, "learning_rate": 4.174139743782387e-05, "loss": 0.5825, "num_input_tokens_seen": 13155088, "step": 22815 }, { "epoch": 3.398868036937742, "grad_norm": 1.449967384338379, "learning_rate": 4.173657036671166e-05, "loss": 0.5939, "num_input_tokens_seen": 13157872, "step": 22820 }, { "epoch": 3.399612749478701, "grad_norm": 1.5062921047210693, "learning_rate": 4.173174216461006e-05, "loss": 0.7326, "num_input_tokens_seen": 13160912, "step": 22825 }, { "epoch": 3.4003574620196604, "grad_norm": 1.2863008975982666, "learning_rate": 4.172691283184536e-05, "loss": 0.5762, "num_input_tokens_seen": 13163888, "step": 22830 }, { "epoch": 3.4011021745606196, "grad_norm": 1.187108039855957, "learning_rate": 4.172208236874389e-05, "loss": 0.5764, "num_input_tokens_seen": 13166608, "step": 22835 }, { "epoch": 3.401846887101579, "grad_norm": 2.1952672004699707, "learning_rate": 4.1717250775632086e-05, "loss": 0.7129, "num_input_tokens_seen": 13169424, "step": 22840 }, { "epoch": 3.402591599642538, "grad_norm": 1.231274127960205, "learning_rate": 4.1712418052836445e-05, "loss": 0.6363, "num_input_tokens_seen": 13172336, "step": 22845 }, { "epoch": 3.403336312183497, "grad_norm": 2.0146024227142334, "learning_rate": 4.1707584200683535e-05, "loss": 0.5999, "num_input_tokens_seen": 13175440, "step": 22850 }, { "epoch": 3.4040810247244564, "grad_norm": 0.7706981301307678, "learning_rate": 4.170274921950001e-05, "loss": 0.6758, "num_input_tokens_seen": 13178352, "step": 22855 }, { "epoch": 3.4048257372654156, "grad_norm": 0.8539385199546814, "learning_rate": 4.169791310961261e-05, "loss": 0.6155, "num_input_tokens_seen": 13181456, "step": 22860 }, { "epoch": 3.405570449806375, "grad_norm": 1.172934651374817, "learning_rate": 4.169307587134813e-05, "loss": 0.6471, "num_input_tokens_seen": 13184080, "step": 22865 }, { "epoch": 3.406315162347334, "grad_norm": 0.9619616270065308, "learning_rate": 4.1688237505033454e-05, "loss": 0.6122, "num_input_tokens_seen": 13186736, "step": 22870 }, { "epoch": 3.407059874888293, "grad_norm": 1.2813611030578613, "learning_rate": 4.168339801099552e-05, "loss": 0.5917, "num_input_tokens_seen": 13189872, "step": 22875 }, { "epoch": 3.4078045874292524, "grad_norm": 0.7488526701927185, "learning_rate": 4.167855738956139e-05, "loss": 0.5374, "num_input_tokens_seen": 13192720, "step": 22880 }, { "epoch": 3.4085492999702116, "grad_norm": 1.5291752815246582, "learning_rate": 4.1673715641058165e-05, "loss": 0.6855, "num_input_tokens_seen": 13196048, "step": 22885 }, { "epoch": 3.409294012511171, "grad_norm": 2.2406980991363525, "learning_rate": 4.1668872765813025e-05, "loss": 0.6609, "num_input_tokens_seen": 13198736, "step": 22890 }, { "epoch": 3.41003872505213, "grad_norm": 2.5388073921203613, "learning_rate": 4.166402876415323e-05, "loss": 0.7209, "num_input_tokens_seen": 13201776, "step": 22895 }, { "epoch": 3.410783437593089, "grad_norm": 1.4406211376190186, "learning_rate": 4.1659183636406126e-05, "loss": 0.7132, "num_input_tokens_seen": 13204624, "step": 22900 }, { "epoch": 3.4115281501340484, "grad_norm": 1.1214474439620972, "learning_rate": 4.165433738289912e-05, "loss": 0.506, "num_input_tokens_seen": 13207696, "step": 22905 }, { "epoch": 3.4122728626750076, "grad_norm": 1.0867903232574463, "learning_rate": 4.164949000395971e-05, "loss": 0.5187, "num_input_tokens_seen": 13210448, "step": 22910 }, { "epoch": 3.413017575215967, "grad_norm": 1.2266883850097656, "learning_rate": 4.1644641499915454e-05, "loss": 0.6074, "num_input_tokens_seen": 13213264, "step": 22915 }, { "epoch": 3.413762287756926, "grad_norm": 2.878953218460083, "learning_rate": 4.1639791871094e-05, "loss": 0.7483, "num_input_tokens_seen": 13215952, "step": 22920 }, { "epoch": 3.414507000297885, "grad_norm": 1.3223824501037598, "learning_rate": 4.1634941117823065e-05, "loss": 0.7122, "num_input_tokens_seen": 13218512, "step": 22925 }, { "epoch": 3.4152517128388444, "grad_norm": 1.8847777843475342, "learning_rate": 4.1630089240430434e-05, "loss": 0.6425, "num_input_tokens_seen": 13221584, "step": 22930 }, { "epoch": 3.415996425379803, "grad_norm": 0.7917786240577698, "learning_rate": 4.162523623924399e-05, "loss": 0.5415, "num_input_tokens_seen": 13224368, "step": 22935 }, { "epoch": 3.4167411379207624, "grad_norm": 0.6683441996574402, "learning_rate": 4.162038211459167e-05, "loss": 0.7202, "num_input_tokens_seen": 13226928, "step": 22940 }, { "epoch": 3.4174858504617216, "grad_norm": 0.9597492218017578, "learning_rate": 4.161552686680151e-05, "loss": 0.6385, "num_input_tokens_seen": 13229776, "step": 22945 }, { "epoch": 3.418230563002681, "grad_norm": 0.9618814587593079, "learning_rate": 4.161067049620159e-05, "loss": 0.5723, "num_input_tokens_seen": 13232624, "step": 22950 }, { "epoch": 3.41897527554364, "grad_norm": 1.2883473634719849, "learning_rate": 4.16058130031201e-05, "loss": 0.6149, "num_input_tokens_seen": 13235312, "step": 22955 }, { "epoch": 3.419719988084599, "grad_norm": 1.143604040145874, "learning_rate": 4.160095438788527e-05, "loss": 0.7431, "num_input_tokens_seen": 13238000, "step": 22960 }, { "epoch": 3.4204647006255584, "grad_norm": 2.2831876277923584, "learning_rate": 4.1596094650825446e-05, "loss": 0.7064, "num_input_tokens_seen": 13240912, "step": 22965 }, { "epoch": 3.4212094131665176, "grad_norm": 0.9838134050369263, "learning_rate": 4.159123379226902e-05, "loss": 0.5968, "num_input_tokens_seen": 13243696, "step": 22970 }, { "epoch": 3.421954125707477, "grad_norm": 1.7913743257522583, "learning_rate": 4.158637181254447e-05, "loss": 0.6334, "num_input_tokens_seen": 13246992, "step": 22975 }, { "epoch": 3.422698838248436, "grad_norm": 1.1426454782485962, "learning_rate": 4.158150871198034e-05, "loss": 0.5047, "num_input_tokens_seen": 13249808, "step": 22980 }, { "epoch": 3.423443550789395, "grad_norm": 0.9564453959465027, "learning_rate": 4.157664449090527e-05, "loss": 0.8387, "num_input_tokens_seen": 13253040, "step": 22985 }, { "epoch": 3.4241882633303544, "grad_norm": 0.8882635831832886, "learning_rate": 4.1571779149647964e-05, "loss": 0.7859, "num_input_tokens_seen": 13256240, "step": 22990 }, { "epoch": 3.4249329758713136, "grad_norm": 1.3498786687850952, "learning_rate": 4.1566912688537195e-05, "loss": 0.6444, "num_input_tokens_seen": 13258928, "step": 22995 }, { "epoch": 3.425677688412273, "grad_norm": 1.0028948783874512, "learning_rate": 4.156204510790183e-05, "loss": 0.6664, "num_input_tokens_seen": 13261712, "step": 23000 }, { "epoch": 3.426422400953232, "grad_norm": 4.112103462219238, "learning_rate": 4.1557176408070784e-05, "loss": 0.5807, "num_input_tokens_seen": 13264400, "step": 23005 }, { "epoch": 3.4271671134941912, "grad_norm": 3.3645529747009277, "learning_rate": 4.155230658937308e-05, "loss": 0.7171, "num_input_tokens_seen": 13267440, "step": 23010 }, { "epoch": 3.4279118260351504, "grad_norm": 0.757938027381897, "learning_rate": 4.154743565213779e-05, "loss": 0.6556, "num_input_tokens_seen": 13270544, "step": 23015 }, { "epoch": 3.4286565385761096, "grad_norm": 3.0957915782928467, "learning_rate": 4.154256359669408e-05, "loss": 0.7347, "num_input_tokens_seen": 13273456, "step": 23020 }, { "epoch": 3.429401251117069, "grad_norm": 1.196717381477356, "learning_rate": 4.153769042337118e-05, "loss": 0.6247, "num_input_tokens_seen": 13276496, "step": 23025 }, { "epoch": 3.430145963658028, "grad_norm": 0.8848860263824463, "learning_rate": 4.153281613249839e-05, "loss": 0.6701, "num_input_tokens_seen": 13279152, "step": 23030 }, { "epoch": 3.4308906761989872, "grad_norm": 0.7519732713699341, "learning_rate": 4.152794072440511e-05, "loss": 0.5451, "num_input_tokens_seen": 13282192, "step": 23035 }, { "epoch": 3.4316353887399464, "grad_norm": 1.3032950162887573, "learning_rate": 4.1523064199420786e-05, "loss": 0.7461, "num_input_tokens_seen": 13285136, "step": 23040 }, { "epoch": 3.4323801012809056, "grad_norm": 1.6840468645095825, "learning_rate": 4.1518186557874974e-05, "loss": 0.6499, "num_input_tokens_seen": 13288176, "step": 23045 }, { "epoch": 3.433124813821865, "grad_norm": 0.8488192558288574, "learning_rate": 4.151330780009726e-05, "loss": 0.6688, "num_input_tokens_seen": 13290864, "step": 23050 }, { "epoch": 3.433869526362824, "grad_norm": 1.73896324634552, "learning_rate": 4.150842792641735e-05, "loss": 0.6294, "num_input_tokens_seen": 13293968, "step": 23055 }, { "epoch": 3.4346142389037833, "grad_norm": 1.0096681118011475, "learning_rate": 4.1503546937165e-05, "loss": 0.8054, "num_input_tokens_seen": 13296592, "step": 23060 }, { "epoch": 3.4353589514447425, "grad_norm": 0.8925840258598328, "learning_rate": 4.1498664832670045e-05, "loss": 0.5678, "num_input_tokens_seen": 13299216, "step": 23065 }, { "epoch": 3.4361036639857017, "grad_norm": 1.329673171043396, "learning_rate": 4.149378161326239e-05, "loss": 0.6508, "num_input_tokens_seen": 13302096, "step": 23070 }, { "epoch": 3.436848376526661, "grad_norm": 0.8135559558868408, "learning_rate": 4.148889727927204e-05, "loss": 0.7005, "num_input_tokens_seen": 13304880, "step": 23075 }, { "epoch": 3.43759308906762, "grad_norm": 0.9689428806304932, "learning_rate": 4.1484011831029054e-05, "loss": 0.7141, "num_input_tokens_seen": 13307696, "step": 23080 }, { "epoch": 3.4383378016085793, "grad_norm": 1.0325722694396973, "learning_rate": 4.147912526886356e-05, "loss": 0.6769, "num_input_tokens_seen": 13310704, "step": 23085 }, { "epoch": 3.4390825141495385, "grad_norm": 1.420512318611145, "learning_rate": 4.147423759310579e-05, "loss": 0.6731, "num_input_tokens_seen": 13313808, "step": 23090 }, { "epoch": 3.4398272266904977, "grad_norm": 1.5610934495925903, "learning_rate": 4.1469348804086016e-05, "loss": 0.6221, "num_input_tokens_seen": 13316592, "step": 23095 }, { "epoch": 3.4405719392314564, "grad_norm": 0.8848094940185547, "learning_rate": 4.14644589021346e-05, "loss": 0.6822, "num_input_tokens_seen": 13319600, "step": 23100 }, { "epoch": 3.441316651772416, "grad_norm": 1.5505515336990356, "learning_rate": 4.1459567887582015e-05, "loss": 0.7264, "num_input_tokens_seen": 13322768, "step": 23105 }, { "epoch": 3.442061364313375, "grad_norm": 1.0512055158615112, "learning_rate": 4.145467576075874e-05, "loss": 0.6822, "num_input_tokens_seen": 13325872, "step": 23110 }, { "epoch": 3.442806076854334, "grad_norm": 0.835456907749176, "learning_rate": 4.144978252199537e-05, "loss": 0.5463, "num_input_tokens_seen": 13328976, "step": 23115 }, { "epoch": 3.4435507893952932, "grad_norm": 1.9471681118011475, "learning_rate": 4.1444888171622584e-05, "loss": 0.6886, "num_input_tokens_seen": 13331568, "step": 23120 }, { "epoch": 3.4442955019362524, "grad_norm": 0.8770983219146729, "learning_rate": 4.143999270997111e-05, "loss": 0.6075, "num_input_tokens_seen": 13334288, "step": 23125 }, { "epoch": 3.4450402144772116, "grad_norm": 1.291083574295044, "learning_rate": 4.143509613737178e-05, "loss": 0.4876, "num_input_tokens_seen": 13336816, "step": 23130 }, { "epoch": 3.445784927018171, "grad_norm": 0.9660540819168091, "learning_rate": 4.143019845415546e-05, "loss": 0.6254, "num_input_tokens_seen": 13339760, "step": 23135 }, { "epoch": 3.44652963955913, "grad_norm": 0.7563421130180359, "learning_rate": 4.142529966065314e-05, "loss": 0.7086, "num_input_tokens_seen": 13342736, "step": 23140 }, { "epoch": 3.4472743521000893, "grad_norm": 2.4290666580200195, "learning_rate": 4.1420399757195845e-05, "loss": 0.6475, "num_input_tokens_seen": 13345776, "step": 23145 }, { "epoch": 3.4480190646410485, "grad_norm": 1.4134083986282349, "learning_rate": 4.141549874411469e-05, "loss": 0.5579, "num_input_tokens_seen": 13348592, "step": 23150 }, { "epoch": 3.4487637771820077, "grad_norm": 1.6095308065414429, "learning_rate": 4.1410596621740874e-05, "loss": 0.5913, "num_input_tokens_seen": 13351504, "step": 23155 }, { "epoch": 3.449508489722967, "grad_norm": 0.9432778358459473, "learning_rate": 4.140569339040566e-05, "loss": 0.8171, "num_input_tokens_seen": 13354256, "step": 23160 }, { "epoch": 3.450253202263926, "grad_norm": 1.2512531280517578, "learning_rate": 4.140078905044039e-05, "loss": 0.6925, "num_input_tokens_seen": 13357040, "step": 23165 }, { "epoch": 3.4509979148048853, "grad_norm": 1.5512382984161377, "learning_rate": 4.1395883602176466e-05, "loss": 0.632, "num_input_tokens_seen": 13360176, "step": 23170 }, { "epoch": 3.4517426273458445, "grad_norm": 1.6752256155014038, "learning_rate": 4.13909770459454e-05, "loss": 0.5614, "num_input_tokens_seen": 13363152, "step": 23175 }, { "epoch": 3.4524873398868037, "grad_norm": 1.9630550146102905, "learning_rate": 4.138606938207874e-05, "loss": 0.5894, "num_input_tokens_seen": 13366064, "step": 23180 }, { "epoch": 3.453232052427763, "grad_norm": 0.6979066133499146, "learning_rate": 4.1381160610908134e-05, "loss": 0.5405, "num_input_tokens_seen": 13368784, "step": 23185 }, { "epoch": 3.453976764968722, "grad_norm": 0.6953665018081665, "learning_rate": 4.13762507327653e-05, "loss": 0.524, "num_input_tokens_seen": 13371792, "step": 23190 }, { "epoch": 3.4547214775096813, "grad_norm": 2.455522298812866, "learning_rate": 4.137133974798202e-05, "loss": 0.6079, "num_input_tokens_seen": 13374864, "step": 23195 }, { "epoch": 3.4554661900506405, "grad_norm": 1.8851318359375, "learning_rate": 4.1366427656890156e-05, "loss": 0.6904, "num_input_tokens_seen": 13377424, "step": 23200 }, { "epoch": 3.4562109025915997, "grad_norm": 1.5552035570144653, "learning_rate": 4.136151445982165e-05, "loss": 0.523, "num_input_tokens_seen": 13380080, "step": 23205 }, { "epoch": 3.456955615132559, "grad_norm": 2.1035547256469727, "learning_rate": 4.135660015710853e-05, "loss": 0.7694, "num_input_tokens_seen": 13382928, "step": 23210 }, { "epoch": 3.457700327673518, "grad_norm": 1.1568673849105835, "learning_rate": 4.1351684749082866e-05, "loss": 0.6004, "num_input_tokens_seen": 13385872, "step": 23215 }, { "epoch": 3.4584450402144773, "grad_norm": 0.6959912180900574, "learning_rate": 4.1346768236076825e-05, "loss": 0.6032, "num_input_tokens_seen": 13388880, "step": 23220 }, { "epoch": 3.4591897527554365, "grad_norm": 1.6721662282943726, "learning_rate": 4.134185061842265e-05, "loss": 0.5816, "num_input_tokens_seen": 13391792, "step": 23225 }, { "epoch": 3.4599344652963957, "grad_norm": 1.1300714015960693, "learning_rate": 4.133693189645265e-05, "loss": 0.5339, "num_input_tokens_seen": 13394672, "step": 23230 }, { "epoch": 3.460679177837355, "grad_norm": 0.7426145076751709, "learning_rate": 4.133201207049921e-05, "loss": 0.5803, "num_input_tokens_seen": 13397584, "step": 23235 }, { "epoch": 3.461423890378314, "grad_norm": 0.9682381749153137, "learning_rate": 4.1327091140894805e-05, "loss": 0.634, "num_input_tokens_seen": 13400400, "step": 23240 }, { "epoch": 3.4621686029192733, "grad_norm": 1.4119584560394287, "learning_rate": 4.132216910797195e-05, "loss": 0.8313, "num_input_tokens_seen": 13403152, "step": 23245 }, { "epoch": 3.4629133154602325, "grad_norm": 0.7663986682891846, "learning_rate": 4.131724597206328e-05, "loss": 0.4589, "num_input_tokens_seen": 13405776, "step": 23250 }, { "epoch": 3.4636580280011917, "grad_norm": 1.8619887828826904, "learning_rate": 4.131232173350146e-05, "loss": 0.7219, "num_input_tokens_seen": 13408592, "step": 23255 }, { "epoch": 3.464402740542151, "grad_norm": 1.1228371858596802, "learning_rate": 4.130739639261926e-05, "loss": 0.7952, "num_input_tokens_seen": 13411376, "step": 23260 }, { "epoch": 3.4651474530831097, "grad_norm": 0.7804237008094788, "learning_rate": 4.130246994974952e-05, "loss": 0.5894, "num_input_tokens_seen": 13414160, "step": 23265 }, { "epoch": 3.4658921656240693, "grad_norm": 1.3312392234802246, "learning_rate": 4.129754240522513e-05, "loss": 0.589, "num_input_tokens_seen": 13417104, "step": 23270 }, { "epoch": 3.466636878165028, "grad_norm": 1.5470647811889648, "learning_rate": 4.12926137593791e-05, "loss": 0.6076, "num_input_tokens_seen": 13420144, "step": 23275 }, { "epoch": 3.4673815907059877, "grad_norm": 1.3240326642990112, "learning_rate": 4.128768401254446e-05, "loss": 0.6558, "num_input_tokens_seen": 13423216, "step": 23280 }, { "epoch": 3.4681263032469465, "grad_norm": 2.447753667831421, "learning_rate": 4.128275316505435e-05, "loss": 0.6101, "num_input_tokens_seen": 13425904, "step": 23285 }, { "epoch": 3.4688710157879057, "grad_norm": 1.875433325767517, "learning_rate": 4.1277821217242e-05, "loss": 0.7758, "num_input_tokens_seen": 13428912, "step": 23290 }, { "epoch": 3.469615728328865, "grad_norm": 1.2875767946243286, "learning_rate": 4.127288816944066e-05, "loss": 0.6989, "num_input_tokens_seen": 13431920, "step": 23295 }, { "epoch": 3.470360440869824, "grad_norm": 1.0383808612823486, "learning_rate": 4.12679540219837e-05, "loss": 0.6099, "num_input_tokens_seen": 13434384, "step": 23300 }, { "epoch": 3.4711051534107833, "grad_norm": 1.2651737928390503, "learning_rate": 4.126301877520456e-05, "loss": 0.5735, "num_input_tokens_seen": 13437168, "step": 23305 }, { "epoch": 3.4718498659517425, "grad_norm": 1.4416472911834717, "learning_rate": 4.125808242943672e-05, "loss": 0.5439, "num_input_tokens_seen": 13439952, "step": 23310 }, { "epoch": 3.4725945784927017, "grad_norm": 0.6879176497459412, "learning_rate": 4.125314498501377e-05, "loss": 0.7287, "num_input_tokens_seen": 13442544, "step": 23315 }, { "epoch": 3.473339291033661, "grad_norm": 1.0646640062332153, "learning_rate": 4.124820644226936e-05, "loss": 0.801, "num_input_tokens_seen": 13445296, "step": 23320 }, { "epoch": 3.47408400357462, "grad_norm": 1.2588235139846802, "learning_rate": 4.124326680153723e-05, "loss": 0.6231, "num_input_tokens_seen": 13448112, "step": 23325 }, { "epoch": 3.4748287161155793, "grad_norm": 0.7396039962768555, "learning_rate": 4.1238326063151164e-05, "loss": 0.6661, "num_input_tokens_seen": 13450896, "step": 23330 }, { "epoch": 3.4755734286565385, "grad_norm": 0.6061564087867737, "learning_rate": 4.1233384227445036e-05, "loss": 0.5264, "num_input_tokens_seen": 13453968, "step": 23335 }, { "epoch": 3.4763181411974977, "grad_norm": 1.0791256427764893, "learning_rate": 4.122844129475281e-05, "loss": 0.6773, "num_input_tokens_seen": 13457136, "step": 23340 }, { "epoch": 3.477062853738457, "grad_norm": 1.9549154043197632, "learning_rate": 4.1223497265408505e-05, "loss": 0.7095, "num_input_tokens_seen": 13459984, "step": 23345 }, { "epoch": 3.477807566279416, "grad_norm": 0.832699179649353, "learning_rate": 4.12185521397462e-05, "loss": 0.6193, "num_input_tokens_seen": 13462960, "step": 23350 }, { "epoch": 3.4785522788203753, "grad_norm": 1.245469093322754, "learning_rate": 4.12136059181001e-05, "loss": 0.6426, "num_input_tokens_seen": 13465584, "step": 23355 }, { "epoch": 3.4792969913613345, "grad_norm": 0.9150049686431885, "learning_rate": 4.1208658600804416e-05, "loss": 0.5674, "num_input_tokens_seen": 13468336, "step": 23360 }, { "epoch": 3.4800417039022937, "grad_norm": 1.1528342962265015, "learning_rate": 4.120371018819349e-05, "loss": 0.6288, "num_input_tokens_seen": 13471120, "step": 23365 }, { "epoch": 3.480786416443253, "grad_norm": 0.9628103971481323, "learning_rate": 4.1198760680601713e-05, "loss": 0.6457, "num_input_tokens_seen": 13474064, "step": 23370 }, { "epoch": 3.481531128984212, "grad_norm": 1.2761712074279785, "learning_rate": 4.1193810078363544e-05, "loss": 0.7742, "num_input_tokens_seen": 13477168, "step": 23375 }, { "epoch": 3.4822758415251713, "grad_norm": 0.855915904045105, "learning_rate": 4.1188858381813524e-05, "loss": 0.5293, "num_input_tokens_seen": 13480080, "step": 23380 }, { "epoch": 3.4830205540661305, "grad_norm": 1.5172275304794312, "learning_rate": 4.118390559128629e-05, "loss": 0.7198, "num_input_tokens_seen": 13483120, "step": 23385 }, { "epoch": 3.4837652666070897, "grad_norm": 0.8768225908279419, "learning_rate": 4.11789517071165e-05, "loss": 0.4929, "num_input_tokens_seen": 13486064, "step": 23390 }, { "epoch": 3.484509979148049, "grad_norm": 1.2629228830337524, "learning_rate": 4.117399672963893e-05, "loss": 0.9195, "num_input_tokens_seen": 13489136, "step": 23395 }, { "epoch": 3.485254691689008, "grad_norm": 3.116716146469116, "learning_rate": 4.116904065918843e-05, "loss": 0.6108, "num_input_tokens_seen": 13492144, "step": 23400 }, { "epoch": 3.4859994042299673, "grad_norm": 1.0129938125610352, "learning_rate": 4.11640834960999e-05, "loss": 0.6275, "num_input_tokens_seen": 13495120, "step": 23405 }, { "epoch": 3.4867441167709265, "grad_norm": 1.3401515483856201, "learning_rate": 4.115912524070832e-05, "loss": 0.5564, "num_input_tokens_seen": 13498192, "step": 23410 }, { "epoch": 3.4874888293118858, "grad_norm": 0.769441545009613, "learning_rate": 4.1154165893348754e-05, "loss": 0.7891, "num_input_tokens_seen": 13500784, "step": 23415 }, { "epoch": 3.488233541852845, "grad_norm": 1.0121873617172241, "learning_rate": 4.114920545435634e-05, "loss": 0.6063, "num_input_tokens_seen": 13503536, "step": 23420 }, { "epoch": 3.488978254393804, "grad_norm": 1.1207650899887085, "learning_rate": 4.114424392406628e-05, "loss": 0.6575, "num_input_tokens_seen": 13506384, "step": 23425 }, { "epoch": 3.4897229669347634, "grad_norm": 1.0223945379257202, "learning_rate": 4.113928130281385e-05, "loss": 0.6627, "num_input_tokens_seen": 13509168, "step": 23430 }, { "epoch": 3.4904676794757226, "grad_norm": 0.807987630367279, "learning_rate": 4.113431759093441e-05, "loss": 0.6514, "num_input_tokens_seen": 13512080, "step": 23435 }, { "epoch": 3.4912123920166813, "grad_norm": 0.9481962323188782, "learning_rate": 4.112935278876338e-05, "loss": 0.614, "num_input_tokens_seen": 13515152, "step": 23440 }, { "epoch": 3.491957104557641, "grad_norm": 0.8549472689628601, "learning_rate": 4.112438689663627e-05, "loss": 0.8013, "num_input_tokens_seen": 13518192, "step": 23445 }, { "epoch": 3.4927018170985997, "grad_norm": 1.134295105934143, "learning_rate": 4.1119419914888645e-05, "loss": 0.6227, "num_input_tokens_seen": 13520880, "step": 23450 }, { "epoch": 3.4934465296395594, "grad_norm": 1.3035155534744263, "learning_rate": 4.111445184385616e-05, "loss": 0.6554, "num_input_tokens_seen": 13523728, "step": 23455 }, { "epoch": 3.494191242180518, "grad_norm": 1.5068200826644897, "learning_rate": 4.110948268387455e-05, "loss": 0.6191, "num_input_tokens_seen": 13526736, "step": 23460 }, { "epoch": 3.4949359547214773, "grad_norm": 1.638087272644043, "learning_rate": 4.110451243527957e-05, "loss": 0.6585, "num_input_tokens_seen": 13529872, "step": 23465 }, { "epoch": 3.4956806672624365, "grad_norm": 1.503800868988037, "learning_rate": 4.109954109840714e-05, "loss": 0.6533, "num_input_tokens_seen": 13532752, "step": 23470 }, { "epoch": 3.4964253798033957, "grad_norm": 1.1048318147659302, "learning_rate": 4.109456867359317e-05, "loss": 0.7109, "num_input_tokens_seen": 13536016, "step": 23475 }, { "epoch": 3.497170092344355, "grad_norm": 1.1398652791976929, "learning_rate": 4.108959516117368e-05, "loss": 0.6907, "num_input_tokens_seen": 13538896, "step": 23480 }, { "epoch": 3.497914804885314, "grad_norm": 1.7247252464294434, "learning_rate": 4.108462056148477e-05, "loss": 0.6377, "num_input_tokens_seen": 13541840, "step": 23485 }, { "epoch": 3.4986595174262733, "grad_norm": 1.7025110721588135, "learning_rate": 4.10796448748626e-05, "loss": 0.5569, "num_input_tokens_seen": 13544784, "step": 23490 }, { "epoch": 3.4994042299672325, "grad_norm": 1.2519704103469849, "learning_rate": 4.10746681016434e-05, "loss": 0.707, "num_input_tokens_seen": 13547760, "step": 23495 }, { "epoch": 3.5, "eval_loss": 0.6524342894554138, "eval_runtime": 74.255, "eval_samples_per_second": 40.186, "eval_steps_per_second": 10.046, "num_input_tokens_seen": 13550032, "step": 23499 }, { "epoch": 3.5001489425081918, "grad_norm": 0.8181206583976746, "learning_rate": 4.1069690242163484e-05, "loss": 0.6032, "num_input_tokens_seen": 13550544, "step": 23500 }, { "epoch": 3.500893655049151, "grad_norm": 1.0237394571304321, "learning_rate": 4.106471129675924e-05, "loss": 0.6502, "num_input_tokens_seen": 13553264, "step": 23505 }, { "epoch": 3.50163836759011, "grad_norm": 1.1546034812927246, "learning_rate": 4.105973126576712e-05, "loss": 0.584, "num_input_tokens_seen": 13555984, "step": 23510 }, { "epoch": 3.5023830801310694, "grad_norm": 1.7140957117080688, "learning_rate": 4.105475014952365e-05, "loss": 0.8713, "num_input_tokens_seen": 13559024, "step": 23515 }, { "epoch": 3.5031277926720286, "grad_norm": 1.9122161865234375, "learning_rate": 4.104976794836545e-05, "loss": 0.6551, "num_input_tokens_seen": 13561968, "step": 23520 }, { "epoch": 3.5038725052129878, "grad_norm": 2.018604278564453, "learning_rate": 4.104478466262917e-05, "loss": 0.794, "num_input_tokens_seen": 13564944, "step": 23525 }, { "epoch": 3.504617217753947, "grad_norm": 1.2512454986572266, "learning_rate": 4.1039800292651584e-05, "loss": 0.5519, "num_input_tokens_seen": 13567632, "step": 23530 }, { "epoch": 3.505361930294906, "grad_norm": 2.2903285026550293, "learning_rate": 4.103481483876951e-05, "loss": 0.802, "num_input_tokens_seen": 13571088, "step": 23535 }, { "epoch": 3.5061066428358654, "grad_norm": 1.3657079935073853, "learning_rate": 4.1029828301319836e-05, "loss": 0.6131, "num_input_tokens_seen": 13573840, "step": 23540 }, { "epoch": 3.5068513553768246, "grad_norm": 2.836965560913086, "learning_rate": 4.102484068063954e-05, "loss": 0.6614, "num_input_tokens_seen": 13576624, "step": 23545 }, { "epoch": 3.5075960679177838, "grad_norm": 1.0377436876296997, "learning_rate": 4.1019851977065674e-05, "loss": 0.4512, "num_input_tokens_seen": 13579472, "step": 23550 }, { "epoch": 3.508340780458743, "grad_norm": 0.9092816710472107, "learning_rate": 4.101486219093533e-05, "loss": 0.5374, "num_input_tokens_seen": 13582288, "step": 23555 }, { "epoch": 3.509085492999702, "grad_norm": 1.1385289430618286, "learning_rate": 4.100987132258571e-05, "loss": 0.5099, "num_input_tokens_seen": 13585200, "step": 23560 }, { "epoch": 3.5098302055406614, "grad_norm": 0.8885930180549622, "learning_rate": 4.1004879372354085e-05, "loss": 0.5596, "num_input_tokens_seen": 13587984, "step": 23565 }, { "epoch": 3.5105749180816206, "grad_norm": 1.1972508430480957, "learning_rate": 4.099988634057778e-05, "loss": 0.5732, "num_input_tokens_seen": 13591024, "step": 23570 }, { "epoch": 3.51131963062258, "grad_norm": 1.999518871307373, "learning_rate": 4.09948922275942e-05, "loss": 0.7244, "num_input_tokens_seen": 13593904, "step": 23575 }, { "epoch": 3.512064343163539, "grad_norm": 1.5769448280334473, "learning_rate": 4.098989703374084e-05, "loss": 0.6478, "num_input_tokens_seen": 13597072, "step": 23580 }, { "epoch": 3.512809055704498, "grad_norm": 1.1865228414535522, "learning_rate": 4.0984900759355254e-05, "loss": 0.7017, "num_input_tokens_seen": 13600176, "step": 23585 }, { "epoch": 3.5135537682454574, "grad_norm": 1.6577292680740356, "learning_rate": 4.097990340477507e-05, "loss": 0.7001, "num_input_tokens_seen": 13603088, "step": 23590 }, { "epoch": 3.5142984807864166, "grad_norm": 1.3693867921829224, "learning_rate": 4.097490497033797e-05, "loss": 0.5307, "num_input_tokens_seen": 13605968, "step": 23595 }, { "epoch": 3.515043193327376, "grad_norm": 2.2525501251220703, "learning_rate": 4.096990545638174e-05, "loss": 0.709, "num_input_tokens_seen": 13608816, "step": 23600 }, { "epoch": 3.5157879058683346, "grad_norm": 1.0491070747375488, "learning_rate": 4.096490486324424e-05, "loss": 0.6647, "num_input_tokens_seen": 13611856, "step": 23605 }, { "epoch": 3.516532618409294, "grad_norm": 1.3571044206619263, "learning_rate": 4.095990319126337e-05, "loss": 0.648, "num_input_tokens_seen": 13614832, "step": 23610 }, { "epoch": 3.517277330950253, "grad_norm": 1.0036698579788208, "learning_rate": 4.0954900440777125e-05, "loss": 0.5258, "num_input_tokens_seen": 13617584, "step": 23615 }, { "epoch": 3.5180220434912126, "grad_norm": 1.6978206634521484, "learning_rate": 4.094989661212359e-05, "loss": 0.8297, "num_input_tokens_seen": 13620560, "step": 23620 }, { "epoch": 3.5187667560321714, "grad_norm": 1.3578203916549683, "learning_rate": 4.094489170564088e-05, "loss": 0.4968, "num_input_tokens_seen": 13623248, "step": 23625 }, { "epoch": 3.519511468573131, "grad_norm": 1.542595624923706, "learning_rate": 4.0939885721667216e-05, "loss": 0.5606, "num_input_tokens_seen": 13626320, "step": 23630 }, { "epoch": 3.5202561811140898, "grad_norm": 1.039211630821228, "learning_rate": 4.093487866054088e-05, "loss": 0.4394, "num_input_tokens_seen": 13629296, "step": 23635 }, { "epoch": 3.5210008936550494, "grad_norm": 1.375438928604126, "learning_rate": 4.0929870522600233e-05, "loss": 0.9005, "num_input_tokens_seen": 13632112, "step": 23640 }, { "epoch": 3.521745606196008, "grad_norm": 2.3891704082489014, "learning_rate": 4.092486130818371e-05, "loss": 0.7468, "num_input_tokens_seen": 13635184, "step": 23645 }, { "epoch": 3.5224903187369674, "grad_norm": 1.2230241298675537, "learning_rate": 4.09198510176298e-05, "loss": 0.5825, "num_input_tokens_seen": 13638256, "step": 23650 }, { "epoch": 3.5232350312779266, "grad_norm": 1.1395469903945923, "learning_rate": 4.091483965127708e-05, "loss": 0.5839, "num_input_tokens_seen": 13641040, "step": 23655 }, { "epoch": 3.523979743818886, "grad_norm": 3.8003456592559814, "learning_rate": 4.09098272094642e-05, "loss": 0.6428, "num_input_tokens_seen": 13643664, "step": 23660 }, { "epoch": 3.524724456359845, "grad_norm": 1.3661125898361206, "learning_rate": 4.0904813692529886e-05, "loss": 0.697, "num_input_tokens_seen": 13646544, "step": 23665 }, { "epoch": 3.525469168900804, "grad_norm": 1.736146092414856, "learning_rate": 4.089979910081293e-05, "loss": 0.6837, "num_input_tokens_seen": 13649616, "step": 23670 }, { "epoch": 3.5262138814417634, "grad_norm": 0.8789785504341125, "learning_rate": 4.089478343465219e-05, "loss": 0.6741, "num_input_tokens_seen": 13652432, "step": 23675 }, { "epoch": 3.5269585939827226, "grad_norm": 1.1837208271026611, "learning_rate": 4.088976669438661e-05, "loss": 0.6505, "num_input_tokens_seen": 13654928, "step": 23680 }, { "epoch": 3.527703306523682, "grad_norm": 1.4942411184310913, "learning_rate": 4.088474888035519e-05, "loss": 0.6057, "num_input_tokens_seen": 13658096, "step": 23685 }, { "epoch": 3.528448019064641, "grad_norm": 1.7524504661560059, "learning_rate": 4.087972999289704e-05, "loss": 0.6164, "num_input_tokens_seen": 13661008, "step": 23690 }, { "epoch": 3.5291927316056, "grad_norm": 1.1992157697677612, "learning_rate": 4.0874710032351296e-05, "loss": 0.7062, "num_input_tokens_seen": 13663792, "step": 23695 }, { "epoch": 3.5299374441465594, "grad_norm": 1.6803516149520874, "learning_rate": 4.086968899905719e-05, "loss": 0.5458, "num_input_tokens_seen": 13666576, "step": 23700 }, { "epoch": 3.5306821566875186, "grad_norm": 0.614439845085144, "learning_rate": 4.086466689335402e-05, "loss": 0.7283, "num_input_tokens_seen": 13669360, "step": 23705 }, { "epoch": 3.531426869228478, "grad_norm": 6.060091018676758, "learning_rate": 4.085964371558116e-05, "loss": 0.5943, "num_input_tokens_seen": 13672240, "step": 23710 }, { "epoch": 3.532171581769437, "grad_norm": 1.8947129249572754, "learning_rate": 4.085461946607806e-05, "loss": 0.6151, "num_input_tokens_seen": 13675472, "step": 23715 }, { "epoch": 3.532916294310396, "grad_norm": 2.1064791679382324, "learning_rate": 4.084959414518423e-05, "loss": 0.6711, "num_input_tokens_seen": 13678384, "step": 23720 }, { "epoch": 3.5336610068513554, "grad_norm": 2.6836986541748047, "learning_rate": 4.0844567753239276e-05, "loss": 0.8216, "num_input_tokens_seen": 13680848, "step": 23725 }, { "epoch": 3.5344057193923146, "grad_norm": 1.2491551637649536, "learning_rate": 4.0839540290582856e-05, "loss": 0.5728, "num_input_tokens_seen": 13684208, "step": 23730 }, { "epoch": 3.535150431933274, "grad_norm": 2.0910911560058594, "learning_rate": 4.08345117575547e-05, "loss": 0.7141, "num_input_tokens_seen": 13687120, "step": 23735 }, { "epoch": 3.535895144474233, "grad_norm": 1.8046025037765503, "learning_rate": 4.082948215449461e-05, "loss": 0.7898, "num_input_tokens_seen": 13690096, "step": 23740 }, { "epoch": 3.5366398570151922, "grad_norm": 5.074362277984619, "learning_rate": 4.0824451481742475e-05, "loss": 0.6066, "num_input_tokens_seen": 13693360, "step": 23745 }, { "epoch": 3.5373845695561514, "grad_norm": 1.4619708061218262, "learning_rate": 4.081941973963825e-05, "loss": 0.9377, "num_input_tokens_seen": 13696464, "step": 23750 }, { "epoch": 3.5381292820971106, "grad_norm": 1.5932438373565674, "learning_rate": 4.0814386928521964e-05, "loss": 0.5657, "num_input_tokens_seen": 13699312, "step": 23755 }, { "epoch": 3.53887399463807, "grad_norm": 1.06318199634552, "learning_rate": 4.0809353048733696e-05, "loss": 0.7693, "num_input_tokens_seen": 13702096, "step": 23760 }, { "epoch": 3.539618707179029, "grad_norm": 1.7011027336120605, "learning_rate": 4.0804318100613624e-05, "loss": 0.665, "num_input_tokens_seen": 13704976, "step": 23765 }, { "epoch": 3.5403634197199882, "grad_norm": 0.9458557963371277, "learning_rate": 4.0799282084502e-05, "loss": 0.5095, "num_input_tokens_seen": 13707984, "step": 23770 }, { "epoch": 3.5411081322609474, "grad_norm": 1.2677127122879028, "learning_rate": 4.079424500073912e-05, "loss": 0.7324, "num_input_tokens_seen": 13710864, "step": 23775 }, { "epoch": 3.541852844801906, "grad_norm": 0.8599324822425842, "learning_rate": 4.078920684966538e-05, "loss": 0.6408, "num_input_tokens_seen": 13713808, "step": 23780 }, { "epoch": 3.542597557342866, "grad_norm": 1.2423443794250488, "learning_rate": 4.078416763162123e-05, "loss": 0.5614, "num_input_tokens_seen": 13716496, "step": 23785 }, { "epoch": 3.5433422698838246, "grad_norm": 1.2768385410308838, "learning_rate": 4.0779127346947214e-05, "loss": 0.6, "num_input_tokens_seen": 13719088, "step": 23790 }, { "epoch": 3.5440869824247843, "grad_norm": 1.091983437538147, "learning_rate": 4.077408599598392e-05, "loss": 0.7158, "num_input_tokens_seen": 13722128, "step": 23795 }, { "epoch": 3.544831694965743, "grad_norm": 1.7773637771606445, "learning_rate": 4.076904357907203e-05, "loss": 0.6356, "num_input_tokens_seen": 13725008, "step": 23800 }, { "epoch": 3.5455764075067027, "grad_norm": 1.3508254289627075, "learning_rate": 4.076400009655228e-05, "loss": 0.6402, "num_input_tokens_seen": 13727856, "step": 23805 }, { "epoch": 3.5463211200476614, "grad_norm": 1.0941529273986816, "learning_rate": 4.0758955548765505e-05, "loss": 0.5472, "num_input_tokens_seen": 13730736, "step": 23810 }, { "epoch": 3.5470658325886206, "grad_norm": 2.1581814289093018, "learning_rate": 4.075390993605258e-05, "loss": 0.7836, "num_input_tokens_seen": 13733520, "step": 23815 }, { "epoch": 3.54781054512958, "grad_norm": 2.282466173171997, "learning_rate": 4.074886325875447e-05, "loss": 0.7285, "num_input_tokens_seen": 13736624, "step": 23820 }, { "epoch": 3.548555257670539, "grad_norm": 1.8284518718719482, "learning_rate": 4.074381551721221e-05, "loss": 0.7416, "num_input_tokens_seen": 13739440, "step": 23825 }, { "epoch": 3.5492999702114982, "grad_norm": 1.073020577430725, "learning_rate": 4.073876671176692e-05, "loss": 0.7092, "num_input_tokens_seen": 13742352, "step": 23830 }, { "epoch": 3.5500446827524574, "grad_norm": 1.3979134559631348, "learning_rate": 4.073371684275976e-05, "loss": 0.5018, "num_input_tokens_seen": 13745616, "step": 23835 }, { "epoch": 3.5507893952934166, "grad_norm": 1.4604511260986328, "learning_rate": 4.072866591053197e-05, "loss": 0.7062, "num_input_tokens_seen": 13748560, "step": 23840 }, { "epoch": 3.551534107834376, "grad_norm": 0.844865083694458, "learning_rate": 4.0723613915424894e-05, "loss": 0.6774, "num_input_tokens_seen": 13751696, "step": 23845 }, { "epoch": 3.552278820375335, "grad_norm": 1.1852216720581055, "learning_rate": 4.071856085777993e-05, "loss": 0.5184, "num_input_tokens_seen": 13754704, "step": 23850 }, { "epoch": 3.5530235329162942, "grad_norm": 1.3320236206054688, "learning_rate": 4.071350673793852e-05, "loss": 0.611, "num_input_tokens_seen": 13757520, "step": 23855 }, { "epoch": 3.5537682454572534, "grad_norm": 1.2601959705352783, "learning_rate": 4.070845155624221e-05, "loss": 0.7806, "num_input_tokens_seen": 13760528, "step": 23860 }, { "epoch": 3.5545129579982127, "grad_norm": 0.7648956775665283, "learning_rate": 4.070339531303261e-05, "loss": 0.4979, "num_input_tokens_seen": 13763248, "step": 23865 }, { "epoch": 3.555257670539172, "grad_norm": 1.4065768718719482, "learning_rate": 4.0698338008651405e-05, "loss": 0.6537, "num_input_tokens_seen": 13766000, "step": 23870 }, { "epoch": 3.556002383080131, "grad_norm": 1.1185280084609985, "learning_rate": 4.0693279643440326e-05, "loss": 0.6188, "num_input_tokens_seen": 13768944, "step": 23875 }, { "epoch": 3.5567470956210903, "grad_norm": 0.9621455669403076, "learning_rate": 4.068822021774123e-05, "loss": 0.5502, "num_input_tokens_seen": 13771824, "step": 23880 }, { "epoch": 3.5574918081620495, "grad_norm": 0.950182318687439, "learning_rate": 4.0683159731895994e-05, "loss": 0.4835, "num_input_tokens_seen": 13775056, "step": 23885 }, { "epoch": 3.5582365207030087, "grad_norm": 0.9101536870002747, "learning_rate": 4.067809818624658e-05, "loss": 0.7372, "num_input_tokens_seen": 13777968, "step": 23890 }, { "epoch": 3.558981233243968, "grad_norm": 0.9687108993530273, "learning_rate": 4.067303558113503e-05, "loss": 0.6317, "num_input_tokens_seen": 13780688, "step": 23895 }, { "epoch": 3.559725945784927, "grad_norm": 0.9993994235992432, "learning_rate": 4.066797191690347e-05, "loss": 0.6701, "num_input_tokens_seen": 13783312, "step": 23900 }, { "epoch": 3.5604706583258863, "grad_norm": 1.0236029624938965, "learning_rate": 4.066290719389406e-05, "loss": 0.7972, "num_input_tokens_seen": 13786096, "step": 23905 }, { "epoch": 3.5612153708668455, "grad_norm": 1.73576819896698, "learning_rate": 4.065784141244907e-05, "loss": 0.6941, "num_input_tokens_seen": 13788752, "step": 23910 }, { "epoch": 3.5619600834078047, "grad_norm": 1.8690439462661743, "learning_rate": 4.065277457291081e-05, "loss": 0.7484, "num_input_tokens_seen": 13791664, "step": 23915 }, { "epoch": 3.562704795948764, "grad_norm": 1.1728636026382446, "learning_rate": 4.0647706675621685e-05, "loss": 0.6782, "num_input_tokens_seen": 13794576, "step": 23920 }, { "epoch": 3.563449508489723, "grad_norm": 2.8922080993652344, "learning_rate": 4.064263772092416e-05, "loss": 0.7205, "num_input_tokens_seen": 13797360, "step": 23925 }, { "epoch": 3.5641942210306823, "grad_norm": 1.9827829599380493, "learning_rate": 4.0637567709160786e-05, "loss": 0.6958, "num_input_tokens_seen": 13799888, "step": 23930 }, { "epoch": 3.5649389335716415, "grad_norm": 0.9748742580413818, "learning_rate": 4.0632496640674156e-05, "loss": 0.5871, "num_input_tokens_seen": 13802832, "step": 23935 }, { "epoch": 3.5656836461126007, "grad_norm": 3.242952585220337, "learning_rate": 4.0627424515806957e-05, "loss": 0.721, "num_input_tokens_seen": 13806064, "step": 23940 }, { "epoch": 3.5664283586535594, "grad_norm": 1.032633662223816, "learning_rate": 4.062235133490195e-05, "loss": 0.5664, "num_input_tokens_seen": 13809072, "step": 23945 }, { "epoch": 3.567173071194519, "grad_norm": 1.4347838163375854, "learning_rate": 4.061727709830196e-05, "loss": 0.6321, "num_input_tokens_seen": 13812208, "step": 23950 }, { "epoch": 3.567917783735478, "grad_norm": 2.7650272846221924, "learning_rate": 4.061220180634987e-05, "loss": 0.6805, "num_input_tokens_seen": 13815024, "step": 23955 }, { "epoch": 3.5686624962764375, "grad_norm": 1.2607967853546143, "learning_rate": 4.060712545938866e-05, "loss": 0.602, "num_input_tokens_seen": 13817808, "step": 23960 }, { "epoch": 3.5694072088173963, "grad_norm": 1.0361783504486084, "learning_rate": 4.0602048057761365e-05, "loss": 0.6209, "num_input_tokens_seen": 13820624, "step": 23965 }, { "epoch": 3.570151921358356, "grad_norm": 1.4592406749725342, "learning_rate": 4.0596969601811095e-05, "loss": 0.6113, "num_input_tokens_seen": 13823376, "step": 23970 }, { "epoch": 3.5708966338993147, "grad_norm": 1.792182207107544, "learning_rate": 4.059189009188104e-05, "loss": 0.7212, "num_input_tokens_seen": 13826352, "step": 23975 }, { "epoch": 3.5716413464402743, "grad_norm": 1.2061166763305664, "learning_rate": 4.058680952831444e-05, "loss": 0.5651, "num_input_tokens_seen": 13829040, "step": 23980 }, { "epoch": 3.572386058981233, "grad_norm": 1.7544195652008057, "learning_rate": 4.058172791145461e-05, "loss": 0.7206, "num_input_tokens_seen": 13832336, "step": 23985 }, { "epoch": 3.5731307715221923, "grad_norm": 2.563676595687866, "learning_rate": 4.0576645241644985e-05, "loss": 0.6945, "num_input_tokens_seen": 13835024, "step": 23990 }, { "epoch": 3.5738754840631515, "grad_norm": 1.5822561979293823, "learning_rate": 4.0571561519228984e-05, "loss": 0.6265, "num_input_tokens_seen": 13837680, "step": 23995 }, { "epoch": 3.5746201966041107, "grad_norm": 1.400841474533081, "learning_rate": 4.056647674455017e-05, "loss": 0.5825, "num_input_tokens_seen": 13840560, "step": 24000 }, { "epoch": 3.57536490914507, "grad_norm": 1.1950114965438843, "learning_rate": 4.056139091795215e-05, "loss": 0.7003, "num_input_tokens_seen": 13843376, "step": 24005 }, { "epoch": 3.576109621686029, "grad_norm": 1.2703180313110352, "learning_rate": 4.05563040397786e-05, "loss": 0.6508, "num_input_tokens_seen": 13846224, "step": 24010 }, { "epoch": 3.5768543342269883, "grad_norm": 1.0481901168823242, "learning_rate": 4.055121611037326e-05, "loss": 0.6346, "num_input_tokens_seen": 13849360, "step": 24015 }, { "epoch": 3.5775990467679475, "grad_norm": 1.9080294370651245, "learning_rate": 4.054612713007997e-05, "loss": 0.6582, "num_input_tokens_seen": 13852368, "step": 24020 }, { "epoch": 3.5783437593089067, "grad_norm": 1.5898088216781616, "learning_rate": 4.054103709924262e-05, "loss": 0.7607, "num_input_tokens_seen": 13855536, "step": 24025 }, { "epoch": 3.579088471849866, "grad_norm": 1.2155905961990356, "learning_rate": 4.0535946018205156e-05, "loss": 0.6502, "num_input_tokens_seen": 13858288, "step": 24030 }, { "epoch": 3.579833184390825, "grad_norm": 1.0079469680786133, "learning_rate": 4.0530853887311634e-05, "loss": 0.7182, "num_input_tokens_seen": 13860880, "step": 24035 }, { "epoch": 3.5805778969317843, "grad_norm": 1.398750901222229, "learning_rate": 4.052576070690615e-05, "loss": 0.6612, "num_input_tokens_seen": 13863792, "step": 24040 }, { "epoch": 3.5813226094727435, "grad_norm": 1.3793805837631226, "learning_rate": 4.052066647733287e-05, "loss": 0.5625, "num_input_tokens_seen": 13866704, "step": 24045 }, { "epoch": 3.5820673220137027, "grad_norm": 1.268575668334961, "learning_rate": 4.051557119893606e-05, "loss": 0.6335, "num_input_tokens_seen": 13869616, "step": 24050 }, { "epoch": 3.582812034554662, "grad_norm": 0.868667483329773, "learning_rate": 4.051047487206003e-05, "loss": 0.6672, "num_input_tokens_seen": 13872496, "step": 24055 }, { "epoch": 3.583556747095621, "grad_norm": 2.7401435375213623, "learning_rate": 4.050537749704917e-05, "loss": 0.6604, "num_input_tokens_seen": 13875376, "step": 24060 }, { "epoch": 3.5843014596365803, "grad_norm": 1.3007986545562744, "learning_rate": 4.050027907424794e-05, "loss": 0.6338, "num_input_tokens_seen": 13878160, "step": 24065 }, { "epoch": 3.5850461721775395, "grad_norm": 0.7460657954216003, "learning_rate": 4.049517960400086e-05, "loss": 0.7037, "num_input_tokens_seen": 13880912, "step": 24070 }, { "epoch": 3.5857908847184987, "grad_norm": 1.6500602960586548, "learning_rate": 4.049007908665255e-05, "loss": 0.6447, "num_input_tokens_seen": 13883856, "step": 24075 }, { "epoch": 3.586535597259458, "grad_norm": 1.134215235710144, "learning_rate": 4.0484977522547676e-05, "loss": 0.6267, "num_input_tokens_seen": 13886768, "step": 24080 }, { "epoch": 3.587280309800417, "grad_norm": 0.9605624675750732, "learning_rate": 4.047987491203097e-05, "loss": 0.6353, "num_input_tokens_seen": 13889488, "step": 24085 }, { "epoch": 3.5880250223413763, "grad_norm": 2.163318634033203, "learning_rate": 4.0474771255447256e-05, "loss": 0.6608, "num_input_tokens_seen": 13892336, "step": 24090 }, { "epoch": 3.5887697348823355, "grad_norm": 1.5014879703521729, "learning_rate": 4.046966655314142e-05, "loss": 0.7465, "num_input_tokens_seen": 13895152, "step": 24095 }, { "epoch": 3.5895144474232947, "grad_norm": 2.0894711017608643, "learning_rate": 4.0464560805458405e-05, "loss": 0.4341, "num_input_tokens_seen": 13898000, "step": 24100 }, { "epoch": 3.590259159964254, "grad_norm": 0.8458967804908752, "learning_rate": 4.045945401274326e-05, "loss": 0.854, "num_input_tokens_seen": 13900880, "step": 24105 }, { "epoch": 3.591003872505213, "grad_norm": 2.1836910247802734, "learning_rate": 4.0454346175341054e-05, "loss": 0.8519, "num_input_tokens_seen": 13903952, "step": 24110 }, { "epoch": 3.5917485850461723, "grad_norm": 1.1621750593185425, "learning_rate": 4.0449237293596975e-05, "loss": 0.6142, "num_input_tokens_seen": 13906768, "step": 24115 }, { "epoch": 3.592493297587131, "grad_norm": 1.5054571628570557, "learning_rate": 4.0444127367856246e-05, "loss": 0.6586, "num_input_tokens_seen": 13909520, "step": 24120 }, { "epoch": 3.5932380101280907, "grad_norm": 1.1787906885147095, "learning_rate": 4.043901639846418e-05, "loss": 0.5396, "num_input_tokens_seen": 13912464, "step": 24125 }, { "epoch": 3.5939827226690495, "grad_norm": 0.7535604238510132, "learning_rate": 4.043390438576616e-05, "loss": 0.6524, "num_input_tokens_seen": 13915536, "step": 24130 }, { "epoch": 3.594727435210009, "grad_norm": 1.0204354524612427, "learning_rate": 4.042879133010763e-05, "loss": 0.5544, "num_input_tokens_seen": 13918384, "step": 24135 }, { "epoch": 3.595472147750968, "grad_norm": 0.8910874128341675, "learning_rate": 4.042367723183411e-05, "loss": 0.7706, "num_input_tokens_seen": 13921648, "step": 24140 }, { "epoch": 3.5962168602919276, "grad_norm": 1.6040948629379272, "learning_rate": 4.041856209129119e-05, "loss": 0.715, "num_input_tokens_seen": 13924176, "step": 24145 }, { "epoch": 3.5969615728328863, "grad_norm": 1.0482646226882935, "learning_rate": 4.0413445908824534e-05, "loss": 0.607, "num_input_tokens_seen": 13926928, "step": 24150 }, { "epoch": 3.597706285373846, "grad_norm": 1.6159961223602295, "learning_rate": 4.040832868477987e-05, "loss": 0.6578, "num_input_tokens_seen": 13929616, "step": 24155 }, { "epoch": 3.5984509979148047, "grad_norm": 1.2500112056732178, "learning_rate": 4.040321041950299e-05, "loss": 0.5948, "num_input_tokens_seen": 13932368, "step": 24160 }, { "epoch": 3.599195710455764, "grad_norm": 1.551806926727295, "learning_rate": 4.039809111333979e-05, "loss": 0.7125, "num_input_tokens_seen": 13935120, "step": 24165 }, { "epoch": 3.599940422996723, "grad_norm": 1.0450769662857056, "learning_rate": 4.039297076663619e-05, "loss": 0.7409, "num_input_tokens_seen": 13937904, "step": 24170 }, { "epoch": 3.6006851355376823, "grad_norm": 1.5484524965286255, "learning_rate": 4.03878493797382e-05, "loss": 0.7317, "num_input_tokens_seen": 13940688, "step": 24175 }, { "epoch": 3.6014298480786415, "grad_norm": 0.9829158186912537, "learning_rate": 4.0382726952991924e-05, "loss": 0.7673, "num_input_tokens_seen": 13943504, "step": 24180 }, { "epoch": 3.6021745606196007, "grad_norm": 1.2552917003631592, "learning_rate": 4.037760348674349e-05, "loss": 0.5815, "num_input_tokens_seen": 13946352, "step": 24185 }, { "epoch": 3.60291927316056, "grad_norm": 1.3308370113372803, "learning_rate": 4.037247898133915e-05, "loss": 0.6832, "num_input_tokens_seen": 13949328, "step": 24190 }, { "epoch": 3.603663985701519, "grad_norm": 0.8698003888130188, "learning_rate": 4.036735343712516e-05, "loss": 0.6959, "num_input_tokens_seen": 13952240, "step": 24195 }, { "epoch": 3.6044086982424783, "grad_norm": 0.7195181846618652, "learning_rate": 4.036222685444792e-05, "loss": 0.7776, "num_input_tokens_seen": 13955184, "step": 24200 }, { "epoch": 3.6051534107834375, "grad_norm": 0.9429450035095215, "learning_rate": 4.035709923365384e-05, "loss": 0.5575, "num_input_tokens_seen": 13957904, "step": 24205 }, { "epoch": 3.6058981233243967, "grad_norm": 1.206973671913147, "learning_rate": 4.0351970575089435e-05, "loss": 0.6665, "num_input_tokens_seen": 13960848, "step": 24210 }, { "epoch": 3.606642835865356, "grad_norm": 1.1642457246780396, "learning_rate": 4.0346840879101277e-05, "loss": 0.6918, "num_input_tokens_seen": 13963824, "step": 24215 }, { "epoch": 3.607387548406315, "grad_norm": 1.1994627714157104, "learning_rate": 4.0341710146036e-05, "loss": 0.5798, "num_input_tokens_seen": 13966608, "step": 24220 }, { "epoch": 3.6081322609472744, "grad_norm": 1.2711964845657349, "learning_rate": 4.033657837624033e-05, "loss": 0.7709, "num_input_tokens_seen": 13969584, "step": 24225 }, { "epoch": 3.6088769734882336, "grad_norm": 1.033683180809021, "learning_rate": 4.033144557006104e-05, "loss": 0.537, "num_input_tokens_seen": 13972464, "step": 24230 }, { "epoch": 3.6096216860291928, "grad_norm": 1.2797642946243286, "learning_rate": 4.032631172784501e-05, "loss": 0.7924, "num_input_tokens_seen": 13975216, "step": 24235 }, { "epoch": 3.610366398570152, "grad_norm": 0.9554032683372498, "learning_rate": 4.0321176849939135e-05, "loss": 0.5972, "num_input_tokens_seen": 13978320, "step": 24240 }, { "epoch": 3.611111111111111, "grad_norm": 0.9423484206199646, "learning_rate": 4.031604093669042e-05, "loss": 0.6611, "num_input_tokens_seen": 13981200, "step": 24245 }, { "epoch": 3.6118558236520704, "grad_norm": 1.2763618230819702, "learning_rate": 4.031090398844593e-05, "loss": 0.667, "num_input_tokens_seen": 13984112, "step": 24250 }, { "epoch": 3.6126005361930296, "grad_norm": 1.2937473058700562, "learning_rate": 4.030576600555279e-05, "loss": 0.6576, "num_input_tokens_seen": 13986992, "step": 24255 }, { "epoch": 3.6133452487339888, "grad_norm": 2.6576759815216064, "learning_rate": 4.030062698835822e-05, "loss": 0.6787, "num_input_tokens_seen": 13990160, "step": 24260 }, { "epoch": 3.614089961274948, "grad_norm": 1.5804246664047241, "learning_rate": 4.029548693720949e-05, "loss": 0.6739, "num_input_tokens_seen": 13993200, "step": 24265 }, { "epoch": 3.614834673815907, "grad_norm": 1.4583570957183838, "learning_rate": 4.029034585245393e-05, "loss": 0.6637, "num_input_tokens_seen": 13996176, "step": 24270 }, { "epoch": 3.6155793863568664, "grad_norm": 1.0756360292434692, "learning_rate": 4.028520373443897e-05, "loss": 0.5997, "num_input_tokens_seen": 13999152, "step": 24275 }, { "epoch": 3.6163240988978256, "grad_norm": 1.6123746633529663, "learning_rate": 4.028006058351208e-05, "loss": 0.681, "num_input_tokens_seen": 14001904, "step": 24280 }, { "epoch": 3.617068811438785, "grad_norm": 1.3931719064712524, "learning_rate": 4.027491640002083e-05, "loss": 0.6267, "num_input_tokens_seen": 14004432, "step": 24285 }, { "epoch": 3.617813523979744, "grad_norm": 5.554775238037109, "learning_rate": 4.0269771184312824e-05, "loss": 0.6589, "num_input_tokens_seen": 14007440, "step": 24290 }, { "epoch": 3.6185582365207027, "grad_norm": 1.7980965375900269, "learning_rate": 4.0264624936735776e-05, "loss": 0.6741, "num_input_tokens_seen": 14010576, "step": 24295 }, { "epoch": 3.6193029490616624, "grad_norm": 2.322383165359497, "learning_rate": 4.0259477657637424e-05, "loss": 0.6698, "num_input_tokens_seen": 14013328, "step": 24300 }, { "epoch": 3.620047661602621, "grad_norm": 3.861820697784424, "learning_rate": 4.0254329347365614e-05, "loss": 0.5818, "num_input_tokens_seen": 14016272, "step": 24305 }, { "epoch": 3.620792374143581, "grad_norm": 2.698680877685547, "learning_rate": 4.024918000626825e-05, "loss": 0.6223, "num_input_tokens_seen": 14018928, "step": 24310 }, { "epoch": 3.6215370866845396, "grad_norm": 2.5196726322174072, "learning_rate": 4.024402963469329e-05, "loss": 0.8281, "num_input_tokens_seen": 14021744, "step": 24315 }, { "epoch": 3.622281799225499, "grad_norm": 1.6130706071853638, "learning_rate": 4.02388782329888e-05, "loss": 0.6651, "num_input_tokens_seen": 14024304, "step": 24320 }, { "epoch": 3.623026511766458, "grad_norm": 3.176182508468628, "learning_rate": 4.023372580150286e-05, "loss": 0.7558, "num_input_tokens_seen": 14027120, "step": 24325 }, { "epoch": 3.6237712243074176, "grad_norm": 2.9160897731781006, "learning_rate": 4.022857234058368e-05, "loss": 0.7639, "num_input_tokens_seen": 14029840, "step": 24330 }, { "epoch": 3.6245159368483764, "grad_norm": 2.065058469772339, "learning_rate": 4.022341785057949e-05, "loss": 0.7804, "num_input_tokens_seen": 14032752, "step": 24335 }, { "epoch": 3.6252606493893356, "grad_norm": 1.7780400514602661, "learning_rate": 4.021826233183862e-05, "loss": 0.6222, "num_input_tokens_seen": 14035696, "step": 24340 }, { "epoch": 3.6260053619302948, "grad_norm": 1.097465991973877, "learning_rate": 4.0213105784709445e-05, "loss": 0.7502, "num_input_tokens_seen": 14038704, "step": 24345 }, { "epoch": 3.626750074471254, "grad_norm": 1.2869195938110352, "learning_rate": 4.020794820954044e-05, "loss": 0.5205, "num_input_tokens_seen": 14041424, "step": 24350 }, { "epoch": 3.627494787012213, "grad_norm": 1.5116151571273804, "learning_rate": 4.0202789606680136e-05, "loss": 0.7087, "num_input_tokens_seen": 14044304, "step": 24355 }, { "epoch": 3.6282394995531724, "grad_norm": 1.0063942670822144, "learning_rate": 4.01976299764771e-05, "loss": 0.5351, "num_input_tokens_seen": 14047216, "step": 24360 }, { "epoch": 3.6289842120941316, "grad_norm": 1.0042766332626343, "learning_rate": 4.019246931928004e-05, "loss": 0.633, "num_input_tokens_seen": 14050032, "step": 24365 }, { "epoch": 3.629728924635091, "grad_norm": 1.141617774963379, "learning_rate": 4.018730763543765e-05, "loss": 0.6877, "num_input_tokens_seen": 14052592, "step": 24370 }, { "epoch": 3.63047363717605, "grad_norm": 1.3472013473510742, "learning_rate": 4.018214492529877e-05, "loss": 0.6913, "num_input_tokens_seen": 14055440, "step": 24375 }, { "epoch": 3.631218349717009, "grad_norm": 1.8276610374450684, "learning_rate": 4.017698118921226e-05, "loss": 0.7733, "num_input_tokens_seen": 14058064, "step": 24380 }, { "epoch": 3.6319630622579684, "grad_norm": 1.3138222694396973, "learning_rate": 4.0171816427527064e-05, "loss": 0.717, "num_input_tokens_seen": 14060880, "step": 24385 }, { "epoch": 3.6327077747989276, "grad_norm": 1.2904375791549683, "learning_rate": 4.016665064059219e-05, "loss": 0.5494, "num_input_tokens_seen": 14063952, "step": 24390 }, { "epoch": 3.633452487339887, "grad_norm": 1.3159059286117554, "learning_rate": 4.016148382875675e-05, "loss": 0.821, "num_input_tokens_seen": 14066768, "step": 24395 }, { "epoch": 3.634197199880846, "grad_norm": 1.3808749914169312, "learning_rate": 4.0156315992369864e-05, "loss": 0.6, "num_input_tokens_seen": 14069712, "step": 24400 }, { "epoch": 3.634941912421805, "grad_norm": 0.9633457064628601, "learning_rate": 4.015114713178077e-05, "loss": 0.4798, "num_input_tokens_seen": 14072720, "step": 24405 }, { "epoch": 3.6356866249627644, "grad_norm": 1.419954538345337, "learning_rate": 4.014597724733874e-05, "loss": 0.6314, "num_input_tokens_seen": 14075376, "step": 24410 }, { "epoch": 3.6364313375037236, "grad_norm": 0.854927659034729, "learning_rate": 4.0140806339393156e-05, "loss": 0.7409, "num_input_tokens_seen": 14078192, "step": 24415 }, { "epoch": 3.637176050044683, "grad_norm": 1.370648980140686, "learning_rate": 4.013563440829343e-05, "loss": 0.699, "num_input_tokens_seen": 14081072, "step": 24420 }, { "epoch": 3.637920762585642, "grad_norm": 0.8288107514381409, "learning_rate": 4.013046145438908e-05, "loss": 0.8155, "num_input_tokens_seen": 14084080, "step": 24425 }, { "epoch": 3.638665475126601, "grad_norm": 1.6933006048202515, "learning_rate": 4.012528747802965e-05, "loss": 0.616, "num_input_tokens_seen": 14087024, "step": 24430 }, { "epoch": 3.6394101876675604, "grad_norm": 1.1253814697265625, "learning_rate": 4.0120112479564795e-05, "loss": 0.6753, "num_input_tokens_seen": 14089808, "step": 24435 }, { "epoch": 3.6401549002085196, "grad_norm": 1.4869698286056519, "learning_rate": 4.01149364593442e-05, "loss": 0.8113, "num_input_tokens_seen": 14092784, "step": 24440 }, { "epoch": 3.640899612749479, "grad_norm": 1.425336241722107, "learning_rate": 4.010975941771766e-05, "loss": 0.5204, "num_input_tokens_seen": 14095600, "step": 24445 }, { "epoch": 3.641644325290438, "grad_norm": 2.2979021072387695, "learning_rate": 4.0104581355035015e-05, "loss": 0.6188, "num_input_tokens_seen": 14098544, "step": 24450 }, { "epoch": 3.6423890378313972, "grad_norm": 0.8585100769996643, "learning_rate": 4.0099402271646166e-05, "loss": 0.7214, "num_input_tokens_seen": 14101552, "step": 24455 }, { "epoch": 3.6431337503723564, "grad_norm": 1.8573460578918457, "learning_rate": 4.009422216790111e-05, "loss": 0.6886, "num_input_tokens_seen": 14104656, "step": 24460 }, { "epoch": 3.6438784629133156, "grad_norm": 2.6137375831604004, "learning_rate": 4.008904104414988e-05, "loss": 0.5042, "num_input_tokens_seen": 14107504, "step": 24465 }, { "epoch": 3.6446231754542744, "grad_norm": 0.8938392996788025, "learning_rate": 4.0083858900742604e-05, "loss": 0.5816, "num_input_tokens_seen": 14110384, "step": 24470 }, { "epoch": 3.645367887995234, "grad_norm": 1.5498358011245728, "learning_rate": 4.007867573802947e-05, "loss": 0.7465, "num_input_tokens_seen": 14113264, "step": 24475 }, { "epoch": 3.646112600536193, "grad_norm": 0.9916395545005798, "learning_rate": 4.007349155636074e-05, "loss": 0.7177, "num_input_tokens_seen": 14115984, "step": 24480 }, { "epoch": 3.6468573130771524, "grad_norm": 0.6603112816810608, "learning_rate": 4.006830635608673e-05, "loss": 0.61, "num_input_tokens_seen": 14118960, "step": 24485 }, { "epoch": 3.647602025618111, "grad_norm": 0.5744943022727966, "learning_rate": 4.006312013755784e-05, "loss": 0.7285, "num_input_tokens_seen": 14122160, "step": 24490 }, { "epoch": 3.648346738159071, "grad_norm": 1.0997827053070068, "learning_rate": 4.005793290112454e-05, "loss": 0.6126, "num_input_tokens_seen": 14124688, "step": 24495 }, { "epoch": 3.6490914507000296, "grad_norm": 1.135770320892334, "learning_rate": 4.005274464713735e-05, "loss": 0.696, "num_input_tokens_seen": 14127504, "step": 24500 }, { "epoch": 3.6498361632409893, "grad_norm": 1.1919606924057007, "learning_rate": 4.0047555375946876e-05, "loss": 0.5622, "num_input_tokens_seen": 14130352, "step": 24505 }, { "epoch": 3.650580875781948, "grad_norm": 1.3185549974441528, "learning_rate": 4.004236508790379e-05, "loss": 0.6014, "num_input_tokens_seen": 14133552, "step": 24510 }, { "epoch": 3.651325588322907, "grad_norm": 1.6335749626159668, "learning_rate": 4.003717378335883e-05, "loss": 0.6513, "num_input_tokens_seen": 14136240, "step": 24515 }, { "epoch": 3.6520703008638664, "grad_norm": 1.17546546459198, "learning_rate": 4.0031981462662806e-05, "loss": 0.7191, "num_input_tokens_seen": 14139280, "step": 24520 }, { "epoch": 3.6528150134048256, "grad_norm": 1.084100365638733, "learning_rate": 4.002678812616658e-05, "loss": 0.73, "num_input_tokens_seen": 14142160, "step": 24525 }, { "epoch": 3.653559725945785, "grad_norm": 1.4708671569824219, "learning_rate": 4.002159377422111e-05, "loss": 0.7637, "num_input_tokens_seen": 14145104, "step": 24530 }, { "epoch": 3.654304438486744, "grad_norm": 1.0831286907196045, "learning_rate": 4.001639840717741e-05, "loss": 0.7711, "num_input_tokens_seen": 14148016, "step": 24535 }, { "epoch": 3.6550491510277032, "grad_norm": 0.7450391054153442, "learning_rate": 4.001120202538656e-05, "loss": 0.5954, "num_input_tokens_seen": 14150768, "step": 24540 }, { "epoch": 3.6557938635686624, "grad_norm": 1.2568556070327759, "learning_rate": 4.000600462919971e-05, "loss": 0.7862, "num_input_tokens_seen": 14153648, "step": 24545 }, { "epoch": 3.6565385761096216, "grad_norm": 0.6723465919494629, "learning_rate": 4.000080621896807e-05, "loss": 0.4991, "num_input_tokens_seen": 14156688, "step": 24550 }, { "epoch": 3.657283288650581, "grad_norm": 1.0967808961868286, "learning_rate": 3.9995606795042936e-05, "loss": 0.6425, "num_input_tokens_seen": 14159504, "step": 24555 }, { "epoch": 3.65802800119154, "grad_norm": 1.2370697259902954, "learning_rate": 3.9990406357775664e-05, "loss": 0.7374, "num_input_tokens_seen": 14162544, "step": 24560 }, { "epoch": 3.6587727137324992, "grad_norm": 1.4323760271072388, "learning_rate": 3.998520490751767e-05, "loss": 0.7062, "num_input_tokens_seen": 14165520, "step": 24565 }, { "epoch": 3.6595174262734584, "grad_norm": 2.452850818634033, "learning_rate": 3.998000244462046e-05, "loss": 0.5305, "num_input_tokens_seen": 14168272, "step": 24570 }, { "epoch": 3.6602621388144176, "grad_norm": 2.9557301998138428, "learning_rate": 3.997479896943559e-05, "loss": 0.6513, "num_input_tokens_seen": 14171312, "step": 24575 }, { "epoch": 3.661006851355377, "grad_norm": 0.8303274512290955, "learning_rate": 3.996959448231469e-05, "loss": 0.6227, "num_input_tokens_seen": 14174160, "step": 24580 }, { "epoch": 3.661751563896336, "grad_norm": 1.435969352722168, "learning_rate": 3.9964388983609455e-05, "loss": 0.8717, "num_input_tokens_seen": 14177104, "step": 24585 }, { "epoch": 3.6624962764372953, "grad_norm": 1.8885557651519775, "learning_rate": 3.995918247367165e-05, "loss": 0.7574, "num_input_tokens_seen": 14179888, "step": 24590 }, { "epoch": 3.6632409889782545, "grad_norm": 1.6445975303649902, "learning_rate": 3.9953974952853125e-05, "loss": 0.6333, "num_input_tokens_seen": 14182544, "step": 24595 }, { "epoch": 3.6639857015192137, "grad_norm": 1.4788371324539185, "learning_rate": 3.994876642150576e-05, "loss": 0.6424, "num_input_tokens_seen": 14185264, "step": 24600 }, { "epoch": 3.664730414060173, "grad_norm": 1.2193455696105957, "learning_rate": 3.9943556879981534e-05, "loss": 0.7045, "num_input_tokens_seen": 14188272, "step": 24605 }, { "epoch": 3.665475126601132, "grad_norm": 0.8813959956169128, "learning_rate": 3.993834632863249e-05, "loss": 0.4465, "num_input_tokens_seen": 14191440, "step": 24610 }, { "epoch": 3.6662198391420913, "grad_norm": 1.172226071357727, "learning_rate": 3.993313476781075e-05, "loss": 0.5006, "num_input_tokens_seen": 14194256, "step": 24615 }, { "epoch": 3.6669645516830505, "grad_norm": 0.8230097889900208, "learning_rate": 3.992792219786847e-05, "loss": 0.6256, "num_input_tokens_seen": 14197296, "step": 24620 }, { "epoch": 3.6677092642240097, "grad_norm": 0.7636170983314514, "learning_rate": 3.9922708619157894e-05, "loss": 0.5116, "num_input_tokens_seen": 14200176, "step": 24625 }, { "epoch": 3.668453976764969, "grad_norm": 1.1406121253967285, "learning_rate": 3.9917494032031346e-05, "loss": 0.659, "num_input_tokens_seen": 14203088, "step": 24630 }, { "epoch": 3.669198689305928, "grad_norm": 2.4946162700653076, "learning_rate": 3.99122784368412e-05, "loss": 0.6733, "num_input_tokens_seen": 14206000, "step": 24635 }, { "epoch": 3.6699434018468873, "grad_norm": 2.27451753616333, "learning_rate": 3.990706183393991e-05, "loss": 0.8641, "num_input_tokens_seen": 14209232, "step": 24640 }, { "epoch": 3.670688114387846, "grad_norm": 1.1794393062591553, "learning_rate": 3.990184422367998e-05, "loss": 0.6302, "num_input_tokens_seen": 14211792, "step": 24645 }, { "epoch": 3.6714328269288057, "grad_norm": 0.8545753955841064, "learning_rate": 3.989662560641401e-05, "loss": 0.7426, "num_input_tokens_seen": 14214640, "step": 24650 }, { "epoch": 3.6721775394697644, "grad_norm": 1.7377187013626099, "learning_rate": 3.9891405982494647e-05, "loss": 0.5657, "num_input_tokens_seen": 14217904, "step": 24655 }, { "epoch": 3.672922252010724, "grad_norm": 1.1099528074264526, "learning_rate": 3.988618535227461e-05, "loss": 0.5596, "num_input_tokens_seen": 14221136, "step": 24660 }, { "epoch": 3.673666964551683, "grad_norm": 1.0703444480895996, "learning_rate": 3.988096371610669e-05, "loss": 0.6462, "num_input_tokens_seen": 14224176, "step": 24665 }, { "epoch": 3.6744116770926425, "grad_norm": 1.1893492937088013, "learning_rate": 3.9875741074343744e-05, "loss": 0.6016, "num_input_tokens_seen": 14227152, "step": 24670 }, { "epoch": 3.6751563896336013, "grad_norm": 0.8151219487190247, "learning_rate": 3.98705174273387e-05, "loss": 0.7594, "num_input_tokens_seen": 14229776, "step": 24675 }, { "epoch": 3.675901102174561, "grad_norm": 1.204484224319458, "learning_rate": 3.986529277544454e-05, "loss": 0.6809, "num_input_tokens_seen": 14232720, "step": 24680 }, { "epoch": 3.6766458147155197, "grad_norm": 0.8674242496490479, "learning_rate": 3.9860067119014334e-05, "loss": 0.5604, "num_input_tokens_seen": 14235536, "step": 24685 }, { "epoch": 3.677390527256479, "grad_norm": 0.9340413212776184, "learning_rate": 3.985484045840121e-05, "loss": 0.7306, "num_input_tokens_seen": 14238448, "step": 24690 }, { "epoch": 3.678135239797438, "grad_norm": 1.105926275253296, "learning_rate": 3.984961279395836e-05, "loss": 0.7712, "num_input_tokens_seen": 14241168, "step": 24695 }, { "epoch": 3.6788799523383973, "grad_norm": 1.7776122093200684, "learning_rate": 3.9844384126039055e-05, "loss": 0.5615, "num_input_tokens_seen": 14244080, "step": 24700 }, { "epoch": 3.6796246648793565, "grad_norm": 0.7969132661819458, "learning_rate": 3.983915445499663e-05, "loss": 0.6598, "num_input_tokens_seen": 14247088, "step": 24705 }, { "epoch": 3.6803693774203157, "grad_norm": 1.1141586303710938, "learning_rate": 3.983392378118447e-05, "loss": 0.7473, "num_input_tokens_seen": 14250288, "step": 24710 }, { "epoch": 3.681114089961275, "grad_norm": 1.2116928100585938, "learning_rate": 3.9828692104956054e-05, "loss": 0.4832, "num_input_tokens_seen": 14253872, "step": 24715 }, { "epoch": 3.681858802502234, "grad_norm": 0.8622713685035706, "learning_rate": 3.982345942666492e-05, "loss": 0.6859, "num_input_tokens_seen": 14256688, "step": 24720 }, { "epoch": 3.6826035150431933, "grad_norm": 1.3001527786254883, "learning_rate": 3.981822574666466e-05, "loss": 0.6112, "num_input_tokens_seen": 14259280, "step": 24725 }, { "epoch": 3.6833482275841525, "grad_norm": 1.501521348953247, "learning_rate": 3.9812991065308946e-05, "loss": 0.605, "num_input_tokens_seen": 14261872, "step": 24730 }, { "epoch": 3.6840929401251117, "grad_norm": 1.8503105640411377, "learning_rate": 3.980775538295153e-05, "loss": 0.7541, "num_input_tokens_seen": 14264752, "step": 24735 }, { "epoch": 3.684837652666071, "grad_norm": 0.8649353384971619, "learning_rate": 3.98025186999462e-05, "loss": 0.5684, "num_input_tokens_seen": 14267792, "step": 24740 }, { "epoch": 3.68558236520703, "grad_norm": 0.9959782361984253, "learning_rate": 3.979728101664685e-05, "loss": 0.6911, "num_input_tokens_seen": 14270544, "step": 24745 }, { "epoch": 3.6863270777479893, "grad_norm": 0.7915844917297363, "learning_rate": 3.9792042333407404e-05, "loss": 0.7071, "num_input_tokens_seen": 14273488, "step": 24750 }, { "epoch": 3.6870717902889485, "grad_norm": 0.8818081617355347, "learning_rate": 3.978680265058187e-05, "loss": 0.6786, "num_input_tokens_seen": 14276560, "step": 24755 }, { "epoch": 3.6878165028299077, "grad_norm": 1.0428210496902466, "learning_rate": 3.978156196852435e-05, "loss": 0.6232, "num_input_tokens_seen": 14279472, "step": 24760 }, { "epoch": 3.688561215370867, "grad_norm": 1.1874948740005493, "learning_rate": 3.977632028758895e-05, "loss": 0.6607, "num_input_tokens_seen": 14282416, "step": 24765 }, { "epoch": 3.689305927911826, "grad_norm": 1.6592808961868286, "learning_rate": 3.977107760812991e-05, "loss": 0.5073, "num_input_tokens_seen": 14285264, "step": 24770 }, { "epoch": 3.6900506404527853, "grad_norm": 1.6800347566604614, "learning_rate": 3.976583393050151e-05, "loss": 0.7386, "num_input_tokens_seen": 14288240, "step": 24775 }, { "epoch": 3.6907953529937445, "grad_norm": 1.1061431169509888, "learning_rate": 3.976058925505807e-05, "loss": 0.6211, "num_input_tokens_seen": 14291088, "step": 24780 }, { "epoch": 3.6915400655347037, "grad_norm": 1.0519490242004395, "learning_rate": 3.975534358215403e-05, "loss": 0.699, "num_input_tokens_seen": 14294384, "step": 24785 }, { "epoch": 3.692284778075663, "grad_norm": 1.340412974357605, "learning_rate": 3.9750096912143855e-05, "loss": 0.6197, "num_input_tokens_seen": 14297232, "step": 24790 }, { "epoch": 3.693029490616622, "grad_norm": 0.7060865163803101, "learning_rate": 3.97448492453821e-05, "loss": 0.6416, "num_input_tokens_seen": 14300336, "step": 24795 }, { "epoch": 3.6937742031575813, "grad_norm": 1.2371900081634521, "learning_rate": 3.973960058222339e-05, "loss": 0.8815, "num_input_tokens_seen": 14303088, "step": 24800 }, { "epoch": 3.6945189156985405, "grad_norm": 1.2792123556137085, "learning_rate": 3.973435092302239e-05, "loss": 0.5211, "num_input_tokens_seen": 14305744, "step": 24805 }, { "epoch": 3.6952636282394993, "grad_norm": 2.030644416809082, "learning_rate": 3.972910026813387e-05, "loss": 0.6987, "num_input_tokens_seen": 14308912, "step": 24810 }, { "epoch": 3.696008340780459, "grad_norm": 0.9301362037658691, "learning_rate": 3.972384861791263e-05, "loss": 0.5181, "num_input_tokens_seen": 14311792, "step": 24815 }, { "epoch": 3.6967530533214177, "grad_norm": 0.8935282230377197, "learning_rate": 3.971859597271357e-05, "loss": 0.578, "num_input_tokens_seen": 14314416, "step": 24820 }, { "epoch": 3.6974977658623773, "grad_norm": 2.048959255218506, "learning_rate": 3.9713342332891625e-05, "loss": 0.6452, "num_input_tokens_seen": 14317328, "step": 24825 }, { "epoch": 3.698242478403336, "grad_norm": 1.8358770608901978, "learning_rate": 3.9708087698801834e-05, "loss": 0.6244, "num_input_tokens_seen": 14320144, "step": 24830 }, { "epoch": 3.6989871909442957, "grad_norm": 1.7008556127548218, "learning_rate": 3.9702832070799265e-05, "loss": 0.5892, "num_input_tokens_seen": 14323024, "step": 24835 }, { "epoch": 3.6997319034852545, "grad_norm": 5.240391254425049, "learning_rate": 3.969757544923909e-05, "loss": 0.6132, "num_input_tokens_seen": 14325520, "step": 24840 }, { "epoch": 3.700476616026214, "grad_norm": 1.5877751111984253, "learning_rate": 3.969231783447652e-05, "loss": 0.5734, "num_input_tokens_seen": 14328624, "step": 24845 }, { "epoch": 3.701221328567173, "grad_norm": 0.7182145118713379, "learning_rate": 3.9687059226866854e-05, "loss": 0.4622, "num_input_tokens_seen": 14331312, "step": 24850 }, { "epoch": 3.701966041108132, "grad_norm": 2.7286486625671387, "learning_rate": 3.9681799626765425e-05, "loss": 0.7431, "num_input_tokens_seen": 14334256, "step": 24855 }, { "epoch": 3.7027107536490913, "grad_norm": 1.1522172689437866, "learning_rate": 3.9676539034527684e-05, "loss": 0.6902, "num_input_tokens_seen": 14337552, "step": 24860 }, { "epoch": 3.7034554661900505, "grad_norm": 1.4915319681167603, "learning_rate": 3.9671277450509094e-05, "loss": 0.7836, "num_input_tokens_seen": 14340432, "step": 24865 }, { "epoch": 3.7042001787310097, "grad_norm": 2.3015360832214355, "learning_rate": 3.9666014875065226e-05, "loss": 0.6116, "num_input_tokens_seen": 14343504, "step": 24870 }, { "epoch": 3.704944891271969, "grad_norm": 1.2965240478515625, "learning_rate": 3.9660751308551705e-05, "loss": 0.658, "num_input_tokens_seen": 14346224, "step": 24875 }, { "epoch": 3.705689603812928, "grad_norm": 1.26702880859375, "learning_rate": 3.965548675132421e-05, "loss": 0.5477, "num_input_tokens_seen": 14349520, "step": 24880 }, { "epoch": 3.7064343163538873, "grad_norm": 2.1837472915649414, "learning_rate": 3.96502212037385e-05, "loss": 0.8209, "num_input_tokens_seen": 14352624, "step": 24885 }, { "epoch": 3.7071790288948465, "grad_norm": 0.7824407815933228, "learning_rate": 3.964495466615042e-05, "loss": 0.6101, "num_input_tokens_seen": 14355344, "step": 24890 }, { "epoch": 3.7079237414358057, "grad_norm": 0.9173024296760559, "learning_rate": 3.963968713891584e-05, "loss": 0.6466, "num_input_tokens_seen": 14358160, "step": 24895 }, { "epoch": 3.708668453976765, "grad_norm": 0.9316690564155579, "learning_rate": 3.9634418622390727e-05, "loss": 0.6036, "num_input_tokens_seen": 14361008, "step": 24900 }, { "epoch": 3.709413166517724, "grad_norm": 0.8648401498794556, "learning_rate": 3.9629149116931086e-05, "loss": 0.735, "num_input_tokens_seen": 14363824, "step": 24905 }, { "epoch": 3.7101578790586833, "grad_norm": 1.6026206016540527, "learning_rate": 3.962387862289304e-05, "loss": 0.666, "num_input_tokens_seen": 14366928, "step": 24910 }, { "epoch": 3.7109025915996425, "grad_norm": 1.221055269241333, "learning_rate": 3.9618607140632724e-05, "loss": 0.6158, "num_input_tokens_seen": 14369776, "step": 24915 }, { "epoch": 3.7116473041406017, "grad_norm": 1.2545973062515259, "learning_rate": 3.9613334670506384e-05, "loss": 0.5559, "num_input_tokens_seen": 14372688, "step": 24920 }, { "epoch": 3.712392016681561, "grad_norm": 1.7548309564590454, "learning_rate": 3.9608061212870294e-05, "loss": 0.6498, "num_input_tokens_seen": 14375568, "step": 24925 }, { "epoch": 3.71313672922252, "grad_norm": 1.5089726448059082, "learning_rate": 3.960278676808082e-05, "loss": 0.5951, "num_input_tokens_seen": 14378224, "step": 24930 }, { "epoch": 3.7138814417634793, "grad_norm": 0.7322582602500916, "learning_rate": 3.959751133649439e-05, "loss": 0.8287, "num_input_tokens_seen": 14380944, "step": 24935 }, { "epoch": 3.7146261543044385, "grad_norm": 1.6787846088409424, "learning_rate": 3.959223491846749e-05, "loss": 0.5997, "num_input_tokens_seen": 14383888, "step": 24940 }, { "epoch": 3.7153708668453977, "grad_norm": 2.0803568363189697, "learning_rate": 3.958695751435668e-05, "loss": 0.8499, "num_input_tokens_seen": 14386928, "step": 24945 }, { "epoch": 3.716115579386357, "grad_norm": 1.575799822807312, "learning_rate": 3.958167912451859e-05, "loss": 0.5251, "num_input_tokens_seen": 14389680, "step": 24950 }, { "epoch": 3.716860291927316, "grad_norm": 1.1021817922592163, "learning_rate": 3.95763997493099e-05, "loss": 0.6896, "num_input_tokens_seen": 14392656, "step": 24955 }, { "epoch": 3.7176050044682754, "grad_norm": 2.2017509937286377, "learning_rate": 3.95711193890874e-05, "loss": 0.9011, "num_input_tokens_seen": 14395408, "step": 24960 }, { "epoch": 3.7183497170092346, "grad_norm": 2.762836217880249, "learning_rate": 3.956583804420787e-05, "loss": 0.6879, "num_input_tokens_seen": 14398640, "step": 24965 }, { "epoch": 3.7190944295501938, "grad_norm": 0.9374647736549377, "learning_rate": 3.9560555715028235e-05, "loss": 0.7332, "num_input_tokens_seen": 14401936, "step": 24970 }, { "epoch": 3.719839142091153, "grad_norm": 1.152592420578003, "learning_rate": 3.9555272401905445e-05, "loss": 0.6144, "num_input_tokens_seen": 14404784, "step": 24975 }, { "epoch": 3.720583854632112, "grad_norm": 0.7924676537513733, "learning_rate": 3.9549988105196525e-05, "loss": 0.5677, "num_input_tokens_seen": 14407728, "step": 24980 }, { "epoch": 3.721328567173071, "grad_norm": 0.799465000629425, "learning_rate": 3.954470282525856e-05, "loss": 0.7363, "num_input_tokens_seen": 14410544, "step": 24985 }, { "epoch": 3.7220732797140306, "grad_norm": 1.611391305923462, "learning_rate": 3.9539416562448715e-05, "loss": 0.6217, "num_input_tokens_seen": 14413520, "step": 24990 }, { "epoch": 3.7228179922549893, "grad_norm": 0.9286087155342102, "learning_rate": 3.953412931712421e-05, "loss": 0.5948, "num_input_tokens_seen": 14416464, "step": 24995 }, { "epoch": 3.723562704795949, "grad_norm": 1.0090223550796509, "learning_rate": 3.952884108964234e-05, "loss": 0.6115, "num_input_tokens_seen": 14419216, "step": 25000 }, { "epoch": 3.7243074173369077, "grad_norm": 0.7014596462249756, "learning_rate": 3.952355188036046e-05, "loss": 0.6722, "num_input_tokens_seen": 14422032, "step": 25005 }, { "epoch": 3.7250521298778674, "grad_norm": 3.3369319438934326, "learning_rate": 3.9518261689635995e-05, "loss": 0.6319, "num_input_tokens_seen": 14424592, "step": 25010 }, { "epoch": 3.725796842418826, "grad_norm": 3.008504629135132, "learning_rate": 3.951297051782643e-05, "loss": 0.7375, "num_input_tokens_seen": 14427440, "step": 25015 }, { "epoch": 3.726541554959786, "grad_norm": 0.9891433119773865, "learning_rate": 3.9507678365289316e-05, "loss": 0.4483, "num_input_tokens_seen": 14430704, "step": 25020 }, { "epoch": 3.7272862675007445, "grad_norm": 0.7856103777885437, "learning_rate": 3.950238523238229e-05, "loss": 0.8261, "num_input_tokens_seen": 14433616, "step": 25025 }, { "epoch": 3.7280309800417037, "grad_norm": 2.314333200454712, "learning_rate": 3.949709111946303e-05, "loss": 0.6503, "num_input_tokens_seen": 14436336, "step": 25030 }, { "epoch": 3.728775692582663, "grad_norm": 1.042895793914795, "learning_rate": 3.949179602688928e-05, "loss": 0.6095, "num_input_tokens_seen": 14439088, "step": 25035 }, { "epoch": 3.729520405123622, "grad_norm": 0.9291409254074097, "learning_rate": 3.9486499955018893e-05, "loss": 0.6655, "num_input_tokens_seen": 14442032, "step": 25040 }, { "epoch": 3.7302651176645814, "grad_norm": 1.1646150350570679, "learning_rate": 3.948120290420973e-05, "loss": 0.5979, "num_input_tokens_seen": 14445296, "step": 25045 }, { "epoch": 3.7310098302055406, "grad_norm": 1.3198069334030151, "learning_rate": 3.947590487481975e-05, "loss": 0.5142, "num_input_tokens_seen": 14447952, "step": 25050 }, { "epoch": 3.7317545427464998, "grad_norm": 1.9114048480987549, "learning_rate": 3.9470605867206976e-05, "loss": 0.8159, "num_input_tokens_seen": 14450672, "step": 25055 }, { "epoch": 3.732499255287459, "grad_norm": 1.0056816339492798, "learning_rate": 3.946530588172949e-05, "loss": 0.527, "num_input_tokens_seen": 14453744, "step": 25060 }, { "epoch": 3.733243967828418, "grad_norm": 2.4645934104919434, "learning_rate": 3.946000491874544e-05, "loss": 0.8719, "num_input_tokens_seen": 14456592, "step": 25065 }, { "epoch": 3.7339886803693774, "grad_norm": 1.4435406923294067, "learning_rate": 3.945470297861305e-05, "loss": 0.4947, "num_input_tokens_seen": 14459472, "step": 25070 }, { "epoch": 3.7347333929103366, "grad_norm": 0.9587147235870361, "learning_rate": 3.94494000616906e-05, "loss": 0.5909, "num_input_tokens_seen": 14462352, "step": 25075 }, { "epoch": 3.7354781054512958, "grad_norm": 0.9126144051551819, "learning_rate": 3.944409616833645e-05, "loss": 0.7091, "num_input_tokens_seen": 14465264, "step": 25080 }, { "epoch": 3.736222817992255, "grad_norm": 1.7718881368637085, "learning_rate": 3.9438791298909e-05, "loss": 1.0465, "num_input_tokens_seen": 14468176, "step": 25085 }, { "epoch": 3.736967530533214, "grad_norm": 0.9218520522117615, "learning_rate": 3.943348545376673e-05, "loss": 0.7031, "num_input_tokens_seen": 14471216, "step": 25090 }, { "epoch": 3.7377122430741734, "grad_norm": 1.3298239707946777, "learning_rate": 3.94281786332682e-05, "loss": 0.7158, "num_input_tokens_seen": 14474064, "step": 25095 }, { "epoch": 3.7384569556151326, "grad_norm": 1.6376174688339233, "learning_rate": 3.942287083777203e-05, "loss": 0.6795, "num_input_tokens_seen": 14476912, "step": 25100 }, { "epoch": 3.739201668156092, "grad_norm": 1.3044164180755615, "learning_rate": 3.941756206763687e-05, "loss": 0.7147, "num_input_tokens_seen": 14479824, "step": 25105 }, { "epoch": 3.739946380697051, "grad_norm": 1.572189450263977, "learning_rate": 3.9412252323221495e-05, "loss": 0.6597, "num_input_tokens_seen": 14482672, "step": 25110 }, { "epoch": 3.74069109323801, "grad_norm": 1.2341656684875488, "learning_rate": 3.94069416048847e-05, "loss": 0.5964, "num_input_tokens_seen": 14485264, "step": 25115 }, { "epoch": 3.7414358057789694, "grad_norm": 1.2743128538131714, "learning_rate": 3.940162991298537e-05, "loss": 0.6317, "num_input_tokens_seen": 14488016, "step": 25120 }, { "epoch": 3.7421805183199286, "grad_norm": 1.6964231729507446, "learning_rate": 3.9396317247882444e-05, "loss": 0.5818, "num_input_tokens_seen": 14490768, "step": 25125 }, { "epoch": 3.742925230860888, "grad_norm": 1.646492600440979, "learning_rate": 3.939100360993492e-05, "loss": 0.6163, "num_input_tokens_seen": 14493488, "step": 25130 }, { "epoch": 3.743669943401847, "grad_norm": 1.2505420446395874, "learning_rate": 3.938568899950188e-05, "loss": 0.663, "num_input_tokens_seen": 14496432, "step": 25135 }, { "epoch": 3.744414655942806, "grad_norm": 1.9147180318832397, "learning_rate": 3.9380373416942474e-05, "loss": 0.5308, "num_input_tokens_seen": 14499504, "step": 25140 }, { "epoch": 3.7451593684837654, "grad_norm": 1.3859401941299438, "learning_rate": 3.93750568626159e-05, "loss": 0.5941, "num_input_tokens_seen": 14502096, "step": 25145 }, { "epoch": 3.7459040810247246, "grad_norm": 1.8937859535217285, "learning_rate": 3.9369739336881426e-05, "loss": 0.5805, "num_input_tokens_seen": 14504912, "step": 25150 }, { "epoch": 3.746648793565684, "grad_norm": 2.4266107082366943, "learning_rate": 3.936442084009839e-05, "loss": 0.6475, "num_input_tokens_seen": 14507664, "step": 25155 }, { "epoch": 3.7473935061066426, "grad_norm": 1.7911738157272339, "learning_rate": 3.9359101372626195e-05, "loss": 0.694, "num_input_tokens_seen": 14510704, "step": 25160 }, { "epoch": 3.748138218647602, "grad_norm": 1.329999327659607, "learning_rate": 3.935378093482431e-05, "loss": 0.5927, "num_input_tokens_seen": 14513584, "step": 25165 }, { "epoch": 3.748882931188561, "grad_norm": 1.4631110429763794, "learning_rate": 3.9348459527052264e-05, "loss": 0.5145, "num_input_tokens_seen": 14516528, "step": 25170 }, { "epoch": 3.7496276437295206, "grad_norm": 1.9454090595245361, "learning_rate": 3.9343137149669665e-05, "loss": 0.5557, "num_input_tokens_seen": 14519696, "step": 25175 }, { "epoch": 3.7503723562704794, "grad_norm": 0.7846714854240417, "learning_rate": 3.933781380303617e-05, "loss": 0.8125, "num_input_tokens_seen": 14522800, "step": 25180 }, { "epoch": 3.751117068811439, "grad_norm": 1.5222100019454956, "learning_rate": 3.933248948751151e-05, "loss": 0.6036, "num_input_tokens_seen": 14525712, "step": 25185 }, { "epoch": 3.751861781352398, "grad_norm": 1.466081976890564, "learning_rate": 3.932716420345548e-05, "loss": 0.6462, "num_input_tokens_seen": 14528688, "step": 25190 }, { "epoch": 3.7526064938933574, "grad_norm": 1.3950258493423462, "learning_rate": 3.932183795122795e-05, "loss": 0.5836, "num_input_tokens_seen": 14532016, "step": 25195 }, { "epoch": 3.753351206434316, "grad_norm": 1.0811269283294678, "learning_rate": 3.931651073118884e-05, "loss": 0.5945, "num_input_tokens_seen": 14535152, "step": 25200 }, { "epoch": 3.7540959189752754, "grad_norm": 1.5155360698699951, "learning_rate": 3.931118254369813e-05, "loss": 0.6686, "num_input_tokens_seen": 14537968, "step": 25205 }, { "epoch": 3.7548406315162346, "grad_norm": 2.8889827728271484, "learning_rate": 3.93058533891159e-05, "loss": 0.8965, "num_input_tokens_seen": 14540688, "step": 25210 }, { "epoch": 3.755585344057194, "grad_norm": 1.1726853847503662, "learning_rate": 3.930052326780225e-05, "loss": 0.6049, "num_input_tokens_seen": 14543440, "step": 25215 }, { "epoch": 3.756330056598153, "grad_norm": 1.2264842987060547, "learning_rate": 3.929519218011739e-05, "loss": 0.6841, "num_input_tokens_seen": 14546352, "step": 25220 }, { "epoch": 3.757074769139112, "grad_norm": 1.672136902809143, "learning_rate": 3.928986012642156e-05, "loss": 0.599, "num_input_tokens_seen": 14549520, "step": 25225 }, { "epoch": 3.7578194816800714, "grad_norm": 0.9646811485290527, "learning_rate": 3.9284527107075075e-05, "loss": 0.7179, "num_input_tokens_seen": 14552592, "step": 25230 }, { "epoch": 3.7585641942210306, "grad_norm": 0.8220956325531006, "learning_rate": 3.927919312243833e-05, "loss": 0.5164, "num_input_tokens_seen": 14555984, "step": 25235 }, { "epoch": 3.75930890676199, "grad_norm": 0.9902646541595459, "learning_rate": 3.927385817287177e-05, "loss": 0.7056, "num_input_tokens_seen": 14558800, "step": 25240 }, { "epoch": 3.760053619302949, "grad_norm": 1.0440828800201416, "learning_rate": 3.926852225873591e-05, "loss": 0.6105, "num_input_tokens_seen": 14561584, "step": 25245 }, { "epoch": 3.760798331843908, "grad_norm": 1.6622302532196045, "learning_rate": 3.926318538039132e-05, "loss": 0.6958, "num_input_tokens_seen": 14564656, "step": 25250 }, { "epoch": 3.7615430443848674, "grad_norm": 1.192659616470337, "learning_rate": 3.9257847538198654e-05, "loss": 0.7751, "num_input_tokens_seen": 14567632, "step": 25255 }, { "epoch": 3.7622877569258266, "grad_norm": 1.354359745979309, "learning_rate": 3.9252508732518625e-05, "loss": 0.6453, "num_input_tokens_seen": 14570544, "step": 25260 }, { "epoch": 3.763032469466786, "grad_norm": 2.0185604095458984, "learning_rate": 3.9247168963712e-05, "loss": 0.7019, "num_input_tokens_seen": 14573520, "step": 25265 }, { "epoch": 3.763777182007745, "grad_norm": 1.2247354984283447, "learning_rate": 3.924182823213962e-05, "loss": 0.6061, "num_input_tokens_seen": 14576496, "step": 25270 }, { "epoch": 3.7645218945487042, "grad_norm": 1.1711992025375366, "learning_rate": 3.923648653816239e-05, "loss": 0.6044, "num_input_tokens_seen": 14579120, "step": 25275 }, { "epoch": 3.7652666070896634, "grad_norm": 1.4218788146972656, "learning_rate": 3.923114388214128e-05, "loss": 0.5841, "num_input_tokens_seen": 14582000, "step": 25280 }, { "epoch": 3.7660113196306226, "grad_norm": 0.9069129228591919, "learning_rate": 3.922580026443733e-05, "loss": 0.5814, "num_input_tokens_seen": 14584784, "step": 25285 }, { "epoch": 3.766756032171582, "grad_norm": 1.4348284006118774, "learning_rate": 3.922045568541164e-05, "loss": 0.7267, "num_input_tokens_seen": 14587600, "step": 25290 }, { "epoch": 3.767500744712541, "grad_norm": 1.3305904865264893, "learning_rate": 3.921511014542536e-05, "loss": 0.5267, "num_input_tokens_seen": 14590640, "step": 25295 }, { "epoch": 3.7682454572535002, "grad_norm": 1.1014596223831177, "learning_rate": 3.9209763644839736e-05, "loss": 0.5832, "num_input_tokens_seen": 14593360, "step": 25300 }, { "epoch": 3.7689901697944594, "grad_norm": 1.1838572025299072, "learning_rate": 3.9204416184016055e-05, "loss": 0.5481, "num_input_tokens_seen": 14596496, "step": 25305 }, { "epoch": 3.7697348823354186, "grad_norm": 0.8549033403396606, "learning_rate": 3.9199067763315685e-05, "loss": 0.6551, "num_input_tokens_seen": 14599664, "step": 25310 }, { "epoch": 3.770479594876378, "grad_norm": 1.3424888849258423, "learning_rate": 3.919371838310004e-05, "loss": 0.7168, "num_input_tokens_seen": 14602640, "step": 25315 }, { "epoch": 3.771224307417337, "grad_norm": 1.2660475969314575, "learning_rate": 3.9188368043730615e-05, "loss": 0.5622, "num_input_tokens_seen": 14605936, "step": 25320 }, { "epoch": 3.7719690199582963, "grad_norm": 1.1500682830810547, "learning_rate": 3.918301674556897e-05, "loss": 0.7481, "num_input_tokens_seen": 14608592, "step": 25325 }, { "epoch": 3.7727137324992555, "grad_norm": 0.7524701952934265, "learning_rate": 3.917766448897671e-05, "loss": 0.5524, "num_input_tokens_seen": 14611440, "step": 25330 }, { "epoch": 3.773458445040214, "grad_norm": 0.963231086730957, "learning_rate": 3.917231127431552e-05, "loss": 0.6722, "num_input_tokens_seen": 14614448, "step": 25335 }, { "epoch": 3.774203157581174, "grad_norm": 0.8923192024230957, "learning_rate": 3.9166957101947166e-05, "loss": 0.6465, "num_input_tokens_seen": 14617328, "step": 25340 }, { "epoch": 3.7749478701221326, "grad_norm": 1.7170822620391846, "learning_rate": 3.916160197223344e-05, "loss": 0.7106, "num_input_tokens_seen": 14620400, "step": 25345 }, { "epoch": 3.7756925826630923, "grad_norm": 0.9030530452728271, "learning_rate": 3.915624588553624e-05, "loss": 0.4987, "num_input_tokens_seen": 14622960, "step": 25350 }, { "epoch": 3.776437295204051, "grad_norm": 0.861387312412262, "learning_rate": 3.915088884221749e-05, "loss": 0.5784, "num_input_tokens_seen": 14625648, "step": 25355 }, { "epoch": 3.7771820077450107, "grad_norm": 0.8572165966033936, "learning_rate": 3.914553084263921e-05, "loss": 0.5915, "num_input_tokens_seen": 14628208, "step": 25360 }, { "epoch": 3.7779267202859694, "grad_norm": 1.5758333206176758, "learning_rate": 3.914017188716347e-05, "loss": 0.6269, "num_input_tokens_seen": 14631088, "step": 25365 }, { "epoch": 3.778671432826929, "grad_norm": 1.1444103717803955, "learning_rate": 3.9134811976152393e-05, "loss": 0.6809, "num_input_tokens_seen": 14633872, "step": 25370 }, { "epoch": 3.779416145367888, "grad_norm": 0.38786208629608154, "learning_rate": 3.91294511099682e-05, "loss": 0.5529, "num_input_tokens_seen": 14636528, "step": 25375 }, { "epoch": 3.780160857908847, "grad_norm": 2.54057240486145, "learning_rate": 3.912408928897314e-05, "loss": 0.5575, "num_input_tokens_seen": 14639248, "step": 25380 }, { "epoch": 3.7809055704498062, "grad_norm": 1.3620355129241943, "learning_rate": 3.911872651352956e-05, "loss": 0.7118, "num_input_tokens_seen": 14642288, "step": 25385 }, { "epoch": 3.7816502829907654, "grad_norm": 0.7981306314468384, "learning_rate": 3.911336278399984e-05, "loss": 0.6249, "num_input_tokens_seen": 14645328, "step": 25390 }, { "epoch": 3.7823949955317246, "grad_norm": 1.2505465745925903, "learning_rate": 3.9107998100746444e-05, "loss": 0.6896, "num_input_tokens_seen": 14648208, "step": 25395 }, { "epoch": 3.783139708072684, "grad_norm": 1.5687410831451416, "learning_rate": 3.9102632464131895e-05, "loss": 0.6481, "num_input_tokens_seen": 14650864, "step": 25400 }, { "epoch": 3.783884420613643, "grad_norm": 0.7582750916481018, "learning_rate": 3.909726587451878e-05, "loss": 0.6315, "num_input_tokens_seen": 14653616, "step": 25405 }, { "epoch": 3.7846291331546023, "grad_norm": 2.339428186416626, "learning_rate": 3.9091898332269746e-05, "loss": 0.6209, "num_input_tokens_seen": 14656848, "step": 25410 }, { "epoch": 3.7853738456955615, "grad_norm": 1.051435947418213, "learning_rate": 3.908652983774753e-05, "loss": 0.6255, "num_input_tokens_seen": 14659632, "step": 25415 }, { "epoch": 3.7861185582365207, "grad_norm": 1.2698477506637573, "learning_rate": 3.908116039131489e-05, "loss": 0.5926, "num_input_tokens_seen": 14662448, "step": 25420 }, { "epoch": 3.78686327077748, "grad_norm": 0.7342411875724792, "learning_rate": 3.9075789993334686e-05, "loss": 0.5053, "num_input_tokens_seen": 14665168, "step": 25425 }, { "epoch": 3.787607983318439, "grad_norm": 2.5183863639831543, "learning_rate": 3.907041864416982e-05, "loss": 0.7472, "num_input_tokens_seen": 14668112, "step": 25430 }, { "epoch": 3.7883526958593983, "grad_norm": 1.3057711124420166, "learning_rate": 3.9065046344183265e-05, "loss": 0.7781, "num_input_tokens_seen": 14670832, "step": 25435 }, { "epoch": 3.7890974084003575, "grad_norm": 1.2842111587524414, "learning_rate": 3.905967309373806e-05, "loss": 0.6668, "num_input_tokens_seen": 14673552, "step": 25440 }, { "epoch": 3.7898421209413167, "grad_norm": 1.0437004566192627, "learning_rate": 3.905429889319732e-05, "loss": 0.5205, "num_input_tokens_seen": 14676304, "step": 25445 }, { "epoch": 3.790586833482276, "grad_norm": 1.387655258178711, "learning_rate": 3.904892374292419e-05, "loss": 0.6522, "num_input_tokens_seen": 14679280, "step": 25450 }, { "epoch": 3.791331546023235, "grad_norm": 1.0857428312301636, "learning_rate": 3.904354764328192e-05, "loss": 0.7557, "num_input_tokens_seen": 14682448, "step": 25455 }, { "epoch": 3.7920762585641943, "grad_norm": 1.6580897569656372, "learning_rate": 3.903817059463379e-05, "loss": 0.6408, "num_input_tokens_seen": 14685296, "step": 25460 }, { "epoch": 3.7928209711051535, "grad_norm": 0.9967105984687805, "learning_rate": 3.903279259734318e-05, "loss": 0.7299, "num_input_tokens_seen": 14688208, "step": 25465 }, { "epoch": 3.7935656836461127, "grad_norm": 1.2970917224884033, "learning_rate": 3.902741365177349e-05, "loss": 0.5806, "num_input_tokens_seen": 14691184, "step": 25470 }, { "epoch": 3.794310396187072, "grad_norm": 1.051723599433899, "learning_rate": 3.902203375828822e-05, "loss": 0.6482, "num_input_tokens_seen": 14694192, "step": 25475 }, { "epoch": 3.795055108728031, "grad_norm": 1.0084329843521118, "learning_rate": 3.901665291725091e-05, "loss": 0.5889, "num_input_tokens_seen": 14697232, "step": 25480 }, { "epoch": 3.7957998212689903, "grad_norm": 1.1763142347335815, "learning_rate": 3.901127112902519e-05, "loss": 0.6686, "num_input_tokens_seen": 14700080, "step": 25485 }, { "epoch": 3.7965445338099495, "grad_norm": 1.5402355194091797, "learning_rate": 3.9005888393974735e-05, "loss": 0.5472, "num_input_tokens_seen": 14703024, "step": 25490 }, { "epoch": 3.7972892463509087, "grad_norm": 1.0346993207931519, "learning_rate": 3.900050471246328e-05, "loss": 0.6617, "num_input_tokens_seen": 14705584, "step": 25495 }, { "epoch": 3.798033958891868, "grad_norm": 1.192596673965454, "learning_rate": 3.899512008485464e-05, "loss": 0.5878, "num_input_tokens_seen": 14708432, "step": 25500 }, { "epoch": 3.798778671432827, "grad_norm": 1.138260841369629, "learning_rate": 3.898973451151269e-05, "loss": 0.5614, "num_input_tokens_seen": 14711024, "step": 25505 }, { "epoch": 3.799523383973786, "grad_norm": 0.7905327677726746, "learning_rate": 3.8984347992801355e-05, "loss": 0.5472, "num_input_tokens_seen": 14713776, "step": 25510 }, { "epoch": 3.8002680965147455, "grad_norm": 1.2742042541503906, "learning_rate": 3.897896052908464e-05, "loss": 0.6577, "num_input_tokens_seen": 14716656, "step": 25515 }, { "epoch": 3.8010128090557043, "grad_norm": 1.187861680984497, "learning_rate": 3.897357212072661e-05, "loss": 0.6372, "num_input_tokens_seen": 14719696, "step": 25520 }, { "epoch": 3.801757521596664, "grad_norm": 1.2681180238723755, "learning_rate": 3.896818276809139e-05, "loss": 0.6762, "num_input_tokens_seen": 14722608, "step": 25525 }, { "epoch": 3.8025022341376227, "grad_norm": 2.2350118160247803, "learning_rate": 3.896279247154316e-05, "loss": 0.6147, "num_input_tokens_seen": 14725520, "step": 25530 }, { "epoch": 3.8032469466785823, "grad_norm": 1.2507069110870361, "learning_rate": 3.8957401231446186e-05, "loss": 0.5758, "num_input_tokens_seen": 14728176, "step": 25535 }, { "epoch": 3.803991659219541, "grad_norm": 2.0837323665618896, "learning_rate": 3.895200904816478e-05, "loss": 0.684, "num_input_tokens_seen": 14731024, "step": 25540 }, { "epoch": 3.8047363717605007, "grad_norm": 2.1052424907684326, "learning_rate": 3.8946615922063334e-05, "loss": 0.5392, "num_input_tokens_seen": 14733808, "step": 25545 }, { "epoch": 3.8054810843014595, "grad_norm": 1.2974597215652466, "learning_rate": 3.894122185350629e-05, "loss": 0.8005, "num_input_tokens_seen": 14737040, "step": 25550 }, { "epoch": 3.8062257968424187, "grad_norm": 1.0990965366363525, "learning_rate": 3.8935826842858144e-05, "loss": 0.7049, "num_input_tokens_seen": 14740016, "step": 25555 }, { "epoch": 3.806970509383378, "grad_norm": 2.7166285514831543, "learning_rate": 3.8930430890483486e-05, "loss": 0.5628, "num_input_tokens_seen": 14743344, "step": 25560 }, { "epoch": 3.807715221924337, "grad_norm": 1.8111093044281006, "learning_rate": 3.892503399674694e-05, "loss": 0.7269, "num_input_tokens_seen": 14746096, "step": 25565 }, { "epoch": 3.8084599344652963, "grad_norm": 1.967833399772644, "learning_rate": 3.8919636162013216e-05, "loss": 0.786, "num_input_tokens_seen": 14748976, "step": 25570 }, { "epoch": 3.8092046470062555, "grad_norm": 1.354033350944519, "learning_rate": 3.8914237386647076e-05, "loss": 0.5625, "num_input_tokens_seen": 14751664, "step": 25575 }, { "epoch": 3.8099493595472147, "grad_norm": 1.7577424049377441, "learning_rate": 3.8908837671013345e-05, "loss": 0.6377, "num_input_tokens_seen": 14754608, "step": 25580 }, { "epoch": 3.810694072088174, "grad_norm": 0.9071455597877502, "learning_rate": 3.8903437015476903e-05, "loss": 0.6246, "num_input_tokens_seen": 14757648, "step": 25585 }, { "epoch": 3.811438784629133, "grad_norm": 0.9839497804641724, "learning_rate": 3.889803542040272e-05, "loss": 0.7759, "num_input_tokens_seen": 14760240, "step": 25590 }, { "epoch": 3.8121834971700923, "grad_norm": 1.0941582918167114, "learning_rate": 3.889263288615581e-05, "loss": 0.5495, "num_input_tokens_seen": 14763280, "step": 25595 }, { "epoch": 3.8129282097110515, "grad_norm": 1.0863128900527954, "learning_rate": 3.888722941310126e-05, "loss": 0.5418, "num_input_tokens_seen": 14766224, "step": 25600 }, { "epoch": 3.8136729222520107, "grad_norm": 1.105827808380127, "learning_rate": 3.88818250016042e-05, "loss": 0.5541, "num_input_tokens_seen": 14769168, "step": 25605 }, { "epoch": 3.81441763479297, "grad_norm": 0.9906269907951355, "learning_rate": 3.887641965202984e-05, "loss": 0.5807, "num_input_tokens_seen": 14772240, "step": 25610 }, { "epoch": 3.815162347333929, "grad_norm": 1.5747185945510864, "learning_rate": 3.887101336474346e-05, "loss": 0.6265, "num_input_tokens_seen": 14774992, "step": 25615 }, { "epoch": 3.8159070598748883, "grad_norm": 1.1963907480239868, "learning_rate": 3.88656061401104e-05, "loss": 0.7103, "num_input_tokens_seen": 14778000, "step": 25620 }, { "epoch": 3.8166517724158475, "grad_norm": 1.1928279399871826, "learning_rate": 3.886019797849605e-05, "loss": 0.6319, "num_input_tokens_seen": 14780912, "step": 25625 }, { "epoch": 3.8173964849568067, "grad_norm": 2.138723373413086, "learning_rate": 3.8854788880265865e-05, "loss": 0.4918, "num_input_tokens_seen": 14783952, "step": 25630 }, { "epoch": 3.818141197497766, "grad_norm": 1.4735703468322754, "learning_rate": 3.884937884578538e-05, "loss": 0.4873, "num_input_tokens_seen": 14786768, "step": 25635 }, { "epoch": 3.818885910038725, "grad_norm": 0.8483983874320984, "learning_rate": 3.884396787542017e-05, "loss": 0.5613, "num_input_tokens_seen": 14789520, "step": 25640 }, { "epoch": 3.8196306225796843, "grad_norm": 2.6678037643432617, "learning_rate": 3.8838555969535915e-05, "loss": 0.6012, "num_input_tokens_seen": 14792752, "step": 25645 }, { "epoch": 3.8203753351206435, "grad_norm": 4.967041969299316, "learning_rate": 3.8833143128498303e-05, "loss": 0.5302, "num_input_tokens_seen": 14795760, "step": 25650 }, { "epoch": 3.8211200476616027, "grad_norm": 1.6498761177062988, "learning_rate": 3.882772935267312e-05, "loss": 0.5914, "num_input_tokens_seen": 14798672, "step": 25655 }, { "epoch": 3.821864760202562, "grad_norm": 1.3713194131851196, "learning_rate": 3.8822314642426204e-05, "loss": 0.4233, "num_input_tokens_seen": 14801200, "step": 25660 }, { "epoch": 3.822609472743521, "grad_norm": 1.6420458555221558, "learning_rate": 3.8816898998123464e-05, "loss": 0.6018, "num_input_tokens_seen": 14804176, "step": 25665 }, { "epoch": 3.8233541852844803, "grad_norm": 2.1513636112213135, "learning_rate": 3.8811482420130866e-05, "loss": 0.5883, "num_input_tokens_seen": 14806992, "step": 25670 }, { "epoch": 3.824098897825439, "grad_norm": 1.357710838317871, "learning_rate": 3.8806064908814435e-05, "loss": 0.7353, "num_input_tokens_seen": 14810000, "step": 25675 }, { "epoch": 3.8248436103663987, "grad_norm": 0.7081170678138733, "learning_rate": 3.880064646454027e-05, "loss": 0.4645, "num_input_tokens_seen": 14813008, "step": 25680 }, { "epoch": 3.8255883229073575, "grad_norm": 1.4067906141281128, "learning_rate": 3.8795227087674535e-05, "loss": 0.6494, "num_input_tokens_seen": 14816016, "step": 25685 }, { "epoch": 3.826333035448317, "grad_norm": 1.9907561540603638, "learning_rate": 3.878980677858344e-05, "loss": 0.4381, "num_input_tokens_seen": 14818608, "step": 25690 }, { "epoch": 3.827077747989276, "grad_norm": 1.5152173042297363, "learning_rate": 3.878438553763326e-05, "loss": 0.6437, "num_input_tokens_seen": 14821392, "step": 25695 }, { "epoch": 3.8278224605302356, "grad_norm": 2.838963747024536, "learning_rate": 3.877896336519035e-05, "loss": 0.6178, "num_input_tokens_seen": 14824528, "step": 25700 }, { "epoch": 3.8285671730711943, "grad_norm": 1.0832821130752563, "learning_rate": 3.877354026162112e-05, "loss": 0.7442, "num_input_tokens_seen": 14827536, "step": 25705 }, { "epoch": 3.829311885612154, "grad_norm": 1.4777569770812988, "learning_rate": 3.876811622729203e-05, "loss": 0.6054, "num_input_tokens_seen": 14830320, "step": 25710 }, { "epoch": 3.8300565981531127, "grad_norm": 4.0995707511901855, "learning_rate": 3.8762691262569625e-05, "loss": 0.6206, "num_input_tokens_seen": 14832944, "step": 25715 }, { "epoch": 3.830801310694072, "grad_norm": 2.084083318710327, "learning_rate": 3.875726536782051e-05, "loss": 0.7547, "num_input_tokens_seen": 14835920, "step": 25720 }, { "epoch": 3.831546023235031, "grad_norm": 1.4932867288589478, "learning_rate": 3.8751838543411325e-05, "loss": 0.7319, "num_input_tokens_seen": 14838896, "step": 25725 }, { "epoch": 3.8322907357759903, "grad_norm": 0.48096248507499695, "learning_rate": 3.8746410789708806e-05, "loss": 0.7198, "num_input_tokens_seen": 14841776, "step": 25730 }, { "epoch": 3.8330354483169495, "grad_norm": 1.315788745880127, "learning_rate": 3.8740982107079735e-05, "loss": 0.5246, "num_input_tokens_seen": 14844784, "step": 25735 }, { "epoch": 3.8337801608579087, "grad_norm": 2.932184934616089, "learning_rate": 3.873555249589096e-05, "loss": 0.7123, "num_input_tokens_seen": 14847792, "step": 25740 }, { "epoch": 3.834524873398868, "grad_norm": 1.7953405380249023, "learning_rate": 3.873012195650939e-05, "loss": 0.7453, "num_input_tokens_seen": 14850544, "step": 25745 }, { "epoch": 3.835269585939827, "grad_norm": 2.13400936126709, "learning_rate": 3.8724690489302004e-05, "loss": 0.7233, "num_input_tokens_seen": 14853488, "step": 25750 }, { "epoch": 3.8360142984807863, "grad_norm": 1.0013681650161743, "learning_rate": 3.871925809463583e-05, "loss": 0.6233, "num_input_tokens_seen": 14856336, "step": 25755 }, { "epoch": 3.8367590110217455, "grad_norm": 1.3139158487319946, "learning_rate": 3.871382477287797e-05, "loss": 0.7231, "num_input_tokens_seen": 14859120, "step": 25760 }, { "epoch": 3.8375037235627047, "grad_norm": 1.15689218044281, "learning_rate": 3.87083905243956e-05, "loss": 0.8262, "num_input_tokens_seen": 14862288, "step": 25765 }, { "epoch": 3.838248436103664, "grad_norm": 1.0533525943756104, "learning_rate": 3.8702955349555924e-05, "loss": 0.6837, "num_input_tokens_seen": 14865136, "step": 25770 }, { "epoch": 3.838993148644623, "grad_norm": 1.0093998908996582, "learning_rate": 3.8697519248726236e-05, "loss": 0.5932, "num_input_tokens_seen": 14868272, "step": 25775 }, { "epoch": 3.8397378611855824, "grad_norm": 0.8368162512779236, "learning_rate": 3.869208222227389e-05, "loss": 0.757, "num_input_tokens_seen": 14870896, "step": 25780 }, { "epoch": 3.8404825737265416, "grad_norm": 0.9098140001296997, "learning_rate": 3.86866442705663e-05, "loss": 0.6924, "num_input_tokens_seen": 14873680, "step": 25785 }, { "epoch": 3.8412272862675008, "grad_norm": 0.8986436724662781, "learning_rate": 3.868120539397093e-05, "loss": 0.5982, "num_input_tokens_seen": 14876656, "step": 25790 }, { "epoch": 3.84197199880846, "grad_norm": 1.109225869178772, "learning_rate": 3.867576559285533e-05, "loss": 0.6266, "num_input_tokens_seen": 14879184, "step": 25795 }, { "epoch": 3.842716711349419, "grad_norm": 1.2862601280212402, "learning_rate": 3.867032486758708e-05, "loss": 0.6714, "num_input_tokens_seen": 14881936, "step": 25800 }, { "epoch": 3.8434614238903784, "grad_norm": 1.0051294565200806, "learning_rate": 3.8664883218533873e-05, "loss": 0.5429, "num_input_tokens_seen": 14885136, "step": 25805 }, { "epoch": 3.8442061364313376, "grad_norm": 1.3046205043792725, "learning_rate": 3.8659440646063404e-05, "loss": 0.5071, "num_input_tokens_seen": 14887856, "step": 25810 }, { "epoch": 3.8449508489722968, "grad_norm": 2.029738664627075, "learning_rate": 3.865399715054347e-05, "loss": 0.84, "num_input_tokens_seen": 14890512, "step": 25815 }, { "epoch": 3.845695561513256, "grad_norm": 0.9560750722885132, "learning_rate": 3.8648552732341925e-05, "loss": 0.663, "num_input_tokens_seen": 14893488, "step": 25820 }, { "epoch": 3.846440274054215, "grad_norm": 1.5754272937774658, "learning_rate": 3.8643107391826676e-05, "loss": 0.5354, "num_input_tokens_seen": 14896496, "step": 25825 }, { "epoch": 3.8471849865951744, "grad_norm": 1.2398459911346436, "learning_rate": 3.86376611293657e-05, "loss": 0.6288, "num_input_tokens_seen": 14899504, "step": 25830 }, { "epoch": 3.8479296991361336, "grad_norm": 1.8920769691467285, "learning_rate": 3.8632213945327036e-05, "loss": 0.5, "num_input_tokens_seen": 14902192, "step": 25835 }, { "epoch": 3.848674411677093, "grad_norm": 0.7568253874778748, "learning_rate": 3.8626765840078765e-05, "loss": 0.6083, "num_input_tokens_seen": 14905424, "step": 25840 }, { "epoch": 3.849419124218052, "grad_norm": 0.6759280562400818, "learning_rate": 3.862131681398907e-05, "loss": 0.5399, "num_input_tokens_seen": 14908240, "step": 25845 }, { "epoch": 3.8501638367590107, "grad_norm": 1.2505970001220703, "learning_rate": 3.8615866867426164e-05, "loss": 0.6754, "num_input_tokens_seen": 14911024, "step": 25850 }, { "epoch": 3.8509085492999704, "grad_norm": 1.7497062683105469, "learning_rate": 3.8610416000758334e-05, "loss": 0.864, "num_input_tokens_seen": 14914064, "step": 25855 }, { "epoch": 3.851653261840929, "grad_norm": 0.8408340215682983, "learning_rate": 3.860496421435392e-05, "loss": 0.5279, "num_input_tokens_seen": 14916912, "step": 25860 }, { "epoch": 3.852397974381889, "grad_norm": 0.9763840436935425, "learning_rate": 3.859951150858135e-05, "loss": 0.6717, "num_input_tokens_seen": 14919856, "step": 25865 }, { "epoch": 3.8531426869228476, "grad_norm": 1.2198448181152344, "learning_rate": 3.859405788380908e-05, "loss": 0.682, "num_input_tokens_seen": 14922704, "step": 25870 }, { "epoch": 3.853887399463807, "grad_norm": 1.755096673965454, "learning_rate": 3.858860334040564e-05, "loss": 0.6689, "num_input_tokens_seen": 14925872, "step": 25875 }, { "epoch": 3.854632112004766, "grad_norm": 1.980111002922058, "learning_rate": 3.858314787873964e-05, "loss": 0.6667, "num_input_tokens_seen": 14928720, "step": 25880 }, { "epoch": 3.8553768245457256, "grad_norm": 0.9137519001960754, "learning_rate": 3.857769149917973e-05, "loss": 0.7758, "num_input_tokens_seen": 14931696, "step": 25885 }, { "epoch": 3.8561215370866844, "grad_norm": 0.8813811540603638, "learning_rate": 3.857223420209464e-05, "loss": 0.6712, "num_input_tokens_seen": 14934672, "step": 25890 }, { "epoch": 3.8568662496276436, "grad_norm": 1.1282215118408203, "learning_rate": 3.856677598785313e-05, "loss": 0.4994, "num_input_tokens_seen": 14937488, "step": 25895 }, { "epoch": 3.8576109621686028, "grad_norm": 0.8662076592445374, "learning_rate": 3.856131685682406e-05, "loss": 0.6737, "num_input_tokens_seen": 14940560, "step": 25900 }, { "epoch": 3.858355674709562, "grad_norm": 1.497543454170227, "learning_rate": 3.855585680937634e-05, "loss": 0.4799, "num_input_tokens_seen": 14943536, "step": 25905 }, { "epoch": 3.859100387250521, "grad_norm": 1.1974319219589233, "learning_rate": 3.8550395845878925e-05, "loss": 0.5963, "num_input_tokens_seen": 14946480, "step": 25910 }, { "epoch": 3.8598450997914804, "grad_norm": 1.094441294670105, "learning_rate": 3.854493396670085e-05, "loss": 0.5441, "num_input_tokens_seen": 14948976, "step": 25915 }, { "epoch": 3.8605898123324396, "grad_norm": 1.9344834089279175, "learning_rate": 3.8539471172211204e-05, "loss": 0.6994, "num_input_tokens_seen": 14951696, "step": 25920 }, { "epoch": 3.861334524873399, "grad_norm": 1.4955098628997803, "learning_rate": 3.8534007462779154e-05, "loss": 0.6257, "num_input_tokens_seen": 14954384, "step": 25925 }, { "epoch": 3.862079237414358, "grad_norm": 1.9355905055999756, "learning_rate": 3.85285428387739e-05, "loss": 0.7268, "num_input_tokens_seen": 14957584, "step": 25930 }, { "epoch": 3.862823949955317, "grad_norm": 1.224539041519165, "learning_rate": 3.852307730056472e-05, "loss": 0.6226, "num_input_tokens_seen": 14960464, "step": 25935 }, { "epoch": 3.8635686624962764, "grad_norm": 1.3295437097549438, "learning_rate": 3.851761084852096e-05, "loss": 0.6607, "num_input_tokens_seen": 14963568, "step": 25940 }, { "epoch": 3.8643133750372356, "grad_norm": 1.7358897924423218, "learning_rate": 3.851214348301202e-05, "loss": 0.7024, "num_input_tokens_seen": 14966288, "step": 25945 }, { "epoch": 3.865058087578195, "grad_norm": 2.620610475540161, "learning_rate": 3.850667520440735e-05, "loss": 0.6816, "num_input_tokens_seen": 14969008, "step": 25950 }, { "epoch": 3.865802800119154, "grad_norm": 1.2997223138809204, "learning_rate": 3.8501206013076494e-05, "loss": 0.8112, "num_input_tokens_seen": 14971824, "step": 25955 }, { "epoch": 3.866547512660113, "grad_norm": 0.6518428325653076, "learning_rate": 3.849573590938903e-05, "loss": 0.5294, "num_input_tokens_seen": 14974832, "step": 25960 }, { "epoch": 3.8672922252010724, "grad_norm": 1.5182620286941528, "learning_rate": 3.849026489371459e-05, "loss": 0.6333, "num_input_tokens_seen": 14977712, "step": 25965 }, { "epoch": 3.8680369377420316, "grad_norm": 1.0004290342330933, "learning_rate": 3.848479296642291e-05, "loss": 0.5919, "num_input_tokens_seen": 14980528, "step": 25970 }, { "epoch": 3.868781650282991, "grad_norm": 0.8938931226730347, "learning_rate": 3.8479320127883744e-05, "loss": 0.5706, "num_input_tokens_seen": 14983664, "step": 25975 }, { "epoch": 3.86952636282395, "grad_norm": 1.2611664533615112, "learning_rate": 3.8473846378466915e-05, "loss": 0.5406, "num_input_tokens_seen": 14986416, "step": 25980 }, { "epoch": 3.870271075364909, "grad_norm": 1.7618616819381714, "learning_rate": 3.846837171854234e-05, "loss": 0.7447, "num_input_tokens_seen": 14989200, "step": 25985 }, { "epoch": 3.8710157879058684, "grad_norm": 0.7434346079826355, "learning_rate": 3.8462896148479966e-05, "loss": 0.6024, "num_input_tokens_seen": 14992112, "step": 25990 }, { "epoch": 3.8717605004468276, "grad_norm": 1.2714003324508667, "learning_rate": 3.8457419668649795e-05, "loss": 0.6815, "num_input_tokens_seen": 14994864, "step": 25995 }, { "epoch": 3.872505212987787, "grad_norm": 1.2037608623504639, "learning_rate": 3.845194227942192e-05, "loss": 0.658, "num_input_tokens_seen": 14997744, "step": 26000 }, { "epoch": 3.873249925528746, "grad_norm": 1.3177809715270996, "learning_rate": 3.844646398116648e-05, "loss": 0.6926, "num_input_tokens_seen": 15000912, "step": 26005 }, { "epoch": 3.8739946380697052, "grad_norm": 1.8223382234573364, "learning_rate": 3.844098477425368e-05, "loss": 0.6982, "num_input_tokens_seen": 15003824, "step": 26010 }, { "epoch": 3.8747393506106644, "grad_norm": 2.82753324508667, "learning_rate": 3.843550465905376e-05, "loss": 0.5993, "num_input_tokens_seen": 15006768, "step": 26015 }, { "epoch": 3.8754840631516236, "grad_norm": 1.6448982954025269, "learning_rate": 3.843002363593707e-05, "loss": 0.698, "num_input_tokens_seen": 15009616, "step": 26020 }, { "epoch": 3.8762287756925824, "grad_norm": 2.2137467861175537, "learning_rate": 3.842454170527398e-05, "loss": 0.7, "num_input_tokens_seen": 15012272, "step": 26025 }, { "epoch": 3.876973488233542, "grad_norm": 1.0177922248840332, "learning_rate": 3.841905886743494e-05, "loss": 0.56, "num_input_tokens_seen": 15015152, "step": 26030 }, { "epoch": 3.877718200774501, "grad_norm": 1.7509615421295166, "learning_rate": 3.841357512279047e-05, "loss": 0.7086, "num_input_tokens_seen": 15018384, "step": 26035 }, { "epoch": 3.8784629133154604, "grad_norm": 0.8329729437828064, "learning_rate": 3.8408090471711125e-05, "loss": 0.4885, "num_input_tokens_seen": 15021040, "step": 26040 }, { "epoch": 3.879207625856419, "grad_norm": 1.5808079242706299, "learning_rate": 3.840260491456753e-05, "loss": 0.5726, "num_input_tokens_seen": 15023760, "step": 26045 }, { "epoch": 3.879952338397379, "grad_norm": 2.0453438758850098, "learning_rate": 3.83971184517304e-05, "loss": 0.787, "num_input_tokens_seen": 15026384, "step": 26050 }, { "epoch": 3.8806970509383376, "grad_norm": 1.2739617824554443, "learning_rate": 3.8391631083570464e-05, "loss": 0.7593, "num_input_tokens_seen": 15029424, "step": 26055 }, { "epoch": 3.8814417634792973, "grad_norm": 0.9921850562095642, "learning_rate": 3.838614281045855e-05, "loss": 0.6411, "num_input_tokens_seen": 15032144, "step": 26060 }, { "epoch": 3.882186476020256, "grad_norm": 0.7969701290130615, "learning_rate": 3.838065363276553e-05, "loss": 0.5001, "num_input_tokens_seen": 15034768, "step": 26065 }, { "epoch": 3.882931188561215, "grad_norm": 2.059326410293579, "learning_rate": 3.837516355086234e-05, "loss": 0.7262, "num_input_tokens_seen": 15037808, "step": 26070 }, { "epoch": 3.8836759011021744, "grad_norm": 1.2286673784255981, "learning_rate": 3.8369672565119975e-05, "loss": 0.5854, "num_input_tokens_seen": 15040624, "step": 26075 }, { "epoch": 3.8844206136431336, "grad_norm": 1.3449372053146362, "learning_rate": 3.836418067590949e-05, "loss": 0.657, "num_input_tokens_seen": 15043440, "step": 26080 }, { "epoch": 3.885165326184093, "grad_norm": 1.1033278703689575, "learning_rate": 3.835868788360201e-05, "loss": 0.5705, "num_input_tokens_seen": 15046160, "step": 26085 }, { "epoch": 3.885910038725052, "grad_norm": 0.8433933854103088, "learning_rate": 3.8353194188568725e-05, "loss": 0.4916, "num_input_tokens_seen": 15049104, "step": 26090 }, { "epoch": 3.8866547512660112, "grad_norm": 1.5149595737457275, "learning_rate": 3.8347699591180855e-05, "loss": 0.8314, "num_input_tokens_seen": 15052272, "step": 26095 }, { "epoch": 3.8873994638069704, "grad_norm": 2.0564193725585938, "learning_rate": 3.8342204091809716e-05, "loss": 0.656, "num_input_tokens_seen": 15055248, "step": 26100 }, { "epoch": 3.8881441763479296, "grad_norm": 1.329455852508545, "learning_rate": 3.8336707690826676e-05, "loss": 0.5997, "num_input_tokens_seen": 15058032, "step": 26105 }, { "epoch": 3.888888888888889, "grad_norm": 0.8830405473709106, "learning_rate": 3.8331210388603155e-05, "loss": 0.4778, "num_input_tokens_seen": 15060592, "step": 26110 }, { "epoch": 3.889633601429848, "grad_norm": 1.4192270040512085, "learning_rate": 3.8325712185510635e-05, "loss": 0.6171, "num_input_tokens_seen": 15063728, "step": 26115 }, { "epoch": 3.8903783139708072, "grad_norm": 2.856971025466919, "learning_rate": 3.8320213081920664e-05, "loss": 0.7728, "num_input_tokens_seen": 15066704, "step": 26120 }, { "epoch": 3.8911230265117664, "grad_norm": 1.0062549114227295, "learning_rate": 3.831471307820485e-05, "loss": 0.6513, "num_input_tokens_seen": 15069584, "step": 26125 }, { "epoch": 3.8918677390527256, "grad_norm": 1.3087332248687744, "learning_rate": 3.8309212174734856e-05, "loss": 0.8099, "num_input_tokens_seen": 15072368, "step": 26130 }, { "epoch": 3.892612451593685, "grad_norm": 2.078976631164551, "learning_rate": 3.8303710371882414e-05, "loss": 0.7233, "num_input_tokens_seen": 15076432, "step": 26135 }, { "epoch": 3.893357164134644, "grad_norm": 1.7592151165008545, "learning_rate": 3.8298207670019315e-05, "loss": 0.5295, "num_input_tokens_seen": 15079216, "step": 26140 }, { "epoch": 3.8941018766756033, "grad_norm": 1.5739445686340332, "learning_rate": 3.82927040695174e-05, "loss": 0.66, "num_input_tokens_seen": 15082096, "step": 26145 }, { "epoch": 3.8948465892165625, "grad_norm": 1.1111736297607422, "learning_rate": 3.828719957074861e-05, "loss": 0.6439, "num_input_tokens_seen": 15084784, "step": 26150 }, { "epoch": 3.8955913017575217, "grad_norm": 0.900907576084137, "learning_rate": 3.828169417408488e-05, "loss": 0.6193, "num_input_tokens_seen": 15087728, "step": 26155 }, { "epoch": 3.896336014298481, "grad_norm": 0.8570179343223572, "learning_rate": 3.8276187879898255e-05, "loss": 0.598, "num_input_tokens_seen": 15090512, "step": 26160 }, { "epoch": 3.89708072683944, "grad_norm": 1.5188342332839966, "learning_rate": 3.827068068856083e-05, "loss": 0.6175, "num_input_tokens_seen": 15093648, "step": 26165 }, { "epoch": 3.8978254393803993, "grad_norm": 0.703296422958374, "learning_rate": 3.826517260044477e-05, "loss": 0.5993, "num_input_tokens_seen": 15096432, "step": 26170 }, { "epoch": 3.8985701519213585, "grad_norm": 1.5788135528564453, "learning_rate": 3.825966361592227e-05, "loss": 0.5557, "num_input_tokens_seen": 15099344, "step": 26175 }, { "epoch": 3.8993148644623177, "grad_norm": 1.7875306606292725, "learning_rate": 3.8254153735365614e-05, "loss": 0.7272, "num_input_tokens_seen": 15102192, "step": 26180 }, { "epoch": 3.900059577003277, "grad_norm": 0.6692148447036743, "learning_rate": 3.8248642959147136e-05, "loss": 0.7036, "num_input_tokens_seen": 15104752, "step": 26185 }, { "epoch": 3.900804289544236, "grad_norm": 1.0255711078643799, "learning_rate": 3.8243131287639234e-05, "loss": 0.6462, "num_input_tokens_seen": 15107472, "step": 26190 }, { "epoch": 3.9015490020851953, "grad_norm": 1.5069820880889893, "learning_rate": 3.823761872121436e-05, "loss": 0.502, "num_input_tokens_seen": 15110608, "step": 26195 }, { "epoch": 3.902293714626154, "grad_norm": 2.057274103164673, "learning_rate": 3.823210526024503e-05, "loss": 0.6801, "num_input_tokens_seen": 15113392, "step": 26200 }, { "epoch": 3.9030384271671137, "grad_norm": 0.9694322943687439, "learning_rate": 3.822659090510383e-05, "loss": 0.7951, "num_input_tokens_seen": 15116208, "step": 26205 }, { "epoch": 3.9037831397080724, "grad_norm": 1.531943917274475, "learning_rate": 3.822107565616339e-05, "loss": 0.7807, "num_input_tokens_seen": 15119024, "step": 26210 }, { "epoch": 3.904527852249032, "grad_norm": 0.9039557576179504, "learning_rate": 3.8215559513796405e-05, "loss": 0.5556, "num_input_tokens_seen": 15121616, "step": 26215 }, { "epoch": 3.905272564789991, "grad_norm": 1.8117139339447021, "learning_rate": 3.821004247837564e-05, "loss": 0.7335, "num_input_tokens_seen": 15124528, "step": 26220 }, { "epoch": 3.9060172773309505, "grad_norm": 0.8532721996307373, "learning_rate": 3.820452455027391e-05, "loss": 0.5509, "num_input_tokens_seen": 15127856, "step": 26225 }, { "epoch": 3.9067619898719093, "grad_norm": 0.947372317314148, "learning_rate": 3.819900572986411e-05, "loss": 0.5444, "num_input_tokens_seen": 15130800, "step": 26230 }, { "epoch": 3.907506702412869, "grad_norm": 0.9185773134231567, "learning_rate": 3.8193486017519157e-05, "loss": 0.5488, "num_input_tokens_seen": 15133680, "step": 26235 }, { "epoch": 3.9082514149538277, "grad_norm": 0.9545327425003052, "learning_rate": 3.818796541361206e-05, "loss": 0.6256, "num_input_tokens_seen": 15136880, "step": 26240 }, { "epoch": 3.908996127494787, "grad_norm": 1.1699086427688599, "learning_rate": 3.8182443918515874e-05, "loss": 0.6297, "num_input_tokens_seen": 15139952, "step": 26245 }, { "epoch": 3.909740840035746, "grad_norm": 1.2223296165466309, "learning_rate": 3.817692153260374e-05, "loss": 0.5393, "num_input_tokens_seen": 15142672, "step": 26250 }, { "epoch": 3.9104855525767053, "grad_norm": 0.792998194694519, "learning_rate": 3.817139825624881e-05, "loss": 0.7684, "num_input_tokens_seen": 15145520, "step": 26255 }, { "epoch": 3.9112302651176645, "grad_norm": 1.9126595258712769, "learning_rate": 3.8165874089824336e-05, "loss": 0.7337, "num_input_tokens_seen": 15148432, "step": 26260 }, { "epoch": 3.9119749776586237, "grad_norm": 0.7231070399284363, "learning_rate": 3.816034903370362e-05, "loss": 0.6921, "num_input_tokens_seen": 15151504, "step": 26265 }, { "epoch": 3.912719690199583, "grad_norm": 0.6365065574645996, "learning_rate": 3.8154823088260026e-05, "loss": 0.6932, "num_input_tokens_seen": 15154320, "step": 26270 }, { "epoch": 3.913464402740542, "grad_norm": 0.805776059627533, "learning_rate": 3.8149296253866975e-05, "loss": 0.6842, "num_input_tokens_seen": 15157456, "step": 26275 }, { "epoch": 3.9142091152815013, "grad_norm": 0.9932712912559509, "learning_rate": 3.8143768530897935e-05, "loss": 0.5598, "num_input_tokens_seen": 15160528, "step": 26280 }, { "epoch": 3.9149538278224605, "grad_norm": 1.426714301109314, "learning_rate": 3.813823991972646e-05, "loss": 0.728, "num_input_tokens_seen": 15163184, "step": 26285 }, { "epoch": 3.9156985403634197, "grad_norm": 1.4477858543395996, "learning_rate": 3.8132710420726146e-05, "loss": 0.6365, "num_input_tokens_seen": 15165904, "step": 26290 }, { "epoch": 3.916443252904379, "grad_norm": 1.6153643131256104, "learning_rate": 3.812718003427066e-05, "loss": 0.6422, "num_input_tokens_seen": 15168624, "step": 26295 }, { "epoch": 3.917187965445338, "grad_norm": 0.7399163246154785, "learning_rate": 3.812164876073371e-05, "loss": 0.6424, "num_input_tokens_seen": 15171440, "step": 26300 }, { "epoch": 3.9179326779862973, "grad_norm": 1.8112926483154297, "learning_rate": 3.8116116600489096e-05, "loss": 0.742, "num_input_tokens_seen": 15174352, "step": 26305 }, { "epoch": 3.9186773905272565, "grad_norm": 0.7486317753791809, "learning_rate": 3.8110583553910644e-05, "loss": 0.4925, "num_input_tokens_seen": 15177328, "step": 26310 }, { "epoch": 3.9194221030682157, "grad_norm": 1.2755153179168701, "learning_rate": 3.810504962137226e-05, "loss": 0.572, "num_input_tokens_seen": 15180080, "step": 26315 }, { "epoch": 3.920166815609175, "grad_norm": 1.6342211961746216, "learning_rate": 3.8099514803247905e-05, "loss": 0.7886, "num_input_tokens_seen": 15183024, "step": 26320 }, { "epoch": 3.920911528150134, "grad_norm": 0.7709881663322449, "learning_rate": 3.809397909991159e-05, "loss": 0.6728, "num_input_tokens_seen": 15185712, "step": 26325 }, { "epoch": 3.9216562406910933, "grad_norm": 1.130432367324829, "learning_rate": 3.808844251173741e-05, "loss": 0.6211, "num_input_tokens_seen": 15188336, "step": 26330 }, { "epoch": 3.9224009532320525, "grad_norm": 2.2681679725646973, "learning_rate": 3.8082905039099496e-05, "loss": 0.6458, "num_input_tokens_seen": 15191056, "step": 26335 }, { "epoch": 3.9231456657730117, "grad_norm": 0.9750680327415466, "learning_rate": 3.8077366682372056e-05, "loss": 0.644, "num_input_tokens_seen": 15193680, "step": 26340 }, { "epoch": 3.923890378313971, "grad_norm": 0.759820818901062, "learning_rate": 3.807182744192934e-05, "loss": 0.5518, "num_input_tokens_seen": 15196432, "step": 26345 }, { "epoch": 3.92463509085493, "grad_norm": 2.134735345840454, "learning_rate": 3.806628731814568e-05, "loss": 0.7083, "num_input_tokens_seen": 15199120, "step": 26350 }, { "epoch": 3.9253798033958893, "grad_norm": 1.1467841863632202, "learning_rate": 3.806074631139543e-05, "loss": 0.5036, "num_input_tokens_seen": 15201936, "step": 26355 }, { "epoch": 3.9261245159368485, "grad_norm": 2.6409082412719727, "learning_rate": 3.805520442205306e-05, "loss": 0.5895, "num_input_tokens_seen": 15205040, "step": 26360 }, { "epoch": 3.9268692284778077, "grad_norm": 2.060950756072998, "learning_rate": 3.804966165049304e-05, "loss": 0.7347, "num_input_tokens_seen": 15207632, "step": 26365 }, { "epoch": 3.927613941018767, "grad_norm": 1.2988568544387817, "learning_rate": 3.8044117997089954e-05, "loss": 0.6495, "num_input_tokens_seen": 15210320, "step": 26370 }, { "epoch": 3.9283586535597257, "grad_norm": 2.0688114166259766, "learning_rate": 3.803857346221841e-05, "loss": 0.7435, "num_input_tokens_seen": 15213296, "step": 26375 }, { "epoch": 3.9291033661006853, "grad_norm": 0.9111633896827698, "learning_rate": 3.803302804625307e-05, "loss": 0.6728, "num_input_tokens_seen": 15216112, "step": 26380 }, { "epoch": 3.929848078641644, "grad_norm": 1.0951000452041626, "learning_rate": 3.80274817495687e-05, "loss": 0.5671, "num_input_tokens_seen": 15218864, "step": 26385 }, { "epoch": 3.9305927911826037, "grad_norm": 1.771951675415039, "learning_rate": 3.8021934572540065e-05, "loss": 0.7455, "num_input_tokens_seen": 15221904, "step": 26390 }, { "epoch": 3.9313375037235625, "grad_norm": 0.8312506675720215, "learning_rate": 3.8016386515542035e-05, "loss": 0.5667, "num_input_tokens_seen": 15224656, "step": 26395 }, { "epoch": 3.932082216264522, "grad_norm": 1.378417730331421, "learning_rate": 3.8010837578949527e-05, "loss": 0.5855, "num_input_tokens_seen": 15227344, "step": 26400 }, { "epoch": 3.932826928805481, "grad_norm": 1.0600426197052002, "learning_rate": 3.800528776313752e-05, "loss": 0.6051, "num_input_tokens_seen": 15230320, "step": 26405 }, { "epoch": 3.9335716413464406, "grad_norm": 1.1736071109771729, "learning_rate": 3.799973706848103e-05, "loss": 0.6773, "num_input_tokens_seen": 15233232, "step": 26410 }, { "epoch": 3.9343163538873993, "grad_norm": 1.0793558359146118, "learning_rate": 3.799418549535517e-05, "loss": 0.5255, "num_input_tokens_seen": 15236144, "step": 26415 }, { "epoch": 3.9350610664283585, "grad_norm": 1.264292597770691, "learning_rate": 3.798863304413509e-05, "loss": 0.6306, "num_input_tokens_seen": 15239376, "step": 26420 }, { "epoch": 3.9358057789693177, "grad_norm": 1.1538342237472534, "learning_rate": 3.7983079715195984e-05, "loss": 0.6539, "num_input_tokens_seen": 15242128, "step": 26425 }, { "epoch": 3.936550491510277, "grad_norm": 1.421573281288147, "learning_rate": 3.7977525508913145e-05, "loss": 0.6223, "num_input_tokens_seen": 15245040, "step": 26430 }, { "epoch": 3.937295204051236, "grad_norm": 0.6881672739982605, "learning_rate": 3.797197042566189e-05, "loss": 0.6499, "num_input_tokens_seen": 15248368, "step": 26435 }, { "epoch": 3.9380399165921953, "grad_norm": 0.9885215163230896, "learning_rate": 3.796641446581762e-05, "loss": 0.7126, "num_input_tokens_seen": 15250992, "step": 26440 }, { "epoch": 3.9387846291331545, "grad_norm": 0.709990382194519, "learning_rate": 3.796085762975577e-05, "loss": 0.6641, "num_input_tokens_seen": 15253968, "step": 26445 }, { "epoch": 3.9395293416741137, "grad_norm": 0.6715373396873474, "learning_rate": 3.7955299917851864e-05, "loss": 0.6507, "num_input_tokens_seen": 15256944, "step": 26450 }, { "epoch": 3.940274054215073, "grad_norm": 1.7023930549621582, "learning_rate": 3.794974133048146e-05, "loss": 0.629, "num_input_tokens_seen": 15259504, "step": 26455 }, { "epoch": 3.941018766756032, "grad_norm": 0.9047276377677917, "learning_rate": 3.794418186802018e-05, "loss": 0.6302, "num_input_tokens_seen": 15262320, "step": 26460 }, { "epoch": 3.9417634792969913, "grad_norm": 1.1501466035842896, "learning_rate": 3.793862153084372e-05, "loss": 0.6797, "num_input_tokens_seen": 15264880, "step": 26465 }, { "epoch": 3.9425081918379505, "grad_norm": 1.5501290559768677, "learning_rate": 3.793306031932783e-05, "loss": 0.7797, "num_input_tokens_seen": 15267856, "step": 26470 }, { "epoch": 3.9432529043789097, "grad_norm": 2.5613584518432617, "learning_rate": 3.79274982338483e-05, "loss": 0.7026, "num_input_tokens_seen": 15270800, "step": 26475 }, { "epoch": 3.943997616919869, "grad_norm": 0.8390129804611206, "learning_rate": 3.7921935274780994e-05, "loss": 0.5525, "num_input_tokens_seen": 15273936, "step": 26480 }, { "epoch": 3.944742329460828, "grad_norm": 1.6688958406448364, "learning_rate": 3.791637144250184e-05, "loss": 0.6771, "num_input_tokens_seen": 15276816, "step": 26485 }, { "epoch": 3.9454870420017873, "grad_norm": 2.291971445083618, "learning_rate": 3.791080673738682e-05, "loss": 0.6001, "num_input_tokens_seen": 15279632, "step": 26490 }, { "epoch": 3.9462317545427466, "grad_norm": 1.5213866233825684, "learning_rate": 3.790524115981198e-05, "loss": 0.5665, "num_input_tokens_seen": 15282288, "step": 26495 }, { "epoch": 3.9469764670837058, "grad_norm": 1.0484899282455444, "learning_rate": 3.78996747101534e-05, "loss": 0.7521, "num_input_tokens_seen": 15285008, "step": 26500 }, { "epoch": 3.947721179624665, "grad_norm": 1.4423091411590576, "learning_rate": 3.789410738878726e-05, "loss": 0.6329, "num_input_tokens_seen": 15287728, "step": 26505 }, { "epoch": 3.948465892165624, "grad_norm": 0.9405596852302551, "learning_rate": 3.7888539196089755e-05, "loss": 0.7028, "num_input_tokens_seen": 15290384, "step": 26510 }, { "epoch": 3.9492106047065834, "grad_norm": 0.8558295965194702, "learning_rate": 3.788297013243718e-05, "loss": 0.5805, "num_input_tokens_seen": 15293136, "step": 26515 }, { "epoch": 3.9499553172475426, "grad_norm": 0.9631670117378235, "learning_rate": 3.7877400198205856e-05, "loss": 0.6097, "num_input_tokens_seen": 15296176, "step": 26520 }, { "epoch": 3.9507000297885018, "grad_norm": 1.5467976331710815, "learning_rate": 3.7871829393772185e-05, "loss": 0.6729, "num_input_tokens_seen": 15298800, "step": 26525 }, { "epoch": 3.951444742329461, "grad_norm": 1.5581938028335571, "learning_rate": 3.786625771951261e-05, "loss": 0.6031, "num_input_tokens_seen": 15301680, "step": 26530 }, { "epoch": 3.95218945487042, "grad_norm": 1.2364482879638672, "learning_rate": 3.7860685175803654e-05, "loss": 0.6076, "num_input_tokens_seen": 15304432, "step": 26535 }, { "epoch": 3.9529341674113794, "grad_norm": 1.0084741115570068, "learning_rate": 3.785511176302189e-05, "loss": 0.4331, "num_input_tokens_seen": 15307184, "step": 26540 }, { "epoch": 3.9536788799523386, "grad_norm": 1.3510125875473022, "learning_rate": 3.784953748154393e-05, "loss": 0.6559, "num_input_tokens_seen": 15309968, "step": 26545 }, { "epoch": 3.9544235924932973, "grad_norm": 2.001277446746826, "learning_rate": 3.784396233174647e-05, "loss": 0.6122, "num_input_tokens_seen": 15313008, "step": 26550 }, { "epoch": 3.955168305034257, "grad_norm": 2.3530242443084717, "learning_rate": 3.7838386314006256e-05, "loss": 1.0015, "num_input_tokens_seen": 15315824, "step": 26555 }, { "epoch": 3.9559130175752157, "grad_norm": 1.3449910879135132, "learning_rate": 3.78328094287001e-05, "loss": 0.7642, "num_input_tokens_seen": 15318576, "step": 26560 }, { "epoch": 3.9566577301161754, "grad_norm": 1.028430700302124, "learning_rate": 3.782723167620484e-05, "loss": 0.6706, "num_input_tokens_seen": 15321456, "step": 26565 }, { "epoch": 3.957402442657134, "grad_norm": 1.248078465461731, "learning_rate": 3.782165305689743e-05, "loss": 0.7015, "num_input_tokens_seen": 15324528, "step": 26570 }, { "epoch": 3.958147155198094, "grad_norm": 0.646062433719635, "learning_rate": 3.781607357115483e-05, "loss": 0.8019, "num_input_tokens_seen": 15327472, "step": 26575 }, { "epoch": 3.9588918677390526, "grad_norm": 1.2301297187805176, "learning_rate": 3.7810493219354083e-05, "loss": 0.8327, "num_input_tokens_seen": 15330064, "step": 26580 }, { "epoch": 3.9596365802800118, "grad_norm": 1.686387300491333, "learning_rate": 3.780491200187228e-05, "loss": 0.5491, "num_input_tokens_seen": 15333136, "step": 26585 }, { "epoch": 3.960381292820971, "grad_norm": 1.445967197418213, "learning_rate": 3.77993299190866e-05, "loss": 0.6257, "num_input_tokens_seen": 15335888, "step": 26590 }, { "epoch": 3.96112600536193, "grad_norm": 0.6932445168495178, "learning_rate": 3.7793746971374236e-05, "loss": 0.694, "num_input_tokens_seen": 15338864, "step": 26595 }, { "epoch": 3.9618707179028894, "grad_norm": 1.1352970600128174, "learning_rate": 3.7788163159112467e-05, "loss": 0.7019, "num_input_tokens_seen": 15341616, "step": 26600 }, { "epoch": 3.9626154304438486, "grad_norm": 0.8609141707420349, "learning_rate": 3.778257848267863e-05, "loss": 0.7873, "num_input_tokens_seen": 15344496, "step": 26605 }, { "epoch": 3.9633601429848078, "grad_norm": 1.592638611793518, "learning_rate": 3.7776992942450097e-05, "loss": 0.6158, "num_input_tokens_seen": 15347376, "step": 26610 }, { "epoch": 3.964104855525767, "grad_norm": 1.07435941696167, "learning_rate": 3.777140653880434e-05, "loss": 0.5795, "num_input_tokens_seen": 15350224, "step": 26615 }, { "epoch": 3.964849568066726, "grad_norm": 1.6947259902954102, "learning_rate": 3.776581927211885e-05, "loss": 0.6049, "num_input_tokens_seen": 15352848, "step": 26620 }, { "epoch": 3.9655942806076854, "grad_norm": 0.9608012437820435, "learning_rate": 3.7760231142771194e-05, "loss": 0.7167, "num_input_tokens_seen": 15355632, "step": 26625 }, { "epoch": 3.9663389931486446, "grad_norm": 1.3722895383834839, "learning_rate": 3.7754642151139e-05, "loss": 0.7786, "num_input_tokens_seen": 15358640, "step": 26630 }, { "epoch": 3.967083705689604, "grad_norm": 1.553608775138855, "learning_rate": 3.774905229759994e-05, "loss": 0.8391, "num_input_tokens_seen": 15361488, "step": 26635 }, { "epoch": 3.967828418230563, "grad_norm": 1.2426364421844482, "learning_rate": 3.7743461582531767e-05, "loss": 0.6438, "num_input_tokens_seen": 15364240, "step": 26640 }, { "epoch": 3.968573130771522, "grad_norm": 1.1257246732711792, "learning_rate": 3.773787000631226e-05, "loss": 0.7018, "num_input_tokens_seen": 15367056, "step": 26645 }, { "epoch": 3.9693178433124814, "grad_norm": 0.736152708530426, "learning_rate": 3.77322775693193e-05, "loss": 0.6716, "num_input_tokens_seen": 15369904, "step": 26650 }, { "epoch": 3.9700625558534406, "grad_norm": 0.8563762903213501, "learning_rate": 3.772668427193078e-05, "loss": 0.5725, "num_input_tokens_seen": 15372752, "step": 26655 }, { "epoch": 3.9708072683944, "grad_norm": 2.029810905456543, "learning_rate": 3.772109011452468e-05, "loss": 0.7329, "num_input_tokens_seen": 15375472, "step": 26660 }, { "epoch": 3.971551980935359, "grad_norm": 1.5182573795318604, "learning_rate": 3.771549509747903e-05, "loss": 0.7581, "num_input_tokens_seen": 15378640, "step": 26665 }, { "epoch": 3.972296693476318, "grad_norm": 1.579393744468689, "learning_rate": 3.7709899221171924e-05, "loss": 0.5186, "num_input_tokens_seen": 15381616, "step": 26670 }, { "epoch": 3.9730414060172774, "grad_norm": 2.4034183025360107, "learning_rate": 3.7704302485981504e-05, "loss": 0.5964, "num_input_tokens_seen": 15384464, "step": 26675 }, { "epoch": 3.9737861185582366, "grad_norm": 1.0947569608688354, "learning_rate": 3.769870489228596e-05, "loss": 0.6531, "num_input_tokens_seen": 15387472, "step": 26680 }, { "epoch": 3.974530831099196, "grad_norm": 0.5733835101127625, "learning_rate": 3.769310644046359e-05, "loss": 0.4847, "num_input_tokens_seen": 15390608, "step": 26685 }, { "epoch": 3.975275543640155, "grad_norm": 1.5583699941635132, "learning_rate": 3.768750713089267e-05, "loss": 0.6734, "num_input_tokens_seen": 15393392, "step": 26690 }, { "epoch": 3.976020256181114, "grad_norm": 1.295211911201477, "learning_rate": 3.768190696395162e-05, "loss": 0.6141, "num_input_tokens_seen": 15396272, "step": 26695 }, { "epoch": 3.9767649687220734, "grad_norm": 2.738750696182251, "learning_rate": 3.767630594001885e-05, "loss": 0.7189, "num_input_tokens_seen": 15398896, "step": 26700 }, { "epoch": 3.9775096812630326, "grad_norm": 2.5924415588378906, "learning_rate": 3.767070405947287e-05, "loss": 0.7406, "num_input_tokens_seen": 15401936, "step": 26705 }, { "epoch": 3.978254393803992, "grad_norm": 4.565549373626709, "learning_rate": 3.7665101322692206e-05, "loss": 0.6451, "num_input_tokens_seen": 15404880, "step": 26710 }, { "epoch": 3.9789991063449506, "grad_norm": 1.7513586282730103, "learning_rate": 3.765949773005551e-05, "loss": 0.7048, "num_input_tokens_seen": 15407760, "step": 26715 }, { "epoch": 3.9797438188859102, "grad_norm": 1.310675024986267, "learning_rate": 3.7653893281941425e-05, "loss": 0.7079, "num_input_tokens_seen": 15410864, "step": 26720 }, { "epoch": 3.980488531426869, "grad_norm": 1.9328261613845825, "learning_rate": 3.764828797872866e-05, "loss": 0.4995, "num_input_tokens_seen": 15413680, "step": 26725 }, { "epoch": 3.9812332439678286, "grad_norm": 2.1830461025238037, "learning_rate": 3.764268182079603e-05, "loss": 0.6769, "num_input_tokens_seen": 15416592, "step": 26730 }, { "epoch": 3.9819779565087874, "grad_norm": 1.4784488677978516, "learning_rate": 3.7637074808522365e-05, "loss": 0.5161, "num_input_tokens_seen": 15419792, "step": 26735 }, { "epoch": 3.982722669049747, "grad_norm": 1.3245279788970947, "learning_rate": 3.763146694228657e-05, "loss": 0.7121, "num_input_tokens_seen": 15422352, "step": 26740 }, { "epoch": 3.983467381590706, "grad_norm": 0.8902844786643982, "learning_rate": 3.762585822246758e-05, "loss": 0.5629, "num_input_tokens_seen": 15425232, "step": 26745 }, { "epoch": 3.9842120941316654, "grad_norm": 1.505942940711975, "learning_rate": 3.762024864944443e-05, "loss": 0.8016, "num_input_tokens_seen": 15428016, "step": 26750 }, { "epoch": 3.984956806672624, "grad_norm": 1.1730495691299438, "learning_rate": 3.761463822359619e-05, "loss": 0.5651, "num_input_tokens_seen": 15430576, "step": 26755 }, { "epoch": 3.9857015192135834, "grad_norm": 0.78440260887146, "learning_rate": 3.760902694530198e-05, "loss": 0.7401, "num_input_tokens_seen": 15433424, "step": 26760 }, { "epoch": 3.9864462317545426, "grad_norm": 0.8254742622375488, "learning_rate": 3.7603414814940995e-05, "loss": 0.6337, "num_input_tokens_seen": 15436528, "step": 26765 }, { "epoch": 3.987190944295502, "grad_norm": 0.8852694630622864, "learning_rate": 3.7597801832892475e-05, "loss": 0.7574, "num_input_tokens_seen": 15439600, "step": 26770 }, { "epoch": 3.987935656836461, "grad_norm": 1.9886765480041504, "learning_rate": 3.759218799953574e-05, "loss": 0.6838, "num_input_tokens_seen": 15442416, "step": 26775 }, { "epoch": 3.98868036937742, "grad_norm": 1.3155473470687866, "learning_rate": 3.758657331525012e-05, "loss": 0.7172, "num_input_tokens_seen": 15445392, "step": 26780 }, { "epoch": 3.9894250819183794, "grad_norm": 0.7123721241950989, "learning_rate": 3.758095778041506e-05, "loss": 0.5644, "num_input_tokens_seen": 15448272, "step": 26785 }, { "epoch": 3.9901697944593386, "grad_norm": 0.9866077899932861, "learning_rate": 3.757534139541002e-05, "loss": 0.4421, "num_input_tokens_seen": 15451184, "step": 26790 }, { "epoch": 3.990914507000298, "grad_norm": 1.0318089723587036, "learning_rate": 3.7569724160614536e-05, "loss": 0.5798, "num_input_tokens_seen": 15454256, "step": 26795 }, { "epoch": 3.991659219541257, "grad_norm": 1.2964532375335693, "learning_rate": 3.75641060764082e-05, "loss": 0.7363, "num_input_tokens_seen": 15456848, "step": 26800 }, { "epoch": 3.9924039320822162, "grad_norm": 1.0790919065475464, "learning_rate": 3.755848714317065e-05, "loss": 0.5894, "num_input_tokens_seen": 15459536, "step": 26805 }, { "epoch": 3.9931486446231754, "grad_norm": 1.2078578472137451, "learning_rate": 3.75528673612816e-05, "loss": 0.6654, "num_input_tokens_seen": 15462640, "step": 26810 }, { "epoch": 3.9938933571641346, "grad_norm": 1.1905163526535034, "learning_rate": 3.7547246731120816e-05, "loss": 0.7152, "num_input_tokens_seen": 15465584, "step": 26815 }, { "epoch": 3.994638069705094, "grad_norm": 0.821456789970398, "learning_rate": 3.7541625253068117e-05, "loss": 0.8004, "num_input_tokens_seen": 15468592, "step": 26820 }, { "epoch": 3.995382782246053, "grad_norm": 1.710451602935791, "learning_rate": 3.7536002927503354e-05, "loss": 0.7465, "num_input_tokens_seen": 15471408, "step": 26825 }, { "epoch": 3.9961274947870122, "grad_norm": 0.9605507850646973, "learning_rate": 3.7530379754806494e-05, "loss": 0.6825, "num_input_tokens_seen": 15474384, "step": 26830 }, { "epoch": 3.9968722073279714, "grad_norm": 0.9561650156974792, "learning_rate": 3.752475573535752e-05, "loss": 0.6626, "num_input_tokens_seen": 15477264, "step": 26835 }, { "epoch": 3.9976169198689306, "grad_norm": 1.0791242122650146, "learning_rate": 3.7519130869536465e-05, "loss": 0.609, "num_input_tokens_seen": 15480272, "step": 26840 }, { "epoch": 3.99836163240989, "grad_norm": 0.9610871076583862, "learning_rate": 3.751350515772344e-05, "loss": 0.6716, "num_input_tokens_seen": 15483024, "step": 26845 }, { "epoch": 3.999106344950849, "grad_norm": 1.396167516708374, "learning_rate": 3.7507878600298626e-05, "loss": 0.5745, "num_input_tokens_seen": 15485680, "step": 26850 }, { "epoch": 3.9998510574918082, "grad_norm": 0.8296887278556824, "learning_rate": 3.750225119764223e-05, "loss": 0.6451, "num_input_tokens_seen": 15488912, "step": 26855 }, { "epoch": 4.0, "eval_loss": 0.6477048397064209, "eval_runtime": 74.3137, "eval_samples_per_second": 40.154, "eval_steps_per_second": 10.039, "num_input_tokens_seen": 15489040, "step": 26856 }, { "epoch": 4.000595770032767, "grad_norm": 0.9940418004989624, "learning_rate": 3.749662295013452e-05, "loss": 0.7113, "num_input_tokens_seen": 15491568, "step": 26860 }, { "epoch": 4.001340482573727, "grad_norm": 0.9575234055519104, "learning_rate": 3.7490993858155837e-05, "loss": 0.6548, "num_input_tokens_seen": 15494608, "step": 26865 }, { "epoch": 4.002085195114685, "grad_norm": 0.8376919031143188, "learning_rate": 3.748536392208658e-05, "loss": 0.5661, "num_input_tokens_seen": 15497744, "step": 26870 }, { "epoch": 4.002829907655645, "grad_norm": 0.6737295389175415, "learning_rate": 3.74797331423072e-05, "loss": 0.6105, "num_input_tokens_seen": 15500624, "step": 26875 }, { "epoch": 4.003574620196604, "grad_norm": 1.0733121633529663, "learning_rate": 3.747410151919817e-05, "loss": 0.6949, "num_input_tokens_seen": 15503856, "step": 26880 }, { "epoch": 4.0043193327375635, "grad_norm": 1.1529415845870972, "learning_rate": 3.746846905314009e-05, "loss": 0.6145, "num_input_tokens_seen": 15506704, "step": 26885 }, { "epoch": 4.005064045278522, "grad_norm": 1.2744941711425781, "learning_rate": 3.746283574451356e-05, "loss": 0.7355, "num_input_tokens_seen": 15509488, "step": 26890 }, { "epoch": 4.005808757819482, "grad_norm": 1.5112718343734741, "learning_rate": 3.7457201593699264e-05, "loss": 0.6375, "num_input_tokens_seen": 15512272, "step": 26895 }, { "epoch": 4.006553470360441, "grad_norm": 1.1652767658233643, "learning_rate": 3.7451566601077936e-05, "loss": 0.7449, "num_input_tokens_seen": 15515120, "step": 26900 }, { "epoch": 4.0072981829014, "grad_norm": 1.299646019935608, "learning_rate": 3.744593076703035e-05, "loss": 0.8259, "num_input_tokens_seen": 15517936, "step": 26905 }, { "epoch": 4.008042895442359, "grad_norm": 0.7141385078430176, "learning_rate": 3.744029409193737e-05, "loss": 0.75, "num_input_tokens_seen": 15520624, "step": 26910 }, { "epoch": 4.008787607983319, "grad_norm": 1.4235308170318604, "learning_rate": 3.7434656576179894e-05, "loss": 0.6046, "num_input_tokens_seen": 15523600, "step": 26915 }, { "epoch": 4.009532320524277, "grad_norm": 0.9841150641441345, "learning_rate": 3.742901822013889e-05, "loss": 0.6862, "num_input_tokens_seen": 15526320, "step": 26920 }, { "epoch": 4.010277033065237, "grad_norm": 1.1552369594573975, "learning_rate": 3.7423379024195355e-05, "loss": 0.6967, "num_input_tokens_seen": 15529264, "step": 26925 }, { "epoch": 4.011021745606196, "grad_norm": 2.0349185466766357, "learning_rate": 3.7417738988730375e-05, "loss": 0.5817, "num_input_tokens_seen": 15532048, "step": 26930 }, { "epoch": 4.0117664581471555, "grad_norm": 1.1358684301376343, "learning_rate": 3.7412098114125094e-05, "loss": 0.6301, "num_input_tokens_seen": 15534864, "step": 26935 }, { "epoch": 4.012511170688114, "grad_norm": 1.7592109441757202, "learning_rate": 3.740645640076068e-05, "loss": 0.5442, "num_input_tokens_seen": 15537776, "step": 26940 }, { "epoch": 4.013255883229074, "grad_norm": 1.4136919975280762, "learning_rate": 3.740081384901837e-05, "loss": 0.6191, "num_input_tokens_seen": 15540560, "step": 26945 }, { "epoch": 4.014000595770033, "grad_norm": 2.0027878284454346, "learning_rate": 3.7395170459279494e-05, "loss": 0.6101, "num_input_tokens_seen": 15543568, "step": 26950 }, { "epoch": 4.014745308310992, "grad_norm": 1.0029089450836182, "learning_rate": 3.738952623192539e-05, "loss": 0.6724, "num_input_tokens_seen": 15546544, "step": 26955 }, { "epoch": 4.015490020851951, "grad_norm": 1.1023969650268555, "learning_rate": 3.738388116733748e-05, "loss": 0.536, "num_input_tokens_seen": 15549744, "step": 26960 }, { "epoch": 4.016234733392911, "grad_norm": 1.392652153968811, "learning_rate": 3.737823526589722e-05, "loss": 0.7443, "num_input_tokens_seen": 15552912, "step": 26965 }, { "epoch": 4.0169794459338695, "grad_norm": 1.0175422430038452, "learning_rate": 3.737258852798615e-05, "loss": 0.4248, "num_input_tokens_seen": 15555696, "step": 26970 }, { "epoch": 4.017724158474829, "grad_norm": 1.4652423858642578, "learning_rate": 3.736694095398585e-05, "loss": 0.625, "num_input_tokens_seen": 15558608, "step": 26975 }, { "epoch": 4.018468871015788, "grad_norm": 1.0237220525741577, "learning_rate": 3.736129254427796e-05, "loss": 0.5117, "num_input_tokens_seen": 15561168, "step": 26980 }, { "epoch": 4.0192135835567475, "grad_norm": 1.21976637840271, "learning_rate": 3.735564329924419e-05, "loss": 0.6862, "num_input_tokens_seen": 15564016, "step": 26985 }, { "epoch": 4.019958296097706, "grad_norm": 1.487268328666687, "learning_rate": 3.734999321926626e-05, "loss": 0.6184, "num_input_tokens_seen": 15567088, "step": 26990 }, { "epoch": 4.020703008638666, "grad_norm": 0.9933488368988037, "learning_rate": 3.7344342304726014e-05, "loss": 0.5276, "num_input_tokens_seen": 15570128, "step": 26995 }, { "epoch": 4.021447721179625, "grad_norm": 1.0743073225021362, "learning_rate": 3.73386905560053e-05, "loss": 0.5292, "num_input_tokens_seen": 15572912, "step": 27000 }, { "epoch": 4.022192433720583, "grad_norm": 0.9645305275917053, "learning_rate": 3.733303797348604e-05, "loss": 0.6736, "num_input_tokens_seen": 15576080, "step": 27005 }, { "epoch": 4.022937146261543, "grad_norm": 1.3208682537078857, "learning_rate": 3.732738455755022e-05, "loss": 0.713, "num_input_tokens_seen": 15579056, "step": 27010 }, { "epoch": 4.023681858802502, "grad_norm": 1.4409313201904297, "learning_rate": 3.732173030857987e-05, "loss": 0.7387, "num_input_tokens_seen": 15582064, "step": 27015 }, { "epoch": 4.0244265713434615, "grad_norm": 0.7657955288887024, "learning_rate": 3.731607522695709e-05, "loss": 0.7922, "num_input_tokens_seen": 15585072, "step": 27020 }, { "epoch": 4.02517128388442, "grad_norm": 1.956300973892212, "learning_rate": 3.731041931306401e-05, "loss": 0.5884, "num_input_tokens_seen": 15588080, "step": 27025 }, { "epoch": 4.02591599642538, "grad_norm": 1.056983470916748, "learning_rate": 3.730476256728284e-05, "loss": 0.5583, "num_input_tokens_seen": 15590864, "step": 27030 }, { "epoch": 4.026660708966339, "grad_norm": 1.036547064781189, "learning_rate": 3.729910498999585e-05, "loss": 0.539, "num_input_tokens_seen": 15593648, "step": 27035 }, { "epoch": 4.027405421507298, "grad_norm": 1.3027827739715576, "learning_rate": 3.729344658158535e-05, "loss": 0.6956, "num_input_tokens_seen": 15596720, "step": 27040 }, { "epoch": 4.028150134048257, "grad_norm": 1.1809625625610352, "learning_rate": 3.7287787342433706e-05, "loss": 0.4805, "num_input_tokens_seen": 15599920, "step": 27045 }, { "epoch": 4.028894846589217, "grad_norm": 1.2117570638656616, "learning_rate": 3.728212727292336e-05, "loss": 0.4561, "num_input_tokens_seen": 15603024, "step": 27050 }, { "epoch": 4.0296395591301755, "grad_norm": 0.8547424077987671, "learning_rate": 3.727646637343678e-05, "loss": 0.5183, "num_input_tokens_seen": 15606192, "step": 27055 }, { "epoch": 4.030384271671135, "grad_norm": 2.8952951431274414, "learning_rate": 3.727080464435652e-05, "loss": 0.7007, "num_input_tokens_seen": 15609136, "step": 27060 }, { "epoch": 4.031128984212094, "grad_norm": 1.262050986289978, "learning_rate": 3.726514208606517e-05, "loss": 0.618, "num_input_tokens_seen": 15611824, "step": 27065 }, { "epoch": 4.0318736967530535, "grad_norm": 1.043731451034546, "learning_rate": 3.725947869894538e-05, "loss": 0.6923, "num_input_tokens_seen": 15614992, "step": 27070 }, { "epoch": 4.032618409294012, "grad_norm": 1.3262128829956055, "learning_rate": 3.725381448337987e-05, "loss": 0.7013, "num_input_tokens_seen": 15617616, "step": 27075 }, { "epoch": 4.033363121834972, "grad_norm": 1.2032217979431152, "learning_rate": 3.72481494397514e-05, "loss": 0.4963, "num_input_tokens_seen": 15620176, "step": 27080 }, { "epoch": 4.034107834375931, "grad_norm": 1.862088918685913, "learning_rate": 3.724248356844278e-05, "loss": 0.5251, "num_input_tokens_seen": 15623024, "step": 27085 }, { "epoch": 4.03485254691689, "grad_norm": 1.591825008392334, "learning_rate": 3.7236816869836896e-05, "loss": 0.6475, "num_input_tokens_seen": 15626448, "step": 27090 }, { "epoch": 4.035597259457849, "grad_norm": 1.1171951293945312, "learning_rate": 3.723114934431669e-05, "loss": 0.6974, "num_input_tokens_seen": 15629360, "step": 27095 }, { "epoch": 4.036341971998809, "grad_norm": 1.0497939586639404, "learning_rate": 3.7225480992265125e-05, "loss": 0.5527, "num_input_tokens_seen": 15632272, "step": 27100 }, { "epoch": 4.0370866845397675, "grad_norm": 1.870345115661621, "learning_rate": 3.721981181406526e-05, "loss": 0.5131, "num_input_tokens_seen": 15635408, "step": 27105 }, { "epoch": 4.037831397080727, "grad_norm": 1.1066429615020752, "learning_rate": 3.721414181010021e-05, "loss": 0.5928, "num_input_tokens_seen": 15638480, "step": 27110 }, { "epoch": 4.038576109621686, "grad_norm": 1.6332327127456665, "learning_rate": 3.72084709807531e-05, "loss": 0.7019, "num_input_tokens_seen": 15641392, "step": 27115 }, { "epoch": 4.0393208221626455, "grad_norm": 1.3968878984451294, "learning_rate": 3.720279932640717e-05, "loss": 0.6184, "num_input_tokens_seen": 15644464, "step": 27120 }, { "epoch": 4.040065534703604, "grad_norm": 1.519565463066101, "learning_rate": 3.7197126847445664e-05, "loss": 0.5785, "num_input_tokens_seen": 15647280, "step": 27125 }, { "epoch": 4.040810247244564, "grad_norm": 1.3967864513397217, "learning_rate": 3.719145354425192e-05, "loss": 0.6394, "num_input_tokens_seen": 15650256, "step": 27130 }, { "epoch": 4.041554959785523, "grad_norm": 1.3573354482650757, "learning_rate": 3.718577941720931e-05, "loss": 0.7781, "num_input_tokens_seen": 15653232, "step": 27135 }, { "epoch": 4.042299672326482, "grad_norm": 0.8710818886756897, "learning_rate": 3.7180104466701274e-05, "loss": 0.499, "num_input_tokens_seen": 15655952, "step": 27140 }, { "epoch": 4.043044384867441, "grad_norm": 1.9613765478134155, "learning_rate": 3.71744286931113e-05, "loss": 0.8341, "num_input_tokens_seen": 15658896, "step": 27145 }, { "epoch": 4.043789097408401, "grad_norm": 2.1701292991638184, "learning_rate": 3.7168752096822924e-05, "loss": 0.7449, "num_input_tokens_seen": 15661872, "step": 27150 }, { "epoch": 4.0445338099493595, "grad_norm": 1.384536623954773, "learning_rate": 3.716307467821976e-05, "loss": 0.7857, "num_input_tokens_seen": 15664848, "step": 27155 }, { "epoch": 4.045278522490319, "grad_norm": 1.107243537902832, "learning_rate": 3.7157396437685465e-05, "loss": 0.602, "num_input_tokens_seen": 15667600, "step": 27160 }, { "epoch": 4.046023235031278, "grad_norm": 2.928711175918579, "learning_rate": 3.715171737560374e-05, "loss": 0.815, "num_input_tokens_seen": 15670512, "step": 27165 }, { "epoch": 4.046767947572237, "grad_norm": 1.1322890520095825, "learning_rate": 3.7146037492358366e-05, "loss": 0.5381, "num_input_tokens_seen": 15673072, "step": 27170 }, { "epoch": 4.047512660113196, "grad_norm": 2.40248966217041, "learning_rate": 3.714035678833316e-05, "loss": 0.7189, "num_input_tokens_seen": 15675728, "step": 27175 }, { "epoch": 4.048257372654155, "grad_norm": 1.1231272220611572, "learning_rate": 3.7134675263912e-05, "loss": 0.6788, "num_input_tokens_seen": 15678320, "step": 27180 }, { "epoch": 4.049002085195115, "grad_norm": 1.5495022535324097, "learning_rate": 3.712899291947882e-05, "loss": 0.6816, "num_input_tokens_seen": 15681360, "step": 27185 }, { "epoch": 4.0497467977360735, "grad_norm": 1.1595910787582397, "learning_rate": 3.7123309755417615e-05, "loss": 0.6064, "num_input_tokens_seen": 15684208, "step": 27190 }, { "epoch": 4.050491510277033, "grad_norm": 0.8684118390083313, "learning_rate": 3.7117625772112416e-05, "loss": 0.5884, "num_input_tokens_seen": 15687632, "step": 27195 }, { "epoch": 4.051236222817992, "grad_norm": 1.8842850923538208, "learning_rate": 3.711194096994736e-05, "loss": 0.627, "num_input_tokens_seen": 15690352, "step": 27200 }, { "epoch": 4.0519809353589515, "grad_norm": 1.1296029090881348, "learning_rate": 3.710625534930655e-05, "loss": 0.7244, "num_input_tokens_seen": 15693040, "step": 27205 }, { "epoch": 4.05272564789991, "grad_norm": 1.0005991458892822, "learning_rate": 3.710056891057423e-05, "loss": 0.4943, "num_input_tokens_seen": 15696400, "step": 27210 }, { "epoch": 4.05347036044087, "grad_norm": 1.2781747579574585, "learning_rate": 3.709488165413467e-05, "loss": 0.5981, "num_input_tokens_seen": 15699568, "step": 27215 }, { "epoch": 4.054215072981829, "grad_norm": 2.643195629119873, "learning_rate": 3.708919358037218e-05, "loss": 0.5825, "num_input_tokens_seen": 15702224, "step": 27220 }, { "epoch": 4.054959785522788, "grad_norm": 0.8276696801185608, "learning_rate": 3.708350468967113e-05, "loss": 0.7578, "num_input_tokens_seen": 15705360, "step": 27225 }, { "epoch": 4.055704498063747, "grad_norm": 4.374104022979736, "learning_rate": 3.707781498241596e-05, "loss": 0.7126, "num_input_tokens_seen": 15708304, "step": 27230 }, { "epoch": 4.056449210604707, "grad_norm": 1.4745464324951172, "learning_rate": 3.707212445899116e-05, "loss": 0.5667, "num_input_tokens_seen": 15711152, "step": 27235 }, { "epoch": 4.0571939231456655, "grad_norm": 1.531754732131958, "learning_rate": 3.7066433119781286e-05, "loss": 0.7164, "num_input_tokens_seen": 15714000, "step": 27240 }, { "epoch": 4.057938635686625, "grad_norm": 1.4621824026107788, "learning_rate": 3.70607409651709e-05, "loss": 0.7122, "num_input_tokens_seen": 15716656, "step": 27245 }, { "epoch": 4.058683348227584, "grad_norm": 0.8395247459411621, "learning_rate": 3.705504799554469e-05, "loss": 0.4699, "num_input_tokens_seen": 15719440, "step": 27250 }, { "epoch": 4.059428060768544, "grad_norm": 0.8431260585784912, "learning_rate": 3.704935421128734e-05, "loss": 0.56, "num_input_tokens_seen": 15722288, "step": 27255 }, { "epoch": 4.060172773309502, "grad_norm": 1.6701529026031494, "learning_rate": 3.704365961278363e-05, "loss": 0.6577, "num_input_tokens_seen": 15725072, "step": 27260 }, { "epoch": 4.060917485850462, "grad_norm": 1.4117258787155151, "learning_rate": 3.7037964200418365e-05, "loss": 0.7138, "num_input_tokens_seen": 15727920, "step": 27265 }, { "epoch": 4.061662198391421, "grad_norm": 1.3155492544174194, "learning_rate": 3.7032267974576415e-05, "loss": 0.6847, "num_input_tokens_seen": 15730736, "step": 27270 }, { "epoch": 4.06240691093238, "grad_norm": 1.1922627687454224, "learning_rate": 3.702657093564272e-05, "loss": 0.5235, "num_input_tokens_seen": 15733552, "step": 27275 }, { "epoch": 4.063151623473339, "grad_norm": 1.1349917650222778, "learning_rate": 3.702087308400226e-05, "loss": 0.6354, "num_input_tokens_seen": 15736464, "step": 27280 }, { "epoch": 4.063896336014299, "grad_norm": 1.47990083694458, "learning_rate": 3.7015174420040074e-05, "loss": 0.5655, "num_input_tokens_seen": 15739312, "step": 27285 }, { "epoch": 4.0646410485552575, "grad_norm": 1.2400097846984863, "learning_rate": 3.7009474944141244e-05, "loss": 0.5895, "num_input_tokens_seen": 15742672, "step": 27290 }, { "epoch": 4.065385761096217, "grad_norm": 1.2475287914276123, "learning_rate": 3.7003774656690924e-05, "loss": 0.6019, "num_input_tokens_seen": 15745488, "step": 27295 }, { "epoch": 4.066130473637176, "grad_norm": 1.554518222808838, "learning_rate": 3.699807355807432e-05, "loss": 0.7128, "num_input_tokens_seen": 15748240, "step": 27300 }, { "epoch": 4.066875186178136, "grad_norm": 1.5912556648254395, "learning_rate": 3.6992371648676685e-05, "loss": 0.7489, "num_input_tokens_seen": 15751088, "step": 27305 }, { "epoch": 4.067619898719094, "grad_norm": 1.1624667644500732, "learning_rate": 3.698666892888332e-05, "loss": 0.6828, "num_input_tokens_seen": 15754192, "step": 27310 }, { "epoch": 4.068364611260054, "grad_norm": 1.3440428972244263, "learning_rate": 3.698096539907962e-05, "loss": 0.6506, "num_input_tokens_seen": 15756976, "step": 27315 }, { "epoch": 4.069109323801013, "grad_norm": 1.0407202243804932, "learning_rate": 3.6975261059650986e-05, "loss": 0.7272, "num_input_tokens_seen": 15760144, "step": 27320 }, { "epoch": 4.069854036341972, "grad_norm": 1.1366790533065796, "learning_rate": 3.696955591098289e-05, "loss": 0.6906, "num_input_tokens_seen": 15763056, "step": 27325 }, { "epoch": 4.070598748882931, "grad_norm": 1.9519544839859009, "learning_rate": 3.696384995346087e-05, "loss": 0.5584, "num_input_tokens_seen": 15765680, "step": 27330 }, { "epoch": 4.071343461423891, "grad_norm": 1.4268336296081543, "learning_rate": 3.6958143187470514e-05, "loss": 0.5773, "num_input_tokens_seen": 15768368, "step": 27335 }, { "epoch": 4.07208817396485, "grad_norm": 1.5670422315597534, "learning_rate": 3.695243561339747e-05, "loss": 0.5582, "num_input_tokens_seen": 15771472, "step": 27340 }, { "epoch": 4.072832886505808, "grad_norm": 1.5870822668075562, "learning_rate": 3.694672723162741e-05, "loss": 0.644, "num_input_tokens_seen": 15774448, "step": 27345 }, { "epoch": 4.073577599046768, "grad_norm": 1.4086488485336304, "learning_rate": 3.69410180425461e-05, "loss": 0.6728, "num_input_tokens_seen": 15777392, "step": 27350 }, { "epoch": 4.074322311587727, "grad_norm": 1.8442655801773071, "learning_rate": 3.693530804653934e-05, "loss": 0.5735, "num_input_tokens_seen": 15780080, "step": 27355 }, { "epoch": 4.075067024128686, "grad_norm": 1.4641133546829224, "learning_rate": 3.692959724399299e-05, "loss": 0.5824, "num_input_tokens_seen": 15782992, "step": 27360 }, { "epoch": 4.075811736669645, "grad_norm": 0.9402620792388916, "learning_rate": 3.692388563529295e-05, "loss": 0.5423, "num_input_tokens_seen": 15786032, "step": 27365 }, { "epoch": 4.076556449210605, "grad_norm": 1.9334385395050049, "learning_rate": 3.6918173220825204e-05, "loss": 0.6213, "num_input_tokens_seen": 15788976, "step": 27370 }, { "epoch": 4.0773011617515635, "grad_norm": 1.241317868232727, "learning_rate": 3.691246000097577e-05, "loss": 0.5579, "num_input_tokens_seen": 15791696, "step": 27375 }, { "epoch": 4.078045874292523, "grad_norm": 3.687540292739868, "learning_rate": 3.6906745976130716e-05, "loss": 0.6474, "num_input_tokens_seen": 15794480, "step": 27380 }, { "epoch": 4.078790586833482, "grad_norm": 1.5961695909500122, "learning_rate": 3.6901031146676185e-05, "loss": 0.58, "num_input_tokens_seen": 15797136, "step": 27385 }, { "epoch": 4.079535299374442, "grad_norm": 1.0798897743225098, "learning_rate": 3.689531551299835e-05, "loss": 0.6865, "num_input_tokens_seen": 15799696, "step": 27390 }, { "epoch": 4.0802800119154, "grad_norm": 2.3153600692749023, "learning_rate": 3.688959907548346e-05, "loss": 0.6217, "num_input_tokens_seen": 15802832, "step": 27395 }, { "epoch": 4.08102472445636, "grad_norm": 1.0250835418701172, "learning_rate": 3.68838818345178e-05, "loss": 0.5523, "num_input_tokens_seen": 15805456, "step": 27400 }, { "epoch": 4.081769436997319, "grad_norm": 1.3189610242843628, "learning_rate": 3.6878163790487726e-05, "loss": 0.581, "num_input_tokens_seen": 15808464, "step": 27405 }, { "epoch": 4.082514149538278, "grad_norm": 0.6221017837524414, "learning_rate": 3.6872444943779624e-05, "loss": 0.5514, "num_input_tokens_seen": 15811184, "step": 27410 }, { "epoch": 4.083258862079237, "grad_norm": 2.5046353340148926, "learning_rate": 3.686672529477998e-05, "loss": 0.728, "num_input_tokens_seen": 15814096, "step": 27415 }, { "epoch": 4.084003574620197, "grad_norm": 1.677051067352295, "learning_rate": 3.686100484387528e-05, "loss": 0.5372, "num_input_tokens_seen": 15817232, "step": 27420 }, { "epoch": 4.084748287161156, "grad_norm": 1.0638536214828491, "learning_rate": 3.685528359145209e-05, "loss": 0.6693, "num_input_tokens_seen": 15820208, "step": 27425 }, { "epoch": 4.085492999702115, "grad_norm": 1.6456198692321777, "learning_rate": 3.6849561537897045e-05, "loss": 0.481, "num_input_tokens_seen": 15822864, "step": 27430 }, { "epoch": 4.086237712243074, "grad_norm": 1.527945876121521, "learning_rate": 3.684383868359681e-05, "loss": 0.7588, "num_input_tokens_seen": 15826256, "step": 27435 }, { "epoch": 4.086982424784034, "grad_norm": 1.8687597513198853, "learning_rate": 3.68381150289381e-05, "loss": 0.6542, "num_input_tokens_seen": 15828880, "step": 27440 }, { "epoch": 4.087727137324992, "grad_norm": 1.7475993633270264, "learning_rate": 3.683239057430771e-05, "loss": 0.6227, "num_input_tokens_seen": 15831760, "step": 27445 }, { "epoch": 4.088471849865952, "grad_norm": 1.9201563596725464, "learning_rate": 3.6826665320092465e-05, "loss": 0.6716, "num_input_tokens_seen": 15834992, "step": 27450 }, { "epoch": 4.089216562406911, "grad_norm": 1.6955718994140625, "learning_rate": 3.682093926667927e-05, "loss": 0.6036, "num_input_tokens_seen": 15837808, "step": 27455 }, { "epoch": 4.08996127494787, "grad_norm": 1.2821868658065796, "learning_rate": 3.681521241445506e-05, "loss": 0.7112, "num_input_tokens_seen": 15840528, "step": 27460 }, { "epoch": 4.090705987488829, "grad_norm": 1.6184415817260742, "learning_rate": 3.6809484763806834e-05, "loss": 0.6739, "num_input_tokens_seen": 15843120, "step": 27465 }, { "epoch": 4.091450700029789, "grad_norm": 1.7594592571258545, "learning_rate": 3.680375631512164e-05, "loss": 0.6843, "num_input_tokens_seen": 15845840, "step": 27470 }, { "epoch": 4.092195412570748, "grad_norm": 2.607802152633667, "learning_rate": 3.679802706878658e-05, "loss": 0.59, "num_input_tokens_seen": 15848720, "step": 27475 }, { "epoch": 4.092940125111707, "grad_norm": 1.4216171503067017, "learning_rate": 3.6792297025188824e-05, "loss": 0.7528, "num_input_tokens_seen": 15851504, "step": 27480 }, { "epoch": 4.093684837652666, "grad_norm": 1.0677943229675293, "learning_rate": 3.6786566184715576e-05, "loss": 0.4858, "num_input_tokens_seen": 15854256, "step": 27485 }, { "epoch": 4.094429550193626, "grad_norm": 1.799202799797058, "learning_rate": 3.67808345477541e-05, "loss": 0.5944, "num_input_tokens_seen": 15857168, "step": 27490 }, { "epoch": 4.095174262734584, "grad_norm": 1.617962121963501, "learning_rate": 3.6775102114691736e-05, "loss": 0.5201, "num_input_tokens_seen": 15859984, "step": 27495 }, { "epoch": 4.095918975275544, "grad_norm": 1.4114344120025635, "learning_rate": 3.676936888591583e-05, "loss": 0.5742, "num_input_tokens_seen": 15862800, "step": 27500 }, { "epoch": 4.096663687816503, "grad_norm": 1.2336896657943726, "learning_rate": 3.6763634861813836e-05, "loss": 0.7449, "num_input_tokens_seen": 15865584, "step": 27505 }, { "epoch": 4.0974084003574625, "grad_norm": 1.6198079586029053, "learning_rate": 3.675790004277322e-05, "loss": 0.8086, "num_input_tokens_seen": 15868560, "step": 27510 }, { "epoch": 4.098153112898421, "grad_norm": 2.1361091136932373, "learning_rate": 3.675216442918153e-05, "loss": 0.8226, "num_input_tokens_seen": 15871280, "step": 27515 }, { "epoch": 4.09889782543938, "grad_norm": 1.850494146347046, "learning_rate": 3.674642802142635e-05, "loss": 0.5296, "num_input_tokens_seen": 15874448, "step": 27520 }, { "epoch": 4.09964253798034, "grad_norm": 0.6722926497459412, "learning_rate": 3.6740690819895304e-05, "loss": 0.5949, "num_input_tokens_seen": 15877456, "step": 27525 }, { "epoch": 4.100387250521298, "grad_norm": 0.738613486289978, "learning_rate": 3.673495282497613e-05, "loss": 0.4719, "num_input_tokens_seen": 15880464, "step": 27530 }, { "epoch": 4.101131963062258, "grad_norm": 3.0145750045776367, "learning_rate": 3.672921403705654e-05, "loss": 0.4718, "num_input_tokens_seen": 15883408, "step": 27535 }, { "epoch": 4.101876675603217, "grad_norm": 1.005469560623169, "learning_rate": 3.672347445652436e-05, "loss": 0.5255, "num_input_tokens_seen": 15886416, "step": 27540 }, { "epoch": 4.102621388144176, "grad_norm": 0.7469790577888489, "learning_rate": 3.671773408376743e-05, "loss": 0.6744, "num_input_tokens_seen": 15889008, "step": 27545 }, { "epoch": 4.103366100685135, "grad_norm": 2.2542498111724854, "learning_rate": 3.671199291917368e-05, "loss": 0.866, "num_input_tokens_seen": 15891824, "step": 27550 }, { "epoch": 4.104110813226095, "grad_norm": 1.0784276723861694, "learning_rate": 3.6706250963131065e-05, "loss": 0.5095, "num_input_tokens_seen": 15894416, "step": 27555 }, { "epoch": 4.104855525767054, "grad_norm": 2.076587438583374, "learning_rate": 3.670050821602761e-05, "loss": 0.8919, "num_input_tokens_seen": 15897232, "step": 27560 }, { "epoch": 4.105600238308013, "grad_norm": 0.6697713136672974, "learning_rate": 3.669476467825137e-05, "loss": 0.5827, "num_input_tokens_seen": 15900240, "step": 27565 }, { "epoch": 4.106344950848972, "grad_norm": 1.5637094974517822, "learning_rate": 3.668902035019049e-05, "loss": 0.7303, "num_input_tokens_seen": 15903600, "step": 27570 }, { "epoch": 4.107089663389932, "grad_norm": 1.1961238384246826, "learning_rate": 3.668327523223313e-05, "loss": 0.5697, "num_input_tokens_seen": 15906448, "step": 27575 }, { "epoch": 4.10783437593089, "grad_norm": 1.084114670753479, "learning_rate": 3.667752932476753e-05, "loss": 0.7025, "num_input_tokens_seen": 15909040, "step": 27580 }, { "epoch": 4.10857908847185, "grad_norm": 0.7454989552497864, "learning_rate": 3.667178262818198e-05, "loss": 0.5599, "num_input_tokens_seen": 15912080, "step": 27585 }, { "epoch": 4.109323801012809, "grad_norm": 1.2154682874679565, "learning_rate": 3.666603514286482e-05, "loss": 0.7527, "num_input_tokens_seen": 15915024, "step": 27590 }, { "epoch": 4.1100685135537685, "grad_norm": 1.5676517486572266, "learning_rate": 3.666028686920443e-05, "loss": 0.5841, "num_input_tokens_seen": 15917776, "step": 27595 }, { "epoch": 4.110813226094727, "grad_norm": 1.1392667293548584, "learning_rate": 3.665453780758926e-05, "loss": 0.5319, "num_input_tokens_seen": 15920624, "step": 27600 }, { "epoch": 4.111557938635687, "grad_norm": 1.4771958589553833, "learning_rate": 3.6648787958407803e-05, "loss": 0.7171, "num_input_tokens_seen": 15923536, "step": 27605 }, { "epoch": 4.112302651176646, "grad_norm": 1.4794824123382568, "learning_rate": 3.6643037322048624e-05, "loss": 0.7936, "num_input_tokens_seen": 15926672, "step": 27610 }, { "epoch": 4.113047363717605, "grad_norm": 1.7967818975448608, "learning_rate": 3.663728589890032e-05, "loss": 0.6302, "num_input_tokens_seen": 15929936, "step": 27615 }, { "epoch": 4.113792076258564, "grad_norm": 1.0835009813308716, "learning_rate": 3.6631533689351544e-05, "loss": 0.6515, "num_input_tokens_seen": 15932656, "step": 27620 }, { "epoch": 4.114536788799524, "grad_norm": 0.9542206525802612, "learning_rate": 3.6625780693791016e-05, "loss": 0.5501, "num_input_tokens_seen": 15935568, "step": 27625 }, { "epoch": 4.115281501340482, "grad_norm": 1.5657883882522583, "learning_rate": 3.6620026912607497e-05, "loss": 0.5737, "num_input_tokens_seen": 15938288, "step": 27630 }, { "epoch": 4.116026213881442, "grad_norm": 1.4268016815185547, "learning_rate": 3.6614272346189795e-05, "loss": 0.8386, "num_input_tokens_seen": 15941136, "step": 27635 }, { "epoch": 4.116770926422401, "grad_norm": 1.9754923582077026, "learning_rate": 3.660851699492679e-05, "loss": 0.6243, "num_input_tokens_seen": 15943920, "step": 27640 }, { "epoch": 4.1175156389633605, "grad_norm": 1.460153579711914, "learning_rate": 3.660276085920742e-05, "loss": 0.6188, "num_input_tokens_seen": 15946736, "step": 27645 }, { "epoch": 4.118260351504319, "grad_norm": 0.9266290664672852, "learning_rate": 3.6597003939420623e-05, "loss": 0.4999, "num_input_tokens_seen": 15949616, "step": 27650 }, { "epoch": 4.119005064045279, "grad_norm": 1.80985426902771, "learning_rate": 3.6591246235955456e-05, "loss": 0.516, "num_input_tokens_seen": 15952272, "step": 27655 }, { "epoch": 4.119749776586238, "grad_norm": 1.0676156282424927, "learning_rate": 3.6585487749200996e-05, "loss": 0.5351, "num_input_tokens_seen": 15955408, "step": 27660 }, { "epoch": 4.120494489127197, "grad_norm": 1.428497314453125, "learning_rate": 3.657972847954638e-05, "loss": 0.598, "num_input_tokens_seen": 15958384, "step": 27665 }, { "epoch": 4.121239201668156, "grad_norm": 2.6502139568328857, "learning_rate": 3.657396842738079e-05, "loss": 0.8547, "num_input_tokens_seen": 15961232, "step": 27670 }, { "epoch": 4.121983914209116, "grad_norm": 1.2115164995193481, "learning_rate": 3.6568207593093465e-05, "loss": 0.6548, "num_input_tokens_seen": 15963984, "step": 27675 }, { "epoch": 4.1227286267500745, "grad_norm": 1.8070422410964966, "learning_rate": 3.656244597707372e-05, "loss": 0.5088, "num_input_tokens_seen": 15966800, "step": 27680 }, { "epoch": 4.123473339291033, "grad_norm": 1.890773057937622, "learning_rate": 3.655668357971087e-05, "loss": 0.5864, "num_input_tokens_seen": 15969392, "step": 27685 }, { "epoch": 4.124218051831993, "grad_norm": 1.140819787979126, "learning_rate": 3.6550920401394335e-05, "loss": 0.6706, "num_input_tokens_seen": 15972304, "step": 27690 }, { "epoch": 4.124962764372952, "grad_norm": 0.9115862250328064, "learning_rate": 3.654515644251356e-05, "loss": 0.6008, "num_input_tokens_seen": 15975056, "step": 27695 }, { "epoch": 4.125707476913911, "grad_norm": 2.326974630355835, "learning_rate": 3.653939170345805e-05, "loss": 0.6576, "num_input_tokens_seen": 15977712, "step": 27700 }, { "epoch": 4.12645218945487, "grad_norm": 1.1805297136306763, "learning_rate": 3.653362618461737e-05, "loss": 0.5436, "num_input_tokens_seen": 15980432, "step": 27705 }, { "epoch": 4.12719690199583, "grad_norm": 1.3918761014938354, "learning_rate": 3.652785988638112e-05, "loss": 0.4474, "num_input_tokens_seen": 15983152, "step": 27710 }, { "epoch": 4.127941614536788, "grad_norm": 2.1296489238739014, "learning_rate": 3.6522092809138975e-05, "loss": 0.7052, "num_input_tokens_seen": 15985744, "step": 27715 }, { "epoch": 4.128686327077748, "grad_norm": 1.3558158874511719, "learning_rate": 3.651632495328064e-05, "loss": 0.5862, "num_input_tokens_seen": 15988464, "step": 27720 }, { "epoch": 4.129431039618707, "grad_norm": 1.8615295886993408, "learning_rate": 3.6510556319195884e-05, "loss": 0.5336, "num_input_tokens_seen": 15991344, "step": 27725 }, { "epoch": 4.1301757521596665, "grad_norm": 1.932115077972412, "learning_rate": 3.650478690727454e-05, "loss": 0.7367, "num_input_tokens_seen": 15993968, "step": 27730 }, { "epoch": 4.130920464700625, "grad_norm": 1.4697169065475464, "learning_rate": 3.6499016717906455e-05, "loss": 0.5243, "num_input_tokens_seen": 15996848, "step": 27735 }, { "epoch": 4.131665177241585, "grad_norm": 1.4854443073272705, "learning_rate": 3.6493245751481574e-05, "loss": 0.6197, "num_input_tokens_seen": 15999856, "step": 27740 }, { "epoch": 4.132409889782544, "grad_norm": 1.2905293703079224, "learning_rate": 3.648747400838989e-05, "loss": 0.5733, "num_input_tokens_seen": 16002768, "step": 27745 }, { "epoch": 4.133154602323503, "grad_norm": 1.6301286220550537, "learning_rate": 3.6481701489021404e-05, "loss": 0.5877, "num_input_tokens_seen": 16005552, "step": 27750 }, { "epoch": 4.133899314864462, "grad_norm": 2.1393187046051025, "learning_rate": 3.647592819376621e-05, "loss": 0.5128, "num_input_tokens_seen": 16008464, "step": 27755 }, { "epoch": 4.134644027405422, "grad_norm": 1.2796295881271362, "learning_rate": 3.6470154123014455e-05, "loss": 0.5053, "num_input_tokens_seen": 16011184, "step": 27760 }, { "epoch": 4.1353887399463805, "grad_norm": 1.616477131843567, "learning_rate": 3.646437927715632e-05, "loss": 0.3726, "num_input_tokens_seen": 16013872, "step": 27765 }, { "epoch": 4.13613345248734, "grad_norm": 1.508236289024353, "learning_rate": 3.645860365658203e-05, "loss": 0.5831, "num_input_tokens_seen": 16016560, "step": 27770 }, { "epoch": 4.136878165028299, "grad_norm": 1.425658106803894, "learning_rate": 3.645282726168191e-05, "loss": 0.3386, "num_input_tokens_seen": 16019568, "step": 27775 }, { "epoch": 4.1376228775692585, "grad_norm": 1.8760775327682495, "learning_rate": 3.644705009284628e-05, "loss": 0.5439, "num_input_tokens_seen": 16022576, "step": 27780 }, { "epoch": 4.138367590110217, "grad_norm": 1.897843599319458, "learning_rate": 3.644127215046555e-05, "loss": 0.5763, "num_input_tokens_seen": 16025360, "step": 27785 }, { "epoch": 4.139112302651177, "grad_norm": 2.430338144302368, "learning_rate": 3.643549343493015e-05, "loss": 0.7348, "num_input_tokens_seen": 16028240, "step": 27790 }, { "epoch": 4.139857015192136, "grad_norm": 1.4208126068115234, "learning_rate": 3.642971394663061e-05, "loss": 0.6832, "num_input_tokens_seen": 16031408, "step": 27795 }, { "epoch": 4.140601727733095, "grad_norm": 3.1515347957611084, "learning_rate": 3.642393368595747e-05, "loss": 0.7401, "num_input_tokens_seen": 16034192, "step": 27800 }, { "epoch": 4.141346440274054, "grad_norm": 2.585956573486328, "learning_rate": 3.641815265330133e-05, "loss": 0.741, "num_input_tokens_seen": 16036880, "step": 27805 }, { "epoch": 4.142091152815014, "grad_norm": 2.7002875804901123, "learning_rate": 3.6412370849052865e-05, "loss": 0.7912, "num_input_tokens_seen": 16039728, "step": 27810 }, { "epoch": 4.1428358653559725, "grad_norm": 1.209381341934204, "learning_rate": 3.6406588273602774e-05, "loss": 0.5217, "num_input_tokens_seen": 16042512, "step": 27815 }, { "epoch": 4.143580577896932, "grad_norm": 2.3529410362243652, "learning_rate": 3.640080492734182e-05, "loss": 0.8122, "num_input_tokens_seen": 16045424, "step": 27820 }, { "epoch": 4.144325290437891, "grad_norm": 1.472312092781067, "learning_rate": 3.639502081066083e-05, "loss": 0.6432, "num_input_tokens_seen": 16048112, "step": 27825 }, { "epoch": 4.1450700029788505, "grad_norm": 1.2876986265182495, "learning_rate": 3.638923592395066e-05, "loss": 0.6842, "num_input_tokens_seen": 16050960, "step": 27830 }, { "epoch": 4.145814715519809, "grad_norm": 1.3089394569396973, "learning_rate": 3.638345026760222e-05, "loss": 0.5389, "num_input_tokens_seen": 16053936, "step": 27835 }, { "epoch": 4.146559428060769, "grad_norm": 3.170837879180908, "learning_rate": 3.63776638420065e-05, "loss": 0.9386, "num_input_tokens_seen": 16056528, "step": 27840 }, { "epoch": 4.147304140601728, "grad_norm": 1.6194062232971191, "learning_rate": 3.6371876647554524e-05, "loss": 0.6632, "num_input_tokens_seen": 16059408, "step": 27845 }, { "epoch": 4.148048853142687, "grad_norm": 1.298613429069519, "learning_rate": 3.636608868463735e-05, "loss": 0.5477, "num_input_tokens_seen": 16062288, "step": 27850 }, { "epoch": 4.148793565683646, "grad_norm": 1.4426283836364746, "learning_rate": 3.636029995364611e-05, "loss": 0.6131, "num_input_tokens_seen": 16065200, "step": 27855 }, { "epoch": 4.149538278224606, "grad_norm": 1.0887919664382935, "learning_rate": 3.6354510454972e-05, "loss": 0.6142, "num_input_tokens_seen": 16068240, "step": 27860 }, { "epoch": 4.1502829907655645, "grad_norm": 1.7612844705581665, "learning_rate": 3.634872018900623e-05, "loss": 0.5786, "num_input_tokens_seen": 16071024, "step": 27865 }, { "epoch": 4.151027703306523, "grad_norm": 1.4120001792907715, "learning_rate": 3.634292915614009e-05, "loss": 0.6189, "num_input_tokens_seen": 16073936, "step": 27870 }, { "epoch": 4.151772415847483, "grad_norm": 0.9690017700195312, "learning_rate": 3.633713735676491e-05, "loss": 0.5558, "num_input_tokens_seen": 16076816, "step": 27875 }, { "epoch": 4.152517128388442, "grad_norm": 1.1423563957214355, "learning_rate": 3.6331344791272087e-05, "loss": 0.6326, "num_input_tokens_seen": 16079696, "step": 27880 }, { "epoch": 4.153261840929401, "grad_norm": 1.389968752861023, "learning_rate": 3.632555146005305e-05, "loss": 0.5919, "num_input_tokens_seen": 16082928, "step": 27885 }, { "epoch": 4.15400655347036, "grad_norm": 0.8968043327331543, "learning_rate": 3.63197573634993e-05, "loss": 0.9987, "num_input_tokens_seen": 16085936, "step": 27890 }, { "epoch": 4.15475126601132, "grad_norm": 1.578726053237915, "learning_rate": 3.6313962502002365e-05, "loss": 0.5336, "num_input_tokens_seen": 16088944, "step": 27895 }, { "epoch": 4.1554959785522785, "grad_norm": 1.398155689239502, "learning_rate": 3.6308166875953836e-05, "loss": 0.8162, "num_input_tokens_seen": 16091984, "step": 27900 }, { "epoch": 4.156240691093238, "grad_norm": 1.4033839702606201, "learning_rate": 3.630237048574537e-05, "loss": 0.525, "num_input_tokens_seen": 16095056, "step": 27905 }, { "epoch": 4.156985403634197, "grad_norm": 1.2455694675445557, "learning_rate": 3.6296573331768664e-05, "loss": 0.6546, "num_input_tokens_seen": 16097840, "step": 27910 }, { "epoch": 4.1577301161751565, "grad_norm": 1.194745659828186, "learning_rate": 3.629077541441546e-05, "loss": 0.6558, "num_input_tokens_seen": 16100528, "step": 27915 }, { "epoch": 4.158474828716115, "grad_norm": 2.1712284088134766, "learning_rate": 3.628497673407755e-05, "loss": 0.7294, "num_input_tokens_seen": 16103344, "step": 27920 }, { "epoch": 4.159219541257075, "grad_norm": 1.7843563556671143, "learning_rate": 3.62791772911468e-05, "loss": 0.667, "num_input_tokens_seen": 16106064, "step": 27925 }, { "epoch": 4.159964253798034, "grad_norm": 1.396511197090149, "learning_rate": 3.6273377086015106e-05, "loss": 0.3914, "num_input_tokens_seen": 16108752, "step": 27930 }, { "epoch": 4.160708966338993, "grad_norm": 2.5851261615753174, "learning_rate": 3.626757611907442e-05, "loss": 0.8011, "num_input_tokens_seen": 16111408, "step": 27935 }, { "epoch": 4.161453678879952, "grad_norm": 1.655829906463623, "learning_rate": 3.6261774390716744e-05, "loss": 0.6567, "num_input_tokens_seen": 16114384, "step": 27940 }, { "epoch": 4.162198391420912, "grad_norm": 1.2100067138671875, "learning_rate": 3.625597190133416e-05, "loss": 0.7704, "num_input_tokens_seen": 16117328, "step": 27945 }, { "epoch": 4.1629431039618705, "grad_norm": 1.0335667133331299, "learning_rate": 3.625016865131875e-05, "loss": 0.6454, "num_input_tokens_seen": 16120336, "step": 27950 }, { "epoch": 4.16368781650283, "grad_norm": 1.7467511892318726, "learning_rate": 3.624436464106267e-05, "loss": 0.5859, "num_input_tokens_seen": 16123120, "step": 27955 }, { "epoch": 4.164432529043789, "grad_norm": 1.4789255857467651, "learning_rate": 3.623855987095816e-05, "loss": 0.5834, "num_input_tokens_seen": 16126000, "step": 27960 }, { "epoch": 4.165177241584749, "grad_norm": 1.0605939626693726, "learning_rate": 3.623275434139746e-05, "loss": 0.5986, "num_input_tokens_seen": 16129072, "step": 27965 }, { "epoch": 4.165921954125707, "grad_norm": 1.4894932508468628, "learning_rate": 3.622694805277289e-05, "loss": 0.5437, "num_input_tokens_seen": 16131632, "step": 27970 }, { "epoch": 4.166666666666667, "grad_norm": 1.811215877532959, "learning_rate": 3.6221141005476824e-05, "loss": 0.4689, "num_input_tokens_seen": 16134768, "step": 27975 }, { "epoch": 4.167411379207626, "grad_norm": 1.4896929264068604, "learning_rate": 3.6215333199901655e-05, "loss": 0.5122, "num_input_tokens_seen": 16137712, "step": 27980 }, { "epoch": 4.168156091748585, "grad_norm": 2.7557194232940674, "learning_rate": 3.620952463643989e-05, "loss": 0.6037, "num_input_tokens_seen": 16140560, "step": 27985 }, { "epoch": 4.168900804289544, "grad_norm": 2.3473825454711914, "learning_rate": 3.6203715315484e-05, "loss": 0.6727, "num_input_tokens_seen": 16143856, "step": 27990 }, { "epoch": 4.169645516830504, "grad_norm": 1.0404629707336426, "learning_rate": 3.6197905237426596e-05, "loss": 0.6175, "num_input_tokens_seen": 16146640, "step": 27995 }, { "epoch": 4.1703902293714625, "grad_norm": 1.5751687288284302, "learning_rate": 3.619209440266027e-05, "loss": 0.8422, "num_input_tokens_seen": 16149712, "step": 28000 }, { "epoch": 4.171134941912422, "grad_norm": 3.819175958633423, "learning_rate": 3.618628281157772e-05, "loss": 0.7279, "num_input_tokens_seen": 16152400, "step": 28005 }, { "epoch": 4.171879654453381, "grad_norm": 1.0532857179641724, "learning_rate": 3.618047046457166e-05, "loss": 0.4674, "num_input_tokens_seen": 16155536, "step": 28010 }, { "epoch": 4.172624366994341, "grad_norm": 1.122835397720337, "learning_rate": 3.617465736203485e-05, "loss": 0.8408, "num_input_tokens_seen": 16158480, "step": 28015 }, { "epoch": 4.173369079535299, "grad_norm": 2.854748487472534, "learning_rate": 3.616884350436013e-05, "loss": 0.7595, "num_input_tokens_seen": 16161296, "step": 28020 }, { "epoch": 4.174113792076259, "grad_norm": 0.8013388514518738, "learning_rate": 3.616302889194039e-05, "loss": 0.5679, "num_input_tokens_seen": 16164400, "step": 28025 }, { "epoch": 4.174858504617218, "grad_norm": 1.4517796039581299, "learning_rate": 3.6157213525168534e-05, "loss": 0.7642, "num_input_tokens_seen": 16167408, "step": 28030 }, { "epoch": 4.1756032171581765, "grad_norm": 1.433146357536316, "learning_rate": 3.6151397404437544e-05, "loss": 0.6871, "num_input_tokens_seen": 16170032, "step": 28035 }, { "epoch": 4.176347929699136, "grad_norm": 1.8186240196228027, "learning_rate": 3.614558053014045e-05, "loss": 0.8523, "num_input_tokens_seen": 16172624, "step": 28040 }, { "epoch": 4.177092642240095, "grad_norm": 2.23700213432312, "learning_rate": 3.613976290267036e-05, "loss": 0.9506, "num_input_tokens_seen": 16175184, "step": 28045 }, { "epoch": 4.177837354781055, "grad_norm": 1.8293310403823853, "learning_rate": 3.6133944522420374e-05, "loss": 0.6745, "num_input_tokens_seen": 16178224, "step": 28050 }, { "epoch": 4.178582067322013, "grad_norm": 2.3079187870025635, "learning_rate": 3.612812538978368e-05, "loss": 0.6909, "num_input_tokens_seen": 16181456, "step": 28055 }, { "epoch": 4.179326779862973, "grad_norm": 2.3710615634918213, "learning_rate": 3.612230550515352e-05, "loss": 0.7617, "num_input_tokens_seen": 16184112, "step": 28060 }, { "epoch": 4.180071492403932, "grad_norm": 1.8457350730895996, "learning_rate": 3.6116484868923174e-05, "loss": 0.7174, "num_input_tokens_seen": 16187152, "step": 28065 }, { "epoch": 4.180816204944891, "grad_norm": 1.8288813829421997, "learning_rate": 3.611066348148597e-05, "loss": 0.6634, "num_input_tokens_seen": 16190288, "step": 28070 }, { "epoch": 4.18156091748585, "grad_norm": 2.418133020401001, "learning_rate": 3.6104841343235313e-05, "loss": 0.7325, "num_input_tokens_seen": 16193008, "step": 28075 }, { "epoch": 4.18230563002681, "grad_norm": 1.1532728672027588, "learning_rate": 3.609901845456462e-05, "loss": 0.5391, "num_input_tokens_seen": 16196112, "step": 28080 }, { "epoch": 4.1830503425677685, "grad_norm": 1.7561264038085938, "learning_rate": 3.6093194815867385e-05, "loss": 0.7133, "num_input_tokens_seen": 16199344, "step": 28085 }, { "epoch": 4.183795055108728, "grad_norm": 6.445526599884033, "learning_rate": 3.608737042753715e-05, "loss": 0.5684, "num_input_tokens_seen": 16202320, "step": 28090 }, { "epoch": 4.184539767649687, "grad_norm": 1.817112922668457, "learning_rate": 3.608154528996749e-05, "loss": 0.638, "num_input_tokens_seen": 16205488, "step": 28095 }, { "epoch": 4.185284480190647, "grad_norm": 2.8422083854675293, "learning_rate": 3.607571940355206e-05, "loss": 0.6011, "num_input_tokens_seen": 16208080, "step": 28100 }, { "epoch": 4.186029192731605, "grad_norm": 1.7891664505004883, "learning_rate": 3.606989276868455e-05, "loss": 0.6625, "num_input_tokens_seen": 16210864, "step": 28105 }, { "epoch": 4.186773905272565, "grad_norm": 1.3825918436050415, "learning_rate": 3.606406538575868e-05, "loss": 0.7725, "num_input_tokens_seen": 16213712, "step": 28110 }, { "epoch": 4.187518617813524, "grad_norm": 0.8192875385284424, "learning_rate": 3.605823725516826e-05, "loss": 0.6692, "num_input_tokens_seen": 16216336, "step": 28115 }, { "epoch": 4.188263330354483, "grad_norm": 2.4598445892333984, "learning_rate": 3.605240837730713e-05, "loss": 0.7155, "num_input_tokens_seen": 16219376, "step": 28120 }, { "epoch": 4.189008042895442, "grad_norm": 1.5237584114074707, "learning_rate": 3.604657875256918e-05, "loss": 0.5022, "num_input_tokens_seen": 16222064, "step": 28125 }, { "epoch": 4.189752755436402, "grad_norm": 2.0299994945526123, "learning_rate": 3.604074838134834e-05, "loss": 0.562, "num_input_tokens_seen": 16224848, "step": 28130 }, { "epoch": 4.190497467977361, "grad_norm": 1.734667181968689, "learning_rate": 3.603491726403862e-05, "loss": 0.6031, "num_input_tokens_seen": 16227792, "step": 28135 }, { "epoch": 4.19124218051832, "grad_norm": 1.9563257694244385, "learning_rate": 3.6029085401034053e-05, "loss": 0.69, "num_input_tokens_seen": 16230576, "step": 28140 }, { "epoch": 4.191986893059279, "grad_norm": 1.1752848625183105, "learning_rate": 3.602325279272874e-05, "loss": 0.6048, "num_input_tokens_seen": 16233104, "step": 28145 }, { "epoch": 4.192731605600239, "grad_norm": 3.0320041179656982, "learning_rate": 3.6017419439516815e-05, "loss": 0.5604, "num_input_tokens_seen": 16235760, "step": 28150 }, { "epoch": 4.193476318141197, "grad_norm": 1.8122860193252563, "learning_rate": 3.6011585341792477e-05, "loss": 0.6276, "num_input_tokens_seen": 16238512, "step": 28155 }, { "epoch": 4.194221030682157, "grad_norm": 1.581691861152649, "learning_rate": 3.600575049994997e-05, "loss": 0.5669, "num_input_tokens_seen": 16241360, "step": 28160 }, { "epoch": 4.194965743223116, "grad_norm": 1.74565851688385, "learning_rate": 3.59999149143836e-05, "loss": 0.7382, "num_input_tokens_seen": 16244496, "step": 28165 }, { "epoch": 4.195710455764075, "grad_norm": 0.8239599466323853, "learning_rate": 3.5994078585487694e-05, "loss": 0.5098, "num_input_tokens_seen": 16247152, "step": 28170 }, { "epoch": 4.196455168305034, "grad_norm": 1.3023232221603394, "learning_rate": 3.5988241513656664e-05, "loss": 0.6704, "num_input_tokens_seen": 16250064, "step": 28175 }, { "epoch": 4.197199880845994, "grad_norm": 1.133326530456543, "learning_rate": 3.598240369928494e-05, "loss": 0.6091, "num_input_tokens_seen": 16252880, "step": 28180 }, { "epoch": 4.197944593386953, "grad_norm": 2.3121016025543213, "learning_rate": 3.5976565142767025e-05, "loss": 0.6628, "num_input_tokens_seen": 16255568, "step": 28185 }, { "epoch": 4.198689305927912, "grad_norm": 1.9201024770736694, "learning_rate": 3.5970725844497465e-05, "loss": 0.5478, "num_input_tokens_seen": 16258576, "step": 28190 }, { "epoch": 4.199434018468871, "grad_norm": 1.8703166246414185, "learning_rate": 3.596488580487086e-05, "loss": 0.7375, "num_input_tokens_seen": 16261584, "step": 28195 }, { "epoch": 4.200178731009831, "grad_norm": 1.4180970191955566, "learning_rate": 3.595904502428185e-05, "loss": 0.6227, "num_input_tokens_seen": 16264240, "step": 28200 }, { "epoch": 4.200923443550789, "grad_norm": 1.275184154510498, "learning_rate": 3.595320350312513e-05, "loss": 0.5593, "num_input_tokens_seen": 16266864, "step": 28205 }, { "epoch": 4.201668156091749, "grad_norm": 1.5460145473480225, "learning_rate": 3.594736124179546e-05, "loss": 0.4733, "num_input_tokens_seen": 16269840, "step": 28210 }, { "epoch": 4.202412868632708, "grad_norm": 1.292311191558838, "learning_rate": 3.594151824068762e-05, "loss": 0.6797, "num_input_tokens_seen": 16272944, "step": 28215 }, { "epoch": 4.203157581173667, "grad_norm": 2.068920135498047, "learning_rate": 3.593567450019646e-05, "loss": 0.5871, "num_input_tokens_seen": 16275568, "step": 28220 }, { "epoch": 4.203902293714626, "grad_norm": 1.504748821258545, "learning_rate": 3.592983002071688e-05, "loss": 0.5865, "num_input_tokens_seen": 16278832, "step": 28225 }, { "epoch": 4.204647006255585, "grad_norm": 1.6478350162506104, "learning_rate": 3.5923984802643826e-05, "loss": 0.5326, "num_input_tokens_seen": 16281648, "step": 28230 }, { "epoch": 4.205391718796545, "grad_norm": 2.0111734867095947, "learning_rate": 3.59181388463723e-05, "loss": 0.6303, "num_input_tokens_seen": 16284464, "step": 28235 }, { "epoch": 4.206136431337503, "grad_norm": 1.2651365995407104, "learning_rate": 3.591229215229733e-05, "loss": 0.4648, "num_input_tokens_seen": 16287280, "step": 28240 }, { "epoch": 4.206881143878463, "grad_norm": 3.777306318283081, "learning_rate": 3.590644472081402e-05, "loss": 0.8032, "num_input_tokens_seen": 16290160, "step": 28245 }, { "epoch": 4.207625856419422, "grad_norm": 1.0458751916885376, "learning_rate": 3.5900596552317526e-05, "loss": 0.6741, "num_input_tokens_seen": 16293328, "step": 28250 }, { "epoch": 4.208370568960381, "grad_norm": 1.4952553510665894, "learning_rate": 3.589474764720303e-05, "loss": 0.6308, "num_input_tokens_seen": 16296272, "step": 28255 }, { "epoch": 4.20911528150134, "grad_norm": 1.7914443016052246, "learning_rate": 3.588889800586579e-05, "loss": 0.4657, "num_input_tokens_seen": 16299056, "step": 28260 }, { "epoch": 4.2098599940423, "grad_norm": 8.051858901977539, "learning_rate": 3.588304762870108e-05, "loss": 0.3693, "num_input_tokens_seen": 16301936, "step": 28265 }, { "epoch": 4.210604706583259, "grad_norm": 1.4894133806228638, "learning_rate": 3.5877196516104275e-05, "loss": 0.4635, "num_input_tokens_seen": 16304784, "step": 28270 }, { "epoch": 4.211349419124218, "grad_norm": 1.5555546283721924, "learning_rate": 3.5871344668470755e-05, "loss": 0.5644, "num_input_tokens_seen": 16307920, "step": 28275 }, { "epoch": 4.212094131665177, "grad_norm": 6.580994129180908, "learning_rate": 3.5865492086195945e-05, "loss": 0.5943, "num_input_tokens_seen": 16310768, "step": 28280 }, { "epoch": 4.212838844206137, "grad_norm": 4.118549823760986, "learning_rate": 3.585963876967536e-05, "loss": 0.9189, "num_input_tokens_seen": 16313616, "step": 28285 }, { "epoch": 4.213583556747095, "grad_norm": 1.910696268081665, "learning_rate": 3.585378471930455e-05, "loss": 0.7329, "num_input_tokens_seen": 16316464, "step": 28290 }, { "epoch": 4.214328269288055, "grad_norm": 1.8724459409713745, "learning_rate": 3.584792993547908e-05, "loss": 0.6951, "num_input_tokens_seen": 16318992, "step": 28295 }, { "epoch": 4.215072981829014, "grad_norm": 1.5251491069793701, "learning_rate": 3.5842074418594625e-05, "loss": 0.731, "num_input_tokens_seen": 16321968, "step": 28300 }, { "epoch": 4.2158176943699734, "grad_norm": 3.7072601318359375, "learning_rate": 3.583621816904686e-05, "loss": 1.0179, "num_input_tokens_seen": 16325072, "step": 28305 }, { "epoch": 4.216562406910932, "grad_norm": 1.6415027379989624, "learning_rate": 3.583036118723152e-05, "loss": 0.6792, "num_input_tokens_seen": 16327920, "step": 28310 }, { "epoch": 4.217307119451892, "grad_norm": 2.8660669326782227, "learning_rate": 3.5824503473544405e-05, "loss": 0.6001, "num_input_tokens_seen": 16330864, "step": 28315 }, { "epoch": 4.218051831992851, "grad_norm": 1.2069307565689087, "learning_rate": 3.5818645028381356e-05, "loss": 0.6771, "num_input_tokens_seen": 16333712, "step": 28320 }, { "epoch": 4.21879654453381, "grad_norm": 2.2913103103637695, "learning_rate": 3.581278585213826e-05, "loss": 0.8169, "num_input_tokens_seen": 16337072, "step": 28325 }, { "epoch": 4.219541257074769, "grad_norm": 3.7395873069763184, "learning_rate": 3.5806925945211065e-05, "loss": 0.8161, "num_input_tokens_seen": 16339888, "step": 28330 }, { "epoch": 4.220285969615729, "grad_norm": 5.881882190704346, "learning_rate": 3.580106530799575e-05, "loss": 0.7823, "num_input_tokens_seen": 16342672, "step": 28335 }, { "epoch": 4.221030682156687, "grad_norm": 1.046804428100586, "learning_rate": 3.579520394088835e-05, "loss": 0.7066, "num_input_tokens_seen": 16345328, "step": 28340 }, { "epoch": 4.221775394697647, "grad_norm": 1.9137898683547974, "learning_rate": 3.578934184428496e-05, "loss": 0.7133, "num_input_tokens_seen": 16348080, "step": 28345 }, { "epoch": 4.222520107238606, "grad_norm": 1.7913068532943726, "learning_rate": 3.578347901858172e-05, "loss": 0.5823, "num_input_tokens_seen": 16350896, "step": 28350 }, { "epoch": 4.2232648197795655, "grad_norm": 1.678532361984253, "learning_rate": 3.57776154641748e-05, "loss": 0.8069, "num_input_tokens_seen": 16354160, "step": 28355 }, { "epoch": 4.224009532320524, "grad_norm": 2.138374090194702, "learning_rate": 3.577175118146045e-05, "loss": 0.5388, "num_input_tokens_seen": 16356976, "step": 28360 }, { "epoch": 4.224754244861484, "grad_norm": 2.0394022464752197, "learning_rate": 3.576588617083495e-05, "loss": 0.5917, "num_input_tokens_seen": 16359888, "step": 28365 }, { "epoch": 4.225498957402443, "grad_norm": 1.0871665477752686, "learning_rate": 3.576002043269464e-05, "loss": 0.658, "num_input_tokens_seen": 16362640, "step": 28370 }, { "epoch": 4.226243669943402, "grad_norm": 1.833749771118164, "learning_rate": 3.575415396743589e-05, "loss": 0.7291, "num_input_tokens_seen": 16365712, "step": 28375 }, { "epoch": 4.226988382484361, "grad_norm": 1.325122356414795, "learning_rate": 3.574828677545514e-05, "loss": 0.5281, "num_input_tokens_seen": 16368464, "step": 28380 }, { "epoch": 4.22773309502532, "grad_norm": 1.2358312606811523, "learning_rate": 3.574241885714886e-05, "loss": 0.7878, "num_input_tokens_seen": 16371760, "step": 28385 }, { "epoch": 4.2284778075662794, "grad_norm": 1.8881717920303345, "learning_rate": 3.57365502129136e-05, "loss": 0.7883, "num_input_tokens_seen": 16374544, "step": 28390 }, { "epoch": 4.229222520107238, "grad_norm": 1.8627808094024658, "learning_rate": 3.573068084314593e-05, "loss": 0.5691, "num_input_tokens_seen": 16377264, "step": 28395 }, { "epoch": 4.229967232648198, "grad_norm": 2.0842676162719727, "learning_rate": 3.572481074824247e-05, "loss": 0.7319, "num_input_tokens_seen": 16380208, "step": 28400 }, { "epoch": 4.230711945189157, "grad_norm": 1.523227572441101, "learning_rate": 3.5718939928599904e-05, "loss": 0.4806, "num_input_tokens_seen": 16382832, "step": 28405 }, { "epoch": 4.231456657730116, "grad_norm": 1.210976243019104, "learning_rate": 3.571306838461496e-05, "loss": 0.6061, "num_input_tokens_seen": 16385648, "step": 28410 }, { "epoch": 4.232201370271075, "grad_norm": 1.9696848392486572, "learning_rate": 3.570719611668441e-05, "loss": 0.6926, "num_input_tokens_seen": 16388080, "step": 28415 }, { "epoch": 4.232946082812035, "grad_norm": 2.0311279296875, "learning_rate": 3.5701323125205076e-05, "loss": 0.6007, "num_input_tokens_seen": 16390928, "step": 28420 }, { "epoch": 4.233690795352993, "grad_norm": 1.4887466430664062, "learning_rate": 3.569544941057384e-05, "loss": 0.5403, "num_input_tokens_seen": 16393648, "step": 28425 }, { "epoch": 4.234435507893953, "grad_norm": 1.5908427238464355, "learning_rate": 3.568957497318761e-05, "loss": 0.6513, "num_input_tokens_seen": 16396496, "step": 28430 }, { "epoch": 4.235180220434912, "grad_norm": 1.2032965421676636, "learning_rate": 3.5683699813443364e-05, "loss": 0.708, "num_input_tokens_seen": 16399408, "step": 28435 }, { "epoch": 4.2359249329758715, "grad_norm": 1.2239717245101929, "learning_rate": 3.567782393173813e-05, "loss": 0.494, "num_input_tokens_seen": 16402352, "step": 28440 }, { "epoch": 4.23666964551683, "grad_norm": 1.7047395706176758, "learning_rate": 3.567194732846896e-05, "loss": 0.6471, "num_input_tokens_seen": 16405200, "step": 28445 }, { "epoch": 4.23741435805779, "grad_norm": 2.507718801498413, "learning_rate": 3.566607000403298e-05, "loss": 0.7112, "num_input_tokens_seen": 16407984, "step": 28450 }, { "epoch": 4.238159070598749, "grad_norm": 1.6628130674362183, "learning_rate": 3.5660191958827354e-05, "loss": 0.8348, "num_input_tokens_seen": 16410768, "step": 28455 }, { "epoch": 4.238903783139708, "grad_norm": 1.1172866821289062, "learning_rate": 3.56543131932493e-05, "loss": 0.5229, "num_input_tokens_seen": 16413456, "step": 28460 }, { "epoch": 4.239648495680667, "grad_norm": 1.7881205081939697, "learning_rate": 3.5648433707696074e-05, "loss": 0.6219, "num_input_tokens_seen": 16416080, "step": 28465 }, { "epoch": 4.240393208221627, "grad_norm": 1.1861107349395752, "learning_rate": 3.564255350256499e-05, "loss": 0.5007, "num_input_tokens_seen": 16418992, "step": 28470 }, { "epoch": 4.2411379207625854, "grad_norm": 1.6160582304000854, "learning_rate": 3.5636672578253415e-05, "loss": 0.7464, "num_input_tokens_seen": 16422032, "step": 28475 }, { "epoch": 4.241882633303545, "grad_norm": 1.7685918807983398, "learning_rate": 3.5630790935158754e-05, "loss": 0.6224, "num_input_tokens_seen": 16424720, "step": 28480 }, { "epoch": 4.242627345844504, "grad_norm": 1.6892837285995483, "learning_rate": 3.562490857367845e-05, "loss": 0.629, "num_input_tokens_seen": 16427376, "step": 28485 }, { "epoch": 4.2433720583854635, "grad_norm": 0.8930692672729492, "learning_rate": 3.561902549421004e-05, "loss": 0.5492, "num_input_tokens_seen": 16430224, "step": 28490 }, { "epoch": 4.244116770926422, "grad_norm": 1.2405213117599487, "learning_rate": 3.5613141697151055e-05, "loss": 0.5761, "num_input_tokens_seen": 16432880, "step": 28495 }, { "epoch": 4.244861483467382, "grad_norm": 0.9886566996574402, "learning_rate": 3.5607257182899095e-05, "loss": 0.6048, "num_input_tokens_seen": 16435728, "step": 28500 }, { "epoch": 4.245606196008341, "grad_norm": 10.000001907348633, "learning_rate": 3.560137195185183e-05, "loss": 0.6278, "num_input_tokens_seen": 16438448, "step": 28505 }, { "epoch": 4.2463509085493, "grad_norm": 1.5602046251296997, "learning_rate": 3.559548600440695e-05, "loss": 0.703, "num_input_tokens_seen": 16441424, "step": 28510 }, { "epoch": 4.247095621090259, "grad_norm": 1.3119083642959595, "learning_rate": 3.5589599340962196e-05, "loss": 0.4895, "num_input_tokens_seen": 16444240, "step": 28515 }, { "epoch": 4.247840333631219, "grad_norm": 2.2971620559692383, "learning_rate": 3.5583711961915375e-05, "loss": 0.5311, "num_input_tokens_seen": 16447024, "step": 28520 }, { "epoch": 4.2485850461721775, "grad_norm": 1.7247860431671143, "learning_rate": 3.557782386766434e-05, "loss": 0.565, "num_input_tokens_seen": 16449776, "step": 28525 }, { "epoch": 4.249329758713137, "grad_norm": 1.9514156579971313, "learning_rate": 3.557193505860696e-05, "loss": 0.6892, "num_input_tokens_seen": 16452720, "step": 28530 }, { "epoch": 4.250074471254096, "grad_norm": 1.908508062362671, "learning_rate": 3.55660455351412e-05, "loss": 0.6658, "num_input_tokens_seen": 16455632, "step": 28535 }, { "epoch": 4.2508191837950555, "grad_norm": 1.248620629310608, "learning_rate": 3.5560155297665046e-05, "loss": 0.5434, "num_input_tokens_seen": 16458608, "step": 28540 }, { "epoch": 4.251563896336014, "grad_norm": 1.5200634002685547, "learning_rate": 3.555426434657652e-05, "loss": 0.7164, "num_input_tokens_seen": 16461360, "step": 28545 }, { "epoch": 4.252308608876973, "grad_norm": 2.0229058265686035, "learning_rate": 3.5548372682273726e-05, "loss": 0.7188, "num_input_tokens_seen": 16464208, "step": 28550 }, { "epoch": 4.253053321417933, "grad_norm": 1.2026875019073486, "learning_rate": 3.554248030515479e-05, "loss": 0.7088, "num_input_tokens_seen": 16467152, "step": 28555 }, { "epoch": 4.253798033958892, "grad_norm": 1.0294890403747559, "learning_rate": 3.55365872156179e-05, "loss": 0.5063, "num_input_tokens_seen": 16469936, "step": 28560 }, { "epoch": 4.254542746499851, "grad_norm": 1.0419387817382812, "learning_rate": 3.5530693414061285e-05, "loss": 0.4596, "num_input_tokens_seen": 16472592, "step": 28565 }, { "epoch": 4.25528745904081, "grad_norm": 1.283718466758728, "learning_rate": 3.5524798900883226e-05, "loss": 0.5653, "num_input_tokens_seen": 16475600, "step": 28570 }, { "epoch": 4.2560321715817695, "grad_norm": 2.407378911972046, "learning_rate": 3.551890367648205e-05, "loss": 0.6615, "num_input_tokens_seen": 16478416, "step": 28575 }, { "epoch": 4.256776884122728, "grad_norm": 1.0075459480285645, "learning_rate": 3.551300774125611e-05, "loss": 0.7547, "num_input_tokens_seen": 16481136, "step": 28580 }, { "epoch": 4.257521596663688, "grad_norm": 1.189998984336853, "learning_rate": 3.5507111095603864e-05, "loss": 0.6696, "num_input_tokens_seen": 16484112, "step": 28585 }, { "epoch": 4.258266309204647, "grad_norm": 1.728770136833191, "learning_rate": 3.550121373992378e-05, "loss": 0.7417, "num_input_tokens_seen": 16486896, "step": 28590 }, { "epoch": 4.259011021745606, "grad_norm": 1.7179477214813232, "learning_rate": 3.5495315674614356e-05, "loss": 0.5161, "num_input_tokens_seen": 16489552, "step": 28595 }, { "epoch": 4.259755734286565, "grad_norm": 1.8651914596557617, "learning_rate": 3.548941690007417e-05, "loss": 0.5066, "num_input_tokens_seen": 16492560, "step": 28600 }, { "epoch": 4.260500446827525, "grad_norm": 1.215360164642334, "learning_rate": 3.5483517416701836e-05, "loss": 0.6026, "num_input_tokens_seen": 16495248, "step": 28605 }, { "epoch": 4.2612451593684835, "grad_norm": 1.1671231985092163, "learning_rate": 3.547761722489602e-05, "loss": 0.5641, "num_input_tokens_seen": 16498096, "step": 28610 }, { "epoch": 4.261989871909443, "grad_norm": 1.045785903930664, "learning_rate": 3.5471716325055424e-05, "loss": 0.579, "num_input_tokens_seen": 16500848, "step": 28615 }, { "epoch": 4.262734584450402, "grad_norm": 3.60792875289917, "learning_rate": 3.5465814717578815e-05, "loss": 0.5711, "num_input_tokens_seen": 16504016, "step": 28620 }, { "epoch": 4.2634792969913615, "grad_norm": 2.0776865482330322, "learning_rate": 3.5459912402865006e-05, "loss": 0.7122, "num_input_tokens_seen": 16506704, "step": 28625 }, { "epoch": 4.26422400953232, "grad_norm": 1.1535612344741821, "learning_rate": 3.545400938131284e-05, "loss": 0.5391, "num_input_tokens_seen": 16509456, "step": 28630 }, { "epoch": 4.26496872207328, "grad_norm": 1.2192556858062744, "learning_rate": 3.544810565332122e-05, "loss": 0.614, "num_input_tokens_seen": 16512560, "step": 28635 }, { "epoch": 4.265713434614239, "grad_norm": 1.1744426488876343, "learning_rate": 3.5442201219289105e-05, "loss": 0.6144, "num_input_tokens_seen": 16515504, "step": 28640 }, { "epoch": 4.266458147155198, "grad_norm": 1.1349787712097168, "learning_rate": 3.543629607961548e-05, "loss": 0.5335, "num_input_tokens_seen": 16518832, "step": 28645 }, { "epoch": 4.267202859696157, "grad_norm": 1.3771454095840454, "learning_rate": 3.5430390234699404e-05, "loss": 0.5337, "num_input_tokens_seen": 16521552, "step": 28650 }, { "epoch": 4.267947572237117, "grad_norm": 3.2152390480041504, "learning_rate": 3.542448368493996e-05, "loss": 0.6274, "num_input_tokens_seen": 16524560, "step": 28655 }, { "epoch": 4.2686922847780755, "grad_norm": 0.999916136264801, "learning_rate": 3.5418576430736285e-05, "loss": 0.6452, "num_input_tokens_seen": 16527696, "step": 28660 }, { "epoch": 4.269436997319035, "grad_norm": 1.8204748630523682, "learning_rate": 3.5412668472487575e-05, "loss": 0.6301, "num_input_tokens_seen": 16530512, "step": 28665 }, { "epoch": 4.270181709859994, "grad_norm": 1.1144312620162964, "learning_rate": 3.540675981059307e-05, "loss": 0.4224, "num_input_tokens_seen": 16533392, "step": 28670 }, { "epoch": 4.2709264224009535, "grad_norm": 1.2226780652999878, "learning_rate": 3.540085044545205e-05, "loss": 0.6044, "num_input_tokens_seen": 16535952, "step": 28675 }, { "epoch": 4.271671134941912, "grad_norm": 1.8481075763702393, "learning_rate": 3.539494037746384e-05, "loss": 0.8374, "num_input_tokens_seen": 16538896, "step": 28680 }, { "epoch": 4.272415847482872, "grad_norm": 3.508871078491211, "learning_rate": 3.538902960702781e-05, "loss": 0.8568, "num_input_tokens_seen": 16541584, "step": 28685 }, { "epoch": 4.273160560023831, "grad_norm": 2.924353837966919, "learning_rate": 3.538311813454342e-05, "loss": 0.6526, "num_input_tokens_seen": 16544368, "step": 28690 }, { "epoch": 4.27390527256479, "grad_norm": 1.2786585092544556, "learning_rate": 3.537720596041011e-05, "loss": 0.4069, "num_input_tokens_seen": 16547216, "step": 28695 }, { "epoch": 4.274649985105749, "grad_norm": 1.0930548906326294, "learning_rate": 3.537129308502741e-05, "loss": 0.6716, "num_input_tokens_seen": 16550384, "step": 28700 }, { "epoch": 4.275394697646709, "grad_norm": 0.9114034175872803, "learning_rate": 3.536537950879489e-05, "loss": 0.5914, "num_input_tokens_seen": 16553072, "step": 28705 }, { "epoch": 4.2761394101876675, "grad_norm": 1.1898258924484253, "learning_rate": 3.535946523211217e-05, "loss": 0.5157, "num_input_tokens_seen": 16555760, "step": 28710 }, { "epoch": 4.276884122728626, "grad_norm": 0.9220039248466492, "learning_rate": 3.5353550255378905e-05, "loss": 0.5689, "num_input_tokens_seen": 16558768, "step": 28715 }, { "epoch": 4.277628835269586, "grad_norm": 0.898594856262207, "learning_rate": 3.5347634578994806e-05, "loss": 0.68, "num_input_tokens_seen": 16561904, "step": 28720 }, { "epoch": 4.278373547810546, "grad_norm": 1.2912805080413818, "learning_rate": 3.534171820335964e-05, "loss": 0.6896, "num_input_tokens_seen": 16564976, "step": 28725 }, { "epoch": 4.279118260351504, "grad_norm": 1.0441306829452515, "learning_rate": 3.53358011288732e-05, "loss": 0.6227, "num_input_tokens_seen": 16567824, "step": 28730 }, { "epoch": 4.279862972892463, "grad_norm": 1.8827037811279297, "learning_rate": 3.532988335593534e-05, "loss": 0.6094, "num_input_tokens_seen": 16570192, "step": 28735 }, { "epoch": 4.280607685433423, "grad_norm": 1.3385807275772095, "learning_rate": 3.532396488494596e-05, "loss": 0.6469, "num_input_tokens_seen": 16573104, "step": 28740 }, { "epoch": 4.2813523979743815, "grad_norm": 1.0090205669403076, "learning_rate": 3.531804571630501e-05, "loss": 0.6326, "num_input_tokens_seen": 16576144, "step": 28745 }, { "epoch": 4.282097110515341, "grad_norm": 1.012953519821167, "learning_rate": 3.531212585041248e-05, "loss": 0.5223, "num_input_tokens_seen": 16579024, "step": 28750 }, { "epoch": 4.2828418230563, "grad_norm": 2.7774672508239746, "learning_rate": 3.530620528766841e-05, "loss": 0.9139, "num_input_tokens_seen": 16581904, "step": 28755 }, { "epoch": 4.2835865355972595, "grad_norm": 2.345317840576172, "learning_rate": 3.53002840284729e-05, "loss": 0.5556, "num_input_tokens_seen": 16584496, "step": 28760 }, { "epoch": 4.284331248138218, "grad_norm": 1.339830994606018, "learning_rate": 3.5294362073226054e-05, "loss": 0.6104, "num_input_tokens_seen": 16587248, "step": 28765 }, { "epoch": 4.285075960679178, "grad_norm": 1.0466058254241943, "learning_rate": 3.528843942232809e-05, "loss": 0.665, "num_input_tokens_seen": 16590032, "step": 28770 }, { "epoch": 4.285820673220137, "grad_norm": 1.2555149793624878, "learning_rate": 3.528251607617921e-05, "loss": 0.726, "num_input_tokens_seen": 16592752, "step": 28775 }, { "epoch": 4.286565385761096, "grad_norm": 3.3842408657073975, "learning_rate": 3.52765920351797e-05, "loss": 0.645, "num_input_tokens_seen": 16595568, "step": 28780 }, { "epoch": 4.287310098302055, "grad_norm": 1.0528247356414795, "learning_rate": 3.5270667299729883e-05, "loss": 0.5456, "num_input_tokens_seen": 16598320, "step": 28785 }, { "epoch": 4.288054810843015, "grad_norm": 2.976447582244873, "learning_rate": 3.526474187023013e-05, "loss": 0.7491, "num_input_tokens_seen": 16601072, "step": 28790 }, { "epoch": 4.2887995233839735, "grad_norm": 1.750270128250122, "learning_rate": 3.5258815747080853e-05, "loss": 0.6308, "num_input_tokens_seen": 16603856, "step": 28795 }, { "epoch": 4.289544235924933, "grad_norm": 1.0086244344711304, "learning_rate": 3.5252888930682516e-05, "loss": 0.6408, "num_input_tokens_seen": 16606832, "step": 28800 }, { "epoch": 4.290288948465892, "grad_norm": 1.066745638847351, "learning_rate": 3.524696142143563e-05, "loss": 0.7772, "num_input_tokens_seen": 16610032, "step": 28805 }, { "epoch": 4.291033661006852, "grad_norm": 1.9292147159576416, "learning_rate": 3.524103321974075e-05, "loss": 0.6457, "num_input_tokens_seen": 16612784, "step": 28810 }, { "epoch": 4.29177837354781, "grad_norm": 1.8402771949768066, "learning_rate": 3.523510432599849e-05, "loss": 0.7513, "num_input_tokens_seen": 16615600, "step": 28815 }, { "epoch": 4.29252308608877, "grad_norm": 1.0311956405639648, "learning_rate": 3.522917474060949e-05, "loss": 0.5474, "num_input_tokens_seen": 16618544, "step": 28820 }, { "epoch": 4.293267798629729, "grad_norm": 1.61099112033844, "learning_rate": 3.522324446397444e-05, "loss": 0.6376, "num_input_tokens_seen": 16621296, "step": 28825 }, { "epoch": 4.294012511170688, "grad_norm": 0.8531843423843384, "learning_rate": 3.5217313496494096e-05, "loss": 0.6219, "num_input_tokens_seen": 16624144, "step": 28830 }, { "epoch": 4.294757223711647, "grad_norm": 0.8415647149085999, "learning_rate": 3.521138183856926e-05, "loss": 0.5, "num_input_tokens_seen": 16627216, "step": 28835 }, { "epoch": 4.295501936252607, "grad_norm": 1.3395581245422363, "learning_rate": 3.520544949060075e-05, "loss": 0.6372, "num_input_tokens_seen": 16630160, "step": 28840 }, { "epoch": 4.2962466487935655, "grad_norm": 2.2052969932556152, "learning_rate": 3.5199516452989444e-05, "loss": 0.5669, "num_input_tokens_seen": 16632784, "step": 28845 }, { "epoch": 4.296991361334525, "grad_norm": 1.9803725481033325, "learning_rate": 3.51935827261363e-05, "loss": 0.6625, "num_input_tokens_seen": 16635600, "step": 28850 }, { "epoch": 4.297736073875484, "grad_norm": 1.2857027053833008, "learning_rate": 3.518764831044228e-05, "loss": 0.597, "num_input_tokens_seen": 16638640, "step": 28855 }, { "epoch": 4.298480786416444, "grad_norm": 1.0909193754196167, "learning_rate": 3.518171320630839e-05, "loss": 0.5637, "num_input_tokens_seen": 16641616, "step": 28860 }, { "epoch": 4.299225498957402, "grad_norm": 1.3676246404647827, "learning_rate": 3.5175777414135726e-05, "loss": 0.7196, "num_input_tokens_seen": 16644592, "step": 28865 }, { "epoch": 4.299970211498362, "grad_norm": 1.0302525758743286, "learning_rate": 3.5169840934325404e-05, "loss": 0.5855, "num_input_tokens_seen": 16647536, "step": 28870 }, { "epoch": 4.300714924039321, "grad_norm": 1.826393961906433, "learning_rate": 3.5163903767278573e-05, "loss": 0.6959, "num_input_tokens_seen": 16650512, "step": 28875 }, { "epoch": 4.30145963658028, "grad_norm": 1.89244544506073, "learning_rate": 3.515796591339644e-05, "loss": 0.7681, "num_input_tokens_seen": 16653200, "step": 28880 }, { "epoch": 4.302204349121239, "grad_norm": 1.3524181842803955, "learning_rate": 3.515202737308028e-05, "loss": 0.6082, "num_input_tokens_seen": 16655920, "step": 28885 }, { "epoch": 4.302949061662199, "grad_norm": 1.7115871906280518, "learning_rate": 3.514608814673139e-05, "loss": 0.5553, "num_input_tokens_seen": 16658896, "step": 28890 }, { "epoch": 4.303693774203158, "grad_norm": 1.3666613101959229, "learning_rate": 3.5140148234751106e-05, "loss": 0.6457, "num_input_tokens_seen": 16661712, "step": 28895 }, { "epoch": 4.304438486744116, "grad_norm": 0.9501560926437378, "learning_rate": 3.513420763754083e-05, "loss": 0.5143, "num_input_tokens_seen": 16664368, "step": 28900 }, { "epoch": 4.305183199285076, "grad_norm": 1.6645231246948242, "learning_rate": 3.512826635550201e-05, "loss": 0.6092, "num_input_tokens_seen": 16667184, "step": 28905 }, { "epoch": 4.305927911826035, "grad_norm": 1.1481763124465942, "learning_rate": 3.512232438903612e-05, "loss": 0.7055, "num_input_tokens_seen": 16670416, "step": 28910 }, { "epoch": 4.306672624366994, "grad_norm": 2.008432626724243, "learning_rate": 3.511638173854471e-05, "loss": 0.7282, "num_input_tokens_seen": 16673264, "step": 28915 }, { "epoch": 4.307417336907953, "grad_norm": 1.8395427465438843, "learning_rate": 3.511043840442936e-05, "loss": 0.64, "num_input_tokens_seen": 16676112, "step": 28920 }, { "epoch": 4.308162049448913, "grad_norm": 1.8498247861862183, "learning_rate": 3.510449438709167e-05, "loss": 0.8218, "num_input_tokens_seen": 16678928, "step": 28925 }, { "epoch": 4.3089067619898715, "grad_norm": 0.8799790143966675, "learning_rate": 3.509854968693334e-05, "loss": 0.7157, "num_input_tokens_seen": 16682128, "step": 28930 }, { "epoch": 4.309651474530831, "grad_norm": 2.229447364807129, "learning_rate": 3.509260430435608e-05, "loss": 0.5904, "num_input_tokens_seen": 16685136, "step": 28935 }, { "epoch": 4.31039618707179, "grad_norm": 1.1280200481414795, "learning_rate": 3.5086658239761664e-05, "loss": 0.6497, "num_input_tokens_seen": 16688016, "step": 28940 }, { "epoch": 4.31114089961275, "grad_norm": 1.6846165657043457, "learning_rate": 3.5080711493551876e-05, "loss": 0.6602, "num_input_tokens_seen": 16690864, "step": 28945 }, { "epoch": 4.311885612153708, "grad_norm": 1.0911442041397095, "learning_rate": 3.5074764066128594e-05, "loss": 0.7003, "num_input_tokens_seen": 16693680, "step": 28950 }, { "epoch": 4.312630324694668, "grad_norm": 1.0158172845840454, "learning_rate": 3.506881595789373e-05, "loss": 0.6294, "num_input_tokens_seen": 16696752, "step": 28955 }, { "epoch": 4.313375037235627, "grad_norm": 1.5439130067825317, "learning_rate": 3.506286716924921e-05, "loss": 0.7671, "num_input_tokens_seen": 16699536, "step": 28960 }, { "epoch": 4.314119749776586, "grad_norm": 2.548969268798828, "learning_rate": 3.505691770059704e-05, "loss": 0.6091, "num_input_tokens_seen": 16702128, "step": 28965 }, { "epoch": 4.314864462317545, "grad_norm": 1.6838316917419434, "learning_rate": 3.5050967552339265e-05, "loss": 0.6802, "num_input_tokens_seen": 16704816, "step": 28970 }, { "epoch": 4.315609174858505, "grad_norm": 2.1486759185791016, "learning_rate": 3.5045016724877967e-05, "loss": 0.661, "num_input_tokens_seen": 16707568, "step": 28975 }, { "epoch": 4.316353887399464, "grad_norm": 0.9339706301689148, "learning_rate": 3.503906521861527e-05, "loss": 0.6178, "num_input_tokens_seen": 16710800, "step": 28980 }, { "epoch": 4.317098599940423, "grad_norm": 2.0248281955718994, "learning_rate": 3.503311303395337e-05, "loss": 0.4587, "num_input_tokens_seen": 16713456, "step": 28985 }, { "epoch": 4.317843312481382, "grad_norm": 1.371077060699463, "learning_rate": 3.5027160171294476e-05, "loss": 0.5815, "num_input_tokens_seen": 16715984, "step": 28990 }, { "epoch": 4.318588025022342, "grad_norm": 2.075488567352295, "learning_rate": 3.502120663104087e-05, "loss": 0.6552, "num_input_tokens_seen": 16718704, "step": 28995 }, { "epoch": 4.3193327375633, "grad_norm": 1.687437653541565, "learning_rate": 3.5015252413594864e-05, "loss": 0.6494, "num_input_tokens_seen": 16721552, "step": 29000 }, { "epoch": 4.32007745010426, "grad_norm": 4.478998184204102, "learning_rate": 3.5009297519358816e-05, "loss": 0.5516, "num_input_tokens_seen": 16724304, "step": 29005 }, { "epoch": 4.320822162645219, "grad_norm": 1.5910576581954956, "learning_rate": 3.500334194873513e-05, "loss": 0.6579, "num_input_tokens_seen": 16727248, "step": 29010 }, { "epoch": 4.321566875186178, "grad_norm": 2.456083059310913, "learning_rate": 3.499738570212628e-05, "loss": 0.5948, "num_input_tokens_seen": 16730000, "step": 29015 }, { "epoch": 4.322311587727137, "grad_norm": 2.080007553100586, "learning_rate": 3.4991428779934746e-05, "loss": 0.7431, "num_input_tokens_seen": 16732752, "step": 29020 }, { "epoch": 4.323056300268097, "grad_norm": 0.923067033290863, "learning_rate": 3.498547118256307e-05, "loss": 0.5287, "num_input_tokens_seen": 16735984, "step": 29025 }, { "epoch": 4.323801012809056, "grad_norm": 5.986471652984619, "learning_rate": 3.497951291041386e-05, "loss": 0.6997, "num_input_tokens_seen": 16738640, "step": 29030 }, { "epoch": 4.324545725350015, "grad_norm": 1.9928853511810303, "learning_rate": 3.497355396388974e-05, "loss": 0.6844, "num_input_tokens_seen": 16741552, "step": 29035 }, { "epoch": 4.325290437890974, "grad_norm": 1.8336743116378784, "learning_rate": 3.496759434339338e-05, "loss": 0.6672, "num_input_tokens_seen": 16744240, "step": 29040 }, { "epoch": 4.326035150431934, "grad_norm": 1.8302340507507324, "learning_rate": 3.4961634049327527e-05, "loss": 0.8169, "num_input_tokens_seen": 16747216, "step": 29045 }, { "epoch": 4.326779862972892, "grad_norm": 1.6679245233535767, "learning_rate": 3.495567308209495e-05, "loss": 0.6759, "num_input_tokens_seen": 16750256, "step": 29050 }, { "epoch": 4.327524575513852, "grad_norm": 1.540112853050232, "learning_rate": 3.4949711442098464e-05, "loss": 0.6039, "num_input_tokens_seen": 16753072, "step": 29055 }, { "epoch": 4.328269288054811, "grad_norm": 2.496952772140503, "learning_rate": 3.494374912974093e-05, "loss": 0.7787, "num_input_tokens_seen": 16756144, "step": 29060 }, { "epoch": 4.32901400059577, "grad_norm": 1.0440218448638916, "learning_rate": 3.493778614542525e-05, "loss": 0.7333, "num_input_tokens_seen": 16759440, "step": 29065 }, { "epoch": 4.329758713136729, "grad_norm": 1.4788274765014648, "learning_rate": 3.493182248955439e-05, "loss": 0.5205, "num_input_tokens_seen": 16762320, "step": 29070 }, { "epoch": 4.330503425677689, "grad_norm": 1.0859888792037964, "learning_rate": 3.4925858162531354e-05, "loss": 0.6136, "num_input_tokens_seen": 16765456, "step": 29075 }, { "epoch": 4.331248138218648, "grad_norm": 1.2961245775222778, "learning_rate": 3.491989316475917e-05, "loss": 0.7193, "num_input_tokens_seen": 16768368, "step": 29080 }, { "epoch": 4.331992850759606, "grad_norm": 0.8121017217636108, "learning_rate": 3.491392749664094e-05, "loss": 0.6949, "num_input_tokens_seen": 16771248, "step": 29085 }, { "epoch": 4.332737563300566, "grad_norm": 2.28159236907959, "learning_rate": 3.49079611585798e-05, "loss": 0.6526, "num_input_tokens_seen": 16774288, "step": 29090 }, { "epoch": 4.333482275841525, "grad_norm": 1.079660415649414, "learning_rate": 3.490199415097892e-05, "loss": 0.5687, "num_input_tokens_seen": 16777008, "step": 29095 }, { "epoch": 4.334226988382484, "grad_norm": 1.2978912591934204, "learning_rate": 3.489602647424154e-05, "loss": 0.4832, "num_input_tokens_seen": 16779760, "step": 29100 }, { "epoch": 4.334971700923443, "grad_norm": 1.5787628889083862, "learning_rate": 3.489005812877093e-05, "loss": 0.6892, "num_input_tokens_seen": 16782512, "step": 29105 }, { "epoch": 4.335716413464403, "grad_norm": 2.2197675704956055, "learning_rate": 3.488408911497039e-05, "loss": 0.7491, "num_input_tokens_seen": 16785680, "step": 29110 }, { "epoch": 4.336461126005362, "grad_norm": 1.2537462711334229, "learning_rate": 3.48781194332433e-05, "loss": 0.6296, "num_input_tokens_seen": 16788592, "step": 29115 }, { "epoch": 4.337205838546321, "grad_norm": 1.9197310209274292, "learning_rate": 3.487214908399306e-05, "loss": 0.4884, "num_input_tokens_seen": 16791632, "step": 29120 }, { "epoch": 4.33795055108728, "grad_norm": 1.154168963432312, "learning_rate": 3.486617806762312e-05, "loss": 0.5969, "num_input_tokens_seen": 16794672, "step": 29125 }, { "epoch": 4.33869526362824, "grad_norm": 1.5362244844436646, "learning_rate": 3.486020638453698e-05, "loss": 0.4854, "num_input_tokens_seen": 16797328, "step": 29130 }, { "epoch": 4.339439976169198, "grad_norm": 1.2092117071151733, "learning_rate": 3.485423403513818e-05, "loss": 0.636, "num_input_tokens_seen": 16800048, "step": 29135 }, { "epoch": 4.340184688710158, "grad_norm": 1.625626564025879, "learning_rate": 3.484826101983031e-05, "loss": 0.6069, "num_input_tokens_seen": 16802832, "step": 29140 }, { "epoch": 4.340929401251117, "grad_norm": 1.0344550609588623, "learning_rate": 3.4842287339016997e-05, "loss": 0.4044, "num_input_tokens_seen": 16805712, "step": 29145 }, { "epoch": 4.3416741137920765, "grad_norm": 3.347791910171509, "learning_rate": 3.483631299310193e-05, "loss": 0.5318, "num_input_tokens_seen": 16808368, "step": 29150 }, { "epoch": 4.342418826333035, "grad_norm": 1.840003490447998, "learning_rate": 3.483033798248882e-05, "loss": 0.5924, "num_input_tokens_seen": 16811184, "step": 29155 }, { "epoch": 4.343163538873995, "grad_norm": 0.9088221788406372, "learning_rate": 3.4824362307581435e-05, "loss": 0.8635, "num_input_tokens_seen": 16814032, "step": 29160 }, { "epoch": 4.343908251414954, "grad_norm": 1.2502385377883911, "learning_rate": 3.4818385968783584e-05, "loss": 0.8118, "num_input_tokens_seen": 16817072, "step": 29165 }, { "epoch": 4.344652963955913, "grad_norm": 1.3415567874908447, "learning_rate": 3.481240896649913e-05, "loss": 0.5974, "num_input_tokens_seen": 16819888, "step": 29170 }, { "epoch": 4.345397676496872, "grad_norm": 1.9802626371383667, "learning_rate": 3.4806431301131974e-05, "loss": 0.5137, "num_input_tokens_seen": 16822736, "step": 29175 }, { "epoch": 4.346142389037832, "grad_norm": 1.0382537841796875, "learning_rate": 3.480045297308606e-05, "loss": 0.5444, "num_input_tokens_seen": 16825328, "step": 29180 }, { "epoch": 4.34688710157879, "grad_norm": 1.1828269958496094, "learning_rate": 3.479447398276538e-05, "loss": 0.6467, "num_input_tokens_seen": 16827984, "step": 29185 }, { "epoch": 4.34763181411975, "grad_norm": 1.6077680587768555, "learning_rate": 3.4788494330573965e-05, "loss": 0.5113, "num_input_tokens_seen": 16830512, "step": 29190 }, { "epoch": 4.348376526660709, "grad_norm": 1.5148814916610718, "learning_rate": 3.478251401691591e-05, "loss": 0.6126, "num_input_tokens_seen": 16833648, "step": 29195 }, { "epoch": 4.3491212392016685, "grad_norm": 1.3208973407745361, "learning_rate": 3.4776533042195324e-05, "loss": 0.6112, "num_input_tokens_seen": 16836816, "step": 29200 }, { "epoch": 4.349865951742627, "grad_norm": 1.6360130310058594, "learning_rate": 3.477055140681639e-05, "loss": 0.5232, "num_input_tokens_seen": 16839632, "step": 29205 }, { "epoch": 4.350610664283587, "grad_norm": 1.0458663702011108, "learning_rate": 3.4764569111183304e-05, "loss": 0.5103, "num_input_tokens_seen": 16842512, "step": 29210 }, { "epoch": 4.351355376824546, "grad_norm": 1.2183482646942139, "learning_rate": 3.475858615570035e-05, "loss": 0.4807, "num_input_tokens_seen": 16845200, "step": 29215 }, { "epoch": 4.352100089365505, "grad_norm": 3.5308549404144287, "learning_rate": 3.475260254077181e-05, "loss": 0.6999, "num_input_tokens_seen": 16848016, "step": 29220 }, { "epoch": 4.352844801906464, "grad_norm": 1.948617696762085, "learning_rate": 3.474661826680204e-05, "loss": 0.6364, "num_input_tokens_seen": 16850800, "step": 29225 }, { "epoch": 4.353589514447424, "grad_norm": 2.005265951156616, "learning_rate": 3.474063333419544e-05, "loss": 0.701, "num_input_tokens_seen": 16853680, "step": 29230 }, { "epoch": 4.3543342269883825, "grad_norm": 2.14080810546875, "learning_rate": 3.473464774335644e-05, "loss": 0.6618, "num_input_tokens_seen": 16856560, "step": 29235 }, { "epoch": 4.355078939529342, "grad_norm": 2.209437131881714, "learning_rate": 3.472866149468953e-05, "loss": 0.7022, "num_input_tokens_seen": 16859440, "step": 29240 }, { "epoch": 4.355823652070301, "grad_norm": 1.9206010103225708, "learning_rate": 3.472267458859922e-05, "loss": 0.6819, "num_input_tokens_seen": 16862288, "step": 29245 }, { "epoch": 4.35656836461126, "grad_norm": 3.971228837966919, "learning_rate": 3.47166870254901e-05, "loss": 0.8337, "num_input_tokens_seen": 16865008, "step": 29250 }, { "epoch": 4.357313077152219, "grad_norm": 0.9441725015640259, "learning_rate": 3.471069880576677e-05, "loss": 0.5464, "num_input_tokens_seen": 16867920, "step": 29255 }, { "epoch": 4.358057789693178, "grad_norm": 2.4231395721435547, "learning_rate": 3.470470992983389e-05, "loss": 0.5573, "num_input_tokens_seen": 16870736, "step": 29260 }, { "epoch": 4.358802502234138, "grad_norm": 1.6258941888809204, "learning_rate": 3.4698720398096176e-05, "loss": 0.7545, "num_input_tokens_seen": 16873552, "step": 29265 }, { "epoch": 4.359547214775096, "grad_norm": 1.487186074256897, "learning_rate": 3.4692730210958376e-05, "loss": 0.6296, "num_input_tokens_seen": 16876720, "step": 29270 }, { "epoch": 4.360291927316056, "grad_norm": 1.2016377449035645, "learning_rate": 3.468673936882527e-05, "loss": 0.6846, "num_input_tokens_seen": 16879632, "step": 29275 }, { "epoch": 4.361036639857015, "grad_norm": 2.802988290786743, "learning_rate": 3.46807478721017e-05, "loss": 0.7177, "num_input_tokens_seen": 16882576, "step": 29280 }, { "epoch": 4.3617813523979745, "grad_norm": 3.187098979949951, "learning_rate": 3.4674755721192555e-05, "loss": 0.7512, "num_input_tokens_seen": 16885392, "step": 29285 }, { "epoch": 4.362526064938933, "grad_norm": 1.1478791236877441, "learning_rate": 3.466876291650274e-05, "loss": 0.8173, "num_input_tokens_seen": 16888272, "step": 29290 }, { "epoch": 4.363270777479893, "grad_norm": 0.7685909271240234, "learning_rate": 3.466276945843725e-05, "loss": 0.7368, "num_input_tokens_seen": 16891248, "step": 29295 }, { "epoch": 4.364015490020852, "grad_norm": 1.4431802034378052, "learning_rate": 3.465677534740107e-05, "loss": 0.6744, "num_input_tokens_seen": 16894320, "step": 29300 }, { "epoch": 4.364760202561811, "grad_norm": 1.900193691253662, "learning_rate": 3.4650780583799294e-05, "loss": 0.6208, "num_input_tokens_seen": 16897072, "step": 29305 }, { "epoch": 4.36550491510277, "grad_norm": 1.7213249206542969, "learning_rate": 3.464478516803699e-05, "loss": 0.6064, "num_input_tokens_seen": 16899888, "step": 29310 }, { "epoch": 4.36624962764373, "grad_norm": 0.9233019351959229, "learning_rate": 3.463878910051932e-05, "loss": 0.738, "num_input_tokens_seen": 16902832, "step": 29315 }, { "epoch": 4.3669943401846885, "grad_norm": 1.273315191268921, "learning_rate": 3.4632792381651473e-05, "loss": 0.4932, "num_input_tokens_seen": 16905456, "step": 29320 }, { "epoch": 4.367739052725648, "grad_norm": 1.1710938215255737, "learning_rate": 3.462679501183867e-05, "loss": 0.6503, "num_input_tokens_seen": 16908400, "step": 29325 }, { "epoch": 4.368483765266607, "grad_norm": 1.8560082912445068, "learning_rate": 3.462079699148622e-05, "loss": 0.6919, "num_input_tokens_seen": 16911248, "step": 29330 }, { "epoch": 4.3692284778075665, "grad_norm": 2.025843620300293, "learning_rate": 3.4614798320999406e-05, "loss": 0.5599, "num_input_tokens_seen": 16914096, "step": 29335 }, { "epoch": 4.369973190348525, "grad_norm": 1.1293028593063354, "learning_rate": 3.4608799000783624e-05, "loss": 0.3906, "num_input_tokens_seen": 16917008, "step": 29340 }, { "epoch": 4.370717902889485, "grad_norm": 0.9097087383270264, "learning_rate": 3.460279903124427e-05, "loss": 0.5151, "num_input_tokens_seen": 16919728, "step": 29345 }, { "epoch": 4.371462615430444, "grad_norm": 1.5615450143814087, "learning_rate": 3.45967984127868e-05, "loss": 0.4903, "num_input_tokens_seen": 16922544, "step": 29350 }, { "epoch": 4.372207327971403, "grad_norm": 1.084999918937683, "learning_rate": 3.4590797145816714e-05, "loss": 0.6159, "num_input_tokens_seen": 16925584, "step": 29355 }, { "epoch": 4.372952040512362, "grad_norm": 1.5315186977386475, "learning_rate": 3.4584795230739535e-05, "loss": 0.4743, "num_input_tokens_seen": 16928400, "step": 29360 }, { "epoch": 4.373696753053322, "grad_norm": 1.86982262134552, "learning_rate": 3.457879266796087e-05, "loss": 0.5749, "num_input_tokens_seen": 16931312, "step": 29365 }, { "epoch": 4.3744414655942805, "grad_norm": 2.9033422470092773, "learning_rate": 3.457278945788635e-05, "loss": 0.6479, "num_input_tokens_seen": 16934160, "step": 29370 }, { "epoch": 4.37518617813524, "grad_norm": 1.5077537298202515, "learning_rate": 3.456678560092164e-05, "loss": 0.6121, "num_input_tokens_seen": 16937136, "step": 29375 }, { "epoch": 4.375930890676199, "grad_norm": 1.7723209857940674, "learning_rate": 3.4560781097472436e-05, "loss": 0.6755, "num_input_tokens_seen": 16939888, "step": 29380 }, { "epoch": 4.3766756032171585, "grad_norm": 1.2297860383987427, "learning_rate": 3.455477594794454e-05, "loss": 0.4338, "num_input_tokens_seen": 16942448, "step": 29385 }, { "epoch": 4.377420315758117, "grad_norm": 1.2212754487991333, "learning_rate": 3.454877015274371e-05, "loss": 0.5165, "num_input_tokens_seen": 16945552, "step": 29390 }, { "epoch": 4.378165028299077, "grad_norm": 1.585457444190979, "learning_rate": 3.4542763712275836e-05, "loss": 0.7034, "num_input_tokens_seen": 16948400, "step": 29395 }, { "epoch": 4.378909740840036, "grad_norm": 2.125257730484009, "learning_rate": 3.453675662694677e-05, "loss": 0.7843, "num_input_tokens_seen": 16951376, "step": 29400 }, { "epoch": 4.379654453380995, "grad_norm": 1.9433404207229614, "learning_rate": 3.453074889716248e-05, "loss": 0.5677, "num_input_tokens_seen": 16954288, "step": 29405 }, { "epoch": 4.380399165921954, "grad_norm": 3.050076723098755, "learning_rate": 3.452474052332891e-05, "loss": 0.6665, "num_input_tokens_seen": 16957136, "step": 29410 }, { "epoch": 4.381143878462913, "grad_norm": 1.0626225471496582, "learning_rate": 3.451873150585212e-05, "loss": 0.6167, "num_input_tokens_seen": 16960208, "step": 29415 }, { "epoch": 4.3818885910038725, "grad_norm": 1.8615628480911255, "learning_rate": 3.451272184513815e-05, "loss": 0.7381, "num_input_tokens_seen": 16963312, "step": 29420 }, { "epoch": 4.382633303544832, "grad_norm": 1.4128413200378418, "learning_rate": 3.4506711541593107e-05, "loss": 0.5426, "num_input_tokens_seen": 16966032, "step": 29425 }, { "epoch": 4.383378016085791, "grad_norm": 3.6848747730255127, "learning_rate": 3.450070059562315e-05, "loss": 0.7846, "num_input_tokens_seen": 16968880, "step": 29430 }, { "epoch": 4.38412272862675, "grad_norm": 1.6551353931427002, "learning_rate": 3.449468900763448e-05, "loss": 0.7626, "num_input_tokens_seen": 16971536, "step": 29435 }, { "epoch": 4.384867441167709, "grad_norm": 1.3852849006652832, "learning_rate": 3.448867677803333e-05, "loss": 0.5185, "num_input_tokens_seen": 16974480, "step": 29440 }, { "epoch": 4.385612153708668, "grad_norm": 2.1593687534332275, "learning_rate": 3.4482663907225975e-05, "loss": 0.6385, "num_input_tokens_seen": 16977584, "step": 29445 }, { "epoch": 4.386356866249628, "grad_norm": 1.7720500230789185, "learning_rate": 3.447665039561875e-05, "loss": 0.7506, "num_input_tokens_seen": 16980592, "step": 29450 }, { "epoch": 4.3871015787905865, "grad_norm": 1.407883882522583, "learning_rate": 3.4470636243618026e-05, "loss": 0.6819, "num_input_tokens_seen": 16983408, "step": 29455 }, { "epoch": 4.387846291331546, "grad_norm": 1.3565360307693481, "learning_rate": 3.44646214516302e-05, "loss": 0.7028, "num_input_tokens_seen": 16986384, "step": 29460 }, { "epoch": 4.388591003872505, "grad_norm": 0.8363034129142761, "learning_rate": 3.4458606020061744e-05, "loss": 0.6029, "num_input_tokens_seen": 16989200, "step": 29465 }, { "epoch": 4.3893357164134645, "grad_norm": 1.2346179485321045, "learning_rate": 3.445258994931915e-05, "loss": 0.4749, "num_input_tokens_seen": 16991760, "step": 29470 }, { "epoch": 4.390080428954423, "grad_norm": 2.099297523498535, "learning_rate": 3.444657323980895e-05, "loss": 0.6674, "num_input_tokens_seen": 16994576, "step": 29475 }, { "epoch": 4.390825141495383, "grad_norm": 1.3384740352630615, "learning_rate": 3.444055589193774e-05, "loss": 0.6033, "num_input_tokens_seen": 16997296, "step": 29480 }, { "epoch": 4.391569854036342, "grad_norm": 1.0249054431915283, "learning_rate": 3.443453790611215e-05, "loss": 0.5264, "num_input_tokens_seen": 17000112, "step": 29485 }, { "epoch": 4.392314566577301, "grad_norm": 1.8126919269561768, "learning_rate": 3.442851928273884e-05, "loss": 0.746, "num_input_tokens_seen": 17003280, "step": 29490 }, { "epoch": 4.39305927911826, "grad_norm": 2.252285957336426, "learning_rate": 3.4422500022224536e-05, "loss": 0.6559, "num_input_tokens_seen": 17006096, "step": 29495 }, { "epoch": 4.39380399165922, "grad_norm": 3.484584093093872, "learning_rate": 3.4416480124975995e-05, "loss": 0.8191, "num_input_tokens_seen": 17009136, "step": 29500 }, { "epoch": 4.3945487042001785, "grad_norm": 2.6096835136413574, "learning_rate": 3.44104595914e-05, "loss": 0.6571, "num_input_tokens_seen": 17012272, "step": 29505 }, { "epoch": 4.395293416741138, "grad_norm": 1.3488035202026367, "learning_rate": 3.440443842190341e-05, "loss": 0.7353, "num_input_tokens_seen": 17014832, "step": 29510 }, { "epoch": 4.396038129282097, "grad_norm": 2.266880512237549, "learning_rate": 3.439841661689311e-05, "loss": 0.6134, "num_input_tokens_seen": 17017456, "step": 29515 }, { "epoch": 4.396782841823057, "grad_norm": 1.9517186880111694, "learning_rate": 3.439239417677602e-05, "loss": 0.5911, "num_input_tokens_seen": 17020400, "step": 29520 }, { "epoch": 4.397527554364015, "grad_norm": 1.7227518558502197, "learning_rate": 3.4386371101959125e-05, "loss": 0.6878, "num_input_tokens_seen": 17023248, "step": 29525 }, { "epoch": 4.398272266904975, "grad_norm": 1.3910984992980957, "learning_rate": 3.4380347392849424e-05, "loss": 0.6914, "num_input_tokens_seen": 17026128, "step": 29530 }, { "epoch": 4.399016979445934, "grad_norm": 1.9950923919677734, "learning_rate": 3.4374323049854e-05, "loss": 0.6387, "num_input_tokens_seen": 17028880, "step": 29535 }, { "epoch": 4.399761691986893, "grad_norm": 2.020557165145874, "learning_rate": 3.436829807337992e-05, "loss": 0.6897, "num_input_tokens_seen": 17031792, "step": 29540 }, { "epoch": 4.400506404527852, "grad_norm": 1.308791995048523, "learning_rate": 3.436227246383435e-05, "loss": 0.5788, "num_input_tokens_seen": 17034576, "step": 29545 }, { "epoch": 4.401251117068812, "grad_norm": 2.5310919284820557, "learning_rate": 3.435624622162448e-05, "loss": 0.5971, "num_input_tokens_seen": 17037552, "step": 29550 }, { "epoch": 4.4019958296097705, "grad_norm": 1.1389068365097046, "learning_rate": 3.435021934715752e-05, "loss": 0.6369, "num_input_tokens_seen": 17040400, "step": 29555 }, { "epoch": 4.40274054215073, "grad_norm": 1.048936367034912, "learning_rate": 3.4344191840840755e-05, "loss": 0.5033, "num_input_tokens_seen": 17043120, "step": 29560 }, { "epoch": 4.403485254691689, "grad_norm": 1.0227949619293213, "learning_rate": 3.4338163703081495e-05, "loss": 0.6519, "num_input_tokens_seen": 17046064, "step": 29565 }, { "epoch": 4.404229967232649, "grad_norm": 1.5309699773788452, "learning_rate": 3.43321349342871e-05, "loss": 0.5726, "num_input_tokens_seen": 17048976, "step": 29570 }, { "epoch": 4.404974679773607, "grad_norm": 1.260816216468811, "learning_rate": 3.432610553486497e-05, "loss": 0.563, "num_input_tokens_seen": 17051600, "step": 29575 }, { "epoch": 4.405719392314566, "grad_norm": 1.7415019273757935, "learning_rate": 3.432007550522254e-05, "loss": 0.7037, "num_input_tokens_seen": 17054512, "step": 29580 }, { "epoch": 4.406464104855526, "grad_norm": 1.8070168495178223, "learning_rate": 3.431404484576731e-05, "loss": 0.6137, "num_input_tokens_seen": 17057168, "step": 29585 }, { "epoch": 4.407208817396485, "grad_norm": 1.6022429466247559, "learning_rate": 3.430801355690679e-05, "loss": 0.6111, "num_input_tokens_seen": 17060016, "step": 29590 }, { "epoch": 4.407953529937444, "grad_norm": 3.2878031730651855, "learning_rate": 3.430198163904855e-05, "loss": 0.6116, "num_input_tokens_seen": 17062736, "step": 29595 }, { "epoch": 4.408698242478403, "grad_norm": 1.5368343591690063, "learning_rate": 3.429594909260023e-05, "loss": 0.712, "num_input_tokens_seen": 17065456, "step": 29600 }, { "epoch": 4.409442955019363, "grad_norm": 2.0192625522613525, "learning_rate": 3.428991591796944e-05, "loss": 0.719, "num_input_tokens_seen": 17068560, "step": 29605 }, { "epoch": 4.410187667560321, "grad_norm": 1.348552942276001, "learning_rate": 3.428388211556391e-05, "loss": 0.5054, "num_input_tokens_seen": 17071472, "step": 29610 }, { "epoch": 4.410932380101281, "grad_norm": 2.165703296661377, "learning_rate": 3.4277847685791384e-05, "loss": 0.6928, "num_input_tokens_seen": 17074480, "step": 29615 }, { "epoch": 4.41167709264224, "grad_norm": 1.4329071044921875, "learning_rate": 3.427181262905963e-05, "loss": 0.5924, "num_input_tokens_seen": 17077328, "step": 29620 }, { "epoch": 4.412421805183199, "grad_norm": 2.1066036224365234, "learning_rate": 3.4265776945776464e-05, "loss": 0.623, "num_input_tokens_seen": 17080368, "step": 29625 }, { "epoch": 4.413166517724158, "grad_norm": 1.444883942604065, "learning_rate": 3.425974063634977e-05, "loss": 0.5262, "num_input_tokens_seen": 17083088, "step": 29630 }, { "epoch": 4.413911230265118, "grad_norm": 1.7865253686904907, "learning_rate": 3.4253703701187455e-05, "loss": 0.7615, "num_input_tokens_seen": 17086160, "step": 29635 }, { "epoch": 4.4146559428060765, "grad_norm": 1.4586387872695923, "learning_rate": 3.4247666140697466e-05, "loss": 0.6589, "num_input_tokens_seen": 17089008, "step": 29640 }, { "epoch": 4.415400655347036, "grad_norm": 2.7080037593841553, "learning_rate": 3.424162795528779e-05, "loss": 0.7997, "num_input_tokens_seen": 17091888, "step": 29645 }, { "epoch": 4.416145367887995, "grad_norm": 1.280082106590271, "learning_rate": 3.423558914536648e-05, "loss": 0.7523, "num_input_tokens_seen": 17094768, "step": 29650 }, { "epoch": 4.416890080428955, "grad_norm": 1.6408511400222778, "learning_rate": 3.42295497113416e-05, "loss": 0.5216, "num_input_tokens_seen": 17097616, "step": 29655 }, { "epoch": 4.417634792969913, "grad_norm": 3.2502548694610596, "learning_rate": 3.4223509653621275e-05, "loss": 0.7173, "num_input_tokens_seen": 17100432, "step": 29660 }, { "epoch": 4.418379505510873, "grad_norm": 2.4968559741973877, "learning_rate": 3.421746897261367e-05, "loss": 0.6831, "num_input_tokens_seen": 17103248, "step": 29665 }, { "epoch": 4.419124218051832, "grad_norm": 1.0419667959213257, "learning_rate": 3.421142766872698e-05, "loss": 0.5916, "num_input_tokens_seen": 17106256, "step": 29670 }, { "epoch": 4.419868930592791, "grad_norm": 1.9823641777038574, "learning_rate": 3.420538574236946e-05, "loss": 0.4979, "num_input_tokens_seen": 17109168, "step": 29675 }, { "epoch": 4.42061364313375, "grad_norm": 1.381321668624878, "learning_rate": 3.4199343193949404e-05, "loss": 0.5009, "num_input_tokens_seen": 17112048, "step": 29680 }, { "epoch": 4.42135835567471, "grad_norm": 1.3511451482772827, "learning_rate": 3.419330002387514e-05, "loss": 0.69, "num_input_tokens_seen": 17114832, "step": 29685 }, { "epoch": 4.422103068215669, "grad_norm": 2.3238208293914795, "learning_rate": 3.418725623255503e-05, "loss": 0.5326, "num_input_tokens_seen": 17117360, "step": 29690 }, { "epoch": 4.422847780756628, "grad_norm": 4.258671283721924, "learning_rate": 3.418121182039749e-05, "loss": 0.7524, "num_input_tokens_seen": 17120464, "step": 29695 }, { "epoch": 4.423592493297587, "grad_norm": 1.3730939626693726, "learning_rate": 3.4175166787811004e-05, "loss": 0.6243, "num_input_tokens_seen": 17123312, "step": 29700 }, { "epoch": 4.424337205838547, "grad_norm": 2.1321818828582764, "learning_rate": 3.416912113520403e-05, "loss": 0.6117, "num_input_tokens_seen": 17126192, "step": 29705 }, { "epoch": 4.425081918379505, "grad_norm": 1.0972943305969238, "learning_rate": 3.416307486298513e-05, "loss": 0.6212, "num_input_tokens_seen": 17128848, "step": 29710 }, { "epoch": 4.425826630920465, "grad_norm": 1.2042770385742188, "learning_rate": 3.4157027971562897e-05, "loss": 0.5282, "num_input_tokens_seen": 17131792, "step": 29715 }, { "epoch": 4.426571343461424, "grad_norm": 1.5430691242218018, "learning_rate": 3.4150980461345945e-05, "loss": 0.5202, "num_input_tokens_seen": 17134416, "step": 29720 }, { "epoch": 4.427316056002383, "grad_norm": 1.3914604187011719, "learning_rate": 3.414493233274293e-05, "loss": 0.569, "num_input_tokens_seen": 17137712, "step": 29725 }, { "epoch": 4.428060768543342, "grad_norm": 2.8566527366638184, "learning_rate": 3.413888358616256e-05, "loss": 0.6108, "num_input_tokens_seen": 17140528, "step": 29730 }, { "epoch": 4.428805481084302, "grad_norm": 1.4770420789718628, "learning_rate": 3.413283422201361e-05, "loss": 0.5899, "num_input_tokens_seen": 17143568, "step": 29735 }, { "epoch": 4.429550193625261, "grad_norm": 2.1429619789123535, "learning_rate": 3.412678424070485e-05, "loss": 0.6362, "num_input_tokens_seen": 17146480, "step": 29740 }, { "epoch": 4.43029490616622, "grad_norm": 2.6190152168273926, "learning_rate": 3.4120733642645114e-05, "loss": 0.5479, "num_input_tokens_seen": 17149424, "step": 29745 }, { "epoch": 4.431039618707179, "grad_norm": 1.1842381954193115, "learning_rate": 3.411468242824328e-05, "loss": 0.5536, "num_input_tokens_seen": 17152336, "step": 29750 }, { "epoch": 4.431784331248139, "grad_norm": 2.3673300743103027, "learning_rate": 3.410863059790827e-05, "loss": 0.5773, "num_input_tokens_seen": 17155152, "step": 29755 }, { "epoch": 4.432529043789097, "grad_norm": 2.0943822860717773, "learning_rate": 3.4102578152049035e-05, "loss": 0.5171, "num_input_tokens_seen": 17157936, "step": 29760 }, { "epoch": 4.433273756330056, "grad_norm": 1.4418296813964844, "learning_rate": 3.4096525091074585e-05, "loss": 0.5719, "num_input_tokens_seen": 17160880, "step": 29765 }, { "epoch": 4.434018468871016, "grad_norm": 1.4165891408920288, "learning_rate": 3.409047141539394e-05, "loss": 0.5889, "num_input_tokens_seen": 17163664, "step": 29770 }, { "epoch": 4.434763181411975, "grad_norm": 2.0650746822357178, "learning_rate": 3.40844171254162e-05, "loss": 0.5192, "num_input_tokens_seen": 17166640, "step": 29775 }, { "epoch": 4.435507893952934, "grad_norm": 1.2970529794692993, "learning_rate": 3.4078362221550485e-05, "loss": 0.6761, "num_input_tokens_seen": 17169712, "step": 29780 }, { "epoch": 4.436252606493893, "grad_norm": 1.0350555181503296, "learning_rate": 3.4072306704205966e-05, "loss": 0.52, "num_input_tokens_seen": 17172752, "step": 29785 }, { "epoch": 4.436997319034853, "grad_norm": 2.426912784576416, "learning_rate": 3.4066250573791834e-05, "loss": 0.9299, "num_input_tokens_seen": 17175216, "step": 29790 }, { "epoch": 4.437742031575811, "grad_norm": 4.611255645751953, "learning_rate": 3.4060193830717355e-05, "loss": 0.7296, "num_input_tokens_seen": 17177936, "step": 29795 }, { "epoch": 4.438486744116771, "grad_norm": 1.9863609075546265, "learning_rate": 3.405413647539182e-05, "loss": 0.6115, "num_input_tokens_seen": 17180784, "step": 29800 }, { "epoch": 4.43923145665773, "grad_norm": 3.0601274967193604, "learning_rate": 3.404807850822455e-05, "loss": 0.7405, "num_input_tokens_seen": 17184016, "step": 29805 }, { "epoch": 4.439976169198689, "grad_norm": 0.9094407558441162, "learning_rate": 3.4042019929624916e-05, "loss": 0.4528, "num_input_tokens_seen": 17186992, "step": 29810 }, { "epoch": 4.440720881739648, "grad_norm": 3.654923677444458, "learning_rate": 3.403596074000234e-05, "loss": 0.853, "num_input_tokens_seen": 17190096, "step": 29815 }, { "epoch": 4.441465594280608, "grad_norm": 4.5567402839660645, "learning_rate": 3.402990093976628e-05, "loss": 0.6689, "num_input_tokens_seen": 17193040, "step": 29820 }, { "epoch": 4.442210306821567, "grad_norm": 1.6912894248962402, "learning_rate": 3.402384052932622e-05, "loss": 0.6833, "num_input_tokens_seen": 17195856, "step": 29825 }, { "epoch": 4.442955019362526, "grad_norm": 1.2238507270812988, "learning_rate": 3.4017779509091705e-05, "loss": 0.7742, "num_input_tokens_seen": 17198800, "step": 29830 }, { "epoch": 4.443699731903485, "grad_norm": 1.48432457447052, "learning_rate": 3.4011717879472315e-05, "loss": 0.6537, "num_input_tokens_seen": 17201776, "step": 29835 }, { "epoch": 4.444444444444445, "grad_norm": 1.5822080373764038, "learning_rate": 3.400565564087767e-05, "loss": 0.8067, "num_input_tokens_seen": 17204464, "step": 29840 }, { "epoch": 4.445189156985403, "grad_norm": 1.791651964187622, "learning_rate": 3.399959279371743e-05, "loss": 0.6989, "num_input_tokens_seen": 17207280, "step": 29845 }, { "epoch": 4.445933869526363, "grad_norm": 1.3880912065505981, "learning_rate": 3.399352933840131e-05, "loss": 0.6915, "num_input_tokens_seen": 17210032, "step": 29850 }, { "epoch": 4.446678582067322, "grad_norm": 2.769266366958618, "learning_rate": 3.3987465275339034e-05, "loss": 0.4804, "num_input_tokens_seen": 17212720, "step": 29855 }, { "epoch": 4.4474232946082815, "grad_norm": 1.8815631866455078, "learning_rate": 3.3981400604940393e-05, "loss": 0.6498, "num_input_tokens_seen": 17215344, "step": 29860 }, { "epoch": 4.44816800714924, "grad_norm": 1.6381829977035522, "learning_rate": 3.397533532761522e-05, "loss": 0.5714, "num_input_tokens_seen": 17218320, "step": 29865 }, { "epoch": 4.4489127196902, "grad_norm": 1.5935933589935303, "learning_rate": 3.3969269443773364e-05, "loss": 0.8035, "num_input_tokens_seen": 17221264, "step": 29870 }, { "epoch": 4.449657432231159, "grad_norm": 2.024287700653076, "learning_rate": 3.396320295382476e-05, "loss": 0.6323, "num_input_tokens_seen": 17224304, "step": 29875 }, { "epoch": 4.450402144772118, "grad_norm": 1.8193846940994263, "learning_rate": 3.3957135858179335e-05, "loss": 0.5994, "num_input_tokens_seen": 17227088, "step": 29880 }, { "epoch": 4.451146857313077, "grad_norm": 1.2583885192871094, "learning_rate": 3.395106815724709e-05, "loss": 0.498, "num_input_tokens_seen": 17229776, "step": 29885 }, { "epoch": 4.451891569854037, "grad_norm": 1.122167944908142, "learning_rate": 3.3944999851438045e-05, "loss": 0.5698, "num_input_tokens_seen": 17232752, "step": 29890 }, { "epoch": 4.452636282394995, "grad_norm": 1.360169768333435, "learning_rate": 3.3938930941162285e-05, "loss": 0.6002, "num_input_tokens_seen": 17235856, "step": 29895 }, { "epoch": 4.453380994935955, "grad_norm": 2.2178280353546143, "learning_rate": 3.393286142682991e-05, "loss": 0.6628, "num_input_tokens_seen": 17238992, "step": 29900 }, { "epoch": 4.454125707476914, "grad_norm": 1.730901837348938, "learning_rate": 3.392679130885108e-05, "loss": 0.621, "num_input_tokens_seen": 17242064, "step": 29905 }, { "epoch": 4.4548704200178735, "grad_norm": 1.3415873050689697, "learning_rate": 3.392072058763598e-05, "loss": 0.7097, "num_input_tokens_seen": 17244816, "step": 29910 }, { "epoch": 4.455615132558832, "grad_norm": 2.6377270221710205, "learning_rate": 3.391464926359487e-05, "loss": 0.6064, "num_input_tokens_seen": 17247984, "step": 29915 }, { "epoch": 4.456359845099792, "grad_norm": 1.8721091747283936, "learning_rate": 3.390857733713799e-05, "loss": 0.5862, "num_input_tokens_seen": 17250992, "step": 29920 }, { "epoch": 4.457104557640751, "grad_norm": 0.9190306067466736, "learning_rate": 3.3902504808675684e-05, "loss": 0.4937, "num_input_tokens_seen": 17254128, "step": 29925 }, { "epoch": 4.457849270181709, "grad_norm": 1.6723521947860718, "learning_rate": 3.389643167861829e-05, "loss": 0.5249, "num_input_tokens_seen": 17257200, "step": 29930 }, { "epoch": 4.458593982722669, "grad_norm": 1.7343034744262695, "learning_rate": 3.3890357947376216e-05, "loss": 0.803, "num_input_tokens_seen": 17260112, "step": 29935 }, { "epoch": 4.459338695263629, "grad_norm": 1.3044214248657227, "learning_rate": 3.38842836153599e-05, "loss": 0.5105, "num_input_tokens_seen": 17263024, "step": 29940 }, { "epoch": 4.4600834078045875, "grad_norm": 1.5497145652770996, "learning_rate": 3.3878208682979815e-05, "loss": 0.5213, "num_input_tokens_seen": 17265712, "step": 29945 }, { "epoch": 4.460828120345546, "grad_norm": 2.742427110671997, "learning_rate": 3.3872133150646484e-05, "loss": 0.7408, "num_input_tokens_seen": 17268688, "step": 29950 }, { "epoch": 4.461572832886506, "grad_norm": 3.212148904800415, "learning_rate": 3.386605701877047e-05, "loss": 0.647, "num_input_tokens_seen": 17271824, "step": 29955 }, { "epoch": 4.462317545427465, "grad_norm": 1.3341647386550903, "learning_rate": 3.3859980287762364e-05, "loss": 0.6444, "num_input_tokens_seen": 17274832, "step": 29960 }, { "epoch": 4.463062257968424, "grad_norm": 1.1331592798233032, "learning_rate": 3.385390295803281e-05, "loss": 0.6635, "num_input_tokens_seen": 17277776, "step": 29965 }, { "epoch": 4.463806970509383, "grad_norm": 1.1332203149795532, "learning_rate": 3.3847825029992495e-05, "loss": 0.5981, "num_input_tokens_seen": 17280240, "step": 29970 }, { "epoch": 4.464551683050343, "grad_norm": 0.8549298644065857, "learning_rate": 3.384174650405213e-05, "loss": 0.7339, "num_input_tokens_seen": 17283184, "step": 29975 }, { "epoch": 4.465296395591301, "grad_norm": 3.042160749435425, "learning_rate": 3.3835667380622497e-05, "loss": 0.613, "num_input_tokens_seen": 17285936, "step": 29980 }, { "epoch": 4.466041108132261, "grad_norm": 1.2396677732467651, "learning_rate": 3.382958766011439e-05, "loss": 0.5972, "num_input_tokens_seen": 17289008, "step": 29985 }, { "epoch": 4.46678582067322, "grad_norm": 1.0055620670318604, "learning_rate": 3.3823507342938634e-05, "loss": 0.5784, "num_input_tokens_seen": 17292080, "step": 29990 }, { "epoch": 4.4675305332141795, "grad_norm": 1.5809029340744019, "learning_rate": 3.381742642950612e-05, "loss": 0.6601, "num_input_tokens_seen": 17295056, "step": 29995 }, { "epoch": 4.468275245755138, "grad_norm": 1.1623432636260986, "learning_rate": 3.3811344920227795e-05, "loss": 0.5502, "num_input_tokens_seen": 17298224, "step": 30000 }, { "epoch": 4.469019958296098, "grad_norm": 1.5625803470611572, "learning_rate": 3.3805262815514596e-05, "loss": 0.6928, "num_input_tokens_seen": 17300976, "step": 30005 }, { "epoch": 4.469764670837057, "grad_norm": 1.770694375038147, "learning_rate": 3.379918011577753e-05, "loss": 0.5634, "num_input_tokens_seen": 17304048, "step": 30010 }, { "epoch": 4.470509383378016, "grad_norm": 1.3553451299667358, "learning_rate": 3.379309682142766e-05, "loss": 0.6148, "num_input_tokens_seen": 17306896, "step": 30015 }, { "epoch": 4.471254095918975, "grad_norm": 1.5923429727554321, "learning_rate": 3.3787012932876036e-05, "loss": 0.544, "num_input_tokens_seen": 17309776, "step": 30020 }, { "epoch": 4.471998808459935, "grad_norm": 1.2028899192810059, "learning_rate": 3.378092845053382e-05, "loss": 0.4779, "num_input_tokens_seen": 17312784, "step": 30025 }, { "epoch": 4.4727435210008935, "grad_norm": 1.1675026416778564, "learning_rate": 3.377484337481216e-05, "loss": 0.8078, "num_input_tokens_seen": 17315440, "step": 30030 }, { "epoch": 4.473488233541853, "grad_norm": 1.7920167446136475, "learning_rate": 3.376875770612226e-05, "loss": 0.5303, "num_input_tokens_seen": 17318256, "step": 30035 }, { "epoch": 4.474232946082812, "grad_norm": 1.1970369815826416, "learning_rate": 3.376267144487535e-05, "loss": 0.7207, "num_input_tokens_seen": 17321168, "step": 30040 }, { "epoch": 4.4749776586237715, "grad_norm": 1.3446674346923828, "learning_rate": 3.375658459148275e-05, "loss": 0.6616, "num_input_tokens_seen": 17324112, "step": 30045 }, { "epoch": 4.47572237116473, "grad_norm": 1.2594913244247437, "learning_rate": 3.375049714635577e-05, "loss": 0.672, "num_input_tokens_seen": 17327152, "step": 30050 }, { "epoch": 4.47646708370569, "grad_norm": 1.3003520965576172, "learning_rate": 3.374440910990574e-05, "loss": 0.6239, "num_input_tokens_seen": 17330224, "step": 30055 }, { "epoch": 4.477211796246649, "grad_norm": 2.1516613960266113, "learning_rate": 3.3738320482544116e-05, "loss": 0.6272, "num_input_tokens_seen": 17333104, "step": 30060 }, { "epoch": 4.477956508787608, "grad_norm": 1.4353443384170532, "learning_rate": 3.3732231264682326e-05, "loss": 0.6888, "num_input_tokens_seen": 17335824, "step": 30065 }, { "epoch": 4.478701221328567, "grad_norm": 1.4534001350402832, "learning_rate": 3.3726141456731835e-05, "loss": 0.546, "num_input_tokens_seen": 17338544, "step": 30070 }, { "epoch": 4.479445933869527, "grad_norm": 1.209509015083313, "learning_rate": 3.3720051059104186e-05, "loss": 0.532, "num_input_tokens_seen": 17341488, "step": 30075 }, { "epoch": 4.4801906464104855, "grad_norm": 1.0434712171554565, "learning_rate": 3.371396007221094e-05, "loss": 0.5864, "num_input_tokens_seen": 17344688, "step": 30080 }, { "epoch": 4.480935358951445, "grad_norm": 0.721028208732605, "learning_rate": 3.3707868496463705e-05, "loss": 0.6577, "num_input_tokens_seen": 17347472, "step": 30085 }, { "epoch": 4.481680071492404, "grad_norm": 2.8568317890167236, "learning_rate": 3.3701776332274116e-05, "loss": 0.6907, "num_input_tokens_seen": 17350448, "step": 30090 }, { "epoch": 4.4824247840333635, "grad_norm": 1.5116853713989258, "learning_rate": 3.3695683580053865e-05, "loss": 0.7012, "num_input_tokens_seen": 17353328, "step": 30095 }, { "epoch": 4.483169496574322, "grad_norm": 2.561011552810669, "learning_rate": 3.368959024021467e-05, "loss": 0.6231, "num_input_tokens_seen": 17356112, "step": 30100 }, { "epoch": 4.483914209115282, "grad_norm": 1.6938631534576416, "learning_rate": 3.3683496313168294e-05, "loss": 0.7139, "num_input_tokens_seen": 17358896, "step": 30105 }, { "epoch": 4.484658921656241, "grad_norm": 1.1138306856155396, "learning_rate": 3.367740179932655e-05, "loss": 0.7087, "num_input_tokens_seen": 17361680, "step": 30110 }, { "epoch": 4.4854036341971995, "grad_norm": 2.332815647125244, "learning_rate": 3.3671306699101266e-05, "loss": 0.607, "num_input_tokens_seen": 17364432, "step": 30115 }, { "epoch": 4.486148346738159, "grad_norm": 1.186801552772522, "learning_rate": 3.3665211012904324e-05, "loss": 0.7231, "num_input_tokens_seen": 17367152, "step": 30120 }, { "epoch": 4.486893059279118, "grad_norm": 0.5564373731613159, "learning_rate": 3.365911474114766e-05, "loss": 0.5611, "num_input_tokens_seen": 17370192, "step": 30125 }, { "epoch": 4.4876377718200775, "grad_norm": 1.695115566253662, "learning_rate": 3.3653017884243224e-05, "loss": 0.5897, "num_input_tokens_seen": 17373136, "step": 30130 }, { "epoch": 4.488382484361036, "grad_norm": 1.4421296119689941, "learning_rate": 3.364692044260302e-05, "loss": 0.5394, "num_input_tokens_seen": 17375920, "step": 30135 }, { "epoch": 4.489127196901996, "grad_norm": 1.8554631471633911, "learning_rate": 3.3640822416639086e-05, "loss": 0.4892, "num_input_tokens_seen": 17378672, "step": 30140 }, { "epoch": 4.489871909442955, "grad_norm": 1.236509084701538, "learning_rate": 3.363472380676351e-05, "loss": 0.6237, "num_input_tokens_seen": 17381552, "step": 30145 }, { "epoch": 4.490616621983914, "grad_norm": 0.7418465614318848, "learning_rate": 3.3628624613388407e-05, "loss": 0.6153, "num_input_tokens_seen": 17384400, "step": 30150 }, { "epoch": 4.491361334524873, "grad_norm": 1.5812039375305176, "learning_rate": 3.362252483692593e-05, "loss": 0.598, "num_input_tokens_seen": 17387152, "step": 30155 }, { "epoch": 4.492106047065833, "grad_norm": 1.3895502090454102, "learning_rate": 3.361642447778828e-05, "loss": 0.5036, "num_input_tokens_seen": 17389680, "step": 30160 }, { "epoch": 4.4928507596067915, "grad_norm": 1.7641165256500244, "learning_rate": 3.36103235363877e-05, "loss": 0.4774, "num_input_tokens_seen": 17392304, "step": 30165 }, { "epoch": 4.493595472147751, "grad_norm": 1.0412784814834595, "learning_rate": 3.360422201313646e-05, "loss": 0.6263, "num_input_tokens_seen": 17395184, "step": 30170 }, { "epoch": 4.49434018468871, "grad_norm": 1.1230639219284058, "learning_rate": 3.3598119908446866e-05, "loss": 0.4619, "num_input_tokens_seen": 17398320, "step": 30175 }, { "epoch": 4.4950848972296695, "grad_norm": 1.3865635395050049, "learning_rate": 3.3592017222731304e-05, "loss": 0.8168, "num_input_tokens_seen": 17401520, "step": 30180 }, { "epoch": 4.495829609770628, "grad_norm": 1.6523804664611816, "learning_rate": 3.358591395640215e-05, "loss": 0.6596, "num_input_tokens_seen": 17404496, "step": 30185 }, { "epoch": 4.496574322311588, "grad_norm": 1.5477185249328613, "learning_rate": 3.357981010987183e-05, "loss": 0.6613, "num_input_tokens_seen": 17407728, "step": 30190 }, { "epoch": 4.497319034852547, "grad_norm": 1.162202000617981, "learning_rate": 3.3573705683552824e-05, "loss": 0.6607, "num_input_tokens_seen": 17410832, "step": 30195 }, { "epoch": 4.498063747393506, "grad_norm": 1.2736390829086304, "learning_rate": 3.356760067785765e-05, "loss": 0.8879, "num_input_tokens_seen": 17413552, "step": 30200 }, { "epoch": 4.498808459934465, "grad_norm": 1.4525269269943237, "learning_rate": 3.356149509319886e-05, "loss": 0.5839, "num_input_tokens_seen": 17416464, "step": 30205 }, { "epoch": 4.499553172475425, "grad_norm": 1.1639130115509033, "learning_rate": 3.355538892998904e-05, "loss": 0.6846, "num_input_tokens_seen": 17419088, "step": 30210 }, { "epoch": 4.5, "eval_loss": 0.6573360562324524, "eval_runtime": 74.2726, "eval_samples_per_second": 40.176, "eval_steps_per_second": 10.044, "num_input_tokens_seen": 17420720, "step": 30213 }, { "epoch": 4.5002978850163835, "grad_norm": 0.9593135118484497, "learning_rate": 3.3549282188640815e-05, "loss": 0.5492, "num_input_tokens_seen": 17421968, "step": 30215 }, { "epoch": 4.501042597557343, "grad_norm": 1.662459373474121, "learning_rate": 3.354317486956685e-05, "loss": 0.6098, "num_input_tokens_seen": 17424752, "step": 30220 }, { "epoch": 4.501787310098302, "grad_norm": 3.088096857070923, "learning_rate": 3.353706697317988e-05, "loss": 0.5951, "num_input_tokens_seen": 17427728, "step": 30225 }, { "epoch": 4.5025320226392616, "grad_norm": 1.4328068494796753, "learning_rate": 3.353095849989262e-05, "loss": 0.6315, "num_input_tokens_seen": 17430800, "step": 30230 }, { "epoch": 4.50327673518022, "grad_norm": 1.605833888053894, "learning_rate": 3.352484945011787e-05, "loss": 0.6495, "num_input_tokens_seen": 17433776, "step": 30235 }, { "epoch": 4.50402144772118, "grad_norm": 3.2742512226104736, "learning_rate": 3.351873982426846e-05, "loss": 0.6105, "num_input_tokens_seen": 17436752, "step": 30240 }, { "epoch": 4.504766160262139, "grad_norm": 1.277337670326233, "learning_rate": 3.3512629622757245e-05, "loss": 0.5802, "num_input_tokens_seen": 17439952, "step": 30245 }, { "epoch": 4.505510872803098, "grad_norm": 1.2974743843078613, "learning_rate": 3.350651884599713e-05, "loss": 0.7294, "num_input_tokens_seen": 17442768, "step": 30250 }, { "epoch": 4.506255585344057, "grad_norm": 3.285315990447998, "learning_rate": 3.350040749440105e-05, "loss": 0.5106, "num_input_tokens_seen": 17445712, "step": 30255 }, { "epoch": 4.507000297885017, "grad_norm": 1.5437521934509277, "learning_rate": 3.3494295568382006e-05, "loss": 0.5679, "num_input_tokens_seen": 17448816, "step": 30260 }, { "epoch": 4.5077450104259755, "grad_norm": 1.1797139644622803, "learning_rate": 3.348818306835299e-05, "loss": 0.5659, "num_input_tokens_seen": 17451728, "step": 30265 }, { "epoch": 4.508489722966935, "grad_norm": 1.7321891784667969, "learning_rate": 3.348206999472708e-05, "loss": 0.8028, "num_input_tokens_seen": 17454832, "step": 30270 }, { "epoch": 4.509234435507894, "grad_norm": 1.2141193151474, "learning_rate": 3.3475956347917356e-05, "loss": 0.5074, "num_input_tokens_seen": 17457744, "step": 30275 }, { "epoch": 4.509979148048853, "grad_norm": 1.9533427953720093, "learning_rate": 3.346984212833697e-05, "loss": 0.7424, "num_input_tokens_seen": 17460624, "step": 30280 }, { "epoch": 4.510723860589812, "grad_norm": 2.1432392597198486, "learning_rate": 3.346372733639909e-05, "loss": 0.7297, "num_input_tokens_seen": 17463792, "step": 30285 }, { "epoch": 4.511468573130772, "grad_norm": 2.8177802562713623, "learning_rate": 3.345761197251692e-05, "loss": 0.713, "num_input_tokens_seen": 17466480, "step": 30290 }, { "epoch": 4.512213285671731, "grad_norm": 1.9395872354507446, "learning_rate": 3.345149603710373e-05, "loss": 0.6295, "num_input_tokens_seen": 17469040, "step": 30295 }, { "epoch": 4.5129579982126895, "grad_norm": 3.48695969581604, "learning_rate": 3.344537953057279e-05, "loss": 0.6475, "num_input_tokens_seen": 17471920, "step": 30300 }, { "epoch": 4.513702710753649, "grad_norm": 1.216206431388855, "learning_rate": 3.343926245333745e-05, "loss": 0.6576, "num_input_tokens_seen": 17474800, "step": 30305 }, { "epoch": 4.514447423294608, "grad_norm": 1.3291538953781128, "learning_rate": 3.343314480581104e-05, "loss": 0.5654, "num_input_tokens_seen": 17477616, "step": 30310 }, { "epoch": 4.5151921358355676, "grad_norm": 1.414825439453125, "learning_rate": 3.342702658840702e-05, "loss": 0.5231, "num_input_tokens_seen": 17480208, "step": 30315 }, { "epoch": 4.515936848376526, "grad_norm": 1.7315326929092407, "learning_rate": 3.3420907801538784e-05, "loss": 0.7121, "num_input_tokens_seen": 17483056, "step": 30320 }, { "epoch": 4.516681560917486, "grad_norm": 1.4748917818069458, "learning_rate": 3.3414788445619844e-05, "loss": 0.6727, "num_input_tokens_seen": 17486032, "step": 30325 }, { "epoch": 4.517426273458445, "grad_norm": 1.4036352634429932, "learning_rate": 3.340866852106371e-05, "loss": 0.7872, "num_input_tokens_seen": 17488880, "step": 30330 }, { "epoch": 4.518170985999404, "grad_norm": 1.626796007156372, "learning_rate": 3.340254802828395e-05, "loss": 0.7155, "num_input_tokens_seen": 17491600, "step": 30335 }, { "epoch": 4.518915698540363, "grad_norm": 1.5834064483642578, "learning_rate": 3.339642696769415e-05, "loss": 0.6872, "num_input_tokens_seen": 17494576, "step": 30340 }, { "epoch": 4.519660411081323, "grad_norm": 1.504289984703064, "learning_rate": 3.339030533970796e-05, "loss": 0.7464, "num_input_tokens_seen": 17497456, "step": 30345 }, { "epoch": 4.5204051236222815, "grad_norm": 1.2939270734786987, "learning_rate": 3.338418314473904e-05, "loss": 0.705, "num_input_tokens_seen": 17500592, "step": 30350 }, { "epoch": 4.521149836163241, "grad_norm": 1.643342137336731, "learning_rate": 3.3378060383201116e-05, "loss": 0.6107, "num_input_tokens_seen": 17503440, "step": 30355 }, { "epoch": 4.5218945487042, "grad_norm": 2.0481340885162354, "learning_rate": 3.337193705550793e-05, "loss": 0.653, "num_input_tokens_seen": 17506544, "step": 30360 }, { "epoch": 4.52263926124516, "grad_norm": 2.0004310607910156, "learning_rate": 3.3365813162073284e-05, "loss": 0.7836, "num_input_tokens_seen": 17509680, "step": 30365 }, { "epoch": 4.523383973786118, "grad_norm": 1.081202745437622, "learning_rate": 3.3359688703310984e-05, "loss": 0.5728, "num_input_tokens_seen": 17512848, "step": 30370 }, { "epoch": 4.524128686327078, "grad_norm": 1.8695799112319946, "learning_rate": 3.335356367963492e-05, "loss": 0.7004, "num_input_tokens_seen": 17515696, "step": 30375 }, { "epoch": 4.524873398868037, "grad_norm": 2.164724588394165, "learning_rate": 3.334743809145898e-05, "loss": 0.6786, "num_input_tokens_seen": 17518352, "step": 30380 }, { "epoch": 4.525618111408996, "grad_norm": 2.5603690147399902, "learning_rate": 3.334131193919712e-05, "loss": 0.5579, "num_input_tokens_seen": 17521072, "step": 30385 }, { "epoch": 4.526362823949955, "grad_norm": 1.5490890741348267, "learning_rate": 3.333518522326331e-05, "loss": 0.523, "num_input_tokens_seen": 17523824, "step": 30390 }, { "epoch": 4.527107536490915, "grad_norm": 1.3122466802597046, "learning_rate": 3.3329057944071564e-05, "loss": 0.5702, "num_input_tokens_seen": 17527120, "step": 30395 }, { "epoch": 4.5278522490318736, "grad_norm": 0.9937116503715515, "learning_rate": 3.332293010203595e-05, "loss": 0.7366, "num_input_tokens_seen": 17530224, "step": 30400 }, { "epoch": 4.528596961572833, "grad_norm": 1.042206883430481, "learning_rate": 3.331680169757056e-05, "loss": 0.631, "num_input_tokens_seen": 17532944, "step": 30405 }, { "epoch": 4.529341674113792, "grad_norm": 1.898340106010437, "learning_rate": 3.331067273108952e-05, "loss": 0.6157, "num_input_tokens_seen": 17535760, "step": 30410 }, { "epoch": 4.530086386654752, "grad_norm": 2.2891359329223633, "learning_rate": 3.330454320300701e-05, "loss": 0.5731, "num_input_tokens_seen": 17538608, "step": 30415 }, { "epoch": 4.53083109919571, "grad_norm": 1.5526432991027832, "learning_rate": 3.329841311373723e-05, "loss": 0.6219, "num_input_tokens_seen": 17541360, "step": 30420 }, { "epoch": 4.53157581173667, "grad_norm": 1.603116750717163, "learning_rate": 3.3292282463694435e-05, "loss": 0.6042, "num_input_tokens_seen": 17544112, "step": 30425 }, { "epoch": 4.532320524277629, "grad_norm": 1.232198715209961, "learning_rate": 3.328615125329291e-05, "loss": 0.5911, "num_input_tokens_seen": 17547344, "step": 30430 }, { "epoch": 4.533065236818588, "grad_norm": 1.660199522972107, "learning_rate": 3.328001948294695e-05, "loss": 0.6683, "num_input_tokens_seen": 17550096, "step": 30435 }, { "epoch": 4.533809949359547, "grad_norm": 0.9311807751655579, "learning_rate": 3.327388715307096e-05, "loss": 0.6495, "num_input_tokens_seen": 17553136, "step": 30440 }, { "epoch": 4.534554661900506, "grad_norm": 2.171860456466675, "learning_rate": 3.3267754264079314e-05, "loss": 0.6069, "num_input_tokens_seen": 17556016, "step": 30445 }, { "epoch": 4.535299374441466, "grad_norm": 3.631115436553955, "learning_rate": 3.326162081638644e-05, "loss": 0.7444, "num_input_tokens_seen": 17559056, "step": 30450 }, { "epoch": 4.536044086982425, "grad_norm": 1.4872996807098389, "learning_rate": 3.3255486810406825e-05, "loss": 0.4694, "num_input_tokens_seen": 17561712, "step": 30455 }, { "epoch": 4.536788799523384, "grad_norm": 1.2193034887313843, "learning_rate": 3.324935224655497e-05, "loss": 0.7125, "num_input_tokens_seen": 17564656, "step": 30460 }, { "epoch": 4.537533512064343, "grad_norm": 1.6168723106384277, "learning_rate": 3.324321712524544e-05, "loss": 0.6952, "num_input_tokens_seen": 17567920, "step": 30465 }, { "epoch": 4.538278224605302, "grad_norm": 1.2141752243041992, "learning_rate": 3.32370814468928e-05, "loss": 0.7329, "num_input_tokens_seen": 17570928, "step": 30470 }, { "epoch": 4.539022937146262, "grad_norm": 1.6966842412948608, "learning_rate": 3.323094521191169e-05, "loss": 0.5492, "num_input_tokens_seen": 17573840, "step": 30475 }, { "epoch": 4.539767649687221, "grad_norm": 1.0390710830688477, "learning_rate": 3.322480842071677e-05, "loss": 0.795, "num_input_tokens_seen": 17576688, "step": 30480 }, { "epoch": 4.5405123622281796, "grad_norm": 3.5200273990631104, "learning_rate": 3.321867107372274e-05, "loss": 0.7578, "num_input_tokens_seen": 17579568, "step": 30485 }, { "epoch": 4.541257074769139, "grad_norm": 0.8384200930595398, "learning_rate": 3.321253317134432e-05, "loss": 0.4681, "num_input_tokens_seen": 17582384, "step": 30490 }, { "epoch": 4.542001787310098, "grad_norm": 1.8911666870117188, "learning_rate": 3.320639471399631e-05, "loss": 0.7313, "num_input_tokens_seen": 17585328, "step": 30495 }, { "epoch": 4.542746499851058, "grad_norm": 1.7044775485992432, "learning_rate": 3.3200255702093506e-05, "loss": 0.5647, "num_input_tokens_seen": 17588112, "step": 30500 }, { "epoch": 4.543491212392016, "grad_norm": 2.121243953704834, "learning_rate": 3.319411613605076e-05, "loss": 0.5533, "num_input_tokens_seen": 17590928, "step": 30505 }, { "epoch": 4.544235924932976, "grad_norm": 1.4330716133117676, "learning_rate": 3.3187976016282964e-05, "loss": 0.5402, "num_input_tokens_seen": 17593904, "step": 30510 }, { "epoch": 4.544980637473935, "grad_norm": 1.2762260437011719, "learning_rate": 3.3181835343205035e-05, "loss": 0.4526, "num_input_tokens_seen": 17596752, "step": 30515 }, { "epoch": 4.545725350014894, "grad_norm": 1.7972763776779175, "learning_rate": 3.317569411723194e-05, "loss": 0.7465, "num_input_tokens_seen": 17599472, "step": 30520 }, { "epoch": 4.546470062555853, "grad_norm": 2.9167943000793457, "learning_rate": 3.316955233877868e-05, "loss": 0.7274, "num_input_tokens_seen": 17602320, "step": 30525 }, { "epoch": 4.547214775096813, "grad_norm": 1.5457950830459595, "learning_rate": 3.316341000826029e-05, "loss": 0.6378, "num_input_tokens_seen": 17605328, "step": 30530 }, { "epoch": 4.547959487637772, "grad_norm": 1.5323344469070435, "learning_rate": 3.315726712609183e-05, "loss": 0.8051, "num_input_tokens_seen": 17608240, "step": 30535 }, { "epoch": 4.548704200178731, "grad_norm": 1.1332743167877197, "learning_rate": 3.3151123692688424e-05, "loss": 0.5922, "num_input_tokens_seen": 17610992, "step": 30540 }, { "epoch": 4.54944891271969, "grad_norm": 1.507377028465271, "learning_rate": 3.3144979708465226e-05, "loss": 0.5777, "num_input_tokens_seen": 17613712, "step": 30545 }, { "epoch": 4.55019362526065, "grad_norm": 0.9945694208145142, "learning_rate": 3.313883517383741e-05, "loss": 0.5461, "num_input_tokens_seen": 17616592, "step": 30550 }, { "epoch": 4.550938337801608, "grad_norm": 1.5687464475631714, "learning_rate": 3.313269008922021e-05, "loss": 0.5897, "num_input_tokens_seen": 17619728, "step": 30555 }, { "epoch": 4.551683050342568, "grad_norm": 1.5974419116973877, "learning_rate": 3.312654445502887e-05, "loss": 0.5529, "num_input_tokens_seen": 17622576, "step": 30560 }, { "epoch": 4.552427762883527, "grad_norm": 2.2844769954681396, "learning_rate": 3.3120398271678706e-05, "loss": 0.712, "num_input_tokens_seen": 17625360, "step": 30565 }, { "epoch": 4.553172475424486, "grad_norm": 1.8934962749481201, "learning_rate": 3.311425153958503e-05, "loss": 0.6931, "num_input_tokens_seen": 17628368, "step": 30570 }, { "epoch": 4.553917187965445, "grad_norm": 1.7652236223220825, "learning_rate": 3.310810425916323e-05, "loss": 0.5909, "num_input_tokens_seen": 17631312, "step": 30575 }, { "epoch": 4.554661900506405, "grad_norm": 2.331456422805786, "learning_rate": 3.3101956430828715e-05, "loss": 0.6534, "num_input_tokens_seen": 17634160, "step": 30580 }, { "epoch": 4.555406613047364, "grad_norm": 1.103200078010559, "learning_rate": 3.309580805499692e-05, "loss": 0.6289, "num_input_tokens_seen": 17637168, "step": 30585 }, { "epoch": 4.556151325588323, "grad_norm": 1.8504611253738403, "learning_rate": 3.3089659132083327e-05, "loss": 0.6931, "num_input_tokens_seen": 17640144, "step": 30590 }, { "epoch": 4.556896038129282, "grad_norm": 1.4571789503097534, "learning_rate": 3.3083509662503466e-05, "loss": 0.6186, "num_input_tokens_seen": 17643088, "step": 30595 }, { "epoch": 4.557640750670242, "grad_norm": 1.3635826110839844, "learning_rate": 3.3077359646672884e-05, "loss": 0.6606, "num_input_tokens_seen": 17645904, "step": 30600 }, { "epoch": 4.5583854632112, "grad_norm": 1.0864888429641724, "learning_rate": 3.307120908500718e-05, "loss": 0.5661, "num_input_tokens_seen": 17649040, "step": 30605 }, { "epoch": 4.559130175752159, "grad_norm": 1.497641682624817, "learning_rate": 3.3065057977921986e-05, "loss": 0.7688, "num_input_tokens_seen": 17652240, "step": 30610 }, { "epoch": 4.559874888293119, "grad_norm": 2.1397433280944824, "learning_rate": 3.305890632583295e-05, "loss": 0.5865, "num_input_tokens_seen": 17654800, "step": 30615 }, { "epoch": 4.5606196008340785, "grad_norm": 3.2983694076538086, "learning_rate": 3.30527541291558e-05, "loss": 0.7488, "num_input_tokens_seen": 17657744, "step": 30620 }, { "epoch": 4.561364313375037, "grad_norm": 1.2058930397033691, "learning_rate": 3.304660138830626e-05, "loss": 0.8042, "num_input_tokens_seen": 17660784, "step": 30625 }, { "epoch": 4.562109025915996, "grad_norm": 1.1704063415527344, "learning_rate": 3.3040448103700124e-05, "loss": 0.6932, "num_input_tokens_seen": 17663504, "step": 30630 }, { "epoch": 4.562853738456956, "grad_norm": 1.0583667755126953, "learning_rate": 3.303429427575319e-05, "loss": 0.5687, "num_input_tokens_seen": 17666128, "step": 30635 }, { "epoch": 4.563598450997915, "grad_norm": 2.3864824771881104, "learning_rate": 3.30281399048813e-05, "loss": 0.6273, "num_input_tokens_seen": 17669296, "step": 30640 }, { "epoch": 4.564343163538874, "grad_norm": 1.2775462865829468, "learning_rate": 3.302198499150038e-05, "loss": 0.5068, "num_input_tokens_seen": 17672144, "step": 30645 }, { "epoch": 4.565087876079833, "grad_norm": 0.9829527139663696, "learning_rate": 3.301582953602631e-05, "loss": 0.4376, "num_input_tokens_seen": 17674864, "step": 30650 }, { "epoch": 4.565832588620792, "grad_norm": 2.2988579273223877, "learning_rate": 3.300967353887507e-05, "loss": 0.6308, "num_input_tokens_seen": 17677712, "step": 30655 }, { "epoch": 4.566577301161751, "grad_norm": 1.4534580707550049, "learning_rate": 3.300351700046267e-05, "loss": 0.6755, "num_input_tokens_seen": 17680176, "step": 30660 }, { "epoch": 4.567322013702711, "grad_norm": 1.3948488235473633, "learning_rate": 3.299735992120513e-05, "loss": 0.7003, "num_input_tokens_seen": 17683088, "step": 30665 }, { "epoch": 4.56806672624367, "grad_norm": 2.218444585800171, "learning_rate": 3.299120230151852e-05, "loss": 0.901, "num_input_tokens_seen": 17686256, "step": 30670 }, { "epoch": 4.568811438784629, "grad_norm": 1.4325897693634033, "learning_rate": 3.298504414181894e-05, "loss": 0.7258, "num_input_tokens_seen": 17688976, "step": 30675 }, { "epoch": 4.569556151325588, "grad_norm": 0.8923423886299133, "learning_rate": 3.297888544252255e-05, "loss": 0.6207, "num_input_tokens_seen": 17691728, "step": 30680 }, { "epoch": 4.570300863866548, "grad_norm": 2.267040252685547, "learning_rate": 3.2972726204045515e-05, "loss": 0.6805, "num_input_tokens_seen": 17694224, "step": 30685 }, { "epoch": 4.571045576407506, "grad_norm": 2.4190573692321777, "learning_rate": 3.2966566426804057e-05, "loss": 0.7214, "num_input_tokens_seen": 17696944, "step": 30690 }, { "epoch": 4.571790288948466, "grad_norm": 1.7433485984802246, "learning_rate": 3.296040611121444e-05, "loss": 0.5531, "num_input_tokens_seen": 17699632, "step": 30695 }, { "epoch": 4.572535001489425, "grad_norm": 1.4556951522827148, "learning_rate": 3.295424525769293e-05, "loss": 0.6578, "num_input_tokens_seen": 17702384, "step": 30700 }, { "epoch": 4.5732797140303845, "grad_norm": 3.076371431350708, "learning_rate": 3.2948083866655865e-05, "loss": 0.6771, "num_input_tokens_seen": 17705008, "step": 30705 }, { "epoch": 4.574024426571343, "grad_norm": 1.5692905187606812, "learning_rate": 3.294192193851963e-05, "loss": 0.5122, "num_input_tokens_seen": 17707952, "step": 30710 }, { "epoch": 4.574769139112303, "grad_norm": 1.5210919380187988, "learning_rate": 3.293575947370057e-05, "loss": 0.7142, "num_input_tokens_seen": 17710768, "step": 30715 }, { "epoch": 4.575513851653262, "grad_norm": 1.7626186609268188, "learning_rate": 3.2929596472615165e-05, "loss": 0.582, "num_input_tokens_seen": 17713456, "step": 30720 }, { "epoch": 4.576258564194221, "grad_norm": 1.7356973886489868, "learning_rate": 3.292343293567986e-05, "loss": 0.5695, "num_input_tokens_seen": 17716208, "step": 30725 }, { "epoch": 4.57700327673518, "grad_norm": 1.4255282878875732, "learning_rate": 3.291726886331119e-05, "loss": 0.4637, "num_input_tokens_seen": 17719312, "step": 30730 }, { "epoch": 4.57774798927614, "grad_norm": 2.9768779277801514, "learning_rate": 3.291110425592566e-05, "loss": 0.7308, "num_input_tokens_seen": 17722160, "step": 30735 }, { "epoch": 4.578492701817098, "grad_norm": 1.108568787574768, "learning_rate": 3.290493911393988e-05, "loss": 0.5533, "num_input_tokens_seen": 17725008, "step": 30740 }, { "epoch": 4.579237414358058, "grad_norm": 2.030452013015747, "learning_rate": 3.289877343777045e-05, "loss": 0.5233, "num_input_tokens_seen": 17727824, "step": 30745 }, { "epoch": 4.579982126899017, "grad_norm": 1.0173633098602295, "learning_rate": 3.2892607227834024e-05, "loss": 0.6215, "num_input_tokens_seen": 17731312, "step": 30750 }, { "epoch": 4.5807268394399765, "grad_norm": 2.468635320663452, "learning_rate": 3.288644048454729e-05, "loss": 0.7049, "num_input_tokens_seen": 17734032, "step": 30755 }, { "epoch": 4.581471551980935, "grad_norm": 2.2068874835968018, "learning_rate": 3.288027320832698e-05, "loss": 0.7521, "num_input_tokens_seen": 17736848, "step": 30760 }, { "epoch": 4.582216264521895, "grad_norm": 1.1547482013702393, "learning_rate": 3.287410539958984e-05, "loss": 0.525, "num_input_tokens_seen": 17739792, "step": 30765 }, { "epoch": 4.582960977062854, "grad_norm": 1.2838631868362427, "learning_rate": 3.286793705875267e-05, "loss": 0.6458, "num_input_tokens_seen": 17742480, "step": 30770 }, { "epoch": 4.583705689603813, "grad_norm": 2.8742942810058594, "learning_rate": 3.2861768186232306e-05, "loss": 0.6505, "num_input_tokens_seen": 17745328, "step": 30775 }, { "epoch": 4.584450402144772, "grad_norm": 1.9440877437591553, "learning_rate": 3.2855598782445606e-05, "loss": 0.6815, "num_input_tokens_seen": 17748240, "step": 30780 }, { "epoch": 4.585195114685732, "grad_norm": 1.5772337913513184, "learning_rate": 3.2849428847809474e-05, "loss": 0.6779, "num_input_tokens_seen": 17750928, "step": 30785 }, { "epoch": 4.5859398272266905, "grad_norm": 3.9642574787139893, "learning_rate": 3.2843258382740866e-05, "loss": 0.7863, "num_input_tokens_seen": 17753840, "step": 30790 }, { "epoch": 4.586684539767649, "grad_norm": 1.6064183712005615, "learning_rate": 3.283708738765674e-05, "loss": 0.6951, "num_input_tokens_seen": 17756496, "step": 30795 }, { "epoch": 4.587429252308609, "grad_norm": 1.1808720827102661, "learning_rate": 3.283091586297411e-05, "loss": 0.548, "num_input_tokens_seen": 17759184, "step": 30800 }, { "epoch": 4.5881739648495685, "grad_norm": 1.3278937339782715, "learning_rate": 3.2824743809110024e-05, "loss": 0.5438, "num_input_tokens_seen": 17761936, "step": 30805 }, { "epoch": 4.588918677390527, "grad_norm": 1.5206506252288818, "learning_rate": 3.281857122648157e-05, "loss": 0.6053, "num_input_tokens_seen": 17764880, "step": 30810 }, { "epoch": 4.589663389931486, "grad_norm": 1.3607755899429321, "learning_rate": 3.281239811550586e-05, "loss": 0.7027, "num_input_tokens_seen": 17767536, "step": 30815 }, { "epoch": 4.590408102472446, "grad_norm": 1.4911668300628662, "learning_rate": 3.280622447660004e-05, "loss": 0.6872, "num_input_tokens_seen": 17770384, "step": 30820 }, { "epoch": 4.591152815013404, "grad_norm": 1.106332778930664, "learning_rate": 3.280005031018131e-05, "loss": 0.6831, "num_input_tokens_seen": 17773296, "step": 30825 }, { "epoch": 4.591897527554364, "grad_norm": 1.513039231300354, "learning_rate": 3.2793875616666904e-05, "loss": 0.5484, "num_input_tokens_seen": 17775984, "step": 30830 }, { "epoch": 4.592642240095323, "grad_norm": 0.865213930606842, "learning_rate": 3.278770039647406e-05, "loss": 0.6395, "num_input_tokens_seen": 17778736, "step": 30835 }, { "epoch": 4.5933869526362825, "grad_norm": 2.3871123790740967, "learning_rate": 3.278152465002008e-05, "loss": 0.5877, "num_input_tokens_seen": 17781584, "step": 30840 }, { "epoch": 4.594131665177241, "grad_norm": 2.1047704219818115, "learning_rate": 3.277534837772232e-05, "loss": 0.6476, "num_input_tokens_seen": 17784304, "step": 30845 }, { "epoch": 4.594876377718201, "grad_norm": 1.0818028450012207, "learning_rate": 3.276917157999811e-05, "loss": 0.7029, "num_input_tokens_seen": 17787376, "step": 30850 }, { "epoch": 4.59562109025916, "grad_norm": 1.5925850868225098, "learning_rate": 3.276299425726489e-05, "loss": 0.6073, "num_input_tokens_seen": 17790128, "step": 30855 }, { "epoch": 4.596365802800119, "grad_norm": 2.6135470867156982, "learning_rate": 3.275681640994007e-05, "loss": 0.6499, "num_input_tokens_seen": 17792848, "step": 30860 }, { "epoch": 4.597110515341078, "grad_norm": 1.6711246967315674, "learning_rate": 3.275063803844113e-05, "loss": 0.547, "num_input_tokens_seen": 17795600, "step": 30865 }, { "epoch": 4.597855227882038, "grad_norm": 1.9799338579177856, "learning_rate": 3.274445914318559e-05, "loss": 0.5845, "num_input_tokens_seen": 17798096, "step": 30870 }, { "epoch": 4.5985999404229965, "grad_norm": 1.919698715209961, "learning_rate": 3.273827972459099e-05, "loss": 0.8024, "num_input_tokens_seen": 17801072, "step": 30875 }, { "epoch": 4.599344652963956, "grad_norm": 2.4691245555877686, "learning_rate": 3.27320997830749e-05, "loss": 0.6241, "num_input_tokens_seen": 17804016, "step": 30880 }, { "epoch": 4.600089365504915, "grad_norm": 1.5885958671569824, "learning_rate": 3.2725919319054946e-05, "loss": 0.5784, "num_input_tokens_seen": 17806672, "step": 30885 }, { "epoch": 4.6008340780458745, "grad_norm": 0.7747017741203308, "learning_rate": 3.271973833294877e-05, "loss": 0.7917, "num_input_tokens_seen": 17809840, "step": 30890 }, { "epoch": 4.601578790586833, "grad_norm": 1.5311089754104614, "learning_rate": 3.2713556825174074e-05, "loss": 0.5958, "num_input_tokens_seen": 17812816, "step": 30895 }, { "epoch": 4.602323503127793, "grad_norm": 0.5204324126243591, "learning_rate": 3.270737479614856e-05, "loss": 0.4211, "num_input_tokens_seen": 17815664, "step": 30900 }, { "epoch": 4.603068215668752, "grad_norm": 1.176710844039917, "learning_rate": 3.270119224629e-05, "loss": 0.6068, "num_input_tokens_seen": 17818224, "step": 30905 }, { "epoch": 4.603812928209711, "grad_norm": 0.9962300062179565, "learning_rate": 3.269500917601618e-05, "loss": 0.412, "num_input_tokens_seen": 17821008, "step": 30910 }, { "epoch": 4.60455764075067, "grad_norm": 1.7839186191558838, "learning_rate": 3.268882558574492e-05, "loss": 0.8391, "num_input_tokens_seen": 17823792, "step": 30915 }, { "epoch": 4.60530235329163, "grad_norm": 1.4030089378356934, "learning_rate": 3.268264147589409e-05, "loss": 0.7195, "num_input_tokens_seen": 17826704, "step": 30920 }, { "epoch": 4.6060470658325885, "grad_norm": 3.94817852973938, "learning_rate": 3.2676456846881583e-05, "loss": 0.764, "num_input_tokens_seen": 17829936, "step": 30925 }, { "epoch": 4.606791778373548, "grad_norm": 2.3806324005126953, "learning_rate": 3.267027169912533e-05, "loss": 0.8109, "num_input_tokens_seen": 17832976, "step": 30930 }, { "epoch": 4.607536490914507, "grad_norm": 1.4713233709335327, "learning_rate": 3.2664086033043304e-05, "loss": 0.722, "num_input_tokens_seen": 17835952, "step": 30935 }, { "epoch": 4.6082812034554665, "grad_norm": 1.4854702949523926, "learning_rate": 3.265789984905351e-05, "loss": 0.5882, "num_input_tokens_seen": 17839056, "step": 30940 }, { "epoch": 4.609025915996425, "grad_norm": 1.7500642538070679, "learning_rate": 3.265171314757397e-05, "loss": 0.6903, "num_input_tokens_seen": 17841968, "step": 30945 }, { "epoch": 4.609770628537385, "grad_norm": 2.2757227420806885, "learning_rate": 3.264552592902277e-05, "loss": 0.7566, "num_input_tokens_seen": 17844912, "step": 30950 }, { "epoch": 4.610515341078344, "grad_norm": 1.4580703973770142, "learning_rate": 3.2639338193818006e-05, "loss": 0.7416, "num_input_tokens_seen": 17847888, "step": 30955 }, { "epoch": 4.6112600536193025, "grad_norm": 2.3724734783172607, "learning_rate": 3.2633149942377834e-05, "loss": 0.6059, "num_input_tokens_seen": 17850960, "step": 30960 }, { "epoch": 4.612004766160262, "grad_norm": 1.3044546842575073, "learning_rate": 3.2626961175120414e-05, "loss": 0.5528, "num_input_tokens_seen": 17853872, "step": 30965 }, { "epoch": 4.612749478701222, "grad_norm": 1.6331965923309326, "learning_rate": 3.262077189246398e-05, "loss": 0.598, "num_input_tokens_seen": 17856816, "step": 30970 }, { "epoch": 4.6134941912421805, "grad_norm": 1.4397063255310059, "learning_rate": 3.261458209482675e-05, "loss": 0.6048, "num_input_tokens_seen": 17859984, "step": 30975 }, { "epoch": 4.614238903783139, "grad_norm": 1.0099889039993286, "learning_rate": 3.260839178262703e-05, "loss": 0.5744, "num_input_tokens_seen": 17862640, "step": 30980 }, { "epoch": 4.614983616324099, "grad_norm": 0.899215579032898, "learning_rate": 3.260220095628312e-05, "loss": 0.5551, "num_input_tokens_seen": 17865456, "step": 30985 }, { "epoch": 4.615728328865059, "grad_norm": 1.124983787536621, "learning_rate": 3.259600961621339e-05, "loss": 0.5769, "num_input_tokens_seen": 17868048, "step": 30990 }, { "epoch": 4.616473041406017, "grad_norm": 2.4636101722717285, "learning_rate": 3.25898177628362e-05, "loss": 0.6437, "num_input_tokens_seen": 17870960, "step": 30995 }, { "epoch": 4.617217753946976, "grad_norm": 1.0146090984344482, "learning_rate": 3.258362539656999e-05, "loss": 0.9059, "num_input_tokens_seen": 17874032, "step": 31000 }, { "epoch": 4.617962466487936, "grad_norm": 1.1938486099243164, "learning_rate": 3.2577432517833204e-05, "loss": 0.6059, "num_input_tokens_seen": 17876944, "step": 31005 }, { "epoch": 4.6187071790288945, "grad_norm": 1.319397211074829, "learning_rate": 3.257123912704435e-05, "loss": 0.5954, "num_input_tokens_seen": 17879728, "step": 31010 }, { "epoch": 4.619451891569854, "grad_norm": 1.8257402181625366, "learning_rate": 3.2565045224621923e-05, "loss": 0.7343, "num_input_tokens_seen": 17882640, "step": 31015 }, { "epoch": 4.620196604110813, "grad_norm": 1.585785984992981, "learning_rate": 3.25588508109845e-05, "loss": 0.5262, "num_input_tokens_seen": 17885744, "step": 31020 }, { "epoch": 4.6209413166517725, "grad_norm": 1.1055701971054077, "learning_rate": 3.2552655886550674e-05, "loss": 0.7437, "num_input_tokens_seen": 17888080, "step": 31025 }, { "epoch": 4.621686029192731, "grad_norm": 1.009494662284851, "learning_rate": 3.254646045173907e-05, "loss": 0.6599, "num_input_tokens_seen": 17890896, "step": 31030 }, { "epoch": 4.622430741733691, "grad_norm": 0.9478705525398254, "learning_rate": 3.254026450696835e-05, "loss": 0.6455, "num_input_tokens_seen": 17893712, "step": 31035 }, { "epoch": 4.62317545427465, "grad_norm": 1.871545672416687, "learning_rate": 3.253406805265721e-05, "loss": 0.6264, "num_input_tokens_seen": 17896528, "step": 31040 }, { "epoch": 4.623920166815609, "grad_norm": 1.388691782951355, "learning_rate": 3.252787108922438e-05, "loss": 0.7361, "num_input_tokens_seen": 17899312, "step": 31045 }, { "epoch": 4.624664879356568, "grad_norm": 1.7374871969223022, "learning_rate": 3.252167361708863e-05, "loss": 0.6318, "num_input_tokens_seen": 17902480, "step": 31050 }, { "epoch": 4.625409591897528, "grad_norm": 1.5014946460723877, "learning_rate": 3.251547563666876e-05, "loss": 0.7142, "num_input_tokens_seen": 17905392, "step": 31055 }, { "epoch": 4.6261543044384865, "grad_norm": 1.7360413074493408, "learning_rate": 3.25092771483836e-05, "loss": 0.5548, "num_input_tokens_seen": 17908272, "step": 31060 }, { "epoch": 4.626899016979446, "grad_norm": 1.5705864429473877, "learning_rate": 3.2503078152652024e-05, "loss": 0.7472, "num_input_tokens_seen": 17911152, "step": 31065 }, { "epoch": 4.627643729520405, "grad_norm": 1.040716528892517, "learning_rate": 3.2496878649892924e-05, "loss": 0.5491, "num_input_tokens_seen": 17913872, "step": 31070 }, { "epoch": 4.628388442061365, "grad_norm": 1.0356130599975586, "learning_rate": 3.2490678640525255e-05, "loss": 0.5069, "num_input_tokens_seen": 17916464, "step": 31075 }, { "epoch": 4.629133154602323, "grad_norm": 1.444143533706665, "learning_rate": 3.248447812496797e-05, "loss": 0.6208, "num_input_tokens_seen": 17919376, "step": 31080 }, { "epoch": 4.629877867143283, "grad_norm": 1.7372329235076904, "learning_rate": 3.2478277103640086e-05, "loss": 0.8341, "num_input_tokens_seen": 17922128, "step": 31085 }, { "epoch": 4.630622579684242, "grad_norm": 1.5016855001449585, "learning_rate": 3.247207557696064e-05, "loss": 0.4389, "num_input_tokens_seen": 17924880, "step": 31090 }, { "epoch": 4.631367292225201, "grad_norm": 1.3518823385238647, "learning_rate": 3.2465873545348715e-05, "loss": 0.7798, "num_input_tokens_seen": 17928208, "step": 31095 }, { "epoch": 4.63211200476616, "grad_norm": 1.2400976419448853, "learning_rate": 3.2459671009223394e-05, "loss": 0.7477, "num_input_tokens_seen": 17931760, "step": 31100 }, { "epoch": 4.63285671730712, "grad_norm": 2.7665324211120605, "learning_rate": 3.245346796900384e-05, "loss": 0.7781, "num_input_tokens_seen": 17934672, "step": 31105 }, { "epoch": 4.6336014298480785, "grad_norm": 1.783864140510559, "learning_rate": 3.244726442510923e-05, "loss": 0.7357, "num_input_tokens_seen": 17937488, "step": 31110 }, { "epoch": 4.634346142389038, "grad_norm": 1.5123370885849, "learning_rate": 3.244106037795877e-05, "loss": 0.494, "num_input_tokens_seen": 17940272, "step": 31115 }, { "epoch": 4.635090854929997, "grad_norm": 1.388065218925476, "learning_rate": 3.243485582797169e-05, "loss": 0.6002, "num_input_tokens_seen": 17943312, "step": 31120 }, { "epoch": 4.635835567470957, "grad_norm": 1.3441542387008667, "learning_rate": 3.242865077556729e-05, "loss": 0.6191, "num_input_tokens_seen": 17946448, "step": 31125 }, { "epoch": 4.636580280011915, "grad_norm": 2.713071823120117, "learning_rate": 3.2422445221164876e-05, "loss": 0.745, "num_input_tokens_seen": 17949328, "step": 31130 }, { "epoch": 4.637324992552875, "grad_norm": 1.6431901454925537, "learning_rate": 3.241623916518378e-05, "loss": 0.5494, "num_input_tokens_seen": 17952112, "step": 31135 }, { "epoch": 4.638069705093834, "grad_norm": 1.8346201181411743, "learning_rate": 3.2410032608043405e-05, "loss": 0.6612, "num_input_tokens_seen": 17954608, "step": 31140 }, { "epoch": 4.6388144176347925, "grad_norm": 1.4649077653884888, "learning_rate": 3.2403825550163144e-05, "loss": 0.5924, "num_input_tokens_seen": 17957680, "step": 31145 }, { "epoch": 4.639559130175752, "grad_norm": 1.183861494064331, "learning_rate": 3.239761799196246e-05, "loss": 0.5407, "num_input_tokens_seen": 17960560, "step": 31150 }, { "epoch": 4.640303842716712, "grad_norm": 2.0983352661132812, "learning_rate": 3.2391409933860825e-05, "loss": 0.6333, "num_input_tokens_seen": 17963408, "step": 31155 }, { "epoch": 4.641048555257671, "grad_norm": 2.4598934650421143, "learning_rate": 3.238520137627777e-05, "loss": 0.6269, "num_input_tokens_seen": 17966160, "step": 31160 }, { "epoch": 4.641793267798629, "grad_norm": 1.8706978559494019, "learning_rate": 3.237899231963282e-05, "loss": 0.5807, "num_input_tokens_seen": 17969296, "step": 31165 }, { "epoch": 4.642537980339589, "grad_norm": 1.7647068500518799, "learning_rate": 3.237278276434557e-05, "loss": 0.5315, "num_input_tokens_seen": 17972272, "step": 31170 }, { "epoch": 4.643282692880548, "grad_norm": 1.3670002222061157, "learning_rate": 3.236657271083564e-05, "loss": 0.6273, "num_input_tokens_seen": 17975024, "step": 31175 }, { "epoch": 4.644027405421507, "grad_norm": 1.4383736848831177, "learning_rate": 3.236036215952267e-05, "loss": 0.6139, "num_input_tokens_seen": 17978064, "step": 31180 }, { "epoch": 4.644772117962466, "grad_norm": 1.576979160308838, "learning_rate": 3.2354151110826355e-05, "loss": 0.7332, "num_input_tokens_seen": 17980944, "step": 31185 }, { "epoch": 4.645516830503426, "grad_norm": 1.3131715059280396, "learning_rate": 3.234793956516641e-05, "loss": 0.8485, "num_input_tokens_seen": 17983664, "step": 31190 }, { "epoch": 4.6462615430443845, "grad_norm": 2.5463132858276367, "learning_rate": 3.234172752296259e-05, "loss": 0.6543, "num_input_tokens_seen": 17986544, "step": 31195 }, { "epoch": 4.647006255585344, "grad_norm": 1.229664921760559, "learning_rate": 3.233551498463466e-05, "loss": 0.5541, "num_input_tokens_seen": 17989456, "step": 31200 }, { "epoch": 4.647750968126303, "grad_norm": 0.965834379196167, "learning_rate": 3.2329301950602456e-05, "loss": 0.6034, "num_input_tokens_seen": 17992496, "step": 31205 }, { "epoch": 4.648495680667263, "grad_norm": 1.50371515750885, "learning_rate": 3.232308842128583e-05, "loss": 0.6789, "num_input_tokens_seen": 17995344, "step": 31210 }, { "epoch": 4.649240393208221, "grad_norm": 0.8174113631248474, "learning_rate": 3.2316874397104656e-05, "loss": 0.6108, "num_input_tokens_seen": 17998256, "step": 31215 }, { "epoch": 4.649985105749181, "grad_norm": 1.2423624992370605, "learning_rate": 3.231065987847885e-05, "loss": 0.6227, "num_input_tokens_seen": 18001360, "step": 31220 }, { "epoch": 4.65072981829014, "grad_norm": 2.2278389930725098, "learning_rate": 3.2304444865828394e-05, "loss": 0.6869, "num_input_tokens_seen": 18004304, "step": 31225 }, { "epoch": 4.651474530831099, "grad_norm": 1.5743566751480103, "learning_rate": 3.229822935957324e-05, "loss": 0.6197, "num_input_tokens_seen": 18007120, "step": 31230 }, { "epoch": 4.652219243372058, "grad_norm": 1.2799065113067627, "learning_rate": 3.2292013360133416e-05, "loss": 0.5909, "num_input_tokens_seen": 18009808, "step": 31235 }, { "epoch": 4.652963955913018, "grad_norm": 2.116987943649292, "learning_rate": 3.228579686792898e-05, "loss": 0.6435, "num_input_tokens_seen": 18012592, "step": 31240 }, { "epoch": 4.653708668453977, "grad_norm": 2.386220932006836, "learning_rate": 3.227957988338001e-05, "loss": 0.7385, "num_input_tokens_seen": 18015568, "step": 31245 }, { "epoch": 4.654453380994936, "grad_norm": 1.458452582359314, "learning_rate": 3.227336240690663e-05, "loss": 0.7058, "num_input_tokens_seen": 18018384, "step": 31250 }, { "epoch": 4.655198093535895, "grad_norm": 1.4548468589782715, "learning_rate": 3.226714443892899e-05, "loss": 0.608, "num_input_tokens_seen": 18021104, "step": 31255 }, { "epoch": 4.655942806076855, "grad_norm": 1.9632841348648071, "learning_rate": 3.226092597986728e-05, "loss": 0.5332, "num_input_tokens_seen": 18023824, "step": 31260 }, { "epoch": 4.656687518617813, "grad_norm": 1.1587995290756226, "learning_rate": 3.225470703014171e-05, "loss": 0.5952, "num_input_tokens_seen": 18026704, "step": 31265 }, { "epoch": 4.657432231158773, "grad_norm": 1.5291073322296143, "learning_rate": 3.224848759017253e-05, "loss": 0.5399, "num_input_tokens_seen": 18029520, "step": 31270 }, { "epoch": 4.658176943699732, "grad_norm": 0.95926833152771, "learning_rate": 3.224226766038004e-05, "loss": 0.5321, "num_input_tokens_seen": 18032176, "step": 31275 }, { "epoch": 4.658921656240691, "grad_norm": 1.2219935655593872, "learning_rate": 3.223604724118453e-05, "loss": 0.6914, "num_input_tokens_seen": 18035248, "step": 31280 }, { "epoch": 4.65966636878165, "grad_norm": 1.0392091274261475, "learning_rate": 3.2229826333006374e-05, "loss": 0.561, "num_input_tokens_seen": 18037936, "step": 31285 }, { "epoch": 4.66041108132261, "grad_norm": 0.8895772695541382, "learning_rate": 3.222360493626595e-05, "loss": 0.5405, "num_input_tokens_seen": 18040624, "step": 31290 }, { "epoch": 4.661155793863569, "grad_norm": 3.9038689136505127, "learning_rate": 3.2217383051383676e-05, "loss": 0.765, "num_input_tokens_seen": 18043536, "step": 31295 }, { "epoch": 4.661900506404528, "grad_norm": 1.3655383586883545, "learning_rate": 3.2211160678779994e-05, "loss": 0.6622, "num_input_tokens_seen": 18046480, "step": 31300 }, { "epoch": 4.662645218945487, "grad_norm": 2.078216552734375, "learning_rate": 3.22049378188754e-05, "loss": 0.6703, "num_input_tokens_seen": 18049328, "step": 31305 }, { "epoch": 4.663389931486446, "grad_norm": 2.21089506149292, "learning_rate": 3.219871447209039e-05, "loss": 0.5668, "num_input_tokens_seen": 18052368, "step": 31310 }, { "epoch": 4.664134644027405, "grad_norm": 2.2929306030273438, "learning_rate": 3.219249063884553e-05, "loss": 0.6795, "num_input_tokens_seen": 18055280, "step": 31315 }, { "epoch": 4.664879356568365, "grad_norm": 1.1434928178787231, "learning_rate": 3.2186266319561395e-05, "loss": 0.8229, "num_input_tokens_seen": 18058416, "step": 31320 }, { "epoch": 4.665624069109324, "grad_norm": 2.665888786315918, "learning_rate": 3.21800415146586e-05, "loss": 0.6237, "num_input_tokens_seen": 18061008, "step": 31325 }, { "epoch": 4.666368781650283, "grad_norm": 1.27316415309906, "learning_rate": 3.217381622455778e-05, "loss": 0.5716, "num_input_tokens_seen": 18064080, "step": 31330 }, { "epoch": 4.667113494191242, "grad_norm": 2.3065736293792725, "learning_rate": 3.216759044967965e-05, "loss": 0.5853, "num_input_tokens_seen": 18067120, "step": 31335 }, { "epoch": 4.667858206732202, "grad_norm": 1.9967538118362427, "learning_rate": 3.2161364190444884e-05, "loss": 0.7301, "num_input_tokens_seen": 18069808, "step": 31340 }, { "epoch": 4.668602919273161, "grad_norm": 1.508482575416565, "learning_rate": 3.2155137447274245e-05, "loss": 0.5542, "num_input_tokens_seen": 18072688, "step": 31345 }, { "epoch": 4.669347631814119, "grad_norm": 1.2836979627609253, "learning_rate": 3.2148910220588495e-05, "loss": 0.6824, "num_input_tokens_seen": 18075440, "step": 31350 }, { "epoch": 4.670092344355079, "grad_norm": 1.4342938661575317, "learning_rate": 3.2142682510808474e-05, "loss": 0.7617, "num_input_tokens_seen": 18078672, "step": 31355 }, { "epoch": 4.670837056896038, "grad_norm": 0.8089994192123413, "learning_rate": 3.213645431835501e-05, "loss": 0.6532, "num_input_tokens_seen": 18081680, "step": 31360 }, { "epoch": 4.671581769436997, "grad_norm": 1.2245421409606934, "learning_rate": 3.213022564364897e-05, "loss": 0.5026, "num_input_tokens_seen": 18084656, "step": 31365 }, { "epoch": 4.672326481977956, "grad_norm": 1.5009610652923584, "learning_rate": 3.212399648711127e-05, "loss": 0.6001, "num_input_tokens_seen": 18087472, "step": 31370 }, { "epoch": 4.673071194518916, "grad_norm": 1.1693898439407349, "learning_rate": 3.2117766849162855e-05, "loss": 0.6656, "num_input_tokens_seen": 18090640, "step": 31375 }, { "epoch": 4.673815907059875, "grad_norm": 1.3158518075942993, "learning_rate": 3.211153673022469e-05, "loss": 0.5255, "num_input_tokens_seen": 18093456, "step": 31380 }, { "epoch": 4.674560619600834, "grad_norm": 0.9593993425369263, "learning_rate": 3.2105306130717786e-05, "loss": 0.5523, "num_input_tokens_seen": 18096304, "step": 31385 }, { "epoch": 4.675305332141793, "grad_norm": 1.851488471031189, "learning_rate": 3.209907505106319e-05, "loss": 0.6897, "num_input_tokens_seen": 18099184, "step": 31390 }, { "epoch": 4.676050044682753, "grad_norm": 1.7331560850143433, "learning_rate": 3.209284349168196e-05, "loss": 0.6665, "num_input_tokens_seen": 18102352, "step": 31395 }, { "epoch": 4.676794757223711, "grad_norm": 0.9642585515975952, "learning_rate": 3.20866114529952e-05, "loss": 0.5866, "num_input_tokens_seen": 18104976, "step": 31400 }, { "epoch": 4.677539469764671, "grad_norm": 1.2842469215393066, "learning_rate": 3.208037893542406e-05, "loss": 0.7121, "num_input_tokens_seen": 18107792, "step": 31405 }, { "epoch": 4.67828418230563, "grad_norm": 3.483099937438965, "learning_rate": 3.207414593938969e-05, "loss": 0.5846, "num_input_tokens_seen": 18110608, "step": 31410 }, { "epoch": 4.6790288948465895, "grad_norm": 0.6681400537490845, "learning_rate": 3.2067912465313305e-05, "loss": 0.4592, "num_input_tokens_seen": 18113392, "step": 31415 }, { "epoch": 4.679773607387548, "grad_norm": 1.5749160051345825, "learning_rate": 3.2061678513616125e-05, "loss": 0.6491, "num_input_tokens_seen": 18116176, "step": 31420 }, { "epoch": 4.680518319928508, "grad_norm": 1.8003392219543457, "learning_rate": 3.205544408471943e-05, "loss": 0.7462, "num_input_tokens_seen": 18118960, "step": 31425 }, { "epoch": 4.681263032469467, "grad_norm": 1.5978392362594604, "learning_rate": 3.20492091790445e-05, "loss": 0.6664, "num_input_tokens_seen": 18121616, "step": 31430 }, { "epoch": 4.682007745010426, "grad_norm": 1.3918094635009766, "learning_rate": 3.2042973797012674e-05, "loss": 0.5801, "num_input_tokens_seen": 18124176, "step": 31435 }, { "epoch": 4.682752457551385, "grad_norm": 1.3895580768585205, "learning_rate": 3.203673793904532e-05, "loss": 0.4862, "num_input_tokens_seen": 18126992, "step": 31440 }, { "epoch": 4.683497170092345, "grad_norm": 0.9905655384063721, "learning_rate": 3.2030501605563824e-05, "loss": 0.586, "num_input_tokens_seen": 18130064, "step": 31445 }, { "epoch": 4.684241882633303, "grad_norm": 1.6687413454055786, "learning_rate": 3.202426479698961e-05, "loss": 0.729, "num_input_tokens_seen": 18133072, "step": 31450 }, { "epoch": 4.684986595174263, "grad_norm": 2.2886273860931396, "learning_rate": 3.201802751374415e-05, "loss": 0.7873, "num_input_tokens_seen": 18135856, "step": 31455 }, { "epoch": 4.685731307715222, "grad_norm": 0.8043757677078247, "learning_rate": 3.201178975624891e-05, "loss": 0.6646, "num_input_tokens_seen": 18138768, "step": 31460 }, { "epoch": 4.6864760202561815, "grad_norm": 1.2809200286865234, "learning_rate": 3.200555152492543e-05, "loss": 0.7157, "num_input_tokens_seen": 18141744, "step": 31465 }, { "epoch": 4.68722073279714, "grad_norm": 0.8664348721504211, "learning_rate": 3.199931282019527e-05, "loss": 0.6816, "num_input_tokens_seen": 18144720, "step": 31470 }, { "epoch": 4.687965445338099, "grad_norm": 1.2717137336730957, "learning_rate": 3.1993073642479996e-05, "loss": 0.55, "num_input_tokens_seen": 18147472, "step": 31475 }, { "epoch": 4.688710157879059, "grad_norm": 3.233438491821289, "learning_rate": 3.1986833992201235e-05, "loss": 0.8765, "num_input_tokens_seen": 18150384, "step": 31480 }, { "epoch": 4.689454870420018, "grad_norm": 1.2875570058822632, "learning_rate": 3.198059386978064e-05, "loss": 0.6225, "num_input_tokens_seen": 18153488, "step": 31485 }, { "epoch": 4.690199582960977, "grad_norm": 1.8748908042907715, "learning_rate": 3.19743532756399e-05, "loss": 0.8289, "num_input_tokens_seen": 18156368, "step": 31490 }, { "epoch": 4.690944295501936, "grad_norm": 1.2884818315505981, "learning_rate": 3.1968112210200715e-05, "loss": 0.5425, "num_input_tokens_seen": 18159344, "step": 31495 }, { "epoch": 4.6916890080428955, "grad_norm": 0.9394773244857788, "learning_rate": 3.1961870673884845e-05, "loss": 0.5918, "num_input_tokens_seen": 18162320, "step": 31500 }, { "epoch": 4.692433720583855, "grad_norm": 3.1525843143463135, "learning_rate": 3.1955628667114055e-05, "loss": 0.6079, "num_input_tokens_seen": 18165136, "step": 31505 }, { "epoch": 4.693178433124814, "grad_norm": 1.5111864805221558, "learning_rate": 3.1949386190310154e-05, "loss": 0.7323, "num_input_tokens_seen": 18168112, "step": 31510 }, { "epoch": 4.693923145665773, "grad_norm": 1.6343543529510498, "learning_rate": 3.1943143243895e-05, "loss": 0.7106, "num_input_tokens_seen": 18171088, "step": 31515 }, { "epoch": 4.694667858206732, "grad_norm": 1.6235188245773315, "learning_rate": 3.193689982829044e-05, "loss": 0.5173, "num_input_tokens_seen": 18174288, "step": 31520 }, { "epoch": 4.695412570747691, "grad_norm": 1.2406489849090576, "learning_rate": 3.1930655943918405e-05, "loss": 0.7377, "num_input_tokens_seen": 18177296, "step": 31525 }, { "epoch": 4.696157283288651, "grad_norm": 1.1608479022979736, "learning_rate": 3.192441159120081e-05, "loss": 0.6489, "num_input_tokens_seen": 18179888, "step": 31530 }, { "epoch": 4.696901995829609, "grad_norm": 1.1809799671173096, "learning_rate": 3.1918166770559644e-05, "loss": 0.6767, "num_input_tokens_seen": 18182992, "step": 31535 }, { "epoch": 4.697646708370569, "grad_norm": 1.3873311281204224, "learning_rate": 3.191192148241689e-05, "loss": 0.7528, "num_input_tokens_seen": 18185968, "step": 31540 }, { "epoch": 4.698391420911528, "grad_norm": 0.9330843687057495, "learning_rate": 3.190567572719457e-05, "loss": 0.6664, "num_input_tokens_seen": 18188560, "step": 31545 }, { "epoch": 4.6991361334524875, "grad_norm": 1.9014936685562134, "learning_rate": 3.189942950531478e-05, "loss": 0.7474, "num_input_tokens_seen": 18191696, "step": 31550 }, { "epoch": 4.699880845993446, "grad_norm": 0.7769597172737122, "learning_rate": 3.189318281719959e-05, "loss": 0.5143, "num_input_tokens_seen": 18194480, "step": 31555 }, { "epoch": 4.700625558534406, "grad_norm": 1.6379345655441284, "learning_rate": 3.1886935663271125e-05, "loss": 0.5707, "num_input_tokens_seen": 18197552, "step": 31560 }, { "epoch": 4.701370271075365, "grad_norm": 1.5382168292999268, "learning_rate": 3.188068804395155e-05, "loss": 0.7068, "num_input_tokens_seen": 18200528, "step": 31565 }, { "epoch": 4.702114983616324, "grad_norm": 1.620884895324707, "learning_rate": 3.1874439959663055e-05, "loss": 0.6471, "num_input_tokens_seen": 18203216, "step": 31570 }, { "epoch": 4.702859696157283, "grad_norm": 1.4697937965393066, "learning_rate": 3.1868191410827855e-05, "loss": 0.5376, "num_input_tokens_seen": 18206192, "step": 31575 }, { "epoch": 4.703604408698243, "grad_norm": 2.2670116424560547, "learning_rate": 3.18619423978682e-05, "loss": 0.7033, "num_input_tokens_seen": 18209040, "step": 31580 }, { "epoch": 4.7043491212392015, "grad_norm": 1.8697922229766846, "learning_rate": 3.185569292120638e-05, "loss": 0.6432, "num_input_tokens_seen": 18211856, "step": 31585 }, { "epoch": 4.705093833780161, "grad_norm": 1.702288269996643, "learning_rate": 3.1849442981264707e-05, "loss": 0.579, "num_input_tokens_seen": 18214960, "step": 31590 }, { "epoch": 4.70583854632112, "grad_norm": 0.9898074865341187, "learning_rate": 3.184319257846553e-05, "loss": 0.6198, "num_input_tokens_seen": 18217744, "step": 31595 }, { "epoch": 4.7065832588620795, "grad_norm": 1.5393352508544922, "learning_rate": 3.183694171323121e-05, "loss": 0.4536, "num_input_tokens_seen": 18220464, "step": 31600 }, { "epoch": 4.707327971403038, "grad_norm": 1.2603354454040527, "learning_rate": 3.183069038598417e-05, "loss": 0.6554, "num_input_tokens_seen": 18223408, "step": 31605 }, { "epoch": 4.708072683943998, "grad_norm": 1.088955283164978, "learning_rate": 3.182443859714685e-05, "loss": 0.7415, "num_input_tokens_seen": 18226416, "step": 31610 }, { "epoch": 4.708817396484957, "grad_norm": 1.66969633102417, "learning_rate": 3.181818634714171e-05, "loss": 0.6384, "num_input_tokens_seen": 18229072, "step": 31615 }, { "epoch": 4.709562109025916, "grad_norm": 1.1107401847839355, "learning_rate": 3.1811933636391266e-05, "loss": 0.6883, "num_input_tokens_seen": 18231824, "step": 31620 }, { "epoch": 4.710306821566875, "grad_norm": 1.0019382238388062, "learning_rate": 3.1805680465318035e-05, "loss": 0.553, "num_input_tokens_seen": 18234576, "step": 31625 }, { "epoch": 4.711051534107835, "grad_norm": 3.271378993988037, "learning_rate": 3.179942683434458e-05, "loss": 0.6932, "num_input_tokens_seen": 18237360, "step": 31630 }, { "epoch": 4.7117962466487935, "grad_norm": 2.430453062057495, "learning_rate": 3.1793172743893515e-05, "loss": 0.8463, "num_input_tokens_seen": 18240176, "step": 31635 }, { "epoch": 4.712540959189753, "grad_norm": 1.351747751235962, "learning_rate": 3.178691819438746e-05, "loss": 0.7988, "num_input_tokens_seen": 18243344, "step": 31640 }, { "epoch": 4.713285671730712, "grad_norm": 0.984278678894043, "learning_rate": 3.178066318624905e-05, "loss": 0.7307, "num_input_tokens_seen": 18246256, "step": 31645 }, { "epoch": 4.7140303842716715, "grad_norm": 1.3496949672698975, "learning_rate": 3.1774407719901e-05, "loss": 0.6676, "num_input_tokens_seen": 18249328, "step": 31650 }, { "epoch": 4.71477509681263, "grad_norm": 1.4365639686584473, "learning_rate": 3.1768151795766025e-05, "loss": 0.6047, "num_input_tokens_seen": 18252304, "step": 31655 }, { "epoch": 4.715519809353589, "grad_norm": 1.9407927989959717, "learning_rate": 3.1761895414266865e-05, "loss": 0.4656, "num_input_tokens_seen": 18255120, "step": 31660 }, { "epoch": 4.716264521894549, "grad_norm": 1.0306589603424072, "learning_rate": 3.1755638575826295e-05, "loss": 0.6345, "num_input_tokens_seen": 18258096, "step": 31665 }, { "epoch": 4.717009234435508, "grad_norm": 1.5451875925064087, "learning_rate": 3.1749381280867146e-05, "loss": 0.8339, "num_input_tokens_seen": 18260720, "step": 31670 }, { "epoch": 4.717753946976467, "grad_norm": 1.5237070322036743, "learning_rate": 3.174312352981225e-05, "loss": 0.5795, "num_input_tokens_seen": 18263440, "step": 31675 }, { "epoch": 4.718498659517426, "grad_norm": 2.8180429935455322, "learning_rate": 3.173686532308448e-05, "loss": 0.8423, "num_input_tokens_seen": 18266192, "step": 31680 }, { "epoch": 4.7192433720583855, "grad_norm": 2.3025290966033936, "learning_rate": 3.1730606661106736e-05, "loss": 0.6868, "num_input_tokens_seen": 18269392, "step": 31685 }, { "epoch": 4.719988084599344, "grad_norm": 1.3279173374176025, "learning_rate": 3.172434754430197e-05, "loss": 0.5431, "num_input_tokens_seen": 18272272, "step": 31690 }, { "epoch": 4.720732797140304, "grad_norm": 1.3725509643554688, "learning_rate": 3.1718087973093135e-05, "loss": 0.5167, "num_input_tokens_seen": 18275408, "step": 31695 }, { "epoch": 4.721477509681263, "grad_norm": 1.0489822626113892, "learning_rate": 3.171182794790322e-05, "loss": 0.657, "num_input_tokens_seen": 18278448, "step": 31700 }, { "epoch": 4.722222222222222, "grad_norm": 1.2138317823410034, "learning_rate": 3.1705567469155266e-05, "loss": 0.5501, "num_input_tokens_seen": 18281136, "step": 31705 }, { "epoch": 4.722966934763181, "grad_norm": 1.5633862018585205, "learning_rate": 3.169930653727232e-05, "loss": 0.592, "num_input_tokens_seen": 18284080, "step": 31710 }, { "epoch": 4.723711647304141, "grad_norm": 1.9505242109298706, "learning_rate": 3.169304515267748e-05, "loss": 0.6273, "num_input_tokens_seen": 18286992, "step": 31715 }, { "epoch": 4.7244563598450995, "grad_norm": 1.2546212673187256, "learning_rate": 3.168678331579387e-05, "loss": 0.6286, "num_input_tokens_seen": 18289936, "step": 31720 }, { "epoch": 4.725201072386059, "grad_norm": 1.4611932039260864, "learning_rate": 3.168052102704461e-05, "loss": 0.4882, "num_input_tokens_seen": 18292752, "step": 31725 }, { "epoch": 4.725945784927018, "grad_norm": 2.6933722496032715, "learning_rate": 3.1674258286852906e-05, "loss": 0.6014, "num_input_tokens_seen": 18295728, "step": 31730 }, { "epoch": 4.7266904974679775, "grad_norm": 1.7203667163848877, "learning_rate": 3.1667995095641975e-05, "loss": 0.6147, "num_input_tokens_seen": 18298576, "step": 31735 }, { "epoch": 4.727435210008936, "grad_norm": 2.9081923961639404, "learning_rate": 3.1661731453835036e-05, "loss": 0.6775, "num_input_tokens_seen": 18301360, "step": 31740 }, { "epoch": 4.728179922549896, "grad_norm": 1.161057472229004, "learning_rate": 3.165546736185537e-05, "loss": 0.7092, "num_input_tokens_seen": 18304880, "step": 31745 }, { "epoch": 4.728924635090855, "grad_norm": 1.7193796634674072, "learning_rate": 3.1649202820126275e-05, "loss": 0.599, "num_input_tokens_seen": 18307664, "step": 31750 }, { "epoch": 4.729669347631814, "grad_norm": 3.7062196731567383, "learning_rate": 3.16429378290711e-05, "loss": 0.6445, "num_input_tokens_seen": 18310384, "step": 31755 }, { "epoch": 4.730414060172773, "grad_norm": 1.1872661113739014, "learning_rate": 3.1636672389113185e-05, "loss": 0.5664, "num_input_tokens_seen": 18313168, "step": 31760 }, { "epoch": 4.731158772713733, "grad_norm": 1.5187196731567383, "learning_rate": 3.163040650067593e-05, "loss": 0.6658, "num_input_tokens_seen": 18316560, "step": 31765 }, { "epoch": 4.7319034852546915, "grad_norm": 2.4487361907958984, "learning_rate": 3.162414016418277e-05, "loss": 0.6184, "num_input_tokens_seen": 18319280, "step": 31770 }, { "epoch": 4.732648197795651, "grad_norm": 1.2651931047439575, "learning_rate": 3.161787338005715e-05, "loss": 0.5923, "num_input_tokens_seen": 18322064, "step": 31775 }, { "epoch": 4.73339291033661, "grad_norm": 1.366952896118164, "learning_rate": 3.161160614872254e-05, "loss": 0.512, "num_input_tokens_seen": 18324912, "step": 31780 }, { "epoch": 4.73413762287757, "grad_norm": 1.0080289840698242, "learning_rate": 3.160533847060248e-05, "loss": 0.4842, "num_input_tokens_seen": 18327664, "step": 31785 }, { "epoch": 4.734882335418528, "grad_norm": 1.3453832864761353, "learning_rate": 3.1599070346120497e-05, "loss": 0.6212, "num_input_tokens_seen": 18330736, "step": 31790 }, { "epoch": 4.735627047959488, "grad_norm": 9.627108573913574, "learning_rate": 3.1592801775700165e-05, "loss": 0.952, "num_input_tokens_seen": 18333680, "step": 31795 }, { "epoch": 4.736371760500447, "grad_norm": 1.6059329509735107, "learning_rate": 3.1586532759765095e-05, "loss": 0.5475, "num_input_tokens_seen": 18336336, "step": 31800 }, { "epoch": 4.737116473041406, "grad_norm": 1.5316839218139648, "learning_rate": 3.158026329873893e-05, "loss": 0.6359, "num_input_tokens_seen": 18339344, "step": 31805 }, { "epoch": 4.737861185582365, "grad_norm": 3.0274527072906494, "learning_rate": 3.157399339304532e-05, "loss": 0.6017, "num_input_tokens_seen": 18342352, "step": 31810 }, { "epoch": 4.738605898123325, "grad_norm": 2.2308008670806885, "learning_rate": 3.1567723043107955e-05, "loss": 0.5751, "num_input_tokens_seen": 18345040, "step": 31815 }, { "epoch": 4.7393506106642835, "grad_norm": 1.3611071109771729, "learning_rate": 3.156145224935059e-05, "loss": 0.5096, "num_input_tokens_seen": 18348176, "step": 31820 }, { "epoch": 4.740095323205242, "grad_norm": 2.454918384552002, "learning_rate": 3.1555181012196936e-05, "loss": 0.7363, "num_input_tokens_seen": 18350896, "step": 31825 }, { "epoch": 4.740840035746202, "grad_norm": 1.4284789562225342, "learning_rate": 3.154890933207081e-05, "loss": 0.5114, "num_input_tokens_seen": 18353680, "step": 31830 }, { "epoch": 4.741584748287162, "grad_norm": 1.4757475852966309, "learning_rate": 3.154263720939602e-05, "loss": 0.7524, "num_input_tokens_seen": 18356912, "step": 31835 }, { "epoch": 4.74232946082812, "grad_norm": 2.0278127193450928, "learning_rate": 3.15363646445964e-05, "loss": 0.5396, "num_input_tokens_seen": 18359856, "step": 31840 }, { "epoch": 4.743074173369079, "grad_norm": 1.3131974935531616, "learning_rate": 3.153009163809584e-05, "loss": 0.7996, "num_input_tokens_seen": 18362416, "step": 31845 }, { "epoch": 4.743818885910039, "grad_norm": 4.5169267654418945, "learning_rate": 3.1523818190318234e-05, "loss": 0.6291, "num_input_tokens_seen": 18365360, "step": 31850 }, { "epoch": 4.744563598450998, "grad_norm": 1.4750425815582275, "learning_rate": 3.151754430168752e-05, "loss": 0.7988, "num_input_tokens_seen": 18367952, "step": 31855 }, { "epoch": 4.745308310991957, "grad_norm": 1.1840935945510864, "learning_rate": 3.151126997262766e-05, "loss": 0.5252, "num_input_tokens_seen": 18370736, "step": 31860 }, { "epoch": 4.746053023532916, "grad_norm": 1.0669807195663452, "learning_rate": 3.150499520356264e-05, "loss": 0.5924, "num_input_tokens_seen": 18373712, "step": 31865 }, { "epoch": 4.746797736073876, "grad_norm": 0.9884637594223022, "learning_rate": 3.1498719994916507e-05, "loss": 0.5251, "num_input_tokens_seen": 18376816, "step": 31870 }, { "epoch": 4.747542448614834, "grad_norm": 4.725052356719971, "learning_rate": 3.149244434711328e-05, "loss": 0.8682, "num_input_tokens_seen": 18379536, "step": 31875 }, { "epoch": 4.748287161155794, "grad_norm": 2.530932664871216, "learning_rate": 3.148616826057708e-05, "loss": 0.7897, "num_input_tokens_seen": 18382512, "step": 31880 }, { "epoch": 4.749031873696753, "grad_norm": 1.3327208757400513, "learning_rate": 3.147989173573199e-05, "loss": 0.591, "num_input_tokens_seen": 18385232, "step": 31885 }, { "epoch": 4.749776586237712, "grad_norm": 1.9705730676651, "learning_rate": 3.147361477300216e-05, "loss": 0.7772, "num_input_tokens_seen": 18388272, "step": 31890 }, { "epoch": 4.750521298778671, "grad_norm": 3.71762752532959, "learning_rate": 3.1467337372811764e-05, "loss": 0.6874, "num_input_tokens_seen": 18391440, "step": 31895 }, { "epoch": 4.751266011319631, "grad_norm": 1.6259071826934814, "learning_rate": 3.1461059535585e-05, "loss": 0.5813, "num_input_tokens_seen": 18394544, "step": 31900 }, { "epoch": 4.7520107238605895, "grad_norm": 1.4784901142120361, "learning_rate": 3.1454781261746114e-05, "loss": 0.7225, "num_input_tokens_seen": 18397840, "step": 31905 }, { "epoch": 4.752755436401549, "grad_norm": 1.0646241903305054, "learning_rate": 3.1448502551719336e-05, "loss": 0.5563, "num_input_tokens_seen": 18400496, "step": 31910 }, { "epoch": 4.753500148942508, "grad_norm": 2.423823833465576, "learning_rate": 3.1442223405928985e-05, "loss": 0.6235, "num_input_tokens_seen": 18403408, "step": 31915 }, { "epoch": 4.754244861483468, "grad_norm": 1.5800302028656006, "learning_rate": 3.1435943824799375e-05, "loss": 0.6058, "num_input_tokens_seen": 18406384, "step": 31920 }, { "epoch": 4.754989574024426, "grad_norm": 3.0587053298950195, "learning_rate": 3.142966380875483e-05, "loss": 0.7614, "num_input_tokens_seen": 18409232, "step": 31925 }, { "epoch": 4.755734286565386, "grad_norm": 0.6729381680488586, "learning_rate": 3.1423383358219756e-05, "loss": 0.4514, "num_input_tokens_seen": 18412144, "step": 31930 }, { "epoch": 4.756478999106345, "grad_norm": 1.516709804534912, "learning_rate": 3.1417102473618554e-05, "loss": 0.5662, "num_input_tokens_seen": 18414896, "step": 31935 }, { "epoch": 4.757223711647304, "grad_norm": 1.8934497833251953, "learning_rate": 3.141082115537565e-05, "loss": 0.5523, "num_input_tokens_seen": 18417840, "step": 31940 }, { "epoch": 4.757968424188263, "grad_norm": 0.948828935623169, "learning_rate": 3.1404539403915515e-05, "loss": 0.5809, "num_input_tokens_seen": 18420688, "step": 31945 }, { "epoch": 4.758713136729223, "grad_norm": 2.5922024250030518, "learning_rate": 3.139825721966265e-05, "loss": 0.7553, "num_input_tokens_seen": 18423312, "step": 31950 }, { "epoch": 4.759457849270182, "grad_norm": 1.633744478225708, "learning_rate": 3.139197460304157e-05, "loss": 0.6056, "num_input_tokens_seen": 18425968, "step": 31955 }, { "epoch": 4.760202561811141, "grad_norm": 1.9226356744766235, "learning_rate": 3.138569155447685e-05, "loss": 0.64, "num_input_tokens_seen": 18428656, "step": 31960 }, { "epoch": 4.7609472743521, "grad_norm": 1.5008755922317505, "learning_rate": 3.137940807439304e-05, "loss": 0.675, "num_input_tokens_seen": 18432816, "step": 31965 }, { "epoch": 4.76169198689306, "grad_norm": 1.252493143081665, "learning_rate": 3.137312416321478e-05, "loss": 0.6326, "num_input_tokens_seen": 18435600, "step": 31970 }, { "epoch": 4.762436699434018, "grad_norm": 2.503547191619873, "learning_rate": 3.1366839821366696e-05, "loss": 0.616, "num_input_tokens_seen": 18438352, "step": 31975 }, { "epoch": 4.763181411974978, "grad_norm": 1.634627342224121, "learning_rate": 3.136055504927347e-05, "loss": 0.6207, "num_input_tokens_seen": 18441200, "step": 31980 }, { "epoch": 4.763926124515937, "grad_norm": 2.0408918857574463, "learning_rate": 3.135426984735978e-05, "loss": 0.5353, "num_input_tokens_seen": 18444144, "step": 31985 }, { "epoch": 4.764670837056896, "grad_norm": 1.0720715522766113, "learning_rate": 3.134798421605037e-05, "loss": 0.7148, "num_input_tokens_seen": 18446928, "step": 31990 }, { "epoch": 4.765415549597855, "grad_norm": 0.9549868106842041, "learning_rate": 3.134169815577e-05, "loss": 0.509, "num_input_tokens_seen": 18449872, "step": 31995 }, { "epoch": 4.766160262138815, "grad_norm": 0.9168775677680969, "learning_rate": 3.133541166694345e-05, "loss": 0.7053, "num_input_tokens_seen": 18453008, "step": 32000 }, { "epoch": 4.766904974679774, "grad_norm": 1.9044352769851685, "learning_rate": 3.132912474999555e-05, "loss": 0.6457, "num_input_tokens_seen": 18456368, "step": 32005 }, { "epoch": 4.767649687220732, "grad_norm": 1.6905437707901, "learning_rate": 3.132283740535111e-05, "loss": 0.7997, "num_input_tokens_seen": 18459152, "step": 32010 }, { "epoch": 4.768394399761692, "grad_norm": 2.692939043045044, "learning_rate": 3.131654963343504e-05, "loss": 0.7698, "num_input_tokens_seen": 18461840, "step": 32015 }, { "epoch": 4.769139112302652, "grad_norm": 1.6623362302780151, "learning_rate": 3.1310261434672234e-05, "loss": 0.6591, "num_input_tokens_seen": 18464976, "step": 32020 }, { "epoch": 4.76988382484361, "grad_norm": 0.8272369503974915, "learning_rate": 3.13039728094876e-05, "loss": 0.6548, "num_input_tokens_seen": 18467856, "step": 32025 }, { "epoch": 4.770628537384569, "grad_norm": 1.1503775119781494, "learning_rate": 3.129768375830612e-05, "loss": 0.6368, "num_input_tokens_seen": 18471248, "step": 32030 }, { "epoch": 4.771373249925529, "grad_norm": 0.9557896256446838, "learning_rate": 3.1291394281552776e-05, "loss": 0.4786, "num_input_tokens_seen": 18473776, "step": 32035 }, { "epoch": 4.772117962466488, "grad_norm": 1.5321003198623657, "learning_rate": 3.128510437965259e-05, "loss": 0.5559, "num_input_tokens_seen": 18476720, "step": 32040 }, { "epoch": 4.772862675007447, "grad_norm": 2.2463889122009277, "learning_rate": 3.127881405303059e-05, "loss": 0.6534, "num_input_tokens_seen": 18479408, "step": 32045 }, { "epoch": 4.773607387548406, "grad_norm": 0.9481462836265564, "learning_rate": 3.127252330211187e-05, "loss": 0.6616, "num_input_tokens_seen": 18482192, "step": 32050 }, { "epoch": 4.774352100089366, "grad_norm": 0.8852630257606506, "learning_rate": 3.126623212732153e-05, "loss": 0.5623, "num_input_tokens_seen": 18485360, "step": 32055 }, { "epoch": 4.775096812630324, "grad_norm": 1.6256877183914185, "learning_rate": 3.12599405290847e-05, "loss": 0.5302, "num_input_tokens_seen": 18488336, "step": 32060 }, { "epoch": 4.775841525171284, "grad_norm": 1.135331153869629, "learning_rate": 3.125364850782654e-05, "loss": 0.6547, "num_input_tokens_seen": 18491184, "step": 32065 }, { "epoch": 4.776586237712243, "grad_norm": 1.339093565940857, "learning_rate": 3.124735606397224e-05, "loss": 0.5118, "num_input_tokens_seen": 18493808, "step": 32070 }, { "epoch": 4.777330950253202, "grad_norm": 2.695462942123413, "learning_rate": 3.124106319794701e-05, "loss": 0.6552, "num_input_tokens_seen": 18496816, "step": 32075 }, { "epoch": 4.778075662794161, "grad_norm": 1.4475255012512207, "learning_rate": 3.123476991017611e-05, "loss": 0.5995, "num_input_tokens_seen": 18499472, "step": 32080 }, { "epoch": 4.778820375335121, "grad_norm": 1.243971824645996, "learning_rate": 3.122847620108481e-05, "loss": 0.5981, "num_input_tokens_seen": 18502224, "step": 32085 }, { "epoch": 4.77956508787608, "grad_norm": 1.2745051383972168, "learning_rate": 3.122218207109841e-05, "loss": 0.5033, "num_input_tokens_seen": 18505104, "step": 32090 }, { "epoch": 4.780309800417039, "grad_norm": 1.1542706489562988, "learning_rate": 3.1215887520642237e-05, "loss": 0.7495, "num_input_tokens_seen": 18508080, "step": 32095 }, { "epoch": 4.781054512957998, "grad_norm": 1.3943662643432617, "learning_rate": 3.120959255014166e-05, "loss": 0.5567, "num_input_tokens_seen": 18510768, "step": 32100 }, { "epoch": 4.781799225498958, "grad_norm": 1.4659998416900635, "learning_rate": 3.120329716002208e-05, "loss": 0.7695, "num_input_tokens_seen": 18513840, "step": 32105 }, { "epoch": 4.782543938039916, "grad_norm": 1.3832972049713135, "learning_rate": 3.119700135070888e-05, "loss": 0.6999, "num_input_tokens_seen": 18516624, "step": 32110 }, { "epoch": 4.783288650580876, "grad_norm": 0.6523256301879883, "learning_rate": 3.119070512262753e-05, "loss": 0.502, "num_input_tokens_seen": 18519568, "step": 32115 }, { "epoch": 4.784033363121835, "grad_norm": 1.1500974893569946, "learning_rate": 3.1184408476203496e-05, "loss": 0.6816, "num_input_tokens_seen": 18522288, "step": 32120 }, { "epoch": 4.7847780756627944, "grad_norm": 2.225907325744629, "learning_rate": 3.1178111411862285e-05, "loss": 0.7462, "num_input_tokens_seen": 18525136, "step": 32125 }, { "epoch": 4.785522788203753, "grad_norm": 2.11740779876709, "learning_rate": 3.117181393002942e-05, "loss": 0.7116, "num_input_tokens_seen": 18528368, "step": 32130 }, { "epoch": 4.786267500744713, "grad_norm": 1.1523196697235107, "learning_rate": 3.116551603113046e-05, "loss": 0.5516, "num_input_tokens_seen": 18531600, "step": 32135 }, { "epoch": 4.787012213285672, "grad_norm": 3.384799003601074, "learning_rate": 3.1159217715591e-05, "loss": 0.7009, "num_input_tokens_seen": 18534576, "step": 32140 }, { "epoch": 4.787756925826631, "grad_norm": 1.1666615009307861, "learning_rate": 3.115291898383664e-05, "loss": 0.6381, "num_input_tokens_seen": 18537552, "step": 32145 }, { "epoch": 4.78850163836759, "grad_norm": 1.355270504951477, "learning_rate": 3.114661983629304e-05, "loss": 0.7491, "num_input_tokens_seen": 18540560, "step": 32150 }, { "epoch": 4.78924635090855, "grad_norm": 0.7924596071243286, "learning_rate": 3.114032027338585e-05, "loss": 0.6027, "num_input_tokens_seen": 18543408, "step": 32155 }, { "epoch": 4.789991063449508, "grad_norm": 2.5005857944488525, "learning_rate": 3.113402029554079e-05, "loss": 0.7552, "num_input_tokens_seen": 18546224, "step": 32160 }, { "epoch": 4.790735775990468, "grad_norm": 0.5216996073722839, "learning_rate": 3.112771990318358e-05, "loss": 0.506, "num_input_tokens_seen": 18548976, "step": 32165 }, { "epoch": 4.791480488531427, "grad_norm": 1.6369706392288208, "learning_rate": 3.112141909673997e-05, "loss": 0.7127, "num_input_tokens_seen": 18551952, "step": 32170 }, { "epoch": 4.792225201072386, "grad_norm": 0.9987096190452576, "learning_rate": 3.1115117876635735e-05, "loss": 0.7131, "num_input_tokens_seen": 18555024, "step": 32175 }, { "epoch": 4.792969913613345, "grad_norm": 1.4273866415023804, "learning_rate": 3.1108816243296716e-05, "loss": 0.5349, "num_input_tokens_seen": 18558096, "step": 32180 }, { "epoch": 4.793714626154305, "grad_norm": 1.1265918016433716, "learning_rate": 3.110251419714872e-05, "loss": 0.5747, "num_input_tokens_seen": 18561008, "step": 32185 }, { "epoch": 4.794459338695264, "grad_norm": 1.973141074180603, "learning_rate": 3.109621173861762e-05, "loss": 0.5474, "num_input_tokens_seen": 18564688, "step": 32190 }, { "epoch": 4.795204051236222, "grad_norm": 1.6272413730621338, "learning_rate": 3.1089908868129316e-05, "loss": 0.6863, "num_input_tokens_seen": 18567728, "step": 32195 }, { "epoch": 4.795948763777182, "grad_norm": 1.7275886535644531, "learning_rate": 3.108360558610974e-05, "loss": 0.654, "num_input_tokens_seen": 18570448, "step": 32200 }, { "epoch": 4.796693476318142, "grad_norm": 1.5832138061523438, "learning_rate": 3.1077301892984834e-05, "loss": 0.6983, "num_input_tokens_seen": 18573392, "step": 32205 }, { "epoch": 4.7974381888591004, "grad_norm": 0.9013642072677612, "learning_rate": 3.107099778918057e-05, "loss": 0.7012, "num_input_tokens_seen": 18576240, "step": 32210 }, { "epoch": 4.798182901400059, "grad_norm": 1.4792070388793945, "learning_rate": 3.106469327512296e-05, "loss": 0.5141, "num_input_tokens_seen": 18578864, "step": 32215 }, { "epoch": 4.798927613941019, "grad_norm": 0.8470029234886169, "learning_rate": 3.1058388351238035e-05, "loss": 0.58, "num_input_tokens_seen": 18581680, "step": 32220 }, { "epoch": 4.799672326481978, "grad_norm": 2.970445394515991, "learning_rate": 3.105208301795185e-05, "loss": 0.6509, "num_input_tokens_seen": 18584496, "step": 32225 }, { "epoch": 4.800417039022937, "grad_norm": 1.4811574220657349, "learning_rate": 3.1045777275690505e-05, "loss": 0.5617, "num_input_tokens_seen": 18587184, "step": 32230 }, { "epoch": 4.801161751563896, "grad_norm": 1.0086419582366943, "learning_rate": 3.1039471124880114e-05, "loss": 0.6178, "num_input_tokens_seen": 18590352, "step": 32235 }, { "epoch": 4.801906464104856, "grad_norm": 2.264997959136963, "learning_rate": 3.103316456594683e-05, "loss": 0.658, "num_input_tokens_seen": 18593296, "step": 32240 }, { "epoch": 4.802651176645814, "grad_norm": 1.4323174953460693, "learning_rate": 3.1026857599316795e-05, "loss": 0.5269, "num_input_tokens_seen": 18596112, "step": 32245 }, { "epoch": 4.803395889186774, "grad_norm": 1.7190110683441162, "learning_rate": 3.102055022541623e-05, "loss": 0.597, "num_input_tokens_seen": 18600240, "step": 32250 }, { "epoch": 4.804140601727733, "grad_norm": 1.0123910903930664, "learning_rate": 3.1014242444671366e-05, "loss": 0.5275, "num_input_tokens_seen": 18603088, "step": 32255 }, { "epoch": 4.8048853142686925, "grad_norm": 0.7196571230888367, "learning_rate": 3.100793425750845e-05, "loss": 0.4702, "num_input_tokens_seen": 18605808, "step": 32260 }, { "epoch": 4.805630026809651, "grad_norm": 2.3889317512512207, "learning_rate": 3.100162566435375e-05, "loss": 0.5692, "num_input_tokens_seen": 18608720, "step": 32265 }, { "epoch": 4.806374739350611, "grad_norm": 1.1144250631332397, "learning_rate": 3.0995316665633606e-05, "loss": 0.482, "num_input_tokens_seen": 18611568, "step": 32270 }, { "epoch": 4.80711945189157, "grad_norm": 0.6453508138656616, "learning_rate": 3.098900726177432e-05, "loss": 0.4371, "num_input_tokens_seen": 18614416, "step": 32275 }, { "epoch": 4.807864164432529, "grad_norm": 1.1537550687789917, "learning_rate": 3.0982697453202284e-05, "loss": 0.6233, "num_input_tokens_seen": 18617232, "step": 32280 }, { "epoch": 4.808608876973488, "grad_norm": 1.1513402462005615, "learning_rate": 3.0976387240343886e-05, "loss": 0.5973, "num_input_tokens_seen": 18620080, "step": 32285 }, { "epoch": 4.809353589514448, "grad_norm": 2.324084758758545, "learning_rate": 3.097007662362552e-05, "loss": 0.6568, "num_input_tokens_seen": 18622960, "step": 32290 }, { "epoch": 4.8100983020554064, "grad_norm": 2.140252113342285, "learning_rate": 3.096376560347365e-05, "loss": 0.6453, "num_input_tokens_seen": 18625936, "step": 32295 }, { "epoch": 4.810843014596366, "grad_norm": 1.1199995279312134, "learning_rate": 3.095745418031476e-05, "loss": 0.7298, "num_input_tokens_seen": 18629072, "step": 32300 }, { "epoch": 4.811587727137325, "grad_norm": 1.2740758657455444, "learning_rate": 3.095114235457533e-05, "loss": 0.6891, "num_input_tokens_seen": 18631952, "step": 32305 }, { "epoch": 4.8123324396782845, "grad_norm": 1.1602091789245605, "learning_rate": 3.094483012668189e-05, "loss": 0.794, "num_input_tokens_seen": 18634928, "step": 32310 }, { "epoch": 4.813077152219243, "grad_norm": 1.5208499431610107, "learning_rate": 3.093851749706101e-05, "loss": 0.7127, "num_input_tokens_seen": 18637648, "step": 32315 }, { "epoch": 4.813821864760203, "grad_norm": 0.8826357126235962, "learning_rate": 3.093220446613926e-05, "loss": 0.5293, "num_input_tokens_seen": 18640368, "step": 32320 }, { "epoch": 4.814566577301162, "grad_norm": 1.6159086227416992, "learning_rate": 3.092589103434324e-05, "loss": 0.5532, "num_input_tokens_seen": 18643536, "step": 32325 }, { "epoch": 4.815311289842121, "grad_norm": 2.0169167518615723, "learning_rate": 3.0919577202099606e-05, "loss": 0.6639, "num_input_tokens_seen": 18646608, "step": 32330 }, { "epoch": 4.81605600238308, "grad_norm": 1.0183680057525635, "learning_rate": 3.091326296983501e-05, "loss": 0.6853, "num_input_tokens_seen": 18649456, "step": 32335 }, { "epoch": 4.816800714924039, "grad_norm": 1.8272027969360352, "learning_rate": 3.0906948337976146e-05, "loss": 0.6866, "num_input_tokens_seen": 18652368, "step": 32340 }, { "epoch": 4.8175454274649985, "grad_norm": 2.114530563354492, "learning_rate": 3.090063330694972e-05, "loss": 0.6785, "num_input_tokens_seen": 18655312, "step": 32345 }, { "epoch": 4.818290140005958, "grad_norm": 1.5242516994476318, "learning_rate": 3.08943178771825e-05, "loss": 0.66, "num_input_tokens_seen": 18658128, "step": 32350 }, { "epoch": 4.819034852546917, "grad_norm": 1.307846188545227, "learning_rate": 3.088800204910123e-05, "loss": 0.6446, "num_input_tokens_seen": 18661040, "step": 32355 }, { "epoch": 4.819779565087876, "grad_norm": 1.465585708618164, "learning_rate": 3.088168582313273e-05, "loss": 0.4625, "num_input_tokens_seen": 18663632, "step": 32360 }, { "epoch": 4.820524277628835, "grad_norm": 1.781245231628418, "learning_rate": 3.087536919970381e-05, "loss": 0.4292, "num_input_tokens_seen": 18666352, "step": 32365 }, { "epoch": 4.821268990169795, "grad_norm": 1.7362914085388184, "learning_rate": 3.0869052179241334e-05, "loss": 0.5658, "num_input_tokens_seen": 18669136, "step": 32370 }, { "epoch": 4.822013702710754, "grad_norm": 1.1606284379959106, "learning_rate": 3.0862734762172164e-05, "loss": 0.6442, "num_input_tokens_seen": 18672016, "step": 32375 }, { "epoch": 4.8227584152517124, "grad_norm": 1.1221965551376343, "learning_rate": 3.085641694892322e-05, "loss": 0.5396, "num_input_tokens_seen": 18675088, "step": 32380 }, { "epoch": 4.823503127792672, "grad_norm": 2.1941065788269043, "learning_rate": 3.085009873992143e-05, "loss": 0.8758, "num_input_tokens_seen": 18678000, "step": 32385 }, { "epoch": 4.824247840333631, "grad_norm": 1.3296414613723755, "learning_rate": 3.084378013559374e-05, "loss": 0.6264, "num_input_tokens_seen": 18680560, "step": 32390 }, { "epoch": 4.8249925528745905, "grad_norm": 1.6001805067062378, "learning_rate": 3.083746113636716e-05, "loss": 0.4967, "num_input_tokens_seen": 18683376, "step": 32395 }, { "epoch": 4.825737265415549, "grad_norm": 1.7083929777145386, "learning_rate": 3.083114174266869e-05, "loss": 0.5788, "num_input_tokens_seen": 18686608, "step": 32400 }, { "epoch": 4.826481977956509, "grad_norm": 1.4164484739303589, "learning_rate": 3.082482195492536e-05, "loss": 0.665, "num_input_tokens_seen": 18689776, "step": 32405 }, { "epoch": 4.827226690497468, "grad_norm": 1.1170525550842285, "learning_rate": 3.081850177356425e-05, "loss": 0.6832, "num_input_tokens_seen": 18692880, "step": 32410 }, { "epoch": 4.827971403038427, "grad_norm": 3.0623741149902344, "learning_rate": 3.0812181199012455e-05, "loss": 0.9709, "num_input_tokens_seen": 18696048, "step": 32415 }, { "epoch": 4.828716115579386, "grad_norm": 1.5455573797225952, "learning_rate": 3.080586023169707e-05, "loss": 0.6484, "num_input_tokens_seen": 18698992, "step": 32420 }, { "epoch": 4.829460828120346, "grad_norm": 1.3995550870895386, "learning_rate": 3.079953887204527e-05, "loss": 0.7523, "num_input_tokens_seen": 18701776, "step": 32425 }, { "epoch": 4.8302055406613045, "grad_norm": 2.1672720909118652, "learning_rate": 3.07932171204842e-05, "loss": 0.6338, "num_input_tokens_seen": 18704752, "step": 32430 }, { "epoch": 4.830950253202264, "grad_norm": 2.3077869415283203, "learning_rate": 3.0786894977441074e-05, "loss": 0.7865, "num_input_tokens_seen": 18707664, "step": 32435 }, { "epoch": 4.831694965743223, "grad_norm": 1.301377296447754, "learning_rate": 3.078057244334311e-05, "loss": 0.6808, "num_input_tokens_seen": 18710320, "step": 32440 }, { "epoch": 4.8324396782841825, "grad_norm": 1.446791410446167, "learning_rate": 3.077424951861757e-05, "loss": 0.6459, "num_input_tokens_seen": 18713456, "step": 32445 }, { "epoch": 4.833184390825141, "grad_norm": 1.1523780822753906, "learning_rate": 3.0767926203691724e-05, "loss": 0.5247, "num_input_tokens_seen": 18716304, "step": 32450 }, { "epoch": 4.833929103366101, "grad_norm": 1.8115653991699219, "learning_rate": 3.076160249899286e-05, "loss": 0.7147, "num_input_tokens_seen": 18719312, "step": 32455 }, { "epoch": 4.83467381590706, "grad_norm": 1.1140166521072388, "learning_rate": 3.075527840494834e-05, "loss": 0.4679, "num_input_tokens_seen": 18722128, "step": 32460 }, { "epoch": 4.835418528448019, "grad_norm": 1.5486555099487305, "learning_rate": 3.074895392198551e-05, "loss": 0.7452, "num_input_tokens_seen": 18725136, "step": 32465 }, { "epoch": 4.836163240988978, "grad_norm": 1.1567778587341309, "learning_rate": 3.074262905053173e-05, "loss": 0.4785, "num_input_tokens_seen": 18728432, "step": 32470 }, { "epoch": 4.836907953529938, "grad_norm": 2.1282224655151367, "learning_rate": 3.073630379101443e-05, "loss": 0.7737, "num_input_tokens_seen": 18731408, "step": 32475 }, { "epoch": 4.8376526660708965, "grad_norm": 2.7484045028686523, "learning_rate": 3.072997814386106e-05, "loss": 0.6668, "num_input_tokens_seen": 18734320, "step": 32480 }, { "epoch": 4.838397378611856, "grad_norm": 1.4011667966842651, "learning_rate": 3.0723652109499046e-05, "loss": 0.6916, "num_input_tokens_seen": 18737104, "step": 32485 }, { "epoch": 4.839142091152815, "grad_norm": 0.9613898396492004, "learning_rate": 3.0717325688355893e-05, "loss": 0.5826, "num_input_tokens_seen": 18740144, "step": 32490 }, { "epoch": 4.8398868036937746, "grad_norm": 1.0396569967269897, "learning_rate": 3.071099888085911e-05, "loss": 0.523, "num_input_tokens_seen": 18742928, "step": 32495 }, { "epoch": 4.840631516234733, "grad_norm": 1.129289150238037, "learning_rate": 3.070467168743626e-05, "loss": 0.6258, "num_input_tokens_seen": 18745680, "step": 32500 }, { "epoch": 4.841376228775693, "grad_norm": 0.8597966432571411, "learning_rate": 3.0698344108514886e-05, "loss": 0.4963, "num_input_tokens_seen": 18748560, "step": 32505 }, { "epoch": 4.842120941316652, "grad_norm": 2.846896171569824, "learning_rate": 3.069201614452258e-05, "loss": 0.6481, "num_input_tokens_seen": 18751440, "step": 32510 }, { "epoch": 4.842865653857611, "grad_norm": 0.8527178764343262, "learning_rate": 3.0685687795886964e-05, "loss": 0.5542, "num_input_tokens_seen": 18754544, "step": 32515 }, { "epoch": 4.84361036639857, "grad_norm": 1.6130887269973755, "learning_rate": 3.067935906303568e-05, "loss": 0.688, "num_input_tokens_seen": 18757232, "step": 32520 }, { "epoch": 4.844355078939529, "grad_norm": 0.929354190826416, "learning_rate": 3.0673029946396406e-05, "loss": 0.6011, "num_input_tokens_seen": 18759824, "step": 32525 }, { "epoch": 4.8450997914804885, "grad_norm": 3.1805741786956787, "learning_rate": 3.0666700446396835e-05, "loss": 0.6704, "num_input_tokens_seen": 18762864, "step": 32530 }, { "epoch": 4.845844504021448, "grad_norm": 1.2009285688400269, "learning_rate": 3.0660370563464694e-05, "loss": 0.5842, "num_input_tokens_seen": 18765584, "step": 32535 }, { "epoch": 4.846589216562407, "grad_norm": 1.002156138420105, "learning_rate": 3.065404029802771e-05, "loss": 0.6085, "num_input_tokens_seen": 18768752, "step": 32540 }, { "epoch": 4.847333929103366, "grad_norm": 2.044102191925049, "learning_rate": 3.064770965051367e-05, "loss": 0.6375, "num_input_tokens_seen": 18771856, "step": 32545 }, { "epoch": 4.848078641644325, "grad_norm": 0.8721259236335754, "learning_rate": 3.0641378621350384e-05, "loss": 0.5079, "num_input_tokens_seen": 18774640, "step": 32550 }, { "epoch": 4.848823354185284, "grad_norm": 1.0930172204971313, "learning_rate": 3.063504721096566e-05, "loss": 0.4786, "num_input_tokens_seen": 18777424, "step": 32555 }, { "epoch": 4.849568066726244, "grad_norm": 1.4631867408752441, "learning_rate": 3.0628715419787355e-05, "loss": 0.7043, "num_input_tokens_seen": 18780208, "step": 32560 }, { "epoch": 4.8503127792672025, "grad_norm": 1.2474828958511353, "learning_rate": 3.062238324824336e-05, "loss": 0.5278, "num_input_tokens_seen": 18783152, "step": 32565 }, { "epoch": 4.851057491808162, "grad_norm": 3.677794933319092, "learning_rate": 3.061605069676155e-05, "loss": 0.5873, "num_input_tokens_seen": 18786288, "step": 32570 }, { "epoch": 4.851802204349121, "grad_norm": 1.4629169702529907, "learning_rate": 3.0609717765769866e-05, "loss": 0.5549, "num_input_tokens_seen": 18789136, "step": 32575 }, { "epoch": 4.8525469168900806, "grad_norm": 1.2653796672821045, "learning_rate": 3.060338445569627e-05, "loss": 0.6352, "num_input_tokens_seen": 18792016, "step": 32580 }, { "epoch": 4.853291629431039, "grad_norm": 1.3301721811294556, "learning_rate": 3.059705076696873e-05, "loss": 0.5657, "num_input_tokens_seen": 18794896, "step": 32585 }, { "epoch": 4.854036341971999, "grad_norm": 1.4266386032104492, "learning_rate": 3.059071670001526e-05, "loss": 0.6695, "num_input_tokens_seen": 18797552, "step": 32590 }, { "epoch": 4.854781054512958, "grad_norm": 1.7161494493484497, "learning_rate": 3.058438225526388e-05, "loss": 0.5223, "num_input_tokens_seen": 18800080, "step": 32595 }, { "epoch": 4.855525767053917, "grad_norm": 3.8202402591705322, "learning_rate": 3.057804743314266e-05, "loss": 0.6286, "num_input_tokens_seen": 18803056, "step": 32600 }, { "epoch": 4.856270479594876, "grad_norm": 2.649240255355835, "learning_rate": 3.0571712234079666e-05, "loss": 0.6837, "num_input_tokens_seen": 18806000, "step": 32605 }, { "epoch": 4.857015192135836, "grad_norm": 2.4204564094543457, "learning_rate": 3.0565376658503e-05, "loss": 0.6023, "num_input_tokens_seen": 18808752, "step": 32610 }, { "epoch": 4.8577599046767945, "grad_norm": 1.1485531330108643, "learning_rate": 3.055904070684082e-05, "loss": 0.6648, "num_input_tokens_seen": 18811248, "step": 32615 }, { "epoch": 4.858504617217754, "grad_norm": 1.2822257280349731, "learning_rate": 3.055270437952127e-05, "loss": 0.6813, "num_input_tokens_seen": 18814544, "step": 32620 }, { "epoch": 4.859249329758713, "grad_norm": 1.806185007095337, "learning_rate": 3.054636767697254e-05, "loss": 0.7692, "num_input_tokens_seen": 18817392, "step": 32625 }, { "epoch": 4.859994042299673, "grad_norm": 1.7404109239578247, "learning_rate": 3.054003059962283e-05, "loss": 0.6398, "num_input_tokens_seen": 18820336, "step": 32630 }, { "epoch": 4.860738754840631, "grad_norm": 3.1123409271240234, "learning_rate": 3.0533693147900365e-05, "loss": 0.6547, "num_input_tokens_seen": 18823184, "step": 32635 }, { "epoch": 4.861483467381591, "grad_norm": 2.3905797004699707, "learning_rate": 3.052735532223342e-05, "loss": 0.7146, "num_input_tokens_seen": 18826288, "step": 32640 }, { "epoch": 4.86222817992255, "grad_norm": 1.804463505744934, "learning_rate": 3.052101712305028e-05, "loss": 0.6813, "num_input_tokens_seen": 18828944, "step": 32645 }, { "epoch": 4.862972892463509, "grad_norm": 1.8579156398773193, "learning_rate": 3.051467855077925e-05, "loss": 0.6263, "num_input_tokens_seen": 18831824, "step": 32650 }, { "epoch": 4.863717605004468, "grad_norm": 1.046466588973999, "learning_rate": 3.050833960584866e-05, "loss": 0.6697, "num_input_tokens_seen": 18834736, "step": 32655 }, { "epoch": 4.864462317545428, "grad_norm": 1.0746822357177734, "learning_rate": 3.0502000288686877e-05, "loss": 0.4949, "num_input_tokens_seen": 18837648, "step": 32660 }, { "epoch": 4.8652070300863866, "grad_norm": 1.1929875612258911, "learning_rate": 3.0495660599722292e-05, "loss": 0.6547, "num_input_tokens_seen": 18840848, "step": 32665 }, { "epoch": 4.865951742627346, "grad_norm": 1.609010934829712, "learning_rate": 3.0489320539383294e-05, "loss": 0.7548, "num_input_tokens_seen": 18843568, "step": 32670 }, { "epoch": 4.866696455168305, "grad_norm": 2.00429630279541, "learning_rate": 3.0482980108098336e-05, "loss": 0.5744, "num_input_tokens_seen": 18846256, "step": 32675 }, { "epoch": 4.867441167709265, "grad_norm": 1.373534083366394, "learning_rate": 3.0476639306295874e-05, "loss": 0.5686, "num_input_tokens_seen": 18848976, "step": 32680 }, { "epoch": 4.868185880250223, "grad_norm": 2.925330877304077, "learning_rate": 3.0470298134404403e-05, "loss": 0.3974, "num_input_tokens_seen": 18851600, "step": 32685 }, { "epoch": 4.868930592791182, "grad_norm": 2.0232648849487305, "learning_rate": 3.0463956592852412e-05, "loss": 0.5759, "num_input_tokens_seen": 18854384, "step": 32690 }, { "epoch": 4.869675305332142, "grad_norm": 1.8281959295272827, "learning_rate": 3.0457614682068452e-05, "loss": 0.7465, "num_input_tokens_seen": 18857168, "step": 32695 }, { "epoch": 4.870420017873101, "grad_norm": 1.838850736618042, "learning_rate": 3.0451272402481086e-05, "loss": 0.6904, "num_input_tokens_seen": 18859792, "step": 32700 }, { "epoch": 4.87116473041406, "grad_norm": 0.7049323320388794, "learning_rate": 3.044492975451889e-05, "loss": 0.5732, "num_input_tokens_seen": 18862704, "step": 32705 }, { "epoch": 4.871909442955019, "grad_norm": 2.46398663520813, "learning_rate": 3.0438586738610482e-05, "loss": 0.5422, "num_input_tokens_seen": 18865616, "step": 32710 }, { "epoch": 4.872654155495979, "grad_norm": 1.6699734926223755, "learning_rate": 3.0432243355184494e-05, "loss": 0.4807, "num_input_tokens_seen": 18868336, "step": 32715 }, { "epoch": 4.873398868036938, "grad_norm": 1.7398245334625244, "learning_rate": 3.0425899604669577e-05, "loss": 0.6492, "num_input_tokens_seen": 18871024, "step": 32720 }, { "epoch": 4.874143580577897, "grad_norm": 1.5268954038619995, "learning_rate": 3.041955548749444e-05, "loss": 0.525, "num_input_tokens_seen": 18874000, "step": 32725 }, { "epoch": 4.874888293118856, "grad_norm": 1.3914917707443237, "learning_rate": 3.0413211004087773e-05, "loss": 0.5099, "num_input_tokens_seen": 18876848, "step": 32730 }, { "epoch": 4.875633005659815, "grad_norm": 0.7032663822174072, "learning_rate": 3.0406866154878306e-05, "loss": 0.6108, "num_input_tokens_seen": 18879888, "step": 32735 }, { "epoch": 4.876377718200774, "grad_norm": 2.6460955142974854, "learning_rate": 3.0400520940294808e-05, "loss": 0.5672, "num_input_tokens_seen": 18882992, "step": 32740 }, { "epoch": 4.877122430741734, "grad_norm": 2.3331027030944824, "learning_rate": 3.039417536076607e-05, "loss": 0.7088, "num_input_tokens_seen": 18886000, "step": 32745 }, { "epoch": 4.8778671432826926, "grad_norm": 1.155197024345398, "learning_rate": 3.0387829416720888e-05, "loss": 0.6055, "num_input_tokens_seen": 18888912, "step": 32750 }, { "epoch": 4.878611855823652, "grad_norm": 1.558631420135498, "learning_rate": 3.0381483108588093e-05, "loss": 0.4931, "num_input_tokens_seen": 18891632, "step": 32755 }, { "epoch": 4.879356568364611, "grad_norm": 1.5671144723892212, "learning_rate": 3.037513643679656e-05, "loss": 0.5644, "num_input_tokens_seen": 18894160, "step": 32760 }, { "epoch": 4.880101280905571, "grad_norm": 2.470519781112671, "learning_rate": 3.036878940177516e-05, "loss": 0.5379, "num_input_tokens_seen": 18897360, "step": 32765 }, { "epoch": 4.880845993446529, "grad_norm": 3.021494150161743, "learning_rate": 3.0362442003952795e-05, "loss": 0.8433, "num_input_tokens_seen": 18900176, "step": 32770 }, { "epoch": 4.881590705987489, "grad_norm": 0.9759924411773682, "learning_rate": 3.03560942437584e-05, "loss": 0.6246, "num_input_tokens_seen": 18902768, "step": 32775 }, { "epoch": 4.882335418528448, "grad_norm": 1.9191075563430786, "learning_rate": 3.0349746121620935e-05, "loss": 0.8031, "num_input_tokens_seen": 18905424, "step": 32780 }, { "epoch": 4.883080131069407, "grad_norm": 2.7981643676757812, "learning_rate": 3.034339763796938e-05, "loss": 0.6459, "num_input_tokens_seen": 18908144, "step": 32785 }, { "epoch": 4.883824843610366, "grad_norm": 1.7048829793930054, "learning_rate": 3.033704879323273e-05, "loss": 0.4886, "num_input_tokens_seen": 18910864, "step": 32790 }, { "epoch": 4.884569556151326, "grad_norm": 2.5098440647125244, "learning_rate": 3.0330699587840027e-05, "loss": 0.795, "num_input_tokens_seen": 18913808, "step": 32795 }, { "epoch": 4.885314268692285, "grad_norm": 2.248335123062134, "learning_rate": 3.0324350022220317e-05, "loss": 0.6359, "num_input_tokens_seen": 18916752, "step": 32800 }, { "epoch": 4.886058981233244, "grad_norm": 2.731221914291382, "learning_rate": 3.0318000096802686e-05, "loss": 0.626, "num_input_tokens_seen": 18919568, "step": 32805 }, { "epoch": 4.886803693774203, "grad_norm": 1.4147584438323975, "learning_rate": 3.031164981201622e-05, "loss": 0.8141, "num_input_tokens_seen": 18922448, "step": 32810 }, { "epoch": 4.887548406315163, "grad_norm": 1.1033501625061035, "learning_rate": 3.0305299168290064e-05, "loss": 0.6966, "num_input_tokens_seen": 18925360, "step": 32815 }, { "epoch": 4.888293118856121, "grad_norm": 1.3301115036010742, "learning_rate": 3.0298948166053352e-05, "loss": 0.6802, "num_input_tokens_seen": 18928496, "step": 32820 }, { "epoch": 4.889037831397081, "grad_norm": 0.9533198475837708, "learning_rate": 3.0292596805735274e-05, "loss": 0.5197, "num_input_tokens_seen": 18931120, "step": 32825 }, { "epoch": 4.88978254393804, "grad_norm": 1.330328106880188, "learning_rate": 3.028624508776502e-05, "loss": 0.6142, "num_input_tokens_seen": 18934192, "step": 32830 }, { "epoch": 4.890527256478999, "grad_norm": 1.9297434091567993, "learning_rate": 3.0279893012571807e-05, "loss": 0.8231, "num_input_tokens_seen": 18936880, "step": 32835 }, { "epoch": 4.891271969019958, "grad_norm": 0.9733994007110596, "learning_rate": 3.0273540580584897e-05, "loss": 0.7263, "num_input_tokens_seen": 18939856, "step": 32840 }, { "epoch": 4.892016681560918, "grad_norm": 2.78269624710083, "learning_rate": 3.026718779223356e-05, "loss": 0.8514, "num_input_tokens_seen": 18942544, "step": 32845 }, { "epoch": 4.892761394101877, "grad_norm": 1.0679439306259155, "learning_rate": 3.0260834647947085e-05, "loss": 0.6034, "num_input_tokens_seen": 18945680, "step": 32850 }, { "epoch": 4.893506106642836, "grad_norm": 0.8217410445213318, "learning_rate": 3.0254481148154788e-05, "loss": 0.5911, "num_input_tokens_seen": 18948656, "step": 32855 }, { "epoch": 4.894250819183795, "grad_norm": 2.848759412765503, "learning_rate": 3.0248127293286022e-05, "loss": 0.6397, "num_input_tokens_seen": 18951312, "step": 32860 }, { "epoch": 4.894995531724755, "grad_norm": 1.8788646459579468, "learning_rate": 3.0241773083770154e-05, "loss": 0.7699, "num_input_tokens_seen": 18954384, "step": 32865 }, { "epoch": 4.895740244265713, "grad_norm": 1.2978861331939697, "learning_rate": 3.0235418520036567e-05, "loss": 0.5965, "num_input_tokens_seen": 18957360, "step": 32870 }, { "epoch": 4.896484956806672, "grad_norm": 1.2824633121490479, "learning_rate": 3.0229063602514678e-05, "loss": 0.606, "num_input_tokens_seen": 18960112, "step": 32875 }, { "epoch": 4.897229669347632, "grad_norm": 4.031881332397461, "learning_rate": 3.022270833163394e-05, "loss": 0.6879, "num_input_tokens_seen": 18962864, "step": 32880 }, { "epoch": 4.8979743818885915, "grad_norm": 1.4806125164031982, "learning_rate": 3.0216352707823807e-05, "loss": 0.6108, "num_input_tokens_seen": 18965552, "step": 32885 }, { "epoch": 4.89871909442955, "grad_norm": 2.1252174377441406, "learning_rate": 3.0209996731513757e-05, "loss": 0.7792, "num_input_tokens_seen": 18968592, "step": 32890 }, { "epoch": 4.899463806970509, "grad_norm": 0.9737502336502075, "learning_rate": 3.020364040313332e-05, "loss": 0.5806, "num_input_tokens_seen": 18971568, "step": 32895 }, { "epoch": 4.900208519511469, "grad_norm": 1.0234612226486206, "learning_rate": 3.0197283723112013e-05, "loss": 0.7033, "num_input_tokens_seen": 18974288, "step": 32900 }, { "epoch": 4.900953232052427, "grad_norm": 1.0511398315429688, "learning_rate": 3.0190926691879412e-05, "loss": 0.6561, "num_input_tokens_seen": 18977008, "step": 32905 }, { "epoch": 4.901697944593387, "grad_norm": 1.5314557552337646, "learning_rate": 3.018456930986508e-05, "loss": 0.7421, "num_input_tokens_seen": 18979824, "step": 32910 }, { "epoch": 4.902442657134346, "grad_norm": 1.224367380142212, "learning_rate": 3.017821157749864e-05, "loss": 0.5024, "num_input_tokens_seen": 18982544, "step": 32915 }, { "epoch": 4.903187369675305, "grad_norm": 2.5785129070281982, "learning_rate": 3.0171853495209708e-05, "loss": 0.6602, "num_input_tokens_seen": 18985072, "step": 32920 }, { "epoch": 4.903932082216264, "grad_norm": 3.1702513694763184, "learning_rate": 3.0165495063427952e-05, "loss": 0.5522, "num_input_tokens_seen": 18987984, "step": 32925 }, { "epoch": 4.904676794757224, "grad_norm": 1.2404378652572632, "learning_rate": 3.0159136282583038e-05, "loss": 0.4827, "num_input_tokens_seen": 18990800, "step": 32930 }, { "epoch": 4.905421507298183, "grad_norm": 1.470645785331726, "learning_rate": 3.0152777153104665e-05, "loss": 0.443, "num_input_tokens_seen": 18993808, "step": 32935 }, { "epoch": 4.906166219839142, "grad_norm": 1.1370540857315063, "learning_rate": 3.014641767542256e-05, "loss": 0.554, "num_input_tokens_seen": 18996784, "step": 32940 }, { "epoch": 4.906910932380101, "grad_norm": 2.0073955059051514, "learning_rate": 3.014005784996648e-05, "loss": 0.6326, "num_input_tokens_seen": 18999888, "step": 32945 }, { "epoch": 4.907655644921061, "grad_norm": 1.4256882667541504, "learning_rate": 3.013369767716619e-05, "loss": 0.5233, "num_input_tokens_seen": 19002704, "step": 32950 }, { "epoch": 4.908400357462019, "grad_norm": 1.2439208030700684, "learning_rate": 3.0127337157451475e-05, "loss": 0.5987, "num_input_tokens_seen": 19005520, "step": 32955 }, { "epoch": 4.909145070002979, "grad_norm": 0.8748859763145447, "learning_rate": 3.0120976291252167e-05, "loss": 0.4594, "num_input_tokens_seen": 19008528, "step": 32960 }, { "epoch": 4.909889782543938, "grad_norm": 1.1894497871398926, "learning_rate": 3.0114615078998103e-05, "loss": 0.5718, "num_input_tokens_seen": 19011568, "step": 32965 }, { "epoch": 4.9106344950848975, "grad_norm": 2.703903913497925, "learning_rate": 3.010825352111914e-05, "loss": 0.8662, "num_input_tokens_seen": 19014448, "step": 32970 }, { "epoch": 4.911379207625856, "grad_norm": 1.4600123167037964, "learning_rate": 3.0101891618045175e-05, "loss": 0.7691, "num_input_tokens_seen": 19017328, "step": 32975 }, { "epoch": 4.912123920166816, "grad_norm": 1.4349455833435059, "learning_rate": 3.009552937020612e-05, "loss": 0.7776, "num_input_tokens_seen": 19020016, "step": 32980 }, { "epoch": 4.912868632707775, "grad_norm": 1.5655269622802734, "learning_rate": 3.008916677803191e-05, "loss": 0.5797, "num_input_tokens_seen": 19023120, "step": 32985 }, { "epoch": 4.913613345248734, "grad_norm": 1.305808186531067, "learning_rate": 3.008280384195249e-05, "loss": 0.3382, "num_input_tokens_seen": 19025936, "step": 32990 }, { "epoch": 4.914358057789693, "grad_norm": 1.3541346788406372, "learning_rate": 3.0076440562397857e-05, "loss": 0.5314, "num_input_tokens_seen": 19029584, "step": 32995 }, { "epoch": 4.915102770330653, "grad_norm": 3.5927841663360596, "learning_rate": 3.007007693979801e-05, "loss": 0.7045, "num_input_tokens_seen": 19032496, "step": 33000 }, { "epoch": 4.915847482871611, "grad_norm": 1.155368685722351, "learning_rate": 3.006371297458297e-05, "loss": 0.6924, "num_input_tokens_seen": 19035408, "step": 33005 }, { "epoch": 4.916592195412571, "grad_norm": 3.2390663623809814, "learning_rate": 3.0057348667182806e-05, "loss": 0.803, "num_input_tokens_seen": 19038288, "step": 33010 }, { "epoch": 4.91733690795353, "grad_norm": 3.5866286754608154, "learning_rate": 3.005098401802758e-05, "loss": 0.6127, "num_input_tokens_seen": 19041296, "step": 33015 }, { "epoch": 4.9180816204944895, "grad_norm": 1.2846719026565552, "learning_rate": 3.0044619027547384e-05, "loss": 0.6553, "num_input_tokens_seen": 19044368, "step": 33020 }, { "epoch": 4.918826333035448, "grad_norm": 1.403743028640747, "learning_rate": 3.0038253696172342e-05, "loss": 0.6759, "num_input_tokens_seen": 19047216, "step": 33025 }, { "epoch": 4.919571045576408, "grad_norm": 2.229177951812744, "learning_rate": 3.003188802433261e-05, "loss": 0.6676, "num_input_tokens_seen": 19050128, "step": 33030 }, { "epoch": 4.920315758117367, "grad_norm": 3.377756118774414, "learning_rate": 3.0025522012458336e-05, "loss": 0.7587, "num_input_tokens_seen": 19053264, "step": 33035 }, { "epoch": 4.921060470658325, "grad_norm": 1.4190524816513062, "learning_rate": 3.0019155660979713e-05, "loss": 0.7305, "num_input_tokens_seen": 19056112, "step": 33040 }, { "epoch": 4.921805183199285, "grad_norm": 1.2038047313690186, "learning_rate": 3.0012788970326967e-05, "loss": 0.8375, "num_input_tokens_seen": 19059216, "step": 33045 }, { "epoch": 4.922549895740245, "grad_norm": 1.0824110507965088, "learning_rate": 3.000642194093032e-05, "loss": 0.6195, "num_input_tokens_seen": 19062288, "step": 33050 }, { "epoch": 4.9232946082812035, "grad_norm": 1.3559086322784424, "learning_rate": 3.0000054573220028e-05, "loss": 0.5986, "num_input_tokens_seen": 19065104, "step": 33055 }, { "epoch": 4.924039320822162, "grad_norm": 1.4397720098495483, "learning_rate": 2.999368686762638e-05, "loss": 0.692, "num_input_tokens_seen": 19068048, "step": 33060 }, { "epoch": 4.924784033363122, "grad_norm": 1.4976730346679688, "learning_rate": 2.998731882457967e-05, "loss": 0.6638, "num_input_tokens_seen": 19070672, "step": 33065 }, { "epoch": 4.9255287459040815, "grad_norm": 0.9194758534431458, "learning_rate": 2.9980950444510236e-05, "loss": 0.4685, "num_input_tokens_seen": 19073808, "step": 33070 }, { "epoch": 4.92627345844504, "grad_norm": 2.0419368743896484, "learning_rate": 2.9974581727848423e-05, "loss": 0.8129, "num_input_tokens_seen": 19076720, "step": 33075 }, { "epoch": 4.927018170985999, "grad_norm": 1.1010874509811401, "learning_rate": 2.9968212675024603e-05, "loss": 0.6502, "num_input_tokens_seen": 19079536, "step": 33080 }, { "epoch": 4.927762883526959, "grad_norm": 2.029325008392334, "learning_rate": 2.9961843286469164e-05, "loss": 0.6772, "num_input_tokens_seen": 19082416, "step": 33085 }, { "epoch": 4.928507596067917, "grad_norm": 1.046453833580017, "learning_rate": 2.9955473562612535e-05, "loss": 0.5974, "num_input_tokens_seen": 19085360, "step": 33090 }, { "epoch": 4.929252308608877, "grad_norm": 1.5936472415924072, "learning_rate": 2.994910350388515e-05, "loss": 0.6126, "num_input_tokens_seen": 19088080, "step": 33095 }, { "epoch": 4.929997021149836, "grad_norm": 1.531834363937378, "learning_rate": 2.994273311071747e-05, "loss": 0.6683, "num_input_tokens_seen": 19090736, "step": 33100 }, { "epoch": 4.9307417336907955, "grad_norm": 1.758235216140747, "learning_rate": 2.9936362383539974e-05, "loss": 0.6317, "num_input_tokens_seen": 19093616, "step": 33105 }, { "epoch": 4.931486446231754, "grad_norm": 1.2245656251907349, "learning_rate": 2.992999132278319e-05, "loss": 0.7628, "num_input_tokens_seen": 19096368, "step": 33110 }, { "epoch": 4.932231158772714, "grad_norm": 0.948405385017395, "learning_rate": 2.9923619928877632e-05, "loss": 0.5453, "num_input_tokens_seen": 19099280, "step": 33115 }, { "epoch": 4.932975871313673, "grad_norm": 1.2575165033340454, "learning_rate": 2.9917248202253856e-05, "loss": 0.6346, "num_input_tokens_seen": 19101936, "step": 33120 }, { "epoch": 4.933720583854632, "grad_norm": 1.9311223030090332, "learning_rate": 2.9910876143342443e-05, "loss": 0.6938, "num_input_tokens_seen": 19104592, "step": 33125 }, { "epoch": 4.934465296395591, "grad_norm": 1.7331899404525757, "learning_rate": 2.9904503752573987e-05, "loss": 0.7442, "num_input_tokens_seen": 19107376, "step": 33130 }, { "epoch": 4.935210008936551, "grad_norm": 1.4561601877212524, "learning_rate": 2.98981310303791e-05, "loss": 0.6279, "num_input_tokens_seen": 19110384, "step": 33135 }, { "epoch": 4.9359547214775095, "grad_norm": 1.6184298992156982, "learning_rate": 2.9891757977188433e-05, "loss": 0.6018, "num_input_tokens_seen": 19113168, "step": 33140 }, { "epoch": 4.936699434018469, "grad_norm": 1.8786416053771973, "learning_rate": 2.9885384593432658e-05, "loss": 0.782, "num_input_tokens_seen": 19115888, "step": 33145 }, { "epoch": 4.937444146559428, "grad_norm": 1.1345704793930054, "learning_rate": 2.987901087954245e-05, "loss": 0.6025, "num_input_tokens_seen": 19118800, "step": 33150 }, { "epoch": 4.9381888591003875, "grad_norm": 0.8974241018295288, "learning_rate": 2.987263683594852e-05, "loss": 0.5621, "num_input_tokens_seen": 19121712, "step": 33155 }, { "epoch": 4.938933571641346, "grad_norm": 1.154138207435608, "learning_rate": 2.986626246308161e-05, "loss": 0.5678, "num_input_tokens_seen": 19124592, "step": 33160 }, { "epoch": 4.939678284182306, "grad_norm": 1.4051628112792969, "learning_rate": 2.9859887761372464e-05, "loss": 0.7106, "num_input_tokens_seen": 19127728, "step": 33165 }, { "epoch": 4.940422996723265, "grad_norm": 1.7297111749649048, "learning_rate": 2.9853512731251866e-05, "loss": 0.5567, "num_input_tokens_seen": 19130640, "step": 33170 }, { "epoch": 4.941167709264224, "grad_norm": 1.235853672027588, "learning_rate": 2.9847137373150602e-05, "loss": 0.5742, "num_input_tokens_seen": 19133744, "step": 33175 }, { "epoch": 4.941912421805183, "grad_norm": 0.7429044246673584, "learning_rate": 2.9840761687499507e-05, "loss": 0.624, "num_input_tokens_seen": 19136368, "step": 33180 }, { "epoch": 4.942657134346143, "grad_norm": 1.052955985069275, "learning_rate": 2.9834385674729416e-05, "loss": 0.5541, "num_input_tokens_seen": 19139120, "step": 33185 }, { "epoch": 4.9434018468871015, "grad_norm": 1.9111202955245972, "learning_rate": 2.98280093352712e-05, "loss": 0.7133, "num_input_tokens_seen": 19141936, "step": 33190 }, { "epoch": 4.944146559428061, "grad_norm": 1.0963155031204224, "learning_rate": 2.9821632669555743e-05, "loss": 0.5589, "num_input_tokens_seen": 19145104, "step": 33195 }, { "epoch": 4.94489127196902, "grad_norm": 2.2658326625823975, "learning_rate": 2.981525567801395e-05, "loss": 0.748, "num_input_tokens_seen": 19148048, "step": 33200 }, { "epoch": 4.945635984509979, "grad_norm": 1.7132794857025146, "learning_rate": 2.9808878361076754e-05, "loss": 0.6394, "num_input_tokens_seen": 19150928, "step": 33205 }, { "epoch": 4.946380697050938, "grad_norm": 2.3312125205993652, "learning_rate": 2.9802500719175107e-05, "loss": 0.744, "num_input_tokens_seen": 19153520, "step": 33210 }, { "epoch": 4.947125409591898, "grad_norm": 1.1769475936889648, "learning_rate": 2.9796122752739997e-05, "loss": 0.7368, "num_input_tokens_seen": 19156272, "step": 33215 }, { "epoch": 4.947870122132857, "grad_norm": 1.3925859928131104, "learning_rate": 2.9789744462202407e-05, "loss": 0.6289, "num_input_tokens_seen": 19158896, "step": 33220 }, { "epoch": 4.9486148346738155, "grad_norm": 4.454769134521484, "learning_rate": 2.9783365847993362e-05, "loss": 0.5606, "num_input_tokens_seen": 19161744, "step": 33225 }, { "epoch": 4.949359547214775, "grad_norm": 1.4459127187728882, "learning_rate": 2.9776986910543896e-05, "loss": 0.5309, "num_input_tokens_seen": 19164624, "step": 33230 }, { "epoch": 4.950104259755735, "grad_norm": 1.3323487043380737, "learning_rate": 2.9770607650285074e-05, "loss": 0.6268, "num_input_tokens_seen": 19167792, "step": 33235 }, { "epoch": 4.9508489722966935, "grad_norm": 1.3696845769882202, "learning_rate": 2.9764228067647987e-05, "loss": 0.6715, "num_input_tokens_seen": 19170736, "step": 33240 }, { "epoch": 4.951593684837652, "grad_norm": 2.07452654838562, "learning_rate": 2.975784816306374e-05, "loss": 0.6507, "num_input_tokens_seen": 19173456, "step": 33245 }, { "epoch": 4.952338397378612, "grad_norm": 0.8133036494255066, "learning_rate": 2.9751467936963456e-05, "loss": 0.549, "num_input_tokens_seen": 19176368, "step": 33250 }, { "epoch": 4.953083109919571, "grad_norm": 1.3127233982086182, "learning_rate": 2.9745087389778286e-05, "loss": 0.5242, "num_input_tokens_seen": 19179280, "step": 33255 }, { "epoch": 4.95382782246053, "grad_norm": 1.7017031908035278, "learning_rate": 2.9738706521939402e-05, "loss": 0.7243, "num_input_tokens_seen": 19182480, "step": 33260 }, { "epoch": 4.954572535001489, "grad_norm": 3.113754987716675, "learning_rate": 2.9732325333877997e-05, "loss": 0.6601, "num_input_tokens_seen": 19185712, "step": 33265 }, { "epoch": 4.955317247542449, "grad_norm": 1.1199778318405151, "learning_rate": 2.9725943826025287e-05, "loss": 0.6786, "num_input_tokens_seen": 19188464, "step": 33270 }, { "epoch": 4.9560619600834075, "grad_norm": 1.3856056928634644, "learning_rate": 2.9719561998812506e-05, "loss": 0.638, "num_input_tokens_seen": 19191152, "step": 33275 }, { "epoch": 4.956806672624367, "grad_norm": 1.2171944379806519, "learning_rate": 2.971317985267092e-05, "loss": 0.7251, "num_input_tokens_seen": 19194480, "step": 33280 }, { "epoch": 4.957551385165326, "grad_norm": 1.2518551349639893, "learning_rate": 2.9706797388031794e-05, "loss": 0.5448, "num_input_tokens_seen": 19197264, "step": 33285 }, { "epoch": 4.9582960977062855, "grad_norm": 1.014174222946167, "learning_rate": 2.9700414605326444e-05, "loss": 0.478, "num_input_tokens_seen": 19200144, "step": 33290 }, { "epoch": 4.959040810247244, "grad_norm": 1.4458167552947998, "learning_rate": 2.969403150498618e-05, "loss": 0.6907, "num_input_tokens_seen": 19203376, "step": 33295 }, { "epoch": 4.959785522788204, "grad_norm": 2.4738097190856934, "learning_rate": 2.9687648087442353e-05, "loss": 0.5933, "num_input_tokens_seen": 19206096, "step": 33300 }, { "epoch": 4.960530235329163, "grad_norm": 1.4031012058258057, "learning_rate": 2.968126435312632e-05, "loss": 0.6454, "num_input_tokens_seen": 19208784, "step": 33305 }, { "epoch": 4.961274947870122, "grad_norm": 1.3574966192245483, "learning_rate": 2.9674880302469487e-05, "loss": 0.5545, "num_input_tokens_seen": 19211472, "step": 33310 }, { "epoch": 4.962019660411081, "grad_norm": 1.2168644666671753, "learning_rate": 2.9668495935903246e-05, "loss": 0.7539, "num_input_tokens_seen": 19214160, "step": 33315 }, { "epoch": 4.962764372952041, "grad_norm": 2.18149995803833, "learning_rate": 2.9662111253859025e-05, "loss": 0.5175, "num_input_tokens_seen": 19217008, "step": 33320 }, { "epoch": 4.9635090854929995, "grad_norm": 2.353635311126709, "learning_rate": 2.9655726256768286e-05, "loss": 0.6771, "num_input_tokens_seen": 19220048, "step": 33325 }, { "epoch": 4.964253798033959, "grad_norm": 1.787961721420288, "learning_rate": 2.96493409450625e-05, "loss": 0.6835, "num_input_tokens_seen": 19222736, "step": 33330 }, { "epoch": 4.964998510574918, "grad_norm": 1.4516650438308716, "learning_rate": 2.9642955319173142e-05, "loss": 0.6169, "num_input_tokens_seen": 19225264, "step": 33335 }, { "epoch": 4.965743223115878, "grad_norm": 1.8255724906921387, "learning_rate": 2.963656937953175e-05, "loss": 0.6688, "num_input_tokens_seen": 19228112, "step": 33340 }, { "epoch": 4.966487935656836, "grad_norm": 0.8370242118835449, "learning_rate": 2.9630183126569843e-05, "loss": 0.6333, "num_input_tokens_seen": 19231152, "step": 33345 }, { "epoch": 4.967232648197796, "grad_norm": 1.057240605354309, "learning_rate": 2.9623796560718997e-05, "loss": 0.4847, "num_input_tokens_seen": 19233616, "step": 33350 }, { "epoch": 4.967977360738755, "grad_norm": 1.7357966899871826, "learning_rate": 2.961740968241077e-05, "loss": 0.6308, "num_input_tokens_seen": 19236432, "step": 33355 }, { "epoch": 4.968722073279714, "grad_norm": 1.6476942300796509, "learning_rate": 2.961102249207677e-05, "loss": 0.6303, "num_input_tokens_seen": 19239344, "step": 33360 }, { "epoch": 4.969466785820673, "grad_norm": 1.9186803102493286, "learning_rate": 2.9604634990148617e-05, "loss": 0.5539, "num_input_tokens_seen": 19242448, "step": 33365 }, { "epoch": 4.970211498361633, "grad_norm": 1.6066948175430298, "learning_rate": 2.9598247177057952e-05, "loss": 0.5732, "num_input_tokens_seen": 19245168, "step": 33370 }, { "epoch": 4.9709562109025915, "grad_norm": 1.9777538776397705, "learning_rate": 2.9591859053236436e-05, "loss": 0.6295, "num_input_tokens_seen": 19248208, "step": 33375 }, { "epoch": 4.971700923443551, "grad_norm": 1.5419999361038208, "learning_rate": 2.9585470619115762e-05, "loss": 0.833, "num_input_tokens_seen": 19251056, "step": 33380 }, { "epoch": 4.97244563598451, "grad_norm": 1.1114931106567383, "learning_rate": 2.9579081875127625e-05, "loss": 0.8082, "num_input_tokens_seen": 19253968, "step": 33385 }, { "epoch": 4.973190348525469, "grad_norm": 1.7962740659713745, "learning_rate": 2.9572692821703745e-05, "loss": 0.5393, "num_input_tokens_seen": 19256720, "step": 33390 }, { "epoch": 4.973935061066428, "grad_norm": 1.5727025270462036, "learning_rate": 2.9566303459275884e-05, "loss": 0.6314, "num_input_tokens_seen": 19259728, "step": 33395 }, { "epoch": 4.974679773607388, "grad_norm": 1.5420334339141846, "learning_rate": 2.9559913788275793e-05, "loss": 0.6599, "num_input_tokens_seen": 19262736, "step": 33400 }, { "epoch": 4.975424486148347, "grad_norm": 1.180577278137207, "learning_rate": 2.955352380913527e-05, "loss": 0.5941, "num_input_tokens_seen": 19265488, "step": 33405 }, { "epoch": 4.9761691986893055, "grad_norm": 1.4165407419204712, "learning_rate": 2.954713352228613e-05, "loss": 0.6757, "num_input_tokens_seen": 19268080, "step": 33410 }, { "epoch": 4.976913911230265, "grad_norm": 1.6120353937149048, "learning_rate": 2.9540742928160182e-05, "loss": 0.7354, "num_input_tokens_seen": 19271536, "step": 33415 }, { "epoch": 4.977658623771224, "grad_norm": 1.0208197832107544, "learning_rate": 2.953435202718929e-05, "loss": 0.7009, "num_input_tokens_seen": 19274320, "step": 33420 }, { "epoch": 4.978403336312184, "grad_norm": 1.7863991260528564, "learning_rate": 2.9527960819805327e-05, "loss": 0.7661, "num_input_tokens_seen": 19277072, "step": 33425 }, { "epoch": 4.979148048853142, "grad_norm": 1.2238730192184448, "learning_rate": 2.9521569306440183e-05, "loss": 0.6915, "num_input_tokens_seen": 19279984, "step": 33430 }, { "epoch": 4.979892761394102, "grad_norm": 2.251209259033203, "learning_rate": 2.9515177487525763e-05, "loss": 0.5485, "num_input_tokens_seen": 19282736, "step": 33435 }, { "epoch": 4.980637473935061, "grad_norm": 1.8113088607788086, "learning_rate": 2.9508785363494e-05, "loss": 0.6683, "num_input_tokens_seen": 19285552, "step": 33440 }, { "epoch": 4.98138218647602, "grad_norm": 1.1286228895187378, "learning_rate": 2.950239293477687e-05, "loss": 0.9319, "num_input_tokens_seen": 19288912, "step": 33445 }, { "epoch": 4.982126899016979, "grad_norm": 2.256714344024658, "learning_rate": 2.949600020180632e-05, "loss": 0.7617, "num_input_tokens_seen": 19291632, "step": 33450 }, { "epoch": 4.982871611557939, "grad_norm": 0.7559281587600708, "learning_rate": 2.9489607165014353e-05, "loss": 0.6277, "num_input_tokens_seen": 19294576, "step": 33455 }, { "epoch": 4.9836163240988975, "grad_norm": 0.9458989500999451, "learning_rate": 2.9483213824833e-05, "loss": 0.6029, "num_input_tokens_seen": 19297840, "step": 33460 }, { "epoch": 4.984361036639857, "grad_norm": 1.514949917793274, "learning_rate": 2.9476820181694276e-05, "loss": 0.8463, "num_input_tokens_seen": 19300880, "step": 33465 }, { "epoch": 4.985105749180816, "grad_norm": 1.2558259963989258, "learning_rate": 2.9470426236030247e-05, "loss": 0.556, "num_input_tokens_seen": 19304176, "step": 33470 }, { "epoch": 4.985850461721776, "grad_norm": 1.1955256462097168, "learning_rate": 2.9464031988272983e-05, "loss": 0.5082, "num_input_tokens_seen": 19307120, "step": 33475 }, { "epoch": 4.986595174262734, "grad_norm": 3.4159011840820312, "learning_rate": 2.9457637438854592e-05, "loss": 0.6066, "num_input_tokens_seen": 19310096, "step": 33480 }, { "epoch": 4.987339886803694, "grad_norm": 1.5454895496368408, "learning_rate": 2.9451242588207185e-05, "loss": 0.5568, "num_input_tokens_seen": 19312752, "step": 33485 }, { "epoch": 4.988084599344653, "grad_norm": 1.1843105554580688, "learning_rate": 2.94448474367629e-05, "loss": 0.5766, "num_input_tokens_seen": 19315312, "step": 33490 }, { "epoch": 4.988829311885612, "grad_norm": 1.0467292070388794, "learning_rate": 2.94384519849539e-05, "loss": 0.6104, "num_input_tokens_seen": 19318032, "step": 33495 }, { "epoch": 4.989574024426571, "grad_norm": 2.583125114440918, "learning_rate": 2.9432056233212357e-05, "loss": 0.5589, "num_input_tokens_seen": 19320880, "step": 33500 }, { "epoch": 4.990318736967531, "grad_norm": 1.7268061637878418, "learning_rate": 2.9425660181970472e-05, "loss": 0.7038, "num_input_tokens_seen": 19323920, "step": 33505 }, { "epoch": 4.99106344950849, "grad_norm": 0.9200357794761658, "learning_rate": 2.9419263831660475e-05, "loss": 0.5375, "num_input_tokens_seen": 19326864, "step": 33510 }, { "epoch": 4.991808162049449, "grad_norm": 1.2536152601242065, "learning_rate": 2.941286718271459e-05, "loss": 0.7938, "num_input_tokens_seen": 19329872, "step": 33515 }, { "epoch": 4.992552874590408, "grad_norm": 1.9081377983093262, "learning_rate": 2.9406470235565075e-05, "loss": 0.7231, "num_input_tokens_seen": 19332688, "step": 33520 }, { "epoch": 4.993297587131368, "grad_norm": 2.3402462005615234, "learning_rate": 2.940007299064423e-05, "loss": 0.7163, "num_input_tokens_seen": 19335536, "step": 33525 }, { "epoch": 4.994042299672326, "grad_norm": 1.4019583463668823, "learning_rate": 2.9393675448384332e-05, "loss": 0.6501, "num_input_tokens_seen": 19338192, "step": 33530 }, { "epoch": 4.994787012213286, "grad_norm": 1.1167908906936646, "learning_rate": 2.9387277609217713e-05, "loss": 0.7835, "num_input_tokens_seen": 19341008, "step": 33535 }, { "epoch": 4.995531724754245, "grad_norm": 1.1921207904815674, "learning_rate": 2.9380879473576705e-05, "loss": 0.6096, "num_input_tokens_seen": 19343888, "step": 33540 }, { "epoch": 4.996276437295204, "grad_norm": 1.2022477388381958, "learning_rate": 2.9374481041893687e-05, "loss": 0.5711, "num_input_tokens_seen": 19346928, "step": 33545 }, { "epoch": 4.997021149836163, "grad_norm": 2.899489402770996, "learning_rate": 2.9368082314601018e-05, "loss": 0.7, "num_input_tokens_seen": 19349808, "step": 33550 }, { "epoch": 4.997765862377122, "grad_norm": 0.7129327058792114, "learning_rate": 2.9361683292131103e-05, "loss": 0.6641, "num_input_tokens_seen": 19352752, "step": 33555 }, { "epoch": 4.998510574918082, "grad_norm": 1.0880169868469238, "learning_rate": 2.935528397491637e-05, "loss": 0.5879, "num_input_tokens_seen": 19355472, "step": 33560 }, { "epoch": 4.999255287459041, "grad_norm": 0.930585503578186, "learning_rate": 2.9348884363389246e-05, "loss": 0.6321, "num_input_tokens_seen": 19358320, "step": 33565 }, { "epoch": 5.0, "grad_norm": 2.543302297592163, "learning_rate": 2.9342484457982206e-05, "loss": 0.7214, "num_input_tokens_seen": 19360624, "step": 33570 }, { "epoch": 5.0, "eval_loss": 0.6489034295082092, "eval_runtime": 74.2527, "eval_samples_per_second": 40.187, "eval_steps_per_second": 10.047, "num_input_tokens_seen": 19360624, "step": 33570 }, { "epoch": 5.000744712540959, "grad_norm": 1.7415210008621216, "learning_rate": 2.9336084259127716e-05, "loss": 0.6048, "num_input_tokens_seen": 19363472, "step": 33575 }, { "epoch": 5.001489425081918, "grad_norm": 0.7877578735351562, "learning_rate": 2.932968376725828e-05, "loss": 0.5263, "num_input_tokens_seen": 19366288, "step": 33580 }, { "epoch": 5.002234137622877, "grad_norm": 0.9882253408432007, "learning_rate": 2.932328298280642e-05, "loss": 0.3976, "num_input_tokens_seen": 19368944, "step": 33585 }, { "epoch": 5.002978850163837, "grad_norm": 2.4294755458831787, "learning_rate": 2.9316881906204675e-05, "loss": 0.7256, "num_input_tokens_seen": 19371632, "step": 33590 }, { "epoch": 5.003723562704796, "grad_norm": 1.1955451965332031, "learning_rate": 2.9310480537885605e-05, "loss": 0.4853, "num_input_tokens_seen": 19374608, "step": 33595 }, { "epoch": 5.004468275245755, "grad_norm": 0.8601023554801941, "learning_rate": 2.9304078878281778e-05, "loss": 0.5939, "num_input_tokens_seen": 19377520, "step": 33600 }, { "epoch": 5.005212987786714, "grad_norm": 1.4184337854385376, "learning_rate": 2.9297676927825803e-05, "loss": 0.7016, "num_input_tokens_seen": 19380464, "step": 33605 }, { "epoch": 5.005957700327674, "grad_norm": 0.9417212009429932, "learning_rate": 2.9291274686950294e-05, "loss": 0.5383, "num_input_tokens_seen": 19383152, "step": 33610 }, { "epoch": 5.006702412868632, "grad_norm": 1.6746517419815063, "learning_rate": 2.9284872156087896e-05, "loss": 0.6483, "num_input_tokens_seen": 19386096, "step": 33615 }, { "epoch": 5.007447125409592, "grad_norm": 1.8694233894348145, "learning_rate": 2.9278469335671245e-05, "loss": 0.7593, "num_input_tokens_seen": 19389072, "step": 33620 }, { "epoch": 5.008191837950551, "grad_norm": 1.2730889320373535, "learning_rate": 2.9272066226133037e-05, "loss": 0.6513, "num_input_tokens_seen": 19391792, "step": 33625 }, { "epoch": 5.00893655049151, "grad_norm": 2.1693694591522217, "learning_rate": 2.9265662827905967e-05, "loss": 0.6912, "num_input_tokens_seen": 19394608, "step": 33630 }, { "epoch": 5.009681263032469, "grad_norm": 0.8354572057723999, "learning_rate": 2.925925914142274e-05, "loss": 0.7899, "num_input_tokens_seen": 19397232, "step": 33635 }, { "epoch": 5.010425975573429, "grad_norm": 2.1814420223236084, "learning_rate": 2.92528551671161e-05, "loss": 0.7102, "num_input_tokens_seen": 19399888, "step": 33640 }, { "epoch": 5.011170688114388, "grad_norm": 1.6679344177246094, "learning_rate": 2.9246450905418798e-05, "loss": 0.5457, "num_input_tokens_seen": 19403152, "step": 33645 }, { "epoch": 5.011915400655347, "grad_norm": 1.1300843954086304, "learning_rate": 2.9240046356763607e-05, "loss": 0.6046, "num_input_tokens_seen": 19405744, "step": 33650 }, { "epoch": 5.012660113196306, "grad_norm": 1.375544786453247, "learning_rate": 2.9233641521583325e-05, "loss": 0.5775, "num_input_tokens_seen": 19408304, "step": 33655 }, { "epoch": 5.013404825737266, "grad_norm": 1.0329484939575195, "learning_rate": 2.9227236400310765e-05, "loss": 0.5087, "num_input_tokens_seen": 19411504, "step": 33660 }, { "epoch": 5.014149538278224, "grad_norm": 3.4648354053497314, "learning_rate": 2.9220830993378745e-05, "loss": 0.6558, "num_input_tokens_seen": 19414448, "step": 33665 }, { "epoch": 5.014894250819184, "grad_norm": 1.2836040258407593, "learning_rate": 2.9214425301220133e-05, "loss": 0.693, "num_input_tokens_seen": 19417360, "step": 33670 }, { "epoch": 5.015638963360143, "grad_norm": 1.74018394947052, "learning_rate": 2.9208019324267798e-05, "loss": 0.4954, "num_input_tokens_seen": 19420080, "step": 33675 }, { "epoch": 5.0163836759011025, "grad_norm": 2.0245766639709473, "learning_rate": 2.920161306295462e-05, "loss": 0.8149, "num_input_tokens_seen": 19423184, "step": 33680 }, { "epoch": 5.017128388442061, "grad_norm": 1.9210498332977295, "learning_rate": 2.9195206517713515e-05, "loss": 0.76, "num_input_tokens_seen": 19425776, "step": 33685 }, { "epoch": 5.017873100983021, "grad_norm": 1.612429141998291, "learning_rate": 2.9188799688977407e-05, "loss": 0.7188, "num_input_tokens_seen": 19428816, "step": 33690 }, { "epoch": 5.01861781352398, "grad_norm": 2.508223533630371, "learning_rate": 2.9182392577179257e-05, "loss": 0.7109, "num_input_tokens_seen": 19431888, "step": 33695 }, { "epoch": 5.019362526064939, "grad_norm": 2.2203235626220703, "learning_rate": 2.917598518275201e-05, "loss": 0.7416, "num_input_tokens_seen": 19434928, "step": 33700 }, { "epoch": 5.020107238605898, "grad_norm": 1.4130868911743164, "learning_rate": 2.9169577506128664e-05, "loss": 0.6082, "num_input_tokens_seen": 19437904, "step": 33705 }, { "epoch": 5.020851951146858, "grad_norm": 1.5134116411209106, "learning_rate": 2.9163169547742225e-05, "loss": 0.6303, "num_input_tokens_seen": 19440976, "step": 33710 }, { "epoch": 5.021596663687816, "grad_norm": 1.6079232692718506, "learning_rate": 2.9156761308025715e-05, "loss": 0.6513, "num_input_tokens_seen": 19443568, "step": 33715 }, { "epoch": 5.022341376228776, "grad_norm": 2.2401602268218994, "learning_rate": 2.915035278741218e-05, "loss": 0.5091, "num_input_tokens_seen": 19446256, "step": 33720 }, { "epoch": 5.023086088769735, "grad_norm": 1.4662959575653076, "learning_rate": 2.914394398633467e-05, "loss": 0.3898, "num_input_tokens_seen": 19449040, "step": 33725 }, { "epoch": 5.0238308013106945, "grad_norm": 1.2085607051849365, "learning_rate": 2.9137534905226272e-05, "loss": 0.4417, "num_input_tokens_seen": 19452400, "step": 33730 }, { "epoch": 5.024575513851653, "grad_norm": 2.3479418754577637, "learning_rate": 2.9131125544520095e-05, "loss": 0.7445, "num_input_tokens_seen": 19455120, "step": 33735 }, { "epoch": 5.025320226392613, "grad_norm": 2.4282100200653076, "learning_rate": 2.9124715904649247e-05, "loss": 0.6642, "num_input_tokens_seen": 19458128, "step": 33740 }, { "epoch": 5.026064938933572, "grad_norm": 1.5663714408874512, "learning_rate": 2.911830598604687e-05, "loss": 0.6018, "num_input_tokens_seen": 19460944, "step": 33745 }, { "epoch": 5.02680965147453, "grad_norm": 1.3217169046401978, "learning_rate": 2.911189578914611e-05, "loss": 0.4837, "num_input_tokens_seen": 19463504, "step": 33750 }, { "epoch": 5.02755436401549, "grad_norm": 1.2404571771621704, "learning_rate": 2.9105485314380154e-05, "loss": 0.6553, "num_input_tokens_seen": 19466512, "step": 33755 }, { "epoch": 5.028299076556449, "grad_norm": 0.6331257224082947, "learning_rate": 2.90990745621822e-05, "loss": 0.6483, "num_input_tokens_seen": 19469168, "step": 33760 }, { "epoch": 5.0290437890974085, "grad_norm": 1.488861083984375, "learning_rate": 2.9092663532985442e-05, "loss": 0.5826, "num_input_tokens_seen": 19472080, "step": 33765 }, { "epoch": 5.029788501638367, "grad_norm": 0.9205563068389893, "learning_rate": 2.9086252227223122e-05, "loss": 0.471, "num_input_tokens_seen": 19474832, "step": 33770 }, { "epoch": 5.030533214179327, "grad_norm": 2.2017874717712402, "learning_rate": 2.9079840645328505e-05, "loss": 0.4214, "num_input_tokens_seen": 19477584, "step": 33775 }, { "epoch": 5.031277926720286, "grad_norm": 1.8134377002716064, "learning_rate": 2.907342878773483e-05, "loss": 0.5997, "num_input_tokens_seen": 19480304, "step": 33780 }, { "epoch": 5.032022639261245, "grad_norm": 1.21207594871521, "learning_rate": 2.90670166548754e-05, "loss": 0.5484, "num_input_tokens_seen": 19483248, "step": 33785 }, { "epoch": 5.032767351802204, "grad_norm": 1.1118985414505005, "learning_rate": 2.9060604247183525e-05, "loss": 0.4325, "num_input_tokens_seen": 19486192, "step": 33790 }, { "epoch": 5.033512064343164, "grad_norm": 1.0471349954605103, "learning_rate": 2.9054191565092524e-05, "loss": 0.5754, "num_input_tokens_seen": 19489072, "step": 33795 }, { "epoch": 5.034256776884122, "grad_norm": 2.485363483428955, "learning_rate": 2.9047778609035737e-05, "loss": 0.8457, "num_input_tokens_seen": 19492112, "step": 33800 }, { "epoch": 5.035001489425082, "grad_norm": 1.8079755306243896, "learning_rate": 2.9041365379446522e-05, "loss": 0.5238, "num_input_tokens_seen": 19495280, "step": 33805 }, { "epoch": 5.035746201966041, "grad_norm": 1.93517005443573, "learning_rate": 2.9034951876758276e-05, "loss": 0.4613, "num_input_tokens_seen": 19497872, "step": 33810 }, { "epoch": 5.0364909145070005, "grad_norm": 3.128187656402588, "learning_rate": 2.902853810140439e-05, "loss": 0.7612, "num_input_tokens_seen": 19500848, "step": 33815 }, { "epoch": 5.037235627047959, "grad_norm": 1.4600962400436401, "learning_rate": 2.9022124053818268e-05, "loss": 0.6055, "num_input_tokens_seen": 19503728, "step": 33820 }, { "epoch": 5.037980339588919, "grad_norm": 2.038456678390503, "learning_rate": 2.901570973443336e-05, "loss": 0.8146, "num_input_tokens_seen": 19506640, "step": 33825 }, { "epoch": 5.038725052129878, "grad_norm": 1.2977062463760376, "learning_rate": 2.9009295143683114e-05, "loss": 0.594, "num_input_tokens_seen": 19509456, "step": 33830 }, { "epoch": 5.039469764670837, "grad_norm": 3.3287765979766846, "learning_rate": 2.9002880282001004e-05, "loss": 0.5666, "num_input_tokens_seen": 19512400, "step": 33835 }, { "epoch": 5.040214477211796, "grad_norm": 1.6715176105499268, "learning_rate": 2.899646514982052e-05, "loss": 0.4529, "num_input_tokens_seen": 19515472, "step": 33840 }, { "epoch": 5.040959189752756, "grad_norm": 2.6316003799438477, "learning_rate": 2.8990049747575165e-05, "loss": 0.551, "num_input_tokens_seen": 19518256, "step": 33845 }, { "epoch": 5.0417039022937145, "grad_norm": 3.1223177909851074, "learning_rate": 2.8983634075698475e-05, "loss": 0.7271, "num_input_tokens_seen": 19520848, "step": 33850 }, { "epoch": 5.042448614834674, "grad_norm": 2.41766357421875, "learning_rate": 2.897721813462399e-05, "loss": 0.6802, "num_input_tokens_seen": 19523728, "step": 33855 }, { "epoch": 5.043193327375633, "grad_norm": 2.563312292098999, "learning_rate": 2.8970801924785273e-05, "loss": 0.784, "num_input_tokens_seen": 19526736, "step": 33860 }, { "epoch": 5.0439380399165925, "grad_norm": 4.266634464263916, "learning_rate": 2.8964385446615905e-05, "loss": 0.7776, "num_input_tokens_seen": 19529744, "step": 33865 }, { "epoch": 5.044682752457551, "grad_norm": 3.051175117492676, "learning_rate": 2.895796870054948e-05, "loss": 0.7391, "num_input_tokens_seen": 19532624, "step": 33870 }, { "epoch": 5.045427464998511, "grad_norm": 1.9472198486328125, "learning_rate": 2.895155168701964e-05, "loss": 0.486, "num_input_tokens_seen": 19535504, "step": 33875 }, { "epoch": 5.04617217753947, "grad_norm": 1.6412630081176758, "learning_rate": 2.894513440645999e-05, "loss": 0.6197, "num_input_tokens_seen": 19538320, "step": 33880 }, { "epoch": 5.046916890080429, "grad_norm": 1.9744839668273926, "learning_rate": 2.8938716859304193e-05, "loss": 0.5875, "num_input_tokens_seen": 19540912, "step": 33885 }, { "epoch": 5.047661602621388, "grad_norm": 2.1878228187561035, "learning_rate": 2.8932299045985932e-05, "loss": 0.7935, "num_input_tokens_seen": 19543664, "step": 33890 }, { "epoch": 5.048406315162348, "grad_norm": 1.0220707654953003, "learning_rate": 2.892588096693889e-05, "loss": 0.4963, "num_input_tokens_seen": 19546384, "step": 33895 }, { "epoch": 5.0491510277033065, "grad_norm": 3.1737823486328125, "learning_rate": 2.8919462622596764e-05, "loss": 0.7949, "num_input_tokens_seen": 19549264, "step": 33900 }, { "epoch": 5.049895740244266, "grad_norm": 4.2171735763549805, "learning_rate": 2.8913044013393305e-05, "loss": 0.5619, "num_input_tokens_seen": 19552176, "step": 33905 }, { "epoch": 5.050640452785225, "grad_norm": 1.9948711395263672, "learning_rate": 2.890662513976223e-05, "loss": 0.5822, "num_input_tokens_seen": 19555024, "step": 33910 }, { "epoch": 5.0513851653261845, "grad_norm": 2.0107674598693848, "learning_rate": 2.890020600213731e-05, "loss": 0.6477, "num_input_tokens_seen": 19557936, "step": 33915 }, { "epoch": 5.052129877867143, "grad_norm": 1.5535646677017212, "learning_rate": 2.889378660095233e-05, "loss": 0.6931, "num_input_tokens_seen": 19561040, "step": 33920 }, { "epoch": 5.052874590408102, "grad_norm": 2.0517830848693848, "learning_rate": 2.8887366936641082e-05, "loss": 0.5892, "num_input_tokens_seen": 19564208, "step": 33925 }, { "epoch": 5.053619302949062, "grad_norm": 1.8303676843643188, "learning_rate": 2.8880947009637377e-05, "loss": 0.6852, "num_input_tokens_seen": 19567184, "step": 33930 }, { "epoch": 5.0543640154900205, "grad_norm": 3.616574287414551, "learning_rate": 2.887452682037506e-05, "loss": 0.6786, "num_input_tokens_seen": 19569968, "step": 33935 }, { "epoch": 5.05510872803098, "grad_norm": 1.169492244720459, "learning_rate": 2.8868106369287966e-05, "loss": 0.5947, "num_input_tokens_seen": 19572912, "step": 33940 }, { "epoch": 5.055853440571939, "grad_norm": 1.751043438911438, "learning_rate": 2.886168565680997e-05, "loss": 0.4162, "num_input_tokens_seen": 19575856, "step": 33945 }, { "epoch": 5.0565981531128985, "grad_norm": 1.491036295890808, "learning_rate": 2.8855264683374956e-05, "loss": 0.6821, "num_input_tokens_seen": 19579216, "step": 33950 }, { "epoch": 5.057342865653857, "grad_norm": 1.682337760925293, "learning_rate": 2.884884344941684e-05, "loss": 0.5711, "num_input_tokens_seen": 19582096, "step": 33955 }, { "epoch": 5.058087578194817, "grad_norm": 1.872284173965454, "learning_rate": 2.8842421955369526e-05, "loss": 0.7585, "num_input_tokens_seen": 19585104, "step": 33960 }, { "epoch": 5.058832290735776, "grad_norm": 1.3499963283538818, "learning_rate": 2.883600020166695e-05, "loss": 0.5248, "num_input_tokens_seen": 19588144, "step": 33965 }, { "epoch": 5.059577003276735, "grad_norm": 2.112910509109497, "learning_rate": 2.8829578188743084e-05, "loss": 0.5675, "num_input_tokens_seen": 19590928, "step": 33970 }, { "epoch": 5.060321715817694, "grad_norm": 1.8478683233261108, "learning_rate": 2.88231559170319e-05, "loss": 0.783, "num_input_tokens_seen": 19593968, "step": 33975 }, { "epoch": 5.061066428358654, "grad_norm": 1.670101284980774, "learning_rate": 2.8816733386967376e-05, "loss": 0.6091, "num_input_tokens_seen": 19596880, "step": 33980 }, { "epoch": 5.0618111408996125, "grad_norm": 1.5525037050247192, "learning_rate": 2.8810310598983524e-05, "loss": 0.6459, "num_input_tokens_seen": 19599664, "step": 33985 }, { "epoch": 5.062555853440572, "grad_norm": 1.8165537118911743, "learning_rate": 2.880388755351438e-05, "loss": 0.5047, "num_input_tokens_seen": 19602384, "step": 33990 }, { "epoch": 5.063300565981531, "grad_norm": 1.33318293094635, "learning_rate": 2.8797464250993984e-05, "loss": 0.5414, "num_input_tokens_seen": 19605168, "step": 33995 }, { "epoch": 5.0640452785224905, "grad_norm": 2.465597152709961, "learning_rate": 2.8791040691856385e-05, "loss": 0.5074, "num_input_tokens_seen": 19608176, "step": 34000 }, { "epoch": 5.064789991063449, "grad_norm": 1.578065276145935, "learning_rate": 2.8784616876535673e-05, "loss": 0.6346, "num_input_tokens_seen": 19611472, "step": 34005 }, { "epoch": 5.065534703604409, "grad_norm": 1.6985551118850708, "learning_rate": 2.8778192805465937e-05, "loss": 0.3954, "num_input_tokens_seen": 19614384, "step": 34010 }, { "epoch": 5.066279416145368, "grad_norm": 3.846832513809204, "learning_rate": 2.8771768479081297e-05, "loss": 0.6253, "num_input_tokens_seen": 19617104, "step": 34015 }, { "epoch": 5.067024128686327, "grad_norm": 1.7969226837158203, "learning_rate": 2.8765343897815867e-05, "loss": 0.6055, "num_input_tokens_seen": 19619792, "step": 34020 }, { "epoch": 5.067768841227286, "grad_norm": 2.193390130996704, "learning_rate": 2.8758919062103817e-05, "loss": 0.71, "num_input_tokens_seen": 19622448, "step": 34025 }, { "epoch": 5.068513553768246, "grad_norm": 2.468301773071289, "learning_rate": 2.8752493972379292e-05, "loss": 0.5783, "num_input_tokens_seen": 19625200, "step": 34030 }, { "epoch": 5.0692582663092045, "grad_norm": 2.86098313331604, "learning_rate": 2.8746068629076487e-05, "loss": 0.6187, "num_input_tokens_seen": 19627920, "step": 34035 }, { "epoch": 5.070002978850164, "grad_norm": 1.9105132818222046, "learning_rate": 2.8739643032629592e-05, "loss": 0.6332, "num_input_tokens_seen": 19630928, "step": 34040 }, { "epoch": 5.070747691391123, "grad_norm": 1.6197082996368408, "learning_rate": 2.8733217183472823e-05, "loss": 0.4929, "num_input_tokens_seen": 19633744, "step": 34045 }, { "epoch": 5.071492403932083, "grad_norm": 1.3470247983932495, "learning_rate": 2.8726791082040416e-05, "loss": 0.6308, "num_input_tokens_seen": 19636816, "step": 34050 }, { "epoch": 5.072237116473041, "grad_norm": 3.171541929244995, "learning_rate": 2.8720364728766618e-05, "loss": 0.6201, "num_input_tokens_seen": 19639536, "step": 34055 }, { "epoch": 5.072981829014001, "grad_norm": 3.0633139610290527, "learning_rate": 2.8713938124085706e-05, "loss": 0.6833, "num_input_tokens_seen": 19642256, "step": 34060 }, { "epoch": 5.07372654155496, "grad_norm": 2.2667946815490723, "learning_rate": 2.8707511268431947e-05, "loss": 0.6383, "num_input_tokens_seen": 19645168, "step": 34065 }, { "epoch": 5.074471254095919, "grad_norm": 1.7239779233932495, "learning_rate": 2.8701084162239656e-05, "loss": 0.5211, "num_input_tokens_seen": 19647920, "step": 34070 }, { "epoch": 5.075215966636878, "grad_norm": 2.87412166595459, "learning_rate": 2.8694656805943143e-05, "loss": 0.6392, "num_input_tokens_seen": 19650480, "step": 34075 }, { "epoch": 5.075960679177838, "grad_norm": 1.60356605052948, "learning_rate": 2.868822919997674e-05, "loss": 0.5794, "num_input_tokens_seen": 19653488, "step": 34080 }, { "epoch": 5.0767053917187965, "grad_norm": 4.765802383422852, "learning_rate": 2.86818013447748e-05, "loss": 0.6903, "num_input_tokens_seen": 19656272, "step": 34085 }, { "epoch": 5.077450104259755, "grad_norm": 2.566145181655884, "learning_rate": 2.8675373240771703e-05, "loss": 0.7456, "num_input_tokens_seen": 19658960, "step": 34090 }, { "epoch": 5.078194816800715, "grad_norm": 1.6693552732467651, "learning_rate": 2.8668944888401826e-05, "loss": 0.4993, "num_input_tokens_seen": 19661680, "step": 34095 }, { "epoch": 5.078939529341674, "grad_norm": 1.546189785003662, "learning_rate": 2.866251628809956e-05, "loss": 0.7521, "num_input_tokens_seen": 19664528, "step": 34100 }, { "epoch": 5.079684241882633, "grad_norm": 2.214538335800171, "learning_rate": 2.8656087440299347e-05, "loss": 0.667, "num_input_tokens_seen": 19667344, "step": 34105 }, { "epoch": 5.080428954423592, "grad_norm": 2.0676045417785645, "learning_rate": 2.8649658345435597e-05, "loss": 0.5878, "num_input_tokens_seen": 19670224, "step": 34110 }, { "epoch": 5.081173666964552, "grad_norm": 1.609747290611267, "learning_rate": 2.8643229003942786e-05, "loss": 0.6436, "num_input_tokens_seen": 19673168, "step": 34115 }, { "epoch": 5.0819183795055105, "grad_norm": 2.42423415184021, "learning_rate": 2.8636799416255362e-05, "loss": 0.576, "num_input_tokens_seen": 19675856, "step": 34120 }, { "epoch": 5.08266309204647, "grad_norm": 1.0975285768508911, "learning_rate": 2.8630369582807824e-05, "loss": 0.6051, "num_input_tokens_seen": 19678512, "step": 34125 }, { "epoch": 5.083407804587429, "grad_norm": 1.544194221496582, "learning_rate": 2.8623939504034662e-05, "loss": 0.6639, "num_input_tokens_seen": 19681360, "step": 34130 }, { "epoch": 5.084152517128389, "grad_norm": 4.25990104675293, "learning_rate": 2.8617509180370418e-05, "loss": 0.8386, "num_input_tokens_seen": 19684272, "step": 34135 }, { "epoch": 5.084897229669347, "grad_norm": 1.6484522819519043, "learning_rate": 2.8611078612249598e-05, "loss": 0.5077, "num_input_tokens_seen": 19686928, "step": 34140 }, { "epoch": 5.085641942210307, "grad_norm": 1.3810359239578247, "learning_rate": 2.8604647800106772e-05, "loss": 0.6207, "num_input_tokens_seen": 19689776, "step": 34145 }, { "epoch": 5.086386654751266, "grad_norm": 2.9854302406311035, "learning_rate": 2.85982167443765e-05, "loss": 0.7102, "num_input_tokens_seen": 19692496, "step": 34150 }, { "epoch": 5.087131367292225, "grad_norm": 2.418691873550415, "learning_rate": 2.8591785445493376e-05, "loss": 0.5944, "num_input_tokens_seen": 19695472, "step": 34155 }, { "epoch": 5.087876079833184, "grad_norm": 1.501138687133789, "learning_rate": 2.8585353903891986e-05, "loss": 0.5741, "num_input_tokens_seen": 19698416, "step": 34160 }, { "epoch": 5.088620792374144, "grad_norm": 2.113389492034912, "learning_rate": 2.8578922120006962e-05, "loss": 0.6008, "num_input_tokens_seen": 19701424, "step": 34165 }, { "epoch": 5.0893655049151025, "grad_norm": 2.091580390930176, "learning_rate": 2.857249009427293e-05, "loss": 0.7422, "num_input_tokens_seen": 19704432, "step": 34170 }, { "epoch": 5.090110217456062, "grad_norm": 1.5044529438018799, "learning_rate": 2.856605782712455e-05, "loss": 0.6175, "num_input_tokens_seen": 19707216, "step": 34175 }, { "epoch": 5.090854929997021, "grad_norm": 2.7573463916778564, "learning_rate": 2.855962531899647e-05, "loss": 0.4721, "num_input_tokens_seen": 19710032, "step": 34180 }, { "epoch": 5.091599642537981, "grad_norm": 1.982077956199646, "learning_rate": 2.8553192570323385e-05, "loss": 0.6654, "num_input_tokens_seen": 19712912, "step": 34185 }, { "epoch": 5.092344355078939, "grad_norm": 1.8831586837768555, "learning_rate": 2.8546759581539994e-05, "loss": 0.6438, "num_input_tokens_seen": 19716176, "step": 34190 }, { "epoch": 5.093089067619899, "grad_norm": 1.2074981927871704, "learning_rate": 2.8540326353081005e-05, "loss": 0.6327, "num_input_tokens_seen": 19719120, "step": 34195 }, { "epoch": 5.093833780160858, "grad_norm": 1.8891533613204956, "learning_rate": 2.8533892885381164e-05, "loss": 0.6224, "num_input_tokens_seen": 19721904, "step": 34200 }, { "epoch": 5.094578492701817, "grad_norm": 1.3607001304626465, "learning_rate": 2.852745917887521e-05, "loss": 0.5399, "num_input_tokens_seen": 19724720, "step": 34205 }, { "epoch": 5.095323205242776, "grad_norm": 5.633553504943848, "learning_rate": 2.85210252339979e-05, "loss": 0.5856, "num_input_tokens_seen": 19727568, "step": 34210 }, { "epoch": 5.096067917783736, "grad_norm": 1.3514699935913086, "learning_rate": 2.851459105118402e-05, "loss": 0.6863, "num_input_tokens_seen": 19730704, "step": 34215 }, { "epoch": 5.096812630324695, "grad_norm": 1.4388741254806519, "learning_rate": 2.8508156630868373e-05, "loss": 0.7854, "num_input_tokens_seen": 19733584, "step": 34220 }, { "epoch": 5.097557342865654, "grad_norm": 1.363825798034668, "learning_rate": 2.8501721973485757e-05, "loss": 0.5414, "num_input_tokens_seen": 19736304, "step": 34225 }, { "epoch": 5.098302055406613, "grad_norm": 1.4693725109100342, "learning_rate": 2.8495287079471012e-05, "loss": 0.5706, "num_input_tokens_seen": 19739440, "step": 34230 }, { "epoch": 5.099046767947573, "grad_norm": 1.0707985162734985, "learning_rate": 2.8488851949258972e-05, "loss": 0.4551, "num_input_tokens_seen": 19742448, "step": 34235 }, { "epoch": 5.099791480488531, "grad_norm": 2.3178603649139404, "learning_rate": 2.848241658328451e-05, "loss": 0.5731, "num_input_tokens_seen": 19745200, "step": 34240 }, { "epoch": 5.100536193029491, "grad_norm": 0.9834654331207275, "learning_rate": 2.8475980981982485e-05, "loss": 0.6828, "num_input_tokens_seen": 19748528, "step": 34245 }, { "epoch": 5.10128090557045, "grad_norm": 2.3843231201171875, "learning_rate": 2.8469545145787796e-05, "loss": 0.7395, "num_input_tokens_seen": 19751248, "step": 34250 }, { "epoch": 5.102025618111409, "grad_norm": 1.4460099935531616, "learning_rate": 2.846310907513536e-05, "loss": 0.7301, "num_input_tokens_seen": 19753936, "step": 34255 }, { "epoch": 5.102770330652368, "grad_norm": 1.3627129793167114, "learning_rate": 2.845667277046009e-05, "loss": 0.5415, "num_input_tokens_seen": 19756880, "step": 34260 }, { "epoch": 5.103515043193327, "grad_norm": 1.161636233329773, "learning_rate": 2.8450236232196924e-05, "loss": 0.5018, "num_input_tokens_seen": 19759792, "step": 34265 }, { "epoch": 5.104259755734287, "grad_norm": 1.57429838180542, "learning_rate": 2.844379946078083e-05, "loss": 0.5835, "num_input_tokens_seen": 19762896, "step": 34270 }, { "epoch": 5.105004468275245, "grad_norm": 0.6168005466461182, "learning_rate": 2.843736245664676e-05, "loss": 0.7512, "num_input_tokens_seen": 19765776, "step": 34275 }, { "epoch": 5.105749180816205, "grad_norm": 1.1868159770965576, "learning_rate": 2.843092522022972e-05, "loss": 0.617, "num_input_tokens_seen": 19768592, "step": 34280 }, { "epoch": 5.106493893357164, "grad_norm": 2.6782612800598145, "learning_rate": 2.8424487751964696e-05, "loss": 0.7898, "num_input_tokens_seen": 19771760, "step": 34285 }, { "epoch": 5.107238605898123, "grad_norm": 2.570291042327881, "learning_rate": 2.8418050052286715e-05, "loss": 0.6767, "num_input_tokens_seen": 19774480, "step": 34290 }, { "epoch": 5.107983318439082, "grad_norm": 2.4417765140533447, "learning_rate": 2.8411612121630804e-05, "loss": 0.6348, "num_input_tokens_seen": 19777232, "step": 34295 }, { "epoch": 5.108728030980042, "grad_norm": 1.4459161758422852, "learning_rate": 2.8405173960432024e-05, "loss": 0.7523, "num_input_tokens_seen": 19779888, "step": 34300 }, { "epoch": 5.109472743521001, "grad_norm": 6.217994689941406, "learning_rate": 2.8398735569125427e-05, "loss": 0.4294, "num_input_tokens_seen": 19782736, "step": 34305 }, { "epoch": 5.11021745606196, "grad_norm": 2.1315698623657227, "learning_rate": 2.83922969481461e-05, "loss": 0.5543, "num_input_tokens_seen": 19785552, "step": 34310 }, { "epoch": 5.110962168602919, "grad_norm": 1.623782992362976, "learning_rate": 2.8385858097929135e-05, "loss": 0.4601, "num_input_tokens_seen": 19788336, "step": 34315 }, { "epoch": 5.111706881143879, "grad_norm": 2.791337251663208, "learning_rate": 2.8379419018909648e-05, "loss": 0.4887, "num_input_tokens_seen": 19791152, "step": 34320 }, { "epoch": 5.112451593684837, "grad_norm": 1.1207008361816406, "learning_rate": 2.8372979711522767e-05, "loss": 0.6129, "num_input_tokens_seen": 19793776, "step": 34325 }, { "epoch": 5.113196306225797, "grad_norm": 1.7874679565429688, "learning_rate": 2.8366540176203625e-05, "loss": 0.5175, "num_input_tokens_seen": 19796400, "step": 34330 }, { "epoch": 5.113941018766756, "grad_norm": 1.1556012630462646, "learning_rate": 2.8360100413387392e-05, "loss": 0.4937, "num_input_tokens_seen": 19799088, "step": 34335 }, { "epoch": 5.114685731307715, "grad_norm": 2.5944912433624268, "learning_rate": 2.8353660423509233e-05, "loss": 0.7938, "num_input_tokens_seen": 19801872, "step": 34340 }, { "epoch": 5.115430443848674, "grad_norm": 2.525876522064209, "learning_rate": 2.8347220207004325e-05, "loss": 0.7318, "num_input_tokens_seen": 19804848, "step": 34345 }, { "epoch": 5.116175156389634, "grad_norm": 2.6432135105133057, "learning_rate": 2.834077976430789e-05, "loss": 0.5131, "num_input_tokens_seen": 19807696, "step": 34350 }, { "epoch": 5.116919868930593, "grad_norm": 1.3074913024902344, "learning_rate": 2.8334339095855152e-05, "loss": 0.6027, "num_input_tokens_seen": 19810544, "step": 34355 }, { "epoch": 5.117664581471552, "grad_norm": 1.1242491006851196, "learning_rate": 2.8327898202081327e-05, "loss": 0.6598, "num_input_tokens_seen": 19813200, "step": 34360 }, { "epoch": 5.118409294012511, "grad_norm": 1.1290812492370605, "learning_rate": 2.8321457083421665e-05, "loss": 0.8214, "num_input_tokens_seen": 19815888, "step": 34365 }, { "epoch": 5.119154006553471, "grad_norm": 2.5272903442382812, "learning_rate": 2.831501574031145e-05, "loss": 0.5852, "num_input_tokens_seen": 19818768, "step": 34370 }, { "epoch": 5.119898719094429, "grad_norm": 0.8689136505126953, "learning_rate": 2.8308574173185943e-05, "loss": 0.546, "num_input_tokens_seen": 19822000, "step": 34375 }, { "epoch": 5.120643431635389, "grad_norm": 1.0025547742843628, "learning_rate": 2.8302132382480447e-05, "loss": 0.6593, "num_input_tokens_seen": 19824976, "step": 34380 }, { "epoch": 5.121388144176348, "grad_norm": 1.6502094268798828, "learning_rate": 2.8295690368630263e-05, "loss": 0.7231, "num_input_tokens_seen": 19827952, "step": 34385 }, { "epoch": 5.1221328567173074, "grad_norm": 3.2653980255126953, "learning_rate": 2.8289248132070727e-05, "loss": 0.7558, "num_input_tokens_seen": 19830704, "step": 34390 }, { "epoch": 5.122877569258266, "grad_norm": 1.7140297889709473, "learning_rate": 2.828280567323718e-05, "loss": 0.5998, "num_input_tokens_seen": 19833616, "step": 34395 }, { "epoch": 5.123622281799226, "grad_norm": 2.034179210662842, "learning_rate": 2.827636299256497e-05, "loss": 0.6559, "num_input_tokens_seen": 19836432, "step": 34400 }, { "epoch": 5.124366994340185, "grad_norm": 1.27163565158844, "learning_rate": 2.826992009048947e-05, "loss": 0.5399, "num_input_tokens_seen": 19838992, "step": 34405 }, { "epoch": 5.125111706881144, "grad_norm": 2.8797242641448975, "learning_rate": 2.8263476967446062e-05, "loss": 0.7266, "num_input_tokens_seen": 19841840, "step": 34410 }, { "epoch": 5.125856419422103, "grad_norm": 1.1464953422546387, "learning_rate": 2.8257033623870145e-05, "loss": 0.6747, "num_input_tokens_seen": 19844912, "step": 34415 }, { "epoch": 5.126601131963063, "grad_norm": 1.0403956174850464, "learning_rate": 2.825059006019715e-05, "loss": 0.4309, "num_input_tokens_seen": 19847824, "step": 34420 }, { "epoch": 5.127345844504021, "grad_norm": 1.1905752420425415, "learning_rate": 2.824414627686249e-05, "loss": 0.6189, "num_input_tokens_seen": 19850704, "step": 34425 }, { "epoch": 5.128090557044981, "grad_norm": 1.1722979545593262, "learning_rate": 2.8237702274301602e-05, "loss": 0.39, "num_input_tokens_seen": 19853424, "step": 34430 }, { "epoch": 5.12883526958594, "grad_norm": 1.6138849258422852, "learning_rate": 2.823125805294997e-05, "loss": 0.5486, "num_input_tokens_seen": 19856048, "step": 34435 }, { "epoch": 5.129579982126899, "grad_norm": 2.137636661529541, "learning_rate": 2.8224813613243062e-05, "loss": 0.606, "num_input_tokens_seen": 19859248, "step": 34440 }, { "epoch": 5.130324694667858, "grad_norm": 1.5765321254730225, "learning_rate": 2.8218368955616347e-05, "loss": 0.4927, "num_input_tokens_seen": 19862128, "step": 34445 }, { "epoch": 5.131069407208817, "grad_norm": 1.2079546451568604, "learning_rate": 2.8211924080505348e-05, "loss": 0.5279, "num_input_tokens_seen": 19864720, "step": 34450 }, { "epoch": 5.131814119749777, "grad_norm": 1.1502773761749268, "learning_rate": 2.8205478988345584e-05, "loss": 0.6784, "num_input_tokens_seen": 19867376, "step": 34455 }, { "epoch": 5.132558832290735, "grad_norm": 1.3042205572128296, "learning_rate": 2.8199033679572578e-05, "loss": 0.6527, "num_input_tokens_seen": 19870192, "step": 34460 }, { "epoch": 5.133303544831695, "grad_norm": 2.2160804271698, "learning_rate": 2.819258815462188e-05, "loss": 0.4948, "num_input_tokens_seen": 19872976, "step": 34465 }, { "epoch": 5.134048257372654, "grad_norm": 2.052116632461548, "learning_rate": 2.8186142413929063e-05, "loss": 0.6005, "num_input_tokens_seen": 19875664, "step": 34470 }, { "epoch": 5.1347929699136134, "grad_norm": 5.094419002532959, "learning_rate": 2.8179696457929684e-05, "loss": 0.5461, "num_input_tokens_seen": 19878448, "step": 34475 }, { "epoch": 5.135537682454572, "grad_norm": 1.54698646068573, "learning_rate": 2.8173250287059354e-05, "loss": 0.715, "num_input_tokens_seen": 19881296, "step": 34480 }, { "epoch": 5.136282394995532, "grad_norm": 2.0678961277008057, "learning_rate": 2.816680390175367e-05, "loss": 0.7529, "num_input_tokens_seen": 19883632, "step": 34485 }, { "epoch": 5.137027107536491, "grad_norm": 2.225280284881592, "learning_rate": 2.8160357302448253e-05, "loss": 0.5662, "num_input_tokens_seen": 19886640, "step": 34490 }, { "epoch": 5.13777182007745, "grad_norm": 1.7330704927444458, "learning_rate": 2.8153910489578734e-05, "loss": 0.6386, "num_input_tokens_seen": 19889232, "step": 34495 }, { "epoch": 5.138516532618409, "grad_norm": 4.102401256561279, "learning_rate": 2.8147463463580776e-05, "loss": 0.5992, "num_input_tokens_seen": 19892144, "step": 34500 }, { "epoch": 5.139261245159369, "grad_norm": 1.5170749425888062, "learning_rate": 2.8141016224890027e-05, "loss": 0.6914, "num_input_tokens_seen": 19895120, "step": 34505 }, { "epoch": 5.140005957700327, "grad_norm": 1.4039928913116455, "learning_rate": 2.8134568773942172e-05, "loss": 0.6416, "num_input_tokens_seen": 19897872, "step": 34510 }, { "epoch": 5.140750670241287, "grad_norm": 1.5034428834915161, "learning_rate": 2.81281211111729e-05, "loss": 0.6525, "num_input_tokens_seen": 19900784, "step": 34515 }, { "epoch": 5.141495382782246, "grad_norm": 1.8729288578033447, "learning_rate": 2.8121673237017937e-05, "loss": 0.5961, "num_input_tokens_seen": 19903440, "step": 34520 }, { "epoch": 5.1422400953232055, "grad_norm": 2.073474645614624, "learning_rate": 2.8115225151912977e-05, "loss": 0.8104, "num_input_tokens_seen": 19906544, "step": 34525 }, { "epoch": 5.142984807864164, "grad_norm": 2.406547784805298, "learning_rate": 2.810877685629376e-05, "loss": 0.7422, "num_input_tokens_seen": 19909232, "step": 34530 }, { "epoch": 5.143729520405124, "grad_norm": 0.8792576789855957, "learning_rate": 2.8102328350596058e-05, "loss": 0.6262, "num_input_tokens_seen": 19912304, "step": 34535 }, { "epoch": 5.144474232946083, "grad_norm": 1.4625697135925293, "learning_rate": 2.809587963525561e-05, "loss": 0.6883, "num_input_tokens_seen": 19915344, "step": 34540 }, { "epoch": 5.145218945487042, "grad_norm": 1.417840838432312, "learning_rate": 2.8089430710708203e-05, "loss": 0.6695, "num_input_tokens_seen": 19918256, "step": 34545 }, { "epoch": 5.145963658028001, "grad_norm": 2.2049081325531006, "learning_rate": 2.8082981577389627e-05, "loss": 0.4705, "num_input_tokens_seen": 19921008, "step": 34550 }, { "epoch": 5.146708370568961, "grad_norm": 4.3135247230529785, "learning_rate": 2.80765322357357e-05, "loss": 0.5875, "num_input_tokens_seen": 19924304, "step": 34555 }, { "epoch": 5.1474530831099194, "grad_norm": 2.4143176078796387, "learning_rate": 2.8070082686182232e-05, "loss": 0.5925, "num_input_tokens_seen": 19927216, "step": 34560 }, { "epoch": 5.148197795650879, "grad_norm": 1.4587780237197876, "learning_rate": 2.8063632929165047e-05, "loss": 0.6221, "num_input_tokens_seen": 19929872, "step": 34565 }, { "epoch": 5.148942508191838, "grad_norm": 1.1532230377197266, "learning_rate": 2.805718296512001e-05, "loss": 0.6496, "num_input_tokens_seen": 19932880, "step": 34570 }, { "epoch": 5.1496872207327975, "grad_norm": 1.2261918783187866, "learning_rate": 2.8050732794482976e-05, "loss": 0.6043, "num_input_tokens_seen": 19935920, "step": 34575 }, { "epoch": 5.150431933273756, "grad_norm": 2.9536666870117188, "learning_rate": 2.804428241768983e-05, "loss": 0.7981, "num_input_tokens_seen": 19938640, "step": 34580 }, { "epoch": 5.151176645814716, "grad_norm": 2.5156495571136475, "learning_rate": 2.8037831835176454e-05, "loss": 0.8352, "num_input_tokens_seen": 19941712, "step": 34585 }, { "epoch": 5.151921358355675, "grad_norm": 1.2439273595809937, "learning_rate": 2.8031381047378746e-05, "loss": 0.5928, "num_input_tokens_seen": 19944432, "step": 34590 }, { "epoch": 5.152666070896634, "grad_norm": 1.191808819770813, "learning_rate": 2.8024930054732635e-05, "loss": 0.616, "num_input_tokens_seen": 19947280, "step": 34595 }, { "epoch": 5.153410783437593, "grad_norm": 1.810601830482483, "learning_rate": 2.8018478857674052e-05, "loss": 0.7174, "num_input_tokens_seen": 19950256, "step": 34600 }, { "epoch": 5.154155495978552, "grad_norm": 1.9692994356155396, "learning_rate": 2.801202745663894e-05, "loss": 0.6067, "num_input_tokens_seen": 19953360, "step": 34605 }, { "epoch": 5.1549002085195115, "grad_norm": 0.7366834282875061, "learning_rate": 2.8005575852063252e-05, "loss": 0.4171, "num_input_tokens_seen": 19955984, "step": 34610 }, { "epoch": 5.15564492106047, "grad_norm": 2.0866734981536865, "learning_rate": 2.7999124044382975e-05, "loss": 0.4352, "num_input_tokens_seen": 19958896, "step": 34615 }, { "epoch": 5.15638963360143, "grad_norm": 2.698841094970703, "learning_rate": 2.7992672034034096e-05, "loss": 0.6875, "num_input_tokens_seen": 19961776, "step": 34620 }, { "epoch": 5.157134346142389, "grad_norm": 3.0248820781707764, "learning_rate": 2.798621982145259e-05, "loss": 0.7298, "num_input_tokens_seen": 19964688, "step": 34625 }, { "epoch": 5.157879058683348, "grad_norm": 1.4587204456329346, "learning_rate": 2.7979767407074496e-05, "loss": 0.4192, "num_input_tokens_seen": 19967472, "step": 34630 }, { "epoch": 5.158623771224307, "grad_norm": 1.5974560976028442, "learning_rate": 2.7973314791335842e-05, "loss": 0.6267, "num_input_tokens_seen": 19970384, "step": 34635 }, { "epoch": 5.159368483765267, "grad_norm": 3.995563507080078, "learning_rate": 2.796686197467266e-05, "loss": 0.5601, "num_input_tokens_seen": 19973648, "step": 34640 }, { "epoch": 5.1601131963062254, "grad_norm": 1.364956021308899, "learning_rate": 2.7960408957521005e-05, "loss": 0.4554, "num_input_tokens_seen": 19976304, "step": 34645 }, { "epoch": 5.160857908847185, "grad_norm": 1.6300026178359985, "learning_rate": 2.7953955740316944e-05, "loss": 0.5575, "num_input_tokens_seen": 19979248, "step": 34650 }, { "epoch": 5.161602621388144, "grad_norm": 2.827402114868164, "learning_rate": 2.794750232349658e-05, "loss": 0.6026, "num_input_tokens_seen": 19982160, "step": 34655 }, { "epoch": 5.1623473339291035, "grad_norm": 1.431645154953003, "learning_rate": 2.7941048707495982e-05, "loss": 0.5955, "num_input_tokens_seen": 19985296, "step": 34660 }, { "epoch": 5.163092046470062, "grad_norm": 3.883528709411621, "learning_rate": 2.793459489275127e-05, "loss": 0.7616, "num_input_tokens_seen": 19988112, "step": 34665 }, { "epoch": 5.163836759011022, "grad_norm": 1.7224026918411255, "learning_rate": 2.7928140879698567e-05, "loss": 0.5987, "num_input_tokens_seen": 19991152, "step": 34670 }, { "epoch": 5.164581471551981, "grad_norm": 2.7244672775268555, "learning_rate": 2.7921686668774005e-05, "loss": 0.4863, "num_input_tokens_seen": 19994192, "step": 34675 }, { "epoch": 5.16532618409294, "grad_norm": 1.4028555154800415, "learning_rate": 2.791523226041374e-05, "loss": 0.5219, "num_input_tokens_seen": 19996880, "step": 34680 }, { "epoch": 5.166070896633899, "grad_norm": 2.33604097366333, "learning_rate": 2.7908777655053936e-05, "loss": 0.6342, "num_input_tokens_seen": 19999760, "step": 34685 }, { "epoch": 5.166815609174859, "grad_norm": 3.1447250843048096, "learning_rate": 2.7902322853130757e-05, "loss": 0.5942, "num_input_tokens_seen": 20002704, "step": 34690 }, { "epoch": 5.1675603217158175, "grad_norm": 0.9355829358100891, "learning_rate": 2.7895867855080405e-05, "loss": 0.4183, "num_input_tokens_seen": 20005616, "step": 34695 }, { "epoch": 5.168305034256777, "grad_norm": 1.4543558359146118, "learning_rate": 2.7889412661339077e-05, "loss": 0.6839, "num_input_tokens_seen": 20008720, "step": 34700 }, { "epoch": 5.169049746797736, "grad_norm": 2.6567983627319336, "learning_rate": 2.7882957272342986e-05, "loss": 0.6294, "num_input_tokens_seen": 20011696, "step": 34705 }, { "epoch": 5.1697944593386955, "grad_norm": 1.3169339895248413, "learning_rate": 2.7876501688528362e-05, "loss": 0.5237, "num_input_tokens_seen": 20014736, "step": 34710 }, { "epoch": 5.170539171879654, "grad_norm": 0.8800103664398193, "learning_rate": 2.7870045910331444e-05, "loss": 0.502, "num_input_tokens_seen": 20017616, "step": 34715 }, { "epoch": 5.171283884420614, "grad_norm": 3.9496946334838867, "learning_rate": 2.786358993818851e-05, "loss": 0.7423, "num_input_tokens_seen": 20020624, "step": 34720 }, { "epoch": 5.172028596961573, "grad_norm": 2.8554797172546387, "learning_rate": 2.7857133772535798e-05, "loss": 0.7489, "num_input_tokens_seen": 20023376, "step": 34725 }, { "epoch": 5.172773309502532, "grad_norm": 1.834943413734436, "learning_rate": 2.78506774138096e-05, "loss": 0.6353, "num_input_tokens_seen": 20026064, "step": 34730 }, { "epoch": 5.173518022043491, "grad_norm": 1.5825093984603882, "learning_rate": 2.7844220862446218e-05, "loss": 0.6156, "num_input_tokens_seen": 20028720, "step": 34735 }, { "epoch": 5.174262734584451, "grad_norm": 1.9849870204925537, "learning_rate": 2.7837764118881953e-05, "loss": 0.5807, "num_input_tokens_seen": 20031632, "step": 34740 }, { "epoch": 5.1750074471254095, "grad_norm": 1.2211036682128906, "learning_rate": 2.7831307183553122e-05, "loss": 0.5896, "num_input_tokens_seen": 20034096, "step": 34745 }, { "epoch": 5.175752159666369, "grad_norm": 1.192523717880249, "learning_rate": 2.782485005689607e-05, "loss": 0.5804, "num_input_tokens_seen": 20036784, "step": 34750 }, { "epoch": 5.176496872207328, "grad_norm": 2.4656291007995605, "learning_rate": 2.7818392739347127e-05, "loss": 0.6648, "num_input_tokens_seen": 20039664, "step": 34755 }, { "epoch": 5.1772415847482876, "grad_norm": 1.8553470373153687, "learning_rate": 2.781193523134267e-05, "loss": 0.7896, "num_input_tokens_seen": 20042448, "step": 34760 }, { "epoch": 5.177986297289246, "grad_norm": 2.0869338512420654, "learning_rate": 2.780547753331906e-05, "loss": 0.7079, "num_input_tokens_seen": 20045392, "step": 34765 }, { "epoch": 5.178731009830206, "grad_norm": 1.6797153949737549, "learning_rate": 2.7799019645712682e-05, "loss": 0.51, "num_input_tokens_seen": 20048208, "step": 34770 }, { "epoch": 5.179475722371165, "grad_norm": 3.4584567546844482, "learning_rate": 2.7792561568959934e-05, "loss": 0.8628, "num_input_tokens_seen": 20051344, "step": 34775 }, { "epoch": 5.180220434912124, "grad_norm": 1.5816640853881836, "learning_rate": 2.778610330349723e-05, "loss": 0.5583, "num_input_tokens_seen": 20054128, "step": 34780 }, { "epoch": 5.180965147453083, "grad_norm": 2.388715982437134, "learning_rate": 2.7779644849761004e-05, "loss": 0.5468, "num_input_tokens_seen": 20056976, "step": 34785 }, { "epoch": 5.181709859994042, "grad_norm": 1.8610374927520752, "learning_rate": 2.777318620818767e-05, "loss": 0.6812, "num_input_tokens_seen": 20059824, "step": 34790 }, { "epoch": 5.1824545725350015, "grad_norm": 2.63999342918396, "learning_rate": 2.7766727379213686e-05, "loss": 0.5041, "num_input_tokens_seen": 20062512, "step": 34795 }, { "epoch": 5.18319928507596, "grad_norm": 2.2640860080718994, "learning_rate": 2.7760268363275523e-05, "loss": 0.7264, "num_input_tokens_seen": 20065424, "step": 34800 }, { "epoch": 5.18394399761692, "grad_norm": 1.48198664188385, "learning_rate": 2.7753809160809642e-05, "loss": 0.5574, "num_input_tokens_seen": 20068496, "step": 34805 }, { "epoch": 5.184688710157879, "grad_norm": 2.815467596054077, "learning_rate": 2.7747349772252523e-05, "loss": 0.6472, "num_input_tokens_seen": 20071376, "step": 34810 }, { "epoch": 5.185433422698838, "grad_norm": 1.7855278253555298, "learning_rate": 2.7740890198040687e-05, "loss": 0.6334, "num_input_tokens_seen": 20074288, "step": 34815 }, { "epoch": 5.186178135239797, "grad_norm": 1.9586100578308105, "learning_rate": 2.7734430438610637e-05, "loss": 0.6147, "num_input_tokens_seen": 20077008, "step": 34820 }, { "epoch": 5.186922847780757, "grad_norm": 2.2173333168029785, "learning_rate": 2.7727970494398892e-05, "loss": 0.5415, "num_input_tokens_seen": 20079664, "step": 34825 }, { "epoch": 5.1876675603217155, "grad_norm": 1.3124486207962036, "learning_rate": 2.772151036584199e-05, "loss": 0.5591, "num_input_tokens_seen": 20082448, "step": 34830 }, { "epoch": 5.188412272862675, "grad_norm": 3.0030195713043213, "learning_rate": 2.7715050053376484e-05, "loss": 0.5675, "num_input_tokens_seen": 20085136, "step": 34835 }, { "epoch": 5.189156985403634, "grad_norm": 1.484182596206665, "learning_rate": 2.7708589557438936e-05, "loss": 0.5736, "num_input_tokens_seen": 20088080, "step": 34840 }, { "epoch": 5.1899016979445936, "grad_norm": 2.4224886894226074, "learning_rate": 2.7702128878465917e-05, "loss": 0.6648, "num_input_tokens_seen": 20090704, "step": 34845 }, { "epoch": 5.190646410485552, "grad_norm": 1.8746726512908936, "learning_rate": 2.7695668016894017e-05, "loss": 0.585, "num_input_tokens_seen": 20093776, "step": 34850 }, { "epoch": 5.191391123026512, "grad_norm": 2.006521701812744, "learning_rate": 2.7689206973159825e-05, "loss": 0.367, "num_input_tokens_seen": 20096528, "step": 34855 }, { "epoch": 5.192135835567471, "grad_norm": 1.6561630964279175, "learning_rate": 2.7682745747699962e-05, "loss": 0.6081, "num_input_tokens_seen": 20099280, "step": 34860 }, { "epoch": 5.19288054810843, "grad_norm": 2.186763286590576, "learning_rate": 2.7676284340951054e-05, "loss": 0.6238, "num_input_tokens_seen": 20102064, "step": 34865 }, { "epoch": 5.193625260649389, "grad_norm": 1.6528730392456055, "learning_rate": 2.766982275334973e-05, "loss": 0.5914, "num_input_tokens_seen": 20105296, "step": 34870 }, { "epoch": 5.194369973190349, "grad_norm": 1.8697503805160522, "learning_rate": 2.7663360985332632e-05, "loss": 0.783, "num_input_tokens_seen": 20108144, "step": 34875 }, { "epoch": 5.1951146857313075, "grad_norm": 1.4747815132141113, "learning_rate": 2.7656899037336426e-05, "loss": 0.634, "num_input_tokens_seen": 20111088, "step": 34880 }, { "epoch": 5.195859398272267, "grad_norm": 1.6973415613174438, "learning_rate": 2.76504369097978e-05, "loss": 0.6668, "num_input_tokens_seen": 20114480, "step": 34885 }, { "epoch": 5.196604110813226, "grad_norm": 1.916993498802185, "learning_rate": 2.7643974603153412e-05, "loss": 0.7289, "num_input_tokens_seen": 20117264, "step": 34890 }, { "epoch": 5.197348823354186, "grad_norm": 1.8287229537963867, "learning_rate": 2.763751211783997e-05, "loss": 0.4904, "num_input_tokens_seen": 20120144, "step": 34895 }, { "epoch": 5.198093535895144, "grad_norm": 2.7291998863220215, "learning_rate": 2.7631049454294182e-05, "loss": 0.4602, "num_input_tokens_seen": 20122896, "step": 34900 }, { "epoch": 5.198838248436104, "grad_norm": 1.6532647609710693, "learning_rate": 2.7624586612952775e-05, "loss": 0.676, "num_input_tokens_seen": 20125968, "step": 34905 }, { "epoch": 5.199582960977063, "grad_norm": 3.115067958831787, "learning_rate": 2.761812359425247e-05, "loss": 0.5773, "num_input_tokens_seen": 20128528, "step": 34910 }, { "epoch": 5.200327673518022, "grad_norm": 2.245262622833252, "learning_rate": 2.7611660398630025e-05, "loss": 0.6327, "num_input_tokens_seen": 20131632, "step": 34915 }, { "epoch": 5.201072386058981, "grad_norm": 2.9992902278900146, "learning_rate": 2.7605197026522177e-05, "loss": 0.5661, "num_input_tokens_seen": 20134576, "step": 34920 }, { "epoch": 5.201817098599941, "grad_norm": 2.217095136642456, "learning_rate": 2.7598733478365713e-05, "loss": 0.5583, "num_input_tokens_seen": 20137584, "step": 34925 }, { "epoch": 5.2025618111408996, "grad_norm": 3.224083662033081, "learning_rate": 2.75922697545974e-05, "loss": 0.5934, "num_input_tokens_seen": 20140368, "step": 34930 }, { "epoch": 5.203306523681859, "grad_norm": 2.6706111431121826, "learning_rate": 2.7585805855654045e-05, "loss": 0.5329, "num_input_tokens_seen": 20143152, "step": 34935 }, { "epoch": 5.204051236222818, "grad_norm": 1.8492575883865356, "learning_rate": 2.7579341781972436e-05, "loss": 0.5417, "num_input_tokens_seen": 20145616, "step": 34940 }, { "epoch": 5.204795948763778, "grad_norm": 1.552211880683899, "learning_rate": 2.75728775339894e-05, "loss": 0.5703, "num_input_tokens_seen": 20148432, "step": 34945 }, { "epoch": 5.205540661304736, "grad_norm": 3.5655677318573, "learning_rate": 2.7566413112141765e-05, "loss": 0.5818, "num_input_tokens_seen": 20151152, "step": 34950 }, { "epoch": 5.206285373845695, "grad_norm": 1.241966962814331, "learning_rate": 2.7559948516866357e-05, "loss": 0.6117, "num_input_tokens_seen": 20153808, "step": 34955 }, { "epoch": 5.207030086386655, "grad_norm": 0.830200731754303, "learning_rate": 2.755348374860004e-05, "loss": 0.4815, "num_input_tokens_seen": 20157136, "step": 34960 }, { "epoch": 5.2077747989276135, "grad_norm": 2.4208531379699707, "learning_rate": 2.7547018807779673e-05, "loss": 0.6006, "num_input_tokens_seen": 20160144, "step": 34965 }, { "epoch": 5.208519511468573, "grad_norm": 1.1682647466659546, "learning_rate": 2.7540553694842136e-05, "loss": 0.5565, "num_input_tokens_seen": 20162896, "step": 34970 }, { "epoch": 5.209264224009532, "grad_norm": 3.1008100509643555, "learning_rate": 2.7534088410224302e-05, "loss": 0.7128, "num_input_tokens_seen": 20166320, "step": 34975 }, { "epoch": 5.210008936550492, "grad_norm": 1.7583184242248535, "learning_rate": 2.7527622954363074e-05, "loss": 0.5567, "num_input_tokens_seen": 20169168, "step": 34980 }, { "epoch": 5.21075364909145, "grad_norm": 1.3275879621505737, "learning_rate": 2.752115732769538e-05, "loss": 0.5998, "num_input_tokens_seen": 20171984, "step": 34985 }, { "epoch": 5.21149836163241, "grad_norm": 1.7343828678131104, "learning_rate": 2.7514691530658103e-05, "loss": 0.595, "num_input_tokens_seen": 20174800, "step": 34990 }, { "epoch": 5.212243074173369, "grad_norm": 2.2716782093048096, "learning_rate": 2.7508225563688206e-05, "loss": 0.6897, "num_input_tokens_seen": 20177808, "step": 34995 }, { "epoch": 5.212987786714328, "grad_norm": 2.8443238735198975, "learning_rate": 2.750175942722262e-05, "loss": 0.5833, "num_input_tokens_seen": 20180848, "step": 35000 }, { "epoch": 5.213732499255287, "grad_norm": 1.1609628200531006, "learning_rate": 2.749529312169831e-05, "loss": 0.6971, "num_input_tokens_seen": 20183792, "step": 35005 }, { "epoch": 5.214477211796247, "grad_norm": 2.0408754348754883, "learning_rate": 2.748882664755223e-05, "loss": 0.7659, "num_input_tokens_seen": 20186448, "step": 35010 }, { "epoch": 5.2152219243372056, "grad_norm": 1.2516180276870728, "learning_rate": 2.748236000522137e-05, "loss": 0.6266, "num_input_tokens_seen": 20189200, "step": 35015 }, { "epoch": 5.215966636878165, "grad_norm": 1.2691913843154907, "learning_rate": 2.7475893195142706e-05, "loss": 0.6467, "num_input_tokens_seen": 20192080, "step": 35020 }, { "epoch": 5.216711349419124, "grad_norm": 1.8070852756500244, "learning_rate": 2.746942621775325e-05, "loss": 0.5414, "num_input_tokens_seen": 20195024, "step": 35025 }, { "epoch": 5.217456061960084, "grad_norm": 2.3261592388153076, "learning_rate": 2.7462959073490007e-05, "loss": 0.6628, "num_input_tokens_seen": 20198160, "step": 35030 }, { "epoch": 5.218200774501042, "grad_norm": 3.9508063793182373, "learning_rate": 2.745649176279001e-05, "loss": 0.8527, "num_input_tokens_seen": 20201296, "step": 35035 }, { "epoch": 5.218945487042002, "grad_norm": 1.1983051300048828, "learning_rate": 2.7450024286090283e-05, "loss": 0.6904, "num_input_tokens_seen": 20204176, "step": 35040 }, { "epoch": 5.219690199582961, "grad_norm": 1.6091686487197876, "learning_rate": 2.7443556643827872e-05, "loss": 0.5637, "num_input_tokens_seen": 20207024, "step": 35045 }, { "epoch": 5.22043491212392, "grad_norm": 1.5224592685699463, "learning_rate": 2.7437088836439844e-05, "loss": 0.6997, "num_input_tokens_seen": 20209968, "step": 35050 }, { "epoch": 5.221179624664879, "grad_norm": 1.5386027097702026, "learning_rate": 2.7430620864363254e-05, "loss": 0.6449, "num_input_tokens_seen": 20212944, "step": 35055 }, { "epoch": 5.221924337205839, "grad_norm": 2.337195634841919, "learning_rate": 2.7424152728035192e-05, "loss": 0.6079, "num_input_tokens_seen": 20215760, "step": 35060 }, { "epoch": 5.222669049746798, "grad_norm": 2.764935255050659, "learning_rate": 2.7417684427892747e-05, "loss": 0.5246, "num_input_tokens_seen": 20218512, "step": 35065 }, { "epoch": 5.223413762287757, "grad_norm": 0.6359666585922241, "learning_rate": 2.741121596437302e-05, "loss": 0.5178, "num_input_tokens_seen": 20221360, "step": 35070 }, { "epoch": 5.224158474828716, "grad_norm": 1.367148756980896, "learning_rate": 2.7404747337913116e-05, "loss": 0.6225, "num_input_tokens_seen": 20224528, "step": 35075 }, { "epoch": 5.224903187369676, "grad_norm": 1.858847737312317, "learning_rate": 2.739827854895017e-05, "loss": 0.7111, "num_input_tokens_seen": 20227472, "step": 35080 }, { "epoch": 5.225647899910634, "grad_norm": 1.915300726890564, "learning_rate": 2.73918095979213e-05, "loss": 0.8454, "num_input_tokens_seen": 20230320, "step": 35085 }, { "epoch": 5.226392612451594, "grad_norm": 1.2331217527389526, "learning_rate": 2.7385340485263667e-05, "loss": 0.6644, "num_input_tokens_seen": 20233616, "step": 35090 }, { "epoch": 5.227137324992553, "grad_norm": 4.2037672996521, "learning_rate": 2.737887121141442e-05, "loss": 0.5522, "num_input_tokens_seen": 20236432, "step": 35095 }, { "epoch": 5.227882037533512, "grad_norm": 1.6918625831604004, "learning_rate": 2.7372401776810736e-05, "loss": 0.6882, "num_input_tokens_seen": 20239440, "step": 35100 }, { "epoch": 5.228626750074471, "grad_norm": 1.8164925575256348, "learning_rate": 2.736593218188978e-05, "loss": 0.575, "num_input_tokens_seen": 20242512, "step": 35105 }, { "epoch": 5.229371462615431, "grad_norm": 2.3410987854003906, "learning_rate": 2.7359462427088744e-05, "loss": 0.6907, "num_input_tokens_seen": 20245680, "step": 35110 }, { "epoch": 5.23011617515639, "grad_norm": 1.6084671020507812, "learning_rate": 2.7352992512844838e-05, "loss": 0.545, "num_input_tokens_seen": 20248560, "step": 35115 }, { "epoch": 5.230860887697349, "grad_norm": 1.9597152471542358, "learning_rate": 2.7346522439595256e-05, "loss": 0.5894, "num_input_tokens_seen": 20251504, "step": 35120 }, { "epoch": 5.231605600238308, "grad_norm": 2.3045265674591064, "learning_rate": 2.7340052207777234e-05, "loss": 0.5438, "num_input_tokens_seen": 20254128, "step": 35125 }, { "epoch": 5.232350312779268, "grad_norm": 1.643950343132019, "learning_rate": 2.733358181782799e-05, "loss": 0.6742, "num_input_tokens_seen": 20256976, "step": 35130 }, { "epoch": 5.233095025320226, "grad_norm": 1.6950663328170776, "learning_rate": 2.732711127018478e-05, "loss": 0.5807, "num_input_tokens_seen": 20259760, "step": 35135 }, { "epoch": 5.233839737861185, "grad_norm": 2.69321346282959, "learning_rate": 2.732064056528485e-05, "loss": 0.6981, "num_input_tokens_seen": 20262864, "step": 35140 }, { "epoch": 5.234584450402145, "grad_norm": 1.3979560136795044, "learning_rate": 2.7314169703565467e-05, "loss": 0.5461, "num_input_tokens_seen": 20265904, "step": 35145 }, { "epoch": 5.235329162943104, "grad_norm": 1.8314646482467651, "learning_rate": 2.7307698685463907e-05, "loss": 0.5057, "num_input_tokens_seen": 20268976, "step": 35150 }, { "epoch": 5.236073875484063, "grad_norm": 1.9726473093032837, "learning_rate": 2.730122751141745e-05, "loss": 0.6636, "num_input_tokens_seen": 20271856, "step": 35155 }, { "epoch": 5.236818588025022, "grad_norm": 1.5545541048049927, "learning_rate": 2.729475618186339e-05, "loss": 0.6682, "num_input_tokens_seen": 20274832, "step": 35160 }, { "epoch": 5.237563300565982, "grad_norm": 1.6088000535964966, "learning_rate": 2.728828469723904e-05, "loss": 0.5803, "num_input_tokens_seen": 20277424, "step": 35165 }, { "epoch": 5.23830801310694, "grad_norm": 0.9920854568481445, "learning_rate": 2.7281813057981715e-05, "loss": 0.6032, "num_input_tokens_seen": 20280528, "step": 35170 }, { "epoch": 5.2390527256479, "grad_norm": 1.6452831029891968, "learning_rate": 2.7275341264528737e-05, "loss": 0.672, "num_input_tokens_seen": 20283344, "step": 35175 }, { "epoch": 5.239797438188859, "grad_norm": 1.566184163093567, "learning_rate": 2.7268869317317452e-05, "loss": 0.5674, "num_input_tokens_seen": 20286032, "step": 35180 }, { "epoch": 5.240542150729818, "grad_norm": 0.7266432046890259, "learning_rate": 2.72623972167852e-05, "loss": 0.5081, "num_input_tokens_seen": 20289040, "step": 35185 }, { "epoch": 5.241286863270777, "grad_norm": 1.931782841682434, "learning_rate": 2.7255924963369345e-05, "loss": 0.5196, "num_input_tokens_seen": 20292016, "step": 35190 }, { "epoch": 5.242031575811737, "grad_norm": 2.7715585231781006, "learning_rate": 2.7249452557507243e-05, "loss": 0.5551, "num_input_tokens_seen": 20294864, "step": 35195 }, { "epoch": 5.242776288352696, "grad_norm": 1.2278212308883667, "learning_rate": 2.7242979999636296e-05, "loss": 0.6629, "num_input_tokens_seen": 20297808, "step": 35200 }, { "epoch": 5.243521000893655, "grad_norm": 2.0506584644317627, "learning_rate": 2.7236507290193876e-05, "loss": 0.5236, "num_input_tokens_seen": 20300752, "step": 35205 }, { "epoch": 5.244265713434614, "grad_norm": 1.801440715789795, "learning_rate": 2.7230034429617386e-05, "loss": 0.5217, "num_input_tokens_seen": 20303696, "step": 35210 }, { "epoch": 5.245010425975574, "grad_norm": 1.2379144430160522, "learning_rate": 2.7223561418344234e-05, "loss": 0.4805, "num_input_tokens_seen": 20306320, "step": 35215 }, { "epoch": 5.245755138516532, "grad_norm": 1.8084990978240967, "learning_rate": 2.721708825681184e-05, "loss": 0.6993, "num_input_tokens_seen": 20309360, "step": 35220 }, { "epoch": 5.246499851057492, "grad_norm": 1.3926037549972534, "learning_rate": 2.7210614945457642e-05, "loss": 0.6311, "num_input_tokens_seen": 20312176, "step": 35225 }, { "epoch": 5.247244563598451, "grad_norm": 3.9438302516937256, "learning_rate": 2.7204141484719064e-05, "loss": 0.6815, "num_input_tokens_seen": 20314960, "step": 35230 }, { "epoch": 5.2479892761394105, "grad_norm": 3.254084825515747, "learning_rate": 2.719766787503357e-05, "loss": 0.6634, "num_input_tokens_seen": 20317808, "step": 35235 }, { "epoch": 5.248733988680369, "grad_norm": 1.8422483205795288, "learning_rate": 2.719119411683862e-05, "loss": 0.5173, "num_input_tokens_seen": 20320624, "step": 35240 }, { "epoch": 5.249478701221329, "grad_norm": 1.8551363945007324, "learning_rate": 2.7184720210571678e-05, "loss": 0.5116, "num_input_tokens_seen": 20323728, "step": 35245 }, { "epoch": 5.250223413762288, "grad_norm": 4.463172435760498, "learning_rate": 2.717824615667023e-05, "loss": 0.6882, "num_input_tokens_seen": 20326768, "step": 35250 }, { "epoch": 5.250968126303247, "grad_norm": 2.404680013656616, "learning_rate": 2.7171771955571756e-05, "loss": 0.6211, "num_input_tokens_seen": 20329872, "step": 35255 }, { "epoch": 5.251712838844206, "grad_norm": 1.6812360286712646, "learning_rate": 2.7165297607713763e-05, "loss": 0.5221, "num_input_tokens_seen": 20332720, "step": 35260 }, { "epoch": 5.252457551385166, "grad_norm": 1.5320372581481934, "learning_rate": 2.715882311353377e-05, "loss": 0.8271, "num_input_tokens_seen": 20335632, "step": 35265 }, { "epoch": 5.253202263926124, "grad_norm": 2.2625091075897217, "learning_rate": 2.7152348473469285e-05, "loss": 0.6257, "num_input_tokens_seen": 20338576, "step": 35270 }, { "epoch": 5.253946976467084, "grad_norm": 1.3802858591079712, "learning_rate": 2.7145873687957835e-05, "loss": 0.7563, "num_input_tokens_seen": 20342256, "step": 35275 }, { "epoch": 5.254691689008043, "grad_norm": 2.0558555126190186, "learning_rate": 2.7139398757436968e-05, "loss": 0.7904, "num_input_tokens_seen": 20345264, "step": 35280 }, { "epoch": 5.2554364015490025, "grad_norm": 2.5437259674072266, "learning_rate": 2.7132923682344235e-05, "loss": 0.6645, "num_input_tokens_seen": 20348336, "step": 35285 }, { "epoch": 5.256181114089961, "grad_norm": 2.7674331665039062, "learning_rate": 2.7126448463117188e-05, "loss": 0.6238, "num_input_tokens_seen": 20351216, "step": 35290 }, { "epoch": 5.256925826630921, "grad_norm": 2.1297876834869385, "learning_rate": 2.7119973100193397e-05, "loss": 0.5619, "num_input_tokens_seen": 20354032, "step": 35295 }, { "epoch": 5.25767053917188, "grad_norm": 2.3247947692871094, "learning_rate": 2.7113497594010452e-05, "loss": 0.6643, "num_input_tokens_seen": 20356752, "step": 35300 }, { "epoch": 5.258415251712838, "grad_norm": 1.404131293296814, "learning_rate": 2.710702194500593e-05, "loss": 0.5045, "num_input_tokens_seen": 20359696, "step": 35305 }, { "epoch": 5.259159964253798, "grad_norm": 1.4594295024871826, "learning_rate": 2.7100546153617423e-05, "loss": 0.6442, "num_input_tokens_seen": 20362512, "step": 35310 }, { "epoch": 5.259904676794757, "grad_norm": 1.5029019117355347, "learning_rate": 2.7094070220282553e-05, "loss": 0.5178, "num_input_tokens_seen": 20365200, "step": 35315 }, { "epoch": 5.2606493893357165, "grad_norm": 1.8283061981201172, "learning_rate": 2.7087594145438926e-05, "loss": 0.6175, "num_input_tokens_seen": 20368016, "step": 35320 }, { "epoch": 5.261394101876675, "grad_norm": 1.759640097618103, "learning_rate": 2.7081117929524185e-05, "loss": 0.5616, "num_input_tokens_seen": 20370928, "step": 35325 }, { "epoch": 5.262138814417635, "grad_norm": 2.150932550430298, "learning_rate": 2.7074641572975944e-05, "loss": 0.7322, "num_input_tokens_seen": 20374160, "step": 35330 }, { "epoch": 5.262883526958594, "grad_norm": 1.8669188022613525, "learning_rate": 2.7068165076231865e-05, "loss": 0.5775, "num_input_tokens_seen": 20377200, "step": 35335 }, { "epoch": 5.263628239499553, "grad_norm": 1.761237621307373, "learning_rate": 2.7061688439729598e-05, "loss": 0.5677, "num_input_tokens_seen": 20380112, "step": 35340 }, { "epoch": 5.264372952040512, "grad_norm": 2.4758853912353516, "learning_rate": 2.7055211663906814e-05, "loss": 0.665, "num_input_tokens_seen": 20383056, "step": 35345 }, { "epoch": 5.265117664581472, "grad_norm": 1.2339781522750854, "learning_rate": 2.704873474920118e-05, "loss": 0.5102, "num_input_tokens_seen": 20386032, "step": 35350 }, { "epoch": 5.26586237712243, "grad_norm": 1.366734504699707, "learning_rate": 2.7042257696050377e-05, "loss": 0.6485, "num_input_tokens_seen": 20389040, "step": 35355 }, { "epoch": 5.26660708966339, "grad_norm": 2.9713780879974365, "learning_rate": 2.70357805048921e-05, "loss": 0.5262, "num_input_tokens_seen": 20392016, "step": 35360 }, { "epoch": 5.267351802204349, "grad_norm": 1.4758256673812866, "learning_rate": 2.7029303176164066e-05, "loss": 0.5405, "num_input_tokens_seen": 20394960, "step": 35365 }, { "epoch": 5.2680965147453085, "grad_norm": 2.8521156311035156, "learning_rate": 2.702282571030396e-05, "loss": 0.5284, "num_input_tokens_seen": 20397712, "step": 35370 }, { "epoch": 5.268841227286267, "grad_norm": 1.209241271018982, "learning_rate": 2.7016348107749522e-05, "loss": 0.4904, "num_input_tokens_seen": 20400528, "step": 35375 }, { "epoch": 5.269585939827227, "grad_norm": 0.8943517208099365, "learning_rate": 2.7009870368938477e-05, "loss": 0.601, "num_input_tokens_seen": 20403120, "step": 35380 }, { "epoch": 5.270330652368186, "grad_norm": 1.5704295635223389, "learning_rate": 2.700339249430856e-05, "loss": 0.6199, "num_input_tokens_seen": 20406224, "step": 35385 }, { "epoch": 5.271075364909145, "grad_norm": 1.4982292652130127, "learning_rate": 2.6996914484297532e-05, "loss": 0.6098, "num_input_tokens_seen": 20408944, "step": 35390 }, { "epoch": 5.271820077450104, "grad_norm": 1.0202302932739258, "learning_rate": 2.6990436339343134e-05, "loss": 0.5424, "num_input_tokens_seen": 20411888, "step": 35395 }, { "epoch": 5.272564789991064, "grad_norm": 2.2920403480529785, "learning_rate": 2.6983958059883145e-05, "loss": 0.6345, "num_input_tokens_seen": 20414672, "step": 35400 }, { "epoch": 5.2733095025320225, "grad_norm": 1.5725531578063965, "learning_rate": 2.697747964635533e-05, "loss": 0.7741, "num_input_tokens_seen": 20417616, "step": 35405 }, { "epoch": 5.274054215072982, "grad_norm": 3.087136745452881, "learning_rate": 2.697100109919749e-05, "loss": 0.7197, "num_input_tokens_seen": 20420368, "step": 35410 }, { "epoch": 5.274798927613941, "grad_norm": 1.2258347272872925, "learning_rate": 2.696452241884741e-05, "loss": 0.5326, "num_input_tokens_seen": 20423440, "step": 35415 }, { "epoch": 5.2755436401549005, "grad_norm": 2.2312819957733154, "learning_rate": 2.6958043605742882e-05, "loss": 0.4858, "num_input_tokens_seen": 20425968, "step": 35420 }, { "epoch": 5.276288352695859, "grad_norm": 0.99562007188797, "learning_rate": 2.695156466032173e-05, "loss": 0.5214, "num_input_tokens_seen": 20428816, "step": 35425 }, { "epoch": 5.277033065236819, "grad_norm": 1.2124598026275635, "learning_rate": 2.6945085583021783e-05, "loss": 0.5209, "num_input_tokens_seen": 20431536, "step": 35430 }, { "epoch": 5.277777777777778, "grad_norm": 1.722733974456787, "learning_rate": 2.693860637428085e-05, "loss": 0.6195, "num_input_tokens_seen": 20434704, "step": 35435 }, { "epoch": 5.278522490318737, "grad_norm": 1.5737462043762207, "learning_rate": 2.693212703453678e-05, "loss": 0.6296, "num_input_tokens_seen": 20437488, "step": 35440 }, { "epoch": 5.279267202859696, "grad_norm": 1.5975350141525269, "learning_rate": 2.692564756422743e-05, "loss": 0.7884, "num_input_tokens_seen": 20440240, "step": 35445 }, { "epoch": 5.280011915400656, "grad_norm": 3.294806718826294, "learning_rate": 2.6919167963790636e-05, "loss": 0.4742, "num_input_tokens_seen": 20443088, "step": 35450 }, { "epoch": 5.2807566279416145, "grad_norm": 3.297919750213623, "learning_rate": 2.691268823366428e-05, "loss": 0.5671, "num_input_tokens_seen": 20445872, "step": 35455 }, { "epoch": 5.281501340482574, "grad_norm": 2.077633857727051, "learning_rate": 2.6906208374286223e-05, "loss": 0.5492, "num_input_tokens_seen": 20449008, "step": 35460 }, { "epoch": 5.282246053023533, "grad_norm": 0.9392733573913574, "learning_rate": 2.6899728386094364e-05, "loss": 0.4917, "num_input_tokens_seen": 20452176, "step": 35465 }, { "epoch": 5.282990765564492, "grad_norm": 2.3078525066375732, "learning_rate": 2.6893248269526578e-05, "loss": 0.5435, "num_input_tokens_seen": 20455056, "step": 35470 }, { "epoch": 5.283735478105451, "grad_norm": 4.499669551849365, "learning_rate": 2.688676802502077e-05, "loss": 0.6449, "num_input_tokens_seen": 20458000, "step": 35475 }, { "epoch": 5.284480190646411, "grad_norm": 1.1365762948989868, "learning_rate": 2.688028765301486e-05, "loss": 0.5547, "num_input_tokens_seen": 20460912, "step": 35480 }, { "epoch": 5.28522490318737, "grad_norm": 3.574641704559326, "learning_rate": 2.687380715394674e-05, "loss": 0.5298, "num_input_tokens_seen": 20463536, "step": 35485 }, { "epoch": 5.2859696157283285, "grad_norm": 3.709606647491455, "learning_rate": 2.686732652825436e-05, "loss": 0.7668, "num_input_tokens_seen": 20466512, "step": 35490 }, { "epoch": 5.286714328269288, "grad_norm": 2.487454891204834, "learning_rate": 2.6860845776375643e-05, "loss": 0.6666, "num_input_tokens_seen": 20469488, "step": 35495 }, { "epoch": 5.287459040810247, "grad_norm": 2.114506721496582, "learning_rate": 2.6854364898748537e-05, "loss": 0.564, "num_input_tokens_seen": 20472176, "step": 35500 }, { "epoch": 5.2882037533512065, "grad_norm": 2.072256088256836, "learning_rate": 2.6847883895810984e-05, "loss": 0.4588, "num_input_tokens_seen": 20475184, "step": 35505 }, { "epoch": 5.288948465892165, "grad_norm": 2.1372568607330322, "learning_rate": 2.6841402768000957e-05, "loss": 0.6007, "num_input_tokens_seen": 20478320, "step": 35510 }, { "epoch": 5.289693178433125, "grad_norm": 1.7310707569122314, "learning_rate": 2.6834921515756417e-05, "loss": 0.5253, "num_input_tokens_seen": 20480976, "step": 35515 }, { "epoch": 5.290437890974084, "grad_norm": 2.154392719268799, "learning_rate": 2.6828440139515337e-05, "loss": 0.7083, "num_input_tokens_seen": 20483856, "step": 35520 }, { "epoch": 5.291182603515043, "grad_norm": 0.9721022248268127, "learning_rate": 2.6821958639715704e-05, "loss": 0.4923, "num_input_tokens_seen": 20486992, "step": 35525 }, { "epoch": 5.291927316056002, "grad_norm": 2.5479934215545654, "learning_rate": 2.6815477016795526e-05, "loss": 0.4402, "num_input_tokens_seen": 20489712, "step": 35530 }, { "epoch": 5.292672028596962, "grad_norm": 1.3761016130447388, "learning_rate": 2.6808995271192784e-05, "loss": 0.6922, "num_input_tokens_seen": 20492432, "step": 35535 }, { "epoch": 5.2934167411379205, "grad_norm": 1.8534172773361206, "learning_rate": 2.680251340334549e-05, "loss": 0.8522, "num_input_tokens_seen": 20495504, "step": 35540 }, { "epoch": 5.29416145367888, "grad_norm": 1.8415367603302002, "learning_rate": 2.679603141369168e-05, "loss": 0.425, "num_input_tokens_seen": 20498512, "step": 35545 }, { "epoch": 5.294906166219839, "grad_norm": 1.5951215028762817, "learning_rate": 2.678954930266937e-05, "loss": 0.5296, "num_input_tokens_seen": 20501456, "step": 35550 }, { "epoch": 5.2956508787607985, "grad_norm": 1.8406246900558472, "learning_rate": 2.6783067070716583e-05, "loss": 0.6665, "num_input_tokens_seen": 20504240, "step": 35555 }, { "epoch": 5.296395591301757, "grad_norm": 2.5186569690704346, "learning_rate": 2.6776584718271376e-05, "loss": 0.6996, "num_input_tokens_seen": 20507024, "step": 35560 }, { "epoch": 5.297140303842717, "grad_norm": 2.799769163131714, "learning_rate": 2.67701022457718e-05, "loss": 0.5783, "num_input_tokens_seen": 20509808, "step": 35565 }, { "epoch": 5.297885016383676, "grad_norm": 1.3817929029464722, "learning_rate": 2.6763619653655913e-05, "loss": 0.4745, "num_input_tokens_seen": 20512560, "step": 35570 }, { "epoch": 5.298629728924635, "grad_norm": 1.8081071376800537, "learning_rate": 2.6757136942361776e-05, "loss": 0.6117, "num_input_tokens_seen": 20515632, "step": 35575 }, { "epoch": 5.299374441465594, "grad_norm": 1.3734005689620972, "learning_rate": 2.6750654112327474e-05, "loss": 0.7245, "num_input_tokens_seen": 20518608, "step": 35580 }, { "epoch": 5.300119154006554, "grad_norm": 1.9656295776367188, "learning_rate": 2.674417116399108e-05, "loss": 0.6296, "num_input_tokens_seen": 20521488, "step": 35585 }, { "epoch": 5.3008638665475125, "grad_norm": 1.8931342363357544, "learning_rate": 2.6737688097790693e-05, "loss": 0.7646, "num_input_tokens_seen": 20524272, "step": 35590 }, { "epoch": 5.301608579088472, "grad_norm": 1.6580610275268555, "learning_rate": 2.6731204914164405e-05, "loss": 0.6659, "num_input_tokens_seen": 20527024, "step": 35595 }, { "epoch": 5.302353291629431, "grad_norm": 1.8305954933166504, "learning_rate": 2.672472161355033e-05, "loss": 0.4847, "num_input_tokens_seen": 20529808, "step": 35600 }, { "epoch": 5.303098004170391, "grad_norm": 1.282614827156067, "learning_rate": 2.6718238196386576e-05, "loss": 0.6277, "num_input_tokens_seen": 20532848, "step": 35605 }, { "epoch": 5.303842716711349, "grad_norm": 2.0704023838043213, "learning_rate": 2.6711754663111277e-05, "loss": 0.6781, "num_input_tokens_seen": 20535696, "step": 35610 }, { "epoch": 5.304587429252309, "grad_norm": 5.414908409118652, "learning_rate": 2.6705271014162554e-05, "loss": 1.0298, "num_input_tokens_seen": 20538288, "step": 35615 }, { "epoch": 5.305332141793268, "grad_norm": 2.4937777519226074, "learning_rate": 2.6698787249978546e-05, "loss": 0.7902, "num_input_tokens_seen": 20541456, "step": 35620 }, { "epoch": 5.306076854334227, "grad_norm": 2.0514087677001953, "learning_rate": 2.6692303370997405e-05, "loss": 0.4689, "num_input_tokens_seen": 20544432, "step": 35625 }, { "epoch": 5.306821566875186, "grad_norm": 1.3357852697372437, "learning_rate": 2.668581937765729e-05, "loss": 0.7539, "num_input_tokens_seen": 20547120, "step": 35630 }, { "epoch": 5.307566279416146, "grad_norm": 1.5831396579742432, "learning_rate": 2.667933527039635e-05, "loss": 0.4879, "num_input_tokens_seen": 20550160, "step": 35635 }, { "epoch": 5.3083109919571045, "grad_norm": 1.6275454759597778, "learning_rate": 2.6672851049652752e-05, "loss": 0.685, "num_input_tokens_seen": 20552848, "step": 35640 }, { "epoch": 5.309055704498064, "grad_norm": 3.4021964073181152, "learning_rate": 2.6666366715864694e-05, "loss": 0.6141, "num_input_tokens_seen": 20555728, "step": 35645 }, { "epoch": 5.309800417039023, "grad_norm": 3.383652925491333, "learning_rate": 2.665988226947034e-05, "loss": 0.7343, "num_input_tokens_seen": 20558544, "step": 35650 }, { "epoch": 5.310545129579982, "grad_norm": 1.910901665687561, "learning_rate": 2.6653397710907895e-05, "loss": 0.6504, "num_input_tokens_seen": 20561264, "step": 35655 }, { "epoch": 5.311289842120941, "grad_norm": 1.9528281688690186, "learning_rate": 2.664691304061555e-05, "loss": 0.6117, "num_input_tokens_seen": 20564080, "step": 35660 }, { "epoch": 5.3120345546619, "grad_norm": 2.277453899383545, "learning_rate": 2.6640428259031525e-05, "loss": 0.7064, "num_input_tokens_seen": 20567056, "step": 35665 }, { "epoch": 5.31277926720286, "grad_norm": 1.5263668298721313, "learning_rate": 2.6633943366594027e-05, "loss": 0.4897, "num_input_tokens_seen": 20570064, "step": 35670 }, { "epoch": 5.3135239797438185, "grad_norm": 3.2011220455169678, "learning_rate": 2.6627458363741274e-05, "loss": 0.5145, "num_input_tokens_seen": 20572688, "step": 35675 }, { "epoch": 5.314268692284778, "grad_norm": 1.711776614189148, "learning_rate": 2.6620973250911506e-05, "loss": 0.5611, "num_input_tokens_seen": 20575408, "step": 35680 }, { "epoch": 5.315013404825737, "grad_norm": 2.0322558879852295, "learning_rate": 2.6614488028542948e-05, "loss": 0.6228, "num_input_tokens_seen": 20578224, "step": 35685 }, { "epoch": 5.315758117366697, "grad_norm": 1.6956051588058472, "learning_rate": 2.6608002697073864e-05, "loss": 0.553, "num_input_tokens_seen": 20581008, "step": 35690 }, { "epoch": 5.316502829907655, "grad_norm": 1.7536488771438599, "learning_rate": 2.6601517256942494e-05, "loss": 0.7848, "num_input_tokens_seen": 20583952, "step": 35695 }, { "epoch": 5.317247542448615, "grad_norm": 1.5121067762374878, "learning_rate": 2.6595031708587093e-05, "loss": 0.6681, "num_input_tokens_seen": 20586608, "step": 35700 }, { "epoch": 5.317992254989574, "grad_norm": 1.235470175743103, "learning_rate": 2.6588546052445933e-05, "loss": 0.5476, "num_input_tokens_seen": 20589232, "step": 35705 }, { "epoch": 5.318736967530533, "grad_norm": 2.5854358673095703, "learning_rate": 2.6582060288957295e-05, "loss": 0.536, "num_input_tokens_seen": 20591920, "step": 35710 }, { "epoch": 5.319481680071492, "grad_norm": 2.0221800804138184, "learning_rate": 2.6575574418559456e-05, "loss": 0.6292, "num_input_tokens_seen": 20595056, "step": 35715 }, { "epoch": 5.320226392612452, "grad_norm": 3.5927014350891113, "learning_rate": 2.6569088441690697e-05, "loss": 0.6471, "num_input_tokens_seen": 20597680, "step": 35720 }, { "epoch": 5.3209711051534105, "grad_norm": 1.2980775833129883, "learning_rate": 2.6562602358789324e-05, "loss": 0.73, "num_input_tokens_seen": 20600880, "step": 35725 }, { "epoch": 5.32171581769437, "grad_norm": 3.797990322113037, "learning_rate": 2.6556116170293645e-05, "loss": 0.632, "num_input_tokens_seen": 20603440, "step": 35730 }, { "epoch": 5.322460530235329, "grad_norm": 1.995390772819519, "learning_rate": 2.6549629876641953e-05, "loss": 0.6015, "num_input_tokens_seen": 20606544, "step": 35735 }, { "epoch": 5.323205242776289, "grad_norm": 1.6272330284118652, "learning_rate": 2.654314347827257e-05, "loss": 0.7062, "num_input_tokens_seen": 20609616, "step": 35740 }, { "epoch": 5.323949955317247, "grad_norm": 1.8222754001617432, "learning_rate": 2.653665697562383e-05, "loss": 0.5284, "num_input_tokens_seen": 20612688, "step": 35745 }, { "epoch": 5.324694667858207, "grad_norm": 1.6072832345962524, "learning_rate": 2.6530170369134062e-05, "loss": 0.7218, "num_input_tokens_seen": 20615824, "step": 35750 }, { "epoch": 5.325439380399166, "grad_norm": 1.4072085618972778, "learning_rate": 2.6523683659241594e-05, "loss": 0.5657, "num_input_tokens_seen": 20618704, "step": 35755 }, { "epoch": 5.326184092940125, "grad_norm": 1.322251319885254, "learning_rate": 2.651719684638479e-05, "loss": 0.5953, "num_input_tokens_seen": 20621680, "step": 35760 }, { "epoch": 5.326928805481084, "grad_norm": 1.597527027130127, "learning_rate": 2.651070993100198e-05, "loss": 0.8215, "num_input_tokens_seen": 20624880, "step": 35765 }, { "epoch": 5.327673518022044, "grad_norm": 2.2595887184143066, "learning_rate": 2.6504222913531545e-05, "loss": 0.763, "num_input_tokens_seen": 20627856, "step": 35770 }, { "epoch": 5.328418230563003, "grad_norm": 1.630380630493164, "learning_rate": 2.6497735794411833e-05, "loss": 0.6122, "num_input_tokens_seen": 20630832, "step": 35775 }, { "epoch": 5.329162943103962, "grad_norm": 1.1481822729110718, "learning_rate": 2.6491248574081228e-05, "loss": 0.5277, "num_input_tokens_seen": 20633872, "step": 35780 }, { "epoch": 5.329907655644921, "grad_norm": 2.5297904014587402, "learning_rate": 2.6484761252978107e-05, "loss": 0.7245, "num_input_tokens_seen": 20636624, "step": 35785 }, { "epoch": 5.330652368185881, "grad_norm": 1.8690284490585327, "learning_rate": 2.6478273831540863e-05, "loss": 0.6465, "num_input_tokens_seen": 20639536, "step": 35790 }, { "epoch": 5.331397080726839, "grad_norm": 1.9453076124191284, "learning_rate": 2.647178631020788e-05, "loss": 0.6303, "num_input_tokens_seen": 20642352, "step": 35795 }, { "epoch": 5.332141793267799, "grad_norm": 2.0816245079040527, "learning_rate": 2.6465298689417555e-05, "loss": 0.6147, "num_input_tokens_seen": 20645072, "step": 35800 }, { "epoch": 5.332886505808758, "grad_norm": 1.3361262083053589, "learning_rate": 2.6458810969608304e-05, "loss": 0.586, "num_input_tokens_seen": 20647888, "step": 35805 }, { "epoch": 5.333631218349717, "grad_norm": 1.3723571300506592, "learning_rate": 2.645232315121855e-05, "loss": 0.5172, "num_input_tokens_seen": 20650448, "step": 35810 }, { "epoch": 5.334375930890676, "grad_norm": 1.3120845556259155, "learning_rate": 2.6445835234686693e-05, "loss": 0.5151, "num_input_tokens_seen": 20653296, "step": 35815 }, { "epoch": 5.335120643431635, "grad_norm": 1.3949980735778809, "learning_rate": 2.643934722045117e-05, "loss": 0.6478, "num_input_tokens_seen": 20656208, "step": 35820 }, { "epoch": 5.335865355972595, "grad_norm": 2.5579607486724854, "learning_rate": 2.6432859108950413e-05, "loss": 0.5088, "num_input_tokens_seen": 20659376, "step": 35825 }, { "epoch": 5.336610068513554, "grad_norm": 0.8265396952629089, "learning_rate": 2.642637090062287e-05, "loss": 0.6117, "num_input_tokens_seen": 20662384, "step": 35830 }, { "epoch": 5.337354781054513, "grad_norm": 2.0520477294921875, "learning_rate": 2.6419882595906976e-05, "loss": 0.5127, "num_input_tokens_seen": 20665168, "step": 35835 }, { "epoch": 5.338099493595472, "grad_norm": 1.584264874458313, "learning_rate": 2.6413394195241186e-05, "loss": 0.5346, "num_input_tokens_seen": 20667888, "step": 35840 }, { "epoch": 5.338844206136431, "grad_norm": 1.172059178352356, "learning_rate": 2.6406905699063965e-05, "loss": 0.663, "num_input_tokens_seen": 20670928, "step": 35845 }, { "epoch": 5.33958891867739, "grad_norm": 2.2054975032806396, "learning_rate": 2.640041710781378e-05, "loss": 0.5808, "num_input_tokens_seen": 20673840, "step": 35850 }, { "epoch": 5.34033363121835, "grad_norm": 1.277248501777649, "learning_rate": 2.6393928421929098e-05, "loss": 0.4056, "num_input_tokens_seen": 20676816, "step": 35855 }, { "epoch": 5.341078343759309, "grad_norm": 1.779438853263855, "learning_rate": 2.6387439641848405e-05, "loss": 0.7706, "num_input_tokens_seen": 20679792, "step": 35860 }, { "epoch": 5.341823056300268, "grad_norm": 2.361706018447876, "learning_rate": 2.638095076801017e-05, "loss": 0.6915, "num_input_tokens_seen": 20682800, "step": 35865 }, { "epoch": 5.342567768841227, "grad_norm": 1.5268079042434692, "learning_rate": 2.6374461800852907e-05, "loss": 0.6131, "num_input_tokens_seen": 20685520, "step": 35870 }, { "epoch": 5.343312481382187, "grad_norm": 0.8135453462600708, "learning_rate": 2.63679727408151e-05, "loss": 0.4765, "num_input_tokens_seen": 20688432, "step": 35875 }, { "epoch": 5.344057193923145, "grad_norm": 1.3151969909667969, "learning_rate": 2.6361483588335257e-05, "loss": 0.3051, "num_input_tokens_seen": 20691408, "step": 35880 }, { "epoch": 5.344801906464105, "grad_norm": 0.8136352896690369, "learning_rate": 2.6354994343851884e-05, "loss": 0.5888, "num_input_tokens_seen": 20694448, "step": 35885 }, { "epoch": 5.345546619005064, "grad_norm": 0.5054925084114075, "learning_rate": 2.6348505007803515e-05, "loss": 0.4285, "num_input_tokens_seen": 20697104, "step": 35890 }, { "epoch": 5.346291331546023, "grad_norm": 2.3597118854522705, "learning_rate": 2.6342015580628655e-05, "loss": 0.7209, "num_input_tokens_seen": 20699856, "step": 35895 }, { "epoch": 5.347036044086982, "grad_norm": 1.227891206741333, "learning_rate": 2.633552606276583e-05, "loss": 0.5637, "num_input_tokens_seen": 20702608, "step": 35900 }, { "epoch": 5.347780756627942, "grad_norm": 1.5620037317276, "learning_rate": 2.6329036454653588e-05, "loss": 0.4475, "num_input_tokens_seen": 20705424, "step": 35905 }, { "epoch": 5.348525469168901, "grad_norm": 1.7607018947601318, "learning_rate": 2.632254675673047e-05, "loss": 0.7438, "num_input_tokens_seen": 20708112, "step": 35910 }, { "epoch": 5.34927018170986, "grad_norm": 1.7799060344696045, "learning_rate": 2.6316056969435022e-05, "loss": 0.6383, "num_input_tokens_seen": 20711312, "step": 35915 }, { "epoch": 5.350014894250819, "grad_norm": 1.66749906539917, "learning_rate": 2.6309567093205784e-05, "loss": 0.4709, "num_input_tokens_seen": 20714288, "step": 35920 }, { "epoch": 5.350759606791779, "grad_norm": 1.8414239883422852, "learning_rate": 2.6303077128481335e-05, "loss": 0.8393, "num_input_tokens_seen": 20717296, "step": 35925 }, { "epoch": 5.351504319332737, "grad_norm": 2.0743415355682373, "learning_rate": 2.629658707570023e-05, "loss": 0.5636, "num_input_tokens_seen": 20720240, "step": 35930 }, { "epoch": 5.352249031873697, "grad_norm": 1.8494741916656494, "learning_rate": 2.6290096935301034e-05, "loss": 0.6192, "num_input_tokens_seen": 20723120, "step": 35935 }, { "epoch": 5.352993744414656, "grad_norm": 2.162024736404419, "learning_rate": 2.6283606707722336e-05, "loss": 0.7813, "num_input_tokens_seen": 20726096, "step": 35940 }, { "epoch": 5.3537384569556155, "grad_norm": 2.3799164295196533, "learning_rate": 2.6277116393402718e-05, "loss": 0.6954, "num_input_tokens_seen": 20728976, "step": 35945 }, { "epoch": 5.354483169496574, "grad_norm": 1.8238106966018677, "learning_rate": 2.6270625992780772e-05, "loss": 0.672, "num_input_tokens_seen": 20731728, "step": 35950 }, { "epoch": 5.355227882037534, "grad_norm": 1.9526851177215576, "learning_rate": 2.626413550629508e-05, "loss": 0.5678, "num_input_tokens_seen": 20734736, "step": 35955 }, { "epoch": 5.355972594578493, "grad_norm": 2.9243273735046387, "learning_rate": 2.625764493438425e-05, "loss": 0.5915, "num_input_tokens_seen": 20737488, "step": 35960 }, { "epoch": 5.356717307119452, "grad_norm": 1.9072033166885376, "learning_rate": 2.625115427748689e-05, "loss": 0.5786, "num_input_tokens_seen": 20740688, "step": 35965 }, { "epoch": 5.357462019660411, "grad_norm": 1.8336812257766724, "learning_rate": 2.6244663536041614e-05, "loss": 0.6202, "num_input_tokens_seen": 20743440, "step": 35970 }, { "epoch": 5.358206732201371, "grad_norm": 1.2488964796066284, "learning_rate": 2.623817271048703e-05, "loss": 0.5778, "num_input_tokens_seen": 20746384, "step": 35975 }, { "epoch": 5.358951444742329, "grad_norm": 1.0214622020721436, "learning_rate": 2.623168180126177e-05, "loss": 0.5956, "num_input_tokens_seen": 20749072, "step": 35980 }, { "epoch": 5.359696157283288, "grad_norm": 1.7895591259002686, "learning_rate": 2.6225190808804463e-05, "loss": 0.536, "num_input_tokens_seen": 20751952, "step": 35985 }, { "epoch": 5.360440869824248, "grad_norm": 2.0814249515533447, "learning_rate": 2.6218699733553742e-05, "loss": 0.6467, "num_input_tokens_seen": 20755024, "step": 35990 }, { "epoch": 5.3611855823652075, "grad_norm": 2.4962635040283203, "learning_rate": 2.6212208575948257e-05, "loss": 0.6679, "num_input_tokens_seen": 20757840, "step": 35995 }, { "epoch": 5.361930294906166, "grad_norm": 1.069156289100647, "learning_rate": 2.6205717336426632e-05, "loss": 0.5033, "num_input_tokens_seen": 20760944, "step": 36000 }, { "epoch": 5.362675007447125, "grad_norm": 1.6022446155548096, "learning_rate": 2.6199226015427532e-05, "loss": 0.6323, "num_input_tokens_seen": 20763728, "step": 36005 }, { "epoch": 5.363419719988085, "grad_norm": 1.45572030544281, "learning_rate": 2.619273461338962e-05, "loss": 0.457, "num_input_tokens_seen": 20766736, "step": 36010 }, { "epoch": 5.364164432529043, "grad_norm": 2.0740299224853516, "learning_rate": 2.6186243130751554e-05, "loss": 0.6455, "num_input_tokens_seen": 20769680, "step": 36015 }, { "epoch": 5.364909145070003, "grad_norm": 1.6323105096817017, "learning_rate": 2.6179751567951992e-05, "loss": 0.7154, "num_input_tokens_seen": 20772592, "step": 36020 }, { "epoch": 5.365653857610962, "grad_norm": 2.133349657058716, "learning_rate": 2.617325992542962e-05, "loss": 0.6524, "num_input_tokens_seen": 20775376, "step": 36025 }, { "epoch": 5.3663985701519215, "grad_norm": 2.1278746128082275, "learning_rate": 2.616676820362311e-05, "loss": 0.5616, "num_input_tokens_seen": 20778544, "step": 36030 }, { "epoch": 5.36714328269288, "grad_norm": 1.8239935636520386, "learning_rate": 2.6160276402971153e-05, "loss": 0.6234, "num_input_tokens_seen": 20781584, "step": 36035 }, { "epoch": 5.36788799523384, "grad_norm": 1.2565385103225708, "learning_rate": 2.615378452391243e-05, "loss": 0.4587, "num_input_tokens_seen": 20784400, "step": 36040 }, { "epoch": 5.368632707774799, "grad_norm": 1.8836716413497925, "learning_rate": 2.614729256688564e-05, "loss": 0.7195, "num_input_tokens_seen": 20787248, "step": 36045 }, { "epoch": 5.369377420315758, "grad_norm": 2.0639162063598633, "learning_rate": 2.6140800532329486e-05, "loss": 0.5969, "num_input_tokens_seen": 20789936, "step": 36050 }, { "epoch": 5.370122132856717, "grad_norm": 1.46291184425354, "learning_rate": 2.6134308420682667e-05, "loss": 0.6218, "num_input_tokens_seen": 20793200, "step": 36055 }, { "epoch": 5.370866845397677, "grad_norm": 1.118965983390808, "learning_rate": 2.61278162323839e-05, "loss": 0.422, "num_input_tokens_seen": 20796240, "step": 36060 }, { "epoch": 5.371611557938635, "grad_norm": 1.6392539739608765, "learning_rate": 2.612132396787189e-05, "loss": 0.4968, "num_input_tokens_seen": 20798960, "step": 36065 }, { "epoch": 5.372356270479595, "grad_norm": 1.568479061126709, "learning_rate": 2.6114831627585367e-05, "loss": 0.5476, "num_input_tokens_seen": 20801872, "step": 36070 }, { "epoch": 5.373100983020554, "grad_norm": 1.589606761932373, "learning_rate": 2.610833921196306e-05, "loss": 0.6353, "num_input_tokens_seen": 20804784, "step": 36075 }, { "epoch": 5.3738456955615135, "grad_norm": 1.737617015838623, "learning_rate": 2.61018467214437e-05, "loss": 0.6413, "num_input_tokens_seen": 20807856, "step": 36080 }, { "epoch": 5.374590408102472, "grad_norm": 1.7615913152694702, "learning_rate": 2.609535415646601e-05, "loss": 0.6173, "num_input_tokens_seen": 20810544, "step": 36085 }, { "epoch": 5.375335120643432, "grad_norm": 2.190113067626953, "learning_rate": 2.6088861517468745e-05, "loss": 0.6836, "num_input_tokens_seen": 20813360, "step": 36090 }, { "epoch": 5.376079833184391, "grad_norm": 2.1486711502075195, "learning_rate": 2.6082368804890644e-05, "loss": 0.8279, "num_input_tokens_seen": 20816272, "step": 36095 }, { "epoch": 5.37682454572535, "grad_norm": 4.291131496429443, "learning_rate": 2.6075876019170453e-05, "loss": 0.5853, "num_input_tokens_seen": 20819088, "step": 36100 }, { "epoch": 5.377569258266309, "grad_norm": 1.4306695461273193, "learning_rate": 2.606938316074694e-05, "loss": 0.7541, "num_input_tokens_seen": 20822000, "step": 36105 }, { "epoch": 5.378313970807269, "grad_norm": 1.8143367767333984, "learning_rate": 2.606289023005886e-05, "loss": 0.4206, "num_input_tokens_seen": 20824784, "step": 36110 }, { "epoch": 5.3790586833482275, "grad_norm": 1.54168701171875, "learning_rate": 2.6056397227544988e-05, "loss": 0.6657, "num_input_tokens_seen": 20827536, "step": 36115 }, { "epoch": 5.379803395889187, "grad_norm": 1.72281014919281, "learning_rate": 2.6049904153644072e-05, "loss": 0.6661, "num_input_tokens_seen": 20830512, "step": 36120 }, { "epoch": 5.380548108430146, "grad_norm": 1.9604506492614746, "learning_rate": 2.6043411008794915e-05, "loss": 0.5997, "num_input_tokens_seen": 20833168, "step": 36125 }, { "epoch": 5.3812928209711055, "grad_norm": 1.6073975563049316, "learning_rate": 2.603691779343627e-05, "loss": 0.4776, "num_input_tokens_seen": 20835920, "step": 36130 }, { "epoch": 5.382037533512064, "grad_norm": 1.8118157386779785, "learning_rate": 2.603042450800695e-05, "loss": 0.5471, "num_input_tokens_seen": 20838896, "step": 36135 }, { "epoch": 5.382782246053024, "grad_norm": 1.5944764614105225, "learning_rate": 2.6023931152945725e-05, "loss": 0.6208, "num_input_tokens_seen": 20841712, "step": 36140 }, { "epoch": 5.383526958593983, "grad_norm": 2.4751839637756348, "learning_rate": 2.6017437728691396e-05, "loss": 0.6074, "num_input_tokens_seen": 20844848, "step": 36145 }, { "epoch": 5.384271671134942, "grad_norm": 2.1717443466186523, "learning_rate": 2.601094423568276e-05, "loss": 0.6558, "num_input_tokens_seen": 20847888, "step": 36150 }, { "epoch": 5.385016383675901, "grad_norm": 1.8694795370101929, "learning_rate": 2.6004450674358628e-05, "loss": 0.6957, "num_input_tokens_seen": 20850672, "step": 36155 }, { "epoch": 5.385761096216861, "grad_norm": 1.8070659637451172, "learning_rate": 2.59979570451578e-05, "loss": 0.5466, "num_input_tokens_seen": 20853392, "step": 36160 }, { "epoch": 5.3865058087578195, "grad_norm": 3.8146421909332275, "learning_rate": 2.599146334851909e-05, "loss": 0.6267, "num_input_tokens_seen": 20856176, "step": 36165 }, { "epoch": 5.387250521298778, "grad_norm": 2.8115134239196777, "learning_rate": 2.598496958488132e-05, "loss": 0.5237, "num_input_tokens_seen": 20859152, "step": 36170 }, { "epoch": 5.387995233839738, "grad_norm": 1.037084937095642, "learning_rate": 2.5978475754683307e-05, "loss": 0.663, "num_input_tokens_seen": 20862096, "step": 36175 }, { "epoch": 5.388739946380697, "grad_norm": 1.6482200622558594, "learning_rate": 2.5971981858363886e-05, "loss": 0.5364, "num_input_tokens_seen": 20865200, "step": 36180 }, { "epoch": 5.389484658921656, "grad_norm": 1.603331446647644, "learning_rate": 2.5965487896361878e-05, "loss": 0.5117, "num_input_tokens_seen": 20867920, "step": 36185 }, { "epoch": 5.390229371462615, "grad_norm": 1.155501127243042, "learning_rate": 2.5958993869116134e-05, "loss": 0.787, "num_input_tokens_seen": 20870576, "step": 36190 }, { "epoch": 5.390974084003575, "grad_norm": 1.7577847242355347, "learning_rate": 2.5952499777065474e-05, "loss": 0.7208, "num_input_tokens_seen": 20873328, "step": 36195 }, { "epoch": 5.3917187965445335, "grad_norm": 2.5969653129577637, "learning_rate": 2.594600562064875e-05, "loss": 0.6784, "num_input_tokens_seen": 20876496, "step": 36200 }, { "epoch": 5.392463509085493, "grad_norm": 0.9966578483581543, "learning_rate": 2.593951140030481e-05, "loss": 0.6035, "num_input_tokens_seen": 20879600, "step": 36205 }, { "epoch": 5.393208221626452, "grad_norm": 2.6543898582458496, "learning_rate": 2.593301711647252e-05, "loss": 0.6025, "num_input_tokens_seen": 20882448, "step": 36210 }, { "epoch": 5.3939529341674115, "grad_norm": 1.1730095148086548, "learning_rate": 2.592652276959072e-05, "loss": 0.6012, "num_input_tokens_seen": 20885424, "step": 36215 }, { "epoch": 5.39469764670837, "grad_norm": 3.5269997119903564, "learning_rate": 2.592002836009828e-05, "loss": 0.5514, "num_input_tokens_seen": 20888080, "step": 36220 }, { "epoch": 5.39544235924933, "grad_norm": 1.480607271194458, "learning_rate": 2.5913533888434067e-05, "loss": 0.402, "num_input_tokens_seen": 20891024, "step": 36225 }, { "epoch": 5.396187071790289, "grad_norm": 0.9661433696746826, "learning_rate": 2.5907039355036944e-05, "loss": 0.67, "num_input_tokens_seen": 20894064, "step": 36230 }, { "epoch": 5.396931784331248, "grad_norm": 2.916066884994507, "learning_rate": 2.590054476034579e-05, "loss": 0.4952, "num_input_tokens_seen": 20896976, "step": 36235 }, { "epoch": 5.397676496872207, "grad_norm": 3.581758975982666, "learning_rate": 2.5894050104799477e-05, "loss": 0.5975, "num_input_tokens_seen": 20899984, "step": 36240 }, { "epoch": 5.398421209413167, "grad_norm": 2.386934995651245, "learning_rate": 2.5887555388836905e-05, "loss": 0.6546, "num_input_tokens_seen": 20902992, "step": 36245 }, { "epoch": 5.3991659219541255, "grad_norm": 4.25590705871582, "learning_rate": 2.5881060612896936e-05, "loss": 0.8199, "num_input_tokens_seen": 20905552, "step": 36250 }, { "epoch": 5.399910634495085, "grad_norm": 2.3856310844421387, "learning_rate": 2.587456577741848e-05, "loss": 0.6154, "num_input_tokens_seen": 20908528, "step": 36255 }, { "epoch": 5.400655347036044, "grad_norm": 2.3099260330200195, "learning_rate": 2.5868070882840423e-05, "loss": 0.5338, "num_input_tokens_seen": 20911376, "step": 36260 }, { "epoch": 5.4014000595770035, "grad_norm": 3.606459379196167, "learning_rate": 2.5861575929601663e-05, "loss": 0.6352, "num_input_tokens_seen": 20914256, "step": 36265 }, { "epoch": 5.402144772117962, "grad_norm": 1.8880679607391357, "learning_rate": 2.5855080918141107e-05, "loss": 0.6972, "num_input_tokens_seen": 20916944, "step": 36270 }, { "epoch": 5.402889484658922, "grad_norm": 2.775830030441284, "learning_rate": 2.5848585848897654e-05, "loss": 0.4735, "num_input_tokens_seen": 20919536, "step": 36275 }, { "epoch": 5.403634197199881, "grad_norm": 2.2044761180877686, "learning_rate": 2.584209072231023e-05, "loss": 0.6043, "num_input_tokens_seen": 20922288, "step": 36280 }, { "epoch": 5.40437890974084, "grad_norm": 2.726201295852661, "learning_rate": 2.583559553881773e-05, "loss": 0.5708, "num_input_tokens_seen": 20925104, "step": 36285 }, { "epoch": 5.405123622281799, "grad_norm": 1.853737711906433, "learning_rate": 2.582910029885909e-05, "loss": 0.6128, "num_input_tokens_seen": 20927856, "step": 36290 }, { "epoch": 5.405868334822759, "grad_norm": 2.129307985305786, "learning_rate": 2.5822605002873213e-05, "loss": 0.609, "num_input_tokens_seen": 20930864, "step": 36295 }, { "epoch": 5.4066130473637175, "grad_norm": 1.3212600946426392, "learning_rate": 2.5816109651299035e-05, "loss": 0.6711, "num_input_tokens_seen": 20934288, "step": 36300 }, { "epoch": 5.407357759904677, "grad_norm": 2.056222438812256, "learning_rate": 2.5809614244575488e-05, "loss": 0.6505, "num_input_tokens_seen": 20937040, "step": 36305 }, { "epoch": 5.408102472445636, "grad_norm": 1.2298940420150757, "learning_rate": 2.58031187831415e-05, "loss": 0.6543, "num_input_tokens_seen": 20939920, "step": 36310 }, { "epoch": 5.408847184986596, "grad_norm": 2.150036573410034, "learning_rate": 2.5796623267436016e-05, "loss": 0.7486, "num_input_tokens_seen": 20942992, "step": 36315 }, { "epoch": 5.409591897527554, "grad_norm": 1.2499257326126099, "learning_rate": 2.579012769789796e-05, "loss": 0.6513, "num_input_tokens_seen": 20945712, "step": 36320 }, { "epoch": 5.410336610068514, "grad_norm": 0.7162776589393616, "learning_rate": 2.5783632074966298e-05, "loss": 0.4757, "num_input_tokens_seen": 20948208, "step": 36325 }, { "epoch": 5.411081322609473, "grad_norm": 1.3759610652923584, "learning_rate": 2.5777136399079955e-05, "loss": 0.6654, "num_input_tokens_seen": 20951280, "step": 36330 }, { "epoch": 5.4118260351504315, "grad_norm": 4.879400730133057, "learning_rate": 2.5770640670677902e-05, "loss": 0.7413, "num_input_tokens_seen": 20954000, "step": 36335 }, { "epoch": 5.412570747691391, "grad_norm": 1.8164176940917969, "learning_rate": 2.5764144890199078e-05, "loss": 0.5159, "num_input_tokens_seen": 20956976, "step": 36340 }, { "epoch": 5.413315460232351, "grad_norm": 2.559713840484619, "learning_rate": 2.5757649058082455e-05, "loss": 0.6621, "num_input_tokens_seen": 20960016, "step": 36345 }, { "epoch": 5.4140601727733095, "grad_norm": 1.3339108228683472, "learning_rate": 2.5751153174766983e-05, "loss": 0.5287, "num_input_tokens_seen": 20962672, "step": 36350 }, { "epoch": 5.414804885314268, "grad_norm": 1.3088864088058472, "learning_rate": 2.5744657240691646e-05, "loss": 0.4842, "num_input_tokens_seen": 20965584, "step": 36355 }, { "epoch": 5.415549597855228, "grad_norm": 1.4363250732421875, "learning_rate": 2.5738161256295396e-05, "loss": 0.648, "num_input_tokens_seen": 20968656, "step": 36360 }, { "epoch": 5.416294310396187, "grad_norm": 0.9647583365440369, "learning_rate": 2.5731665222017202e-05, "loss": 0.6192, "num_input_tokens_seen": 20971760, "step": 36365 }, { "epoch": 5.417039022937146, "grad_norm": 1.778928279876709, "learning_rate": 2.5725169138296046e-05, "loss": 0.6808, "num_input_tokens_seen": 20974864, "step": 36370 }, { "epoch": 5.417783735478105, "grad_norm": 0.9627236723899841, "learning_rate": 2.571867300557092e-05, "loss": 0.5392, "num_input_tokens_seen": 20977456, "step": 36375 }, { "epoch": 5.418528448019065, "grad_norm": 3.030128240585327, "learning_rate": 2.5712176824280787e-05, "loss": 0.6127, "num_input_tokens_seen": 20980208, "step": 36380 }, { "epoch": 5.4192731605600235, "grad_norm": 2.13039493560791, "learning_rate": 2.5705680594864634e-05, "loss": 0.6201, "num_input_tokens_seen": 20983152, "step": 36385 }, { "epoch": 5.420017873100983, "grad_norm": 1.556775689125061, "learning_rate": 2.5699184317761465e-05, "loss": 0.8036, "num_input_tokens_seen": 20986128, "step": 36390 }, { "epoch": 5.420762585641942, "grad_norm": 1.287339687347412, "learning_rate": 2.5692687993410263e-05, "loss": 0.7022, "num_input_tokens_seen": 20989040, "step": 36395 }, { "epoch": 5.421507298182902, "grad_norm": 1.482934594154358, "learning_rate": 2.5686191622250017e-05, "loss": 0.5829, "num_input_tokens_seen": 20991792, "step": 36400 }, { "epoch": 5.42225201072386, "grad_norm": 2.570345640182495, "learning_rate": 2.567969520471973e-05, "loss": 0.6845, "num_input_tokens_seen": 20994576, "step": 36405 }, { "epoch": 5.42299672326482, "grad_norm": 1.3604069948196411, "learning_rate": 2.5673198741258408e-05, "loss": 0.6344, "num_input_tokens_seen": 20996976, "step": 36410 }, { "epoch": 5.423741435805779, "grad_norm": 2.601426839828491, "learning_rate": 2.5666702232305055e-05, "loss": 0.4553, "num_input_tokens_seen": 20999760, "step": 36415 }, { "epoch": 5.424486148346738, "grad_norm": 2.193650484085083, "learning_rate": 2.5660205678298664e-05, "loss": 0.4995, "num_input_tokens_seen": 21002704, "step": 36420 }, { "epoch": 5.425230860887697, "grad_norm": 1.481953740119934, "learning_rate": 2.5653709079678274e-05, "loss": 0.6528, "num_input_tokens_seen": 21005456, "step": 36425 }, { "epoch": 5.425975573428657, "grad_norm": 1.9057673215866089, "learning_rate": 2.5647212436882867e-05, "loss": 0.6517, "num_input_tokens_seen": 21008144, "step": 36430 }, { "epoch": 5.4267202859696155, "grad_norm": 3.1420788764953613, "learning_rate": 2.5640715750351486e-05, "loss": 0.7405, "num_input_tokens_seen": 21010864, "step": 36435 }, { "epoch": 5.427464998510575, "grad_norm": 1.538856029510498, "learning_rate": 2.5634219020523132e-05, "loss": 0.5853, "num_input_tokens_seen": 21013936, "step": 36440 }, { "epoch": 5.428209711051534, "grad_norm": 0.936373770236969, "learning_rate": 2.5627722247836838e-05, "loss": 0.5993, "num_input_tokens_seen": 21016496, "step": 36445 }, { "epoch": 5.428954423592494, "grad_norm": 1.6302636861801147, "learning_rate": 2.5621225432731626e-05, "loss": 0.6035, "num_input_tokens_seen": 21019376, "step": 36450 }, { "epoch": 5.429699136133452, "grad_norm": 1.609188437461853, "learning_rate": 2.561472857564653e-05, "loss": 0.5776, "num_input_tokens_seen": 21022224, "step": 36455 }, { "epoch": 5.430443848674412, "grad_norm": 1.5400404930114746, "learning_rate": 2.560823167702057e-05, "loss": 0.5165, "num_input_tokens_seen": 21025072, "step": 36460 }, { "epoch": 5.431188561215371, "grad_norm": 8.280948638916016, "learning_rate": 2.5601734737292787e-05, "loss": 0.6602, "num_input_tokens_seen": 21027920, "step": 36465 }, { "epoch": 5.43193327375633, "grad_norm": 2.3859875202178955, "learning_rate": 2.5595237756902217e-05, "loss": 0.6513, "num_input_tokens_seen": 21030832, "step": 36470 }, { "epoch": 5.432677986297289, "grad_norm": 2.55395245552063, "learning_rate": 2.558874073628791e-05, "loss": 0.6775, "num_input_tokens_seen": 21033552, "step": 36475 }, { "epoch": 5.433422698838249, "grad_norm": 0.9655036926269531, "learning_rate": 2.5582243675888885e-05, "loss": 0.5596, "num_input_tokens_seen": 21036240, "step": 36480 }, { "epoch": 5.434167411379208, "grad_norm": 3.3983960151672363, "learning_rate": 2.55757465761442e-05, "loss": 0.6354, "num_input_tokens_seen": 21039248, "step": 36485 }, { "epoch": 5.434912123920167, "grad_norm": 1.6785225868225098, "learning_rate": 2.5569249437492903e-05, "loss": 0.5294, "num_input_tokens_seen": 21042864, "step": 36490 }, { "epoch": 5.435656836461126, "grad_norm": 2.0854368209838867, "learning_rate": 2.5562752260374053e-05, "loss": 0.6926, "num_input_tokens_seen": 21045744, "step": 36495 }, { "epoch": 5.436401549002086, "grad_norm": 2.197227954864502, "learning_rate": 2.555625504522668e-05, "loss": 0.7802, "num_input_tokens_seen": 21048560, "step": 36500 }, { "epoch": 5.437146261543044, "grad_norm": 3.584327459335327, "learning_rate": 2.5549757792489853e-05, "loss": 0.7, "num_input_tokens_seen": 21051344, "step": 36505 }, { "epoch": 5.437890974084004, "grad_norm": 1.8852119445800781, "learning_rate": 2.554326050260264e-05, "loss": 0.7623, "num_input_tokens_seen": 21054096, "step": 36510 }, { "epoch": 5.438635686624963, "grad_norm": 1.5375125408172607, "learning_rate": 2.5536763176004086e-05, "loss": 0.6874, "num_input_tokens_seen": 21056912, "step": 36515 }, { "epoch": 5.4393803991659215, "grad_norm": 1.5426135063171387, "learning_rate": 2.553026581313326e-05, "loss": 0.5739, "num_input_tokens_seen": 21059664, "step": 36520 }, { "epoch": 5.440125111706881, "grad_norm": 2.6730594635009766, "learning_rate": 2.5523768414429227e-05, "loss": 0.7617, "num_input_tokens_seen": 21062672, "step": 36525 }, { "epoch": 5.44086982424784, "grad_norm": 1.4602807760238647, "learning_rate": 2.551727098033105e-05, "loss": 0.6196, "num_input_tokens_seen": 21065488, "step": 36530 }, { "epoch": 5.4416145367888, "grad_norm": 2.3718106746673584, "learning_rate": 2.5510773511277804e-05, "loss": 0.6338, "num_input_tokens_seen": 21068432, "step": 36535 }, { "epoch": 5.442359249329758, "grad_norm": 1.3909411430358887, "learning_rate": 2.5504276007708566e-05, "loss": 0.5845, "num_input_tokens_seen": 21071216, "step": 36540 }, { "epoch": 5.443103961870718, "grad_norm": 1.2316054105758667, "learning_rate": 2.54977784700624e-05, "loss": 0.5876, "num_input_tokens_seen": 21073840, "step": 36545 }, { "epoch": 5.443848674411677, "grad_norm": 1.8282511234283447, "learning_rate": 2.5491280898778386e-05, "loss": 0.4703, "num_input_tokens_seen": 21076880, "step": 36550 }, { "epoch": 5.444593386952636, "grad_norm": 1.1752245426177979, "learning_rate": 2.548478329429561e-05, "loss": 0.5702, "num_input_tokens_seen": 21079920, "step": 36555 }, { "epoch": 5.445338099493595, "grad_norm": 1.5506514310836792, "learning_rate": 2.547828565705316e-05, "loss": 0.6923, "num_input_tokens_seen": 21082672, "step": 36560 }, { "epoch": 5.446082812034555, "grad_norm": 1.6357706785202026, "learning_rate": 2.5471787987490092e-05, "loss": 0.6783, "num_input_tokens_seen": 21085360, "step": 36565 }, { "epoch": 5.446827524575514, "grad_norm": 1.8713003396987915, "learning_rate": 2.5465290286045518e-05, "loss": 0.519, "num_input_tokens_seen": 21088272, "step": 36570 }, { "epoch": 5.447572237116473, "grad_norm": 1.6866616010665894, "learning_rate": 2.5458792553158518e-05, "loss": 0.5252, "num_input_tokens_seen": 21091088, "step": 36575 }, { "epoch": 5.448316949657432, "grad_norm": 2.404223918914795, "learning_rate": 2.5452294789268187e-05, "loss": 0.5848, "num_input_tokens_seen": 21094160, "step": 36580 }, { "epoch": 5.449061662198392, "grad_norm": 2.3069756031036377, "learning_rate": 2.544579699481361e-05, "loss": 0.7184, "num_input_tokens_seen": 21097072, "step": 36585 }, { "epoch": 5.44980637473935, "grad_norm": 2.3949615955352783, "learning_rate": 2.5439299170233883e-05, "loss": 0.7713, "num_input_tokens_seen": 21100112, "step": 36590 }, { "epoch": 5.45055108728031, "grad_norm": 2.350278854370117, "learning_rate": 2.54328013159681e-05, "loss": 0.4275, "num_input_tokens_seen": 21103088, "step": 36595 }, { "epoch": 5.451295799821269, "grad_norm": 2.3230862617492676, "learning_rate": 2.5426303432455374e-05, "loss": 0.8787, "num_input_tokens_seen": 21106064, "step": 36600 }, { "epoch": 5.452040512362228, "grad_norm": 3.548996686935425, "learning_rate": 2.5419805520134788e-05, "loss": 0.6675, "num_input_tokens_seen": 21108880, "step": 36605 }, { "epoch": 5.452785224903187, "grad_norm": 2.080522298812866, "learning_rate": 2.5413307579445456e-05, "loss": 0.6283, "num_input_tokens_seen": 21111760, "step": 36610 }, { "epoch": 5.453529937444147, "grad_norm": 0.8931229114532471, "learning_rate": 2.540680961082647e-05, "loss": 0.5654, "num_input_tokens_seen": 21114352, "step": 36615 }, { "epoch": 5.454274649985106, "grad_norm": 1.8102699518203735, "learning_rate": 2.5400311614716955e-05, "loss": 0.7693, "num_input_tokens_seen": 21116976, "step": 36620 }, { "epoch": 5.455019362526065, "grad_norm": 1.9731212854385376, "learning_rate": 2.5393813591556002e-05, "loss": 0.6387, "num_input_tokens_seen": 21120080, "step": 36625 }, { "epoch": 5.455764075067024, "grad_norm": 3.5143885612487793, "learning_rate": 2.538731554178273e-05, "loss": 0.7543, "num_input_tokens_seen": 21122960, "step": 36630 }, { "epoch": 5.456508787607984, "grad_norm": 1.995018720626831, "learning_rate": 2.5380817465836245e-05, "loss": 0.6782, "num_input_tokens_seen": 21126192, "step": 36635 }, { "epoch": 5.457253500148942, "grad_norm": 1.1809874773025513, "learning_rate": 2.5374319364155673e-05, "loss": 0.6183, "num_input_tokens_seen": 21129136, "step": 36640 }, { "epoch": 5.457998212689902, "grad_norm": 2.0804574489593506, "learning_rate": 2.536782123718011e-05, "loss": 0.739, "num_input_tokens_seen": 21131824, "step": 36645 }, { "epoch": 5.458742925230861, "grad_norm": 1.265733242034912, "learning_rate": 2.5361323085348687e-05, "loss": 0.7395, "num_input_tokens_seen": 21134704, "step": 36650 }, { "epoch": 5.4594876377718204, "grad_norm": 2.172913074493408, "learning_rate": 2.5354824909100522e-05, "loss": 0.7025, "num_input_tokens_seen": 21137552, "step": 36655 }, { "epoch": 5.460232350312779, "grad_norm": 2.165208578109741, "learning_rate": 2.5348326708874732e-05, "loss": 0.5793, "num_input_tokens_seen": 21140304, "step": 36660 }, { "epoch": 5.460977062853739, "grad_norm": 1.4599590301513672, "learning_rate": 2.5341828485110435e-05, "loss": 0.8049, "num_input_tokens_seen": 21143504, "step": 36665 }, { "epoch": 5.461721775394698, "grad_norm": 2.3648862838745117, "learning_rate": 2.5335330238246756e-05, "loss": 0.5252, "num_input_tokens_seen": 21146512, "step": 36670 }, { "epoch": 5.462466487935657, "grad_norm": 2.1014840602874756, "learning_rate": 2.532883196872283e-05, "loss": 0.9003, "num_input_tokens_seen": 21149520, "step": 36675 }, { "epoch": 5.463211200476616, "grad_norm": 1.9935557842254639, "learning_rate": 2.5322333676977778e-05, "loss": 0.595, "num_input_tokens_seen": 21152432, "step": 36680 }, { "epoch": 5.463955913017575, "grad_norm": 3.1698153018951416, "learning_rate": 2.531583536345072e-05, "loss": 0.5773, "num_input_tokens_seen": 21155632, "step": 36685 }, { "epoch": 5.464700625558534, "grad_norm": 1.7530442476272583, "learning_rate": 2.5309337028580792e-05, "loss": 0.7592, "num_input_tokens_seen": 21158640, "step": 36690 }, { "epoch": 5.465445338099494, "grad_norm": 0.898385763168335, "learning_rate": 2.5302838672807128e-05, "loss": 0.5534, "num_input_tokens_seen": 21161552, "step": 36695 }, { "epoch": 5.466190050640453, "grad_norm": 2.3312947750091553, "learning_rate": 2.529634029656886e-05, "loss": 0.6642, "num_input_tokens_seen": 21164112, "step": 36700 }, { "epoch": 5.466934763181412, "grad_norm": 1.9214180707931519, "learning_rate": 2.528984190030512e-05, "loss": 0.7525, "num_input_tokens_seen": 21167216, "step": 36705 }, { "epoch": 5.467679475722371, "grad_norm": 1.1160286664962769, "learning_rate": 2.5283343484455036e-05, "loss": 0.5677, "num_input_tokens_seen": 21170064, "step": 36710 }, { "epoch": 5.46842418826333, "grad_norm": 1.4481536149978638, "learning_rate": 2.5276845049457754e-05, "loss": 0.6416, "num_input_tokens_seen": 21172944, "step": 36715 }, { "epoch": 5.46916890080429, "grad_norm": 2.2906222343444824, "learning_rate": 2.5270346595752414e-05, "loss": 0.6473, "num_input_tokens_seen": 21175888, "step": 36720 }, { "epoch": 5.469913613345248, "grad_norm": 1.8626614809036255, "learning_rate": 2.526384812377815e-05, "loss": 0.6013, "num_input_tokens_seen": 21178832, "step": 36725 }, { "epoch": 5.470658325886208, "grad_norm": 1.551073670387268, "learning_rate": 2.525734963397409e-05, "loss": 0.451, "num_input_tokens_seen": 21181840, "step": 36730 }, { "epoch": 5.471403038427167, "grad_norm": 1.5062954425811768, "learning_rate": 2.5250851126779397e-05, "loss": 0.6912, "num_input_tokens_seen": 21185008, "step": 36735 }, { "epoch": 5.4721477509681264, "grad_norm": 1.7447394132614136, "learning_rate": 2.5244352602633215e-05, "loss": 0.8321, "num_input_tokens_seen": 21188176, "step": 36740 }, { "epoch": 5.472892463509085, "grad_norm": 1.5811469554901123, "learning_rate": 2.5237854061974665e-05, "loss": 0.6341, "num_input_tokens_seen": 21190992, "step": 36745 }, { "epoch": 5.473637176050045, "grad_norm": 1.8278957605361938, "learning_rate": 2.5231355505242906e-05, "loss": 0.6225, "num_input_tokens_seen": 21193904, "step": 36750 }, { "epoch": 5.474381888591004, "grad_norm": 1.7614588737487793, "learning_rate": 2.5224856932877083e-05, "loss": 0.682, "num_input_tokens_seen": 21196752, "step": 36755 }, { "epoch": 5.475126601131963, "grad_norm": 1.1669667959213257, "learning_rate": 2.5218358345316346e-05, "loss": 0.6751, "num_input_tokens_seen": 21199536, "step": 36760 }, { "epoch": 5.475871313672922, "grad_norm": 1.2814819812774658, "learning_rate": 2.521185974299983e-05, "loss": 0.3399, "num_input_tokens_seen": 21202512, "step": 36765 }, { "epoch": 5.476616026213882, "grad_norm": 1.1188700199127197, "learning_rate": 2.52053611263667e-05, "loss": 0.5843, "num_input_tokens_seen": 21205424, "step": 36770 }, { "epoch": 5.47736073875484, "grad_norm": 1.1709562540054321, "learning_rate": 2.5198862495856106e-05, "loss": 0.6318, "num_input_tokens_seen": 21208400, "step": 36775 }, { "epoch": 5.4781054512958, "grad_norm": 3.0711050033569336, "learning_rate": 2.519236385190719e-05, "loss": 0.6811, "num_input_tokens_seen": 21210960, "step": 36780 }, { "epoch": 5.478850163836759, "grad_norm": 2.2708852291107178, "learning_rate": 2.5185865194959103e-05, "loss": 0.5435, "num_input_tokens_seen": 21213776, "step": 36785 }, { "epoch": 5.4795948763777185, "grad_norm": 1.904211401939392, "learning_rate": 2.517936652545101e-05, "loss": 0.6836, "num_input_tokens_seen": 21216464, "step": 36790 }, { "epoch": 5.480339588918677, "grad_norm": 2.2653632164001465, "learning_rate": 2.5172867843822046e-05, "loss": 0.5762, "num_input_tokens_seen": 21219248, "step": 36795 }, { "epoch": 5.481084301459637, "grad_norm": 1.2206310033798218, "learning_rate": 2.516636915051138e-05, "loss": 0.5974, "num_input_tokens_seen": 21222096, "step": 36800 }, { "epoch": 5.481829014000596, "grad_norm": 1.3003276586532593, "learning_rate": 2.515987044595817e-05, "loss": 0.5997, "num_input_tokens_seen": 21224752, "step": 36805 }, { "epoch": 5.482573726541555, "grad_norm": 2.42842173576355, "learning_rate": 2.5153371730601556e-05, "loss": 0.666, "num_input_tokens_seen": 21227664, "step": 36810 }, { "epoch": 5.483318439082514, "grad_norm": 2.0334243774414062, "learning_rate": 2.51468730048807e-05, "loss": 0.6151, "num_input_tokens_seen": 21230512, "step": 36815 }, { "epoch": 5.484063151623474, "grad_norm": 1.2616665363311768, "learning_rate": 2.5140374269234772e-05, "loss": 0.7202, "num_input_tokens_seen": 21233328, "step": 36820 }, { "epoch": 5.4848078641644324, "grad_norm": 1.942561388015747, "learning_rate": 2.5133875524102922e-05, "loss": 0.4693, "num_input_tokens_seen": 21236240, "step": 36825 }, { "epoch": 5.485552576705392, "grad_norm": 1.0836507081985474, "learning_rate": 2.5127376769924306e-05, "loss": 0.6141, "num_input_tokens_seen": 21239024, "step": 36830 }, { "epoch": 5.486297289246351, "grad_norm": 1.124658226966858, "learning_rate": 2.512087800713808e-05, "loss": 0.6056, "num_input_tokens_seen": 21241936, "step": 36835 }, { "epoch": 5.4870420017873105, "grad_norm": 2.1538360118865967, "learning_rate": 2.511437923618343e-05, "loss": 0.5053, "num_input_tokens_seen": 21244880, "step": 36840 }, { "epoch": 5.487786714328269, "grad_norm": 3.1691861152648926, "learning_rate": 2.510788045749948e-05, "loss": 0.6644, "num_input_tokens_seen": 21247536, "step": 36845 }, { "epoch": 5.488531426869228, "grad_norm": 1.374338984489441, "learning_rate": 2.5101381671525404e-05, "loss": 0.5807, "num_input_tokens_seen": 21250992, "step": 36850 }, { "epoch": 5.489276139410188, "grad_norm": 1.6337863206863403, "learning_rate": 2.5094882878700372e-05, "loss": 0.5734, "num_input_tokens_seen": 21253776, "step": 36855 }, { "epoch": 5.490020851951147, "grad_norm": 2.3788974285125732, "learning_rate": 2.5088384079463544e-05, "loss": 0.6536, "num_input_tokens_seen": 21256432, "step": 36860 }, { "epoch": 5.490765564492106, "grad_norm": 1.7628134489059448, "learning_rate": 2.5081885274254076e-05, "loss": 0.5421, "num_input_tokens_seen": 21259216, "step": 36865 }, { "epoch": 5.491510277033065, "grad_norm": 1.3993496894836426, "learning_rate": 2.5075386463511135e-05, "loss": 0.6099, "num_input_tokens_seen": 21262352, "step": 36870 }, { "epoch": 5.4922549895740245, "grad_norm": 2.434864044189453, "learning_rate": 2.5068887647673878e-05, "loss": 0.6006, "num_input_tokens_seen": 21265136, "step": 36875 }, { "epoch": 5.492999702114983, "grad_norm": 3.7980222702026367, "learning_rate": 2.5062388827181483e-05, "loss": 0.6293, "num_input_tokens_seen": 21267920, "step": 36880 }, { "epoch": 5.493744414655943, "grad_norm": 1.3871567249298096, "learning_rate": 2.5055890002473097e-05, "loss": 0.6628, "num_input_tokens_seen": 21270736, "step": 36885 }, { "epoch": 5.494489127196902, "grad_norm": 1.4901676177978516, "learning_rate": 2.5049391173987896e-05, "loss": 0.618, "num_input_tokens_seen": 21273776, "step": 36890 }, { "epoch": 5.495233839737861, "grad_norm": 2.736344575881958, "learning_rate": 2.5042892342165036e-05, "loss": 0.54, "num_input_tokens_seen": 21276688, "step": 36895 }, { "epoch": 5.49597855227882, "grad_norm": 1.9366976022720337, "learning_rate": 2.5036393507443694e-05, "loss": 0.5514, "num_input_tokens_seen": 21279376, "step": 36900 }, { "epoch": 5.49672326481978, "grad_norm": 1.8962616920471191, "learning_rate": 2.5029894670263025e-05, "loss": 0.599, "num_input_tokens_seen": 21282096, "step": 36905 }, { "epoch": 5.4974679773607384, "grad_norm": 3.012939453125, "learning_rate": 2.502339583106219e-05, "loss": 0.7126, "num_input_tokens_seen": 21285456, "step": 36910 }, { "epoch": 5.498212689901698, "grad_norm": 1.7464762926101685, "learning_rate": 2.5016896990280357e-05, "loss": 0.7365, "num_input_tokens_seen": 21288464, "step": 36915 }, { "epoch": 5.498957402442657, "grad_norm": 1.830285668373108, "learning_rate": 2.50103981483567e-05, "loss": 0.7768, "num_input_tokens_seen": 21291472, "step": 36920 }, { "epoch": 5.4997021149836165, "grad_norm": 1.5761915445327759, "learning_rate": 2.5003899305730383e-05, "loss": 0.4672, "num_input_tokens_seen": 21294256, "step": 36925 }, { "epoch": 5.5, "eval_loss": 0.6592589616775513, "eval_runtime": 74.3948, "eval_samples_per_second": 40.11, "eval_steps_per_second": 10.028, "num_input_tokens_seen": 21295472, "step": 36927 }, { "epoch": 5.500446827524575, "grad_norm": 3.411583662033081, "learning_rate": 2.499740046284056e-05, "loss": 0.6886, "num_input_tokens_seen": 21297424, "step": 36930 }, { "epoch": 5.501191540065535, "grad_norm": 1.783234715461731, "learning_rate": 2.49909016201264e-05, "loss": 0.5999, "num_input_tokens_seen": 21300368, "step": 36935 }, { "epoch": 5.501936252606494, "grad_norm": 1.0411428213119507, "learning_rate": 2.498440277802708e-05, "loss": 0.4958, "num_input_tokens_seen": 21303152, "step": 36940 }, { "epoch": 5.502680965147453, "grad_norm": 1.9099277257919312, "learning_rate": 2.497790393698175e-05, "loss": 0.5292, "num_input_tokens_seen": 21306256, "step": 36945 }, { "epoch": 5.503425677688412, "grad_norm": 1.2859195470809937, "learning_rate": 2.4971405097429595e-05, "loss": 0.4821, "num_input_tokens_seen": 21308880, "step": 36950 }, { "epoch": 5.504170390229372, "grad_norm": 4.007816791534424, "learning_rate": 2.4964906259809754e-05, "loss": 0.6783, "num_input_tokens_seen": 21311536, "step": 36955 }, { "epoch": 5.5049151027703305, "grad_norm": 1.2957327365875244, "learning_rate": 2.495840742456141e-05, "loss": 0.6875, "num_input_tokens_seen": 21314352, "step": 36960 }, { "epoch": 5.50565981531129, "grad_norm": 1.433729887008667, "learning_rate": 2.495190859212372e-05, "loss": 0.3848, "num_input_tokens_seen": 21317008, "step": 36965 }, { "epoch": 5.506404527852249, "grad_norm": 2.4453563690185547, "learning_rate": 2.4945409762935855e-05, "loss": 0.5121, "num_input_tokens_seen": 21320080, "step": 36970 }, { "epoch": 5.5071492403932085, "grad_norm": 1.1752521991729736, "learning_rate": 2.4938910937436974e-05, "loss": 0.7067, "num_input_tokens_seen": 21322672, "step": 36975 }, { "epoch": 5.507893952934167, "grad_norm": 2.254100799560547, "learning_rate": 2.4932412116066243e-05, "loss": 0.6563, "num_input_tokens_seen": 21325840, "step": 36980 }, { "epoch": 5.508638665475127, "grad_norm": 2.825460195541382, "learning_rate": 2.492591329926283e-05, "loss": 0.5481, "num_input_tokens_seen": 21328720, "step": 36985 }, { "epoch": 5.509383378016086, "grad_norm": 1.6798826456069946, "learning_rate": 2.49194144874659e-05, "loss": 0.6746, "num_input_tokens_seen": 21331472, "step": 36990 }, { "epoch": 5.510128090557045, "grad_norm": 1.7020612955093384, "learning_rate": 2.4912915681114603e-05, "loss": 0.5855, "num_input_tokens_seen": 21334608, "step": 36995 }, { "epoch": 5.510872803098004, "grad_norm": 1.483838677406311, "learning_rate": 2.490641688064811e-05, "loss": 0.5853, "num_input_tokens_seen": 21337392, "step": 37000 }, { "epoch": 5.511617515638964, "grad_norm": 0.8975939154624939, "learning_rate": 2.4899918086505585e-05, "loss": 0.537, "num_input_tokens_seen": 21339984, "step": 37005 }, { "epoch": 5.5123622281799225, "grad_norm": 2.2250914573669434, "learning_rate": 2.489341929912619e-05, "loss": 0.5708, "num_input_tokens_seen": 21342512, "step": 37010 }, { "epoch": 5.513106940720881, "grad_norm": 4.629725456237793, "learning_rate": 2.488692051894908e-05, "loss": 0.644, "num_input_tokens_seen": 21345552, "step": 37015 }, { "epoch": 5.513851653261841, "grad_norm": 1.8861559629440308, "learning_rate": 2.488042174641343e-05, "loss": 0.5467, "num_input_tokens_seen": 21348336, "step": 37020 }, { "epoch": 5.5145963658028005, "grad_norm": 1.6068766117095947, "learning_rate": 2.4873922981958383e-05, "loss": 0.4946, "num_input_tokens_seen": 21351120, "step": 37025 }, { "epoch": 5.515341078343759, "grad_norm": 2.447471857070923, "learning_rate": 2.4867424226023123e-05, "loss": 0.6472, "num_input_tokens_seen": 21353968, "step": 37030 }, { "epoch": 5.516085790884718, "grad_norm": 2.698235273361206, "learning_rate": 2.486092547904678e-05, "loss": 0.5705, "num_input_tokens_seen": 21356688, "step": 37035 }, { "epoch": 5.516830503425678, "grad_norm": 2.063105583190918, "learning_rate": 2.4854426741468537e-05, "loss": 0.6724, "num_input_tokens_seen": 21359856, "step": 37040 }, { "epoch": 5.517575215966637, "grad_norm": 1.4375665187835693, "learning_rate": 2.4847928013727537e-05, "loss": 0.6013, "num_input_tokens_seen": 21362896, "step": 37045 }, { "epoch": 5.518319928507596, "grad_norm": 3.5761516094207764, "learning_rate": 2.484142929626294e-05, "loss": 0.5766, "num_input_tokens_seen": 21365776, "step": 37050 }, { "epoch": 5.519064641048555, "grad_norm": 1.6541239023208618, "learning_rate": 2.4834930589513915e-05, "loss": 0.6587, "num_input_tokens_seen": 21369136, "step": 37055 }, { "epoch": 5.5198093535895145, "grad_norm": 1.2704174518585205, "learning_rate": 2.4828431893919608e-05, "loss": 0.5296, "num_input_tokens_seen": 21372144, "step": 37060 }, { "epoch": 5.520554066130473, "grad_norm": 3.9836792945861816, "learning_rate": 2.482193320991917e-05, "loss": 0.5295, "num_input_tokens_seen": 21374768, "step": 37065 }, { "epoch": 5.521298778671433, "grad_norm": 2.443105936050415, "learning_rate": 2.4815434537951773e-05, "loss": 0.6373, "num_input_tokens_seen": 21377552, "step": 37070 }, { "epoch": 5.522043491212392, "grad_norm": 2.3519420623779297, "learning_rate": 2.4808935878456556e-05, "loss": 0.6664, "num_input_tokens_seen": 21380816, "step": 37075 }, { "epoch": 5.522788203753351, "grad_norm": 2.340113878250122, "learning_rate": 2.480243723187267e-05, "loss": 0.702, "num_input_tokens_seen": 21383472, "step": 37080 }, { "epoch": 5.52353291629431, "grad_norm": 2.270034074783325, "learning_rate": 2.4795938598639273e-05, "loss": 0.5293, "num_input_tokens_seen": 21386608, "step": 37085 }, { "epoch": 5.52427762883527, "grad_norm": 2.968876361846924, "learning_rate": 2.478943997919552e-05, "loss": 0.5726, "num_input_tokens_seen": 21389456, "step": 37090 }, { "epoch": 5.5250223413762285, "grad_norm": 2.0359227657318115, "learning_rate": 2.4782941373980552e-05, "loss": 0.5648, "num_input_tokens_seen": 21392624, "step": 37095 }, { "epoch": 5.525767053917188, "grad_norm": 1.404469609260559, "learning_rate": 2.4776442783433523e-05, "loss": 0.5232, "num_input_tokens_seen": 21395504, "step": 37100 }, { "epoch": 5.526511766458147, "grad_norm": 1.8968600034713745, "learning_rate": 2.476994420799359e-05, "loss": 0.6212, "num_input_tokens_seen": 21398064, "step": 37105 }, { "epoch": 5.5272564789991065, "grad_norm": 1.836285948753357, "learning_rate": 2.4763445648099894e-05, "loss": 0.7065, "num_input_tokens_seen": 21401296, "step": 37110 }, { "epoch": 5.528001191540065, "grad_norm": 2.2790069580078125, "learning_rate": 2.4756947104191573e-05, "loss": 0.4205, "num_input_tokens_seen": 21403952, "step": 37115 }, { "epoch": 5.528745904081025, "grad_norm": 1.3289450407028198, "learning_rate": 2.4750448576707773e-05, "loss": 0.7208, "num_input_tokens_seen": 21406608, "step": 37120 }, { "epoch": 5.529490616621984, "grad_norm": 0.9912965297698975, "learning_rate": 2.474395006608765e-05, "loss": 0.4289, "num_input_tokens_seen": 21409424, "step": 37125 }, { "epoch": 5.530235329162943, "grad_norm": 2.7723119258880615, "learning_rate": 2.4737451572770337e-05, "loss": 0.571, "num_input_tokens_seen": 21412176, "step": 37130 }, { "epoch": 5.530980041703902, "grad_norm": 1.1641998291015625, "learning_rate": 2.4730953097194987e-05, "loss": 0.6225, "num_input_tokens_seen": 21415152, "step": 37135 }, { "epoch": 5.531724754244862, "grad_norm": 1.3262791633605957, "learning_rate": 2.4724454639800724e-05, "loss": 0.6458, "num_input_tokens_seen": 21418384, "step": 37140 }, { "epoch": 5.5324694667858205, "grad_norm": 2.755524158477783, "learning_rate": 2.4717956201026694e-05, "loss": 0.8471, "num_input_tokens_seen": 21421392, "step": 37145 }, { "epoch": 5.53321417932678, "grad_norm": 1.773748517036438, "learning_rate": 2.4711457781312052e-05, "loss": 0.4779, "num_input_tokens_seen": 21424272, "step": 37150 }, { "epoch": 5.533958891867739, "grad_norm": 1.9496639966964722, "learning_rate": 2.4704959381095914e-05, "loss": 0.6874, "num_input_tokens_seen": 21427472, "step": 37155 }, { "epoch": 5.534703604408699, "grad_norm": 2.446875810623169, "learning_rate": 2.4698461000817415e-05, "loss": 0.7357, "num_input_tokens_seen": 21430480, "step": 37160 }, { "epoch": 5.535448316949657, "grad_norm": 2.9010837078094482, "learning_rate": 2.46919626409157e-05, "loss": 0.5918, "num_input_tokens_seen": 21433616, "step": 37165 }, { "epoch": 5.536193029490617, "grad_norm": 2.852053642272949, "learning_rate": 2.46854643018299e-05, "loss": 0.6364, "num_input_tokens_seen": 21436432, "step": 37170 }, { "epoch": 5.536937742031576, "grad_norm": 1.6766092777252197, "learning_rate": 2.4678965983999133e-05, "loss": 0.6204, "num_input_tokens_seen": 21439376, "step": 37175 }, { "epoch": 5.537682454572535, "grad_norm": 1.6481330394744873, "learning_rate": 2.4672467687862545e-05, "loss": 0.6563, "num_input_tokens_seen": 21442224, "step": 37180 }, { "epoch": 5.538427167113494, "grad_norm": 1.5945831537246704, "learning_rate": 2.4665969413859264e-05, "loss": 0.6908, "num_input_tokens_seen": 21445040, "step": 37185 }, { "epoch": 5.539171879654454, "grad_norm": 2.2498648166656494, "learning_rate": 2.4659471162428404e-05, "loss": 0.6029, "num_input_tokens_seen": 21447728, "step": 37190 }, { "epoch": 5.5399165921954125, "grad_norm": 2.3828864097595215, "learning_rate": 2.4652972934009112e-05, "loss": 0.6129, "num_input_tokens_seen": 21450640, "step": 37195 }, { "epoch": 5.540661304736371, "grad_norm": 1.5955413579940796, "learning_rate": 2.4646474729040486e-05, "loss": 0.5387, "num_input_tokens_seen": 21453616, "step": 37200 }, { "epoch": 5.541406017277331, "grad_norm": 3.119131088256836, "learning_rate": 2.4639976547961665e-05, "loss": 0.5947, "num_input_tokens_seen": 21456624, "step": 37205 }, { "epoch": 5.542150729818291, "grad_norm": 1.738081932067871, "learning_rate": 2.4633478391211762e-05, "loss": 0.6194, "num_input_tokens_seen": 21459280, "step": 37210 }, { "epoch": 5.542895442359249, "grad_norm": 1.785227656364441, "learning_rate": 2.46269802592299e-05, "loss": 0.8001, "num_input_tokens_seen": 21462192, "step": 37215 }, { "epoch": 5.543640154900208, "grad_norm": 2.3608241081237793, "learning_rate": 2.4620482152455197e-05, "loss": 0.5839, "num_input_tokens_seen": 21465008, "step": 37220 }, { "epoch": 5.544384867441168, "grad_norm": 1.7873313426971436, "learning_rate": 2.4613984071326762e-05, "loss": 0.5783, "num_input_tokens_seen": 21467760, "step": 37225 }, { "epoch": 5.5451295799821265, "grad_norm": 1.9351762533187866, "learning_rate": 2.4607486016283717e-05, "loss": 0.7038, "num_input_tokens_seen": 21470576, "step": 37230 }, { "epoch": 5.545874292523086, "grad_norm": 2.2093513011932373, "learning_rate": 2.4600987987765183e-05, "loss": 0.3802, "num_input_tokens_seen": 21473840, "step": 37235 }, { "epoch": 5.546619005064045, "grad_norm": 2.49798846244812, "learning_rate": 2.459448998621025e-05, "loss": 0.4973, "num_input_tokens_seen": 21476848, "step": 37240 }, { "epoch": 5.547363717605005, "grad_norm": 1.4756196737289429, "learning_rate": 2.458799201205803e-05, "loss": 0.8171, "num_input_tokens_seen": 21479632, "step": 37245 }, { "epoch": 5.548108430145963, "grad_norm": 1.2251503467559814, "learning_rate": 2.4581494065747634e-05, "loss": 0.4878, "num_input_tokens_seen": 21482576, "step": 37250 }, { "epoch": 5.548853142686923, "grad_norm": 3.0097498893737793, "learning_rate": 2.4574996147718175e-05, "loss": 0.5079, "num_input_tokens_seen": 21485360, "step": 37255 }, { "epoch": 5.549597855227882, "grad_norm": 4.687350273132324, "learning_rate": 2.456849825840874e-05, "loss": 0.9374, "num_input_tokens_seen": 21488144, "step": 37260 }, { "epoch": 5.550342567768841, "grad_norm": 1.1690388917922974, "learning_rate": 2.4562000398258442e-05, "loss": 0.5154, "num_input_tokens_seen": 21490960, "step": 37265 }, { "epoch": 5.5510872803098, "grad_norm": 3.5646584033966064, "learning_rate": 2.455550256770638e-05, "loss": 0.711, "num_input_tokens_seen": 21494192, "step": 37270 }, { "epoch": 5.55183199285076, "grad_norm": 2.0924479961395264, "learning_rate": 2.454900476719165e-05, "loss": 0.5794, "num_input_tokens_seen": 21497072, "step": 37275 }, { "epoch": 5.5525767053917185, "grad_norm": 4.977083206176758, "learning_rate": 2.454250699715334e-05, "loss": 0.6596, "num_input_tokens_seen": 21499824, "step": 37280 }, { "epoch": 5.553321417932678, "grad_norm": 1.550524115562439, "learning_rate": 2.453600925803054e-05, "loss": 0.6461, "num_input_tokens_seen": 21502640, "step": 37285 }, { "epoch": 5.554066130473637, "grad_norm": 2.9795970916748047, "learning_rate": 2.4529511550262357e-05, "loss": 0.8666, "num_input_tokens_seen": 21505680, "step": 37290 }, { "epoch": 5.554810843014597, "grad_norm": 1.2327768802642822, "learning_rate": 2.4523013874287863e-05, "loss": 0.4476, "num_input_tokens_seen": 21508720, "step": 37295 }, { "epoch": 5.555555555555555, "grad_norm": 1.8740981817245483, "learning_rate": 2.451651623054616e-05, "loss": 0.5751, "num_input_tokens_seen": 21511472, "step": 37300 }, { "epoch": 5.556300268096515, "grad_norm": 2.8438265323638916, "learning_rate": 2.451001861947632e-05, "loss": 0.5118, "num_input_tokens_seen": 21514320, "step": 37305 }, { "epoch": 5.557044980637474, "grad_norm": 1.4698600769042969, "learning_rate": 2.4503521041517426e-05, "loss": 0.637, "num_input_tokens_seen": 21517104, "step": 37310 }, { "epoch": 5.557789693178433, "grad_norm": 1.474575161933899, "learning_rate": 2.4497023497108575e-05, "loss": 0.578, "num_input_tokens_seen": 21519984, "step": 37315 }, { "epoch": 5.558534405719392, "grad_norm": 1.155646800994873, "learning_rate": 2.4490525986688826e-05, "loss": 0.6859, "num_input_tokens_seen": 21523120, "step": 37320 }, { "epoch": 5.559279118260352, "grad_norm": 1.5118556022644043, "learning_rate": 2.4484028510697253e-05, "loss": 0.6046, "num_input_tokens_seen": 21525872, "step": 37325 }, { "epoch": 5.560023830801311, "grad_norm": 1.772446870803833, "learning_rate": 2.4477531069572934e-05, "loss": 0.5942, "num_input_tokens_seen": 21528752, "step": 37330 }, { "epoch": 5.56076854334227, "grad_norm": 1.3257650136947632, "learning_rate": 2.447103366375495e-05, "loss": 0.6525, "num_input_tokens_seen": 21531280, "step": 37335 }, { "epoch": 5.561513255883229, "grad_norm": 1.031463861465454, "learning_rate": 2.4464536293682353e-05, "loss": 0.6359, "num_input_tokens_seen": 21534576, "step": 37340 }, { "epoch": 5.562257968424189, "grad_norm": 4.092581748962402, "learning_rate": 2.4458038959794218e-05, "loss": 0.6865, "num_input_tokens_seen": 21537488, "step": 37345 }, { "epoch": 5.563002680965147, "grad_norm": 1.5770838260650635, "learning_rate": 2.4451541662529605e-05, "loss": 0.4761, "num_input_tokens_seen": 21540336, "step": 37350 }, { "epoch": 5.563747393506107, "grad_norm": 1.4142292737960815, "learning_rate": 2.444504440232759e-05, "loss": 0.465, "num_input_tokens_seen": 21543024, "step": 37355 }, { "epoch": 5.564492106047066, "grad_norm": 2.1213998794555664, "learning_rate": 2.4438547179627203e-05, "loss": 0.7162, "num_input_tokens_seen": 21545872, "step": 37360 }, { "epoch": 5.5652368185880245, "grad_norm": 1.189584732055664, "learning_rate": 2.443204999486752e-05, "loss": 0.5972, "num_input_tokens_seen": 21549008, "step": 37365 }, { "epoch": 5.565981531128984, "grad_norm": 3.6350038051605225, "learning_rate": 2.4425552848487588e-05, "loss": 0.8129, "num_input_tokens_seen": 21551632, "step": 37370 }, { "epoch": 5.566726243669944, "grad_norm": 1.2755491733551025, "learning_rate": 2.4419055740926456e-05, "loss": 0.6687, "num_input_tokens_seen": 21554416, "step": 37375 }, { "epoch": 5.567470956210903, "grad_norm": 2.4232442378997803, "learning_rate": 2.4412558672623177e-05, "loss": 0.7506, "num_input_tokens_seen": 21557040, "step": 37380 }, { "epoch": 5.568215668751861, "grad_norm": 2.2093920707702637, "learning_rate": 2.44060616440168e-05, "loss": 0.5274, "num_input_tokens_seen": 21559888, "step": 37385 }, { "epoch": 5.568960381292821, "grad_norm": 2.7165727615356445, "learning_rate": 2.4399564655546354e-05, "loss": 0.6325, "num_input_tokens_seen": 21563088, "step": 37390 }, { "epoch": 5.569705093833781, "grad_norm": 1.636299729347229, "learning_rate": 2.43930677076509e-05, "loss": 0.5117, "num_input_tokens_seen": 21565744, "step": 37395 }, { "epoch": 5.570449806374739, "grad_norm": 2.6197712421417236, "learning_rate": 2.4386570800769447e-05, "loss": 0.577, "num_input_tokens_seen": 21568496, "step": 37400 }, { "epoch": 5.571194518915698, "grad_norm": 2.6383473873138428, "learning_rate": 2.438007393534106e-05, "loss": 0.5985, "num_input_tokens_seen": 21571120, "step": 37405 }, { "epoch": 5.571939231456658, "grad_norm": 2.749032735824585, "learning_rate": 2.4373577111804744e-05, "loss": 0.7462, "num_input_tokens_seen": 21573712, "step": 37410 }, { "epoch": 5.572683943997617, "grad_norm": 1.381867527961731, "learning_rate": 2.436708033059954e-05, "loss": 0.475, "num_input_tokens_seen": 21576240, "step": 37415 }, { "epoch": 5.573428656538576, "grad_norm": 2.3394415378570557, "learning_rate": 2.4360583592164483e-05, "loss": 0.7115, "num_input_tokens_seen": 21579248, "step": 37420 }, { "epoch": 5.574173369079535, "grad_norm": 2.549323320388794, "learning_rate": 2.435408689693858e-05, "loss": 0.359, "num_input_tokens_seen": 21582160, "step": 37425 }, { "epoch": 5.574918081620495, "grad_norm": 2.071272373199463, "learning_rate": 2.4347590245360857e-05, "loss": 0.6467, "num_input_tokens_seen": 21585360, "step": 37430 }, { "epoch": 5.575662794161453, "grad_norm": 1.9432929754257202, "learning_rate": 2.4341093637870345e-05, "loss": 0.5769, "num_input_tokens_seen": 21588464, "step": 37435 }, { "epoch": 5.576407506702413, "grad_norm": 1.1589691638946533, "learning_rate": 2.433459707490604e-05, "loss": 0.4329, "num_input_tokens_seen": 21591504, "step": 37440 }, { "epoch": 5.577152219243372, "grad_norm": 1.321603536605835, "learning_rate": 2.4328100556906956e-05, "loss": 0.7015, "num_input_tokens_seen": 21594672, "step": 37445 }, { "epoch": 5.577896931784331, "grad_norm": 2.8812038898468018, "learning_rate": 2.4321604084312103e-05, "loss": 0.6582, "num_input_tokens_seen": 21597712, "step": 37450 }, { "epoch": 5.57864164432529, "grad_norm": 2.665739059448242, "learning_rate": 2.4315107657560492e-05, "loss": 0.5611, "num_input_tokens_seen": 21600656, "step": 37455 }, { "epoch": 5.57938635686625, "grad_norm": 2.0868115425109863, "learning_rate": 2.4308611277091118e-05, "loss": 0.3912, "num_input_tokens_seen": 21603312, "step": 37460 }, { "epoch": 5.580131069407209, "grad_norm": 2.533989191055298, "learning_rate": 2.4302114943342986e-05, "loss": 0.6519, "num_input_tokens_seen": 21605872, "step": 37465 }, { "epoch": 5.580875781948168, "grad_norm": 2.074361801147461, "learning_rate": 2.4295618656755084e-05, "loss": 0.6821, "num_input_tokens_seen": 21608624, "step": 37470 }, { "epoch": 5.581620494489127, "grad_norm": 1.6619778871536255, "learning_rate": 2.4289122417766422e-05, "loss": 0.6249, "num_input_tokens_seen": 21611408, "step": 37475 }, { "epoch": 5.582365207030087, "grad_norm": 4.50753927230835, "learning_rate": 2.4282626226815963e-05, "loss": 0.5938, "num_input_tokens_seen": 21614192, "step": 37480 }, { "epoch": 5.583109919571045, "grad_norm": 2.19443678855896, "learning_rate": 2.4276130084342714e-05, "loss": 0.6446, "num_input_tokens_seen": 21617200, "step": 37485 }, { "epoch": 5.583854632112005, "grad_norm": 1.6805344820022583, "learning_rate": 2.4269633990785645e-05, "loss": 0.6166, "num_input_tokens_seen": 21619984, "step": 37490 }, { "epoch": 5.584599344652964, "grad_norm": 1.1466426849365234, "learning_rate": 2.4263137946583743e-05, "loss": 0.5992, "num_input_tokens_seen": 21622832, "step": 37495 }, { "epoch": 5.5853440571939235, "grad_norm": 2.5110833644866943, "learning_rate": 2.4256641952175983e-05, "loss": 0.6057, "num_input_tokens_seen": 21625680, "step": 37500 }, { "epoch": 5.586088769734882, "grad_norm": 2.0853590965270996, "learning_rate": 2.425014600800134e-05, "loss": 0.6987, "num_input_tokens_seen": 21628336, "step": 37505 }, { "epoch": 5.586833482275842, "grad_norm": 1.5591068267822266, "learning_rate": 2.4243650114498776e-05, "loss": 0.6304, "num_input_tokens_seen": 21631280, "step": 37510 }, { "epoch": 5.587578194816801, "grad_norm": 2.7052173614501953, "learning_rate": 2.4237154272107274e-05, "loss": 0.7236, "num_input_tokens_seen": 21634224, "step": 37515 }, { "epoch": 5.58832290735776, "grad_norm": 1.1450614929199219, "learning_rate": 2.423065848126578e-05, "loss": 0.6433, "num_input_tokens_seen": 21637168, "step": 37520 }, { "epoch": 5.589067619898719, "grad_norm": 1.6834430694580078, "learning_rate": 2.4224162742413252e-05, "loss": 0.5505, "num_input_tokens_seen": 21640144, "step": 37525 }, { "epoch": 5.589812332439678, "grad_norm": 2.9233956336975098, "learning_rate": 2.421766705598865e-05, "loss": 0.6794, "num_input_tokens_seen": 21642896, "step": 37530 }, { "epoch": 5.590557044980637, "grad_norm": 2.824148178100586, "learning_rate": 2.4211171422430937e-05, "loss": 0.6556, "num_input_tokens_seen": 21646160, "step": 37535 }, { "epoch": 5.591301757521597, "grad_norm": 1.8466440439224243, "learning_rate": 2.4204675842179046e-05, "loss": 0.6964, "num_input_tokens_seen": 21648976, "step": 37540 }, { "epoch": 5.592046470062556, "grad_norm": 3.9205918312072754, "learning_rate": 2.4198180315671927e-05, "loss": 0.683, "num_input_tokens_seen": 21651728, "step": 37545 }, { "epoch": 5.592791182603515, "grad_norm": 2.5601184368133545, "learning_rate": 2.4191684843348524e-05, "loss": 0.6794, "num_input_tokens_seen": 21654512, "step": 37550 }, { "epoch": 5.593535895144474, "grad_norm": 1.3931114673614502, "learning_rate": 2.418518942564778e-05, "loss": 0.4025, "num_input_tokens_seen": 21657296, "step": 37555 }, { "epoch": 5.594280607685434, "grad_norm": 2.360644578933716, "learning_rate": 2.4178694063008616e-05, "loss": 0.6185, "num_input_tokens_seen": 21660176, "step": 37560 }, { "epoch": 5.595025320226393, "grad_norm": 1.6941815614700317, "learning_rate": 2.4172198755869962e-05, "loss": 0.7664, "num_input_tokens_seen": 21663440, "step": 37565 }, { "epoch": 5.595770032767351, "grad_norm": 2.2023532390594482, "learning_rate": 2.4165703504670757e-05, "loss": 0.7011, "num_input_tokens_seen": 21666224, "step": 37570 }, { "epoch": 5.596514745308311, "grad_norm": 2.0391910076141357, "learning_rate": 2.4159208309849916e-05, "loss": 0.6396, "num_input_tokens_seen": 21669104, "step": 37575 }, { "epoch": 5.59725945784927, "grad_norm": 0.9211499691009521, "learning_rate": 2.4152713171846355e-05, "loss": 0.4561, "num_input_tokens_seen": 21671888, "step": 37580 }, { "epoch": 5.5980041703902295, "grad_norm": 1.4170780181884766, "learning_rate": 2.4146218091099e-05, "loss": 0.7441, "num_input_tokens_seen": 21674832, "step": 37585 }, { "epoch": 5.598748882931188, "grad_norm": 3.77712345123291, "learning_rate": 2.413972306804675e-05, "loss": 0.6088, "num_input_tokens_seen": 21677456, "step": 37590 }, { "epoch": 5.599493595472148, "grad_norm": 0.9169386625289917, "learning_rate": 2.4133228103128526e-05, "loss": 0.4449, "num_input_tokens_seen": 21680400, "step": 37595 }, { "epoch": 5.600238308013107, "grad_norm": 2.7597124576568604, "learning_rate": 2.4126733196783214e-05, "loss": 0.6593, "num_input_tokens_seen": 21683344, "step": 37600 }, { "epoch": 5.600983020554066, "grad_norm": 1.2977244853973389, "learning_rate": 2.4120238349449728e-05, "loss": 0.7041, "num_input_tokens_seen": 21686192, "step": 37605 }, { "epoch": 5.601727733095025, "grad_norm": 1.3661209344863892, "learning_rate": 2.411374356156695e-05, "loss": 0.6012, "num_input_tokens_seen": 21689008, "step": 37610 }, { "epoch": 5.602472445635985, "grad_norm": 1.9154703617095947, "learning_rate": 2.410724883357378e-05, "loss": 0.5547, "num_input_tokens_seen": 21691600, "step": 37615 }, { "epoch": 5.603217158176943, "grad_norm": 1.528151512145996, "learning_rate": 2.4100754165909108e-05, "loss": 0.7528, "num_input_tokens_seen": 21694288, "step": 37620 }, { "epoch": 5.603961870717903, "grad_norm": 1.4502801895141602, "learning_rate": 2.4094259559011813e-05, "loss": 0.5462, "num_input_tokens_seen": 21697232, "step": 37625 }, { "epoch": 5.604706583258862, "grad_norm": 2.341536521911621, "learning_rate": 2.4087765013320776e-05, "loss": 0.5346, "num_input_tokens_seen": 21699984, "step": 37630 }, { "epoch": 5.6054512957998215, "grad_norm": 1.5960313081741333, "learning_rate": 2.408127052927487e-05, "loss": 0.6819, "num_input_tokens_seen": 21702832, "step": 37635 }, { "epoch": 5.60619600834078, "grad_norm": 2.8743176460266113, "learning_rate": 2.407477610731297e-05, "loss": 0.6865, "num_input_tokens_seen": 21705680, "step": 37640 }, { "epoch": 5.60694072088174, "grad_norm": 2.0993473529815674, "learning_rate": 2.4068281747873927e-05, "loss": 0.7127, "num_input_tokens_seen": 21708624, "step": 37645 }, { "epoch": 5.607685433422699, "grad_norm": 0.7326064705848694, "learning_rate": 2.4061787451396626e-05, "loss": 0.6553, "num_input_tokens_seen": 21711344, "step": 37650 }, { "epoch": 5.608430145963658, "grad_norm": 1.6660559177398682, "learning_rate": 2.4055293218319907e-05, "loss": 0.5969, "num_input_tokens_seen": 21714672, "step": 37655 }, { "epoch": 5.609174858504617, "grad_norm": 1.3851583003997803, "learning_rate": 2.4048799049082632e-05, "loss": 0.6125, "num_input_tokens_seen": 21717616, "step": 37660 }, { "epoch": 5.609919571045577, "grad_norm": 1.477974772453308, "learning_rate": 2.4042304944123654e-05, "loss": 0.6056, "num_input_tokens_seen": 21720464, "step": 37665 }, { "epoch": 5.6106642835865355, "grad_norm": 1.7581068277359009, "learning_rate": 2.4035810903881813e-05, "loss": 0.4977, "num_input_tokens_seen": 21723312, "step": 37670 }, { "epoch": 5.611408996127495, "grad_norm": 2.3781890869140625, "learning_rate": 2.4029316928795958e-05, "loss": 0.7294, "num_input_tokens_seen": 21726384, "step": 37675 }, { "epoch": 5.612153708668454, "grad_norm": 3.962236166000366, "learning_rate": 2.402282301930491e-05, "loss": 0.696, "num_input_tokens_seen": 21729296, "step": 37680 }, { "epoch": 5.6128984212094135, "grad_norm": 1.5196354389190674, "learning_rate": 2.4016329175847514e-05, "loss": 0.5361, "num_input_tokens_seen": 21732208, "step": 37685 }, { "epoch": 5.613643133750372, "grad_norm": 1.6116191148757935, "learning_rate": 2.4009835398862588e-05, "loss": 0.5637, "num_input_tokens_seen": 21734960, "step": 37690 }, { "epoch": 5.614387846291332, "grad_norm": 1.9718842506408691, "learning_rate": 2.4003341688788958e-05, "loss": 0.8136, "num_input_tokens_seen": 21737744, "step": 37695 }, { "epoch": 5.615132558832291, "grad_norm": 1.905407190322876, "learning_rate": 2.399684804606545e-05, "loss": 0.6566, "num_input_tokens_seen": 21740592, "step": 37700 }, { "epoch": 5.61587727137325, "grad_norm": 1.5487055778503418, "learning_rate": 2.3990354471130873e-05, "loss": 0.6448, "num_input_tokens_seen": 21743440, "step": 37705 }, { "epoch": 5.616621983914209, "grad_norm": 1.7870376110076904, "learning_rate": 2.398386096442403e-05, "loss": 0.5375, "num_input_tokens_seen": 21746128, "step": 37710 }, { "epoch": 5.617366696455168, "grad_norm": 1.7632412910461426, "learning_rate": 2.3977367526383744e-05, "loss": 0.6296, "num_input_tokens_seen": 21749136, "step": 37715 }, { "epoch": 5.6181114089961275, "grad_norm": 0.8685010671615601, "learning_rate": 2.39708741574488e-05, "loss": 0.4822, "num_input_tokens_seen": 21751920, "step": 37720 }, { "epoch": 5.618856121537087, "grad_norm": 1.1254351139068604, "learning_rate": 2.3964380858057985e-05, "loss": 0.5568, "num_input_tokens_seen": 21754960, "step": 37725 }, { "epoch": 5.619600834078046, "grad_norm": 1.8414180278778076, "learning_rate": 2.3957887628650104e-05, "loss": 0.4774, "num_input_tokens_seen": 21757712, "step": 37730 }, { "epoch": 5.620345546619005, "grad_norm": 1.6995234489440918, "learning_rate": 2.3951394469663946e-05, "loss": 0.6351, "num_input_tokens_seen": 21760976, "step": 37735 }, { "epoch": 5.621090259159964, "grad_norm": 1.8952808380126953, "learning_rate": 2.394490138153828e-05, "loss": 0.6191, "num_input_tokens_seen": 21763952, "step": 37740 }, { "epoch": 5.621834971700923, "grad_norm": 1.9084769487380981, "learning_rate": 2.393840836471189e-05, "loss": 0.7159, "num_input_tokens_seen": 21766928, "step": 37745 }, { "epoch": 5.622579684241883, "grad_norm": 1.2044415473937988, "learning_rate": 2.3931915419623552e-05, "loss": 0.6139, "num_input_tokens_seen": 21769904, "step": 37750 }, { "epoch": 5.6233243967828415, "grad_norm": 4.042202949523926, "learning_rate": 2.3925422546712032e-05, "loss": 0.6807, "num_input_tokens_seen": 21772720, "step": 37755 }, { "epoch": 5.624069109323801, "grad_norm": 1.7613050937652588, "learning_rate": 2.3918929746416077e-05, "loss": 0.5637, "num_input_tokens_seen": 21775568, "step": 37760 }, { "epoch": 5.62481382186476, "grad_norm": 1.8794227838516235, "learning_rate": 2.3912437019174454e-05, "loss": 0.6337, "num_input_tokens_seen": 21778512, "step": 37765 }, { "epoch": 5.6255585344057195, "grad_norm": 1.4175949096679688, "learning_rate": 2.3905944365425922e-05, "loss": 0.6915, "num_input_tokens_seen": 21781360, "step": 37770 }, { "epoch": 5.626303246946678, "grad_norm": 2.201136350631714, "learning_rate": 2.3899451785609218e-05, "loss": 0.7824, "num_input_tokens_seen": 21784464, "step": 37775 }, { "epoch": 5.627047959487638, "grad_norm": 1.3197274208068848, "learning_rate": 2.3892959280163084e-05, "loss": 0.4372, "num_input_tokens_seen": 21787312, "step": 37780 }, { "epoch": 5.627792672028597, "grad_norm": 6.651480674743652, "learning_rate": 2.388646684952627e-05, "loss": 0.623, "num_input_tokens_seen": 21790256, "step": 37785 }, { "epoch": 5.628537384569556, "grad_norm": 1.5560922622680664, "learning_rate": 2.3879974494137487e-05, "loss": 0.6117, "num_input_tokens_seen": 21793008, "step": 37790 }, { "epoch": 5.629282097110515, "grad_norm": 1.1626167297363281, "learning_rate": 2.3873482214435486e-05, "loss": 0.4301, "num_input_tokens_seen": 21795984, "step": 37795 }, { "epoch": 5.630026809651475, "grad_norm": 2.7981879711151123, "learning_rate": 2.3866990010858976e-05, "loss": 0.564, "num_input_tokens_seen": 21798960, "step": 37800 }, { "epoch": 5.6307715221924335, "grad_norm": 2.0365664958953857, "learning_rate": 2.386049788384667e-05, "loss": 0.6431, "num_input_tokens_seen": 21801744, "step": 37805 }, { "epoch": 5.631516234733393, "grad_norm": 1.9296939373016357, "learning_rate": 2.3854005833837285e-05, "loss": 0.7561, "num_input_tokens_seen": 21804560, "step": 37810 }, { "epoch": 5.632260947274352, "grad_norm": 2.0316548347473145, "learning_rate": 2.384751386126953e-05, "loss": 0.6376, "num_input_tokens_seen": 21807344, "step": 37815 }, { "epoch": 5.6330056598153115, "grad_norm": 3.458010196685791, "learning_rate": 2.3841021966582095e-05, "loss": 0.4963, "num_input_tokens_seen": 21810032, "step": 37820 }, { "epoch": 5.63375037235627, "grad_norm": 1.9015785455703735, "learning_rate": 2.3834530150213686e-05, "loss": 0.4711, "num_input_tokens_seen": 21812816, "step": 37825 }, { "epoch": 5.63449508489723, "grad_norm": 1.4779855012893677, "learning_rate": 2.3828038412602993e-05, "loss": 0.5228, "num_input_tokens_seen": 21815312, "step": 37830 }, { "epoch": 5.635239797438189, "grad_norm": 1.4989489316940308, "learning_rate": 2.3821546754188698e-05, "loss": 0.645, "num_input_tokens_seen": 21818256, "step": 37835 }, { "epoch": 5.635984509979148, "grad_norm": 1.4004257917404175, "learning_rate": 2.381505517540949e-05, "loss": 0.7109, "num_input_tokens_seen": 21820976, "step": 37840 }, { "epoch": 5.636729222520107, "grad_norm": 1.0923517942428589, "learning_rate": 2.3808563676704027e-05, "loss": 0.726, "num_input_tokens_seen": 21823984, "step": 37845 }, { "epoch": 5.637473935061067, "grad_norm": 2.2444348335266113, "learning_rate": 2.3802072258510986e-05, "loss": 0.7929, "num_input_tokens_seen": 21827120, "step": 37850 }, { "epoch": 5.6382186476020255, "grad_norm": 2.405745506286621, "learning_rate": 2.3795580921269034e-05, "loss": 0.5427, "num_input_tokens_seen": 21830000, "step": 37855 }, { "epoch": 5.638963360142985, "grad_norm": 1.7522941827774048, "learning_rate": 2.378908966541682e-05, "loss": 0.536, "num_input_tokens_seen": 21832880, "step": 37860 }, { "epoch": 5.639708072683944, "grad_norm": 2.2388832569122314, "learning_rate": 2.3782598491393014e-05, "loss": 0.6833, "num_input_tokens_seen": 21835792, "step": 37865 }, { "epoch": 5.640452785224904, "grad_norm": 1.5046799182891846, "learning_rate": 2.3776107399636247e-05, "loss": 0.7052, "num_input_tokens_seen": 21838608, "step": 37870 }, { "epoch": 5.641197497765862, "grad_norm": 1.9729344844818115, "learning_rate": 2.376961639058516e-05, "loss": 0.6632, "num_input_tokens_seen": 21841264, "step": 37875 }, { "epoch": 5.641942210306821, "grad_norm": 1.2662371397018433, "learning_rate": 2.3763125464678414e-05, "loss": 0.5914, "num_input_tokens_seen": 21844336, "step": 37880 }, { "epoch": 5.642686922847781, "grad_norm": 1.71699857711792, "learning_rate": 2.3756634622354607e-05, "loss": 0.6269, "num_input_tokens_seen": 21847056, "step": 37885 }, { "epoch": 5.64343163538874, "grad_norm": 3.0888123512268066, "learning_rate": 2.3750143864052376e-05, "loss": 0.671, "num_input_tokens_seen": 21849680, "step": 37890 }, { "epoch": 5.644176347929699, "grad_norm": 1.6181933879852295, "learning_rate": 2.374365319021034e-05, "loss": 0.7038, "num_input_tokens_seen": 21852432, "step": 37895 }, { "epoch": 5.644921060470658, "grad_norm": 2.6305410861968994, "learning_rate": 2.373716260126712e-05, "loss": 0.634, "num_input_tokens_seen": 21855184, "step": 37900 }, { "epoch": 5.6456657730116175, "grad_norm": 2.071324110031128, "learning_rate": 2.373067209766131e-05, "loss": 0.6649, "num_input_tokens_seen": 21857936, "step": 37905 }, { "epoch": 5.646410485552577, "grad_norm": 1.2238701581954956, "learning_rate": 2.372418167983152e-05, "loss": 0.6153, "num_input_tokens_seen": 21860752, "step": 37910 }, { "epoch": 5.647155198093536, "grad_norm": 0.6542186141014099, "learning_rate": 2.371769134821635e-05, "loss": 0.4255, "num_input_tokens_seen": 21863728, "step": 37915 }, { "epoch": 5.647899910634495, "grad_norm": 4.183189392089844, "learning_rate": 2.371120110325439e-05, "loss": 0.7053, "num_input_tokens_seen": 21866768, "step": 37920 }, { "epoch": 5.648644623175454, "grad_norm": 2.545985460281372, "learning_rate": 2.370471094538421e-05, "loss": 0.7719, "num_input_tokens_seen": 21869904, "step": 37925 }, { "epoch": 5.649389335716413, "grad_norm": 2.898991107940674, "learning_rate": 2.3698220875044396e-05, "loss": 0.6487, "num_input_tokens_seen": 21872784, "step": 37930 }, { "epoch": 5.650134048257373, "grad_norm": 1.6586369276046753, "learning_rate": 2.369173089267353e-05, "loss": 0.5447, "num_input_tokens_seen": 21875472, "step": 37935 }, { "epoch": 5.6508787607983315, "grad_norm": 1.4729450941085815, "learning_rate": 2.3685240998710166e-05, "loss": 0.5113, "num_input_tokens_seen": 21878512, "step": 37940 }, { "epoch": 5.651623473339291, "grad_norm": 1.4878854751586914, "learning_rate": 2.367875119359287e-05, "loss": 0.5116, "num_input_tokens_seen": 21881264, "step": 37945 }, { "epoch": 5.65236818588025, "grad_norm": 2.661989450454712, "learning_rate": 2.36722614777602e-05, "loss": 0.3965, "num_input_tokens_seen": 21883760, "step": 37950 }, { "epoch": 5.65311289842121, "grad_norm": 2.7047815322875977, "learning_rate": 2.3665771851650697e-05, "loss": 0.7554, "num_input_tokens_seen": 21886480, "step": 37955 }, { "epoch": 5.653857610962168, "grad_norm": 1.0623186826705933, "learning_rate": 2.3659282315702918e-05, "loss": 0.4894, "num_input_tokens_seen": 21889456, "step": 37960 }, { "epoch": 5.654602323503128, "grad_norm": 7.087924003601074, "learning_rate": 2.365279287035538e-05, "loss": 0.5411, "num_input_tokens_seen": 21892080, "step": 37965 }, { "epoch": 5.655347036044087, "grad_norm": 2.2185776233673096, "learning_rate": 2.3646303516046626e-05, "loss": 0.5919, "num_input_tokens_seen": 21895152, "step": 37970 }, { "epoch": 5.656091748585046, "grad_norm": 1.3953428268432617, "learning_rate": 2.363981425321517e-05, "loss": 0.6133, "num_input_tokens_seen": 21898320, "step": 37975 }, { "epoch": 5.656836461126005, "grad_norm": 2.6502299308776855, "learning_rate": 2.3633325082299545e-05, "loss": 0.612, "num_input_tokens_seen": 21901040, "step": 37980 }, { "epoch": 5.657581173666965, "grad_norm": 2.2460854053497314, "learning_rate": 2.362683600373825e-05, "loss": 0.6955, "num_input_tokens_seen": 21903664, "step": 37985 }, { "epoch": 5.6583258862079235, "grad_norm": 1.4931310415267944, "learning_rate": 2.362034701796979e-05, "loss": 0.5762, "num_input_tokens_seen": 21906416, "step": 37990 }, { "epoch": 5.659070598748883, "grad_norm": 3.4965832233428955, "learning_rate": 2.3613858125432677e-05, "loss": 0.6596, "num_input_tokens_seen": 21908880, "step": 37995 }, { "epoch": 5.659815311289842, "grad_norm": 2.2293307781219482, "learning_rate": 2.3607369326565403e-05, "loss": 0.7202, "num_input_tokens_seen": 21911728, "step": 38000 }, { "epoch": 5.660560023830802, "grad_norm": 1.0454699993133545, "learning_rate": 2.3600880621806438e-05, "loss": 0.5349, "num_input_tokens_seen": 21914736, "step": 38005 }, { "epoch": 5.66130473637176, "grad_norm": 1.4158921241760254, "learning_rate": 2.359439201159427e-05, "loss": 0.4991, "num_input_tokens_seen": 21917424, "step": 38010 }, { "epoch": 5.66204944891272, "grad_norm": 3.436999559402466, "learning_rate": 2.3587903496367382e-05, "loss": 0.5655, "num_input_tokens_seen": 21920432, "step": 38015 }, { "epoch": 5.662794161453679, "grad_norm": 1.1357747316360474, "learning_rate": 2.3581415076564225e-05, "loss": 0.6024, "num_input_tokens_seen": 21923600, "step": 38020 }, { "epoch": 5.663538873994638, "grad_norm": 1.2127716541290283, "learning_rate": 2.3574926752623276e-05, "loss": 0.6948, "num_input_tokens_seen": 21926480, "step": 38025 }, { "epoch": 5.664283586535597, "grad_norm": 1.5395002365112305, "learning_rate": 2.3568438524982984e-05, "loss": 0.5688, "num_input_tokens_seen": 21929616, "step": 38030 }, { "epoch": 5.665028299076557, "grad_norm": 1.1588941812515259, "learning_rate": 2.3561950394081793e-05, "loss": 0.5234, "num_input_tokens_seen": 21932400, "step": 38035 }, { "epoch": 5.665773011617516, "grad_norm": 1.5273964405059814, "learning_rate": 2.3555462360358154e-05, "loss": 0.7071, "num_input_tokens_seen": 21935408, "step": 38040 }, { "epoch": 5.666517724158475, "grad_norm": 1.3559069633483887, "learning_rate": 2.3548974424250492e-05, "loss": 0.7685, "num_input_tokens_seen": 21938096, "step": 38045 }, { "epoch": 5.667262436699434, "grad_norm": 1.6439651250839233, "learning_rate": 2.3542486586197237e-05, "loss": 0.6384, "num_input_tokens_seen": 21941040, "step": 38050 }, { "epoch": 5.668007149240394, "grad_norm": 1.6889420747756958, "learning_rate": 2.3535998846636815e-05, "loss": 0.6376, "num_input_tokens_seen": 21943472, "step": 38055 }, { "epoch": 5.668751861781352, "grad_norm": 2.6856067180633545, "learning_rate": 2.352951120600763e-05, "loss": 0.6966, "num_input_tokens_seen": 21946352, "step": 38060 }, { "epoch": 5.669496574322311, "grad_norm": 3.8864359855651855, "learning_rate": 2.352302366474811e-05, "loss": 0.6787, "num_input_tokens_seen": 21948976, "step": 38065 }, { "epoch": 5.670241286863271, "grad_norm": 1.225530982017517, "learning_rate": 2.351653622329664e-05, "loss": 0.6482, "num_input_tokens_seen": 21951824, "step": 38070 }, { "epoch": 5.67098599940423, "grad_norm": 2.6001532077789307, "learning_rate": 2.351004888209162e-05, "loss": 0.5022, "num_input_tokens_seen": 21954736, "step": 38075 }, { "epoch": 5.671730711945189, "grad_norm": 1.2180535793304443, "learning_rate": 2.3503561641571455e-05, "loss": 0.6071, "num_input_tokens_seen": 21957360, "step": 38080 }, { "epoch": 5.672475424486148, "grad_norm": 2.513594388961792, "learning_rate": 2.3497074502174495e-05, "loss": 0.5809, "num_input_tokens_seen": 21959984, "step": 38085 }, { "epoch": 5.673220137027108, "grad_norm": 1.324857473373413, "learning_rate": 2.349058746433913e-05, "loss": 0.6495, "num_input_tokens_seen": 21962768, "step": 38090 }, { "epoch": 5.673964849568066, "grad_norm": 1.301364779472351, "learning_rate": 2.348410052850373e-05, "loss": 0.5757, "num_input_tokens_seen": 21965808, "step": 38095 }, { "epoch": 5.674709562109026, "grad_norm": 1.4261109828948975, "learning_rate": 2.347761369510665e-05, "loss": 0.5974, "num_input_tokens_seen": 21968688, "step": 38100 }, { "epoch": 5.675454274649985, "grad_norm": 2.0136029720306396, "learning_rate": 2.3471126964586247e-05, "loss": 0.5208, "num_input_tokens_seen": 21971664, "step": 38105 }, { "epoch": 5.676198987190944, "grad_norm": 1.9988373517990112, "learning_rate": 2.3464640337380868e-05, "loss": 0.5204, "num_input_tokens_seen": 21974800, "step": 38110 }, { "epoch": 5.676943699731903, "grad_norm": 1.9646210670471191, "learning_rate": 2.3458153813928857e-05, "loss": 0.5036, "num_input_tokens_seen": 21977584, "step": 38115 }, { "epoch": 5.677688412272863, "grad_norm": 1.7293816804885864, "learning_rate": 2.345166739466855e-05, "loss": 0.6117, "num_input_tokens_seen": 21980656, "step": 38120 }, { "epoch": 5.678433124813822, "grad_norm": 3.293602705001831, "learning_rate": 2.344518108003825e-05, "loss": 0.6266, "num_input_tokens_seen": 21983888, "step": 38125 }, { "epoch": 5.679177837354781, "grad_norm": 3.2271456718444824, "learning_rate": 2.3438694870476295e-05, "loss": 0.7696, "num_input_tokens_seen": 21986800, "step": 38130 }, { "epoch": 5.67992254989574, "grad_norm": 1.1422245502471924, "learning_rate": 2.3432208766421e-05, "loss": 0.5178, "num_input_tokens_seen": 21989680, "step": 38135 }, { "epoch": 5.6806672624367, "grad_norm": 1.998826026916504, "learning_rate": 2.3425722768310652e-05, "loss": 0.6673, "num_input_tokens_seen": 21992560, "step": 38140 }, { "epoch": 5.681411974977658, "grad_norm": 1.6609270572662354, "learning_rate": 2.3419236876583568e-05, "loss": 0.5297, "num_input_tokens_seen": 21995440, "step": 38145 }, { "epoch": 5.682156687518618, "grad_norm": 2.6755270957946777, "learning_rate": 2.341275109167802e-05, "loss": 0.8135, "num_input_tokens_seen": 21998416, "step": 38150 }, { "epoch": 5.682901400059577, "grad_norm": 2.4222524166107178, "learning_rate": 2.34062654140323e-05, "loss": 0.741, "num_input_tokens_seen": 22001424, "step": 38155 }, { "epoch": 5.683646112600536, "grad_norm": 2.1826558113098145, "learning_rate": 2.33997798440847e-05, "loss": 0.6242, "num_input_tokens_seen": 22004336, "step": 38160 }, { "epoch": 5.684390825141495, "grad_norm": 1.9229276180267334, "learning_rate": 2.3393294382273462e-05, "loss": 0.7287, "num_input_tokens_seen": 22007024, "step": 38165 }, { "epoch": 5.685135537682455, "grad_norm": 1.449762225151062, "learning_rate": 2.338680902903685e-05, "loss": 0.9222, "num_input_tokens_seen": 22010032, "step": 38170 }, { "epoch": 5.685880250223414, "grad_norm": 1.6404756307601929, "learning_rate": 2.338032378481313e-05, "loss": 0.5643, "num_input_tokens_seen": 22013040, "step": 38175 }, { "epoch": 5.686624962764373, "grad_norm": 0.977543294429779, "learning_rate": 2.3373838650040548e-05, "loss": 0.5023, "num_input_tokens_seen": 22016272, "step": 38180 }, { "epoch": 5.687369675305332, "grad_norm": 1.1573681831359863, "learning_rate": 2.3367353625157333e-05, "loss": 0.4945, "num_input_tokens_seen": 22019056, "step": 38185 }, { "epoch": 5.688114387846292, "grad_norm": 1.4994715452194214, "learning_rate": 2.3360868710601717e-05, "loss": 0.5882, "num_input_tokens_seen": 22021936, "step": 38190 }, { "epoch": 5.68885910038725, "grad_norm": 1.9342520236968994, "learning_rate": 2.335438390681194e-05, "loss": 0.508, "num_input_tokens_seen": 22025072, "step": 38195 }, { "epoch": 5.68960381292821, "grad_norm": 1.177523136138916, "learning_rate": 2.3347899214226214e-05, "loss": 0.5848, "num_input_tokens_seen": 22027760, "step": 38200 }, { "epoch": 5.690348525469169, "grad_norm": 2.41338849067688, "learning_rate": 2.334141463328273e-05, "loss": 0.6593, "num_input_tokens_seen": 22030512, "step": 38205 }, { "epoch": 5.6910932380101285, "grad_norm": 2.384969472885132, "learning_rate": 2.33349301644197e-05, "loss": 0.7383, "num_input_tokens_seen": 22033616, "step": 38210 }, { "epoch": 5.691837950551087, "grad_norm": 2.002455711364746, "learning_rate": 2.332844580807533e-05, "loss": 0.7924, "num_input_tokens_seen": 22036400, "step": 38215 }, { "epoch": 5.692582663092047, "grad_norm": 1.775514006614685, "learning_rate": 2.3321961564687787e-05, "loss": 0.5616, "num_input_tokens_seen": 22038864, "step": 38220 }, { "epoch": 5.693327375633006, "grad_norm": 0.8650303483009338, "learning_rate": 2.3315477434695256e-05, "loss": 0.6812, "num_input_tokens_seen": 22041744, "step": 38225 }, { "epoch": 5.694072088173964, "grad_norm": 1.0531384944915771, "learning_rate": 2.3308993418535924e-05, "loss": 0.5845, "num_input_tokens_seen": 22044784, "step": 38230 }, { "epoch": 5.694816800714924, "grad_norm": 1.6497358083724976, "learning_rate": 2.330250951664793e-05, "loss": 0.6252, "num_input_tokens_seen": 22047536, "step": 38235 }, { "epoch": 5.695561513255884, "grad_norm": 1.2925668954849243, "learning_rate": 2.3296025729469457e-05, "loss": 0.5004, "num_input_tokens_seen": 22050640, "step": 38240 }, { "epoch": 5.696306225796842, "grad_norm": 1.656561017036438, "learning_rate": 2.3289542057438625e-05, "loss": 0.6188, "num_input_tokens_seen": 22053328, "step": 38245 }, { "epoch": 5.697050938337801, "grad_norm": 1.5223808288574219, "learning_rate": 2.3283058500993587e-05, "loss": 0.5537, "num_input_tokens_seen": 22056368, "step": 38250 }, { "epoch": 5.697795650878761, "grad_norm": 1.373862862586975, "learning_rate": 2.3276575060572476e-05, "loss": 0.6447, "num_input_tokens_seen": 22059472, "step": 38255 }, { "epoch": 5.6985403634197205, "grad_norm": 1.3173733949661255, "learning_rate": 2.3270091736613412e-05, "loss": 0.7336, "num_input_tokens_seen": 22062192, "step": 38260 }, { "epoch": 5.699285075960679, "grad_norm": 2.152189016342163, "learning_rate": 2.326360852955452e-05, "loss": 0.6954, "num_input_tokens_seen": 22064976, "step": 38265 }, { "epoch": 5.700029788501638, "grad_norm": 2.7043840885162354, "learning_rate": 2.3257125439833902e-05, "loss": 0.7254, "num_input_tokens_seen": 22068208, "step": 38270 }, { "epoch": 5.700774501042598, "grad_norm": 3.1118485927581787, "learning_rate": 2.325064246788966e-05, "loss": 0.6033, "num_input_tokens_seen": 22071344, "step": 38275 }, { "epoch": 5.701519213583556, "grad_norm": 2.0175602436065674, "learning_rate": 2.3244159614159898e-05, "loss": 0.7534, "num_input_tokens_seen": 22074192, "step": 38280 }, { "epoch": 5.702263926124516, "grad_norm": 1.444615364074707, "learning_rate": 2.3237676879082682e-05, "loss": 0.6461, "num_input_tokens_seen": 22077168, "step": 38285 }, { "epoch": 5.703008638665475, "grad_norm": 1.2462351322174072, "learning_rate": 2.3231194263096096e-05, "loss": 0.6054, "num_input_tokens_seen": 22079664, "step": 38290 }, { "epoch": 5.7037533512064345, "grad_norm": 1.9485414028167725, "learning_rate": 2.322471176663821e-05, "loss": 0.4092, "num_input_tokens_seen": 22082640, "step": 38295 }, { "epoch": 5.704498063747393, "grad_norm": 2.4751696586608887, "learning_rate": 2.3218229390147086e-05, "loss": 0.5669, "num_input_tokens_seen": 22085328, "step": 38300 }, { "epoch": 5.705242776288353, "grad_norm": 0.9110588431358337, "learning_rate": 2.3211747134060774e-05, "loss": 0.7431, "num_input_tokens_seen": 22088208, "step": 38305 }, { "epoch": 5.705987488829312, "grad_norm": 1.959448218345642, "learning_rate": 2.3205264998817326e-05, "loss": 0.5458, "num_input_tokens_seen": 22090992, "step": 38310 }, { "epoch": 5.706732201370271, "grad_norm": 2.580749034881592, "learning_rate": 2.3198782984854765e-05, "loss": 0.8951, "num_input_tokens_seen": 22093904, "step": 38315 }, { "epoch": 5.70747691391123, "grad_norm": 1.3894520998001099, "learning_rate": 2.3192301092611138e-05, "loss": 0.7424, "num_input_tokens_seen": 22097008, "step": 38320 }, { "epoch": 5.70822162645219, "grad_norm": 1.22662353515625, "learning_rate": 2.3185819322524443e-05, "loss": 0.6715, "num_input_tokens_seen": 22100080, "step": 38325 }, { "epoch": 5.708966338993148, "grad_norm": 6.7937517166137695, "learning_rate": 2.3179337675032707e-05, "loss": 0.7193, "num_input_tokens_seen": 22102672, "step": 38330 }, { "epoch": 5.709711051534108, "grad_norm": 1.3533720970153809, "learning_rate": 2.3172856150573926e-05, "loss": 0.5369, "num_input_tokens_seen": 22105616, "step": 38335 }, { "epoch": 5.710455764075067, "grad_norm": 1.564367651939392, "learning_rate": 2.3166374749586094e-05, "loss": 0.4043, "num_input_tokens_seen": 22108752, "step": 38340 }, { "epoch": 5.7112004766160265, "grad_norm": 1.6910685300827026, "learning_rate": 2.3159893472507212e-05, "loss": 0.5553, "num_input_tokens_seen": 22111600, "step": 38345 }, { "epoch": 5.711945189156985, "grad_norm": 2.370454788208008, "learning_rate": 2.315341231977524e-05, "loss": 0.6689, "num_input_tokens_seen": 22114384, "step": 38350 }, { "epoch": 5.712689901697945, "grad_norm": 0.98995441198349, "learning_rate": 2.314693129182815e-05, "loss": 0.476, "num_input_tokens_seen": 22116944, "step": 38355 }, { "epoch": 5.713434614238904, "grad_norm": 2.903426170349121, "learning_rate": 2.314045038910393e-05, "loss": 0.6879, "num_input_tokens_seen": 22119760, "step": 38360 }, { "epoch": 5.714179326779863, "grad_norm": 1.170824408531189, "learning_rate": 2.3133969612040503e-05, "loss": 0.4924, "num_input_tokens_seen": 22122928, "step": 38365 }, { "epoch": 5.714924039320822, "grad_norm": 2.751396656036377, "learning_rate": 2.3127488961075812e-05, "loss": 0.5203, "num_input_tokens_seen": 22125616, "step": 38370 }, { "epoch": 5.715668751861782, "grad_norm": 5.958835601806641, "learning_rate": 2.312100843664781e-05, "loss": 0.7085, "num_input_tokens_seen": 22128528, "step": 38375 }, { "epoch": 5.7164134644027405, "grad_norm": 1.4779142141342163, "learning_rate": 2.311452803919442e-05, "loss": 0.6542, "num_input_tokens_seen": 22131664, "step": 38380 }, { "epoch": 5.7171581769437, "grad_norm": 2.614600896835327, "learning_rate": 2.3108047769153558e-05, "loss": 0.6429, "num_input_tokens_seen": 22134736, "step": 38385 }, { "epoch": 5.717902889484659, "grad_norm": 2.5876994132995605, "learning_rate": 2.3101567626963138e-05, "loss": 0.7731, "num_input_tokens_seen": 22137552, "step": 38390 }, { "epoch": 5.718647602025618, "grad_norm": 2.2192583084106445, "learning_rate": 2.3095087613061058e-05, "loss": 0.601, "num_input_tokens_seen": 22140400, "step": 38395 }, { "epoch": 5.719392314566577, "grad_norm": 3.823258638381958, "learning_rate": 2.3088607727885207e-05, "loss": 0.657, "num_input_tokens_seen": 22143248, "step": 38400 }, { "epoch": 5.720137027107537, "grad_norm": 1.8998390436172485, "learning_rate": 2.3082127971873492e-05, "loss": 0.6527, "num_input_tokens_seen": 22146160, "step": 38405 }, { "epoch": 5.720881739648496, "grad_norm": 2.769202709197998, "learning_rate": 2.3075648345463754e-05, "loss": 0.6365, "num_input_tokens_seen": 22148720, "step": 38410 }, { "epoch": 5.721626452189454, "grad_norm": 2.457216501235962, "learning_rate": 2.3069168849093885e-05, "loss": 0.7259, "num_input_tokens_seen": 22151504, "step": 38415 }, { "epoch": 5.722371164730414, "grad_norm": 1.6231839656829834, "learning_rate": 2.306268948320173e-05, "loss": 0.5923, "num_input_tokens_seen": 22154224, "step": 38420 }, { "epoch": 5.723115877271374, "grad_norm": 2.086752414703369, "learning_rate": 2.305621024822514e-05, "loss": 0.6044, "num_input_tokens_seen": 22157072, "step": 38425 }, { "epoch": 5.7238605898123325, "grad_norm": 1.909276008605957, "learning_rate": 2.3049731144601967e-05, "loss": 0.413, "num_input_tokens_seen": 22159568, "step": 38430 }, { "epoch": 5.724605302353291, "grad_norm": 2.2778573036193848, "learning_rate": 2.3043252172770027e-05, "loss": 0.7943, "num_input_tokens_seen": 22162512, "step": 38435 }, { "epoch": 5.725350014894251, "grad_norm": 3.346518039703369, "learning_rate": 2.303677333316715e-05, "loss": 0.7266, "num_input_tokens_seen": 22165840, "step": 38440 }, { "epoch": 5.72609472743521, "grad_norm": 1.227918267250061, "learning_rate": 2.3030294626231162e-05, "loss": 0.6115, "num_input_tokens_seen": 22168976, "step": 38445 }, { "epoch": 5.726839439976169, "grad_norm": 1.9813495874404907, "learning_rate": 2.302381605239985e-05, "loss": 0.5643, "num_input_tokens_seen": 22171760, "step": 38450 }, { "epoch": 5.727584152517128, "grad_norm": 1.6883949041366577, "learning_rate": 2.3017337612111007e-05, "loss": 0.5943, "num_input_tokens_seen": 22174736, "step": 38455 }, { "epoch": 5.728328865058088, "grad_norm": 3.7179551124572754, "learning_rate": 2.3010859305802426e-05, "loss": 0.629, "num_input_tokens_seen": 22177680, "step": 38460 }, { "epoch": 5.7290735775990465, "grad_norm": 2.2636287212371826, "learning_rate": 2.300438113391189e-05, "loss": 0.6939, "num_input_tokens_seen": 22180656, "step": 38465 }, { "epoch": 5.729818290140006, "grad_norm": 1.8636995553970337, "learning_rate": 2.2997903096877164e-05, "loss": 0.6204, "num_input_tokens_seen": 22183600, "step": 38470 }, { "epoch": 5.730563002680965, "grad_norm": 3.0849387645721436, "learning_rate": 2.299142519513601e-05, "loss": 0.848, "num_input_tokens_seen": 22186544, "step": 38475 }, { "epoch": 5.7313077152219245, "grad_norm": 2.261735439300537, "learning_rate": 2.298494742912617e-05, "loss": 0.6154, "num_input_tokens_seen": 22189136, "step": 38480 }, { "epoch": 5.732052427762883, "grad_norm": 1.2996803522109985, "learning_rate": 2.2978469799285397e-05, "loss": 0.6088, "num_input_tokens_seen": 22192208, "step": 38485 }, { "epoch": 5.732797140303843, "grad_norm": 2.2962231636047363, "learning_rate": 2.297199230605141e-05, "loss": 0.6639, "num_input_tokens_seen": 22195120, "step": 38490 }, { "epoch": 5.733541852844802, "grad_norm": 1.3695539236068726, "learning_rate": 2.2965514949861938e-05, "loss": 0.5077, "num_input_tokens_seen": 22197808, "step": 38495 }, { "epoch": 5.734286565385761, "grad_norm": 1.6432665586471558, "learning_rate": 2.2959037731154692e-05, "loss": 0.7272, "num_input_tokens_seen": 22200848, "step": 38500 }, { "epoch": 5.73503127792672, "grad_norm": 3.9409143924713135, "learning_rate": 2.295256065036738e-05, "loss": 0.5174, "num_input_tokens_seen": 22203408, "step": 38505 }, { "epoch": 5.73577599046768, "grad_norm": 1.433538556098938, "learning_rate": 2.2946083707937697e-05, "loss": 0.6559, "num_input_tokens_seen": 22206640, "step": 38510 }, { "epoch": 5.7365207030086385, "grad_norm": 3.529219150543213, "learning_rate": 2.293960690430332e-05, "loss": 0.6981, "num_input_tokens_seen": 22209488, "step": 38515 }, { "epoch": 5.737265415549598, "grad_norm": 1.193183422088623, "learning_rate": 2.2933130239901934e-05, "loss": 0.5502, "num_input_tokens_seen": 22212528, "step": 38520 }, { "epoch": 5.738010128090557, "grad_norm": 1.5332183837890625, "learning_rate": 2.2926653715171215e-05, "loss": 0.5216, "num_input_tokens_seen": 22215632, "step": 38525 }, { "epoch": 5.7387548406315165, "grad_norm": 2.728461980819702, "learning_rate": 2.2920177330548802e-05, "loss": 0.5905, "num_input_tokens_seen": 22218480, "step": 38530 }, { "epoch": 5.739499553172475, "grad_norm": 1.2852871417999268, "learning_rate": 2.2913701086472343e-05, "loss": 0.5301, "num_input_tokens_seen": 22221392, "step": 38535 }, { "epoch": 5.740244265713435, "grad_norm": 2.6413536071777344, "learning_rate": 2.290722498337948e-05, "loss": 0.749, "num_input_tokens_seen": 22224048, "step": 38540 }, { "epoch": 5.740988978254394, "grad_norm": 2.044501543045044, "learning_rate": 2.2900749021707855e-05, "loss": 0.6275, "num_input_tokens_seen": 22226768, "step": 38545 }, { "epoch": 5.741733690795353, "grad_norm": 1.3990076780319214, "learning_rate": 2.2894273201895068e-05, "loss": 0.6619, "num_input_tokens_seen": 22229520, "step": 38550 }, { "epoch": 5.742478403336312, "grad_norm": 1.4488496780395508, "learning_rate": 2.2887797524378734e-05, "loss": 0.5017, "num_input_tokens_seen": 22232432, "step": 38555 }, { "epoch": 5.743223115877272, "grad_norm": 2.2825567722320557, "learning_rate": 2.2881321989596464e-05, "loss": 0.6252, "num_input_tokens_seen": 22235312, "step": 38560 }, { "epoch": 5.7439678284182305, "grad_norm": 5.921130657196045, "learning_rate": 2.2874846597985842e-05, "loss": 0.6454, "num_input_tokens_seen": 22238288, "step": 38565 }, { "epoch": 5.74471254095919, "grad_norm": 2.1272735595703125, "learning_rate": 2.2868371349984442e-05, "loss": 0.8203, "num_input_tokens_seen": 22240816, "step": 38570 }, { "epoch": 5.745457253500149, "grad_norm": 1.2239347696304321, "learning_rate": 2.2861896246029835e-05, "loss": 0.7289, "num_input_tokens_seen": 22243792, "step": 38575 }, { "epoch": 5.746201966041108, "grad_norm": 2.1316237449645996, "learning_rate": 2.2855421286559593e-05, "loss": 0.6691, "num_input_tokens_seen": 22246544, "step": 38580 }, { "epoch": 5.746946678582067, "grad_norm": 0.9211249947547913, "learning_rate": 2.2848946472011258e-05, "loss": 0.5352, "num_input_tokens_seen": 22249456, "step": 38585 }, { "epoch": 5.747691391123027, "grad_norm": 0.6576749086380005, "learning_rate": 2.2842471802822372e-05, "loss": 0.6343, "num_input_tokens_seen": 22252368, "step": 38590 }, { "epoch": 5.748436103663986, "grad_norm": 1.8718457221984863, "learning_rate": 2.2835997279430475e-05, "loss": 0.6535, "num_input_tokens_seen": 22255152, "step": 38595 }, { "epoch": 5.7491808162049445, "grad_norm": 1.0378315448760986, "learning_rate": 2.282952290227308e-05, "loss": 0.5156, "num_input_tokens_seen": 22258288, "step": 38600 }, { "epoch": 5.749925528745904, "grad_norm": 1.5426135063171387, "learning_rate": 2.2823048671787715e-05, "loss": 0.5479, "num_input_tokens_seen": 22260976, "step": 38605 }, { "epoch": 5.750670241286863, "grad_norm": 1.1806215047836304, "learning_rate": 2.2816574588411857e-05, "loss": 0.5519, "num_input_tokens_seen": 22263760, "step": 38610 }, { "epoch": 5.7514149538278225, "grad_norm": 2.258694648742676, "learning_rate": 2.2810100652583016e-05, "loss": 0.6402, "num_input_tokens_seen": 22266672, "step": 38615 }, { "epoch": 5.752159666368781, "grad_norm": 1.3872010707855225, "learning_rate": 2.2803626864738664e-05, "loss": 0.657, "num_input_tokens_seen": 22269424, "step": 38620 }, { "epoch": 5.752904378909741, "grad_norm": 1.4470598697662354, "learning_rate": 2.279715322531628e-05, "loss": 0.5343, "num_input_tokens_seen": 22272464, "step": 38625 }, { "epoch": 5.7536490914507, "grad_norm": 2.747347593307495, "learning_rate": 2.2790679734753327e-05, "loss": 0.6779, "num_input_tokens_seen": 22275088, "step": 38630 }, { "epoch": 5.754393803991659, "grad_norm": 3.8032431602478027, "learning_rate": 2.2784206393487256e-05, "loss": 0.6652, "num_input_tokens_seen": 22277680, "step": 38635 }, { "epoch": 5.755138516532618, "grad_norm": 1.6622319221496582, "learning_rate": 2.277773320195551e-05, "loss": 0.7261, "num_input_tokens_seen": 22280624, "step": 38640 }, { "epoch": 5.755883229073578, "grad_norm": 2.0231330394744873, "learning_rate": 2.2771260160595516e-05, "loss": 0.629, "num_input_tokens_seen": 22283344, "step": 38645 }, { "epoch": 5.7566279416145365, "grad_norm": 3.1712005138397217, "learning_rate": 2.2764787269844704e-05, "loss": 0.7589, "num_input_tokens_seen": 22286704, "step": 38650 }, { "epoch": 5.757372654155496, "grad_norm": 2.5742931365966797, "learning_rate": 2.2758314530140473e-05, "loss": 0.6174, "num_input_tokens_seen": 22290000, "step": 38655 }, { "epoch": 5.758117366696455, "grad_norm": 1.7508584260940552, "learning_rate": 2.2751841941920238e-05, "loss": 0.6678, "num_input_tokens_seen": 22293104, "step": 38660 }, { "epoch": 5.7588620792374146, "grad_norm": 1.956813931465149, "learning_rate": 2.274536950562138e-05, "loss": 0.6343, "num_input_tokens_seen": 22296048, "step": 38665 }, { "epoch": 5.759606791778373, "grad_norm": 2.086714267730713, "learning_rate": 2.2738897221681284e-05, "loss": 0.6117, "num_input_tokens_seen": 22298992, "step": 38670 }, { "epoch": 5.760351504319333, "grad_norm": 2.1168887615203857, "learning_rate": 2.2732425090537323e-05, "loss": 0.8476, "num_input_tokens_seen": 22302160, "step": 38675 }, { "epoch": 5.761096216860292, "grad_norm": 1.512149691581726, "learning_rate": 2.272595311262685e-05, "loss": 0.7983, "num_input_tokens_seen": 22305360, "step": 38680 }, { "epoch": 5.761840929401251, "grad_norm": 1.512717366218567, "learning_rate": 2.2719481288387234e-05, "loss": 0.4981, "num_input_tokens_seen": 22308176, "step": 38685 }, { "epoch": 5.76258564194221, "grad_norm": 1.0569305419921875, "learning_rate": 2.2713009618255788e-05, "loss": 0.508, "num_input_tokens_seen": 22311056, "step": 38690 }, { "epoch": 5.76333035448317, "grad_norm": 1.7006208896636963, "learning_rate": 2.270653810266986e-05, "loss": 0.5412, "num_input_tokens_seen": 22314224, "step": 38695 }, { "epoch": 5.7640750670241285, "grad_norm": 4.1774444580078125, "learning_rate": 2.2700066742066754e-05, "loss": 0.6598, "num_input_tokens_seen": 22317104, "step": 38700 }, { "epoch": 5.764819779565088, "grad_norm": 3.3811120986938477, "learning_rate": 2.2693595536883792e-05, "loss": 0.6504, "num_input_tokens_seen": 22319952, "step": 38705 }, { "epoch": 5.765564492106047, "grad_norm": 2.1380083560943604, "learning_rate": 2.2687124487558267e-05, "loss": 0.4734, "num_input_tokens_seen": 22322864, "step": 38710 }, { "epoch": 5.766309204647007, "grad_norm": 1.509886384010315, "learning_rate": 2.268065359452746e-05, "loss": 0.5086, "num_input_tokens_seen": 22325648, "step": 38715 }, { "epoch": 5.767053917187965, "grad_norm": 1.6185120344161987, "learning_rate": 2.267418285822866e-05, "loss": 0.6051, "num_input_tokens_seen": 22328976, "step": 38720 }, { "epoch": 5.767798629728925, "grad_norm": 1.2503376007080078, "learning_rate": 2.266771227909913e-05, "loss": 0.4397, "num_input_tokens_seen": 22331696, "step": 38725 }, { "epoch": 5.768543342269884, "grad_norm": 5.907952785491943, "learning_rate": 2.266124185757612e-05, "loss": 0.7529, "num_input_tokens_seen": 22334608, "step": 38730 }, { "epoch": 5.769288054810843, "grad_norm": 1.1096915006637573, "learning_rate": 2.265477159409687e-05, "loss": 0.5314, "num_input_tokens_seen": 22337360, "step": 38735 }, { "epoch": 5.770032767351802, "grad_norm": 2.264492988586426, "learning_rate": 2.2648301489098624e-05, "loss": 0.6973, "num_input_tokens_seen": 22340272, "step": 38740 }, { "epoch": 5.770777479892761, "grad_norm": 2.3281636238098145, "learning_rate": 2.26418315430186e-05, "loss": 0.7334, "num_input_tokens_seen": 22343184, "step": 38745 }, { "epoch": 5.7715221924337206, "grad_norm": 2.211790084838867, "learning_rate": 2.2635361756294014e-05, "loss": 0.6998, "num_input_tokens_seen": 22345904, "step": 38750 }, { "epoch": 5.77226690497468, "grad_norm": 3.4189257621765137, "learning_rate": 2.2628892129362064e-05, "loss": 0.6548, "num_input_tokens_seen": 22348432, "step": 38755 }, { "epoch": 5.773011617515639, "grad_norm": 2.1972177028656006, "learning_rate": 2.2622422662659952e-05, "loss": 0.4445, "num_input_tokens_seen": 22351152, "step": 38760 }, { "epoch": 5.773756330056598, "grad_norm": 2.339430093765259, "learning_rate": 2.2615953356624852e-05, "loss": 0.6955, "num_input_tokens_seen": 22353936, "step": 38765 }, { "epoch": 5.774501042597557, "grad_norm": 1.3368163108825684, "learning_rate": 2.260948421169392e-05, "loss": 0.5749, "num_input_tokens_seen": 22356976, "step": 38770 }, { "epoch": 5.775245755138517, "grad_norm": 2.591630697250366, "learning_rate": 2.260301522830433e-05, "loss": 0.8043, "num_input_tokens_seen": 22360048, "step": 38775 }, { "epoch": 5.775990467679476, "grad_norm": 3.234128952026367, "learning_rate": 2.2596546406893227e-05, "loss": 0.8935, "num_input_tokens_seen": 22362768, "step": 38780 }, { "epoch": 5.7767351802204345, "grad_norm": 3.9699676036834717, "learning_rate": 2.259007774789774e-05, "loss": 0.5685, "num_input_tokens_seen": 22365808, "step": 38785 }, { "epoch": 5.777479892761394, "grad_norm": 2.565274477005005, "learning_rate": 2.2583609251755004e-05, "loss": 0.5129, "num_input_tokens_seen": 22368528, "step": 38790 }, { "epoch": 5.778224605302353, "grad_norm": 1.5403145551681519, "learning_rate": 2.2577140918902135e-05, "loss": 0.5177, "num_input_tokens_seen": 22371536, "step": 38795 }, { "epoch": 5.778969317843313, "grad_norm": 2.504014015197754, "learning_rate": 2.2570672749776222e-05, "loss": 0.6213, "num_input_tokens_seen": 22374512, "step": 38800 }, { "epoch": 5.779714030384271, "grad_norm": 1.2752301692962646, "learning_rate": 2.2564204744814384e-05, "loss": 0.5821, "num_input_tokens_seen": 22377712, "step": 38805 }, { "epoch": 5.780458742925231, "grad_norm": 3.6236846446990967, "learning_rate": 2.2557736904453674e-05, "loss": 0.5735, "num_input_tokens_seen": 22380656, "step": 38810 }, { "epoch": 5.78120345546619, "grad_norm": 2.1242880821228027, "learning_rate": 2.255126922913118e-05, "loss": 0.662, "num_input_tokens_seen": 22383568, "step": 38815 }, { "epoch": 5.781948168007149, "grad_norm": 1.6298998594284058, "learning_rate": 2.254480171928395e-05, "loss": 0.5542, "num_input_tokens_seen": 22386320, "step": 38820 }, { "epoch": 5.782692880548108, "grad_norm": 2.35546875, "learning_rate": 2.2538334375349044e-05, "loss": 0.6922, "num_input_tokens_seen": 22389264, "step": 38825 }, { "epoch": 5.783437593089068, "grad_norm": 1.3878955841064453, "learning_rate": 2.2531867197763484e-05, "loss": 0.5562, "num_input_tokens_seen": 22392144, "step": 38830 }, { "epoch": 5.7841823056300266, "grad_norm": 1.8940188884735107, "learning_rate": 2.2525400186964308e-05, "loss": 0.7668, "num_input_tokens_seen": 22395024, "step": 38835 }, { "epoch": 5.784927018170986, "grad_norm": 1.9177199602127075, "learning_rate": 2.2518933343388528e-05, "loss": 0.5592, "num_input_tokens_seen": 22397776, "step": 38840 }, { "epoch": 5.785671730711945, "grad_norm": 1.9950370788574219, "learning_rate": 2.2512466667473152e-05, "loss": 0.6296, "num_input_tokens_seen": 22400976, "step": 38845 }, { "epoch": 5.786416443252905, "grad_norm": 2.134084463119507, "learning_rate": 2.2506000159655158e-05, "loss": 0.6363, "num_input_tokens_seen": 22403952, "step": 38850 }, { "epoch": 5.787161155793863, "grad_norm": 1.6635191440582275, "learning_rate": 2.249953382037153e-05, "loss": 0.4629, "num_input_tokens_seen": 22406960, "step": 38855 }, { "epoch": 5.787905868334823, "grad_norm": 2.750382423400879, "learning_rate": 2.2493067650059247e-05, "loss": 0.7218, "num_input_tokens_seen": 22409968, "step": 38860 }, { "epoch": 5.788650580875782, "grad_norm": 1.1130876541137695, "learning_rate": 2.248660164915525e-05, "loss": 0.5438, "num_input_tokens_seen": 22412880, "step": 38865 }, { "epoch": 5.789395293416741, "grad_norm": 1.8663562536239624, "learning_rate": 2.2480135818096497e-05, "loss": 0.6107, "num_input_tokens_seen": 22415568, "step": 38870 }, { "epoch": 5.7901400059577, "grad_norm": 2.403543472290039, "learning_rate": 2.247367015731993e-05, "loss": 0.7577, "num_input_tokens_seen": 22418512, "step": 38875 }, { "epoch": 5.79088471849866, "grad_norm": 3.237942934036255, "learning_rate": 2.2467204667262454e-05, "loss": 0.6713, "num_input_tokens_seen": 22421424, "step": 38880 }, { "epoch": 5.791629431039619, "grad_norm": 1.2998898029327393, "learning_rate": 2.2460739348361e-05, "loss": 0.554, "num_input_tokens_seen": 22424592, "step": 38885 }, { "epoch": 5.792374143580578, "grad_norm": 1.3674345016479492, "learning_rate": 2.2454274201052443e-05, "loss": 0.4843, "num_input_tokens_seen": 22427440, "step": 38890 }, { "epoch": 5.793118856121537, "grad_norm": 1.796217441558838, "learning_rate": 2.2447809225773698e-05, "loss": 0.6267, "num_input_tokens_seen": 22430384, "step": 38895 }, { "epoch": 5.793863568662497, "grad_norm": 1.5558366775512695, "learning_rate": 2.2441344422961618e-05, "loss": 0.5882, "num_input_tokens_seen": 22433232, "step": 38900 }, { "epoch": 5.794608281203455, "grad_norm": 1.4719171524047852, "learning_rate": 2.243487979305308e-05, "loss": 0.601, "num_input_tokens_seen": 22436144, "step": 38905 }, { "epoch": 5.795352993744415, "grad_norm": 2.0947425365448, "learning_rate": 2.2428415336484944e-05, "loss": 0.7301, "num_input_tokens_seen": 22439472, "step": 38910 }, { "epoch": 5.796097706285374, "grad_norm": 2.538259267807007, "learning_rate": 2.2421951053694034e-05, "loss": 0.5857, "num_input_tokens_seen": 22442320, "step": 38915 }, { "epoch": 5.796842418826333, "grad_norm": 1.7134637832641602, "learning_rate": 2.2415486945117195e-05, "loss": 0.5682, "num_input_tokens_seen": 22444880, "step": 38920 }, { "epoch": 5.797587131367292, "grad_norm": 2.9529855251312256, "learning_rate": 2.2409023011191248e-05, "loss": 0.579, "num_input_tokens_seen": 22447632, "step": 38925 }, { "epoch": 5.798331843908251, "grad_norm": 2.4169514179229736, "learning_rate": 2.2402559252352988e-05, "loss": 0.7684, "num_input_tokens_seen": 22450288, "step": 38930 }, { "epoch": 5.799076556449211, "grad_norm": 3.716444253921509, "learning_rate": 2.239609566903921e-05, "loss": 0.7139, "num_input_tokens_seen": 22453136, "step": 38935 }, { "epoch": 5.79982126899017, "grad_norm": 3.348827838897705, "learning_rate": 2.23896322616867e-05, "loss": 0.7252, "num_input_tokens_seen": 22455888, "step": 38940 }, { "epoch": 5.800565981531129, "grad_norm": 1.0894814729690552, "learning_rate": 2.238316903073223e-05, "loss": 0.5106, "num_input_tokens_seen": 22458896, "step": 38945 }, { "epoch": 5.801310694072088, "grad_norm": 1.5170501470565796, "learning_rate": 2.2376705976612555e-05, "loss": 0.6944, "num_input_tokens_seen": 22461776, "step": 38950 }, { "epoch": 5.802055406613047, "grad_norm": 2.170292377471924, "learning_rate": 2.2370243099764424e-05, "loss": 0.5416, "num_input_tokens_seen": 22464656, "step": 38955 }, { "epoch": 5.802800119154006, "grad_norm": 2.044757127761841, "learning_rate": 2.2363780400624578e-05, "loss": 0.5208, "num_input_tokens_seen": 22467536, "step": 38960 }, { "epoch": 5.803544831694966, "grad_norm": 1.473108172416687, "learning_rate": 2.235731787962973e-05, "loss": 0.6308, "num_input_tokens_seen": 22470512, "step": 38965 }, { "epoch": 5.804289544235925, "grad_norm": 2.5885491371154785, "learning_rate": 2.2350855537216603e-05, "loss": 0.6201, "num_input_tokens_seen": 22473680, "step": 38970 }, { "epoch": 5.805034256776884, "grad_norm": 1.7985410690307617, "learning_rate": 2.234439337382188e-05, "loss": 0.7119, "num_input_tokens_seen": 22476656, "step": 38975 }, { "epoch": 5.805778969317843, "grad_norm": 1.4455091953277588, "learning_rate": 2.2337931389882262e-05, "loss": 0.6311, "num_input_tokens_seen": 22479568, "step": 38980 }, { "epoch": 5.806523681858803, "grad_norm": 2.325765371322632, "learning_rate": 2.233146958583441e-05, "loss": 0.706, "num_input_tokens_seen": 22482576, "step": 38985 }, { "epoch": 5.807268394399761, "grad_norm": 2.5424067974090576, "learning_rate": 2.2325007962115e-05, "loss": 0.8468, "num_input_tokens_seen": 22486768, "step": 38990 }, { "epoch": 5.808013106940721, "grad_norm": 2.6164801120758057, "learning_rate": 2.2318546519160672e-05, "loss": 0.5842, "num_input_tokens_seen": 22489488, "step": 38995 }, { "epoch": 5.80875781948168, "grad_norm": 2.7522802352905273, "learning_rate": 2.2312085257408066e-05, "loss": 0.8199, "num_input_tokens_seen": 22492720, "step": 39000 }, { "epoch": 5.809502532022639, "grad_norm": 3.1668925285339355, "learning_rate": 2.2305624177293816e-05, "loss": 0.5283, "num_input_tokens_seen": 22495632, "step": 39005 }, { "epoch": 5.810247244563598, "grad_norm": 4.395853519439697, "learning_rate": 2.2299163279254535e-05, "loss": 0.6722, "num_input_tokens_seen": 22498416, "step": 39010 }, { "epoch": 5.810991957104558, "grad_norm": 3.243971824645996, "learning_rate": 2.229270256372681e-05, "loss": 0.614, "num_input_tokens_seen": 22501136, "step": 39015 }, { "epoch": 5.811736669645517, "grad_norm": 1.587124228477478, "learning_rate": 2.2286242031147236e-05, "loss": 0.4703, "num_input_tokens_seen": 22503984, "step": 39020 }, { "epoch": 5.812481382186476, "grad_norm": 2.4681143760681152, "learning_rate": 2.22797816819524e-05, "loss": 0.6252, "num_input_tokens_seen": 22507088, "step": 39025 }, { "epoch": 5.813226094727435, "grad_norm": 0.9770675897598267, "learning_rate": 2.227332151657885e-05, "loss": 0.5953, "num_input_tokens_seen": 22509968, "step": 39030 }, { "epoch": 5.813970807268395, "grad_norm": 2.0213265419006348, "learning_rate": 2.226686153546315e-05, "loss": 0.5595, "num_input_tokens_seen": 22512976, "step": 39035 }, { "epoch": 5.814715519809353, "grad_norm": 1.104459524154663, "learning_rate": 2.2260401739041837e-05, "loss": 0.5779, "num_input_tokens_seen": 22515952, "step": 39040 }, { "epoch": 5.815460232350313, "grad_norm": 1.9288272857666016, "learning_rate": 2.2253942127751432e-05, "loss": 0.7463, "num_input_tokens_seen": 22518864, "step": 39045 }, { "epoch": 5.816204944891272, "grad_norm": 1.4409080743789673, "learning_rate": 2.2247482702028466e-05, "loss": 0.5018, "num_input_tokens_seen": 22521520, "step": 39050 }, { "epoch": 5.8169496574322315, "grad_norm": 1.1204725503921509, "learning_rate": 2.2241023462309416e-05, "loss": 0.6419, "num_input_tokens_seen": 22524656, "step": 39055 }, { "epoch": 5.81769436997319, "grad_norm": 2.285539388656616, "learning_rate": 2.2234564409030788e-05, "loss": 0.799, "num_input_tokens_seen": 22527536, "step": 39060 }, { "epoch": 5.81843908251415, "grad_norm": 1.9869863986968994, "learning_rate": 2.2228105542629047e-05, "loss": 0.6186, "num_input_tokens_seen": 22530416, "step": 39065 }, { "epoch": 5.819183795055109, "grad_norm": 1.2937897443771362, "learning_rate": 2.2221646863540664e-05, "loss": 0.6461, "num_input_tokens_seen": 22533296, "step": 39070 }, { "epoch": 5.819928507596068, "grad_norm": 1.6262439489364624, "learning_rate": 2.2215188372202097e-05, "loss": 0.6356, "num_input_tokens_seen": 22536176, "step": 39075 }, { "epoch": 5.820673220137027, "grad_norm": 2.1567790508270264, "learning_rate": 2.2208730069049775e-05, "loss": 0.7313, "num_input_tokens_seen": 22538768, "step": 39080 }, { "epoch": 5.821417932677987, "grad_norm": 2.529571294784546, "learning_rate": 2.220227195452012e-05, "loss": 0.7511, "num_input_tokens_seen": 22541744, "step": 39085 }, { "epoch": 5.822162645218945, "grad_norm": 1.5530930757522583, "learning_rate": 2.2195814029049568e-05, "loss": 0.5959, "num_input_tokens_seen": 22544624, "step": 39090 }, { "epoch": 5.822907357759904, "grad_norm": 1.9310543537139893, "learning_rate": 2.2189356293074495e-05, "loss": 0.6431, "num_input_tokens_seen": 22547184, "step": 39095 }, { "epoch": 5.823652070300864, "grad_norm": 3.397324323654175, "learning_rate": 2.218289874703129e-05, "loss": 0.7139, "num_input_tokens_seen": 22549872, "step": 39100 }, { "epoch": 5.8243967828418235, "grad_norm": 1.368324875831604, "learning_rate": 2.2176441391356336e-05, "loss": 0.5047, "num_input_tokens_seen": 22552816, "step": 39105 }, { "epoch": 5.825141495382782, "grad_norm": 3.304027557373047, "learning_rate": 2.2169984226485998e-05, "loss": 0.7976, "num_input_tokens_seen": 22555792, "step": 39110 }, { "epoch": 5.825886207923741, "grad_norm": 1.20460844039917, "learning_rate": 2.2163527252856614e-05, "loss": 0.6322, "num_input_tokens_seen": 22558832, "step": 39115 }, { "epoch": 5.826630920464701, "grad_norm": 1.8350303173065186, "learning_rate": 2.2157070470904528e-05, "loss": 0.498, "num_input_tokens_seen": 22561744, "step": 39120 }, { "epoch": 5.82737563300566, "grad_norm": 1.2745081186294556, "learning_rate": 2.2150613881066063e-05, "loss": 0.7198, "num_input_tokens_seen": 22564528, "step": 39125 }, { "epoch": 5.828120345546619, "grad_norm": 1.691404938697815, "learning_rate": 2.2144157483777538e-05, "loss": 0.6102, "num_input_tokens_seen": 22567696, "step": 39130 }, { "epoch": 5.828865058087578, "grad_norm": 0.6880545020103455, "learning_rate": 2.2137701279475224e-05, "loss": 0.4616, "num_input_tokens_seen": 22570384, "step": 39135 }, { "epoch": 5.8296097706285375, "grad_norm": 1.4171816110610962, "learning_rate": 2.213124526859542e-05, "loss": 0.5268, "num_input_tokens_seen": 22573360, "step": 39140 }, { "epoch": 5.830354483169496, "grad_norm": 2.244258165359497, "learning_rate": 2.2124789451574405e-05, "loss": 0.6485, "num_input_tokens_seen": 22576304, "step": 39145 }, { "epoch": 5.831099195710456, "grad_norm": 2.903991222381592, "learning_rate": 2.2118333828848422e-05, "loss": 0.5544, "num_input_tokens_seen": 22578960, "step": 39150 }, { "epoch": 5.831843908251415, "grad_norm": 1.440869688987732, "learning_rate": 2.2111878400853732e-05, "loss": 0.5895, "num_input_tokens_seen": 22582160, "step": 39155 }, { "epoch": 5.832588620792374, "grad_norm": 1.8642139434814453, "learning_rate": 2.2105423168026545e-05, "loss": 0.6466, "num_input_tokens_seen": 22584912, "step": 39160 }, { "epoch": 5.833333333333333, "grad_norm": 2.342587471008301, "learning_rate": 2.2098968130803096e-05, "loss": 0.635, "num_input_tokens_seen": 22587824, "step": 39165 }, { "epoch": 5.834078045874293, "grad_norm": 1.799727439880371, "learning_rate": 2.2092513289619597e-05, "loss": 0.622, "num_input_tokens_seen": 22590736, "step": 39170 }, { "epoch": 5.834822758415251, "grad_norm": 2.470759153366089, "learning_rate": 2.208605864491222e-05, "loss": 0.7636, "num_input_tokens_seen": 22593552, "step": 39175 }, { "epoch": 5.835567470956211, "grad_norm": 1.8746827840805054, "learning_rate": 2.2079604197117152e-05, "loss": 0.504, "num_input_tokens_seen": 22596752, "step": 39180 }, { "epoch": 5.83631218349717, "grad_norm": 2.294325828552246, "learning_rate": 2.2073149946670556e-05, "loss": 0.5768, "num_input_tokens_seen": 22599376, "step": 39185 }, { "epoch": 5.8370568960381295, "grad_norm": 2.1551172733306885, "learning_rate": 2.2066695894008595e-05, "loss": 0.7411, "num_input_tokens_seen": 22602288, "step": 39190 }, { "epoch": 5.837801608579088, "grad_norm": 1.7268617153167725, "learning_rate": 2.2060242039567393e-05, "loss": 0.7213, "num_input_tokens_seen": 22605264, "step": 39195 }, { "epoch": 5.838546321120048, "grad_norm": 1.7991188764572144, "learning_rate": 2.205378838378308e-05, "loss": 0.6387, "num_input_tokens_seen": 22608208, "step": 39200 }, { "epoch": 5.839291033661007, "grad_norm": 1.1651588678359985, "learning_rate": 2.204733492709178e-05, "loss": 0.4073, "num_input_tokens_seen": 22610960, "step": 39205 }, { "epoch": 5.840035746201966, "grad_norm": 1.5522533655166626, "learning_rate": 2.2040881669929582e-05, "loss": 0.6423, "num_input_tokens_seen": 22613840, "step": 39210 }, { "epoch": 5.840780458742925, "grad_norm": 1.3880798816680908, "learning_rate": 2.203442861273256e-05, "loss": 0.6283, "num_input_tokens_seen": 22616592, "step": 39215 }, { "epoch": 5.841525171283885, "grad_norm": 1.2623828649520874, "learning_rate": 2.202797575593679e-05, "loss": 0.6507, "num_input_tokens_seen": 22619376, "step": 39220 }, { "epoch": 5.8422698838248435, "grad_norm": 1.7882622480392456, "learning_rate": 2.2021523099978347e-05, "loss": 0.6101, "num_input_tokens_seen": 22622160, "step": 39225 }, { "epoch": 5.843014596365803, "grad_norm": 1.8195451498031616, "learning_rate": 2.2015070645293257e-05, "loss": 0.7133, "num_input_tokens_seen": 22624816, "step": 39230 }, { "epoch": 5.843759308906762, "grad_norm": 1.4732519388198853, "learning_rate": 2.2008618392317557e-05, "loss": 0.5661, "num_input_tokens_seen": 22627568, "step": 39235 }, { "epoch": 5.8445040214477215, "grad_norm": 1.1800098419189453, "learning_rate": 2.2002166341487267e-05, "loss": 0.4894, "num_input_tokens_seen": 22630192, "step": 39240 }, { "epoch": 5.84524873398868, "grad_norm": 2.181980848312378, "learning_rate": 2.1995714493238383e-05, "loss": 0.6503, "num_input_tokens_seen": 22633104, "step": 39245 }, { "epoch": 5.84599344652964, "grad_norm": 3.2872259616851807, "learning_rate": 2.1989262848006912e-05, "loss": 0.5641, "num_input_tokens_seen": 22636112, "step": 39250 }, { "epoch": 5.846738159070599, "grad_norm": 2.284857988357544, "learning_rate": 2.1982811406228805e-05, "loss": 0.5274, "num_input_tokens_seen": 22638960, "step": 39255 }, { "epoch": 5.847482871611557, "grad_norm": 1.661457896232605, "learning_rate": 2.1976360168340042e-05, "loss": 0.7373, "num_input_tokens_seen": 22641808, "step": 39260 }, { "epoch": 5.848227584152517, "grad_norm": 1.999822735786438, "learning_rate": 2.1969909134776555e-05, "loss": 0.6312, "num_input_tokens_seen": 22644560, "step": 39265 }, { "epoch": 5.848972296693477, "grad_norm": 2.157865524291992, "learning_rate": 2.1963458305974297e-05, "loss": 0.6023, "num_input_tokens_seen": 22647248, "step": 39270 }, { "epoch": 5.8497170092344355, "grad_norm": 2.5975236892700195, "learning_rate": 2.1957007682369182e-05, "loss": 0.5235, "num_input_tokens_seen": 22650000, "step": 39275 }, { "epoch": 5.850461721775394, "grad_norm": 2.538501024246216, "learning_rate": 2.195055726439711e-05, "loss": 0.4599, "num_input_tokens_seen": 22652752, "step": 39280 }, { "epoch": 5.851206434316354, "grad_norm": 3.09379506111145, "learning_rate": 2.1944107052493984e-05, "loss": 0.6045, "num_input_tokens_seen": 22655952, "step": 39285 }, { "epoch": 5.8519511468573135, "grad_norm": 1.9721640348434448, "learning_rate": 2.1937657047095687e-05, "loss": 0.6646, "num_input_tokens_seen": 22658480, "step": 39290 }, { "epoch": 5.852695859398272, "grad_norm": 2.070240020751953, "learning_rate": 2.193120724863807e-05, "loss": 0.7829, "num_input_tokens_seen": 22661488, "step": 39295 }, { "epoch": 5.853440571939231, "grad_norm": 1.5235605239868164, "learning_rate": 2.1924757657556986e-05, "loss": 0.6433, "num_input_tokens_seen": 22664496, "step": 39300 }, { "epoch": 5.854185284480191, "grad_norm": 0.942798376083374, "learning_rate": 2.1918308274288278e-05, "loss": 0.4742, "num_input_tokens_seen": 22667376, "step": 39305 }, { "epoch": 5.8549299970211495, "grad_norm": 1.3458360433578491, "learning_rate": 2.191185909926777e-05, "loss": 0.5738, "num_input_tokens_seen": 22670096, "step": 39310 }, { "epoch": 5.855674709562109, "grad_norm": 2.022554397583008, "learning_rate": 2.1905410132931263e-05, "loss": 0.6715, "num_input_tokens_seen": 22672752, "step": 39315 }, { "epoch": 5.856419422103068, "grad_norm": 1.585520625114441, "learning_rate": 2.1898961375714567e-05, "loss": 0.5776, "num_input_tokens_seen": 22676016, "step": 39320 }, { "epoch": 5.8571641346440275, "grad_norm": 3.4071085453033447, "learning_rate": 2.1892512828053443e-05, "loss": 0.7316, "num_input_tokens_seen": 22678992, "step": 39325 }, { "epoch": 5.857908847184986, "grad_norm": 2.7747387886047363, "learning_rate": 2.1886064490383682e-05, "loss": 0.6345, "num_input_tokens_seen": 22681840, "step": 39330 }, { "epoch": 5.858653559725946, "grad_norm": 3.2581706047058105, "learning_rate": 2.1879616363141012e-05, "loss": 0.8095, "num_input_tokens_seen": 22684624, "step": 39335 }, { "epoch": 5.859398272266905, "grad_norm": 1.2875046730041504, "learning_rate": 2.1873168446761184e-05, "loss": 0.4139, "num_input_tokens_seen": 22687280, "step": 39340 }, { "epoch": 5.860142984807864, "grad_norm": 1.6872912645339966, "learning_rate": 2.1866720741679918e-05, "loss": 0.721, "num_input_tokens_seen": 22690480, "step": 39345 }, { "epoch": 5.860887697348823, "grad_norm": 1.1950634717941284, "learning_rate": 2.186027324833292e-05, "loss": 0.5604, "num_input_tokens_seen": 22693744, "step": 39350 }, { "epoch": 5.861632409889783, "grad_norm": 1.1783865690231323, "learning_rate": 2.18538259671559e-05, "loss": 0.6559, "num_input_tokens_seen": 22696816, "step": 39355 }, { "epoch": 5.8623771224307415, "grad_norm": 2.1484498977661133, "learning_rate": 2.1847378898584524e-05, "loss": 0.6097, "num_input_tokens_seen": 22699632, "step": 39360 }, { "epoch": 5.863121834971701, "grad_norm": 1.240354299545288, "learning_rate": 2.184093204305446e-05, "loss": 0.6469, "num_input_tokens_seen": 22702640, "step": 39365 }, { "epoch": 5.86386654751266, "grad_norm": 1.2927286624908447, "learning_rate": 2.1834485401001384e-05, "loss": 0.4626, "num_input_tokens_seen": 22705648, "step": 39370 }, { "epoch": 5.8646112600536195, "grad_norm": 1.8823974132537842, "learning_rate": 2.1828038972860904e-05, "loss": 0.6796, "num_input_tokens_seen": 22708144, "step": 39375 }, { "epoch": 5.865355972594578, "grad_norm": 1.3403072357177734, "learning_rate": 2.182159275906865e-05, "loss": 0.6462, "num_input_tokens_seen": 22710896, "step": 39380 }, { "epoch": 5.866100685135538, "grad_norm": 1.004963994026184, "learning_rate": 2.1815146760060234e-05, "loss": 0.516, "num_input_tokens_seen": 22713872, "step": 39385 }, { "epoch": 5.866845397676497, "grad_norm": 4.905611038208008, "learning_rate": 2.1808700976271256e-05, "loss": 0.4244, "num_input_tokens_seen": 22716688, "step": 39390 }, { "epoch": 5.867590110217456, "grad_norm": 4.239940166473389, "learning_rate": 2.1802255408137286e-05, "loss": 0.7168, "num_input_tokens_seen": 22719344, "step": 39395 }, { "epoch": 5.868334822758415, "grad_norm": 2.360311269760132, "learning_rate": 2.1795810056093896e-05, "loss": 0.7467, "num_input_tokens_seen": 22722032, "step": 39400 }, { "epoch": 5.869079535299375, "grad_norm": 3.3575127124786377, "learning_rate": 2.178936492057664e-05, "loss": 0.5669, "num_input_tokens_seen": 22724848, "step": 39405 }, { "epoch": 5.8698242478403335, "grad_norm": 3.5513510704040527, "learning_rate": 2.1782920002021054e-05, "loss": 0.425, "num_input_tokens_seen": 22728880, "step": 39410 }, { "epoch": 5.870568960381293, "grad_norm": 2.4256527423858643, "learning_rate": 2.1776475300862646e-05, "loss": 0.7606, "num_input_tokens_seen": 22731568, "step": 39415 }, { "epoch": 5.871313672922252, "grad_norm": 2.9550631046295166, "learning_rate": 2.1770030817536928e-05, "loss": 0.6313, "num_input_tokens_seen": 22734416, "step": 39420 }, { "epoch": 5.872058385463212, "grad_norm": 1.6161015033721924, "learning_rate": 2.17635865524794e-05, "loss": 0.4779, "num_input_tokens_seen": 22737392, "step": 39425 }, { "epoch": 5.87280309800417, "grad_norm": 1.3531893491744995, "learning_rate": 2.1757142506125534e-05, "loss": 0.5584, "num_input_tokens_seen": 22739920, "step": 39430 }, { "epoch": 5.87354781054513, "grad_norm": 3.921734571456909, "learning_rate": 2.1750698678910788e-05, "loss": 0.6519, "num_input_tokens_seen": 22742992, "step": 39435 }, { "epoch": 5.874292523086089, "grad_norm": 2.4700002670288086, "learning_rate": 2.174425507127062e-05, "loss": 0.6146, "num_input_tokens_seen": 22746096, "step": 39440 }, { "epoch": 5.8750372356270475, "grad_norm": 1.9946106672286987, "learning_rate": 2.1737811683640455e-05, "loss": 0.6381, "num_input_tokens_seen": 22749200, "step": 39445 }, { "epoch": 5.875781948168007, "grad_norm": 1.275775671005249, "learning_rate": 2.1731368516455723e-05, "loss": 0.5207, "num_input_tokens_seen": 22752080, "step": 39450 }, { "epoch": 5.876526660708967, "grad_norm": 1.626586675643921, "learning_rate": 2.1724925570151806e-05, "loss": 0.6769, "num_input_tokens_seen": 22755152, "step": 39455 }, { "epoch": 5.8772713732499255, "grad_norm": 1.4622114896774292, "learning_rate": 2.171848284516411e-05, "loss": 0.7239, "num_input_tokens_seen": 22757968, "step": 39460 }, { "epoch": 5.878016085790884, "grad_norm": 1.0732954740524292, "learning_rate": 2.1712040341927998e-05, "loss": 0.6362, "num_input_tokens_seen": 22761008, "step": 39465 }, { "epoch": 5.878760798331844, "grad_norm": 2.932389497756958, "learning_rate": 2.170559806087883e-05, "loss": 0.6829, "num_input_tokens_seen": 22763856, "step": 39470 }, { "epoch": 5.879505510872804, "grad_norm": 0.5703359842300415, "learning_rate": 2.1699156002451954e-05, "loss": 0.6808, "num_input_tokens_seen": 22766480, "step": 39475 }, { "epoch": 5.880250223413762, "grad_norm": 2.5913913249969482, "learning_rate": 2.169271416708269e-05, "loss": 0.5651, "num_input_tokens_seen": 22769072, "step": 39480 }, { "epoch": 5.880994935954721, "grad_norm": 1.3207793235778809, "learning_rate": 2.1686272555206363e-05, "loss": 0.6263, "num_input_tokens_seen": 22771888, "step": 39485 }, { "epoch": 5.881739648495681, "grad_norm": 2.2215282917022705, "learning_rate": 2.1679831167258267e-05, "loss": 0.7079, "num_input_tokens_seen": 22774768, "step": 39490 }, { "epoch": 5.8824843610366395, "grad_norm": 3.6557092666625977, "learning_rate": 2.1673390003673678e-05, "loss": 0.6042, "num_input_tokens_seen": 22777584, "step": 39495 }, { "epoch": 5.883229073577599, "grad_norm": 1.5419689416885376, "learning_rate": 2.1666949064887862e-05, "loss": 0.8487, "num_input_tokens_seen": 22780080, "step": 39500 }, { "epoch": 5.883973786118558, "grad_norm": 1.6992278099060059, "learning_rate": 2.1660508351336086e-05, "loss": 0.5763, "num_input_tokens_seen": 22782960, "step": 39505 }, { "epoch": 5.884718498659518, "grad_norm": 2.718332290649414, "learning_rate": 2.1654067863453568e-05, "loss": 0.7407, "num_input_tokens_seen": 22785648, "step": 39510 }, { "epoch": 5.885463211200476, "grad_norm": 1.137168288230896, "learning_rate": 2.1647627601675542e-05, "loss": 0.5982, "num_input_tokens_seen": 22788432, "step": 39515 }, { "epoch": 5.886207923741436, "grad_norm": 1.4405858516693115, "learning_rate": 2.164118756643722e-05, "loss": 0.6973, "num_input_tokens_seen": 22791504, "step": 39520 }, { "epoch": 5.886952636282395, "grad_norm": 1.8318867683410645, "learning_rate": 2.163474775817378e-05, "loss": 0.7198, "num_input_tokens_seen": 22794256, "step": 39525 }, { "epoch": 5.887697348823354, "grad_norm": 1.8854931592941284, "learning_rate": 2.1628308177320418e-05, "loss": 0.6892, "num_input_tokens_seen": 22797104, "step": 39530 }, { "epoch": 5.888442061364313, "grad_norm": 1.103901743888855, "learning_rate": 2.1621868824312264e-05, "loss": 0.7224, "num_input_tokens_seen": 22799952, "step": 39535 }, { "epoch": 5.889186773905273, "grad_norm": 1.4193390607833862, "learning_rate": 2.161542969958449e-05, "loss": 0.4216, "num_input_tokens_seen": 22802608, "step": 39540 }, { "epoch": 5.8899314864462315, "grad_norm": 1.2897106409072876, "learning_rate": 2.160899080357221e-05, "loss": 0.5296, "num_input_tokens_seen": 22805936, "step": 39545 }, { "epoch": 5.890676198987191, "grad_norm": 1.3823431730270386, "learning_rate": 2.1602552136710543e-05, "loss": 0.5719, "num_input_tokens_seen": 22808848, "step": 39550 }, { "epoch": 5.89142091152815, "grad_norm": 1.3323675394058228, "learning_rate": 2.1596113699434597e-05, "loss": 0.6272, "num_input_tokens_seen": 22811696, "step": 39555 }, { "epoch": 5.89216562406911, "grad_norm": 0.747104287147522, "learning_rate": 2.1589675492179444e-05, "loss": 0.4885, "num_input_tokens_seen": 22814864, "step": 39560 }, { "epoch": 5.892910336610068, "grad_norm": 1.1862399578094482, "learning_rate": 2.1583237515380153e-05, "loss": 0.7066, "num_input_tokens_seen": 22817872, "step": 39565 }, { "epoch": 5.893655049151028, "grad_norm": 2.6579525470733643, "learning_rate": 2.1576799769471787e-05, "loss": 0.7558, "num_input_tokens_seen": 22820720, "step": 39570 }, { "epoch": 5.894399761691987, "grad_norm": 2.0100512504577637, "learning_rate": 2.157036225488938e-05, "loss": 0.6967, "num_input_tokens_seen": 22823536, "step": 39575 }, { "epoch": 5.895144474232946, "grad_norm": 1.5732660293579102, "learning_rate": 2.1563924972067934e-05, "loss": 0.586, "num_input_tokens_seen": 22826608, "step": 39580 }, { "epoch": 5.895889186773905, "grad_norm": 1.4576330184936523, "learning_rate": 2.155748792144247e-05, "loss": 0.645, "num_input_tokens_seen": 22829488, "step": 39585 }, { "epoch": 5.896633899314865, "grad_norm": 1.2110148668289185, "learning_rate": 2.1551051103447982e-05, "loss": 0.55, "num_input_tokens_seen": 22832464, "step": 39590 }, { "epoch": 5.897378611855824, "grad_norm": 1.410814642906189, "learning_rate": 2.1544614518519434e-05, "loss": 0.5326, "num_input_tokens_seen": 22835184, "step": 39595 }, { "epoch": 5.898123324396783, "grad_norm": 1.482724905014038, "learning_rate": 2.1538178167091787e-05, "loss": 0.5866, "num_input_tokens_seen": 22838000, "step": 39600 }, { "epoch": 5.898868036937742, "grad_norm": 1.3848968744277954, "learning_rate": 2.153174204959999e-05, "loss": 0.6154, "num_input_tokens_seen": 22840880, "step": 39605 }, { "epoch": 5.899612749478701, "grad_norm": 1.2938096523284912, "learning_rate": 2.1525306166478957e-05, "loss": 0.4606, "num_input_tokens_seen": 22844176, "step": 39610 }, { "epoch": 5.90035746201966, "grad_norm": 2.017176628112793, "learning_rate": 2.151887051816362e-05, "loss": 0.5593, "num_input_tokens_seen": 22846992, "step": 39615 }, { "epoch": 5.90110217456062, "grad_norm": 1.7844247817993164, "learning_rate": 2.1512435105088847e-05, "loss": 0.5118, "num_input_tokens_seen": 22849680, "step": 39620 }, { "epoch": 5.901846887101579, "grad_norm": 2.0602803230285645, "learning_rate": 2.1505999927689536e-05, "loss": 0.6528, "num_input_tokens_seen": 22852688, "step": 39625 }, { "epoch": 5.9025915996425375, "grad_norm": 1.90241277217865, "learning_rate": 2.149956498640054e-05, "loss": 0.7666, "num_input_tokens_seen": 22855952, "step": 39630 }, { "epoch": 5.903336312183497, "grad_norm": 2.810302257537842, "learning_rate": 2.1493130281656708e-05, "loss": 0.5883, "num_input_tokens_seen": 22858768, "step": 39635 }, { "epoch": 5.904081024724457, "grad_norm": 2.4664218425750732, "learning_rate": 2.1486695813892883e-05, "loss": 0.9395, "num_input_tokens_seen": 22861424, "step": 39640 }, { "epoch": 5.904825737265416, "grad_norm": 2.604527711868286, "learning_rate": 2.1480261583543866e-05, "loss": 0.655, "num_input_tokens_seen": 22864336, "step": 39645 }, { "epoch": 5.905570449806374, "grad_norm": 2.0694000720977783, "learning_rate": 2.1473827591044464e-05, "loss": 0.639, "num_input_tokens_seen": 22867088, "step": 39650 }, { "epoch": 5.906315162347334, "grad_norm": 8.537372589111328, "learning_rate": 2.1467393836829454e-05, "loss": 0.8247, "num_input_tokens_seen": 22869936, "step": 39655 }, { "epoch": 5.907059874888293, "grad_norm": 2.5570871829986572, "learning_rate": 2.146096032133361e-05, "loss": 0.5592, "num_input_tokens_seen": 22872976, "step": 39660 }, { "epoch": 5.907804587429252, "grad_norm": 1.4637341499328613, "learning_rate": 2.1454527044991673e-05, "loss": 0.5526, "num_input_tokens_seen": 22875824, "step": 39665 }, { "epoch": 5.908549299970211, "grad_norm": 1.3554273843765259, "learning_rate": 2.144809400823839e-05, "loss": 0.5385, "num_input_tokens_seen": 22878672, "step": 39670 }, { "epoch": 5.909294012511171, "grad_norm": 1.8022944927215576, "learning_rate": 2.1441661211508465e-05, "loss": 0.4583, "num_input_tokens_seen": 22881392, "step": 39675 }, { "epoch": 5.91003872505213, "grad_norm": 2.9605491161346436, "learning_rate": 2.1435228655236608e-05, "loss": 0.5749, "num_input_tokens_seen": 22884144, "step": 39680 }, { "epoch": 5.910783437593089, "grad_norm": 1.3377050161361694, "learning_rate": 2.1428796339857513e-05, "loss": 0.727, "num_input_tokens_seen": 22886960, "step": 39685 }, { "epoch": 5.911528150134048, "grad_norm": 1.1195106506347656, "learning_rate": 2.1422364265805832e-05, "loss": 0.67, "num_input_tokens_seen": 22889520, "step": 39690 }, { "epoch": 5.912272862675008, "grad_norm": 2.0698297023773193, "learning_rate": 2.1415932433516243e-05, "loss": 0.6575, "num_input_tokens_seen": 22892240, "step": 39695 }, { "epoch": 5.913017575215966, "grad_norm": 1.4512511491775513, "learning_rate": 2.140950084342336e-05, "loss": 0.5243, "num_input_tokens_seen": 22895024, "step": 39700 }, { "epoch": 5.913762287756926, "grad_norm": 1.735695719718933, "learning_rate": 2.1403069495961813e-05, "loss": 0.5755, "num_input_tokens_seen": 22898000, "step": 39705 }, { "epoch": 5.914507000297885, "grad_norm": 3.2601566314697266, "learning_rate": 2.13966383915662e-05, "loss": 0.5438, "num_input_tokens_seen": 22900848, "step": 39710 }, { "epoch": 5.915251712838844, "grad_norm": 1.4157569408416748, "learning_rate": 2.1390207530671115e-05, "loss": 0.611, "num_input_tokens_seen": 22903728, "step": 39715 }, { "epoch": 5.915996425379803, "grad_norm": 2.5993056297302246, "learning_rate": 2.1383776913711135e-05, "loss": 0.5295, "num_input_tokens_seen": 22906480, "step": 39720 }, { "epoch": 5.916741137920763, "grad_norm": 2.528913974761963, "learning_rate": 2.1377346541120803e-05, "loss": 0.5174, "num_input_tokens_seen": 22909072, "step": 39725 }, { "epoch": 5.917485850461722, "grad_norm": 1.9693316221237183, "learning_rate": 2.1370916413334663e-05, "loss": 0.618, "num_input_tokens_seen": 22912368, "step": 39730 }, { "epoch": 5.918230563002681, "grad_norm": 4.019305229187012, "learning_rate": 2.1364486530787247e-05, "loss": 0.5714, "num_input_tokens_seen": 22915088, "step": 39735 }, { "epoch": 5.91897527554364, "grad_norm": 2.165886402130127, "learning_rate": 2.1358056893913047e-05, "loss": 0.6883, "num_input_tokens_seen": 22918064, "step": 39740 }, { "epoch": 5.9197199880846, "grad_norm": 2.219785213470459, "learning_rate": 2.1351627503146547e-05, "loss": 0.5724, "num_input_tokens_seen": 22920688, "step": 39745 }, { "epoch": 5.920464700625558, "grad_norm": 2.949622869491577, "learning_rate": 2.134519835892223e-05, "loss": 0.7314, "num_input_tokens_seen": 22923760, "step": 39750 }, { "epoch": 5.921209413166518, "grad_norm": 1.5533407926559448, "learning_rate": 2.133876946167455e-05, "loss": 0.3921, "num_input_tokens_seen": 22926704, "step": 39755 }, { "epoch": 5.921954125707477, "grad_norm": 1.922654390335083, "learning_rate": 2.1332340811837944e-05, "loss": 0.4816, "num_input_tokens_seen": 22929520, "step": 39760 }, { "epoch": 5.9226988382484365, "grad_norm": 2.773313283920288, "learning_rate": 2.1325912409846834e-05, "loss": 0.7864, "num_input_tokens_seen": 22932496, "step": 39765 }, { "epoch": 5.923443550789395, "grad_norm": 3.2248809337615967, "learning_rate": 2.131948425613563e-05, "loss": 0.7187, "num_input_tokens_seen": 22935216, "step": 39770 }, { "epoch": 5.924188263330355, "grad_norm": 2.045590877532959, "learning_rate": 2.1313056351138715e-05, "loss": 0.6835, "num_input_tokens_seen": 22938224, "step": 39775 }, { "epoch": 5.924932975871314, "grad_norm": 3.5983166694641113, "learning_rate": 2.1306628695290458e-05, "loss": 0.7488, "num_input_tokens_seen": 22941040, "step": 39780 }, { "epoch": 5.925677688412273, "grad_norm": 2.247861385345459, "learning_rate": 2.1300201289025215e-05, "loss": 0.6443, "num_input_tokens_seen": 22943504, "step": 39785 }, { "epoch": 5.926422400953232, "grad_norm": 2.3074309825897217, "learning_rate": 2.1293774132777332e-05, "loss": 0.8716, "num_input_tokens_seen": 22946160, "step": 39790 }, { "epoch": 5.927167113494191, "grad_norm": 2.2304956912994385, "learning_rate": 2.128734722698112e-05, "loss": 0.5924, "num_input_tokens_seen": 22949296, "step": 39795 }, { "epoch": 5.92791182603515, "grad_norm": 2.3457658290863037, "learning_rate": 2.128092057207089e-05, "loss": 0.5427, "num_input_tokens_seen": 22952176, "step": 39800 }, { "epoch": 5.92865653857611, "grad_norm": 1.53602135181427, "learning_rate": 2.127449416848093e-05, "loss": 0.551, "num_input_tokens_seen": 22954864, "step": 39805 }, { "epoch": 5.929401251117069, "grad_norm": 1.3603785037994385, "learning_rate": 2.1268068016645505e-05, "loss": 0.7086, "num_input_tokens_seen": 22957680, "step": 39810 }, { "epoch": 5.930145963658028, "grad_norm": 1.4921929836273193, "learning_rate": 2.1261642116998877e-05, "loss": 0.537, "num_input_tokens_seen": 22960432, "step": 39815 }, { "epoch": 5.930890676198987, "grad_norm": 2.7773003578186035, "learning_rate": 2.1255216469975265e-05, "loss": 0.6967, "num_input_tokens_seen": 22963120, "step": 39820 }, { "epoch": 5.931635388739946, "grad_norm": 3.748589277267456, "learning_rate": 2.1248791076008906e-05, "loss": 0.4979, "num_input_tokens_seen": 22965872, "step": 39825 }, { "epoch": 5.932380101280906, "grad_norm": 1.4728528261184692, "learning_rate": 2.1242365935533988e-05, "loss": 0.5478, "num_input_tokens_seen": 22968528, "step": 39830 }, { "epoch": 5.933124813821864, "grad_norm": 3.832109212875366, "learning_rate": 2.123594104898471e-05, "loss": 0.7893, "num_input_tokens_seen": 22971440, "step": 39835 }, { "epoch": 5.933869526362824, "grad_norm": 2.6343750953674316, "learning_rate": 2.1229516416795224e-05, "loss": 0.5231, "num_input_tokens_seen": 22974320, "step": 39840 }, { "epoch": 5.934614238903783, "grad_norm": 1.7758958339691162, "learning_rate": 2.1223092039399695e-05, "loss": 0.5573, "num_input_tokens_seen": 22977616, "step": 39845 }, { "epoch": 5.9353589514447425, "grad_norm": 5.446143627166748, "learning_rate": 2.121666791723225e-05, "loss": 0.743, "num_input_tokens_seen": 22980432, "step": 39850 }, { "epoch": 5.936103663985701, "grad_norm": 3.1233057975769043, "learning_rate": 2.1210244050727014e-05, "loss": 0.4142, "num_input_tokens_seen": 22983216, "step": 39855 }, { "epoch": 5.936848376526661, "grad_norm": 7.2298712730407715, "learning_rate": 2.1203820440318063e-05, "loss": 0.6625, "num_input_tokens_seen": 22985936, "step": 39860 }, { "epoch": 5.93759308906762, "grad_norm": 2.427999496459961, "learning_rate": 2.1197397086439495e-05, "loss": 0.6314, "num_input_tokens_seen": 22988528, "step": 39865 }, { "epoch": 5.938337801608579, "grad_norm": 0.988109827041626, "learning_rate": 2.1190973989525377e-05, "loss": 0.5761, "num_input_tokens_seen": 22991344, "step": 39870 }, { "epoch": 5.939082514149538, "grad_norm": 1.6428760290145874, "learning_rate": 2.118455115000974e-05, "loss": 0.759, "num_input_tokens_seen": 22994096, "step": 39875 }, { "epoch": 5.939827226690498, "grad_norm": 2.0287301540374756, "learning_rate": 2.117812856832663e-05, "loss": 0.5044, "num_input_tokens_seen": 22997040, "step": 39880 }, { "epoch": 5.940571939231456, "grad_norm": 3.4689066410064697, "learning_rate": 2.1171706244910055e-05, "loss": 0.4879, "num_input_tokens_seen": 22999824, "step": 39885 }, { "epoch": 5.941316651772416, "grad_norm": 1.822202444076538, "learning_rate": 2.1165284180194003e-05, "loss": 0.7076, "num_input_tokens_seen": 23002896, "step": 39890 }, { "epoch": 5.942061364313375, "grad_norm": 1.69612717628479, "learning_rate": 2.1158862374612465e-05, "loss": 0.551, "num_input_tokens_seen": 23005616, "step": 39895 }, { "epoch": 5.9428060768543345, "grad_norm": 1.0658012628555298, "learning_rate": 2.1152440828599383e-05, "loss": 0.4851, "num_input_tokens_seen": 23008656, "step": 39900 }, { "epoch": 5.943550789395293, "grad_norm": 1.6856420040130615, "learning_rate": 2.114601954258871e-05, "loss": 0.4803, "num_input_tokens_seen": 23011952, "step": 39905 }, { "epoch": 5.944295501936253, "grad_norm": 1.888779878616333, "learning_rate": 2.113959851701436e-05, "loss": 0.7274, "num_input_tokens_seen": 23015280, "step": 39910 }, { "epoch": 5.945040214477212, "grad_norm": 1.2740226984024048, "learning_rate": 2.1133177752310252e-05, "loss": 0.6276, "num_input_tokens_seen": 23018480, "step": 39915 }, { "epoch": 5.945784927018171, "grad_norm": 2.030832052230835, "learning_rate": 2.112675724891027e-05, "loss": 0.7346, "num_input_tokens_seen": 23021168, "step": 39920 }, { "epoch": 5.94652963955913, "grad_norm": 2.288905382156372, "learning_rate": 2.1120337007248284e-05, "loss": 0.5359, "num_input_tokens_seen": 23023952, "step": 39925 }, { "epoch": 5.94727435210009, "grad_norm": 2.673200845718384, "learning_rate": 2.1113917027758145e-05, "loss": 0.5851, "num_input_tokens_seen": 23026800, "step": 39930 }, { "epoch": 5.9480190646410485, "grad_norm": 1.2377753257751465, "learning_rate": 2.1107497310873708e-05, "loss": 0.5615, "num_input_tokens_seen": 23029552, "step": 39935 }, { "epoch": 5.948763777182008, "grad_norm": 1.9176419973373413, "learning_rate": 2.1101077857028774e-05, "loss": 0.6827, "num_input_tokens_seen": 23032336, "step": 39940 }, { "epoch": 5.949508489722967, "grad_norm": 2.0748965740203857, "learning_rate": 2.1094658666657137e-05, "loss": 0.5346, "num_input_tokens_seen": 23035248, "step": 39945 }, { "epoch": 5.9502532022639265, "grad_norm": 5.213842868804932, "learning_rate": 2.1088239740192588e-05, "loss": 0.5423, "num_input_tokens_seen": 23037904, "step": 39950 }, { "epoch": 5.950997914804885, "grad_norm": 2.4624273777008057, "learning_rate": 2.1081821078068902e-05, "loss": 0.8647, "num_input_tokens_seen": 23040688, "step": 39955 }, { "epoch": 5.951742627345844, "grad_norm": 2.9786558151245117, "learning_rate": 2.1075402680719814e-05, "loss": 0.7906, "num_input_tokens_seen": 23043632, "step": 39960 }, { "epoch": 5.952487339886804, "grad_norm": 2.808546304702759, "learning_rate": 2.1068984548579053e-05, "loss": 0.5991, "num_input_tokens_seen": 23046160, "step": 39965 }, { "epoch": 5.953232052427763, "grad_norm": 2.0347609519958496, "learning_rate": 2.106256668208034e-05, "loss": 0.8961, "num_input_tokens_seen": 23049008, "step": 39970 }, { "epoch": 5.953976764968722, "grad_norm": 1.3183070421218872, "learning_rate": 2.1056149081657368e-05, "loss": 0.6448, "num_input_tokens_seen": 23051792, "step": 39975 }, { "epoch": 5.954721477509681, "grad_norm": 3.103307008743286, "learning_rate": 2.1049731747743793e-05, "loss": 0.5646, "num_input_tokens_seen": 23054768, "step": 39980 }, { "epoch": 5.9554661900506405, "grad_norm": 1.7950098514556885, "learning_rate": 2.104331468077329e-05, "loss": 0.7149, "num_input_tokens_seen": 23057680, "step": 39985 }, { "epoch": 5.9562109025916, "grad_norm": 3.614771604537964, "learning_rate": 2.10368978811795e-05, "loss": 0.814, "num_input_tokens_seen": 23060944, "step": 39990 }, { "epoch": 5.956955615132559, "grad_norm": 2.53389573097229, "learning_rate": 2.1030481349396028e-05, "loss": 0.5656, "num_input_tokens_seen": 23063792, "step": 39995 }, { "epoch": 5.957700327673518, "grad_norm": 1.1541510820388794, "learning_rate": 2.1024065085856498e-05, "loss": 0.6237, "num_input_tokens_seen": 23066896, "step": 40000 }, { "epoch": 5.958445040214477, "grad_norm": 1.3327428102493286, "learning_rate": 2.1017649090994477e-05, "loss": 0.7391, "num_input_tokens_seen": 23069712, "step": 40005 }, { "epoch": 5.959189752755436, "grad_norm": 1.3665997982025146, "learning_rate": 2.1011233365243538e-05, "loss": 0.5687, "num_input_tokens_seen": 23072624, "step": 40010 }, { "epoch": 5.959934465296396, "grad_norm": 0.9031476974487305, "learning_rate": 2.1004817909037245e-05, "loss": 0.6217, "num_input_tokens_seen": 23075504, "step": 40015 }, { "epoch": 5.9606791778373545, "grad_norm": 1.5579609870910645, "learning_rate": 2.0998402722809105e-05, "loss": 0.3785, "num_input_tokens_seen": 23078352, "step": 40020 }, { "epoch": 5.961423890378314, "grad_norm": 3.358502149581909, "learning_rate": 2.0991987806992635e-05, "loss": 0.6192, "num_input_tokens_seen": 23081200, "step": 40025 }, { "epoch": 5.962168602919273, "grad_norm": 1.378495216369629, "learning_rate": 2.0985573162021337e-05, "loss": 0.6256, "num_input_tokens_seen": 23083824, "step": 40030 }, { "epoch": 5.9629133154602325, "grad_norm": 1.7605373859405518, "learning_rate": 2.0979158788328684e-05, "loss": 0.5609, "num_input_tokens_seen": 23086928, "step": 40035 }, { "epoch": 5.963658028001191, "grad_norm": 1.7723993062973022, "learning_rate": 2.097274468634813e-05, "loss": 0.6563, "num_input_tokens_seen": 23089648, "step": 40040 }, { "epoch": 5.964402740542151, "grad_norm": 1.4316060543060303, "learning_rate": 2.0966330856513118e-05, "loss": 0.5402, "num_input_tokens_seen": 23092688, "step": 40045 }, { "epoch": 5.96514745308311, "grad_norm": 3.144556760787964, "learning_rate": 2.095991729925707e-05, "loss": 0.7958, "num_input_tokens_seen": 23095824, "step": 40050 }, { "epoch": 5.965892165624069, "grad_norm": 2.3208954334259033, "learning_rate": 2.095350401501339e-05, "loss": 0.6565, "num_input_tokens_seen": 23098768, "step": 40055 }, { "epoch": 5.966636878165028, "grad_norm": 1.2996718883514404, "learning_rate": 2.094709100421545e-05, "loss": 0.5789, "num_input_tokens_seen": 23101488, "step": 40060 }, { "epoch": 5.967381590705988, "grad_norm": 2.032219886779785, "learning_rate": 2.094067826729662e-05, "loss": 0.7242, "num_input_tokens_seen": 23104528, "step": 40065 }, { "epoch": 5.9681263032469465, "grad_norm": 2.3295342922210693, "learning_rate": 2.093426580469025e-05, "loss": 0.5435, "num_input_tokens_seen": 23107408, "step": 40070 }, { "epoch": 5.968871015787906, "grad_norm": 1.3897494077682495, "learning_rate": 2.0927853616829668e-05, "loss": 0.5333, "num_input_tokens_seen": 23109904, "step": 40075 }, { "epoch": 5.969615728328865, "grad_norm": 2.180928945541382, "learning_rate": 2.0921441704148177e-05, "loss": 0.6541, "num_input_tokens_seen": 23112688, "step": 40080 }, { "epoch": 5.9703604408698245, "grad_norm": 1.8738491535186768, "learning_rate": 2.0915030067079084e-05, "loss": 0.5291, "num_input_tokens_seen": 23115568, "step": 40085 }, { "epoch": 5.971105153410783, "grad_norm": 3.6336610317230225, "learning_rate": 2.090861870605564e-05, "loss": 0.5926, "num_input_tokens_seen": 23118256, "step": 40090 }, { "epoch": 5.971849865951743, "grad_norm": 1.3079272508621216, "learning_rate": 2.0902207621511123e-05, "loss": 0.6622, "num_input_tokens_seen": 23121360, "step": 40095 }, { "epoch": 5.972594578492702, "grad_norm": 2.5834012031555176, "learning_rate": 2.0895796813878743e-05, "loss": 0.7008, "num_input_tokens_seen": 23124272, "step": 40100 }, { "epoch": 5.973339291033661, "grad_norm": 2.0138604640960693, "learning_rate": 2.0889386283591732e-05, "loss": 0.7866, "num_input_tokens_seen": 23127120, "step": 40105 }, { "epoch": 5.97408400357462, "grad_norm": 1.6448001861572266, "learning_rate": 2.088297603108328e-05, "loss": 0.5054, "num_input_tokens_seen": 23129968, "step": 40110 }, { "epoch": 5.97482871611558, "grad_norm": 2.893390655517578, "learning_rate": 2.0876566056786572e-05, "loss": 0.4786, "num_input_tokens_seen": 23132784, "step": 40115 }, { "epoch": 5.9755734286565385, "grad_norm": 1.0960464477539062, "learning_rate": 2.087015636113477e-05, "loss": 0.7877, "num_input_tokens_seen": 23135792, "step": 40120 }, { "epoch": 5.976318141197497, "grad_norm": 3.5408077239990234, "learning_rate": 2.0863746944561e-05, "loss": 0.5725, "num_input_tokens_seen": 23138608, "step": 40125 }, { "epoch": 5.977062853738457, "grad_norm": 2.4389736652374268, "learning_rate": 2.0857337807498398e-05, "loss": 0.5794, "num_input_tokens_seen": 23141424, "step": 40130 }, { "epoch": 5.977807566279417, "grad_norm": 2.9142603874206543, "learning_rate": 2.085092895038007e-05, "loss": 0.763, "num_input_tokens_seen": 23144240, "step": 40135 }, { "epoch": 5.978552278820375, "grad_norm": 2.1208314895629883, "learning_rate": 2.08445203736391e-05, "loss": 0.7251, "num_input_tokens_seen": 23147216, "step": 40140 }, { "epoch": 5.979296991361334, "grad_norm": 1.3930071592330933, "learning_rate": 2.0838112077708533e-05, "loss": 0.6279, "num_input_tokens_seen": 23150160, "step": 40145 }, { "epoch": 5.980041703902294, "grad_norm": 3.092636823654175, "learning_rate": 2.0831704063021433e-05, "loss": 0.7744, "num_input_tokens_seen": 23153104, "step": 40150 }, { "epoch": 5.980786416443253, "grad_norm": 1.5389801263809204, "learning_rate": 2.0825296330010834e-05, "loss": 0.6207, "num_input_tokens_seen": 23155984, "step": 40155 }, { "epoch": 5.981531128984212, "grad_norm": 1.5075348615646362, "learning_rate": 2.0818888879109728e-05, "loss": 0.6957, "num_input_tokens_seen": 23158896, "step": 40160 }, { "epoch": 5.982275841525171, "grad_norm": 1.9270124435424805, "learning_rate": 2.0812481710751115e-05, "loss": 0.5999, "num_input_tokens_seen": 23161904, "step": 40165 }, { "epoch": 5.9830205540661305, "grad_norm": 3.9562888145446777, "learning_rate": 2.0806074825367965e-05, "loss": 0.7865, "num_input_tokens_seen": 23164752, "step": 40170 }, { "epoch": 5.983765266607089, "grad_norm": 1.2679297924041748, "learning_rate": 2.079966822339322e-05, "loss": 0.7334, "num_input_tokens_seen": 23167600, "step": 40175 }, { "epoch": 5.984509979148049, "grad_norm": 1.7051663398742676, "learning_rate": 2.079326190525983e-05, "loss": 0.5083, "num_input_tokens_seen": 23170896, "step": 40180 }, { "epoch": 5.985254691689008, "grad_norm": 5.09141731262207, "learning_rate": 2.0786855871400695e-05, "loss": 0.7102, "num_input_tokens_seen": 23173712, "step": 40185 }, { "epoch": 5.985999404229967, "grad_norm": 1.7940806150436401, "learning_rate": 2.0780450122248706e-05, "loss": 0.6259, "num_input_tokens_seen": 23176528, "step": 40190 }, { "epoch": 5.986744116770926, "grad_norm": 1.1631287336349487, "learning_rate": 2.0774044658236742e-05, "loss": 0.5033, "num_input_tokens_seen": 23179312, "step": 40195 }, { "epoch": 5.987488829311886, "grad_norm": 2.8090980052948, "learning_rate": 2.0767639479797663e-05, "loss": 0.5842, "num_input_tokens_seen": 23182352, "step": 40200 }, { "epoch": 5.9882335418528445, "grad_norm": 4.510656833648682, "learning_rate": 2.0761234587364294e-05, "loss": 0.5614, "num_input_tokens_seen": 23185424, "step": 40205 }, { "epoch": 5.988978254393804, "grad_norm": 2.586383581161499, "learning_rate": 2.0754829981369458e-05, "loss": 0.7661, "num_input_tokens_seen": 23188368, "step": 40210 }, { "epoch": 5.989722966934763, "grad_norm": 1.4355779886245728, "learning_rate": 2.074842566224596e-05, "loss": 0.6994, "num_input_tokens_seen": 23191312, "step": 40215 }, { "epoch": 5.990467679475723, "grad_norm": 1.7513113021850586, "learning_rate": 2.074202163042657e-05, "loss": 0.6774, "num_input_tokens_seen": 23194448, "step": 40220 }, { "epoch": 5.991212392016681, "grad_norm": 1.514003872871399, "learning_rate": 2.0735617886344043e-05, "loss": 0.621, "num_input_tokens_seen": 23197104, "step": 40225 }, { "epoch": 5.991957104557641, "grad_norm": 1.4762405157089233, "learning_rate": 2.0729214430431118e-05, "loss": 0.5841, "num_input_tokens_seen": 23199792, "step": 40230 }, { "epoch": 5.9927018170986, "grad_norm": 1.77494215965271, "learning_rate": 2.0722811263120523e-05, "loss": 0.6784, "num_input_tokens_seen": 23202736, "step": 40235 }, { "epoch": 5.993446529639559, "grad_norm": 1.9401262998580933, "learning_rate": 2.071640838484495e-05, "loss": 0.654, "num_input_tokens_seen": 23206032, "step": 40240 }, { "epoch": 5.994191242180518, "grad_norm": 1.3547289371490479, "learning_rate": 2.0710005796037078e-05, "loss": 0.5425, "num_input_tokens_seen": 23209168, "step": 40245 }, { "epoch": 5.994935954721478, "grad_norm": 3.9602108001708984, "learning_rate": 2.0703603497129584e-05, "loss": 0.7345, "num_input_tokens_seen": 23211952, "step": 40250 }, { "epoch": 5.9956806672624365, "grad_norm": 1.7031248807907104, "learning_rate": 2.0697201488555087e-05, "loss": 0.7222, "num_input_tokens_seen": 23214832, "step": 40255 }, { "epoch": 5.996425379803396, "grad_norm": 2.8078267574310303, "learning_rate": 2.0690799770746232e-05, "loss": 0.7685, "num_input_tokens_seen": 23217776, "step": 40260 }, { "epoch": 5.997170092344355, "grad_norm": 1.6459324359893799, "learning_rate": 2.06843983441356e-05, "loss": 0.3607, "num_input_tokens_seen": 23220560, "step": 40265 }, { "epoch": 5.997914804885315, "grad_norm": 2.977128505706787, "learning_rate": 2.0677997209155785e-05, "loss": 0.5077, "num_input_tokens_seen": 23223056, "step": 40270 }, { "epoch": 5.998659517426273, "grad_norm": 2.634549617767334, "learning_rate": 2.0671596366239343e-05, "loss": 0.5968, "num_input_tokens_seen": 23225872, "step": 40275 }, { "epoch": 5.999404229967233, "grad_norm": 1.6643141508102417, "learning_rate": 2.066519581581882e-05, "loss": 0.6494, "num_input_tokens_seen": 23228752, "step": 40280 }, { "epoch": 6.0, "eval_loss": 0.6619589328765869, "eval_runtime": 74.2521, "eval_samples_per_second": 40.187, "eval_steps_per_second": 10.047, "num_input_tokens_seen": 23230504, "step": 40284 }, { "epoch": 6.000148942508192, "grad_norm": 2.7491133213043213, "learning_rate": 2.0658795558326743e-05, "loss": 0.5724, "num_input_tokens_seen": 23231080, "step": 40285 }, { "epoch": 6.000893655049151, "grad_norm": 2.2588415145874023, "learning_rate": 2.065239559419561e-05, "loss": 0.618, "num_input_tokens_seen": 23233672, "step": 40290 }, { "epoch": 6.00163836759011, "grad_norm": 2.0233328342437744, "learning_rate": 2.0645995923857902e-05, "loss": 0.5271, "num_input_tokens_seen": 23236488, "step": 40295 }, { "epoch": 6.00238308013107, "grad_norm": 1.966078758239746, "learning_rate": 2.0639596547746104e-05, "loss": 0.6336, "num_input_tokens_seen": 23239656, "step": 40300 }, { "epoch": 6.003127792672029, "grad_norm": 1.9921754598617554, "learning_rate": 2.0633197466292633e-05, "loss": 0.6084, "num_input_tokens_seen": 23242824, "step": 40305 }, { "epoch": 6.003872505212988, "grad_norm": 1.9958714246749878, "learning_rate": 2.062679867992992e-05, "loss": 0.5745, "num_input_tokens_seen": 23245704, "step": 40310 }, { "epoch": 6.004617217753947, "grad_norm": 2.000988483428955, "learning_rate": 2.062040018909037e-05, "loss": 0.5962, "num_input_tokens_seen": 23248648, "step": 40315 }, { "epoch": 6.005361930294906, "grad_norm": 2.507948160171509, "learning_rate": 2.0614001994206378e-05, "loss": 0.7518, "num_input_tokens_seen": 23251432, "step": 40320 }, { "epoch": 6.006106642835865, "grad_norm": 2.253998279571533, "learning_rate": 2.060760409571029e-05, "loss": 0.6322, "num_input_tokens_seen": 23254312, "step": 40325 }, { "epoch": 6.006851355376824, "grad_norm": 2.3669159412384033, "learning_rate": 2.0601206494034465e-05, "loss": 0.5416, "num_input_tokens_seen": 23257352, "step": 40330 }, { "epoch": 6.007596067917784, "grad_norm": 1.9221549034118652, "learning_rate": 2.0594809189611218e-05, "loss": 0.7079, "num_input_tokens_seen": 23260264, "step": 40335 }, { "epoch": 6.0083407804587425, "grad_norm": 1.2660247087478638, "learning_rate": 2.058841218287287e-05, "loss": 0.6887, "num_input_tokens_seen": 23262920, "step": 40340 }, { "epoch": 6.009085492999702, "grad_norm": 1.5852854251861572, "learning_rate": 2.0582015474251672e-05, "loss": 0.5988, "num_input_tokens_seen": 23265640, "step": 40345 }, { "epoch": 6.009830205540661, "grad_norm": 2.9522721767425537, "learning_rate": 2.0575619064179912e-05, "loss": 0.5812, "num_input_tokens_seen": 23268424, "step": 40350 }, { "epoch": 6.010574918081621, "grad_norm": 1.8192795515060425, "learning_rate": 2.0569222953089827e-05, "loss": 0.557, "num_input_tokens_seen": 23271112, "step": 40355 }, { "epoch": 6.011319630622579, "grad_norm": 1.199645757675171, "learning_rate": 2.0562827141413637e-05, "loss": 0.6664, "num_input_tokens_seen": 23274088, "step": 40360 }, { "epoch": 6.012064343163539, "grad_norm": 1.165197730064392, "learning_rate": 2.0556431629583557e-05, "loss": 0.4929, "num_input_tokens_seen": 23277000, "step": 40365 }, { "epoch": 6.012809055704498, "grad_norm": 1.5084476470947266, "learning_rate": 2.0550036418031752e-05, "loss": 0.6692, "num_input_tokens_seen": 23279752, "step": 40370 }, { "epoch": 6.013553768245457, "grad_norm": 2.091339588165283, "learning_rate": 2.0543641507190396e-05, "loss": 0.543, "num_input_tokens_seen": 23282888, "step": 40375 }, { "epoch": 6.014298480786416, "grad_norm": 1.1040314435958862, "learning_rate": 2.0537246897491638e-05, "loss": 0.6249, "num_input_tokens_seen": 23285832, "step": 40380 }, { "epoch": 6.015043193327376, "grad_norm": 2.2632784843444824, "learning_rate": 2.0530852589367587e-05, "loss": 0.6662, "num_input_tokens_seen": 23288776, "step": 40385 }, { "epoch": 6.015787905868335, "grad_norm": 2.617166757583618, "learning_rate": 2.052445858325034e-05, "loss": 0.5428, "num_input_tokens_seen": 23291816, "step": 40390 }, { "epoch": 6.016532618409294, "grad_norm": 1.9704465866088867, "learning_rate": 2.051806487957199e-05, "loss": 0.5584, "num_input_tokens_seen": 23294824, "step": 40395 }, { "epoch": 6.017277330950253, "grad_norm": 1.0301216840744019, "learning_rate": 2.0511671478764593e-05, "loss": 0.5159, "num_input_tokens_seen": 23297768, "step": 40400 }, { "epoch": 6.018022043491213, "grad_norm": 2.894562005996704, "learning_rate": 2.0505278381260187e-05, "loss": 0.6344, "num_input_tokens_seen": 23300648, "step": 40405 }, { "epoch": 6.018766756032171, "grad_norm": 1.5968953371047974, "learning_rate": 2.0498885587490794e-05, "loss": 0.5549, "num_input_tokens_seen": 23303720, "step": 40410 }, { "epoch": 6.019511468573131, "grad_norm": 4.585181713104248, "learning_rate": 2.0492493097888414e-05, "loss": 0.6336, "num_input_tokens_seen": 23306536, "step": 40415 }, { "epoch": 6.02025618111409, "grad_norm": 2.212047576904297, "learning_rate": 2.0486100912885036e-05, "loss": 0.7776, "num_input_tokens_seen": 23309384, "step": 40420 }, { "epoch": 6.021000893655049, "grad_norm": 1.6493191719055176, "learning_rate": 2.047970903291259e-05, "loss": 0.7986, "num_input_tokens_seen": 23312008, "step": 40425 }, { "epoch": 6.021745606196008, "grad_norm": 2.337421417236328, "learning_rate": 2.0473317458403036e-05, "loss": 0.558, "num_input_tokens_seen": 23314792, "step": 40430 }, { "epoch": 6.022490318736968, "grad_norm": 2.0351343154907227, "learning_rate": 2.0466926189788286e-05, "loss": 0.6891, "num_input_tokens_seen": 23317352, "step": 40435 }, { "epoch": 6.023235031277927, "grad_norm": 1.9635902643203735, "learning_rate": 2.0460535227500226e-05, "loss": 0.6006, "num_input_tokens_seen": 23320136, "step": 40440 }, { "epoch": 6.023979743818886, "grad_norm": 1.8568756580352783, "learning_rate": 2.045414457197074e-05, "loss": 0.5777, "num_input_tokens_seen": 23322856, "step": 40445 }, { "epoch": 6.024724456359845, "grad_norm": 1.932905912399292, "learning_rate": 2.044775422363169e-05, "loss": 0.6875, "num_input_tokens_seen": 23325736, "step": 40450 }, { "epoch": 6.025469168900805, "grad_norm": 3.323518991470337, "learning_rate": 2.0441364182914893e-05, "loss": 0.5541, "num_input_tokens_seen": 23328840, "step": 40455 }, { "epoch": 6.026213881441763, "grad_norm": 1.909246563911438, "learning_rate": 2.0434974450252183e-05, "loss": 0.5856, "num_input_tokens_seen": 23331816, "step": 40460 }, { "epoch": 6.026958593982723, "grad_norm": 1.5016541481018066, "learning_rate": 2.042858502607533e-05, "loss": 0.4056, "num_input_tokens_seen": 23334504, "step": 40465 }, { "epoch": 6.027703306523682, "grad_norm": 1.7996435165405273, "learning_rate": 2.0422195910816116e-05, "loss": 0.5394, "num_input_tokens_seen": 23337192, "step": 40470 }, { "epoch": 6.0284480190646414, "grad_norm": 2.4466421604156494, "learning_rate": 2.041580710490629e-05, "loss": 0.7492, "num_input_tokens_seen": 23340104, "step": 40475 }, { "epoch": 6.0291927316056, "grad_norm": 1.9428790807724, "learning_rate": 2.040941860877758e-05, "loss": 0.5629, "num_input_tokens_seen": 23342888, "step": 40480 }, { "epoch": 6.02993744414656, "grad_norm": 1.0863945484161377, "learning_rate": 2.04030304228617e-05, "loss": 0.5262, "num_input_tokens_seen": 23346024, "step": 40485 }, { "epoch": 6.030682156687519, "grad_norm": 3.112698554992676, "learning_rate": 2.039664254759033e-05, "loss": 0.426, "num_input_tokens_seen": 23348744, "step": 40490 }, { "epoch": 6.031426869228477, "grad_norm": 1.8282301425933838, "learning_rate": 2.0390254983395146e-05, "loss": 0.7222, "num_input_tokens_seen": 23351720, "step": 40495 }, { "epoch": 6.032171581769437, "grad_norm": 1.7426700592041016, "learning_rate": 2.038386773070779e-05, "loss": 0.6985, "num_input_tokens_seen": 23354728, "step": 40500 }, { "epoch": 6.032916294310396, "grad_norm": 1.552909016609192, "learning_rate": 2.0377480789959882e-05, "loss": 0.565, "num_input_tokens_seen": 23357928, "step": 40505 }, { "epoch": 6.033661006851355, "grad_norm": 2.892393112182617, "learning_rate": 2.0371094161583026e-05, "loss": 0.5642, "num_input_tokens_seen": 23360488, "step": 40510 }, { "epoch": 6.034405719392314, "grad_norm": 1.7947227954864502, "learning_rate": 2.036470784600881e-05, "loss": 0.5919, "num_input_tokens_seen": 23363240, "step": 40515 }, { "epoch": 6.035150431933274, "grad_norm": 1.3554940223693848, "learning_rate": 2.035832184366879e-05, "loss": 0.5835, "num_input_tokens_seen": 23366344, "step": 40520 }, { "epoch": 6.035895144474233, "grad_norm": 1.718639612197876, "learning_rate": 2.0351936154994503e-05, "loss": 0.7463, "num_input_tokens_seen": 23369416, "step": 40525 }, { "epoch": 6.036639857015192, "grad_norm": 3.5856826305389404, "learning_rate": 2.034555078041748e-05, "loss": 0.4929, "num_input_tokens_seen": 23372168, "step": 40530 }, { "epoch": 6.037384569556151, "grad_norm": 1.717910885810852, "learning_rate": 2.0339165720369207e-05, "loss": 0.6541, "num_input_tokens_seen": 23375336, "step": 40535 }, { "epoch": 6.038129282097111, "grad_norm": 4.672068119049072, "learning_rate": 2.0332780975281177e-05, "loss": 0.2941, "num_input_tokens_seen": 23377864, "step": 40540 }, { "epoch": 6.038873994638069, "grad_norm": 2.143040895462036, "learning_rate": 2.0326396545584822e-05, "loss": 0.6494, "num_input_tokens_seen": 23380840, "step": 40545 }, { "epoch": 6.039618707179029, "grad_norm": 3.864388942718506, "learning_rate": 2.032001243171159e-05, "loss": 0.6451, "num_input_tokens_seen": 23383656, "step": 40550 }, { "epoch": 6.040363419719988, "grad_norm": 2.8741209506988525, "learning_rate": 2.0313628634092887e-05, "loss": 0.696, "num_input_tokens_seen": 23386792, "step": 40555 }, { "epoch": 6.0411081322609474, "grad_norm": 1.6709449291229248, "learning_rate": 2.030724515316011e-05, "loss": 0.5289, "num_input_tokens_seen": 23389480, "step": 40560 }, { "epoch": 6.041852844801906, "grad_norm": 1.9083969593048096, "learning_rate": 2.0300861989344627e-05, "loss": 0.4241, "num_input_tokens_seen": 23392424, "step": 40565 }, { "epoch": 6.042597557342866, "grad_norm": 2.1640162467956543, "learning_rate": 2.0294479143077783e-05, "loss": 0.6008, "num_input_tokens_seen": 23395336, "step": 40570 }, { "epoch": 6.043342269883825, "grad_norm": 1.8488448858261108, "learning_rate": 2.0288096614790905e-05, "loss": 0.6247, "num_input_tokens_seen": 23398152, "step": 40575 }, { "epoch": 6.044086982424784, "grad_norm": 1.1609162092208862, "learning_rate": 2.0281714404915313e-05, "loss": 0.3768, "num_input_tokens_seen": 23401064, "step": 40580 }, { "epoch": 6.044831694965743, "grad_norm": 6.680792331695557, "learning_rate": 2.027533251388227e-05, "loss": 0.6083, "num_input_tokens_seen": 23404008, "step": 40585 }, { "epoch": 6.045576407506703, "grad_norm": 1.6159428358078003, "learning_rate": 2.0268950942123046e-05, "loss": 0.5169, "num_input_tokens_seen": 23407144, "step": 40590 }, { "epoch": 6.046321120047661, "grad_norm": 2.825772285461426, "learning_rate": 2.0262569690068882e-05, "loss": 0.7292, "num_input_tokens_seen": 23409832, "step": 40595 }, { "epoch": 6.047065832588621, "grad_norm": 1.559603214263916, "learning_rate": 2.0256188758151e-05, "loss": 0.387, "num_input_tokens_seen": 23412616, "step": 40600 }, { "epoch": 6.04781054512958, "grad_norm": 2.5098233222961426, "learning_rate": 2.024980814680059e-05, "loss": 0.4972, "num_input_tokens_seen": 23415272, "step": 40605 }, { "epoch": 6.0485552576705395, "grad_norm": 1.8807978630065918, "learning_rate": 2.0243427856448834e-05, "loss": 0.5792, "num_input_tokens_seen": 23418280, "step": 40610 }, { "epoch": 6.049299970211498, "grad_norm": 3.393430709838867, "learning_rate": 2.0237047887526887e-05, "loss": 0.8554, "num_input_tokens_seen": 23421192, "step": 40615 }, { "epoch": 6.050044682752458, "grad_norm": 1.5942598581314087, "learning_rate": 2.0230668240465886e-05, "loss": 0.6489, "num_input_tokens_seen": 23424072, "step": 40620 }, { "epoch": 6.050789395293417, "grad_norm": 2.9295594692230225, "learning_rate": 2.0224288915696924e-05, "loss": 0.7566, "num_input_tokens_seen": 23427080, "step": 40625 }, { "epoch": 6.051534107834376, "grad_norm": 4.312087535858154, "learning_rate": 2.0217909913651102e-05, "loss": 0.5814, "num_input_tokens_seen": 23429928, "step": 40630 }, { "epoch": 6.052278820375335, "grad_norm": 2.7009518146514893, "learning_rate": 2.0211531234759487e-05, "loss": 0.5632, "num_input_tokens_seen": 23432456, "step": 40635 }, { "epoch": 6.053023532916295, "grad_norm": 2.516382932662964, "learning_rate": 2.020515287945312e-05, "loss": 0.5436, "num_input_tokens_seen": 23435432, "step": 40640 }, { "epoch": 6.0537682454572534, "grad_norm": 2.5805888175964355, "learning_rate": 2.0198774848163027e-05, "loss": 0.5899, "num_input_tokens_seen": 23438216, "step": 40645 }, { "epoch": 6.054512957998213, "grad_norm": 1.7970491647720337, "learning_rate": 2.0192397141320212e-05, "loss": 0.5469, "num_input_tokens_seen": 23441192, "step": 40650 }, { "epoch": 6.055257670539172, "grad_norm": 1.5539156198501587, "learning_rate": 2.018601975935565e-05, "loss": 0.6279, "num_input_tokens_seen": 23444040, "step": 40655 }, { "epoch": 6.0560023830801315, "grad_norm": 2.41904354095459, "learning_rate": 2.017964270270031e-05, "loss": 0.5026, "num_input_tokens_seen": 23446568, "step": 40660 }, { "epoch": 6.05674709562109, "grad_norm": 3.4612526893615723, "learning_rate": 2.0173265971785108e-05, "loss": 0.6005, "num_input_tokens_seen": 23449384, "step": 40665 }, { "epoch": 6.057491808162049, "grad_norm": 1.899480938911438, "learning_rate": 2.0166889567040973e-05, "loss": 0.7546, "num_input_tokens_seen": 23452232, "step": 40670 }, { "epoch": 6.058236520703009, "grad_norm": 3.3733396530151367, "learning_rate": 2.0160513488898784e-05, "loss": 0.5355, "num_input_tokens_seen": 23455080, "step": 40675 }, { "epoch": 6.058981233243967, "grad_norm": 1.9183762073516846, "learning_rate": 2.0154137737789426e-05, "loss": 0.3746, "num_input_tokens_seen": 23457992, "step": 40680 }, { "epoch": 6.059725945784927, "grad_norm": 2.4779856204986572, "learning_rate": 2.0147762314143727e-05, "loss": 0.5846, "num_input_tokens_seen": 23460616, "step": 40685 }, { "epoch": 6.060470658325886, "grad_norm": 1.6221692562103271, "learning_rate": 2.0141387218392525e-05, "loss": 0.6169, "num_input_tokens_seen": 23463528, "step": 40690 }, { "epoch": 6.0612153708668455, "grad_norm": 3.1950461864471436, "learning_rate": 2.0135012450966632e-05, "loss": 0.6815, "num_input_tokens_seen": 23466664, "step": 40695 }, { "epoch": 6.061960083407804, "grad_norm": 1.8034156560897827, "learning_rate": 2.0128638012296817e-05, "loss": 0.3843, "num_input_tokens_seen": 23469416, "step": 40700 }, { "epoch": 6.062704795948764, "grad_norm": 1.5378836393356323, "learning_rate": 2.0122263902813832e-05, "loss": 0.5734, "num_input_tokens_seen": 23472680, "step": 40705 }, { "epoch": 6.063449508489723, "grad_norm": 2.1144824028015137, "learning_rate": 2.011589012294842e-05, "loss": 0.5306, "num_input_tokens_seen": 23475688, "step": 40710 }, { "epoch": 6.064194221030682, "grad_norm": 2.2539963722229004, "learning_rate": 2.01095166731313e-05, "loss": 0.564, "num_input_tokens_seen": 23478920, "step": 40715 }, { "epoch": 6.064938933571641, "grad_norm": 2.5041375160217285, "learning_rate": 2.0103143553793158e-05, "loss": 0.6007, "num_input_tokens_seen": 23481832, "step": 40720 }, { "epoch": 6.065683646112601, "grad_norm": 1.95842707157135, "learning_rate": 2.0096770765364665e-05, "loss": 0.5997, "num_input_tokens_seen": 23484680, "step": 40725 }, { "epoch": 6.0664283586535594, "grad_norm": 2.679690361022949, "learning_rate": 2.009039830827647e-05, "loss": 0.7244, "num_input_tokens_seen": 23487688, "step": 40730 }, { "epoch": 6.067173071194519, "grad_norm": 3.6398940086364746, "learning_rate": 2.0084026182959195e-05, "loss": 0.6317, "num_input_tokens_seen": 23490472, "step": 40735 }, { "epoch": 6.067917783735478, "grad_norm": 1.8865926265716553, "learning_rate": 2.0077654389843455e-05, "loss": 0.5555, "num_input_tokens_seen": 23493256, "step": 40740 }, { "epoch": 6.0686624962764375, "grad_norm": 1.3214715719223022, "learning_rate": 2.0071282929359802e-05, "loss": 0.5083, "num_input_tokens_seen": 23496072, "step": 40745 }, { "epoch": 6.069407208817396, "grad_norm": 2.3117868900299072, "learning_rate": 2.0064911801938822e-05, "loss": 0.5066, "num_input_tokens_seen": 23499048, "step": 40750 }, { "epoch": 6.070151921358356, "grad_norm": 3.499434471130371, "learning_rate": 2.0058541008011028e-05, "loss": 0.7797, "num_input_tokens_seen": 23501736, "step": 40755 }, { "epoch": 6.070896633899315, "grad_norm": 1.2823396921157837, "learning_rate": 2.0052170548006944e-05, "loss": 0.6252, "num_input_tokens_seen": 23504584, "step": 40760 }, { "epoch": 6.071641346440274, "grad_norm": 1.6681532859802246, "learning_rate": 2.0045800422357066e-05, "loss": 0.5202, "num_input_tokens_seen": 23507272, "step": 40765 }, { "epoch": 6.072386058981233, "grad_norm": 2.184582471847534, "learning_rate": 2.003943063149184e-05, "loss": 0.565, "num_input_tokens_seen": 23510408, "step": 40770 }, { "epoch": 6.073130771522193, "grad_norm": 2.474863290786743, "learning_rate": 2.003306117584173e-05, "loss": 0.6543, "num_input_tokens_seen": 23513192, "step": 40775 }, { "epoch": 6.0738754840631515, "grad_norm": 1.4318596124649048, "learning_rate": 2.0026692055837155e-05, "loss": 0.5203, "num_input_tokens_seen": 23516200, "step": 40780 }, { "epoch": 6.074620196604111, "grad_norm": 2.2741763591766357, "learning_rate": 2.0020323271908518e-05, "loss": 0.5696, "num_input_tokens_seen": 23519080, "step": 40785 }, { "epoch": 6.07536490914507, "grad_norm": 1.7614539861679077, "learning_rate": 2.0013954824486176e-05, "loss": 0.5242, "num_input_tokens_seen": 23521768, "step": 40790 }, { "epoch": 6.0761096216860295, "grad_norm": 1.2537670135498047, "learning_rate": 2.0007586714000497e-05, "loss": 0.5715, "num_input_tokens_seen": 23524712, "step": 40795 }, { "epoch": 6.076854334226988, "grad_norm": 1.7024112939834595, "learning_rate": 2.000121894088181e-05, "loss": 0.4494, "num_input_tokens_seen": 23527816, "step": 40800 }, { "epoch": 6.077599046767948, "grad_norm": 2.5476553440093994, "learning_rate": 1.9994851505560424e-05, "loss": 0.5204, "num_input_tokens_seen": 23530760, "step": 40805 }, { "epoch": 6.078343759308907, "grad_norm": 1.689098596572876, "learning_rate": 1.9988484408466622e-05, "loss": 0.566, "num_input_tokens_seen": 23533576, "step": 40810 }, { "epoch": 6.079088471849866, "grad_norm": 3.0142741203308105, "learning_rate": 1.9982117650030674e-05, "loss": 0.6186, "num_input_tokens_seen": 23536712, "step": 40815 }, { "epoch": 6.079833184390825, "grad_norm": 2.1700809001922607, "learning_rate": 1.9975751230682808e-05, "loss": 0.6413, "num_input_tokens_seen": 23539592, "step": 40820 }, { "epoch": 6.080577896931785, "grad_norm": 3.456023931503296, "learning_rate": 1.996938515085326e-05, "loss": 0.7, "num_input_tokens_seen": 23542536, "step": 40825 }, { "epoch": 6.0813226094727435, "grad_norm": 4.179172992706299, "learning_rate": 1.9963019410972194e-05, "loss": 0.7742, "num_input_tokens_seen": 23545320, "step": 40830 }, { "epoch": 6.082067322013703, "grad_norm": 1.798619270324707, "learning_rate": 1.9956654011469808e-05, "loss": 0.5693, "num_input_tokens_seen": 23548008, "step": 40835 }, { "epoch": 6.082812034554662, "grad_norm": 1.5512428283691406, "learning_rate": 1.995028895277623e-05, "loss": 0.5588, "num_input_tokens_seen": 23550728, "step": 40840 }, { "epoch": 6.083556747095621, "grad_norm": 1.6905869245529175, "learning_rate": 1.9943924235321605e-05, "loss": 0.7163, "num_input_tokens_seen": 23553768, "step": 40845 }, { "epoch": 6.08430145963658, "grad_norm": 1.9591608047485352, "learning_rate": 1.9937559859536016e-05, "loss": 0.7456, "num_input_tokens_seen": 23556488, "step": 40850 }, { "epoch": 6.085046172177539, "grad_norm": 5.130188465118408, "learning_rate": 1.9931195825849544e-05, "loss": 0.7107, "num_input_tokens_seen": 23559048, "step": 40855 }, { "epoch": 6.085790884718499, "grad_norm": 2.7032973766326904, "learning_rate": 1.9924832134692262e-05, "loss": 0.6105, "num_input_tokens_seen": 23561768, "step": 40860 }, { "epoch": 6.0865355972594575, "grad_norm": 1.5452581644058228, "learning_rate": 1.991846878649419e-05, "loss": 0.5639, "num_input_tokens_seen": 23564680, "step": 40865 }, { "epoch": 6.087280309800417, "grad_norm": 2.2843985557556152, "learning_rate": 1.991210578168533e-05, "loss": 0.7965, "num_input_tokens_seen": 23567656, "step": 40870 }, { "epoch": 6.088025022341376, "grad_norm": 2.9615120887756348, "learning_rate": 1.9905743120695675e-05, "loss": 0.624, "num_input_tokens_seen": 23570888, "step": 40875 }, { "epoch": 6.0887697348823355, "grad_norm": 3.713557481765747, "learning_rate": 1.9899380803955193e-05, "loss": 0.3843, "num_input_tokens_seen": 23573736, "step": 40880 }, { "epoch": 6.089514447423294, "grad_norm": 3.408534049987793, "learning_rate": 1.9893018831893816e-05, "loss": 0.5119, "num_input_tokens_seen": 23576392, "step": 40885 }, { "epoch": 6.090259159964254, "grad_norm": 1.338747262954712, "learning_rate": 1.9886657204941458e-05, "loss": 0.513, "num_input_tokens_seen": 23579176, "step": 40890 }, { "epoch": 6.091003872505213, "grad_norm": 2.6042823791503906, "learning_rate": 1.9880295923528025e-05, "loss": 0.54, "num_input_tokens_seen": 23582024, "step": 40895 }, { "epoch": 6.091748585046172, "grad_norm": 2.0381271839141846, "learning_rate": 1.9873934988083373e-05, "loss": 0.7043, "num_input_tokens_seen": 23584968, "step": 40900 }, { "epoch": 6.092493297587131, "grad_norm": 2.0723938941955566, "learning_rate": 1.9867574399037365e-05, "loss": 0.5861, "num_input_tokens_seen": 23587976, "step": 40905 }, { "epoch": 6.093238010128091, "grad_norm": 2.5755820274353027, "learning_rate": 1.98612141568198e-05, "loss": 0.7356, "num_input_tokens_seen": 23590952, "step": 40910 }, { "epoch": 6.0939827226690495, "grad_norm": 2.3880696296691895, "learning_rate": 1.9854854261860496e-05, "loss": 0.5542, "num_input_tokens_seen": 23593736, "step": 40915 }, { "epoch": 6.094727435210009, "grad_norm": 2.228898048400879, "learning_rate": 1.9848494714589214e-05, "loss": 0.5857, "num_input_tokens_seen": 23596584, "step": 40920 }, { "epoch": 6.095472147750968, "grad_norm": 3.1877169609069824, "learning_rate": 1.9842135515435717e-05, "loss": 0.7014, "num_input_tokens_seen": 23599464, "step": 40925 }, { "epoch": 6.0962168602919276, "grad_norm": 2.349348545074463, "learning_rate": 1.9835776664829735e-05, "loss": 0.6452, "num_input_tokens_seen": 23602408, "step": 40930 }, { "epoch": 6.096961572832886, "grad_norm": 2.1657769680023193, "learning_rate": 1.9829418163200968e-05, "loss": 0.5895, "num_input_tokens_seen": 23605224, "step": 40935 }, { "epoch": 6.097706285373846, "grad_norm": 2.146587371826172, "learning_rate": 1.9823060010979096e-05, "loss": 0.564, "num_input_tokens_seen": 23608040, "step": 40940 }, { "epoch": 6.098450997914805, "grad_norm": 3.1936445236206055, "learning_rate": 1.9816702208593795e-05, "loss": 0.628, "num_input_tokens_seen": 23611048, "step": 40945 }, { "epoch": 6.099195710455764, "grad_norm": 3.123765230178833, "learning_rate": 1.9810344756474676e-05, "loss": 0.5597, "num_input_tokens_seen": 23613736, "step": 40950 }, { "epoch": 6.099940422996723, "grad_norm": 1.6198424100875854, "learning_rate": 1.9803987655051354e-05, "loss": 0.643, "num_input_tokens_seen": 23616648, "step": 40955 }, { "epoch": 6.100685135537683, "grad_norm": 2.150486946105957, "learning_rate": 1.979763090475342e-05, "loss": 0.7114, "num_input_tokens_seen": 23619432, "step": 40960 }, { "epoch": 6.1014298480786415, "grad_norm": 2.360438346862793, "learning_rate": 1.979127450601045e-05, "loss": 0.4881, "num_input_tokens_seen": 23622280, "step": 40965 }, { "epoch": 6.102174560619601, "grad_norm": 2.5565690994262695, "learning_rate": 1.978491845925196e-05, "loss": 0.6102, "num_input_tokens_seen": 23625384, "step": 40970 }, { "epoch": 6.10291927316056, "grad_norm": 1.5148890018463135, "learning_rate": 1.9778562764907475e-05, "loss": 0.6843, "num_input_tokens_seen": 23628488, "step": 40975 }, { "epoch": 6.10366398570152, "grad_norm": 1.344805359840393, "learning_rate": 1.97722074234065e-05, "loss": 0.4945, "num_input_tokens_seen": 23631560, "step": 40980 }, { "epoch": 6.104408698242478, "grad_norm": 1.1291652917861938, "learning_rate": 1.97658524351785e-05, "loss": 0.5485, "num_input_tokens_seen": 23634568, "step": 40985 }, { "epoch": 6.105153410783438, "grad_norm": 1.3060240745544434, "learning_rate": 1.9759497800652897e-05, "loss": 0.5859, "num_input_tokens_seen": 23637416, "step": 40990 }, { "epoch": 6.105898123324397, "grad_norm": 2.657588005065918, "learning_rate": 1.975314352025913e-05, "loss": 0.6359, "num_input_tokens_seen": 23640488, "step": 40995 }, { "epoch": 6.106642835865356, "grad_norm": 3.2227513790130615, "learning_rate": 1.9746789594426593e-05, "loss": 0.5393, "num_input_tokens_seen": 23643848, "step": 41000 }, { "epoch": 6.107387548406315, "grad_norm": 1.4265551567077637, "learning_rate": 1.9740436023584653e-05, "loss": 0.6739, "num_input_tokens_seen": 23646440, "step": 41005 }, { "epoch": 6.108132260947274, "grad_norm": 1.0306686162948608, "learning_rate": 1.973408280816267e-05, "loss": 0.5173, "num_input_tokens_seen": 23649224, "step": 41010 }, { "epoch": 6.1088769734882336, "grad_norm": 5.156706809997559, "learning_rate": 1.9727729948589955e-05, "loss": 0.6476, "num_input_tokens_seen": 23652072, "step": 41015 }, { "epoch": 6.109621686029192, "grad_norm": 1.8836896419525146, "learning_rate": 1.9721377445295813e-05, "loss": 0.6922, "num_input_tokens_seen": 23654760, "step": 41020 }, { "epoch": 6.110366398570152, "grad_norm": 2.2251336574554443, "learning_rate": 1.9715025298709532e-05, "loss": 0.6501, "num_input_tokens_seen": 23657672, "step": 41025 }, { "epoch": 6.111111111111111, "grad_norm": 6.142997741699219, "learning_rate": 1.970867350926035e-05, "loss": 0.639, "num_input_tokens_seen": 23660456, "step": 41030 }, { "epoch": 6.11185582365207, "grad_norm": 1.8418313264846802, "learning_rate": 1.9702322077377493e-05, "loss": 0.8587, "num_input_tokens_seen": 23662888, "step": 41035 }, { "epoch": 6.112600536193029, "grad_norm": 1.981797695159912, "learning_rate": 1.9695971003490175e-05, "loss": 0.5225, "num_input_tokens_seen": 23665800, "step": 41040 }, { "epoch": 6.113345248733989, "grad_norm": 1.834194540977478, "learning_rate": 1.9689620288027574e-05, "loss": 0.4842, "num_input_tokens_seen": 23668616, "step": 41045 }, { "epoch": 6.1140899612749475, "grad_norm": 1.3874672651290894, "learning_rate": 1.9683269931418842e-05, "loss": 0.6495, "num_input_tokens_seen": 23671496, "step": 41050 }, { "epoch": 6.114834673815907, "grad_norm": 3.622058391571045, "learning_rate": 1.9676919934093108e-05, "loss": 0.615, "num_input_tokens_seen": 23674408, "step": 41055 }, { "epoch": 6.115579386356866, "grad_norm": 1.354913353919983, "learning_rate": 1.9670570296479488e-05, "loss": 0.5503, "num_input_tokens_seen": 23677384, "step": 41060 }, { "epoch": 6.116324098897826, "grad_norm": 1.173365592956543, "learning_rate": 1.9664221019007065e-05, "loss": 0.5786, "num_input_tokens_seen": 23680392, "step": 41065 }, { "epoch": 6.117068811438784, "grad_norm": 1.7486516237258911, "learning_rate": 1.9657872102104882e-05, "loss": 0.5613, "num_input_tokens_seen": 23682888, "step": 41070 }, { "epoch": 6.117813523979744, "grad_norm": 1.4346867799758911, "learning_rate": 1.9651523546201982e-05, "loss": 0.5061, "num_input_tokens_seen": 23685736, "step": 41075 }, { "epoch": 6.118558236520703, "grad_norm": 1.377949595451355, "learning_rate": 1.9645175351727383e-05, "loss": 0.5478, "num_input_tokens_seen": 23688872, "step": 41080 }, { "epoch": 6.119302949061662, "grad_norm": 1.9959361553192139, "learning_rate": 1.9638827519110057e-05, "loss": 0.7478, "num_input_tokens_seen": 23691912, "step": 41085 }, { "epoch": 6.120047661602621, "grad_norm": 1.9828909635543823, "learning_rate": 1.9632480048778968e-05, "loss": 0.694, "num_input_tokens_seen": 23694600, "step": 41090 }, { "epoch": 6.120792374143581, "grad_norm": 2.935105085372925, "learning_rate": 1.962613294116306e-05, "loss": 0.5382, "num_input_tokens_seen": 23697768, "step": 41095 }, { "epoch": 6.1215370866845396, "grad_norm": 2.939143657684326, "learning_rate": 1.9619786196691238e-05, "loss": 0.6571, "num_input_tokens_seen": 23700424, "step": 41100 }, { "epoch": 6.122281799225499, "grad_norm": 1.1284652948379517, "learning_rate": 1.9613439815792394e-05, "loss": 0.4823, "num_input_tokens_seen": 23703528, "step": 41105 }, { "epoch": 6.123026511766458, "grad_norm": 2.54707670211792, "learning_rate": 1.9607093798895382e-05, "loss": 0.5358, "num_input_tokens_seen": 23706440, "step": 41110 }, { "epoch": 6.123771224307418, "grad_norm": 2.119899034500122, "learning_rate": 1.960074814642905e-05, "loss": 0.5265, "num_input_tokens_seen": 23709224, "step": 41115 }, { "epoch": 6.124515936848376, "grad_norm": 2.111539840698242, "learning_rate": 1.95944028588222e-05, "loss": 0.5727, "num_input_tokens_seen": 23711848, "step": 41120 }, { "epoch": 6.125260649389336, "grad_norm": 3.121502161026001, "learning_rate": 1.9588057936503627e-05, "loss": 0.5447, "num_input_tokens_seen": 23714888, "step": 41125 }, { "epoch": 6.126005361930295, "grad_norm": 2.1891632080078125, "learning_rate": 1.95817133799021e-05, "loss": 0.5898, "num_input_tokens_seen": 23717576, "step": 41130 }, { "epoch": 6.126750074471254, "grad_norm": 2.2334651947021484, "learning_rate": 1.957536918944635e-05, "loss": 0.5306, "num_input_tokens_seen": 23720680, "step": 41135 }, { "epoch": 6.127494787012213, "grad_norm": 1.7695592641830444, "learning_rate": 1.9569025365565095e-05, "loss": 0.6328, "num_input_tokens_seen": 23723624, "step": 41140 }, { "epoch": 6.128239499553173, "grad_norm": 1.489022135734558, "learning_rate": 1.9562681908687035e-05, "loss": 0.5453, "num_input_tokens_seen": 23726408, "step": 41145 }, { "epoch": 6.128984212094132, "grad_norm": 0.8679491877555847, "learning_rate": 1.955633881924082e-05, "loss": 0.5483, "num_input_tokens_seen": 23729064, "step": 41150 }, { "epoch": 6.129728924635091, "grad_norm": 2.1451661586761475, "learning_rate": 1.954999609765509e-05, "loss": 0.5026, "num_input_tokens_seen": 23732008, "step": 41155 }, { "epoch": 6.13047363717605, "grad_norm": 1.5592271089553833, "learning_rate": 1.9543653744358465e-05, "loss": 0.6131, "num_input_tokens_seen": 23734920, "step": 41160 }, { "epoch": 6.13121834971701, "grad_norm": 5.305970191955566, "learning_rate": 1.953731175977954e-05, "loss": 0.7448, "num_input_tokens_seen": 23737896, "step": 41165 }, { "epoch": 6.131963062257968, "grad_norm": 2.2411978244781494, "learning_rate": 1.9530970144346874e-05, "loss": 0.606, "num_input_tokens_seen": 23740712, "step": 41170 }, { "epoch": 6.132707774798928, "grad_norm": 2.550015449523926, "learning_rate": 1.9524628898489016e-05, "loss": 0.7289, "num_input_tokens_seen": 23744200, "step": 41175 }, { "epoch": 6.133452487339887, "grad_norm": 1.858259916305542, "learning_rate": 1.9518288022634468e-05, "loss": 0.5151, "num_input_tokens_seen": 23746984, "step": 41180 }, { "epoch": 6.134197199880846, "grad_norm": 1.5909032821655273, "learning_rate": 1.9511947517211742e-05, "loss": 0.5116, "num_input_tokens_seen": 23750024, "step": 41185 }, { "epoch": 6.134941912421805, "grad_norm": 1.5707471370697021, "learning_rate": 1.9505607382649276e-05, "loss": 0.587, "num_input_tokens_seen": 23753160, "step": 41190 }, { "epoch": 6.135686624962764, "grad_norm": 1.7485302686691284, "learning_rate": 1.9499267619375534e-05, "loss": 0.652, "num_input_tokens_seen": 23755976, "step": 41195 }, { "epoch": 6.136431337503724, "grad_norm": 2.317972183227539, "learning_rate": 1.9492928227818914e-05, "loss": 0.546, "num_input_tokens_seen": 23758888, "step": 41200 }, { "epoch": 6.137176050044682, "grad_norm": 2.9952778816223145, "learning_rate": 1.9486589208407812e-05, "loss": 0.4308, "num_input_tokens_seen": 23761544, "step": 41205 }, { "epoch": 6.137920762585642, "grad_norm": 2.092010498046875, "learning_rate": 1.9480250561570603e-05, "loss": 0.5732, "num_input_tokens_seen": 23764488, "step": 41210 }, { "epoch": 6.138665475126601, "grad_norm": 3.5306756496429443, "learning_rate": 1.9473912287735614e-05, "loss": 0.714, "num_input_tokens_seen": 23767304, "step": 41215 }, { "epoch": 6.13941018766756, "grad_norm": 2.7260050773620605, "learning_rate": 1.9467574387331167e-05, "loss": 0.6507, "num_input_tokens_seen": 23769800, "step": 41220 }, { "epoch": 6.140154900208519, "grad_norm": 2.677053213119507, "learning_rate": 1.9461236860785558e-05, "loss": 0.6256, "num_input_tokens_seen": 23772680, "step": 41225 }, { "epoch": 6.140899612749479, "grad_norm": 1.5903425216674805, "learning_rate": 1.9454899708527038e-05, "loss": 0.4797, "num_input_tokens_seen": 23775400, "step": 41230 }, { "epoch": 6.141644325290438, "grad_norm": 2.2230124473571777, "learning_rate": 1.9448562930983848e-05, "loss": 0.5976, "num_input_tokens_seen": 23778216, "step": 41235 }, { "epoch": 6.142389037831397, "grad_norm": 1.4598829746246338, "learning_rate": 1.94422265285842e-05, "loss": 0.485, "num_input_tokens_seen": 23781256, "step": 41240 }, { "epoch": 6.143133750372356, "grad_norm": 2.2000277042388916, "learning_rate": 1.9435890501756294e-05, "loss": 0.5919, "num_input_tokens_seen": 23783848, "step": 41245 }, { "epoch": 6.143878462913316, "grad_norm": 2.3281922340393066, "learning_rate": 1.9429554850928284e-05, "loss": 0.5796, "num_input_tokens_seen": 23786792, "step": 41250 }, { "epoch": 6.144623175454274, "grad_norm": 2.4123804569244385, "learning_rate": 1.9423219576528306e-05, "loss": 0.36, "num_input_tokens_seen": 23789832, "step": 41255 }, { "epoch": 6.145367887995234, "grad_norm": 1.839049220085144, "learning_rate": 1.941688467898448e-05, "loss": 0.509, "num_input_tokens_seen": 23792680, "step": 41260 }, { "epoch": 6.146112600536193, "grad_norm": 3.1111278533935547, "learning_rate": 1.9410550158724898e-05, "loss": 0.7558, "num_input_tokens_seen": 23795624, "step": 41265 }, { "epoch": 6.146857313077152, "grad_norm": 2.2263541221618652, "learning_rate": 1.9404216016177594e-05, "loss": 0.4866, "num_input_tokens_seen": 23798472, "step": 41270 }, { "epoch": 6.147602025618111, "grad_norm": 2.2848927974700928, "learning_rate": 1.9397882251770627e-05, "loss": 0.6415, "num_input_tokens_seen": 23801864, "step": 41275 }, { "epoch": 6.148346738159071, "grad_norm": 2.2485873699188232, "learning_rate": 1.9391548865932e-05, "loss": 0.5831, "num_input_tokens_seen": 23804616, "step": 41280 }, { "epoch": 6.14909145070003, "grad_norm": 3.1612212657928467, "learning_rate": 1.9385215859089702e-05, "loss": 0.6788, "num_input_tokens_seen": 23807304, "step": 41285 }, { "epoch": 6.149836163240989, "grad_norm": 2.052349805831909, "learning_rate": 1.937888323167168e-05, "loss": 0.5187, "num_input_tokens_seen": 23810216, "step": 41290 }, { "epoch": 6.150580875781948, "grad_norm": 2.8690059185028076, "learning_rate": 1.9372550984105885e-05, "loss": 0.5557, "num_input_tokens_seen": 23813032, "step": 41295 }, { "epoch": 6.151325588322908, "grad_norm": 2.4719066619873047, "learning_rate": 1.9366219116820205e-05, "loss": 0.659, "num_input_tokens_seen": 23815752, "step": 41300 }, { "epoch": 6.152070300863866, "grad_norm": 3.2153773307800293, "learning_rate": 1.9359887630242547e-05, "loss": 0.4805, "num_input_tokens_seen": 23818664, "step": 41305 }, { "epoch": 6.152815013404826, "grad_norm": 4.605472087860107, "learning_rate": 1.9353556524800743e-05, "loss": 0.6625, "num_input_tokens_seen": 23821576, "step": 41310 }, { "epoch": 6.153559725945785, "grad_norm": 2.420116901397705, "learning_rate": 1.934722580092263e-05, "loss": 0.691, "num_input_tokens_seen": 23824584, "step": 41315 }, { "epoch": 6.1543044384867445, "grad_norm": 1.212001919746399, "learning_rate": 1.9340895459036014e-05, "loss": 0.6417, "num_input_tokens_seen": 23827336, "step": 41320 }, { "epoch": 6.155049151027703, "grad_norm": 2.1126303672790527, "learning_rate": 1.9334565499568676e-05, "loss": 0.6713, "num_input_tokens_seen": 23830056, "step": 41325 }, { "epoch": 6.155793863568663, "grad_norm": 0.75157231092453, "learning_rate": 1.9328235922948375e-05, "loss": 0.4706, "num_input_tokens_seen": 23832712, "step": 41330 }, { "epoch": 6.156538576109622, "grad_norm": 1.530344843864441, "learning_rate": 1.932190672960282e-05, "loss": 0.5631, "num_input_tokens_seen": 23835720, "step": 41335 }, { "epoch": 6.157283288650581, "grad_norm": 4.449073791503906, "learning_rate": 1.931557791995973e-05, "loss": 0.7873, "num_input_tokens_seen": 23838472, "step": 41340 }, { "epoch": 6.15802800119154, "grad_norm": 1.8858234882354736, "learning_rate": 1.930924949444677e-05, "loss": 0.5803, "num_input_tokens_seen": 23841192, "step": 41345 }, { "epoch": 6.1587727137325, "grad_norm": 2.464278221130371, "learning_rate": 1.9302921453491596e-05, "loss": 0.8778, "num_input_tokens_seen": 23844072, "step": 41350 }, { "epoch": 6.159517426273458, "grad_norm": 1.7012959718704224, "learning_rate": 1.9296593797521823e-05, "loss": 0.6435, "num_input_tokens_seen": 23846952, "step": 41355 }, { "epoch": 6.160262138814417, "grad_norm": 2.3235557079315186, "learning_rate": 1.9290266526965056e-05, "loss": 0.5368, "num_input_tokens_seen": 23849928, "step": 41360 }, { "epoch": 6.161006851355377, "grad_norm": 4.117064476013184, "learning_rate": 1.9283939642248858e-05, "loss": 0.6637, "num_input_tokens_seen": 23852776, "step": 41365 }, { "epoch": 6.161751563896336, "grad_norm": 1.4294368028640747, "learning_rate": 1.927761314380078e-05, "loss": 0.7381, "num_input_tokens_seen": 23856008, "step": 41370 }, { "epoch": 6.162496276437295, "grad_norm": 1.7651982307434082, "learning_rate": 1.9271287032048343e-05, "loss": 0.5443, "num_input_tokens_seen": 23858632, "step": 41375 }, { "epoch": 6.163240988978254, "grad_norm": 1.1036895513534546, "learning_rate": 1.9264961307419037e-05, "loss": 0.539, "num_input_tokens_seen": 23861512, "step": 41380 }, { "epoch": 6.163985701519214, "grad_norm": 2.810072660446167, "learning_rate": 1.9258635970340326e-05, "loss": 0.651, "num_input_tokens_seen": 23864328, "step": 41385 }, { "epoch": 6.164730414060172, "grad_norm": 1.0365673303604126, "learning_rate": 1.925231102123966e-05, "loss": 0.6068, "num_input_tokens_seen": 23867368, "step": 41390 }, { "epoch": 6.165475126601132, "grad_norm": 4.488113880157471, "learning_rate": 1.9245986460544448e-05, "loss": 0.7837, "num_input_tokens_seen": 23870248, "step": 41395 }, { "epoch": 6.166219839142091, "grad_norm": 2.2111153602600098, "learning_rate": 1.9239662288682067e-05, "loss": 0.7565, "num_input_tokens_seen": 23872872, "step": 41400 }, { "epoch": 6.1669645516830505, "grad_norm": 1.7939367294311523, "learning_rate": 1.923333850607989e-05, "loss": 0.7321, "num_input_tokens_seen": 23875784, "step": 41405 }, { "epoch": 6.167709264224009, "grad_norm": 2.2439863681793213, "learning_rate": 1.922701511316526e-05, "loss": 0.5404, "num_input_tokens_seen": 23878632, "step": 41410 }, { "epoch": 6.168453976764969, "grad_norm": 1.8926868438720703, "learning_rate": 1.9220692110365468e-05, "loss": 0.5034, "num_input_tokens_seen": 23881832, "step": 41415 }, { "epoch": 6.169198689305928, "grad_norm": 2.948754072189331, "learning_rate": 1.9214369498107806e-05, "loss": 0.6961, "num_input_tokens_seen": 23885224, "step": 41420 }, { "epoch": 6.169943401846887, "grad_norm": 2.2893476486206055, "learning_rate": 1.9208047276819537e-05, "loss": 0.567, "num_input_tokens_seen": 23888136, "step": 41425 }, { "epoch": 6.170688114387846, "grad_norm": 1.3265769481658936, "learning_rate": 1.920172544692789e-05, "loss": 0.6181, "num_input_tokens_seen": 23891112, "step": 41430 }, { "epoch": 6.171432826928806, "grad_norm": 3.133969306945801, "learning_rate": 1.9195404008860053e-05, "loss": 0.5844, "num_input_tokens_seen": 23893896, "step": 41435 }, { "epoch": 6.172177539469764, "grad_norm": 1.477219820022583, "learning_rate": 1.9189082963043213e-05, "loss": 0.599, "num_input_tokens_seen": 23896648, "step": 41440 }, { "epoch": 6.172922252010724, "grad_norm": 2.2933170795440674, "learning_rate": 1.918276230990453e-05, "loss": 0.7797, "num_input_tokens_seen": 23899624, "step": 41445 }, { "epoch": 6.173666964551683, "grad_norm": 2.9069559574127197, "learning_rate": 1.9176442049871108e-05, "loss": 0.6511, "num_input_tokens_seen": 23902504, "step": 41450 }, { "epoch": 6.1744116770926425, "grad_norm": 2.0681252479553223, "learning_rate": 1.9170122183370058e-05, "loss": 0.517, "num_input_tokens_seen": 23905352, "step": 41455 }, { "epoch": 6.175156389633601, "grad_norm": 2.961036443710327, "learning_rate": 1.9163802710828453e-05, "loss": 0.5533, "num_input_tokens_seen": 23908552, "step": 41460 }, { "epoch": 6.175901102174561, "grad_norm": 1.5742807388305664, "learning_rate": 1.9157483632673328e-05, "loss": 0.789, "num_input_tokens_seen": 23911432, "step": 41465 }, { "epoch": 6.17664581471552, "grad_norm": 1.4969022274017334, "learning_rate": 1.9151164949331714e-05, "loss": 0.5911, "num_input_tokens_seen": 23913992, "step": 41470 }, { "epoch": 6.177390527256479, "grad_norm": 2.4205844402313232, "learning_rate": 1.914484666123058e-05, "loss": 0.6561, "num_input_tokens_seen": 23917224, "step": 41475 }, { "epoch": 6.178135239797438, "grad_norm": 2.864504337310791, "learning_rate": 1.9138528768796915e-05, "loss": 0.5628, "num_input_tokens_seen": 23919880, "step": 41480 }, { "epoch": 6.178879952338398, "grad_norm": 1.825528860092163, "learning_rate": 1.9132211272457634e-05, "loss": 0.5716, "num_input_tokens_seen": 23922664, "step": 41485 }, { "epoch": 6.1796246648793565, "grad_norm": 2.11464262008667, "learning_rate": 1.9125894172639663e-05, "loss": 0.4917, "num_input_tokens_seen": 23925576, "step": 41490 }, { "epoch": 6.180369377420316, "grad_norm": 3.092240333557129, "learning_rate": 1.9119577469769883e-05, "loss": 0.5454, "num_input_tokens_seen": 23928456, "step": 41495 }, { "epoch": 6.181114089961275, "grad_norm": 3.8225839138031006, "learning_rate": 1.9113261164275147e-05, "loss": 0.5424, "num_input_tokens_seen": 23931432, "step": 41500 }, { "epoch": 6.1818588025022345, "grad_norm": 1.6429182291030884, "learning_rate": 1.9106945256582293e-05, "loss": 0.6359, "num_input_tokens_seen": 23934184, "step": 41505 }, { "epoch": 6.182603515043193, "grad_norm": 1.5363773107528687, "learning_rate": 1.9100629747118117e-05, "loss": 0.4562, "num_input_tokens_seen": 23937128, "step": 41510 }, { "epoch": 6.183348227584153, "grad_norm": 4.165537357330322, "learning_rate": 1.90943146363094e-05, "loss": 0.5148, "num_input_tokens_seen": 23939816, "step": 41515 }, { "epoch": 6.184092940125112, "grad_norm": 2.59102463722229, "learning_rate": 1.9087999924582884e-05, "loss": 0.614, "num_input_tokens_seen": 23942888, "step": 41520 }, { "epoch": 6.18483765266607, "grad_norm": 3.4720382690429688, "learning_rate": 1.9081685612365298e-05, "loss": 0.7767, "num_input_tokens_seen": 23945896, "step": 41525 }, { "epoch": 6.18558236520703, "grad_norm": 2.155423402786255, "learning_rate": 1.9075371700083333e-05, "loss": 0.5963, "num_input_tokens_seen": 23948712, "step": 41530 }, { "epoch": 6.18632707774799, "grad_norm": 2.3151419162750244, "learning_rate": 1.906905818816366e-05, "loss": 0.5778, "num_input_tokens_seen": 23951688, "step": 41535 }, { "epoch": 6.1870717902889485, "grad_norm": 2.327061176300049, "learning_rate": 1.906274507703293e-05, "loss": 0.6565, "num_input_tokens_seen": 23954728, "step": 41540 }, { "epoch": 6.187816502829907, "grad_norm": 1.929955244064331, "learning_rate": 1.9056432367117744e-05, "loss": 0.4568, "num_input_tokens_seen": 23957448, "step": 41545 }, { "epoch": 6.188561215370867, "grad_norm": 2.3319032192230225, "learning_rate": 1.90501200588447e-05, "loss": 0.8642, "num_input_tokens_seen": 23960328, "step": 41550 }, { "epoch": 6.189305927911826, "grad_norm": 4.396973133087158, "learning_rate": 1.9043808152640342e-05, "loss": 0.568, "num_input_tokens_seen": 23963240, "step": 41555 }, { "epoch": 6.190050640452785, "grad_norm": 2.468846321105957, "learning_rate": 1.903749664893122e-05, "loss": 0.6068, "num_input_tokens_seen": 23966344, "step": 41560 }, { "epoch": 6.190795352993744, "grad_norm": 2.3574626445770264, "learning_rate": 1.9031185548143827e-05, "loss": 0.5061, "num_input_tokens_seen": 23968904, "step": 41565 }, { "epoch": 6.191540065534704, "grad_norm": 2.165400505065918, "learning_rate": 1.9024874850704646e-05, "loss": 0.7013, "num_input_tokens_seen": 23972200, "step": 41570 }, { "epoch": 6.1922847780756625, "grad_norm": 3.6758551597595215, "learning_rate": 1.9018564557040135e-05, "loss": 0.4544, "num_input_tokens_seen": 23974760, "step": 41575 }, { "epoch": 6.193029490616622, "grad_norm": 1.7291618585586548, "learning_rate": 1.9012254667576707e-05, "loss": 0.5753, "num_input_tokens_seen": 23977896, "step": 41580 }, { "epoch": 6.193774203157581, "grad_norm": 1.8188787698745728, "learning_rate": 1.9005945182740765e-05, "loss": 0.6232, "num_input_tokens_seen": 23980872, "step": 41585 }, { "epoch": 6.1945189156985405, "grad_norm": 2.229186534881592, "learning_rate": 1.899963610295869e-05, "loss": 0.5434, "num_input_tokens_seen": 23983592, "step": 41590 }, { "epoch": 6.195263628239499, "grad_norm": 1.8307175636291504, "learning_rate": 1.8993327428656805e-05, "loss": 0.498, "num_input_tokens_seen": 23986184, "step": 41595 }, { "epoch": 6.196008340780459, "grad_norm": 1.9222339391708374, "learning_rate": 1.898701916026142e-05, "loss": 0.5633, "num_input_tokens_seen": 23989032, "step": 41600 }, { "epoch": 6.196753053321418, "grad_norm": 2.125819444656372, "learning_rate": 1.8980711298198843e-05, "loss": 0.5172, "num_input_tokens_seen": 23992328, "step": 41605 }, { "epoch": 6.197497765862377, "grad_norm": 1.5498696565628052, "learning_rate": 1.897440384289532e-05, "loss": 0.4727, "num_input_tokens_seen": 23995368, "step": 41610 }, { "epoch": 6.198242478403336, "grad_norm": 3.1272690296173096, "learning_rate": 1.8968096794777087e-05, "loss": 0.6101, "num_input_tokens_seen": 23998120, "step": 41615 }, { "epoch": 6.198987190944296, "grad_norm": 2.108353614807129, "learning_rate": 1.896179015427035e-05, "loss": 0.5714, "num_input_tokens_seen": 24000840, "step": 41620 }, { "epoch": 6.1997319034852545, "grad_norm": 2.3340940475463867, "learning_rate": 1.8955483921801286e-05, "loss": 0.8524, "num_input_tokens_seen": 24003592, "step": 41625 }, { "epoch": 6.200476616026214, "grad_norm": 2.281111478805542, "learning_rate": 1.894917809779605e-05, "loss": 0.4961, "num_input_tokens_seen": 24006408, "step": 41630 }, { "epoch": 6.201221328567173, "grad_norm": 1.13185715675354, "learning_rate": 1.8942872682680747e-05, "loss": 0.5499, "num_input_tokens_seen": 24009160, "step": 41635 }, { "epoch": 6.2019660411081325, "grad_norm": 1.8211127519607544, "learning_rate": 1.893656767688148e-05, "loss": 0.567, "num_input_tokens_seen": 24011912, "step": 41640 }, { "epoch": 6.202710753649091, "grad_norm": 3.4446449279785156, "learning_rate": 1.8930263080824327e-05, "loss": 0.4701, "num_input_tokens_seen": 24015144, "step": 41645 }, { "epoch": 6.203455466190051, "grad_norm": 1.8717819452285767, "learning_rate": 1.892395889493531e-05, "loss": 0.5629, "num_input_tokens_seen": 24018184, "step": 41650 }, { "epoch": 6.20420017873101, "grad_norm": 3.838893413543701, "learning_rate": 1.8917655119640446e-05, "loss": 0.9012, "num_input_tokens_seen": 24020872, "step": 41655 }, { "epoch": 6.204944891271969, "grad_norm": 1.959904432296753, "learning_rate": 1.8911351755365726e-05, "loss": 0.5658, "num_input_tokens_seen": 24024136, "step": 41660 }, { "epoch": 6.205689603812928, "grad_norm": 1.78111732006073, "learning_rate": 1.890504880253709e-05, "loss": 0.7164, "num_input_tokens_seen": 24027240, "step": 41665 }, { "epoch": 6.206434316353888, "grad_norm": 1.6736321449279785, "learning_rate": 1.8898746261580493e-05, "loss": 0.5595, "num_input_tokens_seen": 24029928, "step": 41670 }, { "epoch": 6.2071790288948465, "grad_norm": 2.394803762435913, "learning_rate": 1.8892444132921803e-05, "loss": 0.486, "num_input_tokens_seen": 24032936, "step": 41675 }, { "epoch": 6.207923741435806, "grad_norm": 4.052621841430664, "learning_rate": 1.8886142416986917e-05, "loss": 0.4879, "num_input_tokens_seen": 24035880, "step": 41680 }, { "epoch": 6.208668453976765, "grad_norm": 1.4588862657546997, "learning_rate": 1.887984111420166e-05, "loss": 0.5618, "num_input_tokens_seen": 24038888, "step": 41685 }, { "epoch": 6.209413166517725, "grad_norm": 2.3789384365081787, "learning_rate": 1.8873540224991864e-05, "loss": 0.4305, "num_input_tokens_seen": 24041608, "step": 41690 }, { "epoch": 6.210157879058683, "grad_norm": 4.130213260650635, "learning_rate": 1.8867239749783307e-05, "loss": 0.7637, "num_input_tokens_seen": 24044456, "step": 41695 }, { "epoch": 6.210902591599643, "grad_norm": 5.71549129486084, "learning_rate": 1.8860939689001754e-05, "loss": 0.6706, "num_input_tokens_seen": 24047176, "step": 41700 }, { "epoch": 6.211647304140602, "grad_norm": 1.9490137100219727, "learning_rate": 1.885464004307294e-05, "loss": 0.6327, "num_input_tokens_seen": 24050184, "step": 41705 }, { "epoch": 6.2123920166815605, "grad_norm": 2.381681442260742, "learning_rate": 1.8848340812422574e-05, "loss": 0.6634, "num_input_tokens_seen": 24053000, "step": 41710 }, { "epoch": 6.21313672922252, "grad_norm": 2.5232126712799072, "learning_rate": 1.884204199747631e-05, "loss": 0.4495, "num_input_tokens_seen": 24055976, "step": 41715 }, { "epoch": 6.213881441763479, "grad_norm": 5.250967025756836, "learning_rate": 1.8835743598659815e-05, "loss": 0.7687, "num_input_tokens_seen": 24058696, "step": 41720 }, { "epoch": 6.2146261543044385, "grad_norm": 1.8788387775421143, "learning_rate": 1.8829445616398713e-05, "loss": 0.5648, "num_input_tokens_seen": 24061736, "step": 41725 }, { "epoch": 6.215370866845397, "grad_norm": 2.043302297592163, "learning_rate": 1.8823148051118585e-05, "loss": 0.7123, "num_input_tokens_seen": 24064808, "step": 41730 }, { "epoch": 6.216115579386357, "grad_norm": 1.648087978363037, "learning_rate": 1.8816850903244994e-05, "loss": 0.6474, "num_input_tokens_seen": 24067752, "step": 41735 }, { "epoch": 6.216860291927316, "grad_norm": 2.435245990753174, "learning_rate": 1.8810554173203486e-05, "loss": 0.6029, "num_input_tokens_seen": 24070408, "step": 41740 }, { "epoch": 6.217605004468275, "grad_norm": 3.105895519256592, "learning_rate": 1.8804257861419556e-05, "loss": 0.6927, "num_input_tokens_seen": 24073256, "step": 41745 }, { "epoch": 6.218349717009234, "grad_norm": 1.8217217922210693, "learning_rate": 1.87979619683187e-05, "loss": 0.4418, "num_input_tokens_seen": 24076424, "step": 41750 }, { "epoch": 6.219094429550194, "grad_norm": 1.4161114692687988, "learning_rate": 1.8791666494326353e-05, "loss": 0.5181, "num_input_tokens_seen": 24079400, "step": 41755 }, { "epoch": 6.2198391420911525, "grad_norm": 1.9354652166366577, "learning_rate": 1.8785371439867945e-05, "loss": 0.6467, "num_input_tokens_seen": 24081960, "step": 41760 }, { "epoch": 6.220583854632112, "grad_norm": 1.9119288921356201, "learning_rate": 1.8779076805368862e-05, "loss": 0.6951, "num_input_tokens_seen": 24085000, "step": 41765 }, { "epoch": 6.221328567173071, "grad_norm": 1.68609619140625, "learning_rate": 1.8772782591254474e-05, "loss": 0.545, "num_input_tokens_seen": 24087848, "step": 41770 }, { "epoch": 6.222073279714031, "grad_norm": 3.4371883869171143, "learning_rate": 1.876648879795013e-05, "loss": 0.4709, "num_input_tokens_seen": 24090792, "step": 41775 }, { "epoch": 6.222817992254989, "grad_norm": 2.0040087699890137, "learning_rate": 1.8760195425881122e-05, "loss": 0.6649, "num_input_tokens_seen": 24093640, "step": 41780 }, { "epoch": 6.223562704795949, "grad_norm": 2.187854766845703, "learning_rate": 1.8753902475472738e-05, "loss": 0.6241, "num_input_tokens_seen": 24096776, "step": 41785 }, { "epoch": 6.224307417336908, "grad_norm": 2.237793445587158, "learning_rate": 1.874760994715024e-05, "loss": 0.6407, "num_input_tokens_seen": 24099624, "step": 41790 }, { "epoch": 6.225052129877867, "grad_norm": 1.7566196918487549, "learning_rate": 1.874131784133884e-05, "loss": 0.5211, "num_input_tokens_seen": 24102600, "step": 41795 }, { "epoch": 6.225796842418826, "grad_norm": 2.3228507041931152, "learning_rate": 1.8735026158463724e-05, "loss": 0.5794, "num_input_tokens_seen": 24105512, "step": 41800 }, { "epoch": 6.226541554959786, "grad_norm": 4.164218902587891, "learning_rate": 1.8728734898950072e-05, "loss": 0.6844, "num_input_tokens_seen": 24108264, "step": 41805 }, { "epoch": 6.2272862675007445, "grad_norm": 2.5479423999786377, "learning_rate": 1.8722444063223023e-05, "loss": 0.552, "num_input_tokens_seen": 24111016, "step": 41810 }, { "epoch": 6.228030980041704, "grad_norm": 3.051145315170288, "learning_rate": 1.871615365170768e-05, "loss": 0.7191, "num_input_tokens_seen": 24113832, "step": 41815 }, { "epoch": 6.228775692582663, "grad_norm": 2.6832187175750732, "learning_rate": 1.870986366482912e-05, "loss": 0.7685, "num_input_tokens_seen": 24117032, "step": 41820 }, { "epoch": 6.229520405123623, "grad_norm": 2.4629950523376465, "learning_rate": 1.8703574103012407e-05, "loss": 0.4698, "num_input_tokens_seen": 24119784, "step": 41825 }, { "epoch": 6.230265117664581, "grad_norm": 2.618769645690918, "learning_rate": 1.869728496668256e-05, "loss": 0.5265, "num_input_tokens_seen": 24123464, "step": 41830 }, { "epoch": 6.231009830205541, "grad_norm": 2.9854369163513184, "learning_rate": 1.869099625626456e-05, "loss": 0.4547, "num_input_tokens_seen": 24126056, "step": 41835 }, { "epoch": 6.2317545427465, "grad_norm": 2.418109178543091, "learning_rate": 1.8684707972183383e-05, "loss": 0.6074, "num_input_tokens_seen": 24129096, "step": 41840 }, { "epoch": 6.232499255287459, "grad_norm": 2.1778130531311035, "learning_rate": 1.867842011486397e-05, "loss": 0.3585, "num_input_tokens_seen": 24131848, "step": 41845 }, { "epoch": 6.233243967828418, "grad_norm": 1.614129662513733, "learning_rate": 1.867213268473122e-05, "loss": 0.6458, "num_input_tokens_seen": 24134920, "step": 41850 }, { "epoch": 6.233988680369378, "grad_norm": 1.7378123998641968, "learning_rate": 1.866584568221002e-05, "loss": 0.4996, "num_input_tokens_seen": 24137832, "step": 41855 }, { "epoch": 6.234733392910337, "grad_norm": 5.927743911743164, "learning_rate": 1.865955910772521e-05, "loss": 0.5606, "num_input_tokens_seen": 24140776, "step": 41860 }, { "epoch": 6.235478105451296, "grad_norm": 1.2130157947540283, "learning_rate": 1.8653272961701618e-05, "loss": 0.5834, "num_input_tokens_seen": 24143560, "step": 41865 }, { "epoch": 6.236222817992255, "grad_norm": 1.3539345264434814, "learning_rate": 1.8646987244564047e-05, "loss": 0.6363, "num_input_tokens_seen": 24146568, "step": 41870 }, { "epoch": 6.236967530533214, "grad_norm": 2.074542999267578, "learning_rate": 1.8640701956737238e-05, "loss": 0.5549, "num_input_tokens_seen": 24149384, "step": 41875 }, { "epoch": 6.237712243074173, "grad_norm": 3.1741878986358643, "learning_rate": 1.8634417098645937e-05, "loss": 0.5963, "num_input_tokens_seen": 24152360, "step": 41880 }, { "epoch": 6.238456955615132, "grad_norm": 2.511425256729126, "learning_rate": 1.8628132670714846e-05, "loss": 0.6701, "num_input_tokens_seen": 24155336, "step": 41885 }, { "epoch": 6.239201668156092, "grad_norm": 3.0039122104644775, "learning_rate": 1.8621848673368648e-05, "loss": 0.6685, "num_input_tokens_seen": 24158408, "step": 41890 }, { "epoch": 6.2399463806970505, "grad_norm": 1.9212511777877808, "learning_rate": 1.861556510703198e-05, "loss": 0.6864, "num_input_tokens_seen": 24161640, "step": 41895 }, { "epoch": 6.24069109323801, "grad_norm": 1.9930851459503174, "learning_rate": 1.8609281972129464e-05, "loss": 0.5112, "num_input_tokens_seen": 24164712, "step": 41900 }, { "epoch": 6.241435805778969, "grad_norm": 1.2922930717468262, "learning_rate": 1.8602999269085693e-05, "loss": 0.6192, "num_input_tokens_seen": 24167368, "step": 41905 }, { "epoch": 6.242180518319929, "grad_norm": 2.401885986328125, "learning_rate": 1.859671699832523e-05, "loss": 0.597, "num_input_tokens_seen": 24170312, "step": 41910 }, { "epoch": 6.242925230860887, "grad_norm": 2.1456027030944824, "learning_rate": 1.859043516027259e-05, "loss": 0.6386, "num_input_tokens_seen": 24173224, "step": 41915 }, { "epoch": 6.243669943401847, "grad_norm": 2.4547970294952393, "learning_rate": 1.8584153755352282e-05, "loss": 0.6453, "num_input_tokens_seen": 24175848, "step": 41920 }, { "epoch": 6.244414655942806, "grad_norm": 1.9653469324111938, "learning_rate": 1.857787278398878e-05, "loss": 0.4892, "num_input_tokens_seen": 24179336, "step": 41925 }, { "epoch": 6.245159368483765, "grad_norm": 1.922690987586975, "learning_rate": 1.8571592246606522e-05, "loss": 0.712, "num_input_tokens_seen": 24182152, "step": 41930 }, { "epoch": 6.245904081024724, "grad_norm": 1.4069173336029053, "learning_rate": 1.8565312143629926e-05, "loss": 0.4646, "num_input_tokens_seen": 24185128, "step": 41935 }, { "epoch": 6.246648793565684, "grad_norm": 3.190056324005127, "learning_rate": 1.855903247548338e-05, "loss": 0.714, "num_input_tokens_seen": 24188168, "step": 41940 }, { "epoch": 6.247393506106643, "grad_norm": 1.96821129322052, "learning_rate": 1.855275324259122e-05, "loss": 0.6667, "num_input_tokens_seen": 24190984, "step": 41945 }, { "epoch": 6.248138218647602, "grad_norm": 1.8349742889404297, "learning_rate": 1.8546474445377786e-05, "loss": 0.4221, "num_input_tokens_seen": 24193960, "step": 41950 }, { "epoch": 6.248882931188561, "grad_norm": 2.105347156524658, "learning_rate": 1.8540196084267386e-05, "loss": 0.4858, "num_input_tokens_seen": 24196616, "step": 41955 }, { "epoch": 6.249627643729521, "grad_norm": 1.4792306423187256, "learning_rate": 1.8533918159684262e-05, "loss": 0.6968, "num_input_tokens_seen": 24199368, "step": 41960 }, { "epoch": 6.250372356270479, "grad_norm": 3.040149450302124, "learning_rate": 1.8527640672052655e-05, "loss": 1.0358, "num_input_tokens_seen": 24202440, "step": 41965 }, { "epoch": 6.251117068811439, "grad_norm": 2.6397323608398438, "learning_rate": 1.8521363621796774e-05, "loss": 0.6652, "num_input_tokens_seen": 24205288, "step": 41970 }, { "epoch": 6.251861781352398, "grad_norm": 1.5317785739898682, "learning_rate": 1.8515087009340808e-05, "loss": 0.685, "num_input_tokens_seen": 24208424, "step": 41975 }, { "epoch": 6.252606493893357, "grad_norm": 1.8549247980117798, "learning_rate": 1.850881083510889e-05, "loss": 0.6684, "num_input_tokens_seen": 24211400, "step": 41980 }, { "epoch": 6.253351206434316, "grad_norm": 3.032390594482422, "learning_rate": 1.850253509952514e-05, "loss": 0.6862, "num_input_tokens_seen": 24214376, "step": 41985 }, { "epoch": 6.254095918975276, "grad_norm": 1.612332820892334, "learning_rate": 1.8496259803013667e-05, "loss": 0.5113, "num_input_tokens_seen": 24217224, "step": 41990 }, { "epoch": 6.254840631516235, "grad_norm": 1.8597393035888672, "learning_rate": 1.8489984945998512e-05, "loss": 0.6262, "num_input_tokens_seen": 24220712, "step": 41995 }, { "epoch": 6.255585344057194, "grad_norm": 1.9810206890106201, "learning_rate": 1.8483710528903698e-05, "loss": 0.5853, "num_input_tokens_seen": 24223752, "step": 42000 }, { "epoch": 6.256330056598153, "grad_norm": 1.7927602529525757, "learning_rate": 1.847743655215323e-05, "loss": 0.718, "num_input_tokens_seen": 24226984, "step": 42005 }, { "epoch": 6.257074769139113, "grad_norm": 1.0067905187606812, "learning_rate": 1.8471163016171088e-05, "loss": 0.4991, "num_input_tokens_seen": 24229960, "step": 42010 }, { "epoch": 6.257819481680071, "grad_norm": 2.5115840435028076, "learning_rate": 1.84648899213812e-05, "loss": 0.5189, "num_input_tokens_seen": 24232904, "step": 42015 }, { "epoch": 6.258564194221031, "grad_norm": 1.8474682569503784, "learning_rate": 1.845861726820749e-05, "loss": 0.5862, "num_input_tokens_seen": 24235400, "step": 42020 }, { "epoch": 6.25930890676199, "grad_norm": 1.8493980169296265, "learning_rate": 1.845234505707382e-05, "loss": 0.5605, "num_input_tokens_seen": 24238280, "step": 42025 }, { "epoch": 6.2600536193029495, "grad_norm": 2.2908546924591064, "learning_rate": 1.844607328840405e-05, "loss": 0.4922, "num_input_tokens_seen": 24241064, "step": 42030 }, { "epoch": 6.260798331843908, "grad_norm": 2.176596164703369, "learning_rate": 1.8439801962622016e-05, "loss": 0.696, "num_input_tokens_seen": 24244104, "step": 42035 }, { "epoch": 6.261543044384867, "grad_norm": 1.831964135169983, "learning_rate": 1.8433531080151482e-05, "loss": 0.5501, "num_input_tokens_seen": 24246920, "step": 42040 }, { "epoch": 6.262287756925827, "grad_norm": 5.354855537414551, "learning_rate": 1.842726064141622e-05, "loss": 0.6278, "num_input_tokens_seen": 24249640, "step": 42045 }, { "epoch": 6.263032469466786, "grad_norm": 2.6053924560546875, "learning_rate": 1.8420990646839957e-05, "loss": 0.5449, "num_input_tokens_seen": 24252584, "step": 42050 }, { "epoch": 6.263777182007745, "grad_norm": 2.2506470680236816, "learning_rate": 1.841472109684641e-05, "loss": 0.5572, "num_input_tokens_seen": 24256072, "step": 42055 }, { "epoch": 6.264521894548704, "grad_norm": 2.589545965194702, "learning_rate": 1.8408451991859228e-05, "loss": 0.5365, "num_input_tokens_seen": 24258760, "step": 42060 }, { "epoch": 6.265266607089663, "grad_norm": 1.8944799900054932, "learning_rate": 1.840218333230206e-05, "loss": 0.6258, "num_input_tokens_seen": 24261480, "step": 42065 }, { "epoch": 6.266011319630622, "grad_norm": 2.2049753665924072, "learning_rate": 1.8395915118598523e-05, "loss": 0.7049, "num_input_tokens_seen": 24264136, "step": 42070 }, { "epoch": 6.266756032171582, "grad_norm": 1.6389859914779663, "learning_rate": 1.83896473511722e-05, "loss": 0.621, "num_input_tokens_seen": 24266696, "step": 42075 }, { "epoch": 6.267500744712541, "grad_norm": 1.4917932748794556, "learning_rate": 1.838338003044662e-05, "loss": 0.8419, "num_input_tokens_seen": 24269608, "step": 42080 }, { "epoch": 6.2682454572535, "grad_norm": 2.617215394973755, "learning_rate": 1.8377113156845317e-05, "loss": 0.5359, "num_input_tokens_seen": 24272328, "step": 42085 }, { "epoch": 6.268990169794459, "grad_norm": 4.459586143493652, "learning_rate": 1.8370846730791786e-05, "loss": 0.6802, "num_input_tokens_seen": 24275336, "step": 42090 }, { "epoch": 6.269734882335419, "grad_norm": 1.3963836431503296, "learning_rate": 1.8364580752709475e-05, "loss": 0.627, "num_input_tokens_seen": 24278536, "step": 42095 }, { "epoch": 6.270479594876377, "grad_norm": 3.2834696769714355, "learning_rate": 1.8358315223021814e-05, "loss": 0.5774, "num_input_tokens_seen": 24281224, "step": 42100 }, { "epoch": 6.271224307417337, "grad_norm": 4.190766334533691, "learning_rate": 1.835205014215222e-05, "loss": 0.3776, "num_input_tokens_seen": 24284104, "step": 42105 }, { "epoch": 6.271969019958296, "grad_norm": 3.2253875732421875, "learning_rate": 1.8345785510524042e-05, "loss": 0.6347, "num_input_tokens_seen": 24286856, "step": 42110 }, { "epoch": 6.2727137324992555, "grad_norm": 2.834571123123169, "learning_rate": 1.833952132856063e-05, "loss": 0.5996, "num_input_tokens_seen": 24289864, "step": 42115 }, { "epoch": 6.273458445040214, "grad_norm": 3.068443536758423, "learning_rate": 1.8333257596685284e-05, "loss": 0.8873, "num_input_tokens_seen": 24292584, "step": 42120 }, { "epoch": 6.274203157581174, "grad_norm": 1.7954233884811401, "learning_rate": 1.832699431532129e-05, "loss": 0.6405, "num_input_tokens_seen": 24295816, "step": 42125 }, { "epoch": 6.274947870122133, "grad_norm": 2.0880775451660156, "learning_rate": 1.832073148489188e-05, "loss": 0.6415, "num_input_tokens_seen": 24298792, "step": 42130 }, { "epoch": 6.275692582663092, "grad_norm": 2.1896374225616455, "learning_rate": 1.831446910582028e-05, "loss": 0.6389, "num_input_tokens_seen": 24301672, "step": 42135 }, { "epoch": 6.276437295204051, "grad_norm": 1.6881797313690186, "learning_rate": 1.8308207178529684e-05, "loss": 0.6519, "num_input_tokens_seen": 24304424, "step": 42140 }, { "epoch": 6.277182007745011, "grad_norm": 3.8706462383270264, "learning_rate": 1.8301945703443236e-05, "loss": 0.7543, "num_input_tokens_seen": 24307144, "step": 42145 }, { "epoch": 6.277926720285969, "grad_norm": 2.020939826965332, "learning_rate": 1.8295684680984062e-05, "loss": 0.7591, "num_input_tokens_seen": 24309832, "step": 42150 }, { "epoch": 6.278671432826929, "grad_norm": 2.1730666160583496, "learning_rate": 1.828942411157527e-05, "loss": 0.6734, "num_input_tokens_seen": 24312808, "step": 42155 }, { "epoch": 6.279416145367888, "grad_norm": 2.056401252746582, "learning_rate": 1.828316399563991e-05, "loss": 0.5738, "num_input_tokens_seen": 24315880, "step": 42160 }, { "epoch": 6.2801608579088475, "grad_norm": 1.8669461011886597, "learning_rate": 1.8276904333601015e-05, "loss": 0.6318, "num_input_tokens_seen": 24318728, "step": 42165 }, { "epoch": 6.280905570449806, "grad_norm": 1.8151822090148926, "learning_rate": 1.8270645125881585e-05, "loss": 0.5801, "num_input_tokens_seen": 24321320, "step": 42170 }, { "epoch": 6.281650282990766, "grad_norm": 1.9382555484771729, "learning_rate": 1.8264386372904608e-05, "loss": 0.6964, "num_input_tokens_seen": 24324072, "step": 42175 }, { "epoch": 6.282394995531725, "grad_norm": 1.933788776397705, "learning_rate": 1.825812807509301e-05, "loss": 0.634, "num_input_tokens_seen": 24326984, "step": 42180 }, { "epoch": 6.283139708072684, "grad_norm": 1.7091866731643677, "learning_rate": 1.825187023286971e-05, "loss": 0.717, "num_input_tokens_seen": 24330120, "step": 42185 }, { "epoch": 6.283884420613643, "grad_norm": 2.3900978565216064, "learning_rate": 1.824561284665758e-05, "loss": 0.7345, "num_input_tokens_seen": 24333032, "step": 42190 }, { "epoch": 6.284629133154603, "grad_norm": 1.8291518688201904, "learning_rate": 1.823935591687948e-05, "loss": 0.5533, "num_input_tokens_seen": 24336072, "step": 42195 }, { "epoch": 6.2853738456955615, "grad_norm": 1.3046679496765137, "learning_rate": 1.8233099443958212e-05, "loss": 0.5639, "num_input_tokens_seen": 24338952, "step": 42200 }, { "epoch": 6.286118558236521, "grad_norm": 2.229966878890991, "learning_rate": 1.8226843428316576e-05, "loss": 0.7063, "num_input_tokens_seen": 24341864, "step": 42205 }, { "epoch": 6.28686327077748, "grad_norm": 1.6500940322875977, "learning_rate": 1.8220587870377315e-05, "loss": 0.7105, "num_input_tokens_seen": 24344840, "step": 42210 }, { "epoch": 6.2876079833184395, "grad_norm": 1.1007862091064453, "learning_rate": 1.8214332770563165e-05, "loss": 0.4717, "num_input_tokens_seen": 24347976, "step": 42215 }, { "epoch": 6.288352695859398, "grad_norm": 2.0688138008117676, "learning_rate": 1.8208078129296827e-05, "loss": 0.7291, "num_input_tokens_seen": 24350952, "step": 42220 }, { "epoch": 6.289097408400357, "grad_norm": 1.5280884504318237, "learning_rate": 1.8201823947000947e-05, "loss": 0.4841, "num_input_tokens_seen": 24353768, "step": 42225 }, { "epoch": 6.289842120941317, "grad_norm": 2.5084762573242188, "learning_rate": 1.8195570224098162e-05, "loss": 0.6089, "num_input_tokens_seen": 24356680, "step": 42230 }, { "epoch": 6.290586833482275, "grad_norm": 2.500619888305664, "learning_rate": 1.8189316961011092e-05, "loss": 0.6113, "num_input_tokens_seen": 24359336, "step": 42235 }, { "epoch": 6.291331546023235, "grad_norm": 1.8026498556137085, "learning_rate": 1.818306415816228e-05, "loss": 0.4109, "num_input_tokens_seen": 24362056, "step": 42240 }, { "epoch": 6.292076258564194, "grad_norm": 2.054696798324585, "learning_rate": 1.817681181597428e-05, "loss": 0.6495, "num_input_tokens_seen": 24364968, "step": 42245 }, { "epoch": 6.2928209711051535, "grad_norm": 2.0439529418945312, "learning_rate": 1.817055993486959e-05, "loss": 0.4468, "num_input_tokens_seen": 24367912, "step": 42250 }, { "epoch": 6.293565683646112, "grad_norm": 1.7902657985687256, "learning_rate": 1.81643085152707e-05, "loss": 0.6004, "num_input_tokens_seen": 24370728, "step": 42255 }, { "epoch": 6.294310396187072, "grad_norm": 2.1907296180725098, "learning_rate": 1.8158057557600045e-05, "loss": 0.5734, "num_input_tokens_seen": 24373544, "step": 42260 }, { "epoch": 6.295055108728031, "grad_norm": 1.2722463607788086, "learning_rate": 1.815180706228004e-05, "loss": 0.562, "num_input_tokens_seen": 24376680, "step": 42265 }, { "epoch": 6.29579982126899, "grad_norm": 1.7785483598709106, "learning_rate": 1.814555702973308e-05, "loss": 0.6554, "num_input_tokens_seen": 24379496, "step": 42270 }, { "epoch": 6.296544533809949, "grad_norm": 1.5547235012054443, "learning_rate": 1.813930746038151e-05, "loss": 0.7411, "num_input_tokens_seen": 24382664, "step": 42275 }, { "epoch": 6.297289246350909, "grad_norm": 1.9254646301269531, "learning_rate": 1.813305835464764e-05, "loss": 0.633, "num_input_tokens_seen": 24385448, "step": 42280 }, { "epoch": 6.2980339588918675, "grad_norm": 2.2331206798553467, "learning_rate": 1.8126809712953766e-05, "loss": 0.653, "num_input_tokens_seen": 24388232, "step": 42285 }, { "epoch": 6.298778671432827, "grad_norm": 1.989282250404358, "learning_rate": 1.8120561535722153e-05, "loss": 0.6766, "num_input_tokens_seen": 24391048, "step": 42290 }, { "epoch": 6.299523383973786, "grad_norm": 1.36220383644104, "learning_rate": 1.8114313823375015e-05, "loss": 0.5829, "num_input_tokens_seen": 24393896, "step": 42295 }, { "epoch": 6.3002680965147455, "grad_norm": 1.5461452007293701, "learning_rate": 1.8108066576334554e-05, "loss": 0.7098, "num_input_tokens_seen": 24396712, "step": 42300 }, { "epoch": 6.301012809055704, "grad_norm": 2.6316778659820557, "learning_rate": 1.810181979502294e-05, "loss": 0.5343, "num_input_tokens_seen": 24399816, "step": 42305 }, { "epoch": 6.301757521596664, "grad_norm": 3.517460346221924, "learning_rate": 1.8095573479862294e-05, "loss": 0.5665, "num_input_tokens_seen": 24402600, "step": 42310 }, { "epoch": 6.302502234137623, "grad_norm": 1.468693733215332, "learning_rate": 1.8089327631274726e-05, "loss": 0.633, "num_input_tokens_seen": 24405448, "step": 42315 }, { "epoch": 6.303246946678582, "grad_norm": 3.29703950881958, "learning_rate": 1.8083082249682294e-05, "loss": 0.5207, "num_input_tokens_seen": 24408392, "step": 42320 }, { "epoch": 6.303991659219541, "grad_norm": 2.4071316719055176, "learning_rate": 1.8076837335507047e-05, "loss": 0.5817, "num_input_tokens_seen": 24411400, "step": 42325 }, { "epoch": 6.304736371760501, "grad_norm": 2.2832109928131104, "learning_rate": 1.8070592889170977e-05, "loss": 0.6795, "num_input_tokens_seen": 24414408, "step": 42330 }, { "epoch": 6.3054810843014595, "grad_norm": 1.5503101348876953, "learning_rate": 1.806434891109607e-05, "loss": 0.5969, "num_input_tokens_seen": 24417256, "step": 42335 }, { "epoch": 6.306225796842419, "grad_norm": 1.4731371402740479, "learning_rate": 1.8058105401704267e-05, "loss": 0.6074, "num_input_tokens_seen": 24420424, "step": 42340 }, { "epoch": 6.306970509383378, "grad_norm": 1.9911648035049438, "learning_rate": 1.8051862361417478e-05, "loss": 0.693, "num_input_tokens_seen": 24423240, "step": 42345 }, { "epoch": 6.3077152219243375, "grad_norm": 4.130649566650391, "learning_rate": 1.804561979065758e-05, "loss": 0.614, "num_input_tokens_seen": 24426088, "step": 42350 }, { "epoch": 6.308459934465296, "grad_norm": 2.24747633934021, "learning_rate": 1.8039377689846427e-05, "loss": 0.5414, "num_input_tokens_seen": 24429224, "step": 42355 }, { "epoch": 6.309204647006256, "grad_norm": 3.6498265266418457, "learning_rate": 1.8033136059405826e-05, "loss": 0.6232, "num_input_tokens_seen": 24432296, "step": 42360 }, { "epoch": 6.309949359547215, "grad_norm": 1.5499659776687622, "learning_rate": 1.8026894899757562e-05, "loss": 0.508, "num_input_tokens_seen": 24435240, "step": 42365 }, { "epoch": 6.310694072088174, "grad_norm": 3.844402313232422, "learning_rate": 1.8020654211323396e-05, "loss": 0.6747, "num_input_tokens_seen": 24437992, "step": 42370 }, { "epoch": 6.311438784629133, "grad_norm": 2.4857680797576904, "learning_rate": 1.8014413994525036e-05, "loss": 0.5784, "num_input_tokens_seen": 24440712, "step": 42375 }, { "epoch": 6.312183497170093, "grad_norm": 1.5404149293899536, "learning_rate": 1.8008174249784175e-05, "loss": 0.4788, "num_input_tokens_seen": 24443496, "step": 42380 }, { "epoch": 6.3129282097110515, "grad_norm": 1.8926929235458374, "learning_rate": 1.8001934977522477e-05, "loss": 0.5174, "num_input_tokens_seen": 24446728, "step": 42385 }, { "epoch": 6.31367292225201, "grad_norm": 1.337883710861206, "learning_rate": 1.799569617816156e-05, "loss": 0.7158, "num_input_tokens_seen": 24449608, "step": 42390 }, { "epoch": 6.31441763479297, "grad_norm": 3.8013908863067627, "learning_rate": 1.7989457852123026e-05, "loss": 0.6569, "num_input_tokens_seen": 24452712, "step": 42395 }, { "epoch": 6.31516234733393, "grad_norm": 2.4158761501312256, "learning_rate": 1.7983219999828417e-05, "loss": 0.749, "num_input_tokens_seen": 24455496, "step": 42400 }, { "epoch": 6.315907059874888, "grad_norm": 1.6557414531707764, "learning_rate": 1.7976982621699278e-05, "loss": 0.6081, "num_input_tokens_seen": 24458376, "step": 42405 }, { "epoch": 6.316651772415847, "grad_norm": 2.054544448852539, "learning_rate": 1.7970745718157095e-05, "loss": 0.5662, "num_input_tokens_seen": 24461032, "step": 42410 }, { "epoch": 6.317396484956807, "grad_norm": 3.5499978065490723, "learning_rate": 1.7964509289623335e-05, "loss": 0.6752, "num_input_tokens_seen": 24464104, "step": 42415 }, { "epoch": 6.3181411974977655, "grad_norm": 1.7453343868255615, "learning_rate": 1.795827333651944e-05, "loss": 0.665, "num_input_tokens_seen": 24467368, "step": 42420 }, { "epoch": 6.318885910038725, "grad_norm": 2.011265277862549, "learning_rate": 1.7952037859266795e-05, "loss": 0.5474, "num_input_tokens_seen": 24470184, "step": 42425 }, { "epoch": 6.319630622579684, "grad_norm": 1.1525213718414307, "learning_rate": 1.7945802858286782e-05, "loss": 0.4879, "num_input_tokens_seen": 24473192, "step": 42430 }, { "epoch": 6.3203753351206435, "grad_norm": 3.1846747398376465, "learning_rate": 1.793956833400074e-05, "loss": 0.7711, "num_input_tokens_seen": 24475880, "step": 42435 }, { "epoch": 6.321120047661602, "grad_norm": 2.6051642894744873, "learning_rate": 1.793333428682996e-05, "loss": 0.5731, "num_input_tokens_seen": 24478952, "step": 42440 }, { "epoch": 6.321864760202562, "grad_norm": 1.704314112663269, "learning_rate": 1.7927100717195712e-05, "loss": 0.6247, "num_input_tokens_seen": 24481608, "step": 42445 }, { "epoch": 6.322609472743521, "grad_norm": 1.7797397375106812, "learning_rate": 1.792086762551924e-05, "loss": 0.6831, "num_input_tokens_seen": 24484744, "step": 42450 }, { "epoch": 6.32335418528448, "grad_norm": 2.0842390060424805, "learning_rate": 1.791463501222176e-05, "loss": 0.7116, "num_input_tokens_seen": 24487720, "step": 42455 }, { "epoch": 6.324098897825439, "grad_norm": 1.864496111869812, "learning_rate": 1.790840287772443e-05, "loss": 0.5102, "num_input_tokens_seen": 24490792, "step": 42460 }, { "epoch": 6.324843610366399, "grad_norm": 4.5372514724731445, "learning_rate": 1.79021712224484e-05, "loss": 0.5375, "num_input_tokens_seen": 24493512, "step": 42465 }, { "epoch": 6.3255883229073575, "grad_norm": 2.772608757019043, "learning_rate": 1.789594004681479e-05, "loss": 0.5011, "num_input_tokens_seen": 24496360, "step": 42470 }, { "epoch": 6.326333035448317, "grad_norm": 0.7527809143066406, "learning_rate": 1.7889709351244675e-05, "loss": 0.4772, "num_input_tokens_seen": 24499112, "step": 42475 }, { "epoch": 6.327077747989276, "grad_norm": 0.9589748382568359, "learning_rate": 1.788347913615908e-05, "loss": 0.4267, "num_input_tokens_seen": 24501992, "step": 42480 }, { "epoch": 6.327822460530236, "grad_norm": 2.598912477493286, "learning_rate": 1.7877249401979034e-05, "loss": 0.5801, "num_input_tokens_seen": 24504808, "step": 42485 }, { "epoch": 6.328567173071194, "grad_norm": 2.3870744705200195, "learning_rate": 1.7871020149125517e-05, "loss": 0.6927, "num_input_tokens_seen": 24507560, "step": 42490 }, { "epoch": 6.329311885612154, "grad_norm": 1.5073322057724, "learning_rate": 1.786479137801947e-05, "loss": 0.7334, "num_input_tokens_seen": 24510344, "step": 42495 }, { "epoch": 6.330056598153113, "grad_norm": 1.3378026485443115, "learning_rate": 1.7858563089081812e-05, "loss": 0.6648, "num_input_tokens_seen": 24513288, "step": 42500 }, { "epoch": 6.330801310694072, "grad_norm": 2.285914182662964, "learning_rate": 1.7852335282733432e-05, "loss": 0.4627, "num_input_tokens_seen": 24515880, "step": 42505 }, { "epoch": 6.331546023235031, "grad_norm": 2.347259998321533, "learning_rate": 1.7846107959395165e-05, "loss": 0.5864, "num_input_tokens_seen": 24518952, "step": 42510 }, { "epoch": 6.332290735775991, "grad_norm": 0.28129976987838745, "learning_rate": 1.783988111948785e-05, "loss": 0.3786, "num_input_tokens_seen": 24521896, "step": 42515 }, { "epoch": 6.3330354483169495, "grad_norm": 1.3815916776657104, "learning_rate": 1.7833654763432245e-05, "loss": 0.5949, "num_input_tokens_seen": 24524744, "step": 42520 }, { "epoch": 6.333780160857909, "grad_norm": 2.046766519546509, "learning_rate": 1.782742889164912e-05, "loss": 0.351, "num_input_tokens_seen": 24527432, "step": 42525 }, { "epoch": 6.334524873398868, "grad_norm": 1.5193425416946411, "learning_rate": 1.7821203504559186e-05, "loss": 0.743, "num_input_tokens_seen": 24530280, "step": 42530 }, { "epoch": 6.335269585939828, "grad_norm": 3.793917179107666, "learning_rate": 1.7814978602583136e-05, "loss": 0.5119, "num_input_tokens_seen": 24533000, "step": 42535 }, { "epoch": 6.336014298480786, "grad_norm": 6.876333713531494, "learning_rate": 1.7808754186141618e-05, "loss": 0.733, "num_input_tokens_seen": 24535784, "step": 42540 }, { "epoch": 6.336759011021746, "grad_norm": 1.6857402324676514, "learning_rate": 1.780253025565525e-05, "loss": 0.7186, "num_input_tokens_seen": 24538760, "step": 42545 }, { "epoch": 6.337503723562705, "grad_norm": 0.9927754402160645, "learning_rate": 1.7796306811544632e-05, "loss": 0.7014, "num_input_tokens_seen": 24541800, "step": 42550 }, { "epoch": 6.338248436103664, "grad_norm": 1.960851788520813, "learning_rate": 1.779008385423031e-05, "loss": 0.6005, "num_input_tokens_seen": 24544712, "step": 42555 }, { "epoch": 6.338993148644623, "grad_norm": 1.9421254396438599, "learning_rate": 1.778386138413281e-05, "loss": 0.4923, "num_input_tokens_seen": 24547816, "step": 42560 }, { "epoch": 6.339737861185583, "grad_norm": 1.7053375244140625, "learning_rate": 1.7777639401672613e-05, "loss": 0.5259, "num_input_tokens_seen": 24550792, "step": 42565 }, { "epoch": 6.340482573726542, "grad_norm": 1.1562087535858154, "learning_rate": 1.7771417907270187e-05, "loss": 0.5175, "num_input_tokens_seen": 24553672, "step": 42570 }, { "epoch": 6.3412272862675, "grad_norm": 3.361593246459961, "learning_rate": 1.7765196901345946e-05, "loss": 0.5629, "num_input_tokens_seen": 24556456, "step": 42575 }, { "epoch": 6.34197199880846, "grad_norm": 1.785712480545044, "learning_rate": 1.775897638432028e-05, "loss": 0.6164, "num_input_tokens_seen": 24559528, "step": 42580 }, { "epoch": 6.342716711349419, "grad_norm": 3.59230375289917, "learning_rate": 1.775275635661356e-05, "loss": 0.8189, "num_input_tokens_seen": 24562152, "step": 42585 }, { "epoch": 6.343461423890378, "grad_norm": 1.9107800722122192, "learning_rate": 1.774653681864609e-05, "loss": 0.5744, "num_input_tokens_seen": 24565000, "step": 42590 }, { "epoch": 6.344206136431337, "grad_norm": 1.464294195175171, "learning_rate": 1.7740317770838173e-05, "loss": 0.3849, "num_input_tokens_seen": 24567880, "step": 42595 }, { "epoch": 6.344950848972297, "grad_norm": 2.3139848709106445, "learning_rate": 1.7734099213610075e-05, "loss": 0.4465, "num_input_tokens_seen": 24570728, "step": 42600 }, { "epoch": 6.3456955615132555, "grad_norm": 2.2099180221557617, "learning_rate": 1.772788114738201e-05, "loss": 0.5, "num_input_tokens_seen": 24573576, "step": 42605 }, { "epoch": 6.346440274054215, "grad_norm": 3.8306901454925537, "learning_rate": 1.772166357257416e-05, "loss": 0.814, "num_input_tokens_seen": 24576392, "step": 42610 }, { "epoch": 6.347184986595174, "grad_norm": 1.459932804107666, "learning_rate": 1.7715446489606696e-05, "loss": 0.6933, "num_input_tokens_seen": 24579144, "step": 42615 }, { "epoch": 6.347929699136134, "grad_norm": 4.669093608856201, "learning_rate": 1.770922989889975e-05, "loss": 0.5972, "num_input_tokens_seen": 24582184, "step": 42620 }, { "epoch": 6.348674411677092, "grad_norm": 3.058610677719116, "learning_rate": 1.7703013800873398e-05, "loss": 0.7414, "num_input_tokens_seen": 24584936, "step": 42625 }, { "epoch": 6.349419124218052, "grad_norm": 1.7020546197891235, "learning_rate": 1.7696798195947704e-05, "loss": 0.5657, "num_input_tokens_seen": 24587720, "step": 42630 }, { "epoch": 6.350163836759011, "grad_norm": 0.7506810426712036, "learning_rate": 1.76905830845427e-05, "loss": 0.5316, "num_input_tokens_seen": 24590600, "step": 42635 }, { "epoch": 6.35090854929997, "grad_norm": 1.7000713348388672, "learning_rate": 1.7684368467078384e-05, "loss": 0.499, "num_input_tokens_seen": 24593608, "step": 42640 }, { "epoch": 6.351653261840929, "grad_norm": 1.406469464302063, "learning_rate": 1.7678154343974686e-05, "loss": 0.5224, "num_input_tokens_seen": 24596456, "step": 42645 }, { "epoch": 6.352397974381889, "grad_norm": 1.8241355419158936, "learning_rate": 1.7671940715651553e-05, "loss": 0.5581, "num_input_tokens_seen": 24599240, "step": 42650 }, { "epoch": 6.353142686922848, "grad_norm": 3.163511276245117, "learning_rate": 1.7665727582528878e-05, "loss": 0.6515, "num_input_tokens_seen": 24601992, "step": 42655 }, { "epoch": 6.353887399463807, "grad_norm": 5.157626628875732, "learning_rate": 1.7659514945026508e-05, "loss": 0.6996, "num_input_tokens_seen": 24604584, "step": 42660 }, { "epoch": 6.354632112004766, "grad_norm": 2.8947465419769287, "learning_rate": 1.7653302803564275e-05, "loss": 0.7638, "num_input_tokens_seen": 24607624, "step": 42665 }, { "epoch": 6.355376824545726, "grad_norm": 1.2320572137832642, "learning_rate": 1.7647091158561974e-05, "loss": 0.5463, "num_input_tokens_seen": 24610344, "step": 42670 }, { "epoch": 6.356121537086684, "grad_norm": 5.080036163330078, "learning_rate": 1.764088001043935e-05, "loss": 0.8104, "num_input_tokens_seen": 24613160, "step": 42675 }, { "epoch": 6.356866249627644, "grad_norm": 4.494429111480713, "learning_rate": 1.763466935961615e-05, "loss": 0.5687, "num_input_tokens_seen": 24616232, "step": 42680 }, { "epoch": 6.357610962168603, "grad_norm": 1.7777760028839111, "learning_rate": 1.7628459206512033e-05, "loss": 0.5587, "num_input_tokens_seen": 24619240, "step": 42685 }, { "epoch": 6.358355674709562, "grad_norm": 2.3581414222717285, "learning_rate": 1.7622249551546682e-05, "loss": 0.5389, "num_input_tokens_seen": 24622120, "step": 42690 }, { "epoch": 6.359100387250521, "grad_norm": 1.4401803016662598, "learning_rate": 1.7616040395139706e-05, "loss": 0.554, "num_input_tokens_seen": 24625448, "step": 42695 }, { "epoch": 6.359845099791481, "grad_norm": 2.3625121116638184, "learning_rate": 1.76098317377107e-05, "loss": 0.557, "num_input_tokens_seen": 24628392, "step": 42700 }, { "epoch": 6.36058981233244, "grad_norm": 1.220354437828064, "learning_rate": 1.7603623579679217e-05, "loss": 0.5709, "num_input_tokens_seen": 24630920, "step": 42705 }, { "epoch": 6.361334524873399, "grad_norm": 3.7447879314422607, "learning_rate": 1.759741592146478e-05, "loss": 0.9452, "num_input_tokens_seen": 24633800, "step": 42710 }, { "epoch": 6.362079237414358, "grad_norm": 2.5405898094177246, "learning_rate": 1.7591208763486883e-05, "loss": 0.7875, "num_input_tokens_seen": 24636744, "step": 42715 }, { "epoch": 6.362823949955318, "grad_norm": 1.9983834028244019, "learning_rate": 1.7585002106164976e-05, "loss": 0.5634, "num_input_tokens_seen": 24639528, "step": 42720 }, { "epoch": 6.363568662496276, "grad_norm": 3.0962891578674316, "learning_rate": 1.757879594991848e-05, "loss": 0.7566, "num_input_tokens_seen": 24642408, "step": 42725 }, { "epoch": 6.364313375037236, "grad_norm": 1.6718846559524536, "learning_rate": 1.757259029516678e-05, "loss": 0.4211, "num_input_tokens_seen": 24645224, "step": 42730 }, { "epoch": 6.365058087578195, "grad_norm": 1.645921230316162, "learning_rate": 1.7566385142329227e-05, "loss": 0.5129, "num_input_tokens_seen": 24648040, "step": 42735 }, { "epoch": 6.365802800119154, "grad_norm": 1.153040885925293, "learning_rate": 1.7560180491825144e-05, "loss": 0.5035, "num_input_tokens_seen": 24651048, "step": 42740 }, { "epoch": 6.366547512660113, "grad_norm": 3.0414445400238037, "learning_rate": 1.7553976344073815e-05, "loss": 0.6886, "num_input_tokens_seen": 24653800, "step": 42745 }, { "epoch": 6.367292225201073, "grad_norm": 1.0063083171844482, "learning_rate": 1.7547772699494494e-05, "loss": 0.5168, "num_input_tokens_seen": 24656744, "step": 42750 }, { "epoch": 6.368036937742032, "grad_norm": 2.7321159839630127, "learning_rate": 1.7541569558506393e-05, "loss": 0.5791, "num_input_tokens_seen": 24659432, "step": 42755 }, { "epoch": 6.36878165028299, "grad_norm": 1.6840243339538574, "learning_rate": 1.7535366921528707e-05, "loss": 0.667, "num_input_tokens_seen": 24662536, "step": 42760 }, { "epoch": 6.36952636282395, "grad_norm": 1.4708229303359985, "learning_rate": 1.752916478898056e-05, "loss": 0.5971, "num_input_tokens_seen": 24665320, "step": 42765 }, { "epoch": 6.370271075364909, "grad_norm": 1.5875871181488037, "learning_rate": 1.7522963161281094e-05, "loss": 0.6411, "num_input_tokens_seen": 24668168, "step": 42770 }, { "epoch": 6.371015787905868, "grad_norm": 1.603170394897461, "learning_rate": 1.751676203884937e-05, "loss": 0.5432, "num_input_tokens_seen": 24670824, "step": 42775 }, { "epoch": 6.371760500446827, "grad_norm": 2.27378249168396, "learning_rate": 1.7510561422104444e-05, "loss": 0.6153, "num_input_tokens_seen": 24673448, "step": 42780 }, { "epoch": 6.372505212987787, "grad_norm": 0.9003387689590454, "learning_rate": 1.7504361311465332e-05, "loss": 0.7374, "num_input_tokens_seen": 24676200, "step": 42785 }, { "epoch": 6.373249925528746, "grad_norm": 1.8616729974746704, "learning_rate": 1.7498161707351005e-05, "loss": 0.5945, "num_input_tokens_seen": 24679208, "step": 42790 }, { "epoch": 6.373994638069705, "grad_norm": 1.4911333322525024, "learning_rate": 1.7491962610180408e-05, "loss": 0.5267, "num_input_tokens_seen": 24681896, "step": 42795 }, { "epoch": 6.374739350610664, "grad_norm": 2.735741376876831, "learning_rate": 1.748576402037246e-05, "loss": 0.4482, "num_input_tokens_seen": 24684904, "step": 42800 }, { "epoch": 6.375484063151624, "grad_norm": 2.39334774017334, "learning_rate": 1.7479565938346025e-05, "loss": 0.5305, "num_input_tokens_seen": 24687752, "step": 42805 }, { "epoch": 6.376228775692582, "grad_norm": 2.272034168243408, "learning_rate": 1.7473368364519945e-05, "loss": 0.6273, "num_input_tokens_seen": 24690344, "step": 42810 }, { "epoch": 6.376973488233542, "grad_norm": 3.1320767402648926, "learning_rate": 1.746717129931303e-05, "loss": 0.7261, "num_input_tokens_seen": 24693320, "step": 42815 }, { "epoch": 6.377718200774501, "grad_norm": 1.3569934368133545, "learning_rate": 1.7460974743144055e-05, "loss": 0.4955, "num_input_tokens_seen": 24696200, "step": 42820 }, { "epoch": 6.3784629133154604, "grad_norm": 3.8072304725646973, "learning_rate": 1.7454778696431747e-05, "loss": 0.6609, "num_input_tokens_seen": 24699240, "step": 42825 }, { "epoch": 6.379207625856419, "grad_norm": 2.2809677124023438, "learning_rate": 1.7448583159594822e-05, "loss": 0.675, "num_input_tokens_seen": 24702120, "step": 42830 }, { "epoch": 6.379952338397379, "grad_norm": 2.9758927822113037, "learning_rate": 1.744238813305195e-05, "loss": 0.3186, "num_input_tokens_seen": 24705096, "step": 42835 }, { "epoch": 6.380697050938338, "grad_norm": 4.439962863922119, "learning_rate": 1.743619361722177e-05, "loss": 0.6412, "num_input_tokens_seen": 24708008, "step": 42840 }, { "epoch": 6.381441763479297, "grad_norm": 2.352416515350342, "learning_rate": 1.7429999612522858e-05, "loss": 0.6126, "num_input_tokens_seen": 24711368, "step": 42845 }, { "epoch": 6.382186476020256, "grad_norm": 1.0133553743362427, "learning_rate": 1.7423806119373794e-05, "loss": 0.6197, "num_input_tokens_seen": 24714056, "step": 42850 }, { "epoch": 6.382931188561216, "grad_norm": 3.7542762756347656, "learning_rate": 1.7417613138193117e-05, "loss": 0.6836, "num_input_tokens_seen": 24716904, "step": 42855 }, { "epoch": 6.383675901102174, "grad_norm": 1.8514775037765503, "learning_rate": 1.7411420669399315e-05, "loss": 0.5878, "num_input_tokens_seen": 24719624, "step": 42860 }, { "epoch": 6.384420613643134, "grad_norm": 2.146576404571533, "learning_rate": 1.740522871341085e-05, "loss": 0.6919, "num_input_tokens_seen": 24722760, "step": 42865 }, { "epoch": 6.385165326184093, "grad_norm": 1.9683908224105835, "learning_rate": 1.739903727064615e-05, "loss": 0.5713, "num_input_tokens_seen": 24725576, "step": 42870 }, { "epoch": 6.3859100387250525, "grad_norm": 1.8369064331054688, "learning_rate": 1.7392846341523606e-05, "loss": 0.4464, "num_input_tokens_seen": 24728392, "step": 42875 }, { "epoch": 6.386654751266011, "grad_norm": 1.7603563070297241, "learning_rate": 1.7386655926461586e-05, "loss": 0.6693, "num_input_tokens_seen": 24731016, "step": 42880 }, { "epoch": 6.387399463806971, "grad_norm": 2.0864553451538086, "learning_rate": 1.73804660258784e-05, "loss": 0.6262, "num_input_tokens_seen": 24733928, "step": 42885 }, { "epoch": 6.38814417634793, "grad_norm": 1.5595006942749023, "learning_rate": 1.737427664019234e-05, "loss": 0.6161, "num_input_tokens_seen": 24737096, "step": 42890 }, { "epoch": 6.388888888888889, "grad_norm": 8.036312103271484, "learning_rate": 1.736808776982166e-05, "loss": 0.4327, "num_input_tokens_seen": 24739624, "step": 42895 }, { "epoch": 6.389633601429848, "grad_norm": 2.0838029384613037, "learning_rate": 1.7361899415184584e-05, "loss": 0.5965, "num_input_tokens_seen": 24742664, "step": 42900 }, { "epoch": 6.390378313970807, "grad_norm": 2.802961826324463, "learning_rate": 1.7355711576699286e-05, "loss": 0.7237, "num_input_tokens_seen": 24745544, "step": 42905 }, { "epoch": 6.3911230265117664, "grad_norm": 2.3542561531066895, "learning_rate": 1.734952425478392e-05, "loss": 0.6132, "num_input_tokens_seen": 24748200, "step": 42910 }, { "epoch": 6.391867739052726, "grad_norm": 2.430676221847534, "learning_rate": 1.7343337449856605e-05, "loss": 0.6051, "num_input_tokens_seen": 24751112, "step": 42915 }, { "epoch": 6.392612451593685, "grad_norm": 1.9384766817092896, "learning_rate": 1.733715116233542e-05, "loss": 0.6077, "num_input_tokens_seen": 24754216, "step": 42920 }, { "epoch": 6.393357164134644, "grad_norm": 2.347439765930176, "learning_rate": 1.7330965392638394e-05, "loss": 0.68, "num_input_tokens_seen": 24757160, "step": 42925 }, { "epoch": 6.394101876675603, "grad_norm": 3.4124033451080322, "learning_rate": 1.732478014118355e-05, "loss": 0.7612, "num_input_tokens_seen": 24760104, "step": 42930 }, { "epoch": 6.394846589216562, "grad_norm": 1.669674038887024, "learning_rate": 1.7318595408388862e-05, "loss": 0.7128, "num_input_tokens_seen": 24762824, "step": 42935 }, { "epoch": 6.395591301757522, "grad_norm": 1.9367332458496094, "learning_rate": 1.7312411194672258e-05, "loss": 0.5183, "num_input_tokens_seen": 24765576, "step": 42940 }, { "epoch": 6.39633601429848, "grad_norm": 3.7995564937591553, "learning_rate": 1.7306227500451654e-05, "loss": 0.8948, "num_input_tokens_seen": 24768328, "step": 42945 }, { "epoch": 6.39708072683944, "grad_norm": 1.5869871377944946, "learning_rate": 1.7300044326144918e-05, "loss": 0.678, "num_input_tokens_seen": 24771528, "step": 42950 }, { "epoch": 6.397825439380399, "grad_norm": 1.7987817525863647, "learning_rate": 1.7293861672169874e-05, "loss": 0.601, "num_input_tokens_seen": 24774216, "step": 42955 }, { "epoch": 6.3985701519213585, "grad_norm": 1.5630916357040405, "learning_rate": 1.728767953894434e-05, "loss": 0.5633, "num_input_tokens_seen": 24777096, "step": 42960 }, { "epoch": 6.399314864462317, "grad_norm": 1.0589715242385864, "learning_rate": 1.728149792688606e-05, "loss": 0.5723, "num_input_tokens_seen": 24779848, "step": 42965 }, { "epoch": 6.400059577003277, "grad_norm": 2.730050563812256, "learning_rate": 1.7275316836412768e-05, "loss": 0.6487, "num_input_tokens_seen": 24782856, "step": 42970 }, { "epoch": 6.400804289544236, "grad_norm": 2.1402788162231445, "learning_rate": 1.7269136267942155e-05, "loss": 0.5419, "num_input_tokens_seen": 24785544, "step": 42975 }, { "epoch": 6.401549002085195, "grad_norm": 1.7307406663894653, "learning_rate": 1.7262956221891882e-05, "loss": 0.2866, "num_input_tokens_seen": 24788168, "step": 42980 }, { "epoch": 6.402293714626154, "grad_norm": 2.44879150390625, "learning_rate": 1.7256776698679577e-05, "loss": 0.755, "num_input_tokens_seen": 24790952, "step": 42985 }, { "epoch": 6.403038427167114, "grad_norm": 1.6464905738830566, "learning_rate": 1.7250597698722813e-05, "loss": 0.6177, "num_input_tokens_seen": 24793800, "step": 42990 }, { "epoch": 6.4037831397080724, "grad_norm": 2.8253707885742188, "learning_rate": 1.7244419222439152e-05, "loss": 0.6982, "num_input_tokens_seen": 24796840, "step": 42995 }, { "epoch": 6.404527852249032, "grad_norm": 2.3270158767700195, "learning_rate": 1.723824127024612e-05, "loss": 0.5926, "num_input_tokens_seen": 24799720, "step": 43000 }, { "epoch": 6.405272564789991, "grad_norm": 3.091897964477539, "learning_rate": 1.723206384256118e-05, "loss": 0.6297, "num_input_tokens_seen": 24802728, "step": 43005 }, { "epoch": 6.4060172773309505, "grad_norm": 2.10714054107666, "learning_rate": 1.7225886939801773e-05, "loss": 0.6487, "num_input_tokens_seen": 24805576, "step": 43010 }, { "epoch": 6.406761989871909, "grad_norm": 1.6186552047729492, "learning_rate": 1.7219710562385324e-05, "loss": 0.6101, "num_input_tokens_seen": 24808328, "step": 43015 }, { "epoch": 6.407506702412869, "grad_norm": 1.7806119918823242, "learning_rate": 1.7213534710729205e-05, "loss": 0.5473, "num_input_tokens_seen": 24811208, "step": 43020 }, { "epoch": 6.408251414953828, "grad_norm": 2.427215337753296, "learning_rate": 1.7207359385250756e-05, "loss": 0.5493, "num_input_tokens_seen": 24813896, "step": 43025 }, { "epoch": 6.408996127494787, "grad_norm": 2.530731678009033, "learning_rate": 1.7201184586367272e-05, "loss": 0.5572, "num_input_tokens_seen": 24816744, "step": 43030 }, { "epoch": 6.409740840035746, "grad_norm": 3.2496871948242188, "learning_rate": 1.7195010314496023e-05, "loss": 0.6396, "num_input_tokens_seen": 24819624, "step": 43035 }, { "epoch": 6.410485552576706, "grad_norm": 2.9514551162719727, "learning_rate": 1.718883657005426e-05, "loss": 0.7752, "num_input_tokens_seen": 24822472, "step": 43040 }, { "epoch": 6.4112302651176645, "grad_norm": 3.742887258529663, "learning_rate": 1.718266335345915e-05, "loss": 0.5599, "num_input_tokens_seen": 24825192, "step": 43045 }, { "epoch": 6.411974977658624, "grad_norm": 1.973961591720581, "learning_rate": 1.7176490665127868e-05, "loss": 0.5057, "num_input_tokens_seen": 24827848, "step": 43050 }, { "epoch": 6.412719690199583, "grad_norm": 2.8013312816619873, "learning_rate": 1.7170318505477543e-05, "loss": 0.5971, "num_input_tokens_seen": 24830920, "step": 43055 }, { "epoch": 6.4134644027405425, "grad_norm": 1.0145093202590942, "learning_rate": 1.7164146874925254e-05, "loss": 0.5519, "num_input_tokens_seen": 24833672, "step": 43060 }, { "epoch": 6.414209115281501, "grad_norm": 2.2491297721862793, "learning_rate": 1.715797577388807e-05, "loss": 0.5356, "num_input_tokens_seen": 24836584, "step": 43065 }, { "epoch": 6.414953827822461, "grad_norm": 1.1281787157058716, "learning_rate": 1.715180520278299e-05, "loss": 0.6418, "num_input_tokens_seen": 24839336, "step": 43070 }, { "epoch": 6.41569854036342, "grad_norm": 2.1857388019561768, "learning_rate": 1.7145635162027008e-05, "loss": 0.5055, "num_input_tokens_seen": 24842024, "step": 43075 }, { "epoch": 6.416443252904379, "grad_norm": 2.5226094722747803, "learning_rate": 1.7139465652037077e-05, "loss": 0.7767, "num_input_tokens_seen": 24844936, "step": 43080 }, { "epoch": 6.417187965445338, "grad_norm": 1.691454291343689, "learning_rate": 1.7133296673230097e-05, "loss": 0.6137, "num_input_tokens_seen": 24847560, "step": 43085 }, { "epoch": 6.417932677986297, "grad_norm": 3.3001298904418945, "learning_rate": 1.7127128226022936e-05, "loss": 0.6621, "num_input_tokens_seen": 24850760, "step": 43090 }, { "epoch": 6.4186773905272565, "grad_norm": 1.934077501296997, "learning_rate": 1.7120960310832446e-05, "loss": 0.6123, "num_input_tokens_seen": 24853640, "step": 43095 }, { "epoch": 6.419422103068215, "grad_norm": 3.7835893630981445, "learning_rate": 1.7114792928075422e-05, "loss": 0.6678, "num_input_tokens_seen": 24856456, "step": 43100 }, { "epoch": 6.420166815609175, "grad_norm": 1.354224681854248, "learning_rate": 1.7108626078168634e-05, "loss": 0.5935, "num_input_tokens_seen": 24859208, "step": 43105 }, { "epoch": 6.420911528150134, "grad_norm": 2.392815589904785, "learning_rate": 1.7102459761528812e-05, "loss": 0.5834, "num_input_tokens_seen": 24862440, "step": 43110 }, { "epoch": 6.421656240691093, "grad_norm": 2.5655581951141357, "learning_rate": 1.709629397857265e-05, "loss": 0.6409, "num_input_tokens_seen": 24865320, "step": 43115 }, { "epoch": 6.422400953232052, "grad_norm": 1.3130099773406982, "learning_rate": 1.7090128729716815e-05, "loss": 0.7173, "num_input_tokens_seen": 24868168, "step": 43120 }, { "epoch": 6.423145665773012, "grad_norm": 1.9583786725997925, "learning_rate": 1.708396401537791e-05, "loss": 0.5068, "num_input_tokens_seen": 24871208, "step": 43125 }, { "epoch": 6.4238903783139705, "grad_norm": 3.227386951446533, "learning_rate": 1.707779983597253e-05, "loss": 0.7112, "num_input_tokens_seen": 24874312, "step": 43130 }, { "epoch": 6.42463509085493, "grad_norm": 1.7695565223693848, "learning_rate": 1.7071636191917238e-05, "loss": 0.6101, "num_input_tokens_seen": 24877192, "step": 43135 }, { "epoch": 6.425379803395889, "grad_norm": 2.662883758544922, "learning_rate": 1.706547308362853e-05, "loss": 0.6186, "num_input_tokens_seen": 24880040, "step": 43140 }, { "epoch": 6.4261245159368485, "grad_norm": 1.6566258668899536, "learning_rate": 1.705931051152289e-05, "loss": 0.6517, "num_input_tokens_seen": 24882760, "step": 43145 }, { "epoch": 6.426869228477807, "grad_norm": 1.4092612266540527, "learning_rate": 1.7053148476016774e-05, "loss": 0.5271, "num_input_tokens_seen": 24885480, "step": 43150 }, { "epoch": 6.427613941018767, "grad_norm": 2.236401319503784, "learning_rate": 1.704698697752656e-05, "loss": 0.503, "num_input_tokens_seen": 24888648, "step": 43155 }, { "epoch": 6.428358653559726, "grad_norm": 1.5069578886032104, "learning_rate": 1.7040826016468637e-05, "loss": 0.6238, "num_input_tokens_seen": 24891496, "step": 43160 }, { "epoch": 6.429103366100685, "grad_norm": 2.727374792098999, "learning_rate": 1.7034665593259338e-05, "loss": 0.6903, "num_input_tokens_seen": 24894280, "step": 43165 }, { "epoch": 6.429848078641644, "grad_norm": 2.2320339679718018, "learning_rate": 1.7028505708314953e-05, "loss": 0.635, "num_input_tokens_seen": 24897096, "step": 43170 }, { "epoch": 6.430592791182604, "grad_norm": 1.6485950946807861, "learning_rate": 1.702234636205173e-05, "loss": 0.686, "num_input_tokens_seen": 24899880, "step": 43175 }, { "epoch": 6.4313375037235625, "grad_norm": 2.330810546875, "learning_rate": 1.7016187554885916e-05, "loss": 0.4624, "num_input_tokens_seen": 24902984, "step": 43180 }, { "epoch": 6.432082216264522, "grad_norm": 3.890994071960449, "learning_rate": 1.7010029287233688e-05, "loss": 0.6322, "num_input_tokens_seen": 24905736, "step": 43185 }, { "epoch": 6.432826928805481, "grad_norm": 1.9898487329483032, "learning_rate": 1.7003871559511187e-05, "loss": 0.5163, "num_input_tokens_seen": 24908328, "step": 43190 }, { "epoch": 6.4335716413464406, "grad_norm": 2.2543725967407227, "learning_rate": 1.6997714372134544e-05, "loss": 0.6071, "num_input_tokens_seen": 24911624, "step": 43195 }, { "epoch": 6.434316353887399, "grad_norm": 2.9416115283966064, "learning_rate": 1.6991557725519824e-05, "loss": 0.5904, "num_input_tokens_seen": 24914376, "step": 43200 }, { "epoch": 6.435061066428359, "grad_norm": 1.121087670326233, "learning_rate": 1.698540162008308e-05, "loss": 0.4967, "num_input_tokens_seen": 24917224, "step": 43205 }, { "epoch": 6.435805778969318, "grad_norm": 1.4625996351242065, "learning_rate": 1.6979246056240305e-05, "loss": 0.537, "num_input_tokens_seen": 24919944, "step": 43210 }, { "epoch": 6.436550491510277, "grad_norm": 5.008917331695557, "learning_rate": 1.6973091034407468e-05, "loss": 0.7014, "num_input_tokens_seen": 24922536, "step": 43215 }, { "epoch": 6.437295204051236, "grad_norm": 2.691728353500366, "learning_rate": 1.6966936555000507e-05, "loss": 0.7961, "num_input_tokens_seen": 24925640, "step": 43220 }, { "epoch": 6.438039916592196, "grad_norm": 2.3580374717712402, "learning_rate": 1.6960782618435312e-05, "loss": 0.6209, "num_input_tokens_seen": 24928616, "step": 43225 }, { "epoch": 6.4387846291331545, "grad_norm": 4.549179553985596, "learning_rate": 1.6954629225127745e-05, "loss": 0.7477, "num_input_tokens_seen": 24931496, "step": 43230 }, { "epoch": 6.439529341674114, "grad_norm": 4.444252967834473, "learning_rate": 1.6948476375493622e-05, "loss": 0.5174, "num_input_tokens_seen": 24934344, "step": 43235 }, { "epoch": 6.440274054215073, "grad_norm": 2.040388584136963, "learning_rate": 1.694232406994873e-05, "loss": 0.6063, "num_input_tokens_seen": 24937192, "step": 43240 }, { "epoch": 6.441018766756033, "grad_norm": 1.437929391860962, "learning_rate": 1.6936172308908825e-05, "loss": 0.5723, "num_input_tokens_seen": 24939976, "step": 43245 }, { "epoch": 6.441763479296991, "grad_norm": 3.546374797821045, "learning_rate": 1.693002109278961e-05, "loss": 0.5945, "num_input_tokens_seen": 24942760, "step": 43250 }, { "epoch": 6.44250819183795, "grad_norm": 2.121950387954712, "learning_rate": 1.6923870422006753e-05, "loss": 0.671, "num_input_tokens_seen": 24945864, "step": 43255 }, { "epoch": 6.44325290437891, "grad_norm": 2.8742878437042236, "learning_rate": 1.6917720296975898e-05, "loss": 0.5208, "num_input_tokens_seen": 24948744, "step": 43260 }, { "epoch": 6.443997616919869, "grad_norm": 2.0046961307525635, "learning_rate": 1.6911570718112646e-05, "loss": 0.5532, "num_input_tokens_seen": 24951592, "step": 43265 }, { "epoch": 6.444742329460828, "grad_norm": 2.5592873096466064, "learning_rate": 1.6905421685832555e-05, "loss": 0.3795, "num_input_tokens_seen": 24954344, "step": 43270 }, { "epoch": 6.445487042001787, "grad_norm": 1.640234112739563, "learning_rate": 1.689927320055116e-05, "loss": 0.5039, "num_input_tokens_seen": 24957032, "step": 43275 }, { "epoch": 6.4462317545427466, "grad_norm": 1.5747923851013184, "learning_rate": 1.6893125262683952e-05, "loss": 0.5074, "num_input_tokens_seen": 24959976, "step": 43280 }, { "epoch": 6.446976467083705, "grad_norm": 1.6277146339416504, "learning_rate": 1.688697787264638e-05, "loss": 0.5386, "num_input_tokens_seen": 24962920, "step": 43285 }, { "epoch": 6.447721179624665, "grad_norm": 2.3622567653656006, "learning_rate": 1.6880831030853854e-05, "loss": 0.5579, "num_input_tokens_seen": 24965896, "step": 43290 }, { "epoch": 6.448465892165624, "grad_norm": 1.99739408493042, "learning_rate": 1.6874684737721752e-05, "loss": 0.6705, "num_input_tokens_seen": 24968648, "step": 43295 }, { "epoch": 6.449210604706583, "grad_norm": 1.1967412233352661, "learning_rate": 1.6868538993665426e-05, "loss": 0.6317, "num_input_tokens_seen": 24971464, "step": 43300 }, { "epoch": 6.449955317247542, "grad_norm": 1.8256088495254517, "learning_rate": 1.6862393799100166e-05, "loss": 0.568, "num_input_tokens_seen": 24974280, "step": 43305 }, { "epoch": 6.450700029788502, "grad_norm": 4.477097511291504, "learning_rate": 1.6856249154441256e-05, "loss": 0.6169, "num_input_tokens_seen": 24977032, "step": 43310 }, { "epoch": 6.4514447423294605, "grad_norm": 2.409287214279175, "learning_rate": 1.685010506010392e-05, "loss": 0.6176, "num_input_tokens_seen": 24980168, "step": 43315 }, { "epoch": 6.45218945487042, "grad_norm": 2.5548245906829834, "learning_rate": 1.6843961516503344e-05, "loss": 0.6967, "num_input_tokens_seen": 24983240, "step": 43320 }, { "epoch": 6.452934167411379, "grad_norm": 4.555415630340576, "learning_rate": 1.6837818524054696e-05, "loss": 0.5891, "num_input_tokens_seen": 24985896, "step": 43325 }, { "epoch": 6.453678879952339, "grad_norm": 1.7340097427368164, "learning_rate": 1.683167608317308e-05, "loss": 0.5667, "num_input_tokens_seen": 24989032, "step": 43330 }, { "epoch": 6.454423592493297, "grad_norm": 2.0992908477783203, "learning_rate": 1.6825534194273586e-05, "loss": 0.8102, "num_input_tokens_seen": 24992104, "step": 43335 }, { "epoch": 6.455168305034257, "grad_norm": 1.5891528129577637, "learning_rate": 1.6819392857771253e-05, "loss": 0.596, "num_input_tokens_seen": 24994792, "step": 43340 }, { "epoch": 6.455913017575216, "grad_norm": 1.6470633745193481, "learning_rate": 1.6813252074081094e-05, "loss": 0.468, "num_input_tokens_seen": 24997768, "step": 43345 }, { "epoch": 6.456657730116175, "grad_norm": 1.4240504503250122, "learning_rate": 1.6807111843618077e-05, "loss": 0.4428, "num_input_tokens_seen": 25000552, "step": 43350 }, { "epoch": 6.457402442657134, "grad_norm": 2.129687786102295, "learning_rate": 1.6800972166797126e-05, "loss": 0.4447, "num_input_tokens_seen": 25003176, "step": 43355 }, { "epoch": 6.458147155198094, "grad_norm": 1.6324342489242554, "learning_rate": 1.6794833044033147e-05, "loss": 0.6473, "num_input_tokens_seen": 25005832, "step": 43360 }, { "epoch": 6.4588918677390526, "grad_norm": 2.5042874813079834, "learning_rate": 1.678869447574099e-05, "loss": 0.5737, "num_input_tokens_seen": 25008552, "step": 43365 }, { "epoch": 6.459636580280012, "grad_norm": 1.4121068716049194, "learning_rate": 1.678255646233548e-05, "loss": 0.6645, "num_input_tokens_seen": 25011368, "step": 43370 }, { "epoch": 6.460381292820971, "grad_norm": 2.4170634746551514, "learning_rate": 1.6776419004231386e-05, "loss": 0.5565, "num_input_tokens_seen": 25014120, "step": 43375 }, { "epoch": 6.461126005361931, "grad_norm": 3.8135082721710205, "learning_rate": 1.677028210184346e-05, "loss": 0.776, "num_input_tokens_seen": 25016968, "step": 43380 }, { "epoch": 6.461870717902889, "grad_norm": 0.8998611569404602, "learning_rate": 1.6764145755586417e-05, "loss": 0.5473, "num_input_tokens_seen": 25019720, "step": 43385 }, { "epoch": 6.462615430443849, "grad_norm": 3.183429002761841, "learning_rate": 1.675800996587491e-05, "loss": 0.7324, "num_input_tokens_seen": 25022408, "step": 43390 }, { "epoch": 6.463360142984808, "grad_norm": 2.6437885761260986, "learning_rate": 1.675187473312359e-05, "loss": 0.5197, "num_input_tokens_seen": 25025416, "step": 43395 }, { "epoch": 6.464104855525767, "grad_norm": 2.9864625930786133, "learning_rate": 1.6745740057747038e-05, "loss": 0.593, "num_input_tokens_seen": 25028232, "step": 43400 }, { "epoch": 6.464849568066726, "grad_norm": 1.6787075996398926, "learning_rate": 1.673960594015982e-05, "loss": 0.5946, "num_input_tokens_seen": 25031240, "step": 43405 }, { "epoch": 6.465594280607686, "grad_norm": 1.3342536687850952, "learning_rate": 1.673347238077644e-05, "loss": 0.6648, "num_input_tokens_seen": 25034152, "step": 43410 }, { "epoch": 6.466338993148645, "grad_norm": 1.6191997528076172, "learning_rate": 1.6727339380011386e-05, "loss": 0.5582, "num_input_tokens_seen": 25036744, "step": 43415 }, { "epoch": 6.467083705689604, "grad_norm": 2.7695858478546143, "learning_rate": 1.6721206938279105e-05, "loss": 0.4701, "num_input_tokens_seen": 25039336, "step": 43420 }, { "epoch": 6.467828418230563, "grad_norm": 2.0014686584472656, "learning_rate": 1.6715075055993994e-05, "loss": 0.4738, "num_input_tokens_seen": 25042472, "step": 43425 }, { "epoch": 6.468573130771523, "grad_norm": 3.2470672130584717, "learning_rate": 1.6708943733570437e-05, "loss": 0.7224, "num_input_tokens_seen": 25045704, "step": 43430 }, { "epoch": 6.469317843312481, "grad_norm": 2.048391580581665, "learning_rate": 1.6702812971422746e-05, "loss": 0.5817, "num_input_tokens_seen": 25048968, "step": 43435 }, { "epoch": 6.47006255585344, "grad_norm": 3.4168639183044434, "learning_rate": 1.669668276996522e-05, "loss": 0.5119, "num_input_tokens_seen": 25051880, "step": 43440 }, { "epoch": 6.4708072683944, "grad_norm": 1.4994428157806396, "learning_rate": 1.6690553129612125e-05, "loss": 0.4983, "num_input_tokens_seen": 25054728, "step": 43445 }, { "epoch": 6.4715519809353586, "grad_norm": 2.255098581314087, "learning_rate": 1.668442405077766e-05, "loss": 0.5942, "num_input_tokens_seen": 25057640, "step": 43450 }, { "epoch": 6.472296693476318, "grad_norm": 1.18346107006073, "learning_rate": 1.6678295533876006e-05, "loss": 0.5985, "num_input_tokens_seen": 25060296, "step": 43455 }, { "epoch": 6.473041406017277, "grad_norm": 1.479424238204956, "learning_rate": 1.6672167579321305e-05, "loss": 0.4633, "num_input_tokens_seen": 25063176, "step": 43460 }, { "epoch": 6.473786118558237, "grad_norm": 3.3541173934936523, "learning_rate": 1.6666040187527665e-05, "loss": 0.7728, "num_input_tokens_seen": 25065736, "step": 43465 }, { "epoch": 6.474530831099195, "grad_norm": 1.263780117034912, "learning_rate": 1.665991335890914e-05, "loss": 0.5877, "num_input_tokens_seen": 25068520, "step": 43470 }, { "epoch": 6.475275543640155, "grad_norm": 4.213689804077148, "learning_rate": 1.6653787093879762e-05, "loss": 0.5793, "num_input_tokens_seen": 25071176, "step": 43475 }, { "epoch": 6.476020256181114, "grad_norm": 1.9637975692749023, "learning_rate": 1.6647661392853525e-05, "loss": 0.527, "num_input_tokens_seen": 25074120, "step": 43480 }, { "epoch": 6.476764968722073, "grad_norm": 1.5555189847946167, "learning_rate": 1.664153625624438e-05, "loss": 0.7585, "num_input_tokens_seen": 25076904, "step": 43485 }, { "epoch": 6.477509681263032, "grad_norm": 4.994108200073242, "learning_rate": 1.6635411684466217e-05, "loss": 0.6157, "num_input_tokens_seen": 25080104, "step": 43490 }, { "epoch": 6.478254393803992, "grad_norm": 2.0278494358062744, "learning_rate": 1.6629287677932924e-05, "loss": 0.6354, "num_input_tokens_seen": 25083144, "step": 43495 }, { "epoch": 6.478999106344951, "grad_norm": 4.129323959350586, "learning_rate": 1.6623164237058347e-05, "loss": 0.7833, "num_input_tokens_seen": 25086024, "step": 43500 }, { "epoch": 6.47974381888591, "grad_norm": 1.4549574851989746, "learning_rate": 1.6617041362256265e-05, "loss": 0.4559, "num_input_tokens_seen": 25088680, "step": 43505 }, { "epoch": 6.480488531426869, "grad_norm": 1.9831129312515259, "learning_rate": 1.6610919053940446e-05, "loss": 0.4252, "num_input_tokens_seen": 25091688, "step": 43510 }, { "epoch": 6.481233243967829, "grad_norm": 1.617875576019287, "learning_rate": 1.6604797312524613e-05, "loss": 0.6657, "num_input_tokens_seen": 25094440, "step": 43515 }, { "epoch": 6.481977956508787, "grad_norm": 2.6730852127075195, "learning_rate": 1.659867613842244e-05, "loss": 0.5822, "num_input_tokens_seen": 25097288, "step": 43520 }, { "epoch": 6.482722669049747, "grad_norm": 1.9324009418487549, "learning_rate": 1.6592555532047592e-05, "loss": 0.6884, "num_input_tokens_seen": 25100264, "step": 43525 }, { "epoch": 6.483467381590706, "grad_norm": 2.1394400596618652, "learning_rate": 1.6586435493813645e-05, "loss": 0.6095, "num_input_tokens_seen": 25102984, "step": 43530 }, { "epoch": 6.484212094131665, "grad_norm": 2.1502623558044434, "learning_rate": 1.6580316024134186e-05, "loss": 0.658, "num_input_tokens_seen": 25105768, "step": 43535 }, { "epoch": 6.484956806672624, "grad_norm": 6.929319858551025, "learning_rate": 1.657419712342273e-05, "loss": 0.5903, "num_input_tokens_seen": 25108488, "step": 43540 }, { "epoch": 6.485701519213584, "grad_norm": 2.384387493133545, "learning_rate": 1.656807879209278e-05, "loss": 0.7157, "num_input_tokens_seen": 25111336, "step": 43545 }, { "epoch": 6.486446231754543, "grad_norm": 4.146131992340088, "learning_rate": 1.656196103055779e-05, "loss": 0.6233, "num_input_tokens_seen": 25114248, "step": 43550 }, { "epoch": 6.487190944295502, "grad_norm": 3.4442296028137207, "learning_rate": 1.6555843839231156e-05, "loss": 0.6622, "num_input_tokens_seen": 25117000, "step": 43555 }, { "epoch": 6.487935656836461, "grad_norm": 1.8213714361190796, "learning_rate": 1.654972721852627e-05, "loss": 0.485, "num_input_tokens_seen": 25119848, "step": 43560 }, { "epoch": 6.488680369377421, "grad_norm": 2.6139414310455322, "learning_rate": 1.6543611168856464e-05, "loss": 0.7713, "num_input_tokens_seen": 25122600, "step": 43565 }, { "epoch": 6.489425081918379, "grad_norm": 1.6011888980865479, "learning_rate": 1.6537495690635034e-05, "loss": 0.7745, "num_input_tokens_seen": 25125608, "step": 43570 }, { "epoch": 6.490169794459339, "grad_norm": 2.4685873985290527, "learning_rate": 1.6531380784275237e-05, "loss": 0.622, "num_input_tokens_seen": 25128808, "step": 43575 }, { "epoch": 6.490914507000298, "grad_norm": 2.411083936691284, "learning_rate": 1.6525266450190296e-05, "loss": 0.549, "num_input_tokens_seen": 25131688, "step": 43580 }, { "epoch": 6.4916592195412575, "grad_norm": 2.762343645095825, "learning_rate": 1.6519152688793387e-05, "loss": 0.5881, "num_input_tokens_seen": 25134408, "step": 43585 }, { "epoch": 6.492403932082216, "grad_norm": 2.186525821685791, "learning_rate": 1.6513039500497663e-05, "loss": 0.6835, "num_input_tokens_seen": 25137608, "step": 43590 }, { "epoch": 6.493148644623176, "grad_norm": 3.0974862575531006, "learning_rate": 1.6506926885716224e-05, "loss": 0.5392, "num_input_tokens_seen": 25140520, "step": 43595 }, { "epoch": 6.493893357164135, "grad_norm": 1.84648859500885, "learning_rate": 1.6500814844862135e-05, "loss": 0.42, "num_input_tokens_seen": 25143208, "step": 43600 }, { "epoch": 6.494638069705093, "grad_norm": 3.0516574382781982, "learning_rate": 1.6494703378348433e-05, "loss": 0.5907, "num_input_tokens_seen": 25146088, "step": 43605 }, { "epoch": 6.495382782246053, "grad_norm": 2.5241503715515137, "learning_rate": 1.6488592486588087e-05, "loss": 0.5935, "num_input_tokens_seen": 25149192, "step": 43610 }, { "epoch": 6.496127494787013, "grad_norm": 2.0079896450042725, "learning_rate": 1.6482482169994055e-05, "loss": 0.5134, "num_input_tokens_seen": 25151784, "step": 43615 }, { "epoch": 6.496872207327971, "grad_norm": 2.317307710647583, "learning_rate": 1.6476372428979254e-05, "loss": 0.6148, "num_input_tokens_seen": 25154504, "step": 43620 }, { "epoch": 6.49761691986893, "grad_norm": 2.6283366680145264, "learning_rate": 1.6470263263956543e-05, "loss": 0.6553, "num_input_tokens_seen": 25157480, "step": 43625 }, { "epoch": 6.49836163240989, "grad_norm": 3.728632688522339, "learning_rate": 1.6464154675338767e-05, "loss": 0.505, "num_input_tokens_seen": 25160392, "step": 43630 }, { "epoch": 6.499106344950849, "grad_norm": 1.4824496507644653, "learning_rate": 1.6458046663538706e-05, "loss": 0.5854, "num_input_tokens_seen": 25163208, "step": 43635 }, { "epoch": 6.499851057491808, "grad_norm": 3.9453341960906982, "learning_rate": 1.6451939228969127e-05, "loss": 0.7743, "num_input_tokens_seen": 25166024, "step": 43640 }, { "epoch": 6.5, "eval_loss": 0.664467990398407, "eval_runtime": 74.3091, "eval_samples_per_second": 40.157, "eval_steps_per_second": 10.039, "num_input_tokens_seen": 25166536, "step": 43641 }, { "epoch": 6.500595770032767, "grad_norm": 5.167620658874512, "learning_rate": 1.644583237204275e-05, "loss": 0.649, "num_input_tokens_seen": 25168680, "step": 43645 }, { "epoch": 6.501340482573727, "grad_norm": 3.328230142593384, "learning_rate": 1.6439726093172237e-05, "loss": 0.6483, "num_input_tokens_seen": 25171816, "step": 43650 }, { "epoch": 6.502085195114685, "grad_norm": 2.0252840518951416, "learning_rate": 1.6433620392770227e-05, "loss": 0.6261, "num_input_tokens_seen": 25174728, "step": 43655 }, { "epoch": 6.502829907655645, "grad_norm": 2.458261013031006, "learning_rate": 1.642751527124932e-05, "loss": 0.6359, "num_input_tokens_seen": 25177544, "step": 43660 }, { "epoch": 6.503574620196604, "grad_norm": 2.6043660640716553, "learning_rate": 1.6421410729022087e-05, "loss": 0.6882, "num_input_tokens_seen": 25180744, "step": 43665 }, { "epoch": 6.5043193327375635, "grad_norm": 2.5224695205688477, "learning_rate": 1.641530676650103e-05, "loss": 0.7301, "num_input_tokens_seen": 25183912, "step": 43670 }, { "epoch": 6.505064045278522, "grad_norm": 2.2700040340423584, "learning_rate": 1.6409203384098637e-05, "loss": 0.7525, "num_input_tokens_seen": 25186728, "step": 43675 }, { "epoch": 6.505808757819482, "grad_norm": 0.7312896847724915, "learning_rate": 1.640310058222736e-05, "loss": 0.544, "num_input_tokens_seen": 25189544, "step": 43680 }, { "epoch": 6.506553470360441, "grad_norm": 1.2583494186401367, "learning_rate": 1.6396998361299597e-05, "loss": 0.6406, "num_input_tokens_seen": 25192616, "step": 43685 }, { "epoch": 6.5072981829014, "grad_norm": 4.150697708129883, "learning_rate": 1.63908967217277e-05, "loss": 0.5494, "num_input_tokens_seen": 25196008, "step": 43690 }, { "epoch": 6.508042895442359, "grad_norm": 1.81169593334198, "learning_rate": 1.6384795663924003e-05, "loss": 0.4633, "num_input_tokens_seen": 25198792, "step": 43695 }, { "epoch": 6.508787607983319, "grad_norm": 1.7145673036575317, "learning_rate": 1.6378695188300787e-05, "loss": 0.6226, "num_input_tokens_seen": 25201512, "step": 43700 }, { "epoch": 6.509532320524277, "grad_norm": 1.121673583984375, "learning_rate": 1.6372595295270294e-05, "loss": 0.6826, "num_input_tokens_seen": 25204520, "step": 43705 }, { "epoch": 6.510277033065237, "grad_norm": 2.5574724674224854, "learning_rate": 1.6366495985244736e-05, "loss": 0.5447, "num_input_tokens_seen": 25207304, "step": 43710 }, { "epoch": 6.511021745606196, "grad_norm": 1.2025126218795776, "learning_rate": 1.6360397258636284e-05, "loss": 0.6298, "num_input_tokens_seen": 25210152, "step": 43715 }, { "epoch": 6.5117664581471555, "grad_norm": 1.9397375583648682, "learning_rate": 1.6354299115857052e-05, "loss": 0.5163, "num_input_tokens_seen": 25213160, "step": 43720 }, { "epoch": 6.512511170688114, "grad_norm": 1.9152746200561523, "learning_rate": 1.6348201557319148e-05, "loss": 0.5189, "num_input_tokens_seen": 25216040, "step": 43725 }, { "epoch": 6.513255883229074, "grad_norm": 1.2502437829971313, "learning_rate": 1.6342104583434595e-05, "loss": 0.4687, "num_input_tokens_seen": 25218920, "step": 43730 }, { "epoch": 6.514000595770033, "grad_norm": 2.636523723602295, "learning_rate": 1.633600819461542e-05, "loss": 0.3995, "num_input_tokens_seen": 25221704, "step": 43735 }, { "epoch": 6.514745308310992, "grad_norm": 2.4021589756011963, "learning_rate": 1.632991239127358e-05, "loss": 0.5738, "num_input_tokens_seen": 25224552, "step": 43740 }, { "epoch": 6.515490020851951, "grad_norm": 2.4230754375457764, "learning_rate": 1.6323817173821014e-05, "loss": 0.6805, "num_input_tokens_seen": 25227528, "step": 43745 }, { "epoch": 6.516234733392911, "grad_norm": 1.6387125253677368, "learning_rate": 1.6317722542669606e-05, "loss": 0.5272, "num_input_tokens_seen": 25230952, "step": 43750 }, { "epoch": 6.5169794459338695, "grad_norm": 2.5751569271087646, "learning_rate": 1.6311628498231208e-05, "loss": 0.6666, "num_input_tokens_seen": 25233864, "step": 43755 }, { "epoch": 6.517724158474829, "grad_norm": 2.8828284740448, "learning_rate": 1.6305535040917638e-05, "loss": 0.7051, "num_input_tokens_seen": 25236776, "step": 43760 }, { "epoch": 6.518468871015788, "grad_norm": 1.8736275434494019, "learning_rate": 1.6299442171140656e-05, "loss": 0.5787, "num_input_tokens_seen": 25239784, "step": 43765 }, { "epoch": 6.519213583556747, "grad_norm": 1.7104216814041138, "learning_rate": 1.6293349889312007e-05, "loss": 0.4069, "num_input_tokens_seen": 25242760, "step": 43770 }, { "epoch": 6.519958296097706, "grad_norm": 2.182561159133911, "learning_rate": 1.6287258195843363e-05, "loss": 0.685, "num_input_tokens_seen": 25245768, "step": 43775 }, { "epoch": 6.520703008638666, "grad_norm": 2.124319076538086, "learning_rate": 1.6281167091146392e-05, "loss": 0.6351, "num_input_tokens_seen": 25248680, "step": 43780 }, { "epoch": 6.521447721179625, "grad_norm": 1.1393465995788574, "learning_rate": 1.62750765756327e-05, "loss": 0.2966, "num_input_tokens_seen": 25251304, "step": 43785 }, { "epoch": 6.522192433720583, "grad_norm": 1.891034483909607, "learning_rate": 1.6268986649713852e-05, "loss": 0.483, "num_input_tokens_seen": 25254088, "step": 43790 }, { "epoch": 6.522937146261543, "grad_norm": 1.7728217840194702, "learning_rate": 1.6262897313801402e-05, "loss": 0.5131, "num_input_tokens_seen": 25257576, "step": 43795 }, { "epoch": 6.523681858802503, "grad_norm": 2.1904473304748535, "learning_rate": 1.625680856830682e-05, "loss": 0.7062, "num_input_tokens_seen": 25260328, "step": 43800 }, { "epoch": 6.5244265713434615, "grad_norm": 1.7033463716506958, "learning_rate": 1.6250720413641565e-05, "loss": 0.5354, "num_input_tokens_seen": 25263496, "step": 43805 }, { "epoch": 6.52517128388442, "grad_norm": 3.310637950897217, "learning_rate": 1.6244632850217067e-05, "loss": 0.5942, "num_input_tokens_seen": 25266152, "step": 43810 }, { "epoch": 6.52591599642538, "grad_norm": 3.396559000015259, "learning_rate": 1.6238545878444676e-05, "loss": 0.608, "num_input_tokens_seen": 25269224, "step": 43815 }, { "epoch": 6.526660708966339, "grad_norm": 0.7849127650260925, "learning_rate": 1.623245949873573e-05, "loss": 0.399, "num_input_tokens_seen": 25271944, "step": 43820 }, { "epoch": 6.527405421507298, "grad_norm": 1.9409635066986084, "learning_rate": 1.6226373711501523e-05, "loss": 0.6913, "num_input_tokens_seen": 25274728, "step": 43825 }, { "epoch": 6.528150134048257, "grad_norm": 1.5327247381210327, "learning_rate": 1.6220288517153318e-05, "loss": 0.4517, "num_input_tokens_seen": 25277640, "step": 43830 }, { "epoch": 6.528894846589217, "grad_norm": 2.1157476902008057, "learning_rate": 1.621420391610231e-05, "loss": 0.4545, "num_input_tokens_seen": 25280520, "step": 43835 }, { "epoch": 6.5296395591301755, "grad_norm": 1.1370421648025513, "learning_rate": 1.6208119908759684e-05, "loss": 0.5765, "num_input_tokens_seen": 25283336, "step": 43840 }, { "epoch": 6.530384271671135, "grad_norm": 2.852587938308716, "learning_rate": 1.6202036495536575e-05, "loss": 0.7157, "num_input_tokens_seen": 25286376, "step": 43845 }, { "epoch": 6.531128984212094, "grad_norm": 7.345154285430908, "learning_rate": 1.6195953676844072e-05, "loss": 0.8193, "num_input_tokens_seen": 25289192, "step": 43850 }, { "epoch": 6.5318736967530535, "grad_norm": 2.284195899963379, "learning_rate": 1.6189871453093217e-05, "loss": 0.6123, "num_input_tokens_seen": 25292008, "step": 43855 }, { "epoch": 6.532618409294012, "grad_norm": 2.165430784225464, "learning_rate": 1.6183789824695027e-05, "loss": 0.5773, "num_input_tokens_seen": 25294728, "step": 43860 }, { "epoch": 6.533363121834972, "grad_norm": 2.3560359477996826, "learning_rate": 1.6177708792060486e-05, "loss": 0.6044, "num_input_tokens_seen": 25297608, "step": 43865 }, { "epoch": 6.534107834375931, "grad_norm": 3.8801257610321045, "learning_rate": 1.6171628355600507e-05, "loss": 0.5692, "num_input_tokens_seen": 25300456, "step": 43870 }, { "epoch": 6.53485254691689, "grad_norm": 2.988116979598999, "learning_rate": 1.6165548515725992e-05, "loss": 0.482, "num_input_tokens_seen": 25303144, "step": 43875 }, { "epoch": 6.535597259457849, "grad_norm": 2.8305811882019043, "learning_rate": 1.6159469272847793e-05, "loss": 0.704, "num_input_tokens_seen": 25305832, "step": 43880 }, { "epoch": 6.536341971998809, "grad_norm": 3.6153576374053955, "learning_rate": 1.6153390627376717e-05, "loss": 0.4795, "num_input_tokens_seen": 25308776, "step": 43885 }, { "epoch": 6.5370866845397675, "grad_norm": 1.6459264755249023, "learning_rate": 1.6147312579723542e-05, "loss": 0.709, "num_input_tokens_seen": 25311720, "step": 43890 }, { "epoch": 6.537831397080727, "grad_norm": 2.8021771907806396, "learning_rate": 1.6141235130298983e-05, "loss": 0.8398, "num_input_tokens_seen": 25314536, "step": 43895 }, { "epoch": 6.538576109621686, "grad_norm": 1.8938722610473633, "learning_rate": 1.6135158279513737e-05, "loss": 0.481, "num_input_tokens_seen": 25317448, "step": 43900 }, { "epoch": 6.5393208221626455, "grad_norm": 1.3788776397705078, "learning_rate": 1.612908202777845e-05, "loss": 0.6528, "num_input_tokens_seen": 25320616, "step": 43905 }, { "epoch": 6.540065534703604, "grad_norm": 4.58077335357666, "learning_rate": 1.6123006375503737e-05, "loss": 0.6153, "num_input_tokens_seen": 25323592, "step": 43910 }, { "epoch": 6.540810247244564, "grad_norm": 2.984849691390991, "learning_rate": 1.6116931323100158e-05, "loss": 0.6835, "num_input_tokens_seen": 25326632, "step": 43915 }, { "epoch": 6.541554959785523, "grad_norm": 2.36763596534729, "learning_rate": 1.6110856870978245e-05, "loss": 0.5457, "num_input_tokens_seen": 25329320, "step": 43920 }, { "epoch": 6.542299672326482, "grad_norm": 2.280301809310913, "learning_rate": 1.6104783019548486e-05, "loss": 0.5123, "num_input_tokens_seen": 25332232, "step": 43925 }, { "epoch": 6.543044384867441, "grad_norm": 2.4321160316467285, "learning_rate": 1.6098709769221333e-05, "loss": 0.7402, "num_input_tokens_seen": 25335112, "step": 43930 }, { "epoch": 6.5437890974084, "grad_norm": 1.7850744724273682, "learning_rate": 1.6092637120407174e-05, "loss": 0.5833, "num_input_tokens_seen": 25338024, "step": 43935 }, { "epoch": 6.5445338099493595, "grad_norm": 5.992672920227051, "learning_rate": 1.6086565073516385e-05, "loss": 0.6497, "num_input_tokens_seen": 25340776, "step": 43940 }, { "epoch": 6.545278522490319, "grad_norm": 3.4916539192199707, "learning_rate": 1.608049362895929e-05, "loss": 0.8639, "num_input_tokens_seen": 25343784, "step": 43945 }, { "epoch": 6.546023235031278, "grad_norm": 2.5486578941345215, "learning_rate": 1.607442278714617e-05, "loss": 0.4462, "num_input_tokens_seen": 25346536, "step": 43950 }, { "epoch": 6.546767947572237, "grad_norm": 4.188682556152344, "learning_rate": 1.6068352548487263e-05, "loss": 0.8583, "num_input_tokens_seen": 25349352, "step": 43955 }, { "epoch": 6.547512660113196, "grad_norm": 3.179830312728882, "learning_rate": 1.606228291339279e-05, "loss": 0.9575, "num_input_tokens_seen": 25352488, "step": 43960 }, { "epoch": 6.548257372654156, "grad_norm": 2.015592336654663, "learning_rate": 1.6056213882272892e-05, "loss": 0.5908, "num_input_tokens_seen": 25355496, "step": 43965 }, { "epoch": 6.549002085195115, "grad_norm": 2.1790428161621094, "learning_rate": 1.6050145455537708e-05, "loss": 0.5092, "num_input_tokens_seen": 25358408, "step": 43970 }, { "epoch": 6.5497467977360735, "grad_norm": 1.5087097883224487, "learning_rate": 1.6044077633597292e-05, "loss": 0.5059, "num_input_tokens_seen": 25361064, "step": 43975 }, { "epoch": 6.550491510277033, "grad_norm": 2.874117612838745, "learning_rate": 1.603801041686171e-05, "loss": 0.879, "num_input_tokens_seen": 25363944, "step": 43980 }, { "epoch": 6.551236222817992, "grad_norm": 5.9165544509887695, "learning_rate": 1.6031943805740934e-05, "loss": 0.7013, "num_input_tokens_seen": 25366728, "step": 43985 }, { "epoch": 6.5519809353589515, "grad_norm": 3.0583624839782715, "learning_rate": 1.602587780064494e-05, "loss": 0.4912, "num_input_tokens_seen": 25369576, "step": 43990 }, { "epoch": 6.55272564789991, "grad_norm": 1.6774139404296875, "learning_rate": 1.601981240198364e-05, "loss": 0.7285, "num_input_tokens_seen": 25372200, "step": 43995 }, { "epoch": 6.55347036044087, "grad_norm": 4.059718608856201, "learning_rate": 1.6013747610166903e-05, "loss": 0.7559, "num_input_tokens_seen": 25374888, "step": 44000 }, { "epoch": 6.554215072981829, "grad_norm": 2.736420154571533, "learning_rate": 1.600768342560457e-05, "loss": 0.6225, "num_input_tokens_seen": 25377960, "step": 44005 }, { "epoch": 6.554959785522788, "grad_norm": 2.634453058242798, "learning_rate": 1.6001619848706435e-05, "loss": 0.7181, "num_input_tokens_seen": 25380744, "step": 44010 }, { "epoch": 6.555704498063747, "grad_norm": 2.936830997467041, "learning_rate": 1.5995556879882246e-05, "loss": 0.4636, "num_input_tokens_seen": 25383528, "step": 44015 }, { "epoch": 6.556449210604707, "grad_norm": 1.2284287214279175, "learning_rate": 1.5989494519541706e-05, "loss": 0.7056, "num_input_tokens_seen": 25386408, "step": 44020 }, { "epoch": 6.5571939231456655, "grad_norm": 1.6065003871917725, "learning_rate": 1.5983432768094495e-05, "loss": 0.5995, "num_input_tokens_seen": 25389128, "step": 44025 }, { "epoch": 6.557938635686625, "grad_norm": 1.2752362489700317, "learning_rate": 1.597737162595024e-05, "loss": 0.5363, "num_input_tokens_seen": 25392424, "step": 44030 }, { "epoch": 6.558683348227584, "grad_norm": 3.326345682144165, "learning_rate": 1.5971311093518527e-05, "loss": 0.7082, "num_input_tokens_seen": 25395240, "step": 44035 }, { "epoch": 6.559428060768544, "grad_norm": 2.543959379196167, "learning_rate": 1.5965251171208896e-05, "loss": 0.5701, "num_input_tokens_seen": 25397992, "step": 44040 }, { "epoch": 6.560172773309502, "grad_norm": 3.4404726028442383, "learning_rate": 1.5959191859430867e-05, "loss": 0.7114, "num_input_tokens_seen": 25400872, "step": 44045 }, { "epoch": 6.560917485850462, "grad_norm": 1.4703320264816284, "learning_rate": 1.5953133158593904e-05, "loss": 0.5057, "num_input_tokens_seen": 25403880, "step": 44050 }, { "epoch": 6.561662198391421, "grad_norm": 2.512773036956787, "learning_rate": 1.5947075069107402e-05, "loss": 0.6862, "num_input_tokens_seen": 25406856, "step": 44055 }, { "epoch": 6.56240691093238, "grad_norm": 1.324317455291748, "learning_rate": 1.5941017591380764e-05, "loss": 0.5915, "num_input_tokens_seen": 25409608, "step": 44060 }, { "epoch": 6.563151623473339, "grad_norm": 0.6268302798271179, "learning_rate": 1.5934960725823335e-05, "loss": 0.6332, "num_input_tokens_seen": 25412584, "step": 44065 }, { "epoch": 6.563896336014299, "grad_norm": 1.7097500562667847, "learning_rate": 1.5928904472844393e-05, "loss": 0.611, "num_input_tokens_seen": 25415432, "step": 44070 }, { "epoch": 6.5646410485552575, "grad_norm": 2.852952003479004, "learning_rate": 1.5922848832853217e-05, "loss": 0.5625, "num_input_tokens_seen": 25418216, "step": 44075 }, { "epoch": 6.565385761096217, "grad_norm": 2.7997841835021973, "learning_rate": 1.5916793806259e-05, "loss": 0.53, "num_input_tokens_seen": 25421064, "step": 44080 }, { "epoch": 6.566130473637176, "grad_norm": 2.7344284057617188, "learning_rate": 1.5910739393470934e-05, "loss": 0.7089, "num_input_tokens_seen": 25423784, "step": 44085 }, { "epoch": 6.566875186178136, "grad_norm": 1.7776784896850586, "learning_rate": 1.5904685594898154e-05, "loss": 0.7028, "num_input_tokens_seen": 25426792, "step": 44090 }, { "epoch": 6.567619898719094, "grad_norm": 1.7511842250823975, "learning_rate": 1.589863241094974e-05, "loss": 0.6875, "num_input_tokens_seen": 25429864, "step": 44095 }, { "epoch": 6.568364611260054, "grad_norm": 1.6744697093963623, "learning_rate": 1.589257984203473e-05, "loss": 0.7349, "num_input_tokens_seen": 25432552, "step": 44100 }, { "epoch": 6.569109323801013, "grad_norm": 2.9117276668548584, "learning_rate": 1.588652788856215e-05, "loss": 0.4492, "num_input_tokens_seen": 25435272, "step": 44105 }, { "epoch": 6.569854036341972, "grad_norm": 1.6838277578353882, "learning_rate": 1.5880476550940975e-05, "loss": 0.5535, "num_input_tokens_seen": 25438056, "step": 44110 }, { "epoch": 6.570598748882931, "grad_norm": 3.2661683559417725, "learning_rate": 1.5874425829580108e-05, "loss": 0.7107, "num_input_tokens_seen": 25441064, "step": 44115 }, { "epoch": 6.57134346142389, "grad_norm": 1.5190293788909912, "learning_rate": 1.586837572488844e-05, "loss": 0.6491, "num_input_tokens_seen": 25443912, "step": 44120 }, { "epoch": 6.57208817396485, "grad_norm": 2.1015982627868652, "learning_rate": 1.586232623727482e-05, "loss": 0.5307, "num_input_tokens_seen": 25446824, "step": 44125 }, { "epoch": 6.572832886505809, "grad_norm": 1.8932652473449707, "learning_rate": 1.5856277367148047e-05, "loss": 0.6339, "num_input_tokens_seen": 25450056, "step": 44130 }, { "epoch": 6.573577599046768, "grad_norm": 1.0949203968048096, "learning_rate": 1.5850229114916864e-05, "loss": 0.4987, "num_input_tokens_seen": 25452808, "step": 44135 }, { "epoch": 6.574322311587727, "grad_norm": 7.159057140350342, "learning_rate": 1.5844181480989995e-05, "loss": 0.6637, "num_input_tokens_seen": 25455560, "step": 44140 }, { "epoch": 6.575067024128686, "grad_norm": 3.01906156539917, "learning_rate": 1.5838134465776126e-05, "loss": 0.4832, "num_input_tokens_seen": 25458376, "step": 44145 }, { "epoch": 6.575811736669645, "grad_norm": 6.547794342041016, "learning_rate": 1.583208806968387e-05, "loss": 0.6143, "num_input_tokens_seen": 25461224, "step": 44150 }, { "epoch": 6.576556449210605, "grad_norm": 3.066030502319336, "learning_rate": 1.5826042293121835e-05, "loss": 0.6383, "num_input_tokens_seen": 25464456, "step": 44155 }, { "epoch": 6.5773011617515635, "grad_norm": 2.760132074356079, "learning_rate": 1.581999713649856e-05, "loss": 0.5247, "num_input_tokens_seen": 25467400, "step": 44160 }, { "epoch": 6.578045874292523, "grad_norm": 1.7033463716506958, "learning_rate": 1.5813952600222556e-05, "loss": 0.5949, "num_input_tokens_seen": 25470152, "step": 44165 }, { "epoch": 6.578790586833482, "grad_norm": 2.7678422927856445, "learning_rate": 1.58079086847023e-05, "loss": 0.5461, "num_input_tokens_seen": 25473384, "step": 44170 }, { "epoch": 6.579535299374442, "grad_norm": 3.6145811080932617, "learning_rate": 1.580186539034619e-05, "loss": 0.6678, "num_input_tokens_seen": 25476648, "step": 44175 }, { "epoch": 6.5802800119154, "grad_norm": 1.950051188468933, "learning_rate": 1.579582271756262e-05, "loss": 0.7048, "num_input_tokens_seen": 25479624, "step": 44180 }, { "epoch": 6.58102472445636, "grad_norm": 2.6296355724334717, "learning_rate": 1.578978066675993e-05, "loss": 0.6317, "num_input_tokens_seen": 25482440, "step": 44185 }, { "epoch": 6.581769436997319, "grad_norm": 3.6908559799194336, "learning_rate": 1.578373923834641e-05, "loss": 0.5604, "num_input_tokens_seen": 25485064, "step": 44190 }, { "epoch": 6.582514149538278, "grad_norm": 2.0908191204071045, "learning_rate": 1.5777698432730333e-05, "loss": 0.5168, "num_input_tokens_seen": 25487912, "step": 44195 }, { "epoch": 6.583258862079237, "grad_norm": 1.7185643911361694, "learning_rate": 1.5771658250319895e-05, "loss": 0.5403, "num_input_tokens_seen": 25491048, "step": 44200 }, { "epoch": 6.584003574620197, "grad_norm": 1.8182064294815063, "learning_rate": 1.576561869152327e-05, "loss": 0.7629, "num_input_tokens_seen": 25494056, "step": 44205 }, { "epoch": 6.584748287161156, "grad_norm": 2.393704414367676, "learning_rate": 1.5759579756748603e-05, "loss": 0.7293, "num_input_tokens_seen": 25497128, "step": 44210 }, { "epoch": 6.585492999702115, "grad_norm": 1.8931448459625244, "learning_rate": 1.5753541446403964e-05, "loss": 0.4953, "num_input_tokens_seen": 25500360, "step": 44215 }, { "epoch": 6.586237712243074, "grad_norm": 2.4863884449005127, "learning_rate": 1.574750376089739e-05, "loss": 0.6095, "num_input_tokens_seen": 25503528, "step": 44220 }, { "epoch": 6.586982424784034, "grad_norm": 1.3871440887451172, "learning_rate": 1.5741466700636898e-05, "loss": 0.535, "num_input_tokens_seen": 25506440, "step": 44225 }, { "epoch": 6.587727137324992, "grad_norm": 3.44280743598938, "learning_rate": 1.5735430266030447e-05, "loss": 0.5428, "num_input_tokens_seen": 25509160, "step": 44230 }, { "epoch": 6.588471849865952, "grad_norm": 2.193214178085327, "learning_rate": 1.5729394457485946e-05, "loss": 0.6533, "num_input_tokens_seen": 25512040, "step": 44235 }, { "epoch": 6.589216562406911, "grad_norm": 2.0479867458343506, "learning_rate": 1.5723359275411283e-05, "loss": 0.5166, "num_input_tokens_seen": 25514664, "step": 44240 }, { "epoch": 6.58996127494787, "grad_norm": 2.1904966831207275, "learning_rate": 1.571732472021428e-05, "loss": 0.6136, "num_input_tokens_seen": 25517448, "step": 44245 }, { "epoch": 6.590705987488829, "grad_norm": 1.3272011280059814, "learning_rate": 1.571129079230274e-05, "loss": 0.6188, "num_input_tokens_seen": 25520168, "step": 44250 }, { "epoch": 6.591450700029789, "grad_norm": 2.5740342140197754, "learning_rate": 1.570525749208439e-05, "loss": 0.5805, "num_input_tokens_seen": 25522792, "step": 44255 }, { "epoch": 6.592195412570748, "grad_norm": 4.713400840759277, "learning_rate": 1.5699224819966957e-05, "loss": 0.6394, "num_input_tokens_seen": 25525640, "step": 44260 }, { "epoch": 6.592940125111707, "grad_norm": 1.886682152748108, "learning_rate": 1.5693192776358092e-05, "loss": 0.5418, "num_input_tokens_seen": 25528648, "step": 44265 }, { "epoch": 6.593684837652666, "grad_norm": 1.3372645378112793, "learning_rate": 1.568716136166542e-05, "loss": 0.5255, "num_input_tokens_seen": 25531624, "step": 44270 }, { "epoch": 6.594429550193626, "grad_norm": 2.991621494293213, "learning_rate": 1.5681130576296528e-05, "loss": 0.4335, "num_input_tokens_seen": 25534184, "step": 44275 }, { "epoch": 6.595174262734584, "grad_norm": 2.7358601093292236, "learning_rate": 1.5675100420658935e-05, "loss": 0.5285, "num_input_tokens_seen": 25537096, "step": 44280 }, { "epoch": 6.595918975275543, "grad_norm": 2.4071879386901855, "learning_rate": 1.5669070895160143e-05, "loss": 0.7082, "num_input_tokens_seen": 25539912, "step": 44285 }, { "epoch": 6.596663687816503, "grad_norm": 2.5344953536987305, "learning_rate": 1.566304200020761e-05, "loss": 0.4928, "num_input_tokens_seen": 25542952, "step": 44290 }, { "epoch": 6.5974084003574625, "grad_norm": 4.673941135406494, "learning_rate": 1.565701373620874e-05, "loss": 0.7712, "num_input_tokens_seen": 25545864, "step": 44295 }, { "epoch": 6.598153112898421, "grad_norm": 4.392632007598877, "learning_rate": 1.5650986103570887e-05, "loss": 0.6012, "num_input_tokens_seen": 25548744, "step": 44300 }, { "epoch": 6.59889782543938, "grad_norm": 1.326987624168396, "learning_rate": 1.5644959102701384e-05, "loss": 0.5142, "num_input_tokens_seen": 25552296, "step": 44305 }, { "epoch": 6.59964253798034, "grad_norm": 1.9976662397384644, "learning_rate": 1.5638932734007515e-05, "loss": 0.6244, "num_input_tokens_seen": 25554952, "step": 44310 }, { "epoch": 6.600387250521299, "grad_norm": 2.54178524017334, "learning_rate": 1.563290699789651e-05, "loss": 0.5498, "num_input_tokens_seen": 25557704, "step": 44315 }, { "epoch": 6.601131963062258, "grad_norm": 3.14037823677063, "learning_rate": 1.562688189477556e-05, "loss": 0.6141, "num_input_tokens_seen": 25560648, "step": 44320 }, { "epoch": 6.601876675603217, "grad_norm": 5.132908821105957, "learning_rate": 1.562085742505183e-05, "loss": 0.5628, "num_input_tokens_seen": 25563848, "step": 44325 }, { "epoch": 6.602621388144176, "grad_norm": 1.3692004680633545, "learning_rate": 1.5614833589132427e-05, "loss": 0.4156, "num_input_tokens_seen": 25566920, "step": 44330 }, { "epoch": 6.603366100685135, "grad_norm": 3.050926685333252, "learning_rate": 1.5608810387424406e-05, "loss": 0.4142, "num_input_tokens_seen": 25569576, "step": 44335 }, { "epoch": 6.604110813226095, "grad_norm": 2.352426052093506, "learning_rate": 1.5602787820334798e-05, "loss": 0.4665, "num_input_tokens_seen": 25572584, "step": 44340 }, { "epoch": 6.604855525767054, "grad_norm": 1.2347594499588013, "learning_rate": 1.559676588827058e-05, "loss": 0.6174, "num_input_tokens_seen": 25575464, "step": 44345 }, { "epoch": 6.605600238308013, "grad_norm": 2.3601675033569336, "learning_rate": 1.5590744591638693e-05, "loss": 0.5569, "num_input_tokens_seen": 25578248, "step": 44350 }, { "epoch": 6.606344950848972, "grad_norm": 1.5503109693527222, "learning_rate": 1.5584723930846034e-05, "loss": 0.5029, "num_input_tokens_seen": 25580904, "step": 44355 }, { "epoch": 6.607089663389932, "grad_norm": 1.7811055183410645, "learning_rate": 1.557870390629945e-05, "loss": 0.5505, "num_input_tokens_seen": 25584040, "step": 44360 }, { "epoch": 6.60783437593089, "grad_norm": 1.9795149564743042, "learning_rate": 1.5572684518405757e-05, "loss": 0.4755, "num_input_tokens_seen": 25586856, "step": 44365 }, { "epoch": 6.60857908847185, "grad_norm": 3.2729387283325195, "learning_rate": 1.5566665767571708e-05, "loss": 0.668, "num_input_tokens_seen": 25590088, "step": 44370 }, { "epoch": 6.609323801012809, "grad_norm": 2.282682418823242, "learning_rate": 1.5560647654204043e-05, "loss": 0.6498, "num_input_tokens_seen": 25592872, "step": 44375 }, { "epoch": 6.6100685135537685, "grad_norm": 2.6403017044067383, "learning_rate": 1.5554630178709427e-05, "loss": 0.7355, "num_input_tokens_seen": 25595560, "step": 44380 }, { "epoch": 6.610813226094727, "grad_norm": 5.081326961517334, "learning_rate": 1.55486133414945e-05, "loss": 0.6936, "num_input_tokens_seen": 25598568, "step": 44385 }, { "epoch": 6.611557938635687, "grad_norm": 4.055426120758057, "learning_rate": 1.5542597142965857e-05, "loss": 0.8138, "num_input_tokens_seen": 25601512, "step": 44390 }, { "epoch": 6.612302651176646, "grad_norm": 2.431055784225464, "learning_rate": 1.5536581583530048e-05, "loss": 0.6448, "num_input_tokens_seen": 25604520, "step": 44395 }, { "epoch": 6.613047363717605, "grad_norm": 2.297987461090088, "learning_rate": 1.5530566663593584e-05, "loss": 0.8333, "num_input_tokens_seen": 25607400, "step": 44400 }, { "epoch": 6.613792076258564, "grad_norm": 1.8765349388122559, "learning_rate": 1.552455238356292e-05, "loss": 0.8306, "num_input_tokens_seen": 25610344, "step": 44405 }, { "epoch": 6.614536788799524, "grad_norm": 2.1613636016845703, "learning_rate": 1.551853874384448e-05, "loss": 0.6609, "num_input_tokens_seen": 25613288, "step": 44410 }, { "epoch": 6.615281501340482, "grad_norm": 2.7354440689086914, "learning_rate": 1.5512525744844656e-05, "loss": 0.7555, "num_input_tokens_seen": 25615912, "step": 44415 }, { "epoch": 6.616026213881442, "grad_norm": 1.1846591234207153, "learning_rate": 1.5506513386969757e-05, "loss": 0.5873, "num_input_tokens_seen": 25618568, "step": 44420 }, { "epoch": 6.616770926422401, "grad_norm": 2.1232738494873047, "learning_rate": 1.550050167062609e-05, "loss": 0.6303, "num_input_tokens_seen": 25621320, "step": 44425 }, { "epoch": 6.6175156389633605, "grad_norm": 2.0198771953582764, "learning_rate": 1.549449059621989e-05, "loss": 0.5748, "num_input_tokens_seen": 25624232, "step": 44430 }, { "epoch": 6.618260351504319, "grad_norm": 1.4666821956634521, "learning_rate": 1.5488480164157375e-05, "loss": 0.5573, "num_input_tokens_seen": 25626984, "step": 44435 }, { "epoch": 6.619005064045279, "grad_norm": 1.0735276937484741, "learning_rate": 1.5482470374844698e-05, "loss": 0.6766, "num_input_tokens_seen": 25629672, "step": 44440 }, { "epoch": 6.619749776586238, "grad_norm": 2.4082353115081787, "learning_rate": 1.5476461228687976e-05, "loss": 0.579, "num_input_tokens_seen": 25632456, "step": 44445 }, { "epoch": 6.620494489127196, "grad_norm": 1.8113490343093872, "learning_rate": 1.5470452726093287e-05, "loss": 0.4291, "num_input_tokens_seen": 25635688, "step": 44450 }, { "epoch": 6.621239201668156, "grad_norm": 2.0587215423583984, "learning_rate": 1.5464444867466666e-05, "loss": 0.6871, "num_input_tokens_seen": 25638472, "step": 44455 }, { "epoch": 6.621983914209116, "grad_norm": 1.8891091346740723, "learning_rate": 1.5458437653214088e-05, "loss": 0.5508, "num_input_tokens_seen": 25641288, "step": 44460 }, { "epoch": 6.6227286267500745, "grad_norm": 2.7497951984405518, "learning_rate": 1.545243108374149e-05, "loss": 0.6199, "num_input_tokens_seen": 25644392, "step": 44465 }, { "epoch": 6.623473339291033, "grad_norm": 2.953261375427246, "learning_rate": 1.544642515945479e-05, "loss": 0.7135, "num_input_tokens_seen": 25647240, "step": 44470 }, { "epoch": 6.624218051831993, "grad_norm": 1.9824622869491577, "learning_rate": 1.5440419880759838e-05, "loss": 0.6332, "num_input_tokens_seen": 25649928, "step": 44475 }, { "epoch": 6.6249627643729525, "grad_norm": 2.1826493740081787, "learning_rate": 1.5434415248062435e-05, "loss": 0.5243, "num_input_tokens_seen": 25652808, "step": 44480 }, { "epoch": 6.625707476913911, "grad_norm": 2.082045316696167, "learning_rate": 1.542841126176836e-05, "loss": 0.5411, "num_input_tokens_seen": 25655688, "step": 44485 }, { "epoch": 6.62645218945487, "grad_norm": 2.879927635192871, "learning_rate": 1.5422407922283343e-05, "loss": 0.6819, "num_input_tokens_seen": 25658472, "step": 44490 }, { "epoch": 6.62719690199583, "grad_norm": 3.1442646980285645, "learning_rate": 1.5416405230013065e-05, "loss": 0.5016, "num_input_tokens_seen": 25661224, "step": 44495 }, { "epoch": 6.627941614536788, "grad_norm": 2.4452273845672607, "learning_rate": 1.5410403185363147e-05, "loss": 0.7057, "num_input_tokens_seen": 25664424, "step": 44500 }, { "epoch": 6.628686327077748, "grad_norm": 4.853928565979004, "learning_rate": 1.540440178873919e-05, "loss": 0.671, "num_input_tokens_seen": 25667464, "step": 44505 }, { "epoch": 6.629431039618707, "grad_norm": 3.4140419960021973, "learning_rate": 1.539840104054676e-05, "loss": 0.6115, "num_input_tokens_seen": 25670504, "step": 44510 }, { "epoch": 6.6301757521596665, "grad_norm": 1.525223970413208, "learning_rate": 1.5392400941191337e-05, "loss": 0.6816, "num_input_tokens_seen": 25673480, "step": 44515 }, { "epoch": 6.630920464700625, "grad_norm": 2.9126172065734863, "learning_rate": 1.53864014910784e-05, "loss": 0.6199, "num_input_tokens_seen": 25676296, "step": 44520 }, { "epoch": 6.631665177241585, "grad_norm": 1.0972570180892944, "learning_rate": 1.538040269061337e-05, "loss": 0.6634, "num_input_tokens_seen": 25679176, "step": 44525 }, { "epoch": 6.632409889782544, "grad_norm": 2.7372496128082275, "learning_rate": 1.5374404540201612e-05, "loss": 0.5899, "num_input_tokens_seen": 25682024, "step": 44530 }, { "epoch": 6.633154602323503, "grad_norm": 1.3893221616744995, "learning_rate": 1.5368407040248467e-05, "loss": 0.5699, "num_input_tokens_seen": 25684904, "step": 44535 }, { "epoch": 6.633899314864462, "grad_norm": 2.1213362216949463, "learning_rate": 1.536241019115921e-05, "loss": 0.5891, "num_input_tokens_seen": 25687784, "step": 44540 }, { "epoch": 6.634644027405422, "grad_norm": 4.033229351043701, "learning_rate": 1.5356413993339088e-05, "loss": 0.5142, "num_input_tokens_seen": 25691016, "step": 44545 }, { "epoch": 6.6353887399463805, "grad_norm": 4.1684722900390625, "learning_rate": 1.5350418447193298e-05, "loss": 0.5673, "num_input_tokens_seen": 25693896, "step": 44550 }, { "epoch": 6.63613345248734, "grad_norm": 2.1437652111053467, "learning_rate": 1.5344423553126997e-05, "loss": 0.6391, "num_input_tokens_seen": 25696776, "step": 44555 }, { "epoch": 6.636878165028299, "grad_norm": 2.9996163845062256, "learning_rate": 1.53384293115453e-05, "loss": 0.6889, "num_input_tokens_seen": 25699912, "step": 44560 }, { "epoch": 6.6376228775692585, "grad_norm": 1.235384464263916, "learning_rate": 1.5332435722853263e-05, "loss": 0.6271, "num_input_tokens_seen": 25702632, "step": 44565 }, { "epoch": 6.638367590110217, "grad_norm": 2.685098886489868, "learning_rate": 1.532644278745592e-05, "loss": 0.6099, "num_input_tokens_seen": 25705352, "step": 44570 }, { "epoch": 6.639112302651177, "grad_norm": 1.855380892753601, "learning_rate": 1.5320450505758247e-05, "loss": 0.6796, "num_input_tokens_seen": 25708264, "step": 44575 }, { "epoch": 6.639857015192136, "grad_norm": 1.7319926023483276, "learning_rate": 1.531445887816517e-05, "loss": 0.7682, "num_input_tokens_seen": 25710984, "step": 44580 }, { "epoch": 6.640601727733095, "grad_norm": 2.267021656036377, "learning_rate": 1.530846790508158e-05, "loss": 0.5001, "num_input_tokens_seen": 25713864, "step": 44585 }, { "epoch": 6.641346440274054, "grad_norm": 1.665692925453186, "learning_rate": 1.5302477586912333e-05, "loss": 0.604, "num_input_tokens_seen": 25716744, "step": 44590 }, { "epoch": 6.642091152815014, "grad_norm": 1.9916141033172607, "learning_rate": 1.5296487924062218e-05, "loss": 0.7131, "num_input_tokens_seen": 25719976, "step": 44595 }, { "epoch": 6.6428358653559725, "grad_norm": 1.1239269971847534, "learning_rate": 1.5290498916935995e-05, "loss": 0.5812, "num_input_tokens_seen": 25723016, "step": 44600 }, { "epoch": 6.643580577896932, "grad_norm": 2.7301034927368164, "learning_rate": 1.5284510565938385e-05, "loss": 0.5882, "num_input_tokens_seen": 25725672, "step": 44605 }, { "epoch": 6.644325290437891, "grad_norm": 1.640566110610962, "learning_rate": 1.5278522871474045e-05, "loss": 0.5632, "num_input_tokens_seen": 25728584, "step": 44610 }, { "epoch": 6.6450700029788505, "grad_norm": 1.5448901653289795, "learning_rate": 1.527253583394762e-05, "loss": 0.5649, "num_input_tokens_seen": 25731688, "step": 44615 }, { "epoch": 6.645814715519809, "grad_norm": 0.9876735806465149, "learning_rate": 1.5266549453763655e-05, "loss": 0.482, "num_input_tokens_seen": 25734504, "step": 44620 }, { "epoch": 6.646559428060769, "grad_norm": 1.6276743412017822, "learning_rate": 1.5260563731326715e-05, "loss": 0.5285, "num_input_tokens_seen": 25737288, "step": 44625 }, { "epoch": 6.647304140601728, "grad_norm": 1.6870614290237427, "learning_rate": 1.5254578667041278e-05, "loss": 0.5673, "num_input_tokens_seen": 25740008, "step": 44630 }, { "epoch": 6.6480488531426865, "grad_norm": 1.5165588855743408, "learning_rate": 1.5248594261311789e-05, "loss": 0.4631, "num_input_tokens_seen": 25742792, "step": 44635 }, { "epoch": 6.648793565683646, "grad_norm": 1.596744418144226, "learning_rate": 1.524261051454266e-05, "loss": 0.4534, "num_input_tokens_seen": 25745416, "step": 44640 }, { "epoch": 6.649538278224606, "grad_norm": 1.9675414562225342, "learning_rate": 1.5236627427138237e-05, "loss": 0.5834, "num_input_tokens_seen": 25748552, "step": 44645 }, { "epoch": 6.6502829907655645, "grad_norm": 2.6763908863067627, "learning_rate": 1.5230644999502835e-05, "loss": 0.7129, "num_input_tokens_seen": 25751400, "step": 44650 }, { "epoch": 6.651027703306523, "grad_norm": 3.5061206817626953, "learning_rate": 1.5224663232040736e-05, "loss": 0.6122, "num_input_tokens_seen": 25754568, "step": 44655 }, { "epoch": 6.651772415847483, "grad_norm": 1.6921041011810303, "learning_rate": 1.5218682125156148e-05, "loss": 0.483, "num_input_tokens_seen": 25757448, "step": 44660 }, { "epoch": 6.652517128388443, "grad_norm": 2.899735450744629, "learning_rate": 1.521270167925325e-05, "loss": 0.7994, "num_input_tokens_seen": 25760296, "step": 44665 }, { "epoch": 6.653261840929401, "grad_norm": 1.7476881742477417, "learning_rate": 1.5206721894736178e-05, "loss": 0.5983, "num_input_tokens_seen": 25763176, "step": 44670 }, { "epoch": 6.65400655347036, "grad_norm": 1.8597137928009033, "learning_rate": 1.520074277200903e-05, "loss": 0.4766, "num_input_tokens_seen": 25766216, "step": 44675 }, { "epoch": 6.65475126601132, "grad_norm": 2.8794846534729004, "learning_rate": 1.519476431147584e-05, "loss": 0.516, "num_input_tokens_seen": 25769192, "step": 44680 }, { "epoch": 6.6554959785522785, "grad_norm": 1.6770248413085938, "learning_rate": 1.518878651354061e-05, "loss": 0.4787, "num_input_tokens_seen": 25772200, "step": 44685 }, { "epoch": 6.656240691093238, "grad_norm": 2.237520217895508, "learning_rate": 1.5182809378607304e-05, "loss": 0.6918, "num_input_tokens_seen": 25774888, "step": 44690 }, { "epoch": 6.656985403634197, "grad_norm": 1.8411705493927002, "learning_rate": 1.5176832907079836e-05, "loss": 0.5968, "num_input_tokens_seen": 25777864, "step": 44695 }, { "epoch": 6.6577301161751565, "grad_norm": 2.401334762573242, "learning_rate": 1.5170857099362045e-05, "loss": 0.5345, "num_input_tokens_seen": 25780520, "step": 44700 }, { "epoch": 6.658474828716115, "grad_norm": 6.659836292266846, "learning_rate": 1.5164881955857774e-05, "loss": 0.5911, "num_input_tokens_seen": 25783368, "step": 44705 }, { "epoch": 6.659219541257075, "grad_norm": 1.7045170068740845, "learning_rate": 1.5158907476970796e-05, "loss": 0.5358, "num_input_tokens_seen": 25786664, "step": 44710 }, { "epoch": 6.659964253798034, "grad_norm": 2.2818121910095215, "learning_rate": 1.5152933663104834e-05, "loss": 0.5782, "num_input_tokens_seen": 25789672, "step": 44715 }, { "epoch": 6.660708966338993, "grad_norm": 1.9456270933151245, "learning_rate": 1.5146960514663583e-05, "loss": 0.5609, "num_input_tokens_seen": 25792456, "step": 44720 }, { "epoch": 6.661453678879952, "grad_norm": 1.2659355401992798, "learning_rate": 1.5140988032050685e-05, "loss": 0.5285, "num_input_tokens_seen": 25795176, "step": 44725 }, { "epoch": 6.662198391420912, "grad_norm": 2.391806125640869, "learning_rate": 1.5135016215669724e-05, "loss": 0.8575, "num_input_tokens_seen": 25798152, "step": 44730 }, { "epoch": 6.6629431039618705, "grad_norm": 2.5929152965545654, "learning_rate": 1.5129045065924271e-05, "loss": 0.6822, "num_input_tokens_seen": 25801064, "step": 44735 }, { "epoch": 6.66368781650283, "grad_norm": 2.689664363861084, "learning_rate": 1.5123074583217812e-05, "loss": 0.595, "num_input_tokens_seen": 25803752, "step": 44740 }, { "epoch": 6.664432529043789, "grad_norm": 2.754460334777832, "learning_rate": 1.5117104767953818e-05, "loss": 0.6796, "num_input_tokens_seen": 25806984, "step": 44745 }, { "epoch": 6.665177241584749, "grad_norm": 1.5477125644683838, "learning_rate": 1.51111356205357e-05, "loss": 0.3611, "num_input_tokens_seen": 25809832, "step": 44750 }, { "epoch": 6.665921954125707, "grad_norm": 1.6968727111816406, "learning_rate": 1.5105167141366836e-05, "loss": 0.6082, "num_input_tokens_seen": 25812840, "step": 44755 }, { "epoch": 6.666666666666667, "grad_norm": 1.9969900846481323, "learning_rate": 1.509919933085054e-05, "loss": 0.4372, "num_input_tokens_seen": 25815880, "step": 44760 }, { "epoch": 6.667411379207626, "grad_norm": 1.7970657348632812, "learning_rate": 1.5093232189390103e-05, "loss": 0.5941, "num_input_tokens_seen": 25818696, "step": 44765 }, { "epoch": 6.668156091748585, "grad_norm": 1.8708027601242065, "learning_rate": 1.508726571738876e-05, "loss": 0.5719, "num_input_tokens_seen": 25822312, "step": 44770 }, { "epoch": 6.668900804289544, "grad_norm": 1.737720012664795, "learning_rate": 1.5081299915249702e-05, "loss": 0.6246, "num_input_tokens_seen": 25825832, "step": 44775 }, { "epoch": 6.669645516830504, "grad_norm": 2.887540578842163, "learning_rate": 1.507533478337606e-05, "loss": 0.5791, "num_input_tokens_seen": 25828744, "step": 44780 }, { "epoch": 6.6703902293714625, "grad_norm": 1.5693550109863281, "learning_rate": 1.5069370322170941e-05, "loss": 0.5966, "num_input_tokens_seen": 25831752, "step": 44785 }, { "epoch": 6.671134941912422, "grad_norm": 2.1144397258758545, "learning_rate": 1.5063406532037408e-05, "loss": 0.7012, "num_input_tokens_seen": 25834664, "step": 44790 }, { "epoch": 6.671879654453381, "grad_norm": 1.8236489295959473, "learning_rate": 1.5057443413378458e-05, "loss": 0.4449, "num_input_tokens_seen": 25837384, "step": 44795 }, { "epoch": 6.67262436699434, "grad_norm": 3.9256350994110107, "learning_rate": 1.5051480966597054e-05, "loss": 0.6521, "num_input_tokens_seen": 25840328, "step": 44800 }, { "epoch": 6.673369079535299, "grad_norm": 2.0601329803466797, "learning_rate": 1.5045519192096128e-05, "loss": 0.6892, "num_input_tokens_seen": 25843336, "step": 44805 }, { "epoch": 6.674113792076259, "grad_norm": 1.6908657550811768, "learning_rate": 1.5039558090278538e-05, "loss": 0.6263, "num_input_tokens_seen": 25846056, "step": 44810 }, { "epoch": 6.674858504617218, "grad_norm": 1.0690938234329224, "learning_rate": 1.5033597661547123e-05, "loss": 0.5639, "num_input_tokens_seen": 25848904, "step": 44815 }, { "epoch": 6.6756032171581765, "grad_norm": 3.161210298538208, "learning_rate": 1.5027637906304648e-05, "loss": 0.8383, "num_input_tokens_seen": 25851624, "step": 44820 }, { "epoch": 6.676347929699136, "grad_norm": 1.835089921951294, "learning_rate": 1.5021678824953867e-05, "loss": 0.5714, "num_input_tokens_seen": 25854568, "step": 44825 }, { "epoch": 6.677092642240096, "grad_norm": 3.486119270324707, "learning_rate": 1.5015720417897456e-05, "loss": 0.4597, "num_input_tokens_seen": 25857512, "step": 44830 }, { "epoch": 6.677837354781055, "grad_norm": 2.9134268760681152, "learning_rate": 1.5009762685538065e-05, "loss": 0.4646, "num_input_tokens_seen": 25860360, "step": 44835 }, { "epoch": 6.678582067322013, "grad_norm": 4.582796096801758, "learning_rate": 1.5003805628278297e-05, "loss": 0.5928, "num_input_tokens_seen": 25863304, "step": 44840 }, { "epoch": 6.679326779862973, "grad_norm": 3.66821551322937, "learning_rate": 1.49978492465207e-05, "loss": 0.7787, "num_input_tokens_seen": 25865928, "step": 44845 }, { "epoch": 6.680071492403932, "grad_norm": 1.9993456602096558, "learning_rate": 1.4991893540667783e-05, "loss": 0.7098, "num_input_tokens_seen": 25869096, "step": 44850 }, { "epoch": 6.680816204944891, "grad_norm": 1.8024734258651733, "learning_rate": 1.4985938511122027e-05, "loss": 0.5339, "num_input_tokens_seen": 25872008, "step": 44855 }, { "epoch": 6.68156091748585, "grad_norm": 2.321521759033203, "learning_rate": 1.497998415828582e-05, "loss": 0.4742, "num_input_tokens_seen": 25875240, "step": 44860 }, { "epoch": 6.68230563002681, "grad_norm": 1.1229256391525269, "learning_rate": 1.4974030482561546e-05, "loss": 0.8484, "num_input_tokens_seen": 25878472, "step": 44865 }, { "epoch": 6.6830503425677685, "grad_norm": 4.148249626159668, "learning_rate": 1.4968077484351529e-05, "loss": 0.7952, "num_input_tokens_seen": 25881288, "step": 44870 }, { "epoch": 6.683795055108728, "grad_norm": 1.645365834236145, "learning_rate": 1.496212516405805e-05, "loss": 0.7376, "num_input_tokens_seen": 25884104, "step": 44875 }, { "epoch": 6.684539767649687, "grad_norm": 1.8711422681808472, "learning_rate": 1.4956173522083338e-05, "loss": 0.7224, "num_input_tokens_seen": 25886856, "step": 44880 }, { "epoch": 6.685284480190647, "grad_norm": 3.758049726486206, "learning_rate": 1.4950222558829582e-05, "loss": 0.6628, "num_input_tokens_seen": 25889896, "step": 44885 }, { "epoch": 6.686029192731605, "grad_norm": 2.0535924434661865, "learning_rate": 1.4944272274698935e-05, "loss": 0.6516, "num_input_tokens_seen": 25892904, "step": 44890 }, { "epoch": 6.686773905272565, "grad_norm": 1.3346173763275146, "learning_rate": 1.4938322670093485e-05, "loss": 0.505, "num_input_tokens_seen": 25895624, "step": 44895 }, { "epoch": 6.687518617813524, "grad_norm": 2.3441381454467773, "learning_rate": 1.4932373745415273e-05, "loss": 0.6139, "num_input_tokens_seen": 25898632, "step": 44900 }, { "epoch": 6.688263330354483, "grad_norm": 1.588207483291626, "learning_rate": 1.4926425501066313e-05, "loss": 0.4351, "num_input_tokens_seen": 25901256, "step": 44905 }, { "epoch": 6.689008042895442, "grad_norm": 2.3450095653533936, "learning_rate": 1.4920477937448565e-05, "loss": 0.5134, "num_input_tokens_seen": 25903848, "step": 44910 }, { "epoch": 6.689752755436402, "grad_norm": 2.4107580184936523, "learning_rate": 1.4914531054963931e-05, "loss": 0.7171, "num_input_tokens_seen": 25906696, "step": 44915 }, { "epoch": 6.690497467977361, "grad_norm": 2.096998929977417, "learning_rate": 1.4908584854014294e-05, "loss": 0.6976, "num_input_tokens_seen": 25909992, "step": 44920 }, { "epoch": 6.69124218051832, "grad_norm": 1.9240641593933105, "learning_rate": 1.4902639335001456e-05, "loss": 0.5656, "num_input_tokens_seen": 25913096, "step": 44925 }, { "epoch": 6.691986893059279, "grad_norm": 2.3228931427001953, "learning_rate": 1.4896694498327195e-05, "loss": 0.5392, "num_input_tokens_seen": 25916168, "step": 44930 }, { "epoch": 6.692731605600239, "grad_norm": 2.296823740005493, "learning_rate": 1.4890750344393254e-05, "loss": 0.6705, "num_input_tokens_seen": 25919144, "step": 44935 }, { "epoch": 6.693476318141197, "grad_norm": 1.6897956132888794, "learning_rate": 1.4884806873601303e-05, "loss": 0.5869, "num_input_tokens_seen": 25922184, "step": 44940 }, { "epoch": 6.694221030682157, "grad_norm": 1.3131091594696045, "learning_rate": 1.4878864086352973e-05, "loss": 0.4864, "num_input_tokens_seen": 25925064, "step": 44945 }, { "epoch": 6.694965743223116, "grad_norm": 2.3104777336120605, "learning_rate": 1.4872921983049854e-05, "loss": 0.7257, "num_input_tokens_seen": 25927688, "step": 44950 }, { "epoch": 6.695710455764075, "grad_norm": 1.5982882976531982, "learning_rate": 1.4866980564093503e-05, "loss": 0.6683, "num_input_tokens_seen": 25930696, "step": 44955 }, { "epoch": 6.696455168305034, "grad_norm": 3.8058154582977295, "learning_rate": 1.4861039829885398e-05, "loss": 0.6142, "num_input_tokens_seen": 25933704, "step": 44960 }, { "epoch": 6.697199880845994, "grad_norm": 2.835350275039673, "learning_rate": 1.4855099780827004e-05, "loss": 0.8313, "num_input_tokens_seen": 25936584, "step": 44965 }, { "epoch": 6.697944593386953, "grad_norm": 2.2474374771118164, "learning_rate": 1.4849160417319724e-05, "loss": 0.6565, "num_input_tokens_seen": 25939464, "step": 44970 }, { "epoch": 6.698689305927912, "grad_norm": 2.902794599533081, "learning_rate": 1.4843221739764906e-05, "loss": 0.5335, "num_input_tokens_seen": 25942472, "step": 44975 }, { "epoch": 6.699434018468871, "grad_norm": 1.6984063386917114, "learning_rate": 1.483728374856388e-05, "loss": 0.5507, "num_input_tokens_seen": 25945192, "step": 44980 }, { "epoch": 6.70017873100983, "grad_norm": 2.5982539653778076, "learning_rate": 1.4831346444117888e-05, "loss": 0.5962, "num_input_tokens_seen": 25948232, "step": 44985 }, { "epoch": 6.700923443550789, "grad_norm": 5.027255535125732, "learning_rate": 1.4825409826828169e-05, "loss": 0.6479, "num_input_tokens_seen": 25951112, "step": 44990 }, { "epoch": 6.701668156091749, "grad_norm": 1.5398961305618286, "learning_rate": 1.4819473897095876e-05, "loss": 0.4961, "num_input_tokens_seen": 25954056, "step": 44995 }, { "epoch": 6.702412868632708, "grad_norm": 3.641206979751587, "learning_rate": 1.4813538655322151e-05, "loss": 0.6875, "num_input_tokens_seen": 25957096, "step": 45000 }, { "epoch": 6.703157581173667, "grad_norm": 6.891628742218018, "learning_rate": 1.4807604101908073e-05, "loss": 0.7177, "num_input_tokens_seen": 25959688, "step": 45005 }, { "epoch": 6.703902293714626, "grad_norm": 1.7153122425079346, "learning_rate": 1.4801670237254664e-05, "loss": 0.6942, "num_input_tokens_seen": 25962728, "step": 45010 }, { "epoch": 6.704647006255585, "grad_norm": 2.022246837615967, "learning_rate": 1.4795737061762918e-05, "loss": 0.6843, "num_input_tokens_seen": 25965480, "step": 45015 }, { "epoch": 6.705391718796545, "grad_norm": 4.298616886138916, "learning_rate": 1.4789804575833782e-05, "loss": 0.5159, "num_input_tokens_seen": 25968488, "step": 45020 }, { "epoch": 6.706136431337503, "grad_norm": 1.658649206161499, "learning_rate": 1.4783872779868141e-05, "loss": 0.4598, "num_input_tokens_seen": 25971496, "step": 45025 }, { "epoch": 6.706881143878463, "grad_norm": 1.504431962966919, "learning_rate": 1.4777941674266832e-05, "loss": 0.5471, "num_input_tokens_seen": 25974120, "step": 45030 }, { "epoch": 6.707625856419422, "grad_norm": 2.661341428756714, "learning_rate": 1.4772011259430668e-05, "loss": 0.5134, "num_input_tokens_seen": 25976840, "step": 45035 }, { "epoch": 6.708370568960381, "grad_norm": 2.3880233764648438, "learning_rate": 1.4766081535760401e-05, "loss": 0.7081, "num_input_tokens_seen": 25980104, "step": 45040 }, { "epoch": 6.70911528150134, "grad_norm": 1.8206212520599365, "learning_rate": 1.4760152503656733e-05, "loss": 0.5556, "num_input_tokens_seen": 25982664, "step": 45045 }, { "epoch": 6.7098599940423, "grad_norm": 2.3616204261779785, "learning_rate": 1.4754224163520325e-05, "loss": 0.5506, "num_input_tokens_seen": 25986024, "step": 45050 }, { "epoch": 6.710604706583259, "grad_norm": 2.5097804069519043, "learning_rate": 1.4748296515751797e-05, "loss": 0.5499, "num_input_tokens_seen": 25988808, "step": 45055 }, { "epoch": 6.711349419124218, "grad_norm": 2.2853004932403564, "learning_rate": 1.4742369560751718e-05, "loss": 0.7256, "num_input_tokens_seen": 25991720, "step": 45060 }, { "epoch": 6.712094131665177, "grad_norm": 1.5820813179016113, "learning_rate": 1.4736443298920588e-05, "loss": 0.5275, "num_input_tokens_seen": 25994408, "step": 45065 }, { "epoch": 6.712838844206137, "grad_norm": 1.40163254737854, "learning_rate": 1.4730517730658888e-05, "loss": 0.5498, "num_input_tokens_seen": 25997128, "step": 45070 }, { "epoch": 6.713583556747095, "grad_norm": 1.9848040342330933, "learning_rate": 1.4724592856367057e-05, "loss": 0.5816, "num_input_tokens_seen": 25999880, "step": 45075 }, { "epoch": 6.714328269288055, "grad_norm": 6.713292121887207, "learning_rate": 1.4718668676445454e-05, "loss": 0.5902, "num_input_tokens_seen": 26002696, "step": 45080 }, { "epoch": 6.715072981829014, "grad_norm": 1.8492578268051147, "learning_rate": 1.4712745191294431e-05, "loss": 0.6259, "num_input_tokens_seen": 26005768, "step": 45085 }, { "epoch": 6.7158176943699734, "grad_norm": 2.2623724937438965, "learning_rate": 1.4706822401314254e-05, "loss": 0.6714, "num_input_tokens_seen": 26008872, "step": 45090 }, { "epoch": 6.716562406910932, "grad_norm": 1.5545132160186768, "learning_rate": 1.4700900306905172e-05, "loss": 0.6274, "num_input_tokens_seen": 26011464, "step": 45095 }, { "epoch": 6.717307119451892, "grad_norm": 2.653317451477051, "learning_rate": 1.4694978908467384e-05, "loss": 0.7091, "num_input_tokens_seen": 26014408, "step": 45100 }, { "epoch": 6.718051831992851, "grad_norm": 2.102710008621216, "learning_rate": 1.468905820640102e-05, "loss": 0.8458, "num_input_tokens_seen": 26017672, "step": 45105 }, { "epoch": 6.71879654453381, "grad_norm": 3.208404541015625, "learning_rate": 1.4683138201106175e-05, "loss": 0.7143, "num_input_tokens_seen": 26020744, "step": 45110 }, { "epoch": 6.719541257074769, "grad_norm": 2.1316559314727783, "learning_rate": 1.4677218892982906e-05, "loss": 0.5856, "num_input_tokens_seen": 26024040, "step": 45115 }, { "epoch": 6.720285969615729, "grad_norm": 2.452108383178711, "learning_rate": 1.467130028243122e-05, "loss": 0.6482, "num_input_tokens_seen": 26026856, "step": 45120 }, { "epoch": 6.721030682156687, "grad_norm": 2.769378423690796, "learning_rate": 1.466538236985106e-05, "loss": 0.6595, "num_input_tokens_seen": 26029704, "step": 45125 }, { "epoch": 6.721775394697647, "grad_norm": 1.482272744178772, "learning_rate": 1.4659465155642343e-05, "loss": 0.6838, "num_input_tokens_seen": 26032360, "step": 45130 }, { "epoch": 6.722520107238606, "grad_norm": 1.6720640659332275, "learning_rate": 1.4653548640204934e-05, "loss": 0.5779, "num_input_tokens_seen": 26035336, "step": 45135 }, { "epoch": 6.7232648197795655, "grad_norm": 2.2958261966705322, "learning_rate": 1.4647632823938651e-05, "loss": 0.5872, "num_input_tokens_seen": 26038344, "step": 45140 }, { "epoch": 6.724009532320524, "grad_norm": 1.8899661302566528, "learning_rate": 1.4641717707243238e-05, "loss": 0.7809, "num_input_tokens_seen": 26041096, "step": 45145 }, { "epoch": 6.724754244861483, "grad_norm": 2.261709213256836, "learning_rate": 1.463580329051843e-05, "loss": 0.4995, "num_input_tokens_seen": 26044008, "step": 45150 }, { "epoch": 6.725498957402443, "grad_norm": 1.8009324073791504, "learning_rate": 1.4629889574163903e-05, "loss": 0.49, "num_input_tokens_seen": 26046760, "step": 45155 }, { "epoch": 6.726243669943402, "grad_norm": 2.0754032135009766, "learning_rate": 1.4623976558579272e-05, "loss": 0.6199, "num_input_tokens_seen": 26049800, "step": 45160 }, { "epoch": 6.726988382484361, "grad_norm": 2.3923237323760986, "learning_rate": 1.461806424416412e-05, "loss": 0.6147, "num_input_tokens_seen": 26052584, "step": 45165 }, { "epoch": 6.72773309502532, "grad_norm": 2.4389781951904297, "learning_rate": 1.4612152631317976e-05, "loss": 0.7411, "num_input_tokens_seen": 26055720, "step": 45170 }, { "epoch": 6.7284778075662794, "grad_norm": 2.975147247314453, "learning_rate": 1.4606241720440326e-05, "loss": 0.6183, "num_input_tokens_seen": 26058728, "step": 45175 }, { "epoch": 6.729222520107239, "grad_norm": 1.866768479347229, "learning_rate": 1.4600331511930609e-05, "loss": 0.5948, "num_input_tokens_seen": 26061544, "step": 45180 }, { "epoch": 6.729967232648198, "grad_norm": 1.9114694595336914, "learning_rate": 1.4594422006188196e-05, "loss": 0.5993, "num_input_tokens_seen": 26064488, "step": 45185 }, { "epoch": 6.730711945189157, "grad_norm": 2.381760597229004, "learning_rate": 1.458851320361244e-05, "loss": 0.5591, "num_input_tokens_seen": 26067304, "step": 45190 }, { "epoch": 6.731456657730116, "grad_norm": 2.426494598388672, "learning_rate": 1.458260510460264e-05, "loss": 0.5056, "num_input_tokens_seen": 26069896, "step": 45195 }, { "epoch": 6.732201370271075, "grad_norm": 2.7113895416259766, "learning_rate": 1.4576697709558023e-05, "loss": 0.6106, "num_input_tokens_seen": 26072904, "step": 45200 }, { "epoch": 6.732946082812035, "grad_norm": 1.7948029041290283, "learning_rate": 1.4570791018877796e-05, "loss": 0.5874, "num_input_tokens_seen": 26075592, "step": 45205 }, { "epoch": 6.733690795352993, "grad_norm": 1.5559602975845337, "learning_rate": 1.4564885032961112e-05, "loss": 0.5174, "num_input_tokens_seen": 26078408, "step": 45210 }, { "epoch": 6.734435507893953, "grad_norm": 4.37252140045166, "learning_rate": 1.455897975220707e-05, "loss": 0.5251, "num_input_tokens_seen": 26081288, "step": 45215 }, { "epoch": 6.735180220434912, "grad_norm": 1.8931300640106201, "learning_rate": 1.4553075177014736e-05, "loss": 0.506, "num_input_tokens_seen": 26084424, "step": 45220 }, { "epoch": 6.7359249329758715, "grad_norm": 2.0974395275115967, "learning_rate": 1.4547171307783097e-05, "loss": 0.6575, "num_input_tokens_seen": 26087624, "step": 45225 }, { "epoch": 6.73666964551683, "grad_norm": 3.8930928707122803, "learning_rate": 1.4541268144911135e-05, "loss": 0.5919, "num_input_tokens_seen": 26090536, "step": 45230 }, { "epoch": 6.73741435805779, "grad_norm": 2.1991584300994873, "learning_rate": 1.4535365688797735e-05, "loss": 0.4633, "num_input_tokens_seen": 26093032, "step": 45235 }, { "epoch": 6.738159070598749, "grad_norm": 1.2728914022445679, "learning_rate": 1.452946393984178e-05, "loss": 0.5477, "num_input_tokens_seen": 26095784, "step": 45240 }, { "epoch": 6.738903783139708, "grad_norm": 1.2911077737808228, "learning_rate": 1.4523562898442084e-05, "loss": 0.5556, "num_input_tokens_seen": 26099144, "step": 45245 }, { "epoch": 6.739648495680667, "grad_norm": 2.2329869270324707, "learning_rate": 1.451766256499741e-05, "loss": 0.3417, "num_input_tokens_seen": 26101736, "step": 45250 }, { "epoch": 6.740393208221627, "grad_norm": 1.9873931407928467, "learning_rate": 1.4511762939906481e-05, "loss": 0.541, "num_input_tokens_seen": 26104584, "step": 45255 }, { "epoch": 6.7411379207625854, "grad_norm": 1.491582989692688, "learning_rate": 1.4505864023567983e-05, "loss": 0.5615, "num_input_tokens_seen": 26107464, "step": 45260 }, { "epoch": 6.741882633303545, "grad_norm": 1.6052615642547607, "learning_rate": 1.4499965816380525e-05, "loss": 0.5876, "num_input_tokens_seen": 26110088, "step": 45265 }, { "epoch": 6.742627345844504, "grad_norm": 1.9797698259353638, "learning_rate": 1.4494068318742679e-05, "loss": 0.4781, "num_input_tokens_seen": 26112936, "step": 45270 }, { "epoch": 6.7433720583854635, "grad_norm": 1.319530725479126, "learning_rate": 1.4488171531052982e-05, "loss": 0.406, "num_input_tokens_seen": 26115784, "step": 45275 }, { "epoch": 6.744116770926422, "grad_norm": 1.8914144039154053, "learning_rate": 1.4482275453709915e-05, "loss": 0.7703, "num_input_tokens_seen": 26118760, "step": 45280 }, { "epoch": 6.744861483467382, "grad_norm": 2.121492385864258, "learning_rate": 1.447638008711191e-05, "loss": 0.6735, "num_input_tokens_seen": 26121448, "step": 45285 }, { "epoch": 6.745606196008341, "grad_norm": 3.141148090362549, "learning_rate": 1.4470485431657355e-05, "loss": 0.6774, "num_input_tokens_seen": 26124200, "step": 45290 }, { "epoch": 6.7463509085493, "grad_norm": 1.7963279485702515, "learning_rate": 1.446459148774459e-05, "loss": 0.5979, "num_input_tokens_seen": 26127048, "step": 45295 }, { "epoch": 6.747095621090259, "grad_norm": 1.969480276107788, "learning_rate": 1.4458698255771902e-05, "loss": 0.5137, "num_input_tokens_seen": 26129960, "step": 45300 }, { "epoch": 6.747840333631219, "grad_norm": 2.100395441055298, "learning_rate": 1.445280573613752e-05, "loss": 0.4594, "num_input_tokens_seen": 26132840, "step": 45305 }, { "epoch": 6.7485850461721775, "grad_norm": 3.2274510860443115, "learning_rate": 1.4446913929239642e-05, "loss": 0.8082, "num_input_tokens_seen": 26135656, "step": 45310 }, { "epoch": 6.749329758713137, "grad_norm": 2.69950008392334, "learning_rate": 1.4441022835476414e-05, "loss": 0.7057, "num_input_tokens_seen": 26138728, "step": 45315 }, { "epoch": 6.750074471254096, "grad_norm": 2.750601291656494, "learning_rate": 1.4435132455245934e-05, "loss": 0.7136, "num_input_tokens_seen": 26142120, "step": 45320 }, { "epoch": 6.7508191837950555, "grad_norm": 2.884126901626587, "learning_rate": 1.4429242788946259e-05, "loss": 0.7025, "num_input_tokens_seen": 26145032, "step": 45325 }, { "epoch": 6.751563896336014, "grad_norm": 4.703138828277588, "learning_rate": 1.442335383697537e-05, "loss": 0.9155, "num_input_tokens_seen": 26148008, "step": 45330 }, { "epoch": 6.752308608876973, "grad_norm": 3.4334404468536377, "learning_rate": 1.4417465599731226e-05, "loss": 0.529, "num_input_tokens_seen": 26150888, "step": 45335 }, { "epoch": 6.753053321417933, "grad_norm": 3.186319351196289, "learning_rate": 1.4411578077611743e-05, "loss": 0.6141, "num_input_tokens_seen": 26153896, "step": 45340 }, { "epoch": 6.753798033958892, "grad_norm": 2.3589723110198975, "learning_rate": 1.4405691271014751e-05, "loss": 0.6911, "num_input_tokens_seen": 26156840, "step": 45345 }, { "epoch": 6.754542746499851, "grad_norm": 1.9730138778686523, "learning_rate": 1.4399805180338072e-05, "loss": 0.546, "num_input_tokens_seen": 26159816, "step": 45350 }, { "epoch": 6.75528745904081, "grad_norm": 1.757697343826294, "learning_rate": 1.439391980597946e-05, "loss": 0.5158, "num_input_tokens_seen": 26162792, "step": 45355 }, { "epoch": 6.7560321715817695, "grad_norm": 1.230218529701233, "learning_rate": 1.4388035148336637e-05, "loss": 0.6051, "num_input_tokens_seen": 26165832, "step": 45360 }, { "epoch": 6.756776884122728, "grad_norm": 1.9554252624511719, "learning_rate": 1.4382151207807243e-05, "loss": 0.5055, "num_input_tokens_seen": 26168744, "step": 45365 }, { "epoch": 6.757521596663688, "grad_norm": 2.4265215396881104, "learning_rate": 1.4376267984788902e-05, "loss": 0.6622, "num_input_tokens_seen": 26171560, "step": 45370 }, { "epoch": 6.758266309204647, "grad_norm": 1.6225188970565796, "learning_rate": 1.4370385479679177e-05, "loss": 0.4928, "num_input_tokens_seen": 26174472, "step": 45375 }, { "epoch": 6.759011021745606, "grad_norm": 2.633335828781128, "learning_rate": 1.4364503692875598e-05, "loss": 0.5708, "num_input_tokens_seen": 26177352, "step": 45380 }, { "epoch": 6.759755734286565, "grad_norm": 1.5908100605010986, "learning_rate": 1.4358622624775603e-05, "loss": 0.5612, "num_input_tokens_seen": 26180136, "step": 45385 }, { "epoch": 6.760500446827525, "grad_norm": 3.8728020191192627, "learning_rate": 1.4352742275776632e-05, "loss": 0.6383, "num_input_tokens_seen": 26182920, "step": 45390 }, { "epoch": 6.7612451593684835, "grad_norm": 2.07161808013916, "learning_rate": 1.4346862646276055e-05, "loss": 0.8482, "num_input_tokens_seen": 26185832, "step": 45395 }, { "epoch": 6.761989871909443, "grad_norm": 3.1334519386291504, "learning_rate": 1.4340983736671179e-05, "loss": 0.4521, "num_input_tokens_seen": 26188520, "step": 45400 }, { "epoch": 6.762734584450402, "grad_norm": 2.122215509414673, "learning_rate": 1.4335105547359287e-05, "loss": 0.6487, "num_input_tokens_seen": 26191272, "step": 45405 }, { "epoch": 6.7634792969913615, "grad_norm": 3.253220319747925, "learning_rate": 1.43292280787376e-05, "loss": 0.5418, "num_input_tokens_seen": 26194152, "step": 45410 }, { "epoch": 6.76422400953232, "grad_norm": 3.1090919971466064, "learning_rate": 1.4323351331203296e-05, "loss": 0.5603, "num_input_tokens_seen": 26196680, "step": 45415 }, { "epoch": 6.76496872207328, "grad_norm": 2.140263557434082, "learning_rate": 1.431747530515351e-05, "loss": 0.6323, "num_input_tokens_seen": 26199688, "step": 45420 }, { "epoch": 6.765713434614239, "grad_norm": 1.3678778409957886, "learning_rate": 1.4311600000985303e-05, "loss": 0.5212, "num_input_tokens_seen": 26202568, "step": 45425 }, { "epoch": 6.766458147155198, "grad_norm": 1.3122813701629639, "learning_rate": 1.4305725419095722e-05, "loss": 0.6074, "num_input_tokens_seen": 26205736, "step": 45430 }, { "epoch": 6.767202859696157, "grad_norm": 1.2210818529129028, "learning_rate": 1.4299851559881727e-05, "loss": 0.5997, "num_input_tokens_seen": 26208712, "step": 45435 }, { "epoch": 6.767947572237117, "grad_norm": 3.1723289489746094, "learning_rate": 1.4293978423740259e-05, "loss": 0.6795, "num_input_tokens_seen": 26211592, "step": 45440 }, { "epoch": 6.7686922847780755, "grad_norm": 0.7300635576248169, "learning_rate": 1.4288106011068203e-05, "loss": 0.6778, "num_input_tokens_seen": 26214504, "step": 45445 }, { "epoch": 6.769436997319035, "grad_norm": 2.1078274250030518, "learning_rate": 1.4282234322262389e-05, "loss": 0.4593, "num_input_tokens_seen": 26217352, "step": 45450 }, { "epoch": 6.770181709859994, "grad_norm": 4.541370868682861, "learning_rate": 1.4276363357719605e-05, "loss": 0.7084, "num_input_tokens_seen": 26220296, "step": 45455 }, { "epoch": 6.7709264224009535, "grad_norm": 2.3093159198760986, "learning_rate": 1.4270493117836597e-05, "loss": 0.496, "num_input_tokens_seen": 26222824, "step": 45460 }, { "epoch": 6.771671134941912, "grad_norm": 1.6630587577819824, "learning_rate": 1.4264623603010042e-05, "loss": 0.5693, "num_input_tokens_seen": 26225512, "step": 45465 }, { "epoch": 6.772415847482872, "grad_norm": 1.6526354551315308, "learning_rate": 1.4258754813636565e-05, "loss": 0.6813, "num_input_tokens_seen": 26228296, "step": 45470 }, { "epoch": 6.773160560023831, "grad_norm": 4.498563766479492, "learning_rate": 1.4252886750112768e-05, "loss": 0.7115, "num_input_tokens_seen": 26231144, "step": 45475 }, { "epoch": 6.77390527256479, "grad_norm": 1.597739577293396, "learning_rate": 1.4247019412835188e-05, "loss": 0.6813, "num_input_tokens_seen": 26233960, "step": 45480 }, { "epoch": 6.774649985105749, "grad_norm": 4.563523769378662, "learning_rate": 1.4241152802200319e-05, "loss": 0.6697, "num_input_tokens_seen": 26236584, "step": 45485 }, { "epoch": 6.775394697646709, "grad_norm": 2.0121700763702393, "learning_rate": 1.4235286918604613e-05, "loss": 0.552, "num_input_tokens_seen": 26239400, "step": 45490 }, { "epoch": 6.7761394101876675, "grad_norm": 2.578739643096924, "learning_rate": 1.422942176244444e-05, "loss": 0.5875, "num_input_tokens_seen": 26242184, "step": 45495 }, { "epoch": 6.776884122728626, "grad_norm": 1.9707480669021606, "learning_rate": 1.4223557334116167e-05, "loss": 0.594, "num_input_tokens_seen": 26244808, "step": 45500 }, { "epoch": 6.777628835269586, "grad_norm": 1.090674877166748, "learning_rate": 1.421769363401606e-05, "loss": 0.3899, "num_input_tokens_seen": 26247560, "step": 45505 }, { "epoch": 6.778373547810546, "grad_norm": 1.3172987699508667, "learning_rate": 1.4211830662540381e-05, "loss": 0.5836, "num_input_tokens_seen": 26250536, "step": 45510 }, { "epoch": 6.779118260351504, "grad_norm": 1.715624213218689, "learning_rate": 1.4205968420085324e-05, "loss": 0.4864, "num_input_tokens_seen": 26253544, "step": 45515 }, { "epoch": 6.779862972892463, "grad_norm": 1.7954223155975342, "learning_rate": 1.4200106907047039e-05, "loss": 0.5157, "num_input_tokens_seen": 26256360, "step": 45520 }, { "epoch": 6.780607685433423, "grad_norm": 2.0577049255371094, "learning_rate": 1.419424612382163e-05, "loss": 0.3536, "num_input_tokens_seen": 26259272, "step": 45525 }, { "epoch": 6.781352397974382, "grad_norm": 2.244704008102417, "learning_rate": 1.418838607080512e-05, "loss": 0.7214, "num_input_tokens_seen": 26262280, "step": 45530 }, { "epoch": 6.782097110515341, "grad_norm": 3.5966217517852783, "learning_rate": 1.4182526748393526e-05, "loss": 0.5546, "num_input_tokens_seen": 26264968, "step": 45535 }, { "epoch": 6.7828418230563, "grad_norm": 1.4430725574493408, "learning_rate": 1.4176668156982798e-05, "loss": 0.3291, "num_input_tokens_seen": 26268072, "step": 45540 }, { "epoch": 6.7835865355972595, "grad_norm": 3.5928688049316406, "learning_rate": 1.4170810296968834e-05, "loss": 0.6888, "num_input_tokens_seen": 26270920, "step": 45545 }, { "epoch": 6.784331248138218, "grad_norm": 1.9410629272460938, "learning_rate": 1.4164953168747475e-05, "loss": 0.8221, "num_input_tokens_seen": 26273800, "step": 45550 }, { "epoch": 6.785075960679178, "grad_norm": 2.58121657371521, "learning_rate": 1.4159096772714531e-05, "loss": 0.6524, "num_input_tokens_seen": 26276712, "step": 45555 }, { "epoch": 6.785820673220137, "grad_norm": 2.2837817668914795, "learning_rate": 1.4153241109265759e-05, "loss": 0.5103, "num_input_tokens_seen": 26279400, "step": 45560 }, { "epoch": 6.786565385761096, "grad_norm": 3.176882028579712, "learning_rate": 1.414738617879684e-05, "loss": 0.7244, "num_input_tokens_seen": 26282344, "step": 45565 }, { "epoch": 6.787310098302055, "grad_norm": 1.4596894979476929, "learning_rate": 1.4141531981703444e-05, "loss": 0.5741, "num_input_tokens_seen": 26285352, "step": 45570 }, { "epoch": 6.788054810843015, "grad_norm": 2.7220828533172607, "learning_rate": 1.4135678518381168e-05, "loss": 0.5916, "num_input_tokens_seen": 26288104, "step": 45575 }, { "epoch": 6.7887995233839735, "grad_norm": 2.7167701721191406, "learning_rate": 1.4129825789225564e-05, "loss": 0.7692, "num_input_tokens_seen": 26290824, "step": 45580 }, { "epoch": 6.789544235924933, "grad_norm": 2.679692506790161, "learning_rate": 1.412397379463215e-05, "loss": 0.717, "num_input_tokens_seen": 26293832, "step": 45585 }, { "epoch": 6.790288948465892, "grad_norm": 6.031773567199707, "learning_rate": 1.4118122534996358e-05, "loss": 0.5464, "num_input_tokens_seen": 26296872, "step": 45590 }, { "epoch": 6.791033661006852, "grad_norm": 2.7908127307891846, "learning_rate": 1.4112272010713617e-05, "loss": 0.4654, "num_input_tokens_seen": 26299784, "step": 45595 }, { "epoch": 6.79177837354781, "grad_norm": 2.549459457397461, "learning_rate": 1.4106422222179252e-05, "loss": 0.5847, "num_input_tokens_seen": 26302440, "step": 45600 }, { "epoch": 6.79252308608877, "grad_norm": 3.283006191253662, "learning_rate": 1.4100573169788584e-05, "loss": 0.6334, "num_input_tokens_seen": 26304968, "step": 45605 }, { "epoch": 6.793267798629729, "grad_norm": 1.6036036014556885, "learning_rate": 1.4094724853936869e-05, "loss": 0.5929, "num_input_tokens_seen": 26307784, "step": 45610 }, { "epoch": 6.794012511170688, "grad_norm": 2.6133816242218018, "learning_rate": 1.4088877275019311e-05, "loss": 0.6584, "num_input_tokens_seen": 26310728, "step": 45615 }, { "epoch": 6.794757223711647, "grad_norm": 1.3041292428970337, "learning_rate": 1.4083030433431066e-05, "loss": 0.3566, "num_input_tokens_seen": 26313448, "step": 45620 }, { "epoch": 6.795501936252607, "grad_norm": 3.092266082763672, "learning_rate": 1.4077184329567244e-05, "loss": 0.6257, "num_input_tokens_seen": 26316296, "step": 45625 }, { "epoch": 6.7962466487935655, "grad_norm": 2.0442349910736084, "learning_rate": 1.40713389638229e-05, "loss": 0.481, "num_input_tokens_seen": 26318984, "step": 45630 }, { "epoch": 6.796991361334525, "grad_norm": 3.4757869243621826, "learning_rate": 1.4065494336593027e-05, "loss": 0.599, "num_input_tokens_seen": 26322088, "step": 45635 }, { "epoch": 6.797736073875484, "grad_norm": 2.3452367782592773, "learning_rate": 1.4059650448272587e-05, "loss": 0.6407, "num_input_tokens_seen": 26325000, "step": 45640 }, { "epoch": 6.798480786416444, "grad_norm": 1.8086178302764893, "learning_rate": 1.4053807299256495e-05, "loss": 0.4748, "num_input_tokens_seen": 26327912, "step": 45645 }, { "epoch": 6.799225498957402, "grad_norm": 3.7110209465026855, "learning_rate": 1.4047964889939596e-05, "loss": 0.7705, "num_input_tokens_seen": 26330536, "step": 45650 }, { "epoch": 6.799970211498362, "grad_norm": 1.2267179489135742, "learning_rate": 1.4042123220716713e-05, "loss": 0.4897, "num_input_tokens_seen": 26333544, "step": 45655 }, { "epoch": 6.800714924039321, "grad_norm": 1.9411888122558594, "learning_rate": 1.4036282291982583e-05, "loss": 0.4082, "num_input_tokens_seen": 26336328, "step": 45660 }, { "epoch": 6.8014596365802795, "grad_norm": 1.7314344644546509, "learning_rate": 1.403044210413193e-05, "loss": 0.4572, "num_input_tokens_seen": 26339144, "step": 45665 }, { "epoch": 6.802204349121239, "grad_norm": 1.194051742553711, "learning_rate": 1.4024602657559393e-05, "loss": 0.3781, "num_input_tokens_seen": 26342184, "step": 45670 }, { "epoch": 6.802949061662199, "grad_norm": 2.3160572052001953, "learning_rate": 1.4018763952659581e-05, "loss": 0.7555, "num_input_tokens_seen": 26345256, "step": 45675 }, { "epoch": 6.803693774203158, "grad_norm": 3.3101930618286133, "learning_rate": 1.4012925989827058e-05, "loss": 0.7366, "num_input_tokens_seen": 26348072, "step": 45680 }, { "epoch": 6.804438486744116, "grad_norm": 1.3573790788650513, "learning_rate": 1.4007088769456326e-05, "loss": 0.7436, "num_input_tokens_seen": 26351144, "step": 45685 }, { "epoch": 6.805183199285076, "grad_norm": 1.7181904315948486, "learning_rate": 1.400125229194185e-05, "loss": 0.654, "num_input_tokens_seen": 26353960, "step": 45690 }, { "epoch": 6.805927911826036, "grad_norm": 2.3921408653259277, "learning_rate": 1.3995416557678016e-05, "loss": 0.6428, "num_input_tokens_seen": 26356712, "step": 45695 }, { "epoch": 6.806672624366994, "grad_norm": 2.2891852855682373, "learning_rate": 1.398958156705919e-05, "loss": 0.6638, "num_input_tokens_seen": 26359560, "step": 45700 }, { "epoch": 6.807417336907953, "grad_norm": 2.0531728267669678, "learning_rate": 1.3983747320479688e-05, "loss": 0.5283, "num_input_tokens_seen": 26362536, "step": 45705 }, { "epoch": 6.808162049448913, "grad_norm": 1.2231606245040894, "learning_rate": 1.3977913818333744e-05, "loss": 0.5933, "num_input_tokens_seen": 26365480, "step": 45710 }, { "epoch": 6.8089067619898715, "grad_norm": 2.2098023891448975, "learning_rate": 1.3972081061015569e-05, "loss": 0.7487, "num_input_tokens_seen": 26368456, "step": 45715 }, { "epoch": 6.809651474530831, "grad_norm": 5.386143207550049, "learning_rate": 1.396624904891932e-05, "loss": 0.7591, "num_input_tokens_seen": 26371528, "step": 45720 }, { "epoch": 6.81039618707179, "grad_norm": 1.5313694477081299, "learning_rate": 1.3960417782439112e-05, "loss": 0.6056, "num_input_tokens_seen": 26374088, "step": 45725 }, { "epoch": 6.81114089961275, "grad_norm": 2.600675582885742, "learning_rate": 1.3954587261968974e-05, "loss": 0.532, "num_input_tokens_seen": 26376936, "step": 45730 }, { "epoch": 6.811885612153708, "grad_norm": 2.1329238414764404, "learning_rate": 1.3948757487902923e-05, "loss": 0.5833, "num_input_tokens_seen": 26379752, "step": 45735 }, { "epoch": 6.812630324694668, "grad_norm": 1.6173311471939087, "learning_rate": 1.3942928460634907e-05, "loss": 0.3178, "num_input_tokens_seen": 26382920, "step": 45740 }, { "epoch": 6.813375037235627, "grad_norm": 4.449467182159424, "learning_rate": 1.3937100180558846e-05, "loss": 0.7132, "num_input_tokens_seen": 26385960, "step": 45745 }, { "epoch": 6.814119749776586, "grad_norm": 3.213404893875122, "learning_rate": 1.3931272648068565e-05, "loss": 0.6175, "num_input_tokens_seen": 26388744, "step": 45750 }, { "epoch": 6.814864462317545, "grad_norm": 2.814387559890747, "learning_rate": 1.3925445863557873e-05, "loss": 0.4291, "num_input_tokens_seen": 26391592, "step": 45755 }, { "epoch": 6.815609174858505, "grad_norm": 2.0262224674224854, "learning_rate": 1.3919619827420538e-05, "loss": 0.6592, "num_input_tokens_seen": 26394504, "step": 45760 }, { "epoch": 6.816353887399464, "grad_norm": 1.7318861484527588, "learning_rate": 1.3913794540050234e-05, "loss": 0.6356, "num_input_tokens_seen": 26397256, "step": 45765 }, { "epoch": 6.817098599940423, "grad_norm": 2.660905122756958, "learning_rate": 1.390797000184062e-05, "loss": 0.5517, "num_input_tokens_seen": 26400072, "step": 45770 }, { "epoch": 6.817843312481382, "grad_norm": 3.6938092708587646, "learning_rate": 1.3902146213185297e-05, "loss": 0.454, "num_input_tokens_seen": 26402920, "step": 45775 }, { "epoch": 6.818588025022342, "grad_norm": 2.9053893089294434, "learning_rate": 1.3896323174477815e-05, "loss": 0.6237, "num_input_tokens_seen": 26406056, "step": 45780 }, { "epoch": 6.8193327375633, "grad_norm": 3.920275926589966, "learning_rate": 1.3890500886111673e-05, "loss": 0.7759, "num_input_tokens_seen": 26409224, "step": 45785 }, { "epoch": 6.82007745010426, "grad_norm": 1.9992518424987793, "learning_rate": 1.3884679348480309e-05, "loss": 0.6335, "num_input_tokens_seen": 26411912, "step": 45790 }, { "epoch": 6.820822162645219, "grad_norm": 1.630689024925232, "learning_rate": 1.3878858561977131e-05, "loss": 0.4884, "num_input_tokens_seen": 26414920, "step": 45795 }, { "epoch": 6.821566875186178, "grad_norm": 3.289472818374634, "learning_rate": 1.3873038526995466e-05, "loss": 0.6075, "num_input_tokens_seen": 26417768, "step": 45800 }, { "epoch": 6.822311587727137, "grad_norm": 3.069779396057129, "learning_rate": 1.386721924392862e-05, "loss": 0.721, "num_input_tokens_seen": 26420744, "step": 45805 }, { "epoch": 6.823056300268097, "grad_norm": 2.6382341384887695, "learning_rate": 1.3861400713169831e-05, "loss": 0.5317, "num_input_tokens_seen": 26423560, "step": 45810 }, { "epoch": 6.823801012809056, "grad_norm": 2.2859737873077393, "learning_rate": 1.38555829351123e-05, "loss": 0.541, "num_input_tokens_seen": 26426440, "step": 45815 }, { "epoch": 6.824545725350015, "grad_norm": 2.3704073429107666, "learning_rate": 1.384976591014917e-05, "loss": 0.5882, "num_input_tokens_seen": 26429416, "step": 45820 }, { "epoch": 6.825290437890974, "grad_norm": 2.20249605178833, "learning_rate": 1.384394963867352e-05, "loss": 0.5141, "num_input_tokens_seen": 26432104, "step": 45825 }, { "epoch": 6.826035150431934, "grad_norm": 2.4872751235961914, "learning_rate": 1.3838134121078403e-05, "loss": 0.705, "num_input_tokens_seen": 26435048, "step": 45830 }, { "epoch": 6.826779862972892, "grad_norm": 3.7632782459259033, "learning_rate": 1.3832319357756793e-05, "loss": 0.6559, "num_input_tokens_seen": 26437832, "step": 45835 }, { "epoch": 6.827524575513852, "grad_norm": 1.0360229015350342, "learning_rate": 1.3826505349101637e-05, "loss": 0.5797, "num_input_tokens_seen": 26440392, "step": 45840 }, { "epoch": 6.828269288054811, "grad_norm": 2.7114784717559814, "learning_rate": 1.3820692095505819e-05, "loss": 0.4622, "num_input_tokens_seen": 26443048, "step": 45845 }, { "epoch": 6.82901400059577, "grad_norm": 2.15867018699646, "learning_rate": 1.381487959736218e-05, "loss": 0.5235, "num_input_tokens_seen": 26446120, "step": 45850 }, { "epoch": 6.829758713136729, "grad_norm": 2.508833408355713, "learning_rate": 1.3809067855063512e-05, "loss": 0.5481, "num_input_tokens_seen": 26449128, "step": 45855 }, { "epoch": 6.830503425677689, "grad_norm": 2.106532335281372, "learning_rate": 1.3803256869002529e-05, "loss": 0.6864, "num_input_tokens_seen": 26452264, "step": 45860 }, { "epoch": 6.831248138218648, "grad_norm": 3.2047715187072754, "learning_rate": 1.379744663957193e-05, "loss": 0.5501, "num_input_tokens_seen": 26455176, "step": 45865 }, { "epoch": 6.831992850759606, "grad_norm": 2.697550058364868, "learning_rate": 1.3791637167164337e-05, "loss": 0.5699, "num_input_tokens_seen": 26457928, "step": 45870 }, { "epoch": 6.832737563300566, "grad_norm": 1.5698260068893433, "learning_rate": 1.3785828452172333e-05, "loss": 0.57, "num_input_tokens_seen": 26462472, "step": 45875 }, { "epoch": 6.833482275841525, "grad_norm": 3.016556978225708, "learning_rate": 1.3780020494988446e-05, "loss": 0.5156, "num_input_tokens_seen": 26465224, "step": 45880 }, { "epoch": 6.834226988382484, "grad_norm": 1.6054414510726929, "learning_rate": 1.3774213296005159e-05, "loss": 0.4934, "num_input_tokens_seen": 26467976, "step": 45885 }, { "epoch": 6.834971700923443, "grad_norm": 1.7591694593429565, "learning_rate": 1.3768406855614907e-05, "loss": 0.534, "num_input_tokens_seen": 26470888, "step": 45890 }, { "epoch": 6.835716413464403, "grad_norm": 3.772305488586426, "learning_rate": 1.3762601174210044e-05, "loss": 0.5989, "num_input_tokens_seen": 26473864, "step": 45895 }, { "epoch": 6.836461126005362, "grad_norm": 1.3604437112808228, "learning_rate": 1.3756796252182907e-05, "loss": 0.5086, "num_input_tokens_seen": 26476648, "step": 45900 }, { "epoch": 6.837205838546321, "grad_norm": 4.68548583984375, "learning_rate": 1.3750992089925777e-05, "loss": 0.5889, "num_input_tokens_seen": 26479336, "step": 45905 }, { "epoch": 6.83795055108728, "grad_norm": 4.2902021408081055, "learning_rate": 1.3745188687830857e-05, "loss": 0.6198, "num_input_tokens_seen": 26482120, "step": 45910 }, { "epoch": 6.83869526362824, "grad_norm": 2.511051893234253, "learning_rate": 1.3739386046290326e-05, "loss": 0.4393, "num_input_tokens_seen": 26485064, "step": 45915 }, { "epoch": 6.839439976169198, "grad_norm": 3.3031158447265625, "learning_rate": 1.3733584165696304e-05, "loss": 0.4275, "num_input_tokens_seen": 26487848, "step": 45920 }, { "epoch": 6.840184688710158, "grad_norm": 5.3175482749938965, "learning_rate": 1.3727783046440868e-05, "loss": 0.7668, "num_input_tokens_seen": 26491048, "step": 45925 }, { "epoch": 6.840929401251117, "grad_norm": 1.6499234437942505, "learning_rate": 1.3721982688916014e-05, "loss": 0.7359, "num_input_tokens_seen": 26493640, "step": 45930 }, { "epoch": 6.8416741137920765, "grad_norm": 1.405204176902771, "learning_rate": 1.3716183093513717e-05, "loss": 0.5846, "num_input_tokens_seen": 26496488, "step": 45935 }, { "epoch": 6.842418826333035, "grad_norm": 1.6911860704421997, "learning_rate": 1.3710384260625891e-05, "loss": 0.6366, "num_input_tokens_seen": 26499720, "step": 45940 }, { "epoch": 6.843163538873995, "grad_norm": 2.562779664993286, "learning_rate": 1.3704586190644405e-05, "loss": 0.7178, "num_input_tokens_seen": 26502632, "step": 45945 }, { "epoch": 6.843908251414954, "grad_norm": 1.7164865732192993, "learning_rate": 1.369878888396105e-05, "loss": 0.5982, "num_input_tokens_seen": 26505576, "step": 45950 }, { "epoch": 6.844652963955913, "grad_norm": 1.752801775932312, "learning_rate": 1.3692992340967598e-05, "loss": 0.6457, "num_input_tokens_seen": 26508264, "step": 45955 }, { "epoch": 6.845397676496872, "grad_norm": 3.1766250133514404, "learning_rate": 1.368719656205576e-05, "loss": 0.4963, "num_input_tokens_seen": 26511144, "step": 45960 }, { "epoch": 6.846142389037832, "grad_norm": 3.4395933151245117, "learning_rate": 1.3681401547617173e-05, "loss": 0.5715, "num_input_tokens_seen": 26513864, "step": 45965 }, { "epoch": 6.84688710157879, "grad_norm": 3.641010046005249, "learning_rate": 1.3675607298043453e-05, "loss": 0.6772, "num_input_tokens_seen": 26516744, "step": 45970 }, { "epoch": 6.84763181411975, "grad_norm": 2.8711116313934326, "learning_rate": 1.3669813813726151e-05, "loss": 0.4262, "num_input_tokens_seen": 26519688, "step": 45975 }, { "epoch": 6.848376526660709, "grad_norm": 2.0286169052124023, "learning_rate": 1.3664021095056764e-05, "loss": 0.5848, "num_input_tokens_seen": 26522504, "step": 45980 }, { "epoch": 6.8491212392016685, "grad_norm": 1.7245874404907227, "learning_rate": 1.3658229142426754e-05, "loss": 0.7813, "num_input_tokens_seen": 26525224, "step": 45985 }, { "epoch": 6.849865951742627, "grad_norm": 1.938808798789978, "learning_rate": 1.3652437956227496e-05, "loss": 0.7313, "num_input_tokens_seen": 26527944, "step": 45990 }, { "epoch": 6.850610664283587, "grad_norm": 1.846601128578186, "learning_rate": 1.3646647536850354e-05, "loss": 0.5284, "num_input_tokens_seen": 26530728, "step": 45995 }, { "epoch": 6.851355376824546, "grad_norm": 1.443647861480713, "learning_rate": 1.3640857884686603e-05, "loss": 0.538, "num_input_tokens_seen": 26533416, "step": 46000 }, { "epoch": 6.852100089365505, "grad_norm": 1.7722885608673096, "learning_rate": 1.3635069000127493e-05, "loss": 0.5625, "num_input_tokens_seen": 26536488, "step": 46005 }, { "epoch": 6.852844801906464, "grad_norm": 2.135680913925171, "learning_rate": 1.3629280883564217e-05, "loss": 0.7953, "num_input_tokens_seen": 26539144, "step": 46010 }, { "epoch": 6.853589514447423, "grad_norm": 3.776550054550171, "learning_rate": 1.3623493535387905e-05, "loss": 0.5057, "num_input_tokens_seen": 26541992, "step": 46015 }, { "epoch": 6.8543342269883825, "grad_norm": 3.2079594135284424, "learning_rate": 1.3617706955989656e-05, "loss": 0.5761, "num_input_tokens_seen": 26544840, "step": 46020 }, { "epoch": 6.855078939529342, "grad_norm": 2.5075697898864746, "learning_rate": 1.3611921145760487e-05, "loss": 0.6235, "num_input_tokens_seen": 26548040, "step": 46025 }, { "epoch": 6.855823652070301, "grad_norm": 2.6420633792877197, "learning_rate": 1.3606136105091393e-05, "loss": 0.5475, "num_input_tokens_seen": 26550888, "step": 46030 }, { "epoch": 6.85656836461126, "grad_norm": 3.849795341491699, "learning_rate": 1.3600351834373286e-05, "loss": 0.5933, "num_input_tokens_seen": 26553608, "step": 46035 }, { "epoch": 6.857313077152219, "grad_norm": 1.6242380142211914, "learning_rate": 1.3594568333997059e-05, "loss": 0.638, "num_input_tokens_seen": 26556936, "step": 46040 }, { "epoch": 6.858057789693179, "grad_norm": 2.257410764694214, "learning_rate": 1.3588785604353532e-05, "loss": 0.6177, "num_input_tokens_seen": 26559624, "step": 46045 }, { "epoch": 6.858802502234138, "grad_norm": 2.4136035442352295, "learning_rate": 1.3583003645833478e-05, "loss": 0.5373, "num_input_tokens_seen": 26562280, "step": 46050 }, { "epoch": 6.859547214775096, "grad_norm": 2.0000839233398438, "learning_rate": 1.3577222458827628e-05, "loss": 0.6985, "num_input_tokens_seen": 26565256, "step": 46055 }, { "epoch": 6.860291927316056, "grad_norm": 1.023713231086731, "learning_rate": 1.3571442043726634e-05, "loss": 0.5617, "num_input_tokens_seen": 26568040, "step": 46060 }, { "epoch": 6.861036639857015, "grad_norm": 3.524507761001587, "learning_rate": 1.356566240092113e-05, "loss": 0.549, "num_input_tokens_seen": 26570792, "step": 46065 }, { "epoch": 6.8617813523979745, "grad_norm": 2.946277618408203, "learning_rate": 1.3559883530801667e-05, "loss": 0.6676, "num_input_tokens_seen": 26573640, "step": 46070 }, { "epoch": 6.862526064938933, "grad_norm": 0.7749804258346558, "learning_rate": 1.355410543375876e-05, "loss": 0.6688, "num_input_tokens_seen": 26576424, "step": 46075 }, { "epoch": 6.863270777479893, "grad_norm": 1.8291828632354736, "learning_rate": 1.3548328110182873e-05, "loss": 0.6336, "num_input_tokens_seen": 26579272, "step": 46080 }, { "epoch": 6.864015490020852, "grad_norm": 1.905158281326294, "learning_rate": 1.3542551560464412e-05, "loss": 0.6311, "num_input_tokens_seen": 26581992, "step": 46085 }, { "epoch": 6.864760202561811, "grad_norm": 1.6036536693572998, "learning_rate": 1.3536775784993744e-05, "loss": 0.4884, "num_input_tokens_seen": 26584616, "step": 46090 }, { "epoch": 6.86550491510277, "grad_norm": 2.3064048290252686, "learning_rate": 1.3531000784161152e-05, "loss": 0.6588, "num_input_tokens_seen": 26587336, "step": 46095 }, { "epoch": 6.86624962764373, "grad_norm": 2.2850658893585205, "learning_rate": 1.3525226558356895e-05, "loss": 0.6148, "num_input_tokens_seen": 26590120, "step": 46100 }, { "epoch": 6.8669943401846885, "grad_norm": 2.5479648113250732, "learning_rate": 1.3519453107971191e-05, "loss": 0.53, "num_input_tokens_seen": 26592520, "step": 46105 }, { "epoch": 6.867739052725648, "grad_norm": 1.4317015409469604, "learning_rate": 1.3513680433394154e-05, "loss": 0.6369, "num_input_tokens_seen": 26595432, "step": 46110 }, { "epoch": 6.868483765266607, "grad_norm": 3.438523530960083, "learning_rate": 1.3507908535015895e-05, "loss": 0.9019, "num_input_tokens_seen": 26598216, "step": 46115 }, { "epoch": 6.8692284778075665, "grad_norm": 1.6146595478057861, "learning_rate": 1.3502137413226453e-05, "loss": 0.8099, "num_input_tokens_seen": 26600904, "step": 46120 }, { "epoch": 6.869973190348525, "grad_norm": 2.5147042274475098, "learning_rate": 1.349636706841583e-05, "loss": 0.5153, "num_input_tokens_seen": 26603496, "step": 46125 }, { "epoch": 6.870717902889485, "grad_norm": 2.061619758605957, "learning_rate": 1.349059750097394e-05, "loss": 0.7207, "num_input_tokens_seen": 26606440, "step": 46130 }, { "epoch": 6.871462615430444, "grad_norm": 1.7335681915283203, "learning_rate": 1.3484828711290676e-05, "loss": 0.5725, "num_input_tokens_seen": 26609320, "step": 46135 }, { "epoch": 6.872207327971403, "grad_norm": 1.8019373416900635, "learning_rate": 1.347906069975587e-05, "loss": 0.6718, "num_input_tokens_seen": 26612264, "step": 46140 }, { "epoch": 6.872952040512362, "grad_norm": 8.34068775177002, "learning_rate": 1.34732934667593e-05, "loss": 0.5753, "num_input_tokens_seen": 26615208, "step": 46145 }, { "epoch": 6.873696753053322, "grad_norm": 1.9979124069213867, "learning_rate": 1.3467527012690707e-05, "loss": 0.6315, "num_input_tokens_seen": 26617992, "step": 46150 }, { "epoch": 6.8744414655942805, "grad_norm": 1.828308343887329, "learning_rate": 1.3461761337939736e-05, "loss": 0.4691, "num_input_tokens_seen": 26621160, "step": 46155 }, { "epoch": 6.87518617813524, "grad_norm": 1.3988982439041138, "learning_rate": 1.3455996442896036e-05, "loss": 0.7333, "num_input_tokens_seen": 26623912, "step": 46160 }, { "epoch": 6.875930890676199, "grad_norm": 2.7643983364105225, "learning_rate": 1.345023232794915e-05, "loss": 0.7393, "num_input_tokens_seen": 26626536, "step": 46165 }, { "epoch": 6.8766756032171585, "grad_norm": 2.656550168991089, "learning_rate": 1.3444468993488607e-05, "loss": 0.6622, "num_input_tokens_seen": 26629640, "step": 46170 }, { "epoch": 6.877420315758117, "grad_norm": 1.10590660572052, "learning_rate": 1.3438706439903866e-05, "loss": 0.4422, "num_input_tokens_seen": 26632680, "step": 46175 }, { "epoch": 6.878165028299077, "grad_norm": 2.223196268081665, "learning_rate": 1.343294466758434e-05, "loss": 0.8051, "num_input_tokens_seen": 26635368, "step": 46180 }, { "epoch": 6.878909740840036, "grad_norm": 1.6320708990097046, "learning_rate": 1.3427183676919396e-05, "loss": 0.8051, "num_input_tokens_seen": 26638248, "step": 46185 }, { "epoch": 6.879654453380995, "grad_norm": 2.2238311767578125, "learning_rate": 1.3421423468298316e-05, "loss": 0.6986, "num_input_tokens_seen": 26641032, "step": 46190 }, { "epoch": 6.880399165921954, "grad_norm": 2.931610107421875, "learning_rate": 1.3415664042110376e-05, "loss": 0.4232, "num_input_tokens_seen": 26643944, "step": 46195 }, { "epoch": 6.881143878462913, "grad_norm": 1.7214953899383545, "learning_rate": 1.3409905398744748e-05, "loss": 0.5746, "num_input_tokens_seen": 26646696, "step": 46200 }, { "epoch": 6.8818885910038725, "grad_norm": 1.8973820209503174, "learning_rate": 1.3404147538590595e-05, "loss": 0.463, "num_input_tokens_seen": 26649640, "step": 46205 }, { "epoch": 6.882633303544832, "grad_norm": 2.8804516792297363, "learning_rate": 1.3398390462037002e-05, "loss": 0.8409, "num_input_tokens_seen": 26652456, "step": 46210 }, { "epoch": 6.883378016085791, "grad_norm": 1.629378318786621, "learning_rate": 1.3392634169473018e-05, "loss": 0.6772, "num_input_tokens_seen": 26655176, "step": 46215 }, { "epoch": 6.88412272862675, "grad_norm": 1.9697078466415405, "learning_rate": 1.338687866128763e-05, "loss": 0.7072, "num_input_tokens_seen": 26658088, "step": 46220 }, { "epoch": 6.884867441167709, "grad_norm": 2.674656867980957, "learning_rate": 1.3381123937869758e-05, "loss": 0.5968, "num_input_tokens_seen": 26660936, "step": 46225 }, { "epoch": 6.885612153708668, "grad_norm": 3.2929608821868896, "learning_rate": 1.33753699996083e-05, "loss": 0.6089, "num_input_tokens_seen": 26663656, "step": 46230 }, { "epoch": 6.886356866249628, "grad_norm": 3.939518928527832, "learning_rate": 1.3369616846892069e-05, "loss": 0.771, "num_input_tokens_seen": 26666568, "step": 46235 }, { "epoch": 6.8871015787905865, "grad_norm": 3.0720736980438232, "learning_rate": 1.3363864480109842e-05, "loss": 0.5645, "num_input_tokens_seen": 26669672, "step": 46240 }, { "epoch": 6.887846291331546, "grad_norm": 2.5759479999542236, "learning_rate": 1.3358112899650345e-05, "loss": 0.676, "num_input_tokens_seen": 26672648, "step": 46245 }, { "epoch": 6.888591003872505, "grad_norm": 2.6953232288360596, "learning_rate": 1.3352362105902246e-05, "loss": 0.6482, "num_input_tokens_seen": 26675656, "step": 46250 }, { "epoch": 6.8893357164134645, "grad_norm": 1.3171433210372925, "learning_rate": 1.3346612099254172e-05, "loss": 0.6119, "num_input_tokens_seen": 26678568, "step": 46255 }, { "epoch": 6.890080428954423, "grad_norm": 3.1559653282165527, "learning_rate": 1.3340862880094661e-05, "loss": 0.5689, "num_input_tokens_seen": 26681224, "step": 46260 }, { "epoch": 6.890825141495383, "grad_norm": 1.8057353496551514, "learning_rate": 1.3335114448812235e-05, "loss": 0.5794, "num_input_tokens_seen": 26684232, "step": 46265 }, { "epoch": 6.891569854036342, "grad_norm": 1.393950343132019, "learning_rate": 1.3329366805795357e-05, "loss": 0.618, "num_input_tokens_seen": 26686984, "step": 46270 }, { "epoch": 6.892314566577301, "grad_norm": 3.015563488006592, "learning_rate": 1.3323619951432415e-05, "loss": 0.5355, "num_input_tokens_seen": 26689800, "step": 46275 }, { "epoch": 6.89305927911826, "grad_norm": 3.1505353450775146, "learning_rate": 1.3317873886111759e-05, "loss": 0.6992, "num_input_tokens_seen": 26692808, "step": 46280 }, { "epoch": 6.89380399165922, "grad_norm": 4.112456798553467, "learning_rate": 1.331212861022169e-05, "loss": 0.7803, "num_input_tokens_seen": 26695432, "step": 46285 }, { "epoch": 6.8945487042001785, "grad_norm": 2.138068199157715, "learning_rate": 1.3306384124150464e-05, "loss": 0.585, "num_input_tokens_seen": 26698376, "step": 46290 }, { "epoch": 6.895293416741138, "grad_norm": 2.4326412677764893, "learning_rate": 1.3300640428286244e-05, "loss": 0.6341, "num_input_tokens_seen": 26701128, "step": 46295 }, { "epoch": 6.896038129282097, "grad_norm": 1.7429596185684204, "learning_rate": 1.3294897523017177e-05, "loss": 0.602, "num_input_tokens_seen": 26703912, "step": 46300 }, { "epoch": 6.896782841823057, "grad_norm": 1.6103607416152954, "learning_rate": 1.3289155408731346e-05, "loss": 0.5515, "num_input_tokens_seen": 26707080, "step": 46305 }, { "epoch": 6.897527554364015, "grad_norm": 8.826621055603027, "learning_rate": 1.3283414085816793e-05, "loss": 0.7254, "num_input_tokens_seen": 26709768, "step": 46310 }, { "epoch": 6.898272266904975, "grad_norm": 2.140960693359375, "learning_rate": 1.3277673554661466e-05, "loss": 0.6495, "num_input_tokens_seen": 26712840, "step": 46315 }, { "epoch": 6.899016979445934, "grad_norm": 1.850374698638916, "learning_rate": 1.3271933815653303e-05, "loss": 0.616, "num_input_tokens_seen": 26715752, "step": 46320 }, { "epoch": 6.899761691986893, "grad_norm": 4.586699962615967, "learning_rate": 1.3266194869180176e-05, "loss": 0.673, "num_input_tokens_seen": 26718760, "step": 46325 }, { "epoch": 6.900506404527852, "grad_norm": 1.7926335334777832, "learning_rate": 1.3260456715629888e-05, "loss": 0.5556, "num_input_tokens_seen": 26722120, "step": 46330 }, { "epoch": 6.901251117068812, "grad_norm": 2.2459630966186523, "learning_rate": 1.3254719355390206e-05, "loss": 0.619, "num_input_tokens_seen": 26724936, "step": 46335 }, { "epoch": 6.9019958296097705, "grad_norm": 0.9449070692062378, "learning_rate": 1.3248982788848832e-05, "loss": 0.5657, "num_input_tokens_seen": 26727720, "step": 46340 }, { "epoch": 6.90274054215073, "grad_norm": 2.190859079360962, "learning_rate": 1.3243247016393429e-05, "loss": 0.5462, "num_input_tokens_seen": 26730536, "step": 46345 }, { "epoch": 6.903485254691689, "grad_norm": 3.942652463912964, "learning_rate": 1.3237512038411604e-05, "loss": 0.5031, "num_input_tokens_seen": 26733448, "step": 46350 }, { "epoch": 6.904229967232649, "grad_norm": 3.8606128692626953, "learning_rate": 1.3231777855290878e-05, "loss": 0.5321, "num_input_tokens_seen": 26737608, "step": 46355 }, { "epoch": 6.904974679773607, "grad_norm": 1.644954800605774, "learning_rate": 1.3226044467418771e-05, "loss": 0.5289, "num_input_tokens_seen": 26740360, "step": 46360 }, { "epoch": 6.905719392314566, "grad_norm": 2.690880298614502, "learning_rate": 1.3220311875182701e-05, "loss": 0.5697, "num_input_tokens_seen": 26743240, "step": 46365 }, { "epoch": 6.906464104855526, "grad_norm": 1.8427705764770508, "learning_rate": 1.3214580078970063e-05, "loss": 0.5976, "num_input_tokens_seen": 26746216, "step": 46370 }, { "epoch": 6.907208817396485, "grad_norm": 4.327371597290039, "learning_rate": 1.3208849079168184e-05, "loss": 0.7559, "num_input_tokens_seen": 26749576, "step": 46375 }, { "epoch": 6.907953529937444, "grad_norm": 1.76040518283844, "learning_rate": 1.3203118876164345e-05, "loss": 0.625, "num_input_tokens_seen": 26752648, "step": 46380 }, { "epoch": 6.908698242478403, "grad_norm": 2.018583297729492, "learning_rate": 1.3197389470345778e-05, "loss": 0.4475, "num_input_tokens_seen": 26755272, "step": 46385 }, { "epoch": 6.909442955019363, "grad_norm": 1.9778127670288086, "learning_rate": 1.3191660862099647e-05, "loss": 0.507, "num_input_tokens_seen": 26757768, "step": 46390 }, { "epoch": 6.910187667560322, "grad_norm": 1.9558861255645752, "learning_rate": 1.3185933051813057e-05, "loss": 0.5491, "num_input_tokens_seen": 26760488, "step": 46395 }, { "epoch": 6.910932380101281, "grad_norm": 1.9766136407852173, "learning_rate": 1.3180206039873078e-05, "loss": 0.5447, "num_input_tokens_seen": 26763464, "step": 46400 }, { "epoch": 6.91167709264224, "grad_norm": 3.401093006134033, "learning_rate": 1.317447982666672e-05, "loss": 0.6588, "num_input_tokens_seen": 26766472, "step": 46405 }, { "epoch": 6.912421805183199, "grad_norm": 2.2629692554473877, "learning_rate": 1.3168754412580934e-05, "loss": 0.5082, "num_input_tokens_seen": 26769416, "step": 46410 }, { "epoch": 6.913166517724158, "grad_norm": 2.987868309020996, "learning_rate": 1.3163029798002625e-05, "loss": 0.747, "num_input_tokens_seen": 26772232, "step": 46415 }, { "epoch": 6.913911230265118, "grad_norm": 3.174894094467163, "learning_rate": 1.3157305983318643e-05, "loss": 0.6379, "num_input_tokens_seen": 26775080, "step": 46420 }, { "epoch": 6.9146559428060765, "grad_norm": 4.402744293212891, "learning_rate": 1.3151582968915766e-05, "loss": 0.4169, "num_input_tokens_seen": 26777832, "step": 46425 }, { "epoch": 6.915400655347036, "grad_norm": 2.1675455570220947, "learning_rate": 1.314586075518075e-05, "loss": 0.7428, "num_input_tokens_seen": 26780584, "step": 46430 }, { "epoch": 6.916145367887995, "grad_norm": 1.8422226905822754, "learning_rate": 1.3140139342500257e-05, "loss": 0.5511, "num_input_tokens_seen": 26783400, "step": 46435 }, { "epoch": 6.916890080428955, "grad_norm": 1.8346017599105835, "learning_rate": 1.3134418731260931e-05, "loss": 0.6267, "num_input_tokens_seen": 26786472, "step": 46440 }, { "epoch": 6.917634792969913, "grad_norm": 1.8087334632873535, "learning_rate": 1.3128698921849344e-05, "loss": 0.6645, "num_input_tokens_seen": 26789192, "step": 46445 }, { "epoch": 6.918379505510873, "grad_norm": 2.0717661380767822, "learning_rate": 1.3122979914652016e-05, "loss": 0.3681, "num_input_tokens_seen": 26792488, "step": 46450 }, { "epoch": 6.919124218051832, "grad_norm": 1.9650352001190186, "learning_rate": 1.3117261710055433e-05, "loss": 0.5401, "num_input_tokens_seen": 26795336, "step": 46455 }, { "epoch": 6.919868930592791, "grad_norm": 2.192192554473877, "learning_rate": 1.3111544308445977e-05, "loss": 0.6885, "num_input_tokens_seen": 26798248, "step": 46460 }, { "epoch": 6.92061364313375, "grad_norm": 2.6528053283691406, "learning_rate": 1.3105827710210026e-05, "loss": 0.6632, "num_input_tokens_seen": 26800872, "step": 46465 }, { "epoch": 6.92135835567471, "grad_norm": 2.9765563011169434, "learning_rate": 1.3100111915733887e-05, "loss": 0.6865, "num_input_tokens_seen": 26803720, "step": 46470 }, { "epoch": 6.922103068215669, "grad_norm": 3.5028927326202393, "learning_rate": 1.3094396925403793e-05, "loss": 0.6445, "num_input_tokens_seen": 26806728, "step": 46475 }, { "epoch": 6.922847780756628, "grad_norm": 1.5324745178222656, "learning_rate": 1.308868273960595e-05, "loss": 0.5229, "num_input_tokens_seen": 26809320, "step": 46480 }, { "epoch": 6.923592493297587, "grad_norm": 1.6778783798217773, "learning_rate": 1.3082969358726502e-05, "loss": 0.5005, "num_input_tokens_seen": 26812072, "step": 46485 }, { "epoch": 6.924337205838547, "grad_norm": 2.2364799976348877, "learning_rate": 1.3077256783151542e-05, "loss": 0.7128, "num_input_tokens_seen": 26815048, "step": 46490 }, { "epoch": 6.925081918379505, "grad_norm": 1.9277688264846802, "learning_rate": 1.3071545013267084e-05, "loss": 0.8139, "num_input_tokens_seen": 26818248, "step": 46495 }, { "epoch": 6.925826630920465, "grad_norm": 2.289416790008545, "learning_rate": 1.3065834049459117e-05, "loss": 0.5726, "num_input_tokens_seen": 26820968, "step": 46500 }, { "epoch": 6.926571343461424, "grad_norm": 3.5777719020843506, "learning_rate": 1.3060123892113562e-05, "loss": 0.4148, "num_input_tokens_seen": 26823912, "step": 46505 }, { "epoch": 6.927316056002383, "grad_norm": 1.955237627029419, "learning_rate": 1.3054414541616305e-05, "loss": 0.6855, "num_input_tokens_seen": 26826632, "step": 46510 }, { "epoch": 6.928060768543342, "grad_norm": 1.4839460849761963, "learning_rate": 1.3048705998353133e-05, "loss": 0.5658, "num_input_tokens_seen": 26829480, "step": 46515 }, { "epoch": 6.928805481084302, "grad_norm": 4.425650119781494, "learning_rate": 1.3042998262709821e-05, "loss": 0.6819, "num_input_tokens_seen": 26832168, "step": 46520 }, { "epoch": 6.929550193625261, "grad_norm": 2.8990705013275146, "learning_rate": 1.303729133507208e-05, "loss": 0.645, "num_input_tokens_seen": 26835080, "step": 46525 }, { "epoch": 6.930294906166219, "grad_norm": 1.905608057975769, "learning_rate": 1.3031585215825545e-05, "loss": 0.7366, "num_input_tokens_seen": 26838088, "step": 46530 }, { "epoch": 6.931039618707179, "grad_norm": 1.3890434503555298, "learning_rate": 1.302587990535582e-05, "loss": 0.6105, "num_input_tokens_seen": 26840872, "step": 46535 }, { "epoch": 6.931784331248139, "grad_norm": 3.430574893951416, "learning_rate": 1.302017540404845e-05, "loss": 0.6974, "num_input_tokens_seen": 26843880, "step": 46540 }, { "epoch": 6.932529043789097, "grad_norm": 4.351464748382568, "learning_rate": 1.3014471712288917e-05, "loss": 0.576, "num_input_tokens_seen": 26846856, "step": 46545 }, { "epoch": 6.933273756330056, "grad_norm": 2.7396557331085205, "learning_rate": 1.300876883046267e-05, "loss": 0.5715, "num_input_tokens_seen": 26849640, "step": 46550 }, { "epoch": 6.934018468871016, "grad_norm": 2.5298590660095215, "learning_rate": 1.3003066758955068e-05, "loss": 0.4711, "num_input_tokens_seen": 26852648, "step": 46555 }, { "epoch": 6.9347631814119755, "grad_norm": 1.983477234840393, "learning_rate": 1.2997365498151431e-05, "loss": 0.6516, "num_input_tokens_seen": 26855560, "step": 46560 }, { "epoch": 6.935507893952934, "grad_norm": 1.700605034828186, "learning_rate": 1.2991665048437036e-05, "loss": 0.5855, "num_input_tokens_seen": 26858472, "step": 46565 }, { "epoch": 6.936252606493893, "grad_norm": 3.3389694690704346, "learning_rate": 1.2985965410197092e-05, "loss": 0.662, "num_input_tokens_seen": 26861096, "step": 46570 }, { "epoch": 6.936997319034853, "grad_norm": 1.724174976348877, "learning_rate": 1.2980266583816763e-05, "loss": 0.5184, "num_input_tokens_seen": 26864008, "step": 46575 }, { "epoch": 6.937742031575811, "grad_norm": 2.6584115028381348, "learning_rate": 1.2974568569681147e-05, "loss": 0.5053, "num_input_tokens_seen": 26866760, "step": 46580 }, { "epoch": 6.938486744116771, "grad_norm": 2.3605411052703857, "learning_rate": 1.2968871368175306e-05, "loss": 0.5683, "num_input_tokens_seen": 26869416, "step": 46585 }, { "epoch": 6.93923145665773, "grad_norm": 2.579514503479004, "learning_rate": 1.2963174979684223e-05, "loss": 0.6816, "num_input_tokens_seen": 26872296, "step": 46590 }, { "epoch": 6.939976169198689, "grad_norm": 1.2541522979736328, "learning_rate": 1.2957479404592826e-05, "loss": 0.5595, "num_input_tokens_seen": 26875336, "step": 46595 }, { "epoch": 6.940720881739648, "grad_norm": 2.534209728240967, "learning_rate": 1.2951784643286014e-05, "loss": 0.7539, "num_input_tokens_seen": 26877896, "step": 46600 }, { "epoch": 6.941465594280608, "grad_norm": 1.6794649362564087, "learning_rate": 1.2946090696148606e-05, "loss": 0.665, "num_input_tokens_seen": 26880872, "step": 46605 }, { "epoch": 6.942210306821567, "grad_norm": 1.7539204359054565, "learning_rate": 1.2940397563565381e-05, "loss": 0.6743, "num_input_tokens_seen": 26883944, "step": 46610 }, { "epoch": 6.942955019362526, "grad_norm": 2.432208299636841, "learning_rate": 1.2934705245921058e-05, "loss": 0.4977, "num_input_tokens_seen": 26887016, "step": 46615 }, { "epoch": 6.943699731903485, "grad_norm": 1.5812424421310425, "learning_rate": 1.2929013743600316e-05, "loss": 0.4151, "num_input_tokens_seen": 26889992, "step": 46620 }, { "epoch": 6.944444444444445, "grad_norm": 2.8837170600891113, "learning_rate": 1.2923323056987733e-05, "loss": 0.696, "num_input_tokens_seen": 26892744, "step": 46625 }, { "epoch": 6.945189156985403, "grad_norm": 2.6785738468170166, "learning_rate": 1.2917633186467886e-05, "loss": 0.6733, "num_input_tokens_seen": 26895400, "step": 46630 }, { "epoch": 6.945933869526363, "grad_norm": 1.8853511810302734, "learning_rate": 1.2911944132425261e-05, "loss": 0.5421, "num_input_tokens_seen": 26898152, "step": 46635 }, { "epoch": 6.946678582067322, "grad_norm": 1.675631046295166, "learning_rate": 1.2906255895244301e-05, "loss": 0.6282, "num_input_tokens_seen": 26900840, "step": 46640 }, { "epoch": 6.9474232946082815, "grad_norm": 2.156038284301758, "learning_rate": 1.29005684753094e-05, "loss": 0.5509, "num_input_tokens_seen": 26903688, "step": 46645 }, { "epoch": 6.94816800714924, "grad_norm": 1.5393860340118408, "learning_rate": 1.2894881873004889e-05, "loss": 0.7131, "num_input_tokens_seen": 26906760, "step": 46650 }, { "epoch": 6.9489127196902, "grad_norm": 0.92528235912323, "learning_rate": 1.288919608871505e-05, "loss": 0.5622, "num_input_tokens_seen": 26909512, "step": 46655 }, { "epoch": 6.949657432231159, "grad_norm": 5.002768039703369, "learning_rate": 1.2883511122824093e-05, "loss": 0.9223, "num_input_tokens_seen": 26912296, "step": 46660 }, { "epoch": 6.950402144772118, "grad_norm": 1.4808372259140015, "learning_rate": 1.2877826975716195e-05, "loss": 0.5301, "num_input_tokens_seen": 26914888, "step": 46665 }, { "epoch": 6.951146857313077, "grad_norm": 1.9970731735229492, "learning_rate": 1.2872143647775476e-05, "loss": 0.5877, "num_input_tokens_seen": 26918120, "step": 46670 }, { "epoch": 6.951891569854037, "grad_norm": 1.8263378143310547, "learning_rate": 1.2866461139385966e-05, "loss": 0.5355, "num_input_tokens_seen": 26920968, "step": 46675 }, { "epoch": 6.952636282394995, "grad_norm": 3.654252529144287, "learning_rate": 1.2860779450931684e-05, "loss": 0.6567, "num_input_tokens_seen": 26924072, "step": 46680 }, { "epoch": 6.953380994935955, "grad_norm": 1.6606172323226929, "learning_rate": 1.285509858279657e-05, "loss": 0.4678, "num_input_tokens_seen": 26926920, "step": 46685 }, { "epoch": 6.954125707476914, "grad_norm": 2.1763014793395996, "learning_rate": 1.2849418535364527e-05, "loss": 0.556, "num_input_tokens_seen": 26929640, "step": 46690 }, { "epoch": 6.9548704200178735, "grad_norm": 2.2900638580322266, "learning_rate": 1.284373930901937e-05, "loss": 0.7635, "num_input_tokens_seen": 26932424, "step": 46695 }, { "epoch": 6.955615132558832, "grad_norm": 1.8119945526123047, "learning_rate": 1.2838060904144888e-05, "loss": 0.5964, "num_input_tokens_seen": 26935272, "step": 46700 }, { "epoch": 6.956359845099792, "grad_norm": 3.6939759254455566, "learning_rate": 1.28323833211248e-05, "loss": 0.6381, "num_input_tokens_seen": 26938216, "step": 46705 }, { "epoch": 6.957104557640751, "grad_norm": 11.195178985595703, "learning_rate": 1.2826706560342788e-05, "loss": 0.5788, "num_input_tokens_seen": 26941192, "step": 46710 }, { "epoch": 6.957849270181709, "grad_norm": 2.1110947132110596, "learning_rate": 1.2821030622182444e-05, "loss": 0.6141, "num_input_tokens_seen": 26944040, "step": 46715 }, { "epoch": 6.958593982722669, "grad_norm": 3.826406478881836, "learning_rate": 1.2815355507027344e-05, "loss": 0.6017, "num_input_tokens_seen": 26946536, "step": 46720 }, { "epoch": 6.959338695263629, "grad_norm": 1.3965694904327393, "learning_rate": 1.280968121526097e-05, "loss": 0.4768, "num_input_tokens_seen": 26949320, "step": 46725 }, { "epoch": 6.9600834078045875, "grad_norm": 2.4422953128814697, "learning_rate": 1.2804007747266778e-05, "loss": 0.7256, "num_input_tokens_seen": 26952264, "step": 46730 }, { "epoch": 6.960828120345546, "grad_norm": 1.9938359260559082, "learning_rate": 1.2798335103428157e-05, "loss": 0.5688, "num_input_tokens_seen": 26955080, "step": 46735 }, { "epoch": 6.961572832886506, "grad_norm": 3.193006753921509, "learning_rate": 1.2792663284128443e-05, "loss": 0.7386, "num_input_tokens_seen": 26957768, "step": 46740 }, { "epoch": 6.962317545427465, "grad_norm": 1.9229881763458252, "learning_rate": 1.2786992289750909e-05, "loss": 0.584, "num_input_tokens_seen": 26960936, "step": 46745 }, { "epoch": 6.963062257968424, "grad_norm": 4.603039741516113, "learning_rate": 1.2781322120678796e-05, "loss": 0.5813, "num_input_tokens_seen": 26963944, "step": 46750 }, { "epoch": 6.963806970509383, "grad_norm": 1.1294517517089844, "learning_rate": 1.2775652777295252e-05, "loss": 0.5732, "num_input_tokens_seen": 26966920, "step": 46755 }, { "epoch": 6.964551683050343, "grad_norm": 3.4108054637908936, "learning_rate": 1.2769984259983386e-05, "loss": 0.65, "num_input_tokens_seen": 26970088, "step": 46760 }, { "epoch": 6.965296395591301, "grad_norm": 3.29158878326416, "learning_rate": 1.2764316569126258e-05, "loss": 0.5753, "num_input_tokens_seen": 26972968, "step": 46765 }, { "epoch": 6.966041108132261, "grad_norm": 1.8004521131515503, "learning_rate": 1.275864970510687e-05, "loss": 0.4484, "num_input_tokens_seen": 26975752, "step": 46770 }, { "epoch": 6.96678582067322, "grad_norm": 4.095076560974121, "learning_rate": 1.2752983668308167e-05, "loss": 0.7117, "num_input_tokens_seen": 26978856, "step": 46775 }, { "epoch": 6.9675305332141795, "grad_norm": 2.385547161102295, "learning_rate": 1.2747318459113033e-05, "loss": 0.5734, "num_input_tokens_seen": 26981608, "step": 46780 }, { "epoch": 6.968275245755138, "grad_norm": 3.1643898487091064, "learning_rate": 1.2741654077904313e-05, "loss": 0.6452, "num_input_tokens_seen": 26984456, "step": 46785 }, { "epoch": 6.969019958296098, "grad_norm": 1.8110469579696655, "learning_rate": 1.273599052506476e-05, "loss": 0.6161, "num_input_tokens_seen": 26987208, "step": 46790 }, { "epoch": 6.969764670837057, "grad_norm": 4.332286834716797, "learning_rate": 1.2730327800977116e-05, "loss": 0.5574, "num_input_tokens_seen": 26990184, "step": 46795 }, { "epoch": 6.970509383378016, "grad_norm": 3.443721055984497, "learning_rate": 1.2724665906024025e-05, "loss": 0.5346, "num_input_tokens_seen": 26992872, "step": 46800 }, { "epoch": 6.971254095918975, "grad_norm": 3.7622334957122803, "learning_rate": 1.2719004840588106e-05, "loss": 0.5957, "num_input_tokens_seen": 26995624, "step": 46805 }, { "epoch": 6.971998808459935, "grad_norm": 1.9219472408294678, "learning_rate": 1.2713344605051905e-05, "loss": 0.6635, "num_input_tokens_seen": 26998760, "step": 46810 }, { "epoch": 6.9727435210008935, "grad_norm": 2.367372512817383, "learning_rate": 1.2707685199797926e-05, "loss": 0.379, "num_input_tokens_seen": 27001448, "step": 46815 }, { "epoch": 6.973488233541853, "grad_norm": 2.6381609439849854, "learning_rate": 1.270202662520861e-05, "loss": 0.6719, "num_input_tokens_seen": 27004264, "step": 46820 }, { "epoch": 6.974232946082812, "grad_norm": 1.9633445739746094, "learning_rate": 1.2696368881666325e-05, "loss": 0.5, "num_input_tokens_seen": 27007176, "step": 46825 }, { "epoch": 6.9749776586237715, "grad_norm": 1.5614373683929443, "learning_rate": 1.2690711969553412e-05, "loss": 0.7683, "num_input_tokens_seen": 27010088, "step": 46830 }, { "epoch": 6.97572237116473, "grad_norm": 3.3880696296691895, "learning_rate": 1.2685055889252146e-05, "loss": 0.4409, "num_input_tokens_seen": 27012776, "step": 46835 }, { "epoch": 6.97646708370569, "grad_norm": 3.3526148796081543, "learning_rate": 1.2679400641144723e-05, "loss": 0.7243, "num_input_tokens_seen": 27015688, "step": 46840 }, { "epoch": 6.977211796246649, "grad_norm": 1.4553390741348267, "learning_rate": 1.2673746225613315e-05, "loss": 0.6147, "num_input_tokens_seen": 27018568, "step": 46845 }, { "epoch": 6.977956508787608, "grad_norm": 2.3252720832824707, "learning_rate": 1.2668092643040018e-05, "loss": 0.557, "num_input_tokens_seen": 27021288, "step": 46850 }, { "epoch": 6.978701221328567, "grad_norm": 2.416959047317505, "learning_rate": 1.2662439893806899e-05, "loss": 0.6256, "num_input_tokens_seen": 27024200, "step": 46855 }, { "epoch": 6.979445933869527, "grad_norm": 3.273679256439209, "learning_rate": 1.2656787978295913e-05, "loss": 0.6093, "num_input_tokens_seen": 27027048, "step": 46860 }, { "epoch": 6.9801906464104855, "grad_norm": 2.8435072898864746, "learning_rate": 1.265113689688902e-05, "loss": 0.4445, "num_input_tokens_seen": 27029896, "step": 46865 }, { "epoch": 6.980935358951445, "grad_norm": 1.815207839012146, "learning_rate": 1.2645486649968085e-05, "loss": 0.5451, "num_input_tokens_seen": 27032616, "step": 46870 }, { "epoch": 6.981680071492404, "grad_norm": 2.08575701713562, "learning_rate": 1.2639837237914943e-05, "loss": 0.4719, "num_input_tokens_seen": 27035304, "step": 46875 }, { "epoch": 6.982424784033363, "grad_norm": 1.838005542755127, "learning_rate": 1.2634188661111335e-05, "loss": 0.4706, "num_input_tokens_seen": 27038184, "step": 46880 }, { "epoch": 6.983169496574322, "grad_norm": 2.5723443031311035, "learning_rate": 1.2628540919938991e-05, "loss": 0.8694, "num_input_tokens_seen": 27040904, "step": 46885 }, { "epoch": 6.983914209115282, "grad_norm": 2.16841983795166, "learning_rate": 1.2622894014779547e-05, "loss": 0.5119, "num_input_tokens_seen": 27043720, "step": 46890 }, { "epoch": 6.984658921656241, "grad_norm": 1.764111876487732, "learning_rate": 1.2617247946014604e-05, "loss": 0.6231, "num_input_tokens_seen": 27046984, "step": 46895 }, { "epoch": 6.9854036341971995, "grad_norm": 2.0353481769561768, "learning_rate": 1.2611602714025696e-05, "loss": 0.5453, "num_input_tokens_seen": 27049800, "step": 46900 }, { "epoch": 6.986148346738159, "grad_norm": 1.7890859842300415, "learning_rate": 1.2605958319194311e-05, "loss": 0.5977, "num_input_tokens_seen": 27053064, "step": 46905 }, { "epoch": 6.986893059279119, "grad_norm": 2.175919532775879, "learning_rate": 1.2600314761901874e-05, "loss": 0.5847, "num_input_tokens_seen": 27055816, "step": 46910 }, { "epoch": 6.9876377718200775, "grad_norm": 1.6393747329711914, "learning_rate": 1.259467204252976e-05, "loss": 0.6032, "num_input_tokens_seen": 27058632, "step": 46915 }, { "epoch": 6.988382484361036, "grad_norm": 3.447758674621582, "learning_rate": 1.2589030161459275e-05, "loss": 0.6647, "num_input_tokens_seen": 27061480, "step": 46920 }, { "epoch": 6.989127196901996, "grad_norm": 1.2074321508407593, "learning_rate": 1.2583389119071659e-05, "loss": 0.5072, "num_input_tokens_seen": 27064328, "step": 46925 }, { "epoch": 6.989871909442955, "grad_norm": 2.2366433143615723, "learning_rate": 1.2577748915748127e-05, "loss": 0.5646, "num_input_tokens_seen": 27067336, "step": 46930 }, { "epoch": 6.990616621983914, "grad_norm": 3.242581605911255, "learning_rate": 1.2572109551869815e-05, "loss": 0.7531, "num_input_tokens_seen": 27070120, "step": 46935 }, { "epoch": 6.991361334524873, "grad_norm": 1.7755532264709473, "learning_rate": 1.2566471027817817e-05, "loss": 0.6499, "num_input_tokens_seen": 27073064, "step": 46940 }, { "epoch": 6.992106047065833, "grad_norm": 0.8718041777610779, "learning_rate": 1.256083334397315e-05, "loss": 0.5034, "num_input_tokens_seen": 27075784, "step": 46945 }, { "epoch": 6.9928507596067915, "grad_norm": 1.8799598217010498, "learning_rate": 1.2555196500716803e-05, "loss": 0.7518, "num_input_tokens_seen": 27078856, "step": 46950 }, { "epoch": 6.993595472147751, "grad_norm": 1.0513873100280762, "learning_rate": 1.2549560498429683e-05, "loss": 0.5071, "num_input_tokens_seen": 27081992, "step": 46955 }, { "epoch": 6.99434018468871, "grad_norm": 1.979602336883545, "learning_rate": 1.2543925337492631e-05, "loss": 0.5957, "num_input_tokens_seen": 27084840, "step": 46960 }, { "epoch": 6.9950848972296695, "grad_norm": 2.53875732421875, "learning_rate": 1.2538291018286462e-05, "loss": 0.5566, "num_input_tokens_seen": 27087688, "step": 46965 }, { "epoch": 6.995829609770628, "grad_norm": 4.475015163421631, "learning_rate": 1.2532657541191922e-05, "loss": 0.6397, "num_input_tokens_seen": 27090760, "step": 46970 }, { "epoch": 6.996574322311588, "grad_norm": 2.4471609592437744, "learning_rate": 1.2527024906589698e-05, "loss": 0.5122, "num_input_tokens_seen": 27093640, "step": 46975 }, { "epoch": 6.997319034852547, "grad_norm": 1.8413844108581543, "learning_rate": 1.252139311486042e-05, "loss": 0.5139, "num_input_tokens_seen": 27096840, "step": 46980 }, { "epoch": 6.998063747393506, "grad_norm": 2.1543731689453125, "learning_rate": 1.2515762166384668e-05, "loss": 0.5692, "num_input_tokens_seen": 27099976, "step": 46985 }, { "epoch": 6.998808459934465, "grad_norm": 2.979125499725342, "learning_rate": 1.2510132061542939e-05, "loss": 0.6966, "num_input_tokens_seen": 27103112, "step": 46990 }, { "epoch": 6.999553172475425, "grad_norm": 1.8866726160049438, "learning_rate": 1.2504502800715723e-05, "loss": 0.6635, "num_input_tokens_seen": 27106056, "step": 46995 }, { "epoch": 7.0, "eval_loss": 0.6671453714370728, "eval_runtime": 74.2335, "eval_samples_per_second": 40.197, "eval_steps_per_second": 10.049, "num_input_tokens_seen": 27107328, "step": 46998 }, { "epoch": 7.0002978850163835, "grad_norm": 2.40049147605896, "learning_rate": 1.2498874384283389e-05, "loss": 0.7574, "num_input_tokens_seen": 27108544, "step": 47000 }, { "epoch": 7.001042597557343, "grad_norm": 1.6449867486953735, "learning_rate": 1.24932468126263e-05, "loss": 0.6061, "num_input_tokens_seen": 27111360, "step": 47005 }, { "epoch": 7.001787310098302, "grad_norm": 3.8953261375427246, "learning_rate": 1.248762008612474e-05, "loss": 0.5041, "num_input_tokens_seen": 27114080, "step": 47010 }, { "epoch": 7.0025320226392616, "grad_norm": 1.832187533378601, "learning_rate": 1.2481994205158948e-05, "loss": 0.5633, "num_input_tokens_seen": 27117216, "step": 47015 }, { "epoch": 7.00327673518022, "grad_norm": 1.3298656940460205, "learning_rate": 1.2476369170109098e-05, "loss": 0.579, "num_input_tokens_seen": 27119840, "step": 47020 }, { "epoch": 7.00402144772118, "grad_norm": 1.0850454568862915, "learning_rate": 1.2470744981355296e-05, "loss": 0.4027, "num_input_tokens_seen": 27122528, "step": 47025 }, { "epoch": 7.004766160262139, "grad_norm": 2.405578136444092, "learning_rate": 1.2465121639277605e-05, "loss": 0.5654, "num_input_tokens_seen": 27125472, "step": 47030 }, { "epoch": 7.005510872803098, "grad_norm": 5.201127052307129, "learning_rate": 1.2459499144256042e-05, "loss": 0.7079, "num_input_tokens_seen": 27128704, "step": 47035 }, { "epoch": 7.006255585344057, "grad_norm": 1.619199514389038, "learning_rate": 1.2453877496670532e-05, "loss": 0.5557, "num_input_tokens_seen": 27131712, "step": 47040 }, { "epoch": 7.007000297885017, "grad_norm": 1.7606101036071777, "learning_rate": 1.244825669690097e-05, "loss": 0.5275, "num_input_tokens_seen": 27134688, "step": 47045 }, { "epoch": 7.0077450104259755, "grad_norm": 3.5678179264068604, "learning_rate": 1.24426367453272e-05, "loss": 0.6447, "num_input_tokens_seen": 27137632, "step": 47050 }, { "epoch": 7.008489722966935, "grad_norm": 1.9584689140319824, "learning_rate": 1.2437017642328971e-05, "loss": 0.7815, "num_input_tokens_seen": 27140448, "step": 47055 }, { "epoch": 7.009234435507894, "grad_norm": 2.5382959842681885, "learning_rate": 1.2431399388286017e-05, "loss": 0.6217, "num_input_tokens_seen": 27143328, "step": 47060 }, { "epoch": 7.009979148048854, "grad_norm": 6.469211578369141, "learning_rate": 1.242578198357799e-05, "loss": 0.5499, "num_input_tokens_seen": 27146144, "step": 47065 }, { "epoch": 7.010723860589812, "grad_norm": 2.4496519565582275, "learning_rate": 1.2420165428584493e-05, "loss": 0.5463, "num_input_tokens_seen": 27148864, "step": 47070 }, { "epoch": 7.011468573130771, "grad_norm": 3.2923336029052734, "learning_rate": 1.2414549723685082e-05, "loss": 0.6384, "num_input_tokens_seen": 27152288, "step": 47075 }, { "epoch": 7.012213285671731, "grad_norm": 3.137733221054077, "learning_rate": 1.240893486925922e-05, "loss": 0.8501, "num_input_tokens_seen": 27155264, "step": 47080 }, { "epoch": 7.0129579982126895, "grad_norm": 3.1495885848999023, "learning_rate": 1.2403320865686361e-05, "loss": 0.6273, "num_input_tokens_seen": 27157920, "step": 47085 }, { "epoch": 7.013702710753649, "grad_norm": 3.6033945083618164, "learning_rate": 1.239770771334585e-05, "loss": 0.6466, "num_input_tokens_seen": 27161216, "step": 47090 }, { "epoch": 7.014447423294608, "grad_norm": 2.3156380653381348, "learning_rate": 1.2392095412617017e-05, "loss": 0.3434, "num_input_tokens_seen": 27164256, "step": 47095 }, { "epoch": 7.0151921358355676, "grad_norm": 1.7184053659439087, "learning_rate": 1.2386483963879114e-05, "loss": 0.6365, "num_input_tokens_seen": 27167168, "step": 47100 }, { "epoch": 7.015936848376526, "grad_norm": 1.961884617805481, "learning_rate": 1.2380873367511344e-05, "loss": 0.5574, "num_input_tokens_seen": 27169824, "step": 47105 }, { "epoch": 7.016681560917486, "grad_norm": 3.850404739379883, "learning_rate": 1.2375263623892846e-05, "loss": 0.5547, "num_input_tokens_seen": 27172544, "step": 47110 }, { "epoch": 7.017426273458445, "grad_norm": 1.7800745964050293, "learning_rate": 1.2369654733402714e-05, "loss": 0.5239, "num_input_tokens_seen": 27175712, "step": 47115 }, { "epoch": 7.018170985999404, "grad_norm": 2.6857542991638184, "learning_rate": 1.2364046696419962e-05, "loss": 0.4587, "num_input_tokens_seen": 27178368, "step": 47120 }, { "epoch": 7.018915698540363, "grad_norm": 2.03718638420105, "learning_rate": 1.235843951332355e-05, "loss": 0.5632, "num_input_tokens_seen": 27181120, "step": 47125 }, { "epoch": 7.019660411081323, "grad_norm": 2.327023506164551, "learning_rate": 1.2352833184492402e-05, "loss": 0.6324, "num_input_tokens_seen": 27183744, "step": 47130 }, { "epoch": 7.0204051236222815, "grad_norm": 1.7108598947525024, "learning_rate": 1.2347227710305365e-05, "loss": 0.5748, "num_input_tokens_seen": 27186784, "step": 47135 }, { "epoch": 7.021149836163241, "grad_norm": 2.7049267292022705, "learning_rate": 1.2341623091141238e-05, "loss": 0.4788, "num_input_tokens_seen": 27189600, "step": 47140 }, { "epoch": 7.0218945487042, "grad_norm": 2.768655300140381, "learning_rate": 1.2336019327378756e-05, "loss": 0.7464, "num_input_tokens_seen": 27192480, "step": 47145 }, { "epoch": 7.02263926124516, "grad_norm": 1.5480906963348389, "learning_rate": 1.2330416419396612e-05, "loss": 0.4004, "num_input_tokens_seen": 27195296, "step": 47150 }, { "epoch": 7.023383973786118, "grad_norm": 3.2708580493927, "learning_rate": 1.2324814367573411e-05, "loss": 0.5901, "num_input_tokens_seen": 27198592, "step": 47155 }, { "epoch": 7.024128686327078, "grad_norm": 4.168888568878174, "learning_rate": 1.2319213172287716e-05, "loss": 0.484, "num_input_tokens_seen": 27201824, "step": 47160 }, { "epoch": 7.024873398868037, "grad_norm": 1.8715956211090088, "learning_rate": 1.231361283391804e-05, "loss": 0.6545, "num_input_tokens_seen": 27204576, "step": 47165 }, { "epoch": 7.025618111408996, "grad_norm": 2.8898231983184814, "learning_rate": 1.2308013352842826e-05, "loss": 0.6098, "num_input_tokens_seen": 27207488, "step": 47170 }, { "epoch": 7.026362823949955, "grad_norm": 3.5722508430480957, "learning_rate": 1.230241472944047e-05, "loss": 0.6857, "num_input_tokens_seen": 27210336, "step": 47175 }, { "epoch": 7.027107536490915, "grad_norm": 1.387490153312683, "learning_rate": 1.22968169640893e-05, "loss": 0.5825, "num_input_tokens_seen": 27213536, "step": 47180 }, { "epoch": 7.0278522490318736, "grad_norm": 1.6215842962265015, "learning_rate": 1.2291220057167602e-05, "loss": 0.7198, "num_input_tokens_seen": 27216288, "step": 47185 }, { "epoch": 7.028596961572833, "grad_norm": 4.530144214630127, "learning_rate": 1.2285624009053573e-05, "loss": 0.8069, "num_input_tokens_seen": 27219072, "step": 47190 }, { "epoch": 7.029341674113792, "grad_norm": 2.316479444503784, "learning_rate": 1.2280028820125391e-05, "loss": 0.4916, "num_input_tokens_seen": 27222016, "step": 47195 }, { "epoch": 7.030086386654752, "grad_norm": 2.278205633163452, "learning_rate": 1.2274434490761135e-05, "loss": 0.6295, "num_input_tokens_seen": 27224960, "step": 47200 }, { "epoch": 7.03083109919571, "grad_norm": 2.6740965843200684, "learning_rate": 1.226884102133886e-05, "loss": 0.509, "num_input_tokens_seen": 27228000, "step": 47205 }, { "epoch": 7.03157581173667, "grad_norm": 1.8378841876983643, "learning_rate": 1.2263248412236547e-05, "loss": 0.5151, "num_input_tokens_seen": 27230688, "step": 47210 }, { "epoch": 7.032320524277629, "grad_norm": 2.391983985900879, "learning_rate": 1.2257656663832129e-05, "loss": 0.6981, "num_input_tokens_seen": 27233152, "step": 47215 }, { "epoch": 7.033065236818588, "grad_norm": 2.4170000553131104, "learning_rate": 1.225206577650346e-05, "loss": 0.5449, "num_input_tokens_seen": 27236576, "step": 47220 }, { "epoch": 7.033809949359547, "grad_norm": 2.096916437149048, "learning_rate": 1.2246475750628355e-05, "loss": 0.3806, "num_input_tokens_seen": 27239232, "step": 47225 }, { "epoch": 7.034554661900507, "grad_norm": 1.3214603662490845, "learning_rate": 1.2240886586584568e-05, "loss": 0.5668, "num_input_tokens_seen": 27242048, "step": 47230 }, { "epoch": 7.035299374441466, "grad_norm": 3.484011650085449, "learning_rate": 1.2235298284749803e-05, "loss": 0.617, "num_input_tokens_seen": 27244800, "step": 47235 }, { "epoch": 7.036044086982425, "grad_norm": 0.9155939817428589, "learning_rate": 1.2229710845501669e-05, "loss": 0.5339, "num_input_tokens_seen": 27247584, "step": 47240 }, { "epoch": 7.036788799523384, "grad_norm": 2.42244815826416, "learning_rate": 1.2224124269217756e-05, "loss": 0.4686, "num_input_tokens_seen": 27250464, "step": 47245 }, { "epoch": 7.037533512064343, "grad_norm": 1.0007082223892212, "learning_rate": 1.2218538556275594e-05, "loss": 0.5598, "num_input_tokens_seen": 27253216, "step": 47250 }, { "epoch": 7.038278224605302, "grad_norm": 2.13112735748291, "learning_rate": 1.2212953707052619e-05, "loss": 0.4215, "num_input_tokens_seen": 27256064, "step": 47255 }, { "epoch": 7.039022937146261, "grad_norm": 2.8912975788116455, "learning_rate": 1.2207369721926243e-05, "loss": 0.6144, "num_input_tokens_seen": 27259040, "step": 47260 }, { "epoch": 7.039767649687221, "grad_norm": 2.995344638824463, "learning_rate": 1.2201786601273813e-05, "loss": 0.4105, "num_input_tokens_seen": 27261888, "step": 47265 }, { "epoch": 7.0405123622281796, "grad_norm": 2.266308069229126, "learning_rate": 1.2196204345472609e-05, "loss": 0.6548, "num_input_tokens_seen": 27264544, "step": 47270 }, { "epoch": 7.041257074769139, "grad_norm": 7.2681427001953125, "learning_rate": 1.2190622954899869e-05, "loss": 0.9148, "num_input_tokens_seen": 27267392, "step": 47275 }, { "epoch": 7.042001787310098, "grad_norm": 3.7808735370635986, "learning_rate": 1.218504242993274e-05, "loss": 0.4991, "num_input_tokens_seen": 27270112, "step": 47280 }, { "epoch": 7.042746499851058, "grad_norm": 4.668670654296875, "learning_rate": 1.2179462770948355e-05, "loss": 0.5842, "num_input_tokens_seen": 27272800, "step": 47285 }, { "epoch": 7.043491212392016, "grad_norm": 4.324021816253662, "learning_rate": 1.2173883978323739e-05, "loss": 0.474, "num_input_tokens_seen": 27275424, "step": 47290 }, { "epoch": 7.044235924932976, "grad_norm": 1.9734737873077393, "learning_rate": 1.2168306052435896e-05, "loss": 0.6386, "num_input_tokens_seen": 27278592, "step": 47295 }, { "epoch": 7.044980637473935, "grad_norm": 2.187683343887329, "learning_rate": 1.216272899366176e-05, "loss": 0.4769, "num_input_tokens_seen": 27281984, "step": 47300 }, { "epoch": 7.045725350014894, "grad_norm": 1.742174506187439, "learning_rate": 1.2157152802378207e-05, "loss": 0.6989, "num_input_tokens_seen": 27284832, "step": 47305 }, { "epoch": 7.046470062555853, "grad_norm": 2.200085163116455, "learning_rate": 1.2151577478962054e-05, "loss": 0.7008, "num_input_tokens_seen": 27287808, "step": 47310 }, { "epoch": 7.047214775096813, "grad_norm": 2.948136806488037, "learning_rate": 1.2146003023790064e-05, "loss": 0.7541, "num_input_tokens_seen": 27290592, "step": 47315 }, { "epoch": 7.047959487637772, "grad_norm": 1.6914844512939453, "learning_rate": 1.2140429437238932e-05, "loss": 0.5134, "num_input_tokens_seen": 27293248, "step": 47320 }, { "epoch": 7.048704200178731, "grad_norm": 3.0740866661071777, "learning_rate": 1.213485671968528e-05, "loss": 0.7908, "num_input_tokens_seen": 27296512, "step": 47325 }, { "epoch": 7.04944891271969, "grad_norm": 1.8791064023971558, "learning_rate": 1.2129284871505712e-05, "loss": 0.6907, "num_input_tokens_seen": 27299360, "step": 47330 }, { "epoch": 7.05019362526065, "grad_norm": 12.698692321777344, "learning_rate": 1.2123713893076741e-05, "loss": 0.707, "num_input_tokens_seen": 27302336, "step": 47335 }, { "epoch": 7.050938337801608, "grad_norm": 3.094175100326538, "learning_rate": 1.2118143784774832e-05, "loss": 0.4242, "num_input_tokens_seen": 27305344, "step": 47340 }, { "epoch": 7.051683050342568, "grad_norm": 2.2796273231506348, "learning_rate": 1.2112574546976397e-05, "loss": 0.7087, "num_input_tokens_seen": 27308128, "step": 47345 }, { "epoch": 7.052427762883527, "grad_norm": 1.7295812368392944, "learning_rate": 1.210700618005778e-05, "loss": 0.5685, "num_input_tokens_seen": 27310784, "step": 47350 }, { "epoch": 7.053172475424486, "grad_norm": 3.1715896129608154, "learning_rate": 1.2101438684395264e-05, "loss": 0.3912, "num_input_tokens_seen": 27313536, "step": 47355 }, { "epoch": 7.053917187965445, "grad_norm": 2.4179279804229736, "learning_rate": 1.2095872060365084e-05, "loss": 0.6335, "num_input_tokens_seen": 27316416, "step": 47360 }, { "epoch": 7.054661900506405, "grad_norm": 1.8319337368011475, "learning_rate": 1.20903063083434e-05, "loss": 0.6232, "num_input_tokens_seen": 27319264, "step": 47365 }, { "epoch": 7.055406613047364, "grad_norm": 2.435035467147827, "learning_rate": 1.2084741428706328e-05, "loss": 0.4663, "num_input_tokens_seen": 27322080, "step": 47370 }, { "epoch": 7.056151325588323, "grad_norm": 3.5628011226654053, "learning_rate": 1.207917742182992e-05, "loss": 0.5462, "num_input_tokens_seen": 27325056, "step": 47375 }, { "epoch": 7.056896038129282, "grad_norm": 3.24657940864563, "learning_rate": 1.2073614288090179e-05, "loss": 0.688, "num_input_tokens_seen": 27327840, "step": 47380 }, { "epoch": 7.057640750670242, "grad_norm": 3.331251621246338, "learning_rate": 1.206805202786302e-05, "loss": 0.5486, "num_input_tokens_seen": 27330720, "step": 47385 }, { "epoch": 7.0583854632112, "grad_norm": 1.9575783014297485, "learning_rate": 1.2062490641524327e-05, "loss": 0.7091, "num_input_tokens_seen": 27333536, "step": 47390 }, { "epoch": 7.05913017575216, "grad_norm": 2.3164467811584473, "learning_rate": 1.2056930129449918e-05, "loss": 0.3706, "num_input_tokens_seen": 27336416, "step": 47395 }, { "epoch": 7.059874888293119, "grad_norm": 4.406720161437988, "learning_rate": 1.2051370492015556e-05, "loss": 0.4784, "num_input_tokens_seen": 27339040, "step": 47400 }, { "epoch": 7.0606196008340785, "grad_norm": 2.765561580657959, "learning_rate": 1.2045811729596922e-05, "loss": 0.5616, "num_input_tokens_seen": 27342016, "step": 47405 }, { "epoch": 7.061364313375037, "grad_norm": 3.1967735290527344, "learning_rate": 1.2040253842569665e-05, "loss": 0.5593, "num_input_tokens_seen": 27344768, "step": 47410 }, { "epoch": 7.062109025915996, "grad_norm": 4.039437770843506, "learning_rate": 1.2034696831309369e-05, "loss": 0.5489, "num_input_tokens_seen": 27347776, "step": 47415 }, { "epoch": 7.062853738456956, "grad_norm": 3.3518028259277344, "learning_rate": 1.202914069619154e-05, "loss": 0.5417, "num_input_tokens_seen": 27350752, "step": 47420 }, { "epoch": 7.063598450997914, "grad_norm": 5.930285930633545, "learning_rate": 1.202358543759165e-05, "loss": 0.6523, "num_input_tokens_seen": 27353504, "step": 47425 }, { "epoch": 7.064343163538874, "grad_norm": 3.85349178314209, "learning_rate": 1.2018031055885093e-05, "loss": 0.5954, "num_input_tokens_seen": 27356448, "step": 47430 }, { "epoch": 7.065087876079833, "grad_norm": 3.204413890838623, "learning_rate": 1.2012477551447222e-05, "loss": 0.5034, "num_input_tokens_seen": 27359040, "step": 47435 }, { "epoch": 7.065832588620792, "grad_norm": 2.151277542114258, "learning_rate": 1.2006924924653318e-05, "loss": 0.6112, "num_input_tokens_seen": 27361824, "step": 47440 }, { "epoch": 7.066577301161751, "grad_norm": 1.5771660804748535, "learning_rate": 1.2001373175878597e-05, "loss": 0.5048, "num_input_tokens_seen": 27364960, "step": 47445 }, { "epoch": 7.067322013702711, "grad_norm": 5.4280877113342285, "learning_rate": 1.1995822305498233e-05, "loss": 0.757, "num_input_tokens_seen": 27367840, "step": 47450 }, { "epoch": 7.06806672624367, "grad_norm": 2.6682045459747314, "learning_rate": 1.1990272313887321e-05, "loss": 0.5516, "num_input_tokens_seen": 27370368, "step": 47455 }, { "epoch": 7.068811438784629, "grad_norm": 4.115535736083984, "learning_rate": 1.1984723201420911e-05, "loss": 0.7957, "num_input_tokens_seen": 27373216, "step": 47460 }, { "epoch": 7.069556151325588, "grad_norm": 1.5399210453033447, "learning_rate": 1.1979174968473991e-05, "loss": 0.4907, "num_input_tokens_seen": 27376160, "step": 47465 }, { "epoch": 7.070300863866548, "grad_norm": 2.389317035675049, "learning_rate": 1.1973627615421487e-05, "loss": 0.5007, "num_input_tokens_seen": 27379424, "step": 47470 }, { "epoch": 7.071045576407506, "grad_norm": 1.4485759735107422, "learning_rate": 1.1968081142638268e-05, "loss": 0.4895, "num_input_tokens_seen": 27382240, "step": 47475 }, { "epoch": 7.071790288948466, "grad_norm": 7.171496868133545, "learning_rate": 1.1962535550499152e-05, "loss": 0.6742, "num_input_tokens_seen": 27385312, "step": 47480 }, { "epoch": 7.072535001489425, "grad_norm": 3.0161845684051514, "learning_rate": 1.1956990839378877e-05, "loss": 0.5551, "num_input_tokens_seen": 27388064, "step": 47485 }, { "epoch": 7.0732797140303845, "grad_norm": 3.616513967514038, "learning_rate": 1.1951447009652119e-05, "loss": 0.5465, "num_input_tokens_seen": 27391136, "step": 47490 }, { "epoch": 7.074024426571343, "grad_norm": 2.2367475032806396, "learning_rate": 1.1945904061693524e-05, "loss": 0.6061, "num_input_tokens_seen": 27393824, "step": 47495 }, { "epoch": 7.074769139112303, "grad_norm": 2.9957361221313477, "learning_rate": 1.1940361995877658e-05, "loss": 0.5006, "num_input_tokens_seen": 27396704, "step": 47500 }, { "epoch": 7.075513851653262, "grad_norm": 4.27882194519043, "learning_rate": 1.1934820812579031e-05, "loss": 0.6376, "num_input_tokens_seen": 27399616, "step": 47505 }, { "epoch": 7.076258564194221, "grad_norm": 3.6896729469299316, "learning_rate": 1.1929280512172095e-05, "loss": 0.5219, "num_input_tokens_seen": 27402496, "step": 47510 }, { "epoch": 7.07700327673518, "grad_norm": 2.471637010574341, "learning_rate": 1.1923741095031248e-05, "loss": 0.5641, "num_input_tokens_seen": 27405280, "step": 47515 }, { "epoch": 7.07774798927614, "grad_norm": 3.4635353088378906, "learning_rate": 1.1918202561530813e-05, "loss": 0.6617, "num_input_tokens_seen": 27408320, "step": 47520 }, { "epoch": 7.078492701817098, "grad_norm": 1.8071826696395874, "learning_rate": 1.1912664912045057e-05, "loss": 0.5139, "num_input_tokens_seen": 27411360, "step": 47525 }, { "epoch": 7.079237414358058, "grad_norm": 1.5194510221481323, "learning_rate": 1.1907128146948193e-05, "loss": 0.6156, "num_input_tokens_seen": 27414336, "step": 47530 }, { "epoch": 7.079982126899017, "grad_norm": 1.521297812461853, "learning_rate": 1.190159226661438e-05, "loss": 0.6039, "num_input_tokens_seen": 27417088, "step": 47535 }, { "epoch": 7.0807268394399765, "grad_norm": 4.058570861816406, "learning_rate": 1.1896057271417707e-05, "loss": 0.3998, "num_input_tokens_seen": 27419840, "step": 47540 }, { "epoch": 7.081471551980935, "grad_norm": 2.306023359298706, "learning_rate": 1.1890523161732214e-05, "loss": 0.4436, "num_input_tokens_seen": 27422752, "step": 47545 }, { "epoch": 7.082216264521895, "grad_norm": 1.5179556608200073, "learning_rate": 1.188498993793186e-05, "loss": 0.4339, "num_input_tokens_seen": 27425568, "step": 47550 }, { "epoch": 7.082960977062854, "grad_norm": 2.2123827934265137, "learning_rate": 1.187945760039056e-05, "loss": 0.5551, "num_input_tokens_seen": 27428672, "step": 47555 }, { "epoch": 7.083705689603813, "grad_norm": 2.1661720275878906, "learning_rate": 1.1873926149482183e-05, "loss": 0.7793, "num_input_tokens_seen": 27431232, "step": 47560 }, { "epoch": 7.084450402144772, "grad_norm": 2.9522364139556885, "learning_rate": 1.1868395585580503e-05, "loss": 0.6724, "num_input_tokens_seen": 27434144, "step": 47565 }, { "epoch": 7.085195114685732, "grad_norm": 3.4263393878936768, "learning_rate": 1.186286590905926e-05, "loss": 0.5534, "num_input_tokens_seen": 27437408, "step": 47570 }, { "epoch": 7.0859398272266905, "grad_norm": 2.3129754066467285, "learning_rate": 1.1857337120292123e-05, "loss": 0.6297, "num_input_tokens_seen": 27440256, "step": 47575 }, { "epoch": 7.08668453976765, "grad_norm": 3.633897304534912, "learning_rate": 1.1851809219652721e-05, "loss": 0.6122, "num_input_tokens_seen": 27443008, "step": 47580 }, { "epoch": 7.087429252308609, "grad_norm": 2.9708163738250732, "learning_rate": 1.1846282207514586e-05, "loss": 0.6894, "num_input_tokens_seen": 27445952, "step": 47585 }, { "epoch": 7.088173964849568, "grad_norm": 3.192408561706543, "learning_rate": 1.184075608425122e-05, "loss": 0.6325, "num_input_tokens_seen": 27448864, "step": 47590 }, { "epoch": 7.088918677390527, "grad_norm": 3.0462417602539062, "learning_rate": 1.1835230850236057e-05, "loss": 0.6508, "num_input_tokens_seen": 27451648, "step": 47595 }, { "epoch": 7.089663389931486, "grad_norm": 1.2004868984222412, "learning_rate": 1.1829706505842478e-05, "loss": 0.4723, "num_input_tokens_seen": 27455072, "step": 47600 }, { "epoch": 7.090408102472446, "grad_norm": 3.2700917720794678, "learning_rate": 1.1824183051443776e-05, "loss": 0.8043, "num_input_tokens_seen": 27457760, "step": 47605 }, { "epoch": 7.091152815013404, "grad_norm": 2.4085912704467773, "learning_rate": 1.1818660487413217e-05, "loss": 0.5291, "num_input_tokens_seen": 27460480, "step": 47610 }, { "epoch": 7.091897527554364, "grad_norm": 1.5524470806121826, "learning_rate": 1.1813138814123997e-05, "loss": 0.5575, "num_input_tokens_seen": 27463584, "step": 47615 }, { "epoch": 7.092642240095323, "grad_norm": 1.8269128799438477, "learning_rate": 1.1807618031949235e-05, "loss": 0.5843, "num_input_tokens_seen": 27466528, "step": 47620 }, { "epoch": 7.0933869526362825, "grad_norm": 2.490483283996582, "learning_rate": 1.1802098141262008e-05, "loss": 0.4793, "num_input_tokens_seen": 27469280, "step": 47625 }, { "epoch": 7.094131665177241, "grad_norm": 2.832282543182373, "learning_rate": 1.1796579142435332e-05, "loss": 0.5483, "num_input_tokens_seen": 27472192, "step": 47630 }, { "epoch": 7.094876377718201, "grad_norm": 3.2755956649780273, "learning_rate": 1.1791061035842158e-05, "loss": 0.6358, "num_input_tokens_seen": 27475136, "step": 47635 }, { "epoch": 7.09562109025916, "grad_norm": 2.6876349449157715, "learning_rate": 1.178554382185538e-05, "loss": 0.6095, "num_input_tokens_seen": 27477984, "step": 47640 }, { "epoch": 7.096365802800119, "grad_norm": 1.6414092779159546, "learning_rate": 1.1780027500847818e-05, "loss": 0.4956, "num_input_tokens_seen": 27480864, "step": 47645 }, { "epoch": 7.097110515341078, "grad_norm": 1.7504966259002686, "learning_rate": 1.177451207319226e-05, "loss": 0.4231, "num_input_tokens_seen": 27483744, "step": 47650 }, { "epoch": 7.097855227882038, "grad_norm": 2.481431722640991, "learning_rate": 1.1768997539261392e-05, "loss": 0.5116, "num_input_tokens_seen": 27486464, "step": 47655 }, { "epoch": 7.0985999404229965, "grad_norm": 2.697503089904785, "learning_rate": 1.176348389942788e-05, "loss": 0.5022, "num_input_tokens_seen": 27489408, "step": 47660 }, { "epoch": 7.099344652963956, "grad_norm": 4.710402965545654, "learning_rate": 1.175797115406431e-05, "loss": 0.6304, "num_input_tokens_seen": 27492160, "step": 47665 }, { "epoch": 7.100089365504915, "grad_norm": 2.8354673385620117, "learning_rate": 1.1752459303543209e-05, "loss": 0.6006, "num_input_tokens_seen": 27495296, "step": 47670 }, { "epoch": 7.1008340780458745, "grad_norm": 2.99702525138855, "learning_rate": 1.174694834823705e-05, "loss": 0.6538, "num_input_tokens_seen": 27498176, "step": 47675 }, { "epoch": 7.101578790586833, "grad_norm": 2.011707305908203, "learning_rate": 1.1741438288518248e-05, "loss": 0.4484, "num_input_tokens_seen": 27500864, "step": 47680 }, { "epoch": 7.102323503127793, "grad_norm": 1.7791293859481812, "learning_rate": 1.173592912475914e-05, "loss": 0.6654, "num_input_tokens_seen": 27504128, "step": 47685 }, { "epoch": 7.103068215668752, "grad_norm": 1.5508503913879395, "learning_rate": 1.1730420857332002e-05, "loss": 0.4782, "num_input_tokens_seen": 27506912, "step": 47690 }, { "epoch": 7.103812928209711, "grad_norm": 3.176682710647583, "learning_rate": 1.1724913486609077e-05, "loss": 0.6898, "num_input_tokens_seen": 27509376, "step": 47695 }, { "epoch": 7.10455764075067, "grad_norm": 4.046095371246338, "learning_rate": 1.1719407012962524e-05, "loss": 0.68, "num_input_tokens_seen": 27512384, "step": 47700 }, { "epoch": 7.10530235329163, "grad_norm": 2.2486228942871094, "learning_rate": 1.1713901436764451e-05, "loss": 0.7663, "num_input_tokens_seen": 27515360, "step": 47705 }, { "epoch": 7.1060470658325885, "grad_norm": 2.2427823543548584, "learning_rate": 1.1708396758386911e-05, "loss": 0.5593, "num_input_tokens_seen": 27518208, "step": 47710 }, { "epoch": 7.106791778373548, "grad_norm": 2.1426711082458496, "learning_rate": 1.1702892978201868e-05, "loss": 0.6879, "num_input_tokens_seen": 27520864, "step": 47715 }, { "epoch": 7.107536490914507, "grad_norm": 1.2772495746612549, "learning_rate": 1.1697390096581265e-05, "loss": 0.7284, "num_input_tokens_seen": 27523744, "step": 47720 }, { "epoch": 7.1082812034554665, "grad_norm": 2.7645339965820312, "learning_rate": 1.1691888113896945e-05, "loss": 0.7318, "num_input_tokens_seen": 27526976, "step": 47725 }, { "epoch": 7.109025915996425, "grad_norm": 3.7142868041992188, "learning_rate": 1.1686387030520721e-05, "loss": 0.5616, "num_input_tokens_seen": 27529664, "step": 47730 }, { "epoch": 7.109770628537385, "grad_norm": 3.9064605236053467, "learning_rate": 1.168088684682433e-05, "loss": 0.5508, "num_input_tokens_seen": 27532544, "step": 47735 }, { "epoch": 7.110515341078344, "grad_norm": 3.389570951461792, "learning_rate": 1.1675387563179455e-05, "loss": 0.5312, "num_input_tokens_seen": 27535360, "step": 47740 }, { "epoch": 7.111260053619303, "grad_norm": 5.521953582763672, "learning_rate": 1.1669889179957725e-05, "loss": 0.4188, "num_input_tokens_seen": 27538048, "step": 47745 }, { "epoch": 7.112004766160262, "grad_norm": 2.0809340476989746, "learning_rate": 1.1664391697530677e-05, "loss": 0.7124, "num_input_tokens_seen": 27541056, "step": 47750 }, { "epoch": 7.112749478701222, "grad_norm": 1.9659651517868042, "learning_rate": 1.1658895116269821e-05, "loss": 0.5547, "num_input_tokens_seen": 27544000, "step": 47755 }, { "epoch": 7.1134941912421805, "grad_norm": 2.2008039951324463, "learning_rate": 1.16533994365466e-05, "loss": 0.7941, "num_input_tokens_seen": 27546848, "step": 47760 }, { "epoch": 7.114238903783139, "grad_norm": 3.706859588623047, "learning_rate": 1.1647904658732373e-05, "loss": 0.6176, "num_input_tokens_seen": 27549792, "step": 47765 }, { "epoch": 7.114983616324099, "grad_norm": 4.147054195404053, "learning_rate": 1.1642410783198465e-05, "loss": 0.7843, "num_input_tokens_seen": 27552800, "step": 47770 }, { "epoch": 7.115728328865058, "grad_norm": 1.700810432434082, "learning_rate": 1.1636917810316126e-05, "loss": 0.53, "num_input_tokens_seen": 27555968, "step": 47775 }, { "epoch": 7.116473041406017, "grad_norm": 1.478315830230713, "learning_rate": 1.1631425740456562e-05, "loss": 0.4708, "num_input_tokens_seen": 27558688, "step": 47780 }, { "epoch": 7.117217753946976, "grad_norm": 3.0144450664520264, "learning_rate": 1.1625934573990882e-05, "loss": 0.7351, "num_input_tokens_seen": 27561888, "step": 47785 }, { "epoch": 7.117962466487936, "grad_norm": 2.3122475147247314, "learning_rate": 1.1620444311290172e-05, "loss": 0.7377, "num_input_tokens_seen": 27564896, "step": 47790 }, { "epoch": 7.1187071790288945, "grad_norm": 2.466736078262329, "learning_rate": 1.1614954952725434e-05, "loss": 0.6665, "num_input_tokens_seen": 27568000, "step": 47795 }, { "epoch": 7.119451891569854, "grad_norm": 2.4476327896118164, "learning_rate": 1.1609466498667634e-05, "loss": 0.6711, "num_input_tokens_seen": 27570816, "step": 47800 }, { "epoch": 7.120196604110813, "grad_norm": 1.9245803356170654, "learning_rate": 1.1603978949487634e-05, "loss": 0.5913, "num_input_tokens_seen": 27573664, "step": 47805 }, { "epoch": 7.1209413166517725, "grad_norm": 1.9472471475601196, "learning_rate": 1.1598492305556274e-05, "loss": 0.6809, "num_input_tokens_seen": 27576512, "step": 47810 }, { "epoch": 7.121686029192731, "grad_norm": 2.2946510314941406, "learning_rate": 1.1593006567244328e-05, "loss": 0.5294, "num_input_tokens_seen": 27579424, "step": 47815 }, { "epoch": 7.122430741733691, "grad_norm": 1.6804176568984985, "learning_rate": 1.1587521734922476e-05, "loss": 0.4628, "num_input_tokens_seen": 27582240, "step": 47820 }, { "epoch": 7.12317545427465, "grad_norm": 1.8203405141830444, "learning_rate": 1.1582037808961377e-05, "loss": 0.5709, "num_input_tokens_seen": 27585056, "step": 47825 }, { "epoch": 7.123920166815609, "grad_norm": 2.790567636489868, "learning_rate": 1.1576554789731608e-05, "loss": 0.5168, "num_input_tokens_seen": 27587744, "step": 47830 }, { "epoch": 7.124664879356568, "grad_norm": 2.211136817932129, "learning_rate": 1.1571072677603691e-05, "loss": 0.4325, "num_input_tokens_seen": 27590496, "step": 47835 }, { "epoch": 7.125409591897528, "grad_norm": 3.1185078620910645, "learning_rate": 1.1565591472948095e-05, "loss": 0.4317, "num_input_tokens_seen": 27593408, "step": 47840 }, { "epoch": 7.1261543044384865, "grad_norm": 3.99351167678833, "learning_rate": 1.1560111176135197e-05, "loss": 0.4868, "num_input_tokens_seen": 27596224, "step": 47845 }, { "epoch": 7.126899016979446, "grad_norm": 1.314864993095398, "learning_rate": 1.1554631787535353e-05, "loss": 0.5606, "num_input_tokens_seen": 27598976, "step": 47850 }, { "epoch": 7.127643729520405, "grad_norm": 5.630037307739258, "learning_rate": 1.1549153307518817e-05, "loss": 0.6866, "num_input_tokens_seen": 27601920, "step": 47855 }, { "epoch": 7.128388442061365, "grad_norm": 2.077754497528076, "learning_rate": 1.1543675736455814e-05, "loss": 0.4733, "num_input_tokens_seen": 27604512, "step": 47860 }, { "epoch": 7.129133154602323, "grad_norm": 1.6520109176635742, "learning_rate": 1.1538199074716493e-05, "loss": 0.5445, "num_input_tokens_seen": 27607072, "step": 47865 }, { "epoch": 7.129877867143283, "grad_norm": 3.5669422149658203, "learning_rate": 1.1532723322670952e-05, "loss": 0.6862, "num_input_tokens_seen": 27609856, "step": 47870 }, { "epoch": 7.130622579684242, "grad_norm": 3.6274116039276123, "learning_rate": 1.152724848068922e-05, "loss": 0.6008, "num_input_tokens_seen": 27612640, "step": 47875 }, { "epoch": 7.131367292225201, "grad_norm": 2.140212297439575, "learning_rate": 1.152177454914125e-05, "loss": 0.5582, "num_input_tokens_seen": 27615552, "step": 47880 }, { "epoch": 7.13211200476616, "grad_norm": 2.1402087211608887, "learning_rate": 1.151630152839697e-05, "loss": 0.6423, "num_input_tokens_seen": 27618560, "step": 47885 }, { "epoch": 7.13285671730712, "grad_norm": 2.6068713665008545, "learning_rate": 1.1510829418826199e-05, "loss": 0.649, "num_input_tokens_seen": 27621600, "step": 47890 }, { "epoch": 7.1336014298480785, "grad_norm": 1.7581559419631958, "learning_rate": 1.1505358220798736e-05, "loss": 0.5568, "num_input_tokens_seen": 27624448, "step": 47895 }, { "epoch": 7.134346142389038, "grad_norm": 3.2003285884857178, "learning_rate": 1.1499887934684297e-05, "loss": 0.6884, "num_input_tokens_seen": 27627104, "step": 47900 }, { "epoch": 7.135090854929997, "grad_norm": 3.503028154373169, "learning_rate": 1.1494418560852546e-05, "loss": 0.4231, "num_input_tokens_seen": 27629856, "step": 47905 }, { "epoch": 7.135835567470957, "grad_norm": 2.501298427581787, "learning_rate": 1.1488950099673087e-05, "loss": 0.7129, "num_input_tokens_seen": 27632896, "step": 47910 }, { "epoch": 7.136580280011915, "grad_norm": 2.922969341278076, "learning_rate": 1.148348255151544e-05, "loss": 0.5258, "num_input_tokens_seen": 27635488, "step": 47915 }, { "epoch": 7.137324992552875, "grad_norm": 3.7595369815826416, "learning_rate": 1.1478015916749089e-05, "loss": 0.7527, "num_input_tokens_seen": 27638400, "step": 47920 }, { "epoch": 7.138069705093834, "grad_norm": 2.385857105255127, "learning_rate": 1.147255019574345e-05, "loss": 0.5606, "num_input_tokens_seen": 27641504, "step": 47925 }, { "epoch": 7.1388144176347925, "grad_norm": 2.846052885055542, "learning_rate": 1.1467085388867866e-05, "loss": 0.413, "num_input_tokens_seen": 27644480, "step": 47930 }, { "epoch": 7.139559130175752, "grad_norm": 2.146139144897461, "learning_rate": 1.1461621496491628e-05, "loss": 0.4525, "num_input_tokens_seen": 27647552, "step": 47935 }, { "epoch": 7.140303842716711, "grad_norm": 5.173603534698486, "learning_rate": 1.1456158518983967e-05, "loss": 0.7176, "num_input_tokens_seen": 27650464, "step": 47940 }, { "epoch": 7.141048555257671, "grad_norm": 1.8060739040374756, "learning_rate": 1.1450696456714057e-05, "loss": 0.4254, "num_input_tokens_seen": 27653376, "step": 47945 }, { "epoch": 7.141793267798629, "grad_norm": 1.2191457748413086, "learning_rate": 1.1445235310050987e-05, "loss": 0.4003, "num_input_tokens_seen": 27656288, "step": 47950 }, { "epoch": 7.142537980339589, "grad_norm": 4.3075032234191895, "learning_rate": 1.14397750793638e-05, "loss": 0.4222, "num_input_tokens_seen": 27658976, "step": 47955 }, { "epoch": 7.143282692880548, "grad_norm": 2.8904812335968018, "learning_rate": 1.1434315765021485e-05, "loss": 0.6034, "num_input_tokens_seen": 27661600, "step": 47960 }, { "epoch": 7.144027405421507, "grad_norm": 3.6697819232940674, "learning_rate": 1.1428857367392964e-05, "loss": 0.5514, "num_input_tokens_seen": 27664224, "step": 47965 }, { "epoch": 7.144772117962466, "grad_norm": 4.5840349197387695, "learning_rate": 1.1423399886847077e-05, "loss": 0.565, "num_input_tokens_seen": 27667040, "step": 47970 }, { "epoch": 7.145516830503426, "grad_norm": 1.1131840944290161, "learning_rate": 1.1417943323752629e-05, "loss": 0.4927, "num_input_tokens_seen": 27669952, "step": 47975 }, { "epoch": 7.1462615430443845, "grad_norm": 2.714372158050537, "learning_rate": 1.1412487678478357e-05, "loss": 0.6825, "num_input_tokens_seen": 27672704, "step": 47980 }, { "epoch": 7.147006255585344, "grad_norm": 3.939196825027466, "learning_rate": 1.1407032951392916e-05, "loss": 0.6143, "num_input_tokens_seen": 27675904, "step": 47985 }, { "epoch": 7.147750968126303, "grad_norm": 2.016188144683838, "learning_rate": 1.1401579142864924e-05, "loss": 0.5925, "num_input_tokens_seen": 27679040, "step": 47990 }, { "epoch": 7.148495680667263, "grad_norm": 8.60828685760498, "learning_rate": 1.1396126253262926e-05, "loss": 1.0727, "num_input_tokens_seen": 27682112, "step": 47995 }, { "epoch": 7.149240393208221, "grad_norm": 1.657104253768921, "learning_rate": 1.1390674282955408e-05, "loss": 0.5863, "num_input_tokens_seen": 27684800, "step": 48000 }, { "epoch": 7.149985105749181, "grad_norm": 4.60088586807251, "learning_rate": 1.1385223232310799e-05, "loss": 0.7333, "num_input_tokens_seen": 27687776, "step": 48005 }, { "epoch": 7.15072981829014, "grad_norm": 2.4062771797180176, "learning_rate": 1.1379773101697439e-05, "loss": 0.5663, "num_input_tokens_seen": 27690656, "step": 48010 }, { "epoch": 7.151474530831099, "grad_norm": 2.029566526412964, "learning_rate": 1.1374323891483649e-05, "loss": 0.6865, "num_input_tokens_seen": 27693856, "step": 48015 }, { "epoch": 7.152219243372058, "grad_norm": 3.581545352935791, "learning_rate": 1.136887560203764e-05, "loss": 0.5372, "num_input_tokens_seen": 27696832, "step": 48020 }, { "epoch": 7.152963955913018, "grad_norm": 2.562509775161743, "learning_rate": 1.13634282337276e-05, "loss": 0.5149, "num_input_tokens_seen": 27699584, "step": 48025 }, { "epoch": 7.153708668453977, "grad_norm": 3.395860195159912, "learning_rate": 1.1357981786921636e-05, "loss": 0.4804, "num_input_tokens_seen": 27702016, "step": 48030 }, { "epoch": 7.154453380994936, "grad_norm": 0.87890625, "learning_rate": 1.13525362619878e-05, "loss": 0.3747, "num_input_tokens_seen": 27704736, "step": 48035 }, { "epoch": 7.155198093535895, "grad_norm": 3.0983879566192627, "learning_rate": 1.1347091659294087e-05, "loss": 0.5222, "num_input_tokens_seen": 27707392, "step": 48040 }, { "epoch": 7.155942806076855, "grad_norm": 2.8549416065216064, "learning_rate": 1.13416479792084e-05, "loss": 0.5186, "num_input_tokens_seen": 27710048, "step": 48045 }, { "epoch": 7.156687518617813, "grad_norm": 3.9299583435058594, "learning_rate": 1.1336205222098622e-05, "loss": 0.7807, "num_input_tokens_seen": 27712992, "step": 48050 }, { "epoch": 7.157432231158773, "grad_norm": 3.077394485473633, "learning_rate": 1.1330763388332533e-05, "loss": 0.4956, "num_input_tokens_seen": 27715808, "step": 48055 }, { "epoch": 7.158176943699732, "grad_norm": 1.4669733047485352, "learning_rate": 1.1325322478277877e-05, "loss": 0.6303, "num_input_tokens_seen": 27718528, "step": 48060 }, { "epoch": 7.158921656240691, "grad_norm": 2.392319440841675, "learning_rate": 1.1319882492302333e-05, "loss": 0.5883, "num_input_tokens_seen": 27721568, "step": 48065 }, { "epoch": 7.15966636878165, "grad_norm": 3.7451841831207275, "learning_rate": 1.131444343077351e-05, "loss": 0.603, "num_input_tokens_seen": 27724416, "step": 48070 }, { "epoch": 7.16041108132261, "grad_norm": 2.1240782737731934, "learning_rate": 1.1309005294058968e-05, "loss": 0.6192, "num_input_tokens_seen": 27727392, "step": 48075 }, { "epoch": 7.161155793863569, "grad_norm": 4.278078556060791, "learning_rate": 1.1303568082526178e-05, "loss": 0.6721, "num_input_tokens_seen": 27730304, "step": 48080 }, { "epoch": 7.161900506404528, "grad_norm": 4.245823383331299, "learning_rate": 1.1298131796542576e-05, "loss": 0.5868, "num_input_tokens_seen": 27733056, "step": 48085 }, { "epoch": 7.162645218945487, "grad_norm": 3.4575226306915283, "learning_rate": 1.1292696436475514e-05, "loss": 0.6605, "num_input_tokens_seen": 27735776, "step": 48090 }, { "epoch": 7.163389931486447, "grad_norm": 2.4228482246398926, "learning_rate": 1.1287262002692295e-05, "loss": 0.7548, "num_input_tokens_seen": 27738624, "step": 48095 }, { "epoch": 7.164134644027405, "grad_norm": 2.853036880493164, "learning_rate": 1.1281828495560157e-05, "loss": 0.445, "num_input_tokens_seen": 27741632, "step": 48100 }, { "epoch": 7.164879356568365, "grad_norm": 2.3201308250427246, "learning_rate": 1.1276395915446278e-05, "loss": 0.6318, "num_input_tokens_seen": 27745120, "step": 48105 }, { "epoch": 7.165624069109324, "grad_norm": 3.439244508743286, "learning_rate": 1.1270964262717773e-05, "loss": 0.4206, "num_input_tokens_seen": 27747872, "step": 48110 }, { "epoch": 7.166368781650283, "grad_norm": 4.041665077209473, "learning_rate": 1.126553353774168e-05, "loss": 0.5118, "num_input_tokens_seen": 27750464, "step": 48115 }, { "epoch": 7.167113494191242, "grad_norm": 2.593566656112671, "learning_rate": 1.1260103740884986e-05, "loss": 0.5515, "num_input_tokens_seen": 27753344, "step": 48120 }, { "epoch": 7.167858206732201, "grad_norm": 2.3469150066375732, "learning_rate": 1.1254674872514629e-05, "loss": 0.5787, "num_input_tokens_seen": 27756000, "step": 48125 }, { "epoch": 7.168602919273161, "grad_norm": 3.2329325675964355, "learning_rate": 1.124924693299745e-05, "loss": 0.6241, "num_input_tokens_seen": 27758912, "step": 48130 }, { "epoch": 7.169347631814119, "grad_norm": 1.630682349205017, "learning_rate": 1.124381992270026e-05, "loss": 0.5931, "num_input_tokens_seen": 27761664, "step": 48135 }, { "epoch": 7.170092344355079, "grad_norm": 4.545214653015137, "learning_rate": 1.123839384198979e-05, "loss": 0.5505, "num_input_tokens_seen": 27764384, "step": 48140 }, { "epoch": 7.170837056896038, "grad_norm": 1.6287590265274048, "learning_rate": 1.123296869123272e-05, "loss": 0.4597, "num_input_tokens_seen": 27767488, "step": 48145 }, { "epoch": 7.171581769436997, "grad_norm": 1.9576693773269653, "learning_rate": 1.1227544470795645e-05, "loss": 0.6156, "num_input_tokens_seen": 27770560, "step": 48150 }, { "epoch": 7.172326481977956, "grad_norm": 2.4910459518432617, "learning_rate": 1.122212118104512e-05, "loss": 0.4993, "num_input_tokens_seen": 27773440, "step": 48155 }, { "epoch": 7.173071194518916, "grad_norm": 3.9964547157287598, "learning_rate": 1.1216698822347629e-05, "loss": 0.6402, "num_input_tokens_seen": 27776224, "step": 48160 }, { "epoch": 7.173815907059875, "grad_norm": 2.7114458084106445, "learning_rate": 1.1211277395069603e-05, "loss": 0.6258, "num_input_tokens_seen": 27779104, "step": 48165 }, { "epoch": 7.174560619600834, "grad_norm": 3.148698329925537, "learning_rate": 1.120585689957738e-05, "loss": 0.6351, "num_input_tokens_seen": 27781792, "step": 48170 }, { "epoch": 7.175305332141793, "grad_norm": 1.835419774055481, "learning_rate": 1.1200437336237265e-05, "loss": 0.5334, "num_input_tokens_seen": 27784576, "step": 48175 }, { "epoch": 7.176050044682753, "grad_norm": 4.35117244720459, "learning_rate": 1.11950187054155e-05, "loss": 0.5483, "num_input_tokens_seen": 27787296, "step": 48180 }, { "epoch": 7.176794757223711, "grad_norm": 4.815347194671631, "learning_rate": 1.1189601007478233e-05, "loss": 0.7014, "num_input_tokens_seen": 27790080, "step": 48185 }, { "epoch": 7.177539469764671, "grad_norm": 1.5747735500335693, "learning_rate": 1.1184184242791581e-05, "loss": 0.5014, "num_input_tokens_seen": 27793152, "step": 48190 }, { "epoch": 7.17828418230563, "grad_norm": 2.569199323654175, "learning_rate": 1.1178768411721589e-05, "loss": 0.6742, "num_input_tokens_seen": 27796352, "step": 48195 }, { "epoch": 7.1790288948465895, "grad_norm": 2.2838926315307617, "learning_rate": 1.1173353514634232e-05, "loss": 0.5916, "num_input_tokens_seen": 27799040, "step": 48200 }, { "epoch": 7.179773607387548, "grad_norm": 3.157440662384033, "learning_rate": 1.116793955189544e-05, "loss": 0.7128, "num_input_tokens_seen": 27801888, "step": 48205 }, { "epoch": 7.180518319928508, "grad_norm": 2.656388998031616, "learning_rate": 1.1162526523871048e-05, "loss": 0.7468, "num_input_tokens_seen": 27804736, "step": 48210 }, { "epoch": 7.181263032469467, "grad_norm": 1.8075475692749023, "learning_rate": 1.115711443092686e-05, "loss": 0.6666, "num_input_tokens_seen": 27807488, "step": 48215 }, { "epoch": 7.182007745010426, "grad_norm": 1.987962245941162, "learning_rate": 1.115170327342859e-05, "loss": 0.6839, "num_input_tokens_seen": 27810368, "step": 48220 }, { "epoch": 7.182752457551385, "grad_norm": 3.830313205718994, "learning_rate": 1.1146293051741913e-05, "loss": 0.8166, "num_input_tokens_seen": 27813088, "step": 48225 }, { "epoch": 7.183497170092345, "grad_norm": 2.366525888442993, "learning_rate": 1.1140883766232422e-05, "loss": 0.5511, "num_input_tokens_seen": 27815840, "step": 48230 }, { "epoch": 7.184241882633303, "grad_norm": 3.53200626373291, "learning_rate": 1.1135475417265662e-05, "loss": 0.5753, "num_input_tokens_seen": 27818720, "step": 48235 }, { "epoch": 7.184986595174263, "grad_norm": 1.970043420791626, "learning_rate": 1.113006800520711e-05, "loss": 0.7426, "num_input_tokens_seen": 27821632, "step": 48240 }, { "epoch": 7.185731307715222, "grad_norm": 1.118437647819519, "learning_rate": 1.1124661530422176e-05, "loss": 0.5412, "num_input_tokens_seen": 27824448, "step": 48245 }, { "epoch": 7.1864760202561815, "grad_norm": 1.9105253219604492, "learning_rate": 1.111925599327619e-05, "loss": 0.5416, "num_input_tokens_seen": 27827552, "step": 48250 }, { "epoch": 7.18722073279714, "grad_norm": 2.715404748916626, "learning_rate": 1.111385139413445e-05, "loss": 0.5577, "num_input_tokens_seen": 27830304, "step": 48255 }, { "epoch": 7.1879654453381, "grad_norm": 2.2688968181610107, "learning_rate": 1.1108447733362177e-05, "loss": 0.5761, "num_input_tokens_seen": 27833280, "step": 48260 }, { "epoch": 7.188710157879059, "grad_norm": 2.0436670780181885, "learning_rate": 1.1103045011324526e-05, "loss": 0.4004, "num_input_tokens_seen": 27836032, "step": 48265 }, { "epoch": 7.189454870420018, "grad_norm": 2.411302328109741, "learning_rate": 1.1097643228386593e-05, "loss": 0.5935, "num_input_tokens_seen": 27838816, "step": 48270 }, { "epoch": 7.190199582960977, "grad_norm": 2.980844259262085, "learning_rate": 1.1092242384913415e-05, "loss": 0.7627, "num_input_tokens_seen": 27841536, "step": 48275 }, { "epoch": 7.190944295501936, "grad_norm": 2.39585280418396, "learning_rate": 1.1086842481269943e-05, "loss": 0.5298, "num_input_tokens_seen": 27844320, "step": 48280 }, { "epoch": 7.1916890080428955, "grad_norm": 2.1222589015960693, "learning_rate": 1.10814435178211e-05, "loss": 0.6114, "num_input_tokens_seen": 27847264, "step": 48285 }, { "epoch": 7.192433720583854, "grad_norm": 3.8894731998443604, "learning_rate": 1.1076045494931705e-05, "loss": 0.8372, "num_input_tokens_seen": 27850048, "step": 48290 }, { "epoch": 7.193178433124814, "grad_norm": 4.289065361022949, "learning_rate": 1.1070648412966548e-05, "loss": 0.5906, "num_input_tokens_seen": 27852736, "step": 48295 }, { "epoch": 7.193923145665773, "grad_norm": 2.1815009117126465, "learning_rate": 1.1065252272290333e-05, "loss": 0.6266, "num_input_tokens_seen": 27856064, "step": 48300 }, { "epoch": 7.194667858206732, "grad_norm": 4.013463020324707, "learning_rate": 1.1059857073267718e-05, "loss": 0.4257, "num_input_tokens_seen": 27858720, "step": 48305 }, { "epoch": 7.195412570747691, "grad_norm": 2.4936931133270264, "learning_rate": 1.1054462816263295e-05, "loss": 0.5568, "num_input_tokens_seen": 27861408, "step": 48310 }, { "epoch": 7.196157283288651, "grad_norm": 2.225245952606201, "learning_rate": 1.1049069501641567e-05, "loss": 0.4868, "num_input_tokens_seen": 27864352, "step": 48315 }, { "epoch": 7.196901995829609, "grad_norm": 2.433830976486206, "learning_rate": 1.1043677129767002e-05, "loss": 0.6128, "num_input_tokens_seen": 27867264, "step": 48320 }, { "epoch": 7.197646708370569, "grad_norm": 1.4318374395370483, "learning_rate": 1.1038285701004003e-05, "loss": 0.5911, "num_input_tokens_seen": 27869984, "step": 48325 }, { "epoch": 7.198391420911528, "grad_norm": 1.4775304794311523, "learning_rate": 1.1032895215716881e-05, "loss": 0.4814, "num_input_tokens_seen": 27872832, "step": 48330 }, { "epoch": 7.1991361334524875, "grad_norm": 2.11095929145813, "learning_rate": 1.1027505674269916e-05, "loss": 0.5209, "num_input_tokens_seen": 27875744, "step": 48335 }, { "epoch": 7.199880845993446, "grad_norm": 2.552295207977295, "learning_rate": 1.102211707702731e-05, "loss": 0.8337, "num_input_tokens_seen": 27878880, "step": 48340 }, { "epoch": 7.200625558534406, "grad_norm": 3.381648302078247, "learning_rate": 1.1016729424353212e-05, "loss": 0.5124, "num_input_tokens_seen": 27881760, "step": 48345 }, { "epoch": 7.201370271075365, "grad_norm": 1.4540770053863525, "learning_rate": 1.1011342716611678e-05, "loss": 0.3849, "num_input_tokens_seen": 27884672, "step": 48350 }, { "epoch": 7.202114983616324, "grad_norm": 2.9569685459136963, "learning_rate": 1.1005956954166729e-05, "loss": 0.6155, "num_input_tokens_seen": 27887456, "step": 48355 }, { "epoch": 7.202859696157283, "grad_norm": 2.3885159492492676, "learning_rate": 1.1000572137382314e-05, "loss": 0.6346, "num_input_tokens_seen": 27890304, "step": 48360 }, { "epoch": 7.203604408698243, "grad_norm": 1.8037952184677124, "learning_rate": 1.0995188266622324e-05, "loss": 0.589, "num_input_tokens_seen": 27893472, "step": 48365 }, { "epoch": 7.2043491212392015, "grad_norm": 2.803335666656494, "learning_rate": 1.0989805342250564e-05, "loss": 0.7246, "num_input_tokens_seen": 27896192, "step": 48370 }, { "epoch": 7.205093833780161, "grad_norm": 4.323678016662598, "learning_rate": 1.0984423364630796e-05, "loss": 0.7803, "num_input_tokens_seen": 27898848, "step": 48375 }, { "epoch": 7.20583854632112, "grad_norm": 1.4684257507324219, "learning_rate": 1.0979042334126724e-05, "loss": 0.4602, "num_input_tokens_seen": 27901984, "step": 48380 }, { "epoch": 7.2065832588620795, "grad_norm": 1.350284457206726, "learning_rate": 1.0973662251101957e-05, "loss": 0.5741, "num_input_tokens_seen": 27904640, "step": 48385 }, { "epoch": 7.207327971403038, "grad_norm": 1.7448145151138306, "learning_rate": 1.0968283115920067e-05, "loss": 0.5907, "num_input_tokens_seen": 27907232, "step": 48390 }, { "epoch": 7.208072683943998, "grad_norm": 2.0751233100891113, "learning_rate": 1.0962904928944556e-05, "loss": 0.498, "num_input_tokens_seen": 27910080, "step": 48395 }, { "epoch": 7.208817396484957, "grad_norm": 5.937955379486084, "learning_rate": 1.095752769053886e-05, "loss": 0.7021, "num_input_tokens_seen": 27913024, "step": 48400 }, { "epoch": 7.209562109025916, "grad_norm": 4.347935199737549, "learning_rate": 1.0952151401066358e-05, "loss": 0.6109, "num_input_tokens_seen": 27915616, "step": 48405 }, { "epoch": 7.210306821566875, "grad_norm": 2.2774736881256104, "learning_rate": 1.0946776060890352e-05, "loss": 0.8185, "num_input_tokens_seen": 27918432, "step": 48410 }, { "epoch": 7.211051534107835, "grad_norm": 2.729921817779541, "learning_rate": 1.0941401670374071e-05, "loss": 0.6069, "num_input_tokens_seen": 27921184, "step": 48415 }, { "epoch": 7.2117962466487935, "grad_norm": 2.3599905967712402, "learning_rate": 1.093602822988071e-05, "loss": 0.5146, "num_input_tokens_seen": 27924000, "step": 48420 }, { "epoch": 7.212540959189753, "grad_norm": 3.086883068084717, "learning_rate": 1.0930655739773379e-05, "loss": 0.5636, "num_input_tokens_seen": 27927104, "step": 48425 }, { "epoch": 7.213285671730712, "grad_norm": 2.185004949569702, "learning_rate": 1.0925284200415134e-05, "loss": 0.5716, "num_input_tokens_seen": 27929728, "step": 48430 }, { "epoch": 7.2140303842716715, "grad_norm": 4.74393367767334, "learning_rate": 1.0919913612168959e-05, "loss": 0.6992, "num_input_tokens_seen": 27932448, "step": 48435 }, { "epoch": 7.21477509681263, "grad_norm": 3.6578214168548584, "learning_rate": 1.0914543975397785e-05, "loss": 0.6993, "num_input_tokens_seen": 27935296, "step": 48440 }, { "epoch": 7.21551980935359, "grad_norm": 2.008435010910034, "learning_rate": 1.090917529046446e-05, "loss": 0.6614, "num_input_tokens_seen": 27938240, "step": 48445 }, { "epoch": 7.216264521894549, "grad_norm": 1.2348706722259521, "learning_rate": 1.0903807557731771e-05, "loss": 0.4478, "num_input_tokens_seen": 27941088, "step": 48450 }, { "epoch": 7.217009234435508, "grad_norm": 1.6748684644699097, "learning_rate": 1.0898440777562458e-05, "loss": 0.5677, "num_input_tokens_seen": 27943936, "step": 48455 }, { "epoch": 7.217753946976467, "grad_norm": 1.8786756992340088, "learning_rate": 1.0893074950319182e-05, "loss": 0.391, "num_input_tokens_seen": 27946880, "step": 48460 }, { "epoch": 7.218498659517426, "grad_norm": 2.846330165863037, "learning_rate": 1.0887710076364548e-05, "loss": 0.451, "num_input_tokens_seen": 27949600, "step": 48465 }, { "epoch": 7.2192433720583855, "grad_norm": 3.0003573894500732, "learning_rate": 1.088234615606109e-05, "loss": 0.8429, "num_input_tokens_seen": 27952480, "step": 48470 }, { "epoch": 7.219988084599344, "grad_norm": 2.514213800430298, "learning_rate": 1.0876983189771292e-05, "loss": 0.6238, "num_input_tokens_seen": 27955232, "step": 48475 }, { "epoch": 7.220732797140304, "grad_norm": 3.167705535888672, "learning_rate": 1.0871621177857539e-05, "loss": 0.6207, "num_input_tokens_seen": 27957952, "step": 48480 }, { "epoch": 7.221477509681263, "grad_norm": 1.9468624591827393, "learning_rate": 1.0866260120682195e-05, "loss": 0.5746, "num_input_tokens_seen": 27960704, "step": 48485 }, { "epoch": 7.222222222222222, "grad_norm": 2.9196665287017822, "learning_rate": 1.0860900018607518e-05, "loss": 0.6109, "num_input_tokens_seen": 27963552, "step": 48490 }, { "epoch": 7.222966934763181, "grad_norm": 2.9980030059814453, "learning_rate": 1.0855540871995734e-05, "loss": 0.6116, "num_input_tokens_seen": 27966464, "step": 48495 }, { "epoch": 7.223711647304141, "grad_norm": 2.4442315101623535, "learning_rate": 1.085018268120899e-05, "loss": 0.5928, "num_input_tokens_seen": 27969408, "step": 48500 }, { "epoch": 7.2244563598450995, "grad_norm": 2.540400505065918, "learning_rate": 1.0844825446609368e-05, "loss": 0.5799, "num_input_tokens_seen": 27972288, "step": 48505 }, { "epoch": 7.225201072386059, "grad_norm": 1.7188446521759033, "learning_rate": 1.0839469168558905e-05, "loss": 0.5791, "num_input_tokens_seen": 27975296, "step": 48510 }, { "epoch": 7.225945784927018, "grad_norm": 13.489227294921875, "learning_rate": 1.0834113847419534e-05, "loss": 0.6147, "num_input_tokens_seen": 27978176, "step": 48515 }, { "epoch": 7.2266904974679775, "grad_norm": 2.091770887374878, "learning_rate": 1.0828759483553152e-05, "loss": 0.5202, "num_input_tokens_seen": 27981216, "step": 48520 }, { "epoch": 7.227435210008936, "grad_norm": 2.5718131065368652, "learning_rate": 1.082340607732159e-05, "loss": 0.609, "num_input_tokens_seen": 27984096, "step": 48525 }, { "epoch": 7.228179922549896, "grad_norm": 1.8837004899978638, "learning_rate": 1.0818053629086617e-05, "loss": 0.4941, "num_input_tokens_seen": 27986816, "step": 48530 }, { "epoch": 7.228924635090855, "grad_norm": 1.7927335500717163, "learning_rate": 1.081270213920991e-05, "loss": 0.5164, "num_input_tokens_seen": 27989632, "step": 48535 }, { "epoch": 7.229669347631814, "grad_norm": 2.5869369506835938, "learning_rate": 1.0807351608053113e-05, "loss": 0.5963, "num_input_tokens_seen": 27992512, "step": 48540 }, { "epoch": 7.230414060172773, "grad_norm": 2.352339744567871, "learning_rate": 1.0802002035977799e-05, "loss": 0.614, "num_input_tokens_seen": 27995360, "step": 48545 }, { "epoch": 7.231158772713733, "grad_norm": 1.0463122129440308, "learning_rate": 1.0796653423345452e-05, "loss": 0.6305, "num_input_tokens_seen": 27998464, "step": 48550 }, { "epoch": 7.2319034852546915, "grad_norm": 1.8537421226501465, "learning_rate": 1.079130577051752e-05, "loss": 0.5168, "num_input_tokens_seen": 28001344, "step": 48555 }, { "epoch": 7.232648197795651, "grad_norm": 2.680154800415039, "learning_rate": 1.0785959077855378e-05, "loss": 0.498, "num_input_tokens_seen": 28004224, "step": 48560 }, { "epoch": 7.23339291033661, "grad_norm": 3.6466026306152344, "learning_rate": 1.0780613345720331e-05, "loss": 0.5544, "num_input_tokens_seen": 28007008, "step": 48565 }, { "epoch": 7.23413762287757, "grad_norm": 3.4025185108184814, "learning_rate": 1.077526857447363e-05, "loss": 0.5338, "num_input_tokens_seen": 28009792, "step": 48570 }, { "epoch": 7.234882335418528, "grad_norm": 2.0759027004241943, "learning_rate": 1.0769924764476446e-05, "loss": 0.6372, "num_input_tokens_seen": 28012608, "step": 48575 }, { "epoch": 7.235627047959488, "grad_norm": 2.8768820762634277, "learning_rate": 1.0764581916089883e-05, "loss": 0.4677, "num_input_tokens_seen": 28015392, "step": 48580 }, { "epoch": 7.236371760500447, "grad_norm": 5.042553424835205, "learning_rate": 1.0759240029674994e-05, "loss": 0.6498, "num_input_tokens_seen": 28018240, "step": 48585 }, { "epoch": 7.237116473041406, "grad_norm": 5.399726390838623, "learning_rate": 1.0753899105592768e-05, "loss": 0.6464, "num_input_tokens_seen": 28021152, "step": 48590 }, { "epoch": 7.237861185582365, "grad_norm": 1.7363080978393555, "learning_rate": 1.0748559144204117e-05, "loss": 0.5172, "num_input_tokens_seen": 28024416, "step": 48595 }, { "epoch": 7.238605898123325, "grad_norm": 2.7009665966033936, "learning_rate": 1.07432201458699e-05, "loss": 0.6592, "num_input_tokens_seen": 28027360, "step": 48600 }, { "epoch": 7.2393506106642835, "grad_norm": 3.081066370010376, "learning_rate": 1.0737882110950911e-05, "loss": 0.5509, "num_input_tokens_seen": 28030368, "step": 48605 }, { "epoch": 7.240095323205243, "grad_norm": 3.0831480026245117, "learning_rate": 1.0732545039807862e-05, "loss": 0.6623, "num_input_tokens_seen": 28033216, "step": 48610 }, { "epoch": 7.240840035746202, "grad_norm": 1.6056448221206665, "learning_rate": 1.0727208932801403e-05, "loss": 0.618, "num_input_tokens_seen": 28036096, "step": 48615 }, { "epoch": 7.241584748287162, "grad_norm": 1.9719675779342651, "learning_rate": 1.0721873790292136e-05, "loss": 0.4552, "num_input_tokens_seen": 28038816, "step": 48620 }, { "epoch": 7.24232946082812, "grad_norm": 2.075467348098755, "learning_rate": 1.0716539612640586e-05, "loss": 0.6198, "num_input_tokens_seen": 28041632, "step": 48625 }, { "epoch": 7.243074173369079, "grad_norm": 2.5676698684692383, "learning_rate": 1.071120640020722e-05, "loss": 0.6957, "num_input_tokens_seen": 28044480, "step": 48630 }, { "epoch": 7.243818885910039, "grad_norm": 2.3301339149475098, "learning_rate": 1.0705874153352428e-05, "loss": 0.4895, "num_input_tokens_seen": 28047936, "step": 48635 }, { "epoch": 7.2445635984509975, "grad_norm": 6.192820072174072, "learning_rate": 1.0700542872436557e-05, "loss": 0.5729, "num_input_tokens_seen": 28050944, "step": 48640 }, { "epoch": 7.245308310991957, "grad_norm": 2.8753068447113037, "learning_rate": 1.0695212557819851e-05, "loss": 0.4644, "num_input_tokens_seen": 28054240, "step": 48645 }, { "epoch": 7.246053023532916, "grad_norm": 1.1248005628585815, "learning_rate": 1.0689883209862527e-05, "loss": 0.4253, "num_input_tokens_seen": 28057088, "step": 48650 }, { "epoch": 7.246797736073876, "grad_norm": 1.8545047044754028, "learning_rate": 1.0684554828924711e-05, "loss": 0.7245, "num_input_tokens_seen": 28060256, "step": 48655 }, { "epoch": 7.247542448614834, "grad_norm": 1.4301445484161377, "learning_rate": 1.0679227415366475e-05, "loss": 0.4548, "num_input_tokens_seen": 28063264, "step": 48660 }, { "epoch": 7.248287161155794, "grad_norm": 3.7495779991149902, "learning_rate": 1.0673900969547826e-05, "loss": 0.4445, "num_input_tokens_seen": 28066240, "step": 48665 }, { "epoch": 7.249031873696753, "grad_norm": 3.226794481277466, "learning_rate": 1.0668575491828706e-05, "loss": 0.4623, "num_input_tokens_seen": 28068992, "step": 48670 }, { "epoch": 7.249776586237712, "grad_norm": 2.2300045490264893, "learning_rate": 1.0663250982568993e-05, "loss": 0.5648, "num_input_tokens_seen": 28071872, "step": 48675 }, { "epoch": 7.250521298778671, "grad_norm": 1.9471728801727295, "learning_rate": 1.0657927442128482e-05, "loss": 0.5139, "num_input_tokens_seen": 28074944, "step": 48680 }, { "epoch": 7.251266011319631, "grad_norm": 2.0839948654174805, "learning_rate": 1.0652604870866923e-05, "loss": 0.4777, "num_input_tokens_seen": 28077824, "step": 48685 }, { "epoch": 7.2520107238605895, "grad_norm": 2.850564956665039, "learning_rate": 1.0647283269144003e-05, "loss": 0.5082, "num_input_tokens_seen": 28080672, "step": 48690 }, { "epoch": 7.252755436401549, "grad_norm": 2.9809610843658447, "learning_rate": 1.064196263731932e-05, "loss": 0.5488, "num_input_tokens_seen": 28083520, "step": 48695 }, { "epoch": 7.253500148942508, "grad_norm": 2.6831657886505127, "learning_rate": 1.0636642975752423e-05, "loss": 0.7136, "num_input_tokens_seen": 28086464, "step": 48700 }, { "epoch": 7.254244861483468, "grad_norm": 4.134574890136719, "learning_rate": 1.0631324284802799e-05, "loss": 0.6003, "num_input_tokens_seen": 28089824, "step": 48705 }, { "epoch": 7.254989574024426, "grad_norm": 3.9706668853759766, "learning_rate": 1.0626006564829868e-05, "loss": 0.6823, "num_input_tokens_seen": 28092384, "step": 48710 }, { "epoch": 7.255734286565386, "grad_norm": 5.823851108551025, "learning_rate": 1.0620689816192967e-05, "loss": 0.7507, "num_input_tokens_seen": 28095136, "step": 48715 }, { "epoch": 7.256478999106345, "grad_norm": 3.0305590629577637, "learning_rate": 1.0615374039251382e-05, "loss": 0.4593, "num_input_tokens_seen": 28098080, "step": 48720 }, { "epoch": 7.257223711647304, "grad_norm": 1.9458526372909546, "learning_rate": 1.061005923436434e-05, "loss": 0.6152, "num_input_tokens_seen": 28101088, "step": 48725 }, { "epoch": 7.257968424188263, "grad_norm": 3.4066519737243652, "learning_rate": 1.0604745401890997e-05, "loss": 0.5432, "num_input_tokens_seen": 28103744, "step": 48730 }, { "epoch": 7.258713136729223, "grad_norm": 4.173163890838623, "learning_rate": 1.0599432542190424e-05, "loss": 0.6104, "num_input_tokens_seen": 28106400, "step": 48735 }, { "epoch": 7.259457849270182, "grad_norm": 2.3454184532165527, "learning_rate": 1.0594120655621659e-05, "loss": 0.5638, "num_input_tokens_seen": 28109280, "step": 48740 }, { "epoch": 7.260202561811141, "grad_norm": 2.954101800918579, "learning_rate": 1.0588809742543643e-05, "loss": 0.5517, "num_input_tokens_seen": 28111968, "step": 48745 }, { "epoch": 7.2609472743521, "grad_norm": 1.5256495475769043, "learning_rate": 1.0583499803315271e-05, "loss": 0.4761, "num_input_tokens_seen": 28115040, "step": 48750 }, { "epoch": 7.26169198689306, "grad_norm": 2.435809850692749, "learning_rate": 1.0578190838295371e-05, "loss": 0.3946, "num_input_tokens_seen": 28118016, "step": 48755 }, { "epoch": 7.262436699434018, "grad_norm": 2.6947481632232666, "learning_rate": 1.0572882847842696e-05, "loss": 0.5555, "num_input_tokens_seen": 28120896, "step": 48760 }, { "epoch": 7.263181411974978, "grad_norm": 2.026146411895752, "learning_rate": 1.0567575832315947e-05, "loss": 0.5742, "num_input_tokens_seen": 28123648, "step": 48765 }, { "epoch": 7.263926124515937, "grad_norm": 2.0928568840026855, "learning_rate": 1.056226979207375e-05, "loss": 0.6815, "num_input_tokens_seen": 28126720, "step": 48770 }, { "epoch": 7.264670837056896, "grad_norm": 1.4019441604614258, "learning_rate": 1.0556964727474664e-05, "loss": 0.4423, "num_input_tokens_seen": 28129664, "step": 48775 }, { "epoch": 7.265415549597855, "grad_norm": 2.469491720199585, "learning_rate": 1.055166063887717e-05, "loss": 0.7016, "num_input_tokens_seen": 28132768, "step": 48780 }, { "epoch": 7.266160262138815, "grad_norm": 4.320286273956299, "learning_rate": 1.0546357526639705e-05, "loss": 0.5427, "num_input_tokens_seen": 28136352, "step": 48785 }, { "epoch": 7.266904974679774, "grad_norm": 2.516843795776367, "learning_rate": 1.0541055391120638e-05, "loss": 0.4479, "num_input_tokens_seen": 28139232, "step": 48790 }, { "epoch": 7.267649687220732, "grad_norm": 3.266068458557129, "learning_rate": 1.053575423267826e-05, "loss": 0.5915, "num_input_tokens_seen": 28141984, "step": 48795 }, { "epoch": 7.268394399761692, "grad_norm": 3.0037953853607178, "learning_rate": 1.0530454051670805e-05, "loss": 0.4553, "num_input_tokens_seen": 28144768, "step": 48800 }, { "epoch": 7.269139112302652, "grad_norm": 2.5271127223968506, "learning_rate": 1.0525154848456442e-05, "loss": 0.8078, "num_input_tokens_seen": 28147552, "step": 48805 }, { "epoch": 7.26988382484361, "grad_norm": 3.601304531097412, "learning_rate": 1.0519856623393268e-05, "loss": 0.6139, "num_input_tokens_seen": 28150144, "step": 48810 }, { "epoch": 7.270628537384569, "grad_norm": 3.189756393432617, "learning_rate": 1.05145593768393e-05, "loss": 0.7577, "num_input_tokens_seen": 28153088, "step": 48815 }, { "epoch": 7.271373249925529, "grad_norm": 3.0288703441619873, "learning_rate": 1.0509263109152518e-05, "loss": 0.4904, "num_input_tokens_seen": 28156096, "step": 48820 }, { "epoch": 7.272117962466488, "grad_norm": 4.2728729248046875, "learning_rate": 1.0503967820690817e-05, "loss": 0.645, "num_input_tokens_seen": 28159008, "step": 48825 }, { "epoch": 7.272862675007447, "grad_norm": 4.545704364776611, "learning_rate": 1.0498673511812035e-05, "loss": 0.5725, "num_input_tokens_seen": 28161984, "step": 48830 }, { "epoch": 7.273607387548406, "grad_norm": 1.8050965070724487, "learning_rate": 1.0493380182873941e-05, "loss": 0.6437, "num_input_tokens_seen": 28164672, "step": 48835 }, { "epoch": 7.274352100089366, "grad_norm": 4.473321437835693, "learning_rate": 1.0488087834234242e-05, "loss": 0.6739, "num_input_tokens_seen": 28167424, "step": 48840 }, { "epoch": 7.275096812630324, "grad_norm": 2.412224769592285, "learning_rate": 1.0482796466250554e-05, "loss": 0.5679, "num_input_tokens_seen": 28170400, "step": 48845 }, { "epoch": 7.275841525171284, "grad_norm": 1.8146450519561768, "learning_rate": 1.0477506079280472e-05, "loss": 0.5251, "num_input_tokens_seen": 28173824, "step": 48850 }, { "epoch": 7.276586237712243, "grad_norm": 3.000286817550659, "learning_rate": 1.0472216673681476e-05, "loss": 0.4671, "num_input_tokens_seen": 28176416, "step": 48855 }, { "epoch": 7.277330950253202, "grad_norm": 2.65586519241333, "learning_rate": 1.0466928249811009e-05, "loss": 0.6552, "num_input_tokens_seen": 28179424, "step": 48860 }, { "epoch": 7.278075662794161, "grad_norm": 3.491702079772949, "learning_rate": 1.0461640808026444e-05, "loss": 0.5292, "num_input_tokens_seen": 28182368, "step": 48865 }, { "epoch": 7.278820375335121, "grad_norm": 1.7431586980819702, "learning_rate": 1.0456354348685085e-05, "loss": 0.5971, "num_input_tokens_seen": 28185504, "step": 48870 }, { "epoch": 7.27956508787608, "grad_norm": 2.4064724445343018, "learning_rate": 1.0451068872144179e-05, "loss": 0.5955, "num_input_tokens_seen": 28188256, "step": 48875 }, { "epoch": 7.280309800417039, "grad_norm": 2.1841046810150146, "learning_rate": 1.0445784378760876e-05, "loss": 0.7898, "num_input_tokens_seen": 28191200, "step": 48880 }, { "epoch": 7.281054512957998, "grad_norm": 2.178584337234497, "learning_rate": 1.0440500868892294e-05, "loss": 0.4693, "num_input_tokens_seen": 28194144, "step": 48885 }, { "epoch": 7.281799225498958, "grad_norm": 3.269829273223877, "learning_rate": 1.0435218342895475e-05, "loss": 0.6992, "num_input_tokens_seen": 28196768, "step": 48890 }, { "epoch": 7.282543938039916, "grad_norm": 2.181236982345581, "learning_rate": 1.0429936801127377e-05, "loss": 0.7536, "num_input_tokens_seen": 28199776, "step": 48895 }, { "epoch": 7.283288650580876, "grad_norm": 2.1840386390686035, "learning_rate": 1.0424656243944913e-05, "loss": 0.566, "num_input_tokens_seen": 28202560, "step": 48900 }, { "epoch": 7.284033363121835, "grad_norm": 2.3395087718963623, "learning_rate": 1.0419376671704928e-05, "loss": 0.781, "num_input_tokens_seen": 28205632, "step": 48905 }, { "epoch": 7.2847780756627944, "grad_norm": 3.949568271636963, "learning_rate": 1.0414098084764178e-05, "loss": 0.5349, "num_input_tokens_seen": 28208320, "step": 48910 }, { "epoch": 7.285522788203753, "grad_norm": 1.4191182851791382, "learning_rate": 1.0408820483479382e-05, "loss": 0.498, "num_input_tokens_seen": 28211584, "step": 48915 }, { "epoch": 7.286267500744713, "grad_norm": 1.5710376501083374, "learning_rate": 1.0403543868207169e-05, "loss": 0.4459, "num_input_tokens_seen": 28214688, "step": 48920 }, { "epoch": 7.287012213285672, "grad_norm": 1.5985480546951294, "learning_rate": 1.0398268239304118e-05, "loss": 0.6002, "num_input_tokens_seen": 28217440, "step": 48925 }, { "epoch": 7.287756925826631, "grad_norm": 2.495802402496338, "learning_rate": 1.0392993597126743e-05, "loss": 0.5919, "num_input_tokens_seen": 28220384, "step": 48930 }, { "epoch": 7.28850163836759, "grad_norm": 3.855829954147339, "learning_rate": 1.0387719942031462e-05, "loss": 0.6672, "num_input_tokens_seen": 28223040, "step": 48935 }, { "epoch": 7.28924635090855, "grad_norm": 1.7577751874923706, "learning_rate": 1.0382447274374667e-05, "loss": 0.2418, "num_input_tokens_seen": 28225632, "step": 48940 }, { "epoch": 7.289991063449508, "grad_norm": 1.8640291690826416, "learning_rate": 1.037717559451265e-05, "loss": 0.5681, "num_input_tokens_seen": 28228640, "step": 48945 }, { "epoch": 7.290735775990468, "grad_norm": 1.9234414100646973, "learning_rate": 1.0371904902801651e-05, "loss": 0.5255, "num_input_tokens_seen": 28231520, "step": 48950 }, { "epoch": 7.291480488531427, "grad_norm": 2.250493049621582, "learning_rate": 1.0366635199597846e-05, "loss": 0.5496, "num_input_tokens_seen": 28234496, "step": 48955 }, { "epoch": 7.292225201072386, "grad_norm": 1.6914328336715698, "learning_rate": 1.0361366485257338e-05, "loss": 0.6925, "num_input_tokens_seen": 28237568, "step": 48960 }, { "epoch": 7.292969913613345, "grad_norm": 2.310418128967285, "learning_rate": 1.0356098760136168e-05, "loss": 0.6338, "num_input_tokens_seen": 28240640, "step": 48965 }, { "epoch": 7.293714626154305, "grad_norm": 1.243419885635376, "learning_rate": 1.0350832024590312e-05, "loss": 0.6104, "num_input_tokens_seen": 28243616, "step": 48970 }, { "epoch": 7.294459338695264, "grad_norm": 1.195979118347168, "learning_rate": 1.0345566278975671e-05, "loss": 0.4212, "num_input_tokens_seen": 28246848, "step": 48975 }, { "epoch": 7.295204051236222, "grad_norm": 4.80444860458374, "learning_rate": 1.0340301523648071e-05, "loss": 0.5271, "num_input_tokens_seen": 28249600, "step": 48980 }, { "epoch": 7.295948763777182, "grad_norm": 2.3693668842315674, "learning_rate": 1.0335037758963296e-05, "loss": 0.4386, "num_input_tokens_seen": 28252448, "step": 48985 }, { "epoch": 7.296693476318141, "grad_norm": 2.7505743503570557, "learning_rate": 1.0329774985277042e-05, "loss": 0.5265, "num_input_tokens_seen": 28255616, "step": 48990 }, { "epoch": 7.2974381888591004, "grad_norm": 3.0442042350769043, "learning_rate": 1.0324513202944947e-05, "loss": 0.5238, "num_input_tokens_seen": 28258720, "step": 48995 }, { "epoch": 7.298182901400059, "grad_norm": 1.9857723712921143, "learning_rate": 1.0319252412322586e-05, "loss": 0.7234, "num_input_tokens_seen": 28262016, "step": 49000 }, { "epoch": 7.298927613941019, "grad_norm": 1.5471709966659546, "learning_rate": 1.0313992613765469e-05, "loss": 0.5698, "num_input_tokens_seen": 28264832, "step": 49005 }, { "epoch": 7.299672326481978, "grad_norm": 3.939481019973755, "learning_rate": 1.0308733807629022e-05, "loss": 0.5396, "num_input_tokens_seen": 28267488, "step": 49010 }, { "epoch": 7.300417039022937, "grad_norm": 2.3710076808929443, "learning_rate": 1.0303475994268606e-05, "loss": 0.6935, "num_input_tokens_seen": 28270432, "step": 49015 }, { "epoch": 7.301161751563896, "grad_norm": 2.693793296813965, "learning_rate": 1.029821917403953e-05, "loss": 0.662, "num_input_tokens_seen": 28273280, "step": 49020 }, { "epoch": 7.301906464104856, "grad_norm": 1.931517243385315, "learning_rate": 1.0292963347297027e-05, "loss": 0.6163, "num_input_tokens_seen": 28276352, "step": 49025 }, { "epoch": 7.302651176645814, "grad_norm": 4.5014495849609375, "learning_rate": 1.0287708514396268e-05, "loss": 0.7067, "num_input_tokens_seen": 28279360, "step": 49030 }, { "epoch": 7.303395889186774, "grad_norm": 1.8205229043960571, "learning_rate": 1.0282454675692354e-05, "loss": 0.5038, "num_input_tokens_seen": 28282176, "step": 49035 }, { "epoch": 7.304140601727733, "grad_norm": 4.995928764343262, "learning_rate": 1.0277201831540323e-05, "loss": 0.5562, "num_input_tokens_seen": 28285248, "step": 49040 }, { "epoch": 7.3048853142686925, "grad_norm": 1.5265522003173828, "learning_rate": 1.0271949982295123e-05, "loss": 0.5182, "num_input_tokens_seen": 28288032, "step": 49045 }, { "epoch": 7.305630026809651, "grad_norm": 2.3881747722625732, "learning_rate": 1.0266699128311675e-05, "loss": 0.5829, "num_input_tokens_seen": 28291072, "step": 49050 }, { "epoch": 7.306374739350611, "grad_norm": 1.7735563516616821, "learning_rate": 1.0261449269944786e-05, "loss": 0.3684, "num_input_tokens_seen": 28293856, "step": 49055 }, { "epoch": 7.30711945189157, "grad_norm": 2.119148015975952, "learning_rate": 1.0256200407549239e-05, "loss": 0.503, "num_input_tokens_seen": 28296608, "step": 49060 }, { "epoch": 7.307864164432529, "grad_norm": 2.5384521484375, "learning_rate": 1.0250952541479719e-05, "loss": 0.5796, "num_input_tokens_seen": 28299264, "step": 49065 }, { "epoch": 7.308608876973488, "grad_norm": 4.212956428527832, "learning_rate": 1.0245705672090872e-05, "loss": 0.7759, "num_input_tokens_seen": 28302304, "step": 49070 }, { "epoch": 7.309353589514448, "grad_norm": 1.8511489629745483, "learning_rate": 1.024045979973724e-05, "loss": 0.5656, "num_input_tokens_seen": 28305024, "step": 49075 }, { "epoch": 7.3100983020554064, "grad_norm": 1.7531930208206177, "learning_rate": 1.0235214924773326e-05, "loss": 0.5954, "num_input_tokens_seen": 28307904, "step": 49080 }, { "epoch": 7.310843014596366, "grad_norm": 1.7319945096969604, "learning_rate": 1.0229971047553557e-05, "loss": 0.4362, "num_input_tokens_seen": 28310880, "step": 49085 }, { "epoch": 7.311587727137325, "grad_norm": 1.583436131477356, "learning_rate": 1.0224728168432307e-05, "loss": 0.625, "num_input_tokens_seen": 28314112, "step": 49090 }, { "epoch": 7.3123324396782845, "grad_norm": 1.4493931531906128, "learning_rate": 1.0219486287763844e-05, "loss": 0.5074, "num_input_tokens_seen": 28317088, "step": 49095 }, { "epoch": 7.313077152219243, "grad_norm": 0.8224689364433289, "learning_rate": 1.0214245405902406e-05, "loss": 0.5877, "num_input_tokens_seen": 28319872, "step": 49100 }, { "epoch": 7.313821864760203, "grad_norm": 2.7938411235809326, "learning_rate": 1.0209005523202155e-05, "loss": 0.715, "num_input_tokens_seen": 28322784, "step": 49105 }, { "epoch": 7.314566577301162, "grad_norm": 3.4354822635650635, "learning_rate": 1.0203766640017167e-05, "loss": 0.653, "num_input_tokens_seen": 28325536, "step": 49110 }, { "epoch": 7.315311289842121, "grad_norm": 4.285120010375977, "learning_rate": 1.0198528756701475e-05, "loss": 0.6899, "num_input_tokens_seen": 28328224, "step": 49115 }, { "epoch": 7.31605600238308, "grad_norm": 1.6595611572265625, "learning_rate": 1.019329187360903e-05, "loss": 0.688, "num_input_tokens_seen": 28331328, "step": 49120 }, { "epoch": 7.31680071492404, "grad_norm": 1.3046954870224, "learning_rate": 1.0188055991093717e-05, "loss": 0.5729, "num_input_tokens_seen": 28334048, "step": 49125 }, { "epoch": 7.3175454274649985, "grad_norm": 2.1833178997039795, "learning_rate": 1.0182821109509364e-05, "loss": 0.5789, "num_input_tokens_seen": 28337152, "step": 49130 }, { "epoch": 7.318290140005958, "grad_norm": 1.534040927886963, "learning_rate": 1.0177587229209726e-05, "loss": 0.6808, "num_input_tokens_seen": 28339776, "step": 49135 }, { "epoch": 7.319034852546917, "grad_norm": 1.3040956258773804, "learning_rate": 1.0172354350548477e-05, "loss": 0.4709, "num_input_tokens_seen": 28342848, "step": 49140 }, { "epoch": 7.319779565087876, "grad_norm": 4.018394947052002, "learning_rate": 1.0167122473879228e-05, "loss": 0.6485, "num_input_tokens_seen": 28345920, "step": 49145 }, { "epoch": 7.320524277628835, "grad_norm": 2.746366024017334, "learning_rate": 1.0161891599555536e-05, "loss": 0.565, "num_input_tokens_seen": 28348960, "step": 49150 }, { "epoch": 7.321268990169794, "grad_norm": 3.18854022026062, "learning_rate": 1.0156661727930886e-05, "loss": 0.6475, "num_input_tokens_seen": 28351776, "step": 49155 }, { "epoch": 7.322013702710754, "grad_norm": 1.529166579246521, "learning_rate": 1.0151432859358684e-05, "loss": 0.5768, "num_input_tokens_seen": 28354496, "step": 49160 }, { "epoch": 7.3227584152517124, "grad_norm": 1.1442185640335083, "learning_rate": 1.0146204994192283e-05, "loss": 0.4679, "num_input_tokens_seen": 28357472, "step": 49165 }, { "epoch": 7.323503127792672, "grad_norm": 3.2079811096191406, "learning_rate": 1.0140978132784962e-05, "loss": 0.7977, "num_input_tokens_seen": 28360640, "step": 49170 }, { "epoch": 7.324247840333631, "grad_norm": 1.7221299409866333, "learning_rate": 1.013575227548993e-05, "loss": 0.514, "num_input_tokens_seen": 28363552, "step": 49175 }, { "epoch": 7.3249925528745905, "grad_norm": 4.516321182250977, "learning_rate": 1.0130527422660313e-05, "loss": 0.6019, "num_input_tokens_seen": 28366432, "step": 49180 }, { "epoch": 7.325737265415549, "grad_norm": 1.6542085409164429, "learning_rate": 1.01253035746492e-05, "loss": 0.5309, "num_input_tokens_seen": 28369408, "step": 49185 }, { "epoch": 7.326481977956509, "grad_norm": 3.0183629989624023, "learning_rate": 1.0120080731809595e-05, "loss": 0.6912, "num_input_tokens_seen": 28372416, "step": 49190 }, { "epoch": 7.327226690497468, "grad_norm": 2.8701207637786865, "learning_rate": 1.0114858894494437e-05, "loss": 0.5626, "num_input_tokens_seen": 28375360, "step": 49195 }, { "epoch": 7.327971403038427, "grad_norm": 7.391435623168945, "learning_rate": 1.0109638063056595e-05, "loss": 0.706, "num_input_tokens_seen": 28378272, "step": 49200 }, { "epoch": 7.328716115579386, "grad_norm": 3.3790647983551025, "learning_rate": 1.0104418237848883e-05, "loss": 0.6185, "num_input_tokens_seen": 28380992, "step": 49205 }, { "epoch": 7.329460828120346, "grad_norm": 1.5103927850723267, "learning_rate": 1.0099199419224018e-05, "loss": 0.6297, "num_input_tokens_seen": 28383712, "step": 49210 }, { "epoch": 7.3302055406613045, "grad_norm": 2.3483307361602783, "learning_rate": 1.0093981607534683e-05, "loss": 0.4725, "num_input_tokens_seen": 28386496, "step": 49215 }, { "epoch": 7.330950253202264, "grad_norm": 1.4250085353851318, "learning_rate": 1.0088764803133454e-05, "loss": 0.567, "num_input_tokens_seen": 28389184, "step": 49220 }, { "epoch": 7.331694965743223, "grad_norm": 2.9383175373077393, "learning_rate": 1.0083549006372881e-05, "loss": 0.6352, "num_input_tokens_seen": 28391904, "step": 49225 }, { "epoch": 7.3324396782841825, "grad_norm": 1.618048071861267, "learning_rate": 1.0078334217605418e-05, "loss": 0.4303, "num_input_tokens_seen": 28394720, "step": 49230 }, { "epoch": 7.333184390825141, "grad_norm": 1.6994801759719849, "learning_rate": 1.007312043718347e-05, "loss": 0.4528, "num_input_tokens_seen": 28397408, "step": 49235 }, { "epoch": 7.333929103366101, "grad_norm": 0.9491127729415894, "learning_rate": 1.0067907665459344e-05, "loss": 0.535, "num_input_tokens_seen": 28400192, "step": 49240 }, { "epoch": 7.33467381590706, "grad_norm": 2.0442841053009033, "learning_rate": 1.006269590278531e-05, "loss": 0.8342, "num_input_tokens_seen": 28403040, "step": 49245 }, { "epoch": 7.335418528448019, "grad_norm": 3.1449873447418213, "learning_rate": 1.0057485149513557e-05, "loss": 0.5306, "num_input_tokens_seen": 28405792, "step": 49250 }, { "epoch": 7.336163240988978, "grad_norm": 6.202351093292236, "learning_rate": 1.0052275405996214e-05, "loss": 0.7045, "num_input_tokens_seen": 28408992, "step": 49255 }, { "epoch": 7.336907953529938, "grad_norm": 2.915959596633911, "learning_rate": 1.0047066672585317e-05, "loss": 0.7897, "num_input_tokens_seen": 28411936, "step": 49260 }, { "epoch": 7.3376526660708965, "grad_norm": 3.557147264480591, "learning_rate": 1.004185894963286e-05, "loss": 0.4985, "num_input_tokens_seen": 28414848, "step": 49265 }, { "epoch": 7.338397378611856, "grad_norm": 4.867717742919922, "learning_rate": 1.0036652237490768e-05, "loss": 0.5536, "num_input_tokens_seen": 28417728, "step": 49270 }, { "epoch": 7.339142091152815, "grad_norm": 4.284099578857422, "learning_rate": 1.0031446536510875e-05, "loss": 0.7974, "num_input_tokens_seen": 28420768, "step": 49275 }, { "epoch": 7.3398868036937746, "grad_norm": 2.959585666656494, "learning_rate": 1.0026241847044964e-05, "loss": 0.4627, "num_input_tokens_seen": 28423808, "step": 49280 }, { "epoch": 7.340631516234733, "grad_norm": 3.408552646636963, "learning_rate": 1.0021038169444752e-05, "loss": 0.5556, "num_input_tokens_seen": 28426560, "step": 49285 }, { "epoch": 7.341376228775693, "grad_norm": 3.7864816188812256, "learning_rate": 1.0015835504061879e-05, "loss": 0.7696, "num_input_tokens_seen": 28429184, "step": 49290 }, { "epoch": 7.342120941316652, "grad_norm": 1.7059420347213745, "learning_rate": 1.0010633851247933e-05, "loss": 0.6938, "num_input_tokens_seen": 28431808, "step": 49295 }, { "epoch": 7.342865653857611, "grad_norm": 1.896957278251648, "learning_rate": 1.0005433211354398e-05, "loss": 0.5892, "num_input_tokens_seen": 28434624, "step": 49300 }, { "epoch": 7.34361036639857, "grad_norm": 2.3537914752960205, "learning_rate": 1.0000233584732732e-05, "loss": 0.7125, "num_input_tokens_seen": 28437344, "step": 49305 }, { "epoch": 7.344355078939529, "grad_norm": 4.348743915557861, "learning_rate": 9.995034971734288e-06, "loss": 0.651, "num_input_tokens_seen": 28440224, "step": 49310 }, { "epoch": 7.3450997914804885, "grad_norm": 1.3280669450759888, "learning_rate": 9.989837372710374e-06, "loss": 0.6227, "num_input_tokens_seen": 28443008, "step": 49315 }, { "epoch": 7.345844504021448, "grad_norm": 1.8150795698165894, "learning_rate": 9.984640788012222e-06, "loss": 0.6914, "num_input_tokens_seen": 28447456, "step": 49320 }, { "epoch": 7.346589216562407, "grad_norm": 2.260078191757202, "learning_rate": 9.979445217991001e-06, "loss": 0.5872, "num_input_tokens_seen": 28450464, "step": 49325 }, { "epoch": 7.347333929103366, "grad_norm": 5.033985614776611, "learning_rate": 9.9742506629978e-06, "loss": 0.6107, "num_input_tokens_seen": 28453376, "step": 49330 }, { "epoch": 7.348078641644325, "grad_norm": 1.7804937362670898, "learning_rate": 9.96905712338366e-06, "loss": 0.4916, "num_input_tokens_seen": 28456160, "step": 49335 }, { "epoch": 7.348823354185284, "grad_norm": 3.4943597316741943, "learning_rate": 9.963864599499528e-06, "loss": 0.5756, "num_input_tokens_seen": 28458848, "step": 49340 }, { "epoch": 7.349568066726244, "grad_norm": 6.899401664733887, "learning_rate": 9.958673091696286e-06, "loss": 0.6365, "num_input_tokens_seen": 28462080, "step": 49345 }, { "epoch": 7.3503127792672025, "grad_norm": 2.536642551422119, "learning_rate": 9.95348260032476e-06, "loss": 0.7811, "num_input_tokens_seen": 28464864, "step": 49350 }, { "epoch": 7.351057491808162, "grad_norm": 5.456904411315918, "learning_rate": 9.948293125735705e-06, "loss": 0.6193, "num_input_tokens_seen": 28467904, "step": 49355 }, { "epoch": 7.351802204349121, "grad_norm": 3.9507663249969482, "learning_rate": 9.943104668279807e-06, "loss": 0.6014, "num_input_tokens_seen": 28471072, "step": 49360 }, { "epoch": 7.3525469168900806, "grad_norm": 5.352436542510986, "learning_rate": 9.937917228307678e-06, "loss": 0.4967, "num_input_tokens_seen": 28473856, "step": 49365 }, { "epoch": 7.353291629431039, "grad_norm": 1.9579931497573853, "learning_rate": 9.932730806169873e-06, "loss": 0.7856, "num_input_tokens_seen": 28476608, "step": 49370 }, { "epoch": 7.354036341971999, "grad_norm": 1.890683650970459, "learning_rate": 9.927545402216862e-06, "loss": 0.7222, "num_input_tokens_seen": 28479808, "step": 49375 }, { "epoch": 7.354781054512958, "grad_norm": 1.6147823333740234, "learning_rate": 9.922361016799045e-06, "loss": 0.5501, "num_input_tokens_seen": 28482752, "step": 49380 }, { "epoch": 7.355525767053917, "grad_norm": 1.217464804649353, "learning_rate": 9.917177650266768e-06, "loss": 0.4952, "num_input_tokens_seen": 28485536, "step": 49385 }, { "epoch": 7.356270479594876, "grad_norm": 2.64223313331604, "learning_rate": 9.911995302970301e-06, "loss": 0.5096, "num_input_tokens_seen": 28488608, "step": 49390 }, { "epoch": 7.357015192135836, "grad_norm": 1.837864637374878, "learning_rate": 9.90681397525985e-06, "loss": 0.5167, "num_input_tokens_seen": 28491328, "step": 49395 }, { "epoch": 7.3577599046767945, "grad_norm": 1.753800868988037, "learning_rate": 9.901633667485554e-06, "loss": 0.4934, "num_input_tokens_seen": 28494176, "step": 49400 }, { "epoch": 7.358504617217754, "grad_norm": 3.5347471237182617, "learning_rate": 9.89645437999746e-06, "loss": 0.5156, "num_input_tokens_seen": 28496864, "step": 49405 }, { "epoch": 7.359249329758713, "grad_norm": 2.343994140625, "learning_rate": 9.891276113145576e-06, "loss": 0.555, "num_input_tokens_seen": 28499488, "step": 49410 }, { "epoch": 7.359994042299673, "grad_norm": 2.241030693054199, "learning_rate": 9.886098867279831e-06, "loss": 0.457, "num_input_tokens_seen": 28502400, "step": 49415 }, { "epoch": 7.360738754840631, "grad_norm": 1.4964053630828857, "learning_rate": 9.880922642750068e-06, "loss": 0.4785, "num_input_tokens_seen": 28505088, "step": 49420 }, { "epoch": 7.361483467381591, "grad_norm": 1.6146976947784424, "learning_rate": 9.87574743990608e-06, "loss": 0.539, "num_input_tokens_seen": 28507936, "step": 49425 }, { "epoch": 7.36222817992255, "grad_norm": 3.990957260131836, "learning_rate": 9.870573259097593e-06, "loss": 0.5728, "num_input_tokens_seen": 28510848, "step": 49430 }, { "epoch": 7.362972892463509, "grad_norm": 1.4144692420959473, "learning_rate": 9.86540010067426e-06, "loss": 0.68, "num_input_tokens_seen": 28513536, "step": 49435 }, { "epoch": 7.363717605004468, "grad_norm": 2.1312077045440674, "learning_rate": 9.86022796498565e-06, "loss": 0.7743, "num_input_tokens_seen": 28516416, "step": 49440 }, { "epoch": 7.364462317545428, "grad_norm": 2.078946113586426, "learning_rate": 9.855056852381275e-06, "loss": 0.6364, "num_input_tokens_seen": 28519328, "step": 49445 }, { "epoch": 7.3652070300863866, "grad_norm": 3.786608934402466, "learning_rate": 9.849886763210586e-06, "loss": 0.5589, "num_input_tokens_seen": 28522496, "step": 49450 }, { "epoch": 7.365951742627346, "grad_norm": 2.734779119491577, "learning_rate": 9.844717697822965e-06, "loss": 0.5914, "num_input_tokens_seen": 28525440, "step": 49455 }, { "epoch": 7.366696455168305, "grad_norm": 3.3899803161621094, "learning_rate": 9.839549656567693e-06, "loss": 0.609, "num_input_tokens_seen": 28528224, "step": 49460 }, { "epoch": 7.367441167709265, "grad_norm": 2.570751905441284, "learning_rate": 9.834382639794015e-06, "loss": 0.4842, "num_input_tokens_seen": 28531104, "step": 49465 }, { "epoch": 7.368185880250223, "grad_norm": 2.8650364875793457, "learning_rate": 9.829216647851111e-06, "loss": 0.5337, "num_input_tokens_seen": 28533696, "step": 49470 }, { "epoch": 7.368930592791183, "grad_norm": 2.7839560508728027, "learning_rate": 9.824051681088058e-06, "loss": 0.6091, "num_input_tokens_seen": 28536768, "step": 49475 }, { "epoch": 7.369675305332142, "grad_norm": 6.800547122955322, "learning_rate": 9.81888773985389e-06, "loss": 0.7003, "num_input_tokens_seen": 28539392, "step": 49480 }, { "epoch": 7.370420017873101, "grad_norm": 1.5993696451187134, "learning_rate": 9.81372482449757e-06, "loss": 0.5284, "num_input_tokens_seen": 28542272, "step": 49485 }, { "epoch": 7.37116473041406, "grad_norm": 3.5803420543670654, "learning_rate": 9.80856293536798e-06, "loss": 0.6048, "num_input_tokens_seen": 28544928, "step": 49490 }, { "epoch": 7.371909442955019, "grad_norm": 7.2950944900512695, "learning_rate": 9.803402072813953e-06, "loss": 0.6507, "num_input_tokens_seen": 28547712, "step": 49495 }, { "epoch": 7.372654155495979, "grad_norm": 2.2787134647369385, "learning_rate": 9.798242237184218e-06, "loss": 0.6032, "num_input_tokens_seen": 28550752, "step": 49500 }, { "epoch": 7.373398868036937, "grad_norm": 2.117373466491699, "learning_rate": 9.793083428827477e-06, "loss": 0.4493, "num_input_tokens_seen": 28553632, "step": 49505 }, { "epoch": 7.374143580577897, "grad_norm": 4.665881633758545, "learning_rate": 9.787925648092321e-06, "loss": 0.6027, "num_input_tokens_seen": 28556608, "step": 49510 }, { "epoch": 7.374888293118856, "grad_norm": 2.8042800426483154, "learning_rate": 9.782768895327305e-06, "loss": 0.7364, "num_input_tokens_seen": 28559552, "step": 49515 }, { "epoch": 7.375633005659815, "grad_norm": 2.581315517425537, "learning_rate": 9.777613170880898e-06, "loss": 0.6936, "num_input_tokens_seen": 28562272, "step": 49520 }, { "epoch": 7.376377718200774, "grad_norm": 3.2458391189575195, "learning_rate": 9.7724584751015e-06, "loss": 0.6744, "num_input_tokens_seen": 28565152, "step": 49525 }, { "epoch": 7.377122430741734, "grad_norm": 2.1199874877929688, "learning_rate": 9.767304808337451e-06, "loss": 0.5843, "num_input_tokens_seen": 28568096, "step": 49530 }, { "epoch": 7.3778671432826926, "grad_norm": 4.713409423828125, "learning_rate": 9.76215217093702e-06, "loss": 0.5602, "num_input_tokens_seen": 28570688, "step": 49535 }, { "epoch": 7.378611855823652, "grad_norm": 2.0379528999328613, "learning_rate": 9.757000563248389e-06, "loss": 0.5267, "num_input_tokens_seen": 28573856, "step": 49540 }, { "epoch": 7.379356568364611, "grad_norm": 2.3797359466552734, "learning_rate": 9.751849985619682e-06, "loss": 0.5329, "num_input_tokens_seen": 28576768, "step": 49545 }, { "epoch": 7.380101280905571, "grad_norm": 2.2862346172332764, "learning_rate": 9.746700438398957e-06, "loss": 0.6408, "num_input_tokens_seen": 28579584, "step": 49550 }, { "epoch": 7.380845993446529, "grad_norm": 2.131462812423706, "learning_rate": 9.7415519219342e-06, "loss": 0.6429, "num_input_tokens_seen": 28582336, "step": 49555 }, { "epoch": 7.381590705987489, "grad_norm": 2.9367456436157227, "learning_rate": 9.736404436573327e-06, "loss": 0.5168, "num_input_tokens_seen": 28585056, "step": 49560 }, { "epoch": 7.382335418528448, "grad_norm": 2.9761881828308105, "learning_rate": 9.731257982664196e-06, "loss": 0.5407, "num_input_tokens_seen": 28587680, "step": 49565 }, { "epoch": 7.383080131069407, "grad_norm": 2.438027858734131, "learning_rate": 9.726112560554562e-06, "loss": 0.4933, "num_input_tokens_seen": 28590272, "step": 49570 }, { "epoch": 7.383824843610366, "grad_norm": 3.7347898483276367, "learning_rate": 9.72096817059215e-06, "loss": 0.8296, "num_input_tokens_seen": 28592832, "step": 49575 }, { "epoch": 7.384569556151326, "grad_norm": 3.7553086280822754, "learning_rate": 9.715824813124582e-06, "loss": 0.6594, "num_input_tokens_seen": 28595648, "step": 49580 }, { "epoch": 7.385314268692285, "grad_norm": 1.842883586883545, "learning_rate": 9.710682488499434e-06, "loss": 0.7283, "num_input_tokens_seen": 28598688, "step": 49585 }, { "epoch": 7.386058981233244, "grad_norm": 2.591442108154297, "learning_rate": 9.7055411970642e-06, "loss": 0.5593, "num_input_tokens_seen": 28601600, "step": 49590 }, { "epoch": 7.386803693774203, "grad_norm": 1.350647211074829, "learning_rate": 9.700400939166308e-06, "loss": 0.484, "num_input_tokens_seen": 28604480, "step": 49595 }, { "epoch": 7.387548406315163, "grad_norm": 3.225618600845337, "learning_rate": 9.695261715153126e-06, "loss": 0.6994, "num_input_tokens_seen": 28607328, "step": 49600 }, { "epoch": 7.388293118856121, "grad_norm": 3.588383436203003, "learning_rate": 9.690123525371925e-06, "loss": 0.3941, "num_input_tokens_seen": 28610240, "step": 49605 }, { "epoch": 7.389037831397081, "grad_norm": 1.6162800788879395, "learning_rate": 9.68498637016993e-06, "loss": 0.564, "num_input_tokens_seen": 28613088, "step": 49610 }, { "epoch": 7.38978254393804, "grad_norm": 3.0353338718414307, "learning_rate": 9.679850249894298e-06, "loss": 0.5793, "num_input_tokens_seen": 28616192, "step": 49615 }, { "epoch": 7.390527256478999, "grad_norm": 2.5333430767059326, "learning_rate": 9.67471516489209e-06, "loss": 0.5295, "num_input_tokens_seen": 28619136, "step": 49620 }, { "epoch": 7.391271969019958, "grad_norm": 2.4404401779174805, "learning_rate": 9.669581115510323e-06, "loss": 0.542, "num_input_tokens_seen": 28622016, "step": 49625 }, { "epoch": 7.392016681560918, "grad_norm": 4.229584217071533, "learning_rate": 9.664448102095939e-06, "loss": 0.7411, "num_input_tokens_seen": 28625088, "step": 49630 }, { "epoch": 7.392761394101877, "grad_norm": 3.7382819652557373, "learning_rate": 9.659316124995806e-06, "loss": 0.6899, "num_input_tokens_seen": 28627872, "step": 49635 }, { "epoch": 7.393506106642836, "grad_norm": 2.5441272258758545, "learning_rate": 9.654185184556713e-06, "loss": 0.4667, "num_input_tokens_seen": 28630656, "step": 49640 }, { "epoch": 7.394250819183795, "grad_norm": 1.5201809406280518, "learning_rate": 9.649055281125394e-06, "loss": 0.4889, "num_input_tokens_seen": 28633568, "step": 49645 }, { "epoch": 7.394995531724755, "grad_norm": 2.3140225410461426, "learning_rate": 9.643926415048504e-06, "loss": 0.668, "num_input_tokens_seen": 28636640, "step": 49650 }, { "epoch": 7.395740244265713, "grad_norm": 4.692172527313232, "learning_rate": 9.638798586672645e-06, "loss": 0.5584, "num_input_tokens_seen": 28639264, "step": 49655 }, { "epoch": 7.396484956806672, "grad_norm": 1.7118287086486816, "learning_rate": 9.633671796344312e-06, "loss": 0.5832, "num_input_tokens_seen": 28642176, "step": 49660 }, { "epoch": 7.397229669347632, "grad_norm": 5.401758193969727, "learning_rate": 9.628546044409966e-06, "loss": 0.5321, "num_input_tokens_seen": 28645088, "step": 49665 }, { "epoch": 7.3979743818885915, "grad_norm": 3.0255038738250732, "learning_rate": 9.623421331215992e-06, "loss": 0.6016, "num_input_tokens_seen": 28647904, "step": 49670 }, { "epoch": 7.39871909442955, "grad_norm": 3.4984426498413086, "learning_rate": 9.618297657108676e-06, "loss": 0.6525, "num_input_tokens_seen": 28650688, "step": 49675 }, { "epoch": 7.399463806970509, "grad_norm": 1.7759418487548828, "learning_rate": 9.61317502243427e-06, "loss": 0.6407, "num_input_tokens_seen": 28653536, "step": 49680 }, { "epoch": 7.400208519511469, "grad_norm": 3.264037609100342, "learning_rate": 9.608053427538938e-06, "loss": 0.5496, "num_input_tokens_seen": 28656480, "step": 49685 }, { "epoch": 7.400953232052427, "grad_norm": 2.139171600341797, "learning_rate": 9.602932872768775e-06, "loss": 0.5474, "num_input_tokens_seen": 28659264, "step": 49690 }, { "epoch": 7.401697944593387, "grad_norm": 1.687490463256836, "learning_rate": 9.597813358469817e-06, "loss": 0.6494, "num_input_tokens_seen": 28661920, "step": 49695 }, { "epoch": 7.402442657134346, "grad_norm": 2.0499370098114014, "learning_rate": 9.592694884988001e-06, "loss": 0.7131, "num_input_tokens_seen": 28664928, "step": 49700 }, { "epoch": 7.403187369675305, "grad_norm": 2.8674509525299072, "learning_rate": 9.587577452669235e-06, "loss": 0.4959, "num_input_tokens_seen": 28667648, "step": 49705 }, { "epoch": 7.403932082216264, "grad_norm": 2.3164050579071045, "learning_rate": 9.582461061859313e-06, "loss": 0.6214, "num_input_tokens_seen": 28670752, "step": 49710 }, { "epoch": 7.404676794757224, "grad_norm": 3.6175849437713623, "learning_rate": 9.577345712903988e-06, "loss": 0.6337, "num_input_tokens_seen": 28673920, "step": 49715 }, { "epoch": 7.405421507298183, "grad_norm": 3.8792941570281982, "learning_rate": 9.572231406148938e-06, "loss": 0.5418, "num_input_tokens_seen": 28676928, "step": 49720 }, { "epoch": 7.406166219839142, "grad_norm": 1.0020020008087158, "learning_rate": 9.567118141939763e-06, "loss": 0.445, "num_input_tokens_seen": 28679712, "step": 49725 }, { "epoch": 7.406910932380101, "grad_norm": 2.4831533432006836, "learning_rate": 9.562005920622009e-06, "loss": 0.5564, "num_input_tokens_seen": 28682816, "step": 49730 }, { "epoch": 7.407655644921061, "grad_norm": 1.5305055379867554, "learning_rate": 9.556894742541117e-06, "loss": 0.6525, "num_input_tokens_seen": 28685792, "step": 49735 }, { "epoch": 7.408400357462019, "grad_norm": 1.7801984548568726, "learning_rate": 9.551784608042501e-06, "loss": 0.7623, "num_input_tokens_seen": 28688704, "step": 49740 }, { "epoch": 7.409145070002979, "grad_norm": 3.8683130741119385, "learning_rate": 9.546675517471465e-06, "loss": 0.7269, "num_input_tokens_seen": 28691296, "step": 49745 }, { "epoch": 7.409889782543938, "grad_norm": 1.704909324645996, "learning_rate": 9.541567471173268e-06, "loss": 0.5085, "num_input_tokens_seen": 28694304, "step": 49750 }, { "epoch": 7.4106344950848975, "grad_norm": 2.5444488525390625, "learning_rate": 9.536460469493095e-06, "loss": 0.6291, "num_input_tokens_seen": 28697088, "step": 49755 }, { "epoch": 7.411379207625856, "grad_norm": 1.5083138942718506, "learning_rate": 9.53135451277605e-06, "loss": 0.3379, "num_input_tokens_seen": 28700032, "step": 49760 }, { "epoch": 7.412123920166816, "grad_norm": 2.3120834827423096, "learning_rate": 9.526249601367185e-06, "loss": 0.5529, "num_input_tokens_seen": 28702688, "step": 49765 }, { "epoch": 7.412868632707775, "grad_norm": 2.947371482849121, "learning_rate": 9.521145735611453e-06, "loss": 0.7426, "num_input_tokens_seen": 28705536, "step": 49770 }, { "epoch": 7.413613345248734, "grad_norm": 2.607551097869873, "learning_rate": 9.51604291585376e-06, "loss": 0.4859, "num_input_tokens_seen": 28708544, "step": 49775 }, { "epoch": 7.414358057789693, "grad_norm": 9.025275230407715, "learning_rate": 9.510941142438939e-06, "loss": 0.6789, "num_input_tokens_seen": 28711424, "step": 49780 }, { "epoch": 7.415102770330653, "grad_norm": 1.6046720743179321, "learning_rate": 9.505840415711737e-06, "loss": 0.522, "num_input_tokens_seen": 28714400, "step": 49785 }, { "epoch": 7.415847482871611, "grad_norm": 2.902069091796875, "learning_rate": 9.500740736016845e-06, "loss": 0.5628, "num_input_tokens_seen": 28717376, "step": 49790 }, { "epoch": 7.416592195412571, "grad_norm": 1.5938243865966797, "learning_rate": 9.495642103698877e-06, "loss": 0.5006, "num_input_tokens_seen": 28720224, "step": 49795 }, { "epoch": 7.41733690795353, "grad_norm": 4.820329666137695, "learning_rate": 9.490544519102387e-06, "loss": 0.6957, "num_input_tokens_seen": 28723040, "step": 49800 }, { "epoch": 7.4180816204944895, "grad_norm": 1.40133798122406, "learning_rate": 9.485447982571832e-06, "loss": 0.4733, "num_input_tokens_seen": 28725760, "step": 49805 }, { "epoch": 7.418826333035448, "grad_norm": 2.846256971359253, "learning_rate": 9.480352494451628e-06, "loss": 0.887, "num_input_tokens_seen": 28728448, "step": 49810 }, { "epoch": 7.419571045576408, "grad_norm": 1.9115561246871948, "learning_rate": 9.475258055086102e-06, "loss": 0.6405, "num_input_tokens_seen": 28731264, "step": 49815 }, { "epoch": 7.420315758117367, "grad_norm": 1.8764798641204834, "learning_rate": 9.470164664819527e-06, "loss": 0.7279, "num_input_tokens_seen": 28734048, "step": 49820 }, { "epoch": 7.421060470658326, "grad_norm": 2.3892197608947754, "learning_rate": 9.465072323996078e-06, "loss": 0.6698, "num_input_tokens_seen": 28737184, "step": 49825 }, { "epoch": 7.421805183199285, "grad_norm": 3.1889724731445312, "learning_rate": 9.459981032959877e-06, "loss": 0.5021, "num_input_tokens_seen": 28740128, "step": 49830 }, { "epoch": 7.422549895740245, "grad_norm": 2.19368839263916, "learning_rate": 9.454890792054987e-06, "loss": 0.5417, "num_input_tokens_seen": 28742784, "step": 49835 }, { "epoch": 7.4232946082812035, "grad_norm": 2.964972972869873, "learning_rate": 9.44980160162537e-06, "loss": 0.4947, "num_input_tokens_seen": 28745792, "step": 49840 }, { "epoch": 7.424039320822162, "grad_norm": 1.6869621276855469, "learning_rate": 9.444713462014934e-06, "loss": 0.6127, "num_input_tokens_seen": 28748704, "step": 49845 }, { "epoch": 7.424784033363122, "grad_norm": 2.3627660274505615, "learning_rate": 9.43962637356752e-06, "loss": 0.4843, "num_input_tokens_seen": 28751552, "step": 49850 }, { "epoch": 7.425528745904081, "grad_norm": 1.681351900100708, "learning_rate": 9.434540336626892e-06, "loss": 0.4701, "num_input_tokens_seen": 28754304, "step": 49855 }, { "epoch": 7.42627345844504, "grad_norm": 2.552806854248047, "learning_rate": 9.429455351536754e-06, "loss": 0.6421, "num_input_tokens_seen": 28757248, "step": 49860 }, { "epoch": 7.427018170985999, "grad_norm": 2.3207459449768066, "learning_rate": 9.424371418640706e-06, "loss": 0.6192, "num_input_tokens_seen": 28760128, "step": 49865 }, { "epoch": 7.427762883526959, "grad_norm": 2.6313154697418213, "learning_rate": 9.419288538282323e-06, "loss": 0.8782, "num_input_tokens_seen": 28763264, "step": 49870 }, { "epoch": 7.428507596067917, "grad_norm": 4.310498237609863, "learning_rate": 9.414206710805062e-06, "loss": 0.4589, "num_input_tokens_seen": 28765984, "step": 49875 }, { "epoch": 7.429252308608877, "grad_norm": 2.715691089630127, "learning_rate": 9.409125936552349e-06, "loss": 0.4857, "num_input_tokens_seen": 28768576, "step": 49880 }, { "epoch": 7.429997021149836, "grad_norm": 3.497896194458008, "learning_rate": 9.404046215867515e-06, "loss": 0.5514, "num_input_tokens_seen": 28771488, "step": 49885 }, { "epoch": 7.4307417336907955, "grad_norm": 2.381037950515747, "learning_rate": 9.398967549093828e-06, "loss": 0.5126, "num_input_tokens_seen": 28774432, "step": 49890 }, { "epoch": 7.431486446231754, "grad_norm": 1.4065357446670532, "learning_rate": 9.393889936574496e-06, "loss": 0.4204, "num_input_tokens_seen": 28777184, "step": 49895 }, { "epoch": 7.432231158772714, "grad_norm": 2.0422496795654297, "learning_rate": 9.388813378652623e-06, "loss": 0.44, "num_input_tokens_seen": 28780032, "step": 49900 }, { "epoch": 7.432975871313673, "grad_norm": 4.568426609039307, "learning_rate": 9.383737875671278e-06, "loss": 0.7239, "num_input_tokens_seen": 28782848, "step": 49905 }, { "epoch": 7.433720583854632, "grad_norm": 3.253424882888794, "learning_rate": 9.378663427973428e-06, "loss": 0.576, "num_input_tokens_seen": 28785760, "step": 49910 }, { "epoch": 7.434465296395591, "grad_norm": 2.3463480472564697, "learning_rate": 9.373590035901993e-06, "loss": 0.4503, "num_input_tokens_seen": 28788896, "step": 49915 }, { "epoch": 7.435210008936551, "grad_norm": 4.841905117034912, "learning_rate": 9.368517699799812e-06, "loss": 0.6635, "num_input_tokens_seen": 28791584, "step": 49920 }, { "epoch": 7.4359547214775095, "grad_norm": 2.3942108154296875, "learning_rate": 9.36344642000965e-06, "loss": 0.5683, "num_input_tokens_seen": 28794400, "step": 49925 }, { "epoch": 7.436699434018469, "grad_norm": 3.498375415802002, "learning_rate": 9.358376196874214e-06, "loss": 0.5228, "num_input_tokens_seen": 28797536, "step": 49930 }, { "epoch": 7.437444146559428, "grad_norm": 4.515678882598877, "learning_rate": 9.353307030736113e-06, "loss": 0.5119, "num_input_tokens_seen": 28800960, "step": 49935 }, { "epoch": 7.4381888591003875, "grad_norm": 4.8111677169799805, "learning_rate": 9.348238921937916e-06, "loss": 0.6107, "num_input_tokens_seen": 28804000, "step": 49940 }, { "epoch": 7.438933571641346, "grad_norm": 3.1009883880615234, "learning_rate": 9.34317187082209e-06, "loss": 0.5609, "num_input_tokens_seen": 28806784, "step": 49945 }, { "epoch": 7.439678284182306, "grad_norm": 2.040452241897583, "learning_rate": 9.338105877731051e-06, "loss": 0.4902, "num_input_tokens_seen": 28809728, "step": 49950 }, { "epoch": 7.440422996723265, "grad_norm": 4.229333877563477, "learning_rate": 9.33304094300714e-06, "loss": 0.5454, "num_input_tokens_seen": 28812608, "step": 49955 }, { "epoch": 7.441167709264224, "grad_norm": 2.680877447128296, "learning_rate": 9.327977066992627e-06, "loss": 0.5495, "num_input_tokens_seen": 28815648, "step": 49960 }, { "epoch": 7.441912421805183, "grad_norm": 1.9723509550094604, "learning_rate": 9.322914250029713e-06, "loss": 0.4988, "num_input_tokens_seen": 28818336, "step": 49965 }, { "epoch": 7.442657134346143, "grad_norm": 2.805241584777832, "learning_rate": 9.317852492460508e-06, "loss": 0.6966, "num_input_tokens_seen": 28821056, "step": 49970 }, { "epoch": 7.4434018468871015, "grad_norm": 1.7168506383895874, "learning_rate": 9.312791794627072e-06, "loss": 0.568, "num_input_tokens_seen": 28824032, "step": 49975 }, { "epoch": 7.444146559428061, "grad_norm": 0.9199472069740295, "learning_rate": 9.307732156871393e-06, "loss": 0.6713, "num_input_tokens_seen": 28827072, "step": 49980 }, { "epoch": 7.44489127196902, "grad_norm": 1.5076146125793457, "learning_rate": 9.30267357953537e-06, "loss": 0.3706, "num_input_tokens_seen": 28830048, "step": 49985 }, { "epoch": 7.4456359845099795, "grad_norm": 3.0324831008911133, "learning_rate": 9.297616062960843e-06, "loss": 0.6611, "num_input_tokens_seen": 28832928, "step": 49990 }, { "epoch": 7.446380697050938, "grad_norm": 3.2663450241088867, "learning_rate": 9.292559607489585e-06, "loss": 0.6918, "num_input_tokens_seen": 28835808, "step": 49995 }, { "epoch": 7.447125409591898, "grad_norm": 2.7032618522644043, "learning_rate": 9.287504213463292e-06, "loss": 0.5482, "num_input_tokens_seen": 28838656, "step": 50000 }, { "epoch": 7.447870122132857, "grad_norm": 2.8412554264068604, "learning_rate": 9.282449881223573e-06, "loss": 0.5318, "num_input_tokens_seen": 28841536, "step": 50005 }, { "epoch": 7.4486148346738155, "grad_norm": 3.0068228244781494, "learning_rate": 9.27739661111199e-06, "loss": 0.6093, "num_input_tokens_seen": 28844416, "step": 50010 }, { "epoch": 7.449359547214775, "grad_norm": 3.199660539627075, "learning_rate": 9.27234440347002e-06, "loss": 0.6848, "num_input_tokens_seen": 28847328, "step": 50015 }, { "epoch": 7.450104259755734, "grad_norm": 3.8047239780426025, "learning_rate": 9.267293258639082e-06, "loss": 0.6685, "num_input_tokens_seen": 28850528, "step": 50020 }, { "epoch": 7.4508489722966935, "grad_norm": 2.343045473098755, "learning_rate": 9.262243176960489e-06, "loss": 0.5826, "num_input_tokens_seen": 28853376, "step": 50025 }, { "epoch": 7.451593684837652, "grad_norm": 1.8989135026931763, "learning_rate": 9.257194158775517e-06, "loss": 0.4762, "num_input_tokens_seen": 28856384, "step": 50030 }, { "epoch": 7.452338397378612, "grad_norm": 2.9245758056640625, "learning_rate": 9.252146204425369e-06, "loss": 0.5721, "num_input_tokens_seen": 28859360, "step": 50035 }, { "epoch": 7.453083109919571, "grad_norm": 1.956524133682251, "learning_rate": 9.247099314251145e-06, "loss": 0.2575, "num_input_tokens_seen": 28862016, "step": 50040 }, { "epoch": 7.45382782246053, "grad_norm": 2.7621631622314453, "learning_rate": 9.242053488593902e-06, "loss": 0.4489, "num_input_tokens_seen": 28864832, "step": 50045 }, { "epoch": 7.454572535001489, "grad_norm": 2.263078451156616, "learning_rate": 9.237008727794618e-06, "loss": 0.4982, "num_input_tokens_seen": 28867424, "step": 50050 }, { "epoch": 7.455317247542449, "grad_norm": 1.0680184364318848, "learning_rate": 9.231965032194198e-06, "loss": 0.5702, "num_input_tokens_seen": 28870464, "step": 50055 }, { "epoch": 7.4560619600834075, "grad_norm": 2.0483133792877197, "learning_rate": 9.226922402133477e-06, "loss": 0.6578, "num_input_tokens_seen": 28873216, "step": 50060 }, { "epoch": 7.456806672624367, "grad_norm": 1.9468337297439575, "learning_rate": 9.221880837953209e-06, "loss": 0.7241, "num_input_tokens_seen": 28875968, "step": 50065 }, { "epoch": 7.457551385165326, "grad_norm": 1.3848035335540771, "learning_rate": 9.21684033999409e-06, "loss": 0.4486, "num_input_tokens_seen": 28878880, "step": 50070 }, { "epoch": 7.4582960977062855, "grad_norm": 2.2913196086883545, "learning_rate": 9.21180090859672e-06, "loss": 0.4904, "num_input_tokens_seen": 28881568, "step": 50075 }, { "epoch": 7.459040810247244, "grad_norm": 3.341344118118286, "learning_rate": 9.20676254410166e-06, "loss": 0.782, "num_input_tokens_seen": 28884448, "step": 50080 }, { "epoch": 7.459785522788204, "grad_norm": 4.159146785736084, "learning_rate": 9.201725246849374e-06, "loss": 0.4921, "num_input_tokens_seen": 28887136, "step": 50085 }, { "epoch": 7.460530235329163, "grad_norm": 2.7795207500457764, "learning_rate": 9.196689017180262e-06, "loss": 0.5414, "num_input_tokens_seen": 28889888, "step": 50090 }, { "epoch": 7.461274947870122, "grad_norm": 4.827298164367676, "learning_rate": 9.191653855434667e-06, "loss": 0.8671, "num_input_tokens_seen": 28892736, "step": 50095 }, { "epoch": 7.462019660411081, "grad_norm": 3.848740816116333, "learning_rate": 9.186619761952831e-06, "loss": 0.6326, "num_input_tokens_seen": 28896000, "step": 50100 }, { "epoch": 7.462764372952041, "grad_norm": 3.2999155521392822, "learning_rate": 9.181586737074932e-06, "loss": 0.7398, "num_input_tokens_seen": 28898880, "step": 50105 }, { "epoch": 7.4635090854929995, "grad_norm": 1.2508169412612915, "learning_rate": 9.176554781141086e-06, "loss": 0.5434, "num_input_tokens_seen": 28901632, "step": 50110 }, { "epoch": 7.464253798033959, "grad_norm": 3.552676200866699, "learning_rate": 9.171523894491336e-06, "loss": 0.6938, "num_input_tokens_seen": 28904608, "step": 50115 }, { "epoch": 7.464998510574918, "grad_norm": 4.015976428985596, "learning_rate": 9.166494077465645e-06, "loss": 0.3978, "num_input_tokens_seen": 28907456, "step": 50120 }, { "epoch": 7.465743223115878, "grad_norm": 2.647681474685669, "learning_rate": 9.161465330403912e-06, "loss": 0.3362, "num_input_tokens_seen": 28910400, "step": 50125 }, { "epoch": 7.466487935656836, "grad_norm": 3.5281224250793457, "learning_rate": 9.156437653645966e-06, "loss": 0.6463, "num_input_tokens_seen": 28913280, "step": 50130 }, { "epoch": 7.467232648197796, "grad_norm": 2.704749822616577, "learning_rate": 9.151411047531539e-06, "loss": 0.5569, "num_input_tokens_seen": 28916320, "step": 50135 }, { "epoch": 7.467977360738755, "grad_norm": 1.8770639896392822, "learning_rate": 9.146385512400323e-06, "loss": 0.6023, "num_input_tokens_seen": 28919520, "step": 50140 }, { "epoch": 7.468722073279714, "grad_norm": 3.1950621604919434, "learning_rate": 9.141361048591916e-06, "loss": 0.5011, "num_input_tokens_seen": 28922624, "step": 50145 }, { "epoch": 7.469466785820673, "grad_norm": 4.297084331512451, "learning_rate": 9.136337656445849e-06, "loss": 0.5985, "num_input_tokens_seen": 28925280, "step": 50150 }, { "epoch": 7.470211498361633, "grad_norm": 1.9010096788406372, "learning_rate": 9.131315336301585e-06, "loss": 0.7221, "num_input_tokens_seen": 28928320, "step": 50155 }, { "epoch": 7.4709562109025915, "grad_norm": 2.8946824073791504, "learning_rate": 9.126294088498515e-06, "loss": 0.641, "num_input_tokens_seen": 28931136, "step": 50160 }, { "epoch": 7.471700923443551, "grad_norm": 2.1809041500091553, "learning_rate": 9.12127391337596e-06, "loss": 0.5976, "num_input_tokens_seen": 28934240, "step": 50165 }, { "epoch": 7.47244563598451, "grad_norm": 2.114039182662964, "learning_rate": 9.116254811273151e-06, "loss": 0.441, "num_input_tokens_seen": 28937024, "step": 50170 }, { "epoch": 7.473190348525469, "grad_norm": 1.7645132541656494, "learning_rate": 9.111236782529259e-06, "loss": 0.4227, "num_input_tokens_seen": 28940064, "step": 50175 }, { "epoch": 7.473935061066428, "grad_norm": 1.0576726198196411, "learning_rate": 9.106219827483398e-06, "loss": 0.3836, "num_input_tokens_seen": 28942976, "step": 50180 }, { "epoch": 7.474679773607388, "grad_norm": 2.833951950073242, "learning_rate": 9.101203946474571e-06, "loss": 0.5042, "num_input_tokens_seen": 28945632, "step": 50185 }, { "epoch": 7.475424486148347, "grad_norm": 2.5208466053009033, "learning_rate": 9.09618913984174e-06, "loss": 0.5347, "num_input_tokens_seen": 28948256, "step": 50190 }, { "epoch": 7.4761691986893055, "grad_norm": 2.6414639949798584, "learning_rate": 9.091175407923788e-06, "loss": 0.7027, "num_input_tokens_seen": 28951168, "step": 50195 }, { "epoch": 7.476913911230265, "grad_norm": 0.7009788751602173, "learning_rate": 9.086162751059532e-06, "loss": 0.3529, "num_input_tokens_seen": 28953920, "step": 50200 }, { "epoch": 7.477658623771224, "grad_norm": 6.067747592926025, "learning_rate": 9.081151169587686e-06, "loss": 0.5721, "num_input_tokens_seen": 28956608, "step": 50205 }, { "epoch": 7.478403336312184, "grad_norm": 2.1622138023376465, "learning_rate": 9.076140663846925e-06, "loss": 0.7802, "num_input_tokens_seen": 28959840, "step": 50210 }, { "epoch": 7.479148048853142, "grad_norm": 2.1039981842041016, "learning_rate": 9.071131234175831e-06, "loss": 0.7839, "num_input_tokens_seen": 28962624, "step": 50215 }, { "epoch": 7.479892761394102, "grad_norm": 3.1920247077941895, "learning_rate": 9.066122880912938e-06, "loss": 0.4187, "num_input_tokens_seen": 28965056, "step": 50220 }, { "epoch": 7.480637473935061, "grad_norm": 4.202042102813721, "learning_rate": 9.06111560439667e-06, "loss": 0.6593, "num_input_tokens_seen": 28967936, "step": 50225 }, { "epoch": 7.48138218647602, "grad_norm": 3.4839978218078613, "learning_rate": 9.056109404965408e-06, "loss": 0.3995, "num_input_tokens_seen": 28970816, "step": 50230 }, { "epoch": 7.482126899016979, "grad_norm": 1.6546372175216675, "learning_rate": 9.051104282957454e-06, "loss": 0.7477, "num_input_tokens_seen": 28973568, "step": 50235 }, { "epoch": 7.482871611557939, "grad_norm": 4.3187575340271, "learning_rate": 9.046100238711021e-06, "loss": 0.5682, "num_input_tokens_seen": 28976544, "step": 50240 }, { "epoch": 7.4836163240988975, "grad_norm": 3.4065983295440674, "learning_rate": 9.041097272564275e-06, "loss": 0.6528, "num_input_tokens_seen": 28979616, "step": 50245 }, { "epoch": 7.484361036639857, "grad_norm": 3.5802130699157715, "learning_rate": 9.036095384855287e-06, "loss": 0.482, "num_input_tokens_seen": 28982368, "step": 50250 }, { "epoch": 7.485105749180816, "grad_norm": 2.2307255268096924, "learning_rate": 9.03109457592207e-06, "loss": 0.6512, "num_input_tokens_seen": 28985216, "step": 50255 }, { "epoch": 7.485850461721776, "grad_norm": 2.7604610919952393, "learning_rate": 9.026094846102565e-06, "loss": 0.5223, "num_input_tokens_seen": 28988000, "step": 50260 }, { "epoch": 7.486595174262734, "grad_norm": 1.794047236442566, "learning_rate": 9.021096195734625e-06, "loss": 0.5425, "num_input_tokens_seen": 28990752, "step": 50265 }, { "epoch": 7.487339886803694, "grad_norm": 6.501143932342529, "learning_rate": 9.016098625156027e-06, "loss": 0.389, "num_input_tokens_seen": 28994080, "step": 50270 }, { "epoch": 7.488084599344653, "grad_norm": 2.287660837173462, "learning_rate": 9.011102134704501e-06, "loss": 0.6619, "num_input_tokens_seen": 28997088, "step": 50275 }, { "epoch": 7.488829311885612, "grad_norm": 4.520144462585449, "learning_rate": 9.006106724717686e-06, "loss": 0.6193, "num_input_tokens_seen": 29000128, "step": 50280 }, { "epoch": 7.489574024426571, "grad_norm": 3.0122756958007812, "learning_rate": 9.001112395533153e-06, "loss": 0.4756, "num_input_tokens_seen": 29003168, "step": 50285 }, { "epoch": 7.490318736967531, "grad_norm": 1.8045966625213623, "learning_rate": 8.996119147488396e-06, "loss": 0.6722, "num_input_tokens_seen": 29006240, "step": 50290 }, { "epoch": 7.49106344950849, "grad_norm": 2.879152297973633, "learning_rate": 8.99112698092085e-06, "loss": 0.5677, "num_input_tokens_seen": 29009344, "step": 50295 }, { "epoch": 7.491808162049449, "grad_norm": 1.4874777793884277, "learning_rate": 8.986135896167856e-06, "loss": 0.6546, "num_input_tokens_seen": 29012224, "step": 50300 }, { "epoch": 7.492552874590408, "grad_norm": 3.2995340824127197, "learning_rate": 8.98114589356668e-06, "loss": 0.4423, "num_input_tokens_seen": 29015200, "step": 50305 }, { "epoch": 7.493297587131368, "grad_norm": 2.7377548217773438, "learning_rate": 8.97615697345454e-06, "loss": 0.428, "num_input_tokens_seen": 29018048, "step": 50310 }, { "epoch": 7.494042299672326, "grad_norm": 3.0084474086761475, "learning_rate": 8.97116913616856e-06, "loss": 0.5294, "num_input_tokens_seen": 29020736, "step": 50315 }, { "epoch": 7.494787012213286, "grad_norm": 2.3296172618865967, "learning_rate": 8.966182382045801e-06, "loss": 0.4418, "num_input_tokens_seen": 29023584, "step": 50320 }, { "epoch": 7.495531724754245, "grad_norm": 1.4166662693023682, "learning_rate": 8.96119671142325e-06, "loss": 0.4237, "num_input_tokens_seen": 29026336, "step": 50325 }, { "epoch": 7.496276437295204, "grad_norm": 2.992492914199829, "learning_rate": 8.956212124637822e-06, "loss": 0.7569, "num_input_tokens_seen": 29029280, "step": 50330 }, { "epoch": 7.497021149836163, "grad_norm": 3.020709991455078, "learning_rate": 8.951228622026344e-06, "loss": 0.7508, "num_input_tokens_seen": 29032416, "step": 50335 }, { "epoch": 7.497765862377123, "grad_norm": 3.224501609802246, "learning_rate": 8.946246203925584e-06, "loss": 0.6249, "num_input_tokens_seen": 29035552, "step": 50340 }, { "epoch": 7.498510574918082, "grad_norm": 1.944487452507019, "learning_rate": 8.941264870672244e-06, "loss": 0.5562, "num_input_tokens_seen": 29038400, "step": 50345 }, { "epoch": 7.499255287459041, "grad_norm": 2.119297981262207, "learning_rate": 8.936284622602927e-06, "loss": 0.7282, "num_input_tokens_seen": 29041216, "step": 50350 }, { "epoch": 7.5, "grad_norm": 5.185937404632568, "learning_rate": 8.931305460054184e-06, "loss": 0.4657, "num_input_tokens_seen": 29044256, "step": 50355 }, { "epoch": 7.5, "eval_loss": 0.6839770674705505, "eval_runtime": 74.2684, "eval_samples_per_second": 40.179, "eval_steps_per_second": 10.045, "num_input_tokens_seen": 29044256, "step": 50355 }, { "epoch": 7.500744712540959, "grad_norm": 4.055560111999512, "learning_rate": 8.92632738336249e-06, "loss": 0.626, "num_input_tokens_seen": 29047264, "step": 50360 }, { "epoch": 7.501489425081918, "grad_norm": 2.13268780708313, "learning_rate": 8.921350392864247e-06, "loss": 0.5083, "num_input_tokens_seen": 29050112, "step": 50365 }, { "epoch": 7.502234137622878, "grad_norm": 2.098613739013672, "learning_rate": 8.916374488895766e-06, "loss": 0.5012, "num_input_tokens_seen": 29052896, "step": 50370 }, { "epoch": 7.502978850163837, "grad_norm": 2.9867324829101562, "learning_rate": 8.911399671793302e-06, "loss": 0.5865, "num_input_tokens_seen": 29055840, "step": 50375 }, { "epoch": 7.503723562704796, "grad_norm": 3.334371566772461, "learning_rate": 8.90642594189304e-06, "loss": 0.7108, "num_input_tokens_seen": 29058528, "step": 50380 }, { "epoch": 7.504468275245755, "grad_norm": 1.666733741760254, "learning_rate": 8.90145329953109e-06, "loss": 0.5034, "num_input_tokens_seen": 29061568, "step": 50385 }, { "epoch": 7.505212987786714, "grad_norm": 4.130040645599365, "learning_rate": 8.896481745043463e-06, "loss": 0.6703, "num_input_tokens_seen": 29064448, "step": 50390 }, { "epoch": 7.505957700327674, "grad_norm": 2.6722185611724854, "learning_rate": 8.891511278766132e-06, "loss": 0.6818, "num_input_tokens_seen": 29067136, "step": 50395 }, { "epoch": 7.506702412868632, "grad_norm": 3.629879951477051, "learning_rate": 8.886541901034981e-06, "loss": 0.6643, "num_input_tokens_seen": 29069952, "step": 50400 }, { "epoch": 7.507447125409592, "grad_norm": 2.7466483116149902, "learning_rate": 8.88157361218581e-06, "loss": 0.6088, "num_input_tokens_seen": 29072800, "step": 50405 }, { "epoch": 7.508191837950551, "grad_norm": 2.629014253616333, "learning_rate": 8.876606412554358e-06, "loss": 0.6117, "num_input_tokens_seen": 29075552, "step": 50410 }, { "epoch": 7.50893655049151, "grad_norm": 4.918747425079346, "learning_rate": 8.871640302476295e-06, "loss": 0.7305, "num_input_tokens_seen": 29078528, "step": 50415 }, { "epoch": 7.509681263032469, "grad_norm": 2.318763256072998, "learning_rate": 8.866675282287204e-06, "loss": 0.602, "num_input_tokens_seen": 29081536, "step": 50420 }, { "epoch": 7.510425975573429, "grad_norm": 1.9023027420043945, "learning_rate": 8.861711352322616e-06, "loss": 0.4719, "num_input_tokens_seen": 29084704, "step": 50425 }, { "epoch": 7.511170688114388, "grad_norm": 1.992134928703308, "learning_rate": 8.85674851291796e-06, "loss": 0.5331, "num_input_tokens_seen": 29087296, "step": 50430 }, { "epoch": 7.511915400655347, "grad_norm": 1.6533421277999878, "learning_rate": 8.851786764408596e-06, "loss": 0.6053, "num_input_tokens_seen": 29090592, "step": 50435 }, { "epoch": 7.512660113196306, "grad_norm": 2.6227805614471436, "learning_rate": 8.84682610712983e-06, "loss": 0.4163, "num_input_tokens_seen": 29093344, "step": 50440 }, { "epoch": 7.513404825737266, "grad_norm": 2.856376886367798, "learning_rate": 8.841866541416883e-06, "loss": 0.6581, "num_input_tokens_seen": 29095936, "step": 50445 }, { "epoch": 7.514149538278224, "grad_norm": 2.7036080360412598, "learning_rate": 8.836908067604898e-06, "loss": 0.4799, "num_input_tokens_seen": 29098624, "step": 50450 }, { "epoch": 7.514894250819184, "grad_norm": 3.467322587966919, "learning_rate": 8.831950686028953e-06, "loss": 0.5536, "num_input_tokens_seen": 29101632, "step": 50455 }, { "epoch": 7.515638963360143, "grad_norm": 3.2164225578308105, "learning_rate": 8.826994397024055e-06, "loss": 0.5539, "num_input_tokens_seen": 29104704, "step": 50460 }, { "epoch": 7.5163836759011025, "grad_norm": 1.4676392078399658, "learning_rate": 8.82203920092512e-06, "loss": 0.5156, "num_input_tokens_seen": 29107680, "step": 50465 }, { "epoch": 7.517128388442061, "grad_norm": 2.6537368297576904, "learning_rate": 8.817085098066994e-06, "loss": 0.6653, "num_input_tokens_seen": 29110688, "step": 50470 }, { "epoch": 7.517873100983021, "grad_norm": 6.317686080932617, "learning_rate": 8.812132088784458e-06, "loss": 0.7467, "num_input_tokens_seen": 29113408, "step": 50475 }, { "epoch": 7.51861781352398, "grad_norm": 4.50732421875, "learning_rate": 8.807180173412225e-06, "loss": 0.503, "num_input_tokens_seen": 29116192, "step": 50480 }, { "epoch": 7.519362526064939, "grad_norm": 2.7988274097442627, "learning_rate": 8.802229352284919e-06, "loss": 0.6173, "num_input_tokens_seen": 29119008, "step": 50485 }, { "epoch": 7.520107238605898, "grad_norm": 2.9589684009552, "learning_rate": 8.797279625737098e-06, "loss": 0.6776, "num_input_tokens_seen": 29121952, "step": 50490 }, { "epoch": 7.520851951146858, "grad_norm": 4.3005595207214355, "learning_rate": 8.792330994103253e-06, "loss": 0.6565, "num_input_tokens_seen": 29124704, "step": 50495 }, { "epoch": 7.521596663687816, "grad_norm": 2.948805332183838, "learning_rate": 8.787383457717777e-06, "loss": 0.6731, "num_input_tokens_seen": 29128064, "step": 50500 }, { "epoch": 7.522341376228776, "grad_norm": 1.9779623746871948, "learning_rate": 8.782437016915016e-06, "loss": 0.5486, "num_input_tokens_seen": 29130944, "step": 50505 }, { "epoch": 7.523086088769735, "grad_norm": 3.602553606033325, "learning_rate": 8.77749167202922e-06, "loss": 0.6261, "num_input_tokens_seen": 29133984, "step": 50510 }, { "epoch": 7.5238308013106945, "grad_norm": 2.182711124420166, "learning_rate": 8.77254742339458e-06, "loss": 0.608, "num_input_tokens_seen": 29136672, "step": 50515 }, { "epoch": 7.524575513851653, "grad_norm": 2.053356647491455, "learning_rate": 8.767604271345209e-06, "loss": 0.5773, "num_input_tokens_seen": 29139616, "step": 50520 }, { "epoch": 7.525320226392612, "grad_norm": 3.148155450820923, "learning_rate": 8.762662216215146e-06, "loss": 0.5572, "num_input_tokens_seen": 29142304, "step": 50525 }, { "epoch": 7.526064938933572, "grad_norm": 4.251818656921387, "learning_rate": 8.75772125833836e-06, "loss": 0.5696, "num_input_tokens_seen": 29145184, "step": 50530 }, { "epoch": 7.526809651474531, "grad_norm": 3.0618395805358887, "learning_rate": 8.752781398048732e-06, "loss": 0.6522, "num_input_tokens_seen": 29148160, "step": 50535 }, { "epoch": 7.52755436401549, "grad_norm": 2.321608781814575, "learning_rate": 8.747842635680076e-06, "loss": 0.8492, "num_input_tokens_seen": 29150848, "step": 50540 }, { "epoch": 7.528299076556449, "grad_norm": 1.3513137102127075, "learning_rate": 8.742904971566148e-06, "loss": 0.5075, "num_input_tokens_seen": 29153696, "step": 50545 }, { "epoch": 7.5290437890974085, "grad_norm": 3.5557734966278076, "learning_rate": 8.737968406040597e-06, "loss": 0.6435, "num_input_tokens_seen": 29156544, "step": 50550 }, { "epoch": 7.529788501638367, "grad_norm": 3.2822439670562744, "learning_rate": 8.733032939437025e-06, "loss": 0.6401, "num_input_tokens_seen": 29159552, "step": 50555 }, { "epoch": 7.530533214179327, "grad_norm": 2.1297547817230225, "learning_rate": 8.72809857208895e-06, "loss": 0.6603, "num_input_tokens_seen": 29162592, "step": 50560 }, { "epoch": 7.531277926720286, "grad_norm": 2.1144793033599854, "learning_rate": 8.723165304329825e-06, "loss": 0.437, "num_input_tokens_seen": 29165600, "step": 50565 }, { "epoch": 7.532022639261245, "grad_norm": 2.446904420852661, "learning_rate": 8.718233136493004e-06, "loss": 0.7194, "num_input_tokens_seen": 29168544, "step": 50570 }, { "epoch": 7.532767351802204, "grad_norm": 2.5950331687927246, "learning_rate": 8.71330206891179e-06, "loss": 0.5555, "num_input_tokens_seen": 29171328, "step": 50575 }, { "epoch": 7.533512064343164, "grad_norm": 6.911563396453857, "learning_rate": 8.708372101919407e-06, "loss": 0.6709, "num_input_tokens_seen": 29174336, "step": 50580 }, { "epoch": 7.534256776884122, "grad_norm": 1.9751176834106445, "learning_rate": 8.703443235849007e-06, "loss": 0.5853, "num_input_tokens_seen": 29176992, "step": 50585 }, { "epoch": 7.535001489425082, "grad_norm": 4.849992275238037, "learning_rate": 8.698515471033649e-06, "loss": 0.5582, "num_input_tokens_seen": 29179520, "step": 50590 }, { "epoch": 7.535746201966041, "grad_norm": 3.138458490371704, "learning_rate": 8.693588807806346e-06, "loss": 0.4722, "num_input_tokens_seen": 29182112, "step": 50595 }, { "epoch": 7.5364909145070005, "grad_norm": 2.1389007568359375, "learning_rate": 8.688663246500005e-06, "loss": 0.4466, "num_input_tokens_seen": 29185216, "step": 50600 }, { "epoch": 7.537235627047959, "grad_norm": 2.8194522857666016, "learning_rate": 8.683738787447488e-06, "loss": 0.5857, "num_input_tokens_seen": 29188096, "step": 50605 }, { "epoch": 7.537980339588919, "grad_norm": 1.8037278652191162, "learning_rate": 8.678815430981563e-06, "loss": 0.5075, "num_input_tokens_seen": 29190976, "step": 50610 }, { "epoch": 7.538725052129878, "grad_norm": 5.148674964904785, "learning_rate": 8.673893177434936e-06, "loss": 0.7061, "num_input_tokens_seen": 29193632, "step": 50615 }, { "epoch": 7.539469764670837, "grad_norm": 3.329998731613159, "learning_rate": 8.668972027140231e-06, "loss": 0.7922, "num_input_tokens_seen": 29196576, "step": 50620 }, { "epoch": 7.540214477211796, "grad_norm": 3.32688045501709, "learning_rate": 8.664051980430008e-06, "loss": 0.5454, "num_input_tokens_seen": 29199456, "step": 50625 }, { "epoch": 7.540959189752756, "grad_norm": 1.6488804817199707, "learning_rate": 8.659133037636732e-06, "loss": 0.6566, "num_input_tokens_seen": 29202272, "step": 50630 }, { "epoch": 7.5417039022937145, "grad_norm": 3.147937059402466, "learning_rate": 8.654215199092804e-06, "loss": 0.532, "num_input_tokens_seen": 29205440, "step": 50635 }, { "epoch": 7.542448614834674, "grad_norm": 0.864142656326294, "learning_rate": 8.649298465130553e-06, "loss": 0.4046, "num_input_tokens_seen": 29208224, "step": 50640 }, { "epoch": 7.543193327375633, "grad_norm": 3.628462553024292, "learning_rate": 8.644382836082235e-06, "loss": 0.734, "num_input_tokens_seen": 29211168, "step": 50645 }, { "epoch": 7.5439380399165925, "grad_norm": 2.272631883621216, "learning_rate": 8.63946831228003e-06, "loss": 0.5156, "num_input_tokens_seen": 29214176, "step": 50650 }, { "epoch": 7.544682752457551, "grad_norm": 2.320488214492798, "learning_rate": 8.634554894056038e-06, "loss": 0.5128, "num_input_tokens_seen": 29216832, "step": 50655 }, { "epoch": 7.545427464998511, "grad_norm": 4.9718122482299805, "learning_rate": 8.629642581742295e-06, "loss": 0.7181, "num_input_tokens_seen": 29219680, "step": 50660 }, { "epoch": 7.54617217753947, "grad_norm": 6.566749095916748, "learning_rate": 8.624731375670752e-06, "loss": 0.6729, "num_input_tokens_seen": 29222400, "step": 50665 }, { "epoch": 7.546916890080429, "grad_norm": 2.3550922870635986, "learning_rate": 8.619821276173279e-06, "loss": 0.5645, "num_input_tokens_seen": 29225408, "step": 50670 }, { "epoch": 7.547661602621388, "grad_norm": 4.93593692779541, "learning_rate": 8.614912283581683e-06, "loss": 0.553, "num_input_tokens_seen": 29228000, "step": 50675 }, { "epoch": 7.548406315162348, "grad_norm": 3.7477922439575195, "learning_rate": 8.6100043982277e-06, "loss": 0.6729, "num_input_tokens_seen": 29230592, "step": 50680 }, { "epoch": 7.5491510277033065, "grad_norm": 4.039081573486328, "learning_rate": 8.605097620442984e-06, "loss": 0.644, "num_input_tokens_seen": 29233696, "step": 50685 }, { "epoch": 7.549895740244265, "grad_norm": 2.454639434814453, "learning_rate": 8.600191950559111e-06, "loss": 0.5118, "num_input_tokens_seen": 29236576, "step": 50690 }, { "epoch": 7.550640452785225, "grad_norm": 3.0073869228363037, "learning_rate": 8.5952873889076e-06, "loss": 0.6066, "num_input_tokens_seen": 29239392, "step": 50695 }, { "epoch": 7.5513851653261845, "grad_norm": 3.219332695007324, "learning_rate": 8.590383935819862e-06, "loss": 0.5394, "num_input_tokens_seen": 29242016, "step": 50700 }, { "epoch": 7.552129877867143, "grad_norm": 1.732113003730774, "learning_rate": 8.58548159162727e-06, "loss": 0.5772, "num_input_tokens_seen": 29244640, "step": 50705 }, { "epoch": 7.552874590408102, "grad_norm": 1.860756278038025, "learning_rate": 8.580580356661085e-06, "loss": 0.6212, "num_input_tokens_seen": 29247360, "step": 50710 }, { "epoch": 7.553619302949062, "grad_norm": 3.4083943367004395, "learning_rate": 8.575680231252526e-06, "loss": 0.6376, "num_input_tokens_seen": 29250080, "step": 50715 }, { "epoch": 7.554364015490021, "grad_norm": 2.3038525581359863, "learning_rate": 8.570781215732718e-06, "loss": 0.6594, "num_input_tokens_seen": 29252704, "step": 50720 }, { "epoch": 7.55510872803098, "grad_norm": 2.435546398162842, "learning_rate": 8.565883310432717e-06, "loss": 0.8102, "num_input_tokens_seen": 29255840, "step": 50725 }, { "epoch": 7.555853440571939, "grad_norm": 1.922721266746521, "learning_rate": 8.56098651568352e-06, "loss": 0.5257, "num_input_tokens_seen": 29258592, "step": 50730 }, { "epoch": 7.5565981531128985, "grad_norm": 4.664877891540527, "learning_rate": 8.556090831816006e-06, "loss": 0.7097, "num_input_tokens_seen": 29261376, "step": 50735 }, { "epoch": 7.557342865653857, "grad_norm": 1.891223430633545, "learning_rate": 8.551196259161017e-06, "loss": 0.6348, "num_input_tokens_seen": 29264448, "step": 50740 }, { "epoch": 7.558087578194817, "grad_norm": 1.7265530824661255, "learning_rate": 8.546302798049319e-06, "loss": 0.6623, "num_input_tokens_seen": 29267232, "step": 50745 }, { "epoch": 7.558832290735776, "grad_norm": 1.3342182636260986, "learning_rate": 8.541410448811574e-06, "loss": 0.6467, "num_input_tokens_seen": 29269920, "step": 50750 }, { "epoch": 7.559577003276735, "grad_norm": 2.73056697845459, "learning_rate": 8.536519211778393e-06, "loss": 0.5419, "num_input_tokens_seen": 29272640, "step": 50755 }, { "epoch": 7.560321715817694, "grad_norm": 2.463578701019287, "learning_rate": 8.531629087280319e-06, "loss": 0.6714, "num_input_tokens_seen": 29275808, "step": 50760 }, { "epoch": 7.561066428358654, "grad_norm": 2.7449541091918945, "learning_rate": 8.526740075647784e-06, "loss": 0.5643, "num_input_tokens_seen": 29278624, "step": 50765 }, { "epoch": 7.5618111408996125, "grad_norm": 2.922072649002075, "learning_rate": 8.52185217721118e-06, "loss": 0.6784, "num_input_tokens_seen": 29281504, "step": 50770 }, { "epoch": 7.562555853440572, "grad_norm": 2.8939430713653564, "learning_rate": 8.516965392300813e-06, "loss": 0.5567, "num_input_tokens_seen": 29284032, "step": 50775 }, { "epoch": 7.563300565981531, "grad_norm": 2.6763851642608643, "learning_rate": 8.512079721246907e-06, "loss": 0.6471, "num_input_tokens_seen": 29287136, "step": 50780 }, { "epoch": 7.5640452785224905, "grad_norm": 2.550126075744629, "learning_rate": 8.50719516437963e-06, "loss": 0.5732, "num_input_tokens_seen": 29289920, "step": 50785 }, { "epoch": 7.564789991063449, "grad_norm": 2.7423887252807617, "learning_rate": 8.502311722029038e-06, "loss": 0.6674, "num_input_tokens_seen": 29292832, "step": 50790 }, { "epoch": 7.565534703604409, "grad_norm": 2.4603822231292725, "learning_rate": 8.497429394525155e-06, "loss": 0.5039, "num_input_tokens_seen": 29295840, "step": 50795 }, { "epoch": 7.566279416145368, "grad_norm": 3.387421131134033, "learning_rate": 8.49254818219789e-06, "loss": 0.5958, "num_input_tokens_seen": 29298624, "step": 50800 }, { "epoch": 7.567024128686327, "grad_norm": 2.55951189994812, "learning_rate": 8.487668085377104e-06, "loss": 0.4965, "num_input_tokens_seen": 29301472, "step": 50805 }, { "epoch": 7.567768841227286, "grad_norm": 3.1114680767059326, "learning_rate": 8.482789104392575e-06, "loss": 0.7653, "num_input_tokens_seen": 29304160, "step": 50810 }, { "epoch": 7.568513553768246, "grad_norm": 2.1076624393463135, "learning_rate": 8.477911239574005e-06, "loss": 0.528, "num_input_tokens_seen": 29307520, "step": 50815 }, { "epoch": 7.5692582663092045, "grad_norm": 3.3601436614990234, "learning_rate": 8.473034491251016e-06, "loss": 0.4591, "num_input_tokens_seen": 29310400, "step": 50820 }, { "epoch": 7.570002978850164, "grad_norm": 1.9824774265289307, "learning_rate": 8.468158859753175e-06, "loss": 0.6208, "num_input_tokens_seen": 29313056, "step": 50825 }, { "epoch": 7.570747691391123, "grad_norm": 1.6821271181106567, "learning_rate": 8.463284345409941e-06, "loss": 0.6774, "num_input_tokens_seen": 29316096, "step": 50830 }, { "epoch": 7.571492403932083, "grad_norm": 2.5055766105651855, "learning_rate": 8.458410948550713e-06, "loss": 0.4919, "num_input_tokens_seen": 29319072, "step": 50835 }, { "epoch": 7.572237116473041, "grad_norm": 2.6995108127593994, "learning_rate": 8.453538669504818e-06, "loss": 0.7745, "num_input_tokens_seen": 29322080, "step": 50840 }, { "epoch": 7.572981829014001, "grad_norm": 2.8293089866638184, "learning_rate": 8.448667508601505e-06, "loss": 0.6429, "num_input_tokens_seen": 29324992, "step": 50845 }, { "epoch": 7.57372654155496, "grad_norm": 1.8592181205749512, "learning_rate": 8.44379746616995e-06, "loss": 0.5541, "num_input_tokens_seen": 29327872, "step": 50850 }, { "epoch": 7.5744712540959185, "grad_norm": 2.911275625228882, "learning_rate": 8.438928542539251e-06, "loss": 0.4466, "num_input_tokens_seen": 29330816, "step": 50855 }, { "epoch": 7.575215966636878, "grad_norm": 2.0953681468963623, "learning_rate": 8.434060738038438e-06, "loss": 0.6552, "num_input_tokens_seen": 29333696, "step": 50860 }, { "epoch": 7.575960679177838, "grad_norm": 4.811090469360352, "learning_rate": 8.429194052996445e-06, "loss": 0.6549, "num_input_tokens_seen": 29336480, "step": 50865 }, { "epoch": 7.5767053917187965, "grad_norm": 1.3397060632705688, "learning_rate": 8.424328487742139e-06, "loss": 0.5019, "num_input_tokens_seen": 29339104, "step": 50870 }, { "epoch": 7.577450104259755, "grad_norm": 1.7301104068756104, "learning_rate": 8.419464042604322e-06, "loss": 0.5469, "num_input_tokens_seen": 29341984, "step": 50875 }, { "epoch": 7.578194816800715, "grad_norm": 4.975395679473877, "learning_rate": 8.414600717911713e-06, "loss": 0.6569, "num_input_tokens_seen": 29345024, "step": 50880 }, { "epoch": 7.578939529341675, "grad_norm": 1.588517189025879, "learning_rate": 8.409738513992958e-06, "loss": 0.6257, "num_input_tokens_seen": 29347808, "step": 50885 }, { "epoch": 7.579684241882633, "grad_norm": 2.009047746658325, "learning_rate": 8.404877431176621e-06, "loss": 0.6296, "num_input_tokens_seen": 29351008, "step": 50890 }, { "epoch": 7.580428954423592, "grad_norm": 3.6236069202423096, "learning_rate": 8.400017469791206e-06, "loss": 0.5128, "num_input_tokens_seen": 29353760, "step": 50895 }, { "epoch": 7.581173666964552, "grad_norm": 1.9708465337753296, "learning_rate": 8.395158630165112e-06, "loss": 0.6583, "num_input_tokens_seen": 29356736, "step": 50900 }, { "epoch": 7.5819183795055105, "grad_norm": 1.9512566328048706, "learning_rate": 8.390300912626686e-06, "loss": 0.3832, "num_input_tokens_seen": 29359200, "step": 50905 }, { "epoch": 7.58266309204647, "grad_norm": 3.474184513092041, "learning_rate": 8.385444317504201e-06, "loss": 0.5137, "num_input_tokens_seen": 29362112, "step": 50910 }, { "epoch": 7.583407804587429, "grad_norm": 1.7829525470733643, "learning_rate": 8.380588845125833e-06, "loss": 0.5707, "num_input_tokens_seen": 29365184, "step": 50915 }, { "epoch": 7.584152517128389, "grad_norm": 1.975370168685913, "learning_rate": 8.3757344958197e-06, "loss": 0.6969, "num_input_tokens_seen": 29368192, "step": 50920 }, { "epoch": 7.584897229669347, "grad_norm": 2.234978199005127, "learning_rate": 8.370881269913851e-06, "loss": 0.6087, "num_input_tokens_seen": 29370976, "step": 50925 }, { "epoch": 7.585641942210307, "grad_norm": 3.259308338165283, "learning_rate": 8.366029167736227e-06, "loss": 0.5438, "num_input_tokens_seen": 29374048, "step": 50930 }, { "epoch": 7.586386654751266, "grad_norm": 3.976961851119995, "learning_rate": 8.361178189614724e-06, "loss": 0.5293, "num_input_tokens_seen": 29376832, "step": 50935 }, { "epoch": 7.587131367292225, "grad_norm": 5.258465766906738, "learning_rate": 8.356328335877147e-06, "loss": 0.6433, "num_input_tokens_seen": 29379968, "step": 50940 }, { "epoch": 7.587876079833184, "grad_norm": 3.877805471420288, "learning_rate": 8.351479606851236e-06, "loss": 0.5319, "num_input_tokens_seen": 29382752, "step": 50945 }, { "epoch": 7.588620792374144, "grad_norm": 2.0930142402648926, "learning_rate": 8.346632002864655e-06, "loss": 0.7266, "num_input_tokens_seen": 29385728, "step": 50950 }, { "epoch": 7.5893655049151025, "grad_norm": 2.3586227893829346, "learning_rate": 8.341785524244964e-06, "loss": 0.6435, "num_input_tokens_seen": 29388640, "step": 50955 }, { "epoch": 7.590110217456062, "grad_norm": 3.7343807220458984, "learning_rate": 8.33694017131969e-06, "loss": 0.5632, "num_input_tokens_seen": 29391616, "step": 50960 }, { "epoch": 7.590854929997021, "grad_norm": 2.0280380249023438, "learning_rate": 8.332095944416243e-06, "loss": 0.5217, "num_input_tokens_seen": 29394400, "step": 50965 }, { "epoch": 7.591599642537981, "grad_norm": 1.901471734046936, "learning_rate": 8.327252843861986e-06, "loss": 0.5974, "num_input_tokens_seen": 29397216, "step": 50970 }, { "epoch": 7.592344355078939, "grad_norm": 2.7243356704711914, "learning_rate": 8.322410869984195e-06, "loss": 0.6768, "num_input_tokens_seen": 29400064, "step": 50975 }, { "epoch": 7.593089067619899, "grad_norm": 1.992110013961792, "learning_rate": 8.317570023110072e-06, "loss": 0.7429, "num_input_tokens_seen": 29402944, "step": 50980 }, { "epoch": 7.593833780160858, "grad_norm": 1.7968860864639282, "learning_rate": 8.312730303566738e-06, "loss": 0.6567, "num_input_tokens_seen": 29405632, "step": 50985 }, { "epoch": 7.594578492701817, "grad_norm": 5.066381931304932, "learning_rate": 8.307891711681257e-06, "loss": 0.5219, "num_input_tokens_seen": 29408800, "step": 50990 }, { "epoch": 7.595323205242776, "grad_norm": 2.7610814571380615, "learning_rate": 8.303054247780587e-06, "loss": 0.7382, "num_input_tokens_seen": 29411936, "step": 50995 }, { "epoch": 7.596067917783736, "grad_norm": 2.7365527153015137, "learning_rate": 8.298217912191617e-06, "loss": 0.4763, "num_input_tokens_seen": 29414752, "step": 51000 }, { "epoch": 7.596812630324695, "grad_norm": 2.3926122188568115, "learning_rate": 8.293382705241177e-06, "loss": 0.6575, "num_input_tokens_seen": 29417696, "step": 51005 }, { "epoch": 7.597557342865654, "grad_norm": 1.1409276723861694, "learning_rate": 8.28854862725601e-06, "loss": 0.6791, "num_input_tokens_seen": 29420544, "step": 51010 }, { "epoch": 7.598302055406613, "grad_norm": 1.479788064956665, "learning_rate": 8.283715678562781e-06, "loss": 0.43, "num_input_tokens_seen": 29423232, "step": 51015 }, { "epoch": 7.599046767947573, "grad_norm": 2.36959171295166, "learning_rate": 8.278883859488085e-06, "loss": 0.5771, "num_input_tokens_seen": 29426432, "step": 51020 }, { "epoch": 7.599791480488531, "grad_norm": 1.028598666191101, "learning_rate": 8.274053170358442e-06, "loss": 0.7003, "num_input_tokens_seen": 29429376, "step": 51025 }, { "epoch": 7.600536193029491, "grad_norm": 2.4108710289001465, "learning_rate": 8.269223611500285e-06, "loss": 0.5617, "num_input_tokens_seen": 29432192, "step": 51030 }, { "epoch": 7.60128090557045, "grad_norm": 2.7444090843200684, "learning_rate": 8.264395183239962e-06, "loss": 0.6477, "num_input_tokens_seen": 29435168, "step": 51035 }, { "epoch": 7.6020256181114085, "grad_norm": 4.336524963378906, "learning_rate": 8.259567885903775e-06, "loss": 0.5693, "num_input_tokens_seen": 29437952, "step": 51040 }, { "epoch": 7.602770330652368, "grad_norm": 5.094305515289307, "learning_rate": 8.254741719817924e-06, "loss": 0.3877, "num_input_tokens_seen": 29441056, "step": 51045 }, { "epoch": 7.603515043193328, "grad_norm": 2.3133716583251953, "learning_rate": 8.249916685308548e-06, "loss": 0.5699, "num_input_tokens_seen": 29443936, "step": 51050 }, { "epoch": 7.604259755734287, "grad_norm": 1.6050337553024292, "learning_rate": 8.245092782701703e-06, "loss": 0.5265, "num_input_tokens_seen": 29446784, "step": 51055 }, { "epoch": 7.605004468275245, "grad_norm": 2.770967483520508, "learning_rate": 8.240270012323375e-06, "loss": 0.6374, "num_input_tokens_seen": 29449792, "step": 51060 }, { "epoch": 7.605749180816205, "grad_norm": 2.9000449180603027, "learning_rate": 8.23544837449945e-06, "loss": 0.6235, "num_input_tokens_seen": 29453024, "step": 51065 }, { "epoch": 7.606493893357164, "grad_norm": 1.9087308645248413, "learning_rate": 8.230627869555775e-06, "loss": 0.6017, "num_input_tokens_seen": 29455680, "step": 51070 }, { "epoch": 7.607238605898123, "grad_norm": 2.049126386642456, "learning_rate": 8.225808497818077e-06, "loss": 0.6721, "num_input_tokens_seen": 29458560, "step": 51075 }, { "epoch": 7.607983318439082, "grad_norm": 2.944147825241089, "learning_rate": 8.220990259612043e-06, "loss": 0.4234, "num_input_tokens_seen": 29461856, "step": 51080 }, { "epoch": 7.608728030980042, "grad_norm": 2.922825574874878, "learning_rate": 8.216173155263271e-06, "loss": 0.5621, "num_input_tokens_seen": 29464768, "step": 51085 }, { "epoch": 7.609472743521001, "grad_norm": 1.7515408992767334, "learning_rate": 8.211357185097285e-06, "loss": 0.445, "num_input_tokens_seen": 29467680, "step": 51090 }, { "epoch": 7.61021745606196, "grad_norm": 2.5814034938812256, "learning_rate": 8.206542349439517e-06, "loss": 0.602, "num_input_tokens_seen": 29470368, "step": 51095 }, { "epoch": 7.610962168602919, "grad_norm": 1.7203376293182373, "learning_rate": 8.20172864861534e-06, "loss": 0.6387, "num_input_tokens_seen": 29473120, "step": 51100 }, { "epoch": 7.611706881143879, "grad_norm": 2.3771233558654785, "learning_rate": 8.19691608295004e-06, "loss": 0.4163, "num_input_tokens_seen": 29476064, "step": 51105 }, { "epoch": 7.612451593684837, "grad_norm": 4.137906551361084, "learning_rate": 8.192104652768848e-06, "loss": 0.4654, "num_input_tokens_seen": 29479232, "step": 51110 }, { "epoch": 7.613196306225797, "grad_norm": 4.814469337463379, "learning_rate": 8.187294358396874e-06, "loss": 0.617, "num_input_tokens_seen": 29482048, "step": 51115 }, { "epoch": 7.613941018766756, "grad_norm": 2.8870491981506348, "learning_rate": 8.182485200159195e-06, "loss": 0.6568, "num_input_tokens_seen": 29484672, "step": 51120 }, { "epoch": 7.614685731307715, "grad_norm": 2.7962334156036377, "learning_rate": 8.177677178380799e-06, "loss": 0.7165, "num_input_tokens_seen": 29487360, "step": 51125 }, { "epoch": 7.615430443848674, "grad_norm": 1.774561882019043, "learning_rate": 8.172870293386579e-06, "loss": 0.6848, "num_input_tokens_seen": 29489696, "step": 51130 }, { "epoch": 7.616175156389634, "grad_norm": 2.5173730850219727, "learning_rate": 8.168064545501367e-06, "loss": 0.5309, "num_input_tokens_seen": 29492512, "step": 51135 }, { "epoch": 7.616919868930593, "grad_norm": 2.847198724746704, "learning_rate": 8.16325993504992e-06, "loss": 0.714, "num_input_tokens_seen": 29495296, "step": 51140 }, { "epoch": 7.617664581471552, "grad_norm": 2.2070679664611816, "learning_rate": 8.158456462356915e-06, "loss": 0.5603, "num_input_tokens_seen": 29498176, "step": 51145 }, { "epoch": 7.618409294012511, "grad_norm": 2.0578505992889404, "learning_rate": 8.153654127746957e-06, "loss": 0.4944, "num_input_tokens_seen": 29500832, "step": 51150 }, { "epoch": 7.619154006553471, "grad_norm": 2.5543313026428223, "learning_rate": 8.148852931544551e-06, "loss": 0.5932, "num_input_tokens_seen": 29503616, "step": 51155 }, { "epoch": 7.619898719094429, "grad_norm": 5.949685573577881, "learning_rate": 8.144052874074162e-06, "loss": 0.6443, "num_input_tokens_seen": 29506496, "step": 51160 }, { "epoch": 7.620643431635389, "grad_norm": 2.4737584590911865, "learning_rate": 8.139253955660139e-06, "loss": 0.3881, "num_input_tokens_seen": 29508928, "step": 51165 }, { "epoch": 7.621388144176348, "grad_norm": 2.194563150405884, "learning_rate": 8.134456176626784e-06, "loss": 0.6219, "num_input_tokens_seen": 29511840, "step": 51170 }, { "epoch": 7.6221328567173074, "grad_norm": 3.1258962154388428, "learning_rate": 8.129659537298308e-06, "loss": 0.6967, "num_input_tokens_seen": 29514752, "step": 51175 }, { "epoch": 7.622877569258266, "grad_norm": 1.937333106994629, "learning_rate": 8.124864037998852e-06, "loss": 0.6152, "num_input_tokens_seen": 29517376, "step": 51180 }, { "epoch": 7.623622281799226, "grad_norm": 3.9189586639404297, "learning_rate": 8.120069679052477e-06, "loss": 0.676, "num_input_tokens_seen": 29520320, "step": 51185 }, { "epoch": 7.624366994340185, "grad_norm": 1.4067474603652954, "learning_rate": 8.115276460783172e-06, "loss": 0.557, "num_input_tokens_seen": 29523200, "step": 51190 }, { "epoch": 7.625111706881144, "grad_norm": 3.3430750370025635, "learning_rate": 8.110484383514835e-06, "loss": 0.6567, "num_input_tokens_seen": 29525984, "step": 51195 }, { "epoch": 7.625856419422103, "grad_norm": 2.7898406982421875, "learning_rate": 8.105693447571286e-06, "loss": 0.6863, "num_input_tokens_seen": 29528992, "step": 51200 }, { "epoch": 7.626601131963062, "grad_norm": 1.8855208158493042, "learning_rate": 8.100903653276287e-06, "loss": 0.6808, "num_input_tokens_seen": 29531712, "step": 51205 }, { "epoch": 7.627345844504021, "grad_norm": 2.752009630203247, "learning_rate": 8.096115000953513e-06, "loss": 0.4607, "num_input_tokens_seen": 29534432, "step": 51210 }, { "epoch": 7.628090557044981, "grad_norm": 2.194535732269287, "learning_rate": 8.091327490926561e-06, "loss": 0.6013, "num_input_tokens_seen": 29537248, "step": 51215 }, { "epoch": 7.62883526958594, "grad_norm": 4.453298568725586, "learning_rate": 8.08654112351895e-06, "loss": 0.5897, "num_input_tokens_seen": 29540096, "step": 51220 }, { "epoch": 7.629579982126899, "grad_norm": 1.9142671823501587, "learning_rate": 8.081755899054136e-06, "loss": 0.7233, "num_input_tokens_seen": 29543040, "step": 51225 }, { "epoch": 7.630324694667858, "grad_norm": 1.8736125230789185, "learning_rate": 8.076971817855472e-06, "loss": 0.64, "num_input_tokens_seen": 29545952, "step": 51230 }, { "epoch": 7.631069407208818, "grad_norm": 1.9650624990463257, "learning_rate": 8.07218888024624e-06, "loss": 0.7134, "num_input_tokens_seen": 29549024, "step": 51235 }, { "epoch": 7.631814119749777, "grad_norm": 1.8678288459777832, "learning_rate": 8.067407086549661e-06, "loss": 0.7334, "num_input_tokens_seen": 29552064, "step": 51240 }, { "epoch": 7.632558832290735, "grad_norm": 2.5294835567474365, "learning_rate": 8.06262643708887e-06, "loss": 0.6948, "num_input_tokens_seen": 29554944, "step": 51245 }, { "epoch": 7.633303544831695, "grad_norm": 1.8811110258102417, "learning_rate": 8.05784693218692e-06, "loss": 0.4935, "num_input_tokens_seen": 29557920, "step": 51250 }, { "epoch": 7.634048257372654, "grad_norm": 7.186748027801514, "learning_rate": 8.053068572166797e-06, "loss": 0.6478, "num_input_tokens_seen": 29560736, "step": 51255 }, { "epoch": 7.6347929699136134, "grad_norm": 2.9452900886535645, "learning_rate": 8.048291357351395e-06, "loss": 0.5187, "num_input_tokens_seen": 29563680, "step": 51260 }, { "epoch": 7.635537682454572, "grad_norm": 3.7331771850585938, "learning_rate": 8.043515288063542e-06, "loss": 0.5587, "num_input_tokens_seen": 29566528, "step": 51265 }, { "epoch": 7.636282394995532, "grad_norm": 2.0104780197143555, "learning_rate": 8.038740364625994e-06, "loss": 0.5038, "num_input_tokens_seen": 29569344, "step": 51270 }, { "epoch": 7.637027107536491, "grad_norm": 2.037583112716675, "learning_rate": 8.033966587361402e-06, "loss": 0.6357, "num_input_tokens_seen": 29572448, "step": 51275 }, { "epoch": 7.63777182007745, "grad_norm": 2.990224599838257, "learning_rate": 8.029193956592371e-06, "loss": 0.5643, "num_input_tokens_seen": 29575040, "step": 51280 }, { "epoch": 7.638516532618409, "grad_norm": 2.693105697631836, "learning_rate": 8.024422472641416e-06, "loss": 0.5423, "num_input_tokens_seen": 29578272, "step": 51285 }, { "epoch": 7.639261245159369, "grad_norm": 3.921969175338745, "learning_rate": 8.01965213583098e-06, "loss": 0.6894, "num_input_tokens_seen": 29581056, "step": 51290 }, { "epoch": 7.640005957700327, "grad_norm": 1.501585602760315, "learning_rate": 8.014882946483403e-06, "loss": 0.5695, "num_input_tokens_seen": 29583808, "step": 51295 }, { "epoch": 7.640750670241287, "grad_norm": 1.7351614236831665, "learning_rate": 8.010114904920984e-06, "loss": 0.7897, "num_input_tokens_seen": 29586848, "step": 51300 }, { "epoch": 7.641495382782246, "grad_norm": 2.314695119857788, "learning_rate": 8.005348011465925e-06, "loss": 0.5945, "num_input_tokens_seen": 29589600, "step": 51305 }, { "epoch": 7.6422400953232055, "grad_norm": 3.4509241580963135, "learning_rate": 8.000582266440356e-06, "loss": 0.6502, "num_input_tokens_seen": 29592480, "step": 51310 }, { "epoch": 7.642984807864164, "grad_norm": 2.7075157165527344, "learning_rate": 7.995817670166319e-06, "loss": 0.4875, "num_input_tokens_seen": 29595488, "step": 51315 }, { "epoch": 7.643729520405124, "grad_norm": 2.4910881519317627, "learning_rate": 7.991054222965788e-06, "loss": 0.7739, "num_input_tokens_seen": 29598432, "step": 51320 }, { "epoch": 7.644474232946083, "grad_norm": 1.6769368648529053, "learning_rate": 7.986291925160668e-06, "loss": 0.4274, "num_input_tokens_seen": 29601312, "step": 51325 }, { "epoch": 7.645218945487042, "grad_norm": 5.244916915893555, "learning_rate": 7.98153077707276e-06, "loss": 0.5629, "num_input_tokens_seen": 29604032, "step": 51330 }, { "epoch": 7.645963658028001, "grad_norm": 1.0662461519241333, "learning_rate": 7.976770779023807e-06, "loss": 0.6202, "num_input_tokens_seen": 29606752, "step": 51335 }, { "epoch": 7.646708370568961, "grad_norm": 3.820590019226074, "learning_rate": 7.972011931335474e-06, "loss": 0.6514, "num_input_tokens_seen": 29609504, "step": 51340 }, { "epoch": 7.6474530831099194, "grad_norm": 2.6297011375427246, "learning_rate": 7.967254234329347e-06, "loss": 0.6339, "num_input_tokens_seen": 29612384, "step": 51345 }, { "epoch": 7.648197795650879, "grad_norm": 2.9578020572662354, "learning_rate": 7.962497688326934e-06, "loss": 0.6188, "num_input_tokens_seen": 29615488, "step": 51350 }, { "epoch": 7.648942508191838, "grad_norm": 2.60603666305542, "learning_rate": 7.95774229364965e-06, "loss": 0.7039, "num_input_tokens_seen": 29618816, "step": 51355 }, { "epoch": 7.6496872207327975, "grad_norm": 1.5328137874603271, "learning_rate": 7.952988050618862e-06, "loss": 0.6901, "num_input_tokens_seen": 29621504, "step": 51360 }, { "epoch": 7.650431933273756, "grad_norm": 4.240530490875244, "learning_rate": 7.948234959555825e-06, "loss": 0.6622, "num_input_tokens_seen": 29624448, "step": 51365 }, { "epoch": 7.651176645814716, "grad_norm": 1.5885571241378784, "learning_rate": 7.943483020781741e-06, "loss": 0.4738, "num_input_tokens_seen": 29627488, "step": 51370 }, { "epoch": 7.651921358355675, "grad_norm": 2.37611985206604, "learning_rate": 7.93873223461773e-06, "loss": 0.5171, "num_input_tokens_seen": 29630080, "step": 51375 }, { "epoch": 7.652666070896634, "grad_norm": 3.0901968479156494, "learning_rate": 7.933982601384825e-06, "loss": 0.5723, "num_input_tokens_seen": 29633056, "step": 51380 }, { "epoch": 7.653410783437593, "grad_norm": 2.127084970474243, "learning_rate": 7.929234121403994e-06, "loss": 0.6152, "num_input_tokens_seen": 29636128, "step": 51385 }, { "epoch": 7.654155495978552, "grad_norm": 4.351748943328857, "learning_rate": 7.924486794996122e-06, "loss": 0.7141, "num_input_tokens_seen": 29639008, "step": 51390 }, { "epoch": 7.6549002085195115, "grad_norm": 2.223734140396118, "learning_rate": 7.919740622482012e-06, "loss": 0.6608, "num_input_tokens_seen": 29641888, "step": 51395 }, { "epoch": 7.655644921060471, "grad_norm": 2.481102466583252, "learning_rate": 7.914995604182373e-06, "loss": 0.7851, "num_input_tokens_seen": 29644736, "step": 51400 }, { "epoch": 7.65638963360143, "grad_norm": 2.498851776123047, "learning_rate": 7.910251740417873e-06, "loss": 0.6004, "num_input_tokens_seen": 29647456, "step": 51405 }, { "epoch": 7.657134346142389, "grad_norm": 3.3036882877349854, "learning_rate": 7.905509031509079e-06, "loss": 0.8269, "num_input_tokens_seen": 29650656, "step": 51410 }, { "epoch": 7.657879058683348, "grad_norm": 2.6006932258605957, "learning_rate": 7.900767477776483e-06, "loss": 0.6382, "num_input_tokens_seen": 29653568, "step": 51415 }, { "epoch": 7.658623771224307, "grad_norm": 1.0418152809143066, "learning_rate": 7.896027079540508e-06, "loss": 0.4518, "num_input_tokens_seen": 29656384, "step": 51420 }, { "epoch": 7.659368483765267, "grad_norm": 1.9945480823516846, "learning_rate": 7.891287837121472e-06, "loss": 0.6074, "num_input_tokens_seen": 29659232, "step": 51425 }, { "epoch": 7.6601131963062254, "grad_norm": 2.7657880783081055, "learning_rate": 7.886549750839658e-06, "loss": 0.7077, "num_input_tokens_seen": 29662240, "step": 51430 }, { "epoch": 7.660857908847185, "grad_norm": 1.4766857624053955, "learning_rate": 7.881812821015221e-06, "loss": 0.5471, "num_input_tokens_seen": 29665312, "step": 51435 }, { "epoch": 7.661602621388144, "grad_norm": 2.657390832901001, "learning_rate": 7.877077047968282e-06, "loss": 0.4735, "num_input_tokens_seen": 29668192, "step": 51440 }, { "epoch": 7.6623473339291035, "grad_norm": 2.843202829360962, "learning_rate": 7.872342432018856e-06, "loss": 0.5719, "num_input_tokens_seen": 29671040, "step": 51445 }, { "epoch": 7.663092046470062, "grad_norm": 2.2765755653381348, "learning_rate": 7.867608973486892e-06, "loss": 0.5943, "num_input_tokens_seen": 29673792, "step": 51450 }, { "epoch": 7.663836759011022, "grad_norm": 1.6624494791030884, "learning_rate": 7.862876672692265e-06, "loss": 0.4892, "num_input_tokens_seen": 29676736, "step": 51455 }, { "epoch": 7.664581471551981, "grad_norm": 3.9206576347351074, "learning_rate": 7.858145529954752e-06, "loss": 0.692, "num_input_tokens_seen": 29679712, "step": 51460 }, { "epoch": 7.66532618409294, "grad_norm": 2.934786796569824, "learning_rate": 7.853415545594073e-06, "loss": 0.4684, "num_input_tokens_seen": 29682432, "step": 51465 }, { "epoch": 7.666070896633899, "grad_norm": 3.218270778656006, "learning_rate": 7.848686719929863e-06, "loss": 0.7176, "num_input_tokens_seen": 29685184, "step": 51470 }, { "epoch": 7.666815609174859, "grad_norm": 2.7417759895324707, "learning_rate": 7.843959053281663e-06, "loss": 0.6525, "num_input_tokens_seen": 29687968, "step": 51475 }, { "epoch": 7.6675603217158175, "grad_norm": 3.661638021469116, "learning_rate": 7.839232545968964e-06, "loss": 0.671, "num_input_tokens_seen": 29690912, "step": 51480 }, { "epoch": 7.668305034256777, "grad_norm": 2.0385122299194336, "learning_rate": 7.834507198311154e-06, "loss": 0.6394, "num_input_tokens_seen": 29693792, "step": 51485 }, { "epoch": 7.669049746797736, "grad_norm": 2.875610113143921, "learning_rate": 7.829783010627568e-06, "loss": 0.5158, "num_input_tokens_seen": 29696512, "step": 51490 }, { "epoch": 7.6697944593386955, "grad_norm": 1.5720807313919067, "learning_rate": 7.82505998323743e-06, "loss": 0.5042, "num_input_tokens_seen": 29699456, "step": 51495 }, { "epoch": 7.670539171879654, "grad_norm": 3.3797290325164795, "learning_rate": 7.820338116459908e-06, "loss": 0.4801, "num_input_tokens_seen": 29702304, "step": 51500 }, { "epoch": 7.671283884420614, "grad_norm": 1.3092304468154907, "learning_rate": 7.815617410614087e-06, "loss": 0.5884, "num_input_tokens_seen": 29705312, "step": 51505 }, { "epoch": 7.672028596961573, "grad_norm": 2.097371816635132, "learning_rate": 7.81089786601898e-06, "loss": 0.5489, "num_input_tokens_seen": 29708320, "step": 51510 }, { "epoch": 7.672773309502532, "grad_norm": 2.1312789916992188, "learning_rate": 7.806179482993514e-06, "loss": 0.4027, "num_input_tokens_seen": 29711616, "step": 51515 }, { "epoch": 7.673518022043491, "grad_norm": 2.810764789581299, "learning_rate": 7.801462261856526e-06, "loss": 0.6201, "num_input_tokens_seen": 29714848, "step": 51520 }, { "epoch": 7.674262734584451, "grad_norm": 1.960892677307129, "learning_rate": 7.796746202926802e-06, "loss": 0.6355, "num_input_tokens_seen": 29717664, "step": 51525 }, { "epoch": 7.6750074471254095, "grad_norm": 2.0916786193847656, "learning_rate": 7.792031306523018e-06, "loss": 0.6007, "num_input_tokens_seen": 29720672, "step": 51530 }, { "epoch": 7.675752159666369, "grad_norm": 4.629528999328613, "learning_rate": 7.787317572963798e-06, "loss": 0.5378, "num_input_tokens_seen": 29723808, "step": 51535 }, { "epoch": 7.676496872207328, "grad_norm": 1.7608083486557007, "learning_rate": 7.782605002567673e-06, "loss": 0.4689, "num_input_tokens_seen": 29726400, "step": 51540 }, { "epoch": 7.6772415847482876, "grad_norm": 4.071713924407959, "learning_rate": 7.777893595653102e-06, "loss": 0.639, "num_input_tokens_seen": 29729504, "step": 51545 }, { "epoch": 7.677986297289246, "grad_norm": 2.183212995529175, "learning_rate": 7.77318335253846e-06, "loss": 0.5971, "num_input_tokens_seen": 29732672, "step": 51550 }, { "epoch": 7.678731009830205, "grad_norm": 1.8642514944076538, "learning_rate": 7.768474273542056e-06, "loss": 0.5912, "num_input_tokens_seen": 29735552, "step": 51555 }, { "epoch": 7.679475722371165, "grad_norm": 2.101210355758667, "learning_rate": 7.763766358982104e-06, "loss": 0.6669, "num_input_tokens_seen": 29738624, "step": 51560 }, { "epoch": 7.680220434912124, "grad_norm": 2.264887809753418, "learning_rate": 7.759059609176735e-06, "loss": 0.5687, "num_input_tokens_seen": 29741376, "step": 51565 }, { "epoch": 7.680965147453083, "grad_norm": 1.5711848735809326, "learning_rate": 7.754354024444021e-06, "loss": 0.446, "num_input_tokens_seen": 29744192, "step": 51570 }, { "epoch": 7.681709859994042, "grad_norm": 4.000319480895996, "learning_rate": 7.749649605101947e-06, "loss": 0.8271, "num_input_tokens_seen": 29746944, "step": 51575 }, { "epoch": 7.6824545725350015, "grad_norm": 1.8948922157287598, "learning_rate": 7.744946351468419e-06, "loss": 0.5798, "num_input_tokens_seen": 29749696, "step": 51580 }, { "epoch": 7.683199285075961, "grad_norm": 1.9917892217636108, "learning_rate": 7.740244263861268e-06, "loss": 0.5892, "num_input_tokens_seen": 29752480, "step": 51585 }, { "epoch": 7.68394399761692, "grad_norm": 2.595715284347534, "learning_rate": 7.73554334259823e-06, "loss": 0.492, "num_input_tokens_seen": 29755232, "step": 51590 }, { "epoch": 7.684688710157879, "grad_norm": 2.3721961975097656, "learning_rate": 7.730843587996989e-06, "loss": 0.6474, "num_input_tokens_seen": 29758304, "step": 51595 }, { "epoch": 7.685433422698838, "grad_norm": 2.219881772994995, "learning_rate": 7.72614500037512e-06, "loss": 0.5273, "num_input_tokens_seen": 29761344, "step": 51600 }, { "epoch": 7.686178135239797, "grad_norm": 3.347001791000366, "learning_rate": 7.72144758005014e-06, "loss": 0.4714, "num_input_tokens_seen": 29764192, "step": 51605 }, { "epoch": 7.686922847780757, "grad_norm": 1.835808515548706, "learning_rate": 7.716751327339484e-06, "loss": 0.5452, "num_input_tokens_seen": 29767008, "step": 51610 }, { "epoch": 7.6876675603217155, "grad_norm": 1.2507010698318481, "learning_rate": 7.712056242560503e-06, "loss": 0.4717, "num_input_tokens_seen": 29769920, "step": 51615 }, { "epoch": 7.688412272862675, "grad_norm": 2.312844753265381, "learning_rate": 7.707362326030482e-06, "loss": 0.6232, "num_input_tokens_seen": 29772960, "step": 51620 }, { "epoch": 7.689156985403634, "grad_norm": 2.9401445388793945, "learning_rate": 7.702669578066602e-06, "loss": 0.7848, "num_input_tokens_seen": 29775872, "step": 51625 }, { "epoch": 7.6899016979445936, "grad_norm": 2.8292009830474854, "learning_rate": 7.697977998985984e-06, "loss": 0.5431, "num_input_tokens_seen": 29778912, "step": 51630 }, { "epoch": 7.690646410485552, "grad_norm": 3.7284960746765137, "learning_rate": 7.693287589105678e-06, "loss": 0.6221, "num_input_tokens_seen": 29781792, "step": 51635 }, { "epoch": 7.691391123026512, "grad_norm": 2.091951847076416, "learning_rate": 7.68859834874262e-06, "loss": 0.621, "num_input_tokens_seen": 29784672, "step": 51640 }, { "epoch": 7.692135835567471, "grad_norm": 2.9196889400482178, "learning_rate": 7.683910278213708e-06, "loss": 0.578, "num_input_tokens_seen": 29787552, "step": 51645 }, { "epoch": 7.69288054810843, "grad_norm": 4.034780979156494, "learning_rate": 7.679223377835735e-06, "loss": 0.55, "num_input_tokens_seen": 29790496, "step": 51650 }, { "epoch": 7.693625260649389, "grad_norm": 1.6782976388931274, "learning_rate": 7.674537647925434e-06, "loss": 0.6582, "num_input_tokens_seen": 29793504, "step": 51655 }, { "epoch": 7.694369973190349, "grad_norm": 2.786426305770874, "learning_rate": 7.669853088799432e-06, "loss": 0.4, "num_input_tokens_seen": 29796416, "step": 51660 }, { "epoch": 7.6951146857313075, "grad_norm": 2.8763391971588135, "learning_rate": 7.665169700774294e-06, "loss": 0.4741, "num_input_tokens_seen": 29799872, "step": 51665 }, { "epoch": 7.695859398272267, "grad_norm": 1.6167114973068237, "learning_rate": 7.660487484166513e-06, "loss": 0.6418, "num_input_tokens_seen": 29802720, "step": 51670 }, { "epoch": 7.696604110813226, "grad_norm": 4.684232711791992, "learning_rate": 7.655806439292498e-06, "loss": 0.7552, "num_input_tokens_seen": 29805472, "step": 51675 }, { "epoch": 7.697348823354186, "grad_norm": 1.8967419862747192, "learning_rate": 7.651126566468559e-06, "loss": 0.5343, "num_input_tokens_seen": 29808448, "step": 51680 }, { "epoch": 7.698093535895144, "grad_norm": 2.971081495285034, "learning_rate": 7.64644786601095e-06, "loss": 0.7079, "num_input_tokens_seen": 29811200, "step": 51685 }, { "epoch": 7.698838248436104, "grad_norm": 0.9522765278816223, "learning_rate": 7.641770338235851e-06, "loss": 0.5946, "num_input_tokens_seen": 29814304, "step": 51690 }, { "epoch": 7.699582960977063, "grad_norm": 1.5170060396194458, "learning_rate": 7.637093983459329e-06, "loss": 0.4913, "num_input_tokens_seen": 29817088, "step": 51695 }, { "epoch": 7.700327673518022, "grad_norm": 1.990256428718567, "learning_rate": 7.632418801997404e-06, "loss": 0.6085, "num_input_tokens_seen": 29819872, "step": 51700 }, { "epoch": 7.701072386058981, "grad_norm": 2.962379217147827, "learning_rate": 7.627744794166003e-06, "loss": 0.5764, "num_input_tokens_seen": 29822592, "step": 51705 }, { "epoch": 7.701817098599941, "grad_norm": 2.275742292404175, "learning_rate": 7.623071960280981e-06, "loss": 0.5792, "num_input_tokens_seen": 29825440, "step": 51710 }, { "epoch": 7.7025618111408996, "grad_norm": 2.572486162185669, "learning_rate": 7.618400300658113e-06, "loss": 0.4966, "num_input_tokens_seen": 29828320, "step": 51715 }, { "epoch": 7.703306523681858, "grad_norm": 2.9606077671051025, "learning_rate": 7.613729815613077e-06, "loss": 0.5407, "num_input_tokens_seen": 29831232, "step": 51720 }, { "epoch": 7.704051236222818, "grad_norm": 2.2018535137176514, "learning_rate": 7.609060505461499e-06, "loss": 0.5407, "num_input_tokens_seen": 29834336, "step": 51725 }, { "epoch": 7.704795948763778, "grad_norm": 1.6950784921646118, "learning_rate": 7.6043923705189005e-06, "loss": 0.5682, "num_input_tokens_seen": 29837440, "step": 51730 }, { "epoch": 7.705540661304736, "grad_norm": 2.1364150047302246, "learning_rate": 7.599725411100739e-06, "loss": 0.8987, "num_input_tokens_seen": 29840416, "step": 51735 }, { "epoch": 7.706285373845695, "grad_norm": 5.0034918785095215, "learning_rate": 7.595059627522389e-06, "loss": 0.5673, "num_input_tokens_seen": 29843552, "step": 51740 }, { "epoch": 7.707030086386655, "grad_norm": 1.3471041917800903, "learning_rate": 7.590395020099145e-06, "loss": 0.5483, "num_input_tokens_seen": 29846240, "step": 51745 }, { "epoch": 7.707774798927614, "grad_norm": 4.445037364959717, "learning_rate": 7.585731589146234e-06, "loss": 0.606, "num_input_tokens_seen": 29848992, "step": 51750 }, { "epoch": 7.708519511468573, "grad_norm": 5.996454238891602, "learning_rate": 7.581069334978771e-06, "loss": 0.6909, "num_input_tokens_seen": 29851872, "step": 51755 }, { "epoch": 7.709264224009532, "grad_norm": 4.831119060516357, "learning_rate": 7.57640825791183e-06, "loss": 0.6868, "num_input_tokens_seen": 29854688, "step": 51760 }, { "epoch": 7.710008936550492, "grad_norm": 2.170003652572632, "learning_rate": 7.571748358260372e-06, "loss": 0.5645, "num_input_tokens_seen": 29857280, "step": 51765 }, { "epoch": 7.71075364909145, "grad_norm": 5.521748065948486, "learning_rate": 7.567089636339303e-06, "loss": 0.6639, "num_input_tokens_seen": 29860064, "step": 51770 }, { "epoch": 7.71149836163241, "grad_norm": 2.5232458114624023, "learning_rate": 7.562432092463439e-06, "loss": 0.6737, "num_input_tokens_seen": 29863136, "step": 51775 }, { "epoch": 7.712243074173369, "grad_norm": 5.170044898986816, "learning_rate": 7.557775726947519e-06, "loss": 0.7819, "num_input_tokens_seen": 29865920, "step": 51780 }, { "epoch": 7.712987786714328, "grad_norm": 2.137270927429199, "learning_rate": 7.553120540106206e-06, "loss": 0.6662, "num_input_tokens_seen": 29868928, "step": 51785 }, { "epoch": 7.713732499255287, "grad_norm": 3.1620426177978516, "learning_rate": 7.5484665322540675e-06, "loss": 0.4911, "num_input_tokens_seen": 29871712, "step": 51790 }, { "epoch": 7.714477211796247, "grad_norm": 1.1178563833236694, "learning_rate": 7.543813703705616e-06, "loss": 0.6209, "num_input_tokens_seen": 29874688, "step": 51795 }, { "epoch": 7.7152219243372056, "grad_norm": 2.736048936843872, "learning_rate": 7.539162054775253e-06, "loss": 0.6489, "num_input_tokens_seen": 29877536, "step": 51800 }, { "epoch": 7.715966636878165, "grad_norm": 3.7125399112701416, "learning_rate": 7.534511585777326e-06, "loss": 0.698, "num_input_tokens_seen": 29880480, "step": 51805 }, { "epoch": 7.716711349419124, "grad_norm": 2.76127552986145, "learning_rate": 7.529862297026099e-06, "loss": 0.5319, "num_input_tokens_seen": 29883424, "step": 51810 }, { "epoch": 7.717456061960084, "grad_norm": 7.477096080780029, "learning_rate": 7.525214188835749e-06, "loss": 0.6251, "num_input_tokens_seen": 29886176, "step": 51815 }, { "epoch": 7.718200774501042, "grad_norm": 2.7490618228912354, "learning_rate": 7.520567261520387e-06, "loss": 0.643, "num_input_tokens_seen": 29889088, "step": 51820 }, { "epoch": 7.718945487042002, "grad_norm": 3.001614809036255, "learning_rate": 7.515921515394014e-06, "loss": 0.3722, "num_input_tokens_seen": 29891744, "step": 51825 }, { "epoch": 7.719690199582961, "grad_norm": 3.080265998840332, "learning_rate": 7.51127695077058e-06, "loss": 0.7524, "num_input_tokens_seen": 29894624, "step": 51830 }, { "epoch": 7.72043491212392, "grad_norm": 1.3292642831802368, "learning_rate": 7.506633567963953e-06, "loss": 0.5181, "num_input_tokens_seen": 29897568, "step": 51835 }, { "epoch": 7.721179624664879, "grad_norm": 2.7919375896453857, "learning_rate": 7.501991367287897e-06, "loss": 0.6412, "num_input_tokens_seen": 29900512, "step": 51840 }, { "epoch": 7.721924337205839, "grad_norm": 2.098557472229004, "learning_rate": 7.497350349056126e-06, "loss": 0.7303, "num_input_tokens_seen": 29903136, "step": 51845 }, { "epoch": 7.722669049746798, "grad_norm": 1.8505713939666748, "learning_rate": 7.492710513582257e-06, "loss": 0.6057, "num_input_tokens_seen": 29906208, "step": 51850 }, { "epoch": 7.723413762287757, "grad_norm": 1.6873750686645508, "learning_rate": 7.488071861179838e-06, "loss": 0.5439, "num_input_tokens_seen": 29908768, "step": 51855 }, { "epoch": 7.724158474828716, "grad_norm": 1.6422679424285889, "learning_rate": 7.4834343921623165e-06, "loss": 0.7, "num_input_tokens_seen": 29911776, "step": 51860 }, { "epoch": 7.724903187369676, "grad_norm": 1.7556248903274536, "learning_rate": 7.478798106843085e-06, "loss": 0.4587, "num_input_tokens_seen": 29914752, "step": 51865 }, { "epoch": 7.725647899910634, "grad_norm": 4.64548397064209, "learning_rate": 7.474163005535439e-06, "loss": 0.5782, "num_input_tokens_seen": 29917312, "step": 51870 }, { "epoch": 7.726392612451594, "grad_norm": 2.0146546363830566, "learning_rate": 7.46952908855261e-06, "loss": 0.5554, "num_input_tokens_seen": 29920224, "step": 51875 }, { "epoch": 7.727137324992553, "grad_norm": 3.01982045173645, "learning_rate": 7.464896356207723e-06, "loss": 0.5385, "num_input_tokens_seen": 29923040, "step": 51880 }, { "epoch": 7.727882037533512, "grad_norm": 2.9422919750213623, "learning_rate": 7.460264808813849e-06, "loss": 0.6693, "num_input_tokens_seen": 29926176, "step": 51885 }, { "epoch": 7.728626750074471, "grad_norm": 2.6538944244384766, "learning_rate": 7.455634446683976e-06, "loss": 0.6168, "num_input_tokens_seen": 29929600, "step": 51890 }, { "epoch": 7.729371462615431, "grad_norm": 2.0988903045654297, "learning_rate": 7.451005270130987e-06, "loss": 0.6247, "num_input_tokens_seen": 29932384, "step": 51895 }, { "epoch": 7.73011617515639, "grad_norm": 1.8390531539916992, "learning_rate": 7.4463772794677145e-06, "loss": 0.4953, "num_input_tokens_seen": 29935584, "step": 51900 }, { "epoch": 7.730860887697348, "grad_norm": 8.369756698608398, "learning_rate": 7.441750475006898e-06, "loss": 0.6032, "num_input_tokens_seen": 29938368, "step": 51905 }, { "epoch": 7.731605600238308, "grad_norm": 1.883711576461792, "learning_rate": 7.4371248570611975e-06, "loss": 0.5003, "num_input_tokens_seen": 29941056, "step": 51910 }, { "epoch": 7.732350312779268, "grad_norm": 2.889549493789673, "learning_rate": 7.4325004259432006e-06, "loss": 0.55, "num_input_tokens_seen": 29943808, "step": 51915 }, { "epoch": 7.733095025320226, "grad_norm": 2.4338560104370117, "learning_rate": 7.427877181965393e-06, "loss": 0.5531, "num_input_tokens_seen": 29946816, "step": 51920 }, { "epoch": 7.733839737861185, "grad_norm": 2.721571207046509, "learning_rate": 7.423255125440212e-06, "loss": 0.5582, "num_input_tokens_seen": 29949760, "step": 51925 }, { "epoch": 7.734584450402145, "grad_norm": 1.6014546155929565, "learning_rate": 7.418634256679976e-06, "loss": 0.548, "num_input_tokens_seen": 29952576, "step": 51930 }, { "epoch": 7.735329162943104, "grad_norm": 1.640425682067871, "learning_rate": 7.414014575996961e-06, "loss": 0.437, "num_input_tokens_seen": 29955360, "step": 51935 }, { "epoch": 7.736073875484063, "grad_norm": 3.726691722869873, "learning_rate": 7.409396083703341e-06, "loss": 0.5, "num_input_tokens_seen": 29957920, "step": 51940 }, { "epoch": 7.736818588025022, "grad_norm": 1.5393775701522827, "learning_rate": 7.404778780111213e-06, "loss": 0.5568, "num_input_tokens_seen": 29960736, "step": 51945 }, { "epoch": 7.737563300565982, "grad_norm": 4.697535991668701, "learning_rate": 7.400162665532606e-06, "loss": 0.7148, "num_input_tokens_seen": 29963616, "step": 51950 }, { "epoch": 7.73830801310694, "grad_norm": 1.4976344108581543, "learning_rate": 7.3955477402794435e-06, "loss": 0.4775, "num_input_tokens_seen": 29966400, "step": 51955 }, { "epoch": 7.7390527256479, "grad_norm": 4.159329414367676, "learning_rate": 7.390934004663597e-06, "loss": 0.7235, "num_input_tokens_seen": 29969248, "step": 51960 }, { "epoch": 7.739797438188859, "grad_norm": 5.51512336730957, "learning_rate": 7.386321458996831e-06, "loss": 0.6905, "num_input_tokens_seen": 29972096, "step": 51965 }, { "epoch": 7.740542150729818, "grad_norm": 2.8114278316497803, "learning_rate": 7.381710103590847e-06, "loss": 0.7616, "num_input_tokens_seen": 29975072, "step": 51970 }, { "epoch": 7.741286863270777, "grad_norm": 4.9018378257751465, "learning_rate": 7.377099938757265e-06, "loss": 0.6183, "num_input_tokens_seen": 29978080, "step": 51975 }, { "epoch": 7.742031575811737, "grad_norm": 2.803738594055176, "learning_rate": 7.372490964807619e-06, "loss": 0.5612, "num_input_tokens_seen": 29980928, "step": 51980 }, { "epoch": 7.742776288352696, "grad_norm": 6.393228054046631, "learning_rate": 7.367883182053373e-06, "loss": 0.5761, "num_input_tokens_seen": 29983904, "step": 51985 }, { "epoch": 7.743521000893655, "grad_norm": 3.1732242107391357, "learning_rate": 7.363276590805887e-06, "loss": 0.5092, "num_input_tokens_seen": 29986720, "step": 51990 }, { "epoch": 7.744265713434614, "grad_norm": 1.1182588338851929, "learning_rate": 7.358671191376474e-06, "loss": 0.4971, "num_input_tokens_seen": 29989600, "step": 51995 }, { "epoch": 7.745010425975574, "grad_norm": 1.9806867837905884, "learning_rate": 7.3540669840763246e-06, "loss": 0.5167, "num_input_tokens_seen": 29992608, "step": 52000 }, { "epoch": 7.745755138516532, "grad_norm": 3.535045623779297, "learning_rate": 7.349463969216589e-06, "loss": 0.5618, "num_input_tokens_seen": 29995968, "step": 52005 }, { "epoch": 7.746499851057492, "grad_norm": 3.030369520187378, "learning_rate": 7.344862147108314e-06, "loss": 0.5736, "num_input_tokens_seen": 29998816, "step": 52010 }, { "epoch": 7.747244563598451, "grad_norm": 2.6033763885498047, "learning_rate": 7.340261518062475e-06, "loss": 0.7148, "num_input_tokens_seen": 30001792, "step": 52015 }, { "epoch": 7.7479892761394105, "grad_norm": 2.92903733253479, "learning_rate": 7.335662082389972e-06, "loss": 0.6124, "num_input_tokens_seen": 30004512, "step": 52020 }, { "epoch": 7.748733988680369, "grad_norm": 2.976372718811035, "learning_rate": 7.3310638404016005e-06, "loss": 0.5866, "num_input_tokens_seen": 30007424, "step": 52025 }, { "epoch": 7.749478701221329, "grad_norm": 1.801628828048706, "learning_rate": 7.326466792408096e-06, "loss": 0.5717, "num_input_tokens_seen": 30010368, "step": 52030 }, { "epoch": 7.750223413762288, "grad_norm": 2.372328519821167, "learning_rate": 7.321870938720118e-06, "loss": 0.624, "num_input_tokens_seen": 30013568, "step": 52035 }, { "epoch": 7.750968126303247, "grad_norm": 3.0853354930877686, "learning_rate": 7.317276279648222e-06, "loss": 0.655, "num_input_tokens_seen": 30016768, "step": 52040 }, { "epoch": 7.751712838844206, "grad_norm": 12.484174728393555, "learning_rate": 7.3126828155029024e-06, "loss": 0.7308, "num_input_tokens_seen": 30019488, "step": 52045 }, { "epoch": 7.752457551385166, "grad_norm": 4.104022979736328, "learning_rate": 7.308090546594565e-06, "loss": 0.6334, "num_input_tokens_seen": 30022272, "step": 52050 }, { "epoch": 7.753202263926124, "grad_norm": 1.5546667575836182, "learning_rate": 7.303499473233546e-06, "loss": 0.4785, "num_input_tokens_seen": 30025216, "step": 52055 }, { "epoch": 7.753946976467084, "grad_norm": 2.360914945602417, "learning_rate": 7.2989095957300804e-06, "loss": 0.6901, "num_input_tokens_seen": 30029024, "step": 52060 }, { "epoch": 7.754691689008043, "grad_norm": 0.7599430084228516, "learning_rate": 7.294320914394331e-06, "loss": 0.5024, "num_input_tokens_seen": 30032192, "step": 52065 }, { "epoch": 7.755436401549002, "grad_norm": 3.181044340133667, "learning_rate": 7.289733429536391e-06, "loss": 0.6852, "num_input_tokens_seen": 30035040, "step": 52070 }, { "epoch": 7.756181114089961, "grad_norm": 2.746711254119873, "learning_rate": 7.285147141466269e-06, "loss": 0.6506, "num_input_tokens_seen": 30037728, "step": 52075 }, { "epoch": 7.756925826630921, "grad_norm": 2.795797109603882, "learning_rate": 7.280562050493872e-06, "loss": 0.5998, "num_input_tokens_seen": 30040672, "step": 52080 }, { "epoch": 7.75767053917188, "grad_norm": 3.8432388305664062, "learning_rate": 7.2759781569290506e-06, "loss": 0.5358, "num_input_tokens_seen": 30043712, "step": 52085 }, { "epoch": 7.758415251712838, "grad_norm": 2.0970687866210938, "learning_rate": 7.27139546108157e-06, "loss": 0.5339, "num_input_tokens_seen": 30046432, "step": 52090 }, { "epoch": 7.759159964253798, "grad_norm": 4.000314712524414, "learning_rate": 7.266813963261099e-06, "loss": 0.6001, "num_input_tokens_seen": 30049504, "step": 52095 }, { "epoch": 7.759904676794758, "grad_norm": 2.128875255584717, "learning_rate": 7.262233663777243e-06, "loss": 0.6043, "num_input_tokens_seen": 30052832, "step": 52100 }, { "epoch": 7.7606493893357165, "grad_norm": 3.6766357421875, "learning_rate": 7.257654562939517e-06, "loss": 0.4762, "num_input_tokens_seen": 30055456, "step": 52105 }, { "epoch": 7.761394101876675, "grad_norm": 2.2184548377990723, "learning_rate": 7.253076661057362e-06, "loss": 0.6622, "num_input_tokens_seen": 30058208, "step": 52110 }, { "epoch": 7.762138814417635, "grad_norm": 3.347205877304077, "learning_rate": 7.248499958440141e-06, "loss": 0.645, "num_input_tokens_seen": 30060928, "step": 52115 }, { "epoch": 7.762883526958594, "grad_norm": 2.340876817703247, "learning_rate": 7.243924455397111e-06, "loss": 0.5052, "num_input_tokens_seen": 30063648, "step": 52120 }, { "epoch": 7.763628239499553, "grad_norm": 2.7761824131011963, "learning_rate": 7.2393501522374844e-06, "loss": 0.4644, "num_input_tokens_seen": 30066656, "step": 52125 }, { "epoch": 7.764372952040512, "grad_norm": 3.7562003135681152, "learning_rate": 7.234777049270358e-06, "loss": 0.5387, "num_input_tokens_seen": 30069504, "step": 52130 }, { "epoch": 7.765117664581472, "grad_norm": 2.3455452919006348, "learning_rate": 7.230205146804769e-06, "loss": 0.5247, "num_input_tokens_seen": 30072512, "step": 52135 }, { "epoch": 7.76586237712243, "grad_norm": 2.10588002204895, "learning_rate": 7.2256344451496676e-06, "loss": 0.4818, "num_input_tokens_seen": 30075360, "step": 52140 }, { "epoch": 7.76660708966339, "grad_norm": 2.12505841255188, "learning_rate": 7.221064944613929e-06, "loss": 0.6659, "num_input_tokens_seen": 30078304, "step": 52145 }, { "epoch": 7.767351802204349, "grad_norm": 3.2855749130249023, "learning_rate": 7.2164966455063435e-06, "loss": 0.6499, "num_input_tokens_seen": 30081184, "step": 52150 }, { "epoch": 7.7680965147453085, "grad_norm": 2.954655885696411, "learning_rate": 7.2119295481356044e-06, "loss": 0.6962, "num_input_tokens_seen": 30084000, "step": 52155 }, { "epoch": 7.768841227286267, "grad_norm": 3.4147555828094482, "learning_rate": 7.2073636528103535e-06, "loss": 0.539, "num_input_tokens_seen": 30086944, "step": 52160 }, { "epoch": 7.769585939827227, "grad_norm": 2.9495527744293213, "learning_rate": 7.202798959839119e-06, "loss": 0.6619, "num_input_tokens_seen": 30090048, "step": 52165 }, { "epoch": 7.770330652368186, "grad_norm": 1.8257659673690796, "learning_rate": 7.198235469530374e-06, "loss": 0.5998, "num_input_tokens_seen": 30093088, "step": 52170 }, { "epoch": 7.771075364909145, "grad_norm": 2.753126859664917, "learning_rate": 7.193673182192498e-06, "loss": 0.4146, "num_input_tokens_seen": 30096128, "step": 52175 }, { "epoch": 7.771820077450104, "grad_norm": 2.2808737754821777, "learning_rate": 7.189112098133793e-06, "loss": 0.714, "num_input_tokens_seen": 30099040, "step": 52180 }, { "epoch": 7.772564789991064, "grad_norm": 3.363868236541748, "learning_rate": 7.184552217662488e-06, "loss": 0.5852, "num_input_tokens_seen": 30101888, "step": 52185 }, { "epoch": 7.7733095025320225, "grad_norm": 1.9211602210998535, "learning_rate": 7.179993541086702e-06, "loss": 0.6943, "num_input_tokens_seen": 30104640, "step": 52190 }, { "epoch": 7.774054215072982, "grad_norm": 2.416534900665283, "learning_rate": 7.175436068714503e-06, "loss": 0.6268, "num_input_tokens_seen": 30108096, "step": 52195 }, { "epoch": 7.774798927613941, "grad_norm": 2.3827881813049316, "learning_rate": 7.170879800853872e-06, "loss": 0.5172, "num_input_tokens_seen": 30110912, "step": 52200 }, { "epoch": 7.7755436401549005, "grad_norm": 2.9136641025543213, "learning_rate": 7.166324737812688e-06, "loss": 0.5939, "num_input_tokens_seen": 30113536, "step": 52205 }, { "epoch": 7.776288352695859, "grad_norm": 3.904402732849121, "learning_rate": 7.161770879898771e-06, "loss": 0.4806, "num_input_tokens_seen": 30116672, "step": 52210 }, { "epoch": 7.777033065236819, "grad_norm": 1.820890188217163, "learning_rate": 7.1572182274198564e-06, "loss": 0.5271, "num_input_tokens_seen": 30119648, "step": 52215 }, { "epoch": 7.777777777777778, "grad_norm": 3.106799840927124, "learning_rate": 7.152666780683595e-06, "loss": 0.5531, "num_input_tokens_seen": 30122560, "step": 52220 }, { "epoch": 7.778522490318737, "grad_norm": 4.077676296234131, "learning_rate": 7.148116539997546e-06, "loss": 0.7376, "num_input_tokens_seen": 30125568, "step": 52225 }, { "epoch": 7.779267202859696, "grad_norm": 4.186672687530518, "learning_rate": 7.143567505669199e-06, "loss": 0.5737, "num_input_tokens_seen": 30128864, "step": 52230 }, { "epoch": 7.780011915400656, "grad_norm": 3.813431739807129, "learning_rate": 7.139019678005959e-06, "loss": 0.5807, "num_input_tokens_seen": 30131744, "step": 52235 }, { "epoch": 7.7807566279416145, "grad_norm": 2.1958425045013428, "learning_rate": 7.134473057315163e-06, "loss": 0.4932, "num_input_tokens_seen": 30134432, "step": 52240 }, { "epoch": 7.781501340482574, "grad_norm": 2.648838996887207, "learning_rate": 7.129927643904033e-06, "loss": 0.496, "num_input_tokens_seen": 30137248, "step": 52245 }, { "epoch": 7.782246053023533, "grad_norm": 4.41737699508667, "learning_rate": 7.125383438079736e-06, "loss": 0.4035, "num_input_tokens_seen": 30140096, "step": 52250 }, { "epoch": 7.782990765564492, "grad_norm": 4.338443279266357, "learning_rate": 7.120840440149365e-06, "loss": 0.5292, "num_input_tokens_seen": 30142784, "step": 52255 }, { "epoch": 7.783735478105451, "grad_norm": 3.878131628036499, "learning_rate": 7.1162986504198945e-06, "loss": 0.5477, "num_input_tokens_seen": 30145568, "step": 52260 }, { "epoch": 7.784480190646411, "grad_norm": 4.553080081939697, "learning_rate": 7.1117580691982545e-06, "loss": 0.5892, "num_input_tokens_seen": 30148896, "step": 52265 }, { "epoch": 7.78522490318737, "grad_norm": 2.998985528945923, "learning_rate": 7.107218696791273e-06, "loss": 0.4489, "num_input_tokens_seen": 30151552, "step": 52270 }, { "epoch": 7.7859696157283285, "grad_norm": 2.1280460357666016, "learning_rate": 7.102680533505707e-06, "loss": 0.4322, "num_input_tokens_seen": 30154688, "step": 52275 }, { "epoch": 7.786714328269288, "grad_norm": 2.490173816680908, "learning_rate": 7.0981435796482306e-06, "loss": 0.7139, "num_input_tokens_seen": 30157728, "step": 52280 }, { "epoch": 7.787459040810247, "grad_norm": 2.3047280311584473, "learning_rate": 7.093607835525423e-06, "loss": 0.558, "num_input_tokens_seen": 30160672, "step": 52285 }, { "epoch": 7.7882037533512065, "grad_norm": 3.3667056560516357, "learning_rate": 7.089073301443802e-06, "loss": 0.5267, "num_input_tokens_seen": 30163584, "step": 52290 }, { "epoch": 7.788948465892165, "grad_norm": 2.045058012008667, "learning_rate": 7.084539977709778e-06, "loss": 0.5794, "num_input_tokens_seen": 30166368, "step": 52295 }, { "epoch": 7.789693178433125, "grad_norm": 2.134399890899658, "learning_rate": 7.080007864629706e-06, "loss": 0.534, "num_input_tokens_seen": 30169088, "step": 52300 }, { "epoch": 7.790437890974084, "grad_norm": 7.800858974456787, "learning_rate": 7.075476962509845e-06, "loss": 0.8111, "num_input_tokens_seen": 30171712, "step": 52305 }, { "epoch": 7.791182603515043, "grad_norm": 7.7123188972473145, "learning_rate": 7.070947271656372e-06, "loss": 0.3897, "num_input_tokens_seen": 30174720, "step": 52310 }, { "epoch": 7.791927316056002, "grad_norm": 8.537421226501465, "learning_rate": 7.0664187923753984e-06, "loss": 0.5079, "num_input_tokens_seen": 30177632, "step": 52315 }, { "epoch": 7.792672028596962, "grad_norm": 2.398554801940918, "learning_rate": 7.061891524972927e-06, "loss": 0.4864, "num_input_tokens_seen": 30180928, "step": 52320 }, { "epoch": 7.7934167411379205, "grad_norm": 2.318352699279785, "learning_rate": 7.057365469754892e-06, "loss": 0.5472, "num_input_tokens_seen": 30183552, "step": 52325 }, { "epoch": 7.79416145367888, "grad_norm": 1.9077222347259521, "learning_rate": 7.052840627027146e-06, "loss": 0.4985, "num_input_tokens_seen": 30186432, "step": 52330 }, { "epoch": 7.794906166219839, "grad_norm": 1.8600143194198608, "learning_rate": 7.048316997095464e-06, "loss": 0.5582, "num_input_tokens_seen": 30189472, "step": 52335 }, { "epoch": 7.7956508787607985, "grad_norm": 1.6388318538665771, "learning_rate": 7.0437945802655334e-06, "loss": 0.6873, "num_input_tokens_seen": 30192160, "step": 52340 }, { "epoch": 7.796395591301757, "grad_norm": 2.0966978073120117, "learning_rate": 7.039273376842958e-06, "loss": 0.5071, "num_input_tokens_seen": 30195136, "step": 52345 }, { "epoch": 7.797140303842717, "grad_norm": 3.0025839805603027, "learning_rate": 7.034753387133275e-06, "loss": 0.5, "num_input_tokens_seen": 30198208, "step": 52350 }, { "epoch": 7.797885016383676, "grad_norm": 2.1796228885650635, "learning_rate": 7.03023461144191e-06, "loss": 0.7234, "num_input_tokens_seen": 30201088, "step": 52355 }, { "epoch": 7.798629728924635, "grad_norm": 3.664206027984619, "learning_rate": 7.025717050074235e-06, "loss": 0.5724, "num_input_tokens_seen": 30204192, "step": 52360 }, { "epoch": 7.799374441465594, "grad_norm": 2.3122291564941406, "learning_rate": 7.021200703335518e-06, "loss": 0.5037, "num_input_tokens_seen": 30206976, "step": 52365 }, { "epoch": 7.800119154006554, "grad_norm": 2.6021616458892822, "learning_rate": 7.01668557153096e-06, "loss": 0.7045, "num_input_tokens_seen": 30209696, "step": 52370 }, { "epoch": 7.8008638665475125, "grad_norm": 3.1430540084838867, "learning_rate": 7.012171654965677e-06, "loss": 0.6518, "num_input_tokens_seen": 30212256, "step": 52375 }, { "epoch": 7.801608579088472, "grad_norm": 2.2741568088531494, "learning_rate": 7.007658953944699e-06, "loss": 0.6731, "num_input_tokens_seen": 30215264, "step": 52380 }, { "epoch": 7.802353291629431, "grad_norm": 2.0742027759552, "learning_rate": 7.003147468772986e-06, "loss": 0.592, "num_input_tokens_seen": 30218016, "step": 52385 }, { "epoch": 7.803098004170391, "grad_norm": 3.2149691581726074, "learning_rate": 6.998637199755389e-06, "loss": 0.4595, "num_input_tokens_seen": 30220832, "step": 52390 }, { "epoch": 7.803842716711349, "grad_norm": 2.4908690452575684, "learning_rate": 6.994128147196702e-06, "loss": 0.6759, "num_input_tokens_seen": 30223616, "step": 52395 }, { "epoch": 7.804587429252309, "grad_norm": 3.5366952419281006, "learning_rate": 6.989620311401637e-06, "loss": 0.4332, "num_input_tokens_seen": 30226464, "step": 52400 }, { "epoch": 7.805332141793268, "grad_norm": 1.9253995418548584, "learning_rate": 6.985113692674797e-06, "loss": 0.8107, "num_input_tokens_seen": 30229248, "step": 52405 }, { "epoch": 7.806076854334227, "grad_norm": 2.251953363418579, "learning_rate": 6.980608291320731e-06, "loss": 0.4844, "num_input_tokens_seen": 30232192, "step": 52410 }, { "epoch": 7.806821566875186, "grad_norm": 2.373469829559326, "learning_rate": 6.976104107643896e-06, "loss": 0.584, "num_input_tokens_seen": 30235008, "step": 52415 }, { "epoch": 7.807566279416145, "grad_norm": 4.144706726074219, "learning_rate": 6.9716011419486745e-06, "loss": 0.6567, "num_input_tokens_seen": 30237792, "step": 52420 }, { "epoch": 7.8083109919571045, "grad_norm": 2.220764636993408, "learning_rate": 6.96709939453934e-06, "loss": 0.5983, "num_input_tokens_seen": 30240928, "step": 52425 }, { "epoch": 7.809055704498064, "grad_norm": 2.172781467437744, "learning_rate": 6.962598865720113e-06, "loss": 0.6051, "num_input_tokens_seen": 30243776, "step": 52430 }, { "epoch": 7.809800417039023, "grad_norm": 4.6364336013793945, "learning_rate": 6.95809955579512e-06, "loss": 0.4438, "num_input_tokens_seen": 30246528, "step": 52435 }, { "epoch": 7.810545129579982, "grad_norm": 8.330404281616211, "learning_rate": 6.95360146506841e-06, "loss": 0.59, "num_input_tokens_seen": 30249536, "step": 52440 }, { "epoch": 7.811289842120941, "grad_norm": 6.38574743270874, "learning_rate": 6.949104593843939e-06, "loss": 0.5834, "num_input_tokens_seen": 30252576, "step": 52445 }, { "epoch": 7.812034554661901, "grad_norm": 2.9877946376800537, "learning_rate": 6.9446089424255875e-06, "loss": 0.5056, "num_input_tokens_seen": 30255488, "step": 52450 }, { "epoch": 7.81277926720286, "grad_norm": 3.8798108100891113, "learning_rate": 6.940114511117163e-06, "loss": 0.542, "num_input_tokens_seen": 30258784, "step": 52455 }, { "epoch": 7.8135239797438185, "grad_norm": 1.8002283573150635, "learning_rate": 6.935621300222367e-06, "loss": 0.6465, "num_input_tokens_seen": 30261376, "step": 52460 }, { "epoch": 7.814268692284778, "grad_norm": 2.634432077407837, "learning_rate": 6.93112931004484e-06, "loss": 0.3889, "num_input_tokens_seen": 30264224, "step": 52465 }, { "epoch": 7.815013404825737, "grad_norm": 3.432438850402832, "learning_rate": 6.9266385408881305e-06, "loss": 0.6729, "num_input_tokens_seen": 30267136, "step": 52470 }, { "epoch": 7.815758117366697, "grad_norm": 5.976423263549805, "learning_rate": 6.922148993055708e-06, "loss": 0.5213, "num_input_tokens_seen": 30269920, "step": 52475 }, { "epoch": 7.816502829907655, "grad_norm": 3.2975497245788574, "learning_rate": 6.917660666850964e-06, "loss": 0.5908, "num_input_tokens_seen": 30272800, "step": 52480 }, { "epoch": 7.817247542448615, "grad_norm": 2.8674588203430176, "learning_rate": 6.913173562577193e-06, "loss": 0.5777, "num_input_tokens_seen": 30275680, "step": 52485 }, { "epoch": 7.817992254989574, "grad_norm": 5.574909210205078, "learning_rate": 6.908687680537615e-06, "loss": 0.6478, "num_input_tokens_seen": 30278656, "step": 52490 }, { "epoch": 7.818736967530533, "grad_norm": 4.078383922576904, "learning_rate": 6.904203021035366e-06, "loss": 0.5855, "num_input_tokens_seen": 30281632, "step": 52495 }, { "epoch": 7.819481680071492, "grad_norm": 1.573360800743103, "learning_rate": 6.899719584373504e-06, "loss": 0.5107, "num_input_tokens_seen": 30284416, "step": 52500 }, { "epoch": 7.820226392612452, "grad_norm": 3.9728317260742188, "learning_rate": 6.895237370855004e-06, "loss": 0.8007, "num_input_tokens_seen": 30287392, "step": 52505 }, { "epoch": 7.8209711051534105, "grad_norm": 1.9856104850769043, "learning_rate": 6.890756380782751e-06, "loss": 0.5086, "num_input_tokens_seen": 30290368, "step": 52510 }, { "epoch": 7.82171581769437, "grad_norm": 3.6093790531158447, "learning_rate": 6.886276614459567e-06, "loss": 0.6679, "num_input_tokens_seen": 30293216, "step": 52515 }, { "epoch": 7.822460530235329, "grad_norm": 2.3409483432769775, "learning_rate": 6.881798072188159e-06, "loss": 0.5382, "num_input_tokens_seen": 30296224, "step": 52520 }, { "epoch": 7.823205242776289, "grad_norm": 3.6016907691955566, "learning_rate": 6.8773207542711716e-06, "loss": 0.6784, "num_input_tokens_seen": 30299232, "step": 52525 }, { "epoch": 7.823949955317247, "grad_norm": 0.6017270684242249, "learning_rate": 6.872844661011163e-06, "loss": 0.3737, "num_input_tokens_seen": 30302112, "step": 52530 }, { "epoch": 7.824694667858207, "grad_norm": 3.4047961235046387, "learning_rate": 6.868369792710613e-06, "loss": 0.427, "num_input_tokens_seen": 30305120, "step": 52535 }, { "epoch": 7.825439380399166, "grad_norm": 2.448003053665161, "learning_rate": 6.863896149671914e-06, "loss": 0.5456, "num_input_tokens_seen": 30307968, "step": 52540 }, { "epoch": 7.826184092940125, "grad_norm": 2.408080577850342, "learning_rate": 6.859423732197379e-06, "loss": 0.5262, "num_input_tokens_seen": 30310752, "step": 52545 }, { "epoch": 7.826928805481084, "grad_norm": 3.1247828006744385, "learning_rate": 6.854952540589241e-06, "loss": 0.625, "num_input_tokens_seen": 30313568, "step": 52550 }, { "epoch": 7.827673518022044, "grad_norm": 1.8113394975662231, "learning_rate": 6.850482575149631e-06, "loss": 0.3806, "num_input_tokens_seen": 30316576, "step": 52555 }, { "epoch": 7.828418230563003, "grad_norm": 2.514082431793213, "learning_rate": 6.846013836180623e-06, "loss": 0.8062, "num_input_tokens_seen": 30319520, "step": 52560 }, { "epoch": 7.829162943103962, "grad_norm": 1.882908582687378, "learning_rate": 6.8415463239841854e-06, "loss": 0.541, "num_input_tokens_seen": 30322336, "step": 52565 }, { "epoch": 7.829907655644921, "grad_norm": 3.384634017944336, "learning_rate": 6.83708003886222e-06, "loss": 0.5374, "num_input_tokens_seen": 30325184, "step": 52570 }, { "epoch": 7.830652368185881, "grad_norm": 1.4330289363861084, "learning_rate": 6.832614981116542e-06, "loss": 0.5216, "num_input_tokens_seen": 30327744, "step": 52575 }, { "epoch": 7.831397080726839, "grad_norm": 3.6759049892425537, "learning_rate": 6.8281511510488785e-06, "loss": 0.8717, "num_input_tokens_seen": 30330880, "step": 52580 }, { "epoch": 7.832141793267798, "grad_norm": 3.3958358764648438, "learning_rate": 6.8236885489608885e-06, "loss": 0.592, "num_input_tokens_seen": 30333664, "step": 52585 }, { "epoch": 7.832886505808758, "grad_norm": 3.4273698329925537, "learning_rate": 6.819227175154117e-06, "loss": 0.5468, "num_input_tokens_seen": 30336640, "step": 52590 }, { "epoch": 7.833631218349717, "grad_norm": 1.9188121557235718, "learning_rate": 6.814767029930055e-06, "loss": 0.6835, "num_input_tokens_seen": 30339456, "step": 52595 }, { "epoch": 7.834375930890676, "grad_norm": 2.1749000549316406, "learning_rate": 6.810308113590111e-06, "loss": 0.6007, "num_input_tokens_seen": 30342432, "step": 52600 }, { "epoch": 7.835120643431635, "grad_norm": 2.8107142448425293, "learning_rate": 6.805850426435581e-06, "loss": 0.5307, "num_input_tokens_seen": 30346432, "step": 52605 }, { "epoch": 7.835865355972595, "grad_norm": 4.28092622756958, "learning_rate": 6.801393968767708e-06, "loss": 0.5699, "num_input_tokens_seen": 30349344, "step": 52610 }, { "epoch": 7.836610068513554, "grad_norm": 4.1448140144348145, "learning_rate": 6.796938740887643e-06, "loss": 0.5533, "num_input_tokens_seen": 30352032, "step": 52615 }, { "epoch": 7.837354781054513, "grad_norm": 1.794266700744629, "learning_rate": 6.792484743096456e-06, "loss": 0.5936, "num_input_tokens_seen": 30354784, "step": 52620 }, { "epoch": 7.838099493595472, "grad_norm": 2.3973710536956787, "learning_rate": 6.788031975695114e-06, "loss": 0.7416, "num_input_tokens_seen": 30358208, "step": 52625 }, { "epoch": 7.838844206136431, "grad_norm": 2.659731149673462, "learning_rate": 6.783580438984527e-06, "loss": 0.4433, "num_input_tokens_seen": 30360928, "step": 52630 }, { "epoch": 7.83958891867739, "grad_norm": 2.359100103378296, "learning_rate": 6.779130133265513e-06, "loss": 0.5629, "num_input_tokens_seen": 30363392, "step": 52635 }, { "epoch": 7.84033363121835, "grad_norm": 8.560465812683105, "learning_rate": 6.774681058838811e-06, "loss": 0.6826, "num_input_tokens_seen": 30366144, "step": 52640 }, { "epoch": 7.841078343759309, "grad_norm": 6.491933822631836, "learning_rate": 6.770233216005056e-06, "loss": 0.581, "num_input_tokens_seen": 30369280, "step": 52645 }, { "epoch": 7.841823056300268, "grad_norm": 2.338742733001709, "learning_rate": 6.76578660506483e-06, "loss": 0.5087, "num_input_tokens_seen": 30372160, "step": 52650 }, { "epoch": 7.842567768841227, "grad_norm": 1.8510905504226685, "learning_rate": 6.7613412263186074e-06, "loss": 0.4499, "num_input_tokens_seen": 30375328, "step": 52655 }, { "epoch": 7.843312481382187, "grad_norm": 3.1212167739868164, "learning_rate": 6.756897080066788e-06, "loss": 0.6022, "num_input_tokens_seen": 30377920, "step": 52660 }, { "epoch": 7.844057193923145, "grad_norm": 5.4785051345825195, "learning_rate": 6.752454166609693e-06, "loss": 0.6197, "num_input_tokens_seen": 30380768, "step": 52665 }, { "epoch": 7.844801906464105, "grad_norm": 2.5060629844665527, "learning_rate": 6.748012486247557e-06, "loss": 0.6059, "num_input_tokens_seen": 30383424, "step": 52670 }, { "epoch": 7.845546619005064, "grad_norm": 2.1594343185424805, "learning_rate": 6.74357203928053e-06, "loss": 0.5481, "num_input_tokens_seen": 30386368, "step": 52675 }, { "epoch": 7.846291331546023, "grad_norm": 2.8092477321624756, "learning_rate": 6.7391328260086845e-06, "loss": 0.463, "num_input_tokens_seen": 30389920, "step": 52680 }, { "epoch": 7.847036044086982, "grad_norm": 2.3850343227386475, "learning_rate": 6.7346948467320036e-06, "loss": 0.5325, "num_input_tokens_seen": 30392640, "step": 52685 }, { "epoch": 7.847780756627942, "grad_norm": 2.475888252258301, "learning_rate": 6.730258101750372e-06, "loss": 0.4904, "num_input_tokens_seen": 30395264, "step": 52690 }, { "epoch": 7.848525469168901, "grad_norm": 1.627838134765625, "learning_rate": 6.725822591363621e-06, "loss": 0.6622, "num_input_tokens_seen": 30398272, "step": 52695 }, { "epoch": 7.84927018170986, "grad_norm": 3.121838092803955, "learning_rate": 6.721388315871482e-06, "loss": 0.7332, "num_input_tokens_seen": 30401184, "step": 52700 }, { "epoch": 7.850014894250819, "grad_norm": 3.3501219749450684, "learning_rate": 6.7169552755736055e-06, "loss": 0.4802, "num_input_tokens_seen": 30403744, "step": 52705 }, { "epoch": 7.850759606791779, "grad_norm": 1.118375301361084, "learning_rate": 6.712523470769555e-06, "loss": 0.6799, "num_input_tokens_seen": 30407008, "step": 52710 }, { "epoch": 7.851504319332737, "grad_norm": 1.8733067512512207, "learning_rate": 6.708092901758828e-06, "loss": 0.7665, "num_input_tokens_seen": 30409920, "step": 52715 }, { "epoch": 7.852249031873697, "grad_norm": 3.462116003036499, "learning_rate": 6.703663568840804e-06, "loss": 0.6179, "num_input_tokens_seen": 30412544, "step": 52720 }, { "epoch": 7.852993744414656, "grad_norm": 1.129101276397705, "learning_rate": 6.699235472314816e-06, "loss": 0.4689, "num_input_tokens_seen": 30415872, "step": 52725 }, { "epoch": 7.8537384569556155, "grad_norm": 2.224679470062256, "learning_rate": 6.694808612480083e-06, "loss": 0.4268, "num_input_tokens_seen": 30419328, "step": 52730 }, { "epoch": 7.854483169496574, "grad_norm": 2.2149910926818848, "learning_rate": 6.6903829896357604e-06, "loss": 0.4905, "num_input_tokens_seen": 30422080, "step": 52735 }, { "epoch": 7.855227882037534, "grad_norm": 2.099724531173706, "learning_rate": 6.6859586040809105e-06, "loss": 0.611, "num_input_tokens_seen": 30424864, "step": 52740 }, { "epoch": 7.855972594578493, "grad_norm": 2.7053136825561523, "learning_rate": 6.681535456114521e-06, "loss": 0.7647, "num_input_tokens_seen": 30427680, "step": 52745 }, { "epoch": 7.856717307119452, "grad_norm": 2.420313835144043, "learning_rate": 6.677113546035496e-06, "loss": 0.82, "num_input_tokens_seen": 30430592, "step": 52750 }, { "epoch": 7.857462019660411, "grad_norm": 2.377751111984253, "learning_rate": 6.672692874142636e-06, "loss": 0.4671, "num_input_tokens_seen": 30433344, "step": 52755 }, { "epoch": 7.858206732201371, "grad_norm": 3.803555727005005, "learning_rate": 6.668273440734676e-06, "loss": 0.6428, "num_input_tokens_seen": 30436128, "step": 52760 }, { "epoch": 7.858951444742329, "grad_norm": 4.73544979095459, "learning_rate": 6.663855246110273e-06, "loss": 0.5889, "num_input_tokens_seen": 30439200, "step": 52765 }, { "epoch": 7.859696157283288, "grad_norm": 1.95316481590271, "learning_rate": 6.659438290567976e-06, "loss": 0.6203, "num_input_tokens_seen": 30442112, "step": 52770 }, { "epoch": 7.860440869824248, "grad_norm": 3.9136626720428467, "learning_rate": 6.655022574406272e-06, "loss": 0.5924, "num_input_tokens_seen": 30444640, "step": 52775 }, { "epoch": 7.8611855823652075, "grad_norm": 1.652640700340271, "learning_rate": 6.650608097923558e-06, "loss": 0.5346, "num_input_tokens_seen": 30447680, "step": 52780 }, { "epoch": 7.861930294906166, "grad_norm": 1.8624236583709717, "learning_rate": 6.64619486141815e-06, "loss": 0.5467, "num_input_tokens_seen": 30450432, "step": 52785 }, { "epoch": 7.862675007447125, "grad_norm": 2.725044012069702, "learning_rate": 6.641782865188267e-06, "loss": 0.6789, "num_input_tokens_seen": 30453248, "step": 52790 }, { "epoch": 7.863419719988085, "grad_norm": 2.5405197143554688, "learning_rate": 6.637372109532061e-06, "loss": 0.6008, "num_input_tokens_seen": 30456064, "step": 52795 }, { "epoch": 7.864164432529043, "grad_norm": 4.1127495765686035, "learning_rate": 6.632962594747588e-06, "loss": 0.5728, "num_input_tokens_seen": 30459136, "step": 52800 }, { "epoch": 7.864909145070003, "grad_norm": 2.606788396835327, "learning_rate": 6.628554321132835e-06, "loss": 0.6353, "num_input_tokens_seen": 30461888, "step": 52805 }, { "epoch": 7.865653857610962, "grad_norm": 4.456817626953125, "learning_rate": 6.624147288985682e-06, "loss": 0.8625, "num_input_tokens_seen": 30464704, "step": 52810 }, { "epoch": 7.8663985701519215, "grad_norm": 1.58241605758667, "learning_rate": 6.619741498603951e-06, "loss": 0.5099, "num_input_tokens_seen": 30467392, "step": 52815 }, { "epoch": 7.86714328269288, "grad_norm": 1.678871512413025, "learning_rate": 6.615336950285356e-06, "loss": 0.5639, "num_input_tokens_seen": 30470400, "step": 52820 }, { "epoch": 7.86788799523384, "grad_norm": 2.8181650638580322, "learning_rate": 6.610933644327541e-06, "loss": 0.5865, "num_input_tokens_seen": 30473248, "step": 52825 }, { "epoch": 7.868632707774799, "grad_norm": 5.304749011993408, "learning_rate": 6.606531581028067e-06, "loss": 0.6274, "num_input_tokens_seen": 30476160, "step": 52830 }, { "epoch": 7.869377420315758, "grad_norm": 2.0246074199676514, "learning_rate": 6.602130760684405e-06, "loss": 0.4477, "num_input_tokens_seen": 30479648, "step": 52835 }, { "epoch": 7.870122132856717, "grad_norm": 2.3063652515411377, "learning_rate": 6.597731183593947e-06, "loss": 0.5872, "num_input_tokens_seen": 30482400, "step": 52840 }, { "epoch": 7.870866845397677, "grad_norm": 1.3508201837539673, "learning_rate": 6.593332850054004e-06, "loss": 0.6161, "num_input_tokens_seen": 30485312, "step": 52845 }, { "epoch": 7.871611557938635, "grad_norm": 2.5159244537353516, "learning_rate": 6.588935760361789e-06, "loss": 0.5533, "num_input_tokens_seen": 30488448, "step": 52850 }, { "epoch": 7.872356270479595, "grad_norm": 1.921453595161438, "learning_rate": 6.584539914814439e-06, "loss": 0.5936, "num_input_tokens_seen": 30491456, "step": 52855 }, { "epoch": 7.873100983020554, "grad_norm": 2.113680362701416, "learning_rate": 6.580145313709005e-06, "loss": 0.7292, "num_input_tokens_seen": 30494432, "step": 52860 }, { "epoch": 7.8738456955615135, "grad_norm": 1.5017778873443604, "learning_rate": 6.575751957342463e-06, "loss": 0.6476, "num_input_tokens_seen": 30497504, "step": 52865 }, { "epoch": 7.874590408102472, "grad_norm": 3.3741652965545654, "learning_rate": 6.571359846011696e-06, "loss": 0.5444, "num_input_tokens_seen": 30500320, "step": 52870 }, { "epoch": 7.875335120643432, "grad_norm": 3.5304760932922363, "learning_rate": 6.566968980013505e-06, "loss": 0.6009, "num_input_tokens_seen": 30503264, "step": 52875 }, { "epoch": 7.876079833184391, "grad_norm": 2.2243032455444336, "learning_rate": 6.5625793596446165e-06, "loss": 0.7071, "num_input_tokens_seen": 30506368, "step": 52880 }, { "epoch": 7.87682454572535, "grad_norm": 2.2190864086151123, "learning_rate": 6.558190985201651e-06, "loss": 0.906, "num_input_tokens_seen": 30509248, "step": 52885 }, { "epoch": 7.877569258266309, "grad_norm": 1.8703557252883911, "learning_rate": 6.553803856981152e-06, "loss": 0.3964, "num_input_tokens_seen": 30511904, "step": 52890 }, { "epoch": 7.878313970807269, "grad_norm": 1.8367958068847656, "learning_rate": 6.549417975279595e-06, "loss": 0.6256, "num_input_tokens_seen": 30514816, "step": 52895 }, { "epoch": 7.8790586833482275, "grad_norm": 3.416811943054199, "learning_rate": 6.545033340393356e-06, "loss": 0.6359, "num_input_tokens_seen": 30517728, "step": 52900 }, { "epoch": 7.879803395889187, "grad_norm": 2.8503239154815674, "learning_rate": 6.540649952618727e-06, "loss": 0.6542, "num_input_tokens_seen": 30520512, "step": 52905 }, { "epoch": 7.880548108430146, "grad_norm": 7.723630428314209, "learning_rate": 6.536267812251928e-06, "loss": 0.496, "num_input_tokens_seen": 30523360, "step": 52910 }, { "epoch": 7.8812928209711055, "grad_norm": 4.69890022277832, "learning_rate": 6.531886919589089e-06, "loss": 0.6842, "num_input_tokens_seen": 30526368, "step": 52915 }, { "epoch": 7.882037533512064, "grad_norm": 2.0337884426116943, "learning_rate": 6.5275072749262395e-06, "loss": 0.5317, "num_input_tokens_seen": 30529600, "step": 52920 }, { "epoch": 7.882782246053024, "grad_norm": 3.600677490234375, "learning_rate": 6.523128878559351e-06, "loss": 0.5144, "num_input_tokens_seen": 30532640, "step": 52925 }, { "epoch": 7.883526958593983, "grad_norm": 2.0807907581329346, "learning_rate": 6.518751730784284e-06, "loss": 0.4925, "num_input_tokens_seen": 30535488, "step": 52930 }, { "epoch": 7.884271671134941, "grad_norm": 5.1314520835876465, "learning_rate": 6.514375831896835e-06, "loss": 0.6013, "num_input_tokens_seen": 30538592, "step": 52935 }, { "epoch": 7.885016383675901, "grad_norm": 4.423794746398926, "learning_rate": 6.51000118219271e-06, "loss": 0.5937, "num_input_tokens_seen": 30541696, "step": 52940 }, { "epoch": 7.885761096216861, "grad_norm": 4.613557815551758, "learning_rate": 6.505627781967533e-06, "loss": 0.8564, "num_input_tokens_seen": 30544416, "step": 52945 }, { "epoch": 7.8865058087578195, "grad_norm": 2.9008402824401855, "learning_rate": 6.501255631516842e-06, "loss": 0.6254, "num_input_tokens_seen": 30547040, "step": 52950 }, { "epoch": 7.887250521298778, "grad_norm": 2.0034008026123047, "learning_rate": 6.4968847311360794e-06, "loss": 0.4522, "num_input_tokens_seen": 30549760, "step": 52955 }, { "epoch": 7.887995233839738, "grad_norm": 3.7491140365600586, "learning_rate": 6.4925150811206176e-06, "loss": 0.4143, "num_input_tokens_seen": 30552384, "step": 52960 }, { "epoch": 7.8887399463806975, "grad_norm": 3.2754135131835938, "learning_rate": 6.48814668176575e-06, "loss": 0.4251, "num_input_tokens_seen": 30555136, "step": 52965 }, { "epoch": 7.889484658921656, "grad_norm": 3.434126853942871, "learning_rate": 6.483779533366654e-06, "loss": 0.6625, "num_input_tokens_seen": 30558496, "step": 52970 }, { "epoch": 7.890229371462615, "grad_norm": 2.140413999557495, "learning_rate": 6.479413636218459e-06, "loss": 0.7044, "num_input_tokens_seen": 30561376, "step": 52975 }, { "epoch": 7.890974084003575, "grad_norm": 1.7316657304763794, "learning_rate": 6.4750489906162e-06, "loss": 0.5637, "num_input_tokens_seen": 30564128, "step": 52980 }, { "epoch": 7.8917187965445335, "grad_norm": 2.994929790496826, "learning_rate": 6.470685596854803e-06, "loss": 0.4889, "num_input_tokens_seen": 30567008, "step": 52985 }, { "epoch": 7.892463509085493, "grad_norm": 1.9207651615142822, "learning_rate": 6.46632345522914e-06, "loss": 0.7069, "num_input_tokens_seen": 30569856, "step": 52990 }, { "epoch": 7.893208221626452, "grad_norm": 3.8640544414520264, "learning_rate": 6.461962566033986e-06, "loss": 0.6665, "num_input_tokens_seen": 30572512, "step": 52995 }, { "epoch": 7.8939529341674115, "grad_norm": 4.424959182739258, "learning_rate": 6.45760292956403e-06, "loss": 0.7749, "num_input_tokens_seen": 30575328, "step": 53000 }, { "epoch": 7.89469764670837, "grad_norm": 1.8978304862976074, "learning_rate": 6.45324454611389e-06, "loss": 0.6423, "num_input_tokens_seen": 30578144, "step": 53005 }, { "epoch": 7.89544235924933, "grad_norm": 4.04123067855835, "learning_rate": 6.448887415978069e-06, "loss": 0.4905, "num_input_tokens_seen": 30581184, "step": 53010 }, { "epoch": 7.896187071790289, "grad_norm": 3.4399070739746094, "learning_rate": 6.4445315394510205e-06, "loss": 0.5688, "num_input_tokens_seen": 30584160, "step": 53015 }, { "epoch": 7.896931784331248, "grad_norm": 2.312364101409912, "learning_rate": 6.440176916827081e-06, "loss": 0.4215, "num_input_tokens_seen": 30586880, "step": 53020 }, { "epoch": 7.897676496872207, "grad_norm": 2.7609355449676514, "learning_rate": 6.435823548400529e-06, "loss": 0.6718, "num_input_tokens_seen": 30589472, "step": 53025 }, { "epoch": 7.898421209413167, "grad_norm": 1.7439463138580322, "learning_rate": 6.431471434465544e-06, "loss": 0.5685, "num_input_tokens_seen": 30592512, "step": 53030 }, { "epoch": 7.8991659219541255, "grad_norm": 2.5380046367645264, "learning_rate": 6.427120575316226e-06, "loss": 0.5189, "num_input_tokens_seen": 30595328, "step": 53035 }, { "epoch": 7.899910634495085, "grad_norm": 2.5651957988739014, "learning_rate": 6.422770971246586e-06, "loss": 0.4735, "num_input_tokens_seen": 30598336, "step": 53040 }, { "epoch": 7.900655347036044, "grad_norm": 1.6556382179260254, "learning_rate": 6.4184226225505625e-06, "loss": 0.6406, "num_input_tokens_seen": 30601184, "step": 53045 }, { "epoch": 7.9014000595770035, "grad_norm": 4.555236339569092, "learning_rate": 6.414075529521993e-06, "loss": 0.6057, "num_input_tokens_seen": 30604128, "step": 53050 }, { "epoch": 7.902144772117962, "grad_norm": 3.422252893447876, "learning_rate": 6.409729692454625e-06, "loss": 0.4882, "num_input_tokens_seen": 30606880, "step": 53055 }, { "epoch": 7.902889484658922, "grad_norm": 2.8479156494140625, "learning_rate": 6.4053851116421395e-06, "loss": 0.6271, "num_input_tokens_seen": 30609888, "step": 53060 }, { "epoch": 7.903634197199881, "grad_norm": 2.5547189712524414, "learning_rate": 6.401041787378131e-06, "loss": 0.6327, "num_input_tokens_seen": 30612768, "step": 53065 }, { "epoch": 7.90437890974084, "grad_norm": 3.0610926151275635, "learning_rate": 6.396699719956101e-06, "loss": 0.5837, "num_input_tokens_seen": 30615584, "step": 53070 }, { "epoch": 7.905123622281799, "grad_norm": 3.976696729660034, "learning_rate": 6.3923589096694685e-06, "loss": 0.5292, "num_input_tokens_seen": 30618464, "step": 53075 }, { "epoch": 7.905868334822759, "grad_norm": 4.927298069000244, "learning_rate": 6.388019356811573e-06, "loss": 0.5172, "num_input_tokens_seen": 30621536, "step": 53080 }, { "epoch": 7.9066130473637175, "grad_norm": 2.877657890319824, "learning_rate": 6.3836810616756614e-06, "loss": 0.5975, "num_input_tokens_seen": 30624448, "step": 53085 }, { "epoch": 7.907357759904677, "grad_norm": 2.431121587753296, "learning_rate": 6.379344024554884e-06, "loss": 0.6962, "num_input_tokens_seen": 30627424, "step": 53090 }, { "epoch": 7.908102472445636, "grad_norm": 5.073919296264648, "learning_rate": 6.375008245742334e-06, "loss": 0.5409, "num_input_tokens_seen": 30630080, "step": 53095 }, { "epoch": 7.908847184986596, "grad_norm": 2.1087021827697754, "learning_rate": 6.370673725531004e-06, "loss": 0.6701, "num_input_tokens_seen": 30632928, "step": 53100 }, { "epoch": 7.909591897527554, "grad_norm": 2.916146993637085, "learning_rate": 6.366340464213799e-06, "loss": 0.5779, "num_input_tokens_seen": 30635712, "step": 53105 }, { "epoch": 7.910336610068514, "grad_norm": 3.31435489654541, "learning_rate": 6.3620084620835494e-06, "loss": 0.5654, "num_input_tokens_seen": 30638528, "step": 53110 }, { "epoch": 7.911081322609473, "grad_norm": 1.7099857330322266, "learning_rate": 6.357677719432998e-06, "loss": 0.5344, "num_input_tokens_seen": 30641408, "step": 53115 }, { "epoch": 7.9118260351504315, "grad_norm": 2.1045262813568115, "learning_rate": 6.353348236554784e-06, "loss": 0.503, "num_input_tokens_seen": 30644320, "step": 53120 }, { "epoch": 7.912570747691391, "grad_norm": 2.2848012447357178, "learning_rate": 6.349020013741491e-06, "loss": 0.4549, "num_input_tokens_seen": 30647200, "step": 53125 }, { "epoch": 7.913315460232351, "grad_norm": 3.026643753051758, "learning_rate": 6.3446930512855914e-06, "loss": 0.6922, "num_input_tokens_seen": 30649984, "step": 53130 }, { "epoch": 7.9140601727733095, "grad_norm": 1.2847363948822021, "learning_rate": 6.340367349479487e-06, "loss": 0.6056, "num_input_tokens_seen": 30652896, "step": 53135 }, { "epoch": 7.914804885314268, "grad_norm": 4.278501987457275, "learning_rate": 6.336042908615492e-06, "loss": 0.8286, "num_input_tokens_seen": 30655776, "step": 53140 }, { "epoch": 7.915549597855228, "grad_norm": 2.113337516784668, "learning_rate": 6.331719728985844e-06, "loss": 0.5461, "num_input_tokens_seen": 30658560, "step": 53145 }, { "epoch": 7.916294310396187, "grad_norm": 3.3815274238586426, "learning_rate": 6.3273978108826685e-06, "loss": 0.6571, "num_input_tokens_seen": 30661344, "step": 53150 }, { "epoch": 7.917039022937146, "grad_norm": 2.9062373638153076, "learning_rate": 6.323077154598031e-06, "loss": 0.7362, "num_input_tokens_seen": 30663968, "step": 53155 }, { "epoch": 7.917783735478105, "grad_norm": 2.776681661605835, "learning_rate": 6.3187577604239074e-06, "loss": 0.5475, "num_input_tokens_seen": 30666880, "step": 53160 }, { "epoch": 7.918528448019065, "grad_norm": 3.866431474685669, "learning_rate": 6.314439628652186e-06, "loss": 0.7011, "num_input_tokens_seen": 30669632, "step": 53165 }, { "epoch": 7.9192731605600235, "grad_norm": 2.457171678543091, "learning_rate": 6.31012275957466e-06, "loss": 0.5739, "num_input_tokens_seen": 30672768, "step": 53170 }, { "epoch": 7.920017873100983, "grad_norm": 3.476484775543213, "learning_rate": 6.305807153483048e-06, "loss": 0.6929, "num_input_tokens_seen": 30675648, "step": 53175 }, { "epoch": 7.920762585641942, "grad_norm": 3.5001800060272217, "learning_rate": 6.3014928106689905e-06, "loss": 0.5937, "num_input_tokens_seen": 30678528, "step": 53180 }, { "epoch": 7.921507298182902, "grad_norm": 3.3731212615966797, "learning_rate": 6.297179731424022e-06, "loss": 0.5068, "num_input_tokens_seen": 30681184, "step": 53185 }, { "epoch": 7.92225201072386, "grad_norm": 2.4146668910980225, "learning_rate": 6.292867916039605e-06, "loss": 0.7021, "num_input_tokens_seen": 30684192, "step": 53190 }, { "epoch": 7.92299672326482, "grad_norm": 2.5078821182250977, "learning_rate": 6.288557364807118e-06, "loss": 0.6104, "num_input_tokens_seen": 30687424, "step": 53195 }, { "epoch": 7.923741435805779, "grad_norm": 2.2200443744659424, "learning_rate": 6.284248078017846e-06, "loss": 0.5132, "num_input_tokens_seen": 30690240, "step": 53200 }, { "epoch": 7.924486148346738, "grad_norm": 2.196931838989258, "learning_rate": 6.279940055963007e-06, "loss": 0.4907, "num_input_tokens_seen": 30693088, "step": 53205 }, { "epoch": 7.925230860887697, "grad_norm": 1.3683456182479858, "learning_rate": 6.2756332989337005e-06, "loss": 0.6259, "num_input_tokens_seen": 30695840, "step": 53210 }, { "epoch": 7.925975573428657, "grad_norm": 2.745424509048462, "learning_rate": 6.271327807220975e-06, "loss": 0.4947, "num_input_tokens_seen": 30698720, "step": 53215 }, { "epoch": 7.9267202859696155, "grad_norm": 3.521904706954956, "learning_rate": 6.267023581115763e-06, "loss": 0.6123, "num_input_tokens_seen": 30701696, "step": 53220 }, { "epoch": 7.927464998510575, "grad_norm": 1.2985457181930542, "learning_rate": 6.262720620908935e-06, "loss": 0.5432, "num_input_tokens_seen": 30704480, "step": 53225 }, { "epoch": 7.928209711051534, "grad_norm": 1.3368890285491943, "learning_rate": 6.258418926891269e-06, "loss": 0.5563, "num_input_tokens_seen": 30707392, "step": 53230 }, { "epoch": 7.928954423592494, "grad_norm": 2.478097677230835, "learning_rate": 6.254118499353451e-06, "loss": 0.5472, "num_input_tokens_seen": 30710304, "step": 53235 }, { "epoch": 7.929699136133452, "grad_norm": 4.211738586425781, "learning_rate": 6.249819338586091e-06, "loss": 0.5907, "num_input_tokens_seen": 30713184, "step": 53240 }, { "epoch": 7.930443848674412, "grad_norm": 6.86321496963501, "learning_rate": 6.245521444879715e-06, "loss": 0.5045, "num_input_tokens_seen": 30716128, "step": 53245 }, { "epoch": 7.931188561215371, "grad_norm": 2.0016067028045654, "learning_rate": 6.241224818524749e-06, "loss": 0.6705, "num_input_tokens_seen": 30718912, "step": 53250 }, { "epoch": 7.93193327375633, "grad_norm": 1.4869580268859863, "learning_rate": 6.236929459811536e-06, "loss": 0.6079, "num_input_tokens_seen": 30721856, "step": 53255 }, { "epoch": 7.932677986297289, "grad_norm": 1.4902064800262451, "learning_rate": 6.232635369030346e-06, "loss": 0.3373, "num_input_tokens_seen": 30724960, "step": 53260 }, { "epoch": 7.933422698838249, "grad_norm": 3.0734121799468994, "learning_rate": 6.228342546471353e-06, "loss": 0.5301, "num_input_tokens_seen": 30727808, "step": 53265 }, { "epoch": 7.934167411379208, "grad_norm": 2.0148022174835205, "learning_rate": 6.224050992424652e-06, "loss": 0.474, "num_input_tokens_seen": 30730880, "step": 53270 }, { "epoch": 7.934912123920167, "grad_norm": 3.2161362171173096, "learning_rate": 6.219760707180244e-06, "loss": 0.5926, "num_input_tokens_seen": 30733632, "step": 53275 }, { "epoch": 7.935656836461126, "grad_norm": 2.698892831802368, "learning_rate": 6.215471691028063e-06, "loss": 0.5559, "num_input_tokens_seen": 30736448, "step": 53280 }, { "epoch": 7.936401549002085, "grad_norm": 3.196795701980591, "learning_rate": 6.2111839442579335e-06, "loss": 0.7726, "num_input_tokens_seen": 30739392, "step": 53285 }, { "epoch": 7.937146261543044, "grad_norm": 1.6810250282287598, "learning_rate": 6.206897467159595e-06, "loss": 0.4571, "num_input_tokens_seen": 30742720, "step": 53290 }, { "epoch": 7.937890974084004, "grad_norm": 6.939177513122559, "learning_rate": 6.20261226002272e-06, "loss": 0.673, "num_input_tokens_seen": 30745344, "step": 53295 }, { "epoch": 7.938635686624963, "grad_norm": 2.2189018726348877, "learning_rate": 6.198328323136881e-06, "loss": 0.66, "num_input_tokens_seen": 30748128, "step": 53300 }, { "epoch": 7.9393803991659215, "grad_norm": 2.4092018604278564, "learning_rate": 6.1940456567915725e-06, "loss": 0.5826, "num_input_tokens_seen": 30751264, "step": 53305 }, { "epoch": 7.940125111706881, "grad_norm": 3.598306894302368, "learning_rate": 6.189764261276207e-06, "loss": 0.5417, "num_input_tokens_seen": 30754016, "step": 53310 }, { "epoch": 7.940869824247841, "grad_norm": 1.1655694246292114, "learning_rate": 6.185484136880088e-06, "loss": 0.5515, "num_input_tokens_seen": 30756800, "step": 53315 }, { "epoch": 7.9416145367888, "grad_norm": 2.319394588470459, "learning_rate": 6.181205283892458e-06, "loss": 0.8071, "num_input_tokens_seen": 30759616, "step": 53320 }, { "epoch": 7.942359249329758, "grad_norm": 2.7027270793914795, "learning_rate": 6.1769277026024615e-06, "loss": 0.7969, "num_input_tokens_seen": 30762496, "step": 53325 }, { "epoch": 7.943103961870718, "grad_norm": 1.6780691146850586, "learning_rate": 6.1726513932991724e-06, "loss": 0.688, "num_input_tokens_seen": 30765344, "step": 53330 }, { "epoch": 7.943848674411677, "grad_norm": 2.1292724609375, "learning_rate": 6.168376356271546e-06, "loss": 0.4047, "num_input_tokens_seen": 30768160, "step": 53335 }, { "epoch": 7.944593386952636, "grad_norm": 1.4805583953857422, "learning_rate": 6.1641025918084825e-06, "loss": 0.5841, "num_input_tokens_seen": 30771328, "step": 53340 }, { "epoch": 7.945338099493595, "grad_norm": 3.043705463409424, "learning_rate": 6.15983010019879e-06, "loss": 0.7003, "num_input_tokens_seen": 30774080, "step": 53345 }, { "epoch": 7.946082812034555, "grad_norm": 2.011563539505005, "learning_rate": 6.155558881731174e-06, "loss": 0.7816, "num_input_tokens_seen": 30776768, "step": 53350 }, { "epoch": 7.946827524575514, "grad_norm": 3.0905258655548096, "learning_rate": 6.151288936694274e-06, "loss": 0.6341, "num_input_tokens_seen": 30779520, "step": 53355 }, { "epoch": 7.947572237116473, "grad_norm": 4.97446346282959, "learning_rate": 6.147020265376635e-06, "loss": 0.6746, "num_input_tokens_seen": 30782464, "step": 53360 }, { "epoch": 7.948316949657432, "grad_norm": 2.3698387145996094, "learning_rate": 6.1427528680667144e-06, "loss": 0.6144, "num_input_tokens_seen": 30785248, "step": 53365 }, { "epoch": 7.949061662198392, "grad_norm": 0.3248201906681061, "learning_rate": 6.138486745052896e-06, "loss": 0.3493, "num_input_tokens_seen": 30788256, "step": 53370 }, { "epoch": 7.94980637473935, "grad_norm": 1.6518100500106812, "learning_rate": 6.134221896623449e-06, "loss": 0.5003, "num_input_tokens_seen": 30791168, "step": 53375 }, { "epoch": 7.95055108728031, "grad_norm": 2.304504156112671, "learning_rate": 6.129958323066592e-06, "loss": 0.4869, "num_input_tokens_seen": 30794304, "step": 53380 }, { "epoch": 7.951295799821269, "grad_norm": 2.230302572250366, "learning_rate": 6.1256960246704245e-06, "loss": 0.6933, "num_input_tokens_seen": 30797216, "step": 53385 }, { "epoch": 7.952040512362228, "grad_norm": 3.5098397731781006, "learning_rate": 6.1214350017229805e-06, "loss": 0.6029, "num_input_tokens_seen": 30800064, "step": 53390 }, { "epoch": 7.952785224903187, "grad_norm": 3.768369197845459, "learning_rate": 6.117175254512206e-06, "loss": 0.5634, "num_input_tokens_seen": 30803008, "step": 53395 }, { "epoch": 7.953529937444147, "grad_norm": 2.5532591342926025, "learning_rate": 6.1129167833259535e-06, "loss": 0.4744, "num_input_tokens_seen": 30805888, "step": 53400 }, { "epoch": 7.954274649985106, "grad_norm": 2.375537157058716, "learning_rate": 6.108659588451998e-06, "loss": 0.415, "num_input_tokens_seen": 30809024, "step": 53405 }, { "epoch": 7.955019362526065, "grad_norm": 7.608747482299805, "learning_rate": 6.104403670178027e-06, "loss": 0.6402, "num_input_tokens_seen": 30811936, "step": 53410 }, { "epoch": 7.955764075067024, "grad_norm": 1.6758248805999756, "learning_rate": 6.1001490287916326e-06, "loss": 0.5368, "num_input_tokens_seen": 30814784, "step": 53415 }, { "epoch": 7.956508787607984, "grad_norm": 2.344456434249878, "learning_rate": 6.095895664580317e-06, "loss": 0.618, "num_input_tokens_seen": 30817504, "step": 53420 }, { "epoch": 7.957253500148942, "grad_norm": 2.726428270339966, "learning_rate": 6.0916435778315156e-06, "loss": 0.591, "num_input_tokens_seen": 30820512, "step": 53425 }, { "epoch": 7.957998212689902, "grad_norm": 4.588958740234375, "learning_rate": 6.087392768832567e-06, "loss": 0.6595, "num_input_tokens_seen": 30823264, "step": 53430 }, { "epoch": 7.958742925230861, "grad_norm": 2.797883987426758, "learning_rate": 6.08314323787072e-06, "loss": 0.7787, "num_input_tokens_seen": 30826112, "step": 53435 }, { "epoch": 7.9594876377718204, "grad_norm": 2.015207052230835, "learning_rate": 6.078894985233141e-06, "loss": 0.5252, "num_input_tokens_seen": 30829216, "step": 53440 }, { "epoch": 7.960232350312779, "grad_norm": 2.789557933807373, "learning_rate": 6.074648011206921e-06, "loss": 0.5929, "num_input_tokens_seen": 30831840, "step": 53445 }, { "epoch": 7.960977062853738, "grad_norm": 1.719215989112854, "learning_rate": 6.070402316079043e-06, "loss": 0.5804, "num_input_tokens_seen": 30834656, "step": 53450 }, { "epoch": 7.961721775394698, "grad_norm": 2.216826915740967, "learning_rate": 6.066157900136407e-06, "loss": 0.5452, "num_input_tokens_seen": 30837280, "step": 53455 }, { "epoch": 7.962466487935657, "grad_norm": 2.823962688446045, "learning_rate": 6.0619147636658405e-06, "loss": 0.6864, "num_input_tokens_seen": 30840288, "step": 53460 }, { "epoch": 7.963211200476616, "grad_norm": 2.5460846424102783, "learning_rate": 6.057672906954076e-06, "loss": 0.4397, "num_input_tokens_seen": 30843328, "step": 53465 }, { "epoch": 7.963955913017575, "grad_norm": 3.1780762672424316, "learning_rate": 6.053432330287765e-06, "loss": 0.6216, "num_input_tokens_seen": 30846592, "step": 53470 }, { "epoch": 7.964700625558534, "grad_norm": 4.836601257324219, "learning_rate": 6.049193033953474e-06, "loss": 0.5232, "num_input_tokens_seen": 30849536, "step": 53475 }, { "epoch": 7.965445338099494, "grad_norm": 2.652724027633667, "learning_rate": 6.044955018237661e-06, "loss": 0.553, "num_input_tokens_seen": 30852992, "step": 53480 }, { "epoch": 7.966190050640453, "grad_norm": 2.73180890083313, "learning_rate": 6.040718283426722e-06, "loss": 0.6842, "num_input_tokens_seen": 30855872, "step": 53485 }, { "epoch": 7.966934763181412, "grad_norm": 4.502559661865234, "learning_rate": 6.036482829806964e-06, "loss": 0.524, "num_input_tokens_seen": 30858624, "step": 53490 }, { "epoch": 7.967679475722371, "grad_norm": 2.547858715057373, "learning_rate": 6.032248657664591e-06, "loss": 0.6769, "num_input_tokens_seen": 30861760, "step": 53495 }, { "epoch": 7.96842418826333, "grad_norm": 4.462616443634033, "learning_rate": 6.028015767285735e-06, "loss": 0.6112, "num_input_tokens_seen": 30864896, "step": 53500 }, { "epoch": 7.96916890080429, "grad_norm": 2.9918031692504883, "learning_rate": 6.023784158956442e-06, "loss": 0.6018, "num_input_tokens_seen": 30867712, "step": 53505 }, { "epoch": 7.969913613345248, "grad_norm": 3.8764545917510986, "learning_rate": 6.019553832962668e-06, "loss": 0.5795, "num_input_tokens_seen": 30870336, "step": 53510 }, { "epoch": 7.970658325886208, "grad_norm": 4.904480934143066, "learning_rate": 6.015324789590271e-06, "loss": 0.5947, "num_input_tokens_seen": 30873760, "step": 53515 }, { "epoch": 7.971403038427167, "grad_norm": 3.1716928482055664, "learning_rate": 6.01109702912504e-06, "loss": 0.4473, "num_input_tokens_seen": 30876480, "step": 53520 }, { "epoch": 7.9721477509681264, "grad_norm": 1.7340941429138184, "learning_rate": 6.006870551852667e-06, "loss": 0.632, "num_input_tokens_seen": 30879104, "step": 53525 }, { "epoch": 7.972892463509085, "grad_norm": 1.7676684856414795, "learning_rate": 6.00264535805877e-06, "loss": 0.6247, "num_input_tokens_seen": 30881984, "step": 53530 }, { "epoch": 7.973637176050045, "grad_norm": 2.459319829940796, "learning_rate": 5.998421448028854e-06, "loss": 0.6227, "num_input_tokens_seen": 30885024, "step": 53535 }, { "epoch": 7.974381888591004, "grad_norm": 2.084874153137207, "learning_rate": 5.994198822048361e-06, "loss": 0.6268, "num_input_tokens_seen": 30888096, "step": 53540 }, { "epoch": 7.975126601131963, "grad_norm": 4.2560038566589355, "learning_rate": 5.989977480402648e-06, "loss": 0.5941, "num_input_tokens_seen": 30890880, "step": 53545 }, { "epoch": 7.975871313672922, "grad_norm": 2.840611457824707, "learning_rate": 5.985757423376962e-06, "loss": 0.5851, "num_input_tokens_seen": 30893568, "step": 53550 }, { "epoch": 7.976616026213882, "grad_norm": 2.7004756927490234, "learning_rate": 5.98153865125648e-06, "loss": 0.6606, "num_input_tokens_seen": 30896384, "step": 53555 }, { "epoch": 7.97736073875484, "grad_norm": 6.563663005828857, "learning_rate": 5.977321164326294e-06, "loss": 0.6937, "num_input_tokens_seen": 30899328, "step": 53560 }, { "epoch": 7.9781054512958, "grad_norm": 1.8502839803695679, "learning_rate": 5.973104962871403e-06, "loss": 0.4847, "num_input_tokens_seen": 30902176, "step": 53565 }, { "epoch": 7.978850163836759, "grad_norm": 3.013394355773926, "learning_rate": 5.968890047176728e-06, "loss": 0.4366, "num_input_tokens_seen": 30904800, "step": 53570 }, { "epoch": 7.9795948763777185, "grad_norm": 2.4181671142578125, "learning_rate": 5.964676417527082e-06, "loss": 0.6769, "num_input_tokens_seen": 30907872, "step": 53575 }, { "epoch": 7.980339588918677, "grad_norm": 1.8844884634017944, "learning_rate": 5.960464074207217e-06, "loss": 0.4885, "num_input_tokens_seen": 30910464, "step": 53580 }, { "epoch": 7.981084301459637, "grad_norm": 2.8754584789276123, "learning_rate": 5.956253017501776e-06, "loss": 0.7113, "num_input_tokens_seen": 30913312, "step": 53585 }, { "epoch": 7.981829014000596, "grad_norm": 3.3837363719940186, "learning_rate": 5.95204324769533e-06, "loss": 0.3792, "num_input_tokens_seen": 30916160, "step": 53590 }, { "epoch": 7.982573726541555, "grad_norm": 3.2177248001098633, "learning_rate": 5.947834765072355e-06, "loss": 0.6306, "num_input_tokens_seen": 30918688, "step": 53595 }, { "epoch": 7.983318439082514, "grad_norm": 2.463728666305542, "learning_rate": 5.943627569917248e-06, "loss": 0.5597, "num_input_tokens_seen": 30921824, "step": 53600 }, { "epoch": 7.984063151623474, "grad_norm": 3.1243109703063965, "learning_rate": 5.939421662514314e-06, "loss": 0.5908, "num_input_tokens_seen": 30924704, "step": 53605 }, { "epoch": 7.9848078641644324, "grad_norm": 3.3420886993408203, "learning_rate": 5.9352170431477755e-06, "loss": 0.5726, "num_input_tokens_seen": 30927552, "step": 53610 }, { "epoch": 7.985552576705392, "grad_norm": 2.795032262802124, "learning_rate": 5.931013712101754e-06, "loss": 0.6028, "num_input_tokens_seen": 30930432, "step": 53615 }, { "epoch": 7.986297289246351, "grad_norm": 1.6347788572311401, "learning_rate": 5.926811669660296e-06, "loss": 0.5506, "num_input_tokens_seen": 30933376, "step": 53620 }, { "epoch": 7.9870420017873105, "grad_norm": 3.2972679138183594, "learning_rate": 5.922610916107355e-06, "loss": 0.537, "num_input_tokens_seen": 30936096, "step": 53625 }, { "epoch": 7.987786714328269, "grad_norm": 3.754782199859619, "learning_rate": 5.918411451726804e-06, "loss": 0.5757, "num_input_tokens_seen": 30938816, "step": 53630 }, { "epoch": 7.988531426869228, "grad_norm": 2.727484941482544, "learning_rate": 5.91421327680243e-06, "loss": 0.499, "num_input_tokens_seen": 30941600, "step": 53635 }, { "epoch": 7.989276139410188, "grad_norm": 1.1423985958099365, "learning_rate": 5.910016391617934e-06, "loss": 0.5684, "num_input_tokens_seen": 30944608, "step": 53640 }, { "epoch": 7.990020851951147, "grad_norm": 2.1290359497070312, "learning_rate": 5.905820796456906e-06, "loss": 0.4711, "num_input_tokens_seen": 30947424, "step": 53645 }, { "epoch": 7.990765564492106, "grad_norm": 2.7461416721343994, "learning_rate": 5.901626491602885e-06, "loss": 0.5031, "num_input_tokens_seen": 30950176, "step": 53650 }, { "epoch": 7.991510277033065, "grad_norm": 2.9161274433135986, "learning_rate": 5.8974334773392924e-06, "loss": 0.5209, "num_input_tokens_seen": 30953216, "step": 53655 }, { "epoch": 7.9922549895740245, "grad_norm": 1.9220836162567139, "learning_rate": 5.893241753949477e-06, "loss": 0.5345, "num_input_tokens_seen": 30956000, "step": 53660 }, { "epoch": 7.992999702114983, "grad_norm": 2.8793113231658936, "learning_rate": 5.889051321716702e-06, "loss": 0.5994, "num_input_tokens_seen": 30958944, "step": 53665 }, { "epoch": 7.993744414655943, "grad_norm": 1.8837312459945679, "learning_rate": 5.8848621809241415e-06, "loss": 0.4604, "num_input_tokens_seen": 30961984, "step": 53670 }, { "epoch": 7.994489127196902, "grad_norm": 3.1650390625, "learning_rate": 5.880674331854882e-06, "loss": 0.5957, "num_input_tokens_seen": 30964928, "step": 53675 }, { "epoch": 7.995233839737861, "grad_norm": 3.537560224533081, "learning_rate": 5.876487774791914e-06, "loss": 0.5825, "num_input_tokens_seen": 30967552, "step": 53680 }, { "epoch": 7.99597855227882, "grad_norm": 2.445801019668579, "learning_rate": 5.872302510018149e-06, "loss": 0.53, "num_input_tokens_seen": 30970528, "step": 53685 }, { "epoch": 7.99672326481978, "grad_norm": 1.4600818157196045, "learning_rate": 5.8681185378164224e-06, "loss": 0.6082, "num_input_tokens_seen": 30973600, "step": 53690 }, { "epoch": 7.9974679773607384, "grad_norm": 2.1270883083343506, "learning_rate": 5.863935858469452e-06, "loss": 0.5779, "num_input_tokens_seen": 30976320, "step": 53695 }, { "epoch": 7.998212689901698, "grad_norm": 1.6212648153305054, "learning_rate": 5.859754472259893e-06, "loss": 0.5796, "num_input_tokens_seen": 30978880, "step": 53700 }, { "epoch": 7.998957402442657, "grad_norm": 8.55522632598877, "learning_rate": 5.855574379470311e-06, "loss": 0.7507, "num_input_tokens_seen": 30981632, "step": 53705 }, { "epoch": 7.9997021149836165, "grad_norm": 2.434513807296753, "learning_rate": 5.851395580383182e-06, "loss": 0.5107, "num_input_tokens_seen": 30984608, "step": 53710 }, { "epoch": 8.0, "eval_loss": 0.6710728406906128, "eval_runtime": 74.3299, "eval_samples_per_second": 40.145, "eval_steps_per_second": 10.036, "num_input_tokens_seen": 30985288, "step": 53712 }, { "epoch": 8.000446827524575, "grad_norm": 2.5691070556640625, "learning_rate": 5.8472180752808805e-06, "loss": 0.4861, "num_input_tokens_seen": 30986824, "step": 53715 }, { "epoch": 8.001191540065534, "grad_norm": 1.3252638578414917, "learning_rate": 5.843041864445714e-06, "loss": 0.6092, "num_input_tokens_seen": 30989800, "step": 53720 }, { "epoch": 8.001936252606495, "grad_norm": 2.2542130947113037, "learning_rate": 5.838866948159888e-06, "loss": 0.4512, "num_input_tokens_seen": 30992520, "step": 53725 }, { "epoch": 8.002680965147453, "grad_norm": 3.352003812789917, "learning_rate": 5.8346933267055394e-06, "loss": 0.5141, "num_input_tokens_seen": 30995336, "step": 53730 }, { "epoch": 8.003425677688412, "grad_norm": 2.607163906097412, "learning_rate": 5.830521000364689e-06, "loss": 0.6592, "num_input_tokens_seen": 30998312, "step": 53735 }, { "epoch": 8.00417039022937, "grad_norm": 3.8824431896209717, "learning_rate": 5.826349969419292e-06, "loss": 0.7474, "num_input_tokens_seen": 31001480, "step": 53740 }, { "epoch": 8.004915102770331, "grad_norm": 2.366135358810425, "learning_rate": 5.822180234151214e-06, "loss": 0.6291, "num_input_tokens_seen": 31004232, "step": 53745 }, { "epoch": 8.00565981531129, "grad_norm": 4.451849460601807, "learning_rate": 5.818011794842221e-06, "loss": 0.7795, "num_input_tokens_seen": 31007176, "step": 53750 }, { "epoch": 8.006404527852249, "grad_norm": 1.0014687776565552, "learning_rate": 5.8138446517740005e-06, "loss": 0.7803, "num_input_tokens_seen": 31010184, "step": 53755 }, { "epoch": 8.007149240393208, "grad_norm": 1.8987317085266113, "learning_rate": 5.809678805228152e-06, "loss": 0.441, "num_input_tokens_seen": 31013192, "step": 53760 }, { "epoch": 8.007893952934168, "grad_norm": 2.826444387435913, "learning_rate": 5.805514255486191e-06, "loss": 0.5363, "num_input_tokens_seen": 31016072, "step": 53765 }, { "epoch": 8.008638665475127, "grad_norm": 2.494039535522461, "learning_rate": 5.801351002829542e-06, "loss": 0.5745, "num_input_tokens_seen": 31018952, "step": 53770 }, { "epoch": 8.009383378016086, "grad_norm": 2.110076427459717, "learning_rate": 5.797189047539531e-06, "loss": 0.6512, "num_input_tokens_seen": 31021864, "step": 53775 }, { "epoch": 8.010128090557044, "grad_norm": 6.619787216186523, "learning_rate": 5.793028389897418e-06, "loss": 0.567, "num_input_tokens_seen": 31024648, "step": 53780 }, { "epoch": 8.010872803098005, "grad_norm": 2.4079930782318115, "learning_rate": 5.788869030184346e-06, "loss": 0.5447, "num_input_tokens_seen": 31027528, "step": 53785 }, { "epoch": 8.011617515638964, "grad_norm": 3.5733132362365723, "learning_rate": 5.784710968681403e-06, "loss": 0.7016, "num_input_tokens_seen": 31030248, "step": 53790 }, { "epoch": 8.012362228179922, "grad_norm": 2.945225954055786, "learning_rate": 5.780554205669567e-06, "loss": 0.7551, "num_input_tokens_seen": 31033000, "step": 53795 }, { "epoch": 8.013106940720881, "grad_norm": 2.5325441360473633, "learning_rate": 5.776398741429737e-06, "loss": 0.5178, "num_input_tokens_seen": 31035848, "step": 53800 }, { "epoch": 8.013851653261842, "grad_norm": 3.424571990966797, "learning_rate": 5.77224457624273e-06, "loss": 0.4399, "num_input_tokens_seen": 31038952, "step": 53805 }, { "epoch": 8.0145963658028, "grad_norm": 2.604828357696533, "learning_rate": 5.768091710389254e-06, "loss": 0.65, "num_input_tokens_seen": 31042088, "step": 53810 }, { "epoch": 8.01534107834376, "grad_norm": 3.350309371948242, "learning_rate": 5.763940144149954e-06, "loss": 0.7132, "num_input_tokens_seen": 31044840, "step": 53815 }, { "epoch": 8.016085790884718, "grad_norm": 2.327134370803833, "learning_rate": 5.759789877805363e-06, "loss": 0.5554, "num_input_tokens_seen": 31047752, "step": 53820 }, { "epoch": 8.016830503425677, "grad_norm": 3.724222421646118, "learning_rate": 5.755640911635951e-06, "loss": 0.6208, "num_input_tokens_seen": 31050824, "step": 53825 }, { "epoch": 8.017575215966637, "grad_norm": 3.593489170074463, "learning_rate": 5.7514932459220825e-06, "loss": 0.6471, "num_input_tokens_seen": 31053672, "step": 53830 }, { "epoch": 8.018319928507596, "grad_norm": 2.4454493522644043, "learning_rate": 5.747346880944041e-06, "loss": 0.5333, "num_input_tokens_seen": 31056232, "step": 53835 }, { "epoch": 8.019064641048555, "grad_norm": 1.8662830591201782, "learning_rate": 5.743201816982027e-06, "loss": 0.5486, "num_input_tokens_seen": 31059304, "step": 53840 }, { "epoch": 8.019809353589514, "grad_norm": 1.8437573909759521, "learning_rate": 5.739058054316138e-06, "loss": 0.436, "num_input_tokens_seen": 31062376, "step": 53845 }, { "epoch": 8.020554066130474, "grad_norm": 2.7650973796844482, "learning_rate": 5.7349155932264035e-06, "loss": 0.5934, "num_input_tokens_seen": 31065352, "step": 53850 }, { "epoch": 8.021298778671433, "grad_norm": 4.3627848625183105, "learning_rate": 5.730774433992739e-06, "loss": 0.7285, "num_input_tokens_seen": 31068168, "step": 53855 }, { "epoch": 8.022043491212392, "grad_norm": 3.606384515762329, "learning_rate": 5.726634576894993e-06, "loss": 0.6901, "num_input_tokens_seen": 31071208, "step": 53860 }, { "epoch": 8.02278820375335, "grad_norm": 2.4622273445129395, "learning_rate": 5.7224960222129255e-06, "loss": 0.4738, "num_input_tokens_seen": 31074184, "step": 53865 }, { "epoch": 8.023532916294311, "grad_norm": 2.485020875930786, "learning_rate": 5.718358770226201e-06, "loss": 0.4762, "num_input_tokens_seen": 31077032, "step": 53870 }, { "epoch": 8.02427762883527, "grad_norm": 4.786159992218018, "learning_rate": 5.714222821214402e-06, "loss": 0.5011, "num_input_tokens_seen": 31079528, "step": 53875 }, { "epoch": 8.025022341376228, "grad_norm": 6.8903489112854, "learning_rate": 5.710088175457007e-06, "loss": 0.6267, "num_input_tokens_seen": 31082408, "step": 53880 }, { "epoch": 8.025767053917187, "grad_norm": 2.8408637046813965, "learning_rate": 5.705954833233429e-06, "loss": 0.5281, "num_input_tokens_seen": 31085096, "step": 53885 }, { "epoch": 8.026511766458148, "grad_norm": 3.930659532546997, "learning_rate": 5.701822794822978e-06, "loss": 0.6034, "num_input_tokens_seen": 31088296, "step": 53890 }, { "epoch": 8.027256478999107, "grad_norm": 3.005061388015747, "learning_rate": 5.6976920605048885e-06, "loss": 0.6148, "num_input_tokens_seen": 31091432, "step": 53895 }, { "epoch": 8.028001191540065, "grad_norm": 2.268259048461914, "learning_rate": 5.6935626305582855e-06, "loss": 0.7181, "num_input_tokens_seen": 31094536, "step": 53900 }, { "epoch": 8.028745904081024, "grad_norm": 2.239696979522705, "learning_rate": 5.6894345052622255e-06, "loss": 0.5662, "num_input_tokens_seen": 31097512, "step": 53905 }, { "epoch": 8.029490616621985, "grad_norm": 1.8726402521133423, "learning_rate": 5.685307684895677e-06, "loss": 0.6264, "num_input_tokens_seen": 31100392, "step": 53910 }, { "epoch": 8.030235329162943, "grad_norm": 2.754699468612671, "learning_rate": 5.6811821697375005e-06, "loss": 0.5211, "num_input_tokens_seen": 31103080, "step": 53915 }, { "epoch": 8.030980041703902, "grad_norm": 6.744041919708252, "learning_rate": 5.67705796006649e-06, "loss": 0.6749, "num_input_tokens_seen": 31106344, "step": 53920 }, { "epoch": 8.03172475424486, "grad_norm": 2.6289708614349365, "learning_rate": 5.672935056161338e-06, "loss": 0.5818, "num_input_tokens_seen": 31109320, "step": 53925 }, { "epoch": 8.032469466785821, "grad_norm": 2.40621018409729, "learning_rate": 5.668813458300659e-06, "loss": 0.7112, "num_input_tokens_seen": 31112072, "step": 53930 }, { "epoch": 8.03321417932678, "grad_norm": 6.0557403564453125, "learning_rate": 5.664693166762977e-06, "loss": 0.7468, "num_input_tokens_seen": 31114824, "step": 53935 }, { "epoch": 8.033958891867739, "grad_norm": 3.427683115005493, "learning_rate": 5.660574181826714e-06, "loss": 0.576, "num_input_tokens_seen": 31117640, "step": 53940 }, { "epoch": 8.034703604408698, "grad_norm": 2.104255437850952, "learning_rate": 5.656456503770227e-06, "loss": 0.454, "num_input_tokens_seen": 31120840, "step": 53945 }, { "epoch": 8.035448316949658, "grad_norm": 4.1928839683532715, "learning_rate": 5.652340132871756e-06, "loss": 0.4329, "num_input_tokens_seen": 31124104, "step": 53950 }, { "epoch": 8.036193029490617, "grad_norm": 2.1045475006103516, "learning_rate": 5.648225069409477e-06, "loss": 0.635, "num_input_tokens_seen": 31126728, "step": 53955 }, { "epoch": 8.036937742031576, "grad_norm": 4.290744304656982, "learning_rate": 5.644111313661471e-06, "loss": 0.5311, "num_input_tokens_seen": 31129480, "step": 53960 }, { "epoch": 8.037682454572534, "grad_norm": 3.0774598121643066, "learning_rate": 5.639998865905724e-06, "loss": 0.6615, "num_input_tokens_seen": 31132584, "step": 53965 }, { "epoch": 8.038427167113495, "grad_norm": 4.572042465209961, "learning_rate": 5.63588772642015e-06, "loss": 0.7071, "num_input_tokens_seen": 31135848, "step": 53970 }, { "epoch": 8.039171879654454, "grad_norm": 2.183238983154297, "learning_rate": 5.631777895482549e-06, "loss": 0.6542, "num_input_tokens_seen": 31138792, "step": 53975 }, { "epoch": 8.039916592195413, "grad_norm": 5.402338981628418, "learning_rate": 5.627669373370658e-06, "loss": 0.632, "num_input_tokens_seen": 31141704, "step": 53980 }, { "epoch": 8.040661304736371, "grad_norm": 1.9575607776641846, "learning_rate": 5.6235621603621004e-06, "loss": 0.5292, "num_input_tokens_seen": 31144552, "step": 53985 }, { "epoch": 8.041406017277332, "grad_norm": 1.7344441413879395, "learning_rate": 5.619456256734434e-06, "loss": 0.4176, "num_input_tokens_seen": 31147560, "step": 53990 }, { "epoch": 8.04215072981829, "grad_norm": 1.6665594577789307, "learning_rate": 5.61535166276512e-06, "loss": 0.4966, "num_input_tokens_seen": 31150568, "step": 53995 }, { "epoch": 8.04289544235925, "grad_norm": 2.8813679218292236, "learning_rate": 5.611248378731526e-06, "loss": 0.5861, "num_input_tokens_seen": 31153512, "step": 54000 }, { "epoch": 8.043640154900208, "grad_norm": 3.041393756866455, "learning_rate": 5.607146404910949e-06, "loss": 0.6458, "num_input_tokens_seen": 31156232, "step": 54005 }, { "epoch": 8.044384867441167, "grad_norm": 1.6882121562957764, "learning_rate": 5.603045741580559e-06, "loss": 0.7964, "num_input_tokens_seen": 31159048, "step": 54010 }, { "epoch": 8.045129579982127, "grad_norm": 2.1518208980560303, "learning_rate": 5.598946389017487e-06, "loss": 0.6374, "num_input_tokens_seen": 31161928, "step": 54015 }, { "epoch": 8.045874292523086, "grad_norm": 3.1968703269958496, "learning_rate": 5.59484834749873e-06, "loss": 0.6489, "num_input_tokens_seen": 31164776, "step": 54020 }, { "epoch": 8.046619005064045, "grad_norm": 2.737863302230835, "learning_rate": 5.590751617301226e-06, "loss": 0.6038, "num_input_tokens_seen": 31167944, "step": 54025 }, { "epoch": 8.047363717605004, "grad_norm": 3.3223073482513428, "learning_rate": 5.5866561987018176e-06, "loss": 0.4737, "num_input_tokens_seen": 31170536, "step": 54030 }, { "epoch": 8.048108430145964, "grad_norm": 4.656395435333252, "learning_rate": 5.582562091977253e-06, "loss": 0.6441, "num_input_tokens_seen": 31173736, "step": 54035 }, { "epoch": 8.048853142686923, "grad_norm": 5.243701457977295, "learning_rate": 5.578469297404204e-06, "loss": 0.8055, "num_input_tokens_seen": 31176872, "step": 54040 }, { "epoch": 8.049597855227882, "grad_norm": 1.9915916919708252, "learning_rate": 5.574377815259229e-06, "loss": 0.6069, "num_input_tokens_seen": 31179784, "step": 54045 }, { "epoch": 8.05034256776884, "grad_norm": 1.8603107929229736, "learning_rate": 5.570287645818825e-06, "loss": 0.5411, "num_input_tokens_seen": 31182536, "step": 54050 }, { "epoch": 8.051087280309801, "grad_norm": 2.5013978481292725, "learning_rate": 5.566198789359392e-06, "loss": 0.6019, "num_input_tokens_seen": 31185352, "step": 54055 }, { "epoch": 8.05183199285076, "grad_norm": 3.835895299911499, "learning_rate": 5.562111246157228e-06, "loss": 0.7081, "num_input_tokens_seen": 31188360, "step": 54060 }, { "epoch": 8.052576705391719, "grad_norm": 2.107661724090576, "learning_rate": 5.558025016488555e-06, "loss": 0.6414, "num_input_tokens_seen": 31191272, "step": 54065 }, { "epoch": 8.053321417932677, "grad_norm": 3.1270341873168945, "learning_rate": 5.553940100629507e-06, "loss": 0.4632, "num_input_tokens_seen": 31193992, "step": 54070 }, { "epoch": 8.054066130473638, "grad_norm": 1.1915972232818604, "learning_rate": 5.549856498856129e-06, "loss": 0.3858, "num_input_tokens_seen": 31196712, "step": 54075 }, { "epoch": 8.054810843014597, "grad_norm": 2.265249013900757, "learning_rate": 5.545774211444369e-06, "loss": 0.6632, "num_input_tokens_seen": 31199848, "step": 54080 }, { "epoch": 8.055555555555555, "grad_norm": 2.0873172283172607, "learning_rate": 5.541693238670087e-06, "loss": 0.6288, "num_input_tokens_seen": 31202696, "step": 54085 }, { "epoch": 8.056300268096514, "grad_norm": 2.01107120513916, "learning_rate": 5.537613580809067e-06, "loss": 0.4712, "num_input_tokens_seen": 31205320, "step": 54090 }, { "epoch": 8.057044980637475, "grad_norm": 4.12598991394043, "learning_rate": 5.533535238137e-06, "loss": 0.6069, "num_input_tokens_seen": 31208584, "step": 54095 }, { "epoch": 8.057789693178433, "grad_norm": 2.52197527885437, "learning_rate": 5.5294582109294696e-06, "loss": 0.4445, "num_input_tokens_seen": 31211624, "step": 54100 }, { "epoch": 8.058534405719392, "grad_norm": 2.6327600479125977, "learning_rate": 5.525382499461993e-06, "loss": 0.5212, "num_input_tokens_seen": 31214376, "step": 54105 }, { "epoch": 8.059279118260351, "grad_norm": 2.7588083744049072, "learning_rate": 5.521308104009992e-06, "loss": 0.5874, "num_input_tokens_seen": 31217288, "step": 54110 }, { "epoch": 8.060023830801311, "grad_norm": 2.8039398193359375, "learning_rate": 5.517235024848791e-06, "loss": 0.5361, "num_input_tokens_seen": 31220040, "step": 54115 }, { "epoch": 8.06076854334227, "grad_norm": 4.236683368682861, "learning_rate": 5.513163262253635e-06, "loss": 0.5367, "num_input_tokens_seen": 31222984, "step": 54120 }, { "epoch": 8.061513255883229, "grad_norm": 2.189716100692749, "learning_rate": 5.509092816499678e-06, "loss": 0.4809, "num_input_tokens_seen": 31225928, "step": 54125 }, { "epoch": 8.062257968424188, "grad_norm": 2.0235683917999268, "learning_rate": 5.505023687861985e-06, "loss": 0.5171, "num_input_tokens_seen": 31229192, "step": 54130 }, { "epoch": 8.063002680965148, "grad_norm": 2.1275453567504883, "learning_rate": 5.500955876615538e-06, "loss": 0.4943, "num_input_tokens_seen": 31232136, "step": 54135 }, { "epoch": 8.063747393506107, "grad_norm": 2.3510801792144775, "learning_rate": 5.496889383035206e-06, "loss": 0.6409, "num_input_tokens_seen": 31235080, "step": 54140 }, { "epoch": 8.064492106047066, "grad_norm": 5.377347469329834, "learning_rate": 5.492824207395805e-06, "loss": 0.5772, "num_input_tokens_seen": 31238152, "step": 54145 }, { "epoch": 8.065236818588025, "grad_norm": 3.313760280609131, "learning_rate": 5.4887603499720244e-06, "loss": 0.4736, "num_input_tokens_seen": 31240968, "step": 54150 }, { "epoch": 8.065981531128985, "grad_norm": 1.8303385972976685, "learning_rate": 5.484697811038494e-06, "loss": 0.4971, "num_input_tokens_seen": 31243880, "step": 54155 }, { "epoch": 8.066726243669944, "grad_norm": 3.169665575027466, "learning_rate": 5.480636590869742e-06, "loss": 0.7, "num_input_tokens_seen": 31246888, "step": 54160 }, { "epoch": 8.067470956210903, "grad_norm": 3.804619789123535, "learning_rate": 5.476576689740209e-06, "loss": 0.5146, "num_input_tokens_seen": 31250120, "step": 54165 }, { "epoch": 8.068215668751861, "grad_norm": 4.544778823852539, "learning_rate": 5.472518107924255e-06, "loss": 0.5803, "num_input_tokens_seen": 31253128, "step": 54170 }, { "epoch": 8.06896038129282, "grad_norm": 3.633113145828247, "learning_rate": 5.468460845696133e-06, "loss": 0.6023, "num_input_tokens_seen": 31255944, "step": 54175 }, { "epoch": 8.06970509383378, "grad_norm": 1.9887927770614624, "learning_rate": 5.4644049033300085e-06, "loss": 0.5415, "num_input_tokens_seen": 31258760, "step": 54180 }, { "epoch": 8.07044980637474, "grad_norm": 3.2832183837890625, "learning_rate": 5.460350281099977e-06, "loss": 0.5231, "num_input_tokens_seen": 31261672, "step": 54185 }, { "epoch": 8.071194518915698, "grad_norm": 3.4662585258483887, "learning_rate": 5.4562969792800315e-06, "loss": 0.5626, "num_input_tokens_seen": 31264552, "step": 54190 }, { "epoch": 8.071939231456657, "grad_norm": 3.1129629611968994, "learning_rate": 5.452244998144076e-06, "loss": 0.5979, "num_input_tokens_seen": 31267464, "step": 54195 }, { "epoch": 8.072683943997617, "grad_norm": 2.3176605701446533, "learning_rate": 5.448194337965931e-06, "loss": 0.5026, "num_input_tokens_seen": 31270024, "step": 54200 }, { "epoch": 8.073428656538576, "grad_norm": 4.712691783905029, "learning_rate": 5.444144999019324e-06, "loss": 0.6068, "num_input_tokens_seen": 31272872, "step": 54205 }, { "epoch": 8.074173369079535, "grad_norm": 3.7806856632232666, "learning_rate": 5.4400969815778854e-06, "loss": 0.5757, "num_input_tokens_seen": 31275848, "step": 54210 }, { "epoch": 8.074918081620494, "grad_norm": 3.7713489532470703, "learning_rate": 5.436050285915173e-06, "loss": 0.9031, "num_input_tokens_seen": 31278824, "step": 54215 }, { "epoch": 8.075662794161454, "grad_norm": 2.7321269512176514, "learning_rate": 5.432004912304636e-06, "loss": 0.6204, "num_input_tokens_seen": 31282152, "step": 54220 }, { "epoch": 8.076407506702413, "grad_norm": 2.057816505432129, "learning_rate": 5.427960861019648e-06, "loss": 0.6366, "num_input_tokens_seen": 31285032, "step": 54225 }, { "epoch": 8.077152219243372, "grad_norm": 2.8650670051574707, "learning_rate": 5.423918132333491e-06, "loss": 0.4973, "num_input_tokens_seen": 31287944, "step": 54230 }, { "epoch": 8.07789693178433, "grad_norm": 2.526106119155884, "learning_rate": 5.4198767265193574e-06, "loss": 0.5077, "num_input_tokens_seen": 31290856, "step": 54235 }, { "epoch": 8.078641644325291, "grad_norm": 1.9090880155563354, "learning_rate": 5.415836643850352e-06, "loss": 0.5528, "num_input_tokens_seen": 31293704, "step": 54240 }, { "epoch": 8.07938635686625, "grad_norm": 2.1791908740997314, "learning_rate": 5.411797884599479e-06, "loss": 0.5404, "num_input_tokens_seen": 31296680, "step": 54245 }, { "epoch": 8.080131069407209, "grad_norm": 3.0879695415496826, "learning_rate": 5.407760449039662e-06, "loss": 0.7356, "num_input_tokens_seen": 31299528, "step": 54250 }, { "epoch": 8.080875781948167, "grad_norm": 3.716010570526123, "learning_rate": 5.403724337443747e-06, "loss": 0.7999, "num_input_tokens_seen": 31302216, "step": 54255 }, { "epoch": 8.081620494489128, "grad_norm": 2.294682264328003, "learning_rate": 5.399689550084461e-06, "loss": 0.4748, "num_input_tokens_seen": 31305128, "step": 54260 }, { "epoch": 8.082365207030087, "grad_norm": 4.289132118225098, "learning_rate": 5.395656087234466e-06, "loss": 0.5127, "num_input_tokens_seen": 31307784, "step": 54265 }, { "epoch": 8.083109919571045, "grad_norm": 2.1168785095214844, "learning_rate": 5.391623949166327e-06, "loss": 0.687, "num_input_tokens_seen": 31310408, "step": 54270 }, { "epoch": 8.083854632112004, "grad_norm": 2.788343906402588, "learning_rate": 5.387593136152527e-06, "loss": 0.6292, "num_input_tokens_seen": 31313576, "step": 54275 }, { "epoch": 8.084599344652965, "grad_norm": 3.3193185329437256, "learning_rate": 5.383563648465437e-06, "loss": 0.6616, "num_input_tokens_seen": 31316488, "step": 54280 }, { "epoch": 8.085344057193923, "grad_norm": 3.0782597064971924, "learning_rate": 5.37953548637736e-06, "loss": 0.5666, "num_input_tokens_seen": 31319336, "step": 54285 }, { "epoch": 8.086088769734882, "grad_norm": 2.6442487239837646, "learning_rate": 5.375508650160507e-06, "loss": 0.3909, "num_input_tokens_seen": 31322152, "step": 54290 }, { "epoch": 8.086833482275841, "grad_norm": 3.361163377761841, "learning_rate": 5.371483140086997e-06, "loss": 0.5012, "num_input_tokens_seen": 31325064, "step": 54295 }, { "epoch": 8.087578194816802, "grad_norm": 6.264185905456543, "learning_rate": 5.367458956428845e-06, "loss": 0.3962, "num_input_tokens_seen": 31328200, "step": 54300 }, { "epoch": 8.08832290735776, "grad_norm": 3.3242881298065186, "learning_rate": 5.363436099457997e-06, "loss": 0.4704, "num_input_tokens_seen": 31331176, "step": 54305 }, { "epoch": 8.089067619898719, "grad_norm": 2.876948595046997, "learning_rate": 5.359414569446308e-06, "loss": 0.5481, "num_input_tokens_seen": 31333864, "step": 54310 }, { "epoch": 8.089812332439678, "grad_norm": 2.0722291469573975, "learning_rate": 5.355394366665525e-06, "loss": 0.4411, "num_input_tokens_seen": 31336648, "step": 54315 }, { "epoch": 8.090557044980638, "grad_norm": 3.2563116550445557, "learning_rate": 5.35137549138732e-06, "loss": 0.4248, "num_input_tokens_seen": 31339304, "step": 54320 }, { "epoch": 8.091301757521597, "grad_norm": 2.801328659057617, "learning_rate": 5.347357943883272e-06, "loss": 0.4012, "num_input_tokens_seen": 31342024, "step": 54325 }, { "epoch": 8.092046470062556, "grad_norm": 3.988827705383301, "learning_rate": 5.343341724424875e-06, "loss": 0.611, "num_input_tokens_seen": 31344968, "step": 54330 }, { "epoch": 8.092791182603515, "grad_norm": 4.104580402374268, "learning_rate": 5.339326833283531e-06, "loss": 0.5144, "num_input_tokens_seen": 31347816, "step": 54335 }, { "epoch": 8.093535895144473, "grad_norm": 7.908618927001953, "learning_rate": 5.335313270730546e-06, "loss": 0.7245, "num_input_tokens_seen": 31350536, "step": 54340 }, { "epoch": 8.094280607685434, "grad_norm": 3.406294345855713, "learning_rate": 5.331301037037132e-06, "loss": 0.5312, "num_input_tokens_seen": 31353384, "step": 54345 }, { "epoch": 8.095025320226393, "grad_norm": 1.9735469818115234, "learning_rate": 5.327290132474427e-06, "loss": 0.6679, "num_input_tokens_seen": 31356584, "step": 54350 }, { "epoch": 8.095770032767351, "grad_norm": 4.095175266265869, "learning_rate": 5.323280557313473e-06, "loss": 0.5425, "num_input_tokens_seen": 31359304, "step": 54355 }, { "epoch": 8.09651474530831, "grad_norm": 5.726034641265869, "learning_rate": 5.319272311825216e-06, "loss": 0.6401, "num_input_tokens_seen": 31362216, "step": 54360 }, { "epoch": 8.09725945784927, "grad_norm": 5.725591659545898, "learning_rate": 5.315265396280522e-06, "loss": 0.7554, "num_input_tokens_seen": 31364968, "step": 54365 }, { "epoch": 8.09800417039023, "grad_norm": 1.5136775970458984, "learning_rate": 5.311259810950167e-06, "loss": 0.4816, "num_input_tokens_seen": 31368072, "step": 54370 }, { "epoch": 8.098748882931188, "grad_norm": 2.78698992729187, "learning_rate": 5.3072555561048255e-06, "loss": 0.5611, "num_input_tokens_seen": 31370728, "step": 54375 }, { "epoch": 8.099493595472147, "grad_norm": 2.6643850803375244, "learning_rate": 5.303252632015082e-06, "loss": 0.5804, "num_input_tokens_seen": 31373672, "step": 54380 }, { "epoch": 8.100238308013108, "grad_norm": 3.0911920070648193, "learning_rate": 5.299251038951444e-06, "loss": 0.5411, "num_input_tokens_seen": 31376616, "step": 54385 }, { "epoch": 8.100983020554066, "grad_norm": 2.292316198348999, "learning_rate": 5.295250777184324e-06, "loss": 0.5941, "num_input_tokens_seen": 31379656, "step": 54390 }, { "epoch": 8.101727733095025, "grad_norm": 4.513917446136475, "learning_rate": 5.2912518469840436e-06, "loss": 0.7539, "num_input_tokens_seen": 31382600, "step": 54395 }, { "epoch": 8.102472445635984, "grad_norm": 1.617798924446106, "learning_rate": 5.287254248620832e-06, "loss": 0.6445, "num_input_tokens_seen": 31385352, "step": 54400 }, { "epoch": 8.103217158176944, "grad_norm": 1.9136075973510742, "learning_rate": 5.283257982364839e-06, "loss": 0.4707, "num_input_tokens_seen": 31388232, "step": 54405 }, { "epoch": 8.103961870717903, "grad_norm": 2.133223533630371, "learning_rate": 5.279263048486102e-06, "loss": 0.5514, "num_input_tokens_seen": 31390952, "step": 54410 }, { "epoch": 8.104706583258862, "grad_norm": 2.9274098873138428, "learning_rate": 5.275269447254597e-06, "loss": 0.4473, "num_input_tokens_seen": 31393736, "step": 54415 }, { "epoch": 8.10545129579982, "grad_norm": 3.0906155109405518, "learning_rate": 5.271277178940182e-06, "loss": 0.5122, "num_input_tokens_seen": 31396776, "step": 54420 }, { "epoch": 8.106196008340781, "grad_norm": 2.9527010917663574, "learning_rate": 5.267286243812641e-06, "loss": 0.5284, "num_input_tokens_seen": 31400104, "step": 54425 }, { "epoch": 8.10694072088174, "grad_norm": 3.455528736114502, "learning_rate": 5.263296642141671e-06, "loss": 0.7258, "num_input_tokens_seen": 31403016, "step": 54430 }, { "epoch": 8.107685433422699, "grad_norm": 3.92795729637146, "learning_rate": 5.25930837419687e-06, "loss": 0.4876, "num_input_tokens_seen": 31405800, "step": 54435 }, { "epoch": 8.108430145963657, "grad_norm": 2.983250141143799, "learning_rate": 5.2553214402477565e-06, "loss": 0.6273, "num_input_tokens_seen": 31408744, "step": 54440 }, { "epoch": 8.109174858504618, "grad_norm": 2.5141677856445312, "learning_rate": 5.251335840563737e-06, "loss": 0.4779, "num_input_tokens_seen": 31411432, "step": 54445 }, { "epoch": 8.109919571045577, "grad_norm": 1.710105299949646, "learning_rate": 5.247351575414148e-06, "loss": 0.654, "num_input_tokens_seen": 31414408, "step": 54450 }, { "epoch": 8.110664283586535, "grad_norm": 2.72904896736145, "learning_rate": 5.243368645068239e-06, "loss": 0.5985, "num_input_tokens_seen": 31417256, "step": 54455 }, { "epoch": 8.111408996127494, "grad_norm": 2.8713436126708984, "learning_rate": 5.239387049795144e-06, "loss": 0.6721, "num_input_tokens_seen": 31420040, "step": 54460 }, { "epoch": 8.112153708668455, "grad_norm": 1.9484018087387085, "learning_rate": 5.235406789863934e-06, "loss": 0.5281, "num_input_tokens_seen": 31422824, "step": 54465 }, { "epoch": 8.112898421209414, "grad_norm": 1.8809881210327148, "learning_rate": 5.2314278655435726e-06, "loss": 0.5513, "num_input_tokens_seen": 31425800, "step": 54470 }, { "epoch": 8.113643133750372, "grad_norm": 2.9723165035247803, "learning_rate": 5.227450277102952e-06, "loss": 0.5725, "num_input_tokens_seen": 31428744, "step": 54475 }, { "epoch": 8.114387846291331, "grad_norm": 4.812478065490723, "learning_rate": 5.223474024810846e-06, "loss": 0.6889, "num_input_tokens_seen": 31431624, "step": 54480 }, { "epoch": 8.115132558832292, "grad_norm": 4.617882251739502, "learning_rate": 5.219499108935957e-06, "loss": 0.7407, "num_input_tokens_seen": 31434536, "step": 54485 }, { "epoch": 8.11587727137325, "grad_norm": 1.9808486700057983, "learning_rate": 5.215525529746901e-06, "loss": 0.483, "num_input_tokens_seen": 31437416, "step": 54490 }, { "epoch": 8.116621983914209, "grad_norm": 2.9055049419403076, "learning_rate": 5.211553287512189e-06, "loss": 0.6115, "num_input_tokens_seen": 31440200, "step": 54495 }, { "epoch": 8.117366696455168, "grad_norm": 4.218189716339111, "learning_rate": 5.207582382500259e-06, "loss": 0.6815, "num_input_tokens_seen": 31443240, "step": 54500 }, { "epoch": 8.118111408996128, "grad_norm": 2.4023489952087402, "learning_rate": 5.203612814979442e-06, "loss": 0.6086, "num_input_tokens_seen": 31446056, "step": 54505 }, { "epoch": 8.118856121537087, "grad_norm": 2.1862876415252686, "learning_rate": 5.199644585217978e-06, "loss": 0.5897, "num_input_tokens_seen": 31448968, "step": 54510 }, { "epoch": 8.119600834078046, "grad_norm": 2.8457531929016113, "learning_rate": 5.19567769348403e-06, "loss": 0.6098, "num_input_tokens_seen": 31452072, "step": 54515 }, { "epoch": 8.120345546619005, "grad_norm": 2.808072805404663, "learning_rate": 5.1917121400456654e-06, "loss": 0.5041, "num_input_tokens_seen": 31455208, "step": 54520 }, { "epoch": 8.121090259159963, "grad_norm": 2.4739956855773926, "learning_rate": 5.187747925170858e-06, "loss": 0.6462, "num_input_tokens_seen": 31458120, "step": 54525 }, { "epoch": 8.121834971700924, "grad_norm": 3.4247939586639404, "learning_rate": 5.1837850491274985e-06, "loss": 0.5351, "num_input_tokens_seen": 31460712, "step": 54530 }, { "epoch": 8.122579684241883, "grad_norm": 3.11328387260437, "learning_rate": 5.179823512183382e-06, "loss": 0.5818, "num_input_tokens_seen": 31463656, "step": 54535 }, { "epoch": 8.123324396782841, "grad_norm": 2.411578893661499, "learning_rate": 5.175863314606211e-06, "loss": 0.5297, "num_input_tokens_seen": 31466504, "step": 54540 }, { "epoch": 8.1240691093238, "grad_norm": 3.3519511222839355, "learning_rate": 5.171904456663592e-06, "loss": 0.5835, "num_input_tokens_seen": 31469800, "step": 54545 }, { "epoch": 8.12481382186476, "grad_norm": 4.340783596038818, "learning_rate": 5.167946938623053e-06, "loss": 0.6853, "num_input_tokens_seen": 31472680, "step": 54550 }, { "epoch": 8.12555853440572, "grad_norm": 2.5144903659820557, "learning_rate": 5.16399076075203e-06, "loss": 0.3009, "num_input_tokens_seen": 31475496, "step": 54555 }, { "epoch": 8.126303246946678, "grad_norm": 2.846005916595459, "learning_rate": 5.160035923317863e-06, "loss": 0.5461, "num_input_tokens_seen": 31478408, "step": 54560 }, { "epoch": 8.127047959487637, "grad_norm": 3.197195529937744, "learning_rate": 5.156082426587808e-06, "loss": 0.7231, "num_input_tokens_seen": 31481480, "step": 54565 }, { "epoch": 8.127792672028598, "grad_norm": 2.9300525188446045, "learning_rate": 5.152130270829025e-06, "loss": 0.5589, "num_input_tokens_seen": 31484232, "step": 54570 }, { "epoch": 8.128537384569556, "grad_norm": 1.6143711805343628, "learning_rate": 5.14817945630858e-06, "loss": 0.61, "num_input_tokens_seen": 31487080, "step": 54575 }, { "epoch": 8.129282097110515, "grad_norm": 2.4563469886779785, "learning_rate": 5.144229983293461e-06, "loss": 0.5383, "num_input_tokens_seen": 31489800, "step": 54580 }, { "epoch": 8.130026809651474, "grad_norm": 2.302144765853882, "learning_rate": 5.140281852050544e-06, "loss": 0.4732, "num_input_tokens_seen": 31492936, "step": 54585 }, { "epoch": 8.130771522192434, "grad_norm": 2.232213258743286, "learning_rate": 5.136335062846636e-06, "loss": 0.5615, "num_input_tokens_seen": 31495944, "step": 54590 }, { "epoch": 8.131516234733393, "grad_norm": 2.525092601776123, "learning_rate": 5.132389615948446e-06, "loss": 0.5976, "num_input_tokens_seen": 31499496, "step": 54595 }, { "epoch": 8.132260947274352, "grad_norm": 2.2251298427581787, "learning_rate": 5.12844551162259e-06, "loss": 0.6598, "num_input_tokens_seen": 31502312, "step": 54600 }, { "epoch": 8.13300565981531, "grad_norm": 2.865413188934326, "learning_rate": 5.124502750135601e-06, "loss": 0.5363, "num_input_tokens_seen": 31505480, "step": 54605 }, { "epoch": 8.133750372356271, "grad_norm": 2.501866340637207, "learning_rate": 5.120561331753901e-06, "loss": 0.6149, "num_input_tokens_seen": 31508296, "step": 54610 }, { "epoch": 8.13449508489723, "grad_norm": 1.9437681436538696, "learning_rate": 5.116621256743842e-06, "loss": 0.505, "num_input_tokens_seen": 31511208, "step": 54615 }, { "epoch": 8.135239797438189, "grad_norm": 3.2298660278320312, "learning_rate": 5.112682525371687e-06, "loss": 0.7251, "num_input_tokens_seen": 31514440, "step": 54620 }, { "epoch": 8.135984509979147, "grad_norm": 2.5564939975738525, "learning_rate": 5.108745137903584e-06, "loss": 0.6673, "num_input_tokens_seen": 31517320, "step": 54625 }, { "epoch": 8.136729222520108, "grad_norm": 3.7746481895446777, "learning_rate": 5.104809094605612e-06, "loss": 0.7101, "num_input_tokens_seen": 31520360, "step": 54630 }, { "epoch": 8.137473935061067, "grad_norm": 1.0695760250091553, "learning_rate": 5.100874395743752e-06, "loss": 0.4042, "num_input_tokens_seen": 31522920, "step": 54635 }, { "epoch": 8.138218647602026, "grad_norm": 1.990488052368164, "learning_rate": 5.0969410415839026e-06, "loss": 0.447, "num_input_tokens_seen": 31525832, "step": 54640 }, { "epoch": 8.138963360142984, "grad_norm": 4.294409275054932, "learning_rate": 5.093009032391854e-06, "loss": 0.4088, "num_input_tokens_seen": 31528552, "step": 54645 }, { "epoch": 8.139708072683945, "grad_norm": 2.6349270343780518, "learning_rate": 5.089078368433317e-06, "loss": 0.4459, "num_input_tokens_seen": 31531368, "step": 54650 }, { "epoch": 8.140452785224904, "grad_norm": 5.944397926330566, "learning_rate": 5.0851490499739144e-06, "loss": 0.6171, "num_input_tokens_seen": 31534120, "step": 54655 }, { "epoch": 8.141197497765862, "grad_norm": 11.977334022521973, "learning_rate": 5.081221077279174e-06, "loss": 0.6488, "num_input_tokens_seen": 31537192, "step": 54660 }, { "epoch": 8.141942210306821, "grad_norm": 4.906280517578125, "learning_rate": 5.0772944506145254e-06, "loss": 0.6331, "num_input_tokens_seen": 31540168, "step": 54665 }, { "epoch": 8.142686922847782, "grad_norm": 4.135232448577881, "learning_rate": 5.073369170245324e-06, "loss": 0.6532, "num_input_tokens_seen": 31542888, "step": 54670 }, { "epoch": 8.14343163538874, "grad_norm": 3.225825309753418, "learning_rate": 5.069445236436813e-06, "loss": 0.6689, "num_input_tokens_seen": 31546184, "step": 54675 }, { "epoch": 8.1441763479297, "grad_norm": 2.0749571323394775, "learning_rate": 5.065522649454157e-06, "loss": 0.4496, "num_input_tokens_seen": 31549192, "step": 54680 }, { "epoch": 8.144921060470658, "grad_norm": 3.330657720565796, "learning_rate": 5.061601409562436e-06, "loss": 0.6164, "num_input_tokens_seen": 31552008, "step": 54685 }, { "epoch": 8.145665773011617, "grad_norm": 1.5179589986801147, "learning_rate": 5.057681517026627e-06, "loss": 0.6849, "num_input_tokens_seen": 31555016, "step": 54690 }, { "epoch": 8.146410485552577, "grad_norm": 5.303351879119873, "learning_rate": 5.053762972111623e-06, "loss": 0.7061, "num_input_tokens_seen": 31557704, "step": 54695 }, { "epoch": 8.147155198093536, "grad_norm": 2.7242002487182617, "learning_rate": 5.049845775082227e-06, "loss": 0.5779, "num_input_tokens_seen": 31560456, "step": 54700 }, { "epoch": 8.147899910634495, "grad_norm": 4.499570369720459, "learning_rate": 5.045929926203144e-06, "loss": 0.3759, "num_input_tokens_seen": 31563208, "step": 54705 }, { "epoch": 8.148644623175453, "grad_norm": 3.451857805252075, "learning_rate": 5.04201542573898e-06, "loss": 0.5617, "num_input_tokens_seen": 31566216, "step": 54710 }, { "epoch": 8.149389335716414, "grad_norm": 2.4276936054229736, "learning_rate": 5.0381022739542734e-06, "loss": 0.6548, "num_input_tokens_seen": 31569032, "step": 54715 }, { "epoch": 8.150134048257373, "grad_norm": 2.955667734146118, "learning_rate": 5.034190471113453e-06, "loss": 0.5854, "num_input_tokens_seen": 31571880, "step": 54720 }, { "epoch": 8.150878760798332, "grad_norm": 3.1776444911956787, "learning_rate": 5.0302800174808654e-06, "loss": 0.6869, "num_input_tokens_seen": 31574888, "step": 54725 }, { "epoch": 8.15162347333929, "grad_norm": 2.1002988815307617, "learning_rate": 5.026370913320766e-06, "loss": 0.2865, "num_input_tokens_seen": 31577800, "step": 54730 }, { "epoch": 8.15236818588025, "grad_norm": 2.724130392074585, "learning_rate": 5.022463158897317e-06, "loss": 0.5624, "num_input_tokens_seen": 31580456, "step": 54735 }, { "epoch": 8.15311289842121, "grad_norm": 2.6775882244110107, "learning_rate": 5.018556754474588e-06, "loss": 0.6366, "num_input_tokens_seen": 31583496, "step": 54740 }, { "epoch": 8.153857610962168, "grad_norm": 3.222874641418457, "learning_rate": 5.014651700316547e-06, "loss": 0.4028, "num_input_tokens_seen": 31586408, "step": 54745 }, { "epoch": 8.154602323503127, "grad_norm": 3.886226177215576, "learning_rate": 5.010747996687087e-06, "loss": 0.6275, "num_input_tokens_seen": 31589192, "step": 54750 }, { "epoch": 8.155347036044088, "grad_norm": 2.4854862689971924, "learning_rate": 5.006845643850011e-06, "loss": 0.3848, "num_input_tokens_seen": 31592168, "step": 54755 }, { "epoch": 8.156091748585046, "grad_norm": 2.3413188457489014, "learning_rate": 5.002944642069019e-06, "loss": 0.4662, "num_input_tokens_seen": 31594952, "step": 54760 }, { "epoch": 8.156836461126005, "grad_norm": 3.077176094055176, "learning_rate": 4.9990449916077234e-06, "loss": 0.4793, "num_input_tokens_seen": 31597960, "step": 54765 }, { "epoch": 8.157581173666964, "grad_norm": 2.682462453842163, "learning_rate": 4.995146692729661e-06, "loss": 0.7013, "num_input_tokens_seen": 31600712, "step": 54770 }, { "epoch": 8.158325886207924, "grad_norm": 3.4906904697418213, "learning_rate": 4.99124974569824e-06, "loss": 0.5456, "num_input_tokens_seen": 31603816, "step": 54775 }, { "epoch": 8.159070598748883, "grad_norm": 3.6383039951324463, "learning_rate": 4.987354150776819e-06, "loss": 0.4567, "num_input_tokens_seen": 31606600, "step": 54780 }, { "epoch": 8.159815311289842, "grad_norm": 2.2335190773010254, "learning_rate": 4.9834599082286325e-06, "loss": 0.5978, "num_input_tokens_seen": 31609544, "step": 54785 }, { "epoch": 8.1605600238308, "grad_norm": 3.5053482055664062, "learning_rate": 4.979567018316847e-06, "loss": 0.6638, "num_input_tokens_seen": 31612904, "step": 54790 }, { "epoch": 8.161304736371761, "grad_norm": 3.818734884262085, "learning_rate": 4.975675481304523e-06, "loss": 0.6863, "num_input_tokens_seen": 31615656, "step": 54795 }, { "epoch": 8.16204944891272, "grad_norm": 1.9444024562835693, "learning_rate": 4.971785297454637e-06, "loss": 0.2363, "num_input_tokens_seen": 31618536, "step": 54800 }, { "epoch": 8.162794161453679, "grad_norm": 3.769561767578125, "learning_rate": 4.9678964670300785e-06, "loss": 0.709, "num_input_tokens_seen": 31621512, "step": 54805 }, { "epoch": 8.163538873994638, "grad_norm": 4.522946357727051, "learning_rate": 4.964008990293626e-06, "loss": 0.6039, "num_input_tokens_seen": 31624296, "step": 54810 }, { "epoch": 8.164283586535598, "grad_norm": 2.825883626937866, "learning_rate": 4.960122867507983e-06, "loss": 0.562, "num_input_tokens_seen": 31627400, "step": 54815 }, { "epoch": 8.165028299076557, "grad_norm": 2.008679151535034, "learning_rate": 4.95623809893577e-06, "loss": 0.4905, "num_input_tokens_seen": 31630408, "step": 54820 }, { "epoch": 8.165773011617516, "grad_norm": 3.3395090103149414, "learning_rate": 4.952354684839486e-06, "loss": 0.5379, "num_input_tokens_seen": 31633352, "step": 54825 }, { "epoch": 8.166517724158474, "grad_norm": 6.196814060211182, "learning_rate": 4.948472625481565e-06, "loss": 0.6845, "num_input_tokens_seen": 31636264, "step": 54830 }, { "epoch": 8.167262436699435, "grad_norm": 2.106313467025757, "learning_rate": 4.944591921124348e-06, "loss": 0.4958, "num_input_tokens_seen": 31638920, "step": 54835 }, { "epoch": 8.168007149240394, "grad_norm": 3.3454926013946533, "learning_rate": 4.940712572030062e-06, "loss": 0.7305, "num_input_tokens_seen": 31641896, "step": 54840 }, { "epoch": 8.168751861781352, "grad_norm": 2.317568063735962, "learning_rate": 4.936834578460867e-06, "loss": 0.5176, "num_input_tokens_seen": 31644712, "step": 54845 }, { "epoch": 8.169496574322311, "grad_norm": 2.0328266620635986, "learning_rate": 4.932957940678818e-06, "loss": 0.6726, "num_input_tokens_seen": 31647688, "step": 54850 }, { "epoch": 8.17024128686327, "grad_norm": 3.230894088745117, "learning_rate": 4.9290826589458854e-06, "loss": 0.5831, "num_input_tokens_seen": 31650280, "step": 54855 }, { "epoch": 8.17098599940423, "grad_norm": 1.6355420351028442, "learning_rate": 4.92520873352395e-06, "loss": 0.5904, "num_input_tokens_seen": 31653224, "step": 54860 }, { "epoch": 8.17173071194519, "grad_norm": 5.840190887451172, "learning_rate": 4.921336164674786e-06, "loss": 0.591, "num_input_tokens_seen": 31656392, "step": 54865 }, { "epoch": 8.172475424486148, "grad_norm": 1.3945391178131104, "learning_rate": 4.917464952660094e-06, "loss": 0.3899, "num_input_tokens_seen": 31659272, "step": 54870 }, { "epoch": 8.173220137027107, "grad_norm": 4.54754114151001, "learning_rate": 4.9135950977414666e-06, "loss": 0.7127, "num_input_tokens_seen": 31662056, "step": 54875 }, { "epoch": 8.173964849568067, "grad_norm": 1.3630908727645874, "learning_rate": 4.909726600180417e-06, "loss": 0.5168, "num_input_tokens_seen": 31665128, "step": 54880 }, { "epoch": 8.174709562109026, "grad_norm": 1.8463462591171265, "learning_rate": 4.9058594602383625e-06, "loss": 0.4869, "num_input_tokens_seen": 31668232, "step": 54885 }, { "epoch": 8.175454274649985, "grad_norm": 2.815838575363159, "learning_rate": 4.9019936781766275e-06, "loss": 0.5937, "num_input_tokens_seen": 31671112, "step": 54890 }, { "epoch": 8.176198987190944, "grad_norm": 2.798386812210083, "learning_rate": 4.898129254256448e-06, "loss": 0.6242, "num_input_tokens_seen": 31674088, "step": 54895 }, { "epoch": 8.176943699731904, "grad_norm": 2.8617639541625977, "learning_rate": 4.8942661887389715e-06, "loss": 0.4797, "num_input_tokens_seen": 31676680, "step": 54900 }, { "epoch": 8.177688412272863, "grad_norm": 2.8998982906341553, "learning_rate": 4.890404481885244e-06, "loss": 0.6242, "num_input_tokens_seen": 31679624, "step": 54905 }, { "epoch": 8.178433124813822, "grad_norm": 1.7081468105316162, "learning_rate": 4.886544133956211e-06, "loss": 0.7408, "num_input_tokens_seen": 31682568, "step": 54910 }, { "epoch": 8.17917783735478, "grad_norm": 1.8685802221298218, "learning_rate": 4.882685145212754e-06, "loss": 0.6177, "num_input_tokens_seen": 31685448, "step": 54915 }, { "epoch": 8.17992254989574, "grad_norm": 2.6714320182800293, "learning_rate": 4.878827515915643e-06, "loss": 0.6637, "num_input_tokens_seen": 31688264, "step": 54920 }, { "epoch": 8.1806672624367, "grad_norm": 1.6355748176574707, "learning_rate": 4.8749712463255605e-06, "loss": 0.4588, "num_input_tokens_seen": 31691336, "step": 54925 }, { "epoch": 8.181411974977658, "grad_norm": 2.402104377746582, "learning_rate": 4.871116336703099e-06, "loss": 0.4087, "num_input_tokens_seen": 31694376, "step": 54930 }, { "epoch": 8.182156687518617, "grad_norm": 0.8490108847618103, "learning_rate": 4.867262787308765e-06, "loss": 0.4979, "num_input_tokens_seen": 31697352, "step": 54935 }, { "epoch": 8.182901400059578, "grad_norm": 4.613277912139893, "learning_rate": 4.863410598402959e-06, "loss": 0.7327, "num_input_tokens_seen": 31700232, "step": 54940 }, { "epoch": 8.183646112600536, "grad_norm": 2.7761759757995605, "learning_rate": 4.859559770245986e-06, "loss": 0.6141, "num_input_tokens_seen": 31703240, "step": 54945 }, { "epoch": 8.184390825141495, "grad_norm": 2.25490665435791, "learning_rate": 4.855710303098082e-06, "loss": 0.4703, "num_input_tokens_seen": 31705992, "step": 54950 }, { "epoch": 8.185135537682454, "grad_norm": 1.9318270683288574, "learning_rate": 4.851862197219373e-06, "loss": 0.5905, "num_input_tokens_seen": 31709192, "step": 54955 }, { "epoch": 8.185880250223414, "grad_norm": 2.7420077323913574, "learning_rate": 4.8480154528699e-06, "loss": 0.4461, "num_input_tokens_seen": 31711880, "step": 54960 }, { "epoch": 8.186624962764373, "grad_norm": 3.9342775344848633, "learning_rate": 4.844170070309612e-06, "loss": 0.4488, "num_input_tokens_seen": 31714568, "step": 54965 }, { "epoch": 8.187369675305332, "grad_norm": 2.8671412467956543, "learning_rate": 4.840326049798369e-06, "loss": 0.5798, "num_input_tokens_seen": 31717480, "step": 54970 }, { "epoch": 8.18811438784629, "grad_norm": 3.4606869220733643, "learning_rate": 4.83648339159592e-06, "loss": 0.6726, "num_input_tokens_seen": 31720488, "step": 54975 }, { "epoch": 8.188859100387251, "grad_norm": 2.1359457969665527, "learning_rate": 4.832642095961953e-06, "loss": 0.5621, "num_input_tokens_seen": 31723368, "step": 54980 }, { "epoch": 8.18960381292821, "grad_norm": 4.192141056060791, "learning_rate": 4.828802163156032e-06, "loss": 0.5326, "num_input_tokens_seen": 31726344, "step": 54985 }, { "epoch": 8.190348525469169, "grad_norm": 3.0241799354553223, "learning_rate": 4.824963593437648e-06, "loss": 0.673, "num_input_tokens_seen": 31729096, "step": 54990 }, { "epoch": 8.191093238010128, "grad_norm": 2.150280475616455, "learning_rate": 4.821126387066202e-06, "loss": 0.6295, "num_input_tokens_seen": 31731976, "step": 54995 }, { "epoch": 8.191837950551088, "grad_norm": 3.951590061187744, "learning_rate": 4.817290544300998e-06, "loss": 0.6714, "num_input_tokens_seen": 31735304, "step": 55000 }, { "epoch": 8.192582663092047, "grad_norm": 2.837303876876831, "learning_rate": 4.813456065401237e-06, "loss": 0.8633, "num_input_tokens_seen": 31737928, "step": 55005 }, { "epoch": 8.193327375633006, "grad_norm": 1.0655699968338013, "learning_rate": 4.809622950626041e-06, "loss": 0.4618, "num_input_tokens_seen": 31740776, "step": 55010 }, { "epoch": 8.194072088173964, "grad_norm": 3.2180585861206055, "learning_rate": 4.805791200234441e-06, "loss": 0.5415, "num_input_tokens_seen": 31743432, "step": 55015 }, { "epoch": 8.194816800714925, "grad_norm": 1.602126955986023, "learning_rate": 4.801960814485373e-06, "loss": 0.6147, "num_input_tokens_seen": 31746376, "step": 55020 }, { "epoch": 8.195561513255884, "grad_norm": 2.044037342071533, "learning_rate": 4.798131793637667e-06, "loss": 0.5539, "num_input_tokens_seen": 31749256, "step": 55025 }, { "epoch": 8.196306225796842, "grad_norm": 2.778700351715088, "learning_rate": 4.794304137950079e-06, "loss": 0.566, "num_input_tokens_seen": 31752072, "step": 55030 }, { "epoch": 8.197050938337801, "grad_norm": 4.147776126861572, "learning_rate": 4.790477847681274e-06, "loss": 0.4793, "num_input_tokens_seen": 31754920, "step": 55035 }, { "epoch": 8.19779565087876, "grad_norm": 2.75307035446167, "learning_rate": 4.786652923089804e-06, "loss": 0.5862, "num_input_tokens_seen": 31757672, "step": 55040 }, { "epoch": 8.19854036341972, "grad_norm": 6.951122760772705, "learning_rate": 4.782829364434146e-06, "loss": 0.4843, "num_input_tokens_seen": 31760712, "step": 55045 }, { "epoch": 8.19928507596068, "grad_norm": 2.425269365310669, "learning_rate": 4.779007171972685e-06, "loss": 0.6304, "num_input_tokens_seen": 31763560, "step": 55050 }, { "epoch": 8.200029788501638, "grad_norm": 1.6011345386505127, "learning_rate": 4.775186345963706e-06, "loss": 0.4804, "num_input_tokens_seen": 31766056, "step": 55055 }, { "epoch": 8.200774501042597, "grad_norm": 4.389371395111084, "learning_rate": 4.771366886665412e-06, "loss": 0.4404, "num_input_tokens_seen": 31768712, "step": 55060 }, { "epoch": 8.201519213583557, "grad_norm": 3.23457407951355, "learning_rate": 4.767548794335894e-06, "loss": 0.5441, "num_input_tokens_seen": 31771656, "step": 55065 }, { "epoch": 8.202263926124516, "grad_norm": 2.1843583583831787, "learning_rate": 4.763732069233173e-06, "loss": 0.6564, "num_input_tokens_seen": 31774600, "step": 55070 }, { "epoch": 8.203008638665475, "grad_norm": 2.7499077320098877, "learning_rate": 4.759916711615162e-06, "loss": 0.4663, "num_input_tokens_seen": 31777384, "step": 55075 }, { "epoch": 8.203753351206434, "grad_norm": 6.411637306213379, "learning_rate": 4.756102721739686e-06, "loss": 0.5517, "num_input_tokens_seen": 31780008, "step": 55080 }, { "epoch": 8.204498063747394, "grad_norm": 4.234126091003418, "learning_rate": 4.752290099864484e-06, "loss": 0.5349, "num_input_tokens_seen": 31782728, "step": 55085 }, { "epoch": 8.205242776288353, "grad_norm": 1.785395860671997, "learning_rate": 4.748478846247198e-06, "loss": 0.5345, "num_input_tokens_seen": 31785672, "step": 55090 }, { "epoch": 8.205987488829312, "grad_norm": 2.154371500015259, "learning_rate": 4.744668961145371e-06, "loss": 0.519, "num_input_tokens_seen": 31788328, "step": 55095 }, { "epoch": 8.20673220137027, "grad_norm": 3.1170592308044434, "learning_rate": 4.740860444816472e-06, "loss": 0.502, "num_input_tokens_seen": 31791048, "step": 55100 }, { "epoch": 8.207476913911231, "grad_norm": 2.2273268699645996, "learning_rate": 4.7370532975178575e-06, "loss": 0.4489, "num_input_tokens_seen": 31793768, "step": 55105 }, { "epoch": 8.20822162645219, "grad_norm": 2.6639673709869385, "learning_rate": 4.733247519506789e-06, "loss": 0.5642, "num_input_tokens_seen": 31796840, "step": 55110 }, { "epoch": 8.208966338993148, "grad_norm": 2.7890660762786865, "learning_rate": 4.7294431110404545e-06, "loss": 0.3814, "num_input_tokens_seen": 31799624, "step": 55115 }, { "epoch": 8.209711051534107, "grad_norm": 1.4408056735992432, "learning_rate": 4.725640072375942e-06, "loss": 0.421, "num_input_tokens_seen": 31802344, "step": 55120 }, { "epoch": 8.210455764075068, "grad_norm": 1.3377366065979004, "learning_rate": 4.7218384037702425e-06, "loss": 0.4468, "num_input_tokens_seen": 31805384, "step": 55125 }, { "epoch": 8.211200476616026, "grad_norm": 3.579164981842041, "learning_rate": 4.71803810548026e-06, "loss": 0.6166, "num_input_tokens_seen": 31808328, "step": 55130 }, { "epoch": 8.211945189156985, "grad_norm": 2.4971466064453125, "learning_rate": 4.714239177762808e-06, "loss": 0.5572, "num_input_tokens_seen": 31810952, "step": 55135 }, { "epoch": 8.212689901697944, "grad_norm": 2.8543851375579834, "learning_rate": 4.710441620874589e-06, "loss": 0.5395, "num_input_tokens_seen": 31814120, "step": 55140 }, { "epoch": 8.213434614238905, "grad_norm": 2.399540424346924, "learning_rate": 4.706645435072243e-06, "loss": 0.6107, "num_input_tokens_seen": 31817448, "step": 55145 }, { "epoch": 8.214179326779863, "grad_norm": 3.085381031036377, "learning_rate": 4.702850620612284e-06, "loss": 0.4927, "num_input_tokens_seen": 31820264, "step": 55150 }, { "epoch": 8.214924039320822, "grad_norm": 2.1553232669830322, "learning_rate": 4.699057177751157e-06, "loss": 0.6275, "num_input_tokens_seen": 31823112, "step": 55155 }, { "epoch": 8.21566875186178, "grad_norm": 2.9663500785827637, "learning_rate": 4.695265106745209e-06, "loss": 0.585, "num_input_tokens_seen": 31825864, "step": 55160 }, { "epoch": 8.216413464402741, "grad_norm": 1.0367845296859741, "learning_rate": 4.691474407850699e-06, "loss": 0.3362, "num_input_tokens_seen": 31829000, "step": 55165 }, { "epoch": 8.2171581769437, "grad_norm": 1.404395341873169, "learning_rate": 4.687685081323773e-06, "loss": 0.5771, "num_input_tokens_seen": 31831752, "step": 55170 }, { "epoch": 8.217902889484659, "grad_norm": 2.6712043285369873, "learning_rate": 4.683897127420503e-06, "loss": 0.5851, "num_input_tokens_seen": 31834568, "step": 55175 }, { "epoch": 8.218647602025618, "grad_norm": 5.314250946044922, "learning_rate": 4.680110546396868e-06, "loss": 0.7737, "num_input_tokens_seen": 31837640, "step": 55180 }, { "epoch": 8.219392314566578, "grad_norm": 2.5432589054107666, "learning_rate": 4.676325338508755e-06, "loss": 0.6591, "num_input_tokens_seen": 31840520, "step": 55185 }, { "epoch": 8.220137027107537, "grad_norm": 2.6896395683288574, "learning_rate": 4.672541504011938e-06, "loss": 0.6485, "num_input_tokens_seen": 31843528, "step": 55190 }, { "epoch": 8.220881739648496, "grad_norm": 4.334622859954834, "learning_rate": 4.668759043162121e-06, "loss": 0.5249, "num_input_tokens_seen": 31846152, "step": 55195 }, { "epoch": 8.221626452189454, "grad_norm": 1.7252328395843506, "learning_rate": 4.664977956214914e-06, "loss": 0.4846, "num_input_tokens_seen": 31849096, "step": 55200 }, { "epoch": 8.222371164730415, "grad_norm": 2.48222279548645, "learning_rate": 4.661198243425813e-06, "loss": 0.625, "num_input_tokens_seen": 31851688, "step": 55205 }, { "epoch": 8.223115877271374, "grad_norm": 2.7298743724823, "learning_rate": 4.6574199050502445e-06, "loss": 0.4971, "num_input_tokens_seen": 31854408, "step": 55210 }, { "epoch": 8.223860589812332, "grad_norm": 3.0120391845703125, "learning_rate": 4.653642941343531e-06, "loss": 0.6712, "num_input_tokens_seen": 31857352, "step": 55215 }, { "epoch": 8.224605302353291, "grad_norm": 2.100433588027954, "learning_rate": 4.649867352560905e-06, "loss": 0.4981, "num_input_tokens_seen": 31860104, "step": 55220 }, { "epoch": 8.22535001489425, "grad_norm": 2.9039504528045654, "learning_rate": 4.646093138957514e-06, "loss": 0.5455, "num_input_tokens_seen": 31862856, "step": 55225 }, { "epoch": 8.22609472743521, "grad_norm": 2.202122926712036, "learning_rate": 4.6423203007883886e-06, "loss": 0.6291, "num_input_tokens_seen": 31865576, "step": 55230 }, { "epoch": 8.22683943997617, "grad_norm": 3.263770818710327, "learning_rate": 4.638548838308493e-06, "loss": 0.5598, "num_input_tokens_seen": 31868584, "step": 55235 }, { "epoch": 8.227584152517128, "grad_norm": 2.1693456172943115, "learning_rate": 4.6347787517726785e-06, "loss": 0.4233, "num_input_tokens_seen": 31871304, "step": 55240 }, { "epoch": 8.228328865058087, "grad_norm": 2.144486904144287, "learning_rate": 4.6310100414357185e-06, "loss": 0.4459, "num_input_tokens_seen": 31874056, "step": 55245 }, { "epoch": 8.229073577599047, "grad_norm": 2.7148797512054443, "learning_rate": 4.6272427075522845e-06, "loss": 0.7462, "num_input_tokens_seen": 31876968, "step": 55250 }, { "epoch": 8.229818290140006, "grad_norm": 2.8948891162872314, "learning_rate": 4.623476750376956e-06, "loss": 0.5259, "num_input_tokens_seen": 31880008, "step": 55255 }, { "epoch": 8.230563002680965, "grad_norm": 3.816375732421875, "learning_rate": 4.6197121701642286e-06, "loss": 0.5988, "num_input_tokens_seen": 31883240, "step": 55260 }, { "epoch": 8.231307715221924, "grad_norm": 5.838834285736084, "learning_rate": 4.615948967168496e-06, "loss": 0.7376, "num_input_tokens_seen": 31886120, "step": 55265 }, { "epoch": 8.232052427762884, "grad_norm": 2.934819221496582, "learning_rate": 4.612187141644056e-06, "loss": 0.7867, "num_input_tokens_seen": 31889192, "step": 55270 }, { "epoch": 8.232797140303843, "grad_norm": 2.445188045501709, "learning_rate": 4.6084266938451135e-06, "loss": 0.5798, "num_input_tokens_seen": 31891880, "step": 55275 }, { "epoch": 8.233541852844802, "grad_norm": 5.528149604797363, "learning_rate": 4.604667624025788e-06, "loss": 0.5971, "num_input_tokens_seen": 31894472, "step": 55280 }, { "epoch": 8.23428656538576, "grad_norm": 1.5826363563537598, "learning_rate": 4.600909932440103e-06, "loss": 0.3335, "num_input_tokens_seen": 31897448, "step": 55285 }, { "epoch": 8.235031277926721, "grad_norm": 1.3092585802078247, "learning_rate": 4.59715361934199e-06, "loss": 0.5397, "num_input_tokens_seen": 31900040, "step": 55290 }, { "epoch": 8.23577599046768, "grad_norm": 2.712702751159668, "learning_rate": 4.593398684985281e-06, "loss": 0.5339, "num_input_tokens_seen": 31902856, "step": 55295 }, { "epoch": 8.236520703008638, "grad_norm": 1.8430477380752563, "learning_rate": 4.589645129623729e-06, "loss": 0.4666, "num_input_tokens_seen": 31905672, "step": 55300 }, { "epoch": 8.237265415549597, "grad_norm": 3.0073888301849365, "learning_rate": 4.585892953510978e-06, "loss": 0.4718, "num_input_tokens_seen": 31908552, "step": 55305 }, { "epoch": 8.238010128090558, "grad_norm": 2.1811888217926025, "learning_rate": 4.582142156900576e-06, "loss": 0.5711, "num_input_tokens_seen": 31911400, "step": 55310 }, { "epoch": 8.238754840631517, "grad_norm": 2.1422011852264404, "learning_rate": 4.578392740045994e-06, "loss": 0.449, "num_input_tokens_seen": 31914024, "step": 55315 }, { "epoch": 8.239499553172475, "grad_norm": 5.276890754699707, "learning_rate": 4.5746447032006005e-06, "loss": 0.822, "num_input_tokens_seen": 31916968, "step": 55320 }, { "epoch": 8.240244265713434, "grad_norm": 3.3534047603607178, "learning_rate": 4.570898046617677e-06, "loss": 0.5795, "num_input_tokens_seen": 31919624, "step": 55325 }, { "epoch": 8.240988978254395, "grad_norm": 4.086873531341553, "learning_rate": 4.567152770550412e-06, "loss": 0.4951, "num_input_tokens_seen": 31922472, "step": 55330 }, { "epoch": 8.241733690795353, "grad_norm": 2.836428642272949, "learning_rate": 4.563408875251882e-06, "loss": 0.5741, "num_input_tokens_seen": 31925128, "step": 55335 }, { "epoch": 8.242478403336312, "grad_norm": 4.0368170738220215, "learning_rate": 4.5596663609750904e-06, "loss": 0.6572, "num_input_tokens_seen": 31928136, "step": 55340 }, { "epoch": 8.24322311587727, "grad_norm": 4.405084609985352, "learning_rate": 4.555925227972946e-06, "loss": 0.7077, "num_input_tokens_seen": 31930856, "step": 55345 }, { "epoch": 8.243967828418231, "grad_norm": 1.7724823951721191, "learning_rate": 4.552185476498252e-06, "loss": 0.5317, "num_input_tokens_seen": 31933640, "step": 55350 }, { "epoch": 8.24471254095919, "grad_norm": 2.9582839012145996, "learning_rate": 4.5484471068037275e-06, "loss": 0.4824, "num_input_tokens_seen": 31936744, "step": 55355 }, { "epoch": 8.245457253500149, "grad_norm": 3.183187246322632, "learning_rate": 4.544710119141996e-06, "loss": 0.3992, "num_input_tokens_seen": 31939624, "step": 55360 }, { "epoch": 8.246201966041108, "grad_norm": 2.567690849304199, "learning_rate": 4.540974513765597e-06, "loss": 0.5561, "num_input_tokens_seen": 31942696, "step": 55365 }, { "epoch": 8.246946678582066, "grad_norm": 4.202817916870117, "learning_rate": 4.537240290926955e-06, "loss": 0.6425, "num_input_tokens_seen": 31945736, "step": 55370 }, { "epoch": 8.247691391123027, "grad_norm": 2.243669271469116, "learning_rate": 4.5335074508784185e-06, "loss": 0.4568, "num_input_tokens_seen": 31948456, "step": 55375 }, { "epoch": 8.248436103663986, "grad_norm": 1.9183884859085083, "learning_rate": 4.529775993872237e-06, "loss": 0.6621, "num_input_tokens_seen": 31951496, "step": 55380 }, { "epoch": 8.249180816204944, "grad_norm": 1.8400667905807495, "learning_rate": 4.526045920160574e-06, "loss": 0.5752, "num_input_tokens_seen": 31954152, "step": 55385 }, { "epoch": 8.249925528745903, "grad_norm": 4.051662921905518, "learning_rate": 4.522317229995479e-06, "loss": 0.5557, "num_input_tokens_seen": 31956968, "step": 55390 }, { "epoch": 8.250670241286864, "grad_norm": 3.7874245643615723, "learning_rate": 4.518589923628932e-06, "loss": 0.5859, "num_input_tokens_seen": 31959784, "step": 55395 }, { "epoch": 8.251414953827823, "grad_norm": 2.43461012840271, "learning_rate": 4.514864001312813e-06, "loss": 0.687, "num_input_tokens_seen": 31962600, "step": 55400 }, { "epoch": 8.252159666368781, "grad_norm": 2.558241128921509, "learning_rate": 4.511139463298891e-06, "loss": 0.5384, "num_input_tokens_seen": 31965544, "step": 55405 }, { "epoch": 8.25290437890974, "grad_norm": 3.0360047817230225, "learning_rate": 4.507416309838861e-06, "loss": 0.4831, "num_input_tokens_seen": 31968264, "step": 55410 }, { "epoch": 8.2536490914507, "grad_norm": 2.0794498920440674, "learning_rate": 4.503694541184322e-06, "loss": 0.8099, "num_input_tokens_seen": 31971016, "step": 55415 }, { "epoch": 8.25439380399166, "grad_norm": 1.7886202335357666, "learning_rate": 4.499974157586773e-06, "loss": 0.5361, "num_input_tokens_seen": 31974152, "step": 55420 }, { "epoch": 8.255138516532618, "grad_norm": 1.5841503143310547, "learning_rate": 4.49625515929763e-06, "loss": 0.5068, "num_input_tokens_seen": 31976936, "step": 55425 }, { "epoch": 8.255883229073577, "grad_norm": 3.4583048820495605, "learning_rate": 4.492537546568196e-06, "loss": 0.5122, "num_input_tokens_seen": 31979784, "step": 55430 }, { "epoch": 8.256627941614537, "grad_norm": 1.8090949058532715, "learning_rate": 4.488821319649702e-06, "loss": 0.447, "num_input_tokens_seen": 31982440, "step": 55435 }, { "epoch": 8.257372654155496, "grad_norm": 3.5241334438323975, "learning_rate": 4.485106478793266e-06, "loss": 0.5195, "num_input_tokens_seen": 31985448, "step": 55440 }, { "epoch": 8.258117366696455, "grad_norm": 2.2534332275390625, "learning_rate": 4.481393024249925e-06, "loss": 0.471, "num_input_tokens_seen": 31988488, "step": 55445 }, { "epoch": 8.258862079237414, "grad_norm": 3.814821243286133, "learning_rate": 4.477680956270621e-06, "loss": 0.6228, "num_input_tokens_seen": 31991432, "step": 55450 }, { "epoch": 8.259606791778374, "grad_norm": 4.443065643310547, "learning_rate": 4.4739702751062015e-06, "loss": 0.5691, "num_input_tokens_seen": 31994408, "step": 55455 }, { "epoch": 8.260351504319333, "grad_norm": 3.931699514389038, "learning_rate": 4.470260981007418e-06, "loss": 0.7098, "num_input_tokens_seen": 31997192, "step": 55460 }, { "epoch": 8.261096216860292, "grad_norm": 1.3831197023391724, "learning_rate": 4.466553074224936e-06, "loss": 0.6736, "num_input_tokens_seen": 31999848, "step": 55465 }, { "epoch": 8.26184092940125, "grad_norm": 2.9289181232452393, "learning_rate": 4.462846555009312e-06, "loss": 0.7255, "num_input_tokens_seen": 32002856, "step": 55470 }, { "epoch": 8.262585641942211, "grad_norm": 2.9005608558654785, "learning_rate": 4.459141423611016e-06, "loss": 0.4939, "num_input_tokens_seen": 32005640, "step": 55475 }, { "epoch": 8.26333035448317, "grad_norm": 4.068122863769531, "learning_rate": 4.455437680280427e-06, "loss": 0.6014, "num_input_tokens_seen": 32008488, "step": 55480 }, { "epoch": 8.264075067024129, "grad_norm": 4.479648113250732, "learning_rate": 4.451735325267836e-06, "loss": 0.6246, "num_input_tokens_seen": 32011368, "step": 55485 }, { "epoch": 8.264819779565087, "grad_norm": 2.432727336883545, "learning_rate": 4.448034358823424e-06, "loss": 0.5814, "num_input_tokens_seen": 32014312, "step": 55490 }, { "epoch": 8.265564492106048, "grad_norm": 3.672175645828247, "learning_rate": 4.444334781197301e-06, "loss": 0.5576, "num_input_tokens_seen": 32017480, "step": 55495 }, { "epoch": 8.266309204647007, "grad_norm": 2.9683423042297363, "learning_rate": 4.440636592639452e-06, "loss": 0.553, "num_input_tokens_seen": 32020936, "step": 55500 }, { "epoch": 8.267053917187965, "grad_norm": 2.0691280364990234, "learning_rate": 4.436939793399803e-06, "loss": 0.4656, "num_input_tokens_seen": 32023656, "step": 55505 }, { "epoch": 8.267798629728924, "grad_norm": 1.8894715309143066, "learning_rate": 4.433244383728149e-06, "loss": 0.4249, "num_input_tokens_seen": 32026760, "step": 55510 }, { "epoch": 8.268543342269885, "grad_norm": 2.896228313446045, "learning_rate": 4.429550363874224e-06, "loss": 0.6637, "num_input_tokens_seen": 32029544, "step": 55515 }, { "epoch": 8.269288054810843, "grad_norm": 2.8267693519592285, "learning_rate": 4.4258577340876514e-06, "loss": 0.5189, "num_input_tokens_seen": 32032488, "step": 55520 }, { "epoch": 8.270032767351802, "grad_norm": 2.733191967010498, "learning_rate": 4.422166494617966e-06, "loss": 0.7053, "num_input_tokens_seen": 32035528, "step": 55525 }, { "epoch": 8.270777479892761, "grad_norm": 2.6326119899749756, "learning_rate": 4.418476645714609e-06, "loss": 0.6664, "num_input_tokens_seen": 32038696, "step": 55530 }, { "epoch": 8.271522192433721, "grad_norm": 1.5767931938171387, "learning_rate": 4.414788187626917e-06, "loss": 0.6017, "num_input_tokens_seen": 32041640, "step": 55535 }, { "epoch": 8.27226690497468, "grad_norm": 2.1863322257995605, "learning_rate": 4.411101120604147e-06, "loss": 0.5, "num_input_tokens_seen": 32044520, "step": 55540 }, { "epoch": 8.273011617515639, "grad_norm": 13.747346878051758, "learning_rate": 4.40741544489546e-06, "loss": 0.6308, "num_input_tokens_seen": 32047592, "step": 55545 }, { "epoch": 8.273756330056598, "grad_norm": 3.2703030109405518, "learning_rate": 4.403731160749907e-06, "loss": 0.6417, "num_input_tokens_seen": 32050120, "step": 55550 }, { "epoch": 8.274501042597556, "grad_norm": 2.4973905086517334, "learning_rate": 4.400048268416465e-06, "loss": 0.4822, "num_input_tokens_seen": 32053160, "step": 55555 }, { "epoch": 8.275245755138517, "grad_norm": 3.2169764041900635, "learning_rate": 4.396366768144009e-06, "loss": 0.5794, "num_input_tokens_seen": 32055848, "step": 55560 }, { "epoch": 8.275990467679476, "grad_norm": 2.354055643081665, "learning_rate": 4.3926866601813224e-06, "loss": 0.7086, "num_input_tokens_seen": 32058632, "step": 55565 }, { "epoch": 8.276735180220435, "grad_norm": 3.8626997470855713, "learning_rate": 4.389007944777082e-06, "loss": 0.7887, "num_input_tokens_seen": 32061256, "step": 55570 }, { "epoch": 8.277479892761393, "grad_norm": 3.061599016189575, "learning_rate": 4.385330622179887e-06, "loss": 0.4663, "num_input_tokens_seen": 32064392, "step": 55575 }, { "epoch": 8.278224605302354, "grad_norm": 3.010162830352783, "learning_rate": 4.3816546926382345e-06, "loss": 0.6746, "num_input_tokens_seen": 32067368, "step": 55580 }, { "epoch": 8.278969317843313, "grad_norm": 3.553751230239868, "learning_rate": 4.377980156400538e-06, "loss": 0.7819, "num_input_tokens_seen": 32070344, "step": 55585 }, { "epoch": 8.279714030384271, "grad_norm": 2.53376841545105, "learning_rate": 4.374307013715093e-06, "loss": 0.7002, "num_input_tokens_seen": 32073096, "step": 55590 }, { "epoch": 8.28045874292523, "grad_norm": 3.465228319168091, "learning_rate": 4.370635264830122e-06, "loss": 0.5081, "num_input_tokens_seen": 32076296, "step": 55595 }, { "epoch": 8.28120345546619, "grad_norm": 2.5210883617401123, "learning_rate": 4.366964909993751e-06, "loss": 0.6206, "num_input_tokens_seen": 32078984, "step": 55600 }, { "epoch": 8.28194816800715, "grad_norm": 4.259672164916992, "learning_rate": 4.363295949453999e-06, "loss": 0.3788, "num_input_tokens_seen": 32081736, "step": 55605 }, { "epoch": 8.282692880548108, "grad_norm": 2.1344738006591797, "learning_rate": 4.3596283834588054e-06, "loss": 0.5751, "num_input_tokens_seen": 32084744, "step": 55610 }, { "epoch": 8.283437593089067, "grad_norm": 6.36708402633667, "learning_rate": 4.355962212256006e-06, "loss": 0.6418, "num_input_tokens_seen": 32087720, "step": 55615 }, { "epoch": 8.284182305630027, "grad_norm": 1.902288556098938, "learning_rate": 4.3522974360933475e-06, "loss": 0.6915, "num_input_tokens_seen": 32090504, "step": 55620 }, { "epoch": 8.284927018170986, "grad_norm": 2.023277759552002, "learning_rate": 4.348634055218489e-06, "loss": 0.5437, "num_input_tokens_seen": 32093640, "step": 55625 }, { "epoch": 8.285671730711945, "grad_norm": 2.8815860748291016, "learning_rate": 4.34497206987897e-06, "loss": 0.5182, "num_input_tokens_seen": 32096552, "step": 55630 }, { "epoch": 8.286416443252904, "grad_norm": 2.2778704166412354, "learning_rate": 4.3413114803222685e-06, "loss": 0.5916, "num_input_tokens_seen": 32099368, "step": 55635 }, { "epoch": 8.287161155793864, "grad_norm": 2.7568202018737793, "learning_rate": 4.33765228679574e-06, "loss": 0.5321, "num_input_tokens_seen": 32102152, "step": 55640 }, { "epoch": 8.287905868334823, "grad_norm": 5.325735569000244, "learning_rate": 4.333994489546661e-06, "loss": 0.6225, "num_input_tokens_seen": 32105192, "step": 55645 }, { "epoch": 8.288650580875782, "grad_norm": 2.286351203918457, "learning_rate": 4.330338088822214e-06, "loss": 0.6945, "num_input_tokens_seen": 32108264, "step": 55650 }, { "epoch": 8.28939529341674, "grad_norm": 2.0978403091430664, "learning_rate": 4.3266830848694815e-06, "loss": 0.641, "num_input_tokens_seen": 32111112, "step": 55655 }, { "epoch": 8.290140005957701, "grad_norm": 2.8663904666900635, "learning_rate": 4.3230294779354615e-06, "loss": 0.3633, "num_input_tokens_seen": 32114472, "step": 55660 }, { "epoch": 8.29088471849866, "grad_norm": 2.126527786254883, "learning_rate": 4.319377268267035e-06, "loss": 0.5597, "num_input_tokens_seen": 32117544, "step": 55665 }, { "epoch": 8.291629431039619, "grad_norm": 4.367660999298096, "learning_rate": 4.315726456111022e-06, "loss": 0.6796, "num_input_tokens_seen": 32120360, "step": 55670 }, { "epoch": 8.292374143580577, "grad_norm": 1.606789231300354, "learning_rate": 4.312077041714108e-06, "loss": 0.7119, "num_input_tokens_seen": 32123304, "step": 55675 }, { "epoch": 8.293118856121538, "grad_norm": 1.6864705085754395, "learning_rate": 4.3084290253229185e-06, "loss": 0.4746, "num_input_tokens_seen": 32126568, "step": 55680 }, { "epoch": 8.293863568662497, "grad_norm": 2.9688355922698975, "learning_rate": 4.304782407183971e-06, "loss": 0.6076, "num_input_tokens_seen": 32129416, "step": 55685 }, { "epoch": 8.294608281203455, "grad_norm": 5.478464126586914, "learning_rate": 4.3011371875436856e-06, "loss": 0.4903, "num_input_tokens_seen": 32132232, "step": 55690 }, { "epoch": 8.295352993744414, "grad_norm": 2.7008206844329834, "learning_rate": 4.2974933666484e-06, "loss": 0.6045, "num_input_tokens_seen": 32135400, "step": 55695 }, { "epoch": 8.296097706285375, "grad_norm": 7.991607666015625, "learning_rate": 4.293850944744337e-06, "loss": 0.5914, "num_input_tokens_seen": 32138088, "step": 55700 }, { "epoch": 8.296842418826333, "grad_norm": 5.860049247741699, "learning_rate": 4.290209922077643e-06, "loss": 0.9033, "num_input_tokens_seen": 32141064, "step": 55705 }, { "epoch": 8.297587131367292, "grad_norm": 3.422597646713257, "learning_rate": 4.286570298894365e-06, "loss": 0.5131, "num_input_tokens_seen": 32143848, "step": 55710 }, { "epoch": 8.298331843908251, "grad_norm": 3.242023468017578, "learning_rate": 4.282932075440449e-06, "loss": 0.4981, "num_input_tokens_seen": 32146696, "step": 55715 }, { "epoch": 8.299076556449211, "grad_norm": 3.141523599624634, "learning_rate": 4.279295251961754e-06, "loss": 0.5547, "num_input_tokens_seen": 32149896, "step": 55720 }, { "epoch": 8.29982126899017, "grad_norm": 3.5211219787597656, "learning_rate": 4.27565982870404e-06, "loss": 0.6984, "num_input_tokens_seen": 32152872, "step": 55725 }, { "epoch": 8.300565981531129, "grad_norm": 2.735525608062744, "learning_rate": 4.272025805912982e-06, "loss": 0.6577, "num_input_tokens_seen": 32155592, "step": 55730 }, { "epoch": 8.301310694072088, "grad_norm": 3.466573476791382, "learning_rate": 4.26839318383414e-06, "loss": 0.4833, "num_input_tokens_seen": 32158312, "step": 55735 }, { "epoch": 8.302055406613047, "grad_norm": 3.716898202896118, "learning_rate": 4.2647619627129986e-06, "loss": 0.4951, "num_input_tokens_seen": 32161096, "step": 55740 }, { "epoch": 8.302800119154007, "grad_norm": 2.8802902698516846, "learning_rate": 4.261132142794941e-06, "loss": 0.587, "num_input_tokens_seen": 32163880, "step": 55745 }, { "epoch": 8.303544831694966, "grad_norm": 4.471235752105713, "learning_rate": 4.25750372432526e-06, "loss": 0.6583, "num_input_tokens_seen": 32166696, "step": 55750 }, { "epoch": 8.304289544235925, "grad_norm": 2.08596134185791, "learning_rate": 4.2538767075491394e-06, "loss": 0.3958, "num_input_tokens_seen": 32169512, "step": 55755 }, { "epoch": 8.305034256776883, "grad_norm": 3.417544364929199, "learning_rate": 4.250251092711682e-06, "loss": 0.5294, "num_input_tokens_seen": 32172200, "step": 55760 }, { "epoch": 8.305778969317844, "grad_norm": 1.4184796810150146, "learning_rate": 4.2466268800579026e-06, "loss": 0.605, "num_input_tokens_seen": 32174952, "step": 55765 }, { "epoch": 8.306523681858803, "grad_norm": 2.354016065597534, "learning_rate": 4.243004069832693e-06, "loss": 0.6107, "num_input_tokens_seen": 32177768, "step": 55770 }, { "epoch": 8.307268394399761, "grad_norm": 2.334057569503784, "learning_rate": 4.239382662280875e-06, "loss": 0.4727, "num_input_tokens_seen": 32180648, "step": 55775 }, { "epoch": 8.30801310694072, "grad_norm": 3.3279082775115967, "learning_rate": 4.235762657647172e-06, "loss": 0.6771, "num_input_tokens_seen": 32183400, "step": 55780 }, { "epoch": 8.30875781948168, "grad_norm": 3.3604061603546143, "learning_rate": 4.232144056176207e-06, "loss": 0.772, "num_input_tokens_seen": 32186120, "step": 55785 }, { "epoch": 8.30950253202264, "grad_norm": 3.5231504440307617, "learning_rate": 4.2285268581125165e-06, "loss": 0.6762, "num_input_tokens_seen": 32188776, "step": 55790 }, { "epoch": 8.310247244563598, "grad_norm": 3.507662534713745, "learning_rate": 4.224911063700526e-06, "loss": 0.5605, "num_input_tokens_seen": 32191752, "step": 55795 }, { "epoch": 8.310991957104557, "grad_norm": 2.9353997707366943, "learning_rate": 4.221296673184585e-06, "loss": 0.6045, "num_input_tokens_seen": 32194664, "step": 55800 }, { "epoch": 8.311736669645517, "grad_norm": 4.306057929992676, "learning_rate": 4.217683686808929e-06, "loss": 0.644, "num_input_tokens_seen": 32197384, "step": 55805 }, { "epoch": 8.312481382186476, "grad_norm": 2.8252503871917725, "learning_rate": 4.214072104817715e-06, "loss": 0.4391, "num_input_tokens_seen": 32200360, "step": 55810 }, { "epoch": 8.313226094727435, "grad_norm": 2.382451057434082, "learning_rate": 4.2104619274549975e-06, "loss": 0.4164, "num_input_tokens_seen": 32203304, "step": 55815 }, { "epoch": 8.313970807268394, "grad_norm": 2.8090784549713135, "learning_rate": 4.2068531549647405e-06, "loss": 0.6864, "num_input_tokens_seen": 32206152, "step": 55820 }, { "epoch": 8.314715519809354, "grad_norm": 1.8588547706604004, "learning_rate": 4.203245787590815e-06, "loss": 0.3545, "num_input_tokens_seen": 32208968, "step": 55825 }, { "epoch": 8.315460232350313, "grad_norm": 5.382673740386963, "learning_rate": 4.199639825576979e-06, "loss": 0.6074, "num_input_tokens_seen": 32212232, "step": 55830 }, { "epoch": 8.316204944891272, "grad_norm": 6.584933280944824, "learning_rate": 4.196035269166921e-06, "loss": 0.5477, "num_input_tokens_seen": 32215080, "step": 55835 }, { "epoch": 8.31694965743223, "grad_norm": 5.2837419509887695, "learning_rate": 4.192432118604209e-06, "loss": 0.6154, "num_input_tokens_seen": 32218120, "step": 55840 }, { "epoch": 8.317694369973191, "grad_norm": 3.3430192470550537, "learning_rate": 4.188830374132341e-06, "loss": 0.6253, "num_input_tokens_seen": 32220968, "step": 55845 }, { "epoch": 8.31843908251415, "grad_norm": 3.4082536697387695, "learning_rate": 4.1852300359946996e-06, "loss": 0.5523, "num_input_tokens_seen": 32223720, "step": 55850 }, { "epoch": 8.319183795055109, "grad_norm": 2.7247202396392822, "learning_rate": 4.181631104434588e-06, "loss": 0.5498, "num_input_tokens_seen": 32226696, "step": 55855 }, { "epoch": 8.319928507596067, "grad_norm": 2.1494710445404053, "learning_rate": 4.178033579695212e-06, "loss": 0.7335, "num_input_tokens_seen": 32229608, "step": 55860 }, { "epoch": 8.320673220137028, "grad_norm": 1.896865963935852, "learning_rate": 4.174437462019665e-06, "loss": 0.3609, "num_input_tokens_seen": 32232808, "step": 55865 }, { "epoch": 8.321417932677987, "grad_norm": 2.595094919204712, "learning_rate": 4.170842751650969e-06, "loss": 0.5852, "num_input_tokens_seen": 32235560, "step": 55870 }, { "epoch": 8.322162645218945, "grad_norm": 3.079998016357422, "learning_rate": 4.167249448832028e-06, "loss": 0.5109, "num_input_tokens_seen": 32238504, "step": 55875 }, { "epoch": 8.322907357759904, "grad_norm": 5.529175758361816, "learning_rate": 4.163657553805669e-06, "loss": 0.4527, "num_input_tokens_seen": 32241064, "step": 55880 }, { "epoch": 8.323652070300863, "grad_norm": 3.655583620071411, "learning_rate": 4.160067066814619e-06, "loss": 0.6417, "num_input_tokens_seen": 32243816, "step": 55885 }, { "epoch": 8.324396782841823, "grad_norm": 3.2394583225250244, "learning_rate": 4.156477988101507e-06, "loss": 0.5793, "num_input_tokens_seen": 32246696, "step": 55890 }, { "epoch": 8.325141495382782, "grad_norm": 6.028539657592773, "learning_rate": 4.152890317908875e-06, "loss": 0.4778, "num_input_tokens_seen": 32249544, "step": 55895 }, { "epoch": 8.325886207923741, "grad_norm": 3.3465735912323, "learning_rate": 4.149304056479153e-06, "loss": 0.4398, "num_input_tokens_seen": 32252360, "step": 55900 }, { "epoch": 8.3266309204647, "grad_norm": 5.410567760467529, "learning_rate": 4.145719204054688e-06, "loss": 0.4232, "num_input_tokens_seen": 32254888, "step": 55905 }, { "epoch": 8.32737563300566, "grad_norm": 2.179288625717163, "learning_rate": 4.1421357608777386e-06, "loss": 0.5547, "num_input_tokens_seen": 32257768, "step": 55910 }, { "epoch": 8.328120345546619, "grad_norm": 2.3116629123687744, "learning_rate": 4.138553727190447e-06, "loss": 0.6682, "num_input_tokens_seen": 32260840, "step": 55915 }, { "epoch": 8.328865058087578, "grad_norm": 2.6381356716156006, "learning_rate": 4.134973103234877e-06, "loss": 0.7031, "num_input_tokens_seen": 32263976, "step": 55920 }, { "epoch": 8.329609770628537, "grad_norm": 3.3566083908081055, "learning_rate": 4.131393889252996e-06, "loss": 0.4126, "num_input_tokens_seen": 32267048, "step": 55925 }, { "epoch": 8.330354483169497, "grad_norm": 2.087362289428711, "learning_rate": 4.127816085486674e-06, "loss": 0.7403, "num_input_tokens_seen": 32269832, "step": 55930 }, { "epoch": 8.331099195710456, "grad_norm": 3.1034228801727295, "learning_rate": 4.124239692177675e-06, "loss": 0.6819, "num_input_tokens_seen": 32272488, "step": 55935 }, { "epoch": 8.331843908251415, "grad_norm": 2.8004164695739746, "learning_rate": 4.120664709567684e-06, "loss": 0.8253, "num_input_tokens_seen": 32275400, "step": 55940 }, { "epoch": 8.332588620792373, "grad_norm": 2.661651611328125, "learning_rate": 4.117091137898282e-06, "loss": 0.4605, "num_input_tokens_seen": 32278536, "step": 55945 }, { "epoch": 8.333333333333334, "grad_norm": 3.570354700088501, "learning_rate": 4.113518977410963e-06, "loss": 0.5346, "num_input_tokens_seen": 32281544, "step": 55950 }, { "epoch": 8.334078045874293, "grad_norm": 3.818667411804199, "learning_rate": 4.109948228347108e-06, "loss": 0.5882, "num_input_tokens_seen": 32284488, "step": 55955 }, { "epoch": 8.334822758415251, "grad_norm": 4.068828582763672, "learning_rate": 4.1063788909480175e-06, "loss": 0.6, "num_input_tokens_seen": 32287080, "step": 55960 }, { "epoch": 8.33556747095621, "grad_norm": 2.5797841548919678, "learning_rate": 4.102810965454904e-06, "loss": 0.6325, "num_input_tokens_seen": 32289960, "step": 55965 }, { "epoch": 8.33631218349717, "grad_norm": 2.2564327716827393, "learning_rate": 4.099244452108855e-06, "loss": 0.5845, "num_input_tokens_seen": 32292776, "step": 55970 }, { "epoch": 8.33705689603813, "grad_norm": 2.085103988647461, "learning_rate": 4.0956793511508885e-06, "loss": 0.4304, "num_input_tokens_seen": 32295784, "step": 55975 }, { "epoch": 8.337801608579088, "grad_norm": 3.4623303413391113, "learning_rate": 4.092115662821921e-06, "loss": 0.5536, "num_input_tokens_seen": 32298440, "step": 55980 }, { "epoch": 8.338546321120047, "grad_norm": 4.594332695007324, "learning_rate": 4.088553387362773e-06, "loss": 0.494, "num_input_tokens_seen": 32301128, "step": 55985 }, { "epoch": 8.339291033661008, "grad_norm": 1.4110865592956543, "learning_rate": 4.084992525014172e-06, "loss": 0.4743, "num_input_tokens_seen": 32304136, "step": 55990 }, { "epoch": 8.340035746201966, "grad_norm": 3.5160694122314453, "learning_rate": 4.081433076016739e-06, "loss": 0.7372, "num_input_tokens_seen": 32307048, "step": 55995 }, { "epoch": 8.340780458742925, "grad_norm": 2.578138589859009, "learning_rate": 4.077875040611015e-06, "loss": 0.647, "num_input_tokens_seen": 32310024, "step": 56000 }, { "epoch": 8.341525171283884, "grad_norm": 2.9404892921447754, "learning_rate": 4.074318419037424e-06, "loss": 0.4921, "num_input_tokens_seen": 32312840, "step": 56005 }, { "epoch": 8.342269883824844, "grad_norm": 3.780651807785034, "learning_rate": 4.070763211536319e-06, "loss": 0.4988, "num_input_tokens_seen": 32315816, "step": 56010 }, { "epoch": 8.343014596365803, "grad_norm": 1.612338662147522, "learning_rate": 4.067209418347942e-06, "loss": 0.5593, "num_input_tokens_seen": 32318472, "step": 56015 }, { "epoch": 8.343759308906762, "grad_norm": 1.1043654680252075, "learning_rate": 4.063657039712448e-06, "loss": 0.4601, "num_input_tokens_seen": 32321544, "step": 56020 }, { "epoch": 8.34450402144772, "grad_norm": 1.7433308362960815, "learning_rate": 4.0601060758698965e-06, "loss": 0.5229, "num_input_tokens_seen": 32324424, "step": 56025 }, { "epoch": 8.345248733988681, "grad_norm": 2.6847753524780273, "learning_rate": 4.05655652706024e-06, "loss": 0.6278, "num_input_tokens_seen": 32327304, "step": 56030 }, { "epoch": 8.34599344652964, "grad_norm": 1.2047300338745117, "learning_rate": 4.053008393523336e-06, "loss": 0.4617, "num_input_tokens_seen": 32330024, "step": 56035 }, { "epoch": 8.346738159070599, "grad_norm": 4.316982269287109, "learning_rate": 4.049461675498961e-06, "loss": 0.5321, "num_input_tokens_seen": 32332680, "step": 56040 }, { "epoch": 8.347482871611557, "grad_norm": 1.4901964664459229, "learning_rate": 4.045916373226791e-06, "loss": 0.5462, "num_input_tokens_seen": 32335784, "step": 56045 }, { "epoch": 8.348227584152518, "grad_norm": 6.844937324523926, "learning_rate": 4.042372486946394e-06, "loss": 0.5363, "num_input_tokens_seen": 32338888, "step": 56050 }, { "epoch": 8.348972296693477, "grad_norm": 5.5862016677856445, "learning_rate": 4.03883001689726e-06, "loss": 0.7467, "num_input_tokens_seen": 32341928, "step": 56055 }, { "epoch": 8.349717009234435, "grad_norm": 2.7949893474578857, "learning_rate": 4.035288963318778e-06, "loss": 0.3423, "num_input_tokens_seen": 32344552, "step": 56060 }, { "epoch": 8.350461721775394, "grad_norm": 3.74241304397583, "learning_rate": 4.031749326450224e-06, "loss": 0.6251, "num_input_tokens_seen": 32347208, "step": 56065 }, { "epoch": 8.351206434316353, "grad_norm": 2.3882198333740234, "learning_rate": 4.028211106530808e-06, "loss": 0.5176, "num_input_tokens_seen": 32350376, "step": 56070 }, { "epoch": 8.351951146857314, "grad_norm": 3.7838263511657715, "learning_rate": 4.024674303799611e-06, "loss": 0.5546, "num_input_tokens_seen": 32353000, "step": 56075 }, { "epoch": 8.352695859398272, "grad_norm": 4.525389671325684, "learning_rate": 4.021138918495648e-06, "loss": 0.7842, "num_input_tokens_seen": 32355816, "step": 56080 }, { "epoch": 8.353440571939231, "grad_norm": 2.1098713874816895, "learning_rate": 4.017604950857823e-06, "loss": 0.4799, "num_input_tokens_seen": 32358440, "step": 56085 }, { "epoch": 8.35418528448019, "grad_norm": 4.037532329559326, "learning_rate": 4.014072401124946e-06, "loss": 0.6221, "num_input_tokens_seen": 32361128, "step": 56090 }, { "epoch": 8.35492999702115, "grad_norm": 2.2645881175994873, "learning_rate": 4.0105412695357395e-06, "loss": 0.6689, "num_input_tokens_seen": 32363944, "step": 56095 }, { "epoch": 8.35567470956211, "grad_norm": 5.584660053253174, "learning_rate": 4.0070115563288105e-06, "loss": 0.694, "num_input_tokens_seen": 32366568, "step": 56100 }, { "epoch": 8.356419422103068, "grad_norm": 3.3264942169189453, "learning_rate": 4.003483261742691e-06, "loss": 0.4159, "num_input_tokens_seen": 32369352, "step": 56105 }, { "epoch": 8.357164134644027, "grad_norm": 2.6165401935577393, "learning_rate": 3.999956386015813e-06, "loss": 0.6096, "num_input_tokens_seen": 32372296, "step": 56110 }, { "epoch": 8.357908847184987, "grad_norm": 3.740215539932251, "learning_rate": 3.996430929386494e-06, "loss": 0.5621, "num_input_tokens_seen": 32375176, "step": 56115 }, { "epoch": 8.358653559725946, "grad_norm": 1.7897318601608276, "learning_rate": 3.992906892092979e-06, "loss": 0.6513, "num_input_tokens_seen": 32377928, "step": 56120 }, { "epoch": 8.359398272266905, "grad_norm": 3.166778087615967, "learning_rate": 3.989384274373409e-06, "loss": 0.4744, "num_input_tokens_seen": 32380776, "step": 56125 }, { "epoch": 8.360142984807863, "grad_norm": 2.557701826095581, "learning_rate": 3.985863076465835e-06, "loss": 0.5078, "num_input_tokens_seen": 32383592, "step": 56130 }, { "epoch": 8.360887697348824, "grad_norm": 2.9474451541900635, "learning_rate": 3.9823432986081876e-06, "loss": 0.6889, "num_input_tokens_seen": 32386856, "step": 56135 }, { "epoch": 8.361632409889783, "grad_norm": 2.8069114685058594, "learning_rate": 3.978824941038328e-06, "loss": 0.6394, "num_input_tokens_seen": 32389608, "step": 56140 }, { "epoch": 8.362377122430741, "grad_norm": 2.30873966217041, "learning_rate": 3.975308003994016e-06, "loss": 0.7026, "num_input_tokens_seen": 32392488, "step": 56145 }, { "epoch": 8.3631218349717, "grad_norm": 2.2626285552978516, "learning_rate": 3.971792487712914e-06, "loss": 0.6118, "num_input_tokens_seen": 32395784, "step": 56150 }, { "epoch": 8.36386654751266, "grad_norm": 2.808659553527832, "learning_rate": 3.968278392432573e-06, "loss": 0.5181, "num_input_tokens_seen": 32398536, "step": 56155 }, { "epoch": 8.36461126005362, "grad_norm": 3.9637882709503174, "learning_rate": 3.964765718390473e-06, "loss": 0.5225, "num_input_tokens_seen": 32401224, "step": 56160 }, { "epoch": 8.365355972594578, "grad_norm": 6.4492011070251465, "learning_rate": 3.961254465823985e-06, "loss": 0.5474, "num_input_tokens_seen": 32404104, "step": 56165 }, { "epoch": 8.366100685135537, "grad_norm": 3.705864667892456, "learning_rate": 3.957744634970378e-06, "loss": 0.5302, "num_input_tokens_seen": 32407016, "step": 56170 }, { "epoch": 8.366845397676498, "grad_norm": 6.574367523193359, "learning_rate": 3.954236226066838e-06, "loss": 0.3325, "num_input_tokens_seen": 32410024, "step": 56175 }, { "epoch": 8.367590110217456, "grad_norm": 5.445921421051025, "learning_rate": 3.950729239350448e-06, "loss": 0.6058, "num_input_tokens_seen": 32412872, "step": 56180 }, { "epoch": 8.368334822758415, "grad_norm": 4.156252384185791, "learning_rate": 3.947223675058195e-06, "loss": 0.6008, "num_input_tokens_seen": 32415464, "step": 56185 }, { "epoch": 8.369079535299374, "grad_norm": 2.82597017288208, "learning_rate": 3.943719533426979e-06, "loss": 0.5479, "num_input_tokens_seen": 32418600, "step": 56190 }, { "epoch": 8.369824247840334, "grad_norm": 2.2910337448120117, "learning_rate": 3.940216814693587e-06, "loss": 0.6433, "num_input_tokens_seen": 32421320, "step": 56195 }, { "epoch": 8.370568960381293, "grad_norm": 3.5592000484466553, "learning_rate": 3.936715519094716e-06, "loss": 0.677, "num_input_tokens_seen": 32424136, "step": 56200 }, { "epoch": 8.371313672922252, "grad_norm": 4.096484184265137, "learning_rate": 3.933215646866972e-06, "loss": 0.6591, "num_input_tokens_seen": 32426952, "step": 56205 }, { "epoch": 8.37205838546321, "grad_norm": 2.4007740020751953, "learning_rate": 3.929717198246862e-06, "loss": 0.4896, "num_input_tokens_seen": 32429736, "step": 56210 }, { "epoch": 8.372803098004171, "grad_norm": 2.697624921798706, "learning_rate": 3.926220173470799e-06, "loss": 0.5683, "num_input_tokens_seen": 32432904, "step": 56215 }, { "epoch": 8.37354781054513, "grad_norm": 2.7702255249023438, "learning_rate": 3.9227245727750965e-06, "loss": 0.5592, "num_input_tokens_seen": 32435528, "step": 56220 }, { "epoch": 8.374292523086089, "grad_norm": 2.8504867553710938, "learning_rate": 3.919230396395981e-06, "loss": 0.5132, "num_input_tokens_seen": 32438504, "step": 56225 }, { "epoch": 8.375037235627047, "grad_norm": 2.1310675144195557, "learning_rate": 3.915737644569567e-06, "loss": 0.4314, "num_input_tokens_seen": 32441480, "step": 56230 }, { "epoch": 8.375781948168008, "grad_norm": 2.2681987285614014, "learning_rate": 3.912246317531873e-06, "loss": 0.5205, "num_input_tokens_seen": 32444360, "step": 56235 }, { "epoch": 8.376526660708967, "grad_norm": 2.8553364276885986, "learning_rate": 3.908756415518835e-06, "loss": 0.7459, "num_input_tokens_seen": 32447528, "step": 56240 }, { "epoch": 8.377271373249926, "grad_norm": 3.4711458683013916, "learning_rate": 3.905267938766291e-06, "loss": 0.5408, "num_input_tokens_seen": 32450280, "step": 56245 }, { "epoch": 8.378016085790884, "grad_norm": 2.731123685836792, "learning_rate": 3.901780887509973e-06, "loss": 0.552, "num_input_tokens_seen": 32453192, "step": 56250 }, { "epoch": 8.378760798331843, "grad_norm": 2.2482564449310303, "learning_rate": 3.898295261985524e-06, "loss": 0.611, "num_input_tokens_seen": 32455784, "step": 56255 }, { "epoch": 8.379505510872804, "grad_norm": 3.174424171447754, "learning_rate": 3.894811062428494e-06, "loss": 0.6484, "num_input_tokens_seen": 32458888, "step": 56260 }, { "epoch": 8.380250223413762, "grad_norm": 2.2513227462768555, "learning_rate": 3.8913282890743195e-06, "loss": 0.6492, "num_input_tokens_seen": 32461608, "step": 56265 }, { "epoch": 8.380994935954721, "grad_norm": 2.5429697036743164, "learning_rate": 3.887846942158363e-06, "loss": 0.7548, "num_input_tokens_seen": 32464360, "step": 56270 }, { "epoch": 8.38173964849568, "grad_norm": 2.1415202617645264, "learning_rate": 3.884367021915869e-06, "loss": 0.4719, "num_input_tokens_seen": 32467304, "step": 56275 }, { "epoch": 8.38248436103664, "grad_norm": 2.4152963161468506, "learning_rate": 3.880888528581999e-06, "loss": 0.4598, "num_input_tokens_seen": 32469864, "step": 56280 }, { "epoch": 8.3832290735776, "grad_norm": 3.9008595943450928, "learning_rate": 3.877411462391822e-06, "loss": 0.5279, "num_input_tokens_seen": 32472648, "step": 56285 }, { "epoch": 8.383973786118558, "grad_norm": 3.3984432220458984, "learning_rate": 3.873935823580299e-06, "loss": 0.6509, "num_input_tokens_seen": 32475560, "step": 56290 }, { "epoch": 8.384718498659517, "grad_norm": 1.277026891708374, "learning_rate": 3.870461612382306e-06, "loss": 0.3503, "num_input_tokens_seen": 32478440, "step": 56295 }, { "epoch": 8.385463211200477, "grad_norm": 3.3476178646087646, "learning_rate": 3.866988829032603e-06, "loss": 0.2933, "num_input_tokens_seen": 32481224, "step": 56300 }, { "epoch": 8.386207923741436, "grad_norm": 4.907534122467041, "learning_rate": 3.863517473765877e-06, "loss": 0.5047, "num_input_tokens_seen": 32484232, "step": 56305 }, { "epoch": 8.386952636282395, "grad_norm": 5.348128795623779, "learning_rate": 3.8600475468167056e-06, "loss": 0.566, "num_input_tokens_seen": 32487176, "step": 56310 }, { "epoch": 8.387697348823353, "grad_norm": 3.2293248176574707, "learning_rate": 3.8565790484195785e-06, "loss": 0.7423, "num_input_tokens_seen": 32490664, "step": 56315 }, { "epoch": 8.388442061364314, "grad_norm": 5.087428569793701, "learning_rate": 3.853111978808868e-06, "loss": 0.6553, "num_input_tokens_seen": 32493640, "step": 56320 }, { "epoch": 8.389186773905273, "grad_norm": 3.333217144012451, "learning_rate": 3.849646338218874e-06, "loss": 0.4609, "num_input_tokens_seen": 32496552, "step": 56325 }, { "epoch": 8.389931486446232, "grad_norm": 3.2703630924224854, "learning_rate": 3.846182126883796e-06, "loss": 0.3959, "num_input_tokens_seen": 32499400, "step": 56330 }, { "epoch": 8.39067619898719, "grad_norm": 2.6705334186553955, "learning_rate": 3.842719345037718e-06, "loss": 0.704, "num_input_tokens_seen": 32502184, "step": 56335 }, { "epoch": 8.39142091152815, "grad_norm": 3.048623561859131, "learning_rate": 3.83925799291465e-06, "loss": 0.6193, "num_input_tokens_seen": 32504968, "step": 56340 }, { "epoch": 8.39216562406911, "grad_norm": 5.398787975311279, "learning_rate": 3.835798070748489e-06, "loss": 0.658, "num_input_tokens_seen": 32507688, "step": 56345 }, { "epoch": 8.392910336610068, "grad_norm": 2.430699586868286, "learning_rate": 3.8323395787730505e-06, "loss": 0.6461, "num_input_tokens_seen": 32510472, "step": 56350 }, { "epoch": 8.393655049151027, "grad_norm": 2.3453893661499023, "learning_rate": 3.828882517222046e-06, "loss": 0.5122, "num_input_tokens_seen": 32513192, "step": 56355 }, { "epoch": 8.394399761691988, "grad_norm": 4.701944828033447, "learning_rate": 3.825426886329087e-06, "loss": 0.508, "num_input_tokens_seen": 32515720, "step": 56360 }, { "epoch": 8.395144474232946, "grad_norm": 5.177687644958496, "learning_rate": 3.8219726863276826e-06, "loss": 0.4821, "num_input_tokens_seen": 32518664, "step": 56365 }, { "epoch": 8.395889186773905, "grad_norm": 2.9490721225738525, "learning_rate": 3.81851991745126e-06, "loss": 0.5672, "num_input_tokens_seen": 32521800, "step": 56370 }, { "epoch": 8.396633899314864, "grad_norm": 4.445276737213135, "learning_rate": 3.8150685799331454e-06, "loss": 0.5852, "num_input_tokens_seen": 32524552, "step": 56375 }, { "epoch": 8.397378611855824, "grad_norm": 3.5627782344818115, "learning_rate": 3.811618674006562e-06, "loss": 0.636, "num_input_tokens_seen": 32527464, "step": 56380 }, { "epoch": 8.398123324396783, "grad_norm": 2.069519519805908, "learning_rate": 3.8081701999046454e-06, "loss": 0.5354, "num_input_tokens_seen": 32530472, "step": 56385 }, { "epoch": 8.398868036937742, "grad_norm": 4.223233699798584, "learning_rate": 3.804723157860432e-06, "loss": 0.4653, "num_input_tokens_seen": 32533160, "step": 56390 }, { "epoch": 8.3996127494787, "grad_norm": 2.4143106937408447, "learning_rate": 3.8012775481068517e-06, "loss": 0.7742, "num_input_tokens_seen": 32535976, "step": 56395 }, { "epoch": 8.400357462019661, "grad_norm": 3.5012128353118896, "learning_rate": 3.797833370876744e-06, "loss": 0.4629, "num_input_tokens_seen": 32538952, "step": 56400 }, { "epoch": 8.40110217456062, "grad_norm": 3.0790317058563232, "learning_rate": 3.794390626402855e-06, "loss": 0.5259, "num_input_tokens_seen": 32542056, "step": 56405 }, { "epoch": 8.401846887101579, "grad_norm": 3.8987886905670166, "learning_rate": 3.79094931491783e-06, "loss": 0.5611, "num_input_tokens_seen": 32545320, "step": 56410 }, { "epoch": 8.402591599642538, "grad_norm": 2.4084856510162354, "learning_rate": 3.7875094366542212e-06, "loss": 0.7106, "num_input_tokens_seen": 32548584, "step": 56415 }, { "epoch": 8.403336312183498, "grad_norm": 1.5922813415527344, "learning_rate": 3.7840709918444823e-06, "loss": 0.6683, "num_input_tokens_seen": 32551496, "step": 56420 }, { "epoch": 8.404081024724457, "grad_norm": 4.281620502471924, "learning_rate": 3.780633980720974e-06, "loss": 0.6201, "num_input_tokens_seen": 32554056, "step": 56425 }, { "epoch": 8.404825737265416, "grad_norm": 3.0414011478424072, "learning_rate": 3.777198403515944e-06, "loss": 0.67, "num_input_tokens_seen": 32557160, "step": 56430 }, { "epoch": 8.405570449806374, "grad_norm": 3.8963778018951416, "learning_rate": 3.7737642604615624e-06, "loss": 0.417, "num_input_tokens_seen": 32560040, "step": 56435 }, { "epoch": 8.406315162347333, "grad_norm": 2.673281192779541, "learning_rate": 3.7703315517898908e-06, "loss": 0.4024, "num_input_tokens_seen": 32562824, "step": 56440 }, { "epoch": 8.407059874888294, "grad_norm": 2.73931884765625, "learning_rate": 3.7669002777328986e-06, "loss": 0.4731, "num_input_tokens_seen": 32565608, "step": 56445 }, { "epoch": 8.407804587429252, "grad_norm": 1.2664449214935303, "learning_rate": 3.763470438522457e-06, "loss": 0.4649, "num_input_tokens_seen": 32568680, "step": 56450 }, { "epoch": 8.408549299970211, "grad_norm": 3.687152147293091, "learning_rate": 3.760042034390343e-06, "loss": 0.5307, "num_input_tokens_seen": 32571720, "step": 56455 }, { "epoch": 8.40929401251117, "grad_norm": 4.172752857208252, "learning_rate": 3.7566150655682364e-06, "loss": 0.7839, "num_input_tokens_seen": 32574568, "step": 56460 }, { "epoch": 8.41003872505213, "grad_norm": 2.423213243484497, "learning_rate": 3.7531895322877096e-06, "loss": 0.6708, "num_input_tokens_seen": 32577352, "step": 56465 }, { "epoch": 8.41078343759309, "grad_norm": 2.2263731956481934, "learning_rate": 3.749765434780253e-06, "loss": 0.4798, "num_input_tokens_seen": 32580488, "step": 56470 }, { "epoch": 8.411528150134048, "grad_norm": 1.9319000244140625, "learning_rate": 3.746342773277256e-06, "loss": 0.7597, "num_input_tokens_seen": 32583496, "step": 56475 }, { "epoch": 8.412272862675007, "grad_norm": 2.434359550476074, "learning_rate": 3.742921548009995e-06, "loss": 0.5557, "num_input_tokens_seen": 32586184, "step": 56480 }, { "epoch": 8.413017575215967, "grad_norm": 5.304347991943359, "learning_rate": 3.7395017592096738e-06, "loss": 0.538, "num_input_tokens_seen": 32589160, "step": 56485 }, { "epoch": 8.413762287756926, "grad_norm": 5.795299530029297, "learning_rate": 3.7360834071073823e-06, "loss": 0.5343, "num_input_tokens_seen": 32592008, "step": 56490 }, { "epoch": 8.414507000297885, "grad_norm": 8.798036575317383, "learning_rate": 3.7326664919341308e-06, "loss": 0.8012, "num_input_tokens_seen": 32594984, "step": 56495 }, { "epoch": 8.415251712838844, "grad_norm": 2.7798500061035156, "learning_rate": 3.7292510139208007e-06, "loss": 0.6138, "num_input_tokens_seen": 32597640, "step": 56500 }, { "epoch": 8.415996425379804, "grad_norm": 5.39771032333374, "learning_rate": 3.725836973298211e-06, "loss": 0.8348, "num_input_tokens_seen": 32600552, "step": 56505 }, { "epoch": 8.416741137920763, "grad_norm": 5.622596263885498, "learning_rate": 3.722424370297062e-06, "loss": 0.7154, "num_input_tokens_seen": 32603368, "step": 56510 }, { "epoch": 8.417485850461722, "grad_norm": 2.886439561843872, "learning_rate": 3.7190132051479697e-06, "loss": 0.4089, "num_input_tokens_seen": 32606120, "step": 56515 }, { "epoch": 8.41823056300268, "grad_norm": 3.4170050621032715, "learning_rate": 3.715603478081439e-06, "loss": 0.7287, "num_input_tokens_seen": 32609192, "step": 56520 }, { "epoch": 8.418975275543641, "grad_norm": 2.9892868995666504, "learning_rate": 3.7121951893278966e-06, "loss": 0.6283, "num_input_tokens_seen": 32612168, "step": 56525 }, { "epoch": 8.4197199880846, "grad_norm": 1.3669453859329224, "learning_rate": 3.708788339117644e-06, "loss": 0.5993, "num_input_tokens_seen": 32615048, "step": 56530 }, { "epoch": 8.420464700625558, "grad_norm": 2.8184547424316406, "learning_rate": 3.7053829276809143e-06, "loss": 0.5858, "num_input_tokens_seen": 32617928, "step": 56535 }, { "epoch": 8.421209413166517, "grad_norm": 1.980523705482483, "learning_rate": 3.7019789552478286e-06, "loss": 0.6129, "num_input_tokens_seen": 32620840, "step": 56540 }, { "epoch": 8.421954125707478, "grad_norm": 3.8219165802001953, "learning_rate": 3.6985764220484137e-06, "loss": 0.6686, "num_input_tokens_seen": 32623656, "step": 56545 }, { "epoch": 8.422698838248436, "grad_norm": 2.7495763301849365, "learning_rate": 3.695175328312597e-06, "loss": 0.4607, "num_input_tokens_seen": 32626472, "step": 56550 }, { "epoch": 8.423443550789395, "grad_norm": 1.9384421110153198, "learning_rate": 3.6917756742702205e-06, "loss": 0.5481, "num_input_tokens_seen": 32629736, "step": 56555 }, { "epoch": 8.424188263330354, "grad_norm": 4.420627593994141, "learning_rate": 3.68837746015101e-06, "loss": 0.4564, "num_input_tokens_seen": 32632680, "step": 56560 }, { "epoch": 8.424932975871315, "grad_norm": 5.460824012756348, "learning_rate": 3.6849806861845997e-06, "loss": 0.9333, "num_input_tokens_seen": 32635464, "step": 56565 }, { "epoch": 8.425677688412273, "grad_norm": 1.9173710346221924, "learning_rate": 3.6815853526005305e-06, "loss": 0.4389, "num_input_tokens_seen": 32638376, "step": 56570 }, { "epoch": 8.426422400953232, "grad_norm": 3.1276795864105225, "learning_rate": 3.678191459628252e-06, "loss": 0.5356, "num_input_tokens_seen": 32641000, "step": 56575 }, { "epoch": 8.42716711349419, "grad_norm": 2.295980453491211, "learning_rate": 3.6747990074971065e-06, "loss": 0.627, "num_input_tokens_seen": 32643848, "step": 56580 }, { "epoch": 8.42791182603515, "grad_norm": 2.455644369125366, "learning_rate": 3.671407996436341e-06, "loss": 0.6438, "num_input_tokens_seen": 32646952, "step": 56585 }, { "epoch": 8.42865653857611, "grad_norm": 2.7766146659851074, "learning_rate": 3.6680184266751128e-06, "loss": 0.4918, "num_input_tokens_seen": 32650344, "step": 56590 }, { "epoch": 8.429401251117069, "grad_norm": 2.9977197647094727, "learning_rate": 3.66463029844247e-06, "loss": 0.5124, "num_input_tokens_seen": 32653128, "step": 56595 }, { "epoch": 8.430145963658028, "grad_norm": 8.303445816040039, "learning_rate": 3.6612436119673634e-06, "loss": 0.7755, "num_input_tokens_seen": 32655720, "step": 56600 }, { "epoch": 8.430890676198986, "grad_norm": 2.7369563579559326, "learning_rate": 3.657858367478656e-06, "loss": 0.564, "num_input_tokens_seen": 32658696, "step": 56605 }, { "epoch": 8.431635388739947, "grad_norm": 3.1023948192596436, "learning_rate": 3.6544745652051097e-06, "loss": 0.461, "num_input_tokens_seen": 32661512, "step": 56610 }, { "epoch": 8.432380101280906, "grad_norm": 2.1714022159576416, "learning_rate": 3.6510922053753864e-06, "loss": 0.4567, "num_input_tokens_seen": 32664456, "step": 56615 }, { "epoch": 8.433124813821864, "grad_norm": 4.531257629394531, "learning_rate": 3.647711288218053e-06, "loss": 0.5369, "num_input_tokens_seen": 32667368, "step": 56620 }, { "epoch": 8.433869526362823, "grad_norm": 4.173285007476807, "learning_rate": 3.644331813961588e-06, "loss": 0.6417, "num_input_tokens_seen": 32669992, "step": 56625 }, { "epoch": 8.434614238903784, "grad_norm": 3.0565409660339355, "learning_rate": 3.640953782834344e-06, "loss": 0.569, "num_input_tokens_seen": 32672840, "step": 56630 }, { "epoch": 8.435358951444742, "grad_norm": 3.7278740406036377, "learning_rate": 3.637577195064612e-06, "loss": 0.6754, "num_input_tokens_seen": 32675624, "step": 56635 }, { "epoch": 8.436103663985701, "grad_norm": 2.590867280960083, "learning_rate": 3.634202050880553e-06, "loss": 0.6989, "num_input_tokens_seen": 32678440, "step": 56640 }, { "epoch": 8.43684837652666, "grad_norm": 3.450326919555664, "learning_rate": 3.6308283505102515e-06, "loss": 0.5054, "num_input_tokens_seen": 32681320, "step": 56645 }, { "epoch": 8.43759308906762, "grad_norm": 4.247788429260254, "learning_rate": 3.6274560941816887e-06, "loss": 0.4865, "num_input_tokens_seen": 32684072, "step": 56650 }, { "epoch": 8.43833780160858, "grad_norm": 3.092622995376587, "learning_rate": 3.6240852821227524e-06, "loss": 0.5722, "num_input_tokens_seen": 32687016, "step": 56655 }, { "epoch": 8.439082514149538, "grad_norm": 2.504852533340454, "learning_rate": 3.620715914561226e-06, "loss": 0.4862, "num_input_tokens_seen": 32689992, "step": 56660 }, { "epoch": 8.439827226690497, "grad_norm": 2.6349234580993652, "learning_rate": 3.6173479917247927e-06, "loss": 0.4003, "num_input_tokens_seen": 32692712, "step": 56665 }, { "epoch": 8.440571939231457, "grad_norm": 2.098092794418335, "learning_rate": 3.613981513841047e-06, "loss": 0.4273, "num_input_tokens_seen": 32695528, "step": 56670 }, { "epoch": 8.441316651772416, "grad_norm": 4.6413493156433105, "learning_rate": 3.6106164811374855e-06, "loss": 0.7789, "num_input_tokens_seen": 32698536, "step": 56675 }, { "epoch": 8.442061364313375, "grad_norm": 3.6960136890411377, "learning_rate": 3.607252893841495e-06, "loss": 0.5147, "num_input_tokens_seen": 32701480, "step": 56680 }, { "epoch": 8.442806076854334, "grad_norm": 2.377443313598633, "learning_rate": 3.6038907521803776e-06, "loss": 0.4904, "num_input_tokens_seen": 32704488, "step": 56685 }, { "epoch": 8.443550789395294, "grad_norm": 2.3501484394073486, "learning_rate": 3.6005300563813375e-06, "loss": 0.6552, "num_input_tokens_seen": 32707464, "step": 56690 }, { "epoch": 8.444295501936253, "grad_norm": 1.711814284324646, "learning_rate": 3.5971708066714682e-06, "loss": 0.5267, "num_input_tokens_seen": 32710216, "step": 56695 }, { "epoch": 8.445040214477212, "grad_norm": 8.833033561706543, "learning_rate": 3.593813003277777e-06, "loss": 0.5842, "num_input_tokens_seen": 32713288, "step": 56700 }, { "epoch": 8.44578492701817, "grad_norm": 2.3048410415649414, "learning_rate": 3.5904566464271704e-06, "loss": 0.6513, "num_input_tokens_seen": 32715816, "step": 56705 }, { "epoch": 8.446529639559131, "grad_norm": 4.2922682762146, "learning_rate": 3.5871017363464596e-06, "loss": 0.6325, "num_input_tokens_seen": 32718664, "step": 56710 }, { "epoch": 8.44727435210009, "grad_norm": 2.659656286239624, "learning_rate": 3.5837482732623636e-06, "loss": 0.571, "num_input_tokens_seen": 32721704, "step": 56715 }, { "epoch": 8.448019064641048, "grad_norm": 2.3610222339630127, "learning_rate": 3.5803962574014775e-06, "loss": 0.6116, "num_input_tokens_seen": 32724584, "step": 56720 }, { "epoch": 8.448763777182007, "grad_norm": 2.6886861324310303, "learning_rate": 3.577045688990335e-06, "loss": 0.5891, "num_input_tokens_seen": 32727912, "step": 56725 }, { "epoch": 8.449508489722968, "grad_norm": 1.261399269104004, "learning_rate": 3.5736965682553385e-06, "loss": 0.7926, "num_input_tokens_seen": 32730472, "step": 56730 }, { "epoch": 8.450253202263927, "grad_norm": 2.859227418899536, "learning_rate": 3.5703488954228147e-06, "loss": 0.7092, "num_input_tokens_seen": 32733256, "step": 56735 }, { "epoch": 8.450997914804885, "grad_norm": 3.338653564453125, "learning_rate": 3.5670026707189858e-06, "loss": 0.5677, "num_input_tokens_seen": 32736296, "step": 56740 }, { "epoch": 8.451742627345844, "grad_norm": 2.82381010055542, "learning_rate": 3.5636578943699787e-06, "loss": 0.5529, "num_input_tokens_seen": 32739144, "step": 56745 }, { "epoch": 8.452487339886805, "grad_norm": 2.6148176193237305, "learning_rate": 3.5603145666018132e-06, "loss": 0.7076, "num_input_tokens_seen": 32742248, "step": 56750 }, { "epoch": 8.453232052427763, "grad_norm": 3.2082672119140625, "learning_rate": 3.5569726876404307e-06, "loss": 0.5922, "num_input_tokens_seen": 32745192, "step": 56755 }, { "epoch": 8.453976764968722, "grad_norm": 4.640859127044678, "learning_rate": 3.553632257711653e-06, "loss": 0.4787, "num_input_tokens_seen": 32748136, "step": 56760 }, { "epoch": 8.45472147750968, "grad_norm": 2.2816903591156006, "learning_rate": 3.550293277041206e-06, "loss": 0.5695, "num_input_tokens_seen": 32751016, "step": 56765 }, { "epoch": 8.45546619005064, "grad_norm": 3.506216049194336, "learning_rate": 3.54695574585473e-06, "loss": 0.6349, "num_input_tokens_seen": 32754120, "step": 56770 }, { "epoch": 8.4562109025916, "grad_norm": 1.78795325756073, "learning_rate": 3.543619664377765e-06, "loss": 0.5379, "num_input_tokens_seen": 32756968, "step": 56775 }, { "epoch": 8.456955615132559, "grad_norm": 1.6655558347702026, "learning_rate": 3.540285032835747e-06, "loss": 0.3632, "num_input_tokens_seen": 32759752, "step": 56780 }, { "epoch": 8.457700327673518, "grad_norm": 3.6899499893188477, "learning_rate": 3.536951851454018e-06, "loss": 0.5401, "num_input_tokens_seen": 32762888, "step": 56785 }, { "epoch": 8.458445040214476, "grad_norm": 3.5401525497436523, "learning_rate": 3.5336201204578256e-06, "loss": 0.5614, "num_input_tokens_seen": 32765544, "step": 56790 }, { "epoch": 8.459189752755437, "grad_norm": 6.203985691070557, "learning_rate": 3.5302898400723094e-06, "loss": 0.814, "num_input_tokens_seen": 32768360, "step": 56795 }, { "epoch": 8.459934465296396, "grad_norm": 10.7786226272583, "learning_rate": 3.5269610105225114e-06, "loss": 0.754, "num_input_tokens_seen": 32771144, "step": 56800 }, { "epoch": 8.460679177837354, "grad_norm": 4.752929210662842, "learning_rate": 3.523633632033385e-06, "loss": 0.5466, "num_input_tokens_seen": 32773992, "step": 56805 }, { "epoch": 8.461423890378313, "grad_norm": 1.8624848127365112, "learning_rate": 3.520307704829781e-06, "loss": 0.5344, "num_input_tokens_seen": 32776680, "step": 56810 }, { "epoch": 8.462168602919274, "grad_norm": 1.1991866827011108, "learning_rate": 3.5169832291364502e-06, "loss": 0.6076, "num_input_tokens_seen": 32779656, "step": 56815 }, { "epoch": 8.462913315460233, "grad_norm": 3.5734426975250244, "learning_rate": 3.5136602051780517e-06, "loss": 0.5926, "num_input_tokens_seen": 32782248, "step": 56820 }, { "epoch": 8.463658028001191, "grad_norm": 1.528377652168274, "learning_rate": 3.5103386331791444e-06, "loss": 0.6431, "num_input_tokens_seen": 32785288, "step": 56825 }, { "epoch": 8.46440274054215, "grad_norm": 2.36447811126709, "learning_rate": 3.507018513364177e-06, "loss": 0.4644, "num_input_tokens_seen": 32787976, "step": 56830 }, { "epoch": 8.46514745308311, "grad_norm": 1.7236261367797852, "learning_rate": 3.5036998459575197e-06, "loss": 0.4777, "num_input_tokens_seen": 32791080, "step": 56835 }, { "epoch": 8.46589216562407, "grad_norm": 3.2658755779266357, "learning_rate": 3.5003826311834214e-06, "loss": 0.6507, "num_input_tokens_seen": 32793608, "step": 56840 }, { "epoch": 8.466636878165028, "grad_norm": 2.4262051582336426, "learning_rate": 3.497066869266058e-06, "loss": 0.5891, "num_input_tokens_seen": 32796328, "step": 56845 }, { "epoch": 8.467381590705987, "grad_norm": 3.265432357788086, "learning_rate": 3.493752560429486e-06, "loss": 0.5732, "num_input_tokens_seen": 32799464, "step": 56850 }, { "epoch": 8.468126303246947, "grad_norm": 1.2459901571273804, "learning_rate": 3.490439704897688e-06, "loss": 0.4728, "num_input_tokens_seen": 32802088, "step": 56855 }, { "epoch": 8.468871015787906, "grad_norm": 4.619373798370361, "learning_rate": 3.4871283028945155e-06, "loss": 0.6271, "num_input_tokens_seen": 32804776, "step": 56860 }, { "epoch": 8.469615728328865, "grad_norm": 2.794219493865967, "learning_rate": 3.4838183546437475e-06, "loss": 0.7783, "num_input_tokens_seen": 32807464, "step": 56865 }, { "epoch": 8.470360440869824, "grad_norm": 1.8259429931640625, "learning_rate": 3.480509860369058e-06, "loss": 0.6861, "num_input_tokens_seen": 32810600, "step": 56870 }, { "epoch": 8.471105153410784, "grad_norm": 3.8240978717803955, "learning_rate": 3.477202820294018e-06, "loss": 0.4719, "num_input_tokens_seen": 32813160, "step": 56875 }, { "epoch": 8.471849865951743, "grad_norm": 1.773768663406372, "learning_rate": 3.473897234642112e-06, "loss": 0.5655, "num_input_tokens_seen": 32815816, "step": 56880 }, { "epoch": 8.472594578492702, "grad_norm": 3.124563455581665, "learning_rate": 3.4705931036367074e-06, "loss": 0.6156, "num_input_tokens_seen": 32818696, "step": 56885 }, { "epoch": 8.47333929103366, "grad_norm": 2.4824047088623047, "learning_rate": 3.4672904275010936e-06, "loss": 0.6625, "num_input_tokens_seen": 32821608, "step": 56890 }, { "epoch": 8.474084003574621, "grad_norm": 2.255788803100586, "learning_rate": 3.463989206458443e-06, "loss": 0.6745, "num_input_tokens_seen": 32824776, "step": 56895 }, { "epoch": 8.47482871611558, "grad_norm": 2.8203160762786865, "learning_rate": 3.460689440731843e-06, "loss": 0.6044, "num_input_tokens_seen": 32827656, "step": 56900 }, { "epoch": 8.475573428656539, "grad_norm": 2.529891014099121, "learning_rate": 3.457391130544277e-06, "loss": 0.5306, "num_input_tokens_seen": 32830440, "step": 56905 }, { "epoch": 8.476318141197497, "grad_norm": 3.93481183052063, "learning_rate": 3.45409427611863e-06, "loss": 0.7653, "num_input_tokens_seen": 32833384, "step": 56910 }, { "epoch": 8.477062853738458, "grad_norm": 1.2545771598815918, "learning_rate": 3.4507988776776968e-06, "loss": 0.4014, "num_input_tokens_seen": 32836328, "step": 56915 }, { "epoch": 8.477807566279417, "grad_norm": 4.106736660003662, "learning_rate": 3.4475049354441653e-06, "loss": 0.5562, "num_input_tokens_seen": 32839176, "step": 56920 }, { "epoch": 8.478552278820375, "grad_norm": 2.0413818359375, "learning_rate": 3.444212449640627e-06, "loss": 0.5788, "num_input_tokens_seen": 32842024, "step": 56925 }, { "epoch": 8.479296991361334, "grad_norm": 2.517880439758301, "learning_rate": 3.4409214204895653e-06, "loss": 0.5023, "num_input_tokens_seen": 32844904, "step": 56930 }, { "epoch": 8.480041703902295, "grad_norm": 2.843050479888916, "learning_rate": 3.4376318482133797e-06, "loss": 0.5519, "num_input_tokens_seen": 32847560, "step": 56935 }, { "epoch": 8.480786416443253, "grad_norm": 17.27689552307129, "learning_rate": 3.4343437330343675e-06, "loss": 0.5955, "num_input_tokens_seen": 32850408, "step": 56940 }, { "epoch": 8.481531128984212, "grad_norm": 3.288767099380493, "learning_rate": 3.431057075174729e-06, "loss": 0.7913, "num_input_tokens_seen": 32853352, "step": 56945 }, { "epoch": 8.482275841525171, "grad_norm": 4.560008525848389, "learning_rate": 3.4277718748565585e-06, "loss": 0.8076, "num_input_tokens_seen": 32856264, "step": 56950 }, { "epoch": 8.48302055406613, "grad_norm": 2.1760153770446777, "learning_rate": 3.4244881323018645e-06, "loss": 0.744, "num_input_tokens_seen": 32859112, "step": 56955 }, { "epoch": 8.48376526660709, "grad_norm": 1.9919440746307373, "learning_rate": 3.421205847732542e-06, "loss": 0.5996, "num_input_tokens_seen": 32861992, "step": 56960 }, { "epoch": 8.484509979148049, "grad_norm": 3.396365165710449, "learning_rate": 3.4179250213703914e-06, "loss": 0.5255, "num_input_tokens_seen": 32864840, "step": 56965 }, { "epoch": 8.485254691689008, "grad_norm": 0.970923900604248, "learning_rate": 3.414645653437118e-06, "loss": 0.637, "num_input_tokens_seen": 32867944, "step": 56970 }, { "epoch": 8.485999404229966, "grad_norm": 3.3188509941101074, "learning_rate": 3.411367744154334e-06, "loss": 0.5826, "num_input_tokens_seen": 32871112, "step": 56975 }, { "epoch": 8.486744116770927, "grad_norm": 4.113650798797607, "learning_rate": 3.4080912937435455e-06, "loss": 0.4549, "num_input_tokens_seen": 32874088, "step": 56980 }, { "epoch": 8.487488829311886, "grad_norm": 3.5559585094451904, "learning_rate": 3.4048163024261614e-06, "loss": 0.5588, "num_input_tokens_seen": 32876808, "step": 56985 }, { "epoch": 8.488233541852845, "grad_norm": 2.3174092769622803, "learning_rate": 3.4015427704234965e-06, "loss": 0.4273, "num_input_tokens_seen": 32879752, "step": 56990 }, { "epoch": 8.488978254393803, "grad_norm": 5.197283744812012, "learning_rate": 3.3982706979567542e-06, "loss": 0.602, "num_input_tokens_seen": 32882600, "step": 56995 }, { "epoch": 8.489722966934764, "grad_norm": 2.2327959537506104, "learning_rate": 3.395000085247055e-06, "loss": 0.5025, "num_input_tokens_seen": 32885640, "step": 57000 }, { "epoch": 8.490467679475723, "grad_norm": 4.259765148162842, "learning_rate": 3.391730932515405e-06, "loss": 0.5222, "num_input_tokens_seen": 32888328, "step": 57005 }, { "epoch": 8.491212392016681, "grad_norm": 2.259361505508423, "learning_rate": 3.388463239982728e-06, "loss": 0.6372, "num_input_tokens_seen": 32891016, "step": 57010 }, { "epoch": 8.49195710455764, "grad_norm": 5.5792717933654785, "learning_rate": 3.3851970078698394e-06, "loss": 0.6868, "num_input_tokens_seen": 32893640, "step": 57015 }, { "epoch": 8.4927018170986, "grad_norm": 1.984999179840088, "learning_rate": 3.3819322363974615e-06, "loss": 0.6761, "num_input_tokens_seen": 32896584, "step": 57020 }, { "epoch": 8.49344652963956, "grad_norm": 2.4509940147399902, "learning_rate": 3.3786689257862047e-06, "loss": 0.5947, "num_input_tokens_seen": 32899432, "step": 57025 }, { "epoch": 8.494191242180518, "grad_norm": 2.3110580444335938, "learning_rate": 3.3754070762565952e-06, "loss": 0.6214, "num_input_tokens_seen": 32902440, "step": 57030 }, { "epoch": 8.494935954721477, "grad_norm": 2.6190569400787354, "learning_rate": 3.372146688029057e-06, "loss": 0.5594, "num_input_tokens_seen": 32906664, "step": 57035 }, { "epoch": 8.495680667262437, "grad_norm": 1.937928318977356, "learning_rate": 3.368887761323919e-06, "loss": 0.4308, "num_input_tokens_seen": 32909576, "step": 57040 }, { "epoch": 8.496425379803396, "grad_norm": 2.7412922382354736, "learning_rate": 3.3656302963613966e-06, "loss": 0.5351, "num_input_tokens_seen": 32912072, "step": 57045 }, { "epoch": 8.497170092344355, "grad_norm": 3.4632720947265625, "learning_rate": 3.362374293361617e-06, "loss": 0.6998, "num_input_tokens_seen": 32914792, "step": 57050 }, { "epoch": 8.497914804885314, "grad_norm": 4.113213062286377, "learning_rate": 3.359119752544618e-06, "loss": 0.6417, "num_input_tokens_seen": 32917416, "step": 57055 }, { "epoch": 8.498659517426274, "grad_norm": 2.1251003742218018, "learning_rate": 3.3558666741303147e-06, "loss": 0.6536, "num_input_tokens_seen": 32920200, "step": 57060 }, { "epoch": 8.499404229967233, "grad_norm": 3.2006349563598633, "learning_rate": 3.352615058338543e-06, "loss": 0.5285, "num_input_tokens_seen": 32923240, "step": 57065 }, { "epoch": 8.5, "eval_loss": 0.6745066046714783, "eval_runtime": 74.2521, "eval_samples_per_second": 40.187, "eval_steps_per_second": 10.047, "num_input_tokens_seen": 32925544, "step": 57069 }, { "epoch": 8.500148942508192, "grad_norm": 6.5539469718933105, "learning_rate": 3.3493649053890326e-06, "loss": 0.7458, "num_input_tokens_seen": 32926120, "step": 57070 }, { "epoch": 8.50089365504915, "grad_norm": 1.7752329111099243, "learning_rate": 3.3461162155014186e-06, "loss": 0.5025, "num_input_tokens_seen": 32929192, "step": 57075 }, { "epoch": 8.501638367590111, "grad_norm": 4.1150031089782715, "learning_rate": 3.342868988895237e-06, "loss": 0.7147, "num_input_tokens_seen": 32932168, "step": 57080 }, { "epoch": 8.50238308013107, "grad_norm": 2.8235673904418945, "learning_rate": 3.3396232257899116e-06, "loss": 0.559, "num_input_tokens_seen": 32935048, "step": 57085 }, { "epoch": 8.503127792672029, "grad_norm": 2.2734179496765137, "learning_rate": 3.33637892640479e-06, "loss": 0.4758, "num_input_tokens_seen": 32937864, "step": 57090 }, { "epoch": 8.503872505212987, "grad_norm": 1.9769915342330933, "learning_rate": 3.3331360909590994e-06, "loss": 0.529, "num_input_tokens_seen": 32940680, "step": 57095 }, { "epoch": 8.504617217753946, "grad_norm": 2.4990477561950684, "learning_rate": 3.3298947196719776e-06, "loss": 0.3876, "num_input_tokens_seen": 32943208, "step": 57100 }, { "epoch": 8.505361930294907, "grad_norm": 2.824801206588745, "learning_rate": 3.326654812762467e-06, "loss": 0.4671, "num_input_tokens_seen": 32945896, "step": 57105 }, { "epoch": 8.506106642835865, "grad_norm": 2.5660715103149414, "learning_rate": 3.3234163704495086e-06, "loss": 0.4963, "num_input_tokens_seen": 32948776, "step": 57110 }, { "epoch": 8.506851355376824, "grad_norm": 2.292678117752075, "learning_rate": 3.3201793929519386e-06, "loss": 0.5517, "num_input_tokens_seen": 32951496, "step": 57115 }, { "epoch": 8.507596067917785, "grad_norm": 4.821629047393799, "learning_rate": 3.316943880488507e-06, "loss": 0.5031, "num_input_tokens_seen": 32954312, "step": 57120 }, { "epoch": 8.508340780458743, "grad_norm": 2.434924602508545, "learning_rate": 3.313709833277853e-06, "loss": 0.6929, "num_input_tokens_seen": 32957224, "step": 57125 }, { "epoch": 8.509085492999702, "grad_norm": 2.9305224418640137, "learning_rate": 3.310477251538513e-06, "loss": 0.5847, "num_input_tokens_seen": 32960360, "step": 57130 }, { "epoch": 8.509830205540661, "grad_norm": 2.5221753120422363, "learning_rate": 3.3072461354889367e-06, "loss": 0.6642, "num_input_tokens_seen": 32963208, "step": 57135 }, { "epoch": 8.51057491808162, "grad_norm": 2.615882158279419, "learning_rate": 3.304016485347469e-06, "loss": 0.4668, "num_input_tokens_seen": 32966088, "step": 57140 }, { "epoch": 8.51131963062258, "grad_norm": 2.334501028060913, "learning_rate": 3.300788301332361e-06, "loss": 0.864, "num_input_tokens_seen": 32969032, "step": 57145 }, { "epoch": 8.512064343163539, "grad_norm": 2.7460601329803467, "learning_rate": 3.297561583661754e-06, "loss": 0.4628, "num_input_tokens_seen": 32971752, "step": 57150 }, { "epoch": 8.512809055704498, "grad_norm": 1.8721044063568115, "learning_rate": 3.2943363325537046e-06, "loss": 0.6712, "num_input_tokens_seen": 32974600, "step": 57155 }, { "epoch": 8.513553768245457, "grad_norm": 1.732046127319336, "learning_rate": 3.2911125482261577e-06, "loss": 0.5809, "num_input_tokens_seen": 32977448, "step": 57160 }, { "epoch": 8.514298480786417, "grad_norm": 5.773077487945557, "learning_rate": 3.287890230896959e-06, "loss": 0.4528, "num_input_tokens_seen": 32980072, "step": 57165 }, { "epoch": 8.515043193327376, "grad_norm": 2.2666687965393066, "learning_rate": 3.284669380783864e-06, "loss": 0.5567, "num_input_tokens_seen": 32982920, "step": 57170 }, { "epoch": 8.515787905868335, "grad_norm": 6.177441596984863, "learning_rate": 3.2814499981045217e-06, "loss": 0.6264, "num_input_tokens_seen": 32985800, "step": 57175 }, { "epoch": 8.516532618409293, "grad_norm": 2.798457622528076, "learning_rate": 3.2782320830764877e-06, "loss": 0.5381, "num_input_tokens_seen": 32988808, "step": 57180 }, { "epoch": 8.517277330950254, "grad_norm": 2.099358320236206, "learning_rate": 3.2750156359172224e-06, "loss": 0.6359, "num_input_tokens_seen": 32991560, "step": 57185 }, { "epoch": 8.518022043491213, "grad_norm": 4.054399013519287, "learning_rate": 3.271800656844065e-06, "loss": 0.7791, "num_input_tokens_seen": 32994440, "step": 57190 }, { "epoch": 8.518766756032171, "grad_norm": 1.6250431537628174, "learning_rate": 3.268587146074281e-06, "loss": 0.5584, "num_input_tokens_seen": 32997096, "step": 57195 }, { "epoch": 8.51951146857313, "grad_norm": 1.784686803817749, "learning_rate": 3.26537510382503e-06, "loss": 0.4243, "num_input_tokens_seen": 33000232, "step": 57200 }, { "epoch": 8.52025618111409, "grad_norm": 2.6072185039520264, "learning_rate": 3.2621645303133553e-06, "loss": 0.5927, "num_input_tokens_seen": 33003368, "step": 57205 }, { "epoch": 8.52100089365505, "grad_norm": 3.3991737365722656, "learning_rate": 3.2589554257562243e-06, "loss": 0.5201, "num_input_tokens_seen": 33006536, "step": 57210 }, { "epoch": 8.521745606196008, "grad_norm": 2.5618884563446045, "learning_rate": 3.255747790370489e-06, "loss": 0.4285, "num_input_tokens_seen": 33009640, "step": 57215 }, { "epoch": 8.522490318736967, "grad_norm": 2.659419536590576, "learning_rate": 3.2525416243729236e-06, "loss": 0.5894, "num_input_tokens_seen": 33012648, "step": 57220 }, { "epoch": 8.523235031277927, "grad_norm": 2.113637685775757, "learning_rate": 3.2493369279801677e-06, "loss": 0.5728, "num_input_tokens_seen": 33015976, "step": 57225 }, { "epoch": 8.523979743818886, "grad_norm": 3.123562812805176, "learning_rate": 3.2461337014087907e-06, "loss": 0.6802, "num_input_tokens_seen": 33018600, "step": 57230 }, { "epoch": 8.524724456359845, "grad_norm": 4.446351051330566, "learning_rate": 3.242931944875252e-06, "loss": 0.5906, "num_input_tokens_seen": 33021736, "step": 57235 }, { "epoch": 8.525469168900804, "grad_norm": 2.4331510066986084, "learning_rate": 3.239731658595921e-06, "loss": 0.6204, "num_input_tokens_seen": 33024808, "step": 57240 }, { "epoch": 8.526213881441764, "grad_norm": 3.6564791202545166, "learning_rate": 3.236532842787049e-06, "loss": 0.5906, "num_input_tokens_seen": 33027784, "step": 57245 }, { "epoch": 8.526958593982723, "grad_norm": 2.718963146209717, "learning_rate": 3.233335497664805e-06, "loss": 0.6003, "num_input_tokens_seen": 33030600, "step": 57250 }, { "epoch": 8.527703306523682, "grad_norm": 1.6277117729187012, "learning_rate": 3.230139623445255e-06, "loss": 0.6445, "num_input_tokens_seen": 33033160, "step": 57255 }, { "epoch": 8.52844801906464, "grad_norm": 3.294621229171753, "learning_rate": 3.2269452203443546e-06, "loss": 0.5188, "num_input_tokens_seen": 33035912, "step": 57260 }, { "epoch": 8.529192731605601, "grad_norm": 0.7651817202568054, "learning_rate": 3.2237522885779718e-06, "loss": 0.4101, "num_input_tokens_seen": 33038920, "step": 57265 }, { "epoch": 8.52993744414656, "grad_norm": 4.140527725219727, "learning_rate": 3.220560828361874e-06, "loss": 0.655, "num_input_tokens_seen": 33041608, "step": 57270 }, { "epoch": 8.530682156687519, "grad_norm": 3.7979328632354736, "learning_rate": 3.217370839911729e-06, "loss": 0.519, "num_input_tokens_seen": 33044168, "step": 57275 }, { "epoch": 8.531426869228477, "grad_norm": 2.2497076988220215, "learning_rate": 3.2141823234431045e-06, "loss": 0.6383, "num_input_tokens_seen": 33046856, "step": 57280 }, { "epoch": 8.532171581769436, "grad_norm": 7.358802795410156, "learning_rate": 3.2109952791714583e-06, "loss": 0.6354, "num_input_tokens_seen": 33049736, "step": 57285 }, { "epoch": 8.532916294310397, "grad_norm": 2.978952407836914, "learning_rate": 3.2078097073121704e-06, "loss": 0.3831, "num_input_tokens_seen": 33052232, "step": 57290 }, { "epoch": 8.533661006851355, "grad_norm": 1.2016196250915527, "learning_rate": 3.2046256080804943e-06, "loss": 0.4045, "num_input_tokens_seen": 33055080, "step": 57295 }, { "epoch": 8.534405719392314, "grad_norm": 3.8507139682769775, "learning_rate": 3.2014429816916074e-06, "loss": 0.4906, "num_input_tokens_seen": 33057576, "step": 57300 }, { "epoch": 8.535150431933273, "grad_norm": 2.4554011821746826, "learning_rate": 3.198261828360577e-06, "loss": 0.631, "num_input_tokens_seen": 33060328, "step": 57305 }, { "epoch": 8.535895144474233, "grad_norm": 1.9006311893463135, "learning_rate": 3.1950821483023723e-06, "loss": 0.4986, "num_input_tokens_seen": 33063016, "step": 57310 }, { "epoch": 8.536639857015192, "grad_norm": 3.4233040809631348, "learning_rate": 3.191903941731866e-06, "loss": 0.5432, "num_input_tokens_seen": 33065800, "step": 57315 }, { "epoch": 8.537384569556151, "grad_norm": 2.795543670654297, "learning_rate": 3.188727208863829e-06, "loss": 0.705, "num_input_tokens_seen": 33069032, "step": 57320 }, { "epoch": 8.53812928209711, "grad_norm": 2.549016237258911, "learning_rate": 3.1855519499129293e-06, "loss": 0.7024, "num_input_tokens_seen": 33071816, "step": 57325 }, { "epoch": 8.53887399463807, "grad_norm": 2.3624026775360107, "learning_rate": 3.1823781650937328e-06, "loss": 0.5094, "num_input_tokens_seen": 33074760, "step": 57330 }, { "epoch": 8.539618707179029, "grad_norm": 3.40250301361084, "learning_rate": 3.1792058546207174e-06, "loss": 0.6375, "num_input_tokens_seen": 33077928, "step": 57335 }, { "epoch": 8.540363419719988, "grad_norm": 3.320430040359497, "learning_rate": 3.176035018708251e-06, "loss": 0.5286, "num_input_tokens_seen": 33080680, "step": 57340 }, { "epoch": 8.541108132260947, "grad_norm": 2.732187509536743, "learning_rate": 3.1728656575706118e-06, "loss": 0.6711, "num_input_tokens_seen": 33083400, "step": 57345 }, { "epoch": 8.541852844801907, "grad_norm": 2.2441165447235107, "learning_rate": 3.16969777142197e-06, "loss": 0.5042, "num_input_tokens_seen": 33085928, "step": 57350 }, { "epoch": 8.542597557342866, "grad_norm": 3.7095422744750977, "learning_rate": 3.1665313604763937e-06, "loss": 0.611, "num_input_tokens_seen": 33088712, "step": 57355 }, { "epoch": 8.543342269883825, "grad_norm": 3.8010404109954834, "learning_rate": 3.163366424947864e-06, "loss": 0.4617, "num_input_tokens_seen": 33091432, "step": 57360 }, { "epoch": 8.544086982424783, "grad_norm": 3.309274435043335, "learning_rate": 3.1602029650502463e-06, "loss": 0.5401, "num_input_tokens_seen": 33094056, "step": 57365 }, { "epoch": 8.544831694965744, "grad_norm": 3.1597986221313477, "learning_rate": 3.1570409809973165e-06, "loss": 0.7085, "num_input_tokens_seen": 33096968, "step": 57370 }, { "epoch": 8.545576407506703, "grad_norm": 2.822153329849243, "learning_rate": 3.153880473002752e-06, "loss": 0.4998, "num_input_tokens_seen": 33099976, "step": 57375 }, { "epoch": 8.546321120047661, "grad_norm": 3.566251754760742, "learning_rate": 3.1507214412801243e-06, "loss": 0.5827, "num_input_tokens_seen": 33102920, "step": 57380 }, { "epoch": 8.54706583258862, "grad_norm": 4.498024940490723, "learning_rate": 3.1475638860429147e-06, "loss": 0.6504, "num_input_tokens_seen": 33106088, "step": 57385 }, { "epoch": 8.54781054512958, "grad_norm": 2.3260724544525146, "learning_rate": 3.1444078075044873e-06, "loss": 0.4726, "num_input_tokens_seen": 33109000, "step": 57390 }, { "epoch": 8.54855525767054, "grad_norm": 3.497612953186035, "learning_rate": 3.1412532058781198e-06, "loss": 0.6082, "num_input_tokens_seen": 33111848, "step": 57395 }, { "epoch": 8.549299970211498, "grad_norm": 2.402376890182495, "learning_rate": 3.138100081376996e-06, "loss": 0.7675, "num_input_tokens_seen": 33115880, "step": 57400 }, { "epoch": 8.550044682752457, "grad_norm": 3.0429980754852295, "learning_rate": 3.13494843421418e-06, "loss": 0.4943, "num_input_tokens_seen": 33118280, "step": 57405 }, { "epoch": 8.550789395293418, "grad_norm": 2.92887544631958, "learning_rate": 3.1317982646026507e-06, "loss": 0.6077, "num_input_tokens_seen": 33120840, "step": 57410 }, { "epoch": 8.551534107834376, "grad_norm": 5.405531406402588, "learning_rate": 3.128649572755285e-06, "loss": 0.4602, "num_input_tokens_seen": 33123464, "step": 57415 }, { "epoch": 8.552278820375335, "grad_norm": 3.3478333950042725, "learning_rate": 3.125502358884866e-06, "loss": 0.526, "num_input_tokens_seen": 33126632, "step": 57420 }, { "epoch": 8.553023532916294, "grad_norm": 2.7305238246917725, "learning_rate": 3.1223566232040564e-06, "loss": 0.7866, "num_input_tokens_seen": 33129544, "step": 57425 }, { "epoch": 8.553768245457253, "grad_norm": 2.534444808959961, "learning_rate": 3.1192123659254364e-06, "loss": 0.5051, "num_input_tokens_seen": 33132488, "step": 57430 }, { "epoch": 8.554512957998213, "grad_norm": 1.7515590190887451, "learning_rate": 3.116069587261486e-06, "loss": 0.5657, "num_input_tokens_seen": 33135368, "step": 57435 }, { "epoch": 8.555257670539172, "grad_norm": 1.7944923639297485, "learning_rate": 3.1129282874245826e-06, "loss": 0.6346, "num_input_tokens_seen": 33138184, "step": 57440 }, { "epoch": 8.55600238308013, "grad_norm": 2.568990468978882, "learning_rate": 3.109788466626995e-06, "loss": 0.5766, "num_input_tokens_seen": 33141320, "step": 57445 }, { "epoch": 8.556747095621091, "grad_norm": 2.467363119125366, "learning_rate": 3.106650125080904e-06, "loss": 0.7699, "num_input_tokens_seen": 33144040, "step": 57450 }, { "epoch": 8.55749180816205, "grad_norm": 2.714576244354248, "learning_rate": 3.103513262998392e-06, "loss": 0.5536, "num_input_tokens_seen": 33146632, "step": 57455 }, { "epoch": 8.558236520703009, "grad_norm": 1.3307009935379028, "learning_rate": 3.1003778805914207e-06, "loss": 0.4219, "num_input_tokens_seen": 33149352, "step": 57460 }, { "epoch": 8.558981233243967, "grad_norm": 3.5827317237854004, "learning_rate": 3.0972439780718786e-06, "loss": 0.5636, "num_input_tokens_seen": 33152232, "step": 57465 }, { "epoch": 8.559725945784926, "grad_norm": 5.898324489593506, "learning_rate": 3.0941115556515355e-06, "loss": 0.5357, "num_input_tokens_seen": 33154984, "step": 57470 }, { "epoch": 8.560470658325887, "grad_norm": 3.377955675125122, "learning_rate": 3.0909806135420714e-06, "loss": 0.4958, "num_input_tokens_seen": 33157928, "step": 57475 }, { "epoch": 8.561215370866845, "grad_norm": 2.1578099727630615, "learning_rate": 3.0878511519550623e-06, "loss": 0.4661, "num_input_tokens_seen": 33160968, "step": 57480 }, { "epoch": 8.561960083407804, "grad_norm": 1.8792279958724976, "learning_rate": 3.0847231711019884e-06, "loss": 0.5498, "num_input_tokens_seen": 33163688, "step": 57485 }, { "epoch": 8.562704795948763, "grad_norm": 3.18418288230896, "learning_rate": 3.0815966711942227e-06, "loss": 0.5424, "num_input_tokens_seen": 33166792, "step": 57490 }, { "epoch": 8.563449508489724, "grad_norm": 2.0087716579437256, "learning_rate": 3.078471652443035e-06, "loss": 0.4926, "num_input_tokens_seen": 33169576, "step": 57495 }, { "epoch": 8.564194221030682, "grad_norm": 4.022969722747803, "learning_rate": 3.0753481150596038e-06, "loss": 0.5596, "num_input_tokens_seen": 33172584, "step": 57500 }, { "epoch": 8.564938933571641, "grad_norm": 3.492375373840332, "learning_rate": 3.072226059255012e-06, "loss": 0.4476, "num_input_tokens_seen": 33175464, "step": 57505 }, { "epoch": 8.5656836461126, "grad_norm": 2.1955552101135254, "learning_rate": 3.0691054852402286e-06, "loss": 0.5508, "num_input_tokens_seen": 33178248, "step": 57510 }, { "epoch": 8.56642835865356, "grad_norm": 2.0389583110809326, "learning_rate": 3.065986393226139e-06, "loss": 0.613, "num_input_tokens_seen": 33181448, "step": 57515 }, { "epoch": 8.567173071194519, "grad_norm": 2.663233518600464, "learning_rate": 3.0628687834235032e-06, "loss": 0.552, "num_input_tokens_seen": 33184040, "step": 57520 }, { "epoch": 8.567917783735478, "grad_norm": 2.982154130935669, "learning_rate": 3.0597526560430133e-06, "loss": 0.5325, "num_input_tokens_seen": 33186984, "step": 57525 }, { "epoch": 8.568662496276437, "grad_norm": 3.1703245639801025, "learning_rate": 3.056638011295229e-06, "loss": 0.6949, "num_input_tokens_seen": 33189896, "step": 57530 }, { "epoch": 8.569407208817397, "grad_norm": 2.6785190105438232, "learning_rate": 3.053524849390635e-06, "loss": 0.7075, "num_input_tokens_seen": 33192680, "step": 57535 }, { "epoch": 8.570151921358356, "grad_norm": 2.47963285446167, "learning_rate": 3.050413170539604e-06, "loss": 0.6366, "num_input_tokens_seen": 33195368, "step": 57540 }, { "epoch": 8.570896633899315, "grad_norm": 3.678492307662964, "learning_rate": 3.0473029749524094e-06, "loss": 0.4962, "num_input_tokens_seen": 33198280, "step": 57545 }, { "epoch": 8.571641346440273, "grad_norm": 2.635998010635376, "learning_rate": 3.044194262839231e-06, "loss": 0.5384, "num_input_tokens_seen": 33201352, "step": 57550 }, { "epoch": 8.572386058981234, "grad_norm": 2.180222988128662, "learning_rate": 3.041087034410134e-06, "loss": 0.6853, "num_input_tokens_seen": 33204040, "step": 57555 }, { "epoch": 8.573130771522193, "grad_norm": 8.257237434387207, "learning_rate": 3.037981289875097e-06, "loss": 0.4663, "num_input_tokens_seen": 33207016, "step": 57560 }, { "epoch": 8.573875484063151, "grad_norm": 2.5639729499816895, "learning_rate": 3.0348770294439973e-06, "loss": 0.7198, "num_input_tokens_seen": 33209832, "step": 57565 }, { "epoch": 8.57462019660411, "grad_norm": 6.065943241119385, "learning_rate": 3.0317742533266024e-06, "loss": 0.7682, "num_input_tokens_seen": 33212328, "step": 57570 }, { "epoch": 8.57536490914507, "grad_norm": 2.726855993270874, "learning_rate": 3.0286729617325844e-06, "loss": 0.5415, "num_input_tokens_seen": 33215240, "step": 57575 }, { "epoch": 8.57610962168603, "grad_norm": 4.756643772125244, "learning_rate": 3.0255731548715195e-06, "loss": 0.831, "num_input_tokens_seen": 33218152, "step": 57580 }, { "epoch": 8.576854334226988, "grad_norm": 3.329982280731201, "learning_rate": 3.0224748329528846e-06, "loss": 0.4309, "num_input_tokens_seen": 33220872, "step": 57585 }, { "epoch": 8.577599046767947, "grad_norm": 2.4317195415496826, "learning_rate": 3.0193779961860403e-06, "loss": 0.6425, "num_input_tokens_seen": 33224008, "step": 57590 }, { "epoch": 8.578343759308908, "grad_norm": 3.2093186378479004, "learning_rate": 3.0162826447802634e-06, "loss": 0.6233, "num_input_tokens_seen": 33226888, "step": 57595 }, { "epoch": 8.579088471849866, "grad_norm": 1.8523977994918823, "learning_rate": 3.0131887789447284e-06, "loss": 0.5934, "num_input_tokens_seen": 33229736, "step": 57600 }, { "epoch": 8.579833184390825, "grad_norm": 2.167398452758789, "learning_rate": 3.0100963988885067e-06, "loss": 0.5136, "num_input_tokens_seen": 33232808, "step": 57605 }, { "epoch": 8.580577896931784, "grad_norm": 2.375786304473877, "learning_rate": 3.0070055048205647e-06, "loss": 0.4329, "num_input_tokens_seen": 33235784, "step": 57610 }, { "epoch": 8.581322609472743, "grad_norm": 3.215226411819458, "learning_rate": 3.003916096949769e-06, "loss": 0.5747, "num_input_tokens_seen": 33238696, "step": 57615 }, { "epoch": 8.582067322013703, "grad_norm": 2.0429515838623047, "learning_rate": 3.0008281754849018e-06, "loss": 0.4848, "num_input_tokens_seen": 33241448, "step": 57620 }, { "epoch": 8.582812034554662, "grad_norm": 3.271481990814209, "learning_rate": 2.9977417406346186e-06, "loss": 0.4696, "num_input_tokens_seen": 33244296, "step": 57625 }, { "epoch": 8.58355674709562, "grad_norm": 4.85830545425415, "learning_rate": 2.994656792607495e-06, "loss": 0.4747, "num_input_tokens_seen": 33247176, "step": 57630 }, { "epoch": 8.584301459636581, "grad_norm": 3.770620346069336, "learning_rate": 2.9915733316119963e-06, "loss": 0.5396, "num_input_tokens_seen": 33250088, "step": 57635 }, { "epoch": 8.58504617217754, "grad_norm": 3.7741734981536865, "learning_rate": 2.988491357856493e-06, "loss": 0.5781, "num_input_tokens_seen": 33253192, "step": 57640 }, { "epoch": 8.585790884718499, "grad_norm": 2.422229290008545, "learning_rate": 2.9854108715492572e-06, "loss": 0.3667, "num_input_tokens_seen": 33255880, "step": 57645 }, { "epoch": 8.586535597259457, "grad_norm": 2.706602096557617, "learning_rate": 2.9823318728984447e-06, "loss": 0.7268, "num_input_tokens_seen": 33258824, "step": 57650 }, { "epoch": 8.587280309800416, "grad_norm": 0.9437052607536316, "learning_rate": 2.97925436211213e-06, "loss": 0.4184, "num_input_tokens_seen": 33261704, "step": 57655 }, { "epoch": 8.588025022341377, "grad_norm": 2.141237735748291, "learning_rate": 2.9761783393982722e-06, "loss": 0.4249, "num_input_tokens_seen": 33264872, "step": 57660 }, { "epoch": 8.588769734882336, "grad_norm": 2.1251261234283447, "learning_rate": 2.9731038049647385e-06, "loss": 0.5969, "num_input_tokens_seen": 33267944, "step": 57665 }, { "epoch": 8.589514447423294, "grad_norm": 2.1195247173309326, "learning_rate": 2.970030759019296e-06, "loss": 0.524, "num_input_tokens_seen": 33271144, "step": 57670 }, { "epoch": 8.590259159964253, "grad_norm": 3.79270601272583, "learning_rate": 2.966959201769609e-06, "loss": 0.5452, "num_input_tokens_seen": 33273672, "step": 57675 }, { "epoch": 8.591003872505214, "grad_norm": 1.2008695602416992, "learning_rate": 2.963889133423242e-06, "loss": 0.4919, "num_input_tokens_seen": 33276584, "step": 57680 }, { "epoch": 8.591748585046172, "grad_norm": 1.9990417957305908, "learning_rate": 2.9608205541876516e-06, "loss": 0.4128, "num_input_tokens_seen": 33279304, "step": 57685 }, { "epoch": 8.592493297587131, "grad_norm": 3.289067029953003, "learning_rate": 2.957753464270208e-06, "loss": 0.6988, "num_input_tokens_seen": 33282088, "step": 57690 }, { "epoch": 8.59323801012809, "grad_norm": 1.8115015029907227, "learning_rate": 2.954687863878164e-06, "loss": 0.454, "num_input_tokens_seen": 33285128, "step": 57695 }, { "epoch": 8.59398272266905, "grad_norm": 2.364335298538208, "learning_rate": 2.9516237532186826e-06, "loss": 0.4647, "num_input_tokens_seen": 33288040, "step": 57700 }, { "epoch": 8.59472743521001, "grad_norm": 3.727750062942505, "learning_rate": 2.9485611324988254e-06, "loss": 0.5734, "num_input_tokens_seen": 33290824, "step": 57705 }, { "epoch": 8.595472147750968, "grad_norm": 6.871516704559326, "learning_rate": 2.9455000019255524e-06, "loss": 0.7145, "num_input_tokens_seen": 33293960, "step": 57710 }, { "epoch": 8.596216860291927, "grad_norm": 4.68003511428833, "learning_rate": 2.9424403617057285e-06, "loss": 0.5609, "num_input_tokens_seen": 33296712, "step": 57715 }, { "epoch": 8.596961572832887, "grad_norm": 2.803802490234375, "learning_rate": 2.939382212046099e-06, "loss": 0.7493, "num_input_tokens_seen": 33299560, "step": 57720 }, { "epoch": 8.597706285373846, "grad_norm": 3.578791618347168, "learning_rate": 2.936325553153335e-06, "loss": 0.6339, "num_input_tokens_seen": 33302600, "step": 57725 }, { "epoch": 8.598450997914805, "grad_norm": 2.258981466293335, "learning_rate": 2.9332703852339797e-06, "loss": 0.567, "num_input_tokens_seen": 33305480, "step": 57730 }, { "epoch": 8.599195710455763, "grad_norm": 3.0450985431671143, "learning_rate": 2.930216708494493e-06, "loss": 0.5646, "num_input_tokens_seen": 33308552, "step": 57735 }, { "epoch": 8.599940422996724, "grad_norm": 2.073277473449707, "learning_rate": 2.927164523141235e-06, "loss": 0.4831, "num_input_tokens_seen": 33311208, "step": 57740 }, { "epoch": 8.600685135537683, "grad_norm": 6.218938827514648, "learning_rate": 2.9241138293804565e-06, "loss": 0.6666, "num_input_tokens_seen": 33313832, "step": 57745 }, { "epoch": 8.601429848078642, "grad_norm": 3.2701029777526855, "learning_rate": 2.9210646274183157e-06, "loss": 0.4806, "num_input_tokens_seen": 33316840, "step": 57750 }, { "epoch": 8.6021745606196, "grad_norm": 2.871166229248047, "learning_rate": 2.9180169174608555e-06, "loss": 0.6287, "num_input_tokens_seen": 33319784, "step": 57755 }, { "epoch": 8.60291927316056, "grad_norm": 2.2220702171325684, "learning_rate": 2.9149706997140316e-06, "loss": 0.5423, "num_input_tokens_seen": 33322632, "step": 57760 }, { "epoch": 8.60366398570152, "grad_norm": 3.691065549850464, "learning_rate": 2.911925974383703e-06, "loss": 0.4092, "num_input_tokens_seen": 33325480, "step": 57765 }, { "epoch": 8.604408698242478, "grad_norm": 3.272799491882324, "learning_rate": 2.908882741675609e-06, "loss": 0.5366, "num_input_tokens_seen": 33328392, "step": 57770 }, { "epoch": 8.605153410783437, "grad_norm": 7.636669158935547, "learning_rate": 2.9058410017954035e-06, "loss": 0.6161, "num_input_tokens_seen": 33331144, "step": 57775 }, { "epoch": 8.605898123324398, "grad_norm": 4.8481526374816895, "learning_rate": 2.902800754948634e-06, "loss": 0.5416, "num_input_tokens_seen": 33334344, "step": 57780 }, { "epoch": 8.606642835865356, "grad_norm": 3.0099308490753174, "learning_rate": 2.8997620013407557e-06, "loss": 0.5045, "num_input_tokens_seen": 33337192, "step": 57785 }, { "epoch": 8.607387548406315, "grad_norm": 2.3598499298095703, "learning_rate": 2.896724741177101e-06, "loss": 0.4827, "num_input_tokens_seen": 33339912, "step": 57790 }, { "epoch": 8.608132260947274, "grad_norm": 1.0265429019927979, "learning_rate": 2.893688974662925e-06, "loss": 0.5274, "num_input_tokens_seen": 33342600, "step": 57795 }, { "epoch": 8.608876973488233, "grad_norm": 3.7791028022766113, "learning_rate": 2.8906547020033703e-06, "loss": 0.5217, "num_input_tokens_seen": 33345448, "step": 57800 }, { "epoch": 8.609621686029193, "grad_norm": 2.3070971965789795, "learning_rate": 2.887621923403483e-06, "loss": 0.5912, "num_input_tokens_seen": 33348360, "step": 57805 }, { "epoch": 8.610366398570152, "grad_norm": 9.525259017944336, "learning_rate": 2.884590639068202e-06, "loss": 0.6344, "num_input_tokens_seen": 33351400, "step": 57810 }, { "epoch": 8.61111111111111, "grad_norm": 3.4114294052124023, "learning_rate": 2.8815608492023696e-06, "loss": 0.6486, "num_input_tokens_seen": 33354152, "step": 57815 }, { "epoch": 8.61185582365207, "grad_norm": 1.847887396812439, "learning_rate": 2.878532554010732e-06, "loss": 0.6242, "num_input_tokens_seen": 33357256, "step": 57820 }, { "epoch": 8.61260053619303, "grad_norm": 3.61983323097229, "learning_rate": 2.875505753697921e-06, "loss": 0.3968, "num_input_tokens_seen": 33360072, "step": 57825 }, { "epoch": 8.613345248733989, "grad_norm": 3.179220676422119, "learning_rate": 2.8724804484684785e-06, "loss": 0.6206, "num_input_tokens_seen": 33362888, "step": 57830 }, { "epoch": 8.614089961274948, "grad_norm": 3.927982807159424, "learning_rate": 2.8694566385268463e-06, "loss": 0.6522, "num_input_tokens_seen": 33365640, "step": 57835 }, { "epoch": 8.614834673815906, "grad_norm": 2.6998496055603027, "learning_rate": 2.866434324077355e-06, "loss": 0.4698, "num_input_tokens_seen": 33368648, "step": 57840 }, { "epoch": 8.615579386356867, "grad_norm": 3.18363881111145, "learning_rate": 2.86341350532425e-06, "loss": 0.8099, "num_input_tokens_seen": 33371464, "step": 57845 }, { "epoch": 8.616324098897826, "grad_norm": 5.49315071105957, "learning_rate": 2.8603941824716542e-06, "loss": 0.6473, "num_input_tokens_seen": 33374280, "step": 57850 }, { "epoch": 8.617068811438784, "grad_norm": 2.572402000427246, "learning_rate": 2.857376355723612e-06, "loss": 0.5464, "num_input_tokens_seen": 33377096, "step": 57855 }, { "epoch": 8.617813523979743, "grad_norm": 1.9270665645599365, "learning_rate": 2.8543600252840448e-06, "loss": 0.4931, "num_input_tokens_seen": 33380008, "step": 57860 }, { "epoch": 8.618558236520704, "grad_norm": 3.159860849380493, "learning_rate": 2.8513451913567883e-06, "loss": 0.6516, "num_input_tokens_seen": 33382888, "step": 57865 }, { "epoch": 8.619302949061662, "grad_norm": 2.0731797218322754, "learning_rate": 2.848331854145575e-06, "loss": 0.5269, "num_input_tokens_seen": 33385896, "step": 57870 }, { "epoch": 8.620047661602621, "grad_norm": 4.28345251083374, "learning_rate": 2.845320013854033e-06, "loss": 0.6146, "num_input_tokens_seen": 33388744, "step": 57875 }, { "epoch": 8.62079237414358, "grad_norm": 6.8399882316589355, "learning_rate": 2.8423096706856973e-06, "loss": 0.6217, "num_input_tokens_seen": 33391592, "step": 57880 }, { "epoch": 8.62153708668454, "grad_norm": 4.606821537017822, "learning_rate": 2.839300824843985e-06, "loss": 0.442, "num_input_tokens_seen": 33394600, "step": 57885 }, { "epoch": 8.6222817992255, "grad_norm": 1.9592722654342651, "learning_rate": 2.8362934765322174e-06, "loss": 0.74, "num_input_tokens_seen": 33397448, "step": 57890 }, { "epoch": 8.623026511766458, "grad_norm": 6.925240516662598, "learning_rate": 2.833287625953629e-06, "loss": 0.8691, "num_input_tokens_seen": 33400392, "step": 57895 }, { "epoch": 8.623771224307417, "grad_norm": 3.1858060359954834, "learning_rate": 2.8302832733113376e-06, "loss": 0.5966, "num_input_tokens_seen": 33403432, "step": 57900 }, { "epoch": 8.624515936848377, "grad_norm": 6.971468448638916, "learning_rate": 2.8272804188083675e-06, "loss": 0.5463, "num_input_tokens_seen": 33406312, "step": 57905 }, { "epoch": 8.625260649389336, "grad_norm": 2.434041976928711, "learning_rate": 2.824279062647639e-06, "loss": 0.7213, "num_input_tokens_seen": 33409256, "step": 57910 }, { "epoch": 8.626005361930295, "grad_norm": 3.875967025756836, "learning_rate": 2.8212792050319766e-06, "loss": 0.546, "num_input_tokens_seen": 33412296, "step": 57915 }, { "epoch": 8.626750074471254, "grad_norm": 8.867193222045898, "learning_rate": 2.8182808461640897e-06, "loss": 0.7283, "num_input_tokens_seen": 33415112, "step": 57920 }, { "epoch": 8.627494787012214, "grad_norm": 1.8277970552444458, "learning_rate": 2.8152839862466027e-06, "loss": 0.5314, "num_input_tokens_seen": 33417704, "step": 57925 }, { "epoch": 8.628239499553173, "grad_norm": 6.576813220977783, "learning_rate": 2.812288625482021e-06, "loss": 0.625, "num_input_tokens_seen": 33420648, "step": 57930 }, { "epoch": 8.628984212094132, "grad_norm": 2.0211687088012695, "learning_rate": 2.8092947640727673e-06, "loss": 0.463, "num_input_tokens_seen": 33423592, "step": 57935 }, { "epoch": 8.62972892463509, "grad_norm": 2.7979252338409424, "learning_rate": 2.8063024022211533e-06, "loss": 0.5458, "num_input_tokens_seen": 33426376, "step": 57940 }, { "epoch": 8.63047363717605, "grad_norm": 2.069124221801758, "learning_rate": 2.8033115401293884e-06, "loss": 0.4925, "num_input_tokens_seen": 33429864, "step": 57945 }, { "epoch": 8.63121834971701, "grad_norm": 1.9198079109191895, "learning_rate": 2.80032217799959e-06, "loss": 0.5855, "num_input_tokens_seen": 33433000, "step": 57950 }, { "epoch": 8.631963062257968, "grad_norm": 2.0481176376342773, "learning_rate": 2.7973343160337562e-06, "loss": 0.5094, "num_input_tokens_seen": 33435752, "step": 57955 }, { "epoch": 8.632707774798927, "grad_norm": 2.010653495788574, "learning_rate": 2.7943479544337988e-06, "loss": 0.5309, "num_input_tokens_seen": 33438216, "step": 57960 }, { "epoch": 8.633452487339888, "grad_norm": 2.813694715499878, "learning_rate": 2.7913630934015304e-06, "loss": 0.6175, "num_input_tokens_seen": 33441128, "step": 57965 }, { "epoch": 8.634197199880846, "grad_norm": 3.130594253540039, "learning_rate": 2.7883797331386465e-06, "loss": 0.6701, "num_input_tokens_seen": 33444008, "step": 57970 }, { "epoch": 8.634941912421805, "grad_norm": 4.634022235870361, "learning_rate": 2.785397873846754e-06, "loss": 0.4347, "num_input_tokens_seen": 33446696, "step": 57975 }, { "epoch": 8.635686624962764, "grad_norm": 5.528112888336182, "learning_rate": 2.7824175157273564e-06, "loss": 0.4007, "num_input_tokens_seen": 33449640, "step": 57980 }, { "epoch": 8.636431337503723, "grad_norm": 3.3409862518310547, "learning_rate": 2.779438658981856e-06, "loss": 0.6477, "num_input_tokens_seen": 33452456, "step": 57985 }, { "epoch": 8.637176050044683, "grad_norm": 3.9537465572357178, "learning_rate": 2.776461303811545e-06, "loss": 0.7168, "num_input_tokens_seen": 33455144, "step": 57990 }, { "epoch": 8.637920762585642, "grad_norm": 1.3546442985534668, "learning_rate": 2.7734854504176234e-06, "loss": 0.4443, "num_input_tokens_seen": 33457832, "step": 57995 }, { "epoch": 8.6386654751266, "grad_norm": 3.022132635116577, "learning_rate": 2.770511099001191e-06, "loss": 0.5302, "num_input_tokens_seen": 33460456, "step": 58000 }, { "epoch": 8.63941018766756, "grad_norm": 2.970144748687744, "learning_rate": 2.7675382497632435e-06, "loss": 0.6602, "num_input_tokens_seen": 33463336, "step": 58005 }, { "epoch": 8.64015490020852, "grad_norm": 4.662222862243652, "learning_rate": 2.764566902904664e-06, "loss": 0.7604, "num_input_tokens_seen": 33466248, "step": 58010 }, { "epoch": 8.640899612749479, "grad_norm": 5.530278205871582, "learning_rate": 2.761597058626253e-06, "loss": 0.5805, "num_input_tokens_seen": 33469224, "step": 58015 }, { "epoch": 8.641644325290438, "grad_norm": 2.7321407794952393, "learning_rate": 2.758628717128703e-06, "loss": 0.7127, "num_input_tokens_seen": 33472488, "step": 58020 }, { "epoch": 8.642389037831396, "grad_norm": 3.3963096141815186, "learning_rate": 2.755661878612592e-06, "loss": 0.5744, "num_input_tokens_seen": 33475304, "step": 58025 }, { "epoch": 8.643133750372357, "grad_norm": 2.879204511642456, "learning_rate": 2.75269654327841e-06, "loss": 0.4375, "num_input_tokens_seen": 33478152, "step": 58030 }, { "epoch": 8.643878462913316, "grad_norm": 3.016495943069458, "learning_rate": 2.749732711326547e-06, "loss": 0.5747, "num_input_tokens_seen": 33481384, "step": 58035 }, { "epoch": 8.644623175454274, "grad_norm": 3.43906569480896, "learning_rate": 2.7467703829572836e-06, "loss": 0.4799, "num_input_tokens_seen": 33484200, "step": 58040 }, { "epoch": 8.645367887995233, "grad_norm": 3.363914728164673, "learning_rate": 2.7438095583708078e-06, "loss": 0.7116, "num_input_tokens_seen": 33487144, "step": 58045 }, { "epoch": 8.646112600536194, "grad_norm": 3.6740715503692627, "learning_rate": 2.740850237767195e-06, "loss": 0.5643, "num_input_tokens_seen": 33490152, "step": 58050 }, { "epoch": 8.646857313077152, "grad_norm": 4.049767017364502, "learning_rate": 2.737892421346419e-06, "loss": 0.6686, "num_input_tokens_seen": 33492776, "step": 58055 }, { "epoch": 8.647602025618111, "grad_norm": 1.7443175315856934, "learning_rate": 2.7349361093083643e-06, "loss": 0.3075, "num_input_tokens_seen": 33495592, "step": 58060 }, { "epoch": 8.64834673815907, "grad_norm": 2.2366576194763184, "learning_rate": 2.7319813018528013e-06, "loss": 0.5095, "num_input_tokens_seen": 33498440, "step": 58065 }, { "epoch": 8.64909145070003, "grad_norm": 1.4872533082962036, "learning_rate": 2.7290279991794067e-06, "loss": 0.6415, "num_input_tokens_seen": 33501576, "step": 58070 }, { "epoch": 8.64983616324099, "grad_norm": 2.485776662826538, "learning_rate": 2.7260762014877538e-06, "loss": 0.5876, "num_input_tokens_seen": 33504328, "step": 58075 }, { "epoch": 8.650580875781948, "grad_norm": 7.870217323303223, "learning_rate": 2.723125908977317e-06, "loss": 0.7629, "num_input_tokens_seen": 33507496, "step": 58080 }, { "epoch": 8.651325588322907, "grad_norm": 5.203436374664307, "learning_rate": 2.7201771218474558e-06, "loss": 0.7148, "num_input_tokens_seen": 33510312, "step": 58085 }, { "epoch": 8.652070300863867, "grad_norm": 2.916572332382202, "learning_rate": 2.7172298402974443e-06, "loss": 0.5492, "num_input_tokens_seen": 33512968, "step": 58090 }, { "epoch": 8.652815013404826, "grad_norm": 2.450446605682373, "learning_rate": 2.7142840645264426e-06, "loss": 0.513, "num_input_tokens_seen": 33515912, "step": 58095 }, { "epoch": 8.653559725945785, "grad_norm": 3.543724536895752, "learning_rate": 2.711339794733517e-06, "loss": 0.5717, "num_input_tokens_seen": 33518504, "step": 58100 }, { "epoch": 8.654304438486744, "grad_norm": 12.836129188537598, "learning_rate": 2.7083970311176267e-06, "loss": 0.6038, "num_input_tokens_seen": 33521256, "step": 58105 }, { "epoch": 8.655049151027704, "grad_norm": 3.63364839553833, "learning_rate": 2.7054557738776356e-06, "loss": 0.4729, "num_input_tokens_seen": 33524136, "step": 58110 }, { "epoch": 8.655793863568663, "grad_norm": 1.9825025796890259, "learning_rate": 2.702516023212304e-06, "loss": 0.6369, "num_input_tokens_seen": 33526792, "step": 58115 }, { "epoch": 8.656538576109622, "grad_norm": 2.766939401626587, "learning_rate": 2.699577779320278e-06, "loss": 0.6828, "num_input_tokens_seen": 33529416, "step": 58120 }, { "epoch": 8.65728328865058, "grad_norm": 2.5760083198547363, "learning_rate": 2.696641042400122e-06, "loss": 0.631, "num_input_tokens_seen": 33532296, "step": 58125 }, { "epoch": 8.65802800119154, "grad_norm": 4.449044227600098, "learning_rate": 2.6937058126502905e-06, "loss": 0.5981, "num_input_tokens_seen": 33535016, "step": 58130 }, { "epoch": 8.6587727137325, "grad_norm": 1.6306301355361938, "learning_rate": 2.6907720902691226e-06, "loss": 0.7053, "num_input_tokens_seen": 33537960, "step": 58135 }, { "epoch": 8.659517426273458, "grad_norm": 2.973724126815796, "learning_rate": 2.6878398754548756e-06, "loss": 0.616, "num_input_tokens_seen": 33540968, "step": 58140 }, { "epoch": 8.660262138814417, "grad_norm": 5.2813591957092285, "learning_rate": 2.684909168405694e-06, "loss": 0.5923, "num_input_tokens_seen": 33543912, "step": 58145 }, { "epoch": 8.661006851355378, "grad_norm": 5.368340969085693, "learning_rate": 2.6819799693196283e-06, "loss": 0.7845, "num_input_tokens_seen": 33546856, "step": 58150 }, { "epoch": 8.661751563896336, "grad_norm": 1.7995901107788086, "learning_rate": 2.6790522783946142e-06, "loss": 0.5353, "num_input_tokens_seen": 33549512, "step": 58155 }, { "epoch": 8.662496276437295, "grad_norm": 2.3702526092529297, "learning_rate": 2.676126095828496e-06, "loss": 0.4589, "num_input_tokens_seen": 33552392, "step": 58160 }, { "epoch": 8.663240988978254, "grad_norm": 3.3518362045288086, "learning_rate": 2.673201421819016e-06, "loss": 0.6448, "num_input_tokens_seen": 33555304, "step": 58165 }, { "epoch": 8.663985701519213, "grad_norm": 3.4639039039611816, "learning_rate": 2.670278256563813e-06, "loss": 0.6742, "num_input_tokens_seen": 33558312, "step": 58170 }, { "epoch": 8.664730414060173, "grad_norm": 2.531700849533081, "learning_rate": 2.667356600260415e-06, "loss": 0.4919, "num_input_tokens_seen": 33561288, "step": 58175 }, { "epoch": 8.665475126601132, "grad_norm": 1.1647706031799316, "learning_rate": 2.664436453106259e-06, "loss": 0.4181, "num_input_tokens_seen": 33564200, "step": 58180 }, { "epoch": 8.66621983914209, "grad_norm": 1.8097834587097168, "learning_rate": 2.6615178152986835e-06, "loss": 0.5092, "num_input_tokens_seen": 33567112, "step": 58185 }, { "epoch": 8.66696455168305, "grad_norm": 3.0170469284057617, "learning_rate": 2.6586006870349095e-06, "loss": 0.5531, "num_input_tokens_seen": 33570184, "step": 58190 }, { "epoch": 8.66770926422401, "grad_norm": 4.2949934005737305, "learning_rate": 2.6556850685120648e-06, "loss": 0.5095, "num_input_tokens_seen": 33573096, "step": 58195 }, { "epoch": 8.668453976764969, "grad_norm": 1.9421484470367432, "learning_rate": 2.6527709599271784e-06, "loss": 0.4702, "num_input_tokens_seen": 33575912, "step": 58200 }, { "epoch": 8.669198689305928, "grad_norm": 2.225684404373169, "learning_rate": 2.649858361477173e-06, "loss": 0.5292, "num_input_tokens_seen": 33578952, "step": 58205 }, { "epoch": 8.669943401846886, "grad_norm": 4.039668083190918, "learning_rate": 2.6469472733588767e-06, "loss": 0.5859, "num_input_tokens_seen": 33581736, "step": 58210 }, { "epoch": 8.670688114387847, "grad_norm": 5.1666951179504395, "learning_rate": 2.6440376957690026e-06, "loss": 0.5332, "num_input_tokens_seen": 33584648, "step": 58215 }, { "epoch": 8.671432826928806, "grad_norm": 2.3764841556549072, "learning_rate": 2.6411296289041627e-06, "loss": 0.6165, "num_input_tokens_seen": 33587752, "step": 58220 }, { "epoch": 8.672177539469764, "grad_norm": 3.7256033420562744, "learning_rate": 2.638223072960877e-06, "loss": 0.3511, "num_input_tokens_seen": 33590888, "step": 58225 }, { "epoch": 8.672922252010723, "grad_norm": 3.043877124786377, "learning_rate": 2.635318028135561e-06, "loss": 0.413, "num_input_tokens_seen": 33593864, "step": 58230 }, { "epoch": 8.673666964551684, "grad_norm": 1.0503685474395752, "learning_rate": 2.6324144946245244e-06, "loss": 0.4797, "num_input_tokens_seen": 33596968, "step": 58235 }, { "epoch": 8.674411677092642, "grad_norm": 1.9898016452789307, "learning_rate": 2.629512472623974e-06, "loss": 0.4252, "num_input_tokens_seen": 33599592, "step": 58240 }, { "epoch": 8.675156389633601, "grad_norm": 2.038665771484375, "learning_rate": 2.6266119623300277e-06, "loss": 0.5287, "num_input_tokens_seen": 33602664, "step": 58245 }, { "epoch": 8.67590110217456, "grad_norm": 4.114008903503418, "learning_rate": 2.6237129639386795e-06, "loss": 0.414, "num_input_tokens_seen": 33605416, "step": 58250 }, { "epoch": 8.67664581471552, "grad_norm": 3.653140068054199, "learning_rate": 2.620815477645827e-06, "loss": 0.6351, "num_input_tokens_seen": 33608200, "step": 58255 }, { "epoch": 8.67739052725648, "grad_norm": 1.8865985870361328, "learning_rate": 2.6179195036472815e-06, "loss": 0.57, "num_input_tokens_seen": 33611112, "step": 58260 }, { "epoch": 8.678135239797438, "grad_norm": 4.496304988861084, "learning_rate": 2.615025042138733e-06, "loss": 0.6186, "num_input_tokens_seen": 33613928, "step": 58265 }, { "epoch": 8.678879952338397, "grad_norm": 2.848456621170044, "learning_rate": 2.6121320933157834e-06, "loss": 0.6053, "num_input_tokens_seen": 33616744, "step": 58270 }, { "epoch": 8.679624664879357, "grad_norm": 1.9879471063613892, "learning_rate": 2.6092406573739264e-06, "loss": 0.6248, "num_input_tokens_seen": 33619752, "step": 58275 }, { "epoch": 8.680369377420316, "grad_norm": 3.967556953430176, "learning_rate": 2.606350734508553e-06, "loss": 0.5935, "num_input_tokens_seen": 33622632, "step": 58280 }, { "epoch": 8.681114089961275, "grad_norm": 2.0516433715820312, "learning_rate": 2.6034623249149487e-06, "loss": 0.4648, "num_input_tokens_seen": 33625832, "step": 58285 }, { "epoch": 8.681858802502234, "grad_norm": 3.1901631355285645, "learning_rate": 2.6005754287883072e-06, "loss": 0.4444, "num_input_tokens_seen": 33628744, "step": 58290 }, { "epoch": 8.682603515043194, "grad_norm": 2.273125648498535, "learning_rate": 2.597690046323703e-06, "loss": 0.6073, "num_input_tokens_seen": 33631592, "step": 58295 }, { "epoch": 8.683348227584153, "grad_norm": 2.8586485385894775, "learning_rate": 2.594806177716125e-06, "loss": 0.7114, "num_input_tokens_seen": 33634248, "step": 58300 }, { "epoch": 8.684092940125112, "grad_norm": 1.6503618955612183, "learning_rate": 2.5919238231604524e-06, "loss": 0.4478, "num_input_tokens_seen": 33637288, "step": 58305 }, { "epoch": 8.68483765266607, "grad_norm": 2.9141077995300293, "learning_rate": 2.589042982851461e-06, "loss": 0.5302, "num_input_tokens_seen": 33640040, "step": 58310 }, { "epoch": 8.68558236520703, "grad_norm": 4.166372776031494, "learning_rate": 2.5861636569838366e-06, "loss": 0.6035, "num_input_tokens_seen": 33642888, "step": 58315 }, { "epoch": 8.68632707774799, "grad_norm": 2.340266704559326, "learning_rate": 2.583285845752137e-06, "loss": 0.5336, "num_input_tokens_seen": 33645736, "step": 58320 }, { "epoch": 8.687071790288948, "grad_norm": 2.5699844360351562, "learning_rate": 2.580409549350843e-06, "loss": 0.523, "num_input_tokens_seen": 33648360, "step": 58325 }, { "epoch": 8.687816502829907, "grad_norm": 2.283935308456421, "learning_rate": 2.577534767974324e-06, "loss": 0.5249, "num_input_tokens_seen": 33651240, "step": 58330 }, { "epoch": 8.688561215370868, "grad_norm": 1.4032098054885864, "learning_rate": 2.574661501816836e-06, "loss": 0.5059, "num_input_tokens_seen": 33654408, "step": 58335 }, { "epoch": 8.689305927911827, "grad_norm": 2.581254243850708, "learning_rate": 2.5717897510725508e-06, "loss": 0.4219, "num_input_tokens_seen": 33657448, "step": 58340 }, { "epoch": 8.690050640452785, "grad_norm": 3.843994617462158, "learning_rate": 2.568919515935525e-06, "loss": 0.5686, "num_input_tokens_seen": 33660936, "step": 58345 }, { "epoch": 8.690795352993744, "grad_norm": 4.441470146179199, "learning_rate": 2.5660507965997282e-06, "loss": 0.7209, "num_input_tokens_seen": 33663880, "step": 58350 }, { "epoch": 8.691540065534703, "grad_norm": 3.100141763687134, "learning_rate": 2.5631835932590027e-06, "loss": 0.7021, "num_input_tokens_seen": 33666696, "step": 58355 }, { "epoch": 8.692284778075663, "grad_norm": 4.32767391204834, "learning_rate": 2.5603179061071097e-06, "loss": 0.6604, "num_input_tokens_seen": 33669864, "step": 58360 }, { "epoch": 8.693029490616622, "grad_norm": 3.8787894248962402, "learning_rate": 2.5574537353376977e-06, "loss": 0.5142, "num_input_tokens_seen": 33672872, "step": 58365 }, { "epoch": 8.69377420315758, "grad_norm": 5.267733097076416, "learning_rate": 2.5545910811443224e-06, "loss": 0.7004, "num_input_tokens_seen": 33675720, "step": 58370 }, { "epoch": 8.69451891569854, "grad_norm": 1.6280419826507568, "learning_rate": 2.5517299437204214e-06, "loss": 0.4729, "num_input_tokens_seen": 33678536, "step": 58375 }, { "epoch": 8.6952636282395, "grad_norm": 3.571864366531372, "learning_rate": 2.5488703232593474e-06, "loss": 0.6988, "num_input_tokens_seen": 33681224, "step": 58380 }, { "epoch": 8.696008340780459, "grad_norm": 4.671924591064453, "learning_rate": 2.5460122199543328e-06, "loss": 0.4727, "num_input_tokens_seen": 33683912, "step": 58385 }, { "epoch": 8.696753053321418, "grad_norm": 4.28074836730957, "learning_rate": 2.54315563399852e-06, "loss": 0.8465, "num_input_tokens_seen": 33687048, "step": 58390 }, { "epoch": 8.697497765862376, "grad_norm": 2.1742966175079346, "learning_rate": 2.5403005655849464e-06, "loss": 0.5496, "num_input_tokens_seen": 33690216, "step": 58395 }, { "epoch": 8.698242478403337, "grad_norm": 1.9471274614334106, "learning_rate": 2.5374470149065465e-06, "loss": 0.4434, "num_input_tokens_seen": 33693160, "step": 58400 }, { "epoch": 8.698987190944296, "grad_norm": 2.6203300952911377, "learning_rate": 2.5345949821561523e-06, "loss": 0.6825, "num_input_tokens_seen": 33696008, "step": 58405 }, { "epoch": 8.699731903485254, "grad_norm": 2.4746694564819336, "learning_rate": 2.5317444675264978e-06, "loss": 0.5828, "num_input_tokens_seen": 33699048, "step": 58410 }, { "epoch": 8.700476616026213, "grad_norm": 4.295140266418457, "learning_rate": 2.528895471210199e-06, "loss": 0.6131, "num_input_tokens_seen": 33701832, "step": 58415 }, { "epoch": 8.701221328567174, "grad_norm": 2.6670992374420166, "learning_rate": 2.5260479933997826e-06, "loss": 0.6635, "num_input_tokens_seen": 33704552, "step": 58420 }, { "epoch": 8.701966041108133, "grad_norm": 2.0383059978485107, "learning_rate": 2.5232020342876666e-06, "loss": 0.4603, "num_input_tokens_seen": 33707176, "step": 58425 }, { "epoch": 8.702710753649091, "grad_norm": 6.800291061401367, "learning_rate": 2.520357594066175e-06, "loss": 0.6857, "num_input_tokens_seen": 33710184, "step": 58430 }, { "epoch": 8.70345546619005, "grad_norm": 2.5361499786376953, "learning_rate": 2.5175146729275205e-06, "loss": 0.4712, "num_input_tokens_seen": 33713128, "step": 58435 }, { "epoch": 8.70420017873101, "grad_norm": 3.3712921142578125, "learning_rate": 2.5146732710638192e-06, "loss": 0.7681, "num_input_tokens_seen": 33715720, "step": 58440 }, { "epoch": 8.70494489127197, "grad_norm": 1.844542145729065, "learning_rate": 2.511833388667084e-06, "loss": 0.4588, "num_input_tokens_seen": 33718440, "step": 58445 }, { "epoch": 8.705689603812928, "grad_norm": 5.051876068115234, "learning_rate": 2.5089950259292173e-06, "loss": 0.6816, "num_input_tokens_seen": 33720968, "step": 58450 }, { "epoch": 8.706434316353887, "grad_norm": 2.4026761054992676, "learning_rate": 2.5061581830420207e-06, "loss": 0.7356, "num_input_tokens_seen": 33723752, "step": 58455 }, { "epoch": 8.707179028894847, "grad_norm": 2.0635180473327637, "learning_rate": 2.503322860197199e-06, "loss": 0.5097, "num_input_tokens_seen": 33726600, "step": 58460 }, { "epoch": 8.707923741435806, "grad_norm": 3.1193740367889404, "learning_rate": 2.5004890575863556e-06, "loss": 0.7933, "num_input_tokens_seen": 33729320, "step": 58465 }, { "epoch": 8.708668453976765, "grad_norm": 2.101548433303833, "learning_rate": 2.497656775400986e-06, "loss": 0.5744, "num_input_tokens_seen": 33732488, "step": 58470 }, { "epoch": 8.709413166517724, "grad_norm": 3.619570732116699, "learning_rate": 2.4948260138324827e-06, "loss": 0.5966, "num_input_tokens_seen": 33735432, "step": 58475 }, { "epoch": 8.710157879058684, "grad_norm": 1.7169687747955322, "learning_rate": 2.4919967730721414e-06, "loss": 0.5294, "num_input_tokens_seen": 33738216, "step": 58480 }, { "epoch": 8.710902591599643, "grad_norm": 5.745843410491943, "learning_rate": 2.489169053311144e-06, "loss": 0.6411, "num_input_tokens_seen": 33741192, "step": 58485 }, { "epoch": 8.711647304140602, "grad_norm": 3.1796700954437256, "learning_rate": 2.486342854740584e-06, "loss": 0.4678, "num_input_tokens_seen": 33744040, "step": 58490 }, { "epoch": 8.71239201668156, "grad_norm": 2.518552541732788, "learning_rate": 2.483518177551436e-06, "loss": 0.499, "num_input_tokens_seen": 33746760, "step": 58495 }, { "epoch": 8.71313672922252, "grad_norm": 2.6868896484375, "learning_rate": 2.4806950219345842e-06, "loss": 0.5631, "num_input_tokens_seen": 33749864, "step": 58500 }, { "epoch": 8.71388144176348, "grad_norm": 3.4629828929901123, "learning_rate": 2.4778733880808036e-06, "loss": 0.5984, "num_input_tokens_seen": 33753032, "step": 58505 }, { "epoch": 8.714626154304439, "grad_norm": 2.32216215133667, "learning_rate": 2.4750532761807748e-06, "loss": 0.41, "num_input_tokens_seen": 33756616, "step": 58510 }, { "epoch": 8.715370866845397, "grad_norm": 5.2194623947143555, "learning_rate": 2.472234686425068e-06, "loss": 0.4874, "num_input_tokens_seen": 33759368, "step": 58515 }, { "epoch": 8.716115579386356, "grad_norm": 2.5533339977264404, "learning_rate": 2.469417619004144e-06, "loss": 0.561, "num_input_tokens_seen": 33762408, "step": 58520 }, { "epoch": 8.716860291927317, "grad_norm": 2.679581642150879, "learning_rate": 2.466602074108379e-06, "loss": 0.4893, "num_input_tokens_seen": 33765384, "step": 58525 }, { "epoch": 8.717605004468275, "grad_norm": 4.939308166503906, "learning_rate": 2.4637880519280317e-06, "loss": 0.6116, "num_input_tokens_seen": 33768392, "step": 58530 }, { "epoch": 8.718349717009234, "grad_norm": 1.466417670249939, "learning_rate": 2.4609755526532607e-06, "loss": 0.3874, "num_input_tokens_seen": 33771368, "step": 58535 }, { "epoch": 8.719094429550193, "grad_norm": 1.7850414514541626, "learning_rate": 2.4581645764741227e-06, "loss": 0.5529, "num_input_tokens_seen": 33774024, "step": 58540 }, { "epoch": 8.719839142091153, "grad_norm": 4.3386688232421875, "learning_rate": 2.455355123580583e-06, "loss": 0.4936, "num_input_tokens_seen": 33776904, "step": 58545 }, { "epoch": 8.720583854632112, "grad_norm": 1.558508276939392, "learning_rate": 2.4525471941624746e-06, "loss": 0.3942, "num_input_tokens_seen": 33779976, "step": 58550 }, { "epoch": 8.721328567173071, "grad_norm": 4.926209449768066, "learning_rate": 2.4497407884095575e-06, "loss": 0.5962, "num_input_tokens_seen": 33782856, "step": 58555 }, { "epoch": 8.72207327971403, "grad_norm": 2.5983266830444336, "learning_rate": 2.4469359065114743e-06, "loss": 0.5696, "num_input_tokens_seen": 33785640, "step": 58560 }, { "epoch": 8.72281799225499, "grad_norm": 2.6280171871185303, "learning_rate": 2.444132548657771e-06, "loss": 0.8132, "num_input_tokens_seen": 33788648, "step": 58565 }, { "epoch": 8.723562704795949, "grad_norm": 3.3557357788085938, "learning_rate": 2.4413307150378873e-06, "loss": 0.6431, "num_input_tokens_seen": 33791848, "step": 58570 }, { "epoch": 8.724307417336908, "grad_norm": 2.5592751502990723, "learning_rate": 2.4385304058411525e-06, "loss": 0.436, "num_input_tokens_seen": 33794216, "step": 58575 }, { "epoch": 8.725052129877866, "grad_norm": 1.8267948627471924, "learning_rate": 2.4357316212568094e-06, "loss": 0.4864, "num_input_tokens_seen": 33797160, "step": 58580 }, { "epoch": 8.725796842418827, "grad_norm": 5.188439846038818, "learning_rate": 2.432934361473979e-06, "loss": 0.6117, "num_input_tokens_seen": 33800168, "step": 58585 }, { "epoch": 8.726541554959786, "grad_norm": 3.4253973960876465, "learning_rate": 2.4301386266816938e-06, "loss": 0.596, "num_input_tokens_seen": 33803048, "step": 58590 }, { "epoch": 8.727286267500745, "grad_norm": 2.6534485816955566, "learning_rate": 2.4273444170688774e-06, "loss": 0.4536, "num_input_tokens_seen": 33806184, "step": 58595 }, { "epoch": 8.728030980041703, "grad_norm": 3.219151020050049, "learning_rate": 2.424551732824354e-06, "loss": 0.487, "num_input_tokens_seen": 33808808, "step": 58600 }, { "epoch": 8.728775692582664, "grad_norm": 2.7689220905303955, "learning_rate": 2.421760574136836e-06, "loss": 0.5746, "num_input_tokens_seen": 33811752, "step": 58605 }, { "epoch": 8.729520405123623, "grad_norm": 2.528435707092285, "learning_rate": 2.418970941194948e-06, "loss": 0.5638, "num_input_tokens_seen": 33814440, "step": 58610 }, { "epoch": 8.730265117664581, "grad_norm": 2.287240982055664, "learning_rate": 2.4161828341871973e-06, "loss": 0.546, "num_input_tokens_seen": 33817224, "step": 58615 }, { "epoch": 8.73100983020554, "grad_norm": 2.3598856925964355, "learning_rate": 2.4133962533019832e-06, "loss": 0.5962, "num_input_tokens_seen": 33820232, "step": 58620 }, { "epoch": 8.7317545427465, "grad_norm": 5.661214828491211, "learning_rate": 2.410611198727622e-06, "loss": 0.5232, "num_input_tokens_seen": 33822952, "step": 58625 }, { "epoch": 8.73249925528746, "grad_norm": 2.720043659210205, "learning_rate": 2.4078276706523156e-06, "loss": 0.5482, "num_input_tokens_seen": 33825672, "step": 58630 }, { "epoch": 8.733243967828418, "grad_norm": 2.8689935207366943, "learning_rate": 2.405045669264161e-06, "loss": 0.4577, "num_input_tokens_seen": 33828296, "step": 58635 }, { "epoch": 8.733988680369377, "grad_norm": 3.370055675506592, "learning_rate": 2.4022651947511548e-06, "loss": 0.6176, "num_input_tokens_seen": 33831080, "step": 58640 }, { "epoch": 8.734733392910336, "grad_norm": 3.510546922683716, "learning_rate": 2.399486247301197e-06, "loss": 0.4958, "num_input_tokens_seen": 33833960, "step": 58645 }, { "epoch": 8.735478105451296, "grad_norm": 3.2452549934387207, "learning_rate": 2.3967088271020707e-06, "loss": 0.4002, "num_input_tokens_seen": 33836552, "step": 58650 }, { "epoch": 8.736222817992255, "grad_norm": 2.799823760986328, "learning_rate": 2.3939329343414584e-06, "loss": 0.5796, "num_input_tokens_seen": 33839496, "step": 58655 }, { "epoch": 8.736967530533214, "grad_norm": 6.558441162109375, "learning_rate": 2.39115856920695e-06, "loss": 0.451, "num_input_tokens_seen": 33842440, "step": 58660 }, { "epoch": 8.737712243074174, "grad_norm": 2.32615065574646, "learning_rate": 2.388385731886025e-06, "loss": 0.5128, "num_input_tokens_seen": 33845384, "step": 58665 }, { "epoch": 8.738456955615133, "grad_norm": 3.5304529666900635, "learning_rate": 2.38561442256606e-06, "loss": 0.6753, "num_input_tokens_seen": 33848328, "step": 58670 }, { "epoch": 8.739201668156092, "grad_norm": 5.903814792633057, "learning_rate": 2.3828446414343288e-06, "loss": 0.5166, "num_input_tokens_seen": 33851144, "step": 58675 }, { "epoch": 8.73994638069705, "grad_norm": 2.259915351867676, "learning_rate": 2.380076388678007e-06, "loss": 0.6179, "num_input_tokens_seen": 33853672, "step": 58680 }, { "epoch": 8.74069109323801, "grad_norm": 2.509429454803467, "learning_rate": 2.377309664484151e-06, "loss": 0.5632, "num_input_tokens_seen": 33856648, "step": 58685 }, { "epoch": 8.74143580577897, "grad_norm": 3.8321123123168945, "learning_rate": 2.3745444690397302e-06, "loss": 0.6093, "num_input_tokens_seen": 33859688, "step": 58690 }, { "epoch": 8.742180518319929, "grad_norm": 3.9828431606292725, "learning_rate": 2.3717808025316118e-06, "loss": 0.6885, "num_input_tokens_seen": 33862408, "step": 58695 }, { "epoch": 8.742925230860887, "grad_norm": 2.3152482509613037, "learning_rate": 2.369018665146544e-06, "loss": 0.6086, "num_input_tokens_seen": 33865000, "step": 58700 }, { "epoch": 8.743669943401846, "grad_norm": 5.922296047210693, "learning_rate": 2.36625805707118e-06, "loss": 0.59, "num_input_tokens_seen": 33867848, "step": 58705 }, { "epoch": 8.744414655942807, "grad_norm": 2.432175636291504, "learning_rate": 2.363498978492082e-06, "loss": 0.7167, "num_input_tokens_seen": 33871176, "step": 58710 }, { "epoch": 8.745159368483765, "grad_norm": 2.769434690475464, "learning_rate": 2.3607414295956835e-06, "loss": 0.5614, "num_input_tokens_seen": 33873864, "step": 58715 }, { "epoch": 8.745904081024724, "grad_norm": 3.6301870346069336, "learning_rate": 2.357985410568336e-06, "loss": 0.4851, "num_input_tokens_seen": 33876936, "step": 58720 }, { "epoch": 8.746648793565683, "grad_norm": 2.2138590812683105, "learning_rate": 2.3552309215962796e-06, "loss": 0.4875, "num_input_tokens_seen": 33879560, "step": 58725 }, { "epoch": 8.747393506106643, "grad_norm": 3.0261049270629883, "learning_rate": 2.3524779628656484e-06, "loss": 0.5238, "num_input_tokens_seen": 33882472, "step": 58730 }, { "epoch": 8.748138218647602, "grad_norm": 2.413085460662842, "learning_rate": 2.3497265345624824e-06, "loss": 0.4387, "num_input_tokens_seen": 33885256, "step": 58735 }, { "epoch": 8.748882931188561, "grad_norm": 2.5278518199920654, "learning_rate": 2.3469766368727053e-06, "loss": 0.5748, "num_input_tokens_seen": 33888136, "step": 58740 }, { "epoch": 8.74962764372952, "grad_norm": 3.8678531646728516, "learning_rate": 2.3442282699821515e-06, "loss": 0.3631, "num_input_tokens_seen": 33891048, "step": 58745 }, { "epoch": 8.75037235627048, "grad_norm": 1.971623182296753, "learning_rate": 2.341481434076534e-06, "loss": 0.6997, "num_input_tokens_seen": 33893864, "step": 58750 }, { "epoch": 8.751117068811439, "grad_norm": 2.8265039920806885, "learning_rate": 2.338736129341479e-06, "loss": 0.6247, "num_input_tokens_seen": 33896904, "step": 58755 }, { "epoch": 8.751861781352398, "grad_norm": 3.255753517150879, "learning_rate": 2.335992355962502e-06, "loss": 0.6055, "num_input_tokens_seen": 33900200, "step": 58760 }, { "epoch": 8.752606493893357, "grad_norm": 2.0330467224121094, "learning_rate": 2.3332501141250156e-06, "loss": 0.4903, "num_input_tokens_seen": 33902952, "step": 58765 }, { "epoch": 8.753351206434317, "grad_norm": 2.9354677200317383, "learning_rate": 2.3305094040143303e-06, "loss": 0.5503, "num_input_tokens_seen": 33905832, "step": 58770 }, { "epoch": 8.754095918975276, "grad_norm": 2.314330577850342, "learning_rate": 2.3277702258156566e-06, "loss": 0.5016, "num_input_tokens_seen": 33908872, "step": 58775 }, { "epoch": 8.754840631516235, "grad_norm": 3.0334675312042236, "learning_rate": 2.3250325797140952e-06, "loss": 0.7213, "num_input_tokens_seen": 33911624, "step": 58780 }, { "epoch": 8.755585344057193, "grad_norm": 4.013489723205566, "learning_rate": 2.3222964658946357e-06, "loss": 0.6839, "num_input_tokens_seen": 33914728, "step": 58785 }, { "epoch": 8.756330056598154, "grad_norm": 5.7355475425720215, "learning_rate": 2.319561884542179e-06, "loss": 0.4397, "num_input_tokens_seen": 33917544, "step": 58790 }, { "epoch": 8.757074769139113, "grad_norm": 2.7736058235168457, "learning_rate": 2.3168288358415197e-06, "loss": 0.443, "num_input_tokens_seen": 33920584, "step": 58795 }, { "epoch": 8.757819481680071, "grad_norm": 2.1214253902435303, "learning_rate": 2.314097319977343e-06, "loss": 0.5372, "num_input_tokens_seen": 33923336, "step": 58800 }, { "epoch": 8.75856419422103, "grad_norm": 2.7285780906677246, "learning_rate": 2.3113673371342378e-06, "loss": 0.7022, "num_input_tokens_seen": 33926824, "step": 58805 }, { "epoch": 8.75930890676199, "grad_norm": 3.159592390060425, "learning_rate": 2.3086388874966865e-06, "loss": 0.6375, "num_input_tokens_seen": 33929896, "step": 58810 }, { "epoch": 8.76005361930295, "grad_norm": 2.6251163482666016, "learning_rate": 2.3059119712490613e-06, "loss": 0.4806, "num_input_tokens_seen": 33932968, "step": 58815 }, { "epoch": 8.760798331843908, "grad_norm": 2.4522480964660645, "learning_rate": 2.303186588575634e-06, "loss": 0.5546, "num_input_tokens_seen": 33935368, "step": 58820 }, { "epoch": 8.761543044384867, "grad_norm": 3.022934675216675, "learning_rate": 2.3004627396605776e-06, "loss": 0.6496, "num_input_tokens_seen": 33938120, "step": 58825 }, { "epoch": 8.762287756925826, "grad_norm": 3.5052309036254883, "learning_rate": 2.2977404246879607e-06, "loss": 0.5098, "num_input_tokens_seen": 33940936, "step": 58830 }, { "epoch": 8.763032469466786, "grad_norm": 2.3619344234466553, "learning_rate": 2.2950196438417448e-06, "loss": 0.6959, "num_input_tokens_seen": 33944072, "step": 58835 }, { "epoch": 8.763777182007745, "grad_norm": 2.9143199920654297, "learning_rate": 2.292300397305791e-06, "loss": 0.5862, "num_input_tokens_seen": 33946824, "step": 58840 }, { "epoch": 8.764521894548704, "grad_norm": 2.9003758430480957, "learning_rate": 2.289582685263858e-06, "loss": 0.5848, "num_input_tokens_seen": 33949800, "step": 58845 }, { "epoch": 8.765266607089664, "grad_norm": 7.137415409088135, "learning_rate": 2.2868665078995878e-06, "loss": 0.5749, "num_input_tokens_seen": 33952648, "step": 58850 }, { "epoch": 8.766011319630623, "grad_norm": 3.205660820007324, "learning_rate": 2.2841518653965388e-06, "loss": 0.5108, "num_input_tokens_seen": 33955560, "step": 58855 }, { "epoch": 8.766756032171582, "grad_norm": 1.922621488571167, "learning_rate": 2.281438757938145e-06, "loss": 0.4758, "num_input_tokens_seen": 33958280, "step": 58860 }, { "epoch": 8.76750074471254, "grad_norm": 1.8618911504745483, "learning_rate": 2.2787271857077546e-06, "loss": 0.4573, "num_input_tokens_seen": 33961064, "step": 58865 }, { "epoch": 8.7682454572535, "grad_norm": 2.964053153991699, "learning_rate": 2.276017148888604e-06, "loss": 0.5634, "num_input_tokens_seen": 33963912, "step": 58870 }, { "epoch": 8.76899016979446, "grad_norm": 2.0568907260894775, "learning_rate": 2.273308647663827e-06, "loss": 0.4859, "num_input_tokens_seen": 33966632, "step": 58875 }, { "epoch": 8.769734882335419, "grad_norm": 3.7276525497436523, "learning_rate": 2.27060168221645e-06, "loss": 0.5934, "num_input_tokens_seen": 33969416, "step": 58880 }, { "epoch": 8.770479594876377, "grad_norm": 4.292792320251465, "learning_rate": 2.2678962527293986e-06, "loss": 0.6166, "num_input_tokens_seen": 33971944, "step": 58885 }, { "epoch": 8.771224307417336, "grad_norm": 2.5507633686065674, "learning_rate": 2.2651923593854985e-06, "loss": 0.6523, "num_input_tokens_seen": 33974504, "step": 58890 }, { "epoch": 8.771969019958297, "grad_norm": 0.8854258060455322, "learning_rate": 2.2624900023674678e-06, "loss": 0.4793, "num_input_tokens_seen": 33977416, "step": 58895 }, { "epoch": 8.772713732499255, "grad_norm": 4.90197229385376, "learning_rate": 2.259789181857916e-06, "loss": 0.5865, "num_input_tokens_seen": 33980200, "step": 58900 }, { "epoch": 8.773458445040214, "grad_norm": 1.8151105642318726, "learning_rate": 2.2570898980393552e-06, "loss": 0.5756, "num_input_tokens_seen": 33982920, "step": 58905 }, { "epoch": 8.774203157581173, "grad_norm": 3.667956829071045, "learning_rate": 2.254392151094198e-06, "loss": 0.7466, "num_input_tokens_seen": 33985800, "step": 58910 }, { "epoch": 8.774947870122134, "grad_norm": 2.659177780151367, "learning_rate": 2.251695941204737e-06, "loss": 0.4356, "num_input_tokens_seen": 33988968, "step": 58915 }, { "epoch": 8.775692582663092, "grad_norm": 3.21551513671875, "learning_rate": 2.2490012685531777e-06, "loss": 0.6316, "num_input_tokens_seen": 33992008, "step": 58920 }, { "epoch": 8.776437295204051, "grad_norm": 4.416258811950684, "learning_rate": 2.246308133321612e-06, "loss": 0.5714, "num_input_tokens_seen": 33994856, "step": 58925 }, { "epoch": 8.77718200774501, "grad_norm": 1.710118293762207, "learning_rate": 2.2436165356920335e-06, "loss": 0.3736, "num_input_tokens_seen": 33997736, "step": 58930 }, { "epoch": 8.77792672028597, "grad_norm": 2.7551722526550293, "learning_rate": 2.2409264758463363e-06, "loss": 0.4143, "num_input_tokens_seen": 34000648, "step": 58935 }, { "epoch": 8.778671432826929, "grad_norm": 2.9274237155914307, "learning_rate": 2.238237953966288e-06, "loss": 0.4789, "num_input_tokens_seen": 34003656, "step": 58940 }, { "epoch": 8.779416145367888, "grad_norm": 5.826133728027344, "learning_rate": 2.2355509702335825e-06, "loss": 0.4155, "num_input_tokens_seen": 34006344, "step": 58945 }, { "epoch": 8.780160857908847, "grad_norm": 3.2325503826141357, "learning_rate": 2.2328655248297833e-06, "loss": 0.4147, "num_input_tokens_seen": 34009096, "step": 58950 }, { "epoch": 8.780905570449807, "grad_norm": 3.258615732192993, "learning_rate": 2.2301816179363695e-06, "loss": 0.4569, "num_input_tokens_seen": 34012232, "step": 58955 }, { "epoch": 8.781650282990766, "grad_norm": 1.423614263534546, "learning_rate": 2.2274992497347045e-06, "loss": 0.4944, "num_input_tokens_seen": 34015304, "step": 58960 }, { "epoch": 8.782394995531725, "grad_norm": 3.7500531673431396, "learning_rate": 2.224818420406055e-06, "loss": 0.5408, "num_input_tokens_seen": 34018472, "step": 58965 }, { "epoch": 8.783139708072683, "grad_norm": 2.3557169437408447, "learning_rate": 2.2221391301315787e-06, "loss": 0.4348, "num_input_tokens_seen": 34021224, "step": 58970 }, { "epoch": 8.783884420613644, "grad_norm": 2.979909896850586, "learning_rate": 2.2194613790923387e-06, "loss": 0.6087, "num_input_tokens_seen": 34024136, "step": 58975 }, { "epoch": 8.784629133154603, "grad_norm": 6.646305084228516, "learning_rate": 2.2167851674692763e-06, "loss": 0.8593, "num_input_tokens_seen": 34026920, "step": 58980 }, { "epoch": 8.785373845695561, "grad_norm": 0.7324111461639404, "learning_rate": 2.214110495443242e-06, "loss": 0.6339, "num_input_tokens_seen": 34029896, "step": 58985 }, { "epoch": 8.78611855823652, "grad_norm": 3.1900436878204346, "learning_rate": 2.211437363194976e-06, "loss": 0.5197, "num_input_tokens_seen": 34032584, "step": 58990 }, { "epoch": 8.78686327077748, "grad_norm": 3.0983712673187256, "learning_rate": 2.2087657709051246e-06, "loss": 0.6462, "num_input_tokens_seen": 34035656, "step": 58995 }, { "epoch": 8.78760798331844, "grad_norm": 2.094036340713501, "learning_rate": 2.206095718754217e-06, "loss": 0.3636, "num_input_tokens_seen": 34038536, "step": 59000 }, { "epoch": 8.788352695859398, "grad_norm": 2.167438507080078, "learning_rate": 2.2034272069226897e-06, "loss": 0.6227, "num_input_tokens_seen": 34041640, "step": 59005 }, { "epoch": 8.789097408400357, "grad_norm": 2.9621076583862305, "learning_rate": 2.2007602355908707e-06, "loss": 0.5871, "num_input_tokens_seen": 34044488, "step": 59010 }, { "epoch": 8.789842120941316, "grad_norm": 4.8339738845825195, "learning_rate": 2.19809480493898e-06, "loss": 0.7837, "num_input_tokens_seen": 34047176, "step": 59015 }, { "epoch": 8.790586833482276, "grad_norm": 3.309563636779785, "learning_rate": 2.195430915147134e-06, "loss": 0.6515, "num_input_tokens_seen": 34050152, "step": 59020 }, { "epoch": 8.791331546023235, "grad_norm": 1.9398547410964966, "learning_rate": 2.192768566395348e-06, "loss": 0.5398, "num_input_tokens_seen": 34053512, "step": 59025 }, { "epoch": 8.792076258564194, "grad_norm": 2.7542903423309326, "learning_rate": 2.1901077588635357e-06, "loss": 0.599, "num_input_tokens_seen": 34056488, "step": 59030 }, { "epoch": 8.792820971105153, "grad_norm": 4.512303829193115, "learning_rate": 2.187448492731503e-06, "loss": 0.7607, "num_input_tokens_seen": 34059272, "step": 59035 }, { "epoch": 8.793565683646113, "grad_norm": 2.6851084232330322, "learning_rate": 2.184790768178957e-06, "loss": 0.6927, "num_input_tokens_seen": 34062760, "step": 59040 }, { "epoch": 8.794310396187072, "grad_norm": 3.6749017238616943, "learning_rate": 2.182134585385487e-06, "loss": 0.4444, "num_input_tokens_seen": 34065704, "step": 59045 }, { "epoch": 8.79505510872803, "grad_norm": 3.201627016067505, "learning_rate": 2.179479944530588e-06, "loss": 0.4232, "num_input_tokens_seen": 34068648, "step": 59050 }, { "epoch": 8.79579982126899, "grad_norm": 2.723081350326538, "learning_rate": 2.1768268457936613e-06, "loss": 0.3109, "num_input_tokens_seen": 34071624, "step": 59055 }, { "epoch": 8.79654453380995, "grad_norm": 2.3453991413116455, "learning_rate": 2.1741752893539775e-06, "loss": 0.6595, "num_input_tokens_seen": 34074600, "step": 59060 }, { "epoch": 8.797289246350909, "grad_norm": 2.724109649658203, "learning_rate": 2.1715252753907234e-06, "loss": 0.5706, "num_input_tokens_seen": 34077832, "step": 59065 }, { "epoch": 8.798033958891867, "grad_norm": 1.2289396524429321, "learning_rate": 2.168876804082978e-06, "loss": 0.6226, "num_input_tokens_seen": 34080456, "step": 59070 }, { "epoch": 8.798778671432826, "grad_norm": 2.522238254547119, "learning_rate": 2.166229875609718e-06, "loss": 0.5975, "num_input_tokens_seen": 34083336, "step": 59075 }, { "epoch": 8.799523383973787, "grad_norm": 4.267443656921387, "learning_rate": 2.163584490149806e-06, "loss": 0.8758, "num_input_tokens_seen": 34086088, "step": 59080 }, { "epoch": 8.800268096514746, "grad_norm": 3.7627313137054443, "learning_rate": 2.1609406478820066e-06, "loss": 0.8353, "num_input_tokens_seen": 34089224, "step": 59085 }, { "epoch": 8.801012809055704, "grad_norm": 3.3691952228546143, "learning_rate": 2.15829834898498e-06, "loss": 0.4851, "num_input_tokens_seen": 34092168, "step": 59090 }, { "epoch": 8.801757521596663, "grad_norm": 1.9441930055618286, "learning_rate": 2.155657593637289e-06, "loss": 0.5456, "num_input_tokens_seen": 34094952, "step": 59095 }, { "epoch": 8.802502234137624, "grad_norm": 2.4277126789093018, "learning_rate": 2.1530183820173743e-06, "loss": 0.773, "num_input_tokens_seen": 34097736, "step": 59100 }, { "epoch": 8.803246946678582, "grad_norm": 3.716132879257202, "learning_rate": 2.1503807143035875e-06, "loss": 0.6237, "num_input_tokens_seen": 34100712, "step": 59105 }, { "epoch": 8.803991659219541, "grad_norm": 2.9137766361236572, "learning_rate": 2.1477445906741776e-06, "loss": 0.6776, "num_input_tokens_seen": 34103432, "step": 59110 }, { "epoch": 8.8047363717605, "grad_norm": 2.152161121368408, "learning_rate": 2.1451100113072748e-06, "loss": 0.744, "num_input_tokens_seen": 34106408, "step": 59115 }, { "epoch": 8.80548108430146, "grad_norm": 1.727403163909912, "learning_rate": 2.142476976380914e-06, "loss": 0.6807, "num_input_tokens_seen": 34109032, "step": 59120 }, { "epoch": 8.80622579684242, "grad_norm": 1.8514361381530762, "learning_rate": 2.1398454860730277e-06, "loss": 0.5868, "num_input_tokens_seen": 34111944, "step": 59125 }, { "epoch": 8.806970509383378, "grad_norm": 4.038285732269287, "learning_rate": 2.1372155405614436e-06, "loss": 0.521, "num_input_tokens_seen": 34114760, "step": 59130 }, { "epoch": 8.807715221924337, "grad_norm": 2.456989288330078, "learning_rate": 2.13458714002388e-06, "loss": 0.4519, "num_input_tokens_seen": 34117448, "step": 59135 }, { "epoch": 8.808459934465297, "grad_norm": 3.467252492904663, "learning_rate": 2.1319602846379518e-06, "loss": 0.3906, "num_input_tokens_seen": 34120456, "step": 59140 }, { "epoch": 8.809204647006256, "grad_norm": 3.661212205886841, "learning_rate": 2.1293349745811765e-06, "loss": 0.5099, "num_input_tokens_seen": 34123336, "step": 59145 }, { "epoch": 8.809949359547215, "grad_norm": 2.6865029335021973, "learning_rate": 2.1267112100309545e-06, "loss": 0.6607, "num_input_tokens_seen": 34126152, "step": 59150 }, { "epoch": 8.810694072088173, "grad_norm": 2.300571918487549, "learning_rate": 2.1240889911645913e-06, "loss": 0.4955, "num_input_tokens_seen": 34129000, "step": 59155 }, { "epoch": 8.811438784629132, "grad_norm": 2.650026321411133, "learning_rate": 2.121468318159289e-06, "loss": 0.3512, "num_input_tokens_seen": 34132008, "step": 59160 }, { "epoch": 8.812183497170093, "grad_norm": 7.737061977386475, "learning_rate": 2.1188491911921403e-06, "loss": 0.5448, "num_input_tokens_seen": 34134728, "step": 59165 }, { "epoch": 8.812928209711052, "grad_norm": 2.5841784477233887, "learning_rate": 2.1162316104401364e-06, "loss": 0.5744, "num_input_tokens_seen": 34137512, "step": 59170 }, { "epoch": 8.81367292225201, "grad_norm": 2.4964051246643066, "learning_rate": 2.1136155760801633e-06, "loss": 0.6764, "num_input_tokens_seen": 34140328, "step": 59175 }, { "epoch": 8.81441763479297, "grad_norm": 8.893438339233398, "learning_rate": 2.1110010882890025e-06, "loss": 0.6269, "num_input_tokens_seen": 34143336, "step": 59180 }, { "epoch": 8.81516234733393, "grad_norm": 2.4747445583343506, "learning_rate": 2.1083881472433232e-06, "loss": 0.5424, "num_input_tokens_seen": 34146088, "step": 59185 }, { "epoch": 8.815907059874888, "grad_norm": 8.643364906311035, "learning_rate": 2.105776753119701e-06, "loss": 0.7988, "num_input_tokens_seen": 34148840, "step": 59190 }, { "epoch": 8.816651772415847, "grad_norm": 3.6076669692993164, "learning_rate": 2.1031669060946056e-06, "loss": 0.7316, "num_input_tokens_seen": 34151944, "step": 59195 }, { "epoch": 8.817396484956806, "grad_norm": 4.168676853179932, "learning_rate": 2.100558606344399e-06, "loss": 0.5265, "num_input_tokens_seen": 34155112, "step": 59200 }, { "epoch": 8.818141197497766, "grad_norm": 3.4482390880584717, "learning_rate": 2.0979518540453435e-06, "loss": 0.6753, "num_input_tokens_seen": 34157800, "step": 59205 }, { "epoch": 8.818885910038725, "grad_norm": 2.119828701019287, "learning_rate": 2.095346649373586e-06, "loss": 0.6269, "num_input_tokens_seen": 34160680, "step": 59210 }, { "epoch": 8.819630622579684, "grad_norm": 4.572185039520264, "learning_rate": 2.092742992505181e-06, "loss": 0.5838, "num_input_tokens_seen": 34163336, "step": 59215 }, { "epoch": 8.820375335120643, "grad_norm": 2.0638794898986816, "learning_rate": 2.090140883616068e-06, "loss": 0.4099, "num_input_tokens_seen": 34166120, "step": 59220 }, { "epoch": 8.821120047661603, "grad_norm": 3.7434210777282715, "learning_rate": 2.087540322882087e-06, "loss": 0.5039, "num_input_tokens_seen": 34169000, "step": 59225 }, { "epoch": 8.821864760202562, "grad_norm": 3.686520576477051, "learning_rate": 2.084941310478977e-06, "loss": 0.4443, "num_input_tokens_seen": 34172008, "step": 59230 }, { "epoch": 8.82260947274352, "grad_norm": 3.1422224044799805, "learning_rate": 2.0823438465823656e-06, "loss": 0.5903, "num_input_tokens_seen": 34174792, "step": 59235 }, { "epoch": 8.82335418528448, "grad_norm": 2.7732889652252197, "learning_rate": 2.079747931367787e-06, "loss": 0.4845, "num_input_tokens_seen": 34177576, "step": 59240 }, { "epoch": 8.82409889782544, "grad_norm": 3.9669437408447266, "learning_rate": 2.0771535650106533e-06, "loss": 0.4522, "num_input_tokens_seen": 34180488, "step": 59245 }, { "epoch": 8.824843610366399, "grad_norm": 2.4105522632598877, "learning_rate": 2.0745607476862826e-06, "loss": 0.5184, "num_input_tokens_seen": 34183176, "step": 59250 }, { "epoch": 8.825588322907358, "grad_norm": 3.1598973274230957, "learning_rate": 2.0719694795698907e-06, "loss": 0.5843, "num_input_tokens_seen": 34186024, "step": 59255 }, { "epoch": 8.826333035448316, "grad_norm": 4.811408042907715, "learning_rate": 2.0693797608365817e-06, "loss": 0.7007, "num_input_tokens_seen": 34188904, "step": 59260 }, { "epoch": 8.827077747989277, "grad_norm": 1.5186355113983154, "learning_rate": 2.0667915916613573e-06, "loss": 0.658, "num_input_tokens_seen": 34191464, "step": 59265 }, { "epoch": 8.827822460530236, "grad_norm": 5.230440616607666, "learning_rate": 2.0642049722191193e-06, "loss": 0.6821, "num_input_tokens_seen": 34194184, "step": 59270 }, { "epoch": 8.828567173071194, "grad_norm": 9.458369255065918, "learning_rate": 2.0616199026846613e-06, "loss": 0.7894, "num_input_tokens_seen": 34197160, "step": 59275 }, { "epoch": 8.829311885612153, "grad_norm": 3.1922860145568848, "learning_rate": 2.059036383232668e-06, "loss": 0.5509, "num_input_tokens_seen": 34200200, "step": 59280 }, { "epoch": 8.830056598153114, "grad_norm": 2.939480781555176, "learning_rate": 2.0564544140377228e-06, "loss": 0.5625, "num_input_tokens_seen": 34203368, "step": 59285 }, { "epoch": 8.830801310694072, "grad_norm": 2.8356595039367676, "learning_rate": 2.0538739952743054e-06, "loss": 0.6458, "num_input_tokens_seen": 34206024, "step": 59290 }, { "epoch": 8.831546023235031, "grad_norm": 2.961345672607422, "learning_rate": 2.0512951271167922e-06, "loss": 0.7524, "num_input_tokens_seen": 34208744, "step": 59295 }, { "epoch": 8.83229073577599, "grad_norm": 1.6279083490371704, "learning_rate": 2.048717809739459e-06, "loss": 0.6208, "num_input_tokens_seen": 34211560, "step": 59300 }, { "epoch": 8.83303544831695, "grad_norm": 2.5292859077453613, "learning_rate": 2.046142043316457e-06, "loss": 0.5657, "num_input_tokens_seen": 34214312, "step": 59305 }, { "epoch": 8.83378016085791, "grad_norm": 2.2296900749206543, "learning_rate": 2.0435678280218556e-06, "loss": 0.4125, "num_input_tokens_seen": 34217064, "step": 59310 }, { "epoch": 8.834524873398868, "grad_norm": 1.9793044328689575, "learning_rate": 2.040995164029602e-06, "loss": 0.5828, "num_input_tokens_seen": 34219912, "step": 59315 }, { "epoch": 8.835269585939827, "grad_norm": 3.7141165733337402, "learning_rate": 2.038424051513549e-06, "loss": 0.5074, "num_input_tokens_seen": 34222728, "step": 59320 }, { "epoch": 8.836014298480787, "grad_norm": 1.6408332586288452, "learning_rate": 2.035854490647446e-06, "loss": 0.4438, "num_input_tokens_seen": 34225640, "step": 59325 }, { "epoch": 8.836759011021746, "grad_norm": 2.787659168243408, "learning_rate": 2.033286481604932e-06, "loss": 0.4619, "num_input_tokens_seen": 34228392, "step": 59330 }, { "epoch": 8.837503723562705, "grad_norm": 1.834568977355957, "learning_rate": 2.0307200245595403e-06, "loss": 0.3675, "num_input_tokens_seen": 34231592, "step": 59335 }, { "epoch": 8.838248436103664, "grad_norm": 2.3198306560516357, "learning_rate": 2.028155119684708e-06, "loss": 0.4746, "num_input_tokens_seen": 34234568, "step": 59340 }, { "epoch": 8.838993148644622, "grad_norm": 4.483170509338379, "learning_rate": 2.0255917671537534e-06, "loss": 0.7148, "num_input_tokens_seen": 34237640, "step": 59345 }, { "epoch": 8.839737861185583, "grad_norm": 4.153234004974365, "learning_rate": 2.0230299671399e-06, "loss": 0.605, "num_input_tokens_seen": 34240456, "step": 59350 }, { "epoch": 8.840482573726542, "grad_norm": 2.977962017059326, "learning_rate": 2.0204697198162593e-06, "loss": 0.403, "num_input_tokens_seen": 34243688, "step": 59355 }, { "epoch": 8.8412272862675, "grad_norm": 2.9155540466308594, "learning_rate": 2.0179110253558507e-06, "loss": 0.6256, "num_input_tokens_seen": 34246600, "step": 59360 }, { "epoch": 8.84197199880846, "grad_norm": 4.443510055541992, "learning_rate": 2.0153538839315756e-06, "loss": 0.7512, "num_input_tokens_seen": 34249384, "step": 59365 }, { "epoch": 8.84271671134942, "grad_norm": 4.125694274902344, "learning_rate": 2.0127982957162395e-06, "loss": 0.6883, "num_input_tokens_seen": 34252232, "step": 59370 }, { "epoch": 8.843461423890378, "grad_norm": 8.409234046936035, "learning_rate": 2.0102442608825324e-06, "loss": 0.688, "num_input_tokens_seen": 34255080, "step": 59375 }, { "epoch": 8.844206136431337, "grad_norm": 3.759122133255005, "learning_rate": 2.007691779603052e-06, "loss": 0.7506, "num_input_tokens_seen": 34257960, "step": 59380 }, { "epoch": 8.844950848972296, "grad_norm": 2.7448649406433105, "learning_rate": 2.0051408520502774e-06, "loss": 0.4918, "num_input_tokens_seen": 34260904, "step": 59385 }, { "epoch": 8.845695561513256, "grad_norm": 2.7574687004089355, "learning_rate": 2.0025914783965926e-06, "loss": 0.7923, "num_input_tokens_seen": 34263560, "step": 59390 }, { "epoch": 8.846440274054215, "grad_norm": 3.2723581790924072, "learning_rate": 2.000043658814277e-06, "loss": 0.5316, "num_input_tokens_seen": 34266536, "step": 59395 }, { "epoch": 8.847184986595174, "grad_norm": 4.2692790031433105, "learning_rate": 1.9974973934755003e-06, "loss": 0.4907, "num_input_tokens_seen": 34269160, "step": 59400 }, { "epoch": 8.847929699136133, "grad_norm": 1.8362005949020386, "learning_rate": 1.994952682552331e-06, "loss": 0.6148, "num_input_tokens_seen": 34272168, "step": 59405 }, { "epoch": 8.848674411677093, "grad_norm": 3.464467763900757, "learning_rate": 1.9924095262167238e-06, "loss": 0.6751, "num_input_tokens_seen": 34274984, "step": 59410 }, { "epoch": 8.849419124218052, "grad_norm": 1.9590643644332886, "learning_rate": 1.9898679246405372e-06, "loss": 0.6387, "num_input_tokens_seen": 34277896, "step": 59415 }, { "epoch": 8.85016383675901, "grad_norm": 1.3640942573547363, "learning_rate": 1.9873278779955316e-06, "loss": 0.7364, "num_input_tokens_seen": 34280872, "step": 59420 }, { "epoch": 8.85090854929997, "grad_norm": 4.8375701904296875, "learning_rate": 1.9847893864533395e-06, "loss": 0.6077, "num_input_tokens_seen": 34283624, "step": 59425 }, { "epoch": 8.85165326184093, "grad_norm": 2.975534439086914, "learning_rate": 1.9822524501855067e-06, "loss": 0.5783, "num_input_tokens_seen": 34286344, "step": 59430 }, { "epoch": 8.852397974381889, "grad_norm": 4.012223243713379, "learning_rate": 1.979717069363471e-06, "loss": 0.5339, "num_input_tokens_seen": 34289160, "step": 59435 }, { "epoch": 8.853142686922848, "grad_norm": 3.0215554237365723, "learning_rate": 1.9771832441585647e-06, "loss": 0.4658, "num_input_tokens_seen": 34292104, "step": 59440 }, { "epoch": 8.853887399463806, "grad_norm": 3.8584532737731934, "learning_rate": 1.9746509747420065e-06, "loss": 0.6048, "num_input_tokens_seen": 34294760, "step": 59445 }, { "epoch": 8.854632112004767, "grad_norm": 3.264056444168091, "learning_rate": 1.972120261284924e-06, "loss": 0.5806, "num_input_tokens_seen": 34297736, "step": 59450 }, { "epoch": 8.855376824545726, "grad_norm": 1.9383916854858398, "learning_rate": 1.9695911039583265e-06, "loss": 0.6049, "num_input_tokens_seen": 34300488, "step": 59455 }, { "epoch": 8.856121537086684, "grad_norm": 3.66005539894104, "learning_rate": 1.9670635029331336e-06, "loss": 0.5125, "num_input_tokens_seen": 34303432, "step": 59460 }, { "epoch": 8.856866249627643, "grad_norm": 3.230799436569214, "learning_rate": 1.9645374583801417e-06, "loss": 0.5969, "num_input_tokens_seen": 34306152, "step": 59465 }, { "epoch": 8.857610962168604, "grad_norm": 1.6579176187515259, "learning_rate": 1.9620129704700506e-06, "loss": 0.4699, "num_input_tokens_seen": 34309192, "step": 59470 }, { "epoch": 8.858355674709562, "grad_norm": 2.515171527862549, "learning_rate": 1.95949003937346e-06, "loss": 0.4701, "num_input_tokens_seen": 34311944, "step": 59475 }, { "epoch": 8.859100387250521, "grad_norm": 3.5095787048339844, "learning_rate": 1.9569686652608555e-06, "loss": 0.6715, "num_input_tokens_seen": 34314920, "step": 59480 }, { "epoch": 8.85984509979148, "grad_norm": 2.862189292907715, "learning_rate": 1.9544488483026203e-06, "loss": 0.5429, "num_input_tokens_seen": 34317608, "step": 59485 }, { "epoch": 8.86058981233244, "grad_norm": 4.669543266296387, "learning_rate": 1.9519305886690378e-06, "loss": 0.5144, "num_input_tokens_seen": 34320680, "step": 59490 }, { "epoch": 8.8613345248734, "grad_norm": 5.161947250366211, "learning_rate": 1.949413886530277e-06, "loss": 0.6298, "num_input_tokens_seen": 34323272, "step": 59495 }, { "epoch": 8.862079237414358, "grad_norm": 3.1282663345336914, "learning_rate": 1.9468987420564135e-06, "loss": 0.5289, "num_input_tokens_seen": 34326088, "step": 59500 }, { "epoch": 8.862823949955317, "grad_norm": 2.0585200786590576, "learning_rate": 1.9443851554174026e-06, "loss": 0.5217, "num_input_tokens_seen": 34328776, "step": 59505 }, { "epoch": 8.863568662496277, "grad_norm": 1.6088135242462158, "learning_rate": 1.9418731267831088e-06, "loss": 0.3794, "num_input_tokens_seen": 34331368, "step": 59510 }, { "epoch": 8.864313375037236, "grad_norm": 2.2393453121185303, "learning_rate": 1.939362656323279e-06, "loss": 0.5127, "num_input_tokens_seen": 34334120, "step": 59515 }, { "epoch": 8.865058087578195, "grad_norm": 3.2163007259368896, "learning_rate": 1.936853744207562e-06, "loss": 0.5139, "num_input_tokens_seen": 34336872, "step": 59520 }, { "epoch": 8.865802800119154, "grad_norm": 2.272571563720703, "learning_rate": 1.9343463906055017e-06, "loss": 0.4411, "num_input_tokens_seen": 34340200, "step": 59525 }, { "epoch": 8.866547512660112, "grad_norm": 2.8770837783813477, "learning_rate": 1.931840595686535e-06, "loss": 0.5908, "num_input_tokens_seen": 34342920, "step": 59530 }, { "epoch": 8.867292225201073, "grad_norm": 6.530064105987549, "learning_rate": 1.929336359619996e-06, "loss": 0.6034, "num_input_tokens_seen": 34345704, "step": 59535 }, { "epoch": 8.868036937742032, "grad_norm": 2.1397576332092285, "learning_rate": 1.9268336825751022e-06, "loss": 0.5064, "num_input_tokens_seen": 34348712, "step": 59540 }, { "epoch": 8.86878165028299, "grad_norm": 5.574436664581299, "learning_rate": 1.9243325647209846e-06, "loss": 0.7294, "num_input_tokens_seen": 34351528, "step": 59545 }, { "epoch": 8.86952636282395, "grad_norm": 2.5966646671295166, "learning_rate": 1.9218330062266474e-06, "loss": 0.5179, "num_input_tokens_seen": 34354248, "step": 59550 }, { "epoch": 8.87027107536491, "grad_norm": 2.1592843532562256, "learning_rate": 1.919335007261008e-06, "loss": 0.6044, "num_input_tokens_seen": 34356872, "step": 59555 }, { "epoch": 8.871015787905868, "grad_norm": 3.242250680923462, "learning_rate": 1.9168385679928707e-06, "loss": 0.5981, "num_input_tokens_seen": 34359496, "step": 59560 }, { "epoch": 8.871760500446827, "grad_norm": 2.010610818862915, "learning_rate": 1.914343688590933e-06, "loss": 0.597, "num_input_tokens_seen": 34362408, "step": 59565 }, { "epoch": 8.872505212987786, "grad_norm": 2.9187145233154297, "learning_rate": 1.9118503692237917e-06, "loss": 0.5266, "num_input_tokens_seen": 34364968, "step": 59570 }, { "epoch": 8.873249925528746, "grad_norm": 5.255524635314941, "learning_rate": 1.9093586100599304e-06, "loss": 0.5512, "num_input_tokens_seen": 34367752, "step": 59575 }, { "epoch": 8.873994638069705, "grad_norm": 3.181077480316162, "learning_rate": 1.90686841126774e-06, "loss": 0.5955, "num_input_tokens_seen": 34370632, "step": 59580 }, { "epoch": 8.874739350610664, "grad_norm": 3.9507806301116943, "learning_rate": 1.9043797730154856e-06, "loss": 0.6677, "num_input_tokens_seen": 34373224, "step": 59585 }, { "epoch": 8.875484063151623, "grad_norm": 2.665705919265747, "learning_rate": 1.9018926954713495e-06, "loss": 0.5606, "num_input_tokens_seen": 34376008, "step": 59590 }, { "epoch": 8.876228775692583, "grad_norm": 2.226043224334717, "learning_rate": 1.8994071788033919e-06, "loss": 0.5511, "num_input_tokens_seen": 34379048, "step": 59595 }, { "epoch": 8.876973488233542, "grad_norm": 3.353107452392578, "learning_rate": 1.896923223179578e-06, "loss": 0.548, "num_input_tokens_seen": 34382088, "step": 59600 }, { "epoch": 8.8777182007745, "grad_norm": 4.972551345825195, "learning_rate": 1.8944408287677683e-06, "loss": 0.6859, "num_input_tokens_seen": 34384840, "step": 59605 }, { "epoch": 8.87846291331546, "grad_norm": 10.26504898071289, "learning_rate": 1.891959995735701e-06, "loss": 0.6631, "num_input_tokens_seen": 34387848, "step": 59610 }, { "epoch": 8.87920762585642, "grad_norm": 1.9328793287277222, "learning_rate": 1.8894807242510248e-06, "loss": 0.373, "num_input_tokens_seen": 34390888, "step": 59615 }, { "epoch": 8.879952338397379, "grad_norm": 3.0310041904449463, "learning_rate": 1.8870030144812894e-06, "loss": 0.526, "num_input_tokens_seen": 34394120, "step": 59620 }, { "epoch": 8.880697050938338, "grad_norm": 5.441159725189209, "learning_rate": 1.8845268665939109e-06, "loss": 0.6889, "num_input_tokens_seen": 34396936, "step": 59625 }, { "epoch": 8.881441763479296, "grad_norm": 2.9007110595703125, "learning_rate": 1.8820522807562302e-06, "loss": 0.6868, "num_input_tokens_seen": 34399560, "step": 59630 }, { "epoch": 8.882186476020257, "grad_norm": 4.0027337074279785, "learning_rate": 1.8795792571354637e-06, "loss": 0.4028, "num_input_tokens_seen": 34402472, "step": 59635 }, { "epoch": 8.882931188561216, "grad_norm": 7.483813762664795, "learning_rate": 1.8771077958987333e-06, "loss": 0.7686, "num_input_tokens_seen": 34405352, "step": 59640 }, { "epoch": 8.883675901102174, "grad_norm": 2.5490024089813232, "learning_rate": 1.874637897213044e-06, "loss": 0.5128, "num_input_tokens_seen": 34408168, "step": 59645 }, { "epoch": 8.884420613643133, "grad_norm": 5.265382766723633, "learning_rate": 1.8721695612453072e-06, "loss": 0.6455, "num_input_tokens_seen": 34411016, "step": 59650 }, { "epoch": 8.885165326184094, "grad_norm": 3.2496085166931152, "learning_rate": 1.869702788162317e-06, "loss": 0.5938, "num_input_tokens_seen": 34413704, "step": 59655 }, { "epoch": 8.885910038725052, "grad_norm": 5.04572057723999, "learning_rate": 1.8672375781307787e-06, "loss": 0.5981, "num_input_tokens_seen": 34416616, "step": 59660 }, { "epoch": 8.886654751266011, "grad_norm": 3.337284803390503, "learning_rate": 1.864773931317268e-06, "loss": 0.69, "num_input_tokens_seen": 34419432, "step": 59665 }, { "epoch": 8.88739946380697, "grad_norm": 2.7867584228515625, "learning_rate": 1.8623118478882733e-06, "loss": 0.5877, "num_input_tokens_seen": 34422088, "step": 59670 }, { "epoch": 8.88814417634793, "grad_norm": 4.419233798980713, "learning_rate": 1.8598513280101786e-06, "loss": 0.519, "num_input_tokens_seen": 34424648, "step": 59675 }, { "epoch": 8.88888888888889, "grad_norm": 2.2586283683776855, "learning_rate": 1.8573923718492454e-06, "loss": 0.653, "num_input_tokens_seen": 34427432, "step": 59680 }, { "epoch": 8.889633601429848, "grad_norm": 4.835247993469238, "learning_rate": 1.854934979571643e-06, "loss": 0.7107, "num_input_tokens_seen": 34430280, "step": 59685 }, { "epoch": 8.890378313970807, "grad_norm": 5.543692588806152, "learning_rate": 1.8524791513434364e-06, "loss": 0.5201, "num_input_tokens_seen": 34433000, "step": 59690 }, { "epoch": 8.891123026511767, "grad_norm": 2.9190139770507812, "learning_rate": 1.8500248873305758e-06, "loss": 0.6516, "num_input_tokens_seen": 34435944, "step": 59695 }, { "epoch": 8.891867739052726, "grad_norm": 1.8627519607543945, "learning_rate": 1.8475721876989177e-06, "loss": 0.7677, "num_input_tokens_seen": 34438824, "step": 59700 }, { "epoch": 8.892612451593685, "grad_norm": 2.0996482372283936, "learning_rate": 1.845121052614196e-06, "loss": 0.5564, "num_input_tokens_seen": 34441640, "step": 59705 }, { "epoch": 8.893357164134644, "grad_norm": 3.410619020462036, "learning_rate": 1.842671482242056e-06, "loss": 0.5828, "num_input_tokens_seen": 34444328, "step": 59710 }, { "epoch": 8.894101876675602, "grad_norm": 2.973642587661743, "learning_rate": 1.8402234767480237e-06, "loss": 0.5536, "num_input_tokens_seen": 34447272, "step": 59715 }, { "epoch": 8.894846589216563, "grad_norm": 3.8649191856384277, "learning_rate": 1.8377770362975277e-06, "loss": 0.4981, "num_input_tokens_seen": 34449832, "step": 59720 }, { "epoch": 8.895591301757522, "grad_norm": 2.916449546813965, "learning_rate": 1.835332161055886e-06, "loss": 0.5919, "num_input_tokens_seen": 34452872, "step": 59725 }, { "epoch": 8.89633601429848, "grad_norm": 4.485516548156738, "learning_rate": 1.832888851188319e-06, "loss": 0.5936, "num_input_tokens_seen": 34455624, "step": 59730 }, { "epoch": 8.89708072683944, "grad_norm": 2.530959367752075, "learning_rate": 1.8304471068599365e-06, "loss": 0.5594, "num_input_tokens_seen": 34458568, "step": 59735 }, { "epoch": 8.8978254393804, "grad_norm": 2.4004318714141846, "learning_rate": 1.8280069282357342e-06, "loss": 0.5553, "num_input_tokens_seen": 34461352, "step": 59740 }, { "epoch": 8.898570151921358, "grad_norm": 4.305004596710205, "learning_rate": 1.8255683154806163e-06, "loss": 0.8773, "num_input_tokens_seen": 34464232, "step": 59745 }, { "epoch": 8.899314864462317, "grad_norm": 3.296337842941284, "learning_rate": 1.8231312687593677e-06, "loss": 0.3897, "num_input_tokens_seen": 34467016, "step": 59750 }, { "epoch": 8.900059577003276, "grad_norm": 2.629879951477051, "learning_rate": 1.8206957882366788e-06, "loss": 0.5744, "num_input_tokens_seen": 34469608, "step": 59755 }, { "epoch": 8.900804289544237, "grad_norm": 3.19926381111145, "learning_rate": 1.818261874077129e-06, "loss": 0.5031, "num_input_tokens_seen": 34472648, "step": 59760 }, { "epoch": 8.901549002085195, "grad_norm": 4.791697978973389, "learning_rate": 1.8158295264451897e-06, "loss": 0.4931, "num_input_tokens_seen": 34475752, "step": 59765 }, { "epoch": 8.902293714626154, "grad_norm": 4.1761393547058105, "learning_rate": 1.813398745505235e-06, "loss": 0.4682, "num_input_tokens_seen": 34478792, "step": 59770 }, { "epoch": 8.903038427167113, "grad_norm": 1.420422077178955, "learning_rate": 1.8109695314215192e-06, "loss": 0.437, "num_input_tokens_seen": 34481800, "step": 59775 }, { "epoch": 8.903783139708073, "grad_norm": 2.354196786880493, "learning_rate": 1.8085418843582086e-06, "loss": 0.7351, "num_input_tokens_seen": 34484840, "step": 59780 }, { "epoch": 8.904527852249032, "grad_norm": 4.445549964904785, "learning_rate": 1.8061158044793413e-06, "loss": 0.7799, "num_input_tokens_seen": 34487720, "step": 59785 }, { "epoch": 8.90527256478999, "grad_norm": 1.9189047813415527, "learning_rate": 1.8036912919488697e-06, "loss": 0.4467, "num_input_tokens_seen": 34490760, "step": 59790 }, { "epoch": 8.90601727733095, "grad_norm": 4.167731761932373, "learning_rate": 1.8012683469306319e-06, "loss": 0.6964, "num_input_tokens_seen": 34493608, "step": 59795 }, { "epoch": 8.90676198987191, "grad_norm": 3.162463665008545, "learning_rate": 1.798846969588358e-06, "loss": 0.617, "num_input_tokens_seen": 34496232, "step": 59800 }, { "epoch": 8.907506702412869, "grad_norm": 2.738560914993286, "learning_rate": 1.7964271600856813e-06, "loss": 0.5328, "num_input_tokens_seen": 34499080, "step": 59805 }, { "epoch": 8.908251414953828, "grad_norm": 3.8783600330352783, "learning_rate": 1.7940089185861153e-06, "loss": 0.7532, "num_input_tokens_seen": 34502088, "step": 59810 }, { "epoch": 8.908996127494786, "grad_norm": 3.2289438247680664, "learning_rate": 1.7915922452530793e-06, "loss": 0.5598, "num_input_tokens_seen": 34505320, "step": 59815 }, { "epoch": 8.909740840035747, "grad_norm": 1.841015100479126, "learning_rate": 1.7891771402498813e-06, "loss": 0.4134, "num_input_tokens_seen": 34508328, "step": 59820 }, { "epoch": 8.910485552576706, "grad_norm": 1.8728922605514526, "learning_rate": 1.7867636037397244e-06, "loss": 0.6063, "num_input_tokens_seen": 34511080, "step": 59825 }, { "epoch": 8.911230265117664, "grad_norm": 2.5067200660705566, "learning_rate": 1.7843516358857004e-06, "loss": 0.4652, "num_input_tokens_seen": 34513800, "step": 59830 }, { "epoch": 8.911974977658623, "grad_norm": 4.37735652923584, "learning_rate": 1.7819412368508064e-06, "loss": 0.547, "num_input_tokens_seen": 34516552, "step": 59835 }, { "epoch": 8.912719690199584, "grad_norm": 3.166058301925659, "learning_rate": 1.7795324067979318e-06, "loss": 0.5962, "num_input_tokens_seen": 34519272, "step": 59840 }, { "epoch": 8.913464402740543, "grad_norm": 2.319873809814453, "learning_rate": 1.7771251458898436e-06, "loss": 0.6878, "num_input_tokens_seen": 34522248, "step": 59845 }, { "epoch": 8.914209115281501, "grad_norm": 1.4454212188720703, "learning_rate": 1.7747194542892226e-06, "loss": 0.3585, "num_input_tokens_seen": 34525320, "step": 59850 }, { "epoch": 8.91495382782246, "grad_norm": 2.301682472229004, "learning_rate": 1.7723153321586305e-06, "loss": 0.4957, "num_input_tokens_seen": 34528232, "step": 59855 }, { "epoch": 8.915698540363419, "grad_norm": 2.6558101177215576, "learning_rate": 1.7699127796605348e-06, "loss": 0.623, "num_input_tokens_seen": 34531368, "step": 59860 }, { "epoch": 8.91644325290438, "grad_norm": 2.7747249603271484, "learning_rate": 1.7675117969572885e-06, "loss": 0.4492, "num_input_tokens_seen": 34534280, "step": 59865 }, { "epoch": 8.917187965445338, "grad_norm": 2.5227062702178955, "learning_rate": 1.7651123842111372e-06, "loss": 0.8136, "num_input_tokens_seen": 34537480, "step": 59870 }, { "epoch": 8.917932677986297, "grad_norm": 2.336885929107666, "learning_rate": 1.7627145415842261e-06, "loss": 0.5709, "num_input_tokens_seen": 34540296, "step": 59875 }, { "epoch": 8.918677390527257, "grad_norm": 2.0117788314819336, "learning_rate": 1.7603182692385867e-06, "loss": 0.6048, "num_input_tokens_seen": 34543048, "step": 59880 }, { "epoch": 8.919422103068216, "grad_norm": 2.9066758155822754, "learning_rate": 1.7579235673361533e-06, "loss": 0.4985, "num_input_tokens_seen": 34546312, "step": 59885 }, { "epoch": 8.920166815609175, "grad_norm": 7.2653326988220215, "learning_rate": 1.755530436038752e-06, "loss": 0.4747, "num_input_tokens_seen": 34549064, "step": 59890 }, { "epoch": 8.920911528150134, "grad_norm": 6.278341770172119, "learning_rate": 1.7531388755080951e-06, "loss": 0.7472, "num_input_tokens_seen": 34553096, "step": 59895 }, { "epoch": 8.921656240691092, "grad_norm": 3.2798147201538086, "learning_rate": 1.7507488859058035e-06, "loss": 0.6203, "num_input_tokens_seen": 34556008, "step": 59900 }, { "epoch": 8.922400953232053, "grad_norm": 4.829977989196777, "learning_rate": 1.7483604673933756e-06, "loss": 0.5505, "num_input_tokens_seen": 34558856, "step": 59905 }, { "epoch": 8.923145665773012, "grad_norm": 3.7080447673797607, "learning_rate": 1.7459736201322158e-06, "loss": 0.4576, "num_input_tokens_seen": 34561512, "step": 59910 }, { "epoch": 8.92389037831397, "grad_norm": 2.827754020690918, "learning_rate": 1.7435883442836086e-06, "loss": 0.5902, "num_input_tokens_seen": 34564648, "step": 59915 }, { "epoch": 8.92463509085493, "grad_norm": 2.832195520401001, "learning_rate": 1.7412046400087505e-06, "loss": 0.55, "num_input_tokens_seen": 34567432, "step": 59920 }, { "epoch": 8.92537980339589, "grad_norm": 3.5622127056121826, "learning_rate": 1.7388225074687182e-06, "loss": 0.4835, "num_input_tokens_seen": 34570152, "step": 59925 }, { "epoch": 8.926124515936849, "grad_norm": 2.5378854274749756, "learning_rate": 1.736441946824488e-06, "loss": 0.4253, "num_input_tokens_seen": 34572840, "step": 59930 }, { "epoch": 8.926869228477807, "grad_norm": 3.5135483741760254, "learning_rate": 1.7340629582369316e-06, "loss": 0.4966, "num_input_tokens_seen": 34575880, "step": 59935 }, { "epoch": 8.927613941018766, "grad_norm": 3.058608293533325, "learning_rate": 1.7316855418668038e-06, "loss": 0.6025, "num_input_tokens_seen": 34579176, "step": 59940 }, { "epoch": 8.928358653559727, "grad_norm": 2.106555938720703, "learning_rate": 1.7293096978747703e-06, "loss": 0.7317, "num_input_tokens_seen": 34581864, "step": 59945 }, { "epoch": 8.929103366100685, "grad_norm": 5.176596164703369, "learning_rate": 1.7269354264213694e-06, "loss": 0.6694, "num_input_tokens_seen": 34584840, "step": 59950 }, { "epoch": 8.929848078641644, "grad_norm": 1.8520267009735107, "learning_rate": 1.7245627276670535e-06, "loss": 0.6805, "num_input_tokens_seen": 34587816, "step": 59955 }, { "epoch": 8.930592791182603, "grad_norm": 2.83949875831604, "learning_rate": 1.722191601772158e-06, "loss": 0.8479, "num_input_tokens_seen": 34590792, "step": 59960 }, { "epoch": 8.931337503723563, "grad_norm": 1.7818348407745361, "learning_rate": 1.7198220488969102e-06, "loss": 0.6649, "num_input_tokens_seen": 34593640, "step": 59965 }, { "epoch": 8.932082216264522, "grad_norm": 3.930925130844116, "learning_rate": 1.7174540692014435e-06, "loss": 0.4677, "num_input_tokens_seen": 34596232, "step": 59970 }, { "epoch": 8.932826928805481, "grad_norm": 1.693941354751587, "learning_rate": 1.7150876628457686e-06, "loss": 0.6181, "num_input_tokens_seen": 34598792, "step": 59975 }, { "epoch": 8.93357164134644, "grad_norm": 2.8226969242095947, "learning_rate": 1.7127228299897991e-06, "loss": 0.6504, "num_input_tokens_seen": 34601704, "step": 59980 }, { "epoch": 8.9343163538874, "grad_norm": 3.408421754837036, "learning_rate": 1.7103595707933434e-06, "loss": 0.6553, "num_input_tokens_seen": 34604392, "step": 59985 }, { "epoch": 8.935061066428359, "grad_norm": 2.3703150749206543, "learning_rate": 1.707997885416096e-06, "loss": 0.5925, "num_input_tokens_seen": 34607080, "step": 59990 }, { "epoch": 8.935805778969318, "grad_norm": 5.2239789962768555, "learning_rate": 1.7056377740176543e-06, "loss": 0.7789, "num_input_tokens_seen": 34609928, "step": 59995 }, { "epoch": 8.936550491510276, "grad_norm": 4.771263122558594, "learning_rate": 1.7032792367575047e-06, "loss": 0.5172, "num_input_tokens_seen": 34612712, "step": 60000 }, { "epoch": 8.937295204051237, "grad_norm": 3.561159133911133, "learning_rate": 1.7009222737950276e-06, "loss": 0.3764, "num_input_tokens_seen": 34615464, "step": 60005 }, { "epoch": 8.938039916592196, "grad_norm": 1.4012134075164795, "learning_rate": 1.698566885289496e-06, "loss": 0.4118, "num_input_tokens_seen": 34618248, "step": 60010 }, { "epoch": 8.938784629133155, "grad_norm": 2.710871934890747, "learning_rate": 1.696213071400074e-06, "loss": 0.6111, "num_input_tokens_seen": 34621096, "step": 60015 }, { "epoch": 8.939529341674113, "grad_norm": 2.325021505355835, "learning_rate": 1.693860832285829e-06, "loss": 0.5128, "num_input_tokens_seen": 34624296, "step": 60020 }, { "epoch": 8.940274054215074, "grad_norm": 2.493830442428589, "learning_rate": 1.6915101681057144e-06, "loss": 0.436, "num_input_tokens_seen": 34627048, "step": 60025 }, { "epoch": 8.941018766756033, "grad_norm": 2.194720983505249, "learning_rate": 1.6891610790185752e-06, "loss": 0.4207, "num_input_tokens_seen": 34629928, "step": 60030 }, { "epoch": 8.941763479296991, "grad_norm": 1.5105299949645996, "learning_rate": 1.686813565183154e-06, "loss": 0.6819, "num_input_tokens_seen": 34633224, "step": 60035 }, { "epoch": 8.94250819183795, "grad_norm": 2.751051187515259, "learning_rate": 1.6844676267580932e-06, "loss": 0.616, "num_input_tokens_seen": 34636200, "step": 60040 }, { "epoch": 8.943252904378909, "grad_norm": 3.6190714836120605, "learning_rate": 1.6821232639019107e-06, "loss": 0.6531, "num_input_tokens_seen": 34639208, "step": 60045 }, { "epoch": 8.94399761691987, "grad_norm": 3.3030850887298584, "learning_rate": 1.6797804767730352e-06, "loss": 0.6102, "num_input_tokens_seen": 34641928, "step": 60050 }, { "epoch": 8.944742329460828, "grad_norm": 3.219297409057617, "learning_rate": 1.6774392655297817e-06, "loss": 0.5985, "num_input_tokens_seen": 34644616, "step": 60055 }, { "epoch": 8.945487042001787, "grad_norm": 4.001824855804443, "learning_rate": 1.6750996303303596e-06, "loss": 0.685, "num_input_tokens_seen": 34647656, "step": 60060 }, { "epoch": 8.946231754542747, "grad_norm": 3.915466070175171, "learning_rate": 1.6727615713328788e-06, "loss": 0.6202, "num_input_tokens_seen": 34650664, "step": 60065 }, { "epoch": 8.946976467083706, "grad_norm": 5.7584099769592285, "learning_rate": 1.670425088695321e-06, "loss": 0.6443, "num_input_tokens_seen": 34653608, "step": 60070 }, { "epoch": 8.947721179624665, "grad_norm": 4.357421875, "learning_rate": 1.6680901825755908e-06, "loss": 0.5249, "num_input_tokens_seen": 34656584, "step": 60075 }, { "epoch": 8.948465892165624, "grad_norm": 2.2088565826416016, "learning_rate": 1.6657568531314615e-06, "loss": 0.6776, "num_input_tokens_seen": 34659208, "step": 60080 }, { "epoch": 8.949210604706582, "grad_norm": 1.3727411031723022, "learning_rate": 1.663425100520616e-06, "loss": 0.5909, "num_input_tokens_seen": 34661864, "step": 60085 }, { "epoch": 8.949955317247543, "grad_norm": 6.536814212799072, "learning_rate": 1.661094924900619e-06, "loss": 0.7906, "num_input_tokens_seen": 34665000, "step": 60090 }, { "epoch": 8.950700029788502, "grad_norm": 2.527493476867676, "learning_rate": 1.65876632642894e-06, "loss": 0.543, "num_input_tokens_seen": 34667848, "step": 60095 }, { "epoch": 8.95144474232946, "grad_norm": 3.9931468963623047, "learning_rate": 1.6564393052629384e-06, "loss": 0.5994, "num_input_tokens_seen": 34670792, "step": 60100 }, { "epoch": 8.95218945487042, "grad_norm": 1.8524788618087769, "learning_rate": 1.6541138615598585e-06, "loss": 0.5074, "num_input_tokens_seen": 34673416, "step": 60105 }, { "epoch": 8.95293416741138, "grad_norm": 2.5394489765167236, "learning_rate": 1.6517899954768434e-06, "loss": 0.5224, "num_input_tokens_seen": 34676232, "step": 60110 }, { "epoch": 8.953678879952339, "grad_norm": 2.3559210300445557, "learning_rate": 1.6494677071709347e-06, "loss": 0.6109, "num_input_tokens_seen": 34679048, "step": 60115 }, { "epoch": 8.954423592493297, "grad_norm": 4.157610893249512, "learning_rate": 1.6471469967990622e-06, "loss": 0.7743, "num_input_tokens_seen": 34681992, "step": 60120 }, { "epoch": 8.955168305034256, "grad_norm": 2.7345664501190186, "learning_rate": 1.6448278645180477e-06, "loss": 0.8659, "num_input_tokens_seen": 34684776, "step": 60125 }, { "epoch": 8.955913017575217, "grad_norm": 2.5179078578948975, "learning_rate": 1.6425103104846128e-06, "loss": 0.5815, "num_input_tokens_seen": 34687528, "step": 60130 }, { "epoch": 8.956657730116175, "grad_norm": 4.784262180328369, "learning_rate": 1.6401943348553688e-06, "loss": 0.5495, "num_input_tokens_seen": 34690600, "step": 60135 }, { "epoch": 8.957402442657134, "grad_norm": 4.0879058837890625, "learning_rate": 1.6378799377868155e-06, "loss": 0.5294, "num_input_tokens_seen": 34693448, "step": 60140 }, { "epoch": 8.958147155198093, "grad_norm": 2.3352582454681396, "learning_rate": 1.635567119435355e-06, "loss": 0.5779, "num_input_tokens_seen": 34696520, "step": 60145 }, { "epoch": 8.958891867739053, "grad_norm": 1.4233684539794922, "learning_rate": 1.6332558799572711e-06, "loss": 0.5338, "num_input_tokens_seen": 34699240, "step": 60150 }, { "epoch": 8.959636580280012, "grad_norm": 5.615656852722168, "learning_rate": 1.6309462195087555e-06, "loss": 0.589, "num_input_tokens_seen": 34702248, "step": 60155 }, { "epoch": 8.960381292820971, "grad_norm": 9.020644187927246, "learning_rate": 1.6286381382458803e-06, "loss": 0.8711, "num_input_tokens_seen": 34705384, "step": 60160 }, { "epoch": 8.96112600536193, "grad_norm": 2.2028567790985107, "learning_rate": 1.6263316363246184e-06, "loss": 0.532, "num_input_tokens_seen": 34708168, "step": 60165 }, { "epoch": 8.96187071790289, "grad_norm": 2.9076523780822754, "learning_rate": 1.624026713900839e-06, "loss": 0.3414, "num_input_tokens_seen": 34711080, "step": 60170 }, { "epoch": 8.962615430443849, "grad_norm": 2.660867691040039, "learning_rate": 1.6217233711302904e-06, "loss": 0.6088, "num_input_tokens_seen": 34714184, "step": 60175 }, { "epoch": 8.963360142984808, "grad_norm": 2.7602243423461914, "learning_rate": 1.619421608168628e-06, "loss": 0.6179, "num_input_tokens_seen": 34717000, "step": 60180 }, { "epoch": 8.964104855525767, "grad_norm": 5.0254130363464355, "learning_rate": 1.6171214251713974e-06, "loss": 0.8394, "num_input_tokens_seen": 34719816, "step": 60185 }, { "epoch": 8.964849568066727, "grad_norm": 4.441243648529053, "learning_rate": 1.6148228222940292e-06, "loss": 0.5975, "num_input_tokens_seen": 34722376, "step": 60190 }, { "epoch": 8.965594280607686, "grad_norm": 2.069092035293579, "learning_rate": 1.6125257996918609e-06, "loss": 0.4086, "num_input_tokens_seen": 34725128, "step": 60195 }, { "epoch": 8.966338993148645, "grad_norm": 2.971926689147949, "learning_rate": 1.6102303575201095e-06, "loss": 0.5655, "num_input_tokens_seen": 34728392, "step": 60200 }, { "epoch": 8.967083705689603, "grad_norm": 1.558003306388855, "learning_rate": 1.6079364959338983e-06, "loss": 0.4413, "num_input_tokens_seen": 34730984, "step": 60205 }, { "epoch": 8.967828418230564, "grad_norm": 4.634838104248047, "learning_rate": 1.6056442150882283e-06, "loss": 0.6033, "num_input_tokens_seen": 34733864, "step": 60210 }, { "epoch": 8.968573130771523, "grad_norm": 3.6400716304779053, "learning_rate": 1.6033535151380092e-06, "loss": 0.7252, "num_input_tokens_seen": 34736584, "step": 60215 }, { "epoch": 8.969317843312481, "grad_norm": 2.4335761070251465, "learning_rate": 1.6010643962380362e-06, "loss": 0.509, "num_input_tokens_seen": 34739496, "step": 60220 }, { "epoch": 8.97006255585344, "grad_norm": 2.710838556289673, "learning_rate": 1.5987768585430025e-06, "loss": 0.8502, "num_input_tokens_seen": 34742184, "step": 60225 }, { "epoch": 8.970807268394399, "grad_norm": 2.2458715438842773, "learning_rate": 1.5964909022074815e-06, "loss": 0.5281, "num_input_tokens_seen": 34745128, "step": 60230 }, { "epoch": 8.97155198093536, "grad_norm": 2.3696720600128174, "learning_rate": 1.5942065273859552e-06, "loss": 0.4491, "num_input_tokens_seen": 34748072, "step": 60235 }, { "epoch": 8.972296693476318, "grad_norm": 1.8414095640182495, "learning_rate": 1.591923734232792e-06, "loss": 0.7975, "num_input_tokens_seen": 34750792, "step": 60240 }, { "epoch": 8.973041406017277, "grad_norm": 2.268820285797119, "learning_rate": 1.5896425229022488e-06, "loss": 0.5863, "num_input_tokens_seen": 34753480, "step": 60245 }, { "epoch": 8.973786118558236, "grad_norm": 3.12331485748291, "learning_rate": 1.5873628935484858e-06, "loss": 0.563, "num_input_tokens_seen": 34756424, "step": 60250 }, { "epoch": 8.974530831099196, "grad_norm": 4.008724689483643, "learning_rate": 1.585084846325549e-06, "loss": 0.5856, "num_input_tokens_seen": 34759112, "step": 60255 }, { "epoch": 8.975275543640155, "grad_norm": 4.1366682052612305, "learning_rate": 1.5828083813873824e-06, "loss": 0.5473, "num_input_tokens_seen": 34762056, "step": 60260 }, { "epoch": 8.976020256181114, "grad_norm": 3.256406307220459, "learning_rate": 1.580533498887818e-06, "loss": 0.5091, "num_input_tokens_seen": 34764808, "step": 60265 }, { "epoch": 8.976764968722073, "grad_norm": 2.6339454650878906, "learning_rate": 1.5782601989805857e-06, "loss": 0.5987, "num_input_tokens_seen": 34767528, "step": 60270 }, { "epoch": 8.977509681263033, "grad_norm": 2.026047468185425, "learning_rate": 1.5759884818192988e-06, "loss": 0.5621, "num_input_tokens_seen": 34770472, "step": 60275 }, { "epoch": 8.978254393803992, "grad_norm": 2.724266767501831, "learning_rate": 1.5737183475574762e-06, "loss": 0.4052, "num_input_tokens_seen": 34773192, "step": 60280 }, { "epoch": 8.97899910634495, "grad_norm": 4.566309928894043, "learning_rate": 1.5714497963485203e-06, "loss": 0.5281, "num_input_tokens_seen": 34775880, "step": 60285 }, { "epoch": 8.97974381888591, "grad_norm": 1.8934861421585083, "learning_rate": 1.569182828345736e-06, "loss": 0.5112, "num_input_tokens_seen": 34778600, "step": 60290 }, { "epoch": 8.98048853142687, "grad_norm": 4.074845790863037, "learning_rate": 1.5669174437023149e-06, "loss": 0.4643, "num_input_tokens_seen": 34781512, "step": 60295 }, { "epoch": 8.981233243967829, "grad_norm": 4.7665181159973145, "learning_rate": 1.5646536425713426e-06, "loss": 0.5058, "num_input_tokens_seen": 34784680, "step": 60300 }, { "epoch": 8.981977956508787, "grad_norm": 0.8940914869308472, "learning_rate": 1.5623914251057942e-06, "loss": 0.3266, "num_input_tokens_seen": 34787560, "step": 60305 }, { "epoch": 8.982722669049746, "grad_norm": 2.974870443344116, "learning_rate": 1.5601307914585416e-06, "loss": 0.6672, "num_input_tokens_seen": 34790344, "step": 60310 }, { "epoch": 8.983467381590707, "grad_norm": 2.349353313446045, "learning_rate": 1.5578717417823518e-06, "loss": 0.5411, "num_input_tokens_seen": 34793096, "step": 60315 }, { "epoch": 8.984212094131665, "grad_norm": 4.077330112457275, "learning_rate": 1.5556142762298776e-06, "loss": 0.6762, "num_input_tokens_seen": 34795912, "step": 60320 }, { "epoch": 8.984956806672624, "grad_norm": 2.995922565460205, "learning_rate": 1.5533583949536745e-06, "loss": 0.6211, "num_input_tokens_seen": 34799112, "step": 60325 }, { "epoch": 8.985701519213583, "grad_norm": 1.8711628913879395, "learning_rate": 1.5511040981061848e-06, "loss": 0.5157, "num_input_tokens_seen": 34801832, "step": 60330 }, { "epoch": 8.986446231754543, "grad_norm": 2.4664318561553955, "learning_rate": 1.5488513858397475e-06, "loss": 0.5317, "num_input_tokens_seen": 34804808, "step": 60335 }, { "epoch": 8.987190944295502, "grad_norm": 3.983903169631958, "learning_rate": 1.5466002583065825e-06, "loss": 0.7576, "num_input_tokens_seen": 34807688, "step": 60340 }, { "epoch": 8.987935656836461, "grad_norm": 2.48626708984375, "learning_rate": 1.544350715658821e-06, "loss": 0.4426, "num_input_tokens_seen": 34810696, "step": 60345 }, { "epoch": 8.98868036937742, "grad_norm": 2.2520060539245605, "learning_rate": 1.542102758048472e-06, "loss": 0.6207, "num_input_tokens_seen": 34813640, "step": 60350 }, { "epoch": 8.98942508191838, "grad_norm": 3.0657708644866943, "learning_rate": 1.5398563856274472e-06, "loss": 0.8469, "num_input_tokens_seen": 34816520, "step": 60355 }, { "epoch": 8.990169794459339, "grad_norm": 3.1473517417907715, "learning_rate": 1.5376115985475448e-06, "loss": 0.6652, "num_input_tokens_seen": 34819144, "step": 60360 }, { "epoch": 8.990914507000298, "grad_norm": 1.5086511373519897, "learning_rate": 1.535368396960457e-06, "loss": 0.6422, "num_input_tokens_seen": 34821928, "step": 60365 }, { "epoch": 8.991659219541257, "grad_norm": 1.9283214807510376, "learning_rate": 1.5331267810177797e-06, "loss": 0.5667, "num_input_tokens_seen": 34824712, "step": 60370 }, { "epoch": 8.992403932082215, "grad_norm": 2.190884828567505, "learning_rate": 1.53088675087098e-06, "loss": 0.5405, "num_input_tokens_seen": 34827592, "step": 60375 }, { "epoch": 8.993148644623176, "grad_norm": 1.9109430313110352, "learning_rate": 1.5286483066714347e-06, "loss": 0.434, "num_input_tokens_seen": 34830600, "step": 60380 }, { "epoch": 8.993893357164135, "grad_norm": 3.7955968379974365, "learning_rate": 1.526411448570414e-06, "loss": 0.6811, "num_input_tokens_seen": 34833416, "step": 60385 }, { "epoch": 8.994638069705093, "grad_norm": 2.786872148513794, "learning_rate": 1.5241761767190665e-06, "loss": 0.6056, "num_input_tokens_seen": 34836104, "step": 60390 }, { "epoch": 8.995382782246054, "grad_norm": 5.223546028137207, "learning_rate": 1.5219424912684494e-06, "loss": 0.8914, "num_input_tokens_seen": 34839048, "step": 60395 }, { "epoch": 8.996127494787013, "grad_norm": 2.0191569328308105, "learning_rate": 1.5197103923695e-06, "loss": 0.594, "num_input_tokens_seen": 34841832, "step": 60400 }, { "epoch": 8.996872207327971, "grad_norm": 3.8723132610321045, "learning_rate": 1.5174798801730644e-06, "loss": 0.634, "num_input_tokens_seen": 34844584, "step": 60405 }, { "epoch": 8.99761691986893, "grad_norm": 2.2417075634002686, "learning_rate": 1.5152509548298639e-06, "loss": 0.5215, "num_input_tokens_seen": 34847880, "step": 60410 }, { "epoch": 8.998361632409889, "grad_norm": 5.32562255859375, "learning_rate": 1.5130236164905192e-06, "loss": 0.777, "num_input_tokens_seen": 34850984, "step": 60415 }, { "epoch": 8.99910634495085, "grad_norm": 4.521297931671143, "learning_rate": 1.5107978653055466e-06, "loss": 0.6147, "num_input_tokens_seen": 34853736, "step": 60420 }, { "epoch": 8.999851057491808, "grad_norm": 1.9109166860580444, "learning_rate": 1.5085737014253586e-06, "loss": 0.465, "num_input_tokens_seen": 34856584, "step": 60425 }, { "epoch": 9.0, "eval_loss": 0.6799556016921997, "eval_runtime": 74.2263, "eval_samples_per_second": 40.201, "eval_steps_per_second": 10.05, "num_input_tokens_seen": 34856680, "step": 60426 }, { "epoch": 9.000595770032767, "grad_norm": 4.863277912139893, "learning_rate": 1.5063511250002466e-06, "loss": 0.6018, "num_input_tokens_seen": 34859112, "step": 60430 }, { "epoch": 9.001340482573726, "grad_norm": 2.2213220596313477, "learning_rate": 1.5041301361804123e-06, "loss": 0.4742, "num_input_tokens_seen": 34862024, "step": 60435 }, { "epoch": 9.002085195114686, "grad_norm": 4.383445739746094, "learning_rate": 1.5019107351159328e-06, "loss": 0.5398, "num_input_tokens_seen": 34865224, "step": 60440 }, { "epoch": 9.002829907655645, "grad_norm": 2.6535186767578125, "learning_rate": 1.4996929219567884e-06, "loss": 0.5959, "num_input_tokens_seen": 34868520, "step": 60445 }, { "epoch": 9.003574620196604, "grad_norm": 2.27242374420166, "learning_rate": 1.4974766968528508e-06, "loss": 0.5901, "num_input_tokens_seen": 34871400, "step": 60450 }, { "epoch": 9.004319332737563, "grad_norm": 2.9762256145477295, "learning_rate": 1.4952620599538864e-06, "loss": 0.5162, "num_input_tokens_seen": 34874344, "step": 60455 }, { "epoch": 9.005064045278523, "grad_norm": 3.484586715698242, "learning_rate": 1.4930490114095446e-06, "loss": 0.5449, "num_input_tokens_seen": 34877064, "step": 60460 }, { "epoch": 9.005808757819482, "grad_norm": 3.2352888584136963, "learning_rate": 1.490837551369384e-06, "loss": 0.5033, "num_input_tokens_seen": 34880008, "step": 60465 }, { "epoch": 9.00655347036044, "grad_norm": 3.90157151222229, "learning_rate": 1.4886276799828402e-06, "loss": 0.6758, "num_input_tokens_seen": 34882824, "step": 60470 }, { "epoch": 9.0072981829014, "grad_norm": 2.6781821250915527, "learning_rate": 1.4864193973992441e-06, "loss": 0.3898, "num_input_tokens_seen": 34885736, "step": 60475 }, { "epoch": 9.00804289544236, "grad_norm": 3.5523126125335693, "learning_rate": 1.484212703767826e-06, "loss": 0.5228, "num_input_tokens_seen": 34888904, "step": 60480 }, { "epoch": 9.008787607983319, "grad_norm": 3.0380232334136963, "learning_rate": 1.482007599237706e-06, "loss": 0.5813, "num_input_tokens_seen": 34891816, "step": 60485 }, { "epoch": 9.009532320524277, "grad_norm": 5.272233486175537, "learning_rate": 1.4798040839578946e-06, "loss": 0.7235, "num_input_tokens_seen": 34894664, "step": 60490 }, { "epoch": 9.010277033065236, "grad_norm": 2.4162604808807373, "learning_rate": 1.4776021580772958e-06, "loss": 0.3818, "num_input_tokens_seen": 34897448, "step": 60495 }, { "epoch": 9.011021745606197, "grad_norm": 2.1629042625427246, "learning_rate": 1.4754018217447125e-06, "loss": 0.4924, "num_input_tokens_seen": 34900264, "step": 60500 }, { "epoch": 9.011766458147155, "grad_norm": 2.7824339866638184, "learning_rate": 1.4732030751088255e-06, "loss": 0.4275, "num_input_tokens_seen": 34902952, "step": 60505 }, { "epoch": 9.012511170688114, "grad_norm": 5.18557071685791, "learning_rate": 1.4710059183182274e-06, "loss": 0.6031, "num_input_tokens_seen": 34905704, "step": 60510 }, { "epoch": 9.013255883229073, "grad_norm": 2.266306161880493, "learning_rate": 1.4688103515213824e-06, "loss": 0.5243, "num_input_tokens_seen": 34908808, "step": 60515 }, { "epoch": 9.014000595770034, "grad_norm": 3.3432202339172363, "learning_rate": 1.466616374866664e-06, "loss": 0.6775, "num_input_tokens_seen": 34911656, "step": 60520 }, { "epoch": 9.014745308310992, "grad_norm": 1.334101915359497, "learning_rate": 1.4644239885023309e-06, "loss": 0.4624, "num_input_tokens_seen": 34914568, "step": 60525 }, { "epoch": 9.015490020851951, "grad_norm": 3.5176455974578857, "learning_rate": 1.4622331925765343e-06, "loss": 0.5299, "num_input_tokens_seen": 34917544, "step": 60530 }, { "epoch": 9.01623473339291, "grad_norm": 1.840855360031128, "learning_rate": 1.460043987237325e-06, "loss": 0.6957, "num_input_tokens_seen": 34920200, "step": 60535 }, { "epoch": 9.01697944593387, "grad_norm": 2.196392297744751, "learning_rate": 1.457856372632635e-06, "loss": 0.6404, "num_input_tokens_seen": 34923016, "step": 60540 }, { "epoch": 9.017724158474829, "grad_norm": 2.2626936435699463, "learning_rate": 1.4556703489102958e-06, "loss": 0.484, "num_input_tokens_seen": 34925800, "step": 60545 }, { "epoch": 9.018468871015788, "grad_norm": 3.835733413696289, "learning_rate": 1.4534859162180308e-06, "loss": 0.6748, "num_input_tokens_seen": 34929032, "step": 60550 }, { "epoch": 9.019213583556747, "grad_norm": 4.366989612579346, "learning_rate": 1.451303074703453e-06, "loss": 0.8661, "num_input_tokens_seen": 34932104, "step": 60555 }, { "epoch": 9.019958296097707, "grad_norm": 2.1438357830047607, "learning_rate": 1.4491218245140715e-06, "loss": 0.524, "num_input_tokens_seen": 34934952, "step": 60560 }, { "epoch": 9.020703008638666, "grad_norm": 1.8335962295532227, "learning_rate": 1.4469421657972855e-06, "loss": 0.7224, "num_input_tokens_seen": 34937960, "step": 60565 }, { "epoch": 9.021447721179625, "grad_norm": 3.5340189933776855, "learning_rate": 1.4447640987003935e-06, "loss": 0.4835, "num_input_tokens_seen": 34941000, "step": 60570 }, { "epoch": 9.022192433720583, "grad_norm": 3.0791313648223877, "learning_rate": 1.4425876233705698e-06, "loss": 0.6879, "num_input_tokens_seen": 34943752, "step": 60575 }, { "epoch": 9.022937146261542, "grad_norm": 8.578954696655273, "learning_rate": 1.4404127399548966e-06, "loss": 0.6599, "num_input_tokens_seen": 34946440, "step": 60580 }, { "epoch": 9.023681858802503, "grad_norm": 2.0401411056518555, "learning_rate": 1.4382394486003454e-06, "loss": 0.4263, "num_input_tokens_seen": 34949448, "step": 60585 }, { "epoch": 9.024426571343461, "grad_norm": 2.6475419998168945, "learning_rate": 1.436067749453779e-06, "loss": 0.4933, "num_input_tokens_seen": 34952424, "step": 60590 }, { "epoch": 9.02517128388442, "grad_norm": 5.8568949699401855, "learning_rate": 1.4338976426619493e-06, "loss": 0.5825, "num_input_tokens_seen": 34955528, "step": 60595 }, { "epoch": 9.025915996425379, "grad_norm": 2.8528926372528076, "learning_rate": 1.431729128371506e-06, "loss": 0.707, "num_input_tokens_seen": 34958760, "step": 60600 }, { "epoch": 9.02666070896634, "grad_norm": 2.918485164642334, "learning_rate": 1.4295622067289821e-06, "loss": 0.5921, "num_input_tokens_seen": 34961640, "step": 60605 }, { "epoch": 9.027405421507298, "grad_norm": 2.675662040710449, "learning_rate": 1.4273968778808155e-06, "loss": 0.4584, "num_input_tokens_seen": 34964808, "step": 60610 }, { "epoch": 9.028150134048257, "grad_norm": 2.986778974533081, "learning_rate": 1.4252331419733283e-06, "loss": 0.6648, "num_input_tokens_seen": 34967464, "step": 60615 }, { "epoch": 9.028894846589216, "grad_norm": 1.6897448301315308, "learning_rate": 1.423070999152737e-06, "loss": 0.4513, "num_input_tokens_seen": 34970696, "step": 60620 }, { "epoch": 9.029639559130176, "grad_norm": 3.8239998817443848, "learning_rate": 1.4209104495651492e-06, "loss": 0.6166, "num_input_tokens_seen": 34973608, "step": 60625 }, { "epoch": 9.030384271671135, "grad_norm": 7.766561985015869, "learning_rate": 1.4187514933565738e-06, "loss": 0.5713, "num_input_tokens_seen": 34976520, "step": 60630 }, { "epoch": 9.031128984212094, "grad_norm": 2.3163654804229736, "learning_rate": 1.4165941306728963e-06, "loss": 0.7396, "num_input_tokens_seen": 34979496, "step": 60635 }, { "epoch": 9.031873696753053, "grad_norm": 4.347653865814209, "learning_rate": 1.4144383616599033e-06, "loss": 0.6434, "num_input_tokens_seen": 34982376, "step": 60640 }, { "epoch": 9.032618409294013, "grad_norm": 5.080604076385498, "learning_rate": 1.4122841864632724e-06, "loss": 0.5892, "num_input_tokens_seen": 34985000, "step": 60645 }, { "epoch": 9.033363121834972, "grad_norm": 2.469571590423584, "learning_rate": 1.4101316052285734e-06, "loss": 0.4236, "num_input_tokens_seen": 34987848, "step": 60650 }, { "epoch": 9.03410783437593, "grad_norm": 3.2760820388793945, "learning_rate": 1.4079806181012733e-06, "loss": 0.533, "num_input_tokens_seen": 34990376, "step": 60655 }, { "epoch": 9.03485254691689, "grad_norm": 3.7802228927612305, "learning_rate": 1.4058312252267253e-06, "loss": 0.6026, "num_input_tokens_seen": 34993032, "step": 60660 }, { "epoch": 9.03559725945785, "grad_norm": 2.633643388748169, "learning_rate": 1.4036834267501796e-06, "loss": 0.5637, "num_input_tokens_seen": 34995816, "step": 60665 }, { "epoch": 9.036341971998809, "grad_norm": 3.9276838302612305, "learning_rate": 1.4015372228167705e-06, "loss": 0.5003, "num_input_tokens_seen": 34998440, "step": 60670 }, { "epoch": 9.037086684539767, "grad_norm": 3.8292994499206543, "learning_rate": 1.399392613571529e-06, "loss": 0.6068, "num_input_tokens_seen": 35001416, "step": 60675 }, { "epoch": 9.037831397080726, "grad_norm": 4.242722511291504, "learning_rate": 1.3972495991593836e-06, "loss": 0.678, "num_input_tokens_seen": 35004232, "step": 60680 }, { "epoch": 9.038576109621687, "grad_norm": 2.5624051094055176, "learning_rate": 1.3951081797251463e-06, "loss": 0.6204, "num_input_tokens_seen": 35007016, "step": 60685 }, { "epoch": 9.039320822162646, "grad_norm": 5.587704658508301, "learning_rate": 1.3929683554135292e-06, "loss": 0.6295, "num_input_tokens_seen": 35009800, "step": 60690 }, { "epoch": 9.040065534703604, "grad_norm": 2.879725933074951, "learning_rate": 1.3908301263691303e-06, "loss": 0.3594, "num_input_tokens_seen": 35012680, "step": 60695 }, { "epoch": 9.040810247244563, "grad_norm": 2.8392107486724854, "learning_rate": 1.3886934927364454e-06, "loss": 0.6376, "num_input_tokens_seen": 35015304, "step": 60700 }, { "epoch": 9.041554959785524, "grad_norm": 3.1166024208068848, "learning_rate": 1.3865584546598559e-06, "loss": 0.4785, "num_input_tokens_seen": 35018056, "step": 60705 }, { "epoch": 9.042299672326482, "grad_norm": 2.2724111080169678, "learning_rate": 1.384425012283644e-06, "loss": 0.4728, "num_input_tokens_seen": 35020616, "step": 60710 }, { "epoch": 9.043044384867441, "grad_norm": 2.085923910140991, "learning_rate": 1.3822931657519744e-06, "loss": 0.6856, "num_input_tokens_seen": 35023496, "step": 60715 }, { "epoch": 9.0437890974084, "grad_norm": 5.189085006713867, "learning_rate": 1.3801629152089073e-06, "loss": 0.6666, "num_input_tokens_seen": 35026408, "step": 60720 }, { "epoch": 9.04453380994936, "grad_norm": 5.192884922027588, "learning_rate": 1.3780342607983999e-06, "loss": 0.5868, "num_input_tokens_seen": 35029160, "step": 60725 }, { "epoch": 9.04527852249032, "grad_norm": 2.8421378135681152, "learning_rate": 1.3759072026642978e-06, "loss": 0.5993, "num_input_tokens_seen": 35031912, "step": 60730 }, { "epoch": 9.046023235031278, "grad_norm": 1.9052058458328247, "learning_rate": 1.3737817409503417e-06, "loss": 0.4536, "num_input_tokens_seen": 35034856, "step": 60735 }, { "epoch": 9.046767947572237, "grad_norm": 2.764007091522217, "learning_rate": 1.3716578758001557e-06, "loss": 0.4001, "num_input_tokens_seen": 35037832, "step": 60740 }, { "epoch": 9.047512660113195, "grad_norm": 2.1957807540893555, "learning_rate": 1.3695356073572612e-06, "loss": 0.5706, "num_input_tokens_seen": 35040904, "step": 60745 }, { "epoch": 9.048257372654156, "grad_norm": 2.009777307510376, "learning_rate": 1.3674149357650822e-06, "loss": 0.3884, "num_input_tokens_seen": 35043688, "step": 60750 }, { "epoch": 9.049002085195115, "grad_norm": 2.1918249130249023, "learning_rate": 1.3652958611669153e-06, "loss": 0.474, "num_input_tokens_seen": 35046696, "step": 60755 }, { "epoch": 9.049746797736073, "grad_norm": 2.562403440475464, "learning_rate": 1.3631783837059625e-06, "loss": 0.4676, "num_input_tokens_seen": 35050088, "step": 60760 }, { "epoch": 9.050491510277032, "grad_norm": 5.508955955505371, "learning_rate": 1.3610625035253178e-06, "loss": 0.6072, "num_input_tokens_seen": 35053000, "step": 60765 }, { "epoch": 9.051236222817993, "grad_norm": 2.328401803970337, "learning_rate": 1.3589482207679555e-06, "loss": 0.6738, "num_input_tokens_seen": 35055752, "step": 60770 }, { "epoch": 9.051980935358952, "grad_norm": 1.67868173122406, "learning_rate": 1.3568355355767559e-06, "loss": 0.598, "num_input_tokens_seen": 35058696, "step": 60775 }, { "epoch": 9.05272564789991, "grad_norm": 4.63059663772583, "learning_rate": 1.3547244480944826e-06, "loss": 0.7388, "num_input_tokens_seen": 35061544, "step": 60780 }, { "epoch": 9.053470360440869, "grad_norm": 4.701932430267334, "learning_rate": 1.3526149584637993e-06, "loss": 0.5125, "num_input_tokens_seen": 35064488, "step": 60785 }, { "epoch": 9.05421507298183, "grad_norm": 2.7645843029022217, "learning_rate": 1.3505070668272556e-06, "loss": 0.4077, "num_input_tokens_seen": 35067176, "step": 60790 }, { "epoch": 9.054959785522788, "grad_norm": 1.902162790298462, "learning_rate": 1.3484007733272908e-06, "loss": 0.4534, "num_input_tokens_seen": 35069768, "step": 60795 }, { "epoch": 9.055704498063747, "grad_norm": 7.354096412658691, "learning_rate": 1.3462960781062434e-06, "loss": 0.584, "num_input_tokens_seen": 35072680, "step": 60800 }, { "epoch": 9.056449210604706, "grad_norm": 2.307602882385254, "learning_rate": 1.344192981306333e-06, "loss": 0.701, "num_input_tokens_seen": 35075688, "step": 60805 }, { "epoch": 9.057193923145666, "grad_norm": 3.220003128051758, "learning_rate": 1.3420914830696851e-06, "loss": 0.5138, "num_input_tokens_seen": 35078664, "step": 60810 }, { "epoch": 9.057938635686625, "grad_norm": 3.390353202819824, "learning_rate": 1.339991583538308e-06, "loss": 0.4801, "num_input_tokens_seen": 35081576, "step": 60815 }, { "epoch": 9.058683348227584, "grad_norm": 3.744809150695801, "learning_rate": 1.337893282854108e-06, "loss": 0.313, "num_input_tokens_seen": 35084744, "step": 60820 }, { "epoch": 9.059428060768543, "grad_norm": 5.650712013244629, "learning_rate": 1.3357965811588741e-06, "loss": 0.7659, "num_input_tokens_seen": 35087400, "step": 60825 }, { "epoch": 9.060172773309503, "grad_norm": 4.966189861297607, "learning_rate": 1.3337014785942985e-06, "loss": 0.8584, "num_input_tokens_seen": 35090312, "step": 60830 }, { "epoch": 9.060917485850462, "grad_norm": 4.646332263946533, "learning_rate": 1.33160797530196e-06, "loss": 0.551, "num_input_tokens_seen": 35093416, "step": 60835 }, { "epoch": 9.06166219839142, "grad_norm": 6.776625633239746, "learning_rate": 1.32951607142332e-06, "loss": 0.6428, "num_input_tokens_seen": 35096264, "step": 60840 }, { "epoch": 9.06240691093238, "grad_norm": 2.8099923133850098, "learning_rate": 1.3274257670997464e-06, "loss": 0.3731, "num_input_tokens_seen": 35099208, "step": 60845 }, { "epoch": 9.06315162347334, "grad_norm": 3.174715042114258, "learning_rate": 1.3253370624724953e-06, "loss": 0.5398, "num_input_tokens_seen": 35102024, "step": 60850 }, { "epoch": 9.063896336014299, "grad_norm": 2.5123984813690186, "learning_rate": 1.3232499576827096e-06, "loss": 0.5412, "num_input_tokens_seen": 35105288, "step": 60855 }, { "epoch": 9.064641048555258, "grad_norm": 3.5252790451049805, "learning_rate": 1.321164452871429e-06, "loss": 0.5961, "num_input_tokens_seen": 35107944, "step": 60860 }, { "epoch": 9.065385761096216, "grad_norm": 2.4527580738067627, "learning_rate": 1.319080548179588e-06, "loss": 0.5119, "num_input_tokens_seen": 35111400, "step": 60865 }, { "epoch": 9.066130473637177, "grad_norm": 2.4168646335601807, "learning_rate": 1.316998243748005e-06, "loss": 0.4218, "num_input_tokens_seen": 35114184, "step": 60870 }, { "epoch": 9.066875186178136, "grad_norm": 2.580753803253174, "learning_rate": 1.3149175397173891e-06, "loss": 0.8118, "num_input_tokens_seen": 35117416, "step": 60875 }, { "epoch": 9.067619898719094, "grad_norm": 2.5632483959198, "learning_rate": 1.3128384362283474e-06, "loss": 0.6919, "num_input_tokens_seen": 35120648, "step": 60880 }, { "epoch": 9.068364611260053, "grad_norm": 3.527574062347412, "learning_rate": 1.3107609334213816e-06, "loss": 0.4356, "num_input_tokens_seen": 35123656, "step": 60885 }, { "epoch": 9.069109323801014, "grad_norm": 2.257061243057251, "learning_rate": 1.3086850314368764e-06, "loss": 0.6024, "num_input_tokens_seen": 35126536, "step": 60890 }, { "epoch": 9.069854036341972, "grad_norm": 1.9074479341506958, "learning_rate": 1.3066107304151142e-06, "loss": 0.7147, "num_input_tokens_seen": 35129704, "step": 60895 }, { "epoch": 9.070598748882931, "grad_norm": 2.414440631866455, "learning_rate": 1.3045380304962745e-06, "loss": 0.7039, "num_input_tokens_seen": 35132488, "step": 60900 }, { "epoch": 9.07134346142389, "grad_norm": 3.840184450149536, "learning_rate": 1.302466931820412e-06, "loss": 0.5707, "num_input_tokens_seen": 35135496, "step": 60905 }, { "epoch": 9.07208817396485, "grad_norm": 4.439143180847168, "learning_rate": 1.3003974345274894e-06, "loss": 0.5561, "num_input_tokens_seen": 35138344, "step": 60910 }, { "epoch": 9.07283288650581, "grad_norm": 4.240300178527832, "learning_rate": 1.2983295387573507e-06, "loss": 0.5292, "num_input_tokens_seen": 35141000, "step": 60915 }, { "epoch": 9.073577599046768, "grad_norm": 4.974440574645996, "learning_rate": 1.296263244649737e-06, "loss": 0.3647, "num_input_tokens_seen": 35143784, "step": 60920 }, { "epoch": 9.074322311587727, "grad_norm": 7.686434745788574, "learning_rate": 1.2941985523442806e-06, "loss": 0.6901, "num_input_tokens_seen": 35146568, "step": 60925 }, { "epoch": 9.075067024128685, "grad_norm": 2.9663913249969482, "learning_rate": 1.2921354619805066e-06, "loss": 0.5341, "num_input_tokens_seen": 35149768, "step": 60930 }, { "epoch": 9.075811736669646, "grad_norm": 4.4315104484558105, "learning_rate": 1.290073973697828e-06, "loss": 0.6102, "num_input_tokens_seen": 35152424, "step": 60935 }, { "epoch": 9.076556449210605, "grad_norm": 2.827406167984009, "learning_rate": 1.288014087635553e-06, "loss": 0.5906, "num_input_tokens_seen": 35155272, "step": 60940 }, { "epoch": 9.077301161751564, "grad_norm": 1.8632642030715942, "learning_rate": 1.2859558039328784e-06, "loss": 0.6939, "num_input_tokens_seen": 35158216, "step": 60945 }, { "epoch": 9.078045874292522, "grad_norm": 0.8560604453086853, "learning_rate": 1.2838991227289016e-06, "loss": 0.502, "num_input_tokens_seen": 35161096, "step": 60950 }, { "epoch": 9.078790586833483, "grad_norm": 1.2893402576446533, "learning_rate": 1.2818440441625946e-06, "loss": 0.6324, "num_input_tokens_seen": 35163848, "step": 60955 }, { "epoch": 9.079535299374442, "grad_norm": 2.3054006099700928, "learning_rate": 1.2797905683728377e-06, "loss": 0.6212, "num_input_tokens_seen": 35166856, "step": 60960 }, { "epoch": 9.0802800119154, "grad_norm": 3.2806546688079834, "learning_rate": 1.2777386954983956e-06, "loss": 0.6063, "num_input_tokens_seen": 35170024, "step": 60965 }, { "epoch": 9.081024724456359, "grad_norm": 3.480640172958374, "learning_rate": 1.2756884256779234e-06, "loss": 0.5857, "num_input_tokens_seen": 35172712, "step": 60970 }, { "epoch": 9.08176943699732, "grad_norm": 6.308638572692871, "learning_rate": 1.2736397590499716e-06, "loss": 0.5455, "num_input_tokens_seen": 35175464, "step": 60975 }, { "epoch": 9.082514149538278, "grad_norm": 1.980821132659912, "learning_rate": 1.2715926957529794e-06, "loss": 0.5365, "num_input_tokens_seen": 35177992, "step": 60980 }, { "epoch": 9.083258862079237, "grad_norm": 4.484883785247803, "learning_rate": 1.2695472359252808e-06, "loss": 0.7271, "num_input_tokens_seen": 35180680, "step": 60985 }, { "epoch": 9.084003574620196, "grad_norm": 1.7734907865524292, "learning_rate": 1.267503379705104e-06, "loss": 0.4187, "num_input_tokens_seen": 35183592, "step": 60990 }, { "epoch": 9.084748287161156, "grad_norm": 4.871952533721924, "learning_rate": 1.2654611272305521e-06, "loss": 0.6706, "num_input_tokens_seen": 35186280, "step": 60995 }, { "epoch": 9.085492999702115, "grad_norm": 1.978561520576477, "learning_rate": 1.2634204786396458e-06, "loss": 0.448, "num_input_tokens_seen": 35189352, "step": 61000 }, { "epoch": 9.086237712243074, "grad_norm": 4.1642165184021, "learning_rate": 1.2613814340702746e-06, "loss": 0.4344, "num_input_tokens_seen": 35192232, "step": 61005 }, { "epoch": 9.086982424784033, "grad_norm": 3.116133451461792, "learning_rate": 1.2593439936602308e-06, "loss": 0.3909, "num_input_tokens_seen": 35194856, "step": 61010 }, { "epoch": 9.087727137324993, "grad_norm": 4.26207971572876, "learning_rate": 1.2573081575471963e-06, "loss": 0.5379, "num_input_tokens_seen": 35197704, "step": 61015 }, { "epoch": 9.088471849865952, "grad_norm": 2.7093992233276367, "learning_rate": 1.2552739258687469e-06, "loss": 0.5871, "num_input_tokens_seen": 35200840, "step": 61020 }, { "epoch": 9.08921656240691, "grad_norm": 5.448592185974121, "learning_rate": 1.2532412987623477e-06, "loss": 0.7158, "num_input_tokens_seen": 35203784, "step": 61025 }, { "epoch": 9.08996127494787, "grad_norm": 1.6089814901351929, "learning_rate": 1.2512102763653556e-06, "loss": 0.4959, "num_input_tokens_seen": 35206600, "step": 61030 }, { "epoch": 9.09070598748883, "grad_norm": 2.4437551498413086, "learning_rate": 1.249180858815019e-06, "loss": 0.4898, "num_input_tokens_seen": 35209544, "step": 61035 }, { "epoch": 9.091450700029789, "grad_norm": 3.1890151500701904, "learning_rate": 1.2471530462484727e-06, "loss": 0.6687, "num_input_tokens_seen": 35212264, "step": 61040 }, { "epoch": 9.092195412570748, "grad_norm": 1.9432255029678345, "learning_rate": 1.2451268388027514e-06, "loss": 0.4517, "num_input_tokens_seen": 35215016, "step": 61045 }, { "epoch": 9.092940125111706, "grad_norm": 3.080845594406128, "learning_rate": 1.2431022366147766e-06, "loss": 0.5731, "num_input_tokens_seen": 35217896, "step": 61050 }, { "epoch": 9.093684837652667, "grad_norm": 2.6660239696502686, "learning_rate": 1.2410792398213662e-06, "loss": 0.4466, "num_input_tokens_seen": 35220744, "step": 61055 }, { "epoch": 9.094429550193626, "grad_norm": 2.3450257778167725, "learning_rate": 1.2390578485592246e-06, "loss": 0.5805, "num_input_tokens_seen": 35223624, "step": 61060 }, { "epoch": 9.095174262734584, "grad_norm": 2.4000821113586426, "learning_rate": 1.2370380629649486e-06, "loss": 0.5882, "num_input_tokens_seen": 35226408, "step": 61065 }, { "epoch": 9.095918975275543, "grad_norm": 1.9191843271255493, "learning_rate": 1.2350198831750259e-06, "loss": 0.3072, "num_input_tokens_seen": 35229320, "step": 61070 }, { "epoch": 9.096663687816504, "grad_norm": 3.9002492427825928, "learning_rate": 1.233003309325842e-06, "loss": 0.6634, "num_input_tokens_seen": 35232360, "step": 61075 }, { "epoch": 9.097408400357462, "grad_norm": 3.133716106414795, "learning_rate": 1.23098834155366e-06, "loss": 0.6533, "num_input_tokens_seen": 35235464, "step": 61080 }, { "epoch": 9.098153112898421, "grad_norm": 2.5032665729522705, "learning_rate": 1.2289749799946487e-06, "loss": 0.4048, "num_input_tokens_seen": 35238696, "step": 61085 }, { "epoch": 9.09889782543938, "grad_norm": 2.114692211151123, "learning_rate": 1.2269632247848633e-06, "loss": 0.3483, "num_input_tokens_seen": 35241448, "step": 61090 }, { "epoch": 9.099642537980339, "grad_norm": 5.909801483154297, "learning_rate": 1.2249530760602534e-06, "loss": 0.7581, "num_input_tokens_seen": 35244200, "step": 61095 }, { "epoch": 9.1003872505213, "grad_norm": 2.8062357902526855, "learning_rate": 1.2229445339566465e-06, "loss": 0.5555, "num_input_tokens_seen": 35247208, "step": 61100 }, { "epoch": 9.101131963062258, "grad_norm": 1.30050790309906, "learning_rate": 1.2209375986097782e-06, "loss": 0.5968, "num_input_tokens_seen": 35250312, "step": 61105 }, { "epoch": 9.101876675603217, "grad_norm": 2.4573047161102295, "learning_rate": 1.218932270155268e-06, "loss": 0.5091, "num_input_tokens_seen": 35253672, "step": 61110 }, { "epoch": 9.102621388144176, "grad_norm": 3.5112600326538086, "learning_rate": 1.2169285487286325e-06, "loss": 0.8164, "num_input_tokens_seen": 35257064, "step": 61115 }, { "epoch": 9.103366100685136, "grad_norm": 3.3405978679656982, "learning_rate": 1.214926434465266e-06, "loss": 0.5684, "num_input_tokens_seen": 35259784, "step": 61120 }, { "epoch": 9.104110813226095, "grad_norm": 2.926292657852173, "learning_rate": 1.212925927500469e-06, "loss": 0.5527, "num_input_tokens_seen": 35262632, "step": 61125 }, { "epoch": 9.104855525767054, "grad_norm": 4.250171661376953, "learning_rate": 1.21092702796943e-06, "loss": 0.6749, "num_input_tokens_seen": 35265544, "step": 61130 }, { "epoch": 9.105600238308012, "grad_norm": 2.6625027656555176, "learning_rate": 1.2089297360072193e-06, "loss": 0.4203, "num_input_tokens_seen": 35268328, "step": 61135 }, { "epoch": 9.106344950848973, "grad_norm": 2.6554555892944336, "learning_rate": 1.2069340517488093e-06, "loss": 0.4821, "num_input_tokens_seen": 35271304, "step": 61140 }, { "epoch": 9.107089663389932, "grad_norm": 4.614955902099609, "learning_rate": 1.2049399753290612e-06, "loss": 0.5325, "num_input_tokens_seen": 35274120, "step": 61145 }, { "epoch": 9.10783437593089, "grad_norm": 1.8690156936645508, "learning_rate": 1.2029475068827262e-06, "loss": 0.511, "num_input_tokens_seen": 35276680, "step": 61150 }, { "epoch": 9.10857908847185, "grad_norm": 6.320563793182373, "learning_rate": 1.2009566465444517e-06, "loss": 0.6519, "num_input_tokens_seen": 35279400, "step": 61155 }, { "epoch": 9.10932380101281, "grad_norm": 4.575484752655029, "learning_rate": 1.1989673944487606e-06, "loss": 0.6615, "num_input_tokens_seen": 35282312, "step": 61160 }, { "epoch": 9.110068513553768, "grad_norm": 2.731933355331421, "learning_rate": 1.19697975073009e-06, "loss": 0.5492, "num_input_tokens_seen": 35285288, "step": 61165 }, { "epoch": 9.110813226094727, "grad_norm": 2.7217400074005127, "learning_rate": 1.194993715522749e-06, "loss": 0.592, "num_input_tokens_seen": 35288296, "step": 61170 }, { "epoch": 9.111557938635686, "grad_norm": 2.1502559185028076, "learning_rate": 1.1930092889609473e-06, "loss": 0.7288, "num_input_tokens_seen": 35291240, "step": 61175 }, { "epoch": 9.112302651176647, "grad_norm": 4.371459484100342, "learning_rate": 1.1910264711787855e-06, "loss": 0.6167, "num_input_tokens_seen": 35293960, "step": 61180 }, { "epoch": 9.113047363717605, "grad_norm": 3.0567779541015625, "learning_rate": 1.1890452623102566e-06, "loss": 0.5396, "num_input_tokens_seen": 35296744, "step": 61185 }, { "epoch": 9.113792076258564, "grad_norm": 2.103853702545166, "learning_rate": 1.1870656624892397e-06, "loss": 0.5763, "num_input_tokens_seen": 35299624, "step": 61190 }, { "epoch": 9.114536788799523, "grad_norm": 3.4771838188171387, "learning_rate": 1.1850876718495107e-06, "loss": 0.6154, "num_input_tokens_seen": 35302216, "step": 61195 }, { "epoch": 9.115281501340483, "grad_norm": 4.968429088592529, "learning_rate": 1.1831112905247327e-06, "loss": 0.7602, "num_input_tokens_seen": 35305000, "step": 61200 }, { "epoch": 9.116026213881442, "grad_norm": 2.6419358253479004, "learning_rate": 1.1811365186484595e-06, "loss": 0.4601, "num_input_tokens_seen": 35308136, "step": 61205 }, { "epoch": 9.1167709264224, "grad_norm": 5.229588985443115, "learning_rate": 1.1791633563541404e-06, "loss": 0.6184, "num_input_tokens_seen": 35311112, "step": 61210 }, { "epoch": 9.11751563896336, "grad_norm": 3.09965443611145, "learning_rate": 1.1771918037751128e-06, "loss": 0.6797, "num_input_tokens_seen": 35313800, "step": 61215 }, { "epoch": 9.11826035150432, "grad_norm": 2.2010152339935303, "learning_rate": 1.1752218610446037e-06, "loss": 0.4783, "num_input_tokens_seen": 35316584, "step": 61220 }, { "epoch": 9.119005064045279, "grad_norm": 4.373916149139404, "learning_rate": 1.1732535282957397e-06, "loss": 0.3994, "num_input_tokens_seen": 35319432, "step": 61225 }, { "epoch": 9.119749776586238, "grad_norm": 1.4611815214157104, "learning_rate": 1.171286805661534e-06, "loss": 0.5349, "num_input_tokens_seen": 35322216, "step": 61230 }, { "epoch": 9.120494489127196, "grad_norm": 3.346979856491089, "learning_rate": 1.169321693274883e-06, "loss": 0.5414, "num_input_tokens_seen": 35325096, "step": 61235 }, { "epoch": 9.121239201668157, "grad_norm": 3.8953969478607178, "learning_rate": 1.1673581912685805e-06, "loss": 0.6466, "num_input_tokens_seen": 35327816, "step": 61240 }, { "epoch": 9.121983914209116, "grad_norm": 7.249078750610352, "learning_rate": 1.1653962997753148e-06, "loss": 0.6708, "num_input_tokens_seen": 35330600, "step": 61245 }, { "epoch": 9.122728626750074, "grad_norm": 4.341733455657959, "learning_rate": 1.1634360189276632e-06, "loss": 0.6077, "num_input_tokens_seen": 35333768, "step": 61250 }, { "epoch": 9.123473339291033, "grad_norm": 5.869800090789795, "learning_rate": 1.161477348858095e-06, "loss": 0.6844, "num_input_tokens_seen": 35336488, "step": 61255 }, { "epoch": 9.124218051831992, "grad_norm": 3.3624391555786133, "learning_rate": 1.1595202896989677e-06, "loss": 0.5459, "num_input_tokens_seen": 35339272, "step": 61260 }, { "epoch": 9.124962764372953, "grad_norm": 8.092582702636719, "learning_rate": 1.1575648415825285e-06, "loss": 0.5923, "num_input_tokens_seen": 35342216, "step": 61265 }, { "epoch": 9.125707476913911, "grad_norm": 3.0837395191192627, "learning_rate": 1.1556110046409218e-06, "loss": 0.6042, "num_input_tokens_seen": 35345160, "step": 61270 }, { "epoch": 9.12645218945487, "grad_norm": 2.082526922225952, "learning_rate": 1.153658779006181e-06, "loss": 0.625, "num_input_tokens_seen": 35347944, "step": 61275 }, { "epoch": 9.127196901995829, "grad_norm": 2.548337697982788, "learning_rate": 1.151708164810228e-06, "loss": 0.5977, "num_input_tokens_seen": 35350920, "step": 61280 }, { "epoch": 9.12794161453679, "grad_norm": 9.705977439880371, "learning_rate": 1.1497591621848741e-06, "loss": 0.4927, "num_input_tokens_seen": 35353800, "step": 61285 }, { "epoch": 9.128686327077748, "grad_norm": 3.0481808185577393, "learning_rate": 1.1478117712618281e-06, "loss": 0.5004, "num_input_tokens_seen": 35356840, "step": 61290 }, { "epoch": 9.129431039618707, "grad_norm": 2.765143394470215, "learning_rate": 1.14586599217269e-06, "loss": 0.5851, "num_input_tokens_seen": 35359400, "step": 61295 }, { "epoch": 9.130175752159666, "grad_norm": 2.9420769214630127, "learning_rate": 1.1439218250489408e-06, "loss": 0.4292, "num_input_tokens_seen": 35362312, "step": 61300 }, { "epoch": 9.130920464700626, "grad_norm": 3.409529685974121, "learning_rate": 1.1419792700219644e-06, "loss": 0.6739, "num_input_tokens_seen": 35365384, "step": 61305 }, { "epoch": 9.131665177241585, "grad_norm": 3.9140851497650146, "learning_rate": 1.1400383272230281e-06, "loss": 0.6263, "num_input_tokens_seen": 35368424, "step": 61310 }, { "epoch": 9.132409889782544, "grad_norm": 2.4572103023529053, "learning_rate": 1.1380989967832962e-06, "loss": 0.8308, "num_input_tokens_seen": 35371496, "step": 61315 }, { "epoch": 9.133154602323502, "grad_norm": 3.6340584754943848, "learning_rate": 1.1361612788338166e-06, "loss": 0.4271, "num_input_tokens_seen": 35374312, "step": 61320 }, { "epoch": 9.133899314864463, "grad_norm": 7.5226569175720215, "learning_rate": 1.134225173505535e-06, "loss": 0.6092, "num_input_tokens_seen": 35377128, "step": 61325 }, { "epoch": 9.134644027405422, "grad_norm": 3.628356456756592, "learning_rate": 1.1322906809292877e-06, "loss": 0.5332, "num_input_tokens_seen": 35380168, "step": 61330 }, { "epoch": 9.13538873994638, "grad_norm": 2.2745521068573, "learning_rate": 1.130357801235793e-06, "loss": 0.6729, "num_input_tokens_seen": 35382856, "step": 61335 }, { "epoch": 9.13613345248734, "grad_norm": 5.048950672149658, "learning_rate": 1.128426534555674e-06, "loss": 0.6068, "num_input_tokens_seen": 35385960, "step": 61340 }, { "epoch": 9.1368781650283, "grad_norm": 6.114124298095703, "learning_rate": 1.1264968810194315e-06, "loss": 0.5711, "num_input_tokens_seen": 35389064, "step": 61345 }, { "epoch": 9.137622877569259, "grad_norm": 3.588156223297119, "learning_rate": 1.12456884075747e-06, "loss": 0.5407, "num_input_tokens_seen": 35391944, "step": 61350 }, { "epoch": 9.138367590110217, "grad_norm": 3.1373350620269775, "learning_rate": 1.1226424139000797e-06, "loss": 0.377, "num_input_tokens_seen": 35395016, "step": 61355 }, { "epoch": 9.139112302651176, "grad_norm": 4.006975173950195, "learning_rate": 1.120717600577431e-06, "loss": 0.6507, "num_input_tokens_seen": 35397864, "step": 61360 }, { "epoch": 9.139857015192137, "grad_norm": 2.5205423831939697, "learning_rate": 1.1187944009196038e-06, "loss": 0.5267, "num_input_tokens_seen": 35400648, "step": 61365 }, { "epoch": 9.140601727733095, "grad_norm": 5.028985500335693, "learning_rate": 1.116872815056555e-06, "loss": 0.7458, "num_input_tokens_seen": 35403432, "step": 61370 }, { "epoch": 9.141346440274054, "grad_norm": 3.4380664825439453, "learning_rate": 1.1149528431181417e-06, "loss": 0.6991, "num_input_tokens_seen": 35406728, "step": 61375 }, { "epoch": 9.142091152815013, "grad_norm": 2.796898365020752, "learning_rate": 1.1130344852341017e-06, "loss": 0.6446, "num_input_tokens_seen": 35409672, "step": 61380 }, { "epoch": 9.142835865355973, "grad_norm": 3.408191680908203, "learning_rate": 1.1111177415340762e-06, "loss": 0.7527, "num_input_tokens_seen": 35412872, "step": 61385 }, { "epoch": 9.143580577896932, "grad_norm": 4.090169906616211, "learning_rate": 1.109202612147589e-06, "loss": 0.5999, "num_input_tokens_seen": 35415624, "step": 61390 }, { "epoch": 9.14432529043789, "grad_norm": 1.595683217048645, "learning_rate": 1.1072890972040588e-06, "loss": 0.4476, "num_input_tokens_seen": 35418408, "step": 61395 }, { "epoch": 9.14507000297885, "grad_norm": 2.0850765705108643, "learning_rate": 1.1053771968327908e-06, "loss": 0.4639, "num_input_tokens_seen": 35421160, "step": 61400 }, { "epoch": 9.14581471551981, "grad_norm": 4.005709648132324, "learning_rate": 1.1034669111629787e-06, "loss": 0.9122, "num_input_tokens_seen": 35424200, "step": 61405 }, { "epoch": 9.146559428060769, "grad_norm": 3.206930637359619, "learning_rate": 1.101558240323719e-06, "loss": 0.6923, "num_input_tokens_seen": 35426952, "step": 61410 }, { "epoch": 9.147304140601728, "grad_norm": 2.508484363555908, "learning_rate": 1.0996511844439867e-06, "loss": 0.5759, "num_input_tokens_seen": 35429640, "step": 61415 }, { "epoch": 9.148048853142686, "grad_norm": 3.43625807762146, "learning_rate": 1.097745743652659e-06, "loss": 0.5329, "num_input_tokens_seen": 35432904, "step": 61420 }, { "epoch": 9.148793565683647, "grad_norm": 3.7609176635742188, "learning_rate": 1.095841918078494e-06, "loss": 0.5572, "num_input_tokens_seen": 35435688, "step": 61425 }, { "epoch": 9.149538278224606, "grad_norm": 3.4433536529541016, "learning_rate": 1.0939397078501445e-06, "loss": 0.6109, "num_input_tokens_seen": 35438504, "step": 61430 }, { "epoch": 9.150282990765565, "grad_norm": 4.676888942718506, "learning_rate": 1.0920391130961577e-06, "loss": 0.5099, "num_input_tokens_seen": 35441384, "step": 61435 }, { "epoch": 9.151027703306523, "grad_norm": 4.3021345138549805, "learning_rate": 1.0901401339449613e-06, "loss": 0.5785, "num_input_tokens_seen": 35444008, "step": 61440 }, { "epoch": 9.151772415847482, "grad_norm": 2.954411268234253, "learning_rate": 1.0882427705248832e-06, "loss": 0.817, "num_input_tokens_seen": 35446856, "step": 61445 }, { "epoch": 9.152517128388443, "grad_norm": 2.390037775039673, "learning_rate": 1.0863470229641403e-06, "loss": 0.5414, "num_input_tokens_seen": 35449800, "step": 61450 }, { "epoch": 9.153261840929401, "grad_norm": 4.074093818664551, "learning_rate": 1.0844528913908414e-06, "loss": 0.5262, "num_input_tokens_seen": 35452712, "step": 61455 }, { "epoch": 9.15400655347036, "grad_norm": 3.07436466217041, "learning_rate": 1.0825603759329866e-06, "loss": 0.6263, "num_input_tokens_seen": 35455624, "step": 61460 }, { "epoch": 9.154751266011319, "grad_norm": 2.035506248474121, "learning_rate": 1.0806694767184545e-06, "loss": 0.5129, "num_input_tokens_seen": 35458760, "step": 61465 }, { "epoch": 9.15549597855228, "grad_norm": 2.803727865219116, "learning_rate": 1.0787801938750314e-06, "loss": 0.372, "num_input_tokens_seen": 35461576, "step": 61470 }, { "epoch": 9.156240691093238, "grad_norm": 3.7881133556365967, "learning_rate": 1.0768925275303903e-06, "loss": 0.5913, "num_input_tokens_seen": 35464552, "step": 61475 }, { "epoch": 9.156985403634197, "grad_norm": 2.740870714187622, "learning_rate": 1.0750064778120822e-06, "loss": 0.4829, "num_input_tokens_seen": 35467560, "step": 61480 }, { "epoch": 9.157730116175156, "grad_norm": 3.407045364379883, "learning_rate": 1.073122044847566e-06, "loss": 0.8246, "num_input_tokens_seen": 35470632, "step": 61485 }, { "epoch": 9.158474828716116, "grad_norm": 2.2316031455993652, "learning_rate": 1.0712392287641842e-06, "loss": 0.6749, "num_input_tokens_seen": 35473640, "step": 61490 }, { "epoch": 9.159219541257075, "grad_norm": 2.548689603805542, "learning_rate": 1.0693580296891686e-06, "loss": 0.4549, "num_input_tokens_seen": 35476520, "step": 61495 }, { "epoch": 9.159964253798034, "grad_norm": 2.2629406452178955, "learning_rate": 1.0674784477496396e-06, "loss": 0.4265, "num_input_tokens_seen": 35479144, "step": 61500 }, { "epoch": 9.160708966338992, "grad_norm": 3.322801113128662, "learning_rate": 1.0656004830726153e-06, "loss": 0.4114, "num_input_tokens_seen": 35481864, "step": 61505 }, { "epoch": 9.161453678879953, "grad_norm": 2.785508632659912, "learning_rate": 1.0637241357849993e-06, "loss": 0.4029, "num_input_tokens_seen": 35484936, "step": 61510 }, { "epoch": 9.162198391420912, "grad_norm": 2.8014845848083496, "learning_rate": 1.061849406013593e-06, "loss": 0.5622, "num_input_tokens_seen": 35487880, "step": 61515 }, { "epoch": 9.16294310396187, "grad_norm": 1.490838885307312, "learning_rate": 1.059976293885076e-06, "loss": 0.4849, "num_input_tokens_seen": 35490824, "step": 61520 }, { "epoch": 9.16368781650283, "grad_norm": 2.603111982345581, "learning_rate": 1.0581047995260246e-06, "loss": 0.4317, "num_input_tokens_seen": 35493704, "step": 61525 }, { "epoch": 9.16443252904379, "grad_norm": 6.385879039764404, "learning_rate": 1.0562349230629154e-06, "loss": 0.7865, "num_input_tokens_seen": 35496712, "step": 61530 }, { "epoch": 9.165177241584749, "grad_norm": 3.005260944366455, "learning_rate": 1.0543666646221002e-06, "loss": 0.5503, "num_input_tokens_seen": 35499464, "step": 61535 }, { "epoch": 9.165921954125707, "grad_norm": 2.466142177581787, "learning_rate": 1.0525000243298278e-06, "loss": 0.442, "num_input_tokens_seen": 35502216, "step": 61540 }, { "epoch": 9.166666666666666, "grad_norm": 2.9772181510925293, "learning_rate": 1.050635002312239e-06, "loss": 0.4025, "num_input_tokens_seen": 35504936, "step": 61545 }, { "epoch": 9.167411379207627, "grad_norm": 8.913850784301758, "learning_rate": 1.0487715986953695e-06, "loss": 0.6444, "num_input_tokens_seen": 35507688, "step": 61550 }, { "epoch": 9.168156091748585, "grad_norm": 3.1773009300231934, "learning_rate": 1.0469098136051375e-06, "loss": 0.4941, "num_input_tokens_seen": 35510536, "step": 61555 }, { "epoch": 9.168900804289544, "grad_norm": 1.8842531442642212, "learning_rate": 1.045049647167351e-06, "loss": 0.5535, "num_input_tokens_seen": 35513352, "step": 61560 }, { "epoch": 9.169645516830503, "grad_norm": 3.007891893386841, "learning_rate": 1.0431910995077205e-06, "loss": 0.6527, "num_input_tokens_seen": 35516392, "step": 61565 }, { "epoch": 9.170390229371463, "grad_norm": 4.180973052978516, "learning_rate": 1.0413341707518287e-06, "loss": 0.6573, "num_input_tokens_seen": 35519240, "step": 61570 }, { "epoch": 9.171134941912422, "grad_norm": 2.535072088241577, "learning_rate": 1.039478861025167e-06, "loss": 0.3718, "num_input_tokens_seen": 35522184, "step": 61575 }, { "epoch": 9.171879654453381, "grad_norm": 4.0157647132873535, "learning_rate": 1.0376251704531049e-06, "loss": 0.5007, "num_input_tokens_seen": 35524968, "step": 61580 }, { "epoch": 9.17262436699434, "grad_norm": 2.4695422649383545, "learning_rate": 1.035773099160911e-06, "loss": 0.3004, "num_input_tokens_seen": 35528136, "step": 61585 }, { "epoch": 9.1733690795353, "grad_norm": 3.7635326385498047, "learning_rate": 1.033922647273744e-06, "loss": 0.539, "num_input_tokens_seen": 35530856, "step": 61590 }, { "epoch": 9.174113792076259, "grad_norm": 2.925509452819824, "learning_rate": 1.0320738149166397e-06, "loss": 0.6204, "num_input_tokens_seen": 35533640, "step": 61595 }, { "epoch": 9.174858504617218, "grad_norm": 1.6980204582214355, "learning_rate": 1.0302266022145457e-06, "loss": 0.7811, "num_input_tokens_seen": 35536232, "step": 61600 }, { "epoch": 9.175603217158177, "grad_norm": 3.7653534412384033, "learning_rate": 1.0283810092922812e-06, "loss": 0.7214, "num_input_tokens_seen": 35538696, "step": 61605 }, { "epoch": 9.176347929699135, "grad_norm": 1.7146457433700562, "learning_rate": 1.0265370362745663e-06, "loss": 0.4105, "num_input_tokens_seen": 35541608, "step": 61610 }, { "epoch": 9.177092642240096, "grad_norm": 3.0957255363464355, "learning_rate": 1.0246946832860093e-06, "loss": 0.6046, "num_input_tokens_seen": 35544840, "step": 61615 }, { "epoch": 9.177837354781055, "grad_norm": 5.182717323303223, "learning_rate": 1.0228539504511082e-06, "loss": 0.8139, "num_input_tokens_seen": 35547752, "step": 61620 }, { "epoch": 9.178582067322013, "grad_norm": 1.7691656351089478, "learning_rate": 1.0210148378942573e-06, "loss": 0.5477, "num_input_tokens_seen": 35550600, "step": 61625 }, { "epoch": 9.179326779862972, "grad_norm": 4.023663520812988, "learning_rate": 1.0191773457397274e-06, "loss": 0.3375, "num_input_tokens_seen": 35553800, "step": 61630 }, { "epoch": 9.180071492403933, "grad_norm": 6.92628288269043, "learning_rate": 1.0173414741116994e-06, "loss": 0.4479, "num_input_tokens_seen": 35556776, "step": 61635 }, { "epoch": 9.180816204944891, "grad_norm": 4.919027328491211, "learning_rate": 1.015507223134224e-06, "loss": 0.5401, "num_input_tokens_seen": 35559496, "step": 61640 }, { "epoch": 9.18156091748585, "grad_norm": 1.9501326084136963, "learning_rate": 1.0136745929312546e-06, "loss": 0.4916, "num_input_tokens_seen": 35562408, "step": 61645 }, { "epoch": 9.182305630026809, "grad_norm": 2.453660726547241, "learning_rate": 1.011843583626637e-06, "loss": 0.5346, "num_input_tokens_seen": 35565416, "step": 61650 }, { "epoch": 9.18305034256777, "grad_norm": 2.309267044067383, "learning_rate": 1.010014195344103e-06, "loss": 0.6585, "num_input_tokens_seen": 35568136, "step": 61655 }, { "epoch": 9.183795055108728, "grad_norm": 3.3226723670959473, "learning_rate": 1.0081864282072722e-06, "loss": 0.4611, "num_input_tokens_seen": 35571208, "step": 61660 }, { "epoch": 9.184539767649687, "grad_norm": 2.302034378051758, "learning_rate": 1.0063602823396578e-06, "loss": 0.6557, "num_input_tokens_seen": 35574152, "step": 61665 }, { "epoch": 9.185284480190646, "grad_norm": 1.6566990613937378, "learning_rate": 1.0045357578646664e-06, "loss": 0.5176, "num_input_tokens_seen": 35576808, "step": 61670 }, { "epoch": 9.186029192731606, "grad_norm": 3.369016408920288, "learning_rate": 1.0027128549055881e-06, "loss": 0.6185, "num_input_tokens_seen": 35580168, "step": 61675 }, { "epoch": 9.186773905272565, "grad_norm": 1.3763779401779175, "learning_rate": 1.0008915735856134e-06, "loss": 0.6613, "num_input_tokens_seen": 35583272, "step": 61680 }, { "epoch": 9.187518617813524, "grad_norm": 2.1785266399383545, "learning_rate": 9.990719140278077e-07, "loss": 0.4956, "num_input_tokens_seen": 35586120, "step": 61685 }, { "epoch": 9.188263330354483, "grad_norm": 4.569624423980713, "learning_rate": 9.972538763551448e-07, "loss": 0.5309, "num_input_tokens_seen": 35589160, "step": 61690 }, { "epoch": 9.189008042895443, "grad_norm": 3.5742931365966797, "learning_rate": 9.954374606904765e-07, "loss": 0.7577, "num_input_tokens_seen": 35591912, "step": 61695 }, { "epoch": 9.189752755436402, "grad_norm": 3.7374234199523926, "learning_rate": 9.936226671565491e-07, "loss": 0.616, "num_input_tokens_seen": 35594664, "step": 61700 }, { "epoch": 9.19049746797736, "grad_norm": 2.3508312702178955, "learning_rate": 9.91809495875995e-07, "loss": 0.459, "num_input_tokens_seen": 35597576, "step": 61705 }, { "epoch": 9.19124218051832, "grad_norm": 3.3776161670684814, "learning_rate": 9.899979469713494e-07, "loss": 0.6297, "num_input_tokens_seen": 35600776, "step": 61710 }, { "epoch": 9.19198689305928, "grad_norm": 2.0095810890197754, "learning_rate": 9.88188020565023e-07, "loss": 0.2641, "num_input_tokens_seen": 35603656, "step": 61715 }, { "epoch": 9.192731605600239, "grad_norm": 2.0604424476623535, "learning_rate": 9.863797167793286e-07, "loss": 0.3169, "num_input_tokens_seen": 35606280, "step": 61720 }, { "epoch": 9.193476318141197, "grad_norm": 3.452817678451538, "learning_rate": 9.84573035736455e-07, "loss": 0.5639, "num_input_tokens_seen": 35609064, "step": 61725 }, { "epoch": 9.194221030682156, "grad_norm": 3.2128756046295166, "learning_rate": 9.827679775585019e-07, "loss": 0.5956, "num_input_tokens_seen": 35612168, "step": 61730 }, { "epoch": 9.194965743223117, "grad_norm": 4.5015997886657715, "learning_rate": 9.80964542367438e-07, "loss": 0.4776, "num_input_tokens_seen": 35614984, "step": 61735 }, { "epoch": 9.195710455764075, "grad_norm": 4.96798849105835, "learning_rate": 9.79162730285138e-07, "loss": 0.5065, "num_input_tokens_seen": 35617640, "step": 61740 }, { "epoch": 9.196455168305034, "grad_norm": 3.3107283115386963, "learning_rate": 9.773625414333576e-07, "loss": 0.5824, "num_input_tokens_seen": 35620616, "step": 61745 }, { "epoch": 9.197199880845993, "grad_norm": 2.149731159210205, "learning_rate": 9.755639759337466e-07, "loss": 0.5419, "num_input_tokens_seen": 35623336, "step": 61750 }, { "epoch": 9.197944593386953, "grad_norm": 3.697399139404297, "learning_rate": 9.737670339078491e-07, "loss": 0.4193, "num_input_tokens_seen": 35626248, "step": 61755 }, { "epoch": 9.198689305927912, "grad_norm": 3.1509242057800293, "learning_rate": 9.719717154770908e-07, "loss": 0.5156, "num_input_tokens_seen": 35629032, "step": 61760 }, { "epoch": 9.199434018468871, "grad_norm": 3.3810977935791016, "learning_rate": 9.701780207627963e-07, "loss": 0.6978, "num_input_tokens_seen": 35632104, "step": 61765 }, { "epoch": 9.20017873100983, "grad_norm": 4.216475486755371, "learning_rate": 9.683859498861691e-07, "loss": 0.5622, "num_input_tokens_seen": 35635080, "step": 61770 }, { "epoch": 9.200923443550789, "grad_norm": 5.677504539489746, "learning_rate": 9.665955029683122e-07, "loss": 0.4637, "num_input_tokens_seen": 35637960, "step": 61775 }, { "epoch": 9.201668156091749, "grad_norm": 4.662143230438232, "learning_rate": 9.648066801302202e-07, "loss": 0.7576, "num_input_tokens_seen": 35641096, "step": 61780 }, { "epoch": 9.202412868632708, "grad_norm": 3.302406072616577, "learning_rate": 9.630194814927718e-07, "loss": 0.609, "num_input_tokens_seen": 35644136, "step": 61785 }, { "epoch": 9.203157581173667, "grad_norm": 3.4735143184661865, "learning_rate": 9.612339071767451e-07, "loss": 0.5237, "num_input_tokens_seen": 35647176, "step": 61790 }, { "epoch": 9.203902293714625, "grad_norm": 5.4166975021362305, "learning_rate": 9.59449957302791e-07, "loss": 0.423, "num_input_tokens_seen": 35649928, "step": 61795 }, { "epoch": 9.204647006255586, "grad_norm": 5.256289482116699, "learning_rate": 9.576676319914713e-07, "loss": 0.6227, "num_input_tokens_seen": 35652936, "step": 61800 }, { "epoch": 9.205391718796545, "grad_norm": 5.282564640045166, "learning_rate": 9.558869313632202e-07, "loss": 0.3585, "num_input_tokens_seen": 35655912, "step": 61805 }, { "epoch": 9.206136431337503, "grad_norm": 2.0358874797821045, "learning_rate": 9.541078555383747e-07, "loss": 0.4314, "num_input_tokens_seen": 35658632, "step": 61810 }, { "epoch": 9.206881143878462, "grad_norm": 2.7298507690429688, "learning_rate": 9.523304046371556e-07, "loss": 0.6061, "num_input_tokens_seen": 35661384, "step": 61815 }, { "epoch": 9.207625856419423, "grad_norm": 5.1183881759643555, "learning_rate": 9.505545787796777e-07, "loss": 0.4805, "num_input_tokens_seen": 35664200, "step": 61820 }, { "epoch": 9.208370568960381, "grad_norm": 1.617843747138977, "learning_rate": 9.48780378085945e-07, "loss": 0.5145, "num_input_tokens_seen": 35667048, "step": 61825 }, { "epoch": 9.20911528150134, "grad_norm": 3.012521743774414, "learning_rate": 9.470078026758477e-07, "loss": 0.5813, "num_input_tokens_seen": 35670088, "step": 61830 }, { "epoch": 9.209859994042299, "grad_norm": 10.282144546508789, "learning_rate": 9.452368526691735e-07, "loss": 0.4368, "num_input_tokens_seen": 35672744, "step": 61835 }, { "epoch": 9.21060470658326, "grad_norm": 4.698987007141113, "learning_rate": 9.434675281855932e-07, "loss": 0.4926, "num_input_tokens_seen": 35675592, "step": 61840 }, { "epoch": 9.211349419124218, "grad_norm": 4.043344497680664, "learning_rate": 9.416998293446666e-07, "loss": 0.3611, "num_input_tokens_seen": 35678088, "step": 61845 }, { "epoch": 9.212094131665177, "grad_norm": 3.2276813983917236, "learning_rate": 9.399337562658539e-07, "loss": 0.4071, "num_input_tokens_seen": 35680840, "step": 61850 }, { "epoch": 9.212838844206136, "grad_norm": 3.897705078125, "learning_rate": 9.381693090684957e-07, "loss": 0.5636, "num_input_tokens_seen": 35683752, "step": 61855 }, { "epoch": 9.213583556747096, "grad_norm": 2.957501173019409, "learning_rate": 9.364064878718298e-07, "loss": 0.4944, "num_input_tokens_seen": 35686568, "step": 61860 }, { "epoch": 9.214328269288055, "grad_norm": 5.245431423187256, "learning_rate": 9.346452927949778e-07, "loss": 0.7263, "num_input_tokens_seen": 35689320, "step": 61865 }, { "epoch": 9.215072981829014, "grad_norm": 6.368368148803711, "learning_rate": 9.328857239569527e-07, "loss": 0.6056, "num_input_tokens_seen": 35692200, "step": 61870 }, { "epoch": 9.215817694369973, "grad_norm": 3.146014451980591, "learning_rate": 9.311277814766595e-07, "loss": 0.4737, "num_input_tokens_seen": 35694888, "step": 61875 }, { "epoch": 9.216562406910933, "grad_norm": 2.7691800594329834, "learning_rate": 9.293714654728974e-07, "loss": 0.5923, "num_input_tokens_seen": 35698024, "step": 61880 }, { "epoch": 9.217307119451892, "grad_norm": 2.0713937282562256, "learning_rate": 9.276167760643439e-07, "loss": 0.549, "num_input_tokens_seen": 35700744, "step": 61885 }, { "epoch": 9.21805183199285, "grad_norm": 2.0654022693634033, "learning_rate": 9.258637133695791e-07, "loss": 0.5796, "num_input_tokens_seen": 35703752, "step": 61890 }, { "epoch": 9.21879654453381, "grad_norm": 3.4044766426086426, "learning_rate": 9.241122775070693e-07, "loss": 0.5582, "num_input_tokens_seen": 35706440, "step": 61895 }, { "epoch": 9.21954125707477, "grad_norm": 4.097755432128906, "learning_rate": 9.223624685951615e-07, "loss": 0.6201, "num_input_tokens_seen": 35709352, "step": 61900 }, { "epoch": 9.220285969615729, "grad_norm": 3.7202041149139404, "learning_rate": 9.206142867521084e-07, "loss": 0.6018, "num_input_tokens_seen": 35712456, "step": 61905 }, { "epoch": 9.221030682156687, "grad_norm": 2.176868438720703, "learning_rate": 9.188677320960404e-07, "loss": 0.5022, "num_input_tokens_seen": 35715336, "step": 61910 }, { "epoch": 9.221775394697646, "grad_norm": 2.7185251712799072, "learning_rate": 9.171228047449825e-07, "loss": 0.5993, "num_input_tokens_seen": 35718152, "step": 61915 }, { "epoch": 9.222520107238607, "grad_norm": 19.428401947021484, "learning_rate": 9.153795048168573e-07, "loss": 0.7143, "num_input_tokens_seen": 35720808, "step": 61920 }, { "epoch": 9.223264819779565, "grad_norm": 4.159544467926025, "learning_rate": 9.136378324294592e-07, "loss": 0.6665, "num_input_tokens_seen": 35723592, "step": 61925 }, { "epoch": 9.224009532320524, "grad_norm": 2.9042038917541504, "learning_rate": 9.118977877004942e-07, "loss": 0.5746, "num_input_tokens_seen": 35726472, "step": 61930 }, { "epoch": 9.224754244861483, "grad_norm": 2.4560892581939697, "learning_rate": 9.101593707475376e-07, "loss": 0.4253, "num_input_tokens_seen": 35729192, "step": 61935 }, { "epoch": 9.225498957402444, "grad_norm": 5.034826755523682, "learning_rate": 9.084225816880677e-07, "loss": 0.5418, "num_input_tokens_seen": 35732264, "step": 61940 }, { "epoch": 9.226243669943402, "grad_norm": 6.37874698638916, "learning_rate": 9.066874206394488e-07, "loss": 0.5934, "num_input_tokens_seen": 35735112, "step": 61945 }, { "epoch": 9.226988382484361, "grad_norm": 6.830676555633545, "learning_rate": 9.049538877189401e-07, "loss": 0.5442, "num_input_tokens_seen": 35738024, "step": 61950 }, { "epoch": 9.22773309502532, "grad_norm": 2.5516469478607178, "learning_rate": 9.032219830436867e-07, "loss": 0.6954, "num_input_tokens_seen": 35740840, "step": 61955 }, { "epoch": 9.228477807566279, "grad_norm": 3.2162375450134277, "learning_rate": 9.014917067307227e-07, "loss": 0.6035, "num_input_tokens_seen": 35743752, "step": 61960 }, { "epoch": 9.229222520107239, "grad_norm": 3.379314422607422, "learning_rate": 8.997630588969686e-07, "loss": 0.4694, "num_input_tokens_seen": 35746856, "step": 61965 }, { "epoch": 9.229967232648198, "grad_norm": 1.3383110761642456, "learning_rate": 8.980360396592419e-07, "loss": 0.3353, "num_input_tokens_seen": 35750184, "step": 61970 }, { "epoch": 9.230711945189157, "grad_norm": 2.925466299057007, "learning_rate": 8.963106491342466e-07, "loss": 0.5308, "num_input_tokens_seen": 35752936, "step": 61975 }, { "epoch": 9.231456657730115, "grad_norm": 2.212402105331421, "learning_rate": 8.94586887438581e-07, "loss": 0.6677, "num_input_tokens_seen": 35755720, "step": 61980 }, { "epoch": 9.232201370271076, "grad_norm": 4.76905632019043, "learning_rate": 8.928647546887269e-07, "loss": 0.5916, "num_input_tokens_seen": 35758632, "step": 61985 }, { "epoch": 9.232946082812035, "grad_norm": 3.820788860321045, "learning_rate": 8.911442510010637e-07, "loss": 0.6052, "num_input_tokens_seen": 35761288, "step": 61990 }, { "epoch": 9.233690795352993, "grad_norm": 3.343254804611206, "learning_rate": 8.894253764918509e-07, "loss": 0.6485, "num_input_tokens_seen": 35764040, "step": 61995 }, { "epoch": 9.234435507893952, "grad_norm": 4.439087390899658, "learning_rate": 8.877081312772456e-07, "loss": 0.5716, "num_input_tokens_seen": 35766920, "step": 62000 }, { "epoch": 9.235180220434913, "grad_norm": 2.089029312133789, "learning_rate": 8.859925154732885e-07, "loss": 0.613, "num_input_tokens_seen": 35770024, "step": 62005 }, { "epoch": 9.235924932975871, "grad_norm": 2.6664493083953857, "learning_rate": 8.842785291959199e-07, "loss": 0.6907, "num_input_tokens_seen": 35772776, "step": 62010 }, { "epoch": 9.23666964551683, "grad_norm": 5.673159599304199, "learning_rate": 8.825661725609585e-07, "loss": 0.717, "num_input_tokens_seen": 35775528, "step": 62015 }, { "epoch": 9.237414358057789, "grad_norm": 4.858266830444336, "learning_rate": 8.808554456841201e-07, "loss": 0.6404, "num_input_tokens_seen": 35778632, "step": 62020 }, { "epoch": 9.23815907059875, "grad_norm": 3.6476240158081055, "learning_rate": 8.79146348681012e-07, "loss": 0.5922, "num_input_tokens_seen": 35781416, "step": 62025 }, { "epoch": 9.238903783139708, "grad_norm": 3.449155807495117, "learning_rate": 8.774388816671253e-07, "loss": 0.6023, "num_input_tokens_seen": 35784136, "step": 62030 }, { "epoch": 9.239648495680667, "grad_norm": 2.6076977252960205, "learning_rate": 8.757330447578399e-07, "loss": 0.692, "num_input_tokens_seen": 35787112, "step": 62035 }, { "epoch": 9.240393208221626, "grad_norm": 3.9307525157928467, "learning_rate": 8.740288380684386e-07, "loss": 0.5854, "num_input_tokens_seen": 35790312, "step": 62040 }, { "epoch": 9.241137920762586, "grad_norm": 2.9628348350524902, "learning_rate": 8.723262617140765e-07, "loss": 0.7117, "num_input_tokens_seen": 35793032, "step": 62045 }, { "epoch": 9.241882633303545, "grad_norm": 4.77518892288208, "learning_rate": 8.706253158098088e-07, "loss": 0.6413, "num_input_tokens_seen": 35795688, "step": 62050 }, { "epoch": 9.242627345844504, "grad_norm": 2.4561588764190674, "learning_rate": 8.689260004705823e-07, "loss": 0.4765, "num_input_tokens_seen": 35798888, "step": 62055 }, { "epoch": 9.243372058385463, "grad_norm": 3.4724764823913574, "learning_rate": 8.672283158112249e-07, "loss": 0.4968, "num_input_tokens_seen": 35801544, "step": 62060 }, { "epoch": 9.244116770926423, "grad_norm": 6.566619873046875, "learning_rate": 8.655322619464612e-07, "loss": 0.7369, "num_input_tokens_seen": 35804424, "step": 62065 }, { "epoch": 9.244861483467382, "grad_norm": 3.09273099899292, "learning_rate": 8.638378389909052e-07, "loss": 0.5115, "num_input_tokens_seen": 35807752, "step": 62070 }, { "epoch": 9.24560619600834, "grad_norm": 2.776853322982788, "learning_rate": 8.621450470590542e-07, "loss": 0.4687, "num_input_tokens_seen": 35810632, "step": 62075 }, { "epoch": 9.2463509085493, "grad_norm": 3.0490994453430176, "learning_rate": 8.604538862653084e-07, "loss": 0.4537, "num_input_tokens_seen": 35813384, "step": 62080 }, { "epoch": 9.24709562109026, "grad_norm": 3.2047133445739746, "learning_rate": 8.5876435672394e-07, "loss": 0.5282, "num_input_tokens_seen": 35816168, "step": 62085 }, { "epoch": 9.247840333631219, "grad_norm": 2.852177381515503, "learning_rate": 8.570764585491275e-07, "loss": 0.4257, "num_input_tokens_seen": 35819112, "step": 62090 }, { "epoch": 9.248585046172177, "grad_norm": 1.953190565109253, "learning_rate": 8.553901918549323e-07, "loss": 0.4739, "num_input_tokens_seen": 35821992, "step": 62095 }, { "epoch": 9.249329758713136, "grad_norm": 2.5991439819335938, "learning_rate": 8.537055567552993e-07, "loss": 0.5143, "num_input_tokens_seen": 35824904, "step": 62100 }, { "epoch": 9.250074471254097, "grad_norm": 3.1472928524017334, "learning_rate": 8.520225533640735e-07, "loss": 0.5846, "num_input_tokens_seen": 35827784, "step": 62105 }, { "epoch": 9.250819183795056, "grad_norm": 5.895363807678223, "learning_rate": 8.503411817949863e-07, "loss": 0.4419, "num_input_tokens_seen": 35830664, "step": 62110 }, { "epoch": 9.251563896336014, "grad_norm": 1.8463342189788818, "learning_rate": 8.486614421616551e-07, "loss": 0.8084, "num_input_tokens_seen": 35833544, "step": 62115 }, { "epoch": 9.252308608876973, "grad_norm": 3.3883512020111084, "learning_rate": 8.469833345775946e-07, "loss": 0.7066, "num_input_tokens_seen": 35836264, "step": 62120 }, { "epoch": 9.253053321417934, "grad_norm": 4.3104352951049805, "learning_rate": 8.453068591562003e-07, "loss": 0.5957, "num_input_tokens_seen": 35839080, "step": 62125 }, { "epoch": 9.253798033958892, "grad_norm": 3.6356189250946045, "learning_rate": 8.436320160107619e-07, "loss": 0.7597, "num_input_tokens_seen": 35841832, "step": 62130 }, { "epoch": 9.254542746499851, "grad_norm": 2.1970326900482178, "learning_rate": 8.419588052544586e-07, "loss": 0.6112, "num_input_tokens_seen": 35844520, "step": 62135 }, { "epoch": 9.25528745904081, "grad_norm": 5.845728397369385, "learning_rate": 8.402872270003582e-07, "loss": 0.6309, "num_input_tokens_seen": 35847400, "step": 62140 }, { "epoch": 9.256032171581769, "grad_norm": 4.2609148025512695, "learning_rate": 8.386172813614229e-07, "loss": 0.607, "num_input_tokens_seen": 35850184, "step": 62145 }, { "epoch": 9.25677688412273, "grad_norm": 2.7414486408233643, "learning_rate": 8.369489684504961e-07, "loss": 0.6525, "num_input_tokens_seen": 35853256, "step": 62150 }, { "epoch": 9.257521596663688, "grad_norm": 3.8918206691741943, "learning_rate": 8.352822883803235e-07, "loss": 0.6186, "num_input_tokens_seen": 35856104, "step": 62155 }, { "epoch": 9.258266309204647, "grad_norm": 1.6480118036270142, "learning_rate": 8.336172412635263e-07, "loss": 0.4633, "num_input_tokens_seen": 35858664, "step": 62160 }, { "epoch": 9.259011021745605, "grad_norm": 3.1220598220825195, "learning_rate": 8.319538272126198e-07, "loss": 0.7699, "num_input_tokens_seen": 35861512, "step": 62165 }, { "epoch": 9.259755734286566, "grad_norm": 2.858710765838623, "learning_rate": 8.302920463400143e-07, "loss": 0.5172, "num_input_tokens_seen": 35864328, "step": 62170 }, { "epoch": 9.260500446827525, "grad_norm": 2.3881194591522217, "learning_rate": 8.286318987580061e-07, "loss": 0.5373, "num_input_tokens_seen": 35867144, "step": 62175 }, { "epoch": 9.261245159368483, "grad_norm": 6.532678127288818, "learning_rate": 8.269733845787775e-07, "loss": 0.5115, "num_input_tokens_seen": 35870056, "step": 62180 }, { "epoch": 9.261989871909442, "grad_norm": 4.627455711364746, "learning_rate": 8.253165039144111e-07, "loss": 0.6282, "num_input_tokens_seen": 35873000, "step": 62185 }, { "epoch": 9.262734584450403, "grad_norm": 6.524053573608398, "learning_rate": 8.236612568768676e-07, "loss": 0.6288, "num_input_tokens_seen": 35875976, "step": 62190 }, { "epoch": 9.263479296991362, "grad_norm": 3.8302464485168457, "learning_rate": 8.220076435780016e-07, "loss": 0.6318, "num_input_tokens_seen": 35878696, "step": 62195 }, { "epoch": 9.26422400953232, "grad_norm": 2.8830676078796387, "learning_rate": 8.203556641295601e-07, "loss": 0.6212, "num_input_tokens_seen": 35881800, "step": 62200 }, { "epoch": 9.264968722073279, "grad_norm": 1.9525375366210938, "learning_rate": 8.187053186431731e-07, "loss": 0.6018, "num_input_tokens_seen": 35885160, "step": 62205 }, { "epoch": 9.26571343461424, "grad_norm": 3.4571943283081055, "learning_rate": 8.170566072303681e-07, "loss": 0.5694, "num_input_tokens_seen": 35888008, "step": 62210 }, { "epoch": 9.266458147155198, "grad_norm": 3.3655121326446533, "learning_rate": 8.15409530002556e-07, "loss": 0.4628, "num_input_tokens_seen": 35891112, "step": 62215 }, { "epoch": 9.267202859696157, "grad_norm": 2.120034694671631, "learning_rate": 8.137640870710395e-07, "loss": 0.6374, "num_input_tokens_seen": 35893768, "step": 62220 }, { "epoch": 9.267947572237116, "grad_norm": 2.988469362258911, "learning_rate": 8.121202785470156e-07, "loss": 0.6369, "num_input_tokens_seen": 35896616, "step": 62225 }, { "epoch": 9.268692284778076, "grad_norm": 2.8991873264312744, "learning_rate": 8.104781045415594e-07, "loss": 0.4074, "num_input_tokens_seen": 35899400, "step": 62230 }, { "epoch": 9.269436997319035, "grad_norm": 2.1750621795654297, "learning_rate": 8.08837565165646e-07, "loss": 0.4642, "num_input_tokens_seen": 35902312, "step": 62235 }, { "epoch": 9.270181709859994, "grad_norm": 2.805100202560425, "learning_rate": 8.071986605301396e-07, "loss": 0.6055, "num_input_tokens_seen": 35905448, "step": 62240 }, { "epoch": 9.270926422400953, "grad_norm": 2.7206530570983887, "learning_rate": 8.055613907457821e-07, "loss": 0.4472, "num_input_tokens_seen": 35908680, "step": 62245 }, { "epoch": 9.271671134941913, "grad_norm": 3.1469249725341797, "learning_rate": 8.039257559232182e-07, "loss": 0.5838, "num_input_tokens_seen": 35911688, "step": 62250 }, { "epoch": 9.272415847482872, "grad_norm": 6.364048957824707, "learning_rate": 8.022917561729793e-07, "loss": 0.612, "num_input_tokens_seen": 35914536, "step": 62255 }, { "epoch": 9.27316056002383, "grad_norm": 2.12733793258667, "learning_rate": 8.00659391605485e-07, "loss": 0.4488, "num_input_tokens_seen": 35917576, "step": 62260 }, { "epoch": 9.27390527256479, "grad_norm": 2.449524164199829, "learning_rate": 7.990286623310389e-07, "loss": 0.5044, "num_input_tokens_seen": 35920296, "step": 62265 }, { "epoch": 9.27464998510575, "grad_norm": 3.576841115951538, "learning_rate": 7.973995684598418e-07, "loss": 0.6314, "num_input_tokens_seen": 35923112, "step": 62270 }, { "epoch": 9.275394697646709, "grad_norm": 2.607675075531006, "learning_rate": 7.957721101019805e-07, "loss": 0.5372, "num_input_tokens_seen": 35926248, "step": 62275 }, { "epoch": 9.276139410187668, "grad_norm": 3.934488296508789, "learning_rate": 7.941462873674338e-07, "loss": 0.599, "num_input_tokens_seen": 35929256, "step": 62280 }, { "epoch": 9.276884122728626, "grad_norm": 3.398404836654663, "learning_rate": 7.925221003660694e-07, "loss": 0.4746, "num_input_tokens_seen": 35931784, "step": 62285 }, { "epoch": 9.277628835269585, "grad_norm": 2.668015956878662, "learning_rate": 7.90899549207641e-07, "loss": 0.5058, "num_input_tokens_seen": 35934696, "step": 62290 }, { "epoch": 9.278373547810546, "grad_norm": 4.414630889892578, "learning_rate": 7.892786340017916e-07, "loss": 0.4586, "num_input_tokens_seen": 35937512, "step": 62295 }, { "epoch": 9.279118260351504, "grad_norm": 3.5317020416259766, "learning_rate": 7.876593548580585e-07, "loss": 0.7586, "num_input_tokens_seen": 35940392, "step": 62300 }, { "epoch": 9.279862972892463, "grad_norm": 13.939364433288574, "learning_rate": 7.860417118858654e-07, "loss": 0.5596, "num_input_tokens_seen": 35943240, "step": 62305 }, { "epoch": 9.280607685433422, "grad_norm": 3.5696237087249756, "learning_rate": 7.844257051945275e-07, "loss": 0.5183, "num_input_tokens_seen": 35946280, "step": 62310 }, { "epoch": 9.281352397974382, "grad_norm": 5.155908107757568, "learning_rate": 7.828113348932464e-07, "loss": 0.5901, "num_input_tokens_seen": 35948936, "step": 62315 }, { "epoch": 9.282097110515341, "grad_norm": 2.178048610687256, "learning_rate": 7.811986010911182e-07, "loss": 0.5103, "num_input_tokens_seen": 35952392, "step": 62320 }, { "epoch": 9.2828418230563, "grad_norm": 1.9889764785766602, "learning_rate": 7.795875038971223e-07, "loss": 0.4427, "num_input_tokens_seen": 35955112, "step": 62325 }, { "epoch": 9.283586535597259, "grad_norm": 1.8582442998886108, "learning_rate": 7.779780434201273e-07, "loss": 0.5382, "num_input_tokens_seen": 35957928, "step": 62330 }, { "epoch": 9.28433124813822, "grad_norm": 2.5014138221740723, "learning_rate": 7.76370219768896e-07, "loss": 0.5175, "num_input_tokens_seen": 35961032, "step": 62335 }, { "epoch": 9.285075960679178, "grad_norm": 7.819097995758057, "learning_rate": 7.747640330520805e-07, "loss": 0.4535, "num_input_tokens_seen": 35964040, "step": 62340 }, { "epoch": 9.285820673220137, "grad_norm": 1.7676960229873657, "learning_rate": 7.731594833782191e-07, "loss": 0.429, "num_input_tokens_seen": 35966696, "step": 62345 }, { "epoch": 9.286565385761095, "grad_norm": 3.156782865524292, "learning_rate": 7.715565708557387e-07, "loss": 0.3777, "num_input_tokens_seen": 35969480, "step": 62350 }, { "epoch": 9.287310098302056, "grad_norm": 3.1866469383239746, "learning_rate": 7.69955295592964e-07, "loss": 0.7001, "num_input_tokens_seen": 35972296, "step": 62355 }, { "epoch": 9.288054810843015, "grad_norm": 3.5392298698425293, "learning_rate": 7.683556576980944e-07, "loss": 0.4779, "num_input_tokens_seen": 35975144, "step": 62360 }, { "epoch": 9.288799523383974, "grad_norm": 3.8933403491973877, "learning_rate": 7.667576572792323e-07, "loss": 0.6264, "num_input_tokens_seen": 35977896, "step": 62365 }, { "epoch": 9.289544235924932, "grad_norm": 8.111139297485352, "learning_rate": 7.651612944443609e-07, "loss": 0.4331, "num_input_tokens_seen": 35980712, "step": 62370 }, { "epoch": 9.290288948465893, "grad_norm": 2.395061492919922, "learning_rate": 7.635665693013577e-07, "loss": 0.666, "num_input_tokens_seen": 35983496, "step": 62375 }, { "epoch": 9.291033661006852, "grad_norm": 2.245089054107666, "learning_rate": 7.619734819579893e-07, "loss": 0.5153, "num_input_tokens_seen": 35986024, "step": 62380 }, { "epoch": 9.29177837354781, "grad_norm": 3.673778772354126, "learning_rate": 7.603820325219058e-07, "loss": 0.7145, "num_input_tokens_seen": 35988808, "step": 62385 }, { "epoch": 9.292523086088769, "grad_norm": 2.197300434112549, "learning_rate": 7.58792221100657e-07, "loss": 0.4448, "num_input_tokens_seen": 35991720, "step": 62390 }, { "epoch": 9.29326779862973, "grad_norm": 2.4957337379455566, "learning_rate": 7.572040478016712e-07, "loss": 0.6077, "num_input_tokens_seen": 35994440, "step": 62395 }, { "epoch": 9.294012511170688, "grad_norm": 2.7624740600585938, "learning_rate": 7.556175127322707e-07, "loss": 0.5792, "num_input_tokens_seen": 35997096, "step": 62400 }, { "epoch": 9.294757223711647, "grad_norm": 1.5125340223312378, "learning_rate": 7.540326159996697e-07, "loss": 0.5178, "num_input_tokens_seen": 36000168, "step": 62405 }, { "epoch": 9.295501936252606, "grad_norm": 2.3147194385528564, "learning_rate": 7.524493577109659e-07, "loss": 0.5274, "num_input_tokens_seen": 36003080, "step": 62410 }, { "epoch": 9.296246648793566, "grad_norm": 4.871889114379883, "learning_rate": 7.508677379731515e-07, "loss": 0.4357, "num_input_tokens_seen": 36005896, "step": 62415 }, { "epoch": 9.296991361334525, "grad_norm": 1.7710046768188477, "learning_rate": 7.49287756893105e-07, "loss": 0.6418, "num_input_tokens_seen": 36008744, "step": 62420 }, { "epoch": 9.297736073875484, "grad_norm": 1.9464941024780273, "learning_rate": 7.477094145775993e-07, "loss": 0.5708, "num_input_tokens_seen": 36011912, "step": 62425 }, { "epoch": 9.298480786416443, "grad_norm": 5.381521701812744, "learning_rate": 7.46132711133285e-07, "loss": 0.6933, "num_input_tokens_seen": 36014920, "step": 62430 }, { "epoch": 9.299225498957403, "grad_norm": 2.908665895462036, "learning_rate": 7.445576466667131e-07, "loss": 0.6135, "num_input_tokens_seen": 36017992, "step": 62435 }, { "epoch": 9.299970211498362, "grad_norm": 3.218547821044922, "learning_rate": 7.429842212843208e-07, "loss": 0.5608, "num_input_tokens_seen": 36020744, "step": 62440 }, { "epoch": 9.30071492403932, "grad_norm": 5.263422012329102, "learning_rate": 7.41412435092434e-07, "loss": 0.4737, "num_input_tokens_seen": 36023304, "step": 62445 }, { "epoch": 9.30145963658028, "grad_norm": 5.276802062988281, "learning_rate": 7.39842288197265e-07, "loss": 0.5704, "num_input_tokens_seen": 36026056, "step": 62450 }, { "epoch": 9.30220434912124, "grad_norm": 2.838277816772461, "learning_rate": 7.382737807049233e-07, "loss": 0.5323, "num_input_tokens_seen": 36028680, "step": 62455 }, { "epoch": 9.302949061662199, "grad_norm": 4.105550765991211, "learning_rate": 7.367069127213938e-07, "loss": 0.7652, "num_input_tokens_seen": 36031560, "step": 62460 }, { "epoch": 9.303693774203158, "grad_norm": 3.1357462406158447, "learning_rate": 7.351416843525638e-07, "loss": 0.7079, "num_input_tokens_seen": 36034440, "step": 62465 }, { "epoch": 9.304438486744116, "grad_norm": 2.463857889175415, "learning_rate": 7.335780957042071e-07, "loss": 0.6945, "num_input_tokens_seen": 36037288, "step": 62470 }, { "epoch": 9.305183199285075, "grad_norm": 6.580064296722412, "learning_rate": 7.320161468819808e-07, "loss": 0.6867, "num_input_tokens_seen": 36040264, "step": 62475 }, { "epoch": 9.305927911826036, "grad_norm": 3.4624431133270264, "learning_rate": 7.304558379914395e-07, "loss": 0.7724, "num_input_tokens_seen": 36043016, "step": 62480 }, { "epoch": 9.306672624366994, "grad_norm": 2.6056058406829834, "learning_rate": 7.288971691380209e-07, "loss": 0.4535, "num_input_tokens_seen": 36045800, "step": 62485 }, { "epoch": 9.307417336907953, "grad_norm": 1.4341374635696411, "learning_rate": 7.273401404270519e-07, "loss": 0.5408, "num_input_tokens_seen": 36048776, "step": 62490 }, { "epoch": 9.308162049448912, "grad_norm": 6.752202987670898, "learning_rate": 7.257847519637484e-07, "loss": 0.4508, "num_input_tokens_seen": 36051816, "step": 62495 }, { "epoch": 9.308906761989872, "grad_norm": 2.077510356903076, "learning_rate": 7.24231003853218e-07, "loss": 0.5765, "num_input_tokens_seen": 36054472, "step": 62500 }, { "epoch": 9.309651474530831, "grad_norm": 3.9726314544677734, "learning_rate": 7.2267889620046e-07, "loss": 0.6653, "num_input_tokens_seen": 36057256, "step": 62505 }, { "epoch": 9.31039618707179, "grad_norm": 2.2914600372314453, "learning_rate": 7.21128429110357e-07, "loss": 0.4502, "num_input_tokens_seen": 36060264, "step": 62510 }, { "epoch": 9.311140899612749, "grad_norm": 2.9403460025787354, "learning_rate": 7.195796026876866e-07, "loss": 0.4056, "num_input_tokens_seen": 36063016, "step": 62515 }, { "epoch": 9.31188561215371, "grad_norm": 1.6899430751800537, "learning_rate": 7.180324170371095e-07, "loss": 0.5183, "num_input_tokens_seen": 36065832, "step": 62520 }, { "epoch": 9.312630324694668, "grad_norm": 2.4454712867736816, "learning_rate": 7.164868722631807e-07, "loss": 0.4562, "num_input_tokens_seen": 36068744, "step": 62525 }, { "epoch": 9.313375037235627, "grad_norm": 4.019080638885498, "learning_rate": 7.149429684703335e-07, "loss": 0.5801, "num_input_tokens_seen": 36071656, "step": 62530 }, { "epoch": 9.314119749776586, "grad_norm": 2.737750291824341, "learning_rate": 7.134007057629066e-07, "loss": 0.6306, "num_input_tokens_seen": 36074440, "step": 62535 }, { "epoch": 9.314864462317546, "grad_norm": 2.1404998302459717, "learning_rate": 7.118600842451195e-07, "loss": 0.63, "num_input_tokens_seen": 36077672, "step": 62540 }, { "epoch": 9.315609174858505, "grad_norm": 2.422560453414917, "learning_rate": 7.103211040210778e-07, "loss": 0.7677, "num_input_tokens_seen": 36080360, "step": 62545 }, { "epoch": 9.316353887399464, "grad_norm": 1.2228018045425415, "learning_rate": 7.087837651947815e-07, "loss": 0.4269, "num_input_tokens_seen": 36083336, "step": 62550 }, { "epoch": 9.317098599940422, "grad_norm": 4.9431257247924805, "learning_rate": 7.072480678701198e-07, "loss": 0.4686, "num_input_tokens_seen": 36086312, "step": 62555 }, { "epoch": 9.317843312481383, "grad_norm": 3.902355194091797, "learning_rate": 7.057140121508627e-07, "loss": 0.5503, "num_input_tokens_seen": 36089064, "step": 62560 }, { "epoch": 9.318588025022342, "grad_norm": 4.902606964111328, "learning_rate": 7.041815981406852e-07, "loss": 0.7989, "num_input_tokens_seen": 36091784, "step": 62565 }, { "epoch": 9.3193327375633, "grad_norm": 2.6678390502929688, "learning_rate": 7.026508259431297e-07, "loss": 0.6101, "num_input_tokens_seen": 36094536, "step": 62570 }, { "epoch": 9.32007745010426, "grad_norm": 2.2626636028289795, "learning_rate": 7.011216956616467e-07, "loss": 0.4412, "num_input_tokens_seen": 36097512, "step": 62575 }, { "epoch": 9.32082216264522, "grad_norm": 2.6063895225524902, "learning_rate": 6.995942073995676e-07, "loss": 0.574, "num_input_tokens_seen": 36100200, "step": 62580 }, { "epoch": 9.321566875186178, "grad_norm": 6.2310566902160645, "learning_rate": 6.980683612601152e-07, "loss": 0.6606, "num_input_tokens_seen": 36103016, "step": 62585 }, { "epoch": 9.322311587727137, "grad_norm": 3.3633573055267334, "learning_rate": 6.965441573463988e-07, "loss": 0.8758, "num_input_tokens_seen": 36105736, "step": 62590 }, { "epoch": 9.323056300268096, "grad_norm": 2.2259294986724854, "learning_rate": 6.950215957614164e-07, "loss": 0.5546, "num_input_tokens_seen": 36108552, "step": 62595 }, { "epoch": 9.323801012809056, "grad_norm": 2.8716132640838623, "learning_rate": 6.935006766080582e-07, "loss": 0.718, "num_input_tokens_seen": 36111432, "step": 62600 }, { "epoch": 9.324545725350015, "grad_norm": 1.5440738201141357, "learning_rate": 6.919813999891028e-07, "loss": 0.5733, "num_input_tokens_seen": 36114088, "step": 62605 }, { "epoch": 9.325290437890974, "grad_norm": 5.017688751220703, "learning_rate": 6.904637660072128e-07, "loss": 0.3682, "num_input_tokens_seen": 36117064, "step": 62610 }, { "epoch": 9.326035150431933, "grad_norm": 2.201225519180298, "learning_rate": 6.889477747649447e-07, "loss": 0.4568, "num_input_tokens_seen": 36120232, "step": 62615 }, { "epoch": 9.326779862972893, "grad_norm": 4.317011833190918, "learning_rate": 6.874334263647503e-07, "loss": 0.7356, "num_input_tokens_seen": 36122888, "step": 62620 }, { "epoch": 9.327524575513852, "grad_norm": 3.3193795680999756, "learning_rate": 6.859207209089502e-07, "loss": 0.3514, "num_input_tokens_seen": 36125864, "step": 62625 }, { "epoch": 9.32826928805481, "grad_norm": 3.286137819290161, "learning_rate": 6.844096584997767e-07, "loss": 0.7322, "num_input_tokens_seen": 36128584, "step": 62630 }, { "epoch": 9.32901400059577, "grad_norm": 3.3001468181610107, "learning_rate": 6.829002392393396e-07, "loss": 0.7375, "num_input_tokens_seen": 36131336, "step": 62635 }, { "epoch": 9.32975871313673, "grad_norm": 2.9316635131835938, "learning_rate": 6.813924632296353e-07, "loss": 0.4542, "num_input_tokens_seen": 36134216, "step": 62640 }, { "epoch": 9.330503425677689, "grad_norm": 3.6808884143829346, "learning_rate": 6.798863305725628e-07, "loss": 0.8253, "num_input_tokens_seen": 36136872, "step": 62645 }, { "epoch": 9.331248138218648, "grad_norm": 3.0921499729156494, "learning_rate": 6.783818413698878e-07, "loss": 0.7182, "num_input_tokens_seen": 36140264, "step": 62650 }, { "epoch": 9.331992850759606, "grad_norm": 1.3915109634399414, "learning_rate": 6.768789957232874e-07, "loss": 0.5033, "num_input_tokens_seen": 36143048, "step": 62655 }, { "epoch": 9.332737563300565, "grad_norm": 2.1832175254821777, "learning_rate": 6.753777937343109e-07, "loss": 0.5418, "num_input_tokens_seen": 36145768, "step": 62660 }, { "epoch": 9.333482275841526, "grad_norm": 2.599006175994873, "learning_rate": 6.738782355044049e-07, "loss": 0.7531, "num_input_tokens_seen": 36148712, "step": 62665 }, { "epoch": 9.334226988382484, "grad_norm": 2.5981123447418213, "learning_rate": 6.72380321134905e-07, "loss": 0.4752, "num_input_tokens_seen": 36151496, "step": 62670 }, { "epoch": 9.334971700923443, "grad_norm": 3.9681594371795654, "learning_rate": 6.708840507270359e-07, "loss": 0.5982, "num_input_tokens_seen": 36154280, "step": 62675 }, { "epoch": 9.335716413464402, "grad_norm": 2.387657642364502, "learning_rate": 6.693894243819082e-07, "loss": 0.6149, "num_input_tokens_seen": 36157256, "step": 62680 }, { "epoch": 9.336461126005362, "grad_norm": 3.459599733352661, "learning_rate": 6.678964422005218e-07, "loss": 0.545, "num_input_tokens_seen": 36160264, "step": 62685 }, { "epoch": 9.337205838546321, "grad_norm": 4.00982666015625, "learning_rate": 6.66405104283771e-07, "loss": 0.4481, "num_input_tokens_seen": 36163016, "step": 62690 }, { "epoch": 9.33795055108728, "grad_norm": 3.152729034423828, "learning_rate": 6.649154107324251e-07, "loss": 0.4476, "num_input_tokens_seen": 36165896, "step": 62695 }, { "epoch": 9.338695263628239, "grad_norm": 2.512712001800537, "learning_rate": 6.634273616471565e-07, "loss": 0.5208, "num_input_tokens_seen": 36168904, "step": 62700 }, { "epoch": 9.3394399761692, "grad_norm": 4.883627414703369, "learning_rate": 6.619409571285206e-07, "loss": 0.6315, "num_input_tokens_seen": 36171880, "step": 62705 }, { "epoch": 9.340184688710158, "grad_norm": 2.961291790008545, "learning_rate": 6.604561972769652e-07, "loss": 0.5294, "num_input_tokens_seen": 36174632, "step": 62710 }, { "epoch": 9.340929401251117, "grad_norm": 1.175276756286621, "learning_rate": 6.589730821928208e-07, "loss": 0.4264, "num_input_tokens_seen": 36177800, "step": 62715 }, { "epoch": 9.341674113792076, "grad_norm": 2.6027843952178955, "learning_rate": 6.574916119763158e-07, "loss": 0.6613, "num_input_tokens_seen": 36180936, "step": 62720 }, { "epoch": 9.342418826333036, "grad_norm": 1.7758598327636719, "learning_rate": 6.560117867275561e-07, "loss": 0.5176, "num_input_tokens_seen": 36183560, "step": 62725 }, { "epoch": 9.343163538873995, "grad_norm": 7.4741339683532715, "learning_rate": 6.545336065465451e-07, "loss": 0.4743, "num_input_tokens_seen": 36186440, "step": 62730 }, { "epoch": 9.343908251414954, "grad_norm": 1.7768731117248535, "learning_rate": 6.530570715331696e-07, "loss": 0.5497, "num_input_tokens_seen": 36189256, "step": 62735 }, { "epoch": 9.344652963955912, "grad_norm": 6.569961071014404, "learning_rate": 6.515821817872109e-07, "loss": 0.5926, "num_input_tokens_seen": 36192264, "step": 62740 }, { "epoch": 9.345397676496873, "grad_norm": 1.2823728322982788, "learning_rate": 6.501089374083336e-07, "loss": 0.5009, "num_input_tokens_seen": 36195048, "step": 62745 }, { "epoch": 9.346142389037832, "grad_norm": 3.7516531944274902, "learning_rate": 6.48637338496097e-07, "loss": 0.6998, "num_input_tokens_seen": 36197896, "step": 62750 }, { "epoch": 9.34688710157879, "grad_norm": 5.352017879486084, "learning_rate": 6.471673851499438e-07, "loss": 0.6275, "num_input_tokens_seen": 36200808, "step": 62755 }, { "epoch": 9.34763181411975, "grad_norm": 2.2316155433654785, "learning_rate": 6.456990774692057e-07, "loss": 0.4348, "num_input_tokens_seen": 36203880, "step": 62760 }, { "epoch": 9.34837652666071, "grad_norm": 3.399458646774292, "learning_rate": 6.442324155531088e-07, "loss": 0.4758, "num_input_tokens_seen": 36206888, "step": 62765 }, { "epoch": 9.349121239201668, "grad_norm": 3.837850332260132, "learning_rate": 6.4276739950076e-07, "loss": 0.7488, "num_input_tokens_seen": 36209800, "step": 62770 }, { "epoch": 9.349865951742627, "grad_norm": 5.069051265716553, "learning_rate": 6.413040294111605e-07, "loss": 0.5035, "num_input_tokens_seen": 36212648, "step": 62775 }, { "epoch": 9.350610664283586, "grad_norm": 2.877662420272827, "learning_rate": 6.398423053832009e-07, "loss": 0.4474, "num_input_tokens_seen": 36215528, "step": 62780 }, { "epoch": 9.351355376824547, "grad_norm": 2.673962354660034, "learning_rate": 6.383822275156576e-07, "loss": 0.4956, "num_input_tokens_seen": 36218120, "step": 62785 }, { "epoch": 9.352100089365505, "grad_norm": 3.8273894786834717, "learning_rate": 6.369237959071933e-07, "loss": 0.4581, "num_input_tokens_seen": 36220808, "step": 62790 }, { "epoch": 9.352844801906464, "grad_norm": 3.0980567932128906, "learning_rate": 6.354670106563681e-07, "loss": 0.5594, "num_input_tokens_seen": 36223944, "step": 62795 }, { "epoch": 9.353589514447423, "grad_norm": 1.5374733209609985, "learning_rate": 6.340118718616228e-07, "loss": 0.621, "num_input_tokens_seen": 36226696, "step": 62800 }, { "epoch": 9.354334226988382, "grad_norm": 2.4348623752593994, "learning_rate": 6.325583796212925e-07, "loss": 0.4663, "num_input_tokens_seen": 36229832, "step": 62805 }, { "epoch": 9.355078939529342, "grad_norm": 3.7536604404449463, "learning_rate": 6.311065340335931e-07, "loss": 0.6855, "num_input_tokens_seen": 36232776, "step": 62810 }, { "epoch": 9.3558236520703, "grad_norm": 4.514348030090332, "learning_rate": 6.296563351966378e-07, "loss": 0.7163, "num_input_tokens_seen": 36235592, "step": 62815 }, { "epoch": 9.35656836461126, "grad_norm": 3.4212725162506104, "learning_rate": 6.282077832084259e-07, "loss": 0.615, "num_input_tokens_seen": 36238280, "step": 62820 }, { "epoch": 9.357313077152218, "grad_norm": 3.4386653900146484, "learning_rate": 6.267608781668433e-07, "loss": 0.6424, "num_input_tokens_seen": 36241064, "step": 62825 }, { "epoch": 9.358057789693179, "grad_norm": 3.178694486618042, "learning_rate": 6.253156201696669e-07, "loss": 0.8155, "num_input_tokens_seen": 36243880, "step": 62830 }, { "epoch": 9.358802502234138, "grad_norm": 3.936140298843384, "learning_rate": 6.238720093145578e-07, "loss": 0.7144, "num_input_tokens_seen": 36246472, "step": 62835 }, { "epoch": 9.359547214775096, "grad_norm": 5.560481071472168, "learning_rate": 6.22430045699074e-07, "loss": 0.5381, "num_input_tokens_seen": 36249224, "step": 62840 }, { "epoch": 9.360291927316055, "grad_norm": 1.5614694356918335, "learning_rate": 6.20989729420654e-07, "loss": 0.6512, "num_input_tokens_seen": 36251944, "step": 62845 }, { "epoch": 9.361036639857016, "grad_norm": 1.9342652559280396, "learning_rate": 6.195510605766342e-07, "loss": 0.5032, "num_input_tokens_seen": 36254824, "step": 62850 }, { "epoch": 9.361781352397974, "grad_norm": 1.5174508094787598, "learning_rate": 6.181140392642309e-07, "loss": 0.5762, "num_input_tokens_seen": 36257800, "step": 62855 }, { "epoch": 9.362526064938933, "grad_norm": 3.3605854511260986, "learning_rate": 6.166786655805473e-07, "loss": 0.5667, "num_input_tokens_seen": 36260840, "step": 62860 }, { "epoch": 9.363270777479892, "grad_norm": 4.737607479095459, "learning_rate": 6.152449396225834e-07, "loss": 0.6744, "num_input_tokens_seen": 36263752, "step": 62865 }, { "epoch": 9.364015490020853, "grad_norm": 2.8820924758911133, "learning_rate": 6.138128614872258e-07, "loss": 0.4388, "num_input_tokens_seen": 36266440, "step": 62870 }, { "epoch": 9.364760202561811, "grad_norm": 2.602980852127075, "learning_rate": 6.123824312712494e-07, "loss": 0.5702, "num_input_tokens_seen": 36268968, "step": 62875 }, { "epoch": 9.36550491510277, "grad_norm": 6.306624412536621, "learning_rate": 6.109536490713136e-07, "loss": 0.6159, "num_input_tokens_seen": 36271560, "step": 62880 }, { "epoch": 9.366249627643729, "grad_norm": 2.9420697689056396, "learning_rate": 6.095265149839769e-07, "loss": 0.48, "num_input_tokens_seen": 36274696, "step": 62885 }, { "epoch": 9.36699434018469, "grad_norm": 2.678187370300293, "learning_rate": 6.081010291056705e-07, "loss": 0.5899, "num_input_tokens_seen": 36277288, "step": 62890 }, { "epoch": 9.367739052725648, "grad_norm": 9.019941329956055, "learning_rate": 6.066771915327257e-07, "loss": 0.6013, "num_input_tokens_seen": 36280328, "step": 62895 }, { "epoch": 9.368483765266607, "grad_norm": 2.4991376399993896, "learning_rate": 6.052550023613601e-07, "loss": 0.7128, "num_input_tokens_seen": 36283112, "step": 62900 }, { "epoch": 9.369228477807566, "grad_norm": 2.9079267978668213, "learning_rate": 6.038344616876801e-07, "loss": 0.5538, "num_input_tokens_seen": 36285928, "step": 62905 }, { "epoch": 9.369973190348526, "grad_norm": 3.8352625370025635, "learning_rate": 6.024155696076784e-07, "loss": 0.5892, "num_input_tokens_seen": 36288712, "step": 62910 }, { "epoch": 9.370717902889485, "grad_norm": 2.0116777420043945, "learning_rate": 6.009983262172392e-07, "loss": 0.589, "num_input_tokens_seen": 36291368, "step": 62915 }, { "epoch": 9.371462615430444, "grad_norm": 3.4914188385009766, "learning_rate": 5.995827316121388e-07, "loss": 0.5978, "num_input_tokens_seen": 36294280, "step": 62920 }, { "epoch": 9.372207327971402, "grad_norm": 3.6758604049682617, "learning_rate": 5.981687858880258e-07, "loss": 0.5008, "num_input_tokens_seen": 36297000, "step": 62925 }, { "epoch": 9.372952040512363, "grad_norm": 2.979321241378784, "learning_rate": 5.967564891404626e-07, "loss": 0.5176, "num_input_tokens_seen": 36299784, "step": 62930 }, { "epoch": 9.373696753053322, "grad_norm": 5.566463947296143, "learning_rate": 5.953458414648755e-07, "loss": 0.6454, "num_input_tokens_seen": 36302312, "step": 62935 }, { "epoch": 9.37444146559428, "grad_norm": 4.197691917419434, "learning_rate": 5.939368429565911e-07, "loss": 0.4293, "num_input_tokens_seen": 36305448, "step": 62940 }, { "epoch": 9.37518617813524, "grad_norm": 3.1527836322784424, "learning_rate": 5.925294937108306e-07, "loss": 0.4265, "num_input_tokens_seen": 36308424, "step": 62945 }, { "epoch": 9.3759308906762, "grad_norm": 2.5163214206695557, "learning_rate": 5.911237938226954e-07, "loss": 0.5565, "num_input_tokens_seen": 36311336, "step": 62950 }, { "epoch": 9.376675603217159, "grad_norm": 2.9487035274505615, "learning_rate": 5.897197433871709e-07, "loss": 0.558, "num_input_tokens_seen": 36313800, "step": 62955 }, { "epoch": 9.377420315758117, "grad_norm": 2.546243667602539, "learning_rate": 5.883173424991423e-07, "loss": 0.3307, "num_input_tokens_seen": 36316712, "step": 62960 }, { "epoch": 9.378165028299076, "grad_norm": 2.7232751846313477, "learning_rate": 5.86916591253378e-07, "loss": 0.5891, "num_input_tokens_seen": 36319976, "step": 62965 }, { "epoch": 9.378909740840037, "grad_norm": 1.8771418333053589, "learning_rate": 5.855174897445359e-07, "loss": 0.4658, "num_input_tokens_seen": 36323016, "step": 62970 }, { "epoch": 9.379654453380995, "grad_norm": 2.601309061050415, "learning_rate": 5.841200380671569e-07, "loss": 0.4124, "num_input_tokens_seen": 36326024, "step": 62975 }, { "epoch": 9.380399165921954, "grad_norm": 2.7104852199554443, "learning_rate": 5.827242363156793e-07, "loss": 0.4163, "num_input_tokens_seen": 36329224, "step": 62980 }, { "epoch": 9.381143878462913, "grad_norm": 3.1791648864746094, "learning_rate": 5.813300845844249e-07, "loss": 0.6154, "num_input_tokens_seen": 36332360, "step": 62985 }, { "epoch": 9.381888591003872, "grad_norm": 4.095108985900879, "learning_rate": 5.799375829676018e-07, "loss": 0.4802, "num_input_tokens_seen": 36335016, "step": 62990 }, { "epoch": 9.382633303544832, "grad_norm": 2.437122106552124, "learning_rate": 5.785467315593124e-07, "loss": 0.5386, "num_input_tokens_seen": 36337768, "step": 62995 }, { "epoch": 9.383378016085791, "grad_norm": 3.7299482822418213, "learning_rate": 5.771575304535453e-07, "loss": 0.6712, "num_input_tokens_seen": 36340680, "step": 63000 }, { "epoch": 9.38412272862675, "grad_norm": 3.872169256210327, "learning_rate": 5.757699797441757e-07, "loss": 0.4669, "num_input_tokens_seen": 36343400, "step": 63005 }, { "epoch": 9.384867441167708, "grad_norm": 2.6694538593292236, "learning_rate": 5.743840795249727e-07, "loss": 0.5288, "num_input_tokens_seen": 36346408, "step": 63010 }, { "epoch": 9.385612153708669, "grad_norm": 7.810412883758545, "learning_rate": 5.729998298895839e-07, "loss": 0.5919, "num_input_tokens_seen": 36349064, "step": 63015 }, { "epoch": 9.386356866249628, "grad_norm": 4.133138179779053, "learning_rate": 5.716172309315537e-07, "loss": 0.4986, "num_input_tokens_seen": 36351784, "step": 63020 }, { "epoch": 9.387101578790586, "grad_norm": 3.5321052074432373, "learning_rate": 5.702362827443131e-07, "loss": 0.4763, "num_input_tokens_seen": 36354760, "step": 63025 }, { "epoch": 9.387846291331545, "grad_norm": 3.5585806369781494, "learning_rate": 5.68856985421179e-07, "loss": 0.5866, "num_input_tokens_seen": 36357832, "step": 63030 }, { "epoch": 9.388591003872506, "grad_norm": 2.0694565773010254, "learning_rate": 5.674793390553601e-07, "loss": 0.5274, "num_input_tokens_seen": 36360712, "step": 63035 }, { "epoch": 9.389335716413465, "grad_norm": 2.1120383739471436, "learning_rate": 5.661033437399516e-07, "loss": 0.4542, "num_input_tokens_seen": 36363432, "step": 63040 }, { "epoch": 9.390080428954423, "grad_norm": 4.342904567718506, "learning_rate": 5.647289995679372e-07, "loss": 0.6746, "num_input_tokens_seen": 36366280, "step": 63045 }, { "epoch": 9.390825141495382, "grad_norm": 3.769636392593384, "learning_rate": 5.633563066321956e-07, "loss": 0.4183, "num_input_tokens_seen": 36368968, "step": 63050 }, { "epoch": 9.391569854036343, "grad_norm": 3.7092621326446533, "learning_rate": 5.619852650254803e-07, "loss": 0.633, "num_input_tokens_seen": 36371912, "step": 63055 }, { "epoch": 9.392314566577301, "grad_norm": 4.3195109367370605, "learning_rate": 5.606158748404423e-07, "loss": 0.5438, "num_input_tokens_seen": 36374696, "step": 63060 }, { "epoch": 9.39305927911826, "grad_norm": 2.2997796535491943, "learning_rate": 5.592481361696183e-07, "loss": 0.6185, "num_input_tokens_seen": 36377480, "step": 63065 }, { "epoch": 9.393803991659219, "grad_norm": 3.272782325744629, "learning_rate": 5.578820491054376e-07, "loss": 0.4206, "num_input_tokens_seen": 36380296, "step": 63070 }, { "epoch": 9.39454870420018, "grad_norm": 3.957282304763794, "learning_rate": 5.565176137402123e-07, "loss": 0.5525, "num_input_tokens_seen": 36383048, "step": 63075 }, { "epoch": 9.395293416741138, "grad_norm": 1.7268338203430176, "learning_rate": 5.551548301661492e-07, "loss": 0.5541, "num_input_tokens_seen": 36385896, "step": 63080 }, { "epoch": 9.396038129282097, "grad_norm": 4.659906387329102, "learning_rate": 5.537936984753384e-07, "loss": 0.3473, "num_input_tokens_seen": 36389032, "step": 63085 }, { "epoch": 9.396782841823056, "grad_norm": 1.612630009651184, "learning_rate": 5.524342187597564e-07, "loss": 0.4505, "num_input_tokens_seen": 36392008, "step": 63090 }, { "epoch": 9.397527554364016, "grad_norm": 1.9339486360549927, "learning_rate": 5.510763911112743e-07, "loss": 0.7627, "num_input_tokens_seen": 36394824, "step": 63095 }, { "epoch": 9.398272266904975, "grad_norm": 2.755099296569824, "learning_rate": 5.497202156216463e-07, "loss": 0.5665, "num_input_tokens_seen": 36397512, "step": 63100 }, { "epoch": 9.399016979445934, "grad_norm": 2.8280038833618164, "learning_rate": 5.483656923825159e-07, "loss": 0.6885, "num_input_tokens_seen": 36400424, "step": 63105 }, { "epoch": 9.399761691986892, "grad_norm": 6.430371284484863, "learning_rate": 5.470128214854236e-07, "loss": 0.6425, "num_input_tokens_seen": 36403240, "step": 63110 }, { "epoch": 9.400506404527853, "grad_norm": 5.620336055755615, "learning_rate": 5.456616030217853e-07, "loss": 0.5119, "num_input_tokens_seen": 36406024, "step": 63115 }, { "epoch": 9.401251117068812, "grad_norm": 5.246488571166992, "learning_rate": 5.443120370829114e-07, "loss": 0.6284, "num_input_tokens_seen": 36409000, "step": 63120 }, { "epoch": 9.40199582960977, "grad_norm": 4.209305286407471, "learning_rate": 5.429641237599981e-07, "loss": 0.7907, "num_input_tokens_seen": 36411976, "step": 63125 }, { "epoch": 9.40274054215073, "grad_norm": 4.463624477386475, "learning_rate": 5.416178631441393e-07, "loss": 0.5701, "num_input_tokens_seen": 36414760, "step": 63130 }, { "epoch": 9.40348525469169, "grad_norm": 3.165811538696289, "learning_rate": 5.402732553263012e-07, "loss": 0.6145, "num_input_tokens_seen": 36417544, "step": 63135 }, { "epoch": 9.404229967232649, "grad_norm": 2.7317636013031006, "learning_rate": 5.389303003973501e-07, "loss": 0.6092, "num_input_tokens_seen": 36420520, "step": 63140 }, { "epoch": 9.404974679773607, "grad_norm": 5.815022945404053, "learning_rate": 5.375889984480381e-07, "loss": 0.7305, "num_input_tokens_seen": 36423240, "step": 63145 }, { "epoch": 9.405719392314566, "grad_norm": 3.8370602130889893, "learning_rate": 5.362493495690069e-07, "loss": 0.4665, "num_input_tokens_seen": 36426376, "step": 63150 }, { "epoch": 9.406464104855527, "grad_norm": 2.6125736236572266, "learning_rate": 5.349113538507783e-07, "loss": 0.7191, "num_input_tokens_seen": 36429320, "step": 63155 }, { "epoch": 9.407208817396485, "grad_norm": 9.503931999206543, "learning_rate": 5.335750113837745e-07, "loss": 0.5427, "num_input_tokens_seen": 36432360, "step": 63160 }, { "epoch": 9.407953529937444, "grad_norm": 2.8135082721710205, "learning_rate": 5.322403222582984e-07, "loss": 0.4657, "num_input_tokens_seen": 36435496, "step": 63165 }, { "epoch": 9.408698242478403, "grad_norm": 2.928346633911133, "learning_rate": 5.309072865645442e-07, "loss": 0.4385, "num_input_tokens_seen": 36438248, "step": 63170 }, { "epoch": 9.409442955019362, "grad_norm": 2.272012948989868, "learning_rate": 5.295759043925902e-07, "loss": 0.7601, "num_input_tokens_seen": 36441288, "step": 63175 }, { "epoch": 9.410187667560322, "grad_norm": 2.207287311553955, "learning_rate": 5.282461758324058e-07, "loss": 0.5208, "num_input_tokens_seen": 36444200, "step": 63180 }, { "epoch": 9.410932380101281, "grad_norm": 2.0142221450805664, "learning_rate": 5.269181009738527e-07, "loss": 0.4357, "num_input_tokens_seen": 36447400, "step": 63185 }, { "epoch": 9.41167709264224, "grad_norm": 2.948364019393921, "learning_rate": 5.255916799066729e-07, "loss": 0.5133, "num_input_tokens_seen": 36450600, "step": 63190 }, { "epoch": 9.412421805183198, "grad_norm": 1.8792765140533447, "learning_rate": 5.242669127205002e-07, "loss": 0.4998, "num_input_tokens_seen": 36453448, "step": 63195 }, { "epoch": 9.413166517724159, "grad_norm": 3.4682326316833496, "learning_rate": 5.229437995048603e-07, "loss": 0.3407, "num_input_tokens_seen": 36456168, "step": 63200 }, { "epoch": 9.413911230265118, "grad_norm": 2.389293909072876, "learning_rate": 5.216223403491593e-07, "loss": 0.4807, "num_input_tokens_seen": 36459336, "step": 63205 }, { "epoch": 9.414655942806077, "grad_norm": 3.8231565952301025, "learning_rate": 5.203025353427038e-07, "loss": 0.4594, "num_input_tokens_seen": 36461992, "step": 63210 }, { "epoch": 9.415400655347035, "grad_norm": 4.682621479034424, "learning_rate": 5.189843845746723e-07, "loss": 0.6104, "num_input_tokens_seen": 36465192, "step": 63215 }, { "epoch": 9.416145367887996, "grad_norm": 5.237422943115234, "learning_rate": 5.176678881341435e-07, "loss": 0.6529, "num_input_tokens_seen": 36468008, "step": 63220 }, { "epoch": 9.416890080428955, "grad_norm": 4.14412260055542, "learning_rate": 5.163530461100824e-07, "loss": 0.7215, "num_input_tokens_seen": 36471080, "step": 63225 }, { "epoch": 9.417634792969913, "grad_norm": 2.454094409942627, "learning_rate": 5.150398585913374e-07, "loss": 0.6933, "num_input_tokens_seen": 36473864, "step": 63230 }, { "epoch": 9.418379505510872, "grad_norm": 2.7243854999542236, "learning_rate": 5.137283256666486e-07, "loss": 0.6637, "num_input_tokens_seen": 36476840, "step": 63235 }, { "epoch": 9.419124218051833, "grad_norm": 3.830587863922119, "learning_rate": 5.12418447424648e-07, "loss": 0.7066, "num_input_tokens_seen": 36479560, "step": 63240 }, { "epoch": 9.419868930592791, "grad_norm": 2.5018694400787354, "learning_rate": 5.111102239538479e-07, "loss": 0.5931, "num_input_tokens_seen": 36482024, "step": 63245 }, { "epoch": 9.42061364313375, "grad_norm": 3.98016357421875, "learning_rate": 5.098036553426583e-07, "loss": 0.6865, "num_input_tokens_seen": 36485256, "step": 63250 }, { "epoch": 9.421358355674709, "grad_norm": 2.5897574424743652, "learning_rate": 5.084987416793669e-07, "loss": 0.5651, "num_input_tokens_seen": 36487912, "step": 63255 }, { "epoch": 9.42210306821567, "grad_norm": 7.743433952331543, "learning_rate": 5.07195483052153e-07, "loss": 0.615, "num_input_tokens_seen": 36490600, "step": 63260 }, { "epoch": 9.422847780756628, "grad_norm": 1.8966647386550903, "learning_rate": 5.058938795490881e-07, "loss": 0.5089, "num_input_tokens_seen": 36493352, "step": 63265 }, { "epoch": 9.423592493297587, "grad_norm": 5.419064044952393, "learning_rate": 5.045939312581294e-07, "loss": 0.7278, "num_input_tokens_seen": 36496296, "step": 63270 }, { "epoch": 9.424337205838546, "grad_norm": 4.192561626434326, "learning_rate": 5.032956382671206e-07, "loss": 0.5498, "num_input_tokens_seen": 36499080, "step": 63275 }, { "epoch": 9.425081918379506, "grad_norm": 3.697835922241211, "learning_rate": 5.019990006637998e-07, "loss": 0.5355, "num_input_tokens_seen": 36501736, "step": 63280 }, { "epoch": 9.425826630920465, "grad_norm": 3.763862133026123, "learning_rate": 5.007040185357803e-07, "loss": 0.749, "num_input_tokens_seen": 36504680, "step": 63285 }, { "epoch": 9.426571343461424, "grad_norm": 2.5947301387786865, "learning_rate": 4.99410691970581e-07, "loss": 0.5918, "num_input_tokens_seen": 36507560, "step": 63290 }, { "epoch": 9.427316056002383, "grad_norm": 3.303304433822632, "learning_rate": 4.98119021055593e-07, "loss": 0.5123, "num_input_tokens_seen": 36510440, "step": 63295 }, { "epoch": 9.428060768543343, "grad_norm": 3.7404682636260986, "learning_rate": 4.968290058781022e-07, "loss": 0.3942, "num_input_tokens_seen": 36513512, "step": 63300 }, { "epoch": 9.428805481084302, "grad_norm": 5.138016223907471, "learning_rate": 4.95540646525286e-07, "loss": 0.6626, "num_input_tokens_seen": 36516328, "step": 63305 }, { "epoch": 9.42955019362526, "grad_norm": 3.663360357284546, "learning_rate": 4.942539430842052e-07, "loss": 0.7828, "num_input_tokens_seen": 36519144, "step": 63310 }, { "epoch": 9.43029490616622, "grad_norm": 2.472187042236328, "learning_rate": 4.929688956418099e-07, "loss": 0.6342, "num_input_tokens_seen": 36521992, "step": 63315 }, { "epoch": 9.43103961870718, "grad_norm": 2.4838743209838867, "learning_rate": 4.916855042849388e-07, "loss": 0.5782, "num_input_tokens_seen": 36525160, "step": 63320 }, { "epoch": 9.431784331248139, "grad_norm": 3.6905081272125244, "learning_rate": 4.904037691003172e-07, "loss": 0.5517, "num_input_tokens_seen": 36527944, "step": 63325 }, { "epoch": 9.432529043789097, "grad_norm": 3.517634153366089, "learning_rate": 4.891236901745616e-07, "loss": 0.7504, "num_input_tokens_seen": 36531016, "step": 63330 }, { "epoch": 9.433273756330056, "grad_norm": 2.828385353088379, "learning_rate": 4.878452675941697e-07, "loss": 0.4139, "num_input_tokens_seen": 36533992, "step": 63335 }, { "epoch": 9.434018468871017, "grad_norm": 4.878207683563232, "learning_rate": 4.865685014455363e-07, "loss": 0.709, "num_input_tokens_seen": 36536616, "step": 63340 }, { "epoch": 9.434763181411975, "grad_norm": 4.865743160247803, "learning_rate": 4.852933918149394e-07, "loss": 0.7022, "num_input_tokens_seen": 36539528, "step": 63345 }, { "epoch": 9.435507893952934, "grad_norm": 1.7190669775009155, "learning_rate": 4.840199387885491e-07, "loss": 0.7075, "num_input_tokens_seen": 36542344, "step": 63350 }, { "epoch": 9.436252606493893, "grad_norm": 4.0413594245910645, "learning_rate": 4.827481424524133e-07, "loss": 0.8209, "num_input_tokens_seen": 36545224, "step": 63355 }, { "epoch": 9.436997319034852, "grad_norm": 5.444726943969727, "learning_rate": 4.81478002892477e-07, "loss": 0.5537, "num_input_tokens_seen": 36548264, "step": 63360 }, { "epoch": 9.437742031575812, "grad_norm": 1.7974838018417358, "learning_rate": 4.802095201945745e-07, "loss": 0.3642, "num_input_tokens_seen": 36550984, "step": 63365 }, { "epoch": 9.438486744116771, "grad_norm": 5.681240081787109, "learning_rate": 4.789426944444231e-07, "loss": 0.656, "num_input_tokens_seen": 36554056, "step": 63370 }, { "epoch": 9.43923145665773, "grad_norm": 2.1842398643493652, "learning_rate": 4.776775257276267e-07, "loss": 0.6046, "num_input_tokens_seen": 36557032, "step": 63375 }, { "epoch": 9.439976169198689, "grad_norm": 3.525604248046875, "learning_rate": 4.7641401412968357e-07, "loss": 0.6646, "num_input_tokens_seen": 36559912, "step": 63380 }, { "epoch": 9.440720881739649, "grad_norm": 5.025163650512695, "learning_rate": 4.7515215973597815e-07, "loss": 0.6272, "num_input_tokens_seen": 36562952, "step": 63385 }, { "epoch": 9.441465594280608, "grad_norm": 3.006779432296753, "learning_rate": 4.738919626317756e-07, "loss": 0.6023, "num_input_tokens_seen": 36565864, "step": 63390 }, { "epoch": 9.442210306821567, "grad_norm": 2.2028584480285645, "learning_rate": 4.726334229022383e-07, "loss": 0.6153, "num_input_tokens_seen": 36568840, "step": 63395 }, { "epoch": 9.442955019362525, "grad_norm": 5.6331353187561035, "learning_rate": 4.71376540632415e-07, "loss": 0.5333, "num_input_tokens_seen": 36571528, "step": 63400 }, { "epoch": 9.443699731903486, "grad_norm": 3.6520562171936035, "learning_rate": 4.7012131590723765e-07, "loss": 0.6789, "num_input_tokens_seen": 36574120, "step": 63405 }, { "epoch": 9.444444444444445, "grad_norm": 4.016339302062988, "learning_rate": 4.688677488115328e-07, "loss": 0.4733, "num_input_tokens_seen": 36577288, "step": 63410 }, { "epoch": 9.445189156985403, "grad_norm": 1.7468490600585938, "learning_rate": 4.676158394300051e-07, "loss": 0.5549, "num_input_tokens_seen": 36580072, "step": 63415 }, { "epoch": 9.445933869526362, "grad_norm": 4.049680233001709, "learning_rate": 4.663655878472617e-07, "loss": 0.4999, "num_input_tokens_seen": 36583144, "step": 63420 }, { "epoch": 9.446678582067323, "grad_norm": 2.803171157836914, "learning_rate": 4.651169941477851e-07, "loss": 0.5172, "num_input_tokens_seen": 36585896, "step": 63425 }, { "epoch": 9.447423294608281, "grad_norm": 2.528151273727417, "learning_rate": 4.6387005841594943e-07, "loss": 0.682, "num_input_tokens_seen": 36589032, "step": 63430 }, { "epoch": 9.44816800714924, "grad_norm": 2.970064163208008, "learning_rate": 4.626247807360151e-07, "loss": 0.4484, "num_input_tokens_seen": 36592072, "step": 63435 }, { "epoch": 9.448912719690199, "grad_norm": 2.8475382328033447, "learning_rate": 4.613811611921398e-07, "loss": 0.599, "num_input_tokens_seen": 36595080, "step": 63440 }, { "epoch": 9.44965743223116, "grad_norm": 6.14182710647583, "learning_rate": 4.6013919986836187e-07, "loss": 0.5096, "num_input_tokens_seen": 36598152, "step": 63445 }, { "epoch": 9.450402144772118, "grad_norm": 3.3453147411346436, "learning_rate": 4.588988968486002e-07, "loss": 0.677, "num_input_tokens_seen": 36601096, "step": 63450 }, { "epoch": 9.451146857313077, "grad_norm": 3.125840425491333, "learning_rate": 4.5766025221667674e-07, "loss": 0.8806, "num_input_tokens_seen": 36604008, "step": 63455 }, { "epoch": 9.451891569854036, "grad_norm": 2.3800692558288574, "learning_rate": 4.5642326605629116e-07, "loss": 0.5626, "num_input_tokens_seen": 36606888, "step": 63460 }, { "epoch": 9.452636282394996, "grad_norm": 2.3825650215148926, "learning_rate": 4.5518793845103215e-07, "loss": 0.5063, "num_input_tokens_seen": 36609928, "step": 63465 }, { "epoch": 9.453380994935955, "grad_norm": 6.7195844650268555, "learning_rate": 4.539542694843829e-07, "loss": 0.6522, "num_input_tokens_seen": 36612712, "step": 63470 }, { "epoch": 9.454125707476914, "grad_norm": 5.283756732940674, "learning_rate": 4.527222592397046e-07, "loss": 0.4455, "num_input_tokens_seen": 36615400, "step": 63475 }, { "epoch": 9.454870420017873, "grad_norm": 2.374993324279785, "learning_rate": 4.514919078002583e-07, "loss": 0.5834, "num_input_tokens_seen": 36618344, "step": 63480 }, { "epoch": 9.455615132558833, "grad_norm": 2.1319570541381836, "learning_rate": 4.502632152491776e-07, "loss": 0.5943, "num_input_tokens_seen": 36621352, "step": 63485 }, { "epoch": 9.456359845099792, "grad_norm": 4.233776092529297, "learning_rate": 4.490361816694988e-07, "loss": 0.5695, "num_input_tokens_seen": 36624264, "step": 63490 }, { "epoch": 9.45710455764075, "grad_norm": 6.541616439819336, "learning_rate": 4.478108071441389e-07, "loss": 0.7017, "num_input_tokens_seen": 36626952, "step": 63495 }, { "epoch": 9.45784927018171, "grad_norm": 3.3180911540985107, "learning_rate": 4.4658709175590116e-07, "loss": 0.6136, "num_input_tokens_seen": 36629672, "step": 63500 }, { "epoch": 9.458593982722668, "grad_norm": 2.7808797359466553, "learning_rate": 4.4536503558748057e-07, "loss": 0.4713, "num_input_tokens_seen": 36632488, "step": 63505 }, { "epoch": 9.459338695263629, "grad_norm": 2.544226884841919, "learning_rate": 4.441446387214582e-07, "loss": 0.631, "num_input_tokens_seen": 36635112, "step": 63510 }, { "epoch": 9.460083407804587, "grad_norm": 3.7710025310516357, "learning_rate": 4.4292590124030697e-07, "loss": 0.6653, "num_input_tokens_seen": 36637800, "step": 63515 }, { "epoch": 9.460828120345546, "grad_norm": 3.4094440937042236, "learning_rate": 4.4170882322638053e-07, "loss": 0.3668, "num_input_tokens_seen": 36640616, "step": 63520 }, { "epoch": 9.461572832886505, "grad_norm": 2.646134376525879, "learning_rate": 4.4049340476192414e-07, "loss": 0.5814, "num_input_tokens_seen": 36643816, "step": 63525 }, { "epoch": 9.462317545427466, "grad_norm": 2.874788999557495, "learning_rate": 4.392796459290721e-07, "loss": 0.549, "num_input_tokens_seen": 36646536, "step": 63530 }, { "epoch": 9.463062257968424, "grad_norm": 4.067875862121582, "learning_rate": 4.380675468098477e-07, "loss": 0.5184, "num_input_tokens_seen": 36649128, "step": 63535 }, { "epoch": 9.463806970509383, "grad_norm": 2.4887781143188477, "learning_rate": 4.3685710748615493e-07, "loss": 0.6107, "num_input_tokens_seen": 36651976, "step": 63540 }, { "epoch": 9.464551683050342, "grad_norm": 3.6574161052703857, "learning_rate": 4.356483280397894e-07, "loss": 0.4255, "num_input_tokens_seen": 36654920, "step": 63545 }, { "epoch": 9.465296395591302, "grad_norm": 4.275592803955078, "learning_rate": 4.344412085524441e-07, "loss": 0.5336, "num_input_tokens_seen": 36657832, "step": 63550 }, { "epoch": 9.466041108132261, "grad_norm": 5.946915149688721, "learning_rate": 4.3323574910568157e-07, "loss": 0.7083, "num_input_tokens_seen": 36660712, "step": 63555 }, { "epoch": 9.46678582067322, "grad_norm": 1.9859215021133423, "learning_rate": 4.320319497809672e-07, "loss": 0.6641, "num_input_tokens_seen": 36663432, "step": 63560 }, { "epoch": 9.467530533214179, "grad_norm": 4.361292839050293, "learning_rate": 4.30829810659647e-07, "loss": 0.644, "num_input_tokens_seen": 36666344, "step": 63565 }, { "epoch": 9.46827524575514, "grad_norm": 3.279465913772583, "learning_rate": 4.2962933182295606e-07, "loss": 0.4729, "num_input_tokens_seen": 36669128, "step": 63570 }, { "epoch": 9.469019958296098, "grad_norm": 10.287147521972656, "learning_rate": 4.2843051335202386e-07, "loss": 0.7709, "num_input_tokens_seen": 36672104, "step": 63575 }, { "epoch": 9.469764670837057, "grad_norm": 6.917392730712891, "learning_rate": 4.2723335532785235e-07, "loss": 0.9489, "num_input_tokens_seen": 36674984, "step": 63580 }, { "epoch": 9.470509383378015, "grad_norm": 1.8052375316619873, "learning_rate": 4.26037857831349e-07, "loss": 0.5754, "num_input_tokens_seen": 36678408, "step": 63585 }, { "epoch": 9.471254095918976, "grad_norm": 4.266767978668213, "learning_rate": 4.2484402094329354e-07, "loss": 0.5633, "num_input_tokens_seen": 36681352, "step": 63590 }, { "epoch": 9.471998808459935, "grad_norm": 4.036589622497559, "learning_rate": 4.2365184474436327e-07, "loss": 0.7291, "num_input_tokens_seen": 36684552, "step": 63595 }, { "epoch": 9.472743521000893, "grad_norm": 1.3873974084854126, "learning_rate": 4.224613293151214e-07, "loss": 0.3877, "num_input_tokens_seen": 36687304, "step": 63600 }, { "epoch": 9.473488233541852, "grad_norm": 2.142892599105835, "learning_rate": 4.212724747360175e-07, "loss": 0.3459, "num_input_tokens_seen": 36690344, "step": 63605 }, { "epoch": 9.474232946082813, "grad_norm": 3.7544052600860596, "learning_rate": 4.2008528108739287e-07, "loss": 0.6168, "num_input_tokens_seen": 36692936, "step": 63610 }, { "epoch": 9.474977658623772, "grad_norm": 1.975918173789978, "learning_rate": 4.1889974844946947e-07, "loss": 0.3564, "num_input_tokens_seen": 36695816, "step": 63615 }, { "epoch": 9.47572237116473, "grad_norm": 3.435237407684326, "learning_rate": 4.177158769023609e-07, "loss": 0.6684, "num_input_tokens_seen": 36699208, "step": 63620 }, { "epoch": 9.476467083705689, "grad_norm": 2.7957630157470703, "learning_rate": 4.1653366652607e-07, "loss": 0.5882, "num_input_tokens_seen": 36701896, "step": 63625 }, { "epoch": 9.47721179624665, "grad_norm": 2.6402182579040527, "learning_rate": 4.153531174004827e-07, "loss": 0.5627, "num_input_tokens_seen": 36704680, "step": 63630 }, { "epoch": 9.477956508787608, "grad_norm": 2.591235399246216, "learning_rate": 4.141742296053769e-07, "loss": 0.6184, "num_input_tokens_seen": 36707528, "step": 63635 }, { "epoch": 9.478701221328567, "grad_norm": 2.5691723823547363, "learning_rate": 4.1299700322041945e-07, "loss": 0.5959, "num_input_tokens_seen": 36710600, "step": 63640 }, { "epoch": 9.479445933869526, "grad_norm": 1.0134721994400024, "learning_rate": 4.118214383251634e-07, "loss": 0.5222, "num_input_tokens_seen": 36713544, "step": 63645 }, { "epoch": 9.480190646410486, "grad_norm": 3.5959689617156982, "learning_rate": 4.106475349990452e-07, "loss": 0.5256, "num_input_tokens_seen": 36716616, "step": 63650 }, { "epoch": 9.480935358951445, "grad_norm": 9.073653221130371, "learning_rate": 4.09475293321393e-07, "loss": 0.6102, "num_input_tokens_seen": 36719528, "step": 63655 }, { "epoch": 9.481680071492404, "grad_norm": 4.753718376159668, "learning_rate": 4.0830471337142407e-07, "loss": 0.809, "num_input_tokens_seen": 36722440, "step": 63660 }, { "epoch": 9.482424784033363, "grad_norm": 3.6645102500915527, "learning_rate": 4.071357952282362e-07, "loss": 0.5804, "num_input_tokens_seen": 36725352, "step": 63665 }, { "epoch": 9.483169496574323, "grad_norm": 4.029453754425049, "learning_rate": 4.059685389708273e-07, "loss": 0.6604, "num_input_tokens_seen": 36728168, "step": 63670 }, { "epoch": 9.483914209115282, "grad_norm": 5.338937282562256, "learning_rate": 4.048029446780704e-07, "loss": 0.6468, "num_input_tokens_seen": 36730856, "step": 63675 }, { "epoch": 9.48465892165624, "grad_norm": 4.153664588928223, "learning_rate": 4.0363901242873594e-07, "loss": 0.6771, "num_input_tokens_seen": 36733608, "step": 63680 }, { "epoch": 9.4854036341972, "grad_norm": 2.8366363048553467, "learning_rate": 4.0247674230147467e-07, "loss": 0.5796, "num_input_tokens_seen": 36736360, "step": 63685 }, { "epoch": 9.486148346738158, "grad_norm": 2.1608810424804688, "learning_rate": 4.0131613437482674e-07, "loss": 0.6611, "num_input_tokens_seen": 36739048, "step": 63690 }, { "epoch": 9.486893059279119, "grad_norm": 2.25927996635437, "learning_rate": 4.001571887272293e-07, "loss": 0.5064, "num_input_tokens_seen": 36741864, "step": 63695 }, { "epoch": 9.487637771820078, "grad_norm": 3.3737871646881104, "learning_rate": 3.989999054369864e-07, "loss": 0.4864, "num_input_tokens_seen": 36744776, "step": 63700 }, { "epoch": 9.488382484361036, "grad_norm": 3.41178035736084, "learning_rate": 3.978442845823133e-07, "loss": 0.5953, "num_input_tokens_seen": 36747624, "step": 63705 }, { "epoch": 9.489127196901995, "grad_norm": 5.059535503387451, "learning_rate": 3.966903262412974e-07, "loss": 0.5414, "num_input_tokens_seen": 36750216, "step": 63710 }, { "epoch": 9.489871909442956, "grad_norm": 2.202549934387207, "learning_rate": 3.9553803049192096e-07, "loss": 0.5941, "num_input_tokens_seen": 36752872, "step": 63715 }, { "epoch": 9.490616621983914, "grad_norm": 2.8907361030578613, "learning_rate": 3.9438739741204935e-07, "loss": 0.4957, "num_input_tokens_seen": 36755912, "step": 63720 }, { "epoch": 9.491361334524873, "grad_norm": 2.8017988204956055, "learning_rate": 3.9323842707943703e-07, "loss": 0.5062, "num_input_tokens_seen": 36758696, "step": 63725 }, { "epoch": 9.492106047065832, "grad_norm": 2.7344813346862793, "learning_rate": 3.920911195717275e-07, "loss": 0.5313, "num_input_tokens_seen": 36761416, "step": 63730 }, { "epoch": 9.492850759606792, "grad_norm": 4.637073516845703, "learning_rate": 3.909454749664532e-07, "loss": 0.7744, "num_input_tokens_seen": 36764040, "step": 63735 }, { "epoch": 9.493595472147751, "grad_norm": 4.2359466552734375, "learning_rate": 3.898014933410299e-07, "loss": 0.9648, "num_input_tokens_seen": 36766760, "step": 63740 }, { "epoch": 9.49434018468871, "grad_norm": 1.2353265285491943, "learning_rate": 3.8865917477276527e-07, "loss": 0.6072, "num_input_tokens_seen": 36769352, "step": 63745 }, { "epoch": 9.495084897229669, "grad_norm": 4.118026256561279, "learning_rate": 3.875185193388503e-07, "loss": 0.6354, "num_input_tokens_seen": 36772200, "step": 63750 }, { "epoch": 9.49582960977063, "grad_norm": 5.251855373382568, "learning_rate": 3.8637952711636504e-07, "loss": 0.5453, "num_input_tokens_seen": 36774760, "step": 63755 }, { "epoch": 9.496574322311588, "grad_norm": 4.056503772735596, "learning_rate": 3.8524219818228123e-07, "loss": 0.5702, "num_input_tokens_seen": 36777672, "step": 63760 }, { "epoch": 9.497319034852547, "grad_norm": 3.2753376960754395, "learning_rate": 3.8410653261345407e-07, "loss": 0.708, "num_input_tokens_seen": 36780712, "step": 63765 }, { "epoch": 9.498063747393505, "grad_norm": 3.210963249206543, "learning_rate": 3.8297253048662494e-07, "loss": 0.4853, "num_input_tokens_seen": 36783464, "step": 63770 }, { "epoch": 9.498808459934466, "grad_norm": 3.2345669269561768, "learning_rate": 3.818401918784298e-07, "loss": 0.8836, "num_input_tokens_seen": 36785992, "step": 63775 }, { "epoch": 9.499553172475425, "grad_norm": 2.9954605102539062, "learning_rate": 3.807095168653796e-07, "loss": 0.4761, "num_input_tokens_seen": 36789032, "step": 63780 }, { "epoch": 9.5, "eval_loss": 0.6832270622253418, "eval_runtime": 74.0191, "eval_samples_per_second": 40.314, "eval_steps_per_second": 10.078, "num_input_tokens_seen": 36790952, "step": 63783 }, { "epoch": 9.500297885016384, "grad_norm": 3.418241500854492, "learning_rate": 3.7958050552389104e-07, "loss": 0.7175, "num_input_tokens_seen": 36792136, "step": 63785 }, { "epoch": 9.501042597557342, "grad_norm": 2.1607425212860107, "learning_rate": 3.784531579302475e-07, "loss": 0.4485, "num_input_tokens_seen": 36795016, "step": 63790 }, { "epoch": 9.501787310098303, "grad_norm": 9.739537239074707, "learning_rate": 3.7732747416063805e-07, "loss": 0.7141, "num_input_tokens_seen": 36797832, "step": 63795 }, { "epoch": 9.502532022639262, "grad_norm": 3.087592840194702, "learning_rate": 3.762034542911269e-07, "loss": 0.6933, "num_input_tokens_seen": 36800616, "step": 63800 }, { "epoch": 9.50327673518022, "grad_norm": 2.3101108074188232, "learning_rate": 3.7508109839767546e-07, "loss": 0.6043, "num_input_tokens_seen": 36803464, "step": 63805 }, { "epoch": 9.504021447721179, "grad_norm": 3.658268451690674, "learning_rate": 3.7396040655612587e-07, "loss": 0.5571, "num_input_tokens_seen": 36806376, "step": 63810 }, { "epoch": 9.50476616026214, "grad_norm": 1.8447221517562866, "learning_rate": 3.728413788422119e-07, "loss": 0.3487, "num_input_tokens_seen": 36809352, "step": 63815 }, { "epoch": 9.505510872803098, "grad_norm": 7.142629146575928, "learning_rate": 3.7172401533154823e-07, "loss": 0.5427, "num_input_tokens_seen": 36812072, "step": 63820 }, { "epoch": 9.506255585344057, "grad_norm": 4.213608741760254, "learning_rate": 3.706083160996437e-07, "loss": 0.43, "num_input_tokens_seen": 36815176, "step": 63825 }, { "epoch": 9.507000297885016, "grad_norm": 3.8815386295318604, "learning_rate": 3.6949428122189375e-07, "loss": 0.5525, "num_input_tokens_seen": 36818056, "step": 63830 }, { "epoch": 9.507745010425975, "grad_norm": 2.439707040786743, "learning_rate": 3.6838191077357975e-07, "loss": 0.7606, "num_input_tokens_seen": 36821032, "step": 63835 }, { "epoch": 9.508489722966935, "grad_norm": 3.346933126449585, "learning_rate": 3.672712048298721e-07, "loss": 0.6374, "num_input_tokens_seen": 36824008, "step": 63840 }, { "epoch": 9.509234435507894, "grad_norm": 5.0772013664245605, "learning_rate": 3.661621634658274e-07, "loss": 0.6843, "num_input_tokens_seen": 36826888, "step": 63845 }, { "epoch": 9.509979148048853, "grad_norm": 1.6599804162979126, "learning_rate": 3.650547867563886e-07, "loss": 0.5857, "num_input_tokens_seen": 36830056, "step": 63850 }, { "epoch": 9.510723860589813, "grad_norm": 2.2017147541046143, "learning_rate": 3.6394907477639294e-07, "loss": 0.5722, "num_input_tokens_seen": 36832968, "step": 63855 }, { "epoch": 9.511468573130772, "grad_norm": 2.1920132637023926, "learning_rate": 3.628450276005502e-07, "loss": 0.6179, "num_input_tokens_seen": 36835784, "step": 63860 }, { "epoch": 9.51221328567173, "grad_norm": 5.304088592529297, "learning_rate": 3.6174264530347557e-07, "loss": 0.6995, "num_input_tokens_seen": 36838536, "step": 63865 }, { "epoch": 9.51295799821269, "grad_norm": 2.8179266452789307, "learning_rate": 3.6064192795965956e-07, "loss": 0.6808, "num_input_tokens_seen": 36841736, "step": 63870 }, { "epoch": 9.513702710753648, "grad_norm": 2.5270836353302, "learning_rate": 3.595428756434871e-07, "loss": 0.514, "num_input_tokens_seen": 36844392, "step": 63875 }, { "epoch": 9.514447423294609, "grad_norm": 1.481031060218811, "learning_rate": 3.584454884292293e-07, "loss": 0.6306, "num_input_tokens_seen": 36847272, "step": 63880 }, { "epoch": 9.515192135835568, "grad_norm": 5.660893440246582, "learning_rate": 3.5734976639103525e-07, "loss": 0.7076, "num_input_tokens_seen": 36850056, "step": 63885 }, { "epoch": 9.515936848376526, "grad_norm": 2.549102544784546, "learning_rate": 3.5625570960295674e-07, "loss": 0.5753, "num_input_tokens_seen": 36853128, "step": 63890 }, { "epoch": 9.516681560917485, "grad_norm": 6.5215229988098145, "learning_rate": 3.5516331813892355e-07, "loss": 0.4459, "num_input_tokens_seen": 36856104, "step": 63895 }, { "epoch": 9.517426273458446, "grad_norm": 1.746680736541748, "learning_rate": 3.5407259207275444e-07, "loss": 0.4975, "num_input_tokens_seen": 36859112, "step": 63900 }, { "epoch": 9.518170985999404, "grad_norm": 3.7675297260284424, "learning_rate": 3.529835314781543e-07, "loss": 0.5163, "num_input_tokens_seen": 36862376, "step": 63905 }, { "epoch": 9.518915698540363, "grad_norm": 2.1994810104370117, "learning_rate": 3.5189613642872264e-07, "loss": 0.516, "num_input_tokens_seen": 36865512, "step": 63910 }, { "epoch": 9.519660411081322, "grad_norm": 3.101938009262085, "learning_rate": 3.508104069979368e-07, "loss": 0.4869, "num_input_tokens_seen": 36868488, "step": 63915 }, { "epoch": 9.520405123622282, "grad_norm": 1.1381489038467407, "learning_rate": 3.4972634325916854e-07, "loss": 0.5994, "num_input_tokens_seen": 36871240, "step": 63920 }, { "epoch": 9.521149836163241, "grad_norm": 4.500250816345215, "learning_rate": 3.486439452856705e-07, "loss": 0.5477, "num_input_tokens_seen": 36874120, "step": 63925 }, { "epoch": 9.5218945487042, "grad_norm": 2.390627861022949, "learning_rate": 3.4756321315058957e-07, "loss": 0.4656, "num_input_tokens_seen": 36877128, "step": 63930 }, { "epoch": 9.522639261245159, "grad_norm": 4.045703411102295, "learning_rate": 3.4648414692696196e-07, "loss": 0.7014, "num_input_tokens_seen": 36879976, "step": 63935 }, { "epoch": 9.52338397378612, "grad_norm": 3.0631871223449707, "learning_rate": 3.4540674668769866e-07, "loss": 0.632, "num_input_tokens_seen": 36882760, "step": 63940 }, { "epoch": 9.524128686327078, "grad_norm": 6.141455173492432, "learning_rate": 3.443310125056082e-07, "loss": 0.7991, "num_input_tokens_seen": 36885672, "step": 63945 }, { "epoch": 9.524873398868037, "grad_norm": 3.5086238384246826, "learning_rate": 3.4325694445338783e-07, "loss": 0.7016, "num_input_tokens_seen": 36888456, "step": 63950 }, { "epoch": 9.525618111408996, "grad_norm": 2.322599411010742, "learning_rate": 3.42184542603613e-07, "loss": 0.5795, "num_input_tokens_seen": 36891496, "step": 63955 }, { "epoch": 9.526362823949956, "grad_norm": 3.4485561847686768, "learning_rate": 3.411138070287562e-07, "loss": 0.6575, "num_input_tokens_seen": 36894376, "step": 63960 }, { "epoch": 9.527107536490915, "grad_norm": 3.5233640670776367, "learning_rate": 3.400447378011734e-07, "loss": 0.5804, "num_input_tokens_seen": 36897128, "step": 63965 }, { "epoch": 9.527852249031874, "grad_norm": 3.4257259368896484, "learning_rate": 3.389773349931069e-07, "loss": 0.5185, "num_input_tokens_seen": 36900040, "step": 63970 }, { "epoch": 9.528596961572832, "grad_norm": 4.209555625915527, "learning_rate": 3.3791159867668786e-07, "loss": 0.5183, "num_input_tokens_seen": 36902984, "step": 63975 }, { "epoch": 9.529341674113793, "grad_norm": 5.424502849578857, "learning_rate": 3.3684752892393643e-07, "loss": 0.594, "num_input_tokens_seen": 36906152, "step": 63980 }, { "epoch": 9.530086386654752, "grad_norm": 1.7628642320632935, "learning_rate": 3.357851258067535e-07, "loss": 0.4083, "num_input_tokens_seen": 36909000, "step": 63985 }, { "epoch": 9.53083109919571, "grad_norm": 2.2127304077148438, "learning_rate": 3.347243893969343e-07, "loss": 0.5787, "num_input_tokens_seen": 36911848, "step": 63990 }, { "epoch": 9.53157581173667, "grad_norm": 2.251899480819702, "learning_rate": 3.3366531976615767e-07, "loss": 0.4753, "num_input_tokens_seen": 36914824, "step": 63995 }, { "epoch": 9.53232052427763, "grad_norm": 3.541283130645752, "learning_rate": 3.326079169859941e-07, "loss": 0.3836, "num_input_tokens_seen": 36917384, "step": 64000 }, { "epoch": 9.533065236818588, "grad_norm": 3.287623643875122, "learning_rate": 3.3155218112789763e-07, "loss": 0.5527, "num_input_tokens_seen": 36920488, "step": 64005 }, { "epoch": 9.533809949359547, "grad_norm": 4.768036365509033, "learning_rate": 3.3049811226321113e-07, "loss": 0.5622, "num_input_tokens_seen": 36923688, "step": 64010 }, { "epoch": 9.534554661900506, "grad_norm": 5.415800094604492, "learning_rate": 3.2944571046316373e-07, "loss": 0.5872, "num_input_tokens_seen": 36926728, "step": 64015 }, { "epoch": 9.535299374441465, "grad_norm": 4.132192134857178, "learning_rate": 3.283949757988708e-07, "loss": 0.5256, "num_input_tokens_seen": 36929480, "step": 64020 }, { "epoch": 9.536044086982425, "grad_norm": 3.565228223800659, "learning_rate": 3.273459083413366e-07, "loss": 0.5816, "num_input_tokens_seen": 36932680, "step": 64025 }, { "epoch": 9.536788799523384, "grad_norm": 2.2703394889831543, "learning_rate": 3.2629850816145723e-07, "loss": 0.4728, "num_input_tokens_seen": 36935688, "step": 64030 }, { "epoch": 9.537533512064343, "grad_norm": 3.2852683067321777, "learning_rate": 3.2525277533000667e-07, "loss": 0.6092, "num_input_tokens_seen": 36938664, "step": 64035 }, { "epoch": 9.538278224605303, "grad_norm": 3.084378719329834, "learning_rate": 3.242087099176533e-07, "loss": 0.4603, "num_input_tokens_seen": 36941736, "step": 64040 }, { "epoch": 9.539022937146262, "grad_norm": 4.601776123046875, "learning_rate": 3.2316631199495186e-07, "loss": 0.5788, "num_input_tokens_seen": 36944680, "step": 64045 }, { "epoch": 9.53976764968722, "grad_norm": 1.9124720096588135, "learning_rate": 3.2212558163234043e-07, "loss": 0.4246, "num_input_tokens_seen": 36947496, "step": 64050 }, { "epoch": 9.54051236222818, "grad_norm": 5.040890216827393, "learning_rate": 3.2108651890014884e-07, "loss": 0.5655, "num_input_tokens_seen": 36950504, "step": 64055 }, { "epoch": 9.541257074769138, "grad_norm": 2.6784658432006836, "learning_rate": 3.20049123868596e-07, "loss": 0.5043, "num_input_tokens_seen": 36953160, "step": 64060 }, { "epoch": 9.542001787310099, "grad_norm": 1.448149561882019, "learning_rate": 3.1901339660778127e-07, "loss": 0.4377, "num_input_tokens_seen": 36956072, "step": 64065 }, { "epoch": 9.542746499851058, "grad_norm": 2.162432909011841, "learning_rate": 3.17979337187696e-07, "loss": 0.5765, "num_input_tokens_seen": 36959080, "step": 64070 }, { "epoch": 9.543491212392016, "grad_norm": 3.071591854095459, "learning_rate": 3.169469456782148e-07, "loss": 0.6395, "num_input_tokens_seen": 36961960, "step": 64075 }, { "epoch": 9.544235924932975, "grad_norm": 2.867281913757324, "learning_rate": 3.1591622214910686e-07, "loss": 0.5998, "num_input_tokens_seen": 36964680, "step": 64080 }, { "epoch": 9.544980637473936, "grad_norm": 3.138633966445923, "learning_rate": 3.1488716667002204e-07, "loss": 0.5985, "num_input_tokens_seen": 36967720, "step": 64085 }, { "epoch": 9.545725350014894, "grad_norm": 2.0458948612213135, "learning_rate": 3.138597793105019e-07, "loss": 0.5745, "num_input_tokens_seen": 36970760, "step": 64090 }, { "epoch": 9.546470062555853, "grad_norm": 6.419363021850586, "learning_rate": 3.1283406013996874e-07, "loss": 0.6421, "num_input_tokens_seen": 36973864, "step": 64095 }, { "epoch": 9.547214775096812, "grad_norm": 3.5477993488311768, "learning_rate": 3.118100092277421e-07, "loss": 0.6273, "num_input_tokens_seen": 36976840, "step": 64100 }, { "epoch": 9.547959487637772, "grad_norm": 3.226658821105957, "learning_rate": 3.1078762664301655e-07, "loss": 0.5234, "num_input_tokens_seen": 36979784, "step": 64105 }, { "epoch": 9.548704200178731, "grad_norm": 3.140902042388916, "learning_rate": 3.097669124548869e-07, "loss": 0.6763, "num_input_tokens_seen": 36982760, "step": 64110 }, { "epoch": 9.54944891271969, "grad_norm": 6.6696457862854, "learning_rate": 3.087478667323257e-07, "loss": 0.8655, "num_input_tokens_seen": 36985544, "step": 64115 }, { "epoch": 9.550193625260649, "grad_norm": 3.5951054096221924, "learning_rate": 3.0773048954419457e-07, "loss": 0.523, "num_input_tokens_seen": 36988456, "step": 64120 }, { "epoch": 9.55093833780161, "grad_norm": 2.3640496730804443, "learning_rate": 3.0671478095924687e-07, "loss": 0.4737, "num_input_tokens_seen": 36991112, "step": 64125 }, { "epoch": 9.551683050342568, "grad_norm": 5.669538497924805, "learning_rate": 3.057007410461166e-07, "loss": 0.5085, "num_input_tokens_seen": 36993960, "step": 64130 }, { "epoch": 9.552427762883527, "grad_norm": 5.981587886810303, "learning_rate": 3.046883698733322e-07, "loss": 0.6115, "num_input_tokens_seen": 36996680, "step": 64135 }, { "epoch": 9.553172475424486, "grad_norm": 1.665123701095581, "learning_rate": 3.036776675093056e-07, "loss": 0.4275, "num_input_tokens_seen": 36999624, "step": 64140 }, { "epoch": 9.553917187965446, "grad_norm": 1.6814372539520264, "learning_rate": 3.02668634022335e-07, "loss": 0.3517, "num_input_tokens_seen": 37002408, "step": 64145 }, { "epoch": 9.554661900506405, "grad_norm": 1.6394847631454468, "learning_rate": 3.016612694806048e-07, "loss": 0.5553, "num_input_tokens_seen": 37005448, "step": 64150 }, { "epoch": 9.555406613047364, "grad_norm": 3.743311882019043, "learning_rate": 3.0065557395218825e-07, "loss": 0.6675, "num_input_tokens_seen": 37008424, "step": 64155 }, { "epoch": 9.556151325588322, "grad_norm": 3.6736879348754883, "learning_rate": 2.9965154750504764e-07, "loss": 0.6468, "num_input_tokens_seen": 37010952, "step": 64160 }, { "epoch": 9.556896038129283, "grad_norm": 7.595656394958496, "learning_rate": 2.9864919020703155e-07, "loss": 0.7953, "num_input_tokens_seen": 37013864, "step": 64165 }, { "epoch": 9.557640750670242, "grad_norm": 2.3183510303497314, "learning_rate": 2.976485021258746e-07, "loss": 0.5274, "num_input_tokens_seen": 37016584, "step": 64170 }, { "epoch": 9.5583854632112, "grad_norm": 2.7637691497802734, "learning_rate": 2.966494833292005e-07, "loss": 0.5111, "num_input_tokens_seen": 37019528, "step": 64175 }, { "epoch": 9.55913017575216, "grad_norm": 3.2961065769195557, "learning_rate": 2.9565213388451917e-07, "loss": 0.4899, "num_input_tokens_seen": 37022600, "step": 64180 }, { "epoch": 9.55987488829312, "grad_norm": 6.030929088592529, "learning_rate": 2.9465645385922394e-07, "loss": 0.5742, "num_input_tokens_seen": 37025736, "step": 64185 }, { "epoch": 9.560619600834078, "grad_norm": 2.900784969329834, "learning_rate": 2.9366244332060257e-07, "loss": 0.4068, "num_input_tokens_seen": 37028776, "step": 64190 }, { "epoch": 9.561364313375037, "grad_norm": 5.412660598754883, "learning_rate": 2.926701023358208e-07, "loss": 0.4276, "num_input_tokens_seen": 37031496, "step": 64195 }, { "epoch": 9.562109025915996, "grad_norm": 2.879133701324463, "learning_rate": 2.916794309719445e-07, "loss": 0.3961, "num_input_tokens_seen": 37034440, "step": 64200 }, { "epoch": 9.562853738456955, "grad_norm": 3.365488052368164, "learning_rate": 2.906904292959145e-07, "loss": 0.3584, "num_input_tokens_seen": 37037256, "step": 64205 }, { "epoch": 9.563598450997915, "grad_norm": 3.6674723625183105, "learning_rate": 2.8970309737456625e-07, "loss": 0.5458, "num_input_tokens_seen": 37040072, "step": 64210 }, { "epoch": 9.564343163538874, "grad_norm": 2.7370944023132324, "learning_rate": 2.8871743527461583e-07, "loss": 0.6591, "num_input_tokens_seen": 37042984, "step": 64215 }, { "epoch": 9.565087876079833, "grad_norm": 2.0798580646514893, "learning_rate": 2.877334430626738e-07, "loss": 0.5998, "num_input_tokens_seen": 37045768, "step": 64220 }, { "epoch": 9.565832588620792, "grad_norm": 4.270788669586182, "learning_rate": 2.867511208052315e-07, "loss": 0.4549, "num_input_tokens_seen": 37048456, "step": 64225 }, { "epoch": 9.566577301161752, "grad_norm": 3.3507957458496094, "learning_rate": 2.857704685686718e-07, "loss": 0.6853, "num_input_tokens_seen": 37051208, "step": 64230 }, { "epoch": 9.56732201370271, "grad_norm": 1.6057517528533936, "learning_rate": 2.8479148641926134e-07, "loss": 0.4332, "num_input_tokens_seen": 37053992, "step": 64235 }, { "epoch": 9.56806672624367, "grad_norm": 3.3930375576019287, "learning_rate": 2.8381417442316093e-07, "loss": 0.4388, "num_input_tokens_seen": 37056840, "step": 64240 }, { "epoch": 9.568811438784628, "grad_norm": 2.1593143939971924, "learning_rate": 2.8283853264640947e-07, "loss": 0.7394, "num_input_tokens_seen": 37059944, "step": 64245 }, { "epoch": 9.569556151325589, "grad_norm": 4.560375213623047, "learning_rate": 2.8186456115493475e-07, "loss": 0.5635, "num_input_tokens_seen": 37063112, "step": 64250 }, { "epoch": 9.570300863866548, "grad_norm": 3.0615601539611816, "learning_rate": 2.8089226001455913e-07, "loss": 0.753, "num_input_tokens_seen": 37066216, "step": 64255 }, { "epoch": 9.571045576407506, "grad_norm": 3.7065212726593018, "learning_rate": 2.799216292909829e-07, "loss": 0.5211, "num_input_tokens_seen": 37068904, "step": 64260 }, { "epoch": 9.571790288948465, "grad_norm": 2.4088757038116455, "learning_rate": 2.789526690497979e-07, "loss": 0.5399, "num_input_tokens_seen": 37071656, "step": 64265 }, { "epoch": 9.572535001489426, "grad_norm": 2.636382818222046, "learning_rate": 2.779853793564852e-07, "loss": 0.5143, "num_input_tokens_seen": 37074600, "step": 64270 }, { "epoch": 9.573279714030384, "grad_norm": 3.539731025695801, "learning_rate": 2.7701976027640353e-07, "loss": 0.3624, "num_input_tokens_seen": 37077384, "step": 64275 }, { "epoch": 9.574024426571343, "grad_norm": 3.756147861480713, "learning_rate": 2.7605581187481467e-07, "loss": 0.5466, "num_input_tokens_seen": 37080584, "step": 64280 }, { "epoch": 9.574769139112302, "grad_norm": 5.236111164093018, "learning_rate": 2.750935342168526e-07, "loss": 0.5785, "num_input_tokens_seen": 37083400, "step": 64285 }, { "epoch": 9.575513851653263, "grad_norm": 6.657758712768555, "learning_rate": 2.741329273675458e-07, "loss": 0.7834, "num_input_tokens_seen": 37086600, "step": 64290 }, { "epoch": 9.576258564194221, "grad_norm": 2.39699387550354, "learning_rate": 2.7317399139180634e-07, "loss": 0.4762, "num_input_tokens_seen": 37089288, "step": 64295 }, { "epoch": 9.57700327673518, "grad_norm": 3.898649215698242, "learning_rate": 2.7221672635443783e-07, "loss": 0.7314, "num_input_tokens_seen": 37092136, "step": 64300 }, { "epoch": 9.577747989276139, "grad_norm": 3.2704174518585205, "learning_rate": 2.712611323201275e-07, "loss": 0.5038, "num_input_tokens_seen": 37095208, "step": 64305 }, { "epoch": 9.5784927018171, "grad_norm": 2.6672329902648926, "learning_rate": 2.7030720935344867e-07, "loss": 0.4916, "num_input_tokens_seen": 37097928, "step": 64310 }, { "epoch": 9.579237414358058, "grad_norm": 6.0989580154418945, "learning_rate": 2.6935495751886644e-07, "loss": 0.4959, "num_input_tokens_seen": 37100712, "step": 64315 }, { "epoch": 9.579982126899017, "grad_norm": 3.029362201690674, "learning_rate": 2.6840437688072653e-07, "loss": 0.6956, "num_input_tokens_seen": 37103656, "step": 64320 }, { "epoch": 9.580726839439976, "grad_norm": 2.702730894088745, "learning_rate": 2.6745546750326924e-07, "loss": 0.626, "num_input_tokens_seen": 37106568, "step": 64325 }, { "epoch": 9.581471551980936, "grad_norm": 5.527353763580322, "learning_rate": 2.665082294506155e-07, "loss": 0.509, "num_input_tokens_seen": 37109128, "step": 64330 }, { "epoch": 9.582216264521895, "grad_norm": 4.222168922424316, "learning_rate": 2.655626627867752e-07, "loss": 0.5224, "num_input_tokens_seen": 37112168, "step": 64335 }, { "epoch": 9.582960977062854, "grad_norm": 3.60221791267395, "learning_rate": 2.6461876757565007e-07, "loss": 0.6531, "num_input_tokens_seen": 37114952, "step": 64340 }, { "epoch": 9.583705689603812, "grad_norm": 3.6299521923065186, "learning_rate": 2.6367654388102236e-07, "loss": 0.2998, "num_input_tokens_seen": 37118248, "step": 64345 }, { "epoch": 9.584450402144771, "grad_norm": 7.351911544799805, "learning_rate": 2.6273599176656063e-07, "loss": 0.4964, "num_input_tokens_seen": 37121160, "step": 64350 }, { "epoch": 9.585195114685732, "grad_norm": 2.028989553451538, "learning_rate": 2.617971112958278e-07, "loss": 0.5219, "num_input_tokens_seen": 37124264, "step": 64355 }, { "epoch": 9.58593982722669, "grad_norm": 2.8083157539367676, "learning_rate": 2.6085990253226776e-07, "loss": 0.5559, "num_input_tokens_seen": 37126888, "step": 64360 }, { "epoch": 9.58668453976765, "grad_norm": 2.0771336555480957, "learning_rate": 2.5992436553921304e-07, "loss": 0.5602, "num_input_tokens_seen": 37130056, "step": 64365 }, { "epoch": 9.58742925230861, "grad_norm": 2.920949935913086, "learning_rate": 2.589905003798826e-07, "loss": 0.5478, "num_input_tokens_seen": 37132840, "step": 64370 }, { "epoch": 9.588173964849569, "grad_norm": 3.0892293453216553, "learning_rate": 2.58058307117387e-07, "loss": 0.7384, "num_input_tokens_seen": 37135976, "step": 64375 }, { "epoch": 9.588918677390527, "grad_norm": 4.006831645965576, "learning_rate": 2.571277858147175e-07, "loss": 0.5953, "num_input_tokens_seen": 37139112, "step": 64380 }, { "epoch": 9.589663389931486, "grad_norm": 4.443727493286133, "learning_rate": 2.561989365347545e-07, "loss": 0.7857, "num_input_tokens_seen": 37141704, "step": 64385 }, { "epoch": 9.590408102472445, "grad_norm": 2.0816662311553955, "learning_rate": 2.5527175934026426e-07, "loss": 0.5345, "num_input_tokens_seen": 37144424, "step": 64390 }, { "epoch": 9.591152815013405, "grad_norm": 3.669917345046997, "learning_rate": 2.5434625429390515e-07, "loss": 0.5142, "num_input_tokens_seen": 37147368, "step": 64395 }, { "epoch": 9.591897527554364, "grad_norm": 11.748348236083984, "learning_rate": 2.534224214582187e-07, "loss": 0.5172, "num_input_tokens_seen": 37149992, "step": 64400 }, { "epoch": 9.592642240095323, "grad_norm": 3.099820613861084, "learning_rate": 2.5250026089563004e-07, "loss": 0.4283, "num_input_tokens_seen": 37152904, "step": 64405 }, { "epoch": 9.593386952636282, "grad_norm": 2.600862503051758, "learning_rate": 2.5157977266846157e-07, "loss": 0.517, "num_input_tokens_seen": 37155784, "step": 64410 }, { "epoch": 9.594131665177242, "grad_norm": 2.969536066055298, "learning_rate": 2.5066095683891067e-07, "loss": 0.5525, "num_input_tokens_seen": 37158632, "step": 64415 }, { "epoch": 9.594876377718201, "grad_norm": 7.920019626617432, "learning_rate": 2.497438134690694e-07, "loss": 0.7836, "num_input_tokens_seen": 37161416, "step": 64420 }, { "epoch": 9.59562109025916, "grad_norm": 7.184505939483643, "learning_rate": 2.4882834262091317e-07, "loss": 0.7056, "num_input_tokens_seen": 37164232, "step": 64425 }, { "epoch": 9.596365802800118, "grad_norm": 2.980001926422119, "learning_rate": 2.4791454435630634e-07, "loss": 0.6802, "num_input_tokens_seen": 37167272, "step": 64430 }, { "epoch": 9.597110515341079, "grad_norm": 1.548121452331543, "learning_rate": 2.4700241873699957e-07, "loss": 0.478, "num_input_tokens_seen": 37169992, "step": 64435 }, { "epoch": 9.597855227882038, "grad_norm": 3.6940600872039795, "learning_rate": 2.460919658246297e-07, "loss": 0.5415, "num_input_tokens_seen": 37172840, "step": 64440 }, { "epoch": 9.598599940422996, "grad_norm": 2.764984607696533, "learning_rate": 2.4518318568072797e-07, "loss": 0.526, "num_input_tokens_seen": 37175848, "step": 64445 }, { "epoch": 9.599344652963955, "grad_norm": 3.6342878341674805, "learning_rate": 2.442760783666953e-07, "loss": 0.5846, "num_input_tokens_seen": 37178632, "step": 64450 }, { "epoch": 9.600089365504916, "grad_norm": 3.353191375732422, "learning_rate": 2.433706439438382e-07, "loss": 0.6623, "num_input_tokens_seen": 37181736, "step": 64455 }, { "epoch": 9.600834078045875, "grad_norm": 2.7082302570343018, "learning_rate": 2.4246688247334117e-07, "loss": 0.6489, "num_input_tokens_seen": 37184552, "step": 64460 }, { "epoch": 9.601578790586833, "grad_norm": 1.7427196502685547, "learning_rate": 2.4156479401627465e-07, "loss": 0.5062, "num_input_tokens_seen": 37187592, "step": 64465 }, { "epoch": 9.602323503127792, "grad_norm": 2.6527724266052246, "learning_rate": 2.4066437863359545e-07, "loss": 0.4381, "num_input_tokens_seen": 37190248, "step": 64470 }, { "epoch": 9.603068215668753, "grad_norm": 5.174808502197266, "learning_rate": 2.397656363861578e-07, "loss": 0.542, "num_input_tokens_seen": 37193096, "step": 64475 }, { "epoch": 9.603812928209711, "grad_norm": 3.0625972747802734, "learning_rate": 2.388685673346908e-07, "loss": 0.5842, "num_input_tokens_seen": 37195784, "step": 64480 }, { "epoch": 9.60455764075067, "grad_norm": 4.334270477294922, "learning_rate": 2.379731715398098e-07, "loss": 0.5692, "num_input_tokens_seen": 37198536, "step": 64485 }, { "epoch": 9.605302353291629, "grad_norm": 3.419940233230591, "learning_rate": 2.3707944906203038e-07, "loss": 0.4392, "num_input_tokens_seen": 37201256, "step": 64490 }, { "epoch": 9.60604706583259, "grad_norm": 3.5501153469085693, "learning_rate": 2.361873999617431e-07, "loss": 0.6846, "num_input_tokens_seen": 37203912, "step": 64495 }, { "epoch": 9.606791778373548, "grad_norm": 3.1812984943389893, "learning_rate": 2.352970242992303e-07, "loss": 0.4465, "num_input_tokens_seen": 37206664, "step": 64500 }, { "epoch": 9.607536490914507, "grad_norm": 7.265017986297607, "learning_rate": 2.344083221346549e-07, "loss": 0.4978, "num_input_tokens_seen": 37209800, "step": 64505 }, { "epoch": 9.608281203455466, "grad_norm": 2.9155125617980957, "learning_rate": 2.3352129352808007e-07, "loss": 0.4822, "num_input_tokens_seen": 37212712, "step": 64510 }, { "epoch": 9.609025915996426, "grad_norm": 2.9066452980041504, "learning_rate": 2.326359385394383e-07, "loss": 0.4961, "num_input_tokens_seen": 37215656, "step": 64515 }, { "epoch": 9.609770628537385, "grad_norm": 4.527001857757568, "learning_rate": 2.317522572285652e-07, "loss": 0.673, "num_input_tokens_seen": 37218472, "step": 64520 }, { "epoch": 9.610515341078344, "grad_norm": 3.4013478755950928, "learning_rate": 2.30870249655174e-07, "loss": 0.8316, "num_input_tokens_seen": 37221064, "step": 64525 }, { "epoch": 9.611260053619302, "grad_norm": 4.621777057647705, "learning_rate": 2.2998991587886709e-07, "loss": 0.6464, "num_input_tokens_seen": 37223816, "step": 64530 }, { "epoch": 9.612004766160261, "grad_norm": 3.1179842948913574, "learning_rate": 2.2911125595913296e-07, "loss": 0.4905, "num_input_tokens_seen": 37226600, "step": 64535 }, { "epoch": 9.612749478701222, "grad_norm": 2.632277727127075, "learning_rate": 2.2823426995535192e-07, "loss": 0.5535, "num_input_tokens_seen": 37229512, "step": 64540 }, { "epoch": 9.61349419124218, "grad_norm": 1.8872791528701782, "learning_rate": 2.2735895792678485e-07, "loss": 0.4405, "num_input_tokens_seen": 37232360, "step": 64545 }, { "epoch": 9.61423890378314, "grad_norm": 2.593905448913574, "learning_rate": 2.2648531993257893e-07, "loss": 0.592, "num_input_tokens_seen": 37235304, "step": 64550 }, { "epoch": 9.6149836163241, "grad_norm": 3.295711040496826, "learning_rate": 2.2561335603177302e-07, "loss": 0.6399, "num_input_tokens_seen": 37238216, "step": 64555 }, { "epoch": 9.615728328865059, "grad_norm": 3.3225200176239014, "learning_rate": 2.2474306628329222e-07, "loss": 0.536, "num_input_tokens_seen": 37240904, "step": 64560 }, { "epoch": 9.616473041406017, "grad_norm": 4.442457675933838, "learning_rate": 2.2387445074594505e-07, "loss": 0.616, "num_input_tokens_seen": 37243976, "step": 64565 }, { "epoch": 9.617217753946976, "grad_norm": 2.032691717147827, "learning_rate": 2.2300750947843174e-07, "loss": 0.532, "num_input_tokens_seen": 37246728, "step": 64570 }, { "epoch": 9.617962466487935, "grad_norm": 3.363224983215332, "learning_rate": 2.2214224253933326e-07, "loss": 0.5989, "num_input_tokens_seen": 37249512, "step": 64575 }, { "epoch": 9.618707179028895, "grad_norm": 2.9240901470184326, "learning_rate": 2.21278649987125e-07, "loss": 0.5566, "num_input_tokens_seen": 37252552, "step": 64580 }, { "epoch": 9.619451891569854, "grad_norm": 3.449005365371704, "learning_rate": 2.204167318801603e-07, "loss": 0.4614, "num_input_tokens_seen": 37255560, "step": 64585 }, { "epoch": 9.620196604110813, "grad_norm": 2.453684091567993, "learning_rate": 2.1955648827668708e-07, "loss": 0.5828, "num_input_tokens_seen": 37258280, "step": 64590 }, { "epoch": 9.620941316651772, "grad_norm": 3.274272918701172, "learning_rate": 2.186979192348365e-07, "loss": 0.4826, "num_input_tokens_seen": 37261064, "step": 64595 }, { "epoch": 9.621686029192732, "grad_norm": 2.2832655906677246, "learning_rate": 2.178410248126289e-07, "loss": 0.5713, "num_input_tokens_seen": 37263880, "step": 64600 }, { "epoch": 9.622430741733691, "grad_norm": 4.0278754234313965, "learning_rate": 2.1698580506796517e-07, "loss": 0.643, "num_input_tokens_seen": 37266504, "step": 64605 }, { "epoch": 9.62317545427465, "grad_norm": 3.4552507400512695, "learning_rate": 2.1613226005864074e-07, "loss": 0.4831, "num_input_tokens_seen": 37269320, "step": 64610 }, { "epoch": 9.623920166815608, "grad_norm": 3.187493324279785, "learning_rate": 2.1528038984233722e-07, "loss": 0.5841, "num_input_tokens_seen": 37272200, "step": 64615 }, { "epoch": 9.624664879356569, "grad_norm": 2.136352300643921, "learning_rate": 2.1443019447661417e-07, "loss": 0.582, "num_input_tokens_seen": 37275208, "step": 64620 }, { "epoch": 9.625409591897528, "grad_norm": 2.8359851837158203, "learning_rate": 2.1358167401892838e-07, "loss": 0.7142, "num_input_tokens_seen": 37278088, "step": 64625 }, { "epoch": 9.626154304438487, "grad_norm": 3.555206775665283, "learning_rate": 2.1273482852662007e-07, "loss": 0.652, "num_input_tokens_seen": 37280872, "step": 64630 }, { "epoch": 9.626899016979445, "grad_norm": 1.2390772104263306, "learning_rate": 2.1188965805691297e-07, "loss": 0.6991, "num_input_tokens_seen": 37283848, "step": 64635 }, { "epoch": 9.627643729520406, "grad_norm": 2.11544132232666, "learning_rate": 2.1104616266692524e-07, "loss": 0.3679, "num_input_tokens_seen": 37286760, "step": 64640 }, { "epoch": 9.628388442061365, "grad_norm": 2.0909111499786377, "learning_rate": 2.1020434241365017e-07, "loss": 0.455, "num_input_tokens_seen": 37289448, "step": 64645 }, { "epoch": 9.629133154602323, "grad_norm": 2.5613958835601807, "learning_rate": 2.0936419735397562e-07, "loss": 0.5633, "num_input_tokens_seen": 37292168, "step": 64650 }, { "epoch": 9.629877867143282, "grad_norm": 4.811665058135986, "learning_rate": 2.0852572754468113e-07, "loss": 0.6376, "num_input_tokens_seen": 37295432, "step": 64655 }, { "epoch": 9.630622579684243, "grad_norm": 4.908815383911133, "learning_rate": 2.0768893304242142e-07, "loss": 0.5785, "num_input_tokens_seen": 37298088, "step": 64660 }, { "epoch": 9.631367292225201, "grad_norm": 3.419323682785034, "learning_rate": 2.0685381390374568e-07, "loss": 0.7293, "num_input_tokens_seen": 37300904, "step": 64665 }, { "epoch": 9.63211200476616, "grad_norm": 2.652982473373413, "learning_rate": 2.0602037018508658e-07, "loss": 0.5017, "num_input_tokens_seen": 37303912, "step": 64670 }, { "epoch": 9.632856717307119, "grad_norm": 2.118835687637329, "learning_rate": 2.0518860194276846e-07, "loss": 0.625, "num_input_tokens_seen": 37306728, "step": 64675 }, { "epoch": 9.63360142984808, "grad_norm": 5.093989849090576, "learning_rate": 2.043585092329936e-07, "loss": 0.6707, "num_input_tokens_seen": 37309256, "step": 64680 }, { "epoch": 9.634346142389038, "grad_norm": 3.5489327907562256, "learning_rate": 2.035300921118616e-07, "loss": 0.5681, "num_input_tokens_seen": 37311976, "step": 64685 }, { "epoch": 9.635090854929997, "grad_norm": 3.9511525630950928, "learning_rate": 2.0270335063534706e-07, "loss": 0.7116, "num_input_tokens_seen": 37314728, "step": 64690 }, { "epoch": 9.635835567470956, "grad_norm": 6.954993724822998, "learning_rate": 2.018782848593248e-07, "loss": 0.5958, "num_input_tokens_seen": 37317768, "step": 64695 }, { "epoch": 9.636580280011916, "grad_norm": 4.329383373260498, "learning_rate": 2.0105489483954466e-07, "loss": 0.618, "num_input_tokens_seen": 37320776, "step": 64700 }, { "epoch": 9.637324992552875, "grad_norm": 3.92010760307312, "learning_rate": 2.0023318063165098e-07, "loss": 0.6903, "num_input_tokens_seen": 37323656, "step": 64705 }, { "epoch": 9.638069705093834, "grad_norm": 2.9668822288513184, "learning_rate": 1.9941314229117157e-07, "loss": 0.6191, "num_input_tokens_seen": 37326792, "step": 64710 }, { "epoch": 9.638814417634793, "grad_norm": 3.511512041091919, "learning_rate": 1.9859477987351771e-07, "loss": 0.4541, "num_input_tokens_seen": 37329736, "step": 64715 }, { "epoch": 9.639559130175751, "grad_norm": 2.9805634021759033, "learning_rate": 1.9777809343399234e-07, "loss": 0.5233, "num_input_tokens_seen": 37332648, "step": 64720 }, { "epoch": 9.640303842716712, "grad_norm": 3.918436288833618, "learning_rate": 1.9696308302778744e-07, "loss": 0.5302, "num_input_tokens_seen": 37335752, "step": 64725 }, { "epoch": 9.64104855525767, "grad_norm": 3.8005146980285645, "learning_rate": 1.961497487099756e-07, "loss": 0.564, "num_input_tokens_seen": 37338472, "step": 64730 }, { "epoch": 9.64179326779863, "grad_norm": 7.823206901550293, "learning_rate": 1.9533809053551565e-07, "loss": 0.5656, "num_input_tokens_seen": 37341448, "step": 64735 }, { "epoch": 9.642537980339588, "grad_norm": 2.1691486835479736, "learning_rate": 1.9452810855926372e-07, "loss": 0.4542, "num_input_tokens_seen": 37344392, "step": 64740 }, { "epoch": 9.643282692880549, "grad_norm": 4.506258010864258, "learning_rate": 1.9371980283594826e-07, "loss": 0.5345, "num_input_tokens_seen": 37347080, "step": 64745 }, { "epoch": 9.644027405421507, "grad_norm": 4.13627815246582, "learning_rate": 1.929131734201922e-07, "loss": 0.5156, "num_input_tokens_seen": 37349896, "step": 64750 }, { "epoch": 9.644772117962466, "grad_norm": 3.363849401473999, "learning_rate": 1.9210822036650755e-07, "loss": 0.7253, "num_input_tokens_seen": 37352712, "step": 64755 }, { "epoch": 9.645516830503425, "grad_norm": 6.154534816741943, "learning_rate": 1.9130494372928688e-07, "loss": 0.584, "num_input_tokens_seen": 37355720, "step": 64760 }, { "epoch": 9.646261543044385, "grad_norm": 2.519716739654541, "learning_rate": 1.9050334356281175e-07, "loss": 0.4348, "num_input_tokens_seen": 37358408, "step": 64765 }, { "epoch": 9.647006255585344, "grad_norm": 4.639941215515137, "learning_rate": 1.897034199212555e-07, "loss": 0.4173, "num_input_tokens_seen": 37361544, "step": 64770 }, { "epoch": 9.647750968126303, "grad_norm": 4.116605758666992, "learning_rate": 1.8890517285866938e-07, "loss": 0.5596, "num_input_tokens_seen": 37364328, "step": 64775 }, { "epoch": 9.648495680667262, "grad_norm": 2.540884256362915, "learning_rate": 1.881086024289963e-07, "loss": 0.4575, "num_input_tokens_seen": 37367464, "step": 64780 }, { "epoch": 9.649240393208222, "grad_norm": 4.285022735595703, "learning_rate": 1.8731370868606824e-07, "loss": 0.8725, "num_input_tokens_seen": 37370216, "step": 64785 }, { "epoch": 9.649985105749181, "grad_norm": 2.816047191619873, "learning_rate": 1.8652049168359774e-07, "loss": 0.507, "num_input_tokens_seen": 37372840, "step": 64790 }, { "epoch": 9.65072981829014, "grad_norm": 2.3475141525268555, "learning_rate": 1.857289514751892e-07, "loss": 0.6047, "num_input_tokens_seen": 37375816, "step": 64795 }, { "epoch": 9.651474530831099, "grad_norm": 2.5131349563598633, "learning_rate": 1.849390881143276e-07, "loss": 0.4805, "num_input_tokens_seen": 37378664, "step": 64800 }, { "epoch": 9.652219243372059, "grad_norm": 2.7296581268310547, "learning_rate": 1.8415090165439519e-07, "loss": 0.5228, "num_input_tokens_seen": 37381384, "step": 64805 }, { "epoch": 9.652963955913018, "grad_norm": 3.23313307762146, "learning_rate": 1.8336439214864943e-07, "loss": 0.7236, "num_input_tokens_seen": 37384072, "step": 64810 }, { "epoch": 9.653708668453977, "grad_norm": 4.78542423248291, "learning_rate": 1.8257955965023943e-07, "loss": 0.6372, "num_input_tokens_seen": 37386856, "step": 64815 }, { "epoch": 9.654453380994935, "grad_norm": 1.984351634979248, "learning_rate": 1.8179640421220333e-07, "loss": 0.2705, "num_input_tokens_seen": 37389480, "step": 64820 }, { "epoch": 9.655198093535896, "grad_norm": 4.739044189453125, "learning_rate": 1.8101492588746549e-07, "loss": 0.7577, "num_input_tokens_seen": 37392200, "step": 64825 }, { "epoch": 9.655942806076855, "grad_norm": 2.0954396724700928, "learning_rate": 1.8023512472883087e-07, "loss": 0.6343, "num_input_tokens_seen": 37395016, "step": 64830 }, { "epoch": 9.656687518617813, "grad_norm": 0.9721436500549316, "learning_rate": 1.794570007889962e-07, "loss": 0.6426, "num_input_tokens_seen": 37397704, "step": 64835 }, { "epoch": 9.657432231158772, "grad_norm": 5.696904182434082, "learning_rate": 1.7868055412054442e-07, "loss": 0.5828, "num_input_tokens_seen": 37400424, "step": 64840 }, { "epoch": 9.658176943699733, "grad_norm": 1.4700772762298584, "learning_rate": 1.7790578477594466e-07, "loss": 0.4309, "num_input_tokens_seen": 37404648, "step": 64845 }, { "epoch": 9.658921656240691, "grad_norm": 3.5437777042388916, "learning_rate": 1.771326928075523e-07, "loss": 0.5393, "num_input_tokens_seen": 37407656, "step": 64850 }, { "epoch": 9.65966636878165, "grad_norm": 3.0022459030151367, "learning_rate": 1.7636127826760884e-07, "loss": 0.5174, "num_input_tokens_seen": 37410408, "step": 64855 }, { "epoch": 9.660411081322609, "grad_norm": 2.830993413925171, "learning_rate": 1.7559154120824483e-07, "loss": 0.3296, "num_input_tokens_seen": 37413064, "step": 64860 }, { "epoch": 9.66115579386357, "grad_norm": 2.400376558303833, "learning_rate": 1.7482348168147978e-07, "loss": 0.5769, "num_input_tokens_seen": 37416040, "step": 64865 }, { "epoch": 9.661900506404528, "grad_norm": 3.7337465286254883, "learning_rate": 1.7405709973920824e-07, "loss": 0.5517, "num_input_tokens_seen": 37419176, "step": 64870 }, { "epoch": 9.662645218945487, "grad_norm": 3.029282569885254, "learning_rate": 1.7329239543322494e-07, "loss": 0.4678, "num_input_tokens_seen": 37421992, "step": 64875 }, { "epoch": 9.663389931486446, "grad_norm": 2.605409622192383, "learning_rate": 1.7252936881520244e-07, "loss": 0.6845, "num_input_tokens_seen": 37425128, "step": 64880 }, { "epoch": 9.664134644027406, "grad_norm": 2.420874834060669, "learning_rate": 1.7176801993670499e-07, "loss": 0.5467, "num_input_tokens_seen": 37427976, "step": 64885 }, { "epoch": 9.664879356568365, "grad_norm": 2.806337833404541, "learning_rate": 1.7100834884918037e-07, "loss": 0.676, "num_input_tokens_seen": 37430952, "step": 64890 }, { "epoch": 9.665624069109324, "grad_norm": 3.884955406188965, "learning_rate": 1.7025035560396252e-07, "loss": 0.6928, "num_input_tokens_seen": 37433576, "step": 64895 }, { "epoch": 9.666368781650283, "grad_norm": 2.6361069679260254, "learning_rate": 1.6949404025227435e-07, "loss": 0.5414, "num_input_tokens_seen": 37436904, "step": 64900 }, { "epoch": 9.667113494191241, "grad_norm": 2.344571590423584, "learning_rate": 1.6873940284523048e-07, "loss": 0.4824, "num_input_tokens_seen": 37439880, "step": 64905 }, { "epoch": 9.667858206732202, "grad_norm": 2.6131036281585693, "learning_rate": 1.6798644343381798e-07, "loss": 0.658, "num_input_tokens_seen": 37442760, "step": 64910 }, { "epoch": 9.66860291927316, "grad_norm": 4.371082305908203, "learning_rate": 1.672351620689211e-07, "loss": 0.5023, "num_input_tokens_seen": 37445736, "step": 64915 }, { "epoch": 9.66934763181412, "grad_norm": 4.09227180480957, "learning_rate": 1.6648555880131033e-07, "loss": 0.5163, "num_input_tokens_seen": 37448904, "step": 64920 }, { "epoch": 9.670092344355078, "grad_norm": 6.017879962921143, "learning_rate": 1.6573763368163964e-07, "loss": 0.6329, "num_input_tokens_seen": 37451752, "step": 64925 }, { "epoch": 9.670837056896039, "grad_norm": 1.8053935766220093, "learning_rate": 1.6499138676045188e-07, "loss": 0.3721, "num_input_tokens_seen": 37454728, "step": 64930 }, { "epoch": 9.671581769436997, "grad_norm": 4.135401725769043, "learning_rate": 1.6424681808817343e-07, "loss": 0.6193, "num_input_tokens_seen": 37457416, "step": 64935 }, { "epoch": 9.672326481977956, "grad_norm": 3.807314395904541, "learning_rate": 1.6350392771512234e-07, "loss": 0.4722, "num_input_tokens_seen": 37460424, "step": 64940 }, { "epoch": 9.673071194518915, "grad_norm": 3.5163052082061768, "learning_rate": 1.6276271569149738e-07, "loss": 0.4787, "num_input_tokens_seen": 37463560, "step": 64945 }, { "epoch": 9.673815907059875, "grad_norm": 4.255046844482422, "learning_rate": 1.6202318206738342e-07, "loss": 0.6606, "num_input_tokens_seen": 37466376, "step": 64950 }, { "epoch": 9.674560619600834, "grad_norm": 4.42476224899292, "learning_rate": 1.6128532689276277e-07, "loss": 0.7093, "num_input_tokens_seen": 37469256, "step": 64955 }, { "epoch": 9.675305332141793, "grad_norm": 2.571294069290161, "learning_rate": 1.6054915021748996e-07, "loss": 0.6794, "num_input_tokens_seen": 37472264, "step": 64960 }, { "epoch": 9.676050044682752, "grad_norm": 3.756871461868286, "learning_rate": 1.5981465209131686e-07, "loss": 0.692, "num_input_tokens_seen": 37475272, "step": 64965 }, { "epoch": 9.676794757223712, "grad_norm": 3.0919604301452637, "learning_rate": 1.5908183256387877e-07, "loss": 0.6427, "num_input_tokens_seen": 37478024, "step": 64970 }, { "epoch": 9.677539469764671, "grad_norm": 2.351520299911499, "learning_rate": 1.583506916846944e-07, "loss": 0.5052, "num_input_tokens_seen": 37480808, "step": 64975 }, { "epoch": 9.67828418230563, "grad_norm": 2.269237518310547, "learning_rate": 1.5762122950316871e-07, "loss": 0.5647, "num_input_tokens_seen": 37483624, "step": 64980 }, { "epoch": 9.679028894846589, "grad_norm": 4.818219184875488, "learning_rate": 1.5689344606860112e-07, "loss": 0.6449, "num_input_tokens_seen": 37486696, "step": 64985 }, { "epoch": 9.679773607387549, "grad_norm": 3.8552122116088867, "learning_rate": 1.5616734143016898e-07, "loss": 0.6747, "num_input_tokens_seen": 37489384, "step": 64990 }, { "epoch": 9.680518319928508, "grad_norm": 3.3892645835876465, "learning_rate": 1.5544291563693858e-07, "loss": 0.5152, "num_input_tokens_seen": 37492104, "step": 64995 }, { "epoch": 9.681263032469467, "grad_norm": 3.2005224227905273, "learning_rate": 1.5472016873786798e-07, "loss": 0.5336, "num_input_tokens_seen": 37495336, "step": 65000 }, { "epoch": 9.682007745010425, "grad_norm": 6.984105110168457, "learning_rate": 1.5399910078179314e-07, "loss": 0.5841, "num_input_tokens_seen": 37498120, "step": 65005 }, { "epoch": 9.682752457551386, "grad_norm": 2.082289218902588, "learning_rate": 1.532797118174417e-07, "loss": 0.5389, "num_input_tokens_seen": 37500936, "step": 65010 }, { "epoch": 9.683497170092345, "grad_norm": 3.0959930419921875, "learning_rate": 1.5256200189343038e-07, "loss": 0.4969, "num_input_tokens_seen": 37503464, "step": 65015 }, { "epoch": 9.684241882633303, "grad_norm": 6.007448196411133, "learning_rate": 1.518459710582565e-07, "loss": 0.5795, "num_input_tokens_seen": 37506312, "step": 65020 }, { "epoch": 9.684986595174262, "grad_norm": 3.7183234691619873, "learning_rate": 1.511316193603063e-07, "loss": 0.5774, "num_input_tokens_seen": 37509128, "step": 65025 }, { "epoch": 9.685731307715223, "grad_norm": 4.768465995788574, "learning_rate": 1.504189468478523e-07, "loss": 0.5755, "num_input_tokens_seen": 37511976, "step": 65030 }, { "epoch": 9.686476020256181, "grad_norm": 2.3904945850372314, "learning_rate": 1.497079535690532e-07, "loss": 0.4497, "num_input_tokens_seen": 37514952, "step": 65035 }, { "epoch": 9.68722073279714, "grad_norm": 2.3963046073913574, "learning_rate": 1.4899863957195948e-07, "loss": 0.3607, "num_input_tokens_seen": 37517704, "step": 65040 }, { "epoch": 9.687965445338099, "grad_norm": 4.015555381774902, "learning_rate": 1.4829100490449942e-07, "loss": 0.5758, "num_input_tokens_seen": 37520488, "step": 65045 }, { "epoch": 9.688710157879058, "grad_norm": 2.9210400581359863, "learning_rate": 1.4758504961449315e-07, "loss": 0.4932, "num_input_tokens_seen": 37523624, "step": 65050 }, { "epoch": 9.689454870420018, "grad_norm": 3.411104917526245, "learning_rate": 1.468807737496497e-07, "loss": 0.6046, "num_input_tokens_seen": 37526344, "step": 65055 }, { "epoch": 9.690199582960977, "grad_norm": 3.716683864593506, "learning_rate": 1.4617817735755323e-07, "loss": 0.5608, "num_input_tokens_seen": 37529032, "step": 65060 }, { "epoch": 9.690944295501936, "grad_norm": 2.534158706665039, "learning_rate": 1.4547726048569077e-07, "loss": 0.5105, "num_input_tokens_seen": 37531976, "step": 65065 }, { "epoch": 9.691689008042896, "grad_norm": 4.04647159576416, "learning_rate": 1.447780231814244e-07, "loss": 0.7003, "num_input_tokens_seen": 37535016, "step": 65070 }, { "epoch": 9.692433720583855, "grad_norm": 3.0121450424194336, "learning_rate": 1.4408046549200528e-07, "loss": 0.6672, "num_input_tokens_seen": 37537800, "step": 65075 }, { "epoch": 9.693178433124814, "grad_norm": 2.208864450454712, "learning_rate": 1.4338458746457062e-07, "loss": 0.3572, "num_input_tokens_seen": 37541192, "step": 65080 }, { "epoch": 9.693923145665773, "grad_norm": 2.8625361919403076, "learning_rate": 1.4269038914614397e-07, "loss": 0.4754, "num_input_tokens_seen": 37543976, "step": 65085 }, { "epoch": 9.694667858206731, "grad_norm": 4.082004070281982, "learning_rate": 1.4199787058364056e-07, "loss": 0.4569, "num_input_tokens_seen": 37547080, "step": 65090 }, { "epoch": 9.695412570747692, "grad_norm": 3.828922748565674, "learning_rate": 1.413070318238535e-07, "loss": 0.4788, "num_input_tokens_seen": 37550120, "step": 65095 }, { "epoch": 9.69615728328865, "grad_norm": 3.351217269897461, "learning_rate": 1.4061787291347051e-07, "loss": 0.5622, "num_input_tokens_seen": 37553032, "step": 65100 }, { "epoch": 9.69690199582961, "grad_norm": 3.159391403198242, "learning_rate": 1.399303938990626e-07, "loss": 0.5059, "num_input_tokens_seen": 37556232, "step": 65105 }, { "epoch": 9.697646708370568, "grad_norm": 3.412923812866211, "learning_rate": 1.392445948270843e-07, "loss": 0.7961, "num_input_tokens_seen": 37559144, "step": 65110 }, { "epoch": 9.698391420911529, "grad_norm": 0.9540455937385559, "learning_rate": 1.385604757438791e-07, "loss": 0.4049, "num_input_tokens_seen": 37561992, "step": 65115 }, { "epoch": 9.699136133452487, "grad_norm": 6.648952960968018, "learning_rate": 1.3787803669567667e-07, "loss": 0.6254, "num_input_tokens_seen": 37564616, "step": 65120 }, { "epoch": 9.699880845993446, "grad_norm": 3.867419719696045, "learning_rate": 1.371972777285957e-07, "loss": 0.6991, "num_input_tokens_seen": 37567816, "step": 65125 }, { "epoch": 9.700625558534405, "grad_norm": 2.7217884063720703, "learning_rate": 1.3651819888863548e-07, "loss": 0.5241, "num_input_tokens_seen": 37570696, "step": 65130 }, { "epoch": 9.701370271075366, "grad_norm": 2.2110676765441895, "learning_rate": 1.3584080022169266e-07, "loss": 0.554, "num_input_tokens_seen": 37574024, "step": 65135 }, { "epoch": 9.702114983616324, "grad_norm": 2.611567735671997, "learning_rate": 1.3516508177353337e-07, "loss": 0.4765, "num_input_tokens_seen": 37576904, "step": 65140 }, { "epoch": 9.702859696157283, "grad_norm": 4.036194324493408, "learning_rate": 1.3449104358982944e-07, "loss": 0.5119, "num_input_tokens_seen": 37579720, "step": 65145 }, { "epoch": 9.703604408698242, "grad_norm": 3.463808059692383, "learning_rate": 1.3381868571612222e-07, "loss": 0.5765, "num_input_tokens_seen": 37582280, "step": 65150 }, { "epoch": 9.704349121239202, "grad_norm": 3.4389257431030273, "learning_rate": 1.3314800819785035e-07, "loss": 0.6146, "num_input_tokens_seen": 37584808, "step": 65155 }, { "epoch": 9.705093833780161, "grad_norm": 2.366633892059326, "learning_rate": 1.3247901108033313e-07, "loss": 0.4891, "num_input_tokens_seen": 37587848, "step": 65160 }, { "epoch": 9.70583854632112, "grad_norm": 2.619126319885254, "learning_rate": 1.318116944087816e-07, "loss": 0.456, "num_input_tokens_seen": 37590888, "step": 65165 }, { "epoch": 9.706583258862079, "grad_norm": 3.287325143814087, "learning_rate": 1.3114605822829028e-07, "loss": 0.439, "num_input_tokens_seen": 37593416, "step": 65170 }, { "epoch": 9.70732797140304, "grad_norm": 2.416475534439087, "learning_rate": 1.304821025838371e-07, "loss": 0.57, "num_input_tokens_seen": 37596552, "step": 65175 }, { "epoch": 9.708072683943998, "grad_norm": 5.877923011779785, "learning_rate": 1.2981982752029164e-07, "loss": 0.5763, "num_input_tokens_seen": 37599464, "step": 65180 }, { "epoch": 9.708817396484957, "grad_norm": 2.668056011199951, "learning_rate": 1.2915923308240984e-07, "loss": 0.5279, "num_input_tokens_seen": 37602248, "step": 65185 }, { "epoch": 9.709562109025915, "grad_norm": 3.2876269817352295, "learning_rate": 1.2850031931482543e-07, "loss": 0.6054, "num_input_tokens_seen": 37605288, "step": 65190 }, { "epoch": 9.710306821566876, "grad_norm": 3.661245822906494, "learning_rate": 1.278430862620722e-07, "loss": 0.4207, "num_input_tokens_seen": 37607976, "step": 65195 }, { "epoch": 9.711051534107835, "grad_norm": 3.808398723602295, "learning_rate": 1.2718753396855908e-07, "loss": 0.7239, "num_input_tokens_seen": 37610888, "step": 65200 }, { "epoch": 9.711796246648793, "grad_norm": 1.9086928367614746, "learning_rate": 1.2653366247858955e-07, "loss": 0.5086, "num_input_tokens_seen": 37613704, "step": 65205 }, { "epoch": 9.712540959189752, "grad_norm": 2.548701524734497, "learning_rate": 1.258814718363449e-07, "loss": 0.5984, "num_input_tokens_seen": 37616520, "step": 65210 }, { "epoch": 9.713285671730713, "grad_norm": 2.5890920162200928, "learning_rate": 1.2523096208589823e-07, "loss": 0.6378, "num_input_tokens_seen": 37619432, "step": 65215 }, { "epoch": 9.714030384271672, "grad_norm": 4.33472204208374, "learning_rate": 1.245821332712116e-07, "loss": 0.7606, "num_input_tokens_seen": 37622344, "step": 65220 }, { "epoch": 9.71477509681263, "grad_norm": 3.0785560607910156, "learning_rate": 1.2393498543612769e-07, "loss": 0.5424, "num_input_tokens_seen": 37625000, "step": 65225 }, { "epoch": 9.715519809353589, "grad_norm": 2.358490467071533, "learning_rate": 1.232895186243782e-07, "loss": 0.509, "num_input_tokens_seen": 37627944, "step": 65230 }, { "epoch": 9.716264521894548, "grad_norm": 5.938601970672607, "learning_rate": 1.2264573287958382e-07, "loss": 0.655, "num_input_tokens_seen": 37630696, "step": 65235 }, { "epoch": 9.717009234435508, "grad_norm": 1.7472889423370361, "learning_rate": 1.220036282452458e-07, "loss": 0.6052, "num_input_tokens_seen": 37633608, "step": 65240 }, { "epoch": 9.717753946976467, "grad_norm": 2.572222948074341, "learning_rate": 1.213632047647545e-07, "loss": 0.4327, "num_input_tokens_seen": 37636488, "step": 65245 }, { "epoch": 9.718498659517426, "grad_norm": 4.752881050109863, "learning_rate": 1.2072446248138912e-07, "loss": 0.4562, "num_input_tokens_seen": 37639560, "step": 65250 }, { "epoch": 9.719243372058386, "grad_norm": 5.2681498527526855, "learning_rate": 1.200874014383152e-07, "loss": 0.6713, "num_input_tokens_seen": 37642568, "step": 65255 }, { "epoch": 9.719988084599345, "grad_norm": 2.4289393424987793, "learning_rate": 1.1945202167857882e-07, "loss": 0.4902, "num_input_tokens_seen": 37645576, "step": 65260 }, { "epoch": 9.720732797140304, "grad_norm": 2.9058423042297363, "learning_rate": 1.188183232451151e-07, "loss": 0.519, "num_input_tokens_seen": 37648488, "step": 65265 }, { "epoch": 9.721477509681263, "grad_norm": 4.17075252532959, "learning_rate": 1.1818630618075366e-07, "loss": 0.6285, "num_input_tokens_seen": 37651464, "step": 65270 }, { "epoch": 9.722222222222221, "grad_norm": 3.2728612422943115, "learning_rate": 1.1755597052819922e-07, "loss": 0.6441, "num_input_tokens_seen": 37654280, "step": 65275 }, { "epoch": 9.722966934763182, "grad_norm": 2.9617555141448975, "learning_rate": 1.169273163300455e-07, "loss": 0.4966, "num_input_tokens_seen": 37656840, "step": 65280 }, { "epoch": 9.72371164730414, "grad_norm": 3.09275484085083, "learning_rate": 1.1630034362877796e-07, "loss": 0.5075, "num_input_tokens_seen": 37659752, "step": 65285 }, { "epoch": 9.7244563598451, "grad_norm": 2.1715264320373535, "learning_rate": 1.1567505246676269e-07, "loss": 0.6065, "num_input_tokens_seen": 37662888, "step": 65290 }, { "epoch": 9.725201072386058, "grad_norm": 2.5702335834503174, "learning_rate": 1.150514428862548e-07, "loss": 0.511, "num_input_tokens_seen": 37665896, "step": 65295 }, { "epoch": 9.725945784927019, "grad_norm": 3.483093738555908, "learning_rate": 1.1442951492939835e-07, "loss": 0.6261, "num_input_tokens_seen": 37668904, "step": 65300 }, { "epoch": 9.726690497467978, "grad_norm": 2.9367544651031494, "learning_rate": 1.1380926863821528e-07, "loss": 0.5404, "num_input_tokens_seen": 37671592, "step": 65305 }, { "epoch": 9.727435210008936, "grad_norm": 8.25635051727295, "learning_rate": 1.1319070405462207e-07, "loss": 0.8849, "num_input_tokens_seen": 37674824, "step": 65310 }, { "epoch": 9.728179922549895, "grad_norm": 5.273494243621826, "learning_rate": 1.1257382122041859e-07, "loss": 0.7818, "num_input_tokens_seen": 37677640, "step": 65315 }, { "epoch": 9.728924635090856, "grad_norm": 2.981407642364502, "learning_rate": 1.1195862017729097e-07, "loss": 0.4933, "num_input_tokens_seen": 37680520, "step": 65320 }, { "epoch": 9.729669347631814, "grad_norm": 1.7554017305374146, "learning_rate": 1.1134510096681427e-07, "loss": 0.4365, "num_input_tokens_seen": 37683368, "step": 65325 }, { "epoch": 9.730414060172773, "grad_norm": 2.830288887023926, "learning_rate": 1.1073326363044423e-07, "loss": 0.52, "num_input_tokens_seen": 37686056, "step": 65330 }, { "epoch": 9.731158772713732, "grad_norm": 3.0471835136413574, "learning_rate": 1.1012310820952831e-07, "loss": 0.7101, "num_input_tokens_seen": 37688936, "step": 65335 }, { "epoch": 9.731903485254692, "grad_norm": 2.5061802864074707, "learning_rate": 1.0951463474529744e-07, "loss": 0.5534, "num_input_tokens_seen": 37691720, "step": 65340 }, { "epoch": 9.732648197795651, "grad_norm": 2.9532406330108643, "learning_rate": 1.0890784327887149e-07, "loss": 0.5766, "num_input_tokens_seen": 37694376, "step": 65345 }, { "epoch": 9.73339291033661, "grad_norm": 1.3196163177490234, "learning_rate": 1.0830273385125378e-07, "loss": 0.7535, "num_input_tokens_seen": 37697480, "step": 65350 }, { "epoch": 9.734137622877569, "grad_norm": 5.161863327026367, "learning_rate": 1.0769930650333382e-07, "loss": 0.7571, "num_input_tokens_seen": 37700360, "step": 65355 }, { "epoch": 9.73488233541853, "grad_norm": 2.6969127655029297, "learning_rate": 1.070975612758901e-07, "loss": 0.6205, "num_input_tokens_seen": 37702888, "step": 65360 }, { "epoch": 9.735627047959488, "grad_norm": 2.3319571018218994, "learning_rate": 1.0649749820958732e-07, "loss": 0.5276, "num_input_tokens_seen": 37705512, "step": 65365 }, { "epoch": 9.736371760500447, "grad_norm": 1.8002209663391113, "learning_rate": 1.058991173449736e-07, "loss": 0.4639, "num_input_tokens_seen": 37708136, "step": 65370 }, { "epoch": 9.737116473041405, "grad_norm": 5.866096496582031, "learning_rate": 1.0530241872248326e-07, "loss": 0.6132, "num_input_tokens_seen": 37710984, "step": 65375 }, { "epoch": 9.737861185582366, "grad_norm": 1.9441560506820679, "learning_rate": 1.0470740238244237e-07, "loss": 0.5358, "num_input_tokens_seen": 37714024, "step": 65380 }, { "epoch": 9.738605898123325, "grad_norm": 3.558678388595581, "learning_rate": 1.0411406836505766e-07, "loss": 0.6323, "num_input_tokens_seen": 37716840, "step": 65385 }, { "epoch": 9.739350610664284, "grad_norm": 3.6117494106292725, "learning_rate": 1.0352241671042762e-07, "loss": 0.5748, "num_input_tokens_seen": 37719688, "step": 65390 }, { "epoch": 9.740095323205242, "grad_norm": 2.869753122329712, "learning_rate": 1.0293244745852859e-07, "loss": 0.6633, "num_input_tokens_seen": 37722248, "step": 65395 }, { "epoch": 9.740840035746203, "grad_norm": 3.9980039596557617, "learning_rate": 1.0234416064923146e-07, "loss": 0.6711, "num_input_tokens_seen": 37724840, "step": 65400 }, { "epoch": 9.741584748287162, "grad_norm": 3.85744047164917, "learning_rate": 1.0175755632228779e-07, "loss": 0.4913, "num_input_tokens_seen": 37727528, "step": 65405 }, { "epoch": 9.74232946082812, "grad_norm": 4.4604644775390625, "learning_rate": 1.0117263451734083e-07, "loss": 0.6797, "num_input_tokens_seen": 37730440, "step": 65410 }, { "epoch": 9.743074173369079, "grad_norm": 2.276582956314087, "learning_rate": 1.005893952739173e-07, "loss": 0.4555, "num_input_tokens_seen": 37733320, "step": 65415 }, { "epoch": 9.743818885910038, "grad_norm": 2.8088455200195312, "learning_rate": 1.0000783863142738e-07, "loss": 0.4927, "num_input_tokens_seen": 37736456, "step": 65420 }, { "epoch": 9.744563598450998, "grad_norm": 2.3618929386138916, "learning_rate": 9.942796462917014e-08, "loss": 0.6304, "num_input_tokens_seen": 37739304, "step": 65425 }, { "epoch": 9.745308310991957, "grad_norm": 4.353245735168457, "learning_rate": 9.884977330633649e-08, "loss": 0.4897, "num_input_tokens_seen": 37742024, "step": 65430 }, { "epoch": 9.746053023532916, "grad_norm": 2.476854085922241, "learning_rate": 9.827326470199239e-08, "loss": 0.7139, "num_input_tokens_seen": 37745000, "step": 65435 }, { "epoch": 9.746797736073875, "grad_norm": 1.7418140172958374, "learning_rate": 9.769843885509834e-08, "loss": 0.4703, "num_input_tokens_seen": 37747848, "step": 65440 }, { "epoch": 9.747542448614835, "grad_norm": 3.689549684524536, "learning_rate": 9.712529580449825e-08, "loss": 0.5776, "num_input_tokens_seen": 37751016, "step": 65445 }, { "epoch": 9.748287161155794, "grad_norm": 3.0956947803497314, "learning_rate": 9.655383558892228e-08, "loss": 0.6234, "num_input_tokens_seen": 37753896, "step": 65450 }, { "epoch": 9.749031873696753, "grad_norm": 2.905496597290039, "learning_rate": 9.598405824698953e-08, "loss": 0.5001, "num_input_tokens_seen": 37756968, "step": 65455 }, { "epoch": 9.749776586237711, "grad_norm": 2.5217370986938477, "learning_rate": 9.541596381719976e-08, "loss": 0.7189, "num_input_tokens_seen": 37759880, "step": 65460 }, { "epoch": 9.750521298778672, "grad_norm": 5.134476661682129, "learning_rate": 9.484955233794723e-08, "loss": 0.3855, "num_input_tokens_seen": 37762888, "step": 65465 }, { "epoch": 9.75126601131963, "grad_norm": 1.212619662284851, "learning_rate": 9.428482384750136e-08, "loss": 0.4854, "num_input_tokens_seen": 37765544, "step": 65470 }, { "epoch": 9.75201072386059, "grad_norm": 4.5723876953125, "learning_rate": 9.372177838403162e-08, "loss": 0.685, "num_input_tokens_seen": 37768520, "step": 65475 }, { "epoch": 9.752755436401548, "grad_norm": 2.5011725425720215, "learning_rate": 9.316041598557979e-08, "loss": 0.5597, "num_input_tokens_seen": 37771144, "step": 65480 }, { "epoch": 9.753500148942509, "grad_norm": 1.8567719459533691, "learning_rate": 9.260073669008496e-08, "loss": 0.6972, "num_input_tokens_seen": 37773928, "step": 65485 }, { "epoch": 9.754244861483468, "grad_norm": 1.641348958015442, "learning_rate": 9.20427405353641e-08, "loss": 0.5158, "num_input_tokens_seen": 37776840, "step": 65490 }, { "epoch": 9.754989574024426, "grad_norm": 3.142841100692749, "learning_rate": 9.148642755912873e-08, "loss": 0.3668, "num_input_tokens_seen": 37779784, "step": 65495 }, { "epoch": 9.755734286565385, "grad_norm": 2.2993032932281494, "learning_rate": 9.093179779897099e-08, "loss": 0.5489, "num_input_tokens_seen": 37782600, "step": 65500 }, { "epoch": 9.756478999106346, "grad_norm": 1.4170912504196167, "learning_rate": 9.037885129236645e-08, "loss": 0.543, "num_input_tokens_seen": 37785480, "step": 65505 }, { "epoch": 9.757223711647304, "grad_norm": 6.386274814605713, "learning_rate": 8.982758807668523e-08, "loss": 0.5628, "num_input_tokens_seen": 37788456, "step": 65510 }, { "epoch": 9.757968424188263, "grad_norm": 3.1168160438537598, "learning_rate": 8.927800818917809e-08, "loss": 0.5327, "num_input_tokens_seen": 37791336, "step": 65515 }, { "epoch": 9.758713136729222, "grad_norm": 3.466231346130371, "learning_rate": 8.873011166698475e-08, "loss": 0.6102, "num_input_tokens_seen": 37794344, "step": 65520 }, { "epoch": 9.759457849270182, "grad_norm": 2.594416379928589, "learning_rate": 8.818389854712561e-08, "loss": 0.5331, "num_input_tokens_seen": 37797608, "step": 65525 }, { "epoch": 9.760202561811141, "grad_norm": 1.3480145931243896, "learning_rate": 8.763936886651558e-08, "loss": 0.5679, "num_input_tokens_seen": 37800296, "step": 65530 }, { "epoch": 9.7609472743521, "grad_norm": 1.185275673866272, "learning_rate": 8.709652266195301e-08, "loss": 0.5536, "num_input_tokens_seen": 37803464, "step": 65535 }, { "epoch": 9.761691986893059, "grad_norm": 2.0924317836761475, "learning_rate": 8.655535997011688e-08, "loss": 0.5046, "num_input_tokens_seen": 37806632, "step": 65540 }, { "epoch": 9.76243669943402, "grad_norm": 2.514174222946167, "learning_rate": 8.601588082758073e-08, "loss": 0.4877, "num_input_tokens_seen": 37809384, "step": 65545 }, { "epoch": 9.763181411974978, "grad_norm": 2.0487911701202393, "learning_rate": 8.547808527079593e-08, "loss": 0.5278, "num_input_tokens_seen": 37812168, "step": 65550 }, { "epoch": 9.763926124515937, "grad_norm": 1.8415424823760986, "learning_rate": 8.494197333610843e-08, "loss": 0.4406, "num_input_tokens_seen": 37814920, "step": 65555 }, { "epoch": 9.764670837056896, "grad_norm": 2.487802028656006, "learning_rate": 8.44075450597448e-08, "loss": 0.5451, "num_input_tokens_seen": 37817480, "step": 65560 }, { "epoch": 9.765415549597854, "grad_norm": 4.210902214050293, "learning_rate": 8.38748004778206e-08, "loss": 0.6081, "num_input_tokens_seen": 37820296, "step": 65565 }, { "epoch": 9.766160262138815, "grad_norm": 5.762219429016113, "learning_rate": 8.33437396263348e-08, "loss": 0.7329, "num_input_tokens_seen": 37823144, "step": 65570 }, { "epoch": 9.766904974679774, "grad_norm": 2.599926233291626, "learning_rate": 8.281436254117536e-08, "loss": 0.5954, "num_input_tokens_seen": 37826248, "step": 65575 }, { "epoch": 9.767649687220732, "grad_norm": 3.9078590869903564, "learning_rate": 8.228666925811646e-08, "loss": 0.7887, "num_input_tokens_seen": 37829128, "step": 65580 }, { "epoch": 9.768394399761693, "grad_norm": 6.473591327667236, "learning_rate": 8.176065981281567e-08, "loss": 0.6556, "num_input_tokens_seen": 37831720, "step": 65585 }, { "epoch": 9.769139112302652, "grad_norm": 2.30000901222229, "learning_rate": 8.123633424081956e-08, "loss": 0.4897, "num_input_tokens_seen": 37834664, "step": 65590 }, { "epoch": 9.76988382484361, "grad_norm": 2.680839776992798, "learning_rate": 8.07136925775609e-08, "loss": 0.574, "num_input_tokens_seen": 37837832, "step": 65595 }, { "epoch": 9.77062853738457, "grad_norm": 3.685612201690674, "learning_rate": 8.01927348583531e-08, "loss": 0.5779, "num_input_tokens_seen": 37841192, "step": 65600 }, { "epoch": 9.771373249925528, "grad_norm": 3.623321294784546, "learning_rate": 7.96734611184069e-08, "loss": 0.5631, "num_input_tokens_seen": 37843880, "step": 65605 }, { "epoch": 9.772117962466488, "grad_norm": 3.0401699542999268, "learning_rate": 7.915587139280811e-08, "loss": 0.4471, "num_input_tokens_seen": 37846760, "step": 65610 }, { "epoch": 9.772862675007447, "grad_norm": 5.75916051864624, "learning_rate": 7.863996571653431e-08, "loss": 0.612, "num_input_tokens_seen": 37849320, "step": 65615 }, { "epoch": 9.773607387548406, "grad_norm": 4.72521448135376, "learning_rate": 7.812574412444929e-08, "loss": 0.6776, "num_input_tokens_seen": 37851816, "step": 65620 }, { "epoch": 9.774352100089365, "grad_norm": 4.146006107330322, "learning_rate": 7.761320665130301e-08, "loss": 0.472, "num_input_tokens_seen": 37854600, "step": 65625 }, { "epoch": 9.775096812630325, "grad_norm": 2.0953619480133057, "learning_rate": 7.71023533317261e-08, "loss": 0.8315, "num_input_tokens_seen": 37857448, "step": 65630 }, { "epoch": 9.775841525171284, "grad_norm": 1.2450848817825317, "learning_rate": 7.659318420024653e-08, "loss": 0.5427, "num_input_tokens_seen": 37860392, "step": 65635 }, { "epoch": 9.776586237712243, "grad_norm": 3.6155014038085938, "learning_rate": 7.60856992912673e-08, "loss": 0.499, "num_input_tokens_seen": 37863752, "step": 65640 }, { "epoch": 9.777330950253202, "grad_norm": 6.254410266876221, "learning_rate": 7.557989863908044e-08, "loss": 0.4258, "num_input_tokens_seen": 37866504, "step": 65645 }, { "epoch": 9.778075662794162, "grad_norm": 2.640655994415283, "learning_rate": 7.507578227787249e-08, "loss": 0.6804, "num_input_tokens_seen": 37869640, "step": 65650 }, { "epoch": 9.77882037533512, "grad_norm": 3.4327170848846436, "learning_rate": 7.457335024170231e-08, "loss": 0.5176, "num_input_tokens_seen": 37872584, "step": 65655 }, { "epoch": 9.77956508787608, "grad_norm": 3.27707839012146, "learning_rate": 7.407260256452885e-08, "loss": 0.6334, "num_input_tokens_seen": 37875592, "step": 65660 }, { "epoch": 9.780309800417038, "grad_norm": 3.0885443687438965, "learning_rate": 7.357353928018618e-08, "loss": 0.6233, "num_input_tokens_seen": 37878536, "step": 65665 }, { "epoch": 9.781054512957999, "grad_norm": 3.084165334701538, "learning_rate": 7.307616042240007e-08, "loss": 0.5903, "num_input_tokens_seen": 37881288, "step": 65670 }, { "epoch": 9.781799225498958, "grad_norm": 2.7903249263763428, "learning_rate": 7.258046602478252e-08, "loss": 0.4546, "num_input_tokens_seen": 37884008, "step": 65675 }, { "epoch": 9.782543938039916, "grad_norm": 5.648716926574707, "learning_rate": 7.208645612082899e-08, "loss": 0.704, "num_input_tokens_seen": 37886888, "step": 65680 }, { "epoch": 9.783288650580875, "grad_norm": 3.9045708179473877, "learning_rate": 7.159413074392107e-08, "loss": 0.8513, "num_input_tokens_seen": 37889896, "step": 65685 }, { "epoch": 9.784033363121836, "grad_norm": 5.024588584899902, "learning_rate": 7.110348992733217e-08, "loss": 0.6046, "num_input_tokens_seen": 37892840, "step": 65690 }, { "epoch": 9.784778075662794, "grad_norm": 3.687326431274414, "learning_rate": 7.061453370421634e-08, "loss": 0.4809, "num_input_tokens_seen": 37895560, "step": 65695 }, { "epoch": 9.785522788203753, "grad_norm": 4.811591148376465, "learning_rate": 7.012726210761656e-08, "loss": 0.518, "num_input_tokens_seen": 37898120, "step": 65700 }, { "epoch": 9.786267500744712, "grad_norm": 1.8511555194854736, "learning_rate": 6.96416751704565e-08, "loss": 0.3767, "num_input_tokens_seen": 37900968, "step": 65705 }, { "epoch": 9.787012213285673, "grad_norm": 2.9535441398620605, "learning_rate": 6.915777292555159e-08, "loss": 0.5597, "num_input_tokens_seen": 37903912, "step": 65710 }, { "epoch": 9.787756925826631, "grad_norm": 3.0277318954467773, "learning_rate": 6.867555540560621e-08, "loss": 0.6346, "num_input_tokens_seen": 37906920, "step": 65715 }, { "epoch": 9.78850163836759, "grad_norm": 3.507566452026367, "learning_rate": 6.819502264319988e-08, "loss": 0.4291, "num_input_tokens_seen": 37909768, "step": 65720 }, { "epoch": 9.789246350908549, "grad_norm": 1.6912968158721924, "learning_rate": 6.771617467080938e-08, "loss": 0.4573, "num_input_tokens_seen": 37912648, "step": 65725 }, { "epoch": 9.78999106344951, "grad_norm": 1.6378811597824097, "learning_rate": 6.723901152079492e-08, "loss": 0.5444, "num_input_tokens_seen": 37915240, "step": 65730 }, { "epoch": 9.790735775990468, "grad_norm": 2.469428777694702, "learning_rate": 6.676353322539741e-08, "loss": 0.3697, "num_input_tokens_seen": 37917928, "step": 65735 }, { "epoch": 9.791480488531427, "grad_norm": 8.198206901550293, "learning_rate": 6.628973981674947e-08, "loss": 0.6885, "num_input_tokens_seen": 37921704, "step": 65740 }, { "epoch": 9.792225201072386, "grad_norm": 4.569945812225342, "learning_rate": 6.581763132686714e-08, "loss": 0.5548, "num_input_tokens_seen": 37924424, "step": 65745 }, { "epoch": 9.792969913613344, "grad_norm": 2.243039608001709, "learning_rate": 6.534720778765547e-08, "loss": 0.4161, "num_input_tokens_seen": 37927144, "step": 65750 }, { "epoch": 9.793714626154305, "grad_norm": 1.8492552042007446, "learning_rate": 6.487846923090012e-08, "loss": 0.4158, "num_input_tokens_seen": 37929800, "step": 65755 }, { "epoch": 9.794459338695264, "grad_norm": 8.959907531738281, "learning_rate": 6.441141568828135e-08, "loss": 0.6195, "num_input_tokens_seen": 37932616, "step": 65760 }, { "epoch": 9.795204051236222, "grad_norm": 6.237845420837402, "learning_rate": 6.394604719135722e-08, "loss": 0.5003, "num_input_tokens_seen": 37935432, "step": 65765 }, { "epoch": 9.795948763777183, "grad_norm": 6.127900123596191, "learning_rate": 6.348236377157756e-08, "loss": 0.7786, "num_input_tokens_seen": 37938440, "step": 65770 }, { "epoch": 9.796693476318142, "grad_norm": 2.8657944202423096, "learning_rate": 6.30203654602729e-08, "loss": 0.7462, "num_input_tokens_seen": 37941384, "step": 65775 }, { "epoch": 9.7974381888591, "grad_norm": 2.5524017810821533, "learning_rate": 6.256005228866824e-08, "loss": 0.5663, "num_input_tokens_seen": 37944296, "step": 65780 }, { "epoch": 9.79818290140006, "grad_norm": 4.1974897384643555, "learning_rate": 6.210142428786647e-08, "loss": 0.5587, "num_input_tokens_seen": 37947240, "step": 65785 }, { "epoch": 9.798927613941018, "grad_norm": 1.383719563484192, "learning_rate": 6.164448148885948e-08, "loss": 0.4268, "num_input_tokens_seen": 37950376, "step": 65790 }, { "epoch": 9.799672326481979, "grad_norm": 2.890516996383667, "learning_rate": 6.118922392252813e-08, "loss": 0.5476, "num_input_tokens_seen": 37953416, "step": 65795 }, { "epoch": 9.800417039022937, "grad_norm": 2.657294273376465, "learning_rate": 6.07356516196339e-08, "loss": 0.532, "num_input_tokens_seen": 37956520, "step": 65800 }, { "epoch": 9.801161751563896, "grad_norm": 2.7880120277404785, "learning_rate": 6.028376461082729e-08, "loss": 0.6366, "num_input_tokens_seen": 37959272, "step": 65805 }, { "epoch": 9.801906464104855, "grad_norm": 12.625914573669434, "learning_rate": 5.983356292664776e-08, "loss": 0.4293, "num_input_tokens_seen": 37962088, "step": 65810 }, { "epoch": 9.802651176645815, "grad_norm": 2.992033004760742, "learning_rate": 5.9385046597518204e-08, "loss": 0.452, "num_input_tokens_seen": 37964680, "step": 65815 }, { "epoch": 9.803395889186774, "grad_norm": 1.6723507642745972, "learning_rate": 5.893821565374491e-08, "loss": 0.5746, "num_input_tokens_seen": 37967496, "step": 65820 }, { "epoch": 9.804140601727733, "grad_norm": 2.699786901473999, "learning_rate": 5.8493070125523184e-08, "loss": 0.6436, "num_input_tokens_seen": 37970504, "step": 65825 }, { "epoch": 9.804885314268692, "grad_norm": 2.5811235904693604, "learning_rate": 5.804961004293452e-08, "loss": 0.5039, "num_input_tokens_seen": 37973288, "step": 65830 }, { "epoch": 9.805630026809652, "grad_norm": 3.4540109634399414, "learning_rate": 5.760783543594939e-08, "loss": 0.7609, "num_input_tokens_seen": 37976328, "step": 65835 }, { "epoch": 9.80637473935061, "grad_norm": 3.632211446762085, "learning_rate": 5.716774633441613e-08, "loss": 0.4474, "num_input_tokens_seen": 37979304, "step": 65840 }, { "epoch": 9.80711945189157, "grad_norm": 4.001034259796143, "learning_rate": 5.672934276807762e-08, "loss": 0.4875, "num_input_tokens_seen": 37982312, "step": 65845 }, { "epoch": 9.807864164432528, "grad_norm": 3.394510507583618, "learning_rate": 5.629262476655739e-08, "loss": 0.5851, "num_input_tokens_seen": 37985768, "step": 65850 }, { "epoch": 9.808608876973489, "grad_norm": 2.730421781539917, "learning_rate": 5.585759235936794e-08, "loss": 0.6468, "num_input_tokens_seen": 37988680, "step": 65855 }, { "epoch": 9.809353589514448, "grad_norm": 2.204810619354248, "learning_rate": 5.54242455759052e-08, "loss": 0.5323, "num_input_tokens_seen": 37991784, "step": 65860 }, { "epoch": 9.810098302055406, "grad_norm": 2.857240676879883, "learning_rate": 5.499258444545685e-08, "loss": 0.5385, "num_input_tokens_seen": 37994472, "step": 65865 }, { "epoch": 9.810843014596365, "grad_norm": 5.654578685760498, "learning_rate": 5.4562608997191234e-08, "loss": 0.5359, "num_input_tokens_seen": 37997320, "step": 65870 }, { "epoch": 9.811587727137326, "grad_norm": 4.189515113830566, "learning_rate": 5.413431926016288e-08, "loss": 0.51, "num_input_tokens_seen": 38000104, "step": 65875 }, { "epoch": 9.812332439678285, "grad_norm": 3.9981863498687744, "learning_rate": 5.3707715263315305e-08, "loss": 0.503, "num_input_tokens_seen": 38002856, "step": 65880 }, { "epoch": 9.813077152219243, "grad_norm": 3.456205129623413, "learning_rate": 5.328279703547545e-08, "loss": 0.4112, "num_input_tokens_seen": 38005928, "step": 65885 }, { "epoch": 9.813821864760202, "grad_norm": 2.358762741088867, "learning_rate": 5.285956460535646e-08, "loss": 0.5482, "num_input_tokens_seen": 38008744, "step": 65890 }, { "epoch": 9.814566577301163, "grad_norm": 1.6281542778015137, "learning_rate": 5.243801800156323e-08, "loss": 0.5271, "num_input_tokens_seen": 38011624, "step": 65895 }, { "epoch": 9.815311289842121, "grad_norm": 4.119688510894775, "learning_rate": 5.2018157252578525e-08, "loss": 0.5905, "num_input_tokens_seen": 38014408, "step": 65900 }, { "epoch": 9.81605600238308, "grad_norm": 4.821869850158691, "learning_rate": 5.15999823867741e-08, "loss": 0.5455, "num_input_tokens_seen": 38017256, "step": 65905 }, { "epoch": 9.816800714924039, "grad_norm": 2.996373414993286, "learning_rate": 5.118349343241069e-08, "loss": 0.5653, "num_input_tokens_seen": 38020200, "step": 65910 }, { "epoch": 9.817545427465, "grad_norm": 2.969804286956787, "learning_rate": 5.076869041763521e-08, "loss": 0.4749, "num_input_tokens_seen": 38023400, "step": 65915 }, { "epoch": 9.818290140005958, "grad_norm": 1.4685685634613037, "learning_rate": 5.0355573370472475e-08, "loss": 0.3981, "num_input_tokens_seen": 38026312, "step": 65920 }, { "epoch": 9.819034852546917, "grad_norm": 3.8707642555236816, "learning_rate": 4.9944142318841816e-08, "loss": 0.4578, "num_input_tokens_seen": 38029448, "step": 65925 }, { "epoch": 9.819779565087876, "grad_norm": 3.4642770290374756, "learning_rate": 4.953439729054876e-08, "loss": 0.7293, "num_input_tokens_seen": 38032200, "step": 65930 }, { "epoch": 9.820524277628834, "grad_norm": 1.966159701347351, "learning_rate": 4.9126338313279504e-08, "loss": 0.6568, "num_input_tokens_seen": 38035208, "step": 65935 }, { "epoch": 9.821268990169795, "grad_norm": 3.832282543182373, "learning_rate": 4.8719965414606436e-08, "loss": 0.5826, "num_input_tokens_seen": 38038088, "step": 65940 }, { "epoch": 9.822013702710754, "grad_norm": 2.993860960006714, "learning_rate": 4.831527862199647e-08, "loss": 0.7809, "num_input_tokens_seen": 38040616, "step": 65945 }, { "epoch": 9.822758415251712, "grad_norm": 2.0513737201690674, "learning_rate": 4.791227796279163e-08, "loss": 0.658, "num_input_tokens_seen": 38043464, "step": 65950 }, { "epoch": 9.823503127792671, "grad_norm": 5.173079967498779, "learning_rate": 4.751096346423123e-08, "loss": 0.5429, "num_input_tokens_seen": 38046376, "step": 65955 }, { "epoch": 9.824247840333632, "grad_norm": 5.938253402709961, "learning_rate": 4.711133515342692e-08, "loss": 0.4774, "num_input_tokens_seen": 38049320, "step": 65960 }, { "epoch": 9.82499255287459, "grad_norm": 1.3429739475250244, "learning_rate": 4.6713393057387646e-08, "loss": 0.5356, "num_input_tokens_seen": 38052072, "step": 65965 }, { "epoch": 9.82573726541555, "grad_norm": 1.361790418624878, "learning_rate": 4.631713720300856e-08, "loss": 0.541, "num_input_tokens_seen": 38054952, "step": 65970 }, { "epoch": 9.826481977956508, "grad_norm": 1.7069180011749268, "learning_rate": 4.592256761705993e-08, "loss": 0.3824, "num_input_tokens_seen": 38057832, "step": 65975 }, { "epoch": 9.827226690497469, "grad_norm": 4.361416339874268, "learning_rate": 4.5529684326206526e-08, "loss": 0.4462, "num_input_tokens_seen": 38060744, "step": 65980 }, { "epoch": 9.827971403038427, "grad_norm": 2.7510101795196533, "learning_rate": 4.51384873570021e-08, "loss": 0.5249, "num_input_tokens_seen": 38063432, "step": 65985 }, { "epoch": 9.828716115579386, "grad_norm": 2.494265556335449, "learning_rate": 4.47489767358783e-08, "loss": 0.5583, "num_input_tokens_seen": 38066120, "step": 65990 }, { "epoch": 9.829460828120345, "grad_norm": 3.7587168216705322, "learning_rate": 4.436115248915851e-08, "loss": 0.5172, "num_input_tokens_seen": 38068776, "step": 65995 }, { "epoch": 9.830205540661305, "grad_norm": 1.3277126550674438, "learning_rate": 4.397501464304954e-08, "loss": 0.4651, "num_input_tokens_seen": 38071816, "step": 66000 }, { "epoch": 9.830950253202264, "grad_norm": 3.217088222503662, "learning_rate": 4.3590563223647184e-08, "loss": 0.4628, "num_input_tokens_seen": 38074536, "step": 66005 }, { "epoch": 9.831694965743223, "grad_norm": 2.3673794269561768, "learning_rate": 4.320779825692789e-08, "loss": 0.5411, "num_input_tokens_seen": 38077224, "step": 66010 }, { "epoch": 9.832439678284182, "grad_norm": 1.7636226415634155, "learning_rate": 4.2826719768757074e-08, "loss": 0.5839, "num_input_tokens_seen": 38080168, "step": 66015 }, { "epoch": 9.833184390825142, "grad_norm": 3.242607831954956, "learning_rate": 4.244732778489191e-08, "loss": 0.5794, "num_input_tokens_seen": 38083176, "step": 66020 }, { "epoch": 9.833929103366101, "grad_norm": 5.010293483734131, "learning_rate": 4.20696223309619e-08, "loss": 0.5379, "num_input_tokens_seen": 38086120, "step": 66025 }, { "epoch": 9.83467381590706, "grad_norm": 3.063882350921631, "learning_rate": 4.1693603432499396e-08, "loss": 0.6161, "num_input_tokens_seen": 38089128, "step": 66030 }, { "epoch": 9.835418528448018, "grad_norm": 3.3464536666870117, "learning_rate": 4.13192711149063e-08, "loss": 0.6207, "num_input_tokens_seen": 38092040, "step": 66035 }, { "epoch": 9.836163240988979, "grad_norm": 3.163470983505249, "learning_rate": 4.0946625403484593e-08, "loss": 0.59, "num_input_tokens_seen": 38094856, "step": 66040 }, { "epoch": 9.836907953529938, "grad_norm": 3.6852264404296875, "learning_rate": 4.057566632341414e-08, "loss": 0.6007, "num_input_tokens_seen": 38097800, "step": 66045 }, { "epoch": 9.837652666070897, "grad_norm": 2.4106059074401855, "learning_rate": 4.0206393899761e-08, "loss": 0.5677, "num_input_tokens_seen": 38100808, "step": 66050 }, { "epoch": 9.838397378611855, "grad_norm": 5.715134620666504, "learning_rate": 3.98388081574802e-08, "loss": 0.6779, "num_input_tokens_seen": 38103784, "step": 66055 }, { "epoch": 9.839142091152816, "grad_norm": 4.340158462524414, "learning_rate": 3.9472909121412994e-08, "loss": 0.5196, "num_input_tokens_seen": 38106664, "step": 66060 }, { "epoch": 9.839886803693775, "grad_norm": 3.996546506881714, "learning_rate": 3.910869681628404e-08, "loss": 0.565, "num_input_tokens_seen": 38109672, "step": 66065 }, { "epoch": 9.840631516234733, "grad_norm": 2.6904048919677734, "learning_rate": 3.8746171266706985e-08, "loss": 0.3917, "num_input_tokens_seen": 38112392, "step": 66070 }, { "epoch": 9.841376228775692, "grad_norm": 3.4523046016693115, "learning_rate": 3.838533249717891e-08, "loss": 0.3954, "num_input_tokens_seen": 38115400, "step": 66075 }, { "epoch": 9.842120941316653, "grad_norm": 7.874362945556641, "learning_rate": 3.8026180532083066e-08, "loss": 0.7201, "num_input_tokens_seen": 38118504, "step": 66080 }, { "epoch": 9.842865653857611, "grad_norm": 4.035651683807373, "learning_rate": 3.766871539568895e-08, "loss": 0.5566, "num_input_tokens_seen": 38121352, "step": 66085 }, { "epoch": 9.84361036639857, "grad_norm": 1.2535759210586548, "learning_rate": 3.7312937112152226e-08, "loss": 0.3021, "num_input_tokens_seen": 38124200, "step": 66090 }, { "epoch": 9.844355078939529, "grad_norm": 3.1102285385131836, "learning_rate": 3.695884570552033e-08, "loss": 0.6921, "num_input_tokens_seen": 38127112, "step": 66095 }, { "epoch": 9.84509979148049, "grad_norm": 2.9086029529571533, "learning_rate": 3.660644119971579e-08, "loss": 0.7024, "num_input_tokens_seen": 38130088, "step": 66100 }, { "epoch": 9.845844504021448, "grad_norm": 3.386282205581665, "learning_rate": 3.6255723618552895e-08, "loss": 0.623, "num_input_tokens_seen": 38132872, "step": 66105 }, { "epoch": 9.846589216562407, "grad_norm": 2.875523567199707, "learning_rate": 3.5906692985732124e-08, "loss": 0.4497, "num_input_tokens_seen": 38135752, "step": 66110 }, { "epoch": 9.847333929103366, "grad_norm": 1.9951393604278564, "learning_rate": 3.555934932484295e-08, "loss": 0.3476, "num_input_tokens_seen": 38138728, "step": 66115 }, { "epoch": 9.848078641644324, "grad_norm": 2.4585399627685547, "learning_rate": 3.521369265935548e-08, "loss": 0.5855, "num_input_tokens_seen": 38141672, "step": 66120 }, { "epoch": 9.848823354185285, "grad_norm": 2.068563222885132, "learning_rate": 3.4869723012623254e-08, "loss": 0.5584, "num_input_tokens_seen": 38144712, "step": 66125 }, { "epoch": 9.849568066726244, "grad_norm": 2.8797991275787354, "learning_rate": 3.452744040789713e-08, "loss": 0.4571, "num_input_tokens_seen": 38147816, "step": 66130 }, { "epoch": 9.850312779267203, "grad_norm": 3.679499387741089, "learning_rate": 3.418684486830581e-08, "loss": 0.5414, "num_input_tokens_seen": 38150504, "step": 66135 }, { "epoch": 9.851057491808161, "grad_norm": 3.819047451019287, "learning_rate": 3.384793641686146e-08, "loss": 0.6478, "num_input_tokens_seen": 38153288, "step": 66140 }, { "epoch": 9.851802204349122, "grad_norm": 2.769895076751709, "learning_rate": 3.351071507646797e-08, "loss": 0.7607, "num_input_tokens_seen": 38156296, "step": 66145 }, { "epoch": 9.85254691689008, "grad_norm": 1.7660799026489258, "learning_rate": 3.3175180869915445e-08, "loss": 0.418, "num_input_tokens_seen": 38159240, "step": 66150 }, { "epoch": 9.85329162943104, "grad_norm": 2.769941806793213, "learning_rate": 3.2841333819877415e-08, "loss": 0.7915, "num_input_tokens_seen": 38162440, "step": 66155 }, { "epoch": 9.854036341971998, "grad_norm": 2.40380597114563, "learning_rate": 3.250917394891084e-08, "loss": 0.4557, "num_input_tokens_seen": 38165448, "step": 66160 }, { "epoch": 9.854781054512959, "grad_norm": 2.9643471240997314, "learning_rate": 3.2178701279464426e-08, "loss": 0.4017, "num_input_tokens_seen": 38168552, "step": 66165 }, { "epoch": 9.855525767053917, "grad_norm": 1.5248712301254272, "learning_rate": 3.1849915833870313e-08, "loss": 0.6406, "num_input_tokens_seen": 38171496, "step": 66170 }, { "epoch": 9.856270479594876, "grad_norm": 6.130102157592773, "learning_rate": 3.1522817634346834e-08, "loss": 0.5519, "num_input_tokens_seen": 38174248, "step": 66175 }, { "epoch": 9.857015192135835, "grad_norm": 5.16011381149292, "learning_rate": 3.119740670299576e-08, "loss": 0.4975, "num_input_tokens_seen": 38177000, "step": 66180 }, { "epoch": 9.857759904676795, "grad_norm": 2.8013346195220947, "learning_rate": 3.0873683061807826e-08, "loss": 0.5767, "num_input_tokens_seen": 38180104, "step": 66185 }, { "epoch": 9.858504617217754, "grad_norm": 4.134291648864746, "learning_rate": 3.0551646732659975e-08, "loss": 0.6073, "num_input_tokens_seen": 38183144, "step": 66190 }, { "epoch": 9.859249329758713, "grad_norm": 2.9500932693481445, "learning_rate": 3.0231297737312594e-08, "loss": 0.5733, "num_input_tokens_seen": 38185864, "step": 66195 }, { "epoch": 9.859994042299672, "grad_norm": 2.290107011795044, "learning_rate": 2.991263609741502e-08, "loss": 0.6555, "num_input_tokens_seen": 38189032, "step": 66200 }, { "epoch": 9.860738754840632, "grad_norm": 2.3242876529693604, "learning_rate": 2.9595661834500023e-08, "loss": 0.4951, "num_input_tokens_seen": 38191944, "step": 66205 }, { "epoch": 9.861483467381591, "grad_norm": 2.614027976989746, "learning_rate": 2.9280374969989366e-08, "loss": 0.6144, "num_input_tokens_seen": 38194632, "step": 66210 }, { "epoch": 9.86222817992255, "grad_norm": 3.7267003059387207, "learning_rate": 2.8966775525185453e-08, "loss": 0.5729, "num_input_tokens_seen": 38197576, "step": 66215 }, { "epoch": 9.862972892463509, "grad_norm": 2.4250218868255615, "learning_rate": 2.865486352128244e-08, "loss": 0.4637, "num_input_tokens_seen": 38200520, "step": 66220 }, { "epoch": 9.863717605004469, "grad_norm": 9.057367324829102, "learning_rate": 2.8344638979357907e-08, "loss": 0.5648, "num_input_tokens_seen": 38203496, "step": 66225 }, { "epoch": 9.864462317545428, "grad_norm": 2.8890459537506104, "learning_rate": 2.8036101920375647e-08, "loss": 0.6904, "num_input_tokens_seen": 38206312, "step": 66230 }, { "epoch": 9.865207030086387, "grad_norm": 2.630579948425293, "learning_rate": 2.7729252365185643e-08, "loss": 0.6284, "num_input_tokens_seen": 38209064, "step": 66235 }, { "epoch": 9.865951742627345, "grad_norm": 2.4047937393188477, "learning_rate": 2.7424090334521313e-08, "loss": 0.5015, "num_input_tokens_seen": 38211848, "step": 66240 }, { "epoch": 9.866696455168306, "grad_norm": 4.352557182312012, "learning_rate": 2.7120615849007826e-08, "loss": 0.9004, "num_input_tokens_seen": 38214536, "step": 66245 }, { "epoch": 9.867441167709265, "grad_norm": 3.170499563217163, "learning_rate": 2.681882892914822e-08, "loss": 0.6178, "num_input_tokens_seen": 38217608, "step": 66250 }, { "epoch": 9.868185880250223, "grad_norm": 1.8567583560943604, "learning_rate": 2.6518729595340075e-08, "loss": 0.5273, "num_input_tokens_seen": 38220520, "step": 66255 }, { "epoch": 9.868930592791182, "grad_norm": 2.5154948234558105, "learning_rate": 2.622031786786161e-08, "loss": 0.5383, "num_input_tokens_seen": 38223368, "step": 66260 }, { "epoch": 9.86967530533214, "grad_norm": 2.4119620323181152, "learning_rate": 2.5923593766880026e-08, "loss": 0.3991, "num_input_tokens_seen": 38226024, "step": 66265 }, { "epoch": 9.870420017873101, "grad_norm": 1.8741222620010376, "learning_rate": 2.5628557312440403e-08, "loss": 0.4326, "num_input_tokens_seen": 38229032, "step": 66270 }, { "epoch": 9.87116473041406, "grad_norm": 2.6411070823669434, "learning_rate": 2.5335208524487898e-08, "loss": 0.4299, "num_input_tokens_seen": 38231880, "step": 66275 }, { "epoch": 9.871909442955019, "grad_norm": 2.742152452468872, "learning_rate": 2.5043547422839986e-08, "loss": 0.5618, "num_input_tokens_seen": 38234760, "step": 66280 }, { "epoch": 9.87265415549598, "grad_norm": 2.8744192123413086, "learning_rate": 2.4753574027211457e-08, "loss": 0.6281, "num_input_tokens_seen": 38237448, "step": 66285 }, { "epoch": 9.873398868036938, "grad_norm": 3.923536539077759, "learning_rate": 2.4465288357192196e-08, "loss": 0.4924, "num_input_tokens_seen": 38240104, "step": 66290 }, { "epoch": 9.874143580577897, "grad_norm": 3.4564547538757324, "learning_rate": 2.4178690432266617e-08, "loss": 0.6009, "num_input_tokens_seen": 38243048, "step": 66295 }, { "epoch": 9.874888293118856, "grad_norm": 5.0938615798950195, "learning_rate": 2.389378027179978e-08, "loss": 0.6991, "num_input_tokens_seen": 38245960, "step": 66300 }, { "epoch": 9.875633005659815, "grad_norm": 2.749204635620117, "learning_rate": 2.3610557895045736e-08, "loss": 0.4699, "num_input_tokens_seen": 38248776, "step": 66305 }, { "epoch": 9.876377718200775, "grad_norm": 2.099832534790039, "learning_rate": 2.3329023321144727e-08, "loss": 0.5829, "num_input_tokens_seen": 38251880, "step": 66310 }, { "epoch": 9.877122430741734, "grad_norm": 4.033742427825928, "learning_rate": 2.304917656912042e-08, "loss": 0.588, "num_input_tokens_seen": 38254696, "step": 66315 }, { "epoch": 9.877867143282693, "grad_norm": 2.6391119956970215, "learning_rate": 2.27710176578827e-08, "loss": 0.4962, "num_input_tokens_seen": 38257640, "step": 66320 }, { "epoch": 9.878611855823651, "grad_norm": 2.8456737995147705, "learning_rate": 2.2494546606230405e-08, "loss": 0.7025, "num_input_tokens_seen": 38260680, "step": 66325 }, { "epoch": 9.879356568364612, "grad_norm": 4.74254846572876, "learning_rate": 2.221976343284582e-08, "loss": 0.4763, "num_input_tokens_seen": 38263464, "step": 66330 }, { "epoch": 9.88010128090557, "grad_norm": 3.4210166931152344, "learning_rate": 2.1946668156297422e-08, "loss": 0.6409, "num_input_tokens_seen": 38266408, "step": 66335 }, { "epoch": 9.88084599344653, "grad_norm": 2.8489155769348145, "learning_rate": 2.1675260795037122e-08, "loss": 0.6434, "num_input_tokens_seen": 38269480, "step": 66340 }, { "epoch": 9.881590705987488, "grad_norm": 2.953429937362671, "learning_rate": 2.1405541367411353e-08, "loss": 0.8816, "num_input_tokens_seen": 38272168, "step": 66345 }, { "epoch": 9.882335418528449, "grad_norm": 3.783635139465332, "learning_rate": 2.113750989164165e-08, "loss": 0.4713, "num_input_tokens_seen": 38275304, "step": 66350 }, { "epoch": 9.883080131069407, "grad_norm": 2.8106048107147217, "learning_rate": 2.0871166385844077e-08, "loss": 0.6692, "num_input_tokens_seen": 38278344, "step": 66355 }, { "epoch": 9.883824843610366, "grad_norm": 4.568357944488525, "learning_rate": 2.060651086801535e-08, "loss": 0.5302, "num_input_tokens_seen": 38281160, "step": 66360 }, { "epoch": 9.884569556151325, "grad_norm": 2.291203022003174, "learning_rate": 2.0343543356038385e-08, "loss": 0.5236, "num_input_tokens_seen": 38284264, "step": 66365 }, { "epoch": 9.885314268692285, "grad_norm": 4.283724784851074, "learning_rate": 2.008226386768508e-08, "loss": 0.6002, "num_input_tokens_seen": 38287272, "step": 66370 }, { "epoch": 9.886058981233244, "grad_norm": 1.5327457189559937, "learning_rate": 1.982267242061353e-08, "loss": 0.5711, "num_input_tokens_seen": 38290248, "step": 66375 }, { "epoch": 9.886803693774203, "grad_norm": 3.663252353668213, "learning_rate": 1.9564769032362485e-08, "loss": 0.6859, "num_input_tokens_seen": 38292936, "step": 66380 }, { "epoch": 9.887548406315162, "grad_norm": 3.378347396850586, "learning_rate": 1.9308553720359667e-08, "loss": 0.4625, "num_input_tokens_seen": 38296008, "step": 66385 }, { "epoch": 9.888293118856122, "grad_norm": 3.8141226768493652, "learning_rate": 1.9054026501921785e-08, "loss": 0.5549, "num_input_tokens_seen": 38298888, "step": 66390 }, { "epoch": 9.889037831397081, "grad_norm": 2.6633048057556152, "learning_rate": 1.8801187394248965e-08, "loss": 0.4309, "num_input_tokens_seen": 38301640, "step": 66395 }, { "epoch": 9.88978254393804, "grad_norm": 2.6792144775390625, "learning_rate": 1.8550036414424765e-08, "loss": 0.6297, "num_input_tokens_seen": 38304488, "step": 66400 }, { "epoch": 9.890527256478999, "grad_norm": 3.646315574645996, "learning_rate": 1.830057357942172e-08, "loss": 0.5524, "num_input_tokens_seen": 38307272, "step": 66405 }, { "epoch": 9.891271969019959, "grad_norm": 5.797842502593994, "learning_rate": 1.8052798906098568e-08, "loss": 0.6517, "num_input_tokens_seen": 38309896, "step": 66410 }, { "epoch": 9.892016681560918, "grad_norm": 4.952443599700928, "learning_rate": 1.780671241119469e-08, "loss": 0.3629, "num_input_tokens_seen": 38312584, "step": 66415 }, { "epoch": 9.892761394101877, "grad_norm": 3.6136860847473145, "learning_rate": 1.756231411134679e-08, "loss": 0.562, "num_input_tokens_seen": 38315656, "step": 66420 }, { "epoch": 9.893506106642835, "grad_norm": 4.03921365737915, "learning_rate": 1.7319604023066648e-08, "loss": 0.5753, "num_input_tokens_seen": 38318664, "step": 66425 }, { "epoch": 9.894250819183796, "grad_norm": 4.753368854522705, "learning_rate": 1.7078582162752265e-08, "loss": 0.679, "num_input_tokens_seen": 38321544, "step": 66430 }, { "epoch": 9.894995531724755, "grad_norm": 1.4449741840362549, "learning_rate": 1.6839248546696163e-08, "loss": 0.4299, "num_input_tokens_seen": 38324456, "step": 66435 }, { "epoch": 9.895740244265713, "grad_norm": 3.322354793548584, "learning_rate": 1.6601603191071513e-08, "loss": 0.6426, "num_input_tokens_seen": 38327464, "step": 66440 }, { "epoch": 9.896484956806672, "grad_norm": 3.2773525714874268, "learning_rate": 1.6365646111932144e-08, "loss": 0.6174, "num_input_tokens_seen": 38330312, "step": 66445 }, { "epoch": 9.897229669347631, "grad_norm": 4.3138275146484375, "learning_rate": 1.6131377325226405e-08, "loss": 0.4361, "num_input_tokens_seen": 38333160, "step": 66450 }, { "epoch": 9.897974381888591, "grad_norm": 2.4625282287597656, "learning_rate": 1.589879684678608e-08, "loss": 0.6406, "num_input_tokens_seen": 38335944, "step": 66455 }, { "epoch": 9.89871909442955, "grad_norm": 4.724394798278809, "learning_rate": 1.566790469232915e-08, "loss": 0.7698, "num_input_tokens_seen": 38339016, "step": 66460 }, { "epoch": 9.899463806970509, "grad_norm": 3.0518524646759033, "learning_rate": 1.543870087745425e-08, "loss": 0.6235, "num_input_tokens_seen": 38341992, "step": 66465 }, { "epoch": 9.900208519511468, "grad_norm": 2.1844403743743896, "learning_rate": 1.5211185417651762e-08, "loss": 0.6032, "num_input_tokens_seen": 38344712, "step": 66470 }, { "epoch": 9.900953232052428, "grad_norm": 3.5156445503234863, "learning_rate": 1.4985358328298284e-08, "loss": 0.4832, "num_input_tokens_seen": 38347464, "step": 66475 }, { "epoch": 9.901697944593387, "grad_norm": 3.363949775695801, "learning_rate": 1.4761219624651045e-08, "loss": 0.535, "num_input_tokens_seen": 38350632, "step": 66480 }, { "epoch": 9.902442657134346, "grad_norm": 9.976893424987793, "learning_rate": 1.4538769321859046e-08, "loss": 0.6853, "num_input_tokens_seen": 38353448, "step": 66485 }, { "epoch": 9.903187369675305, "grad_norm": 5.05307674407959, "learning_rate": 1.431800743495193e-08, "loss": 0.5578, "num_input_tokens_seen": 38356072, "step": 66490 }, { "epoch": 9.903932082216265, "grad_norm": 2.344160318374634, "learning_rate": 1.4098933978851093e-08, "loss": 0.508, "num_input_tokens_seen": 38358984, "step": 66495 }, { "epoch": 9.904676794757224, "grad_norm": 2.8380346298217773, "learning_rate": 1.3881548968358581e-08, "loss": 0.6551, "num_input_tokens_seen": 38361736, "step": 66500 }, { "epoch": 9.905421507298183, "grad_norm": 7.542293071746826, "learning_rate": 1.3665852418165426e-08, "loss": 0.563, "num_input_tokens_seen": 38364584, "step": 66505 }, { "epoch": 9.906166219839141, "grad_norm": 3.843254566192627, "learning_rate": 1.3451844342846077e-08, "loss": 0.522, "num_input_tokens_seen": 38367496, "step": 66510 }, { "epoch": 9.906910932380102, "grad_norm": 4.737623691558838, "learning_rate": 1.3239524756863964e-08, "loss": 0.4397, "num_input_tokens_seen": 38370536, "step": 66515 }, { "epoch": 9.90765564492106, "grad_norm": 2.735731840133667, "learning_rate": 1.302889367456317e-08, "loss": 0.5988, "num_input_tokens_seen": 38373224, "step": 66520 }, { "epoch": 9.90840035746202, "grad_norm": 4.723352909088135, "learning_rate": 1.2819951110182305e-08, "loss": 0.6691, "num_input_tokens_seen": 38376104, "step": 66525 }, { "epoch": 9.909145070002978, "grad_norm": 2.624218225479126, "learning_rate": 1.261269707784063e-08, "loss": 0.4688, "num_input_tokens_seen": 38378888, "step": 66530 }, { "epoch": 9.909889782543939, "grad_norm": 2.5907480716705322, "learning_rate": 1.2407131591538057e-08, "loss": 0.5665, "num_input_tokens_seen": 38381832, "step": 66535 }, { "epoch": 9.910634495084897, "grad_norm": 3.118520736694336, "learning_rate": 1.2203254665171804e-08, "loss": 0.5009, "num_input_tokens_seen": 38384744, "step": 66540 }, { "epoch": 9.911379207625856, "grad_norm": 2.1974029541015625, "learning_rate": 1.2001066312516962e-08, "loss": 0.6156, "num_input_tokens_seen": 38387816, "step": 66545 }, { "epoch": 9.912123920166815, "grad_norm": 3.3858273029327393, "learning_rate": 1.1800566547234825e-08, "loss": 0.4446, "num_input_tokens_seen": 38390696, "step": 66550 }, { "epoch": 9.912868632707776, "grad_norm": 2.4118258953094482, "learning_rate": 1.1601755382875667e-08, "loss": 0.5936, "num_input_tokens_seen": 38393224, "step": 66555 }, { "epoch": 9.913613345248734, "grad_norm": 3.8528873920440674, "learning_rate": 1.1404632832873186e-08, "loss": 0.532, "num_input_tokens_seen": 38395944, "step": 66560 }, { "epoch": 9.914358057789693, "grad_norm": 4.636648178100586, "learning_rate": 1.1209198910552831e-08, "loss": 0.5573, "num_input_tokens_seen": 38398792, "step": 66565 }, { "epoch": 9.915102770330652, "grad_norm": 2.8206145763397217, "learning_rate": 1.1015453629115158e-08, "loss": 0.5217, "num_input_tokens_seen": 38401736, "step": 66570 }, { "epoch": 9.915847482871612, "grad_norm": 2.2071993350982666, "learning_rate": 1.0823397001655244e-08, "loss": 0.3517, "num_input_tokens_seen": 38404584, "step": 66575 }, { "epoch": 9.916592195412571, "grad_norm": 4.220997333526611, "learning_rate": 1.0633029041154374e-08, "loss": 0.6516, "num_input_tokens_seen": 38407496, "step": 66580 }, { "epoch": 9.91733690795353, "grad_norm": 5.159628868103027, "learning_rate": 1.0444349760471705e-08, "loss": 0.6476, "num_input_tokens_seen": 38410152, "step": 66585 }, { "epoch": 9.918081620494489, "grad_norm": 2.4635515213012695, "learning_rate": 1.0257359172360925e-08, "loss": 0.6072, "num_input_tokens_seen": 38412904, "step": 66590 }, { "epoch": 9.91882633303545, "grad_norm": 3.356907844543457, "learning_rate": 1.0072057289456371e-08, "loss": 0.5355, "num_input_tokens_seen": 38416008, "step": 66595 }, { "epoch": 9.919571045576408, "grad_norm": 9.840120315551758, "learning_rate": 9.888444124278585e-09, "loss": 0.4863, "num_input_tokens_seen": 38418952, "step": 66600 }, { "epoch": 9.920315758117367, "grad_norm": 2.569120407104492, "learning_rate": 9.706519689239857e-09, "loss": 0.7341, "num_input_tokens_seen": 38421672, "step": 66605 }, { "epoch": 9.921060470658325, "grad_norm": 4.091427803039551, "learning_rate": 9.526283996630359e-09, "loss": 0.647, "num_input_tokens_seen": 38425000, "step": 66610 }, { "epoch": 9.921805183199286, "grad_norm": 2.7513253688812256, "learning_rate": 9.347737058632012e-09, "loss": 0.6145, "num_input_tokens_seen": 38427688, "step": 66615 }, { "epoch": 9.922549895740245, "grad_norm": 2.602177143096924, "learning_rate": 9.170878887307388e-09, "loss": 0.5235, "num_input_tokens_seen": 38430600, "step": 66620 }, { "epoch": 9.923294608281203, "grad_norm": 2.745795249938965, "learning_rate": 8.995709494610816e-09, "loss": 0.6301, "num_input_tokens_seen": 38433448, "step": 66625 }, { "epoch": 9.924039320822162, "grad_norm": 4.025260925292969, "learning_rate": 8.82222889237727e-09, "loss": 0.5037, "num_input_tokens_seen": 38436264, "step": 66630 }, { "epoch": 9.924784033363121, "grad_norm": 4.224781513214111, "learning_rate": 8.65043709233071e-09, "loss": 0.6592, "num_input_tokens_seen": 38438984, "step": 66635 }, { "epoch": 9.925528745904082, "grad_norm": 2.6714303493499756, "learning_rate": 8.480334106081289e-09, "loss": 0.4916, "num_input_tokens_seen": 38441544, "step": 66640 }, { "epoch": 9.92627345844504, "grad_norm": 2.857661724090576, "learning_rate": 8.311919945119817e-09, "loss": 0.4762, "num_input_tokens_seen": 38444456, "step": 66645 }, { "epoch": 9.927018170985999, "grad_norm": 2.5650343894958496, "learning_rate": 8.145194620834407e-09, "loss": 0.4289, "num_input_tokens_seen": 38447176, "step": 66650 }, { "epoch": 9.927762883526958, "grad_norm": 2.316821575164795, "learning_rate": 7.98015814448272e-09, "loss": 0.3566, "num_input_tokens_seen": 38450408, "step": 66655 }, { "epoch": 9.928507596067918, "grad_norm": 3.2318215370178223, "learning_rate": 7.81681052722527e-09, "loss": 0.5305, "num_input_tokens_seen": 38453480, "step": 66660 }, { "epoch": 9.929252308608877, "grad_norm": 3.6615235805511475, "learning_rate": 7.655151780094905e-09, "loss": 0.5682, "num_input_tokens_seen": 38456456, "step": 66665 }, { "epoch": 9.929997021149836, "grad_norm": 2.4430580139160156, "learning_rate": 7.49518191401899e-09, "loss": 0.5787, "num_input_tokens_seen": 38459208, "step": 66670 }, { "epoch": 9.930741733690795, "grad_norm": 6.007946014404297, "learning_rate": 7.336900939805546e-09, "loss": 0.558, "num_input_tokens_seen": 38461960, "step": 66675 }, { "epoch": 9.931486446231755, "grad_norm": 3.7106547355651855, "learning_rate": 7.180308868154351e-09, "loss": 0.9178, "num_input_tokens_seen": 38464904, "step": 66680 }, { "epoch": 9.932231158772714, "grad_norm": 7.003174781799316, "learning_rate": 7.025405709640276e-09, "loss": 0.5692, "num_input_tokens_seen": 38467688, "step": 66685 }, { "epoch": 9.932975871313673, "grad_norm": 2.64316987991333, "learning_rate": 6.8721914747382764e-09, "loss": 0.5588, "num_input_tokens_seen": 38470504, "step": 66690 }, { "epoch": 9.933720583854631, "grad_norm": 3.0312399864196777, "learning_rate": 6.720666173798407e-09, "loss": 0.7774, "num_input_tokens_seen": 38473736, "step": 66695 }, { "epoch": 9.934465296395592, "grad_norm": 3.2105069160461426, "learning_rate": 6.570829817059699e-09, "loss": 0.565, "num_input_tokens_seen": 38476776, "step": 66700 }, { "epoch": 9.93521000893655, "grad_norm": 4.615963459014893, "learning_rate": 6.422682414650161e-09, "loss": 0.4541, "num_input_tokens_seen": 38479720, "step": 66705 }, { "epoch": 9.93595472147751, "grad_norm": 3.4434192180633545, "learning_rate": 6.276223976578455e-09, "loss": 0.5756, "num_input_tokens_seen": 38482568, "step": 66710 }, { "epoch": 9.936699434018468, "grad_norm": 2.706172227859497, "learning_rate": 6.131454512742218e-09, "loss": 0.5927, "num_input_tokens_seen": 38485448, "step": 66715 }, { "epoch": 9.937444146559429, "grad_norm": 4.619279861450195, "learning_rate": 5.988374032922517e-09, "loss": 0.5148, "num_input_tokens_seen": 38488200, "step": 66720 }, { "epoch": 9.938188859100388, "grad_norm": 2.66833758354187, "learning_rate": 5.846982546792168e-09, "loss": 0.5336, "num_input_tokens_seen": 38490952, "step": 66725 }, { "epoch": 9.938933571641346, "grad_norm": 2.8345437049865723, "learning_rate": 5.707280063904641e-09, "loss": 0.4825, "num_input_tokens_seen": 38493736, "step": 66730 }, { "epoch": 9.939678284182305, "grad_norm": 8.554308891296387, "learning_rate": 5.569266593699607e-09, "loss": 0.7625, "num_input_tokens_seen": 38497064, "step": 66735 }, { "epoch": 9.940422996723266, "grad_norm": 4.760840892791748, "learning_rate": 5.4329421455029395e-09, "loss": 0.564, "num_input_tokens_seen": 38499976, "step": 66740 }, { "epoch": 9.941167709264224, "grad_norm": 4.037417411804199, "learning_rate": 5.298306728526714e-09, "loss": 0.3523, "num_input_tokens_seen": 38502472, "step": 66745 }, { "epoch": 9.941912421805183, "grad_norm": 2.7135567665100098, "learning_rate": 5.165360351871984e-09, "loss": 0.4112, "num_input_tokens_seen": 38505096, "step": 66750 }, { "epoch": 9.942657134346142, "grad_norm": 8.165745735168457, "learning_rate": 5.0341030245204536e-09, "loss": 0.6948, "num_input_tokens_seen": 38507944, "step": 66755 }, { "epoch": 9.943401846887102, "grad_norm": 4.7348246574401855, "learning_rate": 4.904534755340029e-09, "loss": 0.6579, "num_input_tokens_seen": 38510856, "step": 66760 }, { "epoch": 9.944146559428061, "grad_norm": 2.4757823944091797, "learning_rate": 4.77665555309037e-09, "loss": 0.6982, "num_input_tokens_seen": 38513864, "step": 66765 }, { "epoch": 9.94489127196902, "grad_norm": 2.232029914855957, "learning_rate": 4.650465426409012e-09, "loss": 0.5639, "num_input_tokens_seen": 38516744, "step": 66770 }, { "epoch": 9.945635984509979, "grad_norm": 3.225764036178589, "learning_rate": 4.525964383828018e-09, "loss": 0.5432, "num_input_tokens_seen": 38520808, "step": 66775 }, { "epoch": 9.946380697050937, "grad_norm": 1.7680238485336304, "learning_rate": 4.40315243375733e-09, "loss": 0.4588, "num_input_tokens_seen": 38523432, "step": 66780 }, { "epoch": 9.947125409591898, "grad_norm": 5.179783821105957, "learning_rate": 4.282029584495861e-09, "loss": 0.6551, "num_input_tokens_seen": 38526536, "step": 66785 }, { "epoch": 9.947870122132857, "grad_norm": 3.3310537338256836, "learning_rate": 4.1625958442315096e-09, "loss": 0.6695, "num_input_tokens_seen": 38529640, "step": 66790 }, { "epoch": 9.948614834673815, "grad_norm": 8.572271347045898, "learning_rate": 4.0448512210300436e-09, "loss": 0.7945, "num_input_tokens_seen": 38532488, "step": 66795 }, { "epoch": 9.949359547214776, "grad_norm": 3.629861354827881, "learning_rate": 3.928795722854539e-09, "loss": 0.755, "num_input_tokens_seen": 38535464, "step": 66800 }, { "epoch": 9.950104259755735, "grad_norm": 3.0650386810302734, "learning_rate": 3.81442935754317e-09, "loss": 0.5551, "num_input_tokens_seen": 38538216, "step": 66805 }, { "epoch": 9.950848972296694, "grad_norm": 1.8391141891479492, "learning_rate": 3.701752132825864e-09, "loss": 0.5684, "num_input_tokens_seen": 38540872, "step": 66810 }, { "epoch": 9.951593684837652, "grad_norm": 2.631300210952759, "learning_rate": 3.590764056315976e-09, "loss": 0.566, "num_input_tokens_seen": 38543976, "step": 66815 }, { "epoch": 9.952338397378611, "grad_norm": 3.036552906036377, "learning_rate": 3.481465135515838e-09, "loss": 0.5914, "num_input_tokens_seen": 38546792, "step": 66820 }, { "epoch": 9.953083109919572, "grad_norm": 2.090740203857422, "learning_rate": 3.3738553778084324e-09, "loss": 0.4765, "num_input_tokens_seen": 38550120, "step": 66825 }, { "epoch": 9.95382782246053, "grad_norm": 3.403290033340454, "learning_rate": 3.267934790465721e-09, "loss": 0.4592, "num_input_tokens_seen": 38553000, "step": 66830 }, { "epoch": 9.954572535001489, "grad_norm": 3.19256854057312, "learning_rate": 3.1637033806486414e-09, "loss": 0.3022, "num_input_tokens_seen": 38556040, "step": 66835 }, { "epoch": 9.955317247542448, "grad_norm": 2.3754353523254395, "learning_rate": 3.061161155398784e-09, "loss": 0.4241, "num_input_tokens_seen": 38558792, "step": 66840 }, { "epoch": 9.956061960083408, "grad_norm": 1.386580228805542, "learning_rate": 2.9603081216467156e-09, "loss": 0.4374, "num_input_tokens_seen": 38561512, "step": 66845 }, { "epoch": 9.956806672624367, "grad_norm": 4.509744644165039, "learning_rate": 2.8611442862036544e-09, "loss": 0.5871, "num_input_tokens_seen": 38564424, "step": 66850 }, { "epoch": 9.957551385165326, "grad_norm": 5.133875846862793, "learning_rate": 2.763669655775347e-09, "loss": 0.739, "num_input_tokens_seen": 38567016, "step": 66855 }, { "epoch": 9.958296097706285, "grad_norm": 3.895800828933716, "learning_rate": 2.667884236945417e-09, "loss": 0.642, "num_input_tokens_seen": 38569832, "step": 66860 }, { "epoch": 9.959040810247245, "grad_norm": 8.37287712097168, "learning_rate": 2.5737880361892397e-09, "loss": 0.6434, "num_input_tokens_seen": 38572776, "step": 66865 }, { "epoch": 9.959785522788204, "grad_norm": 4.7750091552734375, "learning_rate": 2.4813810598628416e-09, "loss": 0.7066, "num_input_tokens_seen": 38575944, "step": 66870 }, { "epoch": 9.960530235329163, "grad_norm": 5.047010898590088, "learning_rate": 2.3906633142140035e-09, "loss": 0.463, "num_input_tokens_seen": 38578600, "step": 66875 }, { "epoch": 9.961274947870121, "grad_norm": 3.8157401084899902, "learning_rate": 2.3016348053711558e-09, "loss": 0.5937, "num_input_tokens_seen": 38581704, "step": 66880 }, { "epoch": 9.962019660411082, "grad_norm": 2.732084274291992, "learning_rate": 2.2142955393489316e-09, "loss": 0.5944, "num_input_tokens_seen": 38584328, "step": 66885 }, { "epoch": 9.96276437295204, "grad_norm": 2.272531270980835, "learning_rate": 2.1286455220537182e-09, "loss": 0.5442, "num_input_tokens_seen": 38587240, "step": 66890 }, { "epoch": 9.963509085493, "grad_norm": 5.481112957000732, "learning_rate": 2.044684759269777e-09, "loss": 0.5124, "num_input_tokens_seen": 38590312, "step": 66895 }, { "epoch": 9.964253798033958, "grad_norm": 3.338118076324463, "learning_rate": 1.962413256670348e-09, "loss": 0.7075, "num_input_tokens_seen": 38593096, "step": 66900 }, { "epoch": 9.964998510574919, "grad_norm": 2.2046966552734375, "learning_rate": 1.881831019817648e-09, "loss": 0.6821, "num_input_tokens_seen": 38595816, "step": 66905 }, { "epoch": 9.965743223115878, "grad_norm": 3.4242730140686035, "learning_rate": 1.802938054154546e-09, "loss": 0.4235, "num_input_tokens_seen": 38598568, "step": 66910 }, { "epoch": 9.966487935656836, "grad_norm": 3.468052387237549, "learning_rate": 1.7257343650156632e-09, "loss": 0.4285, "num_input_tokens_seen": 38601384, "step": 66915 }, { "epoch": 9.967232648197795, "grad_norm": 3.8475594520568848, "learning_rate": 1.6502199576134968e-09, "loss": 0.5846, "num_input_tokens_seen": 38604232, "step": 66920 }, { "epoch": 9.967977360738756, "grad_norm": 2.3459620475769043, "learning_rate": 1.576394837055073e-09, "loss": 0.4441, "num_input_tokens_seen": 38607240, "step": 66925 }, { "epoch": 9.968722073279714, "grad_norm": 2.021066904067993, "learning_rate": 1.5042590083280683e-09, "loss": 0.5825, "num_input_tokens_seen": 38610600, "step": 66930 }, { "epoch": 9.969466785820673, "grad_norm": 3.2184269428253174, "learning_rate": 1.433812476306362e-09, "loss": 0.5089, "num_input_tokens_seen": 38613512, "step": 66935 }, { "epoch": 9.970211498361632, "grad_norm": 3.6801812648773193, "learning_rate": 1.3650552457500353e-09, "loss": 0.53, "num_input_tokens_seen": 38616616, "step": 66940 }, { "epoch": 9.970956210902592, "grad_norm": 5.752501487731934, "learning_rate": 1.2979873213081473e-09, "loss": 0.6202, "num_input_tokens_seen": 38619496, "step": 66945 }, { "epoch": 9.971700923443551, "grad_norm": 3.0604636669158936, "learning_rate": 1.2326087075104076e-09, "loss": 0.5649, "num_input_tokens_seen": 38622440, "step": 66950 }, { "epoch": 9.97244563598451, "grad_norm": 2.917456865310669, "learning_rate": 1.1689194087727285e-09, "loss": 0.4527, "num_input_tokens_seen": 38625448, "step": 66955 }, { "epoch": 9.973190348525469, "grad_norm": 3.5053937435150146, "learning_rate": 1.1069194294055507e-09, "loss": 0.641, "num_input_tokens_seen": 38628360, "step": 66960 }, { "epoch": 9.973935061066427, "grad_norm": 0.8563474416732788, "learning_rate": 1.0466087735916396e-09, "loss": 0.4374, "num_input_tokens_seen": 38630824, "step": 66965 }, { "epoch": 9.974679773607388, "grad_norm": 3.5663444995880127, "learning_rate": 9.879874454110648e-10, "loss": 0.5507, "num_input_tokens_seen": 38633672, "step": 66970 }, { "epoch": 9.975424486148347, "grad_norm": 1.6468440294265747, "learning_rate": 9.310554488245471e-10, "loss": 0.4008, "num_input_tokens_seen": 38636328, "step": 66975 }, { "epoch": 9.976169198689306, "grad_norm": 3.730128288269043, "learning_rate": 8.758127876762335e-10, "loss": 0.7554, "num_input_tokens_seen": 38639016, "step": 66980 }, { "epoch": 9.976913911230266, "grad_norm": 3.9026708602905273, "learning_rate": 8.222594657020244e-10, "loss": 0.5607, "num_input_tokens_seen": 38642056, "step": 66985 }, { "epoch": 9.977658623771225, "grad_norm": 3.2813141345977783, "learning_rate": 7.703954865212471e-10, "loss": 0.5734, "num_input_tokens_seen": 38645192, "step": 66990 }, { "epoch": 9.978403336312184, "grad_norm": 2.3444910049438477, "learning_rate": 7.202208536366551e-10, "loss": 0.5982, "num_input_tokens_seen": 38648200, "step": 66995 }, { "epoch": 9.979148048853142, "grad_norm": 2.268592357635498, "learning_rate": 6.717355704427553e-10, "loss": 0.6234, "num_input_tokens_seen": 38651112, "step": 67000 }, { "epoch": 9.979892761394101, "grad_norm": 2.5068159103393555, "learning_rate": 6.249396402091545e-10, "loss": 0.7587, "num_input_tokens_seen": 38654216, "step": 67005 }, { "epoch": 9.980637473935062, "grad_norm": 3.354576826095581, "learning_rate": 5.79833066102764e-10, "loss": 0.6765, "num_input_tokens_seen": 38657448, "step": 67010 }, { "epoch": 9.98138218647602, "grad_norm": 3.7748937606811523, "learning_rate": 5.364158511739215e-10, "loss": 0.6406, "num_input_tokens_seen": 38660392, "step": 67015 }, { "epoch": 9.98212689901698, "grad_norm": 3.291250228881836, "learning_rate": 4.946879983508401e-10, "loss": 0.5955, "num_input_tokens_seen": 38663176, "step": 67020 }, { "epoch": 9.982871611557938, "grad_norm": 4.874975681304932, "learning_rate": 4.5464951045626204e-10, "loss": 0.6247, "num_input_tokens_seen": 38666088, "step": 67025 }, { "epoch": 9.983616324098898, "grad_norm": 7.376931190490723, "learning_rate": 4.163003901963558e-10, "loss": 0.6563, "num_input_tokens_seen": 38668840, "step": 67030 }, { "epoch": 9.984361036639857, "grad_norm": 3.881298780441284, "learning_rate": 3.796406401634922e-10, "loss": 0.647, "num_input_tokens_seen": 38671592, "step": 67035 }, { "epoch": 9.985105749180816, "grad_norm": 1.766037940979004, "learning_rate": 3.4467026283069305e-10, "loss": 0.4424, "num_input_tokens_seen": 38674472, "step": 67040 }, { "epoch": 9.985850461721775, "grad_norm": 2.5914766788482666, "learning_rate": 3.1138926056550886e-10, "loss": 0.5254, "num_input_tokens_seen": 38677480, "step": 67045 }, { "epoch": 9.986595174262735, "grad_norm": 1.8575432300567627, "learning_rate": 2.7979763561614137e-10, "loss": 0.4533, "num_input_tokens_seen": 38680520, "step": 67050 }, { "epoch": 9.987339886803694, "grad_norm": 3.3875832557678223, "learning_rate": 2.498953901142187e-10, "loss": 0.6401, "num_input_tokens_seen": 38683496, "step": 67055 }, { "epoch": 9.988084599344653, "grad_norm": 3.673022747039795, "learning_rate": 2.216825260858979e-10, "loss": 0.6181, "num_input_tokens_seen": 38686216, "step": 67060 }, { "epoch": 9.988829311885612, "grad_norm": 2.1685049533843994, "learning_rate": 1.951590454324359e-10, "loss": 0.4357, "num_input_tokens_seen": 38689032, "step": 67065 }, { "epoch": 9.989574024426572, "grad_norm": 3.401679277420044, "learning_rate": 1.7032494994961846e-10, "loss": 0.5348, "num_input_tokens_seen": 38692040, "step": 67070 }, { "epoch": 9.99031873696753, "grad_norm": 2.305140733718872, "learning_rate": 1.4718024131110674e-10, "loss": 0.7054, "num_input_tokens_seen": 38695016, "step": 67075 }, { "epoch": 9.99106344950849, "grad_norm": 2.842849016189575, "learning_rate": 1.2572492108786638e-10, "loss": 0.6325, "num_input_tokens_seen": 38697736, "step": 67080 }, { "epoch": 9.991808162049448, "grad_norm": 4.670159339904785, "learning_rate": 1.0595899072318727e-10, "loss": 0.638, "num_input_tokens_seen": 38701128, "step": 67085 }, { "epoch": 9.992552874590409, "grad_norm": 4.309473037719727, "learning_rate": 8.788245155766372e-11, "loss": 0.5592, "num_input_tokens_seen": 38703880, "step": 67090 }, { "epoch": 9.993297587131368, "grad_norm": 3.428725481033325, "learning_rate": 7.149530480976552e-11, "loss": 0.5565, "num_input_tokens_seen": 38707080, "step": 67095 }, { "epoch": 9.994042299672326, "grad_norm": 1.301710844039917, "learning_rate": 5.679755158971567e-11, "loss": 0.5788, "num_input_tokens_seen": 38709864, "step": 67100 }, { "epoch": 9.994787012213285, "grad_norm": 3.8155596256256104, "learning_rate": 4.3789192888388233e-11, "loss": 0.7534, "num_input_tokens_seen": 38712616, "step": 67105 }, { "epoch": 9.995531724754246, "grad_norm": 1.023681402206421, "learning_rate": 3.247022958285939e-11, "loss": 0.3905, "num_input_tokens_seen": 38715240, "step": 67110 }, { "epoch": 9.996276437295204, "grad_norm": 3.1390020847320557, "learning_rate": 2.2840662441958594e-11, "loss": 0.3357, "num_input_tokens_seen": 38717896, "step": 67115 }, { "epoch": 9.997021149836163, "grad_norm": 2.902940511703491, "learning_rate": 1.490049211516631e-11, "loss": 0.6939, "num_input_tokens_seen": 38721096, "step": 67120 }, { "epoch": 9.997765862377122, "grad_norm": 2.5546138286590576, "learning_rate": 8.649719140940703e-12, "loss": 0.5483, "num_input_tokens_seen": 38724328, "step": 67125 }, { "epoch": 9.998510574918082, "grad_norm": 2.727405548095703, "learning_rate": 4.088343935615413e-12, "loss": 0.5822, "num_input_tokens_seen": 38727112, "step": 67130 }, { "epoch": 9.999255287459041, "grad_norm": 7.031322956085205, "learning_rate": 1.2163668156039976e-12, "loss": 0.5863, "num_input_tokens_seen": 38729832, "step": 67135 }, { "epoch": 10.0, "grad_norm": 6.750056266784668, "learning_rate": 3.378796686881458e-14, "loss": 0.6976, "num_input_tokens_seen": 38732208, "step": 67140 }, { "epoch": 10.0, "eval_loss": 0.6837868094444275, "eval_runtime": 74.0008, "eval_samples_per_second": 40.324, "eval_steps_per_second": 10.081, "num_input_tokens_seen": 38732208, "step": 67140 }, { "epoch": 10.0, "num_input_tokens_seen": 38732208, "step": 67140, "total_flos": 1.7440938205214147e+18, "train_loss": 0.6449844909648225, "train_runtime": 16798.7505, "train_samples_per_second": 15.985, "train_steps_per_second": 3.997 } ], "logging_steps": 5, "max_steps": 67140, "num_input_tokens_seen": 38732208, "num_train_epochs": 10, "save_steps": 3357, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.7440938205214147e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }